数据分析中常用的Python技巧

1. 条件表达式

import math
# Conventional approach: an explicit branch inside a helper function
def get_log(x):
    """Return the natural logarithm of ``x``, or NaN when ``x`` is not greater than 0."""
    # Guard clause: anything that is not strictly positive has no real logarithm.
    if not x > 0:
        return float('nan')
    return math.log(x)
# Compute the logarithm of the same input in two ways so the results can be compared.
x = 5
log_val1 = get_log(x)
# The same branch written as a single conditional expression
log_val2 = math.log(x) if x > 0 else float('nan')

# One value per line: first the helper's result, then the conditional expression's.
print(log_val1, log_val2, sep='\n')

2. 列表推导式

# Approach 1: collect the even numbers below 1000 with an explicit loop.
print('找出1000内的偶数(for循环):')
l1 = []
for i in range(1000):
    if i % 2:
        # Odd number: skip it.
        continue
    l1.append(i)
print(l1)

# Approach 2: the same filter expressed as a list comprehension.
print('找出1000内的偶数(列表推导式):')
l2 = [i for i in range(1000) if not i % 2]
print(l2)

3. Python常用容器类型

# 1 list: a mutable, ordered sequence that may mix element types
l = [1, 'a', 2, 'b']
print(type(l))
print('修改前:', l)

# Replace an element in place through its index
l[0] = 3
print('修改后:', l)

# Add one element at the end
l += [4]
print('添加后:', l)

# Walk the list element by element
print('遍历list(for循环):')
for item in l:
    print(item)

# Walk the list by index with a while loop
print('遍历list(while循环):')
i = 0
while i < len(l):
    print(l[i])
    i += 1

# Concatenation: + builds a new list holding the elements of both operands
print('列表合并(+):', [1, 2] + [3, 4])

# Repetition: * builds a new list with the contents repeated
print('列表重复(*):', [1, 2] * 5)

# Membership test with the in operator
print('判断元素存在(in):', 1 in [1, 2])
# 2 tuple: an immutable, ordered sequence
t = (1, 'a', 2, 'b')
print(type(t))

# Tuple contents cannot be modified; the assignment below would raise TypeError
# t[0] = 3

# Walk the tuple element by element
print('遍历tuple(for循环):')
for item in t:
    print(item)

# Walk the tuple by index with a while loop
print('遍历tuple(while循环):')
i = 0
while i < len(t):
    print(t[i])
    i += 1

# Unpacking: bind the first two elements and discard the other two with `_`
a, b, _, _ = t
# Print the names that were actually bound above (printing an unbound name
# here would raise NameError).
print('unpack: ', a, b)

# The number of targets must match the tuple length, otherwise ValueError is raised.
# This mistake commonly shows up when assigning a function's return value.
# a, b, c = t
# 3 dict: a mapping from keys to values
d = {
    '小象学院': 'http://www.chinahadoop.cn/',
    '百度': 'https://www.baidu.com/',
    '阿里巴巴': 'https://www.alibaba.com/',
    '腾讯': 'https://www.tencent.com/',
}
print('通过key获取value: ', d['小象学院'])

# Iterate over the keys
print('遍历key: ')
for key in d.keys():
    print(key)

# Iterate over the values
print('遍历value: ')
for value in d.values():
    print(value)

# Iterate over (key, value) pairs
print('遍历item: ')
for key, value in d.items():
    print(key + ': ' + value)

# Formatted output with str.format
print('format输出格式:')
for key, value in d.items():
    print('{}的网址是{}'.format(key, value))
# 4 set: an unordered collection of unique elements
print('创建set:')
my_set = {1, 2, 3}
print(my_set)
# Building a set from a list drops the duplicate 2
my_set = set([1, 2, 3, 2])
print(my_set)

print('添加单个元素:')
# 3 is already present, so this add leaves the set unchanged
my_set.add(3)
print('添加3', my_set)
my_set.add(4)
print('添加4', my_set)

print('添加多个元素:')
# update() adds every element of the iterable; the existing 4 is not duplicated
my_set.update([4, 5, 6])
print(my_set)

4. Counter

# Initialization: from an iterable, from a mapping, or from keyword arguments
import collections
c1 = collections.Counter(['a', 'b', 'c', 'a', 'b', 'b'])
c2 = collections.Counter({'a': 2, 'b': 3, 'c': 1})
c3 = collections.Counter(a=2, b=3, c=1)

# Updating contents
# Note: update() adds to the existing counts; it does not replace them
c1.update({'a': 4, 'c': -2, 'd': 4})
print(c1)

# Accessing contents
print('a=', c1['a'])
print('b=', c1['b'])
# Unlike a plain dict, a missing key yields 0 instead of raising KeyError
print('e=', c1['e'])

# elements() method: repeats each element as many times as its count;
# elements whose count is less than one (here 'c') are skipped
for element in c1.elements():
    print(element)

# most_common() method: the n highest-count (element, count) pairs.
# The result is printed explicitly; as a bare expression it would be discarded
# when run as a script.
print(c1.most_common(3))

5. defaultdict

# Count how many times each letter occurs in a string
s = 'chinadoop'
# Option 1: Counter does the counting in one call
print(collections.Counter(s))
# Option 2: a plain dict, where the first occurrence of a key must be seeded by hand
counter = {}
for c in s:
    if c in counter:
        counter[c] += 1
    else:
        counter[c] = 1

print(counter.items())

# Option 3: defaultdict(int) supplies 0 for keys that have not been seen yet
counter2 = collections.defaultdict(int)
for c in s:
    counter2[c] = counter2[c] + 1
print(counter2.items())

# Group the values that share the same key into lists
colors = [
    ('yellow', 1),
    ('blue', 2),
    ('yellow', 3),
    ('blue', 4),
    ('red', 1),
]
# defaultdict(list) creates an empty list the first time a key is touched
d = collections.defaultdict(list)
for k, v in colors:
    d[k].append(v)

print(d.items())

6. map()函数

import math

print('示例1,获取两个列表对应位置上的最小值:')
l1 = [1, 3, 5, 7, 9]
l2 = [2, 4, 6, 8, 10]
# min is applied pairwise to the elements at matching positions
mins = map(min, l1, l2)
# Printing the map object shows the iterator itself, not the computed values
print(mins)

# map() is lazy: the function only runs as items are pulled from the iterator
for item in mins:
    print(item)

print('示例2,对列表中的元素进行平方根操作:')
# Note: despite its name, `squared` yields square roots
squared = map(math.sqrt, l2)
print(squared)
# Unpacking into a list forces the evaluation and exhausts the iterator
print([*squared])

7. 匿名函数 lambda

# my_func = lambda a, b, c: a * b
# print(my_func)
# print(my_func(1, 2, 3))

# Combine an anonymous function with map()
print('lambda结合map')
l1 = [1, 3, 5, 7, 9]
l2 = [2, 4, 6, 8, 10]
# For each pair of elements at the same position: double the first, add the second
result = map(lambda left, right: 2 * left + right, l1, l2)
# Materialize the lazy map into a list so the values can be printed
print([*result])

8. Python操作CSV数据文件

数据文件 grades.csv 的下载地址:见原文(转载时超链接已丢失)。
import csv

# newline='' leaves newline handling to the csv module, as the csv documentation
# recommends for files passed to a reader.
with open('grades.csv', newline='') as csvfile:
    # DictReader turns each data row into a dict keyed by the header row's column names
    grades_data = list(csv.DictReader(csvfile))

print('记录个数:', len(grades_data))
print('前2条记录:', grades_data[:2])
print('列名:', list(grades_data[0].keys()))

# Mean of the assignment1_grade column; csv values are strings, so convert to float.
# A generator expression avoids building a throwaway list just to sum it.
avg_assign1 = sum(float(row['assignment1_grade']) for row in grades_data) / len(grades_data)
print('assignment1平均分数:', avg_assign1)

# Distinct 7-character prefixes of the submission column
# (presumably 'YYYY-MM' months; confirm against the data file)
assign1_sub_month = {row['assignment1_submission'][:7] for row in grades_data}
print(assign1_sub_month)

猜你喜欢

转载自blog.csdn.net/happy5205205/article/details/80828014