''' 【课程2.14】 数值计算和统计基础 常用数学、统计方法 '''
# Basic parameters of the reduction methods: axis and skipna
import numpy as np
import pandas as pd

# key3 mixes ints and strings, so it is stored as an object column.
df = pd.DataFrame(
    {
        'key1': [4, 5, 3, np.nan, 2],
        'key2': [1, 2, np.nan, 4, 5],
        'key3': [1, 2, 3, 'j', 'k'],
    },
    index=['a', 'b', 'c', 'd', 'e'],
)
print(df)
print(df['key1'].dtype, df['key2'].dtype, df['key3'].dtype)
print('-----')

# np.nan marks a missing value; .mean() computes the average.
# numeric_only=True restricts the calculation to the numeric columns.
# Without it, recent pandas releases raise TypeError on the mixed-type
# key3 column instead of silently skipping it.
m1 = df.mean(numeric_only=True)
print(m1, type(m1))
# A single column can be averaged by selecting it first.
print('单独统计一列:', df['key2'].mean())
print('-----')

# axis defaults to 0 (one result per column); axis=1 aggregates along
# each row instead, giving one result per index label.
m2 = df.mean(axis=1, numeric_only=True)
print(m2)
print('-----')

# skipna controls whether NaN is ignored (default True). With
# skipna=False, any column containing NaN yields NaN as its result.
m3 = df.mean(skipna=False, numeric_only=True)
print(m3)
print('-----')
输出:
key1 key2 key3 a 4.0 1.0 1 b 5.0 2.0 2 c 3.0 NaN 3 d NaN 4.0 j e 2.0 5.0 k float64 float64 object ----- key1 3.5 key2 3.0 dtype: float64 <class 'pandas.core.series.Series'> 单独统计一列: 3.0 ----- a 2.5 b 3.5 c 3.0 d 4.0 e 3.5 dtype: float64 ----- key1 NaN key2 NaN dtype: float64 -----
# Main numerical methods, usable on both Series and DataFrame (part 1)
df = pd.DataFrame({
    'key1': np.arange(10),
    'key2': np.random.rand(10) * 10,
})
print(df)
print('-----')

# Number of non-NA values per column.
non_na_counts = df.count()
print(non_na_counts, '→ count统计非Na值的数量\n')

# Column-wise minimum, and the maximum of key2 alone.
col_min = df.min()
key2_max = df['key2'].max()
print(col_min, '→ min统计最小值\n', key2_max, '→ max统计最大值\n')

# Quantile; q selects the position (0.75 here).
upper_quartile = df.quantile(q=0.75)
print(upper_quartile, '→ quantile统计分位数,参数q确定位置\n')

col_sum = df.sum()
print(col_sum, '→ sum求和\n')

col_mean = df.mean()
print(col_mean, '→ mean求平均值\n')

# Median, i.e. the 50% quantile.
col_median = df.median()
print(col_median, '→ median求算数中位数,50%分位数\n')

# Standard deviation and variance.
col_std = df.std()
col_var = df.var()
print(col_std, '\n', col_var, '→ std,var分别求标准差,方差\n')

# Sample skewness.
col_skew = df.skew()
print(col_skew, '→ skew样本的偏度\n')

# Sample kurtosis.
col_kurt = df.kurt()
print(col_kurt, '→ kurt样本的峰度\n')
输出:
key1 key2 0 0 4.667989 1 1 4.336625 2 2 0.746852 3 3 9.670919 4 4 8.732045 5 5 0.013751 6 6 8.963752 7 7 0.279303 8 8 8.586821 9 9 8.899657 ----- key1 10 key2 10 dtype: int64 → count统计非Na值的数量 key1 0.000000 key2 0.013751 dtype: float64 → min统计最小值 9.67091932107 → max统计最大值 key1 6.750000 key2 8.857754 dtype: float64 → quantile统计分位数,参数q确定位置 key1 45.000000 key2 54.897714 dtype: float64 → sum求和 key1 4.500000 key2 5.489771 dtype: float64 → mean求平均值 key1 4.500000 key2 6.627405 dtype: float64 → median求算数中位数,50%分位数 key1 3.027650 key2 3.984945 dtype: float64 key1 9.166667 key2 15.879783 dtype: float64 → std,var分别求标准差,方差 key1 0.000000 key2 -0.430166 dtype: float64 → skew样本的偏度 key1 -1.200000 key2 -1.800296 dtype: float64 → kurt样本的峰度
# Main numerical methods, usable on both Series and DataFrame (part 2)

# Running sums of key1 and key2, stored in new *_s columns.
for col in ('key1', 'key2'):
    df[col + '_s'] = df[col].cumsum()
print(df, '→ cumsum样本的累计和\n')

# Running products of key1 and key2, stored in new *_p columns.
for col in ('key1', 'key2'):
    df[col + '_p'] = df[col].cumprod()
print(df, '→ cumprod样本的累计积\n')

# Running maximum / minimum. These are applied to the whole frame, so
# the values shown for every column (key1 and key2 included) are
# replaced by the running extreme in the printed result.
running_max = df.cummax()
running_min = df.cummin()
print(running_max, '\n', running_min, '→ cummax,cummin分别求累计最大值,累计最小值\n')
输出:
key1 key2 key1_s key2_s 0 0 4.667989 0 4.667989 1 1 4.336625 1 9.004614 2 2 0.746852 3 9.751466 3 3 9.670919 6 19.422386 4 4 8.732045 10 28.154431 5 5 0.013751 15 28.168182 6 6 8.963752 21 37.131934 7 7 0.279303 28 37.411236 8 8 8.586821 36 45.998057 9 9 8.899657 45 54.897714 → cumsum样本的累计和 key1 key2 key1_s key2_s key1_p key2_p 0 0 4.667989 0 4.667989 0 4.667989 1 1 4.336625 1 9.004614 0 20.243318 2 2 0.746852 3 9.751466 0 15.118767 3 3 9.670919 6 19.422386 0 146.212377 4 4 8.732045 10 28.154431 0 1276.733069 5 5 0.013751 15 28.168182 0 17.556729 6 6 8.963752 21 37.131934 0 157.374157 7 7 0.279303 28 37.411236 0 43.955024 8 8 8.586821 36 45.998057 0 377.433921 9 9 8.899657 45 54.897714 0 3359.032396 → cumprod样本的累计积 key1 key2 key1_s key2_s key1_p key2_p 0 0.0 4.667989 0.0 4.667989 0.0 4.667989 1 1.0 4.667989 1.0 9.004614 0.0 20.243318 2 2.0 4.667989 3.0 9.751466 0.0 20.243318 3 3.0 9.670919 6.0 19.422386 0.0 146.212377 4 4.0 9.670919 10.0 28.154431 0.0 1276.733069 5 5.0 9.670919 15.0 28.168182 0.0 1276.733069 6 6.0 9.670919 21.0 37.131934 0.0 1276.733069 7 7.0 9.670919 28.0 37.411236 0.0 1276.733069 8 8.0 9.670919 36.0 45.998057 0.0 1276.733069 9 9.0 9.670919 45.0 54.897714 0.0 3359.032396 key1 key2 key1_s key2_s key1_p key2_p 0 0.0 4.667989 0.0 4.667989 0.0 4.667989 1 0.0 4.336625 0.0 4.667989 0.0 4.667989 2 0.0 0.746852 0.0 4.667989 0.0 4.667989 3 0.0 0.746852 0.0 4.667989 0.0 4.667989 4 0.0 0.746852 0.0 4.667989 0.0 4.667989 5 0.0 0.013751 0.0 4.667989 0.0 4.667989 6 0.0 0.013751 0.0 4.667989 0.0 4.667989 7 0.0 0.013751 0.0 4.667989 0.0 4.667989 8 0.0 0.013751 0.0 4.667989 0.0 4.667989 9 0.0 0.013751 0.0 4.667989 0.0 4.667989 → cummax,cummin分别求累计最大值,累计最小值
# Unique values: .unique()
letters = list('asdvasdcfgg')
s = pd.Series(letters)

# .unique() returns an array holding each distinct value once.
sq = s.unique()
print(s)
print(sq, type(sq))

# Wrapping the array in pd.Series turns it back into a new Series.
print(pd.Series(sq))

# Sort the unique values in place and show the reordered array.
sq.sort()
print(sq)
输出:
0 a 1 s 2 d 3 v 4 a 5 s 6 d 7 c 8 f 9 g 10 g dtype: object ['a' 's' 'd' 'v' 'c' 'f' 'g'] <class 'numpy.ndarray'> 0 a 1 s 2 d 3 v 4 c 5 f 6 g dtype: object ['a' 'c' 'd' 'f' 'g' 's' 'v']
# Value counts: .value_counts()
# Produces a new Series that maps each distinct value to the number of
# times it occurs. The sort parameter defaults to True; it is switched
# off here so the counts are not ordered by frequency.
sc = s.value_counts(
    sort=False,
)
print(sc)
输出:
s 2 d 2 v 1 c 1 a 2 g 2 f 1 dtype: int64
# Membership test: .isin()
s = pd.Series(np.arange(10, 15))
df = pd.DataFrame({
    'key1': list('asdcbvasd'),
    'key2': np.arange(4, 13),
})
print(s)
print(df)
print('-----')

# The candidate values are passed as a list. The result is a boolean
# Series (or DataFrame) with the same shape as the caller.
series_mask = s.isin([5, 14])
print(series_mask)
frame_mask = df.isin(['a', 'bc', '10', 8])
print(frame_mask)
输出:
0 10 1 11 2 12 3 13 4 14 dtype: int32 key1 key2 0 a 4 1 s 5 2 d 6 3 c 7 4 b 8 5 v 9 6 a 10 7 s 11 8 d 12 ----- 0 False 1 False 2 False 3 False 4 True dtype: bool key1 key2 0 True False 1 False False 2 False False 3 False False 4 False True 5 False False 6 True False 7 False False 8 False False
''' 【课程2.15】 文本数据 Pandas针对字符串配备的一套方法,使其易于对数组的每个元素进行操作 '''
# String methods are reached through .str; missing / NA values are
# skipped automatically and stay NaN in the result.
s = pd.Series(['A', 'b', 'C', 'bbhello', '123', np.nan, 'hj'])
df = pd.DataFrame({
    'key1': list('abcdef'),
    'key2': ['hee', 'fv', 'w', 'hija', '123', np.nan],
})
print(s)
print(df)
print('-----')

# Call string methods directly via .str, on a Series or on a single
# DataFrame column.
b_counts = s.str.count('b')
print(b_counts)
key2_upper = df['key2'].str.upper()
print(key2_upper)
print('-----')

# df.columns is an Index object, which exposes .str as well.
df.columns = df.columns.str.upper()
print(df)
输出:
0 A 1 b 2 C 3 bbhello 4 123 5 NaN 6 hj dtype: object key1 key2 0 a hee 1 b fv 2 c w 3 d hija 4 e 123 5 f NaN ----- 0 0.0 1 1.0 2 0.0 3 2.0 4 0.0 5 NaN 6 0.0 dtype: float64 0 HEE 1 FV 2 W 3 HIJA 4 123 5 NaN Name: key2, dtype: object ----- KEY1 KEY2 0 a hee 1 b fv 2 c w 3 d hija 4 e 123 5 f NaN
# Common string methods (1) - lower, upper, len, startswith, endswith
s = pd.Series(['A', 'b', 'bbhello', '123', np.nan])
print(s.str.lower(), '→ lower小写\n')
print(s.str.upper(), '→ upper大写\n')
print(s.str.len(), '→ len字符长度\n')
# The label names the prefix that is actually tested ('b'); it used to
# say 'a', which contradicted the call.
print(s.str.startswith('b'), '→ 判断起始是否为b\n')
print(s.str.endswith('3'), '→ 判断结束是否为3\n')
输出:
0 a 1 b 2 bbhello 3 123 4 NaN dtype: object → lower小写 0 A 1 B 2 BBHELLO 3 123 4 NaN dtype: object → upper大写 0 1.0 1 1.0 2 7.0 3 3.0 4 NaN dtype: float64 → len字符长度 0 False 1 True 2 True 3 False 4 NaN dtype: object → 判断起始是否为a 0 False 1 False 2 False 3 True 4 NaN dtype: object → 判断结束是否为3
# Common string methods (2) - strip
s = pd.Series([' jack', 'jill ', ' jesse ', 'frank'])
padded_columns = [' Column A ', ' Column B ']
df = pd.DataFrame(
    np.random.randn(3, 2),
    columns=padded_columns,
    index=range(3),
)
print(s)
print(df)
print('-----')

# strip removes whitespace on both ends, lstrip only on the left,
# rstrip only on the right.
both_trimmed = s.str.strip()
print(both_trimmed)
left_trimmed = s.str.lstrip()
print(left_trimmed)
right_trimmed = s.str.rstrip()
print(right_trimmed)

# The leading and trailing spaces of the column labels are removed;
# the space inside each label is kept.
df.columns = df.columns.str.strip()
print(df)
输出:
0 jack 1 jill 2 jesse 3 frank dtype: object Column A Column B 0 0.647766 0.094747 1 0.342940 -0.660643 2 1.183315 -0.143729 ----- 0 jack 1 jill 2 jesse 3 frank dtype: object 0 jack 1 jill 2 jesse 3 frank dtype: object 0 jack 1 jill 2 jesse 3 frank dtype: object Column A Column B 0 0.647766 0.094747 1 0.342940 -0.660643 2 1.183315 -0.143729
# Common string methods (3) - replace
df = pd.DataFrame(
    np.random.randn(3, 2),
    columns=[' Column A ', ' Column B '],
    index=range(3),
)

# Replace every space in the column labels with a dash.
dashed_labels = df.columns.str.replace(' ', '-')
df.columns = dashed_labels
print(df)

# n limits how many occurrences are replaced; with n=1 only the first
# dash of each label is substituted.
prefixed_labels = df.columns.str.replace('-', 'hehe', n=1)
df.columns = prefixed_labels
print(df)
输出:
-Column-A- -Column-B- 0 1.855227 -0.519479 1 -0.400376 -0.421383 2 -0.293797 -0.432481 heheColumn-A- heheColumn-B- 0 1.855227 -0.519479 1 -0.400376 -0.421383 2 -0.293797 -0.432481
# Common string methods (4) - split, rsplit
s = pd.Series(['a,b,c', '1,2,3', ['a,,,c'], np.nan])

# Works like str.split, applied element by element.
print(s.str.split(','))
print('-----')

# Plain indexing on the result returns the list stored at that label.
print(s.str.split(',')[0])
print('-----')

# Elements of the split lists can be picked with .str[...] or .str.get.
print(s.str.split(',').str[0])
print(s.str.split(',').str.get(1))
print('-----')

# expand=True spreads the pieces across the columns of a DataFrame.
# n caps the number of splits. rsplit behaves like split but works
# from the end of the string towards the start.
print(s.str.split(',', expand=True))
print(s.str.split(',', expand=True, n=1))
print(s.str.rsplit(',', expand=True, n=1))
print('-----')

# split applied to a single column of a DataFrame.
df = pd.DataFrame({
    'key1': ['a,b,c', '1,2,3', [':,., ']],
    'key2': ['a-b-c', '1-2-3', [':-.- ']],
})
print(df['key2'].str.split('-'))
输出:
0 [a, b, c] 1 [1, 2, 3] 2 NaN 3 NaN dtype: object ----- ['a', 'b', 'c'] ----- 0 a 1 1 2 NaN 3 NaN dtype: object 0 b 1 2 2 NaN 3 NaN dtype: object ----- 0 1 2 0 a b c 1 1 2 3 2 NaN None None 3 NaN None None 0 1 0 a b,c 1 1 2,3 2 NaN None 3 NaN None 0 1 0 a,b c 1 1,2 3 2 NaN None 3 NaN None ----- 0 [a, b, c] 1 [1, 2, 3] 2 NaN Name: key2, dtype: object
# String indexing through .str
s = pd.Series(['A', 'b', 'C', 'bbhello', '123', np.nan, 'hj'])
df = pd.DataFrame({
    'key1': list('abcdef'),
    'key2': ['hee', 'fv', 'w', 'hija', '123', np.nan],
})

# Indexing after .str follows the same rules as indexing a plain string.
first_chars = s.str[0]      # first character of each element
print(first_chars)
first_two = s.str[:2]       # first two characters of each element
print(first_two)
key2_first = df['key2'].str[0]
print(key2_first)
输出:
0 A 1 b 2 C 3 b 4 1 5 NaN 6 h dtype: object 0 A 1 b 2 C 3 bb 4 12 5 NaN 6 hj dtype: object 0 h 1 f 2 w 3 h 4 1 5 NaN Name: key2, dtype: object