#常用数学,统计方法
import numpy as np
import pandas as pd
In [7]:
df = pd.DataFrame({'key1':[4,5,3,np.nan,2],
'key2':[1,2,np.nan,4,5],
'key3':['a','b','c','d','e']})
print(df['key1'].dtype,df['key2'].dtype,df['key3'].dtype)
float64 float64 object
In [14]:
print(df)
#单一列的均值 axis默认为0,按列来计算
print(df.mean())
print(df['key2'].mean())
#axis为1,按照行来计算
print(df.mean(axis = 1))
key1 key2 key3
0 4.0 1.0 a
1 5.0 2.0 b
2 3.0 NaN c
3 NaN 4.0 d
4 2.0 5.0 e
key1 3.5
key2 3.0
dtype: float64
3.0
0 2.5
1 3.5
2 3.0
3 4.0
4 3.5
dtype: float64
In [30]:
#skipna 参数选择是否忽略NaN,默认为True,如果是False的列统计结果仍未NaN
df = pd.DataFrame(np.random.rand(20).reshape(10,2),columns=['a','b'])
#列计算
df['mean'] = df.mean(axis = 1)
#行计算
df.loc['mean'] = df.mean()
df
Out[30]:
a b mean
0 0.713674 0.378652 0.546163
1 0.881657 0.230902 0.556280
2 0.342403 0.473300 0.407851
3 0.321717 0.740015 0.530866
4 0.649596 0.021103 0.335350
5 0.011607 0.829877 0.420742
6 0.964894 0.209440 0.587167
7 0.338171 0.541400 0.439786
8 0.909710 0.121635 0.515672
9 0.934586 0.768681 0.851634
mean 0.606802 0.431500 0.519151
In [35]:
print("统计非NAN的值的数量\n",df.count())
统计非NAN的值的数量
a 11
b 11
mean 11
dtype: int64
In [36]:
print("最小值\n",df.min())
最小值
a 0.011607
b 0.021103
mean 0.335350
dtype: float64
In [37]:
print("最大值\n",df.max())
最大值
a 0.964894
b 0.829877
mean 0.851634
dtype: float64
In [38]:
print("求和\n",df.sum())
求和
a 6.674818
b 4.746505
mean 5.710661
dtype: float64
In [39]:
print("平均值\n",df.mean())
平均值
a 0.606802
b 0.431500
mean 0.519151
dtype: float64
In [40]:
print("中位数\n",df.median())
中位数
a 0.649596
b 0.431500
mean 0.519151
dtype: float64
In [41]:
print("标准差,方差\n",df.std(),df.var())
标准差,方差
a 0.315250
b 0.271696
mean 0.134008
dtype: float64 a 0.099382
b 0.073819
mean 0.017958
dtype: float64
In [42]:
print("skew样本偏度\n",df.skew())
skew样本偏度
a -0.549653
b 0.087022
mean 1.427140
dtype: float64
In [43]:
print("kurt样本偏度\n",df.kurt())
kurt样本偏度
a -0.743035
b -1.173489
mean 3.605701
dtype: float64
In [51]:
#主要数学计算方法
#累计和
df['a_sum'] = df['a'].cumsum()
df['b_sum'] = df['b'].cumsum()
#累计积
df['a_p'] = df['a'].cumprod()
df['b_p'] = df['b'].cumprod()
print(df)
#累计计算最大值和最小值
print(df.cummax(),"\n",df.cummin())
a b mean a_sum b_sum a_p b_p
0 0.713674 0.378652 0.546163 0.713674 0.378652 0.713674 0.378652
1 0.881657 0.230902 0.556280 1.595332 0.609554 0.629216 0.087432
2 0.342403 0.473300 0.407851 1.937735 1.082854 0.215445 0.041381
3 0.321717 0.740015 0.530866 2.259452 1.822868 0.069312 0.030623
4 0.649596 0.021103 0.335350 2.909048 1.843972 0.045025 0.000646
5 0.011607 0.829877 0.420742 2.920655 2.673849 0.000523 0.000536
6 0.964894 0.209440 0.587167 3.885549 2.883288 0.000504 0.000112
7 0.338171 0.541400 0.439786 4.223721 3.424688 0.000171 0.000061
8 0.909710 0.121635 0.515672 5.133431 3.546323 0.000155 0.000007
9 0.934586 0.768681 0.851634 6.068016 4.315004 0.000145 0.000006
mean 0.606802 0.431500 0.519151 6.674818 4.746505 0.000088 0.000002
a b mean a_sum b_sum a_p b_p
0 0.713674 0.378652 0.546163 0.713674 0.378652 0.713674 0.378652
1 0.881657 0.378652 0.556280 1.595332 0.609554 0.713674 0.378652
2 0.881657 0.473300 0.556280 1.937735 1.082854 0.713674 0.378652
3 0.881657 0.740015 0.556280 2.259452 1.822868 0.713674 0.378652
4 0.881657 0.740015 0.556280 2.909048 1.843972 0.713674 0.378652
5 0.881657 0.829877 0.556280 2.920655 2.673849 0.713674 0.378652
6 0.964894 0.829877 0.587167 3.885549 2.883288 0.713674 0.378652
7 0.964894 0.829877 0.587167 4.223721 3.424688 0.713674 0.378652
8 0.964894 0.829877 0.587167 5.133431 3.546323 0.713674 0.378652
9 0.964894 0.829877 0.851634 6.068016 4.315004 0.713674 0.378652
mean 0.964894 0.829877 0.851634 6.674818 4.746505 0.713674 0.378652
a b mean a_sum b_sum a_p b_p
0 0.713674 0.378652 0.546163 0.713674 0.378652 0.713674 0.378652
1 0.713674 0.230902 0.546163 0.713674 0.378652 0.629216 0.087432
2 0.342403 0.230902 0.407851 0.713674 0.378652 0.215445 0.041381
3 0.321717 0.230902 0.407851 0.713674 0.378652 0.069312 0.030623
4 0.321717 0.021103 0.335350 0.713674 0.378652 0.045025 0.000646
5 0.011607 0.021103 0.335350 0.713674 0.378652 0.000523 0.000536
6 0.011607 0.021103 0.335350 0.713674 0.378652 0.000504 0.000112
7 0.011607 0.021103 0.335350 0.713674 0.378652 0.000171 0.000061
8 0.011607 0.021103 0.335350 0.713674 0.378652 0.000155 0.000007
9 0.011607 0.021103 0.335350 0.713674 0.378652 0.000145 0.000006
mean 0.011607 0.021103 0.335350 0.713674 0.378652 0.000088 0.000002
In [74]:
#唯一值 得到唯一值
s = pd.Series(list("aabacdefg"))
print(s.unique())
print(s.count())
['a' 'b' 'c' 'd' 'e' 'f' 'g']
9
In [75]:
#成员资格 isin
print(s.isin(['a']))
0 True
1 True
2 False
3 True
4 False
5 False
6 False
7 False
8 False
dtype: bool
In [79]:
#小作业
ip = eval(input("please input a list:"))
s = pd.Series(ip)
def f(s):
s1 = s.unique()
if len(s1) == len(s):
print("yes")
else:
print("no")
f(s)
please input a list:1,2,3,4,5,1
no
#文本数据
import numpy as np
import pandas as pd
In [3]:
s = pd.Series(['a','b','c','hello','123',np.nan,'shit'])
df = pd.DataFrame({'key1':list('abcdef'),
'key2':['hee','a','hija','123','w',np.nan]})
In [5]:
print(s)
print(df)
0 a
1 b
2 c
3 hello
4 123
5 NaN
6 shit
dtype: object
key1 key2
0 a hee
1 b a
2 c hija
3 d 123
4 e w
5 f NaN
In [15]:
#调用字符串方法
print(s.str.upper())#过滤掉NaN
print(s.str.count('a'))
print(df['key1'].str.upper())
0 A
1 B
2 C
3 HELLO
4 123
5 NaN
6 SHIT
dtype: object
0 1.0
1 0.0
2 0.0
3 0.0
4 0.0
5 NaN
6 0.0
dtype: float64
0 A
1 B
2 C
3 D
4 E
5 F
Name: key1, dtype: object
In [21]:
#常用字符串方法
print(s.str.upper())
print(s.str.lower())
print(s.str.len())
print(s.str.startswith('a'))
print(s.str.endswith('f'))
#去掉字符串的空格 还可以是左空格或者是右空格
print(s.str.strip())
print(s.str.lstrip())
print(s.str.rstrip())
0 A
1 B
2 C
3 HELLO
4 123
5 NaN
6 SHIT
dtype: object
0 a
1 b
2 c
3 hello
4 123
5 NaN
6 shit
dtype: object
0 1.0
1 1.0
2 1.0
3 5.0
4 3.0
5 NaN
6 4.0
dtype: float64
0 True
1 False
2 False
3 False
4 False
5 NaN
6 False
dtype: object
0 False
1 False
2 False
3 False
4 False
5 NaN
6 False
dtype: object
0 a
1 b
2 c
3 hello
4 123
5 NaN
6 shit
dtype: object
0 a
1 b
2 c
3 hello
4 123
5 NaN
6 shit
dtype: object
0 a
1 b
2 c
3 hello
4 123
5 NaN
6 shit
dtype: object
Out[21]:
0 a
1 b
2 c
3 hello
4 123
5 NaN
6 shit
dtype: object
In [32]:
#replace n是替换的个数
df = pd.DataFrame(np.random.rand(3,2),columns=[' Colum A',' Colun B'],index=range(3))
df.columns.str.replace(' ','_',n=1)
Out[32]:
Index(['_Colum A', '_Colun B'], dtype='object')
In [44]:
#拆分
s = pd.Series(['a,b,c','1,2,3',['a...c'],np.nan])
print(s)
print(s.str.split(','))
#expand = True 分裂
print(s.str.split(',',expand = True))
0 a,b,c
1 1,2,3
2 [a...c]
3 NaN
dtype: object
0 [a, b, c]
1 [1, 2, 3]
2 NaN
3 NaN
dtype: object
0 1 2
0 a b c
1 1 2 3
2 NaN None None
3 NaN None None
In [67]:
df = pd.DataFrame({'key1':['a,b,c','1,2,3',['...,..,..']],
'key2':['a-b-c','1-2-3',['...-...-']]})
df['k200'] = df['key1'].str.split(',').str[0]
print(a)
df['k201'] = df['key1'].str.split(',').str[1]
df['k202'] = df['key1'].str.split(',').str[2]
df
0 a,b,c
1 1,2,3
2 NaN
Name: key1, dtype: object
Out[67]:
key1 key2 k200 k201 k202
0 a,b,c a-b-c a b c
1 1,2,3 1-2-3 1 2 3
2 [...,..,..] [...-...-] NaN NaN NaN