1.汇总和计算描述统计
frame = DataFrame([ [1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3] ], index=list('abcd'), columns=['one', 'two']) print(frame) print(frame.sum()) print(frame.sum(axis=1))one two
a 1.40 NaN
b 7.10 -4.5
c NaN NaN
d 0.75 -1.3
求和时忽略了NaN
one 9.25
two -5.80
dtype: float64
a 1.40
b 2.60
c 0.00
d -0.55
dtype: float64
print(frame.mean(axis=1, skipna=False)) #对行求均值,不忽略NaN
a NaN
b 1.300
c NaN
d -0.275
dtype: float64
print(frame.idxmax()) #每列最大值的索引
one b
two d
dtype: object
print(frame.cumsum())one two
a 1.40 NaN
b 8.50 -4.5
c NaN NaN
d 9.25 -5.8
对列求累加,默认忽略nan
describe()方法用于一次性产生多个汇总统计:
print(frame.describe()) #默认忽略nan
one two
count 3.000000 2.000000
mean 3.083333 -2.900000
std 3.493685 2.262742
min 0.750000 -4.500000
25% 1.075000 -3.700000
50% 1.400000 -2.900000
75% 4.250000 -2.100000
max 7.100000 -1.300000
对于非数值型数据,describe()会产生另外一种汇总统计:
obj = Series( ['a', 'a', 'b', 'c'] * 4 ) print(obj)0 a
1 a
2 b
3 c
4 a
5 a
6 b
7 c
8 a
9 a
10 b
11 c
12 a
13 a
14 b
15 c
print(obj.describe())count 16
unique 3
top a
freq 8
dtype: object
2.唯一值,值计数以及成员资格
obj = Series( ['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'] ) uniques = obj.unique() print(uniques) obj2 = obj.value_counts() #计算各值出现频率 print(obj2)
['c' 'a' 'd' 'b']
a 3
b 2
d 1
还可以使用顶级pandas方法:
obj2 = pd.value_counts(obj.values, sort=False) #计算各值出现频率 print(obj2)c 3
a 3
d 1
b 2
obj = Series( ['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'] ) mask = obj.isin(['b', 'c']) print(mask)0 True
1 False
2 False
3 False
4 False
5 True
6 True
7 True
8 True
3.处理缺失数据
obj = Series( [1, np.nan, 3.5, np.nan, 7] ) print(obj) obj1 = obj.dropna() print(obj1)0 1.0
1 NaN
2 3.5
3 NaN
4 7.0
dtype: float64
0 1.0
2 3.5
4 7.0
dtype: float64
也可以通过布尔值索引达到目的:
obj = Series( [1, np.nan, 3.5, np.nan, 7] ) obj1 = obj[obj.notnull()] print(obj1)0 1.0
2 3.5
4 7.0
dtype: float64
对DataFrame而言:
frame = DataFrame([ [1, 6.5, 3], [1, np.nan, np.nan], [np.nan, np.nan, np.nan], [np.nan, 6.5, 3] ]) print(frame) frame1 = frame.dropna(how='all') #去除全是NaN的行 frame2 = frame.dropna(how='any') #去除含有NaN的行 print(frame1) print(frame2)0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3.0
0 1.0 6.5 3.0
1 1.0 NaN NaN
3 NaN 6.5 3.0
0 1.0 6.5 3.0
对列操作只需要传入axis=1
frame = DataFrame([ [1, 6.5, np.nan], [1, np.nan, np.nan], [1, np.nan, np.nan], [2, 6.5, np.nan] ]) print(frame) frame1 = frame.dropna(axis=1, how='all') #dropna默认axis=0,how='any' frame2 = frame.dropna(axis=1, how='any') print(frame1) print(frame2)0 1 2
0 1 6.5 NaN
1 1 NaN NaN
2 1 NaN NaN
3 2 6.5 NaN
0 1 6.5
1 1 NaN
2 1 NaN
3 2 6.5
0 1
1 1
2 1
3 2
4.填充缺失数据
frame = DataFrame(np.random.randn(7, 3)) frame.ix[:4, 1] = np.nan frame.ix[:2, 2] = np.nan print(frame) frame1 = frame.fillna(0) print(frame1)0 1 2
0 0.174681 NaN NaN
1 -0.748038 NaN NaN
2 -0.385075 NaN NaN
3 1.941320 NaN 1.134585
4 1.159647 NaN -0.349412
5 -0.109030 -0.238049 -0.091797
6 0.815965 -1.348901 1.771237
0 0.174681 0.000000 0.000000
1 -0.748038 0.000000 0.000000
2 -0.385075 0.000000 0.000000
3 1.941320 0.000000 1.134585
4 1.159647 0.000000 -0.349412
5 -0.109030 -0.238049 -0.091797
6 0.815965 -1.348901 1.771237
frame = DataFrame(np.random.randn(7, 3)) frame.ix[:4, 1] = np.nan frame.ix[:2, 2] = np.nan print(frame) frame2 = frame.fillna( {1: 0.5, 2: -1} ) #传入字典,1列用0.5填补缺失值,2列用-1填补 print(frame2)0 1 2
0 0.581664 NaN NaN
1 -0.283730 NaN NaN
2 -0.121095 NaN NaN
3 -1.239776 NaN 0.782291
4 1.283248 NaN 1.841159
5 0.305333 -0.195191 -0.828173
6 0.583356 -0.374066 -2.179363
0 0.581664 0.500000 -1.000000
1 -0.283730 0.500000 -1.000000
2 -0.121095 0.500000 -1.000000
3 -1.239776 0.500000 0.782291
4 1.283248 0.500000 1.841159
5 0.305333 -0.195191 -0.828173
6 0.583356 -0.374066 -2.179363
fillna()默认是创建副本并对副本修改并返回,但也可设置inplace=True直接对现有对象进行修改:
frame = DataFrame(np.random.randn(7, 3)) frame.ix[:4, 1] = np.nan frame.ix[:2, 2] = np.nan print(frame) frame.fillna(0, inplace=True) print(frame)0 1 2
0 1.648203 NaN NaN
1 -1.149365 NaN NaN
2 -0.277807 NaN NaN
3 0.920745 NaN -0.214246
4 0.994661 NaN -0.032028
5 0.158606 -1.598697 0.922042
6 0.313768 -0.549528 0.894851
0 1.648203 0.000000 0.000000
1 -1.149365 0.000000 0.000000
2 -0.277807 0.000000 0.000000
3 0.920745 0.000000 -0.214246
4 0.994661 0.000000 -0.032028
5 0.158606 -1.598697 0.922042
6 0.313768 -0.549528 0.894851
frame = DataFrame(np.random.randn(6, 3)) frame.ix[2:, 1] = np.nan frame.ix[4:, 2] = np.nan print(frame) frame1 = frame.fillna(method='ffill') print(frame1) frame2 = frame.fillna(method='ffill', limit=2) #限制向前传播2个 print(frame2)0 1 2
0 -1.802197 -0.111689 -0.776186
1 -2.262859 -1.466189 0.690359
2 0.213340 NaN 2.357712
3 0.597074 NaN 1.355261
4 0.255808 NaN NaN
5 -0.303095 NaN NaN
0 -1.802197 -0.111689 -0.776186
1 -2.262859 -1.466189 0.690359
2 0.213340 -1.466189 2.357712
3 0.597074 -1.466189 1.355261
4 0.255808 -1.466189 1.355261
5 -0.303095 -1.466189 1.355261
0 -1.802197 -0.111689 -0.776186
1 -2.262859 -1.466189 0.690359
2 0.213340 -1.466189 2.357712
3 0.597074 -1.466189 1.355261
4 0.255808 NaN 1.355261
5 -0.303095 NaN 1.355261
fillna()可以实现很多功能,比如将NaN替换为均值:
obj = Series( [1, np.nan, 3.5, np.nan, 7] ) obj1 = obj.fillna(obj.mean()) print(obj1)0 1.000000
1 3.833333
2 3.500000
3 3.833333
4 7.000000
dtype: float64
5.层次化索引
obj = Series(np.random.randn(10), index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'], [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]]) print(obj) print(obj.index)a 1 -2.499392
2 -0.502652
3 0.639299
b 1 -0.085652
2 -0.168285
3 0.124634
c 1 -0.229483
2 1.757817
d 2 0.635124
3 -0.059686
dtype: float64
MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])
print(obj['b']) print(obj['b':'c']) print(obj.ix[ ['b', 'd'] ]) #不可以print(obj[ ['b', 'd'] ])1 -0.118772
2 0.708275
3 0.246212
dtype: float64
2 0.708275
3 0.246212
c 1 0.746326
2 -0.561385
dtype: float64
2 0.708275
3 0.246212
d 2 0.021336
3 1.417169
dtype: float64
还可以更为具体的选取:
print(obj[:, 2])a -0.390513
b -0.371184
c -0.409489
d -0.506282
dtype: float64
层次化索引的数据可以通过unstack()方法变为DataFrame,DataFrame也可以通过stack()方法变为层次化索引数据:
obj1 = obj.unstack() #同样是创建副本然后操作 print(obj1)1 2 3
a -0.026853 -0.543384 -0.402048
b -2.758856 -0.899349 0.236109
c 0.885205 -0.211624 NaN
d NaN 0.091227 -0.201358
obj2 = obj1.stack() print(obj2)a 1 -1.104728
2 1.283418
3 0.617252
b 1 -0.828558
2 -1.289287
3 -2.625240
c 1 -0.878695
2 -0.872315
d 2 -1.600262
3 -1.095252
dtype: float64
对于DataFrame,每条轴都有分层索引:
frame = DataFrame(np.arange(12).reshape((4,3)), index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]], columns=[['JS', 'JS', 'ZJ'], ['nj', 'yz', 'hz']]) frame.index.names = ['key1', 'key2'] frame.columns.names = ['province', 'city'] print(frame)province JS ZJ
city nj yz hz
key1 key2
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
print(frame['JS'])city nj yz
key1 key2
a 1 0 1
2 3 4
b 1 6 7
2 9 10