Python之Pandas（5）

#数据分组
#根据某些条件将数据进行拆分成组
#每个组独立应用函数
#将结果合并到一个数据结构中
import numpy as np
import pandas as pd

In [4]:

#分组
df = pd.DataFrame({'A':['foo','bar','foo','bar','foo','foo','bar','bar'],
                  'B':['one','two','three','two','two','one','three','one'],
                  'C':np.random.randn(8),
                  'D':np.random.randn(8)})
In [5]:

df
Out[5]:
A	B	C	D
0	foo	one	0.444202	1.406586
1	bar	two	-0.311666	1.196347
2	foo	three	0.440234	0.949232
3	bar	two	-1.578572	2.464325
4	foo	two	-1.353510	0.773391
5	foo	one	0.307378	-0.492570
6	bar	three	1.446811	-2.350776
7	bar	one	-2.097978	0.296710
In [10]:

df.groupby(['A','B']).mean()
Out[10]:
C	D
A	B		
bar	one	-2.097978	0.296710
three	1.446811	-2.350776
two	-0.945119	1.830336
foo	one	0.375790	0.457008
three	0.440234	0.949232
two	-1.353510	0.773391
In [26]:

df.groupby(['A']).mean()
Out[26]:
C	D
A		
bar	-0.635351	0.401652
foo	-0.040424	0.659160
In [21]:

list(df.groupby(['A']))[1]list(df.groupby(['A']))[1]
Out[21]:
('foo',      A      B         C         D
 0  foo    one  0.444202  1.406586
 2  foo  three  0.440234  0.949232
 4  foo    two -1.353510  0.773391
 5  foo    one  0.307378 -0.492570)
In [24]:

list(df.groupby(['A']))[0]
Out[24]:
('bar',      A      B         C         D
 1  bar    two -0.311666  1.196347
 3  bar    two -1.578572  2.464325
 6  bar  three  1.446811 -2.350776
 7  bar    one -2.097978  0.296710)
In [28]:

df.groupby(['A','B']).size()
Out[28]:
A    B    
bar  one      1
     three    1
     two      2
foo  one      2
     three    1
     two      1
dtype: int64
In [31]:

df.groupby(['A']).groups
Out[31]:
{'bar': [1, 3, 6, 7], 'foo': [0, 2, 4, 5]}
In [51]:

s = pd.Series([1,2,3,10,20,30],index=[1,2,3,1,2,3])
grouped = s.groupby(level=0)
print(s)
print(grouped.first())#显示非NaN的第一个值
print(grouped.last())#显示非NaN的最后一个值
print(grouped.sum())#显示非NaN的和
print(grouped.mean)#平均值
print(grouped.median())#中值
print(grouped.count())#计数
print(grouped.min())#最小
print(grouped.max())#最大
print(grouped.std())#标准差
print(grouped.var())#方差
print(grouped.prod())#积
1     1
2     2
3     3
1    10
2    20
3    30
dtype: int64
1    1
2    2
3    3
dtype: int64
1    10
2    20
3    30
dtype: int64
1    11
2    22
3    33
dtype: int64
<bound method GroupBy.mean of <pandas.core.groupby.SeriesGroupBy object at 0x00000000049BBA90>>
1     5.5
2    11.0
3    16.5
dtype: float64
1    2
2    2
3    2
dtype: int64
1    1
2    2
3    3
dtype: int64
1    10
2    20
3    30
dtype: int64
1     6.363961
2    12.727922
3    19.091883
dtype: float64
1     40.5
2    162.0
3    364.5
dtype: float64
1    10
2    40
3    90
dtype: int64
In [53]:

df = pd.DataFrame({'a':[1,1,2,2],
                  'b':np.random.rand(4),
                  'c':np.random.rand(4),
                  'd':np.random.rand(4)})
df
Out[53]:
a	b	c	d
0	1	0.390076	0.664425	0.493986
1	1	0.534739	0.378918	0.813577
2	2	0.894389	0.680243	0.294173
3	2	0.741806	0.223494	0.160900
In [56]:

print(df.groupby('a').agg(['mean',np.sum]))
          b                   c                   d          
       mean       sum      mean       sum      mean       sum
a                                                            
1  0.462407  0.924815  0.521671  1.043342  0.653782  1.307563
2  0.818098  1.636195  0.451869  0.903737  0.227536  0.455073
In [58]:

print(df.groupby('a')['b'].agg({'result1':np.mean,
                                'result2':np.sum}))
    result1   result2
a                    
1  0.462407  0.924815
2  0.818098  1.636195
In [61]:

#小作业
df = pd.DataFrame({'A':['one','two','three','one','two','three','one','two'],
                  'B':['h','h','h','h','f','f','f','f'],
                  'C':[10,12,14,16,18,20,22,24],
                  'D':np.random.randn(8),
                  'E':np.random.rand(8)})
df

Out[61]:
A	B	C	D	E
0	one	h	10	-1.188879	0.771559
1	two	h	12	-0.414063	0.743417
2	three	h	14	-0.241158	0.182954
3	one	h	16	0.381358	0.100378
4	two	f	18	-0.101517	0.291719
5	three	f	20	-0.808872	0.007264
6	one	f	22	-1.164982	0.351209
7	two	f	24	-1.144294	0.831537
In [62]:

df.groupby('A')['C','D'].mean()
Out[62]:
C	D
A		
one	16	-0.657501
three	17	-0.525015
two	18	-0.553291
In [63]:

df.groupby(['A','B'])['D','E'].sum()
Out[63]:
D	E
A	B		
one	f	-1.164982	0.351209
h	-0.807521	0.871937
three	f	-0.808872	0.007264
h	-0.241158	0.182954
two	f	-1.245811	1.123255
h	-0.414063	0.743417
In [78]:

print(dict(list(df.groupby('A'))))
{'three':        A  B   C         D         E
2  three  h  14 -0.241158  0.182954
5  three  f  20 -0.808872  0.007264, 'two':      A  B   C         D         E
1  two  h  12 -0.414063  0.743417
4  two  f  18 -0.101517  0.291719
7  two  f  24 -1.144294  0.831537, 'one':      A  B   C         D         E
0  one  h  10 -1.188879  0.771559
3  one  h  16  0.381358  0.100378
6  one  f  22 -1.164982  0.351209}
In [72]:

df2 = df[['C','D']]
df2['sum'] = df2.sum(axis = 1)
df2
C:\Program Files\Anaconda3\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
Out[72]:
C	D	sum
0	10	-1.188879	8.811121
1	12	-0.414063	11.585937
2	14	-0.241158	13.758842
3	16	0.381358	16.381358
4	18	-0.101517	17.898483
5	20	-0.808872	19.191128
6	22	-1.164982	20.835018
7	24	-1.144294	22.855706
猜你喜欢