import numpy as np
import pandas as pd
from pandas import Series,DataFrame
一、GroupBy的介绍
groupby的过程可以概括为"拆分-应用-合并"(split-apply-combine):先将DataFrame按照指定的键(列名、Series、数组、字典、函数或索引层级)进行分组,然后在每个组上应用函数,最后将各组的结果合并。Pandas中的groupby()本身只返回一个GroupBy对象,它只记录关于分组的信息,并不进行任何实质的计算;只有在该对象上调用sum()、mean()等方法时,才会真正进行计算。
二、GroupBy的方式
# Demo frame: two string key columns plus four columns of five
# standard-normal random values each.
group_keys = {
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
}
random_values = {
    name: np.random.randn(5)
    for name in ('data1', 'data2', 'data3', 'data4')
}
df = DataFrame(dict(group_keys, **random_values))
print(df)
data1 data2 data3 data4 key1 key2
0 0.180213 -0.738877 0.486369 2.359576 a one
1 -0.594930 0.139863 -0.150285 -1.128795 a two
2 1.325142 1.194908 0.060578 -0.336161 b one
3 1.610629 -0.687301 -0.227717 -0.909036 b two
4 0.517015 -1.929637 0.990730 -0.677321 a one
1.使用Series进行groupby
# Group the values of the 'data1' column using the 'key1' column as the key.
df['data1'].groupby(df['key1']) # df['key1'] is a Series
<pandas.core.groupby.SeriesGroupBy object at 0x000000B6B6EB5DA0>
2.使用列名进行groupby
# Group the whole frame by the column named 'key1'.
df.groupby('key1')
<pandas.core.groupby.DataFrameGroupBy object at 0x000000B6AFCF7EB8>
3.使用外部的数据进行groupby
# External grouping labels that are not a column of df: five values,
# one for each of df's five rows.
years = np.array([2005,2005,2006,2005,2006])
# Group the rows of df by that array.
df.groupby(years)
<pandas.core.groupby.DataFrameGroupBy object at 0x000000B6B6ECE278>
4.使用多个列进行groupby
# Group by the combination of the 'key1' and 'key2' columns.
df.groupby(['key1','key2'])
<pandas.core.groupby.DataFrameGroupBy object at 0x000000B6B6ECE240>
5.使用字典进行groupby
将data1和data3分为组1,data2和data4分为组2;没有出现在字典中的列(key1、key2)不属于任何组,不参与计算
# Column label -> group id: data1/data3 form group 1, data2/data4 form group 2.
mapping = {'data1': 1, 'data2': 2, 'data3': 1, 'data4': 2}
# DataFrame.groupby(..., axis=1) is deprecated since pandas 2.1, so the
# columns are grouped by transposing instead: the mapped columns become rows,
# are grouped by the dict and summed, and the result is transposed back so
# there is again one row per original row.
# Only the mapped columns are selected, which keeps the transposed frame
# purely numeric; the group labels come out as the ints 1 and 2.
print(df[list(mapping)].T.groupby(mapping).sum().T)
1.0 2.0
0 0.666582 1.620699
1 -0.745215 -0.988931
2 1.385720 0.858747
3 1.382913 -1.596337
4 1.507745 -2.606958
Series也可以看作一个从索引到值的映射(类似字典),因此同样可以用来分组
# A Series whose index holds the column labels and whose values are the
# group ids can be used as the grouping key in the same way as the dict.
s = Series(mapping)
# DataFrame.groupby(..., axis=1) is deprecated since pandas 2.1: select the
# columns named in the Series index, transpose so they become rows, group
# those rows by the Series, sum, and transpose back.
print(df[s.index].T.groupby(s).sum().T)
1.0 2.0
0 0.666582 1.620699
1 -0.745215 -0.988931
2 1.385720 0.858747
3 1.382913 -1.596337
4 1.507745 -2.606958
6.使用函数进行groupby
按列名的最后一个字符进行分组:按此规则data1和key1同属组'1',data2和key2同属组'2',data3、data4各自一组;key1、key2不是数值列,不参与求和,所以结果中每组只有data列的值
def f(x):
    """Return the last element of ``x``.

    Used as a grouping function: for a column label such as ``'data1'``
    this is its final character (``'1'``), which becomes the group key.
    """
    return x[-1]
# DataFrame.groupby(..., axis=1) is deprecated since pandas 2.1, so group the
# columns by transposing, grouping the rows with f, and transposing back.
# The non-numeric key columns are dropped first: they would share a group
# with float columns (e.g. 'key1' with 'data1') and cannot be summed with them.
print(df.select_dtypes(include='number').T.groupby(f).sum().T)
1 2 3 4
0 0.180213 -0.738877 0.486369 2.359576
1 -0.594930 0.139863 -0.150285 -1.128795
2 1.325142 1.194908 0.060578 -0.336161
3 1.610629 -0.687301 -0.227717 -0.909036
4 0.517015 -1.929637 0.990730 -0.677321
7.使用索引进行groupby
# Two-level column index, with a name assigned to each level.
first_level = ['US', 'US', 'US', 'JP', 'JP']
second_level = [1, 3, 5, 1, 3]
columns = pd.MultiIndex.from_arrays(
    [first_level, second_level],
    names=['cty', 'tenor'],
)
# 4 x 5 frame of standard-normal random values under that column index.
df = DataFrame(np.random.randn(4, 5), columns=columns)
print(df)
cty US JP
tenor 1 3 5 1 3
0 1.005013 1.354099 0.384804 -1.185332 -0.949672
1 -0.393205 -0.844469 -0.577132 0.967219 0.211839
2 0.935070 0.545067 -0.023603 0.761364 -0.710385
3 -2.446770 -0.264199 -0.421761 -0.508110 1.243657
# Count the columns under each value of the 'cty' column level, per row.
# DataFrame.groupby(..., axis=1) is deprecated since pandas 2.1, so the frame
# is transposed (column levels become row-index levels), grouped by the
# 'cty' level, counted, and transposed back.
print(df.T.groupby(level='cty').count().T)
cty JP US
0 2 3
1 2 3
2 2 3
3 2 3