# Custom grouping rule
def get_score_group(score):
    """Map a numeric happiness score to a coarse bucket label.

    Args:
        score: A number comparable with ``<=``.

    Returns:
        ``'low'`` when ``score <= 4``, ``'middle'`` when ``4 < score <= 6``,
        and ``'high'`` otherwise. A NaN score fails both comparisons and
        therefore also falls into ``'high'``.
    """
    if score <= 4:
        return 'low'
    if score <= 6:
        return 'middle'
    return 'high'
# Method 1: pass the custom function to groupby(). A function key is applied
# to the index labels, so 'Happiness Score' is made the index first.
data2 = data.set_index('Happiness Score')
data2.groupby(get_score_group).size()
# Method 2: build an explicit grouping column and group by its name.
data['score group'] = data['Happiness Score'].apply(get_score_group)
data.tail()
data.groupby('score group').size()
Custom aggregation operation
# Built-in reduction: maximum of every column within each region.
data.groupby('Region').max()
# The same reduction requested through agg(). The string alias is used instead
# of np.max because pandas 2.1+ deprecates passing NumPy reducers to agg().
data.groupby('Region').agg('max')
# Pass a list of functions to compute several aggregates at once.
data.groupby('Region')['Happiness Score'].agg(['max', 'min', 'mean'])
# Pass a dict to choose a different aggregation for each column.
data.groupby('Region').agg({
    'Happiness Score': 'mean',
    'Happiness Rank': 'max',
})
# Pass a user-defined function to agg()
def max_min_diff(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    Args:
        x: Any object exposing ``max()`` and ``min()`` methods whose results
            support subtraction (for example a pandas Series or NumPy array).

    Returns:
        ``x.max() - x.min()``.
    """
    return x.max() - x.min()
# Apply the custom reducer to the 'Happiness Rank' column of each region.
data.groupby('Region')['Happiness Rank'].agg(max_min_diff)
Exercise
存在DataFrame结构的数据df,对列名为data1的数据依次求最大值、均值、最小值;
利用字典,对列名为data2的数据求均值,对列名data3的数据计算个数;
对列名为data4的数据自定义计算,自定义函数为组内最大值减去均值,且保留两位小数,最后将聚合的数据进行合并。
# Sample frame for the exercise: a 'key' column with three labels plus four
# columns of random integers. No seed is set in the visible code, so the
# values differ between runs.
df = pd.DataFrame({
'key':['one', 'three', 'two', 'two', 'one','three','three','two','one','one'],
'data1':np.random.randint(25,75,size=10),  # integers in [25, 75)
'data2':np.random.randint(1,50,size=10),  # integers in [1, 50)
'data3':np.random.randint(50,100,size=10),  # integers in [50, 100)
'data4':np.random.randint(100,150,size=10)})  # integers in [100, 150)
# Exercise template: the numbered placeholders (1)-(8) are blanks for the
# reader to fill in, so this cell is not runnable as written.
# Step 1: aggregate data1 per key with three functions; the resulting columns
# are renamed on the next line.
data1=df.groupby('key')['data1'].(1)([np.max,np.mean,np.(2)])
data1.columns=['data1_max','data1_mean','data1_min']
# Step 2: aggregate data2 and data3 per key with a dict that maps each column
# to its own function.
data23=df.(3)('key').agg({
'data2':np.(4),'data3':np.size})
# Step 3: custom reducer for data4; the result is rounded to the number of
# decimals given by blank (6).
def get_data4_diff(data):
return round(data.(5)()-data.mean(),(6))
data4=df.groupby('key')['data4'].agg((7))
# Step 4: join the per-key results side by side (axis=1).
pd.concat([data1,data23,(8)],axis=1)
Fill in the blank (1): The correct answer is agg.
Fill in the blank (2): The correct answer is min.
Fill in the blank (3): The correct answer is groupby.
Fill in the blank (4): The correct answer is mean.
Fill in the blank (5): The correct answer is max.
Fill in the blank (6): The correct answer is 2.
Fill in the blank (7): The correct answer is get_data4_diff.
Fill in the blank (8): The correct answer is data4.