Data Aggregation and Group Operations in Python

#Created by: Darren Chen
#Created on: 2018/8/2

import pandas as pd
import numpy as np
import os,time,sys

# Structured query languages like SQL can only perform a fairly limited set of group operations

'''
    The grouping key can take several forms (each is demonstrated below):
        1. A list or array whose length matches the length of the axis being grouped.
        2. A column name of the DataFrame.
        3. A dict or Series giving a mapping between the values being grouped and the group names.
        4. A function to be invoked on the axis index or on the individual labels in the index.
'''

df = pd.DataFrame({'key1':['a','a','b','b','a'],
                    'key2':['one','two','one','two','one'],
                    'data1':np.random.randn(5),
                    'data2':np.random.randn(5)})

print(df)

grouped = df['data1'].groupby(df['key1'])
print(grouped, '\n')          # grouped is an intermediate GroupBy object
print(grouped.mean(), '\n')
df.groupby(['key1','key2']).size()

# Pass multiple arrays at once
means = df.groupby([df['key1'],df['key2']]).mean()
means
means.unstack()

# The group keys can be any arrays of the right length
states = np.array(['shandong','zhejiang','zhejiang','shandong','shandong'])
years = np.array([2018,2018,2020,2018,2020])
df.groupby([states,years]).mean()

# The groups can be iterated over
for name,group in df.groupby('key1'):
    print(name)
    print(group)

for (k1,k2),group in df.groupby(['key1','key2']):
    print(k1,k2)
    print(group)

# groupby groups along axis=0 by default; here the columns are grouped along axis=1 by dtype
df.dtypes
grouped = df.groupby(df.dtypes,axis=1)
dict(list(grouped))
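# Note (not in the original post): groupby(..., axis=1) is deprecated in recent pandas versions.
# A rough equivalent sketch that splits the columns by dtype without axis=1:
cols_by_dtype = {str(dtype): df.loc[:, df.dtypes == dtype] for dtype in df.dtypes.unique()}
cols_by_dtype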

# Selecting one column or a subset of columns (with a large dataset you often only need to aggregate a few columns)
df.groupby(['key1','key2'])[['data2']].mean()
s_grouped = df.groupby(['key1','key2'])['data2']
s_grouped
s_grouped.mean()

# Grouping with a dict or Series
people = pd.DataFrame(np.random.randn(5,5),
                      columns=['a','b','c','d','e'],
                      index= ['James','Koby','Darren','Yao','Lin'])
people.iloc[2:3, [1, 2]] = np.nan   # add a few missing values (.ix has been removed from pandas)
people

mapping = {'a':'red','b':'red','c':'blue','d':'blue','e':'red','f':'orange'}
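# note: there is no column 'f' in people; unused keys in the mapping are simply ignored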
by_columns = people.groupby(mapping,axis=1)
by_columns.sum()

map_series = pd.Series(mapping)
map_series
people.groupby(map_series,axis=1).sum()

# Grouping with functions
people.groupby(len).sum()
key_list = ['one','one','one','two','two']
people.groupby([len,key_list]).sum()     # functions can be mixed with arrays, dicts or Series

# Grouping by index levels
columns = pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],
                                     [1,3,5,1,3]],names=['city','tensor'])
columns
hier_df = pd.DataFrame(np.random.randn(4,5),columns=columns)
hier_df
hier_df.groupby(level='city',axis=1).count()

# Data aggregation
df
df.groupby(['key1']).quantile(0.9)          # compute the 0.9 sample quantile

# Using your own aggregation functions
def peak_to_peak(arr):
    return arr.max() - arr.min()
grouped = df.groupby(['key1'])
grouped.agg(peak_to_peak)
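# note: custom aggregation functions like this are generally much slower than the optimized built-ins ('mean', 'sum', 'std', ...)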
grouped.describe().unstack()

# Column-wise application of multiple functions
os.chdir('C:\\Users\\cfc47\\OneDrive\\文档\\数据分析\\data')
tips = pd.read_csv('tips.csv')
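# tips.csv: restaurant tipping data; the columns used below include total_bill, tip, sex, smoker and day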
tips.head()
grouped = tips.groupby(['sex','smoker'])['tip']
grouped.mean()

# Pass a list of functions or function names
grouped.agg(['mean','size',peak_to_peak])
grouped.agg([('foo','mean'),('bax',np.std)])            # (name, function or function name) tuples

# Applying different functions to different columns (this needs the DataFrame-level groupby, since grouped above only selects the 'tip' column)
tips.groupby(['sex','smoker']).agg({'tip':['mean','size','count'],'total_bill':np.std})

# Returning data without row indexes
tips.groupby(['sex','smoker']).mean()
tips.groupby(['sex','smoker'],as_index=False).mean()
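# Note (not in the original post): since pandas 2.0, mean() on a grouped frame that still contains
# non-numeric columns raises a TypeError instead of silently dropping them. A hedged sketch for newer versions:
tips.groupby(['sex','smoker'], as_index=False).mean(numeric_only=True)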

#### Group-wise operations and transformations
# Add a column to df that holds the mean of each group
# Method 1: aggregate first, then merge
df
k1_means = df.groupby('key1').mean().add_prefix('mean_')
k1_means
pd.merge(df,k1_means,left_on='key1',right_index=True)
# Method 2: use transform
people
key = ['one','two','one','two','one']
people.groupby(key).mean()
people.groupby(key).transform(np.mean)
# transform applies a function to each group and then places the results in the appropriate locations
def demean(arr):
    return arr - arr.mean()

demeaned = people.groupby(key).transform(demean)
demeaned
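# verify that the demeaned data now has group means of (approximately) zero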
demeaned.groupby(key).mean()

#### apply: the more general "split-apply-combine"
# Select the 5 rows with the largest tip values within each group
def top(df,n=5,column='tip'):
    return df.sort_values(by=column)[-n:]

top(tips,n=6)
tips.groupby('smoker').apply(top)
tips.groupby(['smoker','day']).apply(top,n=1,column='total_bill')

# Suppressing the group keys
tips.groupby('smoker',group_keys=False).apply(top)

#### Quantile and bucket analysis
frame = pd.DataFrame({'data1':np.random.randn(1000),
                      'data2':np.random.randn(1000)})
factor = pd.cut(frame.data1,4)
factor[:10]
# The object returned by cut can be passed directly to groupby
def get_stats(group):
    return {'min':group.min(),'max':group.max(),
            'count':group.count(),'mean':group.mean()}

grouped = frame.data2.groupby(factor)
grouped.apply(get_stats).unstack()
# Return bucket numbers instead of interval labels
grouping = pd.qcut(frame.data1,10,labels=False)
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()
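# An equivalent sketch (not in the original post) using a list of aggregation names instead of apply:
grouped.agg(['min', 'max', 'count', 'mean'])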

Reposted from blog.csdn.net/PyDarren/article/details/82632592