Python科学计算工具Pandas(九)—— 数据分组

Python科学计算工具Pandas(九)—— 数据分组

分组统计 - groupby功能

  1. 根据某些条件将数据拆分成组
  2. 对每个组独立应用函数
  3. 将结果合并到一个数据结构中

DataFrame可以在行(axis=0)或列(axis=1)上进行分组:先对各个分组分别应用一个函数并产生新值,然后将各组的执行结果合并到最终的结果对象中。

df.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=False, **kwargs)

分组的基本操作

分组

#分组


# Demo frame: two string key columns (A, B) and two numeric value columns
# (C, D) filled with random normal samples.
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
print(df)
print('-----')

dfa = df.groupby('A')
print(dfa.size(), type(dfa))
# groupby() does not return a DataFrame but a dedicated GroupBy object.
# size() reports how many rows fall into each group.
print('========')

# numeric_only=True keeps the string column B out of the sum; without it,
# recent pandas versions concatenate the strings of B per group instead of
# returning only the numeric columns C and D.
a = df.groupby('A').sum(numeric_only=True)
print(a, type(a))

# Grouping by a list of columns produces a two-level (A, B) index.
b = df.groupby(['A','B']).mean()
print(b, type(b))

# Selecting a single column before aggregating yields a Series.
c = df.groupby('A')['D'].mean()
print(c, type(c))
# Aggregating a grouped object builds a new object (a DataFrame for a and b,
# a Series for c).
# Grouping is done along the rows here.
# The key may be a single column name or a list of column names.
     A      B         C         D
0  foo    one  2.479737 -2.368789
1  bar    one  1.028346  0.950277
2  foo    two  1.001758 -1.278156
3  bar  three -0.205714 -0.330909
4  foo    two  0.337572  1.256110
5  bar    two  0.244171 -0.820276
6  foo    one  0.554198  0.683419
7  foo  three -0.534419 -0.319840
-----
A
bar    3
foo    5
dtype: int64 <class 'pandas.core.groupby.DataFrameGroupBy'>
========
            C         D
A                      
bar  1.066804 -0.200907
foo  3.838847 -2.027256 <class 'pandas.core.frame.DataFrame'>
                  C         D
A   B                        
bar one    1.028346  0.950277
    three -0.205714 -0.330909
    two    0.244171 -0.820276
foo one    1.516967 -0.842685
    three -0.534419 -0.319840
    two    0.669665 -0.011023 <class 'pandas.core.frame.DataFrame'>
A
bar   -0.066969
foo   -0.405451
Name: D, dtype: float64 <class 'pandas.core.series.Series'>

分组是一个可迭代的对象

# 分组 - 可迭代对象

# Demo: a grouped object can be iterated; each item is a (name, sub-frame) pair.
df = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})
print(df)
print(df.groupby('X'), type(df.groupby('X')))
print('-----')


# list() consumes the iterable; indexing the list shows one (name, frame) tuple.
print(list(df.groupby('X')), '→ 可迭代对象,直接生成list\n')
print(list(df.groupby('X'))[0], '→ 以元祖形式显示\n')

for n,g in df.groupby('X'):
    print(n)
    print(g, type(g))
    print('======')
# n is the group name, g is the DataFrame holding that group's rows
   X  Y
0  A  1
1  B  4
2  A  3
3  B  2
<pandas.core.groupby.DataFrameGroupBy object at 0x000002AF2EE7C080> <class 'pandas.core.groupby.DataFrameGroupBy'>
-----
[('A',    X  Y
0  A  1
2  A  3), ('B',    X  Y
1  B  4
3  B  2)] → 可迭代对象,直接生成list

('A',    X  Y
0  A  1
2  A  3) → 以元祖形式显示

A
   X  Y
0  A  1
2  A  3 <class 'pandas.core.frame.DataFrame'>
======
B
   X  Y
1  B  4
3  B  2 <class 'pandas.core.frame.DataFrame'>
======

选择分组 .get_group()

# 提取分组后的某组

# Demo: get_group() extracts the rows belonging to one named group.
df = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})

print(df)
print('-------')

# Only the rows whose X value is 'A' are returned.
print(df.groupby('X').get_group('A'))
print('-------')
   X  Y
0  A  1
1  B  4
2  A  3
3  B  2
-------
   X  Y
0  A  1
2  A  3

将分组转化为字典 .groups

#  将分组转化为字典

# Demo: .groups exposes the grouping as a dict keyed by group name.
df = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})
print(df)
print('---------')

a = df.groupby('X')
print(a.groups,'\n')
print(a.groups['A'],'\n')
print(a.groups['A'][0])
# Each dict value is an index object holding the row labels of that group
   X  Y
0  A  1
1  B  4
2  A  3
3  B  2
---------
{'A': Int64Index([0, 2], dtype='int64'), 'B': Int64Index([1, 3], dtype='int64')} 

Int64Index([0, 2], dtype='int64') 

0

查看分组里的记录数 .size()

#  .size()  查看分组中的记录的统计数目

# Demo: size() counts the rows in each group.
df = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})

print(df)
print('====')

a = df.groupby('X')
print(a.size())
   X  Y
0  A  1
1  B  4
2  A  3
3  B  2
====
X
A    2
B    2
dtype: int64

多个列分组

# Demo: grouping by two columns; the keys of .groups become (A, B) tuples.
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
grouped = df.groupby(['A','B']).groups
print(df)
print('---------')
print(grouped)
print('=====')
# Look up one group by its (A, B) key tuple.
print(grouped[('foo', 'three')])
# Rows are grouped by the combination of the two columns
     A      B         C         D
0  foo    one -0.539735  0.252334
1  bar    one  1.247811 -0.144133
2  foo    two -0.965486  0.042095
3  bar  three -0.158520 -0.667123
4  foo    two  1.283692  1.201100
5  bar    two -0.795091  0.368176
6  foo    one -0.263945  0.085682
7  foo  three  0.710263 -1.238407
---------
{('bar', 'one'): Int64Index([1], dtype='int64'), ('bar', 'three'): Int64Index([3], dtype='int64'), ('bar', 'two'): Int64Index([5], dtype='int64'), ('foo', 'one'): Int64Index([0, 6], dtype='int64'), ('foo', 'three'): Int64Index([7], dtype='int64'), ('foo', 'two'): Int64Index([2, 4], dtype='int64')}
=====
Int64Index([7], dtype='int64')

在其他轴上分组

# Demo: split the columns of a frame into groups that share a dtype.
df = pd.DataFrame({'data1':np.random.rand(2),
                  'data2':np.random.rand(2),
                  'key1':['a','b'],
                  'key2':['one','two']})
print(df)
print('------')
print(df.dtypes)
print('------')

# df.groupby(..., axis=1) is deprecated in recent pandas releases, so the
# column labels are grouped instead: df.dtypes is a Series indexed by column
# name, and grouping it by its own values collects the column names of each
# dtype. sort=False keeps the dtypes in order of first appearance and avoids
# having to order dtype objects.
dtype_groups = list(df.dtypes.groupby(df.dtypes, sort=False))
for n, cols in dtype_groups:
    p = df[cols.index]  # the sub-frame made of the columns with dtype n
    print(n)
    print(p)
    print('===')
# Columns are grouped by their value type
      data1    data2 key1 key2
0  0.257623  0.81153    a  one
1  0.325821  0.78845    b  two
------
data1    float64
data2    float64
key1      object
key2      object
dtype: object
------
float64
      data1    data2
0  0.257623  0.81153
1  0.325821  0.78845
===
object
  key1 key2
0    a  one
1    b  two
===

通过字典或者Series分组

# 通过字典或者Series分组

# Demo: grouping with a dict (label -> group name) or with a Series.
df = pd.DataFrame(np.arange(16).reshape(4,4),
                  columns = ['a','b','c','d'])
print(df)
print('-----')

mapping = {'a':'one','b':'one','c':'two','d':'two','e':'three'}
# groupby(..., axis=1) is deprecated in recent pandas releases. Transposing
# turns the column labels into row labels, so the mapping can be applied with
# the default row-wise grouping; the result is transposed back afterwards.
by_column = df.T.groupby(mapping)
column_sums = by_column.sum().T
print(column_sums)
print('-----')
# In mapping, columns a and b belong to 'one', c and d to 'two'; the unused
# key 'e' matches no column and therefore produces no group.

s = pd.Series(mapping)
print(s,'\n')
print(s.groupby(s).count())
# Grouping s by its own values: index entries a, b map to 'one', c, d to
# 'two' and e to 'three'.
# NOTE(review): the text of the string below appears to have been lost to an
# encoding problem (only '?' characters remain) - confirm the original wording.
'''??????'''
    a   b   c   d
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
-----
   one  two
0    1    5
1    9   13
2   17   21
3   25   29
-----
a      one
b      one
c      two
d      two
e    three
dtype: object 

one      2
three    1
two      2
dtype: int64





'??????'

通过函数分组

# 通过函数分组

# Demo: a function can serve as the grouping key.
df = pd.DataFrame(np.arange(16).reshape(4,4),
                  columns = ['a','b','c','d'],
                 index = ['abc','bcd','aa','b'])
print(df,'\n')
print(df.groupby(len).sum())
# Passing len groups the rows by the length of their index label
      a   b   c   d
abc   0   1   2   3
bcd   4   5   6   7
aa    8   9  10  11
b    12  13  14  15 

    a   b   c   d
1  12  13  14  15
2   8   9  10  11
3   4   6   8  10

分组中常见的函数

# 分组计算函数方法

# Demo: common aggregation methods available on a grouped Series.
s = pd.Series([1, 2, 3, 10, 20, 30], index = [1, 2, 3, 1, 2, 3])
grouped = s.groupby(level=0)  # level=0 groups by the index, so rows sharing an index label form one group
print(grouped)
print(grouped.first(),'→ first:非NaN的第一个值\n')
print(grouped.last(),'→ last:非NaN的最后一个值\n')
print(grouped.sum(),'→ sum:非NaN的和\n')
print(grouped.mean(),'→ mean:非NaN的平均值\n')
print(grouped.median(),'→ median:非NaN的算术中位数\n')
print(grouped.count(),'→ count:非NaN的值\n')
print(grouped.min(),'→ min、max:非NaN的最小值、最大值\n')
print(grouped.std(),'→ std,var:非NaN的标准差和方差\n')
print(grouped.prod(),'→ prod:非NaN的积\n')
<pandas.core.groupby.SeriesGroupBy object at 0x000002AF2F1B7278>
1    1
2    2
3    3
dtype: int64 → first:非NaN的第一个值

1    10
2    20
3    30
dtype: int64 → last:非NaN的最后一个值

1    11
2    22
3    33
dtype: int64 → sum:非NaN的和

1     5.5
2    11.0
3    16.5
dtype: float64 → mean:非NaN的平均值

1     5.5
2    11.0
3    16.5
dtype: float64 → median:非NaN的算术中位数

1    2
2    2
3    2
dtype: int64 → count:非NaN的值

1    1
2    2
3    3
dtype: int64 → min、max:非NaN的最小值、最大值

1     6.363961
2    12.727922
3    19.091883
dtype: float64 → std,var:非NaN的标准差和方差

1    10
2    40
3    90
dtype: int64 → prod:非NaN的积

多函数计算

# 多函数计算:agg()

# Demo: computing several aggregations at once with agg().
df = pd.DataFrame({'a':[1,1,2,2],
                  'b':np.random.randint(100, size=4),
                  'c':np.random.randint(100, size=4),
                  'd':np.random.randint(100, size=4)})
print(df)

# A list of function names applies every function to every non-key column;
# the result has two-level columns (column name, function name).
multi_stats = df.groupby('a').agg(['mean', 'sum'])
print(multi_stats)

# On a single selected column the same list yields one output column per
# function. Passing a {'name': func} dict here was deprecated and later
# removed from pandas, so the list form is used instead.
b_stats = df.groupby('a')['b'].agg(['mean', 'sum'])
print(b_stats)
# Functions are given by their string names, which avoids the deprecated
# use of builtins or numpy callables inside agg().
   a   b   c   d
0  1  47   0  61
1  1  83  52   2
2  2  54  77  87
3  2  52  99  97
     b         c          d     
  mean  sum mean  sum  mean  sum
a                               
1   65  130   26   52  31.5   63
2   53  106   88  176  92.0  184
   mean  sum
a           
1    65  130
2    53  106


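F:\Anaconda3\lib\site-packages\ipykernel_launcher.py:10: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
  # Remove the CWD from sys.path while we load stuff.

注:该 FutureWarning 由"向 SeriesGroupBy.agg() 传入 dict 进行重命名"的写法触发;此写法自 pandas 1.0 起已被移除(会抛出 SpecificationError)。在新版本中请改用函数列表 agg(['mean', 'sum']),或命名聚合 agg(mean='mean', sum='sum')。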

分组转换

数据分组转换 transform

# 数据分组转换,transform

# Demo: attaching per-group means to every row, first via merge and then
# via transform.
df = pd.DataFrame({'data1':np.random.randint(100, size=5),
                  'data2':np.random.randint(100, size=5),
                  'key1':list('aabba'),
                  'key2':['one','two','one','two','one']})
# numeric_only=True skips the string column key2; recent pandas versions
# raise an error instead of silently dropping non-numeric columns.
k_mean = df.groupby('key1').mean(numeric_only=True)
print(df)
print(k_mean)
# merge() appends the key1 group means to each row; add_prefix() then
# prepends 'mean_' to every column name of the merged frame.
print(pd.merge(df, k_mean, left_on='key1', right_index=True).add_prefix('mean_'))
print('============')
# Grouping followed by a merge gives a frame that also carries the means.

key2_mean = df.groupby('key2').mean(numeric_only=True)  # means per key2 group
print(key2_mean)
# transform() returns a result aligned with the original rows: every data1 /
# data2 cell is replaced by the mean of its key2 group. The numeric columns
# are selected explicitly because strings cannot be averaged.
key2_broadcast = df.groupby('key2')[['data1', 'data2']].transform('mean')
print(key2_broadcast)
   data1  data2 key1 key2
0      7     98    a  one
1     77      3    a  two
2     50     73    b  one
3     74     23    b  two
4     21      9    a  one
      data1      data2
key1                  
a      35.0  36.666667
b      62.0  48.000000
   mean_data1_x  mean_data2_x mean_key1 mean_key2  mean_data1_y  mean_data2_y
0             7            98         a       one          35.0     36.666667
1            77             3         a       two          35.0     36.666667
4            21             9         a       one          35.0     36.666667
2            50            73         b       one          62.0     48.000000
3            74            23         b       two          62.0     48.000000
============
      data1  data2
key2              
one    26.0   60.0
two    75.5   13.0
   data1  data2
0   26.0     60
1   75.5     13
2   26.0     60
3   75.5     13
4   26.0     60

一般化Groupby方法:apply

# 一般化Groupby方法:apply

# Demo: apply() runs an arbitrary function on each group.
df = pd.DataFrame({'data1':np.random.randint(100, size=5),
                  'data2':np.random.randint(100, size=5),
                  'key1':list('aabba'),
                  'key2':['one','two','one','two','one']})

print(df.groupby('key1').apply(lambda x: x.describe()))
# apply() calls the given function once per group
# Here the function is a lambda that returns descriptive statistics
print('=========================')

def f_df1(d,n):
    """Sort ``d`` by its index and return the ``[:n]`` slice of the result."""
    ordered = d.sort_index()
    return ordered[:n]
def f_df2(d,k1):
    """Look up ``k1`` in ``d`` and return the selected item (``d[k1]``)."""
    selected = d[k1]
    return selected
# Demo: extra arguments given to apply() are forwarded to the function.
print(df.groupby('key1').apply(f_df1,2),'\n')
print(df.groupby('key1').apply(f_df2,'data2'))
print(type(df.groupby('key1').apply(f_df2,'data2')))
# f_df1: sorts each group by index and returns its leading n rows
# f_df2: returns column k1 of each group; the result is a Series with a hierarchical index
# apply() calls the f_df* function directly on each group
# Arguments are written after the function; a keyword form such as .apply(f_df1, n = 2) also works
                data1      data2
key1                            
a    count   3.000000   3.000000
     mean   39.666667  47.333333
     std    45.566801  33.306656
     min     4.000000  10.000000
     25%    14.000000  34.000000
     50%    24.000000  58.000000
     75%    57.500000  66.000000
     max    91.000000  74.000000
b    count   2.000000   2.000000
     mean   25.500000  18.500000
     std     3.535534  16.263456
     min    23.000000   7.000000
     25%    24.250000  12.750000
     50%    25.500000  18.500000
     75%    26.750000  24.250000
     max    28.000000  30.000000
=========================
        data1  data2 key1 key2
key1                          
a    0      4     10    a  one
     1     91     58    a  two
b    2     28      7    b  one
     3     23     30    b  two 

key1   
a     0    10
      1    58
      4    74
b     2     7
      3    30
Name: data2, dtype: int32
<class 'pandas.core.series.Series'>

猜你喜欢

转载自blog.csdn.net/ICERON/article/details/79736468