Python 数据分析：groupby 基础操作

from pandas import Series,DataFrame
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Demo frame: two label columns (key1, key2) and two columns of random
# normal draws. No seed is set, so the numbers differ on every run.
df = DataFrame(
    {
        'key1': ['a', 'a', 'b', 'b', 'a'],
        'key2': ['one', 'two', 'one', 'two', 'one'],
        'data1': np.random.randn(5),
        'data2': np.random.randn(5),
    }
)

# Group the data1 column by the values of key1 and show the per-group mean.
grouped = df['data1'].groupby(df['key1'])
grouped.mean()

# Grouping by a list of two key Series produces a mean per (key1, key2) pair.
key_columns = [df['key1'], df['key2']]
means = df['data1'].groupby(key_columns).mean()
means
Out[10]:
key1  key2
a     one    -0.230076
      two     1.248653
b     one    -0.196613
      two     0.689761
Name: data1, dtype: float64
# Pivot the innermost index level (key2) into columns.
means.unstack(level='key2')
Out[12]:
key2       one       two
key1                    
a    -0.230076  1.248653
b    -0.196613  0.689761

# Pivot the outermost index level (key1) into columns instead.
means.unstack(level='key1')
Out[13]:
key1         a         b
key2                    
one  -0.230076 -0.196613
two   1.248653  0.689761

# Two stand-alone arrays, one entry per row of df, used as group keys that
# are not columns of the frame.
states = np.array('Ohio California California Ohio Ohio'.split())
years = np.array((2005, 2005, 2006, 2005, 2006))

# Display the current contents of the frame.
df
Out[16]:
      data1     data2 key1 key2
0 -0.169761 -0.297803    a  one
1  1.248653  0.116745    a  two
2 -0.196613 -0.224198    b  one
3  0.689761  1.436277    b  two
4 -0.290392  1.292000    a  one

# Group data1 by the two external arrays (state, then year) and average.
external_keys = [states, years]
df['data1'].groupby(external_keys).mean()
Out[17]:
California  2005    1.248653
            2006   -0.196613
Ohio        2005    0.260000
            2006   -0.290392
Name: data1, dtype: float64

# Only the numeric columns are aggregated. key2 holds strings, so it is
# excluded explicitly: pandas 2.x no longer drops such columns silently and
# raises a TypeError when asked to average them.
df.groupby(df['key1']).mean(numeric_only=True)
Out[19]:
         data1     data2
key1                    
a     0.262833  0.370314
b     0.246574  0.606039

# Group on both key columns by name; the remaining columns (data1, data2)
# are averaged per (key1, key2) pair.
df.groupby(by=['key1', 'key2']).mean()
Out[20]:
              data1     data2
key1 key2                    
a    one  -0.230076  0.497098
     two   1.248653  0.116745
b    one  -0.196613 -0.224198
     two   0.689761  1.436277

# Iterating a GroupBy yields (group key, sub-frame) pairs. The key is given
# as a plain column name rather than a one-element list so that `name` is the
# scalar label ('a', 'b'); with a length-1 list grouper pandas 2.x yields
# 1-tuples such as ('a',) instead.
for name, group in df.groupby('key1'):
    print(name)
    print(group)
    
a
      data1     data2 key1 key2
0 -0.169761 -0.297803    a  one
1  1.248653  0.116745    a  two
4 -0.290392  1.292000    a  one
b
      data1     data2 key1 key2
2 -0.196613 -0.224198    b  one
3  0.689761  1.436277    b  two

# With two grouping columns each group key is a (key1, key2) tuple.
for pair, group in df.groupby(['key1', 'key2']):
    k1, k2 = pair
    print(k1, k2)
    print(group)
    
a one
      data1     data2 key1 key2
0 -0.169761 -0.297803    a  one
4 -0.290392  1.292000    a  one
a two
      data1     data2 key1 key2
1  1.248653  0.116745    a  two
b one
      data1     data2 key1 key2
2 -0.196613 -0.224198    b  one
b two
      data1     data2 key1 key2
3  0.689761  1.436277    b  two

# Inspect the dtype of every column; these dtypes are used as group keys next.
df.dtypes
Out[29]:
data1    float64
data2    float64
key1      object
key2      object
dtype: object

# Split the columns of df into one sub-frame per dtype. This selects the
# columns explicitly instead of calling df.groupby(df.dtypes, axis=1), because
# the axis=1 form of DataFrame.groupby is deprecated in pandas 2.1+.
# Each entry is a (dtype, sub-frame) pair; dtypes appear in the order in which
# they first occur among the columns.
grouped = [(dtype, df.loc[:, df.dtypes == dtype]) for dtype in df.dtypes.unique()]

# Turn the pairs into a {dtype: sub-frame} mapping for display.
dict(grouped)
Out[31]:
{dtype('float64'):       data1     data2
 0 -0.169761 -0.297803
 1  1.248653  0.116745
 2 -0.196613 -0.224198
 3  0.689761  1.436277
 4 -0.290392  1.292000, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

# Select data2 with a list so the aggregated result stays a DataFrame
# (a single data2 column) rather than a Series.
by_both_keys = df.groupby(['key1', 'key2'])
by_both_keys[['data2']].mean()
Out[15]:
              data2
key1 key2          
a    one   0.942926
     two  -0.671692
b    one   0.366026
     two  -0.794155
     
# Blank out data1/data2 for the rows labelled 2 and 3. .loc slices by label
# and includes both endpoints; it replaces the .ix indexer, which was removed
# in pandas 1.0.
df.loc[2:3, ['data1', 'data2']] = np.nan

# Display the frame to confirm the two rows now hold NaN.
df
Out[22]:
      data1     data2 key1 key2
0 -1.090066  0.132120    a  one
1 -0.683913 -0.671692    a  two
2       NaN       NaN    b  one
3       NaN       NaN    b  two
4 -0.003452  1.753732    a  one

猜你喜欢

转载自www.cnblogs.com/qmxj-blog/p/9373711.html