Pandas aggregation and grouping operations - groupby (part 2)

import numpy as np
import matplotlib.pyplot as plt
from pandas import Series, DataFrame
import pandas as pd

np.random.seed(12345) # Fix the random seed so that every run produces the same random numbers
plt.rc('figure', figsize=(10, 6))

from sklearn.metrics import confusion_matrix
'''
In machine learning, a confusion matrix (also known as an error matrix) is a specific table
used to visualize the performance of an algorithm, usually a supervised one
(for unsupervised learning the analogous structure is usually called a matching matrix).
Each row represents the actual class and each column represents the predicted class.
The name comes from how easily the table shows whether classes are being confused,
i.e. whether samples of one class are predicted as another class.
'''
y_true=[2,1,0,1,2,0]
y_pred=[2,0,0,1,2,1]

C=confusion_matrix(y_true, y_pred)
print( C )
'''
[[1 1 0]
 [1 1 0]
 [0 0 2]]
'''
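
# As described above, rows are actual classes and columns are predictions:
# C[i, j] counts the samples whose true class is i and whose predicted class is j.
print( C[2, 2] )  # prints 2: both samples that are truly class 2 were predicted as class 2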

y_true = ["cat", "ant", "cat", "cat", "ant", "bird"]
y_pred = ["ant", "ant", "cat", "cat", "ant", "cat"]
cc = confusion_matrix(y_true, y_pred, labels=["ant", "bird", "cat"])
print( cc )
'''
[[2 0 0]
 [0 0 1]
 [1 0 2]]
'''

df = pd.DataFrame({'key1':list('aaaab'),
                  'key2': ['one','two','one','two','one'],
                  'data1': np.random.randn(5),
                  'data2': np.random.randn(5)})
print( df )
'''
      data1     data2 key1 key2
0 -0.848172 -2.385541    a  one
1 -0.453870  0.156512    a  two
2 -0.336633 -0.323486    a  one
3 -1.258714  1.339105    a  two
4  0.669843  0.511622    b  one
'''

print (df.groupby('key1')) # <pandas.core.groupby.DataFrameGroupBy object at 0x000000000DB1EC18>
print (df.groupby('key1').agg('sum'))

'''
How the result above is obtained:
df is a DataFrame, and groupby('column name') pre-classifies the rows by that column's values.
The printed result is:
         data1     data2
key1                    
a    -1.094335  2.781858
b    -0.548833  1.198655

agg() then applies an operation to each group; here the operation is 'sum', so each group's values are added up.
PS: I tried to restrict the calculation to the data1 column by writing df['data1'], but that alone does not work:
    it just selects the data1 column of the original frame. The column has to be selected from the GroupBy object
    instead (see the sketch below).
PS: df['data1'] is a Series, while df[['data1']] is a DataFrame.
'''
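
# As the note above says, to aggregate a single column, select it from the GroupBy
# object; a minimal sketch of the Series vs DataFrame distinction:
print( df.groupby('key1')['data1'].sum() )    # single label   -> SeriesGroupBy    -> Series result
print( df.groupby('key1')[['data1']].sum() )  # list of labels -> DataFrameGroupBy -> DataFrame result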


#### Column-oriented multifunction application
df = DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                'key2' : ['one', 'two', 'one', 'two', 'one'],
                'data1' : np.random.randn(5),
                'data2' : np.random.randn(5)})
				
tips = pd.read_csv('data/tips.csv')

tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips[:6]

grouped = tips.groupby(['sex', 'smoker'])

grouped_pct = grouped['tip_pct']

'''
The difference between agg and apply in pandas:
"Python for Data Analysis" does not spell out the difference between the two very clearly,
beyond saying that apply is more general.

The chapter on data aggregation and group operations does hint at the distinction:
    agg is for aggregation, and aggregation is a special case of data transformation:
    it accepts functions that reduce a one-dimensional array to a scalar value.

Both functions act on a GroupBy object, i.e. on already-grouped data.
If the function reduces each group (a one-dimensional array) to a single value, agg can call it.
Conversely, if the custom function does something like sorting, or the top function defined on
page 278 of the book, agg cannot handle it; apply can, because it is more general and does not
require each group to be reduced to a scalar.
'''
grouped_pct.agg('mean')
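
# peak_to_peak is referenced in the commented-out call below but never defined in
# this script; in the book it is just the range within each group. A minimal sketch:
def peak_to_peak(arr):
    return arr.max() - arr.min()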
#grouped_pct.agg(['mean', 'std', peak_to_peak])
grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])
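
# The apply side of the contrast described in the note above: the function's
# per-group result need not be a single scalar (a small sketch, not from the book):
print( grouped_pct.apply(lambda g: g.nlargest(2)) )  # the two largest tip_pct values in each group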

functions = ['count', 'mean', 'max']
result = grouped[['tip_pct', 'total_bill']].agg(functions)  # select the columns with a list of labels
print( result )
'''
              tip_pct                     total_bill                  
                count      mean       max      count       mean    max
sex    smoker                                                         
Female No          54  0.156921  0.252672         54  18.105185  35.83
       Yes         33  0.182150  0.416667         33  17.977879  44.30
Male   No          97  0.160669  0.291990         97  19.791237  48.33
       Yes         60  0.152771  0.710345         60  22.284500  50.81
'''

print( result['tip_pct'] )
'''
               count      mean       max
sex    smoker                           
Female No         54  0.156921  0.252672
       Yes        33  0.182150  0.416667
Male   No         97  0.160669  0.291990
       Yes        60  0.152771  0.710345
'''
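
# A small aside (not from the book): with hierarchical columns, a tuple selects a
# single (column, statistic) pair.
print( result[('tip_pct', 'mean')] )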
print( '-------------- ftuples --------------' )

ftuples = [('average', 'mean'), ('deviation', np.var)]

print( ftuples )
'''
[('average', 'mean'), ('deviation', <function var at 0x00000000035C5510>)]
'''

print( '-------------- grouped_1 --------------' )
grouped_1 = grouped[['tip_pct', 'total_bill']].agg(ftuples)

print( grouped_1 )
'''
                 tip_pct             total_bill           
               average  deviation      average  deviation
sex    smoker                                             
Female No     0.156921   0.001327    18.105185  53.092422
       Yes    0.182150   0.005126    17.977879  84.451517
Male   No     0.160669   0.001751    19.791237  76.152961
       Yes    0.152771   0.008206    22.284500  98.244673
'''
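
# Since pandas 0.25 the same renaming can be written with named aggregation;
# a hedged aside that assumes a recent pandas version rather than the book's:
print( grouped.agg(average_tip_pct=('tip_pct', 'mean'),
                   tip_pct_variance=('tip_pct', np.var)) )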

print( '-------------- grouped_2 --------------' )
grouped_2 = grouped.agg({'tip' : np.max, 'size' : 'sum'})

print( grouped_2 )
'''
               size   tip
sex    smoker            
Female No       140   5.2
       Yes       74   6.5
Male   No       263   9.0
       Yes      150  10.0
'''

print( '-------------- grouped_3 --------------' )
grouped_3 = grouped.agg({'tip_pct' : ['min', 'max', 'mean', 'std'],'size' : 'sum'})
print( grouped_3 )
'''
              size   tip_pct                              
               sum       min       max      mean       std
sex    smoker                                             
Female No      140  0.056797  0.252672  0.156921  0.036421
       Yes      74  0.056433  0.416667  0.182150  0.071595
Male   No      263  0.071804  0.291990  0.160669  0.041849
       Yes     150  0.035638  0.710345  0.152771  0.090588
'''

### Group-level operations and transformations
print( '-------------- df --------------' )
print( df )
'''
      data1     data2 key1 key2
0  1.007189  0.886429    a  one
1 -1.296221 -2.001637    a  two
2  0.274992 -0.371843    b  one
3  0.228913  1.669025    b  two
4  1.352917 -0.438570    a  one
'''

k1_means = df.groupby('key1').mean().add_prefix('mean_')

print( '-------------- k1_means --------------' )
print( k1_means )
'''
      mean_data1  mean_data2
key1                        
a       0.354628   -0.517926
b       0.251952    0.648591
'''
_merge_11 = pd.merge(df, k1_means, left_on='key1', right_index=True)

print ('-------------- _merge_11 --------------')
print (_merge_11)
'''
      data1     data2 key1 key2  mean_data1  mean_data2
0  1.007189  0.886429    a  one    0.354628   -0.517926
1 -1.296221 -2.001637    a  two    0.354628   -0.517926
4  1.352917 -0.438570    a  one    0.354628   -0.517926
2  0.274992 -0.371843    b  one    0.251952    0.648591
3  0.228913  1.669025    b  two    0.251952    0.648591
'''
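
# The same group means can be attached without a merge: transform returns a result
# aligned to the original rows (a minimal alternative sketch, not from the book):
print( df.groupby('key1')[['data1', 'data2']].transform('mean').add_prefix('mean_') )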

people = DataFrame(np.random.randn(5, 5),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
print( '-------------- people --------------' )
print( people )
'''
               a         b         c         d         e
Joe    -0.539741  0.476985  3.248944 -1.021228 -0.577087
Steve   0.124121  0.302614  0.523772  0.000940  1.343810
Wes    -0.713544 -0.831154 -2.370232 -1.860761 -0.860757
Jim     0.560145 -1.265934  0.119827 -1.063512  0.332883
Travis -2.359419 -0.199543 -1.541996 -0.970736 -1.307030
'''

key = ['one', 'two', 'one', 'two', 'one']
people.groupby(key).mean()              # one row per group ('one' / 'two')
people.groupby(key).transform(np.mean)  # the group means broadcast back to the original five-row index

print( '-------------- people (unchanged) --------------' )
print( people )  # neither call above modifies people in place, so it prints unchanged
'''
               a         b         c         d         e
Joe    -0.539741  0.476985  3.248944 -1.021228 -0.577087
Steve   0.124121  0.302614  0.523772  0.000940  1.343810
Wes    -0.713544 -0.831154 -2.370232 -1.860761 -0.860757
Jim     0.560145 -1.265934  0.119827 -1.063512  0.332883
Travis -2.359419 -0.199543 -1.541996 -0.970736 -1.307030
'''

def demean(arr):
    # subtract the group mean from every value in the group
    return arr - arr.mean()
    
demeaned = people.groupby(key).transform(demean)

print( '-------------- demeaned --------------' )
print( demeaned )
'''
               a         b         c         d         e
Joe     0.664493  0.661556  3.470038  0.263014  0.337871
Steve  -0.218012  0.784274  0.201972  0.532226  0.505464
Wes     0.490691 -0.646583 -2.149137 -0.576519  0.054201
Jim     0.218012 -0.784274 -0.201972 -0.532226 -0.505464
Travis -1.155184 -0.014972 -1.320901  0.313505 -0.392072
'''

demeaned_2 = demeaned.groupby(key).mean()

print( '-------------- demeaned_2 --------------' )
print( demeaned_2 )
'''
                a             b             c             d             e
one -7.401487e-17  1.850372e-17 -7.401487e-17  7.401487e-17 -1.110223e-16
two  2.775558e-17 -5.551115e-17 -1.387779e-17  0.000000e+00  0.000000e+00
'''
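
# The tiny values above are just floating-point noise; a quick check (not in the original):
print( np.allclose(demeaned.groupby(key).mean(), 0) )  # True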

# ### The apply method
def top(df, n=5, column='tip_pct'):
    # the last n rows after sorting by the given column, i.e. the n largest values
    return df.sort_values(by=column)[-n:]

top(tips, n=6)

tips.groupby('smoker').apply(top)

tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')

result = tips.groupby('smoker')['tip_pct'].describe()

print( '-------------- result --------------' )
print( result )
'''
smoker       
No      count    151.000000
        mean       0.159328
        std        0.039910
        min        0.056797
        25%        0.136906
        50%        0.155625
        75%        0.185014
        max        0.291990
Yes     count     93.000000
        mean       0.163196
        std        0.085119
        min        0.035638
        25%        0.106771
        50%        0.153846
        75%        0.195059
        max        0.710345
Name: tip_pct, dtype: float64
'''

result_unstack = result.unstack('smoker')

print( '-------------- result_unstack --------------' )
print( result_unstack )
'''
smoker          No        Yes
count   151.000000  93.000000
mean      0.159328   0.163196
std       0.039910   0.085119
min       0.056797   0.035638
25%       0.136906   0.106771
50%       0.155625   0.153846
75%       0.185014   0.195059
max       0.291990   0.710345
'''

#f = lambda x: x.describe()
#grouped.apply(f)

# group_keys=False keeps the group labels out of the result's index
tips.groupby('smoker', group_keys=False).apply(top)


# ### Quantile and bucket analysis
frame = DataFrame({'data1': np.random.randn(1000),
                   'data2': np.random.randn(1000)})
factor = pd.cut(frame.data1, 4)
factor[:10]

def get_stats(group):
    return {'min': group.min(), 'max': group.max(),
            'count': group.count(), 'mean': group.mean()}

grouped = frame.data2.groupby(factor)
grouped.apply(get_stats).unstack()

grouping = pd.qcut(frame.data1, 10, labels=False)

grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()
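
# Because each of these statistics reduces a group to a scalar, the same summary can
# also be produced directly with agg (an equivalent spelling of the step above):
print( grouped.agg(['min', 'max', 'count', 'mean']) )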

# ### Fill missing values with group-specific values
s = Series(np.random.randn(6))
s[::2] = np.nan  # make every other value missing
print( s )
'''
0         NaN
1   -0.438053
2         NaN
3    0.401587
4         NaN
5   -0.574654
dtype: float64
'''

s.fillna(s.mean())

states = ['Ohio', 'New York', 'Vermont', 'Florida',
          'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4
data = Series(np.random.randn(8), index=states)
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
print( data )
'''
Ohio          0.786210
New York     -1.393822
Vermont            NaN
Florida       1.170900
Oregon        0.678661
Nevada             NaN
California    0.150581
Idaho              NaN
dtype: float64
'''
data.groupby(group_key).mean()

fill_mean = lambda g: g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)
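
# An equivalent without apply: transform broadcasts each group's mean back to the
# original index, so it can feed fillna directly (a minimal alternative sketch):
print( data.fillna(data.groupby(group_key).transform('mean')) )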

fill_values = {'East': 0.5, 'West': -1}
fill_func = lambda g: g.fillna(fill_values[g.name])  # g.name is the group label ('East' or 'West')

data.groupby(group_key).apply(fill_func)
print( data )  # apply returns a new Series; data itself still contains its NaNs
'''
Ohio          0.786210
New York     -1.393822
Vermont            NaN
Florida       1.170900
Oregon        0.678661
Nevada             NaN
California    0.150581
Idaho              NaN
dtype: float64
'''
