数据聚合与分组运算

#数据聚合与分组
import pandas as pd
import numpy as np
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                       'key2' : ['one', 'two', 'one', 'two', 'one'],
                       'data1' : np.random.randn(5),
                       'data2' : np.random.randn(5)})
df
key1 key2 data1 data2
0 a one 1.364596 0.352792
1 a two 1.685626 0.236429
2 b one -0.537077 -0.018004
3 b two 1.389866 0.826195
4 a one 0.849733 1.619383
grouped = df['data1'].groupby(df['key1'])
grouped
<pandas.core.groupby.groupby.SeriesGroupBy object at 0x00000261B3E2CEB8>
grouped.mean()
key1
a    1.299985
b    0.426394
Name: data1, dtype: float64
means = df['data1'].groupby([df['key1'],df['key2']]).mean()#两个参数分组
means
key1  key2
a     one     1.107165
      two     1.685626
b     one    -0.537077
      two     1.389866
Name: data1, dtype: float64
means.unstack()
key2 one two
key1
a 1.107165 1.685626
b -0.537077 1.389866
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
#分组键可以是任何长度适当的数组
df['data1'].groupby([states, years]).mean()
California  2005    1.685626
            2006   -0.537077
Ohio        2005    1.377231
            2006    0.849733
Name: data1, dtype: float64
df.groupby('key1').mean()
data1 data2
key1
a 1.299985 0.736201
b 0.426394 0.404096
df.groupby(['key1','key2']).mean()
data1 data2
key1 key2
a one 1.107165 0.986087
two 1.685626 0.236429
b one -0.537077 -0.018004
two 1.389866 0.826195
df.groupby(['key1','key2']).size()#返回分组大小
key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64
#对分组进行迭代,GroupBy对象支持迭代
for name,group in df.groupby('key1'):
    print(name)
    print(group)

a
  key1 key2     data1     data2
0    a  one  1.364596  0.352792
1    a  two  1.685626  0.236429
4    a  one  0.849733  1.619383
b
  key1 key2     data1     data2
2    b  one -0.537077 -0.018004
3    b  two  1.389866  0.826195
for (k1,k2),group in df.groupby(['key1','key2']):
    print((k1,k2))
    print(group)
('a', 'one')
  key1 key2     data1     data2
0    a  one  1.364596  0.352792
4    a  one  0.849733  1.619383
('a', 'two')
  key1 key2     data1     data2
1    a  two  1.685626  0.236429
('b', 'one')
  key1 key2     data1     data2
2    b  one -0.537077 -0.018004
('b', 'two')
  key1 key2     data1     data2
3    b  two  1.389866  0.826195
#将这些数据片段做成一个字典:
pieces = dict(list(df.groupby('key1')))
pieces['b']
key1 key2 data1 data2
2 b one -0.537077 -0.018004
3 b two 1.389866 0.826195
#对列进行分组
df.dtypes
key1      object
key2      object
data1    float64
data2    float64
dtype: object
grouped = df.groupby(df.dtypes,axis=1)
for dtype,group in grouped:
    print(dtype)
    print(group)
float64
      data1     data2
0  1.364596  0.352792
1  1.685626  0.236429
2 -0.537077 -0.018004
3  1.389866  0.826195
4  0.849733  1.619383
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one
#选取一列或列的子集
df['data1'].groupby(df['key1'])
<pandas.core.groupby.groupby.SeriesGroupBy object at 0x00000261B3E74FD0>
df.groupby(['key1', 'key2'])[['data2']].mean()
data2
key1 key2
a one 0.986087
two 0.236429
b one -0.018004
two 0.826195
#通过字典或Series进行分组
people = pd.DataFrame(np.random.randn(5,5),
                     columns=['a','b','c','d','e'],
                     index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people.iloc[2:3,[1,2]] = np.nan
people
a b c d e
Joe 0.037796 1.631083 0.455609 -1.405327 0.495940
Steve 0.304393 0.326005 0.146350 0.075903 -0.263559
Wes -0.055827 NaN NaN 1.268622 -0.541199
Jim 2.034394 0.818811 0.333991 0.158734 1.187207
Travis 2.719235 -0.459516 -0.292250 0.158169 1.102169
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',#分组字典
              'd': 'blue', 'e': 'red', 'f' : 'orange'}
by_column = people.groupby(mapping,axis=1)
by_column.sum()
blue red
Joe -0.949718 2.164819
Steve 0.222253 0.366838
Wes 1.268622 -0.597026
Jim 0.492726 4.040412
Travis -0.134082 3.361889
map_series = pd.Series(mapping)
map_series
a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object
people.groupby(map_series,axis=1).count()
blue red
Joe 2 3
Steve 2 3
Wes 1 2
Jim 2 3
Travis 2 3
#通过函数进行分组
people.groupby(len).sum()
a b c d e
3 2.016363 2.449894 0.789601 0.022029 1.141948
5 0.304393 0.326005 0.146350 0.075903 -0.263559
6 2.719235 -0.459516 -0.292250 0.158169 1.102169
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()
a b c d e
3 one -0.055827 1.631083 0.455609 -1.405327 -0.541199
two 2.034394 0.818811 0.333991 0.158734 1.187207
5 one 0.304393 0.326005 0.146350 0.075903 -0.263559
6 two 2.719235 -0.459516 -0.292250 0.158169 1.102169
#根据索引级别分组
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
                                        [1, 3, 5, 1, 3]],
                                        names=['cty', 'tenor'])
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df
cty US JP
tenor 1 3 5 1 3
0 -1.865861 0.145454 1.476689 0.798411 0.546048
1 1.887149 0.218613 0.951165 2.790860 -2.419909
2 -0.681806 -0.490238 -2.215909 0.375548 0.145144
3 0.501322 1.358101 -1.069453 1.916614 -0.018305
#根据级别分组,使用level关键字传递级别序号或名字:
hier_df.groupby(level='cty',axis=1).count()
cty JP US
0 2 3
1 2 3
2 2 3
3 2 3
#数据聚合
#聚合指的是任何能够从数组产生标量值的数据转换过程
df
key1 key2 data1 data2
0 a one 1.364596 0.352792
1 a two 1.685626 0.236429
2 b one -0.537077 -0.018004
3 b two 1.389866 0.826195
4 a one 0.849733 1.619383
grouped = df.groupby('key1')
grouped['data1'].quantile(0.9)
key1
a    1.621420
b    1.197171
Name: data1, dtype: float64
def peak_to_peak(arr):
    """Return the spread of *arr*: its maximum value minus its minimum."""
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest
grouped.agg(peak_to_peak)#使用你自己的聚合函数,只需将其传入aggregate或agg方法
data1 data2
key1
a 0.835893 1.382954
b 1.926943 0.844199
grouped.describe()
data1 data2
count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max
key1
a 3.0 1.299985 0.421675 0.849733 1.107165 1.364596 1.525111 1.685626 3.0 0.736201 0.767068 0.236429 0.294610 0.352792 0.986087 1.619383
b 2.0 0.426394 1.362554 -0.537077 -0.055342 0.426394 0.908130 1.389866 2.0 0.404096 0.596939 -0.018004 0.193046 0.404096 0.615145 0.826195
#面向列的多函数应用
tips = pd.read_csv('examples/tips.csv')
tips['tip_pct'] = tips['tip']/tips['total_bill']
tips[:6]
total_bill tip smoker day time size tip_pct
0 16.99 1.01 No Sun Dinner 2 0.059447
1 10.34 1.66 No Sun Dinner 3 0.160542
2 21.01 3.50 No Sun Dinner 3 0.166587
3 23.68 3.31 No Sun Dinner 2 0.139780
4 24.59 3.61 No Sun Dinner 4 0.146808
5 25.29 4.71 No Sun Dinner 4 0.186240
grouped = tips.groupby(['day','smoker'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')
day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64
grouped_pct.agg(['mean', 'std', peak_to_peak])
mean std peak_to_peak
day smoker
Fri No 0.151650 0.028123 0.067349
Yes 0.174783 0.051293 0.159925
Sat No 0.158048 0.039767 0.235193
Yes 0.147906 0.061375 0.290095
Sun No 0.160113 0.042347 0.193226
Yes 0.187250 0.154134 0.644685
Thur No 0.160298 0.038774 0.193350
Yes 0.163863 0.039389 0.151240
#修改聚合名字
grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])
foo bar
day smoker
Fri No 0.151650 0.028123
Yes 0.174783 0.051293
Sat No 0.158048 0.039767
Yes 0.147906 0.061375
Sun No 0.160113 0.042347
Yes 0.187250 0.154134
Thur No 0.160298 0.038774
Yes 0.163863 0.039389
functions = ['count','mean','max']
result = grouped[['tip_pct', 'total_bill']].agg(functions)
result
tip_pct total_bill
count mean max count mean max
day smoker
Fri No 4 0.151650 0.187735 4 18.420000 22.75
Yes 15 0.174783 0.263480 15 16.813333 40.17
Sat No 45 0.158048 0.291990 45 19.661778 48.33
Yes 42 0.147906 0.325733 42 21.276667 50.81
Sun No 57 0.160113 0.252672 57 20.506667 48.17
Yes 19 0.187250 0.710345 19 24.120000 45.35
Thur No 45 0.160298 0.266312 45 17.113111 41.19
Yes 17 0.163863 0.241255 17 19.190588 43.11
ftuples = [('Durchschnitt', 'mean'),('Abweichung', np.var)]
grouped[['tip_pct', 'total_bill']].agg(ftuples)
tip_pct total_bill
Durchschnitt Abweichung Durchschnitt Abweichung
day smoker
Fri No 0.151650 0.000791 18.420000 25.596333
Yes 0.174783 0.002631 16.813333 82.562438
Sat No 0.158048 0.001581 19.661778 79.908965
Yes 0.147906 0.003767 21.276667 101.387535
Sun No 0.160113 0.001793 20.506667 66.099980
Yes 0.187250 0.023757 24.120000 109.046044
Thur No 0.160298 0.001503 17.113111 59.625081
Yes 0.163863 0.001551 19.190588 69.808518
#对一个列或不同的列应用不同的函数
grouped.agg({'tip' : np.max, 'size' : 'sum'})
grouped.agg({'tip_pct' : ['min', 'max', 'mean', 'std'],
                'size' : 'sum'})
tip_pct size
min max mean std sum
day smoker
Fri No 0.120385 0.187735 0.151650 0.028123 9
Yes 0.103555 0.263480 0.174783 0.051293 31
Sat No 0.056797 0.291990 0.158048 0.039767 115
Yes 0.035638 0.325733 0.147906 0.061375 104
Sun No 0.059447 0.252672 0.160113 0.042347 167
Yes 0.065660 0.710345 0.187250 0.154134 49
Thur No 0.072961 0.266312 0.160298 0.038774 112
Yes 0.090014 0.241255 0.163863 0.039389 40
#apply:一般性的拆分应用合并
def top(df, n=5, column='tip_pct'):
    """Return the *n* rows of *df* with the largest values in *column*.

    Rows come back sorted ascending by *column* (largest last), exactly
    like the original slice-based version.  ``tail(n)`` is used instead
    of ``[-n:]`` because ``[-0:]`` equals ``[0:]`` and would return the
    WHOLE frame for ``n == 0``, whereas ``tail(0)`` correctly returns an
    empty frame.
    """
    return df.sort_values(by=column).tail(n)
top(tips,n=6)
total_bill tip smoker day time size tip_pct
109 14.31 4.00 Yes Sat Dinner 2 0.279525
183 23.17 6.50 Yes Sun Dinner 4 0.280535
232 11.61 3.39 No Sat Dinner 2 0.291990
67 3.07 1.00 Yes Sat Dinner 1 0.325733
178 9.60 4.00 Yes Sun Dinner 2 0.416667
172 7.25 5.15 Yes Sun Dinner 2 0.710345
tips.groupby('smoker').apply(top)
total_bill tip smoker day time size tip_pct
smoker
No 88 24.71 5.85 No Thur Lunch 2 0.236746
185 20.69 5.00 No Sun Dinner 5 0.241663
51 10.29 2.60 No Sun Dinner 2 0.252672
149 7.51 2.00 No Thur Lunch 2 0.266312
232 11.61 3.39 No Sat Dinner 2 0.291990
Yes 109 14.31 4.00 Yes Sat Dinner 2 0.279525
183 23.17 6.50 Yes Sun Dinner 4 0.280535
67 3.07 1.00 Yes Sat Dinner 1 0.325733
178 9.60 4.00 Yes Sun Dinner 2 0.416667
172 7.25 5.15 Yes Sun Dinner 2 0.710345
tips.groupby(['smoker','day']).apply(top,n=1,column='total_bill')
total_bill tip smoker day time size tip_pct
smoker day
No Fri 94 22.75 3.25 No Fri Dinner 2 0.142857
Sat 212 48.33 9.00 No Sat Dinner 4 0.186220
Sun 156 48.17 5.00 No Sun Dinner 6 0.103799
Thur 142 41.19 5.00 No Thur Lunch 5 0.121389
Yes Fri 95 40.17 4.73 Yes Fri Dinner 4 0.117750
Sat 170 50.81 10.00 Yes Sat Dinner 3 0.196812
Sun 182 45.35 3.50 Yes Sun Dinner 3 0.077178
Thur 197 43.11 5.00 Yes Thur Lunch 4 0.115982
result = tips.groupby('smoker')['tip_pct'].describe()
result
count mean std min 25% 50% 75% max
smoker
No 151.0 0.159328 0.039910 0.056797 0.136906 0.155625 0.185014 0.291990
Yes 93.0 0.163196 0.085119 0.035638 0.106771 0.153846 0.195059 0.710345
result.unstack('smoker')
       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64
#禁止分组键
tips.groupby('smoker',group_keys=False).apply(top)
total_bill tip smoker day time size tip_pct
88 24.71 5.85 No Thur Lunch 2 0.236746
185 20.69 5.00 No Sun Dinner 5 0.241663
51 10.29 2.60 No Sun Dinner 2 0.252672
149 7.51 2.00 No Thur Lunch 2 0.266312
232 11.61 3.39 No Sat Dinner 2 0.291990
109 14.31 4.00 Yes Sat Dinner 2 0.279525
183 23.17 6.50 Yes Sun Dinner 4 0.280535
67 3.07 1.00 Yes Sat Dinner 1 0.325733
178 9.60 4.00 Yes Sun Dinner 2 0.416667
172 7.25 5.15 Yes Sun Dinner 2 0.710345
#分位数和桶分析
frame = pd.DataFrame({'data1':np.random.randn(1000),
                    'data2':np.random.randn(1000)})
quartiles = pd.cut(frame.data1,4)
quartiles[:10]
0    (-1.415, 0.195]
1     (0.195, 1.805]
2    (-1.415, 0.195]
3     (0.195, 1.805]
4    (-1.415, 0.195]
5    (-1.415, 0.195]
6     (0.195, 1.805]
7    (-1.415, 0.195]
8    (-1.415, 0.195]
9    (-1.415, 0.195]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-3.032, -1.415] < (-1.415, 0.195] < (0.195, 1.805] < (1.805, 3.414]]
def get_stats(group):
    """Summarize *group* as a dict with its min, max, count, and mean."""
    # Key order matches the original literal so downstream unstack()
    # output is unchanged.
    stat_names = ('min', 'max', 'count', 'mean')
    return {name: getattr(group, name)() for name in stat_names}
grouped = frame.data2.groupby(quartiles)
grouped.apply(get_stats).unstack()
count max mean min
data1
(-3.032, -1.415] 75.0 2.055051 0.217488 -2.419102
(-1.415, 0.195] 511.0 3.134327 0.002096 -2.709974
(0.195, 1.805] 379.0 3.146306 -0.014438 -2.561375
(1.805, 3.414] 35.0 2.209609 0.293813 -1.865174
grouping = pd.qcut(frame.data1,10,labels=False)#大小相同的桶
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()
count max mean min
data1
0 100.0 2.055051 0.128959 -2.419102
1 100.0 3.134327 -0.106525 -2.131247
2 100.0 2.178958 0.041848 -1.964788
3 100.0 2.443715 0.176331 -2.587572
4 100.0 2.231973 0.016023 -2.709974
5 100.0 2.432079 -0.082424 -2.189209
6 100.0 3.146306 0.072272 -2.282556
7 100.0 1.831667 -0.011231 -2.538154
8 100.0 2.060099 -0.060095 -2.561375
9 100.0 2.209609 0.046783 -2.049609
s = pd.Series(np.random.randn(6))
s[::2] = np.nan
s
0         NaN
1   -0.162668
2         NaN
3    0.273627
4         NaN
5    0.345890
dtype: float64
s.fillna(s.mean())
0    0.152283
1   -0.162668
2    0.152283
3    0.273627
4    0.152283
5    0.345890
dtype: float64
states = ['Ohio', 'New York', 'Vermont', 'Florida',
              'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East']*4 + ['West']*4
data = pd.Series(np.random.randn(8),index=states)
data
Ohio         -1.141165
New York      0.077290
Vermont       1.178992
Florida       1.413521
Oregon       -0.840100
Nevada       -0.262705
California    0.410535
Idaho        -2.736216
dtype: float64
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
data
Ohio         -1.141165
New York      0.077290
Vermont            NaN
Florida       1.413521
Oregon       -0.840100
Nevada             NaN
California    0.410535
Idaho              NaN
dtype: float64
data.groupby(group_key).mean()
East    0.116549
West   -0.214783
dtype: float64
fill_mean = lambda g:g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)
Ohio         -1.141165
New York      0.077290
Vermont       0.116549
Florida       1.413521
Oregon       -0.840100
Nevada       -0.214783
California    0.410535
Idaho        -0.214783
dtype: float64
#随机采样和排列
suits = ['H','S','C','D']
card_val = (list(range(1,11))+[10]*3)*4
base_names = ['A']+list(range(2,11))+['J','K','Q']
cards = []
for suit in ['H','S','C','D']:
    cards.extend(str(num)+suit for num in base_names)
deck = pd.Series(card_val,index=cards)
deck[:13]
AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
dtype: int64
def draw(deck, n=5):
    """Randomly draw *n* entries (without replacement) from *deck*."""
    hand = deck.sample(n=n)
    return hand
draw(deck)
QH    10
AS     1
2S     2
4C     4
5C     5
dtype: int64
get_suit = lambda card:card[-1]
deck.groupby(get_suit).apply(draw,n=2)
C  9C      9
   2C      2
D  QD     10
   10D    10
H  5H      5
   KH     10
S  3S      3
   2S      2
dtype: int64
#分组加权平均数和相关系数
df = pd.DataFrame({'category': ['a', 'a', 'a', 'a',
                                    'b', 'b', 'b', 'b'],
                       'data': np.random.randn(8),
                       'weights': np.random.rand(8)})
df


category data weights
0 a -0.326019 0.107540
1 a 1.333957 0.220552
2 a -0.055586 0.189389
3 a -0.947162 0.598072
4 b -0.764437 0.505189
5 b 1.204008 0.921015
6 b -1.048168 0.660169
7 b -1.495828 0.525462
grouped = df.groupby('category')
get_wavg = lambda g:np.average(g['data'],weights=g['weights'])#加权平均数
grouped.apply(get_wavg)
category
a   -0.284928
b   -0.289163
dtype: float64
close_px = pd.read_csv('examples/stock_px_2.csv', parse_dates=True,
                           index_col=0)
close_px.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2214 entries, 2003-01-02 to 2011-10-14
Data columns (total 4 columns):
AAPL    2214 non-null float64
MSFT    2214 non-null float64
XOM     2214 non-null float64
SPX     2214 non-null float64
dtypes: float64(4)
memory usage: 86.5 KB
close_px[-4:]
AAPL MSFT XOM SPX
2011-10-11 400.29 27.00 76.27 1195.54
2011-10-12 402.19 26.96 77.16 1207.25
2011-10-13 408.43 27.18 76.37 1203.66
2011-10-14 422.00 27.27 78.11 1224.58
spx_corr = lambda x:x.corrwith(x['SPX'])
rets = close_px.pct_change().dropna()
get_year = lambda x: x.year
by_year = rets.groupby(get_year)
by_year.apply(spx_corr)
AAPL MSFT XOM SPX
2003 0.541124 0.745174 0.661265 1.0
2004 0.374283 0.588531 0.557742 1.0
2005 0.467540 0.562374 0.631010 1.0
2006 0.428267 0.406126 0.518514 1.0
2007 0.508118 0.658770 0.786264 1.0
2008 0.681434 0.804626 0.828303 1.0
2009 0.707103 0.654902 0.797921 1.0
2010 0.710105 0.730118 0.839057 1.0
2011 0.691931 0.800996 0.859975 1.0
#透视表和交叉表
tips.pivot_table(index=['day', 'smoker'])
size tip tip_pct total_bill
day smoker
Fri No 2.250000 2.812500 0.151650 18.420000
Yes 2.066667 2.714000 0.174783 16.813333
Sat No 2.555556 3.102889 0.158048 19.661778
Yes 2.476190 2.875476 0.147906 21.276667
Sun No 2.929825 3.167895 0.160113 20.506667
Yes 2.578947 3.516842 0.187250 24.120000
Thur No 2.488889 2.673778 0.160298 17.113111
Yes 2.352941 3.030000 0.163863 19.190588
tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'],
                 columns='smoker')
size tip_pct
smoker No Yes No Yes
time day
Dinner Fri 2.000000 2.222222 0.139622 0.165347
Sat 2.555556 2.476190 0.158048 0.147906
Sun 2.929825 2.578947 0.160113 0.187250
Thur 2.000000 NaN 0.159744 NaN
Lunch Fri 3.000000 1.833333 0.187735 0.188937
Thur 2.500000 2.352941 0.160311 0.163863
tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'],
                 columns='smoker', margins=True)
size tip_pct
smoker No Yes All No Yes All
time day
Dinner Fri 2.000000 2.222222 2.166667 0.139622 0.165347 0.158916
Sat 2.555556 2.476190 2.517241 0.158048 0.147906 0.153152
Sun 2.929825 2.578947 2.842105 0.160113 0.187250 0.166897
Thur 2.000000 NaN 2.000000 0.159744 NaN 0.159744
Lunch Fri 3.000000 1.833333 2.000000 0.187735 0.188937 0.188765
Thur 2.500000 2.352941 2.459016 0.160311 0.163863 0.161301
All 2.668874 2.408602 2.569672 0.159328 0.163196 0.160803
tips.pivot_table('tip_pct', index=['time', 'smoker'], columns='day',
                    aggfunc=len, margins=True)
day Fri Sat Sun Thur All
time smoker
Dinner No 3.0 45.0 57.0 1.0 106.0
Yes 9.0 42.0 19.0 NaN 70.0
Lunch No 1.0 NaN NaN 44.0 45.0
Yes 6.0 NaN NaN 17.0 23.0
All 19.0 87.0 76.0 62.0 244.0
tips.pivot_table('tip_pct', index=['time', 'size', 'smoker'],
                 columns='day', aggfunc='mean', fill_value=0)
day Fri Sat Sun Thur
time size smoker
Dinner 1 No 0.000000 0.137931 0.000000 0.000000
Yes 0.000000 0.325733 0.000000 0.000000
2 No 0.139622 0.162705 0.168859 0.159744
Yes 0.171297 0.148668 0.207893 0.000000
3 No 0.000000 0.154661 0.152663 0.000000
Yes 0.000000 0.144995 0.152660 0.000000
4 No 0.000000 0.150096 0.148143 0.000000
Yes 0.117750 0.124515 0.193370 0.000000
5 No 0.000000 0.000000 0.206928 0.000000
Yes 0.000000 0.106572 0.065660 0.000000
6 No 0.000000 0.000000 0.103799 0.000000
Lunch 1 No 0.000000 0.000000 0.000000 0.181728
Yes 0.223776 0.000000 0.000000 0.000000
2 No 0.000000 0.000000 0.000000 0.166005
Yes 0.181969 0.000000 0.000000 0.158843
3 No 0.187735 0.000000 0.000000 0.084246
Yes 0.000000 0.000000 0.000000 0.204952
4 No 0.000000 0.000000 0.000000 0.138919
Yes 0.000000 0.000000 0.000000 0.155410
5 No 0.000000 0.000000 0.000000 0.121389
6 No 0.000000 0.000000 0.000000 0.173706
#交叉表计算分组频率

猜你喜欢

转载自blog.csdn.net/DMU_lzq1996/article/details/83788856