数据分组方法
- 分组计算根据某个或某几个字段对数据集进行分组,然后运用特点的函数,得到结果
- 使用groupby方法进行分组计算,得到分组对象GroupBy
- 语法为df.groupby(by=)
- 分组对象GroupBy可以运用描述性统计方法,如count(计数)、mean(均值)、median(中位数)、max(最大值)和min(最小值)等
import pandas as pd
import numpy as np
import os
os.getcwd()
'D:\\Jupyter\\notebook\\Python数据清洗实战\\数据清洗之数据统计'
os.chdir('D:\\Jupyter\\notebook\\Python数据清洗实战\\数据')
df = pd.read_csv('online_order.csv', encoding='gbk', dtype={'customer':str, 'order':str})
df.head(5)
|
customer |
order |
total_items |
discount% |
weekday |
hour |
Food% |
Fresh% |
Drinks% |
Home% |
Beauty% |
Health% |
Baby% |
Pets% |
0 |
0 |
0 |
45 |
23.03 |
4 |
13 |
9.46 |
87.06 |
3.48 |
0.00 |
0.00 |
0.00 |
0.0 |
0.0 |
1 |
0 |
1 |
38 |
1.22 |
5 |
13 |
15.87 |
75.80 |
6.22 |
2.12 |
0.00 |
0.00 |
0.0 |
0.0 |
2 |
0 |
2 |
51 |
18.08 |
4 |
13 |
16.88 |
56.75 |
3.37 |
16.48 |
6.53 |
0.00 |
0.0 |
0.0 |
3 |
1 |
3 |
57 |
16.51 |
1 |
12 |
28.81 |
35.99 |
11.78 |
4.62 |
2.87 |
15.92 |
0.0 |
0.0 |
4 |
1 |
4 |
53 |
18.31 |
2 |
11 |
24.13 |
60.38 |
7.78 |
7.72 |
0.00 |
0.00 |
0.0 |
0.0 |
grouped = df.groupby('weekday')
type(grouped)
pandas.core.groupby.generic.DataFrameGroupBy
grouped.mean()
|
total_items |
discount% |
hour |
Food% |
Fresh% |
Drinks% |
Home% |
Beauty% |
Health% |
Baby% |
Pets% |
weekday |
|
|
|
|
|
|
|
|
|
|
|
1 |
30.662177 |
8.580705 |
14.693122 |
22.690866 |
20.000904 |
22.522993 |
13.932553 |
6.972394 |
1.152285 |
11.592562 |
1.007306 |
2 |
31.868612 |
8.638014 |
14.966197 |
23.994915 |
19.407738 |
24.346459 |
13.559191 |
4.903366 |
1.079423 |
11.277284 |
1.272638 |
3 |
31.869796 |
7.794507 |
15.059898 |
24.309274 |
19.957653 |
23.822470 |
13.282088 |
6.702640 |
1.156829 |
9.591389 |
0.937205 |
4 |
32.251899 |
8.068155 |
14.324185 |
24.374364 |
21.538027 |
24.553266 |
13.391946 |
4.806528 |
1.031490 |
9.058201 |
1.080473 |
5 |
31.406619 |
9.159031 |
13.386919 |
24.602790 |
20.549153 |
24.976466 |
12.485788 |
5.431221 |
1.248605 |
9.655343 |
0.908227 |
6 |
32.154814 |
8.414258 |
14.751084 |
23.743196 |
18.707788 |
23.593699 |
14.173291 |
5.878647 |
1.170585 |
11.478343 |
1.150980 |
7 |
32.373837 |
8.710171 |
16.989535 |
22.271512 |
21.020359 |
21.093767 |
13.632481 |
5.895322 |
1.145938 |
13.844250 |
0.950391 |
grouped.mean()['Food%']
weekday
1 22.690866
2 23.994915
3 24.309274
4 24.374364
5 24.602790
6 23.743196
7 22.271512
Name: Food%, dtype: float64
grouped = df.groupby(by=['customer', 'weekday'])
grouped.sum()['total_items']
customer weekday
0 4 96
5 38
1 1 423
2 127
4 37
5 36
10 1 23
3 26
100 1 38
2 78
3 78
7 135
1000 2 6
10000 6 30
10001 6 15
10002 3 11
6 42
7 48
10003 2 4
10004 2 28
3 131
4 93
10005 7 29
10006 2 20
5 27
7 26
10007 2 6
6 15
10008 7 123
10009 1 2
...
9984 6 40
7 61
9985 6 11
9986 1 50
6 49
7 50
9987 1 23
9988 1 18
4 1
9989 1 27
999 1 173
2 45
4 60
5 137
7 149
9990 7 8
9991 6 46
9992 1 13
2 14
5 25
6 24
9993 6 8
9994 2 64
3 57
9995 7 14
9996 7 14
9997 6 5
9998 1 28
6 10
9999 6 4
Name: total_items, Length: 20777, dtype: int64