【数据分析可视化】数据聚合技术Aggregation

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# 读入城市天气csv文件
df = pd.read_csv('/Users/bennyrhys/Desktop/数据分析可视化-数据集/homework/city_weather.csv')
df

	date	city	temperature	wind
0	03/01/2016	BJ	8	5
1	17/01/2016	BJ	12	2
2	31/01/2016	BJ	19	2
3	14/02/2016	BJ	-3	3
4	28/02/2016	BJ	19	2
5	13/03/2016	BJ	5	3
6	27/03/2016	SH	-4	4
7	10/04/2016	SH	19	3
8	24/04/2016	SH	20	3
9	08/05/2016	SH	17	3
10	22/05/2016	SH	4	2
11	05/06/2016	SH	-10	4
12	19/06/2016	SH	0	5
13	03/07/2016	SH	-9	5
14	17/07/2016	GZ	10	2
15	31/07/2016	GZ	-1	5
16	14/08/2016	GZ	1	5
17	28/08/2016	GZ	25	4
18	11/09/2016	SZ	20	1
19	25/09/2016	SZ	-10	4

# 根据城市进行分组
g = df.groupby('city')
g

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1201a06d0>

# 可以直接进行聚合操作
g.mean()

	temperature	wind
city
BJ	10.000	2.833333
GZ	8.750	4.000000
SH	4.625	3.625000
SZ	5.000	2.500000

g.describe()

	temperature								wind
	count	mean	std	min	25%	50%	75%	max	count	mean	std	min	25%	50%	75%	max
city
BJ	6.0	10.000	8.532292	-3.0	5.75	10.0	17.25	19.0	6.0	2.833333	1.169045	2.0	2.00	2.5	3.00	5.0
GZ	4.0	8.750	11.842719	-1.0	0.50	5.5	13.75	25.0	4.0	4.000000	1.414214	2.0	3.50	4.5	5.00	5.0
SH	8.0	4.625	12.489281	-10.0	-5.25	2.0	17.50	20.0	8.0	3.625000	1.060660	2.0	3.00	3.5	4.25	5.0
SZ	2.0	5.000	21.213203	-10.0	-2.50	5.0	12.50	20.0	2.0	2.500000	2.121320	1.0	1.75	2.5	3.25	4.0

# 传入聚合函数 使用
g.agg('min')

	date	temperature	wind
city
BJ	03/01/2016	-3	2
GZ	14/08/2016	-1	2
SH	03/07/2016	-10	2
SZ	11/09/2016	-10	1

# 测试自定义聚合函数
def foo(attr):
    print(type(attr)),print(attr)
    return np.nan

g.agg(foo)

<class 'pandas.core.series.Series'>
0    03/01/2016
1    17/01/2016
2    31/01/2016
3    14/02/2016
4    28/02/2016
5    13/03/2016
Name: date, dtype: object
<class 'pandas.core.series.Series'>
14    17/07/2016
15    31/07/2016
16    14/08/2016
17    28/08/2016
Name: date, dtype: object
<class 'pandas.core.series.Series'>
6     27/03/2016
7     10/04/2016
8     24/04/2016
9     08/05/2016
10    22/05/2016
11    05/06/2016
12    19/06/2016
13    03/07/2016
Name: date, dtype: object
<class 'pandas.core.series.Series'>
18    11/09/2016
19    25/09/2016
Name: date, dtype: object
<class 'pandas.core.series.Series'>
0     8
1    12
2    19
3    -3
4    19
5     5
Name: temperature, dtype: int64
<class 'pandas.core.series.Series'>
14    10
15    -1
16     1
17    25
Name: temperature, dtype: int64
<class 'pandas.core.series.Series'>
6     -4
7     19
8     20
9     17
10     4
11   -10
12     0
13    -9
Name: temperature, dtype: int64
<class 'pandas.core.series.Series'>
18    20
19   -10
Name: temperature, dtype: int64
<class 'pandas.core.series.Series'>
0    5
1    2
2    2
3    3
4    2
5    3
Name: wind, dtype: int64
<class 'pandas.core.series.Series'>
14    2
15    5
16    5
17    4
Name: wind, dtype: int64
<class 'pandas.core.series.Series'>
6     4
7     3
8     3
9     3
10    2
11    4
12    5
13    5
Name: wind, dtype: int64
<class 'pandas.core.series.Series'>
18    1
19    4
Name: wind, dtype: int64

	date	temperature	wind
city
BJ	NaN	NaN	NaN
GZ	NaN	NaN	NaN
SH	NaN	NaN	NaN
SZ	NaN	NaN	NaN

# 自定义函数 聚合 （最大值-最小值）
def foo(attr):
    return attr.max() - attr.min()

g.agg(foo)

	temperature	wind
city
BJ	22	3
GZ	26	3
SH	30	3
SZ	30	3

# 对某两个列做groupBy
g_new = df.groupby(['city', 'wind'])
g_new

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1213acdd0>

# 细分了风力的group
g_new.groups

{('BJ', 2): Int64Index([1, 2, 4], dtype='int64'),
 ('BJ', 3): Int64Index([3, 5], dtype='int64'),
 ('BJ', 5): Int64Index([0], dtype='int64'),
 ('GZ', 2): Int64Index([14], dtype='int64'),
 ('GZ', 4): Int64Index([17], dtype='int64'),
 ('GZ', 5): Int64Index([15, 16], dtype='int64'),
 ('SH', 2): Int64Index([10], dtype='int64'),
 ('SH', 3): Int64Index([7, 8, 9], dtype='int64'),
 ('SH', 4): Int64Index([6, 11], dtype='int64'),
 ('SH', 5): Int64Index([12, 13], dtype='int64'),
 ('SZ', 1): Int64Index([18], dtype='int64'),
 ('SZ', 4): Int64Index([19], dtype='int64')}

# 多列groupby get也要注意
g_new.get_group(('BJ',2))

	date	city	temperature	wind
1	17/01/2016	BJ	12	2
2	31/01/2016	BJ	19	2
4	28/02/2016	BJ	19	2

# 多列GroupBy的for
for (name_1, name_2), group in g_new:
    print(name_1, name_2)
    print(group)

BJ 2
         date city  temperature  wind
1  17/01/2016   BJ           12     2
2  31/01/2016   BJ           19     2
4  28/02/2016   BJ           19     2
BJ 3
         date city  temperature  wind
3  14/02/2016   BJ           -3     3
5  13/03/2016   BJ            5     3
BJ 5
         date city  temperature  wind
0  03/01/2016   BJ            8     5
GZ 2
          date city  temperature  wind
14  17/07/2016   GZ           10     2
GZ 4
          date city  temperature  wind
17  28/08/2016   GZ           25     4
GZ 5
          date city  temperature  wind
15  31/07/2016   GZ           -1     5
16  14/08/2016   GZ            1     5
SH 2
          date city  temperature  wind
10  22/05/2016   SH            4     2
SH 3
         date city  temperature  wind
7  10/04/2016   SH           19     3
8  24/04/2016   SH           20     3
9  08/05/2016   SH           17     3
SH 4
          date city  temperature  wind
6   27/03/2016   SH           -4     4
11  05/06/2016   SH          -10     4
SH 5
          date city  temperature  wind
12  19/06/2016   SH            0     5
13  03/07/2016   SH           -9     5
SZ 1
          date city  temperature  wind
18  11/09/2016   SZ           20     1
SZ 4
          date city  temperature  wind
19  25/09/2016   SZ          -10     4

g.groups

{'BJ': Int64Index([0, 1, 2, 3, 4, 5], dtype='int64'),
 'GZ': Int64Index([14, 15, 16, 17], dtype='int64'),
 'SH': Int64Index([6, 7, 8, 9, 10, 11, 12, 13], dtype='int64'),
 'SZ': Int64Index([18, 19], dtype='int64')}

# 单列group 获取获取某列
g.get_group('BJ')

	date	temperature	wind
0	03/01/2016	8	5
1	17/01/2016	12	2
2	31/01/2016	19	2
3	14/02/2016	-3	3
4	28/02/2016	19	2
5	13/03/2016	5	3

bennyrhys

原创文章 257 获赞 187 访问量 16万+

关注私信

【数据分析可视化】数据聚合技术Aggregation

猜你喜欢