import numpy as np
import pandas as pd
from pandas import Series, DataFrame
df = pd. read_csv( '/Users/bennyrhys/Desktop/数据分析可视化-数据集/homework/city_weather.csv' )
df
date
city
temperature
wind
0
03/01/2016
BJ
8
5
1
17/01/2016
BJ
12
2
2
31/01/2016
BJ
19
2
3
14/02/2016
BJ
-3
3
4
28/02/2016
BJ
19
2
5
13/03/2016
BJ
5
3
6
27/03/2016
SH
-4
4
7
10/04/2016
SH
19
3
8
24/04/2016
SH
20
3
9
08/05/2016
SH
17
3
10
22/05/2016
SH
4
2
11
05/06/2016
SH
-10
4
12
19/06/2016
SH
0
5
13
03/07/2016
SH
-9
5
14
17/07/2016
GZ
10
2
15
31/07/2016
GZ
-1
5
16
14/08/2016
GZ
1
5
17
28/08/2016
GZ
25
4
18
11/09/2016
SZ
20
1
19
25/09/2016
SZ
-10
4
g = df. groupby( 'city' )
g
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1201a06d0>
g. mean( )
temperature
wind
city
BJ
10.000
2.833333
GZ
8.750
4.000000
SH
4.625
3.625000
SZ
5.000
2.500000
g. describe( )
temperature
wind
count
mean
std
min
25%
50%
75%
max
count
mean
std
min
25%
50%
75%
max
city
BJ
6.0
10.000
8.532292
-3.0
5.75
10.0
17.25
19.0
6.0
2.833333
1.169045
2.0
2.00
2.5
3.00
5.0
GZ
4.0
8.750
11.842719
-1.0
0.50
5.5
13.75
25.0
4.0
4.000000
1.414214
2.0
3.50
4.5
5.00
5.0
SH
8.0
4.625
12.489281
-10.0
-5.25
2.0
17.50
20.0
8.0
3.625000
1.060660
2.0
3.00
3.5
4.25
5.0
SZ
2.0
5.000
21.213203
-10.0
-2.50
5.0
12.50
20.0
2.0
2.500000
2.121320
1.0
1.75
2.5
3.25
4.0
g. agg( 'min' )
date
temperature
wind
city
BJ
03/01/2016
-3
2
GZ
14/08/2016
-1
2
SH
03/07/2016
-10
2
SZ
11/09/2016
-10
1
def foo ( attr) :
print ( type ( attr) ) , print ( attr)
return np. nan
g. agg( foo)
<class 'pandas.core.series.Series'>
0 03/01/2016
1 17/01/2016
2 31/01/2016
3 14/02/2016
4 28/02/2016
5 13/03/2016
Name: date, dtype: object
<class 'pandas.core.series.Series'>
14 17/07/2016
15 31/07/2016
16 14/08/2016
17 28/08/2016
Name: date, dtype: object
<class 'pandas.core.series.Series'>
6 27/03/2016
7 10/04/2016
8 24/04/2016
9 08/05/2016
10 22/05/2016
11 05/06/2016
12 19/06/2016
13 03/07/2016
Name: date, dtype: object
<class 'pandas.core.series.Series'>
18 11/09/2016
19 25/09/2016
Name: date, dtype: object
<class 'pandas.core.series.Series'>
0 8
1 12
2 19
3 -3
4 19
5 5
Name: temperature, dtype: int64
<class 'pandas.core.series.Series'>
14 10
15 -1
16 1
17 25
Name: temperature, dtype: int64
<class 'pandas.core.series.Series'>
6 -4
7 19
8 20
9 17
10 4
11 -10
12 0
13 -9
Name: temperature, dtype: int64
<class 'pandas.core.series.Series'>
18 20
19 -10
Name: temperature, dtype: int64
<class 'pandas.core.series.Series'>
0 5
1 2
2 2
3 3
4 2
5 3
Name: wind, dtype: int64
<class 'pandas.core.series.Series'>
14 2
15 5
16 5
17 4
Name: wind, dtype: int64
<class 'pandas.core.series.Series'>
6 4
7 3
8 3
9 3
10 2
11 4
12 5
13 5
Name: wind, dtype: int64
<class 'pandas.core.series.Series'>
18 1
19 4
Name: wind, dtype: int64
date
temperature
wind
city
BJ
NaN
NaN
NaN
GZ
NaN
NaN
NaN
SH
NaN
NaN
NaN
SZ
NaN
NaN
NaN
def foo ( attr) :
return attr. max ( ) - attr. min ( )
g. agg( foo)
temperature
wind
city
BJ
22
3
GZ
26
3
SH
30
3
SZ
30
3
g_new = df. groupby( [ 'city' , 'wind' ] )
g_new
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1213acdd0>
g_new. groups
{('BJ', 2): Int64Index([1, 2, 4], dtype='int64'),
('BJ', 3): Int64Index([3, 5], dtype='int64'),
('BJ', 5): Int64Index([0], dtype='int64'),
('GZ', 2): Int64Index([14], dtype='int64'),
('GZ', 4): Int64Index([17], dtype='int64'),
('GZ', 5): Int64Index([15, 16], dtype='int64'),
('SH', 2): Int64Index([10], dtype='int64'),
('SH', 3): Int64Index([7, 8, 9], dtype='int64'),
('SH', 4): Int64Index([6, 11], dtype='int64'),
('SH', 5): Int64Index([12, 13], dtype='int64'),
('SZ', 1): Int64Index([18], dtype='int64'),
('SZ', 4): Int64Index([19], dtype='int64')}
g_new. get_group( ( 'BJ' , 2 ) )
date
city
temperature
wind
1
17/01/2016
BJ
12
2
2
31/01/2016
BJ
19
2
4
28/02/2016
BJ
19
2
for ( name_1, name_2) , group in g_new:
print ( name_1, name_2)
print ( group)
BJ 2
date city temperature wind
1 17/01/2016 BJ 12 2
2 31/01/2016 BJ 19 2
4 28/02/2016 BJ 19 2
BJ 3
date city temperature wind
3 14/02/2016 BJ -3 3
5 13/03/2016 BJ 5 3
BJ 5
date city temperature wind
0 03/01/2016 BJ 8 5
GZ 2
date city temperature wind
14 17/07/2016 GZ 10 2
GZ 4
date city temperature wind
17 28/08/2016 GZ 25 4
GZ 5
date city temperature wind
15 31/07/2016 GZ -1 5
16 14/08/2016 GZ 1 5
SH 2
date city temperature wind
10 22/05/2016 SH 4 2
SH 3
date city temperature wind
7 10/04/2016 SH 19 3
8 24/04/2016 SH 20 3
9 08/05/2016 SH 17 3
SH 4
date city temperature wind
6 27/03/2016 SH -4 4
11 05/06/2016 SH -10 4
SH 5
date city temperature wind
12 19/06/2016 SH 0 5
13 03/07/2016 SH -9 5
SZ 1
date city temperature wind
18 11/09/2016 SZ 20 1
SZ 4
date city temperature wind
19 25/09/2016 SZ -10 4
g. groups
{'BJ': Int64Index([0, 1, 2, 3, 4, 5], dtype='int64'),
'GZ': Int64Index([14, 15, 16, 17], dtype='int64'),
'SH': Int64Index([6, 7, 8, 9, 10, 11, 12, 13], dtype='int64'),
'SZ': Int64Index([18, 19], dtype='int64')}
g. get_group( 'BJ' )
date
temperature
wind
0
03/01/2016
8
5
1
17/01/2016
12
2
2
31/01/2016
19
2
3
14/02/2016
-3
3
4
28/02/2016
19
2
5
13/03/2016
5
3