Pandas玩转数据(十四) -- 分组和透视功能实战

import pandas as pd
import numpy as np
from pandas import Series, DataFrame

# 打开美国航空公司数据
f = open('D:\\imooc Python3数据科学入门与实战\\homework\\usa_flights.csv')
df = pd.read_csv(f)

df.head()
Out[6]: 
       flight_date unique_carrier  flight_num origin dest  arr_delay  \
0  02/01/2015 0:00             AA           1    JFK  LAX      -19.0   
1  03/01/2015 0:00             AA           1    JFK  LAX      -39.0   
2  04/01/2015 0:00             AA           1    JFK  LAX      -12.0   
3  05/01/2015 0:00             AA           1    JFK  LAX       -8.0   
4  06/01/2015 0:00             AA           1    JFK  LAX       25.0   

   cancelled  distance  carrier_delay  weather_delay  late_aircraft_delay  \
0          0      2475            NaN            NaN                  NaN   
1          0      2475            NaN            NaN                  NaN   
2          0      2475            NaN            NaN                  NaN   
3          0      2475            NaN            NaN                  NaN   
4          0      2475            0.0            0.0                  0.0   

   nas_delay  security_delay  actual_elapsed_time  
0        NaN             NaN                381.0  
1        NaN             NaN                358.0  
2        NaN             NaN                385.0  
3        NaN             NaN                389.0  
4       25.0             0.0                424.0  


# 对延误时间进行排序
df.sort_values('arr_delay', ascending=False).head()
Out[7]: 
           flight_date unique_carrier  flight_num origin dest  arr_delay  \
11073  11/01/2015 0:00             AA        1595    AUS  DFW     1444.0   
10214  13/01/2015 0:00             AA        1487    OMA  DFW     1392.0   
12430  03/01/2015 0:00             AA        1677    MEM  DFW     1384.0   
8443   04/01/2015 0:00             AA        1279    OMA  DFW     1237.0   
10328  05/01/2015 0:00             AA        1495    EGE  DFW     1187.0   

       cancelled  distance  carrier_delay  weather_delay  late_aircraft_delay  \
11073          0       190         1444.0            0.0                  0.0   
10214          0       583         1392.0            0.0                  0.0   
12430          0       432         1380.0            0.0                  0.0   
8443           0       583         1222.0            0.0                 15.0   
10328          0       721         1019.0            0.0                168.0   

       nas_delay  security_delay  actual_elapsed_time  
11073        0.0             0.0                 59.0  
10214        0.0             0.0                117.0  
12430        4.0             0.0                104.0  
8443         0.0             0.0                102.0  
10328        0.0             0.0                127.0  

#延误时间top10
df.sort_values('arr_delay')[:10]

# 计算延误和没有延误的比例
# 使用value_counts()进行统计
df['cancelled'].value_counts()
Out[9]: 
0    196873
1      4791
Name: cancelled, dtype: int64

# 计算延误列
df['delayed'] = df['arr_delay'].apply(lambda x : x>0)
# 统计
df['delayed'].value_counts()
Out[12]: 
False    103037
True      98627
Name: delayed, dtype: int64

#计算延误比
delay_date = df['delayed'].value_counts()
delay_date
Out[16]: 
False    103037
True      98627
Name: delayed, dtype: int64
delay_date[1]/(delay_date[0]+delay_date[1])
Out[18]: 0.48906597112027927

# 计算每个航空公司的延误比
# 对公司和是否延误进行分组
delay_group = df.groupby(['unique_carrier', 'delayed'])

# 统计查看行数,生成一个多级index,Series
delay_group.size()
Out[23]: 
unique_carrier  delayed
AA              False       8912
                True        9841
AS              False       3527
                True        2104
B6              False       4832
                True        4401
DL              False      17719
                True        9803
EV              False      10596
                True       11371
F9              False       1103
                True        1848
HA              False       1351
                True        1354
MQ              False       4692
                True        8060
NK              False       1550
                True        2133
OO              False       9977
                True       10804
UA              False       7885
                True        8624
US              False       7850
                True        6353
VX              False       1254
                True         781
WN              False      21789
                True       21150
dtype: int64

# 多级Series转换为DataFrame
df_delay = delay_group.size().unstack()
df_delay
Out[25]: 
delayed         False  True 
unique_carrier              
AA               8912   9841
AS               3527   2104
B6               4832   4401
DL              17719   9803
EV              10596  11371
F9               1103   1848
HA               1351   1354
MQ               4692   8060
NK               1550   2133
OO               9977  10804
UA               7885   8624
US               7850   6353
VX               1254    781
WN              21789  21150

# 画图
# 折线图
df_delay.plot()
Out[26]: <matplotlib.axes._subplots.AxesSubplot at 0x1cfde9b6198>
# 柱状图
df_delay.plot(kind='barh', stacked=True, figsize=[16,6], colormap='winter')
Out[27]: <matplotlib.axes._subplots.AxesSubplot at 0x1cfe483b048>

# 透视表
# 对时间和航空公司进行透视
fights_by_carrier = df.pivot_table(index='flight_date', columns='unique_carrier', aggfunc='count')

fights_by_carrier.head()
Out[33]: 
                actual_elapsed_time                                            \
unique_carrier                   AA   AS   B6    DL    EV   F9   HA   MQ   NK   
flight_date                                                                     
02/01/2015 0:00                1508  477  758  2261  1763  252  223  975  285   
03/01/2015 0:00                1425  444  707  2021  1668  180  202  834  282   
04/01/2015 0:00                1493  458  752  2254  1709  241  206  845  283   
05/01/2015 0:00                1492  433  750  2211  1751  248  207  856  282   
06/01/2015 0:00                1370  414  682  2050  1616  246  202  859  275   

 weather_delay                                     \
unique_carrier     OO  ...             EV   F9   HA   MQ   NK   OO   UA   US   
flight_date            ...                                                     
02/01/2015 0:00  1718  ...            425   90  119  441   94  506  415  175   
03/01/2015 0:00  1588  ...            909  110  111  542  152  720  648  339   
04/01/2015 0:00  1653  ...            902  171   25  638  166  729  659  480   
05/01/2015 0:00  1644  ...            743  138   28  594  163  627  443  291   
06/01/2015 0:00  1469  ...            597   90   12  489  149  426  499  424  

这里写图片描述
这里写图片描述

猜你喜欢

转载自blog.csdn.net/weixin_39778570/article/details/81124608
今日推荐