import pandas as pd
import numpy as np
from pandas import Series, DataFrame
# 打开美国航空公司数据
f = open('D:\\imooc Python3数据科学入门与实战\\homework\\usa_flights.csv')
df = pd.read_csv(f)
df.head()
Out[6]:
flight_date unique_carrier flight_num origin dest arr_delay \
0 02/01/2015 0:00 AA 1 JFK LAX -19.0
1 03/01/2015 0:00 AA 1 JFK LAX -39.0
2 04/01/2015 0:00 AA 1 JFK LAX -12.0
3 05/01/2015 0:00 AA 1 JFK LAX -8.0
4 06/01/2015 0:00 AA 1 JFK LAX 25.0
cancelled distance carrier_delay weather_delay late_aircraft_delay \
0 0 2475 NaN NaN NaN
1 0 2475 NaN NaN NaN
2 0 2475 NaN NaN NaN
3 0 2475 NaN NaN NaN
4 0 2475 0.0 0.0 0.0
nas_delay security_delay actual_elapsed_time
0 NaN NaN 381.0
1 NaN NaN 358.0
2 NaN NaN 385.0
3 NaN NaN 389.0
4 25.0 0.0 424.0
# 对延误时间进行排序
df.sort_values('arr_delay', ascending=False).head()
Out[7]:
flight_date unique_carrier flight_num origin dest arr_delay \
11073 11/01/2015 0:00 AA 1595 AUS DFW 1444.0
10214 13/01/2015 0:00 AA 1487 OMA DFW 1392.0
12430 03/01/2015 0:00 AA 1677 MEM DFW 1384.0
8443 04/01/2015 0:00 AA 1279 OMA DFW 1237.0
10328 05/01/2015 0:00 AA 1495 EGE DFW 1187.0
cancelled distance carrier_delay weather_delay late_aircraft_delay \
11073 0 190 1444.0 0.0 0.0
10214 0 583 1392.0 0.0 0.0
12430 0 432 1380.0 0.0 0.0
8443 0 583 1222.0 0.0 15.0
10328 0 721 1019.0 0.0 168.0
nas_delay security_delay actual_elapsed_time
11073 0.0 0.0 59.0
10214 0.0 0.0 117.0
12430 4.0 0.0 104.0
8443 0.0 0.0 102.0
10328 0.0 0.0 127.0
#延误时间top10
df.sort_values('arr_delay')[:10]
# 计算延误和没有延误的比例
# 使用value_counts()进行统计
df['cancelled'].value_counts()
Out[9]:
0 196873
1 4791
Name: cancelled, dtype: int64
# 计算延误列
df['delayed'] = df['arr_delay'].apply(lambda x : x>0)
# 统计
df['delayed'].value_counts()
Out[12]:
False 103037
True 98627
Name: delayed, dtype: int64
#计算延误比
delay_date = df['delayed'].value_counts()
delay_date
Out[16]:
False 103037
True 98627
Name: delayed, dtype: int64
delay_date[1]/(delay_date[0]+delay_date[1])
Out[18]: 0.48906597112027927
# 计算每个航空公司的延误比
# 对公司和是否延误进行分组
delay_group = df.groupby(['unique_carrier', 'delayed'])
# 统计查看行数,生成一个多级index,Series
delay_group.size()
Out[23]:
unique_carrier delayed
AA False 8912
True 9841
AS False 3527
True 2104
B6 False 4832
True 4401
DL False 17719
True 9803
EV False 10596
True 11371
F9 False 1103
True 1848
HA False 1351
True 1354
MQ False 4692
True 8060
NK False 1550
True 2133
OO False 9977
True 10804
UA False 7885
True 8624
US False 7850
True 6353
VX False 1254
True 781
WN False 21789
True 21150
dtype: int64
# 多级Series转换为DataFrame
df_delay = delay_group.size().unstack()
df_delay
Out[25]:
delayed False True
unique_carrier
AA 8912 9841
AS 3527 2104
B6 4832 4401
DL 17719 9803
EV 10596 11371
F9 1103 1848
HA 1351 1354
MQ 4692 8060
NK 1550 2133
OO 9977 10804
UA 7885 8624
US 7850 6353
VX 1254 781
WN 21789 21150
# 画图
# 折线图
df_delay.plot()
Out[26]: <matplotlib.axes._subplots.AxesSubplot at 0x1cfde9b6198>
# 柱状图
df_delay.plot(kind='barh', stacked=True, figsize=[16,6], colormap='winter')
Out[27]: <matplotlib.axes._subplots.AxesSubplot at 0x1cfe483b048>
# 透视表
# 对时间和航空公司进行透视
fights_by_carrier = df.pivot_table(index='flight_date', columns='unique_carrier', aggfunc='count')
fights_by_carrier.head()
Out[33]:
actual_elapsed_time \
unique_carrier AA AS B6 DL EV F9 HA MQ NK
flight_date
02/01/2015 0:00 1508 477 758 2261 1763 252 223 975 285
03/01/2015 0:00 1425 444 707 2021 1668 180 202 834 282
04/01/2015 0:00 1493 458 752 2254 1709 241 206 845 283
05/01/2015 0:00 1492 433 750 2211 1751 248 207 856 282
06/01/2015 0:00 1370 414 682 2050 1616 246 202 859 275
weather_delay \
unique_carrier OO ... EV F9 HA MQ NK OO UA US
flight_date ...
02/01/2015 0:00 1718 ... 425 90 119 441 94 506 415 175
03/01/2015 0:00 1588 ... 909 110 111 542 152 720 648 339
04/01/2015 0:00 1653 ... 902 171 25 638 166 729 659 480
05/01/2015 0:00 1644 ... 743 138 28 594 163 627 443 291
06/01/2015 0:00 1469 ... 597 90 12 489 149 426 499 424