数据来源
https://fivethirtyeight.com/politics/
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# Absolute path to the local copy of the US flights CSV (machine-specific).
link = '/Users/bennyrhys/Desktop/数据分析可视化-数据集/homework/usa_flights.csv'
# Load the whole CSV into a DataFrame using pandas' default parsing options.
df = pd.read_csv(link)
# Preview the first five rows.
df.head()
|
flight_date |
unique_carrier |
flight_num |
origin |
dest |
arr_delay |
cancelled |
distance |
carrier_delay |
weather_delay |
late_aircraft_delay |
nas_delay |
security_delay |
actual_elapsed_time |
0 |
02/01/2015 0:00 |
AA |
1 |
JFK |
LAX |
-19.0 |
0 |
2475 |
NaN |
NaN |
NaN |
NaN |
NaN |
381.0 |
1 |
03/01/2015 0:00 |
AA |
1 |
JFK |
LAX |
-39.0 |
0 |
2475 |
NaN |
NaN |
NaN |
NaN |
NaN |
358.0 |
2 |
04/01/2015 0:00 |
AA |
1 |
JFK |
LAX |
-12.0 |
0 |
2475 |
NaN |
NaN |
NaN |
NaN |
NaN |
385.0 |
3 |
05/01/2015 0:00 |
AA |
1 |
JFK |
LAX |
-8.0 |
0 |
2475 |
NaN |
NaN |
NaN |
NaN |
NaN |
389.0 |
4 |
06/01/2015 0:00 |
AA |
1 |
JFK |
LAX |
25.0 |
0 |
2475 |
0.0 |
0.0 |
0.0 |
25.0 |
0.0 |
424.0 |
df.tail()
|
flight_date |
unique_carrier |
flight_num |
origin |
dest |
arr_delay |
cancelled |
distance |
carrier_delay |
weather_delay |
late_aircraft_delay |
nas_delay |
security_delay |
actual_elapsed_time |
201659 |
10/01/2015 0:00 |
NK |
188 |
OAK |
LAS |
-16.0 |
0 |
407 |
NaN |
NaN |
NaN |
NaN |
NaN |
77.0 |
201660 |
11/01/2015 0:00 |
NK |
188 |
OAK |
LAS |
-4.0 |
0 |
407 |
NaN |
NaN |
NaN |
NaN |
NaN |
87.0 |
201661 |
12/01/2015 0:00 |
NK |
188 |
OAK |
LAS |
-7.0 |
0 |
407 |
NaN |
NaN |
NaN |
NaN |
NaN |
82.0 |
201662 |
13/01/2015 0:00 |
NK |
188 |
OAK |
LAS |
23.0 |
0 |
407 |
3.0 |
0.0 |
0.0 |
20.0 |
0.0 |
103.0 |
201663 |
14/01/2015 0:00 |
NK |
188 |
OAK |
LAS |
-7.0 |
0 |
407 |
NaN |
NaN |
NaN |
NaN |
NaN |
82.0 |
df.shape
(201664, 14)
延误判定:arr_delay > 0 即视为延误
按到达延误时间(arr_delay)降序排序,取延误最严重的前十个航班
# Rank flights from the largest arrival delay downwards and keep the first ten rows.
df.sort_values(by='arr_delay', ascending=False).head(10)
|
flight_date |
unique_carrier |
flight_num |
origin |
dest |
arr_delay |
cancelled |
distance |
carrier_delay |
weather_delay |
late_aircraft_delay |
nas_delay |
security_delay |
actual_elapsed_time |
11073 |
11/01/2015 0:00 |
AA |
1595 |
AUS |
DFW |
1444.0 |
0 |
190 |
1444.0 |
0.0 |
0.0 |
0.0 |
0.0 |
59.0 |
10214 |
13/01/2015 0:00 |
AA |
1487 |
OMA |
DFW |
1392.0 |
0 |
583 |
1392.0 |
0.0 |
0.0 |
0.0 |
0.0 |
117.0 |
12430 |
03/01/2015 0:00 |
AA |
1677 |
MEM |
DFW |
1384.0 |
0 |
432 |
1380.0 |
0.0 |
0.0 |
4.0 |
0.0 |
104.0 |
8443 |
04/01/2015 0:00 |
AA |
1279 |
OMA |
DFW |
1237.0 |
0 |
583 |
1222.0 |
0.0 |
15.0 |
0.0 |
0.0 |
102.0 |
10328 |
05/01/2015 0:00 |
AA |
1495 |
EGE |
DFW |
1187.0 |
0 |
721 |
1019.0 |
0.0 |
168.0 |
0.0 |
0.0 |
127.0 |
36570 |
04/01/2015 0:00 |
DL |
1435 |
MIA |
MSP |
1174.0 |
0 |
1501 |
1174.0 |
0.0 |
0.0 |
0.0 |
0.0 |
231.0 |
36495 |
04/01/2015 0:00 |
DL |
1367 |
ROC |
ATL |
1138.0 |
0 |
749 |
1112.0 |
0.0 |
0.0 |
26.0 |
0.0 |
171.0 |
59072 |
14/01/2015 0:00 |
DL |
1687 |
SAN |
MSP |
1084.0 |
0 |
1532 |
1070.0 |
0.0 |
0.0 |
14.0 |
0.0 |
240.0 |
32173 |
05/01/2015 0:00 |
AA |
970 |
LAS |
LAX |
1042.0 |
0 |
236 |
1033.0 |
0.0 |
9.0 |
0.0 |
0.0 |
66.0 |
56488 |
12/01/2015 0:00 |
DL |
2117 |
ATL |
COS |
1016.0 |
0 |
1184 |
1016.0 |
0.0 |
0.0 |
0.0 |
0.0 |
193.0 |
计算延误和没有延误所占比例
# Count how many flights fall under each value of the `cancelled` flag.
df['cancelled'].value_counts()
0 196873
1 4791
Name: cancelled, dtype: int64
# Flag a flight as delayed when its arrival delay is strictly positive.
# The comparison is vectorised over the whole column instead of calling a
# Python lambda once per row; rows with a missing arr_delay compare as False,
# exactly as they did with the original lambda.
df['delayed'] = df['arr_delay'] > 0
df.head()
|
flight_date |
unique_carrier |
flight_num |
origin |
dest |
arr_delay |
cancelled |
distance |
carrier_delay |
weather_delay |
late_aircraft_delay |
nas_delay |
security_delay |
actual_elapsed_time |
delayed |
0 |
02/01/2015 0:00 |
AA |
1 |
JFK |
LAX |
-19.0 |
0 |
2475 |
NaN |
NaN |
NaN |
NaN |
NaN |
381.0 |
False |
1 |
03/01/2015 0:00 |
AA |
1 |
JFK |
LAX |
-39.0 |
0 |
2475 |
NaN |
NaN |
NaN |
NaN |
NaN |
358.0 |
False |
2 |
04/01/2015 0:00 |
AA |
1 |
JFK |
LAX |
-12.0 |
0 |
2475 |
NaN |
NaN |
NaN |
NaN |
NaN |
385.0 |
False |
3 |
05/01/2015 0:00 |
AA |
1 |
JFK |
LAX |
-8.0 |
0 |
2475 |
NaN |
NaN |
NaN |
NaN |
NaN |
389.0 |
False |
4 |
06/01/2015 0:00 |
AA |
1 |
JFK |
LAX |
25.0 |
0 |
2475 |
0.0 |
0.0 |
0.0 |
25.0 |
0.0 |
424.0 |
True |
# Tally delayed (True) versus not-delayed (False) flights.
delay_data = df['delayed'].value_counts()
delay_data
False 103037
True 98627
Name: delayed, dtype: int64
type(delay_data)
pandas.core.series.Series
# Share of delayed flights. Select the count by its label (True) rather than by
# the integer 1: value_counts() orders rows by frequency, so a positional
# lookup would return the wrong count whenever delayed flights are the
# majority, and integer keys on a non-integer index are deprecated in pandas.
delay_data.loc[True] / delay_data.sum()
0.4890659711202793
每一个航空公司延误的情况
# Bucket the flights by airline code and by the delayed flag.
delay_group = df.groupby(by=['unique_carrier', 'delayed'])
delay_group
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11ff50710>
delay_group.size()
unique_carrier delayed
AA False 8912
True 9841
AS False 3527
True 2104
B6 False 4832
True 4401
DL False 17719
True 9803
EV False 10596
True 11371
F9 False 1103
True 1848
HA False 1351
True 1354
MQ False 4692
True 8060
NK False 1550
True 2133
OO False 9977
True 10804
UA False 7885
True 8624
US False 7850
True 6353
VX False 1254
True 781
WN False 21789
True 21150
dtype: int64
# Count flights per (carrier, delayed) pair and pivot the delayed flag into
# columns. fill_value=0 keeps the table integer-valued: without it a carrier
# that lacks one of the two outcomes would get NaN and the counts would be
# upcast to float.
df_delay = delay_group.size().unstack(fill_value=0)
df_delay
delayed |
False |
True |
unique_carrier |
|
|
AA |
8912 |
9841 |
AS |
3527 |
2104 |
B6 |
4832 |
4401 |
DL |
17719 |
9803 |
EV |
10596 |
11371 |
F9 |
1103 |
1848 |
HA |
1351 |
1354 |
MQ |
4692 |
8060 |
NK |
1550 |
2133 |
OO |
9977 |
10804 |
UA |
7885 |
8624 |
US |
7850 |
6353 |
VX |
1254 |
781 |
WN |
21789 |
21150 |
import matplotlib.pyplot as plt
df_delay.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x1210efb50>
plt.show()
# Horizontal stacked bars: one bar per carrier, split into not-delayed and delayed counts.
df_delay.plot.barh(stacked=True, figsize=(16, 6), colormap='winter')
<matplotlib.axes._subplots.AxesSubplot at 0x11c9e2290>
透视表功能:按日期(flight_date)和航空公司(unique_carrier)汇总各数值列的均值
# Average every numeric/boolean column per flight date (rows) and carrier
# (columns). The value columns are listed explicitly because pandas >= 2.0
# raises instead of silently dropping columns that cannot be averaged
# (here the text columns origin and dest).
value_columns = df.select_dtypes(include=['number', 'bool']).columns.tolist()
flights_by_carrier = df.pivot_table(
    index='flight_date',
    columns='unique_carrier',
    values=value_columns,
)
flights_by_carrier.head()
|
actual_elapsed_time |
... |
weather_delay |
unique_carrier |
AA |
AS |
B6 |
DL |
EV |
F9 |
HA |
MQ |
NK |
OO |
... |
EV |
F9 |
HA |
MQ |
NK |
OO |
UA |
US |
VX |
WN |
flight_date |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
02/01/2015 0:00 |
176.852122 |
182.872117 |
174.580475 |
155.151703 |
103.325014 |
148.674603 |
108.654709 |
99.608205 |
156.775439 |
101.953434 |
... |
0.214118 |
0.000000 |
7.621849 |
4.956916 |
0.0 |
1.154150 |
0.816867 |
0.131429 |
4.413793 |
0.674080 |
03/01/2015 0:00 |
177.679298 |
189.126126 |
178.595474 |
161.668481 |
107.364508 |
158.116667 |
108.158416 |
102.381295 |
162.085106 |
107.836902 |
... |
1.605061 |
0.454545 |
0.000000 |
4.263838 |
0.0 |
1.369444 |
1.317901 |
2.938053 |
5.350000 |
1.442974 |
04/01/2015 0:00 |
178.200938 |
184.766376 |
179.517287 |
156.963620 |
104.893505 |
149.746888 |
104.878641 |
109.936095 |
162.289753 |
106.465820 |
... |
1.160754 |
0.374269 |
0.000000 |
9.286834 |
0.0 |
1.006859 |
2.996965 |
1.350000 |
6.414634 |
1.116999 |
05/01/2015 0:00 |
176.660858 |
178.226328 |
174.088000 |
142.719584 |
103.107938 |
152.104839 |
103.091787 |
108.128505 |
164.081560 |
103.141727 |
... |
4.005384 |
0.586957 |
0.000000 |
12.048822 |
0.0 |
2.732057 |
8.422122 |
0.116838 |
3.312500 |
1.370968 |
06/01/2015 0:00 |
171.155474 |
173.200483 |
175.029326 |
144.458049 |
100.694926 |
146.739837 |
100.168317 |
100.343423 |
165.410909 |
99.287270 |
... |
2.685092 |
4.866667 |
2.666667 |
7.971370 |
0.0 |
2.753521 |
6.551102 |
6.155660 |
13.615385 |
2.797213 |
5 rows × 154 columns