Analysis of political contribution data for the 2012 US presidential candidates
Import packages
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
For convenience, first define the month lookup table, the candidates of interest, and each candidate's party
# Month abbreviation -> month number (1-12).
months = dict(zip(
    ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
     'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'],
    range(1, 13),
))

# Candidate names singled out for closer inspection.
of_interest = [
    'Obama, Barack',
    'Romney, Mitt',
    'Santorum, Rick',
    'Paul, Ron',
    'Gingrich, Newt',
]

# Candidate name -> party affiliation.
parties = {
    'Bachmann, Michelle': 'Republican',
    'Romney, Mitt': 'Republican',
    'Obama, Barack': 'Democrat',
    "Roemer, Charles E. 'Buddy' III": 'Reform',
    'Pawlenty, Timothy': 'Republican',
    'Johnson, Gary Earl': 'Libertarian',
    'Paul, Ron': 'Republican',
    'Santorum, Rick': 'Republican',
    'Cain, Herman': 'Republican',
    'Gingrich, Newt': 'Republican',
    'McCotter, Thaddeus G': 'Republican',
    'Huntsman, Jon': 'Republican',
    'Perry, Rick': 'Republican',
}
# Load the contribution records and preview the first rows.
# pandas reports mixed types in column 6 (contbr_zip) when it infers the
# dtype, and the ZIP codes end up displayed as floats. Reading that column
# as str avoids the DtypeWarning and keeps the codes as text.
df = pd.read_csv('./data/usa_election.txt', dtype={'contbr_zip': str})
df.head()
C:\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2728: DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
|
cmte_id |
cand_id |
cand_nm |
contbr_nm |
contbr_city |
contbr_st |
contbr_zip |
contbr_employer |
contbr_occupation |
contb_receipt_amt |
contb_receipt_dt |
receipt_desc |
memo_cd |
memo_text |
form_tp |
file_num |
0 |
C00410118 |
P20002978 |
Bachmann, Michelle |
HARVEY, WILLIAM |
MOBILE |
AL |
3.6601e+08 |
RETIRED |
RETIRED |
250.0 |
20-JUN-11 |
NaN |
NaN |
NaN |
SA17A |
736166 |
1 |
C00410118 |
P20002978 |
Bachmann, Michelle |
HARVEY, WILLIAM |
MOBILE |
AL |
3.6601e+08 |
RETIRED |
RETIRED |
50.0 |
23-JUN-11 |
NaN |
NaN |
NaN |
SA17A |
736166 |
2 |
C00410118 |
P20002978 |
Bachmann, Michelle |
SMITH, LANIER |
LANETT |
AL |
3.68633e+08 |
INFORMATION REQUESTED |
INFORMATION REQUESTED |
250.0 |
05-JUL-11 |
NaN |
NaN |
NaN |
SA17A |
749073 |
3 |
C00410118 |
P20002978 |
Bachmann, Michelle |
BLEVINS, DARONDA |
PIGGOTT |
AR |
7.24548e+08 |
NONE |
RETIRED |
250.0 |
01-AUG-11 |
NaN |
NaN |
NaN |
SA17A |
749073 |
4 |
C00410118 |
P20002978 |
Bachmann, Michelle |
WARDENBURG, HAROLD |
HOT SPRINGS NATION |
AR |
7.19016e+08 |
NONE |
RETIRED |
300.0 |
20-JUN-11 |
NaN |
NaN |
NaN |
SA17A |
736166 |
# Add a new 'party' column holding the party of each row's candidate.
df['party'] = df.loc[:, 'cand_nm'].map(parties)
df.head()
|
cmte_id |
cand_id |
cand_nm |
contbr_nm |
contbr_city |
contbr_st |
contbr_zip |
contbr_employer |
contbr_occupation |
contb_receipt_amt |
contb_receipt_dt |
receipt_desc |
memo_cd |
memo_text |
form_tp |
file_num |
party |
0 |
C00410118 |
P20002978 |
Bachmann, Michelle |
HARVEY, WILLIAM |
MOBILE |
AL |
3.6601e+08 |
RETIRED |
RETIRED |
250.0 |
20-JUN-11 |
NaN |
NaN |
NaN |
SA17A |
736166 |
Republican |
1 |
C00410118 |
P20002978 |
Bachmann, Michelle |
HARVEY, WILLIAM |
MOBILE |
AL |
3.6601e+08 |
RETIRED |
RETIRED |
50.0 |
23-JUN-11 |
NaN |
NaN |
NaN |
SA17A |
736166 |
Republican |
2 |
C00410118 |
P20002978 |
Bachmann, Michelle |
SMITH, LANIER |
LANETT |
AL |
3.68633e+08 |
INFORMATION REQUESTED |
INFORMATION REQUESTED |
250.0 |
05-JUL-11 |
NaN |
NaN |
NaN |
SA17A |
749073 |
Republican |
3 |
C00410118 |
P20002978 |
Bachmann, Michelle |
BLEVINS, DARONDA |
PIGGOTT |
AR |
7.24548e+08 |
NONE |
RETIRED |
250.0 |
01-AUG-11 |
NaN |
NaN |
NaN |
SA17A |
749073 |
Republican |
4 |
C00410118 |
P20002978 |
Bachmann, Michelle |
WARDENBURG, HAROLD |
HOT SPRINGS NATION |
AR |
7.19016e+08 |
NONE |
RETIRED |
300.0 |
20-JUN-11 |
NaN |
NaN |
NaN |
SA17A |
736166 |
Republican |
# List the distinct values that occur in the party column.
df.loc[:, 'party'].unique()
array(['Republican', 'Democrat', 'Reform', 'Libertarian'], dtype=object)
# Count how many rows carry each party value.
df.party.value_counts()
Democrat 292400
Republican 237575
Reform 5364
Libertarian 702
Name: party, dtype: int64
# Total contribution amount (contb_receipt_amt) received by each party.
(
    df.groupby('party')
    ['contb_receipt_amt']
    .sum()
)
party
Democrat 8.105758e+07
Libertarian 4.132769e+05
Reform 3.390338e+05
Republican 1.192255e+08
Name: contb_receipt_amt, dtype: float64
# Total contribution amount (contb_receipt_amt) received by each party on
# each receipt date.
(
    df.groupby(['contb_receipt_dt', 'party'])
    ['contb_receipt_amt']
    .sum()
)
contb_receipt_dt party
01-APR-11 Reform 50.00
Republican 12635.00
01-AUG-11 Democrat 175281.00
Libertarian 1000.00
Reform 1847.00
Republican 234598.46
01-DEC-11 Democrat 651532.82
Libertarian 725.00
Reform 875.00
Republican 486405.96
01-FEB-11 Republican 250.00
01-JAN-11 Republican 8600.00
01-JAN-12 Democrat 58098.80
Reform 515.00
Republican 75704.72
01-JUL-11 Democrat 165961.00
Libertarian 2000.00
Reform 100.00
Republican 115848.72
01-JUN-11 Democrat 145459.00
Libertarian 500.00
Reform 50.00
Republican 433109.20
01-MAR-11 Republican 1000.00
01-MAY-11 Democrat 82644.00
Reform 480.00
Republican 28663.87
01-NOV-11 Democrat 122529.87
Libertarian 3000.00
Reform 1792.00
...
30-OCT-11 Reform 3910.00
Republican 43913.16
30-SEP-11 Democrat 3373517.24
Libertarian 550.00
Reform 2050.00
Republican 4886331.76
31-AUG-11 Democrat 374387.44
Libertarian 10750.00
Reform 450.00
Republican 1017735.02
31-DEC-11 Democrat 3553072.57
Reform 695.00
Republican 1094376.72
31-JAN-11 Republican 6000.00
31-JAN-12 Democrat 1418410.31
Reform 150.00
Republican 869890.41
31-JUL-11 Democrat 20305.00
Reform 966.00
Republican 12781.02
31-MAR-11 Reform 200.00
Republican 62475.00
31-MAY-11 Democrat 351705.66
Libertarian 250.00
Reform 100.00
Republican 301339.80
31-OCT-11 Democrat 204996.87
Libertarian 4250.00
Reform 3105.00
Republican 734601.83
Name: contb_receipt_amt, Length: 1183, dtype: float64
# Convert receipt dates from 'DD-MON-YY' (e.g. '20-JUN-11') to 'yyyy-mm-dd'.
def transformDate(d):
    """Return the date string *d* ('DD-MON-YY') reformatted as 'yyyy-mm-dd'.

    The month abbreviation is looked up in the module-level ``months``
    table; an abbreviation that is not a key of that table raises
    ``KeyError``. The two-digit year is prefixed with '20'.

    Month and day are zero-padded to two digits so the result really is
    'yyyy-mm-dd' (e.g. '2011-06-20', not '2011-6-20') and therefore sorts
    chronologically when compared as text.
    """
    day, month, year = d.split('-')
    return '20{}-{:02d}-{}'.format(year, months[month], day.zfill(2))
# Rewrite every receipt date with transformDate, then preview the result.
df['contb_receipt_dt'] = [transformDate(raw) for raw in df['contb_receipt_dt']]
df.head()
|
cmte_id |
cand_id |
cand_nm |
contbr_nm |
contbr_city |
contbr_st |
contbr_zip |
contbr_employer |
contbr_occupation |
contb_receipt_amt |
contb_receipt_dt |
receipt_desc |
memo_cd |
memo_text |
form_tp |
file_num |
party |
0 |
C00410118 |
P20002978 |
Bachmann, Michelle |
HARVEY, WILLIAM |
MOBILE |
AL |
3.6601e+08 |
RETIRED |
RETIRED |
250.0 |
2011-6-20 |
NaN |
NaN |
NaN |
SA17A |
736166 |
Republican |
1 |
C00410118 |
P20002978 |
Bachmann, Michelle |
HARVEY, WILLIAM |
MOBILE |
AL |
3.6601e+08 |
RETIRED |
RETIRED |
50.0 |
2011-6-23 |
NaN |
NaN |
NaN |
SA17A |
736166 |
Republican |
2 |
C00410118 |
P20002978 |
Bachmann, Michelle |
SMITH, LANIER |
LANETT |
AL |
3.68633e+08 |
INFORMATION REQUESTED |
INFORMATION REQUESTED |
250.0 |
2011-7-05 |
NaN |
NaN |
NaN |
SA17A |
749073 |
Republican |
3 |
C00410118 |
P20002978 |
Bachmann, Michelle |
BLEVINS, DARONDA |
PIGGOTT |
AR |
7.24548e+08 |
NONE |
RETIRED |
250.0 |
2011-8-01 |
NaN |
NaN |
NaN |
SA17A |
749073 |
Republican |
4 |
C00410118 |
P20002978 |
Bachmann, Michelle |
WARDENBURG, HAROLD |
HOT SPRINGS NATION |
AR |
7.19016e+08 |
NONE |
RETIRED |
300.0 |
2011-6-20 |
NaN |
NaN |
NaN |
SA17A |
736166 |
Republican |
# Which candidate do disabled veterans (by contributor occupation) mainly
# support, i.e. which candidate receives the most money from them?
# 1. Select the rows whose occupation is 'DISABLED VETERAN'. The comparison
#    is evaluated once, directly inside the selection.
old_bing = df.loc[df['contbr_occupation'] == 'DISABLED VETERAN']
# 2. Group those rows by candidate and sum the contribution amounts.
old_bing.groupby(by='cand_nm')['contb_receipt_amt'].sum()
cand_nm
Cain, Herman 300.00
Obama, Barack 4205.00
Paul, Ron 2425.49
Santorum, Rick 250.00
Name: contb_receipt_amt, dtype: float64
# Largest single contribution amount in the data.
df.loc[:, 'contb_receipt_amt'].max()
1944042.43
# Show the record(s) with the largest contribution (including the
# contributor's occupation and the amount) using query().
# The maximum is computed from the data and referenced with '@' instead of
# being typed in as a float literal, so the lookup does not depend on a
# hand-copied value.
largest_amt = df['contb_receipt_amt'].max()
df.query('contb_receipt_amt == @largest_amt')
|
cmte_id |
cand_id |
cand_nm |
contbr_nm |
contbr_city |
contbr_st |
contbr_zip |
contbr_employer |
contbr_occupation |
contb_receipt_amt |
contb_receipt_dt |
receipt_desc |
memo_cd |
memo_text |
form_tp |
file_num |
party |
176127 |
C00431445 |
P80003338 |
Obama, Barack |
OBAMA VICTORY FUND 2012 - UNITEMIZED |
CHICAGO |
IL |
60680 |
NaN |
NaN |
1944042.43 |
2011-12-31 |
NaN |
X |
* |
SA18 |
763233 |
Democrat |