1、pandas操作CSV
import pandas as pd
# Basic pandas CSV operations (illustrative snippet).
df = pd.read_csv('datas/donations.csv', nrows=10000)  # nrows limits how many rows are read
# Build a frame that keeps only the listed columns of the CSV.
donation_src_table = pd.DataFrame(df, columns=['projectid', 'is_teacher_acct'])
df.to_csv('new_filename')  # write the data back out as CSV
df['projectid']  # the whole column named 'projectid'
df['projectid'][1]  # value at index label 1 (the second row with the default index)
df['projectid'][:5]  # the first five values of the column (a slice, not index 5)
lines = [{}]  # each element of lines is a dict describing one row
donation_obj = pd.DataFrame(lines)  # turn the list of dicts into a DataFrame
csv_name = 'donation_obj.csv'  # output path; previously used without being defined
donation_obj.to_csv(csv_name)
注意:如果CSV中有值为空,则通过pandas读取出来的值为 NaN,类型为float,但它并不等于0.0(也不等于它自身),
根据不同的需要,可以通过 pd.isna(value) 或 isinstance(value, str) 等类型判断来过滤掉
2、处理大文件CSV使用pandas过滤数据例子
import pandas as pd
# import operator
# Load the four raw CSV exports in full.
# For a quick experiment, read a sample instead, e.g.
#     pd.read_csv('datas/donations.csv', nrows=10000).to_csv('new_filename')
donations_df = pd.read_csv('datas/donations.csv')
projects_df = pd.read_csv('datas/projects.csv')
resource_df = pd.read_csv('datas/resources.csv')
outcomes_df = pd.read_csv('datas/outcomes.csv')

# Donations: keep only the columns that the aggregation code reads.
donation_src_table = pd.DataFrame(
    donations_df,
    columns=[
        'projectid',
        'is_teacher_acct',
        'donation_timestamp',
        'donation_to_project',
        'donation_optional_support',
        'donation_total',
        'dollar_amount',
        'donation_included_optional_support',
        'payment_method',
        'payment_included_acct_credit',
        'payment_included_campaign_gift_card',
        'payment_included_web_purchased_gift_card',
        'payment_was_promo_matched',
        'via_giving_page',
        'for_honoree',
    ],
)

# Projects: school flags, teacher attributes, categories, prices and posting date.
project_src_table = pd.DataFrame(
    projects_df,
    columns=[
        'projectid',
        'school_state',
        'school_metro',
        'school_charter',
        'school_magnet',
        'school_year_round',
        'school_nlns',
        'school_kipp',
        'school_charter_ready_promise',
        'teacher_prefix',
        'teacher_teach_for_america',
        'teacher_ny_teaching_fellow',
        'primary_focus_area',
        'resource_type',
        'poverty_level',
        'grade_level',
        'fulfillment_labor_materials',
        'total_price_excluding_optional_support',
        'total_price_including_optional_support',
        'students_reached',
        'eligible_double_your_impact_match',
        'eligible_almost_home_match',
        'date_posted',
    ],
)

# Resources: only the fields needed to compute a line price.
resource_src_table = pd.DataFrame(
    resource_df,
    columns=['projectid', 'item_unit_price', 'item_quantity'],
)
def generate_t_f(flag, data, name):
    """Count one 't'/'f' observation for ``name`` inside ``data``.

    Two counters are maintained in ``data``: ``name + '_t'`` and
    ``name + '_f'``. Both are created (at 0) if missing, then exactly one
    of them is incremented.

    Args:
        flag: The observed value. Only the exact string ``'t'`` counts as
            true; every other value (including a missing/NaN cell) is
            counted as false.
        data: Dict that is updated in place.
        name: Base name of the counters. Non-string names are ignored and
            ``data`` is left untouched.
    """
    if not isinstance(name, str):
        return
    name_t = name + '_t'
    name_f = name + '_f'
    # Initialise each counter independently so that an already existing
    # counter is never reset when only its sibling is missing.
    data.setdefault(name_t, 0)
    data.setdefault(name_f, 0)
    if flag == 't':
        data[name_t] += 1
    else:
        data[name_f] += 1
def get_all_donation_datas():
    """Aggregate the rows of ``donation_src_table`` per project.

    Rows whose ``projectid`` is not a string (e.g. a missing cell) are
    skipped. For every remaining project the result holds:

    * ``donation_timestamp``: the largest timestamp string seen
      (``''`` when no row had a string timestamp);
    * ``donation_to_project``, ``donation_optional_support``,
      ``donation_total``: sums of the row values converted with ``float``;
    * one counter per ``dollar_amount`` and ``payment_method`` value;
    * ``<name>_t`` / ``<name>_f`` counters for each boolean column,
      maintained by ``generate_t_f``.

    Returns:
        dict mapping projectid -> dict of aggregated values.
    """
    t_f_names = ['is_teacher_acct', 'donation_included_optional_support',
                 'payment_included_acct_credit', 'payment_included_campaign_gift_card',
                 'payment_included_web_purchased_gift_card', 'payment_was_promo_matched',
                 'via_giving_page', 'for_honoree']
    sum_names = ['donation_to_project', 'donation_optional_support', 'donation_total']
    dollar_buckets = ['10_to_100', 'under_10', '100_and_up']
    payment_methods = ['no_cash_received', 'creditcard', 'paypal', 'amazon',
                       'promo_code_match', 'check', 'double_your_impact_match',
                       'almost_home_match']

    all_t_f_datas = {}
    columns = list(donation_src_table.columns)
    # Stream the rows once instead of doing a chained ``table[col][i]``
    # lookup for every cell, which is very slow on large files.
    for values in donation_src_table.itertuples(index=False, name=None):
        row = dict(zip(columns, values))
        projectid = row['projectid']
        if not isinstance(projectid, str):
            continue

        entry = all_t_f_datas.get(projectid)
        if entry is None:
            # First row of this project: start every aggregate at zero.
            entry = {'donation_timestamp': ''}
            entry.update(dict.fromkeys(sum_names, 0.0))
            entry.update(dict.fromkeys(dollar_buckets, 0))
            entry.update(dict.fromkeys(payment_methods, 0))
            all_t_f_datas[projectid] = entry

        timestamp = row['donation_timestamp']
        if isinstance(timestamp, str) and timestamp > entry['donation_timestamp']:
            entry['donation_timestamp'] = timestamp

        for name in sum_names:
            entry[name] += float(row[name])

        # Count the categorical values; a value that was not pre-declared
        # gets its own counter instead of raising KeyError.
        for category in (row['dollar_amount'], row['payment_method']):
            if isinstance(category, str):
                entry[category] = entry.get(category, 0) + 1

        # Each boolean column is counted from its OWN cell. Previously,
        # repeated projects reused the 'is_teacher_acct' flag for all of them.
        for name in t_f_names:
            generate_t_f(row[name], entry, name)
    return all_t_f_datas
def save_csv(datas, csv_name):
    """Write a ``{projectid: {column: value}}`` mapping to a CSV file.

    Each entry of ``datas`` becomes one CSV row whose first data column is
    ``projectid``. Values that can be formatted as a number are rounded to
    two decimals and stored as text (e.g. ``1.234`` -> ``'1.23'``); all
    other values are written unchanged.

    Args:
        datas: dict mapping a project id to a dict of column values.
        csv_name: Path of the CSV file to write.
    """
    lines = []
    # Iterate over items: iterating the dict itself yields only the keys.
    for projectid, values in datas.items():
        csv_line = {'projectid': projectid}
        for key, value in values.items():
            try:
                csv_line[key] = str(float('%.2f' % value))
            except (TypeError, ValueError):
                # Not a number (e.g. a timestamp string): keep it as is.
                csv_line[key] = value
        lines.append(csv_line)
    donation_obj = pd.DataFrame(lines)
    donation_obj.to_csv(csv_name)
def total_datas(data, name, number):
    """Add ``number`` to the running total stored under ``data[name]``.

    The total starts at ``float(number)`` the first time ``name`` is seen.

    Args:
        data: Dict that is updated in place.
        name: Key of the total. Non-string names are ignored.
        number: Value to add; it is converted with ``float``.
    """
    if not isinstance(name, str):
        return
    # 0.0 + x == x, so one expression covers both the first and later calls.
    data[name] = data.get(name, 0.0) + float(number)
def total_school_state(data, name):
    """Count one school-state observation inside ``data``.

    Counters exist for the tracked states ``CA``, ``NY`` and ``NC``
    (``school_state_<code>``) plus ``school_state_other`` for every other
    state. All four counters are created at 0 if missing, so every entry
    ends up with the same set of keys.

    Args:
        data: Dict that is updated in place.
        name: State code of the row. Non-string values (e.g. a missing
            cell) are ignored and ``data`` is left untouched.
    """
    if not isinstance(name, str):
        return
    tracked_states = ('CA', 'NY', 'NC')
    other_name = 'school_state_other'
    for state in tracked_states:
        data.setdefault('school_state_' + state, 0)
    data.setdefault(other_name, 0)
    if name in tracked_states:
        data['school_state_' + name] += 1
    else:
        # Untracked states are only counted under "other"; no per-state
        # zero counter is created for them.
        data[other_name] += 1
def get_all_project_datas():
    """Aggregate the rows of ``project_src_table`` per project.

    Rows whose ``projectid`` is not a string (e.g. a missing cell) are
    skipped. For every remaining project the result holds:

    * one counter per value of ``teacher_prefix``, ``resource_type``,
      ``poverty_level``, ``grade_level`` and ``school_metro``;
    * ``date_posted``: the largest date string seen (``''`` if none);
    * running totals of the numeric columns (via ``total_datas``);
    * ``<name>_t`` / ``<name>_f`` counters for the boolean columns
      (via ``generate_t_f``);
    * school-state counters (via ``total_school_state``).

    Returns:
        dict mapping projectid -> dict of aggregated values.
    """
    number_names = ['fulfillment_labor_materials', 'total_price_excluding_optional_support',
                    'total_price_including_optional_support', 'students_reached']
    all_t_f_names = ['school_charter', 'school_magnet', 'school_year_round', 'school_nlns',
                     'school_kipp', 'school_charter_ready_promise', 'teacher_teach_for_america',
                     'teacher_ny_teaching_fellow', 'eligible_double_your_impact_match',
                     'eligible_almost_home_match']
    # Categorical column -> values that always get a counter (starting at 0).
    category_values = [
        ('teacher_prefix', ['Mrs.', 'Mr.', 'Ms.', 'Dr.']),
        ('resource_type', ['Books', 'Technology', 'Other', 'Supplies', 'Visitors', 'Trips']),
        ('poverty_level', ['highest poverty', 'high poverty', 'moderate poverty', 'low poverty']),
        ('grade_level', ['Grades 6-8', 'Grades PreK-2', 'Grades 3-5', 'Grades 9-12']),
        ('school_metro', ['urban', 'rural', 'suburban']),
    ]

    all_project_datas = {}
    columns = list(project_src_table.columns)
    # Stream the rows once instead of doing a chained ``table[col][i]``
    # lookup for every cell, which is very slow on large files.
    for values in project_src_table.itertuples(index=False, name=None):
        row = dict(zip(columns, values))
        # The id is taken from the project row itself; the donation table
        # was consulted here before.
        projectid = row['projectid']
        if not isinstance(projectid, str):
            continue

        entry = all_project_datas.get(projectid)
        if entry is None:
            # First row of this project: declare every known counter.
            entry = {}
            for _, known_values in category_values:
                entry.update(dict.fromkeys(known_values, 0))
            entry['date_posted'] = ''
            all_project_datas[projectid] = entry

        # Every categorical column (school_metro included) is counted for
        # every row; an undeclared value gets its own counter instead of
        # raising KeyError.
        for column, _ in category_values:
            category = row[column]
            if isinstance(category, str):
                entry[category] = entry.get(category, 0) + 1

        date_posted = row['date_posted']
        if isinstance(date_posted, str) and entry['date_posted'] < date_posted:
            entry['date_posted'] = date_posted

        for name in number_names:
            total_datas(entry, name, row[name])
        for name in all_t_f_names:
            generate_t_f(row[name], entry, name)
        total_school_state(entry, row['school_state'])
    return all_project_datas
def get_resources_data():
    """Compute the total resource price of every project.

    A ``total_price`` column (``item_unit_price * item_quantity``) is added
    to the module-level ``resource_src_table``; the line prices are then
    summed per project, so a project with several resource rows gets the
    sum of all of them rather than the price of a single row.

    Rows with a missing ``projectid`` are left out by the grouping, and a
    missing line price is skipped by the sum (pandas defaults).

    Returns:
        dict mapping projectid -> ``{'total_price': <float>}``.
    """
    resource_src_table['total_price'] = (
        resource_src_table['item_unit_price'] * resource_src_table['item_quantity'])
    # sort=False keeps the projects in order of first appearance.
    totals = resource_src_table.groupby('projectid', sort=False)['total_price'].sum()
    return {projectid: {'total_price': float(total)}
            for projectid, total in totals.items()}
def get_outcomes_data():
    """Collect the outcome columns of ``outcomes_df`` per project.

    Each result is keyed by the ``projectid`` of its own outcomes row
    (the resources table was used as the key source before). If a project
    id occurs more than once, the values of the last row are kept.

    Returns:
        dict mapping projectid -> dict of the outcome column values.
    """
    columns = ['is_exciting', 'at_least_1_teacher_referred_donor', 'fully_funded',
               'at_least_1_green_donation', 'great_chat',
               'three_or_more_non_teacher_referred_donors',
               'one_non_teacher_referred_donor_giving_100_plus',
               'donation_from_thoughtful_donor', 'great_messages_proportion',
               'teacher_referred_count', 'non_teacher_referred_count']
    outcomes_data = {}
    selected = outcomes_df[['projectid'] + columns]
    for record in selected.itertuples(index=False, name=None):
        # record is (projectid, <values in the order of ``columns``>).
        outcomes_data[record[0]] = dict(zip(columns, record[1:]))
    return outcomes_data
if __name__ == '__main__':
    # Build the four per-project aggregates.
    donation_datas = get_all_donation_datas()
    project_datas = get_all_project_datas()
    outcomes_datas = get_outcomes_data()
    resources_datas = get_resources_data()

    # Merge them for every project that is present in all four sources.
    # The result gets its own name so that it does not shadow the
    # total_datas() helper function.
    merged_datas = {}
    for key, value in donation_datas.items():
        if key in outcomes_datas and key in project_datas and key in resources_datas:
            combined = {}
            # Later sources overwrite earlier ones on a key clash:
            # outcomes < projects < resources < donations.
            for source in (outcomes_datas[key], project_datas[key],
                           resources_datas[key], value):
                combined.update(source)
            merged_datas[key] = combined

    for key, value in merged_datas.items():
        print(key, value)
    # To persist the result instead of printing it:
    # save_csv(merged_datas, 'tests/all_total_datas.csv')