pandas 处理CSV大文件

1、pandas操作CSV

import pandas as pd

df = pd.read_csv('datas/donations.csv', nrows=10000)  # nrows limits how many data rows are read
donation_src_table = pd.DataFrame(df, columns=['projectid', 'is_teacher_acct'])  # keep only the listed columns
df.to_csv('new_filename')  # write the data back out as CSV

df['projectid']  # the whole column named projectid
df['projectid'][1]  # the value at row label 1 (the second row with the default index)
df['projectid'][:5]  # the first five values of the column (a slice, not a single element)

csv_name = 'new_filename.csv'  # example output path used below
lines = [{}]   # each element of lines is a dict describing one row
donation_obj = pd.DataFrame(lines)  # turn the list of dicts into a DataFrame
donation_obj.to_csv(csv_name)

注意:如果CSV中有值为空,则通过DataFrame读取出来的值为 NaN,类型为float类型(注意 NaN 并不等于 0.0),
根据不同的需要,可以通过 pd.isna(value) 或 isinstance(value, type) 来过滤掉

2、处理大文件CSV使用pandas过滤数据例子

import pandas as pd
# import operator

# Columns kept from each raw CSV file.
DONATION_COLUMNS = [
    'projectid', 'is_teacher_acct', 'donation_timestamp',
    'donation_to_project',
    'donation_optional_support', 'donation_total', 'dollar_amount',
    'donation_included_optional_support', 'payment_method', 'payment_included_acct_credit',
    'payment_included_campaign_gift_card', 'payment_included_web_purchased_gift_card',
    'payment_was_promo_matched', 'via_giving_page', 'for_honoree',
]

PROJECT_COLUMNS = [
    'projectid', 'school_state', 'school_metro', 'school_charter',
    'school_magnet',
    'school_year_round', 'school_nlns', 'school_kipp',
    'school_charter_ready_promise', 'teacher_prefix',
    'teacher_teach_for_america', 'teacher_ny_teaching_fellow',
    'primary_focus_area', 'resource_type', 'poverty_level',
    'grade_level', 'fulfillment_labor_materials',
    'total_price_excluding_optional_support',
    'total_price_including_optional_support', 'students_reached',
    'eligible_double_your_impact_match', 'eligible_almost_home_match',
    'date_posted',
]

RESOURCE_COLUMNS = ['projectid', 'item_unit_price', 'item_quantity']

# Load the four raw tables in full.
# Tip: pass nrows=10000 to pd.read_csv to work on a sample, and use
# DataFrame.to_csv('new_filename') to save that sample.
donations_df = pd.read_csv('datas/donations.csv')
projects_df = pd.read_csv('datas/projects.csv')
resource_df = pd.read_csv('datas/resources.csv')
outcomes_df = pd.read_csv('datas/outcomes.csv')

# Narrow each table down to the columns the aggregation functions read.
donation_src_table = pd.DataFrame(donations_df, columns=DONATION_COLUMNS)
project_src_table = pd.DataFrame(projects_df, columns=PROJECT_COLUMNS)
resource_src_table = pd.DataFrame(resource_df, columns=RESOURCE_COLUMNS)


def generate_t_f(flag, data, name):
    """Count one true/false observation for column ``name`` inside ``data``.

    ``data`` is updated in place: ``<name>_t`` is incremented when ``flag``
    equals ``'t'``, otherwise ``<name>_f`` is incremented.  The first time the
    counter being incremented is missing, it is set to 1 and the opposite
    counter is set to 0.  Nothing happens when ``name`` is not a string.
    """
    if not isinstance(name, str):
        return

    true_key = name + '_t'
    false_key = name + '_f'
    # Any flag other than 't' (including missing values) counts as false.
    if flag == 't':
        counted, opposite = true_key, false_key
    else:
        counted, opposite = false_key, true_key

    if counted in data:
        data[counted] += 1
        return

    data[counted] = 1
    data[opposite] = 0


def get_all_donation_datas():
    """Aggregate ``donation_src_table`` into one summary dict per project.

    Rows whose ``projectid`` is not a string (for example NaN read from an
    empty CSV cell) are skipped.  For every other project the summary holds:

    * ``donation_timestamp`` -- the greatest timestamp string seen
      (compared as strings), or ``''`` when none was present;
    * ``donation_to_project``, ``donation_optional_support`` and
      ``donation_total`` -- running float sums;
    * one counter per ``dollar_amount`` value and per ``payment_method``
      value (the known values are always present, starting at 0);
    * ``<name>_t`` / ``<name>_f`` counters for each 't'/'f' flag column,
      maintained by ``generate_t_f``.

    Returns:
        dict mapping projectid -> summary dict.
    """
    t_f_names = ['is_teacher_acct', 'donation_included_optional_support',
                 'payment_included_acct_credit', 'payment_included_campaign_gift_card',
                 'payment_included_web_purchased_gift_card', 'payment_was_promo_matched', 'via_giving_page',
                 'for_honoree']
    sum_names = ['donation_to_project', 'donation_optional_support', 'donation_total']
    dollar_amount_names = ['10_to_100', 'under_10', '100_and_up']
    payment_method_names = ['no_cash_received', 'creditcard', 'paypal', 'amazon',
                            'promo_code_match', 'check', 'double_your_impact_match',
                            'almost_home_match']

    column_names = list(donation_src_table.columns)
    all_t_f_datas = {}
    # Walk the table once, row by row, instead of doing a Series lookup
    # (table[column][i]) for every single cell.
    for values in donation_src_table.itertuples(index=False, name=None):
        row = dict(zip(column_names, values))
        project_id = row['projectid']
        if not isinstance(project_id, str):
            continue

        summary = all_t_f_datas.get(project_id)
        if summary is None:
            # First row of this project: create every counter at zero so the
            # update code below is the same for first and repeat rows.
            summary = all_t_f_datas[project_id] = {'donation_timestamp': ''}
            for name in sum_names:
                summary[name] = 0.0
            for name in dollar_amount_names + payment_method_names:
                summary[name] = 0

        timestamp = row['donation_timestamp']
        if isinstance(timestamp, str) and timestamp > summary['donation_timestamp']:
            summary['donation_timestamp'] = timestamp

        for name in sum_names:
            summary[name] += float(row[name])

        for category_column in ('dollar_amount', 'payment_method'):
            category = row[category_column]
            if isinstance(category, str):
                # .get() keeps an unexpected category value from raising KeyError.
                summary[category] = summary.get(category, 0) + 1

        # Each flag column is counted from its OWN value; the previous code
        # passed the is_teacher_acct value for every column on repeat rows.
        for name in t_f_names:
            generate_t_f(row[name], summary, name)

    return all_t_f_datas


def save_csv(datas, csv_name):
    """Write per-project summaries to a CSV file.

    Args:
        datas: mapping of projectid -> dict of field values, or an iterable
            of ``(projectid, dict)`` pairs.
        csv_name: path of the CSV file to write.

    Every value that can be formatted as a number is rounded to two decimals
    and stored as the string form of that float; any other value (such as a
    timestamp string) is written unchanged.  ``projectid`` is the first
    column.  The DataFrame index is written as well.
    """
    # Iterate (projectid, fields) pairs; iterating a dict directly would
    # yield only its keys.
    records = datas.items() if hasattr(datas, 'items') else datas

    lines = []
    for project_id, fields in records:
        csv_line = {'projectid': project_id}
        for key, value in fields.items():
            try:
                csv_line[key] = str(float('%.2f' % value))
            except (TypeError, ValueError):
                # Not a number: keep the value as it is.
                csv_line[key] = value
        lines.append(csv_line)

    if lines:
        donation_obj = pd.DataFrame(lines)
    else:
        # No records: still emit a file with the projectid header.
        donation_obj = pd.DataFrame(columns=['projectid'])
    donation_obj.to_csv(csv_name)


def total_datas(data, name, number):
    """Add ``number`` (converted to float) to the running total ``data[name]``.

    ``data`` is updated in place; a missing total starts at ``float(number)``.
    Nothing happens when ``name`` is not a string.
    """
    if not isinstance(name, str):
        return

    if name in data:
        data[name] = data[name] + float(number)
    else:
        data[name] = float(number)


def total_school_state(data, name):
    """Count one school in state ``name`` inside ``data`` (updated in place).

    'CA', 'NY' and 'NC' are counted under ``school_state_<name>``; every other
    state is counted under ``school_state_other``.  Nothing happens when
    ``name`` is not a string.

    NOTE(review): a zero-valued ``school_state_<name>`` key is created even
    for states that are counted under "other" -- confirm whether those extra
    keys are wanted in the output.
    """
    if not isinstance(name, str):
        return

    state_key = 'school_state_' + name
    other_key = 'school_state_other'
    # Make sure both counters exist before incrementing one of them.
    data.setdefault(state_key, 0)
    data.setdefault(other_key, 0)

    counted_key = state_key if name in ('CA', 'NY', 'NC') else other_key
    data[counted_key] += 1


def get_all_project_datas():
    """Aggregate ``project_src_table`` into one summary dict per project.

    Rows whose ``projectid`` is not a string (for example NaN read from an
    empty CSV cell) are skipped.  For every other project the summary holds:

    * one counter per value of ``teacher_prefix``, ``resource_type``,
      ``poverty_level``, ``grade_level`` and ``school_metro`` (the known
      values are always present, starting at 0);
    * ``date_posted`` -- the greatest date string seen (compared as strings),
      or ``''`` when none was present;
    * float totals of the numeric columns (via ``total_datas``);
    * ``<name>_t`` / ``<name>_f`` counters for each 't'/'f' flag column
      (via ``generate_t_f``);
    * ``school_state_*`` counters (via ``total_school_state``).

    Returns:
        dict mapping projectid -> summary dict.
    """
    number_names = ['fulfillment_labor_materials', 'total_price_excluding_optional_support',
                    'total_price_including_optional_support', 'students_reached']

    all_t_f_names = ['school_charter', 'school_magnet', 'school_year_round', 'school_nlns', 'school_kipp',
                     'school_charter_ready_promise', 'teacher_teach_for_america', 'teacher_ny_teaching_fellow',
                     'eligible_double_your_impact_match', 'eligible_almost_home_match']

    # (column, known values) pairs; each known value gets its own counter.
    category_columns = [
        ('teacher_prefix', ['Mrs.', 'Mr.', 'Ms.', 'Dr.']),
        ('resource_type', ['Books', 'Technology', 'Other', 'Supplies', 'Visitors', 'Trips']),
        ('poverty_level', ['highest poverty', 'high poverty', 'moderate poverty', 'low poverty']),
        ('grade_level', ['Grades 6-8', 'Grades PreK-2', 'Grades 3-5', 'Grades 9-12']),
        ('school_metro', ['urban', 'rural', 'suburban']),
    ]

    column_names = list(project_src_table.columns)
    all_project_datas = {}
    # Walk the table once, row by row, instead of doing a Series lookup
    # (table[column][i]) for every single cell.
    for values in project_src_table.itertuples(index=False, name=None):
        row = dict(zip(column_names, values))
        # Validate the id from the projects table itself; the previous code
        # looked at the donations table at the same row position.
        project_id = row['projectid']
        if not isinstance(project_id, str):
            continue

        summary = all_project_datas.get(project_id)
        if summary is None:
            # First row of this project: create every counter at zero so the
            # update code below is the same for first and repeat rows.
            summary = all_project_datas[project_id] = {}
            for _, known_values in category_columns:
                for known_value in known_values:
                    summary[known_value] = 0
            summary['date_posted'] = ''

        # school_metro is counted on every row like the other categories;
        # previously it was only counted on the first row of a project.
        for column, _ in category_columns:
            category = row[column]
            if isinstance(category, str):
                # .get() keeps an unexpected category value from raising KeyError.
                summary[category] = summary.get(category, 0) + 1

        date_posted = row['date_posted']
        if isinstance(date_posted, str) and summary['date_posted'] < date_posted:
            summary['date_posted'] = date_posted

        for name in number_names:
            total_datas(summary, name, row[name])

        for name in all_t_f_names:
            generate_t_f(row[name], summary, name)

        total_school_state(summary, row['school_state'])

    return all_project_datas


def get_resources_data():
    """Sum the cost of all resources requested by each project.

    A ``total_price`` column (``item_unit_price * item_quantity``) is added
    to the module-level ``resource_src_table`` as a side effect.  Rows whose
    ``projectid`` is not a string are skipped, and rows whose price is
    missing (NaN) contribute nothing to the sum.

    Returns:
        dict mapping projectid -> ``{'total_price': float}``.
    """
    resource_src_table['total_price'] = resource_src_table["item_unit_price"] * resource_src_table["item_quantity"]

    resources_data = {}
    for project_id, price in zip(resource_src_table['projectid'], resource_src_table['total_price']):
        if not isinstance(project_id, str):
            continue

        entry = resources_data.setdefault(project_id, {'total_price': 0.0})
        # Accumulate: a project usually has several resource rows, and a plain
        # assignment would keep only the last one.
        if pd.notna(price):
            entry['total_price'] += float(price)

    return resources_data


def get_outcomes_data():
    """Collect the outcome columns of ``outcomes_df`` per project.

    If a projectid occurs on several rows, values from the later row replace
    those of the earlier one.

    Returns:
        dict mapping projectid -> dict of outcome column name -> value.
    """
    columns = ['is_exciting', 'at_least_1_teacher_referred_donor', 'fully_funded', 'at_least_1_green_donation',
               'great_chat', 'three_or_more_non_teacher_referred_donors',
               'one_non_teacher_referred_donor_giving_100_plus', 'donation_from_thoughtful_donor',
               'great_messages_proportion', 'teacher_referred_count', 'non_teacher_referred_count']

    outcomes_data = {}
    rows = zip(outcomes_df['projectid'], *(outcomes_df[col] for col in columns))
    for project_id, *values in rows:
        # Key by the outcomes table's own projectid; the previous code used
        # the projectid found at the same row position of the resources table.
        outcomes_data.setdefault(project_id, {}).update(zip(columns, values))

    return outcomes_data


if __name__ == '__main__':
    # Build the four per-project summaries, then merge them for every project
    # that appears in all of them.
    donation_datas = get_all_donation_datas()
    project_datas = get_all_project_datas()
    outcomes_datas = get_outcomes_data()
    resources_datas = get_resources_data()

    # Named merged_datas (not total_datas) so the total_datas() function
    # defined above is not overwritten by a dict.
    merged_datas = {}
    for project_id, donation_summary in donation_datas.items():
        if project_id not in outcomes_datas or \
                project_id not in project_datas or \
                project_id not in resources_datas:
            continue

        # Later sources win when two summaries share a key:
        # outcomes < projects < resources < donations.
        merged = {}
        for part in (outcomes_datas[project_id], project_datas[project_id],
                     resources_datas[project_id], donation_summary):
            merged.update(part)
        merged_datas[project_id] = merged

    for project_id, merged in merged_datas.items():
        print(project_id, merged)

    # To write the result to disk instead of printing it:
    # save_csv(merged_datas, 'tests/all_total_datas.csv')

猜你喜欢

转载自blog.csdn.net/u012089823/article/details/81329055