Python CSV file operations for mathematical modeling

1. In Python, build dictionary keys from one column of a CSV file, then count how many times each key appears in a column of another CSV file.

import pandas as pd
import numpy as np
import csv
import codecs
import sys

# Load both input tables with pandas as soon as the script starts.
# NOTE(review): neither frame is referenced by the code visible below, which
# re-reads the same two paths through the csv module - confirm they are needed.
data_original = pd.read_csv('D:/csv_data_original.csv')
data = pd.read_csv('D:/week1.csv')
# Disabled experiment: keep only the rows whose 'retweeted_status_mid' is set.
# NOTE(review): fillna(..., inplace=True) returns None, so the first line
# would rebind `data` to None if it were re-enabled as written.
#data = data['retweeted_status_mid'].fillna('NOT PROVIDED',inplace=True)
#data_transpond = data[data['retweeted_status_mid'] != 'NOT PROVIDED']

# Count how many times each original post was reposted.
def _gbk_text(raw):
    """Return *raw* (text read as iso-8859-1) re-decoded as GBK.

    The files are opened as iso-8859-1 so that every byte survives reading;
    encoding back to iso-8859-1 restores the original bytes, which are then
    decoded as GBK (undecodable bytes are dropped).
    """
    return raw.encode("iso-8859-1").decode("gbk", "ignore")


def statistics(path1, path2, counts=None):
    """Count, for each key in *path2*, how often it occurs in *path1*.

    The first column of every data row of *path2* becomes a key with an
    initial count of 0.  Then every data row of *path1* whose second column
    matches a known key increments that key's count by one.

    Args:
        path1: CSV file to scan; the key is looked up in column index 1.
        path2: CSV file supplying the keys; the key is column index 0.
        counts: Dictionary that receives the counts.  When omitted, the
            module-level ``mid`` dictionary is used, as in the original
            script.

    Both files are expected to start with a header row, which is printed and
    skipped.  Blank rows, and rows too short to contain the key column, are
    ignored instead of ending the scan.  Empty or header-only files are
    accepted.
    """
    if counts is None:
        counts = mid  # module-level dict created by the script entry point

    created = 0
    # Pass 1: create one zero-initialised entry per distinct key.
    with open(path2, 'r', encoding="iso-8859-1") as f:
        reader = csv.reader(f)
        header = next(reader, None)  # None instead of StopIteration on an empty file
        if header is not None:
            print(header)
        for row in reader:
            if not row:  # csv.reader yields [] for a blank line
                continue
            # Decode once and use the SAME string for the test and the store;
            # testing the raw text but storing the decoded text never matches
            # for non-ASCII keys.
            key = _gbk_text(row[0])
            if key not in counts:
                counts[key] = 0
                created += 1
                print("正在创建第" + str(created) + "个键")
        print("数据处理完毕,键值完全形成" + str(created) + "!")

    # Pass 2: count the occurrences of every known key.
    with open(path1, 'r', encoding="iso-8859-1") as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if header is not None:
            print(header)
        for row in reader:
            if len(row) < 2:  # blank row, or no key column
                continue
            key = _gbk_text(row[1])
            if key in counts:
                counts[key] += 1
                print("这条微博被转发" + str(counts[key]) + "次")
        print("数据处理完毕,转发次数统计完毕")
# Convert the dictionary into two parallel lists.
def transpond(dict):
    """Publish the keys and values of *dict* as module-level lists.

    Rebinds the globals ``list_key`` (keys, in iteration order) and
    ``list_value`` (the matching values); nothing is returned.
    """
    global list_key, list_value
    list_key = [*dict]  # keys
    list_value = [*dict.values()]  # values


# Write the data to a CSV file.
def data_write_csv(file_name, list1, list2):
    """Write *list1* and *list2* side by side as two CSV columns.

    Args:
        file_name: Path of the CSV file to create or overwrite.
        list1: Values for the first column.
        list2: Values for the second column.

    Rows are produced with ``zip``, so output stops at the shorter list.
    """
    with open(file_name, 'w', newline='') as out:
        csv_out = csv.writer(out)
        for pair in zip(list1, list2):
            csv_out.writerow(pair)


if __name__ == "__main__":
    # Module-level state shared with statistics() and transpond().
    mid = {}  # repost count of every original post
    list_key, list_value = [], []  # keys / values, filled by transpond()

    # Input and output locations.
    path_data_original = 'D:/csv_data_original.csv'  # processed data holding only original posts
    path_data = 'D:/week1.csv'  # raw data
    path_save = 'D:/transpond_data.csv'  # where the processed result is saved

    # Count, convert the dictionary to lists, then write the result.
    statistics(path_data, path_data_original)
    transpond(mid)
    data_write_csv(path_save, list_key, list_value)

2. An operation similar to section 1, with a few specific details changed; the code is annotated.

import csv
import pandas as pd

# Sum the repost counts per key.
def _gbk_text(raw):
    """Return *raw* (text read as iso-8859-1) re-decoded as GBK.

    The files are opened as iso-8859-1 so that every byte survives reading;
    encoding back to iso-8859-1 restores the original bytes, which are then
    decoded as GBK (undecodable bytes are dropped).
    """
    return raw.encode("iso-8859-1").decode("gbk", "ignore")


def statistics(path1, path2, counts=None):
    """Sum, for each key in *path2*, the amounts recorded for it in *path1*.

    The first column of every data row of *path2* becomes a key with an
    initial total of 0.  Then, for every data row of *path1* whose third
    column matches a known key, the integer in its second column is added to
    that key's total.

    Args:
        path1: CSV file to scan; the key is column index 2 and the amount to
            add is column index 1.
        path2: CSV file supplying the keys; the key is column index 0.
        counts: Dictionary that receives the totals.  When omitted, the
            module-level ``mid`` dictionary is used, as in the original
            script.

    Raises:
        ValueError: If the amount column of a matching row is not an integer.

    Both files are expected to start with a header row, which is printed and
    skipped.  Blank rows, and rows too short to contain the needed columns,
    are ignored instead of ending the scan.  Empty or header-only files are
    accepted.
    """
    if counts is None:
        counts = mid  # module-level dict created by the script entry point

    created = 0
    # Pass 1: create one zero-initialised entry per distinct key.
    with open(path2, 'r', encoding="iso-8859-1") as f:
        reader = csv.reader(f)
        header = next(reader, None)  # None instead of StopIteration on an empty file
        if header is not None:
            print(header)
        for row in reader:
            if not row:  # csv.reader yields [] for a blank line
                continue
            # Decode once and use the SAME string for the test and the store;
            # testing the raw text but storing the decoded text never matches
            # for non-ASCII keys.
            key = _gbk_text(row[0])
            if key not in counts:
                counts[key] = 0
                created += 1
                print("正在创建第" + str(created) + "个键")
        print("数据处理完毕,键值完全形成" + str(created) + "!")

    # Pass 2: accumulate the amount of every row that belongs to a known key.
    with open(path1, 'r', encoding="iso-8859-1") as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if header is not None:
            print(header)
        for row in reader:
            if len(row) < 3:  # blank row, or key/amount column missing
                continue
            key = _gbk_text(row[2])
            if key in counts:
                counts[key] += int(row[1])
                print("这个用户的微博被转发一共" + str(counts[key]) + "次")
        print("数据处理完毕,转发次数统计完毕")

# Convert the dictionary into two parallel lists.
def transpond(dict):
    """Copy the keys and values of *dict* into module-level lists.

    After the call the globals ``list_key`` and ``list_value`` hold the keys
    and the corresponding values, in iteration order.  Returns nothing.
    """
    global list_key  # keys
    global list_value  # values
    keys = [key for key in dict]
    values = [value for value in dict.values()]
    list_key, list_value = keys, values

# Write the data to a CSV file.
def data_write_csv(file_name, list1, list2):
    """Save two lists as the two columns of a CSV file.

    Args:
        file_name: Path of the CSV file to create or overwrite.
        list1: Values for the first column.
        list2: Values for the second column.

    The lists are paired with ``zip``; surplus items of the longer list are
    not written.
    """
    with open(file_name, 'w', newline='') as handle:
        paired_rows = zip(list1, list2)
        csv.writer(handle).writerows(paired_rows)

if __name__ == '__main__':
    # Module-level state shared with statistics() and transpond().
    mid = {}
    list_key, list_value = [], []

    # Input and output locations.
    path1 = 'D:/csv_data_original_num.csv'  # data used to form the keys
    path2 = 'D:/data_all.csv'  # data searched for those keys
    path_save = 'D:/user_transpond.csv'  # where the finished statistics are stored

    # statistics() takes the file to scan first and the key file second.
    statistics(path2, path1)
    transpond(mid)
    data_write_csv(path_save, list_key, list_value)

3. Split a large CSV file into several smaller files according to certain conditions

#coding = utf-8
import pandas as pd
import csv


def get_txt(path1,path2,path3,path4,path5,path6,path7,path8):
    """Split column index 6 of a CSV file across seven text files.

    The header row of *path1* is printed and skipped.  Data rows are numbered
    from 1, and the value in column index 6 of each row is written, one per
    line, to the output file selected by the row number:

        rows 1 .. 699999         -> path2
        rows 700000 .. 1399999   -> path3
        ...                         (700000 rows per file)
        rows 4200000 .. 4699999  -> path8

    Args:
        path1: UTF-8 encoded source CSV file.
        path2 .. path8: The seven UTF-8 text files to create, in fill order.

    Every file is closed when the function returns, even on error.  Blank
    rows and rows with fewer than seven columns are skipped and not numbered.
    An empty or header-only source file produces seven empty outputs.

    NOTE: rows numbered 4700000 and above are counted and printed but not
    written to any file; this cut-off is kept from the original code.
    """
    from contextlib import ExitStack  # local import keeps the block self-contained

    rows_per_file = 700000
    row_limit = 4700000  # first row number that is no longer written
    out_paths = (path2, path3, path4, path5, path6, path7, path8)

    num = 0
    # ExitStack closes the source and all seven outputs on exit; the original
    # never closed the output files.
    with ExitStack() as stack:
        f = stack.enter_context(open(path1, 'r', encoding='utf-8'))
        outputs = [
            stack.enter_context(open(out_path, "w", encoding='utf-8'))
            for out_path in out_paths
        ]
        reader1 = csv.reader(f)
        data_head1 = next(reader1, None)  # None instead of StopIteration on an empty file
        if data_head1 is not None:
            print(data_head1)
        for data_line in reader1:
            if len(data_line) < 7:  # blank row (csv yields []) or no column 6
                continue
            num += 1
            print(num)
            print(data_line[6])
            if num < row_limit:
                # Integer division maps the row number onto the same ranges
                # as the original if/elif ladder.
                outputs[num // rows_per_file].write(data_line[6] + '\n')
        # NOTE(review): this message mentions repost counting and looks
        # copied from another script; it is kept unchanged.
        print("数据处理完毕,转发次数统计完毕")
if __name__ == '__main__':
    # Source CSV file to split.
    path1 = 'D:/week1.csv'
    # The seven output text files, in the order get_txt() fills them.
    path2, path3, path4, path5, path6, path7, path8 = (
        'D:/text1.txt',
        'D:/text2.txt',
        'D:/text3.txt',
        'D:/text4.txt',
        'D:/text5.txt',
        'D:/text6.txt',
        'D:/text7.txt',
    )
    get_txt(path1, path2, path3, path4, path5, path6, path7, path8)

Guess you like

Origin www.cnblogs.com/tsruixi/p/11406848.html