The directory contains 60 txt files, and each file holds roughly 70 million rows of data.
Goal: deduplicate the rows within each txt file, merge the 60 files into one combined file, then group the combined data by the first and second columns and count the number of distinct values in the third column (see the small sketch below).
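In pandas terms, that last step is a groupby followed by nunique. A minimal sketch, where the column names c1/c2/c3 and the toy rows are placeholders rather than data from the actual files:

import pandas as pd

# Toy frame standing in for the merged data: group on the first two
# columns, count distinct values of the third.
df = pd.DataFrame({'c1': [1, 1, 1, 2],
                   'c2': ['a', 'a', 'a', 'b'],
                   'c3': ['x', 'y', 'x', 'z']})
print(df.groupby(['c1', 'c2'])['c3'].nunique())
# (1, 'a') -> 2 distinct values; (2, 'b') -> 1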
Each file contains three tab-separated columns: in the 201708 files the column order is (ad, number, name), while in the 201709 files it is (number, name, ad).
The code is as follows:
# -*- coding:utf-8 -*-
from datetime import datetime
import os

import pandas as pd


def Main():
    source_dir = '/data/u_lx_data/zhangqm/Gocapture/ford/ford_tongji/uaad/'
    target_dir = '/data/u_lx_data/zhangqm/Gocapture/ford/ford_tongji/uaad/distinct/'
    target_txt = '/data/u_lx_data/zhangqm/Gocapture/ford/ford_tongji/uaad/distinct/merge_result.txt'
    print("Starting...")
    print("Loading rule data")
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    # Step 1: deduplicate each source file. The 201708 files store the
    # columns as (ad, number, name), so reorder them to (number, name, ad)
    # to match the 201709 layout before writing.
    for file in os.listdir(source_dir):
        if file.find('201708') != -1:
            data = pd.read_csv(source_dir + file, sep="\t", header=None,
                               names=['ad', 'number', 'name'])
            data = data.drop_duplicates()[['number', 'name', 'ad']]
            data.to_csv(target_dir + 'distinct_' + file, sep="\t",
                        header=False, index=False)
        if file.find('201709') != -1:
            data = pd.read_csv(source_dir + file, sep="\t", header=None,
                               names=['number', 'name', 'ad'])
            data = data.drop_duplicates()
            data.to_csv(target_dir + 'distinct_' + file, sep="\t",
                        header=False, index=False)
    # Step 2: concatenate all deduplicated files into one merged file.
    with open(target_txt, "w+") as ff_write:
        for file in os.listdir(target_dir):
            if file.find('distinct_2017') != -1:
                # The path must include target_dir; opening the bare
                # filename only works when run from inside that directory.
                with open(target_dir + file, 'r') as f_read:
                    for line in f_read:
                        fields = line.strip().split("\t")
                        ff_write.write("\t".join(fields) + "\n")
    # Step 3: compute UV - group by (number, name) and count the number
    # of distinct ad values in each group.
    data2 = pd.read_csv(target_txt, sep="\t", header=None,
                        names=['number', 'name', 'ad'])
    data2 = data2.groupby(['number', 'name']).agg({'ad': pd.Series.nunique})
    data2.to_csv('uv_result.txt', sep="\t", header=False, index=True)
    print("Finished...")
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))


if __name__ == "__main__":
    Main()
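One caveat: with 60 files of roughly 70 million rows each, the merged file can approach a few billion rows, so the single pd.read_csv in step 3 may not fit in memory. A hedged alternative is to stream the merged file with chunksize. The sketch below (the uv_in_chunks helper and the 5-million-row chunk size are my own assumptions, not part of the original script) deduplicates each chunk, deduplicates again across chunk boundaries, then counts distinct ad values per (number, name) pair:

import pandas as pd

def uv_in_chunks(target_txt, out_txt, chunksize=5_000_000):
    # Stream the merged file chunk by chunk, keeping only the distinct
    # (number, name, ad) triples from each chunk.
    parts = []
    for chunk in pd.read_csv(target_txt, sep="\t", header=None,
                             names=['number', 'name', 'ad'],
                             chunksize=chunksize):
        parts.append(chunk.drop_duplicates())
    # Deduplicate once more across chunk boundaries (the same triple
    # may appear in several chunks), then count distinct ad values
    # per (number, name) group.
    distinct = pd.concat(parts, ignore_index=True).drop_duplicates()
    uv = distinct.groupby(['number', 'name'])['ad'].nunique()
    uv.to_csv(out_txt, sep="\t", header=False)

This only helps to the extent that deduplication shrinks the data; if even the distinct triples do not fit in memory, an external approach (for example, sort -u on the merged file before counting) would be the more robust route.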