实现也非常简单,不过多啰嗦,见代码注释:
pv、uv 分别对应 SQL 中的 count(*) 与 count(distinct ad),即:select host, count(*), count(distinct ad) from table group by host;
# -*- coding:utf-8 -*-
"""Per-host PV/UV aggregation of a tab-separated click log.

Equivalent SQL:
    select host, count(*), count(distinct ad) from table group by host;
"""
from datetime import datetime

import pandas as pd

# Default input/output locations used when Main() is called without arguments.
SOURCE_TXT = "/data/u_lx_data/zhangqm/sh/yanjie/fudan/bigdata/bigdata_click_result.txt"
TARGET_TXT = "/data/u_lx_data/zhangqm/sh/yanjie/fudan/bigdata/host_pvuv_click.txt"

# The input file has no header row; these names are assigned positionally.
COLUMN_NAMES = ['ad', 'type', 'host', 'url', 'ref', 'time', 'os', 'os_type']


def _timestamp():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')


def _host_pv_uv(clicks):
    """Return one row per host with its row count and distinct-ad count.

    Args:
        clicks: DataFrame containing at least the 'host' and 'ad' columns.

    Returns:
        DataFrame with columns 'host', 'pv' (number of rows for the host,
        i.e. count(*)) and 'uv' (number of distinct 'ad' values for the
        host, i.e. count(distinct ad)), ordered by host.
    """
    ads_by_host = clicks[['host', 'ad']].groupby('host')['ad']
    # Both aggregates come from the same groupby, so they share the same
    # 'host' index and can be combined directly without a merge.
    stats = pd.DataFrame({
        'pv': ads_by_host.size(),      # count(*)
        'uv': ads_by_host.nunique(),   # count(distinct ad)
    })
    # Drop the index so the grouping field becomes a regular column.
    return stats.reset_index()


def Main(source_txt=SOURCE_TXT, target_txt=TARGET_TXT):
    """Read the click log, compute PV/UV per host and write the result.

    Args:
        source_txt: Path of the tab-separated, header-less input file whose
            columns follow COLUMN_NAMES.
        target_txt: Path of the tab-separated output file; each line is
            host, pv, uv and no header row is written.
    """
    print("开始。。。。。")
    print(_timestamp())

    # Parse the input once; both aggregates are derived from this one frame.
    clicks = pd.read_csv(
        source_txt,
        sep="\t",
        header=None,
        names=COLUMN_NAMES,
        index_col=False,
    )
    result = _host_pv_uv(clicks)
    result.to_csv(target_txt, header=False, index=False, sep="\t")

    print("完成。。。。。")
    print(_timestamp())


if __name__ == "__main__":
    Main()