Python 统计 Excel/CSV:利用 pandas 的分组(groupby),并用列表(集合)求差集
csv数据结构(有三个按照日期统计的csv)
需要统计出这三张csv按照areaid缺少的type和bdtype
其中type=[1,2,3,4] bdtype=[1,3,4]
源码如下:
第一步数据初步处理删除非必须列
# -*- coding: utf-8 -*-
import requests
import pandas as pd
import numpy as np
import re
"""数据读取,初步处理删除不要的列"""
# Load the three daily CSV exports, give them a common header, and keep
# only the columns the analysis needs (area_id, type, bd_type).
cname = ['num', 'area_id', 'type', 'bd_type', 'date']


def _load_daily(path):
    """Read one daily CSV, rename its columns to ``cname``, drop 'num'/'date'."""
    frame = pd.read_csv(path)
    frame.columns = cname
    return frame.drop(columns=['num', 'date'])


dftfour, dftthree, dfttwo = [
    _load_daily(path)
    for path in ("20191224.csv", "20191223.csv", "20191222.csv")
]
第二步:按照 'area_id'、'type'、'bd_type' 三列获取三张 csv 的交集
区域的数据结构:
# Columns that identify one (area, type, bd_type) combination.
_MERGE_KEYS = ['area_id', 'type', 'bd_type']

# Inner-join the three daily frames so only combinations present on every
# day remain: df1 covers the first two days, df2 adds the third.
df1 = dftfour.merge(dftthree, on=_MERGE_KEYS, how='inner')
df2 = df1.merge(dfttwo, on=_MERGE_KEYS, how='inner')

# Area lookup table, used to attach a readable name to each area_id.
dfareas = pd.read_csv("all_areas.csv")
dfareas.columns = ['name', 'area_id']

# Final result: the three-day intersection annotated with area names.
dfs = df2.merge(dfareas, on=['area_id'], how='inner')
对结果集分组获取数据,三次分组统计
首次分组获取areaid 再次分组type 最后分组bdtype
# Values every area is expected to have, as stated in the accompanying
# description: type in [1, 2, 3, 4] and bd_type in [1, 3, 4].
# These names were referenced but never defined, which raised NameError.
checType = [1, 2, 3, 4]
checBdtype = [1, 3, 4]

# One row per area: [name, area_id, missing (type, bd_type) pairs, count].
allres = []
for area_id, area_rows in dfs.groupby('area_id'):
    # Take the name from the first row of the area's group.
    tempname = area_rows['name'].iloc[0]

    # Map each type present in this area to the set of bd_type values seen
    # with it. tolist() yields plain Python scalars, so the tuples written
    # to the CSV do not carry numpy scalar reprs.
    seen = {}
    for type_value, bd_value in zip(area_rows['type'].tolist(),
                                    area_rows['bd_type'].tolist()):
        seen.setdefault(type_value, set()).add(bd_value)

    li = []

    # 1. Types that are missing entirely: every expected bd_type is missing
    #    for them, so record the full (type, bd_type) product.
    isLetype = sorted(set(checType).difference(seen))
    if isLetype:
        print("{}[{}]缺少的type:{}".format(tempname, area_id, isLetype))
        li.extend((v, v2) for v in isLetype for v2 in checBdtype)
        print(li)

    # 2. For every type that is present, record the bd_types it lacks.
    #    This is checked per type, not only for the last type iterated.
    for type_value in sorted(seen):
        isLebdtype = sorted(set(checBdtype).difference(seen[type_value]))
        if isLebdtype:
            li.extend((type_value, v) for v in isLebdtype)
            print("{}[{}]的type{}缺少的bdtypes{}".format(
                tempname, area_id, type_value, isLebdtype))

    allres.append([tempname, area_id, li, len(li)])

# Write the report. Passing columns= to the constructor keeps the header
# valid even when no area produced a row (assigning .columns afterwards
# fails on an empty frame).
data = pd.DataFrame(allres, columns=['name', 'area_id', 'all', 'num'])
data.to_csv('allresless.csv')
allresless.csv结果如下: