一个文件夹下多个sheet多个excel获取并清洗

#by carson 2019-05-21
#函数的定义：获取对应目录下多个带有多个sheet工作薄的excel文件
def read_info(inputdir):
    """Load, stack and clean the multi-sheet Excel workbooks in *inputdir*.

    Every entry directly inside *inputdir* whose name starts with ``'2019'``
    is read as an Excel workbook. The nine fixed sheets listed below are
    loaded from each workbook, empty sheets are skipped, and each remaining
    sheet gets a ``month`` column derived from the file name before all
    sheets are stacked.

    The stacked rows are then cleaned once:

    * rows with a missing ``商品`` value are dropped;
    * rows whose ``序号`` equals ``"偶数"`` are dropped;
    * the ``序号`` and ``操作`` columns are removed;
    * ``solr_index`` is added as a copy of ``搜索人气`` with ``"-"``
      replaced by ``"0"``.

    Args:
        inputdir: Directory that contains the workbooks.

    Returns:
        The cleaned ``pandas.DataFrame``. An empty DataFrame is returned
        when no matching workbook yields a non-empty sheet.

    Raises:
        KeyError: If the stacked data lacks one of the columns used by the
            cleaning steps.
        Exception: Anything ``pandas.read_excel`` raises, e.g. when a
            workbook does not contain one of the expected sheets.
    """
    # Sheets expected in every workbook.
    sheet_names = ["ALL-ALL", "ALL-TM", "ALL-TB", "PC-ALL", "PC-TM", "PC-TB", "WX-ALL", "WX-TM", "WX-TB"]
    # Select the workbooks by file-name prefix (endswith() would work too).
    xlsx_names = [x for x in os.listdir(inputdir) if x.startswith('2019')]

    # Collect the per-sheet frames and concatenate once at the end instead of
    # re-copying the accumulated frame on every iteration.
    frames = []
    for xlsx_name in xlsx_names:
        # NOTE: the original had a stray "-- print(...)" statement here that
        # raised TypeError at runtime (unary minus applied to None); removed.

        # Turn the file name into the month value: keep the text before the
        # first '.', then replace '_' with '0' (e.g. '2019_5' -> '201905').
        month = xlsx_name.partition('.')[0].replace('_', '0')

        # Open the workbook once and fetch all expected sheets together;
        # with a list, read_excel returns a dict keyed by sheet name.
        file = os.path.join(inputdir, xlsx_name)
        sheets = pd.read_excel(file, sheet_name=sheet_names)
        for sheet_name in sheet_names:
            sheet_df = sheets[sheet_name]
            if sheet_df.empty:
                continue
            sheet_df['month'] = month
            frames.append(sheet_df)

    if not frames:
        # Nothing to clean; previously this path hit an unbound local.
        return pd.DataFrame()

    combined = pd.concat(frames)

    # Data cleaning, applied once to the fully stacked data.
    with_product = combined.dropna(subset=["商品"])
    kept_rows = with_product[~with_product["序号"].isin(["偶数"])]
    cleaned = kept_rows.drop(["序号", "操作"], axis=1)
    cleaned["solr_index"] = cleaned["搜索人气"].replace("-", "0")
    return cleaned
		
# `os` and `pd` must be bound before this point: the statements below use
# `os` directly and `read_info` looks both names up when it is called.
import os

import pandas as pd

# Tip: df.shape[0] is the number of rows, df.shape[1] the number of columns.

# Input directory scanned by read_info.
inputdir = r'C:\Users\lihwa11\Desktop\data_clean\month_competition_product_monitor'
df_4 = read_info(inputdir)
# Write the cleaned rows into the same directory, without the index column.
storage_file = os.path.join(inputdir, 'test.xlsx')
df_4.to_excel(storage_file, index=False)

===============================================完整版数据清洗======================================================

#品类数据清洗
import pandas as pd
import os
# Root directory; every file found beneath it (recursively) is read as Excel.
inputdir = r'C:\Users\lihwa11\Desktop\data_clean\data'

# Read each file once and concatenate in a single call. DataFrame.append was
# removed in pandas 2.0 and re-copied the accumulated frame on every call.
frames = []
for parents, dirnames, filenames in os.walk(inputdir):
    for filename in filenames:
        df = pd.read_excel(os.path.join(parents, filename))
        frames.append(df)
if not frames:
    # Fail with a clear message instead of the column-length mismatch the
    # renaming step below would otherwise report for an empty frame.
    raise ValueError(f"no files found under {inputdir!r}")
product_type_day = pd.concat(frames, ignore_index=True)

# Keep everything from the sixth row of the combined data onwards
# (presumably the first five rows are report preamble -- TODO confirm).
temp = product_type_day.iloc[5:, :]
# Assign the column names positionally.
temp.columns = [
    "统计日期", "一级类目名称", "二级类目名称", "类目名称",
    "商品访客数", "商品浏览量", "有访客商品数", "有支付商品数",
    "商品加购人数", "商品加购件数", "商品收藏人数",
    "访问收藏转化率", "访问加购转化率",
    "下单买家数", "下单件数", "下单金额", "下单转化率",
    "支付买家数", "支付件数", "支付金额", "支付金额占比", "支付转化率",
    "月累计支付金额", "年累计支付金额", "聚划算支付金额",
    "支付新买家数", "支付老买家数", "老买家支付金额",
    "客单价", "访客平均价值", "售中售后成功退款金额",
]

# Drop rows whose first-level category name is missing.
temp2 = temp.dropna(subset=["一级类目名称"])
# Drop rows where the first-level category cell holds the column title
# itself, i.e. header rows repeated inside the data.
temp4 = temp2[~temp2["一级类目名称"].isin(["一级类目名称"])]
# Distinct first-level and second-level category names, as lists.
demo5 = list(set(temp4["一级类目名称"]))
demo6 = list(set(temp4["二级类目名称"]))
# Remove the roll-up rows: rows whose second-level name is a first-level
# name, then rows whose category name is a second-level name.
temp5 = temp4[~temp4["二级类目名称"].isin(demo5)]
temp6 = temp5[~temp5["类目名称"].isin(demo6)]
一个文件夹下多个sheet多个excel获取并清洗

猜你喜欢