Use Python to filter Excel by column value and split the table into multiple files

 

 

Scenario: The group headquarters issued province-level data without splitting it by prefecture-level city and business line. The data now needs to be split by prefecture-level city and business line and distributed to each city.

This article uses Python's pandas package to handle the scenario above.

Sample data is shown below:

The Excel workbook above needs to be filtered by city and saved as one file per city.

Now, here is the code:

 

# -*- coding: utf-8 -*-
"""
Created on Fri Nov  1 09:53:30 2019

@author: lanxuxml

Use case:

    An Excel workbook containing multiple sheets needs to be filtered by
    column value, with the results saved as separate files.

"""

import pandas as pd

# Output path pieces: each city's workbook is named head + city + tail.
split_excel_name_head = r'F:\20191129\jidi\湖南省_JD_'

split_excel_name_tail = r'_疑似同一客户合并.xlsx'

# Source workbook that is split by city.
xlsx_name = r"F:\20191129\湖南省_JD_疑似同一客户合并.xls"

# Column names used for filtering (city column and district/county column).
filter_column_name = u'市'
filter_column_name_quxian = u'区县'
# Sheets that are copied to every city workbook without filtering.
sheet_not_filter_names = ['场景说明']

# Reference list of city names; values found in the workbook are intersected
# with it so that no empty workbook or empty sheet is produced.
city_names_unique_all = ['娄底市', '湘西土家族苗族自治州', '邵阳市', '常德市', '长沙市', '株洲市', '郴州市', '益阳市', '永州市', '怀化市', '湘潭市', '衡阳市', '岳阳市','张家界市']

# Collect the distinct values of the two filter columns across all sheets.
found_values = set()
with pd.ExcelFile(xlsx_name) as df:
    # Every sheet takes part except the one named '数据量'; filtering
    # (rather than list.remove) also works when that sheet is absent.
    sheet_names = [name for name in df.sheet_names if name != '数据量']
    for sheet_name in sheet_names:
        tmp_df = df.parse(sheet_name)
        try:
            sheet_values = (
                tmp_df[filter_column_name].unique().tolist()
                + tmp_df[filter_column_name_quxian].unique().tolist()
            )
        except KeyError:
            # The sheet lacks one of the filter columns, so it contributes
            # no city names.
            continue
        found_values.update(sheet_values)

# Cities that actually occur in the data, in the order of the reference list
# so that the processing order is deterministic.
city_names = [name for name in city_names_unique_all if name in found_values]
# Cities from the reference list that have no data in any sheet.
city_no_data = [name for name in city_names_unique_all if name not in found_values]
print(city_no_data)
tmp_report_writer = pd.ExcelWriter(xlsx_name + u"_拆分报告_临时.xlsx")
report_writer = pd.ExcelWriter(xlsx_name + u"_拆分报告.xlsx")

# Read every sheet once up front instead of re-reading the workbook for each
# city; passing a list of sheet names makes read_excel return a dict keyed by
# sheet name.
sheet_frames = pd.read_excel(xlsx_name, sheet_name=sheet_names)

# Write one workbook per city containing that city's rows from every sheet.
for city_name in city_names:
    city_excel_name = split_excel_name_head + str(city_name) + split_excel_name_tail
    # The context manager saves and closes the workbook even if a sheet fails.
    with pd.ExcelWriter(city_excel_name) as writer:
        for sheet_name in sheet_names:
            tmp_df = sheet_frames[sheet_name]
            if sheet_name in sheet_not_filter_names:
                # Unfiltered sheets are copied to every city as-is.
                tmp_sheet = tmp_df
            else:
                # A row belongs to the city when either the city column or
                # the district/county column holds the city name.
                in_city = (
                    tmp_df[filter_column_name].isin([city_name])
                    | tmp_df[filter_column_name_quxian].isin([city_name])
                )
                # Exact duplicate rows are written only once.
                tmp_sheet = tmp_df[in_city].drop_duplicates(keep='first')
            tmp_sheet.to_excel(excel_writer=writer, sheet_name=sheet_name, index=False)

# Temporary report: per sheet, the rows that could not be split by the city
# and district/county columns. It is written once, after all cities are
# processed, so the writer is not closed while it is still needed.
# NOTE(review): a row is reported here only when it matches none of the
# cities in either column - confirm this is the intended report content.
for sheet_name in sheet_names:
    tmp_df = sheet_frames[sheet_name]
    if sheet_name in sheet_not_filter_names:
        # Unfiltered sheets have nothing to report; an empty sheet is written.
        tmp_sliced = pd.DataFrame()
    else:
        in_any_city = (
            tmp_df[filter_column_name].isin(city_names)
            | tmp_df[filter_column_name_quxian].isin(city_names)
        )
        tmp_sliced = tmp_df[~in_any_city]
    tmp_sliced.to_excel(excel_writer=tmp_report_writer, sheet_name=sheet_name, index=False)
# close() also saves the workbook.
tmp_report_writer.close()
    
# Final report: for every sheet, the rows of the source workbook that do not
# appear in the temporary report.
# The writer is closed (which also saves it) only after all sheets have been
# written; closing it inside the loop would lose every sheet after the first.
with report_writer:
    for sheet_name in sheet_names:
        ori_df = pd.read_excel(xlsx_name, sheet_name=sheet_name)
        sli_df = pd.read_excel(xlsx_name + u"_拆分报告_临时.xlsx", sheet_name=sheet_name)
        # Stacking both frames and dropping every row that occurs more than
        # once removes the rows present in both (and any row that is
        # duplicated inside the source sheet itself).
        ori_df = pd.concat([ori_df, sli_df]).drop_duplicates(keep=False)
        ori_df.to_excel(excel_writer=report_writer, sheet_name=sheet_name, index=False)
    


 

Guess you like

Origin blog.csdn.net/lanxuxml/article/details/102891127