import os
import zipfile
import pandas as pd
def merge_csv_files(zip_folder, output_folder):
    """Extract each zip archive in ``zip_folder`` and merge its CSV files.

    Every file in ``zip_folder`` whose name ends with ``.zip`` is extracted
    into ``<output_folder>/temp``. Each extracted ``.csv`` file (at any depth
    inside the archive) is loaded and handed to ``merge_csv``, which
    accumulates rows in ``output_folder`` under the CSV's base name. The
    temporary folder is removed after each archive, even when reading or
    merging fails.

    Args:
        zip_folder: Directory that is scanned (non-recursively) for archives.
        output_folder: Directory that receives the merged CSV files; it is
            created when missing.

    Raises:
        zipfile.BadZipFile: If a ``.zip`` file is not a valid archive.
        Any exception raised by ``pd.read_csv`` or ``merge_csv`` propagates
        after the temporary folder has been cleaned up.
    """
    os.makedirs(output_folder, exist_ok=True)
    temp_folder = os.path.join(output_folder, 'temp')

    for zip_file in os.listdir(zip_folder):
        if not zip_file.endswith('.zip'):
            continue
        zip_path = os.path.join(zip_folder, zip_file)
        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(temp_folder)
            # Walk the whole extracted tree: archives frequently wrap their
            # files in a top-level folder, which a flat listing would miss.
            for root, _dirs, files in os.walk(temp_folder):
                for csv_file in files:
                    if not csv_file.endswith('.csv'):
                        continue
                    csv_path = os.path.join(root, csv_file)
                    try:
                        df = pd.read_csv(csv_path, encoding='utf-8')
                    except UnicodeDecodeError:
                        # Fall back to latin1 when the file is not valid UTF-8.
                        df = pd.read_csv(csv_path, encoding='latin1')
                    merge_csv(df, csv_file, output_folder)
        finally:
            # Always drop the extracted files so a failure cannot leak stale
            # CSVs into the next archive (or the next run).
            if os.path.isdir(temp_folder):
                clean_temp_folder(temp_folder)
def merge_csv(df, csv_file, output_folder):
    """Append ``df`` to the CSV named after ``csv_file`` in ``output_folder``.

    The destination is ``output_folder`` joined with the base name of
    ``csv_file``. When that file already exists its rows are loaded and
    ``df`` is concatenated after them (with a fresh index); otherwise ``df``
    alone is written. The index is never written to the file.
    """
    target = os.path.join(output_folder, os.path.basename(csv_file))
    combined = df
    if os.path.exists(target):
        previous = pd.read_csv(target)
        combined = pd.concat([previous, df], ignore_index=True)
    combined.to_csv(target, index=False)
def clean_temp_folder(temp_folder):
    """Delete ``temp_folder`` together with everything it contains.

    Uses ``shutil.rmtree``, which removes symbolic links themselves instead
    of descending into their targets. A hand-written walk based on
    ``os.path.isdir`` follows links to directories (deleting files outside
    the temporary tree) and leaves dangling links behind, which then makes
    the final directory removal fail.

    Args:
        temp_folder: Path of the directory to remove.

    Raises:
        FileNotFoundError: If ``temp_folder`` does not exist.
    """
    # Local import keeps this function self-contained; the module's import
    # block does not bring in shutil.
    import shutil

    shutil.rmtree(temp_folder)
# Input folder (scanned for zip archives) and output folder (receives merged CSVs).
zip_folder_path = '/home/philtell/data/'
output_folder_path = '/home/philtell/data/test'
# Run the merge; this executes as soon as the module is loaded.
merge_csv_files(zip_folder_path, output_folder_path)
# 功能增加:支持读取中文(GBK编码)的CSV内容,同时支持按指定列的取值筛选行
import os
import zipfile
import pandas as pd
def merge_csv_files(zip_folder, output_folder, encoding='gbk',
                    filter_column=6, filter_value="离线"):
    """Extract each zip archive, filter its CSV rows, and merge the result.

    Every file in ``zip_folder`` whose name ends with ``.zip`` is extracted
    into ``<output_folder>/temp``. Each extracted ``.csv`` file (at any depth
    inside the archive) is read with ``encoding``, reduced to the rows whose
    value in the positional column ``filter_column`` equals ``filter_value``,
    and handed to ``merge_csv``. The temporary folder is removed after each
    archive, even when reading, filtering, or merging fails.

    Args:
        zip_folder: Directory that is scanned (non-recursively) for archives.
        output_folder: Directory that receives the merged CSV files; it is
            created when missing.
        encoding: Text encoding used to read the extracted CSV files.
        filter_column: Zero-based position of the column to test
            (the default 6 is the seventh column).
        filter_value: Rows are kept only when the tested column equals this.

    Raises:
        zipfile.BadZipFile: If a ``.zip`` file is not a valid archive.
        IndexError: If a CSV has too few columns for ``filter_column``.
        UnicodeDecodeError: If a CSV cannot be decoded with ``encoding``.
    """
    os.makedirs(output_folder, exist_ok=True)
    temp_folder = os.path.join(output_folder, 'temp')

    for zip_file in os.listdir(zip_folder):
        if not zip_file.endswith('.zip'):
            continue
        zip_path = os.path.join(zip_folder, zip_file)
        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(temp_folder)
            # Walk the whole extracted tree: archives frequently wrap their
            # files in a top-level folder, which a flat listing would miss.
            for root, _dirs, files in os.walk(temp_folder):
                for csv_file in files:
                    if not csv_file.endswith('.csv'):
                        continue
                    csv_path = os.path.join(root, csv_file)
                    df = pd.read_csv(csv_path, encoding=encoding)
                    # Keep only the rows whose selected column matches.
                    df = df[df.iloc[:, filter_column] == filter_value]
                    merge_csv(df, csv_file, output_folder)
        finally:
            # Always drop the extracted files so a failure cannot leak stale
            # CSVs into the next archive (or the next run).
            if os.path.isdir(temp_folder):
                clean_temp_folder(temp_folder)
def merge_csv(df, csv_file, output_folder):
    """Append ``df`` to the UTF-8 CSV named after ``csv_file``.

    The destination is ``output_folder`` joined with the base name of
    ``csv_file``. A missing destination is created from ``df`` alone; an
    existing one is read back as UTF-8, ``df`` is concatenated after its rows
    (with a fresh index), and the file is rewritten. The index is never
    written to the file.
    """
    destination = os.path.join(output_folder, os.path.basename(csv_file))

    if not os.path.exists(destination):
        df.to_csv(destination, index=False, encoding='utf-8')
        return

    earlier_rows = pd.read_csv(destination, encoding='utf-8')
    stacked = pd.concat([earlier_rows, df], ignore_index=True)
    stacked.to_csv(destination, index=False, encoding='utf-8')
def clean_temp_folder(temp_folder):
    """Remove ``temp_folder`` and all files and sub-folders below it.

    Delegates to ``shutil.rmtree`` so that symbolic links inside the folder
    are unlinked rather than followed. A manual ``os.path.isdir`` recursion
    would descend through a link to a directory and delete files that live
    outside the temporary tree, and would leave dangling links in place so
    the final directory removal fails.

    Args:
        temp_folder: Path of the directory to remove.

    Raises:
        FileNotFoundError: If ``temp_folder`` does not exist.
    """
    # Imported here because the surrounding script's import block does not
    # include shutil.
    import shutil

    shutil.rmtree(temp_folder)
# Input folder (scanned for zip archives) and output folder (receives merged CSVs).
zip_folder_path = '/home/philtell/data/'
output_folder_path = '/home/philtell/data/test2'
# Run the merge; this executes as soon as the module is loaded.
merge_csv_files(zip_folder_path, output_folder_path)
# 将CSV文件从原有编码格式转换为UTF-8编码格式
import os
import pandas as pd
def convert_csv_files(input_directory, output_directory, output_encoding='utf-8',
                      input_encoding='utf-8'):
    """Re-encode every CSV file in ``input_directory`` into ``output_directory``.

    Each file whose name ends with ``.csv`` is read with ``input_encoding``
    and written, without the index, as ``<name>_utf8.csv`` using
    ``output_encoding``. ``<name>`` is the original file name minus the
    trailing ``.csv`` only, so dots elsewhere in the name are preserved
    (``a.b.csv`` becomes ``a.b_utf8.csv``). One line is printed per file
    written.

    Args:
        input_directory: Directory scanned (non-recursively) for CSV files.
        output_directory: Directory that receives the converted files; it is
            created when missing.
        output_encoding: Encoding used when writing the converted files.
        input_encoding: Encoding used when reading the source files. Pass the
            files' real encoding (for example ``'gbk'``) to convert them.

    Raises:
        UnicodeDecodeError: If a source file is not valid ``input_encoding``.
    """
    os.makedirs(output_directory, exist_ok=True)

    suffix = ".csv"
    for filename in os.listdir(input_directory):
        if not filename.endswith(suffix):
            continue
        input_filepath = os.path.join(input_directory, filename)
        # Strip only the trailing extension; splitting on the first dot would
        # truncate names such as "report.v2.csv" and cause collisions.
        stem = filename[:-len(suffix)]
        output_filepath = os.path.join(output_directory, f"{stem}_utf8.csv")

        df = pd.read_csv(input_filepath, encoding=input_encoding)
        df.to_csv(output_filepath, index=False, encoding=output_encoding)
        print(f"Converted and saved: {output_filepath}")
# Script entry point: convert the CSV files using the placeholder paths below.
if __name__ == "__main__":
    convert_csv_files(
        input_directory="/path/to/your/input/directory",
        output_directory="/path/to/your/output/directory",
    )