Python转码&解压&多进程

Python批量转换文件编码格式

Eclipse中看ANSI编码的文件有乱码，所以希望通过python将相关文件转换成utf-8编码。

源：https://www.cnblogs.com/tsbc/p/4450675.html

'''

遍历文件夹

如果文件扩展名是 .cpp 或 .h

如果原来的编码不是utf-8,将文件编码格式改成utf-8

'''

import os,sys

import chardet

def convert(filename, out_enc="UTF8"):
    """Re-encode ``filename`` in place unless it is already detected as UTF-8.

    The current encoding is guessed by ``chardet`` from the raw bytes of the
    file. When the guess is anything other than ``'utf-8'`` the content is
    decoded with that codec (undecodable bytes are dropped) and written back
    encoded as ``out_enc``.

    Args:
        filename: Path of the file to convert.
        out_enc: Codec name used to encode the rewritten content.

    Returns:
        None. An ``IOError`` while opening/reading/writing is reported by
        printing a message instead of being raised.
    """
    try:
        # ``with`` guarantees the handle is closed even if decoding fails.
        with open(filename, 'rb+') as fp:
            content = fp.read()
            # Detected encoding name as reported by chardet.
            coding = chardet.detect(content)['encoding']
            # chardet reports None when it cannot make a guess (for example
            # on empty input); there is nothing sensible to decode with then.
            if coding is None or coding == 'utf-8':
                return
            new_content = content.decode(coding, "ignore").encode(out_enc)
            fp.seek(0)
            fp.write(new_content)
            # The re-encoded payload may be shorter than the original; cut
            # off the stale tail that would otherwise remain in the file.
            fp.truncate()
    except IOError:
        print(" error")

def explore(dir):
    """Walk ``dir`` recursively and convert every ``.cpp`` / ``.h`` file.

    Args:
        dir: Root directory to traverse with ``os.walk``.

    Returns:
        None. Each matching file is passed to ``convert``.
    """
    for root, _dirs, files in os.walk(dir):
        for file in files:
            # Compare the real extension: a substring test would also pick up
            # names such as "page.html" or "main.cpp.orig" and rewrite them.
            if os.path.splitext(file)[1] in ('.cpp', '.h'):
                convert(os.path.join(root, file))

# Root directory that main() hands to explore().
fiePath = 'E:\\Code'


def main():
    """Run the conversion over the tree rooted at ``fiePath``."""
    root_dir = fiePath
    explore(root_dir)


if __name__ == '__main__':
    main()

Python解压

https://www.cnblogs.com/Oliva/p/8824040.html 多线程字典破解加密zip

https://www.cnblogs.com/fyqq0403/p/9710420.html 解压加密的zip

https://www.cnblogs.com/flyhigh1860/p/3884842.html 解压zip

Python多线程&多进程

https://www.cnblogs.com/yeayee/p/4952022.html 基础介绍

https://www.cnblogs.com/kellyseeme/p/5525017.html 锁的应用

https://www.cnblogs.com/znicy/p/6234522.html 通过多进程的方式解决了解压缩的性能问题

https://www.cnblogs.com/xybaby/p/6510941.html#undefined python性能优化，介绍了GIL导致多线程的问题

https://www.cnblogs.com/SuKiWX/p/8804974.html python GIL解释

用 Python 解压大量压缩文件（环境中约有 6000 个）时遇到瓶颈，解压过程非常慢。尝试改用多线程解压，处理时间不但没有减少，反而增加了。参考上述博客后，改用多进程解压，缩短了处理时间。

import zipfile

import tarfile

import gzip

import os

from time import ctime

from multiprocessing import Pool

from multiprocessing import cpu_count

# Directory listed by unzipDayFile() / untarDayFile() for first-level archives.
dayZipsPath = "."

# Extraction target of the first stage; unzip() reads its archives from here.
quarterZipsPath = "./tmp"

# Password (bytes) passed to zipfile's extractall().
zipPassWord = b"password"

# Extraction target of unzip().
mrFilePath = "./data"

def unzipDayFile(src_dir=None, dest_dir=None, password=None):
    """Extract every ``.zip`` archive located directly inside a directory.

    Args:
        src_dir: Directory scanned for archives. Defaults to ``dayZipsPath``.
        dest_dir: Directory the archives are extracted into. Defaults to
            ``quarterZipsPath``.
        password: Bytes password handed to ``extractall``. Defaults to
            ``zipPassWord``.

    Returns:
        None. The name of each processed archive is printed. The source
        archives are not deleted.
    """
    src_dir = dayZipsPath if src_dir is None else src_dir
    dest_dir = quarterZipsPath if dest_dir is None else dest_dir
    password = zipPassWord if password is None else password

    for file_name in os.listdir(src_dir):
        if os.path.splitext(file_name)[1] != '.zip':
            continue
        print(file_name)
        # os.listdir yields bare names; join with src_dir so the archive is
        # found even when src_dir is not the current working directory.
        archive_path = os.path.join(src_dir, file_name)
        # ``with`` closes the archive even if extraction raises.
        with zipfile.ZipFile(archive_path, 'r') as file_zip:
            file_zip.extractall(path=dest_dir, pwd=password)

def untarDayFile(src_dir=None, dest_dir=None):
    """Extract every ``.tar.gz`` archive located directly inside a directory.

    Args:
        src_dir: Directory scanned for archives. Defaults to ``dayZipsPath``.
        dest_dir: Directory the archives are extracted into. Defaults to
            ``quarterZipsPath``.

    Returns:
        None. The name of each processed archive is printed. The source
        archives are not deleted.
    """
    src_dir = dayZipsPath if src_dir is None else src_dir
    dest_dir = quarterZipsPath if dest_dir is None else dest_dir

    for file_name in os.listdir(src_dir):
        # Match the suffix only; a substring test would also try to open
        # side files such as "x.tar.gz.md5" as archives.
        if not file_name.endswith('.tar.gz'):
            continue
        print(file_name)
        # os.listdir yields bare names; join with src_dir so the archive is
        # found even when src_dir is not the current working directory.
        archive_path = os.path.join(src_dir, file_name)
        # ``with`` closes the archive even if extraction raises.
        with tarfile.open(archive_path) as file_tar:
            file_tar.extractall(path=dest_dir)

def unzip(zipsList, src_dir=None, dest_dir=None, password=None):
    """Extract the listed ``.zip`` archives and delete each one afterwards.

    Entries whose extension is not ``.zip`` are skipped.

    Args:
        zipsList: Iterable of archive file names (not full paths) relative to
            ``src_dir``.
        src_dir: Directory containing the archives. Defaults to
            ``quarterZipsPath``.
        dest_dir: Directory the archives are extracted into. Defaults to
            ``mrFilePath``.
        password: Bytes password handed to ``extractall``. Defaults to
            ``zipPassWord``.

    Returns:
        None.

    Raises:
        Whatever ``zipfile`` or ``os.remove`` raise; the failing archive is
        left on disk and the remaining entries are not processed.
    """
    src_dir = quarterZipsPath if src_dir is None else src_dir
    dest_dir = mrFilePath if dest_dir is None else dest_dir
    password = zipPassWord if password is None else password

    for file_name in zipsList:
        if os.path.splitext(file_name)[1] != '.zip':
            continue
        zipFileName = os.path.join(src_dir, file_name)
        # ``with`` closes the archive even if extraction raises.
        with zipfile.ZipFile(zipFileName, 'r') as file_zip:
            file_zip.extractall(path=dest_dir, pwd=password)
        # Delete only after the handle is closed and extraction succeeded.
        os.remove(zipFileName)

if __name__ == '__main__':
    # Two-stage extraction: unpack the first-level archives in this process,
    # then unpack the archives they produced in parallel worker processes.
    print('Begin:%s' % ctime())

    # Number of CPU cores; the work list is split into this many chunks.
    cpuNum = cpu_count()
    print(cpuNum)

    unzipDayFile()
    untarDayFile()

    # os.listdir raises if the directory is missing, which can happen when
    # the first stage extracted nothing; make sure it exists.
    os.makedirs(quarterZipsPath, exist_ok=True)
    quarterZipsList = os.listdir(quarterZipsPath)
    zipFileNum = len(quarterZipsList)
    print("total zip files num:%d" % (zipFileNum))

    print("begin unzip:%s" % ctime())
    p = Pool()
    pending = []
    for i in range(cpuNum):
        # Integer arithmetic gives contiguous, non-overlapping slices that
        # together cover the whole list.
        beginPos = i * zipFileNum // cpuNum
        endPos = (i + 1) * zipFileNum // cpuNum
        print("proc %d - %d" % (beginPos, endPos))
        result = p.apply_async(unzip, args=(quarterZipsList[beginPos:endPos],))
        pending.append((beginPos, endPos, result))

    print("waiting for unzip quarter mr data ...")
    p.close()
    p.join()

    # An exception raised inside a worker only surfaces through get();
    # report it here instead of letting the failure pass unnoticed.
    for beginPos, endPos, result in pending:
        try:
            result.get()
        except Exception as exc:
            print("unzip failed for files %d - %d: %r" % (beginPos, endPos, exc))

    print("end unzip:%s" % ctime())
    print("End:%s" % ctime())