将txt文本由utf-8转gbk

import codecs

# file = open(path, encoding='gbk', errors='ignore')
# print(file.readline())


path = "locator5.txt" # Input file path; the file must be UTF-8 encoded (on Windows, open it in a text editor and "Save As" UTF-8).
                    # On Linux the file can be converted to GBK directly:  [shenjiayu@cp01-shenjiayu: ~]$ iconv -f utf-8 -t gbk utf_file > gbk_file
path2 = "locator6.txt"  # Output path passed to UTF8_2_GBK2 at the bottom of the script.
path3 = "locator.json"  # Only referenced by the commented-out experiment at the bottom of the script.

def ReadFile(filePath,encoding="utf-8"):
    """Return the full decoded text of the file at ``filePath``.

    Args:
        filePath: Path of the file to read.
        encoding: Codec name used to decode the file (UTF-8 by default).

    Returns:
        The whole file content as a single string.
    """
    with codecs.open(filePath, mode="r", encoding=encoding) as stream:
        text = stream.read()
    return text

def ReadFile2(filePath,encoding="gbk"):
    """Return the full decoded text of ``filePath``, decoding as GBK by default.

    Args:
        filePath: Path of the file to read.
        encoding: Codec name used to decode the file.

    Returns:
        The whole file content as a single string.
    """
    with codecs.open(filePath, mode="r", encoding=encoding) as source:
        decoded = source.read()
    return decoded

def WriteFile(filePath,u,encoding="gbk"):
    """Write the string ``u`` to ``filePath``, replacing any existing content.

    Args:
        filePath: Path of the file to create or overwrite.
        u: Text to write.
        encoding: Codec name used to encode the text (GBK by default).
    """
    with codecs.open(filePath, mode="w", encoding=encoding) as sink:
        sink.write(u)

def UTF8_2_GBK(src,dst):
    """Convert the UTF-8 text file ``src`` into the GBK-encoded file ``dst``.

    The source is decoded with "utf-8-sig" rather than plain "utf-8": files
    saved as UTF-8 by some Windows editors start with a byte-order mark, and
    a plain UTF-8 decode would keep it as U+FEFF, which the GBK codec cannot
    encode. Input without a BOM decodes exactly as it would with "utf-8".

    Args:
        src: Path of the UTF-8 input file.
        dst: Path of the GBK output file (created or overwritten).

    Raises:
        UnicodeDecodeError: If ``src`` is not valid UTF-8.
        UnicodeEncodeError: If the text contains a character GBK cannot
            represent.
    """
    content = ReadFile(src, encoding="utf-8-sig")
    WriteFile(dst, content, encoding="gbk")

def UTF8_2_GBK2(src,dst):
    """Convert the UTF-8 text file ``src`` into the GB18030-encoded file ``dst``.

    The source is decoded with "utf-8-sig" rather than plain "utf-8" so that
    a leading byte-order mark (written by some Windows editors) is dropped
    instead of being carried into the output as a stray U+FEFF character.
    Input without a BOM decodes exactly as it would with "utf-8".

    Args:
        src: Path of the UTF-8 input file.
        dst: Path of the GB18030 output file (created or overwritten).

    Raises:
        UnicodeDecodeError: If ``src`` is not valid UTF-8.
    """
    content = ReadFile(src, encoding="utf-8-sig")
    WriteFile(dst, content, encoding="gb18030")




def UTF8_2_GBK3(src,dst):
    """Read ``src`` as GBK text and write it to ``dst`` as GBK.

    NOTE(review): despite the function name, the source is decoded as GBK,
    not UTF-8 -- confirm this is intended.

    Args:
        src: Path of the input file, decoded as GBK.
        dst: Path of the output file, encoded as GBK.
    """
    gbk_text = ReadFile(src, "gbk")
    WriteFile(dst, gbk_text, "gbk")



# Script entry: convert the UTF-8 file at `path` to GB18030 and write it to `path2`.
UTF8_2_GBK2(path,path2)
#
# Earlier experiment, left disabled:
# a = ReadFile2(path3)
# b = WriteFile(path2)
# UTF8_2_GBK2(a,b)




```

还有顺便补一个去除中文的

#随便记一个去除中文的
```python
import re
from zhon.hanzi import punctuation
from zhon.hanzi import characters

def lm_find_unchinese(file):
    """Return the text ``file`` with Chinese characters and punctuation removed.

    Args:
        file: The input text (a string, not a file object or path).

    Returns:
        The text with every character in U+4E00..U+9FA5 removed, followed by
        removal of every character listed in ``zhon.hanzi.punctuation``.
    """
    # Step 1: strip Han characters in the U+4E00..U+9FA5 range.
    hanzi_re = re.compile(r'[\u4e00-\u9fa5]')
    without_hanzi = hanzi_re.sub("", file)
    # Step 2: strip Chinese punctuation using a character class built from zhon.
    punct_class = '[{}]'.format(punctuation)
    return re.sub(punct_class, "", without_hanzi)

猜你喜欢

转载自blog.csdn.net/weixin_43134049/article/details/111597517