# coding=utf-8
"""Print the contents of ``aaa.txt`` before and after removing Chinese characters."""
import re

# Conventional code-point range used to match Chinese characters
# (CJK Unified Ideographs, U+4E00 through U+9FA5).
CHINESE_CHAR_RE = re.compile(r'[\u4E00-\u9FA5]')


def remove_chinese(text):
    """Return *text* with every character in U+4E00..U+9FA5 removed.

    Characters outside that range (ASCII, punctuation, full-width symbols,
    whitespace, ...) are left untouched and keep their original order.
    """
    return CHINESE_CHAR_RE.sub("", text)


if __name__ == "__main__":
    # The input file is expected to be UTF-8 encoded text.
    with open('aaa.txt', 'r', encoding="utf-8") as f:
        data = f.read()
    print(data)
    data = remove_chinese(data)
    print(data)
# Filter out every character other than the Chinese ones
import re

# Python 3 regular expression for Chinese text. The conventional fixed form
# is the code-point range U+4E00..U+9FA5; "+" groups adjacent characters into
# a single match.
CHINESE_RUN_RE = re.compile(r'[\u4E00-\u9FA5]+')


def find_chinese_runs(text):
    """Return every maximal run of Chinese characters in *text*, in order.

    A run is one or more consecutive characters in U+4E00..U+9FA5. An empty
    list is returned when *text* contains no such character.
    """
    # A bare character-class search returns the same runs as wrapping the
    # group in lazy ".*?" on both sides, without rescanning the remainder of
    # the string from every start position when there is nothing to find.
    return CHINESE_RUN_RE.findall(text)


if __name__ == "__main__":
    text = " aqweded *** xsa *** *** 中国 韩国 日本 *** "
    AA = find_chinese_runs(text)
    if AA:
        print(AA)
# Extract the Chinese substrings from a string; returns a list
import re

# Characters stripped from the input: ASCII letters and digits, "!", "%",
# "[", "]", ",", the ideographic full stop and the space character.
# Written as a raw string so that only the brackets need escaping.
ASCII_NOISE_RE = re.compile(r"[A-Za-z0-9!%\[\],。 ]")


def remove_ascii_noise(text):
    """Return *text* without ASCII letters/digits and a few punctuation marks.

    Removed characters: ``A-Z``, ``a-z``, ``0-9``, ``!``, ``%``, ``[``,
    ``]``, ``,``, the ideographic full stop and the space. Everything else,
    including Chinese characters and any punctuation not listed, is kept.
    """
    return ASCII_NOISE_RE.sub("", text)


if __name__ == "__main__":
    # The input file is expected to be UTF-8 encoded text.
    with open('aaa.txt', 'r', encoding="utf-8") as f:
        data = f.read()
    print(data)
    data = remove_ascii_noise(data)
    print(data)
# - * - Coding: UTF-8 - * - Import Re # filtered out in addition to the Chinese character str = " the Hello, world !!% [545] 234 hello world ... " str = re.sub ( " [ Za-Z0-9-a \! \% \ [\] \, \.] " , " " , STR) Print (STR) # of the strings in the Chinese, returns an array pattern = " [\ u4e00- \ u9fa5 ] + " REGEX = the re.compile (pattern) Results = regex.findall ( " adf adf Chinese hair BOE " ) Print (Results)