最近因为需要处理提示信息的国际化问题,编写了一个小工具扫描工程中的中文词条,其基本的思路如下:
(1)获取文件的编码
(2)以文件的编码打开文件(实际上就是统一转化为Unicode),并读取所有文件行
(3)使用正则表达式 re.compile(ur'[^\u4e00-\u9fa5]') 匹配所有非中文字符,并以它们为分隔符切分每一行,切分后剩下的片段即为中文(常用汉字的 Unicode 编码范围在 \u4e00-\u9fa5 之间)
(4)拼接所检测出的中文
其具体的实现如下:
# coding=utf-8
import re
import sys
import os
import argparse
import chardet
import io
# Python 2 only: make UTF-8 the interpreter-wide default codec used for
# implicit str <-> unicode conversions. reload(sys) has to come first because
# sys.setdefaultencoding is removed from the sys module at interpreter startup.
reload(sys)
sys.setdefaultencoding("utf8")
# Matches runs of characters outside the CJK range U+4E00..U+9FA5. Compiled
# once at import time instead of on every call; the plain u'' literal is
# equivalent to the former ur'' one on Python 2 and also parses on Python 3.
_NON_CHINESE_RE = re.compile(u'[^\u4e00-\u9fa5]+')


def translate(str, chr_set):
    """Extract the Chinese fragments of one line of text.

    The line is split on every run of non-Chinese characters; the remaining
    non-empty pieces are the Chinese fragments, in their original order.

    Args:
        str: The (unicode) text of a single line.
        chr_set: Unused; kept so existing callers keep working.

    Returns:
        The Chinese fragments joined by ",", or an empty string when the line
        contains no character in the range U+4E00..U+9FA5.
    """
    fragments = [piece for piece in _NON_CHINESE_RE.split(str) if piece]
    return ",".join(fragments)
def get_all_file(rawdir):
    """Recursively collect the paths of all files below a directory.

    Args:
        rawdir: Path of the directory to scan.

    Returns:
        A list with the path of every non-directory entry found in ``rawdir``
        and in all of its subdirectories. Directories themselves are not
        included, so every returned path can be opened as a file.

    Raises:
        OSError: If ``rawdir`` (or a subdirectory) cannot be listed.
    """
    allfile = []
    for name in os.listdir(rawdir):
        filepath = os.path.join(rawdir, name)
        if os.path.isdir(filepath):
            # Keep what the recursive call found; its result used to be
            # discarded, so nested files never reached the caller.
            allfile.extend(get_all_file(filepath))
        else:
            allfile.append(filepath)
    return allfile
def deal_one_file(file_path,chr_set):
with io.open(file_path,'r+',encoding=chr_set) as file:
lines = file.readlines()
line_cur = 1
for line in lines:
ret = translate(line,chr_set)
if ret:
#是否为注释
note_type_one = re.compile('//.*')
note_type_two = re.compile('/\*.*?\*/')
note_one=note_type_one.match(line)
note_two=note_type_two.match(line)
if None != note_one or None != note_two:
continue
print file_path,line_cur,ret
line_cur = line_cur + 1
def get_encoding(file):
    """Guess the character encoding of a file with chardet.

    Args:
        file: Path of the file to inspect.

    Returns:
        The value stored under the 'encoding' key of the chardet detection
        result for the file's full binary content.
    """
    with open(file, 'rb') as stream:
        detection = chardet.detect(stream.read())
    return detection['encoding']
#获取当前目录
cur_path=os.getcwd()
parser = argparse.ArgumentParser()
parser.add_argument("-d", "--dir")
parser.add_argument("-f", "--full_path")
args = parser.parse_args()
scan_file_dir=""
if None == args.dir :
print "扫描目录参数未输入!"
scan_file_dir = cur_path + args.dir
if False == os.path.exists(scan_file_dir):
print "文件夹:",scan_file_dir,"不存在!"
#获取所有文件
all_file = get_all_file(scan_file_dir)
all_file_fit = []
for file in all_file:
if file.endswith('.cpp'):
all_file_fit.append(file)
continue
if file.endswith('.h'):
all_file_fit.append(file)
continue
if file.endswith('.xml'):
all_file_fit.append(file)
continue
for file in all_file_fit:
chr_set = get_encoding(file)
deal_one_file(file,chr_set)