python提取中文词条

最近因为需要处理提示信息的国际化问题,编写了一个小工具扫描工程中的中文词条,其基本的思路如下:

(1)获取文件的编码

(2)以文件的编码打开文件(实际上就是统一转化为Unicode),并读取所有文件行

 (3)使用正则表达式提取所有的中文:字符类 [^\u4e00-\u9fa5] 匹配的是所有"非中文"字符,以它为分隔符切分每一行,剩下的片段即为中文;常用汉字的Unicode编码范围在\u4e00-\u9fa5之间

 (4)拼接所检测出的中文

其具体的实现如下:

# coding=utf-8

import re
import sys
import os
import argparse
import chardet
import io
# Python 2 only: force the implicit str <-> unicode conversion codec to UTF-8.
# sys.setdefaultencoding is not reachable on the initially imported module,
# hence the reload(sys). Python 3 has neither the reload builtin nor
# setdefaultencoding and already defaults to UTF-8, so the hack is skipped
# there instead of dying with a NameError.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding("utf8")


# One or more consecutive characters outside U+4E00..U+9FA5, i.e. everything
# that separates two runs of Chinese characters. Written as a plain u'' literal
# (not ur'') so that the module parses on both Python 2 and Python 3, and
# compiled once instead of on every call.
_NON_CHINESE_RE = re.compile(u'[^\u4e00-\u9fa5]+')


def translate(str, chr_set):
    """Return the Chinese fragments of one line, joined by commas.

    Args:
        str: The (unicode) text of a single line. The name shadows the
            builtin but is kept for compatibility with existing callers.
        chr_set: Unused; kept for compatibility with existing callers.

    Returns:
        Every maximal run of characters in the range U+4E00..U+9FA5, in
        order of appearance, joined with ",". An empty string when the line
        contains no such character.
    """
    # Splitting on the separators leaves the Chinese runs, plus empty strings
    # when the line starts or ends with a separator; those are dropped.
    fragments = _NON_CHINESE_RE.split(str)
    return u",".join(fragment for fragment in fragments if fragment)


def get_all_file(rawdir):
    """Recursively collect the paths of all files below a directory.

    Args:
        rawdir: Directory to walk.

    Returns:
        A list with the path (``rawdir`` joined with the relative location)
        of every non-directory entry found in ``rawdir`` and, recursively,
        in its subdirectories. Directories themselves are not listed.

    Raises:
        OSError: Propagated from ``os.listdir`` when a directory cannot be
            listed (for example when ``rawdir`` does not exist).
    """
    allfile = []
    # Sorted so the result does not depend on the arbitrary order in which
    # os.listdir returns directory entries.
    for entry in sorted(os.listdir(rawdir)):
        filepath = os.path.join(rawdir, entry)
        if os.path.isdir(filepath):
            # Keep what the recursive call found; discarding its return
            # value would silently drop every nested file.
            allfile.extend(get_all_file(filepath))
        else:
            allfile.append(filepath)
    return allfile

def deal_one_file(file_path,chr_set):
    """Print each line of a file that contains Chinese text and is not a comment.

    For every reported line, ``<file_path> <line number> <fragments>`` is
    written to standard output, where the line number is 1-based and the
    fragments are whatever ``translate`` returns for that line.

    A line is treated as a comment, and skipped, when (ignoring leading
    whitespace) it begins with ``//`` or with a ``/* ... */`` comment that
    is closed on the same line.

    Args:
        file_path: Path of the file to scan.
        chr_set: Encoding name used to decode the file; passed straight to
            ``io.open``.
    """
    # Compiled once per file rather than once per matching line.
    line_comment = re.compile(r'//.*')
    block_comment = re.compile(r'/\*.*?\*/')

    # The file is only read, so plain 'r' is used: 'r+' would needlessly
    # fail on files the user cannot write to.
    with io.open(file_path, 'r', encoding=chr_set) as source:
        for line_no, line in enumerate(source, 1):
            found = translate(line, chr_set)
            if not found:
                continue
            # match() only anchors at position 0, so the indentation has to
            # be removed first or indented comments would never be detected.
            code = line.lstrip()
            if line_comment.match(code) or block_comment.match(code):
                continue
            # The location prefix is written on its own so that, on Python 2,
            # a byte-string path is never implicitly decoded by being mixed
            # with the unicode fragments; the visible output is unchanged.
            sys.stdout.write("%s %d " % (file_path, line_no))
            print(found)

def get_encoding(file):
    """Guess the text encoding of a file from its raw bytes.

    Args:
        file: Path of the file to inspect.

    Returns:
        The ``'encoding'`` entry of the result of ``chardet.detect`` applied
        to the complete file content.
        NOTE(review): chardet presumably reports ``None`` here when it cannot
        make a guess (e.g. for an empty file) -- confirm against its docs.
    """
    with open(file, 'rb') as stream:
        raw_bytes = stream.read()
    detection = chardet.detect(raw_bytes)
    return detection['encoding']


def main():
    """Scan a directory below the current working directory for Chinese text.

    Command line:
        -d / --dir        Directory to scan, interpreted relative to the
                          current working directory. Required.
        -f / --full_path  Accepted but not used by this script.

    Every file below the directory whose name ends in ``.cpp``, ``.h`` or
    ``.xml`` is decoded with its detected encoding and handed to
    ``deal_one_file``. A missing ``-d`` argument or a directory that does
    not exist ends the program through ``parser.error`` (usage plus message
    on stderr, non-zero exit status) instead of continuing into a traceback.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--dir")
    # NOTE(review): --full_path is parsed but never read; its intended
    # meaning cannot be determined from this script.
    parser.add_argument("-f", "--full_path")
    args = parser.parse_args()

    if args.dir is None:
        parser.error("扫描目录参数未输入!")

    # The directory is resolved against the current working directory.
    # Leading separators are stripped so the historical spelling "-d /src"
    # (needed when the path was built by bare string concatenation) still
    # means "<cwd>/src", while "-d src" now works as well.
    scan_file_dir = os.path.join(os.getcwd(), args.dir.lstrip("/\\"))

    if not os.path.isdir(scan_file_dir):
        parser.error(" ".join(("文件夹:", scan_file_dir, "不存在!")))

    # Only C++ sources, headers and XML files are scanned.
    wanted_suffixes = ('.cpp', '.h', '.xml')
    for path in get_all_file(scan_file_dir):
        if path.endswith(wanted_suffixes):
            deal_one_file(path, get_encoding(path))


if __name__ == "__main__":
    main()

猜你喜欢

转载自blog.csdn.net/lyj22/article/details/106482202