corpus处理---

实现的效果如上所示:

(1)总共几千个文件,遍历文件的关键代码:

1 files = os.listdir(src_path)  # names of every entry in the source folder
2 for file in files:  # walk over each entry
3     if not os.path.isdir(file):  # only open entries that are not directories; NOTE(review): `file` is a bare name, so this check is relative to the current working directory, not src_path
4         f = open(src_path + "/" + file)  # open the source file
5         target_file = open(target_path + file, 'w')
6         iter_f = iter(f)  # iterator over the lines of the source file
7         for line in iter_f:
8             line = line.rstrip('\n')
View Code

(2)正则表达式:

1  target_file.write(re.sub(u'\\(.*?\\)|\\{.*?}|\\[.*?]','',line))  # write the line with every (...), {...} and [...] span (non-greedy match) removed

(3) 拆分,排序的问题

 1 import itertools
 2 import  os
 3 
 4 def split_str(count):
 5     temp_str = ''
 6     str_param = list()
 7     for num in range(len(count)-1,-1,-1):
 8         s = count[num]
 9         if s != '':
10             temp=s.split('-')
11             if temp[0] == '' or temp[1] == '':# 去掉没有对应的
12                count.remove(s)
13             else:
14                 left,right = deal_Str(s)
15                 temp =[]
16                 temp = list(itertools.product(left,right))
17                 temp = list(temp)
18                 for i in range(0,len(temp)):
19                     for j in range(0,1):
20                         str_param.append(str(temp[i][j])+'-'+str(temp[i][j+1]))
21     return str_param
22 
def deal_Str(Str):
    """Split an alignment token into its two lists of integer indices.

    Example: ``'7,8,9-4,5,6,7'`` becomes ``([7, 8, 9], [4, 5, 6, 7])``.
    Only the first two hyphen-separated parts are used.

    Args:
        Str: token of the form ``'<ints>-<ints>'`` with comma-separated ints.

    Returns:
        A ``(left, right)`` tuple of integer lists.
    """
    pieces = Str.split('-')
    # Pick both sides before converting anything, so a missing hyphen is
    # reported before any integer parsing is attempted.
    left_text, right_text = pieces[0], pieces[1]
    left = list(map(int, left_text.split(',')))
    right = list(map(int, right_text.split(',')))
    return left, right
30 
def sort_str(str_list):
    """Sort ``'left-right'`` pair strings in place by their integer left index.

    The sort is stable: pairs that share a left index keep their relative
    order, as they did with the adjacent-swap sort this replaces.

    Args:
        str_list: list of strings whose text before the first ``'-'`` is an
            integer, e.g. ``['3-1', '1-2']``.

    Returns:
        The same list object, now sorted.

    Raises:
        ValueError: if the text before the first ``'-'`` of an entry is not
            an integer.
    """
    # list.sort is stable and parses each key only once, instead of on every
    # comparison of a quadratic bubble sort.
    str_list.sort(key=lambda pair: int(pair.split('-')[0]))
    return str_list
43 
44 
# Rewrite every alignment file in src_path into target_path: each line's
# tokens are expanded into single index pairs and sorted by left index;
# lines that read exactly 'rejected' are copied through unchanged.
src_path = 'D:\\wordalign\\WA\\ctb_aligned_chuli'
target_path = 'D:\\wordalign\\WA\\ctb_aligned_last_chuli\\'
files = os.listdir(src_path)  # names of every entry in the source folder
for file in files:
    src_file = os.path.join(src_path, file)
    # listdir returns bare names, so the directory test must be made on the
    # joined path rather than relative to the current working directory.
    if os.path.isdir(src_file):
        continue
    # The with-statement closes both handles for every file, not only the
    # last one, and also when an error interrupts the processing.
    with open(src_file) as f, open(os.path.join(target_path, file), 'w') as target_file:
        for line in f:
            line = line.rstrip('\n')
            if line != 'rejected':
                count = line.split(' ')
                arr = sort_str(split_str(count))
                # Each pair is followed by a single space, as before.
                target_file.write(''.join(st + ' ' for st in arr) + '\n')
            else:
                target_file.write(line + '\n')
View Code

猜你喜欢

转载自www.cnblogs.com/Shaylin/p/9925937.html