修改数据集格式,但是每行最后有空格
import random
def _split_token_tag(line):
    """Split one annotated line into a ``(token, tag)`` pair.

    The line is cut at its first whitespace character; both halves are
    stripped of surrounding whitespace. A line that starts with whitespace
    therefore yields an empty token, and a line with nothing after the
    token yields an empty tag.
    """
    for pos, char in enumerate(line):
        if char.isspace():
            return line[:pos].strip(), line[pos:].strip()
    # No separator at all (only possible for a last line without a newline):
    # keep the token with an empty tag, exactly as "token\n" would produce.
    return line, ''


def _format_sentence(items):
    """Render one sentence as a single output line.

    Every item is followed by one space, so the line ends with a trailing
    space before the newline. That format is kept on purpose so files stay
    compatible with output produced by earlier runs.
    """
    return ''.join(item + ' ' for item in items) + '\n'


def convert_dataset(src_path, words_path, tags_path):
    """Convert a one-token-per-line file into parallel words/tags files.

    The source is read as UTF-8. Each non-blank line contributes one token
    and one tag; a whitespace-only line ends the current sentence. Each
    sentence becomes one line in ``words_path`` and the matching line in
    ``tags_path``.

    Args:
        src_path: Path of the annotated source file.
        words_path: Path of the file receiving one line of tokens per sentence.
        tags_path: Path of the file receiving one line of tags per sentence.

    Returns:
        The number of sentences written.

    Notes:
        * A sentence still pending at end of file is written even when the
          source does not end with a blank line.
        * Sentences without any token (leading or repeated blank lines) are
          skipped, so no empty lines are emitted.
        * Output files are opened in append mode: running the conversion
          twice appends the sentences twice.
    """
    sentences = []
    words, tags = [], []

    with open(src_path, 'r', encoding='utf-8') as src:
        for line in src:
            if line.isspace():
                # Sentence boundary: store the sentence only if it has content.
                if words:
                    sentences.append((words, tags))
                    words, tags = [], []
                continue
            token, tag = _split_token_tag(line)
            words.append(token)
            tags.append(tag)

    # The source may not end with a blank line; do not lose the last sentence.
    if words:
        sentences.append((words, tags))

    # Open each output once instead of once per sentence.
    with open(words_path, 'a', encoding='utf-8') as words_out, \
            open(tags_path, 'a', encoding='utf-8') as tags_out:
        for sentence_words, sentence_tags in sentences:
            words_out.write(_format_sentence(sentence_words))
            tags_out.write(_format_sentence(sentence_tags))

    return len(sentences)


if __name__ == '__main__':
    count = convert_dataset(
        './data/weibo/dev/dev.txt',
        './data/weibo/dev/dev_words.txt',
        './data/weibo/dev/dev_tags.txt',
    )
    print(f'{count} sentences written')
原格式:每行为“词 标签”(以空白字符分隔),句子之间用空行隔开。
修改后格式:每句一行,词写入 dev_words.txt、标签写入 dev_tags.txt,各项之间用空格分隔。