修改数据集格式

修改数据集格式(注意:生成的文件中每行末尾会多出一个空格)

import random
def split_token_line(line):
    """Split one ``token<whitespace>tag`` line into ``(token, tag)``.

    The split happens at the first whitespace character. Everything before
    it is the token; everything after it, stripped, is the tag. A line that
    contains no whitespace at all (for example a last line with no trailing
    newline and no tag) yields the whole line as the token and an empty tag,
    so the result does not depend on whether the file ends with a newline.
    """
    for index, char in enumerate(line):
        if char.isspace():
            return line[:index], line[index:].strip()
    return line, ''


def write_sentence(out, items):
    """Write *items* to *out* as one line, each item followed by a space.

    The trailing space before the newline is kept on purpose so the output
    format stays identical to what this script has always produced.
    """
    out.write(''.join(item + ' ' for item in items) + '\n')


def convert_dataset(src_path, words_path, tags_path, mode='a'):
    """Convert a one-token-per-line file into parallel words/tags files.

    The input holds one ``token tag`` pair per line, with sentences separated
    by whitespace-only lines. For each sentence one line is written to
    *words_path* (the tokens) and one line to *tags_path* (the tags).

    Args:
        src_path: Path of the input file (read as UTF-8).
        words_path: Path of the output file receiving the tokens.
        tags_path: Path of the output file receiving the tags.
        mode: File mode for both outputs. Defaults to ``'a'`` (append), so
            running the conversion twice duplicates the output; pass ``'w'``
            to overwrite instead.

    Returns:
        The number of sentence lines written to each output file.
    """
    written = 0
    words = []
    tags = []
    with open(src_path, 'r', encoding='utf-8') as src, \
            open(words_path, mode, encoding='utf-8') as words_out, \
            open(tags_path, mode, encoding='utf-8') as tags_out:
        for line in src:
            if line.isspace():
                # A blank line terminates the current sentence.
                write_sentence(words_out, words)
                write_sentence(tags_out, tags)
                words = []
                tags = []
                written += 1
            else:
                word, tag = split_token_line(line)
                words.append(word)
                tags.append(tag)

        # The input may not end with a blank line; flush the last sentence
        # so it is not silently dropped.
        if words:
            write_sentence(words_out, words)
            write_sentence(tags_out, tags)
            written += 1
    return written


if __name__ == '__main__':
    convert_dataset(
        './data/weibo/dev/dev.txt',
        './data/weibo/dev/dev_words.txt',
        './data/weibo/dev/dev_tags.txt',
    )

原格式:
在这里插入图片描述

修改后格式:
在这里插入图片描述在这里插入图片描述

发布了50 篇原创文章 · 获赞 44 · 访问量 8896

猜你喜欢

转载自blog.csdn.net/tailonh/article/details/105536311
今日推荐