【python】英文文本分词词频统计

import re
import functools
import pandas as pd
import csv
import os, sys

# Build a flat word list from the input text.
# Every run of punctuation characters is replaced by a single space before
# the line is split into tokens.
r='[’!"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~]+'
word_list2 = []
# Iterating over the file object visits every line, including the first one.
# The earlier "readline() before the loop, readline() again at the top of the
# loop body" pattern silently dropped the first line and processed the empty
# end-of-file string as if it were a line.  The context manager also closes
# the file if an error occurs while processing.
with open('/Users/suyue/Downloads/all(1).txt') as txt:
    for readl in txt:
        # Splitting on a single space can leave empty strings and a trailing
        # newline on the last token of a line; whitespace is stripped from
        # each token when the frequencies are counted.
        word_list2.extend(re.sub(r, ' ', readl).split(' '))


# Count how often each word occurs, ignoring case.
tf = {}
for word in word_list2:
    # Lower-case the token and drop any whitespace left inside it
    # (for example the newline attached to the last token of a line).
    word = ''.join(word.lower().split())
    # Splitting on single spaces yields empty tokens wherever separators were
    # adjacent.  They are not words, so they must not get a row of their own.
    if not word:
        continue
    tf[word] = tf.get(word, 0) + 1
#print(tf) 


# Write the frequency table to an intermediate CSV file.
# The file is written to the same absolute path that the script reads back
# (and deletes) further down, so the round trip no longer depends on the
# script being launched from the desktop directory.
# newline='' is what the csv module requires so that no blank rows appear on
# platforms that translate line endings; UTF-8 matches the default encoding
# pandas.read_csv uses when loading the file again.
with open('/Users/suyue/desktop/tmp_table.csv', 'w', newline='', encoding='utf-8') as csvfile:
    spamwriter = csv.writer(csvfile)
    spamwriter.writerow(['col1', 'col2'])
    # One row per (word, count) pair.
    spamwriter.writerows(tf.items())



# Load the two tables that are going to be matched against each other.
tmp_table_path = '/Users/suyue/desktop/tmp_table.csv'

df1 = pd.read_csv(tmp_table_path)
print(df1)

df2 = pd.read_csv('/Users/suyue/desktop/table_structure.txt', sep=',')
print(df2)

# The intermediate CSV was only a stopgap; remove it now that it is loaded.
if os.path.exists(tmp_table_path):
    os.remove(tmp_table_path)

# Left-join the word counts onto the table structure twice:
# first matching on the table name, then on the column name.
newdf = df2.merge(df1, how='left', left_on='table_name', right_on='col1')
print(newdf)

newnewdf = newdf.merge(df1, how='left', left_on='column_name', right_on='col1')
print(newnewdf)

# Write the merged result.
# The output goes to the same absolute path that is cleared beforehand.
# Previously the stale file was removed from the desktop while the new file
# was written relative to the current working directory, so the two only
# referred to the same file when the script was started from the desktop.
result_path = "/Users/suyue/desktop/jieguo2.csv"
if os.path.exists(result_path):
    os.remove(result_path)
newnewdf.to_csv(result_path)






    

猜你喜欢

转载自blog.csdn.net/qq_41839921/article/details/83653126