Word Segmentation and Keyword Extraction

import numpy as np
import pandas as pd
import jieba
# Read the file (path elided in the original post)
news_all = pd.read_excel(r"", names=["title", "url", "kind"])
news_all = news_all.dropna()
# Take the title column and convert it to a list
title_all = news_all.title.values.tolist()
# Create a list to hold the segmented titles
cut_word_list = []
# Segment each news title with jieba
for one in title_all:
    cut_word = jieba.lcut(one)
    if len(cut_word) > 1 and one != "\r\n":  # skip empty / newline-only titles
        cut_word_list.append(cut_word)
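For reference, jieba.lcut returns a plain Python list of tokens in jieba's default accurate mode:

# Quick sanity check of the default (accurate) segmentation mode
print(jieba.lcut("我来到北京清华大学"))
# ['我', '来到', '北京', '清华大学']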
# Read the stop word list (path elided in the original post)
stop_word = pd.read_csv(r"", sep="\t", quoting=3, names=["stopwords"], encoding="utf-8")
#stop_word.head(30)
# Remove stop words: returns the cleaned titles and a flat word pool
def drop_stopwords(cut_word_list, stopwords):
    title_clean = []
    word_cloud = []
    for word_list in cut_word_list:
        line_clean = []
        for word in word_list:
            if word in stopwords:
                continue
            line_clean.append(word)
            word_cloud.append(str(word))
        title_clean.append(line_clean)  # append the whole cleaned title, not the last word
    return title_clean, word_cloud
stopwords = stop_word.stopwords.values.tolist()
title_clean, all_words = drop_stopwords(cut_word_list, stopwords)
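A quick sanity check of drop_stopwords on toy data (the two sample titles and the one-word stop list below are made up for illustration):

# Toy data: two segmented titles and a minimal stop word list
sample_titles = [["今天", "的", "新闻"], ["明天", "的", "天气"]]
sample_stopwords = ["的"]
cleaned, pool = drop_stopwords(sample_titles, sample_stopwords)
# cleaned -> [['今天', '新闻'], ['明天', '天气']]
# pool    -> ['今天', '新闻', '明天', '天气']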
# Present the cleaned titles as a DataFrame
df_title = pd.DataFrame({"title_clean": title_clean})
# Print the first five titles
title_clean[:5]

# Present the cleaned word pool as a DataFrame (a new name, so df_title is not overwritten)
df_word = pd.DataFrame({"word_cloud": all_words})
# Print the first 30 words
df_word[:30]
# Count the frequency of every word; groupby().size() replaces the
# deprecated agg({"count": np.size}) dict form used in older pandas
words_count = df_word.groupby(by=["word_cloud"]).size().reset_index(name="count")
words_count = words_count.sort_values(by=["count"], ascending=False)
words_count.head()
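The post's title also promises keyword extraction, which the frequency table above only approximates. A minimal sketch using jieba's built-in TF-IDF extractor (jieba.analyse.extract_tags) applied to the raw titles; topK=20 is an arbitrary choice, not something from the original post:

import jieba.analyse

# TF-IDF keyword extraction over all raw titles joined into one string;
# withWeight=True returns (word, weight) pairs instead of bare words
text = " ".join(title_all)
keywords = jieba.analyse.extract_tags(text, topK=20, withWeight=True)
for word, weight in keywords:
    print(word, round(weight, 4))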


Reposted from blog.csdn.net/chengjintao1121/article/details/84806827