TF-IDF提取关键词

  今天完成了提取字符串关键词的这一部分,代码如下:

 1 # -*- coding: gbk -*-
 2 import jieba.analyse
 3 import pymysql
 4 
 5 
 6 # 打开连接
 7 def open_conn(dbname):
 8     db = pymysql.connect(
 9         host="localhost",
10         port=3306,
11         user="root",
12         passwd="******",
13         db=dbname,
14         charset="utf8")
15     return db
16 
# Scan the table and fill in missing keywords
def query(db):
    """Generate keywords for the rows of ``summary_table`` that lack them.

    Selects ``ID``, ``key_words`` and ``result_summary`` from every row.
    For each row whose ``key_words`` is ``None`` and whose
    ``result_summary`` is longer than 10 characters, keywords are
    extracted with ``get_keyword_by_TFIDF``, written back through
    ``update`` and printed together with the row ID.

    Args:
        db: An open database connection providing ``cursor()``.
    """
    # Fetch everything first so the read cursor is closed before the
    # per-row updates start.
    with db.cursor() as cursor:
        cursor.execute("select ID,key_words,result_summary from summary_table")
        rows = cursor.fetchall()

    for row in rows:
        ID = row[0]
        key_words = row[1]
        result_summary = row[2]
        if key_words is not None:
            continue
        # A missing summary arrives as None; len(None) would raise
        # TypeError, so such rows are skipped like too-short summaries.
        if result_summary is None or len(result_summary) <= 10:
            continue
        new_key_word = get_keyword_by_TFIDF(result_summary)
        update(db, ID, new_key_word)
        print(ID, new_key_word)
31 
# Update a row; parameters: (db, ID, keyword)
def update(db, ID, keyword):
    """Store ``keyword`` in the ``key_words`` column of one row.

    Args:
        db: An open database connection providing ``cursor()`` and
            ``commit()``.
        ID: Value of the ``ID`` column identifying the row to change.
        keyword: New value for the ``key_words`` column.
    """
    sql = " update summary_table set key_words = %s where ID = %s"
    # The context manager closes the cursor even if execute() raises.
    with db.cursor() as cursor:
        # Values are passed separately so the driver escapes them.
        cursor.execute(sql, (keyword, ID))
    db.commit()
38 
## Part-of-speech tags: n = noun, v = verb, ns = place name,
## vn = verbal noun, nt = organization
# Keyword extraction with the TF-IDF algorithm
def get_keyword_by_TFIDF(result_summary):
    """Return up to five TF-IDF keywords of ``result_summary``.

    Only words tagged with one of the part-of-speech tags listed above
    are considered. The keywords are joined into one string separated
    by single spaces.
    """
    pos_tags = ["ns", "n", "vn", "v", "nt"]
    tags = jieba.analyse.extract_tags(
        result_summary,
        topK=5,
        withWeight=False,
        allowPOS=pos_tags,
    )
    return " ".join(tags)
# Keyword extraction with the TextRank algorithm
def get_keyword_by_TextRank(result_summary):
    """Return up to five TextRank keywords of ``result_summary``.

    Only words tagged ns, n, vn, v or nt are considered. The keywords
    are joined into one string separated by single spaces.
    """
    pos_tags = ("ns", "n", "vn", "v", "nt")
    ranked = jieba.analyse.textrank(
        result_summary,
        topK=5,
        withWeight=False,
        allowPOS=pos_tags,
    )
    return " ".join(ranked)
48 
49 
if __name__ == '__main__':
    # Script entry point: process the "datax" database, and always
    # release the connection, even when query() raises.
    conn = open_conn("datax")
    try:
        query(conn)
    finally:
        conn.close()
View Code

猜你喜欢

转载自www.cnblogs.com/liyuchao/p/12488974.html