NLP（十）主题识别

主题识别
是发现输入文本集合中存在的主题的过程
本文使用 LDA（Latent Dirichlet Allocation）算法，即隐含狄利克雷分配算法

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from gensim import corpora,models
import feedparser

class IdentifyingTopicExample:
    """Fetch a few RSS entry summaries, clean them, and print LDA topics.

    The pipeline is ``getDocuments`` -> ``cleanDocuments`` -> ``doLDA``;
    ``run`` executes the three steps in that order. Each step stores its
    result on the instance (``self.documents``, ``self.cleaned``) for the
    next step to read.
    """

    # Defaults for the pipeline. Override them on a subclass or an instance
    # to point the example at another feed or change the model size.
    FEED_URL = 'https://sports.yahoo.com/mlb/rss.xml'
    MAX_DOCUMENTS = 5   # how many leading feed entries are inspected
    NUM_TOPICS = 2      # number of LDA topics to fit and print
    NUM_WORDS = 4       # number of words printed per topic

    def getDocuments(self) -> None:
        """Download the feed and store entry summaries in ``self.documents``.

        Only the first ``MAX_DOCUMENTS`` entries are inspected. Entries with
        a missing or empty summary are skipped, as are summaries containing
        the substring ``'ex'``. Each kept summary is echoed to stdout.
        """
        url = self.FEED_URL
        feed = feedparser.parse(url)
        self.documents = []
        for entry in feed['entries'][:self.MAX_DOCUMENTS]:
            # Not every feed entry carries a summary; indexing the key
            # directly would raise KeyError for those entries.
            text = entry.get('summary')
            if not text:
                continue
            # NOTE(review): this drops any summary containing the substring
            # 'ex'; the reason for the filter is not visible here - confirm.
            if 'ex' in text:
                continue
            self.documents.append(text)
            print('-- {}'.format(text))
        print('INFO: Fetching documents from {} completed'.format(url))

    def cleanDocuments(self) -> None:
        """Tokenize ``self.documents`` and store the result in ``self.cleaned``.

        Each document is lower-cased, split into runs of ASCII letters, and
        stripped of English stop words. ``self.cleaned`` ends up as a list of
        token lists, one per document. The NLTK English stop-word list must
        be available to NLTK.
        """
        tokenizer = RegexpTokenizer(r'[a-zA-Z]+')  # keep alphabetic runs only
        en_stop = set(stopwords.words('english'))  # set gives O(1) membership
        self.cleaned = []
        for doc in self.documents:
            words = tokenizer.tokenize(doc.lower())
            self.cleaned.append([w for w in words if w not in en_stop])
        print('INFO: Clearning {} documents completed'.format(len(self.documents)))

    def doLDA(self) -> None:
        """Fit an LDA model on ``self.cleaned`` and print its topics.

        When no document contains any token (for example the feed was empty
        or unreachable), a message is printed and the method returns without
        building a model.
        """
        # gensim's LdaModel raises ValueError on an empty vocabulary, so
        # report and bail out before constructing the dictionary.
        if not any(self.cleaned):
            print('INFO: No tokens available; skipping LDA')
            return

        dictionary = corpora.Dictionary(self.cleaned)  # token <-> id mapping
        # Bag-of-words representation of every cleaned document.
        corpus = [dictionary.doc2bow(tokens) for tokens in self.cleaned]
        # id2word lets the model report topics as words instead of ids.
        ldamodel = models.ldamodel.LdaModel(
            corpus, num_topics=self.NUM_TOPICS, id2word=dictionary)
        print(ldamodel.print_topics(
            num_topics=self.NUM_TOPICS, num_words=self.NUM_WORDS))

    def run(self) -> None:
        """Run the whole pipeline: fetch, clean, then model."""
        self.getDocuments()
        self.cleanDocuments()
        self.doLDA()

if __name__ == "__main__":
    # Script entry point: build the example and run the full pipeline.
    IdentifyingTopicExample().run()

输出：

-- MLB Network documentary shines spotlight on 1995 Mariners team that saved baseball in Seattle.
-- Marcus Semien's second big swing of the day finally gave the Oakland Athletics some breathing room in an oh-so-tight series with the AL Central-leading Twins.  Semien hit a grand slam in the eighth inning after his tying homer leading off the fifth, Chris Herrmann had a career-high four hits, and
-- It wasn't long until Cleveland took advantage of it.  Francisco Lindor drove in the go-ahead runs during a six-run seventh inning, Jose Ramirez homered twice and Carlos Santana pushed his on-base streak to 27 games as the Indians rallied to beat bumbling Kansas City 8-4 on Thursday and complete a
-- A look at what's happening around the majors Friday:
INFO: Fetching documents from https://sports.yahoo.com/mlb/rss.xml completed
INFO: Clearning 4 documents completed
[(0, '0.022*"look" + 0.022*"friday" + 0.022*"around" + 0.022*"majors"'), (1, '0.023*"leading" + 0.023*"semien" + 0.022*"inning" + 0.014*"homer"')]

NLP（十） 主题识别

猜你喜欢

NLP（十）主题识别