NLP (X) to identify topics

  • Topic identification
    is the process of finding the topics present in a set of input texts.
    This example uses the LDA algorithm, i.e. Latent Dirichlet Allocation.
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from gensim import corpora,models
import feedparser

class IdentifyingTopicExample:
    """Fetch a few RSS summaries, clean them, and print LDA topics.

    The steps hand data to each other through two instance attributes:
    ``documents`` (raw summary strings) and ``cleaned`` (one list of
    tokens per document).
    """

    def __init__(self):
        # Start from empty state so every step can be called on its own
        # without raising AttributeError.
        self.documents = []
        self.cleaned = []

    def getDocuments(self):
        """Parse the RSS feed and store entry summaries in ``self.documents``.

        Only the first five feed entries are considered. Each kept summary
        is printed as it is collected.
        """
        url = 'https://sports.yahoo.com/mlb/rss.xml'
        feed = feedparser.parse(url)
        self.documents = []
        for entry in feed['entries'][:5]:
            # Entries without a (non-empty) summary are skipped instead of
            # raising KeyError.
            text = entry.get('summary')
            if not text:
                continue
            # NOTE(review): summaries containing the substring 'ex' are
            # dropped; the reason is not evident from this file -- confirm
            # the intent before changing it.
            if 'ex' in text:
                continue
            self.documents.append(text)
            print('-- {}'.format(text))
        print('INFO: Fetching documents from {} completed'.format(url))

    def cleanDocuments(self):
        """Lowercase, tokenize and stop-word-filter ``self.documents``.

        The result is stored in ``self.cleaned`` as a list of token lists,
        one per input document.
        """
        tokenizer = RegexpTokenizer(r'[a-zA-Z]+')  # keep runs of ASCII letters only
        en_stop = set(stopwords.words('english'))  # English stop words
        self.cleaned = []
        for doc in self.documents:
            words = tokenizer.tokenize(doc.lower())
            self.cleaned.append([word for word in words if word not in en_stop])
        print('INFO: Clearning {} documents completed'.format(len(self.documents)))

    def doLDA(self):
        """Fit a two-topic LDA model on ``self.cleaned`` and print the topics.

        When there are no tokens at all (nothing was fetched, or every
        document was reduced to nothing by cleaning), the model is not
        built and a warning is printed instead.
        """
        # Guard first: without any term there is nothing to fit a model on.
        if not any(self.cleaned):
            print('WARN: No terms available, skipping LDA')
            return
        dictionary = corpora.Dictionary(self.cleaned)  # token <-> id mapping
        # Bag-of-words representation of every cleaned document.
        corpus = [dictionary.doc2bow(tokens) for tokens in self.cleaned]
        # Two topics; id2word lets the model report words instead of ids.
        ldamodel = models.ldamodel.LdaModel(corpus, num_topics=2, id2word=dictionary)
        # Show both topics with their four highest-weighted words.
        print(ldamodel.print_topics(num_topics=2, num_words=4))

    def run(self):
        """Run the full pipeline: fetch, clean, then model."""
        self.getDocuments()
        self.cleanDocuments()
        self.doLDA()

if __name__ == "__main__":
    # Script entry point: build the example and run all of its steps.
    IdentifyingTopicExample().run()

Output:

-- MLB Network documentary shines spotlight on 1995 Mariners team that saved baseball in Seattle.
-- Marcus Semien's second big swing of the day finally gave the Oakland Athletics some breathing room in an oh-so-tight series with the AL Central-leading Twins.  Semien hit a grand slam in the eighth inning after his tying homer leading off the fifth, Chris Herrmann had a career-high four hits, and
-- It wasn't long until Cleveland took advantage of it.  Francisco Lindor drove in the go-ahead runs during a six-run seventh inning, Jose Ramirez homered twice and Carlos Santana pushed his on-base streak to 27 games as the Indians rallied to beat bumbling Kansas City 8-4 on Thursday and complete a
-- A look at what's happening around the majors Friday:
INFO: Fetching documents from https://sports.yahoo.com/mlb/rss.xml completed
INFO: Clearning 4 documents completed
[(0, '0.022*"look" + 0.022*"friday" + 0.022*"around" + 0.022*"majors"'), (1, '0.023*"leading" + 0.023*"semien" + 0.022*"inning" + 0.014*"homer"')]

Guess you like

Origin www.cnblogs.com/peng8098/p/nlp_10.html