# Simulate a business scenario: classify movie reviews with the trained model.
reviews = [
    'It is an amazing movie.',
    'This is a dull movie. I would never recommend it to anyone.',
    'The cinematography is pretty great in this movie.',
    'The direction was terrible and the story was all over the place.']
sents, probs = [], []
for review in reviews:
    sample = {}
    words = review.split()
    for word in words:
        sample[word] = True
    pcls = model.classify(sample)
    print(review, '->', pcls)
# Output:
# 0.735
# It is an amazing movie. -> POSITIVE
# This is a dull movie. I would never recommend it to anyone. -> NEGATIVE
# The cinematography is pretty great in this movie. -> POSITIVE
# The direction was terrible and the story was all over the place. -> NEGATIVE
'''
Topic extraction: after tokenization, word cleaning and stemming, an
algorithm such as TF-IDF or LDA can extract the core words of a text in
order to determine its topic. This is unsupervised learning; the gensim
module provides common topic-extraction tools.

Relevant API:

import gensim.models.ldamodel as gm
import gensim.corpora as gc
# Store the words appearing in lines_tokens in a gc.Dictionary object,
# which assigns an integer id to each word.
line_tokens = ['hello', 'world', ...]
dic = gc.Dictionary(line_tokens)
# Build a bag-of-words representation from the dictionary
bow = dic.doc2bow(line_tokens)
# Build the LDA model
# bow: bag-of-words corpus
# num_topics: number of topics
# id2word: dictionary mapping ids to words
# passes: number of training passes over the corpus
model = gm.LdaModel(bow, num_topics=n_topics, id2word=dic, passes=25)
# Output, for each topic, the 4 words contributing most to that topic
topics = model.print_topics(num_topics=n_topics, num_words=4)
'''
# Topic extraction with LDA: tokenize each document, drop stopwords and
# punctuation, stem the remaining tokens, then fit a gensim LDA model and
# print the top words of each discovered topic.
import nltk.tokenize as tk
import nltk.corpus as nc
import nltk.stem.snowball as sb
import gensim.models.ldamodel as gm
import gensim.corpora as gc

# Read the corpus: one document per line; strip the trailing newline.
doc = []
with open('../ml_data/topic.txt', 'r') as f:
    for line in f.readlines():
        doc.append(line[:-1])

tokenizer = tk.WordPunctTokenizer()
stopwords = nc.stopwords.words('english')
signs = [',', '.', '!']
stemmer = sb.SnowballStemmer('english')

# Tokenize, filter and stem each document into a list of token lists.
lines_tokens = []
for line in doc:
    tokens = tokenizer.tokenize(line.lower())
    line_tokens = []
    for token in tokens:
        # Keep only meaningful words: skip stopwords and punctuation.
        if token not in stopwords and token not in signs:
            token = stemmer.stem(token)
            line_tokens.append(token)
    lines_tokens.append(line_tokens)

# Build the dictionary (word -> integer id) over all observed tokens.
dic = gc.Dictionary(lines_tokens)

# Convert each document into its bag-of-words representation.
bow = []
for line_tokens in lines_tokens:
    row = dic.doc2bow(line_tokens)
    bow.append(row)

n_topics = 2
# Fit the LDA model from the bag-of-words corpus, the number of topics,
# the id2word dictionary, and 25 training passes over the corpus.
model = gm.LdaModel(bow, num_topics=n_topics, id2word=dic, passes=25)
# For each topic, print the 4 words that contribute most to it.
topics = model.print_topics(num_topics=n_topics, num_words=4)
print(topics)