#!/usr/bin/python
# -*- coding:utf-8 -*-
from gensim import corpora, models, similarities
from pprint import pprint
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
if __name__ == '__main__':
    # Demo script: read one document per line from a text file, build a
    # TF-IDF weighted bag-of-words corpus, then fit and print
    #   1. a 2-topic LSI model plus pairwise document similarities, and
    #   2. a 2-topic LDA model with per-document and per-topic distributions.

    stop_list = set('for a of the and to in'.split())

    # Lower-case, whitespace-tokenize and drop stop words, one token list per
    # line. The context manager guarantees the file handle is closed as soon
    # as all lines have been consumed (the original never closed it).
    with open('22.LDA_test.txt') as f:
        texts = [
            [word for word in line.strip().lower().split() if word not in stop_list]
            for line in f
        ]
    print('Text = ')
    pprint(texts)

    # Token <-> integer id mapping built from the tokenized documents.
    dictionary = corpora.Dictionary(texts)
    V = len(dictionary)  # vocabulary size; informational only, not used below

    # Sparse bag-of-words vector per document, then re-weighted with TF-IDF.
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpus_tfidf = models.TfidfModel(corpus)[corpus]
    for c in corpus_tfidf:
        print(c)

    # ---- LSI ------------------------------------------------------------
    print('\nLSI Model:')
    lsi = models.LsiModel(corpus_tfidf, num_topics=2, id2word=dictionary)
    # Projection of every document onto the 2 LSI topics.
    topic_result = list(lsi[corpus_tfidf])
    pprint(topic_result)
    print('LSI Topics:')
    pprint(lsi.print_topics(num_topics=2, num_words=5))

    # Similarity index over the documents in LSI space; iterating it yields,
    # for each indexed document, its similarity to every indexed document.
    # (similarities.Similarity() is the alternative index class.)
    similarity = similarities.MatrixSimilarity(lsi[corpus_tfidf])
    print('Similarity:')
    pprint(list(similarity))

    # ---- LDA ------------------------------------------------------------
    print('\nLDA Model:')
    num_topics = 2
    # NOTE(review): the model is trained on TF-IDF weights rather than raw
    # term counts; kept exactly as in the original script.
    lda = models.LdaModel(corpus_tfidf, num_topics=num_topics, id2word=dictionary,
                          alpha='auto', eta='auto', minimum_probability=0.001)

    # Topic distribution of every document, printed as one list ...
    doc_topic = list(lda[corpus_tfidf])
    print('Document-Topic:\n')
    pprint(doc_topic)
    # ... and once more document by document. The loop variable has its own
    # name so it no longer overwrites the `doc_topic` list above.
    for single_doc_topics in lda.get_document_topics(corpus_tfidf):
        print(single_doc_topics)

    # Top terms of each topic as returned by show_topic().
    # lda.get_topic_terms(topicid=topic_id) is an alternative accessor.
    for topic_id in range(num_topics):
        print('Topic', topic_id)
        pprint(lda.show_topic(topic_id))
打印结果:
Text = [['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'], ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'management', 'system'], ['system', 'human', 'system', 'engineering', 'testing', 'eps'], ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'], ['generation', 'random', 'binary', 'unordered', 'trees'], ['intersection', 'graph', 'paths', 'trees'], ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'], ['graph', 'minors', 'survey']] [(0, 0.4301019571350565), (1, 0.4301019571350565), (2, 0.2944198962221451), (3, 0.2944198962221451), (4, 0.2944198962221451), (5, 0.4301019571350565), (6, 0.4301019571350565)] [(2, 0.3726494271826947), (7, 0.5443832091958983), (8, 0.3726494271826947), (9, 0.3726494271826947), (10, 0.27219160459794917), (11, 0.3726494271826947), (12, 0.27219160459794917)] [(4, 0.438482464916089), (10, 0.32027755044706185), (12, 0.32027755044706185), (13, 0.438482464916089), (14, 0.6405551008941237)] [(3, 0.3449874408519962), (10, 0.5039733231394895), (13, 0.3449874408519962), (15, 0.5039733231394895), (16, 0.5039733231394895)] [(8, 0.30055933182961736), (11, 0.30055933182961736), (12, 0.21953536176370683), (17, 0.43907072352741366), (18, 0.43907072352741366), (19, 0.43907072352741366), (20, 0.43907072352741366)] [(21, 0.48507125007266594), (22, 0.48507125007266594), (23, 0.48507125007266594), (24, 0.24253562503633297), (25, 0.48507125007266594)] [(24, 0.31622776601683794), (26, 0.31622776601683794), (27, 0.6324555320336759), (28, 0.6324555320336759)] [(24, 0.20466057569885868), (26, 0.20466057569885868), (29, 0.40932115139771735), (30, 0.2801947048062438), (31, 0.40932115139771735), (32, 0.40932115139771735), (33, 0.40932115139771735), (34, 0.40932115139771735)] [(9, 0.6282580468670046), (26, 0.45889394536615247), (30, 0.6282580468670046)] LSI Model: [[(0, 0.34057117986841934), (1, -0.20602251622679674)], [(0, 0.693304000217156), 
(1, 0.007232758390388724)], [(0, 0.590260767038972), (1, -0.3526046949085567)], [(0, 0.5214901821825131), (1, -0.338879761540553)], [(0, 0.39533193176354514), (1, -0.05919285336660024)], [(0, 0.036353173528493904), (1, 0.18146550208818907)], [(0, 0.14709012328778967), (1, 0.49432948127822307)], [(0, 0.2140711731756531), (1, 0.6406456664453937)], [(0, 0.4006656831817075), (1, 0.6413108299093995)]] LSI Topics: [(0, '0.400*"system" + 0.318*"survey" + 0.290*"user" + 0.274*"eps" + ' '0.236*"management"'), (1, '0.421*"minors" + 0.420*"graph" + 0.293*"survey" + 0.239*"trees" + ' '0.226*"intersection"')] Similarity: [array([ 1. , 0.8501795 , 0.9999846 , 0.9994811 , 0.9228376 , -0.33944285, -0.2520774 , -0.21974573, 0.01438823], dtype=float32), array([0.8501795 , 1. , 0.8530905 , 0.8327791 , 0.98737705, 0.20664607, 0.29518002, 0.32680073, 0.5386711 ], dtype=float32), array([ 0.9999846 , 0.8530905 , 1. , 0.9992868 , 0.92496276, -0.33421332, -0.24669874, -0.214324 , 0.01994151], dtype=float32), array([ 0.9994811 , 0.8327791 , 0.9992868 , 1. , 0.9099512 , -0.36956567, -0.28311783, -0.25105584, -0.01782739], dtype=float32), array([0.9228376 , 0.98737705, 0.92496276, 0.9099512 , 1. , 0.04906873, 0.14012395, 0.1729846 , 0.39842743], dtype=float32), array([-0.33944285, 0.20664607, -0.33421332, -0.36956567, 0.04906873, 1. , 0.99581695, 0.99222624, 0.93564534], dtype=float32), array([-0.2520774 , 0.29518002, -0.24669874, -0.28311783, 0.14012395, 0.99581695, 1. , 0.9994465 , 0.96397996], dtype=float32), array([-0.21974573, 0.32680073, -0.214324 , -0.25105584, 0.1729846 , 0.99222624, 0.9994465 , 0.99999994, 0.97229445], dtype=float32), array([ 0.01438823, 0.5386711 , 0.01994151, -0.01782739, 0.39842743, 0.93564534, 0.96397996, 0.97229445, 1. 
], dtype=float32)] LDA Model: Document-Topic: [[(0, 0.71403325), (1, 0.28596672)], [(0, 0.35664505), (1, 0.64335495)], [(0, 0.33336183), (1, 0.6666382)], [(0, 0.6937576), (1, 0.3062424)], [(0, 0.22962306), (1, 0.770377)], [(0, 0.26161683), (1, 0.7383832)], [(0, 0.6377626), (1, 0.3622374)], [(0, 0.74558794), (1, 0.25441203)], [(0, 0.66050804), (1, 0.33949193)]] [(0, 0.7139703), (1, 0.28602976)] [(0, 0.3565676), (1, 0.6434324)] [(0, 0.33338004), (1, 0.66661996)] [(0, 0.6937526), (1, 0.30624744)] [(0, 0.22970387), (1, 0.77029616)] [(0, 0.2616304), (1, 0.73836964)] [(0, 0.63779765), (1, 0.36220235)] [(0, 0.7456425), (1, 0.2543575)] [(0, 0.6604755), (1, 0.33952454)] Topic 0 [('graph', 0.042950127), ('system', 0.0411274), ('minors', 0.04057547), ('survey', 0.038986754), ('human', 0.034782734), ('trees', 0.03250014), ('paths', 0.032380816), ('computer', 0.031485233), ('eps', 0.03146121), ('intersection', 0.031362716)] Topic 1 [('user', 0.04024524), ('system', 0.037700906), ('survey', 0.03605829), ('time', 0.03475459), ('eps', 0.03466464), ('response', 0.03434055), ('interface', 0.033663087), ('management', 0.033292256), ('trees', 0.03284403), ('generation', 0.03110097)]对比以上,更好的理解LDA