LDA实践1（NLP）

#!/usr/bin/python
# -*- coding:utf-8 -*-

from gensim import corpora, models, similarities
from pprint import pprint

# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


if __name__ == '__main__':
    # Demo script: tokenize a small text corpus (one document per line),
    # convert it to TF-IDF vectors, then fit an LSI model and an LDA model on
    # those vectors and print every intermediate result for comparison.

    # Words removed from every document before the dictionary is built.
    stop_list = set('for a of the and to in'.split())

    # Lowercase each line, split on whitespace and drop stop words.
    # The context manager guarantees the file handle is closed after reading.
    with open('22.LDA_test.txt') as f:
        texts = [[word for word in line.strip().lower().split() if word not in stop_list]
                 for line in f]
    print('Text = ')
    pprint(texts)

    # Map each distinct token to an integer id, then encode every document as
    # a sparse bag-of-words list of (token_id, count) pairs.
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Re-weight the bag-of-words counts with TF-IDF.
    corpus_tfidf = models.TfidfModel(corpus)[corpus]
    for doc_vector in corpus_tfidf:
        print(doc_vector)

    # ---- LSI: project the TF-IDF vectors onto 2 latent topics ----
    print('\nLSI Model:')
    lsi = models.LsiModel(corpus_tfidf, num_topics=2, id2word=dictionary)
    topic_result = list(lsi[corpus_tfidf])
    pprint(topic_result)
    print('LSI Topics:')
    pprint(lsi.print_topics(num_topics=2, num_words=5))

    # Pairwise document similarity computed in the 2-dimensional LSI space.
    similarity = similarities.MatrixSimilarity(lsi[corpus_tfidf])   # similarities.Similarity()
    print('Similarity:')
    pprint(list(similarity))

    # ---- LDA: fit 2 topics on the same TF-IDF corpus ----
    # NOTE(review): the model is trained on TF-IDF weights here rather than on
    # the raw bag-of-words counts in `corpus` -- confirm this is intended.
    print('\nLDA Model:')
    num_topics = 2
    lda = models.LdaModel(corpus_tfidf, num_topics=num_topics, id2word=dictionary,
                          alpha='auto', eta='auto', minimum_probability=0.001)

    # Topic distribution of every document, obtained through indexing ...
    doc_topic = list(lda[corpus_tfidf])
    print('Document-Topic:\n')
    pprint(doc_topic)
    # ... and again through the explicit API. A separate loop name is used so
    # the `doc_topic` list above is not overwritten by the loop variable.
    for topic_dist in lda.get_document_topics(corpus_tfidf):
        print(topic_dist)

    # Most significant words of each topic with their weights.
    for topic_id in range(num_topics):
        print('Topic', topic_id)
        pprint(lda.show_topic(topic_id))

打印结果：

Text = 
[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]
[(0, 0.4301019571350565), (1, 0.4301019571350565), (2, 0.2944198962221451), (3, 0.2944198962221451), (4, 0.2944198962221451), (5, 0.4301019571350565), (6, 0.4301019571350565)]
[(2, 0.3726494271826947), (7, 0.5443832091958983), (8, 0.3726494271826947), (9, 0.3726494271826947), (10, 0.27219160459794917), (11, 0.3726494271826947), (12, 0.27219160459794917)]
[(4, 0.438482464916089), (10, 0.32027755044706185), (12, 0.32027755044706185), (13, 0.438482464916089), (14, 0.6405551008941237)]
[(3, 0.3449874408519962), (10, 0.5039733231394895), (13, 0.3449874408519962), (15, 0.5039733231394895), (16, 0.5039733231394895)]
[(8, 0.30055933182961736), (11, 0.30055933182961736), (12, 0.21953536176370683), (17, 0.43907072352741366), (18, 0.43907072352741366), (19, 0.43907072352741366), (20, 0.43907072352741366)]
[(21, 0.48507125007266594), (22, 0.48507125007266594), (23, 0.48507125007266594), (24, 0.24253562503633297), (25, 0.48507125007266594)]
[(24, 0.31622776601683794), (26, 0.31622776601683794), (27, 0.6324555320336759), (28, 0.6324555320336759)]
[(24, 0.20466057569885868), (26, 0.20466057569885868), (29, 0.40932115139771735), (30, 0.2801947048062438), (31, 0.40932115139771735), (32, 0.40932115139771735), (33, 0.40932115139771735), (34, 0.40932115139771735)]
[(9, 0.6282580468670046), (26, 0.45889394536615247), (30, 0.6282580468670046)]

LSI Model:
[[(0, 0.34057117986841934), (1, -0.20602251622679674)],
 [(0, 0.693304000217156), (1, 0.007232758390388724)],
 [(0, 0.590260767038972), (1, -0.3526046949085567)],
 [(0, 0.5214901821825131), (1, -0.338879761540553)],
 [(0, 0.39533193176354514), (1, -0.05919285336660024)],
 [(0, 0.036353173528493904), (1, 0.18146550208818907)],
 [(0, 0.14709012328778967), (1, 0.49432948127822307)],
 [(0, 0.2140711731756531), (1, 0.6406456664453937)],
 [(0, 0.4006656831817075), (1, 0.6413108299093995)]]
LSI Topics:
[(0,
  '0.400*"system" + 0.318*"survey" + 0.290*"user" + 0.274*"eps" + '
  '0.236*"management"'),
 (1,
  '0.421*"minors" + 0.420*"graph" + 0.293*"survey" + 0.239*"trees" + '
  '0.226*"intersection"')]
Similarity:
[array([ 1.        ,  0.8501795 ,  0.9999846 ,  0.9994811 ,  0.9228376 ,
       -0.33944285, -0.2520774 , -0.21974573,  0.01438823], dtype=float32),
 array([0.8501795 , 1.        , 0.8530905 , 0.8327791 , 0.98737705,
       0.20664607, 0.29518002, 0.32680073, 0.5386711 ], dtype=float32),
 array([ 0.9999846 ,  0.8530905 ,  1.        ,  0.9992868 ,  0.92496276,
       -0.33421332, -0.24669874, -0.214324  ,  0.01994151], dtype=float32),
 array([ 0.9994811 ,  0.8327791 ,  0.9992868 ,  1.        ,  0.9099512 ,
       -0.36956567, -0.28311783, -0.25105584, -0.01782739], dtype=float32),
 array([0.9228376 , 0.98737705, 0.92496276, 0.9099512 , 1.        ,
       0.04906873, 0.14012395, 0.1729846 , 0.39842743], dtype=float32),
 array([-0.33944285,  0.20664607, -0.33421332, -0.36956567,  0.04906873,
        1.        ,  0.99581695,  0.99222624,  0.93564534], dtype=float32),
 array([-0.2520774 ,  0.29518002, -0.24669874, -0.28311783,  0.14012395,
        0.99581695,  1.        ,  0.9994465 ,  0.96397996], dtype=float32),
 array([-0.21974573,  0.32680073, -0.214324  , -0.25105584,  0.1729846 ,
        0.99222624,  0.9994465 ,  0.99999994,  0.97229445], dtype=float32),
 array([ 0.01438823,  0.5386711 ,  0.01994151, -0.01782739,  0.39842743,
        0.93564534,  0.96397996,  0.97229445,  1.        ], dtype=float32)]

LDA Model:
Document-Topic:

[[(0, 0.71403325), (1, 0.28596672)],
 [(0, 0.35664505), (1, 0.64335495)],
 [(0, 0.33336183), (1, 0.6666382)],
 [(0, 0.6937576), (1, 0.3062424)],
 [(0, 0.22962306), (1, 0.770377)],
 [(0, 0.26161683), (1, 0.7383832)],
 [(0, 0.6377626), (1, 0.3622374)],
 [(0, 0.74558794), (1, 0.25441203)],
 [(0, 0.66050804), (1, 0.33949193)]]
[(0, 0.7139703), (1, 0.28602976)]
[(0, 0.3565676), (1, 0.6434324)]
[(0, 0.33338004), (1, 0.66661996)]
[(0, 0.6937526), (1, 0.30624744)]
[(0, 0.22970387), (1, 0.77029616)]
[(0, 0.2616304), (1, 0.73836964)]
[(0, 0.63779765), (1, 0.36220235)]
[(0, 0.7456425), (1, 0.2543575)]
[(0, 0.6604755), (1, 0.33952454)]
Topic 0
[('graph', 0.042950127),
 ('system', 0.0411274),
 ('minors', 0.04057547),
 ('survey', 0.038986754),
 ('human', 0.034782734),
 ('trees', 0.03250014),
 ('paths', 0.032380816),
 ('computer', 0.031485233),
 ('eps', 0.03146121),
 ('intersection', 0.031362716)]
Topic 1
[('user', 0.04024524),
 ('system', 0.037700906),
 ('survey', 0.03605829),
 ('time', 0.03475459),
 ('eps', 0.03466464),
 ('response', 0.03434055),
 ('interface', 0.033663087),
 ('management', 0.033292256),
 ('trees', 0.03284403),
 ('generation', 0.03110097)]

对比以上结果，可以更好地理解LDA

猜你喜欢