LDA实践3（NLP）

#!/usr/bin/python
# -*- coding:utf-8 -*-

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import lda
import lda.datasets
from pprint import pprint


def _print_top_words(topic_word, vocab, n_top):
    """Print the ``n_top`` highest-probability words of every topic.

    ``topic_word`` is iterated row by row (one row per topic); ``vocab`` is
    indexed with the column order of those rows.
    """
    # Convert once: the vocabulary is identical for every topic, so there is
    # no reason to rebuild the array inside the loop.
    vocab_arr = np.array(vocab)
    for i, topic_dist in enumerate(topic_word):
        # argsort is ascending; walk it backwards to take the n_top largest.
        topic_words = vocab_arr[np.argsort(topic_dist)][:-(n_top + 1):-1]
        print('*Topic {}\n- {}'.format(i, ' '.join(topic_words)))


def _plot_topic_word(topic_word, topics):
    """Plot the word distribution of each topic in ``topics``, one per row."""
    plt.figure(figsize=(8, 9))
    n_rows = len(topics)
    for i, k in enumerate(topics):
        ax = plt.subplot(n_rows, 1, i + 1)
        ax.plot(topic_word[k, :], 'r-')
        # Axis limits are hard-coded for the 4258-word vocabulary: [0,4258]
        ax.set_xlim(-50, 4350)
        ax.set_ylim(0, 0.08)
        ax.set_ylabel(u"概率")
        ax.set_title(u"主题 {}".format(k))
    plt.xlabel(u"词", fontsize=14)
    # Add the figure title before tight_layout so the layout pass can see it
    # (same order as the document-topic figure).
    plt.suptitle(u'主题的词分布', fontsize=18)
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.show()


def _plot_doc_topic(doc_topic, docs, topic_num):
    """Stem-plot the topic distribution of each document in ``docs``."""
    plt.figure(figsize=(8, 9))
    n_rows = len(docs)
    for i, k in enumerate(docs):
        ax = plt.subplot(n_rows, 1, i + 1)
        ax.stem(doc_topic[k, :], linefmt='g-', markerfmt='ro')
        ax.set_xlim(-1, topic_num+1)
        ax.set_ylim(0, 1)
        ax.set_ylabel(u"概率")
        ax.set_title(u"文档 {}".format(k))
    plt.xlabel(u"主题", fontsize=14)
    plt.suptitle(u'文档的主题分布', fontsize=18)
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.show()


def main():
    """Fit an LDA model on the ``lda`` Reuters sample data and report it.

    Prints a summary of the input data, the top words of every topic and the
    dominant topic of the first ten documents, then shows two figures: the
    word distribution of selected topics and the topic distribution of
    selected documents.
    """
    # document-term matrix
    X = lda.datasets.load_reuters()
    print("type(X): {}".format(type(X)))
    print("shape: {}\n".format(X.shape))
    print(X[:10, :10])

    # the vocab
    vocab = lda.datasets.load_reuters_vocab()
    print("type(vocab): {}".format(type(vocab)))
    print("len(vocab): {}\n".format(len(vocab)))
    print(vocab[:10])

    # titles for each story
    titles = lda.datasets.load_reuters_titles()
    print("type(titles): {}".format(type(titles)))
    print("len(titles): {}\n".format(len(titles)))
    pprint(titles[:10])

    print( 'LDA start ----')
    topic_num = 20
    model = lda.LDA(n_topics=topic_num, n_iter=500, random_state=1)
    model.fit(X)

    # topic-word matrix: one row per topic, one column per vocabulary entry
    topic_word = model.topic_word_
    print("type(topic_word): {}".format(type(topic_word)))
    print("shape: {}".format(topic_word.shape))
    print(vocab[:5])
    print(topic_word[:, :5])

    # top words of every topic
    _print_top_words(topic_word, vocab, 7)

    # document-topic matrix: one row per document, one column per topic
    doc_topic = model.doc_topic_
    print("type(doc_topic): {}".format(type(doc_topic)))
    print("shape: {}".format(doc_topic.shape))
    for i in range(10):
        # most probable topic of document i and its probability
        topic_most_pr = doc_topic[i].argmax()
        print(u"文档: {} 主题: {} value: {}".format(i, topic_most_pr, doc_topic[i][topic_most_pr]))

    # Select the SimHei font for the Chinese labels and keep a plain ASCII
    # minus sign on the axes.
    mpl.rcParams['font.sans-serif'] = [u'SimHei']
    mpl.rcParams['axes.unicode_minus'] = False

    _plot_topic_word(topic_word, [0, 5, 9, 14, 19])
    _plot_doc_topic(doc_topic, [1, 3, 4, 8, 9], topic_num)


if __name__ == "__main__":
    main()

打印结果：

INFO:lda:n_documents: 395
INFO:lda:vocab_size: 4258
INFO:lda:n_words: 84010
INFO:lda:n_topics: 20
INFO:lda:n_iter: 500

type(X): <class 'numpy.ndarray'>
shape: (395, 4258)

[[ 1  0  1  0  0  0  1  0  0  1]
 [ 7  0  2  0  0  0  0  1  0  0]
 [ 0  0  0  1 10  0  4  1  1  0]
 [ 6  0  1  0  0  0  1  1  1  0]
 [ 0  0  0  2 14  1  1  0  2  1]
 [ 0  0  2  2 24  0  2  0  2  1]
 [ 0  0  0  2  7  1  1  0  1  0]
 [ 0  0  2  2 20  0  2  0  3  1]
 [ 0  1  0  2 17  2  2  0  0  0]
 [ 2  0  2  0  0  2  0  1  0  3]]
type(vocab): <class 'tuple'>
len(vocab): 4258

('church', 'pope', 'years', 'people', 'mother', 'last', 'told', 'first', 'world', 'year')
type(titles): <class 'tuple'>
len(titles): 395

('0 UK: Prince Charles spearheads British royal revolution. LONDON 1996-08-20',
 '1 GERMANY: Historic Dresden church rising from WW2 ashes. DRESDEN, Germany '
 '1996-08-21',
 "2 INDIA: Mother Teresa's condition said still unstable. CALCUTTA 1996-08-23",
 '3 UK: Palace warns British weekly over Charles pictures. LONDON 1996-08-25',
 '4 INDIA: Mother Teresa, slightly stronger, blesses nuns. CALCUTTA 1996-08-25',
 "5 INDIA: Mother Teresa's condition unchanged, thousands pray. CALCUTTA "
 '1996-08-25',
 '6 INDIA: Mother Teresa shows signs of strength, blesses nuns. CALCUTTA '
 '1996-08-26',
 "7 INDIA: Mother Teresa's condition improves, many pray. CALCUTTA, India "
 '1996-08-25',
 '8 INDIA: Mother Teresa improves, nuns pray for "miracle". CALCUTTA '
 '1996-08-26',
 '9 UK: Charles under fire over prospect of Queen Camilla. LONDON 1996-08-26')
LDA start ----

INFO:lda:<0> log likelihood: -1051748
INFO:lda:<10> log likelihood: -719800
INFO:lda:<20> log likelihood: -699115
INFO:lda:<30> log likelihood: -689370
INFO:lda:<40> log likelihood: -684918
INFO:lda:<50> log likelihood: -681322
INFO:lda:<60> log likelihood: -678979
INFO:lda:<70> log likelihood: -676598
INFO:lda:<80> log likelihood: -675383
INFO:lda:<90> log likelihood: -673316
INFO:lda:<100> log likelihood: -672761
INFO:lda:<110> log likelihood: -671320
INFO:lda:<120> log likelihood: -669744
INFO:lda:<130> log likelihood: -669292
INFO:lda:<140> log likelihood: -667940
INFO:lda:<150> log likelihood: -668038
INFO:lda:<160> log likelihood: -667429
INFO:lda:<170> log likelihood: -666475
INFO:lda:<180> log likelihood: -665562
INFO:lda:<190> log likelihood: -664920
INFO:lda:<200> log likelihood: -664979
INFO:lda:<210> log likelihood: -664722
INFO:lda:<220> log likelihood: -664459
INFO:lda:<230> log likelihood: -664360
INFO:lda:<240> log likelihood: -663600
INFO:lda:<250> log likelihood: -664164
INFO:lda:<260> log likelihood: -663826
INFO:lda:<270> log likelihood: -663458
INFO:lda:<280> log likelihood: -663393
INFO:lda:<290> log likelihood: -662904
INFO:lda:<300> log likelihood: -662294
INFO:lda:<310> log likelihood: -662031
INFO:lda:<320> log likelihood: -662430
INFO:lda:<330> log likelihood: -661601
INFO:lda:<340> log likelihood: -662108
INFO:lda:<350> log likelihood: -662152
INFO:lda:<360> log likelihood: -661899
INFO:lda:<370> log likelihood: -661012
INFO:lda:<380> log likelihood: -661278
INFO:lda:<390> log likelihood: -661085
INFO:lda:<400> log likelihood: -660418
INFO:lda:<410> log likelihood: -660510
INFO:lda:<420> log likelihood: -660343
INFO:lda:<430> log likelihood: -659789
INFO:lda:<440> log likelihood: -659336
INFO:lda:<450> log likelihood: -659039
INFO:lda:<460> log likelihood: -659329
INFO:lda:<470> log likelihood: -658707
INFO:lda:<480> log likelihood: -658879
INFO:lda:<490> log likelihood: -658819
INFO:lda:<499> log likelihood: -658407

type(topic_word): <class 'numpy.ndarray'>
shape: (20, 4258)
('church', 'pope', 'years', 'people', 'mother')
[[2.72436509e-06 2.72436509e-06 2.72708945e-03 2.72436509e-06
  2.72436509e-06]
 [2.29518860e-02 1.08771556e-06 7.83263973e-03 1.15308726e-02
  1.08771556e-06]
 [3.97404221e-03 4.96135108e-06 2.98177200e-03 4.96135108e-06
  4.96135108e-06]
 [3.27374625e-03 2.72585033e-06 2.72585033e-06 2.45599115e-03
  2.72585033e-06]
 [8.26262882e-03 8.56893407e-02 1.61980569e-06 4.87561512e-04
  1.61980569e-06]
 [1.30107788e-02 2.95632328e-06 2.95632328e-06 2.95632328e-06
  2.95632328e-06]
 [2.80145003e-06 2.80145003e-06 2.80145003e-06 2.80145003e-06
  2.80145003e-06]
 [2.42858077e-02 4.66944966e-06 4.66944966e-06 4.66944966e-06
  2.42858077e-02]
 [6.84655429e-03 1.90129250e-06 6.84655429e-03 1.90129250e-06
  1.90129250e-06]
 [3.48361655e-06 3.48361655e-06 3.48361655e-06 3.48361655e-06
  3.48361655e-06]
 [2.98781661e-03 3.31611166e-06 3.31611166e-06 8.29359526e-03
  3.31611166e-06]
 [4.27062069e-06 4.27062069e-06 4.27062069e-06 1.19620086e-02
  4.27062069e-06]
 [1.50994982e-02 1.64107142e-06 1.64107142e-06 1.59200339e-02
  2.95556963e-03]
 [7.73480150e-07 7.73480150e-07 1.70946848e-02 7.73480150e-07
  7.73480150e-07]
 [2.82280146e-06 2.82280146e-06 2.82280146e-06 6.77754631e-03
  7.28311005e-02]
 [5.15309856e-06 5.15309856e-06 4.64294180e-03 5.15309856e-06
  5.15309856e-06]
 [3.41695768e-06 3.41695768e-06 3.41695768e-06 1.29878561e-02
  3.41695768e-06]
 [3.90980357e-02 1.70316633e-03 4.42279319e-03 3.39953358e-06
  3.39953358e-06]
 [2.39373034e-06 2.39373034e-06 2.39373034e-06 2.39612407e-03
  2.39373034e-06]
 [3.32493234e-06 3.32493234e-06 3.32493234e-06 3.32493234e-06
  3.32493234e-06]]
*Topic 0
- government british minister west group letters party
*Topic 1
- church first during people political country ceremony
*Topic 2
- elvis king wright fans presley concert life
*Topic 3
- yeltsin russian russia president kremlin michael romania
*Topic 4
- pope vatican paul surgery pontiff john hospital
*Topic 5
- family police miami versace cunanan funeral home
*Topic 6
- south simpson born york white north african
*Topic 7
- order church mother successor since election religious
*Topic 8
- charles prince diana royal queen king parker
*Topic 9
- film france french against actor paris bardot
*Topic 10
- germany german war nazi christian letter book
*Topic 11
- east prize peace timor quebec belo indonesia
*Topic 12
- n't told life people church show very
*Topic 13
- years world time year last say three
*Topic 14
- mother teresa heart charity calcutta missionaries sister
*Topic 15
- city salonika exhibition buddhist byzantine vietnam swiss
*Topic 16
- music first people tour including off opera
*Topic 17
- church catholic bernardin cardinal bishop death cancer
*Topic 18
- harriman clinton u.s churchill paris president ambassador
*Topic 19
- century art million museum city churches works
type(doc_topic): <class 'numpy.ndarray'>
shape: (395, 20)
文档: 0 主题: 8 value: 0.4830434782608696
文档: 1 主题: 1 value: 0.29057971014492756
文档: 2 主题: 14 value: 0.6656903765690377
文档: 3 主题: 8 value: 0.5076555023923446
文档: 4 主题: 14 value: 0.7789667896678965
文档: 5 主题: 14 value: 0.844097222222222
文档: 6 主题: 14 value: 0.8035353535353537
文档: 7 主题: 14 value: 0.877474402730375
文档: 8 主题: 14 value: 0.8196153846153844
文档: 9 主题: 8 value: 0.5342105263157892

猜你喜欢