NLP sample code

import logging
from logging import NullHandler

log = logging.getLogger(__name__)
log.addHandler(NullHandler())

from corpussrc import DoubanCorpus
from gensim import corpora, models, similarities
from cleaner import StopWordFilter

def test_lsi_query(dictionary, lsi, index):
    """Run a similarity query for a hard-coded test comment against the LSI index.

    Args:
        dictionary: gensim ``corpora.Dictionary`` used to build the bow vector.
        lsi: trained gensim ``LsiModel``.
        index: gensim ``similarities.MatrixSimilarity`` built over the LSI corpus.

    Results are logged as (doc_id, similarity) pairs, best match first.
    """
    # Local import: jieba was referenced below but never imported anywhere
    # in this file, which made the original raise NameError at call time.
    import jieba

    teststr = u'haha a comment is so cute, do you guys know how hard Wu Jing is? But one star is for news. '
    stop_filter = StopWordFilter()  # renamed: 'filter' shadowed the builtin
    # Segment the text, drop stop words, then convert to bag-of-words.
    vec_bow = dictionary.doc2bow(stop_filter.transform(jieba.cut(teststr)))
    vec_lsi = lsi[vec_bow]   # project the bow vector into LSI topic space
    sims = index[vec_lsi]    # similarity of the query against every indexed document
    # Sort (doc_id, score) pairs by descending similarity.
    sims = sorted(enumerate(sims), key=lambda item: -item[1])

    log.warning(sims)  # logging.warn is a deprecated alias of warning

def test_deep_learning():
    """Train a Word2Vec model on the Douban review corpus and probe it.

    size: dimension of the feature vectors; window: context window length;
    min_count: minimum word frequency to keep; workers: number of workers.
    """
    model = models.Word2Vec(DoubanCorpus('tbDoubanReview'),
                            size=100, window=5, min_count=5, workers=4)
    # Log the probe results — the originals computed them and silently
    # discarded the values, so the demo produced no visible output here.
    log.warning(model.wv[u'Wu Jing'])
    # Classic analogy query: king - man + woman ~= queen.
    log.warning(model.wv.most_similar(positive=['woman', 'king'], negative=['man']))

# Corpus-"dictionary->bow->model->similar
 # Basic idea: vectorization, comparison vector
 def testapi():
    dictionary = corpora.Dictionary(DoubanCorpus('tbDoubanReview'))
    log.warn(dictionary.token2id)

    docs = DoubanCorpus( 'tbDoubanReview' )
     # bow format[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6 , 1), (7, 1), (8, 1), (9, 1), (10, 1)], [(11, 1), (12, 1)]
     # The meaning of each tuple is The id in the dictionary, the number of times this document contains, only the words contained in this document are stored
 corpus = []
     for docwords in docs:
        log.warn(docwords)
        corpus.append(dictionary.doc2bow(docwords))

    log.warn(corpus)   # 1, bow

 tfidf = models.TfidfModel(corpus)   # 2, bow ->tfidf
     # [(4, 0.447213595499958), (5, 0.447213595499958), (6, 0.447213595499958), (7, 0.44499899) , (8, 0.447213595499958)]
     # Reflect the importance of each word, the larger the value, the more important
 corpus_tfidf = tfidf[corpus]
     for doc in corpus_tfidf:
        log.warn(doc)

    # 3. lsi model, using singular matrix to compress data (SVD), is a process of removing noise (removing the interference of synonyms and antonyms)
 lsi = models.LsiModel(corpus_tfidf, id2word =dictionary, num_topics = 10 )
     corpus_lsi = lsi[corpus_tfidf ]
    lsi.print_topics(10)

    lda = models.LdaModel(corpus, id2word =dictionary, num_topics = 100 ) # input is in bow format
 lda.print_topics( 20 )

    # First establish the index of the vector model
 index = similarities.MatrixSimilarity(lsi[corpus_tfidf])
    test_lsi_query(dictionary, lsi, index)


if __name__ == '__main__':
    # Script entry point: run the end-to-end gensim pipeline demo.
    testapi()

Recommended reading

Origin http://43.154.161.224:23101/article/api/json?id=326529399&siteId=291194637