"""NLP sample code: gensim pipeline over a Douban review corpus.

Pipeline: corpus -> dictionary -> bag-of-words -> model (TF-IDF / LSI / LDA)
-> similarity index.  Basic idea: vectorize documents, then compare vectors.
"""
import logging
from logging import NullHandler

import jieba  # bug fix: jieba.cut was used below but jieba was never imported
from gensim import corpora, models, similarities

from cleaner import StopWordFilter
from corpussrc import DoubanCorpus

log = logging.getLogger(__name__)
log.addHandler(NullHandler())


def test_lsi_query(dictionary, lsi, index):
    """Query the LSI similarity index with a sample sentence and log ranked hits.

    Args:
        dictionary: gensim Dictionary mapping tokens to ids (for doc2bow).
        lsi: trained LsiModel projecting bow vectors into LSI topic space.
        index: MatrixSimilarity index built over the LSI-projected corpus.
    """
    teststr = u'haha a comment is so cute, do you guys know how hard Wu Jing is? But one star is for news. '
    stop_filter = StopWordFilter()  # renamed: 'filter' shadowed the builtin
    vec_bow = dictionary.doc2bow(stop_filter.transform(jieba.cut(teststr)))
    vec_lsi = lsi[vec_bow]
    sims = index[vec_lsi]
    # Sort (doc_id, similarity) pairs by similarity, descending.
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    log.warning(sims)  # Logger.warn() is deprecated; warning() is the supported name


def test_deep_learning():
    """Train a Word2Vec model on the review corpus and probe it.

    size: dimension of the feature vector; window: length of the context
    window; min_count: minimum word frequency; workers: number of workers.
    """
    model = models.Word2Vec(
        DoubanCorpus('tbDoubanReview'),
        size=100,
        window=5,
        min_count=5,
        workers=4,
    )
    # NOTE(review): results are intentionally discarded — usage demo only.
    model.wv[u'Wu Jing']
    model.wv.most_similar(positive=['woman', 'king'], negative=['man'])


def testapi():
    """Walk the gensim pipeline: dictionary, bow corpus, TF-IDF, LSI, LDA, query."""
    dictionary = corpora.Dictionary(DoubanCorpus('tbDoubanReview'))
    log.warning(dictionary.token2id)
    docs = DoubanCorpus('tbDoubanReview')
    # bow format: e.g. [(0, 1), (1, 1), ..., (10, 1)], [(11, 1), (12, 1)].
    # Each tuple is (token id in the dictionary, occurrence count in this
    # document); only tokens actually present in the document are stored.
    corpus = []
    for docwords in docs:
        log.warning(docwords)
        corpus.append(dictionary.doc2bow(docwords))
    log.warning(corpus)
    # 1) bow -> TF-IDF.  Weights reflect word importance (larger = more
    #    important), e.g. [(4, 0.447...), (5, 0.447...), ...].
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    for doc in corpus_tfidf:
        log.warning(doc)
    # 2) LSI model: compresses the data with SVD, a de-noising step that
    #    reduces interference from synonyms/polysemy.
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
    corpus_lsi = lsi[corpus_tfidf]  # kept for demonstration; lazy wrapper
    lsi.print_topics(10)
    # LDA takes the plain bow corpus (not TF-IDF) as input.
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=100)
    lda.print_topics(20)
    # 3) Build the similarity index over the LSI-projected vectors and query.
    index = similarities.MatrixSimilarity(lsi[corpus_tfidf])
    test_lsi_query(dictionary, lsi, index)


if __name__ == '__main__':
    testapi()
nlp sample code
Guess you like
Origin http://43.154.161.224:23101/article/api/json?id=326529399&siteId=291194637
Recommended
Ranking