# NLP示例代码 (NLP sample code)

import logging
from logging import NullHandler

# Module-level logger named after this module. Only a NullHandler is attached
# here, so records are dropped unless the running application configures
# logging itself (e.g. with logging.basicConfig).
log = logging.getLogger(__name__)
log.addHandler(NullHandler())

from corpussrc import DoubanCorpus
from gensim import corpora, models, similarities
from cleaner import StopWordFilter

def test_lsi_query(dictionary, lsi, index):
    """Run one hard-coded review through the LSI pipeline and log the matches.

    The sample sentence is tokenised with ``jieba.cut``, passed through
    ``StopWordFilter.transform``, converted to a bag-of-words vector with
    ``dictionary.doc2bow``, projected with ``lsi`` and looked up in ``index``.
    The resulting ``(position, similarity)`` pairs are logged at WARNING
    level, highest similarity first.

    Args:
        dictionary: Object providing ``doc2bow`` (``testapi`` passes a gensim
            ``corpora.Dictionary``).
        lsi: Model that can be indexed with a bag-of-words vector
            (``testapi`` passes a ``models.LsiModel``).
        index: Similarity index that can be indexed with an LSI vector
            (``testapi`` passes a ``similarities.MatrixSimilarity``).
    """
    # jieba is used below but is not imported at module level; importing it
    # here keeps the function from failing with NameError.
    import jieba

    teststr = u'哈哈有个评论太可爱了,你们知道吴京有多努力吗?不过打一星是看新闻气的。'
    # Named stop_filter rather than `filter` so the builtin is not shadowed.
    stop_filter = StopWordFilter()
    vec_bow = dictionary.doc2bow(stop_filter.transform(jieba.cut(teststr)))
    vec_lsi = lsi[vec_bow]
    sims = index[vec_lsi]
    # Negating the score sorts in descending order of similarity.
    sims = sorted(enumerate(sims), key=lambda item: -item[1])

    # Logger.warn is a deprecated alias; warning() is the supported spelling.
    log.warning(sims)

def test_deep_learning():
    """Train a Word2Vec model on the ``tbDoubanReview`` corpus and query it.

    The results of both lookups are discarded; the function only exercises
    the Word2Vec API.
    """
    # size: dimensionality of the feature vectors; window: length of the
    # context window; min_count: minimum word frequency; workers: number of
    # parallel workers.
    # NOTE(review): newer gensim releases reportedly renamed `size` to
    # `vector_size` -- confirm against the installed gensim version.
    model = models.Word2Vec(DoubanCorpus('tbDoubanReview'), size=100, window=5, min_count=5, workers=4)
    model.wv[u'吴京']
    # NOTE(review): these English query words presumably have to occur in the
    # trained vocabulary for this call to succeed -- confirm for this corpus.
    model.wv.most_similar(positive=['woman', 'king'], negative=['man'])

# corpus -> dictionary -> bow -> model -> similarity
# Basic idea: vectorise the documents, then compare the vectors.
def testapi():
    """Walk the ``tbDoubanReview`` corpus through the gensim pipeline.

    Builds a dictionary, converts every document to bag-of-words, derives a
    TF-IDF model, fits LSI and LDA models, builds a similarity index over the
    LSI representation and finally runs ``test_lsi_query`` against it.
    Intermediate results are logged at WARNING level.
    """
    dictionary = corpora.Dictionary(DoubanCorpus('tbDoubanReview'))
    log.warning(dictionary.token2id)

    docs = DoubanCorpus('tbDoubanReview')
    # bow format, e.g. [(0, 1), (1, 1), (2, 1), ..., (10, 1)], [(11, 1), (12, 1)]
    # Each tuple is (id in the dictionary, occurrences in this document);
    # only words that occur in the document are stored.
    corpus = []
    for docwords in docs:
        log.warning(docwords)
        corpus.append(dictionary.doc2bow(docwords))

    log.warning(corpus)  # 1. bow

    tfidf = models.TfidfModel(corpus)  # 2. bow -> tf-idf
    # e.g. [(4, 0.447213595499958), (5, 0.447213595499958), ...]
    # Reflects the importance of each word: the larger the value, the more
    # important the word.
    corpus_tfidf = tfidf[corpus]
    for doc in corpus_tfidf:
        log.warning(doc)

    # 3. LSI model: compresses the data with SVD, which removes noise
    # (interference from synonyms and antonyms).
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
    corpus_lsi = lsi[corpus_tfidf]
    lsi.print_topics(10)

    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=100)  # input is in bow format
    lda.print_topics(20)

    # Build the index over the LSI vectors first; the already-transformed
    # corpus is reused instead of applying the model a second time.
    index = similarities.MatrixSimilarity(corpus_lsi)
    test_lsi_query(dictionary, lsi, index)


if __name__ == '__main__':
    # The module logger only carries a NullHandler, so without configuring
    # logging every log call made by testapi() is silently discarded.
    # INFO is chosen so that library output below WARNING is presumably
    # visible as well -- raise the level if that proves too noisy.
    logging.basicConfig(level=logging.INFO)
    testapi()

# 猜你喜欢

# 转载自 eric-weitm.iteye.com/blog/2389401
# 今日推荐