#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Five-way emotion labelling of danmaku lines via word2vec similarity.

For every line of ``qinggancidanmu.txt`` the words after the first
space-separated field are compared, with ``n_similarity``, against five
single-word emotion anchors.  The index (0-4) of the most similar anchor is
written to ``fenlei.txt`` between the first field and the rest of the line.
Lines whose similarity cannot be computed are labelled ``-1``.
"""
import sys
import codecs  # lets us open files with an explicit encoding
import jieba
import jieba.posseg as pseg
import gensim

# NOTE: sys, jieba and pseg are not used below; the imports are retained
# because the original script had them.

EMBEDDING_PATH = 'news_12g_baidubaike_20g_novel_90g_embedding_64.bin'
INPUT_PATH = "qinggancidanmu.txt"
OUTPUT_PATH = 'fenlei.txt'

# Anchor word lists; the position in this tuple is the label that is written
# to the output file (0..4).  n_similarity expects lists of unicode words.
EMOTION_ANCHORS = ([u'乐'], [u'哀'], [u'怒'], [u'惊'], [u'恶'])

# Label written when the similarity computation fails for a line.
UNCLASSIFIED_LABEL = "-1"

# Tokens that are nothing but a line terminator.  Windows files end lines
# with "\r\n", Linux files with "\n"; neither is a word.
LINE_TERMINATORS = ("\r\n", "\n")

# A progress counter is printed every this many input lines.
PROGRESS_INTERVAL = 500


def load_word_vectors(path):
    """Load binary word2vec vectors from *path* and return the lookup object.

    Some gensim releases expose the vectors of a ``KeyedVectors`` object
    through a ``.wv`` attribute while others do not provide it, so fall back
    to the loaded object itself when the attribute is missing.
    """
    model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
    return getattr(model, 'wv', model)


def extract_words(line):
    """Return the words of *line* after its first space-separated field.

    Tokens consisting solely of a line terminator are dropped; every other
    token is kept exactly as it appears in the line.
    """
    tokens = line.split(" ")[1:]
    return [token for token in tokens if token not in LINE_TERMINATORS]


def classify(word_vec, words):
    """Return the label for *words* as a string.

    The label is the index of the anchor in ``EMOTION_ANCHORS`` with the
    highest ``n_similarity`` score (the first one wins on ties), or
    ``UNCLASSIFIED_LABEL`` when the scores cannot be computed.
    """
    try:
        scores = [word_vec.n_similarity(words, anchor)
                  for anchor in EMOTION_ANCHORS]
    except Exception:
        # Best effort, as in the original script: any failure inside gensim
        # (presumably an out-of-vocabulary word or an empty word list) marks
        # the line as unclassified.  Unlike a bare ``except`` this still lets
        # KeyboardInterrupt / SystemExit stop the run.
        return UNCLASSIFIED_LABEL
    return str(scores.index(max(scores)))


def main():
    """Classify every input line and write the labelled lines to the output."""
    word_vec = load_word_vectors(EMBEDDING_PATH)

    # Both files are opened with an explicit UTF-8 codec, so no change of the
    # interpreter-wide default encoding is needed, and ``with`` guarantees
    # they are closed even if processing fails part-way through.
    with codecs.open(INPUT_PATH, 'r', 'utf-8') as source, \
            codecs.open(OUTPUT_PATH, 'w', 'utf-8') as target:
        for index, line in enumerate(source):
            if index % PROGRESS_INTERVAL == 0:
                print(index)  # show how far the run has progressed

            first_field, separator, remainder = line.partition(" ")
            if not separator:
                # No space on the line, hence no text to classify.  Skip
                # blank lines and emit a bare unclassified record otherwise,
                # instead of failing on a missing second field.
                first_field, remainder = line.rstrip("\r\n"), ""
                if not first_field:
                    continue

            label = classify(word_vec, extract_words(line))
            # ``remainder`` still carries the line's own terminator; the extra
            # '\n' is kept so the output layout matches the original script.
            target.write(first_field + " " + label + " " + remainder + '\n')

    print("end")


if __name__ == "__main__":
    main()
word2vec全部弹幕比较句子相似度情感五分类
猜你喜欢
转载自blog.csdn.net/qq_35398413/article/details/81043658
今日推荐
周排行