'''
Chinese sentiment analysis
'''
from sklearn.cross_validation import train_test_split
from gensim.models.word2vec import Word2Vec
import numpy as np
import pandas as pd
import jieba
from sklearn.externals import joblib #把数据转化为二进制
from sklearn.svm import SVC
import sys
'''
Data preprocessing: load the data,
preprocess it,
and split it into training and test sets.
'''
def load_file_and_processing():
    """Load positive/negative review spreadsheets, tokenize, and split.

    Reads the two Excel files (one review per row, no header row), segments
    each review with jieba, labels positives 1 and negatives 0, makes an
    80/20 train/test split, and persists the label arrays as .npy files.

    Returns:
        (x_train, x_test): token-list arrays for the train and test reviews.
    """
    # header=None keeps the first review as data and exposes integer column 0,
    # so pos[0]/neg[0] below resolve correctly (the original header-full read
    # made pos[0] raise, as the old inline comment warned).
    neg = pd.read_excel('H:/word2vect_3data/Chinese_data/neg.xls', header=None)
    pos = pd.read_excel('H:/word2vect_3data/Chinese_data/pos.xls', header=None)
    cut = lambda text: list(jieba.cut(text))  # jieba word segmentation
    pos['words'] = pos[0].apply(cut)
    neg['words'] = neg[0].apply(cut)
    # Use 1 for positive sentiment, 0 for negative.
    y = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))
    x_train, x_test, y_train, y_test = train_test_split(
        np.concatenate((pos['words'], neg['words'])), y, test_size=0.2)
    np.save('H:/word2vect_3data/Chinese_data/y_train.npy', y_train)
    np.save('H:/word2vect_3data/Chinese_data/y_test.npy', y_test)
    return x_train, x_test
'''
Average all word vectors of a sentence to build one sentence vector.
'''
def build_sentence_vector(text, size, imdb_w2v):
    """Average the word vectors of ``text`` into one (1, size) vector.

    Tokens missing from the embedding vocabulary are skipped; if no token
    is in vocabulary, the zero vector is returned.
    """
    total = np.zeros((1, size))
    n_known = 0
    for token in text:
        try:
            total += imdb_w2v[token].reshape((1, size))
        except KeyError:
            # Out-of-vocabulary token: ignore it.
            continue
        n_known += 1
    return total / n_known if n_known else total
'''
Compute the word vectors.
'''
def get_train_vecs(x_train, x_test):
    """Train a Word2Vec model and build averaged sentence vectors.

    Trains the embedding on the training reviews, averages per-word vectors
    into per-sentence vectors for both splits, and saves the model and the
    vector matrices to disk.
    """
    n_dim = 300  # embedding dimensionality
    # Words seen fewer than min_count times are dropped (gensim default is 5).
    # gensim >= 4 renamed the old `size` parameter to `vector_size`.
    imdb_w2v = Word2Vec(vector_size=n_dim, min_count=10)
    imdb_w2v.build_vocab(x_train)
    # gensim >= 4 requires total_examples and epochs on every train() call.
    imdb_w2v.train(x_train, total_examples=imdb_w2v.corpus_count,
                   epochs=imdb_w2v.epochs)
    # Word lookups live on the KeyedVectors object (model.wv) in gensim >= 4.
    train_vecs = np.concatenate(
        [build_sentence_vector(z, n_dim, imdb_w2v.wv) for z in x_train])
    np.save('H:/word2vect_3data/Chinese_data/train_vecs.npy', train_vecs)
    print('train_vecs size:')
    print(train_vecs.shape)
    # NOTE(review): continuing training on the test reviews leaks test data
    # into the embedding; kept only to preserve the original pipeline.
    imdb_w2v.build_vocab(x_test, update=True)
    imdb_w2v.train(x_test, total_examples=len(x_test),
                   epochs=imdb_w2v.epochs)
    imdb_w2v.save('H:/word2vect_3data/Chinese_data/w2v_model.pkl')
    # Build the test sentence vectors with the updated embedding.
    test_vecs = np.concatenate(
        [build_sentence_vector(z, n_dim, imdb_w2v.wv) for z in x_test])
    np.save('H:/word2vect_3data/Chinese_data/test_vecs.npy', test_vecs)
    print('test_vecs size:')
    print(test_vecs.shape)
def get_data():
    """Load the cached sentence vectors and label arrays from disk."""
    base = 'H:/word2vect_3data/Chinese_data/'
    train_vecs = np.load(base + 'train_vecs.npy')
    test_vecs = np.load(base + 'test_vecs.npy')
    y_train = np.load(base + 'y_train.npy')
    y_test = np.load(base + 'y_test.npy')
    return train_vecs, test_vecs, y_train, y_test
'''
Train the classifier.
'''
def svm_train(train_vecs, y_train, test_vecs, y_test):
    """Fit an RBF-kernel SVM on the sentence vectors, save it, print accuracy."""
    classifier = SVC(kernel='rbf', verbose=True)
    classifier.fit(train_vecs, y_train)
    # Persist the fitted model for later single-sentence prediction.
    joblib.dump(classifier, 'H:/word2vect_3data/Chinese_data/model.pkl')
    accuracy = classifier.score(test_vecs, y_test)
    print(accuracy)
'''
Build the vector for a sentence to be classified.
'''
def get_predict_vecs(words):
    """Return the 300-dim averaged sentence vector for a tokenized sentence."""
    n_dim = 300
    # Reload the embedding trained by get_train_vecs().
    model = Word2Vec.load('H:/word2vect_3data/Chinese_data/w2v_model.pkl')
    return build_sentence_vector(words, n_dim, model)
'''
Run sentiment analysis on a single sentence.
'''
def svm_predict(string):
    """Segment one sentence, vectorize it, and print its predicted polarity."""
    tokens = jieba.cut(string)  # generator of tokens (jieba.lcut returns a list)
    sentence_vec = get_predict_vecs(tokens)
    classifier = joblib.load('H:/word2vect_3data/Chinese_data/model.pkl')
    prediction = classifier.predict(sentence_vec)
    label = 'positive' if int(prediction[0]) == 1 else 'negative'
    print(label)
# Word2vec for Chinese sentiment analysis.
# Adapted from: blog.csdn.net/weixin_40924580/article/details/83629615
# (Stray page chrome from the scraped blog page removed so the file parses.)