# Read the unlabeled reviews; readlines() yields one list entry per line.
with open('IMDB_data/unsup.txt', 'r') as infile:
    unsup_reviews = infile.readlines()

# Use 1 for positive sentiment and 0 for negative sentiment. The label order
# matches the concatenation order below (positives first, then negatives).
y = np.concatenate((np.ones(len(pos_reviews)), np.zeros(len(neg_reviews))))

# Hold out 20% of the labeled reviews as the test split.
x_train, x_test, y_train, y_test = train_test_split(
    np.concatenate((pos_reviews, neg_reviews)), y, test_size=0.2)
# Remove symbols that are unrelated to the text itself (text preprocessing).
def cleanText(corpus):
    """Normalize raw review strings into lists of lowercase tokens.

    Each document is lowercased, stripped of newline characters and HTML
    line-break tags, and then split on whitespace. Punctuation marks are
    padded with spaces first so that each one becomes its own token.

    Args:
        corpus: Iterable of review strings.

    Returns:
        A list with one entry per document, each entry being the list of
        tokens for that document.
    """
    punctuation = '.,?!:;(){}[]'
    cleaned = []
    for doc in corpus:
        text = doc.lower().replace('\n', '')
        # Replace break tags with a space (not an empty string) so the words
        # on either side of the tag are not fused into a single token.
        text = text.replace('<br />', ' ').replace('<br/>', ' ')
        # Treat every punctuation mark as an individual word.
        for c in punctuation:
            text = text.replace(c, ' %s ' % c)
        cleaned.append(text.split())
    return cleaned
# Tokenize the train, test and unlabeled review sets with the same cleaner.
x_train, x_test, unsup_reviews = map(
    cleanText, (x_train, x_test, unsup_reviews))
# Gensim's Doc2Vec implementation requires each document/paragraph to have a
# label associated with it. We do this by using the LabeledSentence method.
# The format will be "TRAIN_i" or "TEST_i" where "i" is a dummy index of the
# review.
def labelizeReviews(reviews, label_type):
    """Wrap each review in a LabeledSentence carrying a unique label.

    The label for the review at position ``i`` is ``'<label_type>_<i>'``,
    for example ``'TRAIN_0'``.

    Args:
        reviews: Iterable of reviews; each item is passed unchanged as the
            first argument to LabeledSentence.
        label_type: Prefix used to build each label (e.g. ``'TRAIN'``).

    Returns:
        A list of LabeledSentence objects, one per review, in input order.
    """
    # NOTE(review): newer gensim releases replace LabeledSentence with
    # TaggedDocument -- confirm against the gensim version this file targets.
    return [
        LabeledSentence(words, ['%s_%s' % (label_type, i)])
        for i, words in enumerate(reviews)
    ]
# Attach a per-document label to every review in each of the three sets.
x_train, x_test, unsup_reviews = (
    labelizeReviews(x_train, 'TRAIN'),
    labelizeReviews(x_test, 'TEST'),
    labelizeReviews(unsup_reviews, 'UNSUP'),
)
Doc2Vec算法代码
猜你喜欢
转载自blog.csdn.net/u014565726/article/details/80466557
今日推荐
周排行