Doc2Vec算法代码

# Read the unlabeled reviews file into a list of lines.
with open('IMDB_data/unsup.txt', 'r') as infile:
    unsup_reviews = infile.readlines()

# Label positive sentiment as 1 and negative sentiment as 0.
# NOTE(review): pos_reviews and neg_reviews are not defined in this excerpt;
# they are expected to be loaded before this point -- confirm.
y = np.concatenate((np.ones(len(pos_reviews)), np.zeros(len(neg_reviews))))

# Hold out 20% of the labeled reviews as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    np.concatenate((pos_reviews, neg_reviews)), y, test_size=0.2
)

# Remove symbols that carry no textual meaning (text preprocessing).

def cleanText(corpus):
    """Normalize and tokenize a collection of raw review strings.

    Each document is lower-cased, stripped of newline characters and HTML
    line-break tags, and then split on whitespace. Every punctuation mark
    in the fixed set below is padded with spaces first, so it comes out as
    a token of its own.

    Args:
        corpus: Iterable of strings, one per document.

    Returns:
        A list with one entry per document, each entry being the list of
        tokens for that document.
    """
    punctuation = """.,?!:;(){}[]"""

    cleaned = []
    for doc in corpus:
        text = doc.lower().replace('\n', '')
        # Replace line-break tags with a space (not an empty string) so the
        # words on either side of the tag are not fused into one token.
        text = text.replace('<br />', ' ').replace('<br/>', ' ')

        # Treat each punctuation mark as an individual word.
        for c in punctuation:
            text = text.replace(c, ' %s ' % c)

        cleaned.append(text.split())
    return cleaned

# Run the same cleaning/tokenizing routine over the train, test and
# unlabeled review sets, rebinding each name to its tokenized form.
x_train, x_test, unsup_reviews = (
    cleanText(split) for split in (x_train, x_test, unsup_reviews)
)
 
# Gensim's Doc2Vec implementation requires each document/paragraph to have a
# label associated with it. We do this by using the LabeledSentence method.
# The format will be "TRAIN_i" or "TEST_i" where "i" is a dummy index of the
# review.
def labelizeReviews(reviews, label_type):
    """Wrap each review in a LabeledSentence tagged '<label_type>_<index>'.

    Args:
        reviews: Iterable of reviews; each item is passed unchanged as the
            first argument of LabeledSentence.
        label_type: Prefix for the generated labels (e.g. 'TRAIN').

    Returns:
        A list of LabeledSentence objects, one per review, in input order.
    """
    # NOTE(review): LabeledSentence is deprecated in later gensim releases in
    # favour of TaggedDocument -- confirm against the installed version.
    return [
        LabeledSentence(v, ['%s_%s' % (label_type, i)])
        for i, v in enumerate(reviews)
    ]

# Attach labels to every review, using a distinct prefix per data set, and
# rebind each name to its labeled form.
x_train, x_test, unsup_reviews = (
    labelizeReviews(docs, prefix)
    for docs, prefix in (
        (x_train, 'TRAIN'),
        (x_test, 'TEST'),
        (unsup_reviews, 'UNSUP'),
    )
)

猜你喜欢

转载自blog.csdn.net/u014565726/article/details/80466557
doc