Training Chinese Word Vectors on Wikipedia

1. Convert the .xml dump into a .txt file, with one article per line

from gensim.corpora import WikiCorpus
input_file_name = '/Users/admin/A_NLP/data/zhwiki-20190720-pages-articles-multistream.xml.bz2'
output_file_name = 'wiki.cn.txt'
input_file = WikiCorpus(input_file_name, lemmatize=False, dictionary={})
output_file = open(output_file_name, 'w', encoding="utf-8")
count = 0
for text in input_file.get_texts():
    output_file.write(' '.join(text) + '\n')
    count = count + 1
    if count % 10000 == 0:
        print('Processed %d articles so far' % count)
input_file.close()
output_file.close()
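
Note: in gensim 4.x the lemmatize argument was removed from WikiCorpus, so the call above fails on a newer install. A minimal sketch of the same extraction for the newer API, keeping the file names above:

from gensim.corpora import WikiCorpus

# gensim >= 4.0: WikiCorpus no longer accepts lemmatize=
wiki = WikiCorpus('/Users/admin/A_NLP/data/zhwiki-20190720-pages-articles-multistream.xml.bz2',
                  dictionary={})
with open('wiki.cn.txt', 'w', encoding='utf-8') as out:
    for i, tokens in enumerate(wiki.get_texts(), 1):
        out.write(' '.join(tokens) + '\n')
        if i % 10000 == 0:
            print('Processed %d articles so far' % i)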

2. Process the corpus line by line: convert Traditional Chinese to Simplified, keep only Chinese characters, and segment into words

import re
import opencc
import jieba
cc = opencc.OpenCC('t2s')
fr = open('wiki.cn.txt', 'r', encoding='utf-8')
fw = open('wiki.sen.txt', 'a+', encoding='utf-8')
for line in fr:
    simple_format = cc.convert(line)  # convert Traditional Chinese to Simplified
    zh_list = re.findall(u"[\u4e00-\u9fa5]+", simple_format)  # keep only runs of Chinese characters
    sentence = []
    for short_sentence in zh_list:
        sentence += list(jieba.cut(short_sentence))
    fw.write(' '.join(sentence) + '\n')
fr.close()
fw.close()
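
To see what the conversion and segmentation actually do, here is a quick standalone check (the sample sentence is only an illustration):

import opencc
import jieba

cc = opencc.OpenCC('t2s')
traditional = '數學是研究數量、結構以及空間等概念的一門學科'
simplified = cc.convert(traditional)     # -> 数学是研究数量、结构以及空间等概念的一门学科
print(' '.join(jieba.cut(simplified)))   # e.g. 数学 是 研究 数量 、 结构 以及 空间 等 概念 的 一门 学科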

3. Print the first line of the processed file to check the result

ft = open('wiki.sen.txt','r')
for line in ft:
    print(line)
    break 
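
The next step reads from wiki.sen1k.txt rather than the full wiki.sen.txt; as explained at the end of the post, training on the whole corpus makes the later visualization unreadable, so a smaller file holding only the first lines is used. A minimal sketch of producing such a subset (the 1000-line cut-off is an assumption based on the file name):

# write only the first 1000 segmented lines to a smaller corpus file
with open('wiki.sen.txt', encoding='utf-8') as src, \
     open('wiki.sen1k.txt', 'w', encoding='utf-8') as dst:
    for i, line in enumerate(src):
        if i >= 1000:
            break
        dst.write(line)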

4. Train the word vectors and save the model

import multiprocessing
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

input_file_name = 'wiki.sen1k.txt'
model_file_name = 'wiki_min_count500.model'

model = Word2Vec(LineSentence(input_file_name),
                 size=100,  # word vector dimensionality
                 window=5,
                 min_count=500,
                 workers=multiprocessing.cpu_count())

model.save(model_file_name)
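
If a newer gensim (4.x) is installed, the size parameter has been renamed to vector_size; a minimal equivalent sketch, with the same imports and file names as above:

# gensim >= 4.0 renamed size -> vector_size
model = Word2Vec(LineSentence(input_file_name),
                 vector_size=100,
                 window=5,
                 min_count=500,
                 workers=multiprocessing.cpu_count())
model.save(model_file_name)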

5. Test a few words by finding the 10 most similar words for each

from gensim.models import Word2Vec

wiki_model = Word2Vec.load('wiki_min_count500.model')

test = ['文学', '下雨', '汽车', '妖怪', '几何', '故宫']  # literature, rain, car, monster, geometry, the Forbidden City
for word in test:
    res = wiki_model.most_similar(word)
    print(word)
    print(res)
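
most_similar returns a list of (word, cosine similarity) pairs, and topn controls how many neighbours come back (10 by default). Calling it through wv avoids the deprecation warning on the model-level call above; a small sketch using the same model:

# equivalent, non-deprecated form of the lookup above
for word, score in wiki_model.wv.most_similar('文学', topn=10):
    print('%s\t%.4f' % (word, score))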

6. Look up a word's vector

wiki_model.wv['文学']
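
The result is a plain 100-dimensional numpy array (matching size=100 used in training). A quick check of its shape, plus the cosine similarity between two words; the second word here is only an example and is assumed to have survived the min_count filter:

import numpy as np

vec = wiki_model.wv['文学']
print(vec.shape)                                # (100,)
print(np.linalg.norm(vec))                      # raw vector length
print(wiki_model.wv.similarity('文学', '历史'))  # cosine similarity, assuming '历史' is in the vocabulary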

7. Visualize the vectors with the t-SNE tool, which reduces the word vectors to two dimensions. The idea of the reduction is that vectors that were originally close to each other stay as close as possible afterwards, while vectors that were originally far apart stay as far apart as possible.

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline
def tsne_plot(model):
    "Creates and TSNE model and plots it"
    labels = []
    tokens = []

    for word in model.wv.vocab:
        tokens.append(model.wv[word])
        labels.append(word)
    
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])
        
    plt.figure(figsize=(16, 16)) 
    for i in range(len(x)):
        plt.scatter(x[i],y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()
wiki_model = Word2Vec.load('wiki_min_count500.model')
tsne_plot(wiki_model)

The first figure was produced with the model trained on the full corpus. It contains so many words that the plot turns into one big blob of ink; and because the simhei.ttf font file had not been replaced, the Chinese labels that should appear are not displayed.
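
The simhei.ttf remark is about matplotlib's default font lacking CJK glyphs, so Chinese labels render as empty boxes. Instead of swapping the bundled font file, one common alternative (a sketch, assuming the SimHei font is installed on the system) is to point matplotlib at a CJK font directly:

import matplotlib.pyplot as plt

# use a font that contains CJK glyphs so the Chinese labels show up
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False   # keep the minus sign rendering correctly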

The second figure was trained on only the first 10,000 lines of data as the corpus, with the simhei.ttf file replaced in the appropriate location; the effect is as follows:

To make it easier to read, the third figure was trained on only the first 1,000 lines of data; the effect is as follows:
