维基百科语料库:
1.http://ftp.acc.umu.se/mirror/wikimedia.org/dumps/enwiki/20180320/
2.https://meta.wikimedia.org/wiki/Mirroring_Wikimedia_project_XML_dumps
3.https://dumps.wikimedia.org/
生成词向量教程:
1.https://www.jianshu.com/p/05800a28c5e4
2.http://www.52nlp.cn/%E4%B8%AD%E8%8B%B1%E6%96%87%E7%BB%B4%E5%9F%BA%E7%99%BE%E7%A7%91%E8%AF%AD%E6%96%99%E4%B8%8A%E7%9A%84word2vec%E5%AE%9E%E9%AA%8C
代码有bug,修改后的代码:
# coding=utf-8
"""Build word2vec embeddings from a Wikipedia XML dump.

Pipeline: extract article text from the dump (process_wiki), strip
everything except Chinese characters/whitespace (remove_words),
segment with jieba (separate_words), then train and save a Word2Vec
model (train_w2v_model).
"""
import logging
import multiprocessing
import os
import re
import sys

import gensim
import jieba  # was missing: separate_words() calls jieba.cut()
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence


def _get_logger():
    """Configure root logging once and return a logger named after the script.

    Hoisted out of the four pipeline functions, which each duplicated
    this setup verbatim.
    """
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    return logging.getLogger(os.path.basename(sys.argv[0]))


def process_wiki(inp, outp):
    """Extract plain article text from a Wikipedia dump into a text file.

    inp:  path to a pages-articles XML dump (.xml.bz2).
    outp: output text file, one space-joined article per line.
    """
    logger = _get_logger()
    # dictionary={} skips the expensive vocabulary build;
    # lemmatize=False keeps the raw tokens (gensim < 4.0 API).
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    i = 0
    # 'with' guarantees the output file is flushed and closed even on error
    # (the original never closed it on the exception path).
    with open(outp, 'w', encoding='utf-8') as output:
        for text in wiki.get_texts():
            # get_texts() yields a list of tokens; join them into one line.
            output.write(' '.join('%s' % token for token in text) + '\n')
            i = i + 1
            if i % 10000 == 0:
                logger.info('Saved ' + str(i) + ' articles')
    logger.info('Finished ' + str(i) + ' articles')


def remove_words(inp, outp):
    """Filter each line of `inp`, keeping only the characters the regex
    matches, and write the result to `outp`.

    NOTE(review): the literal '*' inside the character class matches an
    asterisk character — presumably only CJK chars (\u4e00-\u9fa5) and
    whitespace were intended; confirm before changing the pattern.
    """
    logger = _get_logger()
    # Compile once outside the loop; raw string avoids escape surprises.
    pattern = re.compile(r'[\n\s*\r\u4e00-\u9fa5]')
    # Both files are now closed deterministically (the original leaked both).
    with open(inp, 'r', encoding='utf-8') as fin, \
            open(outp, 'w', encoding='utf-8') as output:
        for line in fin:
            output.write(''.join(pattern.findall(line)))
    logger.info("Finished removed words!")


def separate_words(inp, outp):
    """Segment each line of `inp` with jieba and write the tokens,
    space-separated, one line per input line, to `outp`."""
    logger = _get_logger()
    with open(inp, 'r', encoding='utf-8') as fin, \
            open(outp, 'w', encoding='utf-8') as output:
        for line in fin:
            seg_list = jieba.cut(line.strip())
            output.write(' '.join(seg_list) + '\n')
    logger.info("finished separate words!")


def train_w2v_model(inp, outp1, outp2):
    """Train a Word2Vec model on a pre-tokenized corpus.

    inp:   text corpus, one sentence of space-separated tokens per line
           (the LineSentence format).
    outp1: path for the gensim-native model file.
    outp2: path for the original C word2vec text-format vectors.
    """
    logger = _get_logger()
    logger.info("running %s" % ' '.join(sys.argv))
    # size: embedding dimension;
    # window: context window — 5 words before and 5 after the target;
    # min_count: drop words seen fewer than 2 times;
    # workers: one training worker per CPU core.
    model = Word2Vec(LineSentence(inp), size=100, window=5, min_count=2,
                     workers=multiprocessing.cpu_count())
    model.save(outp1)
    # Export in the plain-text format of the original C word2vec tool.
    model.wv.save_word2vec_format(outp2, binary=False)


def main():
    """Run the first pipeline stage; later stages are left commented out
    so each step can be enabled one at a time."""
    process_wiki('enwiki-latest-pages-articles.xml.bz2', 'wiki.en.text')
    # process_wiki('enwiki-20180320-pages-articles14.xml-p7697599p7744799.bz2', 'wiki.en.text')
    # remove_words('./data/wiki_cn_jian.txt', './data/wiki_cn_jian_removed.txt')
    # separate_words('./data/wiki_cn_jian_removed.txt', './data/wiki_cn_jian_sep_removed.txt')
    # train_w2v_model('./data/wiki_cn_jian_sep_removed.txt', './bin/300/w2v_model.bin', './bin/300/w2v_vector.bin')
    # train_w2v_model('./data/wiki.en.text', './bin/w2v_model_100.bin', './bin/w2v_vector_100.bin')


if __name__ == '__main__':
    main()
    # model = gensim.models.Word2Vec.load('./bin/300/w2v_model.bin')
    # print(model.most_similar([u'李连杰', u'基金'], [u'成龙']))