机器学习之路: python 实践 word2vec 词向量技术

git: https://github.com/linyi0604/MachineLearning

词向量技术 Word2Vec 
     每个连续的词汇片段都会对其后面的内容产生一定的制约，这种制约关系称为上下文（context）。
     
     词向量技术的目的是找到句子之间在语义层面的联系。
 1 from sklearn.datasets import fetch_20newsgroups
 2 from bs4 import BeautifulSoup
 3 import nltk, re
 4 from gensim.models import word2vec
 5 
 6 # nltk.download('punkt')
 7 
 8 
 9 '''
10 词向量技术 Word2Vec 
11     每个连续词汇片段都会对后面有一定制约 称为上下文context
12     
13     找到句子之间语义层面的联系
14     
15 '''
16 
# Download the news data over the network.
news = fetch_20newsgroups(subset="all")
x = news.data
y = news.target
20 
def news_to_sentences(news):
    """Split one news article into tokenized sentences.

    Markup is stripped from the text, the remaining text is split into
    sentences with NLTK's English sentence tokenizer, and every sentence is
    lower-cased and reduced to its alphabetic words.

    Args:
        news: Raw text of a single article.

    Returns:
        A list with one entry per detected sentence; each entry is the list
        of lower-case words in that sentence. A sentence containing no ASCII
        letters yields an empty list.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning, so results could
    # differ from one machine to another.
    news_text = BeautifulSoup(news, "html.parser").get_text()
    # sent_tokenize() defaults to English and resolves the Punkt resource
    # itself, so it does not depend on the hard-coded
    # "tokenizers/punkt/english.pickle" path.
    raw_sentences = nltk.sent_tokenize(news_text)
    # Replace every non-letter with a space, then split on whitespace.
    return [
        re.sub(r"[^a-zA-Z]", " ", sent.lower().strip()).split()
        for sent in raw_sentences
    ]
32 
# Break every article into sentences and pool them into one training corpus.
sentences = []
for article in x:
    sentences.extend(news_to_sentences(article))
38 
39 
# Dimensionality of the word vectors.
num_features = 300
# Minimum frequency a word needs in order to be considered.
min_word_count = 20
# Number of CPU cores used for parallel training.
num_workers = 2
# Size of the context window used when training the word vectors.
context = 5
# Value passed to Word2Vec's `sample` argument (down-sampling threshold
# for high-frequency words).
downsampling = 1e-3

# Train the word-vector model.
# NOTE: gensim 4 renamed the `size` argument to `vector_size`; the old
# keyword raises a TypeError there.
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    vector_size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)
# The former `model.init_sims(replace=True)` call is intentionally gone:
# gensim 4 deprecates it and no longer needs it before similarity queries.
59 
# Use the trained model to find the ten words most related to "college".
# The query goes through `model.wv`: the model-level `most_similar()`
# shortcut was removed in gensim 4, while `model.wv.most_similar()` is
# available in both older and newer releases.
print(model.wv.most_similar("college"))
# Output recorded in the original write-up:
# [('wisconsin', 0.7664438486099243),
#  ('osteopathic', 0.7474539279937744),
#  ('madison', 0.7433826923370361),
#  ('univ', 0.7296794652938843),
#  ('melbourne', 0.7212647199630737),
#  ('walla', 0.7068545818328857),
#  ('maryland', 0.7038443088531494),
#  ('carnegie', 0.7038302421569824),
#  ('institute', 0.7003713846206665),
#  ('informatics', 0.6968873143196106)]

猜你喜欢

转载自www.cnblogs.com/Lin-Yi/p/9007259.html