自然语言处理 作业A2
作业地址: link
Unigram model
1. Creating the word_to_index dictionary
把txt文本读入,转成字典,然后输出到一个txt
import codecs
# TODO: read brown_vocab_100.txt into word_index_dict
import codecs
from generate import GENERATE
# TODO: read brown_vocab_100.txt into word_index_dict
# Read the vocabulary file (one word per line) and map every word to its
# zero-based line number. The with-block guarantees the handle is closed.
with codecs.open("brown_vocab_100.txt", "r", "utf-16") as vocabs:
    word_index_dict = {line.rstrip(): index for index, line in enumerate(vocabs.readlines())}

# TODO: write word_index_dict to word_to_index_100.txt
# One "<word> <index>" pair per output line.
with open("word_to_index_100.txt", "w") as wf:
    for word, index in word_index_dict.items():
        wf.write(word + ' ' + str(index) + '\n')

# Sanity checks: indices of two known words and the vocabulary size.
print(word_index_dict['all'])
print(word_index_dict['resolution'])
print(len(word_index_dict))
2. Building an MLE unigram model
词频,并求unigram的概率
# numpy is needed for the count/probability arrays; Counter does the token counting.
import numpy as np
from collections import Counter

# Load the vocabulary: word -> index (line number in the vocab file).
with codecs.open("brown_vocab_100.txt", "r", encoding="utf-16") as vocab:
    word_index_dict = {line.rstrip(): index for index, line in enumerate(vocab.readlines())}

with codecs.open("brown_100.txt", "r", 'utf-16') as f:
    text = f.read().lower()

#TODO: iterate through file and update counts
# Count whole whitespace-separated tokens. Unlike substring counting with
# text.count(' ' + word + ' '), this cannot match "all" inside "allen", does
# not miss tokens at the start/end of a line, and does not undercount a word
# that is immediately repeated (the two matches would share one space).
token_counts = Counter(text.split())
counts = np.array([token_counts[word] for word in word_index_dict])

#TODO: normalize and writeout counts.
prob = counts / counts.sum()

# counts/prob follow the iteration order of word_index_dict, so pairing the
# keys with prob directly avoids a value -> key search for every word.
with open("unigram_probs.txt", "w") as wf:
    for word, p in zip(word_index_dict, prob):
        wf.write(word + ' ' + str(p) + '\n')
这样做其实挺危险的:如果 word_index_dict 里面的顺序乱了怎么办?作业里面还附带了一个生成模型,下面参照它来生成句子。
# Sample a sentence word by word from the unigram distribution.
# Inverse mapping (index -> word), see
# https://stackoverflow.com/questions/483666/python-reverse-invert-a-mapping
index_word_dict = dict((v, k) for k, v in word_index_dict.items())
max_words = 20
probs = prob
num_words = 0
returnSTR = ""
while True:
    # Draw the next word index according to the unigram probabilities.
    # np.random.choice returns a length-1 array, hence wordIndex[0] below.
    wordIndex = np.random.choice(len(word_index_dict), 1, p=list(probs))
    word = index_word_dict[wordIndex[0]]
    returnSTR = returnSTR + word + " "
    num_words += 1
    # Stop at the end-of-sentence marker or once the length limit is reached.
    if num_words == max_words or word == "</s>":
        break
print(returnSTR)
随便生成了两句话,牛头不对马嘴
not worth , or . the , receives its the this term for or superintendent the or as on i
he i in . end wife i it can force . details i he these i he by despite a
关于问题中的小问,小数据集和大数据集相比只出现一个的单词比例会多还是少?
# Fraction of vocabulary words whose probability equals 1/total, i.e. words seen exactly once.
(prob == 1 / counts.sum()).sum() / len(word_index_dict)  # output=0.5633
出现一次的比重未免太高了吧。因为这是一个不全的 dictionary(小数据集),所以在大的数据集中,这个比例肯定会减小。
import matplotlib.pyplot as plt
from matplotlib import rcParams
# Load the vocabulary: word -> index (line number in the vocab file).
with codecs.open("brown_vocab_100.txt", "r", encoding="utf-16") as vocab:
    word_index_dict = {line.rstrip(): index for index, line in enumerate(vocab.readlines())}

with codecs.open("brown_100.txt", "r", 'utf-16') as f:
    text = f.read().lower()

#TODO: iterate through file and update counts
# For growing prefixes of the corpus, record the share of vocabulary words
# that occur exactly once (prob_1) and not at all (prob_0).
divide = [0.25, 0.5, 0.75, 1]
prob_1 = []
prob_0 = []
for i in divide:
    # Prefix of the corpus, measured in characters.
    text2 = text[:round(len(text) * i)]
    # Words are matched with surrounding spaces so that "all" is not found
    # inside "allen". "<s>" starts a line and is therefore not preceded by a
    # space, so it is counted without the padding (same rule as the unigram
    # model above); otherwise it would always be reported as unseen.
    counts = np.array([
        text2.count(word) if word == '<s>' else text2.count(' ' + word + ' ')
        for word in word_index_dict
    ])
    #TODO: normalize and writeout counts.
    prob = counts / counts.sum()
    # Compare the integer counts directly instead of testing floats for equality.
    prob_1.append(np.sum(counts == 1) / len(word_index_dict))
    prob_0.append(np.sum(counts == 0) / len(word_index_dict))
prob_0 = np.asarray(prob_0)
prob_1 = np.asarray(prob_1)
# Plotting
# Set the default matplotlib figure size (width, height) for the figures below.
plt.rcParams['figure.figsize'] = (9.0, 10.0)
def plot_result(y, x, xlabel='Number of words in the corpus', ylabel='Prob',
                title='Probabilities of the word occurred X times'):
    """Draw one curve of y against x on the current matplotlib axes.

    Args:
        y: sequence of values for the vertical axis (converted to an array).
        x: sequence of values for the horizontal axis.
        xlabel: label of the horizontal axis.
        ylabel: label of the vertical axis.
        title: title of the current axes.
    """
    values = np.array(y)
    plt.plot(x, values)
    plt.grid(True)
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    plt.title(title)
# Horizontal axis: size of each corpus prefix used above.
# NOTE(review): len(text) is a character count, while plot_result's default
# x label says "Number of words" — confirm which unit is intended.
x = np.asarray([round(len(text)*i) for i in divide])
# Top panel: share of words seen exactly once and share of words never seen.
plt.subplot(2,1,1)
plot_result(prob_1,x)
plot_result(prob_0,x)
plt.legend(['Once','Zero'], loc='best')
# Bottom panel: element-wise sum of the two shares (both are numpy arrays).
plt.subplot(2,1,2)
plot_result(prob_1+prob_0,x,ylabel='Sum of probs',title='The probabilities of the word occurred zero and one times')
plt.tight_layout()
# Save the figure to disk before showing it.
plt.savefig('result.png')
plt.show()
Bigram models
3. Building an MLE bigram model
bigram模型
import codecs
import numpy as np
from sklearn.preprocessing import normalize
from generate import GENERATE
import random
# bigram和unigram差别很大
#load the indices dictionary
# Load the vocabulary: word -> index (line number in the vocab file).
with codecs.open("brown_vocab_100.txt", "r", encoding="utf-16") as vocab:
    Dict = {line.rstrip(): index for index, line in enumerate(vocab.readlines())}

# Derive the matrix size from the vocabulary instead of hard-coding it.
vocab_size = len(Dict)

# Dict2[i, j] is the string "word_i word_j". It is used below as a boolean
# mask to pick the probability of one specific bigram out of the matrix.
Dict2 = np.asarray(
    [first + ' ' + second for first in Dict for second in Dict]
).reshape(vocab_size, vocab_size)

#TODO: iterate through file and update counts
with codecs.open("brown_100.txt", "r", 'utf-16') as f:
    text = f.read().lower()

# count2[i, j] = number of times word_j directly follows word_i within a line.
# A single pass over the tokens replaces one text.count() scan per vocabulary
# pair, and counting tokens (rather than substrings) also gets immediately
# repeated bigrams right. Bigrams never span two lines, because a line ends
# with "</s>" and the next one starts with "<s>".
count2 = np.zeros((vocab_size, vocab_size), dtype=int)
for line in text.splitlines():
    tokens = line.split()
    for prev_word, word in zip(tokens, tokens[1:]):
        # Pairs containing an out-of-vocabulary token are ignored.
        if prev_word in Dict and word in Dict:
            count2[Dict[prev_word], Dict[word]] += 1

#TODO: normalize counts
# L1-normalise every row so that row i holds p(. | word_i).
probs = normalize(count2, norm='l1', axis=1)
# p(the | all)
print(probs[Dict2 == 'all the'])
# p(jury | the)
print(probs[Dict2 == 'the jury'])
# p(campaign | the)
print(probs[Dict2 == 'the campaign'])
# p(calls | anonymous)
print(probs[Dict2 == 'anonymous calls'])
[1.]
[0.08333333]
[0.00641026]
[0.33333333]
生成模型:
# def GENERATE(word_index_dict, probs, model_type, max_words, start_word):
# Generate one sentence from the bigram model, starting at the sentence-start marker.
start_word = "<s>"
max_words = 20
# Inverse mapping (index -> word) of the bigram vocabulary.
index_word_dict = {v: k for k, v in Dict.items()}
num_words = 0
returnSTR = start_word + " "
prevWord = start_word
while True:
    # Sample the next word from the row of the bigram matrix conditioned on
    # the previous word. Only Dict (the vocabulary loaded for the bigram
    # model) is used, so this section does not depend on the unigram one.
    wordIndex = np.random.choice(len(Dict), 1, p=list(probs[Dict[prevWord]]))
    word = index_word_dict[wordIndex[0]]
    returnSTR += word + " "
    prevWord = word
    num_words += 1
    # Stop at the end-of-sentence marker or once the length limit is reached.
    if word == "</s>" or num_words == max_words:
        break
print(returnSTR)
虽然还是不明所以(第一句还不错),但是比 unigram 好太多了吧
<s> it was the county democratic executive committee . </s>
<s> the size of sunday night in a proportionate distribution of this problem . </s>
4. Add-α smoothing the bigram model
加一(拉普拉斯平滑)和加0.1平滑
# Laplace (add-one) smoothing: add 1 to every bigram count, then
# renormalise each row into a probability distribution.
count2_laplace = 1 + count2
probs_laplace = normalize(count2_laplace, axis=1, norm='l1')
# Printed in order: p(the | all), p(jury | the), p(campaign | the), p(calls | anonymous)
for bigram in ('all the', 'the jury', 'the campaign', 'anonymous calls'):
    print(probs_laplace[Dict2 == bigram])
[0.002457]
[0.01444788]
[0.00206398]
[0.00245098]
# add-α smoothing with α = 0.1: add 0.1 to every bigram count, then
# renormalise each row into a probability distribution.
count2_alpha = 0.1 + count2
probs_alpha = normalize(count2_alpha, axis=1, norm='l1')
# Printed in order: p(the | all), p(jury | the), p(campaign | the), p(calls | anonymous)
for bigram in ('all the', 'the jury', 'the campaign', 'anonymous calls'):
    print(probs_alpha[Dict2 == bigram])
[0.01336574]
[0.05520438]
[0.00463548]
[0.01304864]
| | Original (MLE) | Laplace smoothing (add-one) | Add-α smoothing (α = 0.1) |
|---|---|---|---|
| p(the \| all) | 1. | 0.002457 | 0.01336574 |
| p(jury \| the) | 0.08333333 | 0.01444788 | 0.05520438 |
| p(campaign \| the) | 0.00641026 | 0.00206398 | 0.00463548 |
| p(calls \| anonymous) | 0.33333333 | 0.00245098 | 0.01304864 |
问:为什么平滑模型中所有四个概率都下降了?现在请注意,概率并没有全部减少相同的数量。特别是,以’the’为条件的两个概率仅略微下降,而另外两个概率(以’all’和’anonymous’为条件)相当显着地下降。问:为什么add-α平滑导致以’the’为条件的概率比其他的更低?为什么这种行为(导致’the’的概率低于其他因素)是一件好事?在弄清楚这一点时,您可能会发现查看计数矩阵的相关各行(在添加0.1之前)以查看它们的不同之处是有用的。在numpy中,你可以看看第n行counts 矩阵使用counts[n,]。
A: 以 the 为前一个词的计数明显比较多,此时增加 α 的影响就小;但是像以 anonymous 为前一个词的,全语料库就 3 个,所以影响当然大了。
Using n-gram models
5. Experimenting with a MLE trigram model
获得单独一个 trigram 的概率
def triFinder(_input, Dict):
    """Convert a trigram between its string form and its vocabulary indices.

    Args:
        _input: either a string of three words separated by single spaces,
            or a sequence holding three vocabulary indices.
        Dict: mapping from word to vocabulary index.

    Returns:
        For a string input, a numpy array with the three indices (a word that
        is missing from Dict yields None, as returned by Dict.get).
        For an index input, the three words joined by single spaces.
    """
    if isinstance(_input, str):
        # String -> indices
        a, b, c = _input.split(' ')
        return np.array([Dict.get(a), Dict.get(b), Dict.get(c)])
    # Indices -> string. Invert the mapping explicitly so the lookup uses the
    # stored index values instead of relying on the dict's iteration order.
    index_to_word = {index: word for word, index in Dict.items()}
    return ' '.join(index_to_word[i] for i in (_input[0], _input[1], _input[2]))
def Prob(_input, Dict, count2, alpha=0):
    """Estimate the trigram probability P(w3 | w1, w2), optionally add-alpha smoothed.

    The trigram count is taken from the module-level corpus string ``text``;
    the count of the conditioning bigram is read from ``count2``.

    Args:
        _input: the three vocabulary indices (w1, w2, w3).
        Dict: mapping from word to vocabulary index.
        count2: bigram count matrix, indexed as count2[w1, w2].
        alpha: smoothing constant added to the trigram count; alpha * len(Dict)
            is added to the denominator.

    Returns:
        The estimated probability, or 0 when the denominator is 0 (the bigram
        was never seen and no smoothing is applied).
    """
    # _input should be (number, number, number)
    _sum = np.sum(count2[_input[0], _input[1]] + alpha * len(Dict))
    if _sum == 0:
        # Unseen context without smoothing: avoid dividing by zero.
        return 0
    word_pair = triFinder(_input, Dict)
    if '<s>' in word_pair:
        # "<s>" starts a line, so there is no leading space to match.
        count = text.count(word_pair + ' ') + alpha
    else:
        # Pad with spaces so that only whole words are matched.
        count = text.count(' ' + word_pair + ' ') + alpha
    return count / _sum
# Look up the vocabulary indices of the trigram "in the past".
index_of_3wordpair = triFinder("in the past", Dict)
# Evaluate its probability with the default alpha=0 (no smoothing); the result
# is not stored, so it is only visible when run interactively.
Prob(index_of_3wordpair,Dict,count2)
6. Calculating sentence probabilities
计算unigram、bigram和bigram-alpha smoothing的句子概率以及困惑度
# Unigram probabilities from the most recent word counts.
prob = counts/counts.sum()
perplexity = []        # unigram model
perplexity2 = []       # bigram model, MLE (no smoothing)
perplexity2alpha = []  # bigram model, add-alpha smoothing (alpha = 0.1)


def _bigram_sentprob(words, bigram_probs):
    """Return the product of bigram_probs[prev, cur] over the words of one sentence.

    "<s>" is only used as context for the following word; it contributes no
    factor of its own. A word missing from Dict raises KeyError.
    """
    sentprob = 1
    previous_word = None
    for i in words:
        if i == "<s>":
            previous_word = i
            continue
        sentprob *= bigram_probs[Dict[previous_word], Dict[i]]
        previous_word = i
    return sentprob


with codecs.open("toy_corpus.txt", "r", 'utf-16') as f:
    for j in f.readlines():
        words = j.lower().split()
        sent_len = len(words)
        if sent_len == 0:
            # Blank line: nothing to score, and 1.0/sent_len would fail.
            continue

        # unigram
        # Dict[i] (instead of Dict.get) fails loudly on an unknown word;
        # indexing prob with None would silently add an axis instead.
        sentprob = 1
        for i in words:
            sentprob *= prob[Dict[i]]
        perplexity.append(1/(pow(sentprob, 1.0/sent_len)))
        #print('Unigram:')
        #print('prob of sentence:',sentprob,'perplexity:',perplexity)

        # bigram & alpha=0.1
        sentprob = _bigram_sentprob(words, probs_alpha)
        perplexity2alpha.append(1/(pow(sentprob, 1.0/(sent_len))))
        #print('Bigram & alpha=0.1:')
        #print('prob of sentence:',sentprob,'perplexity:',perplexity)

        # bigram, unsmoothed MLE
        sentprob = _bigram_sentprob(words, probs)
        perplexity2.append(1/(pow(sentprob, 1.0/(sent_len))))
        sentprob = 1
困惑度分别为
[281.3714617705383, 153.03157461392973]
[4.315371247379676, 7.237099402748856]
[46.10791988057384, 49.63008570872447]
问:哪种模型表现最差?为什么您可能会预料到该模型表现最差?问:在此语料库上进行评估时,平滑是有助于还是损害了模型的“性能”?为什么会这样?
Compare the perplexities of these two sentences under all three models. Q: Which model performed worst and why might you have expected that model to have performed worst? Q: Did smoothing help or hurt the model’s ‘performance’ when evaluated on this corpus? Why might that be?
看上去好像加入平滑后,困惑度变高了,但是这样会损害性能吗,emmm,不清楚,但是平滑实质为高概率的单词将部分概率分给了计数零的字符对,因此显然按困惑度的概念是必然会导致模型变差的。但是再思考一下,类比于多元统计中的正则,一定程度上加入bias反而能得到更好的结果。
7. Generation
之前都做过了,来试试trigram吧
对之前的模型修改了一下,现在能表示 P(*|word1,word2) 了,便于抽样
def triFinder(_input, Dict):
    """Convert a trigram between its string form and its vocabulary indices.

    Args:
        _input: either a string of three words separated by single spaces,
            or a sequence holding three vocabulary indices.
        Dict: mapping from word to vocabulary index.

    Returns:
        For a string input, a numpy array with the three indices (a word that
        is missing from Dict yields None, as returned by Dict.get).
        For an index input, the three words joined by single spaces.
    """
    if isinstance(_input, str):
        # String -> indices
        a, b, c = _input.split(' ')
        return np.array([Dict.get(a), Dict.get(b), Dict.get(c)])
    # Indices -> string. Invert the mapping explicitly so the lookup uses the
    # stored index values instead of relying on the dict's iteration order.
    index_to_word = {index: word for word, index in Dict.items()}
    return ' '.join(index_to_word[i] for i in (_input[0], _input[1], _input[2]))
def Prob(_input, Dict, count2, alpha=0, generation=False):
    """Estimate trigram probabilities, optionally add-alpha smoothed.

    The trigram count is taken from the module-level corpus string ``text``;
    the count of the conditioning bigram is read from ``count2``.

    Args:
        _input: with generation=False, the three vocabulary indices
            (w1, w2, w3); with generation=True, the string "word1 word2".
        Dict: mapping from word to vocabulary index.
        count2: bigram count matrix, indexed as count2[w1, w2].
        alpha: smoothing constant added to the trigram count; alpha * len(Dict)
            is added to the denominator.
        generation: when True, return the whole list P(* | word1, word2),
            one entry per word of Dict in iteration order.

    Returns:
        A single probability (0 when the denominator is 0), or, in generation
        mode, a list of such probabilities.
    """
    if generation:
        # Build the list P(* | word1, word2); forward alpha so that smoothing
        # also applies when the function is used for sampling.
        return [
            Prob(triFinder(_input + " " + i, Dict), Dict, count2, alpha)
            for i in Dict
        ]
    # _input should be (number, number, number)
    _sum = np.sum(count2[_input[0], _input[1]] + alpha * len(Dict))
    if _sum == 0:
        # Unseen context without smoothing: avoid dividing by zero.
        return 0
    word_pair = triFinder(_input, Dict)
    if '<s>' in word_pair:
        # "<s>" starts a line, so there is no leading space to match.
        count = text.count(word_pair + ' ') + alpha
    else:
        # Pad with spaces so that only whole words are matched.
        count = text.count(' ' + word_pair + ' ') + alpha
    return count / _sum
生成模型也要做一定修改
# def GENERATE(word_index_dict, probs, model_type, max_words, start_word):
# Generate one sentence from the trigram model.
start_word = "<s>"
max_words = 25
# Inverse mapping (index -> word) of the vocabulary.
index_word_dict = {v: k for k, v in Dict.items()}
prev2Word = start_word
# The first real word has only "<s>" as context, so it is drawn from the bigram model.
prevWord = index_word_dict[np.random.choice(len(index_word_dict), 1, p=list(probs[Dict[start_word]]))[0]]
# Include that first word in the output; otherwise every generated sentence
# would be missing the word right after "<s>".
returnSTR = start_word + " " + prevWord + " "
num_words = 1
# Stop at the end-of-sentence marker or once the length limit is reached.
while prevWord != "</s>" and num_words < max_words:
    # Sample the next word from P(* | prev2Word, prevWord).
    wordIndex = np.random.choice(len(Dict), 1, p=Prob(prev2Word + " " + prevWord, Dict, count2, generation=True))
    word = index_word_dict[wordIndex[0]]
    returnSTR += word + " "
    prev2Word = prevWord
    prevWord = word
    num_words += 1
print(returnSTR)
最后的结果,虽然还是很不理想,比如标点符号等没有处理,但是已经很酷炫了:
<s> bond issue approved earlier in the past . </s>
<s> petition said that the city `` take steps to remedy '' this problem . </s>
<s> felix tabb said the ordinary apparently made good his promise . </s>
<s> pointed out that georgia voters last november rejected a constitutional amendment to allow legislators to vote on pay raises . </s>
<s> listed his wife's age as 71 . </s>