自然语言处理作业A2

Unigram model

1. Creating the word_to_index dictionary
2. Building an MLE unigram model

Bigram models

3. Building an MLE bigram model

Using n-gram models

5. Experimenting with a MLE trigram model
6. Calculating sentence probabilities
7. Generation

作业地址： link

Unigram model

1. Creating the word_to_index dictionary

把txt文本读入，转成字典，然后输出到一个txt

import codecs

# TODO: read brown_vocab_100.txt into word_index_dict
import codecs
from generate import GENERATE
# TODO: read brown_vocab_100.txt into word_index_dict
# Read the vocabulary (one word per line) and map each word to its line index.
# The context manager closes the file as soon as the dictionary is built.
with codecs.open("brown_vocab_100.txt", "r", "utf-16") as vocabs:
    word_index_dict = {line.rstrip(): index for index, line in enumerate(vocabs.readlines())}

# Write the mapping to word_to_index_100.txt, one "<word> <index>" pair per line.
with open("word_to_index_100.txt", "w") as wf:
    for word, index in word_index_dict.items():
        wf.write(word + ' ' + str(index) + '\n')


# Sanity checks: two individual look-ups and the vocabulary size.
print(word_index_dict['all'])
print(word_index_dict['resolution'])
print(len(word_index_dict))

2. Building an MLE unigram model

词频，并求unigram的概率

import numpy as np  # numpy is only imported further down (bigram section), so bring it in here

# Rebuild the word -> index dictionary; the context manager closes the file.
with codecs.open("brown_vocab_100.txt", "r", encoding="utf-16") as vocab:
    word_index_dict = {line.rstrip(): index for index, line in enumerate(vocab.readlines())}

with codecs.open("brown_100.txt", "r", 'utf-16') as f:
    text = f.read().lower()

# Count each vocabulary word in the corpus text.  Words are matched with a
# space on both sides so that e.g. "all" is not found inside "allen"; "<s>" is
# matched bare (it is the only entry the original special-cases this way).
# NOTE(review): str.count is non-overlapping, so two adjacent occurrences of
# the same word (" the the ") share a space and are counted once.
counts = np.array([
    text.count(word) if word == '<s>' else text.count(' ' + word + ' ')
    for word in word_index_dict
])

# Normalize the counts into MLE unigram probabilities.
prob = counts / counts.sum()

# counts/prob were built by iterating word_index_dict, so position k belongs to
# the k-th key.  Pair them directly instead of searching the dict values for
# each index (which was O(V) per word and relied on values matching positions).
with open("unigram_probs.txt", "w") as wf:
    for word, word_prob in zip(word_index_dict, prob):
        wf.write(word + ' ' + str(word_prob) + '\n')

这样做其实挺危险的：如果 `enumerate(word_index_dict)` 的遍历顺序与字典里存的索引不一致（乱序），概率就会写到错误的单词上。作业里面还附带了一个生成模型：

# Sample words from the unigram distribution until "</s>" is drawn or
# max_words words have been produced.
# Dict inversion idea: https://stackoverflow.com/questions/483666/python-reverse-invert-a-mapping
index_word_dict = dict(zip(word_index_dict.values(), word_index_dict.keys()))
max_words = 20
probs = prob
generated = []
for _ in range(max_words):
    # np.random.choice returns a length-1 array here, hence wordIndex[0] below.
    wordIndex = np.random.choice(len(word_index_dict), 1, p=list(probs))
    word = index_word_dict[wordIndex[0]]
    generated.append(word)
    if word == "</s>":
        break
num_words = len(generated)
# Every word is followed by a single space, including the last one.
returnSTR = "".join(token + " " for token in generated)
print(returnSTR)

随便生成了两句话，牛头不对马嘴

not worth , or . the , receives its the this term for or superintendent the or as on i

he i in . end wife i it can force . details i he these i he by despite a

关于问题中的小问：与大数据集相比，小数据集中只出现一次的单词所占比例会更多还是更少？

# Fraction of vocabulary entries whose probability equals that of a single occurrence.
(prob == 1 / counts.sum()).sum() / len(word_index_dict)  # output=0.5633

出现一次的比重未免太高了吧。因为这是一个不全的 dictionary，所以最后 $p_{zero}=0$；在大的数据集中，$p_{once}$ 肯定也会减小。

import matplotlib.pyplot as plt
from matplotlib import rcParams
# Rebuild the word -> index dictionary; the context manager closes the file.
with codecs.open("brown_vocab_100.txt", "r", encoding="utf-16") as vocab:
    word_index_dict = {line.rstrip(): index for index, line in enumerate(vocab.readlines())}

with codecs.open("brown_100.txt", "r", 'utf-16') as f:
    text = f.read().lower()

# For growing prefixes of the corpus, record the fraction of vocabulary words
# that occur exactly once (prob_1) and not at all (prob_0).
divide = [0.25, 0.5, 0.75, 1]
prob_1 = []
prob_0 = []
for i in divide:
    text2 = text[:round(len(text)*i)]
    # Same counting rule as the unigram model: words are matched with a space
    # on both sides, "<s>" is matched bare.  Without that special case "<s>"
    # would always be reported as a zero-count word.
    counts = np.array([
        text2.count(word) if word == '<s>' else text2.count(' ' + word + ' ')
        for word in word_index_dict
    ])
    prob = counts / counts.sum()
    # Compare the integer counts directly rather than testing float
    # probabilities for equality with 1/total and 0/total.
    prob_1.append(np.sum(counts == 1) / len(word_index_dict))
    prob_0.append(np.sum(counts == 0) / len(word_index_dict))
prob_0 = np.asarray(prob_0)
prob_1 = np.asarray(prob_1)

# Plotting
plt.rcParams['figure.figsize'] = (9.0, 10.0)


def plot_result(y, x, xlabel='Number of words in the corpus', ylabel='Prob', title='Probabilities of the word occurred X times'):
    """Draw y against x on the current axes with a title, axis labels and a grid."""
    series = np.array(y)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.plot(x, series)


# x positions: length of each corpus prefix used in the experiment.
x = np.asarray([round(len(text) * fraction) for fraction in divide])

# Top panel: once-only and never-seen fractions on the same axes.
plt.subplot(2, 1, 1)
for series in (prob_1, prob_0):
    plot_result(series, x)
plt.legend(['Once', 'Zero'], loc='best')

# Bottom panel: the sum of the two fractions.
plt.subplot(2, 1, 2)
plot_result(
    prob_1 + prob_0,
    x,
    ylabel='Sum of probs',
    title='The probabilities of the word occurred zero and one times',
)

plt.tight_layout()
plt.savefig('result.png')
plt.show()

（图：result.png——出现一次与零次的单词所占比例随语料规模的变化）

Bigram models

3. Building an MLE bigram model

bigram模型

import codecs
import numpy as np
from sklearn.preprocessing import normalize
from generate import GENERATE
import random

# The bigram model differs a lot from the unigram one: it needs a count for
# every ordered pair of vocabulary words.
# Load the word -> index dictionary.
with codecs.open("brown_vocab_100.txt", "r", encoding="utf-16") as vocab:
    Dict = {line.rstrip(): index for index, line in enumerate(vocab.readlines())}

# Dict2[i, j] is the string "<word i> <word j>".  The matrix is sized from the
# vocabulary itself instead of a hard-coded 813.
vocab_size = len(Dict)
Dict2 = np.asarray(
    [first + ' ' + second for first in Dict for second in Dict]
).reshape(vocab_size, vocab_size)


# Read the lower-cased corpus text.
with codecs.open("brown_100.txt", "r", 'utf-16') as f:
    text = f.read().lower()


# Count every candidate bigram in the text.  A trailing space is always added
# so that "all the" does not match inside "all there".  Pairs containing "<s>"
# are matched without a leading space; all others need a space on both sides.
# Sentence boundaries look like "</s> \r\n<s>", so pairs spanning two
# sentences are not matched.
count2 = []
for bigram in Dict2.ravel():
    if '<s>' in bigram:
        count2.append(text.count(bigram + ' '))
    else:
        count2.append(text.count(' ' + bigram + ' '))
count2 = np.asarray(count2).reshape(vocab_size, vocab_size)

# Normalize each row so that probs[i, j] = P(word j | word i).
probs = normalize(count2, norm='l1', axis=1)
# p(the | all)
print(probs[Dict2 == 'all the'])
# p(jury | the)
print(probs[Dict2 == 'the jury'])
# p(campaign | the)
print(probs[Dict2 == 'the campaign'])
# p(calls | anonymous)
print(probs[Dict2 == 'anonymous calls'])

[1.]
[0.08333333]
[0.00641026]
[0.33333333]

生成模型：

扫描二维码关注公众号，回复： 4741409 查看本文章

# Inline version of GENERATE(word_index_dict, probs, model_type, max_words, start_word)
# for the bigram model: starting from "<s>", repeatedly sample the next word
# from the row of `probs` that belongs to the previous word.
start_word = "<s>"
max_words = 20
# Use Dict for every look-up so this snippet only depends on the bigram
# section's own dictionary (it previously mixed in word_index_dict).
index_word_dict = {v: k for k, v in Dict.items()}
num_words = 0

returnSTR = start_word + " "
prevWord = start_word
while True:
    # np.random.choice returns a length-1 array, hence wordIndex[0] below.
    wordIndex = np.random.choice(len(Dict), 1, p=list(probs[Dict[prevWord]]))
    word = index_word_dict[wordIndex[0]]
    returnSTR += word + " "
    prevWord = word
    num_words += 1
    # Stop at the end-of-sentence marker or once max_words words were sampled.
    if word == "</s>" or num_words == max_words:
        break
print(returnSTR)

虽然还是不明所以（第一句还不错），但是比 unigram 好太多了吧

<s> it was the county democratic executive committee . </s> 
<s> the size of sunday night in a proportionate distribution of this problem . </s>

4. Add-α smoothing the bigram model
加一（拉普拉斯平滑）和加0.1平滑

# Laplace (add-one) smoothing: add 1 to every bigram count, then renormalize each row.
count2_laplace = count2 + 1
probs_laplace = normalize(count2_laplace, norm='l1', axis=1)

# Print, in order: p(the | all), p(jury | the), p(campaign | the), p(calls | anonymous)
for bigram in ('all the', 'the jury', 'the campaign', 'anonymous calls'):
    print(probs_laplace[Dict2 == bigram])

[0.002457]
[0.01444788]
[0.00206398]
[0.00245098]

# Add-alpha smoothing with alpha = 0.1: add 0.1 to every bigram count, then renormalize each row.
count2_alpha = count2 + 0.1
probs_alpha = normalize(count2_alpha, norm='l1', axis=1)

# Print, in order: p(the | all), p(jury | the), p(campaign | the), p(calls | anonymous)
for bigram in ('all the', 'the jury', 'the campaign', 'anonymous calls'):
    print(probs_alpha[Dict2 == bigram])

[0.01336574]
[0.05520438]
[0.00463548]
[0.01304864]

	Original	Laplace smoothing(add one)	$\alpha$ smoothing(add point one)
$p(the\vert all)$	1.	0.002457	0.01336574
$p(jury \vert the)$	0.08333333	0.01444788	0.05520438
$p(campaign \vert the)$	0.00641026	0.00206398	0.00463548
$p(calls\vert anonymous)$	0.33333333	0.00245098	0.01304864

问：为什么平滑模型中所有四个概率都下降了？现在请注意，概率并没有全部减少相同的数量。特别是，以 'the' 为条件的两个概率仅略微下降，而另外两个概率（以 'all' 和 'anonymous' 为条件）下降得相当显著。问：为什么 add-α 平滑使以 'the' 为条件的概率下降得比其他的少得多？为什么这种行为（使以 'the' 为条件的概率下降得比其他的少）是一件好事？在弄清楚这一点时，查看计数矩阵的相关各行（在添加 0.1 之前）并比较它们的不同之处可能会有帮助。在 numpy 中，可以用 counts[n,] 查看 counts 矩阵的第 n 行。

A: 以 the 为前一个词的 bigram 数量明显比较多，此时加上 $\alpha$ 的影响就小；而以 anonymous 为前一个词的 bigram 在全语料库里只有 3 个，所以影响当然大。

Using n-gram models

5. Experimenting with a MLE trigram model

获得单独的 $P(w_{2}|w_{1},w_{0})$

def triFinder(_input, Dict):
    """Convert a trigram between its string form and its vocabulary indices.

    Args:
        _input: either a string of three words separated by single spaces,
            or a sequence of three integer positions into ``Dict``'s key order.
        Dict: mapping from word to index.

    Returns:
        For a string input, a NumPy array holding ``Dict.get(word)`` for each
        of the three words (``None`` for a word missing from ``Dict``).
        Otherwise the string ``"w0 w1 w2"`` built from the keys found at the
        three given positions.

    Raises:
        ValueError: if a string input does not split into exactly three parts.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_, e.g. strings taken out of a NumPy array.
    if isinstance(_input, str):
        a, b, c = _input.split(' ')
        return np.array([Dict.get(a), Dict.get(b), Dict.get(c)])
    # Index -> word: positions refer to the dictionary's key order.
    key = list(Dict)
    return key[_input[0]] + ' ' + key[_input[1]] + ' ' + key[_input[2]]
    
def Prob( _input,Dict,count2, alpha = 0):
    """Return the (optionally add-alpha smoothed) trigram probability P(w2 | w0, w1).

    The trigram is counted in the module-level ``text`` string; the context
    count is read from ``count2``.

    Args:
        _input: indices (i0, i1, i2) of the three words; converted to the
            string "w0 w1 w2" with ``triFinder``.
        Dict: mapping from word to index.
        count2: bigram count matrix, indexed as ``count2[i0, i1]``.
        alpha: additive smoothing constant applied to the trigram count and,
            times ``len(Dict)``, to the context count.

    Returns:
        ``(count + alpha) / (count2[i0, i1] + alpha * len(Dict))``, or 0 when
        that denominator is 0 (unseen context bigram without smoothing).
    """
    denominator = np.sum(count2[_input[0], _input[1]] + alpha * len(Dict))
    # An unseen context with alpha == 0 would otherwise divide by zero.
    if denominator == 0:
        return 0
    word_pair = triFinder(_input, Dict)
    # Trigrams containing "<s>" are matched without a leading space; all
    # others need a space on both sides.  The trailing space prevents prefix
    # matches against longer words.
    if '<s>' in word_pair:
        count = text.count(word_pair + ' ') + alpha
    else:
        count = text.count(' ' + word_pair + ' ') + alpha
    return count / denominator

# Look up the index triple for the trigram "in the past" and evaluate its
# probability with the default alpha of 0.  The value is not stored or printed
# here, so it is only visible when the snippet is run interactively.
index_of_3wordpair = triFinder("in the past", Dict)
Prob(index_of_3wordpair,Dict,count2)

6. Calculating sentence probabilities

计算unigram、bigram和bigram-alpha smoothing的句子概率以及困惑度

prob = counts/counts.sum()
perplexity = []        # unigram model
perplexity2 = []       # unsmoothed (MLE) bigram model
perplexity2alpha = []  # bigram model with add-0.1 smoothing


def _perplexity(sentprob, sent_len):
    """Return 1 / sentprob ** (1 / sent_len) for one sentence."""
    return 1 / (pow(sentprob, 1.0 / sent_len))


def _bigram_sentence_prob(words, bigram_probs):
    """Multiply bigram_probs[previous, current] over the words of one sentence.

    "<s>" tokens only set the context and contribute no factor.  The context
    is reset for every sentence, so a sentence that does not start with "<s>"
    simply gets no factor for its first word.
    """
    sentprob = 1
    previous_word = None
    for word in words:
        if word != "<s>" and previous_word is not None:
            sentprob *= bigram_probs[Dict[previous_word], Dict[word]]
        previous_word = word
    return sentprob


with codecs.open("toy_corpus.txt", "r",'utf-16') as f:
    for line in f.readlines():
        # Tokenize once per sentence and reuse the tokens for all three models.
        words = line.lower().split()
        if not words:
            # A blank line has no tokens; its perplexity exponent would be 1/0.
            continue
        sent_len = len(words)

        # Unigram: product of the individual word probabilities.  Dict[word]
        # raises KeyError for an out-of-vocabulary word; Dict.get would return
        # None, which NumPy indexing treats as "add an axis".
        sentprob = 1
        for word in words:
            sentprob *= prob[Dict[word]]
        perplexity.append(_perplexity(sentprob, sent_len))

        # Bigram with add-0.1 smoothing.
        perplexity2alpha.append(_perplexity(_bigram_sentence_prob(words, probs_alpha), sent_len))

        # Unsmoothed (MLE) bigram.
        perplexity2.append(_perplexity(_bigram_sentence_prob(words, probs), sent_len))

困惑度分别为（依次为 unigram、bigram、加 0.1 平滑的 bigram）

[281.3714617705383, 153.03157461392973]
[4.315371247379676, 7.237099402748856]
[46.10791988057384, 49.63008570872447]

问：哪种模型表现最差，为什么可以预期该模型表现最差？问：在此语料库上进行评估时，平滑是有助于还是损害了模型的“性能”？为什么会这样？
Compare the perplexities of these two sentences under all three models. Q: Which model performed worst and why might you have expected that model to have performed worst? Q: Did smoothing help or hurt the model’s ‘performance’ when evaluated on this corpus? Why might that be?

看上去加入平滑后困惑度变高了，但这样是否会损害性能，还不太确定。平滑的实质是高概率的词对将部分概率分给了计数为零的词对，因此按困惑度的概念，显然必然会导致模型变差。但是再思考一下，类比于多元统计中的正则化，一定程度上加入 bias 反而可能得到更好的结果。

7. Generation

之前都做过了，来试试trigram吧
对之前的模型修改了一下，现在能表示 P(*|word1,word2) 了，便于抽样

def triFinder(_input, Dict):
    """Convert a trigram between its string form and its vocabulary indices.

    Args:
        _input: either a string of three words separated by single spaces,
            or a sequence of three integer positions into ``Dict``'s key order.
        Dict: mapping from word to index.

    Returns:
        For a string input, a NumPy array holding ``Dict.get(word)`` for each
        of the three words (``None`` for a word missing from ``Dict``).
        Otherwise the string ``"w0 w1 w2"`` built from the keys found at the
        three given positions.

    Raises:
        ValueError: if a string input does not split into exactly three parts.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_, e.g. strings taken out of a NumPy array.
    if isinstance(_input, str):
        a, b, c = _input.split(' ')
        return np.array([Dict.get(a), Dict.get(b), Dict.get(c)])
    # Index -> word: positions refer to the dictionary's key order.
    key = list(Dict)
    return key[_input[0]] + ' ' + key[_input[1]] + ' ' + key[_input[2]]
        
def Prob( _input,Dict,count2, alpha = 0,generation=False):
    """Return a trigram probability, or the next-word distribution for a context.

    The trigram is counted in the module-level ``text`` string; the context
    count is read from ``count2``.

    Args:
        _input: with ``generation=False``, indices (i0, i1, i2) of the three
            words.  With ``generation=True``, the context string "w0 w1".
        Dict: mapping from word to index.
        count2: bigram count matrix, indexed as ``count2[i0, i1]``.
        alpha: additive smoothing constant applied to the trigram count and,
            times ``len(Dict)``, to the context count.
        generation: when true, return P(w | w0, w1) for every word ``w`` in
            ``Dict`` (in key order) instead of a single probability.

    Returns:
        A single probability (0 when the denominator is 0), or a list of
        probabilities when ``generation`` is true.
    """
    if generation:
        # Forward alpha so that the distribution uses the same smoothing as a
        # single-trigram query; it was previously dropped here.
        return [
            Prob(triFinder(_input + " " + word, Dict), Dict, count2, alpha)
            for word in Dict
        ]
    _sum = np.sum(count2[_input[0],_input[1]]+alpha*len(Dict))
    # Unseen context without smoothing: report 0 instead of dividing by zero.
    if _sum == 0:
        return 0
    word_pair = triFinder(_input, Dict)
    # Trigrams containing "<s>" are matched without a leading space; all
    # others need a space on both sides.
    if '<s>' in word_pair:
        count = text.count(word_pair+' ')+alpha
    else:
        count = text.count(' '+word_pair+' ')+alpha
    prob = count/_sum
    return prob

生成模型也要做一定修改

# Inline version of GENERATE(word_index_dict, probs, model_type, max_words, start_word)
# for the trigram model.
start_word = "<s>"
max_words = 25
# Use Dict for every look-up so this snippet only depends on one dictionary
# (it previously mixed in word_index_dict).
index_word_dict = {v: k for k, v in Dict.items()}

# The first word has only "<s>" as context, so it is drawn from the bigram row
# of start_word in `probs`.
prev2Word = start_word
first_index = np.random.choice(len(Dict), 1, p=list(probs[Dict[start_word]]))[0]
prevWord = index_word_dict[first_index]
# Include that first word in the output; it used to be sampled but never
# appended, so every generated sentence was missing its first word.
returnSTR = start_word + " " + prevWord + " "
num_words = 1

# Every further word is drawn from P(* | prev2Word, prevWord) until "</s>" is
# produced or max_words words have been generated.
while prevWord != "</s>" and num_words < max_words:
    wordIndex = np.random.choice(len(Dict), 1, p=Prob(prev2Word + " " + prevWord, Dict, count2, generation=True))
    word = index_word_dict[wordIndex[0]]
    returnSTR += word + " "
    prev2Word = prevWord
    prevWord = word
    num_words += 1
print(returnSTR)

最后的结果，虽然还是很不理想，比如标点符号等没有处理，但是已经很酷炫了:

<s> bond issue approved earlier in the past . </s> 
<s> petition said that the city `` take steps to remedy '' this problem . </s> 
<s> felix tabb said the ordinary apparently made good his promise . </s> 
<s> pointed out that georgia voters last november rejected a constitutional amendment to allow legislators to vote on pay raises . </s> 
<s> listed his wife's age as 71 . </s>

自然语言处理作业A2

自然语言处理 作业A2

Unigram model

1. Creating the word_to_index dictionary

2. Building an MLE unigram model

Bigram models

3. Building an MLE bigram model

Using n-gram models

5. Experimenting with a MLE trigram model

6. Calculating sentence probabilities

7. Generation

猜你喜欢

自然语言处理作业A2