回答下列问题：

（１）怎样才能识别出语言数据中明显用于分类的特征？

（２）怎样才能构建用于自动执行语言处理任务的语言模型？

（３）从这些模型中我们可以学到哪些关于语言的知识？

决策树，朴素贝叶斯分类器和最大熵（shang）分类

一　监督式分类

＃性别鉴定

创建分类

[python]view plain copy
def gender_features(word):  
    return {'last_letter': word[-1]}  
gender_features('Shrek')  
{'last_letter': 'k'}  
  
from nltk.corpus import names  
import random  
names = ([(name, 'male') for name in names.words('male.txt')] +   
         [(name, 'female') for name in names.words('female.txt')])  
random.shuffle(names)  
  
import nltk  
featuresets = [ (gender_features(n), g) for (n,g) in names ]  
train_set, test_set = featuresets[500:], featuresets[:500]　　　＃训练集和测试集  
classifier = nltk.NaiveBayesClassifier.train(train_set)  
  
classifier.classify(gender_features('Neo'))  
'male'  
classifier.classify(gender_features('Trinity'))  
'female'  
print nltk.classify.accuracy(classifier, test_set)   #评估  
<span style="color:#ff0000;">0.75</span>  
classifier.show_most_informative_features(5)  #哪些特征对于区分名字的性别是最有效的  
Most Informative Features  
             last_letter = u'a'           female : male   =     33.4 : 1.0  
             last_letter = u'k'             male : female =     30.8 : 1.0  
             last_letter = u'f'             male : female =     17.3 : 1.0  
             last_letter = u'p'             male : female =     10.5 : 1.0  
             last_letter = u'd'             male : female =     10.0 : 1.0  

＃选择正确的特征

[python]view plain copy
def gender_features2(name):  
    features = {}  
    features["firstletter"] = name[0].lower()  
    features["lastletter"] = name[-1].lower()  
    for letter in 'abcdefghijklmnopqrstuvwxyz':  
        features["count(%s)" % letter] = name.lower().count(letter)  
        features["has(%s)" % letter] = (letter in name.lower())  
    return features  
gender_features2('JJohn')  
  
featuresets = [(gender_features2(n), g) for (n,g) in names]  
train_set, test_set = featuresets[500:], featuresets[:500]  
classifier = nltk.NaiveBayesClassifier.train(train_set)  #使用朴素贝叶斯分类器  
print nltk.classify.accuracy(classifier, test_set)  
0.776  

[python]view plain copy
#一种能有效完善特征集的方法称为错误分析。首先，选择开发集，其中包含用于创建模型的语料数据。然后将这种开发集分为训练集和开发测试集  
train_names = names[1500:]  
devtest_names = names[500:1500]  
test_names = names[:500]  
train_set = [(gender_features(n), g) for (n,g) in train_names]  
devtest_set = [(gender_features(n),g) for (n,g) in devtest_names]  
test_set = [(gender_features(n),g) for (n,g) in test_names]  
classifier = nltk.NaiveBayesClassifier.train(train_set)  
print nltk.classify.accuracy(classifier, devtest_set)  
<span style="color:#ff0000;">0.766</span>  

[python]view plain copy
errors = []  
for (name, tag) in devtest_names:  
    guess = classifier.classify(gender_features(name))  
    if guess != tag:  
        errors.append( (tag, guess, name) )  
for (tag, guess, name) in sorted(errors):    
    print 'correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name)  
correct=female   guess=male     name=Abagael                         
correct=female   guess=male     name=Adel                            
correct=female   guess=male     name=Alys                            
correct=female   guess=male     name=Amargo                          
correct=female   guess=male     name=Ambur  

[python]view plain copy
...        

[python]view plain copy
#调整特征提取器使其包含两个字母后缀的特征  
def gender_features(word):  
    return {'suffix1': word[-1:],  
            'suffix2': word[-2:]}  
train_set = [(gender_features(n), g) for (n,g) in train_names]  
devtest_set = [(gender_features(n),g) for (n,g) in devtest_names]  
classifier = nltk.NaiveBayesClassifier.train(train_set)  
print nltk.classify.accuracy(classifier, devtest_set)  

＃文档分类

将电影评论语料库归类为正面或负面

[python]view plain copy
from nltk.corpus import movie_reviews  
documents = [(list(movie_reviews.words(fileid)), category)  
              for category in movie_reviews.categories()  
              for fileid in movie_reviews.fileids(category)]  
random.shuffle(documents)  

<text, categories>

[python]view plain copy
#文档分类的特征提取器，其特征表示每个词是否在一个给定的文档中  
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())  
word_features = all_words.keys()[:2000]  
def document_features(document):  
    document_words = set(document)  
    features = {}  
    for word in word_features:  
        features['contains(%s)' % word] = (word in document_words)  
    return features  
print document_features(movie_reviews.words('pos/cv957_8737.txt'))  
{u'contains(corporate)': False, u'contains(barred)': False, u'contains(batmans)': False, u'contains(menacing)': False,   

[python]view plain copy
u'contains(rags)': False, u'contains(inquires)': False,   

[python]view plain copy
#训练和测试分类器以进行文档分类  
featuresets = [(document_features(d),c) for (d,c) in documents]  
train_set, test_set = featuresets[100:], featuresets[:100]  
classifier = nltk.NaiveBayesClassifier.train(train_set)  
print nltk.classify.accuracy(classifier, test_set)  
0.73  

[python]view plain copy
classifier.show_most_informative_features(5)  #找出哪些特征是分类器发现的并且是最有信息量的  
Most Informative Features  
          contains(sans) = True              neg : pos    =      9.1 : 1.0  
    contains(mediocrity) = True              neg : pos    =      7.8 : 1.0  
     contains(dismissed) = True              pos : neg    =      6.9 : 1.0  
     contains(testament) = True              pos : neg    =      6.5 : 1.0  
   contains(bruckheimer) = True              neg : pos    =      6.4 : 1.0  

＃词性标注

[python]view plain copy
from nltk.corpus import brown  
suffix_fdist = nltk.FreqDist()  
for word in brown.words():  
    word = word.lower()  
    suffix_fdist[word[-1:]] += 1  
    suffix_fdist[word[-2:]] += 1  
    suffix_fdist[word[-3:]] += 1  
from operator import itemgetter  
common_suffixes = sorted(suffix_fdist.items(), key=itemgetter(1), reverse=True)  
common_suffixes[:100]  
[(u'e', 202946),  
 (u',', 175002),  
 (u'.', 152999),  
 (u's', 128722),  
 (u'd', 105687),  
 (u't', 94459),  

[python]view plain copy
common_suf = [ suffix[0] for suffix in common_suffixes][:100]  

[python]view plain copy
common_suf  

[python]view plain copy
  

[python]view plain copy
def pos_features(word):  
    features = {}  
    for suffix in common_suf:  
        features['endswith(%s)'%suffix] = word.lower().endswith(suffix)  
    return features  

训练新的“决策树”的分类器

[python]view plain copy
tagged_words = brown.tagged_words(categories='news')  
tagged_words[0]  
(u'The', u'AT')  
len(tagged_words)  
100554  
len(pos_features(tagged_words[0][0]))  
100  
pos_features(tagged_words[0][0])  
{u"endswith('')": False,  
 u"endswith(')": False,  
 u"endswith('s)": False,  
 u'endswith(()': False,  
 u'endswith())': False,  
 u'endswith(,)': False,  
featuresets = [(pos_features(n),g) for (n,g) in tagged_words]  
size = int(len(featuresets) * 0.1)  
size  
Out[52]:  10055  
train_set,test_set = featuresets[size:], featuresets[:size]  

[python]view plain copy
classifier = nltk.DecisionTreeClassifier.train(train_set)   #决策树  
nltk.classify.accuracy(classifier, test_set)  
<span style="color:#ff0000;">0.6270512182993535</span>  
classifier.classify(pos_features('cats'))  
Out[54]:  u'NNS'  
#决策树的优点是容易解释，甚至可以它们以伪代码形式输出  
print classifier.pseudocode(depth=4)  
if endswith(the) == False:   
  if endswith(,) == False:   
    if endswith(s) == False:   
      if endswith(.) == False: return u'.'  
      if endswith(.) == True: return u'.'  
    if endswith(s) == True:   
      if endswith(is) == False: return u'PP$'  
      if endswith(is) == True: return u'BEZ'  
  if endswith(,) == True: return u','  
if endswith(the) == True: return u'AT'  

＃探索上下文语境

不是只传递已标注的词，而是传递整个（未标注的）句子，以及目标词的索引

＃特征检测器

[python]view plain copy
def pos_features(sentence, i):  
    features = {"suffix(1)": sentence[i][-1:],  
                "suffix(2)": sentence[i][-2:],  
                "suffix(3)": sentence[i][-3:]}  
    if i == 0:  
        features["prev-word"] = "<START>"  
    else:  
        features["prev-word"] = sentence[i-1]  
    return features  
brown.sents()[0][7]  
Out[62]:  u'an'  
brown.sents()[0][8]  
Out[63]:  u'investigation'  
pos_features(brown.sents()[0], 8)    ###### 四个特征  
{'prev-word': u'an',  
 'suffix(1)': u'n',  
 'suffix(2)': u'on',  
 'suffix(3)': u'ion'}  

>>>

[python]view plain copy
tagged_sents = brown.tagged_sents(categories='news')  
featuresets = []  
for tagged_sent in tagged_sents:  
    untagged_sent = nltk.tag.untag(tagged_sent)  
    for i, (word, tag) in enumerate(tagged_sent):  
        featuresets.append( (pos_features(untagged_sent, i), tag) )  
size = int(len(featuresets) * 0.1)  
10055  

[python]view plain copy
train_set, test_set = featuresets[size:], featuresets[:size]  
classifier = nltk.NaiveBayesClassifier.train(train_set)  
nltk.classify.accuracy(classifier, test_set)  
<span style="color:#ff0000;">0.7891596220785678</span>  

＃序列分类

在词性标注的例子中，可以使用各种不同的序列分类器模型为给定的句子中的所有词选择词性标注

一种称为连续分类或贪婪序列分类的序列分类器策略，为第一个输入找到最有可能的类标签，然后在此基础上找到下一个输入的最佳的标签。这个过程可以不断重复直到所有的输入都被贴上标签。

特征提取器

[python]view plain copy
def pos_features(sentence, i, history):  
    features = {"suffix(1)": sentence[i][-1:],  
                "suffix(2)": sentence[i][-2:],  
                "suffix(3)": sentence[i][-3:] }  
    if i == 0:  
        features["prev-word"] = "<START>"  
        features["prev-tag"] = "<START>"  
    else:  
        features["prev-word"] = sentence[i-1]  
        features["pre-tag"] = history[i-1]  
    return features  

[python]view plain copy
class ConsecutivePosTagger(nltk.TaggerI):  
    def __init__(self, train_sents):  
        train_set = []  
        for tagged_sent in train_sents:  
            untagged_sent = nltk.tag.untag(tagged_sent)  
            history = []  
            for i, (word, tag) in enumerate(tagged_sent):  
                featureset = pos_features(untagged_sent, i, history)  
                train_set.append((featureset, tag))  
                history.append(tag)   ######  
            self.classifier = nltk.NaiveBayesClassifier.train(train_set)  
    def tag(self, sentence):  
        history = []  
        for i, word in enumerate(sentence):  
            featureset = pos_features(sentence, i, history)  
            tag = self.classifier.classify(featureset)  
            history.append(tag)  
        return zip(sentence, history)  

[python]view plain copy
tagged_sents = brown.tagged_sents(categories='news')  
size = int(len(tagged_sents) * 0.1)  
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]  
tagger = ConsecutivePosTagger(train_sents)  
print tagger.evaluate(test_sents)  

＃其他序列分类方法

这种方法的缺点是一旦做出决定便无法更改。例如：如果决定将一个词标注为名词，但后来发现应该是动词，那也没有办法修复我们的错误了。解决这个问题的方法是采取转型策略。转型联合分类的工作原理是为输入的标签创建一个初始值，然后反复提炼该值，尝试修复相关输入之间的不一致

另一种方案是为词性标记所有可能的序列打分，选择总得分最高的序列。隐马尔科夫模型就采取了这种方法。隐ｍａｅｒｋｅｆｕｍｏｘ类似于连续分类器，不光考虑输入也考虑已预测标记的历史。然而，不是简单地找出一个给定词的单个最好标签，而是为标记产生一个概率分布。然后这些概率结合起来计算标记序列的概率得分，最后选择最高概率的标记序列。不过，可能的标签序列数量相当大。给定拥有３０个标签的标记集，大约有600万亿(30^10)中方式来标记一个１０个词的句子。为了避免单独考虑所有这些可能的序列，隐马尔科夫模型要求特征提取器只考虑最近的标记（或最近的n个标记，其中n是相当小的）。由于这种限制，它可以使用动态规划来有效地找出最有可能的标记序列。特别是，对每个连续的词索引i，当前的及以前的每个可能的标记都将计算得分。这种基础的方法被两个更先进的模型所采用，它们被称为最大熵马尔科夫模型和线性链条件随机场模型；但为标记序列打分用的是不同的算法。

二　监督式分类的举例

＃句子分割

第一步是获得一些已被分割成句子的数据，将它转换成一种适合提取特征的形式

[python]view plain copy
sents = <span style="color:#ff0000;">nltk.corpus.treebank_raw.sents</span>()  
tokens = []  
boundaries = set()  
offset = 0  
for sent in nltk.corpus.treebank_raw.sents():  
    tokens.extend(sent)  
    offset += len(sent)  
    boundaries.add(offset - 1)  

[python]view plain copy
def punct_features(tokens, i):  
    return { 'next-word-capitailized': tokens[i+1][0].isupper(),  
             'prevword': tokens[i-1].lower(),  
             'punct': tokens[i],  
             'prev-word-is-one-char': len(tokens[i-1]) == 1}  

[python]view plain copy
featuresets = [(punct_features(tokens, i), (i in boundaries)) for i in range(1, len(tokens)-1) if tokens[i] in '.?!']  
size = int(len(featuresets) * 0.1)  
train_set, test_set = featuresets[size:], featuresets[:size]  
classifier = nltk.NaiveBayesClassifier.train(train_set)  
nltk.classify.accuracy(classifier, test_set)  
0.936026936026936  

[python]view plain copy
def segment_sentences(words):  #基于分类的断句器  
    start = 0  
    sents = []  
    for i, word in words:  
        if word in '.?!' and classifier.classify(punct_features(words, i)) == True:  
            sents.append(words[start:i+1])  
            start = i+1  
    if start < len(words):  
        sents.append(words[start:])  
        return sents  

＃识别对话行为类型

表述行为的陈述句，问候，问题，回答，断言和说明都可以被认为是基于语言的行为类型。识别对话中隐含言语下的对话行为是理解谈话的重要步骤。

利用NPS聊天语料库建立一个分类器，用来识别新的即时消息帖子的对话行为类型。

[python]view plain copy
posts = nltk.corpus.nps_chat.xml_posts()[:10000]   #每个帖子的XML注释  

[python]view plain copy
def dialogue_act_features(post):   #特征提取器  
    features = {}  
    for word in nltk.word_tokenize(post):  
        features['contains(%s)' % word.lower()] = True  
    return features  
featuresets = [(dialogue_act_features(post.text), post.get('class')) for post in posts]  
({'contains(gay)': True,  
  'contains(im)': True,  
  'contains(left)': True,  
  'contains(name)': True,  
  'contains(now)': True,  
  'contains(this)': True,  
  'contains(with)': True},  
 '<span style="color:#ff0000;">Statement</span>')  #陈述句  

[python]view plain copy
size = int(len(featuresets) * 0.1)   #分类器  
train_set, test_set = featuresets[size:], featuresets[:size]  
classifier = nltk.<span style="color:#ff0000;">NaiveBayesClassifier</span>.train(train_set)  
print nltk.classify.accuracy(classifier, test_set)  
0.668  

＃识别文字蕴涵

(Recognizing textual entailment, RTE)是判断文本Ｔ内的一个给定片段是否继承另一个叫做“假设”的文本。迄今为止，已经有４个RTE挑战赛，在那里共享的开发和测试数据会提供给参赛队伍。

[python]view plain copy
def rte_features(rtepair):  
    extractor = nltk.RTEFeatureExtractor(rtepair)  
    features = {}  
    features['word_overlap'] = len(extractor.overlap('word'))  
    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))  
    features['ne_overlap'] = len(extractor.overlap('ne'))  
    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))  
    return features  

[python]view plain copy
rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]  
extractor = nltk.RTEFeatureExtractor(rtepair)  
print extractor.text_words  
set(['Organisation', 'Shanghai', 'Asia', 'four', 'at', 'operation', 'SCO', 'Iran', 'Soviet', 'Davudi', 'fight', 'China', 'association', 'fledgling', 'was', 'that', 'republics', 'former', 'Co', 'representing', 'Russia', 'Parviz', 'central', 'meeting', 'together', 'binds', 'terrorism.'])  

[python]view plain copy
print extractor.hyp_words  
set(['member', 'SCO.', 'China'])  
print extractor.overlap('word')  
set([])  
print extractor.overlap('ne')  
set(['China'])  
print extractor.hyp_extra('word')  
set(['member'])  

＃扩展到大型数据集

纯Python的分类不是很快，建议探索NLTK与外部机器学习包的接口技术，

三　评估

测试集

准确度

精确度和召回率

混淆矩阵

交叉验证

四　决策树

熵和信息增益

五　朴素贝叶斯分类器

潜在概率模型

零计数和平滑

非二元特征

独立的朴素性

双重计数的原因

六　最大熵分类器

最大熵模型

熵的最大化

生成式分类器对比条件分类器

七　为语言模式建模

模型告诉我们什么？

八　深入阅读

使用Weka, Mallet, TADM 和 MegaM

python.nlp随笔（七）贝叶斯，决策树分类

一　监督式分类

＃性别鉴定

＃选择正确的特征

＃词性标注

＃探索上下文语境

＃序列分类

＃其他序列分类方法

二　监督式分类的举例

＃句子分割

＃识别对话行为类型

＃识别文字蕴涵

＃扩展到大型数据集

三　评估

四　决策树

五　朴素贝叶斯分类器

六　最大熵分类器

七　为语言模式建模

八　深入阅读

猜你喜欢

python.nlp随笔（七）贝叶斯，决策树分类

一 监督式分类

＃性别鉴定

＃选择正确的特征

＃词性标注

＃探索上下文语境

＃序列分类

＃其他序列分类方法

二 监督式分类的举例

＃句子分割

＃识别对话行为类型

＃识别文字蕴涵

＃扩展到大型数据集

三 评估

四 决策树

五 朴素贝叶斯分类器

六 最大熵分类器

七 为语言模式建模

八 深入阅读

猜你喜欢

一　监督式分类

二　监督式分类的举例

三　评估

四　决策树

五　朴素贝叶斯分类器

六　最大熵分类器

七　为语言模式建模

八　深入阅读