词性标注POS tagging

什么是词性标注,Part-of-speech tagging

比如下面一段标注过词性的文字,用空格分开后,/前面的是英文单词,后面表示它的词性。

Confidence/NN in/IN the/DT pound/NN is/VBZ widely/RB expected/VBN to/TO take/VB another/DT sharp/JJ dive/NN if/IN trade/NN figures/NNS for/IN September/NNP ,/, due/JJ for/IN release/NN tomorrow/NN ,/, fail/VB to/TO show/VB a/DT substantial/JJ improvement/NN from/IN July/NNP and/CC August/NNP 's/POS near-record/JJ deficits/NNS ./.
Chancellor/NNP of/IN the/DT Exchequer/NNP Nigel/NNP Lawson/NNP 's/POS restated/VBN commitment/NN to/TO a/DT firm/NN monetary/JJ policy/NN has/VBZ helped/VBN to/TO prevent/VB a/DT freefall/NN in/IN sterling/NN over/IN the/DT past/JJ week/NN ./.

上面 NN 是名词,IN 是介词或从属连词,DT(determiner)表示限定词,其余标记依此类推。

问题是现在要给一段未标注词性的文字的每个单词标注词性。

HMM、最大熵模型、crf都可以完成这一任务

HMM

用HMM做词性标注和HMM做中文分词类似,也可以看成是序列标注问题

基于隐马尔可夫模型的有监督词性标注

HMM在自然语言处理中的应用一:词性标注

词性标注

扫描二维码关注公众号,回复: 3242852 查看本文章
#coding:utf-8
import re
from collections import defaultdict
from random import choice

from dicts import DefaultDict
 
def Dict(**args): 
    """Build a dictionary from keyword arguments: each argument name
    becomes a key mapped to its argument value."""
    return dict(args)
 
def hmm(training_sentences, reducedtagset):
    """Train an HMM POS tagger from a list of pre-tagged sentences.

    Each sentence is a string of whitespace-separated word/TAG tokens.
    When reducedtagset is true, fine-grained VB*/NN*/JJ*/RB* tags are
    collapsed to their two-letter class before counting.

    Returns the (transitions, emissions, tags) tuple built by hmmtuple():
    transition probabilities P(tag | prevtag), emission probabilities
    P(word | tag), and the list of observed tags.
    """
    # Nested count tables: transitions[prev][cur], emissions[tag][word].
    # stdlib defaultdict replaces the project-local DefaultDict here.
    transitions = defaultdict(lambda: defaultdict(int))
    emissions = defaultdict(lambda: defaultdict(int))
    wordcounts = defaultdict(int)
    tagcounts = defaultdict(int)

    # Split word/TAG on '/' unless the slash is escaped (e.g. "1\/2/CD").
    splitter = re.compile(r'(?<!\\)/')

    for line in training_sentences:
        prevtag = '<START>'   # Before each sentence, begin in START state
        tagcounts['<START>'] += 1
        for taggedword in line.split():
            (word, tag) = splitter.split(taggedword)

            if reducedtagset:
                # Collapse fine-grained tags into their coarse class.
                for prefix in ('VB', 'NN', 'JJ', 'RB'):
                    if tag.startswith(prefix):
                        tag = prefix
                        break

            transitions[prevtag][tag] += 1
            emissions[tag][word] += 1
            wordcounts[word] += 1
            tagcounts[tag] += 1
            prevtag = tag

    return hmmtuple(transitions, emissions, wordcounts, tagcounts)
 
def hmmtuple(transitions, emissions, wordcounts, tagcounts):    
    """Turn raw HMM counts into smoothed probability tables.

    Words that occurred exactly once in training are folded into the
    '<UNKNOWN>' pseudo-word, which stands in for out-of-vocabulary words
    at test time.  Counts are then converted to add-one (Laplace)
    smoothed conditional probabilities.

    Returns (transitions, emissions, tags), where tags is the list of
    tags observed in the emission table (a real list, so callers may
    index it).
    """
    # Fold singleton words into '<UNKNOWN>'.  Iterate over key snapshots
    # so entries can be deleted safely while looping (the original
    # deleted during .items() iteration, which breaks on dict views).
    for tag in list(emissions.keys()):
        worddict = emissions[tag]
        for word in list(worddict.keys()):
            if wordcounts[word] == 1:
                # The word occurred once globally, so its count here is 1.
                del worddict[word]
                worddict['<UNKNOWN>'] = worddict.get('<UNKNOWN>', 0) + 1

    tags = list(emissions.keys())
    words = list(wordcounts.keys())

    # Add-one smoothing over every (prevtag, tag) pair; .get() keeps this
    # correct for plain dicts as well as default-dicts.
    for prevtag in list(transitions.keys()):
        prevcount = tagcounts[prevtag]
        for tag in tags:
            transitions[prevtag][tag] = (
                (transitions[prevtag].get(tag, 0) + 1.) / (prevcount + len(tags)))

    # Likewise for every (tag, word) pair.  NOTE: '<UNKNOWN>' is not in
    # `words`, so it keeps its raw count — matching the original code.
    for tag in tags:
        tagcount = tagcounts[tag]
        worddict = emissions[tag]
        for word in words:
            worddict[word] = (worddict.get(word, 0) + 1.) / (tagcount + len(wordcounts))

    return (transitions, emissions, tags)
 
def strip_tags(tagged_sentences):
    """Given a list of tagged sentences, return a list of untagged sentences.

    Each word/TAG token is reduced to its word part; words are rejoined
    with a trailing space after each word, matching the original output
    format (e.g. 'a dog ').
    """
    # Split on '/' unless the slash is escaped with a backslash.
    splitter = re.compile(r'(?<!\\)/')
    untagged_sentences = []
    for taggedsent in tagged_sentences:
        words = [splitter.split(taggedword)[0] for taggedword in taggedsent.split()]
        # join with str.join instead of quadratic += concatenation
        untagged_sentences.append(''.join(word + ' ' for word in words))
    return untagged_sentences
 
def maxsequence(probtable, tags):
    """Given a filled Viterbi probability table, return the most likely
    sequence of POS tags."""
    nrows = len(probtable)
    ncols = len(probtable[0])

    # Pick the state with the highest probability in the final column.
    best_prob = 0
    best_state = None
    for row in range(nrows):
        cell_prob = probtable[row][ncols - 1][0]
        if cell_prob > best_prob:
            best_prob = cell_prob
            best_state = row

    # Follow the back-pointers from the last column to the first,
    # collecting tags in reverse order.
    sequence = []
    state = best_state
    for col in range(ncols - 1, -1, -1):
        sequence.append(tags[state])
        state = probtable[state][col][1]

    sequence.reverse()
    return sequence
 
def viterbi_tags(untagged_sentences, h):
    """Given a list of untagged sentences, return the most likely sequence
    of POS tags, concatenated across all sentences.

    h is the (transitions, emissions, tags) tuple produced by hmm().
    Out-of-vocabulary words are scored with a heavily discounted
    '<UNKNOWN>' emission probability.
    """
    transitions, emissions, tags = h
    maxtags = []

    for untaggedsent in untagged_sentences:
        words = untaggedsent.split()
        # Skip blank sentences: the original crashed on words[0] here,
        # and strip_tags() can produce empty strings for blank lines.
        if not words:
            continue

        r = len(tags)
        c = len(words)
        # probtable[i][j] = [best probability of being in tag i at word j,
        #                    back-pointer: index of the best previous tag]
        probtable = [[[None, None] for _ in range(c)] for _ in range(r)]

        # Initialize column 0 from the '<START>' state.
        word = words[0]
        for i in range(r):
            tag = tags[i]
            transition = transitions['<START>'][tag]
            if word in emissions[tag]:
                emission = emissions[tag][word]
            else:
                # Unknown word: penalized stand-in estimate.
                emission = .0001 * emissions[tag]['<UNKNOWN>']
            probtable[i][0][0] = transition * emission

        # Fill in the remaining columns left to right.
        for j in range(1, c):
            word = words[j]
            for i in range(r):
                tag = tags[i]
                maxprob = 0
                maxtag = None

                if word in emissions[tag]:
                    emission = emissions[tag][word]
                else:
                    emission = .0001 * emissions[tag]['<UNKNOWN>']

                # Maximize over all possible previous tags.
                for k in range(r):
                    prevtag = tags[k]
                    transition = transitions[prevtag][tag]
                    prob = probtable[k][j-1][0] * transition * emission

                    if prob > maxprob:
                        maxprob = prob
                        maxtag = k

                probtable[i][j][0] = maxprob
                probtable[i][j][1] = maxtag

        # Most likely tag sequence for this sentence.
        maxtags.extend(maxsequence(probtable, tags))

    # Most likely sequence of POS tags over all sentences.
    return maxtags
 
def true_tags(tagged_sentences):
    """Given a list of tagged sentences, return the flat gold tag sequence."""
    # Split each word/TAG token on '/' unless the slash is escaped.
    splitter = re.compile(r'(?<!\\)/')
    tags = []
    for sent in tagged_sentences:
        tags.extend(splitter.split(word)[1] for word in sent.split())
    return tags
 
def compare(mytags, truetags, reducedtagset):
    """Return tagging accuracy: the fraction of positions where mytags
    agrees with truetags.

    When reducedtagset is true, gold tags are collapsed to their coarse
    VB/NN/JJ/RB classes before comparison (mirroring hmm() training).
    Returns 0.0 for empty input instead of raising ZeroDivisionError.
    """
    length = len(mytags)
    if length == 0:
        return 0.0

    score = 0
    for i in range(length):
        truetag = truetags[i]
        if reducedtagset:
            # Collapse fine-grained gold tags to their coarse class.
            for prefix in ('VB', 'NN', 'JJ', 'RB'):
                if truetag.startswith(prefix):
                    truetag = prefix
                    break

        if mytags[i] == truetag:
            score += 1

    return 1. * score / length
 
if __name__ == '__main__':
    # Train on the first 90% of the WSJ data, then evaluate on the
    # held-out 10% and on a separate test file.
    with open('wsj15-18.pos') as fh:
        f = fh.readlines()

    print('90% of data is used for training')
    print('--------------------------------')
    i = int(len(f) * .9)
    h = hmm(f[:i], False)

    test1 = f[i:]
    v1 = viterbi_tags(strip_tags(test1), h)
    t1 = true_tags(test1)
    c1 = compare(v1, t1, False)
    print(c1)

    with open('wsj_0159.pos') as fh:
        test2 = fh.readlines()
    v2 = viterbi_tags(strip_tags(test2), h)
    t2 = true_tags(test2)
    c2 = compare(v2, t2, False)
    print(c2)

猜你喜欢

转载自blog.csdn.net/lyb3b3b/article/details/82185932