NLTK (Part 3)

1. tokenize

This module handles splitting text into word tokens.

It mainly includes MWETokenizer (multi-word expression tokenizer), RegexpTokenizer (regular-expression tokenizer), SpaceTokenizer (splits on spaces), TabTokenizer (splits on tabs), StanfordSegmenter (the Stanford segmenter), TreebankWordTokenizer, and TreebankWordDetokenizer.
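
For example, a minimal sketch of MWETokenizer and RegexpTokenizer; the sample sentence and regex pattern here are just illustrative:

from nltk.tokenize import MWETokenizer, RegexpTokenizer

# MWETokenizer merges pre-declared multi-word expressions into single tokens.
mwe = MWETokenizer([('New', 'York')], separator='_')
print(mwe.tokenize('I live in New York'.split()))
# ['I', 'live', 'in', 'New_York']

# RegexpTokenizer keeps only substrings matching the given pattern.
regexp = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
print(regexp.tokenize('Good muffins cost $3.88 in New York.'))
# ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.']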

The examples below show span-based tokenization, which returns character offsets instead of token strings:

from nltk.tokenize.util import string_span_tokenize

s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
# string_span_tokenize yields (start, end) offsets for each
# space-separated token rather than the token strings themselves.
print(list(string_span_tokenize(s, " ")))
>>> from nltk.tokenize import TreebankWordTokenizer
>>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
>>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
>>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
True
>>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
>>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
True
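
TreebankWordDetokenizer reverses this tokenization; a minimal sketch (the output shown in the comment is approximate):

from nltk.tokenize.treebank import TreebankWordDetokenizer

tokens = ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.']
print(TreebankWordDetokenizer().detokenize(tokens))
# roughly: Good muffins cost $3.88 in New York.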

2. ccg

Used to define custom category lexicons for Combinatory Categorial Grammar (CCG).

It mainly includes CCG (the base category representation, implemented by subclasses), CCGChartParser (a sentence parser), and CCGLexicon (the category lexicon).

from nltk.ccg.chart import CCGChartParser, DefaultRuleSet, printCCGDerivation
from nltk.ccg.lexicon import fromstring

lex = fromstring(
    '''
    :- S, NP, N, VP    # Primitive categories, S is the target primitive

    Det :: NP/N         # Family of words
    Pro :: NP
    TV :: VP/NP
    Modal :: (S\\NP)/VP # Backslashes need to be escaped

    I => Pro             # Word -> Category mapping
    you => Pro

    the => Det

    # Variables have the special keyword 'var'
    # '.' prevents permutation
    # ',' prevents composition
    and => var\\.,var/.,var

    which => (N\\N)/(S/NP)

    will => Modal # Categories can be either explicit, or families.
    might => Modal

    cook => TV
    eat => TV

    mushrooms => N
    parsnips => N
    bacon => N
    '''
)


def demo():
    parser = CCGChartParser(lex, DefaultRuleSet)
    for parse in parser.parse("I might cook and eat the bacon".split()):
        printCCGDerivation(parse)


if __name__ == '__main__':
    demo()

3. cluster (clustering) and classify (classification)

Implemented clustering algorithms: kmeans, gaac (group-average agglomerative clustering), and em.

Classification algorithms: maxent, naivebayes, DecisionTree, and others.

Classification and clustering each expose a uniform interface:

class ClassifierI(object):
    """
    A processing interface for labeling tokens with a single category
    label (or "class").  Labels are typically strs or
    ints, but can be any immutable type.  The set of labels
    that the classifier chooses from must be fixed and finite.

    Subclasses must define:
      - ``labels()``
      - either ``classify()`` or ``classify_many()`` (or both)

    Subclasses may define:
      - either ``prob_classify()`` or ``prob_classify_many()`` (or both)
    """

    def labels(self):
        """
        :return: the list of category labels used by this classifier.
        :rtype: list of (immutable)
        """
        raise NotImplementedError()

    def classify(self, featureset):
        """
        :return: the most appropriate label for the given featureset.
        :rtype: label
        """
        if overridden(self.classify_many):
            return self.classify_many([featureset])[0]
        else:
            raise NotImplementedError()

    def prob_classify(self, featureset):
        """
        :return: a probability distribution over labels for the given
            featureset.
        :rtype: ProbDistI
        """
        if overridden(self.prob_classify_many):
            return self.prob_classify_many([featureset])[0]
        else:
            raise NotImplementedError()

    def classify_many(self, featuresets):
        """
        Apply ``self.classify()`` to each element of ``featuresets``.  I.e.:

            return [self.classify(fs) for fs in featuresets]

        :rtype: list(label)
        """
        return [self.classify(fs) for fs in featuresets]

    def prob_classify_many(self, featuresets):
        """
        Apply ``self.prob_classify()`` to each element of ``featuresets``.  I.e.:

            return [self.prob_classify(fs) for fs in featuresets]

        :rtype: list(ProbDistI)
        """
        return [self.prob_classify(fs) for fs in featuresets]



@add_metaclass(ABCMeta)
class ClusterI(object):
    """
    Interface covering basic clustering functionality.
    """

    @abstractmethod
    def cluster(self, vectors, assign_clusters=False):
        """
        Assigns the vectors to clusters, learning the clustering parameters
        from the data. Returns a cluster identifier for each vector.
        """

    @abstractmethod
    def classify(self, token):
        """
        Classifies the token into a cluster, setting the token's CLUSTER
        parameter to that cluster identifier.
        """

    def likelihood(self, vector, label):
        """
        Returns the likelihood (a float) of the token having the
        corresponding cluster.
        """
        if self.classify(vector) == label:
            return 1.0
        else:
            return 0.0

    def classification_probdist(self, vector):
        """
        Classifies the token into a cluster, returning
        a probability distribution over the cluster identifiers.
        """
        likelihoods = {}
        sum = 0.0
        for cluster in self.cluster_names():
            likelihoods[cluster] = self.likelihood(vector, cluster)
            sum += likelihoods[cluster]
        for cluster in self.cluster_names():
            likelihoods[cluster] /= sum
        return DictionaryProbDist(likelihoods)

    @abstractmethod
    def num_clusters(self):
        """
        Returns the number of clusters.
        """

    def cluster_names(self):
        """
        Returns the names of the clusters.
        :rtype: list
        """
        return list(range(self.num_clusters()))

    def cluster_name(self, index):
        """
        Returns the names of the cluster at index.
        """
        return index
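
As a concrete illustration of both interfaces, here is a minimal sketch using NaiveBayesClassifier and KMeansClusterer; the toy feature sets and vectors are invented for illustration, and the k-means assignments depend on random initialisation:

import numpy
from nltk.classify import NaiveBayesClassifier
from nltk.cluster import KMeansClusterer, euclidean_distance

# ClassifierI in action: train on (featureset, label) pairs, then classify.
train = [({'last_letter': 'a'}, 'female'),
         ({'last_letter': 'k'}, 'male'),
         ({'last_letter': 'a'}, 'female'),
         ({'last_letter': 'n'}, 'male')]
classifier = NaiveBayesClassifier.train(train)
print(classifier.classify({'last_letter': 'a'}))             # 'female'
print(classifier.prob_classify({'last_letter': 'k'}).max())  # 'male'

# ClusterI in action: k-means over toy 2-d vectors.
vectors = [numpy.array(v) for v in [[3, 3], [1, 2], [4, 2], [4, 0]]]
clusterer = KMeansClusterer(2, euclidean_distance)
assignments = clusterer.cluster(vectors, assign_clusters=True)
print(assignments)  # e.g. [0, 0, 1, 1] — one cluster id per vector
print(clusterer.classify(numpy.array([3, 3])))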

4. corpus (corpus readers)

Every corpus reader exposes the same methods:

  • words(): list of str

  • sents(): list of (list of str)

  • paras(): list of (list of (list of str))

  • tagged_words(): list of (str,str) tuple

  • tagged_sents(): list of (list of (str,str))

  • tagged_paras(): list of (list of (list of (str,str)))

  • chunked_sents(): list of (Tree w/ (str,str) leaves)

  • parsed_sents(): list of (Tree with str leaves)

  • parsed_paras(): list of (list of (Tree with str leaves))

  • xml(): A single xml ElementTree

  • raw(): unprocessed corpus contents

from nltk.corpus import brown

# Print the first 20 words of the Brown corpus (printing all of
# brown.words() would dump over a million tokens).
print(", ".join(brown.words()[:20]))
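
A few of the other accessors, sketched on the same corpus (assumes the brown corpus data has been downloaded, e.g. via nltk.download('brown')):

from nltk.corpus import brown

print(brown.sents()[0])          # first sentence, as a list of words
print(brown.tagged_words()[:5])  # first five (word, tag) pairs
print(brown.paras()[0][0][:5])   # first sentence of the first paragraph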



5. draw

The drawing module, including cfg (context-free grammar visualization), tree (tree diagrams), and dispersion (lexical dispersion plots).
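
For instance, a minimal dispersion-plot sketch (assumes matplotlib is installed and the gutenberg corpus data has been downloaded); the longer demo below draws interactive trees:

from nltk.corpus import gutenberg
from nltk.draw.dispersion import dispersion_plot

# Plot where each word occurs across the text of Moby Dick.
words = gutenberg.words('melville-moby_dick.txt')
dispersion_plot(words, ['whale', 'Ahab', 'sea'])  # opens a matplotlib window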

from nltk.draw.util import CanvasFrame, BoxWidget, OvalWidget, ParenWidget, TextWidget
from nltk.draw.tree import TreeWidget, tree_to_treesegment
from nltk.tree import Tree


def demo():
    import random

    def fill(cw):
        cw['fill'] = '#%06d' % random.randint(0, 999999)

    cf = CanvasFrame(width=550, height=450, closeenough=2)

    t = Tree.fromstring(
        '''
    (S (NP the very big cat)
       (VP (Adv sorta) (V saw) (NP (Det the) (N dog))))'''
    )

    tc = TreeWidget(
        cf.canvas(),
        t,
        draggable=1,
        node_font=('helvetica', -14, 'bold'),
        leaf_font=('helvetica', -12, 'italic'),
        roof_fill='white',
        roof_color='black',
        leaf_color='green4',
        node_color='blue2',
    )
    cf.add_widget(tc, 10, 10)

    def boxit(canvas, text):
        big = ('helvetica', -16, 'bold')
        return BoxWidget(canvas, TextWidget(canvas, text, font=big), fill='green')

    def ovalit(canvas, text):
        return OvalWidget(canvas, TextWidget(canvas, text), fill='cyan')

    treetok = Tree.fromstring('(S (NP this tree) (VP (V is) (AdjP shapeable)))')
    tc2 = TreeWidget(cf.canvas(), treetok, boxit, ovalit, shapeable=1)

    def color(node):
        node['color'] = '#%04d00' % random.randint(0, 9999)

    def color2(treeseg):
        treeseg.label()['fill'] = '#%06d' % random.randint(0, 9999)
        treeseg.label().child()['color'] = 'white'

    tc.bind_click_trees(tc.toggle_collapsed)
    tc2.bind_click_trees(tc2.toggle_collapsed)
    tc.bind_click_nodes(color, 3)
    tc2.expanded_tree(1).bind_click(color2, 3)
    tc2.expanded_tree().bind_click(color2, 3)

    paren = ParenWidget(cf.canvas(), tc2)
    cf.add_widget(paren, tc.bbox()[2] + 10, 10)

    tree3 = Tree.fromstring(
        '''
    (S (NP this tree) (AUX was)
       (VP (V built) (PP (P with) (NP (N tree_to_treesegment)))))'''
    )
    tc3 = tree_to_treesegment(
        cf.canvas(), tree3, tree_color='green4', tree_xspace=2, tree_width=2
    )
    tc3['draggable'] = 1
    cf.add_widget(tc3, 10, tc.bbox()[3] + 10)

    def orientswitch(treewidget):
        if treewidget['orientation'] == 'horizontal':
            treewidget.expanded_tree(1, 1).subtrees()[0].set_text('vertical')
            treewidget.collapsed_tree(1, 1).subtrees()[0].set_text('vertical')
            treewidget.collapsed_tree(1).subtrees()[1].set_text('vertical')
            treewidget.collapsed_tree().subtrees()[3].set_text('vertical')
            treewidget['orientation'] = 'vertical'
        else:
            treewidget.expanded_tree(1, 1).subtrees()[0].set_text('horizontal')
            treewidget.collapsed_tree(1, 1).subtrees()[0].set_text('horizontal')
            treewidget.collapsed_tree(1).subtrees()[1].set_text('horizontal')
            treewidget.collapsed_tree().subtrees()[3].set_text('horizontal')
            treewidget['orientation'] = 'horizontal'

    text = """
Try clicking, right clicking, and dragging
different elements of each of the trees.
The top-left tree is a TreeWidget built from
a Tree.  The top-right is a TreeWidget built
from a Tree, using non-default widget
constructors for the nodes & leaves (BoxWidget
and OvalWidget).  The bottom-left tree is
built from tree_to_treesegment."""
    twidget = TextWidget(cf.canvas(), text.strip())
    textbox = BoxWidget(cf.canvas(), twidget, fill='white', draggable=1)
    cf.add_widget(textbox, tc3.bbox()[2] + 10, tc2.bbox()[3] + 10)

    tree4 = Tree.fromstring('(S (NP this tree) (VP (V is) (Adj horizontal)))')
    tc4 = TreeWidget(
        cf.canvas(),
        tree4,
        draggable=1,
        line_color='brown2',
        roof_color='brown2',
        node_font=('helvetica', -12, 'bold'),
        node_color='brown4',
        orientation='horizontal',
    )
    tc4.manage()
    cf.add_widget(tc4, tc3.bbox()[2] + 10, textbox.bbox()[3] + 10)
    tc4.bind_click(orientswitch)
    tc4.bind_click_trees(tc4.toggle_collapsed, 3)

    # Run mainloop
    cf.mainloop()


if __name__ == '__main__':
    demo()

6. chunk

Chunking: used to extract chunks (e.g. phrases) from a sentence.

The uniform interface:

class ChunkParserI(ParserI):
    """
    A processing interface for identifying non-overlapping groups in
    unrestricted text.  Typically, chunk parsers are used to find base
    syntactic constituents, such as base noun phrases.  Unlike
    ``ParserI``, ``ChunkParserI`` guarantees that the ``parse()`` method
    will always generate a parse.
    """

    def parse(self, tokens):
        """
        Find the best chunk structure for the given tokens and
        return it as a tree.

        :param tokens: The list of (word, tag) tokens to be chunked.
        :type tokens: list(tuple)
        :rtype: Tree
        """
        raise NotImplementedError()

    def evaluate(self, gold):
        """
        Score the accuracy of the chunker against the gold standard.
        Remove the chunking from the gold standard text, rechunk it using
        the chunker, and return a ``ChunkScore`` object
        reflecting the performance of this chunk parser.

        :type gold: list(Tree)
        :param gold: The list of chunked sentences to score the chunker on.
        :rtype: ChunkScore
        """
        chunkscore = ChunkScore()
        for correct in gold:
            chunkscore.score(correct, self.parse(correct.leaves()))
        return chunkscore

There are two implementing subclasses, ne and regexp: the first performs chunking with a classifier trained on a corpus (named-entity chunking), while the second defines chunks with regular expressions. A quick sketch of the first follows; the regexp side is demonstrated at length in the demo after it.
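
A minimal ne_chunk sketch (assumes the punkt, averaged_perceptron_tagger, maxent_ne_chunker, and words data packages have been downloaded; the sample sentence is illustrative):

import nltk

sentence = "Mark works at Google in New York"
tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
tree = nltk.ne_chunk(tagged)  # Tree with NE chunks such as (PERSON Mark/NNP)
print(tree)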

def demo_eval(chunkparser, text):
    """
    Demonstration code for evaluating a chunk parser, using a
    ``ChunkScore``.  This function assumes that ``text`` contains one
    sentence per line, and that each sentence has the form expected by
    ``tree.chunk``.  It runs the given chunk parser on each sentence in
    the text, and scores the result.  It prints the final score
    (precision, recall, and f-measure); and reports the set of chunks
    that were missed and the set of chunks that were incorrect.  (At
    most 10 missing chunks and 10 incorrect chunks are reported).

    :param chunkparser: The chunkparser to be tested
    :type chunkparser: ChunkParserI
    :param text: The chunked tagged text that should be used for
        evaluation.
    :type text: str
    """
    from nltk import chunk
    from nltk.tree import Tree

    # Evaluate our chunk parser.
    chunkscore = chunk.ChunkScore()

    for sentence in text.split('\n'):
        print(sentence)
        sentence = sentence.strip()
        if not sentence:
            continue
        gold = chunk.tagstr2tree(sentence)
        tokens = gold.leaves()
        test = chunkparser.parse(Tree('S', tokens), trace=1)
        chunkscore.score(gold, test)
        print()

    print('/' + ('=' * 75) + '\\')
    print('Scoring', chunkparser)
    print(('-' * 77))
    print('Precision: %5.1f%%' % (chunkscore.precision() * 100), ' ' * 4, end=' ')
    print('Recall: %5.1f%%' % (chunkscore.recall() * 100), ' ' * 6, end=' ')
    print('F-Measure: %5.1f%%' % (chunkscore.f_measure() * 100))

    # Missed chunks.
    if chunkscore.missed():
        print('Missed:')
        missed = chunkscore.missed()
        for chunk in missed[:10]:
            print('  ', ' '.join(map(str, chunk)))
        if len(chunkscore.missed()) > 10:
            print('  ...')

    # Incorrect chunks.
    if chunkscore.incorrect():
        print('Incorrect:')
        incorrect = chunkscore.incorrect()
        for chunk in incorrect[:10]:
            print('  ', ' '.join(map(str, chunk)))
        if len(chunkscore.incorrect()) > 10:
            print('  ...')

    print('\\' + ('=' * 75) + '/')
    print()


def demo():
    """
    A demonstration for the ``RegexpChunkParser`` class.  A single text is
    parsed with four different chunk parsers, using a variety of rules
    and strategies.
    """

    from nltk import chunk, Tree

    text = """\
    [ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] ./.
    [ John/NNP ] saw/VBD [the/DT cats/NNS] [the/DT dog/NN] chased/VBD ./.
    [ John/NNP ] thinks/VBZ [ Mary/NN ] saw/VBD [ the/DT cat/NN ] sit/VB on/IN [ the/DT mat/NN ]./.
    """

    print('*' * 75)
    print('Evaluation text:')
    print(text)
    print('*' * 75)
    print()

    grammar = r"""
    NP:                   # NP stage
      {<DT>?<JJ>*<NN>}    # chunk determiners, adjectives and nouns
      {<NNP>+}            # chunk proper nouns
    """
    cp = chunk.RegexpParser(grammar)
    demo_eval(cp, text)

    grammar = r"""
    NP:
      {<.*>}              # start by chunking each tag
      }<[\.VI].*>+{       # unchunk any verbs, prepositions or periods
      <DT|JJ>{}<NN.*>     # merge det/adj with nouns
    """
    cp = chunk.RegexpParser(grammar)
    demo_eval(cp, text)

    grammar = r"""
    NP: {<DT>?<JJ>*<NN>}    # chunk determiners, adjectives and nouns
    VP: {<TO>?<VB.*>}       # VP = verb words
    """
    cp = chunk.RegexpParser(grammar)
    demo_eval(cp, text)

    grammar = r"""
    NP: {<.*>*}             # start by chunking everything
        }<[\.VI].*>+{       # chink any verbs, prepositions or periods
        <.*>}{<DT>          # separate on determiners
    PP: {<IN><NP>}          # PP = preposition + noun phrase
    VP: {<VB.*><NP|PP>*}    # VP = verb words + NPs and PPs
    """
    cp = chunk.RegexpParser(grammar)
    demo_eval(cp, text)

    # Evaluation

    from nltk.corpus import conll2000

    print()
    print("Demonstration of empty grammar:")

    cp = chunk.RegexpParser("")
    print(chunk.accuracy(cp, conll2000.chunked_sents('test.txt', chunk_types=('NP',))))

    print()
    print("Demonstration of accuracy evaluation using CoNLL tags:")

    grammar = r"""
    NP:
      {<.*>}              # start by chunking each tag
      }<[\.VI].*>+{       # unchunk any verbs, prepositions or periods
      <DT|JJ>{}<NN.*>     # merge det/adj with nouns
    """
    cp = chunk.RegexpParser(grammar)
    print(chunk.accuracy(cp, conll2000.chunked_sents('test.txt')[:5]))

    print()
    print("Demonstration of tagged token input")

    grammar = r"""
    NP: {<.*>*}             # start by chunking everything
        }<[\.VI].*>+{       # chink any verbs, prepositions or periods
        <.*>}{<DT>          # separate on determiners
    PP: {<IN><NP>}          # PP = preposition + noun phrase
    VP: {<VB.*><NP|PP>*}    # VP = verb words + NPs and PPs
    """
    cp = chunk.RegexpParser(grammar)
    print(
        cp.parse(
            [
                ("the", "DT"),
                ("little", "JJ"),
                ("cat", "NN"),
                ("sat", "VBD"),
                ("on", "IN"),
                ("the", "DT"),
                ("mat", "NN"),
                (".", "."),
            ]
        )
    )


if __name__ == '__main__':
    demo()

Reprinted from www.cnblogs.com/yangyang12138/p/12484163.html