Python Natural Language Processing - Reading Notes 7

# -*- coding:utf-8 -*-
# __author__ = 'lipzhang'

# Categorizing and Tagging Words

# Using a part-of-speech tagger
import nltk
# text = nltk.word_tokenize("And now for something completely different")
# print(nltk.pos_tag(text))  # here we see that 'and' is CC, a coordinating conjunction; 'now' and 'completely' are RB, adverbs; 'for' is IN, a preposition; 'something' is NN, a noun; and 'different' is JJ, an adjective.
# nltk.help.upenn_tagset('RB')
#
# text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
# text.similar('woman')  # text.similar() takes a word w, finds all contexts w1 w w2, then finds every word w' that occurs in the same contexts, i.e. w1 w' w2. The method prints its results itself, so wrapping it in print() only adds a trailing None.

# Tagged corpora
# Representing tagged tokens
# tagged_token = nltk.tag.str2tuple('fly/NN')
# print( tagged_token)
# sent = '''  The/AT grand/JJ jury/NN commented/VBD on/IN a/AT number/NN of/IN  other/AP topics/NNS ,/, AMONG/IN them/PPO the/AT Atlanta/NP and/CC  Fulton/NP-tl County/NN-tl purchasing/VBG departments/NNS which/WDT it/PP  said/VBD ``/`` ARE/BER well/QL operated/VBN and/CC follow/VB generally/R  accepted/VBN practices/NNS which/WDT inure/VB to/IN the/AT best/JJT  interest/NN of/IN both/ABX governments/NNS ''/'' ./. '''
# print([nltk.tag.str2tuple(t) for t in sent.split()])  # we can construct a list of tagged tokens directly from a string: first split the string into tokens so we can access the individual word/tag strings, then convert each one into a tuple.
# Reading tagged corpora
# print(nltk.corpus.brown.tagged_words(tagset = 'universal'))
# The simplified part-of-speech tagset
# Tag   Meaning             Examples
# ADJ   adjective           new, good, high, special, big, local
# ADV   adverb              really, already, still, early, now
# CNJ   conjunction         and, or, but, if, while, although
# DET   determiner          the, a, some, most, every, no
# EX    existential         there, there's
# FW    foreign word        dolce, ersatz, esprit, quo, maitre
# MOD   modal verb          will, can, would, may, must, should
# N     noun                year, home, costs, time, education
# NP    proper noun         Alison, Africa, April, Washington
# NUM   number              twenty-four, fourth, 1991, 14:24
# PRO   pronoun             he, their, her, its, my, I, us
# P     preposition         on, of, at, with, by, into, under
# TO    the word to         to
# UH    interjection        ah, bang, ha, whee, hmpf, oops
# V     verb                is, has, get, do, make, see, run
# VD    past tense          said, took, told, made, asked
# VG    present participle  making, going, playing, working
# VN    past participle     given, taken, begun, sung
# WH    wh determiner       who, which, when, what, where, how
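# As a minimal sketch of how these simplified tags are distributed (assuming the Brown corpus is
# available via nltk.download('brown')), we can map the news category to the universal tagset and
# count the tags:
# brown_news_tagged = nltk.corpus.brown.tagged_words(categories='news', tagset='universal')
# tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
# print(tag_fd.most_common())  # nouns come first, ordered by frequency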
from nltk.corpus import brown
# Nouns: generally refer to people, places, things, or concepts
# Verbs: describe events and actions
# Adjectives and Adverbs: adjectives describe nouns, adverbs describe verbs
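# As a small illustration of adverb behaviour (a sketch, assuming the Brown 'learned' category is
# available), we can check which parts of speech most often follow the word 'often':
# brown_lrnd_tagged = brown.tagged_words(categories='learned', tagset='universal')
# tags = [b[1] for (a, b) in nltk.bigrams(brown_lrnd_tagged) if a[0].lower() == 'often']
# nltk.FreqDist(tags).tabulate()  # 'often' is followed mostly by verbs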
# def findtags(tag_prefix, tagged_text):  # find the most frequent words for each tag that starts with tag_prefix
#     cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in tagged_text if tag.startswith(tag_prefix))
#     return dict((tag, [w for w, _ in cfd[tag].most_common(5)]) for tag in cfd.conditions())  # keys() is not frequency-ordered in NLTK 3, so use most_common()
# tagdict = findtags('NN', nltk.corpus.brown.tagged_words(categories='news'))
# for tag in sorted(tagdict):
#     print(tag, tagdict[tag])
# def process(sentence):  # use POS tags to find three-word phrases of the form verb + 'to' + verb
#     for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
#         if (t1.startswith('V') and t2 == 'TO' and t3.startswith('V')):
#             print(w1, w2, w3)
# for tagged_sent in brown.tagged_sents():
#     process(tagged_sent)  # process() prints its matches and returns None, so there is no point in printing its return value

# Mapping words to properties using Python dictionaries
# In POS tagging, every word is paired with a tag, so we naturally need a mapping from words to properties. Python's collections module provides defaultdict (older NLTK versions exposed an nltk.defaultdict alias), which returns a default value instead of raising an exception when you look up a key that is not in the dict. Both keys and values can be arbitrarily complex.
# from collections import defaultdict
# counts = defaultdict(int)
# for (word, tag) in brown.tagged_words(categories='news'):
#     counts[tag] += 1
# print(counts['NN'])
# print(list(counts))
# from operator import itemgetter  # incrementally update the dictionary, then sort it by value.
# print(sorted(counts.items(), key=itemgetter(1), reverse=True))
# print([t for t, c in sorted(counts.items(), key=itemgetter(1), reverse=True)])
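# Keys and values can indeed be arbitrarily complex. As a minimal sketch, a defaultdict(list)
# inverts the mapping, collecting for each tag the words that carry it:
# words_by_tag = defaultdict(list)
# for (word, tag) in brown.tagged_words(categories='news'):
#     words_by_tag[tag].append(word)
# print(words_by_tag['VBD'][:10])  # the first ten past-tense verbs seen in the news category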
# Python dictionary methods: a summary of commonly used methods and dictionary idioms
# Example                          Description
# d = {}                           create an empty dictionary and assign it to d
# d[key] = value                   assign a value to a given dictionary key
# d.keys()                         the keys of the dictionary
# list(d)                          the keys of the dictionary as a list
# sorted(d)                        the keys of the dictionary, sorted
# key in d                         test whether a particular key is in the dictionary
# for key in d                     iterate over the keys of the dictionary
# d.values()                       the values of the dictionary
# dict([(k1,v1), (k2,v2), ...])    create a dictionary from a list of key-value pairs
# d1.update(d2)                    add all items from d2 to d1
# defaultdict(int)                 a dictionary whose default value is 0
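# A tiny demonstration of the idioms above (the words and tags are arbitrary examples):
# d = {}
# d['colorless'] = 'ADJ'
# d.update({'ideas': 'N', 'sleep': 'V'})
# print(sorted(d))         # ['colorless', 'ideas', 'sleep']
# print('sleep' in d)      # True
# print(list(d.values()))  # ['ADJ', 'N', 'V']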

# Automatic tagging
# The default tagger
# brown_tagged_sents = brown.tagged_sents(categories='news')
# brown_sents = brown.sents(categories='news')
# raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
# tokens = nltk.word_tokenize(raw)
# default_tagger = nltk.DefaultTagger('NN')  # a tagger that tags every word as NN
# print(default_tagger.tag(tokens))
# print(default_tagger.evaluate(brown_tagged_sents))
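# Why choose 'NN' as the default? A quick check (a sketch using the news category) shows that NN
# is the single most frequent tag, so it is the best one-tag guess:
# tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
# print(nltk.FreqDist(tags).max())  # 'NN'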
# # The regular-expression tagger
# patterns = [
#      (r'.*ing$', 'VBG'),               # gerunds
#      (r'.*ed$', 'VBD'),                # simple past
#      (r'.*es$', 'VBZ'),                # 3rd singular present
#      (r'.*ould$', 'MD'),               # modals
#      (r'.*\'s$', 'NN$'),               # possessive nouns
#      (r'.*s$', 'NNS'),                 # plural nouns
#      (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers (the dot is escaped so it matches a literal decimal point)
#      (r'.*', 'NN')  # nouns (default)
#  ]
# regexp_tagger = nltk.RegexpTagger(patterns)
# print(regexp_tagger.tag(brown_sents[3]))
# print(regexp_tagger.evaluate(brown_tagged_sents))
# # The lookup tagger: find the 100 most frequent words and store their most likely tag, then use this information as the model for a "lookup tagger" (nltk.UnigramTagger):
# fd = nltk.FreqDist(brown.words(categories='news'))
# cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
# most_freq_words = [w for w, _ in fd.most_common(100)]  # fd.keys() is not frequency-ordered in NLTK 3, so use most_common()
# likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
# baseline_tagger = nltk.UnigramTagger(model=likely_tags, backoff=nltk.DefaultTagger('NN'))  # consult the lookup table first; if it cannot decide a token's tag, fall back to the default tagger. This process is called backoff, and it is implemented by passing the default tagger as the backoff argument of the lookup tagger.
# print(baseline_tagger.evaluate(brown_tagged_sents))
# sent = brown.sents(categories='news')[3]
# print(baseline_tagger.tag(sent))
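# The same backoff idea extends to trained taggers. As a minimal sketch (not part of the original
# notes), a UnigramTagger trained on the tagged sentences can itself back off to the NN default
# tagger for words it never saw during training:
# unigram_tagger = nltk.UnigramTagger(brown_tagged_sents, backoff=nltk.DefaultTagger('NN'))
# print(unigram_tagger.evaluate(brown_tagged_sents))  # evaluating on the training data overstates accuracy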
