Python Natural Language Processing: A Walkthrough of Common NLTK Operations

from nltk.book import * # load the example texts (text1..text9)
text1.concordance('monstrous') # search for a word in context
text1.similar('monstrous') # words that share contexts with 'monstrous'
text2.common_contexts(['monstrous','very']) # contexts shared by both words
text4.dispersion_plot(['America','citizen','democracy','freedom']) # where the words occur across the text
text3.generate() # generate text in the same style (may be ungrammatical)
fdist = FreqDist(text1) # count word frequencies
fdist.plot(50, cumulative=True) # cumulative plot of the 50 most frequent words (cumulative takes a bool)
fdist = FreqDist([len(w) for w in text1]) # count something else, e.g. word lengths
fdist.freq(3) # proportion of the samples with this value
fdist.max() # the sample (here: word length) with the highest count
sorted([w for w in set(text1) if w.endswith('ableness')]) # filter the vocabulary by a predicate
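To make the FreqDist API concrete, here is a minimal self-contained sketch (added for illustration, not from the original post) on a toy token list:

# Illustrative sketch: FreqDist on a tiny hand-made token list.
from nltk import FreqDist
toy = ['the', 'cat', 'sat', 'on', 'the', 'mat']
fd = FreqDist(toy)
print(fd['the'])          # 2 -- raw count
print(fd.freq('the'))     # 0.333... -- count divided by total tokens
print(fd.most_common(2))  # [('the', 2), ...] -- highest-count samples first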
from nltk.corpus import gutenberg # load the Gutenberg corpus
gutenberg.fileids() # list the file identifiers
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid)) # raw text as one string, whitespace included
    num_words = len(gutenberg.words(fileid)) # list of words []
    num_sents = len(gutenberg.sents(fileid)) # list of sentences, each itself a list of words [[],[]]
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)])) # deduplicated vocabulary
    print(int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid) # average word length, average sentence length, lexical diversity (average uses per word); must sit inside the loop
import nltk
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories() for word in brown.words(categories=genre)) # conditional frequency distribution: word counts per genre
cfd.tabulate(conditions=['adventure','fiction'], samples=['the','and','of','a'], cumulative=True) # the keyword is samples, and the samples must be items counted under each condition (words here, so range(10) would tabulate only zeros)
text = nltk.corpus.genesis.words('english-kjv.txt')
bigrams = nltk.bigrams(text) # generate bigrams (a lazy generator in NLTK 3)
cfd = nltk.ConditionalFreqDist(bigrams)
print(cfd['living']) # distribution of the words that follow 'living'
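The bigram distribution above is enough for simple text generation: start from a seed word and repeatedly emit the most frequent successor. A minimal sketch following the NLTK book's generate_model pattern:

# Sketch: greedy generation from the bigram ConditionalFreqDist above.
def generate_model(cfdist, word, num=15):
    for i in range(num):
        print(word, end=' ')
        word = cfdist[word].max()  # most frequent word seen after the current one

generate_model(cfd, 'living')

Because it always takes the single most likely successor, the output soon falls into a loop; sampling instead of taking max() would vary the text.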
from nltk.corpus import wordnet as wn
wn.synsets('motorcar') # look up the synsets (synonym sets) containing this word
wn.synset('car.n.01').lemma_names() # list the members of the synset (a method call in NLTK 3)
wn.synset('whale.n.02').min_depth() # depth of the synset in the hypernym hierarchy
wn.synset('whale.n.02').path_similarity(wn.synset('right_whale.n.01')) # similarity between two synsets
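path_similarity scores in (0, 1] based on the shortest path between the two synsets in the hypernym hierarchy (None if no path exists); closer relatives score higher. A small sketch, assuming the wn import above:

# Sketch: closer relatives in the hypernym tree get higher path_similarity.
right = wn.synset('right_whale.n.01')
minke = wn.synset('minke_whale.n.01')
novel = wn.synset('novel.n.01')
print(right.path_similarity(minke))  # relatively high: both are whales
print(right.path_similarity(novel))  # near zero: only distantly related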
raw = gutenberg.raw('melville-moby_dick.txt') # a raw string to tokenize (str(text1) would only yield the Text repr, not the text)
token = nltk.word_tokenize(raw) # tokenize
text = nltk.Text(token) # wrap as nltk.Text so the methods above become available
import re
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
[w for w in wordlist if re.search('ed$', w)] # keep only words ending in 'ed'
re.findall(r'^(.*?)(ing|ly|ious|ies|es|s|ment)$', 'processes') # note greedy vs. non-greedy matching with findall
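The greedy/non-greedy distinction is exactly what this pattern demonstrates; a quick comparison on 'processes':

# Sketch: greedy (.*) vs. non-greedy (.*?) stem extraction.
print(re.findall(r'^(.*)(ing|ly|ious|ies|es|s|ment)$', 'processes'))   # [('processe', 's')]  -- greedy stem grabs as much as possible
print(re.findall(r'^(.*?)(ing|ly|ious|ies|es|s|ment)$', 'processes'))  # [('process', 'es')]  -- non-greedy leaves the longer suffix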
porter = nltk.PorterStemmer() # stemming: lying -> lie, but it does not handle women
[porter.stem(t) for t in text]
wnl = nltk.WordNetLemmatizer() # lemmatization: women -> woman
[wnl.lemmatize(w) for w in text]
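A quick side-by-side on the two examples the comments mention, assuming the porter and wnl objects above:

# Sketch: stemmer vs. lemmatizer on the words named in the comments.
for w in ['lying', 'women']:
    print(w, '->', porter.stem(w), '|', wnl.lemmatize(w))
# lying -> lie | lying    (the lemmatizer defaults to nouns; pass pos='v' to get lie)
# women -> women | woman  (only the lemmatizer knows the irregular plural)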
def is_content(word):
    return word.lower() not in ['a', 'text'] # note the (): a bare word.lower is a method object and always truthy
list(filter(is_content, text)) # filter out stop words (filter returns a lazy iterator in Python 3)
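In practice the hard-coded list is usually replaced by NLTK's own stop-word corpus; a sketch that assumes nltk.download('stopwords') has been run:

# Sketch: filtering with NLTK's built-in English stop-word list.
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
content = [w for w in text if w.lower() not in stop]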
nltk.pos_tag(nltk.word_tokenize('I love doing homework .')) # part-of-speech tagging
from collections import defaultdict # standard-library defaultdict (nltk.defaultdict is NLTK 2 only)
freq = defaultdict(int)
freq['b'] # missing keys default to 0
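A defaultdict keeps hand-rolled counting loops short, since missing keys need no special case. A small sketch that tallies the POS tags from the sentence above:

# Sketch: counting POS tags with a defaultdict.
counts = defaultdict(int)
for (word, tag) in nltk.pos_tag(nltk.word_tokenize('I love doing homework .')):
    counts[tag] += 1
print(dict(counts))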
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news') # sentences that are already tagged
brown_sents = brown.sents(categories='news') # untagged data
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
bigram_tagger = nltk.BigramTagger(train_sents) # 2-gram tagger
bigram_tagger.tag(brown_sents[2007]) # tag sentence 2007 (tag is a method, not a subscript)
bigram_tagger.evaluate(test_sents) # score against the held-out tags (renamed accuracy() in newer NLTK)
bigram_tagger = nltk.BigramTagger(train_sents, backoff=nltk.DefaultTagger('NN')) # 2-gram tagger with a backoff for unseen contexts
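Backoff taggers are normally chained rather than used singly. A sketch of the classic cascade from the NLTK book pattern: default tag, then unigram, then bigram:

# Sketch: default -> unigram -> bigram backoff cascade.
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
t2.evaluate(test_sents)  # usually scores well above the bare BigramTagger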
from nltk.corpus import names
import random
labeled_names = ([(name, 'male') for name in names.words('male.txt')]
                 + [(name, 'female') for name in names.words('female.txt')]) # renamed so the list does not shadow the names corpus
random.shuffle(labeled_names)
def gen_feature(word):
    return {'last_letter': word[-1]} # a single feature: the name's final letter
featuresets = [(gen_feature(n), g) for (n, g) in labeled_names]
trainset, testset = featuresets[500:], featuresets[:500]
import nltk
classifier = nltk.NaiveBayesClassifier.train(trainset)
classifier.classify(gen_feature('Neo')) # classify a new name
classifier.show_most_informative_features(5) # show the 5 most informative features
nltk.classify.accuracy(classifier, testset) # evaluate on the held-out test set
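To see where the last-letter feature falls short, a short error-analysis sketch over the test portion (assumes the objects above):

# Sketch: print a few test names the classifier misjudges.
errors = [(g, classifier.classify(gen_feature(n)), n)
          for (n, g) in labeled_names[:500]
          if classifier.classify(gen_feature(n)) != g]
for gold, guess, name in errors[:5]:
    print('gold=%-8s guess=%-8s name=%s' % (gold, guess, name))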