Useful code for doing natural language processing in Python; a few errors are present (marked below)

from __future__ import division
from nltk.book import *
# Find a word in text1 and display it in context (concordance)
text1.concordance("monstrous")
# Find words used in similar contexts
text1.similar("monstrous")
# Find contexts shared by the listed words
text1.common_contexts(["monstrous","very"])
# Draw a dispersion plot of the given words
text4.dispersion_plot(["citizens","democracy","freedom"])
# Raises an error in some NLTK versions (generate() was removed in early 3.x releases)
text3.generate()
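# A minimal sketch assuming NLTK >= 3.4, where generate() was reinstated and
# accepts an optional length argument:
text3.generate(length=20)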

# Average number of occurrences of each word (total tokens / distinct word types)
print(len(text3)/len(set(text3)))
# Count occurrences of a given word in the text
print(text3.count("smote"))

# Indexing into a text
print(text3.index("smote"))
print(text3[9073])

# Slicing a text
print(text3[123:178])

# Strings
name="python"
a=" ".join(['hello','python'])
print(a)
# split() returns a new list; it does not modify a in place
b=a.split(" ")
print(b)

# Frequency statistics
fdist1=FreqDist(text1)
print(fdist1)
vocabulary1=fdist1.keys()
print(vocabulary1)
fdist1.plot(50,cumulative=True)
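# In NLTK 3, keys() is not ordered by frequency; most_common() returns the top
# words with their counts, and hapaxes() lists words that occur only once
print(fdist1.most_common(10))
print(fdist1.hapaxes()[:20])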

# Fine-grained selection of words
V=set(text1)
fdist1=FreqDist(text1)
long_words=[w for w in V if len(w)>15 and fdist1[w]>2]
print(long_words)

# Collocations and bigrams
print(list(bigrams(['more','is','than','done'])))
# collocations() prints its output itself and returns None, so print() is not needed
text1.collocations()
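# A sketch of computing collocations directly with a collocation finder,
# ranking bigrams of text1 by pointwise mutual information (PMI)
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
bigram_measures=BigramAssocMeasures()
finder=BigramCollocationFinder.from_words(text1)
finder.apply_freq_filter(3)   # ignore bigrams that occur fewer than 3 times
print(finder.nbest(bigram_measures.pmi,10))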

import nltk
# Simple chatbots (human-computer dialogue)
nltk.chat.chatbots()


import nltk
# Capture the user's input
# raw_input() no longer exists in Python 3; use input() instead
s=input("enter some text")
print("you typed",len(nltk.word_tokenize(s)),"words")

# Regular expressions
import re
import nltk

wordlist=[w for w in nltk.corpus.words.words('en') if w.islower()]
# print(wordlist)
# Find words ending in 'ed'
print([w for w in wordlist if re.search('ed$',w)])
# ^ matches the start of the string and $ matches the end
print([w for w in wordlist if re.search('^..j..t..$',w)])
# Ranges and closures: each character class matches the letters on one phone keypad key (textonyms)
print([w for w in wordlist if re.search('^[ghi][mno][jlk][def]$',w)])

# Extracting character chunks with regular expressions
word="asdnsjndnsudndjkdn"
print(re.findall(r'[aeiou]',word))
print(len(re.findall(r'[aeiou]',word)))
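# findall() can also pull out matched groups; a small sketch extracting a word's
# suffix (the example word and suffix list are just illustrative)
print(re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$','processing'))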


# Wrapping text
from textwrap import fill
saying=['After','all','is','said','and','done']
format='%s(%d)'
pieces=[format % (word,len(word)) for word in saying]
output=' '.join(pieces)
wrapped=fill(output)
print(wrapped)


# Visualizing relations between words (a WordNet hyponym graph)
import networkx as nx
import matplotlib.pyplot as plt

from nltk.corpus import wordnet as wn
# In NLTK 3, Synset.name is a method and must be called as node.name()
def traverse(graph,start,node):
    graph.depth[node.name()]=node.shortest_path_distance(start)
    for child in node.hyponyms():
        graph.add_edge(node.name(),child.name())
        traverse(graph,start,child)

def hyponym_graph(start):
    G=nx.Graph()
    G.depth={}
    traverse(G,start,start)
    return G

def graph_draw(graph):
    nx.draw(graph,
            node_size=[16*graph.degree(n) for n in graph],
            node_color=[graph.depth[n] for n in graph],
            with_labels=False)
    plt.show()


dog=wn.synset('dog.n.01')
graph=hyponym_graph(dog)
graph_draw(graph)

# Unigram tagging
# Training
import nltk

from nltk.corpus import brown
brown_tagged_sents=brown.tagged_sents(categories='news')
brown_sents=brown.sents(categories='news')
unigram_tagger=nltk.UnigramTagger(brown_tagged_sents)
print(unigram_tagger.tag(brown_sents[2007]))
print(unigram_tagger.evaluate(brown_tagged_sents))

# Splitting into separate training and test sets
size=int(len(brown_tagged_sents)*0.9)
train_sents=brown_tagged_sents[:size]
test_sents=brown_tagged_sents[size:]
unigram_tagger=nltk.UnigramTagger(train_sents)
print(unigram_tagger.evaluate(test_sents))

# Combining taggers with backoff
t0=nltk.DefaultTagger('NN')
t1=nltk.UnigramTagger(train_sents,backoff=t0)
t2=nltk.BigramTagger(train_sents,backoff=t1)
print(t2.evaluate(test_sents))
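# The backoff chain can be extended one more level with a trigram tagger
t3=nltk.TrigramTagger(train_sents,backoff=t2)
print(t3.evaluate(test_sents))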

# Storing taggers: in Python 3 use pickle (cPickle is gone)
from pickle import dump
output=open('t2.pkl','wb')
dump(t2,output,-1)
output.close()


from pickle import load
input=open('t2.pkl','rb')
tagger=load(input)
input.close()

text="I'm a small girl in a big world"
tokens=text.split()
print(tagger.tag(tokens))

# Gender identification
import nltk
nltk.download('names')
# Feature extractor: the dictionary it returns is the feature set
def gender_features(word):
    return {'last_letter':word[-1]}
print(gender_features('Shrek'))

# Build a list of example names paired with their class labels
from nltk.corpus import names
import random
names=([(name,'male') for name in names.words('male.txt')]+
       [(name,'female') for name in names.words('female.txt')])
random.shuffle(names)

# Run the feature extractor over the name data, split into training and test sets, and train a naive Bayes classifier
featuresets=[(gender_features(n),g) for (n,g) in names]
train_set,test_set=featuresets[500:],featuresets[:500]
classifier=nltk.NaiveBayesClassifier.train(train_set)

print(classifier.classify(gender_features('lucy')))
# Show the most informative features
classifier.show_most_informative_features(5)
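# Score the classifier on the held-out test set
print(nltk.classify.accuracy(classifier,test_set))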

# Document classification
# Using the movie reviews corpus
import random
import nltk
from nltk.corpus import movie_reviews
documents=[(list(movie_reviews.words(fileid)),category)
           for category in movie_reviews.categories()
           for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)
# Build a list of the 2000 most frequent words
all_words=nltk.FreqDist(w.lower() for w in movie_reviews.words())
# In NLTK 3, FreqDist.keys() is not sorted by frequency, so use most_common()
word_features=[w for (w,_) in all_words.most_common(2000)]

# Define a feature extractor for documents
def document_features(document):
    document_words=set(document)
    features={}
    for word in word_features:
        features['contains(%s)' %word]=(word in document_words)
    return features
# print(document_features(movie_reviews.words('pos/cv957_8737.txt')))
# Train and test a classifier for document classification
featuresets=[(document_features(d),c) for (d,c) in documents]
train_set,test_set=featuresets[100:],featuresets[:100]
classifier=nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier,test_set))
classifier.show_most_informative_features(5)

# Tagging words based on their context
def pos_features(sentence,i):
    features={"suffix(1)":sentence[i][-1:],
              "suffix(2)": sentence[i][-2:],
              "suffix(3)": sentence[i][-3:]
              }
    if i==0:
        features["prev-word"]="<START>"
    else:
        features["prev-word"]=sentence[i-1]
    return features
# The sentence must be a list of tokens, not a raw string
print(pos_features("I'm a small girl in a big world".split(),4))

# Part-of-speech tagging with a consecutive (sequence) classifier
import nltk


def pos_features(sentence, i, history):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]
                }
    if i == 0:
        features["prev-word"] = "<START>"
        features["prev-tag"] = "<START>"
    else:
        features["prev-word"] = sentence[i - 1]
        features["prev-tag"] = history[i - 1]
    return features


class ConsecutivePosTagger(nltk.TaggerI):
    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return list(zip(sentence, history))
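
# A sketch of training and scoring the consecutive tagger, reusing the Brown
# train/test split created above (training may take a while)
tagger = ConsecutivePosTagger(train_sents)
print(tagger.evaluate(test_sents))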


# Noun phrase chunking with a consecutive classifier
import nltk
class ConsecutiveNPChunkTagger(nltk.TaggerI):
    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                # The original code raised an error here; 'rain_set' below was a typo for train_set
                featureset = npchunk_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        # The 'megam' algorithm requires the external megam binary; if it is not
        # installed, drop the algorithm argument or train a NaiveBayesClassifier instead
        self.classifier = nltk.MaxentClassifier.train(train_set, algorithm='megam', trace=0)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return list(zip(sentence, history))

class ConsecutiveNPChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        tagged_sents = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)] for sent in train_sents]
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)
    def parse(self, sentence):
        tagged_sents = self.tagger.tag(sentence)
        conlltags = [(w, t, c) for ((w, t), c) in tagged_sents]
        return nltk.chunk.conlltags2tree(conlltags)

def npchunk_features(sentence,i,history):
    word,pos=sentence[i]
    return {"pos":pos}
# The chunker needs chunked sentences; load them from the CoNLL-2000 corpus
from nltk.corpus import conll2000
train_sents=conll2000.chunked_sents('train.txt',chunk_types=['NP'])
test_sents=conll2000.chunked_sents('test.txt',chunk_types=['NP'])
chunker=ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

# Named entity recognition
import nltk
nltk.download('maxent_ne_chunker')
nltk.download('words')
sent=nltk.corpus.treebank.tagged_sents()[22]
print(nltk.ne_chunk(sent,binary=True))
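# Without binary=True, ne_chunk labels entity types such as PERSON, ORGANIZATION and GPE
print(nltk.ne_chunk(sent))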

# Relation extraction
import nltk
import re
nltk.download('ieer')
IN=re.compile(r'.*\bin\b(?!\b.+ing)')
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    for rel in nltk.sem.extract_rels('ORG','LOC',doc,corpus='ieer',pattern=IN):
        # The original code failed here: relextract is a module, not a function;
        # rtuple() formats a relation tuple for printing
        print(nltk.sem.rtuple(rel))




Reposted from blog.csdn.net/t_zht/article/details/82119835