Useful code snippets from Python Text Processing with NLTK 2.0 Cookbook

If anyone has this book, could you share the latest edition?

para="hello world. it's good to see you. thanks for buying this book"
from nltk.tokenize import sent_tokenize
print(sent_tokenize(para))

print("----------------------------")

from nltk.tokenize import word_tokenize
print(word_tokenize('hello world'))

print("----------------------------")

from nltk.tokenize import word_tokenize
print(word_tokenize('你好,我是 自然 语言 处理'))
# Without the spaces, word_tokenize cannot split this string; in other words,
# whitespace is what drives the segmentation here.
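# A quick check of that claim (hypothetical unspaced input): with no whitespace,
# the whole string comes back as a single token.
print(word_tokenize('你好我是自然语言处理'))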

print("----------------------------")
import nltk
text="hello, this is my world"
pattern=r"\w+|[^\w\s]+"
# r: raw string literal; double quotes "" are interchangeable with single quotes '';
# \w matches a word character, equivalent to the set [a-zA-Z0-9_]; + means one or more
#   times, the same as {1,}, i.e. c+ and c{1,} are equivalent;
# "|" is alternation, the regex "or"; [...] is a character class matching any one
#   character from the set, e.g. a[bcd]e matches abe, ace, and ade; inside a class a
#   leading ^ negates it, so [^\w\s] matches anything that is neither a word character
#   nor whitespace; \s matches a single whitespace character, equivalent to [ \f\n\r\t\v].
print(nltk.tokenize.regexp_tokenize(text,pattern))
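# For reference, the same pattern can be wrapped in a reusable tokenizer object;
# this is just the class form of the same API.
from nltk.tokenize import RegexpTokenizer
tokenizer=RegexpTokenizer(r"\w+|[^\w\s]+")
print(tokenizer.tokenize(text))   # same output as regexp_tokenize above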

print("--------------以上均为切词的手段--------------")

from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import webtext
text=webtext.raw('overheard.txt')
# train a Punkt sentence tokenizer on the raw text itself
sent_tokenizer=PunktSentenceTokenizer(text)

sents1=sent_tokenizer.tokenize(text)
print(sents1[0])

# compare with NLTK's default pre-trained sentence tokenizer
from nltk.tokenize import sent_tokenize
sents2=sent_tokenize(text)
print(sents2[1])

print("--------------去除停用词--------------")
from nltk.corpus import stopwords
english_stops=set(stopwords.words('english'))
words=["cant","is","a","constraction"]
sets=[]
for word in words:
    if word not in english_stops:
        sets.append(word)
print(sets)
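# The loop above is often written as a list comprehension; a minimal equivalent:
print([word for word in words if word not in english_stops])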

print("--------------在WORDnet上找同义词--------------")
# way1:
from nltk.corpus import wordnet
syn=wordnet.synsets('cookbook')[0]
print(syn.name())
print(syn.definition())

# way 2: in NLTK 2.x, name and definition were plain attributes; under NLTK 3.x
# these lines print bound-method objects instead of the strings
print(syn.name)
print(syn.definition)


print("----------------------------")

from nltk.corpus import wordnet as wn
motorcar=wn.synset('car.n.01')
types_of_motorcar=motorcar.hyponyms()
print(types_of_motorcar)
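# hyponyms() returns Synset objects; to see them as plain words, collect the
# lemma names (a sketch reusing types_of_motorcar from above):
print(sorted(lemma.name() for synset in types_of_motorcar for lemma in synset.lemmas()))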

print("-----------------部分与整体的关系-----------")

print(wn.synset('computer.n.01').part_meronyms())

print("-------------反义词关系---------------")
print(wn.lemma('good.a.01.good').antonyms())

print("---------查看词汇关系和同义词集上定义的其他方法-------------------")
print(dir(wn.synset('beautiful.a.01')))


print("------------pos----------------")
syn=wordnet.synsets('hello')[0]
print(syn.pos())

print("------------查看复数形式和同义词----------------")
print(wn.synset('car.n.01').lemma_names())

print("------------计算同义词的相似度----------------")
# way1: path_similarity  基于上位词层次结构中相互连接的概念之间的最短路径,
# 其值为0-1之间,如果没有路径返回-1
right=wn.synset('right_whale.n.01')
minke=wn.synset('minke_whale.n.01')
print(right.path_similarity(minke))
# way 2: wup_similarity (Wu-Palmer) scores by where the synsets sit in the
# hypernym tree relative to their most specific common ancestor
print(right.wup_similarity(minke))
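# path_similarity is just 1/(shortest_path_length+1); a quick check with
# shortest_path_distance (assumes the two synsets are connected):
d=right.shortest_path_distance(minke)
print(d,1/(d+1))   # 1/(d+1) should match the path_similarity value above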


print("------------相对于n-gram----------------")
from nltk import bigrams
a=r"I'm a girl"
tokens=a.split()
# bigrams() returns a generator here, so wrap it in list() or nothing useful prints
print(list(bigrams(tokens)))
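# trigrams and arbitrary n-grams follow the same pattern and are also generators:
from nltk import trigrams, ngrams
print(list(trigrams(tokens)))
print(list(ngrams(tokens,2)))   # ngrams(tokens,2) yields the same pairs as bigrams(tokens)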


print("----------------词频统计--------------------")

from nltk import FreqDist
# FreqDist over a raw string counts characters, and spaces count too
fdist1=FreqDist("a ni n nn n t t m")
print(fdist1)
print(fdist1.most_common(3))

import matplotlib
# fdist1.plot(3,cumulative=True)   # uncomment to draw a cumulative plot of the top 3 samples
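# To count words instead of characters, tokenize first; a minimal sketch with a
# made-up sentence:
fdist2=FreqDist(word_tokenize("the cat sat on the mat"))
print(fdist2.most_common(2))   # 'the' occurs twice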


print("----------------词干词语--------------------")
# 单个词干 Poter是一种词干提取的算法
from nltk.stem import PorterStemmer
stemmer=PorterStemmer()
print(stemmer.stem('coding'))

# stemming several words at once
verbs=['appears', 'appear', 'appeared', 'calling', 'called']
stems=[]
for verb in verbs:
    stemmed_verb=stemmer.stem(verb)
    stems.append(stemmed_verb)
print(sorted((set(stems))))
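# Porter is not the only stemmer in NLTK: Lancaster is more aggressive and
# Snowball is a common middle ground. A quick comparison sketch:
from nltk.stem import LancasterStemmer, SnowballStemmer
lancaster=LancasterStemmer()
snowball=SnowballStemmer('english')
print(lancaster.stem('calling'),snowball.stem('calling'))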

print("----------------词形还原-------------------")
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
print(lemmatizer.lemmatize('coding'))
print(lemmatizer.lemmatize('codes'))
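# lemmatize() treats its input as a noun unless told otherwise, which is why
# 'coding' came back unchanged; pass pos='v' to lemmatize it as a verb:
print(lemmatizer.lemmatize('coding',pos='v'))   # -> 'code'
print(lemmatizer.lemmatize('codes',pos='v'))    # -> 'code'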


print("----------------利用正则表达式进行词语替换词语--------------------")

import re
replacement_patterns = [
(r'won\'t', 'will not'),
(r'can\'t', 'cannot'),
(r'i\'m', 'i am'),
(r'ain\'t', 'is not'),
(r'(\w+)\'ll', r'\g<1> will'),
(r'(\w+)n\'t', r'\g<1> not'),
(r'(\w+)\'ve', r'\g<1> have'),
(r'(\w+)\'s', r'\g<1> is'),
(r'(\w+)\'re', r'\g<1> are'),
(r'(\w+)\'d', r'\g<1> would')
]

class RegexReplacer(object):
    def __init__(self,patterns=replacement_patterns):
        # precompile the regexes once so replace() can be called repeatedly
        self.patterns=[(re.compile(regex),repl) for (regex,repl) in patterns]

    def replace(self,text):
        # apply each (pattern, replacement) pair to the text in turn
        s=text
        for (pattern,repl) in self.patterns:
            s=re.sub(pattern,repl,s)
        return s

replacer=RegexReplacer()
print(replacer.replace("You're the world, I'm a girl"))
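# In practice the replacer runs before tokenization so contractions are expanded
# first; note the patterns above are lowercase-only, so "I'm" is left alone
# (compile with re.IGNORECASE to catch it too). A minimal sketch:
print(word_tokenize(replacer.replace("You're the world, I'm a girl")))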

print("----------------获取语料-------------------")
# 语料库的文件名,平均字长,平均句长,每个词平均出现的次数
from nltk.corpus import gutenberg
for filename in gutenberg.fileids():
    r=gutenberg.raw(filename)
    w=gutenberg.words(filename)
    s=gutenberg.sents(filename)
    v=set(w)
    print(filename,len(r)/len(w),len(w)/len(s),len(w)/len(v))


# assumes a local hello.txt exists; a with-block closes the file automatically
with open('hello.txt') as f:
    print(f.read())

print("----------------建立语料库,并进行检索-------------------")
# step 1: point a PlaintextCorpusReader at a local directory
corpus_root='E:/JustForNLP/nltkEx'
from nltk.corpus import PlaintextCorpusReader
wordlist=PlaintextCorpusReader(corpus_root,'walden.txt')
print(wordlist.fileids())

wordlists=PlaintextCorpusReader(corpus_root,'.*')
print(wordlists.fileids())
import nltk
# step 2: tokenize the raw text and wrap it in an nltk.Text for searching
n=nltk.word_tokenize(wordlists.raw(fileids="walden.txt"))
complete_Walden=nltk.Text(n)
complete_Walden.concordance("walden")   # concordance() prints its matches itself (and returns None)

print("----------------获取网络文本-------------------")
from urllib.request import urlopen
url='https://blog.csdn.net/u011001084/article/details/78980299'
html=urlopen(url).read()
print(html[:20])
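# urlopen() returns raw bytes, so decode them before handing the text to NLTK
# (a minimal sketch; assumes the page is UTF-8 encoded):
raw=html.decode('utf-8',errors='ignore')
print(raw[:60])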


print("----------------tag-------------------")
import nltk
nltk.download('averaged_perceptron_tagger')

text=nltk.word_tokenize("I'm a small girl but the world is big")
print(nltk.pos_tag(text))
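# The default tags are Penn Treebank codes (nltk.help.upenn_tagset('NN') describes
# one); mapping to the coarser universal tagset is often easier to read:
nltk.download('universal_tagset')
print(nltk.pos_tag(text,tagset='universal'))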

Reposted from blog.csdn.net/t_zht/article/details/82025095