Reading and Processing Raw English Text


       Note: the code below runs under Python 3. Python 3 differs from Python 2 in a number of details, so readers should keep this in mind. This post is mostly code, with detailed comments inline. Related articles will appear in my personal blog column "Python Natural Language Processing"; you are welcome to follow it.

1. Downloading a txt document from the web

import nltk, re, pprint
from nltk import word_tokenize
from urllib.request import urlopen

# Download a plain-text document online
url = "http://www.gutenberg.org/files/2553/2553.txt"  # URL of the file
response = urlopen(url)  # fetch the file at the URL
raw = response.read().decode('utf8')  # read the response and decode it
print(type(raw))  # type of the text
print(len(raw))  # length of the text
print(raw[:10])  # first 10 characters

<class 'str'>
653642
The Projec
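If the script is run repeatedly, it can help to cache the download locally. A minimal sketch, not from the original post (the filename jeanne_darc.txt is a made-up example):

import os
if not os.path.exists('jeanne_darc.txt'):
    with open('jeanne_darc.txt', 'w', encoding='utf8') as f:
        f.write(raw)  # save the decoded text so later runs can read it from disk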
# Tokenization
tokens = word_tokenize(raw)
print(type(tokens))
print(len(tokens))
print(tokens[:10])

<class 'list'>
131869
['The', 'Project', 'Gutenberg', 'EBook', 'of', 'Jeanne', "d'Arc", ',', 'by', 'Mrs.']
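If word_tokenize raises a LookupError, the Punkt tokenizer models are missing and need a one-time download (this setup step is not shown in the original post):

nltk.download('punkt')  # one-time download of the Punkt tokenizer models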
# Create an nltk.Text object
text = nltk.Text(tokens)
print(type(text))
print(text[1024:1062])
print(text.collocations())  # collocations() prints frequent collocations itself and returns None (hence the None below)

<class 'nltk.text.Text'>
['century', '.', 'A', 'strong', 'and', 'splendid', 'kingdom', ',', 'to', 'which', 'in', 'early', 'ages', 'one', 'great', 'man', 'had', 'given', 'the', 'force', 'and', 'supremacy', 'of', 'a', 'united', 'nation', ',', 'had', 'fallen', 'into', 'a', 'disintegration', 'which', 'seems', 'almost', 'incredible', 'when', 'regarded']
Project Gutenberg-tm; St. Catherine; would seem; Project Gutenberg;
St. Margaret; Frere Isambard; St. Michael; Literary Archive; St.
Denis; Gutenberg-tm electronic; Archive Foundation; electronic works;
Gutenberg Literary; fifteenth century; United States; Church militant;
Holy Father; set forth; fifteen days; Jacques d'Arc
None
print(raw.find("Frere Isambard"))  # index of the first occurrence

415623


print(raw.rfind("fifteen days"))  # index of the last occurrence

502248


raw = raw[415623:502248]  # keep only the text between the two indices found above
raw

Out[14]: 'Frere Isambard, who was\r\nthe person in question, speaks at a later period he tells us that "the\r\nquestions put to Jeanne were too difficult, subtle, and dangerous, so\r\nthat the great clerks and learned men who were present scarcely would\r\nhave known 
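The hard-coded offsets 415623 and 502248 only work for this particular file. A hedged sketch of a reusable helper (trim_by_markers is a hypothetical name, not an NLTK function):

def trim_by_markers(text, start_marker, end_marker):
    # keep only the span between the first start_marker and the last end_marker
    start = text.find(start_marker)
    end = text.rfind(end_marker)
    if start == -1 or end == -1:
        return text  # a marker was not found; leave the text unchanged
    return text[start:end]

# applied to the full download, this reproduces the manual slice above:
# raw = trim_by_markers(raw, "Frere Isambard", "fifteen days")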

2. Downloading text in HTML format

# Download HTML
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = urlopen(url).read().decode('utf8')

# Parse the HTML
from bs4 import BeautifulSoup
raw = BeautifulSoup(html,'lxml').get_text()  # use the lxml parser; get_text() extracts all the text
tokens = word_tokenize(raw)  # tokenize

bs = BeautifulSoup(html,'lxml')
print(bs.find("div",class_='bodytext').get_text())  # find the first div tag with class 'bodytext'; bs.find_all() returns all such divs

# Filter out irrelevant content at the start and end
tokens = tokens[110:390]
text = nltk.Text(tokens)
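As the comment above notes, bs.find_all() returns every matching tag rather than just the first. A minimal sketch:

for div in bs.find_all("div", class_='bodytext'):  # every div with class 'bodytext'
    print(div.get_text())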

3. Reading local files

raw = open('F:/document.txt').read()  # read a local text file
print(type(raw))
tokens = word_tokenize(raw)
print(type(tokens))
words = [w.lower() for w in tokens]
print(type(words))
vocab = sorted(set(words))
print(type(vocab))

<class 'str'>
<class 'list'>
<class 'list'>
<class 'list'>
vocab.append('blog')
# raw.append('blog')  # this would fail: raw is a str, which has no append method — mind the types


# query = 'Who knows?'
# beatles = ['john', 'paul', 'george', 'ringo']
# query + beatles  # this would fail: a str and a list cannot be concatenated directly — mind the types
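To combine a string with a list, convert one side first. A small sketch reusing the commented-out variables above:

query = 'Who knows?'
beatles = ['john', 'paul', 'george', 'ringo']
print(query.split() + beatles)           # str -> list, then concatenate lists
print(query + ' ' + ' '.join(beatles))   # list -> str, then concatenate strings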

Common string methods:

s.find(t) — index of the first occurrence of t in s (-1 if not found)
s.rfind(t) — index of the last occurrence of t in s
s.index(t) — like s.find(t), but raises ValueError if not found
s.join(text) — concatenate the strings in text, with s as the separator
s.split(t) — split s wherever t is found (whitespace by default)
s.splitlines() — split s into a list of lines
s.lower() / s.upper() — lowercased / uppercased version of s
s.title() — titlecased version of s
s.strip() — copy of s with leading and trailing whitespace removed
s.replace(t, u) — replace instances of t with u in s
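A few of these in action (a quick illustration, not from the original post):

s = 'Natural Language Processing'
print(s.lower())                  # natural language processing
print(s.split())                  # ['Natural', 'Language', 'Processing']
print(s.find('Language'))         # 8
print('-'.join(['a', 'b', 'c']))  # a-b-c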

4. Unicode characters

path = nltk.data.find('corpora/unicode_samples/polish-lat2.txt')  # a Polish-language sample file
f = path.open(encoding='latin2')  # open using the Latin-2 (ISO-8859-2) encoding
for line in f:
    line = line.strip()
    print(line)
    
Pruska Biblioteka Państwowa. Jej dawne zbiory znane pod nazwą
"Berlinka" to skarb kultury i sztuki niemieckiej. Przewiezione przez
Niemców pod koniec II wojny światowej na Dolny Śląsk, zostały
odnalezione po 1945 r. na terytorium Polski. Trafiły do Biblioteki
Jagiellońskiej w Krakowie, obejmują ponad 500 tys. zabytkowych
archiwaliów, m.in. manuskrypty Goethego, Mozarta, Beethovena, Bacha.
ord('a')  # integer code point of a character

Out[21]: 97
a = u'\u0061'  # Unicode escape for the letter 'a'
print(a)

a

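chr() is the inverse of ord(), and in Python 3 the str/bytes conversion is always explicit. A small round-trip sketch:

print(chr(324))                    # ń (LATIN SMALL LETTER N WITH ACUTE, U+0144)
nacute = '\u0144'
print(nacute.encode('utf8'))       # b'\xc5\x84'
print(b'\xc5\x84'.decode('utf8'))  # ń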
5. Regular expressions

Common regular expression symbols:

. — wildcard, matches any character
^abc — matches pattern abc at the start of a string
abc$ — matches pattern abc at the end of a string
[abc] — matches one of a set of characters
[A-Z0-9] — matches one of a range of characters
ed|ing|s — matches one of the specified strings (disjunction)
* — zero or more of the previous item
+ — one or more of the previous item
? — zero or one of the previous item (optional)
{n} — exactly n repeats
{n,} — at least n repeats
{,n} — no more than n repeats
{m,n} — at least m and no more than n repeats
a(b|c)+ — parentheses indicate the scope of the operators

# Regular expressions
import re  # import the regular expression module
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
# Find words ending in 'ed'
print([w for w in wordlist if re.search('ed$', w)])

['abaissed', 'abandoned', 'abased', 'abashed', 'abatised', 'abed', 'aborted', 'abridged', 'abscessed', 'absconded', 'absorbed', 'abstracted', 'abstricted', 'accelerated', 'accepted', 'accidented', ...]
# Word puzzle: an 8-letter word whose 3rd letter is 'j' and 6th letter is 't'
print([w for w in wordlist if re.search('^..j..t..$', w)])
['abjectly', 'adjuster', 'dejected', 'dejectly', 'injector', 'majestic', 'objectee', 'objector', 'rejecter', 'rejector', 'unjilted', 'unjolted', 'unjustly']
# Predictive text: words that can be typed with the phone keypad keys 4-6-5-3
print([w for w in wordlist if re.search('^[ghi][mno][jlk][def]$', w)])
['gold', 'golf', 'hold', 'hole']
# The + operator in regular expressions
chat_words = sorted(set(w for w in nltk.corpus.nps_chat.words()))
print([w for w in chat_words if re.search('^m+i+n+e+$', w)])
['miiiiiiiiiiiiinnnnnnnnnnneeeeeeeeee', 'miiiiiinnnnnnnnnneeeeeeee', 'mine', 'mmmmmmmmiiiiiiiiinnnnnnnnneeeeeeee']
# Extracting character blocks
word = 'supercalifragilisticexpialidocious'
print(re.findall(r'[aeiou]', word))  # regex patterns are usually written as raw strings (r'...')
print(len(re.findall(r'[aeiou]', word)))

['u', 'e', 'a', 'i', 'a', 'i', 'i', 'i', 'e', 'i', 'a', 'i', 'o', 'i', 'o', 'u']
16
regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'  # keep leading/trailing vowel runs and all consonants
def compress(word):  # drop word-internal vowels
    pieces = re.findall(regexp, word)
    return ''.join(pieces)
english_udhr = nltk.corpus.udhr.words('English-Latin1')
print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))

Unvrsl Dclrtn of Hmn Rghts Prmble Whrs rcgntn of the inhrnt dgnty and
of the eql and inlnble rghts of all mmbrs of the hmn fmly is the fndtn
of frdm , jstce and pce in the wrld , Whrs dsrgrd and cntmpt fr hmn
rghts hve rsltd in brbrs acts whch hve outrgd the cnscnce of mnknd ,
and the advnt of a wrld in whch hmn bngs shll enjy frdm of spch and
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
cvs = [cv for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]
cfd = nltk.ConditionalFreqDist(cvs)
cfd.tabulate()  # tabulate the consonant-vowel cross table built with the regular expression

    a   e   i   o   u 
k 418 148  94 420 173 
p  83  31 105  34  51 
r 187  63  84  89  79 
s   0   0 100   2   1 
t  47   8   0 148  37 
v  93  27 105  48  49 
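The conditional frequency distribution behind the table can also be queried directly; these values match the table above:

print(cfd['k']['o'])  # 420
print(cfd['s']['u'])  # 1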
re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')  # findall returns only the parenthesized group

Out[41]: ['ing']


re.findall(r'^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')  # (?:...) is non-capturing, so the whole match is returned

Out[42]: ['processing']


re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')  # split into stem and suffix

Out[43]: [('process', 'ing')]


re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')  # the greedy .* grabs too much when the word ends in es or s

Out[44]: [('processe', 's')]


re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')  # the non-greedy .*? yields the full suffix and the correct stem

Out[45]: [('process', 'es')]
def stems(word):  # strip suffixes from a word
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
    is no basis for a system of government.  Supreme executive power derives from
    a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = word_tokenize(raw)
[stems(t) for t in tokens]

Out[47]: 
['DENNIS',
 ':',
 'Listen',
 ',',
 'strange',
 'women',
 'ly',
 'in',
 'pond',
 'distribut',
 'sword',
 'i',
 'no',
 'basi',
 'for',
 'a',
 'system',
 'of',
 'govern',
 '.',
 'Supreme',
 'execut',
 'power',
 'deriv',
 'from',
 'a',
 'mandate',
 'from',
 'the',
 'mass',
 ',',
 'not',
 'from',
 'some',
 'farcical',
 'aquatic',
 'ceremony',
 '.']

6. Normalizing text

# Stemmers (two different approaches)
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
[porter.stem(t) for t in tokens]

Out[54]: 
['denni',
 ':',
 'listen',
 ',',
 'strang',
 'women',
 'lie',
 'in',
 'pond',
 'distribut',
 'sword',
 'is',
 'no',
 'basi',
 'for',
 'a',
 'system',
 'of',
 'govern',
 '.',
 'suprem',
 'execut',
 'power',
 'deriv',
 'from',
 'a',
 'mandat',
 'from',
 'the',
 'mass',
 ',',
 'not',
 'from',
 'some',
 'farcic',
 'aquat',
 'ceremoni',
 '.']
[lancaster.stem(t) for t in tokens]
Out[55]: 
['den',
 ':',
 'list',
 ',',
 'strange',
 'wom',
 'lying',
 'in',
 'pond',
 'distribut',
 'sword',
 'is',
 'no',
 'bas',
 'for',
 'a',
 'system',
 'of',
 'govern',
 '.',
 'suprem',
 'execut',
 'pow',
 'der',
 'from',
 'a',
 'mand',
 'from',
 'the',
 'mass',
 ',',
 'not',
 'from',
 'som',
 'farc',
 'aqu',
 'ceremony',
 '.']
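A quick side-by-side look at tokens where the two stemmers disagree (results taken from the outputs above):

for t in ['lying', 'women', 'strange']:
    print(t, '->', porter.stem(t), '/', lancaster.stem(t))
# lying -> lie / lying
# women -> women / wom
# strange -> strang / strange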
wnl = nltk.WordNetLemmatizer()  # lemmatization: map words to their WordNet lemmas
[wnl.lemmatize(t) for t in tokens]
Out[56]: 
['DENNIS',
 ':',
 'Listen',
 ',',
 'strange',
 'woman',
 'lying',
 'in',
 'pond',
 'distributing',
 'sword',
 'is',
 'no',
 'basis',
 'for',
 'a',
 'system',
 'of',
 'government',
 '.',
 'Supreme',
 'executive',
 'power',
 'derives',
 'from',
 'a',
 'mandate',
 'from',
 'the',
 'mass',
 ',',
 'not',
 'from',
 'some',
 'farcical',
 'aquatic',
 'ceremony',
 '.']
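The lemmatizer leaves 'lying' and 'derives' unchanged above because it assumes the noun part-of-speech by default; passing pos='v' treats the word as a verb:

print(wnl.lemmatize('lying', pos='v'))    # lie
print(wnl.lemmatize('derives', pos='v'))  # derive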

7. Segmentation

# Example: splitting raw text into sentences
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
sents = nltk.sent_tokenize(text)
pprint.pprint(sents[79:89])

['"Nonsense!"',
 'said Gregory, who was very rational when anyone else\nattempted paradox.',
 '"Why do all the clerks and navvies in the\n'
 'railway trains look so sad and tired, so very sad and tired?',
 'I will\ntell you.',
 'It is because they know that the train is going right.',
 'It\n'
 'is because they know that whatever place they have taken a ticket\n'
 'for that place they will reach.',
 'It is because after they have\n'
 'passed Sloane Square they know that the next station must be\n'
 'Victoria, and nothing but Victoria.',
 'Oh, their wild rapture!',
 'oh,\n'
 'their eyes like stars and their souls again in Eden, if the next\n'
 'station were unaccountably Baker Street!"',
 '"It is you who are unpoetical," replied the poet Syme.']
def segment(text, segs):  # toy word segmentation: segs is a bit string, '1' at position i marks a word boundary after text[i]
    words = []
    last = 0
    for i in range(len(segs)):
        if segs[i] == '1':
            words.append(text[last:i+1])
            last = i+1
    words.append(text[last:])
    return words
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
seg2 = "0100100100100001001001000010100100010010000100010010000"
segment(text, seg1)

Out[63]: ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
segment(text, seg2)

Out[64]: 
['do',
 'you',
 'see',
 'the',
 'kitty',
 'see',
 'the',
 'doggy',
 'do',
 'you',
 'like',
 'the',
 'kitty',
 'like',
 'the',
 'doggy']
def evaluate(text, segs):  # score = number of words + size of the lexicon (lower is better)
    words = segment(text, segs)
    text_size = len(words)
    lexicon_size = len(' '.join(list(set(words))))
    return text_size + lexicon_size
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
seg2 = "0100100100100001001001000010100100010010000100010010000"
seg3 = "0000100100000011001000000110000100010000001100010000001"
print(evaluate(text, seg3))

46


print(evaluate(text, seg2))

47


print(evaluate(text, seg1))

63
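A worked check of the score for seg1: segment() returns 4 words, and the lexicon string 'doyouseethekitty seethedoggy doyoulikethekitty likethedoggy' has length 59, so evaluate() returns 4 + 59 = 63, matching the output above.

words = segment(text, seg1)
print(len(words) + len(' '.join(set(words))))  # 63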
# Simulated annealing: a non-deterministic search for a low-scoring segmentation, so results vary between runs
from random import randint
def flip(segs, pos):
    return segs[:pos] + str(1-int(segs[pos])) + segs[pos+1:]
def flip_n(segs, n):
    for i in range(n):
        segs = flip(segs, randint(0, len(segs)-1))
    return segs
def anneal(text, segs, iterations, cooling_rate):
    temperature = float(len(segs))
    while temperature > 0.5:
        best_segs, best = segs, evaluate(text, segs)
        for i in range(iterations):
            guess = flip_n(segs, int(round(temperature,0)))
            score = evaluate(text, guess)
            if score < best:
                best, best_segs = score, guess
        score, segs = best, best_segs
        temperature = temperature / cooling_rate
        print(evaluate(text, segs), segment(text, segs))
    print()
    return segs
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
anneal(text, seg1, 5000, 1.2)

63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
61 ['doyousee', 'thekitty', 'seeth', 'edoggy', 'doyou', 'l', 'i', 'ke', 'thekitty', 'liketh', 'edoggy']
57 ['doyou', 'see', 'thekitty', 'se', 'eth', 'edoggy', 'doyou', 'l', 'ike', 'thekitty', 'liketh', 'edoggy']
54 ['doyou', 'see', 'thekitty', 'se', 'eth', 'edoggy', 'doyou', 'l', 'ik', 'e', 'thekitty', 'l', 'ik', 'eth', 'edoggy']
52 ['doyou', 'se', 'e', 'thekitty', 'se', 'eth', 'edoggy', 'doyou', 'like', 'thekitty', 'lik', 'eth', 'edoggy']
52 ['doyou', 'se', 'e', 'thekitty', 'se', 'eth', 'edoggy', 'doyou', 'like', 'thekitty', 'lik', 'eth', 'edoggy']
51 ['doyou', 'se', 'e', 'thekitty', 'se', 'eth', 'edoggy', 'doyou', 'like', 'thekitty', 'like', 'th', 'edoggy']
51 ['doyou', 'se', 'e', 'thekitty', 'se', 'eth', 'edoggy', 'doyou', 'like', 'thekitty', 'like', 'th', 'edoggy']
48 ['doyou', 'se', 'e', 'thekitty', 'se', 'e', 'th', 'edoggy', 'doyou', 'like', 'thekitty', 'like', 'th', 'edoggy']
45 ['doyou', 'see', 'thekitty', 'see', 'th', 'edoggy', 'doyou', 'like', 'thekitty', 'like', 'th', 'edoggy']
42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
Out[68]: '0000100100000001001000000010000100010000000100010000000'

8. Lists and strings

# From a list to a string
silly = ['We', 'called', 'him', 'Tortoise', 'because', 'he', 'taught', 'us', '.']
print(' '.join(silly))

We called him Tortoise because he taught us .
fdist = nltk.FreqDist(['dog', 'cat', 'dog', 'cat', 'dog', 'snake', 'dog', 'cat'])
for word in sorted(fdist):
    print(word, '->', fdist[word], '; ')
    
cat -> 3 ; 
dog -> 4 ; 
snake -> 1 ; 
for word in sorted(fdist):
    print('%s->%d;' % (word, fdist[word]))
    
cat->3;
dog->4;
snake->1;
# Example: define and use a cross-tabulation
def tabulate(cfdist, words, categories):
    print('%-16s' % 'Category', end=' ')
    for word in words:                                   # column headings
        print('%6s' % word, end=' ')
    print()
    for category in categories:
        print('%-16s' % category, end=' ')               # row heading
        for word in words:                               # for each word
            print('%6d' % cfdist[category][word], end=' ')  # print table cell
        print()                                          # end the row
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
tabulate(cfd, modals, genres)
