Natural Language Processing and Knowledge Graphs, Week 3 | Getting Started with NLTK and Processing an English Corpus

References

Stemming:
https://www.cnblogs.com/no-tears-girl/p/6964910.html


Setup

import nltk
import re
import string

# all intermediate results are written to out.txt
f = open("out.txt", "w")

# read the English corpus
text_en = open('./data/text_en.txt', encoding='utf-8', errors='ignore').read()

...

f.close()

Tokenization

words = nltk.word_tokenize(text_en)
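If this raises a LookupError, the Punkt sentence tokenizer models have to be downloaded once; a minimal setup sketch (resource name as in NLTK 3.x):

nltk.download('punkt')                    # one-time download of the Punkt tokenizer models
sentences = nltk.sent_tokenize(text_en)   # sentence segmentation is also available directly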

Stemming

from nltk.stem import LancasterStemmer

stemmer_lan = LancasterStemmer()
# reduce every token to its stem
temp = [stemmer_lan.stem(t) for t in words]
print(temp, file=f)
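LancasterStemmer is the most aggressive of NLTK's standard stemmers and often over-stems; a small comparison sketch against the Porter and Snowball stemmers (all three are standard NLTK classes):

from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer('english')
for w in ['running', 'maximum', 'presumably']:
    # print each word next to its three stems to compare aggressiveness
    print(w, porter.stem(w), lancaster.stem(w), snowball.stem(w))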

Stopword removal

from nltk.corpus import stopwords  # only needed for the built-in list (commented out below)

text_stop_words = open('./data/stop_words.txt', encoding='utf-8', errors='ignore').read()
stop_words = set(nltk.word_tokenize(text_stop_words))  # a set makes each membership test O(1)
# stops = set(stopwords.words('english'))  # alternative: NLTK's built-in English stopword list
temp = [word for word in words if word.lower() not in stop_words]
print(temp, file=f)
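The commented-out line points at NLTK's built-in English stopword list, which can replace the external stop_words.txt; a sketch (needs a one-time corpus download):

nltk.download('stopwords')               # one-time download of the stopword corpus
stops = set(stopwords.words('english'))
temp = [word for word in words if word.lower() not in stops]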

Punctuation filtering

def filter_punctuation(words):
    new_words = []
    # string.punctuation already covers ASCII punctuation; the extra characters
    # add the Unicode en dash and em dash
    illegal_char = string.punctuation + '–—'
    pattern = re.compile('[%s]' % re.escape(illegal_char))
    for word in words:
        new_word = pattern.sub('', word)   # strip punctuation characters from the token
        if new_word != '':                 # drop tokens that were pure punctuation
            new_words.append(new_word)
    return new_words

words_no_punc = filter_punctuation(words)
print(words_no_punc, file=f)
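A shorter alternative sketch using str.translate, covering the same character set as the regex above:

# build a translation table that deletes every character in the illegal set
table = str.maketrans('', '', string.punctuation + '–—')
words_no_punc = [w.translate(table) for w in words]
words_no_punc = [w for w in words_no_punc if w]   # drop tokens that were pure punctuation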

Low-frequency word filtering (drop words with n <= threshold)

# keep only words that occur more than 20 times (threshold = 20)
temp = []
fdist = nltk.probability.FreqDist(words)
for word in fdist:
    if fdist[word] > 20:
        temp.append(word)
print(temp, file=f)
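FreqDist exposes the same information directly; a sketch using its standard API:

fdist = nltk.FreqDist(words)
frequent = [w for w, n in fdist.most_common() if n > 20]   # same filter, sorted by count
rare_once = fdist.hapaxes()                                # words that occur exactly once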

Plot a frequency distribution for the top 20 meaningful high-frequency words

fdist = nltk.probability.FreqDist(words_no_punc)
fdist.plot(20)
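fdist.plot draws with matplotlib, so that package must be installed; the same counts can also be dumped as text, and the plot has a cumulative variant:

print(fdist.most_common(20), file=f)   # top 20 (word, count) pairs as plain text
fdist.plot(20, cumulative=True)        # cumulative frequency curve of the same words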

Draw a dispersion plot to see where the given words (Elizabeth, Darcy, Wickham, Bingley, Jane) occur in the text

spe_words = ["Elizabeth", "Darcy", "Wickham", "Bingley", "Jane"]
text = nltk.text.Text(words)   # build the Text over the full token list, not over spe_words
text.dispersion_plot(spe_words)
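The same nltk.text.Text object supports other quick inspections; a short sketch:

text.concordance("Elizabeth")   # every occurrence of the word with surrounding context
text.similar("Elizabeth")       # words that appear in similar contexts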

Reposted from blog.csdn.net/cat_xing/article/details/88543223