中文分词 - jieba
# Chinese word segmentation with jieba.
import re

import jieba

news_CN = '''
央视315晚会曝光湖北省知名的神丹牌、莲田牌“土鸡蛋”实为普通鸡蛋冒充,同时在商标上玩猫腻,
分别注册“鲜土”、注册“好土”商标,让消费者误以为是“土鸡蛋”。3月15日晚间,新京报记者就此
事致电湖北神丹健康食品有限公司方面,其工作人员表示不知情,需要了解清楚情况,截至发稿暂未
取得最新回应。新京报记者还查询发现,湖北神丹健康食品有限公司为农业产业化国家重点龙头企
业、高新技术企业,此前曾因涉嫌虚假宣传“中国最大的蛋品企业”而被罚6万元。
'''

# Strip every non-word character.  Raw string fixes the invalid escape
# sequence '\w' that the original non-raw pattern produced; the cleaned
# string is reused by both segmentation passes below.
string = re.sub(r'[^\w]', '', news_CN)

# Precise mode ("cut_all=False"): segment without overlapping words.
seg_list = jieba.cut(string, cut_all=False)
print('/'.join(seg_list))

# A user dictionary may also be loaded from a file object, e.g.:
#   with open(path, 'r') as f:
#       jieba.load_userdict(f)
# Here we pass an iterable of entries directly.
# BUG FIX: the original entry '莲花牌' never occurs in the text — the
# brand in the article is '莲田牌', so the demo could not show the
# intended effect of the user dictionary.
jieba.load_userdict(['神丹牌', '莲田牌', '土鸡蛋', '新京报'])

# Segment the same string again: words listed in the user dictionary
# are no longer split apart.
seg_list = jieba.cut(string, cut_all=False)
print('/'.join(seg_list))
英文分词 - NLTK
# English tokenization and POS tagging with NLTK.
import nltk
from nltk.corpus import names
from nltk.classify import NaiveBayesClassifier

# NOTE(review): `news` is not defined in this snippet — it looks like the
# result of sklearn's fetch_20newsgroups (`news.data` being a list of raw
# documents); confirm against the original notebook.

# Sentence-split the first document.
sentences = nltk.sent_tokenize(news.data[0])

# POS-tag every token of every sentence, accumulating pairs such as
# [('RULE', 'NNP'), ...].
postag_data = []
for sent in sentences:
    postag_data += nltk.pos_tag(nltk.word_tokenize(sent))

# Print the tokens tagged as proper nouns.  Caveat: the default tagger
# labels most capitalized words NNP, so this over-reports proper nouns.
for word in postag_data:
    if 'NNP' == word[1]:
        print(word)
实例1:根据人名预测性别
# Example 1: predict gender from a first name.

# Build labelled data from the NLTK names corpus:
# [(u'Aaron', 'male'), (u'Abbey', 'male'), ...]
data = ([(name, 'male') for name in names.words('male.txt')] +
        [(name, 'female') for name in names.words('female.txt')])


def gender_features(word):
    """Feature extractor: the last letter of an English first name is a
    strong gender signal (e.g. names ending in 'a' skew female)."""
    return {'last_letter': word[-1]}


train_set = [(gender_features(n), g) for (n, g) in data]

# Train the naive-Bayes model and classify an unseen name.
classifier = NaiveBayesClassifier.train(train_set)
# BUG FIX: in a script (unlike the REPL) a bare expression's value is
# silently discarded, so print the prediction explicitly.
print(classifier.classify(gender_features('Frank')))
实例2:确定积极评论和消极评论所占的比例
# Example 2: estimate the proportion of positive and negative words in a
# review using a naive-Bayes classifier trained on tiny hand-built
# vocabularies.

positive_vocab = ['awesome', 'outstanding', 'fantastic', 'terrific',
                  'good', 'nice', 'great', ':)']
negative_vocab = ['bad', 'terrible', 'useless', 'hate', ':(']
neutral_vocab = ['movie', 'the', 'sound', 'was', 'is', 'actors',
                 'did', 'know', 'words', 'not']


def word_feats(words):
    """Bag-of-words features: map every word in `words` to True.

    `words` must be an iterable of words — passing a bare string would
    silently produce per-character features, because iterating a string
    yields its characters.
    """
    return {word: True for word in words}


# BUG FIX: the original passed each vocabulary word directly to
# word_feats, so iterating the string built character-level features;
# wrap each word in a one-element list to get word-level features.
positive_features = [(word_feats([pos]), 'pos') for pos in positive_vocab]
negative_features = [(word_feats([neg]), 'neg') for neg in negative_vocab]
neutral_features = [(word_feats([neu]), 'neu') for neu in neutral_vocab]

train_set = negative_features + positive_features + neutral_features
classifier = NaiveBayesClassifier.train(train_set)

# Count how each word of the sentence is classified.
neg = 0
pos = 0
sentence = "Awesome movie, I liked it"
sentence = sentence.lower()
# NOTE(review): split(' ') keeps punctuation attached ('movie,'), so
# such tokens never match the vocabulary — use a real tokenizer if
# accuracy matters.
words = sentence.split(' ')
for word in words:
    classResult = classifier.classify(word_feats([word]))
    if classResult == 'neg':
        neg = neg + 1
    if classResult == 'pos':
        pos = pos + 1

print('Positive: ' + str(float(pos) / len(words)))
print('Negative: ' + str(float(neg) / len(words)))
参考链接: