中文分词 - jieba
# Chinese word segmentation with jieba.
import re

import jieba

news_CN = '''
央视315晚会曝光湖北省知名的神丹牌、莲田牌“土鸡蛋”实为普通鸡蛋冒充,同时在商标上玩猫腻,
分别注册“鲜土”、注册“好土”商标,让消费者误以为是“土鸡蛋”。3月15日晚间,新京报记者就此
事致电湖北神丹健康食品有限公司方面,其工作人员表示不知情,需要了解清楚情况,截至发稿暂未
取得最新回应。新京报记者还查询发现,湖北神丹健康食品有限公司为农业产业化国家重点龙头企
业、高新技术企业,此前曾因涉嫌虚假宣传“中国最大的蛋品企业”而被罚6万元。
'''

# Strip every non-word character.  Raw string fixes the invalid escape
# sequence '\w' that the original non-raw pattern produced; the cleaned
# string is reused by both segmentation passes below.
string = re.sub(r'[^\w]', '', news_CN)

# Precise mode ("cut_all=False"): segment without overlapping words.
seg_list = jieba.cut(string, cut_all=False)
print('/'.join(seg_list))

# A user dictionary may also be loaded from a file object, e.g.:
#   with open(path, 'r') as f:
#       jieba.load_userdict(f)
# Here we pass an iterable of entries directly.
# BUG FIX: the original entry '莲花牌' never occurs in the text — the
# brand in the article is '莲田牌', so the demo could not show the
# intended effect of the user dictionary.
jieba.load_userdict(['神丹牌', '莲田牌', '土鸡蛋', '新京报'])

# Segment the same string again: words listed in the user dictionary
# are no longer split apart.
seg_list = jieba.cut(string, cut_all=False)
print('/'.join(seg_list))
英文分词 - NLTK
# English tokenization and POS tagging with NLTK.
import nltk
from nltk.corpus import names
from nltk.classify import NaiveBayesClassifier

# NOTE(review): `news` is not defined in this snippet — it looks like the
# result of sklearn's fetch_20newsgroups (`news.data` being a list of raw
# documents); confirm against the original notebook.

# Sentence-split the first document.
sentences = nltk.sent_tokenize(news.data[0])

# POS-tag every token of every sentence, accumulating pairs such as
# [('RULE', 'NNP'), ...].
postag_data = []
for sent in sentences:
    postag_data += nltk.pos_tag(nltk.word_tokenize(sent))

# Print the tokens tagged as proper nouns.  Caveat: the default tagger
# labels most capitalized words NNP, so this over-reports proper nouns.
for word in postag_data:
    if 'NNP' == word[1]:
        print(word)
实例1:根据人名预测性别
# Example 1: predict gender from a first name.

# Build labelled data from the NLTK names corpus:
# [(u'Aaron', 'male'), (u'Abbey', 'male'), ...]
data = ([(name, 'male') for name in names.words('male.txt')] +
        [(name, 'female') for name in names.words('female.txt')])


def gender_features(word):
    """Feature extractor: the last letter of an English first name is a
    strong gender signal (e.g. names ending in 'a' skew female)."""
    return {'last_letter': word[-1]}


train_set = [(gender_features(n), g) for (n, g) in data]

# Train the naive-Bayes model and classify an unseen name.
classifier = NaiveBayesClassifier.train(train_set)
# BUG FIX: in a script (unlike the REPL) a bare expression's value is
# silently discarded, so print the prediction explicitly.
print(classifier.classify(gender_features('Frank')))
实例2:确定积极评论和消极评论所占的比例
# Example 2: estimate the proportion of positive and negative words in a
# review using a naive-Bayes classifier trained on tiny hand-built
# vocabularies.

positive_vocab = ['awesome', 'outstanding', 'fantastic', 'terrific',
                  'good', 'nice', 'great', ':)']
negative_vocab = ['bad', 'terrible', 'useless', 'hate', ':(']
neutral_vocab = ['movie', 'the', 'sound', 'was', 'is', 'actors',
                 'did', 'know', 'words', 'not']


def word_feats(words):
    """Bag-of-words features: map every word in `words` to True.

    `words` must be an iterable of words — passing a bare string would
    silently produce per-character features, because iterating a string
    yields its characters.
    """
    return {word: True for word in words}


# BUG FIX: the original passed each vocabulary word directly to
# word_feats, so iterating the string built character-level features;
# wrap each word in a one-element list to get word-level features.
positive_features = [(word_feats([pos]), 'pos') for pos in positive_vocab]
negative_features = [(word_feats([neg]), 'neg') for neg in negative_vocab]
neutral_features = [(word_feats([neu]), 'neu') for neu in neutral_vocab]

train_set = negative_features + positive_features + neutral_features
classifier = NaiveBayesClassifier.train(train_set)

# Count how each word of the sentence is classified.
neg = 0
pos = 0
sentence = "Awesome movie, I liked it"
sentence = sentence.lower()
# NOTE(review): split(' ') keeps punctuation attached ('movie,'), so
# such tokens never match the vocabulary — use a real tokenizer if
# accuracy matters.
words = sentence.split(' ')
for word in words:
    classResult = classifier.classify(word_feats([word]))
    if classResult == 'neg':
        neg = neg + 1
    if classResult == 'pos':
        pos = pos + 1

print('Positive: ' + str(float(pos) / len(words)))
print('Negative: ' + str(float(neg) / len(words)))
参考链接: