python之NLP词性标注

1、知识点

包括中文和英文的词性标注
主要使用的库是nltk和jieba

2、代码

# -*- coding: utf-8 -*-

import nltk
from nltk.corpus import stopwords
from nltk.corpus import brown
import numpy as np
"""
标注步骤:
    1、清洗,分词
    2、标注
    
FAQ:
    1、 Resource punkt not found.
        请安装punkt模块 
    2、安装average_perceptron tagger
    3、Resource sinica_treebank not found
        请安装sinica_treebank模块
"""
def english_label():
    """POS-tag an English sentence with NLTK.

    Pipeline: lowercase -> word-tokenize -> strip punctuation tokens ->
    strip English stopwords -> nltk.pos_tag.

    Requires the NLTK resources ``punkt``, ``stopwords`` and
    ``averaged_perceptron_tagger`` to be installed (see module FAQ).

    :return: list of ``(word, tag)`` tuples produced by ``nltk.pos_tag``.
    """
    # Tokenize the (lowercased) sample sentence.
    text = ("Sentiment analysis is a challenging subject in machine learning."
            " People express their emotions in language that is often obscured"
            " by sarcasm, ambiguity, and plays on words, all of which could be"
            " very misleading for both humans and computers.").lower()
    tokens = nltk.word_tokenize(text)

    # Drop standalone punctuation tokens.
    english_punctuations = {',', '.', ':', ';', '?', '(', ')', '[', ']',
                            '&', '!', '*', '@', '#', '$', '%'}
    tokens = [word for word in tokens if word not in english_punctuations]

    # Drop English stopwords (set lookup is O(1) per token).
    stops = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stops]

    # NOTE: original bound the result to a local named `list`, shadowing the
    # builtin, and discarded it; we now also return it for callers.
    tagged = nltk.pos_tag(tokens)
    print(tagged)
    return tagged


def chineses_label():
    """POS-tag a Chinese sentence with jieba's posseg module.

    Replaces full-width commas with spaces, then runs ``jieba.posseg.cut``
    and prints each token as ``word/tag``.

    (Alternatives mentioned in the original article: fool, HanLP tag set.)

    :return: list of ``(word, tag)`` pairs.
    """
    import jieba.posseg as pseg
    import re

    # NOTE: original named this variable `str`, shadowing the builtin.
    text = "我爱你,是粉色,舒服 ,舒服,士大夫"
    # Replace full-width commas with spaces so jieba treats them as breaks.
    cleaned = re.sub(r'[,]', " ", text)

    # Materialize the generator: the original printed the generator object's
    # repr (useless output) and could only iterate it once.
    pairs = [(word, tag) for word, tag in pseg.cut(cleaned)]
    print(' '.join('%s/%s' % (word, tag) for (word, tag) in pairs))
    return pairs

猜你喜欢

转载自www.cnblogs.com/ywjfx/p/11026712.html