N-gram Feature Statistics


bigram (sorted) statistics

import re
from jieba import cut
from collections import Counter


stopwords = {'的', '是', '啊'}


def ngram(text):
    # Split on runs of characters that are neither ASCII letters nor CJK ideographs,
    # keeping only the letter/Chinese phrases.
    ls = re.split(r'[^a-zA-Z\u4e00-\u9fa5]+', text)
    return (i.strip() for i in ls if i.strip())


def bigram(text):
    words = [w for w in cut(text) if w not in stopwords]  # segment with jieba, drop stopwords
    for i in range(len(words) - 1):
        # yield words[i], words[i + 1]  # tuples are hashable; lists and sets are not
        yield ' '.join(sorted((words[i], words[i + 1])))  # sorting is optional: (a, b) and (b, a) count as one feature


texts = ['sb的老师!高一的英语 老师是sb啊', '温柔的老师(*^▽^*)']
phrases = [phrase for text in texts for phrase in ngram(text)]
print(phrases)
counter = Counter(w for phrase in phrases for w in bigram(phrase))
for word, freq in counter.most_common():
    print(word, freq)
Output:
sb 老师 2
英语 高一 1
温柔 老师 1
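
The same counting pattern generalizes beyond bigrams. Below is a minimal sketch of an order-n variant (the ngrams name and its n parameter are my own, not from the original post; the exact jieba segmentation of the sample sentence may vary by version):

from collections import Counter
from jieba import cut

stopwords = {'的', '是', '啊'}


def ngrams(text, n=3):
    # Hypothetical generalization of bigram() to windows of n words.
    words = [w for w in cut(text) if w not in stopwords]
    for i in range(len(words) - n + 1):
        yield ' '.join(words[i:i + n])  # keeps word order; wrap the slice in sorted() to ignore it


print(Counter(ngrams('高一的英语老师是sb啊', n=3)).most_common())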

bigram (with POS flags) statistics

import re
from jieba.posseg import cut
from collections import Counter


stopwords = {'的', '是', '啊'}


def ngram(text):
    # Same phrase splitter as in the first example.
    ls = re.split(r'[^a-zA-Z\u4e00-\u9fa5]+', text)
    return (i.strip() for i in ls if i.strip())


def bigram(text):
    words = [w for w in cut(text) if w.word not in stopwords]  # posseg.cut yields (word, flag) pairs
    for i in range(len(words) - 1):
        wf1, wf2 = words[i], words[i + 1]
        # wf = sorted([(wf1.word, wf1.flag), (wf2.word, wf2.flag)])  # optional: sort to ignore word order
        # w = ' '.join([wf[0][0], wf[1][0]])
        # f = ' '.join([wf[0][1], wf[1][1]])
        w = ' '.join([wf1.word, wf2.word])  # word bigram
        f = ' '.join([wf1.flag, wf2.flag])  # matching POS-flag bigram
        yield w, f


texts = ['sb的老师!高一的英语 老师是sb啊', '温柔的老师(*^▽^*)']
phrases = [phrase for text in texts for phrase in ngram(text)]
counter = Counter(w for phrase in phrases for w in bigram(phrase))
for word, freq in counter.most_common():
    print(word, freq)
Output:
('sb 老师', 'eng n') 1
('高一 英语', 'b nz') 1
('老师 sb', 'n eng') 1
('温柔 老师', 'a n') 1
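
To feed these counts to a model, each document's bigram Counter can become one row of a sparse feature matrix. A minimal sketch, using the sorted-bigram variant from the first example together with scikit-learn's DictVectorizer (scikit-learn is my addition, not part of the original post):

import re
from collections import Counter
from jieba import cut
from sklearn.feature_extraction import DictVectorizer

stopwords = {'的', '是', '啊'}


def ngram(text):
    # Same phrase splitter as above.
    ls = re.split(r'[^a-zA-Z\u4e00-\u9fa5]+', text)
    return (i.strip() for i in ls if i.strip())


def bigram(text):
    words = [w for w in cut(text) if w not in stopwords]
    for i in range(len(words) - 1):
        yield ' '.join(sorted((words[i], words[i + 1])))


texts = ['sb的老师!高一的英语 老师是sb啊', '温柔的老师(*^▽^*)']
# One bigram Counter per document, aggregated over its phrases.
counters = [Counter(b for p in ngram(t) for b in bigram(p)) for t in texts]
vec = DictVectorizer()
X = vec.fit_transform(counters)     # sparse document-by-bigram count matrix
print(vec.get_feature_names_out())  # learned bigram vocabulary

Each row of X then counts how often each bigram occurs in the corresponding document.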
