版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
bigram(sort)统计
import re
from jieba import cut
from collections import Counter
# Function words stripped from the token stream before bigrams are built.
stopwords = {'的', '是', '啊'}
def ngram(text):
    """Split *text* into maximal runs of ASCII letters / CJK characters.

    Every character outside [a-zA-Z] and the CJK range U+4E00-U+9FA5 acts
    as a delimiter. Empty segments produced by leading/trailing delimiters
    are dropped.

    Returns a generator of the non-empty segments, in order.
    """
    segments = re.split('[^a-zA-Z\u4e00-\u9fa5]+', text)
    # A segment can never contain whitespace (the split pattern consumes it),
    # so filtering falsy values suffices -- no need to call strip() twice.
    return (seg for seg in segments if seg)
def bigram(text):
    """Yield each adjacent word pair of *text* as one sorted, space-joined string.

    *text* is segmented with jieba's ``cut()``; words listed in the
    module-level ``stopwords`` set are removed first. Each adjacent pair
    is sorted so that ('a', 'b') and ('b', 'a') count as the same phrase,
    then joined with a single space (a tuple would be hashable too, but a
    string prints more cleanly).
    """
    words = [w for w in cut(text) if w not in stopwords]
    # zip(words, words[1:]) walks adjacent pairs without index arithmetic.
    for left, right in zip(words, words[1:]):
        yield ' '.join(sorted((left, right)))
# Demo: count sorted bigrams over the letter/CJK segments of two sample texts.
texts = ['sb的老师!高一的英语 老师是sb啊', '温柔的老师(*^▽^*)']
phrases = []
for text in texts:
    phrases.extend(ngram(text))
print(phrases)
# One Counter over every bigram produced by every segment.
counter = Counter(pair for phrase in phrases for pair in bigram(phrase))
for word, freq in counter.most_common():
    print(word, freq)
输出:
sb 老师 2
英语 高一 1
温柔 老师 1
bigram(flag)统计
import re
from jieba.posseg import cut
from collections import Counter
# Function words stripped from the token stream before bigrams are built.
stopwords = {'的', '是', '啊'}
def ngram(text):
    """Split *text* into maximal runs of ASCII letters / CJK characters.

    Every character outside [a-zA-Z] and the CJK range U+4E00-U+9FA5 acts
    as a delimiter. Empty segments produced by leading/trailing delimiters
    are dropped.

    Returns a generator of the non-empty segments, in order.
    """
    segments = re.split('[^a-zA-Z\u4e00-\u9fa5]+', text)
    # A segment can never contain whitespace (the split pattern consumes it),
    # so filtering falsy values suffices -- no need to call strip() twice.
    return (seg for seg in segments if seg)
def bigram(text):
    """Yield a (words, flags) pair for every adjacent word bigram in *text*.

    *text* is segmented with ``jieba.posseg``'s ``cut()``, whose items carry
    ``.word`` (the token) and ``.flag`` (its POS tag). Tokens whose word is
    in the module-level ``stopwords`` set are dropped first. For each
    adjacent pair, the two words and the two flags are each joined with a
    single space, preserving the original order (sort the pair instead if
    direction-insensitive counting is wanted, as the sort variant does).
    """
    words = [w for w in cut(text) if w.word not in stopwords]
    # zip(words, words[1:]) walks adjacent pairs without index arithmetic.
    for first, second in zip(words, words[1:]):
        joined_words = ' '.join([first.word, second.word])
        joined_flags = ' '.join([first.flag, second.flag])
        yield joined_words, joined_flags
# Demo: count (words, flags) bigrams over the segments of two sample texts.
texts = ['sb的老师!高一的英语 老师是sb啊', '温柔的老师(*^▽^*)']
phrases = []
for text in texts:
    phrases.extend(ngram(text))
# One Counter over every (words, flags) tuple produced by every segment.
counter = Counter(pair for phrase in phrases for pair in bigram(phrase))
for word, freq in counter.most_common():
    print(word, freq)
输出:
('sb 老师', 'eng n') 1
('高一 英语', 'b nz') 1
('老师 sb', 'n eng') 1
('温柔 老师', 'a n') 1