PageRank做简单的文本摘要
公式
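PageRank算法把所有评论的句子集合视作一个图结构 $G$ ,把评论中的单个句子视为节点。对于任意节点 $s_i$ 和节点 $s_j$ , $w_{ij}$ 表示这两个节点之间边的权重,我们使用两句子(对应的词向量记为 $\mathbf{v}_i$ 、 $\mathbf{v}_j$ )之间的余弦相似度来计算:

$$w_{ij} = \frac{\mathbf{v}_i \cdot \mathbf{v}_j}{\lVert \mathbf{v}_i \rVert \, \lVert \mathbf{v}_j \rVert}$$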
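我们可以对所有句子建立一个 $n \times n$ 的矩阵 $M$ ( $n$ 为句子总数)存放任意两个句子之间边的权重,然后对该矩阵的每行除以其L1范数,得到任意两句子之间的转移概率 $\tilde{M}_{ij}$ ,我们将节点自身的转移概率设置为0,即 $M_{ii} = 0$ ,并且对全为0的行中的元素统一设置为 $1/n$ 进行平滑处理:

$$\tilde{M}_{ij} = \begin{cases} \dfrac{M_{ij}}{\sum_{k=1}^{n} \lvert M_{ik} \rvert}, & \sum_{k=1}^{n} \lvert M_{ik} \rvert \neq 0 \\[2ex] \dfrac{1}{n}, & \sum_{k=1}^{n} \lvert M_{ik} \rvert = 0 \end{cases}$$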
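我们使用下面的公式进行迭代若干次(其中 $\tilde{M}$ 为转移概率矩阵, $n$ 为句子总数, $\mathbf{s}^{(t)}$ 为第 $t$ 次迭代时各句子得分组成的列向量,初始值全为1, $\mu$ 为阻尼系数, $\mathbf{e}$ 为全1列向量;当相邻两次迭代得分的最大变化量小于阈值 $\epsilon$ 时提前停止):

$$\mathbf{s}^{(t+1)} = \mu \, \tilde{M}^{\top} \mathbf{s}^{(t)} + \frac{1 - \mu}{n} \, \mathbf{e}$$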
代码
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def transition_probability(vec_arr):
    """Build the row-stochastic transition matrix between sentences.

    Edge weights are the pairwise cosine similarities of the sentence
    vectors, with self-similarity forced to 0. A sentence that has no
    similarity to any other sentence gets a uniform ``1 / n`` row so it can
    still hand its score on. Every row is then divided by its L1 norm.

    The whole matrix is computed with one vectorised product instead of one
    similarity call per sentence pair.

    Args:
        vec_arr: ``(n, d)`` array-like with one row vector per sentence.
            SciPy sparse matrices are densified; a 1-D input is treated as
            ``n`` one-dimensional vectors.

    Returns:
        ``(n, n)`` float ndarray whose row ``i`` holds the probabilities of
        moving from sentence ``i`` to every sentence. An input with no rows
        yields an empty ``(0, 0)`` array.
    """
    if hasattr(vec_arr, "toarray"):
        vec_arr = vec_arr.toarray()
    vectors = np.asarray(vec_arr, dtype=float)
    if vectors.ndim == 1:
        vectors = vectors.reshape(-1, 1)

    n = vectors.shape[0]
    if n == 0:
        return np.zeros((0, 0))

    # Cosine similarity = dot product of the L2-normalised rows. All-zero
    # vectors keep a divisor of 1 so they stay zero instead of becoming NaN.
    lengths = np.linalg.norm(vectors, axis=1, keepdims=True)
    lengths[lengths == 0] = 1.0
    unit = vectors / lengths
    m = unit @ unit.T
    np.fill_diagonal(m, 0.0)  # a sentence never transitions to itself

    # Smooth rows without any outgoing weight to a uniform distribution.
    isolated = ~m.any(axis=1)
    m[isolated] = 1.0 / n

    # After smoothing every row has a non-zero L1 norm, so this is safe.
    return m / np.abs(m).sum(axis=1, keepdims=True)
def score(m, mu=0.85, epsilon=0.0001, n=50):
    """Run the damped PageRank power iteration over a transition matrix.

    Every node starts with a score of 1. Each step applies
    ``s <- mu * m.T @ s + (1 - mu) / N`` (``N`` = number of nodes) and the
    loop stops once the largest per-node change is below ``epsilon`` or
    after ``n`` steps.

    Args:
        m: Square ``(N, N)`` array-like of transition probabilities; row
            ``i`` holds the probabilities of leaving node ``i``.
        mu: Damping factor applied to the propagated scores.
        epsilon: Convergence threshold on the maximum absolute change.
        n: Maximum number of iterations.

    Returns:
        ``numpy.matrix`` of shape ``(N, 1)`` with one score per node. The
        matrix type is kept so callers that read ``.A`` continue to work.
    """
    size = m.shape[0]
    if size == 0:
        # Nothing to rank; also avoids dividing by zero below.
        return np.asmatrix(np.zeros((0, 1)))

    # Plain ndarrays are used for the arithmetic: ``np.mat`` no longer
    # exists in NumPy 2.0.
    transposed = np.asarray(m, dtype=float).T
    teleport = (1 - mu) / size
    ranks = np.ones((size, 1))
    for _ in range(n):
        previous = ranks
        ranks = mu * (transposed @ previous) + teleport
        if np.abs(ranks - previous).max() < epsilon:
            break
    return np.asmatrix(ranks)
def summary(sentences, n=5):
    """Pick the ``n`` highest-scoring sentences as an extractive summary.

    Sentences are turned into binary bag-of-words vectors, linked through
    ``transition_probability`` and ranked with ``score``.

    Args:
        sentences: Iterable of sentence strings.
        n: Number of sentences to keep.

    Returns:
        List of at most ``n`` sentences ordered from highest to lowest
        score; sentences with equal scores keep their input order. An empty
        input returns an empty list.
    """
    sentences = list(sentences)
    if not sentences:
        return []

    # Binary presence/absence features; a TfidfVectorizer could be swapped
    # in here for TF-IDF weighting instead.
    vectorizer = CountVectorizer(binary=True)
    term_matrix = vectorizer.fit_transform(sentences)

    transitions = transition_probability(term_matrix.toarray())
    # np.asarray accepts both numpy.matrix and ndarray score vectors.
    ranks = np.asarray(score(transitions)).ravel()

    # Negate so argsort yields descending scores; the stable sort keeps
    # tied sentences in their original order.
    best = np.argsort(-ranks, kind="stable")[:n]
    return [sentences[i] for i in best]