PositionRank代码分析(二)

2021SC@SDUSC

简介

本文主要分析核心模块PositionRank.py的使用.

初始化

# Create the word graph.
self.graph = nx.Graph()
""" The word graph. """
# Window size, used later when generating edges.
self.window = window

build_graph方法

该方法用于使用window窗口为graph添加边,代码分析如下:

    def build_graph(self, window, pos=None):
        """
        Build the word graph by linking words that occur close together.

        Every word in ``self.words`` whose ``pos_pattern`` is in ``pos``
        becomes a node keyed by its stemmed form. Two kept words are linked
        when they are fewer than ``window`` steps apart in the filtered
        sequence and their positions differ; each further co-occurrence of
        the same pair adds 1 to the edge weight.

        :param window: size of the co-occurrence window, counted in kept
            (POS-filtered) words
        :param pos: POS tags a word must carry to enter the graph; defaults
            to nouns and adjectives
        :return: None, ``self.graph`` is updated in place
        """

        if pos is None:
            # NN: noun, singular or mass
            # NNS: noun, plural
            # NNP: proper noun, singular
            # NNPS: proper noun, plural
            # JJ: adjective
            pos = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ']

        # (stem, position) for every word admitted to the graph, in order.
        seq = []
        for el in self.words:
            if el.pos_pattern in pos:
                seq.append((el.stemmed_form, el.position))
                self.graph.add_node(el.stemmed_form)

        # Build the edges.
        for i, (left_stem, left_position) in enumerate(seq):
            for j in range(i + 1, len(seq)):
                # The gap only grows with j, so once it reaches the window
                # no later word can qualify; stop instead of scanning on.
                if j - i >= window:
                    break
                right_stem, right_position = seq[j]
                if left_position == right_position:
                    continue
                if self.graph.has_edge(left_stem, right_stem):
                    # Pair seen before: strengthen the existing edge.
                    self.graph[left_stem][right_stem]['weight'] += 1
                else:
                    # First co-occurrence: create the edge.
                    self.graph.add_edge(left_stem, right_stem, weight=1)

添加边

如果节点 u 和 v 不在图中,它们会被自动添加到图中。边属性可以用关键字参数指定,也可以直接通过访问边的属性字典来设置。

例如:

>>> G.add_edge(1, 2, weight=3)
>>> G.add_edge(1, 3, weight=7, capacity=15, length=342.7)
def add_edge(self, u_of_edge, v_of_edge, **attr):
    """Add an edge between two nodes, creating either node if it is missing.

    Keyword arguments are merged into the edge's attribute dict; if the
    edge already exists its current attributes are updated rather than
    replaced.

    :param u_of_edge: first endpoint of the edge
    :param v_of_edge: second endpoint of the edge
    :param attr: edge attributes given as keyword arguments
    :return: None
    """
    u, v = u_of_edge, v_of_edge
    # Add the nodes if they are not in the graph yet.
    if u not in self._node:
        self._adj[u] = self.adjlist_inner_dict_factory()
        self._node[u] = self.node_attr_dict_factory()
    if v not in self._node:
        self._adj[v] = self.adjlist_inner_dict_factory()
        self._node[v] = self.node_attr_dict_factory()
    # Add the edge; reuse the existing attribute dict when there is one.
    datadict = self._adj[u].get(v, self.edge_attr_dict_factory())
    datadict.update(attr)
    # Both directions point at the same dict object.
    self._adj[u][v] = datadict
    self._adj[v][u] = datadict

然而,当数据集改为中文后,使用add_edge方法需要添加大量的遍历代码。为了方便从以元组为元素的列表中添加边,可以使用基于add_edge方法的add_edges_from方法。

def add_edges_from(self, ebunch_to_add, **attr):
    """Add every edge from an iterable of edge tuples.

    Each element is either ``(u, v)`` or ``(u, v, attr_dict)``. Missing
    nodes are created. Attributes from the per-edge dict are applied after
    the keyword attributes, so they win when both set the same key.

    :param ebunch_to_add: iterable of 2-tuples or 3-tuples describing edges
    :param attr: attributes applied to every added edge
    :return: None
    :raises NetworkXError: if an element is not a 2-tuple or 3-tuple
    """
    for e in ebunch_to_add:
        ne = len(e)
        if ne == 3:
            u, v, dd = e
        elif ne == 2:
            u, v = e
            dd = {}
        else:
            raise NetworkXError(f"Edge tuple {e} must be a 2-tuple or 3-tuple.")
        # Add the nodes if they are not in the graph yet.
        if u not in self._node:
            self._adj[u] = self.adjlist_inner_dict_factory()
            self._node[u] = self.node_attr_dict_factory()
        if v not in self._node:
            self._adj[v] = self.adjlist_inner_dict_factory()
            self._node[v] = self.node_attr_dict_factory()
        # Reuse the existing attribute dict when the edge is already there.
        datadict = self._adj[u].get(v, self.edge_attr_dict_factory())
        datadict.update(attr)
        datadict.update(dd)
        # Both directions point at the same dict object.
        self._adj[u][v] = datadict
        self._adj[v][u] = datadict

添加边的几种方式

>>> G = nx.Graph() 
>>> e = (1, 2)
>>> G.add_edge(1, 2)  # name the two nodes explicitly
>>> G.add_edge(*e)  # unpack an edge tuple
>>> G.add_edges_from([(1, 2)])  # add edges from an iterable

candidate_scoring方法

该方法用于为候选词、短语打分(基于PageRank算法)

一个小插曲:我一直以为PageRank算法是“页面排名”算法,这和它的功能十分契合;然而在信息检索课程中,老师向我们介绍说,PageRank算法是由谷歌创始人之一“Page”(Larry Page)提出的,因此被命名为PageRank算法。

    def candidate_scoring(self, pos=None, window=10, update_scoring_method=False):
        """
        Score candidate words and phrases with a position-biased PageRank.

        The word graph is built, candidates are filtered, and every word is
        given a personalization weight equal to the sum of 1/position over
        its occurrences, normalized to sum to 1. PageRank is then run on the
        weighted graph and the result is written into ``self.weights``,
        keyed by each candidate's stemmed form.

        :param pos: POS tags a word must carry to be considered; defaults
            to nouns and adjectives
        :param window: size of the co-occurrence window used for the graph
        :param update_scoring_method: if True, score multi-word phrases as
            (occurrences of the phrase among the candidates) x (harmonic
            mean of its words' PageRank scores), following:
            Florescu, Corina, and Cornelia Caragea. "A New Scheme for
            Scoring Phrases in Unsupervised Keyphrase Extraction."
            European Conference on Information Retrieval. Springer, Cham,
            2017. If False, a phrase scores the sum of its words' scores.
        :return: None, ``self.weights`` is updated in place
        """
        from collections import Counter

        if pos is None:
            pos = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ']

        # Build the word graph.
        self.build_graph(window=window, pos=pos)

        # Drop candidates that cannot be keyphrases.
        self.filter_candidates(max_phrase_length=4, min_word_length=3, valid_punctuation='-.')

        # Each occurrence of a word contributes 1 / position_in_the_doc.
        # The tag is tested against the accepted POS list; the local must
        # not reuse the name `pos`, or the test compares a tag with itself.
        personalization = {}
        for w in self.words:
            if w.pos_pattern in pos:
                stem = w.stemmed_form
                personalization[stem] = personalization.get(stem, 0.0) + 1.0 / w.position

        # Normalize so the weights sum to 1. With no qualifying word there
        # is nothing to normalize, so fall back to no personalization
        # instead of dividing by zero.
        total = sum(personalization.values())
        if total:
            factor = 1.0 / total
            normalized_personalization = {k: v * factor for k, v in personalization.items()}
        else:
            normalized_personalization = None

        # pagerank_scipy was removed from NetworkX in 3.0; use it when it
        # exists and fall back to nx.pagerank otherwise.
        pagerank = getattr(nx, 'pagerank_scipy', None)
        if pagerank is None:
            pagerank = nx.pagerank

        # Iteratively compute the score of every word.
        pagerank_weights = pagerank(self.graph, personalization=normalized_personalization, weight='weight')

        # Score every candidate.
        if update_scoring_method:
            # Count each stemmed form once up front instead of rescanning
            # the candidate list for every candidate.
            occurrences = Counter(c.stemmed_form for c in self.candidates)
            for c in self.candidates:
                tokens = c.stemmed_form.split()
                if len(tokens) > 1:
                    inverse_sum = sum(1.0 / pagerank_weights[t] for t in tokens)
                    self.weights[c.stemmed_form] = occurrences[c.stemmed_form] * len(tokens) / inverse_sum
                else:
                    self.weights[c.stemmed_form] = pagerank_weights[c.stemmed_form]
        else:
            for c in self.candidates:
                self.weights[c.stemmed_form] = sum(pagerank_weights[t] for t in c.stemmed_form.split())

Guess you like

Origin blog.csdn.net/Simonsdu/article/details/121308382