文章关键词提取算法

文本预处理部分

1.对于原始文档，我们要求是中文（包括标点符号），并且文档的第一句（即第一个全角句号之前的内容）应该是文章的标题。

2.采用ICTCLAS分词，并标注词性。

wordseg.cpp

注意编译时要指明头文件和动态库的路径：

g++ wordseg.cpp -o wordseg -I /home/orisun/master/ICTCLAS50_Linux_RHAS_32_C/API -L /home/orisun/master/ICTCLAS50_Linux_RHAS_32_C/API -lICTCLAS50

3.保留特定词性的词语，其它的删掉。同时把文档合并成一行。

posfilter.cpp

4.把文章分为四大部分：标题，段首，段中，段尾。各部分之间用一个空行分开。标题是第一句，紧接着后两句是段首，文章末两句是段尾，其余属段中。

section.cpp

5.构造元组（词语，词性，出现的次数，出现在标题，出现在段首，出现在段尾）

tuple.cpp

6.把同义词词林存入gdbm数据库

sy2db.cpp

算法部分

7.计算词语之间的相似度

simMatrix.cpp

8.根据词语的语义相似度矩阵，计算词语居间度

顶点V_i的居间度bc_i定义为：

${ bc }_{ i }=\sum _{ m,k=1 }^{ n }{ \frac { { g }_{ mk }({ V }_{ i }) }{ { g }_{ mk } } }$

n是顶点的个数，g_mk是顶点m和k之间的最短路径的个数，g_mk(V_i)是顶点m和k之间的最短路径中经过顶点V_i的条数。

对于无向图可以表示为

${ bc }_{ i }=\sum _{ m=1 }^{ n }{\sum_{k=1}^{m-1}{ \frac { { g }_{ mk }({ V }_{ i }) }{ { g }_{ mk } } } }$

Dijkstra算法可以找到单源节点的最短路径，但是只能找出一条，要想找到两顶点之间的所有最短路径只需对经典Dijkstra稍作修改（见下面的代码）。在Dijkstra中运用PairingHeap可以提高算法效率，见我的另外一篇博客《用PairingHeap改进Dijkstra算法》。分别指定不同的顶点作起点就可以找出图中所有的最短路径。

代码中使用一个全局数组PairDependencyArray [ num_of_vertex ] 来保存各个节点的居间度，数组初始化为0，随着新的最短路径的发现，数组元素不断增加。比如运行一次Dijkstra后我们发现了顶点V₁到其他顶点之间的最短路径：

V	known	d	p
V1	T	0	0
V2	T	1	V1
V3	T	4	V4
V4	T	2	V1,V2
V5	T	3	V4
V6	T	7	V7
V7	T	6	V4,V5

我们直观地画出V₁到V₆的最短路径（有多条）：

现在我们要更新每条路径上除两端点之外的中间节点的居间度，它们的居间度要增加一个值，这个值怎么计算呢？终点赋予1.0，它的前继节点平分这个值。

PairDependencyArray [7]增加1;

PairDependencyArray [4]增加0.5+0.5;

PairDependencyArray [5]增加0.5;

PairDependencyArray [2]增加0.25+0.25;

pairheap.h

bc.cpp

#include<cassert>
#include<climits>
#include<cstdlib>
#include<fstream>
#include<iostream>
#include<list>
#include<set>
#include<sstream>
#include<string>
#include<utility>
#include<vector>
#include"pairheap.h"
using namespace std;
 
/**
 * One row of the table that Dijkstra's algorithm keeps per vertex.
 */
struct Table {
    bool known;             // set to true once the vertex has been taken off the heap
    double dv;              // shortest distance from the source found so far
    std::vector<int> pv;    // every predecessor lying on some shortest path to this vertex

    // Give a default-constructed row well-defined contents ("not visited,
    // infinitely far, no predecessors") instead of indeterminate values.
    // INT_MAX is the same "infinity" sentinel the rest of this file uses.
    Table() : known(false), dv(INT_MAX) {}
};

/**
 * Tree node that only knows its parent, so a path can be walked from any
 * node up towards the root by following `parent` until it is NULL.
 */
template <class Printable>
struct Node {
    Printable data;     // payload carried by this node
    Node *parent;       // next node towards the root; NULL for the root itself

    // Build a node holding `d`; it is a root unless a parent `p` is supplied.
    Node(Printable d, Node *p = NULL)
        : data(d)
        , parent(p)
    {}
};
 
// Program-wide state shared by the functions in this file.
int vertextNum = 0;             // number of vertices in the graph
double **adjMatrix = NULL;      // the graph, stored as an adjacency matrix
vector<Table> dijkTable;        // table maintained by Dijkstra's algorithm
double *pairDependency = NULL;  // per-vertex betweenness accumulator (allocated in main)

vector<string> words;           // the words that occur in the article
 
/**
 * Load the word list into the global `words` vector.
 *
 * One entry is appended per line of `filename`: the first
 * whitespace-delimited token of that line. A line without any token
 * contributes an empty string.
 */
void initWords(string filename)
{
    ifstream input(filename.c_str());
    assert(input);

    for (string row; getline(input, row);) {
        istringstream fields(row);
        string first;
        fields >> first;        // only the leading column is needed
        words.push_back(first);
    }

    input.close();
}
 
/**
 * Read the weighted graph from `filename` into the global adjacency matrix.
 *
 * Layout, as parsed here:
 *   - first line: the number of vertices (stored in vertextNum);
 *   - every later line: a row index followed by tokens "col(weight)".
 * Each token sets adjMatrix[row][col] and its mirror adjMatrix[col][row];
 * cells that are never mentioned stay 0.
 *
 * The program exits with an error if the size line cannot be parsed.
 * Rows, columns or tokens that are out of range or malformed are reported
 * on stderr and skipped rather than written outside the matrix.
 */
void initMatrix(string filename)
{
    ifstream ifs(filename.c_str());
    assert(ifs);

    string line;
    getline(ifs, line);     // the first line holds the size of the matrix
    istringstream stream(line);
    int scale = 0;
    if (!(stream >> scale) || scale < 0) {
        cerr << "invalid matrix size in " << filename << endl;
        exit(-1);
    }
    vertextNum = scale;

    adjMatrix = new double *[scale];    // build the 2-D array dynamically
    for (int i = 0; i < scale; ++i) {
        adjMatrix[i] = new double[scale];
        for (int j = 0; j < scale; ++j)
            adjMatrix[i][j] = 0;
    }

    while (getline(ifs, line)) {
        istringstream strm(line);
        int row = 0;
        if (!(strm >> row))     // the first column is the vertex index
            continue;           // blank or non-numeric line: nothing to read
        if (row < 0 || row >= scale) {
            cerr << "skipping row with out-of-range index " << row << endl;
            continue;
        }

        string word;
        while (strm >> word) {
            // Keep the positions as size_type so a missing parenthesis is
            // detected via npos instead of being narrowed to an int.
            const string::size_type pos1 = word.find('(');
            const string::size_type pos2 = word.find(')');
            if (pos1 == string::npos || pos2 == string::npos || pos2 < pos1) {
                cerr << "skipping malformed entry: " << word << endl;
                continue;
            }
            const int col = atoi(word.substr(0, pos1).c_str());
            if (col < 0 || col >= scale) {
                cerr << "skipping entry with out-of-range index: " << word << endl;
                continue;
            }
            const double dis =
                atof(word.substr(pos1 + 1, pos2 - pos1 - 1).c_str());
            adjMatrix[row][col] = adjMatrix[col][row] = dis;
        }
    }

    ifs.close();
}
 
/*
 * Release the adjacency matrix.
 *
 * Every row was allocated with new[], so it must be released with delete[].
 * The pointer is reset afterwards so that a second call is a harmless no-op.
 */
void deleteMatrix()
{
    if (adjMatrix == NULL)
        return;
    for (int i = 0; i < vertextNum; ++i)
        delete[] adjMatrix[i];
    delete[] adjMatrix;
    adjMatrix = NULL;
}
 
/* Dump the adjacency matrix to stdout: one row per line, every cell followed by a tab. */
void printMatrix()
{
    for (int row = 0; row < vertextNum; ++row) {
        const double *cells = adjMatrix[row];
        for (int col = 0; col < vertextNum; ++col)
            cout << cells[col] << "\t";
        cout << endl;
    }
}
 
void initDijkTable()
{
    dijkTable.clear();
    Table tb;
    tb.dv = INT_MAX;
    tb.known = false;
    for (int i = 0; i < vertextNum; ++i)
        dijkTable.push_back(tb);
}
 
void printDijkTable()
{
    for (int i = 0; i < vertextNum; ++i) {
        cout << i << "\t" << (dijkTable[i].
                      known ? "TRUE" : "FALSE") << "\t" <<
            dijkTable[i].dv << "\t";
        for (int j = 0; j < dijkTable[i].pv.size(); ++j) {
            cout << dijkTable[i].pv.at(j) << "\t";
        }
        cout << endl;
    }
}

/*
 * Recursively grow the tree of shortest paths backwards from `index`.
 *
 * nodeP      - the tree node that stands for vertex `index`
 * index      - the vertex whose predecessors are expanded next
 * startindex - the source vertex; reaching it ends a path
 * leafNodes  - receives the node at which each complete path ends
 *
 * A child of nodeP is created for every predecessor recorded in
 * dijkTable[index].pv. The nodes are allocated with new and are not freed here.
 */
void addNode(Node<string> *nodeP,int index,int startindex,list<Node<string> *> *leafNodes){
    if (index != startindex) {
        const vector<int> &preds = dijkTable[index].pv;
        for (vector<int>::size_type k = 0; k < preds.size(); ++k) {
            const int pred = preds[k];
            Node<string> *child = new Node<string>(words[pred], nodeP);
            addNode(child, pred, startindex, leafNodes);
        }
        return;
    }
    // Reached the source: this node terminates one shortest path.
    leafNodes->push_back(nodeP);
}

/*根据DijkTable打印所有的最短路徑*/
void printSPathFromSource(int startindex){
    ofstream ofs("shortpath",ofstream::app);        //把所有的最短路径追加方式写入文件
    for (int endindex = 0; endindex < vertextNum; ++endindex) {
        list<Node<string> *> leafNodes;
        Node<string>* nodeP=new Node<string>(words[endindex]);
        addNode(nodeP,endindex,startindex,&leafNodes);
        
        list<Node<string> *>::iterator itr=leafNodes.begin();
        while(itr!=leafNodes.end()){
            Node<string>* down=*itr;
            while(down){
                ofs<<down->data<<"\t";
                down=down->parent;
            }
            ofs<<endl;
            itr++;
        }
    }
}
 
/*
 * Run Dijkstra's algorithm from vertex `start`, using the pairing heap from
 * pairheap.h, and record the result in the global dijkTable.
 *
 * Unlike the textbook version, every predecessor that yields an equally short
 * distance is kept in dijkTable[i].pv, so all shortest paths can be rebuilt.
 *
 * NOTE(review): insert/deleteMin/findNode/decreaseKey/deleteHeap are defined
 * in pairheap.h, which is not visible here. The loop below only terminates if
 * deleteMin updates phroot and leaves it NULL once the heap is empty — confirm.
 */
void dijkstra(int start)
{
    initDijkTable();
    dijkTable[start].dv = 0;
    // Seed the heap: the source with key 0, every other vertex with key INT_MAX.
    PairNode *phroot = new PairNode(start, 0);
    for (int i = 0; i < vertextNum; ++i) {
        if (i == start)
            continue;
        insert(phroot, new PairNode(i, INT_MAX));
    }
    while (phroot != NULL) {
        // The heap root is taken as the next vertex to finalize.
        int index = phroot->nodeindex;
        dijkTable[index].known = true;
        deleteMin(phroot);
        for (int i = 0; i < vertextNum; ++i) {
            // A zero cell is treated as "no edge"; finalized vertices are skipped.
            if (adjMatrix[index][i] != 0
                && dijkTable[i].known == false) {
                double newdis =
                    dijkTable[index].dv + adjMatrix[index][i];
                double delta = dijkTable[i].dv - newdis;
                if (delta > 0) {
                    // Strictly shorter route found: look the vertex up in the
                    // heap by (index, current distance) before lowering its key.
                    pair < int, double >pa;
                    pa.first = i;
                    pa.second = dijkTable[i].dv;
                    PairNode *fp = findNode(phroot, pa);
                    if (fp == NULL) {
                        cerr << "not found:" << pa.
                            first << "(" << pa.
                            second << ")" << endl;
                        cerr << "root=" << phroot->
                            nodeindex << endl;
                        exit(-1);
                    }
                    decreaseKey(phroot, fp, delta);
                    dijkTable[i].dv = newdis;
                    // Older predecessors belong to longer routes; start over.
                    dijkTable[i].pv.clear();
                    dijkTable[i].pv.push_back(index);
                } else if (delta == 0) {
                    // Tie: `index` is one more predecessor on a shortest path.
                    // NOTE(review): exact floating-point comparison — ties are
                    // only detected when the two sums are bit-for-bit equal.
                    dijkTable[i].pv.push_back(index);
                }
            }
        }
    }
    // phroot is NULL here (loop exit condition).
    deleteHeap(phroot);
}
 
/*
 * Distribute the weight `base` held by vertex `index` over its shortest-path
 * predecessors, walking back towards `start`.
 *
 * base  - weight carried by `index` on the path being processed
 * index - vertex whose predecessors (dijkTable[index].pv) receive a share
 * start - source vertex; it is an end point, so it receives no credit
 *
 * The predecessors split `base` equally. Each one adds its share, scaled by
 * 1/vertextNum, to pairDependency and passes that same share further back.
 *
 * Fix: the recursion used to pass pairDependency[ind], the global running
 * total, which already contains credit from other paths and pairs and is
 * already scaled. The value to hand on is this path's own share, base / len.
 */
void ancRat(double base, int index, int start)
{
    if (index == start)
        return;
    const int len = dijkTable[index].pv.size();
    if (len == 0)
        return;     // no recorded predecessor: nothing to distribute
    const double share = base / len;
    for (int i = 0; i < len; ++i) {
        const int ind = dijkTable[index].pv.at(i);
        if (ind == start)
            continue;
        // The extra vertextNum in the denominator keeps the accumulated
        // betweenness from growing too large.
        pairDependency[ind] += base / (len * vertextNum);
        ancRat(share, ind, start);
    }
}
 
/*
 * Accumulate the pair-dependency of every vertex lying on the shortest paths
 * from `start` to `terminal`, based on the current contents of dijkTable.
 */
void pairDepend(int start, int terminal)
{
    // The end vertex begins with one full unit to hand back along its paths.
    const double terminalWeight = 1.0;
    ancRat(terminalWeight, terminal, start);
}
 
int main(int argc, char *argv[])
{
    if (argc < 3) {
        cerr << "Usage:command sim_matrix_file tuple_file." << endl;
        return -1;
    }
    string filename(argv[1]);
    initWords(argv[2]);
    initMatrix(filename);   //初始华邻接矩阵
    pairDependency = new double[vertextNum];
    for (int i = 0; i < vertextNum; ++i)
        pairDependency[i] = 0;
    for (int i = 0; i < vertextNum; ++i) {
        dijkstra(i);
        printSPathFromSource(i);
        //printDijkTable();
        for (int j = 0; j < i; ++j) {    //因为是无向图，所以算一半就够了
            pairDepend(i, j);
        }
    }
    deleteMatrix();
 
    ofstream ofs("BetweenCencility");
    ofs << vertextNum << endl;
    for (int i = 0; i < vertextNum; ++i)
        ofs << pairDependency[i] << endl;
    ofs.close();
 
    return 0;
}

9.计算居间度密度

bcdensity.cpp

10.计算词语的总得分。得分最高的K个为关键词

score.cpp

11.算法测试。

下面取了《人民日报》上的两篇文章，进行了关键词提取。结果如下：

《让雷锋精神代代相传》2012年3月5日见报

我提取的关键词：

雷锋 9.38646
精神 6.41262
社会主义 3.75446
共产主义 3.31446
推动 3.04646

凤凰网提取的关键词：

雷锋
精神
共产主义
社会主义

《宁要“不完美”的改革不要不改革的危机》2012年2月23日见报

我提取的关键词：

改革 14.7982
风险 3.59819
危机 2.39819
历史 1.99819
发展 1.93819

凤凰网提取的关键词：

改革
风险
危机
问题

12.最后我实事求是地宣告“基于语义”的关键词提取是失败的。

还是《让雷锋精神代代相传》这篇文章，按照score.cpp中的算法提取前20个关键词如下：

雷锋 9.35777
精神 6.4173
社会主义 3.72577
共产主义 3.28577
推动 3.01777
活动 2.61265
道德 2.43777
体系 2.31777
价值 2.31777
坚定 2.29777
内涵 2.19777
建设 2.17265
具有 2.05777
人民 1.99777
重要 1.97777
开展 1.91265
中国 1.87777
核心 1.85265
弘扬 1.81777
全国 1.67777

如果把“语义”的权值调整为0,即把score.cpp修改两行：

const float vdw = 0.0;

const float tw = 1.0;

这样得到的前20个关键词是：

雷锋 22.22
精神 16.02
社会主义 8.14
共产主义 7.04
活动 6.52
推动 6.37
建设 5.42
道德 4.92
开展 4.77
体系 4.62
核心 4.62
价值 4.62
坚定 4.57
内涵 4.32
具有 3.97
人民 3.82
重要 3.77
中国 3.52
弘扬 3.37
全国 3.02

可见“语义”在关键词提取中实际上没有发挥作用！