注意：要先把 string 类型（UTF-8 编码）的词转换成 UTF-32 码点序列，这样计算出的长度才是字符个数，而不是字节数。

数据整理

如果文本中有英文或数字，如“abc”，则把“abc”整体看成一个词来计算；函数返回一个 vector<vector<int>> 类型的词列表。

/// Splits a sequence of Unicode code points into "words".
///
/// A maximal run of ASCII letters and digits ([0-9A-Za-z]) forms one word;
/// every other code point (for example a CJK character or a punctuation mark)
/// is a word on its own and never merges with a neighbouring ASCII run.
///
/// @param result  Code points of the text, one per element.
/// @return The words in input order. For an empty input a single empty word is
///         returned, so the word count is never zero (callers in this file
///         divide by that count).
std::vector<std::vector<int>> getRealWord(std::vector<int> result) {
    std::vector<std::vector<int>> realWord;
    std::vector<int> run;  // ASCII alphanumeric run currently being collected

    for (const int codePoint : result) {
        // Bounds are inclusive so that 'a', 'z', 'A' and 'Z' count as letters.
        const bool isAsciiAlnum = (codePoint >= 'a' && codePoint <= 'z')
                               || (codePoint >= 'A' && codePoint <= 'Z')
                               || (codePoint >= '0' && codePoint <= '9');
        if (isAsciiAlnum) {
            run.push_back(codePoint);
            continue;
        }
        // A non-alphanumeric code point ends the pending run ...
        if (!run.empty()) {
            realWord.push_back(run);
            run.clear();
        }
        // ... and is emitted as its own single-element word.
        realWord.push_back(std::vector<int>(1, codePoint));
    }

    // Flush a trailing run; for empty input keep the single empty word.
    if (!run.empty() || realWord.empty()) {
        realWord.push_back(run);
    }
    return realWord;
}

UTF8转UTF32

#include <algorithm>
#include <cmath>
#include <cstring>
#include <iterator>
#include <string>
#include <vector>

#include "utf8.h"
/// Decodes a NUL-terminated UTF-8 string into a sequence of code points.
///
/// @param str  NUL-terminated UTF-8 text; a null pointer is treated as empty.
/// @return One element per decoded code point, in input order.
///
/// NOTE(review): the `unchecked` decoder of the utf8 library is used, which
/// presumably performs no validation; the input is assumed to be well-formed
/// UTF-8 — confirm for untrusted data.
std::vector<int> UTF8ToUTF32(const char* str) {
    std::vector<int> result;
    if (str == nullptr) {
        return result;  // strlen(nullptr) would be undefined behaviour
    }
    const char* const end = str + strlen(str);
    utf8::unchecked::utf8to32(str, end, std::back_inserter(result));
    return result;
}

最大长度

//计算最大长度
int Max(std::string str1, std::string str2) {
    //类型转化 采用unicode编码
    vector<vector<int>> pre = getRealWord(UTF8ToUTF32(str1.c_str()));
    vector<vector<int>> next = getRealWord(UTF8ToUTF32(str2.c_str()));

    int a = pre.size();
    int b = next.size();
    return a >b ? a : b;
}

最小长度

/// Returns the smaller of two integers.
int Min(int a, int b) {
    if (b <= a) {
        return b;
    }
    return a;
}

字符串是否部分包含

/// Computes the (multiset) intersection of two integer vectors.
///
/// The inputs may be in any order: sorted local copies are intersected, so
/// the function no longer silently requires pre-sorted arguments. For inputs
/// that are already sorted the result is the same as a plain merge
/// intersection. A value occurring a times in v1 and b times in v2 appears
/// min(a, b) times in the output.
///
/// @param v1   First vector (not modified).
/// @param v2   Second vector (not modified).
/// @param des  Receives the common elements in ascending order; any previous
///             content is discarded. It may alias v1 or v2, because the inputs
///             are copied before des is written.
void vectorIntersect(const std::vector<int>& v1, const std::vector<int>& v2, std::vector<int>& des)
{
    std::vector<int> lhs(v1);
    std::vector<int> rhs(v2);
    std::sort(lhs.begin(), lhs.end());
    std::sort(rhs.begin(), rhs.end());

    std::vector<int> common;
    std::set_intersection(lhs.begin(), lhs.end(),
                          rhs.begin(), rhs.end(),
                          std::back_inserter(common));
    des.swap(common);
}
//判断字符串是否部分包含另一个字符串
bool  SearchPartStr(std::string str1, std::string str2)
{
    vector<int> pre = UTF8ToUTF32(str1.c_str());
    vector<int> next = UTF8ToUTF32(str2.c_str());
    vector<int> result;
    vectorIntersect(pre, next, result);
    //要除去全包含的情况
    if (str1 != str2&&!result.empty()
        && !SearchAllStr(str1.c_str(), str2.c_str())
        && !SearchAllStr(str2.c_str(), str1.c_str()))
    {
        return true;
    }
    else
    {
        return false;
    }
}

字符串是否全部包含

/// Tests whether pSearch occurs as a contiguous substring of pSource.
///
/// The comparison is byte-wise on the NUL-terminated strings.
///
/// @param pSource  String to search in; a null pointer yields false.
/// @param pSearch  String to look for; a null pointer yields false.
/// @return true when pSearch is non-empty and found inside pSource. An empty
///         pSearch yields false (unlike strstr), which is what the previous
///         hand-written search returned as well.
bool SearchAllStr(const char *pSource, const char *pSearch)
{
    if (pSource == nullptr || pSearch == nullptr) {
        return false;
    }
    if (*pSearch == '\0') {
        return false;  // an empty needle never counts as contained
    }
    // The library search is iterative, so stack depth does not grow with the
    // input length.
    return std::strstr(pSource, pSearch) != nullptr;
}

编辑距离

参考文章《编辑距离及编辑距离算法》
编辑距离矩阵

//编辑距离矩阵
int ** GetEditMatrix(std::string str1, std::string str2) {

    vector<vector<int>> pre = getRealWord(UTF8ToUTF32(str1.c_str()));
    vector<vector<int>> next = getRealWord(UTF8ToUTF32(str2.c_str()));

    int max1 = pre.size();
    int max2 = next.size();
    int **ptr = new int *[max1 + 1];
    for (int i = 0; i < max1 + 1; i++)
    {
        ptr[i] = new int[max2 + 1];
    }
    for (int i = 0; i < max1 + 1; i++)
    {
        ptr[i][0] = i;
    }
    for (int i = 0; i < max2 + 1; i++)
    {
        ptr[0][i] = i;
    }
    for (int i = 1; i < max1 + 1; i++)
    {
        for (int j = 1; j < max2 + 1; j++)
        {
            int d;
            int temp = Min(ptr[i - 1][j] + 1, ptr[i][j - 1] + 1);
            if (pre[i - 1] == next[j - 1])
            {
                d = 0;
            }
            else
            {
                d = 1;
            }
            ptr[i][j] = Min(temp, ptr[i - 1][j - 1] + d);
        }
    }
    return ptr;
}

编辑距离

//编辑距离
double GetEditDistance(std::string str1, std::string str2)
{
    vector<vector<int>> pre = getRealWord(UTF8ToUTF32(str1.c_str()));
    vector<vector<int>> next = getRealWord(UTF8ToUTF32(str2.c_str()));

    int max1 = pre.size();
    int max2 = next.size();
    int **ptr = GetEditMatrix(str1, str2);
    int dis = ptr[max1][max2];
    for (int i = 0; i < max1 + 1; i++)
    {
        delete[] ptr[i];
        ptr[i] = NULL;
    }
    delete[] ptr;
    ptr = NULL;
    return dis;
}

最大公共子串长度

//两个字符串之间最长的相同子字符串的长度
int longestCommonSubstring(std::string s1, std::string s2)
{
    vector<int> str1 =UTF8ToUTF32(s1.c_str());
    vector<int> str2 =UTF8ToUTF32(s2.c_str());
    int len1 = str1.size();
    int len2 = str2.size();
    if (len1 == 0 || len2 == 0)return 0;
    //定义二维数组dp[i][j]  代表串1从0~i这段与串2从0~j这段的公共子串的最大值
    //赋初值dp[0~len1][0]=0   dp[0][0~len2]=0
    vector<vector<int>> dp(len1 + 1, vector<int>(len2 + 1, 0));
    for (int i = 1; i <= len1; i++) {
        for (int j = 1; j <= len2; j++) {
            if (str1[i - 1] == str2[j - 1]) {
                //若相等则上层值+1
                dp[i][j] = dp[i - 1][j - 1] + 1;
            }
            else {
                //若不相等则等于交错值中的最大值
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1]);
            }
        }

    }
    return dp[len1][len2];
}

对角线改变元素行下标

//记录对角线上元素值第一次发生改变的元素行下标
int GetFirstChange(std::string str1, std::string str2) {
    //类型转化 采用unicode编码

    vector<vector<int>> pre = getRealWord(UTF8ToUTF32(str1.c_str()));
    vector<vector<int>> next = getRealWord(UTF8ToUTF32(str2.c_str()));

    int max1 = pre.size();
    int max2 = next.size();
    int **ptr = GetEditMatrix(str1,str2);
    vector<int> result;
    for (int i = 0; i < max1 + 1; i++)
    {
        for (int j = 0; j < max2 + 1; j++)
        {
            if (i == j&&i != 0 && j != 0) {
                if (ptr[i - 1][j - 1] != ptr[i][j]) {
                    result.push_back(i);
                }
            }
            /*cout << ptr[i][j] << "";*/
        }
        /*cout << endl;*/
    }
    for (int i = 0; i < max1 + 1; i++)
    {
        delete[] ptr[i];
        ptr[i] = NULL;
    }
    delete[] ptr;
    if (!result.empty()) {
        return result[0];
    }
    else {
        return  0;
    }
}

关联度计算

论文上的公式

优化前方案

优化后方案

代码实现

//计算关联度
double GetCorrelation(std::string str1, std::string str2, int flag) {

    //optimiLen优化后的编辑距离
    double  correlation, correlA, correlB, optimiLen;
    vector<int> result;
    vector<vector<int>> pre = getRealWord(UTF8ToUTF32(str1.c_str()));
    vector<vector<int>> next = getRealWord(UTF8ToUTF32(str2.c_str()));
    int m = pre.size();
    int n = next.size();
    switch (flag)
    {
    case 0:
        //字符串全包含编辑距离替换成两串长度差的平均值
        if (str1 != str2 && (SearchAllStr(str1.c_str(), str2.c_str()) || SearchAllStr(str2.c_str(), str1.c_str())))
        {
             optimiLen = abs(GetEditDistance(str1, str2)- abs((m-n)/2));
        }
        //字符串部分包含编辑距离替换最小字符串长度减去字符串交集部分长度后取平均值
        else if (SearchPartStr(str1.c_str(), str2.c_str(),result) || SearchPartStr(str2.c_str(), str1.c_str(),result))
        {
            int t = result.size();
             optimiLen = abs(GetEditDistance(str1, str2) - abs((Min(m,n)-t) / 2));
        }
        else {
             optimiLen = GetEditDistance(str1, str2);
        }
        /*cout << editLen << endl;*/
        correlA = double(1 - ( optimiLen/ Max(str1, str2)));
        correlB = double(longestCommonSubstring(str1, str2) / double( optimiLen + longestCommonSubstring(str1, str2) + double((m - GetFirstChange(str1, str2)) / m)));
        /*cout << correlA << "," << correlB << endl;*/
        //返回两种方法的最大值
        if ( optimiLen != 0) {
            correlation = MaxCorrelation(correlA, correlB);
        }
        else {
            correlation = MaxCorrelation(GetCorrelation(str1,str2,1), GetCorrelation(str1, str2, 2));
        }
        break;

    case 1:
        //方法一: 关联度=1-(编辑距离/词最大长度)
        correlation = double(1 - (GetEditDistance(str1, str2) / Max(str1, str2)));
        break;
    case 2:
        //方法二: 关联度=最大子串长度/(最大子串长度+编辑距离+(第一个字符串长度-对角线上元素值第一次发生改变的元素行下标)/第一个字符串长度))
        correlation = double(longestCommonSubstring(str1, str2) / double(GetEditDistance(str1,str2) + longestCommonSubstring(str1, str2) + double((m - GetFirstChange(str1, str2)) / m)));

        break;
    case 3:
        //方法三: 关联度=1/(编辑距离+1)
        correlation = double(1 / (GetEditDistance(str1, str2) + 1));
        break;
    default:
        break;
    }
    return correlation;
}

字符串关联度求解算法改进模型

数据整理

UTF8转UTF32

最大长度

最小长度

字符串是否部分包含

字符串是否全部包含

编辑距离

最大公共子串长度

对角线改变元素行下标

关联度计算

优化前方案

优化后方案

代码实现

运行结果

猜你喜欢