基于词频的文件相似度-算法应用

一、问题引入：

二、分析：
1.首先，我们从题目得知：

文件相似度 = 两个文件的公共词汇数 / 两个文件的总词汇数 × 100%，其中总词汇数按并集计算：总词汇数 = 文件1词汇数 + 文件2词汇数 − 公共词汇数

2.注意的地方：

不考虑中文，只处理文章中的英文单词。

3.输入： txt文件所在的文件夹目录

三、代码分析：

看代码：一般从main函数开始(程序入口同样也是main函数)

1. 函数1 ：在指定文件夹下找到对应的文件类型(后缀)的所有文件名称。

所有文件名称保存在传进来的参数 vector<string>& files 里面。因为该参数是引用（&），函数内对 files 的修改会直接作用于调用者传入的 vector，函数返回后结果仍然可用。

/*
 * Collects the full paths of the files in a directory whose names end with
 * the given extension.
 *
 * path:     directory to scan
 * files:    output; every match is appended as path + "\" + file name,
 *           entries already present are kept
 * fileType: extension to match, e.g. ".txt"
 *
 * Nothing is appended when the search cannot be started or finds no match.
 */
void getAllFiles(string path, vector<string>& files,string fileType) {
    struct _finddata_t fileinfo;
    const string pattern = path + "\\*" + fileType;

    // _findfirst returns an intptr_t search handle. Holding it in a `long`
    // truncates it on 64-bit Windows, where long is only 32 bits wide.
    const intptr_t hFile = _findfirst(pattern.c_str(), &fileinfo);
    if (hFile == -1) {
        return;
    }

    do {
        // A directory whose name matches the pattern is not a readable file.
        if ((fileinfo.attrib & _A_SUBDIR) == 0) {
            files.push_back(path + "\\" + fileinfo.name);
        }
    } while (_findnext(hFile, &fileinfo) == 0);  // 0 means another entry was found

    _findclose(hFile);
}

2. 关键算法 - 使用哈希思想的文本相似度算法

3.完整代码：

#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <cstring> 
#include <stdlib.h>
#include <vector> 
#include <iostream> 
#include <set>
#include <io.h>
#include <string>
#include <vector>
#include <fstream>
#include<iostream>
#define P 524309
#define WordMaxLen 10 // 宏定义 
using namespace std;

// Node of the singly linked word list kept for each file.
typedef struct WordNode {
    // One byte more than WordMaxLen so that a word of exactly WordMaxLen
    // characters still has room for its terminating NUL when strcpy'd in.
    char value[WordMaxLen + 1];
    struct WordNode *next;  // following node, NULL at the end of the list
} *PWordNode, *PWordList;

// Per-file record: how many words were stored for the file and the list
// (headed by a sentinel node) that holds them.
struct FileNode {
    int wordCount;   // number of words appended to `list`
    PWordList list;  // sentinel head; the first real word is list->next
};
typedef struct FileNode *PFileNode;
typedef struct FileNode *PFileList;

// One slot of the word hash table.
typedef struct HTNode {
    // One byte more than WordMaxLen so that a word of exactly WordMaxLen
    // characters still has room for its terminating NUL when strcpy'd in.
    char value[WordMaxLen + 1];
    int fileCount;    // number of files flagged in fileChoose; 0 marks an empty slot
    int *fileChoose;  // one 0/1 flag per file index, allocated in init()
} *PHTNode;

PFileList fileList; // array of N per-file records, allocated by init()
PHTNode table; // hash table of P slots, allocated by init()
int N;  // number of files being compared
string s[1001]; // words shared by the two files, filled by printSimilar()
int index = 0;  // number of entries of s currently in use

// Allocates `bytes` bytes and terminates the program with a message when the
// allocation fails. A zero-byte request is rounded up to one byte so that a
// NULL result always means failure.
static void *allocOrExit(size_t bytes) {
    void *mem = malloc(bytes != 0 ? bytes : 1);
    if (mem == NULL) {
        fprintf(stderr, "init: out of memory (%lu bytes requested)\n",
                (unsigned long) bytes);
        exit(EXIT_FAILURE);
    }
    return mem;
}

// Allocates and resets the global structures for N files:
//   - fileList: N records, each with wordCount 0 and an empty word list that
//     consists of a sentinel head node only;
//   - table: P hash slots, each with fileCount 0 and N cleared file flags.
// N must be set before this is called. Allocation failures end the program
// instead of being dereferenced as NULL.
void init() {
    fileList = (PFileList) allocOrExit(sizeof(struct FileNode) * N);
    for (int i = 0; i < N; i++) {
        fileList[i].wordCount = 0;
        fileList[i].list = (PWordNode) allocOrExit(sizeof(struct WordNode));
        fileList[i].list->next = NULL;
    }

    table = (PHTNode) allocOrExit(sizeof(struct HTNode) * P);
    for (int i = 0; i < P; i++) {
        table[i].fileCount = 0;
        table[i].fileChoose = (int *) allocOrExit(sizeof(int) * N);
        for (int j = 0; j < N; j++) {
            table[i].fileChoose[j] = 0;
        }
    }
}
// Shift-based string hash.
// Folds every character of the NUL-terminated word `ss` into an unsigned
// accumulator (shift left by 5, add the character, subtract 'a') and reduces
// the result modulo P to obtain a table slot.
int hash(char ss[]) {
    unsigned int acc = 0;
    // The character is promoted to int and then combined with the unsigned
    // accumulator, so the whole expression is evaluated in unsigned arithmetic.
    for (const char *cur = ss; *cur != '\0'; ++cur) {
        acc = (acc << 5) + *cur - 'a';
    }
    return acc % P;
}
//将文件编号插入到散列表中的倒排索引表
void addWordToFile(int fileIndex, char ss[]) {
	//为单词节点开辟空间 
    PWordNode nn = (PWordNode) malloc(sizeof(struct WordNode));
    //赋值 
    strcpy(nn->value, ss);
    nn->next = NULL;
    //定义文件节点-并加上文件索引 
    PFileNode pFile = fileList + fileIndex;
    //如果文件节点里的单词个数为0 
    if (pFile->wordCount == 0) {
    	//文件节点的单词节点实体的下一个指向nn 
        pFile->list->next = nn;
        //文件节点里的单词书+1 
        pFile->wordCount++;
    } else {
    	//文件节点里的单词个数不为0的情况 
        int flag = 0;
        PWordNode p = pFile->list;
        while (p->next != NULL) {
            p = p->next;
            //两个字符串相等时,strcmp返回0 
            if (strcmp(p->value, ss) == 0) {
                flag = 1;
                break;
            }
        }
        //判断一下标志变量 
        if (! flag) {
            p->next = nn;
            (pFile->wordCount)++;
        }
    }
}
//将单词在散列表中的位置存入每个文件的词汇索引表 
void addWordToTable(int fileIndex, char ss[]) {
	//获取哈希码 
    int hCode = hash(ss);
    //定义 
    PHTNode pNode = table + hCode;
    //接收的第一个 
    if (pNode->fileCount == 0) {
        strcpy(pNode->value, ss);
        pNode->fileChoose[fileIndex] = 1;
        pNode->fileCount++;
    } else {
    	//两个字符串相等的情况 
        if (strcmp(pNode->value, ss) == 0) {
            if (pNode->fileChoose[fileIndex] == 0) {
                pNode->fileChoose[fileIndex] = 1;
                pNode->fileCount++;
            } 
        } else {
            while ((table + hCode)->fileCount == 0) {
                hCode++;
                hCode %= P;
            }
            pNode = table + hCode;
            strcpy(pNode->value, ss);
            pNode->fileChoose[fileIndex] = 1;
            pNode->fileCount++;
        }
    }
}
//计算两个文件之间的相似度 
double printSimilar(int f1, int f2) {
	//选择词汇量比较小的那个文件 
    if (fileList[f1].wordCount > fileList[f2].wordCount) {
        int t = f1;
        f1 = f2;
        f2 = t;
    }
    //公共词汇的记录数 
    int allHaveCount = 0;
    //文件节点赋值为 "词汇量比较小的那个文件"
    PWordNode p = fileList[f1].list;
    while (p->next != NULL) {
        p = p->next;
        //获取哈希码 
        int hCode = hash(p->value);
        if ((table + hCode)->fileCount == 0) {
            continue;
        } else {
        	//两字符串不相等就循环 
            while (strcmp(p->value, (table + hCode)->value) != 0) {
                hCode++;
                hCode %= P;
            }
            PHTNode pHTNode = table + hCode;
            if (pHTNode->fileChoose[f2] == 1) {
            	//如果有单词相同
            	s[index++] = pHTNode->value;  //保存公共词汇字符串 
                allHaveCount++;  //公共词汇+1 
            }
        }
    }
    float rate = allHaveCount * 100.0 / (fileList[f1].wordCount + fileList[f2].wordCount - allHaveCount);
    return rate;
}
/*
 * Collects the full paths of the files in a directory whose names end with
 * the given extension.
 *
 * path:     directory to scan
 * files:    output; every match is appended as path + "\" + file name,
 *           entries already present are kept
 * fileType: extension to match, e.g. ".txt"
 *
 * Nothing is appended when the search cannot be started or finds no match.
 */
void getAllFiles(string path, vector<string>& files,string fileType) {
    struct _finddata_t fileinfo;
    const string pattern = path + "\\*" + fileType;

    // _findfirst returns an intptr_t search handle. Holding it in a `long`
    // truncates it on 64-bit Windows, where long is only 32 bits wide.
    const intptr_t hFile = _findfirst(pattern.c_str(), &fileinfo);
    if (hFile == -1) {
        return;
    }

    do {
        // A directory whose name matches the pattern is not a readable file.
        if ((fileinfo.attrib & _A_SUBDIR) == 0) {
            files.push_back(path + "\\" + fileinfo.name);
        }
    } while (_findnext(hFile, &fileinfo) == 0);  // 0 means another entry was found

    _findclose(hFile);
}

int main() {
	//C:\Users\mibook\Desktop\文本相似度算法 
	cout<<"请输入文件夹名称："<<endl; 
    string sss;  // 
	cin>>sss;
	vector<string> temp;   
	//扫描sss文件目录下的所有后缀为.txt的文件名，保存在temp数组中。 
	getAllFiles(sss , temp, ".txt");   
	cout<<"---------------查找到该目录下的所有txt文件------------------"<<endl;
	for (int i = 0; i < temp.size();++i ) {
	  cout << temp[i] << endl;
	}
	cout<<"------------------------------------------------------------"<<endl;
	
    N = temp.size();   //txt文件数量 
    init();      //初始化 
    int i = 0;
    
    for(int i = 0; i < N; i++) {
		fstream fin;
		//把vector里的数据转到char数组 
		char filename[1000];
		//复制函数，把第二个参数char[]的所有字符复制到第一个参数char[]中。 
		strcpy(filename,temp[i].c_str()); 
		//动态记录取到的单词
		char ss[WordMaxLen];  
		//打开文件 
		fin.open( filename , ios::in);
		string s; 
		//循环获取txt中的字符串 
		while(fin>>s){
			//表示如果单词的字母数>10,则:截取前10个字母的单词
			if(s.length() > 10) {
				s = s.substr(0,10); 
			} 
			//string转char 
			strcpy(ss , s.c_str()); 
        	//将文件编号插入到散列表中的倒排索引表
            addWordToFile(i, ss);
            //将单词在散列表中的位置存入每个文件的词汇索引表
            addWordToTable(i, ss);
		} 
    }
    cout<<endl; 
    //M = N * (N-1) / 2; 
    //int f1, f2;
    cout<<" ---------------------------------------------------------------"<<endl;
    cout<<"|第i篇文章与第j篇文章 | 文本相似度  |          公共词汇         "<<endl;
    cout<<" ---------------------------------------------------------------"<<endl;
    //遍历次数 = N * (N-1) / 2; 
    for (i = 1; i <= N; i++) {
    	for(int j = i+1; j <= N; j++){
    		//初始化记录相同词汇的个数 
	    	index = 0; 
	        //两文件之间的对比 
			double sim = printSimilar(i - 1, j - 1);
	        
	        cout<<"|第"<<i<<"篇文章与第"<<j<<"篇文章"<<" | ";
			printf("  %.1f%%     |      ",sim); 
			cout<<"共"<<index<<"个，具体是："; 
			//打印所有的公共词汇 
	        for(int i = 0; i < index; i++){
	        	cout<<s[i];
	        	if(i != index-1) cout<<",";
			}
			cout<<endl;
			cout<<" ----------------------------------------------------------------"<<endl;
		}
    }
    return 0;
} 
/*
3
Aaa Bbb Ccc
#
Bbb Ccc Ddd
#
Aaa ccc Eee
#
2
1 2
1 3
*/

四、测试：

Over~

Flowerwither

发布了70 篇原创文章 · 获赞 22 · 访问量 6004

私信关注

基于词频的文件相似度-算法应用

猜你喜欢