Calculating text similarity with the TF-IDF and cosine similarity algorithms (implemented from scratch)

1. TF-IDF algorithm

  TF-IDF (term frequency–inverse document frequency) is a commonly used weighting technique for information retrieval and text mining.
  TF-IDF is a statistical method used to evaluate how important a word is to one document within a document collection or corpus. The importance of a word increases in proportion to the number of times it appears in the document, but decreases in inverse proportion to how frequently it appears across the corpus.
  The main idea of TF-IDF is: if a word appears with a high term frequency (TF) in one article but rarely appears in other articles, the word or phrase is considered to have good discriminating power and to be well suited for classification.
Converted to mathematical thinking:
  TF-IDF is proportional to the number of occurrences of a word in the document, and inversely proportional to the number of occurrences of the word in the entire language.
  TF-IDF = TF (term frequency) * IDF (inverse document frequency)

    TF(t) = (number of occurrences of t in the document) / (total number of terms in the document)

    IDF(t) = log( (total number of documents) / (number of documents containing t + 1) )

Note: the "+ 1" in the denominator avoids division by zero for terms that appear in no document.

2. Cosine similarity

  Cosine distance, also known as cosine similarity, uses the cosine value of the angle between two vectors in a vector space as a measure of the difference between two individuals.
  The closer the cosine value is to 1, the closer the angle is to 0 degrees — that is, the more similar the two vectors are. This measure is called "cosine similarity".

  Assuming vector a is [x1, y1] and vector b is [x2, y2], the law of cosines can be rewritten as

    cos(θ) = (x1·x2 + y1·y2) / ( sqrt(x1² + y1²) · sqrt(x2² + y2²) )

  This way of computing the cosine also holds for n-dimensional vectors. Assuming A and B are two n-dimensional vectors, A = [A1, A2, …, An] and B = [B1, B2, …, Bn], the cosine of the angle θ between A and B is

    cos(θ) = Σᵢ (Aᵢ · Bᵢ) / ( sqrt(Σᵢ Aᵢ²) · sqrt(Σᵢ Bᵢ²) )

3. Code implementation

# -*- coding: utf-8 -*-
import jieba
import math
import numpy as np


# Total number of documents in the corpus (used by the IDF formula).
text_sum = 2

# Document paths: the two raw input files and their segmented output files.
# The original reused filename1/outfilename1 for the second pair, which was
# confusing shadowing; each pair now has its own names.
# NOTE(review): files are opened with the platform default encoding — confirm
# the .txt files match it (the stop-word file below is explicitly UTF-8).
filename1 = "101.txt"
outfilename1 = "out1.txt"
inputs1 = open(filename1, 'r')
outputs1 = open(outfilename1, 'w')

filename2 = "102.txt"
outfilename2 = "out2.txt"
inputs2 = open(filename2, 'r')
outputs2 = open(outfilename2, 'w')

# Build the stop-word list
def stopwordslist():
    """Return the stop words read from ``stopword.txt`` (one word per line).

    Blank lines become empty strings, matching the original behaviour.
    """
    # "with" guarantees the file is closed; the original leaked the handle.
    with open('stopword.txt', encoding='UTF-8') as f:
        return [line.strip() for line in f]

# Chinese word segmentation for a single sentence
def seg_depart(sentence, stopwords=None):
    """Segment *sentence* with jieba and drop stop words.

    Returns the kept tokens joined by single spaces, with a trailing space
    after the last token (kept for compatibility with the original output).
    *stopwords* may be passed explicitly so callers processing many lines
    do not re-read stopword.txt on every call (the original did).
    """
    if stopwords is None:
        stopwords = stopwordslist()
    # '\t' can survive jieba's segmentation as a token of its own; drop it.
    kept = [w for w in jieba.cut(sentence.strip())
            if w not in stopwords and w != '\t']
    # join builds the string in one pass instead of quadratic +=
    return ''.join(w + " " for w in kept)

# Segment both raw documents and write the results to out1.txt / out2.txt.
def _segment_to_file(inputs, outputs):
    """Segment every line of *inputs*, write it to *outputs*, close both files."""
    for line in inputs:
        outputs.write(seg_depart(line) + '\n')
    outputs.close()
    inputs.close()

_segment_to_file(inputs1, outputs1)
print("删除停用词和分词成功!!!")

_segment_to_file(inputs2, outputs2)

def _read_tokens(path):
    """Return the whitespace-separated tokens of the text file at *path*.

    str.split() with no argument splits on any whitespace run (spaces AND
    newlines), so the original's manual newline-stripping loop — which used
    the broken ``i is not '\\n'`` identity comparison and quadratic string
    concatenation — is unnecessary.
    """
    with open(path) as f:
        return f.read().split()

# Token list of document 1
text1 = _read_tokens("out1.txt")
print(text1)

# Token list of document 2
text2 = _read_tokens("out2.txt")
print(text2)

print("-----------------------------------创建词汇------------------------------------")
# Vocabulary: the distinct tokens of both documents. set() ordering is
# arbitrary, but every vector below is built from this one list, so the
# components of all vectors stay aligned with each other.
vocabulary = list(set(text1 + text2))
print(vocabulary)
print("-----------------------------------创建文本的向量矩阵:start---------------------------------------")
# Term-count vector of document 1 over the shared vocabulary.
# list.count already returns 0 for absent tokens, so the original's
# if/else around it was redundant.
arr1 = [text1.count(t) for t in vocabulary]
print(arr1)
# Term-count vector of document 2.
arr2 = [text2.count(t) for t in vocabulary]
print(arr2)
print("-----------------------------创建文本的向量矩阵:end------------------------------------")
print("-----------------------------TF:start------------------------------------")
# Term frequency (TF)
def compute_tf(list_words):
    """Return the TF vector for a term-count vector *list_words*.

    TF(t) = count(t) / total number of term occurrences in the document.
    The original divided by len(list_words) — the vocabulary size — which
    is not the TF definition; since that is a uniform per-document rescale,
    the final cosine similarity is unchanged by this fix.
    An all-zero or empty vector is returned as zeros to avoid division by 0.
    """
    total = sum(list_words)
    if total == 0:
        return [0.0 for _ in list_words]
    return [count / total for count in list_words]

# TF vector of document 1 (components aligned with `vocabulary`)
arr1_tf=compute_tf(arr1)
print(arr1_tf)

# TF vector of document 2
arr2_tf=compute_tf(arr2)
print(arr2_tf)
print("-----------------------------TF:end------------------------------------")

print("-----------------------------IDF:start------------------------------------")
# Document frequency: for each vocabulary term, in how many of the two
# documents it appears (0, 1 or 2).
def count_words(text1, text2, vocab=None):
    """Return a list giving, for each term of *vocab*, how many of the two
    token lists *text1*/*text2* contain it.

    *vocab* defaults to the module-level ``vocabulary`` for backward
    compatibility with the original signature.
    Bug fix: the original nested the second membership test inside the
    first, so a term occurring only in text2 was wrongly counted as
    appearing in zero documents.
    """
    if vocab is None:
        vocab = vocabulary
    counts = [0] * len(vocab)
    for i, term in enumerate(vocab):
        # The two tests are independent.
        if term in text1:
            counts[i] += 1
        if term in text2:
            counts[i] += 1
    return counts

# Document-frequency vector computed from document 1's perspective
# (how many of the two documents contain each vocabulary term)
c1=count_words(text1,text2)
print(c1)
# Document-frequency vector computed from document 2's perspective
c2=count_words(text2,text1)
print(c2)


# Inverse document frequency (IDF)
def file_idf(c1, total_docs=None):
    """Return the IDF vector for a document-frequency vector *c1*.

    IDF = log(total_docs / (df + 1)); the "+ 1" avoids division by zero for
    terms appearing in no document. *total_docs* defaults to the module-level
    ``text_sum`` (the corpus size) for backward compatibility.
    """
    if total_docs is None:
        total_docs = text_sum
    return [math.log(total_docs / (df + 1)) for df in c1]

# IDF vector for document 1's document-frequency counts
arr1_idf=file_idf(c1)
print(arr1_idf)
# IDF vector for document 2's document-frequency counts
arr2_idf=file_idf(c2)
print(arr2_idf)
print("-----------------------------IDF:end------------------------------------")

print("---------------------------------计算TF-IDF的向量矩阵:start-----------------------------------------")
# TF-IDF vector: element-wise product of the TF and IDF vectors.
def tf_idf(arr_tf, arr_idf):
    """Return the element-wise product of *arr_tf* and *arr_idf*.

    Bug fix: the original used two nested loops, producing the full outer
    product — len(tf) * len(idf) values — instead of the n-element TF-IDF
    vector, which corrupted the cosine-similarity input.
    """
    return [tf * idf for tf, idf in zip(arr_tf, arr_idf)]

# TF-IDF vector of document 1
arr1_tfidf=tf_idf(arr1_tf,arr1_idf)
print(arr1_tfidf)
# TF-IDF vector of document 2
arr2_tfidf=tf_idf(arr2_tf,arr2_idf)
print(arr2_tfidf)
print("---------------------------------计算TF-IDF的向量矩阵:end-----------------------------------------")

print("----------------------------余弦相似度--------------------------------")
# Cosine similarity
def cosine_similarity(x, y, norm=False):
    """Return the cosine similarity of equal-length numeric vectors *x* and *y*.

    Zero-vector convention (kept from the original): two zero vectors are
    deemed identical (1.0); a zero vector against a non-zero one gives 0.0.
    With ``norm=True`` the cosine is mapped from [-1, 1] into [0, 1].
    Uses np.dot / np.linalg.norm instead of the original's per-element
    Python loop building a temporary (n, 3) array.
    """
    assert len(x) == len(y), "len(x) != len(y)"
    a = np.asarray(x, dtype=float)
    b = np.asarray(y, dtype=float)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        # Equivalent to the original zero_list checks for numeric lists.
        return float(1) if norm_a == norm_b else float(0)
    cos = float(np.dot(a, b) / (norm_a * norm_b))
    return 0.5 * cos + 0.5 if norm else cos

# Cosine similarity between the two documents' TF-IDF vectors
similarity=cosine_similarity(arr1_tfidf,arr2_tfidf)
# NOTE: "{:%}" formats the value as a percentage, e.g. 0.5 -> "50.000000%"
print("这两篇文档的相似度为:{:%}".format(similarity))
print(similarity)

4. Result screenshot
(screenshot omitted in this text version)

Guess you like

Origin blog.csdn.net/yjh_SE007/article/details/108429694