自然言語処理 - TF-IDFとコサイン類似度に基づいてキーワード検索エンジンを構築します（3つのドキュメントを処理します）

キーワード検索エンジン：検索クエリ自体を1つのドキュメントと見なし、TF-IDFに基づくベクトル表現を求めます。次に、クエリとのコサイン類似度が最も高いベクトルを持つドキュメントを見つけ、それらを検索結果として返します。

私たちのコーパスは3つのドキュメントで構成されています。次のコードに示すように、クエリは「How long does it take to get to the store?」（店に着くまでにどのくらい時間がかかりますか？）です。

from collections import OrderedDict
import copy
from collections import Counter
from nltk.tokenize import TreebankWordTokenizer
import nltk
import math

# Compute cosine similarity.
# The inputs are term-weight vectors (dicts mapping token -> weight).
def cosine_sim(vec1, vec2):
    """Return the cosine similarity of two term-weight vectors.

    Args:
        vec1: Mapping of token -> numeric weight.
        vec2: Mapping of token -> numeric weight. Values are paired with
            those of ``vec1`` by position, so both mappings must list the
            same tokens in the same order.

    Returns:
        The dot product of the two value sequences divided by the product
        of their Euclidean norms, or 0.0 when that product is zero (at
        least one vector has no non-zero weight).

    Raises:
        IndexError: If ``vec2`` has fewer entries than ``vec1``.
    """
    # Convert the dictionaries to lists so values can be matched by position.
    vec1 = list(vec1.values())
    vec2 = list(vec2.values())

    # Index into vec2 (rather than zip) so a too-short vec2 still fails
    # loudly instead of being silently truncated.
    dot_prod = sum(v * vec2[i] for i, v in enumerate(vec1))

    mag_1 = math.sqrt(sum(x ** 2 for x in vec1))
    mag_2 = math.sqrt(sum(x ** 2 for x in vec2))

    denominator = mag_1 * mag_2
    if denominator == 0:
        # A zero vector has no direction; treat it as sharing nothing with
        # the other vector instead of dividing by zero.
        return 0.0

    return dot_prod / denominator

# Data: the three-document corpus.
docs = [
    "The faster Harry got to the store, the faster and faster Harry would get home.",
    "Harry is hairy and faster than Jill.",
    "Jill is not as hairy as Harry.",
]

# Tokenize every document, then take the union of all tokens as the lexicon.
tokenizer = TreebankWordTokenizer()
# One sorted token list per document (lower-cased before tokenizing).
doc_tokens = [sorted(tokenizer.tokenize(text.lower())) for text in docs]
print(len(doc_tokens[0]))
# Flatten the per-document lists into a single list of tokens.
all_doc_tokens = [token for token_list in doc_tokens for token in token_list]
print(len(all_doc_tokens))
# Distinct tokens in sorted order; this fixes the dimension order of every vector.
lexicon = sorted(set(all_doc_tokens))
print(len(lexicon))
print(lexicon)

# Template vector: one zero-valued entry per lexicon token, in lexicon order.
zero_vector = OrderedDict.fromkeys(lexicon, 0)
print(zero_vector)

# Build the term-frequency (TF) vector of every document.
# copy.copy() gives each document its own copy of the zero vector; without
# it, every iteration would write into the one shared zero_vector instead
# of starting from fresh zeros.
doc_vectors = []
for doc in docs:
    vec = copy.copy(zero_vector)
    tokens = tokenizer.tokenize(doc.lower())
    token_counts = Counter(tokens)
    for key, value in token_counts.items():
        # Normalize by the document's own token count, not by the size of
        # the lexicon, so TF is the share of the document taken by the term.
        vec[key] = value / len(tokens)
    doc_vectors.append(vec)

print(doc_vectors)

# In every document vector, replace each term's TF with its TF-IDF so that
# the vector reflects the meaning or topic of the document more fully.
#
# Document frequency is counted on whole, lower-cased tokens. A substring
# test against the raw text would miss "harry" in "Harry" (case) and would
# wrongly find "as" inside "faster".
tokenized_docs = [tokenizer.tokenize(doc.lower()) for doc in docs]
document_frequency = Counter(
    token for doc_token_list in tokenized_docs for token in set(doc_token_list)
)

document_tfidf_vectors = []
for tokens in tokenized_docs:
    vec = copy.copy(zero_vector)
    token_counts = Counter(tokens)

    for key, value in token_counts.items():
        # Every key comes from a document, so its document frequency is >= 1.
        docs_containing_key = document_frequency[key]
        # TF: share of this document's tokens taken by the term.
        tf = value / len(tokens)
        # IDF here is the plain ratio N / df (no logarithm is applied).
        idf = len(docs) / docs_containing_key
        vec[key] = tf * idf
    document_tfidf_vectors.append(vec)

# Basic TF-IDF search:
# treat the search query itself as a document and build its TF-IDF vector,
# then compare it with every document vector by cosine similarity; the
# documents with the highest similarity are the search results.
query = "How long does it take to get to the store?"
# copy.copy() makes sure we work on an independent object rather than on
# another reference to the shared zero_vector.
query_vec = copy.copy(zero_vector)
documents = docs

tokens = tokenizer.tokenize(query.lower())
token_counts = Counter(tokens)

# Token sets of the corpus, so document frequency is counted on whole
# lower-cased tokens. A substring test could match a query term that is not
# in the lexicon, which would add an extra key to query_vec and misalign it
# with the document vectors.
document_token_sets = [
    set(tokenizer.tokenize(_doc.lower())) for _doc in documents
]

for key, value in token_counts.items():
    docs_containing_key = sum(
        1 for token_set in document_token_sets if key in token_set
    )

    if docs_containing_key == 0:
        # The term does not occur in the corpus: it has no dimension in the
        # vector space and no defined IDF, so skip it.
        continue
    tf = value / len(tokens)
    # IDF here is the plain ratio N / df (no logarithm is applied).
    idf = len(documents) / docs_containing_key
    query_vec[key] = tf * idf

# Cosine similarity of the query against each document, in corpus order.
# For this query, document 0 is the most relevant.
for document_vec in document_tfidf_vectors:
    print(cosine_sim(query_vec, document_vec))

自然言語処理-tf-idfとコサインの類似性に基づいてキーワード検索エンジンを構築します（3つのドキュメントを処理します）

おすすめ