Retrieval-based question answering system built on a power plant knowledge Q&A library (Python code)

I previously wrote a question answering system based on an inverted table:
"Construction of a power dispatching knowledge question answering system based on an inverted table".
The data required by the question answering system is already provided, and every question has a corresponding answer, so each sample can be viewed as a <question, answer> pair. The core of the system is this: when the user enters a question, the system first finds the stored question that is most similar to it, and then directly returns the corresponding answer.
Since the author studies electrical engineering, this article builds a question answering system on top of power plant knowledge texts.

This article presents a simplified version of the question answering system; the approach is not as good as the inverted-table one.

Idea
1. Preprocess the power plant knowledge question-and-answer data set (question.txt & answer.txt) into standardized data.
2. Using the bag-of-words model and the TF-IDF model, with cosine similarity as the metric, compute the text similarity between the input question and the questions in the corpus, and take the questions with the highest similarity as the set of similar questions.
3. Sort the questions in the set of similar questions and return their corresponding answers to the user.

Text data set preparation
answer.txt

question.txt

Step 1: Read the data

#第一步:读取数据

def read_corpus(file):
    """Read a text file and return its lines as a list of strings.

    The file is opened as UTF-8 and undecodable bytes are silently dropped
    (``errors='ignore'``). Every line keeps its trailing newline, exactly as
    it appears in the file.
    """
    with open(file, 'r', encoding='utf8', errors='ignore') as handle:
        return [line for line in handle]

# Load the question and answer corpora, one entry per line of each file.
# The answer functions look up `answers` by the position of the matched
# question, so both files are expected to be line-aligned.
questions = read_corpus('./问题.txt')
answers = read_corpus('./答案.txt')

# Print the first question/answer pair as a quick sanity check.
print('Example:')
print('Question',questions[0])
print('Answer',answers[0])

Step 2: Preprocessing

#第二步:预处理
import re
import jieba


def filter_out_category(input):
    """Delete every "<2-5 CJK characters>/" run from *input*.

    A run of two to five characters in the range U+4E00-U+9FA5 that is
    immediately followed by a slash is removed together with the slash
    (presumably a category label prefix - confirm against the data files).
    All other text, including punctuation, is left untouched.
    """
    category_prefix = re.compile('[\u4e00-\u9fa5]{2,5}\\/')
    return category_prefix.sub('', input)

def filter_out_punctuation(input):
    new_input = re.sub('([a-zA-Z0-9])','',input)#过滤掉字母和数字
    new_input = ''.join(e for e in new_input if e.isalnum())
    return new_input

def word_segmentation(input):
    """Segment *input* with ``jieba.cut`` and join the tokens with commas."""
    tokens = jieba.cut(input)
    return ','.join(tokens)

def preprocess_text(data):
    """Preprocess every string in *data* and return the results as a new list.

    Each item goes through three stages, in this order:
    ``filter_out_category`` -> ``filter_out_punctuation`` ->
    ``word_segmentation``.
    """
    return [
        word_segmentation(filter_out_punctuation(filter_out_category(text)))
        for text in data
    ]

# Preprocessed (filtered and segmented) version of every corpus question.
qlist = preprocess_text(questions)   # preprocessed questions
# Show the first three preprocessed questions as a sanity check.
print('questions after preprocess',qlist[0:3])

Step 3: Bag of words model and TFIDF model

#第三步:词袋模型和tf_idf模型
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#词袋模型
def bow_extractor(corpus, ngram_range=(1, 1)):
    """Fit a ``CountVectorizer`` on *corpus*.

    Args:
        corpus: iterable of documents passed to ``fit_transform``.
        ngram_range: forwarded to ``CountVectorizer``.

    Returns:
        ``(vectorizer, features)``: the fitted vectorizer and the matrix
        returned by ``fit_transform``.
    """
    # NOTE(review): with the library's default token pattern, single-character
    # tokens are presumably ignored - confirm this is intended for Chinese text.
    bow_model = CountVectorizer(ngram_range=ngram_range, min_df=1)
    doc_term_matrix = bow_model.fit_transform(corpus)
    return bow_model, doc_term_matrix


# # 词袋模型特征
def conver2BOW(data):
    """Build bag-of-words features for *data*.

    The items of *data* are copied into a fresh list and handed to
    ``bow_extractor``; its ``(vectorizer, features)`` pair is returned.
    """
    documents = list(data)
    vectorizer, features = bow_extractor(documents)
    return vectorizer, features
# Fit the bag-of-words model on the preprocessed questions.
bow_vectorizer, bow_X = conver2BOW(qlist)

# print('BOW model')
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides so the
# script runs on both old and new releases.
print('vectorizer', bow_vectorizer.get_feature_names_out().tolist()
      if hasattr(bow_vectorizer, 'get_feature_names_out')
      else bow_vectorizer.get_feature_names())
# Dense view of the first three question vectors.
print('vector of text',bow_X[0:3].toarray())

#tf_idf
def tfidf_extractor(corpus, ngram_range=(1, 1)):
    """Fit a ``TfidfVectorizer`` on *corpus*.

    The vectorizer is created with L2 normalisation, IDF weighting and IDF
    smoothing enabled, and ``min_df=1``.

    Args:
        corpus: iterable of documents passed to ``fit_transform``.
        ngram_range: forwarded to ``TfidfVectorizer``.

    Returns:
        ``(vectorizer, features)``: the fitted vectorizer and the matrix
        returned by ``fit_transform``.
    """
    tfidf_model = TfidfVectorizer(
        ngram_range=ngram_range,
        min_df=1,
        use_idf=True,
        smooth_idf=True,
        norm='l2',
    )
    weighted_matrix = tfidf_model.fit_transform(corpus)
    return tfidf_model, weighted_matrix


# # tfidf 特征
def conver2tfidf(data):
    """Build TF-IDF features for *data*.

    The items of *data* are copied into a fresh list and handed to
    ``tfidf_extractor``; its ``(vectorizer, features)`` pair is returned.
    """
    documents = list(data)
    vectorizer, features = tfidf_extractor(documents)
    return vectorizer, features
# Fit the TF-IDF model on the preprocessed questions.
tfidf_vectorizer, tfidf_X = conver2tfidf(qlist)

print('TFIDF model')
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides so the
# script runs on both old and new releases.
print('vectorizer', tfidf_vectorizer.get_feature_names_out().tolist()
      if hasattr(tfidf_vectorizer, 'get_feature_names_out')
      else tfidf_vectorizer.get_feature_names())
# Dense view of the first three question vectors.
print('vector of text',tfidf_X[0:3].toarray())

Step 4: Cosine similarity

#第四步:余弦相似度
import numpy as np
def idx_for_largest_cosine_sim(input, questions):
    """Return the index of the row in *questions* most similar to *input*.

    Similarity is the cosine between the dense query vector and each dense
    question vector. A pair whose norm product is zero (an all-zero vector on
    either side) scores 0.0. On ties the earliest row wins.

    Args:
        input: object whose ``toarray()`` returns a 2-D array; its first row
            is used as the query vector.
        questions: iterable of rows, each exposing ``toarray()``.

    Returns:
        Zero-based position of the best-matching row.

    Raises:
        ValueError: if *questions* yields no rows.
    """
    query = np.asarray(input.toarray())[0]
    # The query norm does not change between rows, so compute it once.
    query_norm = np.linalg.norm(query)

    best_idx = -1
    best_cos = None
    for idx, question in enumerate(questions):
        # Flatten the row so the dot product below is a true scalar; calling
        # float() on a 1-element array is deprecated in recent NumPy releases.
        row = np.asarray(question.toarray()).ravel()
        denom = np.linalg.norm(row) * query_norm
        if denom == 0:
            cos = 0.0
        else:
            cos = float(np.dot(row, query)) / denom
        # Strict comparison keeps the first row among equal scores.
        if best_cos is None or cos > best_cos:
            best_idx, best_cos = idx, cos

    if best_cos is None:
        raise ValueError('questions is empty; nothing to compare against')
    return best_idx

Step 5: Calculate the text similarity of the questions in the test question corpus, and find the questions with higher similarity as a set of similar questions

#第五步:问题求解
#词袋模型求解
def answer_bow(input):
    """Answer *input* using the bag-of-words model.

    The query is stripped with ``filter_out_punctuation``, segmented with
    ``word_segmentation``, vectorised by the module-level ``bow_vectorizer``
    and compared against ``bow_X``; the entry of ``answers`` at the
    best-matching position is returned.
    """
    segmented = word_segmentation(filter_out_punctuation(input))
    query_vector = bow_vectorizer.transform([segmented])
    match_idx = idx_for_largest_cosine_sim(query_vector, bow_X)
    return answers[match_idx]

#tf-idf求解
def answer_tfidf(input):
    """Answer *input* using the TF-IDF model.

    The query is stripped with ``filter_out_punctuation``, segmented with
    ``word_segmentation``, vectorised by the module-level ``tfidf_vectorizer``
    and compared against ``tfidf_X``; the entry of ``answers`` at the
    best-matching position is returned.
    """
    segmented = word_segmentation(filter_out_punctuation(input))
    query_vector = tfidf_vectorizer.transform([segmented])
    match_idx = idx_for_largest_cosine_sim(query_vector, tfidf_X)
    return answers[match_idx]

Step 6: Test

#第六步:测试
# Smoke test: run one sample query through each model and print the answer.
print('词袋 model',answer_bow("火电厂是什么"))
print('tfidf model',answer_tfidf("火电厂"))

Complete code

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Author: yudengwu(余登武)
# @Date  : 2020/12/26
#@email:[email protected]
#第一步:读取数据

def read_corpus(file):
    """Read a text file and return its lines as a list of strings.

    The file is opened as UTF-8 and undecodable bytes are silently dropped
    (``errors='ignore'``). Every line keeps its trailing newline, exactly as
    it appears in the file.
    """
    with open(file, 'r', encoding='utf8', errors='ignore') as handle:
        return [line for line in handle]

# Load the question and answer corpora, one entry per line of each file.
# The answer functions look up `answers` by the position of the matched
# question, so both files are expected to be line-aligned.
questions = read_corpus('./问题.txt')
answers = read_corpus('./答案.txt')

#第二步:预处理
import re
import jieba


def filter_out_category(input):
    """Delete every "<2-5 CJK characters>/" run from *input*.

    A run of two to five characters in the range U+4E00-U+9FA5 that is
    immediately followed by a slash is removed together with the slash
    (presumably a category label prefix - confirm against the data files).
    All other text, including punctuation, is left untouched.
    """
    category_prefix = re.compile('[\u4e00-\u9fa5]{2,5}\\/')
    return category_prefix.sub('', input)

def filter_out_punctuation(input):
    new_input = re.sub('([a-zA-Z0-9])','',input)#过滤掉字母和数字
    new_input = ''.join(e for e in new_input if e.isalnum())
    return new_input

def word_segmentation(input):
    """Segment *input* with ``jieba.cut`` and join the tokens with commas."""
    tokens = jieba.cut(input)
    return ','.join(tokens)

def preprocess_text(data):
    """Preprocess every string in *data* and return the results as a new list.

    Each item goes through three stages, in this order:
    ``filter_out_category`` -> ``filter_out_punctuation`` ->
    ``word_segmentation``.
    """
    return [
        word_segmentation(filter_out_punctuation(filter_out_category(text)))
        for text in data
    ]

# Preprocessed (filtered and segmented) version of every corpus question.
qlist = preprocess_text(questions)   # preprocessed questions


#第三步:词袋模型和tf_idf模型
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


def bow_extractor(corpus, ngram_range=(1, 1)):
    """Fit a ``CountVectorizer`` on *corpus*.

    Args:
        corpus: iterable of documents passed to ``fit_transform``.
        ngram_range: forwarded to ``CountVectorizer``.

    Returns:
        ``(vectorizer, features)``: the fitted vectorizer and the matrix
        returned by ``fit_transform``.
    """
    # NOTE(review): with the library's default token pattern, single-character
    # tokens are presumably ignored - confirm this is intended for Chinese text.
    bow_model = CountVectorizer(ngram_range=ngram_range, min_df=1)
    doc_term_matrix = bow_model.fit_transform(corpus)
    return bow_model, doc_term_matrix




# # 词袋模型特征
def conver2BOW(data):
    """Build bag-of-words features for *data*.

    The items of *data* are copied into a fresh list and handed to
    ``bow_extractor``; its ``(vectorizer, features)`` pair is returned.
    """
    documents = list(data)
    vectorizer, features = bow_extractor(documents)
    return vectorizer, features
# Fit the bag-of-words model on the preprocessed questions; both globals are
# read later by answer_bow.
bow_vectorizer, bow_X = conver2BOW(qlist)


#tf_idf
def tfidf_extractor(corpus, ngram_range=(1, 1)):
    """Fit a ``TfidfVectorizer`` on *corpus*.

    The vectorizer is created with L2 normalisation, IDF weighting and IDF
    smoothing enabled, and ``min_df=1``.

    Args:
        corpus: iterable of documents passed to ``fit_transform``.
        ngram_range: forwarded to ``TfidfVectorizer``.

    Returns:
        ``(vectorizer, features)``: the fitted vectorizer and the matrix
        returned by ``fit_transform``.
    """
    tfidf_model = TfidfVectorizer(
        ngram_range=ngram_range,
        min_df=1,
        use_idf=True,
        smooth_idf=True,
        norm='l2',
    )
    weighted_matrix = tfidf_model.fit_transform(corpus)
    return tfidf_model, weighted_matrix


# # tfidf 特征
def conver2tfidf(data):
    """Build TF-IDF features for *data*.

    The items of *data* are copied into a fresh list and handed to
    ``tfidf_extractor``; its ``(vectorizer, features)`` pair is returned.
    """
    documents = list(data)
    vectorizer, features = tfidf_extractor(documents)
    return vectorizer, features
# Fit the TF-IDF model on the preprocessed questions; both globals are read
# later by answer_tfidf.
tfidf_vectorizer, tfidf_X = conver2tfidf(qlist)


#第四步:余弦相似度
import numpy as np
def idx_for_largest_cosine_sim(input, questions):
    """Return the index of the row in *questions* most similar to *input*.

    Similarity is the cosine between the dense query vector and each dense
    question vector. A pair whose norm product is zero (an all-zero vector on
    either side) scores 0.0. On ties the earliest row wins.

    Args:
        input: object whose ``toarray()`` returns a 2-D array; its first row
            is used as the query vector.
        questions: iterable of rows, each exposing ``toarray()``.

    Returns:
        Zero-based position of the best-matching row.

    Raises:
        ValueError: if *questions* yields no rows.
    """
    query = np.asarray(input.toarray())[0]
    # The query norm does not change between rows, so compute it once.
    query_norm = np.linalg.norm(query)

    best_idx = -1
    best_cos = None
    for idx, question in enumerate(questions):
        # Flatten the row so the dot product below is a true scalar; calling
        # float() on a 1-element array is deprecated in recent NumPy releases.
        row = np.asarray(question.toarray()).ravel()
        denom = np.linalg.norm(row) * query_norm
        if denom == 0:
            cos = 0.0
        else:
            cos = float(np.dot(row, query)) / denom
        # Strict comparison keeps the first row among equal scores.
        if best_cos is None or cos > best_cos:
            best_idx, best_cos = idx, cos

    if best_cos is None:
        raise ValueError('questions is empty; nothing to compare against')
    return best_idx

#第五步:问题求解
#词袋模型求解
def answer_bow(input):
    """Answer *input* using the bag-of-words model.

    The query is stripped with ``filter_out_punctuation``, segmented with
    ``word_segmentation``, vectorised by the module-level ``bow_vectorizer``
    and compared against ``bow_X``; the entry of ``answers`` at the
    best-matching position is returned.
    """
    segmented = word_segmentation(filter_out_punctuation(input))
    query_vector = bow_vectorizer.transform([segmented])
    match_idx = idx_for_largest_cosine_sim(query_vector, bow_X)
    return answers[match_idx]

#tf-idf求解
def answer_tfidf(input):
    """Answer *input* using the TF-IDF model.

    The query is stripped with ``filter_out_punctuation``, segmented with
    ``word_segmentation``, vectorised by the module-level ``tfidf_vectorizer``
    and compared against ``tfidf_X``; the entry of ``answers`` at the
    best-matching position is returned.
    """
    segmented = word_segmentation(filter_out_punctuation(input))
    query_vector = tfidf_vectorizer.transform([segmented])
    match_idx = idx_for_largest_cosine_sim(query_vector, tfidf_X)
    return answers[match_idx]

#第六步:测试
# Smoke test: run one sample query through each model and print the answer.
print('词袋 model',answer_bow("火电厂是什么"))
print('tfidf model',answer_tfidf("火电厂"))

Summary
This approach is not as good as the inverted-table approach, which is faster.
Please read the other article when you have time.
The link is as follows:

Construction of power dispatching knowledge question answering system based on inverted table

Insert picture description here
Author: Electricity - Yu Dengwu
Insert picture description here

Guess you like

Origin blog.csdn.net/kobeyu652453/article/details/111747281