Handling string matching in Python

I ran into a problem today: string matching.
The actual requirement: compare N JSON strings against a line of text for similarity, and pick the M most similar JSON strings.

The business logic is consistent with this:
I have 100 numbers here
(image) And then there is another table. (image) Please find the matching data between table 1 and table 2, along with its similarity and position.
(Losing a few data items does not matter; what matters is the similarity.)
I have been a dog for XX days.

————————————————————————

I wrote a script to locate the matches:

def get_list2(data, line=2):
    """Split *data* into consecutive, non-overlapping chunks of *line* characters.

    Newline characters are removed before chunking.  A trailing remainder
    shorter than *line* is dropped, so every returned chunk has exactly
    *line* characters.

    Args:
        data: Text to split.  Non-string values are converted with ``str()``.
        line: Chunk length in characters (default 2).

    Returns:
        A list of chunk strings.  When *data* is empty after newline removal,
        the placeholder ``'¥¥'`` is chunked instead (presumably so the caller
        still gets a key that will not match real text).

    Raises:
        ZeroDivisionError: If *line* is 0.
    """
    # Convert first: the original called .replace() before str(), so any
    # non-string input failed before the conversion could help.
    data = str(data).replace('\n', '')
    if data == '':
        data = '¥¥'
    # Integer division gives the number of complete chunks; the incomplete
    # tail is intentionally ignored.
    return [data[i * line:(i + 1) * line] for i in range(len(data) // line)]




# Read the "/>"-separated candidate strings (l1.txt) and the text they are
# compared against (l2.txt).  Both files are decoded as GBK with undecodable
# bytes replaced, and are closed even if reading fails.
with open('l1.txt', 'r', encoding='gbk', errors='replace') as file1:
    data1 = file1.read()
with open('l2.txt', 'r', encoding='gbk', errors='replace') as file2:
    data2 = file2.read()

list1 = data1.split("/>")
list2 = get_list2(data2)

# Score every candidate: the weight is the number of chunks of the comparison
# text that occur in the candidate; the matched chunks are kept for reporting.
scored = []
for part in list1:
    key_part = [key for key in list2 if key in part]
    scored.append((len(key_part), part, "".join(key_part)))

print(len(scored))
print(len(list1))

# Highest weight first.  list.sort is stable (also with reverse=True), so
# candidates with equal weight keep their original relative order, and the
# candidate, its weight and its matched text always stay together.
scored.sort(key=lambda item: item[0], reverse=True)
count = [item[0] for item in scored]
list1 = [item[1] for item in scored]
key_list = [item[2] for item in scored]

# A runner-up only exists when there are at least two candidates.
has_runner_up = len(list1) > 1

print(list1[0])
print('**************************************************************************')
if has_runner_up:
    print(list1[1])
print('--------------------------------------------------------------------------')
print("最符合的信息串为:{}\n匹配的正文为{}\n匹配权重为{}".format(list1[0], key_list[0], count[0]))
if has_runner_up:
    print('\n*******************\n')
    # Report the runner-up's own weight (the original printed count[0] here).
    print("比较符合的信息串为:{}\n匹配的正文为{}\n匹配权重为{}".format(list1[1], key_list[1], count[1]))

First, save the JSON (etc.) as Dbunit data, so that the entries lose their formatting and can be separated.
Then splice the strings together into l1.txt.
The string to be matched goes into l2.txt.
The string to be matched is then broken up into small substrings of length 2, 3, and so on, and each one is compared against every entry in the list.

After writing it, I found that although the business logic is very simple, it is quite useful.

Make do with it.



An optimized version of the lossy algorithm:

import time
import sys
# File holding the candidate strings, separated by "/>".
file1_root = r'F:\d1\getmessage\l1.txt'
# File holding the text that the candidate strings are compared against.
file2_root = r'F:\d1\getmessage\l2.txt'


def get_char(end='end', end_char=''):
    """Read lines from standard input until a terminator and return them joined.

    Reading stops when either
      * a line is exactly equal to ``str(end)`` (that line is discarded), or
      * *end_char* is non-empty and occurs in a line; the part of that line
        before the first occurrence of *end_char* is kept as the last line.

    Args:
        end: Value whose string form, entered alone on a line, ends the input.
        end_char: Optional marker that ends the input in the middle of a line.
            The empty string (default) disables this check.

    Returns:
        The collected lines joined with ``'\\n'``.  If the terminator is the
        very first line, the empty string is returned.
    """
    # Collect lines in a list instead of growing a sentinel-prefixed string:
    # the original returned the literal 'char_is_null' for empty input and
    # removed any genuine 'char_is_null\n' that the user typed.
    lines = []
    while True:
        var = input()
        if var == str(end):
            break
        if end_char != '':
            cut = var.find(end_char)
            if cut != -1:
                # Keep only the text in front of the marker, then stop.
                lines.append(var[:cut])
                break
        lines.append(var)
    return '\n'.join(lines)


def get_list2(data, line=2):
    """Split *data* into consecutive, non-overlapping chunks of *line* characters.

    Newline characters are removed before chunking.  A trailing remainder
    shorter than *line* is dropped, so every returned chunk has exactly
    *line* characters.

    Args:
        data: Text to split.  Non-string values are converted with ``str()``.
        line: Chunk length in characters (default 2).

    Returns:
        A list of chunk strings.  When *data* is empty after newline removal,
        a placeholder run of 26 asterisks is chunked instead (presumably so
        the caller still gets keys that will not match real text).

    Raises:
        ZeroDivisionError: If *line* is 0.
    """
    # Convert first: the original called .replace() before str(), so any
    # non-string input failed before the conversion could help.
    data = str(data).replace('\n', '')
    if data == '':
        data = '**************************'
    # Integer division gives the number of complete chunks; the incomplete
    # tail is intentionally ignored.
    return [data[i * line:(i + 1) * line] for i in range(len(data) // line)]


# Make sure both data files exist before the first read: mode 'a+' creates a
# missing file and leaves existing content untouched.
with open(file1_root, 'a+', encoding='gbk', errors='replace') as file1:
    pass
with open(file2_root, 'a+', encoding='gbk', errors='replace') as file2:
    pass

while True:
    # Load the data stored by the previous run and show it to the user.
    with open(file1_root, 'r', encoding='gbk', errors='replace') as file1:
        data1 = file1.read()
    with open(file2_root, 'r', encoding='gbk', errors='replace') as file2:
        data2 = file2.read()
    print("获取的字符串为:{}".format(data1))
    print("比对的字符串为:{}".format(data2))

    get_data = input("需要获取文件?Y/N\n")
    if get_data in ('Y', 'y'):
        print("以/>分隔的字符串,以end结束\n")
        data1 = get_char()
        # Only print the prompt here.  The original used input() for it, which
        # consumed (and discarded) the first line of the comparison text.
        print("比对的字符串为,以end结束:\n")
        data2 = get_char()
        # Persist the new data so the next run can reuse it.
        with open(file1_root, 'w+', encoding='gbk', errors='replace') as file1:
            file1.write(data1)
        with open(file2_root, 'w+', encoding='gbk', errors='replace') as file2:
            file2.write(data2)
    else:
        print('不获取新数据,处理历史数据\n')

    # Matching is case-insensitive.
    data1 = data1.lower()
    data2 = data2.lower()
    list1 = data1.split("/>")
    list1.append('')
    list2 = get_list2(data2)
    try:
        # Score every candidate: the weight is the number of chunks of the
        # comparison text found in it; the matched chunks are kept for the
        # report.
        scored = []
        for part in list1:
            key_part = [key for key in list2 if key in part]
            scored.append((len(key_part), part, "".join(key_part)))
        print(len(scored))
        print(len(list1))

        # Highest weight first.  Sorting the tuples keeps candidate, weight
        # and matched text together (the original bubble sort left key_list
        # unsorted).  The sort is stable, so ties keep their input order.
        scored.sort(key=lambda item: item[0], reverse=True)
        count = [item[0] for item in scored]
        list1 = [item[1] for item in scored]
        key_list = [item[2] for item in scored]

        # Report at most 100 candidates, stopping at the first one whose
        # weight is no more than half of the best weight.  The best candidate
        # (i == 0) is always reported.
        for i in range(min(100, len(count))):
            if i > 0 and count[i] <= count[0] * 0.5:
                break
            print(list1[i])
            print('**************************************************************************')
            print('--------------------------------------------------------------------------')
            print("符合的信息串为:{}\n匹配的正文为{}\n匹配权重为{}".format(list1[i], key_list[i], count[i]))
    except Exception as e:
        # Best-effort: report the problem and let the user decide whether to
        # try again instead of terminating the interactive session.
        print(e)

    var = input('点击重新操作?Y/N')
    if var not in ('Y', 'y'):
        break





Guess you like

Origin blog.csdn.net/weixin_45642669/article/details/114133660