Python 优化大列表遍历速度(列表转为查找树存储)

问题描述

问题描述

有一个列表:A,大约15W行。有一个字符串:B。

现在想找出列表A中所有属于字符串B的子串的元素。

普通方法:遍历这个列表A,判断每个元素是否在这个字符串B中

问题:想要提升效率(A的存储类型可以改)。

比如:

A:['小王王', '王王', '大王王', '大王', '中王王']   
B: "我爱小王王"
结果: '小王王', '王王'

将列表存储改为查找树存储

# Non-leaf node
def addDictTree(inDict, c):
    """Return the child node of ``inDict`` keyed by ``c``, creating it if absent.

    Args:
        inDict: The current tree node (a dict).
        c: The key to descend into (one character of a word when called
            from buildSearchTree).

    Returns:
        The value stored at ``inDict[c]``. When ``c`` is not yet a key, a new
        empty dict is stored there first and returned.
    """
    return inDict.setdefault(c, {})

# Leaf marker, stored as "end": [w, type_w, level]
def addDictEnd(inDict, w, type_w, level):
    """Mark the node ``inDict`` as the end of a word.

    Stores ``[w, type_w, level]`` under the key "end", replacing any value
    already there.

    Args:
        inDict: The tree node reached after the word's last character.
        w: The complete word.
        type_w: The word's type tag.
        level: The word's level.

    Returns:
        None.
    """
    inDict.update({"end": [w, type_w, level]})

# Build the dictionary search tree
def buildSearchTree(dbDict, dictTree):
    """Insert every word of ``dbDict`` into the search tree ``dictTree``.

    Each word becomes a chain of nested dicts, one level per character, and
    the node reached after the last character gets an "end" entry holding
    ``[w, type_w, level]``.

    Args:
        dbDict: Iterable of ``(w, type_w, level)`` triples.
        dictTree: The root dict to insert into; it is modified in place.

    Returns:
        The same ``dictTree`` object that was passed in.
    """
    for w, type_w, level in dbDict:
        if not w:
            # An empty word has no characters to build a path from; skip it
            # rather than fail or mark the root itself as a word end.
            continue
        # Walk from the root, creating one level per character.
        node = dictTree
        for ch in w:
            node = addDictTree(node, ch)
        addDictEnd(node, w, type_w, level)
    return dictTree

查找树搜索

# Look up one character in a tree node
def searchNode(inDict, c):
    """Follow the branch for ``c`` from the node ``inDict``.

    Args:
        inDict: The current tree node (a dict).
        c: The key to look up.

    Returns:
        A ``(status, node)`` pair:
        * ``("END", child)`` when the child exists and contains an "end" key,
        * ``("NODE", child)`` when the child exists without an "end" key,
        * ``("SAFE", None)`` when ``inDict`` has no branch for ``c``.
    """
    if c not in inDict:
        # No such branch.
        return "SAFE", None
    child = inDict[c]
    # A child carrying "end" reports END even if it also has further branches.
    status = "END" if "end" in child else "NODE"
    return status, child

# Collect the values stored under a given key anywhere in a nested dict
def get_dict_value(in_dict, target_key, results=None, not_d=True):
    """Recursively gather the values stored under ``target_key``.

    Args:
        in_dict: Dict to search; every value that is itself a dict is
            searched as well.
        target_key: Key whose values are collected.
        results: Optional list to append matches to. A fresh list is created
            when omitted, so separate calls never share accumulated matches.
        not_d: When True (the default) only non-dict values are collected;
            when False only dict values are collected.

    Returns:
        The ``results`` list with the matches appended. A nested dict is
        searched before its own key is tested, so deeper matches come first.
    """
    if results is None:
        results = []
    for key, data in in_dict.items():
        is_nested = isinstance(data, dict)
        if is_nested:
            get_dict_value(data, target_key, results=results, not_d=not_d)
        # Keep the value only when its "is a dict" status matches the filter.
        if key == target_key and is_nested != not_d:
            results.append(data)
    return results

def searchStr(inStr):
    """Find the dictionary words that occur in ``inStr``.

    Reads the module-level search tree ``dictTree``.

    Args:
        inStr: The text to scan.

    Returns:
        A 4-tuple. When at least one word is found:
            ``("BLOCK", keywords, types, score)``
        where ``keywords`` and ``types`` are parallel lists and ``score`` is
        the sum of the matched levels, replaced by 99 when that sum is not
        below 100. When nothing is found:
            ``("SAFE", None, None, None)``
    """
    matched = {}
    length = len(inStr)
    for start in range(length):
        # Walk the tree from the root, one character at a time, until the
        # path dies out, a word end is reached, or the text is exhausted.
        rst, node = searchNode(dictTree, inStr[start])
        offset = 1
        while rst == "NODE" and start + offset < length:
            rst, node = searchNode(node, inStr[start + offset])
            offset += 1
        if rst == "END":
            matched[node['end'][0]] = node

    if not matched:
        return "SAFE", None, None, None

    # The walk stops at the first word end on a path, so longer words sharing
    # that prefix are picked up by collecting every "end" entry beneath the
    # stopping node and keeping only those that really occur in the text.
    keywords, types, levels = [], [], []
    for entry in get_dict_value(matched, 'end', results=[]):
        if entry[0] in inStr:
            keywords.append(entry[0])
            types.append(entry[1])
            levels.append(entry[2])

    total = sum(levels)
    score = total if total < 100 else 99
    return "BLOCK", keywords, types, score

应用

# Forbidden-word filter
def check_txtblock(title):
    """Run searchStr on ``title`` and wrap the outcome in a response dict.

    Args:
        title: The text to check.

    Returns:
        ``{'code': 0}`` when searchStr reports "SAFE"; otherwise
        ``{'code': 2, 'message': {'keywords': ..., 'type': ..., 'score': ...}}``
        built from the values searchStr returned.
    """
    code, keywords, types, score = searchStr(title)
    if code != "SAFE":
        message = {'keywords': keywords, 'type': types, 'score': score}
        return {'code': 2, 'message': message}
    return {'code': 0}
       

# Forbidden-word demo: each row is (word, type, level)
dbDict = (
    ('XXX', 'title', 80),
    ('DDD', 'title', 70),
    ('AAAA', 'title', 80),
    ('WEWEWEWEWE', 'title', 95),
    ('小王王', 'title', 100),
    ('王王', 'title', 99),
    ('小王王万岁', 'title', 99),
    ('大王王万万岁', 'title', 99),
)

# Module-level search tree; searchStr reads this name.
dictTree = buildSearchTree(dbDict, {})

print(check_txtblock('我爱小王王'))
print(check_txtblock('XXXPPPPPPP'))
{'code': 2, 'message': {'keywords': ['小王王', '王王'], 'type': ['title', 'title'], 'score': 99}}
{'code': 2, 'message': {'keywords': ['XXX'], 'type': ['title'], 'score': 80}}

Guess you like

Origin blog.csdn.net/qq_42363032/article/details/121167655