文章目录
问题描述
问题描述:
有一个列表:A,大约15W行。有一个字符串:B。
现在想找出列表A中所有属于字符串B的子串的元素。
普通方法:遍历这个列表A,判断每个元素是否在这个字符串B中
问题:想要提升查找效率(A的存储类型可以修改)。
比如:
A:['小王王', '王王', '大王王', '大王', '中王王']
B: "我爱小王王"
结果: '小王王', '王王'
将列表存储改为查找树存储
# Non-leaf node
def addDictTree(inDict, c):
    """Return the child dict stored under key ``c`` in ``inDict``.

    An empty dict is inserted under ``c`` first when the key is absent, so
    the returned object is always the one held by ``inDict``.
    """
    return inDict.setdefault(c, {})
# Leaf marker, stored as "end": [w, type_w, level]
def addDictEnd(inDict, w, type_w, level):
    """Record ``[w, type_w, level]`` under the "end" key of ``inDict``.

    The dict is modified in place; the function returns None.
    """
    payload = [w, type_w, level]
    inDict["end"] = payload
# Build the dictionary search tree
def buildSearchTree(dbDict, dictTree):
    """Insert every keyword of ``dbDict`` into the nested-dict tree ``dictTree``.

    Each character of a keyword becomes one level of nesting (created through
    ``addDictTree``); the node reached by the last character receives an
    "end" entry (written by ``addDictEnd``).

    Args:
        dbDict: iterable of ``(w, type_w, level)`` triples, where ``w`` is the
            keyword string.
        dictTree: dict that is filled in place.

    Returns:
        The same ``dictTree`` object that was passed in.
    """
    for w, type_w, level in dbDict:
        if not w:
            # An empty keyword has no first character to branch on; skip it
            # instead of failing on w[0] or marking the root itself as a match.
            continue
        node = dictTree
        # Walk (and create where needed) one level per character, root first.
        for ch in w:
            node = addDictTree(node, ch)
        addDictEnd(node, w, type_w, level)
    return dictTree
查找树搜索
# Look up a single character in a tree node
def searchNode(inDict, c):
    """Classify the child of ``inDict`` reached through character ``c``.

    Args:
        inDict: current tree node (a dict keyed by characters).
        c: the character to follow.

    Returns:
        ``("END", child)`` when the child holds an "end" entry,
        ``("NODE", child)`` when the child exists without one,
        ``("SAFE", None)`` when ``inDict`` has no key ``c``.
    """
    if c not in inDict:
        # No such branch.
        return "SAFE", None
    child = inDict[c]
    if "end" in child:
        # A keyword terminates at this node (it may still have children).
        return "END", child
    return "NODE", child
# Collect the values stored under a given key anywhere in a nested dict
def get_dict_value(in_dict, target_key, results=None, not_d=True):
    """Recursively gather the values found under ``target_key``.

    Args:
        in_dict: dict to scan; dict values are scanned recursively.
        target_key: the key whose values are collected.
        results: list the matches are appended to. A fresh list is created
            when omitted, so separate calls never share state.
        not_d: when True (default) only values that are NOT dicts are
            collected; when False only dict values are collected.

    Returns:
        The ``results`` list. Matches inside a nested dict are appended
        before a match on the key that holds that nested dict.
    """
    if results is None:
        results = []
    for key, data in in_dict.items():
        is_dict = isinstance(data, dict)
        if is_dict:
            # Descend first so nested matches land in the same list.
            get_dict_value(data, target_key, results=results, not_d=not_d)
        if key == target_key and is_dict != not_d:
            results.append(data)
    return results
def searchStr(inStr):
    """Find the keywords of the module-level ``dictTree`` that occur in ``inStr``.

    Args:
        inStr: the text to scan.

    Returns:
        A 4-tuple.
        On a hit: ``("BLOCK", keywords, types, score)``, where ``keywords``
        and ``types`` are parallel lists and ``score`` is the sum of the
        matched levels, replaced by 99 when that sum is 100 or more, e.g.
        ``("BLOCK", ['小王王', '王王'], ['title', 'title'], 99)``.
        Without a hit: ``("SAFE", None, None, None)``.
    """
    # Tree nodes at which a keyword ended, keyed by that keyword.
    matched = {}
    length = len(inStr)
    for start in range(length):
        # Walk down the tree from the root (global lookup tree), one
        # character at a time.
        rst, node = searchNode(dictTree, inStr[start])
        offset = 1
        # Stop at the first "END" / "SAFE" or when the text runs out.
        while rst == "NODE" and start + offset < length:
            rst, node = searchNode(node, inStr[start + offset])
            offset += 1
        if rst == "END":
            matched[node['end'][0]] = node
    if not matched:
        return "SAFE", None, None, None

    keywords, types, levels = [], [], []
    # The walk stops at the first keyword on a path, but the stored node still
    # contains longer keywords sharing that prefix. Collect every "end" entry
    # below it and keep only the ones that really occur in the text.
    for entry in get_dict_value(matched, 'end', results=[]):
        if entry[0] in inStr:
            keywords.append(entry[0])
            types.append(entry[1])
            levels.append(entry[2])
    total = sum(levels)
    sorce = total if total < 100 else 99
    return "BLOCK", keywords, types, sorce
应用
# Banned-word filter
def check_txtblock(title):
    """Run ``searchStr`` on ``title`` and wrap its result in a response dict.

    Returns ``{'code': 0}`` when ``searchStr`` reports "SAFE"; otherwise
    ``{'code': 2, 'message': {...}}`` carrying the keywords, types and score
    returned by ``searchStr``.
    """
    code, keywords, types, sorce = searchStr(title)
    if code != "SAFE":
        message = {'keywords': keywords, 'type': types, 'score': sorce}
        return {'code': 2, 'message': message}
    return {'code': 0}
# Banned-word demo data: (keyword, type, level) triples
dbDict = (
    ('XXX', 'title', 80),
    ('DDD', 'title', 70),
    ('AAAA', 'title', 80),
    ('WEWEWEWEWE', 'title', 95),
    ('小王王', 'title', 100),
    ('王王', 'title', 99),
    ('小王王万岁', 'title', 99),
    ('大王王万万岁', 'title', 99),
)
# Module-level lookup tree; searchStr reads it through this global name.
dictTree = buildSearchTree(dbDict, {})
print(check_txtblock('我爱小王王'))
print(check_txtblock('XXXPPPPPPP'))
{'code': 2, 'message': {'keywords': ['小王王', '王王'], 'type': ['title', 'title'], 'score': 99}}
{'code': 2, 'message': {'keywords': ['XXX'], 'type': ['title'], 'score': 80}}