Python sensitive word filtering with the DFA algorithm, plus a free sensitive word library

For an introduction to the DFA approach, see: https://blog.csdn.net/chenssy/article/details/26961957

This article is a complete, modified Python version of the Java sensitive word filter described in the article above.

First, let's take a look at the final result.
Example 1:
Input string: [screenshot]
Processing result: [screenshot]
Core code:

The SensitiveFilter class

Its overall skeleton is as follows:

class SensitiveFilter:

    # Initialization
    def __init__(self):
        self.initSensitiveWordMap(self.sensitiveWordList)
        ...

    # Build the sensitive word map from the word list
    def initSensitiveWordMap(self, sensitiveWordList):
        ...

    # Detect the sensitive word (if any) starting at a given index of the text
    def checkSensitiveWord(self, txt, beginIndex=0):
        ...

    # Get the list of sensitive words contained in the input text
    def getSensitiveWord(self, txt):
        ...

    # Replace the sensitive words in the text
    def replaceSensitiveWord(self, txt, replaceChar='*'):
        ...

Now let's look at each function in detail.
Part 1

The __init__ function: initialization

    def __init__(self):
        # Load the sensitive word list, one word per line.
        with open(Dir_sensitive, 'r', encoding='ANSI') as file:
            self.sensitiveWordList = [line.rstrip('\n') for line in file]
        # print(self.sensitiveWordList[:10])

        # Load the stop word list (noise characters that may be skipped).
        with open(Dir_stopWord, 'r', encoding='ANSI') as file1:
            self.stopWordList = [line.rstrip('\n') for line in file1]

        # Build the sensitive word map (a nested-dict trie).
        self.sensitiveWordMap = self.initSensitiveWordMap(self.sensitiveWordList)

Part 2

The initSensitiveWordMap function builds the sensitive word map: a trie stored as nested dicts, where each node maps a character to the next level and an isEnd flag marks where a complete word ends.

    # Build the sensitive word map
    def initSensitiveWordMap(self, sensitiveWordList):
        sensitiveWordMap = {}
        # Each entry in the list is one sensitive word.
        for word in sensitiveWordList:
            nowMap = sensitiveWordMap
            # Walk through every character of the word.
            for i in range(len(word)):
                keychar = word[i]
                wordMap = nowMap.get(keychar)
                if wordMap is not None:
                    # The prefix already exists: descend one level.
                    nowMap = wordMap
                else:
                    # Otherwise create a new node; isEnd starts at 0 because
                    # this character is not (yet) the end of a word.
                    newNextMap = {"isEnd": 0}
                    nowMap[keychar] = newNextMap
                    nowMap = newNextMap
                # Mark the node that ends this word.
                if i == len(word) - 1:
                    nowMap["isEnd"] = 1
        # print(sensitiveWordMap)
        return sensitiveWordMap
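
To make the nested-dict structure concrete, here is a standalone sketch of the same trie-building logic. The two words "abc" and "abd" are made-up placeholders, not entries from the real word list:

def build_map(words):
    # Same nesting scheme as initSensitiveWordMap above.
    root = {}
    for word in words:
        node = root
        for i, ch in enumerate(word):
            if ch not in node:
                node[ch] = {"isEnd": 0}
            node = node[ch]
            if i == len(word) - 1:
                node["isEnd"] = 1
    return root

print(build_map(["abc", "abd"]))
# {'a': {'isEnd': 0, 'b': {'isEnd': 0, 'c': {'isEnd': 1}, 'd': {'isEnd': 1}}}}

Words that share a prefix share the same path from the root, which is what lets the matcher run in a single pass over the text.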

Part 3

The checkSensitiveWord function scans the text from a given index and returns the length of the sensitive word that starts there (counting any skipped stop characters), or 0 if there is no match.

    def checkSensitiveWord(self, txt, beginIndex=0):
        '''
        :param txt: the text to scan
        :param beginIndex: index in txt where matching starts
        :return: length of the matched sensitive word, counting skipped
                 stop characters; 0 if no sensitive word starts here
        '''
        nowMap = self.sensitiveWordMap
        containChar_sensitiveWordLen = 0  # characters consumed, stop chars included
        matchedLen = 0  # length of the longest completed match so far

        for i in range(beginIndex, len(txt)):
            char = txt[i]
            # Stop characters are skipped but still counted toward the span.
            if char in self.stopWordList:
                containChar_sensitiveWordLen += 1
                continue

            nowMap = nowMap.get(char)
            if nowMap is not None:
                containChar_sensitiveWordLen += 1
                # A complete word ends here; record its span so that trailing
                # stop characters or partial matches are not included.
                if nowMap.get("isEnd") == 1:
                    matchedLen = containChar_sensitiveWordLen
            else:
                break
        # print(matchedLen)
        return matchedLen
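
A quick way to see the stop-character skipping in action is to override the loaded lists with toy values. The word "abc" and the space stop character below are assumptions for the demo, and the two word files are assumed to exist at the configured paths:

f = SensitiveFilter()
f.sensitiveWordMap = f.initSensitiveWordMap(["abc"])  # toy word list
f.stopWordList = [" "]  # treat spaces as skippable noise

# "a b c" still matches "abc"; the returned length 5 covers the spaces too.
print(f.checkSensitiveWord("xxa b cyy", 2))  # -> 5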

Part 4

The getSensitiveWord function returns the list of sensitive words found in the input text.

    def getSensitiveWord(self, txt):
        cur_txt_sensitiveList = []
        # Note: the text is not scanned strictly char by char; once a
        # sensitive word is found, the scan jumps past it. Reassigning the
        # loop variable of a `for` loop has no effect in Python, so a
        # `while` loop is used to do the jump.
        i = 0
        while i < len(txt):
            length = self.checkSensitiveWord(txt, i)
            if length > 0:
                cur_txt_sensitiveList.append(txt[i:i + length])
                i += length  # resume right after the matched span
            else:
                i += 1

        return cur_txt_sensitiveList
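
Continuing the toy setup from Part 3, extraction behaves as expected:

print(f.getSensitiveWord("xxa b cyy,abc"))  # -> ['a b c', 'abc']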

Part 5

The replaceSensitiveWord function replaces every sensitive word with a run of mask characters of the same length.

    def replaceSensitiveWord(self, txt, replaceChar='*'):
        Lst = self.getSensitiveWord(txt)
        # print(Lst)
        for word in Lst:
            # Mask each hit with the same number of characters it occupies.
            replaceStr = len(word) * replaceChar
            txt = txt.replace(word, replaceStr)

        return txt
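
And replacement, again with the toy word list from Part 3:

print(f.replaceSensitiveWord("xxa b cyy,abc"))  # -> 'xx*****yy,***'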

Sensitive words and stop words can be customized; each file is plain text with one entry per line.
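
For example, 敏感词.txt might contain (placeholder entries, not the real list):

word1
word2
word3

and 停用词.txt holds one noise character per line (a space, a comma, '!', and so on), each of which the matcher is allowed to skip inside a sensitive word.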

Complete code

#!/usr/bin/env python
#-*- coding:utf-8 -*-
# author:zbt
# datetime:2020-03-16 11:53
# software: PyCharm
# Paths to the word files; change these to your own locations.
Dir_sensitive='C:\\Users\\zbt\\Desktop\\X实习\\敏感词【ing】\\敏感词.txt'
Dir_stopWord='C:\\Users\\zbt\\Desktop\\X实习\\敏感词【ing】\\停用词.txt'
class SensitiveFilter:
    def __init__(self):
        # Load the sensitive word list, one word per line.
        with open(Dir_sensitive, 'r', encoding='ANSI') as file:
            self.sensitiveWordList = [line.rstrip('\n') for line in file]
        # print(self.sensitiveWordList[:10])
        # >> ['1234', '12345', '123456', '甲基麻黄碱', '来曲唑', '依西美坦', '阿那曲唑', '螺内酯', '沙美特罗', '丙磺舒']

        # Load the stop word list (noise characters that may be skipped).
        with open(Dir_stopWord, 'r', encoding='ANSI') as file1:
            self.stopWordList = [line.rstrip('\n') for line in file1]

        # Build the sensitive word map (a nested-dict trie).
        self.sensitiveWordMap = self.initSensitiveWordMap(self.sensitiveWordList)
    # Build the sensitive word map
    def initSensitiveWordMap(self, sensitiveWordList):
        sensitiveWordMap = {}
        # Each entry in the list is one sensitive word.
        for word in sensitiveWordList:
            nowMap = sensitiveWordMap
            # Walk through every character of the word.
            for i in range(len(word)):
                keychar = word[i]
                wordMap = nowMap.get(keychar)
                if wordMap is not None:
                    # The prefix already exists: descend one level.
                    nowMap = wordMap
                else:
                    # Otherwise create a new node; isEnd starts at 0 because
                    # this character is not (yet) the end of a word.
                    newNextMap = {"isEnd": 0}
                    nowMap[keychar] = newNextMap
                    nowMap = newNextMap
                # Mark the node that ends this word.
                if i == len(word) - 1:
                    nowMap["isEnd"] = 1
        # print(sensitiveWordMap)
        return sensitiveWordMap

    def checkSensitiveWord(self, txt, beginIndex=0):
        '''
        :param txt: the text to scan
        :param beginIndex: index in txt where matching starts
        :return: length of the matched sensitive word, counting skipped
                 stop characters; 0 if no sensitive word starts here
        '''
        nowMap = self.sensitiveWordMap
        containChar_sensitiveWordLen = 0  # characters consumed, stop chars included
        matchedLen = 0  # length of the longest completed match so far

        for i in range(beginIndex, len(txt)):
            char = txt[i]
            # Stop characters are skipped but still counted toward the span.
            if char in self.stopWordList:
                containChar_sensitiveWordLen += 1
                continue

            nowMap = nowMap.get(char)
            if nowMap is not None:
                containChar_sensitiveWordLen += 1
                # A complete word ends here; record its span so that trailing
                # stop characters or partial matches are not included.
                if nowMap.get("isEnd") == 1:
                    matchedLen = containChar_sensitiveWordLen
            else:
                break
        # print(matchedLen)
        return matchedLen

    def getSensitiveWord(self, txt):
        cur_txt_sensitiveList = []
        # The scan jumps past each matched word; a `while` loop is used
        # because reassigning a `for` loop variable has no effect in Python.
        i = 0
        while i < len(txt):
            length = self.checkSensitiveWord(txt, i)
            if length > 0:
                cur_txt_sensitiveList.append(txt[i:i + length])
                i += length  # resume right after the matched span
            else:
                i += 1

        return cur_txt_sensitiveList

    def replaceSensitiveWord(self, txt, replaceChar='*'):
        Lst = self.getSensitiveWord(txt)
        # print(Lst)
        for word in Lst:
            # Mask each hit with the same number of characters it occupies.
            replaceStr = len(word) * replaceChar
            txt = txt.replace(word, replaceStr)

        return txt

if __name__ == "__main__":
    str="blablablabla"
    Filter=SensitiveFilter()
    replaceStr=Filter.replaceSensitiveWord(str)
    print(replaceStr)

Finally, the sensitive word and stop word files are included for free:
https://pan.baidu.com/s/1AftA45Zdz2_AtVJEuI5jHA
extraction code: b0rs
