JAVA敏感词过滤

JAVA敏感词过滤

一、初始化敏感词库

  1 import java.io.BufferedReader;
  2 import java.io.File;
  3 import java.io.FileInputStream;
  4 import java.io.InputStreamReader;
  5 import java.util.HashMap;
  6 import java.util.HashSet;
  7 import java.util.Iterator;
  8 import java.util.Map;
  9 import java.util.Set;
 10 
 11 /**
 12  * 初始化敏感词库,将敏感词加入到HashMap中,构建DFA算法模型
 13  */
 14 public class SensitiveWordInit {
 15     private String ENCODING = "utf-8";    //字符编码
 16     public HashMap sensitiveWordMap;
 17     public SensitiveWordInit(){
 18         super();
 19     }
 20 
 21     /**
 22      * 初始化
 23      */
 24     public Map initKeyWord(){
 25         try {
 26             //读取敏感词库
 27             Set<String> keyWordSet = readSensitiveWordFile();
 28             //将敏感词库加入到HashMap中
 29             addSensitiveWordToHashMap(keyWordSet);
 30             //spring获取application,然后application.setAttribute("sensitiveWordMap",sensitiveWordMap);
 31         } catch (Exception e) {
 32             e.printStackTrace();
 33         }
 34         return sensitiveWordMap;
 35     }
 36 
 37     /**
 38      * 读取敏感词库,将敏感词放入HashSet中,构建一个DFA算法模型:<br>
 39      * 中 = {
 40      *      isEnd = 0
 41      *      国 = {<br>
 42      *           isEnd = 1
 43      *           人 = {isEnd = 0
 44      *                民 = {isEnd = 1}
 45      *                }
 46      *           男  = {
 47      *                  isEnd = 0
 48      *                   人 = {
 49      *                        isEnd = 1
 50      *                       }
 51      *               }
 52      *           }
 53      *      }
 54      *  五 = {
 55      *      isEnd = 0
 56      *      星 = {
 57      *          isEnd = 0
 58      *          红 = {
 59      *              isEnd = 0
 60      *              旗 = {
 61      *                   isEnd = 1
 62      *                  }
 63      *              }
 64      *          }
 65      *      }
 66      */
 67     private void addSensitiveWordToHashMap(Set<String> keyWordSet) {
 68         sensitiveWordMap = new HashMap(keyWordSet.size());     //初始化敏感词容器,减少扩容操作
 69         String key = null;
 70         Map nowMap = null;
 71         Map<String, String> newWorMap = null;
 72         //迭代keyWordSet
 73         Iterator<String> iterator = keyWordSet.iterator();
 74         while(iterator.hasNext()){
 75             key = iterator.next();    //关键字
 76             nowMap = sensitiveWordMap;
 77             for(int i = 0 ; i < key.length() ; i++){
 78                 char keyChar = key.charAt(i);       //转换成char型
 79                 Object wordMap = nowMap.get(keyChar);       //获取
 80 
 81                 if(wordMap != null){        //如果存在该key,直接赋值
 82                     nowMap = (Map) wordMap;
 83                 }
 84                 else{     //不存在则,则构建一个map,同时将isEnd设置为0,因为他不是最后一个
 85                     newWorMap = new HashMap<String,String>();
 86                     newWorMap.put("isEnd", "0");     //不是最后一个
 87                     nowMap.put(keyChar, newWorMap);
 88                     nowMap = newWorMap;
 89                 }
 90 
 91                 if(i == key.length() - 1){
 92                     nowMap.put("isEnd", "1");    //最后一个
 93                 }
 94             }
 95         }
 96     }
 97 
 98     /**
 99      * 读取敏感词库中的内容,将内容添加到set集合中
100      */
101     @SuppressWarnings("resource")
102     private Set<String> readSensitiveWordFile() throws Exception{
103         Set<String> set = null;
104         //https://github.com/heqiyoujing/config_file 词库地址
105         File file = new File("D:\\SensitiveWord.txt");    //读取文件
106         InputStreamReader read = new InputStreamReader(new FileInputStream(file),ENCODING);
107         try {
108             if(file.isFile() && file.exists()){      //文件流是否存在
109                 set = new HashSet<String>();
110                 BufferedReader bufferedReader = new BufferedReader(read);
111                 String txt = null;
112                 while((txt = bufferedReader.readLine()) != null){    //读取文件,将文件内容放入到set中
113                     set.add(txt);
114                 }
115             }
116             else{         //不存在抛出异常信息
117                 throw new Exception("敏感词库文件不存在");
118             }
119         } catch (Exception e) {
120             throw e;
121         }finally{
122             read.close();     //关闭文件流
123         }
124         return set;
125     }
126 }
View Code

二、检查敏感词并替换

  1 import java.util.HashSet;
  2 import java.util.Iterator;
  3 import java.util.Map;
  4 import java.util.Set;
  5 
  6 /**
  7  * 敏感词过滤
  8  */
  9 public class SensitivewordFilter {
 10     private Map sensitiveWordMap = null;
 11     public static int minMatchTYpe = 1;      //最小匹配规则
 12     public static int maxMatchType = 2;      //最大匹配规则
 13     private static String replaceString = null;
 14     /**例如:敏感词中含有中国人、中国
 15      * 最小匹配规则minMatchTYpe为1时,会匹配出**人,为2时,会匹配出***
 16      * */
 17     public static void main(String[] args) throws Exception{
 18         SensitivewordFilter filter = new SensitivewordFilter();
 19         System.out.println("敏感词的数量:" + filter.sensitiveWordMap.size());
 20         String string = "dfa是面向三级装配的设计(Design for assembly)的英文简称,是指在产品设计阶段设计产品使得产品具有良好" +
 21                 "的可装配性,确保装配工序简单、装配效率高、装配质量高、装配不良率低和装配成本低。面向装配的设计通过一系" +
 22                 "列有利于装配的设计指南例如简化产品设计、减少零件数量等,女女并同装配工程师一起合作,被逼简化产品结构,近親使其便于" +
 23                 "装配,为提高产品质量、缩短产品开发周期和降低产品成本奠定基础";
 24         // ------获取敏感词---------
 25         Set<String> set = filter.getSensitiveWord(string, 1);
 26         System.out.println("含敏感词的个数为:" + set.size() + "。包含:" + set);
 27         // ------------------------替换敏感字begin----------------------
 28         Iterator<String> iterator = set.iterator();
 29         String word = null;
 30         while (iterator.hasNext()) {
 31             word = iterator.next();
 32             /**
 33              * 得到word中敏感关键词被替换后的字符串,例如:***
 34              * */
 35             getReplaceCharsS("*", word.length());
 36             /**
 37              * 将原字符串中的敏感关键词替换成带有replaceChar
 38              * 或全部为replaceChar的关键词
 39              * */
 40             string = string.replaceAll(word, replaceString);
 41         }
 42         // ------------------------替换敏感字end----------------------
 43         System.out.println(string);
 44     }
 45 
 46     /**
 47      * 构造函数,初始化敏感词库
 48      */
 49     public SensitivewordFilter(){
 50         sensitiveWordMap = new SensitiveWordInit().initKeyWord();
 51     }
 52 
 53     /**
 54      * 判断文字是否包含敏感字符
 55      * @param matchType  匹配规则&nbsp;1:最小匹配规则,2:最大匹配规则
 56      */
 57     public boolean isContaintSensitiveWord(String txt,int matchType){
 58         boolean flag = false;
 59         for(int i = 0 ; i < txt.length() ; i++){
 60             int matchFlag = this.CheckSensitiveWord(txt, i, matchType); //判断是否包含敏感字符
 61             if(matchFlag > 0){    //大于0存在,返回true
 62                 flag = true;
 63             }
 64         }
 65         return flag;
 66     }
 67 
 68     /**
 69      * 获取文字中的敏感词
 70      * @param matchType 匹配规则&nbsp;1:最小匹配规则,2:最大匹配规则
 71      */
 72     public Set<String> getSensitiveWord(String txt , int matchType){
 73         Set<String> sensitiveWordList = new HashSet<String>();
 74 
 75         for(int i = 0 ; i < txt.length() ; i++){
 76             int length = CheckSensitiveWord(txt, i, matchType);    //判断是否包含敏感字符
 77             if(length > 0){    //存在,加入list中
 78                 sensitiveWordList.add(txt.substring(i, i+length));
 79                 i = i + length - 1;    //减1的原因,是因为for会自增
 80             }
 81         }
 82 
 83         return sensitiveWordList;
 84     }
 85 
 86     /**
 87      * 替换敏感字字符,默认*
 88      */
 89     public String replaceSensitiveWord(String txt,int matchType,String replaceChar){
 90         String resultTxt = txt;
 91         Set<String> set = getSensitiveWord(txt, matchType);     //获取所有的敏感词
 92         Iterator<String> iterator = set.iterator();
 93         String word = null;
 94         String replaceString = null;
 95         while (iterator.hasNext()) {
 96             word = iterator.next();
 97             replaceString = getReplaceChars(replaceChar, word.length());
 98             resultTxt = resultTxt.replaceAll(word, replaceString);
 99         }
100 
101         return resultTxt;
102     }
103 
104     /**
105      * 获取替换字符串
106      */
107     private String getReplaceChars(String replaceChar,int length){
108         String resultReplace = replaceChar;
109         for(int i = 1 ; i < length ; i++){
110             resultReplace += replaceChar;
111         }
112 
113         return resultReplace;
114     }
115 
116     /**
117      * 获取替换字符串,无返回值
118      */
119     private static void getReplaceCharsS(String replaceChar,int length){
120         replaceString = "";
121         String resultReplace = replaceChar;
122         for(int i = 1 ; i < length ; i++){
123             resultReplace += replaceChar;
124         }
125         replaceString = resultReplace;
126     }
127 
128     /**
129      * 检查文字中是否包含敏感字符,检查规则如下:<br>
130      */
131     @SuppressWarnings({ "rawtypes"})
132     public int CheckSensitiveWord(String txt,int beginIndex,int matchType){
133         boolean  flag = false;    //敏感词结束标识位:用于敏感词只有1位的情况
134         int matchFlag = 0;     //匹配标识数默认为0
135         char word = 0;
136         Map nowMap = sensitiveWordMap;
137         for(int i = beginIndex; i < txt.length() ; i++){
138             word = txt.charAt(i);
139             nowMap = (Map) nowMap.get(word);     //获取指定key
140             if(nowMap != null){     //存在,则判断是否为最后一个
141                 matchFlag++;     //找到相应key,匹配标识+1
142                 if("1".equals(nowMap.get("isEnd"))){       //如果为最后一个匹配规则,结束循环,返回匹配标识数
143                     flag = true;       //结束标志位为true
144                     if(SensitivewordFilter.minMatchTYpe == matchType){    //最小规则,直接返回,最大规则还需继续查找
145                         break;
146                     }
147                 }
148             }
149             else{     //不存在,直接返回
150                 break;
151             }
152         }
153         if(matchFlag < 2 || !flag){        //长度必须大于等于1,为词
154             matchFlag = 0;
155         }
156         return matchFlag;
157     }
158 
159 }
View Code

三、运行结果

猜你喜欢

转载自www.cnblogs.com/heqiyoujing/p/9259777.html