Starrier/commons

View on GitHub
src/main/java/org/starrier/common/utils/SensitiveWordUtil.java

Summary

Maintainability
A
2 hrs
Test Coverage
package org.starrier.common.utils;
 
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.stream.IntStream;
 
import static org.starrier.common.constant.Constant.IS_END;
import static org.starrier.common.constant.Constant.MAX_MATCH_TYPE;
import static org.starrier.common.constant.Constant.MIN_MATCH_T_YPE;
import static org.starrier.common.constant.Constant.ONE;
import static org.starrier.common.constant.Constant.ZERO;
 
/**
* @author Starrier
* @date 2019/4/10.
* <p>
* Description :
*/
public class SensitiveWordUtil {
 
/**
* 敏感词集合
*/
public static Map sensitiveWordMap;
 
private SensitiveWordUtil() {
init(null);
}
 
public static Map getSensitiveWordMap() {
return sensitiveWordMap;
}
 
public static void setSensitiveWordMap(Map sensitiveWordMap) {
SensitiveWordUtil.sensitiveWordMap = sensitiveWordMap;
}
 
/**
* 初始化敏感词库,构建DFA算法模型
*
* @param sensitiveWordSet 敏感词库
*/
public static synchronized void init(Set<String> sensitiveWordSet) {
sensitiveWordSet = new HashSet<>(7);
sensitiveWordSet.add("太多");
sensitiveWordSet.add("爱恋");
sensitiveWordSet.add("静静");
sensitiveWordSet.add("哈哈");
sensitiveWordSet.add("啦啦");
sensitiveWordSet.add("感动");
sensitiveWordSet.add("发呆");
initSensitiveWordMap(sensitiveWordSet);
}
 
/**
* 初始化敏感词库,构建 DFA 算法模型
*
* @param sensitiveWordSet 敏感词库
*/
Method `initSensitiveWordMap` has a Cognitive Complexity of 10 (exceeds 5 allowed). Consider refactoring.
@SuppressWarnings({"rawtypes", "unchecked"})
private static void initSensitiveWordMap(Set<String> sensitiveWordSet) {
//初始化敏感词容器,减少扩容操作
sensitiveWordMap = new HashMap<>(sensitiveWordSet.size());
String key;
Map nowMap;
Map newWorMap;
 
//迭代sensitiveWordSet
for (String s : sensitiveWordSet) {
//关键字
key = s;
nowMap = sensitiveWordMap;
for (int i = 0; i < key.length(); i++) {
//转换成char型
char keyChar = key.charAt(i);
//库中获取关键字
Object wordMap = nowMap.get(keyChar);
//如果存在该key,直接赋值,用于下一个循环获取
if (wordMap != null) {
nowMap = (Map) wordMap;
} else {
//不存在则,则构建一个map,同时将isEnd设置为0,因为他不是最后一个
newWorMap = new HashMap<>(sensitiveWordSet.size());
//不是最后一个
newWorMap.put(IS_END, ZERO);
nowMap.put(keyChar, newWorMap);
nowMap = newWorMap;
}
if (i == key.length() - 1) {
//最后一个
nowMap.put(IS_END, ONE);
}
}
}
}
 
/**
* 判断文字是否包含敏感字符
*
* @param txt 文字
* @param matchType 匹配规则 1:最小匹配规则,2:最大匹配规则
* @return 若包含返回true,否则返回false
*/
private static boolean contains(String txt, int matchType) {
boolean flag = false;
for (int i = 0; i < txt.length(); i++) {
if (checkSensitiveWord(txt, i, matchType) > 0) {
flag = true;
}
}
return flag;
}
 
/**
* 判断文字是否包含敏感字符
*
* @param txt 文字
* @return 若包含返回true,否则返回false
*/
public static boolean contains(String txt) {
return contains(txt, MAX_MATCH_TYPE);
}
 
/**
* 获取文字中的敏感词
*
* @param txt 文字
* @param matchType 匹配规则 1:最小匹配规则,2:最大匹配规则
* @return Set<String>
*/
private static Set<String> getSensitiveWord(String txt, int matchType) {
Set<String> sensitiveWordList = new HashSet<>(txt.length());
IntStream.range(0, txt.length()).forEach(i -> {
int length = checkSensitiveWord(txt, i, matchType);
//存在,加入list中
if (length > 0) {
sensitiveWordList.add(txt.substring(i, i + length));
//减1的原因,是因为for会自增
i = i + length - 1;
}
});
return sensitiveWordList;
}
 
/**
* 获取文字中的敏感词
*
* @param txt content.
* @return return.
*/
public static Set<String> getSensitiveWord(String txt) {
return getSensitiveWord(txt, MAX_MATCH_TYPE);
}
 
/**
* 替换敏感字字符
*
* @param txt 文本
* @param replaceChar 替换的字符,匹配的敏感词以字符逐个替换,
* 如 语句:我爱中国人 敏感词:中国人,替换字符:*, 替换结果:我爱***
* @param matchType 敏感词匹配规则
* @return return
*/
private static String replaceSensitiveWord(String txt, char replaceChar, int matchType) {
String resultTxt = txt;
//获取所有的敏感词
Set<String> set = getSensitiveWord(txt, matchType);
Iterator<String> iterator = set.iterator();
String word;
String replaceString;
while (iterator.hasNext()) {
word = iterator.next();
replaceString = getReplaceChars(replaceChar, word.length());
resultTxt = resultTxt.replaceAll(word, replaceString);
}
 
return resultTxt;
}
 
/**
* 替换敏感字字符
*
* @param txt 文本
* @param replaceChar 替换的字符,匹配的敏感词以字符逐个替换,
* 如 语句:我爱中国人 敏感词:中国人,替换字符:*, 替换结果:我爱***
* @return result come from {@see SensitiveWordUtil#replaceSensitiveWord(String, char)}
*/
public static String replaceSensitiveWord(String txt, char replaceChar) {
return replaceSensitiveWord(txt, replaceChar, MAX_MATCH_TYPE);
}
 
/**
* 替换敏感字字符
*
* @param txt 文本
* @param replaceStr 替换的字符串,匹配的敏感词以字符逐个替换,
* 如 语句:我爱中国人 敏感词:中国人,替换字符串:[屏蔽],替换结果:我爱[屏蔽]
* @param matchType 敏感词匹配规则
* @return return
*/
private static String replaceSensitiveWord(String txt, String replaceStr, int matchType) {
String resultTxt = txt;
//获取所有的敏感词
Set<String> set = getSensitiveWord(txt, matchType);
Iterator<String> iterator = set.iterator();
String word;
while (iterator.hasNext()) {
word = iterator.next();
resultTxt = resultTxt.replaceAll(word, replaceStr);
}
return resultTxt;
}
 
/**
* 替换敏感字字符
*
* @param txt 文本
* @param replaceStr 替换的字符串,匹配的敏感词以字符逐个替换
* 如 语句:我爱中国人 敏感词:中国人,替换字符串:[屏蔽],替换结果:我爱[屏蔽]
* @return return.
*/
public static String replaceSensitiveWord(String txt, String replaceStr) {
return replaceSensitiveWord(txt, replaceStr, MAX_MATCH_TYPE);
}
 
/**
* 获取替换字符串
*
* @param replaceChar replace char.
* @param length length.
* @return return.
*/
private static String getReplaceChars(char replaceChar, int length) {
//In JDK 11+
/*return String.valueOf(replaceChar) + String.valueOf( ).repeat(Math.max(0, length - 1));*/
/**
* In JDK 1.8
*/
return null;
}
 
/**
* 检查文字中是否包含敏感字符,检查规则如下:<br>
*
* @param txt content.
* @param beginIndex begin start index.
* @param matchType match type.
* @return 如果存在,则返回敏感词字符的长度,不存在返回0
*/
Method `checkSensitiveWord` has a Cognitive Complexity of 13 (exceeds 5 allowed). Consider refactoring.
private static int checkSensitiveWord(String txt, int beginIndex, int matchType) {
//敏感词结束标识位:用于敏感词只有1位的情况
boolean flag = false;
//匹配标识数默认为0
int matchFlag = 0;
char word;
Map nowMap = sensitiveWordMap;
for (int i = beginIndex; i < txt.length(); i++) {
word = txt.charAt(i);
//获取指定key
nowMap = (Map) nowMap.get(word);
//存在,则判断是否为最后一个
if (nowMap != null) {
//找到相应key,匹配标识+1
matchFlag++;
//如果为最后一个匹配规则,结束循环,返回匹配标识数
if ("1".equals(nowMap.get("isEnd"))) {
//结束标志位为true
flag = true;
//最小规则,直接返回,最大规则还需继续查找
if (MIN_MATCH_T_YPE == matchType) {
break;
}
}
} else {
//不存在,直接返回
break;
}
}
//长度必须大于等于1,为词
if (matchFlag < 2 || !flag) {
matchFlag = 0;
}
return matchFlag;
}
}