Sensitive keyword filter utility classes

package com.startx.http.wordfilter;

/**
 * End-type definition: marks whether a trie node has a successor or terminates a word.
 *
 * @author minghu.zhang
 * @date 11:37 2020/11/11
 */
public enum EndType {
    /** more characters follow */
    HAS_NEXT,
    /** a word ends at this node */
    IS_END
}
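
What the dictionary actually stores is the string form of these ordinals: every trie node carries an isEnd entry of "0" (HAS_NEXT) or "1" (IS_END), and terminal nodes additionally carry isWhiteWord. The small sketch below (EndTypeDemo is not part of the original code) shows the node layout that WordContext.addWord, further down, produces for the last character of a blacklisted word:

package com.startx.http.wordfilter;

import java.util.HashMap;
import java.util.Map;

/**
 * Illustration only: the terminal-node layout built by WordContext.addWord.
 */
public class EndTypeDemo {
    public static void main(String[] args) {
        Map<Object, Object> lastNode = new HashMap<>(4);
        lastNode.put("isEnd", String.valueOf(EndType.IS_END.ordinal()));        // "1": a word ends here
        lastNode.put("isWhiteWord", String.valueOf(WordType.BLACK.ordinal()));  // "0": it came from the blacklist
        System.out.println(lastNode);                                           // e.g. {isEnd=1, isWhiteWord=0}
    }
}
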
package com.startx.http.wordfilter;

import java.util.List;

/**
 * Sensitive-word match result.
 *
 * @author minghu.zhang
 */
public class FlagIndex {

    /** whether a sensitive word was matched */
    private boolean flag;

    /** indexes of the matched characters in the input text */
    private List<Integer> index;

    public boolean isFlag() {
        return flag;
    }

    public void setFlag(boolean flag) {
        this.flag = flag;
    }

    public List<Integer> getIndex() {
        return index;
    }

    public void setIndex(List<Integer> index) {
        this.index = index;
    }
}

Algorithm class


package com.startx.http.wordfilter;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.*;

/**
 * Dictionary context.
 *
 * Initializes the sensitive-word dictionaries, loads the words into a HashMap,
 * and builds the DFA (trie) model.
 *
 * @author minghu.zhang
 */
@SuppressWarnings({"rawtypes", "unchecked"})
public class WordContext {

    /** sensitive-word dictionary (trie root) */
    private final Map wordMap = new HashMap(1024);

    /** whether the dictionaries have been initialized */
    private boolean init;

    /** blacklist resource path */
    private String blackList;

    /** whitelist resource path */
    private String whiteList;

    public WordContext() {
        this.blackList = "/blacklist.txt";
        this.whiteList = "/whitelist.txt";
        initKeyWord();
    }

    public WordContext(String blackList, String whiteList) {
        this.blackList = blackList;
        this.whiteList = whiteList;
        initKeyWord();
    }

    /**
     * Get the initialized sensitive-word map.
     *
     * @return the word map
     */
    public Map getWordMap() {
        return wordMap;
    }

    /**
     * Initialize the dictionaries.
     */
    private synchronized void initKeyWord() {
        try {
            if (!init) {
                // load the blacklist into the HashMap
                addWord(readWordFile(blackList), WordType.BLACK);
                // load the whitelist (non-sensitive words) into the HashMap as well
                addWord(readWordFile(whiteList), WordType.WHITE);
            }
            init = true;
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Add words to the dictionary and build the DFA model. For example, the words
     * 中国人民, 中国男人 and 五星红旗 produce:
     *
     * 中 = {
     *   isEnd = 0
     *   国 = {
     *     isEnd = 1
     *     人 = { isEnd = 0, 民 = { isEnd = 1 } }
     *     男 = { isEnd = 0, 人 = { isEnd = 1 } }
     *   }
     * }
     * 五 = { isEnd = 0, 星 = { isEnd = 0, 红 = { isEnd = 0, 旗 = { isEnd = 1 } } } }
     */
    public void addWord(Iterable<String> wordList, WordType wordType) {
        Map nowMap;
        Map newWorMap;
        // iterate over the word list
        for (String key : wordList) {
            nowMap = wordMap;
            for (int i = 0; i < key.length(); i++) {
                // current character
                char keyChar = key.charAt(i);
                // look up the child node for this character
                Object wordMap = nowMap.get(keyChar);
                // if the key already exists, descend into it
                if (wordMap != null) {
                    nowMap = (Map) wordMap;
                } else {
                    // otherwise create a new node and mark it as not terminal
                    newWorMap = new HashMap<>(4);
                    newWorMap.put("isEnd", String.valueOf(EndType.HAS_NEXT.ordinal()));
                    nowMap.put(keyChar, newWorMap);
                    nowMap = newWorMap;
                }
                if (i == key.length() - 1) {
                    // last character: mark the node as terminal and record its list type
                    nowMap.put("isEnd", String.valueOf(EndType.IS_END.ordinal()));
                    nowMap.put("isWhiteWord", String.valueOf(wordType.ordinal()));
                }
            }
        }
    }

    /**
     * Remove words at runtime.
     *
     * @param wordList words to remove
     * @param wordType WordType.BLACK for the blacklist, WordType.WHITE for the whitelist
     */
    public void removeWord(Iterable<String> wordList, WordType wordType) {
        Map nowMap;
        for (String key : wordList) {
            List<Map> cacheList = new ArrayList<>();
            nowMap = wordMap;
            for (int i = 0; i < key.length(); i++) {
                char keyChar = key.charAt(i);
                Object map = nowMap.get(keyChar);
                if (map != null) {
                    nowMap = (Map) map;
                    cacheList.add(nowMap);
                } else {
                    throw new RuntimeException("Operation failed: neither the blacklist nor the whitelist contains [" + key + "]");
                }
                if (i == key.length() - 1) {
                    char[] keys = key.toCharArray();
                    boolean cleanable = false;
                    char lastChar = 0;
                    for (int j = cacheList.size() - 1; j >= 0; j--) {
                        Map cacheMap = cacheList.get(j);
                        if (j == cacheList.size() - 1) {
                            if (String.valueOf(WordType.BLACK.ordinal()).equals(cacheMap.get("isWhiteWord"))) {
                                if (wordType == WordType.WHITE) {
                                    throw new RuntimeException("Operation failed: the whitelist does not contain [" + key + "]");
                                }
                            }
                            if (String.valueOf(WordType.WHITE.ordinal()).equals(cacheMap.get("isWhiteWord"))) {
                                if (wordType == WordType.BLACK) {
                                    throw new RuntimeException("Operation failed: the blacklist does not contain [" + key + "]");
                                }
                            }
                            cacheMap.remove("isWhiteWord");
                            cacheMap.remove("isEnd");
                            if (cacheMap.size() == 0) {
                                // the terminal node is now empty, so its parent chain may be pruned
                                cleanable = true;
                                lastChar = keys[j];
                                continue;
                            }
                        }
                        if (cleanable) {
                            Object isEnd = cacheMap.get("isEnd");
                            if (String.valueOf(EndType.IS_END.ordinal()).equals(isEnd)) {
                                // this node ends another word, so it must be kept
                                cleanable = false;
                            }
                            // detach the emptied child branch
                            cacheMap.remove(lastChar);
                            // stop pruning if other words still branch off this node (guards shared prefixes)
                            if (!(cacheMap.isEmpty() || (cacheMap.size() == 1 && cacheMap.containsKey("isEnd")))) {
                                cleanable = false;
                            }
                        }
                        lastChar = keys[j];
                    }
                    if (cleanable) {
                        wordMap.remove(lastChar);
                    }
                }
            }
        }
    }

    /**
     * Read a word file from the classpath and add each line to a set.
     */
    private Set<String> readWordFile(String file) throws Exception {
        Set<String> set;
        // character encoding
        String encoding = "UTF-8";
        try (InputStreamReader read = new InputStreamReader(
                this.getClass().getResourceAsStream(file), encoding)) {
            set = new HashSet<>();
            BufferedReader bufferedReader = new BufferedReader(read);
            String txt;
            // read the file line by line and collect the words
            while ((txt = bufferedReader.readLine()) != null) {
                set.add(txt);
            }
        }
        // the stream is closed automatically by try-with-resources
        return set;
    }
}
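
Both dictionaries are loaded with getClass().getResourceAsStream, so they must be classpath resources, and readWordFile treats every line as one word (UTF-8). A minimal setup sketch follows; WordContextDemo, the alternative resource names, and the sample word are illustrative only and not part of the original code:

package com.startx.http.wordfilter;

import java.util.Collections;

/**
 * Setup sketch: assumes /blacklist.txt and /whitelist.txt exist on the classpath,
 * one word per line.
 */
public class WordContextDemo {
    public static void main(String[] args) {
        // default dictionaries
        WordContext context = new WordContext();
        // or point at custom resources (hypothetical names):
        // WordContext context = new WordContext("/my-black.txt", "/my-white.txt");

        // entries can also be added and removed at runtime
        context.addWord(Collections.singletonList("中国男人"), WordType.BLACK);
        context.removeWord(Collections.singletonList("中国男人"), WordType.BLACK);
    }
}
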

Character matching class


package com.startx.http.wordfilter;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Objects;

/**
 * Sensitive-word filter.
 *
 * @author minghu.zhang
 */
@SuppressWarnings("rawtypes")
public class WordFilter {

    /** dictionary context */
    private final WordContext context;

    /** sensitive-word map */
    private final Map wordMap;

    /** constructor */
    public WordFilter(WordContext context) {
        this.context = context;
        this.wordMap = context.getWordMap();
    }

    /**
     * Replace sensitive words.
     *
     * @param text input text
     */
    public String replace(final String text) {
        return replace(text, 0, '*');
    }

    /**
     * Replace sensitive words.
     *
     * @param text   input text
     * @param symbol replacement character
     */
    public String replace(final String text, final char symbol) {
        return replace(text, 0, symbol);
    }

    /**
     * Replace sensitive words.
     *
     * @param text   input text
     * @param skip   skip distance (max gap between matched characters)
     * @param symbol replacement character
     */
    public String replace(final String text, final int skip, final char symbol) {
        char[] charset = text.toCharArray();
        for (int i = 0; i < charset.length; i++) {
            FlagIndex fi = getFlagIndex(charset, i, skip);
            if (fi.isFlag()) {
                for (int j : fi.getIndex()) {
                    charset[j] = symbol;
                }
            } else {
                if (fi.getIndex().size() > 0) {
                    i += fi.getIndex().size() - 1;
                }
            }
        }
        return new String(charset);
    }

    /**
     * Check whether the text contains a sensitive word.
     *
     * @param text input text
     */
    public boolean include(final String text) {
        return include(text, 0);
    }

    /**
     * Check whether the text contains a sensitive word.
     *
     * @param text input text
     * @param skip skip distance (max gap between matched characters)
     */
    public boolean include(final String text, final int skip) {
        boolean flag = false;
        char[] charset = text.toCharArray();
        for (int i = 0; i < charset.length; i++) {
            flag = getFlagIndex(charset, i, skip).isFlag();
            if (flag) {
                break;
            }
        }
        return flag;
    }

    /**
     * Count the sensitive words in the text.
     *
     * @param text input text
     */
    public int wordCount(final String text) {
        return wordCount(text, 0);
    }

    /**
     * Count the sensitive words in the text.
     *
     * @param text input text
     * @param skip skip distance (max gap between matched characters)
     */
    public int wordCount(final String text, final int skip) {
        int count = 0;
        char[] charset = text.toCharArray();
        for (int i = 0; i < charset.length; i++) {
            FlagIndex fi = getFlagIndex(charset, i, skip);
            if (fi.isFlag()) {
                count++;
            }
        }
        return count;
    }

    /**
     * List the sensitive words found in the text.
     *
     * @param text input text
     */
    public List<String> wordList(final String text) {
        return wordList(text, 0);
    }

    /**
     * List the sensitive words found in the text.
     *
     * @param text input text
     * @param skip skip distance (max gap between matched characters)
     */
    public List<String> wordList(final String text, final int skip) {
        List<String> sensitives = new ArrayList<>();
        char[] charset = text.toCharArray();
        for (int i = 0; i < charset.length; i++) {
            FlagIndex fi = getFlagIndex(charset, i, skip);
            if (fi.isFlag()) {
                StringBuilder builder = new StringBuilder();
                for (int j : fi.getIndex()) {
                    char word = text.charAt(j);
                    builder.append(word);
                }
                sensitives.add(builder.toString());
            }
        }
        return sensitives;
    }

    /**
     * Run the DFA match from a start position and collect the match flag and indexes.
     *
     * @param charset input text as a char array
     * @param begin   start position
     * @param skip    skip distance (max gap between matched characters)
     */
    private FlagIndex getFlagIndex(final char[] charset, final int begin, final int skip) {
        FlagIndex fi = new FlagIndex();
        Map current = wordMap;
        boolean flag = false;
        int count = 0;
        List<Integer> index = new ArrayList<>();
        for (int i = begin; i < charset.length; i++) {
            char word = charset[i];
            Map mapTree = (Map) current.get(word);
            if (count > skip || (i == begin && Objects.isNull(mapTree))) {
                break;
            }
            if (Objects.nonNull(mapTree)) {
                current = mapTree;
                count = 0;
                index.add(i);
            } else {
                count++;
                if (flag && count > skip) {
                    break;
                }
            }
            if ("1".equals(current.get("isEnd"))) {
                flag = true;
            }
            // the current word is whitelisted: drop the match
            if ("1".equals(current.get("isWhiteWord"))) {
                flag = false;
                break;
            }
        }
        fi.setFlag(flag);
        fi.setIndex(index);
        return fi;
    }
}
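
A short usage sketch of the public API; WordFilterDemo and the sample sentence are invented for illustration, and the actual output depends on the dictionaries you load:

package com.startx.http.wordfilter;

/**
 * Usage sketch: assumes the default classpath dictionaries are present.
 */
public class WordFilterDemo {
    public static void main(String[] args) {
        WordFilter filter = new WordFilter(new WordContext());

        String text = "这是一段包含敏感词的测试文本";

        // true if any blacklist word (not overridden by the whitelist) is present
        boolean hit = filter.include(text);

        // replace matched characters with '*' (or any symbol)
        String masked = filter.replace(text);
        String maskedHash = filter.replace(text, '#');

        // skip = 1 also matches words whose characters are separated by one other
        // character, e.g. "敏x感x词" when "敏感词" is blacklisted
        String maskedSkip = filter.replace(text, 1, '*');

        System.out.println(hit);
        System.out.println(masked);
        System.out.println(maskedHash);
        System.out.println(maskedSkip);
        System.out.println(filter.wordCount(text));   // number of matches
        System.out.println(filter.wordList(text));    // the matched words themselves
    }
}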

Word type definition

package com.startx.http.wordfilter;

/**
 * Word type: blacklist or whitelist.
 *
 * @author minghu.zhang
 * @date 11:37 2020/11/11
 */
public enum WordType {
    /** blacklist word */
    BLACK,
    /** whitelist word */
    WHITE
}
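
The whitelist exists to cancel false positives: while getFlagIndex walks the trie it keeps extending the match, and as soon as it reaches a node whose isWhiteWord is WHITE it clears the flag, so a whitelisted longer phrase overrides a blacklisted prefix. A hypothetical illustration (WordTypeDemo and the sample words are made up; the expected values assume the default dictionaries contain nothing conflicting):

package com.startx.http.wordfilter;

import java.util.Collections;

/**
 * Illustration of whitelist semantics.
 */
public class WordTypeDemo {
    public static void main(String[] args) {
        WordContext context = new WordContext();
        // "成人" is treated as sensitive ...
        context.addWord(Collections.singletonList("成人"), WordType.BLACK);
        // ... but the longer phrase "成人教育" is explicitly allowed
        context.addWord(Collections.singletonList("成人教育"), WordType.WHITE);

        WordFilter filter = new WordFilter(context);
        System.out.println(filter.include("禁止成人内容"));     // expected: true (blacklist hit)
        System.out.println(filter.include("报名成人教育课程")); // expected: false (whitelist overrides)
    }
}
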

