敏感词之DFA算法

前言

DFA 即 Deterministic Finite Automaton，也就是确定有穷自动机。根源在于建立树形词库模型，树形词库检索优于循环检索

构建算法模型

DFA需要创建一个词库模型，模型JSON如下

构建步骤说明

graph LR
	开始((开始)) --> 读取当前字符
	读取当前字符 --> 模型中是否存在当前字符>模型中是否存在当前字符]
    模型中是否存在当前字符 --存在--> 设置当前词库上级为模型中已定义的信息
    设置当前词库上级为模型中已定义的信息--> 是否已读取结束
    模型中是否存在当前字符 --不存在--> 创建模型并添加
    创建模型并添加--> 设置当前词库上级为模型中已定义的信息
    是否已读取结束 --是--> 结束((结束))
    是否已读取结束 --否--> 读取当前字符

检索词

graph LR
START(开始) --> READ_CAHRS("读取检索词char")
READ_CAHRS --> IS_EXISTS(模型中是否存在)
IS_EXISTS --存在--> 向下读取(记录当前下标并循环向下读取模型)
IS_EXISTS --不存在--> 结束(结束)
向下读取 --> WORD_END(模型中词结束,表示找到了词库中的词)
WORD_END --> 结束

完全代码

package com.startx.http.wordfilter;

import com.alibaba.fastjson.JSON;

import java.util.*;
import java.util.stream.Collectors;

/**
 * <p>
 * DFA算法实现
 * </p>
 *
 * @author KThirty
 * @since 2022/8/31
 */
@SuppressWarnings({"rawtypes", "unchecked"})
public class DfaWordFilter {
    private final Map<Character, Object> wordMap = new HashMap<>(8192);

    public static DfaWordFilter create() {
        return new DfaWordFilter();
    }


    /**
     * ignore
     */
    public void addWord(WordType wordType, String... words) {
        addWord(wordType, "", 1, words);
    }

    /**
     * 添加词库
     *
     * @param wordType  词类型
     * @param wordClass 词性
     * @param weight    权重
     * @param words     词
     */
    public void addWord(WordType wordType, String wordClass, int weight, String... words) {
        addWord(wordType, wordClass, weight, new HashSet<>(Arrays.asList(words)));
    }

    /**
     * 添加词库
     *
     * @param wordType  词类型
     * @param wordClass 词性
     * @param weight    权重
     * @param words     词
     */
    public void addWord(WordType wordType, String wordClass, int weight, Set<String> words) {
        // 上级Map
        Map lastMap;
        // 文字不存在时用于创建新Map
        Map newWordMap;
        for (String word : words) {
            // 初始化上级为所有文字map
            lastMap = wordMap;
            char[] chars = word.toCharArray();
            int length = chars.length;
            for (int i = 0; i < length; i++) {
                char c = chars[i];
                // 查看上级Map是否已存在该词
                Object wordMap = lastMap.get(c);
                // 存在则使用本字符不处理，直接使用存在的map
                if (wordMap != null) {
                    lastMap = (Map) wordMap;
                } else {
                    // 不存在则创建一个Map
                    newWordMap = new HashMap<>(4);
                    // 默认当前字未结束，如果单字，下方的判断会重置为Yes
                    // newWordMap.put(Constants.IS_END, Constants.NO);
                    // 本级设置到上级Map中并开始下级
                    lastMap.put(c, newWordMap);
                    lastMap = newWordMap;
                }
                // 词结束时设置是否结束与词类型
                if (i == length - 1) {
                    lastMap.put(Constants.IS_END, Constants.YES);
                    lastMap.put(Constants.WORD_TYPE, wordType.getValue());
                    lastMap.put(Constants.WORD_CLASS, wordClass);
                    lastMap.put(Constants.WORD_WEIGHT, weight);
                }

            }
        }
    }

    public String highlight(final String word, final int skip, final String prefix, final String suffix) {
        char[] chars = word.toCharArray();
        List<SearchResult> search = search(word, skip, WordType.BLACK_WORD);
        StringBuilder result = new StringBuilder();
        int current = 0;
        for (SearchResult searchResult : search) {
            // 拼接区间原始数据
            for (int i = current; i < searchResult.getWordStartIndex(); i++) {
                result.append(chars[i]);
            }
            // 前缀
            result.append(prefix);
            // 拼接原始数据
            for (int i = searchResult.getWordStartIndex(); i <= searchResult.getWordEndIndex(); i++) {
                result.append(chars[i]);
            }
            // 拼接后缀
            result.append(suffix);
            current = searchResult.getWordEndIndex() + 1;
        }
        for (int i = current; i < chars.length; i++) {
            result.append(chars[i]);
        }
        return result.toString();
    }

    public List<SearchResult> search(final String word, final int skip, WordType... wordTypes) {
        assert word != null;
        assert skip >= 0;
        assert wordTypes != null && wordTypes.length > 0;
        assert !"".equals(word);
        char[] charArray = word.toCharArray();
        List<SearchResult> results = new ArrayList<>();
        for (int i = 0; i < charArray.length; i++) {
            // 查询词
            SearchResult searchResult = searchWord(charArray, i, skip);
            // 没找到跳过
            if (!searchResult.isFound()) {
                continue;
            }
            // 整理需要查询的词类型
            Set<String> wordTypeSet = Arrays.stream(wordTypes).map(WordType::getValue).collect(Collectors.toSet());
            // 不包含当前词类型则跳过
            if (!wordTypeSet.contains(searchResult.getWordType())) {
                i = searchResult.getWordEndIndex();
                continue;
            }
            results.add(searchResult);
        }
        return results;
    }

    /**
     * 搜索词
     *
     * @param chars 词array
     * @param start 开始下标
     * @param skip  模糊次数（skip=1时 中华人民 可以匹配 中华的人民）
     * @return 搜索结果
     */
    public SearchResult searchWord(final char[] chars, final int start, final int skip) {
        SearchResult searchResult = new SearchResult();
        // 跳跃次数
        int skipCount = 0;
        // 上一个字的map
        Map lastMap = wordMap;
        // 文字长度
        int length = chars.length;
        for (int i = start; i < length; i++) {
            // 已跳跃足够次数，未找到对应词
            if (skipCount > skip) {
                break;
            }
            char word = chars[i];
            Map wordTree = (Map) lastMap.get(word);
            // 第一次查询，且未找到以首字符开头的词树，跳出循环
            if (i == start && wordTree == null) {
                break;
            }
            // 未找到词树，跳过本次，开始下一个字
            if (wordTree == null) {
                skipCount++;
                continue;
            }
            // 跳过次数重置
//            skipCount = 0;
            // 找到字树，开始查询下一个字
            lastMap = wordTree;
            // 记录当前下标
            searchResult.getWordIndex().add(i);
            // 词库中词已结束，视为找到词
            // 黑名单不可直接跳出，有可能在白名单中
            if (Constants.YES.equals(wordTree.get(Constants.IS_END))) {
                searchResult.setFound(true);
                searchResult.setWordClass((String) wordTree.get(Constants.WORD_CLASS));
                searchResult.setWordWeight(Integer.valueOf((String) wordTree.get(Constants.WORD_WEIGHT)));
            }
            // 整理词类型
            String wordType = (String) wordTree.get(Constants.WORD_TYPE);
            // 设置词类型
            searchResult.setWordType((String) wordTree.get(Constants.WORD_TYPE));
            // 如果是白名单，直接跳出，不再查询，否则继续查询黑名单
            if (WordType.WHITE_WORD.getValue().equals(wordType)) {
                break;
            }
        }
        return searchResult;
    }


    static class SearchResult {
        private boolean found = false;
        private String wordType;
        private String wordClass;
        private Integer wordWeight;
        private List<Integer> wordIndex = new ArrayList<>();

        public boolean isFound() {
            return found;
        }

        public void setFound(boolean found) {
            this.found = found;
        }

        public String getWordType() {
            return wordType;
        }

        public void setWordType(String wordType) {
            this.wordType = wordType;
        }

        public List<Integer> getWordIndex() {
            return wordIndex;
        }

        public void setWordIndex(List<Integer> wordIndex) {
            this.wordIndex = wordIndex;
        }

        public String getWordClass() {
            return wordClass;
        }

        public SearchResult setWordClass(String wordClass) {
            this.wordClass = wordClass;
            return this;
        }

        public Integer getWordWeight() {
            return wordWeight;
        }

        public SearchResult setWordWeight(Integer wordWeight) {
            this.wordWeight = wordWeight;
            return this;
        }

        public int getWordStartIndex() {
            if (wordIndex != null && !wordIndex.isEmpty()) {
                return wordIndex.get(0);
            }
            return -1;
        }

        public int getWordEndIndex() {
            if (wordIndex != null && !wordIndex.isEmpty()) {
                return wordIndex.get(wordIndex.size() - 1);
            }
            return -1;
        }
    }

    enum WordType {
        /**
         * 黑名单，白名单
         */
        BLACK_WORD("1", "黑名单"),
        WHITE_WORD("2", "白名单");
        private final String value;
        private final String description;

        WordType(String value, String description) {
            this.value = value;
            this.description = description;
        }

        public String getValue() {
            return value;
        }

        public String getDescription() {
            return description;
        }

        public boolean match(String value) {
            return this.value.equals(value);
        }

        public static WordType fromValue(String value) {
            for (WordType type : WordType.values()) {
                if (type.getValue().equals(value)) {
                    return type;
                }
            }
            return null;
        }
    }

    interface Constants {
        String YES = "1";
        String NO = "0";
        /**
         * 词类型
         */
        char WORD_TYPE = 'T';
        /**
         * 是否结束
         */
        char IS_END = 'E';
        /**
         * 词性
         */
        char WORD_CLASS = 'C';
        /**
         * 词权重
         */
        char WORD_WEIGHT = 'W';
    }


    public static void main(String[] args) {
        DfaWordFilter dfaWordFilter = DfaWordFilter.create();
        dfaWordFilter.addWord(WordType.BLACK_WORD, "中国解放军", "人民解放军", "人民子弟兵");
        dfaWordFilter.addWord(WordType.WHITE_WORD, "中国人民解放军");
        System.out.println(JSON.toJSONString(dfaWordFilter.wordMap));
        String word = "省检察院召开第三季度院务会，回顾总结前三季度工作情况，深入分析工作中存在的突出问题和短板弱项，明确四季度的工作重点和要求，动员全体干警立足主责主业、聚焦改革创新、强势推进落实、奋力夺冠争先，确保年度工作任务高质量圆满完成。省检察院党组书记、检察长杨景海主持会议，听取了院领导对分管工作的总结点评，研究部署下一步工作。巡视员、厅级干部、二级高级检察官、各内设机构、院属事业单位负责人参加会议。";
        String highlight = dfaWordFilter.highlight(word, 0, "<span class=\"highlight\">", "</span>");
        System.out.println(highlight);
    }
}