首页 > 代码库 > 过滤敏感字DFA JAVA实现

过滤敏感字DFA JAVA实现

package com.member.schedule;

import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * @author 徐良永
 * @Date 2011-10-13 上午9:23:43
 */
public class DFA {

    private static final Logger logger = LoggerFactory.getLogger(DFA.class);

    /**
     * 根节点
     */
    private TreeNode rootNode = new TreeNode();

    /**
     * 关键词缓存
     */
    private ByteBuffer keywordBuffer = ByteBuffer.allocate(1024);

    /**
     * 关键词编码
     */
    private String charset = "GBK";

    /**
     * 创建DFA
     * 
     * @param keywordList
     * @throws UnsupportedEncodingException
     */
    public void createKeywordTree(List<String> keywordList) throws UnsupportedEncodingException {

        for (String keyword : keywordList) {
            if (keyword == null)
                continue;
            keyword = keyword.trim();
            byte[] bytes = keyword.getBytes(charset);

            TreeNode tempNode = rootNode;
            // 循环每个字节
            for (int i = 0; i < bytes.length; i++) {
                int index = bytes[i] & 0xff; // 字符转换成数字
                TreeNode node = tempNode.getSubNode(index);

                if (node == null) { // 没初始化
                    node = new TreeNode();
                    tempNode.setSubNode(index, node);
                }

                tempNode = node;

                if (i == bytes.length - 1) {
                    tempNode.setKeywordEnd(true); // 关键词结束, 设置结束标志
                    logger.debug("DFA:{}", keyword);
                }
            } // end for
        } // end for

    }

    /**
     * 搜索关键字
     */
    public String searchKeyword(String text) throws UnsupportedEncodingException {
        return searchKeyword(text.getBytes(charset));
    }

    /**
     * 搜索关键字
     */
    public String searchKeyword(byte[] bytes) {
        StringBuilder words = new StringBuilder();

        if (bytes == null || bytes.length == 0) {
            return words.toString();
        }

        TreeNode tempNode = rootNode;
        int rollback = 0; // 回滚数
        int position = 0; // 当前比较的位置

        while (position < bytes.length) {
            int index = bytes[position] & 0xFF;
            keywordBuffer.put(bytes[position]); // 写关键词缓存
            tempNode = tempNode.getSubNode(index);

            // 当前位置的匹配结束
            if (tempNode == null) {
                position = position - rollback; // 回退 并测试下一个字节
                rollback = 0;
                tempNode = rootNode; // 状态机复位
                keywordBuffer.clear(); // 清空
            } else if (tempNode.isKeywordEnd()) { // 是结束点 记录关键词
                keywordBuffer.flip();
                String keyword = Charset.forName(charset).decode(keywordBuffer).toString();
                logger.debug("Find keyword:{}", keyword);
                keywordBuffer.limit(keywordBuffer.capacity());

                if (words.length() == 0)
                    words.append(keyword);
                else
                    words.append(":").append(keyword);

                rollback = 1; // 遇到结束点 rollback 置为1
            } else {
                rollback++; // 非结束点 回退数加1
            }

            position++;
        }

        return words.toString();
    }

    public void setCharset(String charset) {
        this.charset = charset;
    }

}
package com.member.schedule;

import java.util.ArrayList;
import java.util.List;

/**
 * 树节点 每个节点包含一个长度为256的数组
 * 
 * @author 徐良永
 * @Date 2011-10-12 上午3:11:24
 */
public class TreeNode {

    private static final int NODE_LEN = 256;

    /**
     * true 关键词的终结 ; false 继续
     */
    private boolean end = false;

    private List<TreeNode> subNodes = new ArrayList<TreeNode>(NODE_LEN);

    public TreeNode() {
        for (int i = 0; i < NODE_LEN; i++) {
            subNodes.add(i, null);
        }
    }

    /**
     * 向指定位置添加节点树
     * 
     * @param index
     * @param node
     */
    public void setSubNode(int index, TreeNode node) {
        subNodes.set(index, node);
    }

    public TreeNode getSubNode(int index) {
        return subNodes.get(index);
    }

    public boolean isKeywordEnd() {
        return end;
    }

    public void setKeywordEnd(boolean end) {
        this.end = end;
    }

}

link:http://www.iteye.com/topic/1116520

过滤敏感字DFA JAVA实现