首页 > 代码库 > 利用AC自动机进行关键字的提取和过滤

利用AC自动机进行关键字的提取和过滤

昨天看了meituan.com的AC算法在美团上单系统的应用一文,深受启发,原来ACM算法在工程中也能有这样赤裸裸的运用~~~ 于是便复习了AC自动机,并把代码用java重新搞了一遍~~



AC自动机整体的结构大概是长这样的,其实就是在trie树上做KMP:
技术分享

技术分享

AC自动机里面比较难理解的应该是它的失配指针的计算过程。
这个计算过程从本质上讲就是进行一遍广搜(BFS),与此同时维护
fail指针,每一步的维护过程可用下图表示。
技术分享


技术分享

Keyword.java

package com.AC.domain;

import java.io.*;
import java.util.*;
import java.math.*;

/**
 * A dictionary entry: the word to look for plus optional metadata
 * (an id, a list of category ids and a category-to-type map).
 *
 * <p>Equality and hashing are based solely on {@code id}; two keywords
 * whose ids are both {@code null} compare equal regardless of their words.
 */
public class Keyword implements Serializable{

	private static final long serialVersionUID = 1L;

	/** Identifier; the only field consulted by equals/hashCode. */
	private Integer id;
	/** The text of the keyword. */
	private String word;
	/** Category ids attached to this keyword; {@code null} until set. */
	private List<Integer> categories;
	/** Category-to-type mapping; {@code null} until set. */
	private Map<Integer, Integer> categoryTypeMap;

	/** Creates a keyword with every field left as {@code null}. */
	public Keyword(){
		// All fields keep their default value (null).
	}

	/**
	 * Creates a keyword that only carries a word.
	 *
	 * @param key the keyword text
	 */
	public Keyword(String key){
		this.word = key;
	}

	/**
	 * Copy constructor. The collections are not cloned: the new instance
	 * references the same list and map objects as {@code p}.
	 *
	 * @param p the keyword to copy from
	 */
	public Keyword(Keyword p){
		this.id = p.id;
		this.word = p.word;
		this.categories = p.categories;
		this.categoryTypeMap = p.categoryTypeMap;
	}

	/**
	 * Two keywords are equal when they are of the same class and their ids
	 * are equal (or both {@code null}).
	 */
	@Override
	public boolean equals(Object o) {
		if (o == this) {
			return true;
		}
		if (o == null || o.getClass() != getClass()) {
			return false;
		}
		Integer otherId = ((Keyword) o).id;
		return id == null ? otherId == null : id.equals(otherId);
	}

	/** Hash of the id, or 0 when the id is {@code null}. */
	@Override
	public int hashCode() {
		return id == null ? 0 : id.hashCode();
	}

	public Integer getId() {
		return id;
	}

	public void setId(Integer id) {
		this.id = id;
	}

	public String getWord() {
		return word;
	}

	public void setWord(String word) {
		this.word = word;
	}

	public List<Integer> getCategories() {
		return categories;
	}

	public void setCategories(List<Integer> categories) {
		this.categories = categories;
	}

	public Map<Integer, Integer> getCategoryTypeMap() {
		return categoryTypeMap;
	}

	public void setCategoryTypeMap(Map<Integer, Integer> categoryTypeMap) {
		this.categoryTypeMap = categoryTypeMap;
	}

}

Node.java

package com.AC.domain;

import java.util.ArrayList;
import java.util.List;

/**
 * A single state of the keyword trie / automaton.
 *
 * <p>Fields are public and are read and written directly by the class that
 * builds the automaton.
 */
public class Node {
	/** 0 for a node built with the no-arg constructor, 1 for one built with (char, Node). */
	public Integer state;
	public char  character = 0;  // the character that leads into this node
	/** Node to fall back to when no child matches the next input character. */
	public Node failureNode;
	/** Keywords attached to this node. */
	public List <Keyword> keywords;
	/** Child nodes, searched linearly by character. */
	public List <Node> childrenList;

	/** Creates a node with state 0, character 0 and no failure node. */
	public Node(){
		this((char) 0, null);
		state = 0;
	}

	/**
	 * Creates a node with state 1.
	 *
	 * @param c    the character that leads into this node
	 * @param node the initial failure node
	 */
	public Node (char c,Node node) {
		this.state = 1;
		this.character = c;
		this.failureNode = node;
		this.keywords = new ArrayList<Keyword>();
		this.childrenList = new ArrayList<Node>();
	}

	/**
	 * @param c the character to look for
	 * @return true when some child is labelled with {@code c}
	 */
	public Boolean containsChild (char c){
		return getChild(c) != null;
	}

	/**
	 * @param c the character to look for
	 * @return the first child labelled with {@code c}, or {@code null} if there is none
	 */
	public Node getChild (char c){
		Node found = null;
		for (Node candidate : childrenList) {
			if (candidate.character == c) {
				found = candidate;
				break;
			}
		}
		return found;
	}

	/** Appends one keyword to this node's keyword list. */
	public void addKeyword(Keyword keyword){
		keywords.add(keyword);
	}

	/** Appends every keyword of {@code k} to this node's keyword list. */
	public void addKeywords(List<Keyword> k){
		keywords.addAll(k);
	}

	/** Appends {@code child} to the children list; no duplicate check is made. */
	public void addChild(Node child){
		childrenList.add(child);
	}

}

Patterns.java
package com.AC.domain;

import java.util.*;
import java.io.*;
import java.math.*;

/**
 * Multi-keyword matcher: a trie of keywords with failure links, scanned
 * over the input text one character at a time.
 *
 * <p>The automaton is built once in the constructor and is not modified by
 * {@link #searchKeyword(String, Integer)}.
 */
public class Patterns {
	/** Trie root: state 0, and its failure link points to itself. */
	private final Node root = new Node();

	/** Every node created for this automaton, root first, in creation order. */
	private List<Node> tree;

	/**
	 * Builds the trie from the keywords and then computes the failure links.
	 *
	 * @param keywords dictionary entries; a {@code null} list is treated as
	 *                 empty, and entries that are {@code null} or have a
	 *                 {@code null} word are skipped
	 */
	public Patterns(List<Keyword> keywords){
		tree = new ArrayList<Node> ();
		root.failureNode=root;
		tree.add(root);
		if (keywords != null) {
			for(Keyword keyword : keywords){
				// A keyword without text cannot be inserted into the trie.
				if (keyword != null && keyword.getWord() != null) {
					addKeyword(keyword);
				}
			}
		}
		setFailNode();
	}

	/**
	 * Computes the failure link of every node below depth 1 with a
	 * breadth-first traversal, and merges into each node the keywords of the
	 * node its failure link points to.
	 */
	private  void setFailNode() {
		// Depth-1 nodes keep the failure link they were created with (root).
		Queue<Node> queue = new ArrayDeque<Node>(root.childrenList);
		while (!queue.isEmpty()){
			Node parent = queue.poll();
			for (Node child : parent.childrenList) {
				queue.offer(child);
				// Walk the parent's failure chain until a node has a child for
				// the same character, or the root (state 0) is reached.
				Node candidate = parent.failureNode;
				while (candidate.getChild(child.character) == null && candidate.state != 0) {
					candidate = candidate.failureNode;
				}
				Node target = candidate.getChild(child.character);
				if (target != null) {
					child.failureNode = target;
					// target was dequeued earlier (it is shallower), so its
					// keyword list is already complete.
					child.addKeywords(target.keywords);
				}
				// Otherwise the child keeps its initial failure link (root).
			}
		}
	}

	/**
	 * Inserts one keyword into the trie, creating nodes as needed, and
	 * attaches the keyword to the node reached by its last character.
	 *
	 * @param keyword the keyword to insert; its word must not be {@code null}
	 */
	private  void addKeyword(Keyword keyword) {
		Node current = root;
		for(char currentChar : keyword.getWord().toCharArray()){
			Node next = current.getChild(currentChar);
			if (next == null) {
				// New nodes start with the root as failure link.
				next = new Node (currentChar,root);
				current.addChild(next);
				tree.add(next);
			}
			current = next;
		}
		current.addKeyword(keyword);
	}

	/**
	 * Scans {@code data} and collects every keyword occurrence, in the order
	 * the occurrences end in the text.
	 *
	 * @param data     the text to scan; {@code null} yields an empty result
	 * @param category when {@code null}, every match is reported as a new
	 *                 {@link Keyword} carrying only the matched word; otherwise
	 *                 only keywords whose category list contains this value are
	 *                 reported, and the stored keyword instances themselves are
	 *                 returned
	 * @return the matches, possibly empty, never {@code null}
	 */
	public List<Keyword> searchKeyword(String data,Integer category) {
		List<Keyword> matchResult = new ArrayList<Keyword>();
		if (data == null) {
			return matchResult;
		}
		Node node = root;
		for (int i = 0; i < data.length(); i++){
			char ch = data.charAt(i);
			// Follow failure links until some node accepts ch or we are at the root.
			while (node.getChild(ch) == null && node.state != 0) {
				node = node.failureNode;
			}
			Node next = node.getChild(ch);
			if (next == null) {
				// Only possible at the root: ch starts no keyword.
				continue;
			}
			node = next;
			if (node.keywords == null) {
				continue;
			}
			for (Keyword pattern : node.keywords){
				if (category == null){
					matchResult.add(new Keyword(pattern.getWord()));
				} else {
					// A keyword without categories belongs to no category;
					// previously this case threw a NullPointerException.
					List<Integer> patternCategories = pattern.getCategories();
					if (patternCategories != null && patternCategories.contains(category)) {
						matchResult.add(pattern);
					}
				}
			}
		}
		return matchResult;
	}

}


Test.java
package com.AC.domain;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Small demo: builds an automaton from a fixed dictionary, scans one sample
 * string and prints every keyword found.
 */
public class Test {
	public static void main(String []args){
		// Dictionary used by the demo.
		String[] words = {"abcd", "abc", "abe", "ae", "bc", "be", "bce", "cm"};

		List<Keyword> keywords = new ArrayList<Keyword>();
		for (String word : words) {
			keywords.add(new Keyword(word));
		}

		// A null category means no category filtering is applied.
		List<Keyword> result = new Patterns(keywords).searchKeyword("kcabcmgha", null);

		System.out.println("keys: ");
		for (int i = 0; i < result.size(); i++) {
			System.out.println(result.get(i).getWord());
		}
	}

}

附美团文章链接:http://tech.meituan.com/ac.html


利用AC自动机进行关键字的提取和过滤