首页 > 代码库 > 利用AC自动机进行关键字的提取和过滤
利用AC自动机进行关键字的提取和过滤
昨天看了meituan.com的AC算法在美团上单系统的应用一文,深受启发,原来ACM算法在工程中也能有这样赤裸裸的运用~~~ 于是便复习了AC自动机,并把代码用java重新搞了一遍~~
AC自动机整体的结构大概是长这样的,其实就是在trie树上做KMP:
AC自动机里面比较难理解的应该是它的失配指针的计算过程。
这个计算过程从本质上讲就是进行一遍广搜,与此同时维护
fail指针,每一步的维护过程可用下图表示。
Keyword.java
package com.AC.domain;

import java.io.*;
import java.util.*;
import java.math.*;

/**
 * A keyword (search pattern) together with optional metadata.
 *
 * <p>Every field is nullable; all constructors leave the metadata fields
 * ({@code id}, {@code categories}, {@code categoryTypeMap}) as {@code null}
 * until a setter is called.
 *
 * <p>Equality: when at least one of the two keywords has an id, they are equal
 * only if both ids are equal. When neither has an id, they are compared by
 * their word, so distinct id-less keywords no longer collapse into a single
 * entry in hash-based collections.
 */
public class Keyword implements Serializable {

    private static final long serialVersionUID = 1L;

    private Integer id;
    // NOTE(review): the meaning of the keys/values is not visible here — presumably category id -> type.
    private Map<Integer, Integer> categoryTypeMap;
    private String word;
    private List<Integer> categories;

    /** Creates a keyword with every field set to {@code null}. */
    public Keyword() {
        id = null;
        categories = null;
        categoryTypeMap = null;
        word = null;
    }

    /**
     * Creates a keyword that carries only a word.
     *
     * @param key the word of the keyword; may be {@code null}
     */
    public Keyword(String key) {
        id = null;
        categories = null;
        categoryTypeMap = null;
        word = key;
    }

    /**
     * Copy constructor. The copy is shallow: the category list and the
     * category/type map are shared with {@code p}, not duplicated.
     *
     * @param p the keyword to copy; must not be {@code null}
     */
    public Keyword(Keyword p) {
        this.categories = p.categories;
        this.categoryTypeMap = p.categoryTypeMap;
        this.id = p.id;
        this.word = p.word;
    }

    /**
     * Compares by id when either side has one, otherwise by word.
     *
     * @param o the object to compare with
     * @return {@code true} if {@code o} is a {@code Keyword} of the same class
     *         with an equal id, or — when both ids are {@code null} — an equal word
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        Keyword other = (Keyword) o;
        if (id != null || other.id != null) {
            // At least one side is identified: identity is decided by the id alone.
            return id != null && id.equals(other.id);
        }
        // Neither side has an id: fall back to the word so that different
        // id-less keywords are distinguishable.
        return word == null ? other.word == null : word.equals(other.word);
    }

    /**
     * Hash code consistent with {@link #equals(Object)}: derived from the id
     * when present, otherwise from the word, otherwise {@code 0}.
     */
    @Override
    public int hashCode() {
        if (id != null) {
            return id.hashCode();
        }
        return word != null ? word.hashCode() : 0;
    }

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public Map<Integer, Integer> getCategoryTypeMap() {
        return categoryTypeMap;
    }

    public void setCategoryTypeMap(Map<Integer, Integer> categoryTypeMap) {
        this.categoryTypeMap = categoryTypeMap;
    }

    public String getWord() {
        return word;
    }

    public void setWord(String word) {
        this.word = word;
    }

    public List<Integer> getCategories() {
        return categories;
    }

    public void setCategories(List<Integer> categories) {
        this.categories = categories;
    }
}
Node.java
package com.AC.domain;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * One node of the keyword trie used by the automaton. Each node stores the
 * character that leads to it, its children, a failure link and the keywords
 * attached to it.
 */
public class Node {

    /** {@code 0} for a node built with the no-arg constructor, {@code 1} for a node built with a character. */
    public Integer state;

    /** The character that points to this node. */
    public char character = 0;

    /** Failure link; {@code null} until assigned. */
    public Node failureNode;

    /** Keywords attached to this node. */
    public List<Keyword> keywords;

    /** Direct children of this node, in insertion order. */
    public List<Node> childrenList;

    /** Builds a node with state {@code 0}, no character and no failure link. */
    public Node() {
        this.state = 0;
        this.character = 0;
        this.failureNode = null;
        this.keywords = new ArrayList<Keyword>();
        this.childrenList = new ArrayList<Node>();
    }

    /**
     * Builds a node with state {@code 1}.
     *
     * @param c    the character that leads to this node
     * @param node the initial failure link
     */
    public Node(char c, Node node) {
        this.state = 1;
        this.character = c;
        this.failureNode = node;
        this.keywords = new ArrayList<Keyword>();
        this.childrenList = new ArrayList<Node>();
    }

    /**
     * @param c the character to look for
     * @return {@code true} if a direct child is reached through {@code c}
     */
    public Boolean containsChild(char c) {
        return getChild(c) != null;
    }

    /**
     * @param c the character to look for
     * @return the first direct child reached through {@code c}, or {@code null} if there is none
     */
    public Node getChild(char c) {
        Iterator<Node> it = childrenList.iterator();
        while (it.hasNext()) {
            Node candidate = it.next();
            if (candidate.character == c) {
                return candidate;
            }
        }
        return null;
    }

    /** Attaches a single keyword to this node. */
    public void addKeyword(Keyword keyword) {
        keywords.add(keyword);
    }

    /** Attaches every keyword of {@code k} to this node. */
    public void addKeywords(List<Keyword> k) {
        keywords.addAll(k);
    }

    /** Appends {@code child} to this node's children. */
    public void addChild(Node child) {
        childrenList.add(child);
    }
}
Patterns.java
package com.AC.domain;

import java.util.*;
import java.io.*;
import java.math.*;

/**
 * Aho-Corasick automaton over a fixed set of keywords: a trie whose nodes
 * carry failure links, so that a text can be scanned once to find every
 * keyword occurring in it (including overlapping occurrences).
 */
public class Patterns {

    private final Node root = new Node();

    /**
     * Builds the trie from {@code keywords} and then computes the failure links.
     *
     * @param keywords the keywords to search for; must not be {@code null},
     *                 and neither may any element or its word
     * @throws NullPointerException if {@code keywords}, one of its elements,
     *                              or the word of an element is {@code null}
     */
    public Patterns(List<Keyword> keywords) {
        if (keywords == null) {
            throw new NullPointerException("keywords must not be null");
        }
        // The root falls back to itself so that following failure links never yields null.
        root.failureNode = root;
        for (Keyword keyword : keywords) {
            addKeyword(keyword);
        }
        setFailNode();
    }

    /**
     * Computes the failure link of every node with a breadth-first traversal
     * and merges the keywords of the failure target into each node, so that a
     * node lists every keyword that ends at it.
     */
    private void setFailNode() {
        Queue<Node> queue = new ArrayDeque<Node>();
        // Depth-1 nodes keep the failure link they were created with (the root).
        for (Node firstLevel : root.childrenList) {
            queue.offer(firstLevel);
        }
        while (!queue.isEmpty()) {
            Node parent = queue.poll();
            for (Node child : parent.childrenList) {
                queue.offer(child);
                char c = child.character;
                // Walk the parent's failure chain until a node has an edge for c
                // or the root has been tried.
                Node fallback = parent.failureNode;
                Node target = fallback.getChild(c);
                while (target == null && fallback != root) {
                    fallback = fallback.failureNode;
                    target = fallback.getChild(c);
                }
                if (target != null) {
                    child.failureNode = target;
                    // Breadth-first order guarantees the (shallower) target is already complete.
                    child.addKeywords(target.keywords);
                }
                // Otherwise the child keeps the root as failure link.
            }
        }
    }

    /**
     * Inserts one keyword into the trie, creating missing nodes along the way,
     * and attaches the keyword to the node where its word ends.
     */
    private void addKeyword(Keyword keyword) {
        if (keyword == null || keyword.getWord() == null) {
            throw new NullPointerException("keyword and its word must not be null");
        }
        Node current = root;
        for (char currentChar : keyword.getWord().toCharArray()) {
            Node next = current.getChild(currentChar);
            if (next == null) {
                // New nodes start with the root as failure link; setFailNode() refines it.
                next = new Node(currentChar, root);
                current.addChild(next);
            }
            current = next;
        }
        current.addKeyword(keyword);
    }

    /**
     * Scans {@code data} and collects every keyword occurrence, in order of
     * the position at which the occurrence ends.
     *
     * @param data     the text to scan; {@code null} yields an empty result
     * @param category when {@code null}, every match is reported as a fresh
     *                 {@link Keyword} carrying only the word; otherwise only the
     *                 registered keywords whose category list contains
     *                 {@code category} are reported (keywords without a
     *                 category list never match a category filter)
     * @return the matches, possibly empty, never {@code null}
     */
    public List<Keyword> searchKeyword(String data, Integer category) {
        List<Keyword> matchResult = new ArrayList<Keyword>();
        if (data == null) {
            return matchResult;
        }
        Node node = root;
        for (char ch : data.toCharArray()) {
            // Follow failure links until some node accepts ch or the root has been tried.
            Node next = node.getChild(ch);
            while (next == null && node != root) {
                node = node.failureNode;
                next = node.getChild(ch);
            }
            if (next == null) {
                // No keyword prefix continues with ch; stay at the root.
                continue;
            }
            node = next;
            for (Keyword pattern : node.keywords) {
                if (category == null) {
                    matchResult.add(new Keyword(pattern.getWord()));
                } else {
                    List<Integer> categories = pattern.getCategories();
                    // categories is null unless it was set explicitly on the keyword.
                    if (categories != null && categories.contains(category)) {
                        matchResult.add(pattern);
                    }
                }
            }
        }
        return matchResult;
    }
}
package com.AC.domain;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Small demo: builds the automaton from a fixed keyword list, scans a sample
 * text and prints the word of every match.
 */
public class Test {

    public static void main(String[] args) {
        // abcd abc abe ae bc be bce cm kcabcmgh
        String[] words = {"abcd", "abc", "abe", "ae", "bc", "be", "bce", "cm"};

        List<Keyword> keywords = new ArrayList<Keyword>();
        for (int i = 0; i < words.length; i++) {
            Keyword keyword = new Keyword();
            keyword.setWord(words[i]);
            keywords.add(keyword);
        }

        Patterns patterns = new Patterns(keywords);
        // A null category means "no filtering".
        List<Keyword> result = patterns.searchKeyword("kcabcmgha", null);

        System.out.println("keys: ");
        for (Keyword key : result) {
            System.out.println(key.getWord());
        }
    }
}
利用AC自动机进行关键字的提取和过滤
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。