首页 > 代码库 > prefixTreeEspan 频繁子树模式挖掘 A pattern growth 算法实现 mining embedded subtrees.

prefixTreeEspan 频繁子树模式挖掘 A pattern growth 算法实现 mining embedded subtrees.

直接说这算法的作用吧,

上图的右边就是左边图的 ES(Embedded Subtree),相对应的ABC-1D-1-1E-1-1就是上图左边的Pre-Order-String,也是这个算法输入的数据格式,中间我们用空格隔开(A B C -1 D -1 -1 E -1 -1),-1就是表示回走。

这算法主要是用来挖掘频繁子树,也就是将一个树里的所有频繁子树找出来。原理就不说了,大家可以自己去搜下这篇论文,这是邹老师的一篇论文。这里我主要采用java来实现,所先针对GrowthElement、PrefixTree、PrefixTreeNode、ProjectedDB建立实体类,然后再递归实现搜索ES。


package iie.ucas.treeMiner.bean;

public class GrowthElement {
	private String label = "";
	private int attached = 0;// attached means that the GEs is attached to the
	private int times = 0;// 出现的次数,跟min_sup比较

	// n-th node of the prefix-tree.
	public String getLabel() {
		return label;
	}

	public int getTimes() {
		return times;
	}

	public void setTimes(int times) {
		this.times = times;
	}

	public void setLabel(String label) {
		this.label = label;
	}

	public int getAttached() {
		return attached;
	}

	public void setAttached(int attached) {
		this.attached = attached;
	}
	public String toString(){
		return "(" + label + ","+ getAttached() + "):" + getTimes();
	}
}<pre name="code" class="java">package iie.ucas.treeMiner.bean;

public class PrefixTree {
	private ProjectedDB prodb;
	private PrefixTreeNode[] ptnodes;
	private int length = 0;
	private int times = 1;

	public PrefixTree(int n) {
		ptnodes = new PrefixTreeNode[n];// 频繁模式的prefix的label个数<A C -1 -1>,则n为2
		for (int i = 0; i < n; i++) {
			ptnodes[i] = new PrefixTreeNode();
		}
		length = n;
	}

	public PrefixTreeNode getPtnode(int index) {
		if (index < length)
			return ptnodes[index];
		else
			return null;
	}

	public int getTimes() {
		return times;
	}

	public void setTimes(int times) {
		this.times = times;
	}

	public void setPtnode(int index, int numMinus, String label, int pos) {
		if (index < length) {
			ptnodes[index].setNumMinus(numMinus);
			ptnodes[index].setLabel(label);
			ptnodes[index].setPos(pos);
		}
	}

	public String toString() {
		String res = "";
		for (int i = 0; i < length; i++) {
			if (i == 0)
				res += ptnodes[i].getLabel();
			else {
				res += " " + ptnodes[i].getLabel();
			}
		}
		return res;
	}

	public String toLastLabelString() {
		String res = "";
		if(length == 1)
			return ptnodes[0].getLabel();
		for (int i = 0; i < length; i++) {
			if (i == 0) {
				res += ptnodes[i].getLabel();
				for (int j = 0; j < ptnodes[i].getNumMinus(); j++) {
					res += " " + "-1";
				}
			} else if (i == length - 1) {
				res += " " + ptnodes[i].getLabel();
			} else {
				res += " " + ptnodes[i].getLabel();
				for (int j = 0; j < ptnodes[i].getNumMinus(); j++) {
					res += " " + "-1";
				}
			}
		}
		return res;
	}

	public String toString(boolean isComplete) {
		String res = "";
		for (int i = 0; i < length; i++) {
			if (i == 0) {
				res += ptnodes[i].getLabel();
				for (int j = 0; j < ptnodes[i].getNumMinus(); j++) {
					res += " " + "-1";
				}
			} else {
				res += " " + ptnodes[i].getLabel();
				for (int j = 0; j < ptnodes[i].getNumMinus(); j++) {
					res += " " + "-1";
				}
			}
		}
		return res;
	}

	public int getLength() {
		return length;
	}

	public void setLength(int length) {
		this.length = length;
	}

	public ProjectedDB getProdb() {
		return prodb;
	}

	public void setProdb(ProjectedDB prodb) {
		this.prodb = prodb;
	}

	public PrefixTreeNode[] getPtnode() {
		return ptnodes;
	}

	public void setPtnode(PrefixTreeNode[] ptnodes) {
		this.ptnodes = ptnodes;
	}
}

package iie.ucas.treeMiner.bean;

public class PrefixTreeNode {
	private String label = "";
	private int pos = 0;
	private int  numMinus = 0;// 用来表示在下一个label前着-1的个数
	

	public int getNumMinus() {
		return numMinus;
	}

	public void setNumMinus(int numMinus) {
		this.numMinus = numMinus;
	}

	public String getLabel() {
		return label;
	}

	public void setLabel(String label) {
		this.label = label;
	}

	public int getPos() {
		return pos;
	}

	public void setPos(int pos) {
		this.pos = pos;
	}
}

package iie.ucas.treeMiner.bean;

import java.util.ArrayList;

public class ProjectedDB {
	private ArrayList<String> pre_order_list;// 用来存放ProDB,eg.{A B C -1 -1- 1;A B
												// B -1 D -1 -1 -1}

	public ProjectedDB() {
		pre_order_list = new ArrayList<String>();
	}

	public ArrayList<String> getPre_order_list() {
		return pre_order_list;
	}

	public void setPre_order_list(ArrayList<String> preOrderList) {
		pre_order_list = preOrderList;
	}
}

package iie.ucas.treeMiner.action;

import iie.ucas.treeMiner.bean.GrowthElement;
import iie.ucas.treeMiner.bean.PrefixTree;
import iie.ucas.treeMiner.bean.ProjectedDB;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map.Entry;

public class PrefixTreeESpanAction {



	/**
	 * 按行读取数据,并存放到ArrayList中
	 * 
	 * @param path
	 * @return
	 * @throws IOException
	 * @throws IOException
	 */

	public ArrayList<String> file2arrlist(String path) throws IOException,
			IOException {
		ArrayList<String> arrlist = new ArrayList<String>();
		File datafile = new File(path);
		String line = "";
		BufferedReader input = new BufferedReader(new InputStreamReader(
				new FileInputStream(datafile), "utf-8"));
		while ((line = input.readLine()) != null) {
			arrlist.add(line);
		}
		input.close();
		return arrlist;
	}

	/**
	 * 统计arrlist中每个label出现的次数
	 * 
	 * @param arrlist
	 * @return
	 */
	public HashMap<String, Integer> getAllLabel(ArrayList<String> arrlist) {
		HashMap<String, Integer> hashmap = new HashMap<String, Integer>();
		for (int i = 0; i < arrlist.size(); i++) {
			String[] labels = arrlist.get(i).split(" ");// 返回第i行的所有label
			for (int j = 0; j < labels.length; j++) {
				if (!labels[j].equals("-1")) {// 筛选所有label
					if (hashmap.containsKey(labels[j])) {// label已经存在,直接在相应的value上加1
						hashmap.put(labels[j], hashmap.get(labels[j]) + 1);
					} else {
						hashmap.put(labels[j], 1);
					}
				}
			}
		}
		return hashmap;
	}

	/**
	 * 根据prefixtree构造相应的prodb
	 * 
	 * @param prodb
	 * @param pretree
	 * @return
	 */
	public ProjectedDB getProjectedDB(ProjectedDB prodb, PrefixTree pretree) {// suppose
		// pretree:<A(1)
		// -1>
		ProjectedDB resprodb = new ProjectedDB();
		ArrayList<String> pretlist = new ArrayList<String>();
		String label = pretree.getPtnode()[pretree.getLength() - 1].getLabel();
		
		
		System.out.println("<" + pretree.toString(true) + ">-projectedDB");
		ArrayList<String> prodblist = prodb.getPre_order_list();
		for (int i = 0; i < prodblist.size(); i++) {
			String[] labels = prodblist.get(i).split(" ");
			int length = labels.length;
			for (int k = pretree.getLength()-1; k < length; k++) {
				String prodbElement = label;//pretree.toLastLabelString();///??????????????????????????????????????
				//System.out.println("lastlabelString :" + prodbElement);
				// String prodbElement = label;// 表示一行such as.{A C E -1 -1 -1}
				// System.out.println("prodbElement:" + prodbElement);
				if (label.equals(labels[k])) {// find all occurrence of label
					// ,and
					// construct prodb
					int contLabel = 1;//pretree.getLength();//????????
					int minus = 0;// 用来标记后面出现-1的个数
					for (int j = k+1; j < length; j++) {//从根label一样的那个位置的下一个位置开始
						if (!labels[j].equals("-1")) {// not -1
							contLabel++;
							prodbElement += " " + labels[j];
						} else {
							minus++;
							prodbElement += " " + labels[j];
							if (minus >= contLabel) {//
								//k = j + 1;
								break;
							}
							
						}
					}// break
					
					System.out.println("prodbElement:" + prodbElement);
					pretlist.add(prodbElement);// prodb的一项内容,也就是一行
				}// end{if}
			}
		}
		resprodb.setPre_order_list(pretlist);
		return resprodb;
	}

	/**
	 * 返回pretree对应的 GrowthElements
	 * 
	 * @param prodb
	 * @param min_sup
	 * @param pretree
	 * @return
	 */
	public HashSet<GrowthElement> getGEs(ProjectedDB prodb, PrefixTree pretree) {
		// ProjectedDB pdb = this.getProjectedDB(prodb, pretree);
		ArrayList<String> arrprodb = prodb.getPre_order_list();// pdb.getPre_order_list();
		HashSet<GrowthElement> geset = new HashSet<GrowthElement>();
		int prelength = pretree.getLength();

		for (int i = 0; i < arrprodb.size(); i++) {
			String labels[] = arrprodb.get(i).split(" ");
			for (int j = prelength; j < labels.length; j++) {
				// if (prelength == 1) {
				//					
				// } else {// pretree.getLength()!=1
				int contLabel = 0;
				int minus = 0;
				if (!labels[j].equals("-1")) {
					if (contLabel >= minus) {// GE attached to
						// pretree.getLength()
						GrowthElement tmpge = null;
						boolean flag = false;// 用来表示新的GE是否之前已经找到
						Iterator<GrowthElement> it = geset.iterator();
						while (it.hasNext()) {
							tmpge = it.next();
							if (tmpge.getLabel().equals(labels[j])
									&& tmpge.getAttached() == prelength) {
								flag = true;
								break;
							}
						}// break
						if (flag) {// 该GE之前就找到过了
							tmpge.setTimes(tmpge.getTimes() + 1);
							geset.add(tmpge);
						} else {
							GrowthElement ge = new GrowthElement();
							ge.setAttached(prelength);
							ge.setLabel(labels[j]);
							ge.setTimes(1);
							geset.add(ge);
						}
					} else {// //////////////////////////////////////////////////////
						GrowthElement tmpge = null;
						boolean flag = false;// 用来表示新的GE是否之前已经找到
						Iterator<GrowthElement> it = geset.iterator();
						while (it.hasNext()) {
							tmpge = it.next();
							if (tmpge.getLabel().equals(labels[j])
									&& tmpge.getAttached() == prelength) {
								flag = true;
								break;
							}
						}// break
						if (flag) {// 该GE之前就找到过了
							tmpge.setTimes(tmpge.getTimes() + 1);
							geset.add(tmpge);
						} else {
							GrowthElement ge = new GrowthElement();
							ge.setAttached(prelength + (contLabel - minus));
							ge.setLabel(labels[j]);
							ge.setTimes(1);
							geset.add(ge);
						}
					}
					contLabel++;
				} else {// labels[j].equals("-1")
					minus++;
				}
			}
		}
		return geset;
	}

	public PrefixTree extendSByGE(GrowthElement ge, PrefixTree pretree,
			int min_sup) {
		PrefixTree respret = null;
		int length = pretree.getLength();
		if (ge.getTimes() >= min_sup) {// 判断是否符合大于min_sup
			if (1 == length) {
				respret = new PrefixTree(length + 1);
				respret.setPtnode(0, 0, pretree.getPtnode(0).getLabel(), 1);
				respret.setPtnode(1, 2, ge.getLabel(), 2);
			} else {
				respret = new PrefixTree(length + 1);
				int attached = ge.getAttached();
				if (attached == length) {
					for (int i = 0; i < length; i++) {// 将之前的信息复制过来eg.{A B -1
						// -1}GE是(C,2)则{A B C -1
						// -1 -1}
						respret.setPtnode(i, 0,
								pretree.getPtnode(i).getLabel(), i + 1);
					}
					respret.setPtnode(length, length + 1, ge.getLabel(),
							length + 1);// 设置新加入的GE的信息
				} else {// ***************************************************************************88//
					for (int i = 0; i < length - 1; i++) {// 将之前的信息复制过来eg.{A B
						// -1 -1}GE是(C,2)则{A
						// B C -1 -1 -1}
						respret.setPtnode(i,
								pretree.getPtnode(i).getNumMinus(), pretree
										.getPtnode(i).getLabel(), pretree
										.getPtnode(i).getPos());
					}
					// int index,int numMinus, String label,int pos
					respret.setPtnode(length - 1, (length - attached), pretree
							.getPtnode(length - 1).getLabel(), pretree
							.getPtnode(length - 1).getPos());
					respret.setPtnode(length, attached + 1, ge.getLabel(),
							length + 1);// 设置新加入的GE的信息
				}
			}
		} else {// end{if}判断是否符合大于min_sup
			return null;
		}
		System.err.println("添加GE后的prefix:" + respret.toString(true));
		return respret;
	}

	public void Fre(PrefixTree pretree, ProjectedDB prodb, int min_sup) {

		HashSet<GrowthElement> geSet = this.getGEs(prodb, pretree);
		Iterator<GrowthElement> it = geSet.iterator();
		while (it.hasNext()) {
			GrowthElement ge = it.next();
			System.out.println("ge: " + ge.toString());
			Fre(this.extendSByGE(ge, pretree, min_sup), this.getProjectedDB(
					prodb, this.extendSByGE(ge, pretree, min_sup)), min_sup);

		}
		return ;
	}

	public static void main(String[] args) throws IOException {
		PrefixTreeESpanAction ptes = new PrefixTreeESpanAction();
		ArrayList<String> arrlist = ptes
				.file2arrlist("C:\\Users\\Fernando\\Desktop\\treedata\\CSlog.data");// 原始数据
		HashMap<String, Integer> all_label = ptes.getAllLabel(arrlist);
		System.out.println("所有label数量: " + all_label.size());
		ProjectedDB basedb = new ProjectedDB();
		basedb.setPre_order_list(arrlist);// 最先的原始proDB

		PrefixTree pretrees[] = new PrefixTree[all_label.size()];// {a -1},{b
		// -1}....
		for (int i = 0; i < pretrees.length; i++) {
			pretrees[i] = new PrefixTree(1);// 生成长度为1的prefixtree
		}
		Iterator iter = all_label.entrySet().iterator();
		int index = 0;
		while (iter.hasNext()) {
			Entry entry = (Entry) iter.next();
			pretrees[index].setLength(1);
			pretrees[index].getPtnode(0).setLabel((String) entry.getKey());
			pretrees[index].getPtnode(0).setPos(1);
			pretrees[index].getPtnode(0).setNumMinus(1);
			index++;
		}

		int min_sup = 1;
		for (int i = 0; i < pretrees.length; i++) {
			ProjectedDB ptdb = ptes.getProjectedDB(basedb, pretrees[i]);
			ptes.Fre(pretrees[i],ptdb, min_sup);
//			 HashSet<GrowthElement> geSet = ptes.getGEs(ptdb, pretrees[i]);
//			 Iterator<GrowthElement> it = geSet.iterator();
//			 while (it.hasNext()) {
//			 GrowthElement ge = it.next();
//			 System.out.println("ge: " + ge.toString());
//			 PrefixTree induce_pre = ptes.extendSByGE(ge, pretrees[i],
//			 min_sup);
//			 ProjectedDB induceDB = ptes.getProjectedDB(ptdb, induce_pre) ;
//			 HashSet<GrowthElement> geSet1 = ptes.getGEs(induceDB,
//			 induce_pre);
//			 Iterator<GrowthElement> it1 = geSet1.iterator();
//			 while (it1.hasNext()) {
//			 GrowthElement ge1 = it1.next();
//		//	 System.out.println("ge1: " + ge.toString());
//			 PrefixTree induce_pre1 = ptes.extendSByGE(ge1, induce_pre,
//			 min_sup);
//			 }
//			 System.out.println("======================");
//			 }
		
		}
		// ProjectedDB ptdb = new ProjectedDB();
		// for (int i = 0; i < pretrees.length; i++) {
		// //{A -1}-proDB
		// ptdb = ptes.getProjectedDB(basedb, pretrees[i]);// /记录下来下次用得到
		// HashSet<GrowthElement> geSet = ptes.getGEs(ptdb, pretrees[i]);
		// Iterator<GrowthElement> it = geSet.iterator();
		// while (it.hasNext()) {
		// GrowthElement ge = it.next();
		// System.out.println("ge: " + ge.toString());
		// PrefixTree induce_pre = ptes.extendSByGE(ge, pretrees[i], min_sup);
		// ProjectedDB induceDB = ptes.getProjectedDB(ptdb, induce_pre) ;
		// HashSet<GrowthElement> inducegeSet = ptes.getGEs(induceDB,
		// pretrees[i]);
		// }
		//			
		// }
		// ===================================================================================================
		// PrefixTree pretree = new PrefixTree(1);
		// pretree.getPtnode()[0].setLabel("72");
		// pretree.getPtnode()[0].setPos(1);
		// // pretree.getPtnode()[0].setContainMinus(true);//<72(1)
		// // -1>,也就是第一个位置的标签是72
		// // ***********//
		// // ProjectedDB prodb1 = ptes.getProjectedDB(prodb, pretree);
		// // ***********//
		// HashSet<GrowthElement> hashset = ptes.getGEs(prodb, pretree);
		// Iterator<GrowthElement> it = hashset.iterator();
		// System.out.println("GEs:");
		// while (it.hasNext()) {
		// GrowthElement ge = it.next();
		// System.out.println("ge: " + ge.toString());
		// ptes.extendSByGE(ge, pretree, 2);
		// }

		// ArrayList<String> treedata = http://www.mamicode.com/new ArrayList();>
 



prefixTreeEspan 频繁子树模式挖掘 A pattern growth 算法实现 mining embedded subtrees.