首页 > 代码库 > FP_Growth算法原理及实现

FP_Growth算法原理及实现

***********************************************声明******************************************************

      原创作品,出自 “晓风残月xj” 博客,欢迎转载,转载时请务必注明出处(http://blog.csdn.net/xiaofengcanyuexj)。

      由于各种原因,可能存在诸多不足,欢迎斧正!

*********************************************************************************************************

         前面提到关联规则寻找频繁项集的Apriori算法,Apriori算法是挖掘布尔型关联规则频繁项集的最为经典、最为基本的算法,但是该算法需要不断寻找候选集,然后剪枝即去掉包含非频繁子集的候选集 ,效率不是很高,时间复杂度由暴力枚举所有子集的指数级别O(n^2) 降为多项式级别,多项式具体系数是底层实现情况而定 。Apriori算法的主要瓶颈在于不断寻找候选项集,可不可以找到一种不用频繁寻找候选项集的算法呢?而且当待挖掘的数据很大进而需要存储在数据库中时,Apriori算法还有一个无可回避的问题就是每次都要扫描数据库,涉及大量I/O操作,比较耗时(当然可以不用数据库)。

     FP_Gwoth算法是一种不生成候选集从而寻找频繁项集的算法,主要基于树结构:包含一个一棵FP_Tree和一个项头表,每个项通过一个结点链指向它在树中出现的位置。基本结构如下所示。需要注意的是项头表需要按照支持度递减排序,在FP_Tree(有后缀的也成条件FP_Tree)中高支持度的节点只能是低支持度节点的祖先节点。这样一来可以保证尽可能的共用祖先节点,更重要的是保证正确性。


procedure FP_Growth(FP_Tree, α)
if FP_Tree 只含单个路径P then{                                                    【1】
         for 路径P中结点的每个组合(记作β)                                      【2】
         产生模式βUα,其支持度MinSupport =β 中结点的最小支持度;   

}
else{
       for each αi 在FP_Tree的项头表(按照支持度由低到高顺序进行扫描){ 【3】
              产生一个模式β= αiUβ,其支持度MinSupport=αi.MinSupport;
              构造β的条件模式基,然后构造β的条件FP_Treeβ;                   【4】
              if FP_Tree不为空 then
                     调用 FP_Growth (FP_Treeβ, β);

       }

}          

【1】 FP_Tree 只含单个路径P,即只有一条分支且分支不能分叉,如果分叉可能隐含了分支合并问题,可能导致在为合并之前误删为不满足最小支持度; 

【2】 若分支上有n个属性值,则总共有2^n组合,可以每个属性值取或不取两种情况递归下去;  

【3】当前条件FP_Tree的的项头表,用尾插法建立单链表;

【4】 以当前项头表αi沿着条件FP_Tree的每条分支向上找出所有条件模式,然后建立后缀模式β的条件FP_Treeβ        


 源代码:

     在此声明,以下代码并不是系本人原创,如需使用,必须声明,谢谢!

//FP_Tree.h

/**
 * Created by xujin on 2014/12/4.
   All Rights Reserved,but you can use this program.
 */
#ifndef FP_TREE_H
#define FP_TREE_H

#include"Transaction.h"
#include"TransactionSet.h"

#include<map>
using namespace std;

const int MAXN_CHILD=20;
typedef string ItemType;

struct ItemSupport
{
	ItemType m_ITItemName;
	int m_nSupportCount;
	ItemSupport(ItemType tItem,int tSup)
	{
		m_ITItemName=tItem;
	    m_nSupportCount=tSup;
	}
};

struct CFP_TreeNode
{
	int m_nSupportCount;
	int m_nChildSize;
	ItemType m_ITItemName;
	CFP_TreeNode *m_pFatherNode;
	CFP_TreeNode *m_pLinkedNode;
	CFP_TreeNode *m_pChildNode[MAXN_CHILD];

	CFP_TreeNode()
	{
		m_ITItemName.clear();
		m_nChildSize=0;
		m_nSupportCount =0;
		m_pFatherNode=NULL;
		m_pLinkedNode=NULL;
		for(int i=0;i<MAXN_CHILD;++i)
			m_pChildNode[i]=NULL;
	}

	
	CFP_TreeNode(int tCount)
	{
		m_ITItemName.clear();
		m_nChildSize=0;
		m_nSupportCount =tCount;
		m_pFatherNode=NULL;
		m_pLinkedNode=NULL;
		for(int i=0;i<MAXN_CHILD;++i)
			m_pChildNode[i]=NULL;
	}

	CFP_TreeNode(ItemType tITtem,CFP_TreeNode *tFa,CFP_TreeNode *tLinked,int tCount)
	{
		m_ITItemName=tITtem;
		m_nChildSize=0;
		m_nSupportCount=tCount;
		m_pFatherNode =tFa;
		m_pLinkedNode=tLinked;
		for(int i=0;i<MAXN_CHILD;++i)
			m_pChildNode[i]=NULL;
	}

};

struct CItemHeaderNode
{
	int m_nSupportCount;
	CFP_TreeNode *m_pFPFirst;
	CItemHeaderNode()
	{
		m_nSupportCount =0 ;
		m_pFPFirst = NULL;
	}
	CItemHeaderNode(int tCount)
	{
		m_nSupportCount =tCount ;
		m_pFPFirst = NULL;
	}
};




class CFP_Tree
{
private:
	double m_dMinConfidence;
	double m_dMinSupport;
	int m_nSize;
	int m_nMinConfidence;
	int m_nMinSupport;

	CFP_TreeNode *m_pCFP_TreeRoot;

private:
	

	void insertFPTree(CFP_TreeNode *tRoot,CTransaction &tTran,int id,int tCount);
	void DFSPrintPath(CFP_TreeNode *tRoot,vector<ItemSupport> &tItemSupportSet);
	void printLinkList(CFP_TreeNode *tRoot);
	void destroy(CFP_TreeNode *tRoot);

public:
	vector<ItemSupport>m_vecItemSupportSet;
	map<ItemType,CItemHeaderNode>m_mapItemHeaderList;

	void sortMapItemHeaderList();
	void addItem(ItemType tItem,int tCount);
	void eraseInfrequent1ItemSet();

	/***********************************************
	*
	*功能:对tTranSet进行计数支持度从大到小排序
	*
	***********************************************/
	void sortTransactionSet(CTransactionSet &tTranSet);
	
	CFP_Tree(CTransactionSet &tTranSet,double tMinCon,double tMinSup,int tCount);
	CFP_Tree(double tMinCon,double tMinSup,int tSize);
	void insertFPTree(CTransaction &tTran,int id,int tCount);
	void printPath();
	void printItemHeaderList();
	bool isSinglePath(CFP_TreeNode *tRoot);
	void destroy();
	friend class CFP_Growth;
};



#endif

//FP_Tree.cpp
/**
 * Created by xujin on 2014/12/4.
   All Rights Reserved,but you can use this program.
 */
#include<algorithm>
#include"FP_Tree.h"

bool cmp(ItemSupport &a,ItemSupport &b)
{
	return a.m_nSupportCount<b.m_nSupportCount;
}

void CFP_Tree::sortMapItemHeaderList()
{
	vector<ItemSupport>tItemSupportSet;

	for(map<ItemType,CItemHeaderNode>::iterator iter=m_mapItemHeaderList.begin();iter!=m_mapItemHeaderList.end();++iter)
	{
		tItemSupportSet.push_back(ItemSupport(iter->first,iter->second.m_nSupportCount));
	}
	
	sort(tItemSupportSet.begin(),tItemSupportSet.end(),cmp);

	m_vecItemSupportSet.clear();
	for(vector<ItemSupport>::iterator iter=tItemSupportSet.begin();iter!=tItemSupportSet.end();++iter)
	{
	//	cout<<"--->("<<iter->m_ITItemName<<","<<iter->m_nSupportCount<<")"<<endl;
		m_vecItemSupportSet.push_back(*iter);
	}
}

	
CFP_Tree::CFP_Tree(CTransactionSet &tTranSet,double tMinCon,double tMinSup,int tCount)
{
	this->m_mapItemHeaderList.clear();
	this->m_vecItemSupportSet.clear();
	this->m_dMinConfidence=tMinCon;
	this->m_dMinSupport =tMinSup;
	this->m_nSize=tTranSet.getSize();
    this->m_nMinConfidence = (this->m_dMinConfidence)*(this->m_nSize);
	this->m_nMinSupport=(this->m_dMinSupport)*(this->m_nSize);

	this->m_pCFP_TreeRoot=new CFP_TreeNode(tCount);
	
	for(vector<CTransaction>::iterator iter=tTranSet.getVeCTransaction().begin();iter!=tTranSet.getVeCTransaction().end();++iter)
	{
		for(vector<string>::iterator strIter=iter->getVecItem().begin();strIter!=iter->getVecItem().end();++strIter)
		{
			this->addItem(*strIter,1);
		}
	}
	this->sortTransactionSet(tTranSet);

	//cout<<"**********CFP_Tree::m_mapItemHeaderList.size()="<<CFP_Tree::m_mapItemHeaderList.size()<<endl;

	this->eraseInfrequent1ItemSet();
	this->sortMapItemHeaderList();

	//cout<<"**********CFP_Tree::m_mapItemHeaderList.size()="<<CFP_Tree::m_mapItemHeaderList.size()<<endl;
}

CFP_Tree::CFP_Tree(double tMinCon,double tMinSup,int tSize)
{
	this->m_mapItemHeaderList.clear();
	this->m_vecItemSupportSet.clear();
	this->m_dMinConfidence=tMinCon;
	this->m_dMinSupport =tMinSup;
	this->m_nSize=tSize;
    this->m_nMinConfidence = (this->m_dMinConfidence)*tSize;
	this->m_nMinSupport=(this->m_dMinSupport)*tSize;

	this->m_pCFP_TreeRoot=new CFP_TreeNode();
}

void CFP_Tree::addItem(ItemType tItem,int tCount)
{
	map<ItemType,CItemHeaderNode>::iterator iter=m_mapItemHeaderList.find(tItem);
	if(iter!=m_mapItemHeaderList.end())
	{
	//	cout<<"&&&&&&&&&&&"<<endl;
		iter->second.m_nSupportCount +=tCount;
	}
	else
	{
		//cout<<"**********"<<endl;
	    CItemHeaderNode p(tCount);
	//	cout<<"**********p.m_nSupportCount"<<p.m_nSupportCount<<endl;
		pair<ItemType,CItemHeaderNode> node=make_pair(tItem,p);
		m_mapItemHeaderList.insert(node);	
	}
}


void CFP_Tree::eraseInfrequent1ItemSet()
{
	for(map<ItemType,CItemHeaderNode>::iterator iter=m_mapItemHeaderList.begin();iter!=m_mapItemHeaderList.end();)
	{
		if(iter->second.m_nSupportCount<this->m_nMinSupport)
		{
			m_mapItemHeaderList.erase(iter);
		}
		else 
		{
			++iter;
		}
	}
}

class CTransactionItemSort
{
public:
  static bool cmp(ItemType a,ItemType b);
  static CFP_Tree *g_pRoot;
//std::sort要求函数对象,或是静态/全局函数指针
//非静态成员函数指针不能直接传递给std::sort
 
};

CFP_Tree *CTransactionItemSort::g_pRoot=NULL;


bool CTransactionItemSort::cmp(ItemType a,ItemType b)
{
	int aCount,bCount;
	map<ItemType,CItemHeaderNode>::iterator iter=CTransactionItemSort::g_pRoot->m_mapItemHeaderList.find(a);
	if(iter!=CTransactionItemSort::g_pRoot->m_mapItemHeaderList.end())
	{
		aCount=iter->second.m_nSupportCount;
	}
	else
	{
		aCount=-1;
	}

	iter=CTransactionItemSort::g_pRoot->m_mapItemHeaderList.find(b);
	if(iter!=CTransactionItemSort::g_pRoot->m_mapItemHeaderList.end())
	{
		bCount=iter->second.m_nSupportCount;
	}
	else
	{
		bCount=-1;
	}
	return aCount>bCount;
}


void CFP_Tree::sortTransactionSet(CTransactionSet &tTranSet)
{
	CTransactionItemSort::g_pRoot=this;
	for(vector<CTransaction>::iterator iter=tTranSet.getVeCTransaction().begin();iter!=tTranSet.getVeCTransaction().end();++iter)
	{
		sort(iter->getVecItem().begin(),iter->getVecItem().end(),CTransactionItemSort::cmp);//cmp调用出错
	}
}



void CFP_Tree::insertFPTree(CFP_TreeNode *tRoot,CTransaction &tTran,int id,int tCount)
{
	if(id>=tTran.getVecItem().size()||this->m_mapItemHeaderList.find(tTran.getVecItem()[id])==this->m_mapItemHeaderList.end())
		return ;

	CFP_TreeNode *pChild;
	//cout<<"**********this->m_nChildSize="<<this->m_nChildSize<<endl;
	for(int i=0;i<tRoot->m_nChildSize;++i)
	{
		 pChild=tRoot->m_pChildNode[i];
		if(pChild!=NULL&&pChild->m_ITItemName==tTran.getVecItem()[id])
		{
		//	cout<<"pChild!=NULL&&pChild->m_ITItemName==tTran.getVecItem()[id]"<<endl;
			pChild->m_nSupportCount+=tCount;
			this->insertFPTree(pChild,tTran,id+1,tCount);
			return ;
		}
	}

	//cout<<"&&&&&&&this->m_nChildSize="<<this->m_nChildSize<<endl;
	ItemType item=tTran.getVecItem()[id];
	pChild=new CFP_TreeNode(item,tRoot, this->m_mapItemHeaderList[item].m_pFPFirst,tCount);
	this->m_mapItemHeaderList[item].m_pFPFirst=pChild;
	tRoot->m_pChildNode[tRoot->m_nChildSize]=pChild;
	++tRoot->m_nChildSize;	
	insertFPTree(pChild,tTran,id+1,tCount);

	//cout<<"!!!!!!!!!!!!this->m_nChildSize="<<this->m_nChildSize<<endl;
}

void CFP_Tree::insertFPTree(CTransaction &tTran,int id,int tCount)
{
	this->insertFPTree(this->m_pCFP_TreeRoot,tTran,id,tCount);
}


void CFP_Tree::DFSPrintPath(CFP_TreeNode *tRoot,vector<ItemSupport> &tItemSupportSet)
{
	if(tRoot->m_nChildSize==0)
	{
		for(vector<ItemSupport>::iterator iter=tItemSupportSet.begin(); iter!=tItemSupportSet.end();++iter)
			cout<<"--->("<<iter->m_ITItemName<<","<<iter->m_nSupportCount<<")";
		cout<<endl;
		return ;
	}
	//cout<<"&&&&&&&&this->m_nChildSize="<<this->m_nChildSize<<endl;

	for(int i=0;i<tRoot->m_nChildSize;++i)
	{
		CFP_TreeNode *pChild= tRoot->m_pChildNode[i];
		tItemSupportSet.push_back(ItemSupport(pChild->m_ITItemName,pChild->m_nSupportCount));
		DFSPrintPath(pChild,tItemSupportSet);
		tItemSupportSet.pop_back();
	}
}

void CFP_Tree::printPath()
{
	vector<ItemSupport>tItemSupportSet;
	cout<<"打印FP_Tree树:"<<endl;
	this->DFSPrintPath(m_pCFP_TreeRoot,tItemSupportSet);
}

void CFP_Tree::printLinkList(CFP_TreeNode *tRoot)
{
	cout<<"--->("<<tRoot->m_ITItemName<<","<<tRoot->m_nSupportCount<<")";
	if(tRoot->m_pLinkedNode!=NULL)
	  printLinkList(tRoot->m_pLinkedNode);
}

void CFP_Tree::printItemHeaderList()
{
	cout<<"打印顶点表中每个单链表:"<<endl;
	for(map<ItemType,CItemHeaderNode>::iterator iter=m_mapItemHeaderList.begin();iter!=m_mapItemHeaderList.end();++iter)
	{
		if(iter->second.m_pFPFirst!=NULL)
		{
			cout<<"("<<iter->first<<","<<iter->second.m_nSupportCount<<") :";
			printLinkList(iter->second.m_pFPFirst);
			cout<<endl;
		}
	}
}



void CFP_Tree::destroy(CFP_TreeNode *tRoot)
{
	CFP_TreeNode *pChild;
	for(int i=0;i<tRoot->m_nChildSize;++i)
	{
	    pChild=tRoot->m_pChildNode[i];
		if(pChild!=NULL)
		{
			destroy(pChild);
		}
	}
	tRoot->m_pFatherNode=NULL;
	tRoot->m_pLinkedNode=NULL;
	for(int i=0;i<tRoot->m_nChildSize;++i)
	{
		if(tRoot->m_pChildNode[i]!=NULL)
		{
		   delete tRoot->m_pChildNode[i];
	    	tRoot->m_pChildNode[i]=NULL;
		}
	}
}

void CFP_Tree::destroy()
{
	this->destroy(this->m_pCFP_TreeRoot);
}

bool CFP_Tree::isSinglePath(CFP_TreeNode *tRoot)
{
	if(0==tRoot->m_nChildSize)
		return true;
	else if(tRoot->m_nChildSize>1)
		return false;
	return isSinglePath(tRoot->m_pChildNode[0]);
}



//FP_Crowth.h
/**
 * Created by xujin on 2014/12/4.
   All Rights Reserved,but you can use this program.
 */
#ifndef FP_GROWTH_H
#define FP_GROWTH_H

#include"Transaction.h"
#include"TransactionSet.h"
#include"FP_Tree.h"


class CFP_Growth
{
private:
	CFP_Tree *m_pCFPTConditionTree;

private:
	void initCFP_Growth(CFP_Tree *tCFPTTree,vector<ItemSupport>& tItemSupportSet);	
	void printOneFreSet(vector<ItemSupport> &tItemSupportSet);
	void findCombine(CFP_TreeNode *tRoot,vector<ItemSupport> &tItemSupportSet);

public:
	
	CFP_Growth(CFP_Tree *tCFPTTree,vector<ItemSupport>& tItemSupportSet);
	void printPath();
	void printItemHeaderList();
};

#endif

//FP_Crowth.cpp
/**
 * Created by xujin on 2014/12/4.
   All Rights Reserved,but you can use this program.
 */
#include"FP_Crowth.h"
#include<algorithm>
using namespace std;

void CFP_Growth::initCFP_Growth(CFP_Tree *tCFPTTree,vector<ItemSupport>& tItemSupportSet)
{
	for(vector<ItemSupport>::iterator iter=tCFPTTree->m_vecItemSupportSet.begin();iter!=tCFPTTree->m_vecItemSupportSet.end();++iter)
	{
		map<ItemType,CItemHeaderNode>::iterator iterMap=tCFPTTree->m_mapItemHeaderList.find(iter->m_ITItemName);

		//创建条件FP_growth树
		CFP_Tree * pCFPTConTree =new CFP_Tree(tCFPTTree->m_dMinConfidence,tCFPTTree->m_dMinSupport,tCFPTTree->m_nSize);
		for(CFP_TreeNode *next=iterMap->second.m_pFPFirst; next!=NULL; next=next->m_pLinkedNode)
		{
			CTransaction tran;
			CFP_TreeNode *fa=next->m_pFatherNode;
			int count=next->m_nSupportCount;
			while(fa!=NULL&&!fa->m_ITItemName.empty())
			{
				tran.addItem(fa->m_ITItemName);
				pCFPTConTree->addItem(fa->m_ITItemName,count);
				fa=fa->m_pFatherNode;
			}
			CTransaction reve;
			for(vector<string>::reverse_iterator iter=tran.getVecItem().rbegin();iter!=tran.getVecItem().rend();++iter)
			{
				reve.addItem(*iter);
			}
			pCFPTConTree->insertFPTree(reve,0,count);
		}
		pCFPTConTree->sortMapItemHeaderList();

		tItemSupportSet.push_back(ItemSupport(iterMap->first,iterMap->second.m_nSupportCount));
		new CFP_Growth(pCFPTConTree,tItemSupportSet);
		tItemSupportSet.pop_back();
	}
}

CFP_Growth::CFP_Growth(CFP_Tree *tCFPTTree,vector<ItemSupport> &tItemSupportSet)
{
	this->m_pCFPTConditionTree=tCFPTTree;

	if(0==tCFPTTree->m_pCFP_TreeRoot->m_nChildSize)
	{
		this->printOneFreSet(tItemSupportSet);
		return ;
	}
	else if(tCFPTTree->isSinglePath(tCFPTTree->m_pCFP_TreeRoot))
	{
		findCombine(tCFPTTree->m_pCFP_TreeRoot->m_pChildNode[0],tItemSupportSet);
		return ;
	}
	else
	{
		initCFP_Growth(tCFPTTree,tItemSupportSet);
	}
}

void CFP_Growth::findCombine(CFP_TreeNode *tRoot,vector<ItemSupport> &tItemSupportSet)
{
	if(tRoot==NULL)
	{
		printOneFreSet(tItemSupportSet);
		return ;
	}
	findCombine(tRoot->m_pChildNode[0],tItemSupportSet);

	tItemSupportSet.push_back(ItemSupport(tRoot->m_ITItemName,tRoot->m_nSupportCount));
	findCombine(tRoot->m_pChildNode[0],tItemSupportSet);
	tItemSupportSet.pop_back();

}

void CFP_Growth::printOneFreSet(vector<ItemSupport> &tItemSupportSet)
{
	if(1==tItemSupportSet.size())
		return ;

	int count=m_pCFPTConditionTree->m_nSize*10;

	for(vector<ItemSupport>::reverse_iterator iter=tItemSupportSet.rbegin();iter!=tItemSupportSet.rend();++iter)
	{
		if(count>iter->m_nSupportCount)
			count=iter->m_nSupportCount;
	}
	if(count<m_pCFPTConditionTree->m_nMinSupport)
	   return ;

	cout<<"{ ";
	for(vector<ItemSupport>::reverse_iterator iter=tItemSupportSet.rbegin();iter!=tItemSupportSet.rend();++iter)
	{
		cout<<iter->m_ITItemName<<" ";
	}
	cout<<count<<" }";
	cout<<endl;
}



void CFP_Growth::printPath()
{
	vector<ItemSupport>tItemSupportSet;
	cout<<"打印FP_Tree树:"<<endl;
	this->m_pCFPTConditionTree->DFSPrintPath(this->m_pCFPTConditionTree->m_pCFP_TreeRoot,tItemSupportSet);
}


void CFP_Growth::printItemHeaderList()
{
	cout<<"打印顶点表中每个单链表:"<<endl;
	for(map<ItemType,CItemHeaderNode>::iterator iter=this->m_pCFPTConditionTree->m_mapItemHeaderList.begin();iter!=this->m_pCFPTConditionTree->m_mapItemHeaderList.end();++iter)
	{
		if(iter->second.m_pFPFirst!=NULL)
		{
			cout<<"("<<iter->first<<","<<iter->second.m_nSupportCount<<") :";
			this->m_pCFPTConditionTree->printLinkList(iter->second.m_pFPFirst);
			cout<<endl;
		}
	}
}



     FP-growth算法比Apriori算法快一个数量级,在空间复杂度方面也比Apriori也有数量级级别的优化。但是对于海量数据,FP-growth的时空复杂度仍然很高,可以采用的改进方法包括数据库划分,数据采样等等。

    由于时间有限,在写博文的过程中参考过一些文献,在此表示感谢;同时鉴于水平原因,你难免有不足之处,欢迎斧正!

FP_Growth算法原理及实现