首页 > 代码库 > 基于用户相似性的协同过滤——Python实现

基于用户相似性的协同过滤——Python实现

代码基本来自项亮的《推荐系统实践》,把书上的伪代码做了具体实现,并参考了 https://www.douban.com/note/336280497/ 。

还可以加入对用户相似性的归一化操作,效果会更好。

数据集为MovieLens的10万条数据.
链接:MovieLens

#coding:utf-8import random,mathfrom operator import itemgetterclass UserBasedCF:    def __init__(self,trainDataFile=None,testDataFile=None,splitor=\t):        if trainDataFile!=None:            self.train=self.loadData(trainDataFile, splitor)        if testDataFile!=None:            self.test=self.loadData(testDataFile, splitor)        self.simiMatrix={}            def setData(self,train,test):        self.train=train        self.test=test                     def loadData(self,dataFile,splitor=\t):        data={}        for line in open(dataFile):            user,item,record,_ = line.split()            data.setdefault(user,{})            data[user][item]=record        return data        def recallAndPrecision(self,peersCount,topN=10):        hit=0        recall=0        precision=0        for user in self.train.keys():            itemOfuser=self.test.get(user,{})            recItems=self.recommend(user,peersCount,topN)            for item,pui in recItems.items():                if item in itemOfuser:                    hit+=1            recall+=len(itemOfuser)            precision+=topN        #print ‘Recall:%s    hit:%s    allRatings:%s‘%(hit/(recall*1.0),hit,precision)        return (hit / (recall * 1.0),hit / (precision * 1.0))        def coverage(self,peersCount,topN=10):        recommend_items=set()        all_items=set()        for user in self.train.keys():            for item in self.train[user].keys():                all_items.add(item)            rank=self.recommend(user,peersCount,topN)            for item,pui in rank.items():                recommend_items.add(item)        return len(recommend_items)/(len(all_items)*1.0)      def popularity(self,peersCount,topN=10):        item_popularity=dict()        for user,items in self.train.items():            for item in items.keys():                if item not in item_popularity:                    item_popularity[item]=1                item_popularity[item]+=1        ret=0        n=0        for user in 
self.train.keys():            rank=self.recommend(user,peersCount,topN)            for item,pui in rank.items():                ret+=math.log(1+item_popularity[item])                n+=1        return ret/(n*1.0)        def calUserSimilarity(self):        item_users=dict()        for u,ratings in self.train.items():            for i in ratings.keys():                item_users.setdefault(i,set())                item_users[i].add(u)                        #calculate co-rated items between users        coRatedCount=dict()        itemCountOfUser=dict()        for item,users in item_users.items():            for u in users:                itemCountOfUser.setdefault(u,0)                itemCountOfUser[u]+=1                for v in users:                    if u==v:                        continue                    coRatedCount.setdefault(u,{})                    coRatedCount[u].setdefault(v,0)                    coRatedCount[u][v]+=1/math.log(1+len(users))        userSimiMatrix=dict()        for u,related_users in coRatedCount.items():            userSimiMatrix.setdefault(u,{})            for v,cuv in related_users.items():                userSimiMatrix[u][v]=cuv/math.sqrt(itemCountOfUser[u]*itemCountOfUser[v])        self.simiMatrix=userSimiMatrix                  def recommend(self,userU,peersCount,topN=10):        recItems=dict()        interacted_items=self.train[userU]        ‘‘‘prepare the user similarity matrix first‘‘‘        if not self.simiMatrix:            self.calUserSimilarity()        for userV,simiUV in sorted(self.simiMatrix[userU].items(),key=itemgetter(1),reverse=True)[0:peersCount]:            for item,ratingV4I in self.train[userV].items():                if item in interacted_items:                    continue                if item not in recItems:                    recItems[item]=0                recItems[item]+=simiUV*float(ratingV4I)#transform 4 stars into score 0.8                                ‘‘‘if len(recItems)==topN:                    
return recItems‘‘‘        return dict(sorted(recItems.items(),key = lambda x :x[1],reverse = True)[0:topN])def testUserBasedCF():    cf=UserBasedCF(trainDataFile=rE:\ResearchAndPapers\DataSet\ml-100k\u3.base,testDataFile=rE:\ResearchAndPapers\DataSet\ml-100k\u3.test)    #cf.calUserSimilarity()    print("%3s%15s%15s%15s%15s" % (K,"precision",recall,coverage,popularity))    for k in [5,10,20,40,80,160]:        recall,precision = cf.recallAndPrecision(peersCount = k)        coverage = cf.coverage(peersCount = k)        popularity = cf.popularity(peersCount = k)        print("%3d%14.2f%%%14.2f%%%14.2f%%%15.2f" % (k,precision * 100,recall * 100,coverage * 100,popularity))def SplitData(wholeData,M,k,seed,splitor=\t):        test={}        train={}        random.seed(seed)            for line in wholeData:            user,item,score,time=line.strip().split(splitor)            if random.randint(0,M)==k:                test.setdefault(user,{})                test[user][item]=score            else:                train.setdefault(user,{})                train[user][item]=score        return train,test    def testUserBasedCF2():    wholeData=open(rE:\ResearchAndPapers\DataSet\ml-1m\ratings.dat)    train,test=SplitData(wholeData, 8, 5, 10, splitor=::)    cf=UserBasedCF()    cf.setData(train, test)    #cf=UserBasedCF(trainDataFile=r‘E:\ResearchAndPapers\DataSet\ml-100k\u5.base‘,testDataFile=r‘E:\ResearchAndPapers\DataSet\ml-100k\u5.test‘)    #cf.calUserSimilarity()    print("%3s%15s%15s%15s%15s" % (K,"precision",recall,coverage,popularity))    for k in [5,10,20,40,80,160]:        recall,precision = cf.recallAndPrecision(peersCount = k)        coverage = cf.coverage(peersCount = k)        popularity = cf.popularity(peersCount = k)        print("%3d%14.2f%%%14.2f%%%14.2f%%%15.2f" % (k,precision * 100,recall * 100,coverage * 100,popularity))        if __name__=="__main__":    testUserBasedCF()    #testUserBasedCF2()    

 

基于用户相似性的协同过滤——Python实现