首页 > 代码库 > 转:TopN推荐系统——推荐的实现与推荐效果的评价指标

转:TopN推荐系统——推荐的实现与推荐效果的评价指标

转自:用户推荐系统_python 代码-豆瓣
书籍:项亮的《推荐系统实践》

import randomimport mathclass UserBasedCF:    def __init__(self,train = None,test = None):        self.trainfile = train        self.testfile = test        self.readData()            def readData(self,train = None,test = None):        self.trainfile = train or self.trainfile        self.testfile = test or self.testfile        self.traindata = {}        self.testdata = {}        for line in open(self.trainfile):            userid,itemid,record,_ = line.split()            self.traindata.setdefault(userid,{})            self.traindata[userid][itemid]=record        for line in open(self.testfile):            userid,itemid,record,_ = line.split()            self.testdata.setdefault(userid,{})            self.testdata[userid][itemid]=record                    def userSimilarityBest(self,train = None):        train = train or self.traindata        self.userSimBest = dict()        item_users = dict()        for u,item in train.items():            for i in item.keys():                item_users.setdefault(i,set())                item_users[i].add(u)        user_item_count = dict()        count = dict()        for item,users in item_users.items():            for u in users:                user_item_count.setdefault(u,0)                user_item_count[u] += 1                for v in users:                    if u == v:continue                    count.setdefault(u,{})                    count[u].setdefault(v,0)                    count[u][v] += 1        for u ,related_users in count.items():            self.userSimBest.setdefault(u,dict())            for v, cuv in related_users.items():                self.userSimBest[u][v] = cuv / math.sqrt(user_item_count[u] * user_item_count[v] * 1.0)    def recommend(self,user,train = None,k = 8,nitem = 40):        train = train or self.traindata        rank = dict()        interacted_items = train.get(user,{})        for v ,wuv in sorted(self.userSimBest[user].items(),key = lambda x : x[1],reverse = True)[0:k]:#获取与user相似度最高的k个用户          
  for i , rvi in train[v].items():                if i in interacted_items:                    continue #只选择user没有评分过的物品进行推荐                rank.setdefault(i,0)#设置初始值,以便做下面的累加运算                rank[i] += wuv #书中为rank[i] +=rvi*wuv        return dict(sorted(rank.items(),key = lambda x :x[1],reverse = True)[0:nitem])#用sorted方法对推荐的物品进行排序,预计评分高的排在前面,再取其中nitem个,nitem为每个用户推荐的物品数量        def recallAndPrecision(self,train = None,test = None,k = 8,nitem = 10):        train = train or self.traindata        test = test or self.testdata        hit = 0        recall = 0        precision = 0        for user in train.keys():            tu = test.get(user,{})#如果测试集中没有这个用户,则将tu初始化为空,避免test[user]报错            rank = self.recommend(user, train = train,k = k,nitem = nitem)            for item,_ in rank.items():                if item in tu:                    hit += 1            recall += len(tu)            precision += nitem        return (hit / (recall * 1.0),hit / (precision * 1.0))        def coverage(self,train = None,test = None,k = 8,nitem = 10):        train = train or self.traindata        test = test or self.testdata        recommend_items = set()        all_items = set()        for user in train.keys():            for item in train[user].keys():                all_items.add(item)            rank = self.recommend(user, train, k = k, nitem = nitem)            for item,_ in rank.items():                recommend_items.add(item)        return len(recommend_items) / (len(all_items) * 1.0)        def popularity(self,train = None,test = None,k = 8,nitem = 10):        train = train or self.traindata        test = test or self.testdata        item_popularity = dict()        for user ,items in train.items():            for item in items.keys():                item_popularity.setdefault(item,0)                item_popularity[item] += 1        ret = 0        n = 0        for user in train.keys():            rank = self.recommend(user, train, k = k, nitem = nitem)            for item ,_ 
in rank.items():                ret += math.log(1+item_popularity[item])                n += 1        return ret / (n * 1.0)            def testUserBasedCF():    train = u1.base    test = u1.test    cf = UserBasedCF(train,test)    cf.userSimilarityBest()    print("%3s%20s%20s%20s%20s" % (K,"precision",recall,coverage,popularity))    for k in [5,10,20,40,80,160]:        recall,precision = cf.recallAndPrecision( k = k)        coverage = cf.coverage(k = k)        popularity = cf.popularity(k = k)        print("%3d%19.3f%%%19.3f%%%19.3f%%%20.3f" % (k,precision * 100,recall * 100,coverage * 100,popularity))        if __name__ == "__main__":    testUserBasedCF()基于项目的推荐系统,IBCF:‘‘‘Created on 2013-10-10@author: Administrator‘‘‘import randomimport mathclass KNN:    def __init__(self,train = None,test = None):        self.trainfile = train        self.testfile = test        self.readData()            def readData(self,train = None,test = None):        self.trainfile = train or self.trainfile        self.testfile = test or self.testfile        self.traindata = {}        self.testdata = {}        for line in open(self.trainfile):            userid,itemid,record,_ = line.split()            self.traindata.setdefault(userid,{})            self.traindata[userid][itemid]=record        for line in open(self.testfile):            userid,itemid,record,_ = line.split()            self.testdata.setdefault(userid,{})            self.testdata[userid][itemid]=record                                    def ItemSim(self,train = None):        train = train or self.traindata        ItemSimcount = dict()        Item_count = dict()        for _,items in train.items():            for itemidi in items.keys():                Item_count.setdefault(itemidi,0)                Item_count[itemidi] += 1                for itemidj in items.keys():                    if itemidi == itemidj:                        continue                    ItemSimcount.setdefault(itemidi,{})                    
ItemSimcount[itemidi].setdefault(itemidj,0)                    ItemSimcount[itemidi][itemidj] +=1        self.ItemSimlist = dict()        for itemidi, related_item in ItemSimcount.items():            self.ItemSimlist.setdefault(itemidi,{})            for itemidj,wij in related_item.items():                self.ItemSimlist[itemidi].setdefault(itemidj,0)                self.ItemSimlist[itemidi][itemidj] = wij/math.sqrt(Item_count[itemidi]*Item_count[itemidj]*1.0)    def recommend(self,user,train = None,k = 5,nitem = 10):        train = train or self.traindata        recommendlist = dict()        User_Itemlist = train.get(user,{})        for i,ri in User_Itemlist.items():            for j,wij in sorted(self.ItemSimlist[i].items(),key = lambda x:x[1],reverse = True)[0:k]:                if j in User_Itemlist:                    continue                recommendlist.setdefault(j,0)                recommendlist[j] += float(ri)*wij        return dict(sorted(recommendlist.items(),key = lambda x :x[1],reverse = True)[0:nitem])        def recallAndPrecision(self,train = None,test = None,k = 5,nitem = 10):        train = train or self.traindata        test = test or self.testdata        hit = 0        recall = 0        precision = 0        for user in train.keys():            tu = test.get(user,{})            rank = self.recommend(user, train = train,k = k,nitem = nitem)            for item,_ in rank.items():                if item in tu:                    hit += 1            recall += len(tu)            precision += nitem        return (hit / (recall * 1.0),hit / (precision * 1.0))        def coverage(self,train = None,test = None,k = 5,nitem = 10):        train = train or self.traindata        test = test or self.testdata        recommend_items = set()        all_items = set()        for user in train.keys():            for item in train[user].keys():                all_items.add(item)            rank = self.recommend(user, train, k = k, nitem = nitem)            for 
item,_ in rank.items():                recommend_items.add(item)        return len(recommend_items) / (len(all_items) * 1.0)        def popularity(self,train = None,test = None,k = 5,nitem = 10):        train = train or self.traindata        test = test or self.testdata        item_popularity = dict()        for user ,items in train.items():            for item in items.keys():                item_popularity.setdefault(item,0)                item_popularity[item] += 1        ret = 0        n = 0        for user in train.keys():            rank = self.recommend(user, train, k = k, nitem = nitem)            for item ,_ in rank.items():                if item in item_popularity:                    ret += math.log(1+item_popularity[item])                    n += 1        return ret / (n * 1.0)            def testKNNCF():    train = u1.base    test = u1.test    cf = KNN(train,test)    cf.ItemSim()    print("%3s%20s%20s%20s%20s" % (K,"precision",recall,coverage,popularity))    for k in [5,10,20,40,80,160]:        recall,precision = cf.recallAndPrecision( k = k)        coverage = cf.coverage(k = k)        popularity = cf.popularity(k = k)        print("%3d%19.3f%%%19.3f%%%19.3f%%%20.3f" % (k,precision * 100,recall * 100,coverage * 100,popularity))        if __name__ == "__main__":    testKNNCF()

 

转:TopN推荐系统——推荐的实现与推荐效果的评价指标