首页 > 代码库 > 基于用户的协同过滤算法

基于用户的协同过滤算法

基于用户的协同过滤算法-参考《推荐系统实践》一书,作者:项亮

  1 import random  2 import math  3 class UserBasedCF:  4     def __init__(self,datafile = None):  5         self.datafile = datafile  6         self.readData()  7         self.splitData(3,47)  8     def readData(self,datafile = None):  9         """ 10         read the data from the data file which is a data set 11         """ 12         self.datafile = datafile or self.datafile 13         self.data =http://www.mamicode.com/ [] 14         for line in open(self.datafile): 15             userid,itemid,record,_ = line.split() 16             self.data.append((userid,itemid,int(record))) 17     def splitData(self,k,seed,data=http://www.mamicode.com/None,M = 8): 18         """ 19         split the data set 20         testdata is a test data set 21         traindata is a train set  22         test data set / train data set is 1:M-1 23         """ 24         self.testdata =http://www.mamicode.com/ {} 25         self.traindata =http://www.mamicode.com/ {} 26         data = http://www.mamicode.com/data or self.data 27         random.seed(seed) 28         for user,item, record in self.data: 29             if random.randint(0,M) == k: 30                 self.testdata.setdefault(user,{}) 31                 self.testdata[user][item] = record  32             else: 33                 self.traindata.setdefault(user,{}) 34                 self.traindata[user][item] = record 35     def userSimilarity(self,train = None): 36         """ 37         One method of getting user similarity matrix 38         """ 39         train = train or self.traindata 40         self.userSim = dict() 41         for u in train.keys(): 42             for v in train.keys(): 43                 if u == v: 44                     continue 45                 self.userSim.setdefault(u,{}) 46                 self.userSim[u][v] = len(set(train[u].keys()) & set(train[v].keys())) 47                 self.userSim[u][v] /=math.sqrt(len(train[u]) * len(train[v]) *1.0) 48     def userSimilarityBest(self,train = None): 49         """ 50         the other method of getting user similarity which is better than above 51         you can get the method on page 46 52         In this experiment,we use this method 53         """ 54         train = train or self.traindata 55         self.userSimBest = dict() 56         item_users = dict() 57         for u,item in train.items(): 58             for i in item.keys(): 59                 item_users.setdefault(i,set()) 60                 item_users[i].add(u) 61         user_item_count = dict() 62         count = dict() 63         for item,users in item_users.items(): 64             for u in users: 65                 user_item_count.setdefault(u,0) 66                 user_item_count[u] += 1 67                 for v in users: 68                     if u == v:continue 69                     count.setdefault(u,{}) 70                     count[u].setdefault(v,0) 71                     count[u][v] += 1 72         for u ,related_users in count.items(): 73             self.userSimBest.setdefault(u,dict()) 74             for v, cuv in related_users.items(): 75                 self.userSimBest[u][v] = cuv / math.sqrt(user_item_count[u] * user_item_count[v] * 1.0) 76   77     def recommend(self,user,train = None,k = 8,nitem = 40): 78         train = train or self.traindata 79         rank = dict() 80         interacted_items = train.get(user,{}) 81         for v ,wuv in sorted(self.userSimBest[user].items(),key = lambda x : x[1],reverse = True)[0:k]: 82             for i , rvi in train[v].items(): 83                 if i in interacted_items: 84                     continue 85                 rank.setdefault(i,0) 86                 rank[i] += wuv 87         return dict(sorted(rank.items(),key = lambda x :x[1],reverse = True)[0:nitem]) 88     def recallAndPrecision(self,train = None,test = None,k = 8,nitem = 10): 89         """ 90         Get the recall and precision, the method you want to know is listed  91         in the page 43 92         """ 93         train  = train or self.traindata 94         test = test or self.testdata 95         hit = 0 96         recall = 0 97         precision = 0 98         for user in train.keys(): 99             tu = test.get(user,{})100             rank = self.recommend(user, train = train,k = k,nitem = nitem) 101             for item,_ in rank.items():102                 if item in tu:103                     hit += 1104             recall += len(tu)105             precision += nitem106         return (hit / (recall * 1.0),hit / (precision * 1.0))107     def coverage(self,train = None,test = None,k = 8,nitem = 10):108         train = train or self.traindata109         test = test or self.testdata110         recommend_items = set()111         all_items  = set()112         for user in train.keys():113             for item in train[user].keys():114                 all_items.add(item)115             rank = self.recommend(user, train, k = k, nitem = nitem)116             for item,_ in rank.items():117                 recommend_items.add(item)118         return len(recommend_items) / (len(all_items) * 1.0)119     def popularity(self,train = None,test = None,k = 8,nitem = 10):120         """121         Get the popularity122         the algorithm on page 44123         """124         train = train or self.traindata125         test = test or self.testdata126         item_popularity = dict()127         for user ,items in train.items():128             for item in items.keys():129                 item_popularity.setdefault(item,0)130                 item_popularity[item] += 1131         ret = 0132         n = 0133         for user in train.keys():134             rank = self.recommend(user, train, k = k, nitem = nitem)135             for item ,_ in rank.items():136                 ret += math.log(1+item_popularity[item])137                 n += 1138         return ret / (n * 1.0)139      140 def testRecommend():141     ubcf = UserBasedCF(u.data)142     ubcf.readData()143     ubcf.splitData(4,100)144     ubcf.userSimilarity()145     user = "345"146     rank = ubcf.recommend(user,k = 3)147     for i,rvi in rank.items():148          149         items = ubcf.testdata.get(user,{})150         record = items.get(i,0)151         print "%5s: %.4f--%.4f" %(i,rvi,record)152 def testUserBasedCF():153     cf  =  UserBasedCF(u.data)154     cf.userSimilarityBest()155     print "%3s%20s%20s%20s%20s" % (K,"recall",precision,coverage,popularity)156     for k in [5,10,20,40,80,160]:157         recall,precision = cf.recallAndPrecision( k = k)158         coverage = cf.coverage(k = k)159         popularity = cf.popularity(k = k)160         print "%3d%19.3f%%%19.3f%%%19.3f%%%20.3f" % (k,recall * 100,precision * 100,coverage * 100,popularity)161          162 if __name__ == "__main__":163     testUserBasedCF()

基于用户的协同过滤算法