首页 > 代码库 > 机器学习之python: kNN

机器学习之python: kNN

技术分享
  1 ##################################################  2 # kNN : k Nearest Neighbour  3 # Author : Monne  4 # Date : 2015-01-24  5 # Email : 416606639@qq.com  6 ##################################################  7 import numpy as np  8 import time  9 starttime = time.time() 10  11 """ too long , equal to classify() 12 def distance(xVec, yVec): 13     # 1. attain distance from xVec and yVec 14     x = np.array(xVec); y = np.array(yVec) # x = array([1,2,3]), y = array([2,3,4]) 15     diff = x - y # x - y = array([-1, -1, -1]) 16     diff2 = diff ** 2 # diff2 = diff**2 = array([1, 1, 1]) 17     sumdiff2 = sum(diff2) # sumdiff2 = sum(diff2) = 3 18     sqrtsumdiff2 = sumdiff2 ** 0.5 # 9 ** 0.5 = 3.0 19     return sqrtsumdiff2 20  21 def disttest(testx, trainx): 22     # attain all the distance between testx and trainx[i] 23     # from distx {ID: distance} 24     distx = {} 25     numsample = len(trainx) 26     for i in range(numsample): 27         distx[i] = distance(testx, trainx[i]) 28     return distx 29  30 def sort(testx, trainx): 31     # sort distx {ID: distance} 32     # return IDk 33     distx = disttest(testx, trainx) 34     sortitems = sorted(distx.iteritems(), key = lambda d:d[1]) # list 35     IDk = []; distances = [] 36     l = len(trainx) 37     for i in range(l): 38         IDk.append(sortitems[i][0]) # ID 39         distances.append(sortitems[i][1]) # distance 40     #print "distances = ", distances[:5] 41     return IDk 42  43 def majorcount(testx, trainx, trainy, k): 44     IDk = sort(testx, trainx) 45     sorty = {} # dist(y, count) 46     #l = len(trainx) 47     for i in range(k): 48         sorty[trainy[IDk[i]]] = sorty.get(trainy[IDk[i]], 0) + 1 49     sorty = sorted(sorty.iteritems(), key = lambda d:d[1], reverse = True)  # list 50     #print "sorty = ",sorty 51     return sorty[0][0] 52  53 def kNN(testx, trainx, trainy, k): 54     # given testx, trainx, trainy, k 55     # return predict y 56     c = classify(testx, trainx, trainy, k) 57     print "the classifier came back: % r" % c 58     return c 59 """ 60  61  62 # step 1. data input 63 def testsample(): 64     trainx = [[1.0, 1.1], 65                     [1.0, 1.0], 66                     [0, 0], 67                     [0, 0.1]] 68     trainy = [A, A, B, B] 69     return trainx, trainy 70  71 def txt2trainxy(filename): 72     # 1.read from file 73     # 2.attain dataset: trainx and trainy 74     fr = open( filename +.txt) 75     trainx = []; trainy = [] 76     for line in fr.readlines(): 77         l = line.split() 78         trainx.append(map(float,l[: -1])) 79         trainy.append(int(l[-1])) 80     return trainx,trainy 81  82 def img2trainxy(filename): 83     trainx = []; trainy = [] 84     from os import listdir 85     fl = listdir(filename) # fr = [‘0_2.txt‘,‘0_1.txt‘] 86     for name in fl: # name = ‘0_2.txt‘ 87         trainy.append(int(name[0])) # name[0] = ‘0‘, int(name[0]) = int(‘0‘) = 0 88         fr = open(filename + / + name) # open(‘0_2.txt‘) 89         tx = [] 90         for line in fr.readlines(): # line = ‘001100\r\n‘ 91             tx.extend(line.strip()) # line.strip() = ‘001100‘, tx = [‘0‘,‘0,‘1‘,‘1‘,...] 92         trainx.append(map(int, tx)) # map(int, tx) = [0,0,1,1,...] 93     return trainx, trainy 94  95 # step 2. data transform 96 def norm(trainx): 97     max = np.array(trainx).max(0) # max(0) = max(axis = 0) 98     min = np.array(trainx).min(0)  99     diff = max - min100     ntrainx = (np.array(trainx) - min) / map(float, diff)101     return ntrainx.tolist(), min, map(float, diff)102 103 104 # step 3. classify function105 def classify(testx, trainx, trainy, k):106     diff = np.array(trainx) - np.array(testx)107     diff2 = diff ** 2108     sumdiff2 = diff2.sum(axis = 1)109     sqrt = sumdiff2 ** 0.5110     IDs = sqrt.argsort() # sorted index 111     sorty = {} # (y, count)112     for i in range(k):113         key = trainy[IDs[i]]114         sorty[key] = sorty.get(key, 0) + 1115     return sorted(sorty.iteritems(), key = 116         lambda d:d[1], reverse = True)[0][0]117 118 119 # step 4. test for error rate120 def testkNN(testratio, trainx, trainy, k):121     l = int(len(trainx) * testratio)122     errorcount = 0123     for i in range(l):124         c =  classify(trainx[i], trainx[l:], trainy[l:], k)125         #print "the classifier came back: % r, the real answer is: %r" % (c, trainy[i])126         if c != trainy[i]:127             errorcount += 1128     print "the total error rate is: %f." % (errorcount / float(l))129     #return (errorcount / float(l))130 131 def randomtestkNN(testratio, trainx, trainy, k):132     import random133     m = len(trainx); l = int(m * 0.1)134     testx = []; testy = []; s = []135 136     # random choose k number in [0,l)137     s = random.sample(range(m), l); b = list(set(range(m)) - set(s))138     testx = [trainx[i] for i in s]139     testy = [trainy[i] for i in s]140     trainx = [trainx[i] for i in b]141     trainy = [trainy[i] for i in b]142     """143     for i in range(l):144         s = random.randint(0, m - 1) #[0,m] include m and maybe repeat145         dels.append(s)146         testx.append(trainx[s])147         testy.append(trainy[s])148     trainx = [trainx[i] for i in range(m) if i not in dels]149     trainy = [trainy[i] for i in range(m) if i not in dels]150     """151 152     errorcount = 0153     for i in range(l):154         c =  classify(testx[i], trainx, trainy, k)155         #print "the classifier came back: % r, the real answer is: %r" % (c, trainy[i])156         if c != testy[i]:157             errorcount += 1158     print "the total error rate is: %f." % (errorcount / float(l))159     return (errorcount / float(l))160 161 def avg():162     a = []163     for i in range(1,10):164         #print i165         a.append(handwriting(trainingDigits, testDigits, i))166     a = np.array(a)167     print a168     print a.argsort()169     # k = 4, errormin = 0.03170 171 172 # step 5_1 small sample173 def sample(k):174     trainx, trainy = testsample()175     testkNN(trainx, trainy, k)176 177 178 # step 5_2. use for dating web site179 def datingwebsite(filename, k):180     ## step 1: load data181     print "step 1: load data..."182     trainx, trainy = txt2trainxy(filename) # must str like ‘datingTestSet2‘, not datingTestSet2183     trainx, min, diff = norm(trainx)184 185 186     ## step 2: training...187     print "step 2: training..."188     pass189 190 191     ## step 3: testing...192     print "step 3: testing..."193     randomtestkNN(0.10, trainx, trainy, k)194     #testkNN(0.10, trainx, trainy, k)195     print "time cost: ", (time.time() - starttime)196     197 198     ## step 4: show the result...199     print "step 4: show the result..."200     resultList = [not at all, in small doses, in large doses]201     percentTats = float(raw_input(202                     "percentage of time spent playing video games?> "))203     ffMiles = float(raw_input("frequent flier miles earned per year?> "))204     iceCream = float(raw_input("liters of ice cream consumed per year?> "))205     classx = (np.array([ffMiles, percentTats, iceCream]) - min) / diff206     classy = classify(classx, trainx, trainy, k)207     print "You will probably like this person: ", resultList[classy - 1]208 209     return (errorcount / float(l))210 211 212 # step 5_3. use for hand writing213 def handwriting(trainfile, testfile, k):214     ## step 1: load data... 215     print "step 1: load data..."216     print "---Getting training set..."217     trainx, trainy = img2trainxy(trainfile)218     print "---Geting testing set..."219     testx, testy = img2trainxy(testfile)220     m = len(trainx)221     print m, len(trainx[0])222     print len(testx), len(testx[0])223 224     # random choose trainx225     print "---Random choosing the training data..."226     import random227     n = random.randint(0, m - 1) # random numbers228     s = random.sample(range(m), n) # random samples229     trainx = [trainx[i] for i in s]230     trainy = [trainy[i] for i in s]231     print "---the numbers of training data is: ", n232 233 234     ## step 2: training...235     print "step 2: training..."236     pass237 238 239     ## step 3: testing...240     print "step 3: testing..."241     l = len(testx)242     errorcount = 0243     for i in range(l):244         c =  classify(testx[i], trainx, trainy, k)245         #print "the classifier came back: % r, the real answer is: %r" % (c, trainy[i])246         if c != testy[i]:247             errorcount += 1248     print "the total error rate is: %f." % (errorcount / float(l))249     print "time cost: ", (time.time() - starttime)250     251 252     ## step 4: show the result...253     print "step 4: show the result..."254     pass255 256     return (errorcount / float(l))257 258 259 260 261 #datingwebsite(‘datingTestSet2‘, 4)262 263 handwriting(trainingDigits, testDigits, 3)    264     265 #avg()
View Code

 

机器学习之python: kNN