首页 > 代码库 > 机器学习之python: kNN
机器学习之python: kNN
##################################################
# kNN : k Nearest Neighbour
# Author : Monne
# Date : 2015-01-24
# Email : 416606639@qq.com
##################################################
import time

import numpy as np

# Reference point for the "time cost" messages printed by datingwebsite()
# and handwriting(); taken once, when the module is loaded.
starttime = time.time()

# Earlier step-by-step implementation, kept for reference only.  It sits
# inside a string literal, so none of it is executed; per the author's note
# it is a longer equivalent of classify().
""" too long , equal to classify()
def distance(xVec, yVec):
    # 1. attain distance from xVec and yVec
    x = np.array(xVec); y = np.array(yVec) # x = array([1,2,3]), y = array([2,3,4])
    diff = x - y # x - y = array([-1, -1, -1])
    diff2 = diff ** 2 # diff2 = diff**2 = array([1, 1, 1])
    sumdiff2 = sum(diff2) # sumdiff2 = sum(diff2) = 3
    sqrtsumdiff2 = sumdiff2 ** 0.5 # 9 ** 0.5 = 3.0
    return sqrtsumdiff2

def disttest(testx, trainx):
    # attain all the distance between testx and trainx[i]
    # from distx {ID: distance}
    distx = {}
    numsample = len(trainx)
    for i in range(numsample):
        distx[i] = distance(testx, trainx[i])
    return distx

def sort(testx, trainx):
    # sort distx {ID: distance}
    # return IDk
    distx = disttest(testx, trainx)
    sortitems = sorted(distx.iteritems(), key = lambda d:d[1]) # list
    IDk = []; distances = []
    l = len(trainx)
    for i in range(l):
        IDk.append(sortitems[i][0]) # ID
        distances.append(sortitems[i][1]) # distance
    #print "distances = ", distances[:5]
    return IDk

def majorcount(testx, trainx, trainy, k):
    IDk = sort(testx, trainx)
    sorty = {} # dist(y, count)
    #l = len(trainx)
    for i in range(k):
        sorty[trainy[IDk[i]]] = sorty.get(trainy[IDk[i]], 0) + 1
    sorty = sorted(sorty.iteritems(), key = lambda d:d[1], reverse = True) # list
    #print "sorty = ",sorty
    return sorty[0][0]

def kNN(testx, trainx, trainy, k):
    # given testx, trainx, trainy, k
    # return predict y
    c = classify(testx, trainx, trainy, k)
    print "the classifier came back: % r" % c
    return c
"""


# step 1.
# data input
def testsample():
    """Return a four-point toy data set as ``(trainx, trainy)``.

    ``trainx`` holds four 2-D points and ``trainy`` their labels
    ('A', 'A', 'B', 'B').
    """
    trainx = [[1.0, 1.1],
              [1.0, 1.0],
              [0, 0],
              [0, 0.1]]
    trainy = ['A', 'A', 'B', 'B']
    return trainx, trainy


def txt2trainxy(filename):
    """Load a whitespace-separated data set from ``filename + '.txt'``.

    ``filename`` is the path WITHOUT the ``.txt`` extension and must be a
    string, e.g. ``'datingTestSet2'``.  On every non-blank line all columns
    but the last are parsed as float features and the last column as an
    int label.

    Returns ``(trainx, trainy)``: a list of float lists and a list of ints.
    """
    trainx = []
    trainy = []
    # "with" closes the file even when a line fails to parse.
    with open(filename + '.txt') as fr:
        for line in fr:
            fields = line.split()
            if not fields:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            trainx.append([float(value) for value in fields[:-1]])
            trainy.append(int(fields[-1]))
    return trainx, trainy


def img2trainxy(filename):
    """Load one sample per file from the directory ``filename``.

    For each file, every character of every stripped line is converted
    with int() and appended to a single flat feature list
    ('001100' -> [0, 0, 1, 1, 0, 0]).  The label is int() of the first
    character of the file name ('0_2.txt' -> 0).

    Returns ``(trainx, trainy)``: a list of int lists and a list of ints.
    """
    import os

    trainx = []
    trainy = []
    # Sorted so the sample order does not depend on the file system.
    for name in sorted(os.listdir(filename)):
        trainy.append(int(name[0]))
        pixels = []
        with open(os.path.join(filename, name)) as fr:
            for line in fr:
                pixels.extend(int(ch) for ch in line.strip())
        trainx.append(pixels)
    return trainx, trainy


# step 2. data transform
def norm(trainx):
    """Min-max scale every feature column of ``trainx``.

    Returns ``(ntrainx, mins, spans)``:
      * ``ntrainx`` - scaled data as a list of lists,
      * ``mins``    - numpy array of per-column minima,
      * ``spans``   - list of floats, the per-column divisor that was used.

    A new sample is brought onto the same scale with ``(x - mins) / spans``.
    """
    data = np.array(trainx)
    lo = data.min(0)  # min(0) == min(axis=0), i.e. column-wise
    hi = data.max(0)
    span = (hi - lo).astype(float)
    # A constant column has zero range; dividing by it would fill the column
    # with NaN.  Dividing by 1.0 instead maps such a column to 0.
    span[span == 0] = 1.0
    ntrainx = (data - lo) / span
    return ntrainx.tolist(), lo, span.tolist()


# step 3. classify function
def classify(testx, trainx, trainy, k):
    """Predict the label of ``testx`` by majority vote of its k nearest
    neighbours (Euclidean distance) in ``trainx``.

    ``trainy[i]`` is the label of ``trainx[i]``.  ``k`` larger than the
    number of training samples is clamped to that number.  On interpreters
    whose dicts keep insertion order, a tied vote goes to the label that was
    met first, i.e. the one owning the nearer neighbour.

    Raises ValueError when ``trainx`` is empty or ``k`` is smaller than 1.
    """
    train = np.array(trainx)
    if len(train) == 0:
        raise ValueError("classify() needs at least one training sample")
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))
    k = min(k, len(train))

    # Row-wise Euclidean distance between every training sample and testx.
    dist = ((train - np.array(testx)) ** 2).sum(axis=1) ** 0.5
    nearest = dist.argsort()[:k]  # indices of the k smallest distances

    votes = {}  # label -> number of neighbours carrying it
    for idx in nearest:
        label = trainy[idx]
        votes[label] = votes.get(label, 0) + 1
    return max(votes, key=votes.get)
# test for error rate
def testkNN(testratio, trainx, trainy, k):
    """Score classify() on the leading ``testratio`` fraction of the data.

    The first ``int(len(trainx) * testratio)`` samples are held out and each
    is classified against all remaining samples.  The error rate is printed
    and returned.

    Raises ValueError when the held-out part is empty (the rate would be a
    division by zero).
    """
    l = int(len(trainx) * testratio)
    if l < 1:
        raise ValueError("testratio %r holds out no sample from %d"
                         % (testratio, len(trainx)))
    # The reference part is the same for every held-out sample: slice once.
    restx = trainx[l:]
    resty = trainy[l:]
    errorcount = sum(1 for i in range(l)
                     if classify(trainx[i], restx, resty, k) != trainy[i])
    rate = errorcount / float(l)
    print("the total error rate is: %f." % rate)
    return rate


def randomtestkNN(testratio, trainx, trainy, k):
    """Score classify() on a random ``testratio`` fraction of the data.

    ``int(len(trainx) * testratio)`` distinct samples are drawn at random as
    the test set; the others form the reference set.  The error rate is
    printed and returned.

    Raises ValueError when the test set would be empty.
    """
    import random

    m = len(trainx)
    l = int(m * testratio)
    if l < 1:
        raise ValueError("testratio %r holds out no sample from %d"
                         % (testratio, m))

    # sample() draws distinct indices; repeated randint() calls could pick
    # the same sample twice.
    picked = random.sample(range(m), l)
    held = set(picked)
    testx = [trainx[i] for i in picked]
    testy = [trainy[i] for i in picked]
    restx = [trainx[i] for i in range(m) if i not in held]
    resty = [trainy[i] for i in range(m) if i not in held]

    errorcount = sum(1 for x, y in zip(testx, testy)
                     if classify(x, restx, resty, k) != y)
    rate = errorcount / float(l)
    print("the total error rate is: %f." % rate)
    return rate


def avg():
    """Run handwriting('trainingDigits', 'testDigits', k) for k = 1..9.

    Prints the array of returned error rates, then the argsort of that
    array (positions of the rates from lowest to highest).
    """
    rates = np.array([handwriting('trainingDigits', 'testDigits', k)
                      for k in range(1, 10)])
    print(rates)
    print(rates.argsort())
    # author's note from one run: k = 4, errormin = 0.03


# step 5_1 small sample
def sample(k, testratio=0.25):
    """Run testkNN() on the toy data from testsample() and return its rate.

    ``testratio`` is forwarded to testkNN(), which requires it.  The default
    of 0.25 holds out exactly one of the four toy samples.
    NOTE(review): the default is a reviewer's choice, confirm the intended
    ratio.
    """
    trainx, trainy = testsample()
    return testkNN(testratio, trainx, trainy, k)


# step 5_2.
# use for dating web site
def datingwebsite(filename, k):
    """Interactive dating-site demo.

    Loads ``filename + '.txt'`` with txt2trainxy(), scales it with norm(),
    reports the error rate of randomtestkNN() on a 10% split, then asks for
    three numbers on standard input and prints the predicted category.

    Returns the error rate reported by randomtestkNN().
    """
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3

    ## step 1: load data
    print("step 1: load data...")
    # filename must be a str such as 'datingTestSet2' (no extension)
    trainx, trainy = txt2trainxy(filename)
    trainx, mins, spans = norm(trainx)

    ## step 2: training...
    print("step 2: training...")
    # Nothing to do here; the step is only announced.

    ## step 3: testing...
    print("step 3: testing...")
    errorrate = randomtestkNN(0.10, trainx, trainy, k)
    #testkNN(0.10, trainx, trainy, k)
    print("time cost:  %s" % (time.time() - starttime))

    ## step 4: show the result...
    print("step 4: show the result...")
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(read_line(
        "percentage of time spent playing video games?> "))
    ffMiles = float(read_line("frequent flier miles earned per year?> "))
    iceCream = float(read_line("liters of ice cream consumed per year?> "))
    # Same scaling as the training data.  The feature order (miles, game
    # time, ice cream) presumably mirrors the data file's columns - verify.
    classx = (np.array([ffMiles, percentTats, iceCream]) - mins) / spans
    classy = classify(classx, trainx, trainy, k)
    # Labels are used as 1-based positions in resultList.
    print("You will probably like this person:  %s" % resultList[classy - 1])

    return errorrate


# step 5_3. use for hand writing
def handwriting(trainfile, testfile, k):
    """Handwritten-digit demo.

    Loads the directories ``trainfile`` and ``testfile`` with img2trainxy(),
    keeps a random subset of the training samples, classifies every test
    sample against that subset and prints progress, error rate and elapsed
    time.

    Returns the error rate.  Raises ValueError when either directory yields
    no samples.
    """
    import random

    ## step 1: load data...
    print("step 1: load data...")
    print("---Getting training set...")
    trainx, trainy = img2trainxy(trainfile)
    print("---Geting testing set...")
    testx, testy = img2trainxy(testfile)
    m = len(trainx)
    l = len(testx)
    if m == 0 or l == 0:
        raise ValueError("training and testing directories must both "
                         "contain samples (got %d and %d)" % (m, l))
    print("%d %d" % (m, len(trainx[0])))
    print("%d %d" % (l, len(testx[0])))

    # random choose trainx
    print("---Random choosing the training data...")
    # Keep at least one sample: classify() cannot vote on an empty set.
    n = random.randint(1, max(1, m - 1))  # how many samples to keep
    s = random.sample(range(m), n)  # which ones
    trainx = [trainx[i] for i in s]
    trainy = [trainy[i] for i in s]
    print("---the numbers of training data is:  %d" % n)

    ## step 2: training...
    print("step 2: training...")
    # Nothing to do here; the step is only announced.

    ## step 3: testing...
    print("step 3: testing...")
    errorcount = sum(1 for x, y in zip(testx, testy)
                     if classify(x, trainx, trainy, k) != y)
    errorrate = errorcount / float(l)
    print("the total error rate is: %f." % errorrate)
    print("time cost:  %s" % (time.time() - starttime))

    ## step 4: show the result...
    print("step 4: show the result...")

    return errorrate


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    #datingwebsite('datingTestSet2', 4)

    handwriting('trainingDigits', 'testDigits', 3)

    #avg()
机器学习之python: kNN
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。