首页 > 代码库 > k-近邻分类的Python实现

k-近邻分类的Python实现

参见《机器学习实战》

# -*- coding:cp936 -*-
#===============================================================================
# 设计KNN最近邻分类器:
#     找出每个元素在数据集中的最近邻的K个数据,统计这K个数据所属的类,所属类最多的那个类就是该元素所属的类
#===============================================================================
 6 import numpy as np
 7 
 8 def loadHaiLunData(f_name):
 9     with open(f_name) as fHandle:
10         fLines = fHandle.readlines()
11         dataLines = len(fLines)
12         label  = []
13         dataSetMat = np.zeros((dataLines,3))
14         for i in range(dataLines):
15             lineList = fLines[i].strip().split(\t)
16             dataSetMat[i,:] = lineList[0:3]
17             label.append(int(lineList[-1]))
18         return dataSetMat,label
19     
20 
def dataNorm(dataSet):
    """Min-max scale each column of ``dataSet`` into the range [0, 1].

    Args:
        dataSet: 2-D array-like of numeric values, one sample per row.

    Returns:
        A float array of the same shape where every column has been mapped
        with ``(x - min) / (max - min)``. A column whose values are all equal
        maps to zeros instead of dividing by zero.
    """
    dataSet = np.asarray(dataSet, dtype=float)
    minEle = dataSet.min(0)
    valueRange = dataSet.max(0) - minEle
    # A constant column has zero range; divide by 1 so it normalizes to 0.
    valueRange = np.where(valueRange == 0, 1.0, valueRange)
    # Broadcasting applies the per-column min/range to every row.
    return (dataSet - minEle) / valueRange
27 
def classifyKnn(inX, dataSet, label, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours.

    Distances are squared Euclidean distances; the square root is skipped
    because it does not change the neighbour ordering. The data is used as
    given: no normalization is applied here.

    Args:
        inX: Feature vector to classify.
        dataSet: 2-D array of reference samples, one row per sample, with
            the same number of features as ``inX``.
        label: Sequence giving the class of each row of ``dataSet``.
        k: Number of nearest neighbours that vote. Values larger than the
            number of reference samples are clamped to that number.

    Returns:
        The label that occurs most often among the k nearest neighbours.
        On a tie, the tied label whose neighbour is closest to ``inX`` wins.

    Raises:
        ValueError: If ``dataSet`` has no rows or ``k`` is smaller than 1.
    """
    dataSet = np.asarray(dataSet, dtype=float)
    numOfEle = dataSet.shape[0]
    if numOfEle == 0:
        raise ValueError("dataSet must contain at least one sample")
    if k < 1:
        raise ValueError("k must be a positive integer")
    k = min(int(k), numOfEle)

    # Broadcasting subtracts inX from every row of dataSet.
    squareDistance = ((dataSet - np.asarray(inX, dtype=float)) ** 2).sum(1)
    # A stable sort keeps equidistant samples in their original order.
    nearest = squareDistance.argsort(kind="mergesort")[:k]

    # Tally the labels of the k nearest neighbours.
    votes = {}
    for idx in nearest:
        votes[label[idx]] = votes.get(label[idx], 0) + 1

    # Walk nearest-first; only a strictly larger count replaces the winner,
    # so ties resolve to the label seen closest to inX.
    bestLabel = label[nearest[0]]
    for idx in nearest[1:]:
        if votes[label[idx]] > votes[bestLabel]:
            bestLabel = label[idx]
    return bestLabel
52     
def testHaiLunClassify(k=3, hRatio=0.5, f_name='datingTestSet2.txt'):
    """Measure the hold-out error rate of the KNN classifier on a data file.

    The file is loaded and min-max normalized as a whole. The first
    ``int(n_rows * hRatio)`` rows are then classified one by one against the
    remaining rows, which serve as the reference set.

    Args:
        k: Number of nearest neighbours used for each classification.
        hRatio: Fraction of the rows (taken from the start of the file) that
            is held out for testing.
        f_name: Path of the tab-separated data file to evaluate.

    Returns:
        The fraction of held-out rows whose predicted label differs from the
        label stored in the file, as a float.

    Raises:
        ValueError: If the split leaves either the held-out part or the
            reference part without any rows.
    """
    dataSet, label = loadHaiLunData(f_name)
    totalNum = dataSet.shape[0]
    testNum = int(totalNum * hRatio)
    if testNum < 1 or testNum >= totalNum:
        raise ValueError(
            "hRatio=%r leaves %d of %d rows for testing; both the test and "
            "the reference part need at least one row"
            % (hRatio, testNum, totalNum))

    dataNormed = dataNorm(dataSet)
    # The reference set is the same for every query, so slice it once.
    trainData = dataNormed[testNum:, :]
    trainLabel = label[testNum:]

    errorClass = 0
    for i in range(testNum):
        classRes = classifyKnn(dataNormed[i, :], trainData, trainLabel, k)
        if classRes != label[i]:
            errorClass += 1
    # float() keeps this a true division on Python 2 as well.
    return errorClass / float(testNum)
69 
if __name__ == '__main__':
    # Sweep k over 1..49, collect the hold-out error rate for each value,
    # print the rates and plot error rate against k.
    kRange = list(range(1, 50, 1))
    errorList = [testHaiLunClassify(k) for k in kRange]
    # Parenthesized single-argument print works on both Python 2 and 3.
    print(errorList)

    # Imported here so that importing this module for its functions does not
    # require matplotlib.
    import matplotlib.pyplot as plt
    plt.figure(1)
    # 'rs-' is a matplotlib format string: red square markers, solid line.
    plt.plot(kRange, errorList, 'rs-')
    plt.show()
81