首页 > 代码库 > 机器学习实战-k-近邻算法

机器学习实战-k-近邻算法

k-近邻算法原理:

存在一个样本数据集合,也称训练样本集,样本集中的每个数据都带有标签。分类时,输入没有标签的新数据,将新数据的每个特征与样本集中每个数据对应的特征进行比较,计算出新数据与各个样本的相似度,然后取相似度最高的前k个(k通常不大于20)样本所对应的类标签,最后把新数据标记为这k个类标签中出现次数最多的那个标签。

例子:

使用k-近邻算法识别博客园注册时的验证码。

使用的工具如下:

  • 使用的编程语言为python
  • 使用opencv库的python接口处理图像

具体的流程如下:

  1. 收集验证码
  2. 分割验证码中的数字并分类,作为用于k-近邻算法识别数字的训练集
  3. 使用k-近邻算法识别验证码

//========================================================

收集验证码:

使用python编写网络爬虫爬取博客园的验证码,一个能使用的代码如下:

#-*- encoding: utf-8 -*-
'''
Created on 2014年5月13日
@author: jsy
'''
import os
import re
import urllib
import time

def get_html(url):
    """Fetch ``url`` and return the raw response body.

    The response object is closed even when reading the body raises, so a
    failed read does not leave the connection open.

    Uses the Python 2 ``urllib.urlopen`` API, like the rest of this script.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        page.close()

def get_image_urls(html):
    """Return every captcha image path found in ``html``.

    A path has the form ``/ValidCodeImage.aspx?id=`` followed by exactly 14
    non-whitespace characters. Matches are returned in document order; the
    list is empty when nothing matches.
    """
    # Raw string so the backslashes reach the regex engine untouched; the dot
    # is escaped so it only matches the literal "." of the file name.
    pattern = re.compile(r'/ValidCodeImage\.aspx\?id=\S{14}')
    return pattern.findall(html)

def download_file(url, outpath):
    """Download one captcha image into the directory ``outpath``.

    ``url`` is a server-relative path; it is appended to the passport host
    to form the download address. The file is named after the current local
    time (``YYYYmmddHHMMSS.jpg``). ``outpath`` is created when it is missing.
    """
    if not os.path.exists(outpath):
        # makedirs also creates missing parent directories.
        os.makedirs(outpath)

    stamp = time.strftime('%Y%m%d%H%M%S')
    tmp_file_name = outpath + '/' + stamp + '.jpg'
    # The timestamp only has one-second resolution: add a counter instead of
    # overwriting an image that was saved earlier within the same second.
    duplicate = 0
    while os.path.exists(tmp_file_name):
        duplicate += 1
        tmp_file_name = outpath + '/' + stamp + '_' + str(duplicate) + '.jpg'

    urllib.urlretrieve('http://passport.cnblogs.com' + url, tmp_file_name)

if __name__ == '__main__':
    # Page whose HTML is scanned for captcha image paths.
    url = 'http://passport.cnblogs.com/register.aspx?ReturnUrl=http://www.cnblogs.com/'
    # Directory that receives the downloaded captcha images.
    outpath = 'valid_code'

    # Fetch the page 1000 times, saving every captcha path found in each
    # response, and pause one second between requests.
    for _ in range(1000):
        html = get_html(url)
        for image_url in get_image_urls(html):
            download_file(image_url, outpath)
        time.sleep(1)

爬取的验证码如下:

//=============================================

分割验证码中的数字并分类,作为用于k-近邻算法识别数字的训练集。分割的代码如下,主要使用了一些图像处理的方法,比如腐蚀等

#-*- encoding: utf-8 -*-
'''
Created on 2014年5月14日
@author: jsy
'''

import os
import cv2
def segment_num(image):
    """Locate candidate digit regions in a captcha image.

    The image is blurred, binarised with an Otsu threshold and inverted, then
    the bounding box of every contour is collected. Boxes whose height is
    below half the image height (floor division) are discarded.

    Args:
        image: image array handed to ``cv2.GaussianBlur``; the callers in
            this file load it as a single-channel grayscale image.

    Returns:
        A list of ``(x, y, width, height)`` tuples as produced by
        ``cv2.boundingRect``, in the order the contours were returned.
    """
    blured_image = cv2.GaussianBlur(image, (3, 3), 0)
    # THRESH_OTSU makes OpenCV choose the threshold itself, so the 128 passed
    # here is only a placeholder.
    retval, binary_image = cv2.threshold(
        blured_image, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    # Invert so the dark strokes become the non-zero pixels that
    # findContours traces.
    binary_image = cv2.bitwise_not(binary_image)

    # findContours returns (contours, hierarchy) in OpenCV 2.x/4.x but
    # (image, contours, hierarchy) in 3.x; the contours are always the
    # second-to-last element.
    found = cv2.findContours(binary_image, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)
    contours = found[-2]

    # Floor division keeps the cut-off an integer on every Python version.
    min_height = image.shape[0] // 2
    num_rects = []
    for contour in contours:
        rect = cv2.boundingRect(contour)
        if rect[3] >= min_height:
            num_rects.append(rect)
    return num_rects


if __name__ == '__main__':
    in_path = 'valid_code'
    out_path = 'segmented_numbers'
    # The output directory has to exist before the crops can be written.
    if not os.path.exists(out_path):
        os.makedirs(out_path)

    nums = 0  # running index used to name the cropped digit images
    for f in os.listdir(in_path):
        if not f.endswith('jpg'):
            continue
        print(f)
        # Flag 0 loads the file as a single-channel grayscale image.
        image = cv2.imread(in_path + '/' + f, 0)
        if image is None:
            # imread returns None instead of raising when a file cannot be
            # decoded; skip it rather than crash inside segment_num.
            continue
        for x, y, w, h in segment_num(image):
            tmp_file = out_path + '/' + str(nums) + '.jpg'
            nums += 1
            cv2.imwrite(tmp_file, image[y:y + h, x:x + w])
        

分割出数字的效果(已经分类好的0):

然后做好分类:

然后使用kNN算法识别数字,代码如下:

#-*- encoding: utf-8 -*-
'''
Created on 2014年5月13日
@author: jsy
'''
import os, sys
from numpy import *
import operator

# Every digit image is resized to COM_WIDTH x COM_HEIGHT and then flattened
# into a single row of COM_WIDTH * COM_HEIGHT values (see autoNorm/autoNorm2).
COM_WIDTH = 32
COM_HEIGHT = 32
# Per label directory, the first TRAINING_NUMS images that loadDataSet visits
# go to the training set; any further images become testing samples.
TRAINING_NUMS = 100

def createDataSet():
    """Return a tiny hand-made data set for trying out ``kNN``.

    Returns:
        A ``(group, labels)`` pair: ``group`` is a 4x2 array of points and
        ``labels`` holds the class name (``'A'`` or ``'B'``) of each row.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels

def autoNorm(filename):
    """Load an image file and turn it into a normalised feature row.

    Args:
        filename: path of the image; it is read as grayscale (flag ``0``).

    Returns:
        An array of shape ``(1, COM_WIDTH * COM_HEIGHT)``.

    Raises:
        IOError: if OpenCV cannot read ``filename``.
    """
    # Function-local import, as in the original: importing this module does
    # not require OpenCV, only calling this function does.
    import cv2
    gray = cv2.imread(filename, 0)
    if gray is None:
        # imread reports failure by returning None; fail here with a clear
        # message instead of deep inside cv2.resize.
        raise IOError('cannot read image: %s' % filename)
    return autoNorm2(gray)


def autoNorm2(gray):
    """Turn an already-loaded grayscale image into a normalised feature row.

    The image is resized to ``COM_WIDTH`` x ``COM_HEIGHT``, converted to
    floating point, normalised with OpenCV's default ``normalize`` settings
    and flattened.

    Args:
        gray: image array accepted by ``cv2.resize``.

    Returns:
        An array of shape ``(1, COM_WIDTH * COM_HEIGHT)``.
    """
    import cv2
    gray_resize = cv2.resize(gray, (COM_WIDTH, COM_HEIGHT))
    # Multiplying by 1.0 promotes the pixel values to floating point before
    # normalisation.
    gray_resize = gray_resize * 1.0
    # dst is passed explicitly (None = allocate a new array): newer OpenCV
    # Python bindings reject the call when the argument is omitted.
    gray_norm = cv2.normalize(gray_resize, None)
    return reshape(gray_norm, (1, COM_WIDTH * COM_HEIGHT))


def loadDataSet(path):
    """Build training and testing sets from a directory of labelled images.

    ``path`` must contain one sub-directory per class, named with the integer
    label of the images inside it. Within each sub-directory the first
    ``TRAINING_NUMS`` images (in sorted name order) become training samples
    and the remaining ones testing samples. Files named ``Thumbs.db`` are
    ignored.

    Args:
        path: root directory of the labelled images.

    Returns:
        ``(trainingGroup, trainingLabels, testingGroup, testingLabels)``:
        four lists holding the feature rows and their integer labels.
    """
    trainingGroup = []
    trainingLabels = []
    testingGroup = []
    testingLabels = []
    # os.listdir returns entries in arbitrary order; sorting makes the
    # training/testing split reproducible between runs.
    for n in sorted(os.listdir(path)):
        label_dir = path + '/' + n
        trainingNum = 0
        for tn in sorted(os.listdir(label_dir)):
            if tn == 'Thumbs.db':
                # Thumbnail cache file, not an image sample.
                continue
            tmp_norm_num = autoNorm(label_dir + '/' + tn)
            if trainingNum >= TRAINING_NUMS:
                testingGroup.append(tmp_norm_num[0, :])
                testingLabels.append(int(n))
            else:
                trainingGroup.append(tmp_norm_num[0, :])
                trainingLabels.append(int(n))
            trainingNum += 1
    return trainingGroup, trainingLabels, testingGroup, testingLabels

def kNN_distance(inX, inY):
    """Unimplemented placeholder: ignores both arguments and returns None."""
    return None


def kNN(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training samples.

    Distances are Euclidean.

    Args:
        inX: feature vector to classify; a sequence or array that broadcasts
            against the rows of ``dataSet`` (length ``d`` or shape ``(1, d)``).
        dataSet: 2-D array with one training sample per row.
        labels: sequence holding the label of each row of ``dataSet``.
        k: number of neighbours that vote. When ``k`` exceeds the number of
            training samples, all samples vote.

    Returns:
        The label that occurs most often among the voting neighbours.

    Raises:
        ValueError: if ``dataSet`` has no rows or ``k`` is smaller than 1.
    """
    dataSetSize = dataSet.shape[0]  # number of training samples
    if dataSetSize == 0 or k < 1:
        raise ValueError('kNN needs at least one training sample and k >= 1')

    # Broadcasting subtracts inX from every row; no explicit tiling needed.
    diffMat = dataSet - asarray(inX)
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5
    sortedDistIndicies = distances.argsort()  # nearest sample first

    # Count how often each label occurs among the k nearest samples. The
    # slice silently stops at the end of the data when k is too large.
    classCount = {}
    for index in sortedDistIndicies[:k]:
        voteIlabel = labels[index]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # items() instead of the Python 2-only iteritems(); the most frequent
    # label sorts to the front.
    sortedClassCount = sorted(
        classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
    
if __name__ == '__main__':
    path = 'trainingData'
    k = 10
    pnum = [0] * 10  # per-digit count of correctly recognised test samples
    nnum = [0] * 10  # per-digit count of wrongly recognised test samples
    trainingGroup, trainingLabels, testingGroup, testingLabels = loadDataSet(path)
    # Round-trip the training set through text files so that it is available
    # on disk for later reuse.
    savetxt('trainingGroup.txt', trainingGroup)
    savetxt('trainingLabels.txt', trainingLabels)
    trainingGroup = loadtxt('trainingGroup.txt')
    trainingLabels = loadtxt('trainingLabels.txt')

    for sample, expected in zip(testingGroup, testingLabels):
        # Labels come back from loadtxt as floats; convert before comparing.
        result = int(kNN(sample, trainingGroup, trainingLabels, k))
        print('%s %s' % (result, expected))
        if result != expected:
            nnum[int(expected)] += 1
        else:
            pnum[int(expected)] += 1

    # Single-argument print calls behave the same on Python 2 and Python 3.
    print('positive nums: %s' % (pnum,))
    print('negative nums: %s' % (nnum,))
    total = len(testingGroup)
    if total:
        print('准确率: %s' % (1.0 * sum(pnum) / total))
        print('错误率: %s' % (1.0 * sum(nnum) / total))
    else:
        # No directory held more than TRAINING_NUMS images, so there is
        # nothing to compute a rate from.
        print('no testing samples found')

识别的结果如下:

positive nums是指每个数字对应识别对的个数

negative nums是指每个数字对应识别错的个数

准确率达到0.99214...

效果看起来不错

//====================================================================

然后就是集成上面的代码

#-*- encoding: utf-8 -*-
'''
Created on 2014年5月15日
@author: jsy
'''

import os
from numpy import *
import cv2
import segment_numbers
import kNN

if __name__ == '__main__':
    path = 'valid_code'
    # Training data written earlier by the kNN script.
    trainingGroup = loadtxt('trainingGroup.txt')
    trainingLabels = loadtxt('trainingLabels.txt')

    for f in os.listdir(path):
        gray = cv2.imread(path + '/' + f, 0)
        if gray is None:
            # imread returns None for files it cannot decode; skip them.
            continue
        # Sorting the (x, y, w, h) boxes orders them by x first, i.e. from
        # left to right, which is the reading order of the digits.
        num_rects = sorted(segment_numbers.segment_num(gray))
        digits = []
        for x, y, w, h in num_rects:
            sub_gray = gray[y:y + h, x:x + w]
            sub_gray_norm = kNN.autoNorm2(sub_gray)
            rr = kNN.kNN(sub_gray_norm, trainingGroup, trainingLabels, 10)
            digits.append(str(int(rr)))
        result = ''.join(digits)
        cv2.imshow('sample', gray)
        print(result)
        # Keep the window up for one second before moving to the next image.
        cv2.waitKey(1000)

整个识别验证码的效果如下:

//================================================

打包代码:http://pan.baidu.com/s/1qWx0Nm8