首页 > 代码库 > Python爬虫(三)爬淘宝MM图片

Python爬虫(三)爬淘宝MM图片

直接上代码:

# python2
# -*- coding: utf-8 -*-

import urllib2
import re
import string
import os
import shutil

# Browser-like User-Agent header sent with every listing-page request.
_USER_AGENT = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
               'AppleWebKit/537.36 (KHTML, like Gecko) '
               'Chrome/58.0.3029.110 Safari/537.36')

# One match per profile card.  Captured groups, in order:
# image URL (without the leading "http://"), name, age, city, occupation.
# Compiled once here because the pattern is the same for every page.
_PROFILE_PATTERN = re.compile(
    r'<div class="personal-info">.*?<img src="http://(.*?)".*?'
    r'<a class="lady-name".*?>(.*?)</a>.*?<strong>(.*?)</strong>.*?'
    r'<span>(.*?)</span>.*?<em>(.*?)</em>',
    re.S)


def _read_url(target):
    """Return the raw body fetched from *target* and close the connection.

    *target* is anything ``urllib2.urlopen`` accepts (a URL string or a
    ``urllib2.Request``).
    """
    response = urllib2.urlopen(target)
    try:
        return response.read()
    finally:
        response.close()


def crawl_taobaoMM(baseUrl, start, end):
    """Crawl listing pages and save every profile's text record and picture.

    For each page number from ``start`` to ``end`` (both inclusive) the URL
    ``baseUrl + '?page=' + str(page)`` is fetched and decoded as GBK.  Every
    profile matched on the page is printed, appended to ``mm.txt`` as one
    UTF-8 line, and its picture is downloaded to ``mm_img/NNNNN.jpg``, where
    ``NNNNN`` is a zero-padded counter that keeps increasing across pages.

    Args:
        baseUrl: Listing URL without the ``?page=`` query string.
        start: First page number to fetch.
        end: Last page number to fetch (inclusive).

    Side effects:
        * ``mm_img`` is emptied (if present) and recreated on every call.
        * ``mm.txt`` is opened in append mode, so records written by earlier
          runs are kept.

    Network and decoding errors are not caught; they propagate to the caller.
    """
    imgDir = 'mm_img'
    # Start from an empty image directory.  The directory must be recreated
    # after rmtree, otherwise the first image write fails on a second run.
    if os.path.exists(imgDir):
        shutil.rmtree(imgDir)
    os.makedirs(imgDir)

    headers = {'user-agent': _USER_AGENT}
    picNumber = 0
    with open('mm.txt', 'a') as f:
        for i in range(start, end + 1):
            url = baseUrl + '?page=' + str(i)
            req = urllib2.Request(url, headers=headers)
            html = _read_url(req).decode('gbk')
            results = _PROFILE_PATTERN.findall(html)

            # Progress line.  NOTE(review): the extracted page shows no text
            # before the page number; a leading literal may have been lost.
            print(str(i) + '页...')
            for result in results:
                message = '%s %s %s %s %s\n' % (
                    result[0], result[1], result[2], result[3], result[4])
                print(picNumber)
                print(message)
                f.write(message.encode('utf-8'))

                # The pattern strips "http://"; the picture is fetched over
                # https instead.
                pic = _read_url('https://' + result[0])
                picName = os.path.join(imgDir, '%05d.jpg' % picNumber)
                with open(picName, 'wb') as pf:
                    pf.write(pic)
                picNumber += 1

# Crawl listing pages 1 through 10 (both inclusive).
crawl_taobaoMM('https://mm.taobao.com/json/request_top_list.htm', 1, 10)

爬下来的图片:

技术分享

 

参考资料:

Python爬虫实战四之抓取淘宝MM照片

 

Python爬虫(三)爬淘宝MM图片