初识python之 APP store排行榜蜘蛛抓取(二)

首页 > 代码库 > 初识python之 APP store排行榜蜘蛛抓取(二)

初识python之 APP store排行榜蜘蛛抓取(二)

2024-07-19 16:04:04 222人阅读

# -*- coding: utf-8 -*-
import datetime
import re
import time
import urllib
import urllib2

import MySQLdb

# from datetime import date


# ----------- App Store chart spider -----------
class Spider_Model:
    """Scrape an Apple App Store chart page and store each ranked app in MySQL.

    Written for Python 2.7 (``urllib2`` / ``MySQLdb``).  ``startWork`` is the
    entry point: it downloads a chart page, extracts the ranked entries,
    downloads every entry's detail page for the release date and the
    compatibility text, and inserts one row per app.
    """

    # The whole chart lives in one <section>; re.S lets "." span newlines.
    SECTION_RE = re.compile(
        r'<section class="section apps grid">(.*?)</section>', re.U | re.S)
    # Groups: rank, detail URL, logo URL, app name, category.
    # NOTE: the published copy of this script had "http://www.mamicode.com/"
    # in front of the href/src groups.  That looks like an artifact of the
    # hosting site's link rewriting and would prevent any match on Apple's
    # markup, so it has been removed -- confirm against the live page.
    APP_RE = re.compile(
        r'<li><strong>(.*?).</strong><a href="(.*?)".*?>'
        r'<img src="(.*?)".*?></a><h3><a.*?>(.*?)</a></h3>'
        r'<h4><a.*?>(.*?)</a></h4><a.*?>.*?</a></li>', re.U | re.S)
    # Numeric app id as it appears in the detail URL path ("/id123456?...").
    APPID_RE = re.compile(r'/id(\d+)')
    RELEASE_RE = re.compile(
        r'<li.*?class="release-date"><span.*?>.*?</span>(.*?)</li>',
        re.U | re.S)
    REQUIREMENTS_RE = re.compile(
        r'<span.*?class="app-requirements">.*?</span>(.*?)</p>', re.U | re.S)

    def __init__(self):
        self.page = 1
        self.pages = []
        self.enable = False

    def startWork(self, url, tabName):
        """Scrape the chart at ``url`` and insert one row per app.

        :param url: chart page to download.
        :param tabName: destination table.  It is concatenated into the
            statement (identifiers cannot be bound parameters), so it must
            come from trusted code, never from scraped data.
        """
        nowtime = int(time.time())
        content = self.GetCon(url)
        oneItems = self.Match(content)  # first-level fields from the chart
        time.sleep(1)
        # Scraped values are sent as bound parameters: a quote inside an app
        # name can then neither break the statement nor inject SQL.
        sql = ("INSERT INTO " + tabName +
               "(`rank`,`detailurl`,`logo`,`name`,`type`,`appid`,"
               "`appstoretime`,`compatible`,`ctime`) "
               "values(%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        for item in oneItems:
            content_two = self.GetCon(item[1])
            release_date, compatible = self.Match_two(content_two)
            item.append([release_date, compatible])
            fabutime = self._release_timestamp(release_date)
            params = (item[0], item[1], item[2], item[3], item[4], item[5],
                      fabutime, compatible, nowtime)
            self.contentDb(sql, params)
            time.sleep(1)  # pause between detail-page requests

    def _release_timestamp(self, release_date):
        """Convert a scraped release date to a Unix timestamp; 0 if unusable."""
        if release_date == '0':  # sentinel produced by is_empty()
            return 0
        try:
            parsed = time.strptime(release_date.strip(), '%Y年%m月%d日')
        except ValueError:
            # One oddly formatted date should not abort the whole crawl.
            print("Unrecognised release date: %r" % release_date)
            return 0
        return int(time.mktime(parsed))

    def GetCon(self, url):
        """Download ``url`` and return the raw response body.

        Returns an empty string when the server answers with an HTTP error;
        the error body is printed.  Other exceptions propagate.
        """
        # Browser-like headers: the site rejects the default urllib2 agent.
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        }
        req = urllib2.Request(url, headers=headers)
        try:
            response = urllib2.urlopen(req)
        except urllib2.HTTPError as e:
            # The original author notes this handler is required: even with
            # the browser headers a 403 can still come back (cause unknown).
            print(e.fp.read())
            # Return nothing instead of reusing a previous (or undefined)
            # response object.
            return ''
        try:
            return response.read()
        finally:
            response.close()

    def Match(self, con):
        """Extract the chart entries from the chart page HTML.

        :returns: a list of ``[rank, detail_url, logo_url, name, category,
            appid]`` lists; empty when the chart section is not found.
        """
        sections = self.SECTION_RE.findall(con)
        if not sections:
            return []
        items = []
        for rank, href, logo, name, category in self.APP_RE.findall(sections[0]):
            href = href.replace("\n", "")
            # Search for "/id<digits>" rather than splitting on "id", which
            # picks the wrong piece when "id" occurs earlier in the URL.
            appid = self.APPID_RE.search(href)
            items.append([
                rank.replace("\n", ""),
                href,
                logo.replace("\n", ""),
                name.replace("\n", "").split('-')[0],
                category.replace("\n", ""),
                appid.group(1) if appid else '0',
            ])
        return items

    def Match_two(self, con):
        """Extract ``[release_date, requirements]`` from a detail page.

        Each element is the string '0' when its pattern is not found.
        """
        release = self.is_empty(self.RELEASE_RE.findall(con))
        requirements = self.is_empty(self.REQUIREMENTS_RE.findall(con))
        return [release, requirements]

    def is_empty(self, param):
        """Return the first element of ``param``, or '0' when it is empty."""
        return param[0] if len(param) else '0'

    def contentDb(self, sql, params=None):
        """Execute ``sql`` on a fresh MySQL connection and commit.

        :param sql: statement to run; may contain ``%s`` placeholders.
        :param params: optional sequence of values bound to the placeholders
            by the driver.  Omit it to run ``sql`` verbatim.

        MySQL errors are printed, not raised.  The connection is always
        closed.
        """
        conn = None
        try:
            # The connection values below are placeholders left by the
            # author; replace them with real settings before running.
            conn = MySQLdb.connect(host="主机", user="用户", passwd="密码", db="表名", charset='utf8')
            cur = conn.cursor()
            cur.execute(sql, params)
            conn.commit()
        except MySQLdb.Error as e:
            print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
        finally:
            if conn is not None:
                conn.close()


# (chart URL, destination table) pairs to crawl.
addArr = [
    ["http://www.apple.com/jp/itunes/charts/free-apps/", 'cg_jp_free'],
    ["http://www.apple.com/jp/itunes/charts/paid-apps/", 'cg_jp_paid'],
]

myModel = Spider_Model()
for val in addArr:
    myModel.startWork(val[0], val[1])

初识Python，代码写得有点烂，自知罪孽深重......

python版本：2.7.5 测试环境：Linux、Windows

望高手拍砖带我一起装逼！一起飞！

初识python之 APP store排行榜蜘蛛抓取(二)

声明：以上内容来自用户投稿及互联网公开渠道收集整理发布，本网站不拥有所有权，未作人工编辑处理，也不承担相关法律责任。若内容有误或涉及侵权，可进行投诉：投诉/举报。工作人员会在5个工作日内联系你，一经查实，本站将立刻删除涉嫌侵权内容。

联系我们

首页 > 代码库 > 初识python之 APP store排行榜 蜘蛛抓取(二)

初识python之 APP store排行榜 蜘蛛抓取(二)

看完仍有疑问？有类似问题直接问程序猿

首页 > 代码库 > 初识python之 APP store排行榜蜘蛛抓取(二)

初识python之 APP store排行榜蜘蛛抓取(二)