cnblogs blog download / cnblogs blog export / cnblogs blog backup tool (based on Python)
Original CSDN backup code by infoworld: http://blog.csdn.net/infoworld/article/details/19547723
The code below is a cnblogs blog backup script adapted from infoworld's CSDN backup Python code linked above. It does not hook into infoworld's UI, so it can only be run directly from Python. Python really is fun to work with: development goes quickly, and it is no wonder the language is so popular.
#! encoding=utf-8
# cnblogs blog backup. Usage: edit the url and output values at the bottom of the script, then run it.
import urllib2
import re
import os
import sys
# from HTMLParser import HTMLParser
import html5lib
# from xml.etree.ElementTree import ElementTree
from urlparse import urlparse
import xml
import codecs
import traceback
import time

# class MyHTMLParser(HTMLParser):
#     def handle_starttag(self, tag, attrs):
#         # if tag.lower() == "img":
#         print "Encountered the beginning of a %s tag,attrs size %d" % (tag, len(attrs))
#         for x in attrs:
#             print "name %s,value %s" % (x[0], x[1])
#     def handle_endtag(self, tag):
#         print "Encountered the end of a %s tag" % tag
#     def handle_startendtag(self, tag, attrs):
#         print "Encountered the beginning of a %s tag,attrs size %d" % (tag, len(attrs))
#         for x in attrs:
#             print "name %s,value %s" % (x[0], x[1])

# Number of download attempts per resource.
gTestTime = 5

def DownloadFile(url, output):
    # Download a single resource (image/CSS/JS) into a local directory tree
    # under output that mirrors the resource's host and path.
    responseText = None
    dirssPath = None
    try:
        res = urlparse(url)
        url = res.scheme + "://" + res.netloc + res.path
        path = res.path
        index = path.rfind('/')
        dirss = "/"
        if index != -1:
            dirss = output + "/" + res.netloc.encode("utf-8") + path[0:index].encode("utf-8")
            dirssPath = output + "/" + res.netloc.encode("utf-8") + path.encode("utf-8")
            dirss_ansi = dirss.decode('utf-8')
            if not os.path.exists(dirss_ansi):
                os.makedirs(dirss_ansi)
        global gTestTime
        count = gTestTime
        while True:
            if count < 0:
                break
            count = count - 1
            header = {"User-Agent": "Mozilla-Firefox5.0"}
            if not url.startswith("http://"):
                break
            try:
                # print "url: %s:%d" % (url, count)
                time.sleep(0.5)
                request = urllib2.Request(url, None, header)
                response = urllib2.urlopen(request)
                dirssPath_ansi = dirssPath.decode("utf-8")
                if not os.path.exists(dirssPath_ansi):
                    resourceFile = open(dirssPath_ansi, "wb")
                    responseText = response.read()
                    if url.endswith(".js"):
                        # Strip scheme prefixes so saved pages load resources locally.
                        responseText = responseText.replace("http://", "")
                        responseText = responseText.replace("https://", "")
                    resourceFile.write(responseText)
                    resourceFile.close()
                break
            except Exception, e:
                print "DownloadFile: %s:%s:%d" % (e, url, count)
                # exstr = traceback.format_exc()
                # print exstr
    except Exception, e:
        pass
        # exstr = traceback.format_exc()
        # print exstr
    return (responseText, url, output)

def ReadCss(css):
    # Scan downloaded CSS text for url(...) references and download each referenced resource.
    # GetConcatUrl is defined in the part of the script that is cut off below.
    mode = r'url\("?([^)]+)"?\)'
    pattern = re.compile(mode)
    try:
        text = css[0]
        if css[0] == None:
            return
        strMatch = pattern.findall(text)
        size = len(strMatch)
        # print "size: ", size
        for i in range(0, size, 1):
            one = strMatch[i]
            newurl = GetConcatUrl(css[1], one)
            DownloadFile(newurl, css[2])
    except Exception, e:
        pass
        # exstr = traceback.format_exc()
        # print exstr

def Download(url, output):
    # Fetch one blog page; the original listing parses it with html5lib using the XHTML namespace.
    header = {"User-Agent": "Mozilla-Firefox5.0"}
    namespace = "{http://www.w3.org/1999/xhtml}"
    request = urllib2.Request(url, None, header)
    response = urllib2.urlopen(request)
    data = response.read()
    # The listing is truncated here in the source; the rest of Download and the
    # url/output settings at the bottom of the script are not shown.
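The listing above is cut off before the part that actually sets url and output. For reference, a minimal driver in the same Python 2 style might look like the sketch below. This is an assumption based on the usage note at the top of the script, not the author's original code, and the blog URL and output directory are placeholder values.

if __name__ == "__main__":
    # Hypothetical driver appended after the functions above (not the author's original code).
    # Replace these placeholder values with your own cnblogs post URL and backup directory.
    url = "http://www.cnblogs.com/yourname/p/1234567.html"
    output = "D:/cnblogs_backup"
    Download(url, output)

Download in the visible part of the listing handles a single page, so backing up a whole blog presumably means looping a call like this over every post URL.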