How to backup your blogs on cnblogs
This script is an alternative to OfflineExplorer.
Thanks to the article [1] listed in the Reference section; I modified several lines to adapt it to my blog. Here is the change list:
1. In getAllListUrl(), change "homepage1_BottomPager" to "homepage1_HomePageDays_BottomPager", because "homepage1_BottomPager" does not appear anywhere in the source of my cnblogs pages. (A quick way to check which pager id your own blog uses is sketched after this list.)
2. At the bottom of the script, set url to one of your blog's list pages. It must contain a link to the last list page; see the comment above Start().
3. At the bottom of the script, set output to a directory on your local disk.
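For change 1, the id of the pager div depends on your blog template, so it is worth verifying before running the backup. Here is a minimal sketch (my own addition, not part of the original script) that fetches one list page and reports every id containing "BottomPager"; the helper name find_pager_id and the example url are mine:

import urllib2
import re

# Hypothetical helper: report which "*BottomPager*" ids a list page
# actually uses, so you know what to look up in getAllListUrl().
def find_pager_id(url):
    request = urllib2.Request(url, None, {"User-Agent": "Mozilla-Firefox5.0"})
    html = urllib2.urlopen(request).read()
    return re.findall(r'id="([^"]*BottomPager[^"]*)"', html)

print find_pager_id("http://www.cnblogs.com/yaoyansi/default.html?page=3")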
Enjoy it!
The modified script:

# -*- coding: utf-8 -*-

# cnblogs blog backup. Usage: edit url and output at the bottom, then run.

import urllib2
import re
import os
import html5lib
from urlparse import urlparse
import codecs
import time

# number of attempts per resource
gTestTime = 5

def DownloadFile(url, output):
    responseText = None
    dirssPath = None
    try:
        res = urlparse(url)
        url = res.scheme + "://" + res.netloc + res.path
        path = res.path
        index = path.rfind('/')
        dirss = "/"
        if index != -1:
            dirss = output + "/" + res.netloc.encode("utf-8") + path[0:index].encode("utf-8")
        dirssPath = output + "/" + res.netloc.encode("utf-8") + path.encode("utf-8")
        dirss_ansi = dirss.decode('utf-8')
        if not os.path.exists(dirss_ansi):
            os.makedirs(dirss_ansi)
        global gTestTime
        count = gTestTime
        while True:
            if count < 0:
                break
            count = count - 1
            header = {"User-Agent": "Mozilla-Firefox5.0"}
            if not url.startswith("http://"):
                break
            try:
                time.sleep(0.5)
                request = urllib2.Request(url, None, header)
                response = urllib2.urlopen(request)
                dirssPath_ansi = dirssPath.decode("utf-8")
                if not os.path.exists(dirssPath_ansi):
                    resourceFile = open(dirssPath_ansi, "wb")
                    responseText = response.read()
                    if url.endswith(".js"):
                        # strip scheme prefixes so saved pages reference local copies
                        responseText = responseText.replace("http://", "")
                        responseText = responseText.replace("https://", "")
                    resourceFile.write(responseText)
                    resourceFile.close()
                break
            except Exception, e:
                print "DownloadFile: %s:%s:%d" % (e, url, count)
    except Exception, e:
        pass
    return (responseText, url, output)

def ReadCss(css):
    # download every url(...) resource referenced by a stylesheet
    pattern = re.compile(r'url\("?([^)]+)"?\)')
    try:
        text = css[0]
        if text == None:
            return
        for one in pattern.findall(text):
            newurl = GetConcatUrl(css[1], one)
            DownloadFile(newurl, css[2])
    except Exception, e:
        pass

def Download(url, output):
    header = {"User-Agent": "Mozilla-Firefox5.0"}
    namespace = "{http://www.w3.org/1999/xhtml}"
    request = urllib2.Request(url, None, header)
    response = urllib2.urlopen(request)

    data = response.read()
    document = html5lib.parse(data)

    imgElements = document.findall('.//{0}img'.format(namespace))
    for img in imgElements:
        src = img.attrib["src"]
        try:
            res = urlparse(src)
            # skip images that are not hosted on cnblogs
            if not res.netloc.endswith(".cnblogs.com"):
                print "image not download: %s:%s" % (src, res.netloc)
                continue
        except Exception, e:
            pass
        DownloadFile(src, output)

    linkElements = document.findall('.//{0}link'.format(namespace))
    for link in linkElements:
        href = link.attrib["href"]
        text = DownloadFile(href, output)
        if link.attrib.has_key("rel") and link.attrib["rel"].lower() == "stylesheet":
            ReadCss(text)

    scriptElements = document.findall('.//{0}script'.format(namespace))
    for script in scriptElements:
        if script.attrib.has_key("src"):
            DownloadFile(script.attrib["src"], output)

    htmlName = GetHtmlName(url)
    output = output.decode("utf-8") + "/" + htmlName + ".htm"
    # strip scheme prefixes so the saved page references the local copies,
    # then restore the xhtml namespace URI, which must stay absolute
    data = data.replace("http://", "")
    data = data.replace("https://", "")
    data = data.replace("www.w3.org/1999/xhtml", "http://www.w3.org/1999/xhtml")

    resourceFile = open(output, "wb")
    resourceFile.write(data)
    resourceFile.close()

def GetConcatUrl(url, png):
    # resolve a relative reference like "../images/f_icon.png" against the
    # stylesheet url, e.g. http://static.csdn.net/public/common/toolbar/css/index.css
    count = 0
    index = png.find("..")
    startindex = None
    while index != -1:
        count = count + 1
        startindex = index + 2
        index = png.find("..", startindex)

    second = png[startindex:]
    index = url.rfind("/")
    endindex = 0
    while count >= 0 and index != -1:
        endindex = index
        index = url.rfind("/", 0, endindex)
        count = count - 1
    first = url[0:endindex]
    return first + second

def getAllListUrl(url):
    # read the pager on one list page and build the url of every list page
    header = {"User-Agent": "Mozilla-Firefox5.0"}
    request = urllib2.Request(url, None, header)
    response = urllib2.urlopen(request)
    data = response.read()

    # By default, the document will be an xml.etree element instance.
    # Whenever possible, html5lib chooses the accelerated ElementTree
    # implementation (i.e. xml.etree.cElementTree on Python 2.x).
    document = html5lib.parse(data)
    namespace = "{http://www.w3.org/1999/xhtml}"

    # get <div id="homepage1_HomePageDays_BottomPager" class="topicListFooter">
    pageList = document.findall('.//{0}div[@id=\'homepage1_HomePageDays_BottomPager\']'.format(namespace))
    print "Debug>len(pageList)=%d" % len(pageList)
    # get <div class="pager">
    alinks = list(pageList[0])
    # get the links inside <div class="pager">, like:
    # <a href="http://www.cnblogs.com/GnagWang/default.html?page=1">
    alinks1 = list(alinks[0])
    lastArticle = alinks1[len(alinks1) - 1]

    # lastArticleHref = u'http://www.cnblogs.com/GnagWang/default.html?page=20'
    lastArticleHref = lastArticle.attrib["href"]
    lastPageIndex = lastArticleHref.rfind("=")
    lastPageNum = int(lastArticleHref[lastPageIndex + 1:])
    urlInfo = lastArticleHref[0:lastPageIndex]

    urlList = []
    for x in xrange(1, lastPageNum + 1):
        urlList.append(urlInfo + "=" + str(x))

    return urlList

def getArticleList(url):
    # collect the url and title of every article from every list page
    urlList = getAllListUrl(url)
    print "number of list pages: ", len(urlList)
    header = {"User-Agent": "Mozilla-Firefox5.0"}

    allLists = []

    pageNum = 0
    global gTestTime
    for one in urlList:
        tryCount = gTestTime
        pageNum = pageNum + 1
        print "parsing list page %d" % pageNum

        while tryCount > 0:
            try:
                tryCount = tryCount - 1
                time.sleep(0.5)  # requesting too fast makes the server stop responding
                request = urllib2.Request(one, None, header)
                response = urllib2.urlopen(request)

                data = response.read()
                document = html5lib.parse(data, encoding="utf-8")
                namespace = "{http://www.w3.org/1999/xhtml}"
                articleLists = document.findall('.//{0}div[@class=\'postTitle\']'.format(namespace))
                allLists = allLists + articleLists
                break
            except Exception, e:
                print "getArticleList %s:%s:%d" % (e, one, tryCount)

    articles = []
    for article in allLists:
        alink = article.find(".//{0}a".format(namespace))
        # href = u'http://www.cnblogs.com/GnagWang/archive/2010/04/02/1702721.html'
        href = alink.attrib["href"]
        title = "".encode("utf-8")
        for x in alink.itertext():
            title = title + x.strip().encode("utf-8")
        articles.append([href, title])

    return articles

def GetUserName(url):
    # e.g. http://www.cnblogs.com/GnagWang/default.html?page=19 -> GnagWang
    htmlNameIndex = url.rfind("/")
    htmlNameIndex1 = url.rfind("/", 0, htmlNameIndex)
    return url[htmlNameIndex1 + 1:htmlNameIndex]

def GetHtmlName(url):
    # last path component of the url, ignoring a trailing slash
    htmlNameIndex = url.rfind("/")
    urlLen = len(url)
    if htmlNameIndex + 1 == urlLen:
        htmlNameIndex = url.rfind("/", 0, htmlNameIndex)
        htmlName = url[htmlNameIndex + 1:urlLen - 1]
    else:
        htmlName = url[htmlNameIndex + 1:]
    return htmlName

# url must look like http://www.cnblogs.com/GnagWang/default.html?page=19,
# and that page must contain a link to the last list page. E.g. if GnagWang
# has 20 pages in total, a URL like the one above is recommended.
def Start(url, output):
    print "backup started"
    lists = getArticleList(url)
    username = GetUserName(url)
    output_username = output + "/" + username
    output_username = output_username.replace("\\", "/")
    if not os.path.exists(output_username.decode("utf-8")):
        os.mkdir(output_username.decode("utf-8"))

    totalNum = len(lists)
    print "number of articles: %d" % totalNum

    # generate the index page: a frameset with the navigation on the left
    doctype = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'
    charset = '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />'
    indexHtml = output_username + ".htm"
    f = open(indexHtml.decode("utf-8"), "w")
    print >> f, doctype
    print >> f, '<html>'
    print >> f, '<head>'
    print >> f, charset
    print >> f, '</head>'
    print >> f, '<frameset cols="20%,*">'
    navigationHtmlName = username + '-navigation.htm'
    print >> f, '<frame src="' + navigationHtmlName + '" />'
    firstHtmlName = GetHtmlName(lists[0][0])
    print >> f, '<frame src="' + username + '/' + firstHtmlName + '.htm" name="showframe">'
    print >> f, '</frameset>'
    print >> f, '</html>'
    f.close()

    # generate the navigation frame with one link per article
    navigationHtml = output + "/" + navigationHtmlName
    f = codecs.open(navigationHtml.decode("utf-8"), "w", "utf-8-sig")
    print >> f, doctype
    print >> f, '<html>'
    print >> f, '<head>'
    print >> f, charset
    print >> f, '<style> body{font: 12px Verdana, Arial, Helvetica, sans-serif;}a{color: #808080;}</style>'
    print >> f, '</head>'
    print >> f, '<body>'
    count = 0
    for x in lists:
        count = count + 1
        articleIdHtml = username + "/" + GetHtmlName(x[0]) + ".htm"
        print >> f, '<a href="' + articleIdHtml + '" target="showframe">' + str(count) + '.' + x[1].decode("utf-8") + '</a><br /><br />'
    print >> f, '</body>'
    print >> f, '</html>'
    f.close()

    print "start downloading articles"
    currentNum = 0
    global gTestTime
    for x in lists:
        count = gTestTime
        currentNum = currentNum + 1
        while True:
            if count < 0:
                break
            count = count - 1
            try:
                time.sleep(1)  # requesting too fast makes the server return 503
                # printing the utf-8 title can occasionally fail with an
                # "output is not utf-8" error
                print "%d:%d.%s" % (totalNum, currentNum, x[1])
                print x[0]
                print "\n"
                Download(x[0], output_username)
                break
            except Exception, e:
                pass

if __name__ == '__main__':
    # url must be a list page that contains a link to the last list page
    url = "http://www.cnblogs.com/yaoyansi/default.html?page=3"
    # output = "C:/Users/apple/Desktop/新建文件夹"
    output = "/tmp/my_tmp/cnblogs"
    Start(url, output)
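Run the script with a Python 2 interpreter (it uses urllib2 and print statements, so Python 3 will not work) after installing the html5lib package, e.g. via pip install html5lib. Reading Start() and Download(), the output layout should look roughly like this; the file name cnblogs_backup.py is my assumption, and yaoyansi is just the username from the example url:

python2 cnblogs_backup.py

/tmp/my_tmp/cnblogs/
    yaoyansi.htm                 # frameset index, open this in a browser
    yaoyansi-navigation.htm      # left-hand list of article links
    yaoyansi/
        <article>.htm            # one page per article
        <host>/<path>/...        # downloaded images, css and js, by host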
Reference:
[1] http://blog.csdn.net/llrraa2010/article/details/35540845