
How to backup your blogs on cnblogs

This is an alternative to OfflineExplorer.

 

Thanks to the article[1] listed in the Reference section. I modified several lines to adapt it to my blog; here is the change list:

1. L193: change "homepage1_BottomPager" to "homepage1_HomePageDays_BottomPager", because "homepage1_BottomPager" does not appear anywhere in the page source of my cnblogs blog. (If your theme renders yet another id, see the sketch after this list for a quick way to find it.)

2. L394: set url to the last list page of your own blog (a URL of the form http://www.cnblogs.com/<username>/default.html?page=N; as the comment in the script notes, that page must contain the link to the last page).

3. L396: set output to a directory on your local disk where the backup should be written.
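To check which pager id your own theme renders (for change 1 above), you can dump the candidate div ids of one list page. This is a minimal sketch, not part of the original script; it assumes the same Python 2 / urllib2 / html5lib stack the script below uses, and the URL is a placeholder to replace with your own blog:

import urllib2
import html5lib

# Fetch one list page and print every div id that looks like a pager,
# so you can see which id to put into the findall at L193.
url = "http://www.cnblogs.com/yourname/default.html?page=1"  # placeholder: use your own blog
request = urllib2.Request(url, None, {"User-Agent": "Mozilla-Firefox5.0"})
data = urllib2.urlopen(request).read()
document = html5lib.parse(data)
namespace = "{http://www.w3.org/1999/xhtml}"
for div in document.findall(".//{0}div".format(namespace)):
    divId = div.attrib.get("id", "")
    if "BottomPager" in divId:
        print divId  # e.g. homepage1_HomePageDays_BottomPager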

Enjoy it!

  1 #! encoding=utf-8
  2 
  3 # cnblogs blog backup. Usage: edit url and output at the bottom, then run the script.
  4 
  5 import urllib2
  6 import re
  7 import os
  8 import sys
  9 # from HTMLParser import HTMLParser
 10 import html5lib
 11 # from xml.etree.ElementTree import ElementTree
 12 from urlparse import urlparse
 13 import xml
 14 import codecs
 15 import traceback
 16 import time
 17 
 18 # class MyHTMLParser(HTMLParser):
 19 
 20 #     def handle_starttag(self, tag, attrs):
 21 #         # if tag.lower() == "img":
 22 #             print "Encountered the beginning of a %s tag,attrs size %d" % (tag ,len(attrs))
 23 #             for x in attrs:
 24 #                 print "name %s,value %s" % (x[0],x[1])
 25 #     def handle_endtag(self, tag):
 26 #         print "Encountered the end of a %s tag" % tag
 27 
 28 #     def handle_startendtag(self, tag, attrs):
 29 #         print "Encountered the beginning of a %s tag,attrs size %d" % (tag ,len(attrs))
 30 #         for x in attrs:
 31 #             print "name %s,value %s" % (x[0],x[1])
 32 
 33 # number of attempts per resource
 34 gTestTime = 5
 35 
 36 def DownloadFile(url,output):
 37   responseText = None
 38   dirssPath = None
 39   try:
 40     res = urlparse(url)
 41     url = res.scheme+"://"+res.netloc+res.path
 42     path = res.path
 43     index = path.rfind("/")
 44     dirss = "/"
 45     if index != -1:
 46       dirss = output + "/" + res.netloc.encode("utf-8") + path[0:index].encode("utf-8")
 47       dirssPath = output + "/" + res.netloc.encode("utf-8") + path.encode("utf-8")
 48       dirss_ansi = dirss.decode("utf-8")
 49       if not os.path.exists(dirss_ansi):
 50         os.makedirs(dirss_ansi)
 51     global gTestTime
 52     count = gTestTime
 53     while True:
 54       if count < 0:
 55         break
 56       count = count - 1
 57       header={"User-Agent": "Mozilla-Firefox5.0"}
 58       if not url.startswith("http://"):
 59         break
 60       try:
 61         # print "url: %s:%d" % (url,count)
 62         time.sleep(0.5)
 63         request = urllib2.Request(url,None,header)
 64         response = urllib2.urlopen(request)
 65         dirssPath_ansi = dirssPath.decode("utf-8")
 66         if not os.path.exists(dirssPath_ansi):
 67           resourceFile = open(dirssPath_ansi,"wb")
 68           responseText = response.read()
 69           if url.endswith(".js"):
 70             responseText = responseText.replace("http://","")
 71             responseText = responseText.replace("https://","")
 72           resourceFile.write(responseText)
 73           resourceFile.close()
 74         break
 75       except Exception,e:
 76         print "DownloadFile: %s:%s:%d" % (e,url,count)
 77         # pass
 78         # exstr = traceback.format_exc()
 79         # print exstr
 80 
 81   except Exception,e:
 82       pass
 83       # exstr = traceback.format_exc()
 84       # print exstr
 85 
 86   return (responseText,url,output)
 87 
 88 def ReadCss(css):
 89   # print "ReadCss"
 90   mode = 'url\(\"?([^)]+)\"?\)'
 91   pattern = re.compile(mode)
 92   try:
 93     text = css[0]
 94     if css[0] == None:
 95       return
 96     strMatch = pattern.findall(text)
 97     size = len(strMatch)
 98     # print "size: ",size
 99     for i in range(0,size,1):
100       one = strMatch[i]
101       newurl = GetConcatUrl(css[1],one)
102       DownloadFile(newurl,css[2])
103   except Exception,e:
104       pass
105       # exstr = traceback.format_exc()
106       # print exstr
107 
108 def Download(url,output):
109   # try:
110   header={"User-Agent": "Mozilla-Firefox5.0"}
111   namespace = "{http://www.w3.org/1999/xhtml}"
112   request = urllib2.Request(url,None,header)
113   response = urllib2.urlopen(request)
114 
115   data = response.read()
116   document = html5lib.parse(data)
117   imgElements = document.findall('.//{0}img'.format(namespace))
118   # print "imgElements %d" % len(imgElements)
119   for img in imgElements:
120     src = img.attrib["src"]
121     # print "src %s" % src
122     try:
123       res = urlparse(src)
124       # do not download images that are not hosted on cnblogs
125       if not res.netloc.endswith(".cnblogs.com"):
126         print "image not download: %s:%s" % (src,res.netloc)
127         continue
128     except Exception,e:
129       pass
130     DownloadFile(src,output)
131 
132   linkElements = document.findall('.//{0}link'.format(namespace))
133   # print "linkElements %d" % len(linkElements)
134   for link in linkElements:
135     href = link.attrib["href"]
136     # print "href %s" % href
137     text = DownloadFile(href,output)
138     if link.attrib.has_key("rel") and link.attrib["rel"].lower() == "stylesheet":
139       ReadCss(text)
140 
141   scriptElements = document.findall('.//{0}script'.format(namespace))
142   # print "scriptElements %d" % len(scriptElements)
143   for script in scriptElements:
144     if script.attrib.has_key("src"):
145       src = script.attrib["src"]
146       # print "src %s" % src
147       DownloadFile(src,output)
148 
149   htmlNameIndex = url.rfind("/");
150   urlLen = len(url)
151   htmlName = GetHtmlName(url)
152   output = output.decode("utf-8") + "/"+htmlName+".htm"
153   data = data.replace("http://","")
154   data = data.replace("https://","")
155   data = data.replace("www.w3.org/1999/xhtml","http://www.w3.org/1999/xhtml")
156 
157   resourceFile = open(output,"wb")
158   resourceFile.write(data)
159   resourceFile.close()
160 
161 def GetConcatUrl(url,png):
162   # one: "../images/f_icon.png" -- url http://static.csdn.net/public/common/toolbar/css/index.css
163   count = 0
164   index = png.find("..")
165   startindex = None
166   while index != -1:
167     count = count + 1;
168     startindex = index + 2
169     index = png.find("..",startindex)
170 
171   second = png[startindex:]
172   length = len(url)
173   index = url.rfind("/")
174   endindex = 0
175   while count >= 0 and index != -1:
176     endindex = index
177     index = url.rfind("/",0, endindex)
178     count = count - 1
179   first = url[0:endindex]
180   return first+second
181 
182 def getAllListUrl(url):
183   header={"User-Agent": "Mozilla-Firefox5.0"}
184   request = urllib2.Request(url,None,header)
185   response = urllib2.urlopen(request)
186   data = response.read()
187 
188   # By default, the document will be an xml.etree element instance. Whenever possible, html5lib chooses the accelerated ElementTree implementation (i.e. xml.etree.cElementTree on Python 2.x).
189   document = html5lib.parse(data)
190   namespace = "{http://www.w3.org/1999/xhtml}"
191 
192   # get <div id="homepage1_BottomPager" class="topicListFooter">
193   pageList = document.findall('.//{0}div[@id=\'homepage1_HomePageDays_BottomPager\']'.format(namespace))
194   print( "Debug>len(pageList)=%d"%len(pageList) );
195   # get <div class="pager">
196   alinks = list(pageList[0])
197   # get content in <div class="pager">, like: <a href="http://www.cnblogs.com/GnagWang/default.html?page=1">
198   alinks1 = list(alinks[0])
199   lastArticle = alinks1[len(alinks1)-1]
200 
201   # lastArticleHref = u'http://www.cnblogs.com/GnagWang/default.html?page=20'
202   lastArticleHref = lastArticle.attrib["href"]
203   lastPageIndex = lastArticleHref.rfind("=")
204   lastPageNum = int(lastArticleHref[lastPageIndex+1:])
205   urlInfo = lastArticleHref[0:lastPageIndex]
206 
207   urlList = []
208   for x in xrange(1,lastPageNum+1):
209     listUrl = urlInfo+"="+str(x)
210     urlList.append(listUrl)
211 
212   return urlList
213 
214 
215 def getArticleList(url):
216   # collect the URLs of all articles
217   # <div id="article_toplist" class="list"></div>
218   # <div id="article_list" class="list"
219 
220   # <div class="list_item article_item"
221 
222   # <div class="article_title">
223   # <span class="ico ico_type_Original"></span>
224   # <h1>
225   # <span class="link_title">
226   # <a href="/infoworld/article/details/18984183">
227 
228   # <div class="article_manage">
229   # <span class="link_postdate"></span>
230 
231   urlList = getAllListUrl(url)
232   print "文章页数(number of pages) ",len(urlList)
233   header={"User-Agent": "Mozilla-Firefox5.0"}
234 
235   allLists = []
236 
237   strPage = "分析 第 {0} 页 ".decode("utf-8").encode("utf-8")
238   pageNum = 0
239   global gTestTime
240   for one in urlList:
241     tryCount = gTestTime # try count
242     pageNum = pageNum + 1
243     pageNumStr = strPage.format(pageNum)
244     print pageNumStr
245 
246     while tryCount > 0:
247       try:
248         tryCount = tryCount - 1
249         time.sleep(0.5) # the server stops responding if requests come too fast
250         request = urllib2.Request(one,None,header)
251         response = urllib2.urlopen(request)
252 
253         data = response.read()
254         document = html5lib.parse(data,encoding="utf-8")
255         namespace = "{http://www.w3.org/1999/xhtml}"
256         # .//{0}div[@id=\'article_toplist\']
257         #topLists = document.findall('.//{0}div[@id=\'article_toplist\']/{0}div[@class=\'list_item article_item\']'.format(namespace))
258         #articleLists = document.findall('.//{0}div[@id=\'article_list\']/{0}div[@class=\'list_item article_item\']'.format(namespace))
259         articleLists = document.findall('.//{0}div[@class=\'postTitle\']'.format(namespace))
260         allLists = allLists + articleLists
261         break
262       except Exception, e:
263         print "getArticleList %s:%s:%d" % (e,one,tryCount)
264 
265 
266   count = 0 # number of articles
267   artices = []
268   for article in allLists:
269     count = count+1
270     alink = article.find(".//{0}a".format(namespace))
271     # href = u'http://www.cnblogs.com/GnagWang/archive/2010/04/02/1702721.html'
272     href = alink.attrib["href"]
273     #oneHref = "http://blog.csdn.net"+href
274     oneHref = href
275 
276     childElement = list(alink)
277     linkIter = alink.itertext()
278     title = "".encode("utf-8")
279     for x in linkIter:
280       title = title+x.strip().encode("utf-8")
281     artices.append([oneHref,title])
282 
283   return artices
284 
285 def GetUserName(url):
286   htmlNameIndex = url.rfind("/");
287   urlLen = len(url)
288   htmlName = ""
289   htmlNameIndex1 = url.rfind("/",0,htmlNameIndex)
290   htmlName = url[htmlNameIndex1+1:htmlNameIndex]
291   # if htmlNameIndex+1 == urlLen:
292   #   htmlNameIndex = url.rfind("/",0,htmlNameIndex)
293   #   htmlName = url[htmlNameIndex+1:urlLen-1]
294   # else:
295   #   htmlName = url[htmlNameIndex+1:]
296   return htmlName
297 
298 
299 def GetHtmlName(url):
300   htmlNameIndex = url.rfind("/");
301   urlLen = len(url)
302   htmlName = ""
303   if htmlNameIndex+1 == urlLen:
304     htmlNameIndex = url.rfind("/",0,htmlNameIndex)
305     htmlName = url[htmlNameIndex+1:urlLen-1]
306   else:
307     htmlName = url[htmlNameIndex+1:]
308   return htmlName
309 
310 
311 
312 # url must look like http://www.cnblogs.com/GnagWang/default.html?page=19, and that page must contain the link to the last page. For example, if GnagWang has 20 pages in total, a URL like the one above is recommended.
313 def Start(url,output):
314 
315   print "备份开始"
316   lists = getArticleList(url)
317   username = GetUserName(url)
318   output_username = output+"/"+username
319   output_username.replace("\\","/")
320   if not os.path.exists(output_username.decode("utf-8")):
321     os.mkdir(output_username.decode("utf-8"))
322 
323   totalNum = len(lists)
324   print "总文章数(number of articles): %d" % totalNum
325 
326   # generate the index page
327   doctype = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'
328   charset = '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />'
329   indexHtml = output_username + ".htm"
330   f = open(indexHtml.decode("utf-8"),"w")
331   print >> f,doctype
332   print >> f,'<html>'
333   print >> f,'<head>'
334   print >> f,charset
335   print >> f,'</head>'
336   print >> f,'<frameset cols=\"20%,*\">'
337   navigationHtmlName = username+'-navigation.htm'
338   print >> f,'<frame src=\"'+navigationHtmlName+'\" />'
339   firstHtmlName = GetHtmlName(lists[0][0])
340   print >> f,'<frame src=\"'+username+'/'+firstHtmlName+'.htm\" name=\"showframe\">'
341   print >> f,'</frameset>'
342   print >> f,'</html>'
343   f.close()
344 
345   # generate the navigation page
346   navigationHtml = output+"/"+navigationHtmlName
347   # f = open(navigationHtml.decode("utf-8"),"w")
348   f = codecs.open(navigationHtml.decode("utf-8"),"w","utf-8-sig")
349   print >> f,doctype
350   print >> f,'<html>'
351   print >> f,'<head>'
352   print >> f,charset
353   print >> f,'<style> body{font: 12px Verdana, Arial, Helvetica, sans-serif;}a{color: #808080;}</style>'
354   print >> f,'</head>'
355   print >> f,'<body>'
356   count = 0
357   for x in lists:
358     count = count + 1
359     articleIdHtml = username+"/"+GetHtmlName(x[0])+".htm"
360     print >> f,'<a href=\"'+articleIdHtml+'\" target=\"showframe\">'+str(count)+'.'+x[1].decode("utf-8")+'</a><br /><br />'
361   print >> f,'</body>'
362   print >> f,'</html>'
363   f.close()
364 
365   print "开始下载文章"
366   currentNum = 0
367   strPage = "{0}:{1}.".decode("utf-8").encode("utf-8")
368   global gTestTime
369   for x in lists:
370     count = gTestTime
371     currentNum = currentNum+1
372     while True:
373       if count < 0:
374         break
375       count = count - 1
376       try:
377         time.sleep(1) # too-fast access makes csdn return 503 errors
378         strPageTemp = strPage.format(totalNum,currentNum)
379         strPageTemp = strPageTemp+x[1]
380         print strPageTemp # this print sometimes fails with an "output is not utf-8" error when run standalone
381 
382         print x[0]
383         print "\n"
384         Download(x[0],output_username)
385         break
386       except Exception, e:
387         # exstr = traceback.format_exc()
388         # print exstr
389         pass
390 
391 
392 # url must look like http://www.cnblogs.com/GnagWang/default.html?page=19, and that page must contain the link to the last page. For example, if GnagWang has 20 pages in total, a URL like the one above is recommended.
393 if __name__=='__main__':
394   url = "http://www.cnblogs.com/yaoyansi/default.html?page=3"
395   #output = "C:/Users/apple/Desktop/新建文件夹"
396   output = "/tmp/my_tmp/cnblogs"
397   Start(url,output)
398   # Download("http://blog.csdn.net/dcraw/article/details/6858820",
399   #          "C:/Users/apple/Desktop/新建文件夹/infoworld")
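A note on running the script: it is Python 2 only (urllib2 and print statements) and requires the html5lib package. Save it to a file (the name is up to you), set url and output at L394/L396 as described above, and run it with a Python 2 interpreter. Judging from the code, the backup ends up under output as <username>.htm (a frameset index), <username>-navigation.htm, and a <username>/ directory holding one .htm per article together with the downloaded images, CSS, and JS.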

 

Reference:

[1] http://blog.csdn.net/llrraa2010/article/details/35540845
