
How to backup your blogs on cnblogs

This is an alternative to OfflineExplorer.

 

Thanks to the article[1] listed in the Reference section. I modified several lines to adapt it to my blog; here is the change list:

1. L193: change "homepage1_BottomPager" to "homepage1_HomePageDays_BottomPager", because "homepage1_BottomPager" does not appear anywhere in the page source of my cnblogs blog. (If your theme renders yet another id, see the sketch after this list for a quick way to find it.)

2. L394: set url to the last list page of your own blog (a URL of the form http://www.cnblogs.com/<username>/default.html?page=N; as the comment in the script notes, that page must contain the link to the last page).

3. L396: set output to a directory on your local disk where the backup should be written.
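To check which pager id your own theme renders (for change 1 above), you can dump the candidate div ids of one list page. This is a minimal sketch, not part of the original script; it assumes the same Python 2 / urllib2 / html5lib stack the script below uses, and the URL is a placeholder to replace with your own blog:

import urllib2
import html5lib

# Fetch one list page and print every div id that looks like a pager,
# so you can see which id to put into the findall at L193.
url = "http://www.cnblogs.com/yourname/default.html?page=1"  # placeholder: use your own blog
request = urllib2.Request(url, None, {"User-Agent": "Mozilla-Firefox5.0"})
data = urllib2.urlopen(request).read()
document = html5lib.parse(data)
namespace = "{http://www.w3.org/1999/xhtml}"
for div in document.findall(".//{0}div".format(namespace)):
    divId = div.attrib.get("id", "")
    if "BottomPager" in divId:
        print divId  # e.g. homepage1_HomePageDays_BottomPager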

Enjoy it!

  1 #! encoding=utf-8
  2 
  3 # cnblogs blog backup. Usage: edit url and output at the bottom, then run the script.
  4 
  5 import urllib2
  6 import re
  7 import os
  8 import sys
  9 # from HTMLParser import HTMLParser
 10 import html5lib
 11 # from xml.etree.ElementTree import ElementTree
 12 from urlparse import urlparse
 13 import xml
 14 import codecs
 15 import traceback
 16 import time
 17 
 18 # class MyHTMLParser(HTMLParser):
 19 
 20 #     def handle_starttag(self, tag, attrs):
 21 #         # if tag.lower() == "img":
 22 #             print "Encountered the beginning of a %s tag,attrs size %d" % (tag ,len(attrs))
 23 #             for x in attrs:
 24 #                 print "name %s,value %s" % (x[0],x[1])
 25 #     def handle_endtag(self, tag):
 26 #         print "Encountered the end of a %s tag" % tag
 27 
 28 #     def handle_startendtag(self, tag, attrs):
 29 #         print "Encountered the beginning of a %s tag,attrs size %d" % (tag ,len(attrs))
 30 #         for x in attrs:
 31 #             print "name %s,value %s" % (x[0],x[1])
 32 
 33 # number of attempts per resource
 34 gTestTime = 5
 35 
 36 def DownloadFile(url,output):
 37   responseText = None
 38   dirssPath = None
 39   try:
 40     res = urlparse(url)
 41     url = res.scheme+"://"+res.netloc+res.path
 42     path = res.path
 43     index = path.rfind("/")
 44     dirss = "/"
 45     if index != -1:
 46       dirss = output + "/" + res.netloc.encode("utf-8") + path[0:index].encode("utf-8")
 47       dirssPath = output + "/" + res.netloc.encode("utf-8") + path.encode("utf-8")
 48       dirss_ansi = dirss.decode("utf-8")
 49       if not os.path.exists(dirss_ansi):
 50         os.makedirs(dirss_ansi)
 51     global gTestTime
 52     count = gTestTime
 53     while True:
 54       if count < 0:
 55         break
 56       count = count - 1
 57       header={"User-Agent": "Mozilla-Firefox5.0"}
 58       if not url.startswith("http://"):
 59         break
 60       try:
 61         # print "url: %s:%d" % (url,count)
 62         time.sleep(0.5)
 63         request = urllib2.Request(url,None,header)
 64         response = urllib2.urlopen(request)
 65         dirssPath_ansi = dirssPath.decode("utf-8")
 66         if not os.path.exists(dirssPath_ansi):
 67           resourceFile = open(dirssPath_ansi,"wb")
 68           responseText = response.read()
 69           if url.endswith(".js"):
 70             responseText = responseText.replace("http://","")
 71             responseText = responseText.replace("https://","")
 72           resourceFile.write(responseText)
 73           resourceFile.close()
 74         break
 75       except Exception,e:
 76         print "DownloadFile: %s:%s:%d" % (e,url,count)
 77         # pass
 78         # exstr = traceback.format_exc()
 79         # print exstr
 80 
 81   except Exception,e:
 82       pass
 83       # exstr = traceback.format_exc()
 84       # print exstr
 85 
 86   return (responseText,url,output)
 87 
 88 def ReadCss(css):
 89   # print "ReadCss"
 90   mode = 'url\(\"?([^)]+)\"?\)'
 91   pattern = re.compile(mode)
 92   try:
 93     text = css[0]
 94     if css[0] == None:
 95       return
 96     strMatch = pattern.findall(text)
 97     size = len(strMatch)
 98     # print "size: ",size
 99     for i in range(0,size,1):
100       one = strMatch[i]
101       newurl = GetConcatUrl(css[1],one)
102       DownloadFile(newurl,css[2])
103   except Exception,e:
104       pass
105       # exstr = traceback.format_exc()
106       # print exstr
107 
108 def Download(url,output):
109   # try:
110   header={"User-Agent": "Mozilla-Firefox5.0"}
111   namespace = "{http://www.w3.org/1999/xhtml}"
112   request = urllib2.Request(url,None,header)
113   response = urllib2.urlopen(request)
114 
115   data = response.read()
116   document = html5lib.parse(data)
117   imgElements = document.findall('.//{0}img'.format(namespace))
118   # print "imgElements %d" % len(imgElements)
119   for img in imgElements:
120     src = img.attrib["src"]
121     # print "src %s" % src
122     try:
123       res = urlparse(src)
124       # do not download images that are not hosted on cnblogs
125       if not res.netloc.endswith(".cnblogs.com"):
126         print "image not download: %s:%s" % (src,res.netloc)
127         continue
128     except Exception,e:
129       pass
130     DownloadFile(src,output)
131 
132   linkElements = document.findall('.//{0}link'.format(namespace))
133   # print "linkElements %d" % len(linkElements)
134   for link in linkElements:
135     href = link.attrib["href"]
136     # print "href %s" % href
137     text = DownloadFile(href,output)
138     if link.attrib.has_key("rel") and link.attrib["rel"].lower() == "stylesheet":
139       ReadCss(text)
140 
141   scriptElements = document.findall('.//{0}script'.format(namespace))
142   # print "scriptElements %d" % len(scriptElements)
143   for script in scriptElements:
144     if script.attrib.has_key("src"):
145       src = script.attrib["src"]
146       # print "src %s" % src
147       DownloadFile(src,output)
148 
149   htmlNameIndex = url.rfind("/");
150   urlLen = len(url)
151   htmlName = GetHtmlName(url)
152   output = output.decode("utf-8") + "/"+htmlName+".htm"
153   data = data.replace("http://","")
154   data = data.replace("https://","")
155   data = data.replace("www.w3.org/1999/xhtml","http://www.w3.org/1999/xhtml")
156 
157   resourceFile = open(output,"wb")
158   resourceFile.write(data)
159   resourceFile.close()
160 
161 def GetConcatUrl(url,png):
162   # one: "../images/f_icon.png" -- url http://static.csdn.net/public/common/toolbar/css/index.css
163   count = 0
164   index = png.find("..")
165   startindex = None
166   while index != -1:
167     count = count + 1;
168     startindex = index + 2
169     index = png.find("..",startindex)
170 
171   second = png[startindex:]
172   length = len(url)
173   index = url.rfind("/")
174   endindex = 0
175   while count >= 0 and index != -1:
176     endindex = index
177     index = url.rfind("/",0, endindex)
178     count = count - 1
179   first = url[0:endindex]
180   return first+second
181 
182 def getAllListUrl(url):
183   header={"User-Agent": "Mozilla-Firefox5.0"}
184   request = urllib2.Request(url,None,header)
185   response = urllib2.urlopen(request)
186   data = response.read()
187 
188   # By default, the document will be an xml.etree element instance. Whenever possible, html5lib chooses the accelerated ElementTree implementation (i.e. xml.etree.cElementTree on Python 2.x).
189   document = html5lib.parse(data)
190   namespace = "{http://www.w3.org/1999/xhtml}"
191 
192   # get <div id="homepage1_BottomPager" class="topicListFooter">
193   pageList = document.findall('.//{0}div[@id=\'homepage1_HomePageDays_BottomPager\']'.format(namespace))
194   print( "Debug>len(pageList)=%d"%len(pageList) );
195   # get <div class="pager">
196   alinks = list(pageList[0])
197   # get content in <div class="pager">, like: <a href="http://www.cnblogs.com/GnagWang/default.html?page=1">
198   alinks1 = list(alinks[0])
199   lastArticle = alinks1[len(alinks1)-1]
200 
201   # lastArticleHref = u'http://www.cnblogs.com/GnagWang/default.html?page=20'
202   lastArticleHref = lastArticle.attrib["href"]
203   lastPageIndex = lastArticleHref.rfind("=")
204   lastPageNum = int(lastArticleHref[lastPageIndex+1:])
205   urlInfo = lastArticleHref[0:lastPageIndex]
206 
207   urlList = []
208   for x in xrange(1,lastPageNum+1):
209     listUrl = urlInfo+"="+str(x)
210     urlList.append(listUrl)
211 
212   return urlList
213 
214 
215 def getArticleList(url):
216   # collect the URLs of all articles
217   # <div id="article_toplist" class="list"></div>
218   # <div id="article_list" class="list"
219 
220   # <div class="list_item article_item"
221 
222   # <div class="article_title">
223   # <span class="ico ico_type_Original"></span>
224   # <h1>
225   # <span class="link_title">
226   # <a href="/infoworld/article/details/18984183">
227 
228   # <div class="article_manage">
229   # <span class="link_postdate"></span>
230 
231   urlList = getAllListUrl(url)
232   print "文章页数(number of pages) ",len(urlList)
233   header={"User-Agent": "Mozilla-Firefox5.0"}
234 
235   allLists = []
236 
237   strPage = "分析 第 {0} 页 ".decode("utf-8").encode("utf-8")
238   pageNum = 0
239   global gTestTime
240   for one in urlList:
241     tryCount = gTestTime # try count
242     pageNum = pageNum + 1
243     pageNumStr = strPage.format(pageNum)
244     print pageNumStr
245 
246     while tryCount > 0:
247       try:
248         tryCount = tryCount - 1
249         time.sleep(0.5) # the server stops responding if requests come too fast
250         request = urllib2.Request(one,None,header)
251         response = urllib2.urlopen(request)
252 
253         data = response.read()
254         document = html5lib.parse(data,encoding="utf-8")
255         namespace = "{http://www.w3.org/1999/xhtml}"
256         # .//{0}div[@id=\'article_toplist\']
257         #topLists = document.findall('.//{0}div[@id=\'article_toplist\']/{0}div[@class=\'list_item article_item\']'.format(namespace))
258         #articleLists = document.findall('.//{0}div[@id=\'article_list\']/{0}div[@class=\'list_item article_item\']'.format(namespace))
259         articleLists = document.findall('.//{0}div[@class=\'postTitle\']'.format(namespace))
260         allLists = allLists + articleLists
261         break
262       except Exception, e:
263         print "getArticleList %s:%s:%d" % (e,one,tryCount)
264 
265 
266   count = 0 # number of articles
267   artices = []
268   for article in allLists:
269     count = count+1
270     alink = article.find(".//{0}a".format(namespace))
271     # href = u'http://www.cnblogs.com/GnagWang/archive/2010/04/02/1702721.html'
272     href = alink.attrib["href"]
273     #oneHref = "http://blog.csdn.net"+href
274     oneHref = href
275 
276     childElement = list(alink)
277     linkIter = alink.itertext()
278     title = "".encode("utf-8")
279     for x in linkIter:
280       title = title+x.strip().encode("utf-8")
281     artices.append([oneHref,title])
282 
283   return artices
284 
285 def GetUserName(url):
286   htmlNameIndex = url.rfind("/");
287   urlLen = len(url)
288   htmlName = ""
289   htmlNameIndex1 = url.rfind("/",0,htmlNameIndex)
290   htmlName = url[htmlNameIndex1+1:htmlNameIndex]
291   # if htmlNameIndex+1 == urlLen:
292   #   htmlNameIndex = url.rfind("/",0,htmlNameIndex)
293   #   htmlName = url[htmlNameIndex+1:urlLen-1]
294   # else:
295   #   htmlName = url[htmlNameIndex+1:]
296   return htmlName
297 
298 
299 def GetHtmlName(url):
300   htmlNameIndex = url.rfind("/");
301   urlLen = len(url)
302   htmlName = ""
303   if htmlNameIndex+1 == urlLen:
304     htmlNameIndex = url.rfind("/",0,htmlNameIndex)
305     htmlName = url[htmlNameIndex+1:urlLen-1]
306   else:
307     htmlName = url[htmlNameIndex+1:]
308   return htmlName
309 
310 
311 
312 # url must look like http://www.cnblogs.com/GnagWang/default.html?page=19, and that page must contain the link to the last page. For example, if GnagWang has 20 pages in total, a URL like the one above is recommended.
313 def Start(url,output):
314 
315   print "备份开始"
316   lists = getArticleList(url)
317   username = GetUserName(url)
318   output_username = output+"/"+username
319   output_username.replace("\\","/")
320   if not os.path.exists(output_username.decode("utf-8")):
321     os.mkdir(output_username.decode("utf-8"))
322 
323   totalNum = len(lists)
324   print "总文章数(number of articles): %d" % totalNum
325 
326   # generate the index page
327   doctype = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'
328   charset = '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />'
329   indexHtml = output_username + ".htm"
330   f = open(indexHtml.decode("utf-8"),"w")
331   print >> f,doctype
332   print >> f,'<html>'
333   print >> f,'<head>'
334   print >> f,charset
335   print >> f,'</head>'
336   print >> f,'<frameset cols=\"20%,*\">'
337   navigationHtmlName = username+'-navigation.htm'
338   print >> f,'<frame src=\"'+navigationHtmlName+'\" />'
339   firstHtmlName = GetHtmlName(lists[0][0])
340   print >> f,'<frame src=\"'+username+'/'+firstHtmlName+'.htm\" name=\"showframe\">'
341   print >> f,'</frameset>'
342   print >> f,'</html>'
343   f.close()
344 
345   # generate the navigation page
346   navigationHtml = output+"/"+navigationHtmlName
347   # f = open(navigationHtml.decode("utf-8"),"w")
348   f = codecs.open(navigationHtml.decode("utf-8"),"w","utf-8-sig")
349   print >> f,doctype
350   print >> f,'<html>'
351   print >> f,'<head>'
352   print >> f,charset
353   print >> f,'<style> body{font: 12px Verdana, Arial, Helvetica, sans-serif;}a{color: #808080;}</style>'
354   print >> f,'</head>'
355   print >> f,'<body>'
356   count = 0
357   for x in lists:
358     count = count + 1
359     articleIdHtml = username+"/"+GetHtmlName(x[0])+".htm"
360     print >> f,'<a href=\"'+articleIdHtml+'\" target=\"showframe\">'+str(count)+'.'+x[1].decode("utf-8")+'</a><br /><br />'
361   print >> f,'</body>'
362   print >> f,'</html>'
363   f.close()
364 
365   print "开始下载文章"
366   currentNum = 0
367   strPage = "{0}:{1}.".decode("utf-8").encode("utf-8")
368   global gTestTime
369   for x in lists:
370     count = gTestTime
371     currentNum = currentNum+1
372     while True:
373       if count < 0:
374         break
375       count = count - 1
376       try:
377         time.sleep(1) # too-fast access makes csdn return 503 errors
378         strPageTemp = strPage.format(totalNum,currentNum)
379         strPageTemp = strPageTemp+x[1]
380         print strPageTemp # this print sometimes fails with an "output is not utf-8" error when run standalone
381 
382         print x[0]
383         print "\n"
384         Download(x[0],output_username)
385         break
386       except Exception, e:
387         # exstr = traceback.format_exc()
388         # print exstr
389         pass
390 
391 
392 # url must look like http://www.cnblogs.com/GnagWang/default.html?page=19, and that page must contain the link to the last page. For example, if GnagWang has 20 pages in total, a URL like the one above is recommended.
393 if __name__=='__main__':
394   url = "http://www.cnblogs.com/yaoyansi/default.html?page=3"
395   #output = "C:/Users/apple/Desktop/新建文件夹"
396   output = "/tmp/my_tmp/cnblogs"
397   Start(url,output)
398   # Download("http://blog.csdn.net/dcraw/article/details/6858820",
399   #          "C:/Users/apple/Desktop/新建文件夹/infoworld")
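A note on running the script: it is Python 2 only (urllib2 and print statements) and requires the html5lib package. Save it to a file (the name is up to you), set url and output at L394/L396 as described above, and run it with a Python 2 interpreter. Judging from the code, the backup ends up under output as <username>.htm (a frameset index), <username>-navigation.htm, and a <username>/ directory holding one .htm per article together with the downloaded images, CSS, and JS.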

 

Reference:

[1] http://blog.csdn.net/llrraa2010/article/details/35540845
