首页 > 代码库 > 爬取当当网的图书信息之封装一个工具类
爬取当当网的图书信息之封装一个工具类
把这个类名取为Tool
封装一个下载网页的方法GetHtml
/// <summary>
/// Downloads the page at the given URL and returns its content as a string.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>
/// The downloaded text, or an empty string when the download fails for any reason
/// (the method is deliberately best-effort and never throws).
/// </returns>
public static string GetHtml(string url)
{
    try
    {
        // WebClient is IDisposable; dispose it so every call releases its resources.
        using (WebClient client = new WebClient())
        {
            return client.DownloadString(url);
        }
    }
    catch (Exception)
    {
        // Callers treat an empty string as "page unavailable".
        return "";
    }
}
传入的是这个网页的URL,这个方法能帮我们把网页下载下来
封装一个匹配图书类URL的方法
/// <summary>
/// Collects the distinct book-category URLs found in a page.
/// </summary>
/// <param name="html">Page source to scan; null or empty yields an empty list.</param>
/// <returns>
/// An <see cref="ArrayList"/> of URL strings in order of first appearance, without duplicates.
/// </returns>
public static ArrayList GetList(string html)
{
    ArrayList list = new ArrayList();
    if (string.IsNullOrEmpty(html))
    {
        return list;
    }

    // Dots are escaped so that only literal '.' separators match, not arbitrary characters.
    MatchCollection matches = Regex.Matches(
        html,
        "http://category\\.dangdang\\.com/cp[0-9]{2}\\.[0-9]{2}\\.[0-9][1-9]\\.00\\.00\\.00\\.html");

    // A hash set gives constant-time duplicate detection; the list keeps first-seen order.
    HashSet<string> seen = new HashSet<string>();
    foreach (Match match in matches)
    {
        if (seen.Add(match.Value))
        {
            list.Add(match.Value);
        }
    }

    return list;
}
这里使用了正则http://category.dangdang.com/cp[0-9]{2}.[0-9]{2}.[0-9][1-9].00.00.00.html来匹配URL地址。注意:正则中的“.”会匹配任意单个字符,若只想匹配字面上的点号,需要写成“\.”。
封装一个获取图书类名的方法
/// <summary>
/// Reads the book-category name out of the page's keywords meta tag.
/// </summary>
/// <param name="html">Page source to scan.</param>
/// <returns>
/// The part of the tag's content that follows the first "/" (for the sample tag below that is
/// "网络,家庭与办公室用书"), or an empty string when the tag is missing or nothing follows the slash.
/// </returns>
public static string GetBookClassName(string html)
{
    // Sample tag: <meta name="keywords" content="计算机/网络,家庭与办公室用书" />
    Match metaTag = Regex.Match(html, "<meta name=\"keywords\" content=\".{1,30}\" />");
    if (!metaTag.Success)
    {
        return "";
    }

    string tagText = metaTag.Value;
    int firstSlash = tagText.IndexOf("/");
    int closingBracket = tagText.LastIndexOf(">");
    int distance = closingBracket - firstSlash;

    // When the content holds no "/", the first slash found is the one in the trailing " />",
    // the distance is then too small and the result stays empty.
    if (distance <= 4)
    {
        return "";
    }

    // Drop the trailing "\" />" so only the text between the slash and the closing quote remains.
    return tagText.Substring(firstSlash + 1, distance - "\" />".Length);
}
查看网页的源代码
<meta name="keywords" content="计算机/网络,家庭与办公室用书" />
图书类名就在这里,接着我们使用正则把它抓取到。
接下来我们要抓取每个图书类别共有多少页
/// <summary>
/// Reads the total page count of a book category from the pager element of a listing page.
/// </summary>
/// <param name="html">Page source to scan.</param>
/// <returns>The page count shown in the pager, or 1 when no pager is found.</returns>
public static int GetPages(string html)
{
    const string countPrefix = "共";

    Match pager = Regex.Match(html, "<li class=\"page_input\"><span>共[0-9]{1,4}页 到第</span>");
    if (!pager.Success)
    {
        return 1;
    }

    string pagerText = pager.Value;
    int prefixAt = pagerText.IndexOf(countPrefix, 0);
    int suffixAt = pagerText.IndexOf("页", prefixAt);
    if (prefixAt <= 0 || suffixAt <= 0)
    {
        return 1;
    }

    // The digits sit between the "共" prefix and the "页" suffix.
    int digitsStart = prefixAt + countPrefix.Length;
    string digits = pagerText.Substring(digitsStart, suffixAt - digitsStart);
    return int.Parse(digits);
}
处理好BookClass接下来处理Book了
获取图书详细页面的URL
/// <summary>
/// Collects the distinct book detail-page URLs found in a listing page.
/// </summary>
/// <param name="html">Page source to scan; null or empty yields an empty list.</param>
/// <returns>
/// An <see cref="ArrayList"/> of URL strings in order of first appearance, without duplicates.
/// </returns>
/// <remarks>Every match, including repeats, is also written to the console.</remarks>
public static ArrayList GetProduct(string html)
{
    // Sample URL: http://product.dangdang.com/22862060.html
    ArrayList list = new ArrayList();
    if (string.IsNullOrEmpty(html))
    {
        return list;
    }

    // Dots are escaped so that only literal '.' separators match, not arbitrary characters.
    MatchCollection matches = Regex.Matches(html, "http://product\\.dangdang\\.com/[0-9]{8}\\.html");

    // A hash set gives constant-time duplicate detection; the list keeps first-seen order.
    HashSet<string> seen = new HashSet<string>();
    foreach (Match match in matches)
    {
        string url = match.Value;
        Console.WriteLine(url);
        if (seen.Add(url))
        {
            list.Add(url);
        }
    }

    return list;
}
封装一个方法,待爬虫获取到图书详细页之后,用它来抓取图书信息
/// <summary>
/// Extracts the book details from the source of a book detail page.
/// </summary>
/// <param name="html">Source of the detail page.</param>
/// <returns>
/// A dictionary with exactly six entries: 1 = book name, 2 = price, 3 = author, 4 = publisher,
/// 5 = cover image URL, 6 = description. A field that cannot be found is an empty string,
/// except the price, which defaults to "0".
/// </returns>
public static Dictionary<int, string> analysis(string html)
{
    string bookName = "";
    string price = "0";
    string imgurl = "";

    // Price: the text that follows the currency-sign span.
    Match priceMatch = Regex.Match(html, " <span class=\"yen\">¥</span>.{1,4}.[0-9]{2}");
    if (priceMatch.Success)
    {
        const string closingSpan = "</span>";
        string fragment = priceMatch.Value;
        int spanAt = fragment.IndexOf(closingSpan, StringComparison.Ordinal);
        if (spanAt >= 0)
        {
            price = fragment.Substring(spanAt + closingSpan.Length);
        }
    }

    // Book name: the text after "《" in the page title, up to "》" or, failing that,
    // up to the fixed title suffix.
    Match titleMatch = Regex.Match(html, "<title>.*</title>");
    if (titleMatch.Success)
    {
        string title = titleMatch.Value;
        string name = SliceBetween(title, "《", "》");
        if (name == null)
        {
            name = SliceBetween(title, "《", "【简介_书评_在线阅读】 - 当当图书");
        }
        if (name != null)
        {
            bookName = name;
        }
    }

    // Author and publisher are the link texts of the anchors tagged with dd_name.
    // Sample: <a href="http://www.dangdang.com/author/Marty_1" target="_blank" dd_name="作者">Marty</a>
    string author = SliceBetween(html, "target=\"_blank\" dd_name=\"作者\">", "</a>") ?? "";
    string publisher = SliceBetween(html, "target=\"_blank\" dd_name=\"出版社\">", "</a>") ?? "";

    // Cover image: the first matching image URL on the page; dots are escaped so only literal '.' matches.
    // Sample: <img src="http://img3x6.ddimg.cn/88/36/23845426-1_u_5.jpg" alt="" height="800" width="800">
    Match imageMatch = Regex.Match(
        html,
        "http://img3x[0-9]\\.ddimg\\.cn/[0-9]{2}/[0-9]{2}/[0-9]{8}-[0-9]_u_[0-9]\\.jpg");
    if (imageMatch.Success)
    {
        imgurl = imageMatch.Value;
    }

    // Description: the content attribute of the description meta tag.
    string content = SliceBetween(html, "<meta name=\"description\" content=\"", "\">") ?? "";

    Dictionary<int, string> dictionary = new Dictionary<int, string>();
    dictionary.Add(1, bookName);
    dictionary.Add(2, price);
    dictionary.Add(3, author);
    dictionary.Add(4, publisher);
    dictionary.Add(5, imgurl);
    dictionary.Add(6, content);
    return dictionary;
}

// Returns the text between the first startMarker and the next endMarker after it,
// or null when either marker is missing.
private static string SliceBetween(string text, string startMarker, string endMarker)
{
    // Ordinal search: markers are markup, and offsets below rely on startMarker.Length being exact.
    int markerAt = text.IndexOf(startMarker, StringComparison.Ordinal);
    if (markerAt < 0)
    {
        return null;
    }

    // Search for the end marker only after the start marker so the slice length is never negative.
    int sliceStart = markerAt + startMarker.Length;
    int sliceEnd = text.IndexOf(endMarker, sliceStart, StringComparison.Ordinal);
    if (sliceEnd < 0)
    {
        return null;
    }

    return text.Substring(sliceStart, sliceEnd - sliceStart);
}
Tool类完成
爬取当当网的图书信息之封装一个工具类
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。