首页 > 代码库 > HtmlAgilityPack C#爬虫
HtmlAgilityPack C#爬虫
Main程序
/// <summary>
/// Console entry point: downloads each book page in a fixed id range, parses it
/// with <c>BookDataParser</c> and saves the linked text file to the working directory.
/// </summary>
class Program
{
    /// <summary>
    /// Crawls the configured id range and downloads one .txt file per successfully parsed page.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        var tmpStr1 = "http://www.****.com/txtxz/{0}/down.html";

        // WebClient is IDisposable; dispose it once the crawl is finished.
        using (WebClient wc = new WebClient())
        {
            for (int i = 54422; i < 54423; i++)
            {
                var str = String.Format(tmpStr1, i);
                try
                {
                    String result = wc.DownloadString(str);
                    BookDataParser parser = new BookDataParser(result, str);
                    var item = parser.GetBookData();

                    // A page without a usable download link is treated like a parse failure.
                    if (item == null || item.DownLink == null)
                    {
                        Console.WriteLine("fail to get in url{0}", i);
                        continue;
                    }

                    wc.DownloadFile(item.DownLink, ToSafeFileName(item.Title) + ".txt");
                    Console.WriteLine(item);
                }
                catch (WebException)
                {
                    // A single unreachable page must not abort the remaining ids.
                    Console.WriteLine("fail to get in url{0}", i);
                }
            }
        }

        Console.ReadKey();
    }

    /// <summary>
    /// Replaces every character that is not allowed in a file name with '_'.
    /// The title comes from a remote page, so it may contain path separators
    /// or other characters the file system rejects.
    /// </summary>
    /// <param name="name">Raw title; null is treated as an empty string.</param>
    /// <returns>A string that is safe to use as a file name (without extension).</returns>
    private static string ToSafeFileName(string name)
    {
        var parts = (name ?? String.Empty).Split(System.IO.Path.GetInvalidFileNameChars());
        return String.Join("_", parts);
    }
}
BookDataParser.cs
/// <summary>
/// Parses the HTML of a book download page into a <c>BookData</c> instance.
/// </summary>
public class BookDataParser
{
    private readonly String _content;   // raw HTML content
    private readonly HtmlDocument _doc; // HtmlAgilityPack document built from _content
    private readonly String _url;       // URL of the current page; only used to fill BookData.InfoLink

    /// <summary>
    /// Creates a parser for one downloaded page.
    /// </summary>
    /// <param name="content">HTML content of the page; may be null, in which case the page is reported as invalid.</param>
    /// <param name="url">URL the content was downloaded from.</param>
    public BookDataParser(string content, string url)
    {
        _url = url;
        _content = content;
        _doc = new HtmlDocument();

        // HtmlDocument.LoadHtml throws on a null argument; skipping the load lets
        // VaildContent() report null content instead of the constructor failing.
        if (content != null)
        {
            _doc.LoadHtml(content);
        }
    }

    /// <summary>
    /// Extracts title, author, category, upload time, description and download link from the page.
    /// </summary>
    /// <returns>
    /// The populated <c>BookData</c>, or null when the content is invalid or the
    /// expected page structure (title, info spans, download link) is missing.
    /// </returns>
    public BookData GetBookData()
    {
        if (!VaildContent())
        {
            return null;
        }

        var root = _doc.DocumentNode;
        var titleNode = root.SelectSingleNode("//div[@id='titlename']/h1");
        var infoNodes = root.SelectNodes("//div[@class='txt_info']/span");
        var linkNode = root.SelectSingleNode("//div[@class='pan_url']/a[last()]");

        // SelectSingleNode/SelectNodes return null when nothing matches.
        if (titleNode == null || infoNodes == null || infoNodes.Count < 2 || linkNode == null)
        {
            return null;
        }

        var titleTextNode = titleNode.ChildNodes.FirstOrDefault(n => n.Name == "#text");
        if (titleTextNode == null)
        {
            return null;
        }

        var bookData = new BookData();

        // Title: the heading's text up to the "TXT" marker, or the whole text when the marker is absent.
        var titleText = titleTextNode.InnerHtml;
        var markerIndex = titleText.IndexOf("TXT", StringComparison.Ordinal);
        bookData.Title = markerIndex >= 0 ? titleText.Substring(0, markerIndex) : titleText;

        // Author, category and upload date are the part after the colon in the info spans.
        bookData.AuthorName = AfterColon(infoNodes[0].InnerText);
        bookData.Class = AfterColon(infoNodes[1].InnerText);
        bookData.UploadTime = DateTime.Parse(AfterColon(infoNodes[infoNodes.Count - 1].InnerText));

        // Description
        var descNode = root.SelectSingleNode("//div[@class='infos_txt']");
        bookData.Description = GetDesc(descNode);

        // Download link: href of the last anchor in the link container.
        bookData.DownLink = linkNode.GetAttributeValue("href", null);

        // Info link
        bookData.InfoLink = _url;
        return bookData;
    }

    /// <summary>
    /// Returns the part of <paramref name="text"/> after the first ':' (the whole text when there is none).
    /// </summary>
    private static string AfterColon(string text)
    {
        return text.Substring(text.IndexOf(':') + 1);
    }

    /// <summary>
    /// Builds the description from the description node: non-blank text children are
    /// HTML-decoded and trimmed, and each &lt;br&gt; child becomes a newline.
    /// </summary>
    /// <param name="descNode">Node holding the description; null yields an empty description.</param>
    /// <returns>The book description.</returns>
    private string GetDesc(HtmlNode descNode)
    {
        if (descNode == null)
        {
            return String.Empty;
        }

        StringBuilder sb = new StringBuilder();
        foreach (var node in descNode.ChildNodes)
        {
            if (node.Name == "#text")
            {
                var str = node.InnerText;
                if (!String.IsNullOrWhiteSpace(str))
                {
                    sb.Append(HttpUtility.HtmlDecode(str).Trim());
                }
            }
            if (node.Name == "br")
            {
                sb.Append("\n");
            }
        }
        return sb.ToString();
    }

    /// <summary>
    /// Checks whether the content is usable: it must be non-null and must not be a
    /// page whose "blocktitle" div starts with the site's error text.
    /// </summary>
    /// <returns>True when the content can be parsed; false when it is null or an error page.</returns>
    public bool VaildContent()
    {
        if (_content == null)
        {
            return false;
        }

        // "@class" selects the attribute; a bare "class" would test a child element and never match.
        var node = _doc.DocumentNode.SelectSingleNode("//div[@class='blocktitle']");
        return node == null || !node.InnerText.StartsWith("出现错误", StringComparison.Ordinal);
    }
}
HtmlAgilityPack C#爬虫
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任。若内容有误或涉及侵权,可进行投诉:投诉/举报。工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。