首页 > 代码库 > HtmlAgilityPack C#爬虫

HtmlAgilityPack C#爬虫

Main程序

  class Program
    {
        /// <summary>
        /// Entry point: walks a range of book ids, downloads each book's
        /// "down.html" page, parses it with <see cref="BookDataParser"/> and
        /// saves the linked file as "&lt;title&gt;.txt" in the working directory.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            var tmpStr1 = "http://www.****.com/txtxz/{0}/down.html";

            // WebClient is IDisposable; release it deterministically.
            using (WebClient wc = new WebClient())
            {
                for (int i = 54422; i < 54423; i++)
                {
                    var str = String.Format(tmpStr1, i);
                    try
                    {
                        String result = wc.DownloadString(str);
                        BookDataParser parser = new BookDataParser(result, str);
                        var item = parser.GetBookData();

                        // The parser reads the href with a null default, so a
                        // missing link must be treated as a failed page too.
                        if (item == null || String.IsNullOrEmpty(item.DownLink))
                        {
                            Console.WriteLine("fail to get in url{0}", i);
                            continue;
                        }

                        wc.DownloadFile(item.DownLink, SafeFileName(item.Title) + ".txt");
                        Console.WriteLine(item);
                    }
                    catch (WebException ex)
                    {
                        // One unreachable page or file should not abort the whole run.
                        Console.WriteLine("fail to get in url{0}: {1}", i, ex.Message);
                    }
                }
            }

            Console.ReadKey();
        }

        /// <summary>
        /// Replaces every character that is not allowed in a file name with '_'.
        /// The title comes from scraped HTML and may contain such characters.
        /// </summary>
        /// <param name="name">Raw title; may be null or empty.</param>
        /// <returns>A string usable as a file name; empty when <paramref name="name"/> is null or empty.</returns>
        private static string SafeFileName(string name)
        {
            if (String.IsNullOrEmpty(name))
            {
                return String.Empty;
            }

            foreach (char invalid in System.IO.Path.GetInvalidFileNameChars())
            {
                name = name.Replace(invalid, '_');
            }
            return name;
        }
    }

BookDataParser.cs

/// <summary>
    /// Parses the HTML of a book download page into a <see cref="BookData"/>.
    /// </summary>
    public class BookDataParser 
    {
        // NOTE(review): the separator literal was lost in the original listing
        // (IndexOf() / IndexOf("")). The info spans are assumed to look like
        // "label:value" with an ASCII or full-width colon; confirm against a real page.
        private static readonly char[] LabelSeparators = { ':', ':' };

        private readonly String _content;    // raw HTML content
        private readonly HtmlDocument _doc;  // HtmlAgilityPack document built from _content
        private readonly String _url;        // current page URL, only used to fill BookData.InfoLink

        /// <summary>
        /// Creates a parser for one downloaded page.
        /// </summary>
        /// <param name="content">HTML content; when null, <see cref="GetBookData"/> returns null.</param>
        /// <param name="url">URL the content was downloaded from.</param>
        public BookDataParser(string content,string url)
        {
            _url = url;
            _content = content;
            _doc = new HtmlDocument();
            // Skip loading for null content so the null check in VaildContent
            // is reachable instead of failing here.
            if (content != null)
            {
                _doc.LoadHtml(content);
            }
        }

        /// <summary>
        /// Extracts title, author, class, upload time, description and download
        /// link from the page.
        /// </summary>
        /// <returns>
        /// The populated <see cref="BookData"/>, or null when the content is
        /// invalid, an expected node is missing, or the upload time cannot be parsed.
        /// </returns>
        public BookData GetBookData()
        {
            if (!VaildContent()) { return null; }

            var root = _doc.DocumentNode;

            // Look everything up first so a page with an unexpected layout is
            // rejected as a whole instead of throwing halfway through.
            var titleNode = root.SelectSingleNode("//div[@id='titlename']/h1");
            var infoNodes = root.SelectNodes("//div[@class='txt_info']/span");
            var descNode = root.SelectSingleNode("//div[@class='infos_txt']");
            var linkNode = root.SelectSingleNode("//div[@class='pan_url']/a[last()]");
            if (titleNode == null || infoNodes == null || infoNodes.Count < 2
                || descNode == null || linkNode == null)
            {
                return null;
            }

            var titleText = titleNode.ChildNodes.FirstOrDefault(n => n.Name == "#text");
            if (titleText == null)
            {
                return null;
            }

            // Upload time is read from the last info span.
            DateTime uploadTime;
            if (!DateTime.TryParse(AfterLabel(infoNodes[infoNodes.Count - 1].InnerText), out uploadTime))
            {
                return null;
            }

            var bookData = new BookData();

            // Title: the heading text up to the "TXT" marker; the whole text
            // when the marker is absent.
            var rawTitle = titleText.InnerHtml;
            var markerIndex = rawTitle.IndexOf("TXT", StringComparison.Ordinal);
            bookData.Title = markerIndex >= 0 ? rawTitle.Substring(0, markerIndex) : rawTitle;

            // Author is the first info span, class the second.
            bookData.AuthorName = AfterLabel(infoNodes[0].InnerText);
            bookData.Class = AfterLabel(infoNodes[1].InnerText);
            bookData.UploadTime = uploadTime;

            bookData.Description = GetDesc(descNode);
            // href may be absent; DownLink is null in that case.
            bookData.DownLink = linkNode.GetAttributeValue("href", null);
            bookData.InfoLink = _url;
            return bookData;
        }

        /// <summary>
        /// Returns the part of <paramref name="text"/> after the first label
        /// separator, or the whole text when no separator is present.
        /// </summary>
        /// <param name="text">Inner text of an info span.</param>
        /// <returns>The value following the separator.</returns>
        private static string AfterLabel(string text)
        {
            // IndexOfAny returns -1 when nothing matches, so +1 yields 0.
            return text.Substring(text.IndexOfAny(LabelSeparators) + 1);
        }

        /// <summary>
        /// Builds the description from the direct children of the description
        /// node: text nodes are HTML-decoded and trimmed, &lt;br&gt; becomes a newline.
        /// </summary>
        /// <param name="descNode">Node holding the description.</param>
        /// <returns>The book description.</returns>
        private string GetDesc(HtmlNode descNode)
        {
            StringBuilder sb = new StringBuilder();
            foreach (var node in descNode.ChildNodes)
            {
                if (node.Name == "#text")
                {
                    var str = node.InnerText;
                    if (!String.IsNullOrWhiteSpace(str))
                    {
                        sb.Append(HttpUtility.HtmlDecode(str).Trim());
                    }
                }
                else if (node.Name == "br")
                {
                    sb.Append("\n");
                }
            }
            return sb.ToString();
        }

        /// <summary>
        /// Checks whether the content can be parsed. (The method name keeps its
        /// original spelling so existing callers continue to compile.)
        /// </summary>
        /// <returns>
        /// False when the content is null or the first "blocktitle" div starts
        /// with the site's error text; true otherwise.
        /// </returns>
        public bool VaildContent()
        {
            if (_content == null)
            {
                return false;
            }

            var node = _doc.DocumentNode.SelectSingleNode("//div[@class='blocktitle']");
            // Valid when there is no block title, or it is not the error banner.
            return node == null || !node.InnerText.StartsWith("出现错误", StringComparison.Ordinal);
        }
    }

 

HtmlAgilityPack C#爬虫