首页 > 代码库 > C# 访问一个页面,并根据该页面上的a标签递归访问每个页面
C# 访问一个页面,并根据该页面上的a标签递归访问每个页面
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.IO;
using System.Net;
using HtmlAgilityPack;

namespace SimpleWebRobot
{
    /// <summary>
    /// Console crawler: reads start URLs from list.txt, loads each page and recursively
    /// follows the same-host &lt;a href&gt; links found on it. Progress and errors are
    /// reported through the Output helper (defined elsewhere in the project) together
    /// with a StreamWriter opened on log.txt.
    /// </summary>
    class Program
    {
        /// <summary>Number of distinct links a visit was attempted for.</summary>
        public static int LINKCOUNT = 0;

        /// <summary>Number of visits that ended in an exception.</summary>
        public static int ERRCOUNT = 0;

        // The published listing shows a raw line break inside this literal, which does not
        // compile; it is written as an escape sequence here.
        // NOTE(review): presumably the original escape was "\r\n" - confirm against the real source.
        private const string ExceptionPrefix = "Exception!! \r\n:";

        /// <summary>
        /// Entry point: loads the start links from list.txt, crawls each of them and
        /// writes a summary line with the visit/error counters.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            Output.WriteLine("Createing log ...");
            LINKCOUNT = 0;
            StreamWriter fs = new StreamWriter("log.txt");
            try
            {
                Output.WriteLine("Start at :" + DateTime.Now.ToString(), fs);

                // Load the start links, one URL per line.
                Output.WriteLine("Loading start links list ...", fs);
                IList<string> linkList = new List<string>();
                using (StreamReader sr = new StreamReader("list.txt"))
                {
                    string line;
                    while ((line = sr.ReadLine()) != null)
                    {
                        line = line.Trim();
                        if (line.Length == 0)
                        {
                            // A blank line is not a URL; skipping it avoids a bogus error count.
                            continue;
                        }

                        linkList.Add(line);
                        Output.WriteLine("URL added:\"" + line + "\"", fs);
                    }
                }
                Output.WriteLine("Loading link complate.", fs);

                // Crawl every start link; the visited set is shared so that a page
                // reachable from several start links is only loaded once.
                ISet<string> visitedList = new HashSet<string>();
                foreach (string link in linkList)
                {
                    DoVisit(link, visitedList, fs);
                }

                Output.WriteLine("Finished...", fs);
                Output.WriteLine("Visited " + LINKCOUNT + " links in all. " + (LINKCOUNT - ERRCOUNT) + " SUCCESS AND " + ERRCOUNT + " ERRORS", fs);
            }
            catch (Exception ex)
            {
                LogException(ex, fs);
            }
            finally
            {
                fs.Close();
                Output.WriteLine("PRESS ANY KEY TO CLOSE...");
                Console.Read();
            }
        }

        /// <summary>
        /// Visits <paramref name="link"/> unless its normalized form was visited before,
        /// then recursively visits every same-host link found in the page's anchors.
        /// Any exception raised while handling this page is logged and counted in
        /// <see cref="ERRCOUNT"/>; it does not propagate to the caller.
        /// </summary>
        /// <param name="link">Absolute URL of the page to load.</param>
        /// <param name="visitedList">Normalized keys of the links visited so far; updated in place.</param>
        /// <param name="fs">Log writer handed to Output.WriteLine.</param>
        /// <remarks>
        /// The crawl is recursive, so the call depth grows with the length of the link chain.
        /// </remarks>
        private static void DoVisit(string link, ISet<string> visitedList, StreamWriter fs)
        {
            // HashSet.Add returns false when the key is already present, so one call
            // both tests and records the visit.
            if (!visitedList.Add(AddToVistitedList(link)))
            {
                return;
            }

            Output.WriteLine("Visiting :" + link, fs);
            LINKCOUNT++;
            try
            {
                Uri pageUri = new Uri(link);
                HtmlWeb hw = new HtmlWeb();
                HtmlDocument doc = hw.Load(link);
                if (doc == null || doc.DocumentNode == null)
                {
                    throw new InvalidOperationException("Can not visit this url!");
                }

                if (hw.StatusCode != HttpStatusCode.OK)
                {
                    throw new InvalidOperationException("Can not visit this url! StatusCode:" + hw.StatusCode.ToString());
                }

                // SelectNodes returns null (not an empty collection) when nothing matches,
                // so a page without links must not be enumerated.
                HtmlNodeCollection anchors = doc.DocumentNode.SelectNodes("//a[@href]");
                if (anchors == null)
                {
                    return;
                }

                foreach (HtmlNode a in anchors)
                {
                    string innerLink = ResolveInnerLink(pageUri, a.GetAttributeValue("href", null));
                    if (innerLink != null)
                    {
                        DoVisit(innerLink, visitedList, fs);
                    }
                }
            }
            catch (Exception ex)
            {
                LogException(ex, fs);
                ERRCOUNT++;
            }
        }

        /// <summary>
        /// Turns an href found on <paramref name="pageUri"/> into the absolute URL to crawl.
        /// </summary>
        /// <param name="pageUri">URL of the page the href was found on; used as the base for relative links.</param>
        /// <param name="href">Raw value of the anchor's href attribute; may be null.</param>
        /// <returns>
        /// The absolute URL with every ".html" occurrence removed, or null when the link
        /// must not be followed (empty, fragment-only, javascript/mailto, unparsable,
        /// non-HTTP(S) or pointing at a different host).
        /// </returns>
        private static string ResolveInnerLink(Uri pageUri, string href)
        {
            if (string.IsNullOrEmpty(href))
            {
                return null;
            }

            // Exclude javascript links, mailto links and in-page fragment links.
            if (href.StartsWith("#", StringComparison.Ordinal)
                || href.StartsWith("javascript", StringComparison.OrdinalIgnoreCase)
                || href.StartsWith("mailto", StringComparison.OrdinalIgnoreCase))
            {
                return null;
            }

            // Standard relative-reference resolution: handles absolute, root-relative ("/x"),
            // parent-relative ("../x", any depth) and sibling-relative ("x") links while
            // keeping the page's scheme and port. A malformed href is skipped.
            Uri resolved;
            if (!Uri.TryCreate(pageUri, href, out resolved))
            {
                return null;
            }

            // Only web pages can be loaded (this drops e.g. tel: or ftp: links).
            if (resolved.Scheme != Uri.UriSchemeHttp && resolved.Scheme != Uri.UriSchemeHttps)
            {
                return null;
            }

            // Stay on the host of the page currently being crawled.
            if (!string.Equals(resolved.Host, pageUri.Host, StringComparison.OrdinalIgnoreCase))
            {
                return null;
            }

            // Business requirement from the original author: "*.html" pages are to be requested
            // as "*.aspx" pages. The code only strips ".html"; presumably the target server
            // maps the extensionless URL to the .aspx page - TODO confirm.
            return resolved.AbsoluteUri.Replace(".html", "");
        }

        /// <summary>
        /// Builds the key under which a URL is remembered as visited: "http://" is removed,
        /// a fragment ("#...") is cut off and the result is lower-cased. Despite its name
        /// this method does not add anything to a list.
        /// </summary>
        /// <param name="url">URL to normalize.</param>
        /// <returns>The normalized, lower-case key.</returns>
        private static string AddToVistitedList(string url)
        {
            url = url.Replace("http://", "");
            int fragmentStart = url.IndexOf('#');
            if (fragmentStart > 0)
            {
                url = url.Substring(0, fragmentStart);
            }

            // Invariant casing keeps the key independent of the machine's culture.
            return url.ToLowerInvariant();
        }

        /// <summary>
        /// Reports an exception message through Output.WriteLine with the console
        /// foreground set to red, then restores the previous console colour.
        /// </summary>
        /// <param name="ex">Exception whose message is reported.</param>
        /// <param name="fs">Log writer handed to Output.WriteLine.</param>
        private static void LogException(Exception ex, StreamWriter fs)
        {
            ConsoleColor previousColor = Console.ForegroundColor;
            Console.ForegroundColor = ConsoleColor.Red;
            try
            {
                Output.WriteLine(ExceptionPrefix + ex.Message, fs);
            }
            finally
            {
                Console.ForegroundColor = previousColor;
            }
        }
    }
}
list.txt、log.txt 放在 Debug 文件夹下，list.txt 文件内容如下（每行一个 URL，程序按行读取）：
http://dev.static.com/en-gb/home
http://dev.static.com/de-de/home
http://dev.static.com/it-it/home
http://dev.static.com/en-gb/404
http://dev.static.com/de-de/404
http://dev.static.com/it-it/404
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。