首页 > 代码库 > 正则抓取网页所有href和src

正则抓取网页所有href和src

根据抓取到的页面内容，用正则表达式匹配页面中所有的 href 和 src 属性。

// User-Agent header sent with every upstream request.
string UserAgent = "Mozilla/5.0 (Windows NT 5.2; rv:29.0) Gecko/20100101 Firefox/29.0";

// Content-Type header of the most recent successful upstream download
// (assigned in getVerificationCode; not applied to the outgoing response).
string ContentType = "";

// Base address that incoming request paths are resolved against.
Uri strReqUrl = new Uri("http://m.lhrb.ufstone.net/");

/// <summary>
/// Fetches the page at the same raw URL from <c>strReqUrl</c>, decodes it as
/// gb2312 text, runs it through <c>GetHtmlUrl</c>, and writes it to the response.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Application_BeginRequest(object sender, EventArgs e)
{
    Uri upstreamUrl = new Uri(strReqUrl, Request.RawUrl);
    byte[] payload = getVerificationCode(upstreamUrl);

    if (payload == null)
    {
        // The download failed. Decoding a null buffer would throw
        // ArgumentNullException, so report a gateway error instead.
        Response.StatusCode = 502;
        Response.End();
        return;
    }

    // The body is always treated as gb2312 text, regardless of the
    // Content-Type the upstream server reported.
    StringBuilder strHtml = new StringBuilder(Encoding.GetEncoding("gb2312").GetString(payload));
    GetHtmlUrl(ref strHtml);
    Response.Write(strHtml.ToString());
    Response.End();
}

/// <summary>
/// Downloads the raw bytes at <paramref name="url"/> using browser-like request
/// headers and records the response Content-Type in <c>ContentType</c>.
/// </summary>
/// <param name="url">Absolute address to download.</param>
/// <returns>The response body, or <c>null</c> when the download fails.</returns>
public byte[] getVerificationCode(Uri url)
{
    // WebClient is IDisposable; dispose it once the download completes.
    using (WebClient client = new WebClient())
    {
        client.Headers.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        client.Headers.Add("Accept-Language", "    zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
        client.Headers.Add("User-Agent", this.UserAgent);
        client.Credentials = CredentialCache.DefaultCredentials;

        try
        {
            byte[] pageData = client.DownloadData(url.AbsoluteUri);
            ContentType = client.ResponseHeaders["Content-Type"];
            return pageData;
        }
        catch (WebException)
        {
            // Network or HTTP-level failure: callers treat null as "no content".
            return null;
        }
    }
}
View Code

 

    /// <summary>
    /// Scans the HTML text for <c>src</c>/<c>href</c> attributes and gathers the
    /// text of every match, one per line, into a local buffer.
    /// </summary>
    /// <param name="strHtml">HTML text to scan; it is read but not modified here.</param>
    void GetHtmlUrl(ref StringBuilder strHtml)
    {
        // src= or href=, followed by either a double-quoted value or an
        // unquoted run of non-whitespace characters.
        const string attributePattern = "(src|href)\\s*=\\s*(?:\"(?<1>[^\"]*)\"|(?<1>\\S+))";
        Regex attributeRegex = new Regex(attributePattern, RegexOptions.None);
        StringBuilder collected = new StringBuilder();

        for (Match hit = attributeRegex.Match(strHtml.ToString()); hit.Success; hit = hit.NextMatch())
        {
            // Append the whole matched attribute text (for example href="...").
            collected.Append(hit + "\n");
        }

        // NOTE(review): the collected matches are neither returned nor written
        // back into strHtml, so this method currently has no visible effect.
        // Presumably a URL rewrite step was planned here - confirm with the author.
    }