首页 > 代码库 > 正则 挖网站表格复习
正则 挖网站表格复习
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Xml;

namespace WebApplication19
{
    /// <summary>
    /// Selects which kind of table cell to extract from a row.
    /// The member names are used verbatim (via <c>ToString()</c>) as the HTML tag name,
    /// so they intentionally stay lowercase.
    /// </summary>
    public enum SearchRange
    {
        th = 0,
        td = 1
    }

    /// <summary>
    /// Review exercise: downloads a search-result page and pulls the first table found inside
    /// the <c>BOC_main publish</c> div apart with regular expressions
    /// (div, then table, then rows, then header/data cells).
    /// </summary>
    public partial class WebForm1 : System.Web.UI.Page
    {
        public string MKT;

        /// <summary>
        /// Downloads the page and splits its first table into header cells and data-row cells.
        /// </summary>
        /// <returns>
        /// The current value of <see cref="MKT"/>. This method never assigns it, so the value is
        /// whatever was set elsewhere.
        /// </returns>
        /// <remarks>
        /// Best effort: any failure (network, I/O, regex) is logged through
        /// <see cref="System.Diagnostics.Trace"/> and does not propagate to the caller.
        /// </remarks>
        private string getHtml()
        {
            // The second query parameter is "&nothing=": the original literal had been mangled
            // to "¬hing=" by an HTML-entity decode of "&not", which dropped the parameter.
            const string url = "http://srh.bankofchina.com/search/whpj/search.jsp?erectDate=2001-11-01&nothing=2016-11-04&pjname=1316&page=4";

            // Inner content of the first div whose class is "BOC_main publish".
            // The lazy body stops at the first closing div, so nested divs would truncate it.
            const string divPattern = @"(?<=<div (.*)?class=""BOC_main publish""[^>]*?>)([\s\S]*?)(?=</div>)";
            // Inner content of a <table ...> element (the pattern requires a space after "table").
            const string tablePattern = @"(?<=<table (.*)?[^>]*?>)([\s\S]*?)(?=</table>)";
            // Inner content of each <tr> element.
            const string rowPattern = @"(?<=<tr(.*)?[^>]*?>)([\s\S]*?)(?=</tr>)";

            try
            {
                string content;
                using (WebClient client = new WebClient())
                using (Stream stream = client.OpenRead(url))
                using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
                {
                    content = reader.ReadToEnd();
                }

                // Narrow the document step by step: div -> table -> rows.
                string divContent = FirstMatchOrEmpty(content, divPattern);
                string tableContent = FirstMatchOrEmpty(divContent, tablePattern);

                List<string> trList = new List<string>();
                foreach (Match rowMatch in Regex.Matches(tableContent, rowPattern))
                {
                    trList.Add(rowMatch.Value);
                }

                if (trList.Count == 0)
                {
                    // Nothing to parse: avoid indexing trList[0] on an empty list.
                    System.Diagnostics.Trace.TraceWarning("getHtml: no table rows found at {0}", url);
                    return MKT;
                }

                // trList[0] is the header row; every following row is a data row.
                List<string> thList = GET_TH_OR_TD_LIST(SearchRange.th, trList[0]);
                List<List<string>> tdsList = new List<List<string>>(trList.Count - 1);
                for (int i = 1; i < trList.Count; i++)
                {
                    tdsList.Add(GET_TH_OR_TD_LIST(SearchRange.td, trList[i]));
                }

                // NOTE(review): thList / tdsList are not consumed anywhere in the visible code,
                // and MKT is never assigned here - confirm what this method is meant to publish.
            }
            catch (Exception ex)
            {
                // Keep the page rendering, but make the failure visible instead of swallowing it.
                System.Diagnostics.Trace.TraceError("getHtml failed: {0}", ex);
            }

            return MKT;
        }

        /// <summary>
        /// Returns the text of the first match of <paramref name="pattern"/> in
        /// <paramref name="input"/>, or an empty string when there is no match.
        /// </summary>
        private static string FirstMatchOrEmpty(string input, string pattern)
        {
            Match first = Regex.Match(input, pattern);
            return first.Success ? first.Value : string.Empty;
        }

        /// <summary>
        /// Extracts the inner content of every header or data cell in one table row.
        /// </summary>
        /// <param name="range">Which tag to look for: <c>th</c> (header cells) or <c>td</c> (data cells).</param>
        /// <param name="row">Inner HTML of a single <c>tr</c> element.</param>
        /// <returns>
        /// The cell contents in document order; an empty list when <paramref name="row"/> is null,
        /// empty, or contains no matching cells.
        /// </returns>
        private List<string> GET_TH_OR_TD_LIST(SearchRange range, string row)
        {
            List<string> contentList = new List<string>();
            if (string.IsNullOrEmpty(row))
            {
                return contentList;
            }

            // The enum member name doubles as the tag name.
            string tag = range.ToString();

            // \b after the tag name keeps "<th" from also accepting "<thead ...>" as an opening tag.
            string cellPattern = $@"(?<=(<{tag}\b[^>]*?>))(?<tdCell>[\s\S]*?)(?=</{tag}>)";

            foreach (Match cellMatch in Regex.Matches(row, cellPattern))
            {
                contentList.Add(cellMatch.Groups["tdCell"].Value);
            }

            return contentList;
        }

        /// <summary>Runs the scrape once on every page load.</summary>
        protected void Page_Load(object sender, EventArgs e)
        {
            getHtml();
        }
    }
}
正则 挖网站表格复习
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。