首页 > 代码库 > 正则表达式抓取新闻数据
正则表达式抓取新闻数据
抓取网站数据建议用服务操作,此例只实现从请求到拿数据,并把数据写入xml的功能
// Downloads the paginated news list from www.yocajr.com and collects one
// youcaimodel per <li> entry of each page's article list into _list.
// NOTE(review): `html` is not declared in this snippet; it is assumed to be a
// string variable/field provided by the surrounding code - confirm.
List<youcaimodel> _list = new List<youcaimodel>();
int page = 1000; // total number of list pages to fetch

// The patterns never change between pages, so every Regex is built once here
// instead of once per page / per list item.
Regex ulRegex = new Regex("<ul[\\s]*>[\\s\\S]*?</ul>");
// Lazy quantifier: each HTML comment is removed on its own. The previous
// pattern ended in a greedy "[\s\S]*-->" and therefore swallowed all markup
// between the first "<!--" and the last "-->".
Regex commentRegex = new Regex("<!--[\\s\\S]*?-->");
// <li> entries of the article list.
Regex liRegex = new Regex("<li[\\s]*[^>]*>[\\s\\S]*?</li>", RegexOptions.Compiled | RegexOptions.IgnoreCase);
// Image path.
Regex imgRegex = new Regex("<img[\\s]*src[\\s]*=[\\s]*\"(?<imgurl>.*?)\"[\\s]*?[\\s]*[^>]*>");
// Article link and title. The href part of this pattern had been corrupted
// (a site URL was spliced into the literal, which also broke the string
// quoting); it is restored to match href="...".
Regex titleRegex = new Regex("<h3[\\s]*[^>]*>[\\s]*<a[\\s]*href=[\\s]*\"(?<url>.*?)\"[^>]*>[\\s]*(?<title>.*?)[\\s]*</a>[\\s]*</h3>");
// Article description.
Regex describeRegex = new Regex("<p[\\s]*[^>]*>(?<describe>.*?)</p>");
// Publication time.
Regex timeRegex = new Regex("<span[\\s]*class=\"time\">(?<time>.*?)</span>");

// WebClient is IDisposable; dispose it once all pages have been fetched.
// NOTE(review): WebClientto is defined elsewhere; 3000 is presumably a timeout - confirm.
using (WebClient client = new WebClientto(3000))
{
    // Authenticate requests with the default credentials of the current security context.
    client.Credentials = CredentialCache.DefaultCredentials;

    // Fetch the pages one after another (the author recommends running this in a service).
    for (int k = 0; k < page; k++)
    {
        // The first page has no page suffix; later pages append the page index.
        string url = k == 0
            ? "http://www.yocajr.com/news/index/0"
            : "http://www.yocajr.com/news/index/0/" + k;
        string htmlstr = Encoding.UTF8.GetString(client.DownloadData(url));

        // The article list is taken from the fifth <ul> on the page (index 4).
        MatchCollection lists = ulRegex.Matches(htmlstr);
        if (lists.Count <= 4)
        {
            // No article list on this page: skip it instead of re-parsing
            // whatever `html` still holds from the previous page.
            continue;
        }

        // Strip commented-out markup, then parse the list exactly once.
        // (Previously the parsing ran inside the loop over comment matches,
        // so it ran zero times without comments and repeatedly with several.)
        html = commentRegex.Replace(lists[4].Value, "");

        foreach (Match item in liRegex.Matches(html))
        {
            youcaimodel info = new youcaimodel();

            // For every field the last match inside the <li> wins, and a field
            // without any match is left at its default value.
            MatchCollection imgMatches = imgRegex.Matches(item.Value);
            if (imgMatches.Count > 0)
            {
                info.Imgscr = imgMatches[imgMatches.Count - 1].Groups["imgurl"].Value;
            }

            MatchCollection titleMatches = titleRegex.Matches(item.Value);
            if (titleMatches.Count > 0)
            {
                Match last = titleMatches[titleMatches.Count - 1];
                info.Title = last.Groups["title"].Value;
                info.Url = last.Groups["url"].Value;
            }

            MatchCollection describeMatches = describeRegex.Matches(item.Value);
            if (describeMatches.Count > 0)
            {
                info.Describe = describeMatches[describeMatches.Count - 1].Groups["describe"].Value;
            }

            MatchCollection timeMatches = timeRegex.Matches(item.Value);
            if (timeMatches.Count > 0)
            {
                info.Time = timeMatches[timeMatches.Count - 1].Groups["time"].Value;
            }

            _list.Add(info);
        }
    }
}
// Write the collected data to a local XML file (~/xml/youcai.xml) and keep the
// serialized XML text in `html`.
using (StringWriter stringWriter = new StringWriter(new StringBuilder()))
{
    string qxfilepath = System.Web.Hosting.HostingEnvironment.MapPath("~" + "/xml");
    XmlSerializer xmlSerializer = new XmlSerializer(typeof(List<youcaimodel>));
    xmlSerializer.Serialize(stringWriter, _list);

    // FileMode.Create truncates an existing file; OpenOrCreate kept the old
    // content, so a shorter document left stale bytes after the new XML.
    // Serializing straight into the StreamWriter (UTF-8 by default) makes the
    // XML declaration's encoding agree with the bytes on disk; writing the
    // StringWriter text instead stored a declaration of utf-16 in a UTF-8 file.
    // The using statements close both streams even if serialization throws.
    using (FileStream fs = new FileStream("" + qxfilepath + "/youcai.xml", FileMode.Create))
    using (StreamWriter sw = new StreamWriter(fs))
    {
        xmlSerializer.Serialize(sw, _list);
    }

    html = stringWriter.ToString();
}
正则表达式抓取新闻数据