正则表达式抓取新闻数据

首页 > 代码库 > 正则表达式抓取新闻数据

正则表达式抓取新闻数据

2024-09-03 12:47:04 220人阅读

抓取网站数据建议放在后台服务中执行；本例只演示从发起请求、获取数据到把数据写入 XML 文件的完整流程。

// Scrapes the paginated news list of www.yocajr.com and collects one youcaimodel per <li> entry.
// NOTE(review): `html` is not declared in this snippet; it is assumed to be a string
// declared by the enclosing code — confirm.
List<youcaimodel> _list = new List<youcaimodel>();
int page = 1000; // total number of list pages to fetch

// All patterns are loop-invariant, so the Regex objects are built once up front.
Regex ulRegex = new Regex("<ul[\\s]*>[\\s\\S]*?</ul>");
// Removes commented-out HTML. NOTE(review): the original pattern was lost (empty string in the
// published snippet, which makes string.Replace throw); reconstructed from its comment — confirm.
Regex commentRegex = new Regex("<!--[\\s\\S]*?-->");
// One <li> per news entry.
Regex liRegex = new Regex("<li[\\s]*[^>]*>[\\s\\S]*?</li>", RegexOptions.Compiled | RegexOptions.IgnoreCase);
// Image path.
Regex imgRegex = new Regex("<img[\\s]*src[\\s]*=[\\s]*\"(?<imgurl>.*?)\"[\\s]*?[\\s]*[^>]*>");
// Article link and title. NOTE(review): the published literal was garbled by the hosting site
// (injected URL, slashes instead of backslashes); restored to the `href="..."` form — confirm.
Regex titleRegex = new Regex("<h3[\\s]*[^>]*>[\\s]*<a[\\s]*href=[\\s]*\"(?<url>.*?)\"[^>]*>[\\s]*(?<title>.*?)[\\s]*</a>[\\s]*</h3>");
// Article description.
Regex describeRegex = new Regex("<p[\\s]*[^>]*>(?<describe>.*?)</p>");
// Article publication time.
Regex timeRegex = new Regex("<span[\\s]*class=\"time\">(?<time>.*?)</span>");

// WebClient is IDisposable; dispose it once all pages have been fetched.
// NOTE(review): WebClientto is a project type; 3000 is presumably a timeout — confirm.
using (WebClient client = new WebClientto(3000))
{
    // Use the default credentials of the current security context for the requests.
    client.Credentials = CredentialCache.DefaultCredentials;

    // Fetch the pages one by one; for real use this is better run from a background service.
    for (int k = 0; k < page; k++)
    {
        // The first page has no page-number suffix.
        string url = k == 0
            ? "http://www.yocajr.com/news/index/0"
            : "http://www.yocajr.com/news/index/0/" + k;
        string htmlstr = Encoding.UTF8.GetString(client.DownloadData(url));

        // The news list is the fifth <ul> on the page (index 4).
        MatchCollection lists = ulRegex.Matches(htmlstr);
        if (lists.Count <= 4)
        {
            // No list on this page: skip it instead of re-parsing the previous page's markup.
            continue;
        }

        // Strip commented-out markup once, then parse the entries exactly once per page.
        html = commentRegex.Replace(lists[4].Value, "");

        foreach (Match limc in liRegex.Matches(html))
        {
            youcaimodel info = new youcaimodel();

            // For every field the last match inside the <li> wins.
            foreach (Match m in imgRegex.Matches(limc.Value))
            {
                info.Imgscr = m.Groups["imgurl"].Value;
            }

            foreach (Match m in titleRegex.Matches(limc.Value))
            {
                info.Title = m.Groups["title"].Value;
                info.Url = m.Groups["url"].Value;
            }

            foreach (Match m in describeRegex.Matches(limc.Value))
            {
                info.Describe = m.Groups["describe"].Value;
            }

            foreach (Match m in timeRegex.Matches(limc.Value))
            {
                info.Time = m.Groups["time"].Value;
            }

            _list.Add(info);
        }
    }
}

// Write the collected data to a local XML file (~/xml/youcai.xml) and keep the XML text in `html`.
using (StringWriter stringWriter = new StringWriter(new StringBuilder()))
{
    string qxfilepath = System.Web.Hosting.HostingEnvironment.MapPath("~" + "/xml");
    XmlSerializer xmlSerializer = new XmlSerializer(typeof(List<youcaimodel>));

    // In-memory copy of the document; its declaration says utf-16, which matches a .NET string.
    xmlSerializer.Serialize(stringWriter, _list);

    // FileMode.Create truncates an existing file, so a shorter document never leaves stale
    // bytes from a previous run at the end of the file.
    using (FileStream fs = new FileStream("" + qxfilepath + "/youcai.xml", FileMode.Create))
    using (StreamWriter sw = new StreamWriter(fs))
    {
        // Serialize straight to the UTF-8 writer so the XML declaration (encoding="utf-8")
        // agrees with the bytes actually written to disk.
        xmlSerializer.Serialize(sw, _list);
    }

    html = stringWriter.ToString();
}

正则表达式抓取新闻数据

声明：以上内容来自用户投稿及互联网公开渠道收集整理发布，本网站不拥有所有权，未作人工编辑处理，也不承担相关法律责任。若内容有误或涉及侵权，可进行投诉/举报，工作人员会在5个工作日内联系你；一经查实，本站将立刻删除涉嫌侵权内容。

联系我们

首页 > 代码库 > 正则表达式抓取新闻数据

正则表达式抓取新闻数据

看完仍有疑问？有类似问题直接问程序猿