首页 > 代码库 > Soufun_News
Soufun_News
using AnfleCrawler.Common;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;

namespace AnfleCrawler.DataAnalyzer
{
    /// <summary>
    /// Analyzer for the news list pages under news.sh.soufun.com: depth 0 handles
    /// list pages and queues article links, depth 1 handles article pages and
    /// saves them through the repository.
    /// </summary>
    internal class Soufun_News : AnalyzerBase
    {
        /// <summary>
        /// News categories. The numeric values are substituted into the list-page
        /// URL pattern built in <see cref="Init"/>.
        /// </summary>
        private enum Kind
        {
            [Description("市场")]
            Market = 32,
            [Description("政策")]
            Policy = 35,
            [Description("公司")]
            Company = 736,
        }

        /// <summary>Element selectors removed from the article body before it is saved.</summary>
        private static readonly string[] FilterTags = new string[] { "script", "iframe" };

        /// <summary>
        /// Queues the list-page URL pattern at depth 0 and then runs the base initialization.
        /// </summary>
        /// <param name="crawler">Crawler that receives the seed URL pattern.</param>
        public override void Init(PageCrawler crawler)
        {
            // The first bracket holds the comma-joined Kind values, the second the
            // page range; presumably StringPatternGenerator expands both - confirm
            // against its implementation.
            string exp = string.Format(
                "http://news.sh.soufun.com/more/[{0}]/[1-50].html",
                string.Join(",", Enum.GetValues(typeof(Kind)).Cast<int>()));
            crawler.PushUrl(new StringPatternGenerator(exp), 0);
            base.Init(crawler);
        }

        /// <summary>
        /// Processes one landed page according to its depth: 0 = list page,
        /// 1 = article page. Other depths are ignored.
        /// </summary>
        /// <param name="current">The page being analyzed.</param>
        protected override void AnalyzeInternal(PageLandEntity current)
        {
            var lander = Crawler.Lander;
            dynamic repository = Repository;
            var pHandler = CreateContentHandler(current);
            switch (current.Depth)
            {
                case 0:
                    {
                        var dom = lander.GetDocument(pHandler);
                        foreach (var node in QueryNodes(dom.DocumentNode, ".contenttext"))
                        {
                            var linkNode = QueryNode(node, "a.link_01");
                            string url = GetHref(linkNode, current.Url).OriginalString;
                            Crawler.PushUrl(new Uri(ToAllPagesUrl(url)), 1);
                        }
                    }
                    break;
                case 1:
                    {
                        var dom = lander.GetDocument(pHandler);
                        var hackNode = QueryNode(dom.DocumentNode, "#newxq_B01_26");
                        // The category label is taken from the last anchor inside that node.
                        string kind = QueryNodes(hackNode, "a").Last().InnerText;
                        string title = QueryNode(dom.DocumentNode, "h1").InnerText;
                        var contentNode = QueryNode(dom.DocumentNode, "#news_body");
                        foreach (string tag in FilterTags)
                        {
                            // ToArray() snapshots the matches so nodes can be removed
                            // while iterating.
                            foreach (var node in QueryNodes(contentNode, tag, false).ToArray())
                            {
                                node.Remove();
                            }
                        }

                        // First span is parsed as the publish date; the second, when
                        // present, is stored as the source.
                        var set = QueryNodes(dom.DocumentNode, "#newxq_B01_27 span").Take(2).ToArray();
                        string source = null;
                        DateTime publishDate = default(DateTime);
                        if (set.Length > 0)
                        {
                            // A failed parse leaves publishDate at default(DateTime),
                            // the same value used when no span exists at all.
                            DateTime.TryParse(set[0].InnerText, out publishDate);
                        }
                        if (set.Length == 2)
                        {
                            source = set[1].InnerText;
                        }
                        repository.SaveNews(current.Url, kind, source, title, contentNode.InnerHtml, publishDate);
                        Crawler.OutWrite("保存新闻 {0}", title);
                    }
                    break;
            }
        }

        /// <summary>
        /// Inserts "_all" before the file extension of the URL's last path segment.
        /// </summary>
        /// <param name="url">Absolute article URL as text.</param>
        /// <returns>
        /// The URL with "_all" inserted before the last dot, or the URL unchanged
        /// when the last path segment contains no dot.
        /// </returns>
        private static string ToAllPagesUrl(string url)
        {
            int dot = url.LastIndexOf('.');
            int slash = url.LastIndexOf('/');
            // Only treat the dot as an extension separator when it follows the last
            // slash; otherwise it belongs to the host name (or is absent) and
            // inserting there would corrupt the address or throw.
            return dot > slash ? url.Insert(dot, "_all") : url;
        }
    }
}
/// <summary>
/// Inserts or updates the news row for a page. The row key is computed by
/// CryptoManaged.MD5Hash from the page URL's original string; when no row with
/// that key exists a new one is added with SiteID set to the URL's authority.
/// All remaining fields are overwritten on every call.
/// </summary>
/// <param name="pageUrl">URL of the article page; supplies the row key and SiteID.</param>
/// <param name="kind">Category text stored in Kind.</param>
/// <param name="source">Source text stored in Source; may be null.</param>
/// <param name="title">Article title.</param>
/// <param name="content">Article body stored in Content.</param>
/// <param name="publishDate">Publish date stored in PublishDate.</param>
public void SaveNews(Uri pageUrl, string kind, string source, string title, string content, DateTime publishDate)
{
    Guid rowID = CryptoManaged.MD5Hash(pageUrl.OriginalString);
    using (var db = Create())
    {
        var entity = db.News.Where(t => t.RowID == rowID).SingleOrDefault();
        if (entity == null)
        {
            // First time this URL is seen: create the row with its fixed identity fields.
            entity = new News()
            {
                RowID = rowID,
                SiteID = pageUrl.Authority,
            };
            db.News.Add(entity);
        }

        entity.Kind = kind;
        entity.Source = source;
        entity.Title = title;
        entity.Content = content;
        entity.PublishDate = publishDate;

        db._SaveChanges();
    }
}
Soufun_News
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。