首页 > 代码库 > Soufun_News

Soufun_News

using AnfleCrawler.Common;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;

namespace AnfleCrawler.DataAnalyzer
{
    /// <summary>
    /// Analyzer for the news section of news.sh.soufun.com.
    /// Depth 0 pages are category listings: every article link found on them is queued at depth 1.
    /// Depth 1 pages are articles: kind, title, body, publish date and source are extracted and
    /// handed to the repository's <c>SaveNews</c>.
    /// </summary>
    internal class Soufun_News : AnalyzerBase
    {
        /// <summary>
        /// News categories. The numeric values are substituted into the listing URL pattern built in
        /// <see cref="Init"/>; the category text that gets saved is read from the article page itself.
        /// </summary>
        private enum Kind
        {
            [Description("市场")]
            Market = 32,
            [Description("政策")]
            Policy = 35,
            [Description("公司")]
            Company = 736,
        }

        // Suffix inserted before the file extension of an article link.
        // NOTE(review): presumably selects the single-page "full text" variant of an article - confirm.
        private const string FullArticleSuffix = "_all";

        // Characters that terminate the path part of a URL (start of query string / fragment).
        private static readonly char[] PathTerminators = { '?', '#' };

        // Elements stripped from the article body before its HTML is stored.
        private static readonly string[] FilterTags = new string[] { "script", "iframe" };

        /// <summary>
        /// Queues the listing pages of every <see cref="Kind"/> at depth 0, then runs the base initialization.
        /// </summary>
        /// <param name="crawler">Crawler that receives the seed URL pattern.</param>
        public override void Init(PageCrawler crawler)
        {
            // Produces e.g. ".../more/[32,35,736]/[1-50].html".
            // NOTE(review): the bracket groups are presumably expanded by StringPatternGenerator into
            // one URL per category and page number - confirm against that class.
            string kindIds = string.Join(",", Enum.GetValues(typeof(Kind)).Cast<int>());
            string exp = string.Format("http://news.sh.soufun.com/more/[{0}]/[1-50].html", kindIds);
            crawler.PushUrl(new StringPatternGenerator(exp), 0);
            base.Init(crawler);
        }

        /// <summary>
        /// Processes one downloaded page according to its crawl depth (0 = listing, 1 = article).
        /// Pages at any other depth are ignored.
        /// </summary>
        /// <param name="current">The page being analyzed; its <c>Depth</c> and <c>Url</c> are used.</param>
        protected override void AnalyzeInternal(PageLandEntity current)
        {
            var lander = Crawler.Lander;
            dynamic repository = Repository;
            var pHandler = CreateContentHandler(current);
            switch (current.Depth)
            {
                case 0:
                    {
                        var dom = lander.GetDocument(pHandler);
                        foreach (var node in QueryNodes(dom.DocumentNode, ".contenttext"))
                        {
                            var linkNode = QueryNode(node, "a.link_01");
                            string url = GetHref(linkNode, current.Url).OriginalString;
                            string fullUrl = ToFullArticleUrl(url);
                            if (fullUrl == null)
                            {
                                // Skip just this link; failing here would drop every remaining
                                // article of the listing page as well.
                                Crawler.OutWrite("Skipped link without a file extension: {0}", url);
                                continue;
                            }
                            Crawler.PushUrl(new Uri(fullUrl), 1);
                        }
                    }
                    break;
                case 1:
                    {
                        var dom = lander.GetDocument(pHandler);

                        // The category is the text of the last link inside "#newxq_B01_26".
                        var hackNode = QueryNode(dom.DocumentNode, "#newxq_B01_26");
                        string kind = QueryNodes(hackNode, "a").Last().InnerText;
                        string title = QueryNode(dom.DocumentNode, "h1").InnerText;

                        var contentNode = QueryNode(dom.DocumentNode, "#news_body");
                        foreach (string tag in FilterTags)
                        {
                            // ToArray() snapshots the matches so that removing nodes does not
                            // disturb the enumeration.
                            // NOTE(review): the 'false' argument presumably makes QueryNodes tolerate
                            // an empty result - confirm against AnalyzerBase.
                            foreach (var node in QueryNodes(contentNode, tag, false).ToArray())
                            {
                                node.Remove();
                            }
                        }

                        // First span: publish date. Second span (optional): source.
                        var set = QueryNodes(dom.DocumentNode, "#newxq_B01_27 span").Take(2).ToArray();
                        string source = null;
                        DateTime publishDate;
                        // Best effort: when the text is not a date, TryParse leaves
                        // DateTime.MinValue in publishDate and that value is what gets saved.
                        DateTime.TryParse(set[0].InnerText, out publishDate);
                        if (set.Length == 2)
                        {
                            source = set[1].InnerText;
                        }

                        repository.SaveNews(current.Url, kind, source, title, contentNode.InnerHtml, publishDate);
                        Crawler.OutWrite("保存新闻 {0}", title);
                    }
                    break;
            }
        }

        /// <summary>
        /// Inserts <see cref="FullArticleSuffix"/> in front of the file extension of the last path
        /// segment of <paramref name="url"/> (".../123.htm" becomes ".../123_all.htm").
        /// Any query string or fragment is left untouched.
        /// </summary>
        /// <param name="url">Article URL as found on the listing page.</param>
        /// <returns>
        /// The rewritten URL, or <c>null</c> when the last path segment has no extension, so that the
        /// suffix is never spliced into the host name, a directory name or the query string.
        /// </returns>
        private static string ToFullArticleUrl(string url)
        {
            if (string.IsNullOrEmpty(url))
            {
                return null;
            }

            // Only the part before '?' / '#' is searched for the extension.
            int pathEnd = url.IndexOfAny(PathTerminators);
            if (pathEnd < 0)
            {
                pathEnd = url.Length;
            }
            if (pathEnd == 0)
            {
                return null;
            }

            int dot = url.LastIndexOf('.', pathEnd - 1);
            int slash = url.LastIndexOf('/', pathEnd - 1);

            // When the last '/' is one of the two slashes of "scheme://", the URL has no path and
            // any dot found belongs to the host name.
            int scheme = url.IndexOf("://", StringComparison.Ordinal);
            bool slashIsInPath = scheme < 0 || slash > scheme + 2;

            if (dot < 0 || dot < slash || !slashIsInPath)
            {
                return null;
            }
            return url.Insert(dot, FullArticleSuffix);
        }
    }
}

 

 

        /// <summary>
        /// Inserts or updates the news row that belongs to <paramref name="pageUrl"/>.
        /// The row key is the MD5 hash of the URL's original string, so saving the same URL again
        /// overwrites the existing row instead of adding a second one.
        /// </summary>
        /// <param name="pageUrl">Article URL; supplies the row key and, for new rows, the site id (its authority).</param>
        /// <param name="kind">Category text of the article.</param>
        /// <param name="source">Source of the article; may be <c>null</c>.</param>
        /// <param name="title">Article title.</param>
        /// <param name="content">Article body.</param>
        /// <param name="publishDate">Publish date of the article.</param>
        public void SaveNews(Uri pageUrl, string kind, string source, string title, string content, DateTime publishDate)
        {
            Guid key = CryptoManaged.MD5Hash(pageUrl.OriginalString);
            using (var db = Create())
            {
                var entity = db.News
                    .Where(item => item.RowID == key)
                    .SingleOrDefault();

                if (entity == null)
                {
                    // First time this URL is seen: create the row. SiteID is only assigned here
                    // and is not touched on later updates.
                    entity = new News
                    {
                        RowID = key,
                        SiteID = pageUrl.Authority,
                    };
                    db.News.Add(entity);
                }

                // These columns are refreshed on every save, for new and existing rows alike.
                entity.Kind = kind;
                entity.Source = source;
                entity.Title = title;
                entity.Content = content;
                entity.PublishDate = publishDate;

                db._SaveChanges();
            }
        }

 

Soufun_News