首页 > 代码库 > MongoDB CRUD 操作,采集部分代码

MongoDB CRUD 操作,采集部分代码

using System;
using System.Collections.Generic;
using System.ComponentModel.Design;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using CDPWIB.DAL;
using CDPWIB.Data;
using CommonUtility;
using HtmlAgilityPack;
using MongoDB.Driver;
using MongoDB.Driver.Builders;
using MongoDB.Driver.Linq;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using WebKit;

namespace CDPWIB.WebCollection
{
    /// <summary>
    /// Collector for the QiDian novel site: downloads category, ranking, chapter-index and
    /// click-count pages over HTTP, parses them, and stores the results in the "Noval"
    /// MongoDB collections (chapters are handed to <c>PublicMethod.InsertChapterTempToSQL</c>).
    /// </summary>
    internal class QiDianCol : INovalCollect
    {
        // Numeric id of this source, stamped on every record this collector writes.
        private readonly int Source = Convert.ToInt32(NovalSource.QiDian);

        // NOTE(review): this field is not read anywhere in this class; it is kept because
        // removing it would change what happens at construction time.
        private readonly MongoCollection<NovalTempBase> Novalcol =
            MongoConnectionFactory.GetMongoCollction<NovalTempBase>("Noval", typeof (NovalTempBase));

        /// <summary>
        /// Downloads the category and sub-category tables from the site's JavaScript files and
        /// replaces the stored <c>NovalTypeTemp</c> / <c>NovalSubType</c> records of this source.
        /// </summary>
        /// <remarks>
        /// Exceptions from the download, the JSON parsing or the database calls propagate to the caller.
        /// </remarks>
        public void GetNovalTypeTemp()
        {
            var typecol = MongoConnectionFactory.GetMongoCollction<NovalTypeTemp>("Noval", typeof (NovalTypeTemp));
            var subcol = MongoConnectionFactory.GetMongoCollction<NovalSubType>("Noval", typeof (NovalSubType));

            // Top-level categories: http://www.qidian.com/Javascript/qidian.bookstore.js?t=20130917
            // Whitespace and separator characters are stripped so the regex below can run on one compact string.
            string typeshtml =
                NetHelper.HttpGet("http://www.qidian.com/Javascript/qidian.bookstore.js?t=20130917")
                    .Replace("/", "")
                    .Replace("&nbsp", "")
                    .Replace("\r", "")
                    .Replace("\n", "")
                    .Replace("\t", "")
                    .Replace("|", "")
                    .Replace(" ", "");

            string subtypes =
                NetHelper.HttpGet("http://script.cmfu.com/script/BookStore.js ")
                    .Replace("&nbsp", "")
                    .Replace("\r", "")
                    .Replace("\n", "")
                    .Replace("\t", "")
                    .Replace("|", "")
                    .Replace(" ", "");

            const RegexOptions options =
                RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Singleline;

            // The lazy capture stops just before the closing "]]", so it is appended again
            // to obtain a complete JSON array.
            Match mtype = Regex.Match(typeshtml, "CategoryArr:(.*?)]]", options);
            string typesstring = mtype.Groups[1].Value + "]]";
            JArray typearr = (JArray) JsonConvert.DeserializeObject(typesstring);

            Match msubtype = Regex.Match(subtypes, "SubCategoryArr=(.*?);", options);
            string subtypesstring = msubtype.Groups[1].Value;
            JArray subarr = (JArray) JsonConvert.DeserializeObject(subtypesstring);

            // Sample: CategoryArr: [["全部", "-1"], ["玄幻", "21"], ["奇幻", "1"], ...]
            // Each entry is [name, id]; the entry with id "-1" is skipped.
            List<NovalTypeTemp> lstypes = new List<NovalTypeTemp>(10);
            for (int i = 0; i < typearr.Count; i++)
            {
                if (typearr[i][1].ToString() == "-1")
                {
                    continue;
                }

                lstypes.Add(new NovalTypeTemp()
                {
                    WebNum = typearr[i][1].ToString().ToInt(),
                    Name = typearr[i][0].ToString(),
                    Source = Source
                });
            }

            // Replace all stored categories of this source.
            IMongoQuery query = Query<NovalTypeTemp>.EQ(p => p.Source, Source);
            typecol.Remove(query);
            typecol.InsertBatch(lstypes);

            // Sub-category entries are read as [parentId, id, name]; only those whose parent
            // is one of the categories collected above are kept.
            List<NovalSubType> subtypels = new List<NovalSubType>(300);
            foreach (var parentType in lstypes)
            {
                string parentId = parentType.WebNum.ToString();
                for (int i = 0; i < subarr.Count; i++)
                {
                    var obj = subarr[i];
                    if (obj[0].ToString() != parentId)
                    {
                        continue;
                    }

                    subtypels.Add(new NovalSubType()
                    {
                        Name = obj[2].ToString(),
                        ParentWebNum = parentType.WebNum,
                        WebNum = obj[1].ToString().ToInt(),
                        Source = Source
                    });
                }
            }

            // Replace all stored sub-categories of this source.
            query = Query<NovalSubType>.EQ(p => p.Source, Source);
            subcol.Remove(query);
            subcol.InsertBatch(subtypels);
        }

        /// <summary>
        /// Collects novels from pages 1 to 10 of the click-ranking list and passes the
        /// de-duplicated result to <c>PublicMethod.InsertAndUpdateNovalTmp</c>.
        /// </summary>
        public void GetNovals()
        {
            HtmlDocument htmldocc = new HtmlDocument();
            List<NovalTempBase> qdls = new List<NovalTempBase>(500);

            for (int j = 1; j < 11; j++)
            {
                string sourcehtml =
                    NetHelper.HttpGet("http://top.qidian.com/Book/TopDetail.aspx?TopType&Time=2&PageIndex=" + j);
                htmldocc.LoadHtml(sourcehtml);

                var doc = htmldocc.GetElementbyId("textlist");
                if (doc == null)
                {
                    // The list container is missing on this page; move on to the next page.
                    continue;
                }

                // Rows 2..51 are read: up to 50 entries per page (XPath positions start at 1).
                for (int i = 2; i < 52; i++)
                {
                    var trdoc = doc.SelectSingleNode("tr[" + i + "]");
                    if (trdoc == null)
                    {
                        // Positional XPath: if row i is absent, so is every later row.
                        break;
                    }

                    var tdtype = trdoc.SelectSingleNode("td[2]/a");
                    var tdbook = trdoc.SelectSingleNode("td[3]/a[1]");
                    var tdclick = trdoc.SelectSingleNode("td[4]");
                    var tdauth = trdoc.SelectSingleNode("td[5]/a");
                    if (tdtype == null || tdbook == null || tdclick == null || tdauth == null)
                    {
                        // Row does not have the expected cells; skip it instead of failing the whole run.
                        continue;
                    }

                    // The terminating quote had been replaced by a typographic quote (U+2018),
                    // which can never match an attribute delimiter; restored to a plain apostrophe.
                    Match typematch = Regex.Match(tdtype.OuterHtml, "ChannelId=(\\d*?)&SubCategoryId=(\\d*?)'");
                    Match bookmatck = Regex.Match(tdbook.OuterHtml, "Book/(\\d*?).aspx");
                    Match authmatch = Regex.Match(tdauth.OuterHtml, "id=(\\d*?)\"");

                    int authid = authmatch.Groups[1].Value.ToInt();
                    int type = typematch.Groups[1].Value.ToInt();
                    int subtype = typematch.Groups[2].Value.ToInt();
                    int booknum = bookmatck.Groups[1].Value.ToInt();
                    string bookname = tdbook.InnerText.Trim();

                    // Cover image URL pattern, e.g. http://image.cmfu.com/books/3127618/3127618.jpg
                    string titleimg = "http://image.cmfu.com/books/" + booknum + "/" + booknum + ".jpg";

                    // The same book may appear on several pages; keep the first occurrence only.
                    if (qdls.Exists(p => p.SourceWebNum == booknum))
                    {
                        continue;
                    }

                    qdls.Add(new NovalTempBase()
                    {
                        AuthName = tdauth.InnerText.Trim(),
                        AuthId = authid,
                        SubType = subtype,
                        TitleImg = titleimg,
                        Title = bookname,
                        TotalClick = tdclick.InnerText.ToInt(),
                        TotalComment = 0,
                        Type = type,
                        SourceWebNum = booknum,
                        Source = Source
                    });
                }
            }

            PublicMethod.InsertAndUpdateNovalTmp(qdls, Source);
        }

        /// <summary>
        /// Collects the chapter index of every stored novel of this source
        /// (some sources, including this one, group chapters into volumes).
        /// </summary>
        public void GetNovalChapers()
        {
            var novalcol =
                MongoConnectionFactory.GetMongoCollction<NovalTempBase>("Noval", typeof (NovalTempBase));
            var books = novalcol.AsQueryable().Where(p => p.Source == Source).ToList();
            foreach (var infoQidian in books)
            {
                GetSingleNovalChapers(infoQidian.SourceWebNum);
            }
        }

        /// <summary>
        /// Downloads the chapter index page of one novel, replaces its stored volume records and
        /// passes the chapter list to <c>PublicMethod.InsertChapterTempToSQL</c>.
        /// </summary>
        /// <param name="novalwebnum">Id of the novel on the source site.</param>
        /// <remarks>
        /// Best effort: any exception is traced as a warning and the method returns without
        /// storing anything for this novel.
        /// </remarks>
        public void GetSingleNovalChapers(int novalwebnum)
        {
            // Matches the volume records of this novel from this source.
            IMongoQuery q2 = Query<NovalVolumeTemp>.EQ(p => p.Source, Source);
            IMongoQuery q1 = Query<NovalVolumeTemp>.EQ(p => p.NovalWebNum, novalwebnum);
            IMongoQuery[] qarray = { q1, q2 };
            IMongoQuery query = Query.And(qarray);

            // NOTE(review): chaptercol is not used below (chapters go through PublicMethod); the
            // call is kept in case obtaining the collection has side effects - confirm and remove.
            var chaptercol = MongoConnectionFactory.GetMongoCollction<NovalChapterTemp>("Noval", typeof(NovalChapterTemp));
            var volumecol = MongoConnectionFactory.GetMongoCollction<NovalVolumeTemp>("Noval", typeof (NovalVolumeTemp));

            List<NovalChapterTemp> lschapters = new List<NovalChapterTemp>(1000);
            List<NovalVolumeTemp> lsvolumes = new List<NovalVolumeTemp>(10);
            int chapterorder = 1;
            int volumeorder = 1;
            HtmlDocument htmldocc = new HtmlDocument();

            // Index page, e.g. http://read.qidian.com/BookReader/3127618.aspx
            string url = "http://read.qidian.com/BookReader/" + novalwebnum + ".aspx";
            try
            {
                string sourcehtml = NetHelper.HttpGet(url);
                htmldocc.LoadHtml(sourcehtml);
                var doc = htmldocc.GetElementbyId("content");
                if (doc == null)
                {
                    // No index container on the page: nothing to collect for this novel.
                    return;
                }

                // The container's child divs are read in pairs:
                // div[i] is a volume header, div[i + 1] holds that volume's chapter links.
                int i = 1;
                var topdoc = doc.SelectSingleNode("div[" + i + "]");
                while (topdoc != null)
                {
                    int topnum = 0;

                    // VIP volumes have no header link; their chapters are stored with volume id 0.
                    var topa = topdoc.SelectSingleNode("div/a");
                    if (topa != null)
                    {
                        // Header link, e.g. href="http://www.qidian.com/BookReader/vol,107580,486625.aspx";
                        // the number right before ".aspx" is taken as the volume id.
                        Match m = Regex.Match(topa.OuterHtml, ",(\\d*?).aspx");
                        topnum = m.Groups[1].Value.ToInt();

                        var topaname = topdoc.SelectSingleNode("div/b");
                        string topname = topaname.InnerText.Trim();

                        // After removing "&nbsp" the title is expected to be the part following the
                        // first ';'. Fall back to the whole text when there is no ';'.
                        string[] nameparts = topname.Replace("&nbsp", "").Split(';');
                        topname = nameparts.Length > 1 ? nameparts[1] : nameparts[0];

                        lsvolumes.Add(new NovalVolumeTemp()
                        {
                            Sort = volumeorder,
                            WebNum = topnum,
                            Name = topname,
                            NovalWebNum = novalwebnum,
                            Source = Source
                        });
                        volumeorder++;
                    }

                    // Chapter link, e.g.
                    // <a itemprop='url' href="http://read.qidian.com/BookReader/107580,20901221.aspx" ...>
                    // SelectNodes returns null (not an empty list) when nothing matches, so a volume
                    // without chapters is skipped instead of aborting the whole novel.
                    var contextdoc = doc.SelectSingleNode("div[" + (i + 1) + "]");
                    var chaperas = contextdoc == null ? null : contextdoc.SelectNodes("div/ul/li/a");
                    if (chaperas != null)
                    {
                        foreach (var chapera in chaperas)
                        {
                            Match chapmatchwebnum = Regex.Match(chapera.OuterHtml, ",(\\d*?).aspx");
                            lschapters.Add(new NovalChapterTemp()
                            {
                                Name = chapera.InnerText.Trim(),
                                Sort = chapterorder,
                                WebNum = chapmatchwebnum.Groups[1].Value.ToInt(),
                                VolumeId = topnum,
                                NovalWebNum = novalwebnum,
                                Source = Source
                            });
                            chapterorder++;
                        }
                    }

                    i += 2;
                    topdoc = doc.SelectSingleNode("div[" + i + "]");
                }

                // Replace the stored volumes of this novel, then hand over the chapters.
                volumecol.Remove(query);
                volumecol.InsertBatch(lsvolumes);
                PublicMethod.InsertChapterTempToSQL(lschapters, Source, novalwebnum);
            }
            catch (Exception ex)
            {
                // Still best effort (one failing novel must not stop the batch), but no longer silent.
                System.Diagnostics.Trace.TraceWarning(
                    "QiDianCol.GetSingleNovalChapers failed for novel {0} ({1}): {2}", novalwebnum, url, ex);
            }
        }

        /// <summary>
        /// Refreshes <c>TotalClick</c> of every stored novel of this source from its detail page.
        /// </summary>
        /// <remarks>
        /// Novels whose page does not contain a readable click count are left unchanged.
        /// Comment counts are not collected; candidate pages noted by the original author:
        /// http://forum.qidian.com/NewForum/List.aspx?BookId=3106580 and
        /// http://c.pingba.qidian.com/BookComment.aspx?BookId=3106580
        /// </remarks>
        public void GetNovalCilckComment()
        {
            var novalcol = MongoConnectionFactory.GetMongoCollction<NovalTempBase>("Noval", typeof(NovalTempBase));
            var books = novalcol.AsQueryable().Where(p => p.Source == Source).ToList();
            HtmlDocument htmldocc = new HtmlDocument();

            foreach (var novalTempBase in books)
            {
                // Detail page, e.g. http://www.qidian.com/Book/3106580.aspx
                string url = "http://www.qidian.com/Book/" + novalTempBase.SourceWebNum + ".aspx";
                string sourcehtml = NetHelper.HttpGet(url);
                htmldocc.LoadHtml(sourcehtml);

                var cliclickdiv = htmldocc.GetElementbyId("contentdiv");
                var clickcell = cliclickdiv == null
                    ? null
                    : cliclickdiv.SelectSingleNode("div/div[1]/table/tr/td[1]");
                if (clickcell == null)
                {
                    continue;
                }

                // The cell text is the label "总点击" followed by a separator and the number.
                // The original code removed the separator with Replace("", ""), which always throws
                // ArgumentException (empty search string); take the first digit run instead.
                Match clickmatch = Regex.Match(clickcell.InnerText.Replace("总点击", ""), "[0-9]+");
                int click;
                if (!clickmatch.Success || !int.TryParse(clickmatch.Value, out click))
                {
                    continue;
                }

                novalTempBase.TotalClick = click;
                novalcol.Save(novalTempBase);
            }
        }
    }
}

 

MongoDB CRUD 操作,采集部分代码