首页 > 代码库 > 国标地址采集

国标地址采集

因为工作需要,要对京东地址和国标地址进行转换,所以特意采集了最新的国标地址库,以方便进行数据映射。

因为前端技术不太好,只能一点一点地分析页面(大神勿笑),故整理出来:一是为了存储起来方便以后再用,二是为了方便同样需要这部分数据的小伙伴。

  /// <summary>
    /// Scrapes the national-standard (GB) administrative division address list.
    ///
    /// Author: 南丘伟
    /// Date: 2017-06-21
    /// Version: V1.0.0
    /// Version notes:
    /// The patterns below are tied to the page markup at the time of writing, so a
    /// later run may find nothing if the page format changes. Treat this class as a
    /// demo for the next scrape and adjust the patterns as needed.
    /// </summary>
    public class GbAddressHelper
    {
        // Options applied to every Regex.Matches/Match call in this class.
        private const RegexOptions ScrapeOptions = RegexOptions.IgnoreCase | RegexOptions.Multiline;

        // Error payloads handed back to the caller instead of the address array.
        private const string PageFormatChangedJson = "{\"status\":\"n\",\"info\":\"采集页面数据格式变动,请核查采集源码!\"}";
        private const string BlockMovedJson = "{\"status\":\"n\",\"info\":\"页面数据标签位置变动,请核查采集源码!\"}";
        private const string EmptyProvinceCodeJson = "{\"status\":\"n\",\"info\":\"省份代码为空!,请核查采集源码!\"}";

        // Captures the part of the page that holds the address data.
        private const string AddressBlockPattern = "<div class=\"xilan_con\"([\\w\\W]+)</span></b></p></div>";

        // First-level delimiter used to cut the address block into sections.
        private const string SectionSplitPattern = "<p class=\"MsoNormal\"><span lang=\"EN-US\">";

        // Opening markup of a province paragraph; used both as split delimiter and
        // as the prefix that is re-attached to every fragment after the split.
        private const string ProvinceMarker = "<p class=\"MsoNormal\"><b><span lang=\"EN-US\">";

        // Group 1 = numeric province code.
        private const string ProvinceCodePattern = "<p class=\"MsoNormal\"><b><span lang=\"EN-US\">([\\d]+)<span>";

        // Group 1 = province name (CJK characters only).
        private const string ProvinceNamePattern = "<b><span style=\"font-family: 宋体\">([\u4e00-\u9fa5]+)</span></b></p>";

        // Delimiter used to cut a province fragment into per-city chunks.
        private const string CitySplitPattern = "<p class=\"MsoNormal\"><span style=\"font-family: 宋体\"> </span>";

        // Group 1 = city code, group 4 = city name.
        private const string CityPattern = "<span lang=\"EN-US\">([\\d]+)<span>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;(\\s*)</span></span><span style=\"font-family: 宋体\">(\\s*)([\u4e00-\u9fa5]+)</span></p>";

        // Group 1 = district/county code, group 3 = district/county name.
        private const string TownPattern = "<span lang=\"EN-US\">([\\d]+)<span>&nbsp;&nbsp;&nbsp;&nbsp; </span></span><span style=\"font-family: 宋体\">(\\s*)([\u4e00-\u9fa5]+)</span></p>";

        public GbAddressHelper() { }

        /// <summary>
        /// Downloads the address page and converts it to a flat JSON array of
        /// provinces, cities and districts/counties.
        /// Source page (National Bureau of Statistics):
        /// http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201703/t20170310_1471429.html
        /// </summary>
        /// <param name="gbUrl">URL of the page to scrape.</param>
        /// <returns>
        /// On success, a JSON array whose items are
        /// <c>{"PId":"…","AreaId":"…","AreaName":"…"}</c>: provinces have PId "0",
        /// cities have the province code as PId, districts/counties have the city
        /// code as PId. The array is <c>[]</c> when nothing could be extracted.
        /// On failure, a JSON object <c>{"status":"n","info":"…"}</c> describing
        /// which part of the page no longer matches.
        /// </returns>
        public string DownLoadGbAddress(string gbUrl)
        {
            // NOTE(review): HttpUtilitys is a project helper; it presumably returns
            // the response body. A null/empty body is treated like an unrecognized page.
            string html = HttpUtilitys.SendHttpRequest(gbUrl, "");
            if (string.IsNullOrEmpty(html) || !html.Contains("xilan_con"))
            {
                return PageFormatChangedJson;
            }

            // Step 1: isolate the address block; exactly one match is expected.
            MatchCollection blocks = Regex.Matches(html, AddressBlockPattern, ScrapeOptions);
            if (blocks.Count != 1)
            {
                return BlockMovedJson;
            }

            return BuildAreaJson(blocks[0].Groups[0].Value.Trim());
        }

        /// <summary>
        /// Walks the address block province by province and serializes every
        /// province, city and district/county that matches the known markup.
        /// </summary>
        private static string BuildAreaJson(string blockHtml)
        {
            var json = new StringBuilder("[");

            // Step 2: cut the block into sections, then into province fragments.
            foreach (string section in Regex.Split(blockHtml, SectionSplitPattern, RegexOptions.IgnoreCase))
            {
                foreach (string fragment in Regex.Split(section, ProvinceMarker, RegexOptions.IgnoreCase))
                {
                    // The split consumed the marker; put it back so the patterns anchor on it.
                    string provinceHtml = ProvinceMarker + fragment;

                    Match codeMatch = Regex.Match(provinceHtml, ProvinceCodePattern, ScrapeOptions);
                    if (!codeMatch.Success)
                    {
                        continue;
                    }

                    int provinceId = Utils.ObjToInt(codeMatch.Groups[1].Value.Trim(), 0);
                    if (provinceId <= 0)
                    {
                        // Stop right away: continuing would mix records into the error payload.
                        return EmptyProvinceCodeJson;
                    }

                    Match nameMatch = Regex.Match(provinceHtml, ProvinceNamePattern, ScrapeOptions);
                    if (!nameMatch.Success)
                    {
                        continue;
                    }

                    AppendArea(json, 0, provinceId, nameMatch.Groups[1].Value.Trim());
                    AppendCities(json, provinceId, provinceHtml);
                }
            }

            json.Append(']');
            return json.ToString();
        }

        /// <summary>
        /// Step 3: splits one province fragment into city chunks and appends each
        /// recognized city followed by its districts/counties.
        /// </summary>
        private static void AppendCities(StringBuilder json, int provinceId, string provinceHtml)
        {
            foreach (string cityHtml in Regex.Split(provinceHtml, CitySplitPattern, RegexOptions.IgnoreCase))
            {
                Match cityMatch = Regex.Match(cityHtml, CityPattern, ScrapeOptions);
                if (!cityMatch.Success)
                {
                    continue;
                }

                int cityId = Utils.ObjToInt(cityMatch.Groups[1].Value.Trim(), 0);
                AppendArea(json, provinceId, cityId, cityMatch.Groups[4].Value.Trim());

                // Districts/counties live in the same chunk as their city.
                foreach (Match townMatch in Regex.Matches(cityHtml, TownPattern, ScrapeOptions))
                {
                    int townId = Utils.ObjToInt(townMatch.Groups[1].Value.Trim(), 0);
                    AppendArea(json, cityId, townId, townMatch.Groups[3].Value.Trim());
                }
            }
        }

        /// <summary>
        /// Appends one area record, inserting a comma separator when the array
        /// already contains at least one record.
        /// </summary>
        private static void AppendArea(StringBuilder json, int parentId, int areaId, string areaName)
        {
            // Length 1 means only the opening '[' has been written so far.
            if (json.Length > 1)
            {
                json.Append(',');
            }

            json.Append("{\"PId\":\"").Append(parentId)
                .Append("\",\"AreaId\":\"").Append(areaId)
                .Append("\",\"AreaName\":\"").Append(areaName)
                .Append("\"}");
        }
    }

 

国标地址采集