首页 > 代码库 > 网络爬虫爬取邮编对应的地址,并将其存入xml中作为数据库
网络爬虫爬取邮编对应的地址,并将其存入xml中作为数据库
package com.bjsxt.ly;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.dom4j.io.OutputFormat;
import org.dom4j.io.XMLWriter;
/**
 * Small crawler that downloads the page for one postal code, extracts the
 * text that immediately precedes each {@code </li>} tag with a regular
 * expression, and stores the extracted strings in an XML file.
 */
public class HelloSpider {

    /**
     * Entry point: crawls the page for a fixed postal code and writes the
     * matches to {@code <user.dir>/src/postcode.xml}.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the download or the XML output fails
     */
    public static void main(String[] args) throws Exception {
        // Output file lives under the "src" folder of the working directory.
        String path = System.getProperty("user.dir") + File.separator + "src" + File.separator + "postcode.xml";
        // Postal code to look up.
        String postcode = "100088";
        // Download the page for that postal code.
        CharSequence charSequence = webSpider("http://tool.cncn.com/youbian/" + postcode);
        // Keep every run of CJK/word/parenthesis/hyphen characters that is directly followed by </li>.
        List<String> list = regexpPostcode("([\\u4e00-\\u9fa5\\w\\(\\)58-]+)(?=</li>)", 0, charSequence);
        // Persist the extracted strings as XML.
        createXml(postcode, list, path);
    }

    /**
     * Writes the addresses to an XML file with the layout
     * {@code <postcodes><postcode code="..."><address>...</address>...</postcode></postcodes>}.
     *
     * @param postcode value stored in the {@code code} attribute of the {@code postcode} element
     * @param list     address strings; one {@code address} element is created per entry
     * @param path     path of the file to write (an existing file is overwritten)
     * @throws UnsupportedEncodingException if the output format's encoding is not supported
     * @throws FileNotFoundException        if the file cannot be opened for writing
     * @throws IOException                  if writing fails
     */
    private static void createXml(String postcode, List<String> list, String path) throws UnsupportedEncodingException, FileNotFoundException, IOException {
        // Root element.
        Element rootElement = DocumentHelper.createElement("postcodes");
        // One <postcode> element carrying the code as an attribute.
        Element postcodeElement = DocumentHelper.createElement("postcode");
        postcodeElement.addAttribute("code", postcode);
        // One <address> child per extracted string.
        for (String address : list) {
            Element addressElement = DocumentHelper.createElement("address");
            addressElement.setText(address);
            postcodeElement.add(addressElement);
        }
        rootElement.add(postcodeElement);
        Document document = DocumentHelper.createDocument(rootElement);
        // try-with-resources guarantees the file handle is released even when
        // writing fails; the original never closed the stream.
        try (FileOutputStream output = new FileOutputStream(path)) {
            XMLWriter writer = new XMLWriter(output, OutputFormat.createPrettyPrint());
            writer.write(document);
            // Push any text still buffered inside the writer before the stream closes.
            writer.flush();
        }
    }

    /**
     * Collects every match of a regular expression in the given text.
     *
     * @param regex        regular expression to search for
     * @param flags        match flags passed to {@link Pattern#compile(String, int)}
     * @param charSequence text to search
     * @return the full text of each match ({@code group()}), in order of
     *         occurrence; empty if nothing matches
     */
    private static List<String> regexpPostcode(String regex, int flags, CharSequence charSequence) {
        List<String> list = new ArrayList<>();
        Matcher matcher = Pattern.compile(regex, flags).matcher(charSequence);
        while (matcher.find()) {
            list.add(matcher.group());
        }
        return list;
    }

    /**
     * Downloads the resource at the given URL and returns its text.
     *
     * <p>The response is decoded as GBK and the lines are concatenated
     * without any separator, so line breaks in the page are dropped.
     *
     * @param spec URL to fetch
     * @return the downloaded text with line terminators removed
     * @throws IOException if the URL is malformed or the transfer fails
     */
    private static CharSequence webSpider(String spec) throws IOException {
        URL url = new URL(spec);
        URLConnection connection = url.openConnection();
        // Send a browser User-Agent header with the request.
        connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0");
        // A method-local accumulator needs no synchronization, so StringBuilder suffices.
        StringBuilder buffer = new StringBuilder();
        // Both resources are closed on every exit path; the original leaked them.
        try (InputStream inputStream = connection.getInputStream();
             BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "GBK"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // Intentionally no separator: the extraction regex runs on the joined text.
                buffer.append(line);
            }
        }
        return buffer;
    }
}
网络爬虫爬取邮编对应的地址,并将其存入xml中作为数据库