首页 > 代码库 > java抓取网页数据,登录之后抓取数据。

java抓取网页数据,登录之后抓取数据。

最近做了一个从网络上抓取数据的小程序,主要用于信贷方面:收集了一些黑名单网站,并把这些网站上的数据抓取到自己的系统中。

也找了一些资料,觉得没有一个很好的,全面的例子。因此在这里做个笔记提醒自己。

首先需要一个jsoup的jar包,我用的是1.6.0版本。下载地址为:http://pan.baidu.com/s/1mgqOuHa

1,获取网页内容(核心代码,技术有限没封装)。

2,登录之后抓取网页数据(如何在请求中携带cookie)。

3,获取网站的ajax请求方法(返回json)。

以上这三点我都放在同一个类里(写得比较粗糙,望见谅;直接把代码copy过去,应该就可以用)。

一,这个类包含了上面1,2,3三种方法,直接运行main方法就可以进行测试。

package com.minxinloan.black.web.utils;import java.io.BufferedReader;import java.io.ByteArrayOutputStream;import java.io.DataInputStream;import java.io.DataOutputStream;import java.io.File;import java.io.FileOutputStream;import java.io.FileWriter;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.OutputStream;import java.io.PrintWriter;import java.net.HttpURLConnection;import java.net.URL;import java.net.URLConnection;import java.net.URLEncoder;import java.nio.charset.Charset;import java.util.ArrayList;import java.util.HashMap;import java.util.Iterator;import java.util.List;import java.util.Map;import java.util.Map.Entry;import java.util.StringTokenizer;import net.sf.json.JSONArray;import net.sf.json.JSONObject;import org.jsoup.Connection;import org.jsoup.Connection.Method;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;public class CookieUtil {    public final static String CONTENT_TYPE = "Content-Type";    public static void main(String[] args) {                //String loginURL = "http://www.p2peye.com/member.php?mod=logging&action=login&loginsubmit=yes&loginhash=Lsc66&username=puqiuxiaomao&password=a1234567";        String listURL = "http://www.p2peye.com/blacklist.php?p=2";        String logURL = "http://www.p2peye.com/member.php";        //********************************需要登录的*************************************************        try {                Connection.Response  res =                         Jsoup.connect(logURL)                            .data("mod","logging"                                    ,"action","login"                                    ,"loginsubmit","yes"                                    ,"loginhash","Lsc66"                                    ,"username","puqiuxiaomao"                                    ,"password","a1234567")                            .method(Method.POST)                            .execute();   
                                             //这儿的SESSIONID需要根据要登录的目标网站设置的session Cookie名字而定                Connection con=Jsoup.connect(listURL);                //设置访问形式(电脑访问,手机访问):直接百度都参数设置                con.header("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)");                //把登录信息的cookies保存如map对象里面                Map <String,String>  map=res.cookies();                Iterator<Entry<String,String>> it =map.entrySet().iterator();                while(it.hasNext()){                    Entry<String,String> en= it.next();                     //把登录的信息放入请求里面                    con =con.cookie(en.getKey(), en.getValue());                                    }                //再次获取Document对象。                Document objectDoc = con.get();                                Elements elements = objectDoc.getAllElements();//获取这个连接返回页面的源码内容(不是源码跟源码差不多)                for (Element element : elements) {                    //element是迭代出来的标签:如:<div><span></span></div>                    Elements elements2= element.getAllElements();//                     for (Element element2 : elements2) {                         element2.text();                         element2.attr("href");//获取标签属性。element2代表a标签:href代表属性                         element2.text();//获取标签文本                    }                }                                //********************************不需要登录的*************************************************                                String URL = "http://www.p2peye.com/blacklist.php?p=2";                Document conTemp = Jsoup.connect(URL).get();                Elements elementsTemps = conTemp.getAllElements();                 for (Element elementsTemp : elementsTemps) {                     elementsTemp.text();                     elementsTemp.attr("href");//获取标签属性。element2代表a标签:href代表属性                     elementsTemp.text();//获取标签文本                }                                                
//********************************ajax方法获取内容。。。*************************************************。                 HttpURLConnection connection = null;                    BufferedReader reader = null;                    try {                        StringBuffer sb = new StringBuffer();                        URL getUrl = new URL(URL);                        connection = (HttpURLConnection)getUrl.openConnection();                        reader = new BufferedReader(new InputStreamReader(                                connection.getInputStream(),"utf-8"));                        String lines;                        while ((lines = reader.readLine()) != null) {                            sb.append(lines);                        };                        List<Map<String, Object>> list = parseJSON2List(sb.toString());//json转换成list                    } catch (Exception e) {                                            } finally{                        if(reader!=null)                            try {                                reader.close();                            } catch (IOException e) {                            }                        // 断开连接                        connection.disconnect();                    }                        } catch (IOException e) {            // TODO Auto-generated catch block            e.printStackTrace();        }            }        public static Map<String, Object> parseJSON2Map(String jsonStr){          Map<String, Object> map = new HashMap<String, Object>();          //最外层解析          JSONObject json = JSONObject.fromObject(jsonStr);          for(Object k : json.keySet()){              Object v = json.get(k);               //如果内层还是数组的话,继续解析              if(v instanceof JSONArray){                  List<Map<String, Object>> list = new ArrayList<Map<String,Object>>();                  Iterator<JSONObject> it = ((JSONArray)v).iterator();                  while(it.hasNext()){                      JSONObject json2 = it.next();      
                list.add(parseJSON2Map(json2.toString()));                  }                  map.put(k.toString(), list);              } else {                  map.put(k.toString(), v);              }          }          return map;      }          public static List<Map<String, Object>> parseJSON2List(String jsonStr){          JSONArray jsonArr = JSONArray.fromObject(jsonStr);          List<Map<String, Object>> list = new ArrayList<Map<String,Object>>();          Iterator<JSONObject> it = jsonArr.iterator();          while(it.hasNext()){              JSONObject json2 = it.next();              list.add(parseJSON2Map(json2.toString()));          }          return list;      }          }

二,这个是获取验证码的类,可以研究下。(但你需要先分析出网站验证码的请求地址)

package com.minxinloan.black.web.utils;import java.io.BufferedReader;import java.io.DataInputStream;import java.io.DataOutputStream;import java.io.File;import java.io.FileOutputStream;import java.io.FileWriter;import java.io.InputStream;import java.io.InputStreamReader;import java.io.PrintWriter;import java.net.HttpURLConnection;import java.net.URL;import java.net.URLConnection;import java.nio.charset.Charset;import java.util.HashMap;import java.util.List;import java.util.Map;import java.util.StringTokenizer;public class Utils {//解析验证码的    public static Content getRandom(String method, String sUrl,// 要解析的url            Map<String, String> paramMap, // 存放用户名和密码的map            Map<String, String> requestHeaderMap,// 存放COOKIE的map            boolean isOnlyReturnHeader, String path) {        Content content = null;        HttpURLConnection httpUrlConnection = null;        InputStream in = null;        try {            URL url = new URL(sUrl);            boolean isPost = "POST".equals(method);            if (method == null                    || (!"GET".equalsIgnoreCase(method) && !"POST"                            .equalsIgnoreCase(method))) {                method = "POST";            }            URL resolvedURL = url;            URLConnection urlConnection = resolvedURL.openConnection();            httpUrlConnection = (HttpURLConnection) urlConnection;            httpUrlConnection.setRequestMethod(method);            httpUrlConnection.setRequestProperty("Accept-Language",                    "zh-cn,zh;q=0.5");            // Do not follow redirects, We will handle redirects ourself            httpUrlConnection.setInstanceFollowRedirects(false);            httpUrlConnection.setDoOutput(true);            httpUrlConnection.setDoInput(true);            httpUrlConnection.setConnectTimeout(5000);            httpUrlConnection.setReadTimeout(5000);            httpUrlConnection.setUseCaches(false);            httpUrlConnection.setDefaultUseCaches(false);            
httpUrlConnection.connect();            int responseCode = httpUrlConnection.getResponseCode();            if (responseCode == HttpURLConnection.HTTP_OK                    || responseCode == HttpURLConnection.HTTP_CREATED) {                byte[] bytes = new byte[0];                if (!isOnlyReturnHeader) {                    DataInputStream ins = new DataInputStream(                            httpUrlConnection.getInputStream());                    // 验证码的位置                    DataOutputStream out = new DataOutputStream(                            new FileOutputStream(path + "/code.bmp"));                    byte[] buffer = new byte[4096];                    int count = 0;                    while ((count = ins.read(buffer)) > 0) {                        out.write(buffer, 0, count);                    }                    out.close();                    ins.close();                }                String encoding = null;                if (encoding == null) {                    encoding = getEncodingFromContentType(httpUrlConnection                            .getHeaderField(""));                }                content = new Content(sUrl, new String(bytes, encoding),                        httpUrlConnection.getHeaderFields());            }        } catch (Exception e) {            return null;        } finally {            if (httpUrlConnection != null) {                httpUrlConnection.disconnect();            }        }        return content;    }    public static String getEncodingFromContentType(String contentType) {        String encoding = null;        if (contentType == null) {            return null;        }        StringTokenizer tok = new StringTokenizer(contentType, ";");        if (tok.hasMoreTokens()) {            tok.nextToken();            while (tok.hasMoreTokens()) {                String assignment = tok.nextToken().trim();                int eqIdx = assignment.indexOf(‘=‘);                if (eqIdx != -1) {                    String varName 
= assignment.substring(0, eqIdx).trim();                    if ("charset".equalsIgnoreCase(varName)) {                        String varValue = http://www.mamicode.com/assignment.substring(eqIdx + 1)                                .trim();                        if (varValue.startsWith("\"")                                && varValue.endsWith("\"")) {                            // substring works on indices                            varValue = http://www.mamicode.com/varValue.substring(1,                                    varValue.length() - 1);                        }                        if (Charset.isSupported(varValue)) {                            encoding = varValue;                        }                    }                }            }        }        if (encoding == null) {            return "UTF-8";        }        return encoding;    }    // 这个是输出    public static boolean inFile(String content, String path) {        PrintWriter out = null;        File file = new File(path);        try {            if (!file.exists()) {                file.createNewFile();            }            out = new PrintWriter(new FileWriter(file));            out.write(content);            out.flush();            return true;        } catch (Exception e) {            e.printStackTrace();        } finally {            out.close();        }        return false;    }    public static String getHtmlReadLine(String httpurl) {        String CurrentLine = "";        String TotalString = "";        InputStream urlStream;        String content = "";        try {            URL url = new URL(httpurl);            HttpURLConnection connection = (HttpURLConnection) url                    .openConnection();            connection.connect();            System.out.println(connection.getResponseCode());            urlStream = connection.getInputStream();            BufferedReader reader = new BufferedReader(            new InputStreamReader(urlStream, "utf-8"));            while 
((CurrentLine = reader.readLine()) != null) {                TotalString += CurrentLine + "\n";            }            content = TotalString;        } catch (Exception e) {        }        return content;    }}class Content {    private String url;    private String body;    private Map<String, List<String>> m_mHeaders = new HashMap<String, List<String>>();    public Content(String url, String body, Map<String, List<String>> headers) {        this.url = url;        this.body = body;        this.m_mHeaders = headers;    }    public String getUrl() {        return url;    }    public String getBody() {        return body;    }    public Map<String, List<String>> getHeaders() {        return m_mHeaders;    }}

 

java抓取网页数据,登录之后抓取数据。