首页 > 代码库 > Java WebClient 总结

Java WebClient 总结

private WebClient getAWebClient() {        WebClient webClient = new WebClient(BrowserVersion.FIREFOX_24);        webClient.getOptions().setTimeout(20000);        // webClient.getCookieManager().setCookiesEnabled(true);        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);        webClient.getOptions().setThrowExceptionOnScriptError(false);        webClient.getOptions().setCssEnabled(false);        webClient.getOptions().setJavaScriptEnabled(false);        webClient.addRequestHeader("Accept", "textml,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");        webClient.addRequestHeader("Accept-Encoding", "gzip, deflate");        webClient.addRequestHeader("Accept-Language", "en-US,en;q=0.5");        webClient.addRequestHeader("Cache-Control", "max-age=0");        webClient.addRequestHeader("Connection", "keep-alive");        webClient.addRequestHeader("Host", "www.amazon.com");        webClient.addRequestHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0");        return webClient;    }
/**     * 采集网页     */    public StringBuilder crawlPage(String url) {        StringBuilder builder = new StringBuilder();        logger.info(Thread.currentThread().getName() + " crawl " + url);        // mygetpage代码放在这里        webClient.getCookieManager().clearCookies();        logger.info(Thread.currentThread().getName() + " webClient.getCookieManager().clearCookies();");        File file = new File(cookiePathAppendRandom());        logger.info(Thread.currentThread().getName() + " File file = new File(cookiePathAppendRandom());");        if (file.exists()) {            FileInputStream fin = null;            try {                fin = new FileInputStream(file);            } catch (FileNotFoundException e1) {                e1.printStackTrace();            }            CookieStore cookieStore = null;            ObjectInputStream in;            try {                in = new ObjectInputStream(fin);                cookieStore = (CookieStore) in.readObject();                in.close();            } catch (IOException e) {                logger.error(e);            } catch (ClassNotFoundException e) {                logger.error(e);            }            List<org.apache.http.cookie.Cookie> l = cookieStore.getCookies();            for (org.apache.http.cookie.Cookie temp : l) {                Cookie cookie = new Cookie(temp.getDomain(), temp.getName(), temp.getValue(), temp.getPath(),                        temp.getExpiryDate(), false);                webClient.getCookieManager().addCookie(cookie);            }        }        logger.info(Thread.currentThread().getName() + " MyGetPage start,url:" + url);        HtmlPage page = MyGetPage(new StringBuffer(url));        logger.info(Thread.currentThread().getName() + " MyGetPage end,url:" + url);        if (page == null) {            // 采集过程中出现异常的model,可以统一放在一个list中,发送给server重新加入到采集分配队列            logger.info("Page null!");            AmazonCrawlModel model=new AmazonCrawlModel(crawlId, crawlURLId, url, depth,ischange);            exceptionFun(model);            return (new StringBuilder("getNullPage"));        }        logger.info(Thread.currentThread().getName() + " builder.append(page.asXml());");        builder.append(page.asXml());        logger.info(Thread.currentThread().getName() + " return builder;");        logger.info(Thread.currentThread().getName() +" CrawlPage $Length="+builder.toString().length());        if(builder.toString().length()<=300){            AmazonCrawlModel model=new AmazonCrawlModel(crawlId, crawlURLId, url, depth,ischange);            exceptionFun(model);            return (new StringBuilder("getNullPage"));        }        return builder;    }

 

/***     * 自定义的getpage,遇到验证码页面识别直至成功     *      */    private HtmlPage MyGetPage(StringBuffer URL) {        HtmlPage page = null;        boolean flag = true;        int TryTimeCnt = 1;        int UnknowHostTryTimeCnt = 1;        while (flag) {            flag = false;            try {                logger.info(Thread.currentThread().getName() + " webClient.getPage : " + URL + ",CrawlURL_id:"                        + crawlURLId);                page = webClient.getPage(URL.toString());                Document doc = Jsoup.parse(page.asXml());                int robotchecknum = 1;                while (doc.select("title").text().equals("Robot Check")) {                    logger.info(Thread.currentThread().getName() + " " + dayformat1.format(System.currentTimeMillis())                            + " [Robot Check,URL:" + URL + "]");                    String captcha_str = AmazonGetCaptcha.GetCaptcha(new StringBuilder(doc.toString()));                    logger.info(Thread.currentThread().getName() + " " + dayformat1.format(System.currentTimeMillis())                            + " end AmazonGetCaptcha.GetCaptcha");                    logger.info(dayformat1.format(new Date()) + " " + Thread.currentThread().getName() + " : "                            + captcha_str);                    HtmlForm form = null;                    logger.info(Thread.currentThread().getName() + " page.getForms().get(0) Start");                    form = page.getForms().get(0);                    logger.info(Thread.currentThread().getName() + " page.getForms().get(0) End");                    HtmlButton button = null;                    logger.info(Thread.currentThread().getName() + " form.getElementsByTagName(button).get(0) Start");                    button = (HtmlButton) form.getElementsByTagName("button").get(0);                    logger.info(Thread.currentThread().getName() + " form.getElementsByTagName(button).get(0) End");                    logger.info(Thread.currentThread().getName() + " setValueAttribute Start");                    form.getInputByName("field-keywords").setValueAttribute(captcha_str);                    logger.info(Thread.currentThread().getName() + " setValueAttribute End");                    logger.info(Thread.currentThread().getName() + " button.click Start");                    boolean click_flag = false;                    while (!click_flag) {                        try {                            click_flag = true;                            page = button.click();                        } catch (Exception e1) {                            logger.error(Thread.currentThread().getName() + " button.click出错了: " + e1);                            //e1.printStackTrace();                            click_flag = false;                        }                    }                    logger.info(Thread.currentThread().getName() + " button.click end");                    while (page.asXml() == null) {                        logger.info(Thread.currentThread().getName() + " page xml null");                        logger.info(Thread.currentThread().getName() +" "+ page.asXml());                        page.refresh();                        logger.info(Thread.currentThread().getName() + " refresh End!");                    }                    logger.info(Thread.currentThread().getName() + " button.click End");                    logger.info(Thread.currentThread().getName() + " Start ParsePage!");                    doc = Jsoup.parse(page.asXml());                    if (!doc.select("title").text().equals("Robot Check")) {                        logger.info(Thread.currentThread().getName() + " " + doc.select("title").text());                        logger.info(Thread.currentThread().getName() + " "                                + dayformat1.format(System.currentTimeMillis()) + " [Robot Check,captcha success:"                                + captcha_str + ",try num:" + robotchecknum + "]");                    }                    robotchecknum++;                }            } catch (FailingHttpStatusCodeException e) {                logger.error(Thread.currentThread().getName() +" "+ e);                flag = true;            } catch (MalformedURLException e) {                logger.error(Thread.currentThread().getName() +" "+ e);                flag = true;            }catch(UnknownHostException e) {                logger.error(Thread.currentThread().getName() +" "+ e);                flag = true;                logger.info("found UnknownHostException,start sleep 20 min");                try {                    Thread.sleep(1000*60*Integer.parseInt(Configuration.getProperties("unknowhost_sleeptime")));                } catch (InterruptedException e1) {                    logger.error(Thread.currentThread().getName() +" "+ e1);                }                logger.info("found UnknownHostException,end sleep 20 min");                UnknowHostTryTimeCnt++;// 访问异常数加一                logger.info(Thread.currentThread().getName() + " " + dayformat1.format(System.currentTimeMillis())                        + " [UnknowHostTryTimeCnt:" + UnknowHostTryTimeCnt + "]");                if (UnknowHostTryTimeCnt > Integer.parseInt(Configuration.getProperties("unknowhost_maxtrytime"))) {                    return null;                }            }catch (Exception eq) {                logger.error(Thread.currentThread().getName() + " "+eq);                TryTimeCnt++;// 访问异常数加一                logger.info(Thread.currentThread().getName() + " " + dayformat1.format(System.currentTimeMillis())                        + " [TryTimeCnt:" + TryTimeCnt + "]");                if (TryTimeCnt > 5) {                    return null;                }                try {                    Thread.sleep(1000);                } catch (InterruptedException e) {                    e.printStackTrace();                    logger.error(Thread.currentThread().getName() + e);                }                flag = true;            }            try {                Thread.sleep(random.nextInt(500) + 1500);            } catch (InterruptedException e) {                logger.error(Thread.currentThread().getName() + e);                flag = true;            }        }        return page;    }

 

Java WebClient 总结