首页 > 代码库 > 提取新闻下一页

提取新闻下一页

 

 

package com.unbank.robotspider.util;

import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Pattern;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Collects the pagination links of a multi-page article.
 *
 * <p>Every {@code <a>} element of the document is inspected: its text must look
 * like a pager label (see {@link #checkText(String)}) and its resolved URL must
 * differ from the base URL only by a short page number (see
 * {@link #checkUrl(String, String)}). Gaps between the lowest and highest page
 * found are then filled using {@code UrlRuleUtil}.
 */
public class SmartNextPageFecther {

    /** Link texts that are accepted as pager labels (first / next / last page). */
    private static final String[] PAGER_LABELS = { "首页", "第一页", "下一页", "末页", "最后一页", "尾页" };

    /** A bare one- or two-digit page number. */
    private static final Pattern PAGE_NUMBER = Pattern.compile("\\d{1,2}");

    /**
     * Decorations removed from the URL difference before it is read as a number.
     * Order matters: "page" must be removed before the single letter "p".
     */
    private static final String[] PAGE_TOKENS = { "_", "=", "index", "page", "p", "-" };

    /**
     * Demo entry point: fetches a sample article and prints every page URL found.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String url = "http://focus.stockstar.com/SS2014061700001351.shtml";
        Document document = JsoupUtil.readUrl(url);
        Map<Integer, String> pageList = new SmartNextPageFecther().getNextPageUrl(document, url);
        // Iterate a sorted copy instead of guessing an upper key bound, so that
        // every discovered page is printed, in page order.
        for (String nextUrl : new TreeMap<Integer, String>(pageList).values()) {
            if (nextUrl != null) {
                System.out.println(nextUrl);
            }
        }
    }

    /**
     * Builds a map from page number to page URL for the given document.
     *
     * @param doc     parsed page; it is cloned, the caller's instance is not used directly
     * @param baseurl URL of the page; passed to {@code UrlTools.getFullUrl} together
     *                with each href (presumably to resolve relative links; confirm
     *                against UrlTools) and used as the comparison base for
     *                {@link #checkUrl(String, String)}
     * @return page number to URL; empty when no pagination link is recognised
     */
    public Map<Integer, String> getNextPageUrl(Document doc, String baseurl) {
        Document document = doc.clone();
        Map<Integer, String> map = new HashMap<Integer, String>();
        Elements anchors = document.getElementsByTag("a");

        // Lowest page number seen, capped at 5 by its initial value.
        int prePageNum = 5;
        // Highest page number seen.
        int pageNum = 0;

        for (Element anchor : anchors) {
            String href = UrlTools.getFullUrl(baseurl, anchor.attr("href"));
            if (href == null || href.trim().isEmpty()) {
                continue;
            }
            // Is this anchor a pager link ("next page", page number, ...)?
            if (!checkText(anchor.text())) {
                continue;
            }
            int page = checkUrl(baseurl, href);
            if (page == -1) {
                continue;
            }
            pageNum = Math.max(pageNum, page);
            prePageNum = Math.min(prePageNum, page);
            map.put(page, href);
        }

        // Two or more pager links besides the current page means at least three pages.
        if (map.size() >= 2) {
            // Take the lowest page and its successor as samples. The original only
            // handled a lowest page of 0, 1 or 2 and handed nulls to UrlRuleUtil
            // for anything else.
            String second = map.get(prePageNum);
            String third = map.get(prePageNum + 1);
            // Without two sample URLs no rule can be derived; leave the map as found.
            if (second != null && third != null) {
                String urlRule = UrlRuleUtil.getURlRule(second, third);
                for (int i = prePageNum; i <= pageNum; i++) {
                    if (map.get(i) == null) {
                        map.put(i, UrlRuleUtil.getcheckURL(urlRule, i));
                    }
                }
            }
        }
        return map;
    }

    /**
     * Tells whether a link text looks like a pager label.
     *
     * @param text anchor text; {@code null} is treated as "not a pager label"
     * @return {@code true} when the text equals one of the known labels or is a
     *         one- or two-digit number
     */
    public boolean checkText(String text) {
        if (text == null) {
            return false;
        }
        for (String label : PAGER_LABELS) {
            if (label.equals(text)) {
                return true;
            }
        }
        return PAGE_NUMBER.matcher(text).matches();
    }

    /**
     * Extracts a page number from the difference between two URLs.
     *
     * <p>The longer URL is scanned left to right against the shorter one;
     * characters of the longer URL that do not match the current character of the
     * shorter one are collected. After removing the decorations in
     * {@code PAGE_TOKENS}, the collected text must be a one- or two-digit number.
     *
     * @param url1 first URL (the base page in this class)
     * @param url2 second URL (the candidate link in this class)
     * @return the page number, or -1 when either URL is null or empty, when no
     *         difference is collected, or when the difference is not a
     *         one- or two-digit number
     */
    public int checkUrl(String url1, String url2) {
        if (url1 == null || url2 == null || url1.isEmpty() || url2.isEmpty()) {
            return -1;
        }

        // On equal lengths the original assigned url2 to both roles and so compared
        // the URL with itself; make sure the two roles always hold different arguments.
        String longStr;
        String shortStr;
        if (url1.length() > url2.length()) {
            longStr = url1;
            shortStr = url2;
        } else {
            longStr = url2;
            shortStr = url1;
        }

        // NOTE(review): the scan stops one character before the end of the longer
        // URL and also stops as soon as the shorter URL is fully matched, so a
        // trailing suffix never contributes. Kept as in the original; confirm
        // whether that is intended before widening it.
        StringBuilder variance = new StringBuilder();
        int j = 0;
        for (int i = 0; i < longStr.length() - 1; i++) {
            char c = longStr.charAt(i);
            if (c != shortStr.charAt(j)) {
                variance.append(c);
            } else {
                j++;
                if (j == shortStr.length()) {
                    break;
                }
            }
        }
        if (variance.length() == 0) {
            return -1;
        }

        String numStr = variance.toString();
        for (String token : PAGE_TOKENS) {
            numStr = numStr.replace(token, "");
        }
        return PAGE_NUMBER.matcher(numStr).matches() ? Integer.parseInt(numStr) : -1;
    }
}