首页 > 代码库 > java抓取网页内容
java抓取网页内容
直接上代码
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Small demo that downloads a web page and pulls fragments out of it with
 * regular expressions.
 */
public class TestHtml {

    /** Charset name used to decode the downloaded page. */
    private static final String PAGE_CHARSET = "gbk";

    /** Matches a complete {@code <title>...</title>} element; compiled once and reused. */
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title>.*?</title>", Pattern.CANON_EQ);

    /** Matches a single markup tag such as {@code <span class="x">} or {@code </span>}. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<.*?>");

    /**
     * Reads the whole content of a web page.
     *
     * <p>The page is read line by line and the lines are appended without their
     * line terminators, so the returned text is a single line.
     *
     * @param htmlurl address of the page to download
     * @return the page text decoded as GBK, with line terminators removed
     * @throws MalformedURLException if {@code htmlurl} is not a valid URL (a hint is
     *         also printed to standard output before the exception is rethrown)
     * @throws IOException if the page cannot be opened or read
     */
    public String getOneHtml(final String htmlurl) throws IOException {
        final StringBuilder sb = new StringBuilder();
        try {
            final URL url = new URL(htmlurl);
            final BufferedReader in =
                    new BufferedReader(new InputStreamReader(url.openStream(), PAGE_CHARSET));
            // try/finally (not try-with-resources) keeps the file free of Java 7+ syntax
            // while still closing the stream when readLine() fails.
            try {
                String line;
                while ((line = in.readLine()) != null) {
                    sb.append(line);
                }
            } finally {
                in.close();
            }
        } catch (final MalformedURLException me) {
            System.out.println("你输入的URL格式有问题!请仔细输入");
            throw me;
        }
        return sb.toString();
    }

    /**
     * Gets the title of a web page.
     *
     * @param s page source to search
     * @return the text of every {@code <title>} element found, concatenated and with
     *         the tags removed; an empty string when there is no title element
     */
    public String getTitle(final String s) {
        return joinMatches(TITLE_PATTERN, s);
    }

    /**
     * Extracts every fragment of the page that matches a caller-supplied regular
     * expression.
     *
     * @param s page source to search
     * @param regexarg regular expression to look for (compiled with
     *        {@link Pattern#CANON_EQ})
     * @return all matches concatenated in order of appearance, with markup tags
     *         removed; an empty string when nothing matches
     */
    public String getByRegex(final String s, final String regexarg) {
        return joinMatches(Pattern.compile(regexarg, Pattern.CANON_EQ), s);
    }

    /** Concatenates every match of {@code pattern} in {@code s} and strips the markup tags. */
    private static String joinMatches(final Pattern pattern, final String s) {
        final StringBuilder joined = new StringBuilder();
        final Matcher matcher = pattern.matcher(s);
        while (matcher.find()) {
            joined.append(matcher.group());
        }
        return outTag(joined.toString());
    }

    /** Removes every {@code <...>} tag from {@code s}, leaving only the text between tags. */
    private static String outTag(final String s) {
        return TAG_PATTERN.matcher(s).replaceAll("");
    }

    /**
     * Downloads a sample product page and prints the content of its price element.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(final String[] args) {
        final String url = "http://detail.1688.com/offer/41797007099.html?tracelog=p4p";
        final String regex = "<span class=\"value price-length-5\">.*?</span>";
        try {
            final TestHtml testHtml = new TestHtml();
            final String html = testHtml.getOneHtml(url);
            final String content = testHtml.getByRegex(html, regex);
            System.out.println("contet is :" + content);
        } catch (final IOException e) {
            // Report the failure instead of silently discarding it.
            System.err.println("Failed to fetch " + url + ": " + e);
        }
    }
}
java抓取网页内容
声明：以上内容来自用户投稿及互联网公开渠道收集整理发布，本网站不拥有所有权，未作人工编辑处理，也不承担相关法律责任。若内容有误或涉及侵权，可进行投诉：投诉/举报。工作人员会在5个工作日内联系你，一经查实，本站将立刻删除涉嫌侵权内容。