首页 > 代码库 > java抓取网页内容

Java 抓取网页内容

下面的示例用 java.net.URL 按 GBK 编码读取网页全部内容，再用正则表达式提取标题或指定片段。直接上代码：

    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.net.MalformedURLException;
    import java.net.URL;
    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    /**
     * Minimal page-scraping demo: downloads a page as text and pulls fragments
     * out of it with regular expressions.
     */
    public class TestHtml {

        /**
         * Matches a whole {@code <title>...</title>} element. Tag case is
         * ignored and the title text may span several lines.
         */
        private static final Pattern TITLE_PATTERN = Pattern.compile(
                "<title>.*?</title>",
                Pattern.CANON_EQ | Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

        /** Matches a single markup tag, i.e. everything from '<' to the next '>'. */
        private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");

        /**
         * Reads the entire content of one web page.
         *
         * <p>The response is decoded as GBK. Lines are concatenated without
         * their line terminators, so the result is a single line of text.
         *
         * @param htmlurl address of the page to download
         * @return the page content with line terminators removed
         * @throws MalformedURLException if {@code htmlurl} is not a valid URL
         *         (a hint is also printed to standard output)
         * @throws IOException if the page cannot be opened or read
         */
        public String getOneHtml(final String htmlurl) throws IOException {
            final URL url;
            try {
                url = new URL(htmlurl);
            } catch (final MalformedURLException me) {
                System.out.println("你输入的URL格式有问题!请仔细输入");
                throw me;
            }

            final StringBuilder page = new StringBuilder();
            // try-with-resources closes the reader (and the underlying URL stream)
            // even when readLine() fails part-way through the download.
            try (BufferedReader in =
                    new BufferedReader(new InputStreamReader(url.openStream(), "gbk"))) {
                String line;
                while ((line = in.readLine()) != null) {
                    page.append(line);
                }
            }
            return page.toString();
        }

        /**
         * Extracts the page title.
         *
         * @param s HTML text to search
         * @return the text of every {@code <title>} element found, concatenated
         *         in order with the markup removed; empty if there is none
         */
        public String getTitle(final String s) {
            return joinMatches(TITLE_PATTERN, s);
        }

        /**
         * Extracts every fragment of the text that matches a caller-supplied
         * regular expression.
         *
         * @param s text to search
         * @param regexarg regular expression selecting the fragments
         * @return all matches concatenated in order of appearance, with any
         *         markup tags removed; empty if nothing matches
         */
        public String getByRegex(final String s, String regexarg) {
            return joinMatches(Pattern.compile(regexarg, Pattern.CANON_EQ), s);
        }

        /** Concatenates every match of {@code pattern} in {@code s} and strips tags from the result. */
        private static String joinMatches(final Pattern pattern, final String s) {
            final StringBuilder joined = new StringBuilder();
            final Matcher matcher = pattern.matcher(s);
            while (matcher.find()) {
                joined.append(matcher.group());
            }
            return outTag(joined.toString());
        }

        /** Removes every markup tag from {@code s}, keeping only the text between tags. */
        private static String outTag(final String s) {
            return TAG_PATTERN.matcher(s).replaceAll("");
        }

        /**
         * Demo entry point: downloads a fixed product page and prints the content
         * of its price span.
         *
         * @param args command-line arguments (unused)
         */
        public static void main(final String[] args) {
            final String url = "http://detail.1688.com/offer/41797007099.html?tracelog=p4p";
            final String regex = "<span class=\"value price-length-5\">.*?</span>";
            try {
                final TestHtml testHtml = new TestHtml();
                final String html = testHtml.getOneHtml(url);
                final String content = testHtml.getByRegex(html, regex);

                System.out.println("contet is :" + content);
            } catch (final Exception e) {
                // Top-level boundary: report the failure instead of silently dropping it.
                System.err.println("Failed to fetch or parse " + url + ": " + e);
            }
        }
    }

 

java抓取网页内容