首页 > 代码库 > 将整个html内容保存到指定文件

将整个html内容保存到指定文件


 0人阅读

/**

 * 基本能实现网页抓取,不过要手动输入URL;将整个html内容保存到指定文件。

 *

 *@author chenguoyong

 *

 */

// Downloads a single hard-coded web page and stores its HTML text in a local file.
public class ScrubSelectedWeb {

       /** Platform line separator, appended after every line read from the page. */
       private static final String CRLF = System.getProperty("line.separator");

       /**
        * Fetches {@code http://www.google.cn/}, echoes the page text to standard output
        * and writes the same text to {@code D:/outPut.txt}.
        *
        * <p>Any I/O failure (including a malformed URL) is reported by printing the
        * stack trace; nothing is rethrown.
        *
        * @param args command-line arguments; not used
        */
       public static void main(String[] args) {
              try {
                     URL ur = new URL("http://www.google.cn/");
                     StringBuilder sb = new StringBuilder();

                     // try-with-resources: the reader (and the underlying URL stream) is
                     // closed even when readLine() throws part-way through the download.
                     try (BufferedReader in = new BufferedReader(
                                   new InputStreamReader(ur.openStream()))) {
                            String s;
                            while ((s = in.readLine()) != null) {
                                   sb.append(s).append(CRLF);
                            }
                     }

                     System.out.println(sb);

                     // The output file is opened only after the download succeeded, so a
                     // failed fetch no longer leaves a freshly truncated, empty file behind.
                     try (BufferedWriter out = new BufferedWriter(new FileWriter(
                                   "D:/outPut.txt"))) {
                            out.write(sb.toString());
                     }
              } catch (IOException e) {
                     // MalformedURLException is a subclass of IOException, so this single
                     // handler covers both cases the original caught separately.
                     e.printStackTrace();
              }
       }

}

 

4. 利用htmlparser提取网页纯文本的例子

 

package parser;

 

import org.htmlparser.Node;

import org.htmlparser.NodeFilter;

import org.htmlparser.Parser;

importorg.htmlparser.filters.TagNameFilter;

import org.htmlparser.tags.TableTag;

import org.htmlparser.util.NodeList;

 

/**

 * 标题:利用htmlparser提取网页纯文本的例子

 */

public class TestHTMLParser2 {

       /**

        * 读取目标html内容

        *

        */

       publicstatic void testHtml() {

              try{

                     StringsCurrentLine;

                     StringsTotalString;

                     sCurrentLine= "";

                     sTotalString= "";

                     java.io.InputStreaml_urlStream;

                     java.net.URLl_url = new java.net.URL(

                                   "http://10.249.187.199:8083/injs100/");

                     java.net.HttpURLConnectionl_connection = (java.net.HttpURLConnection) l_url

                                   .openConnection();

                     l_connection.connect();

                     l_urlStream= l_connection.getInputStream();

                     java.io.BufferedReaderl_reader = new java.io.BufferedReader(

                                   newjava.io.InputStreamReader(l_urlStream));

                     while((sCurrentLine = l_reader.readLine()) != null) {

                            sTotalString+= sCurrentLine + "\r\n";

                     }

 

                     StringtestText = extractText(sTotalString);

              }catch (Exception e) {

                     e.printStackTrace();

              }

 

       }

   /**

    * 抽取纯文本信息

    * @param inputHtml html文本

    * @return

    * @throws Exception

    */

       publicstatic String extractText(String inputHtml) throws Exception {

              StringBuffertext = new StringBuffer();

              Parserparser = Parser.createParser(new String(inputHtml.getBytes(),

                            "GBK"),"GBK");

              //遍历所有的节点

              NodeListnodes = parser.extractAllNodesThatMatch(new NodeFilter() {

                     publicboolean accept(Node node) {

                            returntrue;

                     }

              });

 

              System.out.println(nodes.size());

              for(int i = 0; i < nodes.size(); i++) {

                     Nodenodet = nodes.elementAt(i);

                     //字符串的代表性节点:节点的描述

                     text.append(newString(nodet.toPlainTextString().getBytes("GBK"))

                                   +"\r\n");

              }

              returntext.toString();

       }

   /**

    *  读取文件的方式/url 来分析内容. resource也可以是一个Url.

    * @param resource :文件/Url

    * @throws Exception

    */

       publicstatic void test5(String resource) throws Exception {

              ParsermyParser = new Parser(resource);

              myParser.setEncoding("GBK");

              StringfilterStr = "table";

              NodeFilterfilter = new TagNameFilter(filterStr);

              NodeListnodeList = myParser.extractAllNodesThatMatch(filter);

              /*for(inti=0;i<nodeList.size();i++)

              {

                     TableTagtabletag = (TableTag) nodeList.elementAt(i);

                     //标签名称

                     System.out.println(tabletag.getTagName());

                     System.out.println(tabletag.getText());

              }*/

              TableTagtabletag = (TableTag) nodeList.elementAt(1);

             

              http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283860
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283861
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283862
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283863
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283864
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283865
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283866
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283867
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283868
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283869
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283870
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283871
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283872
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283873
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283874
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283875
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283876
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283877
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283878
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283879
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283880
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283881
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283882
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283883
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283884
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283885
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283886
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283887
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283888
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283889
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283890
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283891
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283892
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283893
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283894
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283895
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283896
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283897
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283898
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283899
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283900
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283901
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283902
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283903
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283904
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283905
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283906
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283907
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283908
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283909
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283910
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283911
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283912
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283913
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283914
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283915
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283916
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283917
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283918
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283919
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283920
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283921
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283922
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283923
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283924
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283925
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283926
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283927
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283928
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283929
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283930
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283931
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283932
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283933
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283934
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283935
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283936
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283937
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283938
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283939
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283940
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283941
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283942
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283943
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283944
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283945
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283946
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283947
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283948
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283949
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283950
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283951
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283952
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283953
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283954
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283955
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283956
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283957
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283958
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283959
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283960
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283961
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283962
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283963
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283964
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283965
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283966
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283967
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283968
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283969
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283970
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283971
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283972
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283973
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283974
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283975
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283976
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283977
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283978
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283979
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283980
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283981
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283982
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283983
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283984
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283985
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283986
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283987
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283988
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283989
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283990
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283991
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283992
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283993
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283994
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283995
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283996
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283997
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283998
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283999
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284000
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284001
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284002
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284003
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284004
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284005
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284006
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284007
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284008
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284009
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284010
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284011
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284012
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284013
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284014
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284015
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284016
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284017
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284018
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284019

将整个html内容保存到指定文件