首页 > 代码库 > 将整个html内容保存到指定文件
将整个html内容保存到指定文件
/**
 * Basic web-page fetcher: downloads one page and saves its entire HTML content
 * to a file, echoing the content to standard output on the way.
 *
 * <p>The page URL and the output file default to fixed values; both can be
 * overridden on the command line.
 *
 * @author chenguoyong
 */
public class ScrubSelectedWeb {
    /** Platform line separator appended after every line read from the page. */
    private static final String CRLF = System.getProperty("line.separator");

    /** Page fetched when no URL is given on the command line. */
    private static final String DEFAULT_URL = "http://www.google.cn/";

    /** File written when no output path is given on the command line. */
    private static final String DEFAULT_OUTPUT = "D:/outPut.txt";

    /**
     * Fetches a page, prints it to standard output and writes it to a file.
     *
     * <p>The page is read line by line and each line is re-terminated with the
     * platform line separator. Text is decoded and encoded as UTF-8 instead of
     * the platform default charset so the result does not depend on the machine
     * the program runs on. The output file is opened only after the whole page
     * has been read, so a failed download does not truncate an existing file.
     * I/O failures are reported by printing the stack trace.
     *
     * @param args optional; {@code args[0]} is the URL to fetch and
     *             {@code args[1]} the output file path. Missing entries fall
     *             back to the built-in defaults.
     */
    public static void main(String[] args) {
        String address = (args != null && args.length > 0) ? args[0] : DEFAULT_URL;
        String target = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT;
        try {
            URL ur = new URL(address);
            StringBuilder sb = new StringBuilder();

            // try-with-resources closes the reader (and the underlying URL stream)
            // even when readLine() throws.
            try (BufferedReader in = new BufferedReader(new InputStreamReader(
                    ur.openStream(), java.nio.charset.StandardCharsets.UTF_8))) {
                String s;
                while ((s = in.readLine()) != null) {
                    sb.append(s).append(CRLF);
                }
            }
            System.out.println(sb);

            try (BufferedWriter out = new BufferedWriter(new java.io.OutputStreamWriter(
                    new java.io.FileOutputStream(target),
                    java.nio.charset.StandardCharsets.UTF_8))) {
                out.write(sb.toString());
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
4. 利用htmlparser提取网页纯文本的例子
package parser;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
/**
 * Title: example of extracting plain text from a web page with htmlparser.
 */
public class TestHTMLParser2 {
/**
* 读取目标html内容
*
*/
publicstatic void testHtml() {
try{
StringsCurrentLine;
StringsTotalString;
sCurrentLine= "";
sTotalString= "";
java.io.InputStreaml_urlStream;
java.net.URLl_url = new java.net.URL(
"http://10.249.187.199:8083/injs100/");
java.net.HttpURLConnectionl_connection = (java.net.HttpURLConnection) l_url
.openConnection();
l_connection.connect();
l_urlStream= l_connection.getInputStream();
java.io.BufferedReaderl_reader = new java.io.BufferedReader(
newjava.io.InputStreamReader(l_urlStream));
while((sCurrentLine = l_reader.readLine()) != null) {
sTotalString+= sCurrentLine + "\r\n";
}
StringtestText = extractText(sTotalString);
}catch (Exception e) {
e.printStackTrace();
}
}
/**
* 抽取纯文本信息
* @param inputHtml:html文本
* @return
* @throws Exception
*/
publicstatic String extractText(String inputHtml) throws Exception {
StringBuffertext = new StringBuffer();
Parserparser = Parser.createParser(new String(inputHtml.getBytes(),
"GBK"),"GBK");
//遍历所有的节点
NodeListnodes = parser.extractAllNodesThatMatch(new NodeFilter() {
publicboolean accept(Node node) {
returntrue;
}
});
System.out.println(nodes.size());
for(int i = 0; i < nodes.size(); i++) {
Nodenodet = nodes.elementAt(i);
//字符串的代表性节点:节点的描述
text.append(newString(nodet.toPlainTextString().getBytes("GBK"))
+"\r\n");
}
returntext.toString();
}
/**
 * Parses the content of a file or URL; the resource may be a local file path or a URL.
 * @param resource file path or URL to parse
 * @throws Exception
 */
publicstatic void test5(String resource) throws Exception {
ParsermyParser = new Parser(resource);
myParser.setEncoding("GBK");
StringfilterStr = "table";
NodeFilterfilter = new TagNameFilter(filterStr);
NodeListnodeList = myParser.extractAllNodesThatMatch(filter);
/*for(inti=0;i<nodeList.size();i++)
{
TableTagtabletag = (TableTag) nodeList.elementAt(i);
//标签名称
System.out.println(tabletag.getTagName());
System.out.println(tabletag.getText());
}*/
TableTagtabletag = (TableTag) nodeList.elementAt(1);
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283860
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283861
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283862
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283863
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283864
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283865
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283866
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283867
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283868
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283869
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283870
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283871
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283872
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283873
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283874
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283875
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283876
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283877
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283878
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283879
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283880
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283881
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283882
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283883
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283884
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283885
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283886
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283887
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283888
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283889
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283890
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283891
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283892
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283893
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283894
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283895
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283896
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283897
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283898
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283899
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283900
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283901
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283902
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283903
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283904
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283905
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283906
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283907
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283908
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283909
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283910
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283911
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283912
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283913
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283914
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283915
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283916
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283917
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283918
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283919
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283920
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283921
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283922
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283923
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283924
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283925
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283926
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283927
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283928
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283929
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283930
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283931
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283932
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283933
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283934
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283935
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283936
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283937
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283938
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283939
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283940
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283941
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283942
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283943
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283944
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283945
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283946
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283947
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283948
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283949
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283950
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283951
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283952
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283953
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283954
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283955
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283956
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283957
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283958
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283959
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283960
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283961
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283962
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283963
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283964
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283965
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283966
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283967
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283968
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283969
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283970
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283971
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283972
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283973
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283974
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283975
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283976
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283977
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283978
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283979
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283980
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283981
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283982
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283983
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283984
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283985
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283986
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283987
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283988
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283989
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283990
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283991
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283992
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283993
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283994
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283995
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283996
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283997
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283998
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1283999
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284000
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284001
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284002
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284003
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284004
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284005
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284006
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284007
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284008
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284009
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284010
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284011
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284012
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284013
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284014
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284015
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284016
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284017
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284018
http://www.ituite.com/space.php?uid=1040447&do=poll&pid=1284019
将整个html内容保存到指定文件