Fetching Web Page Content
The previous post covered how to detect a web page's character encoding. Once we have the encoding, we open an input stream with that encoding and read the page's content into a String variable, as in the code below.
package Spider;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.URL;

import org.junit.Test;

public class Capture {

    // A JUnit 4 @Test method must return void, so the fetch logic lives in k()
    // and this wrapper only drives it.
    @Test
    public void testCapture() throws Exception {
        k();
    }

    // Fetches the Baidu home page and returns its HTML as a String.
    public String k() throws Exception {
        URL url = new URL("http://www.baidu.com");
        // Detect the page's charset first (see the previous post)
        String charset = i(url);
        if (charset == null) {
            // Fall back to UTF-8 if the Content-Type header carries no charset
            charset = "UTF-8";
        }
        StringBuffer codeBuffer = new StringBuffer();
        BufferedReader in = null;
        try {
            // Configure the HTTP proxy (specific to the author's network; replace it with
            // your own, or call url.openConnection() with no proxy for a direct connection)
            Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("proxy3.bj.petrochina", 8080));
            // Open the connection
            HttpURLConnection urlcon = (HttpURLConnection) url.openConnection(proxy);
            urlcon.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");
            InputStream is = urlcon.getInputStream();
            in = new BufferedReader(new InputStreamReader(is, charset));
            String tmpCode = null;
            // Read the stream line by line and append each line to the buffer
            while ((tmpCode = in.readLine()) != null) {
                codeBuffer.append(tmpCode).append("\n");
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (in != null) {
                in.close();
            }
        }
        String temp = codeBuffer.toString();
        System.out.println(temp);
        return temp;
    }

    // Returns the charset declared in the response's Content-Type header,
    // or null if the header does not carry one.
    public String i(URL url) throws Exception {
        // Same proxy settings as above
        Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("proxy3.bj.petrochina", 8080));
        // Open the connection
        HttpURLConnection urlcon = (HttpURLConnection) url.openConnection(proxy);
        urlcon.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");
        String charset = null;
        // A typical header looks like "text/html;charset=utf-8"; extract the charset parameter
        String contentType = urlcon.getHeaderField("Content-Type");
        for (String param : contentType.replace(" ", "").split(";")) {
            if (param.startsWith("charset=")) {
                charset = param.split("=", 2)[1];
                break;
            }
        }
        return charset;
    }
}
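If you want to run the fetch outside of JUnit, a minimal driver sketch like the one below will do (my addition, not part of the original post; the class name CaptureMain is hypothetical). It simply calls k() and reports how much content was fetched, and it still goes through whatever proxy k() configures internally, so adjust or remove that proxy for your own network.
package Spider;

// Minimal driver sketch: runs the same fetch from a plain main method.
public class CaptureMain {
    public static void main(String[] args) throws Exception {
        // Fetch http://www.baidu.com via Capture.k() and report the size
        String html = new Capture().k();
        System.out.println("Fetched " + html.length() + " characters");
    }
}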