首页 > 代码库 > 通用httpclient生成方式
通用httpclient生成方式
在做爬虫的时候,如何生成一个靠谱可用的HttpClient对象是非常关键的。在踩了无数的坑之后,总结出一个较为完善的HttpClient生成方式。
可以解决以下问题:
1、设置代理问题
2、设置默认的CookieStore对象,用来保存请求中的cookie,以便进行深层次访问。
3、请求失败时的重试策略问题
4、默认useragent的问题
5、https及自签名证书的验证问题
Java代码
/**
* 新建一个通用httpclientbuider
* 使用代理时,必须一起传入host对象。
* 不传入host对象的时候,代理不会生效
*/
public static HttpClientBuilder getInstanceClientBuilder(boolean isNeedProxy, CookieStore store, HttpHost host, HttpRequestRetryHandler handler, String userAgent) {
org.apache.http.ssl.SSLContextBuilder context_b = SSLContextBuilder.create();
SSLContext ssl_context = null;
try {
context_b.loadTrustMaterial(null, (x509Certificates, s) -> true);
//信任所有证书,解决https证书问题
ssl_context = context_b.build();
} catch (Exception e) {
e.printStackTrace();
}
ConnectionSocketFactory sslSocketFactory = null;
Registry<ConnectionSocketFactory> registry = null;
if (ssl_context != null) {
sslSocketFactory = new SSLConnectionSocketFactory(ssl_context, new String[]{"TLSv1", "TLSv1.1", "TLSv1.2"}, null, (s, sslSession) -> true);
//应用多种tls协议,解决偶尔握手中断问题
registry = RegistryBuilder.<ConnectionSocketFactory>create().register("https", sslSocketFactory).register("http", new PlainConnectionSocketFactory()).build();
} 下载
PoolingHttpClientConnectionManager manager = null;
if (registry != null) {
manager = new PoolingHttpClientConnectionManager(registry);
} else {
manager = new PoolingHttpClientConnectionManager();
}
manager.setMaxTotal(150);
manager.setDefaultMaxPerRoute(200);
HttpClientBuilder builder = HttpClients.custom().setRetryHandler(handler)
.setConnectionTimeToLive(6000, TimeUnit.SECONDS)
.setUserAgent(userAgent);
if (store != null) {
builder.setDefaultCookieStore(store);
}
if (isNeedProxy && host != null) {
// HttpHost proxy = new HttpHost("127.0.0.1", 1080);// 代理ip
DefaultProxyRoutePlanner routePlanner = new DefaultProxyRoutePlanner(host);
builder = builder.setRoutePlanner(routePlanner);
}
builder.setConnectionManager(manager);//httpclient连接池
builder.setRedirectStrategy(new AllowAllRedirectStrategy());//默认重定向所有302和307,否则httpclient只自动处理get请求导致的302和307
return builder;
}
通用httpclient生成方式