前言
Httpclient 3.x和Httpclient 4.x如何设置浏览器Cookie策略?Java HttpClient(Java爬虫)请求网站时候,需要设置浏览器的Cookie策略(注意:代码中设置的是CookiePolicy,即Cookie策略,而非HTTP缓存策略),模拟人工请求页面,得到网站正常正确的返回的数据,从而达到Java爬虫的目的。那么如何设置浏览器Cookie策略,让我们的代码像浏览器一样来处理网站的请求和返回的链接呢?可以参考如下代码。
方法1:对HttpClient添加cookie策略 httpClient.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY); 方法2:对Get或者Post方法添加cookie策略 method.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
httpclient 4.x初始化完成前设置缓存策略的方法如下: DefaultHttpClient httpClient = null ; DefaultHttpClient defaultClient; HttpParams httpParams = new BasicHttpParams(); SchemeRegistry registry = new SchemeRegistry(); ClientConnectionManager connectionManager = new ThreadSafeClientConnManager(httpParams,registry); defaultClient = new DefaultHttpClient(connectionManager, httpParams); defaultClient.getParams().setIntParameter(HttpConnectionParams.SOCKET_BUFFER_SIZE, 20*1024); HttpClientParams.setCookiePolicy(defaultClient.getParams(),CookiePolicy.BROWSER_COMPATIBILITY); httpclient 4.x初始化完成后设置缓存策略的方法如下: httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY);
代码例子供大家参考:
public class BaiduClick {

    /**
     * Simulates a human "click" by fetching {@code urls} and then a fixed
     * follow-up page with browser-like headers, reusing the same client so
     * cookies carry over between the two requests.
     *
     * <p>Fix vs. original: the original configured the proxy on
     * {@code httpClient} <em>before</em> the {@code httpClient == null} check,
     * so a non-empty {@code ip} with a null client threw a
     * {@link NullPointerException}. The null check now runs first.
     *
     * @param urls          the target URL to "click"
     * @param ip            optional proxy host; {@code null} or empty disables the proxy
     * @param port          proxy port (ignored when no proxy is set)
     * @param shuaFenFrame  unused here; kept for interface compatibility with callers
     * @param webAgent      User-Agent header value to send
     * @param httpClient    client to reuse; a new one is created when {@code null}.
     *                      The caller owns the client — it is deliberately not shut down here.
     * @return {@code true} when the second request returned HTTP 200
     */
    public static boolean click(String urls, String ip, int port, ClickFrame shuaFenFrame,
            String webAgent, DefaultHttpClient httpClient) {
        boolean re = false;
        System.out.println("点击URLS----" + urls);

        // Create the client BEFORE configuring it (bug fix: original order NPE'd).
        if (httpClient == null) {
            httpClient = HttpClientUtuils.handleNewHttpClient(120000, 120000);
        }
        if (ip != null && !ip.isEmpty()) {
            HttpHost proxy = new HttpHost(ip, port);
            httpClient.getParams().setParameter(ConnRouteParams.DEFAULT_PROXY, proxy);
        }
        // Browser-compatible cookie policy so the site treats us like a real browser.
        httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY);
        httpClient.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 200000);
        httpClient.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, 200000);

        HttpGet g1 = null;
        try {
            // First request: the target URL itself.
            g1 = new HttpGet(urls);
            System.out.println(urls);
            setBrowserHeaders(g1, webAgent);
            HttpResponse response2 = httpClient.execute(g1);
            // Consume the entity fully so the pooled connection can be reused.
            String sg1 = EntityUtils.toString(response2.getEntity(), "utf-8");
            sg1 = StringRandomUtils.unicodeToString(sg1);

            // Second request: fixed follow-up page (example site from the article),
            // issued on the same client so the session cookies are sent along.
            g1 = new HttpGet("http://www.med39.net/");
            System.out.println(urls);
            setBrowserHeaders(g1, webAgent);
            response2 = httpClient.execute(g1);
            sg1 = EntityUtils.toString(response2.getEntity(), "utf-8");

            if (response2.getStatusLine().getStatusCode() == 200) {
                System.out.println("点击成功:" + urls);
                re = true;
            } else {
                System.out.println("点击失败1");
            }
        } catch (Exception e) {
            // Best-effort operation: log and return false rather than propagate.
            System.out.println(e.toString());
            e.printStackTrace();
        }
        // NOTE: the caller owns httpClient, so we intentionally do not call
        // httpClient.getConnectionManager().shutdown() here.
        return re;
    }

    /** Applies the headers that make a request look like an IE11 browser visit. */
    private static void setBrowserHeaders(HttpGet get, String webAgent) {
        get.setHeader("Accept", "text/html, application/xhtml+xml, */*");
        get.setHeader("Accept-Language", "zh-CN");
        get.setHeader("Proxy-Connection", "Keep-Alive");
        get.setHeader("User-Agent", webAgent);
    }

    /**
     * Example entry point: clicks a sample Baidu redirect URL with no proxy.
     */
    public static void main(String[] args) {
        DefaultHttpClient httpClient = HttpClientUtuils.handleNewHttpClient(12000, 12000);
        String urls = "https://www.baidu.com/link?url=JvBQ0pT3UGS5A9-GzsRa743a0hTYB6bGiH40O8tYehi&wd=&eqid=c78ed8940002f946000000065d6a2e88";
        String webAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko";
        BaiduClick.click(urls, "", 0, null, webAgent, httpClient);
    }
}
注明: 代码中的网站均为例子,仅供参考,不一定真实有效,请勿全部Copy。
【蝴蝶效应-虎】