前言
Httpclient 3.x和Httpclient 4.x如何设置浏览器Cookie策略?Java HttpClient(Java爬虫)请求网站时,需要设置浏览器的Cookie策略,模拟人工请求页面,得到网站正常、正确返回的数据,从而达到Java爬虫的目的。那么如何设置浏览器Cookie策略,让我们的代码像浏览器一样处理网站的请求和返回的链接呢?可以参考如下代码。
方法1:对HttpClient添加cookie策略 httpClient.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY); 方法2:对Get或者Post方法添加cookie策略 method.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
httpclient 4.x初始化完成前设置Cookie策略的方法如下: DefaultHttpClient httpClient = null ; DefaultHttpClient defaultClient; HttpParams httpParams = new BasicHttpParams(); SchemeRegistry registry = new SchemeRegistry(); ClientConnectionManager connectionManager = new ThreadSafeClientConnManager(httpParams,registry); defaultClient = new DefaultHttpClient(connectionManager, httpParams); defaultClient.getParams().setIntParameter(HttpConnectionParams.SOCKET_BUFFER_SIZE, 20*1024); HttpClientParams.setCookiePolicy(defaultClient.getParams(),CookiePolicy.BROWSER_COMPATIBILITY); httpclient 4.x初始化完成后设置Cookie策略的方法如下: httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY);
代码例子供大家参考:
public class BaiduClick { public static boolean click(String urls, String ip, int port, ClickFrame shuaFenFrame, String webAgent, DefaultHttpClient httpClient){ boolean re = false ; boolean isyanzheng = false ; List<com.alibaba.fastjson.JSONObject> remap = new ArrayList<com.alibaba.fastjson.JSONObject>(); System.out.println("点击URLS----"+urls) ; String errorinfo = "" ; // DefaultHttpClient httpClient = HttpClientUtuils.handleNewHttpClient(120000, 120000) ; if(ip!=null && !ip.equals("")){ HttpHost proxy = new HttpHost(ip,port); httpClient.getParams().setParameter(ConnRouteParams.DEFAULT_PROXY, proxy); } if(httpClient == null){ httpClient = HttpClientUtuils.handleNewHttpClient(120000, 120000) ; } httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY); httpClient.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 200000); httpClient.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, 200000); HttpGet g1 = null; try { g1 = new HttpGet(urls) ; System.out.println(urls); g1.setHeader("Accept", "text/html, application/xhtml+xml, */*") ; g1.setHeader("Accept-Language", "zh-CN") ; g1.setHeader("Proxy-Connection", "Keep-Alive") ; g1.setHeader("User-Agent", webAgent) ; HttpResponse response2 = httpClient.execute(g1); String sg1 = EntityUtils.toString(response2.getEntity(),"utf-8") ; sg1 = StringRandomUtils.unicodeToString(sg1) ; g1 = new HttpGet("http://www.med39.net/") ; System.out.println(urls); g1.setHeader("Accept", "text/html, application/xhtml+xml, */*") ; g1.setHeader("Accept-Language", "zh-CN") ; g1.setHeader("Proxy-Connection", "Keep-Alive") ; g1.setHeader("User-Agent", webAgent) ; response2 = httpClient.execute(g1); //httpClient.executeMethod(g1) ; sg1 = EntityUtils.toString(response2.getEntity(),"utf-8") ; if(response2.getStatusLine().getStatusCode() == 200 ){ System.out.println("点击成功:"+urls); re = true; }else{ System.out.println("点击失败1"); errorinfo = "点击失败,无法访问网站,请换ip" ; } }catch(Exception e){ 
System.out.println(e.toString()); e.printStackTrace() ; }finally{ // if(httpClient !=null){ // httpClient.getConnectionManager().shutdown(); // } } System.gc() ; return re ; } /** * @param args */ public static void main(String[] args) { DefaultHttpClient httpClient = HttpClientUtuils.handleNewHttpClient(12000, 12000) ; String urls = "https://www.baidu.com/link?url=JvBQ0pT3UGS5A9-GzsRa743a0hTYB6bGiH40O8tYehi&wd=&eqid=c78ed8940002f946000000065d6a2e88" ; String webAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" ; BaiduClick.click(urls, "", 0, null, webAgent, httpClient) ; } }
注明: 代码中的网站均为例子,仅供参考,不一定真实有效哦,请勿全部Copy。