转载自 java爬虫之基于httpclient的简单Demo(二)
延续 demo1《java爬虫的2种爬取方式(HTTP||Socket)简单Demo(一)》,demo2 出炉啦。大家想学爬虫都可以从这个网盘学习哦:https://pan.baidu.com/s/1pJJrcqJ#list/path=%2F
免费课程,非常不错。其实主要还是学习 httpclient,不过 httpclient 的文档全是英文,看得我心累啊
package com.simple.crawImpl; import com.simple.Icrawl.ICrawl; import com.simple.pojos.CrawlResultPojo; import com.simple.pojos.UrlPojo; import org.apache.http.HttpEntity; import org.apache.http.ParseException; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.client.methods.RequestBuilder; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.net.URI; import java.net.URISyntaxException; import java.util.HashMap; import java.util.Map; import java.util.Map.Entry; /** * * Created by lewis on 2016/10/16. */ public class HttpClientCrawlerImpl implements ICrawl{ public CloseableHttpClient httpClient = HttpClients.custom().build(); //创建定制HttpClient @Override public CrawlResultPojo crawl(UrlPojo urlpojo) { if(urlpojo==null){ return null; } CrawlResultPojo crawlResultPojo = new CrawlResultPojo(); //结果集 CloseableHttpResponse response = null; //HTTP返回的各种信息集合,包含协议http标准,httpcode状态码 BufferedReader br = null; // try { HttpGet httpGet = new HttpGet(urlpojo.getUrl()); response = httpClient.execute(httpGet); HttpEntity entity = response.getEntity(); //获取输入流 InputStreamReader isr = new InputStreamReader(entity.getContent(),"utf-8"); //字节流转化为字符流,设置编码 br =new BufferedReader(isr); String line =null; StringBuilder context = new StringBuilder(); while((line=br.readLine())!=null){ context.append(line+"\n"); } crawlResultPojo.setSuccess(true); crawlResultPojo.setPageContent(context.toString()); return crawlResultPojo; } catch (IOException e) { e.printStackTrace(); crawlResultPojo.setSuccess(false); }finally { try { if (br!=null) br.close(); //关闭流 if(response!=null) response.close(); } catch (IOException e) { e.printStackTrace(); } } return 
crawlResultPojo; } /** * 带参数post的urlpojo * */ public CrawlResultPojo crawl4Post(UrlPojo urlPojo){ if(urlPojo==null||urlPojo.getUrl()==null){ return null; } CrawlResultPojo crawlResultPojo = new CrawlResultPojo(); BufferedReader br= null; try { RequestBuilder rb = RequestBuilder.post().setUri(new URI(urlPojo.getUrl())); Map<String,Object> parasMap = urlPojo.getParasMap() ; if(parasMap!=null){ for(Entry<String,Object> entry:parasMap.entrySet()){ rb.addParameter(entry.getKey(),entry.getValue().toString()); } } HttpUriRequest httpUriRequest = rb.build(); HttpEntity entity =httpClient.execute(httpUriRequest).getEntity(); InputStreamReader isr=new InputStreamReader(entity.getContent(),"utf-8"); br = new BufferedReader(isr); String line = null; StringBuilder stringBuilder = new StringBuilder(); while((line=br.readLine())!=null){ stringBuilder.append(line+"\n"); } crawlResultPojo.setPageContent(stringBuilder.toString()); crawlResultPojo.setSuccess(true); return crawlResultPojo; } catch (URISyntaxException e) { e.printStackTrace(); } catch (ClientProtocolException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { try { if(br!=null) br.close(); } catch (IOException e) { e.printStackTrace(); } } crawlResultPojo.setSuccess(false); return crawlResultPojo; } public static void main(String []args){ HttpClientCrawlerImpl httpClientCrawlerImpl = new HttpClientCrawlerImpl(); String url = "http://www.wangdaizhijia.com/front_select-plat"; UrlPojo urlPojo = new UrlPojo(url); Map<String, Object> parasMap = new HashMap<String, Object>(); int max_page_number = 1000; parasMap.put("currPage", 30); parasMap.put("params", ""); parasMap.put("sort", 0); urlPojo.setParasMap(parasMap); CrawlResultPojo resultPojo = httpClientCrawlerImpl.crawl4Post(urlPojo); print(resultPojo); resultPojo=httpClientCrawlerImpl.crawl(urlPojo); print(resultPojo); } public static void print(Object s){ System.out.println(s); } }