java爬虫之基于httpclient的简单Demo(二)

转载自 java爬虫之基于httpclient的简单Demo(二)

延续demo1的 java爬虫的2种爬取方式(HTTP||Socket)简单Demo(一),demo2出炉啦,大家想学爬虫都可以从这个网盘学习哦:https://pan.baidu.com/s/1pJJrcqJ#list/path=%2F

免费课程,非常不错。其实还是主要学习一个httpclient,httpclient全是英文文档,看得我心累啊

package com.simple.crawImpl;  
  
import com.simple.Icrawl.ICrawl;  
import com.simple.pojos.CrawlResultPojo;  
import com.simple.pojos.UrlPojo;  
import org.apache.http.HttpEntity;  
import org.apache.http.ParseException;  
import org.apache.http.client.ClientProtocolException;  
import org.apache.http.client.methods.CloseableHttpResponse;  
import org.apache.http.client.methods.HttpGet;  
import org.apache.http.client.methods.HttpUriRequest;  
import org.apache.http.client.methods.RequestBuilder;  
import org.apache.http.impl.client.CloseableHttpClient;  
import org.apache.http.impl.client.HttpClients;  
  
import java.io.BufferedReader;  
import java.io.IOException;  
import java.io.InputStreamReader;  
import java.net.URI;  
import java.net.URISyntaxException;  
import java.nio.charset.StandardCharsets;  
import java.util.HashMap;  
import java.util.Map;  
import java.util.Map.Entry;  
  
/** 
 * 
 * Created by lewis on 2016/10/16. 
 */  
/**
 * {@link ICrawl} implementation backed by Apache HttpClient 4.x.
 *
 * <p>Supports plain GET crawling ({@link #crawl(UrlPojo)}) and form-parameter
 * POST crawling ({@link #crawl4Post(UrlPojo)}). Responses are read as UTF-8
 * text and returned inside a {@link CrawlResultPojo}.
 */
public class HttpClientCrawlerImpl implements ICrawl {

    /** Shared client instance. Kept public for backward compatibility with existing callers. */
    public CloseableHttpClient httpClient = HttpClients.custom().build();

    /**
     * Fetches the page at {@code urlpojo.getUrl()} with an HTTP GET request.
     *
     * @param urlpojo wrapper holding the target URL; may be {@code null}
     * @return a {@link CrawlResultPojo} whose success flag and page content reflect
     *         the outcome, or {@code null} when {@code urlpojo} or its URL is {@code null}
     */
    @Override
    public CrawlResultPojo crawl(UrlPojo urlpojo) {
        if (urlpojo == null || urlpojo.getUrl() == null) {
            return null;
        }
        CrawlResultPojo crawlResultPojo = new CrawlResultPojo();
        HttpGet httpGet = new HttpGet(urlpojo.getUrl());
        // try-with-resources guarantees the response and reader are closed on every path,
        // replacing the original manual finally block.
        try (CloseableHttpResponse response = httpClient.execute(httpGet);
             BufferedReader br = new BufferedReader(
                     new InputStreamReader(response.getEntity().getContent(), StandardCharsets.UTF_8))) {
            crawlResultPojo.setPageContent(readAll(br));
            crawlResultPojo.setSuccess(true);
        } catch (IOException e) {
            e.printStackTrace();
            crawlResultPojo.setSuccess(false);
        }
        return crawlResultPojo;
    }

    /**
     * Fetches {@code urlPojo.getUrl()} with an HTTP POST request, sending the
     * entries of {@code urlPojo.getParasMap()} as request parameters.
     *
     * @param urlPojo wrapper holding the target URL and optional parameter map; may be {@code null}
     * @return the crawl result, or {@code null} when {@code urlPojo} or its URL is {@code null}
     */
    public CrawlResultPojo crawl4Post(UrlPojo urlPojo) {
        if (urlPojo == null || urlPojo.getUrl() == null) {
            return null;
        }
        CrawlResultPojo crawlResultPojo = new CrawlResultPojo();
        try {
            RequestBuilder rb = RequestBuilder.post().setUri(new URI(urlPojo.getUrl()));
            Map<String, Object> parasMap = urlPojo.getParasMap();
            if (parasMap != null) {
                for (Entry<String, Object> entry : parasMap.entrySet()) {
                    rb.addParameter(entry.getKey(), entry.getValue().toString());
                }
            }
            HttpUriRequest httpUriRequest = rb.build();
            // BUG FIX: the original never closed this response, leaking the underlying
            // connection back into the pool only on GC. try-with-resources closes it.
            try (CloseableHttpResponse response = httpClient.execute(httpUriRequest);
                 BufferedReader br = new BufferedReader(
                         new InputStreamReader(response.getEntity().getContent(), StandardCharsets.UTF_8))) {
                crawlResultPojo.setPageContent(readAll(br));
                crawlResultPojo.setSuccess(true);
                return crawlResultPojo;
            }
        } catch (URISyntaxException | IOException e) {
            // ClientProtocolException is a subclass of IOException, so the original's
            // three separate catch blocks collapse into this multi-catch.
            e.printStackTrace();
        }
        crawlResultPojo.setSuccess(false);
        return crawlResultPojo;
    }

    /**
     * Reads the reader to exhaustion, normalizing line terminators to {@code '\n'}
     * (same behavior as the original readLine loops in both crawl methods).
     */
    private static String readAll(BufferedReader br) throws IOException {
        StringBuilder content = new StringBuilder();
        String line;
        while ((line = br.readLine()) != null) {
            content.append(line).append('\n');
        }
        return content.toString();
    }

    /** Demo entry point: POSTs then GETs a sample URL and prints both results. */
    public static void main(String[] args) {

        HttpClientCrawlerImpl httpClientCrawlerImpl = new HttpClientCrawlerImpl();
        String url = "http://www.wangdaizhijia.com/front_select-plat";
        UrlPojo urlPojo = new UrlPojo(url);
        Map<String, Object> parasMap = new HashMap<String, Object>();

        // Removed unused local max_page_number from the original.
        parasMap.put("currPage", 30);
        parasMap.put("params", "");
        parasMap.put("sort", 0);
        urlPojo.setParasMap(parasMap);

        CrawlResultPojo resultPojo = httpClientCrawlerImpl.crawl4Post(urlPojo);
        print(resultPojo);
        resultPojo = httpClientCrawlerImpl.crawl(urlPojo);
        print(resultPojo);
    }

    /** Convenience println wrapper used by {@link #main(String[])}. */
    public static void print(Object s) {
        System.out.println(s);
    }

}

猜你喜欢

转载自blog.csdn.net/moakun/article/details/80554177