Java crawler①HttpClient

Java crawler ②Jsoup
Java crawler ③WebMagic crawling used car webpage case
HttpClient is an open-source Apache project that provides an efficient, up-to-date, feature-rich client-side programming toolkit for the HTTP protocol. It supports the latest versions of the HTTP protocol and its recommendations.

Introduce dependencies

<dependencies>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.2</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.25</version>
        </dependency>
    </dependencies>

Configuration file

log4j.rootLogger=DEBUG, m
log4j.appender.m=org.apache.log4j.ConsoleAppender
log4j.appender.m.layout=org.apache.log4j.PatternLayout
log4j.appender.m.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n

example:

package com.sihi.crawler.test;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;


/**
 * Minimal HttpClient example: sends a single GET request and prints the response body.
 */
public class TestHttpClient {
    /**
     * Requests the course-search page and prints its body decoded as UTF-8.
     *
     * @param args unused
     * @throws IOException if executing the request, reading the body, or closing a resource fails
     */
    public static void main(String[] args) throws IOException {
        // 2. "Type the address": create the request.
        // Alternative without query parameters: new HttpGet("http://www.sikiedu.com")
        HttpGet httpGet = new HttpGet("http://www.sikiedu.com/course/search?categoryId=0&orderBy=recommendedSeq");
        // Send a desktop-browser User-Agent string with the request.
        httpGet.setHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75");

        // 1. "Open the browser": create the HTTP client.
        // 3. Send the request.
        // try-with-resources closes the response first and the client second, even when
        // reading the body throws; the original never closed either of them.
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(httpGet)) {
            // 4. The server answered: read the response entity.
            HttpEntity entity = response.getEntity();

            System.out.println(EntityUtils.toString(entity,"UTF-8"));
        }
    }
}

get request

package com.sihi.crawler.test;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

/**
 * GET request examples: without parameters, with parameters written directly into the URL,
 * and with parameters assembled through {@link URIBuilder}.
 */
public class TestHttpGet {
    /**
     * Runs one of the GET examples; switch the active call to try the others.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // GET without parameters
        //httpGet();
        // GET with parameters in the URL string
        //httpgetByParam1();
        // GET with parameters built by URIBuilder
        httpgetByParam2();
    }

    /** Requests the site root without any query parameters and prints the body. */
    private static void httpGet(){
        executeAndPrint(new HttpGet("http://www.sikiedu.com"));
    }

    /** Requests the course search with the query parameters written directly into the URL. */
    private static void httpgetByParam1(){
        executeAndPrint(new HttpGet("http://www.sikiedu.com/course/search?categoryId=0&orderBy=recommendedSeq"));
    }

    /** Requests the course search with the query parameters assembled by {@link URIBuilder}. */
    private static void httpgetByParam2(){
        URI uri;
        try {
            // Equivalent to http://www.sikiedu.com/course/search?categoryId=0&orderBy=recommendedSeq.
            // setParameters(NameValuePair...) can be used instead to set both parameters at once.
            uri = new URIBuilder()
                    .setScheme("http")
                    .setHost("www.sikiedu.com")
                    .setPath("/course/search")
                    .setParameter("categoryId", "0")
                    .setParameter("orderBy", "recommendedSeq")
                    .build();
        } catch (URISyntaxException e) {
            e.printStackTrace();
            // Without a valid URI there is nothing to request; the original went on
            // to build the request from a null URI.
            return;
        }
        executeAndPrint(new HttpGet(uri));
    }

    /**
     * Executes the request with a fresh default client and prints the body (decoded as
     * UTF-8) when the status code is 200. I/O failures are reported on stderr.
     *
     * @param httpGet the request to execute
     */
    private static void executeAndPrint(HttpGet httpGet){
        // try-with-resources closes the response before the client; the original closed
        // the client first and repeated this whole block in every example method.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            int statusCode = response.getStatusLine().getStatusCode();
            if(200 == statusCode){
                HttpEntity entity = response.getEntity();
                System.out.println(EntityUtils.toString(entity,"UTF-8"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

post request

Add Alibaba's fastjson dependency for JSON parsing

<!-- https://mvnrepository.com/artifact/com.alibaba/fastjson -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.68</version>
        </dependency>

package com.sihi.crawler.test;

import com.alibaba.fastjson.JSONObject;
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * POST request example: posts to the k780 "ip.local" API and prints the IP address
 * found in the JSON response.
 */
public class TestHttpPost {
    /**
     * Builds http://api.k780.com/?app=ip.local&amp;format=json, posts to it and prints the
     * {@code result.ip} field of the JSON answer.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<NameValuePair> nameValuePairs = new ArrayList<NameValuePair>();
        nameValuePairs.add(new BasicNameValuePair("app","ip.local"));
        nameValuePairs.add(new BasicNameValuePair("format","json"));

        // http://api.k780.com/?app=ip.local&format=json
        URI uri;
        try {
            uri = new URIBuilder().setScheme("http").setHost("api.k780.com").setParameters(nameValuePairs).build();
        } catch (URISyntaxException e) {
            e.printStackTrace();
            // Without a valid URI there is nothing to request; the original went on
            // to build the request from a null URI.
            return;
        }
        HttpPost httpPost = new HttpPost(uri);

        // try-with-resources closes the response before the client; the original
        // closed the client first.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpPost)) {
            int statusCode = response.getStatusLine().getStatusCode();
            if (200 != statusCode){
                return;
            }
            HttpEntity entity = response.getEntity();
            if (entity == null){
                return;
            }
            String json = EntityUtils.toString(entity,"UTF-8");

            // Expected shape:
            // {"success":"1","result":{"ip":"113.101.45.58","proxy":"1","att":"...","operators":"..."}}
            // The instanceof checks replace the original unchecked casts, so a response
            // without "result" or "ip" no longer ends in a NullPointerException or
            // ClassCastException.
            Object root = JSONObject.parse(json);
            Object result = root instanceof Map ? ((Map<?, ?>) root).get("result") : null;
            Object ip = result instanceof Map ? ((Map<?, ?>) result).get("ip") : null;
            if (ip == null){
                System.err.println("No result.ip field in response: " + json);
                return;
            }
            System.out.println(ip.toString());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

HttpClient connection pool

HttpClient proxy

If a crawler keeps visiting the same server from the same IP address, that address is likely to be banned. To avoid this, send the requests through a proxy IP address found on the Internet and read the response through it.
https://www.xicidaili.com/nn 西刺Proxy

package com.sihi.crawler.test;


import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Connection-pool and proxy example: builds a client backed by a pooling connection
 * manager, routes the request through a proxy and prints the response body.
 */
public class TestHttpPool {
    /**
     * Configures the pool and the request defaults, then fetches the site root.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Connection-pool settings.
        PoolingHttpClientConnectionManager poolManger = new PoolingHttpClientConnectionManager();
        poolManger.setMaxTotal(100);           // maximum number of connections in total
        poolManger.setDefaultMaxPerRoute(20);  // maximum number of connections per route

        // Settings applied to requests executed by the client.
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(5000)            // timeout for establishing the connection (ms)
                .setSocketTimeout(2000)             // timeout while waiting for response data (ms)
                .setConnectionRequestTimeout(500)   // timeout for obtaining a connection from the pool (ms)
                .setProxy(new HttpHost("114.105.103.142",4216))  // proxy IP address and port
                .build();

        // 2. Create the GET request.
        HttpGet httpGet = new HttpGet("http://www.sikiedu.com");

        // 1. Build the client on top of the pool and the request defaults.
        // 3. Execute the request.
        // try-with-resources closes the response before the client; the original
        // closed the client first.
        try (CloseableHttpClient httpClient = HttpClients.custom()
                     .setConnectionManager(poolManger)
                     .setDefaultRequestConfig(config)
                     .build();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            int statusCode = response.getStatusLine().getStatusCode();
            if(200 == statusCode){
                HttpEntity entity = response.getEntity();
                System.out.println(EntityUtils.toString(entity,"UTF-8"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

Encapsulate the operations above in a utility class
HttpClientUtil

package com.sihi.crawler.test;

import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Helper for simple GET and POST requests executed through one shared, pooled HTTP client.
 *
 * <p>The pool, the request configuration and the client are created once during class
 * initialization. The original created them lazily with an unsynchronized null check and
 * built a new, never-closed client on every call.
 */
public class HttpClientUtil {

    /** Connection pool backing the shared client. */
    private static final PoolingHttpClientConnectionManager poolManager = createPoolManager();
    /** Default request settings applied by the shared client. */
    private static final RequestConfig config = createRequestConfig();
    /** The single client instance handed out by {@link #getHttpClient()}. */
    private static final CloseableHttpClient httpClient = HttpClients.custom()
            .setConnectionManager(poolManager)
            .setDefaultRequestConfig(config)
            .build();

    /** Creates the connection pool with its size limits. */
    private static PoolingHttpClientConnectionManager createPoolManager(){
        PoolingHttpClientConnectionManager manager = new PoolingHttpClientConnectionManager();
        manager.setMaxTotal(100);           // maximum number of connections in total
        manager.setDefaultMaxPerRoute(20);  // maximum number of connections per route
        return manager;
    }

    /** Creates the default request configuration (timeouts; the proxy is disabled). */
    private static RequestConfig createRequestConfig(){
        return RequestConfig.custom()
                .setConnectTimeout(5000)            // timeout for establishing the connection (ms)
                .setSocketTimeout(2000)             // timeout while waiting for response data (ms)
                .setConnectionRequestTimeout(500)   // timeout for obtaining a connection from the pool (ms)
                // To route through a proxy: .setProxy(new HttpHost("114.105.103.142",4216))
                .build();
    }

    /**
     * Returns the shared client.
     *
     * <p>Callers must not close the returned client: {@link #doGet(String)} and
     * {@link #doPost(String)} use the same instance.
     *
     * @return the shared pooled client
     */
    public static CloseableHttpClient getHttpClient(){
        return httpClient;
    }

    /**
     * Executes a GET request and returns the response body.
     *
     * @param url the address to request
     * @return the body decoded as UTF-8, or an empty string when the status code is not 200,
     *         the response has no entity, or an I/O error occurs
     */
    public static String doGet(String url){
        // The response is closed by try-with-resources; the shared client stays open.
        try (CloseableHttpResponse response = getHttpClient().execute(new HttpGet(url))) {
            return readBody(response);
        } catch (IOException e) {
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Executes a POST request without a request body and returns the response body.
     *
     * @param url the address to request
     * @return the body decoded as UTF-8, or an empty string when the status code is not 200,
     *         the response has no entity, or an I/O error occurs
     */
    public static String doPost(String url){
        // The response is closed by try-with-resources; the shared client stays open.
        try (CloseableHttpResponse response = getHttpClient().execute(new HttpPost(url))) {
            return readBody(response);
        } catch (IOException e) {
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Reads the body of a response whose status code is 200.
     *
     * @param response the response to read
     * @return the body decoded as UTF-8, or an empty string for any other status code or a
     *         missing entity
     * @throws IOException if reading the body fails
     */
    private static String readBody(CloseableHttpResponse response) throws IOException {
        if(200 != response.getStatusLine().getStatusCode()){
            return "";
        }
        HttpEntity entity = response.getEntity();
        // Guard against a response without an entity instead of passing null on.
        return entity == null ? "" : EntityUtils.toString(entity,"UTF-8");
    }
}

Guess you like

Origin blog.csdn.net/weixin_46083166/article/details/112410236