Actual Combat: Building Java Crawlers by Hand - Based on JDK11 Native HttpClient (3)

Table of contents

Homestead

request configuration wrapper

 request tool package

response package


The first two articles set up the basic environment and the basic tools; this chapter finally gets to the substantive content.

Homestead

The essence of a crawler is still a "request" — I think everyone here will agree with that. There are only a few common types of network request, such as Socket requests, Http requests, FTP requests... If we want to crawl web pages, we need an Http request tool.

At the beginning of this series we mentioned Apache's HttpClient, which probably has more tutorials on the Internet than any other such tool. The JDK has in fact always shipped its own Http request tools, but they were widely criticized until JDK 11 arrived. Since then the JDK's Http request tool (HttpClient) has been close to perfect (convenient and easy to use), which is why this article exists (otherwise I would be using Apache's as well...).

To realize the JDK's HttpClient encapsulation, we have to specify the encapsulation target:

1. Request configuration encapsulation

2. Request tool package

3. Response encapsulation

In fact, most Http request tools wrap exactly these three things, so I won't pretend otherwise here — let's go straight to the code.

request configuration wrapper

The purpose of encapsulating the request configuration is to make the tool as simple to use as possible, so we need to clarify which parameters are required, which are optional, and which need default values. To cover various situations there will sometimes be overloaded and overridden methods. We may not need some features right now but may need them later, so don't be afraid of encapsulating — the more thorough the encapsulation, the better. HttpConfig exists to unify configuration initialization, and to simplify initialization of the tool even further, HttpConfig uses the builder pattern to construct the Http request tool.

HttpConfig.java

package com.vtarj.pythagoras.explore;

import javax.net.ssl.SSLContext;
import java.net.*;
import java.net.http.HttpClient;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.StringJoiner;
import java.util.concurrent.Executor;
import java.util.concurrent.Executors;

/**
 * Configuration tool for the {@code HttpExplore} probe.
 *
 * <p>Collects client-level settings (HTTP version, redirect policy, timeout,
 * executor, authenticator, proxy, cookies, SSL, priority) and request-level
 * settings (headers, parameters, method, URI, charsets) through chainable
 * setters. {@link #build()} then creates the {@code HttpExplore}.
 *
 * <p>Instances are mutable and perform no synchronization of their own.
 *
 * <p>Created 2022/4/1 9:46.
 *
 * @author Vtarj
 */
public class HttpConfig {

    /** Default Content-Type header value. */
    private static final String DEFAULT_CONTEXT_TYPE = "application/json";
    /** Request methods accepted by {@link #setRequestMethod(String)}. */
    private static final String[] SUPPORTED_METHODS = {"GET", "POST", "PUT", "DELETE"};
    /** Number of threads in the lazily created default executor. */
    private static final int DEFAULT_POOL_SIZE = 5;

    /** HTTP version, HTTP_2 by default. */
    private HttpClient.Version version;
    /** Redirect policy. */
    private HttpClient.Redirect redirect;
    /** Connection timeout, two minutes by default. */
    private Duration connectTimeout;
    /** Executor; a pool of five threads is created on first access if none was set. */
    private Executor executor;
    /** Authentication information. */
    private Authenticator authenticator;
    /** Proxy information. */
    private ProxySelector proxySelector;
    /** Cookie handling. */
    private CookieHandler cookieHandler;
    /** SSL connection information. */
    private SSLContext sslContext;
    /** Default priority for HTTP/2 requests sent by the client, range 1~256. */
    private int priority;

    /** Request headers; never null. */
    private final Map<String,String> headerMap = new HashMap<>();
    /** Request parameters; null until a parameter is set. */
    private Map<String,Object> requestParams;
    /** Request method. */
    private String requestMethod;
    /** URI exactly as configured through {@link #setRequestURI(String)}, without appended parameters. */
    private URI baseURI;
    /** URI used for the request; after {@link #build()} it may carry the GET parameters. */
    private URI requestURI;

    /** Charset used to encode the request body. */
    private Charset reqCode = StandardCharsets.UTF_8;
    /** Charset used to decode the response body. */
    private Charset resCode = StandardCharsets.UTF_8;
    /** Configuration lock, used to decide whether only the first configuration takes effect. */
    private boolean locked = true;

    /**
     * Creates a configuration initialized with the default values.
     */
    public HttpConfig() {
        version = HttpClient.Version.HTTP_2;
        redirect = HttpClient.Redirect.NORMAL;
        connectTimeout = Duration.ofMinutes(2);
        // Defaults are written directly so that no overridable method runs during construction.
        headerMap.put("Content-Type", DEFAULT_CONTEXT_TYPE);
        headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.55");
        requestMethod = "GET";
        priority = 1;
    }

    public HttpClient.Version getVersion() {
        return version;
    }

    public HttpConfig setVersion(HttpClient.Version version) {
        this.version = version;
        return this;
    }

    public HttpClient.Redirect getRedirect() {
        return redirect;
    }

    public HttpConfig setRedirect(HttpClient.Redirect redirect) {
        this.redirect = redirect;
        return this;
    }

    public Duration getConnectTimeout() {
        return connectTimeout;
    }

    public HttpConfig setConnectTimeout(Duration connectTimeout) {
        this.connectTimeout = connectTimeout;
        return this;
    }

    /**
     * Returns the configured executor, creating a default fixed pool on first access.
     *
     * @return the executor, never null
     */
    public Executor getExecutor() {
        if (executor == null) {
            // Daemon threads: idle pool workers must not keep the JVM alive once the crawl is done.
            executor = Executors.newFixedThreadPool(DEFAULT_POOL_SIZE, task -> {
                Thread worker = new Thread(task);
                worker.setDaemon(true);
                return worker;
            });
        }
        return executor;
    }

    public HttpConfig setExecutor(Executor executor) {
        this.executor = executor;
        return this;
    }

    public Authenticator getAuthenticator() {
        return authenticator;
    }

    public HttpConfig setAuthenticator(Authenticator authenticator) {
        this.authenticator = authenticator;
        return this;
    }

    public ProxySelector getProxySelector() {
        return proxySelector;
    }

    public HttpConfig setProxySelector(ProxySelector proxySelector) {
        this.proxySelector = proxySelector;
        return this;
    }

    public CookieHandler getCookieHandler() {
        return cookieHandler;
    }

    public HttpConfig setCookieHandler(CookieHandler cookieHandler) {
        this.cookieHandler = cookieHandler;
        return this;
    }

    public SSLContext getSslContext() {
        return sslContext;
    }

    public HttpConfig setSslContext(SSLContext sslContext) {
        this.sslContext = sslContext;
        return this;
    }

    public int getPriority() {
        return priority;
    }

    /**
     * Sets the default HTTP/2 request priority.
     *
     * @param priority priority in the range 1~256
     * @return this configuration, for chaining
     * @throws IllegalArgumentException if the priority is outside 1~256
     */
    public HttpConfig setPriority(int priority) {
        if(priority < 1 || priority > 256){
            throw new IllegalArgumentException("您输入的优先级不合法,优先级范围[1~256]");
        }
        this.priority = priority;
        return this;
    }

    /**
     * Returns the live header map; changes made to it affect this configuration.
     *
     * @return the header map, never null
     */
    public Map<String, String> getHeaderMap() {
        return headerMap;
    }

    /**
     * Sets a single header, replacing any previous value for the same key.
     *
     * @param key   header name
     * @param value header value
     * @return this configuration, for chaining
     */
    public HttpConfig setHeader(String key,String value){
        headerMap.put(key,value);
        return this;
    }

    /**
     * Merges the given headers into the existing ones. Existing headers that are
     * not mentioned in the argument are kept, so defaults are not lost.
     *
     * @param headerMap headers to merge; null is treated as empty
     * @return this configuration, for chaining
     */
    public HttpConfig setHeaderMap(Map<String, String> headerMap) {
        if (headerMap != null) {
            this.headerMap.putAll(headerMap);
        }
        return this;
    }

    /**
     * Returns the request parameters.
     *
     * @return the parameter map, or null if no parameter has been set
     */
    public Map<String, Object> getRequestParams() {
        return requestParams;
    }

    /**
     * Sets a single request parameter.
     *
     * @param key   parameter name
     * @param value parameter value
     * @return this configuration, for chaining
     */
    public HttpConfig setRequestParam(String key,String value) {
        if (requestParams == null){
            requestParams = new HashMap<>();
        }
        requestParams.put(key,value);
        return this;
    }

    /**
     * Replaces all request parameters with a copy of the given map.
     *
     * @param requestParams new parameters; null clears them
     * @return this configuration, for chaining
     */
    public HttpConfig setRequestParams(Map<String, Object> requestParams) {
        // Copy so that later setRequestParam() calls neither mutate the caller's map
        // nor fail when the caller passed an immutable one.
        this.requestParams = requestParams == null ? null : new HashMap<>(requestParams);
        return this;
    }

    public String getRequestMethod() {
        return requestMethod;
    }

    /**
     * Sets the request method.
     *
     * @param requestMethod request method, one of "GET","POST","PUT","DELETE" (case-insensitive)
     * @return this configuration, for chaining
     * @throws IllegalArgumentException if the method is null or not supported
     */
    public HttpConfig setRequestMethod(String requestMethod) {
        String normalized = requestMethod == null ? null : requestMethod.toUpperCase();
        if(normalized == null || !Arrays.asList(SUPPORTED_METHODS).contains(normalized)){
            throw new IllegalArgumentException("请求方法设置错误,不符合规范要求");
        }
        this.requestMethod = normalized;
        return this;
    }

    public URI getRequestURI() {
        return requestURI;
    }

    /**
     * Sets the request address. A missing scheme is completed with {@code http://}.
     *
     * @param requestURI request address
     * @return this configuration, for chaining
     * @throws IllegalArgumentException if the address is null or not a valid URI
     */
    public HttpConfig setRequestURI(String requestURI) {
        URI parsed = formatURI(requestURI);
        this.baseURI = parsed;
        this.requestURI = parsed;
        return this;
    }

    /**
     * Formats a URI string, completing a missing scheme.
     *
     * @param uri URI string to format
     * @return the parsed URI
     * @throws IllegalArgumentException if the string is null or not a valid URI
     */
    private URI formatURI(String uri) {
        if (uri == null) {
            throw new IllegalArgumentException("requestURI must not be null");
        }
        boolean hasScheme = uri.regionMatches(true, 0, "http://", 0, 7)
                || uri.regionMatches(true, 0, "https://", 0, 8);
        String completed = hasScheme ? uri : "http://" + uri;
        try {
            return new URI(completed);
        } catch (URISyntaxException e) {
            throw new IllegalArgumentException(e);
        }
    }

    public Charset getReqCode() {
        return reqCode;
    }

    public HttpConfig setReqCode(Charset reqCode) {
        this.reqCode = reqCode;
        return this;
    }

    public Charset getResCode() {
        return resCode;
    }

    public HttpConfig setResCode(Charset resCode) {
        this.resCode = resCode;
        return this;
    }

    public boolean isLocked() {
        return locked;
    }

    public HttpConfig setLocked(boolean locked) {
        this.locked = locked;
        return this;
    }

    /**
     * Builds the HttpExplore for this configuration.
     *
     * <p>For GET requests the parameters are appended to the configured URI as a
     * query string. The query string is always derived from the URI as it was
     * passed to {@link #setRequestURI(String)}, so calling this method several
     * times does not append the parameters more than once.
     *
     * @return an HttpExplore initialized with this configuration
     * @throws IllegalStateException if GET parameters are present but no URI was set
     */
    public HttpExplore build(){
        URI effective = baseURI;
        String paramsStr = paramsToString();
        if (paramsStr != null && "GET".equals(requestMethod)){
            if (baseURI == null) {
                throw new IllegalStateException("requestURI must be set before build()");
            }
            String base = baseURI.toString();
            String separator = base.indexOf('?') >= 0 ? "&" : "?";
            effective = formatURI(base + separator + paramsStr);
        }
        requestURI = effective;
        return new HttpExplore(this);
    }

    /**
     * Joins the request parameters as {@code key=value} pairs separated by {@code &}.
     *
     * <p>NOTE: keys and values are concatenated as they are, without URL encoding;
     * callers have to pass values that are already safe for a URI.
     *
     * @return the joined parameters, or null if there are none
     */
    protected String paramsToString(){
        if (requestParams == null || requestParams.isEmpty()){
            return null;
        }
        StringJoiner sj = new StringJoiner("&");
        // A null value is rendered as an empty value instead of failing.
        requestParams.forEach((k,v) -> sj.add(k + "=" + (v == null ? "" : v.toString())));
        return sj.toString();
    }

}

 request tool package

The configuration is complete, so now we wrap the tool itself. We can cover several common cases, such as GET, POST, PUT and DELETE requests, and return the result either as a string or as a file. These two result types are enough for now (add more yourself if you need them later).

HttpExplore.java

package com.vtarj.pythagoras.explore;

import java.io.File;
import java.io.IOException;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.file.Path;
import java.time.Instant;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;

/**
 * Http probe: sends the request described by an {@link HttpConfig} through a
 * JDK {@link HttpClient} and wraps the outcome in an {@link HttpResult}.
 *
 * <p>The {@code HttpClient} is held in a static field and shared by all
 * instances. It is built by the first instance, and rebuilt whenever an
 * instance is created with a configuration whose {@code isLocked()} is false.
 *
 * <p>Created 2022/4/1 9:42.
 *
 * @author Vtarj
 */
public class HttpExplore {

    /** Shared client, managed in one place to save resources. */
    private static volatile HttpClient client;
    private final HttpConfig config;

    /**
     * Creates a probe for the given configuration and makes sure the shared client exists.
     *
     * @param config request and client configuration
     * @throws NullPointerException if {@code config} is null
     */
    public HttpExplore(HttpConfig config){
        if (config == null) {
            throw new NullPointerException("config");
        }
        this.config = config;
        if (client == null || !config.isLocked()){
            synchronized (HttpExplore.class){
                // Re-check under the lock: another thread may have built the client
                // between the check above and acquiring the lock.
                if (client == null || !config.isLocked()){
                    client = newClient(config);
                }
            }
        }
    }

    /**
     * Builds an HttpClient from the client-level settings of the configuration.
     */
    private static HttpClient newClient(HttpConfig config) {
        // Settings that always have a value
        HttpClient.Builder builder = HttpClient.newBuilder();
        builder.version(config.getVersion());
        builder.followRedirects(config.getRedirect());
        builder.connectTimeout(config.getConnectTimeout());
        builder.priority(config.getPriority());
        // Optional settings, applied only when configured
        Optional.ofNullable(config.getExecutor()).ifPresent(builder::executor);
        Optional.ofNullable(config.getAuthenticator()).ifPresent(builder::authenticator);
        Optional.ofNullable(config.getCookieHandler()).ifPresent(builder::cookieHandler);
        Optional.ofNullable(config.getProxySelector()).ifPresent(builder::proxy);
        Optional.ofNullable(config.getSslContext()).ifPresent(builder::sslContext);
        return builder.build();
    }


    /**
     * Builds the request from the request-level settings of the configuration.
     */
    private HttpRequest buildRequest() {
        HttpRequest.Builder builder = HttpRequest.newBuilder();
        builder.uri(config.getRequestURI());
        // The configured connect timeout is also used as the request timeout.
        builder.timeout(config.getConnectTimeout());
        String[] headers = buildHeader();
        // HttpRequest.Builder.headers rejects an empty array, so skip it when there are no headers.
        if (headers.length > 0) {
            builder.headers(headers);
        }
        builder.version(config.getVersion());
        builder.method(config.getRequestMethod(),buildPublisher());
        return builder.build();
    }

    /**
     * Executes the request and returns the response body as a string, decoded
     * with the configured response charset. The instants taken immediately
     * before and after the call are stored in the result options under the
     * keys {@code "startime"} and {@code "endtime"}.
     *
     * @return the response result
     * @throws IOException          if an I/O error occurs while sending or receiving
     * @throws InterruptedException if the operation is interrupted
     */
    public HttpResult<String> executeToString() throws IOException, InterruptedException {
        HttpRequest request = buildRequest();
        HashMap<String,Object> options = new HashMap<>();
        options.put("startime", Instant.now());
        HttpResponse<String> response = client.send(request,HttpResponse.BodyHandlers.ofString(config.getResCode()));
        options.put("endtime",Instant.now());
        return new HttpResult<>(response.statusCode(), response.body(), client, request, response, options);
    }

    /**
     * Executes the request and writes the response body to a file.
     *
     * @param pathStr path of the file to save to; missing parent directories are created
     * @return the response result, with the target file as data
     * @throws IOException          if an I/O error occurs while sending or receiving
     * @throws InterruptedException if the operation is interrupted
     */
    public HttpResult<File> executeToFile(String pathStr) throws IOException, InterruptedException {
        File file = new File(pathStr);
        // A bare file name has no parent directory; nothing to create in that case.
        File parent = file.getParentFile();
        if (!file.exists() && parent != null){
            parent.mkdirs();
        }
        // Build the request once so that the request sent is the one stored in the result.
        HttpRequest request = buildRequest();
        HttpResponse<Path> response = client.send(request,HttpResponse.BodyHandlers.ofFile(file.toPath()));
        return new HttpResult<>(response.statusCode(), file, client, request, response);
    }

    /**
     * Flattens the configured headers into alternating name/value entries.
     *
     * @return the header array, empty if no headers are configured
     */
    private String[] buildHeader(){
        Map<String,String> headerMap = config.getHeaderMap();
        if (headerMap == null) {
            return new String[0];
        }
        String[] headers = new String[headerMap.size() * 2];
        int index = 0;
        for (Map.Entry<String,String> entry : headerMap.entrySet()) {
            headers[index++] = entry.getKey();
            headers[index++] = entry.getValue();
        }
        return headers;
    }

    /**
     * Converts the request parameters into a body publisher.
     *
     * <p>NOTE(review): the parameters are published as a body for every method,
     * including GET — confirm whether a GET body is intended.
     *
     * @return a string publisher for the parameters, or an empty publisher if there are none
     */
    private HttpRequest.BodyPublisher buildPublisher(){
        String paramsStr = config.paramsToString();
        if (paramsStr != null) {
            return HttpRequest.BodyPublishers.ofString(paramsStr,config.getReqCode());
        }
        return HttpRequest.BodyPublishers.noBody();
    }



    /**
     * Entry point of the builder: creates a fresh configuration.
     *
     * @return a new HttpConfig with default settings
     */
    public static HttpConfig builder(){
        return new HttpConfig();
    }
}

Have you noticed? HttpExplore and HttpConfig are tightly coupled and neither can do without the other, so when we use the tool we naturally do the configuration first — that is the benefit of the builder pattern.

Another advantage of the JDK's HttpClient is that GET, POST, PUT and DELETE are all just a parameter, and the way request parameters are passed is the same for each of them. Isn't that convenient?

In addition, special attention should be paid to the problem of character encoding. Remember to set the request encoding and response encoding, otherwise the problem of garbled characters will be a headache. 

response package

Once the request tool is encapsulated we can fetch the content of a remote site, but parsing that content is another problem (you may have noticed that so far we have not used any third-party toolkit beyond the JDK). For convenience, we use Jsoup here to parse the HTML content of the response. Jsoup converts HTML into nodes whose content we can read directly, which is convenient, fast and clean.

HttpResult.java

package com.vtarj.pythagoras.explore;

import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.util.HashMap;

/**
 * Normalized response result.
 *
 * <p>Immutable holder for the status code, the converted data, the client,
 * request and response objects involved in the exchange, and an optional map
 * of extra values.
 *
 * <p>Created 2022/4/2 11:07.
 *
 * @param <T> type of the converted response data
 * @author Vtarj
 */
public class HttpResult<T> {
    private final int code;
    private final T data;
    private final HttpClient client;
    private final HttpRequest request;
    private final HttpResponse response;
    private final HashMap<String,Object> options;

    /**
     * Creates a result that carries extra options.
     *
     * @param code     status code
     * @param data     converted response data
     * @param client   client the request was sent with
     * @param request  request that was sent
     * @param response response that was received
     * @param options  extra values attached to the result
     */
    public HttpResult(int code, T data, HttpClient client, HttpRequest request, HttpResponse response, HashMap<String, Object> options) {
        this.options = options;
        this.response = response;
        this.request = request;
        this.client = client;
        this.data = data;
        this.code = code;
    }

    /**
     * Creates a result without extra options; {@link #getOptions()} then returns null.
     *
     * @param code     status code
     * @param data     converted response data
     * @param client   client the request was sent with
     * @param request  request that was sent
     * @param response response that was received
     */
    public HttpResult(int code, T data, HttpClient client, HttpRequest request, HttpResponse response) {
        this(code, data, client, request, response, null);
    }

    /** @return the status code */
    public int getCode() {
        return this.code;
    }

    /** @return the converted response data */
    public T getData() {
        return this.data;
    }

    /** @return the client the request was sent with */
    public HttpClient getClient() {
        return this.client;
    }

    /** @return the request that was sent */
    public HttpRequest getRequest() {
        return this.request;
    }

    /** @return the response that was received */
    public HttpResponse getResponse() {
        return this.response;
    }

    /** @return the extra values, or null if none were supplied */
    public HashMap<String, Object> getOptions() {
        return this.options;
    }

    /**
     * Renders code, data, request and response; client and options are left out.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("HttpResult{");
        text.append("code=").append(code);
        text.append(", data=").append(data);
        text.append(", request=").append(request);
        text.append(", response=").append(response);
        text.append('}');
        return text.toString();
    }
}

In the response result we keep as much of the original content as possible, so that no relevant information is lost later because of an oversight in the wrapper. For the same reason, it is recommended to add a customizable options field for storing custom content.

To be continued~~~

Previous: Actual Combat: Building a Java Crawler by Hand - Based on JDK11 Native HttpClient (2)

Next: Actual Combat: Building a Java Crawler by Hand - Based on JDK11 Native HttpClient (4)

Guess you like

Origin blog.csdn.net/Asgard_Hu/article/details/124602917