[Crawler] Java: crawling web page information with Jsoup + HttpClient

The general idea:

1. Add the dependencies

2. Write a utility class that takes a URL (the page to crawl) as a parameter and uses HttpClient to connect to the page and fetch its source

3. Use Jsoup to parse the fetched page source and extract the required elements by tag or attribute value (an end-to-end sketch follows this list)
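
To see how the steps fit together, here is a minimal end-to-end sketch using the JsoupHttpClientUtils utility class defined in step 2 below (the target URL is only a placeholder):

import com.gourd.base.utils.JsoupHttpClientUtils;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class SpiderQuickStart {
    public static void main(String[] args) {
        // Step 2: fetch the page source with HttpClient
        String html = JsoupHttpClientUtils.getHtmlByUrl("https://www.example.com");
        if (html == null) {
            return;
        }
        // Step 3: parse with Jsoup and extract all <a> elements
        Elements links = JsoupHttpClientUtils.getElements(html, "a");
        for (Element link : links) {
            System.out.println(link.attr("href") + " | " + link.text());
        }
    }
}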

Code:

1. Dependencies:

<!--httpclient-->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.6</version>
</dependency>
<!--jsoup-->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.8.3</version>
</dependency>
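
If your build uses Gradle instead of Maven, the equivalent coordinates would be (a sketch, assuming the same versions):

dependencies {
    implementation 'org.apache.httpcomponents:httpclient:4.5.6'
    implementation 'org.jsoup:jsoup:1.8.3'
}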

2. Utility class:

package com.gourd.base.utils;

import lombok.extern.slf4j.Slf4j;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.LayeredConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLContexts;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import javax.net.ssl.SSLContext;
import java.io.IOException;
import java.security.KeyManagementException;
import java.security.KeyStore;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;

/**
 * Jsoup + HttpClient utility class
 *
 * @author gourd
 */
@Slf4j
public class JsoupHttpClientUtils {

    private final static String DEFAULT_CHARSET = "utf8";

    /**
     * Fetch the page source for the given URL.
     *
     * @param url URL of the page to crawl
     * @return the page source as a string, or null if the fetch failed
     */
    public static String getHtmlByUrl(String url) {
        // The fetched page source
        String html = null;
        // Create a request client
        CloseableHttpClient httpClient = null;
        if (url.startsWith("https://")) {
            httpClient = getHttpsClient();
        } else {
            httpClient = HttpClients.createDefault();
        }

        // Request the URL with HTTP GET
        HttpGet httpGet = new HttpGet(url);
        // Simulate a browser User-Agent
        httpGet.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");

        // Optional: route through a proxy IP
//        HttpHost proxy = new HttpHost("118.114.77.47", 8080);
        RequestConfig config = RequestConfig.custom()
                // Timeout for obtaining a connection from the pool: 10 seconds
                .setConnectionRequestTimeout(10000)
                // Connection timeout: 10 seconds
                .setConnectTimeout(10000)
                // Socket (read) timeout: 10 seconds
                .setSocketTimeout(10000)
                .build();
        httpGet.setConfig(config);
        // Execute the request and capture the response
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response != null && response.getStatusLine().getStatusCode() == 200) {
                // Get the entity from the response
                HttpEntity entity = response.getEntity();
                // Output the returned entity as a string
                html = EntityUtils.toString(entity, DEFAULT_CHARSET);
            } else {
                log.error("Failed to fetch the page source");
            }
        } catch (IOException e) {
            log.error("获取网页源码异常:", e);
        } finally {
            // Release resources
            try {
                if (response != null) {
                    response.close();
                }
            } catch (IOException e) {
                log.error("response关闭错误,请检查原因");
            }
            try {
                if(httpClient != null){
                    httpClient.close();
                }
            } catch (IOException e) {
                log.error("httpClient关闭错误,请检查原因");
            }
        }
        return html;
    }

    /**
     * Extract elements from the HTML source by tag name.
     *
     * @param html    page source
     * @param tagName tag name to look up
     * @return all elements with the given tag name
     */
    public static Elements getElements(String html, String tagName) {
        // Parse the page source into a Document object
        Document doc = Jsoup.parse(html);
        // Get all DOM elements whose tag is tagName
        Elements elements = doc.getElementsByTag(tagName);
        return elements;
    }


    /**
     * Build an HTTPS client that skips certificate validation
     * (convenient for crawling, but insecure for anything sensitive).
     *
     * @return a CloseableHttpClient that trusts any certificate
     */
    private static CloseableHttpClient getHttpsClient() {
        RegistryBuilder<ConnectionSocketFactory> registryBuilder = RegistryBuilder.<ConnectionSocketFactory>create();
        ConnectionSocketFactory plainSF = new PlainConnectionSocketFactory();
        registryBuilder.register("http", plainSF);
        // Configure the trust store and connection socket factory
        try {
            KeyStore trustStore = KeyStore.getInstance(KeyStore.getDefaultType());
            // Trust any certificate chain
            TrustStrategy anyTrustStrategy = new TrustStrategy() {

                @Override
                public boolean isTrusted(java.security.cert.X509Certificate[] arg0, String arg1) throws java.security.cert.CertificateException {
                    return true;
                }
            };
            SSLContext sslContext = SSLContexts.custom().useTLS().loadTrustMaterial(trustStore, anyTrustStrategy).build();
            LayeredConnectionSocketFactory sslSF = new SSLConnectionSocketFactory(sslContext, SSLConnectionSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
            registryBuilder.register("https", sslSF);
        } catch (KeyStoreException | KeyManagementException | NoSuchAlgorithmException e) {
            throw new RuntimeException(e);
        }
        Registry<ConnectionSocketFactory> registry = registryBuilder.build();
        // Set up the pooling connection manager
        PoolingHttpClientConnectionManager connManager = new PoolingHttpClientConnectionManager(registry);
        // Build the client
        return HttpClientBuilder.create().setConnectionManager(connManager).build();
    }


}
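
The utility exposes only tag-based lookup, but once the page source is in hand, Jsoup's CSS selector API can be used directly for attribute-based extraction. A minimal sketch (the selector strings are illustrative, not part of the original utility):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class SelectorExample {
    public static void printPngImages(String html) {
        Document doc = Jsoup.parse(html);
        // <img> elements whose src attribute ends with .png
        Elements pngImages = doc.select("img[src$=.png]");
        for (Element img : pngImages) {
            System.out.println(img.attr("src"));
        }
        // <a> elements that carry a title attribute
        Elements titledLinks = doc.select("a[title]");
        System.out.println(titledLinks.size() + " titled links found");
    }
}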

3. Test call:

import com.gourd.base.utils.JsoupHttpClientUtils;
import io.swagger.annotations.Api;
import io.swagger.annotations.ApiOperation;
import org.jsoup.select.Elements;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;

/**
 * @author gourd
 * createAt: 2018/9/17
 */

@RestController
@Api(description = "Spider controller")
@RequestMapping("/spider")
public class SpiderController {

    @GetMapping(value = "/spider")
    @ApiOperation(value = "Fetch crawled page data", notes = "Fetch crawled page data")
    public void spider(@RequestParam String url) {
        String htmlByUrl = JsoupHttpClientUtils.getHtmlByUrl(url);
        if (htmlByUrl == null) {
            System.out.println("failed to fetch page source");
            return;
        }
        Elements elements = JsoupHttpClientUtils.getElements(htmlByUrl, "a");
        System.out.println("success: extracted " + elements.size() + " <a> elements");
    }


}
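
With the application running, the method maps to GET /spider/spider?url=... (the class-level /spider prefix plus the method-level /spider path); it can also be exercised through the Swagger UI generated from the @Api annotations. Host and port depend on your Spring Boot configuration.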

 

Origin blog.csdn.net/HXNLYW/article/details/95327493