[Java Crawler] Scraping Web Page Content with Jsoup + HttpClient

Overall approach:

1. Add the required dependencies

2. Write a utility class that takes a url parameter (the page to crawl) and uses HttpClient to connect and fetch the page's HTML source

3. Parse the fetched HTML source with Jsoup to extract the desired Element objects and their attribute values

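For simple pages, Jsoup can even do the fetching by itself; the HttpClient pairing used below buys finer control over timeouts, proxies, and HTTPS trust. A minimal Jsoup-only sketch for comparison (example.com is a placeholder):

// Jsoup fetches and parses in one call; fine when no special HTTP handling is needed
Document doc = Jsoup.connect("https://example.com")
        .userAgent("Mozilla/5.0")   // pretend to be a browser
        .timeout(10000)             // timeout in milliseconds
        .get();
Elements links = doc.getElementsByTag("a");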
Code:

1. Dependencies:

<!--httpclient-->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.6</version>
</dependency>
<!--jsoup-->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.8.3</version>
</dependency>
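(These are the versions from the original write-up; both libraries have newer releases under the same coordinates, so bumping the version numbers should be all that is needed.)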

2. Utility class:

package com.gourd.base.utils;

import lombok.extern.slf4j.Slf4j;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.LayeredConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLContexts;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import javax.net.ssl.SSLContext;
import java.io.IOException;
import java.security.KeyManagementException;
import java.security.KeyStore;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;

/**
 * Utility class pairing HttpClient (page fetching) with Jsoup (HTML parsing).
 *
 * @author gourd
 */
@Slf4j
public class JsoupHttpClientUtils {

    private static final String DEFAULT_CHARSET = "UTF-8";

    /**
     * Fetch a page's HTML source by URL.
     *
     * @param url the URL of the page to crawl
     * @return the page's HTML source, or null if the request failed
     */
    public static String getHtmlByUrl(String url) {
        // The fetched page source
        String html = null;
        // Create a request client; use the trust-all client for HTTPS URLs
        CloseableHttpClient httpClient = null;
        if (url.startsWith("https://")) {
            httpClient = getHttpsClient();
        } else {
            httpClient = HttpClients.createDefault();
        }

        // Request the URL with an HTTP GET
        HttpGet httpGet = new HttpGet(url);
        // Pretend to be a browser
        httpGet.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");

        // To route through a proxy IP, uncomment the next line and add
        // .setProxy(proxy) to the RequestConfig builder below
//        HttpHost proxy = new HttpHost("118.114.77.47", 8080);
        RequestConfig config = RequestConfig.custom()
                // Max wait for a connection from the pool: 10 seconds
                .setConnectionRequestTimeout(10000)
                // Connect timeout: 10 seconds
                .setConnectTimeout(10000)
                // Socket (read) timeout: 10 seconds
                .setSocketTimeout(10000)
                .build();
        httpGet.setConfig(config);
        // Execute the request and read back the response
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response != null && response.getStatusLine().getStatusCode() == 200) {
                // Extract the entity from the response
                HttpEntity entity = response.getEntity();
                // Render the entity as a string
                html = EntityUtils.toString(entity, DEFAULT_CHARSET);
            } else {
                log.error("Failed to fetch page source");
            }
        } catch (IOException e) {
            log.error("Exception while fetching page source:", e);
        } finally {
            // Release resources
            try {
                if (response != null) {
                    response.close();
                }
            } catch (IOException e) {
                log.error("Error closing response");
            }
            try {
                if (httpClient != null) {
                    httpClient.close();
                }
            } catch (IOException e) {
                log.error("Error closing httpClient");
            }
        }
        return html;
    }

    /**
     * Extract elements from HTML source by tag name.
     *
     * @param html    the page's HTML source
     * @param tagName the tag name to select
     * @return all DOM elements with the given tag name
     */
    public static Elements getElements(String html, String tagName) {
        // Parse the HTML into a Document
        Document doc = Jsoup.parse(html);
        // Collect every DOM element whose tag is tagName
        return doc.getElementsByTag(tagName);
    }


    /**
     * Build an HTTPS-capable client that skips certificate validation.
     * Convenient for crawling, but unsafe for anything security-sensitive.
     *
     * @return a CloseableHttpClient that trusts any certificate
     */
    private static CloseableHttpClient getHttpsClient() {
        RegistryBuilder<ConnectionSocketFactory> registryBuilder = RegistryBuilder.<ConnectionSocketFactory>create();
        ConnectionSocketFactory plainSF = new PlainConnectionSocketFactory();
        registryBuilder.register("http", plainSF);
        // Register the trust store and connection socket factory for HTTPS
        try {
            KeyStore trustStore = KeyStore.getInstance(KeyStore.getDefaultType());
            // Trust every certificate chain
            TrustStrategy anyTrustStrategy = new TrustStrategy() {

                @Override
                public boolean isTrusted(java.security.cert.X509Certificate[] chain, String authType) throws java.security.cert.CertificateException {
                    return true;
                }
            };
            SSLContext sslContext = SSLContexts.custom().useTLS().loadTrustMaterial(trustStore, anyTrustStrategy).build();
            // Also skip hostname verification (deprecated constant, but correct for httpclient 4.5.x)
            LayeredConnectionSocketFactory sslSF = new SSLConnectionSocketFactory(sslContext, SSLConnectionSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
            registryBuilder.register("https", sslSF);
        } catch (KeyStoreException | KeyManagementException | NoSuchAlgorithmException e) {
            throw new RuntimeException(e);
        }
        Registry<ConnectionSocketFactory> registry = registryBuilder.build();
        // Set up the pooling connection manager
        PoolingHttpClientConnectionManager connManager = new PoolingHttpClientConnectionManager(registry);
        // Build the client
        return HttpClientBuilder.create().setConnectionManager(connManager).build();
    }


}
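Step 3 of the outline also mentions attribute values, which the utility class itself does not demonstrate. A small sketch (the html variable is assumed to hold the fetched source) of drilling into the returned Elements with Jsoup's attr/text accessors:

// Requires org.jsoup.nodes.Element in addition to the imports above
Elements links = JsoupHttpClientUtils.getElements(html, "a");
for (Element link : links) {
    String href = link.attr("href");   // raw attribute value as written in the page
    String text = link.text();         // visible link text
    System.out.println(text + " -> " + href);
}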

3. Test invocation:

import com.gourd.base.utils.JsoupHttpClientUtils;
import io.swagger.annotations.Api;
import io.swagger.annotations.ApiOperation;
import org.jsoup.select.Elements;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;

/**
 * @author gourd
 * created at: 2018/9/17
 */

@RestController
@Api(description = "Crawler controller")
@RequestMapping("/spider")
public class SpiderController {

    @GetMapping(value = "/spider")
    @ApiOperation(value = "Fetch crawled page data", notes = "Fetch crawled page data")
    public void spider(@RequestParam String url) {
        String htmlByUrl = JsoupHttpClientUtils.getHtmlByUrl(url);
        // getHtmlByUrl returns null on failure; guard before parsing
        if (htmlByUrl == null) {
            System.out.println("failed to fetch " + url);
            return;
        }
        Elements elements = JsoupHttpClientUtils.getElements(htmlByUrl, "a");

        System.out.println("success: found " + elements.size() + " <a> elements");
    }


}
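With the application running, the endpoint can be called with a plain GET such as /spider/spider?url=https://example.com (URL-encode the parameter value), or through the Swagger UI implied by the @Api annotations.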

Reposted from blog.csdn.net/HXNLYW/article/details/95327493