Web crawling with Selenium

Selenium drives a real browser the way an end user would, so it can crawl dynamically rendered page data and test applications from the end user's perspective. WebDriver controls the browser directly, either through the browser's native automation support or through a browser extension.

WebDriver download

Because Selenium drivers are tied to specific browser versions, you need to download the driver version that matches the browser you have installed.

1. Add dependencies

        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-java</artifactId>
            <version>4.11.0</version>
        </dependency>
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>32.1.2-jre</version>
        </dependency>

2. Tools

import cn.hutool.core.collection.CollectionUtil;
import com.google.common.collect.Lists;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.edge.EdgeDriver;
import org.openqa.selenium.edge.EdgeOptions;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.firefox.FirefoxOptions;
import org.springframework.stereotype.Component;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;

/**
 * Selenium helper: builds pre-configured browser drivers and extracts
 * header/body text from HTML tables.
 *
 * @author kou
 */
@Slf4j
@RequiredArgsConstructor
@Component
public class SeleniumUtil {

    /** Local path of the chromedriver executable. */
    private static final String CHROME_DRIVER_PATH = "D:/chromedriver.exe";

    /** Local path of the msedgedriver executable. */
    private static final String EDGE_DRIVER_PATH = "D:/msedgedriver.exe";

    /** Local path of the geckodriver executable. */
    private static final String GECKO_DRIVER_PATH = "D:/geckodriver.exe";

    /** Desktop Chrome 115 user-agent string (used by the Chrome driver). */
    private static final String USER_AGENT_CHROME_115 =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36";

    /** Desktop Chrome 86 user-agent string (used by the Edge and Firefox drivers). */
    private static final String USER_AGENT_CHROME_86 =
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36";

    /** Injected crawler configuration; not read by any method of this class as shown. */
    private final ReptileProperties reptileProperties;

    /**
     * Creates a Chrome driver configured to look less like an automated browser.
     *
     * @return a new ChromeDriver; the caller is responsible for quitting it
     */
    public WebDriver chromeDriver() {
        // Tell Selenium where the driver executable lives.
        System.setProperty("webdriver.chrome.driver", CHROME_DRIVER_PATH);
        // Per the original author: Chrome does not allow cross-machine debugging by default,
        // so the start command needs an IP whitelist setting.
        System.setProperty("webdriver.chrome.whitelistedIps", "");

        ChromeOptions options = new ChromeOptions();
        // Do not load the automation extension.
        options.setExperimentalOption("useAutomationExtension", false);
        // Experimental option excludeSwitches (value must be a List); intended to keep
        // window.navigator.webdriver from reporting true.
        options.setExperimentalOption("excludeSwitches", Lists.newArrayList("enable-automation"));
        // The original author found this switch to be the one that matters most for hiding automation.
        options.addArguments("--disable-blink-features=AutomationControlled");
        options.addArguments("user-agent=" + USER_AGENT_CHROME_115);

        // Disable the sandbox.
        options.addArguments("--no-sandbox");
        // To run without opening a browser window, additionally pass "--headless"
        // (optionally with "--disable-gpu").
        options.addArguments("--remote-allow-origins=*");

        return new ChromeDriver(options);
    }

    /**
     * Creates an Edge driver configured to look less like an automated browser.
     *
     * @return a new EdgeDriver; the caller is responsible for quitting it
     */
    public WebDriver edgeDriver() {
        // Tell Selenium where the driver executable lives.
        System.setProperty("webdriver.edge.driver", EDGE_DRIVER_PATH);

        EdgeOptions options = new EdgeOptions();
        // Do not load the automation extension.
        options.setExperimentalOption("useAutomationExtension", false);
        // Experimental option excludeSwitches (value must be a List); intended to keep
        // window.navigator.webdriver from reporting true.
        options.setExperimentalOption("excludeSwitches", Lists.newArrayList("enable-automation"));
        // The original author found this switch to be the one that matters most for hiding automation.
        options.addArguments("--disable-blink-features=AutomationControlled");
        options.addArguments("--incognito", "--disable-infobars");
        options.addArguments("user-agent=" + USER_AGENT_CHROME_86);

        // Disable the sandbox.
        options.addArguments("--no-sandbox");
        // To run without opening a browser window, additionally pass "--headless".
        options.addArguments("--disable-gpu");
        options.addArguments("--remote-allow-origins=*");

        return new EdgeDriver(options);
    }

    /**
     * Creates a headless Firefox driver with an overridden user agent.
     *
     * @return a new FirefoxDriver; the caller is responsible for quitting it
     */
    public WebDriver firefoxDriver() {
        // Tell Selenium where the driver executable lives.
        System.setProperty("webdriver.gecko.driver", GECKO_DRIVER_PATH);

        FirefoxOptions options = new FirefoxOptions();
        // "user-agent=..." is a Chromium command-line switch; Firefox takes the
        // user agent from the general.useragent.override preference instead.
        options.addPreference("general.useragent.override", USER_AGENT_CHROME_86);

        // Headless mode: no browser window is opened.
        options.addArguments("--headless");

        return new FirefoxDriver(options);
    }

    /**
     * Reads the text of every {@code <th>} cell inside the table's {@code <thead>}.
     *
     * @param table the table element
     * @return header cell texts in document order, or an empty list when the table has no {@code <thead>}
     */
    public List<String> getTableHead(WebElement table) {

        log.info("开始解析表头...");
        // findElement never returns null (it throws when nothing matches), so use
        // findElements to make the "no thead" case an empty result.
        List<WebElement> heads = table.findElements(By.tagName("thead"));
        if (heads.isEmpty()) {
            return Collections.emptyList();
        }
        List<WebElement> headths = heads.get(0).findElements(By.tagName("th"));
        List<String> headList = new ArrayList<>(headths.size());
        for (WebElement th : headths) {
            headList.add(th.getText());
        }
        log.info("表头解析完成!!!");
        return headList;
    }

    /**
     * Reads the text of every {@code <td>} cell of every row inside the table's {@code <tbody>}.
     *
     * @param table the table element
     * @return one list of cell texts per row, or an empty list when the table has no
     *         {@code <tbody>} or the body has no rows
     */
    public List<List<String>> getTableBody(WebElement table) {

        log.info("开始解析表数据...");
        // findElement never returns null (it throws when nothing matches), so use
        // findElements to make the "no tbody" case an empty result.
        List<WebElement> bodies = table.findElements(By.tagName("tbody"));
        if (bodies.isEmpty()) {
            return Collections.emptyList();
        }
        // Data rows of the body.
        List<WebElement> bodyTrs = bodies.get(0).findElements(By.tagName("tr"));
        if (CollectionUtil.isEmpty(bodyTrs)) {
            return Collections.emptyList();
        }
        List<List<String>> bodyDatas = new ArrayList<>(bodyTrs.size());
        for (WebElement tr : bodyTrs) {
            List<WebElement> tds = tr.findElements(By.tagName("td"));
            List<String> row = new ArrayList<>(tds.size());
            for (WebElement td : tds) {
                row.add(td.getText());
            }
            bodyDatas.add(row);
        }
        log.info("表数据解析完成!!!");
        return bodyDatas;
    }

    /**
     * Joins the parameters into a {@code key=value&key=value} string.
     *
     * <p>Keys and values are appended as-is (no URL encoding); a {@code null}
     * value is rendered as an empty value ({@code key=}). No leading {@code ?}
     * is added.
     *
     * @param params the parameters, may be {@code null} or empty
     * @return the joined parameter string, or an empty string when there are no parameters
     */
    public String convertPathParams(Map<String, Object> params) {

        if (CollectionUtil.isEmpty(params)) {
            return "";
        }
        StringBuilder path = new StringBuilder();
        for (Map.Entry<String, Object> p : params.entrySet()) {
            if (path.length() > 0) {
                path.append('&');
            }
            path.append(p.getKey()).append('=');
            Object value = p.getValue();
            if (value != null) {
                path.append(value);
            }
        }
        return path.toString();
    }

}

3. Crawl data

import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.support.ui.ExpectedCondition;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.springframework.stereotype.Service;
import org.springframework.util.StringUtils;

import java.time.Duration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;

/**
 * Data service implementation that scrapes an HTML table with Selenium.
 *
 * @author kou
 */
@Slf4j
@RequiredArgsConstructor
@Service
public class DataServiceImpl {

    /** Maximum time to wait for the table body to become visible. */
    private static final Duration TABLE_WAIT_TIMEOUT = Duration.ofSeconds(100);

    /** Interval between two checks of the table body. */
    private static final Duration TABLE_POLL_INTERVAL = Duration.ofSeconds(2);

    private final SeleniumUtil seleniumUtil;

    /**
     * Loads the page, waits for the element with id {@code table} to show its
     * body, and returns the table's header and body cell texts.
     *
     * <p>No {@code @Override} here: this class, as declared, does not implement
     * or extend anything, so the annotation would not compile.
     *
     * @return a map with key {@code "header"} (list of header texts) and key
     *         {@code "body"} (list of rows, each a list of cell texts)
     * @throws RuntimeException wrapping any failure while loading or parsing the page
     */
    public Map<String, Object> getHtmlData() {
        WebDriver driver = null;
        try {

            Map<String, Object> data = new HashMap<>();

            String url = "url";
            Map<String, Object> params = new HashMap<>();
            params.put("pageNum", 1);
            params.put("pageSize", 1000);

            // convertPathParams returns "k=v&k=v" without a leading '?', so add the separator here.
            String fullUrl = url + "?" + seleniumUtil.convertPathParams(params);
            driver = seleniumUtil.firefoxDriver();

            // Open the site (once).
            log.info("开始访问:{}", fullUrl);
            driver.get(fullUrl);

            String title = driver.getTitle();
            log.info("网页:{}", title);

            // Explicit wait on the table body: up to 100s, polling every 2s.
            WebDriverWait wait = new WebDriverWait(driver, TABLE_WAIT_TIMEOUT, TABLE_POLL_INTERVAL);
            ExpectedCondition<WebElement> tbodyDisplayed = d -> {
                log.info("开始检查tbody数据是否已加载");
                WebElement tbody = d.findElement(By.id("table")).findElement(By.tagName("tbody"));
                if (!tbody.isDisplayed()) {
                    log.info("检查结果:tbody数据未加载完,等待加载...");
                    return null;
                }
                log.info("检查结果:tbody数据加载完成!!!");
                return tbody;
            };
            wait.until(tbodyDisplayed);

            // Look the table up only after the wait, so the lookup does not fail
            // (or go stale) while the page is still rendering.
            WebElement table = driver.findElement(By.id("table"));

            // Extract header and body.
            List<String> headList = seleniumUtil.getTableHead(table);
            List<List<String>> bodyList = seleniumUtil.getTableBody(table);
            data.put("header", headList);
            data.put("body", bodyList);
            return data;
        } catch (Exception e) {
            throw new RuntimeException(e);
        } finally {
            // Always end the session, also on failure; quit() closes every window
            // and shuts the driver down, whereas close() only closes the current window.
            if (driver != null) {
                driver.quit();
            }
        }
    }

}

Guess you like

Origin blog.csdn.net/fengxing_2/article/details/132280188