版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/qq_35001776/article/details/84871406
pom依赖:
<!-- cdp4j dependency -->
<dependency>
<groupId>io.webfolder</groupId>
<artifactId>cdp4j</artifactId>
<version>2.2.1</version>
</dependency>
<!-- jsoup dependency -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.9.2</version>
</dependency>
<!-- webmagic-related dependencies -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.1</version>
<!-- Excludes the slf4j-log4j12 binding that would otherwise come in through webmagic-core;
     presumably to avoid clashing with the project's own SLF4J binding (confirm). -->
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.1</version>
</dependency>
<!-- NOTE(review): xml-apis is declared explicitly; the reason is not stated in this snippet. -->
<dependency>
<groupId>xml-apis</groupId>
<artifactId>xml-apis</artifactId>
<version>1.4.01</version>
</dependency>
代码示例:
/**
 * Demo entry point: launches Chrome through cdp4j with the {@code --headless} argument,
 * navigates to an Amazon search URL, waits for the document to become ready, and parses
 * the resulting page content with Jsoup so it can be queried with Xsoup.
 *
 * <p>Any failure (launching the browser, creating the session, navigating, parsing, or
 * closing the resources) is logged together with the target URL; nothing is rethrown.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {
    ArrayList<String> command = new ArrayList<>();
    // Run Chrome without showing a browser window.
    command.add("--headless");
    Launcher launcher = new Launcher();
    // Declared outside the try block so the failure message below can name the URL.
    String url = "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Damazon-devices&field-keywords=";
    // A single try-with-resources replaces the former nested try/catch pair: both handlers
    // did the same thing, and the factory and session are still closed on every path.
    try (SessionFactory factory = launcher.launch(command);
         Session session = factory.create()) {
        // setUserAgent is defined elsewhere in this class; it is applied before navigating.
        setUserAgent(session);
        // Open the target URL.
        session.navigate(url);
        // Wait for the page to finish loading; the timeout is 30 s.
        session.waitDocumentReady(30000);
        // Fetch the page content after JavaScript has run.
        String html = session.getContent();
        // Parse the HTML string into a DOM tree with Jsoup.
        Document doc = Jsoup.parse(html);
        // Select the wanted elements with Xsoup.
        // NOTE(review): the empty XPath looks like a placeholder; supply a real expression.
        Elements elements = Xsoup.compile("").evaluate(doc).getElements();
    } catch (Exception e) {
        // Fully qualified so the block needs no additional import in the enclosing file.
        java.util.logging.Logger.getGlobal().log(
                java.util.logging.Level.SEVERE, "Failed to render or parse " + url, e);
    }
}
相关链接:cdp4j、cdp4j demo、webmagic