using the Rhino JavaScript engine to simulate JavaScript execution. HtmlUnit is more powerful than HttpClient + jsoup.
pom.xml
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.26</version>
</dependency>
Add the HtmlUnit dependency above to the project's pom.xml.
Simple test:
public class TestHtmlUnit { public static void main(String[] args) { Set<String> urls=getPageUrls(); if(null!=urls && urls.size() > 0){ for (String url : urls) { System.out.println(url); } } } /** * Get the latest recommended video page address of uc-Funny * @return Set<String> */ public static Set<String> getPageUrls(){ Set<String> set = new HashSet<String>(); String base="https://news.uc.cn"; WebClient webClient = new WebClient(BrowserVersion.FIREFOX_52);//Instantiate the client (Firefox) String targurl="https://news.uc.cn/c_shipin/"; webClient.getOptions().setJavaScriptEnabled(false);//关闭javaScript //webClient.getOptions().setCssEnabled(false);//Close css try { HtmlPage page=webClient.getPage(targurl); try { Thread.sleep(2000); } catch (InterruptedException e) { e.printStackTrace (); } List<HtmlDivision> divs=page.getByXPath("//div[@class='news-list']"); HtmlDivision div=divs.get(0); List<HtmlListItem> lis=div.getByXPath("//li[@class='news-item flag']"); if(null!=lis && lis.size()>0){ for (HtmlListItem li : lis) { DomNodeList<HtmlElement> as=li.getElementsByTagName("a"); if(null!=as && as.size() > 0){ HtmlElement a=as.get(0); String href=a.getAttribute("href"); if(null!=href && !href.equals("")){ href=base+href; System.out.println(href); set.add(href); } } } } } catch (FailingHttpStatusCodeException e) { e.printStackTrace (); } catch (MalformedURLException e) { e.printStackTrace (); } catch (IOException e) { e.printStackTrace (); }finally{ webClient.close(); } return set; } }