Java Crawler(1)HTMLUnit

Java Crawler(1)HTMLUnit

Add a few dependencies to pom.xml

+              <dependency>
+                  <groupId>net.sourceforge.htmlunit</groupId>
+                  <artifactId>htmlunit</artifactId>
+              </dependency>
+              <dependency>
+                  <groupId>net.sourceforge.htmlunit</groupId>
+                  <artifactId>htmlunit-core-js</artifactId>
+                  <version>2.27</version>
+              </dependency>
+              <dependency>
+                  <groupId>net.sourceforge.htmlunit</groupId>
+                  <artifactId>neko-htmlunit</artifactId>
+                  <version>2.27</version>
+              </dependency>
+              <dependency>
+                  <groupId>org.w3c.css</groupId>
+                  <artifactId>sac</artifactId>
+                  <version>1.3</version>
+              </dependency><dependency>
+                  <groupId>commons-codec</groupId>
+                  <artifactId>commons-codec</artifactId>
+              </dependency>
+              <dependency>
+                  <groupId>xalan</groupId>
+                  <artifactId>xalan</artifactId>
+                  <version>2.7.2</version>
+              </dependency>
+              <dependency>
+                  <groupId>net.sourceforge.cssparser</groupId>
+                  <artifactId>cssparser</artifactId>
+                  <version>0.9.23</version>
+              </dependency>
+              <dependency>
+                  <groupId>org.eclipse.jetty.websocket</groupId>
+                  <artifactId>websocket-client</artifactId>
+              </dependency>
+              <dependency>
+                      <groupId>xerces</groupId>
+                      <artifactId>xercesImpl</artifactId>
+                      <version>2.11.0</version>
+              </dependency>

The simplest test class, CrawlerTest.java

package com.sillycat.jobsmonitorapi.service;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.util.Iterator;
import java.util.List;
import org.junit.Test;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.DomElement;
import com.gargoylesoftware.htmlunit.html.DomNode;
import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.html.HtmlDivision;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlSpan;

public class CrawlerTest {

@Test
public void testCrawlWalmart() throws FailingHttpStatusCodeException, MalformedURLException, IOException {
    try (final WebClient webClient = new WebClient(BrowserVersion.INTERNET_EXPLORER)) {
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        webClient.getOptions().setJavaScriptEnabled(true);
        webClient.getOptions().setRedirectEnabled(true);
        webClient.getOptions().setTimeout(30000);
        webClient.setJavaScriptTimeout(30000);
        webClient.setAjaxController(new NicelyResynchronizingAjaxController());
        HtmlPage page = webClient.getPage("https://www.walmart.com/search/?grid=false&page=2&query=computer#searchProductResult");
        webClient.waitForBackgroundJavaScript(10000);
        //String htmlContent = page.asXml();
        //File htmlFile = new File("/Users/carl/Downloads/products.html");
        //PrintWriter pw = new PrintWriter(htmlFile);
        //pw.print(htmlContent);
        //pw.close();
        HtmlDivision div = page.getFirstByXPath("//div[@class='search-result-listview-items']");
        Iterator<DomElement> itDivs = div.getChildElements().iterator();
        int count = 0;
        for(;itDivs.hasNext();){
            count++;
            DomElement helloDiv = itDivs.next();
            //helloDiv.click();
            System.out.println("Print==== " + count + " " + helloDiv.asText());
         }
        //List<?> links = page.getByXPath("//a[@class='product-title-link']");
        //for(int i = 0;i<links.size();i++){
            //System.out.println("Link===========" + (i+1) + " " + links.get(i));
        //}
        //List<HtmlAnchor> anchors = page.getAnchors();
        //int count = 0;
        //for(int i = 0;i<anchors.size();i++){
            //HtmlAnchor anchor = anchors.get(i);
            //String url = anchor.getHrefAttribute();
            //if(url.startsWith("/ip")){
                //count++;
                //System.out.println("URL ============== " + count + " " + url);
            //}
        //}
       
//          Detail Page
//HtmlPage page = webClient.getPage("https://www.walmart.com/ip/HP-15-ay041wm-15-6-Silver-Fusion-Laptop-Touch-Screen-Windows-10-Intel-Core-i3-6100U-Processor-8GB-Memory-1TB-Hard-Drive/51397784");
//HtmlSpan priceSpan = page.getFirstByXPath("//span[@class='Price-characteristic']");
//System.out.println("Price========" + priceSpan.getTextContent());
}
}
}



References:


Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=326222249&siteId=291194637