webmagic crawler

package com.wilian.astro.webmagic.follifollie;

 

import com.wilian.astro.webmagic.vo.WatchVO;

 

import us.codecraft.webmagic.Page;

import us.codecraft.webmagic.Site;

import us.codecraft.webmagic.Spider;

import us.codecraft.webmagic.processor.PageProcessor;

 

/**

 * 

* @ClassName: OfficialWebsitePageProcessor  

* @Description: Official website data  

* @author PENGDI052  

* @date April 16, 2018  

*

 */

public class OfficialWebsitePageProcessor  implements PageProcessor {

// Official website watch entry page & pagination

// http://www.follifollie.com.cn/ch-ch/online-shop/watches/all#pg=1

// product details page

// http://www.follifollie.com.cn/ch-ch/online-shop/watches/jewelled/wf9a019bsw_xx-carousel-%E7%B3%BB%E5%88%97%E6%89%8B%E8%A1%A8

private static final String DETAIL_URL = "http://www.follifollie.com.cn/ch-ch/online-shop/watches/[a-zA-z]+/[\\w]+";

private static final String LIST_URL = "http://www.follifollie.com.cn/ch-ch/online-shop/watches/all#pg=[0-9]+";

private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

public void process(Page page) {

System.out.println("11");

if (page.getUrl().regex(LIST_URL).match()) {

page.addTargetRequests(page.getHtml().links().regex(DETAIL_URL).all());

}

else{

WatchVO watch = new WatchVO();

watch.setSource("官网");

watch.setUrl(page.getUrl().get());

watch.setBrand("Folli Follie");

watch.setSeries(page.getHtml().xpath("/html/body/div[1]/section[1]/div/div/div[3]/div/div/div[1]/div[2]/div/h1/text()").get());

System.out.println(watch.toString());

}

 

}

 

public Site getSite() {

return site;

}

public static void main(String[] args) {

Spider.create(new OfficialWebsitePageProcessor())

.addUrl(new String[]{"http://www.follifollie.com.cn/ch-ch/online-shop/watches/all#pg=1"})

.thread(5)

.run();

        

    }

}

 

log4j.properties

 

# Root logger: level INFO, output routed to the "stdout" appender only.
log4j.rootLogger=INFO, stdout

 

# "stdout" appender: writes to System.out, drops events below INFO,
# and flushes after every event.
log4j.appender.stdout=org.apache.log4j.ConsoleAppender

log4j.appender.stdout.Target=System.out

log4j.appender.stdout.Threshold=INFO

log4j.appender.stdout.ImmediateFlush=true

log4j.appender.stdout.layout=org.apache.log4j.PatternLayout

# Line format: timestamp, level (padded to 5 chars), the "RequestId" MDC value, message, newline.
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} %5p %X{RequestId} - %m%n

 

# "mongodb" appender: despite the name it is also a ConsoleAppender on System.out,
# with a DEBUG threshold and the same line format as "stdout".
# NOTE(review): no logger in the lines shown references this appender, so it receives
# nothing unless it is attached elsewhere - confirm whether it is still needed.
log4j.appender.mongodb=org.apache.log4j.ConsoleAppender

log4j.appender.mongodb.Target=System.out

log4j.appender.mongodb.Threshold=DEBUG

log4j.appender.mongodb.ImmediateFlush=true

log4j.appender.mongodb.layout=org.apache.log4j.PatternLayout

log4j.appender.mongodb.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} %5p %X{RequestId} - %m%n

 

 

package com.wilian.astro.webmagic.vo;

 

import java.util.Date;

 

/**
 * Mutable value object describing one watch record collected by a crawler.
 *
 * <p>All descriptive attributes are plain strings; the two timestamps are {@link Date} values.
 * Every attribute starts out as {@code null} and is exposed through a getter/setter pair.
 */
public class WatchVO {

    /** Brand. */
    private String brand;

    /** Image. */
    private String imgUrl;

    /** Current selling price. */
    private String salePrice;

    /** Discount. */
    private String discount;

    /** Listing price. */
    private String price;

    /** Model: the specific model under the brand. */
    private String model;

    /** Type: quartz, ceramic, mechanical. */
    private String style;

    /** Series. */
    private String series;

    /** Data source. */
    private String source;

    /** Source URL. */
    private String url;

    /** Stock. */
    private String stock;

    /** Creation time. */
    private Date createDate;

    /** Update time. */
    private Date updateDate;

    // ---- brand / image ----

    /** @return the brand */
    public String getBrand() {
        return this.brand;
    }

    /** @param brand the brand */
    public void setBrand(String brand) {
        this.brand = brand;
    }

    /** @return the image value */
    public String getImgUrl() {
        return this.imgUrl;
    }

    /** @param imgUrl the image value */
    public void setImgUrl(String imgUrl) {
        this.imgUrl = imgUrl;
    }

    // ---- pricing ----

    /** @return the current selling price */
    public String getSalePrice() {
        return this.salePrice;
    }

    /** @param salePrice the current selling price */
    public void setSalePrice(String salePrice) {
        this.salePrice = salePrice;
    }

    /** @return the discount */
    public String getDiscount() {
        return this.discount;
    }

    /** @param discount the discount */
    public void setDiscount(String discount) {
        this.discount = discount;
    }

    /** @return the listing price */
    public String getPrice() {
        return this.price;
    }

    /** @param price the listing price */
    public void setPrice(String price) {
        this.price = price;
    }

    // ---- classification ----

    /** @return the model */
    public String getModel() {
        return this.model;
    }

    /** @param model the model */
    public void setModel(String model) {
        this.model = model;
    }

    /** @return the type */
    public String getStyle() {
        return this.style;
    }

    /** @param style the type */
    public void setStyle(String style) {
        this.style = style;
    }

    /** @return the series */
    public String getSeries() {
        return this.series;
    }

    /** @param series the series */
    public void setSeries(String series) {
        this.series = series;
    }

    // ---- provenance / availability ----

    /** @return the data source */
    public String getSource() {
        return this.source;
    }

    /** @param source the data source */
    public void setSource(String source) {
        this.source = source;
    }

    /** @return the source URL */
    public String getUrl() {
        return this.url;
    }

    /** @param url the source URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the stock */
    public String getStock() {
        return this.stock;
    }

    /** @param stock the stock */
    public void setStock(String stock) {
        this.stock = stock;
    }

    // ---- timestamps ----

    /** @return the creation time */
    public Date getCreateDate() {
        return this.createDate;
    }

    /** @param createDate the creation time */
    public void setCreateDate(Date createDate) {
        this.createDate = createDate;
    }

    /** @return the update time */
    public Date getUpdateDate() {
        return this.updateDate;
    }

    /** @param updateDate the update time */
    public void setUpdateDate(Date updateDate) {
        this.updateDate = updateDate;
    }

    /**
     * Renders every attribute as {@code WatchVO [name=value, ...]}; unset attributes appear as
     * {@code null}.
     *
     * @return the textual form of this object
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("WatchVO [");
        text.append("brand=").append(brand);
        text.append(", imgUrl=").append(imgUrl);
        text.append(", salePrice=").append(salePrice);
        text.append(", discount=").append(discount);
        text.append(", price=").append(price);
        text.append(", model=").append(model);
        text.append(", style=").append(style);
        text.append(", series=").append(series);
        text.append(", source=").append(source);
        text.append(", url=").append(url);
        text.append(", stock=").append(stock);
        text.append(", createDate=").append(createDate);
        text.append(", updateDate=").append(updateDate);
        text.append("]");
        return text.toString();
    }
}

 

 

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=326123698&siteId=291194637