webmagic爬虫

package com.wilian.astro.webmagic.follifollie;

import com.wilian.astro.webmagic.vo.WatchVO;

import us.codecraft.webmagic.Page;

import us.codecraft.webmagic.Site;

import us.codecraft.webmagic.Spider;

import us.codecraft.webmagic.processor.PageProcessor;

/**

 * 

* @ClassName: OfficialWebsitePageProcessor  

* @Description: 官网数据  

* @author PENGDI052  

* @date 2018年4月16日  

*

 */

public class OfficialWebsitePageProcessor  implements PageProcessor {

// 官网手表入口页面&分页

// http://www.follifollie.com.cn/ch-ch/online-shop/watches/all#pg=1

// 产品详情页面

// http://www.follifollie.com.cn/ch-ch/online-shop/watches/jewelled/wf9a019bsw_xx-carousel-%E7%B3%BB%E5%88%97%E6%89%8B%E8%A1%A8

private static final String DETAIL_URL = "http://www.follifollie.com.cn/ch-ch/online-shop/watches/[a-zA-z]+/[\\w]+";

private static final String LIST_URL = "http://www.follifollie.com.cn/ch-ch/online-shop/watches/all#pg=[0-9]+";

private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

public void process(Page page) {

System.out.println("11");

if (page.getUrl().regex(LIST_URL).match()) {

page.addTargetRequests(page.getHtml().links().regex(DETAIL_URL).all());

}

else{

WatchVO watch = new WatchVO();

watch.setSource("官网");

watch.setUrl(page.getUrl().get());

watch.setBrand("Folli Follie");

watch.setSeries(page.getHtml().xpath("/html/body/div[1]/section[1]/div/div/div[3]/div/div/div[1]/div[2]/div/h1/text()").get());

System.out.println(watch.toString());

}

}

public Site getSite() {

return site;

}

public static void main(String[] args) {

Spider.create(new OfficialWebsitePageProcessor())

.addUrl(new String[]{"http://www.follifollie.com.cn/ch-ch/online-shop/watches/all#pg=1"})

.thread(5)

.run();

        

    }

}

log4j.properties

# Root logger: level INFO, events are sent to the "stdout" appender only.
log4j.rootLogger=INFO, stdout

# "stdout" appender: writes to the console via System.out.
log4j.appender.stdout=org.apache.log4j.ConsoleAppender

log4j.appender.stdout.Target=System.out

# Drop events below INFO at the appender level.
log4j.appender.stdout.Threshold=INFO

log4j.appender.stdout.ImmediateFlush=true

log4j.appender.stdout.layout=org.apache.log4j.PatternLayout

# Pattern: timestamp, level right-padded to 5 chars, MDC value "RequestId", message, newline.
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} %5p %X{RequestId} - %m%n

# "mongodb" appender: despite its name this is also a ConsoleAppender on System.out.
# NOTE(review): it is not listed on log4j.rootLogger above, and no other logger in
# this file references it, so it appears to be unused here - confirm.
log4j.appender.mongodb=org.apache.log4j.ConsoleAppender

log4j.appender.mongodb.Target=System.out

# This appender would accept events from DEBUG upward.
log4j.appender.mongodb.Threshold=DEBUG

log4j.appender.mongodb.ImmediateFlush=true

log4j.appender.mongodb.layout=org.apache.log4j.PatternLayout

# Same pattern as the "stdout" appender.
log4j.appender.mongodb.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} %5p %X{RequestId} - %m%n

package com.wilian.astro.webmagic.vo;

import java.util.Date;

/**
 * Mutable value object describing one watch record collected by a crawler.
 * All fields start out as {@code null} and are populated through setters.
 */
public class WatchVO {

    /** Brand name. */
    private String brand;
    /** Image URL. */
    private String imgUrl;
    /** Current selling price. */
    private String salePrice;
    /** Discount. */
    private String discount;
    /** List price. */
    private String price;
    /** Model: the specific model within the brand. */
    private String model;
    /** Type, e.g. quartz, ceramic, mechanical. */
    private String style;
    /** Series / collection. */
    private String series;
    /** Where the data came from. */
    private String source;
    /** URL the data was taken from. */
    private String url;
    /** Stock. */
    private String stock;
    /** Creation time. */
    private Date createDate;
    /** Last update time. */
    private Date updateDate;

    // ---- brand ----
    public String getBrand() { return this.brand; }
    public void setBrand(String brand) { this.brand = brand; }

    // ---- imgUrl ----
    public String getImgUrl() { return this.imgUrl; }
    public void setImgUrl(String imgUrl) { this.imgUrl = imgUrl; }

    // ---- salePrice ----
    public String getSalePrice() { return this.salePrice; }
    public void setSalePrice(String salePrice) { this.salePrice = salePrice; }

    // ---- discount ----
    public String getDiscount() { return this.discount; }
    public void setDiscount(String discount) { this.discount = discount; }

    // ---- price ----
    public String getPrice() { return this.price; }
    public void setPrice(String price) { this.price = price; }

    // ---- model ----
    public String getModel() { return this.model; }
    public void setModel(String model) { this.model = model; }

    // ---- style ----
    public String getStyle() { return this.style; }
    public void setStyle(String style) { this.style = style; }

    // ---- series ----
    public String getSeries() { return this.series; }
    public void setSeries(String series) { this.series = series; }

    // ---- source ----
    public String getSource() { return this.source; }
    public void setSource(String source) { this.source = source; }

    // ---- url ----
    public String getUrl() { return this.url; }
    public void setUrl(String url) { this.url = url; }

    // ---- stock ----
    public String getStock() { return this.stock; }
    public void setStock(String stock) { this.stock = stock; }

    // ---- createDate ----
    public Date getCreateDate() { return this.createDate; }
    public void setCreateDate(Date createDate) { this.createDate = createDate; }

    // ---- updateDate ----
    public Date getUpdateDate() { return this.updateDate; }
    public void setUpdateDate(Date updateDate) { this.updateDate = updateDate; }

    /**
     * Renders every field as {@code name=value} inside {@code WatchVO [...]},
     * in declaration order; unset fields appear as {@code null}.
     *
     * @return the textual form of this record
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("WatchVO [");
        text.append("brand=").append(brand);
        text.append(", imgUrl=").append(imgUrl);
        text.append(", salePrice=").append(salePrice);
        text.append(", discount=").append(discount);
        text.append(", price=").append(price);
        text.append(", model=").append(model);
        text.append(", style=").append(style);
        text.append(", series=").append(series);
        text.append(", source=").append(source);
        text.append(", url=").append(url);
        text.append(", stock=").append(stock);
        text.append(", createDate=").append(createDate);
        text.append(", updateDate=").append(updateDate);
        return text.append("]").toString();
    }

}

猜你喜欢

转载自wilian.iteye.com/blog/2418745