从头学习爬虫(二十九)实战篇----WebMagic爬CSDN博客 WebMagic入门实战下CSDN,20行代码实现爬取标题

WebMagic 入门实战:爬取 CSDN 博客,20 行代码实现文章标题抓取
spider
import java.util.List;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * WebMagic page processor that crawls the CSDN blog of user {@code qq_36783371}
 * and extracts the title of each article page.
 *
 * <p>Any page containing an article list contributes article ids, which are queued
 * as follow-up requests. Article detail pages contribute a {@code title} result
 * field. Pages that produce no title are marked as skipped so that pipelines are
 * not invoked with empty results.
 */
public class CsdnSpider implements PageProcessor {
	/** Root URL of the blog being crawled. */
	private static final String BLOG_URL = "https://blog.csdn.net/qq_36783371";
	/** Prefix of an article detail URL; the article id is appended to it. */
	private static final String ARTICLE_URL_PREFIX = BLOG_URL + "/article/details/";
	/** Regex matching the URL of an article detail page. */
	private static final String ARTICLE_URL_REGEX = "https://blog\\.csdn\\.net/qq_36783371/article/details/\\d+";

	/** Crawl settings: 5 retries, 5000 ms timeout, 200 ms sleep, browser-like User-Agent. */
	private final Site site = Site.me()
			.setRetryTimes(5)
			.setTimeOut(5000)
			.setSleepTime(200)
			.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0");

	/**
	 * Queues every article referenced by the page and, on article detail pages,
	 * stores the article title under the {@code title} key.
	 *
	 * @param page the downloaded page to inspect
	 */
	@Override
	public void process(Page page) {
		// Iterating an empty list is a no-op, so no emptiness guard is needed.
		List<String> articleIds = page.getHtml().xpath("//div[@class='article-list']/div/@data-articleid").all();
		for (String articleId : articleIds) {
			page.addTargetRequest(ARTICLE_URL_PREFIX + articleId);
		}

		if (!page.getRequest().getUrl().matches(ARTICLE_URL_REGEX)) {
			// Not an article page: there is nothing to hand to the pipelines.
			page.setSkip(true);
			return;
		}

		String title = page.getHtml().xpath("//h6[@class='title-article']/text()").toString();
		if (title == null) {
			// The title element was not found; do not emit a null field.
			page.setSkip(true);
			return;
		}
		page.putField("title", title);
	}

	/**
	 * @return the crawl settings used for every request of this processor
	 */
	@Override
	public Site getSite() {
		return site;
	}

	/**
	 * Starts an asynchronous crawl with 5 threads, seeded with the blog home page
	 * and the second article-list page, printing titles through {@code CsdnPipline}.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		Spider.create(new CsdnSpider())
				.addUrl(BLOG_URL, BLOG_URL + "/article/list/2?")
				.addPipeline(new CsdnPipline())
				.thread(5)
				.runAsync();
	}
}

pipeline

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

/**
 * Pipeline that prints the {@code title} result field of a processed page to
 * standard output. Results without a {@code title} field are ignored.
 */
public class CsdnPipline implements Pipeline {

	/**
	 * Prints the {@code title} field of {@code resultItems}, if present.
	 *
	 * @param resultItems the fields extracted from one page
	 * @param task the task that produced the results; not used here
	 */
	@Override
	public void process(ResultItems resultItems, Task task) {
		Object title = resultItems.get("title");
		if (title == null) {
			// No title was extracted for this page: nothing to print. An explicit
			// check replaces the former catch-all that silently swallowed the
			// resulting NullPointerException along with any other failure.
			return;
		}
		System.out.println(title.toString());
	}

}

改造下变成刷访问量

import java.util.List;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * Variant of the blog crawler that only visits pages: it follows every article
 * link found in an article list but extracts no fields from the pages it downloads.
 *
 * <p>NOTE(review): the surrounding article describes this variant as a way to
 * inflate page-view counts; repeatedly fetching pages for that purpose may violate
 * the site's terms of service. Confirm this is acceptable before running.
 */
public class CsdnSpider implements PageProcessor {
	/** Root URL of the blog being crawled. */
	private static final String BLOG_URL = "https://blog.csdn.net/qq_36783371";
	/** Prefix of an article detail URL; the article id is appended to it. */
	private static final String ARTICLE_URL_PREFIX = BLOG_URL + "/article/details/";
	/** Number of crawls launched by {@link #main(String[])}. */
	private static final int CRAWL_ROUNDS = 100;
	/** Pause, in milliseconds, before each crawl is launched. */
	private static final long ROUND_DELAY_MS = 5000;

	/** Crawl settings: 5 retries, 5000 ms timeout, 200 ms sleep, browser-like User-Agent. */
	private final Site site = Site.me()
			.setRetryTimes(5)
			.setTimeOut(5000)
			.setSleepTime(200)
			.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0");

	/**
	 * Queues a request for every article id found in the page's article list.
	 * No result fields are produced.
	 *
	 * @param page the downloaded page to inspect
	 */
	@Override
	public void process(Page page) {
		// Iterating an empty list is a no-op, so no emptiness guard is needed.
		List<String> articleIds = page.getHtml().xpath("//div[@class='article-list']/div/@data-articleid").all();
		for (String articleId : articleIds) {
			page.addTargetRequest(ARTICLE_URL_PREFIX + articleId);
		}
		// The former article-URL check guarded only commented-out code and has been removed.
	}

	/**
	 * @return the crawl settings used for every request of this processor
	 */
	@Override
	public Site getSite() {
		return site;
	}

	/**
	 * Launches {@code CRAWL_ROUNDS} asynchronous crawls, sleeping
	 * {@code ROUND_DELAY_MS} milliseconds before each one. Crawls are not awaited,
	 * so a crawl may still be running when the next one starts.
	 *
	 * @param args unused
	 * @throws InterruptedException if the thread is interrupted while sleeping
	 */
	public static void main(String[] args) throws InterruptedException {
		for (int round = 0; round < CRAWL_ROUNDS; round++) {
			Thread.sleep(ROUND_DELAY_MS);
			Spider.create(new CsdnSpider())
					.addUrl(BLOG_URL, BLOG_URL + "/article/list/2?")
					.thread(5)
					.runAsync();
		}
	}
}

欢迎加群313557283(刚创建),小白互相学习~


猜你喜欢

转载自blog.csdn.net/qq_36783371/article/details/80183542