这里尝试抓取书海网（www.shuhai.com）的小说章节：
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.JsonFilePipeline; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.Selectable; public class NovelProcessor implements PageProcessor { private Site site = Site.me().setRetryTimes(3).setRetrySleepTime(100); @Override public void process(Page page) { page.addTargetRequests(page.getHtml().links().regex("www.shuhai.com/read/([0-9]+)/([0-9]+).html").all()); page.putField("title", page.getHtml().xpath("//div[@class='read_top']/h1/text()").toString()); if (page.getResultItems().get("title") == null) { page.setSkip(true); } else { Selectable xpath = page.getHtml().xpath("//div[@class='txt fon_size']/"); page.putField("content", NodesUtis.nodesToString(xpath)); } for (Request req : page.getTargetRequests()) { System.out.println(req.getUrl()); } } @Override public Site getSite() { return site; } public static void main(String[] args) { Spider.create(new NovelProcessor()).addUrl("http://www.shuhai.com/read/37504/") .addPipeline(new JsonFilePipeline("D:\\webmagic\\")).thread(5).run(); } }
import java.util.List;
import us.codecraft.webmagic.selector.Selectable;

/**
 * Helper for turning selected nodes into plain text.
 */
public class NodesUtis {

    /**
     * Concatenates the text of every {@code <p>} found under the selected nodes.
     *
     * <p>All matches of {@code //p/text()} are collected for each node; the earlier
     * {@code toString()} call kept only the first match and appended the literal
     * string "null" when a node had none.
     *
     * @param xpath the selection whose nodes are scanned; may be {@code null}
     * @return the paragraph texts joined in document order without a separator,
     *         or an empty string when nothing was selected
     */
    public static String nodesToString(Selectable xpath) {
        if (xpath == null) {
            return "";
        }
        List<Selectable> nodes = xpath.nodes();
        StringBuilder content = new StringBuilder();
        for (Selectable node : nodes) {
            for (String paragraph : node.xpath("//p/text()").all()) {
                if (paragraph != null) {
                    content.append(paragraph);
                }
            }
        }
        return content.toString();
    }
}
不过好像还有点小问题，下次再改吧。