(11)Java爬虫框架webmagic实战

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/Fighting_No1/article/details/85014317

Java爬虫框架webmagic实战

本文是我关于webmagic爬虫框架的实战——爬取古诗词网站的诗词数据。此代码只用于爬虫学习,勿用于商业用途。

安装webmagic

webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用webmagic:

<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>

定义Article类保存诗词数据

package edu.nlp.model;

public class Article {

    private int articleId;
    /**
     * 类型
     **/
    private String type;
    /**
     * 作者
     **/
    private String author;
    /**
     * 朝代
     **/
    private String dynasty;
    /**
     * 作者简介
     **/
    private String authorInfo;
    /**
     * 标题
     **/
    private String title;
    /**
     * 原文
     **/
    private String content;
    /**
     * 译文
     **/
    private String translation;
    /**
     * 注释
     **/
    private String comment;
    /**
     * 赏析
     **/
    private String appreciation;
    /**
     * UUID
     **/
    private String id;
    /**
     * 匹配度
     **/
    private float score;

    public int getArticleId() {
        return articleId;
    }

    public void setArticleId(int articleId) {
        this.articleId = articleId;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getAuthor() {
        return author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getDynasty() {
        return dynasty;
    }

    public void setDynasty(String dynasty) {
        this.dynasty = dynasty;
    }

    public String getAuthorInfo() {
        return authorInfo;
    }

    public void setAuthorInfo(String authorInfo) {
        this.authorInfo = authorInfo;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getTranslation() {
        return translation;
    }

    public void setTranslation(String translation) {
        this.translation = translation;
    }

    public String getComment() {
        return comment;
    }

    public void setComment(String comment) {
        this.comment = comment;
    }

    public String getAppreciation() {
        return appreciation;
    }

    public void setAppreciation(String appreciation) {
        this.appreciation = appreciation;
    }

    public String toString() {
        return "Article:{id=" + id + ",score=" + score + ",type=" + type
                + ",dynasty=" + dynasty + ",author=" + author
                + ",authorInfo=" + authorInfo + ",title=" + title + ",content="
                + content + ",translation=" + translation + ",comment=" + comment
                + ",appreciation=" + appreciation + "}";
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public float getScore() {
        return score;
    }

    public void setScore(float score) {
        this.score = score;
    }

}

爬取中国诗词网的数据

以各个朝代为初始链接,爬取中国诗词网中每条诗词的所属朝代、作者信息、原文、翻译、赏析,保存每条诗词数据为json文本。

package edu.nlp.processer;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import edu.nlp.model.Article;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;

public class ShiWenPageProcessor implements PageProcessor {

    /**
     * Alternation matching the dynasty path segment of a shici.net URL.
     **/
    private final static String PATTERN_DYNASTY =
            "(xianqin|hanchao|weijin|nanbeichao|suichao|tangshi|wudai|"
                    + "songci|jinchao|yuanchao|mingchao|qingchao)";
    /**
     * Dynasty index page.
     **/
    private final static String URL_DYNASTY =
            "http://www\\.shici\\.net/" + PATTERN_DYNASTY + "/$";
    /**
     * Author page.
     **/
    private final static String URL_AUTHOR =
            "http://www\\.shici\\.net/shiren/[a-z]{5}\\.html";
    /**
     * Poem detail page.
     **/
    private final static String URL_ARTICLE =
            "http://www\\.shici\\.net/" + PATTERN_DYNASTY + "/[a-z]{5}\\.html";
    /**
     * Translation page (site-relative path).
     **/
    private final static String URL_TRANSLATION =
            "/fanyi/[a-z]{5}\\.html";
    /**
     * Appreciation page (site-relative path).
     **/
    private final static String URL_APPRECIATION =
            "/shangxi/[a-z]{5}\\.html";
    /**
     * Articles waiting for their translation/appreciation pages, keyed by the
     * poem URL. The spider runs with 5 worker threads, so this map must be
     * thread-safe (the original used a plain HashMap).
     **/
    private static Map<String, Article> articleMap =
            new java.util.concurrent.ConcurrentHashMap<String, Article>();

    /**
     * Emits every field of a completed Article into the page results so the
     * pipeline can persist it as JSON.
     **/
    private void saveArticle(Article article, Page page) {
        System.out.println("诗歌:" + article);
        page.putField("dynasty", article.getDynasty());
        page.putField("author", article.getAuthor());
        page.putField("authorInfo", article.getAuthorInfo());
        page.putField("title", article.getTitle());
        page.putField("content", article.getContent());
        page.putField("translation", article.getTranslation());
        page.putField("comment", article.getComment());
        page.putField("appreciation", article.getAppreciation());
    }

    // Site config: retries, politeness delay, browser-like headers.
    // The original User-Agent contained a typo ("Mo zilla").
    private Site site = Site.me().setCycleRetryTimes(5)
            .setRetryTimes(5).setSleepTime(1000)
            .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0")
            .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
            .addHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3")
            .setCharset("UTF-8");


    /**
     * Routes each fetched page by URL pattern: dynasty index -> queue author
     * and poem pages; author page -> queue poem pages; poem page -> extract
     * fields and queue translation/appreciation pages; translation or
     * appreciation page -> complete the pending Article and save it.
     **/
    public void process(Page page) {
        if (page.getUrl().regex(URL_DYNASTY).match()) {
            // Author links on the dynasty index
            List<String> authorUrl = page.getHtml()
                    .xpath("//div[@class='shirenlist']")
                    .links().all();
            page.addTargetRequests(authorUrl);
            // Poem links on the dynasty index
            List<String> essayUrl = page.getHtml()
                    .xpath("//div[@id='related']/ul")
                    .links().all();
            page.addTargetRequests(essayUrl);
            page.setSkip(true);// index pages are not persisted
        } else if (page.getUrl().regex(URL_AUTHOR).match()) {
            // Poem links on the author page
            List<String> poemUrl = page.getHtml()
                    .xpath("//div[@id='related']/ul/li/a/@href")
                    .all();
            page.addTargetRequests(poemUrl);
            page.setSkip(true);// author pages are not persisted
        } else if (page.getUrl().regex(URL_ARTICLE).match()) {
            Html html = page.getHtml();
            Article article = new Article();
            // Dynasty
            String dynasty = html
                    .xpath("//div[@id='article']/div[@class='info']")
                    .regex("<span>朝代:</span>(.*?)</p>").toString();
            article.setDynasty(dynasty);
            // Author
            String author = html
                    .xpath("//div[@id='article']/div[@class='info']")
                    .regex("<span>作者:</span><.*>(.*?)</a>").toString();
            article.setAuthor(author);
            // Constant-first compare: author is null when the regex misses,
            // and the original author.equals(...) would then throw an NPE.
            if (!"佚名".equals(author)) {
                // Author biography (anonymous works have none)
                String authorInfo = html
                        .xpath("//div[@class='authorinfo']")
                        .regex("<br>(.*)</div>").toString();
                article.setAuthorInfo(authorInfo);
            }
            // Title
            String title = html.xpath("div[@id='article']/h1/text()")
                    .toString();
            article.setTitle(title);
            // Original text
            String content = html
                    .xpath("div[@id='article']/div[@class='content']")
                    .regex("<div class=\"content\">(.*)</div>")
                    .toString();
            article.setContent(content);
            // Translation link, if the poem has one
            String translationUrl = html
                    .xpath("div[@id='related']/ul/li/h3/a/@href")
                    .regex(URL_TRANSLATION)
                    .toString();
            // Appreciation link, if the poem has one
            String appreciateUrl = html
                    .xpath("div[@id='related']/ul/li/h3/a/@href")
                    .regex(URL_APPRECIATION)
                    .toString();
            if (translationUrl == null && appreciateUrl == null) {
                // Nothing more to fetch: save immediately
                saveArticle(article, page);
            } else {
                // Park the article until its extra pages arrive; the URL
                // fields temporarily hold the pending links as markers.
                if (translationUrl != null) {
                    article.setTranslation("http://www.shici.net" + translationUrl);
                    page.addTargetRequest("http://www.shici.net" + translationUrl);
                }
                if (appreciateUrl != null) {
                    article.setAppreciation("http://www.shici.net" + appreciateUrl);
                    page.addTargetRequest("http://www.shici.net" + appreciateUrl);
                }
                articleMap.put(page.getUrl().toString(), article);
                page.setSkip(true);// poem page will be saved later
            }
        } else if (page.getUrl().regex(URL_TRANSLATION).match()) {
            Html html = page.getHtml();
            // Back-link to the poem this translation belongs to
            String articleUrl = "http://www.shici.net" + html
                    .xpath("//div[@class='relatedshici']/h2/a/@href")
                    .toString();
            System.out.println(articleUrl);
            String title = html.xpath("//div[@id='article']/h1/text()").toString();
            String translation = null;
            String comment = null;
            // The page either combines translation and notes, or carries one
            // of them. Guard against a null title (xpath miss) — the original
            // would NPE on title.endsWith(...).
            if (title != null && title.endsWith("译文及注释")) {
                translation = html
                        .xpath("//div[@id='article']/div[@class='content']")
                        .regex("<p><strong>译文</strong><br>(.*?)</p>")
                        .toString();
                comment = html
                        .xpath("//div[@id='article']/div[@class='content']")
                        .regex("<p><strong>注释</strong><br>(.*?)</p>")
                        .toString();
            } else if (title != null) {
                if (title.endsWith("译文")) {
                    translation = html
                            .xpath("//div[@id='article']")
                            .regex("<div class=\"content\">(.*?)</div>")
                            .toString();
                }
                if (title.endsWith("注释")) {
                    comment = html
                            .xpath("//div[@id='article']")
                            .regex("<div class=\"content\">(.*?)</div>")
                            .toString();
                }
            }
            System.out.println("注释:" + comment);
            System.out.println("翻译:" + translation);
            Article article = articleMap.get(articleUrl);
            if (article == null) {
                // No parked article for this back-link; the original code
                // would have thrown an NPE here.
                page.setSkip(true);
                return;
            }
            article.setTranslation(translation);
            article.setComment(comment);
            String appreciation = article.getAppreciation();
            if (appreciation != null && appreciation.startsWith("http")) {
                // Appreciation page still pending: keep waiting
                page.setSkip(true);
            } else {
                saveArticle(article, page);
                articleMap.remove(articleUrl);
            }
        } else if (page.getUrl().regex(URL_APPRECIATION).match()) {
            Html html = page.getHtml();
            // Back-link to the poem this appreciation belongs to
            String articleUrl = "http://www.shici.net" + html
                    .xpath("//div[@class='relatedshici']/h2/a/@href")
                    .toString();
            System.out.println(articleUrl);
            String title = html.xpath("//div[@id='article']/h1").toString();
            String appreciation = html
                    .xpath("//div[@id='article']")
                    .regex("<div class=\"content\">(.*?)</div>")
                    .toString();
            System.out.println("赏析:" + title + appreciation);
            Article article = articleMap.get(articleUrl);
            if (article == null) {
                // No parked article for this back-link (original would NPE)
                page.setSkip(true);
                return;
            }
            article.setAppreciation(title + appreciation);
            String translation = article.getTranslation();
            if (translation != null && translation.startsWith("http")) {
                // Translation page still pending: keep waiting
                page.setSkip(true);
            } else {
                saveArticle(article, page);
                articleMap.remove(articleUrl);
            }
        }
    }

    public Site getSite() {
        return site;
    }

    /** Seed URLs: one index page per dynasty. */
    private final static String[] INIT_URLS = {
            "http://www.shici.net/xianqin/",
            "http://www.shici.net/hanchao/",
            "http://www.shici.net/weijin/",
            "http://www.shici.net/nanbeichao/",
            "http://www.shici.net/suichao/",
            "http://www.shici.net/tangshi/",
            "http://www.shici.net/wudai/",
            "http://www.shici.net/songci/",
            "http://www.shici.net/jinchao/",
            "http://www.shici.net/yuanchao/",
            "http://www.shici.net/mingchao/",
            "http://www.shici.net/qingchao/",
    };

    public static void main(String[] args) {
        Spider.create(new ShiWenPageProcessor())
                .addUrl(INIT_URLS)
                .addPipeline(new JsonFilePipeline("/Users/liaoxuyi/Desktop/data"))
                .thread(5)
                .run();
        System.out.println("运行结束");
    }

}

运行程序后,生成的数据如下:

在这里插入图片描述

爬取好诗文网的数据

以各个朝代下各种诗文类型为初始链接(总共55个链接),爬取好诗文网中每条诗文的所属朝代、作者信息、原文、翻译、赏析,保存每条诗文数据为json文本。

package edu.nlp.processer;

import edu.nlp.model.Article;
import edu.nlp.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
//import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class HaoShiWenPageProcessor implements PageProcessor {

    /**
     * Start page: one per (dynasty c, type x) combination.
     **/
    private final static String URL_START = "http://www\\.haoshiwen\\.org/type\\.php\\?c=\\d+&x=[1-5]$";
    /**
     * Paginated list page.
     **/
    private final static String URL_LIST = "http://www\\.haoshiwen\\.org/type\\.php\\?c=\\d+&x=[1-5]&page=\\d+";
    /**
     * Poem detail page (site-relative path).
     **/
    private final static String URL_ARTICLE = "/view\\.php\\?id=\\d+";
    /**
     * Translation page.
     **/
    private final static String URL_TRANSLATION = "http://www\\.haoshiwen\\.org/show\\.php\\?t=2&id=\\d+";
    /**
     * Appreciation page.
     **/
    private final static String URL_APPRECIATION = "http://www\\.haoshiwen\\.org/show\\.php\\?t=1&id=\\d+";

    /**
     * Articles waiting for their translation/appreciation pages, keyed by the
     * site-relative poem URL. The spider runs with 5 threads, so a
     * thread-safe map is required (the original used a plain HashMap).
     **/
    private static Map<String, Article> articleMap =
            new java.util.concurrent.ConcurrentHashMap<String, Article>();
    /**
     * Type name per poem URL, filled while scanning the list pages.
     **/
    private static Map<String, String> articleType =
            new java.util.concurrent.ConcurrentHashMap<String, String>();

    /**
     * Builds the 55 seed URLs: 11 dynasties (c=1..11) x 5 types (x=1..5).
     **/
    private static String[] initUrls() {
        String[] urls = new String[55];
        int count = 0;
        for (int c = 1; c <= 11; c++) {
            for (int x = 1; x <= 5; x++) {
                urls[count++] = "http://www.haoshiwen.org/type.php?c=" + c + "&x=" + x;
            }
        }
        return urls;
    }

    /**
     * Maps the x=1..5 query parameter of a start/list link to a type name.
     *
     * @param url start or list link containing "c=&lt;n&gt;&amp;x=&lt;1-5&gt;"
     * @return the type name, or null when the URL carries no type number
     */
    private static String getType(Selectable url) {
        // The original regex was "c=\\d+&x=([1-5)])": the ')' inside the
        // character class was a typo. Also guard against a miss — the
        // original would NPE in Integer.parseInt(null).
        String typeStr = url.regex("c=\\d+&x=([1-5])").toString();
        if (typeStr == null) {
            return null;
        }
        switch (Integer.parseInt(typeStr)) {
            case 1:
                return "诗";
            case 2:
                return "词";
            case 3:
                return "曲";
            case 4:
                return "文言文";
            case 5:
                return "辞赋";
            default:
                return null;
        }
    }

    /**
     * Emits every field of a completed Article into the page results so the
     * pipeline can persist it as JSON.
     **/
    private void saveArticle(Article article, Page page) {
        page.putField("articleId", article.getArticleId());
        page.putField("type", article.getType());
        page.putField("dynasty", article.getDynasty());
        page.putField("author", article.getAuthor());
        page.putField("authorInfo", article.getAuthorInfo());
        page.putField("title", article.getTitle());
        page.putField("content", article.getContent());
        page.putField("translation", article.getTranslation());
        page.putField("comment", article.getComment());
        page.putField("appreciation", article.getAppreciation());
    }

    /**
     * Site config: retries, politeness delay, timeout and browser-like
     * headers. The original User-Agent contained a typo ("Mo zilla").
     **/
    private Site site = Site.me()
            .setCycleRetryTimes(3)// cycle-retry count
            .setRetryTimes(3)// download-retry count
            .setSleepTime(100)// delay between pages, ms
            .setTimeOut(3000)// request timeout, ms
            .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0")
            .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
            .addHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3")
            .setCharset("UTF-8");// page charset

    public Site getSite() {
        return site;
    }

    /**
     * Routes each fetched page by URL pattern: start page -> enumerate the
     * remaining list pages and queue poems; list page -> queue poems; poem
     * page -> extract fields and queue translation/appreciation pages;
     * translation or appreciation page -> complete the pending Article.
     **/
    public void process(Page page) {
        if (page.getUrl().regex(URL_START).match()) {
            // Total page count, taken from the "尾页" (last page) link
            String pageStr = page.getHtml()
                    .xpath("//div[@class='pages']")
                    .regex("/type.php\\?c=\\d+&amp;x=[1-5]&amp;page=(\\d+)\">尾页</a>")
                    .toString();
            if (pageStr != null) {
                int pageNum = Integer.parseInt(pageStr);
                List<String> pageUrl = new ArrayList<String>();
                // Queue pages 2..N of this list
                for (int i = 2; i <= pageNum; i++) {
                    pageUrl.add(page.getUrl() + "&page=" + i);
                }
                page.addTargetRequests(pageUrl);
            }
            // Poems listed on the first page
            List<String> articleUrl = page.getHtml()
                    .xpath("//div[@class='typeleft']/div[@class='sons']")
                    .regex(URL_ARTICLE)
                    .all();
            page.addTargetRequests(articleUrl);
            page.setSkip(true);// list pages are not persisted
            // Remember each poem's type for later
            String type = getType(page.getUrl());
            for (String url : articleUrl) {
                articleType.put(url, type);
            }
        }
        if (page.getUrl().regex(URL_LIST).match()) {
            // Poems listed on this page
            List<String> articleUrl = page.getHtml()
                    .xpath("//div[@class='typeleft']/div[@class='sons']")
                    .regex(URL_ARTICLE)
                    .all();
            page.addTargetRequests(articleUrl);
            page.setSkip(true);// list pages are not persisted
            // Remember each poem's type for later
            String type = getType(page.getUrl());
            for (String url : articleUrl) {
                articleType.put(url, type);
            }
        } else if (page.getUrl().regex(URL_ARTICLE).match()) {
            System.out.println("诗词:" + page.getUrl());
            // Site-relative poem URL, used as the key into the type/article maps
            String articleUrl = page.getUrl().toString().replace("http://www.haoshiwen.org", "");
            Html html = page.getHtml();
            Article article = new Article();
            // Poem id from the query string
            article.setArticleId(Integer.parseInt(articleUrl.replace("/view.php?id=", "")));
            // Type recorded while scanning the list pages
            article.setType(articleType.get(articleUrl));
            // Dynasty
            String dynasty = html
                    .xpath("//div[@class='son2']")
                    .regex("<span>朝代:</span>(.*?)</p>").toString();
            article.setDynasty(dynasty);
            // Author; strip the surrounding <a> tags. Guard against a regex
            // miss — the original chained replaceAll on a possibly-null value.
            String author = html
                    .xpath("//div[@class='son2']")
                    .regex("<span>作者:</span>(.*?)</p>")
                    .toString();
            if (author != null) {
                author = author.replaceAll("</?a.*?>", "");
            }
            article.setAuthor(author);
            // Constant-first, null-safe compare
            if (!"佚名".equals(author)) {
                // Author biography (anonymous works have none)
                String authorInfo = html
                        .regex("<div class=\"son5\" style=\"overflow:auto;\">" +
                                ".*<img.*></a>(.*)<a.*?>\\.▶</a>")
                        .toString();
                // The original used reference comparison (authorInfo != "0"),
                // which is always true; value comparison was intended.
                if (!"0".equals(authorInfo)) {
                    article.setAuthorInfo(authorInfo);
                }
            }
            // Title
            String title = html.xpath("div[@class='son1']/h1/text()")
                    .toString();
            article.setTitle(title);
            // Original text, delimited by the spacer paragraph and the
            // "精彩推荐" banner
            String content = html
                    .xpath("//div[@class='shileft']/div[@class='son2']")
                    .regex("<p style=\"margin\\-top:0px;\">\\&nbsp;</p>\\s+(.*?)<br>\\s+" +
                            "<strong><span style=\"color:#FFFFFF;background-color:#E53333;\">精彩推荐</span></strong>")
                    .toString();
            article.setContent(content);
            // Translation link, if any
            String translateUrl = html
                    .xpath("div[@class='son5']").links()
                    .regex(URL_TRANSLATION).toString();
            // Appreciation link, if any
            String appreciationUrl = html
                    .xpath("div[@class='son5']").links()
                    .regex(URL_APPRECIATION).toString();
            if (translateUrl == null && appreciationUrl == null) {
                // Nothing more to fetch: save immediately
                saveArticle(article, page);
            } else {
                // Park the article until its extra pages arrive; the URL
                // fields temporarily hold the pending links as markers.
                if (translateUrl != null) {
                    article.setTranslation(translateUrl);
                    page.addTargetRequest(translateUrl);
                }
                if (appreciationUrl != null) {
                    article.setAppreciation(appreciationUrl);
                    page.addTargetRequest(appreciationUrl);
                }
                articleMap.put(articleUrl, article);
                page.setSkip(true);// poem page will be saved later
            }
        } else if (page.getUrl().regex(URL_TRANSLATION).match()) {
            Html html = page.getHtml();
            // Back-link to the poem this translation belongs to
            String articleUrl = html
                    .xpath("//div[@class='sontitle']/span/a/@href")
                    .toString();
            // Heading tells us whether the page combines translation and notes
            String translationTitle = html
                    .xpath("//div[@class='shileft']/div[@class='son1']/h1/text()")
                    .toString();
            String translation = null;
            String comment = null;
            // Guard against a null heading — the original would NPE on endsWith
            if (translationTitle != null && translationTitle.endsWith("译文及注释")) {
                translation = html
                        .xpath("//div[@class='shangxicont']")
                        .regex("<p><strong>译文.*?</strong>(.*?)</p>")
                        .toString();
                if (translation != null)// strip link markup
                    translation = translation.replaceAll("</?a.*?>", "");
                comment = html
                        .xpath("//div[@class='shangxicont']")
                        .regex("<p><strong>注释.*?</strong>(.*?)</p>")
                        .toString();
                if (comment != null)
                    comment = comment.replaceAll("</?a.*?>", "");
                if (translation == null && comment == null) {
                    // Translation and notes are merged in one body
                    translation = html
                            .xpath("//div[@class='shangxicont']")
                            .regex("<p>\\s?作者[::]佚名\\s?</p>(.*?)<p style=")
                            .toString();
                    if (translation != null)
                        translation = translation.replaceAll("</?a.*?>", "");
                }
            } else if (translationTitle != null) {
                // Translation only
                if (translationTitle.endsWith("译文")) {
                    translation = html
                            .xpath("//div[@class='shangxicont']")
                            .regex("<p>\\s?作者[::]佚名\\s?</p>(.*?)<p style=")
                            .toString();
                    if (translation != null)// strip link markup
                        translation = translation.replaceAll("</?a.*?>", "");
                }
                // Notes only
                if (translationTitle.endsWith("注释")) {
                    comment = html
                            .xpath("//div[@class='shangxicont']")
                            .regex("<p>\\s?作者[::]佚名\\s?</p>(.*?)<p style=")
                            .toString();
                    if (comment != null) {
                        comment = comment.replaceAll("</?a.*?>", "");
                    }
                }
            }
            Article article = articleMap.get(articleUrl);
            if (article == null) {
                // No parked article for this back-link (original would NPE)
                page.setSkip(true);
                return;
            }
            article.setTranslation(translation);
            article.setComment(comment);
            String appreciation = article.getAppreciation();
            if (appreciation != null && appreciation.startsWith("http")) {
                // Appreciation page still pending: keep waiting
                page.setSkip(true);
            } else {
                saveArticle(article, page);
                articleMap.remove(articleUrl);
            }
        } else if (page.getUrl().regex(URL_APPRECIATION).match()) {
            Html html = page.getHtml();
            // Back-link to the poem this appreciation belongs to
            String articleUrl = html
                    .xpath("//div[@class='sontitle']/span/a/@href")
                    .toString();
            String appreciateTitle = html
                    .xpath("//div[@class='shileft']/div[@class='son1']/h1")
                    .toString();
            String appreciation = html
                    .xpath("//div[@class='shangxicont']")
                    .regex("<p.*>\\s?作者[::]佚名\\s?</p>(.*?)<p style=")
                    .toString();
            if (appreciation != null)// strip link markup
                appreciation = appreciation.replaceAll("</?a.*?>", "");
            Article article = articleMap.get(articleUrl);
            if (article == null) {
                // No parked article for this back-link (original would NPE)
                page.setSkip(true);
                return;
            }
            article.setAppreciation(appreciateTitle + appreciation);
            String translation = article.getTranslation();
            if (translation != null && translation.startsWith("http")) {
                // Translation page still pending: keep waiting
                page.setSkip(true);
            } else {
                saveArticle(article, page);
                articleMap.remove(articleUrl);
            }
        }
    }

    /**
     * @return total number of distinct poem URLs seen on list pages
     **/
    public static int articleCount() {
        return articleType.size();
    }

    public static void main(String[] args) {
        HaoShiWenPageProcessor processor = new HaoShiWenPageProcessor();
        Spider.create(processor)// the PageProcessor defined above
                .addUrl(initUrls())// 55 seed URLs
                // Custom JsonFilePipeline names files after the poem id instead
                // of the default MD5-of-URL, so re-downloads do not create
                // duplicate files.
                .addPipeline(new JsonFilePipeline("/Users/liaoxuyi/Desktop/data"))
                .thread(5)// worker threads
                .run();// start crawling
        System.out.println("诗词总数有:" + HaoShiWenPageProcessor.articleCount());//75604
        System.out.println("运行结束");
    }

}

自定义JsonFilePipeline保存json数据

由于webmagic默认的JsonFilePipeline生成的json文件是以链接URL的MD5值来命名的,生成的json文件无法从文件名上和网站的诗文链接进行一一对应,所以这里自定义JsonFilePipeline,设置保存的json文件名为诗文的ID,方便查找原始的诗文内容。

package edu.nlp.pipeline;

import com.alibaba.fastjson.JSON;

import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;

import org.apache.commons.codec.digest.DigestUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.utils.FilePersistentBase;

public class JsonFilePipeline extends FilePersistentBase implements Pipeline {
    private Logger logger = LoggerFactory.getLogger(this.getClass());

    /** Creates a pipeline writing to the default path {@code /data/webmagic}. */
    public JsonFilePipeline() {
        this.setPath("/data/webmagic");
    }

    /** @param path directory the JSON files are written to */
    public JsonFilePipeline(String path) {
        this.setPath(path);
    }

    /**
     * Serializes the scraped fields to JSON and writes them to
     * {@code <path>/<uuid>/<articleId>.json}. Naming the file after the
     * article id (rather than webmagic's default MD5 of the URL) keeps file
     * names stable and readable.
     */
    public void process(ResultItems resultItems, Task task) {
        String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
        // try-with-resources: the original closed the writer only on the
        // success path and leaked it when write() threw.
        try (PrintWriter printWriter = new PrintWriter(
                new FileWriter(this.getFile(path + resultItems.get("articleId") + ".json")))) {
            printWriter.write(JSON.toJSONString(resultItems.getAll()));
        } catch (IOException e) {
            this.logger.warn("write file error", e);
        }
    }
}

运行程序后,生成的数据如下:

在这里插入图片描述

猜你喜欢

转载自blog.csdn.net/Fighting_No1/article/details/85014317
今日推荐