Java Web Crawler: Implementing a Web Crawler with webmagic in Spring Boot

I. Requirements

Our business needed a news feed feature. The initial plan was to build it on top of a third-party service API, but after those negotiations fell through we decided to develop our own crawler instead. After reviewing the available documentation, we chose the open-source webmagic framework to implement the crawler.

II. Implementation

1. Add the dependencies

Add the following dependencies to the pom file:

<!-- crawler -->
<dependency>
	<groupId>us.codecraft</groupId>
	<artifactId>webmagic-core</artifactId>
	<version>0.7.3</version>
</dependency>
<dependency>
	<groupId>us.codecraft</groupId>
	<artifactId>webmagic-extension</artifactId>
	<version>0.7.3</version>
</dependency>
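Note: depending on your Spring Boot version and logging setup, webmagic's transitive SLF4J/log4j binding can clash with Spring Boot's default Logback binding and trigger a "multiple SLF4J bindings" warning at startup; if that happens, excluding org.slf4j:slf4j-log4j12 from the webmagic dependencies above resolves it.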

2. Create the processor

Create the implementation class; the code is as follows (for reference only):

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import team.biteeny.admin.db.write.cache.ConfigMapper;
import team.biteeny.admin.db.write.mapper.CrawlMapper;
import team.biteeny.admin.db.write.model.CrawlModel;
import team.biteeny.push.getui.PushApp;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Json;

import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

@Component
public class HuobiInfoProcessor implements PageProcessor {

    @Autowired
    private CrawlMapper crawlMapper;

    @Autowired
    private ConfigMapper configMapper;

    private Site site;

    private static final Map<String, String> map = new ConcurrentHashMap<>();

    @Override
    public void process(Page page) {
        if (page.getUrl().toString().contains("flash")){
            insertFlash(page);
        }
        if (page.getUrl().toString().contains("article")){
            List<String> urlList = new ArrayList<>();
            Json json = page.getJson();
            JSONObject jsonObject = JSONObject.parseObject(json.toString());
            JSONArray jsonArray = jsonObject.getJSONObject("data").getJSONArray("data");
            for (Object o : jsonArray) {
                JSONObject object = JSONObject.parseObject(JSONObject.toJSONString(o));
                String key = "baseDetail_" + object.getString("id");
                urlList.add("https://www.huobiinfo.com/news/"+key);
                map.put(key + "_listPicturePath",object.getString("listPicturePath"));
                map.put(key + "_title",object.getString("title"));
            }
            page.addTargetRequests(urlList);
        }
        if (page.getUrl().toString().contains("news/baseDetail_")){
            insertNews(page);
        }
    }

    @Override
    public Site getSite() {
        if (site==null){
            site= Site.me().setDomain("www.huobiinfo.com")
                    .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36")
                    .setCharset("UTF-8")
                    .setSleepTime(500);
        }
        return site;
    }

    public static void main(String[] args) {
        // Ad-hoc local test code, kept commented out for reference:
//        Spider.create(new HuobiInfoProcessor()).addUrl("https://www.huobiinfo.com/flash/").runAsync();
//        Request request = new Request("https://huobi-news-app-gateway-outer.huobi.cn:8005/article/listPagedArticleListByParam");
//        request.setMethod(HttpConstant.Method.POST);
//        request.setRequestBody(HttpRequestBody.json("{\"pageSize\":10,\"pageNum\":1,\"categoryPcId\":15}","utf-8"));
//        Spider.create(new HuobiInfoProcessor()).addRequest(request).runAsync();
//        String title = "BTC链上基础指标略有回暖,链上场内场外交易均较活跃";
//        String c = "根据Searchain.io数据分析:昨日BTC从4100下降到3900点。从链上指标来看,昨日反映BTC内部价值的基础指标整体有所上升,新增地址上升14.89%,活跃地址上升12.20%。从链上交易指标来看,交易用户的活跃度也在上升,交易所流入增加49.16%,流出增加40.78%;链上大额转账的活跃程度集中在100-600 BTC区间,600+ BTC的转账有所下降,大额流入交易所占比有所上升,场内场外均比较活跃。综合链上基础和交易指标来看,近期BTC内部价值略有回暖,链上场内场外交易均活跃。独立分析师Edward对近期BTC市场呈较为悲观状态。\n" +
//                "只有币名和百分比,没有价格波动词,所以不符合推送条件";
//        boolean b = checkPush(title+c);
//        System.out.println(b);

    }

    // Parses the flash (newsflash) list page and persists items that have not been seen before.
    private void insertFlash(Page page){
        Elements elements = page.getHtml().getDocument().getElementsByClass("item-flash");
        for (Element element : elements) {
            Html html = new Html(element.toString());
            String s = html.xpath("//div[@class='item-flash']//h3[@class='med']//nuxt-link/@to").toString();
            String key = s.substring(1, s.lastIndexOf("/")).replace("/", "_");
            if (crawlMapper.checkExist(key) <= 0){
                String title = html.xpath("//div[@class='item-flash']//h3[@class='med']//nuxt-link/text()").toString();
                String content = html.xpath("//div[@class='item-flash']//div[@class='content']/text()").toString();
                CrawlModel model = new CrawlModel();
                boolean b = checkPush(title + content);
                model.setId(key);
                model.setBody(content);
                model.setTitle(title);
                model.setSource("HuobiInfo");
                model.setType("flash");
                if (b){
                    model.setIs_push(true);
                    push(title,content);
                }else {
                    model.setIs_push(false);
                }
                model.setCreate_time(new Date());
                crawlMapper.crawlInsert(model);
            }
        }
    }

    // Parses an article detail page and persists it if it is new and does not embed WeChat-hosted images.
    private void insertNews(Page page){
        String path = page.getUrl().toString();
        String key = path.substring(path.lastIndexOf("/") + 1);
        if (crawlMapper.checkExist(key) <= 0) {
            String source = "<div><p>来源:" + page.getHtml().xpath("//div[@class='detail-platform-msg']//p[@class='detail-platform']/text()").toString()+"</p></div>";
            String notice = "<div><em><span style='font-size: 12px;'>" +
                    page.getHtml().xpath("//div[@class='detail-source']/text()") +
                    "</span></em></div>";
            String article = page.getHtml().xpath("//div[@class='detail-content article-content hb-article']").toString();
            String content = source + notice + article;
            if (!checkDomain(article)){
                CrawlModel model = new CrawlModel();
                model.setId(key);
            model.setTitle(map.get(key + "_title"));
                model.setBody(content);
            model.setList_picture(map.get(key + "_listPicturePath"));
                model.setSource("HuobiInfo");
                model.setType("news");
                model.setCreate_time(new Date());
                crawlMapper.crawlInsert(model);
            }
        }
    }

    /**
     * Decides whether a piece of text should trigger a push notification:
     * a coin name plus a large-transfer/flow keyword, an exchange keyword,
     * or a coin name plus a price-movement keyword with a change greater than 5%.
     */
    private static boolean checkPush(String str){
        if (str == null){
            return false;
        }
        String regex = "btc|eth|bch|ltc|etc|eos|xrp|dash|trx";   // coin names
        String regex1 = "涨|跌|涨幅|跌幅|上涨|下跌";                // price-movement words
        String regexF = "大额转账|净流入|净流出";                   // large transfers / net flows
        String regexH = "okex|火币|币安|比特大陆";                  // exchanges and major players
        String regex2 = "\\d+(\\.?\\d*?)(?=%)";                  // number immediately before a '%' sign
        // Evaluate each keyword group once; calling find() twice on the same Matcher
        // would continue from the previous match and incorrectly skip earlier occurrences.
        boolean hasCoin = Pattern.compile(regex, Pattern.CASE_INSENSITIVE).matcher(str).find();
        boolean hasMove = Pattern.compile(regex1).matcher(str).find();
        boolean hasFlow = Pattern.compile(regexF).matcher(str).find();
        boolean hasExchange = Pattern.compile(regexH).matcher(str).find();
        if (hasCoin && hasFlow){
            return true;
        }
        if (hasExchange){
            return true;
        }
        if (hasCoin && hasMove){
            Matcher percentMatcher = Pattern.compile(regex2).matcher(str);
            while (percentMatcher.find()){
                double d = Double.parseDouble(percentMatcher.group());
                if (d > 5){
                    return true;
                }
            }
        }
        return false;
    }

    private void push(String title,String text){
        // Push-related logic (omitted here), e.g. only push during certain hours:
//        int hour = Calendar.getInstance().get(Calendar.HOUR);
//        if (hour >= 8 && hour <= 22){
//        }
    }

    /**
     * Returns true when the article content references images hosted on
     * mmbiz.qpic.cn (the WeChat article image CDN); such articles are skipped in insertNews.
     */
    private boolean checkDomain(String content){
        if (content == null){
            return false;
        }
        Pattern p = Pattern.compile("mmbiz\\.qpic\\.cn");
        return p.matcher(content).find();
    }
}

The above is a simple working example that includes some filtering logic, data-persistence logic, and so on.
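The post does not show how the Spider itself is started inside Spring Boot, so here is a minimal sketch of one way to do it. It reuses the start URL and the POST request that appear in the commented-out main method above; the CrawlerScheduler class name and the ten-minute interval are invented for illustration, and @EnableScheduling is assumed to be present on a configuration class.

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.model.HttpRequestBody;
import us.codecraft.webmagic.utils.HttpConstant;

@Component
public class CrawlerScheduler {

    @Autowired
    private HuobiInfoProcessor huobiInfoProcessor;

    // Run a crawl every 10 minutes.
    @Scheduled(fixedDelay = 10 * 60 * 1000)
    public void crawl() {
        // POST request for the paged article list (same endpoint as in the commented-out main method).
        Request articleList = new Request("https://huobi-news-app-gateway-outer.huobi.cn:8005/article/listPagedArticleListByParam");
        articleList.setMethod(HttpConstant.Method.POST);
        articleList.setRequestBody(HttpRequestBody.json("{\"pageSize\":10,\"pageNum\":1,\"categoryPcId\":15}", "utf-8"));

        Spider.create(huobiInfoProcessor)
                .addUrl("https://www.huobiinfo.com/flash/")   // flash list page
                .addRequest(articleList)                      // article list API
                .thread(2)
                .run();                                       // or runAsync() to avoid blocking the scheduler thread
    }
}

Because HuobiInfoProcessor is a Spring bean with @Autowired mappers, it is injected here rather than created with new (as in the commented-out main method); otherwise crawlMapper and configMapper would be null when process() runs.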

III. Summary

The above is a simple example of implementing a web crawler in Java with webmagic. The framework is not used to its full potential here; it simply satisfies the current business requirement, and I have not studied webmagic in depth or described it in any detail. Interested readers can consult the relevant documentation, and everyone is welcome to join in and discuss.

My abilities are limited, so there are bound to be mistakes; criticism and corrections are welcome. Keep grinding away at the tech!
