Java + Jsoup: Crawling Anime Girl Images and Downloading Them to Local Storage (Complete Code)

Introduction

This is a Java project based on Jsoup that crawls images from a web page and downloads them to the local disk.

The complete project is available at https://github.com/AsajuHuishi/CrawlByJsoup.
A packaged executable is provided as getImageByPixivPainterId.exe.

Environment

  • JDK 1.8
  • IntelliJ IDEA 2020
  • Jsoup 1.13.1 (Maven coordinates below)
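
If the build is managed with Maven (the repository itself may simply bundle the jar instead), the coordinates for this Jsoup version are the standard ones on Maven Central:

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.13.1</version>
</dependency>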

Directory Structure

├─saveImage
│  ├─喵咕君QAQ(KH3)
│  └─小逝lullaby
└─src
    └─indi
        └─huishi
            ├─service   // Crawl.java
            └─utils     // CrawlUtils.java

Results

Server side:

![screenshot](https://img-blog.csdnimg.cn/20210409170303120.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3FxXzM2OTM3Njg0,size_16,color_FFFFFF,t_70)

Client side:

![screenshot](https://img-blog.csdnimg.cn/20210409170442230.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3FxXzM2OTM3Njg0,size_16,color_FFFFFF,t_70)

The Crawl Class

package indi.huishi.service;

import indi.huishi.utils.CrawlUtils;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.util.List;
import java.util.Map;

public class Crawl {

    public static final String Proto = "https://";
    public static final String BasicUrl = "www.huashi6.com";
    static final String ImageUrlCssQuery = ".p-painter-detail .painter-page-body .hot-work-list .hot-work-page .cover-img .img-vec,.img-hor";
    static final String ImageTitleCssQuery = ".p-painter-detail .painter-page-body .hot-work-list .hot-work-page .work-info .name";
    static final String PainterHomePageCssQuery = ".search-container .container .painter .classify-painter .c-painter-item .painter .painter-info";
    static final String PainterNameCssQuery = ".search-container .container .painter .classify-painter .c-painter-item .painter .painter-name";
    static final String BaseFolder = "crawler//saveImage//";
    static String painterName = "未命名画师";   // default folder name ("unnamed painter"), used if the lookup fails
    static String suffix = ".png";

    public static void main(String[] args) throws IOException {
        // Read the painter id (search keyword) from user input
        String painterId = CrawlUtils.getPainterId();
        StringBuffer buffer = new StringBuffer(Proto + BasicUrl + "/search?searchText=");
        // 1. Connect to the search URL and parse it into a Document
        Document document = null;
        try {
            document = CrawlUtils.getConnection(buffer.append(painterId).toString());
        } catch (Exception e) {
            e.printStackTrace();
        }
        // 2. Locate the painter's home page URL in the search results and follow it
        String painterHomePageUrl = null;
        try {
            painterHomePageUrl = CrawlUtils.getPainterHomePageURL(document, PainterHomePageCssQuery, "a", "href");
        } catch (Exception e) {
            e.printStackTrace();
        }
        // Get the painter's name
        try {
            painterName = CrawlUtils.getPainterName(document, PainterNameCssQuery, "span");
        } catch (Exception e) {
            e.printStackTrace();
        }
        // Connect to the painter's home page and parse it into a Document
        Document documentPainterHomePage = null;
        try {
            documentPainterHomePage = CrawlUtils.getConnection(Proto + BasicUrl + painterHomePageUrl);
        } catch (Exception e) {
            e.printStackTrace();
        }
        // 3. Collect the image information: image URLs and image titles
        List<String> imgURLList = CrawlUtils.getInfoList(documentPainterHomePage, ImageUrlCssQuery, "img", "src");
        List<String> imgTitleList = CrawlUtils.getInfoList(documentPainterHomePage, ImageTitleCssQuery, "div", "title");
        // Pair titles with URLs in a map
        Map<String, String> map = CrawlUtils.listsToMap(imgTitleList, imgURLList);

        // 4. Download the images into a folder named after the painter
        try {
            CrawlUtils.downloadImages(map, BaseFolder, painterName, suffix);
        } catch (Exception e) {
            throw new RuntimeException("Download failed", e);
        }
    }
}
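
The CrawlUtils Class (Sketch)

Crawl delegates all of the actual work to CrawlUtils, which lives in the utils package of the repository and is not reproduced in this post. For orientation only, the sketch below shows what those helpers might look like when written directly against the Jsoup API; the method names and parameters follow the calls made in Crawl, but the bodies are a plausible reconstruction, not the repository code.

package indi.huishi.utils;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class CrawlUtils {

    // Read the painter id (search keyword) from standard input
    public static String getPainterId() throws IOException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
        System.out.print("painter id: ");
        return reader.readLine().trim();
    }

    // Fetch a URL and parse the response into a Document
    public static Document getConnection(String url) throws IOException {
        return Jsoup.connect(url)
                .userAgent("Mozilla/5.0")
                .timeout(10_000)
                .get();
    }

    // First element matching the CSS query, then the attribute of the given child tag (e.g. the href of an <a>)
    public static String getPainterHomePageURL(Document doc, String cssQuery, String tag, String attr) {
        Element first = doc.select(cssQuery).first();
        return first.selectFirst(tag).attr(attr);
    }

    // Painter name: text of the given child tag under the first match
    public static String getPainterName(Document doc, String cssQuery, String tag) {
        return doc.select(cssQuery).first().selectFirst(tag).text();
    }

    // Collect one attribute value per matched element (e.g. img src, div title);
    // "abs:" resolves relative src attributes to absolute URLs
    public static List<String> getInfoList(Document doc, String cssQuery, String tag, String attr) {
        List<String> result = new ArrayList<>();
        Elements elements = doc.select(cssQuery);
        for (Element e : elements) {
            Element target = tag.equals(e.tagName()) ? e : e.selectFirst(tag);
            if (target != null) {
                result.add(target.attr("src".equals(attr) ? "abs:" + attr : attr));
            }
        }
        return result;
    }

    // Zip two lists of equal length into an ordered title -> URL map
    public static Map<String, String> listsToMap(List<String> keys, List<String> values) {
        Map<String, String> map = new LinkedHashMap<>();
        for (int i = 0; i < Math.min(keys.size(), values.size()); i++) {
            map.put(keys.get(i), values.get(i));
        }
        return map;
    }

    // Download every image in the map into baseFolder/painterName/
    public static void downloadImages(Map<String, String> map, String baseFolder,
                                      String painterName, String suffix) throws IOException {
        Files.createDirectories(Paths.get(baseFolder, painterName));
        for (Map.Entry<String, String> entry : map.entrySet()) {
            Connection.Response response = Jsoup.connect(entry.getValue())
                    .ignoreContentType(true)     // accept non-HTML responses (raw image bytes)
                    .maxBodySize(0)              // no limit on the response body size
                    .execute();
            try (FileOutputStream out = new FileOutputStream(
                    Paths.get(baseFolder, painterName, entry.getKey() + suffix).toFile())) {
                out.write(response.bodyAsBytes());
            }
        }
    }
}

The real implementation may well differ, for example in how it sanitizes image titles into legal file names or sets request headers; see the GitHub repository linked above for the authoritative version.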

Reprinted from: blog.csdn.net/qq_36937684/article/details/115579720