Jsoup抓取图片

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/u012848709/article/details/84798282

楔子

用 jsoup 抓取图片,其实就是先解析页面、找出其中的图片地址,再把这些图片逐个下载下来。

demo

import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.Connection.Response;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Demo crawler: walks the listing pages under {@link #mainUrl}, opens every
 * album linked from them and saves each album's pictures below
 * {@code c:/piccc/page_N/<album title>/}.
 *
 * <p>HTML is fetched and parsed with jsoup; files are written with commons-io.
 * A failure in one album is reported and the crawl continues with the next one.
 *
 * @author grq
 * @version created 2018-12-02 21:24:03
 */
public class DownPic {
	/** URL of the first listing page; page N (N &gt;= 2) is {@code mainUrl + N + ".html"}. */
	static String mainUrl = "https://www.meitulu.com/t/toutiaonvshen/";

	/** Root directory every downloaded picture is stored under. */
	private static final File SAVE_DIR = new File("c:/piccc");

	/** Referrer header sent with every picture request. */
	private static final String REFERRER = "https://www.meitulu.com/";

	/** Suffix shared by listing pages and album pages. */
	private static final String HTML_SUFFIX = ".html";

	/** Number of the last listing page that is crawled (pages 1..9). */
	private static final int LAST_PAGE = 9;

	/** Pictures per album page that the paging arithmetic assumes. */
	private static final int PICS_PER_PAGE = 4;

	/**
	 * Kept so existing references still compile; it previously pointed at
	 * "c://picc", a directory nothing was ever written to. It now refers to the
	 * directory the downloads actually go to.
	 */
	File saveFile = SAVE_DIR;

	/**
	 * Crawls listing pages 1 to {@value #LAST_PAGE}.
	 *
	 * @param args unused
	 * @throws IOException declared for compatibility; page and album failures
	 *                     are handled inside {@link #parseMinPage}
	 */
	public static void main(String[] args) throws IOException {
		for (int page = 1; page <= LAST_PAGE; page++) {
			// The first listing page has no number in its URL.
			String pageUrl = (page == 1) ? mainUrl : mainUrl + page + HTML_SUFFIX;
			parseMinPage(pageUrl, "page_" + page);
		}
	}

	/**
	 * Downloads every album linked from one listing page.
	 *
	 * @param url     URL of the listing page
	 * @param pageNum directory name used for this listing page, e.g. {@code page_1}
	 */
	private static void parseMinPage(String url, String pageNum) {
		Elements items;
		try {
			// Pick the <li> entries straight from the parsed document; this keeps the
			// base URI (needed by absUrl) and avoids serialising and re-parsing the HTML.
			items = Jsoup.connect(url).get().getElementsByClass("img").select("li");
		} catch (IOException e) {
			System.out.println("主页连接超时");
			e.printStackTrace();
			return;
		}

		for (Element item : items) {
			// Expected layout: child 0 is the album link, child 1 holds the picture count.
			if (item.children().size() < 2) {
				continue;
			}
			Element link = item.child(0);
			String picURL = link.absUrl("href");
			if (picURL.isEmpty()) {
				continue;
			}

			// The album title is the alt text of the cover image inside the link.
			Elements covers = link.getElementsByTag("img");
			String attr = covers.isEmpty() ? "" : covers.get(0).attr("alt");
			String albumDir = toSafeName(attr.replace(" ", ""));
			if (albumDir.isEmpty()) {
				albumDir = toSafeName(FilenameUtils.getBaseName(picURL));
			}

			// Picture count: read the whole number instead of a fixed two characters,
			// so counts of 100 or more are not truncated.
			String picCount = firstNumber(item.child(1).text());

			try {
				downDetail(picURL, pageNum + "/" + albumDir, picCount);
				System.out.println("down pic is " + attr + "  地址是:" + picURL);
			} catch (IOException e) {
				// One broken album must not stop the remaining albums of this page.
				System.out.println("album download failed: " + picURL);
				e.printStackTrace();
			}
		}
	}

	/**
	 * Downloads all pictures of one album by visiting each of its pages.
	 *
	 * @param picURL   URL of the album's first page; must end with {@code .html}
	 * @param title    directory (relative to the save root) the pictures go into
	 * @param picCount number of pictures in the album, as decimal text
	 * @throws IOException if the URL or count is unusable, or a page or picture
	 *                     cannot be fetched or written
	 */
	private static void downDetail(String picURL, String title, String picCount) throws IOException {
		if (!picURL.endsWith(HTML_SUFFIX)) {
			throw new IOException("unexpected album URL (no .html suffix): " + picURL);
		}
		int total;
		try {
			total = Integer.parseInt(picCount.trim());
		} catch (NumberFormatException e) {
			throw new IOException("cannot read picture count \"" + picCount + "\" for " + picURL, e);
		}

		String baseUrl = picURL.substring(0, picURL.length() - HTML_SUFFIX.length());
		// Round up: a partially filled last page still has to be visited.
		int pageCount = (total + PICS_PER_PAGE - 1) / PICS_PER_PAGE;
		for (int i = 1; i <= pageCount; i++) {
			// Page 1 is "<base>.html", later pages are "<base>_<n>.html".
			String detailUrl = (i == 1) ? baseUrl + HTML_SUFFIX : baseUrl + "_" + i + HTML_SUFFIX;
			Elements pics = Jsoup.connect(detailUrl).get().getElementsByClass("content");
			if (pics.isEmpty()) {
				continue;
			}
			for (Element ele : pics.get(0).getElementsByTag("img")) {
				String src = ele.absUrl("src");
				if (!src.isEmpty()) {
					downPic(src, title);
				}
			}
		}
	}

	/**
	 * Downloads a single picture into {@code <save root>/<title>/<file name>}.
	 *
	 * @param picUrl absolute URL of the picture
	 * @param title  directory (relative to the save root) the picture goes into
	 * @throws IOException if the picture cannot be fetched or written
	 */
	private static void downPic(String picUrl, String title) throws IOException {
		String name = toSafeName(FilenameUtils.getName(picUrl));
		if (name.isEmpty()) {
			System.out.println("skip picture without file name: " + picUrl);
			return;
		}
		Connection connect = Jsoup.connect(picUrl);
		Response execute = connect.referrer(REFERRER).ignoreContentType(true).execute();
		BufferedInputStream bodyStream = execute.bodyStream();
		// copyInputStreamToFile creates missing parent directories and closes the stream.
		FileUtils.copyInputStreamToFile(bodyStream, new File(new File(SAVE_DIR, title), name));
	}

	/**
	 * Returns the first run of ASCII digits in {@code text}, or an empty string
	 * when the text contains no digit.
	 */
	private static String firstNumber(String text) {
		int start = 0;
		while (start < text.length() && !isAsciiDigit(text.charAt(start))) {
			start++;
		}
		int end = start;
		while (end < text.length() && isAsciiDigit(text.charAt(end))) {
			end++;
		}
		return text.substring(start, end);
	}

	/** Tells whether {@code c} is one of the characters '0' to '9'. */
	private static boolean isAsciiDigit(char c) {
		return c >= '0' && c <= '9';
	}

	/** Replaces characters that are not allowed in Windows file names with '_'. */
	private static String toSafeName(String name) {
		return name.replaceAll("[\\\\/:*?\"<>|]", "_").trim();
	}
}

pom

<!-- Apache Commons IO: provides the FileUtils and FilenameUtils classes imported by DownPic. -->
<dependency>
	<groupId>commons-io</groupId>
	<artifactId>commons-io</artifactId>
	<version>2.4</version>
</dependency>
<!-- jsoup: provides the Jsoup, Connection, Document, Element and Elements classes imported by DownPic. -->
<!-- NOTE(review): ${jsoup.version} is a Maven property that is not defined in this snippet; -->
<!-- it has to be declared under <properties> in the pom. DownPic calls Response.bodyStream(), -->
<!-- which presumably requires jsoup 1.11.1 or newer (confirm against the jsoup changelog). -->
	<dependency>
	<groupId>org.jsoup</groupId>
	<artifactId>jsoup</artifactId>
	<version>${jsoup.version}</version>
</dependency>

[图片:原文此处插入了一张图片,文本中未保留]

猜你喜欢

转载自blog.csdn.net/u012848709/article/details/84798282