下载阿里巴巴商品详情页的主图和详情图(Java版本)

上次写了如何抓取微信公众号的文章,这次写一个抓取阿里巴巴(1688)商品详情页图片的程序。废话不多说,下面直接贴代码:

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Downloads the main (gallery) images and the description images of an
 * Alibaba (1688.com) product detail page.
 *
 * <p>Typical use: construct an instance with a download directory, call
 * {@link #setBaseUrl(String)} with the product page URL, then either call
 * {@link #spiderImgsDownLoad()} to download on the current thread or
 * {@link #startThread()} to download on a new thread.
 *
 * @author yuxuan
 */
public class AlibabaDetails implements Runnable {

	/** Connect timeout for every HTTP request, in milliseconds. */
	private static final int CONNECT_TIMEOUT_MS = 3 * 1000;
	/** Read timeout for every HTTP request, so a stalled server cannot hang the download forever. */
	private static final int READ_TIMEOUT_MS = 10 * 1000;

	// URL of the product detail page to scrape; must be set before downloading
	private String baseUrl;
	// Root directory under which all images are saved
	private final String downloadDir;

	/**
	 * Creates a downloader that saves into {@code <user.home>/aliimg}.
	 */
	public AlibabaDetails() {
		// File.separator instead of a hard-coded backslash so the default also works off Windows
		this(System.getProperty("user.home") + File.separator + "aliimg");
	}

	/**
	 * Creates a downloader that saves into the given directory.
	 * The directory is created if it does not exist yet.
	 *
	 * @param downloadDir root directory for downloaded images
	 */
	public AlibabaDetails(String downloadDir) {
		this.downloadDir = downloadDir;
		mkdir(this.downloadDir);
	}

	/**
	 * Creates the directory (including missing parents) and prints its absolute path.
	 *
	 * @param downloadDir directory to create
	 */
	private void mkdir(String downloadDir) {
		File file = new File(downloadDir);
		if (!file.isDirectory() && !file.mkdirs()) {
			System.err.println("could not create dir : " + file.getAbsolutePath());
		}
		System.out.println("work dir : " + file.getAbsolutePath());
	}

	/**
	 * Sets the product detail page URL to scrape.
	 *
	 * @param baseUrl product page URL
	 */
	public void setBaseUrl(String baseUrl) {
		this.baseUrl = baseUrl;
	}

	/**
	 * Fetches {@link #baseUrl}, decodes the response as GBK and parses it with jsoup.
	 *
	 * @return the parsed document, or {@code null} if the URL is invalid or the
	 *         request fails (the stack trace is printed in that case)
	 */
	private Document execute() {
		try {
			URL url = new URL(baseUrl);
			URLConnection conn = url.openConnection();
			conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
			conn.setReadTimeout(READ_TIMEOUT_MS);
			conn.setDoInput(true);

			StringBuilder sb = new StringBuilder();
			// try-with-resources closes the reader (and the underlying stream) even on failure
			try (BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(), "GBK"))) {
				String line;
				while ((line = br.readLine()) != null) {
					// readLine() strips the line terminator; put one back so tokens
					// split across lines are not glued together before parsing
					sb.append(line).append('\n');
				}
			}
			return Jsoup.parse(sb.toString());
		} catch (IOException e) {
			// MalformedURLException is an IOException, so a bad baseUrl ends up here too
			e.printStackTrace();
		}
		return null;
	}

	/**
	 * Downloads the main images and the description images of the page at
	 * {@link #baseUrl} on the calling thread. Images are stored below
	 * {@code <downloadDir>/<product id>/}, where the product id is the page's
	 * file name without extension.
	 */
	public void spiderImgsDownLoad() {
		Document root = execute();
		if (root == null) {
			System.out.println("root is null");
			return;
		}

		// Sub-directory named after the page, e.g. ".../offer/587734725047.html?x=y" -> "587734725047"
		String urlDir = pageNameOf(baseUrl);

		// Main (gallery) images
		productMainImg(root, urlDir);
		// Description images
		productDetailImg(root, urlDir);
	}

	/**
	 * Returns the URL with any query string and fragment removed.
	 */
	private static String stripQueryAndFragment(String url) {
		int end = url.length();
		int query = url.indexOf('?');
		if (query >= 0) {
			end = query;
		}
		int fragment = url.indexOf('#');
		if (fragment >= 0 && fragment < end) {
			end = fragment;
		}
		return url.substring(0, end);
	}

	/**
	 * Returns the last path segment of the URL without query string and without
	 * extension. Works for URLs that have no query string or no extension.
	 */
	private static String pageNameOf(String url) {
		String path = stripQueryAndFragment(url);
		String name = path.substring(path.lastIndexOf('/') + 1);
		int dot = name.indexOf('.');
		return dot >= 0 ? name.substring(0, dot) : name;
	}

	/**
	 * Downloads every {@code <img>} with a non-empty {@code src} found inside the
	 * element with id {@code mod-detail-description}.
	 *
	 * @param root   parsed page
	 * @param urlDir per-product sub-directory name
	 */
	private void productDetailImg(Element root, String urlDir) {
		Element mod = root.getElementById("mod-detail-description");
		if (mod == null) {
			// getElementById returns null when the id is absent; nothing to download then
			System.out.println("mod-detail-description not found");
			return;
		}
		for (Element img : mod.getElementsByTag("img")) {
			String url = img.attr("src");
			if (url == null || url.isEmpty()) {
				continue;
			}
			downLoadFromUrl(url, getSuffixFileName(url), downloadDir, urlDir + File.separator + "详情图");
		}
	}

	/**
	 * Downloads the main images: every {@code <img>} inside an element of class
	 * {@code nav-tabs} whose {@code src} contains {@code 60x60}. The
	 * {@code 60x60} part of the URL is replaced by {@code 400x400} before downloading.
	 *
	 * @param root   parsed page
	 * @param urlDir per-product sub-directory name
	 */
	private void productMainImg(Element root, String urlDir) {
		for (Element ele : root.getElementsByClass("nav-tabs")) {
			for (Element img : ele.getElementsByTag("img")) {
				String url = img.attr("src");
				if (url.contains("60x60")) {
					url = url.replace("60x60", "400x400");
					downLoadFromUrl(url, getSuffixFileName(url), downloadDir, urlDir + File.separator + "主图");
				}
			}
		}
	}

	/**
	 * Returns the file name part of an image URL, i.e. the text after the last
	 * {@code /} of the path. A query string or fragment is dropped first so it
	 * does not end up in the file name.
	 */
	private String getSuffixFileName(String url) {
		String path = stripQueryAndFragment(url);
		return path.substring(path.lastIndexOf('/') + 1);
	}

	/**
	 * Downloads one resource and saves it as {@code <savePath>/<dir>/<fileName>}.
	 * The target directory is created if needed. Failures are not propagated:
	 * the stack trace and the offending URL are printed and the method returns.
	 *
	 * @param urlStr   URL to download; a protocol-relative URL ({@code //host/...}) is fetched over https
	 * @param fileName name of the file to write
	 * @param savePath root download directory
	 * @param dir      sub-directory below {@code savePath}
	 */
	public static void downLoadFromUrl(String urlStr, String fileName, String savePath, String dir) {
		try {
			System.out.println(urlStr);
			// new URL("//host/...") throws MalformedURLException (no protocol), so supply one
			String absoluteUrl = urlStr.startsWith("//") ? "https:" + urlStr : urlStr;
			URL url = new URL(absoluteUrl);
			HttpURLConnection conn = (HttpURLConnection) url.openConnection();
			conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
			conn.setReadTimeout(READ_TIMEOUT_MS);
			// Send a browser-like User-Agent so the server does not answer 403 to the crawler
			conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");

			// Read the whole body into memory first so a failed transfer
			// does not leave a truncated file on disk
			byte[] data;
			try (InputStream inputStream = conn.getInputStream()) {
				ByteArrayOutputStream bos = new ByteArrayOutputStream();
				byte[] buffer = new byte[8192];
				int len;
				while ((len = inputStream.read(buffer)) != -1) {
					bos.write(buffer, 0, len);
				}
				data = bos.toByteArray();
			}

			File saveDir = new File(savePath + File.separator + dir);
			if (!saveDir.isDirectory() && !saveDir.mkdirs()) {
				throw new IOException("could not create dir : " + saveDir.getAbsolutePath());
			}
			File file = new File(saveDir, fileName);
			try (FileOutputStream fos = new FileOutputStream(file)) {
				fos.write(data);
			}
			System.out.println(fileName + ":" + url + " download success");
		} catch (Exception e) {
			// Best effort: one failed image must not abort the remaining downloads
			e.printStackTrace();
			System.err.println(urlStr);
		}
	}

	/**
	 * Runs {@link #spiderImgsDownLoad()}; entry point when used as a {@link Runnable}.
	 */
	@Override
	public void run() {
		spiderImgsDownLoad();
	}

	/**
	 * Starts the download on a new thread.
	 */
	public void startThread() {
		Thread thread = new Thread(this);
		thread.start();
	}

	/**
	 * Command-line entry point.
	 *
	 * <p>Optional arguments: {@code args[0]} is the product page URL and
	 * {@code args[1]} is the download directory. Without arguments the sample
	 * URL and {@code F:\tmp\aliimg} are used. To download several pages
	 * concurrently, create one instance per page and call {@link #startThread()}
	 * on each.
	 */
	public static void main(String[] args) {
		String pageUrl = args.length > 0 ? args[0]
				: "https://detail.1688.com/offer/587734725047.html?spm=a262eq.12572798.jsczf959.1.215c2fb13cCAVo";
		String targetDir = args.length > 1 ? args[1] : "F:\\tmp\\aliimg";

		AlibabaDetails aliArt = new AlibabaDetails(targetDir);
		aliArt.setBaseUrl(pageUrl);
		// Download on the current thread
		aliArt.spiderImgsDownLoad();
	}

}

以上为全部的代码,运行main方法即可,效果如下:

有问题可以在评论区留言,技术问题可以私信我。 

发布了106 篇原创文章 · 获赞 101 · 访问量 56万+

猜你喜欢

转载自blog.csdn.net/qq_24434671/article/details/103975748