上次写了如何抓取微信公众号,这次写了一个抓取alibaba详情页图片的程序,废话不多说了,下面直接贴代码:
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Downloads the images of an Alibaba (1688.com) product detail page.
 *
 * <p>Two groups of images are fetched from the page configured via
 * {@link #setBaseUrl(String)}: the gallery images found under elements with class
 * {@code nav-tabs} (saved to {@code <offer>/主图}) and the images inside the element with id
 * {@code mod-detail-description} (saved to {@code <offer>/详情图}). {@code <offer>} is the last
 * path segment of the page URL without its extension.
 *
 * @author yuxuan
 */
public class AlibabaDetails implements Runnable {

    /** Connect timeout applied to every HTTP request, in milliseconds. */
    private static final int CONNECT_TIMEOUT_MS = 3 * 1000;

    /** Read timeout applied to every HTTP request, so a stalled server cannot hang a download. */
    private static final int READ_TIMEOUT_MS = 30 * 1000;

    /** Browser-like User-Agent sent with image requests to avoid 403 responses for crawlers. */
    private static final String USER_AGENT = "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)";

    /** Charset used to decode the product page. */
    private static final String PAGE_CHARSET = "GBK";

    /** Page that {@link #main(String[])} downloads when no URL argument is given. */
    private static final String DEFAULT_PAGE_URL =
            "https://detail.1688.com/offer/587734725047.html?spm=a262eq.12572798.jsczf959.1.215c2fb13cCAVo";

    /** Directory that {@link #main(String[])} uses when no directory argument is given. */
    private static final String DEFAULT_MAIN_DIR = "F:\\tmp\\aliimg";

    /** URL of the product page to process; must be set before downloading. */
    private String baseUrl;

    /** Root directory that receives the downloaded images. */
    private String downloadDir = System.getProperty("user.home") + File.separator + "aliimg";

    /**
     * Creates a downloader that saves into {@code <user.home>/aliimg}, creating the directory
     * if necessary.
     */
    public AlibabaDetails() {
        mkdir(downloadDir);
    }

    /**
     * Creates a downloader that saves into the given directory instead of the default one,
     * creating the directory if necessary.
     *
     * @param downloadDir root directory for downloaded images
     */
    public AlibabaDetails(String downloadDir) {
        this.downloadDir = downloadDir;
        mkdir(this.downloadDir);
    }

    /**
     * Creates the directory (including missing parents) and prints its absolute path.
     *
     * @param downloadDir directory to create
     */
    private void mkdir(String downloadDir) {
        File dir = new File(downloadDir);
        // mkdirs() returns false both on failure and when the directory already exists,
        // so only report an error when the directory is really missing afterwards.
        if (!dir.mkdirs() && !dir.isDirectory()) {
            System.err.println("cannot create dir : " + dir.getAbsolutePath());
        }
        System.out.println("work dir : " + dir.getAbsolutePath());
    }

    /**
     * Sets the product page to download images from.
     *
     * @param baseUrl absolute URL of the product detail page
     */
    public void setBaseUrl(String baseUrl) {
        this.baseUrl = baseUrl;
    }

    /**
     * Fetches the page at {@link #baseUrl} and parses it.
     *
     * @return the parsed document, or {@code null} if the URL is unset or invalid, or the page
     *         could not be read
     */
    private Document execute() {
        if (baseUrl == null) {
            System.err.println("baseUrl is not set");
            return null;
        }
        try {
            URLConnection conn = new URL(baseUrl).openConnection();
            conn.setDoInput(true);
            conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
            conn.setReadTimeout(READ_TIMEOUT_MS);

            StringBuilder html = new StringBuilder();
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), PAGE_CHARSET))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    // readLine() strips the line terminator; put one back so that markup
                    // split across lines is not glued together.
                    html.append(line).append('\n');
                }
            }
            return Jsoup.parse(html.toString());
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Downloads the gallery images and the description images of the configured page on the
     * calling thread. Prints a message and returns when the page cannot be loaded.
     */
    public void spiderImgsDownLoad() {
        Document root = execute();
        if (root == null) {
            System.out.println("root is null");
            return;
        }
        String urlDir = offerDirName(baseUrl);
        // Gallery images first, then the images embedded in the description.
        productMainImg(root, urlDir);
        productDetailImg(root, urlDir);
    }

    /**
     * Derives the per-offer directory name from a page URL: the last path segment up to its
     * first dot, e.g. {@code .../offer/587734725047.html?spm=x} gives {@code 587734725047}.
     * URLs without a query string or without an extension are handled as well.
     *
     * @param pageUrl page URL, not {@code null}
     * @return the directory name; empty if the URL path ends with {@code /}
     */
    private static String offerDirName(String pageUrl) {
        String path = stripQueryAndFragment(pageUrl);
        String name = path.substring(path.lastIndexOf('/') + 1);
        int dot = name.indexOf('.');
        return dot >= 0 ? name.substring(0, dot) : name;
    }

    /**
     * Returns the URL without its {@code ?query} and {@code #fragment} parts.
     *
     * @param url URL string, not {@code null}
     * @return the URL truncated at the first {@code ?} or {@code #}, whichever comes first
     */
    private static String stripQueryAndFragment(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        return url.substring(0, end);
    }

    /**
     * Resolves an image reference against the page URL so that relative and protocol-relative
     * ({@code //host/path}) references become absolute.
     *
     * @param src value of an {@code src} attribute
     * @return the absolute URL, or {@code src} unchanged if it cannot be resolved (the
     *         download step then reports the failure)
     */
    private String toAbsoluteUrl(String src) {
        try {
            return new URL(new URL(baseUrl), src).toExternalForm();
        } catch (MalformedURLException e) {
            return src;
        }
    }

    /**
     * Downloads every image inside the element with id {@code mod-detail-description} into
     * {@code <urlDir>/详情图}. Does nothing except print a message when the page has no such
     * element.
     *
     * @param root parsed page
     * @param urlDir per-offer directory name, relative to the download directory
     */
    private void productDetailImg(Element root, String urlDir) {
        Element description = root.getElementById("mod-detail-description");
        if (description == null) {
            System.out.println("mod-detail-description not found");
            return;
        }
        String targetDir = urlDir + File.separator + "详情图";
        for (Element img : description.getElementsByTag("img")) {
            String src = img.attr("src");
            if (src != null && !src.isEmpty()) {
                String url = toAbsoluteUrl(src);
                downLoadFromUrl(url, getSuffixFileName(url), downloadDir, targetDir);
            }
        }
    }

    /**
     * Downloads the gallery images into {@code <urlDir>/主图}. Only thumbnails whose URL
     * contains {@code 60x60} are considered; the {@code 400x400} variant is requested instead.
     *
     * @param root parsed page
     * @param urlDir per-offer directory name, relative to the download directory
     */
    private void productMainImg(Element root, String urlDir) {
        String targetDir = urlDir + File.separator + "主图";
        for (Element tabs : root.getElementsByClass("nav-tabs")) {
            for (Element img : tabs.getElementsByTag("img")) {
                String src = img.attr("src");
                if (src != null && src.contains("60x60")) {
                    String url = toAbsoluteUrl(src.replace("60x60", "400x400"));
                    downLoadFromUrl(url, getSuffixFileName(url), downloadDir, targetDir);
                }
            }
        }
    }

    /**
     * Extracts the file name from an image URL: the last path segment, without any query
     * string or fragment (which would be illegal in file names on some platforms).
     *
     * @param url image URL
     * @return the file name part of the URL
     */
    private static String getSuffixFileName(String url) {
        String path = stripQueryAndFragment(url);
        return path.substring(path.lastIndexOf('/') + 1);
    }

    /**
     * Downloads one resource over HTTP(S) and stores it as {@code savePath/dir/fileName}.
     *
     * <p>The response is buffered in memory and written only after it has been read
     * completely, so a failed transfer does not leave a truncated file behind. Failures are
     * reported on the standard error stream and never propagated to the caller.
     *
     * @param urlStr absolute URL of the resource
     * @param fileName name of the file to create
     * @param savePath root directory for downloads
     * @param dir sub-directory below {@code savePath}; created if missing
     */
    public static void downLoadFromUrl(String urlStr, String fileName, String savePath, String dir) {
        try {
            System.out.println(urlStr);
            URL url = new URL(urlStr);
            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
            conn.setReadTimeout(READ_TIMEOUT_MS);
            // Pretend to be a browser; some servers answer 403 to unknown clients.
            conn.setRequestProperty("User-Agent", USER_AGENT);

            ByteArrayOutputStream content = new ByteArrayOutputStream();
            try (InputStream in = conn.getInputStream()) {
                byte[] buffer = new byte[8192];
                int len;
                while ((len = in.read(buffer)) != -1) {
                    content.write(buffer, 0, len);
                }
            }

            File saveDir = new File(savePath + File.separator + dir);
            // Check isDirectory() after mkdirs() so an already existing directory (possibly
            // created concurrently by another thread) is not treated as a failure.
            if (!saveDir.mkdirs() && !saveDir.isDirectory()) {
                throw new IOException("cannot create directory " + saveDir.getAbsolutePath());
            }
            try (FileOutputStream fos = new FileOutputStream(new File(saveDir, fileName))) {
                content.writeTo(fos);
            }
            System.out.println(fileName + ":" + url + " download success");
        } catch (IOException | RuntimeException e) {
            // Best effort: one broken image must not abort the remaining downloads.
            e.printStackTrace();
            System.err.println(urlStr);
        }
    }

    /**
     * Runs {@link #spiderImgsDownLoad()}; allows the downloader to be used as a thread task.
     */
    @Override
    public void run() {
        spiderImgsDownLoad();
    }

    /**
     * Starts the download in a new thread and returns immediately.
     */
    public void startThread() {
        Thread thread = new Thread(this);
        thread.start();
    }

    /**
     * Command line entry point.
     *
     * @param args optional: {@code args[0]} is the product page URL and {@code args[1]} the
     *             download directory; the built-in sample page and {@code F:\tmp\aliimg} are
     *             used when they are omitted
     */
    public static void main(String[] args) {
        String pageUrl = args.length > 0 ? args[0] : DEFAULT_PAGE_URL;
        String targetDir = args.length > 1 ? args[1] : DEFAULT_MAIN_DIR;

        AlibabaDetails aliArt = new AlibabaDetails(targetDir);
        aliArt.setBaseUrl(pageUrl);
        // Download on the current thread; use startThread() to download in the background.
        aliArt.spiderImgsDownLoad();
    }
}
以上为全部的代码,运行main方法即可,效果如下:
有问题可以在评论区留言,技术问题可以私信我。