准备工作:
url地址: http://www.mmonly.cc/ktmh/
Maven依赖:
<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <configuration>
                <!--
                  Compile for Java 8: the Selenium 3.x artifacts declared in this
                  POM require a Java 8 runtime, and recent JDKs no longer accept
                  source/target level 6.
                -->
                <source>1.8</source>
                <target>1.8</target>
            </configuration>
        </plugin>
    </plugins>
</build>
<dependencies>
    <!-- WebMagic crawler framework: core engine plus extension and Selenium modules. -->
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-core</artifactId>
        <version>0.7.3</version>
    </dependency>
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-extension</artifactId>
        <version>0.7.3</version>
    </dependency>
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-selenium</artifactId>
        <version>0.7.3</version>
    </dependency>
    <!-- Selenium artifacts: keep every org.seleniumhq.selenium artifact on the same version. -->
    <dependency>
        <groupId>org.seleniumhq.selenium</groupId>
        <artifactId>selenium-java</artifactId>
        <version>3.0.1</version>
    </dependency>
    <dependency>
        <groupId>org.seleniumhq.selenium</groupId>
        <artifactId>selenium-chrome-driver</artifactId>
        <version>3.0.1</version>
    </dependency>
    <dependency>
        <groupId>org.seleniumhq.selenium</groupId>
        <artifactId>selenium-server</artifactId>
        <!--
          Was 2.18.0, which mixes Selenium 2.x server classes with the 3.0.1
          client artifacts above on one classpath; aligned to 3.0.1.
          NOTE(review): confirm the build still resolves after this bump.
        -->
        <version>3.0.1</version>
    </dependency>
</dependencies>
思路:main 方法传入起始 URL,process 方法解析页面,并判断当前地址是列表页还是详情页:
列表页:获取下一页的 URL 地址,并提取各详情页的 URL 地址
详情页:获取下一页的 URL 地址,并提取 img 标签 src 属性中的图片地址
页面解析:
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Selectable;
/**
 * WebMagic {@link PageProcessor} that walks the gallery under
 * {@code http://www.mmonly.cc/ktmh/dmmn/} and extracts picture URLs.
 *
 * <p>Two kinds of pages are handled:
 * <ul>
 *   <li>list pages: enqueue every detail-page link plus the "next page" link;</li>
 *   <li>detail pages: enqueue the "next" link and publish the picture address
 *       under the result field {@code "img"}.</li>
 * </ul>
 * Pages that yield no picture are marked as skipped so that pipelines are not
 * invoked with an empty result.
 */
public class ProcesserTest implements PageProcessor {

    /** Entry list page; list pages are recognised by this prefix. */
    private static final String LIST_URL = "http://www.mmonly.cc/ktmh/dmmn/";

    /** Detail pages: the list URL immediately followed by digits. */
    private static final String DETAIL_URL_REGEX = "http://www.mmonly.cc/ktmh/dmmn/[\\d]+";

    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);

    /**
     * Dispatches the page to the list or detail handler based on its URL.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        // Selectable.regex(...) never returns null, so a null check is always
        // true; match() is what reports whether the pattern actually matched.
        if (page.getUrl().regex(DETAIL_URL_REGEX).match()) {
            processDetailPage(page);
        } else if (page.getUrl().toString().startsWith(LIST_URL)) {
            // Covers the entry page and the "next page" links queued from it.
            // NOTE(review): assumes paginated list pages share this prefix and
            // do not start with a digit after it; verify against the site.
            processListPage(page);
        } else {
            // Unknown page type: nothing to extract.
            page.setSkip(true);
        }
    }

    /** Queues all detail-page links and the next list page found on a list page. */
    private void processListPage(Page page) {
        // Links of every picture entry in the list.
        page.addTargetRequests(page.getHtml().$("div.item_t > div > div.ABox > a").links().all());
        // The second-to-last <a> inside #pageNum is the "next page" link.
        String nextList = page.getHtml().$("#pageNum > a:nth-last-child(2)").links().get();
        if (nextList != null) {
            page.addTargetRequest(nextList);
        }
        // A list page produces no "img" field, so keep it away from the pipelines.
        page.setSkip(true);
    }

    /** Queues the next detail page and stores the picture address in the "img" field. */
    private void processDetailPage(Page page) {
        // href of the "next" button inside the detail page.
        String nextDetail = page.getHtml().$("#nl > a").links().get();
        if (nextDetail != null) {
            page.addTargetRequest(nextDetail);
        }

        // Read the src attribute directly instead of slicing the tag's HTML,
        // which only worked when src happened to be the last attribute.
        String img = page.getHtml().$("#big-pic p img", "src").get();
        if (img == null) {
            // Some pages wrap the picture in an <a> rather than a <p>.
            img = page.getHtml().$("#big-pic a img", "src").get();
        }
        if (img == null || img.trim().isEmpty()) {
            // No picture on this page: skip it rather than publish a null field.
            page.setSkip(true);
            return;
        }
        page.putField("img", img);
    }

    /**
     * @return the crawl settings (3 retries, 1000 ms sleep, 10000 ms timeout)
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts the crawl from the entry list page with 5 threads, a
     * Bloom-filter based duplicate remover and {@code MyPipeline} as output.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Spider.create(new ProcesserTest())
                .addUrl("http://www.mmonly.cc/ktmh/dmmn/")
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(10000)))
                .addPipeline(new MyPipeline()).thread(5).run();
    }
}
定制保存:
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * Pipeline that downloads the picture whose address was stored by the page
 * processor under the result field {@code "img"}.
 */
public class MyPipeline implements Pipeline {

    /**
     * Downloads the picture referenced by the {@code "img"} field, if present.
     *
     * @param resultItems extraction results of one page
     * @param task        the running crawl task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Object img = resultItems.get("img");
        if (img == null) {
            // Pages without a picture (e.g. list pages) carry no "img" field;
            // calling toString() on the missing value would throw.
            return;
        }
        UrlFileDownloadUtil.downloadPicture(img.toString());
    }
}
下载保存类:
import java.io.*;
import java.net.URL;
import java.util.UUID;
/**
 * Helper for saving a remote picture to the local disk.
 *
 * <p>Failures are reported on standard error and never propagated, so a
 * single bad URL does not abort the caller.
 */
public class UrlFileDownloadUtil {

    /** Directory used by {@link #downloadPicture(String)}. */
    private static final String DEFAULT_DIRECTORY = "C:\\Users\\Administrator\\Desktop\\新建文件夹\\";

    /**
     * Downloads {@code url} into the default directory under a random
     * UUID-based {@code .jpg} file name.
     *
     * @param url address of the picture; null or blank values are ignored
     */
    public static void downloadPicture(String url) {
        downloadPicture(url, DEFAULT_DIRECTORY);
    }

    /**
     * Downloads {@code url} into {@code directory} under a random UUID-based
     * {@code .jpg} file name. The directory is created when it does not exist.
     *
     * @param url       address of the picture; null or blank values are ignored
     * @param directory target directory; a null value is ignored
     */
    public static void downloadPicture(String url, String directory) {
        if (url == null || url.trim().isEmpty() || directory == null) {
            System.err.println("Skipping download: missing url or directory");
            return;
        }

        File dir = new File(directory);
        if (!dir.isDirectory() && !dir.mkdirs()) {
            System.err.println("Skipping download: cannot create directory " + dir.getPath());
            return;
        }

        File target = new File(dir, UUID.randomUUID().toString() + ".jpg");
        InputStream in = null;
        OutputStream out = null;
        try {
            // Open the remote stream first so that no file is created when
            // the URL is malformed or unreachable.
            in = new URL(url).openStream();
            out = new FileOutputStream(target);
            byte[] buffer = new byte[1024 * 100];
            int length;
            while ((length = in.read(buffer)) != -1) {
                out.write(buffer, 0, length);
            }
            System.out.println("下载完成:" + target.getPath());
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // try/finally (not try-with-resources) keeps this compilable at
            // Java 6 source level while still releasing both streams on error.
            closeQuietly(out);
            closeQuietly(in);
        }
    }

    /** Closes {@code resource} if non-null, reporting but not propagating failures. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
照搬(大佬的博客: https://blog.csdn.net/qq_35641192/article/details/80547262)
模仿
理解
改写
我的
(手动滑稽)