java爬虫Demo1 爬取图片到本地

java爬虫Demo1            爬取图片到本地

准备工作:

url地址: http://www.mmonly.cc/ktmh/

Maven依赖:


    <!-- Build settings: configures maven-compiler-plugin to compile with Java source and target level 6. -->
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <!-- Language level accepted for the sources and bytecode level emitted. -->
                    <source>6</source>
                    <target>6</target>
                </configuration>
            </plugin>
        </plugins>
    </build>


    <!--
      Project dependencies: three WebMagic artifacts (core, extension, selenium) at 0.7.3
      and three Selenium artifacts.
      NOTE(review): the Java classes shown in this article import only us.codecraft.webmagic
      packages; the Selenium artifacts may be unnecessary for this demo (confirm).
    -->
    <dependencies>
        <!-- WebMagic crawler framework modules, all at version 0.7.3. -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-selenium</artifactId>
            <version>0.7.3</version>
        </dependency>
        <!-- Selenium client and Chrome driver bindings, both at version 3.0.1. -->
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-java</artifactId>
            <version>3.0.1</version>
        </dependency>
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-chrome-driver</artifactId>
            <version>3.0.1</version>
        </dependency>
        <!-- NOTE(review): selenium-server is pinned at 2.18.0 while the other Selenium
             artifacts are at 3.0.1; confirm that this version mix is intentional. -->
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-server</artifactId>
            <version>2.18.0</version>
        </dependency>
    </dependencies>

思路:main方法传入起始地址,process方法解析页面,并判断当前地址是列表页还是详情页:
                           列表页:获取下一页的URL地址,并得到各详情页的URL地址
                           详情页:获取下一页的URL地址,并得到img标签的src图片地址

页面解析:

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;

import us.codecraft.webmagic.selector.Selectable;


/**
 * WebMagic {@link PageProcessor} that starts from one picture list page, follows the
 * links to the detail pages, and publishes the picture URL found on each detail page
 * under the result field {@code "img"}.
 *
 * <p>Pages that yield no picture URL (the list page, unrecognised URLs, detail pages
 * without an {@code <img>}) are marked as skipped so that pipelines never receive a
 * result without the {@code "img"} field.
 */
public class ProcesserTest implements PageProcessor {

    /** URL of the list page; only this exact URL is handled as a list page. */
    private static final String LIST_PAGE_URL = "http://www.mmonly.cc/ktmh/dmmn/";

    /** Detail pages are recognised by a numeric segment right after the list URL. */
    private static final String DETAIL_PAGE_REGEX = "http://www.mmonly.cc/ktmh/dmmn/[\\d]+";

    /** Marker that precedes the value of the src attribute in the serialized img tag. */
    private static final String SRC_PREFIX = "src=\"";

    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);

    /**
     * Dispatches the downloaded page to the list-page or detail-page handler
     * depending on its URL; any other page is skipped.
     *
     * @param page the page downloaded by the spider
     */
    @Override
    public void process(Page page) {
        if (LIST_PAGE_URL.equals(page.getUrl().toString())) {
            processListPage(page);
        } else if (page.getUrl().regex(DETAIL_PAGE_REGEX).match()) {
            // regex(...) always returns a Selectable, so comparing it with null is always
            // true; match() reports whether the pattern actually matched.
            processDetailPage(page);
        } else {
            page.setSkip(true);
        }
    }

    /** Queues every detail page linked from the list page, plus the next list page. */
    private void processListPage(Page page) {
        // Links of all pictures shown in the list.
        page.addTargetRequests(page.getHtml().$("div.item_t > div > div.ABox > a").links().all());

        // The "next page" link is the second-to-last child of the pager element.
        addTargetIfPresent(page, page.getHtml().$("#pageNum > a:nth-last-child(2)").links().toString());

        // A list page carries no picture of its own, so keep it away from the pipelines.
        page.setSkip(true);
    }

    /** Queues the "next" link of a detail page and publishes its picture URL as "img". */
    private void processDetailPage(Page page) {
        // href of the "next page" button inside the detail page.
        addTargetIfPresent(page, page.getHtml().$("#nl > a").links().toString());

        // The picture sits either below a <p> or below an <a> inside #big-pic.
        // A selection without result yields null (not the text "null").
        String imgTag = page.getHtml().$("#big-pic p img").toString();
        if (imgTag == null) {
            imgTag = page.getHtml().$("#big-pic a img").toString();
        }

        String src = extractSrc(imgTag);
        if (src == null) {
            // No picture on this page: nothing for the pipeline to download.
            page.setSkip(true);
            return;
        }
        page.putField("img", src);
    }

    /** Adds the link as a crawl target unless the selection produced no link. */
    private static void addTargetIfPresent(Page page, String link) {
        if (link != null && link.length() > 0) {
            page.addTargetRequest(link);
        }
    }

    /**
     * Extracts the value of the {@code src} attribute from a serialized img tag.
     * The value is cut at its closing quote, so attributes that follow {@code src}
     * do not leak into the result.
     *
     * @param imgTag the serialized tag, may be {@code null}
     * @return the attribute value, or {@code null} when the tag is {@code null} or has
     *         no non-empty double-quoted {@code src} attribute
     */
    private static String extractSrc(String imgTag) {
        if (imgTag == null) {
            return null;
        }
        int prefixAt = imgTag.indexOf(SRC_PREFIX);
        if (prefixAt < 0) {
            return null;
        }
        int start = prefixAt + SRC_PREFIX.length();
        int end = imgTag.indexOf('"', start);
        if (end <= start) {
            return null;
        }
        return imgTag.substring(start, end);
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts the crawl at the list page with five threads, a queue scheduler that uses
     * a Bloom-filter duplicate remover, and {@code MyPipeline} as the result pipeline.
     */
    public static void main(String[] args) {
        Spider.create(new ProcesserTest())
                .addUrl("http://www.mmonly.cc/ktmh/dmmn/")
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(10000)))
                .addPipeline(new MyPipeline()).thread(5).run();
    }

}

定制保存:

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

/**
 * Pipeline that downloads the picture whose URL is stored in the {@code "img"}
 * result field.
 */
public class MyPipeline implements Pipeline {

    /**
     * Hands the {@code "img"} field of the result to
     * {@code UrlFileDownloadUtil.downloadPicture}. Results without that field
     * are ignored instead of failing with a {@link NullPointerException}.
     *
     * @param resultItems fields extracted by the page processor
     * @param task        the running spider task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Object img = resultItems.get("img");
        if (img == null) {
            // Not every processed page yields a picture URL.
            return;
        }
        UrlFileDownloadUtil.downloadPicture(img.toString());
    }
}

下载保存类:

import java.io.*;
import java.net.URL;
import java.util.UUID;

/**
 * Helper that downloads a picture from a URL into a fixed local directory.
 */
public class UrlFileDownloadUtil {

    /**
     * Downloads the resource at {@code url} and stores it as
     * {@code <random UUID>.jpg} inside the hard-coded target directory.
     *
     * <p>The download is best effort: any failure is printed to standard error and
     * not propagated. Both streams are closed on every path, including failures.
     *
     * @param url address of the picture; {@code null} or empty values are skipped
     */
    public static void downloadPicture(String url) {
        if (url == null || url.length() == 0) {
            System.err.println("downloadPicture skipped: no URL given");
            return;
        }

        String file = "C:\\Users\\Administrator\\Desktop\\新建文件夹\\";
        String target = file + UUID.randomUUID().toString() + ".jpg";

        InputStream inputStream = null;
        OutputStream outputStream = null;
        try {
            inputStream = new URL(url).openStream();
            outputStream = new FileOutputStream(target);

            byte[] bytes = new byte[1024 * 100];
            int length;
            // read() signals end of stream with -1.
            while ((length = inputStream.read(bytes)) != -1) {
                outputStream.write(bytes, 0, length);
            }
            System.out.println("下载完成:" + target);
        } catch (Exception e) {
            // Deliberately best effort: report the failure and let the crawl continue.
            e.printStackTrace();
        } finally {
            // try/finally instead of try-with-resources keeps the code valid at source level 6.
            closeQuietly(inputStream);
            closeQuietly(outputStream);
        }
    }

    /** Closes the resource if it was opened; a failure while closing is only reported. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

照搬(大佬的博客: https://blog.csdn.net/qq_35641192/article/details/80547262)
模仿
理解
改写
我的
(手动滑稽)

发布了24 篇原创文章 · 获赞 2 · 访问量 459

猜你喜欢

转载自blog.csdn.net/tiangoua/article/details/103588698