Java爬虫入门(三)爬取京东上手机信息(Springboot)

掘金原文传送门

说在开头的话:以下的代码存在不严谨的做法,可自行更改:)

项目结构(使用maven管理)

application.properties

# Database connection: MySQL Connector/J driver, schema "corejava" on the local MySQL server (port 3306)
spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver
spring.datasource.url=jdbc:mysql://127.0.0.1:3306/corejava?useUnicode=true&serverTimezone=Asia/Shanghai&characterEncoding=utf-8&nullCatalogMeansCurrent=true
spring.datasource.username=root
spring.datasource.password=23333

# JPA settings: target database is MySQL; SQL logging and open-in-view are switched on
spring.jpa.database=MySQL
spring.jpa.show-sql=true
spring.jpa.open-in-view=true


pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.1.6.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <groupId>icn.tcast</groupId>
    <artifactId>itcast-crawler-jd</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-jpa</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-jdbc</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-devtools</artifactId>
            <scope>runtime</scope>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <scope>runtime</scope>
        </dependency>
        <dependency>
            <groupId>commons-logging</groupId>
            <artifactId>commons-logging</artifactId>
            <version>1.1.1</version>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
        <!-- Declared once only, without a version, so the version comes from the parent's
             dependency management. The previous second declaration (4.5.2) was a duplicate
             of the same groupId:artifactId, which Maven reports as a warning. -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.8.3</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.8.1</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>

MySQL表的结构

HTML源码分析

Item类、ItemService接口、ItemDao接口、ItemServiceImpl类的编写

Item类

package cn.itcast.jd.pojo;




import javax.persistence.*;
import java.util.Date;

/**
 * JPA entity describing one crawled product; rows live in the {@code jd_item} table.
 */
@Entity
@Table(name = "jd_item")
public class Item {

    /** Primary key, generated with the {@code IDENTITY} strategy. */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;

    /** Id of the product group (SPU) this item belongs to. */
    private Long spu;

    /** Id of the smallest sellable unit (SKU). */
    private Long sku;

    /** Product title. */
    private String title;

    /** Product price. */
    private Double price;

    /** Product picture. */
    private String pic;

    /** Address of the product detail page. */
    private String url;

    /** Creation time. */
    private Date created;

    /** Last update time. */
    private Date updated;

    // ---- accessors, in field order ----

    public Long getId() {
        return this.id;
    }

    public void setId(Long id) {
        this.id = id;
    }

    public Long getSpu() {
        return this.spu;
    }

    public void setSpu(Long spu) {
        this.spu = spu;
    }

    public Long getSku() {
        return this.sku;
    }

    public void setSku(Long sku) {
        this.sku = sku;
    }

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public Double getPrice() {
        return this.price;
    }

    public void setPrice(Double price) {
        this.price = price;
    }

    public String getPic() {
        return this.pic;
    }

    public void setPic(String pic) {
        this.pic = pic;
    }

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public Date getCreated() {
        return this.created;
    }

    public void setCreated(Date created) {
        this.created = created;
    }

    public Date getUpdated() {
        return this.updated;
    }

    public void setUpdated(Date updated) {
        this.updated = updated;
    }
}

ItemDao

package cn.itcast.jd.dao;

import cn.itcast.jd.pojo.Item;
import org.springframework.data.jpa.repository.JpaRepository;
/**
 * Spring Data JPA repository for {@link Item}.
 *
 * <p>The type arguments name the entity being managed ({@code Item}) and the type of its
 * primary key ({@code Long}).
 */
public interface ItemDao extends JpaRepository<Item, Long> {
}

ItemService

package cn.itcast.jd.service;

import cn.itcast.jd.pojo.Item;

import java.util.List;

/**
 * Service-layer operations for crawled {@link Item} records.
 */
public interface ItemService {

    /**
     * Saves a product.
     *
     * @param item the product to store
     */
    void save(Item item);

    /**
     * Queries products.
     *
     * @param item the product object carrying the query criteria
     * @return the products found
     */
    List<Item> findAll(Item item);
}

ItemServiceImpl

package cn.itcast.jd.impl;

import cn.itcast.jd.dao.ItemDao;
import cn.itcast.jd.pojo.Item;
import cn.itcast.jd.service.ItemService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Example;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;

import java.util.List;
/**
 * {@link ItemService} implementation that delegates to {@link ItemDao}.
 * Instantiated by Spring through {@code @Service}.
 */
@Service
public class ItemServiceImpl implements ItemService {

    /** Product repository, injected by Spring. */
    @Autowired
    private ItemDao itemDao;

    /**
     * Stores the given product inside a transaction.
     *
     * @param item the product to store
     */
    @Transactional
    @Override
    public void save(Item item) {
        itemDao.save(item);
    }

    /**
     * Queries products, using the given object as the query condition.
     *
     * @param item the product object wrapped into an {@link Example} for the query
     * @return the products returned by the repository for that example
     */
    @Override
    public List<Item> findAll(Item item) {
        // Wrap the argument as the query condition and hand it straight to the repository.
        return itemDao.findAll(Example.of(item));
    }
}

HttpUtils类(对HttpClient的封装)

package cn.itcast.jd.util;


import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;


/**
 * HTTP helper, instantiated by Spring, that downloads page content and images through a
 * pooled Apache HttpClient.
 */
@Component
public class HttpUtils {

    /** User-Agent header sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36";

    /** Directory (including the trailing separator) that downloaded images are written to. */
    private static final String IMAGE_DIR = "C:\\Users\\mac12\\Desktop\\手机图片\\";

    /** Connection pool shared by all requests. */
    private final PoolingHttpClientConnectionManager cm;

    /** Client built once on top of the pool and reused by every request. */
    private final CloseableHttpClient httpClient;

    /**
     * Creates the connection pool (at most 100 connections in total, 10 per route) and the
     * client that uses it.
     */
    public HttpUtils() {
        this.cm = new PoolingHttpClientConnectionManager();
        this.cm.setMaxTotal(100);
        this.cm.setDefaultMaxPerRoute(10);
        this.httpClient = HttpClients.custom().setConnectionManager(this.cm).build();
    }

    /**
     * Downloads the page at the given address.
     *
     * @param url the address to request
     * @return the response body decoded as UTF-8, or an empty string when the status is not
     *         200, the response has no entity, or an {@link IOException} occurs
     */
    public String DoGetHtml(String url) {
        HttpGet httpGet = this.newGet(url);
        String content = "";
        // try-with-resources closes the response on every path.
        try (CloseableHttpResponse response = this.httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                content = EntityUtils.toString(response.getEntity(), "utf8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return content;
    }

    /**
     * Downloads the image at the given address into {@link #IMAGE_DIR} under a random
     * UUID-based file name.
     *
     * @param url the image address
     * @return the name of the file that was written, or an empty string when the status is
     *         not 200, the response has no entity, or an {@link IOException} occurs
     */
    public String doGetImages(String url) {
        HttpGet httpGet = this.newGet(url);
        String picName = "";
        // The response used to be closed in a finally block without a null check, which threw
        // a NullPointerException whenever execute() itself failed.
        try (CloseableHttpResponse response = this.httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                // Rename the image: random UUID plus the original extension.
                String fileName = UUID.randomUUID().toString() + extensionOf(url);
                // The output stream is closed here; previously it was never closed.
                try (OutputStream outputStream = new FileOutputStream(new File(IMAGE_DIR + fileName))) {
                    response.getEntity().writeTo(outputStream);
                }
                picName = fileName;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return picName;
    }

    /**
     * Returns the extension (including the leading dot) of the last path segment of
     * {@code url}, or an empty string when that segment contains no dot.
     */
    private static String extensionOf(String url) {
        int lastSlash = url.lastIndexOf('/');
        int lastDot = url.lastIndexOf('.');
        // Only accept a dot after the last '/', so a dot in the host name is never mistaken
        // for the start of a file extension.
        return lastDot > lastSlash ? url.substring(lastDot) : "";
    }

    /** Builds a GET request for {@code url} with the shared User-Agent and request config. */
    private HttpGet newGet(String url) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("User-Agent", USER_AGENT);
        httpGet.setConfig(this.getConfig());
        return httpGet;
    }

    /**
     * Builds the request settings: connect timeout 1000, connection-request timeout 500 and
     * socket timeout 10 * 1000 (milliseconds).
     */
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectTimeout(1000)
                .setConnectionRequestTimeout(500)
                .setSocketTimeout(10 * 1000)
                .build();
    }

}

Application引导类的编写

package cn.itcast.jd;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;

/**
 * Spring Boot bootstrap class.
 *
 * <p>{@code @EnableScheduling} is required so that scheduled tasks are run.
 */
@EnableScheduling
@SpringBootApplication
public class Application {

    /**
     * Launches the Spring application.
     *
     * @param args command-line arguments, passed through to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication.run(Application.class, args);
    }
}

ItemTask定时抓取任务

package cn.itcast.jd.task;

import cn.itcast.jd.pojo.Item;
import cn.itcast.jd.service.ItemService;
import cn.itcast.jd.util.HttpUtils;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.http.client.methods.HttpGet;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import java.util.Date;
import java.util.List;

/**
 * Scheduled crawler, instantiated by Spring, that walks the JD phone search result pages and
 * stores every SKU that is not yet known to {@link ItemService}.
 */
@Component
public class ItemTask {

    /** Search address; the page number is appended to it. */
    private static final String SEARCH_URL = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&s=110&click=0&page=";

    /** JSON parser used for the price response. */
    private static final ObjectMapper MAPPER = new ObjectMapper();

    @Autowired
    private HttpUtils httpUtils;
    @Autowired
    private ItemService itemService;

    /**
     * Crawls the odd-numbered search pages 1..99. {@code fixedDelay} makes the next run start
     * 100 seconds after the previous one has finished.
     *
     * @throws Exception if parsing or storing a page fails
     */
    @Scheduled(fixedDelay = 100 * 1000)
    public void itemtsk() throws Exception {
        for (int page = 1; page < 100; page = page + 2) {
            String html = this.httpUtils.DoGetHtml(SEARCH_URL + page);
            // Nothing to parse when the download produced no content.
            if (html != null && !html.isEmpty()) {
                this.parse(html);
            }
        }

        System.out.println("手机数据抓取完成...");
    }

    /**
     * Parses one search result page and stores the products found on it.
     *
     * @param html the page source
     * @throws Exception if processing one of the products fails
     */
    private void parse(String html) throws Exception {
        Document doc = Jsoup.parse(html);
        Elements spuEles = doc.select("div#J_goodsList >ul > li");
        for (Element spuEle : spuEles) {
            Long spu = parseId(spuEle.attr("data-spu"));
            if (spu == null) {
                // An entry without a numeric data-spu used to abort the whole crawl with a
                // NumberFormatException; skip it and carry on with the next entry.
                continue;
            }
            for (Element skuEle : spuEle.select("li.ps-item")) {
                this.saveIfNew(spu, skuEle);
            }
        }
    }

    /**
     * Builds and stores the item for one SKU element unless that SKU is already stored.
     *
     * @param spu    id of the product group the SKU belongs to
     * @param skuEle the {@code li.ps-item} element describing the SKU
     */
    private void saveIfNew(Long spu, Element skuEle) throws Exception {
        Long sku = parseId(skuEle.select("[data-sku]").attr("data-sku"));
        if (sku == null) {
            return;
        }

        // Query by SKU; an existing product is left untouched.
        Item item = new Item();
        item.setSku(sku);
        List<Item> existing = this.itemService.findAll(item);
        if (!existing.isEmpty()) {
            return;
        }

        item.setSpu(spu);

        String itemUrl = "https://item.jd.com/" + sku + ".html";
        item.setUrl(itemUrl);

        item.setPic(this.downloadPicture(skuEle));
        item.setPrice(this.fetchPrice(sku));

        // The title is read from the product detail page.
        String itemInfo = this.httpUtils.DoGetHtml(itemUrl);
        item.setTitle(Jsoup.parse(itemInfo).select("div.sku-name").text());

        // Creation and update time are the same instant for a new record.
        Date now = new Date();
        item.setCreated(now);
        item.setUpdated(now);

        this.itemService.save(item);
    }

    /**
     * Downloads the picture referenced by the SKU element.
     *
     * @return the value returned by {@code doGetImages}, or an empty string when the element
     *         carries no picture address
     */
    private String downloadPicture(Element skuEle) {
        Element img = skuEle.select("img[data-sku]").first();
        if (img == null) {
            return "";
        }
        String lazyImg = img.attr("data-lazy-img");
        if (lazyImg.isEmpty()) {
            // Without this guard the request address would be just "https:".
            return "";
        }
        String picUrl = ("https:" + lazyImg).replace("/n9/", "/n1/");
        return this.httpUtils.doGetImages(picUrl);
    }

    /**
     * Requests the price of the given SKU and reads field {@code p} of the first array
     * element of the JSON response.
     *
     * @return the price, or {@code null} when the response is empty or has no such field
     * @throws Exception if the response is not valid JSON
     */
    private Double fetchPrice(Long sku) throws Exception {
        String priceJson = this.httpUtils.DoGetHtml("https://p.3.cn/prices/mgets?skuIds=J_" + sku);
        if (priceJson == null || priceJson.isEmpty()) {
            return null;
        }
        JsonNode root = MAPPER.readTree(priceJson);
        if (root == null) {
            return null;
        }
        // path() yields a "missing" node instead of null, so an unexpected response shape
        // cannot cause a NullPointerException here.
        JsonNode priceNode = root.path(0).path("p");
        return priceNode.isMissingNode() ? null : Double.valueOf(priceNode.asDouble());
    }

    /**
     * Parses a numeric id attribute.
     *
     * @return the parsed id, or {@code null} when {@code raw} is null, blank or not a number
     */
    private static Long parseId(String raw) {
        if (raw == null) {
            return null;
        }
        String trimmed = raw.trim();
        if (trimmed.isEmpty()) {
            return null;
        }
        try {
            return Long.valueOf(trimmed);
        } catch (NumberFormatException e) {
            return null;
        }
    }
}

获取的内容

商品信息

商品图片

发布了19 篇原创文章 · 获赞 3 · 访问量 3826

猜你喜欢

转载自blog.csdn.net/weixin_42792088/article/details/99840986