java爬虫案例——SpringBoot使用HttpClient、Jsoup爬取京东手机数据


前言

之前同事分享了一些关于Java爬虫的视频,其中有一个是用HttpClient及Jsoup爬取京东上的一些手机数据(如图片、标题、sku、spu等),同时参考几篇博客后基本实现目标,在此篇做个简单记录。

一、准备工作

由于需要将爬取到的数据存储到数据库表中,因此需要先建库建表。建库建表SQL如下:

-- Recreates the `crawler` schema from scratch.
-- WARNING: this drops any existing `crawler` database together with its data.
DROP DATABASE IF EXISTS `crawler`;
CREATE DATABASE IF NOT EXISTS `crawler` DEFAULT CHARSET = `utf8`;
USE `crawler`;

-- Disable FK checks while the table is dropped and recreated (restored at the end of the script).
SET FOREIGN_KEY_CHECKS = 0;

-- One row per crawled SKU; written by the crawler through the JPA entity mapped to `jd_item`.
DROP TABLE IF EXISTS `jd_item`;
CREATE TABLE `jd_item` (
 `id` bigint(10) NOT NULL AUTO_INCREMENT COMMENT '主键id',
 `spu` bigint(15) DEFAULT NULL COMMENT '商品集合id',
 `sku` bigint(15) DEFAULT NULL COMMENT '商品最小品类单元id',
 `title` varchar(100) DEFAULT NULL COMMENT '商品标题',
 -- DECIMAL instead of BIGINT: the entity stores the price as a Double, and an
 -- integer column would silently drop the fractional part on insert.
 `price` decimal(10,2) DEFAULT NULL COMMENT '商品价格',
 `pic` varchar(200) DEFAULT NULL COMMENT '商品图片',
 `url` varchar(200) DEFAULT NULL COMMENT '商品详情地址',
 `created` datetime DEFAULT NULL COMMENT '创建时间',
 `updated` datetime DEFAULT NULL COMMENT '更新时间',
 PRIMARY KEY (`id`),
 -- The crawler looks rows up by sku before inserting, hence the secondary index.
 KEY `sku` (`sku`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET = utf8 COMMENT =  '京东商品表';

SET FOREIGN_KEY_CHECKS = 1;
  • 项目目录
    在这里插入图片描述

二、项目文件

1.项目依赖

pom.xml:

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the JD crawler demo (Spring Boot web + Spring Data JPA + HttpClient + jsoup). -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <!-- Inherits plugin and dependency management from the Spring Boot parent POM. -->
    <parent>
        <artifactId>spring-boot-starter-parent</artifactId>
        <groupId>org.springframework.boot</groupId>
        <version>2.3.4.RELEASE</version>
    </parent>
    <groupId>cn.mlnt</groupId>
    <artifactId>mlnt-crawler-jd</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <!-- Spring MVC / embedded web starter -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <!-- Spring Data JPA -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-jpa</artifactId>
        </dependency>
        <!-- MySQL JDBC driver (version pinned explicitly here) -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.21</version>
        </dependency>
        <!-- Apache HttpClient; no version declared here, presumably resolved by the parent's dependency management (confirm) -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
        </dependency>
        <!-- jsoup HTML parser -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.13.1</version>
        </dependency>
        <!-- Utility library (Apache Commons Lang); no version declared here, presumably resolved by the parent's dependency management (confirm) -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
        </dependency>
    </dependencies>

</project>

2.项目配置文件

application.properties(或使用.yml):

#DB Configuration:
# JDBC connection to the local `crawler` MySQL schema.
# NOTE(review): username and password are hard-coded demo values; externalize them
# (environment variables or a secrets store) before using this outside a local setup.
spring.datasource.driverClassName=com.mysql.cj.jdbc.Driver
spring.datasource.url=jdbc:mysql://127.0.0.1:3306/crawler?useUnicode=true&characterEncoding=utf8&useSSL=false&allowPublicKeyRetrieval=true&serverTimezone=Asia/Shanghai
spring.datasource.username=root
spring.datasource.password=123456

#JPA Configuration:
# show-sql=true prints every generated SQL statement; useful while debugging the crawler.
spring.jpa.database=MySQL
spring.jpa.show-sql=true

3.pojo

Item.java:

package cn.mlnt.jd.pojo;

import javax.persistence.*;
import java.util.Date;

/**
 * JPA entity mapped to the {@code jd_item} table; one instance describes one crawled product SKU.
 *
 * <p>Plain mutable bean: every property has a getter and a setter and no validation is performed.
 */
@Entity
@Table(name="jd_item")
public class Item {

    /** Primary key, generated by the database (identity column). */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;

    /** Standard product unit: id of the product group the SKU belongs to. */
    private Long spu;

    /** Stock keeping unit: id of the smallest sellable variant. */
    private Long sku;

    /** Product title. */
    private String title;

    /** Product price. */
    private Double price;

    /** Product image (the crawler stores the downloaded file name here). */
    private String pic;

    /** URL of the product detail page. */
    private String url;

    /** Creation timestamp. */
    private Date created;

    /** Last-update timestamp. */
    private Date updated;

    /** @return the primary key */
    public Long getId() {
        return id;
    }

    /** @param id the primary key */
    public void setId(Long id) {
        this.id = id;
    }

    /** @return the product-group id */
    public Long getSpu() {
        return spu;
    }

    /** @param spu the product-group id */
    public void setSpu(Long spu) {
        this.spu = spu;
    }

    /** @return the SKU id */
    public Long getSku() {
        return sku;
    }

    /** @param sku the SKU id */
    public void setSku(Long sku) {
        this.sku = sku;
    }

    /** @return the product title */
    public String getTitle() {
        return title;
    }

    /** @param title the product title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the product price */
    public Double getPrice() {
        return price;
    }

    /** @param price the product price */
    public void setPrice(Double price) {
        this.price = price;
    }

    /** @return the product image reference */
    public String getPic() {
        return pic;
    }

    /** @param pic the product image reference */
    public void setPic(String pic) {
        this.pic = pic;
    }

    /** @return the detail-page URL */
    public String getUrl() {
        return url;
    }

    /** @param url the detail-page URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the creation timestamp */
    public Date getCreated() {
        return created;
    }

    /** @param created the creation timestamp */
    public void setCreated(Date created) {
        this.created = created;
    }

    /** @return the last-update timestamp */
    public Date getUpdated() {
        return updated;
    }

    /** @param updated the last-update timestamp */
    public void setUpdated(Date updated) {
        this.updated = updated;
    }
}

4.dao接口

ItemDao.java:

package cn.mlnt.jd.dao;

import cn.mlnt.jd.pojo.Item;
import org.springframework.data.jpa.repository.JpaRepository;

/**
 * Spring Data JPA repository for {@link Item} entities, keyed by their {@code Long} id.
 *
 * <p>Declares no methods of its own; everything used by the service layer is inherited
 * from {@link JpaRepository}.
 */
public interface ItemDao extends JpaRepository<Item, Long> {
}

5.service接口及其实现类

ItemService.java:

package cn.mlnt.jd.service;

import cn.mlnt.jd.pojo.Item;

import java.util.List;

/**
 * Service-layer operations on crawled {@link Item} records.
 */
public interface ItemService {

    /**
     * Persists the given product.
     *
     * @param item the product to save
     */
    void save(Item item);

    /**
     * Finds products matching the given probe.
     *
     * @param item a probe whose populated properties act as the search criteria
     * @return the matching products
     */
    List<Item> findAll(Item item);
}

ItemServiceImpl.java:

package cn.mlnt.jd.service.impl;

import cn.mlnt.jd.dao.ItemDao;
import cn.mlnt.jd.pojo.Item;
import cn.mlnt.jd.service.ItemService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Example;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;

import java.util.List;

/**
 * {@link ItemService} implementation that delegates to {@link ItemDao}.
 */
@Service
public class ItemServiceImpl implements ItemService {

    @Autowired
    private ItemDao itemDao;

    /**
     * Saves the product inside a transaction.
     *
     * @param item the product to save
     */
    @Override
    @Transactional
    public void save(Item item) {
        itemDao.save(item);
    }

    /**
     * Runs a query-by-example search using {@code item} as the probe.
     *
     * @param item probe whose populated properties become the search criteria
     * @return the products returned by the repository for that example
     */
    @Override
    public List<Item> findAll(Item item) {
        return itemDao.findAll(Example.of(item));
    }
}

6.HttpClient封装工具类

HttpUtils.java:

package cn.mlnt.jd.util;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.stereotype.Component;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;

/**
 * Small HTTP helper used by the crawler to download HTML pages and product images.
 *
 * <p>All requests go through one pooled connection manager and one client instance that is
 * built once in the constructor. Both download methods report failure by returning an empty
 * string rather than throwing an {@link IOException}.
 */
@Component
public class HttpUtils {

    /** Directory into which downloaded images are written. */
    private static final String IMAGE_DIR = "E:\\images\\";

    /** Browser-like User-Agent sent with page requests. */
    private static final String USER_AGENT = "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Mobile Safari/537.36";

    private PoolingHttpClientConnectionManager cm;

    /** Client bound to {@link #cm}; created once and reused for every request. */
    private final CloseableHttpClient httpClient;

    /**
     * Creates the connection pool (100 connections in total, 10 per route) and the client that uses it.
     */
    public HttpUtils() {
        this.cm = new PoolingHttpClientConnectionManager();
        // Maximum number of connections in the pool
        this.cm.setMaxTotal(100);
        // Maximum number of connections per route (host)
        this.cm.setDefaultMaxPerRoute(10);
        // The client is intentionally never closed: closing it would also shut down the shared pool.
        this.httpClient = HttpClients.custom().setConnectionManager(this.cm).build();
    }

    /**
     * Downloads the page at the given address.
     *
     * @param url address to request with HTTP GET
     * @return the response body decoded as UTF-8, or an empty string when the status is not 200,
     *         the response has no body, or an I/O error occurs
     */
    public String doGetHtml(String url) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(this.getConfig());
        // Identify as a browser via the User-Agent request header
        httpGet.addHeader("User-Agent", USER_AGENT);

        String content = "";
        // try-with-resources closes the response on every path
        try (CloseableHttpResponse response = this.httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                content = EntityUtils.toString(response.getEntity(), "utf8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return content;
    }

    /**
     * Downloads an image and stores it under a random (UUID-based) file name in the image directory.
     *
     * @param url address of the image
     * @return the generated file name, or an empty string when the download or the write fails
     */
    public String doGetImage(String url) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(this.getConfig());

        String picName = "";
        try (CloseableHttpResponse response = this.httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                // Rename the image: random UUID plus the original extension
                String candidate = UUID.randomUUID().toString() + extensionOf(url);

                // Closing the stream releases the file handle and flushes the data;
                // the name is only reported once the file has been written completely.
                try (OutputStream outputStream = new FileOutputStream(new File(IMAGE_DIR + candidate))) {
                    response.getEntity().writeTo(outputStream);
                }
                picName = candidate;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return picName;
    }

    /**
     * Extracts the file extension (including the leading dot) from the last path segment of a URL.
     *
     * @param url the address to inspect
     * @return the extension, or an empty string when the last path segment contains no dot
     */
    private static String extensionOf(String url) {
        int lastDot = url.lastIndexOf('.');
        int lastSlash = url.lastIndexOf('/');
        // A dot before the last slash belongs to the host or a directory, not to the file name.
        return lastDot > lastSlash ? url.substring(lastDot) : "";
    }

    /**
     * Builds the per-request timeout configuration (values are in milliseconds).
     *
     * @return the request configuration applied to every GET
     */
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                // Maximum time to establish the connection
                .setConnectTimeout(1000)
                // Maximum time to obtain a connection from the pool
                .setConnectionRequestTimeout(500)
                // Maximum time to wait for data on the socket
                .setSocketTimeout(10000)
                .build();
    }

    /**
     * Manual smoke test: fetches one product page and prints the title element and its text.
     *
     * @param args unused
     * @throws IOException declared for compatibility; download errors are handled inside {@link #doGetHtml}
     */
    public static void main(String[] args) throws IOException {
        HttpUtils httpUtils = new HttpUtils();
        String itemInfo = httpUtils.doGetHtml("https://item.jd.com/100009082466.html");
        // Parse once and reuse the document for both outputs
        Document document = Jsoup.parse(itemInfo);
        System.out.println(document.select("div#itemName"));
        System.out.println(document.select("div#itemName").text());
    }
}

7.爬取任务实现

ItemTask.java:

package cn.mlnt.jd.task;

import cn.mlnt.jd.pojo.Item;
import cn.mlnt.jd.service.ItemService;
import cn.mlnt.jd.util.HttpUtils;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import javax.annotation.Resource;
import java.util.Date;
import java.util.List;

@Component
public class ItemTask {
    
    

    @Resource
    private HttpUtils httpUtils;

    @Resource
    private ItemService itemService;

    private static final ObjectMapper MAPPER = new ObjectMapper();

    /**
     * 当下载任务完成后,间隔多长时间进行下一次任务
     * @throws Exception
     */
    @Scheduled(fixedDelay = 100*1000)
    public void itemTask() throws Exception {
    
    
        // 声明需要解析的初始地址
        String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&pvid=1e449f956a3b49319117b81bbde91f3c";

        // 按照页面对手机的搜索结果进行遍历解析
        for (int i = 1; i < 10; i=i+2) {
    
    
            String html = httpUtils.doGetHtml(url + i);

            // 解析页面,获取商品数据并存储
            if (html != null) {
    
    
                this.parse(html);
            }
        }
        System.out.println("手机数据抓取完成!");
    }

    /**
     * 解析页面,获取商品数据并存储
     * @param html
     * @throws Exception
     */
    private void parse(String html) throws Exception {
    
    
        // 解析html获取Document对象
        Document document = Jsoup.parse(html);

        // 获取spu
        Elements spuEles = document.select("div#J_goodsList > ul > li");
        for (Element spuEle : spuEles) {
    
    
            // 获取spu
            String attr = spuEle.attr("data-spu");
            long spu = Long.parseLong(attr.equals("") ? "0" : attr);

            // 获取sku
            Elements skuEles = spuEle.select("li.ps-item");
            for (Element skuELe : skuEles) {
    
    
                // 获取sku
                long sku = Long.parseLong(skuELe.select("[data-sku]").attr("data-sku"));

                // 根据sku查询商品数据
                Item item = new Item();
                item.setSku(sku);
                List<Item> list = this.itemService.findAll(item);

                if(list.size() > 0) {
    
    
                    // 如果商品存在,就进行下一个循环,该商品不保存,因为已存在
                    continue;
                }

                // 设置商品的spu
                item.setSpu(spu);

                // 获取商品详情的url
                String itemUrl = "https://item.jd.com/" + sku + ".html";
                item.setUrl(itemUrl);

                // 获取商品的图片
                String picUrl = "https:" + skuELe.select("img[data-sku]").first().attr("data-lazy-img");
                //图片路径可能会为空的情况:一下为两种解决方式,第一种会让数据不全,第二种任会报错
                if(picUrl.equals("https:")){
    
    
                    break;
                }
                picUrl = picUrl.replace("/n9/", "/n1/");
                String picName = this.httpUtils.doGetImage(picUrl);
                item.setPic(picName);

                // 获取商品的价格
                String priceJson = this.httpUtils.doGetHtml("https://p.3.cn/prices/mgets?skuIds=J_" + sku);
                double price = MAPPER.readTree(priceJson).get(0).get("p").asDouble();
                item.setPrice(price);

                //获取商品的标题
                String itemInfo = this.httpUtils.doGetHtml(item.getUrl());
                // String title = Jsoup.parse(itemInfo).select("div.sku-name").text();
                String title = Jsoup.parse(itemInfo).select("div#itemName").text();
                item.setTitle(title);

                item.setCreated(new Date());
                item.setUpdated(item.getCreated());

                // 保存商品数据到数据库中
                this.itemService.save(item);
            }
        }
    }
}

8.启动类

Application.java:

package cn.mlnt.jd;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;

/**
 * Spring Boot entry point of the crawler.
 *
 * <p>Scheduled tasks must be switched on before they are used, which is what the
 * {@code @EnableScheduling} annotation on this class does.
 */
@SpringBootApplication
@EnableScheduling
public class Application {

    /**
     * Starts the Spring application.
     *
     * @param args command-line arguments, passed through to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication.run(Application.class, args);
    }
}

三、项目执行效果

在这里插入图片描述

  • 爬取到的图片
    在这里插入图片描述
  • 存储到数据库中的记录
    在这里插入图片描述

总结

参照视频敲完后,执行项目并没有爬到数据,因为视频中没有提及要添加header,声明为浏览器访问。后来参考网上的博客后,遇到的问题基本解决。
 //设置请求Request Headers中的User-Agent,告诉京东说这是浏览器访问
        httpGet.addHeader("User-Agent","Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Mobile Safari/537.36");

参考文章地址:

猜你喜欢

转载自blog.csdn.net/username666/article/details/109134932