Java 爬虫学习(三)关于京东手机信息爬取

0. 效果

  • 数据库

  • 手机图片

 

1. 项目搭建 (创建 springboot 项目,集成 jpa,lombok)

  • 项目结构

  • 数据库表结构

  • pom.xml 
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the JD crawler demo (Spring Boot + JPA + HttpClient + Jsoup). -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <!-- Inherit plugin and dependency version management from the Spring Boot parent POM. -->
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.1.6.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <!-- Coordinates of this project. -->
    <groupId>mr.s</groupId>
    <artifactId>crawlerjd</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>crawlerjd</name>
    <description>crawler-jd</description>

    <properties>
        <!-- Source/target Java level used by the Spring Boot parent configuration. -->
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <!-- Persistence: Spring Data JPA (used by ItemDao). -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-jpa</artifactId>
        </dependency>
        <!-- NOTE(review): likely redundant, the JPA starter is expected to bring JDBC support in already; confirm before removing. -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-jdbc</artifactId>
        </dependency>
        <!-- Web starter; presumably also the source of the Jackson ObjectMapper used by ItemTask (no explicit Jackson dependency is declared). -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

        <!-- Development-time tooling only; optional so it is not passed on to dependents. -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-devtools</artifactId>
            <scope>runtime</scope>
            <optional>true</optional>
        </dependency>
        <!-- MySQL JDBC driver; only needed at runtime. -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <scope>runtime</scope>
        </dependency>
        <!-- Lombok generates the accessors for the Item entity (@Data). -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <optional>true</optional>
        </dependency>
        <!-- Test support. -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <!-- HTTP client used by HttpUtils to fetch pages and images. -->
        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.2</version>
        </dependency>
        <!-- HTML parser used by ItemTask to extract product data. -->
        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.8.3</version>
        </dependency>
        <!-- General-purpose helpers; NOTE(review): not referenced by the code shown in this article. -->
        <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.8.1</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Spring Boot Maven plugin (packaging/running the application). -->
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>

</project>
  • application.properties 

#DB Configuration
# JDBC driver class from MySQL Connector/J.
spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver
# Local MySQL server, schema "crawler"; UTF-8 text and the Asia/Shanghai server time zone are requested via URL parameters.
spring.datasource.url=jdbc:mysql://127.0.0.1:3306/crawler?useUnicode=true&serverTimezone=Asia/Shanghai&characterEncoding=utf-8&nullCatalogMeansCurrent=true
# NOTE(review): demo credentials in plain text; replace with your own and keep real passwords out of version control.
spring.datasource.username=root
spring.datasource.password=123

#Jpa Configuration
spring.jpa.database=MySQL
# Print the generated SQL statements (useful while following the tutorial).
spring.jpa.show-sql=true
# Disable the Open-Session-in-View pattern.
spring.jpa.open-in-view=false

2. 代码编写

  •  pojo 下的 Item 类编写

package mr.s.jd.pojo;

import lombok.Data;

import javax.persistence.*;
import java.util.Date;

/**
 * JPA entity mapped to the {@code jd_item} table; one row per crawled product (SKU).
 *
 * <p>Getters, setters, {@code equals}/{@code hashCode} and {@code toString} are
 * generated by Lombok's {@code @Data}.
 */
@Entity
@Table(name = "jd_item")
@Data
public class Item {
    /** Primary key, generated by the database (identity column). */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;
    /** Value of the {@code data-spu} attribute of the search-result entry. */
    private Long spu;
    /** Value of the {@code data-sku} attribute; also used to build the detail-page URL. */
    private Long sku;
    /** Product title taken from the detail page. */
    private String title;
    /** Product price as returned by the price endpoint. */
    private Double price;
    /** File name of the downloaded product image (not a full path). */
    private String pic;
    /** URL of the product detail page. */
    private String url;
    /** Time the record was created by the crawler. */
    private Date created;
    /** Time the record was last updated; set equal to {@link #created} on first save. */
    private Date updated;
}
  • dao 下的 ItemDao 接口编写
package mr.s.jd.dao;

import mr.s.jd.pojo.Item;
import org.springframework.data.jpa.repository.JpaRepository;

/**
 * Spring Data JPA repository for {@link Item} entities with a {@code Long} primary key.
 *
 * <p>Declares no methods of its own; all operations come from {@link JpaRepository}.
 */
public interface ItemDao extends JpaRepository<Item, Long> {

}
  • util 下的 HttpUtils 工具类编写(注意:代码中的图片保存目录是写死的本机路径,运行前请改成自己机器上已存在的目录)
package mr.s.jd.util;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;

/**
 * Small HTTP helper built on Apache HttpClient with a pooled connection manager.
 *
 * <p>Offers two operations: fetching a page as text and downloading an image to a
 * local directory. Both methods report failure by returning an empty string rather
 * than throwing on I/O errors.
 */
@Component
public class HttpUtils {
    /**
     * Directory downloaded images are written to. It must already exist and be
     * writable; adjust it to the local machine before running.
     */
    private static final String IMAGE_DIR = "C:\\Users\\xxx\\Desktop\\download\\";

    // Connection pool manager backing the shared client
    private final PoolingHttpClientConnectionManager cm;

    // Built once on top of the pool and reused for every request, instead of
    // constructing a new client object per call. Deliberately never closed here,
    // because closing it would also shut down the pool.
    private final CloseableHttpClient httpClient;

    public HttpUtils(){
        this.cm = new PoolingHttpClientConnectionManager();

        // Maximum number of connections in the whole pool
        this.cm.setMaxTotal(100);
        // Maximum number of connections per host (route)
        this.cm.setDefaultMaxPerRoute(10);

        this.httpClient = HttpClients.custom().setConnectionManager(this.cm).build();
    }

    /**
     * Fetches a page with an HTTP GET request.
     *
     * @param url address of the page to fetch
     * @return the response body as text, or an empty string when the status is not
     *         200, the response has no body, or an I/O error occurs
     */
    public String doGetHtml(String url){
        HttpGet httpGet = new HttpGet(url);

        // Browser-like headers; sent with every page request
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36");

        httpGet.setHeader("Referer", "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&wq=%E6%89%8B%E6%9C%BA&pvid=b1a43153d64f4920a10f8ca31aa6fa6b");

        // Apply the timeout settings
        httpGet.setConfig(this.getConfig());

        // try-with-resources guarantees the response is closed on every path
        try (CloseableHttpResponse httpResponse = this.httpClient.execute(httpGet)) {
            if (httpResponse.getStatusLine().getStatusCode() == 200
                    && httpResponse.getEntity() != null) {
                // UTF-8 is the fallback charset when the response does not declare one
                return EntityUtils.toString(httpResponse.getEntity(), "UTF-8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        // Request failed: return an empty string
        return "";
    }

    /**
     * Downloads an image with an HTTP GET request and stores it under a random
     * (UUID-based) file name in the download directory.
     *
     * @param url address of the image
     * @return the generated file name (without directory), or an empty string when
     *         the status is not 200, the response has no body, or an I/O error occurs
     */
    public String doGetImage(String url){
        HttpGet httpGet = new HttpGet(url);

        // Apply the timeout settings
        httpGet.setConfig(this.getConfig());

        try (CloseableHttpResponse httpResponse = this.httpClient.execute(httpGet)) {
            if (httpResponse.getStatusLine().getStatusCode() == 200
                    && httpResponse.getEntity() != null) {
                // Random name plus the original extension
                String picName = UUID.randomUUID().toString() + extensionOf(url);
                File target = new File(IMAGE_DIR + picName);

                // The stream is closed even when the transfer fails; the original
                // code leaked one file handle per downloaded image.
                try (OutputStream outputStream = new FileOutputStream(target)) {
                    httpResponse.getEntity().writeTo(outputStream);
                } catch (IOException e) {
                    // Best effort: do not leave a truncated image behind
                    target.delete();
                    throw e;
                }
                return picName;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        // Download failed: return an empty string
        return "";
    }

    /**
     * Extracts the file extension (including the leading dot) from a URL.
     *
     * <p>Any query string is ignored, and only a dot inside the last path segment
     * counts, so a URL without an extension yields an empty string instead of an
     * exception or a fragment of the host name.
     */
    private static String extensionOf(String url) {
        int queryStart = url.indexOf('?');
        String path = queryStart >= 0 ? url.substring(0, queryStart) : url;
        int dot = path.lastIndexOf('.');
        int slash = path.lastIndexOf('/');
        return dot > slash ? path.substring(dot) : "";
    }

    // Timeout settings (milliseconds) applied to every request
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectTimeout(1000)          // establishing the connection
                .setConnectionRequestTimeout(500) // obtaining a connection from the pool
                .setSocketTimeout(10000)          // waiting for data
                .build();
    }
}
  • service 下的 ItemService 服务接口编写
package mr.s.jd.service;

import mr.s.jd.pojo.Item;

import java.util.List;

/**
 * Service-layer contract for storing and querying crawled {@link Item} records.
 */
public interface ItemService {

    /**
     * Saves the given item.
     *
     * @param item the item to save
     */
    void save(Item item);

    /**
     * Queries items using the given item as the search criteria.
     *
     * @param item item carrying the values to search by
     * @return the items found
     */
    List<Item> findAll(Item item);
}
  • service.impl 下的 ItemServiceImpl 服务实现类编写
package mr.s.jd.service.impl;

import mr.s.jd.dao.ItemDao;
import mr.s.jd.pojo.Item;
import mr.s.jd.service.ItemService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Example;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;

import java.util.List;

/**
 * {@link ItemService} implementation that delegates to the {@link ItemDao} repository.
 */
@Service
public class ItemServiceImpl implements ItemService {

    @Autowired
    private ItemDao itemDao;

    /**
     * Persists the item inside a transaction.
     *
     * @param item the item to save
     */
    @Override
    @Transactional
    public void save(Item item) {
        this.itemDao.save(item);
    }

    /**
     * Runs a query-by-example: the given item serves as the probe.
     *
     * @param item probe item
     * @return the items returned by the repository for that probe
     */
    @Override
    public List<Item> findAll(Item item) {
        return this.itemDao.findAll(Example.of(item));
    }
}
  • 最重要的 task 下的 ItemTask 定时任务编写
package mr.s.jd.task;

import com.fasterxml.jackson.databind.ObjectMapper;
import mr.s.jd.pojo.Item;
import mr.s.jd.service.ItemService;
import mr.s.jd.util.HttpUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled;

import java.util.Date;
import java.util.List;

/**
 * Scheduled crawler that walks JD phone search-result pages and stores every SKU
 * that is not yet in the database, together with its image, price and title.
 */
@Configuration // 1. Marks a configuration class; also has the effect of @Component.
@EnableScheduling // 2. Enables scheduled tasks
public class ItemTask {

    @Autowired
    private HttpUtils httpUtils;

    @Autowired
    private ItemService itemService;

    private static final ObjectMapper MAPPER = new ObjectMapper();

    /**
     * Crawls search pages 1, 3, 5, 7 and 9 and stores the products found.
     *
     * <p>{@code fixedDelay} is the pause, in milliseconds, between the end of one
     * run and the start of the next.
     *
     * @throws Exception if parsing or persisting a page fails unexpectedly
     */
    @Scheduled(fixedDelay = 100 * 1000)
    public void itemTask() throws Exception{
        // Base address of the search; the page number is appended
        String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&s=57&click=0&page=";
        // Iterate over the page numbers
        for (int page = 1; page < 10; page = page + 2){
            String html = httpUtils.doGetHtml(url + page);
            // Parse the page, extract the product data and store it
            this.parse(html);
        }

        System.out.println("手机数据抓取完成");
    }

    /**
     * Parses one search-result page and saves every SKU not yet stored.
     *
     * @param html page content; an empty page simply yields no products
     * @throws Exception if fetching details or persisting an item fails unexpectedly
     */
    private void parse(String html) throws Exception{
        Document doc = Jsoup.parse(html);

        // One list entry per SPU
        Elements spuElements = doc.select("div#J_goodsList > ul > li");

        for (Element spuElement : spuElements){
            // The attribute may be empty or non-numeric; the SPU is then left null
            // instead of aborting the whole crawl with a NumberFormatException.
            Long spu = parseLongOrNull(spuElement.attr("data-spu"));

            // SKU entries belonging to this SPU
            Elements skuElements = spuElement.select("li.ps-item");

            for (Element skuElement : skuElements) {
                Long sku = parseLongOrNull(skuElement.select("[data-sku]").attr("data-sku"));
                if (sku == null) {
                    // Without a SKU neither the lookup nor the detail URL can be built
                    continue;
                }

                // Look up existing records by SKU
                Item item = new Item();
                item.setSku(sku);
                List<Item> itemList = itemService.findAll(item);

                // Only store products that do not exist yet
                if (itemList.isEmpty()){
                    this.fillAndSave(item, spu, skuElement);
                }
            }
        }
    }

    /** Completes a new item (URL, image, price, title, timestamps) and persists it. */
    private void fillAndSave(Item item, Long spu, Element skuElement) throws Exception {
        item.setSpu(spu);

        // Build the detail-page address
        String itemUrl = "https://item.jd.com/" + item.getSku() + ".html";
        item.setUrl(itemUrl);

        // Download the product image; skipped when the page carries no image address,
        // because "https:" alone is not a valid request URL
        String lazyImg = skuElement.select("img[data-sku]").attr("data-lazy-img");
        String picName = "";
        if (!lazyImg.isEmpty()) {
            String picUrl = ("https:" + lazyImg).replace("/n9/", "/n1/");
            picName = httpUtils.doGetImage(picUrl);
        }
        item.setPic(picName);

        // Price; left null when it cannot be determined
        item.setPrice(this.fetchPrice(item.getSku()));

        // Title from the detail page
        String itemInfo = httpUtils.doGetHtml(item.getUrl());
        String title = Jsoup.parse(itemInfo).select("div.sku-name").text();
        item.setTitle(title);

        item.setCreated(new Date());
        item.setUpdated(item.getCreated());

        // Persist the product
        itemService.save(item);
    }

    /**
     * Requests the price of a SKU from the price endpoint.
     *
     * @return the price, or {@code null} when the request failed or the response
     *         does not have the expected {@code [{"p": ...}]} shape
     */
    private Double fetchPrice(Long sku) {
        String priceJson = httpUtils.doGetHtml("https://p.3.cn/prices/mgets?skuIds=J_" + sku);
        // doGetHtml reports failure as an empty string; parsing that would end in a
        // NullPointerException further down the chain
        if (priceJson == null || priceJson.trim().isEmpty()) {
            System.out.println("price unavailable for sku " + sku);
            return null;
        }
        try {
            // path(...) never returns null, and asDouble(default) falls back to the
            // default for missing or non-numeric nodes
            double price = MAPPER.readTree(priceJson).path(0).path("p").asDouble(Double.NaN);
            if (Double.isNaN(price)) {
                System.out.println("price unavailable for sku " + sku);
                return null;
            }
            return price;
        } catch (java.io.IOException e) {
            // Malformed JSON: treat it like a missing price
            System.out.println("price response not parseable for sku " + sku + ": " + e.getMessage());
            return null;
        }
    }

    /** Parses a decimal long, returning {@code null} for null, blank or non-numeric text. */
    private static Long parseLongOrNull(String raw) {
        if (raw == null) {
            return null;
        }
        String trimmed = raw.trim();
        if (trimmed.isEmpty()) {
            return null;
        }
        try {
            return Long.valueOf(trimmed);
        } catch (NumberFormatException e) {
            return null;
        }
    }
}

超级重要的 CrawlerjdApplication 类的编写!!!

package mr.s.jd;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;

/**
 * Spring Boot entry point of the crawler application.
 */
@SpringBootApplication
// Enable scheduled tasks
@EnableScheduling
public class CrawlerjdApplication {

    /**
     * Starts the Spring application.
     *
     * @param args command-line arguments, passed through to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication.run(CrawlerjdApplication.class, args);
    }

}

按照以上的操作,就可以通过爬虫方式获取到京东手机的信息了,感谢浏览!

发布了92 篇原创文章 · 获赞 23 · 访问量 2万+

猜你喜欢

转载自blog.csdn.net/assiduous_me/article/details/96710997