Java爬取京东商品数据

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/QuietHRH/article/details/82808212

爬取京东商品数据

我把项目部署到了linux中,进行爬取,爬到了3000条手机信息,只是爬了一些简单的文本信息.

SQL

本文爬取的数据为京东手机信息

准备工作

  • 导入爬取数据需要的依赖包
  • 编写httpClient工具类
  • 编写pojo类
  • 编写dao
<!-- Maven dependencies needed by the crawler shown in this article. -->
<dependencies>
        <!-- Apache HttpClient: used by HttpClientUtils to issue the HTTP requests -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.4</version>
        </dependency>
        <!-- jsoup: used to parse the downloaded HTML and run CSS selectors on it -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>
        <!-- MySQL JDBC driver (com.mysql.jdbc.Driver is configured in ProductDao) -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.38</version>
        </dependency>

        <!-- spring-jdbc: provides JdbcTemplate, the base class of ProductDao -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-jdbc</artifactId>
            <version>4.2.4.RELEASE</version>
        </dependency>

        <!-- c3p0: provides ComboPooledDataSource, the connection pool used by ProductDao -->
        <dependency>
            <groupId>c3p0</groupId>
            <artifactId>c3p0</artifactId>
            <version>0.9.1.2</version>
        </dependency>

        <!-- Gson: used to parse the JSON returned by the price endpoint -->
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.8.1</version>
        </dependency>
    </dependencies>

package com.hrh.utils;

import com.hrh.pojo.Product;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * Static helpers that issue GET/POST requests through a pooled Apache HttpClient
 * and return the response body decoded as UTF-8 text.
 */
public class HttpClientUtils {

    // Connection pool shared by every request sent through this class
    private static final PoolingHttpClientConnectionManager connectionManager;

    // One client on top of the pool, built once instead of once per request
    private static final CloseableHttpClient httpClient;

    static{
        connectionManager=new PoolingHttpClientConnectionManager();
        // Upper bound on connections held by the pool
        connectionManager.setMaxTotal(200);
        // Upper bound on connections per route (target host)
        connectionManager.setDefaultMaxPerRoute(20);
        httpClient = HttpClients.custom().setConnectionManager(connectionManager).build();
    }

    /**
     * Returns the shared client backed by the connection pool.
     */
    private static CloseableHttpClient getCloseableHttpClient(){
        return httpClient;
    }

    /**
     * Sends the request with a browser User-Agent and the configured timeouts.
     *
     * @param httpRequestBase the prepared GET or POST request
     * @return the response body decoded as UTF-8
     * @throws IOException if the request fails, times out, or the response carries no body
     */
    private static String execute(HttpRequestBase httpRequestBase) throws IOException {
        httpRequestBase.setHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0");

        // Timeouts: 5s to lease a pooled connection, 5s to connect, 10s between data packets
        RequestConfig config = RequestConfig.custom()
                .setConnectionRequestTimeout(5000)
                .setConnectTimeout(5000)
                .setSocketTimeout(10 * 1000)
                .build();
        httpRequestBase.setConfig(config);

        // try-with-resources closes the response on every path, including a failure
        // while reading the body, so the connection is always released
        try (CloseableHttpResponse response = getCloseableHttpClient().execute(httpRequestBase)) {
            if (response.getEntity() == null) {
                throw new IOException("Response has no body: " + httpRequestBase.getURI());
            }
            return EntityUtils.toString(response.getEntity(), "utf-8");
        }
    }

    /**
     * Performs a GET request.
     *
     * @param url the absolute URL to fetch
     * @return the response body decoded as UTF-8
     * @throws IOException if the request fails or the response carries no body
     */
    public static String doGet(String url) throws IOException {
        return execute(new HttpGet(url));
    }

    /**
     * Performs a form-encoded POST request.
     *
     * @param url    the absolute URL to post to
     * @param params form fields to send; {@code null} is treated as "no fields"
     * @return the response body decoded as UTF-8
     * @throws IOException if the request fails or the response carries no body
     */
    public static String doPost(String url, Map<String,String> params) throws IOException {
        HttpPost httpPost = new HttpPost(url);

        List<BasicNameValuePair> list = new ArrayList<>();
        if (params != null) {
            for (Map.Entry<String,String> entry : params.entrySet()) {
                list.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
            }
        }

        // Encode the form as UTF-8 explicitly: the single-argument constructor falls
        // back to ISO-8859-1, which cannot represent Chinese field values
        UrlEncodedFormEntity entity = new UrlEncodedFormEntity(list, "utf-8");
        httpPost.setEntity(entity);

        return execute(httpPost);
    }
}

package com.hrh.pojo;

/**
 * Row model for one crawled product; every attribute is kept as plain text.
 */
public class Product {

    /** Product id taken from the list page. */
    private String pid;
    /** Title shown on the product detail page. */
    private String title;
    /** Brand name. */
    private String brand;
    /** Product name. */
    private String pname;
    /** Price, stored as the text returned by the price endpoint. */
    private String price;

    public String getPid() {
        return this.pid;
    }

    public void setPid(String pid) {
        this.pid = pid;
    }

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getBrand() {
        return this.brand;
    }

    public void setBrand(String brand) {
        this.brand = brand;
    }

    public String getPname() {
        return this.pname;
    }

    public void setPname(String pname) {
        this.pname = pname;
    }

    public String getPrice() {
        return this.price;
    }

    public void setPrice(String price) {
        this.price = price;
    }

    /**
     * Renders all five fields; title, brand and pname are wrapped in single quotes,
     * pid and price are not.
     */
    @Override
    public String toString() {
        return String.format(
                "Product{pid=%s, title='%s', brand='%s', pname='%s', price=%s}",
                pid, title, brand, pname, price);
    }
}

package com.hrh.dao;

import com.hrh.pojo.Product;
import com.mchange.v2.c3p0.ComboPooledDataSource;
import org.springframework.jdbc.core.JdbcTemplate;

import java.beans.PropertyVetoException;

/**
 * DAO for crawled products: a JdbcTemplate wired to a c3p0 connection pool
 * pointing at the local MySQL "crawler" database.
 */
public class ProductDao extends JdbcTemplate{

    /**
     * Builds the c3p0 pool and installs it as this template's data source.
     *
     * @throws IllegalStateException if the pool rejects the configuration
     */
    public ProductDao(){

        // Set up the c3p0 connection pool
        // NOTE(review): credentials and JDBC URL are hard-coded; consider moving
        // them to external configuration before using this outside a demo.
        ComboPooledDataSource ds = new ComboPooledDataSource();
        try {
            ds.setDriverClass("com.mysql.jdbc.Driver");
            ds.setUser("root");
            ds.setPassword("123");
            ds.setJdbcUrl("jdbc:mysql://localhost:3306/crawler?characterEncoding=utf-8");
        } catch (PropertyVetoException e) {
            // Fail fast: a half-configured pool would only surface later as an
            // unrelated-looking failure on the first insert.
            throw new IllegalStateException("Unable to configure the c3p0 data source", e);
        }

        super.setDataSource(ds);
    }

    /**
     * Inserts one product row into {@code jd_phone}.
     *
     * @param product the product to persist
     */
    public void addProduct(Product product){
        // Values are bound positionally as pid, title, pname, brand, price.
        // NOTE(review): the statement names no columns, so this order must match the
        // table definition (not shown here) - confirm against the schema.
        super.update("insert into jd_phone values (?,?,?,?,?)",
                product.getPid(),product.getTitle(),product.getPname(),product.getBrand(),product.getPrice());

    }


}

核心代码

创建线程池和队列 开启线程 等待队列中的数据并进行分析
博客: 线程池和队列的基本使用

在获得手机列表时,pid一个一个的解析,效率太低,解析完一个页面的pid,才能进入下一页,继续解析.所以要引入多线程

  • 线程池的使用: 提高程序执行效率
    • 如果使用线程池,就要考虑线程安全问题
    • pid在存储时,要放到线程安全的容器中, 并且容器是FIFO的
  • 队列的使用: 线程安全(阻塞队列)

基本流程 :

  • 确定手机列表页的URL 进行分析 得到本页中所有手机的pid
  • 将pid 放入阻塞队列中 等待线程的解析
  • 根据pid确定具体手机的URL 进行解析 将数据封装到product对象中
  • 调用dao 将product对象存到数据库中
package com.hrh.test;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.hrh.dao.ProductDao;
import com.hrh.pojo.Product;
import com.hrh.utils.HttpClientUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.concurrent.*;


/**
 * Crawls JD phone search results: the main thread walks the list pages and pushes
 * product ids into a bounded queue, while pooled workers fetch each product's
 * detail page and price and store the result through {@link ProductDao}.
 *
 * The monitor and worker tasks loop until their thread is interrupted; this class
 * never shuts the pool down itself.
 */
public class JDPhone {

    // DAO used by the workers to store parsed products
    static ProductDao productDao = new ProductDao();
    // Thread pool hosting the queue monitor and the pid workers
    static ExecutorService threadPool = Executors.newFixedThreadPool(20);
    // Bounded blocking queue of product ids, capacity 1000
    static BlockingQueue<String> queue=new ArrayBlockingQueue<String>(1000);

    /**
     * Starts the queue monitor and 10 workers, then feeds the queue from 100 list pages.
     *
     * @throws IOException          declared for compatibility; page download failures are logged and skipped
     * @throws InterruptedException if the main thread is interrupted while the queue is full
     */
    public static void main(String[] args) throws IOException, InterruptedException {

        // Task that reports the queue size once per second
        threadPool.execute(new Runnable() {
            @Override
            public void run() {
                while(true){
                    try {
                        Thread.sleep(1000);
                    } catch (InterruptedException e) {
                        // Restore the flag and stop instead of swallowing the interrupt
                        Thread.currentThread().interrupt();
                        return;
                    }
                    // Current number of queued pids
                    int size = queue.size();
                    System.out.println("当前队列中有"+size+"个pid");
                }
            }
        });

        // Start 10 workers that consume the pids collected from the list pages
        for (int i = 1; i <=10; i++) {
            threadPool.execute(new Runnable() {
                @Override
                public void run() {
                    while (true){
                        String pid;
                        try {
                            // Take the next pid, blocking while the queue is empty
                            pid = queue.take();
                        } catch (InterruptedException e) {
                            // No pid was obtained, so there is nothing to re-queue
                            // (the queue rejects null); stop this worker.
                            Thread.currentThread().interrupt();
                            return;
                        }

                        try {
                            Product product = parsePid(pid);
                            // Store in the database
                            productDao.addProduct(product);
                        } catch (Exception e) {
                            e.printStackTrace();
                            // NOTE(review): a pid that fails every time is re-queued
                            // forever; consider capping the number of retries.
                            try {
                                // Put the pid back so it is retried later
                                queue.put(pid);
                            } catch (InterruptedException e1) {
                                Thread.currentThread().interrupt();
                                return;
                            }
                        }
                    }
                }
            });
        }


        // Walk the phone search results, 100 pages in total
        for (int i = 1; i <=100 ; i++) {
            // JD numbers result pages 1, 3, 5, 7, ... for the 1st, 2nd, 3rd, ... page
            String url="https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&page="+(2*i-1);
            try {
                String html = HttpClientUtils.doGet(url);
                parseIndex(html);
            } catch (IOException e) {
                // One failed page must not abort the remaining pages
                e.printStackTrace();
            }
        }

    }

    /**
     * Extracts the product ids from one list page and enqueues them.
     *
     * @param html markup of a search result page
     * @throws InterruptedException if interrupted while waiting for queue space
     */
    private  static void parseIndex(String html) throws IOException, InterruptedException {
        Document document = Jsoup.parse(html);
        // One <li> per phone in the result list
        Elements elements = document.select("#J_goodsList>ul>li");

        if(elements!=null&&elements.size()!=0){
            for (Element element : elements) {
                // Product id carried by the list item
                // NOTE(review): JDPC reads "data-sku" for the same purpose - confirm
                // which attribute matches the item.jd.com URL.
                String pid = element.attr("data-pid");
                if (pid.isEmpty()) {
                    // attr() yields "" when the attribute is missing; an empty id
                    // would only produce the invalid URL https://item.jd.com/.html
                    continue;
                }
                // Hand the pid over to the workers
                queue.put(pid);
            }
        }
    }

    /**
     * Downloads one product's detail page and price and maps them onto a Product.
     *
     * @param pid product id taken from the list page
     * @return the populated product
     * @throws IOException if a download fails or the price response has no usable entry
     */
    private static Product parsePid(String pid) throws IOException {
        // Build the detail page URL
        String productUrl="https://item.jd.com/"+pid+".html";
        String productHtml = HttpClientUtils.doGet(productUrl);
        Document document = Jsoup.parse(productHtml);

        Product product = new Product();

        // Title: only set when the page has a sku-name element
        Elements titleElements = document.select("div.sku-name");
        if(titleElements.size()>0){
            product.setTitle(titleElements.get(0).text());
        }

        // Brand
        String brand = document.select("#parameter-brand li").attr("title");
        product.setBrand(brand);

        // Product name
        String pname = document.select("[class=parameter2 p-parameter-list] li:first-child").attr("title");
        product.setPname(pname);

        // The price is loaded asynchronously and is absent from the detail HTML, so it
        // is requested from the price endpoint, which answers with JSON. pduid is set
        // to a random number because JD throttles by IP; only part of the data can
        // still be fetched.
        String priceUrl="https://p.3.cn/prices/mgets?pduid="+Math.random()+"&skuIds=J_"+pid;
        String priceJson = HttpClientUtils.doGet(priceUrl);
        System.out.println(priceJson);
        product.setPrice(parsePrice(priceJson, pid));

        product.setPid(pid);
        return product;
    }

    /**
     * Reads the "p" field of the first entry in the price endpoint's JSON array.
     *
     * @param priceJson raw response body of the price endpoint
     * @param pid       product id, used only in the error message
     * @return the price as text, or {@code null} when the entry has no "p" field
     * @throws IOException if the response contains no entry at all
     */
    private static String parsePrice(String priceJson, String pid) throws IOException {
        Gson gson = new GsonBuilder().create();
        List<?> entries = gson.fromJson(priceJson, List.class);
        if (entries == null || entries.isEmpty() || !(entries.get(0) instanceof Map)) {
            throw new IOException("No price entry for pid " + pid + ": " + priceJson);
        }
        Object price = ((Map<?, ?>) entries.get(0)).get("p");
        return price == null ? null : String.valueOf(price);
    }


}

出现的问题:

  • SocketTimeoutException 超时异常,因为jd对IP进行了限制,请求次数太多时,会被限制, 以后的文章会解决这个问题…

---------------------------------------------------------------------更新…-------------------------------------------------------------

爬笔记本数据的代码(只是更换了URL)

SQL

package com.hrh.test;

import com.google.gson.Gson;
import com.hrh.dao.ProductDao;
import com.hrh.pojo.Product;
import com.hrh.utils.HttpClientUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.List;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

/**
 * Crawls JD laptop search results: the main thread walks the list pages and pushes
 * sku ids into a bounded queue, while pooled workers fetch each product's detail
 * page and price and store the result through {@link ProductDao}.
 *
 * The monitor and worker tasks loop until their thread is interrupted; this class
 * never shuts the pool down itself.
 *
 * @author QuietHR
 * @create 2018/9/22
 **/
public class JDPC {

    // Bounded blocking queue of product ids, capacity 1000
    private static BlockingQueue<String> queue=new ArrayBlockingQueue<String>(1000);

    // Thread pool hosting the queue monitor and the 30 workers
    private static  ExecutorService executorService = Executors.newFixedThreadPool(50);

    // DAO used by the workers to store parsed products
    private static ProductDao productDao=new ProductDao();



    /**
     * Starts the queue monitor and 30 workers, then feeds the queue from the list pages.
     *
     * @throws Exception if the main thread is interrupted while the queue is full
     */
    public static void main(String[] args) throws Exception {

        // Task that reports the queue size once per second
        executorService.execute(new Runnable() {
            @Override
            public void run() {
                while (true){
                    try {
                        Thread.sleep(1000);
                    } catch (InterruptedException e) {
                        // Restore the flag and stop instead of swallowing the interrupt
                        Thread.currentThread().interrupt();
                        return;
                    }
                    int size = queue.size();
                    System.out.println("当前队列中有"+size+"个pid");

                }
            }
        });

        for (int i = 0; i < 30; i++) {
            executorService.execute(new Runnable() {
                @Override
                public void run() {
                    while (true){
                        String pid;
                        try {
                            // Take the next id, blocking while the queue is empty
                            pid = queue.take();
                        } catch (InterruptedException e) {
                            // No id was obtained, so there is nothing to re-queue
                            // (the queue rejects null); stop this worker.
                            Thread.currentThread().interrupt();
                            return;
                        }

                        try {
                            Product product = parsePid(pid);
                            productDao.addProduct(product);
                        } catch (Exception e) {
                            e.printStackTrace();
                            // NOTE(review): an id that fails every time is re-queued
                            // forever; consider capping the number of retries.
                            try {
                                // Put the id back so it is retried later
                                queue.put(pid);
                            } catch (InterruptedException e1) {
                                Thread.currentThread().interrupt();
                                return;
                            }
                        }

                    }
                }
            });
        }

        page();

    }

    /**
     * Downloads the 100 laptop list pages and enqueues the ids found on each.
     *
     * @throws Exception if interrupted while waiting for queue space
     */
    private static void page() throws Exception {
        for (int i = 1; i <=100 ; i++) {
            // JD numbers result pages 1, 3, 5, ... hence 2*i-1
            String url="https://search.jd.com/Search?keyword=%E7%AC%94%E8%AE%B0%E6%9C%AC&enc=utf-8&page="+(2*i-1);
            String html;
            try {
                html = HttpClientUtils.doGet(url);
            } catch (java.io.IOException e) {
                // One failed page must not abort the remaining pages
                e.printStackTrace();
                continue;
            }
            parseIndex(html);
        }
    }

    /**
     * Extracts the sku ids from one list page and enqueues them.
     *
     * @param html markup of a search result page
     * @throws InterruptedException if interrupted while waiting for queue space
     */
    private static void parseIndex(String html) throws InterruptedException {
        Document document = Jsoup.parse(html);
        Elements liEl = document.select("[class=gl-warp clearfix]>li");
        for (Element li : liEl) {
            String sku = li.attr("data-sku");
            if (sku.isEmpty()) {
                // attr() yields "" when the attribute is missing; an empty id
                // would only produce the invalid URL https://item.jd.com/.html
                continue;
            }
            queue.put(sku);
        }
    }

    /**
     * Downloads one product's detail page and price and maps them onto a Product.
     *
     * @param pid product id taken from the list page
     * @return the populated product
     * @throws Exception if a download fails or the price response has no usable entry
     */
    private static Product parsePid(String pid) throws Exception {
        String url="https://item.jd.com/"+pid+".html";
        String html = HttpClientUtils.doGet(url);
        Document document = Jsoup.parse(html);

        Product product = new Product();

        product.setPid(pid);

        Elements titleEl = document.select("[class=sku-name]");
        product.setTitle(titleEl.text());

        Elements brandEl = document.select("#parameter-brand>li");
        product.setBrand(brandEl.attr("title"));

        Elements pnameEl = document.select("[class=parameter2 p-parameter-list]>li:first-child");
        product.setPname(pnameEl.attr("title"));

        // The price is requested separately from the price endpoint, which answers
        // with a JSON array.
        String productUrl="https://p.3.cn/prices/mgets?pduid="+Math.random()+"&skuIds=J_"+pid;
        String json = HttpClientUtils.doGet(productUrl);
        product.setPrice(parsePrice(json, pid));

        return product;
    }

    /**
     * Reads the "p" field of the first entry in the price endpoint's JSON array.
     *
     * @param json raw response body of the price endpoint
     * @param pid  product id, used only in the error message
     * @return the price as text, or {@code null} when the entry has no "p" field
     * @throws IllegalStateException if the response contains no entry at all
     */
    private static String parsePrice(String json, String pid) {
        Gson gson = new Gson();
        List<?> entries = gson.fromJson(json, List.class);
        if (entries == null || entries.isEmpty() || !(entries.get(0) instanceof Map)) {
            throw new IllegalStateException("No price entry for pid " + pid + ": " + json);
        }
        Object price = ((Map<?, ?>) entries.get(0)).get("p");
        return price == null ? null : String.valueOf(price);
    }
}

猜你喜欢

转载自blog.csdn.net/QuietHRH/article/details/82808212