java京东商品信息爬取

在京东首页搜索“手机”,可以看到大量手机商品信息。接下来用 Java 爬取前 100 页的商品信息并保存到数据库。


使用到的技术:HttpClient,Jsoup,多线程,阻塞队列
1.创建数据库,创建手机信息表

-- Recreates the `phone` table that receives one row per crawled product.
-- NOTE(review): the table has no primary key or unique index, so the same product id
-- can be inserted more than once; confirm that is intended.
DROP TABLE IF EXISTS `phone`;
CREATE TABLE `phone` (
  -- Product id; the crawler fills it from the item's data-sku attribute.
  `id` bigint(11) DEFAULT NULL,
  -- Product title text.
  `name` varchar(255) DEFAULT NULL,
  -- Listed price as parsed from the result page.
  `price` double DEFAULT NULL,
  -- Shop name taken from the shop link's title attribute.
  `shop` varchar(255) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

2.创建maven工程,导入依赖
 

    <dependencies>
        <!-- MySQL JDBC driver (com.mysql.jdbc.Driver, referenced from jdbcConfig.properties) -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.32</version>
        </dependency>
        <!-- Commons DbUtils: provides the QueryRunner used to run the INSERT statements -->
        <dependency>
            <groupId>commons-dbutils</groupId>
            <artifactId>commons-dbutils</artifactId>
            <version>1.4</version>
        </dependency>
        <!-- Apache HttpClient: used to download the search result pages -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.3</version>
        </dependency>
        <!-- Jsoup: HTML parser used to select the product elements -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.7.2</version>
        </dependency>
        <!-- Alibaba Druid: provides the DruidDataSource wrapped by JDBCUtils -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid</artifactId>
            <version>1.0.9</version>
        </dependency>
    </dependencies>

3.创建jdbcConfig.properties配置文件、封装了DataSource数据源的JDBCUtils工具类,以及商品实体类Phone
3.1    在工程目录下新建jdbcConfig.properties配置文件

# Connection settings read by JDBCUtils when the class is loaded.
# JDBC driver class name.
jdbc.driver=com.mysql.jdbc.Driver
# JDBC URL of the database that contains the phone table.
jdbc.url=jdbc:mysql://localhost:3306/db1
# Database account. NOTE(review): the password is stored in plain text here;
# replace these sample credentials before using this outside a local demo.
jdbc.username=root
jdbc.password=123456

3.2    JDBCUtils.java
 

package cn.swun.utils;

import com.alibaba.druid.pool.DruidDataSource;

import javax.sql.DataSource;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

/**
 * Holds the application's single Druid connection pool.
 *
 * <p>The pool is configured exactly once, when this class is loaded, from the
 * {@code jdbcConfig.properties} file in the process working directory. If the file is
 * missing or unreadable the JVM is terminated with a non-zero exit status.
 *
 * <p>Because all configuration happens in the static initializer,
 * {@link #getDataSource()} only hands out the already configured instance and may be
 * called from any number of threads.
 */
public class JDBCUtils {

	/** Configuration file name, resolved against the working directory. */
	private static final String CONFIG_FILE = "jdbcConfig.properties";

	/** The one shared pool; never reconfigured after class initialization. */
	private static final DruidDataSource dataSource = new DruidDataSource();

	static {
		Properties properties = new Properties();
		InputStream is = null;
		try {
			is = new FileInputStream(CONFIG_FILE);
			properties.load(is);
		} catch (FileNotFoundException e) {
			System.out.println("配置文件不存在");
			// A missing configuration is a failure, so report a non-zero status.
			System.exit(1);
		} catch (IOException e) {
			System.out.println("配置文件有误");
			System.exit(1);
		} finally {
			// Release the file handle; a failure while closing cannot affect the
			// properties that were already loaded, so it is deliberately ignored.
			if (is != null) {
				try {
					is.close();
				} catch (IOException ignored) {
					// nothing useful can be done here
				}
			}
		}
		dataSource.setDriverClassName(properties.getProperty("jdbc.driver"));
		dataSource.setUrl(properties.getProperty("jdbc.url"));
		dataSource.setUsername(properties.getProperty("jdbc.username"));
		dataSource.setPassword(properties.getProperty("jdbc.password"));
	}

	/**
	 * Returns the shared, already configured connection pool.
	 *
	 * @return the application-wide {@link DataSource}; the same instance on every call
	 */
	public static DataSource getDataSource() {
		return dataSource;
	}

}

3.3    Phone.java
 

package cn.swun.domain;

/**
 * One product scraped from a search result page.
 *
 * <p>The id and price setters accept the raw text taken from the page and convert it to
 * a number, so they throw {@link NumberFormatException} when that text is not numeric.
 */
public class Phone {

	private Long id;
	private String name;
	private Double price;
	private String shop;

	/** @return the numeric product id, or {@code null} if it has not been set */
	public Long getId() {
		return this.id;
	}

	/**
	 * Stores the product id.
	 *
	 * @param id decimal text of the id
	 * @throws NumberFormatException if {@code id} is not a valid {@code long}
	 */
	public void setId(String id) {
		this.id = Long.valueOf(id);
	}

	/** @return the product title, or {@code null} if it has not been set */
	public String getName() {
		return this.name;
	}

	/** @param name the product title */
	public void setName(String name) {
		this.name = name;
	}

	/** @return the price, or {@code null} if it has not been set */
	public Double getPrice() {
		return this.price;
	}

	/**
	 * Stores the price.
	 *
	 * @param price decimal text of the price
	 * @throws NumberFormatException if {@code price} is not a valid {@code double}
	 * @throws NullPointerException if {@code price} is {@code null}
	 */
	public void setPrice(String price) {
		this.price = Double.valueOf(price);
	}

	/** @return the shop name, or {@code null} if it has not been set */
	public String getShop() {
		return this.shop;
	}

	/** @param shop the shop name */
	public void setShop(String shop) {
		this.shop = shop;
	}

	/** Renders all four fields in the form {@code Phone{id=..., name='...', price=..., shop='...'}}. */
	@Override
	public String toString() {
		StringBuilder text = new StringBuilder("Phone{");
		text.append("id=").append(id);
		text.append(", name='").append(name).append('\'');
		text.append(", price=").append(price);
		text.append(", shop='").append(shop).append('\'');
		text.append('}');
		return text.toString();
	}
}

4.京东爬虫核心类JdSplider.java

package cn.swun.splider;


import cn.swun.domain.Phone;
import cn.swun.utils.JDBCUtils;
import org.apache.commons.dbutils.QueryRunner;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.io.InterruptedIOException;
import java.sql.SQLException;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

/**
 * Crawls JD search result pages and stores every product found in the {@code phone} table.
 *
 * <p>The work is split into three stages connected by two bounded blocking queues:
 * <ol>
 *   <li>the calling thread downloads each page and queues its product elements;</li>
 *   <li>parser workers turn each element into a {@link Phone};</li>
 *   <li>writer workers insert each {@link Phone} into the database.</li>
 * </ol>
 * Producers use the blocking {@code put}, so a full queue slows the producer down instead
 * of dropping items. {@link #start()} returns only after both worker stages have drained
 * their queues and their threads have terminated.
 */
public class JdSplider {

	/** Number of threads that parse product elements. */
	private static final int PARSER_THREADS = 10;
	/** Number of threads that write products to the database. */
	private static final int WRITER_THREADS = 10;
	/** Last result page to crawl (inclusive). */
	private static final int LAST_PAGE = 100;
	/** How long a worker waits on an empty queue before re-checking for completion. */
	private static final long POLL_TIMEOUT_MS = 200L;

	private static final String INSERT_SQL = "insert into phone(id,name,price,shop) values(?,?,?,?)";

	/**
	 * One HTTP client shared by all requests so that connections are pooled and reused.
	 * It intentionally lives as long as the process; every response obtained from it must
	 * be closed to hand its connection back to the pool.
	 */
	private static final CloseableHttpClient HTTP_CLIENT = HttpClients.createDefault();

	/** Product elements waiting to be parsed. */
	private final BlockingQueue<Element> queueLi = new ArrayBlockingQueue<Element>(100);
	/** Parsed products waiting to be written to the database. */
	private final BlockingQueue<Phone> queuePhone = new ArrayBlockingQueue<Phone>(100);

	/** Set once no further elements will be added to {@link #queueLi}. */
	private volatile boolean crawlFinished;
	/** Set once no further products will be added to {@link #queuePhone}. */
	private volatile boolean parseFinished;

	/** First result page of the search. */
	private final String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&wq=%E6%89%8B%E6%9C%BA&pvid=5b2751339d874f89b1a53a0b7eb6a55c";

	/**
	 * Runs the complete crawl: starts the workers, downloads all pages, then waits until
	 * every queued item has been parsed and written.
	 *
	 * <p>If the calling thread is interrupted, the workers are cancelled and the thread's
	 * interrupt status is left set.
	 *
	 * @throws IOException if a page cannot be downloaded or read; workers are still
	 *         allowed to finish what was already queued before the exception propagates
	 */
	public void start() throws IOException {
		crawlFinished = false;
		parseFinished = false;

		// Obtain the data source once, on this thread, and share a single QueryRunner
		// between all writers instead of having every writer fetch its own.
		final QueryRunner queryRunner = new QueryRunner(JDBCUtils.getDataSource());

		ExecutorService writers = Executors.newFixedThreadPool(WRITER_THREADS);
		ExecutorService parsers = Executors.newFixedThreadPool(PARSER_THREADS);
		try {
			for (int i = 0; i < WRITER_THREADS; i++) {
				writers.execute(new Runnable() {
					public void run() {
						saveQueuedPhones(queryRunner);
					}
				});
			}
			for (int i = 0; i < PARSER_THREADS; i++) {
				parsers.execute(new Runnable() {
					public void run() {
						parseQueuedItems();
					}
				});
			}
			CloseableHttpResponse indexRes = sendGet(url);
			parseIndex(indexRes, 1);
		} finally {
			// Stages are closed in pipeline order so that nothing queued is lost:
			// parsers may only stop after the crawl, writers only after the parsers.
			crawlFinished = true;
			awaitCompletion(parsers);
			parseFinished = true;
			awaitCompletion(writers);
		}
	}

	/**
	 * Sends a GET request with a browser-like User-Agent.
	 *
	 * @param url address to request
	 * @return the open response; the caller must close it so that its pooled connection
	 *         is released ({@link #parseIndex} does this for responses passed to it)
	 * @throws IOException if the request fails
	 */
	public CloseableHttpResponse sendGet(String url) throws IOException {
		HttpGet httpGet = new HttpGet(url);
		httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3514.0 Safari/537.36");
		return HTTP_CLIENT.execute(httpGet);
	}

	/**
	 * Queues the products of the given page, then downloads and queues every following
	 * page up to page {@value #LAST_PAGE}.
	 *
	 * <p>Queueing blocks while the element queue is full, so parser workers must be
	 * running (as they are when this is called from {@link #start()}).
	 *
	 * @param indexRes response holding the HTML of page {@code page}; closed by this method
	 * @param page 1-based number of the page contained in {@code indexRes}
	 * @throws IOException if a page cannot be downloaded or read, or (as
	 *         {@link InterruptedIOException}) if the thread is interrupted while queueing
	 */
	public void parseIndex(CloseableHttpResponse indexRes, int page) throws IOException {
		CloseableHttpResponse response = indexRes;
		int currentPage = page;
		// Iterative on purpose: one stack frame and one open response at a time.
		while (true) {
			enqueueItems(response);
			System.out.println("---第" + currentPage + "页抓取完毕---");
			currentPage++;
			if (currentPage > LAST_PAGE) {
				return;
			}
			// The site numbers its result pages 1, 3, 5, ... for logical pages 1, 2, 3, ...
			int index = 2 * currentPage - 1;
			String nextUrl = "https://search.jd.com/Search?keyword=手机&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=手机&cid2=653&cid3=655&page=" + index + "&click=0";
			response = sendGet(nextUrl);
		}
	}

	/**
	 * Extracts the fields of one product element.
	 *
	 * @param li a {@code li.gl-item} element from a result page
	 * @return the populated product, or {@code null} when the element is {@code null},
	 *         lacks an expected child, or carries a non-numeric id or price
	 */
	public Phone parseLi(Element li) {
		try {
			Phone phone = new Phone();
			phone.setId(li.attr("data-sku"));
			phone.setName(li.select("div.p-name em").get(0).text());
			phone.setPrice(li.select("div.p-price i").get(0).text());
			phone.setShop(li.select("div.p-shop a").attr("title"));
			return phone;
		} catch (RuntimeException ignored) {
			// Best effort: malformed or incomplete entries are skipped rather than
			// aborting the whole crawl.
			return null;
		}
	}

	/** Reads one page, closes its response and queues every product element it contains. */
	private void enqueueItems(CloseableHttpResponse response) throws IOException {
		String html;
		try {
			html = EntityUtils.toString(response.getEntity(), "UTF-8");
		} finally {
			response.close();
		}
		Document document = Jsoup.parse(html);
		Elements lis = document.select("li[class=gl-item]");
		for (Element li : lis) {
			try {
				// put() waits for free space; offer() would silently drop the element.
				queueLi.put(li);
			} catch (InterruptedException e) {
				Thread.currentThread().interrupt();
				InterruptedIOException failure = new InterruptedIOException("interrupted while queueing product elements");
				failure.initCause(e);
				throw failure;
			}
		}
	}

	/** Parser worker loop: parses queued elements until the crawl is finished and the queue is empty. */
	private void parseQueuedItems() {
		try {
			while (true) {
				// Read the flag BEFORE polling: if it was already set, an empty poll
				// proves that no further element can arrive.
				boolean noMoreInput = crawlFinished;
				Element li = queueLi.poll(POLL_TIMEOUT_MS, TimeUnit.MILLISECONDS);
				if (li != null) {
					Phone phone = parseLi(li);
					if (phone != null) {
						queuePhone.put(phone);
					}
				} else if (noMoreInput) {
					return;
				}
			}
		} catch (InterruptedException e) {
			// Cancellation request: restore the flag and let the worker end.
			Thread.currentThread().interrupt();
		}
	}

	/** Writer worker loop: inserts queued products until parsing is finished and the queue is empty. */
	private void saveQueuedPhones(QueryRunner queryRunner) {
		try {
			while (true) {
				boolean noMoreInput = parseFinished;
				Phone phone = queuePhone.poll(POLL_TIMEOUT_MS, TimeUnit.MILLISECONDS);
				if (phone != null) {
					try {
						queryRunner.update(INSERT_SQL, phone.getId(), phone.getName(), phone.getPrice(), phone.getShop());
					} catch (SQLException e) {
						// One failed row must not stop the writer; report it and go on.
						System.err.println("Failed to save " + phone);
						e.printStackTrace();
					}
				} else if (noMoreInput) {
					return;
				}
			}
		} catch (InterruptedException e) {
			Thread.currentThread().interrupt();
		}
	}

	/**
	 * Stops the pool from accepting tasks and waits until its workers have ended. If the
	 * waiting thread is interrupted, the workers are cancelled and the interrupt status
	 * is restored.
	 */
	private static void awaitCompletion(ExecutorService pool) {
		pool.shutdown();
		try {
			while (!pool.awaitTermination(1, TimeUnit.SECONDS)) {
				// workers are still draining their queue; keep waiting
			}
		} catch (InterruptedException e) {
			pool.shutdownNow();
			Thread.currentThread().interrupt();
		}
	}

}

5.程序入口类JdSpliderApp.java

package cn.swun.app;

import cn.swun.splider.JdSplider;

import java.io.IOException;

public class JdSpliderApp {

	public static void main(String[] args) throws IOException, ClassNotFoundException {
		long start = System.currentTimeMillis();
		JdSplider jdSplider = new JdSplider();
		jdSplider.start();
		long end = System.currentTimeMillis();
		System.out.println("100页抓取完毕并保存至数据库用时:" + ((double)(end-start))/1000.00 + "s");
	}

}

项目目录结构如下
 

导出jar包后,打开cmd窗口执行 java -jar jd_splider.jar,程序运行结束后数据即已保存到数据库
 

扫描二维码关注公众号,回复: 3252559 查看本文章

猜你喜欢

转载自blog.csdn.net/qq_38634814/article/details/82696279