Java爬虫之科目一题目

用到的工具:

1、FastJson(JSON与Java对象进行转换)

2、Lombok(简化JavaBean的编写)

3、线程池(提高爬虫爬取效率,多线程执行任务)

4、批量提交执行SQL(减少与数据库的连接,可以批量插入数据)

5、Jsoup(一个基于Java开发的爬虫库)

建议使用Maven进行开发,以下是我用到的Pom文件:

        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.47</version>
        </dependency>
        <!-- jsoup: Java library used by the crawler to fetch pages -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>
        <!-- fastjson: JSON <-> Java object conversion -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.54</version>
        </dependency>
        <!-- Lombok: generated boilerplate for the JavaBeans (declared once; the original listed it twice) -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.4</version>
        </dependency>

实体类对象VO:QuestionVO

package drive;

import com.alibaba.fastjson.annotation.JSONField;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

import java.io.Serializable;

/**
 * Value object for one question record parsed from the crawled JSON.
 * <p>
 * Each {@code @JSONField} names the JSON key that populates the annotated field.
 * The Lombok annotations supply the getters/setters/{@code toString} and the
 * no-argument and all-argument constructors.
 *
 * @author 小书包
 * @date 2018/12/31 11:48
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class QuestionVO implements Serializable {

	/**
	 * Sample record, listed by Java field name (text values translated from Chinese):
	 * id : 2
	 * question : If a motor vehicle driver's illegal driving causes a major traffic accident that constitutes a crime, which liability is pursued according to law?
	 * answerA : criminal liability
	 * answerB : civil liability
	 * answerC : economic liability
	 * answerD : direct liability
	 * correctAnswer : 1
	 * imageUrl :
	 * bestAnswer : Road Traffic Safety Law, Article 101: whoever violates road traffic safety laws and regulations and causes a major traffic accident that constitutes a crime shall be held criminally liable, and the traffic administration department of the public security organ shall revoke the motor vehicle driving licence.
	 * bestAnswerId : 2600002
	 * type : 2
	 * sinaImg :
	 */
	// Question id.
	@JSONField(name = "id")
	private int id;
	// Question text.
	@JSONField(name = "question")
	private String question;
	// Answer options A-D; the JSON keys are the single letters "a".."d".
	@JSONField(name = "a")
	private String answerA;
	@JSONField(name = "b")
	private String answerB;
	@JSONField(name = "c")
	private String answerC;
	@JSONField(name = "d")
	private String answerD;
	/**
	 * Correct answer, encoded as digits.
	 * Single choice: 1 = A; 2 = B; 3 = C; 4 = D.
	 * Multiple choice: 12 = AB; 13 = AC; 14 = AD; 23 = BC; 24 = BD; 34 = CD; 123 = ABC; 124 = ABD; 234 = BCD; 1234 = ABCD.
	 * True/false: 1 = true; 2 = false.
	 */
	@JSONField(name = "correctAnswer")
	private int correctAnswer;
	// Original image URL.
	@JSONField(name = "imageurl")
	private String imageUrl;
	// Explanation of the answer.
	@JSONField(name = "bestanswer")
	private String bestAnswer;
	// Identifier of the explanation; kept as a string.
	@JSONField(name = "bestanswerid")
	private String bestAnswerId;
	/**
	 * Question type.
	 * 1. true/false
	 * 2. single choice
	 * 3. multiple choice
	 */
	// Note: the JSON key is capitalised ("Type"), unlike the other keys.
	@JSONField(name = "Type")
	private int type;
	// If empty, the imageUrl address is used; if not empty, the image address is "http://ww"+(random.nextInt(5))+".sinaimg.cn/mw600/"+sinaImg;
	@JSONField(name = "sinaimg")
	private String sinaImg;

}
@Data注解:提供了setter、getter、toString等方法的编写
@AllArgsConstructor:具有所有参数构造方法
@NoArgsConstructor:无参构造方法
@JSONField:json对象中对应的名称

实体类对象DO:QuestionDO

package drive;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

import java.sql.Timestamp;
import java.util.Random;

/**
 * Persistence object for one question row, holding exactly the columns that are
 * written to the database plus creation/update timestamps.
 *
 * @author 小书包
 * @date 2018/12/31 13:40
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class QuestionDO {

	private String question;
	private String answerA;
	private String answerB;
	private String answerC;
	private String answerD;
	private int correctAnswer;
	private String imageUrl;
	private String bestAnswer;
	private int type;
	private Timestamp createTime;
	private Timestamp updateTime;

	/**
	 * Converts a parsed {@link QuestionVO} into its persistence form.
	 * <p>
	 * The image URL is derived from {@code sinaImg} when one is present and falls back
	 * to the VO's own {@code imageUrl} otherwise. Both timestamps are set to the same
	 * "now" instant.
	 *
	 * @param questionVO the parsed question; must not be {@code null}
	 */
	public QuestionDO(QuestionVO questionVO) {
		// Fields are assigned directly instead of through the generated setters so
		// that no overridable method is invoked from the constructor.
		this.question = questionVO.getQuestion();
		this.answerA = questionVO.getAnswerA();
		this.answerB = questionVO.getAnswerB();
		this.answerC = questionVO.getAnswerC();
		this.answerD = questionVO.getAnswerD();
		this.correctAnswer = questionVO.getCorrectAnswer();
		this.imageUrl = resolveImageUrl(questionVO);
		this.bestAnswer = questionVO.getBestAnswer();
		this.type = questionVO.getType();
		// One clock reading so createTime and updateTime are guaranteed equal;
		// two Timestamp objects because Timestamp is mutable.
		long now = System.currentTimeMillis();
		this.createTime = new Timestamp(now);
		this.updateTime = new Timestamp(now);
	}

	/**
	 * Picks the image URL to store for a question.
	 *
	 * @param questionVO the parsed question
	 * @return a sinaimg.cn URL on a randomly chosen host (ww1..ww4) when {@code sinaImg}
	 *         is non-empty, otherwise the VO's {@code imageUrl} unchanged
	 */
	private static String resolveImageUrl(QuestionVO questionVO) {
		String sinaImg = questionVO.getSinaImg();
		// A missing (null) sinaImg must be treated like an empty one; the previous
		// check only recognised "" and produced a URL ending in "null" otherwise.
		if (sinaImg == null || sinaImg.isEmpty()) {
			return questionVO.getImageUrl();
		}
		return "http://ww" + (new Random().nextInt(4) + 1) + ".sinaimg.cn/mw600/" + sinaImg;
	}
}

里面有一个构造函数,用于实现VO到DO的转换

爬虫类对象

package drive;

import com.alibaba.fastjson.JSON;
import lombok.Cleanup;
import lombok.NoArgsConstructor;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import sql.JDBCUtils;

import java.io.*;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Random;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Crawler for the question bank: fetches every question index concurrently, dumps
 * the raw responses to a text file, parses a text file of JSON lines into
 * {@link QuestionVO} objects, converts them to {@link QuestionDO} and batch-inserts
 * them into the database.
 * <p>
 * An instance doubles as the unit of work: {@link #run()} fetches exactly one
 * question and counts down the latch it was given.
 *
 * @author 小书包
 * @date 2018/12/31 12:11
 */
public class ParseUrl implements Runnable {

	/** Total number of question indices to request. */
	private final static int QUESTION_NUMBER = 13696;
	/** File the raw crawl output is written to. */
	private static final String CRAWL_OUTPUT_FILE = "driver.json";
	/**
	 * File the JSON lines are parsed from.
	 * NOTE(review): this differs from {@link #CRAWL_OUTPUT_FILE}; presumably a manually
	 * prepared copy of the crawl output is expected here - confirm.
	 */
	private static final String PARSE_INPUT_FILE = "driver-copy.json";

	/** Next question index; shared by all tasks and incremented atomically. */
	private static final AtomicInteger id = new AtomicInteger(0);
	/** Parsed questions, pre-sized to avoid repeated growth. */
	private static final ArrayList<QuestionVO> arrayList = new ArrayList<>(QUESTION_NUMBER);
	/** Objects to be written to the database. */
	private static final ArrayList<QuestionDO> questionDOArrayList = new ArrayList<>(QUESTION_NUMBER);
	/**
	 * Raw response text, one response per line. StringBuilder is not thread-safe, so
	 * every access is guarded by synchronizing on the builder itself.
	 */
	private static final StringBuilder builder = new StringBuilder();

	/** Counted down once per finished task so the driver knows when the crawl is done. */
	private final CountDownLatch countDownLatch;

	/** Creates the driver instance with a latch sized for the whole crawl. */
	public ParseUrl() {
		this(new CountDownLatch(QUESTION_NUMBER));
	}

	/**
	 * Creates a single fetch task that reports completion on the given latch.
	 *
	 * @param countDownLatch latch shared with the driver instance
	 */
	private ParseUrl(CountDownLatch countDownLatch) {
		this.countDownLatch = countDownLatch;
	}

	/**
	 * Fetches one question (after a random delay of under 500 ms) and appends the
	 * response body text to the shared buffer. Failures are reported and otherwise
	 * ignored; the latch is counted down in every case.
	 */
	@Override
	public void run() {
		try {
			TimeUnit.MILLISECONDS.sleep(new Random().nextInt(500));
			String url = "http://mnks.jxedt.com/get_question?index=" + id.incrementAndGet();
			System.out.println("当前访问的地址为:" + url);
			String json = Jsoup.connect(url)
					.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36")
					.timeout(3000)
					.get()
					.body()
					.text();
			// Append the line atomically so concurrent tasks cannot interleave or
			// corrupt the shared buffer.
			synchronized (builder) {
				builder.append(json).append("\r\n");
			}
		} catch (InterruptedException e) {
			// Preserve the interrupt so the pool can wind this worker down.
			Thread.currentThread().interrupt();
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			countDownLatch.countDown();
		}
	}

	public static void main(String[] args) throws SQLException, ClassNotFoundException, IOException {
		ParseUrl parseUrl = new ParseUrl();
		parseUrl.runThreadPool();
		parseUrl.writeJsonToFile();
		parseUrl.jsonTransFormToObject();
		parseUrl.objectTransFormToObject();
		parseUrl.insertToDB();
	}

	/**
	 * Runs the crawl: submits one task per question index, waits for all of them to
	 * finish and then shuts the pool down. If the waiting thread is interrupted, the
	 * interrupt flag is restored and outstanding tasks are cancelled.
	 */
	private void runThreadPool() {
		long startTime = System.currentTimeMillis();
		// The pool is created here, once, rather than as an instance field: every
		// task is itself a ParseUrl, so a field initialiser would build one pool per task.
		ThreadPoolExecutor executor = newCrawlerPool();
		try {
			for (int i = 0; i < QUESTION_NUMBER; i++) {
				executor.submit(new ParseUrl(countDownLatch));
			}
			countDownLatch.await();
		} catch (InterruptedException e) {
			Thread.currentThread().interrupt();
			executor.shutdownNow();
		} finally {
			executor.shutdown();
		}
		System.out.println("跑线程的时间为:" + (System.currentTimeMillis() - startTime) + "ms");
	}

	/**
	 * Builds the crawler pool with a core size of cores * 10 and a maximum of
	 * cores * cores, backed by an unbounded queue.
	 * <p>
	 * ThreadPoolExecutor rejects a maximum smaller than the core size with an
	 * IllegalArgumentException, which cores * cores is on machines with fewer than
	 * ten cores; the maximum is therefore never allowed below the core size.
	 *
	 * @return a new, idle thread pool
	 */
	private static ThreadPoolExecutor newCrawlerPool() {
		int processors = Runtime.getRuntime().availableProcessors();
		int corePoolSize = processors * 10;
		int maximumPoolSize = Math.max(corePoolSize, processors * processors);
		return new ThreadPoolExecutor(corePoolSize, maximumPoolSize,
				0L, TimeUnit.MILLISECONDS,
				new LinkedBlockingQueue<Runnable>());
	}

	/**
	 * Writes everything collected from the crawl to {@link #CRAWL_OUTPUT_FILE}.
	 * NOTE(review): FileWriter encodes with the platform default charset - confirm
	 * that this is what the consumer of the file expects.
	 *
	 * @throws IOException if the file cannot be written
	 */
	private void writeJsonToFile() throws IOException {
		String crawled;
		synchronized (builder) {
			crawled = builder.toString();
		}
		try (BufferedWriter writer = new BufferedWriter(new FileWriter(new File(CRAWL_OUTPUT_FILE)))) {
			writer.write(crawled);
		}
	}

	/**
	 * Reads {@link #PARSE_INPUT_FILE} line by line and parses each line into a
	 * {@link QuestionVO}. Blank lines, lines starting with "ERROR" and lines that
	 * cannot be parsed even after repair are skipped.
	 *
	 * @throws IOException if the file cannot be read
	 */
	private void jsonTransFormToObject() throws IOException {
		try (BufferedReader reader = new BufferedReader(new FileReader(new File(PARSE_INPUT_FILE)))) {
			String json;
			int lineNumber = 0;
			while ((json = reader.readLine()) != null) {
				lineNumber++;
				if (json.trim().isEmpty() || json.startsWith("ERROR")) {
					continue;
				}
				QuestionVO question = parseQuestion(json);
				if (question == null) {
					// A null entry would only fail later, inside the VO -> DO conversion.
					System.err.println("Skipping unparseable line " + lineNumber + " of " + PARSE_INPUT_FILE);
					continue;
				}
				arrayList.add(question);
			}
		}
	}

	/**
	 * Parses one JSON line, retrying with progressively more aggressive textual
	 * repairs: first as-is, then with the quote fix-ups applied, finally with
	 * backslashes replaced by forward slashes as well.
	 *
	 * @param json one line of JSON text
	 * @return the parsed question, or {@code null} if every attempt failed or the
	 *         parser produced no object
	 */
	private static QuestionVO parseQuestion(String json) {
		try {
			return JSON.parseObject(json, QuestionVO.class);
		} catch (Exception ignored) {
			// fall through to the repaired variants
		}
		String repaired = json.replaceAll("\"\"", "\"");
		repaired = repaired.replaceAll(" \",", "\"\",");
		repaired = repaired.replaceAll(": \" }", ": \"\" }");
		try {
			return JSON.parseObject(repaired, QuestionVO.class);
		} catch (Exception ignored) {
			// fall through to the last attempt
		}
		try {
			return JSON.parseObject(repaired.replaceAll("\\\\", "/"), QuestionVO.class);
		} catch (Exception e) {
			e.printStackTrace();
			return null;
		}
	}

	/**
	 * Converts every parsed VO into a DO, after printing how many VOs were parsed.
	 */
	private void objectTransFormToObject() {
		System.out.println(arrayList.size());
		for (QuestionVO questionVO : arrayList) {
			questionDOArrayList.add(new QuestionDO(questionVO));
		}
	}

	/**
	 * Inserts all DOs in a single batch inside one transaction. On failure the
	 * transaction is rolled back and the original exception is rethrown; the
	 * statement and connection are closed in every case.
	 *
	 * @throws SQLException           if the batch or the commit fails
	 * @throws ClassNotFoundException if the JDBC driver class cannot be loaded
	 */
	private void insertToDB() throws SQLException, ClassNotFoundException {
		long startTime = System.currentTimeMillis();
		String sql = "insert into question(question,answerA,answerB,answerC,answerD,correctAnswer,imageUrl,bestAnswer,type,createTime,updateTime ) values(?,?,?,?,?,?,?,?,?,?,?)";
		try (Connection connection = JDBCUtils.getConnection()) {
			connection.setAutoCommit(false);
			try (PreparedStatement statement = connection.prepareStatement(sql)) {
				for (QuestionDO questionDO : questionDOArrayList) {
					int index = 1;
					statement.setString(index++, questionDO.getQuestion());
					statement.setString(index++, questionDO.getAnswerA());
					statement.setString(index++, questionDO.getAnswerB());
					statement.setString(index++, questionDO.getAnswerC());
					statement.setString(index++, questionDO.getAnswerD());
					statement.setInt(index++, questionDO.getCorrectAnswer());
					statement.setString(index++, questionDO.getImageUrl());
					statement.setString(index++, questionDO.getBestAnswer());
					statement.setInt(index++, questionDO.getType());
					statement.setTimestamp(index++, questionDO.getCreateTime());
					statement.setTimestamp(index++, questionDO.getUpdateTime());
					statement.addBatch();
				}
				statement.executeBatch();
				connection.commit();
			} catch (SQLException e) {
				// Do not leave a half-applied transaction to the driver's close() behaviour.
				try {
					connection.rollback();
				} catch (SQLException rollbackFailure) {
					e.addSuppressed(rollbackFailure);
				}
				throw e;
			}
		}
		System.out.println("插入数据库的时间为:" + (System.currentTimeMillis() - startTime) + "ms");
	}
}

数据库连接工具类:JDBCUtils

package sql;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;

/**
 * Hands out JDBC connections using the fixed driver class, URL and credentials
 * declared below.
 *
 * @author 小书包
 * @date 2018/12/22 21:43
 */
public class JDBCUtils {

	/** Driver class that is loaded before a connection is requested. */
	private static final String DRIVER_CLASS = "com.mysql.jdbc.Driver";
	/** JDBC URL of the target database. */
	private static final String JDBC_URL = "jdbc:mysql://localhost:3306/driver";
	/** Database account name. */
	private static final String DB_USER = "root";
	/** Database account password. */
	private static final String DB_PASSWORD = "xxx";

	/**
	 * Loads the driver class and opens a new connection.
	 *
	 * @return a newly opened connection; the caller is responsible for closing it
	 * @throws SQLException           if the connection cannot be established
	 * @throws ClassNotFoundException if the driver class is not on the classpath
	 */
	public static Connection getConnection() throws SQLException, ClassNotFoundException {
		Class.forName(DRIVER_CLASS);
		return DriverManager.getConnection(JDBC_URL, DB_USER, DB_PASSWORD);
	}

}

爬取的结果共有12000多条数据

猜你喜欢

转载自blog.csdn.net/qq_32409957/article/details/85479739
今日推荐