Java爬虫更新mysql数据库(简单事例)

http://webmagic.io/docs/zh/posts/ch4-basic-page-processor/selectable.html

webmagic官网文档解释

所需jar包:

以及webmagic(负责爬虫的jar包)

一般的jar包都在此网站找到       https://jar-download.com/  

文件目录:

      对于复杂的爬虫采取如上简单的分包。

 c3p0-config.xml:

<?xml version="1.0" encoding="UTF-8"?>
<!-- c3p0 connection-pool configuration. JdongMain points c3p0 at this file
     via the com.mchange.v2.c3p0.cfg.xml system property. -->
<c3p0-config>

	<!-- Default pool used by the crawler: MySQL database "pachong".
	     serverTimezone=UTC avoids the MySQL Connector/J timezone error. -->
	<default-config>
		<property name="driverClass">com.mysql.jdbc.Driver</property>
		<!-- <property name="jdbcUrl">jdbc:mysql:///web15</property> -->
		<property name="jdbcUrl">jdbc:mysql://localhost:3306/pachong?serverTimezone=UTC</property>
		<!-- &amp;useSSL=false -->
		<property name="user">root</property>
		<property name="password">123456</property>
		<property name="initialPoolSize">5</property>
		<property name="maxPoolSize">20</property>
	</default-config>

	<!-- Named pool "itheima": not referenced by the code shown here. -->
	<named-config name="itheima">
		<property name="driverClass">com.mysql.jdbc.Driver</property>
		<property name="jdbcUrl">jdbc:mysql:///web15</property>
		<property name="user">root</property>
		<property name="password">123456</property>
	</named-config>

	<!-- Named pool "experiment": not referenced by the code shown here. -->
	<named-config name="experiment">
		<property name="driverClass">com.mysql.jdbc.Driver</property>
		<property name="jdbcUrl">jdbc:mysql:///experiment</property>
		<property name="user">root</property>
		<property name="password">123456</property>
	</named-config>
</c3p0-config>

JdongMain.java:

package jingdong.main;

import java.sql.Connection;
import java.sql.SQLException;
import java.util.List;

import javax.sql.DataSource;

import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.BeanListHandler;
import org.apache.http.impl.client.DefaultHttpClient;
import org.junit.Test;

import com.mchange.v2.c3p0.ComboPooledDataSource;

import jingdong.model.JdModel;
import jingdong.util.URLFerter;


public class JdongMain {
	
	/**
	 * Entry point: crawls the bookstore listing page via {@link URLFerter}
	 * and inserts each (bookName, bookPrice) pair into the bookinfo table.
	 */
	public static void main(String[] args) {
		// Point c3p0 at c3p0-config.xml on the classpath root (src/ at build time).
		System.setProperty("com.mchange.v2.c3p0.cfg.xml",
				JdongMain.class.getClassLoader().getResource("").getPath() + "c3p0-config.xml");
		DataSource dataSource = new ComboPooledDataSource();
		
		// Crawl the book listing; URLParser runs the spider synchronously and
		// returns the collected records.
		String url = "https://www.xhsd.com/activity/channel-renwensheke";
		List<JdModel> bookdatas = URLFerter.URLParser(url);
		
		QueryRunner runner = new QueryRunner();
		// Standard SQL keyword is VALUES ("value" is a MySQL-only synonym).
		String sql = "insert into bookinfo(bookName,bookPrice) values(?,?)";
		// try-with-resources returns the pooled connection when done. Previously
		// the connection was never closed, and a failed getConnection() was
		// swallowed, leaving conn == null and causing an NPE at the first update.
		try (Connection conn = dataSource.getConnection()) {
			for (JdModel jd : bookdatas) {
				try {
					runner.update(conn, sql, jd.getBookName(), jd.getBookPrice());
				} catch (SQLException e) {
					// Log and continue with the remaining rows rather than
					// aborting the whole batch on one bad record.
					e.printStackTrace();
				}
			}
		} catch (SQLException e) {
			// Could not obtain (or close) a connection; nothing was inserted.
			e.printStackTrace();
		}
	}
}

JdModel.java:

package jingdong.model;

/**
 * Plain data holder for one crawled book record: an optional id plus the
 * name and price scraped from the bookstore page.
 */
public class JdModel {

	private String bookID;    // never populated by the crawler shown here
	private String bookName;  // text of the item-name span
	private Double bookPrice; // parsed sale price

	/** @return the book identifier, or null if unset */
	public String getBookID() {
		return bookID;
	}

	/** @return the book title */
	public String getBookName() {
		return bookName;
	}

	/** @return the book price */
	public Double getBookPrice() {
		return bookPrice;
	}

	public void setBookID(String bookID) {
		this.bookID = bookID;
	}

	public void setBookName(String bookName) {
		this.bookName = bookName;
	}

	public void setBookPrice(Double bookPrice) {
		this.bookPrice = bookPrice;
	}
}

URLFerter.java:

package jingdong.util;

import java.util.ArrayList;
import java.util.List;

import jingdong.model.JdModel;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * WebMagic PageProcessor that crawls the bookstore channel page, follows each
 * item link found on it, and collects book name/price pairs. Results are
 * accumulated in a static list and returned by {@link #URLParser(String)}.
 *
 * NOTE(review): all state is static, so this class is effectively single-use
 * and not thread-safe — a second URLParser() call would append to the same
 * list and, since count stays >= 2, would skip the list-page phase.
 */
public class URLFerter implements PageProcessor{
	// Crawl phase: 1 = listing page (collect detail links), 2 = detail pages.
	private static int count = 1;
//	private static String url;
	// Accumulates one JdModel per book parsed from the detail pages.
	private static List<JdModel> jd = new ArrayList<JdModel>();
	
	@Override
	public Site getSite() {
		// Per-site crawl settings (timeouts/delays in milliseconds).
		Site site = Site.me();
		site.setTimeOut(3000);
		// NOTE(review): 3 ms retry sleep looks unintentionally short — 3000 was
		// probably intended; confirm against webmagic's Site API before changing.
		site.setRetrySleepTime(3);
		site.setSleepTime(1000);
		return site;
	}

	@Override
	public void process(Page page) {
		// Called once per fetched page. The static count splits the logic into
		// "listing page" (first call) and "detail page" (subsequent calls).
		if (count==1) {
			// Grab anchor fragments like: item-link ... href="//..." ... target
			List<String> all = page.getHtml().regex("item-link.{1,50}target").all();
		//List<String> all = page.getHtml().regex("\\d+").all();
		//System.out.println(page.getHtml());
		//System.out.println(all);
			for(String jd:all) {
				// Magic offsets strip the surrounding markup, leaving the href
				// path (assumed to start with "/"). NOTE(review): brittle —
				// breaks if the site's HTML attribute order/length changes.
				String substring = jd.substring(18, jd.length()-8);
				String url1 = "https:/"+substring;
				//System.out.println(url1);
				// Queue the detail page for crawling.
				page.addTargetRequest(url1);
			}
			count++;
		}
		
		if (count==2) {
			// Detail page: pull the name span and price span fragments.
			List<String> bookName = page.getHtml().regex("js-item-name.{1,50}</span>").all();
			List<String> bookPrice = page.getHtml().regex("js-item-price.{1,50}</span>").all();
			List<String> rBookName=new ArrayList<String>();
			List<Double> rBookPrice =new ArrayList<Double>();
			//System.out.println("111"+bookName);
			//System.out.println("222"+bookPrice);
			
			for(String string:bookName) {
				// Text between "<span>" and the trailing "</span>" (8 chars,
				// plus one — offsets tuned to the live HTML).
				int indexOf = string.indexOf("<span>");
				String sub = string.substring(indexOf+6,string.length()-8);
				//System.out.println(sub);
				rBookName.add(sub);
			}
			for(String string:bookPrice) {
				// Numeric text between the sale-price marker and "</span>".
				// NOTE(review): parseDouble throws NumberFormatException if the
				// markup shifts; Double is also a poor type for money.
				int indexOf = string.indexOf("sale-price");
				int indexOf2 = string.indexOf("</span>");
				String substring = string.substring(indexOf+14,indexOf2);
				//System.out.println(substring);
				double parseDouble = Double.parseDouble(substring);
				//System.out.println("hhaha"+parseDouble);
				rBookPrice.add(parseDouble);
				
			}
			// Pair names with prices by index; assumes both lists line up.
			JdModel jdModel = null;
			for(int i=0;i<=bookName.size()-1;i++) {
				jdModel = new JdModel();
				jdModel.setBookName(rBookName.get(i));
				//System.out.println(jdModel.getBookName());
				jdModel.setBookPrice(rBookPrice.get(i));
				//System.out.println(jdModel.getBookPrice());
				jd.add(jdModel);
				
			}
		}
		/*for(JdModel one:jd) {
			System.out.println(one.getBookName());
			System.out.println(one.getBookPrice());
		}*/
	}
	
	/**
	 * Runs a single-threaded, blocking crawl starting from the given URL and
	 * returns the accumulated book records.
	 */
	public static List<JdModel> URLParser(String URL){
//		url = URL;
		Spider create = Spider.create(new URLFerter());
		create.addUrl(URL);
		create.thread(1);
		// run() blocks until the crawl finishes (unlike start()).
		create.run();
		//System.out.println(jd.size());
		/*for(int i=0;i<=jd.size()-1;i++) {
			System.out.println(jd.get(i).getBookName());
			System.out.println(jd.get(i).getBookPrice());
		}*/
		return jd;
		
	}
}

数据库配置如下:

  自此,爬取书店书籍信息成功。

Tip:1、文件名称如 JdModel.java 虽然表示京东(JD)含义,但是实践 JAVA 爬虫过程中发现京东设置了反爬机制,导致爬取实属不易,故退而求其次,改为爬取新华书店的书籍信息。

2、URLFerter为爬取核心代码,其中的字符筛选可以使用三种方式  1)regex方法 (推荐)   2)xpath方法 (推荐)   3)Jsoup方法

3、爬虫代码的执行实际上是多次执行 process 方法,每 request 一个页面就会执行一次,所以使用静态变量 count 可以有效地划分执行逻辑,减少不必要的执行。

4、如上为很简单类型的爬虫,目标网站并没有反爬机制。对一般网站,通过简单的数据对比分析就可以爬取到自己想要的信息;想要爬取更复杂的网站,则需要学习更深入的内容。

5、反爬机制:

发布了40 篇原创文章 · 获赞 24 · 访问量 1万+

猜你喜欢

转载自blog.csdn.net/weixin_41466575/article/details/98535290