http://webmagic.io/docs/zh/posts/ch4-basic-page-processor/selectable.html
webmagic官网文档解释
所需jar包:
以及webmagic(负责爬虫的jar包)
一般的jar包都在此网站找到 https://jar-download.com/
文件目录:
对于复杂的爬虫采取如上简单的分包。
c3p0-config.xml:
<?xml version="1.0" encoding="UTF-8"?>
<c3p0-config>
    <!-- Default pool used when no named config is requested. -->
    <default-config>
        <property name="driverClass">com.mysql.jdbc.Driver</property>
        <!-- <property name="jdbcUrl">jdbc:mysql:///web15</property> -->
        <property name="jdbcUrl">jdbc:mysql://localhost:3306/pachong?serverTimezone=UTC</property>
        <!-- &useSSL=false -->
        <property name="user">root</property>
        <property name="password">123456</property>
        <property name="initialPoolSize">5</property>
        <property name="maxPoolSize">20</property>
    </default-config>
    <!-- Alternative named configurations; select via new ComboPooledDataSource("name"). -->
    <named-config name="itheima">
        <property name="driverClass">com.mysql.jdbc.Driver</property>
        <property name="jdbcUrl">jdbc:mysql:///web15</property>
        <property name="user">root</property>
        <property name="password">123456</property>
    </named-config>
    <named-config name="experiment">
        <property name="driverClass">com.mysql.jdbc.Driver</property>
        <property name="jdbcUrl">jdbc:mysql:///experiment</property>
        <property name="user">root</property>
        <property name="password">123456</property>
    </named-config>
</c3p0-config>
JdongMain.java:
package jingdong.main; import java.sql.Connection; import java.sql.SQLException; import java.util.List; import javax.sql.DataSource; import org.apache.commons.dbutils.QueryRunner; import org.apache.commons.dbutils.handlers.BeanListHandler; import org.apache.http.impl.client.DefaultHttpClient; import org.junit.Test; import com.mchange.v2.c3p0.ComboPooledDataSource; import jingdong.model.JdModel; import jingdong.util.URLFerter; public class JdongMain { public static void main(String[] args) { Connection conn = null; System.setProperty("com.mchange.v2.c3p0.cfg.xml",new JdongMain().getClass().getClassLoader().getResource("").getPath() + "c3p0-config.xml"); //src下的c3p0路径 DataSource dataSource = new ComboPooledDataSource(); try { conn = dataSource.getConnection(); } catch (SQLException e) { // TODO Auto-generated catch block e.printStackTrace(); } //String url="http://search.jd.com/Search?keyword=Python&enc=utf-8&book=y&wq=Python&pvid=33xo9lni.p4a1qb"; //抓取的数据 String url = "https://www.xhsd.com/activity/channel-renwensheke"; List<JdModel> bookdatas = URLFerter.URLParser(url); /*for(JdModel jd:bookdatas) { System.out.println(jd.getBookName()); System.out.println(jd.getBookPrice()); }*/ //System.out.println(bookdatas); System.out.println("111111"); QueryRunner runner = new QueryRunner(); String sql = "insert into bookinfo(bookName,bookPrice) value(?,?)"; for(JdModel jd:bookdatas) { //System.out.println(jd); try { runner.update(conn, sql,jd.getBookName(),jd.getBookPrice()); } catch (SQLException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } }
JdModel.java:
package jingdong.model;

/**
 * Value object holding one book record produced by the crawler:
 * an (unused for now) id, the book title and its price.
 */
public class JdModel {

    private String bookID;   // not populated by the current crawl flow
    private String bookName;
    private Double bookPrice;

    public String getBookID() {
        return bookID;
    }

    public void setBookID(String bookID) {
        this.bookID = bookID;
    }

    public String getBookName() {
        return bookName;
    }

    public void setBookName(String bookName) {
        this.bookName = bookName;
    }

    public Double getBookPrice() {
        return bookPrice;
    }

    public void setBookPrice(Double bookPrice) {
        this.bookPrice = bookPrice;
    }
}
URLFerter.java:
package jingdong.util; import java.util.ArrayList; import java.util.List; import jingdong.model.JdModel; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; public class URLFerter implements PageProcessor{ private static int count = 1; // private static String url; private static List<JdModel> jd = new ArrayList<JdModel>(); @Override public Site getSite() { // TODO Auto-generated method stub Site site = Site.me(); site.setTimeOut(3000); site.setRetrySleepTime(3); site.setSleepTime(1000); return site; } @Override public void process(Page page) { // TODO Auto-generated method stub if (count==1) { List<String> all = page.getHtml().regex("item-link.{1,50}target").all(); //List<String> all = page.getHtml().regex("\\d+").all(); //System.out.println(page.getHtml()); //System.out.println(all); for(String jd:all) { String substring = jd.substring(18, jd.length()-8); String url1 = "https:/"+substring; //System.out.println(url1); page.addTargetRequest(url1); } count++; } if (count==2) { List<String> bookName = page.getHtml().regex("js-item-name.{1,50}</span>").all(); List<String> bookPrice = page.getHtml().regex("js-item-price.{1,50}</span>").all(); List<String> rBookName=new ArrayList<String>(); List<Double> rBookPrice =new ArrayList<Double>(); //System.out.println("111"+bookName); //System.out.println("222"+bookPrice); for(String string:bookName) { int indexOf = string.indexOf("<span>"); String sub = string.substring(indexOf+6,string.length()-8); //System.out.println(sub); rBookName.add(sub); } for(String string:bookPrice) { int indexOf = string.indexOf("sale-price"); int indexOf2 = string.indexOf("</span>"); String substring = string.substring(indexOf+14,indexOf2); //System.out.println(substring); double parseDouble = Double.parseDouble(substring); //System.out.println("hhaha"+parseDouble); rBookPrice.add(parseDouble); } JdModel jdModel = null; for(int 
i=0;i<=bookName.size()-1;i++) { jdModel = new JdModel(); jdModel.setBookName(rBookName.get(i)); //System.out.println(jdModel.getBookName()); jdModel.setBookPrice(rBookPrice.get(i)); //System.out.println(jdModel.getBookPrice()); jd.add(jdModel); } } /*for(JdModel one:jd) { System.out.println(one.getBookName()); System.out.println(one.getBookPrice()); }*/ } public static List<JdModel> URLParser(String URL){ // url = URL; Spider create = Spider.create(new URLFerter()); create.addUrl(URL); create.thread(1); create.run(); //System.out.println(jd.size()); /*for(int i=0;i<=jd.size()-1;i++) { System.out.println(jd.get(i).getBookName()); System.out.println(jd.get(i).getBookPrice()); }*/ return jd; } }
数据库配置如下:
自此,爬取书店书籍信息成功。
Tip:1、文件名称如JdongMain.java、JdModel.java虽然带有“京东”含义,但在实践JAVA爬虫过程中发现京东设置了反爬机制,导致爬取实属不易,故退而求其次,改为爬取新华书店的书籍信息。
2、URLFerter为爬取核心代码,其中的字符筛选可以使用三种方式 1)regex方法 (推荐) 2)xpath方法 (推荐) 3)Jsoup方法
3、爬虫代码的执行实际上是多次执行process方法,每请求一个页面就会执行一次,所以使用静态变量count可以有效地划分执行逻辑,减少不必要的执行。
4、如上为很简单类型的爬虫,网站并没有反爬机制。一般通过简单的数据对比分析即可从这类网站爬取到自己想要的信息;若想爬取更复杂的网站,则需要学习更深入的内容。
5、反爬机制:
- robots.txt(爬虫协议)
- User-Agent https://blog.csdn.net/Lujuntong/article/details/81952519
- ip限制-可以代理ip
- 验证码
- ajax动态加载页面
- cookie限制