Main
package com.crawl; import java.io.IOException; import java.util.ArrayList; import java.util.List; public class Main { public static void main(String[] args) { try { List<String> searchs = new ArrayList<String>(); searches.add("Yuan Zun, Tian Silkworm Potato"); searches.add("The Great Ruler, Silkworm Potato"); searches.add("Shengxu, Chendong"); searches.add("I am the supreme, the wind dominates the world"); for (String search: searchs) { String bookname=search.split(",")[0]; // String auther=search.split(",")[1]; StoreMysql.operateMysql("https://www.ddbiquge.com/s.php?q="+bookname,bookname); } } catch (IOException e) { e.printStackTrace (); } } }
StoreMysql
package com.crawl; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.IOException; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.Date; import java.util.Set; import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.crawl.dao.HttpClientUtil; import static com.crawl.dao.dao.*; public class StoreMysql { public static void operateMysql(String url,String bookname) throws IOException { String content = HttpClientUtil.getContent(url); // try // { // Thread.currentThread().sleep(3000);//毫秒 // } // catch(Exception ex){} Document doc = Jsoup.parse(content); // Parse the web page to get the document object dbTablesInit(); // Elements hrefElements = doc.getElementsByClass("result-item-title result-game-item-title"); Elements hrefElements = doc.getElementsByClass("bookname"); for (Element e : hrefElements) { String urlIndex = "https://www.ddbiquge.com"+e.getElementsByTag("a").attr("href"); System.out.println(urlIndex); String contentIndex = HttpClientUtil.getContent(urlIndex); Document docIndex = Jsoup.parse(contentIndex); // Parse the web page to get the document object Elements h2Elements = docIndex.getElementsByTag("h2"); // Query DOM by tag name Element h2Element = h2Elements.get(0); String h2 = h2Element.text(); System.out.println("题目:" + h2); if(!h2.equals(bookname))continue; Elements summarizeElements = docIndex.select(".small span");//作者 Element authorElement = summarizeElements.get(0); String author = authorElement.text().substring(3); System.out.println(author); Element typeElement = summarizeElements.get(1); String type = typeElement.text().substring(3); System.out.println(type); Element serialstatusElement = summarizeElements.get(2); String serialstatus = serialstatusElement.text().substring(3); System.out.println(serialstatus); Element wordNumberElement = summarizeElements.get(3); String wordNumber = 
wordNumberElement.text().substring(3); System.out.println(wordNumber); Elements introElement = docIndex.getElementsByClass("intro");// 简介 String intro = introElement.text().substring(3); System.out.println(intro); Elements hrefIndexElements = docIndex.select(".listmain dl dd a"); // String bookId = urlIndex.substring(30, urlIndex.length() - 5); String bookId = null; String pattern = "/book/([\\d]+).html"; Pattern r = Pattern.compile(pattern); Matches m = r.matches (urlIndex); if(m.find()) bookId = m.group(1); else System.out.println("nobookid"); Element updateTimeElement = docIndex.getElementsByClass("last").get(0); String updateTime = updateTimeElement.text().substring(5); System.out.println(updateTime); Date date = new Date(); DateFormat df = new SimpleDateFormat("yyyy-MM-dd"); String book_createdate=df.format(date); insertBook(bookId, h2, type, serialstatus, author, wordNumber, intro, urlIndex, updateTime, book_createdate); // INSERT INTO `book` (`book_id`,`book_name`, `type`,`serialstatus`,`author`,`word_number`,`desc`,`book_url`,`book_createdate`) Set<String> chapter_urls = new TreeSet<>(); for (Element el : hrefIndexElements) {// Get the chapter name String urlSub = el.attr("href"); String chapter_url = "https://www.ddbiquge.com" + urlSub; chapter_urls.add(chapter_url); } for (String chapter_url:chapter_urls) { String contentIndex1 = HttpClientUtil.getContent(chapter_url); Document docIndex1 = Jsoup.parse(contentIndex1); // Parse the web page to get the document object Elements chapterElements = docIndex1.getElementsByTag("h1"); // Query DOM by tag name Element chapterElement = chapterElements.get(0); String chapter = chapterElement.text(); System.out.println(chapter); String ct = docIndex1.select("#content").text(); ct=ct.replace(chapter_url,""); ct=ct.replace("Genius for a second to remember the address of this site: www.ddbiquge.com. 
Vertex Biquge mobile version reading website: m.ddbiquge.com",""); ct=ct.replaceAll("\\s+", "\r\n"); // int index = textsIndex.indexOf(" "); // while(index>-1){ // String line = textsIndex.substring(0,index); // textsIndex = textsIndex.substring(index+1); // System.out.println(line); // } String chapter_id =null; String patternc = "/chapter/[\\d]+_([\\d]+).html"; Pattern pa = Pattern.compile(patternc); Matcher ma = pa.matcher (chapter_url); if(ma.find()) chapter_id = ma.group(1); else System.out.println("nochapterid"); Date datec = new Date(); DateFormat dfc = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss"); String chapter_createdate=dfc.format(datec); insertChapter( chapter_id, bookId, chapter, ct, chapter_url, chapter_createdate); } } } }
dao
package com.crawl.dao; import java.sql.*; import static com.crawl.dao.DruidJdbc.getConnection; //import static com.crawl.dbcp.DBCPTest.getConnection; public class dao { public static void dbTablesInit() { ResultSet rs = null; // Properties p = new Properties(); Connection cn = getConnection(); try { rs = cn.getMetaData().getTables(null, null, "book", null); // p.load(dao.class.getResourceAsStream("/dbconfig.properties")); Statement st = cn.createStatement(); // There is no url table if (!rs.next()) { //create book table // st.execute(p.getProperty("createZhouLinTable")); // book bookid bookname type serialstatus author desc url createdate(yyyy-MM-dd) String sql = "CREATE TABLE `book` (" + "`book_id` int NOT NULL ," + "`book_name` varchar(255) NOT NULL ," + "`type` varchar(255) NOT NULL ," + "`serialstatus` varchar(255) NOT NULL ," + "`author` varchar(255) NOT NULL," + "`word_number` int NOT NULL," + "`desc` text," + "`book_url`varchar(255) NOT NULL," + "`updateTime` varchar(255) NOT NULL," + "`book_createdate` varchar(255) NOT NULL," + "KEY `index1_book_id` (`book_id`)," + "PRIMARY KEY (book_id,book_name,author))ENGINE=InnoDB DEFAULT CHARSET=utf8"; st.executeUpdate(sql); System.out.println("book table created successfully"); } else { System.out.println("book table already exists"); } rs = cn.getMetaData().getTables(null, null, "chapter", null); if (!rs.next()) { //create chapter table // chapter id chapterid name content url createdate(yyyy-mm-dd hh:mm:ss ) String sql = "CREATE TABLE `chapter` (" + "`chapter_id` int NOT NULL," + "`book_id` int NOT NULL," + "`name` varchar(255) NOT NULL," + "`content` text," + "`chapter_url`varchar(255) NOT NULL," + "`chapter_createdate` varchar(255) NOT NULL," + "KEY `index1_chapter_id` (`chapter_id`)," + "PRIMARY KEY (chapter_id))ENGINE=MyISAM AUTO_INCREMENT=1 DEFAULT CHARSET=utf8"; st.executeUpdate(sql); System.out.println("chapter table created successfully"); // st.executeUpdate("CREATE INDEX index_book_id ON chapter 
(book_id)"); // System.out.println("chapter table index created successfully"); } else { System.out.println("chapter table already exists"); } rs.close(); st.close(); cn.close(); } catch (SQLException e) { e.printStackTrace (); } // catch (IOException e) { // e.printStackTrace (); // } } // String sql = "CREATE TABLE `book` (" + // "`book_id` int NOT NULL ," + // "`book_name` varchar(255) NOT NULL ," + // "`type` varchar(255) NOT NULL ," + // "`serialstatus` varchar(255) NOT NULL ," + // "`author` varchar(255) NOT NULL," + // "`word_number` int NOT NULL," + // "`desc` text," + // "`book_url`varchar(255) NOT NULL," + // "`updateTime` varchar(255) NOT NULL," + // "`book_createdate` varchar(255) NOT NULL," + //// "`INDEX`[indexName] (book_id(length)) " + // "PRIMARY KEY (book_id,book_name,author))ENGINE=InnoDB DEFAULT CHARSET=utf8"; public static void insertBook(String id, String book_name, String type, String serialstatus, String author,String wordnumber, String desc, String book_url, String updateTime,String book_createdate) { Connection cn = getConnection(); String sql = " INSERT INTO `book` (`book_id`,`book_name`, `type`,`serialstatus`,`author`,`word_number`,`desc`,`book_url`,`updateTime`,`book_createdate`) VALUES (?,?,?,?,?,?,?,?,?,?)"; try { //represents a precompiled sql object PreparedStatement preparedStatement = cn.prepareStatement(sql); preparedStatement.setString(1, id); preparedStatement.setString(2, book_name); preparedStatement.setString(3, type); preparedStatement.setString(4, serialstatus); preparedStatement.setString(5, author); preparedStatement.setString(6, wordnumber); preparedStatement.setString(7, desc); preparedStatement.setString(8, book_url); preparedStatement.setString(9, updateTime); preparedStatement.setString(10, book_createdate); preparedStatement.executeUpdate(); preparedStatement.close(); cn.close(); } catch (SQLException e) { e.printStackTrace (); } } // String sql = "CREATE TABLE `chapter` (" + // "`chapter_id` int NOT NULL," + // 
"`book_id` int NOT NULL," + // "`name` varchar(255) NOT NULL," + // "`content` text," + // "`chapter_url`varchar(255) NOT NULL," + // "`chapter_createdate` varchar(255) NOT NULL," + public static void insertChapter(String chapter_id,String bookId, String name, String content,String chapter_url,String chapter_createdate) { Connection cn = getConnection(); String sql = " INSERT INTO `chapter` (`chapter_id`,`book_id`,`name`,`content`,`chapter_url`,`chapter_createdate`) VALUES (?,?,?,?,?,?)"; try { //represents a precompiled sql object PreparedStatement preparedStatement = cn.prepareStatement(sql); preparedStatement.setString(1, chapter_id); preparedStatement.setString(2, bookId); preparedStatement.setString(3, name); preparedStatement.setString(4, content); preparedStatement.setString(5, chapter_url); preparedStatement.setString(6, chapter_createdate); preparedStatement.executeUpdate(); preparedStatement.close(); cn.close(); } catch (SQLException e) { e.printStackTrace (); } } }
Problem: It was hard to find a suitable website to crawl.
Solution: Keep trying more sites; there will always be one that works.
The biggest problem is the code hierarchy, i.e. how the code is structured.