Getting started with a Maven + Druid + MySQL crawler (4)

Main

package com.crawl;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class Main {
    public static void main(String[] args) {

        try {
            List<String> searchs = new ArrayList<String>();
            searches.add("Yuan Zun, Tian Silkworm Potato");
            searches.add("The Great Ruler, Silkworm Potato");
            searches.add("Shengxu, Chendong");
            searches.add("I am the supreme, the wind dominates the world");

            for (String search: searchs) {
                String bookname=search.split(",")[0];
//                String auther=search.split(",")[1];
                StoreMysql.operateMysql("https://www.ddbiquge.com/s.php?q="+bookname,bookname);
            }

        } catch (IOException e) {
            e.printStackTrace ();
        }

    }
}


StoreMysql

package com.crawl;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.crawl.dao.HttpClientUtil;
import static com.crawl.dao.dao.*;
/**
 * Crawls the search results for one book and stores the book and its chapters
 * in MySQL through the static helpers imported from {@code com.crawl.dao.dao}.
 */
public class StoreMysql {

    /** Site root prepended to the relative links found in the crawled pages. */
    private static final String SITE_ROOT = "https://www.ddbiquge.com";

    /** Captures the numeric book id from a book index URL; compiled once instead of per book. */
    private static final Pattern BOOK_ID_PATTERN = Pattern.compile("/book/([\\d]+).html");

    /** Captures the numeric chapter id (the part after the underscore) from a chapter URL. */
    private static final Pattern CHAPTER_ID_PATTERN = Pattern.compile("/chapter/[\\d]+_([\\d]+).html");

    /**
     * Fetches the search page at {@code url}, follows every result link, and for each
     * result whose title equals {@code bookname} inserts the book row followed by one
     * row per chapter.
     *
     * @param url      search page URL to fetch
     * @param bookname exact title a result must have to be stored; other results are skipped
     * @throws IOException declared for the page fetches performed via {@code HttpClientUtil}
     */
    public static void operateMysql(String url, String bookname) throws IOException {
        String content = HttpClientUtil.getContent(url);
        Document doc = Jsoup.parse(content);

        // Creates the book/chapter tables when they do not exist yet.
        dbTablesInit();

        Elements hrefElements = doc.getElementsByClass("bookname");
        for (Element e : hrefElements) {

            String urlIndex = SITE_ROOT + e.getElementsByTag("a").attr("href");
            System.out.println(urlIndex);
            Document docIndex = Jsoup.parse(HttpClientUtil.getContent(urlIndex));

            // The first <h2> of the index page is taken as the book title.
            String h2 = docIndex.getElementsByTag("h2").get(0).text();
            System.out.println("题目:" + h2);
            if (!h2.equals(bookname)) {
                continue;
            }

            // The ".small span" elements are read in order: author, type, serial status,
            // word count. substring(3) drops the first three characters of each text
            // (presumably a field label - verify against the page markup).
            Elements summarizeElements = docIndex.select(".small span");
            String author = summarizeElements.get(0).text().substring(3);
            System.out.println(author);

            String type = summarizeElements.get(1).text().substring(3);
            System.out.println(type);

            String serialstatus = summarizeElements.get(2).text().substring(3);
            System.out.println(serialstatus);

            String wordNumber = summarizeElements.get(3).text().substring(3);
            System.out.println(wordNumber);

            // Synopsis, again with the first three characters dropped.
            String intro = docIndex.getElementsByClass("intro").text().substring(3);
            System.out.println(intro);

            Elements hrefIndexElements = docIndex.select(".listmain dl dd a");

            // Previously written as "Matches m = r.matches(urlIndex)", which does not
            // compile; the id is now extracted the same way as the chapter id below.
            String bookId = extractId(BOOK_ID_PATTERN, urlIndex, "nobookid");

            String updateTime = docIndex.getElementsByClass("last").get(0).text().substring(5);
            System.out.println(updateTime);

            DateFormat df = new SimpleDateFormat("yyyy-MM-dd");
            String book_createdate = df.format(new Date());

            insertBook(bookId, h2, type, serialstatus, author, wordNumber, intro, urlIndex, updateTime, book_createdate);

            // A TreeSet removes duplicate chapter links and visits them in
            // lexicographic URL order.
            Set<String> chapter_urls = new TreeSet<>();
            for (Element el : hrefIndexElements) {
                chapter_urls.add(SITE_ROOT + el.attr("href"));
            }

            for (String chapter_url : chapter_urls) {
                Document docIndex1 = Jsoup.parse(HttpClientUtil.getContent(chapter_url));

                // The first <h1> of the chapter page is the chapter title.
                String chapter = docIndex1.getElementsByTag("h1").get(0).text();
                System.out.println(chapter);

                // Strip the page's own URL and the site banner from the body text, then
                // turn every whitespace run into a line break.
                String ct = docIndex1.select("#content").text();
                ct = ct.replace(chapter_url, "");
                ct = ct.replace("Genius for a second to remember the address of this site: www.ddbiquge.com. Vertex Biquge mobile version reading website: m.ddbiquge.com", "");
                ct = ct.replaceAll("\\s+", "\r\n");

                String chapter_id = extractId(CHAPTER_ID_PATTERN, chapter_url, "nochapterid");

                // "HH" is the 24-hour clock. The former "hh" pattern is 12-hour without
                // an AM/PM marker, so afternoon timestamps were indistinguishable from
                // morning ones.
                DateFormat dfc = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                String chapter_createdate = dfc.format(new Date());
                insertChapter(chapter_id, bookId, chapter, ct, chapter_url, chapter_createdate);
            }
        }
    }

    /**
     * Returns capture group 1 of the first match of {@code pattern} in {@code url}.
     * When nothing matches, prints {@code missingMessage} and returns {@code null}.
     */
    private static String extractId(Pattern pattern, String url, String missingMessage) {
        Matcher matcher = pattern.matcher(url);
        if (matcher.find()) {
            return matcher.group(1);
        }
        System.out.println(missingMessage);
        return null;
    }
}



dao

package com.crawl.dao;


import java.sql.*;

import static com.crawl.dao.DruidJdbc.getConnection;


//import static com.crawl.dbcp.DBCPTest.getConnection;

/**
 * JDBC helpers for the crawler: creates the {@code book} and {@code chapter}
 * tables on demand and inserts rows into them.
 *
 * <p>Every method obtains its own connection from {@code getConnection()} and
 * releases it with try-with-resources, so the connection is closed even when a
 * statement fails. Previously the close calls were skipped whenever an
 * {@link SQLException} was thrown.
 */
public class dao {

    /** DDL for the {@code book} table. */
    private static final String CREATE_BOOK_TABLE = "CREATE TABLE `book` (" +
            "`book_id` int NOT NULL ," +
            "`book_name` varchar(255) NOT NULL ," +
            "`type` varchar(255) NOT NULL ," +
            "`serialstatus` varchar(255) NOT NULL ," +
            "`author` varchar(255) NOT NULL," +
            "`word_number` int NOT NULL," +
            "`desc` text," +
            "`book_url`varchar(255) NOT NULL," +
            "`updateTime` varchar(255) NOT NULL," +
            "`book_createdate` varchar(255) NOT NULL," +
            "KEY `index1_book_id` (`book_id`)," +
            "PRIMARY KEY (book_id,book_name,author))ENGINE=InnoDB DEFAULT CHARSET=utf8";

    /** DDL for the {@code chapter} table. */
    private static final String CREATE_CHAPTER_TABLE = "CREATE TABLE `chapter` (" +
            "`chapter_id` int NOT NULL," +
            "`book_id` int NOT NULL," +
            "`name` varchar(255) NOT NULL," +
            "`content` text," +
            "`chapter_url`varchar(255) NOT NULL," +
            "`chapter_createdate` varchar(255) NOT NULL," +
            "KEY `index1_chapter_id` (`chapter_id`)," +
            "PRIMARY KEY (chapter_id))ENGINE=MyISAM AUTO_INCREMENT=1 DEFAULT CHARSET=utf8";

    /** SQL used by {@link #insertBook}. */
    private static final String INSERT_BOOK =
            "  INSERT INTO `book` (`book_id`,`book_name`, `type`,`serialstatus`,`author`,`word_number`,`desc`,`book_url`,`updateTime`,`book_createdate`) VALUES (?,?,?,?,?,?,?,?,?,?)";

    /** SQL used by {@link #insertChapter}. */
    private static final String INSERT_CHAPTER =
            "  INSERT INTO `chapter` (`chapter_id`,`book_id`,`name`,`content`,`chapter_url`,`chapter_createdate`) VALUES (?,?,?,?,?,?)";

    /**
     * Creates the {@code book} and {@code chapter} tables if the database metadata
     * does not list them, printing for each table whether it was created or already
     * existed. An {@link SQLException} is printed and not rethrown.
     */
    public static void dbTablesInit() {
        try (Connection cn = getConnection();
             Statement st = cn.createStatement()) {
            createTableIfMissing(cn, st, "book", CREATE_BOOK_TABLE);
            createTableIfMissing(cn, st, "chapter", CREATE_CHAPTER_TABLE);
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Runs {@code ddl} when the metadata lookup for {@code table} returns no row.
     * The lookup result set is closed before returning; previously the first result
     * set was overwritten without being closed.
     */
    private static void createTableIfMissing(Connection cn, Statement st, String table, String ddl)
            throws SQLException {
        try (ResultSet rs = cn.getMetaData().getTables(null, null, table, null)) {
            if (!rs.next()) {
                st.executeUpdate(ddl);
                System.out.println(table + " table created successfully");
            } else {
                System.out.println(table + " table already exists");
            }
        }
    }

    /**
     * Inserts one row into {@code book}. All values are bound as strings in column
     * order. An {@link SQLException} is printed and not rethrown.
     *
     * @param id              value for {@code book_id}
     * @param book_name       value for {@code book_name}
     * @param type            value for {@code type}
     * @param serialstatus    value for {@code serialstatus}
     * @param author          value for {@code author}
     * @param wordnumber      value for {@code word_number}
     * @param desc            value for {@code desc}
     * @param book_url        value for {@code book_url}
     * @param updateTime      value for {@code updateTime}
     * @param book_createdate value for {@code book_createdate}
     */
    public static void insertBook(String id, String book_name, String type, String serialstatus, String author,String wordnumber, String desc, String book_url, String updateTime,String book_createdate) {
        try (Connection cn = getConnection();
             PreparedStatement preparedStatement = cn.prepareStatement(INSERT_BOOK)) {

            // NOTE(review): book_id and word_number are int columns but are bound with
            // setString, relying on the driver/server to convert - confirm the scraped
            // values are purely numeric.
            preparedStatement.setString(1, id);
            preparedStatement.setString(2, book_name);
            preparedStatement.setString(3, type);
            preparedStatement.setString(4, serialstatus);
            preparedStatement.setString(5, author);
            preparedStatement.setString(6, wordnumber);
            preparedStatement.setString(7, desc);
            preparedStatement.setString(8, book_url);
            preparedStatement.setString(9, updateTime);
            preparedStatement.setString(10, book_createdate);
            preparedStatement.executeUpdate();

        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Inserts one row into {@code chapter}. All values are bound as strings in column
     * order. An {@link SQLException} is printed and not rethrown.
     *
     * @param chapter_id         value for {@code chapter_id}
     * @param bookId             value for {@code book_id}
     * @param name               value for {@code name}
     * @param content            value for {@code content}
     * @param chapter_url        value for {@code chapter_url}
     * @param chapter_createdate value for {@code chapter_createdate}
     */
    public static void insertChapter(String chapter_id,String bookId, String name, String content,String chapter_url,String chapter_createdate) {
        try (Connection cn = getConnection();
             PreparedStatement preparedStatement = cn.prepareStatement(INSERT_CHAPTER)) {

            preparedStatement.setString(1, chapter_id);
            preparedStatement.setString(2, bookId);
            preparedStatement.setString(3, name);
            preparedStatement.setString(4, content);
            preparedStatement.setString(5, chapter_url);
            preparedStatement.setString(6, chapter_createdate);
            preparedStatement.executeUpdate();

        } catch (SQLException e) {
            e.printStackTrace();
        }
    }


}


Problem: Can't find a suitable website to crawl

Solution: keep trying different sites; there will always be one that works.


The biggest remaining problem is the code structure (how the classes are layered).

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=324839573&siteId=291194637