Getting started with Maven + Druid + MySQL crawler (3)

pom.xml

<project
        xmlns="http://maven.apache.org/POM/4.0.0"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">

  <parent>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-parent</artifactId>
    <version>1.7.6</version>
  </parent>

  <modelVersion>4.0.0</modelVersion>

  <groupId>org.slf4j</groupId>
  <artifactId>slf4j-log4j12</artifactId>
  <packaging>jar</packaging>
  <name>SLF4J LOG4J-12 Binding</name>
  <description>SLF4J LOG4J-12 Binding</description>
  <url>http://www.slf4j.org</url>


  <dependencies>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
    </dependency>

    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-jar-plugin</artifactId>
        <configuration>
          <archive>
            <manifestEntries>
              <Bundle-Version>${parsedVersion.osgiVersion}</Bundle-Version>
              <Bundle-Description>${project.description}</Bundle-Description>
              <Implementation-Version>${project.version}</Implementation-Version>
            </manifestEntries>
            <manifestFile>${project.build.outputDirectory}/META-INF/MANIFEST.MF</manifestFile>
          </archive>
        </configuration>
      </plugin>
    </plugins>
  </build>

</project>
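
Note: the POM above only covers the slf4j-log4j12 logging binding. Judging from the imports in the classes below, the project also needs jsoup, Apache HttpClient, Druid, Spring, and the MySQL JDBC driver on the classpath; those Maven dependencies are not reproduced in this article.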


spring.xml

<bean id="dataSource" class="com.alibaba.druid.pool.DruidDataSource" init-method="init" destroy-method="close">
    <!-- Basic attributes url, user, password -->
    <property name="url" value="${jdbc.url}" />
    <property name="username" value="${jdbc.user}" />
    <property name="password" value="${jdbc.password}" />

    <!-- Configure the initial, minimum, and maximum pool sizes -->
    <property name="initialSize" value="1" />
    <property name="minIdle" value="1" />
    <property name="maxActive" value="20" />

    <!-- Configure the maximum wait time, in milliseconds, when getting a connection -->
    <property name="maxWait" value="60000" />

    <!-- Configure how often, in milliseconds, the eviction thread runs to detect idle connections that should be closed -->
    <property name="timeBetweenEvictionRunsMillis" value="60000" />

    <!-- Configure the minimum time, in milliseconds, a connection stays idle in the pool before it becomes eligible for eviction -->
    <property name="minEvictableIdleTimeMillis" value="300000" />

    <property name="validationQuery" value="SELECT 'x'" />
    <property name="testWhileIdle" value="true" />
    <property name="testOnBorrow" value="false" />
    <property name="testOnReturn" value="false" />

    <!-- Enable PSCache and specify the PSCache size on each connection -->
    <property name="poolPreparedStatements" value="true" />
    <property name="maxPoolPreparedStatementPerConnectionSize" value="20" />

    <!-- Configure the filters for monitoring statistics; if this is removed, SQL cannot be tracked in the monitoring console -->
    <property name="filters" value="stat" />
</bean>
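
For completeness, here is a minimal sketch (not part of the original code, class name made up for illustration) showing how the dataSource bean above could be obtained from Java. It assumes the file is on the classpath as spring.xml and that the ${jdbc.*} placeholders are resolved by a property placeholder configured elsewhere.

import org.springframework.context.support.ClassPathXmlApplicationContext;

import javax.sql.DataSource;
import java.sql.Connection;
import java.sql.SQLException;

public class SpringDruidSmokeTest {
    public static void main(String[] args) throws SQLException {
        // Load spring.xml and look up the Druid pool declared above
        ClassPathXmlApplicationContext ctx = new ClassPathXmlApplicationContext("spring.xml");
        DataSource ds = (DataSource) ctx.getBean("dataSource");

        // Borrow a connection and return it to the pool
        Connection cn = ds.getConnection();
        System.out.println("Borrowed connection: " + cn);
        cn.close();

        // Closing the context triggers the destroy-method and shuts the pool down
        ctx.close();
    }
}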

StoreMysql

package com.crawl;

import com.crawl.util.HttpClientUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;

import static com.crawl.dao.Dao.dbTablesInit;
import static com.crawl.dao.Dao.insert;


public class StoreMysql {

    public static void operateMysql(String url) throws IOException {
        String content = HttpClientUtil.getContent(url);
        Document doc = Jsoup.parse(content); // Parse the web page to get the document object
        dbTablesInit();
        // Each chapter link sits under "#list dl dd a" on the catalog page
        Elements hrefElements = doc.select("#list dl dd a");
        for (Element e : hrefElements) {
            String urlIndex = "http://www.biquge.com.tw" + e.attr("href");
            String contentIndex = HttpClientUtil.getContent(urlIndex);
            Document docIndex = Jsoup.parse(contentIndex); // Parse the web page to get the document object
            // Get the chapter name
            Elements chapterElements = docIndex.getElementsByTag("h1"); // Query DOM by tag name
            Element chapterElement = chapterElements.get(0);
            String chapter = chapterElement.text();
            // The chapter body lives in #content; collapse whitespace runs into line breaks
            String ct = docIndex.select("#content").text().replaceAll("\\s+", "\r\n");
            insert(chapter,ct);

        }

    }
}
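
The article does not show an entry point; a hypothetical launcher (name made up for illustration) that crawls the catalog URL appearing commented out in HttpClientUtil below could look like this:

package com.crawl;

import java.io.IOException;

public class CrawlMain {
    public static void main(String[] args) throws IOException {
        // Catalog page of the novel; every chapter linked under "#list dl dd a"
        // is fetched and inserted into the zhoulin table
        StoreMysql.operateMysql("http://www.biquge.com.tw/17_17380/");
    }
}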

HttpClientUtil

package com.crawl.util;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

public class HttpClientUtil {

    private static CloseableHttpClient httpClient;

    public static String getContent(String url) {
        String content = "";
        CloseableHttpResponse response = null;
        try {
            // The url to be accessed
//        String url = "http://www.biquge.com.tw/17_17380/";
            // Use the default configured HttpClient
            httpClient = HttpClients.createDefault();

            // Execute the request
            response = getResponse(url);

            // Read the response entity as a GBK-encoded string
            HttpEntity entity = response.getEntity();
            content = EntityUtils.toString(entity, "GBK");
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the response and the client so connections are not leaked
            try {
                if (response != null) {
                    response.close();
                }
                if (httpClient != null) {
                    httpClient.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return content;
    }

    public static CloseableHttpResponse getResponse(String url) throws IOException {
        HttpGet httpGet = new HttpGet(url);
        CloseableHttpResponse response = httpClient.execute(httpGet);
        return response;
    }
}

DruidTest

package com.crawl.druid;

import com.alibaba.druid.pool.DruidDataSource;
import java.io.IOException;
import java.sql.*;
import java.util.Properties;

public class DruidTest {
    private static DruidDataSource druidDataSource;

    /**
     * Read the configuration file and initialize the connection pool
     */
    private static void init() {
        Properties prop = new Properties();
        try {
            prop.load(DruidTest.class.getResourceAsStream("/druid.properties"));
            // Declare the DruidDataSource and configure it from druid.properties
            druidDataSource = new DruidDataSource();
            druidDataSource.setDriverClassName(prop.getProperty("jdbc.driverClassName"));
            druidDataSource.setUrl(prop.getProperty("jdbc.url"));
            druidDataSource.setUsername(prop.getProperty("jdbc.user"));
            druidDataSource.setPassword(prop.getProperty("jdbc.password"));

        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Get Connection
     *
     * @return Connection
     */
    public synchronized static Connection getConnection() {
        if (druidDataSource == null) {
            init();
        }
        try {
            return druidDataSource.getConnection();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return null;
    }

}
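
For reference, init() expects a druid.properties file at the root of the classpath defining the four keys it reads: jdbc.driverClassName (for example com.mysql.jdbc.Driver for MySQL Connector/J 5.x), jdbc.url, jdbc.user, and jdbc.password. The actual values are not shown in this article.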


Dao

package com.crawl.dao;


import com.crawl.druid.DruidTest;

import java.io.IOException;
import java.sql.*;
import java.util.Properties;

import static com.crawl.druid.DruidTest.getConnection;
//import static com.crawl.dbcp.DBCPTest.getConnection;

public class Dao {
    public static void dbTablesInit() {
        ResultSet rs = null;
//        Properties p = new Properties();

        Connection cn = getConnection();
        try {
            rs = cn.getMetaData().getTables(null, null, "zhoulin", null);
//            p.load(Dao.class.getResourceAsStream("/dbconfig.properties"));
            Statement st = cn.createStatement();
            // The zhoulin table does not exist yet
            if (!rs.next()) {
                // Create the zhoulin table
//                st.execute(p.getProperty("createZhouLinTable"));
                String sql = "CREATE TABLE `zhoulin` (`chapter` varchar(255) NOT NULL, `content` text) ENGINE=MyISAM DEFAULT CHARSET=utf8";
                st.executeUpdate(sql);

            } else {
                System.out.println("zhoulin table already exists");
            }

            rs.close();
            st.close();
            cn.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
//        catch (IOException e) {
//            e.printStackTrace();
//        }

    }

//    public static void main(String[] args) {
//        try {
//            Connection cnection =  getConnection();
//            if (cnection != null) {
//                String sql = "select * from t_user2";
//                PreparedStatement preparedStatement = cnection.prepareStatement(sql);
//                ResultSet resultSet = preparedStatement.executeQuery();
//                while (resultSet.next()) {
//                    System.out.printf("%s %s\n", resultSet.getString("user_name"), resultSet.getString("password"));
//                }
//            }else{
// System.out.println("Failed to get Connection");
//            }
//        } catch (SQLException e) {
//            e.printStackTrace();
//        }
//    }

    public static void insert(String chapter, String content) {

        Connection cn = getConnection();

        String sql = "  INSERT INTO zhoulin (chapter, content) VALUES (?,?)";

        try {

            // A PreparedStatement represents a precompiled SQL statement
            PreparedStatement preparedStatement = cn.prepareStatement(sql);

            preparedStatement.setString(1, chapter);
            preparedStatement.setString(2, content);
            preparedStatement.executeUpdate();
            preparedStatement.close();
            cn.close();


        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}
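
As a side note, insert above only closes the statement and connection on the success path. A sketch of a more defensive variant using try-with-resources (not from the original article, hypothetical method name) would return the connection to the Druid pool even when executeUpdate fails:

    public static void insertSafely(String chapter, String content) {
        String sql = "INSERT INTO zhoulin (chapter, content) VALUES (?, ?)";
        // try-with-resources closes the PreparedStatement and the Connection
        // even if an exception is thrown, so the pooled connection is not leaked
        // (assumes getConnection() did not return null)
        try (Connection cn = getConnection();
             PreparedStatement ps = cn.prepareStatement(sql)) {
            ps.setString(1, chapter);
            ps.setString(2, content);
            ps.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }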


Problem: the user value in the configuration file had an extra space.

Problem: Druid was hard to get working right away, so I started by learning DBCP first.


So far this week, we have basically achieved crawling the novel content into MySQL.

