pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">

  <!-- Inherit version management and plugin configuration from the SLF4J parent. -->
  <parent>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-parent</artifactId>
    <version>1.7.6</version>
  </parent>

  <modelVersion>4.0.0</modelVersion>

  <groupId>org.slf4j</groupId>
  <artifactId>slf4j-log4j12</artifactId>
  <packaging>jar</packaging>
  <name>SLF4J LOG4J-12 Binding</name>
  <description>SLF4J LOG4J-12 Binding</description>
  <url>http://www.slf4j.org</url>

  <dependencies>
    <!-- Versions are managed by the parent POM's dependencyManagement. -->
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-jar-plugin</artifactId>
        <configuration>
          <archive>
            <!-- Merge OSGi/implementation entries into the pre-generated manifest. -->
            <manifestEntries>
              <Bundle-Version>${parsedVersion.osgiVersion}</Bundle-Version>
              <Bundle-Description>${project.description}</Bundle-Description>
              <Implementation-Version>${project.version}</Implementation-Version>
            </manifestEntries>
            <manifestFile>${project.build.outputDirectory}/META-INF/MANIFEST.MF</manifestFile>
          </archive>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
spring.xml
<bean id="dataSource" class="com.alibaba.druid.pool.DruidDataSource"
      init-method="init" destroy-method="close">

  <!-- Basic attributes: url, user, password (resolved from a properties file). -->
  <property name="url" value="${jdbc.url}" />
  <property name="username" value="${jdbc.user}" />
  <property name="password" value="${jdbc.password}" />

  <!-- Pool sizing: initial size, minimum idle, maximum active connections. -->
  <property name="initialSize" value="1" />
  <property name="minIdle" value="1" />
  <property name="maxActive" value="20" />

  <!-- Maximum time (ms) to wait for a connection before timing out. -->
  <property name="maxWait" value="60000" />

  <!-- Interval (ms) between eviction runs that close idle connections. -->
  <property name="timeBetweenEvictionRunsMillis" value="60000" />

  <!-- Minimum lifetime (ms) of a connection in the pool before eviction. -->
  <property name="minEvictableIdleTimeMillis" value="300000" />

  <!-- Connection validation: cheap query, checked while idle rather than
       on every borrow/return to keep the hot path fast. -->
  <property name="validationQuery" value="SELECT 'x'" />
  <property name="testWhileIdle" value="true" />
  <property name="testOnBorrow" value="false" />
  <property name="testOnReturn" value="false" />

  <!-- Enable the PreparedStatement cache and cap its per-connection size. -->
  <property name="poolPreparedStatements" value="true" />
  <property name="maxPoolPreparedStatementPerConnectionSize" value="20" />

  <!-- Monitoring filters; without "stat" the monitoring UI cannot count SQL. -->
  <property name="filters" value="stat" />
</bean>
StoreMysql
package com.crawl; import com.crawl.util.HttpClientUtil; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.IOException; import static com.crawl.dao.dao.dbTablesInit; import static com.crawl.dao.dao.insert; public class StoreMysql { public static void operateMysql(String url) throws IOException { String content = HttpClientUtil.getContent(url); Document doc = Jsoup.parse(content); // Parse the web page to get the document object dbTablesInit(); Elements hrefElements = doc.select("#list dl dd a"); for (Element e : hrefElements) { String urlIndex = "http://www.biquge.com.tw" + e.attr("href"); String contentIndex = HttpClientUtil.getContent(urlIndex); Document docIndex = Jsoup.parse(contentIndex); // Parse the web page to get the document object // Get the chapter name Elements chapterElements = docIndex.getElementsByTag("h1"); // Query DOM by tag name Element chapterElement = chapterElements.get(0); String chapter = chapterElement.text(); String ct = docIndex.select("#content").text().replaceAll("\\s+","\r\n"); insert(chapter,ct); } } }
HttpClientUtil
package com.crawl.util; import org.apache.http.HttpEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import java.io.IOException; public class HttpClientUtil { private static CloseableHttpClient httpClient; public static String getContent(String url) { String content= ""; CloseableHttpResponse response=null; try { //The url to be accessed // String url = "http://www.biquge.com.tw/17_17380/"; //Use the default configured httpclient httpClient = HttpClients.createDefault(); // execute the request response = getResponse(url); //Print the entity content of the request and return the json format HttpEntity entity = response.getEntity(); content=EntityUtils.toString(entity, "GBK"); response.close(); } catch (IOException e) { e.printStackTrace (); } return content; } public static CloseableHttpResponse getResponse(String url) throws IOException { HttpGet httpGet = new HttpGet(url); CloseableHttpResponse response = httpClient.execute(httpGet); return response; } }
DruidTest
package com.crawl.druid; import com.alibaba.druid.pool.DruidDataSource; import java.io.IOException; import java.sql.*; import java.util.Properties; public class DruidTest { private static DruidDataSource druidDataSource; /** * Read the configuration file and initialize the connection pool */ private static void init() { Properties prop = new Properties(); try { prop.load(DruidTest.class.getResourceAsStream("/druid.properties")); //declare DruidDataSource druidDataSource = new DruidDataSource(); druidDataSource.setDriverClassName(prop.getProperty("jdbc.driverClassName")); druidDataSource.setUrl(prop.getProperty("jdbc.url")); druidDataSource.setUsername(prop.getProperty("jdbc.user")); druidDataSource.setPassword(prop.getProperty("jdbc.password")); } catch (IOException e) { e.printStackTrace (); } } /** * Get Connection * * @return Connection */ public synchronized static Connection getConnection() { if (druidDataSource == null) { init(); } try { return druidDataSource.getConnection(); } catch (SQLException e) { e.printStackTrace (); } return null; } }
Dao
package com.crawl.dao; import com.crawl.druid.DruidTest; import java.io.IOException; import java.sql.*; import java.util.Properties; import static com.crawl.druid.DruidTest.getConnection; //import static com.crawl.dbcp.DBCPTest.getConnection; public class dao { public static void dbTablesInit() { ResultSet rs = null; // Properties p = new Properties(); Connection cn = getConnection(); try { rs = cn.getMetaData().getTables(null, null, "zhoulin", null); // p.load(dao.class.getResourceAsStream("/dbconfig.properties")); Statement st = cn.createStatement(); // There is no url table if (!rs.next()) { //create url table // st.execute(p.getProperty("createZhouLinTable")); String sql = "CREATE TABLE `zhoulin` (`chapter` varchar(255) NOT NULL,`content` text)ENGINE=MyISAM DEFAULT CHARSET=utf8"; st.executeUpdate(sql); } else { System.out.println("zhoulin table already exists"); } rs.close(); st.close(); cn.close(); } catch (SQLException e) { e.printStackTrace (); } // catch (IOException e) { // e.printStackTrace (); // } } // public static void main(String[] args) { // try { // Connection cnection = getConnection(); // if (cnection != null) { // String sql = "select * from t_user2"; // PreparedStatement preparedStatement = cnection.prepareStatement(sql); // ResultSet resultSet = preparedStatement.executeQuery(); // while (resultSet.next()) { // System.out.printf("%s %s\n", resultSet.getString("user_name"), resultSet.getString("password")); // } // }else{ // System.out.println("Failed to get Connection"); // } // } catch (SQLException e) { // e.printStackTrace (); // } // } public static void insert(String chapter, String content) { Connection cn = getConnection(); String sql = " INSERT INTO zhoulin (chapter, content) VALUES (?,?)"; try { //represents a precompiled sql object PreparedStatement preparedStatement = cn.prepareStatement(sql); preparedStatement.setString(1, chapter); preparedStatement.setString(2, content); preparedStatement.executeUpdate(); 
preparedStatement.close(); cn.close(); } catch (SQLException e) { e.printStackTrace (); } } }
Problem: the `jdbc.user` value in the configuration file contained extra spaces, which broke the database login until trimmed.
Problem: getting Druid working was difficult at first, so I started by learning connection pooling with DBCP before switching to Druid.
So far this week, we have basically implemented crawling the novel content and storing it in MySQL.