pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">

  <parent>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-parent</artifactId>
    <version>1.7.6</version>
  </parent>

  <modelVersion>4.0.0</modelVersion>

  <groupId>org.slf4j</groupId>
  <artifactId>slf4j-log4j12</artifactId>
  <packaging>jar</packaging>
  <name>SLF4J LOG4J-12 Binding</name>
  <description>SLF4J LOG4J-12 Binding</description>
  <url>http://www.slf4j.org</url>

  <dependencies>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-jar-plugin</artifactId>
        <configuration>
          <archive>
            <manifestEntries>
              <Bundle-Version>${parsedVersion.osgiVersion}</Bundle-Version>
              <Bundle-Description>${project.description}</Bundle-Description>
              <Implementation-Version>${project.version}</Implementation-Version>
            </manifestEntries>
            <!-- Use the pre-generated OSGi manifest from the build output directory. -->
            <manifestFile>${project.build.outputDirectory}/META-INF/MANIFEST.MF</manifestFile>
          </archive>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
spring.xml
<bean id="dataSource" class="com.alibaba.druid.pool.DruidDataSource"
      init-method="init" destroy-method="close">
  <!-- Basic connection attributes: url, user, password. -->
  <property name="url" value="${jdbc.url}" />
  <property name="username" value="${jdbc.user}" />
  <property name="password" value="${jdbc.password}" />

  <!-- Pool sizing: initial, minimum idle, and maximum active connections. -->
  <property name="initialSize" value="1" />
  <property name="minIdle" value="1" />
  <property name="maxActive" value="20" />

  <!-- Maximum time (ms) to wait when borrowing a connection. -->
  <property name="maxWait" value="60000" />

  <!-- Interval (ms) between eviction runs that close idle connections. -->
  <property name="timeBetweenEvictionRunsMillis" value="60000" />

  <!-- Minimum time (ms) a connection stays idle in the pool before eviction. -->
  <property name="minEvictableIdleTimeMillis" value="300000" />

  <!-- Validation: test idle connections in the background, not on borrow/return. -->
  <property name="validationQuery" value="SELECT 'x'" />
  <property name="testWhileIdle" value="true" />
  <property name="testOnBorrow" value="false" />
  <property name="testOnReturn" value="false" />

  <!-- Enable the PreparedStatement cache and cap its size per connection. -->
  <property name="poolPreparedStatements" value="true" />
  <property name="maxPoolPreparedStatementPerConnectionSize" value="20" />

  <!-- Monitoring filters; without "stat" the monitor UI cannot aggregate SQL. -->
  <property name="filters" value="stat" />
</bean>
StoreMysql
package com.crawl; import com.crawl.util.HttpClientUtil; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.IOException; import static com.crawl.dao.dao.dbTablesInit; import static com.crawl.dao.dao.insert; public class StoreMysql { public static void operateMysql(String url) throws IOException { String content = HttpClientUtil.getContent(url); Document doc = Jsoup.parse(content); // 解析网页 得到文档对象 dbTablesInit(); Elements hrefElements = doc.select("#list dl dd a"); for (Element e : hrefElements) { String urlIndex = "http://www.biquge.com.tw" + e.attr("href"); String contentIndex = HttpClientUtil.getContent(urlIndex); Document docIndex = Jsoup.parse(contentIndex); // 解析网页 得到文档对象 // 获取章节名 Elements chapterElements = docIndex.getElementsByTag("h1"); // 根据tag名称来查询DOM Element chapterElement = chapterElements.get(0); String chapter = chapterElement.text(); String ct = docIndex.select("#content").text().replaceAll("\\s+","\r\n"); insert(chapter,ct); } } }
HttpClientUtil
package com.crawl.util; import org.apache.http.HttpEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import java.io.IOException; public class HttpClientUtil { private static CloseableHttpClient httpClient; public static String getContent(String url) { String content= ""; CloseableHttpResponse response=null; try { //即将访问的url // String url = "http://www.biquge.com.tw/17_17380/"; //使用默认配置的httpclient httpClient = HttpClients.createDefault(); //执行请求 response = getResponse(url); //打印请求的实体内容 返回json格式 HttpEntity entity = response.getEntity(); content=EntityUtils.toString(entity, "GBK"); response.close(); } catch (IOException e) { e.printStackTrace(); } return content; } public static CloseableHttpResponse getResponse(String url) throws IOException { HttpGet httpGet = new HttpGet(url); CloseableHttpResponse response = httpClient.execute(httpGet); return response; } }
DruidTest
package com.crawl.druid; import com.alibaba.druid.pool.DruidDataSource; import java.io.IOException; import java.sql.*; import java.util.Properties; public class DruidTest { private static DruidDataSource druidDataSource; /** * 读取配置文件,并且初始化连接池 */ private static void init() { Properties prop = new Properties(); try { prop.load(DruidTest.class.getResourceAsStream("/druid.properties")); //声明DruidDataSource druidDataSource = new DruidDataSource(); druidDataSource.setDriverClassName(prop.getProperty("jdbc.driverClassName")); druidDataSource.setUrl(prop.getProperty("jdbc.url")); druidDataSource.setUsername(prop.getProperty("jdbc.user")); druidDataSource.setPassword(prop.getProperty("jdbc.password")); } catch (IOException e) { e.printStackTrace(); } } /** * 获得Connection * * @return Connection */ public synchronized static Connection getConnection() { if (druidDataSource == null) { init(); } try { return druidDataSource.getConnection(); } catch (SQLException e) { e.printStackTrace(); } return null; } }
Dao
package com.crawl.dao; import com.crawl.druid.DruidTest; import java.io.IOException; import java.sql.*; import java.util.Properties; import static com.crawl.druid.DruidTest.getConnection; //import static com.crawl.dbcp.DBCPTest.getConnection; public class dao { public static void dbTablesInit() { ResultSet rs = null; // Properties p = new Properties(); Connection cn = getConnection(); try { rs = cn.getMetaData().getTables(null, null, "zhoulin", null); // p.load(dao.class.getResourceAsStream("/dbconfig.properties")); Statement st = cn.createStatement(); //不存在url表 if (!rs.next()) { //创建url表 // st.execute(p.getProperty("createZhouLinTable")); String sql = "CREATE TABLE `zhoulin` (`chapter` varchar(255) NOT NULL,`content` text)ENGINE=MyISAM DEFAULT CHARSET=utf8"; st.executeUpdate(sql); } else { System.out.println("zhoulin表已存在"); } rs.close(); st.close(); cn.close(); } catch (SQLException e) { e.printStackTrace(); } // catch (IOException e) { // e.printStackTrace(); // } } // public static void main(String[] args) { // try { // Connection cnection = getConnection(); // if (cnection != null) { // String sql = "select * from t_user2"; // PreparedStatement preparedStatement = cnection.prepareStatement(sql); // ResultSet resultSet = preparedStatement.executeQuery(); // while (resultSet.next()) { // System.out.printf("%s %s\n", resultSet.getString("user_name"), resultSet.getString("password")); // } // }else{ // System.out.println("获取Connection失败"); // } // } catch (SQLException e) { // e.printStackTrace(); // } // } public static void insert(String chapter, String content) { Connection cn = getConnection(); String sql = " INSERT INTO zhoulin (chapter, content) VALUES (?,?)"; try { //表示预编译的sql对象 PreparedStatement preparedStatement = cn.prepareStatement(sql); preparedStatement.setString(1, chapter); preparedStatement.setString(2, content); preparedStatement.executeUpdate(); preparedStatement.close(); cn.close(); } catch (SQLException e) { e.printStackTrace(); } } }
问题:配置文件中 user 配置项的值多写了空格,需要去掉多余空格。
问题:druid 配置上手较难。解决:先从较简单的 dbcp 入手学习连接池,再过渡到 druid。
至此本周基本实现爬取小说内容到mysql中。