1、首先需要引入 jsoup-1.7.1.jar 包。
2、jsoup 的工具类:
package com.wp.util;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * Small helper around jsoup for downloading and parsing HTML pages.
 */
public class JsoupUtil {

    /**
     * Downloads the page at {@code url} and parses it, retrying when an attempt fails.
     *
     * <p>Up to {@code Constants.url_error_count + 1} attempts are made. Each attempt passes
     * {@code Constants.url_ConnectTimeout} to jsoup's {@code timeout(...)}, follows redirects
     * and sends browser-like request headers. This method never throws: every failure is
     * caught, and the last one is reported on {@code System.err} once all attempts are used up.
     *
     * @param url address of the page to fetch; {@code null} or blank yields {@code null}
     *            without any network attempt
     * @return the parsed document, or {@code null} if no attempt succeeded
     */
    public static Document getDocument(String url) {
        // A missing URL can never succeed, so do not burn the whole retry budget on it.
        if (url == null || url.trim().length() == 0) {
            return null;
        }

        Exception lastError = null;
        for (int failures = 0; failures <= Constants.url_error_count; failures++) {
            try {
                // NOTE(review): this advertises deflate and sdch in Accept-Encoding; confirm the
                // jsoup version in use can decode every encoding listed, otherwise narrow it.
                Document doc = Jsoup
                        .connect(url)
                        .timeout(Constants.url_ConnectTimeout)
                        .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
                        .header("Accept-Encoding", "gzip,deflate,sdch")
                        .header("Connection", "keep-alive")
                        .followRedirects(true)
                        .userAgent("Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)")
                        .get();
                if (doc != null) {
                    return doc;
                }
            } catch (Exception e) {
                // Deliberately broad: callers rely on a null return instead of an exception.
                lastError = e;
            }
        }

        if (lastError != null) {
            // Previously failures vanished silently; surface the final cause for diagnosis.
            System.err.println("JsoupUtil.getDocument: giving up on " + url + ": " + lastError);
        }
        return null;
    }

    /**
     * Parses an HTML string into a jsoup document.
     *
     * @param html HTML source text
     * @return the document produced by {@code Jsoup.parse(html)}
     */
    public static Document parseHtml(String html) {
        return Jsoup.parse(html);
    }
}
3、主要采集的类:
package com.wp.test;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.wp.util.JsoupUtil;
import com.wp.util.Util;

/**
 * Scrapes a novel: reads the chapter index page, downloads every chapter it links to,
 * and appends chapter titles and text to a single local text file.
 */
public class Caiji {

    /** Program entry point; simply runs {@link #start()}. */
    public static void main(String[] args) {
        start();
    }

    /**
     * Fetches the chapter index from the hard-coded URL and writes every chapter to
     * {@code E:/abc.txt} as "title, blank line, text, blank line".
     *
     * <p>If the index cannot be fetched a message is printed and the output file is left
     * untouched. A chapter whose page cannot be fetched is written as its title only.
     * Errors while scraping or writing are printed and end the run; they are not rethrown.
     */
    public static void start() {
        Document doc = JsoupUtil.getDocument("http://www.00kxs.com/html/0/596/"); // chapter index page
        if (doc == null) {
            // Bail out before opening the file so a failed run does not truncate earlier output.
            System.out.println("没有获取正文");
            return;
        }

        BufferedWriter w = null;
        try {
            File file = new File("E:/abc.txt");
            // Explicit UTF-8: the platform default charset may be unable to encode the chapter text.
            w = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "UTF-8"));

            Elements list_a = doc.select("div[id=list] dl dd a"); // one link per chapter
            for (Element el : list_a) {
                String url = el.attr("abs:href"); // chapter URL in absolute form
                String name = el.text(); // chapter title
                int chp_num = Util.parseInt(Util.getMatch("第(\\d+)章", name, 1));
                System.out.println(url + "=====" + name + "====" + chp_num);

                // An empty URL cannot be fetched; skip the request and its retries.
                Document e_content = url.length() == 0 ? null : JsoupUtil.getDocument(url);

                w.append(name).append("\n\n");
                if (e_content != null) {
                    String content = cleanContent(e_content.select("div[id=content]").html());
                    w.append(content).append("\n\n");
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (w != null) {
                try {
                    w.close();
                } catch (IOException e) {
                    // close() flushes the buffer, so a failure here can mean lost text.
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Converts the inner HTML of a chapter's content element into plain text lines.
     *
     * @param html inner HTML of the content element
     * @return text with non-breaking spaces turned into spaces, {@code <br>} tags turned
     *         into newlines, and runs of newlines collapsed to one
     */
    private static String cleanContent(String html) {
        // The original replaced what reads as a plain space with a space (a no-op).
        // NOTE(review): this looks like an "&nbsp;" literal decoded when the snippet was
        // published; both the entity and the raw U+00A0 character are handled here - confirm.
        String text = html.replace("&nbsp;", " ").replace('\u00A0', ' ');
        // Accept <br>, <br/> and <br /> so the result does not depend on how the tag is serialised.
        text = text.replaceAll("(?i)<br\\s*/?>", "\n");
        return text.replaceAll("\n+", "\n");
    }
}