前阵子做了个网页抓取工具，可扩展性较差。今天发现了一个开源的 Java HTML 抓取与解析库 jsoup，写了个测试，与大家分享一下。
package com.gump.net.html.test;

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Test class: a worked example of fetching a concrete web page with jsoup and
 * walking the parsed HTML.
 *
 * <p>The program downloads one fixed page, prints its {@code <title>} element,
 * prints every {@code <a>} element, and finally prints the elapsed wall-clock
 * time in milliseconds.
 *
 * @author ganliang13
 * @see <a href="http://ganliang13.iteye.com/">http://ganliang13.iteye.com/</a>
 */
public class test {

    /** Address of the page fetched by this example. */
    private static final String PAGE_URL = "http://www.qzone.cc/Gexing/Qian/02/26263.html";

    /** Timeout handed to jsoup's {@code Connection.timeout}, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 30000;

    /**
     * Fetches {@link #PAGE_URL}, prints its title and anchor elements to standard
     * output, then prints how many milliseconds the whole run took.
     *
     * @param args command-line arguments; not used
     * @throws IOException if jsoup fails to fetch the page
     */
    public static void main(String[] args) throws IOException {
        long begin = System.currentTimeMillis();

        // The whole HTML document, fetched with an explicit timeout.
        Document doc = Jsoup.connect(PAGE_URL).timeout(TIMEOUT_MILLIS).get();

        // Print the document's <title> element(s).
        System.out.println(doc.getElementsByTag("title"));

        // Print every <a> element, one per line.
        Elements anchors = doc.getElementsByTag("a");
        for (Element anchor : anchors) {
            System.out.println(anchor.toString());
        }

        long end = System.currentTimeMillis();
        System.out.println(end - begin);
    }
}