package com.hundsun.pc;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
/**
 * Small crawler: {@code main} fetches pages below {@link #url}, and
 * {@code geUrl} pulls links and key/value pairs out of the {@code <li>}
 * elements of the fetched HTML.
 */
public class Url {
// Not referenced by any method visible in this file.
ArrayList<String> links= new ArrayList<String>();
// Site root; main prepends it to every queued path before fetching.
final static String url="http://www.diqudaima.com";
/**
 * Crawls the site starting from {@code /zhejiang/hangzhoushi/}.
 *
 * <p>Each queued path is appended to {@code url}, fetched, decoded as GBK
 * and handed line by line to {@link #geUrl(String, LinkQueue)}, which may
 * queue further paths. The loop ends when the unvisited queue is empty.
 *
 * <p>A path whose fetch fails is recorded as an error and retried once; a
 * second failure is only reported, so a permanently broken link cannot keep
 * the crawl running forever.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    try {
        LinkQueue queue = new LinkQueue();
        queue.addUnvisitedUrl("/zhejiang/hangzhoushi/");
        // One extractor is enough; geUrl keeps no state between calls.
        Url extractor = new Url();
        do {
            String vistor = null;
            HttpClient http = null;
            try {
                http = new DefaultHttpClient();
                vistor = queue.unVisitedUrlDeQueue().toString();
                System.out.println("开始访问:" + url + vistor);
                HttpResponse hr = http.execute(new HttpGet(url + vistor));
                HttpEntity he = hr.getEntity();
                if (he != null) {
                    // The response is always decoded as GBK, whatever charset it declares.
                    BufferedReader br = new BufferedReader(
                            new InputStreamReader(he.getContent(), "GBK"));
                    try {
                        String line;
                        while ((line = br.readLine()) != null) {
                            extractor.geUrl(line, queue);
                        }
                    } finally {
                        br.close();
                    }
                }
                queue.addVisitedUrl(vistor);
                System.out.println("获取区域信息数:"+queue.getMap().size()+"已经访问链接数:"+queue.getVisitedUrl().size()+"异常链接数:"+queue.getErrorUrl().size());
            } catch (Exception e) {
                System.out.println("访问异常:"+url+vistor);
                // Re-queue only on the first failure of this path; otherwise a
                // dead link would be dequeued and re-queued indefinitely.
                boolean failedBefore = queue.getErrorUrl().contains(vistor);
                queue.adderrorUrl(vistor);
                if (!failedBefore) {
                    queue.addUnvisitedUrl(vistor);
                }
            } finally {
                // Release the connection on both the success and the failure path.
                if (http != null) {
                    http.getConnectionManager().shutdown();
                }
            }
        } while (!queue.isUnvisitedUrlsEmpty());
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Scans an HTML fragment for {@code <li>} elements, queues the links they
 * contain and stores one key/value pair per element in the queue's map.
 *
 * <p>For each {@code <li>} the direct children are walked in order:
 * <ul>
 *   <li>a link child sets the key to the link text and queues its target;</li>
 *   <li>any other child sets the value to its text; text containing
 *       {@code 地区编码:} followed by {@code 邮编:} is reduced to what lies
 *       between the two labels, and text starting with {@code [} has its
 *       brackets removed;</li>
 *   <li>if no key has been seen yet and the text has the form
 *       {@code name[code]}, the name becomes the key and the code the value.</li>
 * </ul>
 * Elements that do not fit these shapes are skipped instead of aborting the
 * rest of the fragment.
 *
 * @param html  HTML fragment to scan
 * @param queue receives discovered links and the extracted key/value pairs
 */
public void geUrl(String html,LinkQueue queue){
    final String codeLabel = "地区编码:";
    final String zipLabel = "邮编:";
    try {
        NodeFilter filter = new TagNameFilter("li");
        Parser p = new Parser();
        p.setInputHTML(html);
        NodeList list = p.extractAllNodesThatMatch(filter);
        for (int i = 0; i < list.size(); i++) {
            NodeList listChildren = list.elementAt(i).getChildren();
            if (listChildren == null) {
                // A <li> without children has nothing to extract.
                continue;
            }
            String key = "";
            String value = "";
            for (int j = 0; j < listChildren.size(); j++) {
                Node child = listChildren.elementAt(j);
                if (child instanceof LinkTag) {
                    LinkTag link = (LinkTag) child;
                    key = link.getLinkText();
                    queue.addUnvisitedUrl(link.getLink());
                } else {
                    value = child.getText();
                    // Cut out the code only when both labels are present and in order.
                    int codeStart = value.indexOf(codeLabel);
                    int codeEnd = value.indexOf(zipLabel);
                    if (codeStart >= 0 && codeEnd >= codeStart + codeLabel.length()) {
                        value = value.substring(codeStart + codeLabel.length(), codeEnd);
                    }
                    if (value.startsWith("[")) {
                        value = value.replace("[", "").replace("]", "");
                    }
                    if (key.equals("")) {
                        // "name[code]" without a preceding link: split it into key and value.
                        String[] parts = value.split("\\[");
                        if (parts.length > 1) {
                            key = parts[0];
                            value = parts[1].replace("]", "");
                        }
                    }
                }
            }
            System.out.println("key="+key+" value="+value);
            if (!key.equals("")) {
                queue.getMap().put(key, value);
            }
        }
    } catch (Exception e) {
        // Best effort: a fragment the parser rejects is reported and skipped.
        e.printStackTrace();
    }
}
辅助类(LinkQueue.java):
package com.hundsun.pc;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
/**
 * Bookkeeping for a crawl: the set of visited URLs, the set of URLs that
 * were reported as failed, the queue of URLs still to visit, and a map of
 * extracted key/value results.
 *
 * <p>The URL collections are static and therefore shared by all instances;
 * only the result map belongs to an individual instance.
 *
 * Created by amosli on 14-7-9.
 */
public class LinkQueue {
    /** URLs that have already been visited. */
    private static final Set<String> visitedUrl = new HashSet<String>();
    /** URLs reported through {@link #adderrorUrl(String)}. */
    private static Set<String> errorUrl = new HashSet<String>();
    /** URLs that have not been visited yet. */
    private static final Queue unVisitedUrl = new Queue();
    /** Extracted results held by this instance. */
    private Map<String, String> map = new HashMap<String, String>();

    /** Returns the queue of URLs that have not been visited yet. */
    public static Queue getUnVisitedUrl() {
        return unVisitedUrl;
    }

    /** Returns the live set of visited URLs. */
    public static Set<String> getVisitedUrl() {
        return visitedUrl;
    }

    /** Adds a URL to the visited set. */
    public static void addVisitedUrl(String url) {
        visitedUrl.add(url);
    }

    /** Adds a URL to the error set. */
    public static void adderrorUrl(String url) {
        errorUrl.add(url);
    }

    /** Removes a URL from the visited set. */
    public static void removeVisitedUrl(String url){
        visitedUrl.remove(url);
    }

    /** Removes and returns the next URL from the unvisited queue. */
    public static Object unVisitedUrlDeQueue(){
        return unVisitedUrl.deQueue();
    }

    /**
     * Queues a URL for visiting so that each URL is visited only once: the
     * URL is ignored when it is null or blank, when it is already in the
     * visited set, or when it is already waiting in the unvisited queue.
     */
    public static void addUnvisitedUrl(String url){
        if (url != null && !url.trim().equals("")
                && !visitedUrl.contains(url) && !unVisitedUrl.contains(url)) {
            unVisitedUrl.enQueue(url);
        }
    }

    /** Returns the number of visited URLs. */
    public static int getVisitedUrlNum(){
        return visitedUrl.size();
    }

    /** Returns whether the unvisited queue is empty. */
    public static boolean isUnvisitedUrlsEmpty(){
        return unVisitedUrl.empty();
    }

    /** Returns this instance's live result map. */
    public Map<String, String> getMap() {
        return map;
    }

    /** Replaces this instance's result map. */
    public void setMap(Map<String, String> map) {
        this.map = map;
    }

    /** Returns the live set of error URLs. */
    public static Set<String> getErrorUrl() {
        return errorUrl;
    }

    /** Replaces the shared set of error URLs. */
    public static void setErrorUrl(Set<String> errorUrl) {
        LinkQueue.errorUrl = errorUrl;
    }
}
运行结果如下图