Java SE之网络爬虫①

一需求描述
　　给一个url，将url内的所有的链接查找出来，并补充完整为绝对路径
二程序实现
/**
 * 
 * @author Zen Johnny
 * @date 2018年4月29日 下午7:22:44
 *
 */
package spider;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class WebPageSpider {
	private static Pattern pattern = null;
	private static Matcher matcher = null;
	private static BufferedReader br = null;
	private static StringBuffer text = null;
	private static URL _url = null;
	private static List<String> links = null;
	
	public static String captureWebPageContent(String url) throws UnsupportedEncodingException, IOException {
		text = new StringBuffer();
		_url = new URL(url);
		br = new BufferedReader(new InputStreamReader(_url.openStream(), "utf-8"));
		String line = null;
		while( (line = br.readLine()) != null) {
			text.append(line);
		}
		return text.toString();
	}
	
	/*
	 	在网页文本中检索并返回其所有链接
	 	@param refUrl:参照的绝对路径
	 	Eg：refAbsoluteUrl -> xxxx.com/X/Y/M/L/test.html?key=35435
	 */
	public static List<String> findLinks(String text, String refAbsoluteUrl) throws MalformedURLException {
		links = new LinkedList<String>();
		String regex = "((href)|(src)){1}=([\"\'])(.*?)\\4";//\\4:若前面是双(单)引号,则结束的时候也必须是双(单)引号
		pattern = Pattern.compile(regex);
		matcher = pattern.matcher(text);
		String link = null;
		while(matcher.find()) {
			link = matcher.group().replaceAll("((href=)|(src=)[\"\'])|([\'\"])", "");
			if(link.startsWith(".")) {//如果以相对路径开头，则为其默认添加基URL
				links.add(revertToAbsolutePath(refAbsoluteUrl, link));
			} else if(link.startsWith("/")){//以根路径作为开头
				URL tmp_url = new URL(refAbsoluteUrl);
				links.add(tmp_url.getHost() + link);
			} else if(link.endsWith("#")){//以#作为路径，即 当前路径(参照路径)
				links.add(refAbsoluteUrl);
			} else {
				links.add(link);
			}
			
		}
		return links;
	}
	
	/*
	 	将URL路径转为路径链表
	 	Eg:xxxx.com/X/Y/M/L/test.html?key=35435 => xxxx.com X Y M L test.html?key=35435
	 */
	public static List<String> dirs(String url) {
		java.util.List<String> dirs = new LinkedList<String>();
		String [] dirsArray = url.split("/+");
		for(String item : dirsArray) {
//			System.out.println(item);//test
			dirs.add(item.trim());
		}
		return dirs;
	}
	
	/*
	 	将相对路径恢复为绝对路径
	 	@param:curAbsoluteUrl	参照的绝对路径
	 	@param:relativeSubUrl	相对子路径
	 	
	 	xxxx.com/X/Y/M/L/test.html?key=35435	/hr/ry/ry			=> 	xxxx.com/X/Y/M/L/hr/ry/ry
	 	xxxx.com/X/Y/M/L/						./../../G/J/d.x		=>	xxxx.com/X/Y/G/J/d.x		
	 */
	public static String revertToAbsolutePath(String refAbsoluteUrl, String relativeSubUrl) {
		List<String> refPaths = dirs(refAbsoluteUrl);//参照路径链表
		List<String> relativePaths = dirs(relativeSubUrl);//相对路径链表
		List<String> dirs = new LinkedList<String>();
		StringBuffer path = new StringBuffer();

		if(refPaths.get(refPaths.size() - 1).matches("(.)*[\\.\\?](.)*")) {//若参照路径的最后一项以文件或者query形式结尾，则删除最后一项
//			System.out.println(refPaths.get(dirs.size() - 1));//test
			refPaths.remove(refPaths.size() - 1);
		}
		
		for(String item : relativePaths) {
//			System.out.println("item:"+item);//test
			if(item.equals("..")) {//上一层路径，则：删除refPaths的最后一项
				refPaths.remove(refPaths.size() - 1);
			} else if(!item.equals(".")) {//不为当前路径.或者空路径，即:实实在在的路径 
				if(!item.equals("")) {
					refPaths.add(item);
				} else {
//					System.out.println("【空】");//test
				}
			}
		}
		for(int item = 0,size = refPaths.size();item<size;item++) {//test
			path.append(refPaths.get(item) + (item+1 == size?"":"/"));
		}
		return path.toString();
	}
}
Java SE之网络爬虫①

猜你喜欢