提取页面、文件中的链接


import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Small demo that reads a text/HTML file and extracts the {@code src}
 * attribute of the {@code img} tags it contains.
 */
public class PatternTest {

	/** File read by {@link #main(String[])} when no path is passed on the command line. */
	private static final String DEFAULT_PATH = "d:/test.txt";

	/** Regex matching one complete {@code img} tag (tag name is case-insensitive). */
	private static final String IMG_TAG_REGEX = "<(?i)img.*?>";

	/**
	 * Matches a {@code src} attribute inside a single tag. Exactly one of the
	 * three capture groups holds the bare value: double-quoted, single-quoted
	 * or unquoted. The look-behind requires whitespace before the name so that
	 * attributes such as {@code data-src} are not mistaken for {@code src}.
	 * Compiled once because it never changes.
	 */
	private static final Pattern SRC_PATTERN = Pattern.compile(
			"(?<=\\s)src\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s>]+))",
			Pattern.CASE_INSENSITIVE);

	/**
	 * Reads a file and prints every {@code img} tag found in it.
	 *
	 * @param args optional; {@code args[0]} is the file to scan, otherwise
	 *             {@code d:/test.txt} is used
	 */
	public static void main(String[] args) {
		String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PATH;
		PatternTest t = new PatternTest();
		String content = t.redFIle(path);
		t.replaceHref(content, IMG_TAG_REGEX);
	}

	/**
	 * Reads a whole file as UTF-8 text.
	 *
	 * Lines are joined with {@code '\n'} so that tokens on adjacent lines are
	 * not fused together; no terminator is added after the last line.
	 * Reading is best-effort: on any error the stack trace is printed and
	 * whatever was read so far (possibly the empty string) is returned.
	 *
	 * @param path path of the file to read
	 * @return the file content, never {@code null}
	 */
	public String redFIle(String path) {
		StringBuffer sb = new StringBuffer();
		FileInputStream fin = null;
		try {
			fin = new FileInputStream(path);
			BufferedReader br = new BufferedReader(new InputStreamReader(fin, "utf-8"));
			String line;
			boolean first = true;
			while ((line = br.readLine()) != null) {
				if (!first) {
					// readLine() strips the terminator; put a separator back.
					sb.append('\n');
				}
				sb.append(line);
				first = false;
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// Release the file handle even when reading failed half-way.
			if (fin != null) {
				try {
					fin.close();
				} catch (java.io.IOException ignored) {
					// Nothing useful can be done if close itself fails.
				}
			}
		}
		return sb.toString();
	}

	/**
	 * Finds every tag matched by {@code regex} in {@code hBody}, prints it, and
	 * extracts its {@code src} attribute value.
	 *
	 * @param hBody text to scan; {@code null} is treated as empty
	 * @param regex regular expression matching one whole tag; compiled with
	 *              {@link Pattern#DOTALL} so a tag may span several lines
	 * @return the trimmed {@code src} value of the last matched tag, or the
	 *         empty string when nothing matched or that tag has no {@code src}
	 */
	public String replaceHref(String hBody, String regex) {
		String url = "";
		if (hBody == null) {
			return url;
		}
		Matcher tagMatcher = Pattern.compile(regex, Pattern.DOTALL).matcher(hBody);
		while (tagMatcher.find()) {
			String tag = tagMatcher.group();
			System.out.println(tag); // print the matched img tag
			url = getContents(tag, SRC_PATTERN).trim();
		}
		return url;
	}

	/**
	 * Extracts an attribute value from a single tag.
	 *
	 * Only the first match is used. The value is taken from the pattern's
	 * capture groups instead of stripping quote characters from the whole
	 * match, so quotes or {@code src=} text inside the value are preserved.
	 *
	 * @param contents text of one tag
	 * @param pattern  pattern whose first non-null capture group is the value
	 * @return the captured value, the whole match when the pattern has no
	 *         participating group, or the empty string when nothing matches
	 */
	private String getContents(String contents, Pattern pattern) {
		Matcher m = pattern.matcher(contents);
		if (!m.find()) {
			return "";
		}
		for (int g = 1; g <= m.groupCount(); g++) {
			String value = m.group(g);
			if (value != null) {
				return value;
			}
		}
		return m.group();
	}

}

	







猜你喜欢

转载自javapx.iteye.com/blog/1958294