Java Web Crawler Code

Java web crawler code that fetches a website's HTML

package Demo;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;

public class WebSpiderTest {

	public static void main(String[] args) {
		try {
			// Open a stream to the page and dump its source line by line
			URL url = new URL("http://www.163.com");
			BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()));
			String temp;
			while ((temp = reader.readLine()) != null) {
				System.out.println(temp);
			}
			reader.close();
		} catch (MalformedURLException e) {
			// the URL string is not a well-formed URL
			e.printStackTrace();
		} catch (IOException e) {
			// network or read failure
			e.printStackTrace();
		}
	}

}
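Note that url.openStream() sends no User-Agent header, and some sites refuse or throttle the default Java client. Below is a small variation on the same loop that goes through URLConnection so a browser-like User-Agent can be set; the class name WebSpiderHeaderTest and the header value are only illustrative, not part of the original code.

package Demo;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;

public class WebSpiderHeaderTest {

	public static void main(String[] args) {
		try {
			URL url = new URL("http://www.163.com");
			// Go through URLConnection so request headers can be set before reading
			URLConnection conn = url.openConnection();
			conn.setRequestProperty("User-Agent", "Mozilla/5.0");
			BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream()));
			String temp;
			while ((temp = reader.readLine()) != null) {
				System.out.println(temp);
			}
			reader.close();
		} catch (IOException e) {
			// covers MalformedURLException as well as network/read failures
			e.printStackTrace();
		}
	}

}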

Wrapping the previous code in a reusable method

package Demo;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Returns the HTML source of the page at the given URL.
 * @author hejun
 *
 */
public class WebSpiderTest {
	public static String getURLContent(String urlStr, String charset) {
		StringBuilder sb = new StringBuilder();
		try {
			URL url = new URL(urlStr);
			// Decode the stream with the caller-supplied charset (e.g. "gbk" for www.163.com)
			BufferedReader reader = new BufferedReader(
					new InputStreamReader(url.openStream(), Charset.forName(charset)));
			String temp;
			while ((temp = reader.readLine()) != null) {
				sb.append(temp);
			}
			reader.close();
		} catch (MalformedURLException e) {
			// the URL string is not a well-formed URL
			e.printStackTrace();
		} catch (IOException e) {
			// network or read failure
			e.printStackTrace();
		}
		return sb.toString();
	}

	public static void main(String[] args) {
		String destStr = getURLContent("http://www.163.com", "gbk");
		System.out.println(destStr);
	}

}
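If the caller passes the wrong charset, the returned string comes back garbled. Below is a rough sketch of reading the charset from the HTTP Content-Type header before calling getURLContent; the helper sniffCharset and the class CharsetSniffTest are hypothetical names, and falling back to gbk is only an assumption for pages (like www.163.com) whose response header does not declare a charset.

package Demo;

import java.io.IOException;
import java.net.URL;
import java.net.URLConnection;

public class CharsetSniffTest {

	// Hypothetical helper: returns the charset=... parameter of the Content-Type header, if present
	public static String sniffCharset(String urlStr, String fallback) {
		try {
			URLConnection conn = new URL(urlStr).openConnection();
			String contentType = conn.getContentType(); // e.g. "text/html; charset=GBK"
			if (contentType != null) {
				for (String part : contentType.split(";")) {
					part = part.trim();
					if (part.toLowerCase().startsWith("charset=")) {
						return part.substring("charset=".length());
					}
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
		return fallback;
	}

	public static void main(String[] args) {
		String charset = sniffCharset("http://www.163.com", "gbk");
		String destStr = WebSpiderTest.getURLContent("http://www.163.com", charset);
		System.out.println(destStr);
	}

}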

Extracting the page's hyperlinks and handling garbled text (mojibake)

package Demo;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Returns the HTML source of the page at the given URL.
 * @author hejun
 *
 */
public class WebSpiderTest {
	public static String getURLContent(String urlStr, String charset) {
		StringBuilder sb = new StringBuilder();
		try {
			URL url = new URL(urlStr);
			// Decode the stream with the caller-supplied charset (e.g. "gbk" for www.163.com)
			BufferedReader reader = new BufferedReader(
					new InputStreamReader(url.openStream(), Charset.forName(charset)));
			String temp;
			while ((temp = reader.readLine()) != null) {
				sb.append(temp);
			}
			reader.close();
		} catch (MalformedURLException e) {
			// the URL string is not a well-formed URL
			e.printStackTrace();
		} catch (IOException e) {
			// network or read failure
			e.printStackTrace();
		}
		return sb.toString();
	}

	public static void main(String[] args) {
		String destStr = getURLContent("http://www.163.com", "gbk");
		System.out.println(destStr);
		// To capture the whole <a>...</a> element instead:
		// Pattern p = Pattern.compile("<a[\\s\\S]*?</a>");
		// Capture just the address inside the href attribute
		Pattern p = Pattern.compile("href=\"(.+?)\"");
		Matcher m = p.matcher(destStr);

		while (m.find()) {
			System.out.println(m.group());   // the whole href="..." match
			System.out.println(m.group(1));  // just the URL between the quotes
		}
	}

}
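Many of the captured href values are relative paths rather than full URLs. A minimal sketch of turning them into absolute URLs with the URL(URL context, String spec) constructor follows; the class ResolveLinkTest and the sample link are only illustrative.

package Demo;

import java.net.MalformedURLException;
import java.net.URL;

public class ResolveLinkTest {

	public static void main(String[] args) throws MalformedURLException {
		// The page the link was found on
		URL base = new URL("http://www.163.com/news/index.html");
		// A relative href extracted by the regex (illustrative value)
		String href = "../sports/a.html";
		// URL(context, spec) resolves the relative path against the base URL
		URL absolute = new URL(base, href);
		System.out.println(absolute); // http://www.163.com/sports/a.html
	}

}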

Refactoring the previous code: moving the regex matching into its own method

package Demo;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Returns the HTML source of the page at the given URL.
 * @author hejun
 *
 */
public class WebSpiderTest {
	public static String getURLContent(String urlStr, String charset) {
		StringBuilder sb = new StringBuilder();
		try {
			URL url = new URL(urlStr);
			// Decode the stream with the caller-supplied charset (e.g. "gbk" for www.163.com)
			BufferedReader reader = new BufferedReader(
					new InputStreamReader(url.openStream(), Charset.forName(charset)));
			String temp;
			while ((temp = reader.readLine()) != null) {
				sb.append(temp);
			}
			reader.close();
		} catch (MalformedURLException e) {
			// the URL string is not a well-formed URL
			e.printStackTrace();
		} catch (IOException e) {
			// network or read failure
			e.printStackTrace();
		}
		return sb.toString();
	}
	
	/**
	 * Returns every group(1) capture of regexStr found in destStr.
	 */
	public static List<String> getMatherSubstrs(String destStr, String regexStr) {
		// Collect the first capturing group of each match, e.g. the address inside href="..."
		Pattern p = Pattern.compile(regexStr);
		Matcher m = p.matcher(destStr);
		List<String> result = new ArrayList<String>();
		while (m.find()) {
			result.add(m.group(1));
		}
		return result;
	}

	public static void main(String[] args) {
		String destStr = getURLContent("http://www.163.com", "gbk");
		System.out.println(destStr);
		// To capture whole <a>...</a> elements instead, pass "<a[\\s\\S]*?</a>" (and drop the group(1) indexing)
		List<String> result = getMatherSubstrs(destStr, "href=\"([\\w\\s./:]+?)\"");
		for (String t : result) {
			System.out.println(t);
		}
	}

}
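With getURLContent and getMatherSubstrs in place, a simple breadth-first crawl only needs a queue and a visited set. Below is a rough sketch under a few assumptions: the two methods above live on WebSpiderTest as shown, every page is decoded as gbk, only absolute http links are followed, and the page limit of 10 is an arbitrary value; the class name MiniCrawlerTest is likewise just illustrative.

package Demo;

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class MiniCrawlerTest {

	public static void main(String[] args) {
		Deque<String> queue = new ArrayDeque<String>();
		Set<String> visited = new HashSet<String>();
		queue.add("http://www.163.com");
		int maxPages = 10; // arbitrary limit for this sketch

		while (!queue.isEmpty() && visited.size() < maxPages) {
			String current = queue.poll();
			if (!visited.add(current)) {
				continue; // already crawled this page
			}
			System.out.println("Crawling: " + current);
			// Assumes every page is gbk-encoded; real pages may declare other charsets
			String html = WebSpiderTest.getURLContent(current, "gbk");
			// Reuse the regex from above; only absolute links are enqueued here
			List<String> links = WebSpiderTest.getMatherSubstrs(html, "href=\"([\\w\\s./:]+?)\"");
			for (String link : links) {
				if (link.startsWith("http") && !visited.contains(link)) {
					queue.add(link);
				}
			}
		}
	}

}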

If you like my posts, feel free to follow me so we can learn and share together.

WeChat official account:

让我爱上它Computer

QQ group: 473989408
