A Simple Java Crawler Implementation

    I had a bit of spare time recently, so I did some simple research into web crawlers on my own. The principle is actually very simple: fetch the HTML document of the current page via its URL, then pull the data we need out of that document. A crawler essentially imitates our mouse clicks; any document a mouse click can reach, a crawler can fetch as well.
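
    At its core that is just: open a connection to the URL, read the response body, and you have the page's HTML. A minimal sketch of that idea using only the JDK (www.example.com is a placeholder, not a site from the original code):

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

public class MinimalFetch {

	public static void main(String[] args) throws Exception {
		// Open an HTTP connection to the page, much as a browser would
		HttpURLConnection conn = (HttpURLConnection) new URL("https://www.example.com").openConnection();
		conn.setRequestMethod("GET");

		// Read the response body line by line into one string
		StringBuilder html = new StringBuilder();
		try (BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"))) {
			String line;
			while ((line = reader.readLine()) != null) {
				html.append(line).append('\n');
			}
		}

		// The raw HTML document of the page
		System.out.println(html);
	}
}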

    Without further ado, here is the code. It is just a simple implementation; if you spot anything that could be improved, I would appreciate the pointers.

    The jar packages the crawler needs can be downloaded from: http://download.csdn.net/download/qq_39101581/10271411

package com.test.httpclient;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.List;

import org.apache.http.Consts;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.NoopHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustSelfSignedStrategy;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.util.EntityUtils;

public class TestHttpClient {

	// The HttpClient object
	private static CloseableHttpClient httpClient = null;

	public static void testHttp(String url) {
		CloseableHttpResponse response = null;

		try {
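			// Close any client left over from an earlier (recursive) call before building a fresh
			// one; the finally block below closes the shared instance at the end of every call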
			if (httpClient != null)
				httpClient.close();
			httpClient = getHttpClient();

			// Build the request parameters
			List<NameValuePair> params = new ArrayList<NameValuePair>();
			params.add(new BasicNameValuePair("paramName", "paramValue"));
			// Serialize the parameters into a query string
			String paramsStr = EntityUtils.toString(new UrlEncodedFormEntity(params, Consts.UTF_8));

			// Crawler URLs are mostly GET requests; create the GET request object
			HttpGet httpGet = new HttpGet(url + "?" + paramsStr);
			// Execute the request against the target site and fetch the page source
			response = httpClient.execute(httpGet);
			// EntityUtils converts the response entity into a string
			String entity = EntityUtils.toString(response.getEntity(), "UTF-8");
			System.out.println(entity);

			String urlpath = getFileName(url);// derive a file name from the url (strip https:// and dots)
			List<String> list = getSplitList();// all special characters to strip
			urlpath = splitstr(urlpath, list);// remove those characters from the file name

			String path = "D:\\html\\" + urlpath + ".html";// where the crawled page is stored
			File file = new File(path);

			// Make sure D:\html exists, otherwise createNewFile would fail
			file.getParentFile().mkdirs();
			if (!file.exists())
				try {
					file.createNewFile();
				} catch (Exception e) {
					System.out.println("createNewFile failed: " + urlpath);
					e.printStackTrace();
				}
			BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "utf-8"));
			writer.write(entity);
			writer.close();

			System.out.println("---------------------------------------------------------------------------");

			String str = "<a href=";
			List<Integer> allIndex = getAllIndex(str, entity);// every index of str within entity
			for (Integer index : allIndex) {
				System.out.println(index);
				int indexOfstart = index;

				int indexOfend1 = entity.indexOf(" ", indexOfstart + str.length());
				int indexOfend2 = entity.indexOf(">", indexOfstart + str.length());
				// Take the nearer delimiter, guarding against indexOf returning -1 when one is absent
				int indexOfend = (indexOfend1 == -1) ? indexOfend2
						: (indexOfend2 == -1) ? indexOfend1 : Math.min(indexOfend1, indexOfend2);
				if (indexOfend == -1)
					continue; // neither a space nor '>' found after the href; skip this tag

				String suburl = entity.substring(indexOfstart + str.length(), indexOfend);
				String suffix = "";
				if (suburl.startsWith("http://")) {
					suffix = "http://";
				} else if (suburl.startsWith("https://")) {
					suffix = "https://";
				} else if (suburl.startsWith("//")) {
					suffix = "//";
				} else {
					// quoted href values, relative paths and anchors are skipped
					continue;
				}
				// Normalize every link to https://
				suburl = "https://" + suburl.substring(suffix.length());
				System.out.println(suburl);
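				// Recurse into the link just found (no visited-set or depth limit here; see the caveat after the listing)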
				testHttp(suburl);
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			if (response != null) {
				try {
					response.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			if (httpClient != null) {
				try {
					httpClient.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Create the HttpClient
	 * 
	 * @return
	 * @throws Exception
	 */
	public static CloseableHttpClient getHttpClient() throws Exception {
		/*
		 * Trust self-signed certificates: test servers often lack a (valid) SSL
		 * certificate, which would otherwise raise javax.net.ssl.SSLPeerUnverifiedException
		 */
		SSLContextBuilder builder = new SSLContextBuilder();
		builder.loadTrustMaterial(null, new TrustSelfSignedStrategy());
		SSLConnectionSocketFactory sslConnectionSocketFactory = new SSLConnectionSocketFactory(builder.build(),
				NoopHostnameVerifier.INSTANCE);
		Registry<ConnectionSocketFactory> registry = RegistryBuilder.<ConnectionSocketFactory>create()
				.register("http", new PlainConnectionSocketFactory()).register("https", sslConnectionSocketFactory)
				.build();

		PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager(registry);
		cm.setMaxTotal(100);
		CloseableHttpClient httpClient = HttpClients.custom().setSSLSocketFactory(sslConnectionSocketFactory)
				.setConnectionManager(cm).build();

		return httpClient;
	}

	/**
	 * Get every index at which str occurs within string
	 * 
	 * @param str
	 * @param string
	 * @return
	 */
	public static List<Integer> getAllIndex(String str, String string) {
		List<Integer> list = new ArrayList<Integer>();
		// Walk forward through the string, recording each match
		int index = string.indexOf(str);
		while (index != -1) {
			list.add(index);
			index = string.indexOf(str, index + str.length());
		}
		return list;
	}

	/**
	 * Derive a file name from the url (strip https:// and the dots)
	 * 
	 * @param url
	 * @return
	 */
	public static String getFileName(String url) {
		// Strip the protocol prefix if present, then drop the dots
		String urlpath = url.replaceFirst("^https?://", "");
		StringBuilder sb = new StringBuilder();
		for (String str : urlpath.split("\\.")) {
			sb.append(str);
		}
		return sb.toString();
	}

	/**
	 * Remove all special characters from a string
	 * 
	 * @param str
	 * @param list
	 * @return
	 */
	public static String splitstr(String str, List<String> list) {
		// Each entry in list is a regex; strip every match from the string
		for (String regex : list) {
			str = str.replaceAll(regex, "");
		}
		return str;
	}

	/**
	 * All special characters to strip (as regexes)
	 * 
	 * @return
	 */
	public static List<String> getSplitList() {
		List<String> list = new ArrayList<String>();
		list.add("\\.");
		list.add("\\/");
		list.add("\\?");
		list.add("\\,");
		return list;
	}

	/**
	 * Program entry point
	 * 
	 * @param args
	 */
	public static void main(String[] args) {
		String url = "https://www.baidu.com";
		testHttp(url);
	}

}
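
    A note on the link extraction above: the hand-rolled indexOf scanning only handles unquoted href values and skips quoted ones (<a href="...">), which is what most real pages actually use. A sketch of the same extraction done with java.util.regex instead, assuming double-quoted href values (the class and pattern here are my own, not part of the original code):

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class LinkExtractor {

	// Match <a href="..."> and capture the quoted URL
	private static final Pattern HREF = Pattern.compile("<a\\s+href=\"([^\"]+)\"");

	public static List<String> extractLinks(String html) {
		List<String> links = new ArrayList<String>();
		Matcher m = HREF.matcher(html);
		while (m.find()) {
			String url = m.group(1);
			// Normalize protocol-relative links (//host/path) the same way the crawler does
			if (url.startsWith("//")) {
				url = "https:" + url;
			}
			// Only keep absolute http(s) links; relative paths and anchors are skipped
			if (url.startsWith("http://") || url.startsWith("https://")) {
				links.add(url);
			}
		}
		return links;
	}
}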

    The initial url in the code is the Baidu homepage; the end result is a set of .html files saved under D:\html\, one per crawled page.
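
    One caveat before running it: testHttp recurses into every link it finds but never records which URLs it has already visited, so two pages that link to each other will be fetched over and over. A minimal guard, as a sketch (the class name and the call site are my own additions):

import java.util.HashSet;
import java.util.Set;

public class VisitedGuard {

	private static final Set<String> visited = new HashSet<String>();

	// Returns true the first time a URL is seen, false on every revisit
	public static synchronized boolean markVisited(String url) {
		return visited.add(url);
	}
}

    With that in place, calling VisitedGuard.markVisited(suburl) right before the recursive call, and skipping any URL it has already seen, stops the crawler from looping.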


    I am recording this here in the hope that it is of some help to someone. Where it falls short, please do point it out. Thanks!
