使用 HttpClient 模拟登录并爬取网页

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/pengjunlee/article/details/85257369

在使用 Java 编写网页爬虫时,经常需要携带登录后的 Cookie 信息。然而 Cookie 是有时效性的,所以经常会碰到 Cookie 失效的情况。如何在 Cookie 失效后自动重新获取,成了爬虫亟需解决的难题。

本文将演示如何使用 HttpClient 模拟登录某知名猫平台,并获取登录后的 Cookie 信息。

pom.xml 文件中引入 HttpClient 依赖包:

		<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
		<dependency>
			<groupId>org.apache.httpcomponents</groupId>
			<artifactId>httpclient</artifactId>
			<version>4.5.6</version>
		</dependency>

		<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpcore -->
		<dependency>
			<groupId>org.apache.httpcomponents</groupId>
			<artifactId>httpcore</artifactId>
			<version>4.4.10</version>
		</dependency>

 获取 Cookie 的完整代码如下:


import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.NameValuePair;
import org.apache.http.client.CookieStore;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.cookie.Cookie;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;

import com.wpp.dc.task.common.config.Constant;

/**
 * Helper for obtaining login cookies by submitting a form with Apache HttpClient.
 *
 * <p>{@link #getCookieByDoPost} posts a URL-encoded form to a login URL and returns every
 * cookie the server set, formatted as a single {@code name=value;} string.
 */
public class CookieUtils {

	/**
	 * Value of the {@code _tb_token_} cookie captured by the most recent call to
	 * {@link #getCookieByDoPost} that received one; {@code null} until then.
	 */
	private static String tokenStr;

	/**
	 * Demo entry point: posts placeholder credentials to the login URL and prints the
	 * resulting cookie string to standard output.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		Map<String, String> headParamsMap = new HashMap<String, String>();
		headParamsMap.put("Host", "login.taobao.com");
		headParamsMap.put("Referer",
				"https://sycm.taobao.com/custom/login.htm?_target=http://sycm.taobao.com/portal/home.htm");
		Map<String, String> formMap = new HashMap<String, String>();
		formMap.put("TPL_username", "登录账号");
		formMap.put("TPL_password_2", "账号密码");
		formMap.put("TPL_redirect_url", "http://sycm.taobao.com/portal/home.htm");
		String cookieStr = getCookieByDoPost("https://login.taobao.com/member/login.jhtml", headParamsMap, formMap,
				"utf-8");
		System.out.println(cookieStr);
	}

	/**
	 * Returns the {@code _tb_token_} cookie value captured by the last call to
	 * {@link #getCookieByDoPost} that received one.
	 *
	 * @return the captured token, or {@code null} if none has been captured yet
	 */
	public static String getTokenStr() {
		return tokenStr;
	}

	/**
	 * Sends a form POST and collects the cookies stored by the client during the exchange.
	 *
	 * <p>Cookies are only collected when the response status is 200. Any exception raised
	 * while building or executing the request is printed to standard error and results in
	 * an empty return value.
	 *
	 * @param url           target URL of the POST request
	 * @param headParamsMap extra request headers added after the built-in defaults;
	 *                      {@code null} is treated as empty
	 * @param formMap       form fields sent as the URL-encoded request body;
	 *                      {@code null} is treated as empty
	 * @param charset       charset name used to encode the form body
	 * @return the cookies as {@code name=value;} pairs concatenated in store order, or an
	 *         empty string when the status is not 200 or the request failed
	 */
	public static String getCookieByDoPost(String url, Map<String, String> headParamsMap, Map<String, String> formMap,
			String charset) {
		CloseableHttpClient httpClient = null;
		HttpPost httpPost = null;
		StringBuilder cookie = new StringBuilder();

		try {
			CookieStore cookieStore = new BasicCookieStore();
			httpClient = HttpClients.custom().setDefaultCookieStore(cookieStore).build();
			httpPost = new HttpPost(url);

			// Build the request body from the form fields.
			List<NameValuePair> list = new ArrayList<NameValuePair>();
			if (formMap != null) {
				for (Entry<String, String> elem : formMap.entrySet()) {
					list.add(new BasicNameValuePair(elem.getKey(), elem.getValue()));
				}
			}
			if (!list.isEmpty()) {
				httpPost.setEntity(new UrlEncodedFormEntity(list, charset));
			}

			// Default request headers.
			httpPost.addHeader("Accept", "*/*");
			httpPost.addHeader("Accept-Language", "zh-CN,zh;q=0.8");
			httpPost.addHeader("Connection", "keep-alive");
			httpPost.addHeader("User-Agent",
					"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36");

			// Caller-supplied headers.
			if (headParamsMap != null) {
				for (Entry<String, String> entry : headParamsMap.entrySet()) {
					httpPost.addHeader(entry.getKey(), entry.getValue());
				}
			}

			HttpResponse response = httpClient.execute(httpPost);

			if (response != null && response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
				// Collect every cookie the client stored while handling the request.
				for (Cookie c : cookieStore.getCookies()) {
					cookie.append(c.getName()).append("=").append(c.getValue()).append(";");
					if ("_tb_token_".equals(c.getName())) {
						tokenStr = c.getValue();
					}
				}
			}
		} catch (Exception ex) {
			ex.printStackTrace();
		} finally {
			// Either object may still be null if construction failed above.
			if (httpPost != null) {
				httpPost.abort();
			}
			if (httpClient != null) {
				try {
					httpClient.close();
				} catch (java.io.IOException closeEx) {
					// Best-effort cleanup: the cookie string is already complete at this point.
					closeEx.printStackTrace();
				}
			}
		}
		return cookie.toString();
	}
}

猜你喜欢

转载自blog.csdn.net/pengjunlee/article/details/85257369