使用HttpClient和Jsoup定向抓取数据

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/LoveJavaYDJ/article/details/72904123

1.业务需求:

从指定外网抓点货,冷启动

2.站点分析:

.限制IP…
.需要登录……
.对登录账号有抓取频率限制……….
.抓取频率过高,直接跳验证码页面…………..
.验证码长度、模样(纯数字&字母数字混合)TM不固定………………..

“我们能不能不抓了?”
“不行!必须得抓…”
“……”

这么说,此前写的爬虫,多线程、生产者—>消费者 并发抓取压根行不通。多线程毫无意义。

3.使用技术:

1.HttpClient:读取指定URL网页内容
2.Jsoup:解析所要的页面数据——省得写恶心的正则表达式
3.Swing:绘制用户操作界面
4.Tess4J:自动识别验证码(http://tess4j.sourceforge.net/)
5.Exe4J:生成可独立运行的exe程序——给每人机器安装一个,大家一起监控抓~

4.实现要点:

1.代理IP
从一些网站上抓取代理IP,并检测是否可以使用,如下:

package com.ydj.zhuaqu.proxy;

import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

import net.sf.json.JSONArray;
import net.sf.json.JSONObject;

import org.apache.commons.collections.map.LRUMap;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;

import com.ydj.common.kit.MyLog;

/**
 * Static pool of HTTP proxy addresses.
 *
 * <p>A background task started by {@link #startCrawl()} periodically downloads
 * candidate proxies, probes each one, and stores the working ones in
 * {@link #canUseIpList}. Consumers borrow an entry with {@link #useOneProxyIp()}
 * and hand it back with {@link #returnProxyIp(ProxyIp)}.</p>
 *
 * <p>Thread-safety: the pool list is a synchronized list and compound operations
 * on it lock the list itself; every access to the (non-thread-safe) LRU map of
 * rejected addresses locks the map.</p>
 *
 * @author : Ares.yi
 * @createTime : 2014-11-10 11:13:42
 * @version : 1.0
 */
public class ProxyIpPool {

    /** Pool size at which a crawl round is skipped. */
    private static final int MAX_IP = 100;

    /** Intended lower bound of the pool (ideally equal to the number of consumer threads); currently unused. */
    @SuppressWarnings("unused")
    private static final int MIN_IP = 10;

    /** Proxies that passed all checks and are currently not borrowed. */
    public static List<ProxyIp> canUseIpList = Collections.synchronizedList(new ArrayList<ProxyIp>(MAX_IP));

    /** Recently rejected proxies (ip -> port), bounded to 2000 entries. Guard every access with {@code synchronized (notCanUseIPsTemp)}. */
    private static final LRUMap notCanUseIPsTemp = new LRUMap(2000);

    /** Number of proxies requested per crawl round. */
    private static final int NUM = 20;
    // NOTE(review): the order id is a credential embedded in source; consider moving it to configuration.
    private static final String ORDER_ID = "904557733280949";
    private static final String KDL_URL = "http://dev.kuaidaili.com/api/getproxy?orderid="+ORDER_ID+"&num="+NUM+"&quality=1&an_ha=1&dedup=1&format=json";


    private ProxyIpPool(){
    }

    /**
     * Starts the background crawl task: first run after 1 minute, then every 3 minutes.
     *
     * <p>Each call creates a new single-thread scheduler.</p>
     *
     * @author : Ares.yi
     * @createTime : 2015-10-29 17:58:54
     */
    public static void startCrawl(){
        final int period = 3;

        ScheduledExecutorService scheduledExecutorService = Executors.newScheduledThreadPool(1);
        scheduledExecutorService.scheduleAtFixedRate(new Runnable() {
            int i = 0 ;
            @Override
            public void run() {
                // An exception escaping run() would make the scheduler suppress every
                // later execution, so one bad round must not be allowed to propagate.
                try {
                    produceIP(i);
                } catch (RuntimeException e) {
                    MyLog.logError(i+":crawl proxy ip failed:"+e.getMessage());
                }
                i++;
            }
        }, 1, period,TimeUnit.MINUTES);
    }

    /**
     * Runs one crawl round unless the pool already holds {@link #MAX_IP} entries.
     *
     * @param i round counter, used only as a log prefix
     */
    private static void produceIP(int i){
        int currentSize = canUseIpList.size();

        if( currentSize >= MAX_IP){
            MyLog.logInfo(i+":current proxyPool size is:"+currentSize+",no need crawl new ip.NotCanUseIPsTemp size is:"+rejectedSize());
            return ;
        }

        JSONArray ips = getIPFromKuaiDaiLi();
        produceIP(ips);

        MyLog.logInfo(i+":current proxyPool size is:"+canUseIpList.size()+",notCanUseIPsTemp size is:"+rejectedSize());
    }

    /**
     * Probes every {@code "ip:port"} entry and files it as usable or rejected.
     * Entries that cannot be parsed are logged and skipped.
     *
     * @param ips candidate entries; {@code null} or empty is a no-op
     */
    private static void produceIP(JSONArray ips){
        if(ips == null || ips.isEmpty()){
            return ;
        }
        for(int i = 0 ;i < ips.size() ;i++ ){
            String entry = String.valueOf(ips.get(i));
            String[] parts = entry.split(":");
            if(parts.length < 2){
                MyLog.logInfo("skip malformed proxy entry:"+entry);
                continue;
            }

            String ip = parts[0];
            int port;
            try {
                port = Integer.parseInt(parts[1].trim());
            } catch (NumberFormatException e) {
                MyLog.logInfo("skip malformed proxy entry:"+entry);
                continue;
            }

            ProxyIp proxyIp = new ProxyIp(ip, port);

            if(isCanUse(ip, port)){
                addIP(proxyIp);
            }else{
                removeIP(proxyIp);
            }
        }
    }

    /**
     * Borrows the proxy that sorts first in the pool (order defined by {@code ProxyIp}'s
     * natural ordering), blocking in 2-minute steps while the pool is empty.
     *
     * @return the borrowed proxy, already marked via {@code useThis()}
     * @throws IllegalStateException if the calling thread is interrupted while waiting
     */
    public static ProxyIp useOneProxyIp(){
        while(true){
            // Check, sort and remove must be one atomic step; otherwise another
            // consumer can empty the list between isEmpty() and remove(0).
            synchronized (canUseIpList) {
                if(!canUseIpList.isEmpty()){
                    Collections.sort(canUseIpList);

                    ProxyIp proxyIp = canUseIpList.remove(0);
                    proxyIp.useThis();

                    return proxyIp;
                }
            }

            MyLog.logInfo(Thread.currentThread().getName()+" useOneProxyIp,but proxyPool is empty,need to wait 2 min crawl IP.");
            try {
                Thread.sleep(2 * 60 * 1000);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw new IllegalStateException("Interrupted while waiting for a proxy IP", e);
            }
        }
    }

    /**
     * Returns a borrowed proxy to the pool and clears its in-use flag.
     *
     * @param proxyIp the proxy previously obtained from {@link #useOneProxyIp()}
     */
    public static void returnProxyIp(ProxyIp proxyIp){
        proxyIp.setUseing(false);
        canUseIpList.add(proxyIp);
    }

    /**
     * Downloads a batch of candidate proxies from the kuaidaili API ({@link #KDL_URL}).
     *
     * @return the {@code data.proxy_list} array, or an empty array on any failure
     *
     * @author : Ares.yi
     * @createTime : 2015-10-29 14:36:05
     */
    private static JSONArray getIPFromKuaiDaiLi(){
        return fetchProxyList(KDL_URL);
    }

    /**
     * Placeholder for additional proxy sources; the URL is a stub.
     *
     * @return the {@code data.proxy_list} array, or an empty array on any failure
     *
     * @author : Ares.yi
     * @createTime : 2015-10-29 14:46:40
     */
    @SuppressWarnings("unused")
    private static JSONArray getIPFromXXX(){
        return fetchProxyList("XXX");
    }

    /**
     * GETs {@code url}, parses the body as JSON and extracts {@code data.proxy_list}.
     *
     * @param url API endpoint returning {@code {"data":{"proxy_list":[...]}}}
     * @return the proxy list, or an empty array if the request or parsing fails
     */
    private static JSONArray fetchProxyList(String url){
        JSONArray ips = new JSONArray();
        HttpClient client = new HttpClient();

        GetMethod method = new GetMethod(url);

        HttpMethodParams param = method.getParams();
        param.setContentCharset("UTF-8");

        try {
            client.executeMethod(method);
            String res = method.getResponseBodyAsString();
            JSONObject json = JSONObject.fromObject(res);
            if(json != null && json.containsKey("data")){
                ips = json.getJSONObject("data").getJSONArray("proxy_list");
                MyLog.logInfo(ips);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Always hand the connection back, even when the request failed.
            method.releaseConnection();
        }
        return ips;
    }

    /**
     * Decides whether a proxy is usable: valid port, not recently rejected,
     * TCP-reachable, and able to load the target site.
     *
     * @param ip   proxy host
     * @param port proxy port
     * @return {@code true} only if every check passes
     *
     * @author : Ares.yi
     * @createTime : 2015-10-29 14:37:22
     */
    private static boolean isCanUse(String ip,int port){
        // Ports outside 1..65535 would make InetSocketAddress throw IllegalArgumentException.
        if(port <= 0 || port > 65535){
            return false;
        }

        if(isRejected(ip)){
            MyLog.logInfo(ip+":"+port+" can't use again.");
            return false;
        }

        if(!checkIp(ip, port)){
            return false;
        }

        return checkIpUseTargetSite(ip, port);
    }

    /**
     * Cheap reachability probe: opens a TCP connection with a 3-second timeout.
     *
     * @param ip   proxy host
     * @param port proxy port
     * @return {@code true} if the connection was established
     *
     * @author : Ares.yi
     * @createTime : 2015-10-29 12:35:28
     */
    private static boolean checkIp(String ip,int port){
        Socket server = new Socket();
        try {
            InetSocketAddress address = new InetSocketAddress(ip,port);
            server.connect(address, 3000);
            MyLog.logInfo(ip+":"+port+" is ok!");
            return true;
        }catch (UnknownHostException e) {
            MyLog.logInfo(ip+":"+port+" is wrong!");
        } catch (IOException e) {
            MyLog.logInfo(ip+":"+port+" is wrong!!");
        } finally {
            // The probe socket is never reused; close it on every path.
            closeQuietly(server);
        }
        return false;
    }

    /**
     * Accurate probe: fetches a page of the target site through the proxy and
     * checks that the expected header element is present in the HTML.
     *
     * @param ip   proxy host
     * @param port proxy port
     * @return {@code true} if the page loaded and contains {@code div[class=header fix]}
     *
     * @author : Ares.yi
     * @createTime : 2015-10-29 12:06:03
     */
    private static boolean checkIpUseTargetSite(String ip,int port){
        CloseableHttpClient closeableHttpClient = HttpClientBuilder.create().build();
        CloseableHttpResponse response = null;

        try {
            HttpHost proxy = new HttpHost(ip,port, "http");
            RequestConfig config = RequestConfig.custom().setConnectTimeout(3000).setSocketTimeout(3000).setProxy(proxy).build();
            HttpGet httpGet = new HttpGet("http://www.autozi.com/partCategory.html/");
            httpGet.setConfig(config);

            response = closeableHttpClient.execute(httpGet);
            HttpEntity httpentity = response.getEntity();
            String html =  EntityUtils.toString(httpentity, "UTF-8");
            return Jsoup.parse(html).select("div[class=header fix]").first() != null;
        } catch (Exception exc){
            MyLog.logError(exc.getMessage());
            return false;
        } finally {
            closeQuietly(response);
            closeQuietly(closeableHttpClient);
        }
    }

    /**
     * Removes a proxy from the pool and remembers it as rejected.
     *
     * @param proxyIp the proxy to discard
     */
    public static void removeIP(ProxyIp proxyIp){
        canUseIpList.remove(proxyIp);
        synchronized (notCanUseIPsTemp) {
            notCanUseIPsTemp.put(proxyIp.getIp(),proxyIp.getPort());
        }
    }

    /**
     * Adds a proxy to the pool and clears any rejected mark for its IP.
     *
     * @param proxyIp the proxy to make available
     */
    public static void addIP(ProxyIp proxyIp){
        canUseIpList.add(proxyIp);
        synchronized (notCanUseIPsTemp) {
            notCanUseIPsTemp.remove(proxyIp.getIp());
        }
    }

    /** Tells whether {@code ip} is in the rejected map. */
    private static boolean isRejected(String ip){
        synchronized (notCanUseIPsTemp) {
            return notCanUseIPsTemp.containsKey(ip);
        }
    }

    /** Current number of entries in the rejected map. */
    private static int rejectedSize(){
        synchronized (notCanUseIPsTemp) {
            return notCanUseIPsTemp.size();
        }
    }

    /** Closes {@code resource} if non-null; a close failure is ignored because nothing more can be done with it. */
    private static void closeQuietly(java.io.Closeable resource){
        if(resource == null){
            return ;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // best-effort cleanup
        }
    }

    /**
     * Manual smoke test: 20 jobs on 10 threads each borrow a proxy, sleep a random
     * 0-9 seconds to simulate work, and return it.
     *
     * @author : Ares.yi
     * @createTime : 2015-10-29 18:00:16
     */
    private static void testUseProxyIp(){
        ExecutorService threadPool = Executors.newFixedThreadPool(10);

        for(int i=0 ;i <20 ;i++){
            final int flag = i;
            threadPool.execute(new Runnable() {

                @Override
                public void run() {
                    ProxyIp proxyIp = useOneProxyIp();
                    MyLog.logInfo(flag+" job "+Thread.currentThread().getName()+" get proxyIp is : "+proxyIp.toString());

                    long millis = new Random().nextInt(10) * 1000;
                    try {
                        Thread.sleep(millis);// each thread sleeps a random N seconds to simulate work
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                    }

                    returnProxyIp(proxyIp);

                    MyLog.logInfo(flag+" job "+Thread.currentThread().getName()+" use proxyIp is : "+proxyIp.toString()+",work use time "+millis+" end and return to pool.");
                }
            });
        }

        // Already-submitted jobs still run; without this the worker threads keep the JVM alive.
        threadPool.shutdown();
    }

}

使用代理IP:

    /**
     * Requests {@code url} through the given HTTP proxy and returns the response body.
     *
     * <p>Connect and socket timeouts are 3 seconds each. The HTTP client and the
     * response are released on every exit path.</p>
     *
     * @param url       address to request
     * @param proxyIp   proxy host
     * @param proxyPort proxy port
     * @return the body decoded as UTF-8; {@code "cannot connect"} if the request fails or
     *         the response has no entity; {@code ""} if the body cannot be read
     * @throws ParseException declared for backward compatibility
     * @throws IOException    declared for backward compatibility
     *
     * @author : Ares.yi
     * @createTime : 2015-10-30 09:55:21
     */
    public static String getHtml(String url,String proxyIp,int proxyPort) throws ParseException, IOException {

        CloseableHttpClient closeableHttpClient = HttpClientBuilder.create().build();
        CloseableHttpResponse response = null;

        try {
            HttpHost proxy = new HttpHost(proxyIp,proxyPort, "http");
            RequestConfig config = RequestConfig.custom().setConnectTimeout(3000).setSocketTimeout(3000).setProxy(proxy).build();

            // NOTE(review): this sends a POST although the method name and the failure
            // message talk about GET — confirm which verb the target site expects.
            HttpPost request = new HttpPost(url);
            request.setConfig(config);

            try {
                response = closeableHttpClient.execute(request);
            }catch(Exception exc){
                exc.printStackTrace();
                System.out.println("get请求失败!");
                return "cannot connect";
            }

            HttpEntity httpEntity = response.getEntity();
            if (httpEntity == null) {
                return "cannot connect";
            }

            try{
                return EntityUtils.toString(httpEntity, "UTF-8");
            }catch(Exception excep){
                // Reading the body failed; report which URL and fall back to an empty page.
                System.out.println(url);
                return "";
            }
        } finally {
            // Release the connection on every path; a close failure must not
            // replace the result that has already been determined.
            if (response != null) {
                try {
                    response.close();
                } catch (IOException ignored) {
                    // best-effort cleanup
                }
            }
            try {
                closeableHttpClient.close();
            } catch (IOException ignored) {
                // best-effort cleanup
            }
        }
    }

2.模拟登录
提取登录Cookie和User-Agent:
这里写图片描述

代码片段,如下:

/**
 * Sends a form POST carrying the stored login cookie and user agent.
 *
 * <p>The {@code HOST} header is fixed to {@code sec.1688.com}.</p>
 *
 * @param url          address to post to
 * @param parameterMap form fields, converted by {@code getParam}
 * @param charSet      charset used to encode the form body
 * @return {@code "SUCCESS"} if the response carries a non-empty {@code Location} header;
 *         otherwise whatever {@code printResponse} returns for the response;
 *         {@code ""} if the request fails with an {@link IOException}
 * @throws UnsupportedEncodingException if {@code charSet} is not supported
 */
public static String postRequest(String url,
            Map<String, String> parameterMap, String charSet)
            throws UnsupportedEncodingException {
        // Build the complete request before opening the client: if the charset is
        // rejected here, no client has been created yet, so nothing can leak.
        HttpPost httpPost = new HttpPost(url);

        UrlEncodedFormEntity postEntity = new UrlEncodedFormEntity(getParam(parameterMap), charSet);
        httpPost.setEntity(postEntity);

        httpPost.addHeader("HOST", "sec.1688.com");
        httpPost.addHeader("User-Agent", Constant.userAgent);
        httpPost.addHeader("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        httpPost.addHeader("Cookie", Constant.cookie);

        MyLog.logInfo("request line:" + httpPost.getRequestLine());

        CloseableHttpClient client = HttpClients.createDefault();
        try {
            // Execute the POST request.
            HttpResponse httpResponse = client.execute(httpPost);

            // A Location header is treated as the success signal.
            Header header = httpResponse.getFirstHeader("Location");

            if (header != null && Toolbox.isNotEmpty(header.getValue())) {
                MyLog.logInfo("location:" + header.getValue());
                return "SUCCESS";
            }

            return printResponse(httpResponse);

        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                client.close();
            } catch (IOException ignored) {
                // best-effort cleanup; the result has already been determined
            }
        }

        return "";
    }

3.验证码
获取输入验证码页面信息:

/**
 * Extracts the hidden form fields and the captcha session id from a
 * verification-code page.
 *
 * <p>Fields missing from the page come back as empty strings; the same now
 * holds for the session id instead of failing with an index error.</p>
 *
 * @param url               address stored in the result alongside the form fields
 * @param checkCodePageHtml HTML of the verification-code page
 * @return the collected form data, or {@code null} if {@code checkCodePageHtml} is empty
 */
public static Ali1688CheckCodeFormData getCheckCodeFormData(String url,String checkCodePageHtml){
        if(Toolbox.isEmptyString(checkCodePageHtml)){
            return null;
        }

        Document doc = Jsoup.parse(checkCodePageHtml);

        String action = doc.select("input[name=action]").attr("value");
        String event_submit_do_query = doc.select("input[name=event_submit_do_query]").attr("value");
        String smPolicy = doc.select("input[name=smPolicy]").attr("value");
        String smReturn = doc.select("input[name=smReturn]").attr("value");
        String smApp = doc.select("input[name=smApp]").attr("value");
        String smCharset = doc.select("input[name=smCharset]").attr("value");
        String smTag = doc.select("input[name=smTag]").attr("value");
        String smSign = doc.select("input[name=smSign]").attr("value");
        String identity = doc.select("input[name=identity]").attr("value");
        String captcha = doc.select("input[name=captcha]").attr("value");

        // The session id travels as a query parameter of the captcha image URL.
        String sessionid = extractSessionId(doc.select("img[id=checkcodeImg]").attr("src"));

        return new Ali1688CheckCodeFormData(action, event_submit_do_query, smPolicy, smReturn, smApp, smCharset, smTag, smSign, identity, captcha, sessionid,url);
    }

    /**
     * Returns the value of the {@code sessionid} query parameter in {@code imgSrc}.
     *
     * @param imgSrc the captcha image URL, possibly empty
     * @return the text between {@code "sessionid="} and the next {@code '&'} (or the end
     *         of the string), or {@code ""} if the parameter is absent
     */
    private static String extractSessionId(String imgSrc){
        final String marker = "sessionid=";

        int start = imgSrc.indexOf(marker);
        if(start < 0){
            return "";
        }
        start += marker.length();

        // Search for the delimiter only after the marker, so an earlier '&' is ignored.
        int end = imgSrc.indexOf('&', start);
        return end < 0 ? imgSrc.substring(start) : imgSrc.substring(start, end);
    }

提交验证码:

/**
 * Submits a captcha answer using the form data stored in
 * {@code Constant.ali1688CheckCodeFormData}.
 *
 * <p>On failure the stored form data is replaced by what is parsed from the
 * response, so the next attempt uses the new captcha page.</p>
 *
 * @param checkcode the captcha text to submit
 * @return {@code "SUCCESS"} if the post reports success, otherwise {@code ""}
 * @throws UnsupportedEncodingException if UTF-8 encoding is unavailable
 * @throws IOException                  declared for backward compatibility
 */
public static String submitCheckCode(String checkcode) throws UnsupportedEncodingException, IOException{

        String smApp = Constant.ali1688CheckCodeFormData.getSmApp();
        String smPolicy = Constant.ali1688CheckCodeFormData.getSmPolicy();
        String smCharset = Constant.ali1688CheckCodeFormData.getSmCharset();
        String smTag = Constant.ali1688CheckCodeFormData.getSmTag();
        String smReturn = Constant.ali1688CheckCodeFormData.getSmReturn();
        String smSign = Constant.ali1688CheckCodeFormData.getSmSign();

        // Encode each value on its own. Encoding the assembled string would also
        // escape the '=' and '&' separators and collapse the query into one token.
        StringBuilder query = new StringBuilder();
        appendQueryParam(query, "smApp", smApp);
        appendQueryParam(query, "smPolicy", smPolicy);
        appendQueryParam(query, "smCharset", smCharset);
        appendQueryParam(query, "smTag", smTag);
        appendQueryParam(query, "smReturn", smReturn);
        appendQueryParam(query, "smSign", smSign);

        String formAction = "https://sec.1688.com/query.htm?"+query;

        Map<String,String> parameterMap = new HashMap<String,String>();
        parameterMap.put("action", Constant.ali1688CheckCodeFormData.getAction());
        parameterMap.put("event_submit_do_query", Constant.ali1688CheckCodeFormData.getEvent_submit_do_query());
        parameterMap.put("smPolicy", smPolicy);
        parameterMap.put("smReturn", smReturn);
        parameterMap.put("smApp", smApp);
        parameterMap.put("smCharset", smCharset);
        parameterMap.put("smTag", smTag);
        parameterMap.put("smSign", smSign);
        parameterMap.put("identity", Constant.ali1688CheckCodeFormData.getIdentity());
        parameterMap.put("captcha", Constant.ali1688CheckCodeFormData.getCaptcha());
        parameterMap.put("checkcode", checkcode);

        String res = HttpKit.postRequest(formAction, parameterMap,  "UTF-8");

        if ("SUCCESS".equals(res)) {
            return "SUCCESS";
        }

        // Not accepted: the response is taken as a new captcha page to parse.
        Constant.ali1688CheckCodeFormData = getCheckCodeFormData(smReturn,res);

        return "";
    }

    /**
     * Appends {@code name=value} to {@code query}, URL-encoding the value as UTF-8
     * and inserting {@code '&'} before every parameter but the first.
     *
     * @param query accumulator for the query string
     * @param name  parameter name, appended verbatim
     * @param value parameter value; {@code null} is written as an empty value
     * @throws UnsupportedEncodingException if UTF-8 encoding is unavailable
     */
    private static void appendQueryParam(StringBuilder query, String name, String value) throws UnsupportedEncodingException {
        if(query.length() > 0){
            query.append('&');
        }
        query.append(name).append('=').append(java.net.URLEncoder.encode(value == null ? "" : value, "utf-8"));
    }

4.exe4j操作:
这里写图片描述

5.部分界面:

这里写图片描述
这里写图片描述
这里写图片描述

6.源码:

https://github.com/Aresyi/simpleSpider

猜你喜欢

转载自blog.csdn.net/LoveJavaYDJ/article/details/72904123