Java 自写爬虫中常用方法封装

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/ygd1994/article/details/53612385

在还没有感受到"人生苦短"的日子里,我一直在用 Java 写爬虫,大大小小的爬虫也算写了十几个。这里整理一下 Java 爬虫中经常用到的方法。


  • 核心方法

    使用HttpURLConnection请求网络

/**
 * Performs an HTTP GET on {@code path} and returns the response body decoded as UTF-8.
 *
 * @param path   absolute URL to fetch
 * @param cookie value for the {@code Cookie} request header, or {@code null} to send none
 * @return the response body, or {@code null} if the URL is invalid or the request fails
 */
public static String request(String path, String cookie) {
    try {
        HttpURLConnection connection = (HttpURLConnection) new URL(path).openConnection();
        connection.setRequestMethod("GET");
        // Only send the header when there is a value; adding a null value would
        // put a valueless "Cookie" header on the wire.
        if (cookie != null) {
            connection.addRequestProperty("Cookie", cookie);
        }
        connection.addRequestProperty("User-Agent",
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36");
        connection.addRequestProperty("Connection", "Keep-Alive");
        connection.setConnectTimeout(5000); // connect timeout, milliseconds
        connection.setReadTimeout(5000);    // read timeout, milliseconds

        // try-with-resources closes the response stream on success and on failure.
        try (InputStream in = new BufferedInputStream(connection.getInputStream())) {
            ByteArrayOutputStream body = new ByteArrayOutputStream();
            byte[] buffer = new byte[1024];
            int len;
            while ((len = in.read(buffer)) != -1) {
                body.write(buffer, 0, len);
            }
            return body.toString("utf-8");
        }
    } catch (Exception e) {
        // Best-effort API: report the failure (including its cause) and signal it with null.
        System.out.println(path + "连接失败!" + " (" + e + ")");
        return null;
    }
}
  • 请求方法封装

    Target.java


package bean;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Random;

/**
 * Describes one HTTP request (URL, optional payload, method, cookie, User-Agent and content
 * type) and executes it with {@link HttpURLConnection}.
 *
 * <p>I/O failures are reported on standard output and signalled through the return value
 * ({@code null}, {@code false}, or the original URL for {@link #locate()}) instead of being
 * thrown.
 *
 * <p>Created by ygd on 2017/5/31.
 */
public class Target {
    /** Request-method code for HTTP GET. */
    public final static int GET = 0;
    /** Request-method code for HTTP POST. */
    public final static int POST = 1;
    /** Device model names; one is picked at random as the default User-Agent value. */
    public static String[] modelList = new String[]{"oppo a33", "xiaomi hm note", "oppo r9s", "oppo a37m", "oppo a33m", "oppo r9m", "oppo a59s", "oppo a57", "oppo a59m", "oppo browse", "oppo r9tm", "xiaomi mi 4lte", "vivo x7", "oppo r9s pl", "xiaomi hm 2a", "huawei tag-al00", "oppo r7s", "oppo a53", "oppo r9sk", "huawei mediapad", "vivo x9", "xiaomi mi note", "huawei nxt-al10", "vivo y51a", "vivo y51", "xiaomi mi 5", "oppo a53m", "oppo r7sm", "vivo x7plus", "huawei mla-al10", "vivo x6d", "huawei mha-al00", "xiaomi mi 3", "huawei tit-al00", "xiaomi mi max", "huawei rio-al00", "huawei eva-al10", "vivo y67", "xiaomi hm 1s", "huawei tag-tl00", "vivo y55a", "vivo y31a", "huawei frd-al10", "samsung sm-g9350", "vivo v3max a", "samsung sm-a5000", "huawei caz-al10", "huawei cun-al00", "huawei h60-l01", "vivo y23l", "samsung sm-a7000", "xiaomi mi 5s", "samsung sm-g9250", "huawei tit-tl00", "oppo r9km", "vivo x6s a", "vivo y66", "huawei mt7-cl00", "huawei vns-al00", "huawei eva-al00", "xiaomi mi 2", "huawei mt7-tl10", "apple iphone", "huawei frd-al00", "samsung sm-g9280", "huawei g750-t01", "oppo r7splu", "vivo xplay5a", "samsung sm-g5308w", "vivo x6a", "samsung sm-c7000", "vivo y27", "vivo x5pro d", "vivo v3m a", "huawei vie-al10", "vivo x5m", "vivo x6plus d", "huawei c8817d", "huawei cun-tl00", "vivo x9plus", "vivo y31", "huawei mt7-tl00", "samsung sm-c5000", "samsung sm-g9200", "vivo x5s l", "huawei p7-l07", "vivo x9i", "samsung sm-n9008v", "vivo y35", "huawei eva-tl00"};

    private static final String CHARSET = "UTF-8";
    private static final int REQUEST_TIMEOUT_MS = 5000;
    private static final int DOWNLOAD_TIMEOUT_MS = 20000;
    private static final Random RANDOM = new Random();

    private String url;
    private String data;
    // The misspelling ("Moethod") is kept because the public accessors carry the same name.
    private int requestMoethod = GET;
    private String cookie;
    private String userAgent = getRandomModel();
    private String contentType = "application/x-www-form-urlencoded";

    /**
     * Creates a fully specified request.
     *
     * @param url            absolute URL to request
     * @param data           for GET, a suffix appended to the URL; for POST, the request body;
     *                       may be {@code null}
     * @param requestMoethod {@link #GET} or {@link #POST}
     * @param cookie         {@code Cookie} header value; {@code null} sends no cookie
     * @param userAgent      {@code User-Agent} header value; {@code null} keeps the random default
     * @param contentType    {@code Content-Type} for POST; {@code null} keeps the form-urlencoded default
     */
    public Target(String url, String data, int requestMoethod, String cookie, String userAgent, String contentType) {
        this.url = url;
        if (data != null) {
            this.data = data;
        }
        this.requestMoethod = requestMoethod;
        if (cookie != null) {
            this.cookie = cookie;
        }
        if (userAgent != null) {
            this.userAgent = userAgent;
        }
        if (contentType != null) {
            this.contentType = contentType;
        }
    }

    /**
     * Creates a plain GET request with default headers.
     *
     * @param url absolute URL to request
     */
    public Target(String url) {
        this.url = url;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getData() {
        return data;
    }

    public void setData(String data) {
        this.data = data;
    }

    public int getRequestMoethod() {
        return requestMoethod;
    }

    public void setRequestMoethod(int requestMoethod) {
        this.requestMoethod = requestMoethod;
    }

    public String getCookie() {
        return cookie;
    }

    public void setCookie(String cookie) {
        this.cookie = cookie;
    }

    public String getUserAgent() {
        return userAgent;
    }

    public void setUserAgent(String userAgent) {
        this.userAgent = userAgent;
    }

    public String getContentType() {
        return contentType;
    }

    public void setContentType(String contentType) {
        this.contentType = contentType;
    }

    /**
     * Executes the request with the configured method.
     *
     * @return the response body, or {@code null} on failure
     */
    public String request() {
        if (requestMoethod == GET) {
            return requestByGet();
        } else {
            return requestByPost();
        }
    }

    /**
     * Sends a GET to {@code url + data} (or just {@code url} when there is no data).
     *
     * @return the response body decoded as UTF-8, or {@code null} on failure
     */
    public String requestByGet() {
        // Build the effective address locally: writing it back to the field would make
        // every repeated call append the data once more.
        String address = (data != null) ? url + data : url;
        try {
            HttpURLConnection connection = open(new URL(address), "GET", true, REQUEST_TIMEOUT_MS);
            return readBody(connection);
        } catch (Exception e) {
            System.out.println("get请求失败! ---> " + address + " (" + e + ")");
            return null;
        }
    }

    /**
     * Sends a POST to {@code url}, writing {@code data} (if any) as the UTF-8 encoded body.
     *
     * @return the response body decoded as UTF-8, or {@code null} on failure
     */
    public String requestByPost() {
        try {
            HttpURLConnection connection = open(new URL(url), "POST", true, REQUEST_TIMEOUT_MS);
            if (contentType != null) {
                connection.setRequestProperty("Content-Type", contentType);
            }
            // Required so that a request body may be written.
            connection.setDoOutput(true);
            if (data != null && data.length() > 0) {
                // Body format is the caller's responsibility, e.g. xx=xx&yy=yy.
                try (Writer body = new OutputStreamWriter(connection.getOutputStream(), CHARSET)) {
                    body.write(data);
                }
            }
            return readBody(connection);
        } catch (Exception e) {
            System.out.println("post请求失败! ---> " + url + " (" + e + ")");
            return null;
        }
    }

    /**
     * Requests {@code url} without following redirects and reports where it points.
     *
     * @return the {@code Location} response header ({@code null} if the response has none),
     *         or the original URL if the request could not be made
     */
    public String locate() {
        HttpURLConnection connection = null;
        try {
            connection = open(new URL(url), "GET", false, REQUEST_TIMEOUT_MS);
            // Must be false, otherwise the connection silently follows the redirect.
            connection.setInstanceFollowRedirects(false);
            connection.connect();
            return connection.getHeaderField("Location");
        } catch (Exception e) {
            System.out.println("获取跳转地址失败! ---> " + url + " (" + e + ")");
            return url;
        } finally {
            // The response body is never read here, so release the connection explicitly.
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Downloads {@code url} to a file.
     *
     * @param filePath destination file; when {@code null} the file is stored in a
     *                 {@code download} directory under {@link #getBasePath()}, named after
     *                 the last segment of the URL path
     * @return {@code true} if the whole body was written, {@code false} on an I/O failure
     */
    public boolean download(String filePath) {
        try {
            URL source = new URL(url);
            File destination = (filePath != null) ? new File(filePath) : defaultDownloadFile(source);
            HttpURLConnection connection = open(source, "GET", false, DOWNLOAD_TIMEOUT_MS);
            // The output file is opened only after the response stream is available, so a
            // failed request does not leave an empty file behind.
            try (InputStream in = connection.getInputStream();
                 OutputStream out = new FileOutputStream(destination)) {
                byte[] buffer = new byte[1024];
                int len;
                while ((len = in.read(buffer)) != -1) {
                    out.write(buffer, 0, len);
                }
            }
            return true;
        } catch (IOException e) {
            System.out.println("下载失败! ---> " + url + " (" + e + ")");
            return false;
        }
    }

    /**
     * Returns the directory that holds this class's code source (the jar's parent directory,
     * or the classes directory itself), always ending with {@code '/'}.
     *
     * @return the base directory path
     */
    public String getBasePath() {
        URL location = Target.class.getProtectionDomain().getCodeSource().getLocation();
        String path;
        try {
            // URI.getPath() decodes percent-escapes such as %20.
            path = location.toURI().getPath();
        } catch (java.net.URISyntaxException e) {
            path = location.getPath();
        }
        // URL paths always use '/', whatever the platform's file separator is.
        return path.substring(0, path.lastIndexOf('/') + 1);
    }

    /**
     * Picks a random entry of {@link #modelList}.
     *
     * @return a device model name
     */
    public String getRandomModel() {
        return modelList[RANDOM.nextInt(modelList.length)];
    }

    /** Opens a connection with the shared headers and timeouts; nothing is sent yet. */
    private HttpURLConnection open(URL address, String method, boolean sendCookie, int timeoutMs) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) address.openConnection();
        connection.setRequestMethod(method);
        // Headers without a value are skipped rather than sent valueless.
        if (sendCookie && cookie != null) {
            connection.addRequestProperty("Cookie", cookie);
        }
        if (userAgent != null) {
            connection.addRequestProperty("User-Agent", userAgent);
        }
        connection.addRequestProperty("Connection", "Keep-Alive");
        connection.setConnectTimeout(timeoutMs);
        connection.setReadTimeout(timeoutMs);
        return connection;
    }

    /** Reads the whole response body as UTF-8 text and closes the response stream. */
    private static String readBody(HttpURLConnection connection) throws IOException {
        try (InputStream in = new BufferedInputStream(connection.getInputStream())) {
            ByteArrayOutputStream body = new ByteArrayOutputStream();
            byte[] buffer = new byte[1024];
            int len;
            while ((len = in.read(buffer)) != -1) {
                body.write(buffer, 0, len);
            }
            return body.toString(CHARSET);
        }
    }

    /** Resolves (and creates the directory for) the default download location of a URL. */
    private File defaultDownloadFile(URL source) {
        File directory = new File(getBasePath() + "download");
        if (!directory.exists()) {
            directory.mkdirs();
        }
        String path = source.getPath();
        String name = path.substring(path.lastIndexOf('/') + 1);
        if (name.isEmpty()) {
            // URL ends with '/' or has no path: fall back to a fixed file name.
            name = "download";
        }
        return new File(directory, name);
    }
}
  • 代理设置

    ProxyUtil.java

package biz;


import bean.IpInfoBean;
import bean.ProxyIpBean;
import bean.XdlFreeIpBean;
import com.google.gson.Gson;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;


/**
 * Maintains a pool of HTTP proxies in a {@code proxy.txt} file located in the directory
 * returned by {@link #getBasePath()} and installs one of them as the JVM-wide HTTP proxy.
 *
 * <p>All retry loops in this class retry without an upper bound, waiting three seconds
 * between attempts; interrupting the calling thread makes them stop and return what
 * they have.
 */
public class ProxyUtil {

    /** Service queried through a candidate proxy to check that it works. */
    private static final String IP_LOOKUP_URL = "http://int.dpool.sina.com.cn/iplookup/iplookup.php?format=json";
    private static final long RETRY_DELAY_MS = 3000;
    private static final Random RANDOM = new Random();

    /** Entry point: selects and installs a working proxy (call getProxyIp() to refresh the pool instead). */
    public static void main(String[] args) {
        setProxyIp();
    }

    /**
     * Picks random proxies from {@code proxy.txt} until one answers the IP lookup with a
     * non-empty city, leaving that proxy installed in the {@code http.proxyHost} and
     * {@code http.proxyPort} system properties. Proxies that fail are removed from the file;
     * when the file is missing or runs empty the pool is refilled via {@link #getProxyIp()}.
     *
     * @return the city reported for the working proxy, or {@code null} if the calling thread
     *         was interrupted before one was found
     */
    public static String setProxyIp() {
        // Iterative retry: the original recursed once per dead proxy, which could exhaust the stack.
        while (!Thread.currentThread().isInterrupted()) {
            List<String> ipList = new ArrayList<>();
            String ipport = null;
            try {
                loadProxyFile(ipList);
                if (!ipList.isEmpty()) {
                    ipport = ipList.get(RANDOM.nextInt(ipList.size()));
                    String city = activateAndLocate(ipport);
                    if (city != null && !city.isEmpty()) {
                        System.out.println(city);
                        return city;
                    }
                }
            } catch (Exception e) {
                System.out.println(e);
            }
            // Uninstall the rejected proxy first; otherwise a refill of the pool would be
            // requested through the very proxy that was just found unusable.
            System.clearProperty("http.proxyHost");
            System.clearProperty("http.proxyPort");
            ipList.remove(ipport);
            saveProxyIp(ipList);
        }
        return null;
    }

    /** Collects proxies from the free sources, prints them and stores them in {@code proxy.txt}. */
    public static void getProxyIp() {
        List<String> ipList = new ArrayList<>(getProxyIpFromXcdl());
        ipList.addAll(getProxyIpFromXdlFree());
//        ipList.addAll(getProxyIpFromXdlApi());
        System.out.println(ipList);
        saveProxyIp(ipList);
    }

    /**
     * Overwrites {@code proxy.txt} with the given entries, one per line. An empty list is not
     * written; instead the pool is refilled through {@link #getProxyIp()}.
     *
     * @param ipList proxies in {@code ip:port} form
     */
    public static void saveProxyIp(List<String> ipList) {
        if (ipList.isEmpty()) {
            // After an interrupt the sources return empty lists immediately; refilling then
            // would only recurse without making progress.
            if (!Thread.currentThread().isInterrupted()) {
                getProxyIp();
            }
            return;
        }
        String path = proxyFilePath();
        // First entry truncates the file, the remaining ones are appended.
        TextFile.write(path, false, ipList.get(0) + "\n");
        for (int i = 1; i < ipList.size(); i++) {
            TextFile.write(path, true, ipList.get(i) + "\n");
        }
    }

    /**
     * Fetches free proxies from xdaili.cn, retrying until at least one is returned.
     *
     * @return proxies in {@code ip:port} form; empty only if the thread was interrupted
     */
    public static List<String> getProxyIpFromXdlFree() {
        String path = "http://www.xdaili.cn/ipagent//freeip/getFreeIps?page=1&rows=10";
        Gson gson = new Gson();
        while (true) {
            List<String> ipList = new ArrayList<>();
            String json = requestUntilNonEmpty(path);
            if (json == null) {
                return ipList;
            }
            try {
                XdlFreeIpBean xdlFreeIpBean = gson.fromJson(json, XdlFreeIpBean.class);
                for (int i = 0; i < xdlFreeIpBean.getRows().size(); i++) {
                    XdlFreeIpBean.RowsBean rowsBean = xdlFreeIpBean.getRows().get(i);
                    String ip = rowsBean.getIp();
                    String port = rowsBean.getPort();
                    ipList.add(ip + ":" + port);
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
            // The original retried recursively but threw the retry's result away.
            if (!ipList.isEmpty() || !pause()) {
                return ipList;
            }
        }
    }

    /**
     * Scrapes free proxies from the xicidaili.com front page, retrying until the page
     * contains more than two table rows.
     *
     * @return proxies in {@code ip:port} form; empty if no row qualified or the thread was interrupted
     */
    public static List<String> getProxyIpFromXcdl() {
        String path = "http://www.xicidaili.com";
        while (true) {
            List<String> ipList = new ArrayList<>();
            String html = requestUntilNonEmpty(path);
            if (html == null) {
                return ipList;
            }
            Document document = Jsoup.parse(html);
            Elements trs = document.getElementsByTag("tr");
            if (trs != null && trs.size() > 2) {
                // Parsing starts at the third row; rows carrying the column captions are skipped.
                for (int i = 2; i < trs.size(); i++) {
                    if (trs.get(i).children().size() >= 3) {
                        String ip = trs.get(i).child(1).text();
                        String port = trs.get(i).child(2).text();
                        if (ip.equals("代理IP地址") || port.equals("端口")) {
                            continue;
                        }
                        ipList.add(ip + ":" + port);
                    }
                }
                return ipList;
            }
            if (!pause()) {
                return ipList;
            }
        }
    }

    /**
     * Fetches paid proxies from the xdaili API, retrying until the API reports error code
     * {@code "0"}. The URL below is a placeholder that must be replaced with a purchased
     * API address before this method can succeed.
     *
     * @return proxies in {@code ip:port} form; empty if the thread was interrupted
     */
    public static List<String> getProxyIpFromXdlApi() {
        String path = "这个需要你去讯代理花钱买";
        Gson gson = new Gson();
        while (true) {
            List<String> ipList = new ArrayList<>();
            String json = requestUntilNonEmpty(path);
            if (json == null) {
                return ipList;
            }
            try {
                ProxyIpBean proxyIpBean = gson.fromJson(json, ProxyIpBean.class);
                if (proxyIpBean.getERRORCODE().equals("0")) {
                    for (int i = 0; i < proxyIpBean.getRESULT().size(); i++) {
                        ProxyIpBean.RESULTBean resultBean = proxyIpBean.getRESULT().get(i);
                        String ip = resultBean.getIp();
                        String port = resultBean.getPort();
                        ipList.add(ip + ":" + port);
                    }
                    return ipList;
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
            // Retry this source (the original retried the free source here by mistake
            // and discarded the result).
            if (!pause()) {
                return new ArrayList<>();
            }
        }
    }

    /**
     * Performs an HTTP GET on {@code path} and returns the response body decoded as UTF-8.
     *
     * @param path   absolute URL to fetch
     * @param cookie value for the {@code Cookie} request header, or {@code null} to send none
     * @return the response body, or {@code null} if the URL is invalid or the request fails
     */
    public static String request(String path, String cookie) {
        try {
            HttpURLConnection connection = (HttpURLConnection) new URL(path).openConnection();
            connection.setRequestMethod("GET");
            // A null value would be sent as a valueless header, so skip it.
            if (cookie != null) {
                connection.addRequestProperty("Cookie", cookie);
            }
            connection.addRequestProperty("Accept", "application/json, text/javascript, */*; q=0.01");
            connection.addRequestProperty("User-Agent",
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36");
            connection.addRequestProperty("Connection", "Keep-Alive");
            connection.setConnectTimeout(5000); // connect timeout, milliseconds
            connection.setReadTimeout(5000);    // read timeout, milliseconds
            try (InputStream in = new BufferedInputStream(connection.getInputStream())) {
                ByteArrayOutputStream body = new ByteArrayOutputStream();
                byte[] buffer = new byte[1024];
                int len;
                while ((len = in.read(buffer)) != -1) {
                    body.write(buffer, 0, len);
                }
                return body.toString("utf-8");
            }
        } catch (Exception ignored) {
            // Deliberately silent: callers treat null as "try again" and poll in a loop.
            return null;
        }
    }

    /**
     * Returns the directory that holds this class's code source (the jar's parent directory,
     * or the classes directory itself), always ending with {@code '/'}.
     *
     * @return the base directory path
     */
    public static String getBasePath() {
        URL location = ProxyUtil.class.getProtectionDomain().getCodeSource().getLocation();
        String path;
        try {
            // URI.getPath() decodes percent-escapes such as %20.
            path = location.toURI().getPath();
        } catch (java.net.URISyntaxException e) {
            path = location.getPath();
        }
        // URL paths always use '/', whatever the platform's file separator is.
        return path.substring(0, path.lastIndexOf('/') + 1);
    }

    /** Location of the proxy pool file. */
    private static String proxyFilePath() {
        return getBasePath() + "proxy.txt";
    }

    /** Appends every non-blank line of the proxy pool file to {@code target}. */
    private static void loadProxyFile(List<String> target) throws IOException {
        try (BufferedReader reader = new BufferedReader(new FileReader(proxyFilePath()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (!line.trim().isEmpty()) {
                    target.add(line);
                }
            }
        }
    }

    /**
     * Installs {@code ipport} as the JVM-wide HTTP proxy and asks the IP lookup service,
     * through that proxy, which city it resolves to.
     *
     * @return the reported city, or {@code null} if the entry is malformed or the lookup
     *         returned nothing usable
     */
    private static String activateAndLocate(String ipport) {
        int colon = ipport.lastIndexOf(":");
        if (colon < 0) {
            return null;
        }
        System.setProperty("http.maxRedirects", "50");
        System.setProperty("proxySet", "true");
        System.setProperty("http.proxyHost", ipport.substring(0, colon));
        System.setProperty("http.proxyPort", ipport.substring(colon + 1));
        String ipJson = request(IP_LOOKUP_URL, null);
        IpInfoBean ipInfoBean = new Gson().fromJson(ipJson, IpInfoBean.class);
        return (ipInfoBean == null) ? null : ipInfoBean.getCity();
    }

    /** Requests {@code path} until a non-empty body arrives; {@code null} if interrupted meanwhile. */
    private static String requestUntilNonEmpty(String path) {
        String body = request(path, null);
        while (body == null || body.isEmpty()) {
            if (!pause()) {
                return null;
            }
            body = request(path, null);
        }
        return body;
    }

    /** Sleeps for the retry delay; returns {@code false} (with the interrupt flag restored) if interrupted. */
    private static boolean pause() {
        try {
            Thread.sleep(RETRY_DELAY_MS);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }

}

猜你喜欢

转载自blog.csdn.net/ygd1994/article/details/53612385