java爬取天眼查并存入excel中

功能：

自动读取company.txt文件中的公司名进行搜索
把搜索到含有公司详细信息的html保存在info文件夹
把html文件中的信息提取到excel表格中
判断是否出现机器人验证
断点续查（关了再打开不会重复查询）

缺点：

无法跳过机器人验证
程序分为两个执行文件，不是一键完成
偶尔会卡住（请求没有响应）

问题记录

登录问题：一开始没有登录，查询的时候总是被拦截跳转到登录页面。在浏览器上登录，复制cookie信息，在代码中设置即可。
机器人验证：据我观察，同一个IP调用天眼查网站上的接口大约100次就会出现一次机器人验证。虽然很想自动完成，但是能力有限，实现不了，后来想想采取了一个折中的方法，在代码里面检测是否出现机器人验证。当出现机器人验证的时候，打印验证的地址，程序暂停。等待人工完成验证后，输入OK再继续往下执行。
程序卡住：不知道是代码的问题还是网站的问题。每查询一百多个公司的时候，总会有个请求等不到响应，一直在等待。虽然做了处理，把程序关了再打开还是会继续往后查询，但是挺纠结。以后再处理。
图片编码：试着破解机器人验证的过程中发现一个挺有意思的地方。天眼查网站的机器人验证是点选汉字的方式，页面中有两张图片。这两张图片比较有意思的地方是采用Base64编码的方式进行传输的，也就是把图片转为字符串的形式，以前不知道还有这种操作。

依赖jar包

httpclient：模拟发起HTTP请求
jsoup：解析HTML
poi-ooxml：Excel表格操作

关键代码

设置请求头：非常关键，需要把登录后的cookie信息复制在这里设置，不然调用接口的时候会被拦截跳转到登录页面。

/**
     * 设置请求头
     * @param httpGet
     */
    public static void setHttpHeaders(HttpGet httpGet) {
        //设置默认请求头 在浏览器登陆后，把cookie的内容复制到这里设置cookie，不然无法查询
        httpGet.setHeader("Cookie","");
        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.9");
        httpGet.setHeader("Connection", "keep-alive");
        httpGet.setHeader("Host", "www.tianyancha.com");
        httpGet.setHeader("Referer", "https://www.tianyancha.com/");
        httpGet.setHeader("Upgrade-Insecure-Requests", "1");
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36");
    }

下面是全部的代码：

package cn.xiaoyanol.crawler;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.StatusLine;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.Scanner;

/**
 * Created with IntelliJ IDEA.
 * Description: looks up the business-registration information of the companies
 * listed in {@code company.txt} on the tianyancha website and stores each
 * company's detail page as {@code info/<company name>.html}.
 *
 * <p>Companies that already have an {@code .html} file in the {@code info}
 * directory are skipped, so an interrupted run can simply be restarted.
 *
 * @Author: jenrey
 * @Date: 2018-10-16
 * @Time: 5:09 PM
 */
public class TianYanChaCrawler {

    /** Text file holding one company name per line. */
    private static final String COMPANY_LIST_FILE = "company.txt";

    /** Directory that receives the downloaded detail pages. */
    private static final String OUTPUT_DIRECTORY = "info";

    /** Suffix of a completely downloaded detail page. */
    private static final String HTML_SUFFIX = ".html";

    /** Suffix used while a detail page is still being downloaded. */
    private static final String PARTIAL_SUFFIX = ".part";

    /** Base URI used to resolve relative links found in the search result page. */
    private static final String SITE_ROOT = "https://www.tianyancha.com/";

    /** Search URL prefix; the URL-encoded company name is appended to it. */
    private static final String SEARCH_URL_PREFIX = "https://www.tianyancha.com/search?key=";

    /** Maximum time to wait for the TCP connection to be established. */
    private static final int CONNECT_TIMEOUT_MILLIS = 5000;

    /** Maximum time to wait for data on an established connection. */
    private static final int SOCKET_TIMEOUT_MILLIS = 30000;

    /** Maximum time to wait for a connection from the client's connection pool. */
    private static final int CONNECTION_REQUEST_TIMEOUT_MILLIS = 5000;

    /** System property that overrides the cookie sent with every request. */
    private static final String COOKIE_PROPERTY = "tianyancha.cookie";

    /** Environment variable that overrides the cookie when the system property is not set. */
    private static final String COOKIE_ENV = "TIANYANCHA_COOKIE";

    /**
     * Cookie used when neither {@link #COOKIE_PROPERTY} nor {@link #COOKIE_ENV} is set.
     * NOTE(review): this embeds one personal browser session (auth token and mobile
     * number) and is presumably stale; supply your own cookie through the system
     * property or environment variable and consider removing this literal.
     */
    private static final String DEFAULT_COOKIE = "TYCID=3f2b49d0cd4111e8a5e549f497d021fa; undefined=3f2b49d0cd4111e8a5e549f497d021fa; ssuid=5123032640; _ga=GA1.2.1228278129.1539254092; _gid=GA1.2.1873227241.1539582139; tyc-user-info=%257B%2522token%2522%253A%2522eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTUyMjg4NzcxMyIsImlhdCI6MTUzOTU4MjI4MSwiZXhwIjoxNTU1MTM0MjgxfQ.0uq_J8GQo2sJSX_xaV1COHURUqbdCdBrRXdjRlwAyH0EuuR3xxLd1R5VtnQAzuZPVlf-9GJN2wRyYBDktKfHNg%2522%252C%2522integrity%2522%253A%25220%2525%2522%252C%2522state%2522%253A%25220%2522%252C%2522redPoint%2522%253A%25220%2522%252C%2522vipManager%2522%253A%25220%2522%252C%2522vnum%2522%253A%25220%2522%252C%2522monitorUnreadCount%2522%253A%252243%2522%252C%2522onum%2522%253A%25220%2522%252C%2522mobile%2522%253A%252215522887713%2522%257D; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTUyMjg4NzcxMyIsImlhdCI6MTUzOTU4MjI4MSwiZXhwIjoxNTU1MTM0MjgxfQ.0uq_J8GQo2sJSX_xaV1COHURUqbdCdBrRXdjRlwAyH0EuuR3xxLd1R5VtnQAzuZPVlf-9GJN2wRyYBDktKfHNg; RTYCID=5ecd4dc1fe2a41ff8c5337236243115a; CT_TYCID=e65b84b666fe47febe8d3d669a99d899; aliyungf_tc=AQAAAKGn5XMvqAkAxZFsyv2ebLRydO1v; csrfToken=eGbxdFydN_eDMCrU8Pxv5JJm";

    /** Result of processing a single company. */
    private enum Outcome {
        /** The company was processed; continue with the regular pause. */
        COMPLETED,
        /** Nothing usable was found; continue with the next company immediately. */
        SKIPPED,
        /** An I/O error occurred; the company is reported and the run continues. */
        FAILED,
        /** The run must stop (robot verification not passed, or interrupted). */
        ABORTED
    }

    /**
     * Reads the company list, drops the companies that were already fetched and
     * downloads the detail page of every remaining company.
     *
     * @param args unused
     * @throws IOException if the company list cannot be read or the HTTP client
     *                     cannot be closed
     */
    public static void main(String[] args) throws IOException {
        // Companies whose information is to be fetched.
        List<String> companyNameList = readCompanyNames(new File(COMPANY_LIST_FILE));

        // Directory that stores the HTML pages containing the company details.
        File directory = new File(OUTPUT_DIRECTORY);
        if (!directory.exists()) {
            directory.mkdir();
        } else {
            removeAlreadyFetched(directory, companyNameList);
        }

        if (companyNameList.isEmpty()) {
            System.out.println("没有要搜索的公司，程序即将关闭。。。");
            return;
        }
        System.out.println("程序将要搜索 "+ companyNameList.size()+" 个公司的信息。。。");
        System.out.println();

        HttpClient httpClient = HttpClientBuilder.create().build();
        HttpClientContext context = HttpClientContext.create();
        // Deliberately never closed: closing it would close System.in and make
        // every later verification prompt fail.
        Scanner scanner = new Scanner(System.in);
        // Connect, read and pool timeouts. Without a socket timeout a request
        // whose response never arrives would block the program forever.
        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectTimeout(CONNECT_TIMEOUT_MILLIS)
                .setSocketTimeout(SOCKET_TIMEOUT_MILLIS)
                .setConnectionRequestTimeout(CONNECTION_REQUEST_TIMEOUT_MILLIS)
                .build();
        Random random = new Random();

        try {
            for (int index = 0; index < companyNameList.size(); index++) {
                String companyName = companyNameList.get(index);
                Outcome outcome;
                try {
                    outcome = fetchCompany(index + 1, companyName, directory,
                            httpClient, context, requestConfig, scanner);
                } catch (IOException e) {
                    // A timeout or broken connection only affects this company.
                    System.err.println((index+1)+"、获取 "+companyName+" 的信息失败："+e);
                    outcome = Outcome.FAILED;
                }

                if (outcome == Outcome.ABORTED) {
                    break;
                }
                if (outcome == Outcome.SKIPPED) {
                    continue;
                }
                if (outcome == Outcome.COMPLETED) {
                    System.out.println();
                    System.out.println((index+1)+"、提取："+companyName+" 信息完成");
                    System.out.println();
                }

                if (index == companyNameList.size() - 1 ) {
                    System.out.println("搜索完成，程序即将结束。。。");
                } else {
                    // Random pause of 1 or 2 seconds between two companies.
                    int time = random.nextInt(2)+1;
                    System.out.println("系统暂停：" + time + "秒");
                    System.out.println();
                    try {
                        Thread.sleep(time * 1000);
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                        break;
                    }
                }
            }
        } finally {
            // HttpClientBuilder returns a closeable client; release its connections.
            if (httpClient instanceof Closeable) {
                ((Closeable) httpClient).close();
            }
        }
    }

    /**
     * Reads the company names, one per line, trimming each line and removing
     * full-width spaces. Lines that end up empty are ignored.
     *
     * @param companyFile the file to read (decoded with the platform default charset)
     * @return the company names in file order
     * @throws IOException if the file cannot be read
     */
    private static List<String> readCompanyNames(File companyFile) throws IOException {
        List<String> companyNames = new ArrayList<String>();
        BufferedReader reader = new BufferedReader(new FileReader(companyFile));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                String name = line.trim().replaceAll("　", "");
                if (!name.isEmpty()) {
                    companyNames.add(name);
                }
            }
        } finally {
            reader.close();
        }
        return companyNames;
    }

    /**
     * Removes from {@code companyNameList} every company for which a
     * {@code <name>.html} file already exists in {@code directory}.
     *
     * @param directory       the directory holding the downloaded pages
     * @param companyNameList the list to filter in place
     */
    private static void removeAlreadyFetched(File directory, List<String> companyNameList) {
        String[] fileNames = directory.list();
        if (fileNames == null) {
            // Not a directory, or it could not be listed: nothing to filter.
            return;
        }
        for (String fileName : fileNames) {
            if (!fileName.endsWith(HTML_SUFFIX)) {
                continue;
            }
            String fetchedName = fileName.substring(0, fileName.length() - HTML_SUFFIX.length());
            boolean removed = false;
            // remove(Object) drops one occurrence per call; loop to drop duplicates too.
            while (companyNameList.remove(fetchedName)) {
                removed = true;
            }
            if (removed) {
                System.out.println(fetchedName+" 已经搜索过，该公司将被跳过。。。");
                System.out.println();
            }
        }
    }

    /**
     * Searches for one company, downloads its detail page and prints the
     * business information found in it.
     *
     * @param ordinal       1-based position of the company, used in messages
     * @param companyName   the company to look up
     * @param directory     the directory that receives the detail page
     * @param httpClient    the client used for both requests
     * @param context       the context shared by all requests
     * @param requestConfig the timeouts applied to each request
     * @param scanner       console input used for the verification prompt
     * @return how the main loop should proceed
     * @throws IOException if a request or the download fails
     */
    private static Outcome fetchCompany(int ordinal, String companyName, File directory,
            HttpClient httpClient, HttpClientContext context, RequestConfig requestConfig,
            Scanner scanner) throws IOException {
        System.out.println(ordinal+"、正在获取 "+ companyName +" 的信息。。。");
        System.out.println();
        // The name is URL-encoded so that spaces and other reserved characters
        // cannot produce an invalid request URI.
        HttpGet httpGet = newRequest(
                SEARCH_URL_PREFIX + java.net.URLEncoder.encode(companyName, "UTF-8"), requestConfig);

        HttpResponse response = httpClient.execute(httpGet, context);
        System.out.println("HTTP请求执行完成。。。");
        response = passRobotVerification(response, httpClient, httpGet, context, scanner);
        if (response == null) {
            return Outcome.ABORTED;
        }

        System.out.println(ordinal+"、搜索 "+companyName+" 信息完成");
        System.out.println();
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return Outcome.COMPLETED;
        }

        // Extract the URL of the detail page from the search result page.
        // UTF-8 is only the fallback when the response declares no charset.
        String html = EntityUtils.toString(entity, "UTF-8");
        Document document = Jsoup.parse(html, SITE_ROOT);
        String url = document.select("a.name").attr("abs:href");

        // No search result: skip this company.
        if ("".equals(url)) {
            System.out.println("无法在天眼查网站查询到："+companyName+" 的信息");
            System.out.println();
            try {
                // Pause for one second.
                System.out.println("系统暂停1秒。。。。");
                Thread.sleep(1 * 1000);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return Outcome.ABORTED;
            }
            return Outcome.SKIPPED;
        }
        System.out.println(ordinal+"、获取 "+companyName+" 详情URL成功。。。");
        System.out.println();

        System.out.println("准备获取详细信息。。。");
        httpGet = newRequest(url, requestConfig);
        response = httpClient.execute(httpGet, context);
        System.out.println("HTTP请求执行完成");
        response = passRobotVerification(response, httpClient, httpGet, context, scanner);
        if (response == null) {
            return Outcome.ABORTED;
        }

        entity = response.getEntity();
        if (entity == null) {
            System.out.println("注意！"+companyName+" 无法查询到工商信息。。。");
            return Outcome.SKIPPED;
        }
        File htmlFile = saveDetailPage(entity, directory, companyName);

        // Extract the information from the saved page.
        document = Jsoup.parse(htmlFile, "UTF-8");
        Elements tbodys = document.select("tbody");
        if (tbodys.size() < 2){
            System.out.println("注意！"+companyName+" 无法查询到工商信息。。。");
            return Outcome.SKIPPED;
        }

        System.out.println(ordinal+"、"+companyName+" 的工商信息如下：");
        System.out.println();
        printBusinessInfo(tbodys.get(1));
        return Outcome.COMPLETED;
    }

    /**
     * Creates a GET request with the default headers and the given timeouts.
     *
     * @param url           the URL to request
     * @param requestConfig the timeouts to apply
     * @return the configured request
     * @throws IOException if {@code url} is not a valid URI
     */
    private static HttpGet newRequest(String url, RequestConfig requestConfig) throws IOException {
        HttpGet httpGet;
        try {
            httpGet = new HttpGet(url);
        } catch (IllegalArgumentException e) {
            throw new IOException("Invalid request URL: " + url, e);
        }
        setHttpHeaders(httpGet);
        httpGet.setConfig(requestConfig);
        return httpGet;
    }

    /**
     * Handles a robot verification: when {@code response} indicates one, prints
     * the address to visit, waits until the user types OK and repeats the request.
     *
     * @param response   the response of the first attempt
     * @param httpClient the client used to repeat the request
     * @param httpGet    the request to repeat
     * @param context    the context of the request
     * @param scanner    console input
     * @return a response that passed the check, or {@code null} when the
     *         verification was not completed
     * @throws IOException if repeating the request fails
     */
    private static HttpResponse passRobotVerification(HttpResponse response, HttpClient httpClient,
            HttpGet httpGet, HttpClientContext context, Scanner scanner) throws IOException {
        if (!checkRobotVerification(response, context)) {
            return response;
        }

        // The redirect list is absent or empty when the non-200 response was not
        // preceded by a redirect; fall back to the requested address then.
        List<URI> redirectLocations = context.getRedirectLocations();
        URI verificationAddress = (redirectLocations == null || redirectLocations.isEmpty())
                ? httpGet.getURI()
                : redirectLocations.get(0);
        // Release the connection held by the rejected response before retrying.
        EntityUtils.consumeQuietly(response.getEntity());

        System.out.println("注意！出现机器人验证，请点击下面的链接，在验证完后输入 ok 继续运行。。。");
        System.out.println();
        System.out.println(verificationAddress);
        System.out.println();
        System.out.print("完成验证后，请在此处输入OK：");
        while (true) {
            if (!scanner.hasNextLine()) {
                // Console input was closed, so the user cannot confirm.
                System.out.println();
                System.out.println("没有完成机器人验证，程序结束运行。。。。");
                System.out.println();
                return null;
            }
            if ("ok".equalsIgnoreCase(scanner.nextLine().trim())) {
                break;
            }
            System.out.print("完成验证后，请在此处输入OK：");
        }
        System.out.println();

        // Execute the HTTP request again.
        HttpResponse retried = httpClient.execute(httpGet, context);
        if (checkRobotVerification(retried, context)) {
            EntityUtils.consumeQuietly(retried.getEntity());
            System.out.println("没有完成机器人验证，程序结束运行。。。。");
            System.out.println();
            return null;
        }
        return retried;
    }

    /**
     * Stores the body of {@code entity} as {@code <companyName>.html} in
     * {@code directory}. The body is first written to a {@code .part} file and
     * renamed afterwards, so an aborted download never looks like a fetched company.
     *
     * @param entity      the response body to store
     * @param directory   the target directory
     * @param companyName the company the page belongs to
     * @return the stored file
     * @throws IOException if the body cannot be read or the file cannot be written
     */
    private static File saveDetailPage(HttpEntity entity, File directory, String companyName)
            throws IOException {
        File target = new File(directory, companyName + HTML_SUFFIX);
        File partial = new File(directory, companyName + PARTIAL_SUFFIX);
        boolean complete = false;
        try {
            InputStream content = entity.getContent();
            try {
                FileOutputStream fileOutputStream = new FileOutputStream(partial);
                try {
                    byte[] buff = new byte[2048];
                    int length;
                    while ((length = content.read(buff, 0, buff.length)) != -1) {
                        fileOutputStream.write(buff, 0, length);
                    }
                } finally {
                    fileOutputStream.close();
                }
            } finally {
                content.close();
            }
            if (target.exists() && !target.delete()) {
                throw new IOException("Cannot replace " + target);
            }
            if (!partial.renameTo(target)) {
                throw new IOException("Cannot rename " + partial + " to " + target);
            }
            complete = true;
        } finally {
            if (!complete) {
                // Best effort: do not leave a truncated download behind.
                partial.delete();
            }
        }
        return target;
    }

    /**
     * Prints the cells of every table row as "label : value" pairs, where cells
     * at even positions are labels and cells at odd positions are values.
     *
     * @param tbody the table body holding the business information
     */
    private static void printBusinessInfo(Element tbody) {
        Elements rows = tbody.select("tr");
        for (int i = 0; i < rows.size(); i++){
            Elements tds = rows.get(i).select("td");
            for (int j = 0; j < tds.size(); j++){
                if (j % 2 == 0){
                    System.out.print(tds.get(j).text().split(" ")[0]+" : ");
                } else {
                    System.out.print(tds.get(j).text()+"\t\t\t");
                }
            }
            System.out.println();
            System.out.println();
        }
    }

    /**
     * Checks whether a robot verification appeared. Any status code other than
     * 200 is treated as a verification.
     *
     * @param response the response to inspect
     * @param context  the context of the request (currently not inspected)
     * @return {@code true} if the status code is not 200
     */
    public static boolean checkRobotVerification(HttpResponse response , HttpClientContext context) {
        StatusLine statusLine = response.getStatusLine();
        return statusLine.getStatusCode() != HttpStatus.SC_OK;
    }

    /**
     * Sets the default request headers.
     *
     * <p>The cookie is taken from the system property {@code tianyancha.cookie},
     * then from the environment variable {@code TIANYANCHA_COOKIE}, and finally
     * from the built-in default. Log in with a browser and copy the cookie from
     * there; without a cookie the queries do not work.
     *
     * @param httpGet the request whose headers are set
     */
    public static void setHttpHeaders(HttpGet httpGet) {
        httpGet.setHeader("Cookie", resolveCookie());
        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.9");
        httpGet.setHeader("Connection", "keep-alive");
        httpGet.setHeader("Host", "www.tianyancha.com");
        httpGet.setHeader("Referer", "https://www.tianyancha.com/");
        httpGet.setHeader("Upgrade-Insecure-Requests", "1");
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36");
    }

    /**
     * Returns the cookie to send: the system property, else the environment
     * variable, else the built-in default.
     *
     * @return the cookie header value
     */
    private static String resolveCookie() {
        String cookie = System.getProperty(COOKIE_PROPERTY);
        if (cookie == null || cookie.trim().isEmpty()) {
            cookie = System.getenv(COOKIE_ENV);
        }
        if (cookie == null || cookie.trim().isEmpty()) {
            cookie = DEFAULT_COOKIE;
        }
        return cookie;
    }

    /**
     * Placeholder kept for compatibility; it currently does nothing.
     *
     * @param response unused
     * @param context  unused
     */
    public static void UITips(HttpResponse response, HttpClientContext context) {

    }
}

下面是生成excel的代码：

package cn.xiaoyanol.crawler;


import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Created with IntelliJ IDEA.
 * Description:
 * extracts the data of the downloaded company pages in the {@code info}
 * directory into the Excel workbook {@code 查询结果.xlsx}.
 *
 * <p>One row is written per {@code .html} file, starting at row 1 below the
 * header row. Files from which the data cannot be extracted are reported and skipped.
 *
 * @Author: jenrey
 * @Date: 2018-10-16
 * @Time: 2:15 PM
 */
public class GetExcel {

    /** Name of the sheet that receives the rows. */
    private static final String SHEET_NAME = "sheet1";

    /** Suffix of the downloaded company pages. */
    private static final String HTML_SUFFIX = ".html";

    /** Titles of the header row, in column order. */
    private static final String[] HEADER_TITLES = {
        "搜索公司名",
        "实际公司名",
        "工商注册号",
        "组织机构代码",
        "统一信用代码",
        "公司类型",
        "纳税人识别号",
        "行业",
        "营业期限",
        "核准日期",
        "纳税人资质",
        "人员规模",
        "实缴资本",
        "登记机关",
        "参保人数",
        "英文名称",
        "注册地址",
        "经营范围",
        "法定代表人",
    };

    /**
     * Reads every {@code .html} file in the {@code info} directory and writes
     * one row per file into the workbook.
     *
     * @param args unused
     * @throws IOException            if the workbook cannot be read or written
     * @throws InvalidFormatException declared for compatibility with existing callers
     */
    public static void main(String[] args) throws IOException, InvalidFormatException {

        File directory = new File("info");

        if (!directory.exists()) {
            System.out.println("文件夹不存在，程序结束运行");
            return;
        }

        // Files to extract.
        List<String> fileNames = listHtmlFiles(directory);

        File xlsxFile = new File("查询结果.xlsx");
        Workbook workbook = openOrCreateWorkbook(xlsxFile);
        try {
            Sheet sheet = workbook.getSheet(SHEET_NAME);
            if (sheet == null) {
                // An existing workbook without the expected sheet.
                sheet = workbook.createSheet(SHEET_NAME);
                writeHeader(sheet);
            }

            // Extract the information; data rows start below the header row.
            int rowNum = 1;
            for (String fileName : fileNames) {
                try {
                    List<String> messageList = extractCompanyInfo(directory, fileName);
                    Row row1 = sheet.createRow(rowNum++);
                    for (int i = 0; i < messageList.size(); i++) {
                        row1.createCell(i).setCellValue(messageList.get(i));
                    }
                    System.out.println("rowNum:"+(rowNum - 1)+" "+ fileName);
                } catch (Exception e) {
                    // Boundary catch: one unreadable page must not stop the export.
                    System.out.println(fileName+"-------------------");
                    System.err.println(fileName + ": " + e);
                }
            }

            // Write the workbook once, after all rows have been added.
            FileOutputStream outputStream = new FileOutputStream(xlsxFile);
            try {
                workbook.write(outputStream);
            } finally {
                outputStream.close();
            }
        } finally {
            workbook.close();
        }
    }

    /**
     * Lists the names of the files in {@code directory} that end with {@code .html}.
     *
     * @param directory the directory to list
     * @return the matching file names; empty if the directory cannot be listed
     */
    private static List<String> listHtmlFiles(File directory) {
        List<String> fileNames = new ArrayList<String>();
        String[] files = directory.list();
        if (files == null) {
            return fileNames;
        }
        for (String file : files) {
            // endsWith matches the suffix stripping done when the row is built.
            if (file.endsWith(HTML_SUFFIX)) {
                fileNames.add(file);
            }
        }
        return fileNames;
    }

    /**
     * Opens the workbook stored in {@code xlsxFile}, or creates a new workbook
     * with a header-only sheet when the file does not exist.
     *
     * @param xlsxFile the workbook file
     * @return the loaded or newly created workbook
     * @throws IOException if the existing file cannot be read
     */
    private static Workbook openOrCreateWorkbook(File xlsxFile) throws IOException {
        if (!xlsxFile.exists()) {
            XSSFWorkbook created = new XSSFWorkbook();
            XSSFSheet sheet = created.createSheet(SHEET_NAME);
            writeHeader(sheet);
            return created;
        }
        FileInputStream fileInputStream = new FileInputStream(xlsxFile);
        try {
            return new XSSFWorkbook(fileInputStream);
        } finally {
            fileInputStream.close();
        }
    }

    /**
     * Writes the header titles into row 0 of {@code sheet}.
     *
     * @param sheet the sheet to initialize
     */
    private static void writeHeader(Sheet sheet) {
        Row row = sheet.createRow(0);
        for (int i = 0; i < HEADER_TITLES.length; i++) {
            row.createCell(i).setCellValue(HEADER_TITLES[i]);
        }
    }

    /**
     * Extracts the cell values of one row from a downloaded company page: the
     * searched name (file name without suffix), the text of the page's
     * {@code h1} elements, the text of every odd-positioned cell of the second
     * table body, and the text of the first link in the first table body.
     *
     * @param directory the directory holding the page
     * @param fileName  the name of the page file, ending with {@code .html}
     * @return the cell values in column order
     * @throws IOException           if the file cannot be read
     * @throws IllegalStateException if the page lacks the expected tables or link
     */
    private static List<String> extractCompanyInfo(File directory, String fileName) throws IOException {
        List<String> messageList = new ArrayList<String>();
        // The searched company name.
        messageList.add(fileName.substring(0, fileName.length() - HTML_SUFFIX.length()));

        Document document = Jsoup.parse(new File(directory, fileName), "UTF-8");

        // The company name that was actually found.
        messageList.add(document.select("h1").text());

        Elements tbodys = document.select("tbody");
        if (tbodys.size() < 2) {
            throw new IllegalStateException("expected at least 2 tbody elements, found " + tbodys.size());
        }

        // Business information: cells at odd positions hold the values.
        Elements rows = tbodys.get(1).select("tr");
        for (int i = 0; i < rows.size(); i++) {
            Elements tds = rows.get(i).select("td");
            for (int j = 1; j < tds.size(); j += 2) {
                messageList.add(tds.get(j).text());
            }
        }

        // Presumably the legal representative, going by the last header column
        // - verify against the page layout.
        Element aElement = tbodys.get(0).select("a").first();
        if (aElement == null) {
            throw new IllegalStateException("no link found in the first tbody element");
        }
        messageList.add(aElement.text());
        return messageList;
    }
}