版权声明: https://blog.csdn.net/JENREY/article/details/83090002
功能:
自动读取company.txt文件中的公司名进行搜索
把搜索到含有公司详细信息的html保存在info文件夹
把html文件中的信息提取到excel表格中
判断是否出现机器人验证
断点续查(关了再打开不会重复查询)
缺点:无法跳过机器人验证
程序分为两个执行文件,不是一键完成
偶尔会卡住(请求没有响应)
问题记录
登录问题:一开始没有登录,查询的时候总是被拦截跳转到登录页面。在浏览器上登录,复制cookie信息,在代码中设置即可。
机器人验证:据我观察,同一个IP调用天眼查网站上的接口大约100次就会出现一次机器人验证。虽然很想自动完成,但是能力有限,实现不了,后来想想采取了一个折中的方法,在代码里面检测是否出现机器人验证。当出现机器人验证的时候,打印验证的地址,程序暂停。等待人工完成验证后,输入OK再继续往下执行。
程序卡住:不知道是代码问题还是网站的问题。每查询一百多个公司的时候,总会有个请求等不到响应,一直在等待。虽然做了处理,把程序关了再打开还是会继续往后查询,但是挺纠结。以后再处理。
图片编码:试着破解机器人验证的过程中发现一个挺有意思的地方。天眼查网站的机器人验证是点选汉字的方式,页面中有两张图片。这两张图片比较有意思的地方是采用Base64编码的方式进行传输的,也就是把图片转为字符串的形式,以前不知道还有这种操作。
依赖jar包
httpclient:模拟发起HTTP请求
jsoup:解析HTML
poi-ooxml:Excel表格操作
关键代码
设置请求头:非常关键,需要把登录后的cookie信息复制在这里设置,不然调用接口的时候会被拦截跳转到登录页面。
/**
 * Applies the default browser-like request headers to a tianyancha.com request.
 *
 * <p>The {@code Cookie} value is a placeholder: paste the cookie copied from a logged-in
 * browser session there, otherwise the site cannot be queried.
 *
 * @param httpGet the request that receives the headers
 */
public static void setHttpHeaders(HttpGet httpGet) {
    // Header name/value pairs, applied in this order. The first entry is the login cookie.
    final String[][] defaultHeaders = {
        {"Cookie", ""},
        {"Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"},
        {"Accept-Language", "zh-CN,zh;q=0.9"},
        {"Connection", "keep-alive"},
        {"Host", "www.tianyancha.com"},
        {"Referer", "https://www.tianyancha.com/"},
        {"Upgrade-Insecure-Requests", "1"},
        {"User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"},
    };
    for (String[] header : defaultHeaders) {
        httpGet.setHeader(header[0], header[1]);
    }
}
下面是全部的代码:
package cn.xiaoyanol.crawler;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.StatusLine;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.Scanner;
/**
 * Looks up the business-registration details of a list of companies on tianyancha.com.
 *
 * <p>Company names are read from {@code company.txt} (one per line, UTF-8). For each name the
 * search page is fetched, the first result's detail page is saved as
 * {@code info/<name>.html}, and the registration table found in that page is printed to
 * stdout. Companies that already have a saved page in {@code info} are skipped, so an
 * interrupted run can be resumed by simply starting the program again.
 *
 * <p>When a response other than HTTP 200 is received it is treated as a robot check: the
 * verification URL is printed and the program waits until the operator types {@code ok}.
 *
 * <p>The login cookie is read from the system property {@code tianyancha.cookie} or the
 * environment variable {@code TIANYANCHA_COOKIE}; it must not be committed to source control.
 *
 * @author jenrey
 * Created 2018-10-16, 5:09 PM
 */
public class TianYanChaCrawler {

    /** File listing the company names to look up, one per line. */
    private static final String COMPANY_LIST_FILE = "company.txt";
    /** Directory that receives the downloaded detail pages. */
    private static final String INFO_DIRECTORY = "info";
    /** Suffix of the saved detail pages. */
    private static final String HTML_SUFFIX = ".html";
    /** Charset used for the company list, the query string and the HTML pages. */
    private static final String CHARSET = "UTF-8";
    /** Site root, also used as base URI when resolving links in the search page. */
    private static final String BASE_URL = "https://www.tianyancha.com/";
    /** Search endpoint; the URL-encoded company name is appended. */
    private static final String SEARCH_URL = BASE_URL + "search?key=";
    /** Maximum time to establish a connection or to obtain one from the pool. */
    private static final int CONNECT_TIMEOUT_MILLIS = 5000;
    /** Maximum time to wait for data on an established connection. */
    private static final int SOCKET_TIMEOUT_MILLIS = 30000;

    /** Result of processing one company. */
    private enum Outcome {
        /** The company was processed; print the completion message and pause. */
        COMPLETED,
        /** Nothing usable was found (or an I/O error occurred); move on to the next company. */
        SKIPPED,
        /** The robot verification was not completed; stop crawling. */
        ABORTED
    }

    /**
     * Program entry point: crawls every company from {@code company.txt} that has no saved
     * page in {@code info} yet.
     *
     * @param args unused
     * @throws IOException if the company list cannot be read or the output directory cannot
     *                     be created
     */
    public static void main(String[] args) throws IOException {
        List<String> companyNameList = readCompanyNames(new File(COMPANY_LIST_FILE));

        File directory = new File(INFO_DIRECTORY);
        if (directory.exists()) {
            // Resume support: drop every company whose detail page is already on disk.
            skipAlreadyFetched(companyNameList, directory);
        } else if (!directory.mkdir()) {
            throw new IOException("无法创建文件夹:" + directory.getAbsolutePath());
        }

        if (companyNameList.isEmpty()) {
            System.out.println("没有要搜索的公司,程序即将关闭。。。");
            return;
        }
        System.out.println("程序将要搜索 " + companyNameList.size() + " 个公司的信息。。。");
        System.out.println();

        if (loginCookie().length() == 0) {
            System.err.println("警告:未设置登录 cookie(系统属性 tianyancha.cookie 或环境变量 TIANYANCHA_COOKIE),查询可能会被重定向到登录页面。");
        }

        HttpClient httpClient = HttpClientBuilder.create().build();
        HttpClientContext context = HttpClientContext.create();
        Scanner scanner = new Scanner(System.in);
        // Bound every phase of a request. Without a socket timeout a request whose response
        // never arrives blocks forever.
        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectTimeout(CONNECT_TIMEOUT_MILLIS)
                .setConnectionRequestTimeout(CONNECT_TIMEOUT_MILLIS)
                .setSocketTimeout(SOCKET_TIMEOUT_MILLIS)
                .build();
        Random random = new Random();

        try {
            for (int index = 0; index < companyNameList.size(); index++) {
                String companyName = companyNameList.get(index);
                int ordinal = index + 1;

                Outcome outcome;
                try {
                    outcome = crawlCompany(ordinal, companyName, directory,
                            httpClient, context, requestConfig, scanner);
                } catch (IOException e) {
                    // One failed request must not end the whole batch. No page was saved, so
                    // the company is retried on the next run.
                    System.err.println(ordinal + "、获取 " + companyName + " 的信息失败,将跳过该公司:" + e);
                    outcome = Outcome.SKIPPED;
                }

                if (outcome == Outcome.ABORTED) {
                    break;
                }
                if (outcome == Outcome.SKIPPED) {
                    continue;
                }

                System.out.println();
                System.out.println(ordinal + "、提取:" + companyName + " 信息完成");
                System.out.println();
                if (index == companyNameList.size() - 1) {
                    System.out.println("搜索完成,程序即将结束。。。");
                } else {
                    // Random 1-2 second pause between companies.
                    int time = random.nextInt(2) + 1;
                    System.out.println("系统暂停:" + time + "秒");
                    System.out.println();
                    pauseSeconds(time);
                }
            }
        } finally {
            // The scanner is closed only once, after the last prompt; closing it earlier
            // closes System.in and breaks every later verification prompt.
            scanner.close();
            if (httpClient instanceof Closeable) {
                ((Closeable) httpClient).close();
            }
        }
    }

    /**
     * Reads the company names, one per line, stripping all spaces and ignoring blank lines.
     *
     * @param companyFile the UTF-8 text file to read
     * @return the company names in file order
     * @throws IOException if the file cannot be read
     */
    private static List<String> readCompanyNames(File companyFile) throws IOException {
        List<String> names = new ArrayList<String>();
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(companyFile), CHARSET));
        try {
            boolean firstLine = true;
            String line;
            while ((line = reader.readLine()) != null) {
                if (firstLine && line.startsWith("\uFEFF")) {
                    // Drop the byte-order mark some editors prepend to UTF-8 files.
                    line = line.substring(1);
                }
                firstLine = false;
                String name = line.trim().replaceAll(" ", "");
                if (name.length() > 0) {
                    names.add(name);
                }
            }
        } finally {
            reader.close();
        }
        return names;
    }

    /**
     * Removes from {@code companyNameList} every company that already has a saved
     * {@code <name>.html} page in {@code directory}.
     */
    private static void skipAlreadyFetched(List<String> companyNameList, File directory) {
        String[] fileNames = directory.list();
        if (fileNames == null) {
            // Not a directory, or it could not be listed: nothing to skip.
            return;
        }
        for (String fileName : fileNames) {
            if (!fileName.endsWith(HTML_SUFFIX)) {
                continue;
            }
            String name = fileName.substring(0, fileName.length() - HTML_SUFFIX.length());
            if (companyNameList.remove(name)) {
                System.out.println(name + " 已经搜索过,该公司将被跳过。。。");
                System.out.println();
            }
        }
    }

    /**
     * Searches for one company, downloads the detail page of the first hit and prints its
     * registration table.
     *
     * @return how the main loop should proceed
     * @throws IOException if a request or the file handling fails
     */
    private static Outcome crawlCompany(int ordinal, String companyName, File directory,
            HttpClient httpClient, HttpClientContext context, RequestConfig requestConfig,
            Scanner scanner) throws IOException {
        System.out.println(ordinal + "、正在获取 " + companyName + " 的信息。。。");
        System.out.println();

        // The name is URL-encoded so characters such as '&', '#' or whitespace cannot corrupt
        // the query string.
        HttpGet searchRequest = newRequest(
                SEARCH_URL + java.net.URLEncoder.encode(companyName, CHARSET), requestConfig);
        HttpResponse response = httpClient.execute(searchRequest, context);
        System.out.println("HTTP请求执行完成。。。");
        response = passRobotVerification(httpClient, searchRequest, response, context, scanner);
        if (response == null) {
            System.out.println("没有完成机器人验证,程序结束运行。。。。");
            System.out.println();
            return Outcome.ABORTED;
        }
        System.out.println(ordinal + "、搜索 " + companyName + " 信息完成");
        System.out.println();

        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return Outcome.COMPLETED;
        }

        // Extract the detail-page URL of the first search hit. The base URI lets jsoup
        // resolve relative links for "abs:href".
        String html = EntityUtils.toString(entity, CHARSET);
        String url = Jsoup.parse(html, BASE_URL).select("a.name").attr("abs:href");
        if ("".equals(url)) {
            System.out.println("无法在天眼查网站查询到:" + companyName + " 的信息");
            System.out.println();
            System.out.println("系统暂停1秒。。。。");
            pauseSeconds(1);
            return Outcome.SKIPPED;
        }
        System.out.println(ordinal + "、获取 " + companyName + " 详情URL成功。。。");
        System.out.println();

        System.out.println("准备获取详细信息。。。");
        HttpGet detailRequest = newRequest(url, requestConfig);
        response = httpClient.execute(detailRequest, context);
        System.out.println("HTTP请求执行完成");
        response = passRobotVerification(httpClient, detailRequest, response, context, scanner);
        if (response == null) {
            System.out.println("没有完成机器人验证。。。。");
            return Outcome.ABORTED;
        }

        HttpEntity detailEntity = response.getEntity();
        if (detailEntity == null) {
            throw new IOException("详情页响应没有内容:" + url);
        }
        File page = new File(directory, companyName + HTML_SUFFIX);
        saveEntity(detailEntity, page);

        return printBusinessInfo(ordinal, companyName, page) ? Outcome.COMPLETED : Outcome.SKIPPED;
    }

    /** Creates a GET request for {@code url} with the default headers and timeouts applied. */
    private static HttpGet newRequest(String url, RequestConfig requestConfig) {
        HttpGet httpGet = new HttpGet(url);
        setHttpHeaders(httpGet);
        httpGet.setConfig(requestConfig);
        return httpGet;
    }

    /**
     * Handles a possible robot check for {@code response}.
     *
     * <p>If the response is not flagged by {@link #checkRobotVerification}, it is returned
     * unchanged. Otherwise the verification URL is printed, the method blocks until the
     * operator types {@code ok}, and the request is executed once more.
     *
     * @return a response that passed the check, or {@code null} if the verification was not
     *         completed (second attempt still rejected, or no more console input)
     * @throws IOException if re-executing the request fails
     */
    private static HttpResponse passRobotVerification(HttpClient httpClient, HttpGet httpGet,
            HttpResponse response, HttpClientContext context, Scanner scanner)
            throws IOException {
        if (!checkRobotVerification(response, context)) {
            return response;
        }

        List<URI> redirectLocations = context.getRedirectLocations();
        // Release the connection held by the rejected response before issuing another
        // request; leaked connections can exhaust the client's connection pool.
        EntityUtils.consumeQuietly(response.getEntity());

        System.out.println("注意!出现机器人验证,请点击下面的链接,在验证完后输入 ok 继续运行。。。");
        System.out.println();
        // The redirect list is null when the request was not redirected; fall back to the
        // requested URL in that case.
        if (redirectLocations == null || redirectLocations.isEmpty()) {
            System.out.println(httpGet.getURI());
        } else {
            System.out.println(redirectLocations.get(0));
        }
        System.out.println();

        while (true) {
            System.out.print("完成验证后,请在此处输入OK:");
            if (!scanner.hasNextLine()) {
                // Console input ended; the verification can no longer be confirmed.
                return null;
            }
            if ("ok".equalsIgnoreCase(scanner.nextLine())) {
                break;
            }
        }
        System.out.println();

        HttpResponse retried = httpClient.execute(httpGet, context);
        if (checkRobotVerification(retried, context)) {
            EntityUtils.consumeQuietly(retried.getEntity());
            return null;
        }
        return retried;
    }

    /**
     * Streams the entity's content into {@code target}. A partially written file is deleted
     * on failure so that it is not mistaken for a finished download when the run is resumed.
     */
    private static void saveEntity(HttpEntity entity, File target) throws IOException {
        InputStream content = entity.getContent();
        try {
            FileOutputStream fileOutputStream = new FileOutputStream(target);
            boolean complete = false;
            try {
                byte[] buff = new byte[2048];
                int length;
                while ((length = content.read(buff, 0, buff.length)) != -1) {
                    fileOutputStream.write(buff, 0, length);
                }
                complete = true;
            } finally {
                try {
                    fileOutputStream.close();
                } finally {
                    if (!complete) {
                        target.delete();
                    }
                }
            }
        } finally {
            content.close();
        }
    }

    /**
     * Prints the registration table (the second {@code tbody}) of a saved detail page as
     * "label : value" pairs, one table row per output line.
     *
     * @return {@code false} if the page has fewer than two {@code tbody} elements
     * @throws IOException if the page cannot be read
     */
    private static boolean printBusinessInfo(int ordinal, String companyName, File page)
            throws IOException {
        Document document = Jsoup.parse(page, CHARSET);
        Elements tbodys = document.select("tbody");
        if (tbodys.size() < 2) {
            System.out.println("注意!" + companyName + " 无法查询到工商信息。。。");
            return false;
        }

        Elements rows = tbodys.get(1).select("tr");
        System.out.println(ordinal + "、" + companyName + " 的工商信息如下:");
        System.out.println();
        for (Element row : rows) {
            Elements tds = row.select("td");
            for (int j = 0; j < tds.size(); j++) {
                if (j % 2 == 0) {
                    // Even cells are labels; only the text before the first space is kept.
                    System.out.print(tds.get(j).text().split(" ")[0] + " : ");
                } else {
                    System.out.print(tds.get(j).text() + "\t\t\t");
                }
            }
            System.out.println();
            System.out.println();
        }
        return true;
    }

    /** Sleeps for the given number of seconds, restoring the interrupt flag if interrupted. */
    private static void pauseSeconds(int seconds) {
        try {
            Thread.sleep(seconds * 1000L);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Returns the login cookie from the system property {@code tianyancha.cookie} or, if that
     * is unset, from the environment variable {@code TIANYANCHA_COOKIE}; empty if neither is
     * set.
     */
    private static String loginCookie() {
        String cookie = System.getProperty("tianyancha.cookie");
        if (cookie == null) {
            cookie = System.getenv("TIANYANCHA_COOKIE");
        }
        return cookie == null ? "" : cookie;
    }

    /**
     * Checks whether a response indicates a robot verification. Any status other than
     * HTTP 200 is treated as one.
     *
     * @param response the response to inspect
     * @param context  the request context (currently unused)
     * @return {@code true} if the status code is not 200
     */
    public static boolean checkRobotVerification(HttpResponse response, HttpClientContext context) {
        StatusLine statusLine = response.getStatusLine();
        return statusLine.getStatusCode() != HttpStatus.SC_OK;
    }

    /**
     * Applies the default browser-like request headers, including the login cookie.
     *
     * <p>The cookie is taken from the system property {@code tianyancha.cookie} or the
     * environment variable {@code TIANYANCHA_COOKIE} (copy it from a logged-in browser
     * session). Session cookies contain credentials and must not be hard-coded here.
     *
     * @param httpGet the request that receives the headers
     */
    public static void setHttpHeaders(HttpGet httpGet) {
        httpGet.setHeader("Cookie", loginCookie());
        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.9");
        httpGet.setHeader("Connection", "keep-alive");
        httpGet.setHeader("Host", "www.tianyancha.com");
        httpGet.setHeader("Referer", "https://www.tianyancha.com/");
        httpGet.setHeader("Upgrade-Insecure-Requests", "1");
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36");
    }

    /**
     * Placeholder kept for compatibility; currently does nothing.
     *
     * @param response unused
     * @param context  unused
     */
    public static void UITips(HttpResponse response, HttpClientContext context) {
    }
}
下面是生成excel的代码:
package cn.xiaoyanol.crawler;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Extracts the data of the saved detail pages in the {@code info} directory into the Excel
 * workbook {@code 查询结果.xlsx}.
 *
 * <p>Each {@code <name>.html} page becomes one row: the searched name, the company name found
 * in the page's {@code h1}, every second cell of the second {@code tbody}, and finally the
 * text of the first link in the first {@code tbody}. Pages that cannot be parsed are reported
 * and skipped.
 *
 * @author jenrey
 * Created 2018-10-16, 2:15 PM
 */
public class GetExcel {

    /** Directory containing the downloaded detail pages. */
    private static final String INFO_DIRECTORY = "info";
    /** Suffix of the detail pages. */
    private static final String HTML_SUFFIX = ".html";
    /** Name of the worksheet that receives the rows. */
    private static final String SHEET_NAME = "sheet1";
    /** Column titles written to the first row of a newly created sheet. */
    private static final String[] HEADERS = {
        "搜索公司名", "实际公司名", "工商注册号", "组织机构代码", "统一信用代码", "公司类型",
        "纳税人识别号", "行业", "营业期限", "核准日期", "纳税人资质", "人员规模", "实缴资本",
        "登记机关", "参保人数", "英文名称", "注册地址", "经营范围", "法定代表人",
    };

    /**
     * Program entry point.
     *
     * @param args unused
     * @throws IOException            if the workbook cannot be read or written
     * @throws InvalidFormatException declared for backward compatibility
     */
    public static void main(String[] args) throws IOException, InvalidFormatException {
        File directory = new File(INFO_DIRECTORY);
        if (!directory.exists()) {
            System.out.println("文件夹不存在,程序结束运行");
            return;
        }

        // Collect the pages to extract. Only real ".html" files qualify, because the company
        // name is derived by cutting that suffix off the file name.
        List<String> fileNames = new ArrayList<String>();
        String[] files = directory.list();
        if (files != null) {
            for (String file : files) {
                if (file.endsWith(HTML_SUFFIX)) {
                    fileNames.add(file);
                }
            }
        }

        File xlsxFile = new File("查询结果.xlsx");
        Workbook workbook = openOrCreateWorkbook(xlsxFile);
        Sheet sheet = workbook.getSheet(SHEET_NAME);
        if (sheet == null) {
            // Existing workbook without the expected sheet: create it instead of failing.
            sheet = workbook.createSheet(SHEET_NAME);
            writeHeader(sheet);
        }

        // Data rows start right below the header row.
        int rowNum = 1;
        for (String fileName : fileNames) {
            try {
                String searchedName =
                        fileName.substring(0, fileName.length() - HTML_SUFFIX.length());
                List<String> messageList =
                        extractRow(new File(directory, fileName), searchedName);
                Row row = sheet.createRow(rowNum++);
                for (int i = 0; i < messageList.size(); i++) {
                    row.createCell(i).setCellValue(messageList.get(i));
                }
                System.out.println("rowNum:" + (rowNum - 1) + " " + fileName);
            } catch (Exception e) {
                // Best-effort batch: one unreadable page must not stop the export, but the
                // reason is reported so it can be investigated.
                System.out.println(fileName + "-------------------");
                System.err.println(fileName + ": " + e);
            }
        }

        // Write the workbook once, after all rows were added, instead of after every row.
        FileOutputStream outputStream = new FileOutputStream(xlsxFile);
        try {
            workbook.write(outputStream);
        } finally {
            outputStream.close();
        }
    }

    /**
     * Loads the workbook from {@code xlsxFile}, or creates a new in-memory workbook with a
     * header-only sheet if the file does not exist.
     */
    private static Workbook openOrCreateWorkbook(File xlsxFile) throws IOException {
        if (!xlsxFile.exists()) {
            XSSFWorkbook created = new XSSFWorkbook();
            writeHeader(created.createSheet(SHEET_NAME));
            return created;
        }
        FileInputStream fileInputStream = new FileInputStream(xlsxFile);
        try {
            return new XSSFWorkbook(fileInputStream);
        } finally {
            fileInputStream.close();
        }
    }

    /** Writes the column titles into the first row of {@code sheet}. */
    private static void writeHeader(Sheet sheet) {
        Row header = sheet.createRow(0);
        for (int i = 0; i < HEADERS.length; i++) {
            header.createCell(i).setCellValue(HEADERS[i]);
        }
    }

    /**
     * Extracts the cell values for one company from a saved detail page.
     *
     * @param page         the saved HTML page (read as UTF-8)
     * @param searchedName the company name that was searched for
     * @return the values in column order
     * @throws IOException if the page cannot be read or lacks the expected tables
     */
    private static List<String> extractRow(File page, String searchedName) throws IOException {
        Document document = Jsoup.parse(page, "UTF-8");
        Elements tbodys = document.select("tbody");
        if (tbodys.size() < 2) {
            throw new IOException("page has fewer than two tbody elements");
        }
        Elements links = tbodys.get(0).select("a");
        if (links.isEmpty()) {
            throw new IOException("first tbody contains no link");
        }

        List<String> values = new ArrayList<String>();
        values.add(searchedName);
        // Company name as shown on the page.
        values.add(document.select("h1").text());
        // In the second tbody, even cells are labels and odd cells are values; keep the
        // values only.
        for (Element tr : tbodys.get(1).select("tr")) {
            Elements tds = tr.select("td");
            for (int j = 1; j < tds.size(); j += 2) {
                values.add(tds.get(j).text());
            }
        }
        // Text of the first link in the first tbody fills the last column.
        values.add(links.get(0).text());
        return values;
    }
}