近期根据领导布置的工作,我修改了自己的天眼查爬虫代码:通过 XPath 定位企业工商变更信息,亲测比较好用,但需要提前用 VIP 账号手动登录。分享给大家,如有可改进之处请多指教。
输入是通过excel读取企业list,最终输出是将爬取内容写入另一个新的excel,具体资源已上传本人资源模块。
# -*- coding: utf-8 -*-
"""
@author: xjt
"""
# -*- coding: utf-8 -*-
import time
from selenium import webdriver
from read_comp_names_from_xlsx import read_comp_names_from_xlsx
import json
import solve
from selenium.common.exceptions import NoSuchElementException
import write_data_to_xls
# from log import log_in
def spide_tianyan():
    """Scrape historical business-registration ("工商变更") info from tianyancha.com.

    Reads company names from an Excel file via read_comp_names_from_xlsx(),
    searches each name on the Tianyancha home page, opens the first result,
    and writes the "past IC changes" table into test.xls, one row per company.

    NOTE(review): requires a Chrome session already logged in with a VIP
    account — the xpaths below only resolve for logged-in pages.

    Side effects: launches Chrome, navigates the live site, writes test.xls.
    Returns: None.
    """
    names = read_comp_names_from_xlsx()  # company names from the input Excel
    error_data = []  # companies that could not be scraped
    row = 1  # current output row in test.xls
    driver = webdriver.Chrome()
    try:
        for name in names:
            # Step 1: open the home page and locate the search box.
            try:
                driver.get("https://www.tianyancha.com/")
                time.sleep(0.5)  # give the page a moment to render the box
                search_box = driver.find_element_by_id('home-main-search')
            except NoSuchElementException:
                error_data.append(name)
                continue  # BUG FIX: original fell through with the search box unbound -> NameError
            except Exception as exc:
                # Navigation failed for some other reason; record and move on
                # instead of crashing on the next line (original did `pass`).
                print(exc)
                error_data.append(name)
                continue

            search_box.send_keys(name)  # one call; per-character loop was needless

            # Step 2: select the "company" search scope. The toggle's position
            # varies between page layouts, so fall back to the sibling xpath.
            try:
                driver.find_element_by_xpath(
                    '//*[@id="web-content"]/div/div[1]/div[2]/div/div/div[2]/div[2]/div[2]/div/span').click()
            except Exception as e_1:
                print(e_1)
                driver.find_element_by_xpath(
                    '//*[@id="web-content"]/div/div[1]/div[2]/div/div/div[2]/div[2]/div[1]/div/span').click()
            driver.find_element_by_class_name('input-group-btn').click()  # submit search
            driver.implicitly_wait(10)
            print(driver.current_url)

            # Step 3: grab the first result's link. Result-card layout also
            # varies, hence the fallback xpath.
            try:
                link = driver.find_element_by_xpath('//*[@id="search_company_0"]/div/div[3]/div[1]/a')
            except NoSuchElementException:
                error_data.append(name)
                link = driver.find_element_by_xpath('//*[@id="search_company_0"]/div/div[4]/div[1]/a')
            # BUG FIX: original had a broad `except Exception` here that left
            # `url` unbound/stale before driver.get(url); now any other error
            # propagates instead of silently navigating to the wrong page.
            url = link.get_attribute('href')

            # Step 4: open the company page and read the change-history table.
            driver.get(url)
            driver.implicitly_wait(10)
            his_info = driver.find_element_by_xpath('//*[@id="_container_pastICV2Count"]')
            print(his_info.text)
            try:
                # BUG FIX: original performed this lookup twice.
                company_info = driver.find_element_by_xpath(
                    '//*[@id="_container_pastICV2Count"]/table/tbody')
                print(company_info.text)
                write_data_to_xls.write_data_to_xls([name, company_info.text], row, "test.xls")
            except NoSuchElementException as e2:
                error_data.append(name)
                print(e2)
            except Exception as e1:
                print(e1)
            row += 1
    finally:
        # BUG FIX: quit() (not close()) tears down the whole browser, and the
        # finally guarantees it runs even when a company raises mid-loop.
        driver.quit()
# Run the scraper when executed as a script (not on import).
if __name__ == "__main__":
    spide_tianyan()