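# Scrape company information from Qichacha (qcc.com)
# - Works around the error raised when dragging the Selenium slider
# - Scrapes the additional legal-representative phone numbers shown to VIP accounts
# - Looks up each company name stored in the database and inserts the results into the database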
# coding:utf-8
import lxml.html, queue, logging, time, json, datetime, requests, re, lxml.html
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from sqlalchemy import create_engine
import pandas as pd
from bs4 import BeautifulSoup as bs
from pyquery import PyQuery as pq
# Today's local date as YYYY-MM-DD; echoed once at start-up. The rest of the
# script uses it to filter the company-name query and to stamp inserted rows.
updateTime = datetime.date.today().strftime('%Y-%m-%d')
print(updateTime)
# Build the browser driver
def get_driver():
    """Create and return a configured Chrome WebDriver.

    The Chrome options drop the ``enable-automation`` switch, turn off the
    automation extension, and set a language and a fixed desktop user-agent
    argument. After the browser starts, a script is registered through the
    ``Page.addScriptToEvaluateOnNewDocument`` CDP command that redefines
    ``navigator.webdriver`` so its getter returns ``undefined``.

    Returns:
        The started ``webdriver.Chrome`` instance.
    """
    chrome_options = webdriver.ChromeOptions()
    # chrome_options.add_argument("headless")  # headless mode is left disabled

    experimental_settings = (
        ('excludeSwitches', ['enable-automation']),
        ("useAutomationExtension", False),
    )
    for setting_name, setting_value in experimental_settings:
        chrome_options.add_experimental_option(setting_name, setting_value)

    launch_arguments = (
        'lang=zh_CN.UTF-8',
        'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"',
    )
    for argument in launch_arguments:
        chrome_options.add_argument(argument)

    browser = webdriver.Chrome(options=chrome_options)

    # Registered before any page is loaded by the caller, so the override is
    # in place for the documents opened afterwards.
    hide_webdriver_flag = """Object.defineProperty(navigator, 'webdriver', {get: () => undefined});"""
    browser.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {"source": hide_webdriver_flag},
    )
    return browser
def _extract_phone_numbers(onclick):
    """Return the phone-like fragments found in an ``onclick`` attribute value.

    The value is split on double quotes; a fragment is kept when it is 8 to 12
    characters long and contains at least one digit.
    """
    return [
        fragment for fragment in onclick.split('"')
        if 7 < len(fragment) < 13 and re.search(r'\d', fragment)
    ]


def _parse_result_row(tr):
    """Build one ``companyinfos`` record from a search-result table row.

    Args:
        tr: PyQuery object wrapping a single ``<tr>`` of the result list.

    Returns:
        dict with the keys ``legalPerson``, ``companyName``, ``phoneNum``,
        ``companyAddress`` and ``updateTime``.

    Raises:
        IndexError: if the row does not contain the expected phone or address
            text (the split/lookup on the missing piece fails).
    """
    details = tr.find(".m-t-xs")
    company_name = tr.find(".ma_h1").text()
    legal_person = details.eq(0).find("a").text()

    contact = details.eq(1)
    # Keep what sits between the phone label and the e-mail label.
    phone_text = contact.text().split(u"邮箱:")[0].replace(' ', '').split('电话:')[1]
    if '更多' in phone_text:
        # A "more" link is present: the numbers are read from the quoted
        # arguments of the link's onclick handler instead of the visible text.
        onclick = contact.find('a')[0].xpath('./@onclick')[0]
        phone = ','.join(_extract_phone_numbers(onclick))
    else:
        phone = phone_text

    address = details.eq(3).text().split(':')[1]
    return {
        'legalPerson': legal_person,
        'companyName': company_name,
        'phoneNum': phone,
        'companyAddress': address,
        'updateTime': updateTime,
    }


# Main flow: open the site, read the company names to look up from MySQL,
# search each one, and append every parsable result row to `companyinfos`.
driver = get_driver()
try:
    driver.get('https://www.qcc.com/')
    login_element = driver.find_element_by_class_name('navi-btn')
    login_element.click()
    # NOTE(review): presumably this pause lets the operator finish logging in
    # by hand in the opened browser -- confirm.
    time.sleep(10)

    # NOTE(review): the credentials part of this URL looks mangled
    # ("[email protected]" resembles an e-mail-protection placeholder);
    # verify it against the real connection settings.
    db = create_engine('mysql+pymysql://root:[email protected]:3306/qcc')
    # updateTime is produced by this script itself (a formatted date), not
    # taken from external input.
    sql = ''' select companyName from companyname where updateTime >= '%s' ; ''' % updateTime
    pd_company = pd.read_sql_query(sql, db)
    lt_company = pd_company['companyName'].tolist()

    for company in lt_company:
        # Go back to the home page so the search box is available again.
        home = driver.find_element_by_class_name('m-r-sm')
        home.click()
        time.sleep(3)

        get_input = driver.find_element_by_id('searchkey')
        get_input.send_keys(company)
        time.sleep(3)

        submit = driver.find_element_by_class_name('index-searchbtn')
        submit.click()
        time.sleep(3)

        # Parse the rendered result page offline with lxml + pyquery.
        soup = pq(lxml.html.fromstring(driver.page_source))
        result_rows = soup.find('.m_srchList').find("tr")

        for tr in result_rows.items():
            try:
                record = _parse_result_row(tr)
            except IndexError:
                # The row lacks the expected phone/address fields; skip it.
                logging.debug("Skipping unparsable result row for %s", company)
                continue
            except Exception:
                # Stay best-effort per row, but leave a trace of what failed.
                logging.exception("Failed to parse a result row for %s", company)
                continue

            try:
                insert_data = pd.DataFrame([record])
                insert_data.to_sql('companyinfos', db, if_exists='append', index=False)
            except Exception:
                logging.exception(
                    "Failed to insert record for %s", record['companyName'])
                continue

            print(record['companyName'], record['legalPerson'],
                  record['phoneNum'], record['companyAddress'])
finally:
    # Always release the browser and its driver process, even on failure
    # or Ctrl-C.
    driver.quit()