爬虫-企查查

爬取企查查相关信息

  • 解决selenium滑块拖动报错问题
  • 爬取vip更多法人手机号
  • 根据数据库公司名称查取信息,最终插库
# coding:utf-8
import lxml.html, queue, logging, time, json, datetime, requests, re, lxml.html
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from sqlalchemy import create_engine
import pandas as pd
from bs4 import BeautifulSoup as bs
from pyquery import PyQuery as pq

# Today's date stamp (YYYY-MM-DD); used to filter the DB query below and to
# tag rows inserted back into the database.
updateTime = datetime.date.today().isoformat()
print(updateTime)


# 获取浏览器driver
def get_driver():
    option = webdriver.ChromeOptions()
    # option.add_argument("headless")
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    option.add_experimental_option("useAutomationExtension", False)
    option.add_argument('lang=zh_CN.UTF-8')
    option.add_argument(
        'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"')

    driver = webdriver.Chrome(options=option)

    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    
    
        "source": """Object.defineProperty(navigator, 'webdriver', {get: () => undefined});""",
    })
    return driver


driver = get_driver()
driver.get('https://www.qcc.com/')
# Open the login dialog; the 10s sleep gives the operator time to complete
# the login manually (e.g. scan the QR code) before scraping starts.
login_element = driver.find_element_by_class_name('navi-btn')
login_element.click()
time.sleep(10)


# NOTE(review): DB credentials are hard-coded in the DSN — consider moving
# them to environment variables or a config file.
db = create_engine('mysql+pymysql://root:[email protected]:3306/qcc')
# updateTime is a locally generated YYYY-MM-DD string, so interpolating it
# into the SQL text cannot inject anything.
sql = ''' select companyName from companyname where updateTime >= '%s' ; ''' % updateTime
pd_company = pd.read_sql_query(sql, db)
# Series.tolist() replaces the manual range(len(...)) index loop.
lt_company = pd_company['companyName'].tolist()

for company in lt_company:
    # Go back to the home page, type the company name and submit the search.
    home = driver.find_element_by_class_name('m-r-sm')
    home.click()
    time.sleep(3)
    search_box = driver.find_element_by_id('searchkey')
    search_box.send_keys(company)
    time.sleep(3)
    submit = driver.find_element_by_class_name('index-searchbtn')
    submit.click()
    time.sleep(3)
    # Parse the result page; each <tr> inside .m_srchList is one company hit.
    parser = lxml.html.fromstring(driver.page_source)
    soup = pq(parser)
    result_rows = soup.find('.m_srchList').find("tr")
    for tr in result_rows.items():
        try:
            mtxs = tr.find(".m-t-xs")
            companyName = tr.find(".ma_h1").text()
            legalPerson = mtxs.eq(0).find("a").text()
            # Text between "电话:" and "邮箱:" holds the phone number(s).
            pre_phone = mtxs.eq(1).text().split(u"邮箱:")[0].replace(' ', '').split('电话:')[1]
            if '更多' in pre_phone:
                # VIP rows hide extra numbers behind a "更多" link whose
                # onclick argument carries the full list; extract every
                # plausible phone-length token that contains a digit.
                more_link = mtxs.eq(1).find('a')[0]
                onclick_parts = more_link.xpath('./@onclick')[0].split('"')
                phone = ','.join(
                    num for num in onclick_parts
                    if 13 > len(num) > 7 and re.search(r'\d', num)
                )
            else:
                phone = pre_phone
            address = mtxs.eq(3).text().split(':')[1]
            record = {
                'legalPerson': legalPerson,
                'companyName': companyName,
                'phoneNum': phone,
                'companyAddress': address,
                'updateTime': updateTime,
            }
            # DataFrame.to_sql replaces the deprecated pd.io.sql.to_sql call.
            pd.DataFrame([record]).to_sql('companyinfos', db, if_exists='append', index=False)
            print(companyName, legalPerson, phone, address)
        except Exception as e:
            # Skip malformed rows (ads, headers, missing fields) but leave a
            # trace instead of silently swallowing every error.
            print('skip row for %s: %s' % (company, e))


转载自 blog.csdn.net/weixin_46046193/article/details/108605300