抖抖代码-爬取企查查企业信息

这里只写用到的工具和解题思路——授人以鱼不如授人以渔。

# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup
import os
import time
import db


# Search endpoint; the company name is appended as the value of ``key``.
base_url = 'https://www.qcc.com/web/search?key='

# Request headers sent with every search request.
# The 'Cookie' value is a placeholder and must be replaced with your own cookie.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Cookie': '自己的cookie',
    'Host': 'www.qcc.com',
    'Referer': 'https://www.qcc.com/',
    'TE': 'Trailers',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0',
}

# Module-level record holding the fields of the most recently scraped company.
companyInfo = {}

# Turn on during development: enables reading/writing the cache.txt response cache.
debug = False


# 创建文件方法:
def create_file(filename):
    """
    Create ``filename`` as an empty UTF-8 file, creating missing parent folders first.

    An already existing file is left untouched.

    :param filename: path of the file to create; may be a bare file name
                     without any directory component
    :return: None
    """
    path = os.path.dirname(filename)
    # A bare name such as "cache.txt" has no directory part. Slicing up to
    # rfind("/") (which is -1 in that case) would chop the last character and
    # create a bogus "cache.tx" directory, so only create folders when there
    # actually is a directory component.
    if path:
        os.makedirs(path, exist_ok=True)
    if not os.path.isfile(filename):
        # Opening in "w" mode is enough to create the empty file.
        with open(filename, mode="w", encoding="utf-8"):
            pass


def fileWrite(content):
    """
    Overwrite ``cache.txt`` in the current working directory with ``content``.

    :param content: text to store; written as UTF-8
    :return: None
    """
    # Mode "w" creates the file when it is missing, so no separate
    # pre-creation step is needed. The context manager guarantees the handle
    # is closed even if the write fails.
    with open('cache.txt', 'w', encoding='utf-8') as f:
        f.write(content)


def fileOpen(fileName='cache.txt'):
    """
    Read a UTF-8 text file and return its whole content.

    :param fileName: path of the file to read (defaults to ``cache.txt``)
    :return: the file content as a string, or ``False`` when the file
             does not exist
    """
    # Check the file that was actually requested, not the hard-coded default.
    if not os.path.isfile(fileName):
        return False
    # Context manager so the handle is always closed after reading.
    with open(fileName, 'r', encoding='utf-8') as f:
        return f.read()


def _fetch_search_html(companyName):
    """Return the search-result HTML for ``companyName``, or None on a non-200 response."""
    from urllib.parse import quote

    proxies = {}
    if debug:
        # In debug mode a previously cached response is reused as-is
        # (regardless of which company it was fetched for).
        cached = fileOpen()
        if cached:
            print('缓存读取成功')
            return cached
    # Percent-encode the name so characters such as '&' or '#' cannot break the query string.
    url = base_url + quote(companyName)
    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
    if response.status_code != 200:
        print(response.status_code)
        print('ERROR')
        # Do not cache or parse an error page.
        return None
    # Decode the body as UTF-8 before .text is read.
    response.encoding = 'utf-8'
    if debug:
        fileWrite(response.text)  # store the response in the cache file
    return response.text


def _parse_first_result(soup):
    """Extract the fields of the first search-result row, or return None when there are no rows."""
    rows = soup.find_all('tr')
    if not rows:
        return None
    # The first row is the top-ranked search hit; its third cell carries the summary.
    desc = rows[0].find_all('td')[2]
    title_link = desc.find('a', class_='title')
    values = desc.find_all('span', class_='val')
    site_links = desc.find_all('a', attrs={"title": "进入官网"})
    return {
        'title': title_link.text,                                   # company title
        'detailUrl': title_link.attrs['href'],                      # detail page link
        'status': desc.find('span', class_='text-success').text,    # status text
        'faren': values[0].text,                                    # legal representative
        'ziben': values[1].text,                                    # registered capital
        # Index 2 is intentionally skipped, as in the original layout mapping
        # (presumably another field -- verify against the live page).
        'tel': values[3].text,                                      # phone
        'email': desc.find_all('a', attrs={"title": "发送邮件"})[0].text,  # e-mail
        'www': site_links[0].attrs['href'] if site_links else '',   # website, optional
    }


def getCompanyData(companyName=''):
    """
    Look up ``companyName`` on the search page and store the top-ranked hit.

    The search page is fetched (or, in debug mode, read from ``cache.txt``),
    the first result row is parsed, the module-level ``companyInfo`` dict is
    updated with the parsed fields and the record is passed to ``db.insert``.

    :param companyName: company name to search for
    :return: a dict with the keys ``title``, ``detailUrl``, ``status``,
             ``faren``, ``ziben``, ``tel``, ``email`` and ``www`` on success;
             the string ``'未找到相关信息'`` when the page has no result rows;
             ``None`` when fetching, parsing or the insert fails (the error
             is printed)
    """
    # Best-effort contract kept from the original: any failure is printed
    # and reported as None instead of being raised to the caller.
    try:
        text = _fetch_search_html(companyName)
        if text is None:
            return None
        info = _parse_first_result(BeautifulSoup(text, 'lxml'))
        if info is None:
            return '未找到相关信息'
        # Keep the module-level record in sync for code that reads it, but
        # hand out a fresh dict so earlier results are not overwritten by
        # later calls.
        companyInfo.update(info)
        db.insert(info)  # persist to the database
        return info
    except Exception as err:
        print(err)
        return None


if __name__ == '__main__':
    # Interactive loop: keep asking for a company name until an empty line is entered.
    while True:
        name = input('请输入你想查询的公司名称:')
        if not name:
            break
        try:
            result = getCompanyData(name)
            print(result)
        except Exception as err:
            print(err)
        # Pause one second between queries.
        time.sleep(1)

复制完 请顺手 一键三联

猜你喜欢

转载自blog.csdn.net/jackbon8/article/details/114918802