Scraping company data for Anhui Province from the Qichacha website

The Qichacha website aggregates detailed information on registered companies. To make it easier to look up company information, I scraped the companies located in Anhui Province from the site. The problems I ran into and the techniques I used are described below:

1. Problems encountered:

  1> The desktop version of Qichacha only shows the first 500 pages of results. To capture as much data as possible, the crawl was split up by city, covering 16 cities in Anhui Province for a total of roughly 80,000 company records;

  2> If you crawl the site too fast, it triggers manual verification. To get around this, and to avoid having the account banned, I simply rotate random IP proxies. Free proxies can be obtained from the "89 Free Proxy" site at http://www.89ip.cn/: you can grab 30 proxy IPs at a time, and if that is not enough you can extract more and build a proxy pool. In my experience the free proxies from this site work much better than the free ones from Xici Proxy or Kuaidaili (a sketch of building such a pool is shown right after this item).
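The proxy-pool helpers imported in the core code below (ippools.ProxySpider and proxy_ip.IP_LIST) are not included in this post. The following is a minimal sketch of how such a pool might be built from the 89ip.cn free proxy list; the fetch_proxy_pool helper, the index_{}.html pagination pattern, and the regex over the listing page are my own assumptions, not part of the original spider.

import random
import re
import requests

def fetch_proxy_pool(pages=1):
    """Sketch of a free-proxy pool built from www.89ip.cn.
    Assumes each listing page shows IP and port near each other in the HTML;
    adjust the URL pattern / regex if the site layout differs."""
    pool = []
    for page in range(1, pages + 1):
        url = 'http://www.89ip.cn/index_{}.html'.format(page)
        html = requests.get(url, timeout=5).text
        # crude extraction of "ip ... port" pairs from the listing page
        for ip, port in re.findall(r'(\d{1,3}(?:\.\d{1,3}){3})\D+?(\d{2,5})', html):
            proxy = 'http://{}:{}'.format(ip, port)
            # requests expects a dict mapping scheme -> proxy URL
            pool.append({'http': proxy, 'https': proxy})
    return pool

# usage: pick a random proxy for every request
# proxies = random.choice(fetch_proxy_pool(pages=3))
# requests.get('https://www.qichacha.com/', proxies=proxies, timeout=3)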

    

2. Techniques used:

  1> Request module: requests; to avoid anti-scraping measures, every request goes through a random proxy, and fake_useragent generates a random User-Agent;

  2> Parsing: XPath and regular expressions;

  3> Speed optimization: multi-threading, plus saving the scraped data in batches to avoid frequent disk I/O (a sketch of batched saving follows this list);
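The idea behind the batching optimization is to buffer records in memory and write them to disk in chunks rather than opening the file for every row. Below is a minimal sketch of such a buffered writer under the same CSV layout as the spider; the BufferedWriter class, its batch_size parameter, and the lock-based flushing are illustrative choices, not part of the original code.

import csv
import threading

class BufferedWriter:
    """Buffer rows in memory and flush them to a CSV file in batches,
    so worker threads do not hit the disk for every single record."""

    def __init__(self, path, batch_size=500):
        self.path = path
        self.batch_size = batch_size
        self.buffer = []
        self.lock = threading.Lock()  # protects the buffer across threads

    def add(self, row):
        with self.lock:
            self.buffer.append(row)
            if len(self.buffer) >= self.batch_size:
                self._flush()

    def _flush(self):
        # called with the lock held
        with open(self.path, 'a', encoding='utf-8', newline='') as f:
            csv.writer(f).writerows(self.buffer)
        self.buffer = []

    def close(self):
        # flush whatever is left when the spider finishes
        with self.lock:
            if self.buffer:
                self._flush()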

3. The core code is as follows:

  

import requests
from lxml import etree
from queue import Queue
from threading import Thread
from fake_useragent import UserAgent
import csv
import os
import re
import random
import time
# local helper modules for the proxy pool (not included in this post)
from ippools import ProxySpider
from proxy_ip import IP_LIST


class QichachaSpider:
    def __init__(self):
        self.url = 'https://www.qichacha.com/gongsi_area.html?prov={}&city={}&p={}'
        self.q = Queue()
        self.company_info = []
        self.headers = {
            'Host': 'www.qichacha.com',
            'Referer': 'https://www.qichacha.com/',
            'X-Requested-With': 'XMLHttpRequest'
        }

    # Random User-Agent
    def random_ua(self):
        ua = UserAgent()
        return ua.random

    # Random proxy IP
    def random_proxy(self):
        proxy_list = ProxySpider().get_training_ip('https://www.qichacha.com/')
        return proxy_list

    # Build the target URLs and put them on the queue
    def put_url(self):
        self.headers['User-Agent'] = self.random_ua()
        url = 'https://www.qichacha.com/'
        html = requests.get(url, headers=self.headers).content.decode('utf-8', 'ignore')
        parse_html = etree.HTML(html)
        r_list = parse_html.xpath('//div[@class="areacom"]/div[2]/div[2]/a/@href')
        for r in r_list:
            link = r.split('_')[1:]
            for i in range(1, 501):
                url = self.url.format(link[0], link[1], i)
                print(url)
                self.q.put(url)

    # Scrape the first-level (listing) pages
    def get_data(self):
        while True:
            if not self.q.empty():
                url = self.q.get()
                self.headers['User-Agent'] = self.random_ua()
                # proxies = self.random_proxy()
                proxies = random.choice(IP_LIST)
                try:
                    html = requests.get(url, headers=self.headers, proxies=proxies,
                                        timeout=3).content.decode('utf-8', 'ignore')
                    # html = requests.get(url, headers=self.headers).content.decode('utf-8', 'ignore')
                    # time.sleep(random.uniform(0.5, 1.5))
                    parse_html = etree.HTML(html)
                    company_list = parse_html.xpath('//table[@class="m_srchList"]/tbody/tr')
                    # xpath() returns a list, so just check that it is non-empty
                    if company_list:
                        for company in company_list:
                            try:
                                company_name = company.xpath('./td[2]/a/text()')[0].strip()
                                company_link = 'https://www.qichacha.com' + company.xpath('./td[2]/a/@href')[0].strip()
                                company_type, company_industry, company_business_scope = self.get_company_info(
                                    company_link)
                                company_person = company.xpath('./td[2]/p[1]/a/text()')[0].strip()
                                # the listing page shows values like "注册资本：1000万元",
                                # so split on the full-width colon and keep only the value part
                                company_money = company.xpath('./td[2]/p[1]/span[1]/text()')[0].split('：')[-1].strip()
                                company_time = company.xpath('./td[2]/p[1]/span[2]/text()')[0].split('：')[-1].strip()
                                company_email = company.xpath('./td[2]/p[2]/text()')[0].split('：')[-1].strip()
                                company_phone = company.xpath('./td[2]/p[2]/span/text()')[0].split('：')[-1].strip()
                                company_address = company.xpath('./td[2]/p[3]/text()')[0].split('：')[-1].strip()
                                company_status = company.xpath('./td[3]/span/text()')[0].strip()
                                company_dict = {
                                    '公司名称': company_name,
                                    '公司链接': company_link,
                                    '公司类型': company_type,
                                    '所属行业': company_industry,
                                    '经营范围': company_business_scope,
                                    '公司法人': company_person,
                                    '注册资本': company_money,
                                    '注册时间': company_time,
                                    '邮箱': company_email,
                                    '电话': company_phone,
                                    '地址': company_address,
                                    '是否存续': company_status,
                                }
                                print(company_dict)
                                # self.company_info.append(
                                #     (company_name, company_link, company_type, company_industry, company_business_scope,
                                #      company_person, company_money, company_time, company_email, company_phone,
                                #      company_address, company_status))
                                info_list = [company_name, company_link, company_type, company_industry,
                                             company_business_scope, company_person, company_money, company_time,
                                             company_email, company_phone, company_address, company_status]
                                self.save_data(info_list)

                            except Exception:
                                # record the URL whose row failed to parse
                                with open('./bad.csv', 'a', encoding='utf-8', newline='') as f:
                                    writer = csv.writer(f)
                                    writer.writerow([url])
                                continue
                except Exception:
                    # request failed (timeout / bad proxy): re-queue the URL for retry
                    self.q.put(url)

            else:
                break

    # Scrape the second-level (company detail) pages
    def get_company_info(self, company_link):
        headers = {'User-Agent': UserAgent().random}
        html = requests.get(company_link, headers=headers, proxies=random.choice(IP_LIST),
                            timeout=3).content.decode('utf-8', 'ignore')
        # keep retrying with a new random proxy until the real detail page
        # (which contains "企业类型") is returned instead of a verification page
        while '企业类型' not in html:
            html = requests.get(company_link, headers=headers, proxies=random.choice(IP_LIST),
                                timeout=3).content.decode('utf-8', 'ignore')
        try:
            company_type = re.findall(r'企业类型</td> <td class="">(.*?)</td>', html, re.S)[0].strip()
            company_industry = re.findall(r'所属行业</td> <td class="">(.*?)</td>', html, re.S)[0].strip()
            company_business_scope = re.findall(r'经营范围.*?"3">(.*?)</td>', html, re.S)[0].strip()
            return company_type, company_industry, company_business_scope
        except:
            return '', '', ''

    # Save data
    def save_data(self, info):
        with open('./1111.csv', 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(info)

    def main(self):
        # start from a fresh CSV file and always write the header row
        if os.path.exists('./1111.csv'):
            os.remove('./1111.csv')
        with open('./1111.csv', 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(
                ['公司名称', '公司链接', '公司类型', '所属行业', '经营范围', '公司法人', '注册资本', '注册时间', '邮箱', '电话', '地址', '是否存续'])
        self.put_url()
        t_list = []
        for i in range(0, 10):
            t = Thread(target=self.get_data)
            t_list.append(t)
            t.start()

        for j in t_list:
            j.join()


if __name__ == "__main__":
    spider = QichachaSpider()
    spider.main()

    


Reposted from www.cnblogs.com/yuxiangyang/p/11255947.html