The Qichacha (企查查) website aggregates detailed information on registered companies. To make querying company information easier, I crawled the site's records for companies in Anhui Province. The problems I ran into and the techniques I used are as follows:
1. Problems encountered:
1) The Qichacha PC site only displays the first 500 pages of results. To capture as much data as possible, the crawl was run separately for each city, collecting roughly 80,000 company records across the 16 prefecture-level cities of Anhui Province;
2) If you crawl too fast, the site throws up manual verification. To get past the verification, and to avoid getting accounts banned, I simply rotate through random IP proxies. Free proxies can be obtained from the 89 Free Proxy site at http://www.89ip.cn/, which hands out 30 proxy IPs per extraction; if that is not enough, you can extract several times and build a proxy pool. In my experience, this site's free proxies work much better than the free ones from Xici Proxy (西刺代理) or Kuaidaili (快代理). A minimal pool-building sketch follows this list.
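The ippools.ProxySpider and proxy_ip.IP_LIST modules imported by the core code below are my own helpers and are not shown here; the following is a minimal sketch of how an IP_LIST-style pool could be scraped from 89ip.cn. The pagination scheme (index_{}.html) and the table layout (IP in the first cell, port in the second) are assumptions about the page's markup, so adjust the URL pattern and XPath if they differ.

# Minimal sketch of building an IP_LIST-style proxy pool from 89ip.cn.
# Assumption: each proxy sits in a table row with IP and port in the
# first two cells; the pagination pattern is also assumed.
import requests
from lxml import etree

def build_ip_list(pages=3):
    ip_list = []
    for page in range(1, pages + 1):
        url = 'http://www.89ip.cn/index_{}.html'.format(page)  # assumed pagination
        html = requests.get(url, timeout=5).content.decode('utf-8', 'ignore')
        parse_html = etree.HTML(html)
        for row in parse_html.xpath('//table//tr')[1:]:  # skip the header row
            cells = row.xpath('./td/text()')
            if len(cells) >= 2:
                ip, port = cells[0].strip(), cells[1].strip()
                ip_list.append({'http': 'http://{}:{}'.format(ip, port)})
    return ip_list

if __name__ == '__main__':
    IP_LIST = build_ip_list()
    print(len(IP_LIST), 'proxies collected')

Storing each entry as a {'http': ...} mapping means a pool element can be passed straight to the proxies= argument of requests.get(), which is how the core code below consumes IP_LIST.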
2. Techniques used:
1) Request module: requests; to dodge anti-crawling measures, every request goes out through a random proxy, with fake_useragent generating a random User-Agent;
2) Parsing: XPath (via lxml) and regular expressions;
3) Speed optimization: multithreading, plus saving the scraped rows in batches rather than one write per record, to avoid frequent disk I/O; see the sketch after this list.
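Here is a minimal sketch of these two patterns, separate from the spider below: a fetch helper that retries with a fresh random proxy and User-Agent on failure, and a buffered writer that accumulates rows in memory and flushes them to disk in one batch. The names (fetch, BufferedWriter) and the placeholder IP_LIST entry are illustrative, not part of the actual crawler.

# Sketch of the random-UA/random-proxy request and batch-save patterns.
import csv
import random
import requests
from fake_useragent import UserAgent

IP_LIST = [{'http': 'http://1.2.3.4:8080'}]  # placeholder pool

def fetch(url, retries=3):
    for _ in range(retries):
        try:
            headers = {'User-Agent': UserAgent().random}
            resp = requests.get(url, headers=headers,
                                proxies=random.choice(IP_LIST), timeout=3)
            return resp.content.decode('utf-8', 'ignore')
        except requests.RequestException:
            continue  # dead proxy or timeout: retry with a new proxy
    return None

class BufferedWriter:
    def __init__(self, path, batch_size=500):
        self.path, self.batch_size, self.rows = path, batch_size, []

    def add(self, row):
        self.rows.append(row)
        if len(self.rows) >= self.batch_size:
            self.flush()

    def flush(self):
        with open(self.path, 'a', encoding='utf-8', newline='') as f:
            csv.writer(f).writerows(self.rows)  # one disk write per batch
        self.rows = []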
3. Core code:
import requests
from lxml import etree
from queue import Queue
from threading import Thread
from fake_useragent import UserAgent
import csv
import os
import re
import random
import time
from ippools import ProxySpider
from proxy_ip import IP_LIST


class QichachaSpider:
    def __init__(self):
        self.url = 'https://www.qichacha.com/gongsi_area.html?prov={}&city={}&p={}'
        self.q = Queue()
        self.company_info = []
        self.headers = {
            'Host': 'www.qichacha.com',
            'Referer': 'https://www.qichacha.com/',
            'X-Requested-With': 'XMLHttpRequest'
        }

    # Random User-Agent
    def random_ua(self):
        ua = UserAgent()
        return ua.random

    # Random proxy from the live pool (alternative to the static IP_LIST)
    def random_proxy(self):
        proxy_list = ProxySpider().get_training_ip('https://www.qichacha.com/')
        return proxy_list

    # Build the list-page URLs (each city x 500 pages) and enqueue them
    def put_url(self):
        self.headers['User-Agent'] = self.random_ua()
        url = 'https://www.qichacha.com/'
        html = requests.get(url, headers=self.headers).content.decode('utf-8', 'ignore')
        parse_html = etree.HTML(html)
        r_list = parse_html.xpath('//div[@class="areacom"]/div[2]/div[2]/a/@href')
        for r in r_list:
            link = r.split('_')[1:]
            for i in range(1, 501):
                url = self.url.format(link[0], link[1], i)
                print(url)
                self.q.put(url)

    # Scrape the first-level (search result) pages
    def get_data(self):
        while True:
            if not self.q.empty():
                url = self.q.get()
                self.headers['User-Agent'] = self.random_ua()
                # proxies = self.random_proxy()
                proxies = random.choice(IP_LIST)
                try:
                    html = requests.get(url, headers=self.headers, proxies=proxies,
                                        timeout=3).content.decode('utf-8', 'ignore')
                    # time.sleep(random.uniform(0.5, 1.5))
                    parse_html = etree.HTML(html)
                    company_list = parse_html.xpath('//table[@class="m_srchList"]/tbody/tr')
                    if company_list is not None:
                        for company in company_list:
                            try:
                                company_name = company.xpath('./td[2]/a/text()')[0].strip()
                                company_link = 'https://www.qichacha.com' + company.xpath('./td[2]/a/@href')[0].strip()
                                company_type, company_industry, company_business_scope = self.get_company_info(
                                    company_link)
                                company_person = company.xpath('./td[2]/p[1]/a/text()')[0].strip()
                                company_money = company.xpath('./td[2]/p[1]/span[1]/text()')[0].split(':')[-1].strip()
                                company_time = company.xpath('./td[2]/p[1]/span[2]/text()')[0].split(':')[-1].strip()
                                company_email = company.xpath('./td[2]/p[2]/text()')[0].split(':')[-1].strip()
                                company_phone = company.xpath('./td[2]/p[2]/span/text()')[0].split(':')[-1].strip()
                                company_address = company.xpath('./td[2]/p[3]/text()')[0].split(':')[-1].strip()
                                company_status = company.xpath('./td[3]/span/text()')[0].strip()
                                company_dict = {
                                    '公司名称': company_name,
                                    '公司链接': company_link,
                                    '公司类型': company_type,
                                    '所属行业': company_industry,
                                    '经营范围': company_business_scope,
                                    '公司法人': company_person,
                                    '注册资本': company_money,
                                    '注册时间': company_time,
                                    '邮箱': company_email,
                                    '电话': company_phone,
                                    '地址': company_address,
                                    '是否存续': company_status,
                                }
                                print(company_dict)
                                info_list = [company_name, company_link, company_type, company_industry,
                                             company_business_scope, company_person, company_money, company_time,
                                             company_email, company_phone, company_address, company_status]
                                self.save_data(info_list)
                            except Exception:
                                # Log the URL of any page whose rows failed to parse
                                with open('./bad.csv', 'a', encoding='utf-8', newline='') as f:
                                    writer = csv.writer(f)
                                    writer.writerow([url])
                                continue
                except Exception:
                    # Request failed (dead proxy, timeout, ...): put the URL back
                    self.q.put(url)
            else:
                break

    # Scrape the second-level (company detail) pages
    def get_company_info(self, company_link):
        headers = {'User-Agent': UserAgent().random}
        html = requests.get(company_link, headers=headers, proxies=random.choice(IP_LIST),
                            timeout=3).content.decode('utf-8', 'ignore')
        # Retry with a fresh proxy until the detail table actually loads
        while '企业类型' not in html:
            html = requests.get(company_link, headers=headers, proxies=random.choice(IP_LIST),
                                timeout=3).content.decode('utf-8', 'ignore')
        try:
            company_type = re.findall(r'企业类型</td> <td class="">(.*?)</td>', html, re.S)[0].strip()
            company_industry = re.findall(r'所属行业</td> <td class="">(.*?)</td>', html, re.S)[0].strip()
            company_business_scope = re.findall(r'经营范围.*?"3">(.*?)</td>', html, re.S)[0].strip()
            return company_type, company_industry, company_business_scope
        except Exception:
            return '无', '无', '无'

    # Append one row to the output CSV
    def save_data(self, info):
        with open('./1111.csv', 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(info)

    def main(self):
        # Start a fresh output file with a header row
        if os.path.exists('./1111.csv'):
            os.remove('./1111.csv')
        with open('./1111.csv', 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['公司名称', '公司链接', '公司类型', '所属行业', '经营范围', '公司法人',
                             '注册资本', '注册时间', '邮箱', '电话', '地址', '是否存续'])
        self.put_url()
        # Ten worker threads drain the URL queue in parallel
        t_list = []
        for i in range(0, 10):
            t = Thread(target=self.get_data)
            t_list.append(t)
            t.start()
        for j in t_list:
            j.join()


if __name__ == "__main__":
    spider = QichachaSpider()
    spider.main()