Python web scraping case study: scraping ceramics company data

  When scraping with requests, watch out for the exception HTTPConnectionPool(host=xxx, port=xxx): Max retries exceeded with url.... The workarounds used here are:

    ① Call the disable_warnings() method before requests.get(), to silence the InsecureRequestWarning triggered by verify=False. → requests.packages.urllib3.disable_warnings()

    ② Set verify = False in requests.get(). → response = requests.get(url = url,headers = self.__class__.headers,verify = False)

    ③ Set the retry count before making requests. → requests.adapters.DEFAULT_RETRIES = 5
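
  Beyond the global DEFAULT_RETRIES, a more robust option is to mount an HTTPAdapter with a urllib3 Retry policy on a requests.Session, so retries and back-off apply to every request made through that session. This is only a minimal sketch, not part of the original code; the URL below is just the first list page used later:

# A minimal sketch (not part of the original post): configure retries per-session
# via an HTTPAdapter instead of the global requests.adapters.DEFAULT_RETRIES.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retry = Retry(total=5, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retry))
session.mount('https://', HTTPAdapter(max_retries=retry))
requests.packages.urllib3.disable_warnings()
response = session.get('http://gd.taoci163.com/qiye/a440600p1/FoShanShi.html',
                       headers={'Connection': 'close'}, verify=False, timeout=10)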

  See the code below for the detailed steps:

# Author:K
import requests
from lxml import etree
import os
import csv

class CompanySpider(object):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36',
        'Connection': 'close'
    }

    def run(self):
        self.get_urls()

    def get_urls(self):
        # Silence the InsecureRequestWarning triggered by verify=False and set the
        # retry count once, before any request is made.
        requests.packages.urllib3.disable_warnings()
        requests.adapters.DEFAULT_RETRIES = 5
        for page in range(1,46):
            url = 'http://gd.taoci163.com/qiye/a440600p%s/FoShanShi.html' % page
            response = requests.get(url = url,headers = self.__class__.headers,verify = False)
            tree = etree.HTML(response.text)
            # Collect the detail-page links of every company on this list page
            detail_urls = tree.xpath('//div[@class="mainLeft"]//li//span/a[1]/@href')
            self.parse_page(detail_urls)
            print('page %s over!!!' % page)

    def parse_page(self,urls):
        for url in urls:
            data = []
            response = requests.get(url = url,headers = self.__class__.headers,verify = False)
            tree = etree.HTML(response.text)
            try:
                # Company name
                company_name = tree.xpath('//div[@class="conA contact"]/ul/li[1]/span/text()')[0]
                data.append(company_name)
                # Contact person
                contacts_name = tree.xpath('//div[@class="conA contact"]/ul/li[2]/span/text()')[0]
                data.append(contacts_name)
                # Company address
                company_addr = tree.xpath('//div[@class="conA contact"]/ul/li[3]/span/text()')[0]
                data.append(company_addr)
                # Landline phone
                company_phone = tree.xpath('//div[@class="conA contact"]/ul/li[5]/span/text()')[0]
                data.append(company_phone)
                # Mobile phone
                mobile_phone = tree.xpath('//div[@class="conA contact"]/ul/li[6]/span/text()')[0]
                data.append(mobile_phone)
                # Fax
                company_fax = tree.xpath('//div[@class="conA contact"]/ul/li[7]/span/text()')[0]
                data.append(company_fax)
                # Persist the row
                self.save_data(data)
            except Exception as e:
                print(e)

    def save_data(self,data):
        # writer is the module-level csv.writer created in the __main__ block below
        writer.writerow(data)


if __name__ == '__main__':
    if not os.path.exists('H:/陶瓷公司数据'):
        os.mkdir('H:/陶瓷公司数据')
    # newline='' keeps the csv module from writing extra blank lines on Windows
    fp = open('H:/陶瓷公司数据/佛山陶瓷公司_test.csv','w',newline = '',encoding = 'utf-8-sig')
    writer = csv.writer(fp)
    csv_header = ['公司名称','联系人','公司地址','电话','手机','公司传真']
    writer.writerow(csv_header)
    spider = CompanySpider()
    try:
        spider.run()
    except Exception as e:
        print(e)
    fp.close()
Ceramics company data scraping (requests)

  

  Using the scrapy framework is much more convenient, and because it crawls asynchronously it is also much faster. Note that the domain in allowed_domains must be written correctly; this is important! The code is as follows:

# -*- coding: utf-8 -*-
import scrapy
from ..items import ChinaItem


class ChinaCompanySpider(scrapy.Spider):
    name = 'china_company'
    allowed_domains = ['taoci163.com']  # !!! The domain here must be correct -- I fell into this trap !!!
    start_urls = ['http://gd.taoci163.com/qiye/a440600p1/FoShanShi.html']

    def parse(self, response):
        # Queue every company detail page found on the current list page
        detail_urls = response.xpath('//div[@class="mainLeft"]//li//span/a[1]/@href').getall()
        for detail_url in detail_urls:
            yield scrapy.Request(detail_url,callback = self.parse_detail)
        # Follow the "next page" link until there is none
        next_url = response.xpath('//div[@class="page"]/a[last()]/@href').get()
        if next_url:
            yield scrapy.Request(response.urljoin(next_url),callback = self.parse)

    def parse_detail(self,response):
        # Company name
        company_name = response.xpath('//div[@class="conA contact"]/ul/li[1]/span/text()').get()
        # Contact person
        contacts_name = response.xpath('//div[@class="conA contact"]/ul/li[2]/span/text()').get()
        # Company address
        company_addr = response.xpath('//div[@class="conA contact"]/ul/li[3]/span/text()').get()
        # Landline phone
        company_phone = response.xpath('//div[@class="conA contact"]/ul/li[5]/span/text()').get()
        # Mobile phone
        mobile_phone = response.xpath('//div[@class="conA contact"]/ul/li[6]/span/text()').get()
        # Fax
        company_fax = response.xpath('//div[@class="conA contact"]/ul/li[7]/span/text()').get()
        item = ChinaItem(company_name = company_name,contacts_name = contacts_name,
                         company_addr = company_addr,company_phone = company_phone,
                         mobile_phone = mobile_phone,company_fax = company_fax)
        yield item
The spider module in scrapy
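
  The spider imports ChinaItem from items.py, which isn't shown in the post. A minimal sketch of that item, with field names taken from the spider and pipeline (everything else is an assumption):

# items.py -- a minimal sketch, not shown in the original post;
# the field names match those used by the spider and pipeline.
import scrapy


class ChinaItem(scrapy.Item):
    company_name = scrapy.Field()
    contacts_name = scrapy.Field()
    company_addr = scrapy.Field()
    company_phone = scrapy.Field()
    mobile_phone = scrapy.Field()
    company_fax = scrapy.Field()
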
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import csv


class ChinaPipeline(object):
    def __init__(self):
        # newline='' keeps the csv module from writing extra blank lines on Windows
        self.fp = open('H:/陶瓷公司数据/佛山陶瓷公司(scrapy).csv','a+',newline = '',encoding = 'utf-8-sig')
        self.writer = csv.writer(self.fp)
        headers = ['公司名称','联系人','公司地址','电话','手机','公司传真']
        self.writer.writerow(headers)

    def process_item(self, item, spider):
        company_name = item['company_name']
        contacts_name = item['contacts_name']
        company_addr = item['company_addr']
        company_phone = item['company_phone']
        mobile_phone = item['mobile_phone']
        company_fax = item['company_fax']

        self.writer.writerow((company_name,contacts_name,company_addr,company_phone,mobile_phone,company_fax))
        return item

    def close_spider(self,spider):
        self.fp.close()
The pipelines module in scrapy
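
  As the header comment in the pipeline says, the pipeline must be registered in ITEM_PIPELINES in settings.py. A minimal sketch, assuming the project package is called china (the actual package name isn't shown in the post):

# settings.py -- a minimal sketch; the package name "china" is an assumption
ITEM_PIPELINES = {
    'china.pipelines.ChinaPipeline': 300,
}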

Reposted from www.cnblogs.com/KisInfinite/p/10952831.html