# 版权声明:来一来,看一看,有钱的捧个人场,没钱的你不得捧个人场 https://blog.csdn.net/wait_for_eva/article/details/81698334
# -*- coding: utf-8 -*-
import scrapy
from first.items import ProxyItem
class MyspiderSpider(scrapy.Spider):
    """Scrape proxy entries from the ``ip_list`` table and follow pagination.

    Each table row that contains ``<td>`` cells is turned into one
    ``ProxyItem``. After all rows of a page have been processed, the
    ``rel="next"`` pagination link (if any) is requested and handled by
    :meth:`parse` again.
    """

    name = 'myspider'
    allowed_domains = ['www.xicidaili.com']
    start_urls = ['http://www.xicidaili.com/nt/']

    def parse(self, response):
        """Yield a ``ProxyItem`` per data row, then a request for the next page.

        Args:
            response: The downloaded listing page.

        Yields:
            ``ProxyItem`` objects for every usable row, followed by at most
            one ``scrapy.Request`` for the next listing page.
        """
        for row in response.xpath('//table[@id="ip_list"]/tr'):
            # Rows without any <td> (presumably the <th> header row) hold no data.
            if not row.xpath('./td'):
                continue

            item = ProxyItem()

            # Optional cells: fall back to the literal string 'None' when the
            # node is absent, instead of relying on a catch-all ``except``.
            item['country'] = row.xpath(
                './td[@class="country"]/img/@alt'
            ).extract_first(default='None')
            item['server_city'] = row.xpath(
                './td[4]/a/text()'
            ).extract_first(default='None')

            # Mandatory cells: a row missing any of them is skipped so that one
            # malformed row cannot abort the rest of the page and its pagination.
            try:
                item['ip'] = row.xpath('./td[2]/text()').extract()[0]
                item['port'] = row.xpath('./td[3]/text()').extract()[0]
                item['know'] = row.xpath('./td[5]/text()').extract()[0]
                item['Type'] = row.xpath('./td[6]/text()').extract()[0].lower()
                item['speed'] = row.xpath('./td[7]/div/@title').extract()[0]
                item['link_time'] = row.xpath('./td[8]/div/@title').extract()[0]
                item['alive_time'] = row.xpath('./td[9]/text()').extract()[0]
                item['valid_time'] = row.xpath('./td[10]/text()').extract()[0]
            except IndexError:
                self.logger.warning(
                    'Skipping proxy row with missing cells on %s', response.url
                )
                continue

            yield item

        next_href = response.xpath(
            '//div[@class="pagination"]/a[@rel="next"]/@href'
        ).extract_first()
        if next_href is not None:
            # Resolve against the current page URL rather than a hard-coded host,
            # so relative and absolute hrefs are both handled.
            link = response.urljoin(next_href)
            print(link.center(50, '*'))
            # dont_filter=True bypasses Scrapy's duplicate-request filter.
            yield scrapy.Request(url=link, callback=self.parse, dont_filter=True)