Parsing Xicidaili (西刺代理) proxy IPs

Copyright notice: step right up and have a look; if you've got money, show some support, and if you don't, at least show up for the crowd. https://blog.csdn.net/wait_for_eva/article/details/81698334
# -*- coding: utf-8 -*-
import scrapy
from first.items import ProxyItem


class MyspiderSpider(scrapy.Spider):
    name = 'myspider'
    allowed_domains = ['www.xicidaili.com']
    start_urls = ['http://www.xicidaili.com/nt/']

    def parse(self, response):
        # Every proxy is one <tr> row of the table with id="ip_list".
        data = response.xpath('//table[@id="ip_list"]/tr')
        for unit in data:
            item = ProxyItem()
            # The header row contains only <th> cells; skip any row without <td>.
            if not unit.xpath('./td'):
                continue
            # Some rows have no country flag <img>, so this XPath may match nothing.
            try:
                item['country'] = unit.xpath('./td[@class="country"]/img/@alt').extract()[0]
            except IndexError:
                item['country'] = 'None'
            item['ip'] = unit.xpath('./td[2]/text()').extract()[0]
            item['port'] = unit.xpath('./td[3]/text()').extract()[0]
            # The server city cell may not contain an <a> element.
            try:
                item['server_city'] = unit.xpath('./td[4]/a/text()').extract()[0]
            except IndexError:
                item['server_city'] = 'None'
            item['know'] = unit.xpath('./td[5]/text()').extract()[0]            # anonymity level
            item['Type'] = unit.xpath('./td[6]/text()').extract()[0].lower()    # protocol type (http/https)
            item['speed'] = unit.xpath('./td[7]/div/@title').extract()[0]       # speed, read from the bar's title attribute
            item['link_time'] = unit.xpath('./td[8]/div/@title').extract()[0]   # connect time, read from the bar's title attribute
            item['alive_time'] = unit.xpath('./td[9]/text()').extract()[0]      # how long the proxy has been alive
            item['valid_time'] = unit.xpath('./td[10]/text()').extract()[0]     # when the proxy was last verified
            yield item
        # Follow the "next page" link in the pagination bar, if there is one.
        next_page = response.xpath('//div[@class="pagination"]/a[@rel="next"]/@href').extract()
        if next_page:
            link = 'http://www.xicidaili.com{}'.format(next_page[0])
            # Debug output: print the next page URL framed with asterisks.
            print(link.center(50, '*'))
            yield scrapy.Request(url=link, callback=self.parse, dont_filter=True)
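
The spider imports ProxyItem from first.items, but the post does not show that file. A minimal sketch of first/items.py, assuming a Scrapy project named first and declaring exactly the fields assigned above:

# first/items.py -- minimal sketch; the field names are taken from the spider above.
import scrapy


class ProxyItem(scrapy.Item):
    country = scrapy.Field()      # country (from the flag image's alt text)
    ip = scrapy.Field()           # proxy IP address
    port = scrapy.Field()         # proxy port
    server_city = scrapy.Field()  # city/region of the proxy server
    know = scrapy.Field()         # anonymity level
    Type = scrapy.Field()         # protocol type (http/https)
    speed = scrapy.Field()        # speed
    link_time = scrapy.Field()    # connect time
    alive_time = scrapy.Field()   # how long the proxy has been alive
    valid_time = scrapy.Field()   # when the proxy was last verified

With that file in place, running scrapy crawl myspider -o proxies.json from the project directory crawls the listing pages and dumps the scraped proxies to a JSON file.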
