scrapy--ipproxy

Don't rush for success; all you need to do is be a little better than the you of yesterday.
                       -- Anonymous

Today I'll walk you through IpProxy, which crawls proxies from "http://www.xicidaili.com/nn". Below is the post I reposted:

https://www.jianshu.com/p/8975a3997ab6

Problems to solve

1. The IP, port, and protocol are all scraped from a static page.
2. Verify that each proxy IP actually works (a stricter variant of this check is sketched right below).
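For problem 2, the spider below just opens a raw TCP connection with telnetlib, which only proves the port is open. A stricter end-to-end check routes a real request through the proxy; here is a minimal sketch of that idea (not part of the spider; the test URL and timeout are arbitrary choices):

import requests

# Check a proxy by actually fetching a page through it; httpbin.org/ip
# echoes the caller's address, so a 200 response means the proxy really
# forwarded the request. Test URL and timeout are arbitrary choices.
def proxy_works(ip, port, scheme='http'):
    proxy = '%s://%s:%s' % (scheme, ip, port)
    try:
        resp = requests.get('http://httpbin.org/ip',
                            proxies={'http': proxy, 'https': proxy},
                            timeout=10)
        return resp.status_code == 200
    except requests.RequestException:
        return False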

Here I'll just show how the crawling code is written; the rest of the configuration is covered in my earlier posts, and the complete code is on my GitHub: . QAQ!!

# -*- coding: utf-8 -*-
import re
import telnetlib

import scrapy
from scrapy.linkextractors import LinkExtractor

from Iproxy.items import IproxyItem
from Iproxy.settings import USER_AGENT

class IproxySpider(scrapy.Spider):
    name = 'iproxy'
    allowed_domains = ['www.xicidaili.com']
    start_urls = ['http://www.xicidaili.com/nn']

    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Content-Length': '11',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'www.xicidaili.com',
        'Origin': 'www.xicidaili.com',
        'Referer': 'http://www.xicidaili.com/',
        'User-Agent': USER_AGENT,
        'X-Requested-With': 'XMLHttpRequest',
    }

    # Check whether a proxy is usable by opening a TCP connection to it
    def telnet(self, item):
        try:
            telnetlib.Telnet(item['origin_ip'], port=item['port'], timeout=10.0)
        except OSError:
            print('connect failure')
            return False
        else:
            print('connect success')
            return True

    def parse(self, response):
        for sel in response.xpath('//tr[@class="odd"]'):
            ips   = sel.xpath('./td[2]').extract()[0]
            ports = sel.xpath('./td[3]').extract()[0]
            types = sel.xpath('./td[6]').extract()[0]
            proxy_type = re.findall(r'>(.*?)<', types)[0]

            # Only HTTP and HTTPS rows carry a usable protocol, address and port
            if proxy_type not in ('HTTP', 'HTTPS'):
                continue
            item = {
                'origin_ip': re.findall(r'>(.*?)<', ips)[0],
                'port':      re.findall(r'>(.*?)<', ports)[0],
            }

            # Yield only the proxies that pass the connectivity check
            if self.telnet(item):
                iplist = IproxyItem()
                iplist['ip_name'] = proxy_type.lower() + '://' + item['origin_ip']
                iplist['port']    = item['port']
                print(iplist)
                yield iplist

        # Follow the pagination links and parse each page the same way
        links = LinkExtractor(restrict_css='div.pagination')
        for link in links.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse)
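
For reference, the spider assigns only the ip_name and port fields on IproxyItem. The post doesn't show items.py, but judging from the fields used it would look roughly like this:

# Iproxy/items.py -- reconstructed from the fields the spider assigns;
# the original file is not shown in the post
import scrapy

class IproxyItem(scrapy.Item):
    ip_name = scrapy.Field()   # e.g. 'http://1.2.3.4'
    port    = scrapy.Field()   # e.g. '8080'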

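Once the proxies are collected (say, exported by a pipeline into a list), the usual way to consume them in Scrapy is a downloader middleware that sets request.meta['proxy'], which the built-in HttpProxyMiddleware honours. The sketch below is illustrative only, not the author's configuration; PROXY_LIST and the class name are made up for this example:

# Iproxy/middlewares.py -- illustrative sketch; PROXY_LIST is a hypothetical
# setting holding entries like 'http://1.2.3.4:8080' built from the items above
import random

class RandomProxyMiddleware(object):
    def __init__(self, proxies):
        self.proxies = proxies

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings.getlist('PROXY_LIST'))

    def process_request(self, request, spider):
        if self.proxies:
            # pick a random proxy for each outgoing request
            request.meta['proxy'] = random.choice(self.proxies)

Enable it in settings.py with DOWNLOADER_MIDDLEWARES = {'Iproxy.middlewares.RandomProxyMiddleware': 543}; the priority 543 is just a common choice, not required.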

Reposted from www.cnblogs.com/eilinge/p/9830079.html