Python lxml XPath 分析表格数据

'''
https://stackoverflow.com/questions/19707978/creating-loop-to-parse-table-data-in-scrapy-python

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    divs = hxs.select('//tr[@class="someclass"]')
    for div in divs:
        item = TestBotItem()
        item['var1'] = div.select('./td[2]/p/span[2]/text()').extract()
        item['var2'] = div.select('./td[3]/p/span[2]/text()').extract()
        item['var3'] = div.select('./td[4]/p/text()').extract()

        yield item




def parse(self, response):
    hxs = HtmlXPathSelector(response)
    divs = hxs.select('//tr[@class="someclass"]')
    items = []

    for div in divs:

        item = TestBotItem()
        item['var1'] = div.select('./td[2]/p/span[2]/text()').extract()
        item['var2'] = div.select('./td[3]/p/span[2]/text()').extract()
        item['var3'] = div.select('./td[4]/p/text()').extract()

        items.append(item)

    return items


def parse(self, response):
    divs = response.xpath('//tr[@class="someclass"]')
    for div in divs:
        item = TestBotItem()
        item['var1'] = div.xpath('table/tbody/tr[*]/td[2]/p/span[2]/text()').extract()[0]
        item['var2'] = div.xpath('table/tbody/tr[*]/td[3]/p/span[2]/text()').extract()[0]
        item['var3'] = div.xpath('table/tbody/tr[*]/td[4]/p/text()').extract()[0]
        return item



def parse(self, response):
    hxs = HtmlXPathSelector(response)
    divs = hxs.select('//tr[@class="someclass"]')
    for div in divs:
        item = TestBotItem()
        item['var1'] = div.select('//table/tbody/tr[*]/td[2]/p/span[2]/text()').extract()
        item['var2'] = div.select('//table/tbody/tr[*]/td[3]/p/span[2]/text()').extract()
        item['var3'] = div.select('//table/tbody/tr[*]/td[4]/p/text()').extract()
        return item

'''
#!/usr/bin/env python
# -*- coding:utf8 -*-

import requests
import random
from lxml import etree



def get_ip_list(url, headers, timeout=10):
    """Download a proxy-list page and extract the entries of its table.

    Every ``<tr>`` below the table whose id is ``ip_list`` is inspected.
    For each row the image ``src`` of the first cell, the text of the second
    cell (IP) and the text of the third cell (port) are printed when
    present. Rows that carry both an IP and a port are also collected and
    returned, so the caller receives a usable result instead of ``None``.

    Args:
        url: Address of the page to download.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        A list of ``"ip:port"`` strings (whitespace-stripped), one per row
        that has both values. The list is empty when nothing matched.

    Raises:
        Any exception raised by ``requests.get`` (connection errors,
        timeouts, ...) propagates to the caller unchanged.
    """
    web_data = requests.get(url, headers=headers, timeout=timeout).content
    data_node = etree.HTML(web_data)
    if data_node is None:
        # NOTE(review): lxml can hand back no root element for a body with
        # nothing parseable in it; treat that as "no rows" rather than
        # failing on ``None.xpath``.
        return []

    proxies = []
    for tr in data_node.xpath("//table[@id='ip_list']//tr"):
        img = tr.xpath('./td[1]/img/@src')
        ip = tr.xpath('./td[2]/text()')
        port = tr.xpath('./td[3]/text()')
        # Header rows and malformed rows lack some cells, so each value is
        # printed only when its XPath query matched something.
        if img:
            print(img[0])
        if ip:
            print(ip[0])
        if port:
            print(port[0])
        if ip and port:
            proxies.append("{}:{}".format(ip[0].strip(), port[0].strip()))
    return proxies


if __name__ == "__main__":
    # Script entry point: query the xicidaili listing page with
    # browser-like request headers and keep whatever get_ip_list returns.
    listing_url = "http://www.xicidaili.com/nn"

    request_headers = {}
    request_headers["User-Agent"] = (
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
    )
    request_headers["Referer"] = "http://www.xicidaili.com/nn"

    ip_list = get_ip_list(url=listing_url, headers=request_headers)

猜你喜欢

转载自my.oschina.net/yonghan/blog/1634179