'''
https://stackoverflow.com/questions/19707978/creating-loop-to-parse-table-data-in-scrapy-python
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    divs = hxs.select('//tr[@class="someclass"]')
    for div in divs:
        item = TestBotItem()
        item['var1'] = div.select('./td[2]/p/span[2]/text()').extract()
        item['var2'] = div.select('./td[3]/p/span[2]/text()').extract()
        item['var3'] = div.select('./td[4]/p/text()').extract()
        yield item
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    divs = hxs.select('//tr[@class="someclass"]')
    items = []
    for div in divs:
        item = TestBotItem()
        item['var1'] = div.select('./td[2]/p/span[2]/text()').extract()
        item['var2'] = div.select('./td[3]/p/span[2]/text()').extract()
        item['var3'] = div.select('./td[4]/p/text()').extract()
        items.append(item)
    return items
def parse(self, response):
    divs = response.xpath('//tr[@class="someclass"]')
    for div in divs:
        item = TestBotItem()
        item['var1'] = div.xpath('table/tbody/tr[*]/td[2]/p/span[2]/text()').extract()[0]
        item['var2'] = div.xpath('table/tbody/tr[*]/td[3]/p/span[2]/text()').extract()[0]
        item['var3'] = div.xpath('table/tbody/tr[*]/td[4]/p/text()').extract()[0]
        return item
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    divs = hxs.select('//tr[@class="someclass"]')
    for div in divs:
        item = TestBotItem()
        item['var1'] = div.select('//table/tbody/tr[*]/td[2]/p/span[2]/text()').extract()
        item['var2'] = div.select('//table/tbody/tr[*]/td[3]/p/span[2]/text()').extract()
        item['var3'] = div.select('//table/tbody/tr[*]/td[4]/p/text()').extract()
        return item
'''
#!/usr/bin/env python
# -*- coding:utf8 -*-
import requests
import random
from lxml import etree
def get_ip_list(url, headers, timeout=10):
    """Download a proxy-list page and collect the proxies in its table.

    The page at ``url`` is fetched and parsed as HTML, then every ``<tr>``
    below the table whose ``id`` is ``ip_list`` is examined. For each row the
    ``src`` of the image in the first cell, the text of the second cell
    (treated as the IP) and the text of the third cell (treated as the port)
    are printed whenever they are present.

    Args:
        url: Address of the page to download.
        headers: HTTP request headers handed to ``requests.get``.
        timeout: Seconds to wait for the server. ``requests`` applies no
            timeout unless one is given, so a stalled server would otherwise
            block the call indefinitely.

    Returns:
        A list of ``"ip:port"`` strings, one for each row that yielded both
        an IP and a port. Rows missing either value are left out. The list
        is empty when no matching rows are found.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    web_data = requests.get(url, headers=headers, timeout=timeout).content
    data_node = etree.HTML(web_data)

    proxies = []
    for tr in data_node.xpath("//table[@id='ip_list']//tr"):
        # Each XPath query returns a (possibly empty) list of matches.
        img = tr.xpath('./td[1]/img/@src')
        ip = tr.xpath('./td[2]/text()')
        port = tr.xpath('./td[3]/text()')

        if img:
            print(img[0])
        if ip:
            print(ip[0])
        if port:
            print(port[0])

        # Only a row with both parts forms a usable proxy address.
        if ip and port:
            proxies.append("{}:{}".format(ip[0], port[0]))

    return proxies
if __name__ == "__main__":
    # Script entry point: request the listing page with a desktop-Chrome
    # User-Agent and a Referer that points back at the same page.
    url = "http://www.xicidaili.com/nn"
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 6.2; WOW64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/64.0.3282.186 Safari/537.36"
        ),
        # Same address as the page being requested.
        "Referer": url,
    }
    ip_list = get_ip_list(url=url, headers=headers)
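# Title: parsing table data with Python lxml XPath
# (page residue) You may also like
# Reposted from my.oschina.net/yonghan/blog/1634179
# (page residue) Today's recommendations
# (page residue) Weekly ranking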