首页打开是这样的
点进一个标签后详情页面:
多次查看各标签详情页寻找规律,发现每一栏如果没有内容都会显示成“无”;而在“相关概念”一栏,有些内容定义在p标签内,有些又定义在a标签内,此处可以用if进行条件判断,下面进行流程实现。
items.py
class PokerItem(scrapy.Item):
    """Container for one glossary entry scraped from thepokerlogic.com.

    Each field mirrors one section of a glossary detail page; on the
    site, a section with no content is rendered as "无".
    """

    title = scrapy.Field()               # entry headline (h2)
    describe_title = scrapy.Field()      # "description" section heading
    describe_content = scrapy.Field()    # "description" body text
    illustrate_title = scrapy.Field()    # "illustration" section heading
    illustrate_content = scrapy.Field()  # "illustration" body text
    relevant_title = scrapy.Field()      # "related concepts" heading
    relevant_content = scrapy.Field()    # related concepts (<p> or <a> text)
    study_title = scrapy.Field()         # "study" section heading
    study_content = scrapy.Field()       # study links/text
    url = scrapy.Field()                 # {link text: href} for in-page links
poker_spider.py
先获取每个标签的url,回调给处理函数进行数据获取
# Spider identity and crawl scope: only the glossary index is seeded;
# detail pages are discovered from it at crawl time.
name = 'poker_spider'
allowed_domains = ['thepokerlogic.com']
start_urls = ['http://thepokerlogic.com/glossary']
def parse(self, response):
    """Collect every glossary-entry link and schedule a detail request.

    Args:
        response: the glossary index page.

    Yields:
        scrapy.Request for each entry link, handled by ``parse_tag``.
    """
    href_list = response.xpath('//div[@class="content_list"]/div/a/@href').extract()
    # FIX: the loop variable was named `list`, shadowing the builtin.
    for href in href_list:
        # response.urljoin resolves site-relative hrefs against the page
        # URL — equivalent to the old "http://thepokerlogic.com" + href
        # for "/..." paths, but also correct for other relative forms.
        yield scrapy.Request(url=response.urljoin(href), callback=self.parse_tag)
数据获取
def parse_tag(self, response):
    """Extract one glossary entry from its detail page into a PokerItem.

    On the site a missing section is rendered as "无"; the "related
    concepts" and "study" sections may live in either <a> or <p> tags,
    so both are queried with the <a> variant preferred.

    Args:
        response: a glossary detail page.

    Returns:
        PokerItem with all fields populated.
    """
    item = PokerItem()
    item['title'] = response.xpath('//div[@class="glossary-detail"]/h2/text()').extract_first()
    item['describe_title'] = response.xpath('//div[@class="detail-describe"]/h3/text()').extract_first()
    item['describe_content'] = response.xpath('normalize-space(//div[@class="describe-content"]/p/text())').extract_first()
    item['illustrate_title'] = response.xpath('//div[@class="detail-illustrate"]/h4/text()').extract_first()
    item['illustrate_content'] = response.xpath('//div[@class="illustrate-content"]/p/text()').extract_first()
    item['relevant_title'] = response.xpath('//div[@class="detail-relevant"]/h4/text()').extract_first()
    # Prefer the <a> form; fall back to the <p> text when no link exists.
    item['relevant_content'] = (
        response.xpath('//div[@class="relevant-content"]/a/text()').extract_first()
        or response.xpath('//div[@class="relevant-content"]/p/text()').extract_first()
    )
    item['study_title'] = response.xpath('//div[@class="detail-study"]/h4/text()').extract_first()
    item['study_content'] = (
        response.xpath('//div[@class="study-content"]/a/text()').extract()
        or response.xpath('//div[@class="study-content"]/p/text()').extract()
    )
    # BUG FIX: .extract() always returns a list, so the old guard
    # `url_title == '无' or url_title is None` could never be true and
    # the "无" placeholder leaked into the dict. Filter it per entry;
    # zip() already yields an empty dict when there are no links.
    url_title = response.xpath('//div[@class="glossary-detail-content"]//a/text()').extract()
    url_link = response.xpath('//div[@class="glossary-detail-content"]//a/@href').extract()
    item['url'] = {text: link for text, link in zip(url_title, url_link) if text != '无'}
    return item
为了方便对数据进行查看和查询,也将抓取到的数据存入了数据库。
pipelines.py
先获取数据库连接。这里提前建立好了数据库和数据表,所以可以直接对数据进行赋值插入,调用MySQL语句完成数据存取。
def parse_tag(self, response):
    """Fill a PokerItem from a glossary detail page.

    The "related concepts" and "study" sections appear in either <a>
    or <p> tags; the <a> form is preferred with a <p> fallback.
    """
    def first(query):
        return response.xpath(query).extract_first()

    def many(query):
        return response.xpath(query).extract()

    item = PokerItem()
    item['title'] = first('//div[@class="glossary-detail"]/h2/text()')
    item['describe_title'] = first('//div[@class="detail-describe"]/h3/text()')
    item['describe_content'] = first('normalize-space(//div[@class="describe-content"]/p/text())')
    item['illustrate_title'] = first('//div[@class="detail-illustrate"]/h4/text()')
    item['illustrate_content'] = first('//div[@class="illustrate-content"]/p/text()')
    item['relevant_title'] = first('//div[@class="detail-relevant"]/h4/text()')
    # <a> content wins when present, otherwise take the <p> text.
    item['relevant_content'] = (first('//div[@class="relevant-content"]/a/text()')
                                or first('//div[@class="relevant-content"]/p/text()'))
    item['study_title'] = first('//div[@class="detail-study"]/h4/text()')
    item['study_content'] = (many('//div[@class="study-content"]/a/text()')
                             or many('//div[@class="study-content"]/p/text()'))
    url_title = many('//div[@class="glossary-detail-content"]//a/text()')
    # NOTE(review): url_title is a list from .extract(), so comparing it
    # to the string '无' never matches and the else branch always runs —
    # kept as-is here to preserve the original behavior.
    if url_title == '无' or url_title is None:
        url_link = ''
    else:
        url_link = many('//div[@class="glossary-detail-content"]//a/@href')
    item['url'] = dict(zip(url_title, url_link))
    return item
- 其他爬虫代码可参考github