scrapy爬取德州扑克

首页打开是这样的

7415868-4830810bb8bb254f.png
image.png

点进一个标签后详情页面:

7415868-e15f8ea35e530dfe.png
image.png

多次查看各标签页寻找规律,发现每一栏如果没有内容都会显示成"无";而在"相关概念"一栏,有些内容定义在 p 标签内,有些又定义在 a 标签内,此处可以用 if 进行条件判断。下面进行流程实现。

items.py

class PokerItem(scrapy.Item):
    """One glossary entry scraped from thepokerlogic.com.

    Each field mirrors a section of the glossary detail page; fields can be
    None/empty when the page shows "无" (none) for that section.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()              # entry heading (detail page <h2>)
    describe_title = scrapy.Field()     # "describe" section heading
    describe_content = scrapy.Field()   # "describe" section body text
    illustrate_title = scrapy.Field()   # "illustrate" section heading
    illustrate_content = scrapy.Field() # "illustrate" section body text
    relevant_title = scrapy.Field()     # related-concepts section heading
    relevant_content = scrapy.Field()   # related-concepts text (from <a> or <p>)
    study_title = scrapy.Field()        # further-study section heading
    study_content = scrapy.Field()      # further-study texts (from <a> or <p>)
    url = scrapy.Field()                # dict mapping link text -> href on the page

poker_spider.py

先获取每个标签的url,回调给处理函数进行数据获取

name = 'poker_spider'
    # Keep the crawl restricted to the glossary site.
    allowed_domains = ['thepokerlogic.com']
    # start_urls = ['http://thepokerlogic.com/glossary?']
    # Index page listing every glossary tag.
    start_urls = ['http://thepokerlogic.com/glossary']

    def parse(self, response):
        """Collect every glossary-entry link on the index page and schedule
        a request for each detail page.

        Yields scrapy.Request objects whose callback is parse_tag.
        """
        href_list = response.xpath('//div[@class="content_list"]/div/a/@href').extract()
        # Renamed loop variable: the original `list` shadowed the builtin list().
        for href in href_list:
            yield scrapy.Request(url="http://thepokerlogic.com" + href,
                                 callback=self.parse_tag)


数据获取

def parse_tag(self, response):
    """Extract one glossary entry from a detail page into a PokerItem.

    The "relevant" and "study" sections are rendered by the site inside
    either <a> or <p> tags, so the <a> variant is tried first with a
    fallback to <p>.

    Returns the populated PokerItem.
    """
    item = PokerItem()
    item['title'] = response.xpath('//div[@class="glossary-detail"]/h2/text()').extract_first()
    item['describe_title'] = response.xpath('//div[@class="detail-describe"]/h3/text()').extract_first()
    item['describe_content'] = response.xpath(
        'normalize-space(//div[@class="describe-content"]/p/text())').extract_first()
    item['illustrate_title'] = response.xpath('//div[@class="detail-illustrate"]/h4/text()').extract_first()
    item['illustrate_content'] = response.xpath('//div[@class="illustrate-content"]/p/text()').extract_first()

    item['relevant_title'] = response.xpath('//div[@class="detail-relevant"]/h4/text()').extract_first()
    # Related concepts live in either an <a> or a <p>; prefer the link text.
    relevant_content_a = response.xpath('//div[@class="relevant-content"]/a/text()').extract_first()
    relevant_content_p = response.xpath('//div[@class="relevant-content"]/p/text()').extract_first()
    item['relevant_content'] = relevant_content_a if relevant_content_a else relevant_content_p

    item['study_title'] = response.xpath('//div[@class="detail-study"]/h4/text()').extract_first()
    study_content_a = response.xpath('//div[@class="study-content"]/a/text()').extract()
    study_content_p = response.xpath('//div[@class="study-content"]/p/text()').extract()
    item['study_content'] = study_content_a if study_content_a else study_content_p

    # BUG FIX: .extract() returns a list, so the original checks
    # `url_title == '无'` and `url_title is None` could never be true.
    # Compare against the list forms instead.
    url_titles = response.xpath('//div[@class="glossary-detail-content"]//a/text()').extract()
    if not url_titles or url_titles == ['无']:
        item['url'] = {}
    else:
        url_links = response.xpath('//div[@class="glossary-detail-content"]//a/@href').extract()
        item['url'] = dict(zip(url_titles, url_links))

    return item

为了方便对数据进行查看查询,对数据库也进行了存取

pipelines.py

先获取数据库连接,这里是提前建立好数据库,所以直接对数据进行赋值插入,调用 MySQL 语句进行数据存取。(注:下方贴出的代码实为上文 parse_tag 的重复,pipelines.py 的实际代码请参考文末的 GitHub 链接。)

# NOTE(review): this snippet is a verbatim duplicate of the parse_tag shown
# earlier in the post; the pipelines.py code described by the surrounding
# text is missing here. Fixed in place nonetheless.
def parse_tag(self, response):
    """Extract one glossary entry from a detail page into a PokerItem.

    Sections rendered in either <a> or <p> tags are read from <a> first,
    falling back to <p>. Returns the populated PokerItem.
    """
    item = PokerItem()
    item['title'] = response.xpath('//div[@class="glossary-detail"]/h2/text()').extract_first()
    item['describe_title'] = response.xpath('//div[@class="detail-describe"]/h3/text()').extract_first()
    item['describe_content'] = response.xpath(
        'normalize-space(//div[@class="describe-content"]/p/text())').extract_first()
    item['illustrate_title'] = response.xpath('//div[@class="detail-illustrate"]/h4/text()').extract_first()
    item['illustrate_content'] = response.xpath('//div[@class="illustrate-content"]/p/text()').extract_first()

    item['relevant_title'] = response.xpath('//div[@class="detail-relevant"]/h4/text()').extract_first()
    relevant_content_a = response.xpath('//div[@class="relevant-content"]/a/text()').extract_first()
    relevant_content_p = response.xpath('//div[@class="relevant-content"]/p/text()').extract_first()
    item['relevant_content'] = relevant_content_a if relevant_content_a else relevant_content_p

    item['study_title'] = response.xpath('//div[@class="detail-study"]/h4/text()').extract_first()
    study_content_a = response.xpath('//div[@class="study-content"]/a/text()').extract()
    study_content_p = response.xpath('//div[@class="study-content"]/p/text()').extract()
    item['study_content'] = study_content_a if study_content_a else study_content_p

    # BUG FIX: .extract() returns a list; comparing it to the string '无'
    # or to None was always False, so compare against list forms.
    url_titles = response.xpath('//div[@class="glossary-detail-content"]//a/text()').extract()
    if not url_titles or url_titles == ['无']:
        item['url'] = {}
    else:
        url_links = response.xpath('//div[@class="glossary-detail-content"]//a/@href').extract()
        item['url'] = dict(zip(url_titles, url_links))

    return item
- 其他爬虫代码可参考文末的 GitHub 链接。

猜你喜欢

转载自blog.csdn.net/weixin_33853827/article/details/87230546