Qisuu ("odd book") novel spider

A Scrapy spider for https://www.qisuu.com: parse() reads the homepage navigation to collect the category pages, parse_categray() walks each category's listing pages and follows the "next" link, and parse_detail() extracts each novel's metadata and yields a BookItem.

# -*- coding: utf-8 -*-
import scrapy
from ..items import BookItem


class QisuuSpider(scrapy.Spider):
    name = 'qisuu'
    allowed_domains = ['qisuu.com','baidu.com']
    start_urls = ['https://www.qisuu.com/']
    # Base address
    base_url = 'https://www.qisuu.com'


    def parse(self, response):
        # Parse the homepage to get the classified address
        links = response.xpath('//div[@class="nav"]/a')
        # The for loop traverses all <a> tags;
        # enumerate() pairs each one with its index
        for index, a in enumerate(links):
            # Skip the first <a> tag
            if index == 0:
                continue
            categray = a.xpath('text()').extract_first('')
            href = a.xpath('@href').extract_first('')
            print(categray, href)
            # Concatenate the complete category URL
            cg_url = self.base_url + href
            # Create a request and use yield to hand it to the engine for scheduling
            # meta is a dictionary used to pass values to the callback
            yield scrapy.Request(
                url=cg_url,
                callback=self.parse_categray,
                meta={'categray': categray}
            )
    # Parse the category page
    def parse_categray(self, response):
        # The category passed from parse() can be read back by key:
        # categray = response.meta.get('categray')
        # Find all novel information on the current page
        lis = response.xpath('//div[@class="listBox"]/ul/li')
        for li in lis:
            star = li.xpath('div/em/@class').extract_first('')
            href = li.xpath('a/@href').extract_first('')
            # print(star,href)
            # Splicing complete url
            detail_url = self.base_url+href
            # Add the novel rating and the detail URL to meta
            response.meta['star'] = star
            response.meta['detail_url'] = detail_url


            yield scrapy.Request(
                url=detail_url,
                callback=self.parse_detail,
                meta=response.meta
            )
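            # Note: scrapy.Request copies the meta dict passed to it, so
            # mutating response.meta on later loop iterations does not
            # change requests that were already yielded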
        # find next page
        # next_as = response.xpath('//div[@class="tspage"]/a')
        next_href = response.xpath('//a[contains(text(),"next")]/@href').extract_first('')
        if next_href:
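            # once the "next" href contains '3', page 3 would be next;
            # returning here caps the crawl at two pages per category
            # (presumably a limit to keep the test run small)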
            if '3' in next_href:
                return
            # Initiate a request
            yield scrapy.Request(
                url=self.base_url + next_href,
                callback=self.parse_categray,
                # Why pass meta? The novels on the next page also need the
                # category information, which lives in this response's meta.
                meta=response.meta
            )




        # for a in next_as:
        #     # The for loop traverses each <a> tag and takes out its text
        #     text = a.xpath('text()').extract_first('')
        #     # If the text of the <a> tag is not "next page", do not issue a request
        #     if 'next page' == text:
        #         href = a.xpath('@href').extract_first('')
        #         # Issue the request
        #         yield scrapy.Request(
        #             url=self.base_url + href,
        #             callback=self.parse_categray
        #         )




    # Parse the novel detail page
    def parse_detail(self, response):
        # Get the category, rating and detail URL from meta
        categray = response.meta.get('categray')
        star = response.meta.get('star')
        # The rating digit is the last character of the em tag's class attribute
        star = star[-1]
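        # e.g. an em class of "lstar3" (a hypothetical value) yields star == "3"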
        detail_url = response.meta.get('detail_url')
        # Cover image
        src = response.xpath('//div[@class="detail_pic"]/img/@src').extract_first('')
        src = self.base_url+src
        # Novel name
        name = response.xpath('//div[@class="detail_right"]/h1/text()').extract_first('')
        # Novel details: one "label:value" string per <li>
        infos = response.xpath('//div[@class="detail_right"]/ul/li/text()').extract()
        # number of clicks
        click_num = infos[0].split(':')[-1]
        # file size
        file_size = infos[1].split(':')[-1]
        # Book type
        book_type = infos[2].split(':')[-1]
        # Update date
        update_time = infos[3].split(':')[-1]
        # Serialization status
        status = infos[4].split(':')[-1]
        # Book author
        author = infos[5].split(':')[-1]
        # Running environment
        run_type = infos[6].split(':')[-1]


        # yield item to pipeline for processing
        item = BookItem()
        item['run_type'] = run_type
        item['author'] = author
        item['status'] = status
        item['update_time'] = update_time
        item['book_type'] = book_type
        item['file_size'] = file_size
        item['click_num'] = click_num
        item['name'] = name
        # To download the cover image this field must be a list
        item['src'] = [src]
        # To download the page file locally this field must also be a list
        item['detail_url'] = [detail_url]
        item['star'] = star
        item['categray'] = categray
        yield item
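The spider imports BookItem from the project's items.py, which the article does not show. A minimal sketch that would satisfy the import, with the field list inferred from the assignments in parse_detail:

# items.py -- a sketch; only the fields the spider actually assigns
import scrapy


class BookItem(scrapy.Item):
    name = scrapy.Field()
    author = scrapy.Field()
    categray = scrapy.Field()
    star = scrapy.Field()
    click_num = scrapy.Field()
    file_size = scrapy.Field()
    book_type = scrapy.Field()
    update_time = scrapy.Field()
    status = scrapy.Field()
    run_type = scrapy.Field()
    src = scrapy.Field()          # cover image URL(s), must be a list
    detail_url = scrapy.Field()   # detail page URL(s), must be a list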

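The "must be a list" comments on src and detail_url point at Scrapy's built-in media pipelines, ImagesPipeline and FilesPipeline, which each read a list of URLs from a configurable item field. A sketch of the settings that would wire them up, assuming these pipelines are what the original project used (the store paths and priorities are placeholder assumptions):

# settings.py -- a sketch; pipeline choice, paths and priorities are assumptions
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 300,
    'scrapy.pipelines.files.FilesPipeline': 310,
}

# ImagesPipeline downloads every URL in item['src']
IMAGES_URLS_FIELD = 'src'
IMAGES_STORE = 'download/images'

# FilesPipeline downloads every URL in item['detail_url']
FILES_URLS_FIELD = 'detail_url'
FILES_STORE = 'download/files'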

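With items.py and the pipeline settings in place, the spider is started from the project directory with the standard command, using the name attribute of QisuuSpider:

scrapy crawl qisuu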