# -*- coding: utf-8 -*-
import scrapy
from ..items import BookItem
class QisuuSpider(scrapy.Spider):
    """Crawl qisuu.com: homepage -> category pages -> novel detail pages.

    Yields one BookItem per novel detail page. Category name, star rating
    and detail URL are threaded through request ``meta``.
    """
    name = 'qisuu'
    allowed_domains = ['qisuu.com', 'baidu.com']
    start_urls = ['https://www.qisuu.com/']
    # Base address used to turn relative hrefs into absolute URLs.
    base_url = 'https://www.qisuu.com'

    def parse(self, response):
        """Parse the homepage and follow every category link in the nav bar."""
        links = response.xpath('//div[@class="nav"]/a')
        for index, a in enumerate(links):
            # Skip the first <a> (site home link, not a category).
            if index == 0:
                continue
            categray = a.xpath('text()').extract_first('')
            href = a.xpath('@href').extract_first('')
            print(categray, href)
            # Concatenate the complete category address.
            cg_url = self.base_url + href
            # meta carries the category name to the category-page callback.
            # NOTE: key must be exactly 'categray' — parse_detail reads it back
            # with the same key.
            yield scrapy.Request(
                url=cg_url,
                callback=self.parse_categray,
                meta={'categray': categray},
            )

    def parse_categray(self, response):
        """Parse one category listing page.

        Yields a detail-page request per novel on the page, then follows the
        "next page" link (crawl is capped at the first two pages).
        """
        categray = response.meta.get('categray')
        # All novel entries on the current page.
        lis = response.xpath('//div[@class="listBox"]/ul/li')
        for li in lis:
            # The <em> class encodes the star rating; last char is the digit.
            star = li.xpath('div/em/@class').extract_first('')
            href = li.xpath('a/@href').extract_first('')
            detail_url = self.base_url + href
            # Build a FRESH meta dict per request: mutating response.meta in
            # the loop would share one dict across every yielded request, so
            # all of them would see the last novel's star/detail_url.
            yield scrapy.Request(
                url=detail_url,
                callback=self.parse_detail,
                meta={
                    'categray': categray,
                    'star': star,
                    'detail_url': detail_url,
                },
            )
        # Find the next-page link.
        # NOTE(review): matching on the literal text "next" — confirm this is
        # the actual anchor text on the site (it may be localized).
        next_href = response.xpath('//a[contains(text(),"next")]/@href').extract_first('')
        if next_href:
            # Deliberate crawl limit: stop before page 3.
            if '3' in next_href:
                return
            yield scrapy.Request(
                url=self.base_url + next_href,
                callback=self.parse_categray,
                # The next page's novels need the same category, so pass it on.
                meta={'categray': categray},
            )

    def parse_detail(self, response):
        """Parse a novel detail page and yield a populated BookItem."""
        categray = response.meta.get('categray')
        star = response.meta.get('star')
        # Reduce e.g. "lstar3" to the grade digit; guard against the ''
        # default from extract_first so we never index an empty string.
        star = star[-1] if star else ''
        detail_url = response.meta.get('detail_url')
        # Cover image (relative src -> absolute URL).
        src = response.xpath('//div[@class="detail_pic"]/img/@src').extract_first('')
        src = self.base_url + src
        # Novel name.
        name = response.xpath('//div[@class="detail_right"]/h1/text()').extract_first('')
        # Detail rows: clicks, size, type, update date, status, author, env.
        infos = response.xpath('//div[@class="detail_right"]/ul/li/text()').extract()
        # Each row is "label:value"; keep everything after the last colon.
        click_num = infos[0].split(':')[-1]
        file_size = infos[1].split(':')[-1]
        book_type = infos[2].split(':')[-1]
        update_time = infos[3].split(':')[-1]
        status = infos[4].split(':')[-1]
        author = infos[5].split(':')[-1]
        run_type = infos[6].split(':')[-1]
        item = BookItem()
        item['run_type'] = run_type
        item['author'] = author
        item['status'] = status
        item['update_time'] = update_time
        item['book_type'] = book_type
        item['file_size'] = file_size
        item['click_num'] = click_num
        item['name'] = name
        # Image pipeline requires a list of URLs.
        item['src'] = [src]
        # Files pipeline likewise requires a list of URLs.
        item['detail_url'] = [detail_url]
        item['star'] = star
        item['categray'] = categray
        yield item
import scrapy
from ..items import BookItem
# NOTE(review): this class definition is an exact duplicate of an earlier one
# in this file — only this (later) definition takes effect at import time.
# The duplicate should be removed; it is corrected here so the module parses.
class QisuuSpider(scrapy.Spider):
    """Crawl qisuu.com: homepage -> category pages -> novel detail pages.

    Yields one BookItem per novel detail page. Category name, star rating
    and detail URL are threaded through request ``meta``.
    """
    name = 'qisuu'
    allowed_domains = ['qisuu.com', 'baidu.com']
    start_urls = ['https://www.qisuu.com/']
    # Base address used to turn relative hrefs into absolute URLs.
    base_url = 'https://www.qisuu.com'

    def parse(self, response):
        """Parse the homepage and follow every category link in the nav bar."""
        links = response.xpath('//div[@class="nav"]/a')
        for index, a in enumerate(links):
            # Skip the first <a> (site home link, not a category).
            if index == 0:
                continue
            categray = a.xpath('text()').extract_first('')
            href = a.xpath('@href').extract_first('')
            print(categray, href)
            # Concatenate the complete category address.
            cg_url = self.base_url + href
            # meta carries the category name to the category-page callback.
            # NOTE: key must be exactly 'categray' — parse_detail reads it back
            # with the same key.
            yield scrapy.Request(
                url=cg_url,
                callback=self.parse_categray,
                meta={'categray': categray},
            )

    def parse_categray(self, response):
        """Parse one category listing page.

        Yields a detail-page request per novel on the page, then follows the
        "next page" link (crawl is capped at the first two pages).
        """
        categray = response.meta.get('categray')
        # All novel entries on the current page.
        lis = response.xpath('//div[@class="listBox"]/ul/li')
        for li in lis:
            # The <em> class encodes the star rating; last char is the digit.
            star = li.xpath('div/em/@class').extract_first('')
            href = li.xpath('a/@href').extract_first('')
            detail_url = self.base_url + href
            # Build a FRESH meta dict per request: mutating response.meta in
            # the loop would share one dict across every yielded request, so
            # all of them would see the last novel's star/detail_url.
            yield scrapy.Request(
                url=detail_url,
                callback=self.parse_detail,
                meta={
                    'categray': categray,
                    'star': star,
                    'detail_url': detail_url,
                },
            )
        # Find the next-page link.
        # NOTE(review): matching on the literal text "next" — confirm this is
        # the actual anchor text on the site (it may be localized).
        next_href = response.xpath('//a[contains(text(),"next")]/@href').extract_first('')
        if next_href:
            # Deliberate crawl limit: stop before page 3.
            if '3' in next_href:
                return
            yield scrapy.Request(
                url=self.base_url + next_href,
                callback=self.parse_categray,
                # The next page's novels need the same category, so pass it on.
                meta={'categray': categray},
            )

    def parse_detail(self, response):
        """Parse a novel detail page and yield a populated BookItem."""
        categray = response.meta.get('categray')
        star = response.meta.get('star')
        # Reduce e.g. "lstar3" to the grade digit; guard against the ''
        # default from extract_first so we never index an empty string.
        star = star[-1] if star else ''
        detail_url = response.meta.get('detail_url')
        # Cover image (relative src -> absolute URL).
        src = response.xpath('//div[@class="detail_pic"]/img/@src').extract_first('')
        src = self.base_url + src
        # Novel name.
        name = response.xpath('//div[@class="detail_right"]/h1/text()').extract_first('')
        # Detail rows: clicks, size, type, update date, status, author, env.
        infos = response.xpath('//div[@class="detail_right"]/ul/li/text()').extract()
        # Each row is "label:value"; keep everything after the last colon.
        click_num = infos[0].split(':')[-1]
        file_size = infos[1].split(':')[-1]
        book_type = infos[2].split(':')[-1]
        update_time = infos[3].split(':')[-1]
        status = infos[4].split(':')[-1]
        author = infos[5].split(':')[-1]
        run_type = infos[6].split(':')[-1]
        item = BookItem()
        item['run_type'] = run_type
        item['author'] = author
        item['status'] = status
        item['update_time'] = update_time
        item['book_type'] = book_type
        item['file_size'] = file_size
        item['click_num'] = click_num
        item['name'] = name
        # Image pipeline requires a list of URLs.
        item['src'] = [src]
        # Files pipeline likewise requires a list of URLs.
        item['detail_url'] = [detail_url]
        item['star'] = star
        item['categray'] = categray
        yield item