==============================
pipelines.py
-------------------------
import csv


class ScrapytestPipeline(object):
    # process_item runs once for every item the spider yields;
    # the method name is fixed by Scrapy's item pipeline API.
    def process_item(self, item, spider):
        # Append the chapter as one CSV row. gb18030 keeps the Chinese
        # text readable when the file is opened in Excel on Windows.
        with open(item['book_name'] + '.csv', 'a', newline='', encoding='gb18030') as csvfile:
            writer = csv.writer(csvfile)
            name = item['section_name']
            content = item['section_content']
            writer.writerow([name, content])
        return item
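
Reopening the CSV for every yielded item works, but it pays the open/close cost once per chapter. A common refinement (a sketch only; the per-book handle cache below is an addition, not part of the original project) opens each file once and closes it when the spider finishes:

import csv


class ScrapytestPipeline(object):
    # Sketch: cache one open file handle and writer per book instead of
    # reopening the CSV on every item. self.files / self.writers are
    # assumed names, not from the original code.
    def open_spider(self, spider):
        self.files = {}
        self.writers = {}

    def process_item(self, item, spider):
        book = item['book_name']
        if book not in self.files:
            f = open(book + '.csv', 'a', newline='', encoding='gb18030')
            self.files[book] = f
            self.writers[book] = csv.writer(f)
        self.writers[book].writerow([item['section_name'], item['section_content']])
        return item

    def close_spider(self, spider):
        for f in self.files.values():
            f.close()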
==============================
items.py
-------------------------
import scrapy


# Declare the fields (keys) the crawl will populate.
class BiqugeItem_detail(scrapy.Item):
    section_link = scrapy.Field()
    section_name = scrapy.Field()
    section_content = scrapy.Field()
    book_name = scrapy.Field()
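
Scrapy items behave like dicts but reject keys that were not declared as Fields, which catches typos at assignment time rather than at export. A quick illustration with hypothetical values:

section_data = BiqugeItem_detail()
section_data['book_name'] = '三国演义'  # fine: declared above
section_data['autor'] = 'x'            # KeyError: field not declared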
==============================
biquge.py
-------------------------
# -*- coding: utf-8 -*-
import scrapy

# Custom spider class, inheriting from scrapy.Spider.
from scrapytest.items import BiqugeItem_detail


class BiqugeSpider(scrapy.Spider):
    # Spider name, used to launch the crawl (scrapy crawl biquge).
    name = 'biquge'
    # Allowed domains keep the spider from wandering onto other sites.
    allowed_domains = ['xbiquge.la']
    # URL the crawl starts from.
    start_urls = ['http://www.xbiquge.la/xiaoshuodaquan/']

    # Extraction callback: receives the response passed back through the
    # downloader middleware.
    def parse(self, response):
        # Group the book links on the index page.
        li_list = response.xpath('//div[@class="novellist"]//a')
        i = 0
        for li in li_list:
            # Build a dict for this book.
            dict_data = {}
            # Scrapy's XPath selectors return results via extract() or
            # extract_first().
            dict_data['name'] = li.xpath('.//text()').extract_first()  # book title
            dict_data['link'] = li.xpath('.//@href').extract_first()   # book URL
            if i < 10:  # only download the first ten novels for now
                yield scrapy.Request(dict_data['link'],
                                     callback=self.parse_detail,
                                     meta={'dict_data': dict_data})
            i += 1  # book counter
    def parse_detail(self, response):
        book_name = response.meta['dict_data']['name']
        section_list = response.xpath('//*[@id="list"]/dl/dd/a')
        for section in section_list:
            # Create a fresh item per chapter. Reusing one instance across
            # requests is a classic Scrapy bug: every callback would see
            # the last-assigned link through the shared meta object.
            section_data = BiqugeItem_detail()
            section_data['book_name'] = book_name
            section_data['section_link'] = 'http://www.xbiquge.la' + section.xpath('./@href').extract_first()
            section_data['section_name'] = section.xpath('./text()').extract_first()
            # Do not time.sleep() here: it blocks Scrapy's event loop.
            # Throttle with DOWNLOAD_DELAY in settings.py instead.
            yield scrapy.Request(section_data['section_link'],
                                 callback=self.parse_content,
                                 meta={'section_data': section_data})
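
parse_content is referenced as a callback above but does not appear in this listing. A minimal sketch of what it has to do, assuming the chapter body sits in a //div[@id="content"] element (that selector is an assumption, not taken from the original):

    def parse_content(self, response):
        # Sketch of the missing callback; the '#content' XPath below is
        # an assumed selector for the chapter body.
        section_data = response.meta['section_data']
        section_data['section_content'] = ''.join(
            response.xpath('//div[@id="content"]//text()').extract())
        # Yielding the item hands it to ScrapytestPipeline.process_item.
        yield section_data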
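
Finally, the pipeline only runs if it is enabled in settings.py, and the throttling that time.sleep(1) tried to provide belongs there too. A sketch, assuming the project is named scrapytest as the imports suggest:

# settings.py (sketch)
ITEM_PIPELINES = {
    'scrapytest.pipelines.ScrapytestPipeline': 300,
}
DOWNLOAD_DELAY = 1  # per-request delay, replacing the original time.sleep(1)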