创建spider文件Bookchina.py:
import scrapy
from scrapy import Request
from MyScrapy.items import BookChina
class BookchinaSpider(scrapy.Spider):
    """Crawl book listing pages on bookschina.com, follow each book's
    cover link, and yield a BookChina item per detail page."""

    name = 'Bookchina'
    # allowed_domains = ['blog.jobbole.com/114261/']
    start_urls = ['http://www.bookschina.com/kinder/54290000/']

    def _follow_detail_links(self, response):
        """Yield a Request to bookDetail for every cover link on a listing page.

        Shared by parse() and categoryParse(), which previously duplicated
        this loop line-for-line.
        """
        for href in response.css('.cover a::attr(href)').extract():
            # hrefs are site-relative; prefix the host to build an absolute URL
            link = 'http://www.bookschina.com' + href
            yield Request(url=link, callback=self.bookDetail)

    def parse(self, response):
        """Default callback for start_urls: follow every book detail link."""
        yield from self._follow_detail_links(response)

    def categoryParse(self, response):
        """Callback for category listing pages; same link-following logic.

        Kept with its original name/signature for any external callers.
        """
        yield from self._follow_detail_links(response)

    def bookDetail(self, response):
        """Scrape a single book detail page into a BookChina item.

        Missing fields default to '' via extract_first('').
        """
        bookChina = BookChina()
        bookChina['name'] = response.css('.padLeft10 h1::text').extract_first('')
        bookChina['bookEditer'] = response.css('.author a::text').extract_first('')
        bookChina['publicShe'] = response.css('.publisher a::text').extract_first('')
        bookChina['publictime'] = response.css('.publisher i ::text').extract_first('')
        bookChina['bookPrice'] = response.css('.priceWrap .sellPrice::text ').extract_first('')
        yield bookChina
在items.py文件中加入类:
class BookChina(scrapy.Item):
    """Item holding the fields scraped from one bookschina.com detail page."""

    # Book title
    name = scrapy.Field()
    # Author/editor as listed on the page
    bookEditer = scrapy.Field()
    # Publisher name
    publicShe = scrapy.Field()
    # Publication date string
    publictime = scrapy.Field()
    # Selling price string
    bookPrice = scrapy.Field()
在pipelines.py中添加:
class Bookchina(object):
    """Item pipeline that appends every scraped item to book.json,
    one JSON object per line (JSON Lines format)."""

    def __init__(self):
        # Built-in open() with an explicit encoding replaces the legacy
        # codecs.open; 'w' truncates output from any previous run.
        self.file = open('book.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize *item* as one JSON line and pass it through unchanged."""
        # ensure_ascii=False keeps Chinese text readable in the output file.
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # Flush and release the file handle when the crawl ends.
        self.file.close()
在settings.py文件中设置管道:
ITEM_PIPELINES = {
    # The lower the number, the earlier the pipeline runs.
    'MyScrapy.pipelines.MyscrapyPipeline': 300,
    # 'MyScrapy.pipelines.jobbolePipeline': 250,
    # Scrapy's built-in image download pipeline
    # 'scrapy.pipelines.images.ImagesPipeline':200,
    # our own JSON-export pipeline (runs first)
    'MyScrapy.pipelines.Bookchina':100,
}
运行文件就可以看到在当前项目中创建了book.json文件,里面保存了爬取的内容。