# -*- coding: utf-8 -*-
# @Time    : 2019/4/18 9:10
# @Author  : wujf
# @Email   : [email protected]
# @File    : mingyan_spider.py
# @Software: PyCharm
import scrapy


class mingyan(scrapy.Spider):
    """Spider that appends each scraped quote to a per-author text file.

    Starting from ``start_urls``, ``parse`` extracts every ``div.quote``
    element on the page, writes its text and tags to ``<author>-语录.txt``
    in the current working directory, then follows the "next page" link.
    """

    # Alternative way of declaring the start pages: build the requests
    # explicitly instead of listing them in ``start_urls``.
    #
    # name = "mingyan2"
    #
    # def start_requests(self):
    #     # Links to crawl.
    #     urls = [
    #         'http://lab.scrapyd.cn/page/1/',
    #         'http://lab.scrapyd.cn/page/2/'
    #     ]
    #     for url in urls:
    #         yield scrapy.Request(url=url, callback=self.parse)

    # The other (shorthand) way of writing it.
    name = 'itemSpider'
    start_urls = [
        'http://lab.scrapyd.cn'
    ]

    def parse(self, response):
        """Save the quotes found in *response* and follow pagination.

        Args:
            response: the downloaded page; it is queried with CSS selectors
                and used to resolve the relative "next page" link.

        Yields:
            scrapy.Request: a request for the next page (handled by this
            same method) when the page contains an ``li.next a`` link.
        """
        for quote in response.css('div.quote'):
            text = quote.css('.text::text').extract_first()
            if text is None:
                # Nothing to store; skip instead of failing on None + str,
                # which would also abort the pagination request below.
                continue

            # extract_first() returns None when nothing matches; fall back
            # to a fixed name rather than producing "None-语录.txt".
            author = quote.css('.author::text').extract_first() or 'unknown'
            tags = ','.join(quote.css('.tags .tag::text').extract())

            file_name = '%s-语录.txt' % author
            # Explicit UTF-8: the content is Chinese text and the platform
            # default encoding may be unable to represent it.
            with open(file_name, 'a', encoding='utf-8') as f:
                # The trailing newline keeps appended records from running
                # into each other when an author has several quotes.
                f.write(text + '\n' + '标签:' + tags + '\n')

        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            # urljoin() turns a relative path such as "page/1" into an
            # absolute URL, i.e. it prepends the site's domain.
            next_page = response.urljoin(next_page)
            # Yielding the request makes parse() a generator.
            yield scrapy.Request(next_page, callback=self.parse)

        # Alternative: save the raw page body instead of individual quotes.
        #
        # page = response.url.split('/')[-2]
        # filename = 'mingyan-%s.html' % page
        # with open(filename, 'wb') as f:
        #     f.write(response.body)
        # self.log('保存文件:%s' % filename)