Target site: the Daomubiji (盗墓笔记) novel site
Target URL: http://www.daomubiji.com/
Target content: information about the Daomubiji novels, specifically:
    book name
    chapter number
    chapter title
The results are saved in MongoDB.

####################################

Remember to flush redis before each run.
Added requirement: the full text of every chapter.

Add to settings:

    SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    SCHEDULER_PERSIST = True
    SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
    REDIS_URL = None
    REDIS_HOST = '127.0.0.1'
    REDIS_PORT = 6379

Add to items:

    text = Field()  # holds the full text of a chapter
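The notes above say the results go to MongoDB but do not show the pipeline itself. Below is a minimal sketch using pymongo; the database and collection names ('novel', 'daomubiji') and the class name are assumptions, not taken from the original notes.

    # pipelines.py -- minimal sketch; DB/collection names are assumed.
    import pymongo

    class MongoPipeline(object):
        def open_spider(self, spider):
            # Connect once per crawl.
            self.client = pymongo.MongoClient('127.0.0.1', 27017)
            self.collection = self.client['novel']['daomubiji']

        def close_spider(self, spider):
            self.client.close()

        def process_item(self, item, spider):
            self.collection.insert_one(dict(item))  # one document per chapter
            return item

To enable it, also add something like ITEM_PIPELINES = {'novelspider.pipelines.MongoPipeline': 300} to settings (the exact module path depends on the project layout).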
A version of the code found online, for reference:
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider
from scrapy.selector import Selector
from scrapy.http import Request

from novelspider.items import NovelspiderItem


class novelSpider(CrawlSpider):
    name = 'novelSpider'
    redis_key = 'novelSpider:start_urls'
    start_urls = ['http://www.daomubiji.com/']

    def parse(self, response):
        """Collect the links to the individual books from the Daomubiji homepage."""
        selector = Selector(response)
        section = selector.xpath('//article')
        bookUrls = section.xpath('p/a/@href').extract()
        print(bookUrls)
        for eachUrl in bookUrls:
            yield Request(eachUrl, callback=self.parse_news)

    def parse_news(self, response):
        """For a given book page, extract the book name, chapter number,
        chapter title and the chapter's URL."""
        selector = Selector(response)
        content = selector.xpath('/html/body/section/div[2]/div/article/a/text()').extract()
        urls = selector.xpath('/html/body/section/div[2]/div/article/a/@href').extract()

        bookName = content[0]
        # A few books break the usual "name num title" layout of the link text,
        # so they are flagged and parsed with shifted indices below.
        shahai_flg = bookName.split(' ')[0] == u'沙海1'
        mhgc_flg = bookName.split(' ')[0] == u'蛇沼鬼城(下)'

        for i, each in enumerate(content):
            item = NovelspiderItem()  # fresh item per chapter, so yielded items don't share state
            try:
                if shahai_flg:
                    item['bookName'] = each.split(' ')[0] + each.split(' ')[1]
                    item['chapterNum'] = each.split(' ')[2]
                    item['bookTitle'] = each.split(' ')[3]
                elif mhgc_flg:
                    item['bookName'] = u'迷海归巢' + each.split(' ')[0]
                    item['chapterNum'] = each.split(' ')[2]
                    item['bookTitle'] = each.split(' ')[3]
                else:
                    item['bookName'] = each.split(' ')[0]
                    item['chapterNum'] = each.split(' ')[1]
                    item['bookTitle'] = each.split(' ')[2]
                item['chapterURL'] = urls[i]
            except Exception:  # link text without enough parts: skip it
                continue
            yield item
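To make the flag logic above concrete: on most book pages the link text has three space-separated parts, which the default branch splits into bookName, chapterNum and bookTitle; the 沙海1 and 蛇沼鬼城(下) pages contain an extra space, hence the shifted indices. A quick illustration with a hypothetical link text (not taken from the site):

    # -*- coding: utf-8 -*-
    # Hypothetical link text in the common three-part format.
    each = u'盗墓笔记1 第一章 血尸'
    parts = each.split(' ')
    print(parts[0])  # bookName   -> 盗墓笔记1
    print(parts[1])  # chapterNum -> 第一章
    print(parts[2])  # bookTitle  -> 血尸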
# encoding=utf-8
import re

from scrapy.selector import Selector
from scrapy.http import Request
from scrapy_redis.spiders import RedisSpider

from novelspider.items import NovelspiderItem


class novSpider(RedisSpider):
    name = 'novelspider'
    redis_key = 'novelspider:start_urls'
    start_urls = ['http://www.daomubiji.com/dao-mu-bi-ji-1']

    def parse(self, response):
        selector = Selector(response)  # wraps the page source for XPath queries
        bookName = selector.xpath('//h1[@class="focusbox-title"]/text()').extract()[0]
        urls = selector.xpath('//article[@class="excerpt excerpt-c3"]/a/@href').extract()
        excerpts = selector.xpath('//article[@class="excerpt excerpt-c3"]/a/text()').extract()
        print(excerpts)

        for i in range(len(urls)):
            item = NovelspiderItem()  # one fresh item per chapter link
            item['bookName'] = bookName
            item['chapterURL'] = urls[i]
            try:
                item['bookTitle'] = excerpts[i].split(' ')[0]
                item['chapterNum'] = excerpts[i].split(' ')[1]
                item['chapterName'] = excerpts[i].split(' ')[2]
            except Exception:  # link text without enough parts: skip it
                continue
            # meta={'item': item} carries the partly filled item into parseContent
            yield Request(urls[i], callback=self.parseContent, meta={'item': item})

    def parseContent(self, response):
        selector = Selector(response)
        item = response.meta['item']  # response.meta is a dict, not a callable
        html = selector.xpath('//div[@class="content"]').extract()[0]
        textField = re.search('<article class="article-content">(.*?)</article>', html, re.S).group(1)
        text = re.findall('<p>(.*?)</p>', textField, re.S)
        fulltext = ''.join(text)  # join the paragraphs into the chapter's full text
        item['text'] = fulltext
        yield item
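Because novSpider is a RedisSpider, it idles until a start URL is pushed onto the redis list named by redis_key. A minimal sketch for flushing redis (per the note at the top) and seeding the queue with the redis-py client; host, port and URL match the settings and start_urls above:

    import redis

    r = redis.StrictRedis(host='127.0.0.1', port=6379)
    r.flushdb()  # "remember to flush redis each time"
    r.lpush('novelspider:start_urls', 'http://www.daomubiji.com/dao-mu-bi-ji-1')

After seeding, run the spider as usual with scrapy crawl novelspider.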