今天用scrapy试着爬一下bqg的小说
先是检查网页,确定不是动态加载数据,文章标题和内容url直接就可以获取到
0.创建一个新的项目
scrapy startproject try_1
cd try_1
scrapy genspider santi www.xxx.com
1. spider 部分
import scrapy
from items import TrySItem2
import re
class SantiSpider(scrapy.Spider):
    """Spider that crawls chapter titles and bodies from 52bqg book 88879.

    ``parse`` walks the table of contents and yields one request per
    chapter; ``parse_content`` extracts the chapter body and yields the
    completed item.
    """
    name = 'santi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.52bqg.com/book_88879/']

    def parse(self, response):
        """Parse the book's table of contents page.

        Yields one ``scrapy.Request`` per chapter, carrying a partially
        filled item in ``meta`` for ``parse_content`` to complete.
        """
        list_ = response.xpath('//*[@id="list"]/dl/dd')
        for dd in list_:
            # BUG FIX: create a fresh item for every chapter. The original
            # created a single item before the loop and shared it across all
            # requests via meta, so concurrent responses overwrote each
            # other's 'title' field.
            items = TrySItem2()
            items['title'] = dd.xpath('./a/text()').get()
            # Get the (relative) chapter url
            d_url = dd.xpath('./a/@href').get()
            # Join it against the current page url (handles both relative
            # and absolute hrefs, unlike manual string formatting)
            d_url = response.urljoin(d_url)
            yield scrapy.Request(url=d_url, callback=self.parse_content,
                                 meta={'items': items})
            break  # crawl only one chapter while testing (avoid a ban); remove to crawl all

    def parse_content(self, response):
        """Extract the chapter body and yield the finished item."""
        items = response.meta['items']
        content = response.xpath('//*[@id="content"]//text()').getall()
        items['content'] = ''.join(content)
        yield items
2. items 部分
class TrySItem2(scrapy.Item):
    """Container for one crawled chapter: its title and body text."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()    # chapter title (from the TOC link text)
    content = scrapy.Field() # full chapter body (joined text nodes)
3. pipelines 部分
class TryS_2_Pipeline(object):
    """Item pipeline that appends each crawled chapter to ``santi.txt``.

    The output file is opened once per crawl in ``open_spider`` and
    closed in ``close_spider``.
    """

    santi = None  # output file handle; set in open_spider

    def open_spider(self, spider):
        """Open the output file once, when the spider starts."""
        self.santi = open('./santi.txt', 'w', encoding='utf-8')

    def process_item(self, items, spider):
        """Write one chapter (title then content) to the output file.

        Returns the item unchanged so later pipelines can still see it.
        """
        # BUG FIX: xpath .get() may return None when a selector misses;
        # the original then crashed with TypeError on string concatenation.
        title = items['title'] or ''
        content = items['content'] or ''
        self.santi.write(title + '\n' + content + '\n')
        print(title)  # simple progress indicator
        return items

    def close_spider(self, spider):
        """Close the output file when the crawl finishes."""
        self.santi.close()
4.settings 部分
主要改了这么几个参数
# Send a real browser user agent with every request (avoids trivial UA blocking)
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36'
# Do not honour robots.txt for this one-off crawl
ROBOTSTXT_OBEY = False
# Only show errors so the console stays readable
LOG_LEVEL = 'ERROR'
# Wait 2 seconds between requests to avoid getting banned
DOWNLOAD_DELAY = 2
ITEM_PIPELINES = {
    # FIX: the project was created as try_1 (scrapy startproject try_1),
    # so the pipeline module path must be try_1.pipelines, not try_2.pipelines
    'try_1.pipelines.TryS_2_Pipeline': 300,
}
5. 最后为了方便运行创建一个 start.py 文件
from scrapy import cmdline

# Convenience launcher: equivalent to running `scrapy crawl santi` in a shell.
# Guarded so that merely importing this module does not start a crawl.
if __name__ == '__main__':
    cmdline.execute('scrapy crawl santi'.split())