1. Introduction
By default, Scrapy issues GET requests. So how do we send POST requests with Scrapy?
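Before the full project code, here is a minimal sketch of the two usual options (the URLs are just placeholders): FormRequest for form-encoded POST bodies, and a plain Request with method='POST' for a raw body.

import scrapy
from scrapy.http import FormRequest

class PostDemoSpider(scrapy.Spider):
    name = 'post_demo'

    def start_requests(self):
        # Option 1: FormRequest encodes the dict as
        # application/x-www-form-urlencoded and sends it as a POST.
        yield FormRequest('https://example.com/search',
                          formdata={'page': '1'},
                          callback=self.parse)
        # Option 2: a plain Request with an explicit method and raw body.
        yield scrapy.Request('https://example.com/api', method='POST',
                             body='{"page": 1}',
                             headers={'Content-Type': 'application/json'},
                             callback=self.parse)

    def parse(self, response):
        self.logger.info('Got %s', response.url)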
2. The main code is as follows
wb.py
import re
from urllib.parse import quote

import scrapy
from scrapy.http import FormRequest

from ..items import WeibospiderItem


class WbSpider(scrapy.Spider):
    name = 'wb'
    allowed_domains = ['weibo.cn']
    start_url = 'https://weibo.cn/search/mblog'
    # Maximum page number
    max_page = 100

    # By default, Scrapy sends GET requests. The goal here is to make the
    # initial request a POST, which means overriding start_requests().
    def start_requests(self):
        # e.g. https://weibo.cn/search/mblog?keyword=%E5%91%A8%E6%9D%B0%E4%BC%A6
        key_word = '周杰伦'
        url = '{url}?keyword={kw}'.format(url=self.start_url, kw=quote(key_word))
        # Only the first page is requested here; widen the range to crawl more.
        for page_num in range(1, 2):
            form_data = {
                'mp': str(self.max_page),
                'page': str(page_num)
            }
            # FormRequest is the class for building POST requests.
            request = FormRequest(url, formdata=form_data, callback=self.parse_list_page)
            yield request
    def parse_list_page(self, response):
        """
        Parse the detail-page URLs from the list page: detail URLs of
        reposted weibos and detail URLs of original weibos.
        :param response:
        :return:
        """
        # Combined XPath query: both conditions must hold at once
        weibo_div = response.xpath('//div[@class="c" and contains(@id, "M_")]')
        for weibo in weibo_div:
            # Distinguish original weibos from reposts
            has_cmt = weibo.xpath('.//span[@class="cmt"]').extract_first('')
            if has_cmt:
                # A span[@class="cmt"] was found, so this is a repost.
                # In the XPath predicate, "." refers to the element's text.
                detail_url = weibo.xpath('.//a[contains(., "原文评论[")]/@href').extract_first('')
            else:
                # Not found, so this is an original weibo.
                detail_url = weibo.xpath('.//a[contains(., "评论[")]/@href').extract_first('')
            # e.g. detail_url = 'https://weibo.cn/comment/GoNKlkzpR?rl=1#cmtfrm'
            # Build the detail-page request
            yield scrapy.Request(detail_url, callback=self.parse_detail_page)
    def parse_detail_page(self, response):
        """
        Parse the data on the detail page.
        :param response:
        :return:
        """
        wb_url = wb_id = wb_user = wb_content = wb_publish_time = wb_comment_num = wb_vote_up_num = wb_zf_num = ''
        if 'page' not in response.url:
            # URL of the weibo detail page,
            # e.g. https://weibo.cn/comment/Gj1SC2zrS?rl=1#cmtfrm
            wb_url = response.url
            # Weibo ID. \? strips the special meaning of ? in the regex and
            # matches it as a literal character.
            # group()  -> 'comment/Gj1SC2zrS?'
            # groups() -> ('Gj1SC2zrS',)
            # group(1) -> 'Gj1SC2zrS'
            wb_id = re.search(r'comment/(.*?)\?', wb_url).group(1)
            # Nickname of the posting user, and the post content
            header_div = response.xpath('//div[@id="M_"]/div[1]')[0]
            wb_user = header_div.xpath('./a/text()').extract_first('')
            wb_content = header_div.xpath('./span[@class="ctt"]/text()').extract_first('').strip().lstrip(':')
            # Publish time of the weibo
            wb_publish_time = response.xpath('//div[@id="M_"]/div[2]/span[@class="ct"]/text()').extract_first('')
            # Comment, upvote and repost counts
            wb_comment_num = response.xpath('//span[@class="pms" and contains(., "评论")]/text()').re_first(
                r'评论\[(.*?)\]', default=0)
            wb_vote_up_num = response.xpath('//a[contains(., "赞[")]/text()').re_first(r'赞\[(.*?)\]', default=0)
            wb_zf_num = response.xpath('//a[contains(., "转发")]/text()').re_first(r'转发\[(.*?)\]', default=0)
        # Collect all comment texts, if the weibo has any comments
        wb_comments = []
        if wb_comment_num != 0:
            try:
                wb_id = response.meta['wb_id']
            except KeyError:
                pass
            # There are comments
            wb_comment = response.xpath('//div[@class="c" and contains(@id, "C_")]/span[@class="ctt"]/text()').extract()
            for comment in wb_comment:
                wb_comments.append(comment.strip())
        # Follow the next page of comments
        next_page_obj = response.xpath('//a[contains(., "下页")]')
        if next_page_obj:
            next_page_url = 'https://weibo.cn' + next_page_obj.xpath('./@href').extract_first()
            yield scrapy.Request(next_page_url, callback=self.parse_detail_page, meta={'wb_id': wb_id})
        item = WeibospiderItem()
        item['wb_url'] = wb_url
        item['wb_id'] = wb_id
        item['wb_user'] = wb_user
        item['wb_content'] = wb_content
        item['wb_publish_time'] = wb_publish_time
        item['wb_comment_num'] = wb_comment_num
        item['wb_vote_up_num'] = wb_vote_up_num
        item['wb_zf_num'] = wb_zf_num
        item['wb_comments'] = wb_comments
        yield item
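With start_requests() overridden, the spider is started the usual way. Assuming the project is named weibospider (as the import paths below suggest), run from the project root:

scrapy crawl wb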
items.py
import scrapy


class WeibospiderItem(scrapy.Item):
    wb_url = scrapy.Field()
    wb_id = scrapy.Field()
    wb_user = scrapy.Field()
    wb_content = scrapy.Field()
    wb_publish_time = scrapy.Field()
    wb_comment_num = scrapy.Field()
    wb_vote_up_num = scrapy.Field()
    wb_zf_num = scrapy.Field()
    wb_comments = scrapy.Field()
middlewares.py
import json
import logging

import requests


# Custom middleware for weibo requests
class WeiBoMiddleWare(object):
    def __init__(self, cookies_pool_url):
        self.logging = logging.getLogger("WeiBoMiddleWare")
        self.cookies_pool_url = cookies_pool_url

    def get_random_cookies(self):
        try:
            response = requests.get(self.cookies_pool_url)
        except Exception as e:
            self.logging.info('Get Cookies failed: {}'.format(e))
        else:
            # In a middleware, the cookies attached to a request must be a
            # dict; a raw cookie string cannot be set directly.
            cookies = json.loads(response.text)
            self.logging.info('Get Cookies success: {}'.format(response.text))
            return cookies

    @classmethod
    def from_settings(cls, settings):
        obj = cls(
            cookies_pool_url=settings['WEIBO_COOKIES_URL']
        )
        return obj

    # process_request() is called many times: every request passes through it
    # on its way to the downloader.
    def process_request(self, request, spider):
        request.cookies = self.get_random_cookies()
        return None

    def process_response(self, request, response, spider):
        """
        Handle the response of this request.
        :param request:
        :param response:
        :param spider:
        :return:
        """
        # When requesting pages with cookies attached, the cookies may have
        # expired. A failed request shows up in one of two ways:
        # 1. a 302 redirect to the login page; 2. a verification page.
        # To intercept redirects here, redirects must be disabled in settings.
        if response.status in [302, 301]:
            # On a redirect, read the redirect target.
            # Header values are bytes, so decode before comparing with strings.
            redirect_url = response.headers['location'].decode('utf-8')
            print(redirect_url)
            # if 'passport' in redirect_url:
            #     # Redirected to the login page: the cookies have expired.
            #     self.logging.info('Cookies invalid!')
            # if '验证页面' in redirect_url:
            #     # The cookies are still usable; this is account-level
            #     # anti-scraping, so verification is required.
            #     self.logging.info('Current cookies need verification.')
            # A redirect means this request failed: fetch fresh cookies and
            # retry the same request.
            request.cookies = self.get_random_cookies()
            # Returning a request stops the remaining response middlewares and
            # puts the request back into the scheduler's queue to be retried.
            return request
        # No redirect: pass the response on to the remaining middlewares.
        return response
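get_random_cookies() assumes the cookie pool service at WEIBO_COOKIES_URL responds with a JSON object mapping cookie names to values, which json.loads() turns into the dict that request.cookies expects. A rough sketch of that assumption (cookie names and values are illustrative):

import json
import requests

# The pool endpoint (WEIBO_COOKIES_URL in settings) is assumed to return
# something like: {"SUB": "_2A25...", "SUHB": "0bt9...", "_T_WM": "6f1e..."}
response = requests.get('http://localhost:5000/weibo/random')
cookies = json.loads(response.text)
# cookies is now a plain dict, suitable for request.cookies
print(type(cookies), cookies)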
pipelines.py
import logging
import re
import time
from datetime import datetime

import pymongo

from .items import WeibospiderItem


class ConvertDatetimePipeline(object):
    """
    Weibo publish times come in three formats:
    1. Within the last hour:             5分钟前 (5 minutes ago)
    2. Over 1 hour but within 24 hours:  今天 13:15 (today 13:15)
    3. Over 24 hours ago:                07月10日 23:17 (July 10, 23:17)
    """
    # Target format: 2018-07-10 13:50
    def convert_time(self, wb_time):
        res = ''
        if re.search(r'\d+月\d+日', wb_time):
            dt = re.search(r'(\d+)月(\d+)日 (\d+):(\d+)', wb_time)
            month = dt.group(1)
            day = dt.group(2)
            h = dt.group(3)
            m = dt.group(4)
            res = str(datetime.now().year) + '-' + month + '-' + day + ' ' + h + ':' + m
        elif re.search(r'\d+分钟前', wb_time):
            minute = re.search(r'(\d+)分钟前', wb_time).group(1)
            # Convert the minutes to seconds; the current timestamp minus those
            # seconds is the timestamp of N minutes ago.
            # strptime(string, format) parses a string, e.g. '%Y-%m-%d' for
            # '2018-07-10' or '%Y/%m/%d' for '2018/07/10'.
            # strftime() formats a time tuple into a string using the given
            # (freely chosen) format.
            res = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time() - float(minute) * 60))
        elif re.search('今天', wb_time):
            # Note: a non-greedy (.*?) at the end of a pattern matches the
            # empty string, so a greedy (.*) is needed to capture the time.
            dt = re.search(r'今天(.*)', wb_time).group(1).strip()
            res = time.strftime('%Y-%m-%d', time.localtime()) + ' ' + dt
        return res

    def process_item(self, item, spider):
        # Convert the publish time
        if isinstance(item, WeibospiderItem):
            dt = item['wb_publish_time']
            item['wb_publish_time'] = self.convert_time(dt)
        return item


class WeibospiderPipeline(object):
    def __init__(self, mongo_url, mongo_db):
        self.logging = logging.getLogger("WeiBoMiddleWare")
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db

    @classmethod
    def from_settings(cls, settings):
        obj = cls(
            mongo_url=settings['MONGO_URL'],
            mongo_db=settings['MONGO_DB']
        )
        return obj

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Before storing an item, check whether a document with this wb_id
        # already exists. If not, insert the whole item; otherwise append the
        # new wb_comments to the existing document.
        # find_one() looks up a single document by field.
        obj = self.db['wb'].find_one({"wb_id": item['wb_id']})
        if obj:
            # Append each parsed comment to the document's wb_comments
            for comment in item['wb_comments']:
                obj['wb_comments'].append(comment)
            # Write the updated document back to the database.
            # (save() is deprecated in recent pymongo; replace_one() is the
            # equivalent operation.)
            self.db['wb'].replace_one({'_id': obj['_id']}, obj)
        else:
            self.db['wb'].update_one({'wb_id': item['wb_id']}, {'$set': dict(item)}, upsert=True)
        return item


if __name__ == '__main__':
    print(datetime.now())
    # time.localtime() returns a time.struct_time, e.g.
    # time.struct_time(tm_year=2018, tm_mon=7, tm_mday=11, tm_hour=14,
    # tm_min=36, tm_sec=40, tm_wday=2, tm_yday=192, tm_isdst=0).
    # Its argument is a timestamp in seconds; with no argument it defaults
    # to time.time().
    print(time.localtime())
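A quick sanity check of convert_time() against the three input formats; the expected outputs assume, hypothetically, that the check runs on 2018-07-11 at 14:36:

converter = ConvertDatetimePipeline()
print(converter.convert_time('07月10日 23:17'))  # -> '2018-07-10 23:17'
print(converter.convert_time('5分钟前'))          # -> '2018-07-11 14:31'
print(converter.convert_time('今天 13:15'))       # -> '2018-07-11 13:15'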
settings.py
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'weibo.cn',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
}

DOWNLOADER_MIDDLEWARES = {
    'weibospider.middlewares.WeiBoMiddleWare': 543,
}

# URL of the weibo cookie pool service
WEIBO_COOKIES_URL = 'http://localhost:5000/weibo/random'

# Disable redirects, so 302 responses reach the middleware's process_response()
REDIRECT_ENABLED = False

# Database configuration
MONGO_URL = 'localhost'
MONGO_DB = 'weibo'

ITEM_PIPELINES = {
    'weibospider.pipelines.ConvertDatetimePipeline': 300,
    'weibospider.pipelines.WeibospiderPipeline': 301,
}
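For reference, once both pipelines have run, a document in the wb collection looks roughly like this (all values illustrative):

{
    'wb_url': 'https://weibo.cn/comment/Gj1SC2zrS?rl=1#cmtfrm',
    'wb_id': 'Gj1SC2zrS',
    'wb_user': '...',
    'wb_content': '...',
    'wb_publish_time': '2018-07-10 13:50',
    'wb_comment_num': '25',
    'wb_vote_up_num': '10',
    'wb_zf_num': '3',
    'wb_comments': ['...', '...'],
}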