How to send POST requests with Scrapy? (Weibo as an example)

1. Introduction

By default, Scrapy issues GET requests. So how do we send POST requests with Scrapy?
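
The short answer is to override the spider's start_requests() method and yield scrapy.FormRequest (the request class Scrapy provides for form/POST submissions) instead of letting the framework issue its default GET requests. Below is a minimal sketch of just that idea; the spider name and the httpbin URL are placeholders for illustration only. The full Weibo project follows in the next section.

import scrapy


class MinimalPostSpider(scrapy.Spider):
    # illustrative spider, separate from the Weibo project below
    name = 'minimal_post'

    def start_requests(self):
        # FormRequest encodes formdata as an application/x-www-form-urlencoded POST body
        yield scrapy.FormRequest(
            url='https://httpbin.org/post',              # placeholder endpoint that echoes the POST body
            formdata={'keyword': 'test', 'page': '1'},   # all values must be strings
            callback=self.parse,
        )

    def parse(self, response):
        self.logger.info(response.text)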

2. The main code is as follows

wb.py

import scrapy, re
from urllib.parse import quote
from scrapy.http import FormRequest
from ..items import WeibospiderItem


class WbSpider(scrapy.Spider):
    name = 'wb'
    allowed_domains = ['weibo.cn']
    start_url = 'https://weibo.cn/search/mblog'
    # maximum page number
    max_page = 100

    # By default, Scrapy uses GET requests. Goal: send the request for the initial URL as a POST.
    # This requires overriding the start_requests() method.
    def start_requests(self):
        # https://weibo.cn/search/mblog?keyword=%E5%91%A8%E6%9D%B0%E4%BC%A6
        key_word = '周杰伦'  # search keyword (Jay Chou)
        url = '{url}?keyword={kw}'.format(url=self.start_url, kw=quote(key_word))

        for page_num in range(1, 2):
            form_data = {
                'mp': str(self.max_page),
                'page': str(page_num)
            }
            # FormRequest() is the class used to build POST requests.
            request = FormRequest(url, formdata=form_data, callback=self.parse_list_page)
            yield request

    def parse_list_page(self, response):
        """
        解析列表页的url, : 转发微博的详情url,原创微博的详情url
        :param response:
        :return:
        """
        # combined XPath query: both conditions must hold at the same time
        weibo_div = response.xpath('//div[@class="c" and contains(@id, "M_")]')
        for weibo in weibo_div:
            # distinguish original weibos from reposts
            has_cmt = weibo.xpath('.//span[@class="cmt"]').extract_first('')
            if has_cmt:
                # span[@class="cmt"] was found, so this is a reposted weibo
                # @class and @id select attributes
                # . matches the element's text content
                detail_url = weibo.xpath('.//a[contains(., "原文评论[")]/@href').extract_first('')
            else:
                # not found, so this is an original weibo
                detail_url = weibo.xpath('.//a[contains(., "评论[")]/@href').extract_first('')

            # detail_url = 'https://weibo.cn/comment/GoNKlkzpR?rl=1#cmtfrm'
            # build the request for the detail page
            if detail_url:
                yield scrapy.Request(detail_url, callback=self.parse_detail_page)

    def parse_detail_page(self, response):
        """
        解析详情页的数据
        :param response:
        :return:
        """
        wb_url = wb_id = wb_user = wb_content = wb_publish_time = wb_comment_num = wb_vote_up_num = wb_zf_num = ''
        if 'page' not in response.url:
            # URL of the weibo detail page
            # https://weibo.cn/comment/Gj1SC2zrS?rl=1#cmtfrm
            wb_url = response.url
            # weibo ID
            # \? escapes the ? so it is matched as a literal character instead of a quantifier
            # group(0): 'comment/Gj1SC2zrS?'
            # groups(): ('Gj1SC2zrS',)
            # group(1): 'Gj1SC2zrS'
            wb_id = re.search(r'comment/(.*?)\?', wb_url).group(1)

            # nickname of the posting user and the post content
            header_div = response.xpath('//div[@id="M_"]/div[1]')[0]
            wb_user = header_div.xpath('./a/text()').extract_first('')
            wb_content = header_div.xpath('./span[@class="ctt"]/text()').extract_first('').strip().lstrip(':')

            # publish time of the weibo
            wb_publish_time = response.xpath('//div[@id="M_"]/div[2]/span[@class="ct"]/text()').extract_first('')

            # comment, like (vote-up) and repost counts of the weibo
            wb_comment_num = response.xpath('//span[@class="pms" and contains(., "评论")]/text()').re_first(
                r'评论\[(.*?)\]', default=0)
            wb_vote_up_num = response.xpath('//a[contains(., "赞[")]/text()').re_first(r'赞\[(.*?)\]', default=0)
            wb_zf_num = response.xpath('//a[contains(., "转发")]/text()').re_first(r'转发\[(.*?)\]', default=0)

        # collect all the comment texts: check whether there are any comments
        wb_comments = []
        if wb_comment_num != 0:
            # on paginated comment pages, wb_id is carried over via the request meta
            wb_id = response.meta.get('wb_id', wb_id)

            # there are comments
            wb_comment = response.xpath('//div[@class="c" and contains(@id, "C_")]/span[@class="ctt"]/text()').extract()
            for comment in wb_comment:
                res = comment.strip()
                wb_comments.append(res)

            # get the next page of comments
            next_page_obj = response.xpath('//a[contains(., "下页")]')
            if next_page_obj:
                next_page_url = 'https://weibo.cn' + next_page_obj.xpath('./@href').extract_first()
                yield scrapy.Request(next_page_url, callback=self.parse_detail_page, meta={'wb_id': wb_id})

        item = WeibospiderItem()
        item['wb_url'] = wb_url
        item['wb_id'] = wb_id
        item['wb_user'] = wb_user
        item['wb_content'] = wb_content
        item['wb_publish_time'] = wb_publish_time
        item['wb_comment_num'] = wb_comment_num
        item['wb_vote_up_num'] = wb_vote_up_num
        item['wb_zf_num'] = wb_zf_num
        item['wb_comments'] = wb_comments

        yield item


items.py

import scrapy


class WeibospiderItem(scrapy.Item):
    wb_url = scrapy.Field()
    wb_id = scrapy.Field()
    wb_user = scrapy.Field()
    wb_content = scrapy.Field()
    wb_publish_time = scrapy.Field()
    wb_comment_num = scrapy.Field()
    wb_vote_up_num = scrapy.Field()
    wb_zf_num = scrapy.Field()
    wb_comments = scrapy.Field()


middlewares.py


import requests
import logging
import json
# custom downloader middleware for Weibo requests
class WeiBoMiddleWare(object):
    def __init__(self, cookies_pool_url):
        self.logging = logging.getLogger("WeiBoMiddleWare")
        self.cookies_pool_url = cookies_pool_url

    def get_random_cookies(self):
        try:
            response = requests.get(self.cookies_pool_url)
        except Exception as e:
            self.logging.info('Get Cookies failed: {}'.format(e))
        else:
            # In a middleware, the cookies set on a request must be a dict; a raw cookie string cannot be assigned directly.
            cookies = json.loads(response.text)
            self.logging.info('Get Cookies success: {}'.format(response.text))
            return cookies

    @classmethod
    def from_settings(cls, settings):
        obj = cls(
            cookies_pool_url=settings['WEIBO_COOKIES_URL']
        )
        return obj

    # process_request() is called once for every request before it is handed to the downloader
    def process_request(self, request, spider):
        request.cookies = self.get_random_cookies()
        return None

    def process_response(self, request, response, spider):
        """
        Process the response to this request.
        :param request:
        :param response:
        :param spider:
        :return:
        """
        # When requesting pages with cookies attached, the cookies may have expired. A failed request
        # shows up in two ways: 1. a 302 redirect to the login page; 2. a redirect to a verification page.

        # To intercept redirect responses here, redirects must be disabled in settings (REDIRECT_ENABLED = False).
        if response.status in [302, 301]:
            # a redirect occurred: get the redirect target
            redirect_url = response.headers['location'].decode('utf-8')  # header values are bytes in Scrapy
            print(redirect_url)
            # if 'passport' in redirect_url:
            #     # redirected to the login page: the cookie has expired
            #     self.logging.info('Cookies invalid!')
            # if '验证页面' in redirect_url:
            #     # the cookie is still usable; this is per-account anti-crawling verification
            #     self.logging.info('Current cookie requires verification before it can be used.')

            # A redirect means this request failed: get a new cookie and retry the request.
            request.cookies = self.get_random_cookies()
            # Returning a request stops the remaining response middlewares and puts the request back
            # into the scheduler queue to be requested again.
            return request

        # No redirect: pass the response on to the remaining middlewares.
        return response
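
The middleware above assumes that the cookie-pool service behind WEIBO_COOKIES_URL returns a JSON object mapping cookie names to values, because request.cookies must be a dict. A rough sketch of that assumption (the cookie names and values here are made up):

import json

# hypothetical body returned by http://localhost:5000/weibo/random
raw = '{"SUB": "xxxxxxxx", "SUHB": "yyyyyyyy", "_T_WM": "zzzzzzzz"}'

cookies = json.loads(raw)   # becomes a plain dict: {'SUB': 'xxxxxxxx', ...}
# the middleware then assigns it in process_request(): request.cookies = cookies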


pipelines.py


import logging, pymongo, re, time
from datetime import datetime
from .items import WeibospiderItem


class ConvertDatetimePipeline(object):
    """
    1. 1小时以内: 5 分钟前
    2. 超过1小时,不超过24小时的:今天 13:15
    3. 超过24小时:07月10日 23:17
    """
    # 2018-07-10 13:50
    def convert_time(self, wb_time):
        res = ''
        if re.search(r'\d+月\d+日', wb_time):
            dt = re.search(r'(\d+)月(\d+)日 (\d+):(\d+)', wb_time)
            month = dt.group(1)
            day = dt.group(2)
            h = dt.group(3)
            m = dt.group(4)
            res = str(datetime.now().year) + '-' + month + '-' + day + ' ' + h + ':' + m
        elif re.search(r'\d+分钟前', wb_time):
            minute = re.search(r'(\d+)分钟前', wb_time).group(1)
            # convert the minutes to seconds; the current timestamp minus those seconds is the
            # timestamp of that many minutes ago
            # strptime('2018-07-10', '%Y-%m-%d') parses a time string into a struct_time
            # strftime() formats a time tuple into a time string using the given format
            res = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time() - float(minute) * 60))
        elif re.search('今天', wb_time):
            # the group must be greedy; a trailing non-greedy (.*?) would always match an empty string
            dt = re.search('今天(.*)', wb_time).group(1).strip()
            res = time.strftime('%Y-%m-%d', time.localtime()) + ' ' + dt

        return res

    def process_item(self, item, spider):
        # convert the publish time
        if isinstance(item, WeibospiderItem):
            dt = item['wb_publish_time']
            item['wb_publish_time'] = self.convert_time(dt)
        return item


class WeibospiderPipeline(object):
    def __init__(self, mongo_url, mongo_db):
        self.logging = logging.getLogger("WeiBoMiddleWare")
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db

    @classmethod
    def from_settings(cls, settings):
        obj = cls(
            mongo_url=settings['MONGO_URL'],
            mongo_db=settings['MONGO_DB']
        )
        return obj

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Each time an item is stored, first check whether a document with this wb_id already exists.
        # If not, insert the whole item; otherwise append wb_comments to the existing document.
        # find_one() looks up a single document by field
        obj = self.db['wb'].find_one({"wb_id": item['wb_id']})
        if obj:
            # iterate over the parsed comments and append them to obj['wb_comments']
            for comment in item['wb_comments']:
                obj['wb_comments'].append(comment)

            # after updating the object, save it again to sync it back to the database
            # to_save=obj sets the object to be saved (Collection.save() exists in pymongo 3.x; it was removed in 4.x)
            self.db['wb'].save(to_save=obj)
        else:
            self.db['wb'].update_one({'wb_id': item['wb_id']}, {'$set': dict(item)}, True)
        return item


if __name__ == '__main__':
    print(datetime.now())
    # localtime() --> time.struct_time(tm_year=2018, tm_mon=7, tm_mday=11, tm_hour=14, tm_min=36, tm_sec=40, tm_wday=2, tm_yday=192, tm_isdst=0)
    # The argument is a time in seconds; if omitted, the current timestamp time.time() is used.
    print(time.localtime())
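    # A quick, illustrative check of the three time formats handled by convert_time();
    # the exact output depends on when the script is run (assumes the pipeline class above is in scope).
    p = ConvertDatetimePipeline()
    print(p.convert_time('07月10日 23:17'))  # e.g. '2018-07-10 23:17' (year taken from datetime.now())
    print(p.convert_time('5分钟前'))          # the current time minus 5 minutes
    print(p.convert_time('今天 13:15'))       # today's date followed by ' 13:15'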


settings.py 


DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'weibo.cn',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
}
DOWNLOADER_MIDDLEWARES = {
   'weibospider.middlewares.WeiBoMiddleWare': 543,
}

# address of the Weibo cookie pool service
WEIBO_COOKIES_URL = 'http://localhost:5000/weibo/random'

# disable automatic redirect handling so 301/302 responses reach the downloader middleware
REDIRECT_ENABLED = False

# database configuration
MONGO_URL = 'localhost'
MONGO_DB = 'weibo'
ITEM_PIPELINES = {
    'weibospider.pipelines.ConvertDatetimePipeline': 300,
    'weibospider.pipelines.WeibospiderPipeline': 301,
}
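
For completeness, the spider is started from the project root with the usual scrapy crawl wb command; an equivalent launcher script (the file name run.py is only a suggestion) looks like this:

# run.py - illustrative launcher, equivalent to running `scrapy crawl wb` from the project root
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'wb'])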

 





Reposted from blog.csdn.net/qq_38661599/article/details/81009978