Tencent Video Comment Crawler (Dynamic Pages)

The final lesson of Jikexueyuan's introductory crawler course:

Environment: Windows 10, Scrapy 1.6, MongoDB, Redis, Python 3.6

The comments cannot be found in the page's HTML source, so you need to capture the network requests; Chrome's F12 developer tools are enough for simple packet capture.
For Tencent Video, the comments come back inside a JS (JSONP) response:
(screenshot)

The URL shown in the screenshot below is the address that returns the comment data:
(screenshot)

Opening it looks like this (install the JSONView extension in Chrome to view JSON responses more comfortably):
(screenshot)

Without the extension the result looks like this (finding the information you need becomes much more tedious):
(screenshot)
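
If you would rather inspect the response from Python instead of the browser, here is a minimal sketch. It assumes the comment-list URL format used by the spider further down; comment_id is a hypothetical placeholder that has to be replaced with the id found during packet capture, and the unwrapping step assumes the body is JSONP wrapped in the callback named by the callback parameter.

import json
import re
import requests

comment_id = '1234567890'   # hypothetical id, use the one from your own packet capture

# Same endpoint format as comment_url in Comment.py below
url = ('https://video.coral.qq.com/varticle/%s/comment/v2'
       '?callback=_varticle%scommentv2&orinum=30') % (comment_id, comment_id)

resp = requests.get(url)
# The body is JSONP: callback_name( {...} ), strip the wrapper and parse the JSON inside
payload = re.search(r'\((.*)\)', resp.text, re.S).group(1)
data = json.loads(payload)
print(json.dumps(data, ensure_ascii=False, indent=2))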

main.py

from scrapy import cmdline
# Equivalent to running "scrapy crawl dmoz" on the command line ('dmoz' is the spider name defined in Comment.py)
cmdline.execute('scrapy crawl dmoz'.split())

settings.py

# -*- coding: utf-8 -*-

BOT_NAME = 'Comment'

SPIDER_MODULES = ['Comment.spiders']
NEWSPIDER_MODULE = 'Comment.spiders'

# Basic Redis settings (this Redis instance has no password and runs locally)
# These lines were added by hand; they are not needed if you are not using Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
REDIS_URL = None
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379

USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# MongoDB connection settings
# These lines were added by hand; they are not needed if you are not using MongoDB
MONGODE_HOST = '127.0.0.1'
MONGODE_PORT = 27017
MONGODE_DBNAME = 'Test'
MONGODE_DOCNAME = 'Book'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'Comment.middlewares.CommentSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'Comment.middlewares.CommentDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'Comment.pipelines.CommentPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
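
The MONGODE_* keys above are custom settings that the pipeline below reads. Before running the crawler it can be worth confirming that MongoDB is actually reachable with these values; a minimal standalone check, assuming a local unauthenticated mongod on the default port:

import pymongo

client = pymongo.MongoClient(host='127.0.0.1', port=27017, serverSelectionTimeoutMS=2000)
client.server_info()    # raises ServerSelectionTimeoutError if mongod is not reachable
print(client['Test'].list_collection_names())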

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class CommentItem(scrapy.Item):
    content = scrapy.Field()

pipelines.py

# -*- coding: utf-8 -*-

from Comment.items import CommentItem
from scrapy.conf import settings   # project settings (deprecated import; still works on Scrapy 1.6)
import pymongo                     # Python driver for MongoDB


class CommentPipeline(object):
    def __init__(self):
        # Initialisation: connect to MongoDB (no authentication is configured here)
        host = settings['MONGODE_HOST']
        port = settings['MONGODE_PORT']
        dbName = settings['MONGODE_DBNAME']
        client = pymongo.MongoClient(host=host, port=port)
        tdb = client[dbName]
        self.post = tdb[settings['MONGODE_DOCNAME']]

    def process_item(self, item, spider):
        bookInfo = dict(item)
        self.post.insert_one(bookInfo)   # insert() is deprecated in pymongo 3.x; insert_one() is the current API
        return item
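
The scrapy.conf import used above was already deprecated in Scrapy 1.x and is removed in Scrapy 2.x. If you want the pipeline to survive an upgrade, the usual pattern is to receive the settings through the from_crawler hook instead. A sketch of the same pipeline written that way (same MONGODE_* settings assumed):

import pymongo


class CommentPipeline(object):
    def __init__(self, host, port, db_name, doc_name):
        client = pymongo.MongoClient(host=host, port=port)
        self.post = client[db_name][doc_name]

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this hook and passes in the crawler, which carries the live settings
        s = crawler.settings
        return cls(s.get('MONGODE_HOST'), s.getint('MONGODE_PORT'),
                   s.get('MONGODE_DBNAME'), s.get('MONGODE_DOCNAME'))

    def process_item(self, item, spider):
        self.post.insert_one(dict(item))
        return item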

Comment.py (the core spider code)

from scrapy_redis.spiders import RedisSpider
from Comment.items import CommentItem
from scrapy.http import Request
import re


class Comment(RedisSpider):
    name = 'dmoz'
    redis_key = 'Comment:start_urls'
    start_urls = ['https://v.qq.com/x/cover/qviv9yyjn83eyfu/n0016ibg4eb.html']
    # 'sss' stands in for the comment id; replace() below swaps the real id in
    comment_url = 'https://video.coral.qq.com/varticle/sss/comment/v2?callback=_varticlessscommentv2&orinum=30'
    cid_url = 'https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&op=3&vid=%s'

    # Get the cid (srcid) from the video page
    def parse(self, response):
        cid = re.search('"srcid":"(.*?)","', response.body.decode('utf-8'), re.S).group(1)
        cidurl = self.cid_url % cid
        yield Request(cidurl, callback=self.parse_id)

    # Get the comment_id and build the comment-list URL
    def parse_id(self, response):
        id = re.search('"comment_id":"(.*?)","', response.body.decode('utf-8'), re.S).group(1)
        commenturl = self.comment_url.replace('sss', id)
        yield Request(commenturl, callback=self.parse_comment)

    # Extract the comment text from the JSONP response
    def parse_comment(self, response):
        comment = re.findall(r'"content":"(.*?)",', response.body.decode('utf-8'), re.S)
        for each in comment:
            item = CommentItem()
            item['content'] = each.encode('utf-8').decode('unicode_escape')
            yield item
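
One thing to watch: because the spider subclasses RedisSpider, the start_urls attribute is effectively ignored, and scrapy_redis waits for URLs to show up in the Redis list named by redis_key. So after starting the crawl with main.py, the seed URL still has to be pushed into Redis, either with redis-cli (lpush Comment:start_urls <url>) or from Python, for example:

import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379)
# Seed the scrapy_redis scheduler: push the start URL onto the list the spider reads from
r.lpush('Comment:start_urls',
        'https://v.qq.com/x/cover/qviv9yyjn83eyfu/n0016ibg4eb.html')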

This only crawls one page of comments for the video; for multi-page crawling, you can refer to: https://www.bilibili.com/video/av22833357/?p=33
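
The general idea of paging is to feed a cursor from each response back into the next request until the comments run out. Below is a rough sketch of what parse_comment in the spider above could look like with paging added; note that the cursor parameter and the "last" field are assumptions about this particular API, so verify them against the JSON you actually captured.

    def parse_comment(self, response):
        body = response.body.decode('utf-8')
        for each in re.findall(r'"content":"(.*?)",', body, re.S):
            item = CommentItem()
            item['content'] = each.encode('utf-8').decode('unicode_escape')
            yield item
        # Assumed pagination: the response appears to carry the id of the last comment,
        # which can be passed back as a cursor for the next page
        last = re.search(r'"last":"(.*?)"', body, re.S)
        if last and last.group(1):
            next_url = response.url.split('&cursor=')[0] + '&cursor=' + last.group(1)
            yield Request(next_url, callback=self.parse_comment)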

Result: (screenshot)

Reposted from blog.csdn.net/qq_40258748/article/details/88901695