这次,我们来爬取哔哩哔哩的热门视频,爬取其中的标题与详情页的同时观看人数和实时弹幕,并存入 MongoDB
打开其中一个详情页发现里面是动态的,用静态网页的方法是爬不出来的
这次我们用抓包的方式来完成此次任务,也就是用chrome自带的开发者工具,按F12使用
用 XHR 筛选请求,通过其中的 Response 等来找到自己所需要的数据,并从 Headers 里拿到它的 URL。因为我们要爬八个网址,所以要用一个通用的式子来表示它,用正则表达式来拼凑出来。
我们找到了三个网址
我们发现这个网页的url是进不去的,但是我们发现response里面有我们需要的东西,用response.body.decode()来进行提取,再用正则提取。
根据dm二字我们发现了弹幕的url,进去后发现弹幕都在里面,这个即是我们所需要的url。
这里response里面有我们需要的coin等要点,这样就找到了所需要的所有东西。
接下来的难点呢,就是怎样拼凑出我们所通用的表达式
这里的话具体参照下方代码,用正则表达式等方式,表达出来
spider代码如下
# -*- coding: utf-8 -*-
import scrapy
import json
from bilibili.items import BilibiliItem
import re
import time
import requests
class BlibiliSpider(scrapy.Spider):
    """Crawl Bilibili's popular videos: title, live viewer count, stats
    (likes / coins / favourites) and the danmaku (bullet-comment) stream.

    Flow: parse -> collection (stat API) -> watching (player API)
    -> barrage (danmaku XML), carrying a dict through ``response.meta``.
    """

    name = 'ganbei'
    # allowed_domains left disabled on purpose: the api.bilibili.com
    # requests below would otherwise be filtered out by the offsite middleware.
    # allowed_domains = ['www.bilibili.com']
    start_urls = ['https://www.bilibili.com/']

    def parse(self, response):
        """Take the first 8 popular videos and build the three API URLs each one needs."""
        listmain = response.xpath('//*[@id="reportFirst1"]/div[2]/div')[0:8]
        for each in listmain:
            item = {}
            url = ''.join(each.xpath('./div/a/@href').extract())
            urls = 'https:' + url
            item['title'] = ' '.join(each.xpath('./div/a/div/p[1]/text()').extract())
            # Numeric aid: the part of the detail-page URL after "/av".
            part_number = urls.split('/av')[1]
            # NOTE(review): a blocking requests.get inside a Scrapy callback
            # stalls the reactor; a chained scrapy.Request would be cleaner.
            page_html = requests.get(url=urls).text
            # The cid is embedded in the page's inline JSON; needed for the APIs below.
            need_part = ''.join(re.findall('","cid":(.*?),"', page_html, re.S))
            item['barrage_api'] = 'https://api.bilibili.com/x/v1/dm/list.so?oid=' + need_part
            item['collection_api'] = ('https://api.bilibili.com/x/web-interface/archive/'
                                      'stat?aid=' + part_number)
            item['watching_url'] = ('https://api.bilibili.com/x/player.so?id=cid%3A'
                                    + need_part + '&aid=' + part_number
                                    + '&buvid=D7512C54-9EB9-4D8A-ADF9-040A66C06A6C190950infoc')
            yield scrapy.Request(url=item['collection_api'], callback=self.collection,
                                 meta={'item': item})

    def collection(self, response):
        """Parse the stat API's JSON payload for likes, coins and favourites."""
        item = response.meta['item']
        # Guard against a missing/None "data" field (e.g. API error response),
        # which would otherwise raise AttributeError on .get().
        data = json.loads(response.text).get('data') or {}
        item['praise_number'] = data.get('like')
        item['coin_number'] = data.get('coin')
        item['collection'] = data.get('favorite')
        yield scrapy.Request(item['watching_url'], callback=self.watching,
                             meta={'item': item})

    def watching(self, response):
        """Extract the live viewer count from the player API's XML-ish body."""
        item = response.meta['item']
        body = response.body.decode()
        item['watching_people'] = ''.join(
            re.findall('<online_count>(.*?)</online_count>', body, re.S))
        yield scrapy.Request(url=item['barrage_api'], callback=self.barrage,
                             meta={'item': item})

    def barrage(self, response):
        """Parse the danmaku XML (<i><d p="...">text</d>...</i>) and emit the item."""
        item = response.meta['item']
        all_barrage = []
        for bang in response.xpath('/i/d'):
            content = ''.join(bang.xpath('./text()').extract())
            p_attr = ''.join(bang.xpath('./@p').extract())
            # Field 5 of the comma-separated "p" attribute is the unix
            # timestamp at which the danmaku was sent.
            sent_time = time.localtime(int(p_attr.split(',')[4]))
            stamp = time.strftime('%Y-%m-%d %H:%M:%S', sent_time)
            all_barrage.append(stamp + content)
        item['barrage'] = ''.join(all_barrage)
        yield BilibiliItem(
            title=item['title'],
            praise_number=item['praise_number'],
            coin_count=item['coin_number'],
            collection_number=item['collection'],
            barrage=item['barrage'],
            watching_people=item['watching_people'])
items.py
import scrapy
class BilibiliItem(scrapy.Item):
    """Container for one popular-video record scraped by the spider."""
    # define the fields for your item here like:
    title = scrapy.Field()              # video title from the front page
    watching_people = scrapy.Field()    # live viewer count (player API)
    barrage = scrapy.Field()            # concatenated "timestamp + text" danmaku entries
    praise_number = scrapy.Field()      # "like" count from the stat API
    coin_count = scrapy.Field()         # "coin" count from the stat API
    collection_number = scrapy.Field()  # "favorite" count from the stat API
pipelines
class BilibiliPipeline(object):
    """Default no-op pipeline: hands every item through unchanged."""

    def process_item(self, item, spider):
        # Nothing to transform; pass the item along to the next pipeline.
        return item
class CrawldataToMongoPipline(object):
    """Persist scraped items into MongoDB.

    NOTE(review): ``Mongoip``, ``MongoPort``, ``MongoDBname``, ``MongoItem``
    and ``MongoClient`` are not defined in this snippet — presumably they come
    from the project's settings module and ``pymongo``; confirm the imports
    exist in the full file.
    """

    def __init__(self):
        # Open the connection once per pipeline instance and keep only the
        # target collection around for writes.
        connection = MongoClient(host=Mongoip, port=MongoPort)
        database = connection[MongoDBname]
        self.post = database[MongoItem]

    def process_item(self, item, spider):
        """Write *item* to the collection and pass it on unchanged."""
        self.post.insert(dict(item))
        return item