Method One (just the idea)
The data is already there when the BILIBILI page first loads: it sits inside a JS object in the page source. So you can fetch the page, pull the JS object out with a regular expression, and then turn it into a dictionary.
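A minimal sketch of that idea, assuming the page embeds its data as something like window.__INITIAL_STATE__ = {...}; (the variable name and the regex are assumptions, check the real page source and adjust):

# -*- coding: utf-8 -*-
# Method One sketch: regex the embedded JS object out of the page source
# and json.loads it into a dict. __INITIAL_STATE__ is an assumed name.
import json
import re

import requests

html = requests.get('https://www.bilibili.com/').text
match = re.search(r'window\.__INITIAL_STATE__\s*=\s*(\{.*?\});', html, re.S)
if match:
    data = json.loads(match.group(1))  # the JS object is now a Python dict
    print(list(data.keys()))           # inspect what the page actually carries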
Method Two
# -*- coding: utf-8 -*-
import scrapy
import json
from pabz.items import PabzItem
import re
import time
import requests
from selenium import webdriver
from time import sleep


class BzSpider(scrapy.Spider):
    name = 'bz'
    # allowed_domains = ['www.com']
    start_urls = ['https://www.bilibili.com/']
    bro = webdriver.Chrome(executable_path=r'F:\爬虫包\通用爬虫selenium\chromedriver.exe')
    page = 1

    def parse(self, response):
        self.bro.get(response.url)
        sleep(2)
        # self.page is advanced in dm() once a batch has been fully processed
        while self.page < 3:
            # the hot-video entries on the homepage
            hot_url_list = response.xpath('//*[@id="reportFirst1"]/div[2]/div')[0:8]
            for url in hot_url_list:
                item = {}
                dic = {}  # dic carries the API urls through the request chain
                need_url = ''.join(url.xpath('./div/a/@href').extract())
                whole_url = 'https:' + need_url  # the full video url
                aid_number = need_url.split('av')[-1]
                # fetch the video page once to dig the cid out of its source
                cid = requests.get(url=whole_url).text
                need_cid = re.findall('","cid":(.*?),"', cid, re.M)
                need_cid = ''.join(need_cid)
                # danmaku (bullet-comment) api
                dm_api = 'https://api.bilibili.com/x/v1/dm/list.so?oid=' + need_cid
                # like / coin / favorite counts
                zan_api = 'https://api.bilibili.com/x/web-interface/archive/' + 'stat?aid=' + aid_number
                # number of viewers currently online
                online_url = 'https://api.bilibili.com/x/player.so?id=cid%3A' + need_cid + '&aid=' + aid_number + '&buvid=D7512C54-9EB9-4D8A-ADF9-040A66C06A6C190950infoc'
                dic['online_url'] = online_url
                dic['dm_api'] = dm_api
                dic['zan_api'] = zan_api
                yield scrapy.Request(whole_url, callback=self.title, meta={'dic': dic, 'item': item})
        self.bro.quit()

    # title
    def title(self, response):
        item = response.meta['item']
        dic = response.meta['dic']
        title = response.xpath('//*[@id="viewbox_report"]/h1/span/text()').extract()
        item['title'] = ''.join(title)
        # the backend requires this header, otherwise the request is rejected
        header = {
            'Access-Control-Allow-Origin': 'https://www.bilibili.com'
        }
        yield scrapy.Request(dic['online_url'], callback=self.online, headers=header, meta={'dic': dic, 'item': item})

    # get the number of viewers currently online
    def online(self, response):
        dic = response.meta['dic']
        item = response.meta['item']
        body = response.body.decode()
        online = re.findall('<online_count>(.*?)</online_count>', body, re.M)
        online_people = ''.join(online)
        item['online_people'] = online_people
        yield scrapy.Request(dic['zan_api'], callback=self.dianzan, meta={'dic': dic, 'item': item})

    # this callback collects the like / coin / favorite counts
    def dianzan(self, response):
        dic = response.meta['dic']
        item = response.meta['item']
        all_data = json.loads(response.text)
        detail_data = all_data.get('data')
        # likes
        dian_zan = detail_data.get('like')
        item['dian_zan'] = dian_zan
        # coins
        coins = detail_data.get('coin')
        item['coins'] = coins
        # favorites
        favorite = detail_data.get('favorite')
        item['collect'] = favorite
        # print(favorite)
        yield scrapy.Request(dic['dm_api'], callback=self.dm, meta={'dic': dic, 'item': item})

    def dm(self, response):
        dic = response.meta['dic']
        item = response.meta['item']
        d_list = response.xpath('/i/d')
        all = []  # collect the danmaku here and join them into the item at the end, so nothing gets overwritten
        for d in d_list:
            content = d.xpath('./text()').extract()
            content = ''.join(content)
            time_base = d.xpath('./@p').extract()
            str_time_base = ''.join(time_base)
            # the 5th comma-separated field of the p attribute is the unix send time
            unix_time = str_time_base.split(',')[4]
            unix_time = int(unix_time)
            x = time.localtime(unix_time)
            end_finish_time = time.strftime('%Y-%m-%d %H:%M:%S', x)
            all_dm_content = str(end_finish_time) + content
            all.append(all_dm_content)
        item['dm'] = ''.join(all)
        yield PabzItem(
            title=item['title'],
            zan=item['dian_zan'],
            coins=item['coins'],
            collect=item['collect'],
            screen_shoot=item['dm'],
            online_people=item['online_people'])
        self.page += 1
        # click the refresh button so the selenium browser shows the next batch of hot videos
        To_clik = self.bro.find_element_by_xpath('//*[@id="reportFirst1"]/div[2]/div[10]/i')
        To_clik.click()
        sleep(1)
My items.py
import scrapy


class PabzItem(scrapy.Item):
    title = scrapy.Field()
    zan = scrapy.Field()
    coins = scrapy.Field()
    collect = scrapy.Field()
    screen_shoot = scrapy.Field()
    online_people = scrapy.Field()
My pipelines.py
from pymongo import MongoClient  # connect to MongoDB with MongoClient
from pabz.settings import Mongoip, MongoDBname, MongoPort, MongoItem  # import the connection settings configured in settings.py (step one)


class PabzPipeline(object):
    def process_item(self, item, spider):
        return item


class CrawldataToMongoPipline(object):
    def __init__(self):
        host = Mongoip  # host
        port = MongoPort  # port
        dbName = MongoDBname  # database name
        client = MongoClient(host=host, port=port)  # create the client connection
        db = client[dbName]  # select the database, MongoDBname='mylove1'
        self.post = db[MongoItem]  # select the collection, MongoItem='PabzItem'

    def process_item(self, item, spider):
        dl_info = dict(item)  # convert the item to a plain dict
        self.post.insert_one(dl_info)  # write the item into MongoDB
        return item
My settings.py
Mongoip = '127.0.0.1'  # MongoDB node IP; 127.0.0.1 works, or look your own IP up with ipconfig, as long as a GUI client (e.g. Robo 3T) can open your local MongoDB
MongoPort = 27017  # port, usually 27017
MongoDBname = 'mylove1'  # database name
MongoItem = 'PabzItem'  # item / collection name
Finally, turn on the pipelines (also in settings.py):
ITEM_PIPELINES = {
    'pabz.pipelines.PabzPipeline': 300,
    'pabz.pipelines.CrawldataToMongoPipline': 301,  # the pipeline that writes into MongoDB
}
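With the pipelines enabled, the spider is started the usual Scrapy way from the project root (the project is called pabz and the spider is named bz, as defined above):

scrapy crawl bz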