【项目小结】某B视频网站的爬虫实践

最近忽来兴致，准备做一个基于评论数据的NLP项目。选定了某B视频网站的评论数据，顺带准备把某B视频网站的数据爬虫也一起做了。关于登录验证的问题可以看我的博客 https://blog.csdn.net/CY19980216/article/details/89074771 ，不过目前的登录方式稍有不同：验证图片不太方便获取了，我尝试后觉得只能通过截图的方式才能拿到，如此鲁棒性较差；而且由于无法获取到原图的链接，还原的难度也增大了。好在目前的数据爬虫不需要登录，暂时就不去吃力不讨好地突破当前的验证码了。
基本上难度不大，目前先做了基于视频的数据获取。主要逻辑是遍历一个个av号视频，然后获取视频页面上的关于视频的各个信息以及视频下面的评论信息。仍然使用了selenium驱动，主要问题是很多元素的xpath定位可能在网页更新后会失效，不过我已经尽力写得鲁棒性较好了。
# -*- coding:UTF-8 -*-
# 作者: 囚生CY
# 最后更新: 20190715
# 转载请注明原作者, 禁止用于商业用途

import csv
import json
import os
import random
import re
import sys
import time

import numpy
import pandas
from bs4 import BeautifulSoup
from matplotlib import pyplot as plt
from PIL import Image
from requests import Session
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait


class BiliBili():
	def __init__(self,
		username="用户名",
		password="密码",
		userAgent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0",
	):
		"""Store the crawler configuration and make sure the output folders and CSV files exist.

		Args:
			username: Account name typed into the login form by login_20190408.
			password: Account password typed into the login form by login_20190408.
			userAgent: Value of the User-Agent header installed on the requests session.

		Side effects:
			Creates the Temp folder and the dated Video/User/Comment folders under the
			current working directory, and writes a header-only CSV into each dated
			folder when that CSV does not exist yet.
		"""
		# Constructor arguments
		self.username = username
		self.password = password
		self.userAgent = userAgent

		# Commonly used settings
		self.workspace = os.getcwd()  # working directory of this instance
		self.date = time.strftime("%Y%m%d")  # construction date, used in folder and file names
		self.labelCompiler = re.compile(r"<[^>]+>",re.S)  # matches any markup tag so it can be stripped
		self.tempFolder = "Temp"  # folder for temporary files and logs
		self.videoFolder = "Video"  # folder for video data
		self.userFolder = "User"  # folder for user data
		self.commentFolder = "Comment"  # folder for comment data
		self.log = "{}.log".format(self.date)  # log file name
		# NOTE: paths are assembled with Windows separators; the parse_* methods build
		# their file names from these attributes in the same way, so the format is kept.
		self.videoPath = "{}\\{}\\{}".format(self.workspace,self.videoFolder,self.date)
		self.userPath = "{}\\{}\\{}".format(self.workspace,self.userFolder,self.date)
		self.commentPath = "{}\\{}\\{}".format(self.workspace,self.commentFolder,self.date)
		self.mainURL = "https://www.bilibili.com/"  # home page
		self.loginURL = "https://passport.bilibili.com/login"  # login page
		self.videoURL = "https://www.bilibili.com/video/av{}/"  # video page template (av number)
		self.userURL = "https://space.bilibili.com/{}"  # user space template (user ID)
		self.options = webdriver.FirefoxOptions()  # Firefox driver options
		self.headers = {"User-Agent": userAgent}
		self.session = Session()
		self.videoField = [  # columns of the video CSV
			"av",  # av number
			"title",  # video title
			"up",  # uploader nickname
			"follower",  # uploader follower count
			"playback_volume",  # play count
			"barrage",  # bullet-comment (danmaku) count
			"like",  # like count
			"coin",  # coin count
			"collect",  # favourite count
			"comment",  # comment count
			"comment_page",  # number of comment pages
			"category",  # video category
			"tags",  # video tags, separated by "|"
			"timestamp",  # time the data was scraped
		]
		self.userField = [  # columns of the user CSV
			"id",  # user ID
			"name",  # nickname
			"gender",  # gender
			"level",  # user level
			"signature",  # personal signature
			"is_member",  # whether the user is a premium member
			"fans_icon",  # whether the fan badge is enabled
			"follower",  # users following this user
			"followee",  # users this user follows
			"playback_volume",  # total play count
			"reading_volume",  # total read count
			"contribution",  # number of submissions
			"timestamp",  # time the data was scraped
		]
		self.commentField = [  # columns of the comment CSV
			"av",  # av number
			"id",  # commenter's user ID
			"name",  # commenter nickname
			"level",  # commenter level
			"text",  # comment text
			"like",  # like count
			"reply",  # reply count
			"date",  # comment date
			"timestamp",  # time the data was scraped
		]

		# Initialisation
		self.session.headers = self.headers.copy()
		self.options.add_argument("--headless")  # used when a headless browser is requested

		def prepare_storage(path,prefix,fields,label):
			# Create the dated folder (with its parent) and a CSV that holds only the header row.
			if not os.path.exists(path):
				print("正在新建文件夹以存储{}数据{}...".format(label,self.date))
				os.makedirs(path,exist_ok=True)
			csvFile = "{}\\{}{}.csv".format(path,prefix,self.date)
			# Checked separately from the folder so that a folder left without its CSV
			# still gets a header before rows are appended to it.
			if not os.path.exists(csvFile):
				# UTF-8 to match the encoding used when rows are appended later.
				with open(csvFile,"w",encoding="UTF-8") as f: f.write(",".join(fields)+"\n")

		tempPath = "{}\\{}".format(self.workspace,self.tempFolder)
		if not os.path.exists(tempPath):
			print("正在新建文件夹以存储临时文件...")
			os.makedirs(tempPath,exist_ok=True)

		prepare_storage(self.videoPath,"video",self.videoField,"视频")
		prepare_storage(self.userPath,"user",self.userField,"用户")
		prepare_storage(self.commentPath,"comment",self.commentField,"评论")

	def login_20190408(self,):  # User login (updated 2019-04-08; re-checked on 2019-07-12 and found to no longer work)
		"""Log in through the login page by solving the slide captcha with a Firefox driver.

		Each round of the outer loop opens a new Firefox window, types the stored
		credentials, downloads and reassembles the two captcha images, estimates
		the gap position, drags the slider along a generated track and clicks the
		login button. The loop only ends when a round is judged successful.

		Returns:
			The Firefox driver of the round in which the fifth character of the
			page title was not "弹" after clicking the login button.
		"""

		def download_verifying_picture(divs,name):  # Download a slide-captcha image
			"""Save the image referenced in the first slice's style attribute as Temp/<name>.webp."""
			style = divs[0].attrs["style"]
			index1 = style.find("(")
			index2 = style.find(")")
			# NOTE(review): eval() executes text taken from the remote page (the part of
			# the style attribute between the parentheses). It presumably only strips the
			# quotes around the URL; consider replacing it with plain string handling.
			url = eval(style[index1+1:index2])
			html = self.session.get(url).content
			with open("{}\\{}\\{}.webp".format(self.workspace,self.tempFolder,name),"wb") as f: f.write(html)

		def recover_picture(divs,name):  # Reassemble a downloaded image (assumes the slices form two rows)
			"""Rebuild Temp/<name>.webp from its shuffled slices and save it as Temp/test<name>.webp."""
			index = []
			for div in divs:  # walk every slice (52 slices according to the original author)
				style = div.attrs["style"]
				index1 = style.find("background-position")  # locate the slice's background offset
				# Keep the two offsets as unsigned integers (the "px" unit and "-" sign are dropped).
				temp = style[index1+21:-1].strip().replace("px","").replace("-","").split()
				temp = [int(i) for i in temp]
				index.append(temp)
			image = Image.open("{}\\{}\\{}.webp".format(self.workspace,self.tempFolder,name))
			image = numpy.asarray(image)  # image -> array
			imageRe = numpy.zeros(image.shape)  # canvas for the restored image
			total = len(index)  # number of slices
			Xaxis,Yaxis,Zaxis = image.shape  # array dimensions (116x312x3 according to the original author)
			X = int(2*Yaxis/total)  # column width of one slice (12px per the original author)
			Y = int(Xaxis/2)  # row height of one slice (58px per the original author)
			# Turn pixel offsets into (column number, row number) of the slice inside the shuffled image.
			index = [[int((indice[0]-1)/X),int(indice[1]>0)] for indice in index]
			for i in range(total):  # copy every slice to its restored position
				x1 = index[i][0]*X  # left edge of the slice in the shuffled image
				x2 = x1+X  # right edge of the slice in the shuffled image
				y1 = index[i][1]*Y  # top edge of the slice in the shuffled image
				y2 = y1+Y  # bottom edge of the slice in the shuffled image
				a = int(Y)  # row boundary between the two rows of the restored image
				b1 = int((i%(total/2))*X)  # left edge of the slice in the restored image
				b2 = int((i%(total/2))*X+X)  # right edge of the slice in the restored image
				# Decide which row the slice belongs to: the first half of the slices is
				# treated as the first row and the second half as the second row.
				""" 判断当前切片是第几行(目前按照默认是前26个为第一行切片,后26个为第二行切片来做的) """
				if i<total/2: imageRe[:a,b1:b2,:] = image[y1:y2,x1:x2,:]  # first row
				else: imageRe[a:,b1:b2,:] = image[y1:y2,x1:x2,:]  # second row
			imageRe = Image.fromarray(imageRe.astype("uint8"))  # cast to uint8 before converting back to an image
			imageRe.save("{}\\{}\\test{}.webp".format(self.workspace,self.tempFolder,name))

		def find_block_space(width=53,zoo=1.15,plot=True):  # Locate the gap (width: gap width in pixel columns; zoo: scale factor, 1.15 passes most of the time per the author, who suspects the true ratio is 1.2 and that the constant would then need changing)
			"""
			Author's notes on the method, which is deliberately simple:
			opencv was considered, but because both restored images are available
			it is enough to compare the two images column by column;
			the columns with the worst similarity mark the gap.
			The author also observed that the gap is 59 pixels high and 53 wide,
			so taking the 53 least similar of the 312 columns should be enough.

			Returns min(index[:10])/zoo-10, where index lists the column numbers of
			the `width` highest-error columns in descending order of error.
			"""
			image1 = numpy.asarray(Image.open("{}\\{}\\test1.webp".format(self.workspace,self.tempFolder)))
			image2 = numpy.asarray(Image.open("{}\\{}\\test2.webp".format(self.workspace,self.tempFolder)))
			Xaxis,Yaxis,Zaxis = image1.shape  # array dimensions (116x312x3 according to the original author)
			errors = []  # dissimilarity of every pixel column
			for i in range(Yaxis):
				total = 0
				for j in range(Zaxis):
					# int64 copies keep the dot product from overflowing the image dtype.
					X = numpy.array([image1[:,i,j]]).astype("int64")
					Y = numpy.array([image2[:,i,j]]).astype("int64").T
					normX = numpy.linalg.norm(X,2)
					normY = numpy.linalg.norm(Y,2)
					dotXY = numpy.dot(X,Y)[0,0]
					error = 1.- (dotXY/normX/normY)  # accumulate (1 - cosine similarity) over the channels
					total += error
				errors.append(total)
			tempErrors = errors[:]
			tempErrors.sort(reverse=True)
			# Column numbers of the `width` largest errors (the author expects them to be
			# roughly consecutive). errors.index returns the first match, so equal error
			# values map to the same column.
			index = [errors.index(i) for i in tempErrors[:width]]
			if plot:
				plt.plot([i for i in range(len(errors))],errors)
				plt.savefig("{}\\{}\\error.jpg".format(self.workspace,self.tempFolder))
			return min(index[:10])/zoo-10

		def get_track(xoffset):  # Build one drag path
			"""Split xoffset into a list of horizontal steps: half at once, random 5-9 steps, then single pixels."""
			tracks = []
			x = int(xoffset/2)  # move half the distance first (essential per the author: the captcha fails otherwise)
			tracks.append(x)
			xoffset -= x
			while xoffset>=10:
				x = random.randint(5,9)
				tracks.append(x)
				xoffset -= x
			for i in range(int(xoffset)): tracks.append(1)  # finish slowly, one pixel at a time
			return tracks

		while True:
			browser = webdriver.Firefox()  # start Firefox
			browser.get(self.loginURL)  # open the login page
			interval = 1.  # initial page-load wait; the slider knob cannot be found before the page has loaded (the author also notes the site sometimes shows a four-letter image captcha instead of the slider)
			while True:  # retry until the slider knob is found
				browser.find_element_by_id("login-username").send_keys(self.username)
				browser.find_element_by_id("login-passwd").send_keys(self.password)
				xpath = "//div[@class='gt_slider_knob gt_show']"  # xpath of the knob at the left end of the slider
				try:
					time.sleep(interval)  # wait for the page to load
					div = browser.find_element_by_xpath(xpath)
					break
				# NOTE(review): bare except also catches KeyboardInterrupt.
				except:
					browser.refresh()
					interval += .5  # wait half a second longer after every failure
					print("页面加载失败！页面加载时间更新为{}".format(interval))

			ActionChains(browser).click_and_hold(on_element=div).perform()
			html = browser.page_source  # per the author, the source now contains both the complete image and the one with the gap
			soup = BeautifulSoup(html,"lxml")  # parse the page source
			div1s = soup.find_all("div",class_="gt_cut_fullbg_slice")  # slices of the image without the gap
			div2s = soup.find_all("div",class_="gt_cut_bg_slice")  # slices of the image with the gap
			div3 = soup.find("div",class_="gt_slice gt_show gt_moving")  # the movable piece itself (not used afterwards)
			download_verifying_picture(div1s,1)  # download the image without the gap
			download_verifying_picture(div2s,2)  # download the image with the gap
			recover_picture(div1s,1)  # restore the image without the gap
			recover_picture(div2s,2)  # restore the image with the gap
			xoffset = find_block_space()  # horizontal position of the gap
			tracks = get_track(xoffset)
			total = 0  # running sum of the steps (not used afterwards)
			for track in tracks:
				print(track)
				total += track
				ActionChains(browser).move_by_offset(xoffset=track,yoffset=random.randint(-5,5)).perform()
				time.sleep(random.randint(50,100)/100)
			# Move 5px past the target and back before releasing the knob.
			ActionChains(browser).move_by_offset(xoffset=5,yoffset=random.randint(-5,5)).perform()
			ActionChains(browser).move_by_offset(xoffset=-5,yoffset=random.randint(-5,5)).perform()
			time.sleep(0.5)
			ActionChains(browser).release(on_element=div).perform()
			time.sleep(3)
			xpath = "//a[@class='btn btn-login']"  # xpath of the login button
			browser.find_element_by_xpath(xpath).click()  # click the login button
			# NOTE(review): the source is captured before the one-second sleep below.
			html = browser.page_source
			time.sleep(1.)
			soup = BeautifulSoup(html,"lxml")
			title = soup.find("title")
			# The round counts as failed when the fifth character of the page title is "弹".
			if str(title.string[4])=="弹":
				print("登录失败！准备重新登录！")
				browser.quit()
			else:
				print("登录成功！")
				return browser

	def login_20190712(self,):											 # 用户登录(20190712更新)
		"""
		目前登录方式相对于20190408的区别在于一下几点:
		 - 次序上是先输入用户名密码, 点击登录后才会出现验证码图片;
		 - 验证码图片的元素结构变化, 没有小切片, 并且无法获取原图链接, 这大大增加了复原的难度(而且我还找不到);
		 - 滑动按钮并未改变, 因此看起来是极验自身升级了, 因为近期无登录需求, 不打算攻破这种验证码, 认为在识别上有一定难度;
		"""
		pass

	def parse_video(self,av,driver,
		isVedioString="player-wrap",
		maxpage=50,
	):
		"""Scrape one video page: append its data to the video CSV and its comments to the comment CSV.

		Args:
			av: av number of the video, substituted into self.videoURL.
			driver: selenium WebDriver used to load, scroll and click the page.
			isVedioString: Marker that must occur in the page source for the video
				to be treated as available.
			maxpage: Maximum number of comment pages to read (after clicking the
				hot-sort tab).

		Returns:
			False when the marker is missing from the page source (nothing is
			written), True otherwise.
		"""
		def append_row(path,row):
			# csv.writer quotes fields that contain commas, quotes or line breaks, so a
			# title or comment with a comma no longer shifts the columns. newline="" plus
			# os.linesep keeps the row terminator the file had before.
			with open(path,"a",encoding="UTF-8",newline="") as f:
				csv.writer(f,lineterminator=os.linesep).writerow(row)

		driver.get(self.videoURL.format(av))  # open the video page
		html = driver.page_source  # source right after loading
		if isVedioString not in html: return False  # marker missing: treat the video as unavailable

		# The comment area only loads once it is scrolled into view. The author found
		# that WebDriverWait made the page jump back to the top, hence the manual loop.
		while True:
			driver.execute_script("window.scrollBy(0,500)")  # scroll down towards the comments
			try: driver.find_element_by_xpath('//div[@class="baffle"]')
			except Exception: continue  # not there yet, keep scrolling (Ctrl-C still interrupts)
			break
		html = driver.page_source  # full source including the comment area
		timestamp = int(time.time())  # scrape time of the video row
		soup = BeautifulSoup(html,"lxml")

		# Video fields:
		#  - title: <span class="tit">;
		#  - up / follower: located through fixed element paths;
		#  - playback_volume / barrage / like: taken from the title attribute with its
		#    leading label removed (4, 7 and 3 characters respectively);
		#  - coin / collect: text of the matching <span> with tags and whitespace removed;
		#  - comment / comment_page: looked up inside <div class="common">;
		#  - category: may be missing (early videos, per the author);
		#  - tags: joined with "|", may be empty.
		title = str(soup.find("span",class_="tit").string)
		up = str(soup.find("div",class_="u-info").find("div",class_="name").find("a").string)
		follower = str(soup.find("i",class_="van-icon-general_addto_s").find_next_sibling().string)
		playback_volume = soup.find("span",class_="view").attrs["title"][4:]
		barrage = soup.find("span",class_="dm").attrs["title"][7:]

		ops = soup.find("div",class_="ops")
		like = ops.find("span",class_="like").attrs["title"][3:]
		coin = self.labelCompiler.sub("",str(ops.find("span",class_="coin"))).replace("\n","").replace(" ","")
		collect = self.labelCompiler.sub("",str(ops.find("span",class_="collect"))).replace("\n","").replace(" ","")

		common = soup.find("div",class_="common")
		comment = common.find("span",class_="b-head-t results").string
		comment = 0 if comment is None else int(comment)
		# The page counter text is stripped of spaces and of its first and last character.
		comment_page = 0 if comment==0 else int(str(common.find("span",class_="result").string).replace(" ","")[1:-1])

		try: category = str(soup.find("span",class_="a-crumbs").find("a").string)
		except Exception: category = str()  # no category on the page

		tags = "|".join(str(tag.find("a").string) for tag in soup.find("div",id="v_tag").find_all("li",class_="tag"))

		append_row(
			"{}\\video{}.csv".format(self.videoPath,self.date),
			[av,title,up,follower,playback_volume,barrage,like,coin,collect,comment,comment_page,category,tags,timestamp],
		)

		if comment==0: return True  # no comments to collect

		# Comments, after clicking the hot-sort tab.
		driver.find_element_by_xpath("//li[@class='hot-sort  on']").click()
		commentFile = "{}\\comment{}.csv".format(self.commentPath,self.date)
		page = 0  # current comment page
		while page<maxpage:
			timestamp = int(time.time())
			page += 1
			string = " - 正在获取第{}页的评论信息...".format(page)
			print(string)
			with open("{}\\{}".format(self.tempFolder,self.log),"a") as f: f.write("{}\t{}\n".format(string,time.strftime("%Y-%m-%d %H:%M:%S")))
			soup = BeautifulSoup(driver.page_source,"lxml")  # re-parse the current comment page
			div = soup.find("div",class_="comment-list")  # comment list container
			if div is None: break  # comment list not present
			if div.find("div",class_="no-more-reply") is not None: break  # last page can be empty (av32, per the author)
			# Direct child tags only: text nodes between the comment blocks are skipped.
			for child in div.find_all(True,recursive=False):
				user = child.find("a",class_="name")
				uid = user.attrs["data-usercard-mid"]
				name = str(user.string)
				level = child.find("i").attrs["class"][1]
				# Tags are stripped with the regex because comments may contain <a> tags
				# (the author mentions @-mentions); line breaks become "|".
				text = self.labelCompiler.sub("",str(child.find("p",class_="text"))).replace("\n","|")
				likes = child.find("span",class_="like").string
				likes = 0 if likes is None else int(likes)  # empty when the comment has no likes
				reply = len(list(child.find("div",class_="reply-box").children))
				date = str(child.find("span",class_="time").string)
				append_row(commentFile,[av,uid,name,level,text,likes,reply,date,timestamp])
			try: driver.find_element_by_xpath("//a[@class='next']").click()
			except Exception: break  # no next-page button left
		return True

	def parse_user(self,uid,driver,
		xpath_flag="//div[@id='app']"
	):
		"""Scrape one user space page and append a row to the user CSV.

		Args:
			uid: User ID, substituted into self.userURL.
			driver: selenium WebDriver used to load the page.
			xpath_flag: xpath of the element that must be displayed before the page
				source is read. Per the author the element's class differs with the
				login/fan state, so only its id is matched.

		Side effects:
			Writes the raw page source to log_<uid>.html in the current directory,
			appends one row to the dated user CSV, and appends a line to the log
			file when the traffic counters cannot be interpreted.
		"""
		driver.get(self.userURL.format(uid))
		WebDriverWait(driver,15).until(lambda driver: driver.find_element_by_xpath(xpath_flag).is_displayed())
		html = driver.page_source
		timestamp = int(time.time())
		soup = BeautifulSoup(html,"lxml")

		# Raw page dump, kept from the original implementation.
		with open("log_{}.html".format(uid),"w",encoding="UTF-8") as f: f.write(html)

		basic = soup.find("div",class_="h-basic")  # user info area
		name = str(basic.find("span",id="h-name").string)
		# The gender is the third class of the gender tag; users without a gender have only two classes.
		gender = basic.find("span",id="h-gender").attrs["class"]
		gender = gender[2] if len(gender)==3 else str()
		level = basic.find("a",class_="h-level m-level").attrs["lvl"][0]
		signature = str(basic.find("div",class_="h-basic-spacing").find("h4",class_="h-sign").string)
		signature = str() if signature=="None" else signature.strip()  # empty signature -> empty string
		is_member = basic.find("a",class_="h-vipType").string is not None  # tag text is empty for non-members
		fans_icon = basic.find("span",class_="h-fans-icon") is not None  # tag is absent without a fan badge

		stats = soup.find("div",class_="n-statistics")  # statistics area
		follower = stats.find("a",class_="n-data n-fs").attrs["title"]
		followee = stats.find("a",class_="n-data n-gz").attrs["title"]
		# Traffic counters: play count and/or read count, each optional.
		volumes = stats.find_all("a",class_="n-data n-bf")
		logPath = "{}\\{}".format(self.tempFolder,self.log)
		# Default both counters so that the row can still be written after a logged error.
		playback_volume = reading_volume = 0
		if len(volumes)==1:  # only one of the two counters is shown
			label = str(volumes[0].find("p",class_="n-data-k").string)
			if label=="播放数":
				playback_volume = volumes[0].attrs["title"].replace(",","")
			elif label=="阅读数":
				reading_volume = volumes[0].attrs["title"].replace(",","")
			else:  # unknown counter: record it in the log
				with open(logPath,"a") as f: f.write("Error1: 无法确定流量类别！UID{}\t{}\n".format(uid,time.strftime("%Y-%m-%d %H:%M:%S")))
		elif len(volumes)==2:  # both counters; the title attribute may contain "," separators
			playback_volume = volumes[0].attrs["title"].replace(",","")
			reading_volume = volumes[1].attrs["title"].replace(",","")
		elif len(volumes)>2:  # unexpected layout: record it in the log
			with open(logPath,"a") as f: f.write("Error2: 流量数量超过2！UID{}\t{}\n".format(uid,time.strftime("%Y-%m-%d %H:%M:%S")))

		# NOTE(review): takes the first <span class="n-num"> on the whole page — confirm
		# that this is the submission counter.
		contribution = int(soup.find("span",class_="n-num").string)

		# csv.writer quotes fields that contain commas, quotes or line breaks (signatures
		# often do); newline="" plus os.linesep keeps the previous row terminator.
		with open("{}\\user{}.csv".format(self.userPath,self.date),"a",encoding="UTF-8",newline="") as f:
			csv.writer(f,lineterminator=os.linesep).writerow(
				[uid,name,gender,level,signature,is_member,fans_icon,follower,followee,playback_volume,reading_volume,contribution,timestamp]
			)

	def parse(self,
		headless=False,
	):
		"""Crawl videos one av number after another, starting at av1, until interrupted.

		Args:
			headless: When true, Firefox is started with self.options (which carries
				the --headless argument); otherwise a normal window is opened.

		The loop has no end condition; the browser is closed when the loop is left
		through an exception or a keyboard interrupt.
		"""
		av = 0  # current av number
		driver = webdriver.Firefox(options=self.options) if headless else webdriver.Firefox()
		try:
			driver.implicitly_wait(10)  # element lookup timeout
			while True:
				av += 1
				string = "正在获取av{}的信息...".format(av)
				print(string)
				with open("{}\\{}".format(self.tempFolder,self.log),"a") as f: f.write("{}\t{}\n".format(string,time.strftime("%Y-%m-%d %H:%M:%S")))
				self.parse_video(av,driver)
		finally:
			# Previously placed after the endless loop and therefore never reached.
			driver.quit()

	def test(self,):  # Test code
		"""Run parse_user on three sample user IDs in a visible Firefox window, then close it."""
		driver = webdriver.Firefox()
		try:
			driver.implicitly_wait(10)
			for uid in (777536,1532165,281317955):
				self.parse_user(uid,driver)
		finally:
			driver.quit()  # close the browser even when a page fails to parse

if __name__ == "__main__":
	# Script entry point: runs the user-page test. Call parse() on the instance
	# instead to start the crawl over av numbers.
	crawler = BiliBili()
	crawler.test()
先写这么多吧，之后再写基于用户的爬虫，可能会更新吧。测试下来速度尚可，24小时应该可以遍历掉5000~10000个视频的样子，目前某B的评论数据给出了基于时间排序与基于热度排序，这里已经切换为基于热度排序，且至多获取50页的评论（即每个视频最多获取1000条按热度排序的评论，全获取有点慢）。
囚生CY
发布了40 篇原创文章 · 获赞 133 · 访问量 44万+
私信关注
【项目小结】某B视频网站的爬虫实践

猜你喜欢