Python3: after multi-threaded crawling of a Bilibili UP master's video barrages and comments, I'm floating~ ~ ~

Using threads to crawl a Bilibili UP master's barrages and comments

Crawl video barrage information

To crawl a video's barrage data, we again need an interface-capture tool; here we use Charles to grab the URL of the barrage interface.

Barrage url

https://api.bilibili.com/x/v1/dm/list.so?oid={cid}

Video url

https://api.bilibili.com/x/player/pagelist?aid={aid}&jsonp=jsonp
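
Before the full code, here is a minimal standalone sketch (my own addition, not from the original post) of how the two endpoints chain together; the aid is a made-up example:

import requests

aid = 170001  # hypothetical video aid, for illustration only

# step 1: the pagelist endpoint returns one entry (with a cid) per page of the video
pages = requests.get(
    f'https://api.bilibili.com/x/player/pagelist?aid={aid}&jsonp=jsonp',
    timeout=3,
).json()['data']

# step 2: each cid feeds the barrage endpoint, which returns the danmaku as XML
for p in pages:
    r = requests.get(
        f"https://api.bilibili.com/x/v1/dm/list.so?oid={p['cid']}",
        timeout=3,
    )
    r.encoding = 'utf-8'
    print(r.text[:200])  # peek at the first 200 characters of the XML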

As for how to capture these with Charles, I assume everyone already knows~ ~ so I won't demonstrate it here.
Next, the old rules, straight to the code:

get_video_barrage.py

# -*- coding: utf-8 -*-
"""
@ auth : carl_DJ
@ time : 2020-8-24
"""

def get_video_barrage(aid, uid, d):
    # fetch the barrage data of one video page;
    # each entry in the pagelist data carries the page's cid
    cid = d['cid']
    # barrage address, i.e. the interface URL captured with Charles
    barrage_url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'
    # set the timeout to 3 seconds
    r = get_http_session().get(barrage_url, timeout=3)
    # barrage save path; the format is xml
    # uid must be a str, not an int, or it raises an error
    uid_dir_path = os.path.join(dir_path, str(uid))
    if not os.path.exists(uid_dir_path):
        os.makedirs(uid_dir_path)
    # build the barrage file path
    barrage_path = os.path.join(uid_dir_path, f'barrage_{aid}.xml')
    # set the character encoding
    r.encoding = 'utf-8'
    content = r.text
    # save the barrage data
    save_file(barrage_path, content)
    print(f'video id:{aid} barrage save success !')

def get_up_video_info(name, uid, filepath):
    # fetch the UP master's video info
    # read the saved file
    res = read_json(filepath)
    # read the vlist entries from the page data
    vlist = res['data']['list']['vlist']
    # loop over the entries in vlist
    for v in vlist:
        aid = v['aid']
        url = f'https://api.bilibili.com/x/player/pagelist?aid={aid}&jsonp=jsonp'
        player = get_http_session().get(url, timeout=3)
        # parse the response as json
        player = player.json()
        data = player['data']
        # if there is no data, return immediately
        if not data:
            return
        # loop over the entries in data
        for d in data:
            try:
                # call the get_video_barrage method written above
                get_video_barrage(aid, uid, d)
            except Exception:
                # on error, write to the get_up_video_info.log log file
                log(traceback.format_exc(), 'error', 'get_up_video_info.log')
                error_str = f'name: [{name}], uid: [{uid}]'
                log(error_str, 'error', 'get_up_video_info.log')
				

A note here:
this is just the barrage-fetching code on its own! The helpers it calls (get_http_session, save_file, read_json, log and dir_path) are all defined in the integrated script further down.

Crawl video comment information

The code for fetching comment data works the same way as fetching the barrage,
so I'll post it straight away without being long-winded!

get_video_comment.py

# -*- coding: utf-8 -*-
"""
@ auth : carl_DJ
@ time : 2020-8-24
"""

def get_video_comment(uid, aid):
    # fetch the comment data of one video
    comment_url = f'video comment url'  # the exact URL is omitted here, as in the original post
    r = get_http_session().get(comment_url, timeout=3)
    # if the returned status code is 200, proceed
    if r.status_code == 200:
        # again, uid must be a str here
        uid_dir_path = os.path.join(dir_path, str(uid))
        # create the directory if it does not exist
        if not os.path.exists(uid_dir_path):
            os.makedirs(uid_dir_path)
        comment_path = os.path.join(uid_dir_path, f'comment_{aid}.json')
        # json format, indented by 4, without ascii-escaping the content
        content = json.dumps(r.json(), indent=4, ensure_ascii=False)
        # save
        save_file(comment_path, content)
        # print the result
        print(f'video id:{aid} comment save success')
		

The code ends here — nothing difficult so far.
Next, let's use threads to speed it up.

Integrating the code and speeding it up with threads~

As with crawling UP masters' info before: crawling a single video is manageable,
but if you crawl the barrages and comments of 100+ UP masters at once, the workload becomes overwhelming~
And it's also possible that halfway through, Bilibili's anti-crawling blocks you — how embarrassing would that be~
To solve this problem, we have to speed up...
With the time saved, you can read comics and watch short videos — isn't that great!!

Enough said — the old rules, the code:

# -*- coding: utf-8 -*-
"""
@ auth : carl_DJ
@ time : 2020-8-24
"""


import os
import json
import requests
import traceback
import time

from threading import Thread
from concurrent.futures import ThreadPoolExecutor
from queue import Queue

# create a thread pool with 10 workers
executor = ThreadPoolExecutor(10)
queue = Queue()

# root directory for storing the data
dir_path = os.path.join('Bili_UP')

def get_http_session(pool_connections=2, pool_maxsize=10, max_retries=3):
    """
    http connection pool
    :param pool_connections: number of connection pools
    :param pool_maxsize: maximum pool size
    :param max_retries: maximum number of retries
    :return:
    """
    session = requests.session()
    # adapter, configured with the 3 parameters above
    adapter = requests.adapters.HTTPAdapter(pool_connections=pool_connections,
                                            pool_maxsize=pool_maxsize,
                                            max_retries=max_retries)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

def save_file(filepath, content):
    '''
    This helper was already written in "Python3, crawl the information
    of the Bilibili UP master!", so it is not re-derived here; those who
    are unsure can click through. Below is only a minimal stand-in.
    '''
    # save the content to the file
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(content)

def log(content, level, filepath):
    # minimal stand-in: append a line at the given log level
    with open(filepath, 'a', encoding='utf-8') as f:
        f.write(f'[{level}] {content}\n')

def make_dir(name):
    # minimal stand-in: create the UP master's directory and return its path
    path = os.path.join(dir_path, name)
    if not os.path.exists(path):
        os.makedirs(path)
    return path
    
def read_json(filepath):
    """
    :param filepath: file to read
    :return:
    """
    # without encoding='utf-8' here, escaped characters go wrong
    with open(filepath, 'r', encoding='utf-8') as f:
        res = f.read()
        # parse the file content as json
        return json.loads(res)

def get_up_base_info(name, uid):
    # fetch the UP master's basic info
    try:
        url = f'UP master url'  # the exact URL is omitted here, as in the original post
        # set the timeout to 3 seconds
        r = get_http_session().get(url, timeout=3)
        # if the returned status code is 200, proceed
        if r.status_code == 200:
            up_dir = make_dir(name)
            filepath = os.path.join(up_dir, f'{uid}_base_info.json')
            content = json.dumps(r.json(), indent=4, ensure_ascii=False)
            # save
            save_file(filepath, content)
            # print the result
            print(f'{name} UP master info saved successfully')
            # push the info onto the queue
            queue.put((name, uid, filepath))
        else:
            fail_str = f'name: [{name}], uid: [{uid}], url: [{url}]'
            log(fail_str, 'fail', 'base_info_fail.log')
    except Exception:
        log(traceback.format_exc(), 'error', 'base_info_error.log')
        error_str = f'name: [{name}], uid: [{uid}]'
        log(error_str, 'error', 'base_info_error.log')

def get_video_barrage(aid, uid, d):
    # fetch the barrage data of one video page;
    # each entry in the pagelist data carries the page's cid
    cid = d['cid']
    # barrage address, i.e. the interface URL captured with Charles
    barrage_url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'
    # set the timeout to 3 seconds
    r = get_http_session().get(barrage_url, timeout=3)
    # barrage save path; the format is xml
    # uid must be a str, not an int, or it raises an error
    uid_dir_path = os.path.join(dir_path, str(uid))
    if not os.path.exists(uid_dir_path):
        os.makedirs(uid_dir_path)
    # build the barrage file path
    barrage_path = os.path.join(uid_dir_path, f'barrage_{aid}.xml')
    # set the character encoding
    r.encoding = 'utf-8'
    content = r.text
    # save the barrage data
    save_file(barrage_path, content)
    print(f'video id:{aid} barrage save success !')

def get_up_video_info(name, uid, filepath):
    # fetch the UP master's video info
    # read the saved file
    res = read_json(filepath)
    # read the vlist entries from the page data
    vlist = res['data']['list']['vlist']
    # loop over the entries in vlist
    for v in vlist:
        aid = v['aid']
        url = f'https://api.bilibili.com/x/player/pagelist?aid={aid}&jsonp=jsonp'
        player = get_http_session().get(url, timeout=3)
        # parse the response as json
        player = player.json()
        data = player['data']
        # if there is no data, return immediately
        if not data:
            return
        # loop over the entries in data
        for d in data:
            try:
                # call the get_video_barrage method written above
                get_video_barrage(aid, uid, d)
            except Exception:
                # on error, write to the get_up_video_info.log log file
                log(traceback.format_exc(), 'error', 'get_up_video_info.log')
                error_str = f'name: [{name}], uid: [{uid}]'
                log(error_str, 'error', 'get_up_video_info.log')

def get_video_comment(uid, aid):
    # fetch the comment data of one video
    comment_url = f'video comment url'  # the exact URL is omitted here, as in the original post
    r = get_http_session().get(comment_url, timeout=3)
    # if the returned status code is 200, proceed
    if r.status_code == 200:
        # again, uid must be a str here
        uid_dir_path = os.path.join(dir_path, str(uid))
        # create the directory if it does not exist
        if not os.path.exists(uid_dir_path):
            os.makedirs(uid_dir_path)
        comment_path = os.path.join(uid_dir_path, f'comment_{aid}.json')
        # json format, indented by 4, without ascii-escaping the content
        content = json.dumps(r.json(), indent=4, ensure_ascii=False)
        # save
        save_file(comment_path, content)
        # print the result
        print(f'video id:{aid} comment save success')

def base_info_task(power_json):
    # start-up method: kick off fetching the UP masters' basic info
    for d in power_json:
        uid = d['uid']
        name = d['name']
        # run it through the thread pool
        executor.submit(get_up_base_info, name, uid)

def get_video_task():
    # start-up method: kick off fetching the UP masters' videos and barrages
    # cap this pool at 10 workers (local, distinct from the module-level executor)
    with ThreadPoolExecutor(max_workers=10) as video_executor:
        # consume the queue in a loop
        while True:
            name, uid, filepath = queue.get()
            video_executor.submit(get_up_video_info, name, uid, filepath)
            queue.task_done()
            # wait a bit between tasks to avoid anti-crawling
            time.sleep(2)

def main():
    # read the UP masters' basic info
    power_json = read_json('UP_base.json')
    # start the threads
    Thread(target=base_info_task, args=(power_json, )).start()
    Thread(target=get_video_task).start()

if __name__ == '__main__':
    main()
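
A note on input: main() reads UP_base.json, and base_info_task pulls uid and name out of each entry, so the file is presumably a list shaped like the following (inferred from the fields the code reads; the uid and name values are made up):

[
    {"uid": 123456, "name": "some_up_master"},
    {"uid": 654321, "name": "another_up_master"}
]

Design-wise, base_info_task produces (name, uid, filepath) tuples onto the queue while get_video_task consumes them — a simple producer-consumer handoff between the two thread pools.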


Note:
1. Methods such as save_file, make_dir and log are only given as minimal stand-ins here,
→① because I already wrote them in "Python3, crawl the information of the Bilibili UP master!";
→② this article focuses on multi-threaded crawling of an UP master's video barrages and comments, and writing too much would only cause confusion;
2. If you only want the barrage info of one particular UP master, you can extract the relevant code and crawl it separately;
3. Since the content we crawl is public information, Bilibili did not put up too many obstacles during the crawl!
4. If there is a lot of content to crawl, Xiaoyu still suggests the following (a sketch for ① and ② follows this list):
→① set a User-Agent
→② set up an IP proxy pool
→③ find someone else to do the crawling for you
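
For ① and ②, a minimal sketch (my addition, not from the original post) that attaches a browser User-Agent and a placeholder proxy to the session returned by get_http_session() above:

session = get_http_session()
# identify as an ordinary browser; any mainstream UA string will do
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
})
# route requests through a proxy pool; the address below is a placeholder, not a real endpoint
session.proxies.update({
    'http': 'http://127.0.0.1:8888',
    'https': 'http://127.0.0.1:8888',
})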

Otherwise, once you get banned, you can only watch from the crowd, O(∩_∩)O~

At the very end

More crawler friends are welcome to come and exchange ideas,
and to give Xiaoyu more and better suggestions~~
Let's be happy together and crawl together!!

Origin: blog.csdn.net/wuyoudeyuer/article/details/108198224