如何爬取B站搜索结果

bilibili搜索页结果爬取

from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from urllib.parse import quote
import time
from pyquery import PyQuery as pq
browser = webdriver.Chrome()
# BUG FIX: the original line used `//` as a comment marker, but `//` is the
# floor-division operator in Python — `'...' // 搜什么输什么` raised a
# NameError at runtime. A Python comment starts with `#`.
KeyWord = '新型冠状病毒肺炎'  # search term — change this to whatever you want to search
wait = WebDriverWait(browser, 10)  # shared explicit wait, 10 s timeout
def get_page(page):
	"""Search bilibili for ``KeyWord`` and parse up to ``page`` result pages.

	Opens the search site, types the query and submits it once, then
	alternates between parsing the current page and clicking "next page".
	A ``TimeoutException`` (e.g. the "next" button no longer appears on the
	last page) terminates the crawl.

	:param page: maximum number of result pages to parse.
	"""
	try:
		url = 'https://search.bilibili.com'
		browser.get(url)
		if page >= 0:
			# `search_box` / `next_button` instead of the original `input` /
			# `next`, which shadowed Python builtins.
			search_box = wait.until(
				EC.presence_of_element_located((By.ID, "search-keyword")))
			search_box.clear()
			search_box.send_keys(KeyWord)
			submit = wait.until(EC.presence_of_element_located(
				(By.XPATH, '//*[@id="server-search-app"]/div/div/div[2]/a')))
			submit.click()
		# `page_index` instead of reusing `page`, which shadowed the parameter.
		for page_index in range(page):
			# Locate the "next page" button BEFORE parsing, so that a missing
			# button (last page) raises TimeoutException and ends the loop
			# without parsing a stale page twice.
			next_button = wait.until(EC.presence_of_element_located(
				(By.CSS_SELECTOR,
				 '#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.next > button')))
			print('正在解析第' + str(page_index + 1) + '页')
			parse_page(page_index)
			next_button.click()
	except TimeoutException:
		print('解析完成')

def parse_page(page):
	"""Extract video links and titles from the currently loaded result page.

	Saves the raw HTML to ``./bilibili.html`` (kept for debugging /
	re-parsing), then appends each result's link and title to
	``<KeyWord>的解析结果.txt``.

	:param page: zero-based index of the page being parsed (unused in the
		body; kept for interface compatibility with the caller).
	"""
	html_source = browser.page_source
	# BUG FIX: the original `file.write(html)` line mixed spaces and a tab in
	# its indentation, which is a TabError under Python 3.
	with open('./bilibili.html', 'w', encoding='utf-8') as file:
		file.write(html_source)
	# Parse the in-memory string directly instead of re-reading the file we
	# just wrote — same DOM, one less disk round-trip.
	doc = etree.HTML(html_source)
	links = doc.xpath('//*[@class="video-list clearfix"]//li/a/@href')
	titles = doc.xpath('//*[@class="video-list clearfix"]//li/a/@title')
	# BUG FIX: the original iterated a hard-coded range(20) and raised
	# IndexError whenever a page held fewer than 20 results; zip() stops at
	# the shorter list. Open the output file once instead of once per result.
	with open('./' + KeyWord + '的解析结果.txt', 'a', encoding='utf-8') as f:
		for link, title in zip(links, titles):
			f.write(str({'视频链接': link, '标题': title}))
# Script entry point: crawl up to 50 pages of search results for KeyWord.
get_page(50)
发布了2 篇原创文章 · 获赞 0 · 访问量 322

猜你喜欢

转载自blog.csdn.net/qq_44807796/article/details/104171985