# Scraper for bilibili search result pages.
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from urllib.parse import quote
import time
from pyquery import PyQuery as pq
# Shared Selenium session used by get_page() and parse_page().
# NOTE: this launches a Chrome WebDriver as soon as the module is executed.
browser = webdriver.Chrome()
# Search term typed into the search box; it is also used as the prefix of the
# results file name written by parse_page().
KeyWord = '新型冠状病毒肺炎'
# Explicit-wait helper bound to the shared browser; the second argument is the
# timeout (10 seconds) after which wait.until() raises TimeoutException.
wait = WebDriverWait(browser,10)
def get_page(page):
    """Search bilibili for ``KeyWord`` and parse up to ``page`` result pages.

    Loads the search site in the module-level ``browser``, submits
    ``KeyWord`` through the search box, then hands each result page to
    ``parse_page`` and advances with the pager's "next" button.

    Args:
        page: Maximum number of result pages to parse. With ``0`` the search
            is submitted but nothing is parsed; with a negative value only
            the search site is opened.

    A ``TimeoutException`` raised by any wait (search box, result list or
    "next" button not showing up within the ``wait`` timeout) ends the crawl
    and is reported by printing '解析完成'; it is not propagated.
    """
    try:
        browser.get('https://search.bilibili.com')
        if page < 0:
            return

        search_box = wait.until(
            EC.presence_of_element_located((By.ID, "search-keyword")))
        search_box.clear()
        search_box.send_keys(KeyWord)
        # Wait until the button can actually be clicked, not merely until it
        # exists in the DOM.
        submit = wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="server-search-app"]/div/div/div[2]/a')))
        submit.click()

        # Same result anchors that parse_page() extracts.
        result_locator = (
            By.XPATH, '//*[@class="video-list clearfix"]//li/a')
        next_locator = (
            By.CSS_SELECTOR,
            '#all-list > div.flow-loader > div.page-wrap > div > ul > '
            'li.page-item.next > button')

        for page_index in range(page):
            # Parse first and look for the pager afterwards, so that a page
            # without a "next" button (the last one) is still parsed instead
            # of being dropped by the timeout.
            # NOTE(review): if the site swaps results in place, this wait can
            # be satisfied by the previous page's list right after a click --
            # confirm whether an extra staleness wait is needed.
            wait.until(EC.presence_of_element_located(result_locator))
            print('正在解析第' + str(page_index + 1) + '页')
            parse_page(page_index)

            if page_index == page - 1:
                # Requested number of pages reached; no need to navigate on.
                break
            next_button = wait.until(
                EC.element_to_be_clickable(next_locator))
            next_button.click()
    except TimeoutException:
        print('解析完成')
def parse_page(page):
    """Extract video links and titles from the page shown in ``browser``.

    Saves the browser's current HTML to ``./bilibili.html``, parses that
    file with lxml, and appends one ``str(dict)`` record per result anchor
    to ``./<KeyWord>的解析结果.txt``.

    Args:
        page: Index of the result page being parsed. It is not used by the
            function body; it is kept so existing callers keep working.
    """
    snapshot_path = './bilibili.html'
    with open(snapshot_path, 'w', encoding='utf-8') as file:
        file.write(browser.page_source)
    tree = etree.parse(snapshot_path, etree.HTMLParser(encoding="utf-8"))

    # Read href and title from the same <a> element so a link can never be
    # paired with another video's title, and take however many results the
    # page holds instead of assuming exactly 20.
    anchors = tree.xpath(
        '//*[@class="video-list clearfix"]//li/a[@href and @title]')
    records = [
        str({'视频链接': anchor.get('href'), '标题': anchor.get('title')})
        for anchor in anchors
    ]
    if not records:
        return

    # Open the output once per page rather than once per record. Records are
    # written back-to-back with no separator, matching the existing format.
    with open('./' + KeyWord + '的解析结果.txt', 'a', encoding='utf-8') as f:
        f.writelines(records)
# Crawl up to 50 result pages, then always shut the WebDriver session down so
# the Chrome/chromedriver processes are not left running, even when the crawl
# fails with an unexpected exception.
try:
    get_page(50)
finally:
    browser.quit()