selenium抓取视频

今天闲着没事，用 selenium 抓取视频并保存到本地：只爬取了第一页，并且只保留时长不超过 5 分钟的视频。

为什么不直接用 requests 解析页面？因为有些网站用正则和 XPath 都提取不出想要的内容：要么接口返回的数据是加密的，要么真正的视频 URL 规律难找！（requests 在这里只负责最后下载视频文件。）

selenium几行代码轻轻松松就搞定!

from selenium import webdriver
import requests
# Run Chrome without a visible window (headless mode).
# ``ChromeOptions.set_headless()`` was deprecated and later removed from
# Selenium; passing the ``--headless`` switch works on old and new releases.
opt=webdriver.ChromeOptions()
opt.add_argument("--headless")

class VideoCrawl(object):
    """Download the short videos linked from one video list page.

    The list page is opened with Selenium, every detail-page link is paired
    with the duration label shown next to it, and each video whose duration
    does not exceed ``MAX_DURATION_SECONDS`` is fetched with ``requests`` and
    written to the current working directory as an ``.mp4`` file.
    """

    # Real video URLs collected so far. This is a class attribute, so the
    # list is shared by every instance.
    video_box=[]

    # Longest video, in seconds, that is still downloaded (5 minutes).
    MAX_DURATION_SECONDS = 5 * 60

    # (connect, read) timeouts in seconds for the download request, so a
    # stalled server cannot hang the crawl forever.
    REQUEST_TIMEOUT = (10, 60)

    # Characters that are illegal or unsafe in file names are mapped to "_".
    _FILENAME_TRANSLATION = str.maketrans(
        dict.fromkeys('\\/:*?"<>|\r\n\t', "_")
    )

    def __init__(self, url):
        """Start a Chrome session (using the module-level ``opt``) and open *url*."""
        self.driver = webdriver.Chrome(options=opt)
        self.driver.get(url)

    def __del__(self):
        """Print the crawl summary and shut the browser down."""
        print("爬取结束。。。。。", len(self.video_box), self.video_box)
        # ``driver`` is missing when ``__init__`` failed before assigning it.
        driver = getattr(self, "driver", None)
        if driver is not None:
            # quit() ends the whole session (browser and driver process);
            # close() would only close the current window.
            driver.quit()

    def run(self):
        """Crawl the list page that was opened in ``__init__``."""
        self.get_detail_info()

    @staticmethod
    def _duration_seconds(text):
        """Convert a ``MM:SS`` or ``H:MM:SS`` label to seconds.

        Returns ``None`` when *text* does not have that shape.
        """
        parts = text.strip().split(":")
        if len(parts) not in (2, 3):
            return None
        try:
            numbers = [int(part) for part in parts]
        except ValueError:
            return None
        total = 0
        for number in numbers:
            total = total * 60 + number
        return total

    @classmethod
    def _build_filename(cls, title, t):
        """Return the output file name for a video with this title and duration."""
        safe_title = title.translate(cls._FILENAME_TRANSLATION)
        return "video" + safe_title + "-" + t.replace(":", "") + ".mp4"

    def get_detail_info(self):
        """Collect detail-page URLs from the list page and process the short ones."""
        # Read every href and duration text BEFORE navigating anywhere:
        # parse_video() loads another page in the same browser, after which
        # the elements found here can no longer be queried.
        links = self.driver.find_elements(
            "xpath", '//a[@class="video-wrap statpid"]')
        detail_urls = [link.get_attribute('href') for link in links]
        labels = self.driver.find_elements(
            "xpath", '//span[@class="video-duration"]')
        durations = [label.text for label in labels]

        for detail_url, duration in zip(detail_urls, durations):
            if not detail_url:
                continue
            seconds = self._duration_seconds(duration)
            # Compare the full duration, not just the first field, so that
            # "05:30" and "1:02:03" are correctly treated as longer than 5 min.
            if seconds is None or seconds > self.MAX_DURATION_SECONDS:
                continue
            self.parse_video(detail_url, duration)

    def parse_video(self, url, t):
        """Open detail page *url*, locate the real video URL and download it.

        *t* is the duration label from the list page; it is only used for
        logging and for the output file name.
        """
        self.driver.get(url)
        videos = self.driver.find_elements("xpath", '//video')
        video_url = videos[0].get_attribute('src') if videos else None
        if not video_url:
            print(f"未找到视频地址，跳过: {url}")
            return
        titles = self.driver.find_elements(
            "xpath", '//h1[@class="video-title"]')
        title = titles[0].text if titles else "untitled"
        print('video_url--------', video_url, title, t)
        try:
            self.save_video(video_url, title, t)
        except (requests.RequestException, OSError) as exc:
            # One failed download should not abort the rest of the crawl.
            print(f"下载失败，跳过: {video_url} ({exc})")
            return
        # Record only the URLs that were actually saved.
        VideoCrawl.video_box.append(video_url)

    def save_video(self, url, title, t):
        """Download *url* and write it as an ``.mp4`` in the working directory.

        Raises ``requests.RequestException`` on network errors or a non-2xx
        response, and ``OSError`` when the file cannot be written.
        """
        filename = self._build_filename(title, t)
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        # Without this check an HTTP error page would be saved as a "video".
        response.raise_for_status()
        with open(filename, "wb") as file:
            file.write(response.content)
        print(f"{filename}写入文件完毕")

if __name__ == "__main__":
    # Script entry point: open the category list page and crawl it.
    crawl = VideoCrawl(url="https://v.huya.com/cat/7")
    crawl.run()

运行结果如下:

video_url-------- https://huya-w20.huya.com/1946/233921742/1300/c283cdf9b6f3c58fcd1b5933e4718980.mp4 韩舞女王栗子-性感演绎SOLO 03:11
video韩舞女王栗子-性感演绎SOLO-0311.mp4写入文件完毕
video_url-------- https://huya-w20.huya.com/1931/191367515/1300/e30ed7735c832e539790f58d9eccb10f.mp4 夏茉唱歌《明明说好不哭》 03:59
video夏茉唱歌《明明说好不哭》-0359.mp4写入文件完毕
video_url-------- https://huya-w10.huya.com/1845/85360643/1300/96808aaab77334a8aa955caca12c88dd.mp4 夏茉-靠近一点点 04:11
video夏茉-靠近一点点-0411.mp4写入文件完毕
爬取结束。。。。。 12 ['https://huya-w6.huya.com/1927/177950705/1300/4fafecc2d17871a166ba07152ce72192.mp4', 'https://huya-w20.huya.com/1927/177950705/1300/4fafecc2d17871a166ba07152ce72192.mp4', 'https://huya-w6.huya.com/1947/236141420/1300/d734f834c85e46e73d3356041b373cf2.mp4', 'https://huya-w20.huya.com/1947/236141420/1300/d734f834c85e46e73d3356041b373cf2.mp4', 'https://huya-w10.huya.com/1946/233921742/1300/c283cdf9b6f3c58fcd1b5933e4718980.mp4', 'https://huya-w20.huya.com/1946/233921742/1300/c283cdf9b6f3c58fcd1b5933e4718980.mp4', 'https://huya-w20.huya.com/1931/191367515/1300/e30ed7735c832e539790f58d9eccb10f.mp4', 'https://huya-w20.huya.com/1944/227123154/1300/5cb244accae05ffb5b5af2f5c57a76cd.mp4', 'https://huya-w10.huya.com/1944/227123154/1300/5cb244accae05ffb5b5af2f5c57a76cd.mp4', 'https://huya-w10.huya.com/1947/235567528/1300/fc34f3aa9fba9ad64952f6f97ffbd45c.mp4', 'https://huya-w20.huya.com/1946/234336348/1300/b0e275820946f259f46d0f9ed9f3def0.mp4', 'https://huya-w10.huya.com/1845/85360643/1300/96808aaab77334a8aa955caca12c88dd.mp4']

下载下来的视频都可以正常播放。

大家自己娱乐一下、练练手即可，下载的视频看完请删除，千万不要用于商业用途哦！

猜你喜欢

转载自www.cnblogs.com/pfeiliu/p/11914971.html