超简单的视频网站爬虫--应用requests

import requests
import re#正则表达式模块
from urllib.request import urlretrieve   #专门用来下载的方法
import os
#获取网站源代码
def _find_video_pages(listing_html, base_url='https://www.pearvideo.com/'):
    """Return the absolute play-page URLs linked from a category listing page."""
    # The trailing `[^>]*>` tolerates extra attributes after the class, e.g.
    # <a href="video_1664559" class="vervideo-lilink actplay" target="_blank">,
    # while still matching the bare form with no extra attributes.
    link_pattern = r'<a href="(.*?)" class="vervideo-lilink actplay"[^>]*>'
    return [base_url + href for href in re.findall(link_pattern, listing_html)]


def _parse_video_page(page_html):
    """Return (title, source_url) scraped from a play page, or None if either is missing."""
    sources = re.findall(
        r'sdUrl="",ldUrl="",srcUrl="(.*?)",vdoUrl=srcUrl', page_html)
    titles = re.findall(r'<h1 class="video-tt">(.*?)</h1>', page_html)
    if not sources or not titles:
        return None
    return titles[0], sources[0]


def _safe_filename(title):
    """Replace characters that are not allowed in file names and trim the result."""
    # Path separators would redirect the download into another directory, and
    # the remaining characters are rejected by Windows file systems.
    cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', title).strip(' .')
    return cleaned or 'untitled'


def down_loads(url='https://www.pearvideo.com/category_5', save_dir='lishiping'):
    """Download every video linked from one category listing page.

    The listing page at ``url`` is fetched, each linked play page is fetched in
    turn, and the video source found on that page is saved as
    ``<save_dir>/<title>.mp4``.

    Args:
        url: Category listing page to scrape. Defaults to the page the script
            was originally written for.
        save_dir: Directory that receives the ``.mp4`` files; it is created
            on demand right before the first download.

    Raises:
        requests.RequestException: If a page cannot be fetched (including a
            timeout).
        OSError: If the target directory or file cannot be written, or the
            video download itself fails.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363'}
    # Without a timeout requests waits indefinitely on a stalled connection.
    timeout = 30

    listing_html = requests.get(url, headers=header, timeout=timeout).text

    for play_url in _find_video_pages(listing_html):
        page_html = requests.get(play_url, headers=header, timeout=timeout).text
        parsed = _parse_video_page(page_html)
        if parsed is None:
            # The page layout did not match; skip this video instead of
            # aborting the whole run with an IndexError.
            print('跳过(未解析到标题或视频地址):%s' % play_url)
            continue

        title, source_url = parsed
        print('正在下载视频:%s' % title)

        # exist_ok makes this a cheap no-op after the first iteration.
        os.makedirs(save_dir, exist_ok=True)
        target = os.path.join(save_dir, '%s.mp4' % _safe_filename(title))
        urlretrieve(source_url, target)


# Script entry point: the scraper runs as soon as this file is executed or imported.
down_loads()

目前这个脚本只能爬取分类页第一页的视频，还不支持翻页。

发布了19 篇原创文章 · 获赞 7 · 访问量 874

猜你喜欢

转载自blog.csdn.net/Pyouthon/article/details/105160064