# python的多线程异步爬取梨视频(原创) — multi-threaded Pear Video crawler (original post)

import threading
from lxml import etree
import requests
import time
import os
import re

"""
55个标题
//div[@class="vervideo-bd"]/a/div[2]
图片
//div[@class="vervideo-bd"]/a/div[1]/div[1]/div[@class="img"]/@style
视频
//div[@class="vervideo-bd"]/a/@href
"""
#定义梨视频爬虫类
#Pear Video crawler: fetch category listing pages, extract the detail-page
#links, resolve the real video address from each detail page and download it.
class PearVideo(object):

    #listing-page URLs, categories 1-9
    ulrs = ['https://www.pearvideo.com/category_{}'.format(x) for x in range(1, 10)]
    #container for the worker threads created by run()
    threads = []
    #directory the downloaded videos are written to (same default as before)
    DOWNLOAD_DIR = 'E:/good'

    def get_ctime(self):
        """Return the current time as a human-readable string."""
        return time.ctime()

    def get_time(self):
        """Return the current time as a Unix timestamp (float seconds)."""
        return time.time()

    def __fetch(self, url, type):
        """Fetch *url*, mirror the raw HTML to a local debug file, return the HTML.

        type: 'index' for a listing page, anything else for a detail page;
        it only selects which dump file the page is written to.
        """
        if type == 'index':
            file_name = 'test_pear.html'
        else:
            file_name = 'inner_pear.html'

        r = requests.get(url)
        #301 = redirect, 405 = request actively refused by the server
        print(r.status_code)
        html = r.content.decode('utf-8')
        #dump the page so it can be inspected offline while debugging
        with open('./' + file_name, 'w', encoding='utf-8') as f:
            f.write(html)
        return html

    def __analysis(self, html):
        """Parse one listing page, crawl every detail page and download the videos."""
        html = etree.HTML(html)
        #relative links to the detail pages
        video_url = html.xpath('//div[@class="vervideo-bd"]/a/@href')

        #absolute detail-page URLs
        url_list = ['https://www.pearvideo.com/' + i for i in video_url]

        #the real video address is embedded in the detail page's JS;
        #compile once, outside the loop
        regex = re.compile(r'srcUrl="(.+?)"')
        for a, b in enumerate(url_list):
            #fetch the detail page (also dumped to file by __fetch)
            inner_html = self.__fetch(b, 'inner')
            matches = regex.findall(inner_html)
            if not matches:
                #page layout changed or video removed -- skip instead of
                #crashing on findall()[0] (IndexError in the original)
                print('no srcUrl found in %s' % b)
                continue
            video_src = matches[0]
            print(video_src)
            #download the video body
            content = requests.get(video_src).content
            #index-based file name; "ab" appends so a rerun does not truncate
            #an existing file (avoid writing duplicate resources)
            with open('%s/%d.mp4' % (self.DOWNLOAD_DIR, a), 'ab') as f:
                f.write(content)

    def __callback(self, url):
        """Thread task: fetch one listing page and process it."""
        html = self.__fetch(url, 'index')
        self.__analysis(html)

    def run(self):
        """Create (but do not start) one worker thread per listing page.

        Bug fix: the original passed ``target=self.__callback(url)``, which
        *called* the callback immediately (serially) and handed its ``None``
        return value to Thread; the bound method itself must be the target.
        """
        print(self.ulrs)
        for url in self.ulrs:
            thread = threading.Thread(target=self.__callback, args=(url,))
            self.threads.append(thread)
        print(self.threads)


if __name__ == "__main__":

    #instantiate the crawler and remember the start time
    pear = PearVideo()
    start = pear.get_time()
    #create one thread per listing page (threads are not started yet)
    pear.run()
    #start every worker thread
    for t in pear.threads:
        #daemon threads die with the main thread; attribute assignment
        #replaces the deprecated setDaemon()
        t.daemon = True
        t.start()
        #report when each worker was launched
        print("这个子线程执行到%s" % pear.get_ctime())
    #bug fix: join *every* worker, not just the last one -- otherwise the
    #main thread could exit (killing the daemons) before downloads finish
    for t in pear.threads:
        t.join()
    #elapsed time
    end = pear.get_time()
    print("爬取数据用时%ds" % (end - start))

# 猜你喜欢 (scrape footer)
# 转载自 (reposted from) www.cnblogs.com/justblue/p/10485798.html