import threading
from lxml import etree
import requests
import time
import os
import re
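"""
XPath notes for a category listing page.

55 titles:
//div[@class="vervideo-bd"]/a/div[2]
Images (read from the style attribute):
//div[@class="vervideo-bd"]/a/div[1]/div[1]/div[@class="img"]/@style
Videos (links to the detail pages):
//div[@class="vervideo-bd"]/a/@href
"""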
# Pear Video (pearvideo.com) crawler class
class PearVideo(object):
    """Crawler that downloads the videos linked from pearvideo.com category pages.

    Typical use: create an instance, call :meth:`run` to build one (unstarted)
    ``threading.Thread`` per category page, then start and join the threads
    found in ``self.threads``.

    Each worker thread fetches one category page, extracts the detail-page
    links, fetches every detail page, pulls the real video address out of the
    ``srcUrl="..."`` assignment in that page and saves the video under
    ``save_dir``.
    """

    # Category listing pages to crawl (category_1 .. category_9). The attribute
    # keeps its original spelling because it is part of the class interface.
    ulrs = ['https://www.pearvideo.com/category_{}'.format(x) for x in range(1, 10)]
    # Class-level fallback kept for backward compatibility; __init__ gives
    # every instance its own list so instances do not share thread objects.
    threads = []
    # Directory the downloaded videos are written to.
    save_dir = "E:/good"
    # Serialises writes of the HTML snapshot files, which all worker threads
    # share by name.
    _snapshot_lock = threading.Lock()

    def __init__(self):
        # Per-instance container for the worker threads created by run().
        self.threads = []

    def get_ctime(self):
        """Return the current local time as a human-readable string."""
        return time.ctime()

    def get_time(self):
        """Return the current time as seconds since the epoch (float)."""
        return time.time()

    def __fetch(self, url, type):
        """Download ``url``, save a snapshot of it to disk and return the HTML.

        Args:
            url: Page address to request.
            type: ``'index'`` for a category listing page (snapshot goes to
                ``test_pear.html``); anything else is treated as a detail page
                (snapshot goes to ``inner_pear.html``). The name shadows the
                builtin but is kept so existing calls stay valid.

        Returns:
            The response body decoded as UTF-8.
        """
        file_name = 'test_pear.html' if type == 'index' else 'inner_pear.html'
        r = requests.get(url)
        # Status is printed for diagnosis (e.g. 301 redirect, 405 refused).
        print(r.status_code)
        html = r.content.decode('utf-8')
        # Every thread writes to the same snapshot file name, so the write is
        # guarded to keep the file from containing interleaved pages.
        with self._snapshot_lock:
            with open('./' + file_name, 'w', encoding='utf-8') as f:
                f.write(html)
        return html

    def __analysis(self, html):
        """Extract the detail-page links from a listing page and download each video.

        Args:
            html: HTML text of a category listing page.

        Files are named after the detail-page link rather than the position in
        the list: positions repeat on every category page, so index-based
        names opened in append mode glued unrelated videos into one file.
        A video whose target file already exists is skipped.
        """
        tree = etree.HTML(html)
        hrefs = tree.xpath('//div[@class="vervideo-bd"]/a/@href')
        # Compiled once; the pattern does not change between iterations.
        src_pattern = re.compile(r'srcUrl="(.+?)"')
        os.makedirs(self.save_dir, exist_ok=True)
        for index, href in enumerate(hrefs):
            page_url = 'https://www.pearvideo.com/' + href
            inner_html = self.__fetch(page_url, 'inner')
            match = src_pattern.search(inner_html)
            if match is None:
                # A page without a video address must not abort the whole
                # category; report it and move on.
                print("no srcUrl found in %s, skipped" % page_url)
                continue
            src_url = match.group(1)
            print(src_url)
            # Reduce the link to a safe file name; fall back to the index if
            # nothing usable is left.
            name = re.sub(r'[^\w.-]', '_', href.strip('/')) or str(index)
            target = os.path.join(self.save_dir, name + '.mp4')
            if os.path.exists(target):
                # Already downloaded: avoid fetching and writing it again.
                continue
            data = requests.get(src_url).content
            with open(target, 'wb') as f:
                f.write(data)

    def __callback(self, url):
        """Thread task: fetch one category listing page and process it."""
        html = self.__fetch(url, 'index')
        self.__analysis(html)

    def run(self):
        """Create one unstarted thread per category URL and store it in ``self.threads``.

        The threads are only created here; the caller starts and joins them.
        """
        print(self.ulrs)
        for url in self.ulrs:
            # Pass the bound method itself; calling it here would run the
            # crawl synchronously and hand Thread a None target.
            thread = threading.Thread(target=self.__callback, args=(url,))
            self.threads.append(thread)
        print(self.threads)
if __name__ == "__main__":
    # Script entry point: build the crawler, run every category thread and
    # report the total elapsed time.
    pear = PearVideo()
    # Start time of the whole crawl.
    start = pear.get_time()
    # Build the worker threads (one per category page).
    pear.run()
    # Start every thread first so they actually run concurrently.
    for t in pear.threads:
        # Daemon threads do not keep the process alive if the main thread
        # exits; the attribute replaces the deprecated setDaemon().
        t.daemon = True
        t.start()
        # Log the time at which each thread was started.
        print("这个子线程执行到%s" % pear.get_ctime())
    # Only now block the main thread until all workers have finished; joining
    # inside the start loop would make each thread wait for the previous one.
    for t in pear.threads:
        t.join()
    # End time of the whole crawl.
    end = pear.get_time()
    print("爬取数据用时%ds" % (end-start))
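# Multithreaded asynchronous scraping of Pear Video with Python (original article)
# You may also like
# Reposted from www.cnblogs.com/justblue/p/10485798.html
# Today's recommendations
# Weekly ranking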