Practical case notes on crawling data from various websites

p43 Asynchronously crawling Pear Video

"""
@Author:Acoit
@File:线程数爬虫应用.py
@Time:2022/12/4 18:30
"""
import requests
import os
import random
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
headers = {
        "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36 Edg/96.0.1054.29"
}

# Principle: the thread pool should handle the blocking, time-consuming operations
# Send a request to the URL below and parse out each video's detail-page URL and title
# Request the page data
url = "https://www.pearvideo.com/category_59"
response = requests.get(url=url, headers=headers)
page_text = response.text

# Parse the page data with XPath
tree = etree.HTML(page_text)
li_list = tree.xpath("//ul[@id='listvideoListUl']/li")
videos = []  # stores each video's title and download link
for li in li_list:
    # Parse each video's detail-page link and title
    # Note: the video link is incomplete and needs to be concatenated
    detail_url = "https://www.pearvideo.com/" + li.xpath("./div/a/@href")[0]
    # Take the title text and append the ".mp4" extension
    detail_title = li.xpath("./div/a/div[2]/text()")[0] + '.mp4'

    # Send a request to the detail-page URL and get the response data
    detail_page_text = requests.get(url=detail_url, headers=headers).text

    # The video data here is loaded dynamically -- be sure to verify this against the response
    # Take the ajax URL and drop the two "&"-joined parameters after the question mark (they go into params instead)
    ajax_url = "https://www.pearvideo.com/videoStatus.jsp?"
    # From the "video_1746440"-style href parsed earlier, split out the 1746440 part to use as contId in params
    cont_Id = li.xpath("./div/a/@href")[0].split("_")[-1]
    # Build the GET parameters: "mrd" is a random number between 0 and 1 from random.random(), converted to a string
    params = {"contId": cont_Id, "mrd": str(random.random())}
    # The request headers must include a Referer; like contId in params, it is dynamic and needs to be concatenated
    # After adding 'Referer': 'https://www.pearvideo.com/video_1746440', the "article has been taken down" message no longer appears
    ajax_headers = {
        "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36 Edg/96.0.1054.29",
        "Referer":
            "https://pearvideos.com/video_" + cont_Id
        }
    # Send a request to the ajax URL and get the response; note that the response is JSON, i.e. a dictionary!
    ajax_data = requests.get(url=ajax_url, params=params, headers=ajax_headers).json()

    # Checking DevTools -> Network -> XHR -> Response shows the result is a three-level dictionary;
    # the video address srcUrl has to be peeled out layer by layer
    video_url = ajax_data["videoInfo"]['videos']["srcUrl"]
    # print(video_url)  # used to verify the URL can be scraped -- it works!
    # However, the parsed video address is an obfuscated fake address: the 13-digit number in the middle
    # has to be replaced with cont-<contId> to get the real address
    # Fake address: https://video.pearvideo.com/mp4/third/20211123/1638172217395-12719568-193109-hd.mp4
    # Real address: https://video.pearvideo.com/mp4/third/20211123/cont-1746440-12719568-193109-hd.mp4
    # Start replacing the characters below (I couldn't get a regex to match, so string splitting it is)

    # First split the fake address on "/" into a list:
    # ['https:', '', 'video.pearvideo.com', 'mp4', 'third', '20211123', '1638180827144-12719568-193109-hd.mp4']
    list1 = video_url.split("/")
    # Take the last string in the list, '1638180827144-12719568-193109-hd.mp4', and split it on "-":
    # ['1638180827144', '12719568', '193109', 'hd.mp4']
    list2 = list1[-1].split("-")
    # Replace the first string in the list ('1638180827144') with "cont-1746440"
    list2[0] = "cont-" + cont_Id
    # Join the elements of list2 with "-" and use the result to replace the last element of list1
    list1[-1] = "-".join(list2)
    # Join the elements of list1 with "/" and the real address is done
    video_url_valid = "/".join(list1)
    # print(video_url_valid)  # tested, works!
    # Pack the video title and link into a dictionary
    video_info = {"Name": detail_title, "Url": video_url_valid}
    # Store each video's info dictionary in the list
    videos.append(video_info)
# Create the folder for the videos if it does not exist yet
if not os.path.exists("./pearvideos"):
    os.mkdir("./pearvideos")


# Define a function that downloads one video
def download_video(dic):
    # Show progress
    print(f"{dic['Name']} downloading......")
    # Request the video link
    u = dic["Url"]
    video_data = requests.get(url=u, headers=headers).content
    # Persist to disk
    # with open() is a context manager, so no manual close() is needed; binary data must be written in "wb" mode
    with open(f"pearvideos/{dic['Name']}", "wb") as fp:
        fp.write(video_data)
        print(f"{dic['Name']} downloaded successfully!")


# Use a thread pool for the video download requests (the time-consuming blocking operation)
pool = ThreadPool(4)  # create a Pool object with 4 worker threads
pool.map(download_video, videos)  # apply download_video to every dictionary in the videos list
# Stop accepting new tasks
pool.close()
# Block the main thread until all worker threads have finished
pool.join()
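
Equivalently, the thread pool can be used as a context manager (a small sketch, not in the original notes, using the same multiprocessing.dummy API); map() blocks until every download has finished, and the pool is cleaned up automatically when the with-block exits:

# with-statement form of the thread pool above
with ThreadPool(4) as pool:
    pool.map(download_video, videos)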

How did you find out that the address was fake, and how did you work out what had to be changed to get the real one?

  1. The mp4 address in the JSON returned by the request cannot be opened. Open DevTools with F12, inspect the video element on the page, and compare its address with the one from the JSON. The key point is that the page's mp4 can actually be downloaded, so one comparison makes the difference clear.
  2. The address found through DevTools can be opened and downloaded but cannot be obtained from the request, while the fake address comes back from the request but cannot be opened. So compare the two, see where the parameters differ, and then split and re-splice the string to fix it.
# A regular expression
'https://video.pearvideo.com/mp4/adshort/.*?/(.*?)-.*?'

# It can extract the incorrect field directly from the fake address; then swap it out with the replace method.

    ajax_data = str(ajax_data).replace("'", "")  # convert to a string so it can be searched with a regex (requires import re)
    ex = r'srcUrl: (.*?)}}'
    video_url = re.findall(ex, ajax_data)[0]
    ex2 = r'short/.*?/(.*?)-.*?'
    s = re.findall(ex2, video_url)[0]
    video_url = video_url.replace(s, "cont-" + cont_Id)
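
As a more compact alternative (a sketch, assuming video_url and cont_Id hold the values parsed in the script above), re.sub can swap the 13-digit timestamp for the cont- segment in one step:

import re

# Replace the 13-digit millisecond timestamp that sits between the date segment
# and the first "-" with "cont-<contId>", turning the fake address into the real one.
video_url_valid = re.sub(r'/\d{13}-', '/cont-' + cont_Id + '-', video_url)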

 


p25 source code

58.com has added an anti-crawling mechanism, so requests to the URL may get redirected to another page. If you get back an empty list, it is not a problem with the XPath expression; you can confirm this in the debugger.

The workaround is to open the page in a browser, refresh it to complete the verification, and then crawl again.
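
A quick way to confirm the redirect before blaming the XPath (a small sketch, reusing the same URL and User-Agent as the snippet below) is to check the response's final URL and redirect history:

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
resp = requests.get(url='https://anqiu.58.com/ershoufang/', headers=headers)
# If 58.com bounced the request to a verification page, resp.history holds the
# redirect(s) and resp.url differs from the URL that was requested.
print(resp.status_code, resp.url, resp.history)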

import requests
from lxml import etree
if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    url = 'https://anqiu.58.com/ershoufang/'
    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    li_List = tree.xpath('//section[@class = "list"]/div')
    fp = open('58.txt','w',encoding='utf-8')
    for li in li_List:
        title = li.xpath('./a/div[2]//div/h3/text()')[0]
        print(title)
        fp.write(title+'\n')



#If the resulting file ends up empty, add a call to
#fp.close()
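
Alternatively, a small variation on the snippet above (same XPath, headers, and tree) writes the file inside a with block so it is flushed and closed automatically:

# sketch: same parsing as above, but the file is closed automatically
with open('58.txt', 'w', encoding='utf-8') as fp:
    for li in li_List:
        title = li.xpath('./a/div[2]//div/h3/text()')[0]
        fp.write(title + '\n')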

 

import requests
from lxml import etree

url = 'https://gz.58.com/ershoufang/?'  # target URL
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.78'
}  # disguise the request as a browser
page_text = requests.get(url=url, headers=headers).text  # fetch the page
tree = etree.HTML(page_text)  # parse the data, i.e. load the page into the tree variable
li_List = tree.xpath('//*[@id="esfMain"]/section[@class="list-body"]/section[3]/section[1]/section[2]/div')  # locate the listing data on the page
fp = open('58.txt', 'w', encoding='utf-8')  # open/create 58.txt; whatever is written to fp is saved there
for div in li_List:  # iterate over the list
    title = div.xpath('./a/div[2]/div[1]/div[1]/h3/text()')[0]  # pinpoint the title and store it in title. Key point: the path must start with ./ to continue from the node matched above; [0] takes the first match (not 100% sure, go easy on me)
    fp.write(title + '\n')  # write each title to fp; '\n' adds a line break
print('Done')

#If the data comes back empty, refresh the page in a browser -- there may be a verification step

 

import requests
from lxml import html
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
}
url01 = 'https://bj.58.com/ershoufang/a1/'
response = requests.get(url=url01, headers=headers).text
etree = html.etree
tree = etree.HTML(response)
list_page = tree.xpath('//section[@class="list"]/div')
for div in list_page:
    title = div.xpath('./a/div[2]/div[1]/div[1]/h3/text()')
    print(str(title) + "\n")
# r = tree.xpath('//h3/text()')
# for i in r:
#     print(i+"\n")


Origin blog.csdn.net/qq_53011270/article/details/130554829