A beginner's crawler: scraping a movie site with requests + XPath

import requests
from lxml import etree

# Note: the site is a movie resource list; for some reason it responds very slowly, so it usually takes several attempts (a retry helper is sketched at the bottom)
# The key to extracting content with XPath is 1. finding the right path (in the raw page source)  2. handling the encoding - check via F12 / the page source whether it is gbk or utf-8 (a sketch of auto-detecting it follows get_url_list)

url_base = "https://www.ygdy8.net/html/gndy/dyzz/index.html"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'}  # an arbitrary User-Agent

def get_url_list():
    response = requests.get(url_base, headers=headers)
    html_str = response.content.decode('gbk')
    html = etree.HTML(html_str)
    url_list = html.xpath("//table[@class='tbspan']//a/@href")
    # print(url_list)
    url_list = map(lambda url: 'https://www.ygdy8.net' + url, url_list)  # 1. the site only gives relative paths, so use map to prepend the domain to each one
    return url_list
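
# Regarding point 2 of the comment at the top (gbk vs utf-8): instead of
# hard-coding .decode('gbk'), requests can also guess the encoding from the
# page content. This is only a sketch of my own, not part of the original post;
# fetch_html is a hypothetical helper that could replace the decode lines above and below.
def fetch_html(url):
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding  # usually detects GB2312/GBK for this site
    return response.text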

def parse_detail_page(url):
    movie = {}
    response = requests.get(url, headers=headers)
    html_str = response.content.decode('gbk')
    html = etree.HTML(html_str)
    movie['title'] = html.xpath("//div//h1/font/text()")[0]
    movie['cover'] = html.xpath("//p/img/@src")[0]
    movie['age'] = html.xpath("//p/text()[4]")[0].replace("\u3000","")
    # movie["location"] = html.xpath("//p/text()[5]")[0]
    # This originally raised IndexError: list index out of range; it turned out F12 shows the DOM after JS rendering, so the XPath has to be written against the raw page source

    # Another annoyance: the page pads values with full-width spaces (\u3000), which have to be replaced or the extracted data looks messy

    # If an XPath expression comes back empty, guard against it like this to avoid an IndexError
    # (see the helper sketched after this function):
    # r = html.xpath("//span/p[1]/text()[4]")
    # movie['cover'] = r[0] if r else None
    
    print(movie)
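
# A minimal sketch of my own (not from the original post): first_or_none is a
# hypothetical helper wrapping the "r[0] if r else None" guard mentioned above,
# so every field extraction is safe when an XPath matches nothing.
def first_or_none(html, path):
    result = html.xpath(path)
    return result[0] if result else None  # None instead of an IndexError when nothing matched

# e.g. inside parse_detail_page:
# movie['cover'] = first_or_none(html, "//p/img/@src")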


if __name__ == '__main__':
    url_list = get_url_list()
    for url in url_list:
        parse_detail_page(url)
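
# The site responds slowly and often needs several attempts (see the note at the
# top), so a simple retry wrapper saves re-running the whole script by hand.
# This is only a sketch of my own, not part of the original post; get_with_retry
# is a hypothetical helper that could replace the direct requests.get calls above.
def get_with_retry(url, retries=3, timeout=10):
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()  # treat HTTP errors as failures too
            return response
        except requests.RequestException:
            if attempt == retries - 1:
                raise  # give up after the last attempt

# e.g. html_str = get_with_retry(url_base).content.decode('gbk')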




