Python crawler project: download a specified number of pages of images from the Budejie site

Crawl a specified number of pages of images from the Budejie site ( http://www.budejie.com/pic/ )

Prior knowledge of XPath is required!

import os
from urllib.request import urlretrieve

import requests
from lxml import etree

# 函数作用:
# 对页面发起请求并接收响应的内容
def getpage(url, headers):
    """Fetch *url* and return the response body parsed as an lxml HTML tree.

    Args:
        url: Address of the page to request.
        headers: Dict of HTTP headers to send (e.g. a browser User-Agent).

    Returns:
        An lxml element tree that XPath expressions can be run against.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within 10 seconds.
    """
    # Time-box the request so one unresponsive server cannot hang the crawler.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail fast on error responses instead of silently parsing an error page.
    response.raise_for_status()
    # Convert the response text into an XPath-queryable tree and return it.
    return etree.HTML(response.text)

# 函数作用:
# 在响应内容中提取所需数据
def parsepage(url, headers):
    """Scrape one listing page and extract image URLs, titles, and filenames.

    Args:
        url: Address of the listing page to scrape.
        headers: HTTP headers forwarded to the request.

    Returns:
        A tuple ``(src, name, form)`` of parallel lists: the image URLs,
        the image titles, and the filename component of each URL.
    """
    html = getpage(url, headers)
    # BUG FIX: the original iterated over //ul/li but ran a document-global
    # ('//...') XPath inside the loop, so each pass matched the whole page
    # and simply overwrote src/name with identical results -- and left them
    # unbound (NameError) when the list was empty.  One global query yields
    # the same final value and is always defined.
    img_xpath = "//div[@class='j-r-list-c']/div[@class='j-r-list-c-img']/a/img"
    src = html.xpath(img_xpath + "/@data-original")
    name = html.xpath(img_xpath + "/@title")
    # Derive each image's file name (with extension) from the tail of its URL.
    form = [p.split("/")[-1] for p in src]
    return src, name, form


if __name__ == '__main__':
    # Request header: just a browser User-Agent, enough to get past the
    # site's trivial bot filtering.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
    }

    # Make sure the output directory exists before the first download;
    # urlretrieve does not create intermediate directories.
    os.makedirs("./images", exist_ok=True)

    # Download and save the images from the first 10 listing pages.
    for page in range(1, 11):
        url = "http://www.budejie.com/pic/%s" % page
        src, name, form = parsepage(url, headers)
        # BUG FIX: the original paired URLs with file names via nested loops
        # and list.index(), which is O(n^2) and mis-pairs items whenever a
        # value occurs twice (index() always returns the first hit).  zip()
        # walks the three parallel lists in lockstep instead.
        for img_url, title, filename in zip(src, name, form):
            print("正在下载:%s" % title)
            urlretrieve(img_url, "./images/" + filename)

 

Published 50 original articles · received 78 likes · 20,000+ views

Guess you like

Origin blog.csdn.net/maergaiyun/article/details/104475620