千图网_性感美女图片爬取--图片懒加载

#爬取千图网性感美女模块的图片
#第一页:http://sc.chinaz.com/tupian/xingganmeinvtupian.html
#第二页:http://sc.chinaz.com/tupian/xingganmeinvtupian_2.html
#两种url结构不同,注意  可以使用if语句判断
import urllib.request
import urllib.parse
from lxml import etree
import time
import os

#定义下载图片的函数
def down_load_image(image_src):
    """Download one image and save it inside the local ``xinggan`` directory.

    The file name is the last component of the URL's path; a query string or
    fragment on the URL is ignored so the name stays valid on disk.

    Args:
        image_src: Absolute URL of the image to fetch.

    Raises:
        urllib.error.URLError: If the image cannot be fetched.
        OSError: If the directory or the image file cannot be written.
    """
    dirpath = "xinggan"
    # exist_ok avoids the check-then-create race of exists() followed by mkdir().
    os.makedirs(dirpath, exist_ok=True)
    # Use only the URL path so "pic.jpg?v=1" is stored as "pic.jpg".
    filename = os.path.basename(urllib.parse.urlsplit(image_src).path)
    filepath = os.path.join(dirpath, filename)
    # Send a browser-like User-Agent with the request.
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }
    request = urllib.request.Request(url=image_src, headers=headers)
    # Read the whole body first and close the response even on failure, so a
    # broken download neither leaks the connection nor leaves an empty file.
    with urllib.request.urlopen(request) as response:
        data = response.read()

    with open(filepath, "wb") as fp:
        fp.write(data)

#定义解析回传函数,并且下载图片
def parse_content(content):
    """Extract the image URLs from a listing page and download each one.

    Args:
        content: HTML text of one listing page.
    """
    document = etree.HTML(content)
    # The page lazy-loads its images, so the address is read from the "src2"
    # attribute rather than "src".
    for image_url in document.xpath('//div[@id="container"]/div/div/a/img/@src2'):
        down_load_image(image_url)

#定义构造请求的函数
def handle_request(url, page):
    """Build the request object for one listing page.

    Page 1 uses a fixed address without a numeric suffix; every other page is
    produced by substituting ``page`` into the ``url`` template.

    Args:
        url: Template containing one ``%s`` placeholder for the page number.
        page: Page number to request.

    Returns:
        A ``urllib.request.Request`` carrying a browser-like User-Agent header.
    """
    first_page_url = "http://sc.chinaz.com/tupian/xingganmeinvtupian.html"
    target = first_page_url if page == 1 else url % page
    browser_headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }
    return urllib.request.Request(url=target, headers=browser_headers)

#主函数
def main():
    """Prompt for a page range and crawl every listing page in that range.

    Reads the first and last page numbers from standard input, fetches each
    listing page, hands its HTML to ``parse_content`` and pauses two seconds
    between pages.

    Raises:
        ValueError: If either entered page number is not an integer.
        urllib.error.URLError: If a listing page cannot be fetched.
    """
    url = "http://sc.chinaz.com/tupian/xingganmeinvtupian_%s.html"
    start_page = int(input("请输入开始爬取的页码:"))
    end_page = int(input("请输入爬取结束的页码:"))
    for page in range(start_page, end_page + 1):
        request = handle_request(url, page)
        # Close the response as soon as the body is read so connections are
        # not left open across the whole crawl.
        with urllib.request.urlopen(request) as response:
            content = response.read().decode()
        parse_content(content)
        # Pause between pages to avoid hitting the site too quickly.
        time.sleep(2)

# Run the crawl only when this file is executed as a script, then print a
# completion message once main() returns.
if __name__=='__main__':
    main()
    print("图片爬取结束...")

 

 如果学这个不用来做点有意思的事情,那将毫无意义!

猜你喜欢

转载自www.cnblogs.com/Qiuzhiyu/p/12183119.html