Python 爬取网页图片并保存到本地

作为一个 Python 新手,我简单地编写了一小段爬虫程序(基于 Python 2.7),用来获取网页中的图片并下载到本地。如果有更好的优化方法,欢迎分享!本人小白一名。

#!/usr/bin/env python2.7
# -*- coding:utf-8 -*-
import  re
import urllib
import os
url = 'http://www.地址.com/'  # Address of the page to scrape (placeholder; replace with a real URL)
path = 'C:/Users/Administrator/Desktop/image'  # Directory the downloaded images are saved into
def get_content(url):
    """Fetch and return the raw body of the page at ``url``.

    Args:
        url: Address of the page to download.

    Returns:
        Whatever ``read()`` on the opened response yields: a ``str`` on
        Python 2 and ``bytes`` on Python 3.

    Raises:
        Any error raised by ``urlopen`` or ``read`` propagates to the caller.
    """
    # Imported locally so the function works on both Python 2 and Python 3.
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    response = urlopen(url)
    try:
        return response.read()
    finally:
        # Close the connection even when read() fails part-way through.
        response.close()
def get_image_info(info, path, base_url=None):
    """Find ``<img>`` tags in a page and download the images they reference.

    The pattern is written for markup of the following shape; adapt it when
    scraping a site whose HTML looks different::

        <div class="imgppt relative"><img src="/ppt/spic/2019/08/18/hkzohxu4g5k.png" alt=""><i class="down-icon  icon-vip"></i></div>

    Files are saved as ``<path>/<index>.<suffix>`` where ``index`` counts the
    downloaded images from 0. One progress percentage is printed per image.

    Args:
        info: HTML of the page. ``bytes`` (Python 3) are decoded as UTF-8.
        path: Directory to save the images into; created when missing.
        base_url: Optional address of the page. When given, relative ``src``
            values are resolved against it before downloading.

    Returns:
        The list of image URLs that were downloaded, in page order. URLs
        that are still relative (no ``base_url`` supplied) cannot be fetched
        and are left out.

    Raises:
        Errors from ``os.makedirs`` or ``urlretrieve`` propagate unchanged.
    """
    # Imported locally so the function works on both Python 2 and Python 3.
    try:
        from urllib.request import urlretrieve  # Python 3
        from urllib.parse import urljoin, urlparse
    except ImportError:
        from urllib import urlretrieve  # Python 2
        from urlparse import urljoin, urlparse

    # On Python 3 the page usually arrives as bytes; the pattern needs text.
    if isinstance(info, bytes) and not isinstance(info, str):
        info = info.decode('utf-8', 'replace')

    # Create the target directory; tolerate it already existing.
    try:
        os.makedirs(path)
    except OSError:
        if not os.path.isdir(path):
            raise

    # Capture any src ending in a known image extension; alt may be empty.
    pattern = re.compile(r'<img src="([^"]+?\.(?:png|jpe?g|gif))" alt="[^"]*">')
    img_urls = pattern.findall(info)
    if base_url:
        img_urls = [urljoin(base_url, found) for found in img_urls]
    # A URL without a scheme cannot be downloaded, so skip it instead of
    # letting one bad entry abort the whole run.
    img_urls = [found for found in img_urls if urlparse(found).scheme]

    allowed_suffixes = ('jpg', 'jpeg', 'png', 'gif')
    total = len(img_urls)
    for index, img_url in enumerate(img_urls):
        # splitext handles 4-letter extensions such as "jpeg" correctly.
        suffix = os.path.splitext(img_url)[1][1:]
        if suffix not in allowed_suffixes:
            suffix = 'jpg'
        path_address = path + '/' + str(index) + '.' + suffix
        urlretrieve(img_url, path_address)
        print('%.2f%%' % (100.0 * (index + 1) / total))
    return img_urls
# Download the page body.
content = get_content(url)
# Extract the image URLs from the page and save the images under `path`.
img_info = get_image_info(content, path)
# Parenthesised form prints the same on Python 2 and is valid on Python 3.
print(img_info)

二、改进版(按照当前日期建立目录进行保存)

#!/usr/bin/env python2.7
# -*- coding:utf-8 -*-
import  re
import urllib
import os
import time
import  random
url = ''  # Address of the page to scrape (empty placeholder; set it before running)
path = 'C:/Users/Administrator/Desktop/image'  # Base directory the downloaded images are saved under
def get_content(url):
    """Fetch and return the raw body of the page at ``url``.

    Args:
        url: Address of the page to download.

    Returns:
        Whatever ``read()`` on the opened response yields: a ``str`` on
        Python 2 and ``bytes`` on Python 3.

    Raises:
        Any error raised by ``urlopen`` or ``read`` propagates to the caller.
    """
    # Imported locally so the function works on both Python 2 and Python 3.
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    response = urlopen(url)
    try:
        return response.read()
    finally:
        # Close the connection even when read() fails part-way through.
        response.close()
def get_image_info(info, path, base_url=None):
    """Download the images referenced by a page into a per-day directory.

    The pattern is written for markup of the following shape; adapt it when
    scraping a site whose HTML looks different::

        <div class="imgppt relative"><img src="/2019/08/18/hkzohxu4g5k.png" alt=""><i class="down-icon  icon-vip"></i></div>

    Images are stored in ``<path>/<YYYYMMDD>/`` and named
    ``<YYYYMMDDHHMMSS><4 random digits>.<suffix>``. One progress percentage
    is printed per image.

    Args:
        info: HTML of the page. ``bytes`` (Python 3) are decoded as UTF-8.
        path: Base directory; the dated sub-directory is created beneath it.
        base_url: Optional address of the page. When given, relative ``src``
            values are resolved against it before downloading.

    Returns:
        The list of image URLs that were downloaded, in page order. URLs
        that are still relative (no ``base_url`` supplied) cannot be fetched
        and are left out.

    Raises:
        Errors from ``os.makedirs`` or ``urlretrieve`` propagate unchanged.
    """
    # Imported locally so the function works on both Python 2 and Python 3.
    try:
        from urllib.request import urlretrieve  # Python 3
        from urllib.parse import urljoin, urlparse
    except ImportError:
        from urllib import urlretrieve  # Python 2
        from urlparse import urljoin, urlparse

    # On Python 3 the page usually arrives as bytes; the pattern needs text.
    if isinstance(info, bytes) and not isinstance(info, str):
        info = info.decode('utf-8', 'replace')

    # One sub-directory per calendar day, e.g. <path>/20190818.
    day_dir = path + '/' + time.strftime('%Y%m%d', time.localtime())
    try:
        os.makedirs(day_dir)
    except OSError:
        if not os.path.isdir(day_dir):
            raise

    # Capture any src ending in a known image extension; alt may be empty.
    pattern = re.compile(r'<img src="([^"]+?\.(?:png|jpe?g|gif))" alt="[^"]*">')
    img_urls = pattern.findall(info)
    if base_url:
        img_urls = [urljoin(base_url, found) for found in img_urls]
    # A URL without a scheme cannot be downloaded, so skip it instead of
    # letting one bad entry abort the whole run.
    img_urls = [found for found in img_urls if urlparse(found).scheme]

    allowed_suffixes = ('jpg', 'jpeg', 'png', 'gif')
    total = len(img_urls)
    for index, img_url in enumerate(img_urls):
        # splitext handles 4-letter extensions such as "jpeg" correctly.
        suffix = os.path.splitext(img_url)[1][1:]
        if suffix not in allowed_suffixes:
            suffix = 'jpg'
        # %M is minutes (%m would repeat the month). Retry on a name clash
        # so two images saved within the same second never overwrite.
        while True:
            stamp = time.strftime('%Y%m%d%H%M%S', time.localtime())
            path_address = '%s/%s%04d.%s' % (
                day_dir, stamp, random.randint(0, 9999), suffix)
            if not os.path.exists(path_address):
                break
        urlretrieve(img_url, path_address)
        print('%.2f%%' % (100.0 * (index + 1) / total))
    return img_urls
# Download the page body.
content = get_content(url)
# Extract the image URLs from the page and save the images under `path`.
img_info = get_image_info(content, path)
# Parenthesised form prints the same on Python 2 and is valid on Python 3.
print(img_info)
发布了52 篇原创文章 · 获赞 11 · 访问量 1万+

猜你喜欢

转载自blog.csdn.net/qq_40816144/article/details/102570814