爬虫批量下载图片并清理


说明

本文只提供了下载图片并清理的代码块函数,没有添加调用示例。

Python爬虫下载图片

# Collect image URLs from Baidu image search, page by page
def getParsePage(pn, name, save_path):
    '''
    Scrape Baidu image search result pages and download every image found.

    parameter:
        pn : number of result pages to scan (the URL offset steps by 20 per page)
        name : search keyword for the images to download
        save_path : directory where the downloaded images are saved
    '''
    for i in range(int(pn)):
        # Fetch one search-result page
        print('Scaning for {} page···'.format(i+1), end=' ')

        # Baidu image search "flip" endpoint:
        #   word : search keyword
        #   pn   : result offset (20 results per page)
        url = 'https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=%s&pn=%d' % (name, i*20)
        # NOTE(review): `headers` is a module-level constant defined outside this
        # snippet — presumably a User-Agent dict; confirm before reuse.
        response = requests.get(url, headers=headers)
        html = response.content.decode()

        # Extract the original image URLs with a regex; returns a list
        url_list = re.findall('"objURL":"(.*?)",', html)
        print('Done')

        # Save every image referenced by the collected URLs
        downloadImg(url_list=url_list, save_path=save_path, page=i+1)

def downloadImg(url_list, save_path, page):
    '''
    Download every image URL in url_list into save_path.

    parameter:
        url_list : list of image URLs to fetch
        save_path : directory the images are saved to (created if missing)
        page : page number, used as a prefix in the saved file names
    '''
    # Create the target directory; no error if it already exists
    os.makedirs(save_path, exist_ok=True)
    tem = 1  # per-page running index for file names
    print('Page:{} Downloading {} images'.format(page, len(url_list)))
    for url in tqdm(url_list, desc='Downloading ', ncols=80, unit='img'):
        try:
            img = requests.get(url=url, timeout=10, headers=headers)
            time.sleep(1)  # throttle requests so the server does not block us
            img_name = save_path + '/{}-{}.jpg'.format(page, tem)
            with open(img_name, 'wb') as f:
                f.write(img.content)
        except (requests.RequestException, OSError):
            # Best effort: skip URLs that time out, fail, or cannot be written
            pass
        tem += 1
    print('Page:{} Done\n'.format(page))

清理无效图片

def imgCleaning(save_path, minsize):
    '''
    Delete files in save_path that are smaller than minsize bytes
    (treated as invalid / broken downloads).

    parameter:
        save_path : directory containing the downloaded images
        minsize : files strictly smaller than this size (bytes) are deleted
    '''
    print('Delete invalid images\nScaning···', end=' ')
    file_list = [os.path.join(save_path, file) for file in os.listdir(save_path)]
    # Collect the undersized files first, then delete them in a second pass
    invalid_list = [file for file in file_list if os.path.getsize(file) < minsize]
    print('Done\nTotal number of invalid images: {}'.format(len(invalid_list)))
    for inv_file in tqdm(invalid_list, desc='Deleting ', ncols=80, unit='img'):
        try:
            os.remove(inv_file)
            time.sleep(0.1)  # slow down so the progress bar stays readable
        except OSError:
            # Best effort: skip files that vanished or cannot be removed
            pass
    print('Clean {} images Done\n'.format(len(invalid_list)))

清理相似图片

def hashDetect(file_path):
    '''
    Find near-duplicate .jpg images in file_path using perceptual hashes.

    Four hashes are computed per image (phash, ahash, dhash, whash); two
    images count as duplicates when the best normalized similarity across
    the four hashes exceeds 0.85. Images that cannot be opened are also
    marked for deletion.

    parameter:
        file_path : directory containing the images to scan
    returns:
        list of file paths to delete (duplicates + unreadable files)
    '''
    highfreq_factor = 4   # resize factor for the pHash DCT input
    hash_size = 32        # hash side length (each hash has hash_size**2 bits)
    img_scale = 64        # rescale size for the wavelet hash
    file_list = []
    phash_list = []
    ahash_list = []
    dhash_list = []
    whash_list = []
    del_list = []
    for file in tqdm(os.listdir(file_path), desc='Scaning ', ncols=80, unit='img'):
        if os.path.splitext(file)[1] == '.jpg':
            path_file = os.path.join(file_path, file)
            try:
                # Open the image once (and close it) instead of four times
                with Image.open(path_file) as image:
                    phash = imagehash.phash(image, hash_size=hash_size, highfreq_factor=highfreq_factor)  # perceptual hash
                    ahash = imagehash.average_hash(image, hash_size=hash_size)  # average hash
                    dhash = imagehash.dhash(image, hash_size=hash_size)         # gradient (difference) hash
                    whash = imagehash.whash(image, hash_size=hash_size, image_scale=img_scale, mode='db4')  # discrete wavelet hash
                phash_list.append(phash)
                ahash_list.append(ahash)
                dhash_list.append(dhash)
                whash_list.append(whash)
                file_list.append(path_file)
            except Exception:
                # Unreadable / corrupt image: schedule it for deletion
                del_list.append(path_file)

    # Pairwise comparison. Duplicates are popped from the lists as they are
    # found; the inner index only advances when no element was removed, so
    # nothing is skipped (the original for-loop with `j -= 1` skipped the
    # element shifted into slot j after each pop).
    for i in tqdm(range(len(file_list)), desc='Calculating ', ncols=80, unit='img'):
        if i >= len(file_list):
            continue  # the list shrank below i; nothing left to compare
        j = i + 1
        while j < len(file_list):
            # Similarity = 1 - (Hamming distance / total bits)
            phash_value = 1 - (phash_list[i] - phash_list[j]) / len(phash_list[i].hash) ** 2
            ahash_value = 1 - (ahash_list[i] - ahash_list[j]) / len(ahash_list[i].hash) ** 2
            dhash_value = 1 - (dhash_list[i] - dhash_list[j]) / len(dhash_list[i].hash) ** 2
            whash_value = 1 - (whash_list[i] - whash_list[j]) / len(whash_list[i].hash) ** 2
            hash_value = max(phash_value, ahash_value, dhash_value, whash_value)
            if hash_value > 0.85:
                # Image j duplicates image i: mark it and drop its hashes
                del_list.append(file_list.pop(j))
                phash_list.pop(j)
                ahash_list.pop(j)
                dhash_list.pop(j)
                whash_list.pop(j)
                # do not advance j: the next element shifted into slot j
            else:
                j += 1
    time.sleep(1)
    print('Find total number of same images: {}\n'.format(len(del_list)))
    time.sleep(1)
    return del_list

def imgDeepClean(file_path):
    '''
    Delete the near-duplicate images that hashDetect finds in file_path.

    parameter:
        file_path : directory containing the images to clean
    '''
    print('Delete same images')
    clean_file_list = hashDetect(file_path)
    if len(clean_file_list) == 0:
        print('None images')
    else:
        print('Delete {} images'.format(len(clean_file_list)))
        for inv_file in tqdm(clean_file_list, desc='Deleting ', ncols=80, unit='img'):
            try:
                os.remove(inv_file)
                time.sleep(0.2)  # slow down so the progress bar stays readable
            except OSError:
                # Best effort: skip files that cannot be removed
                pass
        print('Delete {} images Done\n'.format(len(clean_file_list)))

猜你喜欢

转载自blog.csdn.net/qq_50838982/article/details/124249386
今日推荐