文章目录
说明
本文只提供了下载图片并清理的代码块函数,没有添加调用示例。
Python爬虫下载图片
# 获取图片url链接
def getParsePage(pn, name, save_path):
    '''
    Scan Baidu image-search result pages and download every image found.

    parameter:
        pn : number of result pages to scan; the `pn` query offset sent to
             Baidu advances by 20 for each page
             (NOTE(review): the number of images actually returned per page
             is decided by the server - confirm)
        name : search keyword for the images to download
        save_path : directory the downloaded images are saved to

    raises:
        requests.RequestException : when a result page cannot be fetched
                                    (including the 10 s timeout)
    '''
    for i in range(int(pn)):
        print('Scaning for {} page···'.format(i+1), end=' ')
        # Let requests build the query string so keywords containing spaces,
        # '&', '#' or non-ASCII characters are percent-encoded correctly.
        params = {
            'tn': 'baiduimage',
            'ie': 'utf-8',
            'word': name,
            'pn': i * 20,
        }
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(
            'https://image.baidu.com/search/flip',
            params=params,
            headers=headers,
            timeout=10,
        )
        # Tolerate stray invalid bytes instead of aborting the whole page.
        html = response.content.decode('utf-8', errors='replace')
        # Every original image URL is embedded in the page as "objURL":"...".
        url_list = re.findall(r'"objURL":"(.*?)",', html)
        print('Done')
        # Save the images referenced by this page to disk.
        downloadImg(url_list=url_list, save_path=save_path, page=i+1)
def downloadImg(url_list, save_path, page):
    '''
    Download every URL in url_list into save_path as '<page>-<index>.jpg'.

    Downloading is best-effort: a URL that cannot be fetched or written is
    skipped and the remaining URLs are still processed. The index in the
    file name is the 1-based position of the URL in url_list, so a skipped
    URL leaves a gap in the numbering.

    parameter:
        url_list : list of image URLs
        save_path : directory the images are saved to (created if missing,
                    including parent directories)
        page : page number, used as the file-name prefix

    raises:
        OSError : when save_path cannot be created
    '''
    # Create the target directory (and parents); an existing one is fine.
    os.makedirs(save_path, exist_ok=True)

    print('Page:{} Downloading {} images'.format(page, len(url_list)))
    failed = 0
    progress = tqdm(url_list, desc='Downloading ', ncols=80, unit='img')
    for tem, url in enumerate(progress, start=1):
        try:
            img = requests.get(url=url, timeout=10, headers=headers)
            # Pause between requests to avoid hammering the image hosts.
            time.sleep(1)
            # Do not store HTTP error pages under a .jpg name.
            img.raise_for_status()
            img_name = os.path.join(save_path, '{}-{}.jpg'.format(page, tem))
            with open(img_name, 'wb') as f:
                f.write(img.content)
        except (requests.RequestException, OSError):
            # Network or file-system failure: skip this image only.
            # KeyboardInterrupt is deliberately not caught so Ctrl-C works.
            failed += 1
    if failed:
        print('Page:{} Failed {} images'.format(page, failed))
    print('Page:{} Done\n'.format(page))
清理无效图片
def imgCleaning(save_path, minsize):
    '''
    Delete every file in save_path that is smaller than minsize bytes.

    Only regular files directly inside save_path are considered;
    sub-directories are ignored. A file that cannot be removed is skipped,
    and the final message reports how many files were actually removed.

    parameter:
        save_path : directory holding the downloaded images
        minsize : size threshold in bytes; smaller files count as invalid
    '''
    print('Delete invalid images\nScaning···',end=' ')
    invalid_list = []
    for file in os.listdir(save_path):
        path_file = os.path.join(save_path, file)
        # Skip directories: they are not images and os.remove cannot
        # delete them anyway.
        if os.path.isfile(path_file) and os.path.getsize(path_file) < minsize:
            invalid_list.append(path_file)
    print('Done\nTotal number of invalid images: {}'.format(len(invalid_list)))

    removed = 0
    for inv_file in tqdm(invalid_list, desc='Deleting ', ncols=80, unit='img'):
        try:
            os.remove(inv_file)
        except OSError:
            # File vanished or is not removable: leave it and keep going.
            continue
        removed += 1
    print('Clean {} images Done\n'.format(removed))
清理相似图片
def _computeHashes(path_file, hash_size, highfreq_factor, img_scale):
    # Compute the four image hashes of one file, opening it only once and
    # closing it afterwards.
    with Image.open(path_file) as img:
        return (
            # perceptual hash
            imagehash.phash(img, hash_size=hash_size, highfreq_factor=highfreq_factor),
            # average hash
            imagehash.average_hash(img, hash_size=hash_size),
            # difference (gradient) hash
            imagehash.dhash(img, hash_size=hash_size),
            # discrete-wavelet-transform hash
            imagehash.whash(img, hash_size=hash_size, image_scale=img_scale, mode='db4'),
        )


def _hashSimilarity(hashes_a, hashes_b):
    # Best similarity over the paired hashes: 1 - (differing bits / total bits).
    return max(1 - (a - b) / a.hash.size for a, b in zip(hashes_a, hashes_b))


def hashDetect(file_path, threshold=0.85):
    '''
    Find unreadable and near-duplicate '.jpg' files in a directory.

    Files are visited in sorted name order. A file is reported as a
    duplicate when, for at least one of the four hashes (phash, average
    hash, dhash, whash), its similarity to an earlier file that was kept
    exceeds threshold. Files that cannot be opened or hashed are reported
    as well. Nothing is deleted here; the caller decides what to do with
    the returned paths.

    parameter:
        file_path : directory holding the images
        threshold : similarity in [0, 1] above which two images are
                    treated as the same picture

    return:
        list of file paths that are unreadable or duplicates
    '''
    highfreq_factor = 4  # resize factor used by phash
    hash_size = 32  # hashes are hash_size x hash_size bits
    img_scale = 64  # image_scale passed to whash

    del_list = []
    candidates = []  # (path, hashes) for every readable image
    # Sorted so that the choice of which duplicate survives is deterministic.
    for file in tqdm(sorted(os.listdir(file_path)), desc='Scaning ', ncols=80, unit='img'):
        if os.path.splitext(file)[1] != '.jpg':
            continue
        path_file = os.path.join(file_path, file)
        try:
            hashes = _computeHashes(path_file, hash_size, highfreq_factor, img_scale)
        except Exception:
            # Anything that cannot be decoded or hashed is treated as a
            # broken image. KeyboardInterrupt is not caught, so Ctrl-C does
            # not mark a healthy file for deletion.
            del_list.append(path_file)
            continue
        candidates.append((path_file, hashes))

    # Keep the first image of every group of look-alikes; every later image
    # is compared against all images kept so far, so no pair is skipped.
    kept_hashes = []
    for path_file, hashes in tqdm(candidates, desc='Calculating ', ncols=80, unit='img'):
        if any(_hashSimilarity(kept, hashes) > threshold for kept in kept_hashes):
            del_list.append(path_file)
        else:
            kept_hashes.append(hashes)

    print('Find total number of same images: {}\n'.format(len(del_list)))
    return del_list
def imgDeepClean(file_path):
    '''
    Delete the files that hashDetect reports for the given directory.

    A file that cannot be removed is skipped, and the final message reports
    how many files were actually removed.

    parameter:
        file_path : directory holding the images
    '''
    print('Delete same images')
    clean_file_list = hashDetect(file_path)
    if not clean_file_list:
        print('None images')
        return

    print('Delete {} images'.format(len(clean_file_list)))
    removed = 0
    for inv_file in tqdm(clean_file_list, desc='Deleting ', ncols=80, unit='img'):
        try:
            os.remove(inv_file)
        except OSError:
            # File vanished or is not removable: leave it and keep going.
            continue
        removed += 1
    print('Delete {} images Done\n'.format(removed))