自己写 Python 爬虫脚本，从百度上下载图片

参考URL: https://blog.csdn.net/z704630835/article/details/82992036

1 下载脚本

# 导入需要的库
import requests
import os
import json

# 爬取百度图片，解析页面的函数
def getManyPages(keyword, pages, timeout=10):
    """Fetch Baidu image-search result pages for a keyword.

    Args:
        keyword: Search keyword for the images to download.
        pages: Number of result pages to request (30 results per page).
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list with one entry per successfully fetched page. Each entry is
        the value of the ``data`` field of that page's JSON response. Pages
        that fail to download, fail to parse, or carry no ``data`` field
        are skipped, so the list never contains ``None``.
    """
    page_size = 30
    url = 'https://image.baidu.com/search/acjson'
    urls = []

    # Offsets run 30, 60, ..., 30 * pages, exactly as in the original loop.
    for offset in range(page_size, page_size * pages + page_size, page_size):
        query = {
            'tn': 'resultjson_com',
            'ipn': 'rj',
            'ct': 201326592,
            'is': '',
            'fp': 'result',
            'queryWord': keyword,
            'cl': 2,
            'lm': -1,
            'ie': 'utf-8',
            'oe': 'utf-8',
            'adpicid': '',
            'st': -1,
            'z': '',
            'ic': 0,
            'word': keyword,
            's': '',
            'se': '',
            'tab': '',
            'width': '',
            'height': '',
            'face': 0,
            'istype': 2,
            'qc': '',
            'nc': 1,
            'fr': '',
            'pn': offset,
            'rn': page_size,
            'gsm': '1e',
            '1488942260214': '',
        }
        try:
            data = requests.get(url, params=query, timeout=timeout).json().get('data')
        except ValueError:
            # Every JSON decode error requests can raise (json or simplejson
            # flavoured) derives from ValueError, so this is caught first.
            print("解析出错")
            continue
        except requests.exceptions.RequestException as exc:
            # A single failed page should not abort the whole crawl.
            print('请求出错：%s' % exc)
            continue
        if data is None:
            # No 'data' field in the response: nothing to download from it.
            continue
        urls.append(data)
    return urls

# 下载图片并保存
def getImg(dataList, localPath, timeout=10):
    """Download every thumbnail listed in the result pages and save it.

    Files are written into ``localPath`` as ``0.jpg``, ``1.jpg``, ... in
    download order; the counter only advances on a successful download.

    Args:
        dataList: Iterable of result pages, each an iterable of dicts that
            may carry a ``thumbURL`` key. Empty or ``None`` pages are skipped.
        localPath: Directory to save the images in; created if missing.
        timeout: Seconds to wait for each HTTP request before giving up.
    """
    os.makedirs(localPath, exist_ok=True)
    saved = 0
    for page in dataList:
        if not page:
            continue
        for item in page:
            thumb_url = item.get('thumbURL')
            if thumb_url is None:
                print('图片链接不存在')
                continue
            print('正在下载：%s' % thumb_url)
            try:
                resp = requests.get(thumb_url, timeout=timeout)
            except requests.exceptions.RequestException as exc:
                # Keep going: one broken thumbnail should not stop the batch.
                print('下载失败：%s' % exc)
                continue
            # os.path.join keeps the file inside localPath even when the
            # caller omits the trailing slash.
            target = os.path.join(localPath, '%d.jpg' % saved)
            with open(target, 'wb') as fh:
                fh.write(resp.content)
            saved += 1

# 根据关键词来下载图片
if __name__ == '__main__':
    # Search keyword and number of result pages to crawl.
    keyword, page_count = '吃惊', 20
    # Directory the downloaded thumbnails are saved in.
    save_dir = './data/chijing/'

    dataList = getManyPages(keyword, page_count)
    getImg(dataList, save_dir)

2 通过人脸检测过滤非人脸图片并裁剪出人脸

2.1 使用opencv的人脸检测
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import os.path as osp
import cv2
import glob

from io_helper import *

# Directory holding the Haar cascade XML files of a local OpenCV 3.4.2 install.
cv_root = 'D:/install packages/opencv-3.4.2/data/haarcascades'
# Frontal-face cascade, loaded by the detection functions below.
cv_face_model_path = cv_root + '/haarcascade_frontalface_alt2.xml'
# Profile-face cascade; only referenced from commented-out code in this file.
cv_face_model_path2 = cv_root + '/haarcascade_profileface.xml'


def test_face_detect_cv(filepath=''):
    """Detect frontal faces in one image and show them with red boxes.

    Args:
        filepath: Path of the image to inspect. The default empty string is
            the original placeholder; pass a real image path to see a result.
    """
    classifier1 = cv2.CascadeClassifier(cv_face_model_path)  # frontal faces
    img = cv2.imread(filepath)
    if img is None:
        # cv2.imread signals an unreadable file by returning None, which
        # would otherwise make cvtColor raise.
        print('cannot read image: %r' % filepath)
        return

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    faceRects = classifier1.detectMultiScale(gray,
                                             scaleFactor=1.1,
                                             minNeighbors=1,
                                             minSize=(10, 10))
    # len() rather than truthiness: the result may be a NumPy array.
    if len(faceRects):
        for x, y, w, h in faceRects:  # outline each detected face
            cv2.rectangle(img, (x, y), (x + w, y + h), (0, 0, 255), 1)

        cv2.imshow('a', img)
        cv2.waitKey(0)
    cv2.destroyAllWindows()


def face_detect_save(path=r'D:/AI/DataSet/emotion/fer2013/train_class'):
    """Crop detected faces from a class-per-folder image set.

    For every ``<path>/<class>/*.jpg`` image, the frontal-face cascade is
    run. Each detected face is cropped and written under
    ``<path>/cut_face/<class>/``; images without a detection are copied
    unchanged to ``<path>/no_face/<class>/``.

    When an image contains several faces, the first crop keeps the original
    file name and the others get a ``_<index>`` suffix so that no crop
    overwrites another.

    Args:
        path: Root directory of the data set (one sub-directory per class).
    """
    # Local import: shutil is not among this script's top-level imports.
    import shutil

    # Without recursive=True, '**' behaves like '*', so this matches images
    # exactly one directory level below `path`.
    files = glob.glob(path + '/**/*.jpg')

    new_dir = path + '/cut_face'
    new_dir2 = path + '/no_face'
    mkdir_if_not_exist(new_dir)
    mkdir_if_not_exist(new_dir2)

    classifier1 = cv2.CascadeClassifier(cv_face_model_path)  # frontal faces
    for filepath in files:
        # Name of the parent (class) directory, independent of whether the
        # platform uses '/' or '\\' as separator.
        class_name = osp.basename(osp.dirname(filepath))
        chd_dir = new_dir + '/' + class_name
        mkdir_if_not_exist(chd_dir)
        chd_dir2 = new_dir2 + '/' + class_name
        mkdir_if_not_exist(chd_dir2)

        filename = osp.basename(filepath)
        img = cv2.imread(filepath)
        if img is None:
            # cv2.imread returns None for files it cannot decode.
            print('skip unreadable image: %s' % filepath)
            continue

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        faceRects = classifier1.detectMultiScale(gray,
                                                 scaleFactor=1.1,
                                                 minNeighbors=1,
                                                 minSize=(10, 10))
        # len() rather than truthiness: the result may be a NumPy array.
        if len(faceRects):
            stem, ext = osp.splitext(filename)
            for idx, (x, y, w, h) in enumerate(faceRects):
                face_roi = img[y:y + h, x:x + w, :]
                out_name = filename if idx == 0 else '%s_%d%s' % (stem, idx, ext)
                cv2.imwrite(chd_dir + '/' + out_name, face_roi)
        else:
            shutil.copy(filepath, chd_dir2 + '/' + filename)

    print('work is done .')


# Script entry point: run the face cropping/filtering pass when executed directly.
if __name__ == '__main__':
    face_detect_save()

2.2 使用mtcnn的包进行人脸检测
-----------------------------------------
使用python公开包 mtcnn 来进行人脸检测和关键点检测
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple mtcnn

import os

# Select the GPU through environment variables, set before TensorFlow is
# imported so they are already in place when it initializes.
gpu_id = 3
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
import tensorflow as tf
from mtcnn.mtcnn import MTCNN

# NOTE(review): `img` is not defined in this snippet; it must be an image
# array loaded beforehand. The mtcnn package documents RGB input, so an image
# read with cv2.imread presumably needs a BGR->RGB conversion first - confirm.
detector = MTCNN(scale_factor=0.99)
face_list = detector.detect_faces(img)

# Image bounds used to clamp the padded crop below (rows = height,
# columns = width, matching the img[y..., x..., :] indexing).
src_h, src_w = img.shape[:2]

for item in face_list:
    box = item['box']
    conf = item['confidence']
    keypoints_dict = item['keypoints']
    # Example value:
    # {'left_eye': (14, 16), 'right_eye': (31, 12), \
    # 'nose': (23, 25), 'mouth_left': (19, 35), 'mouth_right': (33, 32)}
    left_eyeXY = keypoints_dict['left_eye']
    right_eyeXY = keypoints_dict['right_eye']
    noseXY = keypoints_dict['nose']
    mouth_leftXY = keypoints_dict['mouth_left']
    mouth_rightXY = keypoints_dict['mouth_right']
    if conf > 0:
        print('detect a face .')
        x, y, w, h = box
        # Grow the box by `offset` pixels on every side, clamped to the image.
        offset = 5
        x = max(0, x - offset)
        y = max(0, y - offset)
        w = min(w + 2 * offset, src_w - x)
        h = min(h + 2 * offset, src_h - y)

        face_img = img[y:y + h, x:x + w, :]


-----------------------------------
2.3 使用关键点来进行人脸对齐

自己写python爬虫从百度上下载图片脚本

猜你喜欢