Face detection mtcnn

MTCNN简介

《Joint Face Detection and Alignment using Multi-task Cascaded Convolutional Networks》论文解读。
相比于R-CNN系列通用检测方法，本文更加针对人脸检测这一专门的任务，速度和精度都有足够的提升。R-CNN，Fast R-CNN，Faster R-CNN这一系列的方法不是一篇博客能讲清楚的，有兴趣可以找相关论文阅读。类似于TCDCN，本文提出了一种Multi-task的人脸检测框架，将人脸检测和人脸特征点检测同时进行。论文使用3个CNN级联的方式，和Viola-Jones类似，实现了coarse-to-fine的算法结构。

MTCNN主要包括三个部分,PNet,RNet,ONet
其中PNet在训练阶段的输入尺寸为12*12，RNet的输入尺寸为24*24，ONet的输入尺寸为48*48。PNet网络参数最少，caffemodel仅有28.2KB，所以速度最快。RNet的网络参数次之，caffemodel大小为407.9KB，ONet的caffemodel大小为1.6M，三个网络合起来不到2M。

训练阶段

训练数据生成

该算法训练数据来源于wider和celeba两个公开的数据库，wider提供人脸检测数据，在大图上标注了人脸框groundtruth的坐标信息，celeba提供了5个landmark点的数据。根据参与任务的不同，将训练数据分为四类：人脸正样本（positives）、非人脸负样本（negatives）、部分脸（partfaces）、关键点（landmark）。positives、negatives、partfaces由随机取的框与groundtruth的overlap区域比例的大小决定，大于0.65为positives，小于0.3为negatives，0.4到0.65之间为partfaces。positives和negatives参与到分类任务，positives和partfaces参与到回归任务，landmark数据参与到关键点回归任务。关键点回归仅在第三个net中使用，此时landmark数据的人脸框位置可由前两个net的模型检测得到，或是由landmark的坐标位置拟合出来。在每个batchSize中的样本比例如下，positives：negatives：partfaces：landmark = 1 ： 3 ： 1 ： 2。到此为止，数据的来源组成我们都交代完毕，但是如何生成训练数据呢？这也是很多MTCNN的拥趸希望作者能开源训练代码的原因。本文以caffe为例，采用hdf5数据格式，先由脚本随机在wider上截取positives、negatives、partfaces，注意要覆盖到不同的尺度，负样本的量级达到2000w左右（该文作者透露）。之所以采用hdf5的数据格式，是考虑到能够方便地实现多label，以前两个net为例，笔者采用的label为7位，分别是 f1 c1 f2 dx1 dy1 dx2 dy2，f1和f2是标记位，f1标记该样本是否参与分类，f2标记该样本是否参与回归，f1和f2后面紧跟的是真实的label信息，c1是样本类别，dx dy是偏移量。与此对应的是，笔者自己实现了带标志位的softmax和euclidean loss。

正负样本，部分样本提取：

从Wider_face随机选出边框，然后和标注数据计算IOU，如果大于0.65，则为正样本，大于0.4小于0.65为部分样本，小于0.3为负样本（IOU介于0.3和0.4之间的边框不使用）．
计算边框偏移．对于边框，(x1,y1)为左上角坐标，(x2,y2)为右下角坐标，新剪裁的边框坐标为(xn1,yn1),(xn2,yn2),width,height．则offset_x1 = (x1 - xn1)/width,同上，计算另三个点的坐标偏移．
对于正样本，部分样本均有边框信息，而对于负样本不需要边框信息

关键点样本提取

从celeba中提取，可以根据标注的边框，在满足正样本的要求下，随机裁剪出图片，然后调整关键点的坐标．

#生成positive,negative,part样本

import sys
sys.path.append('D:\\Anaconda2\\libs')    # 在windows系统上，导入python库目录
import numpy as np
import cv2
import os
import numpy.random as npr
from utils import IoU

# stdsize: side length (in pixels) that every positive / negative / part crop
# is resized to before it is saved.
stdsize = 48
# Source annotation list and the directory holding the original images.
anno_file = "E:/face_alignment/data/CelebA/Anno/mtcnn_train_label_2.txt"
im_dir = "E:/face_alignment/data/CelebA/Img/img_celeba.7z/img_celeba/"
# Output locations, all rooted at a directory named after the crop size.
save_dir = "./%d" % stdsize
pos_save_dir = "%d/positive" % stdsize
part_save_dir = "%d/part" % stdsize
neg_save_dir = "%d/negative" % stdsize

# Directory-creation helper.
def mkr(dr):
    """Create directory `dr`, including missing parents, if it is not there yet.

    Calling it on an existing directory is a no-op.  If `dr` exists but is
    not a directory, os.makedirs raises OSError instead of the problem being
    silently ignored.
    """
    if not os.path.isdir(dr):
        os.makedirs(dr)

# Create the output directories, root directory first.
for out_dir in (save_dir, pos_save_dir, part_save_dir, neg_save_dir):
    mkr(out_dir)

# Label list files produced by this script (image path + labels, one crop per
# line): f1 = positives, f2 = negatives, f3 = part faces.
f1 = open(os.path.join(save_dir, 'pos_%s.txt' % stdsize), 'w')
f2 = open(os.path.join(save_dir, 'neg_%s.txt' % stdsize), 'w')
f3 = open(os.path.join(save_dir, 'part_%s.txt' % stdsize), 'w')
# Read the annotation file of the original images.
with open(anno_file, 'r') as f:
    annotations = f.readlines()
num = len(annotations)
print("%d pics in total" % num)
# Running counters: saved positives / negatives / part faces, source images
# seen, and ground-truth boxes processed.
p_idx = n_idx = d_idx = 0
idx = 0
box_idx = 0

# For every annotated source image, cut out negative, positive and part-face
# crops around the labelled boxes, save each crop as a stdsize x stdsize image
# and append its label line to the matching list file.
NEG_PER_IMAGE = 100       # negatives wanted from each source image
MAX_NEG_ATTEMPTS = 10000  # cap on random tries so a face-filled image cannot hang the run
MIN_NEG_SIZE = 40         # smallest negative crop side, in pixels
CROPS_PER_BOX = 50        # random crops tried around each ground-truth box

try:
    for annotation in annotations:  # one line per source image
        # Line layout: image name, 4 values per bbox, then 10 landmark values.
        fields = annotation.strip().split(' ')
        if fields == ['']:
            continue  # blank line
        im_path = im_dir + fields[0]
        # All bbox values reshaped to one (x1, y1, x2, y2) row per box.
        boxes = np.array([float(v) for v in fields[1:-10]],
                         dtype=np.float32).reshape(-1, 4)
        landmarks = [float(v) for v in fields[-10:]]

        img = cv2.imread(im_path)
        idx += 1
        if idx % 100 == 0:
            print("%d images done" % idx)
        if img is None:
            # cv2.imread reports a missing / unreadable file by returning None.
            print("skip unreadable image: %s" % im_path)
            continue

        height, width = img.shape[:2]

        # ---- negatives: random square crops with IoU < 0.3 to every box ----
        neg_num = 0
        attempts = 0
        # Crop side is drawn from [MIN_NEG_SIZE, max_neg_size); randint needs
        # low < high, so images that are too small produce no negatives.
        max_neg_size = min(width, height) // 2
        while (max_neg_size > MIN_NEG_SIZE and neg_num < NEG_PER_IMAGE
               and attempts < MAX_NEG_ATTEMPTS):
            attempts += 1
            size = npr.randint(MIN_NEG_SIZE, max_neg_size)
            # Top-left corner chosen so the crop stays inside the image.
            nx = npr.randint(0, width - size)
            ny = npr.randint(0, height - size)
            crop_box = np.array([nx, ny, nx + size, ny + size])

            if boxes.shape[0] > 0 and np.max(IoU(crop_box, boxes)) >= 0.3:
                continue  # overlaps a face too much to be a negative

            # Only accepted crops are cut out and resized.
            cropped_im = img[ny:ny + size, nx:nx + size, :]
            resized_im = cv2.resize(cropped_im, (stdsize, stdsize),
                                    interpolation=cv2.INTER_LINEAR)
            save_file = os.path.join(neg_save_dir, "%s.jpg" % n_idx)
            # Label line: image path (without extension) and class 0.
            f2.write(str(stdsize) + "/negative/%s" % n_idx + ' 0\n')
            cv2.imwrite(save_file, resized_im)
            n_idx += 1
            neg_num += 1

        # ---- positives / part faces: jittered crops around each box ----
        for box in boxes:
            # box is (x_left, y_top, x_right, y_bottom)
            x1, y1, x2, y2 = box
            w = x2 - x1 + 1
            h = y2 - y1 + 1

            # Ignore small faces: their ground-truth boxes may be inaccurate.
            if max(w, h) < 12 or x1 < 0 or y1 < 0:
                continue

            box_ = box.reshape(1, -1)  # single-row form expected by IoU
            # Largest centre shift per axis: 20% of the face size, truncated
            # toward zero.
            max_dx = int(w * 0.2)
            max_dy = int(h * 0.2)

            for i in range(CROPS_PER_BOX):
                # Crop side between 0.8 * min(w, h) and ceil(1.25 * max(w, h)).
                size = npr.randint(int(min(w, h) * 0.8),
                                   int(np.ceil(1.25 * max(w, h))))
                # Offset of the crop centre from the box centre; randint
                # rejects an empty range, so a zero range means no shift.
                delta_x = npr.randint(-max_dx, max_dx) if max_dx > 0 else 0
                delta_y = npr.randint(-max_dy, max_dy) if max_dy > 0 else 0

                nx1 = max(x1 + w / 2 + delta_x - size // 2, 0)
                ny1 = max(y1 + h / 2 + delta_y - size // 2, 0)
                nx2 = nx1 + size
                ny2 = ny1 + size

                # Drop degenerate crops and crops reaching outside the image.
                if size < 1 or nx2 > width or ny2 > height:
                    continue
                crop_box = np.array([nx1, ny1, nx2, ny2])

                # IoU >= 0.65 -> positive, 0.4 <= IoU < 0.65 -> part face,
                # anything lower is ignored.
                overlap = IoU(crop_box, box_)
                if overlap >= 0.65:
                    is_positive = True
                elif overlap >= 0.4:
                    is_positive = False
                else:
                    continue

                # Bbox regression targets, from x1 = nx1 + size * offset_x1.
                offset_x1 = (x1 - nx1) / float(size)
                offset_y1 = (y1 - ny1) / float(size)
                offset_x2 = (x2 - nx1) / float(size)
                offset_y2 = (y2 - ny1) / float(size)

                # Landmarks relative to the crop, normalised by its size.
                rel_pts = list(landmarks)
                for k in range(len(rel_pts) // 2):
                    rel_pts[2 * k] = (rel_pts[2 * k] - nx1) / float(size)
                    rel_pts[2 * k + 1] = (rel_pts[2 * k + 1] - ny1) / float(size)

                label_tail = ' %f %f %f %f' % (offset_x1, offset_y1,
                                               offset_x2, offset_y2)
                label_tail += ''.join(' %f' % p for p in rel_pts) + '\n'

                cropped_im = img[int(ny1):int(ny2), int(nx1):int(nx2), :]
                resized_im = cv2.resize(cropped_im, (stdsize, stdsize),
                                        interpolation=cv2.INTER_LINEAR)

                if is_positive:
                    save_file = os.path.join(pos_save_dir, "%s.jpg" % p_idx)
                    # path, class 1, bbox offsets, landmark offsets
                    f1.write(str(stdsize) + "/positive/%s" % p_idx + ' 1' + label_tail)
                    cv2.imwrite(save_file, resized_im)
                    p_idx += 1
                else:
                    save_file = os.path.join(part_save_dir, "%s.jpg" % d_idx)
                    # path, class -1, bbox offsets, landmark offsets
                    f3.write(str(stdsize) + "/part/%s" % d_idx + ' -1' + label_tail)
                    cv2.imwrite(save_file, resized_im)
                    d_idx += 1

            box_idx += 1
            print("%s images done, pos: %s part: %s neg: %s" % (idx, p_idx, d_idx, n_idx))
finally:
    # Flush and close the list files even if an image or annotation fails.
    f1.close()
    f2.close()
    f3.close()

网络结构

三个net的网络结构如上图所示，注意pnet是全卷积的结构，不包含fc层。笔者在训练pnet和rnet的时候，并没有加入landmark回归的任务，分类和人脸框回归的loss_weight之比为1：0.5，onet加入landmark回归，分类、人脸框回归和关键点回归的loss_weight之比为1：0.5：0.5。

训练主要包括三个任务

人脸分类任务：利用正样本和负样本进行训练
人脸边框回归任务：利用正样本和部分样本进行训练
关键点检测任务：利用关键点样本进行训练

代价函数

loss修改：由于训练过程中需要同时计算３个loss，但是对于不同的任务，每个任务需要的loss不同．所以在整理数据时，对每个图片标注了15个label的信息：

第1列：为正负样本标志，１正样本,0负样本,2部分样本,3关键点信息
第2-5列：为边框偏移，为float类型，对于无边框信息的数据，全部置为-1
第6-15列：为关键点偏移，为float类型，对于无边框信息的数据，全部置为-1

修改softmax_loss_layer.cpp：增加判断，只对于1,0计算loss值。修改euclidean_loss_layer.cpp：增加判断，对于置为-1的不进行loss计算。困难样本选择：论文中作者对于人脸分类任务，采用了在线困难样本选择，实现过程如下：修改softmax_loss_layer.cpp，根据计算出的loss值，进行排序，只对于前70%对应的数据，进行反向传播．

预测阶段

TIP:预测可输入任意大小的图片（因为P_net是全卷积网络）

Proposal Network (P-Net)：在构建图像金字塔的基础上，利用fully convolutional network来进行检测，同时利用boundingbox regression 和 NMS来进行修正。（注意：这里的全卷积网络与R-CNN里面带反卷积的网络是不一样的，这里只是指只有卷积层，可以接受任意尺寸的输入，靠网络stride来自动完成滑窗）

Refine Network (R-Net)：该网络结构还是通过边界框回归和NMS来去掉那些false-positive区域。

只是由于该网络结构和P-Net网络结构有差异，多了一个全连接层，所以会取得更好的抑制false-positive的作用。

Output Network (O-Net)：该层比R-Net层又多了一层卷积层，所以处理的结果会更加精细。作用和R-Net层作用一样。但是该层对人脸区域进行了更多的监督，同时还会输出5个地标（landmark）。

IOU概念：

def IoU(box, boxes):
    """Compute IoU between one box and a set of ground-truth boxes.

    Coordinates are treated as inclusive pixel indices, so a box spanning
    x1..x2 is (x2 - x1 + 1) wide.

    Parameters:
    ----------
    box: numpy array, shape (4, ) or (5, ): x1, y1, x2, y2[, score]
        input box; only the first four entries are read
    boxes: numpy array, shape (n, 4): x1, y1, x2, y2
        input ground truth boxes; a single box of shape (4, ) is also
        accepted and treated as n = 1

    Returns:
    -------
    ovr: numpy.array, shape (n, )
        IoU of `box` with every row of `boxes`
    """
    boxes = np.asarray(boxes)
    if boxes.ndim == 1:
        # A lone ground-truth box: promote it to one row.
        boxes = boxes[np.newaxis, :]

    box_area = (box[2] - box[0] + 1) * (box[3] - box[1] + 1)
    area = (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1)

    # Corners of the intersection rectangle.
    xx1 = np.maximum(box[0], boxes[:, 0])
    yy1 = np.maximum(box[1], boxes[:, 1])
    xx2 = np.minimum(box[2], boxes[:, 2])
    yy2 = np.minimum(box[3], boxes[:, 3])

    # Width and height of the intersection; zero when the boxes are disjoint.
    w = np.maximum(0, xx2 - xx1 + 1)
    h = np.maximum(0, yy2 - yy1 + 1)

    inter = w * h
    # true_divide keeps the ratio fractional for integer inputs on Python 2,
    # where `/` on integer arrays is floor division.
    ovr = np.true_divide(inter, box_area + area - inter)
    return ovr

非极大值抑制(NMS)概念：
RCNN会从一张图片中找出n个可能是物体的矩形框，然后为每个矩形框计算类别分类概率：

就像上面的图片一样，定位一个车辆，最后算法就找出了一堆的方框，我们需要判别哪些矩形框是没用的。非极大值抑制的方法是：先假设有6个矩形框，根据分类器的类别分类概率做排序，假设从小到大属于车辆的概率分别为A、B、C、D、E、F。

从最大概率矩形框F开始，分别判断A~E与F的重叠度IOU是否大于某个设定的阈值;
假设B、D与F的重叠度超过阈值，那么就扔掉B、D；并标记第一个矩形框F，是我们保留下来的。
从剩下的矩形框A、C、E中，选择概率最大的E，然后判断E与A、C的重叠度，重叠度大于一定的阈值，那么就扔掉；并标记E是我们保留下来的第二个矩形框。

就这样一直重复，找到所有被保留下来的矩形框。非极大值抑制（NMS）顾名思义就是抑制不是极大值的元素，搜索局部的极大值。这个局部代表的是一个邻域，邻域有两个参数可变，一是邻域的维数，二是邻域的大小。这里不讨论通用的NMS算法，只讨论在目标检测中用于提取分数最高的窗口的NMS。例如在行人检测中，滑动窗口经提取特征，经分类器分类识别后，每个窗口都会得到一个分数。但是滑动窗口会导致很多窗口与其他窗口存在包含或者大部分交叉的情况。这时就需要用到NMS来选取那些邻域里分数最高（是行人的概率最大）的窗口，并且抑制那些分数低的窗口。

def py_nms(dets, thresh, mode="Union"):
    """
    greedily select boxes with high confidence
    keep boxes overlap <= thresh
    rule out overlap > thresh
    :param dets: [[x1, y1, x2, y2 score]]; an empty array yields []
    :param thresh: retain overlap <= thresh
    :param mode: "Union" divides the intersection by the union area (IoU),
        "Minimum" divides it by the smaller of the two box areas
    :return: indexes to keep, ordered by descending score
    :raises ValueError: if mode is neither "Union" nor "Minimum"
    """
    if mode not in ("Union", "Minimum"):
        raise ValueError('mode must be "Union" or "Minimum", got %r' % (mode,))

    dets = np.asarray(dets)
    if dets.size == 0:
        # Nothing to suppress; also covers a 1-D empty array such as np.array([]).
        return []

    x1 = dets[:, 0]
    y1 = dets[:, 1]
    x2 = dets[:, 2]
    y2 = dets[:, 3]
    scores = dets[:, 4]

    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    # Candidate indexes, best score first.
    order = scores.argsort()[::-1]

    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        rest = order[1:]

        # Intersection of the kept box with every remaining candidate.
        xx1 = np.maximum(x1[i], x1[rest])
        yy1 = np.maximum(y1[i], y1[rest])
        xx2 = np.minimum(x2[i], x2[rest])
        yy2 = np.minimum(y2[i], y2[rest])

        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        if mode == "Union":
            denom = areas[i] + areas[rest] - inter
        else:
            denom = np.minimum(areas[i], areas[rest])
        ovr = np.true_divide(inter, denom)

        # Only candidates that do not overlap the kept box too much survive.
        order = rest[np.where(ovr <= thresh)[0]]

    return keep

预测实现代码

class MtcnnDetector(object):
    """Cascaded MTCNN face detector chaining P-Net, R-Net and O-Net.

    ``detectors`` holds the three stage models in that order; the only thing
    this class calls on them is ``predict``.  ``detect`` and ``detect_face``
    test each stage for truthiness before running it.
    """

    def __init__(self,
                 detectors,
                 min_face_size=25,
                 stride=2,
                 threshold=(0.6, 0.7, 0.7),
                 scale_factor=0.79,
                 #scale_factor=0.709,#change
                 slide_window=False):
        """
        Parameters:
        ----------
            detectors: sequence of three objects
                P-Net, R-Net and O-Net models, in that order
            min_face_size: number
                smallest face size looked for; sets the first pyramid scale
            stride: int
                stored as self.stride
            threshold: sequence of three floats
                minimum face score for P-Net, R-Net and O-Net
            scale_factor: float
                factor applied to the image scale between pyramid levels
            slide_window: bool
                stored as self.slide_window
        """
        self.pnet_detector = detectors[0]
        self.rnet_detector = detectors[1]
        self.onet_detector = detectors[2]
        self.min_face_size = min_face_size
        self.stride = stride
        # Own copy, so instances never share one mutable default list.
        self.thresh = list(threshold)
        self.scale_factor = scale_factor
        self.slide_window = slide_window

    def convert_to_square(self, bbox):
        """
            convert bbox to square
        Parameters:
        ----------
            bbox: numpy array , shape n x 5
                input bbox
        Returns:
        -------
            square bbox (a new array; the longer side is kept and the box
            stays centred on the original one)
        """
        square_bbox = bbox.copy()

        h = bbox[:, 3] - bbox[:, 1] + 1
        w = bbox[:, 2] - bbox[:, 0] + 1
        max_side = np.maximum(h, w)
        square_bbox[:, 0] = bbox[:, 0] + w * 0.5 - max_side * 0.5
        square_bbox[:, 1] = bbox[:, 1] + h * 0.5 - max_side * 0.5
        square_bbox[:, 2] = square_bbox[:, 0] + max_side - 1
        square_bbox[:, 3] = square_bbox[:, 1] + max_side - 1
        return square_bbox

    def calibrate_box(self, bbox, reg):
        """
            calibrate bboxes
        Parameters:
        ----------
            bbox: numpy array, shape n x 5
                input bboxes
            reg:  numpy array, shape n x 4
                bboxes adjustment, expressed as fractions of box width
                (x entries) and box height (y entries)
        Returns:
        -------
            bboxes after refinement (a new array)
        """
        bbox_c = bbox.copy()
        w = np.expand_dims(bbox[:, 2] - bbox[:, 0] + 1, 1)
        h = np.expand_dims(bbox[:, 3] - bbox[:, 1] + 1, 1)
        # Per-coordinate scale: x offsets by width, y offsets by height.
        reg_m = np.hstack([w, h, w, h])
        bbox_c[:, 0:4] = bbox_c[:, 0:4] + reg_m * reg
        return bbox_c

    def generate_bbox(self, cls_map, reg, scale, threshold):
        """
            generate bbox from feature cls_map
        Parameters:
        ----------
            cls_map: numpy array , n x m 
                detect score for each position
            reg: numpy array , n x m x 4
                bbox
            scale: float number
                scale of this detection
            threshold: float number
                detect threshold
        Returns:
        -------
            bbox array, one row per position scoring above threshold:
            x1, y1, x2, y2, score, dx1, dy1, dx2, dy2
            (an empty array when no position passes)
        """
        # NOTE(review): the map stride and window size are fixed here;
        # self.stride is not consulted.
        stride = 2
        cellsize = 12

        t_index = np.where(cls_map > threshold)

        # find nothing
        if t_index[0].size == 0:
            return np.array([])
        #offset
        dx1, dy1, dx2, dy2 = [reg[t_index[0], t_index[1], i] for i in range(4)]

        reg = np.array([dx1, dy1, dx2, dy2])
        score = cls_map[t_index[0], t_index[1]]
        # Map each cell back to a cellsize window in original-image coordinates.
        boundingbox = np.vstack([np.round((stride * t_index[1]) / scale),
                                 np.round((stride * t_index[0]) / scale),
                                 np.round((stride * t_index[1] + cellsize) / scale),
                                 np.round((stride * t_index[0] + cellsize) / scale),
                                 score,
                                 reg])

        return boundingbox.T

    #pre-process images
    def processed_image(self, img, scale):
        """Resize img by `scale` and normalise pixels as (value - 127.5) / 128."""
        height, width, channels = img.shape
        new_height = int(height * scale)  # resized new height
        new_width = int(width * scale)  # resized new width
        new_dim = (new_width, new_height)
        img_resized = cv2.resize(img, new_dim, interpolation=cv2.INTER_LINEAR)  # resized image
        img_resized = (img_resized - 127.5) / 128
        return img_resized

    def pad(self, bboxes, w, h):
        """
            pad the bboxes, also restrict the size of it

            NOTE: the x/y/ex/ey columns are taken by slicing, so when bboxes
            is a numpy array the clipping below is written back into it;
            detect_rnet / detect_onet go on using the clipped boxes.
        Parameters:
        ----------
            bboxes: numpy array, n x 5
                input bboxes
            w: float number
                width of the input image
            h: float number
                height of the input image
        Returns :
        ------
            list of int32 arrays in this order:
            dy, edy, dx, edx : numpy array, n x 1
                start / end point of the bbox in target image
            y, ey, x, ex : numpy array, n x 1
                start / end point of the bbox in original image
            tmpw, tmph: numpy array, n x 1
                width and height of the bbox
        """
        tmpw, tmph = bboxes[:, 2] - bboxes[:, 0] + 1, bboxes[:, 3] - bboxes[:, 1] + 1
        num_box = bboxes.shape[0]

        dx, dy = np.zeros((num_box,)), np.zeros((num_box,))
        edx, edy = tmpw.copy() - 1, tmph.copy() - 1

        x, y, ex, ey = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3]

        # Boxes running past the right edge.
        tmp_index = np.where(ex > w - 1)
        edx[tmp_index] = tmpw[tmp_index] + w - 2 - ex[tmp_index]
        ex[tmp_index] = w - 1

        # Boxes running past the bottom edge.
        tmp_index = np.where(ey > h - 1)
        edy[tmp_index] = tmph[tmp_index] + h - 2 - ey[tmp_index]
        ey[tmp_index] = h - 1

        # Boxes starting left of the image.
        tmp_index = np.where(x < 0)
        dx[tmp_index] = 0 - x[tmp_index]
        x[tmp_index] = 0

        # Boxes starting above the image.
        tmp_index = np.where(y < 0)
        dy[tmp_index] = 0 - y[tmp_index]
        y[tmp_index] = 0

        return_list = [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph]
        return_list = [item.astype(np.int32) for item in return_list]

        return return_list

    def detect_pnet(self, im):
        """Get face candidates through pnet

        Parameters:
        ----------
        im: numpy array
            input image array

        Returns:
        -------
        boxes: numpy array
            detected boxes before calibration
        boxes_c: numpy array
            boxes after calibration
        third value: always None
        All three are None when nothing is detected.
        """
        net_size = 12

        current_scale = float(net_size) / self.min_face_size  # find initial scale
        im_resized = self.processed_image(im, current_scale)
        current_height, current_width, _ = im_resized.shape
        # fcn
        all_boxes = list()
        while min(current_height, current_width) > net_size:
            #return the result predicted by pnet
            #cls_cls_map : H*w*2
            #reg: H*w*4
            cls_cls_map, reg = self.pnet_detector.predict(im_resized)
            #boxes: num*9(x1,y1,x2,y2,score,x1_offset,y1_offset,x2_offset,y2_offset)
            boxes = self.generate_bbox(cls_cls_map[:, :, 1], reg, current_scale, self.thresh[0])

            # Prepare the next pyramid level before possibly skipping this one.
            current_scale *= self.scale_factor
            im_resized = self.processed_image(im, current_scale)
            current_height, current_width, _ = im_resized.shape

            if boxes.size == 0:
                continue
            keep = py_nms(boxes[:, :5], 0.5, 'Union')
            boxes = boxes[keep]
            all_boxes.append(boxes)

        if len(all_boxes) == 0:
            return None, None, None

        all_boxes = np.vstack(all_boxes)

        # merge the detection from first stage
        keep = py_nms(all_boxes[:, 0:5], 0.7, 'Union')
        all_boxes = all_boxes[keep]
        boxes = all_boxes[:, :5]

        bbw = all_boxes[:, 2] - all_boxes[:, 0] + 1
        bbh = all_boxes[:, 3] - all_boxes[:, 1] + 1

        # refine the boxes
        boxes_c = np.vstack([all_boxes[:, 0] + all_boxes[:, 5] * bbw,
                             all_boxes[:, 1] + all_boxes[:, 6] * bbh,
                             all_boxes[:, 2] + all_boxes[:, 7] * bbw,
                             all_boxes[:, 3] + all_boxes[:, 8] * bbh,
                             all_boxes[:, 4]])
        boxes_c = boxes_c.T

        return boxes, boxes_c, None

    def detect_rnet(self, im, dets):
        """Get face candidates using rnet

        Parameters:
        ----------
        im: numpy array
            input image array
        dets: numpy array
            detection results of pnet

        Returns:
        -------
        boxes: numpy array
            detected boxes before calibration
        boxes_c: numpy array
            boxes after calibration
        third value: always None
        All three are None when no candidate passes the threshold.
        """
        h, w, _ = im.shape
        dets = self.convert_to_square(dets)
        dets[:, 0:4] = np.round(dets[:, 0:4])

        [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = self.pad(dets, w, h)
        num_boxes = dets.shape[0]
        cropped_ims = np.zeros((num_boxes, 24, 24, 3), dtype=np.float32)
        for i in range(num_boxes):
            # Zero canvas of the full box size; only the in-image part is filled.
            tmp = np.zeros((tmph[i], tmpw[i], 3), dtype=np.uint8)
            tmp[dy[i]:edy[i] + 1, dx[i]:edx[i] + 1, :] = im[y[i]:ey[i] + 1, x[i]:ex[i] + 1, :]
            cropped_ims[i, :, :, :] = (cv2.resize(tmp, (24, 24)) - 127.5) / 128
        #cls_scores : num_data*2
        #reg: num_data*4
        #landmark: num_data*10
        cls_scores, reg, _ = self.rnet_detector.predict(cropped_ims)
        cls_scores = cls_scores[:, 1]
        keep_inds = np.where(cls_scores > self.thresh[1])[0]
        if len(keep_inds) == 0:
            return None, None, None

        boxes = dets[keep_inds]
        boxes[:, 4] = cls_scores[keep_inds]
        reg = reg[keep_inds]

        keep = py_nms(boxes, 0.6)
        boxes = boxes[keep]
        boxes_c = self.calibrate_box(boxes, reg[keep])
        return boxes, boxes_c, None

    def detect_onet(self, im, dets):
        """Get face candidates using onet

        Parameters:
        ----------
        im: numpy array
            input image array
        dets: numpy array
            detection results of rnet

        Returns:
        -------
        boxes: numpy array
            detected boxes before calibration
        boxes_c: numpy array
            boxes after calibration
        landmark: numpy array
            landmark coordinates of the boxes kept in boxes_c, in image
            coordinates
        All three are None when no candidate passes the threshold.
        """
        h, w, _ = im.shape
        dets = self.convert_to_square(dets)
        dets[:, 0:4] = np.round(dets[:, 0:4])
        [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = self.pad(dets, w, h)
        num_boxes = dets.shape[0]
        cropped_ims = np.zeros((num_boxes, 48, 48, 3), dtype=np.float32)
        for i in range(num_boxes):
            # Zero canvas of the full box size; only the in-image part is filled.
            tmp = np.zeros((tmph[i], tmpw[i], 3), dtype=np.uint8)
            tmp[dy[i]:edy[i] + 1, dx[i]:edx[i] + 1, :] = im[y[i]:ey[i] + 1, x[i]:ex[i] + 1, :]
            cropped_ims[i, :, :, :] = (cv2.resize(tmp, (48, 48)) - 127.5) / 128

        cls_scores, reg, landmark = self.onet_detector.predict(cropped_ims)
        #prob belongs to face
        cls_scores = cls_scores[:, 1]
        keep_inds = np.where(cls_scores > self.thresh[2])[0]
        if len(keep_inds) == 0:
            return None, None, None

        #pickout filtered box
        boxes = dets[keep_inds]
        boxes[:, 4] = cls_scores[keep_inds]
        reg = reg[keep_inds]
        landmark = landmark[keep_inds]

        # Landmarks come out relative to each box; scale by the box size and
        # shift by its top-left corner to get image coordinates.
        box_w = boxes[:, 2] - boxes[:, 0] + 1
        box_h = boxes[:, 3] - boxes[:, 1] + 1
        landmark[:, 0::2] = landmark[:, 0::2] * box_w[:, np.newaxis] + boxes[:, 0][:, np.newaxis] - 1
        landmark[:, 1::2] = landmark[:, 1::2] * box_h[:, np.newaxis] + boxes[:, 1][:, np.newaxis] - 1
        boxes_c = self.calibrate_box(boxes, reg)

        boxes = boxes[py_nms(boxes, 0.6, "Minimum")]
        keep = py_nms(boxes_c, 0.6, "Minimum")
        boxes_c = boxes_c[keep]
        landmark = landmark[keep]
        return boxes, boxes_c, landmark

    #use for video
    def detect(self, img):
        """Detect face over image

        Returns:
        -------
        boxes_c, landmark
            calibrated boxes and landmarks of the last stage that ran.
            Two empty arrays are returned as soon as a stage finds nothing;
            landmark is None when no O-Net is configured.
        """
        # Local import: this class needs `time` and must not depend on a
        # module-level import being present.
        import time

        boxes = None
        landmark = None
        t = time.time()

        # pnet
        t1 = 0
        if self.pnet_detector:
            boxes, boxes_c, _ = self.detect_pnet(img)
            if boxes_c is None:
                return np.array([]), np.array([])

            t1 = time.time() - t
            t = time.time()

        # rnet
        t2 = 0
        if self.rnet_detector:
            boxes, boxes_c, _ = self.detect_rnet(img, boxes_c)
            if boxes_c is None:
                return np.array([]), np.array([])

            t2 = time.time() - t
            t = time.time()

        # onet
        t3 = 0
        if self.onet_detector:
            boxes, boxes_c, landmark = self.detect_onet(img, boxes_c)
            if boxes_c is None:
                return np.array([]), np.array([])

            t3 = time.time() - t
            t = time.time()
            print(
                "time cost " + '{:.3f}'.format(t1 + t2 + t3) + '  pnet {:.3f}  rnet {:.3f}  onet {:.3f}'.format(t1, t2,
                                                                                                                t3))

        return boxes_c, landmark

    def detect_face(self, test_data):
        """Run the cascade over every image yielded by test_data.

        Returns:
        -------
        all_boxes, landmarks
            two lists with one entry per image; an image on which some
            stage finds nothing contributes two empty arrays.
        """
        # Local import: this class needs `time` and must not depend on a
        # module-level import being present.
        import time

        all_boxes = []#save each image's bboxes
        landmarks = []
        batch_idx = 0
        sum_time = 0
        #test_data is iter_
        for databatch in test_data:
            #databatch(image returned)
            if batch_idx % 100 == 0:
                print("%d images done" % batch_idx)
            im = databatch
            # pnet
            t1 = 0
            if self.pnet_detector:
                t = time.time()
                #ignore landmark 
                boxes, boxes_c, landmark = self.detect_pnet(im)
                t1 = time.time() - t
                sum_time += t1
                if boxes_c is None:
                    print("boxes_c is None...")
                    all_boxes.append(np.array([]))
                    #pay attention
                    landmarks.append(np.array([]))
                    batch_idx += 1
                    continue
            # rnet
            t2 = 0
            if self.rnet_detector:
                t = time.time()
                #ignore landmark                 
                boxes, boxes_c, landmark = self.detect_rnet(im, boxes_c)
                t2 = time.time() - t
                sum_time += t2
                if boxes_c is None:
                    all_boxes.append(np.array([]))
                    landmarks.append(np.array([]))
                    batch_idx += 1
                    continue
            # onet
            t3 = 0
            if self.onet_detector:
                t = time.time()
                boxes, boxes_c, landmark = self.detect_onet(im, boxes_c)
                t3 = time.time() - t
                sum_time += t3
                if boxes_c is None:
                    all_boxes.append(np.array([]))
                    landmarks.append(np.array([]))
                    batch_idx += 1
                    continue
                # sum_time accumulates over all images processed so far.
                print(
                    "time cost " + '{:.3f}'.format(sum_time) + '  pnet {:.3f}  rnet {:.3f}  onet {:.3f}'.format(t1, t2,t3))

            all_boxes.append(boxes_c)
            landmarks.append(landmark)
            batch_idx += 1
        #num_of_data*9,num_of_data*10
        return all_boxes, landmarks

MTCNN 人脸检测论文解读，及tensorflow代码实现