21 Projects for Mastering Deep Learning with TensorFlow, Part 06: Face Detection and Recognition - MTCNN Face Detection

This post walks through using MTCNN's pretrained model to locate and crop the faces in an input image. The code is from: https://github.com/davidsandberg/facenet

Reading it alongside the blog post https://blog.csdn.net/FortiLZ/article/details/81396566?tdsourcetag=s_pctim_aiomsg will save you some effort.

If you are interested in how MTCNN is trained, see https://www.cnblogs.com/helloworld0604/p/9808795.html

How MTCNN Works

Detection runs through three networks, PNet, RNet, and ONet, each stage filtering and refining the candidates from the previous one. The networks are built as follows:

PNet

class PNet(Network):
    def setup(self):
        (self.feed('data') #pylint: disable=no-value-for-parameter, no-member
             .conv(3, 3, 10, 1, 1, padding='VALID', relu=False, name='conv1')
             .prelu(name='PReLU1')
             .max_pool(2, 2, 2, 2, name='pool1')
             .conv(3, 3, 16, 1, 1, padding='VALID', relu=False, name='conv2')
             .prelu(name='PReLU2')
             .conv(3, 3, 32, 1, 1, padding='VALID', relu=False, name='conv3')
             .prelu(name='PReLU3')
             .conv(1, 1, 2, 1, 1, relu=False, name='conv4-1')     # face / non-face classification head
             .softmax(3,name='prob1'))

        (self.feed('PReLU3') #pylint: disable=no-value-for-parameter
             .conv(1, 1, 4, 1, 1, relu=False, name='conv4-2'))    # bounding-box regression head

RNet

class RNet(Network):
    def setup(self):
        (self.feed('data') #pylint: disable=no-value-for-parameter, no-member
             .conv(3, 3, 28, 1, 1, padding='VALID', relu=False, name='conv1')
             .prelu(name='prelu1')
             .max_pool(3, 3, 2, 2, name='pool1')
             .conv(3, 3, 48, 1, 1, padding='VALID', relu=False, name='conv2')
             .prelu(name='prelu2')
             .max_pool(3, 3, 2, 2, padding='VALID', name='pool2')
             .conv(2, 2, 64, 1, 1, padding='VALID', relu=False, name='conv3')
             .prelu(name='prelu3')
             .fc(128, relu=False, name='conv4')
             .prelu(name='prelu4')
             .fc(2, relu=False, name='conv5-1')    # face / non-face classification head
             .softmax(1,name='prob1'))

        (self.feed('prelu4') #pylint: disable=no-value-for-parameter
             .fc(4, relu=False, name='conv5-2'))   # bounding-box regression head

ONet

class ONet(Network):
    def setup(self):
        (self.feed('data') #pylint: disable=no-value-for-parameter, no-member
             .conv(3, 3, 32, 1, 1, padding='VALID', relu=False, name='conv1')
             .prelu(name='prelu1')
             .max_pool(3, 3, 2, 2, name='pool1')
             .conv(3, 3, 64, 1, 1, padding='VALID', relu=False, name='conv2')
             .prelu(name='prelu2')
             .max_pool(3, 3, 2, 2, padding='VALID', name='pool2')
             .conv(3, 3, 64, 1, 1, padding='VALID', relu=False, name='conv3')
             .prelu(name='prelu3')
             .max_pool(2, 2, 2, 2, name='pool3')
             .conv(2, 2, 128, 1, 1, padding='VALID', relu=False, name='conv4')
             .prelu(name='prelu4')
             .fc(256, relu=False, name='conv5')
             .prelu(name='prelu5')
             .fc(2, relu=False, name='conv6-1')    # face / non-face classification head
             .softmax(1, name='prob1'))

        (self.feed('prelu5') #pylint: disable=no-value-for-parameter
             .fc(4, relu=False, name='conv6-2'))   # bounding-box regression head

        (self.feed('prelu5') #pylint: disable=no-value-for-parameter
             .fc(10, relu=False, name='conv6-3'))  # facial landmark head: x,y for 5 points
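Before digging into the internals, here is a minimal end-to-end sketch of how these three networks are driven in practice, using the repo's own helpers create_mtcnn (which loads the det1.npy/det2.npy/det3.npy weights shipped in src/align) and detect_face; the image path is a placeholder:

import tensorflow as tf
from scipy import misc
import align.detect_face

minsize = 20                   # smallest face to look for, in pixels
threshold = [0.6, 0.7, 0.7]    # per-stage score thresholds
factor = 0.709                 # image pyramid scale factor

with tf.Graph().as_default():
    sess = tf.Session()
    with sess.as_default():
        # None -> load the .npy weight files from the module's own directory
        pnet, rnet, onet = align.detect_face.create_mtcnn(sess, None)

img = misc.imread('test.jpg')  # placeholder input image
bounding_boxes, points = align.detect_face.detect_face(
    img, minsize, pnet, rnet, onet, threshold, factor)
print('found %d face box(es)' % bounding_boxes.shape[0])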

Setting Up the Environment

export PYTHONPATH=xx/xx/facenet/src    # make sure "import facenet" resolves
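If you would rather not touch the environment, the same thing can be done from Python before the imports (path is the same placeholder as above):

import sys
sys.path.append('xx/xx/facenet/src')   # adjust to your facenet checkout

import facenet
import align.detect_face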

Command-Line Options

import argparse
def parse_arguments(argv):
    parser = argparse.ArgumentParser()

    parser.add_argument('input_dir', type=str, help='Directory with unaligned images.')      # images to run detection on
    parser.add_argument('output_dir', type=str, help='Directory with aligned face thumbnails.')   # where the cropped face images are saved
    parser.add_argument('--image_size', type=int, help='Image size (height, width) in pixels.', default=182)    # size of the saved face crops
    parser.add_argument('--margin', type=int,
                        help='Margin for the crop around the bounding box (height, width) in pixels.', default=44)   # extra margin around the detected face
    parser.add_argument('--random_order',
                        help='Shuffles the order of images to enable alignment using multiple processes.', action='store_true')
    parser.add_argument('--gpu_memory_fraction', type=float,
                        help='Upper bound on the amount of GPU memory that will be used by the process.', default=1.0)
    parser.add_argument('--detect_multiple_faces', type=bool,
                        help='Detect and align multiple faces per image.', default=False) 
    return parser.parse_args(argv)
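A quick sanity check of the parser with made-up paths. One caveat: type=bool is an argparse gotcha, since any non-empty string (including '0' and 'False') parses as True, so --detect_multiple_faces can only be reliably disabled by omitting it:

# equivalent to: python align_dataset_mtcnn.py ./raw ./aligned --image_size 160 --margin 32
args = parse_arguments(['./raw', './aligned',
                        '--image_size', '160',
                        '--margin', '32'])
print(args.image_size, args.margin, args.detect_multiple_faces)   # 160 32 False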

Core detection code: the detect_face function in src/align/detect_face.py

def detect_face(img, minsize, pnet, rnet, onet, threshold, factor):
    """Detects faces in an image, and returns bounding boxes and points for them.
    img: input image
    minsize: minimum face size, e.g. 20
    pnet, rnet, onet: the three stage networks (weights converted from the original Caffe models)
    threshold: threshold=[th1, th2, th3], one threshold per stage, e.g. [0.6, 0.7, 0.7]
    factor: the factor used to create the scaling (image) pyramid of face sizes to detect in the image, e.g. 0.709
    """
    #print('img shape before detect:',img.shape)
    factor_count=0
    total_boxes=np.empty((0,9))
    points=np.empty(0)
    h=img.shape[0]
    w=img.shape[1]
    minl=np.amin([h, w])
    m=12.0/minsize
    minl=minl*m
    # create scale pyramid
    scales=[]
    while minl>=12:         # minl>=12 here is equivalent to min(h,w)*factor^k >= minsize
        scales += [m*np.power(factor, factor_count)]
        minl = minl*factor
        factor_count += 1
    
    #print('scales:',scales)  # with min(h,w)==250 there are 8 scales: the loop condition
    # (12/minsize)*250*factor^N >= 12 is just 250*factor^N >= minsize, so solve for N
    # first stage
    for scale in scales:
        #print('******scale:',scale)
        hs=int(np.ceil(h*scale))
        ws=int(np.ceil(w*scale))
        im_data = imresample(img, (hs, ws))
        im_data = (im_data-127.5)*0.0078125
        
        #print('im_data:',im_data.shape)
        img_x = np.expand_dims(im_data, 0)
        img_y = np.transpose(img_x, (0,2,1,3))
        #print('img_y (to the pnet):',img_y.shape)
        out = pnet(img_y)   # first stage: run PNet on this scale
        out0 = np.transpose(out[0], (0,2,1,3))
        out1 = np.transpose(out[1], (0,2,1,3))
        #print('out0 (after pnet):',out0.shape)
        #print("out0",out0[0,:2,:2,:])
        #from ipdb import set_trace;set_trace()
        boxes, _ = generateBoundingBox(out1[0,:,:,1].copy(), out0[0,:,:,:].copy(), scale, threshold[0])
        '''
        The returned boundingbox has shape (x, 9): the first 4 entries are the block's
        coordinates in the original image, the 5th is the face probability, and the last
        4 are the bounding-box regression values: offsets normalized by the box width and
        height, applied later as x1' = x1 + dx1*regw, y1' = y1 + dy1*regh, and so on.
        '''
        
        # inter-scale nms
        '''
        NMS (Non-Maximum Suppression): among the generated boxes, take the one with the
        highest face probability, compute its area, then compute each remaining box's
        overlap with it, dividing the intersection by either (Min) the smaller of the two
        box areas or (Union) the union of the two areas. If the ratio exceeds the
        threshold, the two boxes are assumed to cover the same region and the lower-score
        one is discarded; otherwise both are kept. Repeat until every box has been visited.
        After all scales are processed we have boxes on the original image from every
        scale, and a second NMS with a higher threshold is run across them.
        '''
        pick = nms(boxes.copy(), 0.5, 'Union')
        if boxes.size>0 and pick.size>0:
            boxes = boxes[pick,:]
            #print(boxes[:,2]-boxes[:,0])
            #print(boxes[:,3]-boxes[:,1])
            total_boxes = np.append(total_boxes, boxes, axis=0)
        
    #print('all scales has done!!!')
    numbox = total_boxes.shape[0]
    if numbox>0:
        pick = nms(total_boxes.copy(), 0.7, 'Union')
        total_boxes = total_boxes[pick,:]
        # PNet slides a 12x12 window, so for a given scale regw and regh are roughly equal
        # calibrate the boxes: after this we have the true box coordinates on the original image
        regw = total_boxes[:,2]-total_boxes[:,0]
        regh = total_boxes[:,3]-total_boxes[:,1]
        qq1 = total_boxes[:,0]+total_boxes[:,5]*regw
        qq2 = total_boxes[:,1]+total_boxes[:,6]*regh
        qq3 = total_boxes[:,2]+total_boxes[:,7]*regw
        qq4 = total_boxes[:,3]+total_boxes[:,8]*regh
        total_boxes = np.transpose(np.vstack([qq1, qq2, qq3, qq4, total_boxes[:,4]]))
        total_boxes = rerec(total_boxes.copy())  # expand each box to a square
        total_boxes[:,0:4] = np.fix(total_boxes[:,0:4]).astype(np.int32)
        dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(total_boxes.copy(), w, h)  # clip coordinates that cross the image border
        # print(dy[0], edy[0], dx[0], edx[0], y[0], ey[0], x[0], ex[0], tmpw[0], tmph[0])
        # (1, 76, 1, 76, 85, 160, 189, 264, 76, 76)

    #print("***********  second stage  **************")
    numbox = total_boxes.shape[0]   #45
    if numbox>0:
        # second stage
        tempimg = np.zeros((24,24,3,numbox))
        for k in range(0,numbox):
            tmp = np.zeros((int(tmph[k]),int(tmpw[k]),3))
            # tmp is height*width, but rnet expects width*height, so don't forget the transpose below
            tmp[dy[k]-1:edy[k],dx[k]-1:edx[k],:] = img[y[k]-1:ey[k],x[k]-1:ex[k],:]
            if tmp.shape[0]>0 and tmp.shape[1]>0 or tmp.shape[0]==0 and tmp.shape[1]==0:
                tempimg[:,:,:,k] = imresample(tmp, (24, 24))
            else:
                return np.empty(0), np.empty(0)   # the original repo's bare np.empty() would raise here
        tempimg = (tempimg-127.5)*0.0078125 
        #print('temping:',tempimg.shape)  #(24, 24, 3, 45)
        tempimg1 = np.transpose(tempimg, (3,1,0,2))
        #print('before rnet:',tempimg1.shape)  #(45, 24, 24, 3)
        out = rnet(tempimg1)   # second stage: run RNet
        out0 = np.transpose(out[0])  #(4,45)
        out1 = np.transpose(out[1])  #(2,45)
        score = out1[1,:]
        ipass = np.where(score>threshold[1])
        #print("ipass:",ipass[0].shape)#(8,)
        total_boxes = np.hstack([total_boxes[ipass[0],0:4].copy(), np.expand_dims(score[ipass].copy(),1)])   #(8,5)
        mv = out0[:,ipass[0]]    # regression values predicted by rnet, used to recalibrate the boxes
        if total_boxes.shape[0]>0:
            pick = nms(total_boxes, 0.7, 'Union')
            total_boxes = total_boxes[pick,:]   
            #print('total_boxes:',total_boxes.shape) #(5,5)
            total_boxes = bbreg(total_boxes.copy(), np.transpose(mv[:,pick]))
            total_boxes = rerec(total_boxes.copy())

    #print("***********  third stage  **************")
    numbox = total_boxes.shape[0]  # 5
    if numbox>0:
        # third stage
        total_boxes = np.fix(total_boxes).astype(np.int32)
        dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(total_boxes.copy(), w, h)
        tempimg = np.zeros((48,48,3,numbox))
        for k in range(0,numbox):
            tmp = np.zeros((int(tmph[k]),int(tmpw[k]),3))
            tmp[dy[k]-1:edy[k],dx[k]-1:edx[k],:] = img[y[k]-1:ey[k],x[k]-1:ex[k],:]
            if tmp.shape[0]>0 and tmp.shape[1]>0 or tmp.shape[0]==0 and tmp.shape[1]==0:
                tempimg[:,:,:,k] = imresample(tmp, (48, 48))
            else:
                return np.empty(0), np.empty(0)   # same fix as in the second stage
        tempimg = (tempimg-127.5)*0.0078125   #(48, 48, 3, 5)
        tempimg1 = np.transpose(tempimg, (3,1,0,2))
        out = onet(tempimg1)   # third stage: run ONet
        out0 = np.transpose(out[0])   #(4,5)
        out1 = np.transpose(out[1])   #(10,5)
        out2 = np.transpose(out[2])   #(2,5)
        score = out2[1,:]
        points = out1
        ipass = np.where(score>threshold[2])   
        points = points[:,ipass[0]]
        #print("ipass:",ipass[0].shape)#(5,)
        total_boxes = np.hstack([total_boxes[ipass[0],0:4].copy(), np.expand_dims(score[ipass].copy(),1)])
        mv = out0[:,ipass[0]]

        w = total_boxes[:,2]-total_boxes[:,0]+1
        h = total_boxes[:,3]-total_boxes[:,1]+1
        points[0:5,:] = np.tile(w,(5, 1))*points[0:5,:] + np.tile(total_boxes[:,0],(5, 1))-1
        points[5:10,:] = np.tile(h,(5, 1))*points[5:10,:] + np.tile(total_boxes[:,1],(5, 1))-1
        if total_boxes.shape[0]>0:                       # the last stage calibrates first, then runs NMS with the 'Min' method
            total_boxes = bbreg(total_boxes.copy(), np.transpose(mv))
            pick = nms(total_boxes.copy(), 0.7, 'Min')
            total_boxes = total_boxes[pick,:]
            points = points[:,pick]
            #print('total_boxes:',total_boxes.shape) #(1,5)
                
    return total_boxes, points
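To make the scale-pyramid comment concrete, here is the first-stage pyramid pulled out as a standalone snippet; with min(h, w) = 250 and the defaults minsize=20, factor=0.709 it yields exactly the 8 scales mentioned above:

import numpy as np

def scale_pyramid(h, w, minsize=20, factor=0.709):
    m = 12.0 / minsize
    minl = np.amin([h, w]) * m
    scales, k = [], 0
    while minl >= 12:                 # stop once a face would shrink below 12x12
        scales.append(m * factor**k)
        minl *= factor
        k += 1
    return scales

print(len(scale_pyramid(250, 250)))   # 8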

generateBoundingBox(imap, reg, scale, t)

def generateBoundingBox(imap, reg, scale, t):
    # use heatmap to generate bounding boxes
    #print("generateBoundingBox-scale:",scale)
    stride=2
    cellsize=12

    imap = np.transpose(imap)
    dx1 = np.transpose(reg[:,:,0])
    dy1 = np.transpose(reg[:,:,1])
    dx2 = np.transpose(reg[:,:,2])
    dy2 = np.transpose(reg[:,:,3])
    y, x = np.where(imap >= t)   # indices along the two spatial dims where prob >= threshold
    #print("y:",y.shape)    # (H,)
    if y.shape[0]==1:            # only one block scored above the threshold
        dx1 = np.flipud(dx1)     # flip vertically
        dy1 = np.flipud(dy1)
        dx2 = np.flipud(dx2)
        dy2 = np.flipud(dy2)
    score = imap[(y,x)]          # probabilities of the candidate face blocks
    reg = np.transpose(np.vstack([ dx1[(y,x)], dy1[(y,x)], dx2[(y,x)], dy2[(y,x)] ]))
    #print("reg:",reg.shape) #(H,4)
    if reg.size==0:
        reg = np.empty((0,3))
    bb = np.transpose(np.vstack([y,x]))
    #print("bb:",bb.shape)  #(H,2)
    # map back to positions in the original image
    q1 = np.fix((stride*bb+1)/scale)  #np.fix():Round to nearest integer towards zero.
    q2 = np.fix((stride*bb+cellsize-1+1)/scale)
    boundingbox = np.hstack([q1, q2, np.expand_dims(score,1), reg])   #(H,2)+(H,2)+(H,1)+(H,4)
    #print("boundingbox:",boundingbox.shape)  #(H,9)
    #print(boundingbox[1,:])
    #array([  8.50000000e+01,   1.05000000e+02,   1.03000000e+02,  1.23000000e+02,   6.87459767e-01,  -1.91520154e-02, -7.52890855e-03,  -7.97504187e-03,   2.32149601e-01])
    return boundingbox, reg
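A tiny numeric check of the heatmap-to-image mapping above (stride 2 comes from PNet's single 2x2 max-pool, cellsize 12 from its receptive field; the cell coordinates are made up):

import numpy as np

stride, cellsize, scale = 2, 12, 0.6
bb = np.array([[10, 20]])                      # one heatmap cell at (y, x)
q1 = np.fix((stride*bb+1)/scale)               # top-left corner in the original image
q2 = np.fix((stride*bb+cellsize-1+1)/scale)    # bottom-right corner
print(q1, q2)                                  # [[35. 68.]] [[53. 86.]]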

nms(boxes, threshold, method)

def nms(boxes, threshold, method):
    if boxes.size==0:
        return np.empty((0,3))
    x1 = boxes[:,0]
    y1 = boxes[:,1]
    x2 = boxes[:,2]
    y2 = boxes[:,3]
    s = boxes[:,4]
    area = (x2-x1+1) * (y2-y1+1)
    I = np.argsort(s)   # indices that sort the scores in ascending order
    pick = np.zeros_like(s, dtype=np.int16)
    counter = 0
    while I.size>0:
        i = I[-1]
        pick[counter] = i
        counter += 1
        idx = I[0:-1]
        xx1 = np.maximum(x1[i], x1[idx])
        yy1 = np.maximum(y1[i], y1[idx])
        xx2 = np.minimum(x2[i], x2[idx])
        yy2 = np.minimum(y2[i], y2[idx])
        w = np.maximum(0.0, xx2-xx1+1)
        h = np.maximum(0.0, yy2-yy1+1)
        inter = w * h     # intersection area
        if method == 'Min':
            o = inter / np.minimum(area[i], area[idx])
        else:
            o = inter / (area[i] + area[idx] - inter)
        I = I[np.where(o<=threshold)]
    pick = pick[0:counter]   # indices of the boxes that survive
    return pick
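A toy run of nms to confirm the behavior; boxes are rows of [x1, y1, x2, y2, score] with made-up values:

import numpy as np

boxes = np.array([[ 10.,  10.,  50.,  50., 0.9],
                  [ 12.,  12.,  48.,  48., 0.8],    # heavily overlaps the first box
                  [100., 100., 140., 140., 0.7]])
print(nms(boxes, 0.5, 'Union'))   # [0 2]: the overlapping lower-score box is suppressed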

Once you have the face box coordinates and the landmark coordinates, you can crop the faces and draw the points; the full implementation is in align_dataset_mtcnn.py, and a cropping sketch follows below.
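For reference, the cropping in align_dataset_mtcnn.py boils down to something like this sketch, reusing img and bounding_boxes from the first sketch and the script's default --margin and --image_size (the repo uses scipy.misc for the resize):

import numpy as np
from scipy import misc

margin, image_size = 44, 182

det = np.squeeze(bounding_boxes[0, 0:4])              # first detected box: x1, y1, x2, y2
bb = np.zeros(4, dtype=np.int32)
bb[0] = np.maximum(det[0] - margin/2, 0)              # grow the box by the margin,
bb[1] = np.maximum(det[1] - margin/2, 0)              # clipping at the image border
bb[2] = np.minimum(det[2] + margin/2, img.shape[1])
bb[3] = np.minimum(det[3] + margin/2, img.shape[0])
cropped = img[bb[1]:bb[3], bb[0]:bb[2], :]
scaled = misc.imresize(cropped, (image_size, image_size), interp='bilinear')
misc.imsave('face_0.png', scaled)                     # placeholder output path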

Open Questions

What puzzled me most while reading was how the networks are trained and what the training samples look like; the answers are all in https://www.cnblogs.com/helloworld0604/p/9808795.html

Reposted from www.cnblogs.com/helloworld0604/p/9831725.html