姿态估计DeepLabCut

一个人走得快，一群人走得远……

论文：DeepLabCut: markerless pose estimation of user-defined body parts with deep learning

Github：https://github.com/AlexEMG/DeepLabCut

安装：

pip3 install opencv-python==3.4.5.20

pip3 install deeplabcut

pip3 install matplotlib==3.0.3

pip3 install tables==3.4.3

pip3 install tensorflow-gpu==1.13.1

pip3 install imgaug

pip3 install wxpython

pip3 install ruamel.yaml

网络结构：

基础网络可以是Resnet50，Resnet101，Resnet152，MobileNet_v2_1.0，MobileNet_v2_0.75，MobileNet_v2_0.5，MobileNet_v2_0.35。

后续接入了1个反卷积层，对模型进行了1次上采样。其中，基础骨架模型进行了4次下采样（16倍），反卷积进行了1次上采样（2倍），因此整个网络相当于进行了3次下采样，即输出特征图的宽高为输入的1/8。

这里正常的resnet，mobilev2都是进行32倍的下采样，为何这里只进行了16倍呢？

因为作者想要保证16倍的下采样，也就是需要下采样4次，因此修改了原始结构中的第4个stage，将原来的stride = 2换成了stride = 1。最终基础骨架只进行了16倍的下采样。

网络输入为彩色图片，维度为[batch_size, height, width, 3]，输出为2部分，预测特征图part_pred，维度为[batch_size, height/8, width/8, num_joints]，预测特征图的微调，或者说，偏移locref，包含x,y的偏移，所以得乘以2，维度为[batch_size, height/8, width/8, num_joints * 2]，其中，num_joints表示预测的点的个数，一个channel预测一个点。

损失函数：

因为网络最终有2个分支，所以这2个分支都需要进行loss的计算。特征图part_pred分支的损失函数是sigmoid_cross_entropy，偏移locref的损失函数是huber_loss或者MSE loss。

huber_loss，类似于smooth_L1，唯一的区别在于多了一个参数k。

def huber_loss(labels, predictions, weight=1.0, k=1.0, scope=None):
    """Compute the weighted Huber loss between `predictions` and `labels`.

    Per element, with x = predictions - labels:

        loss(x) = 0.5 * x^2              if |x| < k
                  k * |x| - 0.5 * k^2    otherwise

    See https://en.wikipedia.org/wiki/Huber_loss

    Args:
      labels: target tensor; its static shape is asserted to be compatible
        with that of `predictions`.
      predictions: predicted tensor.
      weight: weighting forwarded to `compute_weighted_loss`; must not be None.
      k: threshold between the quadratic and the linear branch.
      scope: optional name scope for the created ops.

    Returns:
      The value of `TF.losses.compute_weighted_loss` applied to the
      element-wise Huber losses.

    Raises:
      ValueError: if `weight` is None.

    Adapted from
    http://concise-bio.readthedocs.io/en/latest/_modules/concise/tf_helper.html
    """
    # The default scope name "absolute_difference" is kept as in the original.
    with ops.name_scope(scope, "absolute_difference", [predictions, labels]):
        predictions.get_shape().assert_is_compatible_with(labels.get_shape())
        if weight is None:
            raise ValueError("`weight` cannot be None")

        residual = math_ops.subtract(math_ops.to_float(predictions),
                                     math_ops.to_float(labels))
        magnitude = tf.abs(residual)

        quadratic_branch = 0.5 * tf.square(residual)
        linear_branch = k * magnitude - 0.5 * k ** 2
        per_element = tf.where(magnitude < k, quadratic_branch, linear_branch)

        return TF.losses.compute_weighted_loss(per_element, weight)

整体的loss为part_pred和locref的加权组合：
Total loss = part_pred loss + locref_loss_weight * locref loss

训练（train.py）：

import deeplabcut

import os
# Expose only GPU index 6 to the process.
os.environ["CUDA_VISIBLE_DEVICES"] = "6"


# Path to the project configuration file used by every step below.
config_path = r'DeepLabCut\www\config.yaml'


deeplabcut.launch_dlc()
# The project name must be a string; the original passed the bare, undefined
# name `www`, which raises NameError before the project is created.
deeplabcut.create_new_project('www', '59', [r'DeepLabCut\www.mp4'], working_directory=r'DeepLabCut', copy_videos=True)

deeplabcut.extract_frames(config_path, 'automatic', 'kmeans', crop=False)
deeplabcut.label_frames(config_path)
deeplabcut.check_labels(config_path)
deeplabcut.create_training_dataset(config_path, num_shuffles=1)
deeplabcut.train_network(config_path, shuffle=1)
deeplabcut.evaluate_network(config_path, [1], plotting=True)  # saves the result images
deeplabcut.analyze_videos(config_path, ['DeepLabCut/videos/'], videotype='.mp4', save_as_csv=True)  # saves csv and h5
deeplabcut.filterpredictions(config_path, ['DeepLabCut/videos/tou.mp4'], shuffle=1)  # saves csv and h5 after filtering
deeplabcut.create_labeled_video(config_path, ['DeepLabCut/videos/tou.mp4', 'DeepLabCut/videos/tou1.mp4'], filtered=True)  # saves the labeled video
deeplabcut.plot_trajectories(config_path, ['DeepLabCut/videos/tou.mp4'], filtered=True)  # produces the trajectory plots
deeplabcut.extract_outlier_frames(config_path, ['DeepLabCut/videos/tou.mp4'])
deeplabcut.refine_labels(config_path)

测试（test.py）：

import cv2
import functools
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim
import time

import sys
sys.path.append('DeepLabCut')
import mobilenet_v2, mobilenet, conv_blocks
from tensorflow.contrib.slim.nets import resnet_v1




# Choose the namespace that exposes the TF1-style API: for TensorFlow 1.x
# releases newer than 1.12 use `tf.compat.v1`, otherwise use `tf` directly.
vers = tf.__version__.split('.')
TF = tf.compat.v1 if (int(vers[0]) == 1 and int(vers[1]) > 12) else tf




def wrapper(func, *args, **kwargs):
    """Return `func` with `args`/`kwargs` pre-bound, keeping its metadata.

    The result is a `functools.partial` onto which the wrapped function's
    metadata (name, docstring, ...) has been copied via
    `functools.update_wrapper`.
    """
    bound = functools.partial(func, *args, **kwargs)
    # update_wrapper returns the object it decorated, i.e. `bound`.
    return functools.update_wrapper(bound, func)

# Registry of supported backbones, keyed by the `net_type` string.
# Each value is a 2-tuple: (network constructor, arg-scope entry).
#   * mobilenet entries store the callable `mobilenet_v2.training_scope`;
#     `PoseNet.get_net` calls it to obtain the arg scope.
#   * resnet entries store the already-evaluated result of
#     `resnet_v1.resnet_arg_scope()`.
# NOTE(review): `PoseNet.get_net` ignores the second element for resnet types
# and calls `resnet_v1.resnet_arg_scope()` itself, so the two families are
# not interchangeable through this tuple.
networks = {
    'mobilenet_v2_1.0': (
        mobilenet_v2.mobilenet_base,
        mobilenet_v2.training_scope,
    ),
    # Reduced-width variants: same base network with a smaller depth multiplier.
    'mobilenet_v2_0.75': (
        wrapper(mobilenet_v2.mobilenet_base,
                        depth_multiplier=0.75,
                        finegrain_classification_mode=True),
        mobilenet_v2.training_scope,
    ),
    'mobilenet_v2_0.5': (
        wrapper(mobilenet_v2.mobilenet_base,
                        depth_multiplier=0.5,
                        finegrain_classification_mode=True),
        mobilenet_v2.training_scope,
    ),
    'mobilenet_v2_0.35': (
        wrapper(mobilenet_v2.mobilenet_base,
                        depth_multiplier=0.35,
                        finegrain_classification_mode=True),
        mobilenet_v2.training_scope,
    ),
    'resnet_50': (resnet_v1.resnet_v1_50,resnet_v1.resnet_arg_scope()),
    'resnet_101': (resnet_v1.resnet_v1_101,resnet_v1.resnet_arg_scope()),
    'resnet_152': (resnet_v1.resnet_v1_152,resnet_v1.resnet_arg_scope())
}




class PoseNet(object):
    def __init__(self):
        """Set the inference configuration and build the TensorFlow session.

        All settings are hard-coded. Depending on `graph_interface`, the
        session is built either with the in-graph pose decoding
        (`setup_pose_prediction_graph`) or with the raw network heads
        (`setup_pose_prediction`). The drawing tables used by `draw_results`
        are defined last.
        """
        # Pre-processing / network configuration.
        self.scale = 0.4  # input resize factor (0.8 was an earlier setting)
        self.batch_size = 1
        self.location_refinement = True
        self.init_weights = r"/models/snapshot-360000"
        self.locref_stdev = 7.2801
        self.stride = 8.0
        self.net_type = 'resnet_50'  # alternative: 'mobilenet_v2_1.0'
        self.mean_pixel = [123.68, 116.779, 103.939]
        self.weight_decay = 0.0001
        self.num_joints = 17

        # True: decode the pose inside the TF graph; False: decode in numpy.
        self.graph_interface = True

        self.intermediate_supervision = False
        self.intermediate_supervision_layer = "12"

        if self.graph_interface:
            session_parts = self.setup_pose_prediction_graph()
        else:
            session_parts = self.setup_pose_prediction()
        self.sess, self.inputs, self.outputs = session_parts

        # One colour per joint; the first entries are also used for the limbs.
        self.POSE_COLORS = [
            (255, 0, 85), (255, 0, 0), (255, 85, 0), (255, 170, 0),
            (255, 255, 0), (170, 255, 0), (85, 255, 0), (0, 255, 0),
            (0, 255, 85), (0, 255, 170), (0, 255, 255), (0, 170, 255),
            (0, 85, 255), (0, 0, 255), (255, 0, 170), (170, 0, 255),
            (255, 0, 255),
        ]
        # Pairs of joint indices that are connected by a line when drawing.
        self.POSE_LINES = [
            (0, 1), (1, 2), (2, 3), (3, 4),
            (4, 5), (5, 6), (6, 7), (7, 8),
            (6, 9), (9, 10), (11, 12), (12, 13),
            (14, 15), (15, 16), (0, 11), (0, 14),
        ]


    #def __del__(self):
    #    self.sess.close()

    def process(self,image):
        if self.graph_interface == True:
            pose ,qulv, cov= self.infer_graph(image)
        else:
            pose ,qulv, cov= self.infer(image)
        return pose ,qulv, cov



    def infer(self,image):
        """Predict the pose for one BGR image, decoding the heads in numpy.

        The image is converted to RGB, mean-centred with `mean_pixel`,
        optionally resized by `scale`, and fed through the session. The score
        map and location-refinement outputs are decoded by
        `argmax_pose_predict`, and all columns but the last are mapped back to
        the original resolution by dividing by `scale`.

        Returns:
            (pose, qulv, cov): the pose array plus the two statistics from
            `compute_curvature_cov`, computed on pose rows 2..6.
        """
        rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        centered = rgb - np.ones_like(rgb) * self.mean_pixel
        if self.scale != 1:
            new_width = int(centered.shape[1] * self.scale)
            new_height = int(centered.shape[0] * self.scale)
            centered = cv2.resize(centered, (new_width, new_height),
                                  interpolation=cv2.INTER_AREA)
        # Add the batch axis: (batch, height, width, 3).
        image_batch = np.expand_dims(centered, axis=0).astype(float)

        outputs_np = self.sess.run(self.outputs, feed_dict={self.inputs: image_batch})

        scmap, locref = self.extract_cnn_output(outputs_np)
        print("image_batch",image_batch.shape,"scmap",scmap.shape,"locref",locref.shape)
        pose = self.argmax_pose_predict(scmap, locref)

        # Undo the input resize for every column except the trailing score.
        pose[:, :-1] = pose[:, :-1] / self.scale

        # NOTE(review): `infer_graph` uses rows 0:4 for the same statistic;
        # confirm which joint range is intended.
        curve_points = np.asarray(pose[2:7, :-1])
        qulv, cov = self.compute_curvature_cov(curve_points)

        return pose, qulv, cov


    def infer_graph(self,image):
        """Predict the pose for one BGR image with the in-graph decoder.

        Pre-processing matches `infer` (RGB conversion, mean-centring with
        `mean_pixel`, optional resize by `scale`). The session output is
        already the decoded pose array; all columns but the last are divided
        by `scale` to map them back to the original resolution.

        Returns:
            (pose, qulv, cov): the pose array plus the two statistics from
            `compute_curvature_cov`, computed on pose rows 0..3.
        """
        rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        centered = rgb - np.ones_like(rgb) * self.mean_pixel
        if self.scale != 1:
            new_width = int(centered.shape[1] * self.scale)
            new_height = int(centered.shape[0] * self.scale)
            centered = cv2.resize(centered, (new_width, new_height),
                                  interpolation=cv2.INTER_AREA)
        # Add the batch axis: (batch, height, width, 3).
        image_batch = np.expand_dims(centered, axis=0).astype(float)

        pose = self.sess.run(self.outputs, feed_dict={self.inputs: image_batch})

        # Undo the input resize for every column except the trailing score.
        pose[:, :-1] = pose[:, :-1] / self.scale

        # NOTE(review): `infer` uses rows 2:7 for the same statistic; confirm
        # which joint range is intended.
        curve_points = np.asarray(pose[0:4, :-1])
        qulv, cov = self.compute_curvature_cov(curve_points)

        return pose, qulv, cov


    def prediction_layer(self, input, name, num_outputs):
        """Build one prediction head: a stride-2 3x3 transposed convolution.

        Args:
            input: feature tensor the head is attached to.
            name: variable scope under which the layer is created.
            num_outputs: number of output channels.

        Returns:
            The output tensor of the transposed convolution (no activation,
            no normalizer, 'SAME' padding, L2 weight regularization with
            `self.weight_decay`).
        """
        l2_penalty = slim.l2_regularizer(self.weight_decay)
        with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
                            padding='SAME',
                            activation_fn=None,
                            normalizer_fn=None,
                            weights_regularizer=l2_penalty), \
                tf.variable_scope(name):
            return slim.conv2d_transpose(input, num_outputs,
                                         kernel_size=[3, 3], stride=2,
                                         scope='block4')



    def get_net(self, inputs):
        """Build the backbone and the prediction heads on top of it.

        Args:
            inputs: image batch tensor fed to the backbone.

        Returns:
            dict with 'part_pred' (sigmoid score maps), plus 'locref' when
            `location_refinement` is set and 'part_pred_interm' when
            `intermediate_supervision` is set.
        """
        net_fun, net_arg_scope = networks[self.net_type]

        if self.net_type in ('resnet_50', 'resnet_101', 'resnet_152'):
            # ResNets are built without global pooling at output stride 16,
            # in inference mode.
            with slim.arg_scope(resnet_v1.resnet_arg_scope()):
                net, end_points = net_fun(inputs,
                                          global_pool=False,
                                          output_stride=16,
                                          is_training=False)
        else:
            with slim.arg_scope(net_arg_scope()):
                net, end_points = net_fun(inputs)

        heads = {}
        with tf.variable_scope('pose', reuse=False):
            score_logits = self.prediction_layer(net, 'part_pred', self.num_joints)
            heads['part_pred'] = tf.sigmoid(score_logits)
            if self.location_refinement:
                # Two channels per joint.
                heads['locref'] = self.prediction_layer(
                    net, 'locref_pred', self.num_joints * 2)
            if self.intermediate_supervision:
                # Inspect end_points.keys() to see which layers are available.
                interm_features = end_points['layer_' + self.intermediate_supervision_layer]
                heads['part_pred_interm'] = self.prediction_layer(
                    interm_features, 'intermediate_supervision', self.num_joints)

        return heads



    def get_net_graph(self,inputs):
        ''' Direct TF inference on GPU. Added with: https://arxiv.org/abs/1909.11229

        Builds the network heads with `get_net` and appends graph ops that
        decode them into a single pose tensor: for every joint channel the
        position of the maximum score, refined by the location-refinement
        offset at that position, followed by the maximum score itself.

        Returns:
            dict with key 'pose': tensor with one row per joint holding two
            coordinates and the likelihood.

        NOTE(review): the batch axis is squeezed, so this presumably requires
        batch size 1; the coordinate/offset ordering should be verified
        against `argmax_pose_predict`.
        '''
        heads = self.get_net(inputs)

        with tf.variable_scope('post_process', reuse=False):
            # Swap axes 1 and 2 of both heads.
            locref= tf.transpose(heads['locref'],(0,2,1,3))
            probs = tf.transpose(heads['part_pred'],(0,2,1,3))

            # Drop the batch axis.
            probs = tf.squeeze(probs, axis=0)
            locref = tf.squeeze(locref, axis=0)
            l_shape = tf.shape(probs)

            # Flatten the two spatial axes; locref keeps 2 values per joint.
            locref = tf.reshape(locref, (l_shape[0]*l_shape[1], -1, 2))
            probs = tf.reshape(probs , (l_shape[0]*l_shape[1], -1))

            # Flat spatial index of the maximum score, per joint channel.
            maxloc = tf.argmax(probs, axis=0)

            
            # Manual unravel of the flat index into two rows:
            # row 0 = maxloc / l_shape[1] cast back to int64,
            # row 1 = maxloc - l_shape[1] * row 0.
            loc = tf.reshape(tf.concat([tf.cast(maxloc/tf.cast(l_shape[1], tf.int64), tf.int64),maxloc - tf.cast(l_shape[1], tf.int64)*(tf.cast(maxloc/tf.cast(l_shape[1], tf.int64), tf.int64))],0),(2,-1))
            #loc = tf.unravel_index(maxloc, (tf.cast(l_shape[0], tf.int64) ,tf.cast(l_shape[1], tf.int64)))

            maxloc = tf.reshape(maxloc, (1, -1))

            # (flat index, joint index) pairs used to gather each joint's offset.
            joints = tf.reshape(tf.range(0, tf.cast(l_shape[2], dtype=tf.int64)), (1,-1))
            indices = tf.transpose(tf.concat([maxloc,joints] , axis=0))

            offset = tf.gather_nd(locref, indices)
            # Swap the two offset components.
            offset = tf.gather(offset, [1,0], axis=1)
            #likelihood = tf.reshape(tf.gather_nd(probs, indices), (-1,1))
            likelihood = tf.reshape(tf.reduce_max(probs, axis=0), (-1,1))

            # Cell index -> cell centre (stride * idx + stride / 2), plus the
            # offset scaled by locref_stdev.
            pose = self.stride*tf.cast(tf.transpose(loc), dtype=tf.float32) + self.stride*0.5 + offset*self.locref_stdev
            pose = tf.concat([pose, likelihood], axis=1)
            #print("xxxxx",pose)
        return {'pose': pose}

    def setup_pose_prediction_graph(self):
        """Create the graph with in-graph pose decoding and restore weights.

        Resets the default graph, builds the network via `get_net_graph` on a
        float32 placeholder of shape [batch_size, None, None, 3], initializes
        all variables and restores them from `self.init_weights`. The saver is
        kept on `self.restorer`.

        Returns:
            (sess, inputs, outputs): the session, the input placeholder and
            the decoded pose tensor.
        """
        TF.reset_default_graph()
        # Height and width are left dynamic; a fixed size such as
        # [batch_size, 720*0.4, 1280*0.4, 3] was used previously.
        inputs = TF.placeholder(tf.float32, shape=[self.batch_size, None, None, 3])
        outputs = self.get_net_graph(inputs)['pose']

        self.restorer = TF.train.Saver()
        sess = TF.Session()
        for make_init_op in (TF.global_variables_initializer,
                             TF.local_variables_initializer):
            sess.run(make_init_op())

        # Overwrite the freshly initialized variables with the checkpoint.
        self.restorer.restore(sess, self.init_weights)

        return sess, inputs, outputs



    def setup_pose_prediction(self):
        """Create the graph exposing the raw network heads and restore weights.

        Resets the default graph, builds the network via `get_net` on a
        float32 placeholder of shape [batch_size, None, None, 3], initializes
        all variables and restores them from `self.init_weights`. The saver is
        kept on `self.restorer`.

        Returns:
            (sess, inputs, outputs): the session, the input placeholder and a
            list holding the 'part_pred' head, followed by the 'locref' head
            when `location_refinement` is set.
        """
        TF.reset_default_graph()
        inputs = TF.placeholder(tf.float32, shape=[self.batch_size, None, None, 3])
        net_heads = self.get_net(inputs)

        head_names = ['part_pred']
        if self.location_refinement:
            head_names.append('locref')
        outputs = [net_heads[head] for head in head_names]

        self.restorer = TF.train.Saver()
        sess = TF.Session()
        for make_init_op in (TF.global_variables_initializer,
                             TF.local_variables_initializer):
            sess.run(make_init_op())

        # Overwrite the freshly initialized variables with the checkpoint.
        self.restorer.restore(sess, self.init_weights)

        return sess, inputs, outputs




    def extract_cnn_output(self, outputs_np):
        ''' extract locref + scmap from network '''
        scmap = outputs_np[0]
        scmap = np.squeeze(scmap)
        locref = None
        if self.location_refinement:
            locref = np.squeeze(outputs_np[1])
            shape = locref.shape
            locref = np.reshape(locref, (shape[0], shape[1], -1, 2))
            locref *= self.locref_stdev
        if len(scmap.shape) == 2:  # for single body part!
            scmap = np.expand_dims(scmap, axis=2)
        return scmap, locref

    def argmax_pose_predict(self, scmap, offmat):
        """Combine scoremat and offsets to the final pose."""
        num_joints = scmap.shape[2]
        pose = []
        for joint_idx in range(num_joints):
            maxloc = np.unravel_index(np.argmax(scmap[:, :, joint_idx]),
                                      scmap[:, :, joint_idx].shape)
            offset = np.array(offmat[maxloc][joint_idx])[::-1]
            
            pos_f8 = (np.array(maxloc).astype('float') * self.stride + 0.5 * self.stride +
                      offset)
            pose.append(np.hstack((pos_f8[::-1],
                                   [scmap[maxloc][joint_idx]])))
        return np.array(pose)









    def draw_results(self,image,pose ,qulv, cov):
        """Draw the skeleton, the joints and the two statistics onto `image`.

        Lines are drawn for every joint pair in `POSE_LINES` (coloured by the
        pair's position in that list), a filled circle for every row of
        `pose` (coloured by joint index), and the text 'qulv:.. cov:..' at the
        position of joint 6.

        Returns:
            The image returned by `cv2.putText`.
        """
        def as_pixel(joint):
            # Integer pixel coordinates from the first two columns of a row.
            return (int(joint[0]), int(joint[1]))

        for line_idx, joint_pair in enumerate(self.POSE_LINES):
            start_px = as_pixel(pose[joint_pair[0]])
            end_px = as_pixel(pose[joint_pair[1]])
            cv2.line(image, start_px, end_px, self.POSE_COLORS[line_idx], 4, 0)

        for joint_idx, joint in enumerate(pose):
            cv2.circle(image, as_pixel(joint), 12, self.POSE_COLORS[joint_idx], -1, 0)

        label = 'qulv:{} cov:{}'.format(qulv,cov)
        image = cv2.putText(image, label, as_pixel(pose[6]),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.8, (128, 128, 0), 2)

        return image

    def compute_curvature_cov(self, input_numpy):
        """Return the mean curvature and the std/mean ratio of a polyline.

        The larger the curvature, the more strongly bent the curve:

                     |y''|
            k = ----------------
                (1 + y'^2)^(3/2)

        with the finite differences (h = 1)

            y'(i)  = (y(i+1) - y(i)) / h
            y''(i) = (y(i+1) + y(i-1) - 2*y(i)) / h^2

        Args:
            input_numpy: 2-D array whose column 0 holds x and column 1 holds y
                of consecutive points.

        Returns:
            (curvature, cov): the mean of k over the interpolated samples and
            the ratio std/mean of the interpolated y values, both rounded to
            three decimals.
        """
        # Linear interpolation: 10 samples are generated for every segment.
        # NOTE(review): the samples are taken at x_i + 0, 1, ..., 9 (unit
        # steps) regardless of segment length or direction; the original also
        # computed (x_{i+1} - x_i) / 10 but never used it. Confirm the
        # intended step.
        samples_per_segment = 10
        sampled_y = []
        for start, end in zip(input_numpy[:-1], input_numpy[1:]):
            slope = (end[1] - start[1]) / (end[0] - start[0])
            intercept = end[1] - slope * end[0]
            for step in range(samples_per_segment):
                sampled_y.append(slope * (start[0] + step) + intercept)

        ys = np.asarray(sampled_y)

        # Forward first difference, trimmed to align with the second difference.
        first_deriv = ((ys[1:] - ys[:-1]) / 1.0)[:-1]
        second_deriv = (ys[2:] + ys[:-2] - 2 * ys[1:-1]) / (1.0 * 1.0)
        curvature = np.abs(second_deriv) / np.power(
            np.ones_like(first_deriv) + first_deriv * first_deriv, 3.0 / 2.0)

        # Ratio of standard deviation to mean of the sampled y values.
        cov = np.std(ys) / np.mean(ys)

        return round(np.mean(curvature), 3), round(cov, 3)



def test_image(image_path=r"img068734.png"):
    """Run pose estimation on a single image and display the annotated result.

    Args:
        image_path: path of the image to process; defaults to the file used
            by the original demo.

    Raises:
        FileNotFoundError: if the image cannot be read.
    """
    posenet = PoseNet()

    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None rather than raising; fail
    # here with a clear message instead of deep inside the pre-processing.
    if image is None:
        raise FileNotFoundError("cannot read image: {}".format(image_path))

    pose, qulv, cov = posenet.process(image)
    print(pose)
    image = posenet.draw_results(image, pose, qulv, cov)

    cv2.imshow("aa", image)
    cv2.waitKey()


def test_video(video_path="../t.mp4"):
    """Run pose estimation on every frame of a video and display the result.

    Playback stops at the end of the video or when 'q' is pressed in the
    display window.

    Args:
        video_path: path of the video to process; defaults to the file used
            by the original demo.

    Raises:
        FileNotFoundError: if the video cannot be opened.
    """
    posenet = PoseNet()

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise FileNotFoundError("cannot open video: {}".format(video_path))

        while True:
            ret, image = cap.read()
            if not ret:
                break

            time_start = time.time()
            pose, qulv, cov = posenet.process(image)
            print('totally cost{} s'.format(time.time()-time_start))
            image = posenet.draw_results(image, pose, qulv, cov)
            cv2.imshow("video", image)

            # cv2.waitKey() sets how long each frame stays on screen: too low
            # and the video plays very fast, too high and it plays slowly
            # (about 25 ms is usually fine). Its return value is the pressed
            # key; the original compared the constant 0xFF with ord('q'),
            # which is never true, so 'q' could not stop playback.
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release the capture and close the windows even if a frame fails.
        cap.release()
        cv2.destroyAllWindows()

if __name__ == '__main__':
    # Script entry point: runs the video demo; the single-image demo is
    # left disabled.
    #test_image()
    test_video()

实验结果：

总结：

DeepLabCut是一个基于关键点的检测网络，可以用于人体、动物的姿势估计。

优点，需要的打标数据少，训练速度很快，效果惊人的好，支持任意多个点。

缺点，测试的时候，只能进行单个目标的检测，对于多个目标也可以检测出关键点，但是点和点之间的连接问题，就没法解决，这点不像openpose这种自底向上更好。当然训练的时候，也可以进行多个目标的关键点的打标，而且这样做还有助于提高训练精度。

watersink

发布了219 篇原创文章 · 获赞 898 · 访问量 140万+

他的留言板关注

猜你喜欢