Deep Learning — Implementing the model from "3D Fully Convolutional Network for Vehicle Detection in Point Cloud"

1. Reference

3D Fully Convolutional Network for Vehicle Detection in Point Cloud

2. Model Implementation

'''
Baidu Inc. 

Ref: 
3D Fully Convolutional Network for Vehicle Detection in Point Cloud

Author: HSW 
Date: 2018-05-02 
'''


import sys
import numpy as np 
import tensorflow as tf 
from prepare_data2 import * 
from baidu_cnn_3d import * 

KITTI_TRAIN_DATA_CNT = 7481
KITTI_TEST_DATA_CNT  = 7518


# create 3D-CNN Model
def create_graph(sess, modelType = 0, voxel_shape = (400, 400, 20),  activation=tf.nn.relu, is_train = True): 
	'''
	Inputs: 
		sess: tensorflow Session object 
		modelType: network variant passed to Full_CNN_3D_Model.cnn3d_graph (0 or 1) 
		voxel_shape: voxel shape of the network's input layer 
		activation: activation function used in the conv layers 
		is_train: True for training, False for inference 
	Outputs: 
		voxel placeholder, model object, phase_train placeholder (None when is_train is False) 
	'''
	voxel = tf.placeholder(tf.float32, [None, voxel_shape[0], voxel_shape[1], voxel_shape[2], 1])

	phase_train = tf.placeholder(tf.bool, name="phase_train") if is_train else None 
	
	with tf.variable_scope("3D_CNN_Model") as scope: 
		model = Full_CNN_3D_Model()
		
		model.cnn3d_graph(voxel, modelType = modelType, activation=activation, phase_train = is_train)
		
	if is_train: 
		initialized_var = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="3D_CNN_Model")
		sess.run(tf.variables_initializer(initialized_var))
	
	return voxel, model, phase_train


# read batch data 
def read_batch_data(batch_size, data_set_dir,objectType = "Car", split = "training", resolution=(0.2, 0.2, 0.2), scale=0.25, limitX = (0,80), limitY=(-40,40), limitZ=(-2.5,1.5)): 
	'''
	Inputs: 
		batch_size: 
		data_set_dir: 
		objectType: default is "Car"
		split: default is "training"
		resolution: 
		scale: outputSize / inputSize 
		limitX: 
		limitY: 
		limitZ: 
	Outputs: 
		a generator: for split "training" it yields (batch_voxel, batch_g_obj, batch_g_cord); 
		for split "testing" it yields batch_voxel only 
	'''
	kitti_3DVoxel = kitti_3DVoxel_interface(data_set_dir, objectType = objectType, split=split, scale = scale, resolution = resolution, limitX = limitX, limitY = limitY, limitZ = limitZ)
	
	TRAIN_PROCESSED_IDX  = 0
	TEST_PROCESSED_IDX   = 0
	
	if split == "training": 
		while TRAIN_PROCESSED_IDX < KITTI_TRAIN_DATA_CNT: 
			batch_voxel = []
			batch_g_obj = []
			batch_g_cord = []
			
			idx = 0 
			while idx < batch_size and TRAIN_PROCESSED_IDX < KITTI_TRAIN_DATA_CNT: 	
				
				print(TRAIN_PROCESSED_IDX)
				voxel, g_obj, g_cord = kitti_3DVoxel.read_kitti_data(TRAIN_PROCESSED_IDX)
				TRAIN_PROCESSED_IDX += 1
				
				if voxel is None:
					continue
				
				idx += 1 
				
				# print(voxel.shape)
				batch_voxel.append(voxel)
				batch_g_obj.append(g_obj)
				batch_g_cord.append(g_cord)
			
			yield np.array(batch_voxel, dtype=np.float32)[:, :, :, :, np.newaxis], np.array(batch_g_obj, dtype=np.float32), np.array(batch_g_cord, dtype=np.float32)
	
	elif split == "testing": 	
		while TEST_PROCESSED_IDX < KITTI_TEST_DATA_CNT: 
			batch_voxel = []
			
			idx = 0
			while idx < batch_size and TEST_PROCESSED_IDX < KITTI_TEST_DATA_CNT: 
			
				voxel = kitti_3DVoxel.read_kitti_data(TEST_PROCESSED_IDX)
				TEST_PROCESSED_IDX += 1
				
				if voxel is None: 
					continue
				
				idx += 1	
				batch_voxel.append(voxel)
			
			yield np.array(batch_voxel, dtype=np.float32)[:, :, :, :, np.newaxis]
		
	

# train 3D-CNN Model 
def train(batch_num, data_set_dir, modelType = 0, objectType = "Car", resolution=(0.2,0.2,0.2), scale = 0.25, lr=0.01, limitX=(0,80), limitY=(-40,40), limitZ=(-2.5,1.5), epoch=101): 
	'''
	Inputs: 
		batch_num: 
		data_set_dir: 
		modelType: 
		objectType: 
		resolution: 
		scale: 
		lr: 
		limitX, limitY, limitZ: 
	Outputs: 
		None
	'''
	batch_size      = batch_num
	training_epochs = epoch
	sizeX = int(round((limitX[1] - limitX[0]) / resolution[0]))
	sizeY = int(round((limitY[1] - limitY[0]) / resolution[1]))
	sizeZ = int(round((limitZ[1] - limitZ[0]) / resolution[2]))
	voxel_shape = (sizeX, sizeY, sizeZ)
	with tf.Session() as sess: 
		voxel, model, phase_train = create_graph(sess, modelType = modelType, voxel_shape = voxel_shape, activation=tf.nn.relu, is_train = True)
		saver = tf.train.Saver()
		
		total_loss, obj_loss, cord_loss, is_obj_loss, non_obj_loss, g_obj, g_cord, y_pred = model.loss_Fun(lossType = 0, cord_loss_weight = 0.02)
		
		optimizer = model.create_optimizer(total_loss, optType = "Adam", learnRate = lr)
		
		init = tf.global_variables_initializer()
		
		sess.run(init)
		
		for epoch in range(training_epochs): 
			batchCnt = 0; 
			for (batch_voxel, batch_g_obj, batch_g_cord) in read_batch_data(batch_size, data_set_dir, objectType = objectType, split = "training", resolution = resolution, scale = scale, limitX = limitX, limitY = limitY, limitZ = limitZ): 
				# print("batch_g_obj")
				# print(batch_g_obj.shape)
				sess.run(optimizer, feed_dict={voxel: batch_voxel, g_obj: batch_g_obj, g_cord: batch_g_cord, phase_train: True})
				cord_cost = sess.run(cord_loss, feed_dict={voxel: batch_voxel, g_obj: batch_g_obj, g_cord: batch_g_cord, phase_train: True})
				obj_cost = sess.run(is_obj_loss, feed_dict={voxel: batch_voxel, g_obj: batch_g_obj, g_cord: batch_g_cord, phase_train: True})
				non_obj_cost = sess.run(non_obj_loss, feed_dict={voxel: batch_voxel, g_obj: batch_g_obj, g_cord: batch_g_cord, phase_train: True})
				
				print("Epoch: ",  (epoch + 1), ",", "BatchNum: ", (batchCnt + 1), "," , "cord_cost = ", "{:.9f}".format(cord_cost))
				print("Epoch: ",  (epoch + 1), ",", "BatchNum: ", (batchCnt + 1), "," , "obj_cost = ", "{:.9f}".format(obj_cost))
				print("Epoch: ",  (epoch + 1), ",", "BatchNum: ", (batchCnt + 1), "," , "non_obj_cost = ", "{:.9f}".format(non_obj_cost))
				
				batchCnt += 1
				
			if (epoch > 0) and (epoch % 10 == 0): 
				saver.save(sess, "velodyne_kitti_train_" + str(epoch) + ".ckpt")
				
		print("Training Finishied !")


# test 3D-CNN Model
def test(batch_num, data_set_dir, modelType = 0, objectType = "Car", resolution=(0.2, 0.2, 0.2), scale = 0.25, limitX = (0, 80), limitY = (-40, 40), limitZ=(-2.5, 1.5)): 
	'''
	Inputs: 
		batch_num: 
		data_set_dir: 
		resolution: 
		scale:
		limitX, limitY, limitZ:  
	Outputs: 
		None 
	'''
	sizeX = int(round((limitX[1] - limitX[0]) / resolution[0]))
	sizeY = int(round((limitY[1] - limitY[0]) / resolution[1]))
	sizeZ = int(round((limitZ[1] - limitZ[0]) / resolution[2]))
	voxel_shape = (sizeX, sizeY, sizeZ)
	batch_size = batch_num; 
	
	# read_batch_data is a generator; take the first test batch (already shaped [batch, X, Y, Z, 1])
	batch_voxel_x = next(read_batch_data(batch_num, data_set_dir, objectType = objectType, split = "testing", resolution = resolution, scale = scale, limitX = limitX, limitY = limitY, limitZ = limitZ))
	
	with tf.Session() as sess: 
		is_train = False
		voxel, model, phase_train = create_graph(sess, modelType = modelType, voxel_shape = voxel_shape, activation=tf.nn.relu, is_train = False)
		saver = tf.train.Saver()
		last_model = "./velodyne_kitti_train_40.ckpt"
		saver.restore(sess, last_model)
		
		objectness = model.objectness
		cordinate  = model.cordinate
		y_pred     = model.y 
		
		objectness = sess.run(objectness, feed_dict={voxel: batch_voxel_x})[0, :, :, :, 0]
		cordinate  = sess.run(cordinate, feed_dict={voxel:batch_voxel_x})[0]
		y_pred     = sess.run(y_pred, feed_dict={voxel: batch_voxel_x})[0, :, :, :, 0]
		
		idx        = np.where(y_pred >= 0.995)
		
		spheres    = np.vstack((idx[0], np.vstack((idx[1], idx[2])))).transpose()
		
		centers    = sphere_to_center(spheres, scale = scale, resolution = resolution, limitX = limitX, limitY = limitY, limitZ = limitZ)
		
		corners    = cordinate[idx].reshape(-1, 8, 3) + centers[:, np.newaxis]
		
		print(centers)
		print(corners)
		

if __name__ == "__main__":
	
	batch_num       = 3
	data_set_dir    = "/home/hsw/桌面/PCL_API_Doc/frustum-pointnets-master/dataset"
	modelType       = 1
	objectType      = "Car"
	resolution      = (0.2, 0.2, 0.2)
	scale           = 0.25 
	lr              = 0.001
	limitX          = (0, 80)
	limitY          = (-40, 40)
	limitZ          = (-2.5, 1.5)       
	epoch           = 101 
	train(batch_num, data_set_dir = data_set_dir, modelType = modelType, objectType = objectType, resolution = resolution, scale = scale, lr = lr, limitX = limitX, limitY = limitY, limitZ = limitZ, epoch = epoch)
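	# Inference is not run here; a minimal sketch (to be run in a separate session,
	# assuming a checkpoint such as velodyne_kitti_train_40.ckpt was saved by train()):
	# test(batch_num, data_set_dir, modelType = modelType, objectType = objectType,
	#      resolution = resolution, scale = scale, limitX = limitX, limitY = limitY, limitZ = limitZ)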
	

				
2.1 Network Model
 
'''
Baidu Inc. 

Ref: 
3D Fully Convolutional Network for Vehicle Detection in Point Cloud

Author: HSW 
Date: 2018-05-02 
'''

import numpy as np 
import tensorflow as tf 

class Full_CNN_3D_Model(object): 
	'''
		Define Full CNN Model
	'''
	
	def __init__(self): 
		pass; 
		
	def cnn3d_graph(self, voxel, modelType = 0, activation = tf.nn.relu, phase_train = True): 
		if modelType == 0: 
			# Modified 3D-CNN: this structure is not usable in practice, because the 1/8 downsampling is too aggressive and causes large errors at prediction time
			self.layer1     = self.conv3d_layer(voxel      ,  1, 16, 5, 5, 5, [1, 2, 2, 2, 1], name="layer1", activation=activation, phase_train=phase_train)
						
			self.layer2     = self.conv3d_layer(self.layer1, 16, 32, 5, 5, 5, [1, 2, 2, 2, 1], name="layer2", activation=activation, phase_train=phase_train)
			
			self.layer3     = self.conv3d_layer(self.layer2, 32, 64, 3, 3, 3, [1, 2, 2, 2, 1], name="layer3", activation=activation, phase_train=phase_train)
			
			self.layer4     = self.conv3d_layer(self.layer3, 64, 64, 3, 3, 3, [1, 1, 1, 1, 1], name="layer4", activation=activation, phase_train=phase_train)
			
			self.objectness = self.conv3D_to_output(self.layer4, 64, 2, 3, 3, 3, [1, 1, 1, 1, 1], name="objectness", activation=None)
			
			self.cordinate  = self.conv3D_to_output(self.layer4, 64, 24, 3, 3, 3, [1, 1, 1, 1, 1], name="cordinate", activation=None)
			
			self.y          = tf.nn.softmax(self.objectness, dim=-1)
			
		elif modelType == 1: 
			# 3D-CNN (structure from the paper: 1/4 downsampling, i.e. outputSize / inputSize = 0.25)
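			# A quick size check (illustrative numbers, assuming the default 400 x 400 x 20
			# input voxel used elsewhere in this post): the three stride-2 convolutions below
			# reduce it to 50 x 50 x 3, and the stride-2 deconvolutions map the result back to
			# layer2's resolution of 100 x 100 x 5, so the objectness / cordinate outputs are
			# 1/4 of the input size (scale = 0.25).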
			self.layer1       = self.conv3d_layer(voxel      ,  1, 10, 5, 5, 5, [1, 2, 2, 2, 1], name="layer1", activation=activation, phase_train=phase_train)
			
			self.layer2       = self.conv3d_layer(self.layer1, 10, 20, 5, 5, 5, [1, 2, 2, 2, 1], name="layer2", activation=activation, phase_train=phase_train)
			
			self.layer3       = self.conv3d_layer(self.layer2, 20, 30, 3, 3, 3, [1, 2, 2, 2, 1], name="layer3", activation=activation, phase_train=phase_train)
			
			base_shape        = self.layer2.get_shape().as_list()
			
			obj_output_shape  = [tf.shape(self.layer3)[0], base_shape[1], base_shape[2], base_shape[3], 2]
			
			cord_output_shape = [tf.shape(self.layer3)[0], base_shape[1], base_shape[2], base_shape[3], 24]
			
			self.objectness   = self.deconv3D_to_output(self.layer3, 30, 2, 3, 3, 3, [1, 2, 2, 2, 1], obj_output_shape, name="objectness", activation=None)
			
			self.cordinate    = self.deconv3D_to_output(self.layer3, 30, 24, 3, 3, 3, [1, 2, 2, 2, 1], cord_output_shape, name="cordinate", activation=None)
			
			self.y            = tf.nn.softmax(self.objectness, dim=-1)
		
	# batch Normalize 
	def batch_norm(self, inputs, phase_train = True, decay = 0.9, eps = 1e-5): 
		'''
		Inputs: 
			inputs: input data for last layer 
			phase_train: True / False, = True is train, = False is Test 
		Outputs: 
			norm data for next layer 
		'''
		gamma = tf.get_variable("gamma", shape=inputs.get_shape()[-1], dtype=tf.float32, initializer=tf.constant_initializer(1.0))
		beta = tf.get_variable("beta", shape=inputs.get_shape()[-1], dtype=tf.float32, initializer=tf.constant_initializer(0.0))
		pop_mean = tf.get_variable("pop_mean", trainable=False, shape=inputs.get_shape()[-1], dtype=tf.float32, initializer=tf.constant_initializer(0.0))
		pop_var = tf.get_variable("pop_var", trainable=False, shape=inputs.get_shape()[-1], dtype=tf.float32, initializer=tf.constant_initializer(1.0))
		axes = range(len(inputs.get_shape()) - 1)
		if phase_train == True:
			batch_mean, batch_var = tf.nn.moments(inputs, axes = [0, 1, 2, 3])
			train_mean = tf.assign(pop_mean, pop_mean * decay + batch_mean*(1 - decay))
			train_var = tf.assign(pop_var, pop_var * decay + batch_var * (1 - decay))
			with tf.control_dependencies([train_mean, train_var]):
				return tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta, gamma, eps)
		else: 
			return tf.nn.batch_normalization(inputs, pop_mean, pop_var, beta, gamma, eps)
			
		
	# 3D Conv Layer 
	def conv3d_layer(self, inputs, inputs_dims, outputs_dims, height, width, length, stride, activation=tf.nn.relu, padding="SAME", name="", phase_train = True): 
		'''
		Inputs: 
			inputs: pre-Layer output 
			inputs_dims: pre-Layer output channels 
			outputs_dims: cur-Layer output channels 
			[length, height, width]: cur-Layer conv3d kernel size 
			stride: conv3d kernel move step in length/height/width axis
			activation: default use relu activation function 
			padding: conv3d 'padding' parameter 
		Outputs: 
			3D Conv. Layer outputs 	
		'''
		with tf.variable_scope("conv3D" + name): 
			# conv3d layer kernel 
			kernel = tf.get_variable("weights", shape=[length, height, width, inputs_dims, outputs_dims], dtype = tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.01))
			
			# conv3d layer bias 
			bias = tf.get_variable("bias", shape=[outputs_dims], dtype=tf.float32, initializer=tf.constant_initializer(0.0))
		
			# conv3d 
			conv = tf.nn.conv3d(inputs, kernel, stride, padding=padding)
			
			bias = tf.nn.bias_add(conv, bias)
			
			if activation:
				bias = activation(bias, name="activation")
				
			bias = self.batch_norm(bias, phase_train)
			
		return bias
	
		
		
	# 3D Conv to Classification Layer 
	def conv3D_to_output(self, inputs, inputs_dims, outputs_dims, height, width, length, stride, activation=tf.nn.relu, padding="SAME", name="", phase_train = True): 
		'''
		Inputs: 
			inputs: pre-Layer outputs 
			inputs_dims: pre-Layer output channels 
			outputs_dims: cur-Layer output channels 
			stride: conv3d kernel move step in length/height/width axis
			activation: default use relu activation function 
			padding: conv3d 'padding' parameter 
			outputs_shape: de-conv outputs shape  
		Outputs: 
			conv outputs 
		'''
		with tf.variable_scope("conv3D" + name):
			kernel = tf.get_variable("weights", shape=[length, height, width, inputs_dims, outputs_dims], dtype=tf.float32, initializer=tf.constant_initializer(0.01))
			conv = tf.nn.conv3d(inputs, kernel, stride, padding=padding)
		return conv 
		
		
	# 3D Deconv. to Classification Layer 
	def deconv3D_to_output(self, inputs, inputs_dims, outputs_dims, height, width, length, stride, output_shape, activation=tf.nn.relu, padding="SAME", name="", phase_train = True): 
		'''
		Inputs: 
			inputs: pre-Layer outputs 
			inputs_dims: pre-Layer output channels 
			outputs_dims: cur-Layer output channels 
			stride: conv3d kernel move step in length/height/width axis
			activation: default use relu activation function 
			padding: conv3d 'padding' parameter 
			outputs_shape: de-conv outputs shape  
		Outputs: 
			de-conv outputs 
		'''
		with tf.variable_scope("deconv3D"+name):
			kernel = tf.get_variable("weights", shape=[length, height, width, outputs_dims, inputs_dims], dtype=tf.float32, initializer=tf.constant_initializer(0.01))
			deconv = tf.nn.conv3d_transpose(inputs, kernel, output_shape, stride, padding="SAME")
		
		return deconv 
		
	# define loss 
	def loss_Fun(self, lossType = 0, cord_loss_weight = 0.02): 
		'''
		Inputs: 
			lossType: selects the loss formulation (only lossType == 0 is implemented) 
			cord_loss_weight: weight of the corner-regression term, default 0.02 
		Outputs: 
			total_loss, obj_loss, cord_loss, is_obj_loss, non_obj_loss, g_obj, g_cord, y 
		'''
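		# Loss structure implemented below (a readability note, mirroring the code):
		#   is_obj_loss  = -sum( g_obj       * log(y[..., 0] + eps) )
		#   non_obj_loss =  sum( (1 - g_obj) * log(y[..., 0] + eps) )
		#   obj_loss     = is_obj_loss + non_obj_loss
		#   cord_loss    = cord_loss_weight * sum( g_obj * ||cordinate - g_cord||^2 )
		#   total_loss   = obj_loss + cord_loss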
		if lossType == 0: 
			# print("g_obj")
			# print(self.cordinate.get_shape())
			g_obj = tf.placeholder(tf.float32, self.cordinate.get_shape().as_list()[:4])
			g_cord = tf.placeholder(tf.float32, self.cordinate.get_shape().as_list())
			non_g_obj = tf.subtract(tf.ones_like(g_obj, dtype=tf.float32), g_obj )
				
			epsilon = 0.00001
			y = self.y 
				
			is_obj_loss = -tf.reduce_sum(tf.multiply(g_obj, tf.log(y[:, :, :, :, 0] + epsilon)))        # object loss 
			non_obj_loss = tf.reduce_sum(tf.multiply(non_g_obj, tf.log(y[:, :, :, :, 0] + epsilon)))    # non-object loss 
			
			cross_entropy = tf.add(is_obj_loss, non_obj_loss)
			obj_loss = cross_entropy
				
			cord_diff = tf.multiply(g_obj , tf.reduce_sum(tf.square(tf.subtract(self.cordinate, g_cord)), 4)) # cord loss 
			cord_loss = tf.multiply(tf.reduce_sum(cord_diff), cord_loss_weight)
				
			return tf.add(obj_loss, cord_loss), obj_loss, cord_loss, is_obj_loss, non_obj_loss, g_obj, g_cord, y 
	
	# Create Optimizer 
	def create_optimizer(self, all_loss, optType = "Adam", learnRate = 0.001): 
		'''
		Inputs: 
			all_loss: graph all_loss 
			lr: learn rate 
		Outputs: 
			optimizer 
		'''
		if optType == "Adam": 
			opt = tf.train.AdamOptimizer(learnRate)
			optimizer = opt.minimize(all_loss)
		
		return optimizer

2.2 Data Preprocessing

'''Prepare KITTI data for 3D object detection
Ref: 3D Fully Convolutional Network for Vehicle Detection in Point Cloud

Author: Shiwen He 
Date: 28 April 2018 

'''

import numpy as np 
from kitti_object import kitti_object as kittiReader
import kitti_util   

# lidar data => 3D Grid Voxel 

# filter lidar data by camera FoV 
def filter_camera_fov(pc): 
	'''
	Inputs: 
		pc: n x 3 
	Outputs: 
		filter_pc: m x 3, m <= n 
	Notices: 
		FoV: from -45 degree to 45 degree  
	'''
	logic_fov = np.logical_and((pc[:, 1] < pc[:, 0] - 0.27), (-pc[:, 1] < pc[:, 0] - 0.27))

	filter_pc = pc[logic_fov]
	
	return filter_pc 
	

# filter lidar data by detection range 
def filter_lidar_range(pc, limitX, limitY, limitZ):
	''' 
	Inputs: 
		pc: n x 3, 
		limitX, limitY, limitZ: 1 x 2
	Outputs: 
		filter_pc: m x 3, m <= n  
	'''
	logic_x = np.logical_and(pc[:, 0] >= limitX[0], pc[:, 0] < limitX[1])
	logic_y = np.logical_and(pc[:, 1] >= limitY[0], pc[:, 1] < limitY[1])
	logic_z = np.logical_and(pc[:, 2] >= limitZ[0], pc[:, 2] < limitZ[1])
	logic_xyz = np.logical_and(logic_x, np.logical_and(logic_y, logic_z))
	
	filter_pc = pc[:, :3][logic_xyz]
	
	return  filter_pc
	
	
# filter center + corners 
def filter_center_corners(centers, corners, boxsizes, limitX, limitY, limitZ): 
	'''
	Inputs: 
		centers: n x 3 
		corners: n x 8 x 3 
		boxsizes: n x 3 
		limitX, limitY, limitZ: 1 x 2
	Outputs: 
		filter_centers: m x 3, m <= n 
		filter_corners: m x 8 x 3, m <= n 
		filter_boxsizes: m x 3, m <= n  
	'''
	
	logic_x = np.logical_and(centers[:, 0] >= limitX[0], centers[:, 0] < limitX[1])
	logic_y = np.logical_and(centers[:, 1] >= limitY[0], centers[:, 1] < limitY[1])
	logic_z = np.logical_and(centers[:, 2] >= limitZ[0], centers[:, 2] < limitZ[1])
	logic_xyz = np.logical_and(logic_x, np.logical_and(logic_y, logic_z))
	
	filter_centers_1 = centers[logic_xyz, :]
	filter_corners_1 = corners[logic_xyz, :, :]
	filter_boxsizes_1 = boxsizes[logic_xyz, :]
	
	shape_centers = filter_centers_1.shape; 
	
	filter_centers = np.zeros([shape_centers[0], 3])
	filter_corners = np.zeros([shape_centers[0], 8, 3]); 
	filter_boxsizes = np.zeros([shape_centers[0], 3]); 
	
	idx = 0
	for idx2 in range(shape_centers[0]): 
		logic_x = np.logical_and(filter_corners_1[idx2, :, 0] >= limitX[0], filter_corners_1[idx2, :, 0] < limitX[1])
		logic_y = np.logical_and(filter_corners_1[idx2, :, 1] >= limitY[0], filter_corners_1[idx2, :, 1] < limitY[1])
		logic_z = np.logical_and(filter_corners_1[idx2, :, 2] >= limitZ[0], filter_corners_1[idx2, :, 2] < limitZ[1])
		logic_xyz = np.logical_and(logic_x, np.logical_and(logic_y, logic_z))

		if logic_xyz.all(): 
			filter_centers[idx, :3] = filter_centers_1[idx2, :]
			filter_corners[idx, :8, :3] = filter_corners_1[idx2, :, :] 
			filter_boxsizes[idx, :3] = filter_boxsizes_1[idx2, :]
			idx += 1		
	if idx > 0:
		return filter_centers[:idx, :], filter_corners[:idx, :, :], filter_boxsizes[:idx, :]
	else:
		return None, None, None

def filter_label(object3Ds, objectType = 'Car'): 
	'''
	Inputs: 
		object3Ds:
		objectType:  
	Outputs: 
		centers: m x 3, boxsizes (h, w, l): m x 3, rotate_y: m 
		
	'''
	idx = 0
	data = np.zeros([50, 7]).astype(np.float32)
	for iter in object3Ds: 
		if iter.type == "DontCare": 
			continue;
					
		if iter.type == objectType: 
			# position 
			data[idx, 0] = iter.t[0]
			data[idx, 1] = iter.t[1]
			data[idx, 2] = iter.t[2]
			# size 
			data[idx, 3] = iter.h
			data[idx, 4] = iter.w
			data[idx, 5] = iter.l 
			# rotate 
			data[idx, 6] = iter.ry
			idx += 1         
	if idx > 0:
		return data[:idx, :3], data[:idx, 3:6], data[:idx, 6]
	else:
		return None, None, None
		
		
def proj_to_velo(calib_data):
	"""
	Inputs: 
		calib_data: 
	Outputs: 
		projection matrix from the rectified camera coordinate system to the velodyne coordinate system
	"""
 
	rect            = calib_data.R0; # calib_data["R0_rect"].reshape(3, 3)
	velo_to_cam     = calib_data.V2C; # calib_data["Tr_velo_to_cam"].reshape(3, 4)
	inv_rect        = np.linalg.inv(rect)
	inv_velo_to_cam = np.linalg.pinv(velo_to_cam[:, :3])
	
	return np.dot(inv_velo_to_cam, inv_rect)


# corners_3d
def compute_3d_corners(centers, sizes, rotates): 
	''' 
	Inputs: 
			centers: 
			rotates: 
			sizes: 
	Outputs: 
			corners_3d:  n x 8 x 3 array in Lidar coord.
	'''
	
	# print(centers) 
	
	corners = []
	
	for place, rotate, sz in zip(centers, rotates, sizes):
		x, y, z = place
		h, w, l = sz
		if l > 10:
			continue

		corner = np.array([
			[x - l / 2., y - w / 2., z],
			[x + l / 2., y - w / 2., z],
			[x - l / 2., y + w / 2., z],
			[x - l / 2., y - w / 2., z + h],
			[x - l / 2., y + w / 2., z + h],
			[x + l / 2., y + w / 2., z],
			[x + l / 2., y - w / 2., z + h],
			[x + l / 2., y + w / 2., z + h],
		])

		corner -= np.array([x, y, z])

		rotate_matrix = np.array([
			[np.cos(rotate), -np.sin(rotate), 0],
			[np.sin(rotate), np.cos(rotate), 0],
			[0, 0, 1]
		])

		a = np.dot(corner, rotate_matrix.transpose())
		a += np.array([x, y, z])
		corners.append(a)
		
	corners_3d  = np.array(corners)   
	
	return corners_3d
	
# lidar data to 3D Grid Voxel 
def lidar_to_binary_voxel(pc, resolution, limitX, limitY, limitZ):
	''' 
	Inputs: 
		pc: n x 3, 
		resolution: 1 x 3, 
		limitX, limitY, limitZ: 1 x 2
	Outputs: 
		voxel: shape is inputSize  
	'''
	
	voxel_pc = np.zeros_like(pc).astype(np.int32)
	 
	# Compute PointCloud Position in 3D Grid 
	voxel_pc[:, 0] = ((pc[:, 0] - limitX[0]) / resolution[0]).astype(np.int32)
	voxel_pc[:, 1] = ((pc[:, 1] - limitY[0]) / resolution[1]).astype(np.int32)
	voxel_pc[:, 2] = ((pc[:, 2] - limitZ[0]) / resolution[2]).astype(np.int32)
	
	# 3D Grid 
	voxel = np.zeros((int(round((limitX[1] - limitX[0]) / resolution[0])), int(round((limitY[1] - limitY[0]) / resolution[1])), \
	 int(round((limitZ[1] - limitZ[0]) / resolution[2])))) 
	
	# 3D Grid Value 
	voxel[voxel_pc[:, 0], voxel_pc[:, 1], voxel_pc[:, 2]] = 1
	
	return voxel 
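
# Example (illustrative, using this file's default arguments): with limitX = (0, 80),
# limitY = (-40, 40), limitZ = (-2.5, 1.5) and resolution = (0.2, 0.2, 0.2), the binary
# voxel grid is 400 x 400 x 20; with scale = 0.25 the corresponding network output maps
# (obj_map / cord_map) are 100 x 100 x 5.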


# label center to 3D Grid Voxel Center(sphere) 
def center_to_sphere(centers, boxsize, scale, resolution, limitX, limitY, limitZ):
	''' 
	Inputs: 
		center:  n x 3 
		boxsize: n x 3
		scale: 1 x 1, = outputSize / inputSize
		resolution: 1 x 3
		limitX, limitY, limitZ: 1 x 2
	Outputs: 
		spheres: m x 3, m <= n  
	'''
	
	# from 3D Box's bottom center => 3D center 
	move_center = centers.copy(); 	

	print("centers")
	print(centers)
	print("boxsize")
	print(boxsize)
	
	move_center[:, 2] = centers[:, 2] + boxsize[:, 0] / 2;  

	
	# compute Label Center PointCloud Position in 3D Grid 
	spheres = np.zeros_like(move_center).astype(np.int32)
	
	spheres[:, 0] = ((move_center[:, 0] - limitX[0]) / resolution[0] * scale).astype(np.int32)
	spheres[:, 1] = ((move_center[:, 1] - limitY[0]) / resolution[1] * scale).astype(np.int32)
	spheres[:, 2] = ((move_center[:, 2] - limitZ[0]) / resolution[2] * scale).astype(np.int32)
	

	print("move_center")
	print(move_center)
	print("spheres")
	print(spheres)
	
	return spheres
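
# Example (illustrative numbers): a label with bottom-center (20.0, -5.0, -1.0) and height
# h = 1.5 becomes the 3D box center (20.0, -5.0, -0.25); with resolution = 0.2, scale = 0.25
# and the default limits it maps to the output-grid index
# (int(20.0 / 0.2 * 0.25), int(35.0 / 0.2 * 0.25), int(2.25 / 0.2 * 0.25)) = (25, 43, 2).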


# 3D Grid Voxel Center(sphere) to label center 
def sphere_to_center(spheres, scale, resolution, limitX, limitY, limitZ): 
	'''
	Inputs: 
		spheres: n x 3 
		scale: 1 x 1, = outputSize /  inputSize 
		resolution: 1 x 3
		limitX, limitY, limitZ: 1 x 2 
	Outputs: 
		centers: n x 3 
	'''
	centers = np.zeros_like(spheres).astype(np.float32); 
	
	centers[:, 0] = spheres[:, 0] * resolution[0] / scale + limitX[0]
	centers[:, 1] = spheres[:, 1] * resolution[1] / scale + limitY[0]
	centers[:, 2] = spheres[:, 2] * resolution[2] / scale + limitZ[0]
	
	return centers

# label corners to 3D Grid Voxel: corners - centers   
def corners_to_train(spheres, corners, scale, resolution, limitX, limitY, limitZ):
	'''
	Inputs: 
		spheres: n x 3
		corners: n x 8 x 3 
		scale: 1 x 1, = outputSize / inputSize
		resolution: 1 x 3
		limitX, limitY, limitZ: 1 x 2 
	Outputs: 
		train_corners: n x 8 x 3, corner offsets relative to each recovered center 
	'''
	
	# 3D Grid Voxel Center => label center 
	centers = sphere_to_center(spheres, scale, resolution, limitX, limitY, limitZ)
	train_corners = np.zeros_like(corners).astype(np.float32)
	
	# train corners for regression loss 
	for index, (corner, center) in enumerate(zip(corners, centers)):
		train_corners[index] = corner - center  
	
	return train_corners
	
		
	
	
# create center and cordination for train 
def create_train_label(centers, corners, boxsize, scale, resolution, limitX, limitY, limitZ):
	'''
	Inputs: 
		centers: n x 3 
		corners: n x 8 x 3 
		boxsize: n x 3 
		scale: 1 x 1, outputSize / inputSize
		resolution: 1 x 3 
		limitX. limitY, limitZ: 1 x 2 
	Outputs: 
		train_centers: n x 3, voxel indices in the output grid 
		train_corners: n x 8 x 3, corner offsets relative to each center 
	'''
	
	train_centers = center_to_sphere(centers, boxsize, scale, resolution, limitX, limitY, limitZ)
	train_corners = corners_to_train(train_centers, corners, scale, resolution, limitX, limitY, limitZ)
	
	return train_centers, train_corners 
	

def create_obj_map(train_centers, scale, resolution, limitX, limitY, limitZ):
	'''
	Inputs: 
		centers: n x 3 
		scale: 1 x 1, outputSize / inputSize
		resolution: 1 x 3
		limitX, limitY, limitZ: 1 x 2
	Outputs: 
		obj_map: shape is scale * inputSize 
	'''
	
	# 3D Grid 
	sizeX = int(round((limitX[1] - limitX[0]) / resolution[0] * scale))
	sizeY = int(round((limitY[1] - limitY[0]) / resolution[1] * scale))
	sizeZ = int(round((limitZ[1] - limitZ[0]) / resolution[2] * scale))
	obj_map = np.zeros([sizeX, sizeY, sizeZ]) 
	
	# print("sizeX, sizeY, sizeZ")
	# print(sizeX, sizeY, sizeZ)
	
	# objectness map: label center in objectness map where value is 1  
	obj_map[train_centers[:,0], train_centers[:, 1], train_centers[:, 2]] = 1; 
	
	return obj_map  


def create_cord_map(train_centers, train_corners, scale, resolution, limitX, limitY, limitZ):
	'''
	Inputs: 
		train_centers: n x 3 
		train_corners: n x 8 x 3 
		scale: 1 x 1, outputSize / inputSize
		resolution: 1 x 3 
		limitX, limitY, limitZ: 1 x 2
	Outputs: 
		cord_map: shape is inputSize * scale   
	''' 
	# reshape train_corners:  n x 8 x 3 => n x 24 
	corners = train_corners.reshape(train_corners.shape[0], -1) 
	# 3D Grid 
	sizeX = int(round((limitX[1] - limitX[0]) / resolution[0] * scale))
	sizeY = int(round((limitY[1] - limitY[0]) / resolution[1] * scale))
	sizeZ = int(round((limitZ[1] - limitZ[0]) / resolution[2] * scale))
	sizeD = 24
	
	cord_map = np.zeros([sizeX, sizeY, sizeZ, sizeD]) 
	
	# print(train_centers)
	
	cord_map[train_centers[:,0], train_centers[:, 1], train_centers[:, 2]] = corners
	
	return cord_map 
	

# kitti data interface:  
class kitti_3DVoxel_interface(object): 
	
	def __init__(self, root_dir, objectType = 'Car', split='training', scale = 0.25, resolution = (0.2, 0.2, 0.2), limitX = (0, 80), limitY = (-40, 40), limitZ = (-2.5, 1.5)):
		'''
		Inputs: 
			case 1, root_dir is the training / validation data dir, with file structure: 
				root_dir->training->velodyne
				root_dir->training->calib
				root_dir->training->label_2 
			case 2, root_dir is the testing data dir, with file structure: 
				root_dir->testing->velodyne
				root_dir->testing->calib 
		Outputs: 
				-None 
		'''
		self.root_dir   = root_dir
		self.split      = split
		self.object     = kittiReader(self.root_dir, self.split)
		self.objectType = objectType
		self.scale      = scale
		self.resolution = resolution    
		self.limitX     = limitX
		self.limitY     = limitY
		self.limitZ     = limitZ

	def read_kitti_data(self, idx = 0): 
		'''
		Inputs:
			idx: training or testing sample index
		Outputs:
			voxel    : inputSize
			obj_map  : scale * inputSize
			cord_map : scale * inputSize
		'''
	
		kitti_Object3Ds = None
		kitti_Lidar     = None 
		kitti_Calib     = None
	
		
		if self.split == 'training':
			# read Lidar data + Lidar Label + Calib data 
			kitti_Object3Ds = self.object.get_label_objects(idx);  
			kitti_Lidar     = self.object.get_lidar(idx);  
			kitti_Calib     = self.object.get_calibration(idx); 
			
			# lidar data filter 
			filter_fov = filter_camera_fov(kitti_Lidar) 
			filter_range = filter_lidar_range(filter_fov, self.limitX, self.limitY, self.limitZ)
		
			# label filter 
			centers, boxsizes, rotates = filter_label(kitti_Object3Ds, self.objectType)
			
		
			if centers is None:
				return None, None, None 
		
			# label center: Notice from camera Coordination to velo. Coordination  	
			if not(kitti_Calib is None): 
				proj_velo = proj_to_velo(kitti_Calib)[:, :3]
				centers = np.dot(centers, proj_velo.transpose())[:, :3] 
				
			
			# label corners: 
			corners = compute_3d_corners(centers, boxsizes, rotates)
	
			
			# print(corners)
			# print(corners.shape)
		
			# filter centers + corners 
			filter_centers, filter_corners, boxsizes = filter_center_corners(centers, corners, boxsizes, self.limitX, self.limitY, self.limitZ)
		
			# print(filter_centers)
			# print(filter_corners)
		
			if not(filter_centers is None): 
				# training center
				train_centers, train_corners = create_train_label(filter_centers, filter_corners, boxsizes, self.scale, self.resolution, self.limitX, self.limitY, self.limitZ)
				# print("filter_centers")
				# print(filter_centers)
				
				# print("train_centers")
				# print(train_centers)
				
				# obj_map / cord_map / voxel 
				obj_map = create_obj_map(train_centers, self.scale, self.resolution, self.limitX, self.limitY, self.limitZ)
				cord_map = create_cord_map(train_centers, train_corners, self.scale, self.resolution, self.limitX, self.limitY, self.limitZ)
				voxel = lidar_to_binary_voxel(filter_range, self.resolution, self.limitX, self.limitY, self.limitZ)
		
				return voxel, obj_map, cord_map
			else: 
				return None, None, None 
		elif self.split == 'testing':
			# read Lidar Data + Calib + Data 
			kitti_Lidar = self.object.get_lidar(idx);  
			kitti_Calib = self.object.get_calibration(idx); 
			
			# lidar data filter 
			filter_fov = filter_camera_fov(kitti_Lidar) 
			filter_range = filter_lidar_range(filter_fov, self.limitX, self.limitY, self.limitZ)
			
			voxel = lidar_to_binary_voxel(filter_range, self.resolution, self.limitX, self.limitY, self.limitZ)
			
			return voxel

				

if __name__ == '__main__':
	
	data_dir = "/home/hsw/桌面/PCL_API_Doc/frustum-pointnets-master/dataset"
	
	kitti_3DVoxel = kitti_3DVoxel_interface(data_dir, objectType = 'Car', split='training', scale = 0.25, resolution = (0.2, 0.2, 0.2), limitX = (0, 80), limitY = (-40, 40), limitZ = (-2.5, 1.5))
	
	sampleIdx = 195; 
	voxel, obj_map, cord_map = kitti_3DVoxel.read_kitti_data(sampleIdx)
	
	if not(voxel is None): 
		print(voxel.shape)
		print(obj_map.shape) 
		print(cord_map.shape)
	
	

2.3 KITTI Data Loading Helpers

''' Helper class and functions for loading KITTI objects

Author: Charles R. Qi
Date: September 2017
'''
from __future__ import print_function

import os
import sys
import numpy as np
import cv2
from PIL import Image
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
ROOT_DIR = os.path.dirname(BASE_DIR)
sys.path.append(os.path.join(ROOT_DIR, 'mayavi'))
import kitti_util as utils

try:
    raw_input          # Python 2
except NameError:
    raw_input = input  # Python 3

# 3D static data 
class kitti_object(object):
    '''Load and parse object data into a usable format.'''
    
    def __init__(self, root_dir, split='training'):
        '''root_dir contains training and testing folders'''
        self.root_dir = root_dir
        self.split = split
        self.split_dir = os.path.join(root_dir, split)

        if split == 'training':
            self.num_samples = 7481
        elif split == 'testing':
            self.num_samples = 7518
        else:
            print('Unknown split: %s' % (split))
            exit(-1)
		
		# data dir 
        self.image_dir = os.path.join(self.split_dir, 'image_2')
        self.calib_dir = os.path.join(self.split_dir, 'calib')
        self.lidar_dir = os.path.join(self.split_dir, 'velodyne')
        self.label_dir = os.path.join(self.split_dir, 'label_2')

    def __len__(self):
        return self.num_samples
	# read image: return image  
    def get_image(self, idx):
        assert(idx<self.num_samples) 
        img_filename = os.path.join(self.image_dir, '%06d.png'%(idx))
        return utils.load_image(img_filename)
        
	# read lidar: return n x 4 
    def get_lidar(self, idx): 
        assert(idx<self.num_samples) 
        lidar_filename = os.path.join(self.lidar_dir, '%06d.bin'%(idx))
        return utils.load_velo_scan(lidar_filename)
        
	# read calib file:  
    def get_calibration(self, idx):
        assert(idx<self.num_samples) 
        calib_filename = os.path.join(self.calib_dir, '%06d.txt'%(idx))
        return utils.Calibration(calib_filename)
        
    # read label 
    def get_label_objects(self, idx):
        assert(idx<self.num_samples and self.split=='training') 
        label_filename = os.path.join(self.label_dir, '%06d.txt'%(idx))
        return utils.read_label(label_filename)
        
    # read depth map     
    def get_depth_map(self, idx):
        pass
        
	# read top_down image 
    def get_top_down(self, idx):
        pass

class kitti_object_video(object):
    ''' Load data for KITTI videos '''
    def __init__(self, img_dir, lidar_dir, calib_dir):
        self.calib = utils.Calibration(calib_dir, from_video=True)
        self.img_dir = img_dir
        self.lidar_dir = lidar_dir
        self.img_filenames = sorted([os.path.join(img_dir, filename) \
            for filename in os.listdir(img_dir)])
        self.lidar_filenames = sorted([os.path.join(lidar_dir, filename) \
            for filename in os.listdir(lidar_dir)])
        print(len(self.img_filenames))
        print(len(self.lidar_filenames))
        #assert(len(self.img_filenames) == len(self.lidar_filenames))
        self.num_samples = len(self.img_filenames)

    def __len__(self):
        return self.num_samples

    def get_image(self, idx):
        assert(idx<self.num_samples) 
        img_filename = self.img_filenames[idx]
        return utils.load_image(img_filename)

    def get_lidar(self, idx): 
        assert(idx<self.num_samples) 
        lidar_filename = self.lidar_filenames[idx]
        return utils.load_velo_scan(lidar_filename)

    def get_calibration(self, unused):
        return self.calib

def viz_kitti_video():
    video_path = os.path.join(ROOT_DIR, 'dataset/2011_09_26/')
    dataset = kitti_object_video(\
        os.path.join(video_path, '2011_09_26_drive_0023_sync/image_02/data'),
        os.path.join(video_path, '2011_09_26_drive_0023_sync/velodyne_points/data'),
        video_path)
    print(len(dataset))
    for i in range(len(dataset)):
        img = dataset.get_image(0)
        pc = dataset.get_lidar(0)
        Image.fromarray(img).show()
        draw_lidar(pc)
        raw_input()
        pc[:,0:3] = dataset.get_calibration().project_velo_to_rect(pc[:,0:3])
        draw_lidar(pc)
        raw_input()
    return

def show_image_with_boxes(img, objects, calib, show3d=True):
    ''' Show image with 2D bounding boxes '''
    img1 = np.copy(img) # for 2d bbox
    img2 = np.copy(img) # for 3d bbox
    for obj in objects:
        if obj.type=='DontCare':continue
        cv2.rectangle(img1, (int(obj.xmin),int(obj.ymin)),
            (int(obj.xmax),int(obj.ymax)), (0,255,0), 2)
        box3d_pts_2d, box3d_pts_3d = utils.compute_box_3d(obj, calib.P)
        img2 = utils.draw_projected_box3d(img2, box3d_pts_2d)
    Image.fromarray(img1).show()
    if show3d:
        Image.fromarray(img2).show()

def get_lidar_in_image_fov(pc_velo, calib, xmin, ymin, xmax, ymax,
                           return_more=False, clip_distance=2.0):
    ''' Filter lidar points, keep those in image FOV '''
    pts_2d = calib.project_velo_to_image(pc_velo)
    fov_inds = (pts_2d[:,0]<xmax) & (pts_2d[:,0]>=xmin) & \
        (pts_2d[:,1]<ymax) & (pts_2d[:,1]>=ymin)
    fov_inds = fov_inds & (pc_velo[:,0]>clip_distance)
    imgfov_pc_velo = pc_velo[fov_inds,:]
    if return_more:
        return imgfov_pc_velo, pts_2d, fov_inds
    else:
        return imgfov_pc_velo

def show_lidar_with_boxes(pc_velo, objects, calib,
                          img_fov=False, img_width=None, img_height=None): 
    ''' Show all LiDAR points.
        Draw 3d box in LiDAR point cloud (in velo coord system) '''
    if 'mlab' not in sys.modules: import mayavi.mlab as mlab
    from viz_util import draw_lidar_simple, draw_lidar, draw_gt_boxes3d

    print(('All point num: ', pc_velo.shape[0]))
    fig = mlab.figure(figure=None, bgcolor=(0,0,0),
        fgcolor=None, engine=None, size=(1000, 500))
    if img_fov:
        pc_velo = get_lidar_in_image_fov(pc_velo, calib, 0, 0,
            img_width, img_height)
        print(('FOV point num: ', pc_velo.shape[0]))
    draw_lidar(pc_velo, fig=fig)

    for obj in objects:
        if obj.type=='DontCare':continue
        # Draw 3d bounding box
        box3d_pts_2d, box3d_pts_3d = utils.compute_box_3d(obj, calib.P) 
        box3d_pts_3d_velo = calib.project_rect_to_velo(box3d_pts_3d)
        # Draw heading arrow
        ori3d_pts_2d, ori3d_pts_3d = utils.compute_orientation_3d(obj, calib.P)
        ori3d_pts_3d_velo = calib.project_rect_to_velo(ori3d_pts_3d)
        x1,y1,z1 = ori3d_pts_3d_velo[0,:]
        x2,y2,z2 = ori3d_pts_3d_velo[1,:]
        draw_gt_boxes3d([box3d_pts_3d_velo], fig=fig)
        mlab.plot3d([x1, x2], [y1, y2], [z1,z2], color=(0.5,0.5,0.5),
            tube_radius=None, line_width=1, figure=fig)
    mlab.show(1)

def show_lidar_on_image(pc_velo, img, calib, img_width, img_height):
    ''' Project LiDAR points to image '''
    imgfov_pc_velo, pts_2d, fov_inds = get_lidar_in_image_fov(pc_velo,
        calib, 0, 0, img_width, img_height, True)
    imgfov_pts_2d = pts_2d[fov_inds,:]
    imgfov_pc_rect = calib.project_velo_to_rect(imgfov_pc_velo)

    import matplotlib.pyplot as plt
    cmap = plt.cm.get_cmap('hsv', 256)
    cmap = np.array([cmap(i) for i in range(256)])[:,:3]*255

    for i in range(imgfov_pts_2d.shape[0]):
        depth = imgfov_pc_rect[i,2]
        color = cmap[int(640.0/depth),:]
        cv2.circle(img, (int(np.round(imgfov_pts_2d[i,0])),
            int(np.round(imgfov_pts_2d[i,1]))),
            2, color=tuple(color), thickness=-1)
    Image.fromarray(img).show() 
    return img

def dataset_viz():
    dataset = kitti_object(os.path.join(ROOT_DIR, 'dataset/KITTI/object'))

    for data_idx in range(len(dataset)):
        # Load data from dataset
        objects = dataset.get_label_objects(data_idx)
        objects[0].print_object()
        img = dataset.get_image(data_idx)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 
        img_height, img_width, img_channel = img.shape
        print(('Image shape: ', img.shape))
        pc_velo = dataset.get_lidar(data_idx)[:,0:3]
        calib = dataset.get_calibration(data_idx)

        # Draw 2d and 3d boxes on image
        show_image_with_boxes(img, objects, calib, False)
        raw_input()
        # Show all LiDAR points. Draw 3d box in LiDAR point cloud
        show_lidar_with_boxes(pc_velo, objects, calib, True, img_width, img_height)
        raw_input()

if __name__=='__main__':
    import mayavi.mlab as mlab
    from viz_util import draw_lidar_simple, draw_lidar, draw_gt_boxes3d
    dataset_viz()


""" Helper methods for loading and parsing KITTI data.

Author: Charles R. Qi
Date: September 2017
"""
from __future__ import print_function

import numpy as np
import cv2
import os

class Object3d(object):
    ''' 3d object label '''
    def __init__(self, label_file_line):
        data = label_file_line.split(' ')
        data[1:] = [float(x) for x in data[1:]]

        # extract label, truncation, occlusion
        self.type = data[0] # 'Car', 'Pedestrian', ...
        self.truncation = data[1] # truncated pixel ratio [0..1]
        self.occlusion = int(data[2]) # 0=visible, 1=partly occluded, 2=fully occluded, 3=unknown
        self.alpha = data[3] # object observation angle [-pi..pi]

        # extract 2d bounding box in 0-based coordinates
        self.xmin = data[4] # left
        self.ymin = data[5] # top
        self.xmax = data[6] # right
        self.ymax = data[7] # bottom
        self.box2d = np.array([self.xmin,self.ymin,self.xmax,self.ymax])
        
        # extract 3d bounding box information
        self.h = data[8] # box height
        self.w = data[9] # box width
        self.l = data[10] # box length (in meters)
        self.t = (data[11],data[12],data[13]) # location (x,y,z) in camera coord.
        self.ry = data[14] # yaw angle (around Y-axis in camera coordinates) [-pi..pi]

    def print_object(self):
        print('Type, truncation, occlusion, alpha: %s, %d, %d, %f' % \
            (self.type, self.truncation, self.occlusion, self.alpha))
        print('2d bbox (x0,y0,x1,y1): %f, %f, %f, %f' % \
            (self.xmin, self.ymin, self.xmax, self.ymax))
        print('3d bbox h,w,l: %f, %f, %f' % \
            (self.h, self.w, self.l))
        print('3d bbox location, ry: (%f, %f, %f), %f' % \
            (self.t[0],self.t[1],self.t[2],self.ry))


class Calibration(object):
    ''' Calibration matrices and utils
        3d XYZ in <label>.txt are in rect camera coord.
        2d box xy are in image2 coord
        Points in <lidar>.bin are in Velodyne coord.

        y_image2 = P^2_rect * x_rect
        y_image2 = P^2_rect * R0_rect * Tr_velo_to_cam * x_velo
        x_ref = Tr_velo_to_cam * x_velo
        x_rect = R0_rect * x_ref

        P^2_rect = [f^2_u,  0,      c^2_u,  -f^2_u b^2_x;
                    0,      f^2_v,  c^2_v,  -f^2_v b^2_y;
                    0,      0,      1,      0]
                 = K * [1|t]

        image2 coord:
         ----> x-axis (u)
        |
        |
        v y-axis (v)

        velodyne coord:
        front x, left y, up z

        rect/ref camera coord:
        right x, down y, front z

        Ref (KITTI paper): http://www.cvlibs.net/publications/Geiger2013IJRR.pdf

        TODO(rqi): do matrix multiplication only once for each projection.
    '''
    def __init__(self, calib_filepath, from_video=False):
        if from_video:
            calibs = self.read_calib_from_video(calib_filepath)
        else:
            calibs = self.read_calib_file(calib_filepath)
        # Projection matrix from rect camera coord to image2 coord
        self.P = calibs['P2'] 
        self.P = np.reshape(self.P, [3,4])
        # Rigid transform from Velodyne coord to reference camera coord
        self.V2C = calibs['Tr_velo_to_cam']
        self.V2C = np.reshape(self.V2C, [3,4])
        self.C2V = inverse_rigid_trans(self.V2C)
        # Rotation from reference camera coord to rect camera coord
        self.R0 = calibs['R0_rect']
        self.R0 = np.reshape(self.R0,[3,3])

        # Camera intrinsics and extrinsics
        self.c_u = self.P[0,2]
        self.c_v = self.P[1,2]
        self.f_u = self.P[0,0]
        self.f_v = self.P[1,1]
        self.b_x = self.P[0,3]/(-self.f_u) # relative 
        self.b_y = self.P[1,3]/(-self.f_v)

    def read_calib_file(self, filepath):
        ''' Read in a calibration file and parse into a dictionary.
        Ref: https://github.com/utiasSTARS/pykitti/blob/master/pykitti/utils.py
        '''
        data = {}
        with open(filepath, 'r') as f:
            for line in f.readlines():
                line = line.rstrip()
                if len(line)==0: continue
                key, value = line.split(':', 1)
                # The only non-float values in these files are dates, which
                # we don't care about anyway
                try:
                    data[key] = np.array([float(x) for x in value.split()])
                except ValueError:
                    pass

        return data
    
    def read_calib_from_video(self, calib_root_dir):
        ''' Read calibration for camera 2 from video calib files.
            there are calib_cam_to_cam and calib_velo_to_cam under the calib_root_dir
        '''
        data = {}
        cam2cam = self.read_calib_file(os.path.join(calib_root_dir, 'calib_cam_to_cam.txt'))
        velo2cam = self.read_calib_file(os.path.join(calib_root_dir, 'calib_velo_to_cam.txt'))
        Tr_velo_to_cam = np.zeros((3,4))
        Tr_velo_to_cam[0:3,0:3] = np.reshape(velo2cam['R'], [3,3])
        Tr_velo_to_cam[:,3] = velo2cam['T']
        data['Tr_velo_to_cam'] = np.reshape(Tr_velo_to_cam, [12])
        data['R0_rect'] = cam2cam['R_rect_00']
        data['P2'] = cam2cam['P_rect_02']
        return data

    def cart2hom(self, pts_3d):
        ''' Input: nx3 points in Cartesian
            Oupput: nx4 points in Homogeneous by pending 1
        '''
        n = pts_3d.shape[0]
        pts_3d_hom = np.hstack((pts_3d, np.ones((n,1))))
        return pts_3d_hom
 
    # =========================== 
    # ------- 3d to 3d ---------- 
    # =========================== 
    def project_velo_to_ref(self, pts_3d_velo):
        pts_3d_velo = self.cart2hom(pts_3d_velo) # nx4
        return np.dot(pts_3d_velo, np.transpose(self.V2C))

    def project_ref_to_velo(self, pts_3d_ref):
        pts_3d_ref = self.cart2hom(pts_3d_ref) # nx4
        return np.dot(pts_3d_ref, np.transpose(self.C2V))

    def project_rect_to_ref(self, pts_3d_rect):
        ''' Input and Output are nx3 points '''
        return np.transpose(np.dot(np.linalg.inv(self.R0), np.transpose(pts_3d_rect)))
    
    def project_ref_to_rect(self, pts_3d_ref):
        ''' Input and Output are nx3 points '''
        return np.transpose(np.dot(self.R0, np.transpose(pts_3d_ref)))
 
    def project_rect_to_velo(self, pts_3d_rect):
        ''' Input: nx3 points in rect camera coord.
            Output: nx3 points in velodyne coord.
        ''' 
        pts_3d_ref = self.project_rect_to_ref(pts_3d_rect)
        return self.project_ref_to_velo(pts_3d_ref)

    def project_velo_to_rect(self, pts_3d_velo):
        pts_3d_ref = self.project_velo_to_ref(pts_3d_velo)
        return self.project_ref_to_rect(pts_3d_ref)

    # =========================== 
    # ------- 3d to 2d ---------- 
    # =========================== 
    def project_rect_to_image(self, pts_3d_rect):
        ''' Input: nx3 points in rect camera coord.
            Output: nx2 points in image2 coord.
        '''
        pts_3d_rect = self.cart2hom(pts_3d_rect)
        pts_2d = np.dot(pts_3d_rect, np.transpose(self.P)) # nx3
        pts_2d[:,0] /= pts_2d[:,2]
        pts_2d[:,1] /= pts_2d[:,2]
        return pts_2d[:,0:2]
    
    def project_velo_to_image(self, pts_3d_velo):
        ''' Input: nx3 points in velodyne coord.
            Output: nx2 points in image2 coord.
        '''
        pts_3d_rect = self.project_velo_to_rect(pts_3d_velo)
        return self.project_rect_to_image(pts_3d_rect)

    # =========================== 
    # ------- 2d to 3d ---------- 
    # =========================== 
    def project_image_to_rect(self, uv_depth):
        ''' Input: nx3 first two channels are uv, 3rd channel
                   is depth in rect camera coord.
            Output: nx3 points in rect camera coord.
        '''
        n = uv_depth.shape[0]
        x = ((uv_depth[:,0]-self.c_u)*uv_depth[:,2])/self.f_u + self.b_x
        y = ((uv_depth[:,1]-self.c_v)*uv_depth[:,2])/self.f_v + self.b_y
        pts_3d_rect = np.zeros((n,3))
        pts_3d_rect[:,0] = x
        pts_3d_rect[:,1] = y
        pts_3d_rect[:,2] = uv_depth[:,2]
        return pts_3d_rect

    def project_image_to_velo(self, uv_depth):
        pts_3d_rect = self.project_image_to_rect(uv_depth)
        return self.project_rect_to_velo(pts_3d_rect)

 
def rotx(t):
    ''' 3D Rotation about the x-axis. '''
    c = np.cos(t)
    s = np.sin(t)
    return np.array([[1,  0,  0],
                     [0,  c, -s],
                     [0,  s,  c]])


def roty(t):
    ''' Rotation about the y-axis. '''
    c = np.cos(t)
    s = np.sin(t)
    return np.array([[c,  0,  s],
                     [0,  1,  0],
                     [-s, 0,  c]])


def rotz(t):
    ''' Rotation about the z-axis. '''
    c = np.cos(t)
    s = np.sin(t)
    return np.array([[c, -s,  0],
                     [s,  c,  0],
                     [0,  0,  1]])


def transform_from_rot_trans(R, t):
    ''' Transforation matrix from rotation matrix and translation vector. '''
    R = R.reshape(3, 3)
    t = t.reshape(3, 1)
    return np.vstack((np.hstack([R, t]), [0, 0, 0, 1]))


def inverse_rigid_trans(Tr):
    ''' Inverse a rigid body transform matrix (3x4 as [R|t])
        [R'|-R't; 0|1]
    '''
    inv_Tr = np.zeros_like(Tr) # 3x4
    inv_Tr[0:3,0:3] = np.transpose(Tr[0:3,0:3])
    inv_Tr[0:3,3] = np.dot(-np.transpose(Tr[0:3,0:3]), Tr[0:3,3])
    return inv_Tr

def read_label(label_filename):
    lines = [line.rstrip() for line in open(label_filename)]
    objects = [Object3d(line) for line in lines]
    return objects

def load_image(img_filename):
    return cv2.imread(img_filename)

def load_velo_scan(velo_filename):
    scan = np.fromfile(velo_filename, dtype=np.float32)
    scan = scan.reshape((-1, 4))
    return scan

def project_to_image(pts_3d, P):
    ''' Project 3d points to image plane.

    Usage: pts_2d = projectToImage(pts_3d, P)
      input: pts_3d: nx3 matrix
             P:      3x4 projection matrix
      output: pts_2d: nx2 matrix

      P(3x4) dot pts_3d_extended(4xn) = projected_pts_2d(3xn)
      => normalize projected_pts_2d(2xn)

      <=> pts_3d_extended(nx4) dot P'(4x3) = projected_pts_2d(nx3)
          => normalize projected_pts_2d(nx2)
    '''
    n = pts_3d.shape[0]
    pts_3d_extend = np.hstack((pts_3d, np.ones((n,1))))
    print(('pts_3d_extend shape: ', pts_3d_extend.shape))
    pts_2d = np.dot(pts_3d_extend, np.transpose(P)) # nx3
    pts_2d[:,0] /= pts_2d[:,2]
    pts_2d[:,1] /= pts_2d[:,2]
    return pts_2d[:,0:2]
    

# corners_2d + corners_3d
def compute_box_3d(obj, P):
    ''' Takes an object and a projection matrix (P) and projects the 3d
        bounding box into the image plane.
        Returns:
            corners_2d: (8,2) array in left image coord.
            corners_3d: (8,3) array in in rect camera coord.
    '''
    # compute rotational matrix around yaw axis
    R = roty(obj.ry)    

    # 3d bounding box dimensions
    l = obj.l;
    w = obj.w;
    h = obj.h;
    
    # 3d bounding box corners
    x_corners = [l/2,l/2,-l/2,-l/2,l/2,l/2,-l/2,-l/2];
    y_corners = [0,0,0,0,-h,-h,-h,-h];
    z_corners = [w/2,-w/2,-w/2,w/2,w/2,-w/2,-w/2,w/2];
    
    # rotate and translate 3d bounding box
    corners_3d = np.dot(R, np.vstack([x_corners,y_corners,z_corners]))
    #print corners_3d.shape
    corners_3d[0,:] = corners_3d[0,:] + obj.t[0];
    corners_3d[1,:] = corners_3d[1,:] + obj.t[1];
    corners_3d[2,:] = corners_3d[2,:] + obj.t[2];
    #print 'cornsers_3d: ', corners_3d 
    # only draw 3d bounding box for objs in front of the camera
    if np.any(corners_3d[2,:]<0.1):
        corners_2d = None
        return corners_2d, np.transpose(corners_3d)
    
    # project the 3d bounding box into the image plane
    corners_2d = project_to_image(np.transpose(corners_3d), P);
    #print 'corners_2d: ', corners_2d
    return corners_2d, np.transpose(corners_3d)


def compute_orientation_3d(obj, P):
    ''' Takes an object and a projection matrix (P) and projects the 3d
        object orientation vector into the image plane.
        Returns:
            orientation_2d: (2,2) array in left image coord.
            orientation_3d: (2,3) array in in rect camera coord.
    '''
    
    # compute rotational matrix around yaw axis
    R = roty(obj.ry)
   
    # orientation in object coordinate system
    orientation_3d = np.array([[0.0, obj.l],[0,0],[0,0]])
    
    # rotate and translate in camera coordinate system, project in image
    orientation_3d = np.dot(R, orientation_3d)
    orientation_3d[0,:] = orientation_3d[0,:] + obj.t[0]
    orientation_3d[1,:] = orientation_3d[1,:] + obj.t[1]
    orientation_3d[2,:] = orientation_3d[2,:] + obj.t[2]
    
    # vector behind image plane?
    if np.any(orientation_3d[2,:]<0.1):
      orientation_2d = None
      return orientation_2d, np.transpose(orientation_3d)
    
    # project orientation into the image plane
    orientation_2d = project_to_image(np.transpose(orientation_3d), P);
    return orientation_2d, np.transpose(orientation_3d)

def draw_projected_box3d(image, qs, color=(255,255,255), thickness=2):
    ''' Draw 3d bounding box in image
        qs: (8,3) array of vertices for the 3d box in following order:
            1 -------- 0
           /|         /|
          2 -------- 3 .
          | |        | |
          . 5 -------- 4
          |/         |/
          6 -------- 7
    '''
    qs = qs.astype(np.int32)
    for k in range(0,4):
       # Ref: http://docs.enthought.com/mayavi/mayavi/auto/mlab_helper_functions.html
       i,j=k,(k+1)%4
       # use LINE_AA for opencv3
       cv2.line(image, (qs[i,0],qs[i,1]), (qs[j,0],qs[j,1]), color, thickness, cv2.CV_AA)

       i,j=k+4,(k+1)%4 + 4
       cv2.line(image, (qs[i,0],qs[i,1]), (qs[j,0],qs[j,1]), color, thickness, cv2.CV_AA)

       i,j=k,k+4
       cv2.line(image, (qs[i,0],qs[i,1]), (qs[j,0],qs[j,1]), color, thickness, cv2.CV_AA)
    return image

3. The code has passed testing and is still training; since my hardware is fairly weak, training is slow.


Reposted from blog.csdn.net/hit1524468/article/details/80187246