在caffe 中添加Crowd counting 数据层

#目录
[TOC]

一、简介

Crowd counting一般以人群图像作为输入，网络回归该图像对应的密度图。以往做法是先在matlab中根据图像的label（人头位置）生成密度图，然后将输入图像及密度图转化为lmdb格式作为caffe datalayer的输入。这样做步骤繁琐，而且lmdb很占存储空间。接下来我们在caffe中添加一个crowd data layer，直接读入图像，以及密度图的生成均由该层完成。
人群图像及其密度图

二、创建CrowdData Layer

1. caffe.proto

在ImageDataParameter中添加：

  // For crowd_data_layer: the number of pooling layer(stride: 2)
  optional uint32 downsamp_times = 17 [default =  2];
  // For crowd_data_layer:base standard deviation of gaussian kernel for density map
  optional float base_sigma = 18 [default = 1.5];

2.crowd_data_layer.hpp

//  Create on: 2016/9/19 ShanghaiTech
//  Author:    Yingying Zhang

#ifndef CAFFE_CROWD_DATA_LAYER_HPP_
#define CAFFE_CROWD_DATA_LAYER_HPP_

#include <string>
#include <utility>
#include <vector>
#include "caffe/blob.hpp"
#include "caffe/data_transformer.hpp"
#include "caffe/internal_thread.hpp"
#include "caffe/layer.hpp"
#include "caffe/layers/base_data_layer.hpp"
#include "caffe/proto/caffe.pb.h"


namespace caffe {

/**
 * @brief Provides data and density map to the Net.
 *  top[0]: crowd image
 *  top[1]: density map
 */

using namespace cv;

template <typename Dtype>
struct HeadLocation {
  Dtype x;
  Dtype y;
};

template <typename Dtype>
class CrowdDataLayer : public BasePrefetchingDataLayer<Dtype> {
 public:
  explicit CrowdDataLayer(const LayerParameter& param)
      : BasePrefetchingDataLayer<Dtype>(param) {}
  virtual ~CrowdDataLayer();
  virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  virtual inline const char* type() const { return "CrowdData"; }
  virtual inline int ExactNumBottomBlobs() const { return 0; }
  virtual inline int ExactNumTopBlobs() const { return 2; }

 protected:
  shared_ptr<Caffe::RNG> prefetch_rng_;
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top);
  virtual void ReadLocationFromTextFile(const std::string filename);
  virtual void ShuffleData();
  virtual void load_batch(Batch<Dtype>* batch);
  virtual void Transform(const cv::Mat &cv_img, Blob<Dtype> *transformed_blob,
          const bool do_mirror, const int offset);
  virtual unsigned int PrefetchRand();
  virtual Mat ReadCrowdImageToCVMat(const string& filename,  const bool is_color);
  virtual vector<int> GetImageBlobShape(const Mat& cv_img, const bool is_color);
  virtual vector<int> GetDmapBlobShape(const Mat& cv_img, const int ds_times);
  virtual Mat fspecial(const int size, const float sigma);
  virtual void GetDensityMap(Mat& cv_dmap, const vector<HeadLocation<int> >& head_location,
          const float sigma);


  int lines_id_;
  int downsamp_times_;
  Dtype transform_scale_;
  Dtype base_sigma_;
  vector<std::pair<std::string, std::string> > lines_;
  vector<HeadLocation<int> > gt_loc_;
  vector<Dtype> mean_values_;

};


}  // namespace caffe

#endif  // CAFFE_CROWD_DATA_LAYER_HPP_

3.crowd_data_layer.cpp



//  Create on: 2016/9/19 ShanghaiTech
//  Author:    Yingying Zhang

#ifdef USE_OPENCV
#include <fstream>  
#include <iostream>  
#include <string>
#include <utility>
#include <vector>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include "caffe/data_transformer.hpp"
#include "caffe/layers/base_data_layer.hpp"
#include "caffe/layers/crowd_data_layer.hpp"
#include "caffe/util/benchmark.hpp"
#include "caffe/util/io.hpp"
#include "caffe/util/math_functions.hpp"
#include "caffe/util/rng.hpp"

namespace caffe {

using namespace cv;

template <typename Dtype>
CrowdDataLayer<Dtype>::~CrowdDataLayer<Dtype>() {
  this->StopInternalThread();
}

//Read ground truth head location of crowd images
template <typename Dtype>
void CrowdDataLayer<Dtype>::ReadLocationFromTextFile(const std::string filename) {
  DLOG(INFO) << "Opening file " << filename;
  std::ifstream infile(filename.c_str());
  CHECK(infile.good()) << "Failed to open file " << filename;
  int num_crowd;
  CHECK(infile >> num_crowd);
  CHECK_GE(num_crowd, 0) << "Number of crowd must be positive!";
  gt_loc_.clear();
  gt_loc_.resize(num_crowd);
  for (int i = 0; i < num_crowd; i++) {
    HeadLocation<int> location;
    CHECK(infile >> location.x >> location.y);
    CHECK(0 <= location.x && 0 <= location.y);
    for (int j = 0; j < downsamp_times_; ++j) {
        location.x = location.x / 2;
        location.y = location.y / 2;
    }
    gt_loc_[i] = location;
  }
  infile.close();
}

// Read crowd image
template <typename Dtype>
Mat CrowdDataLayer<Dtype>::ReadCrowdImageToCVMat(const string& filename,
        const bool is_color) {
  int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR :
    CV_LOAD_IMAGE_GRAYSCALE);
  Mat cv_img = imread(filename, cv_read_flag);
  if (!cv_img.data) {
    LOG(ERROR) << "Could not open or find file " << filename;
  }
  return cv_img;
}

// Compute initial shape for crowd image
template <typename Dtype>
vector<int> CrowdDataLayer<Dtype>::GetImageBlobShape(const Mat& cv_img,
        const bool is_color) {
  const int channels = is_color ? 3 : 1;
  const int height = cv_img.rows;
  const int width = cv_img.cols;
  // Check dimensions.
  CHECK_GT(height, 0);
  CHECK_GT(width, 0);
  // Build BlobShape.
  vector<int> shape(4);
  shape[0] = 1;
  shape[1] = channels;
  shape[2] = height;
  shape[3] = width;
  return shape;
}

// Compute initial shape for density map
template <typename Dtype>
vector<int> CrowdDataLayer<Dtype>::GetDmapBlobShape(const Mat& cv_img,
        const int ds_times) {
  const int channels = 1;
  int height = cv_img.rows;
  int width = cv_img.cols;
  for (int i = 0; i < ds_times; ++i) {
      height = height / 2 + (height % 2);
      width = width / 2 + (width % 2);
  }
  // Check dimensions.
  CHECK_GT(height, 0);
  CHECK_GT(width, 0);
  // Build BlobShape.
  vector<int> shape(4);
  shape[0] = 1;
  shape[1] = channels;
  shape[2] = height;
  shape[3] = width;
  return shape;
}

//2D Gaussain kernel, similar to function fspecial in Matlab
//Input size must be odd number
template <typename Dtype>
Mat CrowdDataLayer<Dtype>::fspecial(const int size, const float sigma) {
  int r_size = (size-1)/ 2;
  Mat kernel(size, size, CV_32F);
  float simga_2 = float(2) * sigma * sigma;
  for(int i = -r_size; i <= r_size; ++i) {
    int h = i + r_size;
    for (int j = (-r_size); j <= r_size; ++j) {
      int w = j + r_size;
      float v = exp(-(static_cast<float>(i*i) + static_cast<float>(j*j)) / simga_2);
      kernel.ptr<float>(h)[w] = v;
    }
  }
  Scalar sum_value = sum(kernel);
  Mat gaussian_kernel;
  kernel.convertTo(gaussian_kernel, CV_32F, (1/sum_value[0]));
  return gaussian_kernel;
}

// Generate gaussian kernel based density map
template <typename Dtype>
void CrowdDataLayer<Dtype>::GetDensityMap(Mat& cv_dmap,
       const vector<HeadLocation<int> >& head_location,
       const float sigma) {
  const int k_size = 25;
  const int r_size = (k_size - 1) / 2;
  Mat gaussian_kernel = fspecial(k_size, sigma);
  Mat extend_dmap(cv_dmap.rows + k_size - 1, cv_dmap.cols + k_size - 1, CV_32F, Scalar(0));
  const int num_head = head_location.size();
  for (int i = 0; i < num_head; ++i) {
    int loc_x = head_location[i].x;
    int loc_y = head_location[i].y;
    CHECK_GE(loc_x, 0) << "invalid head location: x!";
    CHECK_GE(loc_y, 0) << "invalid head location: y!";
    CHECK_LE(loc_x, cv_dmap.cols) << "invalid head location: x!";
    CHECK_LE(loc_y, cv_dmap.rows) << "invalid head location: y!";
    Rect location_box;
    location_box.x = loc_x;
    location_box.y = loc_y;
    location_box.width = k_size;
    location_box.height = k_size;
    Mat kernel_location(extend_dmap, location_box);
    kernel_location = kernel_location + gaussian_kernel;
  }
  Rect dmap_loc;
  dmap_loc.x = r_size;
  dmap_loc.y = r_size;
  dmap_loc.width = cv_dmap.cols;
  dmap_loc.height = cv_dmap.rows;
  Mat dstroi = extend_dmap(dmap_loc);
  dstroi.convertTo(cv_dmap, CV_32F, 1, 0);
}

//Shuffle crowd image and ground truth location.
template <typename Dtype>
void CrowdDataLayer<Dtype>::ShuffleData() {
  caffe::rng_t* prefetch_rng =
      static_cast<caffe::rng_t*>(prefetch_rng_->generator());
  shuffle(lines_.begin(), lines_.end(), prefetch_rng);
}

//Return an unsigned integrate number
template <typename Dtype>
unsigned int CrowdDataLayer<Dtype>::PrefetchRand() {
  CHECK(prefetch_rng_);
  caffe::rng_t* prefetch_rng =
    static_cast<caffe::rng_t*>(prefetch_rng_->generator());
  return (*prefetch_rng)();
}

//Convert cvMat to data blob
template <typename Dtype>
void CrowdDataLayer<Dtype>::Transform(const Mat &cv_img,
  Blob<Dtype> *transformed_blob, const bool do_mirror,
  const int offset) {
  // 1. Check dimensions
  const int img_channels = cv_img.channels();
  const int img_height = cv_img.rows;
  const int img_width = cv_img.cols;
  const int channels = transformed_blob->channels();
  const int height = transformed_blob->height();
  const int width = transformed_blob->width();
  CHECK_EQ(channels, img_channels);
  CHECK_EQ(height, img_height);
  CHECK_EQ(width, img_width);
  CHECK(cv_img.depth() == CV_8U || cv_img.depth() == CV_32F);
  // 2. Copy image (or density map data from cvMat to  data blob
  Dtype *data = transformed_blob->mutable_cpu_data() + offset;
  for (int h = 0; h < img_height; h++) {
    const uchar *psrc_u = NULL;
    const float *psrc_f = NULL;
    if(cv_img.depth() == CV_8U) {
      psrc_u = cv_img.ptr<uchar>(h);
    }
    if(cv_img.depth() == CV_32F) {
      psrc_f = cv_img.ptr<float>(h);
    }
    for (int w = 0; w < img_width; w++) {
      for (int c = 0; c < channels; c++) {
        if (do_mirror) {
          // For crowd image
          if(cv_img.depth() == CV_8U)
            data[(c * height + h) * width + w] = static_cast<Dtype> (
              (psrc_u[(img_width - 1 - w) * channels + c] - mean_values_[c])
              * transform_scale_);
          // For density map
          if (cv_img.depth() == CV_32F)
            data[(c * height + h) * width + w] = static_cast<Dtype> (
              psrc_f[(img_width - 1 - w) * channels + c]);
        } else {
          // For crowd image
          if(cv_img.depth() == CV_8U)
            data[(c * height + h) * width + w] = static_cast<Dtype> (
              (psrc_u[w * channels + c] - mean_values_[c])
              * transform_scale_);
          // For density map
          if (cv_img.depth() == CV_32F)
            data[(c * height + h) * width + w] = static_cast<Dtype> (
              psrc_f[w * channels + c]);
        }
      }
    }
  }
}

template <typename Dtype>
void CrowdDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {

  downsamp_times_ = this->layer_param_.image_data_param().downsamp_times();
  base_sigma_ = this->layer_param_.image_data_param().base_sigma();
  const bool is_color  = this->layer_param_.image_data_param().is_color();
  string root_folder = this->layer_param_.image_data_param().root_folder();

  // 1.  Configure mean values and transform scale
  const bool has_mean_values = this->transform_param_.mean_value_size() > 0;
  int channel = is_color ? 3 : 1;
  if (has_mean_values) {
    const int mean_channel = this->transform_param_.mean_value_size();
    CHECK(mean_channel == 1 || mean_channel == channel) <<
        "Specify either 1 mean_value or as many as channels: " << channel;
    if (channel > 1 && mean_channel == 1) {
      // Replicate the mean_value for simplicity
      for (int c = 0; c < channel; ++c) {
        mean_values_.push_back(this->transform_param_.mean_value(0));
      }
    } else {
      for (int c = 0; c < channel; ++c) {
        mean_values_.push_back(this->transform_param_.mean_value(c));
      }
    }
  } else {
    // If not specify mean value, set to zero
    for (int c = 0; c < channel; ++c) {
      mean_values_.push_back(Dtype(0));
    }
  }
  transform_scale_ = this->transform_param_.scale();

  // 2. Read the image and location file with filenames.
  const string& source = this->layer_param_.image_data_param().source();
  LOG(INFO) << "Opening file " << source;
  std::ifstream infile(source.c_str());
  string img_filename;
  string loc_filename;
  while (infile >> img_filename >> loc_filename) {
    lines_.push_back(std::make_pair(img_filename, loc_filename));
  }

  // 3. Randomly shuffle data.
  if (this->layer_param_.image_data_param().shuffle()) {
    LOG(INFO) << "Shuffling data";
    const unsigned int prefetch_rng_seed = caffe_rng_rand();
    prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed));
    ShuffleData();
  }
  LOG(INFO) << "A total of " << lines_.size() << " crowd images.";

  // 4. Read an crowd image to initialize the top blob.
  // 4.1. crowd image.
  lines_id_ = 0;
  Mat cv_img = ReadCrowdImageToCVMat(root_folder + lines_[lines_id_].first,
          is_color);
  CHECK(cv_img.data) << "Could not load " << lines_[lines_id_].first;
  vector<int> data_shape = GetImageBlobShape(cv_img, is_color);
  for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
    this->prefetch_[i].data_.Reshape(data_shape);
  }
  top[0]->Reshape(data_shape);
  LOG(INFO) << "output crowd image size: " << top[0]->num() << ","
      << top[0]->channels() << "," << top[0]->height() << ","
      << top[0]->width();
  // 4.2. density map.
  vector<int> dmap_shape = GetDmapBlobShape(cv_img, downsamp_times_);
  for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
    this->prefetch_[i].label_.Reshape(dmap_shape);
  }
  top[1]->Reshape(dmap_shape);
  LOG(INFO) << "output density map size: " << top[1]->num() << ","
      << top[1]->channels() << "," << top[1]->height() << ","
      << top[1]->width();

}

// This function is called on prefetch thread.
template <typename Dtype>
void CrowdDataLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
  CPUTimer batch_timer;
  batch_timer.Start();
  double read_time = 0;
  double trans_time = 0;
  double dmap_time = 0;
  CPUTimer timer;

  ImageDataParameter image_data_param = this->layer_param_.image_data_param();
  const bool is_color = image_data_param.is_color();
  const bool mirror = this->transform_param_.mirror();
  string root_folder = image_data_param.root_folder();
  const unsigned int prefetch_rng_seed = caffe_rng_rand();
  prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed));

  // 1. Reshape each batch according the image read currently.
  // Because each crowd image has different size, the batch_size needs
  // to be set to one.
  // Maybe use batch_size more than one in future.
  // 1.1. Reshape crowd image (data) and density map (label) of batch.
  int batch_size  = 1;
  Mat cv_img = ReadCrowdImageToCVMat(root_folder + lines_[lines_id_].first,
          is_color);
  CHECK(cv_img.data) << "Could not load " << lines_[lines_id_].first;
  vector<int> data_shape = GetImageBlobShape(cv_img, is_color);
  data_shape[0] = batch_size;
  batch->data_.Reshape(data_shape);
  vector<int> dmap_shape = GetDmapBlobShape(cv_img, downsamp_times_);
  dmap_shape[0] = batch_size;
  batch->label_.Reshape(dmap_shape);

  // 2. Load crowd images and generate density maps.
  const int lines_size = lines_.size();
  for (int item_id = 0; item_id < batch_size; ++item_id) {
    // 2.1 Load crowd images
    timer.Start();
    CHECK_GT(lines_size, lines_id_);
    Mat cv_img = ReadCrowdImageToCVMat(root_folder + lines_[lines_id_].first,
          is_color);
    CHECK(cv_img.data) << "Could not load " << lines_[lines_id_].first;
    read_time += timer.MicroSeconds();
    // 2.2 Apply transformations (mirror,maybe crop in future...) to the image
    timer.Start();
    int data_offset = item_id * (batch->data_.count(1));
    bool do_mirror = mirror;      // for validation and testing
    if (this->phase_ == TRAIN) {  // for training, do mirror jittering
      do_mirror = mirror && PrefetchRand() % 2;
    }
    Transform(cv_img, &batch->data_, do_mirror, data_offset);
    trans_time += timer.MicroSeconds();
    // 2.3 Generate density map
    timer.Start();
    int dmap_offset = item_id * (batch->label_.count(1));
    Mat dmap(dmap_shape[2], dmap_shape[3], CV_32F, Scalar(0));
    ReadLocationFromTextFile(root_folder + lines_[lines_id_].second);
    GetDensityMap(dmap, gt_loc_, base_sigma_);
    Transform(dmap, &batch->label_, do_mirror, dmap_offset);
    // 2.4 Go to the next iter
    lines_id_++;
    if (lines_id_ >= lines_size) {
      // We have reached the end. Restart from the first.
      DLOG(INFO) << "Restarting data prefetching from start.";
      lines_id_ = 0;
      if (this->layer_param_.image_data_param().shuffle()) {
        ShuffleData();
      }
    }
  }
  batch_timer.Stop();
  DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms.";
  DLOG(INFO) << "     Read time: " << read_time / 1000 << " ms.";
  DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms.";
  DLOG(INFO) << "Generate Density map: " << dmap_time / 1000 << "ms.";
}

INSTANTIATE_CLASS(CrowdDataLayer);
REGISTER_LAYER_CLASS(CrowdData);

}  // namespace caffe
#endif  // USE_OPENCV

4.crowd_data_layer.cu


//  Create on: 2016/9/19 ShanghaiTech
//  Author:    Yingying Zhang

#include <vector>
#include "caffe/layers/crowd_data_layer.hpp"

namespace caffe {

template <typename Dtype>
void CrowdDataLayer<Dtype>::Forward_gpu(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  Batch<Dtype>* batch = this->prefetch_full_.pop("Data layer prefetch queue empty");
  // 1. Reshape to loaded crowd image and copy it to top[0].
  top[0]->ReshapeLike(batch->data_);
  caffe_copy(batch->data_.count(), batch->data_.gpu_data(),
      top[0]->mutable_gpu_data());
  // 2. Reshape to loaded density map and copy it to top[1].
  top[1]->ReshapeLike(batch->label_);
  caffe_copy(batch->label_.count(), batch->label_.gpu_data(),
      top[1]->mutable_gpu_data());
  DLOG(INFO) << "Prefetch copied";
  // 3. Ensure the copy is synchronous wrt the host, so that the next batch isn't
  // copied in meanwhile.
  CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));
  this->prefetch_free_.push(batch);
}

INSTANTIATE_LAYER_GPU_FORWARD(CrowdDataLayer);

}  // namespace caffe

PS: