Caffe source - SyncedMemory class

Introduction to the SyncedMemory class

I recently started reading the Caffe source code from BVLC/caffe, roughly following the analysis order commonly recommended online: Blob -> Layer -> Net -> Solver. Among these, SyncedMemory is the lowest-level building block in Caffe: it is responsible for operations (allocation, copying, and so on) on a block of host (CPU) memory or device (GPU) memory.
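To make its role concrete, here is a minimal usage sketch (hypothetical driver code written for this post, not taken from Caffe): the caller asks for CPU or GPU views of the same logical buffer, and SyncedMemory performs the allocations and host/device copies lazily behind the scenes.

#include "caffe/syncedmem.hpp"

void example() {
  caffe::SyncedMemory mem(64 * sizeof(float));             // only records the size
  float* h = static_cast<float*>(mem.mutable_cpu_data());  // first access: allocates host memory
  h[0] = 3.14f;                                            // write on the CPU side
  const void* d = mem.gpu_data();                          // allocates device memory, copies host -> device
  // mem.head() is now SyncedMemory::SYNCED: both copies hold the same bytes
}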

syncedmem.cpp source

SyncedMemory::SyncedMemory()    // Default constructor: initialize the members (size_ is 0, pointers are null, etc.)
  : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED),
    own_cpu_data_(false), cpu_malloc_use_cuda_(false), own_gpu_data_(false) {
#ifndef CPU_ONLY
#ifdef DEBUG
  CUDA_CHECK(cudaGetDevice(&device_));  // cudaGetDevice() returns the device currently in use
#endif
#endif
}

SyncedMemory::SyncedMemory(size_t size)   // Constructor: record size_ (no memory is allocated here)
  : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED),
    own_cpu_data_(false), cpu_malloc_use_cuda_(false), own_gpu_data_(false) {
#ifndef CPU_ONLY
#ifdef DEBUG
  CUDA_CHECK(cudaGetDevice(&device_));
#endif
#endif
}

SyncedMemory::~SyncedMemory() {       // Destructor
  check_device();                     // Check the GPU device
  if (cpu_ptr_ && own_cpu_data_) {    // If the CPU data pointer is non-null and the data was allocated by this instance
    CaffeFreeHost(cpu_ptr_, cpu_malloc_use_cuda_);    // Free the host memory
  }

#ifndef CPU_ONLY
  if (gpu_ptr_ && own_gpu_data_) {    // Likewise, if the GPU pointer is non-null and the data was allocated by this instance
    CUDA_CHECK(cudaFree(gpu_ptr_));   // Free the device memory
  }
#endif  // CPU_ONLY
}

// Move the data to the CPU. If no host memory has been allocated yet, allocate
// a buffer of the right size; if the data only lives on the GPU, copy it to the
// CPU; if it is already present on the CPU, do nothing.
inline void SyncedMemory::to_cpu() {
  check_device();
  switch (head_) {          // Current state of the data
  case UNINITIALIZED:       // Nothing allocated yet
    CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_);   // Allocate host memory
    caffe_memset(size_, 0, cpu_ptr_);   // Zero-fill the buffer
    head_ = HEAD_AT_CPU;    // The data now lives in host memory, handled by the CPU
    own_cpu_data_ = true;   // The data was allocated by this instance
    break;
  case HEAD_AT_GPU:         // The data currently lives on the GPU
#ifndef CPU_ONLY
    if (cpu_ptr_ == NULL) {
      CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_);   // CPU pointer is null, so allocate host memory
      own_cpu_data_ = true;
    }
    caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_);    // Copy size_ bytes from gpu_ptr_ to cpu_ptr_
    head_ = SYNCED;         // Mark as synchronized (CPU and GPU now hold the same data)
#else
    NO_GPU;     // The data is on the GPU but GPU support is disabled: error
#endif
    break;
  case HEAD_AT_CPU:   // The data is already on the CPU: nothing to do
  case SYNCED:
    break;
  }
}
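A hedged trace of the state machine that to_cpu() and to_gpu() implement (my own example, assuming a fresh instance in GPU mode):

caffe::SyncedMemory mem(4096);   // head_ == UNINITIALIZED: nothing allocated
mem.cpu_data();                  // to_cpu(): allocate + zero-fill, head_ == HEAD_AT_CPU
mem.mutable_gpu_data();          // to_gpu(): allocate + copy CPU->GPU, then head_ == HEAD_AT_GPU
mem.cpu_data();                  // to_cpu(): copy GPU->CPU, head_ == SYNCED
mem.cpu_data();                  // already SYNCED: no allocation, no copy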

// Likewise, move the data to the GPU
inline void SyncedMemory::to_gpu() {
  check_device();
#ifndef CPU_ONLY
  switch (head_) {
  case UNINITIALIZED:   // Nothing allocated yet
    CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));   // Allocate device memory
    caffe_gpu_memset(size_, 0, gpu_ptr_);       // Zero-fill the buffer
    head_ = HEAD_AT_GPU;    // The data now lives on the GPU
    own_gpu_data_ = true;
    break;
  case HEAD_AT_CPU:     // The data currently lives on the CPU
    if (gpu_ptr_ == NULL) {
      CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));   // Allocate device memory
      own_gpu_data_ = true;
    }
    caffe_gpu_memcpy(size_, cpu_ptr_, gpu_ptr_);  // Copy the data from cpu_ptr_ to gpu_ptr_
    head_ = SYNCED;
    break;
  case HEAD_AT_GPU:
  case SYNCED:
    break;
  }
#else
  NO_GPU;
#endif
}

// Return a pointer to the CPU data; the pointed-to data must not be modified
const void* SyncedMemory::cpu_data() {
  check_device();   // Check that the device is consistent
  to_cpu();         // Move the data to the CPU
  return (const void*)cpu_ptr_;
}

// Point the CPU data pointer at data
void SyncedMemory::set_cpu_data(void* data) {
  check_device();   // Device check
  CHECK(data);      // Null check
  if (own_cpu_data_) {    // If this instance already allocated its own CPU data, free it first
    CaffeFreeHost(cpu_ptr_, cpu_malloc_use_cuda_);
  }
  cpu_ptr_ = data;        // Point at data
  head_ = HEAD_AT_CPU;    // Update the state
  own_cpu_data_ = false;  // The data was not allocated by this instance, so the
                          // destructor will not free the memory cpu_ptr_ points to
}
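A short sketch of the borrowing semantics this enables (caller-side code I made up for illustration): after set_cpu_data(), the instance reads the external buffer but never frees it.

std::vector<float> external(100, 1.0f);          // storage owned by the caller
caffe::SyncedMemory mem(100 * sizeof(float));
mem.set_cpu_data(external.data());               // borrow: own_cpu_data_ == false
// mem.cpu_data() now returns external.data(); when mem is destroyed, the
// vector's storage is left untouched and is released by the vector itself.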

// Return a pointer to the GPU data; the pointed-to data must not be modified
const void* SyncedMemory::gpu_data() {
  check_device();
#ifndef CPU_ONLY
  to_gpu();   // Move the data to the GPU
  return (const void*)gpu_ptr_;
#else
  NO_GPU;
  return NULL;
#endif
}

// Point the GPU data pointer at data
void SyncedMemory::set_gpu_data(void* data) {
  check_device();
#ifndef CPU_ONLY
  CHECK(data);
  if (own_gpu_data_) {    // If this instance allocated its own GPU data, free it first
    CUDA_CHECK(cudaFree(gpu_ptr_));
  }
  gpu_ptr_ = data;
  head_ = HEAD_AT_GPU;
  own_gpu_data_ = false;    // Likewise set to false: the destructor will not free it
#else
  NO_GPU;
#endif
}

// Return a pointer to the CPU data; the pointed-to data may be modified
void* SyncedMemory::mutable_cpu_data() {
  check_device();
  to_cpu();
  head_ = HEAD_AT_CPU;
  return cpu_ptr_;
}

// Return a pointer to the GPU data; the pointed-to data may be modified
void* SyncedMemory::mutable_gpu_data() {
  check_device();
#ifndef CPU_ONLY
  to_gpu();
  head_ = HEAD_AT_GPU;
  return gpu_ptr_;
#else
  NO_GPU;
  return NULL;
#endif
}
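The const/mutable split matters for avoiding needless copies. A hedged sketch (my own example) of how mutable_cpu_data() invalidates the GPU copy while cpu_data() would not:

caffe::SyncedMemory mem(4 * sizeof(float));
mem.gpu_data();                                          // allocate + zero on the GPU, head_ == HEAD_AT_GPU
float* h = static_cast<float*>(mem.mutable_cpu_data());  // copies GPU->CPU, then head_ = HEAD_AT_CPU
h[0] = 1.0f;                                             // the GPU copy is now stale
mem.gpu_data();                                          // to_gpu() sees HEAD_AT_CPU and re-copies
// Reading via cpu_data() instead would have left the state SYNCED, and the
// gpu_data() call above would have skipped the copy entirely.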

// Asynchronously copy data from the CPU to the GPU
#ifndef CPU_ONLY
void SyncedMemory::async_gpu_push(const cudaStream_t& stream) {
  check_device();
  CHECK(head_ == HEAD_AT_CPU);    // The data must currently live on the CPU
  if (gpu_ptr_ == NULL) {
    CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));   // Allocate device memory
    own_gpu_data_ = true;
  }
  const cudaMemcpyKind put = cudaMemcpyHostToDevice;    // Copy direction: host to device
  // Copies data between host and device. This is asynchronous: the call may
  // return before the copy has completed (cudaMemcpy(), by contrast, is
  // synchronous and only returns once the copy is done).
  CUDA_CHECK(cudaMemcpyAsync(gpu_ptr_, cpu_ptr_, size_, put, stream));  // Copy the data from cpu_ptr_ to gpu_ptr_
  // Assume caller will synchronize on the stream before use
  head_ = SYNCED;   // Mark as synchronized
}
#endif
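A caller-side usage sketch (assumed code, not from Caffe): the caller owns the stream and, as the source comment warns, must synchronize it before touching the GPU data.

caffe::SyncedMemory mem(1024);
cudaStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream));
mem.mutable_cpu_data();                     // ensure head_ == HEAD_AT_CPU
// ... fill the CPU buffer here ...
mem.async_gpu_push(stream);                 // may return before the copy finishes
CUDA_CHECK(cudaStreamSynchronize(stream));  // only now is gpu_data() safe to read
CUDA_CHECK(cudaStreamDestroy(stream));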

void SyncedMemory::check_device() {   // Check the device for inconsistencies
#ifndef CPU_ONLY
#ifdef DEBUG
  int device;
  cudaGetDevice(&device);   // Returns the device currently in use
  CHECK(device == device_);
  if (gpu_ptr_ && own_gpu_data_) {
    cudaPointerAttributes attributes;
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, gpu_ptr_));  // Fill attributes with the properties of the gpu_ptr_ pointer
    CHECK(attributes.device == device_);    // Check that the device the pointer lives on matches the device_ stored in the class
  }
#endif
#endif
}

syncedmem.hpp source

// If CUDA is available and in GPU mode, host memory will be allocated pinned,
// using cudaMallocHost. It avoids dynamic pinning for transfers (DMA).
// The improvement in performance seems negligible in the single GPU case,
// but might be more significant for parallel training. Most importantly,
// it improved stability for large models on many GPUs.
// Allocate host memory
inline void CaffeMallocHost(void** ptr, size_t size, bool* use_cuda) {
#ifndef CPU_ONLY
  if (Caffe::mode() == Caffe::GPU) {        // In GPU mode
    CUDA_CHECK(cudaMallocHost(ptr, size));  // Allocate page-locked (pinned) memory
    *use_cuda = true;   // CUDA was used
    return;
  }
#endif
#ifdef USE_MKL          // Using Intel's Math Kernel Library
  *ptr = mkl_malloc(size ? size:1, 64);
#else
  *ptr = malloc(size);  // Plain, ordinary allocation (pageable memory)
#endif
  *use_cuda = false;    // CUDA was not used
  CHECK(*ptr) << "host allocation of size " << size << " failed";
}
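To spell out the pinned vs. pageable distinction that CaffeMallocHost() hides, here is a standalone sketch (not Caffe code):

const size_t bytes = 1 << 20;
float* pinned = NULL;
CUDA_CHECK(cudaMallocHost((void**)&pinned, bytes));   // page-locked: the GPU can DMA from it directly
float* pageable = (float*)malloc(bytes);              // ordinary pageable host memory
// A cudaMemcpy() from `pinned` transfers straight to the device; from
// `pageable`, the driver first stages the data through an internal pinned
// buffer, which is why pinned transfers are faster.
CUDA_CHECK(cudaFreeHost(pinned));
free(pageable);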

// Free host memory
inline void CaffeFreeHost(void* ptr, bool use_cuda) {
#ifndef CPU_ONLY
  if (use_cuda) {   // If CUDA was used for the allocation, free it with the matching CUDA call
    CUDA_CHECK(cudaFreeHost(ptr));
    return;
  }
#endif
#ifdef USE_MKL
  mkl_free(ptr);
#else
  free(ptr);
#endif
}

/**
 * @brief Manages memory allocation and synchronization between the host (CPU)
 *        and device (GPU).
 *
 * TODO(dox): more thorough description.
 */
class SyncedMemory {
 public:
  SyncedMemory();
  explicit SyncedMemory(size_t size);
  ~SyncedMemory();
  const void* cpu_data();
  void set_cpu_data(void* data);
  const void* gpu_data();
  void set_gpu_data(void* data);
  void* mutable_cpu_data();
  void* mutable_gpu_data();
  // The possible states of the data: UNINITIALIZED (no host or device memory allocated yet),
  // HEAD_AT_CPU (the data lives on the CPU), HEAD_AT_GPU (the data lives on the GPU),
  // SYNCED (the data exists on both the CPU and the GPU with identical contents)
  enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED };
  SyncedHead head() const { return head_; }
  size_t size() const { return size_; }

#ifndef CPU_ONLY
  void async_gpu_push(const cudaStream_t& stream);
#endif

 private:
  void check_device();

  void to_cpu();        // Move the data to the CPU
  void to_gpu();        // Move the data to the GPU
  void* cpu_ptr_;       // Pointer to the data in host memory
  void* gpu_ptr_;       // Pointer to the data in device memory
  size_t size_;         // Size of the data
  SyncedHead head_;     // State of the data: one of the four SyncedHead values
  // Was the CPU data allocated by this instance, or was the pointer passed in from
  // outside? (Memory allocated by the instance is freed by the instance; an externally
  // passed pointer is not freed by the destructor and is the caller's responsibility.)
  bool own_cpu_data_;
  bool cpu_malloc_use_cuda_;    // Whether CUDA was used to allocate the CPU data
  bool own_gpu_data_;   // Likewise, whether the GPU data was allocated by this instance
  int device_;          // The GPU device currently in use

  DISABLE_COPY_AND_ASSIGN(SyncedMemory);  // Disallow copying and assignment of the class
};  // class SyncedMemory

summary

  1. Data processed by the CPU lives in host memory (RAM); data processed by the GPU lives in device memory (video memory).
  2. Simply creating a SyncedMemory instance does not allocate any host or device memory. Memory is only allocated inside to_cpu() or to_gpu() when the data is actually accessed, e.g. through cpu_data() or mutable_gpu_data(); see the sketch after this list.
  3. In GPU mode, CaffeMallocHost() allocates page-locked (pinned) memory with cudaMallocHost(). This memory can be accessed directly by the GPU via DMA, so reads and writes across the bus are faster than with ordinary pageable memory (allocated with malloc). Details on the various CUDA functions can be found in the official reference manual.
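A tiny check of point 2 (hypothetical test code) that makes the lazy allocation visible through head():

#include <cassert>

caffe::SyncedMemory mem(256);                               // constructor only records size_
assert(mem.head() == caffe::SyncedMemory::UNINITIALIZED);   // nothing allocated yet
mem.cpu_data();                                             // first access runs to_cpu()
assert(mem.head() == caffe::SyncedMemory::HEAD_AT_CPU);     // host memory now exists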

This is my first time reading the Caffe source code, and I am taking notes as I read, so my understanding and analysis may contain errors or omissions. Corrections from readers are very welcome; thank you for your support!

reference

https://docs.nvidia.com/pdf/CUDA_Runtime_API.pdf
https://www.zhihu.com/question/27982282
