Caffe 里面的 GPU 选择设置:通过配置文件读取 GPU 的编号。
// Collect the GPU ids selected by FLAGS_gpu into *gpus.
//   - empty flag : nothing is added; gpus is checked to be empty.
//   - "all"      : ids 0 .. count-1, where count comes from
//                  cudaGetDeviceCount (stays 0 in a CPU_ONLY build).
//   - otherwise  : the flag is split on ',' and each piece is converted to
//                  an int with boost::lexical_cast.
static void get_gpus(vector<int>* gpus) {
  if (FLAGS_gpu.empty()) {
    // Nothing requested: the caller is expected to pass in an empty list.
    CHECK_EQ(gpus->size(), 0);
    return;
  }

  if (FLAGS_gpu != "all") {
    // Explicit comma-separated list of device ids.
    vector<string> tokens;
    boost::split(tokens, FLAGS_gpu, boost::is_any_of(","));
    for (vector<string>::const_iterator token = tokens.begin();
         token != tokens.end(); ++token) {
      gpus->push_back(boost::lexical_cast<int>(*token));
    }
    return;
  }

  // "all": enumerate every device the CUDA runtime reports.
  int num_devices = 0;
#ifndef CPU_ONLY
  CUDA_CHECK(cudaGetDeviceCount(&num_devices));
#else
  // NOTE(review): NO_GPU is defined elsewhere; presumably it reports that
  // GPU support was compiled out -- confirm.
  NO_GPU;
#endif
  for (int id = 0; id < num_devices; ++id) {
    gpus->push_back(id);
  }
}
int device_query() {
LOG(INFO) << "Querying GPUs " << FLAGS_gpu;
vector<int> gpus;
get_gpus(&gpus);
for (int i = 0; i < gpus.size(); ++i) {
caffe::Caffe::SetDevice(gpus[i]);
caffe::Caffe::DeviceQuery();
}
return 0;
}
RegisterBrewFunction(device_query);
// Look up the brew function registered under `name` in g_brew_map.
// If no such entry exists, every registered action name is logged at ERROR
// level and the lookup ends with LOG(FATAL).
static BrewFunction GetBrewFunction(const caffe::string& name) {
  BrewMap::iterator match = g_brew_map.find(name);
  if (match != g_brew_map.end()) {
    return match->second;
  }

  // Unknown action: list what is available before failing.
  LOG(ERROR) << "Available caffe actions:";
  BrewMap::iterator entry = g_brew_map.begin();
  while (entry != g_brew_map.end()) {
    LOG(ERROR) << "\t" << entry->first;
    ++entry;
  }
  LOG(FATAL) << "Unknown action: " << name;
  return NULL;  // not reachable, just to suppress old compiler warnings.
}
设置 GPU 编号:
// Make `device_id` the current CUDA device and recreate the cuBLAS handle and
// cuRAND generator stored on the object returned by Get().
//
// Does nothing when `device_id` is already the current device. Every
// CUDA/cuBLAS/cuRAND call is wrapped in the corresponding *_CHECK macro.
void Caffe::SetDevice(const int device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == device_id) {
    return;
  }
  // The call to cudaSetDevice must come before any calls to Get, which
  // may perform initialization using the GPU.
  CUDA_CHECK(cudaSetDevice(device_id));

  // Destroy any existing handle / generator before creating new ones.
  if (Get().cublas_handle_) {
    CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_));
  }
  if (Get().curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_));
  }

  // Create fresh ones now that the new device is current, and seed the
  // generator with a value from cluster_seedgen().
  CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_));
  CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_,
                                     CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_,
                                                  cluster_seedgen()));
}
其他调用 GPU 的相关函数:
// Report whether GPU #device_id can actually be used by this host thread.
//
// cudaSetDevice() by itself is not a sufficient test: it only records
// device_id lazily and does not initialize a context, so it cannot tell
// whether the host thread has permission to use the device. In a shared
// environment where devices run in EXCLUSIVE_PROCESS or EXCLUSIVE_THREAD
// mode, cudaSetDevice() returns cudaSuccess even if another process or
// thread exclusively occupies the device.
//
// A CUDA operation that initializes the context is therefore required.
// cudaFree(0) is such an operation and has no side effect other than the
// context initialization.
bool Caffe::CheckDevice(const int device_id) {
  const bool device_selected = (cudaSetDevice(device_id) == cudaSuccess);
  // Only attempt context creation when the device could be selected.
  const bool context_ready =
      device_selected && (cudaFree(0) == cudaSuccess);
  // Reset any error that may have occurred.
  cudaGetLastError();
  return context_ready;
}
// Return the ordinal of the first available device, scanning from start_id
// up to the highest ordinal reported by cudaGetDeviceCount, or -1 when none
// passes CheckDevice(). In EXCLUSIVE_PROCESS or EXCLUSIVE_THREAD mode a
// successful check also claims the device, because the check initializes
// a context on it.
int Caffe::FindDevice(const int start_id) {
  int num_devices = 0;
  CUDA_CHECK(cudaGetDeviceCount(&num_devices));

  int candidate = start_id;
  while (candidate < num_devices) {
    if (CheckDevice(candidate)) {
      return candidate;
    }
    candidate++;
  }
  return -1;
}
void Caffe::DeviceQuery() {
cudaDeviceProp prop;
int device;
if (cudaSuccess != cudaGetDevice(&device)) {
printf("No cuda device present.\n");
return;
}
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
LOG(INFO) << "Device id: " << device;
LOG(INFO) << "Major revision number: " << prop.major;
LOG(INFO) << "Minor revision number: " << prop.minor;
LOG(INFO) << "Name: " << prop.name;
LOG(INFO) << "Total global memory: " << prop.totalGlobalMem;
LOG(INFO) << "Total shared memory per block: " << prop.sharedMemPerBlock;
LOG(INFO) << "Total registers per block: " << prop.regsPerBlock;
LOG(INFO) << "Warp size: " << prop.warpSize;
LOG(INFO) << "Maximum memory pitch: " << prop.memPitch;
LOG(INFO) << "Maximum threads per block: " << prop.maxThreadsPerBlock;
LOG(INFO) << "Maximum dimension of block: "
<< prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", "
<< prop.maxThreadsDim[2];
LOG(INFO) << "Maximum dimension of grid: "
<< prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", "
<< prop.maxGridSize[2];
LOG(INFO) << "Clock rate: " << prop.clockRate;
LOG(INFO) << "Total constant memory: " << prop.totalConstMem;
LOG(INFO) << "Texture alignment: " << prop.textureAlignment;
LOG(INFO) << "Concurrent copy and execution: "
<< (prop.deviceOverlap ? "Yes" : "No");
LOG(INFO) << "Number of multiprocessors: " << prop.multiProcessorCount;
LOG(INFO) << "Kernel execution timeout: "
<< (prop.kernelExecTimeoutEnabled ? "Yes" : "No");
return;
}