Recently, I have to process large-scale point cloud data, and running it on the CPU feels a bit slow. I want to use the GPU to accelerate the point cloud processing process, so I want to learn CUDA programming.
Many tutorials recommend that, before installing CUDA, you open cmd and run nvidia-smi to check the highest CUDA version your graphics driver supports. I skipped this step and installed CUDA directly, but it is worth doing: when I checked afterwards, nvidia-smi reported that my driver supports CUDA 11.8.
CUDA download and installation
CUDA official installation tutorial: https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html
According to the official tutorial, it can be found that VS2019 and Win11 can be configured with CUDA11.8.
CUDA download link: https://developer.nvidia.com/cuda-downloads
The installation process defaults all the way, and the default path is usually on the C drive.
After the installation is complete, open cmd, enter nvcc -V and try to see if you can find the CUDA version. If so, it should be no problem.
VS2019 and CUDA configuration
You can open an existing VS project or create a new empty project. Right-click the source files folder and add a new item to create a CUDA file with the suffix .cu (a CUDA header file uses the suffix .cuh). Make sure the build configuration is set to Debug and the platform to x64.
Then, right-click the cu file, select Properties, and change the item type to CUDA C++.
Select the project, right-click->Generate dependencies->Custom build->Select CUDA11.8
Right-click the project, open CUDA C/C++ -> Common, and set the CUDA Toolkit Custom Dir to C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8. You still also need to configure the include directory and library directory, just as when configuring any other library in VS.
Include directory: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\include
Library directory: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\lib\x64
Linker -> Input -> Additional Dependencies; add the following libraries:
cublas.lib
cublasLt.lib
cuda.lib
cudadevrt.lib
cudart.lib
cudart_static.lib
cufft.lib
cufftw.lib
cufilt.lib
curand.lib
cusolver.lib
cusolverMg.lib
cusparse.lib
nppc.lib
nppial.lib
nppicc.lib
nppidei.lib
nppif.lib
nppig.lib
nppim.lib
nppist.lib
nppisu.lib
nppitc.lib
npps.lib
nvblas.lib
nvjpeg.lib
nvml.lib
nvptxcompiler_static.lib
nvrtc.lib
nvrtc_static.lib
nvrtc-builtins_static.lib
OpenCL.lib
After completing the configuration, you usually need to run a routine to try it out and see the effect. But an error was reported
Undefined identifier '__syncthreads'
One blog post suggests that adding a header file does not fix this situation; I tried it and confirmed that it does not help. However, the code still compiles and runs despite the squiggle.
Another blogger said the same thing, portal
Another IntelliSense message reads "expected an expression" (it is triggered by the <<<...>>> kernel-launch syntax, which IntelliSense does not parse). It seems this can be safely ignored; I have not found a real fix.
test code
error.cuh file
#pragma once
#include <stdio.h>
#include <stdlib.h>   // exit()

// CHECK(call): wraps a CUDA runtime API call; on failure prints the file,
// line, numeric error code, and error string, then terminates the process.
// The do { ... } while (0) wrapper makes the macro behave like a single
// statement (safe inside un-braced if/else).
// NOTE: every continuation line must end with a backslash — the original
// paste had the braces on bare lines, which broke the macro.
#define CHECK(call)                                               \
do                                                                \
{                                                                 \
    const cudaError_t error_code = call;                          \
    if (error_code != cudaSuccess)                                \
    {                                                             \
        printf("CUDA Error:\n");                                  \
        printf("    File:       %s\n", __FILE__);                 \
        printf("    Line:       %d\n", __LINE__);                 \
        printf("    Error code: %d\n", error_code);               \
        printf("    Error text: %s\n",                            \
            cudaGetErrorString(error_code));                      \
        exit(1);                                                  \
    }                                                             \
} while (0)
XXX.cu code
#include <stdio.h>
#include <stdlib.h>
#include "error.cuh"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <math.h>
#define TILE_DIM 32 // Shared-memory tile edge length (one tile per thread block)
#define BLOCK_SIZE 32 // Thread-block dimension in x and y; matches TILE_DIM here
#define N 3001 // Matrix edge length; deliberately NOT a multiple of 32, so edge tiles are partial and the bounds checks are exercised
__managed__ int input_M[N * N]; // Unified-memory buffer: input matrix, overwritten in place with the GPU transpose
int cpu_result[N * N]; // Host-side reference transpose used for verification
//In-place matrix transpose. Each sub-diagonal block (blockIdx.y > blockIdx.x)
//swaps its tile with the mirrored super-diagonal tile; diagonal blocks
//transpose their own tile in place; blocks strictly above the diagonal do no
//work (their mirror partner performs the swap). Expects a 2D launch with
//TILE_DIM x TILE_DIM thread blocks covering the N x N matrix.
__global__ void ip_transpose(int* data)
{
//+1 padding on the fastest-varying dimension staggers columns across
//shared-memory banks, so the transposed (column-wise) read-back below is
//free of bank conflicts.
__shared__ int tile_s[TILE_DIM][TILE_DIM + 1];
__shared__ int tile_d[TILE_DIM][TILE_DIM + 1];
//Global (column, row) of this thread's element within its own tile.
int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
//Threads in the triangle below the diagonal: swap two mirrored tiles.
if (blockIdx.y > blockIdx.x) {
//Global coordinates of the corresponding element in the mirrored tile.
int dx = blockIdx.y * TILE_DIM + threadIdx.x;
int dy = blockIdx.x * TILE_DIM + threadIdx.y;
//Stage both tiles into shared memory; the bounds checks handle the
//partial tiles at the matrix edge (N = 3001 is not a multiple of TILE_DIM).
if (x < N && y < N)
{
tile_s[threadIdx.y][threadIdx.x] = data[(y)*N + x];
}
if (dx < N && dy < N)
{
tile_d[threadIdx.y][threadIdx.x] = data[(dy)*N + dx];
}
//Barrier inside a branch is safe here: the branch condition depends only
//on blockIdx, so all threads of a given block take the same path.
__syncthreads();
//Write each staged tile to the opposite location with indices swapped
//(note threadIdx.x/threadIdx.y are exchanged: this performs the transpose).
if (dx < N && dy < N)
{
data[(dy)*N + dx] = tile_s[threadIdx.x][threadIdx.y];
}
if (x < N && y < N)
{
data[(y)*N + x] = tile_d[threadIdx.x][threadIdx.y];
}
}
else if (blockIdx.y == blockIdx.x)//Threads on the diagonal: transpose the tile in place
{
if (x < N && y < N)
{
tile_s[threadIdx.y][threadIdx.x] = data[(y)*N + x];
}
__syncthreads();
if (x < N && y < N)
{
data[(y)*N + x] = tile_s[threadIdx.x][threadIdx.y];
}
}
}
// CPU reference transpose: writes B = A^T for the N x N matrices used in
// this demo. Iterates row-major over the source so A is read sequentially.
void cpu_transpose(int* A, int* B)
{
    for (int row = 0; row < N; ++row)
    {
        for (int col = 0; col < N; ++col)
        {
            B[col * N + row] = A[row * N + col];
        }
    }
}
// Fills the managed buffer with random values, computes a CPU reference
// transpose, runs the in-place GPU transpose, times it with CUDA events,
// and verifies the GPU result against the reference.
int main(int argc, char const* argv[])
{
    cudaEvent_t start, stop_gpu;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop_gpu));

    // rand() is intentionally unseeded so runs are reproducible.
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            input_M[i * N + j] = rand() % 1000;
        }
    }
    cpu_transpose(input_M, cpu_result);

    CHECK(cudaEventRecord(start));
    // Ceil-division so the grid covers the whole matrix even though
    // N (3001) is not a multiple of BLOCK_SIZE.
    unsigned int grid_rows = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    ip_transpose << <dimGrid, dimBlock >> > (input_M);
    CHECK(cudaGetLastError());   // catch launch-configuration errors (bad grid/block)
    CHECK(cudaEventRecord(stop_gpu));
    CHECK(cudaEventSynchronize(stop_gpu)); // blocks until the kernel and stop event complete
    float elapsed_time_gpu;
    CHECK(cudaEventElapsedTime(&elapsed_time_gpu, start, stop_gpu));
    printf("Time_GPU = %g ms.\n", elapsed_time_gpu);
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop_gpu));

    // The data is integer, so compare exactly — no floating-point
    // tolerance (the original fabs(...) > 1e-10 pattern) is needed.
    int ok = 1;
    for (int i = 0; i < N && ok; ++i)
    {
        for (int j = 0; j < N; ++j)
        {
            if (input_M[i * N + j] != cpu_result[i * N + j])
            {
                ok = 0;
                break;
            }
        }
    }
    if (ok)
    {
        printf("Pass!!!\n");
    }
    else
    {
        printf("Error!!!\n");
    }
    return 0;
}
result
GPU information reading code
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
void myDeviceInfo();
// Entry point: print the properties of every CUDA device, then wait for
// any outstanding device work before exiting.
int main()
{
    myDeviceInfo();
    cudaDeviceSynchronize();
    return 0;
}
// Enumerates all CUDA devices and prints their execution-resource limits
// (SM count, block/thread limits, grid and block dimension maxima).
// Unlike the original, the cudaError_t results are checked instead of
// being silently ignored.
void myDeviceInfo()
{
    int dev_count = 0;
    if (cudaGetDeviceCount(&dev_count) != cudaSuccess) {
        printf("cudaGetDeviceCount failed\n");
        return;
    }
    cudaDeviceProp dev_prop;
    for (int i = 0; i < dev_count; i++) {
        if (cudaGetDeviceProperties(&dev_prop, i) != cudaSuccess) {
            continue; // skip devices we cannot query
        }
        printf("----------- Information of device %d -----------\n", i);
        printf("The streaming multiprocessor(SM) number is %d\n", dev_prop.multiProcessorCount);
        // Typo fixed: "numberof" -> "number of".
        printf("The max thread block number of per SM is %d\n", dev_prop.maxBlocksPerMultiProcessor);
        printf("The max threads number of per SM is %d\n", dev_prop.maxThreadsPerMultiProcessor);
        printf("The max threads number of per block is %d\n", dev_prop.maxThreadsPerBlock);
        printf("The max thread blocks number in (x, y, z) dim is (%d, %d, %d)\n", dev_prop.maxGridSize[0], dev_prop.maxGridSize[1], dev_prop.maxGridSize[2]);
        // Typo fixed: "(%d. %d, %d)" -> "(%d, %d, %d)".
        printf("The max threads number of (x, y, z) dim is (%d, %d, %d)\n", dev_prop.maxThreadsDim[0], dev_prop.maxThreadsDim[1], dev_prop.maxThreadsDim[2]);
        printf("----------- Information of device end -----------\n");
    }
}