// Source / copyright notice: https://blog.csdn.net/z0n1l2/article/details/86412079
// GPU merge sort implementation (CUDA)
#include "windows.h"
#include "cuda_sort.h"
#include <iostream>
// One pass of bottom-up mergesort.
// Each thread merges two adjacent sorted runs of length `sortedsize`
// from `a` into `temp`, producing runs of length 2*sortedsize.
// Launch with enough total threads that threads * 2 * sortedsize >= N;
// threads whose runs start at or past N return immediately.
//   a          - device input, holds sorted runs of length `sortedsize`
//   temp       - device output, receives merged runs
//   sortedsize - current run length (>= 1)
//   N          - total element count
__global__ void mergesortK(int *a, int *temp, int sortedsize, int N)
{
    // Flatten a (potentially 3D) grid into a linear thread id.
    int blockid = blockIdx.z * gridDim.x * gridDim.y
                + blockIdx.y * gridDim.x
                + blockIdx.x;
    unsigned long id = (unsigned long)blockid * blockDim.x + threadIdx.x;

    // Widen BEFORE multiplying: the original computed id*2*sortedsize in
    // 32-bit int, which silently overflows for large N before the result
    // is stored into the unsigned long index variables.
    unsigned long index1    = id * 2UL * (unsigned long)sortedsize;
    unsigned long endIndex1 = index1 + sortedsize;
    unsigned long index2    = endIndex1;
    unsigned long endIndex2 = index2 + sortedsize;
    unsigned long targetIndex = index1;
    unsigned long n = (unsigned long)N;

    if (index1 >= n) return;        // both runs lie entirely past the end
    if (endIndex1 > n)
    {
        // First run is truncated by the array end; second run is empty.
        endIndex1 = n;
        index2 = endIndex2 = n;
    }
    if (index2 > n)
        index2 = endIndex2 = n;     // second run starts past the end
    if (endIndex2 > n)
        endIndex2 = n;              // second run is truncated

    // Standard two-way merge of [index1, endIndex1) and [index2, endIndex2).
    while (index1 < endIndex1 || index2 < endIndex2)
    {
        if (index1 == endIndex1)
            temp[targetIndex++] = a[index2++];        // left run exhausted
        else if (index2 == endIndex2)
            temp[targetIndex++] = a[index1++];        // right run exhausted
        else if (a[index1] <= a[index2])              // <= keeps the merge stable
            temp[targetIndex++] = a[index1++];
        else
            temp[targetIndex++] = a[index2++];
    }
}
// Error-checking wrapper for CUDA runtime calls.
// Fixes vs. the original:
//  - captures and reports the checked call's OWN return value instead of
//    cudaGetLastError(), which may report a different (stale) error and
//    clears the sticky error state as a side effect;
//  - no trailing semicolon after while(0), so the do/while(0) idiom works
//    and `if (x) checkCudaErrors(y); else ...` compiles correctly.
#define checkCudaErrors( a ) do {                                          \
    cudaError_t err_ = (a);                                                \
    if (cudaSuccess != err_) {                                             \
        fprintf(stderr, "Cuda runtime error in line %d of file %s : %s\n", \
                __LINE__, __FILE__, cudaGetErrorString(err_));             \
        exit(EXIT_FAILURE);                                                \
    }                                                                      \
} while(0)
// Check the most recent kernel launch for errors. cudaPeekAtLastError
// reads the error without clearing it.
#define CUDA_POST_KERNEL_CHECK checkCudaErrors(cudaPeekAtLastError())
// Host driver: sorts `data` (N ints) in place via bottom-up GPU mergesort.
// `cost_time` receives the elapsed GPU sort time in milliseconds
// (measured with GetTickCount, so resolution is coarse).
// Returns 0 on success; CUDA errors abort via checkCudaErrors.
int mergesort(int* data, int N, float& cost_time)
{
    int *dev_a = NULL, *dev_temp = NULL;
    checkCudaErrors( cudaMalloc((void**)&dev_a, sizeof(int)*N) );
    checkCudaErrors( cudaMalloc((void**)&dev_temp, sizeof(int)*N) );
    checkCudaErrors( cudaMemcpy(dev_a, data, sizeof(int)*N, cudaMemcpyHostToDevice) );

    const int threads = 512;
    float t0 = GetTickCount();
    int sortedsize = 1;
    while (sortedsize < N)
    {
        // One thread per pair of runs. Size the grid from N: the original's
        // fixed 128x512 launch silently under-launched (left data unmerged)
        // whenever N > 2 * 128 * 512 * sortedsize.
        long long pairs  = ((long long)N + 2LL * sortedsize - 1) / (2LL * sortedsize);
        int       blocks = (int)((pairs + threads - 1) / threads);
        dim3 grids(blocks, 1, 1);
        mergesortK<<<grids, threads>>>(dev_a, dev_temp, sortedsize, N);
        CUDA_POST_KERNEL_CHECK;
        // Each pass fully rewrites [0, N) in temp, so ping-pong the buffers
        // instead of paying a device-to-device copy per pass.
        int* swap_tmp = dev_a; dev_a = dev_temp; dev_temp = swap_tmp;
        sortedsize *= 2;
    }
    // Wait for all passes to finish BEFORE stopping the timer (the original
    // synchronized after freeing the buffers, and after reading the clock).
    checkCudaErrors( cudaDeviceSynchronize() );
    cost_time = GetTickCount() - t0;

    checkCudaErrors( cudaMemcpy(data, dev_a, N*sizeof(int), cudaMemcpyDeviceToHost) );
    checkCudaErrors( cudaFree(dev_a) );
    checkCudaErrors( cudaFree(dev_temp) );
    return 0;
}
# ==== CMakeLists.txt (separate file) ====
# Minimal build script for the CUDA mergesort demo.
cmake_minimum_required(VERSION 2.8)
project(demo)
# NOTE(review): FindCUDA / cuda_add_executable are deprecated in modern
# CMake (3.10+ prefers enable_language(CUDA) and add_executable);
# kept as-is to preserve this demo's build behavior.
find_package(CUDA REQUIRED)
include_directories (.)
#set(CUDA_NVCC_FLAGS -O3;-G;-g)
#SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
# Targets compute capability 6.1 (Pascal, e.g. GTX 10xx);
# adjust arch=/code= for other GPUs.
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_61,code=sm_61;-std=c++11;)
# Gather all headers/sources in the project directory.
file(GLOB_RECURSE CURRENT_HEADERS *.h *.hpp *.cuh)
file(GLOB CURRENT_SOURCES *.cpp *.cu)
source_group("inc" FILES ${CURRENT_HEADERS})
source_group("src" FILES ${CURRENT_SOURCES})
cuda_add_executable(${PROJECT_NAME} ${CURRENT_HEADERS} ${CURRENT_SOURCES})
// ==== demo / main.cpp (separate file) ====
#include "windows.h"
#include "cuda_sort.h"
#include <iostream>
#include <random>
#include <vector>
#include <iterator>
#include <algorithm>
// Demo: sorts N random ints on the GPU, validates against std::sort on the
// CPU, and prints both timings.
int main(int argc, char* argv[])
{
    // Default-seeded engine: the input sequence is reproducible run-to-run.
    std::default_random_engine rnd;
    const int N = 10240;
    int* data = new int[N];
    std::vector<int> data_vec;
    data_vec.reserve(N);
    // Fill both the GPU buffer and the CPU reference with identical values.
    for (int k = 0; k < N; k++)
    {
        data[k] = rnd() % 4096;
        data_vec.push_back(data[k]);
    }
    std::cout << std::endl;

    // GPU sort; mergesort() reports its own elapsed time in ms.
    float cost_gpu = 0.0f;
    mergesort(data, N, cost_gpu);

    // CPU reference sort, timed for comparison.
    float tt0 = GetTickCount();
    std::sort(data_vec.begin(), data_vec.end());
    float tt1 = GetTickCount();

    // Count positions where GPU and CPU results agree; flag == N means
    // the GPU sort matches the reference exactly.
    int flag = 0;
    for (int k = 0; k < N; k++)
    {
        if (data[k] == data_vec[k])
        {
            flag++;
        }
    }
    std::cout << std::endl;
    std::cout << "check result (" << flag << "," << N << ") = " << (flag == N) << std::endl;
    std::cout << "gpu cost " << cost_gpu << "ms" << std::endl;
    std::cout << "cpu cost " << tt1 - tt0 << "ms" << std::endl;

    delete[] data;   // fix: the original leaked this new[] allocation
    return 0;
}