// Source / copyright notice: https://blog.csdn.net/z0n1l2/article/details/86412079
// GPU merge sort implementation (CUDA)
#include "windows.h"
#include "cuda_sort.h"
#include <iostream>
// One pass of bottom-up mergesort.
// Each thread merges two adjacent sorted runs of length `sortedsize`
// from `a` into `temp`, producing runs of length 2*sortedsize.
// Launch with enough total threads that threads * 2 * sortedsize >= N;
// threads whose runs start at or past N return immediately.
//   a          - device input, holds sorted runs of length `sortedsize`
//   temp       - device output, receives merged runs
//   sortedsize - current run length (>= 1)
//   N          - total element count
__global__ void mergesortK(int *a, int *temp, int sortedsize, int N)
{
    // Flatten a (potentially 3D) grid into a linear thread id.
    int blockid = blockIdx.z * gridDim.x * gridDim.y
                + blockIdx.y * gridDim.x
                + blockIdx.x;
    unsigned long id = (unsigned long)blockid * blockDim.x + threadIdx.x;

    // Widen BEFORE multiplying: the original computed id*2*sortedsize in
    // 32-bit int, which silently overflows for large N before the result
    // is stored into the unsigned long index variables.
    unsigned long index1    = id * 2UL * (unsigned long)sortedsize;
    unsigned long endIndex1 = index1 + sortedsize;
    unsigned long index2    = endIndex1;
    unsigned long endIndex2 = index2 + sortedsize;
    unsigned long targetIndex = index1;
    unsigned long n = (unsigned long)N;

    if (index1 >= n) return;        // both runs lie entirely past the end
    if (endIndex1 > n)
    {
        // First run is truncated by the array end; second run is empty.
        endIndex1 = n;
        index2 = endIndex2 = n;
    }
    if (index2 > n)
        index2 = endIndex2 = n;     // second run starts past the end
    if (endIndex2 > n)
        endIndex2 = n;              // second run is truncated

    // Standard two-way merge of [index1, endIndex1) and [index2, endIndex2).
    while (index1 < endIndex1 || index2 < endIndex2)
    {
        if (index1 == endIndex1)
            temp[targetIndex++] = a[index2++];        // left run exhausted
        else if (index2 == endIndex2)
            temp[targetIndex++] = a[index1++];        // right run exhausted
        else if (a[index1] <= a[index2])              // <= keeps the merge stable
            temp[targetIndex++] = a[index1++];
        else
            temp[targetIndex++] = a[index2++];
    }
}
// Error-checking wrapper for CUDA runtime calls.
// Fixes vs. the original:
//  - captures and reports the checked call's OWN return value instead of
//    cudaGetLastError(), which may report a different (stale) error and
//    clears the sticky error state as a side effect;
//  - no trailing semicolon after while(0), so the do/while(0) idiom works
//    and `if (x) checkCudaErrors(y); else ...` compiles correctly.
#define checkCudaErrors( a ) do {                                          \
    cudaError_t err_ = (a);                                                \
    if (cudaSuccess != err_) {                                             \
        fprintf(stderr, "Cuda runtime error in line %d of file %s : %s\n", \
                __LINE__, __FILE__, cudaGetErrorString(err_));             \
        exit(EXIT_FAILURE);                                                \
    }                                                                      \
} while(0)
// Check the most recent kernel launch for errors. cudaPeekAtLastError
// reads the error without clearing it.
#define CUDA_POST_KERNEL_CHECK checkCudaErrors(cudaPeekAtLastError())
// Host driver: sorts `data` (N ints) in place via bottom-up GPU mergesort.
// `cost_time` receives the elapsed GPU sort time in milliseconds
// (measured with GetTickCount, so resolution is coarse).
// Returns 0 on success; CUDA errors abort via checkCudaErrors.
int mergesort(int* data, int N, float& cost_time)
{
    int *dev_a = NULL, *dev_temp = NULL;
    checkCudaErrors( cudaMalloc((void**)&dev_a, sizeof(int)*N) );
    checkCudaErrors( cudaMalloc((void**)&dev_temp, sizeof(int)*N) );
    checkCudaErrors( cudaMemcpy(dev_a, data, sizeof(int)*N, cudaMemcpyHostToDevice) );

    const int threads = 512;
    float t0 = GetTickCount();
    int sortedsize = 1;
    while (sortedsize < N)
    {
        // One thread per pair of runs. Size the grid from N: the original's
        // fixed 128x512 launch silently under-launched (left data unmerged)
        // whenever N > 2 * 128 * 512 * sortedsize.
        long long pairs  = ((long long)N + 2LL * sortedsize - 1) / (2LL * sortedsize);
        int       blocks = (int)((pairs + threads - 1) / threads);
        dim3 grids(blocks, 1, 1);
        mergesortK<<<grids, threads>>>(dev_a, dev_temp, sortedsize, N);
        CUDA_POST_KERNEL_CHECK;
        // Each pass fully rewrites [0, N) in temp, so ping-pong the buffers
        // instead of paying a device-to-device copy per pass.
        int* swap_tmp = dev_a; dev_a = dev_temp; dev_temp = swap_tmp;
        sortedsize *= 2;
    }
    // Wait for all passes to finish BEFORE stopping the timer (the original
    // synchronized after freeing the buffers, and after reading the clock).
    checkCudaErrors( cudaDeviceSynchronize() );
    cost_time = GetTickCount() - t0;

    checkCudaErrors( cudaMemcpy(data, dev_a, N*sizeof(int), cudaMemcpyDeviceToHost) );
    checkCudaErrors( cudaFree(dev_a) );
    checkCudaErrors( cudaFree(dev_temp) );
    return 0;
}
# ==== CMakeLists.txt (separate file) ====
# Minimal build script for the CUDA mergesort demo.
cmake_minimum_required(VERSION 2.8)
project(demo)
# NOTE(review): FindCUDA / cuda_add_executable are deprecated in modern
# CMake (3.10+ prefers enable_language(CUDA) and add_executable);
# kept as-is to preserve this demo's build behavior.
find_package(CUDA REQUIRED)
include_directories (.)
#set(CUDA_NVCC_FLAGS -O3;-G;-g)
#SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
# Targets compute capability 6.1 (Pascal, e.g. GTX 10xx);
# adjust arch=/code= for other GPUs.
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_61,code=sm_61;-std=c++11;)
# Gather all headers/sources in the project directory.
file(GLOB_RECURSE CURRENT_HEADERS *.h *.hpp *.cuh)
file(GLOB CURRENT_SOURCES *.cpp *.cu)
source_group("inc" FILES ${CURRENT_HEADERS})
source_group("src" FILES ${CURRENT_SOURCES})
cuda_add_executable(${PROJECT_NAME} ${CURRENT_HEADERS} ${CURRENT_SOURCES})
// ==== demo / main.cpp (separate file) ====
#include "windows.h"
#include "cuda_sort.h"
#include <iostream>
#include <random>
#include <vector>
#include <iterator>
#include <algorithm>
// Demo: sorts N random ints on the GPU, validates against std::sort on the
// CPU, and prints both timings.
int main(int argc, char* argv[])
{
    // Default-seeded engine: the input sequence is reproducible run-to-run.
    std::default_random_engine rnd;
    const int N = 10240;
    int* data = new int[N];
    std::vector<int> data_vec;
    data_vec.reserve(N);
    // Fill both the GPU buffer and the CPU reference with identical values.
    for (int k = 0; k < N; k++)
    {
        data[k] = rnd() % 4096;
        data_vec.push_back(data[k]);
    }
    std::cout << std::endl;

    // GPU sort; mergesort() reports its own elapsed time in ms.
    float cost_gpu = 0.0f;
    mergesort(data, N, cost_gpu);

    // CPU reference sort, timed for comparison.
    float tt0 = GetTickCount();
    std::sort(data_vec.begin(), data_vec.end());
    float tt1 = GetTickCount();

    // Count positions where GPU and CPU results agree; flag == N means
    // the GPU sort matches the reference exactly.
    int flag = 0;
    for (int k = 0; k < N; k++)
    {
        if (data[k] == data_vec[k])
        {
            flag++;
        }
    }
    std::cout << std::endl;
    std::cout << "check result (" << flag << "," << N << ") = " << (flag == N) << std::endl;
    std::cout << "gpu cost " << cost_gpu << "ms" << std::endl;
    std::cout << "cpu cost " << tt1 - tt0 << "ms" << std::endl;

    delete[] data;   // fix: the original leaked this new[] allocation
    return 0;
}