【CUDA】Matrix Multiplication (from simple to advanced)

When learning CUDA, implementing matrix multiplication is almost unavoidable; it gives you a deeper understanding of how the GPU works and how to use shared memory. Below is a CUDA Fortran implementation of a simple (naive) matrix multiplication and of a tiled version that stages blocks of the matrices in shared memory.
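
The listing below is CUDA Fortran, so it needs the NVIDIA HPC SDK (or older PGI) compilers. Assuming the source is saved as, say, matmul.cuf (the name is only a placeholder; the .cuf extension is what tells the compiler to treat it as CUDA Fortran), it can typically be built and run with something like:

nvfortran -o matmul matmul.cuf
./matmul

With the older PGI toolchain, pgf90 -Mcuda serves the same purpose.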

! this program demonstrates various memory optimization techniques
! applied to a matrix multiplication.

module dimensions_m

  implicit none

  integer, parameter :: TILE_DIM = 16
  integer, parameter :: NUM_REPS = 100
  integer, parameter :: nx = 256, ny = 256


end module dimensions_m

module kernels_matrix
  use dimensions_m
  implicit none
  contains
! Not actually used in the runs below, but printing its output shows how the data moves,
! which helps work out which global-memory elements each thread has to fetch in the shared-memory version.
  attributes(global) subroutine matrixMulTestKernel(dA,dB,dC)

    real :: dA(ny,nx),dB(ny,nx),dC(ny,nx)
    real,shared :: tmA(TILE_DIM,TILE_DIM),tmB(TILE_DIM,TILE_DIM)
    integer :: x, y, i,j,tx,ty,ti
    real :: tmc


    tx = threadIdx%x
    ty = threadIdx%y
    x = (blockIdx%x-1) * blockDim%x + threadIdx%x
    y = (blockIdx%y-1) * blockDim%y + threadIdx%y
    tmc = 0.0
    i = TILE_DIM
        if(tx <= nx .and. ty <= ny) then
            tmA(tx,ty) = dA(x,ty+i)
            tmB(tx,ty) = dB(tx+i,ty)
            call syncthreads()
            do j = 1,TILE_DIM
                tmc = tmA(tx,ty) !tmA(tx,j) *tmB(j,ty) + tmc
            end do
            call syncthreads()
        end if
    dC(x,y) = tmc
  end subroutine matrixMulTestKernel
! Simple matrix multiplication: each thread works out its position within the overall
! thread grid and then computes the single result element at that position.
  attributes(global) subroutine matrixMulKernel(dA,dB,dC)

    real :: dA(ny,nx),dB(ny,nx),dC(ny,nx)
    integer :: x, y, j

    x = (blockIdx%x-1) * blockDim%x + threadIdx%x
    y = (blockIdx%y-1) * blockDim%y + threadIdx%y
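    ! x and y are the 1-based global row and column of the output element this thread computes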

    dC(x,y) = 0.0
    do j = 1,nx
        dC(x,y) = dA(x,j)*dB(j,y) + dC(x,y)
    end do
  end subroutine matrixMulKernel

! Shared-memory version: each block first loads a tile of each input matrix into shared
! memory and accumulates a partial result, then loops over the tiles to build up the final
! result. Staging the data in shared memory cuts the number of times each thread has to
! fetch data from global memory, which speeds up the computation.
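! To make the saving concrete (rough arithmetic, assuming nx and ny are multiples of TILE_DIM):
! in the naive kernel every element of dA is read from global memory ny times and every element
! of dB nx times, whereas with TILE_DIM x TILE_DIM tiles each element is loaded only once per
! block that needs it, i.e. ny/TILE_DIM times for dA (nx/TILE_DIM for dB) - a TILE_DIM-fold
! reduction, 16x with the sizes used here.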
  attributes(global) subroutine matrixMulSharedKernel(dA,dB,dC)

    real :: dA(ny,nx),dB(ny,nx),dC(ny,nx)
    real,shared :: tmA(TILE_DIM,TILE_DIM),tmB(TILE_DIM,TILE_DIM)
    integer :: x, y, i,j,tx,ty,ti
    real :: tmc


    tx = threadIdx%x
    ty = threadIdx%y
    x = (blockIdx%x-1) * blockDim%x + threadIdx%x
    y = (blockIdx%y-1) * blockDim%y + threadIdx%y
    tmc = 0.0
! loop over the tiles, accumulating the final result
    do i = 0,nx-1,TILE_DIM
        if(tx+i <= nx .and. ty+i <= ny) then
            tmA(tx,ty) = dA(x,ty+i)
            tmB(tx,ty) = dB(tx+i,y)
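            ! tmA now holds the dA tile for this block's rows and columns i+1..i+TILE_DIM,
            ! and tmB the dB tile for rows i+1..i+TILE_DIM and this block's columns; the
            ! barrier below ensures both tiles are fully loaded before any thread uses them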
            call syncthreads()
            do j = 1,TILE_DIM
                tmc = tmA(tx,j)*tmB(j,ty) + tmc
            end do
            call syncthreads()
        end if
    end do
    dC(x,y) = tmc
  end subroutine matrixMulSharedKernel
end module kernels_matrix

program matrixMulMatrix

  use cudafor
  use dimensions_m
  use kernels_matrix

  implicit none

  type (dim3) :: grid, tBlock
  type (cudaEvent) :: startEvent, stopEvent
  type (cudaDeviceProp) :: prop
  real :: time

  real :: hA(nx,ny), hB(nx,ny), hC(ny,nx),gold(ny,nx)
  real, device :: dA(nx,ny), dB(nx,ny), dC(ny,nx)

  integer :: i,ii, j,k, istat,errflag
  integer :: cputime(0:1),timediff,clock_rate,clock_max

  ! check parameters and calculate execution configuration

!  if (mod(nx, TILE_DIM) /= 0 &
!       .or. mod(ny, TILE_DIM) /= 0) then
!     write(*,*) 'nx and ny must be a multiple of TILE_DIM'
!     stop
!  end if
!

  grid = dim3(nx/TILE_DIM, ny/TILE_DIM, 1)
  tBlock = dim3(TILE_DIM, TILE_DIM, 1)
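  ! one thread per element of C: a (nx/TILE_DIM) x (ny/TILE_DIM) grid of TILE_DIM x TILE_DIM thread blocks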

  ! write parameters

  i = cudaGetDeviceProperties(prop, 0)
  write(*,"(/,'Device Name: ',a)") trim(prop%name)
  write(*,"('Compute Capability: ',i0,'.',i0)") &
       prop%major, prop%minor


  write(*,*)
  write(*,"('Matrix size:', i5, i5, ',  &
        Tile size:', i3, i3)") &
       nx, ny, TILE_DIM, TILE_DIM

  write(*,"('grid:', i4,i4,i4, ',   tBlock:', i4,i4,i4)") &
       grid%x, grid%y, grid%z, tBlock%x, tBlock%y, tBlock%z

  ! initialize data

  ! host

  do j = 1, ny
     do i = 1, nx
        hA(i,j) = i-1
        hB(i,j) = j-1
     enddo
  enddo
! compute the reference (gold) result on the CPU
  call system_clock(cputime(0),clock_rate,clock_max)
  do j = 1, ny
     do i = 1, nx
        gold(i,j) = 0
        do k = 1, nx
            gold(i,j) = hA(i,k) * hB(k,j) + gold(i,j)
        end do
     enddo
  enddo
  call system_clock(cputime(1),clock_rate,clock_max)
  timediff = cputime(1) - cputime(0)
       write(*,'(a20,$)') '*** CpuTime(ms):'
       write(*,'(F20.2)') real(timediff)*1000/real(clock_rate)
!  do i = 1, nx
!      do j = 1, ny
!         write(*,"(F10.3,$)") hA(i,j)
!      end do
!      write(*,*)
!  end do
!  write(*,*)
!  do i = 1, nx
!      do j = 1, ny
!         write(*,"(F10.3,$)") hB(i,j)
!      end do
!      write(*,*)
!  end do

  ! device

  dB = hB
  dA = hA

  ! events for timing
  ! global memory version
  write(*,'(a30)') '*** global memory results ***'
  istat = cudaEventCreate(startEvent)
  istat = cudaEventCreate(stopEvent)

  istat = cudaEventRecord(startEvent, 0)
  do i=1, NUM_REPS
     call matrixMulKernel<<<grid, tBlock>>>(dA,dB,dC)
  end do
  istat = cudaEventRecord(stopEvent, 0)
  istat = cudaEventSynchronize(stopEvent)
  istat = cudaEventElapsedTime(time, startEvent, stopEvent)

  hC = dC
  call postprocess(gold,hC,time)

  ! shared memory version
  write(*,'(a30)') '*** shared memory results ***'
  istat = cudaEventCreate(startEvent)
  istat = cudaEventCreate(stopEvent)

  istat = cudaEventRecord(startEvent, 0)
  do i=1, NUM_REPS
     call matrixMulSharedKernel<<<grid, tBlock>>>(dA,dB,dC)
  end do
  istat = cudaEventRecord(stopEvent, 0)
  istat = cudaEventSynchronize(stopEvent)
  istat = cudaEventElapsedTime(time, startEvent, stopEvent)

  hC = dC
  call postprocess(gold,hC,time)

  istat = cudaEventDestroy(startEvent)
  istat = cudaEventDestroy(stopEvent)


! result verification subroutine
contains
  subroutine postprocess(ref, res, t)
    real, intent(in) :: ref(:,:), res(:,:), t
    if (all(res == ref)) then
       write(*,'(a20)') '*** Test Passed ***'
       write(*,'(a20,$)') '*** ElapsedTime(ms):'
       write(*,'(f20.2)') (t/NUM_REPS)
    else
       write(*,'(a20)') '*** Failed ***'
    end if
  end subroutine postprocess


end program matrixMulMatrix