Sample code from the OpenACC book (Jacobi iteration), part 4

▶ Solving the Poisson equation numerically with Jacobi iteration
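
For reference, the inner kernel below (code lines 73-74) is the standard five-point Jacobi update for the discretized Poisson equation ∇²u = f on a grid with spacings hx and wy. In the code, c1 = hx²·wy², c2 = 1/(2(hx² + wy²)), and fij stores the negated right-hand side -f = -4 (the boundary data uval(x, y) = x² + y² satisfies ∇²u = 4), so each sweep computes

\[
u^{k+1}_{i,j} = \frac{wy^2\left(u^{k}_{i-1,j} + u^{k}_{i+1,j}\right) + hx^2\left(u^{k}_{i,j-1} + u^{k}_{i,j+1}\right) - hx^2\,wy^2\, f}{2\left(hx^2 + wy^2\right)},
\qquad f = 4.
\]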

● Use the routine directive to wrap the squared-norm function, keep u1 on the device to further reduce copy overhead, and perform the array initialization on the device as well
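
The #pragma acc routine seq in front of uval makes the compiler generate a device-callable version of the function in addition to the host one, which is what lets the boundary-initialization loops inside the kernels region (code lines 50-60) call it; seq means the routine itself runs sequentially on a single thread and contains no further parallelism.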

 1 #include <stdio.h>
 2 #include <stdlib.h>
 3 #include <math.h>
 4 #include <time.h>
 5 #include <openacc.h>
 6 
 7 #if defined(_WIN32) || defined(_WIN64)
 8 #include <C:\Program Files\PGI\win64\19.4\include\wrap\sys\timeb.h>    
 9 #define timestruct clock_t
10 #define gettime(a) (*(a) = clock())
11 #define usec(t1,t2) (t2 - t1)
12 #else
13 #include <sys/time.h>
14 #define gettime(a)  gettimeofday(a, NULL)
15 #define usec(t1,t2) (((t2).tv_sec - (t1).tv_sec) * 1000000 + (t2).tv_usec - (t1).tv_usec)   
16 typedef struct timeval timestruct;
17 #endif
18 
19 #pragma acc routine seq
 20 inline float uval(float x, float y)                // exact solution / boundary value: u(x, y) = x*x + y*y
21 {
22     return x * x + y * y;
23 }
24 
25 int main()
26 {
27     const int row = 8191, col = 1023;
28     const float height = 1.0, width = 2.0;
29     const float hx = height / row, wy = width / col;
 30     const float fij = -4.0f;                        // negated Poisson RHS: laplacian of (x*x + y*y) is 4
31     const float hx2 = hx * hx, wy2 = wy * wy, c1 = hx2 * wy2, c2 = 1.0f / (2.0 * (hx2 + wy2));
32     const int maxIter = 100;
33     const int colPlus = col + 1;
34 
35     float *restrict u0 = (float *)malloc(sizeof(float)*(row + 1)*colPlus);
36     float *restrict u1 = (float *)malloc(sizeof(float)*(row + 1)*colPlus);
37     float *utemp = NULL;
38 
39     timestruct t1, t2;
40     acc_init(acc_device_nvidia);
41     gettime(&t1);
42 #pragma acc data copyout(u0[0:(row + 1) * colPlus]) create(u1[0:(row + 1) * colPlus])
43     {
44 #pragma acc kernels present(u0[0:((row + 1) * colPlus)])
45         {
46 #pragma acc loop independent 
47             for (int i = 0; i < (row + 1)*(col + 1); i++)
48                 u0[i] = 0.0f;
49 #pragma acc loop independent
50             for (int ix = 0; ix <= row; ix++)
51                 u0[ix * colPlus + 0] = uval(ix * hx, 0.0f);
52 #pragma acc loop independent
53             for (int ix = 0; ix <= row; ix++)
54                 u0[ix * colPlus + col] = uval(ix * hx, col * wy);
55 #pragma acc loop independent
56             for (int jy = 0; jy <= col; jy++)
57                 u0[jy] = uval(0.0f, jy * wy);
58 #pragma acc loop independent
59             for (int jy = 0; jy <= col; jy++)
60                 u0[row * colPlus + jy] = u1[row * colPlus + jy] = uval(row * hx, jy * wy);
61         }
62         
63         for (int iter = 0; iter < maxIter; iter++)
64         {
65 #pragma acc kernels present(utemp, u0[0:((row + 1) * colPlus)], u1[0:((row + 1) * colPlus)])
66             {
67 #pragma acc loop independent
68                 for (int ix = 1; ix < row; ix++)
69                 {
70 #pragma acc loop independent
71                     for (int jy = 1; jy < col; jy++)
72                     {
73                         u1[ix*colPlus + jy] = (c1*fij + wy2 * (u0[(ix - 1)*colPlus + jy] + u0[(ix + 1)*colPlus + jy]) + \
74                             hx2 * (u0[ix*colPlus + jy - 1] + u0[ix*colPlus + jy + 1])) * c2;
75                     }
76                 }
77             }
 78             utemp = u0, u0 = u1, u1 = utemp;        // swap the two buffers for the next sweep
79 #pragma acc wait
80         }                
81     }
82     gettime(&t2);
83 
84     long long timeElapse = usec(t1, t2);
85 #if defined(_WIN32) || defined(_WIN64)
 86     printf("\nElapsed time: %13lld ms.\n", timeElapse);
87 #else    
 88     printf("\nElapsed time: %13lld us.\n", timeElapse);
89 #endif
90     free(u0);
91     free(u1);
92     acc_shutdown(acc_device_nvidia);
93     //getchar();
94     return 0;
95 }
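
The listing only reports the elapsed time. Since uval(x, y) = x² + y² is the analytic solution the boundary data is sampled from, a quick host-side sanity check can be added after the data region; by then the copyout clause has already brought u0 back. Below is a minimal sketch (not part of the original code), assuming it is placed right after gettime(&t2) and reusing the listing's variables; note that 100 sweeps on a grid this size are far from converged, so the printed error only confirms that the machinery works, not the final accuracy.

    /* hedged sketch (not in the original listing): compare u0 with the analytic solution on the host */
    float maxErr = 0.0f;
    for (int ix = 0; ix <= row; ix++)
        for (int jy = 0; jy <= col; jy++)
        {
            float err = fabsf(u0[ix * colPlus + jy] - uval(ix * hx, jy * wy));
            if (err > maxErr)
                maxErr = err;
        }
    printf("Max abs error vs. uval: %e\n", maxErr);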

● Output: on Win10, nvvp shows the total time is down to 247 ms (the final copyout of u0 is now the only host-device transfer left), while the compute time stays at about 63 ms, essentially unchanged

D:\Code\OpenACC>pgcc main.c -o main.exe -c99 -Minfo -acc
main:
     42, Generating copyout(u0[:colPlus*(row+1)])
         Generating create(u1[:colPlus*(row+1)])
     44, Generating present(u0[:colPlus*(row+1)])
     47, Loop is parallelizable
         Generating Tesla code
         47, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     47, Memory zero idiom, loop replaced by call to __c_mzero4
     50, Loop is parallelizable
         Generating Tesla code
         50, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     53, Loop is parallelizable
         Generating Tesla code
         53, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     56, Loop is parallelizable
         Generating Tesla code
         56, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     59, Loop is parallelizable
         Generating Tesla code
         59, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     63, Generating present(utemp[:],u1[:colPlus*(row+1)],u0[:colPlus*(row+1)])
     68, Loop is parallelizable
     71, Loop is parallelizable
         Generating Tesla code
         68, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
         71, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
     71, FMA (fused multiply-add) instruction(s) generated
uval:
     21, Generating acc routine seq
         Generating Tesla code
     22, FMA (fused multiply-add) instruction(s) generated

D:\Code\OpenACC>main.exe

Elapsed time:            63 ms.

● Output: on Ubuntu the elapsed time also drops slightly
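(The "Accelerator Kernel Timing data" report below is the per-region summary the PGI runtime prints at program exit when the PGI_ACC_TIME environment variable is set to a non-zero value; presumably that is how these numbers were collected.)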

...

Elapsed time:         63274 us.

Accelerator Kernel Timing data
/home/cuan/my.c
  main  NVIDIA  devicenum=0
    time(us): 36,680
    42: data region reached 2 times
        82: data copyout transfers: 3
             device time(us): total=2,587 max=1,299 min=10 avg=862
    44: compute region reached 1 time
        47: kernel launched 1 time
            grid: [65535]  block: [128]
             device time(us): total=155 max=155 min=155 avg=155
            elapsed time(us): total=317 max=317 min=317 avg=317
        50: kernel launched 1 time
            grid: [64]  block: [128]
             device time(us): total=5 max=5 min=5 avg=5
            elapsed time(us): total=17 max=17 min=17 avg=17
        53: kernel launched 1 time
            grid: [64]  block: [128]
             device time(us): total=5 max=5 min=5 avg=5
            elapsed time(us): total=16 max=16 min=16 avg=16
        56: kernel launched 1 time
            grid: [8]  block: [128]
             device time(us): total=1 max=1 min=1 avg=1
            elapsed time(us): total=12 max=12 min=12 avg=12
        59: kernel launched 1 time
            grid: [8]  block: [128]
             device time(us): total=2 max=2 min=2 avg=2
            elapsed time(us): total=13 max=13 min=13 avg=13
    44: data region reached 2 times
    63: data region reached 200 times
    65: compute region reached 100 times
        71: kernel launched 100 times
            grid: [32x1024]  block: [32x4]
             device time(us): total=33,925 max=358 min=336 avg=339
            elapsed time(us): total=36,792 max=872 min=347 avg=367


Reposted from www.cnblogs.com/cuancuancuanhao/p/11033195.html