how should I set copy() when using Openacc(parallelization) in C++?

  c++, openacc

I am using gcc compiler. (g++ -o test testfile.cpp)

I want to use Openacc to parallelize my code but I am a bit confused about using #pragma correctly.

Below is the part where I used parallelization.

Even after using Openacc the code is not faster than before.

I guess this is related to ‘data-moving’ thing.

So I think I need to use #pragma acc data copy here. But I am not sure how to use this properly.

Any help?
Thanks in advance.

while (dif >= tol && it <= itmax) {
        system_clock::time_point start = system_clock::now();
        it = it + 1;
        // V = Vnew;
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                for (int k = 0; k < Ny; k++) {
                    V[i][j][k] = Vnew[i][j][k];
                }
            }
        }

        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                Vfuture[i][j] = 0;
                for (int k = 0; k < Ny; k++) {
                    Vfuture[i][j] += beta * Ptrans[k] * Vnew[i][j][k]; // + beta * Ptrans[1] * Vnew[i][j][1] + beta * Ptrans[2] * Vnew[i][j][2]; // Why is this different from Vfuture[i][j] += beta * Vnew[i][j][k] * Ptrans[k]; with for k
                }
            }
        }

        #pragma acc kernels      
        for (int a = 0; a < Na; a++) {
            for (int b = 0; b < Nd; b++) {
                for (int c = 0; c < Ny; c++) {
                    max = -99999;
                    for (int d = 0; d < Na; d++) {
                        for (int e = 0; e < Nd; e++) {
                            Val[d][e] = Uvec[a][b][c][d][e] + Vfuture[d][e];
                            if (max < Val[d][e]) {
                                max = Val[d][e];
                                maxposition_a[a][b][c] = d;
                                maxposition_d[a][b][c] = e;
                            }
                        }
                    }
                    Vnew[a][b][c] = max;
                }
            }
        }

        // Howard improvement
        for (int h = 0; h < H; h++) {
            for (int i = 0; i < Na; i++) {
                for (int j = 0; j < Nd; j++) {
                    for (int k = 0; k < Ny; k++) {
                        Vhoward[i][j][k] = Vnew[i][j][k];
                    }
                }
            }

            for (int i = 0; i < Na; i++) {
                for (int j = 0; j < Nd; j++) {
                    for (int k = 0; k < Ny; k++) {
                        temphoward[i][j] = beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][0] * Ptrans[0]
                            + beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][1] * Ptrans[1]
                            + beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][2] * Ptrans[2];
                        Vnew[i][j][k] = temphoward[i][j] + Uvec[i][j][k][maxposition_a[i][j][k]][maxposition_d[i][j][k]];
                    }
                }
            }
        }


        // Calculate Diff
        dif = -100000;
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                for (int k = 0; k < Ny; k++) {
                    tempdiff[i][j][k] = abs(V[i][j][k] - Vnew[i][j][k]);
                    if (tempdiff[i][j][k] > dif) {
                        dif = tempdiff[i][j][k];
                    }
                }
            }
        }

        system_clock::time_point end = system_clock::now();
        std::chrono::duration<float> sec = end - start;


        cout << dif << endl;
        cout << it << endl;
        cout << sec.count() << endl;
    }

    for (int k = 0; k < Ny; k++) {
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                cout << Vnew[i][j][k];
            }
            cout << 'n';
        }
    }

}

Source: Windows Questions C++

LEAVE A COMMENT