CUDA race conditions even though I used atomicAdd

  atomic, c++, cuda, race-condition

I am trying to write a program that approximates the value of PI using the integral of sqrt(1-x^2) over the range [-1, 1].
To calculate the integral, I split the range into 2^(input) rectangles and compute the area of each one in a different thread (not exactly: each thread actually handles a batch of them), and then sum them all up into a variable named d_res using atomicAdd().
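
In other words, each thread accumulates a batch of terms of the midpoint-rule sum (writing out what the kernel computes, with n = 2^(input)):

\[
\frac{\pi}{2} = \int_{-1}^{1}\sqrt{1-x^2}\,dx \;\approx\; \sum_{i=0}^{n-1}\sqrt{1-x_i^{2}}\,\Delta x,
\qquad \Delta x = \frac{2}{n},\quad x_i = -1 + \Bigl(i+\tfrac{1}{2}\Bigr)\Delta x,
\]

and the host multiplies the final result by 2 to get PI.
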
Here is my code:

#include <stdio.h>
#include <math.h>
#include <iostream>

static void HandleError( cudaError_t err, const char *file, int line ) {
    if (err != cudaSuccess) {
        printf( "%s in %s at line %dn", cudaGetErrorString( err ), file, line );
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))


__device__ double power(double a, double b){
    return pow(a, b);
}

__global__ void pi_approx_kernel(double n, double *res, double batch_size)
{
    const double index = threadIdx.x + blockDim.x * blockIdx.x;

    // long int threads = blockDim.x * gridDim.x * blockDim.y * gridDim.y;

    const double a = -1;
    const double b = 1;

    const double width = (b-a) / n;

    double x = 0;
    double num = 0;
    double batch_index = batch_size * index;


    // sum the areas of this thread's batch of rectangles (midpoint rule)
    for (double i = 0; i < batch_size; i++){
        x = width * (batch_index + i) + a + width/2;
        num += sqrt(1-power(x, 2)) * width;
    }

    // add this thread's partial sum to the global result
    atomicAdd(res, num);
}

double pi_approx_gpu(double iters, int block_count=-1, int thread_count=-1){
    // const double batch_size = iters / (block_count * thread_count);
    double batch_size;

    if (block_count == -1 || thread_count == -1){
        if (iters / (1024 * 1024) >= 1024){
            block_count = 1024;
            thread_count = 1024;
        }
        else {
            block_count = ceil(iters / (1024 * 1024));
            thread_count = iters / block_count / 1024;
        }
    }

    batch_size = ceil(iters / (block_count * thread_count));

    double *d_res = 0;
    double res = 0;

    printf("Batch size: %lfnn", batch_size);
    printf("Estimated GPU time in seconds (less accurate for less than 2^30): %lfnn", 0.72 * iters / pow(2, 30));
    
    cudaMalloc((void**) &d_res, sizeof(double));
    cudaMemcpy(d_res, &res, sizeof(double), cudaMemcpyHostToDevice);
    

    float elapsed = 0;
    cudaEvent_t start, stop;

    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));

    HANDLE_ERROR(cudaEventRecord(start, 0));

    pi_approx_kernel<<<block_count, thread_count>>>(iters, d_res, batch_size);

    HANDLE_ERROR(cudaEventRecord(stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(stop));

    HANDLE_ERROR(cudaEventElapsedTime(&elapsed, start, stop) );

    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));

    printf("With GPU: %f secondsn", elapsed / 1000);


    cudaMemcpy(&res, d_res, sizeof(double), cudaMemcpyDeviceToHost);

    printf("GPU approx: %.50lfnn", res * 2);
    
    cudaFree(d_res);

    return res * 2;
}


int main()
{
    double x; 
    std::cout << "2^() iterations: "; // Type a number and press enter
    std::cin >> x; // Get user input from the keyboard

    const double iters = pow(2, x);

    pi_approx_gpu(iters);


    return 0;
}
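
With the input 30, for example, the launch configuration works out to block_count = 1024, thread_count = 1024, and batch_size = ceil(2^30 / (1024 * 1024)) = 1024 rectangles per thread.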

The problem is that even though I use atomicAdd(), the results are still different every time I run the code (for instance, try running it a few times with the input 30 and comparing the results).
Why does this happen, and how can I fix it? Thanks in advance.
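
For comparison, here is a single-threaded CPU version of the same midpoint sum (just a quick sketch I use as a reference, not part of the GPU program above); it prints the same digits on every run:

#include <cmath>
#include <cstdio>

// Single-threaded reference of the same midpoint-rule sum.
double pi_approx_cpu(double n)
{
    const double width = 2.0 / n;                           // (b - a) / n with a = -1, b = 1
    double sum = 0.0;
    for (long long i = 0; i < (long long)n; i++) {
        double x = width * (double)i - 1.0 + width / 2.0;   // midpoint of rectangle i
        sum += sqrt(1.0 - x * x) * width;                   // rectangle area
    }
    return sum * 2.0;                                       // the sum approximates PI / 2
}

int main()
{
    // 2^20 rectangles runs quickly on the CPU and is enough to see stable digits.
    printf("CPU approx: %.17f\n", pi_approx_cpu(pow(2, 20)));
    return 0;
}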
