I am trying to create a program to approximate the value of PI using the integral of `sqrt(1-x^2)`

and the range `-1, 1`

.

To calculate the integral, I split the range to `2^(input)`

rectangles and calculate the size of each one of them in a different thread (not exactly, I actually calculate a batch of them in each thread), and than sum them all up in a variable named `d_res`

using `atomicAdd()`

.

Here is my code:

```
#include <stdio.h>
#include <math.h>
#include <iostream>
static void HandleError( cudaError_t err, const char *file, int line ) {
if (err != cudaSuccess) {
printf( "%s in %s at line %dn", cudaGetErrorString( err ), file, line );
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
__device__ double power(double a, double b){
return pow(a, b);
}
__global__ void pi_approx_kernel(double n, double *res, double batch_size)
{
const double index = threadIdx.x + blockDim.x * blockIdx.x;
// long int threads = blockDim.x * gridDim.x * blockDim.y * gridDim.y;
const double a = -1;
const double b = 1;
const double width = (b-a) / n;
double x = 0;
double num = 0;
double batch_index = batch_size * index;
for (double i = 0; i < batch_size; i++){
x = width * (batch_index + i) + a + width/2;
num += sqrt(1-power(x, 2)) * width;
}
atomicAdd(res, num);
}
double pi_approx_gpu(double iters, int block_count=-1, int thread_count=-1){
// const double batch_size = iters / (block_count * thread_count);
double batch_size;
if (block_count == -1 || thread_count == -1){
if (iters / (1024 * 1024) >= 1024){
block_count = 1024;
thread_count = 1024;
}
else {
block_count = ceil(iters / (1024 * 1024));
thread_count = iters / block_count / 1024;
}
}
batch_size = ceil(iters / (block_count * thread_count));
double *d_res = 0;
double res = 0;
printf("Batch size: %lfnn", batch_size);
printf("Estimated GPU time in seconds (less accurate for less than 2^30): %lfnn", 0.72 * iters / pow(2, 30));
cudaMalloc((void**) &d_res, sizeof(double));
cudaMemcpy(d_res, &res, sizeof(double), cudaMemcpyHostToDevice);
float elapsed = 0;
cudaEvent_t start, stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start, 0));
pi_approx_kernel<<<block_count, thread_count>>>(iters, d_res, batch_size);
HANDLE_ERROR(cudaEventRecord(stop, 0));
HANDLE_ERROR(cudaEventSynchronize(stop));
HANDLE_ERROR(cudaEventElapsedTime(&elapsed, start, stop) );
HANDLE_ERROR(cudaEventDestroy(start));
HANDLE_ERROR(cudaEventDestroy(stop));
printf("With GPU: %f secondsn", elapsed / 1000);
cudaMemcpy(&res, d_res, sizeof(double), cudaMemcpyDeviceToHost);
printf("GPU approx: %.50lfnn", res * 2);
cudaFree(d_res);
return res * 2;
}
int main()
{
double x;
std::cout << "2^() iterations: "; // Type a number and press enter
std::cin >> x; // Get user input from the keyboard
const double iters = pow(2, x);
pi_approx_gpu(iters);
return 0;
}
```

The problem is that even though I use `atomicAdd()`

, the results are still different every time I run the code (for instance try to run it a few times with the input `30`

and compare the results).

Why can this happen and how can I fix it? Thanks in advance.

Source: Windows Questions C++