#include <stdio.h>

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
#include <curand.h>
#include <curand_kernel.h>

#define THREADS_PER_BLOCK 1024
#define BLOCKS_NUMBER 2048
#define N (BLOCKS_NUMBER*THREADS_PER_BLOCK)

__global__ void setup_kernel(curandState *state){

  int i = threadIdx.x+blockDim.x*blockIdx.x;
  curand_init(1234, i, 0, &state[i]);
}

/* Calculate PI */
__global__ void CalculatePiMC(curandState *my_curandstate, int *out)
{
	__shared__ int temp[THREADS_PER_BLOCK];
	
    int gindex = blockDim.x * blockIdx.x + threadIdx.x;
	curandState localState = my_curandstate[gindex];
	
	double x = curand_uniform(&localState);
	double y = curand_uniform(&localState);
	
	if(x*x + y*y <= 1)
		temp[threadIdx.x] = 1;
	else
		temp[threadIdx.x] = 0;

	my_curandstate[gindex] = localState;
	
	__syncthreads();
	
	if(threadIdx.x == 0)
	{
		int s = 0;
		for(int i = 0; i < THREADS_PER_BLOCK; i++)
			s += temp[i];
		out[blockIdx.x] = s;		
	}
}

int main(void)
{
	curandState *d_state;
	cudaMalloc(&d_state, sizeof(curandState));
	
    // Print the vector length to be used, and compute its size
    size_t size = BLOCKS_NUMBER * sizeof(int);
    printf("Br tacaka: %d\n", N);

	int *h_s = (int *) malloc(size);
    if (h_s == NULL)
    {
        fprintf(stderr, "Failed to allocate host memory!\n");
        exit(EXIT_FAILURE);
    }
	
    // Allocate the device input vector A
    int *d_s = NULL;
	cudaMalloc((void **)&d_s, size);
	
	setup_kernel<<<BLOCKS_NUMBER,THREADS_PER_BLOCK>>>(d_state);
	
    CalculatePiMC<<<BLOCKS_NUMBER, THREADS_PER_BLOCK>>>(d_state, d_s);
				
    cudaMemcpy(h_s, d_s, size, cudaMemcpyDeviceToHost);

	int s = 0;
	for(int i = 0; i < BLOCKS_NUMBER; i++)
		s += h_s[i];		
		
	printf("PI = %10.8lf\n", 4.0*s/N);

    // Free device global memory
	cudaFree(d_s);

    // Free host memory
    free(h_s);

    cudaDeviceReset();

    printf("Done\n");
    return 0;
}