#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <curand.h>
#include <curand_kernel.h>

#define THREADS_PER_BLOCK 1024
#define BLOCKS 2048
#define N (BLOCKS*THREADS_PER_BLOCK)

__global__ void calcPi(int *out) {
	__shared__ int tmp[THREADS_PER_BLOCK];

	int i = blockIdx.x * blockDim.x + threadIdx.x;

	curandState state;
	curand_init((unsigned long long) clock() + i, 0, 0, &state);

	double x = curand_uniform_double(&state);
	double y = curand_uniform_double(&state);

	if (x * x + y * y <= 1) {
		tmp[threadIdx.x] = 1;
	} else {
		tmp[threadIdx.x] = 0;
	}

	__syncthreads();

	if (threadIdx.x == 0) {
		int sum = 0;
		for (int i = 0; i < THREADS_PER_BLOCK; ++i) {
			sum += tmp[i];
		}
		out[blockIdx.x] = sum;
	}
}

int main() {
	printf("Broj tacaka: %d\n", N);

	int *h_vec = (int *) malloc(sizeof(int) * BLOCKS);
	int *d_vec = NULL;
	cudaMalloc((void **) &d_vec, sizeof(int) * BLOCKS);

	calcPi<<<BLOCKS, THREADS_PER_BLOCK>>>(d_vec);

	cudaMemcpy(h_vec, d_vec, sizeof(int) * BLOCKS, cudaMemcpyDeviceToHost);

	int sum = 0;
	for (int i = 0; i < BLOCKS; ++i) {
		sum += h_vec[i];
	}
	printf("%10.8lf\n", 4.0 * ((double) sum / (double) N));

	free(h_vec);
	cudaFree(d_vec);

	cudaDeviceReset();

	return 0;
}
