#include <stdio.h>
#include <stdlib.h>

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
#include <curand.h>
#include <curand_kernel.h>

// Number of threads in each thread block (also the width of the shared tile).
#define THREADS_PER_BLOCK 8
// Total number of elements in the input and output arrays.
#define N (8*8)
// Number of thread blocks needed to cover N elements. Despite the name this
// is a block COUNT: it is passed as the grid dimension of the kernel launch.
// Parenthesised so the macro expands safely inside larger expressions.
#define BLOCK_SIZE  ((N) / (THREADS_PER_BLOCK))
// Number of neighbours summed on each side of an element.
#define RADIUS 3

// 1D stencil: out[i] = sum of in[i - RADIUS .. i + RADIUS].
//
// Input positions that fall outside [0, N) contribute 0 to the sum, so every
// output element (including the first and last RADIUS ones) is well defined.
//
// Launch requirements:
//   - 1D grid and 1D blocks with blockDim.x == THREADS_PER_BLOCK, because the
//     shared tile is sized with that constant;
//   - RADIUS <= THREADS_PER_BLOCK, because the first RADIUS threads of each
//     block are the ones that load both halos;
//   - `in` and `out` each hold at least N ints in device memory.
// Uses (THREADS_PER_BLOCK + 2*RADIUS) ints of static shared memory per block.
__global__ void stencil_1d(int *in, int *out) 
{
	// Tile layout: [left halo | THREADS_PER_BLOCK elements | right halo]
	__shared__ int temp[THREADS_PER_BLOCK + 2*RADIUS];
	int gindex = threadIdx.x + blockIdx.x * blockDim.x; // global index
	int lindex = threadIdx.x + RADIUS;                  // local index inside the tile
	
	// Each thread stages its own element; threads past the end stage 0.
	temp[lindex] = (gindex < N) ? in[gindex] : 0;
	
	// The first RADIUS threads additionally fill both halos. Cells that map
	// outside the array are zero-filled so the summation below never reads
	// uninitialised shared memory.
	if(threadIdx.x < RADIUS) 
	{
		int left  = gindex - RADIUS;
		int right = gindex + THREADS_PER_BLOCK;
		
		temp[lindex - RADIUS] = (left >= 0 && left < N) ? in[left] : 0;
		temp[lindex + THREADS_PER_BLOCK] = (right < N) ? in[right] : 0;
	}
	
	// Barrier is outside any branch so every thread of the block reaches it;
	// after it the whole tile is visible to all threads.
	__syncthreads();
	
	int result = 0;
	for(int offset = -RADIUS; offset <= RADIUS; offset++)
		result += temp[lindex + offset];
		
	// Store the result; threads beyond the array write nothing.
	if(gindex < N)
		out[gindex] = result;
}

// Fills x[0..size-1] with the repeating sequence 0, 1, ..., 99, 0, 1, ...
// (element k receives k % 100). Does nothing when size <= 0.
void redom(int* x, int size)
{
	int pos = 0;
	while (pos < size)
	{
		x[pos] = pos % 100;
		++pos;
	}
}

// Prints a diagnostic to stderr when a CUDA runtime call did not succeed.
// Returns 1 on failure and 0 on success so callers can branch on the result.
static int cuda_failed(cudaError_t err, const char *what)
{
	if (err == cudaSuccess)
		return 0;
	fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
	return 1;
}

// Fills a host array, runs stencil_1d on it on the GPU and writes the interior
// results (indices RADIUS .. N-RADIUS-1) next to the inputs to "rezultat.txt".
// Returns 0 on success and 1 if any allocation, CUDA call or file operation
// fails.
int main(void)
{
	// All locals are declared up front so that `goto cleanup` never jumps
	// over an initialisation.
	int *a = NULL, *b = NULL;     // host input / host output
	int *d_a = NULL, *d_b = NULL; // device input / device output
	int size = N * sizeof(int);
	int status = 1;               // pessimistic until everything succeeded
	FILE *f = NULL;
	
	// Allocate and initialise the host buffers
	a = (int *)malloc(size);
	b = (int *)malloc(size);
	if (a == NULL || b == NULL)
	{
		fprintf(stderr, "host allocation of %d bytes failed\n", size);
		goto cleanup;
	}
	redom(a, N);
	
	// Allocate space for the device copies of a and b
	if (cuda_failed(cudaMalloc((void **)&d_a, size), "cudaMalloc(d_a)"))
		goto cleanup;
	if (cuda_failed(cudaMalloc((void **)&d_b, size), "cudaMalloc(d_b)"))
		goto cleanup;
	
	// Copy input to device
	if (cuda_failed(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)"))
		goto cleanup;
		
	// Launch stencil_1d() on the GPU; BLOCK_SIZE is the number of blocks in the grid
	stencil_1d<<<BLOCK_SIZE,THREADS_PER_BLOCK>>>(d_a, d_b);
	// A launch reports configuration errors only through cudaGetLastError()
	if (cuda_failed(cudaGetLastError(), "stencil_1d launch"))
		goto cleanup;
	
	// Copy result back to host (blocking copy; also surfaces kernel execution errors)
	if (cuda_failed(cudaMemcpy(b, d_b, size, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)"))
		goto cleanup;

	// Write the interior elements, whose full neighbourhood lies inside the array
	f = fopen("rezultat.txt","wt");
	if (f == NULL)
	{
		perror("rezultat.txt");
		goto cleanup;
	}
	for(int i = RADIUS; i < N-RADIUS; i++)
		fprintf(f, "b[%d] =  %d ....... %d\n", i, b[i], a[i]);
	if (fclose(f) != 0)
	{
		perror("rezultat.txt");
		goto cleanup;
	}
	
	status = 0;

cleanup:
	// cudaFree(NULL) and free(NULL) are no-ops, so this is safe on every path
	cudaFree(d_a); cudaFree(d_b);
	free(a); free(b);
	
	return status;
}

