#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <curand.h>
#include <cutil_inline.h>

// Host-side reference routine, declared with C linkage; its definition is not
// part of this file. main() passes it the host copies of the x and y sample
// arrays together with the sample count N, and scales the returned value by
// 4/N to obtain the CPU estimate.
// NOTE(review): presumably the return value is the number of points (x[i], y[i])
// that fall inside the unit quarter circle -- confirm against the implementation.
extern "C"
unsigned int piEstimateCPU(float *x, float *y, int N);
#include "piEstimate_kernel.cu"

////////////////////////////////////////////////////////////////////////////////
// Helper function to perform final reduction on CPU
//
// Sums the first n entries of x and returns the total. The accumulation is
// done in unsigned int, so it wraps modulo 2^32 on overflow.
//
//   x : array of at least n counters (not dereferenced when n <= 0)
//   n : number of entries to add; n <= 0 yields 0
////////////////////////////////////////////////////////////////////////////////
unsigned int cpuReduce(unsigned int *x, int n)
{
	// Start from zero rather than x[0] so that an empty input is well defined
	// and never touches memory.
	unsigned int s = 0;
	for (int i = 0; i < n; i++)
		s += x[i];
	return s;
}

// Aborts the program with a diagnostic when a cuRAND host-API call does not
// return CURAND_STATUS_SUCCESS.
static void checkCurand(curandStatus_t status, const char *what)
{
	if (status != CURAND_STATUS_SUCCESS)
	{
		fprintf(stderr, "cuRAND error %d during %s\n", (int)status, what);
		exit(EXIT_FAILURE);
	}
}

// Allocates `bytes` bytes of host memory, aborting with a diagnostic on failure.
static void *checkedMalloc(size_t bytes, const char *what)
{
	void *p = malloc(bytes);
	if (p == NULL)
	{
		fprintf(stderr, "Host allocation of %lu bytes failed for %s\n",
		        (unsigned long)bytes, what);
		exit(EXIT_FAILURE);
	}
	return p;
}

// Monte Carlo estimate of pi on the GPU and on the CPU.
//
// Usage: <program> <N> <BLOCK_DIM>
//   N         : number of random (x, y) samples, must be > 0
//   BLOCK_DIM : threads per block for the kernel launch, must be > 0
//
// The samples are generated on the device with cuRAND, copied to the host so
// that both estimates use the same data, and each estimate is timed with a
// cutil timer. The GPU timing covers the kernel launch, the copy of the
// per-block counters back to the host and the final host-side summation.
int main(int argc, char *argv[])
{
	float *h_x, *h_y, *d_x, *d_y;
	unsigned int *h_counters, *d_counters;
	curandGenerator_t gen;
	int N, BLOCK_DIM, GRID_SZ;
	size_t DATA_SZ, RESULT_SZ;	// byte counts; size_t avoids int overflow for large N
	unsigned int hTimer;
	float result;

	if (argc < 3)
	{
		printf("Usage: %s <N> <BLOCK_DIM> \n", argv[0]);
		exit(0);
	}

	N = atoi(argv[1]);
	BLOCK_DIM = atoi(argv[2]);

	// atoi() yields 0 for non-numeric input; zero or negative values would
	// lead to a division by zero or nonsensical allocation sizes below.
	if (N <= 0 || BLOCK_DIM <= 0)
	{
		fprintf(stderr, "Error: <N> and <BLOCK_DIM> must be positive integers\n");
		exit(EXIT_FAILURE);
	}

	// Integer ceil-division: exact for every int N (a float-based ceil() loses
	// precision above 2^24) and cannot overflow like N + BLOCK_DIM - 1 could.
	GRID_SZ = N / BLOCK_DIM + (N % BLOCK_DIM != 0 ? 1 : 0);
	DATA_SZ = (size_t)N * sizeof(float);
	RESULT_SZ = (size_t)GRID_SZ * sizeof(unsigned int);

	// Memory allocation on CPU
	h_x = (float *)checkedMalloc(DATA_SZ, "h_x");
	h_y = (float *)checkedMalloc(DATA_SZ, "h_y");
	h_counters = (unsigned int *)checkedMalloc(RESULT_SZ, "h_counters");

	// Memory allocation on GPU
	cutilSafeCall( cudaMalloc((void **)&d_x, DATA_SZ) );
	cutilSafeCall( cudaMalloc((void **)&d_y, DATA_SZ) );
	cutilSafeCall( cudaMalloc((void **)&d_counters, RESULT_SZ) );

	// Create pseudo-random number generator with a fixed seed so that runs
	// are reproducible
	checkCurand( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT),
	             "curandCreateGenerator" );
	checkCurand( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL),
	             "curandSetPseudoRandomGeneratorSeed" );

	// Generate x and y on device
	checkCurand( curandGenerateUniform(gen, d_x, N), "curandGenerateUniform(x)" );
	checkCurand( curandGenerateUniform(gen, d_y, N), "curandGenerateUniform(y)" );

	// Copy data from device to host so the CPU estimate sees the same samples
	cutilSafeCall( cudaMemcpy(h_x, d_x, DATA_SZ, cudaMemcpyDeviceToHost) );
	cutilSafeCall( cudaMemcpy(h_y, d_y, DATA_SZ, cudaMemcpyDeviceToHost) );

	cutilCheckError( cutCreateTimer(&hTimer) );
	cutilCheckError( cutResetTimer(hTimer) );
	cutilCheckError( cutStartTimer(hTimer) );

	// Launch kernel with one unsigned int of dynamic shared memory per thread.
	// The kernel lives in piEstimate_kernel.cu; the host code below expects it
	// to leave one counter per block in d_counters.
	piEstimateGPU<<<GRID_SZ, BLOCK_DIM, BLOCK_DIM*sizeof(unsigned int)>>>(d_x, d_y, d_counters, N);
	// A launch does not return a status itself; an invalid configuration
	// (e.g. BLOCK_DIM above the device limit) only shows up here.
	cutilSafeCall( cudaGetLastError() );

	// Copy results back (blocking copy, so it also waits for the kernel)
	cutilSafeCall( cudaMemcpy(h_counters, d_counters, RESULT_SZ, cudaMemcpyDeviceToHost) );

	result = (float)(4.0 * cpuReduce(h_counters, GRID_SZ) / (double)N);
	cutilCheckError( cutStopTimer(hTimer) );
	printf("GPU time: %f msecs.\n", cutGetTimerValue(hTimer));
	printf("GPU Result = %f\n", result);

	cutilCheckError( cutResetTimer(hTimer) );
	cutilCheckError( cutStartTimer(hTimer) );
	result = (float)(4.0 * piEstimateCPU(h_x, h_y, N) / (double)N);
	cutilCheckError( cutStopTimer(hTimer) );
	printf("CPU time: %f msecs.\n", cutGetTimerValue(hTimer));
	printf("CPU Result = %f\n", result);

	// Cleanup: release everything that was acquired above, including the
	// counter buffers and the timer
	cutilCheckError( cutDeleteTimer(hTimer) );
	checkCurand( curandDestroyGenerator(gen), "curandDestroyGenerator" );
	cutilSafeCall( cudaFree(d_x) );
	cutilSafeCall( cudaFree(d_y) );
	cutilSafeCall( cudaFree(d_counters) );
	free(h_x);
	free(h_y);
	free(h_counters);
	cudaThreadExit();
	return 0;
}


