#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
//#include <cutil_inline.h>

#include <cuda_runtime.h>
//#include <device_functions.h>

// Compatibility shim: the rest of this file is written against the legacy
// "cutil" SDK names. With CUDA 7 or newer those names are mapped onto the
// helper_* headers from the CUDA samples; older toolkits include cutil directly.
#if ( __CUDACC_VER_MAJOR__ >=7 )
    #include <helper_cuda.h>
    #include <helper_functions.h>
    // Timer handle type: a pointer here, an unsigned integer id with legacy cutil.
    #define cudaStopWatchInterface StopWatchInterface *
    // Wrapper used around the timer helpers; intentionally a pass-through.
    // NOTE(review): mapping this to checkCudaErrors(call) was tried and left
    // disabled, presumably because the sdk* timer helpers do not return
    // cudaError_t - confirm against helper_timer.h.
    #define cutilCheckError(call) call
    // Evaluates a CUDA runtime call once; on any status other than cudaSuccess
    // prints the file, line and error string to stderr and terminates the
    // process. Wrapped in do/while(0) so it behaves as a single statement.
    #define cutiliTESTSafeCall(call) \
    do { \
        cudaError_t cutil_status_ = (call); \
        if (cudaSuccess != cutil_status_) { \
           fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",  \
                 __FILE__, __LINE__, cudaGetErrorString(cutil_status_) ); \
           exit(EXIT_FAILURE); \
       } \
    } while (0)
    // CUDA API calls wrapped in cutilSafeCall are now actually checked instead
    // of having their return status discarded.
    #define cutilSafeCall(call) cutiliTESTSafeCall(call)
    // Timer operations: the sdk* functions take the address of the handle.
    #define cutCreateTimer(x) sdkCreateTimer(x)
    #define cutResetTimer(x) sdkResetTimer(&x)
    #define cutStartTimer(x) sdkStartTimer(&x)
    #define cutStopTimer(x) sdkStopTimer(&x)
    #define cutGetTimerValue(x) sdkGetTimerValue(&x)
    // Renamed runtime entry points.
    #define cutilDeviceSynchronize cudaDeviceSynchronize
    #define cudaThreadExit  cudaDeviceReset
#else
    #include <cutil_inline.h>
    #include <sm_11_atomic_functions.h>
    #define cudaStopWatchInterface uint
#endif
// Short aliases for unsigned types; `uint` is also what the legacy branch of
// the cudaStopWatchInterface macro expands to.
typedef unsigned char uchar;
typedef unsigned int  uint;


// Host-side dot product used by main to produce the CPU result; declared with
// C linkage. main passes the address of a single float as `c`.
// NOTE(review): presumably defined in dotProd_gold.cpp included below - confirm.
extern "C"
void dotProdCPU(float *c, float *a, float *b, int N);

// The kernel and CPU sources are textually included, so everything is compiled
// as part of this one file.
// NOTE(review): dotProdGPU0..dotProdGPU3 launched from main are presumably
// defined in dotProd_kernel.cu - confirm.
#include "dotProd_kernel.cu"
#include "dotProd_gold.cpp"

////////////////////////////////////////////////////////////////////////////////
// RandFloat: pseudo-random float between low and high (both ends reachable).
// A weight rand()/RAND_MAX in [0, 1] is drawn and used to blend linearly
// between the two bounds. Uses the C library generator, so the sequence is
// controlled by srand().
////////////////////////////////////////////////////////////////////////////////
float RandFloat(float low, float high){
    const float weight = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
    return (1.0f - weight) * low + weight * high;
}

////////////////////////////////////////////////////////////////////////////////
// cpuReduce: finishes a reduction on the host.
// Adds x[1] .. x[n-1] into x[0], in ascending index order; the total is left
// in x[0] and the other elements are not modified. Nothing happens for n <= 1.
////////////////////////////////////////////////////////////////////////////////

void cpuReduce(float *x, int n)
{
    int idx = 1;
    while (idx < n)
    {
        x[0] += x[idx];
        ++idx;
    }
}




///////////////////////////////////////////////////////////////////////////////
// Main program
///////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv){

    float *h_a, *h_b, *h_c_GPU, h_c_CPU;
    float *d_a, *d_b, *d_c;
//    unsigned int hTimer;
    cudaStopWatchInterface hTimer = 0;
    int i;
    int N = 10000;
    int BLOCK_DIM = 128;
    int GRID_SZ = ceil(float(N)/float(BLOCK_DIM));
    int DATA_SZ = N * sizeof(float);
    int RESULT_SZ = GRID_SZ * sizeof(float);
    int KERNEL = 0;
    float time;

    if (argc<4)
    {
        printf("Usage: %s <N> <BLOCK_DIM> <KERNEL={0|1|2|3}> \n",argv[0]);
        exit(0);
    }
    
    else
    {
        N = atoi(argv[1]);
        BLOCK_DIM = atoi(argv[2]);
	KERNEL = atoi(argv[3]);
    }
	
    GRID_SZ = ceil(float(N)/float(BLOCK_DIM));
    DATA_SZ = N * sizeof(float);
    if (KERNEL>0)
	    RESULT_SZ = GRID_SZ * sizeof(float);
    else
            RESULT_SZ = N * sizeof(float);
    	
    printf("N=%d, BLOCK_DIM=%d, GRID_SZ=%d\n", N,BLOCK_DIM,GRID_SZ);

    // Memory allocation on CPU
    h_a     = (float *)malloc(DATA_SZ);
    h_b     = (float *)malloc(DATA_SZ);
    h_c_GPU = (float *)malloc(RESULT_SZ);

    // Memory allocation on GPU
    cutilSafeCall( cudaMalloc((void **)&d_a, DATA_SZ)   );
    cutilSafeCall( cudaMalloc((void **)&d_b, DATA_SZ)   );
    cutilSafeCall( cudaMalloc((void **)&d_c, RESULT_SZ) );

    //Generating input data on CPU
    srand(1234);
    for(i = 0; i < N; i++){
         h_a[i] = RandFloat(0.0f, 1.0f);
         h_b[i] = RandFloat(0.0f, 1.0f);
    }

    // Creating and startig timer
    cutilCheckError( cutCreateTimer(&hTimer) );
    cutilCheckError( cutResetTimer(hTimer) );
    cutilCheckError( cutStartTimer(hTimer) );

    //Copy options data to GPU memory for further processing 
    cutilSafeCall( cudaMemcpy(d_a, h_a, DATA_SZ, cudaMemcpyHostToDevice) );
    cutilSafeCall( cudaMemcpy(d_b, h_b, DATA_SZ, cudaMemcpyHostToDevice) );

    cutilCheckError( cutStopTimer(hTimer) );
    time = cutGetTimerValue(hTimer);
    printf("Memory Transfer HostToDevice: %f msecs.\n", time);

    // Executing kernel
    cutilCheckError( cutStartTimer(hTimer) );
    if (KERNEL==0)
	    dotProdGPU0<<<GRID_SZ, BLOCK_DIM, BLOCK_DIM*sizeof(float)>>>(d_c, d_a, d_b, N);
    else if (KERNEL==1)
	    dotProdGPU1<<<GRID_SZ, BLOCK_DIM, BLOCK_DIM*sizeof(float)>>>(d_c, d_a, d_b, N);
    else if (KERNEL==2)
	    dotProdGPU2<<<GRID_SZ, BLOCK_DIM, BLOCK_DIM*sizeof(float)>>>(d_c, d_a, d_b, N);
    else if (KERNEL==3)
	    dotProdGPU3<<<GRID_SZ, BLOCK_DIM, BLOCK_DIM*sizeof(float)>>>(d_c, d_a, d_b, N);
//    cutilCheckMsg("dotProdGPU() execution failed\n");
    cutilCheckError( cutStopTimer(hTimer) );
    printf("Kernel time: %f msecs.\n", cutGetTimerValue(hTimer)-time);
    time = cutGetTimerValue(hTimer);

    //Read back GPU results to compare them to CPU results
    cutilCheckError( cutStartTimer(hTimer) );
    cutilSafeCall( cudaMemcpy(h_c_GPU, d_c, RESULT_SZ, cudaMemcpyDeviceToHost) );
    cutilCheckError( cutStopTimer(hTimer) );
    printf("Memory Transfer DeviceToHost: %f msecs.\n", cutGetTimerValue(hTimer)-time);
    time = cutGetTimerValue(hTimer);

    // Final reduction on CPU
    cutilCheckError( cutStartTimer(hTimer) );
    if (KERNEL>0)
	    cpuReduce(h_c_GPU,GRID_SZ);
    else
	    cpuReduce(h_c_GPU,N);
    cutilCheckError( cutStopTimer(hTimer) );
    printf("Final CPU Reduction: %f msecs.\n", cutGetTimerValue(hTimer)-time);
    float gputime = cutGetTimerValue(hTimer);
    printf("Total GPU time: %f msecs.\n", gputime);
	
    // Running on CPU
    cutilCheckError( cutResetTimer(hTimer) );
    cutilCheckError( cutStartTimer(hTimer) );
    dotProdCPU(&h_c_CPU, h_a, h_b, N);
    cutilCheckError( cutStopTimer(hTimer) );
    float cputime = cutGetTimerValue(hTimer);
    printf("CPU time: %f msecs.\n", cputime);

    printf("Comparing the results... ");
    printf("CPU Result = %f, GPU Result = %f\n",h_c_CPU,h_c_GPU[0]);
    printf("GPU/CPU speedup = %f\n",gputime/cputime);

    // Deallocating memory
    cutilSafeCall( cudaFree(d_b)   );
    cutilSafeCall( cudaFree(d_a)   );
    free(h_c_GPU);
    free(h_b);
    free(h_a);
//    cutilCheckError( cutDeleteTimer(hTimer) );
    cudaThreadExit();
}
