////////////////////////////////////////////////////////////////////////////
// Find Global Maximum on GPU
////////////////////////////////////////////////////////////////////////////

// Dynamically sized shared-memory scratch array holding block-local
// "regional" maxima. glMaxGPU1 indexes it with (global index % num_regions),
// so a launch of that kernel must supply at least
// num_regions * sizeof(unsigned int) bytes of dynamic shared memory
// (the third <<<...>>> launch parameter). Contents are uninitialized on entry.
extern __shared__ unsigned int regional_maxes[];

// Baseline global-maximum kernel: one thread per element, every in-range
// thread issues a single atomicMax directly on *gl_max.
//
//   values  - input array, read at indices [0, N)
//   N       - number of elements in values
//   gl_max  - running maximum; only ever raised here, never reset, so the
//             result is max(initial *gl_max, values[0..N))
//
// Launch layout: 1D grid of 1D blocks with gridDim.x * blockDim.x >= N.
// No shared memory is used.
__global__ void glMaxGPU0(unsigned int *values, unsigned int N, unsigned int *gl_max)
{
	const unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;

	// Threads past the end of the array (grid tail) have nothing to contribute.
	if (gid >= N) {
		return;
	}

	atomicMax(gl_max, values[gid]);
}

// Global-maximum kernel that filters global atomics through block-local
// "regional" maxima kept in shared memory.
//
// Each thread handles one element (global index i) and is assigned to region
// i % num_regions. It first raises its region's maximum in shared memory and
// forwards the value to *gl_max only if that raised the regional maximum,
// so values that cannot be the block's maximum for their region never reach
// the global atomic.
//
//   values       - input array, read at indices [0, N)
//   N            - number of elements in values
//   num_regions  - number of shared-memory slots to spread threads over;
//                  0 disables the regional filter (every in-range thread
//                  goes straight to the global atomic)
//   gl_max       - running maximum; only ever raised here, never reset, so
//                  the result is max(initial *gl_max, values[0..N))
//
// Launch layout: 1D grid of 1D blocks with gridDim.x * blockDim.x >= N.
// Dynamic shared memory: at least num_regions * sizeof(unsigned int) bytes,
// backing the file-scope regional_maxes array.
__global__ void glMaxGPU1(unsigned int *values, unsigned int N, unsigned int num_regions, unsigned int *gl_max)
{
	// global index
	unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;

	// Guard against modulo-by-zero: with num_regions == 0 there is no valid
	// region slot (and no shared memory to index), so skip the filter.
	// num_regions is a kernel argument and therefore identical for every
	// thread, so the whole block takes this branch together and the
	// __syncthreads() below is never reached by only part of a block.
	if (num_regions == 0) {
		if (i < N) {
			atomicMax(gl_max, values[i]);
		}
		return;
	}

	unsigned int region = i % num_regions;

	// init shared data: every thread zeroes the slot it is about to use
	// (including threads with i >= N, which is harmless), and the barrier
	// makes sure no atomicMax runs before all zeroing stores are done.
	regional_maxes[region] = 0;
	__syncthreads();

	if (i < N) {
		unsigned int val = values[i];
		// atomicMax returns the previous regional maximum. Only a strict
		// increase is forwarded; a value of 0 never beats the initial 0,
		// which is fine because an unsigned *gl_max is already >= 0.
		if (atomicMax(&regional_maxes[region], val) < val) {
			atomicMax(gl_max, val);
		}
	}
}
