// incrementArrays.cu
// 2009-06-21
// run on the emulator to see the printf output from the kernel
// nvcc incrementArrays.cu --device-emulation -o incrementArrays

// taken from Dr.Dobbs
// http://www.ddj.com/hpc-high-performance-computing/207402986

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda.h>
#include <cuda_runtime.h>

// CPU reference implementation: adds 1.0f to each of the first N floats
// starting at a, in place. Does nothing when N is zero or negative.
// Its result is what the device output gets compared against.
void incrementArrayOnHost(float *a, int N)
{
	float *cursor = a;      // element currently being updated
	int remaining = N;      // how many elements are still to be processed

	while (remaining > 0) {
		*cursor += 1.0f;
		++cursor;
		--remaining;
	}
}

// The CUDA kernel
// The __global__ function qualifier declares the function to be executable
// on the CUDA device and launchable from the host.
//
// Adds 1.0f in place to each of the first N floats starting at a.
//   a : pointer dereferenced on the device; must be valid for at least N floats
//   N : number of elements to process; nothing is written when N <= 0
//
// Launch layout: 1D grid of 1D blocks of any size. Only the .x components of
// the built-in index variables are used. No shared memory is required.
__global__ void incrementArrayOnDevice(float *a, int N)
{
	// blockIdx, blockDim, threadIdx and gridDim are built-in variables that
	// are available in every CUDA kernel.
	// Device-side printf can be used for debugging (it is slow), e.g.:
	// printf("blockIdx.x = %i, blockDim.x = %i, threadIdx.x = %i\n", blockIdx.x, blockDim.x, threadIdx.x);

	// Total number of threads in the launch. Each thread starts at its own
	// global index and advances by this amount, so every element below N is
	// handled exactly once even when the grid has fewer than N threads.
	// 64-bit arithmetic keeps the index computation from overflowing.
	const long long stride = (long long)gridDim.x * blockDim.x;

	for (long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
	     idx < N;
	     idx += stride)
		a[idx] = a[idx] + 1.0f;
}

// Reports a failed CUDA runtime call on stderr.
// Returns 1 when err is cudaSuccess and 0 otherwise, so the caller can bail out.
static int cudaIncreaseOk(cudaError_t err, const char *what)
{
	if (err == cudaSuccess)
		return 1;
	fprintf(stderr, "cudaIncrease: %s failed: %s\n", what, cudaGetErrorString(err));
	return 0;
}

// Demo driver: fills a 10-element array with 0..9, increments it once on the
// host and once on the device, and compares the two results element by element.
// Progress and results are printed on stdout, failures on stderr.
// Any allocation, copy or launch failure stops the run after releasing
// whatever was allocated. A result mismatch additionally trips an assert
// (in builds without NDEBUG) once the buffers have been released.
extern "C" void cudaIncrease(void)
{
	// All locals are declared up front so that the "goto cleanup" error
	// paths never jump over an initialization.
	const int 	N = 10;
	const size_t size = N * sizeof(float);
	float 	*a_h = NULL, *b_h = NULL;   // pointers to host memory
	float 	*a_d = NULL;                // pointer to device memory
	int 	i;
	int 	blockSize = 4;              // threads per block, a.k.a. blockDim.x
	int 	nBlocks;                    // blocks in the grid, a.k.a. gridDim.x
	int 	mismatches = 0;             // elements where host and device disagree

	// allocate arrays on host
	a_h = (float *)malloc(size);
	b_h = (float *)malloc(size);
	if (a_h == NULL || b_h == NULL) {
		fprintf(stderr, "cudaIncrease: host allocation of %lu bytes failed\n",
		        (unsigned long)size);
		goto cleanup;
	}

	// allocate array on device
	if (!cudaIncreaseOk(cudaMalloc((void **) &a_d, size), "cudaMalloc")) {
		a_d = NULL;   // make sure cleanup does not free a stale pointer value
		goto cleanup;
	}

	// initialize host data
	printf("initialize host data\n");
	for (i = 0; i < N; i++) {
		a_h[i] = (float)i; 	// a_h = 0 to 9, casting integer to float type
		printf("a_h[%d] = %f\n", i, a_h[i]);
	}

	// copy data from host to device
	// cudaMemcpy(destination, source, size, direction);
	if (!cudaIncreaseOk(cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice),
	                    "cudaMemcpy (host to device)"))
		goto cleanup;

	// do calculation on host; a_h now holds the reference result 1 to 10
	incrementArrayOnHost(a_h, N);

	// do calculation on device
	// Part 1 of 2. Compute execution configuration
	// blockSize is the number of threads per block. The maximum is device
	// dependent (cudaDeviceProp::maxThreadsPerBlock); exceeding it makes the
	// launch fail, which the cudaGetLastError check below reports.
	//
	// nBlocks is N / blockSize rounded up, so that nBlocks * blockSize >= N
	// and every element gets a thread even when N is not a multiple of
	// blockSize.
	printf("N/blockSize = %i\n", N/blockSize);
	nBlocks = (N + blockSize - 1) / blockSize;
	printf("nBlocks = %d\n", nBlocks);

	// Part 2 of 2. Call the incrementArrayOnDevice kernel
	// execution configuration between <<< and >>>
	// arguments between ( and ), just like an ordinary function call
	// <<< number of blocks in the grid, threads per block >>>
	incrementArrayOnDevice <<< nBlocks, blockSize >>> (a_d, N);

	// A kernel launch returns nothing; configuration errors are only
	// visible through cudaGetLastError.
	if (!cudaIncreaseOk(cudaGetLastError(), "incrementArrayOnDevice launch"))
		goto cleanup;

	// Retrieve result from device and store in b_h
	// cudaMemcpy waits until the GPU has finished, so errors raised while
	// the kernel was running are reported by this call as well.
	if (!cudaIncreaseOk(cudaMemcpy(b_h, a_d, size, cudaMemcpyDeviceToHost),
	                    "cudaMemcpy (device to host)"))
		goto cleanup;

	// check results
	// Exact comparison is fine here: small integers plus 1.0f are exactly
	// representable in float on both host and device.
	printf("assert received data\n");
	for (i = 0; i < N; i++) {
		printf("a_h[%d] = %f\t b_h[%d] = %f\n", i, a_h[i], i, b_h[i]);
		// if correct, a_h and b_h both hold 1 to 10
		if (a_h[i] != b_h[i]) {
			fprintf(stderr, "cudaIncrease: host/device mismatch at index %d\n", i);
			mismatches++;
		}
	}

cleanup:
	// free(NULL) and cudaFree(NULL) are no-ops, so this is safe on every path
	free(a_h);
	free(b_h);
	cudaFree(a_d);

	// Keep the debug-build abort on wrong results, but only after the
	// values were printed and the buffers released.
	assert(mismatches == 0);
}