// includes, system --> don’t forget cuda.h
#include <stdio.h>
#include <assert.h>
#include <cuda.h>

// Simple utility function to check for CUDA runtime errors 
void checkCUDAError(const char* msg);

# pragma mark kernel
// Part3: implement the kernel 
__global__ void reverseArrayBlock(int *d_out, int *d_in)
{
	// Reverses d_in into d_out using a 1D grid of 1D blocks; every thread
	// moves exactly one element.
	//
	// There is no bounds check in this kernel, so both buffers must hold at
	// least gridDim.x * blockDim.x ints.

	// Number of elements covered by the whole launch.
	const unsigned int span = gridDim.x * blockDim.x;

	// Flat position of the element this thread reads.
	const int src = blockIdx.x * blockDim.x + threadIdx.x;

	// Mirrored position: the block order is flipped and so is the thread
	// order inside the block, which collapses to (span - 1 - src).
	const int dst = span - 1 - src;

	d_out[dst] = d_in[src];
}

#pragma mark cudaInverse
// cudaInverse function which is called from AppController.m
/*
 * Host-side driver for the array-reversal demo.
 *
 * Fills a host array with 0 .. dimA-1, copies it to the device, runs
 * reverseArrayBlock on it, copies the result back and checks that the
 * array really came back reversed. The kernel is timed with CUDA events.
 *
 * Returns 0 when the result is verified, 1 when host allocation fails,
 * the launch geometry does not cover the array exactly, or the verification
 * finds a wrong element. CUDA runtime errors are reported by checkCUDAError,
 * which terminates the process.
 */
extern "C" int cudaInverse(void)
{
	const int dimA = 256 * 1024; // 256K elements (1MB total, 4Byte * 256 * 1024)
	const int numThreadsPerBlock = 256;

	// sizeof yields a size_t; cast it so it matches the %d conversion.
	printf("Size of an integer: %d\n", (int) sizeof(int));

	// The kernel has no bounds check and the buffers are sized from the
	// launch geometry, so the array must be an exact multiple of the block size.
	if (dimA % numThreadsPerBlock != 0)
	{
		fprintf(stderr, "cudaInverse: array size %d is not a multiple of the block size %d.\n",
				dimA, numThreadsPerBlock);
		return 1;
	}

	// Part 1: number of blocks needed for the array at the chosen block size
	const int numBlocks = dimA / numThreadsPerBlock;
	const size_t memSize = (size_t) dimA * sizeof(int);

	// allocate host memory
	int *h_a = (int *) malloc(memSize);
	if (h_a == NULL)
	{
		fprintf(stderr, "cudaInverse: could not allocate %lu bytes of host memory.\n",
				(unsigned long) memSize);
		return 1;
	}

	// allocate device memory - input array d_a and output array d_b
	int *d_a = NULL;
	int *d_b = NULL;
	cudaMalloc( (void **) &d_a, memSize );
	checkCUDAError("cudaMalloc d_a");
	cudaMalloc( (void **) &d_b, memSize );
	checkCUDAError("cudaMalloc d_b");

	// Initialize input array on host
	for (int i = 0; i < dimA; ++i)
	{
		h_a[i] = i;
		// print first, last, and every 20,000th line
		// so we see what we are starting with
		if (i % 20000 == 0 || i == dimA - 1)
		{
			printf("h_a[%d] = %d\n", i, h_a[i]);
		}
	}

	// Copy host array h_a to device array d_a
	cudaMemcpy( d_a, h_a, memSize, cudaMemcpyHostToDevice );
	checkCUDAError("memcpy host to device");

	// dim3 components that are not given default to 1,
	// so this is the same as dim3 dimGrid(numBlocks, 1, 1).
	dim3 dimGrid(numBlocks);
	dim3 dimBlock(numThreadsPerBlock);

	// create events to measure performance
	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	checkCUDAError("event creation");

	// Bracket the launch with the two events so the measured interval
	// contains the kernel only.
	cudaEventRecord(start, 0);
	reverseArrayBlock<<< dimGrid, dimBlock >>>( d_b, d_a );
	cudaEventRecord(stop, 0);

	// Waiting on the stop event also waits for the kernel queued before it,
	// so no separate device-wide synchronisation is needed.
	cudaEventSynchronize(stop);

	// Reports launch errors as well as errors raised while the kernel ran.
	checkCUDAError("kernel invocation");

	float elapsedTime = 0.0f;
	cudaEventElapsedTime(&elapsedTime, start, stop);
	cudaEventDestroy(start);
	cudaEventDestroy(stop);

	// device to host copy:
	// fetch the reversed array, overwriting the previous contents of h_a
	cudaMemcpy( h_a, d_b, memSize, cudaMemcpyDeviceToHost );
	checkCUDAError("memcpy");

	// Verify the data returned to the host. This is an explicit comparison
	// rather than assert(), so it still runs when NDEBUG is defined.
	int status = 0;
	for (int i = 0; i < dimA; i++)
	{
		const int expected = dimA - 1 - i;
		if (h_a[i] != expected)
		{
			fprintf(stderr, "cudaInverse: verification failed, h_a[%d] = %d but expected %d.\n",
					i, h_a[i], expected);
			status = 1;
			break;
		}
		// print first, last, and every 20,000th line
		// this time h_a is reversed
		if (i % 20000 == 0 || i == dimA - 1)
		{
			printf("h_a[%d] = %d\n", i, h_a[i]);
		}
	}

	// free device memory
	cudaFree(d_a);
	cudaFree(d_b);

	// free host memory
	free(h_a);

	if (status == 0)
	{
		// Reaching this point means the result was verified and no
		// CUDA runtime error was reported.
		printf("reverseArrayBlock worked correctly.\n");
		printf("Elapsed Time = %f ms.\n", elapsedTime);
	}
	return status;
}
 
void checkCUDAError(const char *msg)
{
	// Ask the runtime for the most recent error it recorded
	// (cudaGetLastError also resets that error state).
	const cudaError_t status = cudaGetLastError();

	// Nothing to report: return quietly.
	if (status == cudaSuccess)
	{
		return;
	}

	// Otherwise print the caller's context string followed by the
	// runtime's description of the error, then terminate the process.
	fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
	exit(EXIT_FAILURE);
}
