// Code for the discrete Voronoi diagram calculation (CUDA implementation)
//based on code from Christian Trefftz and Greg Wolffe
/*
* In CUDA it is necessary to define block sizes
* The grid of data that will be worked on is divided into blocks
* BLOCK_SIZE is the edge length, in threads, of one square thread block:
* the host code launches blocks of BLOCK_SIZE x BLOCK_SIZE threads.
*/
#define BLOCK_SIZE 8
/**
 * Kernel: labels every cell of a square matrix with the index of its
 * nearest seed (discrete Voronoi diagram).
 *
 * Launch layout: a 2D grid of 2D blocks. Each thread handles the single cell
 * (x, y) derived from its block and thread indices and stores the result at
 * matrix_d[x * sizeOfMatrix + y]. Threads whose (x, y) fall outside the
 * matrix do nothing, so the grid may over-cover the matrix.
 *
 * @param matrix_d      device buffer of at least sizeOfMatrix * sizeOfMatrix ints
 * @param sizeOfMatrix  edge length of the square matrix
 * @param seeds_d       device array of nSeeds seeds (members x and y are read)
 * @param nSeeds        number of seeds; when it is not positive no cell is written
 *
 * Ties are resolved in favour of the seed with the lowest index.
 */
__global__ void cu_calcEachCell(int *matrix_d, int sizeOfMatrix,
                                seed *seeds_d, int nSeeds) {
    // Use blockDim rather than a compile-time constant so the kernel is
    // correct for whatever block shape the caller launches.
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard against threads outside the matrix and against an empty seed
    // list (seeds_d[0] must not be read in that case).
    if (x >= sizeOfMatrix || y >= sizeOfMatrix || nSeeds <= 0) {
        return;
    }

    // Distances are compared in squared form: sqrt is monotonic, so the
    // ordering is unchanged and the per-seed square root is unnecessary.
    // The differences are taken in double to avoid integer overflow.
    double dx = (double)x - seeds_d[0].x;
    double dy = (double)y - seeds_d[0].y;
    double closestDistance = dx * dx + dy * dy;
    int closestSeed = 0;

    for (int i = 1; i < nSeeds; i++) {
        dx = (double)x - seeds_d[i].x;
        dy = (double)y - seeds_d[i].y;
        const double thisDistance = dx * dx + dy * dy;
        if (thisDistance < closestDistance) {
            closestSeed = i;
            closestDistance = thisDistance;
        }
    }

    // size_t arithmetic keeps the flat index from overflowing int for
    // large matrices.
    const size_t pos = (size_t)x * (size_t)sizeOfMatrix + (size_t)y;
    matrix_d[pos] = closestSeed;
}
/**
 * Pointer used to refer to the seed array on the device.
 * The seeds are going to be accessed only for reading.
 * Note that the __device__ __constant__ qualifiers apply to this pointer
 * variable itself, not to the array it points to: a buffer obtained with
 * cudaMalloc resides in ordinary device global memory. Constant memory is
 * also a different memory space from texture memory.
 * NOTE(review): to really keep the seeds in constant memory a fixed-size
 * __constant__ array filled with cudaMemcpyToSymbol would be needed -
 * confirm whether that was the intent.
 */
__device__ __constant__ seed *seeds_d;
/**
 * Aborts the program when a CUDA runtime call did not succeed.
 * Prints the caller-supplied message followed by the CUDA error string.
 */
static void voronoiCheck(cudaError_t status, const char *message) {
    if (status != cudaSuccess) {
        printf("%s", message);
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        exit(1);
    }
}

/**
 * Host entry point: computes the discrete Voronoi diagram on the GPU.
 *
 * The host computer and the GPU card have separate memories, so this
 * function:
 *  - allocates device memory for the result matrix and the seeds,
 *  - copies the seeds from host to device (the initial matrix contents are
 *    not needed),
 *  - configures the grid and block dimensions and launches the kernel,
 *  - copies the result matrix back from device to host,
 *  - releases the device memory.
 *
 * @param matrix      host buffer of matrixSize * matrixSize ints; receives,
 *                    for cell (x, y) at index x * matrixSize + y, the index
 *                    of the closest seed
 * @param matrixSize  edge length of the square matrix; nothing is done when
 *                    it is not positive
 * @param seeds       host array of nSeeds seeds
 * @param nSeeds      number of seeds; must be positive
 *
 * Any CUDA failure or a non-positive nSeeds prints a message and terminates
 * the process with exit(1).
 */
extern "C" void calcVoronoi(int *matrix, int matrixSize, seed *seeds, int nSeeds){
    if (matrixSize <= 0) {
        return;  // empty matrix: nothing to compute
    }
    if (nSeeds <= 0) {
        printf("calcVoronoi - nSeeds must be positive \n");
        exit(1);
    }

    // Round the number of blocks up so that every cell is covered even when
    // matrixSize is not a multiple of BLOCK_SIZE.
    const int nBlocks = matrixSize / BLOCK_SIZE + (matrixSize % BLOCK_SIZE != 0 ? 1 : 0);

    // The device buffer is padded up to a whole number of blocks so every
    // launched thread has a valid cell of its own to write.
    const size_t paddedSize = (size_t)nBlocks * BLOCK_SIZE;
    const size_t rowBytes = sizeof(int) * (size_t)matrixSize;
    const size_t paddedRowBytes = sizeof(int) * paddedSize;
    const size_t seedBytes = sizeof(seed) * (size_t)nSeeds;

    // Device-side counterparts of the host arrays. Local pointers are used:
    // cudaMalloc needs the address of a host variable to store the result in.
    int *matrix_d = NULL;
    seed *seedsDev = NULL;

    // Allocate memory on the device
    voronoiCheck(cudaMalloc((void**)&matrix_d, paddedRowBytes * paddedSize),
                 "cudaMalloc failed - matrix_d \n");
    voronoiCheck(cudaMalloc((void**)&seedsDev, seedBytes),
                 "cudaMalloc failed - seeds_d \n");

    // Only the seeds need to be copied to the device
    voronoiCheck(cudaMemcpy(seedsDev, seeds, seedBytes, cudaMemcpyHostToDevice),
                 "cudaMemcpy - Host-> GPU failed - seeds_d \n");

    // Execution configuration: square blocks on a square grid
    dim3 dimblock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimgrid(nBlocks, nBlocks);

    // The kernel works on the padded matrix, hence the padded edge length
    cu_calcEachCell<<<dimgrid, dimblock>>>(matrix_d, (int)paddedSize,
                                           seedsDev, nSeeds);
    // A kernel launch returns no status itself; query it explicitly
    voronoiCheck(cudaGetLastError(), "kernel launch failed - cu_calcEachCell \n");

    // Copy back only the matrixSize x matrixSize part of the padded result.
    // This blocking copy also waits for the kernel to finish.
    voronoiCheck(cudaMemcpy2D(matrix, rowBytes, matrix_d, paddedRowBytes,
                              rowBytes, (size_t)matrixSize,
                              cudaMemcpyDeviceToHost),
                 "cudaMemcpy - GPU -> Host failed - matrix \n");

    // Release the memory on the GPU card
    cudaFree(matrix_d);
    cudaFree(seedsDev);
}