Here is the code for the discrete Voronoi diagram calculation that I wanted to show you today
//based on code from Christian Trefftz and Greg Wolffe

/* 
 * In CUDA it is necessary to define block sizes
 * The grid of data that will be worked on is divided into blocks
 */
#define BLOCK_SIZE 8

/**
 * This is the function that will be executed in each and every one
 * of the stream processors
 * The __global__ directive identifies this function as being
 * an executable kernel on the CUDA device.
 * All kernels must be declared with a return type void 
 */ 
__global__ void cu_calcEachCell(int *matrix_d, int sizeOfMatrix,
				seed *seeds_d, int nSeeds) {
  int x;
  int y;
  int pos;
  int closestSeed;
  double thisDistance;
  double closestDistance;
  int i;
  x=blockIdx.x*BLOCK_SIZE+threadIdx.x;
  y=blockIdx.y*BLOCK_SIZE+threadIdx.y;
  pos = x*sizeOfMatrix+ y;

  // Calculate the distance from each seed to this pixel
  // Pick the closest seed
  closestSeed = 0;
  closestDistance = (double) 
    ((x-seeds_d[0].x)*(x-seeds_d[0].x) +
     (y-seeds_d[0].y)*(y-seeds_d[0].y));
  closestDistance = sqrt(closestDistance);
  for(i=1;i < nSeeds;i++) {
    thisDistance = 
      ((x-seeds_d[i].x)*(x-seeds_d[i].x) +
       (y-seeds_d[i].y)*(y-seeds_d[i].y));
    thisDistance = sqrt(thisDistance);
    if (thisDistance < closestDistance) {
      closestSeed = i;
      closestDistance = thisDistance;
    }
  }
  matrix_d[pos] = closestSeed;
}

/**
 * The seeds are going to be accessed only for reading.
 * Hence it makes sense to place them in the texture area in the GPU card
 * which is a high speed cache for read-only variables.
 * This is achieved by using the keywords 
 *  __device__ __constant__
 */
__device__ __constant__ seed *seeds_d;

/**
 * This function is called in the host computer.
 * It, in turn, calls the function that is executed on the GPU.
 * Recall that:
 *  The host computer and the GPU card have separate memories
 *  Hence it will be necessary to 
 *    - Allocate memory in the memory on the GPU 
 *    - Copy the variables that will be operated from the memory 
 *      in the host to the corresponding variable in the GPU memory
 *    - Describe the configuration of the grid and the block size
 *    - Call the kernel, the code that will be executed on the GPU
 *    - Once the kernel has finished executing, copy back
 *      the results from the memory of the GPU to the memory on the host
 */
extern "C" void calcVoronoi(int *matrix, int matrixSize, seed *seeds, int nSeeds){
  //matrix_d and seeds_d are the GPU counterpart of the array that exists on the host memory 
  int *matrix_d;
  int nBlocks;
  cudaError_t result;

  //allocate memory on device
  // cudaMalloc allocates space in the memory of the GPU card
  result = cudaMalloc((void**)&matrix_d, sizeof(int)*matrixSize*matrixSize);
  if (result != cudaSuccess) {
    printf("cudaMalloc failed - matrix_d \n");
    exit(1);
  }

  // The seeds will be allocated in the texture read-only cache
  // memory of the GPU card
  result = cudaMalloc((void**)&seeds_d, sizeof(seed)*nSeeds);
  if (result != cudaSuccess) {
    printf("cudaMalloc failed - seeds_d \n");
    exit(1);
  }

  //The memory from the host is being copied to the corresponding variable
  // in the GPU global memory
  // One needs to copy only the seeds. 
  // The original values of the matrix are not needed
  result = cudaMemcpy(seeds_d, seeds, sizeof(seed)*nSeeds, cudaMemcpyHostToDevice);
  if (result != cudaSuccess) {
    printf("cudaMemcpy - Host-> GPU failed - seeds_d \n");
    exit(1);
  }

  //execution configuration...
  // Indicate the dimension of the block
  dim3 dimblock(BLOCK_SIZE, BLOCK_SIZE);
  // Indicate the dimension of the grid 
  nBlocks = matrixSize/BLOCK_SIZE;
  dim3 dimgrid(nBlocks, nBlocks);
  //actual computation: Call the kernel, the function that is
  // executed by each and every stream processor on the GPU card
  cu_calcEachCell<<>>(matrix_d, matrixSize,
					seeds_d, nSeeds);
  //read results back:
  // Copy the results from the memory in the GPU 
  // back to the memory on the host
  // One needs to copy only the matrix, the seeds did not change
  result = cudaMemcpy(matrix, matrix_d, sizeof(int)*matrixSize*matrixSize,
		      cudaMemcpyDeviceToHost);
  if (result != cudaSuccess) {
    printf("cudaMemcpy - GPU -> Host failed - matrix \n");
    exit(1);
  }

  // Release the memory on the GPU card
  cudaFree(matrix_d);
  cudaFree(seeds_d);
}