Found something interesting about GPU programming using CUDA that isn't obvious, is hard to find or figure out from the documentation from NVIDIA, or is just plain cool? Pop it on this page.
To select a device to run your GPU kernels on, include the header
#include <cuda_runtime_api.h>
and then the following code will select a device and check for errors
// Select GPU number deviceNumber; end the program with exit status 1 if cudaSetDevice does not return cudaSuccess.
if( cudaSetDevice(deviceNumber) != cudaSuccess ) exit( 1 );
where deviceNumber, an int, is the GPU device index number to be used (0 to n-1). You'll need to run a device querying program (one is an example in the SDK) to know what devices are on the machine you're using.
A simple generic make file:
# Build tools
NVCC = /usr/local/cuda-3.2/cuda/bin/nvcc
CXX = g++

# here are all the objects
GPUOBJS = cuexample.o
OBJS = cppexample.o

# make and compile
# NOTE: every recipe line below must start with a TAB character, not spaces.
# Link with nvcc so the CUDA runtime library is pulled in.
cudaexample.out: $(OBJS) $(GPUOBJS)
	$(NVCC) -o cudaexample.out $(OBJS) $(GPUOBJS)

# Device code is compiled by nvcc.
cuexample.o: cuexample.cu
	$(NVCC) -c cuexample.cu

# Plain C++ code is compiled by the host compiler.
cppexample.o: cppexample.cpp
	$(CXX) -c cppexample.cpp
If you wish to compile with CUDA 4.0 instead of CUDA 3.2, simply replace ‘cuda-3.2’ with ‘cuda-4.0’ in NVCC’s path. Also, when using 4.0, the default library path is ‘LD_LIBRARY_PATH:/usr/local/cuda-4.0/cuda/lib’, which contains the 32-bit libraries; the program will crash at runtime because it needs the 64-bit versions. To fix this, add the following to your ‘~/.bash_profile’:
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-4.0/cuda/lib64
You can test out the above Makefile with the following cppexample.cpp and cuexample.cu:
cppexample.cpp:
#include <cstdlib>
#include <cstdio>
// Side length of the square table printed by main; main also passes it to
// exampleHost as both the block count and the threads-per-block count.
const int DIMENSION = 10;
// Defined in cuexample.cu. Declared extern "C" (C linkage) here and at its
// definition so the separately compiled object files agree on the symbol.
// Copies blockDim * threadDim floats computed on the GPU into h.
extern "C" void exampleHost(float * h, int blockDim, int threadDim);
/*
 * Entry point: asks exampleHost to fill a DIMENSION x DIMENSION table of
 * floats and prints it, one row per line.
 *
 * Returns EXIT_SUCCESS (0) after printing, or EXIT_FAILURE if the host
 * buffer cannot be allocated.
 */
int main(void){
    const size_t count = (size_t)DIMENSION * DIMENSION;

    // calloc rather than malloc: the buffer is zero-initialised, so the
    // printout below is well-defined even if exampleHost could not fill it.
    float * h = (float *)calloc(count, sizeof(float));
    if (h == NULL) {
        fprintf(stderr, "cppexample: could not allocate %lu floats\n",
                (unsigned long)count);
        return EXIT_FAILURE;
    }

    // DIMENSION blocks of DIMENSION threads -> one value per table cell.
    exampleHost(h, DIMENSION, DIMENSION);

    for(int i = 0; i < DIMENSION; i++){
        for(int j = 0; j < DIMENSION; j++){
            printf("%2.0f ",h[i*DIMENSION+j]);
        }
        printf("\n");
    }

    free(h);
    // 0 signals success to the shell; the original returned 1 (failure).
    return EXIT_SUCCESS;
}
cuexample.cu:
/*
 * Kernel: each thread stores its own global (flattened) thread index into d.
 *
 * Launch layout: 1D grid of 1D blocks (only the .x components are used).
 * Precondition: d must have room for at least gridDim.x * blockDim.x floats;
 * there is no bounds check, so every launched thread writes one element.
 */
__global__ void exampleDevice(float * d){
    const int globalIdx = threadIdx.x + blockDim.x * blockIdx.x;
    d[globalIdx] = static_cast<float>(globalIdx);
}
#include <cstdio>   // fprintf/stderr/NULL for error reporting below

/*
 * Host wrapper: allocates a device buffer, launches exampleDevice with
 * blockDim blocks of threadDim threads, and copies the result back to h.
 *
 * h         - host buffer with room for blockDim * threadDim floats
 * blockDim  - number of blocks in the 1D grid (must be > 0)
 * threadDim - number of threads per block (must be > 0)
 *
 * On any CUDA failure a message is written to stderr and h is left
 * unmodified; the device buffer is released on every path.
 */
extern "C" void exampleHost(float * h, int blockDim, int threadDim){
    if (h == NULL || blockDim <= 0 || threadDim <= 0) {
        fprintf(stderr, "exampleHost: invalid arguments\n");
        return;
    }

    // Do the size arithmetic in size_t so the int product cannot overflow.
    const size_t bytes = (size_t)blockDim * (size_t)threadDim * sizeof(float);

    float * d = NULL;
    cudaError_t err = cudaMalloc((void**)&d, bytes);

    if (err == cudaSuccess) {
        exampleDevice<<<blockDim, threadDim>>>(d);
        // A launch does not return a status; a bad launch configuration is
        // only visible through cudaGetLastError().
        err = cudaGetLastError();
    }

    if (err == cudaSuccess) {
        // Blocking copy: also reports errors raised while the kernel ran.
        err = cudaMemcpy(h, d, bytes, cudaMemcpyDeviceToHost);
    }

    if (err != cudaSuccess) {
        fprintf(stderr, "exampleHost: CUDA error: %s\n", cudaGetErrorString(err));
    }

    // Release the device buffer (the original leaked it). cudaFree on a
    // null pointer performs no operation, so this is safe on failure paths.
    cudaFree(d);
}