# modification of one of the RCUDA examples to use double precision

library(RCUDA)

# flags are read with exists() so a driver script can set them before source()ing this file
if(!exists('unitStrides') || is.null(unitStrides)) unitStrides <- FALSE
if(!exists('sharedMem') || is.null(sharedMem)) sharedMem <- FALSE
if(!exists('float') || is.null(float)) float <- FALSE

M <- as.integer(1e4)  # important to have as integer!

# get the alphas and generate the random numbers
# (setup_calc.R is assumed to define n, K, alphas, and rands, all used below)
source('setup_calc.R')

cat("Setting cuGetContext(TRUE)...\n")
cuGetContext(TRUE)

# compile the kernel into a form that RCUDA can load; equivalent to this nvcc call:
# system("nvcc --ptx -arch=compute_20 -code=sm_20,compute_20 -o compute_probs.ptx compute_probs.cu")
fn <- "compute_probs"
if(unitStrides) fn <- paste0(fn, "_unitStrides")
if(sharedMem) fn <- paste0(fn, "_sharedMem")
if(float) fn <- paste0(fn, "_float")

ptx = nvcc(file = paste0(fn, ".cu"), out = 'compute_probs.ptx', target = "ptx",
           "-arch=compute_20", "-code=sm_20,compute_20")
mod = loadModule(ptx)
compute_probs = mod$compute_probs

# setting grid and block dimensions
threads_per_block <- as.integer(192)
if(sharedMem)
    threads_per_block <- as.integer(96)  # need fewer threads so that there is enough room in the 48KB of shared memory
block_dims <- c(threads_per_block, as.integer(1), as.integer(1))
grid_d <- as.integer(ceiling(n/threads_per_block))
grid_dims <- c(grid_d, as.integer(1), as.integer(1))

cat("Grid size:\n")
print(grid_dims)

nthreads <- prod(grid_dims)*prod(block_dims)
cat("Total number of threads to launch = ", nthreads, "\n")
if(nthreads < n) {
    stop("Grid is not large enough...!")
}

cat("Running CUDA kernel...\n")

# lay the matrices out so that the unit-stride kernel variant gets unit-stride memory access
if(unitStrides) {
    probs <- matrix(0, nrow = n, ncol = K)
    tmp <- matrix(0, nrow = n, ncol = K)  # (tmp is allocated but not used below)
    rands <- t(rands)
    alphas <- t(alphas)
} else {
    probs <- matrix(0, nrow = K, ncol = n)
    tmp <- matrix(0, nrow = K, ncol = n)
}

if(!float) {
    strict <- TRUE  # for double: keep R numerics as doubles on the device
    # use 8-byte shared memory banks to avoid bank conflicts with doubles
    cuCtxSetSharedMemConfig("CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE")
} else {
    strict <- FALSE
}

# per block: two K-length scratch arrays per thread, 4 bytes per float or 8 per double
sharedMemSize <- as.integer( ifelse(float, 4, 8)*K*threads_per_block*2 )
if(sharedMem && sharedMemSize > 48000)
    stop("trying to use too much shared memory")

# basic usage with manual transfer
tTransferToGPU <- system.time({
    devAlphas = copyToDevice(alphas, strict = strict)
    devRands = copyToDevice(rands, strict = strict)
    devProbs = copyToDevice(probs, strict = strict)
    cudaDeviceSynchronize()
})

tCalc <- system.time({
    if(float) {
        .cuda(compute_probs, devAlphas, devRands, devProbs, n, K, M,
              gridDim = grid_dims, blockDim = block_dims,
              sharedMemBytes = ifelse(sharedMem, sharedMemSize, as.integer(0)))
    } else {
        .cuda(compute_probs, devAlphas, devRands, devProbs, n, K, M,
              gridDim = grid_dims, blockDim = block_dims,
              sharedMemBytes = ifelse(sharedMem, sharedMemSize, as.integer(0)),
              .numericAsDouble = getOption("CUDA.useDouble", TRUE))
    }
    cudaDeviceSynchronize()
})

tTransferFromGPU <- system.time({
    # note: this script's focus is the double path (float = FALSE); with
    # float = TRUE the device buffer holds single precision, so reading it
    # back as "double" would misinterpret the data
    out = copyFromDevice(obj = devProbs, nels = devProbs@nels, type = "double")
    cudaDeviceSynchronize()
})

cat("Input values: ", alphas[1:3], "\n")
cat("Output values: ", out[1:3], "\n")
cat("Transfer to GPU time: ", tTransferToGPU[3], "\n")
cat("Calculation time (GPU): ", tCalc[3], "\n")
cat("Transfer from GPU time: ", tTransferFromGPU[3], "\n")
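
## --------------------------------------------------------------------------
## Rough sanity check (an addition, not part of the original example): assuming
## each length-K vector produced by compute_probs is a normalized probability
## vector, its entries should sum to ~1. The reshape mirrors how 'probs' was
## allocated above; this only catches gross transfer/launch errors.
outMat <- if(unitStrides) matrix(out, nrow = n, ncol = K) else matrix(out, nrow = K, ncol = n)
vecSums <- if(unitStrides) rowSums(outMat) else colSums(outMat)
cat("Range of probability-vector sums (expect ~1 if the kernel normalizes): ",
    range(vecSums), "\n")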
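
## --------------------------------------------------------------------------
## Usage sketch (an addition, not part of the original example): because the
## flags at the top are read with exists(), a driver script would set them
## before sourcing this file, e.g. (the file name here is hypothetical):
##   unitStrides <- TRUE; sharedMem <- FALSE; float <- FALSE
##   source('compute_probs_example.R')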