import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
import scipy as sp
from scipy.stats import norm
from pycuda.compiler import SourceModule
import math

# Here's the kernel, essentially identical to that used in the CUDA and RCUDA examples
mod = SourceModule("""
#include <stdio.h>

#define SQRT_TWO_PI 2.506628274631000

__global__ void calc_loglik(double *vals, double *x, int n, double mu, double sigma, int dbg) {
    // note that this assumes no third dimension to the grid
    int myblock = blockIdx.x + blockIdx.y * gridDim.x;
    // size of each block (within grid of blocks)
    int blocksize = blockDim.x * blockDim.y * blockDim.z;
    // id of thread in a given block
    int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x;
    // assign overall id/index of the thread
    int idx = myblock * blocksize + subthread;
    if (idx < n) {
        if (dbg){
            printf("thread idx: %04d\\t x[%d] = %f\\t (n=%d,mu=%f,sigma=%f)\\n",idx,idx,x[idx],n,mu,sigma);
        }
        double std = (x[idx] - mu)/sigma;
        double e = exp( - 0.5 * std * std);
        vals[idx] = e / ( sigma * SQRT_TWO_PI);
    } else {
        if (dbg){
            printf("thread idx: %04d\\t (>=n=%d)\\n",idx,n);
        }
    }
    return;
}
""")

calc_loglik = mod.get_function("calc_loglik")

# Arguments must be numpy datatypes, e.g., a plain Python int such as n = 1000 will not work!
n = np.int32(134217728)

# Threads per block and number of blocks:
threads_per_block = int(1024)
block_dims = (threads_per_block, 1, 1)
grid_d = int(math.ceil(math.sqrt(n / threads_per_block)))
grid_dims = (grid_d, grid_d, 1)

print("Generating random normals...")
x = np.random.normal(size = n)

# Evaluate at N(0.3, 1.5)
mu = np.float64(0.3)
sigma = np.float64(1.5)
dbg = False  # True
verbose = np.int32(dbg)

# Allocate storage for the result:
out = np.zeros_like(x)

# Create two timers:
start = drv.Event()
end = drv.Event()

# Launch the kernel
print("Running GPU code...")
start.record()
calc_loglik(drv.Out(out), drv.In(x), n, mu, sigma, verbose,
            block = block_dims, grid = grid_dims)
end.record()  # end timing
# calculate the run length
end.synchronize()
gpu_secs = start.time_till(end)*1e-3
print("Time for calculation (GPU): %fs" % gpu_secs)

# Scipy version:
print("Running Scipy CPU code...")
start.record()
out2 = norm.pdf(x, loc = mu, scale = sigma)
end.record()  # end timing
# calculate the run length
end.synchronize()
cpu_secs = start.time_till(end)*1e-3
print("Time for calculation (CPU): %fs" % cpu_secs)

print("Output from GPU: %f %f %f" % (out[0], out[1], out[2]))
print("Output from CPU: %f %f %f" % (out2[0], out2[1], out2[2]))
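
# Optional sanity check: verify that the full GPU and CPU results agree, not just the
# first three values printed above. The tolerances here are illustrative choices for
# double-precision results, not values prescribed by the example.
if np.allclose(out, out2, rtol = 1e-10, atol = 1e-12):
    print("GPU and CPU results agree to within tolerance.")
else:
    print("Warning: GPU and CPU results differ beyond tolerance.")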