#include #include #include #include #include #include "cublas_v2.h" #include "magma.h" #include "magma_lapack.h" // compile as: // gcc magmaExample.c -O3 -DADD_ -fopenmp -DHAVE_CUBLAS -I/usr/local/cuda/include -I/usr/local/magma/include -L/usr/local/cuda/lib64 -L/usr/local/magma/lib -lmagma -llapack -lblas -lcublas -lcudart -o magmaExample double read_timer() { struct timeval end; gettimeofday( &end, NULL ); return end.tv_sec+1.e-6*end.tv_usec; } // BLAS/LAPACK functions for matrix multiply and Cholesky // not needed as these are in magma_dlapack.h // void dgemm_( char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int* ); // int dpotrf_(char* uplo, int* n, double* a, int* lda, int* info); void fillMatrix( double *p, int n ) { int i; srand48(0); for( i = 0; i < n; i++ ) p[i] = 2*drand48()-1; } int main( int argc, char **argv ) { printf("Starting\n"); int size; cudaError_t cudaStat; magma_int_t magmaStat; cublasStatus_t stat; cublasHandle_t handle; int it,i; cublasOperation_t N = 'N'; cublasOperation_t T = 'T'; char N2 = 'N'; char T2 = 'T'; double one = 1., zero=0.; char uplo = 'L'; int info; int err; double* A; double* B; magmaStat = magma_init(); int use_pinned; if(argc > 1) { use_pinned = atoi(argv[1]); } else use_pinned = 0; printf("Setting use_pinned to %d\n", use_pinned); for( size = 512; size <= 8192; size*=4 ) { if(use_pinned) { // allocate pinned memory on CPU err = magma_dmalloc_pinned( &A, size*size ); assert( err == 0 ); err = magma_dmalloc_pinned( &B, size*size ); assert( err == 0 ); } else { // allocate standard memory on CPU A = (double*) malloc( sizeof(double)*size*size ); B = (double*) malloc( sizeof(double)*size*size ); } cudaDeviceSynchronize(); double tInit = read_timer(); double *dA,*dB; // allocate memory on GPU magma_malloc( (void**) &dA, sizeof(double)*size*size ); magma_malloc( (void**) &dB, sizeof(double)*size*size ); cudaDeviceSynchronize(); double tAlloc = read_timer(); fillMatrix(B, size*size); cudaDeviceSynchronize(); double tInit2 = read_timer(); // transfer data to GPU magma_dsetmatrix( size, size, B, size, dB, size ); cudaDeviceSynchronize(); double tTransferToGPU = read_timer(); // matrix multiply magmablas_dgemm(MagmaNoTrans, MagmaTrans, size, size, size, one, dB, size, dB, size, zero, dA, size ); // magma_dgemm may be more general in terms of being able to call GPU or MIC cudaDeviceSynchronize(); double tMatMult = read_timer(); // Cholesky decomposition on GPU with GPU interface (called with object on GPU) magma_dpotrf_gpu(MagmaLower, size, dA, size, &info ); cudaDeviceSynchronize(); double tChol = read_timer(); // transfer data back to CPU magma_dgetmatrix( size, size, dA, size, A, size ); cudaDeviceSynchronize(); double tTransferFromGPU = read_timer(); // standard BLAS matrix multiply on CPU dgemm_( &N2, &T2, &size, &size, &size, &one, B, &size, B, &size, &zero, A, &size ); cudaDeviceSynchronize(); double tMatMultBlas = read_timer(); // Cholesky decomposition on GPU with CPU interface (called with object on CPU) magma_dpotrf(MagmaLower, size, A, size, &info ); cudaDeviceSynchronize(); double tCholCpuInterface = read_timer(); // recreate A = B * B (could just do a save and copy instead....) dgemm_( &N2, &T2, &size, &size, &size, &one, B, &size, B, &size, &zero, A, &size ); cudaDeviceSynchronize(); double tInit3 = read_timer(); // standard Lapack Cholesky decomposition on CPU dpotrf_(&uplo, &size, A, &size, &info); cudaDeviceSynchronize(); double tCholCpu= read_timer(); printf("====================================================\n"); printf("Timing results for n = %d\n", size); printf("GPU memory allocation time: %f\n", tAlloc - tInit); printf("Transfer to GPU time: %f\n", tTransferToGPU - tInit2); printf("Matrix multiply time (GPU): %f\n", tMatMult - tTransferToGPU); printf("Matrix multiply time (BLAS): %f\n", tMatMultBlas - tTransferToGPU); printf("Cholesky factorization time (GPU w/ GPU interface): %f\n", tChol - tMatMult); printf("Cholesky factorization time (GPU w/ CPU interface): %f\n", tCholCpuInterface - tMatMultBlas); printf("Cholesky factorization time (LAPACK): %f\n", tCholCpu - tInit3); printf("Transfer from GPU time: %f\n", tTransferFromGPU - tChol); if(use_pinned) { magma_free_pinned(A); magma_free_pinned(B); } else { free(A); free(B); } magma_free(dA); magma_free(dB); } return EXIT_SUCCESS; }