MODULE NVIDIA_LIBRARY USE ISO_C_BINDING IMPLICIT NONE ! Define the INTERFACE to the NVIDIA C code cublasSgemm and to other helper ! functions (cublasAlloc,cublasFree, cublasSetMatrix,cublasGetMatrix) integer(c_int),parameter :: fsize = 4 ! Note: For OpenMP use (threads) sz and dev are !OMP THREAD PRIVATE integer :: sz(3)=0 type(c_ptr) :: dev(3) INTERFACE ! ! void cublasSgemm (char transa, char transb, int m, int n, ! int k, float alpha, const float *A, int lda, ! const float *B, int ldb, float beta, float *C, int ldc) subroutine c_sgemm(cta, ctb, m, n, k,& alpha, A, lda, B, ldb, beta, c, ldc) bind(C,name='cublasSgemm') use iso_c_binding character(KIND=c_char),value :: cta, ctb integer(c_int),value :: m,n,k,lda,ldb,ldc real(c_float),value :: alpha,beta type(c_ptr),value :: A,B,C end subroutine c_sgemm ! ! cublasStatus CUBLASAPI cublasAlloc (int n, int elemSize, void **devicePtr); ! integer (c_int) function cublasAlloc(n,size,ptr)& bind(C,name='cublasAlloc') use iso_c_binding integer(c_int), value:: n, size integer(c_int) :: retVal type(c_ptr) :: ptr end function cublasAlloc ! ! cublasStatus CUBLASAPI cublasFree (const void *devicePtr); ! integer (c_int) function cublasFree(ptr) bind(C,name='cublasFree') use iso_c_binding integer(c_int) :: retVal type(c_ptr),value :: ptr end function cublasFree ! ! cublasStatus CUBLASAPI cublasSetMatrix (int rows, int cols, int elemSize, ! const void *A, int lda, void *B, ! int ldb); integer (c_int) function cublasSetMatrix(rows,cols,elemSize,A,lda,devA,ldd)& bind(C,name='cublasSetMatrix') use iso_c_binding integer(c_int), value :: rows,cols, elemSize,lda,ldd integer(c_int) :: retVal real(c_float) :: A(lda,*) type(c_ptr), value :: devA end function cublasSetMatrix ! !cublasStatus CUBLASAPI cublasGetMatrix (int rows, int cols, int elemSize, ! const void *A, int lda, void *B, ! int ldb); integer (c_int) function cublasGetMatrix(rows,cols,elemSize,devA,lda,B,ldb)& bind(C,name='cublasGetMatrix') use iso_c_binding integer(c_int), value:: rows,cols, elemSize,lda,ldb integer(c_int) :: retVal type(c_ptr), value:: devA real(c_float):: B(ldb,*) end function cublasGetMatrix end interface CONTAINS subroutine check_buffer_allocation(isz) integer,intent(inout) :: isz(*) integer I, IERR !OMP CRITICAL DO I=1,3 ! If no allocation has been done then allocate. ! If the current buffer size is too small then ! free the old allocation and reallocate. if(isz(I) <= 0) then ! If IERR values are non-zero the allocation did ! not succeed. Print error messages, perhaps. if(sz(I) > 0)IERR=cublasFree(dev(I)) else if(sz(I) > 0 .and. sz(I)< isz(I)) IERR=cublasFree(dev(I)) IERR=cublasAlloc(isz(I),fsize,dev(I)) END IF sz(I)=isz(I) isz(I)=IERR END DO !OMP END CRITICAL end subroutine END MODULE PROGRAM CALL_SGEMM USE NVIDIA_LIBRARY, ONLY:check_buffer_allocation IMPLICIT NONE INTEGER, PARAMETER :: M=4,N=3,K=2 INTEGER, PARAMETER :: LDA=M+1, LDB=K+1, LDC=M+1 REAL ALPHA, BETA REAL A(LDA,K), B(LDB,N), C(LDC,N), D(LDC,N) ! A is M by K, B is K by N and C is M by N. INTEGER I, J, ISZ(3) INTERFACE SUBROUTINE SGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) IMPLICIT NONE ! .. Scalar Arguments .. REAL ALPHA,BETA INTEGER K,LDA,LDB,LDC,M,N CHARACTER(LEN=*) TRANSA,TRANSB ! .. ! .. Array Arguments .. REAL A(LDA,*),B(LDB,*),C(LDC,*) END SUBROUTINE END INTERFACE DO J=1,K DO I=1,M A(I,J)=real(I)+real(J)/10. END DO END DO DO J=1,N DO I=1,K B(I,J)=real(I)+real(J)/10. END DO END DO ALPHA=0.5 BETA=0.1 C=0.01 D(1:M,1:N)=alpha*matmul(A(1:M,1:K),B(1:K,1:N))+beta*C(1:M,1:N) CALL SGEMM('No','no',M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) D(1:M,1:N)=D(1:M,1:N)-C(1:M,1:N) ! Should have D(1:M,1:N) = C(1:M,1:N) except for rounding errors. WRITE(*,*) 'Largest Difference: ', MAXVAL(ABS(D(1:M,1:N))) ! Clear memory in the NVIDIA processor. ! The special values ISZ=0 results in clearing. ISZ=0 call check_buffer_allocation(isz) END PROGRAM subroutine sgemm(transa,transb,m,n,k,alpha,& a,lda,b,ldb,beta,c,ldc) use NVIDIA_LIBRARY implicit none ! .. Scalar Arguments .. real alpha,beta integer k,lda,ldb,ldc,m,n character(len=*) transa,transb integer :: retVal=0,retValA=0,retValB=0,retValC=0 ! .. Array Arguments .. real a(lda,*),b(ldb,*),c(ldc,*) character(KIND=c_char) :: cta, ctb integer isz(3) REAL(kind(1.e0)) :: ZERO=0.e0 ! ! The calculation, including memory initialization- ! ! Allocate the memory on the GPU ! If it is already allocted then use it. ! If the size is too small then re-allocate it ! to have the right size. ! Possible refinements of this code should be ! considered for actual use. Namely, use the ! NVIDIA version for values of m,k,n >= some ! threshold value. Otherwise use an alternate ! version that does not move data from the CPU ! to the GPU. isz=(/m*k,k*n,m*n/) call check_buffer_allocation(isz) ! Any error flags for allocation of dev(I) returned ! as an output isz(I). if(any(isz /= 0))& print *,"Error in memory allocation" ! ! Move the data from CPU memory to GPU memory, as needed. ! if(alpha /= zero) THEN retvala=cublassetmatrix(m,k,fsize,a,lda,dev(1),m) retvalb=cublassetmatrix(k,n,fsize,b,ldb,dev(2),k) END IF ! If beta = 0. then the values of C(:,:) are not needed ! by the device. This saves transfering unneeded data. if(beta /= zero)& retValC=cublasSetMatrix(m,n,fsize,C,ldc,dev(3),m) if ( (retValA .ne. 0) .or. (retValB .ne. 0) .or. (retValC .ne. 0) )& print *,"Error in memory copy from CPU to GPU" cta=transa(1:1); ctb=transb(1:1) call c_sgemm(cta, ctb, m, n, k, alpha,& dev(1), m, dev(2), k, beta, dev(3), m) ! ! Move the result from GPU memory to CPU memory ! retVal=cublasGetMatrix(m,n,fsize,dev(3),m,C,ldc) if (retVal .ne. 0)& print *,"Error in memory copy from GPU to CPU" return end