MODULE NVIDIA_LIBRARY
      USE ISO_C_BINDING
      IMPLICIT NONE
      
! Define the INTERFACE to the NVIDIA C code cublasSgemm and to other helper 
! functions (cublasAlloc,cublasFree, cublasSetMatrix,cublasGetMatrix)
      integer(c_int),parameter :: fsize = 4
! Note: For OpenMP use (threads) sz and dev are !OMP THREAD PRIVATE      
      integer :: sz(3)=0
      type(c_ptr) :: dev(3)
      
      INTERFACE
!
!      void cublasSgemm (char transa, char transb, int m, int n,
!                        int k, float alpha, const float *A, int lda,
!                  const float *B, int ldb, float beta, float *C, int ldc)
       subroutine c_sgemm(cta, ctb, m, n, k,&
         alpha, A, lda, B, ldb, beta, c, ldc) bind(C,name='cublasSgemm')
             use iso_c_binding
             character(KIND=c_char),value :: cta, ctb
             integer(c_int),value :: m,n,k,lda,ldb,ldc
             real(c_float),value :: alpha,beta
             type(c_ptr),value :: A,B,C
       end subroutine c_sgemm

!
!     cublasStatus CUBLASAPI cublasAlloc (int n, int elemSize, void **devicePtr);
!
       integer  (c_int) function cublasAlloc(n,size,ptr)&
         bind(C,name='cublasAlloc')
         use iso_c_binding
         integer(c_int), value:: n, size
         integer(c_int) :: retVal
         type(c_ptr) :: ptr
       end function cublasAlloc

!
!     cublasStatus CUBLASAPI cublasFree (const void *devicePtr);
!
       integer  (c_int) function cublasFree(ptr) bind(C,name='cublasFree')
         use iso_c_binding
         integer(c_int) :: retVal
         type(c_ptr),value :: ptr
       end function cublasFree
         
!
!    cublasStatus CUBLASAPI cublasSetMatrix (int rows, int cols, int elemSize,
!                                            const void *A, int lda, void *B,
!                                            int ldb);

       integer  (c_int) function cublasSetMatrix(rows,cols,elemSize,A,lda,devA,ldd)&
         bind(C,name='cublasSetMatrix')
         use iso_c_binding
         integer(c_int), value :: rows,cols, elemSize,lda,ldd
         integer(c_int) :: retVal
         real(c_float) :: A(lda,*)
         type(c_ptr), value :: devA
       end function cublasSetMatrix

!
!cublasStatus CUBLASAPI cublasGetMatrix (int rows, int cols, int elemSize,
!                                        const void *A, int lda, void *B,
!                                        int ldb);

       integer  (c_int) function cublasGetMatrix(rows,cols,elemSize,devA,lda,B,ldb)&
         bind(C,name='cublasGetMatrix')
         use iso_c_binding
         integer(c_int), value:: rows,cols, elemSize,lda,ldb
         integer(c_int) :: retVal
         type(c_ptr), value:: devA
         real(c_float):: B(ldb,*)
       end function cublasGetMatrix
      end interface
CONTAINS      
      subroutine check_buffer_allocation(isz)
        integer,intent(inout) :: isz(*)
        integer I, IERR
        
!OMP CRITICAL        
        DO I=1,3
! If no allocation has been done then allocate.
! If the current buffer size is too small then
! free the old allocation and reallocate.        
          if(isz(I) <= 0) then
! If IERR values are non-zero the allocation did 
! not succeed.  Print error messages, perhaps.          
            if(sz(I) > 0)IERR=cublasFree(dev(I))
          else
            if(sz(I) > 0 .and. sz(I)< isz(I)) IERR=cublasFree(dev(I))
            IERR=cublasAlloc(isz(I),fsize,dev(I))
          END IF
          sz(I)=isz(I)
          isz(I)=IERR
        END DO
!OMP END CRITICAL        
      end subroutine
              
      END MODULE
      
      PROGRAM CALL_SGEMM
      USE NVIDIA_LIBRARY, ONLY:check_buffer_allocation
      IMPLICIT NONE
      INTEGER, PARAMETER :: M=4,N=3,K=2
      INTEGER, PARAMETER :: LDA=M+1, LDB=K+1, LDC=M+1
      REAL ALPHA, BETA
      REAL A(LDA,K), B(LDB,N), C(LDC,N), D(LDC,N)
! A is M by K, B is K by N and C is M by N.
      INTEGER I, J, ISZ(3)
      INTERFACE
      SUBROUTINE SGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC)
          IMPLICIT NONE
      
!     .. Scalar Arguments ..
          REAL ALPHA,BETA
          INTEGER K,LDA,LDB,LDC,M,N
          CHARACTER(LEN=*) TRANSA,TRANSB
!     ..
!     .. Array Arguments ..
          REAL A(LDA,*),B(LDB,*),C(LDC,*)
      END SUBROUTINE      
      END INTERFACE    
          DO J=1,K
             DO I=1,M
                A(I,J)=real(I)+real(J)/10.
             END DO
          END DO
          DO J=1,N
             DO I=1,K
                B(I,J)=real(I)+real(J)/10.
             END DO
          END DO
          ALPHA=0.5
          BETA=0.1
          C=0.01 
          D(1:M,1:N)=alpha*matmul(A(1:M,1:K),B(1:K,1:N))+beta*C(1:M,1:N)

          CALL SGEMM('No','no',M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC)
          D(1:M,1:N)=D(1:M,1:N)-C(1:M,1:N)
! Should have D(1:M,1:N) = C(1:M,1:N) except for rounding errors. 
          WRITE(*,*) 'Largest Difference: ', MAXVAL(ABS(D(1:M,1:N)))

! Clear memory in the NVIDIA processor.
! The special values ISZ=0 results in clearing.
         ISZ=0
         call check_buffer_allocation(isz)          
      END PROGRAM
      

      
subroutine sgemm(transa,transb,m,n,k,alpha,&
        a,lda,b,ldb,beta,c,ldc)
      
      use NVIDIA_LIBRARY
      implicit none
      
!     .. Scalar Arguments ..
      real alpha,beta
      integer k,lda,ldb,ldc,m,n
      character(len=*) transa,transb
      integer :: retVal=0,retValA=0,retValB=0,retValC=0

!     .. Array Arguments ..
      real a(lda,*),b(ldb,*),c(ldc,*)
      character(KIND=c_char) :: cta, ctb
      integer isz(3)
      REAL(kind(1.e0)) :: ZERO=0.e0
!
! The calculation, including memory initialization-

!
!   Allocate the memory on the GPU
!   If it is already allocted then use it.
!   If the size is too small then re-allocate it
!   to have the right size.
!   Possible refinements of this code should be 
!   considered for actual use.  Namely, use the
!   NVIDIA version for values of m,k,n >= some
!   threshold value.  Otherwise use an alternate
!   version that does not move data from the CPU
!   to the GPU.
      isz=(/m*k,k*n,m*n/)
      call check_buffer_allocation(isz)
!   Any error flags for allocation of dev(I) returned
!   as an output isz(I).      
      if(any(isz /= 0))&
        print *,"Error in memory allocation"
!
!   Move the data from CPU memory to GPU memory, as needed.
!
      if(alpha /= zero) THEN
        retvala=cublassetmatrix(m,k,fsize,a,lda,dev(1),m)
        retvalb=cublassetmatrix(k,n,fsize,b,ldb,dev(2),k)
      END IF
! If beta = 0. then the values of C(:,:) are not needed
! by the device.  This saves transfering unneeded data.      
      if(beta /= zero)&
        retValC=cublasSetMatrix(m,n,fsize,C,ldc,dev(3),m)

      if ( (retValA .ne. 0) .or. (retValB .ne. 0) .or. (retValC .ne. 0) )&
         print *,"Error in memory copy from CPU to GPU"
         
      cta=transa(1:1); ctb=transb(1:1)
      call c_sgemm(cta, ctb, m, n, k, alpha,&
                   dev(1), m, dev(2), k, beta, dev(3), m)
!
!   Move the result from GPU memory to CPU memory
!
      retVal=cublasGetMatrix(m,n,fsize,dev(3),m,C,ldc)
      if (retVal .ne. 0)&
       print *,"Error in memory copy from GPU to CPU"
      return     
      end