#include <stdio.h>
/* Emulated use of an NVIDIA C routine from Fortran */

/*
!  cublasSgemm  performs one of the matrix-matrix operations
!     C := alpha*op( A )*op( B ) + beta*C,
!  where  op( X ) is one of
!     op( X ) = X   or   op( X ) = X',
!  alpha and beta are scalars, and A, B and C are matrices, with op( A )
!  an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
*/

/*
 * Returns nonzero when a BLAS-style transpose flag selects op(X) = X'.
 * 'T'/'t' and 'C'/'c' both mean "transpose" for real data; every other
 * value (including 'N'/'n') is treated as "no transpose".
 */
static int sgemm_emu_is_transposed(char flag)
{
    return flag == 'T' || flag == 't' || flag == 'C' || flag == 'c';
}

/*
 * Prints every element of a column-major rows-by-cols matrix X with leading
 * dimension ld, column by column, using 1-based (i,j) indices in the output.
 * label is the matrix name shown in each line ('A', 'B' or 'C').
 */
static void sgemm_emu_print_matrix(char label, const float *X,
                                   int rows, int cols, int ld)
{
    for (int col = 1; col <= cols; col++) {
        for (int row = 1; row <= rows; row++) {
            printf("Value of i,j and %c(i,j) = %d, %d, %e\n", label, row, col,
                   X[(size_t)(row - 1) + (size_t)ld * (size_t)(col - 1)]);
        }
    }
}

/*
 * Host-side emulation of cublasSgemm:
 *
 *     C := alpha*op( A )*op( B ) + beta*C
 *
 * where op( X ) = X or op( X ) = X', op( A ) is m by k, op( B ) is k by n and
 * C is m by n.  All matrices are stored column-major.
 *
 * transa, transb  'N'/'n' selects op(X) = X; 'T'/'t'/'C'/'c' selects
 *                 op(X) = X'.  Any other value is treated as 'N'.
 * m, n, k         Dimensions of op(A), op(B) and C as described above.
 * alpha, beta     Scalars.  When beta is zero, C is not read, so it may
 *                 hold arbitrary (even NaN) values on entry.
 * A, lda          Stored matrix A and its leading dimension.  A is m by k
 *                 when not transposed and k by m when transposed.
 * B, ldb          Stored matrix B and its leading dimension.  B is k by n
 *                 when not transposed and n by k when transposed.
 * C, ldc          Result matrix (m by n) and its leading dimension;
 *                 overwritten in place.
 *
 * Besides computing C, the routine prints all arguments, the stored input
 * matrices and the result to stdout so a caller can check the interface.
 */
void cublasSgemm (char transa, char transb, int m, int n,
                  int k, float alpha,  const float *A, int lda,
                  const float *B, int ldb, float beta,
                  float *C, int ldc)
{
    const int a_transposed = sgemm_emu_is_transposed(transa);
    const int b_transposed = sgemm_emu_is_transposed(transb);

    // Print out input arguments that show a correct interface.
    printf(" Values of adjoint flags TRANSA, TRANSB = %c,%c\n",transa,transb);
    printf(" Values of sizes m,n,k,lda,ldb and ldc = %d,%d,%d,%d,%d,%d\n",
           m,n,k,lda,ldb,ldc);
    printf(" Values of alpha, beta = %e, %e\n",alpha,beta);

    // Print out input arrays A,B in their stored (column-oriented) shape;
    // the stored shape swaps rows and columns when the operand is transposed.
    sgemm_emu_print_matrix('A', A, a_transposed ? k : m,
                           a_transposed ? m : k, lda);
    printf("\n");
    sgemm_emu_print_matrix('B', B, b_transposed ? n : k,
                           b_transposed ? k : n, ldb);

    /* Multiplication of op(A) and op(B) */
    printf("\n");
    for (int row = 0; row < m; row++) {
        for (int col = 0; col < n; col++) {
            float *c_elem = &C[(size_t)row + (size_t)ldc * (size_t)col];

            // With beta == 0 the old contents of C must not contribute:
            // multiplying would propagate NaN/Inf from uninitialised data.
            float acc = (beta == 0.0f) ? 0.0f : beta * (*c_elem);

            for (int inner = 0; inner < k; inner++) {
                // op(A)(row,inner): swap the indices when A is transposed.
                const float a_val = a_transposed
                    ? A[(size_t)inner + (size_t)lda * (size_t)row]
                    : A[(size_t)row + (size_t)lda * (size_t)inner];
                // op(B)(inner,col): swap the indices when B is transposed.
                const float b_val = b_transposed
                    ? B[(size_t)col + (size_t)ldb * (size_t)inner]
                    : B[(size_t)inner + (size_t)ldb * (size_t)col];

                acc = acc + alpha * a_val * b_val;
            }
            *c_elem = acc;
        }
    }

    printf("\n");
    sgemm_emu_print_matrix('C', C, m, n, ldc);

    return;
}