/*!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
! Copyright 2010.  Los Alamos National Security, LLC. This material was    !
! produced under U.S. Government contract DE-AC52-06NA25396 for Los Alamos !
! National Laboratory (LANL), which is operated by Los Alamos National     !
! Security, LLC for the U.S. Department of Energy. The U.S. Government has !
! rights to use, reproduce, and distribute this software.  NEITHER THE     !
! GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,     !
! EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS         !
! SOFTWARE.  If software is modified to produce derivative works, such     !
! modified software should be clearly marked, so as not to confuse it      !
! with the version available from LANL.                                    !
!                                                                          !
! Additionally, this program is free software; you can redistribute it     !
! and/or modify it under the terms of the GNU General Public License as    !
! published by the Free Software Foundation; version 2.0 of the License.   !
! Accordingly, this program is distributed in the hope that it will be     !
! useful, but WITHOUT ANY WARRANTY; without even the implied warranty of   !
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General !
! Public License for more details.                                         !
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!*/

#include "Matrix.h"

/* Map the names used in this file onto external symbols with a trailing
 * underscore.  NOTE(review): presumably this matches the symbol naming of a
 * Fortran-compiled BLAS library -- confirm for the toolchain in use. */
#define sgemm sgemm_
#define dgemm dgemm_

/* Single-precision general matrix-matrix multiply, declared with C linkage.
 * Every argument, including scalars and dimensions, is passed by pointer.
 * NOTE(review): the dimension parameters are named l, n, m here; the
 * reference BLAS interface calls the same three positions m, n, k. */
extern "C" void sgemm(const char *transa, const char *transb,
   int *l, int *n, int *m, float *alpha,
   const void *a, int *lda, void *b, int *ldb,
   float *beta, void *c, int *ldc);

/* Double-precision counterpart of sgemm: identical argument list except that
 * alpha and beta point to double. */
extern "C" void dgemm(const char *transa, const char *transb,
   int *l, int *n, int *m, double *alpha, 
   const void *a, int *lda, void *b, int *ldb, 
   double *beta, void *c, int *ldc);

/*
 * M_Multiply (matrix * matrix): computes C = A * B with one GEMM call.
 *
 * Both operands are used untransposed ('N', 'N') with alpha = 1 and
 * beta = 0, so the product replaces the contents of C.
 *
 *   A, B : input matrices.
 *   C    : destination matrix.
 *
 * CUDA build: calls cublasSgemm/cublasDgemm on the Device buffers using the
 *             DM/DN dimensions (rows = A.DM, cols = B.DN, inner = A.DN;
 *             leading dimensions A.DM, B.DM, C.DM).
 * BLAS build: calls sgemm/dgemm on the Local buffers using the M/N
 *             dimensions (rows = A.M, cols = B.N, inner = A.N;
 *             leading dimensions A.M, B.M, C.M).
 *
 * The precision is selected at compile time by REALSIZE (4 -> single,
 * 8 -> double).  No dimension compatibility check is performed here; the
 * caller is responsible for passing conforming matrices.
 */
void M_Multiply(Matrix A, Matrix B, Matrix C) {
  /* Without this guard an unsupported (or undefined) REALSIZE would compile
     both branches below to nothing, and the function would silently leave C
     untouched. */
  #if (defined(CUDA) || defined(BLAS)) && REALSIZE!=4 && REALSIZE!=8
    #error "M_Multiply: REALSIZE must be 4 (single) or 8 (double)"
  #endif
  #ifdef CUDA
    #if REALSIZE==4
      cublasSgemm('N', 'N', A.DM, B.DN, A.DN, 1.0, A.Device, A.DM, B.Device, B.DM, 0.0, C.Device, C.DM);
    #elif REALSIZE==8
      cublasDgemm('N', 'N', A.DM, B.DN, A.DN, 1.0, A.Device, A.DM, B.Device, B.DM, 0.0, C.Device, C.DM);
    #endif
  #endif
  #ifdef BLAS
    /* The gemm routines take alpha/beta by address, so they need named
       storage of the matching precision. */
    REAL ZERO=0.0, ONE=1.0;
    #if REALSIZE==4
      sgemm("N", "N", &A.M, &B.N, &A.N, &ONE, A.Local, &A.M, B.Local, &B.M, &ZERO, C.Local, &C.M);
    #elif REALSIZE==8
      dgemm("N", "N", &A.M, &B.N, &A.N, &ONE, A.Local, &A.M, B.Local, &B.M, &ZERO, C.Local, &C.M);
    #endif
  #endif
}

/*
 * M_Multiply (scalar * matrix): writes B = k * A, element by element.
 *
 *   k : scalar factor.
 *   A : input matrix.
 *   B : destination matrix; it is indexed over the same A.M*A.N element
 *       range as A, so it must hold at least that many elements.
 *
 * CUDA build: launches MultiplyScalarMatrixKernel with one block of
 *             NUM_THREADS threads on the Device buffers (dimensions A.DM,
 *             A.DN).
 * BLAS build: plain host loop over the Local buffers.
 */
void M_Multiply(REAL k, Matrix A, Matrix B) {
  #ifdef CUDA
    MultiplyScalarMatrixKernel<<<1,NUM_THREADS>>>(k, A.Device, A.DM, A.DN, B.Device, NUM_THREADS);
  #endif
  #ifdef BLAS
    /* Widen before multiplying: A.M*A.N evaluated in int overflows (undefined
       behaviour) once the element count exceeds INT_MAX.  Computing it once
       also keeps the bound out of the loop condition. */
    const long long count = (long long)A.M * (long long)A.N;
    for (long long i = 0; i < count; i++) {
      B.Local[i] = k * A.Local[i];
    }
  #endif
}
