/*!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
! Copyright 2010.  Los Alamos National Security, LLC. This material was    !
! produced under U.S. Government contract DE-AC52-06NA25396 for Los Alamos !
! National Laboratory (LANL), which is operated by Los Alamos National     !
! Security, LLC for the U.S. Department of Energy. The U.S. Government has !
! rights to use, reproduce, and distribute this software.  NEITHER THE     !
! GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,     !
! EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS         !
! SOFTWARE.  If software is modified to produce derivative works, such     !
! modified software should be clearly marked, so as not to confuse it      !
! with the version available from LANL.                                    !
!                                                                          !
! Additionally, this program is free software; you can redistribute it     !
! and/or modify it under the terms of the GNU General Public License as    !
! published by the Free Software Foundation; version 2.0 of the License.   !
! Accordingly, this program is distributed in the hope that it will be     !
! useful, but WITHOUT ANY WARRANTY; without even the implied warranty of   !
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General !
! Public License for more details.                                         !
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!*/

#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <time.h>

// Select the floating-point precision for this inclusion of the header.
// REALSIZE==4 maps REAL to float and Matrix to Matrix4; REALSIZE==8 maps
// REAL to double and Matrix to Matrix8. Each #undef drops any earlier
// definition of the macro before it is redefined.
// NOTE(review): there is no #else branch, so any other (or undefined)
// REALSIZE leaves REAL and Matrix as they were -- confirm this is intended.
#if REALSIZE==4
  #undef REAL
  #define REAL float
  #undef Matrix
  #define Matrix Matrix4
#elif REALSIZE==8
  #undef REAL
  #define REAL double
  #undef Matrix
  #define Matrix Matrix8
#endif

// CUDA-only configuration: includes the cuBLAS header and the project's
// Kernels/Kernels.h, then defines NUM_THREADS as 256.
// NOTE(review): the #undef removes lower-case `num_threads`, while the
// #define creates upper-case `NUM_THREADS`; presumably the #undef was meant
// to target NUM_THREADS -- confirm against Kernels/Kernels.h.
// NOTE(review): NUM_THREADS is presumably the thread count used for kernel
// launches -- verify at the use sites.
#ifdef CUDA
  #include "cublas.h"
  #include "Kernels/Kernels.h"
  #undef num_threads
  #define NUM_THREADS 256
#endif

#ifdef BLAS
  #include <pthread.h>
#endif

// Matrix descriptor: four integer dimensions plus two REAL data pointers.
// Because `Matrix` is a macro when REALSIZE is 4 or 8, the typedef name
// actually produced is Matrix4 or Matrix8 in those configurations.
typedef struct {
  int M;         // presumably the logical row count -- TODO confirm
  int N;         // presumably the logical column count -- TODO confirm
  int DM;        // presumably the row count of the device copy -- TODO confirm
  int DN;        // presumably the column count of the device copy -- TODO confirm
  REAL *Local;   // presumably the host-side element buffer -- TODO confirm
  REAL *Device;  // presumably the device-side element buffer -- TODO confirm
} Matrix;

// Library-wide setup and teardown entry points; only declared here.
// NOTE(review): presumably Initialize() must run before any M_* routine and
// ShutDown() after the last one -- confirm against the implementation.
void Initialize();
void ShutDown();

// Matrix initialisers. A is taken by non-const reference, so the callee can
// update the caller's descriptor in place.
// NOTE(review): M_Init presumably sets A up as an M-by-N matrix and allocates
// its storage -- confirm.
// NOTE(review): M_InitWithLocal presumably makes A use the existing buffer
// iLocal with dimensions iM-by-iN; ownership of iLocal is not visible here --
// confirm who frees it.
void M_Init(Matrix &A, int M, int N);
void M_InitWithLocal(Matrix &A, REAL *iLocal, int iM, int iN);

// Transfer routines; the descriptor A is passed by value (its pointers are
// copied, not the data they point to).
// NOTE(review): presumably M_Push copies A.Local to A.Device and M_Pull
// copies A.Device back to A.Local -- confirm against the implementation.
void M_Push(Matrix A);
void M_Pull(Matrix A);

// Whole-matrix and per-column arithmetic. All descriptors are passed by value.
void M_Copy(Matrix A, Matrix B); // Copy A into B
// NOTE(review): presumably C = A + B, mirroring M_Subtract -- confirm.
void M_Add(Matrix A, Matrix B, Matrix C);

// NOTE(review): presumably column j of C = column j of A +/- k * column j of
// B; the exact operand roles are not visible here -- confirm.
void M_AddColumn(REAL k, int j, Matrix A, Matrix B, Matrix C);
void M_SubtractColumn(REAL k, int j, Matrix A, Matrix B, Matrix C);

void M_Subtract(Matrix A, Matrix B, Matrix C); // C=A-B

// Two overloads share the name M_Multiply and are distinguished by their
// parameter lists.
// NOTE(review): the three-matrix form is presumably C = A * B -- confirm.
void M_Multiply(Matrix A, Matrix B, Matrix C);
void M_Multiply(REAL scalar, Matrix A, Matrix B); // B=scalar*A

// NOTE(review): presumably adds the identity matrix to `a` in place -- confirm.
void M_AddIdentity(Matrix a);

// Scalar reductions returning a REAL.
// NOTE(review): M_Trace presumably returns the sum of A's diagonal; M_TraceX2
// presumably returns the trace of A*A -- confirm against the implementation.
REAL M_Trace(Matrix A);
REAL M_TraceX2(Matrix A);

// NOTE(review): presumably the dot product of column j of A with column j of
// B. The identifier is spelled "Procut" (likely "Product"); it is part of the
// public interface, so a rename must be coordinated with every caller.
REAL M_DotProcutOfColumn(int j, Matrix A, Matrix B);

// NOTE(review): presumably performs one conjugate-gradient iteration on the
// given matrices and returns a REAL convergence measure; the meaning of each
// argument and of the return value is not visible here -- confirm.
REAL M_CGIterate(Matrix bo, Matrix p0, Matrix tmpmat, Matrix r0);

// Utility routines.
// NOTE(review): M_Randomize presumably fills A with random values and M_Print
// presumably writes A's contents to standard output -- confirm.
void M_Randomize(Matrix A);
void M_Print(Matrix A);
// The deallocators take A by non-const reference, so they can modify the
// caller's descriptor.
// NOTE(review): presumably they release A.Device and A.Local respectively;
// whether the pointer is reset afterwards is not visible here -- confirm.
void M_DeallocateDevice(Matrix &A);
void M_DeallocateLocal(Matrix &A);

// Allocation helper taking a text label, an existing pointer, and a byte
// count, and returning a void pointer.
// NOTE(review): Label is presumably used in diagnostics and Pointer is
// presumably a previous allocation to be reused or resized -- confirm, along
// with the behaviour on allocation failure.
void *Allocate(const char Label[], void *Pointer, size_t Size);

// Only declared in BLAS builds. The function has C linkage and the
// void *(*)(void *) shape that pthread_create requires of a thread start
// routine.
// NOTE(review): the layout of the object passed through `arg` and the meaning
// of the returned pointer are not visible here -- confirm.
#ifdef BLAS
extern "C" void *CGIterateThreaded(void *arg);
#endif

// Solver entry points operating on raw REAL buffers rather than Matrix
// descriptors. The *_nospin variants take a single bo/h buffer pair; the
// *_spin variants take separate up and down buffers for both.
// NOTE(review): hdim is presumably the dimension of the square matrices held
// in those buffers -- confirm.
// NOTE(review): the sp2* names presumably refer to second-order spectral
// projection (SP2) purification, with bo/rho buffers as outputs and h buffers
// as inputs -- confirm against the implementation.
// NOTE(review): chempot_pointer, kbt_pointer and beta0_pointer are passed by
// pointer and are presumably updated by the callee -- confirm.
void sp2pure_nospin(REAL bndfil, int  hdim, REAL *bo_pointer, REAL maxeval, REAL *h_pointer, REAL maxminusmin, int minsp2iter, REAL breaktol);
void sp2pure_spin(REAL bndfil, int  hdim, REAL *rhoup_pointer, REAL *rhodown_pointer, REAL maxeval, REAL *hup_pointer, REAL *hdown_pointer, REAL maxminusmin, int minsp2iter, REAL breaktol);
void sp2fermi_init_nospin(REAL bndfil, int hdim, REAL *bo_pointer, REAL maxeval, REAL *h_pointer, REAL maxminusmin, REAL *chempot_pointer, int norecs, REAL *kbt_pointer, REAL *beta0_pointer, REAL breaktol);
void sp2fermi_init_spin(REAL bndfil, int hdim, REAL *rhoup_ptr, REAL *rhodown_ptr, REAL maxeval, REAL *hup, REAL *hdown, REAL maxminusmin, REAL *chempot_pointer, int norecs, REAL *kbt_pointer, REAL *beta0_pointer, REAL breaktol);
void sp2fermi_nospin(REAL bndfil, int hdim, REAL *bo_pointer, REAL maxeval, REAL *h_pointer, REAL maxminusmin, REAL *chempot_pointer, int norecs, REAL *kbt_pointer, REAL *beta0_pointer, REAL breaktol);
void sp2fermi_spin(REAL bndfil, int hdim, REAL *rhoup_ptr, REAL *rhodown_ptr, REAL maxeval, REAL *hup, REAL *hdown, REAL maxminusmin, REAL *chempot_pointer, int norecs, REAL *kbt_pointer, REAL *beta0_pointer, REAL breaktol);
// NOTE(review): presumably solves a linear system by conjugate gradient on
// the hdim-sized buffer bo_ptr, with cgtol2 as a (squared?) tolerance -- confirm.
void solve_matrix_cg(REAL *bo_ptr, int hdim, REAL cgtol2);
