/*
 * $Id: solver.c,v 1.14 2003/12/01 09:50:16 nicoo Exp $
 *
 *
 * Copyright (C) 1999, 2000, 2001 Nicolas LAURENT
 * This file is part of `Haplo'
 * 
 *
 * `Haplo'  is free software;  you can  redistribute  it and/or modify it
 * under the terms of the GNU Library General Public License as published
 * by the Free Software Foundation;  either version 2  of the License, or
 * (at your option) any later version.
 *
 * `Haplo' is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the  implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the  GNU General Public License
 * for more details.
 * 
 * You should have  received  a copy of the   GNU General Public  License
 * along with `Haplo'.  If not, write to  the
 *
 *                                        Free Software Foundation,  Inc.
 *                                        675 Mass Ave, Cambridge, MA
 *                                        02139, USA.
 *
 */

/*
 * This file contains implementations of 2 solvers:
 *
 * - Direct Solver: LDLT factorization with hand optimized code for x86.
 * - Iterative Solver: conjugate gradient
 *
 *
 * References:
 *
 * "Analyse numrique matricielle applique  l'art de l'ingnieur"
 *                      P. LASCAUX / R. THEODOR
 * 
 *  Tome 1 - ISBN 2-225-84122-5 - pages 218-222, 234-250
 *  Tome 2 - ISBN 2-225-84546-8 - pages 422-423
 * 
 */


#include <haplo.h>
#include <float.h>
#include <math.h>
#include <string.h>

#include <stdio.h>

#include "matrix.h"
#include "solver.h"


/*-----------------------------------------------------------------------------
                             P R O T O T Y P E S 
-----------------------------------------------------------------------------*/

static int solver_check(const fem_matrix_t *matrix,
			const fem_vector_t *vector);
static double solver_dot(const double *a, const double *b, unsigned long n);

/* LDLT */
static void solver_ldlt_factorize(fem_matrix_t *mat, double epsilon);
static void solver_ldlt_invert(fem_matrix_t *mat, fem_vector_t *vec);
static void solver_ldlt(fem_matrix_t *mat, fem_vector_t *vec, double epsilon);
fem_vector_t *fem_solver_ldlt(fem_matrix_t *A, const fem_vector_t *b);

/* CG */
static void solver_cg_descent(unsigned long N, double *x, double *r,
			      const double *p, const double *q, double a);
static double solver_cg_finish(unsigned long N, double *p, double b,
			     const double *z);
static void solver_cg_ssor(const matrix_t *mat, double *z, const double *r,
			   const double w);
static double solver_cg_init(const matrix_t *A, double omega, vector_t *p,
			     vector_t *q, vector_t *r, vector_t *x,
			     vector_t *z);
static int solver_cg(const fem_matrix_t *A, fem_vector_t *x,
		      double epsilon, double omega);
fem_vector_t *fem_solver_cg_omega(const fem_matrix_t *A,
				  const fem_vector_t *b, const double *omega);
fem_vector_t *fem_solver_cg(const fem_matrix_t *A, const fem_vector_t *b);


/*-----------------------------------------------------------------------------
                         I M P L E M E N T A T I O N 
-----------------------------------------------------------------------------*/


/**
 *
 */
static int solver_check(const fem_matrix_t *matrix, const fem_vector_t *vector)
{
	int status=0;
	
	if (matrix->size != vector->size)
	{
		status = -1;
		haplo_error("Matrix and vector's dimensions missmatch.");
	}

	return(status);
}


/*
 * Common functions
 */


/**
 * just a simple scalar product.
 */
static double solver_dot(const double *a, const double *b, unsigned long n)
{
#define FEM_SOLVER_OPTIMIZED 
#define FEM_SOLVER_OPTIMIZED_SSE

#ifndef FEM_SOLVER_OPTIMIZED
	/* This writing is better optimized on i386 */
	double res = 0.0;
	
	while(n-- > 0)
		res += *(a++) * *(b++);

	return(res);
#else /* only x86/gcc for now... */
	/*
	 * Hand optimized code
	 * This code speed up things by ~40% on my box! (Athlon-XP) :
	 *
	 *   Matrix (3500x3500, 50.0%)  : 19.44s -> 10.92s (-44%)
	 *   Matrix (10000x10000, 20.0%): 72.80s -> 40.27s (-45%)
	 */
	double res;
	
	__asm__ __volatile__ ("fldz\n\t"
			      "testl	%1, %1\n\t"
			      "jbe	.fem.dot.end\n"
			      ".fem.dot.loop:\n\t"
			      "subl	$1, %1\n\t"
#	ifdef FEM_SOLVER_OPTIMIZED_SSE
			      /*
			       * Theses 2 instructions do all the job!
			       * Whithout them, we get "only" 17% speedup
			       * note: 128 is an empirical value...
			       */
			      "prefetcht2 -128(%3, %1, 8)\n\t"
			      "prefetcht2 -128(%2, %1, 8)\n\t"
#	endif /* FEM_SOLVER_OPTIMIZED_SSE */
			      "fldl	(%3, %1, 8)\n\t"
			      "fmull	(%2, %1, 8)\n\t"
			      "faddp	%0, %0(1)\n\t"
			      "jne	.fem.dot.loop\n"
			      ".fem.dot.end:"
			      : "=%%st" (res)
			      : "q" (n) , "q" (a), "q" (b));
	return(res);
#endif /* FEM_SOLVER_OPTIMIZED */
}


/*
 * LDLT
 */

/**
 *
 */
static void solver_ldlt_factorize(fem_matrix_t *mat, double epsilon)
{
	unsigned long i;
	haplo_timer_t timer;
	
	/*
	 * Algorithm:
	 *
	 * FOR I=1 (2) TO N DO
	 *	FOR J=1 (2) TO I-1 DO
	 *		FOR K=1 TO J-1 DO
	 *			A(I,J)=A(I,J)-A(I,K)*A(J,K)
	 *		DONE
	 *	DONE
	 * 	FOR J=1 TO I-1 DO
	 *		S=A(I,J)/A(J,J)
	 *		A(I,I)=A(I,I)-S*A(I,J)
	 *		A(I,J)=S
	 *	DONE
	 * DONE
	 */

	haplo_timer_start(&timer);

	for(i=1; i<mat->size; i++)
	{
		const unsigned long first_i=i+mat->diag[i-1]+1-mat->diag[i];
		double aii;
		unsigned long j;

		for(j=(first_i>0)?first_i:1; j<i; j++)
		{
			const unsigned long first_j=
				j+mat->diag[j-1]+1-mat->diag[j];

			if (first_i > first_j)
			{
				mat->coef[j+mat->diag[i]-i] -=
					solver_dot(
						mat->coef+mat->diag[i-1]+1,
						mat->coef+mat->diag[j]-j+
						first_i,
						j-first_i);
			} else {
				mat->coef[j+mat->diag[i]-i] -=
					solver_dot(
						mat->coef+mat->diag[i]-i+
						first_j,
						mat->coef+mat->diag[j-1]+1,
						j-first_j);
			}
		}

		aii = mat->coef[mat->diag[i]];
		for(j=first_i; j<i; j++)
		{
			const double aij = mat->coef[mat->diag[i]+j-i];
			const double s   = aij/mat->coef[mat->diag[j]];

			
			aii -= s*aij;
			mat->coef[j-i+mat->diag[i]] = s;
		}
		
		if (fabs(aii) < epsilon)
		{
			aii *= 1.0 / epsilon;
			haplo_warning("Singularity on dof #%lu. Blocking.",
				      i+1);
			mat->singularity = i+1;
		}

		mat->coef[mat->diag[i]] = aii;
	}

	
	mat->state=FEM_MATRIX_LDLT;
	haplo_timer_stop(&timer);
	haplo_timer_print(&timer, "Factorization");
	
	return;
}


/**
 *
 */
static void solver_ldlt_invert(fem_matrix_t *mat, fem_vector_t *vec)
{
	haplo_timer_t timer;
	unsigned long i;

	haplo_timer_start(&timer);

	for(i=1; i<mat->size; ++i)
	{	
		double s=0.0;
		const double  *aij=mat->coef+mat->diag[i-1]+1;
		unsigned long j;

		for(j=i-mat->diag[i]+mat->diag[i-1]+1; j<i; j++)
			s += (*aij++) * vec->coef[j];

		vec->coef[i] -= s;
	}
	
	for(i=0; i<mat->size; ++i)
		vec->coef[i] /= mat->coef[mat->diag[i]];
	
	for(i=mat->size-1; i>=1; --i)
	{
		const double vi=vec->coef[i];
		const double  *aij=mat->coef+mat->diag[i-1]+1;
		unsigned long j;

		for(j=i-mat->diag[i]+mat->diag[i-1]+1; j<i; j++)
			vec->coef[j] -= (*aij++) * vi;
		
	}

	haplo_timer_stop(&timer);
	haplo_timer_print(&timer, "Inversion");

	return;
}


/**
 *
 */
static void solver_ldlt(fem_matrix_t *mat, fem_vector_t *vec, double epsilon)
{
	switch(mat->state)
	{
	case FEM_MATRIX_INIT:
		haplo_fatal(
			"Call to fem_solver_solve() with non-allocated matrix"
			" (BUG?)");
		break;		

	case FEM_MATRIX_ALLOCATED:
		solver_ldlt_factorize(mat, epsilon);
		break;

	case FEM_MATRIX_LDLT:
		break;
	}
	
	solver_ldlt_invert(mat, vec);
	
	return;
}


/**
 *
 */
fem_vector_t *fem_solver_ldlt(fem_matrix_t *A, const fem_vector_t *b)
{
	fem_vector_t *x=NULL;

	if (solver_check(A, b) == 0)
	{
		unsigned long i;
	
		x=fem_vector_new(A->size);
	
		for(i=0; i<x->size; i++)
			x->coef[i]=b->coef[i];
		
		solver_ldlt(A, x, DBL_EPSILON*__fem_vector_norm(x));
	}
	
	return(x);
}


/*
 * Conjugate Gradient
 */


/**
 *
 */
static void solver_cg_descent(unsigned long N, double *x, double *r,
			      const double *p, const double *q, const double a)
{
	unsigned long	i;

	for(i=0; i<N; i++)
	{
		
		*x++ += a * (*p++);
		*r++ -= a * (*q++);
	}
	return;
}


/**
 *
 */
static double solver_cg_finish(unsigned long N, double *p, double b,
			     const double *z)
{
	double np=0.0;
	
	unsigned long i;

	for(i=0; i<N; i++)
	{
		p[i]=z[i]+b*p[i];
		np += p[i]*p[i];
	}
	

	return(np);
}


/**
 *
 *
 * A = D + L + L^t
 *
 * C = 1/(w*(2-w)) * (D+wL) * D^-1 * (D+wL)^t
 *
 * Solve C.z = r
 * 
 */
static void solver_cg_ssor(const matrix_t *mat, double *z, const double *r,
			   const double w)
{
	const unsigned long N=mat->size;
	const double d=(2-w)*w;
	unsigned long i;
	const double  *aij;

	/* Solve (D+wL) . x = r */
	aij=mat->coef;
	z[0] = r[0]/(*aij++);
	for(i=1; i<N; ++i)
	{	
		double s=0.0;
		unsigned long j;

		for(j=i-mat->diag[i]+mat->diag[i-1]+1; j<i; j++)
			s += (*aij++) * z[j];
		
		z[i] = (r[i] - (s*w)) / (*aij++);
	}
	
	/* Solve (w*(2-w)*D)^-1 . y = x */
	for(i=0; i<N; ++i)
		z[i] *= d*mat->coef[mat->diag[i]];

	/* Solve (D+wL)^t . z = y */
	aij=mat->coef+mat->diag[N-1];
	for(i=N-1; i>=1; --i)
	{
		const unsigned long end=i+1+mat->diag[i-1]-mat->diag[i];
		double zi;
		unsigned long j;
		
		z[i] /= (*aij--);
		zi = z[i] * w;
		
		j=i;
		while(j > end)
			z[--j] -= (*aij--) * zi;			
	}
	z[0] /= (*aij);

	return;
}


/**
 * Try to guess
 */
static double solver_cg_omega(const matrix_t *A)
{
	const unsigned long N=A->size;
	double omega;	
	
	omega = 2.0-2.0*(A->diag[A->size-1]+1)/((N+1)*N);
	haplo_info("Guessed SSOR parameter: w=%e", omega);

	return(omega);
}


/**
 *
 */
static double solver_cg_init(const matrix_t *A, double omega, vector_t *p,
			     vector_t *q, vector_t *r, vector_t *x,
			     vector_t *z)
{
	const unsigned long N=A->size;
	unsigned long k;
	double rz=0.0;		

	for(k=0; k<N; k++)
		q->coef[k] = 1.0;

	__fem_matrix_vector(A, q->coef, r->coef);
	
	for(k=0; k<N; k++)
	{
		r->coef[k] = x->coef[k] - r->coef[k] ;
		x->coef[k] = 1.0;
	}

	
	solver_cg_ssor(A, p->coef, r->coef, omega);

	rz=0.0;
	for(k=0; k<N; k++)
	{
		z->coef[k] = p->coef[k];
		rz += r->coef[k]*z->coef[k];
	}

	return(rz);
}


/**
 *
 *
 *   x_0
 *
 *   r_0   = b - A.x_0
 * C.p_0 = r_0
 *   z_0 = p_0
 *
 *   q_k   = A.p_k
 *   a     = (r_k | z_k) / (q_k | p_k)
 *   x_k+1 = x_k + a * p_k
 *   r_k+1 = r_k - a * q_k
 * C.z_k+1 = r_k+1
 *   b     = (r_k+1 | z_k+1) / (r_k | z_k)
 *   p_k+1 = z_k+1 + b * p_k
 */
static int solver_cg(const fem_matrix_t *A, fem_vector_t *x,
		      double epsilon, double omega)
{
	const unsigned long N=x->size;
	unsigned long k;
	double rz1;
	haplo_timer_t timer;
	fem_vector_t *r;
	fem_vector_t *p;
	fem_vector_t *q;
	fem_vector_t *z;
	const double eps=epsilon*epsilon;

	haplo_timer_start(&timer);

	p=fem_vector_new(N);
	q=fem_vector_new(N);
 	r=fem_vector_new(N);	
	z=fem_vector_new(N);

	rz1=solver_cg_init(A, omega, p, q, r, x, z);

	for(k=0; k<N; k++)
	{
		double rz=rz1;
		double a;
		
		__fem_matrix_vector(A, p->coef, q->coef);
		a = solver_dot(q->coef, p->coef, N);

		if (fabs(a) < epsilon*rz)
		{
			haplo_error("Singularity detected.");
			break;
		}
		a=rz/a;

		solver_cg_descent(N, x->coef, r->coef, p->coef, q->coef, a);

		solver_cg_ssor(A, z->coef, r->coef, omega);

		rz1=solver_dot(r->coef, z->coef, N);
		
		if (solver_cg_finish(N, p->coef, rz1/rz, z->coef) < eps)
			break;
	}

	fem_vector_free(r);
	fem_vector_free(p);
	fem_vector_free(q);
	fem_vector_free(z);

	haplo_info("Iterations: %lu/%lu", k, N);
	

	haplo_timer_stop(&timer);
	haplo_timer_print(&timer, "Computation");
	
	if (fabs(rz1) > epsilon)
	{
		haplo_error("Non-conergence!");
		return(-1);
	}

	return(0);
}


/**
 *
 */
fem_vector_t *fem_solver_cg_omega(const fem_matrix_t *A,
				  const fem_vector_t *b, const double *omega)
{
	fem_vector_t *x=NULL;

	if (solver_check(A, b) == 0)
	{
		if (A->state != FEM_MATRIX_LDLT)
		{
			unsigned long i;

			x=fem_vector_new(A->size);

			for(i=0; i<x->size; i++)
				x->coef[i]=b->coef[i];
		
			if (solver_cg(A, x, DBL_EPSILON*__fem_vector_norm(x),
				      *omega))
			{
				fem_vector_free(x);
				x=NULL;
			}
		} else {
			haplo_error("Matrix already factorized. "
				    "Consider using LDLT solver.");
		}
	}
	
	return(x);
}


/**
 *
 */
fem_vector_t *fem_solver_cg(const fem_matrix_t *A, const fem_vector_t *b)
{
	const double omega=solver_cg_omega(A);

	return(fem_solver_cg_omega(A, b, &omega));
}
