// This code is in the public domain -- castanyo@yahoo.es

#include <nvmath/Solver.h>

using namespace nv;

namespace
{
	class Preconditioner
	{
	public:
		// Virtual dtor.
		virtual ~Preconditioner() { }
	 
		// Apply preconditioning step.
		virtual void apply(const FullVector & x, FullVector & y) const = 0;
	};


	// Jacobi preconditioner.
	class JacobiPreconditioner : public Preconditioner
	{
	public:

		JacobiPreconditioner(const SparseMatrix & M, bool symmetric) : m_inverseDiagonal(M.width())
		{
			nvCheck(M.isSquare());
			
			for(uint x = 0; x < M.width(); x++)
			{
				float elem = M.getCoefficient(x, x);
				nvDebugCheck( elem != 0.0f );

				if (symmetric) 
				{
					m_inverseDiagonal[x] = 1.0f / sqrt(fabs(elem));
				}
				else 
				{
					m_inverseDiagonal[x] = 1.0f / elem;
				}
			}
		}

		void apply(const FullVector & x, FullVector & y) const
		{
			nvDebugCheck(x.dimension() == m_inverseDiagonal.dimension());
			nvDebugCheck(y.dimension() == m_inverseDiagonal.dimension());

			// @@ Wrap vector component-wise product into a separate function.
			const uint D = x.dimension();
			for (uint i = 0; i < D; i++)
			{
				y[i] = m_inverseDiagonal[i] * x[i];
			}
		}

	private:

		FullVector m_inverseDiagonal;

	};

} // namespace


static int ConjugateGradientSolver(const SparseMatrix & A, const FullVector & b, FullVector & x, float epsilon);
static int ConjugateGradientSolver(const Preconditioner & preconditioner, const SparseMatrix & A, const FullVector & b, FullVector & x, float epsilon);


// Solve the symmetric system: At·A·x = At·b
void nv::LeastSquaresSolver(const SparseMatrix & A, const FullVector & b, FullVector & x, float epsilon/*1e-5f*/)
{
	nvDebugCheck(A.width() == x.dimension());
	nvDebugCheck(A.height() == b.dimension());
	nvDebugCheck(A.height() >= A.width()); // @@ If height == width we could solve it directly...

	const uint D = A.width();

	SparseMatrix At(A.height(), A.width());
	transpose(A, At);

	FullVector Atb(D);
	//mult(Transposed, A, b, Atb);
	mult(At, b, Atb);

	SparseMatrix AtA(D);
	//mult(Transposed, A, NoTransposed, A, AtA);
	mult(At, A, AtA);

	SymmetricSolver(AtA, Atb, x, epsilon);
}


// See section 10.4.3 in: Mesh Parameterization: Theory and Practice, Siggraph Course Notes, August 2007
void nv::LeastSquaresSolver(const SparseMatrix & A, const FullVector & b, FullVector & x, const uint * lockedParameters, uint lockedCount, float epsilon/*= 1e-5f*/)
{
	nvDebugCheck(A.width() == x.dimension());
	nvDebugCheck(A.height() == b.dimension());
	nvDebugCheck(A.height() >= A.width() - lockedCount);

	// @@ This is not the most efficient way of building a system with reduced degrees of freedom. It would be faster to do it on the fly.

	const uint D = A.width() - lockedCount;
	nvDebugCheck(D > 0);

	// Compute: b - Al * xl
	FullVector b_Alxl(b);

	for (uint y = 0; y < A.height(); y++)
	{
		const uint count = A.getRow(y).count();
		for (uint e = 0; e < count; e++)
		{
			uint column = A.getRow(y)[e].x;

			bool isFree = true;
			for (uint i = 0; i < lockedCount; i++) 
			{
				isFree &= (lockedParameters[i] != column);
			}

			if (!isFree)
			{
				b_Alxl[y] -= x[column] * A.getRow(y)[e].v;
			}
		}
	}

	// Remove locked columns from A.
	SparseMatrix Af(D, A.height());

	for (uint y = 0; y < A.height(); y++)
	{
		const uint count = A.getRow(y).count();
		for (uint e = 0; e < count; e++)
		{
			uint column = A.getRow(y)[e].x;
			uint ix = column;

			bool isFree = true;
			for (uint i = 0; i < lockedCount; i++) 
			{
				isFree &= (lockedParameters[i] != column);
				if (column > lockedParameters[i]) ix--; // shift columns
			}

			if (isFree)
			{
				Af.setCoefficient(ix, y, A.getRow(y)[e].v);
			}
		}
	}
	
	// Remove elements from x
	FullVector xf(D);

	for (uint i = 0, j = 0; i < A.width(); i++)
	{
		bool isFree = true;
		for (uint l = 0; l < lockedCount; l++) 
		{
			isFree &= (lockedParameters[l] != i);
		}

		if (isFree)
		{
			xf[j++] = x[i];
		}
	}

	// Solve reduced system.
	LeastSquaresSolver(Af, b_Alxl, xf, epsilon);

	// Copy results back to x.
	for (uint i = 0, j = 0; i < A.width(); i++)
	{
		bool isFree = true;
		for (uint l = 0; l < lockedCount; l++) 
		{
			isFree &= (lockedParameters[l] != i);
		}

		if (isFree)
		{
			x[i] = xf[j++];
		}
	}
}


void nv::SymmetricSolver(const SparseMatrix & A, const FullVector & b, FullVector & x, float epsilon/*1e-5f*/)
{
	nvDebugCheck(A.height() == A.width());
	nvDebugCheck(A.height() == b.dimension());
	nvDebugCheck(b.dimension() == x.dimension());

	JacobiPreconditioner jacobi(A, true);
	ConjugateGradientSolver(jacobi, A, b, x, epsilon);

//	ConjugateGradientSolver(A, b, x, epsilon);
}


/**
 * Compute the solution of the sparse linear system Ab=x using the Conjugate
 * Gradient method.
 *
 * Solving sparse linear systems:
 * (1)		A·x = b
 * 
 * The conjugate gradient algorithm solves (1) only in the case that A is 
 * symmetric and positive definite. It is based on the idea of minimizing the 
 * function
 * 
 * (2)		f(x) = 1/2·x·A·x - b·x
 * 
 * This function is minimized when its gradient
 * 
 * (3)		df = A·x - b
 * 
 * is zero, which is equivalent to (1). The minimization is carried out by 
 * generating a succession of search directions p.k and improved minimizers x.k. 
 * At each stage a quantity alfa.k is found that minimizes f(x.k + alfa.k·p.k), 
 * and x.k+1 is set equal to the new point x.k + alfa.k·p.k. The p.k and x.k are 
 * built up in such a way that x.k+1 is also the minimizer of f over the whole
 * vector space of directions already taken, {p.1, p.2, . . . , p.k}. After N 
 * iterations you arrive at the minimizer over the entire vector space, i.e., the 
 * solution to (1).
 *
 * For a really good explanation of the method see:
 *
 * "An Introduction to the Conjugate Gradient Method Without the Agonizing Pain",
 * Jonhathan Richard Shewchuk.
 *
**/
/*static*/ int ConjugateGradientSolver(const SparseMatrix & A, const FullVector & b, FullVector & x, float epsilon)
{
	nvDebugCheck( A.isSquare() );
	nvDebugCheck( A.width() == b.dimension() );
	nvDebugCheck( A.width() == x.dimension() );

	int i = 0;
	const int D = A.width();
	const int i_max = 4 * D;   // Convergence should be linear, but in some cases, it's not.

	FullVector r(D);   // residual
	FullVector p(D);   // search direction
	FullVector q(D);   // 
	float delta_0;
	float delta_old;
	float delta_new;
	float alpha;
	float beta;

	// r = b - A·x;
	copy(b, r);
	sgemv(-1, A, x, 1, r);

	// p = r;
	copy(r, p);
	
	delta_new = dot( r, r );
	delta_0 = delta_new;

	while (i < i_max && delta_new > epsilon*epsilon*delta_0)
	{
		i++;

		// q = A·p
		mult(A, p, q);

		// alpha = delta_new / p·q
		alpha = delta_new / dot( p, q );

		// x = alfa·p + x
		saxpy(alpha, p, x);

		if ((i & 31) == 0) // recompute r after 32 steps
		{
			// r = b - A·x
			copy(b, r);
			sgemv(-1, A, x, 1, r);
		}
		else
		{
			// r = r - alpha·q
			saxpy(-alpha, q, r);
		}

		delta_old = delta_new;
		delta_new = dot( r, r );

		beta = delta_new / delta_old;

		// p = beta·p + r
		scal(beta, p);
		saxpy(1, r, p);
	}

	return i;
}


// Conjugate gradient with preconditioner.
/*static*/ int ConjugateGradientSolver(const Preconditioner & preconditioner, const SparseMatrix & A, const FullVector & b, FullVector & x, float epsilon)
{
	nvDebugCheck( A.isSquare() );
	nvDebugCheck( A.width() == b.dimension() );
	nvDebugCheck( A.width() == x.dimension() );

	int i = 0;
	const int D = A.width();
	const int i_max = 4 * D;   // Convergence should be linear, but in some cases, it's not.

	FullVector r(D);    // residual
	FullVector p(D);    // search direction
	FullVector q(D);    // 
	FullVector s(D);    // preconditioned
	float delta_0;
	float delta_old;
	float delta_new;
	float alpha;
	float beta;

	// r = b - A·x
	copy(b, r);
	sgemv(-1, A, x, 1, r);


	// p = M^-1 · r
	preconditioner.apply(r, p);
	//copy(r, p);


	delta_new = dot(r, p);
	delta_0 = delta_new;

	while (i < i_max && delta_new > epsilon*epsilon*delta_0)
	{
		i++;

		// q = A·p
		mult(A, p, q);

		// alpha = delta_new / p·q
		alpha = delta_new / dot(p, q);

		// x = alfa·p + x
		saxpy(alpha, p, x);

		if ((i & 31) == 0)  // recompute r after 32 steps
		{			
			// r = b - A·x
			copy(b, r);
			sgemv(-1, A, x, 1, r);
		}
		else
		{
			// r = r - alfa·q
			saxpy(-alpha, q, r);
		}

		// s = M^-1 · r
		preconditioner.apply(r, s);
		//copy(r, s);

		delta_old = delta_new;
		delta_new = dot( r, s );

		beta = delta_new / delta_old;

		// p = s + beta·p
		scal(beta, p);
		saxpy(1, s, p);
	}

	return i;
}


#if 0 // Nonsymmetric solvers

/** Bi-conjugate gradient method.  */
MATHLIB_API int BiConjugateGradientSolve( const SparseMatrix &A, const DenseVector &b, DenseVector &x, float epsilon ) {
	piDebugCheck( A.IsSquare() );
	piDebugCheck( A.Width() == b.Dim() );
	piDebugCheck( A.Width() == x.Dim() );

	int i = 0;
	const int D = A.Width();
	const int i_max = 4 * D;

	float resid;
	float rho_1 = 0;
	float rho_2 = 0;
	float alpha;
	float beta;

	DenseVector r(D);
	DenseVector rtilde(D);
	DenseVector p(D);
	DenseVector ptilde(D);
	DenseVector q(D);
	DenseVector qtilde(D);
	DenseVector tmp(D);	// temporal vector.

	// r = b - A·x;
	A.Product( x, tmp );
	r.Sub( b, tmp );

	// rtilde = r
	rtilde.Set( r );

	// p = r;
	p.Set( r );

	// ptilde = rtilde
	ptilde.Set( rtilde );


	float normb = b.Norm();
  	if( normb == 0.0 ) normb = 1;

	// test convergence
	resid = r.Norm() / normb;
	if( resid < epsilon ) {
		// method converges?
		return 0;
	}


	while( i < i_max ) {

		i++;
		
		rho_1 = DenseVectorDotProduct( r, rtilde );

		if( rho_1 == 0 ) {
			// method fails.
			return -i;
		}

		if (i == 1) {
			p.Set( r );
			ptilde.Set( rtilde );
		} 
		else {
			beta = rho_1 / rho_2;
			
			// p = r + beta * p;
			p.Mad( r, p, beta );
			
			// ptilde = ztilde + beta * ptilde;
			ptilde.Mad( rtilde, ptilde, beta );
		}

		// q = A * p;
		A.Product( p, q );

		// qtilde = A^t * ptilde;
		A.TransProduct( ptilde, qtilde );

		alpha = rho_1 / DenseVectorDotProduct( ptilde, q );

		// x += alpha * p;
		x.Mad( x, p, alpha );

		// r -= alpha * q;
		r.Mad( r, q, -alpha );
		
		// rtilde -= alpha * qtilde;
		rtilde.Mad( rtilde, qtilde, -alpha );

		rho_2 = rho_1;

		// test convergence
		resid = r.Norm() / normb;
		if( resid < epsilon ) {
			// method converges
			return i;
		}
	}

	return i;
}


/** Bi-conjugate gradient stabilized method. */
int BiCGSTABSolve( const SparseMatrix &A, const DenseVector &b, DenseVector &x, float epsilon ) {
	piDebugCheck( A.IsSquare() );
	piDebugCheck( A.Width() == b.Dim() );
	piDebugCheck( A.Width() == x.Dim() );

	int i = 0;
	const int D = A.Width();
	const int i_max = 2 * D;


	float resid;
	float rho_1 = 0;
	float rho_2 = 0;
	float alpha = 0;
	float beta = 0;
	float omega = 0;

	DenseVector p(D);
	DenseVector phat(D);
	DenseVector s(D);
	DenseVector shat(D);
	DenseVector t(D);
	DenseVector v(D);

	DenseVector r(D);
	DenseVector rtilde(D);

	DenseVector tmp(D);

	// r = b - A·x;
	A.Product( x, tmp );
	r.Sub( b, tmp );

	// rtilde = r
	rtilde.Set( r );


	float normb = b.Norm();
	if( normb == 0.0 ) normb = 1;
  
	// test convergence
	resid = r.Norm() / normb;
	if( resid < epsilon ) {
		// method converges?
		return 0;
	}


	while( i<i_max ) {

		i++;
	
		rho_1 = DenseVectorDotProduct( rtilde, r );
		if( rho_1 == 0 ) {
			// method fails
			return -i;
		}


		if( i == 1 ) {
			p.Set( r );
		}
		else {
			beta = (rho_1 / rho_2) * (alpha / omega);

			// p = r + beta * (p - omega * v);
			p.Mad( p, v, -omega );
			p.Mad( r, p, beta );
		}

		//phat = M.solve(p);
		phat.Set( p );
		//Precond( &phat, p );

		//v = A * phat;
		A.Product( phat, v );

		alpha = rho_1 / DenseVectorDotProduct( rtilde, v );

		// s = r - alpha * v;
		s.Mad( r, v, -alpha );


		resid = s.Norm() / normb;
		if( resid < epsilon ) {
			// x += alpha * phat;
			x.Mad( x, phat, alpha );
			return i;
		}

		//shat = M.solve(s);
		shat.Set( s );
		//Precond( &shat, s );

		//t = A * shat;
		A.Product( shat, t );

		omega = DenseVectorDotProduct( t, s ) / DenseVectorDotProduct( t, t );

		// x += alpha * phat + omega * shat;
		x.Mad( x, shat, omega );
		x.Mad( x, phat, alpha );

		//r = s - omega * t;
		r.Mad( s, t, -omega );

		rho_2 = rho_1;
		
		resid = r.Norm() / normb;
		if( resid < epsilon ) {
			return i;
		}

		if( omega == 0 ) {
			return -i;	// ???
		}
	}

	return i;
}


/** Bi-conjugate gradient stabilized method. */
int BiCGSTABPrecondSolve( const SparseMatrix &A, const DenseVector &b, DenseVector &x, const IPreconditioner &M, float epsilon ) {
	piDebugCheck( A.IsSquare() );
	piDebugCheck( A.Width() == b.Dim() );
	piDebugCheck( A.Width() == x.Dim() );

	int i = 0;
	const int D = A.Width();
	const int i_max = D;
//	const int i_max = 1000;


	float resid;
	float rho_1 = 0;
	float rho_2 = 0;
	float alpha = 0;
	float beta = 0;
	float omega = 0;

	DenseVector p(D);
	DenseVector phat(D);
	DenseVector s(D);
	DenseVector shat(D);
	DenseVector t(D);
	DenseVector v(D);

	DenseVector r(D);
	DenseVector rtilde(D);

	DenseVector tmp(D);

	// r = b - A·x;
	A.Product( x, tmp );
	r.Sub( b, tmp );

	// rtilde = r
	rtilde.Set( r );


	float normb = b.Norm();
	if( normb == 0.0 ) normb = 1;
  
	// test convergence
	resid = r.Norm() / normb;
	if( resid < epsilon ) {
		// method converges?
		return 0;
	}


	while( i<i_max ) {

		i++;
	
		rho_1 = DenseVectorDotProduct( rtilde, r );
		if( rho_1 == 0 ) {
			// method fails
			return -i;
		}


		if( i == 1 ) {
			p.Set( r );
		}
		else {
			beta = (rho_1 / rho_2) * (alpha / omega);

			// p = r + beta * (p - omega * v);
			p.Mad( p, v, -omega );
			p.Mad( r, p, beta );
		}

		//phat = M.solve(p);
		//phat.Set( p );
		M.Precond( &phat, p );

		//v = A * phat;
		A.Product( phat, v );

		alpha = rho_1 / DenseVectorDotProduct( rtilde, v );

		// s = r - alpha * v;
		s.Mad( r, v, -alpha );


		resid = s.Norm() / normb;

		//printf( "--- Iteration %d: residual = %f\n", i, resid );

		if( resid < epsilon ) {
			// x += alpha * phat;
			x.Mad( x, phat, alpha );
			return i;
		}

		//shat = M.solve(s);
		//shat.Set( s );
		M.Precond( &shat, s );

		//t = A * shat;
		A.Product( shat, t );

		omega = DenseVectorDotProduct( t, s ) / DenseVectorDotProduct( t, t );

		// x += alpha * phat + omega * shat;
		x.Mad( x, shat, omega );
		x.Mad( x, phat, alpha );

		//r = s - omega * t;
		r.Mad( s, t, -omega );

		rho_2 = rho_1;
		
		resid = r.Norm() / normb;
		if( resid < epsilon ) {
			return i;
		}

		if( omega == 0 ) {
			return -i;	// ???
		}
	}

	return i;
}

#endif