/**
* @file GPLaplaceOptimizationProblem.cpp
* @brief Hyperparameter optimization problem for Gaussian process models using the Laplace approximation
* @author Erik Rodner
* @date 12/09/2009
*/

#include <iostream>

#include "core/vector/Algorithms.h"
#include "core/algebra/CholeskyRobust.h"
#include "core/algebra/CholeskyRobustAuto.h"

#include "vislearning/math/kernels/OptimKernelParameterGradInterface.h"
#include "GPLaplaceOptimizationProblem.h"

using namespace std;
using namespace NICE;
using namespace OBJREC;


GPLaplaceOptimizationProblem::GPLaplaceOptimizationProblem (
		KernelData *_kernelData, 
		const NICE::Vector & _y, 
		ParameterizedKernel *_kernel,
		const LikelihoodFunction *_likelihoodFunction,
		LaplaceApproximation *_laplaceApproximation, 
		bool verbose ) 
	:	OptimizationProblemFirst ( _kernel->getParameterSize() ),
		kernelData(_kernelData), 
		kernel(_kernel),
		likelihoodFunction(_likelihoodFunction)
{
	this->verbose = verbose;
	this->y.push_back ( _y );
	this->kernel->getParameters( parameters() );
	this->laplaceApproximation.push_back ( _laplaceApproximation );
	if ( verbose ) {
		cerr.precision(20);
		cerr << "GPLaplaceOptimizationProblem: initial parameters = " << parameters() << endl;
	}
	bestLooParameters.resize ( parameters().size() );
	bestAvgLooError = numeric_limits<double>::max();
}
	
GPLaplaceOptimizationProblem::GPLaplaceOptimizationProblem ( KernelData *_kernelData, const VVector & _y, 
		ParameterizedKernel *_kernel, const LikelihoodFunction *_likelihoodFunction,
		const vector<LaplaceApproximation *> & _laplaceApproximation,
		bool verbose ) : OptimizationProblemFirst( _kernel->getParameterSize() ),
		kernelData(_kernelData),
		y(_y),
		kernel(_kernel),
		likelihoodFunction(_likelihoodFunction),
		laplaceApproximation(_laplaceApproximation)
{
	this->verbose = verbose;
	this->kernel->getParameters( parameters() );
	if ( verbose ) {
		cerr.precision(20);
		cerr << "GPLaplaceOptimizationProblem: initial parameters = " << parameters() << endl;
	}
	if ( y.size() != laplaceApproximation.size() )
		fthrow(Exception, "Number of label vectors and number of Laplace approximation objects do not correspond.");

	bestLooParameters.resize ( parameters().size() );
	bestAvgLooError = numeric_limits<double>::max();
}
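
/* Usage sketch (illustration only): an instance of this problem is usually
 * handed to a gradient-based optimizer that operates on OptimizationProblemFirst
 * objects. The optimizer class FirstOrderTrustRegion and its optimizeFirst()
 * call below are assumptions for this sketch and may differ in your setup;
 * kernelData, y, kernel, likelihood and laplace stand for the caller's objects:
 *
 *   GPLaplaceOptimizationProblem problem ( &kernelData, y, &kernel,
 *                                          &likelihood, &laplace, false );
 *   FirstOrderTrustRegion optimizer;      // assumed first-order optimizer
 *   optimizer.optimizeFirst ( problem );  // assumed interface; adjusts parameters()
 *
 * During optimization, update() copies the current parameters into the kernel
 * and refreshes the Cholesky factorization and the Laplace approximations.
 */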
	
GPLaplaceOptimizationProblem::~GPLaplaceOptimizationProblem()
{
}

void GPLaplaceOptimizationProblem::update ()
{
	kernel->setParameters( parameters() ); 
	kernel->updateKernelData ( kernelData );
	kernelData->updateCholeskyFactorization();
	for ( uint i = 0 ; i < y.size(); i++ )
		laplaceApproximation[i]->approximate ( kernelData, y[i], likelihoodFunction ); 
}

double GPLaplaceOptimizationProblem::computeObjective()
{
	if ( verbose ) {
		cerr << "GPLaplaceOptimizationProblem: computeObjective: " << parameters() << endl;
	}

	try {
		update();
	} catch ( const Exception & ) {
		// inversion problems
		if ( verbose ) 
			cerr << "GPLaplaceOptimizationProblem: kernel matrix might be singular" << endl;

		return numeric_limits<double>::max();
	} 
	
	double negLogLikelihood = 0.0;

	/* Objective (the negative approximate log marginal likelihood)
	 *   = - Laplace objective + 0.5 * logdet(B)
	 * where Laplace objective = - 0.5 a^T f + log p(y|f)
	 * and B = Id + W^0.5 K W^0.5
	 */
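	/* In expanded form (with a = K^{-1} f, cf. Rasmussen and Williams 2006),
	 * each task therefore contributes
	 *     0.5 * f^T K^{-1} f - log p(y|f) + sum_i log L_ii
	 * where L L^T = B, so that sum_i log L_ii = 0.5 * logdet(B); this is the
	 * quantity triangleMatrixLogDet() computes for the Cholesky factor of B.
	 */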
	for ( uint k = 0 ; k < laplaceApproximation.size() ; k++ )
		negLogLikelihood += - laplaceApproximation[k]->getObjective() + triangleMatrixLogDet ( laplaceApproximation[k]->getCholeskyB() );

	if ( verbose ) {
		cerr << "GPLaplaceOptimizationProblem: negative loglikelihood = " << loglikelihood << endl;
	}

	return negLogLikelihood;
}
	
void GPLaplaceOptimizationProblem::computeGradient( NICE::Vector& newGradient )
{
	if ( verbose ) 
		cerr << "GPLaplaceOptimizationProblem: gradient computation : " << parameters() << endl;

	newGradient.resize(parameters().size());
	newGradient.set(0.0);
	for ( uint task = 0 ; task < laplaceApproximation.size(); task++ )
	{
		// NON-OPTIMIZED: some quantities could be precomputed outside of this task loop
		// The following code is a straightforward re-implementation of the MATLAB
		// pseudo-code (Algorithm 5.1) in Rasmussen and Williams,
		// "Gaussian Processes for Machine Learning", 2006
		
		// negative hessian (diagonal matrix) of p(y|f)
		const Vector & w = laplaceApproximation[task]->getHessian();
		// L*L^T = B = I + W^0.5 K W^0.5
		const Matrix & L = laplaceApproximation[task]->getCholeskyB();
		// mode found by Laplace approximation
		const Vector & f = laplaceApproximation[task]->getMode();

		// our kernel matrix
		const Matrix & kernelMatrix = kernelData->getKernelMatrix();
		//cerr << "K size: " << kernelMatrix.rows() << " x " << kernelMatrix.cols() << endl;

		// ------------- line 7: R = W^0.5 L^T\(L \ W^0.5)
		// Matrix W is a diagonal matrix
		// R is therefore a scaled variant of the inverse of B
		// actually: R = (W^-1 + K)^-1
		// matlab: Z = repmat(sW,1,n).*solve_chol(L, diag(sW));
		//cerr << "Hessian size: " << w.size() << endl;
		Matrix R ( w.size(), w.size(), 0.0 );
		for ( uint i = 0 ; i < w.size(); i++ )
			R(i,i) = sqrt( w[i] );
		
		choleskySolveMatrixLargeScale ( L, R );
		// W^0.5 * ... corresponds to a scaling of all rows
		for ( uint i = 0 ; i < R.rows(); i++ )
			for ( uint j = 0 ; j < R.cols(); j++ )
				R(i,j) *= sqrt( w[i] );
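		/* Why the two scalings give R = (W^-1 + K)^-1: since
		 *     B = Id + W^0.5 K W^0.5 = W^0.5 (W^-1 + K) W^0.5,
		 * we have W^0.5 B^-1 W^0.5 = (W^-1 + K)^-1. The Cholesky solve above
		 * turned the diagonal matrix W^0.5 into B^-1 W^0.5, and the row
		 * scaling supplies the remaining left factor W^0.5.
		 */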

		// ------------ line 8: C = L \ (W^0.5 K)
		// matlab: C = L'\(repmat(sW,1,n).*K);
		Matrix C ( kernelMatrix );
		for ( uint i = 0 ; i < C.rows(); i++ )
			for ( uint j = 0 ; j < C.cols(); j++ )
				C(i,j) *= sqrt( w[i] );
		triangleSolveMatrix ( L, C );
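		/* With C = L \ (W^0.5 K) and L L^T = B it follows that
		 *     C^T C = K W^0.5 B^-1 W^0.5 K,
		 * so diag(K) - diag(C^T C) is the diagonal of K - K W^0.5 B^-1 W^0.5 K
		 * = (K^-1 + W)^-1 (matrix inversion lemma), i.e. the diagonal of the
		 * Laplace posterior covariance, which the s2 term below needs.
		 */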

		// ----------- line 9: s2 = - 0.5 diag(diag(K) - diag(C^T C)) nabla^3 log p(y|f)
		// note: we follow the gpml MATLAB reference code, which uses the
		// opposite sign (+0.5) compared to the book's pseudo-code
		// matlab: s2 = 0.5*(diag(K)-sum(C.^2,1)').*d3lp;
		Vector s2 (w.size());
		for ( uint i = 0 ; i < w.size() ; i++ )
		{
			double nabla3 = likelihoodFunction->thirdgrad ( y[task][i], f[i] );
			Vector cColumn = C.getColumnRef(i);
			s2[i] = 0.5 * (kernelMatrix(i,i) - cColumn.scalarProduct(cColumn)) * nabla3;
		}

		// a = K^{-1} * f (already computed by the laplace approximation)
		const Vector & a = laplaceApproximation[task]->getAVector();
		// Due to the stationarity of the mode f we have the
		// consistency equation: f = K * gradientL = K * \nabla log p(y|f)
		const Vector & gradientL = laplaceApproximation[task]->getGradient();

		// Both equations above imply a = gradientL
		/* DEBUG: if ( (f - kernelMatrix * a).normL2() > 1e-10 ) */
		/* DEBUG: if ( (f - kernelMatrix * gradientL).normL2() > 1e-10 )  */
		/* DEBUG: if ( (a - gradientL).normL2() > 1e-10 ) 
			fthrow(Exception, "Oops! The a vector seems to be wrong!"); */

		// for k = 1 .. dim(theta) 
		for ( uint k = 0 ; k < parameters().size() ; k++ )
		{
			// C is reused here to store the kernel Jacobian dK/dtheta_k.
			// The Jacobian does not depend on the task, so it could be
			// precomputed outside of the task loop; it is kept here for simplicity.
			//
			// ------------ line 12
			// matlab: dK = feval(covfunc{:}, hyper, x, j);
			kernel->getKernelJacobi ( k, parameters(), kernelData, C );

			// s1 = alpha'*dK*alpha/2-sum(sum(Z.*dK))/2;
			double s1 = C.bilinear ( a );
			// subtract tr(R*C), which equals the sum of element-wise products
			// since both R and C are symmetric
			for ( uint i = 0 ; i < R.rows(); i++ )
				for ( uint j = 0 ; j < R.cols(); j++ )
					s1 -= R(i,j) * C(i,j);
			s1 *= 0.5;

			// ------------ line 13
			// b  = dK*dlp;
			Vector b = C * gradientL;

			// ------------ line 14
			// matlab: s3 = b-K*(Z*b);
			Vector s3 = b - kernelMatrix*(R*b);
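			// Using (Id + K*W)^-1 = Id - K*(W^-1 + K)^-1 = Id - K*R, this is
			//     s3 = (Id + K*W)^-1 * dK/dtheta_k * \nabla log p(y|f),
			// i.e. the implicit derivative of the mode f with respect to
			// theta_k; s2^T s3 below contributes the corresponding implicit
			// part of the gradient (cf. Rasmussen and Williams 2006).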

			// In contrast to Rasmussen we have to flip the sign
			// because we are minimizing the negative log marginal instead
			// of maximizing the log marginal
			// matlab: dnlZ(j) = -s1-s2'*s3;
			newGradient[k] += - s1 - s2.scalarProduct(s3);
		}
	}

	if ( verbose )
		cerr << "GPLaplaceOptimizationProblem: gradient = " << newGradient << endl;
}
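
/* Sketch of a finite-difference check for the gradient above (illustration
 * only, not used anywhere in this file; it relies solely on members that
 * already exist here, i.e. parameters(), computeObjective(), computeGradient()):
 *
 *   NICE::Vector theta = parameters();
 *   NICE::Vector g;
 *   computeGradient ( g );
 *   const double h = 1e-5;
 *   for ( uint k = 0 ; k < theta.size() ; k++ )
 *   {
 *     parameters()[k] = theta[k] + h;
 *     double fp = computeObjective();
 *     parameters()[k] = theta[k] - h;
 *     double fm = computeObjective();
 *     parameters()[k] = theta[k];
 *     // (fp - fm) / (2.0 * h) should be close to g[k]
 *   }
 */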
		
void GPLaplaceOptimizationProblem::useLooParameters ( ) 
{ 
	parameters() = bestLooParameters;
	update();
}