/** 
* @file LaplaceApproximation.cpp
* @brief utility functions for the Laplace approximation
* @author Erik Rodner
* @date 02/17/2010
*/
#include <iostream>
#include <cmath>

#include "core/vector/Algorithms.h"
#include "LaplaceApproximation.h"
#include "LHCumulativeGauss.h"

using namespace NICE;
using namespace OBJREC;


LaplaceApproximation::LaplaceApproximation()
{
	maxiterations = 40;
	minimumDelta = 1e-14;
	noiseTerm = 0.0;
	verbose = false;
}

LaplaceApproximation::LaplaceApproximation( const Config *conf, const std::string & section )
{
	maxiterations = conf->gI(section, "laplace_max_iterations", 40 );
	minimumDelta = conf->gD(section, "laplace_minimum_delta", 1e-14 );

	// in my experiments (scene classification) changing this
	// additional noise parameter to a non-zero value is
	// not a good idea at all
	noiseTerm = conf->gD(section, "laplace_noise_term", 0.0);
	verbose = conf->gB(section, "laplace_verbose", false );
}

LaplaceApproximation::~LaplaceApproximation()
{
}

void LaplaceApproximation::updateCache ( const Matrix & kernelMatrix, const Vector & y, const LikelihoodFunction *likelihoodFunction )
{
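	// This mirrors the bookkeeping of the Newton step in Algorithm 3.1 of
	// Rasmussen & Williams, "Gaussian Processes for Machine Learning":
	// W, B = I + W^{1/2} K W^{1/2} and its Cholesky factor, the gradient
	// of the log likelihood, and the auxiliary vector a are recomputed
	// for the current mode estimate.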
	// compute hessianW = - nabla nabla log p(y|f)
	for (  uint i = 0 ; i < mode.size(); i++ )
		hessianW[i] = - likelihoodFunction->hessian ( y[i], mode[i] );
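	// note: for log-concave likelihoods the Hessian of log p(y|f) is
	// non-positive, so W is non-negative and the square roots below
	// are well-defined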

	// B = I + hessianW^{1/2} K hessianW^{1/2}
	Matrix B ( kernelMatrix );

	// this should not be necessary according to the bible
	// (Rasmussen & Williams, "Gaussian Processes for Machine Learning")
	B.addIdentity(noiseTerm);

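	// elementwise scaling: B(i,j) = sqrt(W_i) * K(i,j) * sqrt(W_j);
	// on the diagonal sqrt(W_i)^2 = W_i can be used directly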
	for ( uint i = 0 ; i < B.rows() ; i++ )
		for ( uint j = 0 ; j < B.cols(); j++ )
			if ( i == j )
				B(i,i) *= hessianW[i];
			else
				B(i,j) *= sqrt(hessianW[i] * hessianW[j]);
	B.addIdentity(1.0);

	// calculate the Cholesky decomposition of B
	choleskyDecompLargeScale ( B, cholB );
	
	// calculate gradientL = nabla log p(y|f)
	for ( uint i = 0 ; i < mode.size(); i++ )
		gradientL[i] = likelihoodFunction->gradient( y[i], mode[i] );

	// calculate b = Wf + gradientL
	Vector b = gradientL;
	for ( uint i = 0 ; i < b.size(); i++ )
		b[i] = hessianW[i] * mode[i] + b[i];

	// calculate bb = W^{1/2} K b
	Vector bb = kernelMatrix * b;
	for ( uint i = 0 ; i < bb.size(); i++ )
		bb[i] = sqrt( hessianW[i] ) * bb[i];

	// calculate a = b - W^{1/2} B^{-1} bb
	choleskySolveLargeScale ( cholB, bb, a );
	for ( uint i = 0 ; i < a.size(); i++ )
		a[i] = b[i] - sqrt( hessianW[i] ) * a[i];
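	// by the matrix inversion lemma, a = (I + W K)^{-1} (W f + gradientL),
	// so the Newton update is simply f_new = K a; at the mode, a = K^{-1} f
	// holds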

}

void LaplaceApproximation::approximate ( KernelData *kernelData, const Vector & y,
					   const LikelihoodFunction *likelihoodFunction )
{
	mode.resize ( y.size() );
	mode.set(0.0);

	// FIXME: we always start with a zero vector, but we could
	// also warm-start with the mode of a previous approximation
	// and compare it with the zero initialization
	objective = 0.0;
	double oldobjective = objective;
	uint iteration = 0;

	hessianW.resize(y.size());
	gradientL.resize(y.size());

	const Matrix & kernelMatrix = kernelData->getKernelMatrix();
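	// Newton iteration for maximizing the unnormalized log posterior
	// Psi(f) = log p(y|f) - 0.5 * f^T K^{-1} f
	// (mode finding as in Algorithm 3.1 of Rasmussen & Williams)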
	while (iteration < maxiterations)
	{
		oldobjective = objective;

		// compute the gradient, the Hessian, etc.
		updateCache( kernelMatrix, y, likelihoodFunction );

		// gradient of the objective function is gradientL - a
		if ( verbose )
			std::cerr << "findModeLaplace: gradient norm = " << (gradientL - a).normL2() << std::endl;

		mode = kernelMatrix * a;
	
		// compute loglikelihood
		double loglikelihood = 0.0;
		for ( uint i = 0 ; i < mode.size(); i++ )
			loglikelihood += likelihoodFunction->logLike ( y[i], mode[i] );

		if ( NICE::isNaN(loglikelihood) )
			fthrow(Exception, "Log-likelihood p(y|f) could not be calculated numerically... check your parameters!");

		// note: std::isinf returns a bool in C++11, so we test the sign
		// separately instead of comparing the result against +1/-1
		if ( std::isinf(loglikelihood) && loglikelihood > 0 )
		{
			if ( verbose ) 
			{
				std::cerr << "findModeLaplace: log likelihood is positive infinity... we cannot optimize any further in this case." << std::endl;
				std::cerr << "findModeLaplace: mode = " << mode << std::endl;
			}
			break;
		}
		
		if ( std::isinf(loglikelihood) && loglikelihood < 0 )
		{
			fthrow(Exception, "Log likelihood is negative infinity... this seems to be a highly improbable setting.");
		}

		// compute objective
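		// (a approximates K^{-1} * mode, so mode.scalarProduct(a) yields
		// the quadratic term f^T K^{-1} f without an explicit inverse)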
		objective = - 0.5 * mode.scalarProduct(a);
		if ( NICE::isNaN(objective) )
			fthrow(Exception, "Objective function of the Laplace approximation could not be calculated numerically... check your parameters!");

		objective += loglikelihood;

		if ( verbose ) 
			std::cerr << "findModeLaplace: objective = " << objective << std::endl;

		double delta = fabs(oldobjective-objective)/(fabs(objective)+1);
		if ( verbose )
			std::cerr << "findModeLaplace: delta = " << delta << std::endl;
		if ( delta < minimumDelta ) {
			break;
		}
		iteration++;
	}

	// re-evaluate the cached quantities at the final mode
	updateCache( kernelMatrix, y, likelihoodFunction );
}

double LaplaceApproximation::predict ( const Vector & kernelVector, double kernelSelf, const Vector & y, 
				const LikelihoodFunction *likelihoodFunction ) const
{
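	// predictive mean: mu_* = k_*^T nabla log p(y|\hat{f})
	// (cf. Algorithm 3.2 of Rasmussen & Williams)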
	double fmean = kernelVector.scalarProduct ( gradientL );

	Vector wKernelVector ( kernelVector );
	for ( uint i = 0 ; i < wKernelVector.size(); i++ )
		wKernelVector[i] *= sqrt(hessianW[i]);
	Vector v;
	triangleSolve ( cholB, wKernelVector, v );

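	// predictive variance: sigma_*^2 = k(x_*, x_*) - k_*^T (K + W^{-1})^{-1} k_*,
	// where the subtracted term equals v^T v with v = L \ (W^{1/2} k_*)
	// (cf. Algorithm 3.2 of Rasmussen & Williams)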
	double fvariance = kernelSelf - v.scalarProduct(v);

	if ( fvariance <= 0.0 )
		fthrow(Exception, "There seems to be something odd here: the predictive variance is non-positive! " <<
			"Did you use a normalized kernel?");

	double prob;

	const LHCumulativeGauss *likecumgauss = dynamic_cast<const LHCumulativeGauss *> ( likelihoodFunction );
	if ( likecumgauss ) {
		prob = likecumgauss->predictAnalytically ( fmean, fvariance ); 
	} else {
		fthrow(Exception, "Prediction for likelihood functions which are not cumulative gaussians is not yet implemented!");
	}

	return prob;
}
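
/* Example usage (a minimal sketch, not part of the library): it assumes
 * that `kernelData` already holds the kernel matrix of the training set,
 * that `y` contains the labels in the encoding expected by the likelihood,
 * and that LHCumulativeGauss is default-constructible.
 *
 *   LHCumulativeGauss likelihood;
 *   LaplaceApproximation laplace;
 *
 *   // find the posterior mode on the training set
 *   laplace.approximate ( kernelData, y, &likelihood );
 *
 *   // kernelVector holds the kernel values between a test point and all
 *   // training examples, kernelSelf = k(x_*, x_*)
 *   double prob = laplace.predict ( kernelVector, kernelSelf, y, &likelihood );
 */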