/**
* @file GPMSCLooEstimates.cpp
* @brief Model selection criteria for Gaussian processes
* @author Erik Rodner
* @date 03/07/2010
*/
#include <iostream>

#include "GPMSCLooEstimates.h"
#include "vislearning/cbaselib/ClassificationResults.h"
#include "vislearning/classifier/kernelclassifier/LikelihoodFunction.h"
#include "core/algebra/CholeskyRobustAuto.h"
#include "core/optimization/gradientBased/FirstOrderTrustRegion.h"

#include "core/vector/Algorithms.h"

using namespace std;
using namespace NICE;
using namespace OBJREC;
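
/* Minimal usage sketch (hypothetical; the exact KernelData construction
 * depends on the calling code):
 *
 *   NICE::Config conf;
 *   NICE::Matrix kernelMatrix = ...;  // kernel evaluated on the training set
 *   KernelData kernelData ( &conf, kernelMatrix );
 *   NICE::Vector binaryLabels = ...;  // entries in {-1, +1}
 *
 *   GPMSCLooLikelihoodRegression criterion ( true );
 *   double objective = criterion.computeObjective ( &kernelData, binaryLabels );
 */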


GPMSCLooLikelihoodRegression::GPMSCLooLikelihoodRegression ( bool _weightedLooProb )
	: weightedLooProb( _weightedLooProb )
{
}

GPMSCLooLikelihoodRegression::GPMSCLooLikelihoodRegression ( bool _weightedLooProb, const NICE::Vector & _selection ) :
	weightedLooProb( _weightedLooProb ), selection( _selection )
{
}

double GPMSCLooLikelihoodRegression::computeObjective ( KernelData *kernelData,
	const NICE::Vector & labels ) const
{
	kernelData->updateCholeskyFactorization();
	kernelData->updateInverseKernelMatrix();
	const Matrix & invKernelMatrix = kernelData->getInverseKernelMatrix();
	Vector invKernelMatrixDiag ( invKernelMatrix.rows() );
	for ( uint l = 0 ; l < invKernelMatrixDiag.size(); l++ )
		invKernelMatrixDiag[l] = invKernelMatrix(l,l);

	Vector alpha;
	kernelData->computeInverseKernelMultiply ( labels, alpha );

	return computeObjective ( invKernelMatrixDiag, alpha, labels );
}


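/* The closed-form LOO estimates used throughout this file follow
 * Rasmussen & Williams, "Gaussian Processes for Machine Learning",
 * Section 5.4.2, eqs. (5.10)-(5.12). With alpha = K^{-1} y:
 *
 *   mu_i      = y_i - alpha_i / (K^{-1})_{ii}   (LOO predictive mean)
 *   sigma^2_i = 1 / (K^{-1})_{ii}               (LOO predictive variance)
 *   log p(y_i | X, y_{-i}) = -0.5 log sigma^2_i
 *                            - (y_i - mu_i)^2 / (2 sigma^2_i) - 0.5 log(2 pi)
 */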
double GPMSCLooLikelihoodRegression::computeObjective ( const NICE::Vector & invKernelMatrixDiag, const NICE::Vector & alpha, 
			const NICE::Vector & labels ) const
{
	uint count_positives = 0;
	uint count_negatives = 0;
	for ( uint l = 0 ; l < labels.size(); l++ )
		if ( (selection.size() == 0) || (selection[l]) )
		{
			count_positives += (labels[l] > 0);
			count_negatives += (labels[l] <= 0);
		}


	double looPredictiveLogProb = 0.0;
	for ( uint l = 0 ; l < labels.size(); l++ )
	{
		if ( (selection.size() > 0) && (!selection[l]) )
		{
			continue;
		}

		double sigma2 = 1.0 / invKernelMatrixDiag[l]; 
		double loo_est = labels[l] - alpha[l] * sigma2;

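		// log LOO predictive probability of example l; the constant term
		// -0.5*log(2*pi) is dropped, since it does not affect model comparison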
		double singleLooProb = -0.5 * log( sigma2 ) - pow(labels[l] - loo_est, 2)/(2*sigma2);

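		// with weightedLooProb enabled, the factors 0.5*weight sum to one over
		// all selected examples, so both classes contribute equally to the sum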
		double weight = 1.0;
		if ( weightedLooProb ) 
			weight = labels[l] < 0 ? 1.0 / (double) (count_negatives) 
				: 1.0 / (double) (count_positives);

		looPredictiveLogProb += 0.5*weight*singleLooProb;
	}

	return looPredictiveLogProb;
}

/************************* GPMSCLooLikelihoodClassification ***********/

GPMSCLooLikelihoodClassification::GPMSCLooLikelihoodClassification ()
{
}

GPMSCLooLikelihoodClassification::GPMSCLooLikelihoodClassification ( const NICE::Vector & _selection ) :
	GPMSCLooLikelihoodRegression ( false, _selection )
{
}

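/* Probabilistic least-squares classification in the spirit of Rasmussen &
 * Williams, Section 6.5: the LOO predictive mean and variance of the
 * regression model are squashed through a cumulative Gaussian,
 *
 *   p(y_i | X, y_{-i}) = Phi( y_i (alpha * mu_i + beta) / sqrt(1 + alpha^2 sigma^2_i) ),
 *
 * where the scale alpha and the bias beta are fitted by maximizing the
 * LOO log predictive probability (see the optimization problem below).
 */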
double GPMSCLooLikelihoodClassification::computeObjective ( const NICE::Vector & invKernelMatrixDiag, const NICE::Vector & alpha, 
			const NICE::Vector & labels ) const
{
	Vector looMean (labels.size(), 0.0);
	Vector looVariance (labels.size(), 0.0);
	for ( uint l = 0 ; l < labels.size(); l++ )
	{
		if ( (selection.size() > 0) && (!selection[l]) )
			continue;

		looVariance[l] = ( 1.0 / invKernelMatrixDiag[l] ); 
		looMean[l] = (labels[l] - alpha[l] / invKernelMatrixDiag[l]);
	}

	GPMSCLooLikelihoodClassificationOptProb opt ( labels, selection, looMean, looVariance, false );

	FirstOrderTrustRegion optimizer ( 0.1 /* standard value for a typical gradient */, false );
	optimizer.setEpsilonG ( 0.01 );
	optimizer.setMaxIterations ( 500 );
	optimizer.optimizeFirst ( opt );

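	// the optimizer minimized the negative LOO log likelihood,
	// therefore return its negation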
	return - opt.computeObjective();
}

/********************* GPMSCLooLikelihoodClassificationOptProb *****************/

GPMSCLooLikelihoodClassificationOptProb::GPMSCLooLikelihoodClassificationOptProb ( const NICE::Vector & _labels,
			const NICE::Vector & _selection, const NICE::Vector & _looMu, const NICE::Vector & _looVariance, bool _verbose ) 
	: OptimizationProblemFirst(2), labels(_labels), selection(_selection), looMu(_looMu), looVariance(_looVariance),
	  verbose(_verbose)
{
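	// parameters()[0] is the scale alpha, parameters()[1] the bias beta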
	parameters().resize(2);
	parameters()[0] = 1.0;
	parameters()[1] = 0.0;
}

double GPMSCLooLikelihoodClassificationOptProb::computeObjective ()
{
	double alpha = parameters()[0];
	double beta = parameters()[1];

	double nloglike = 0.0;
	for ( uint l = 0 ; l < labels.size(); l++ )
	{
		if ( (selection.size() > 0) && (!selection[l]) )
			continue;

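		// Phi(x) = (1 + erf(x / sqrt(2))) / 2 is the cumulative Gaussian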
		double inner = labels[l] * ( alpha * looMu[l] + beta ) / sqrt( 1 + alpha*alpha*looVariance[l] );
		nloglike -= log ( (erf(inner/sqrt(2.0)) + 1)/2.0 );
	}

	if ( verbose )
		cerr << "GPMSCLooLikelihoodClassificationOptProb: " << parameters() << " " << nloglike << endl;

	return nloglike;
}

void GPMSCLooLikelihoodClassificationOptProb::computeGradient ( NICE::Vector & newGradient )
{
	newGradient.resize(2);
	newGradient.set(0.0);
	double alpha = parameters()[0];
	double beta = parameters()[1];
	// Rasmussen & Williams: eq. (6.44)
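	// with r_i = (alpha*mu_i + beta) / sqrt(1 + alpha^2 sigma^2_i), we use
	//   d log Phi(z) / dz    = N(z | 0,1) / Phi(z)
	//   d (y_i r_i) / dbeta  = y_i / sqrt(1 + alpha^2 sigma^2_i)
	//   d (y_i r_i) / dalpha = y_i (mu_i - alpha beta sigma^2_i) / (1 + alpha^2 sigma^2_i)^(3/2)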
	for ( uint i = 0 ; i < labels.size(); i++ )
	{
		if ( (selection.size() > 0) && (!selection[i]) )
			continue;

		double sqrtPart = sqrt(1 + alpha*alpha*looVariance[i]);
		double ri = ( alpha*looMu[i] + beta) / sqrtPart;
		double betaGrad2ndPart = (labels[i]/sqrtPart);
		double alphaGrad2ndPart = betaGrad2ndPart * ( looMu[i] - beta*alpha*looVariance[i] ) / (sqrtPart*sqrtPart);
		double firstPart = (exp(-ri*ri/2.0)/sqrt(2.0*M_PI)) * ( 2.0 / (erf(labels[i] * ri / sqrt(2.0)) + 1) );
		newGradient[0] -= alphaGrad2ndPart * firstPart;
		newGradient[1] -= betaGrad2ndPart * firstPart;
	}
	
	if ( verbose )
		cerr << "GPMSCLooLikelihoodClassificationOptProb: gradient=" << newGradient << endl;
}

/************************** GPMSCLooLikelihoodClassificationFixed ********************/

GPMSCLooLikelihoodClassificationFixed::GPMSCLooLikelihoodClassificationFixed ()
{
}

GPMSCLooLikelihoodClassificationFixed::GPMSCLooLikelihoodClassificationFixed ( const NICE::Vector & _selection ) :
	GPMSCLooLikelihoodRegression ( false, _selection )
{
}

double GPMSCLooLikelihoodClassificationFixed::computeObjective ( const NICE::Vector & invKernelMatrixDiag, const NICE::Vector & alpha, 
			const NICE::Vector & labels ) const
{
	uint count_positives = 0;
	uint count_negatives = 0;
	for ( uint l = 0 ; l < labels.size(); l++ )
		if ( (selection.size() == 0) || (selection[l]) )
		{
			count_positives += (labels[l] > 0);
			count_negatives += (labels[l] <= 0);
		}


	double objective = 0.0;
	for ( uint l = 0 ; l < labels.size(); l++ )
	{
		if ( (selection.size() > 0) && (!selection[l]) )
			continue;

		double looVariance = 1.0 / invKernelMatrixDiag[l]; 
		double looMean = labels[l] - alpha[l] * looVariance;

		// class-balanced weighting as proposed by Tommasi and Caputo
		double w = (labels[l] < 0.0) ? (count_negatives + count_positives) / (2.0 * count_negatives) 
									 : (count_negatives + count_positives) / (2.0 * count_positives);
		// smooth, sigmoid-based count of correct LOO predictions
		objective += 1.0 / ( 1.0 + exp(-3.0 * (looMean*labels[l])) ) * w;
	}

	return objective;
}

/************************** GPMSCLooLabor ********************/

GPMSCLooLabor::GPMSCLooLabor ()
{
}

GPMSCLooLabor::GPMSCLooLabor ( const NICE::Vector & _selection ) :
	GPMSCLooLikelihoodRegression ( false, _selection )
{
}

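/* This criterion measures the spread (standard deviation) of the LOO
 * predictive means of the positive examples only.
 */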
double GPMSCLooLabor::computeObjective ( const NICE::Vector & invKernelMatrixDiag, const NICE::Vector & alpha, 
			const NICE::Vector & labels ) const
{
	Vector posResults;
	for ( uint l = 0 ; l < labels.size(); l++ )
	{
		if ( (selection.size() > 0) && (!selection[l]) )
			continue;

		double looVariance = 1.0 / invKernelMatrixDiag[l]; 
		double looMean = labels[l] - alpha[l] * looVariance;

		if ( labels[l] > 0.0 )
			posResults.append ( looMean );
	}

	return posResults.StdDev();
}
/*************************** GPMSCLooVarious ******************/

GPMSCLooVarious::GPMSCLooVarious ( int _type ) :
	GPMSCLooLikelihoodRegression ( false ), type(_type)
{

}

GPMSCLooVarious::GPMSCLooVarious ( int _type, const NICE::Vector & _selection ) :
	GPMSCLooLikelihoodRegression ( false, _selection ), type(_type)
{
}

double GPMSCLooVarious::computeObjective ( const NICE::Vector & invKernelMatrixDiag, const NICE::Vector & alpha, 
			const NICE::Vector & labels ) const
{
	uint count_positives = 0;
	uint count_negatives = 0;
	for ( uint l = 0 ; l < labels.size(); l++ )
		if ( (selection.size() == 0) || (selection[l]) )
		{
			count_positives += (labels[l] > 0);
			count_negatives += (labels[l] <= 0);
		}


	Vector looMean ( labels.size(), 0.0 );
	Vector looVariance ( labels.size(), 0.0 );

	// If only a single positive example is selected, this is similar to
	// one-class classification with the negative examples; with support
	// data it is just the probability of the example under the support
	// classifier.
	ClassificationResults results;
	double errors = 0.0;
	for ( uint l = 0 ; l < labels.size(); l++ )
	{
		if ( (selection.size() > 0) && (!selection[l]) )
			continue;

		looVariance[l] = 1.0 / invKernelMatrixDiag[l]; 
		looMean[l] = labels[l] - alpha[l] * looVariance[l];

		FullVector scores (2);
		scores.set(0.0);
		scores[1] = looMean[l];
		ClassificationResult r ( looMean[l] < 0 ? 0 : 1, scores ); 
		r.classno_groundtruth = labels[l] < 0.0 ? 0 : 1;
		results.push_back ( r );

		double weight = labels[l] < 0.0 ? 0.5/count_negatives : 0.5/count_positives;
		if ( looMean[l] * labels[l] < 0.0 )
			errors += weight;
	}

	if ( type == P_RECOGNITIONRATE )
		return 1.0 - errors;
	else if ( type == P_AUC )
		return results.getBinaryClassPerformance ( ClassificationResults::PERF_AUC );
	else 
		return results.getBinaryClassPerformance ( ClassificationResults::PERF_AVG_PRECISION );

}

/**************** conditional likelihood **********************/
GPMSCConditionalLikelihood::GPMSCConditionalLikelihood ( const NICE::Vector & _selection ) :
	selection( _selection )
{
}

double GPMSCConditionalLikelihood::computeObjective ( KernelData *kernelData,
	const NICE::Vector & labels ) const
{
	const Matrix & kernelMatrix = kernelData->getKernelMatrix();
	/************************************************
	 * If this criterion is used for the parameters s of a chai kernel,
	 * the likelihood decomposes as
	 *   likelihood = conditional likelihood + likelihood of the support data,
	 * where the likelihood of the support data does not depend on s.
	 * The conditional likelihood therefore seems to be the relevant
	 * quantity when comparing different likelihoods.
	 ***********************************************/

	if ( selection.size() == 0 )
	{
		// compute the standard Gaussian log likelihood,
		// log p(y) = -0.5 * ( log |K| + y^T K^{-1} y ) + const;
		// the factor 0.5 and the constant are omitted, since they do
		// not affect model comparison
		cerr << "GPMSCLooEstimates: Computing the standard likelihood!" << endl;
		Vector tmp;
		kernelData->updateCholeskyFactorization();
		kernelData->computeInverseKernelMultiply ( labels, tmp );
		return - kernelData->getLogDetKernelMatrix() - labels.scalarProduct(tmp);
	} else {
		if ( selection.size() != kernelMatrix.rows() )
			fthrow(Exception, "Selection does not fit to the size of the kernel matrix" );

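		// partition the examples into the selected set and its complement,
		// mapping original indices to positions within each block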
		map<uint, uint> indexMapSelection;
		map<uint, uint> indexMapNSelection;
		uint iSel = 0;
		uint iNSel = 0;
		for ( uint i = 0 ; i < selection.size() ; i++ )
			if ( selection[i] == 0 ) 
			{
				indexMapNSelection.insert ( pair<uint, uint> ( i, iNSel ) );
				iNSel++;
			} else {
				indexMapSelection.insert ( pair<uint, uint> ( i, iSel ) );
				iSel++;
			}

		Vector labelsNSelection ( indexMapNSelection.size() );
		Vector labelsSelection ( indexMapSelection.size() );
		for ( uint i = 0 ; i < selection.size() ; i++ )
			if ( selection[i] == 0 ) 
			{
				labelsNSelection[ indexMapNSelection[i] ] = labels[i];
			} else {
				labelsSelection[ indexMapSelection[i] ] = labels[i];
			}

		Matrix KSelection ( indexMapSelection.size(), indexMapSelection.size() );
		Matrix KNSelection ( indexMapNSelection.size(), indexMapNSelection.size() );
		Matrix Ktilde ( indexMapSelection.size(), indexMapNSelection.size() );
		for ( uint i = 0 ; i < selection.size() ; i++ )
		{
			for ( uint j = 0 ; j < selection.size() ; j++ )
			{
				if ( (selection[i] == 0) && (selection[j] != 0) )
				{
					uint k1 = indexMapNSelection[i];
					uint k2 = indexMapSelection[j];
					Ktilde(k2,k1) = kernelMatrix(i,j);
				} else if ( (selection[i] == 0) && (selection[j] == 0) ) {
					uint k1 = indexMapNSelection[i];
					uint k2 = indexMapNSelection[j];
					KNSelection(k1,k2) = kernelMatrix(i,j);
				} else if ( (selection[i] != 0) && (selection[j] != 0) ) {
					uint k1 = indexMapSelection[i];
					uint k2 = indexMapSelection[j];
					KSelection(k1,k2) = kernelMatrix(i,j);
				} // ignore fourth case due to symmetry
			}

		}

		// With the partitioned matrices in place, we can compute the
		// conditional likelihood p(y^selected | y^notselected).

		// cerr << "Computing the conditional likelihood !" << endl;
		// K_{notselected}^{-1}
		Matrix KNSelectionInv;

		CholeskyRobustAuto cra (false);
		cra.robustCholInv ( KNSelection, KNSelectionInv );

		// compute conditional mean and covariance
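		//   mu_{S|N}    = K_{SN} K_{NN}^{-1} y_N
		//   Sigma_{S|N} = K_{SS} - K_{SN} K_{NN}^{-1} K_{NS}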
		Vector conditionalMean = Ktilde * KNSelectionInv * labelsNSelection;
		Matrix conditionalCovariance = KSelection - Ktilde * KNSelectionInv * Ktilde.transpose();
		Vector diff = conditionalMean - labelsSelection;

		Matrix cholA;
		cra.robustChol ( conditionalCovariance, cholA );
		Vector tmp;
		choleskySolve ( cholA, diff, tmp );

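		// log conditional likelihood, up to additive constants and
		// omitting the factor 0.5 (irrelevant for model comparison)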
		double likelihood = 0.0;
		likelihood -= tmp.scalarProduct ( diff );
		likelihood -= cra.getLastLogDet();
	
		// Important note: we return the log likelihood, not the negative log likelihood!
		return likelihood;
	}
}