/**
* @file KCGPLaplaceOneVsAll.cpp
* @brief One vs. All interface for kernel classifiers
* @author Erik Rodner
* @date 12/10/2009
*/
#include <iostream>
#include <sstream>
#include "core/vector/Algorithms.h"
#include "core/optimization/gradientBased/OptimizationAlgorithmFirst.h"
#include "core/optimization/gradientBased/FirstOrderTrustRegion.h"
#include "core/optimization/gradientBased/FirstOrderRasmussen.h"
#include "vislearning/classifier/kernelclassifier/GPLaplaceOptimizationProblem.h"
#include "core/algebra/CholeskyRobust.h"
#include "core/algebra/CholeskyRobustAuto.h"
#include "KCGPLaplaceOneVsAll.h"
#include "LHCumulativeGauss.h"
using namespace std;
using namespace NICE;
using namespace OBJREC;
/** Constructor: reads all settings from the given config section and
 *  allocates the (cumulative Gaussian) likelihood function.
 *  @param conf configuration object (copied internally)
 *  @param kernelFunction kernel to be used; may be NULL, which disables
 *         hyperparameter optimization
 *  @param section config section to read the settings from
 */
KCGPLaplaceOneVsAll::KCGPLaplaceOneVsAll( const Config *conf, Kernel *kernelFunction, const string & section )
    : KernelClassifier ( conf, kernelFunction ),
      // keep a private copy of the config: the Laplace approximations
      // created later in teach() are initialized from it
      confCopy(*conf),
      confSection(section)
{
  maxClassNo = 0;
  verbose = conf->gB ( section, "verbose", false );
  // hyperparameter optimization only makes sense with a kernel object
  optimizeParameters = ( kernelFunction != NULL ) && conf->gB ( section, "optimize_parameters", true );
  maxIterations = conf->gI ( section, "optimization_maxiterations", 500 );
  // the cumulative Gaussian is currently the only supported likelihood
  likelihoodFunction = new LHCumulativeGauss ( conf->gD ( section, "likelihood_lengthscale", sqrt(2.0f) ) );
  useLooParameters = conf->gB ( section, "use_loo_parameters", false );
}
/** Destructor: releases the likelihood function and all per-class
 *  Laplace approximations allocated in teach(). */
KCGPLaplaceOneVsAll::~KCGPLaplaceOneVsAll()
{
  // deleting a NULL pointer is a well-defined no-op, so no guard is needed
  delete likelihoodFunction;

  // an empty vector simply yields zero iterations, so no size check either
  for ( uint i = 0 ; i < laplaceApproximations.size(); i++ )
    delete laplaceApproximations[i];
  laplaceApproximations.clear();
}
/** Train one binary GP-Laplace classifier per class (one-vs-all).
 *  @param kernelData kernel matrix wrapper of the training set
 *  @param y training labels, assumed to be integers in 0..maxClassNo
 *
 *  Optionally performs gradient-based hyperparameter optimization if
 *  "optimize_parameters" was enabled and the kernel is parameterized.
 */
void KCGPLaplaceOneVsAll::teach ( KernelData *kernelData, const NICE::Vector & y )
{
  maxClassNo = (int)y.Max();

  // FIXME: This code is still not suitable for settings
  // with missing classes between 0..maxClassNo

  // BUGFIX: clear the binary label vectors of a possible previous teach()
  // call; otherwise repeated training accumulates stale entries and leaves
  // ySetZeroMean out of sync with laplaceApproximations (which IS rebuilt)
  ySetZeroMean.clear();

  classnos.resize(maxClassNo+1);
  // build one +1/-1 label vector per class for the one-vs-all scheme
  for ( int i = 0 ; i <= maxClassNo ; i++ )
  {
    NICE::Vector ySubZeroMean ( y.size() );
    for ( size_t j = 0 ; j < y.size() ; j++ )
      ySubZeroMean[j] = ((int)y[j] == i) ? 1 : -1;
    ySetZeroMean.push_back ( ySubZeroMean );
    classnos[i] = i;
  }

  // dispose Laplace approximations of a previous training run
  for ( uint i = 0 ; i < laplaceApproximations.size(); i++ )
    delete laplaceApproximations[i];
  laplaceApproximations.clear();

  // one fresh Laplace approximation per binary sub-problem
  for ( uint k = 0 ; k < ySetZeroMean.size(); k++ )
    laplaceApproximations.push_back ( new LaplaceApproximation ( &confCopy, confSection ) );

  // Hyperparameter optimization
  if ( optimizeParameters )
  {
    ParameterizedKernel *kernelPara = dynamic_cast< ParameterizedKernel * > ( kernelFunction );
    if ( kernelPara == NULL ) {
      fthrow(Exception, "KCGPLaplaceOneVsAll: you have to specify a parameterized kernel !");
    }
    GPLaplaceOptimizationProblem gpopt ( kernelData, ySetZeroMean, kernelPara, likelihoodFunction, laplaceApproximations, verbose );

    // the trust region classifier is better for my large collection of one classification problem :)
    // FirstOrderRasmussen optimizer;
    FirstOrderTrustRegion optimizer;
    optimizer.setMaxIterations ( maxIterations );
    optimizer.setEpsilonG ( 0.01 );

    cout << "KCGPLaplaceOneVsAll: Hyperparameter optimization ..." << endl;
    optimizer.optimizeFirst ( gpopt );
    cout << "KCGPLaplaceOneVsAll: Hyperparameter optimization ...done" << endl;

    if ( useLooParameters )
    {
      cerr << "KCGPLaplaceOneVsAll: using best loo parameters" << endl;
      gpopt.useLooParameters();
    }

    // re-run the Laplace approximations with the final hyperparameters
    gpopt.update();

    Vector parameters;
    kernelPara->getParameters ( parameters );
    cout << "KCGPLaplaceOneVsAll: Optimization finished: " << parameters << endl << endl;
  } else {
    // fixed hyperparameters: factorize once, then approximate each sub-problem
    kernelData->updateCholeskyFactorization();
    for ( uint i = 0 ; i < ySetZeroMean.size() ; i++ )
    {
      const Vector & ySubZeroMean = ySetZeroMean[i];
      fprintf (stderr, "KCGPLaplaceOneVsAll: training classifier class %d <-> remainder\n", i );
      laplaceApproximations[i]->approximate ( kernelData, ySubZeroMean, likelihoodFunction );
    }
  }
}
/** Classify an example given its kernel values with the training set.
 *  @param kernelVector kernel evaluations between the example and all
 *         training examples
 *  @param kernelSelf kernel self-evaluation k(x,x) of the example
 *  @return classification result with averaged probabilistic scores
 *  @throws Exception if teach() has not been called before
 */
ClassificationResult KCGPLaplaceOneVsAll::classifyKernel ( const NICE::Vector & kernelVector, double kernelSelf ) const
{
  // classification is only possible after training
  if ( laplaceApproximations.size() == 0 )
    fthrow(Exception, "The classifier was not trained with training data (use teach(...))");

  FullVector scores ( maxClassNo+1 );
  scores.set(0);

  // accumulate the votes of every binary one-vs-all classifier
  for ( uint k = 0 ; k < laplaceApproximations.size() ; k++ )
  {
    const int positiveClass = classnos[k];
    // predictive estimate for "example belongs to the positive class"
    const double yEstimate = laplaceApproximations[k]->predict ( kernelVector, kernelSelf, ySetZeroMean[k], likelihoodFunction );
    // the positive class collects yEstimate, every other class 1 - yEstimate
    for ( uint j = 0 ; j < classnos.size() ; j++ )
      scores[ classnos[j] ] += ( classnos[j] == positiveClass ) ? yEstimate : ( 1.0 - yEstimate );
  }

  // average the scores over the number of binary classifiers
  scores.multiply ( 1.0 / laplaceApproximations.size() );

  return ClassificationResult ( scores.maxElement(), scores );
}