/** 
* @file KCGPRegOneVsAll.cpp
* @brief One vs. All interface for kernel classifiers
* @author Erik Rodner
* @date 12/10/2009

*/
#include <iostream>
#include <sstream>

#ifdef NICE_USELIB_OPENMP
#include <omp.h>
#endif

#include "core/vector/Algorithms.h"

#include "core/optimization/OptimizationAlgorithmFirst.h"
#include "core/optimization/FirstOrderTrustRegion.h"
#include "core/optimization/FirstOrderRasmussen.h"

#include "vislearning/regression/gpregression/GPRegressionOptimizationProblem.h"
#include "core/algebra/CholeskyRobust.h"
#include "core/algebra/CholeskyRobustAuto.h"

#include "KCGPRegOneVsAll.h"
#include <limits>

#undef DEBUG

using namespace NICE;
using namespace std;
using namespace OBJREC;

KCGPRegOneVsAll::KCGPRegOneVsAll( const Config *conf, Kernel *kernelFunction, const string & section )
	: KernelClassifier ( conf, kernelFunction )
{
	this->maxClassNo = 0;
	this->verbose = conf->gB( section, "verbose", false );
	this->optimizeParameters = (kernelFunction == NULL) ? false : conf->gB( section, "optimize_parameters", true );
	this->maxIterations = conf->gI( section, "optimization_maxiterations", 500 );
	this->numSamplesCalibration = conf->gI( section, "calibrated_probabilities_numsamples", 2000 );
	this->calibrateProbabilities = conf->gB( section, "calibrated_probabilities", false );
	if ( verbose && this->calibrateProbabilities )
		cerr << "KCGPRegOneVsAll: probability calibration is turned on, this can result in massive additional load!" << endl;

	// we do joint optimization, therefore single classifiers (cloned from prototype)
	// are not allowed to do optimization themselves
	Config confNoOptimize ( *conf );
	confNoOptimize.sB( section, "optimize_parameters", false );
	this->prototype = new RegGaussianProcess ( &confNoOptimize, kernelFunction, section ); 

	// Do not use this option, unless you know what you are doing!
	// This function is just necessary for hyperparameter optimization with really large
	// kernel matrices of a kronecker structure!
	bool approximateTraceTerm = conf->gB(section, "approximate_trace_term", false);
	if ( approximateTraceTerm ) 
		traceApproximation = new TraceApproximation ( conf, section );
	else
		traceApproximation = NULL;

	// select final hyperparameters according to leave-one-out
	useLooParameters = conf->gB(section, "use_loo_parameters", false );
	
	// select the criterion
	modelselcrit = NULL;
	if ( useLooParameters ) {
		string modelselcrit_s = conf->gS(section, "loo_crit", "loo_pred_prob" );
		modelselcrit = GenericGPModelSelection::selectModelSel ( conf, modelselcrit_s );
		cerr << "KCGPRegOneVsAll: using additional model selection criterion " << modelselcrit_s << endl;
	}

	// do we want to compute the uncertainty of the estimate ?
	computeUncertainty = conf->gB(section, "compute_uncertainty", false );
}

KCGPRegOneVsAll::KCGPRegOneVsAll( const KCGPRegOneVsAll &vcova ): KernelClassifier(vcova)
{
	prototype = vcova.prototype->clone();
	optimizeParameters = vcova.optimizeParameters;
	verbose = vcova.verbose;
	maxIterations = vcova.maxIterations;
	useLooParameters = vcova.useLooParameters;
	computeUncertainty = vcova.computeUncertainty;
	choleskyMatrix = vcova.choleskyMatrix;
	calibrateProbabilities = vcova.calibrateProbabilities;
	numSamplesCalibration = vcova.numSamplesCalibration;
	
	for(int i = 0; i < (int)vcova.classifiers.size(); i++)
	{
		classifiers.push_back(pair<int, RegGaussianProcess*>(vcova.classifiers[i].first,vcova.classifiers[i].second->clone()));
	}

	if ( vcova.traceApproximation == NULL )
		traceApproximation = NULL;
	else
		traceApproximation = new TraceApproximation(*vcova.traceApproximation);

	if ( vcova.modelselcrit == NULL )
		modelselcrit = NULL;
	else
		modelselcrit = new GPMSCLooLikelihoodRegression(*vcova.modelselcrit);
}

KCGPRegOneVsAll::~KCGPRegOneVsAll()
{
	if ( traceApproximation != NULL )
		delete traceApproximation;
	if ( modelselcrit != NULL )
		delete modelselcrit;
}

void KCGPRegOneVsAll::teach ( KernelData *kernelData, const NICE::Vector & y )
{
	maxClassNo = (int)y.Max();

	set<int> classesUsed;
	for ( uint i = 0 ; i < y.size(); i++ )
		classesUsed.insert ( (int)y[i] );
	
	classifiers.clear();

	VVector ySet;
	VVector ySetZeroMean;
	for ( set<int>::const_iterator it = classesUsed.begin();
		it != classesUsed.end(); it++)
	{
		int i = *it;

		NICE::Vector ySub ( y.size() );
		NICE::Vector ySubZeroMean ( y.size() );
		for ( size_t j = 0 ; j < ySub.size() ; j++ )
		{
			ySub[j] = ((int)y[j] == i) ? 1 : 0;
			ySubZeroMean[j] = ((int)y[j] == i) ? 1 : -1;
		}
		ySet.push_back ( ySub );
		ySetZeroMean.push_back ( ySubZeroMean );
	}

	// Hyperparameter optimization
	if ( optimizeParameters ) 
	{
		ParameterizedKernel *kernelPara = dynamic_cast< ParameterizedKernel * > ( kernelFunction );
		if ( kernelPara == NULL ) {
			fthrow(Exception, "KCGPRegOneVsAll: you have to specify a parameterized kernel !");
		}
		GPRegressionOptimizationProblem gpopt ( kernelData, ySetZeroMean, kernelPara, verbose, modelselcrit, traceApproximation );

		// the trust region classifier is better for my large collection of one classification problem :)
		// FirstOrderRasmussen optimizer;
		FirstOrderTrustRegion optimizer;
		optimizer.setMaxIterations ( maxIterations );
		optimizer.setEpsilonG ( 0.01 );

		cout << "KCGPRegOneVsAll: Hyperparameter optimization ..." << endl;
		optimizer.optimizeFirst ( gpopt );
		cout << "KCGPRegOneVsAll: Hyperparameter optimization ...done" << endl;
	
		if ( useLooParameters ) 
		{
			cerr << "KCGPRegOneVsAll: using best loo parameters" << endl;
			gpopt.useLooParameters();
		}

		gpopt.update();

		Vector parameters;
		kernelPara->getParameters ( parameters );
		cout << "KCGPRegOneVsAll: Optimization finished: " << parameters << endl << endl;
	} else {
		kernelData->updateCholeskyFactorization();
	}
	
	//for binary problems
	if(classesUsed.size() == 2 && false)
	{
		set<int>::const_iterator it = classesUsed.begin();
		int classno = *it;
		it++;
		int classno2 = *it;
		const Vector & ySub = ySet[0];
		RegGaussianProcess *classifier;
		classifier = prototype->clone();
		if (verbose)
			fprintf (stderr, "KCGPRegOneVsAll: training classifier class %d <-> %d\n", classno, classno2 );
		classifier->teach ( kernelData, ySub );
		classifiers.push_back ( pair<int, RegGaussianProcess*> (classno, classifier) );
		classifiers.push_back ( pair<int, RegGaussianProcess*> (classno2, classifier) );
	}
	else
	{
		int i = 0;
		for ( set<int>::const_iterator it = classesUsed.begin(); it != classesUsed.end(); it++,i++)
		{
			int classno = *it;
			const Vector & ySub = ySet[i];
			RegGaussianProcess *classifier;
			classifier = prototype->clone();
	
			if (verbose)
				fprintf (stderr, "KCGPRegOneVsAll: training classifier class %d <-> remainder\n", classno );
	
			classifier->teach ( kernelData, ySub );

			classifiers.push_back ( pair<int, RegGaussianProcess*> (classno, classifier) );
		}
	}

	if ( computeUncertainty || calibrateProbabilities )
		choleskyMatrix = kernelData->getCholeskyMatrix();

}

/**
* @brief Completely the same, only using a different data structure to store the labels
* @author Alexander Lütz
* @date 18/08/2011
 */
void KCGPRegOneVsAll::teach ( KernelData *kernelData, const std::vector<double> & y )
{
	// FIXME: Do we really need this function? (erik)
	Vector y_nice (y);
	teach ( kernelData, y_nice );
}


ClassificationResult KCGPRegOneVsAll::classifyKernel ( const NICE::Vector & kernelVector, double kernelSelf ) const
{
	if ( classifiers.size() <= 0 ) 
		fthrow(Exception, "The classifier was not trained with training data (use teach(...))");

	FullVector scores ( maxClassNo+1 );
	scores.set(0);

	//for binary problems
	if(classifiers.size() == 2 && false)
	{
		int classno = classifiers[0].first;
		int classno2 = classifiers[1].first;
		RegGaussianProcess *classifier = classifiers[0].second;
		double yEstimate = classifier->predictKernel(kernelVector, kernelSelf);
		scores[classno] = yEstimate;
		scores[classno2] = 0;//1-yEstimate;
		//cout << "i: " << 0 << ": " << scores[classno] << endl;
		//cout << "i: " << 1 << ": " << scores[classno2] << endl;
	}
	else
	{
#pragma omp parallel for
		for ( int classifierIndex = 0 ; classifierIndex < (int)classifiers.size(); classifierIndex++ )
		{
			int classno = classifiers[(uint)classifierIndex].first;
			RegGaussianProcess *classifier = classifiers[(uint)classifierIndex].second;
			double yEstimate = classifier->predictKernel(kernelVector, kernelSelf);
			scores[classno] += yEstimate;
			//cout << "i: " << classifierIndex << ": " << scores[classno] << endl;
		}
	}

	double uncertainty = 0.0;
	if ( computeUncertainty || calibrateProbabilities )
	{
		Vector tmp;
		choleskySolveLargeScale ( choleskyMatrix, kernelVector, tmp );

		// tmp = K^{-1} k*
		uncertainty = kernelSelf - kernelVector.scalarProduct ( tmp );
		
		/*if(uncertainty < 0.0)
// 			uncertainty = 0.0;*/

		if ( calibrateProbabilities )
		{
			/*********************************************************************
			 * Calibration of probabilities is based on the following
			 * idea: 
			 *
			 * The scores \mu_i (or scores[i]) and the uncertainty
			 * r.uncertainty are the mean and the variance of the predictive
			 * distribution p(y_i | ...). So actually we do not know the correct value for
			 * y_i and therefore just taking the maximum score ignores this uncertainty 
			 * completely! 
			 * What we might want to have is the probability for each class k
			 * p_k = p( k = argmax_i y_i )
			 *
			 * Calculating this probability for n>2 is intractable and we
			 * have to do monte carlo estimation which is performed in the following code
			 *
			 * An alternative would be to approximate p_k with 
			 * p( mu_k >= max_{i != k} y_i ) = prod_{i != k} F_i(mu_k) 
			 * with F_i being the cumulative distribution of the normal distribution i
			 *
			 * Details: Erik Rodner, "Learning with Few Examples for Visual Recognition Problems", PhD thesis
			 *
			 ********************************************************************/

			double stddev = sqrt(uncertainty);
			FullVector calibratedScores ( maxClassNo + 1 );
			calibratedScores.set(0.0);
			for ( uint i = 0 ; i < numSamplesCalibration ; i++ )
			{
				uint cmaxClassno = 0;
				double cmax = - std::numeric_limits<double>::max();
				for ( int classifierIndex = 0 ; classifierIndex < (int)classifiers.size(); classifierIndex++ )
				{
					int classno = classifiers[(uint)classifierIndex].first;
					double s = randGaussDouble ( stddev ) + scores[classno];
					if ( s > cmax )
					{
						cmax = s;
						cmaxClassno = classno;
					}
				}
				calibratedScores[ cmaxClassno ]++;
			}
			calibratedScores.normalize();

			// calibrating probabilities should not affect our hard
			// decision for a specific class
			if ( verbose ) {
				if ( scores.maxElement() != calibratedScores.maxElement() )
					cerr << "Calibration of probabilities affected the hard decision, you should increase calibrated_probabilities_numsamples !!" << endl;
			}

			scores = calibratedScores;
		}
	}

	ClassificationResult r ( scores.maxElement(), scores );
	r.uncertainty = uncertainty;

	return r;
}

KCGPRegOneVsAll* KCGPRegOneVsAll::clone(void) const
{
	KCGPRegOneVsAll *classifier = new KCGPRegOneVsAll( *this );
	return classifier;
}

void KCGPRegOneVsAll::clear()
{
	//nothing to clear
}

void KCGPRegOneVsAll::restore(std::istream& ifs, int type)
{
	ifs.precision (numeric_limits<double>::digits10 + 1);
	ifs >> maxClassNo;
	ifs >> computeUncertainty;
	ifs >> calibrateProbabilities;

	if(calibrateProbabilities || computeUncertainty)
	{
		ifs >> choleskyMatrix;
	}

	int size;
	ifs >> size;

	for(int i = 0; i < size; i++)
	{
		int tmp;
		ifs >> tmp;
		RegGaussianProcess *classifier;
		classifier = prototype->clone();
		classifier->restore(ifs);
		classifiers.push_back ( pair<int, RegGaussianProcess*> (tmp, classifier) );
	}

	KernelClassifier::restore(ifs,type);
}

void KCGPRegOneVsAll::store(std::ostream& ofs, int type) const
{
	ofs.precision (numeric_limits<double>::digits10 + 1);
	
	ofs << maxClassNo << endl;
	ofs << computeUncertainty << endl;
	ofs << calibrateProbabilities << endl;

	
	if(calibrateProbabilities || computeUncertainty)
	{
		ofs << choleskyMatrix << endl;
	}
	
	ofs << classifiers.size() << endl;
	for(uint i = 0; i < classifiers.size(); i++)
	{
		ofs << classifiers[i].first << endl;
		classifiers[i].second->store(ofs);
		ofs << endl;
	}
	KernelClassifier::store(ofs,type);
}