/**
* @file KCGPRegOneVsAll.cpp
* @brief One vs. All interface for kernel classifiers
* @author Erik Rodner
* @date 12/10/2009

*/
#include <iostream>
#include <sstream>

#ifdef NICE_USELIB_OPENMP
#include <omp.h>
#endif

#include "core/vector/Algorithms.h"

#include "core/optimization/gradientBased/OptimizationAlgorithmFirst.h"
#include "core/optimization/gradientBased/FirstOrderTrustRegion.h"
#include "core/optimization/gradientBased/FirstOrderRasmussen.h"

#include "vislearning/regression/gpregression/GPRegressionOptimizationProblem.h"
#include "core/algebra/CholeskyRobust.h"
#include "core/algebra/CholeskyRobustAuto.h"

#include "KCGPRegOneVsAll.h"
#include <limits>

#undef DEBUG

using namespace NICE;
using namespace std;
using namespace OBJREC;

KCGPRegOneVsAll::KCGPRegOneVsAll( const Config *conf, Kernel *kernelFunction, const string & section )
    : KernelClassifier ( conf, kernelFunction )
{
  this->maxClassNo = 0;
  this->verbose = conf->gB( section, "verbose", false );
  this->optimizeParameters = (kernelFunction == NULL) ? false : conf->gB( section, "optimize_parameters", true );
  this->maxIterations = conf->gI( section, "optimization_maxiterations", 500 );
  this->numSamplesCalibration = conf->gI( section, "calibrated_probabilities_numsamples", 2000 );
  this->calibrateProbabilities = conf->gB( section, "calibrated_probabilities", false );
  if ( verbose && this->calibrateProbabilities )
    cerr << "KCGPRegOneVsAll: probability calibration is turned on, this can result in massive additional load!" << endl;

  // we do joint optimization, therefore single classifiers (cloned from prototype)
  // are not allowed to do optimization themselves
  Config confNoOptimize ( *conf );
  confNoOptimize.sB( section, "optimize_parameters", false );
  this->prototype = new RegGaussianProcess ( &confNoOptimize, kernelFunction, section );

  // Do not use this option, unless you know what you are doing!
  // This function is just necessary for hyperparameter optimization with really large
  // kernel matrices of a kronecker structure!
  bool approximateTraceTerm = conf->gB(section, "approximate_trace_term", false);
  if ( approximateTraceTerm )
    traceApproximation = new TraceApproximation ( conf, section );
  else
    traceApproximation = NULL;

  // select final hyperparameters according to leave-one-out
  useLooParameters = conf->gB(section, "use_loo_parameters", false );

  // select the criterion
  modelselcrit = NULL;
  if ( useLooParameters ) {
    string modelselcrit_s = conf->gS(section, "loo_crit", "loo_pred_prob" );
    modelselcrit = GenericGPModelSelection::selectModelSel ( conf, modelselcrit_s );
    cerr << "KCGPRegOneVsAll: using additional model selection criterion " << modelselcrit_s << endl;
  }

  // do we want to compute the uncertainty of the estimate ?
  computeUncertainty = conf->gB(section, "compute_uncertainty", false );
}

KCGPRegOneVsAll::KCGPRegOneVsAll( const KCGPRegOneVsAll &vcova ): KernelClassifier(vcova)
{
  prototype = vcova.prototype->clone();
  optimizeParameters = vcova.optimizeParameters;
  verbose = vcova.verbose;
  maxIterations = vcova.maxIterations;
  useLooParameters = vcova.useLooParameters;
  computeUncertainty = vcova.computeUncertainty;
  choleskyMatrix = vcova.choleskyMatrix;
  calibrateProbabilities = vcova.calibrateProbabilities;
  numSamplesCalibration = vcova.numSamplesCalibration;

  for (int i = 0; i < (int)vcova.classifiers.size(); i++)
  {
    classifiers.push_back(pair<int, RegGaussianProcess*>(vcova.classifiers[i].first, vcova.classifiers[i].second->clone()));
  }

  if ( vcova.traceApproximation == NULL )
    traceApproximation = NULL;
  else
    traceApproximation = new TraceApproximation(*vcova.traceApproximation);

  if ( vcova.modelselcrit == NULL )
    modelselcrit = NULL;
  else
    modelselcrit = new GPMSCLooLikelihoodRegression(*vcova.modelselcrit);
}

KCGPRegOneVsAll::~KCGPRegOneVsAll()
{
  if ( traceApproximation != NULL )
    delete traceApproximation;
  if ( modelselcrit != NULL )
    delete modelselcrit;
}

void KCGPRegOneVsAll::teach ( KernelData *kernelData, const NICE::Vector & y )
{
  maxClassNo = (int)y.Max();

  set<int> classesUsed;
  for ( uint i = 0 ; i < y.size(); i++ )
    classesUsed.insert ( (int)y[i] );

  classifiers.clear();

  VVector ySet;
  VVector ySetZeroMean;
  for ( set<int>::const_iterator it = classesUsed.begin();
        it != classesUsed.end(); it++)
  {
    int i = *it;

    NICE::Vector ySub ( y.size() );
    NICE::Vector ySubZeroMean ( y.size() );
    for ( size_t j = 0 ; j < ySub.size() ; j++ )
    {
      ySub[j] = ((int)y[j] == i) ? 1 : 0;
      ySubZeroMean[j] = ((int)y[j] == i) ? 1 : -1;
    }
    ySet.push_back ( ySub );
    ySetZeroMean.push_back ( ySubZeroMean );
  }

  // Hyperparameter optimization
  if ( optimizeParameters )
  {
    ParameterizedKernel *kernelPara = dynamic_cast< ParameterizedKernel * > ( kernelFunction );
    if ( kernelPara == NULL ) {
      fthrow(Exception, "KCGPRegOneVsAll: you have to specify a parameterized kernel !");
    }
    GPRegressionOptimizationProblem gpopt ( kernelData, ySetZeroMean, kernelPara, verbose, modelselcrit, traceApproximation );

    // the trust region classifier is better for my large collection of one classification problem :)
    // FirstOrderRasmussen optimizer;
    FirstOrderTrustRegion optimizer;
    optimizer.setMaxIterations ( maxIterations );
    optimizer.setEpsilonG ( 0.01 );

    cout << "KCGPRegOneVsAll: Hyperparameter optimization ..." << endl;
    optimizer.optimizeFirst ( gpopt );
    cout << "KCGPRegOneVsAll: Hyperparameter optimization ...done" << endl;

    if ( useLooParameters )
    {
      cerr << "KCGPRegOneVsAll: using best loo parameters" << endl;
      gpopt.useLooParameters();
    }

    gpopt.update();

    Vector parameters;
    kernelPara->getParameters ( parameters );
    cout << "KCGPRegOneVsAll: Optimization finished: " << parameters << endl << endl;
  } else {
    kernelData->updateCholeskyFactorization();
  }

  //for binary problems
  if (classesUsed.size() == 2 && false)
  {
    set<int>::const_iterator it = classesUsed.begin();
    int classno = *it;
    it++;
    int classno2 = *it;
    const Vector & ySub = ySet[0];
    RegGaussianProcess *classifier;
    classifier = prototype->clone();
    if (verbose)
      fprintf (stderr, "KCGPRegOneVsAll: training classifier class %d <-> %d\n", classno, classno2 );
    classifier->teach ( kernelData, ySub );
    classifiers.push_back ( pair<int, RegGaussianProcess*> (classno, classifier) );
    classifiers.push_back ( pair<int, RegGaussianProcess*> (classno2, classifier) );
  }
  else
  {
    int i = 0;
    for ( set<int>::const_iterator it = classesUsed.begin(); it != classesUsed.end(); it++, i++)
    {
      int classno = *it;
      const Vector & ySubZeroMean = ySetZeroMean[i];
      RegGaussianProcess *classifier;
      classifier = prototype->clone();

      if (verbose)
        fprintf (stderr, "KCGPRegOneVsAll: training classifier class %d <-> remainder\n", classno );

      classifier->teach ( kernelData, ySubZeroMean );

      classifiers.push_back ( pair<int, RegGaussianProcess*> (classno, classifier) );
    }
  }

  if ( computeUncertainty || calibrateProbabilities )
    choleskyMatrix = kernelData->getCholeskyMatrix();

}

/**
* @brief Completely the same, only using a different data structure to store the labels
* @author Alexander Lütz
* @date 18/08/2011
 */
void KCGPRegOneVsAll::teach ( KernelData *kernelData, const std::vector<double> & y )
{
  // FIXME: Do we really need this function? (erik)
  Vector y_nice (y);
  teach ( kernelData, y_nice );
}


ClassificationResult KCGPRegOneVsAll::classifyKernel ( const NICE::Vector & kernelVector, double kernelSelf ) const
{
  if ( classifiers.size() <= 0 )
    fthrow(Exception, "The classifier was not trained with training data (use teach(...))");

  FullVector scores ( maxClassNo + 1 );
  scores.set(0);

  //for binary problems
  if (classifiers.size() == 2 && false)
  {
    int classno = classifiers[0].first;
    int classno2 = classifiers[1].first;
    RegGaussianProcess *classifier = classifiers[0].second;
    double yEstimate = classifier->predictKernel(kernelVector, kernelSelf);
    scores[classno] = yEstimate;
    scores[classno2] = 0;//1-yEstimate;
    //cout << "i: " << 0 << ": " << scores[classno] << endl;
    //cout << "i: " << 1 << ": " << scores[classno2] << endl;
  }
  else
  {
#pragma omp parallel for
    for ( int classifierIndex = 0 ; classifierIndex < (int)classifiers.size(); classifierIndex++ )
    {
      int classno = classifiers[(uint)classifierIndex].first;
      RegGaussianProcess *classifier = classifiers[(uint)classifierIndex].second;
      double yEstimate = classifier->predictKernel(kernelVector, kernelSelf);
      scores[classno] += yEstimate;
      //cout << "i: " << classifierIndex << ": " << scores[classno] << endl;
    }
  }

  double uncertainty = 0.0;
  if ( computeUncertainty || calibrateProbabilities )
  {
    Vector tmp;
    choleskySolveLargeScale ( choleskyMatrix, kernelVector, tmp );

    // tmp = K^{-1} k*
    uncertainty = kernelSelf - kernelVector.scalarProduct ( tmp );

    /*if(uncertainty < 0.0)
    //    uncertainty = 0.0;*/

    if ( calibrateProbabilities )
    {
      /*********************************************************************
       * Calibration of probabilities is based on the following
       * idea:
       *
       * The scores \mu_i (or scores[i]) and the uncertainty
       * r.uncertainty are the mean and the variance of the predictive
       * distribution p(y_i | ...). So actually we do not know the correct value for
       * y_i and therefore just taking the maximum score ignores this uncertainty
       * completely!
       * What we might want to have is the probability for each class k
       * p_k = p( k = argmax_i y_i )
       *
       * Calculating this probability for n>2 is intractable and we
       * have to do monte carlo estimation which is performed in the following code
       *
       * An alternative would be to approximate p_k with
       * p( mu_k >= max_{i != k} y_i ) = prod_{i != k} F_i(mu_k)
       * with F_i being the cumulative distribution of the normal distribution i
       *
       * Details: Erik Rodner, "Learning with Few Examples for Visual Recognition Problems", PhD thesis
       *
       ********************************************************************/

      double stddev = sqrt(uncertainty);
      FullVector calibratedScores ( maxClassNo + 1 );
      calibratedScores.set(0.0);
      for ( uint i = 0 ; i < numSamplesCalibration ; i++ )
      {
        uint cmaxClassno = 0;
        double cmax = - std::numeric_limits<double>::max();
        for ( int classifierIndex = 0 ; classifierIndex < (int)classifiers.size(); classifierIndex++ )
        {
          int classno = classifiers[(uint)classifierIndex].first;
          double s = randGaussDouble ( stddev ) + scores[classno];
          if ( s > cmax )
          {
            cmax = s;
            cmaxClassno = classno;
          }
        }
        calibratedScores[ cmaxClassno ]++;
      }
      calibratedScores.normalize();

      // calibrating probabilities should not affect our hard
      // decision for a specific class
      if ( verbose ) {
        if ( scores.maxElement() != calibratedScores.maxElement() )
          cerr << "Calibration of probabilities affected the hard decision, you should increase calibrated_probabilities_numsamples !!" << endl;
      }

      scores = calibratedScores;
    }
  }

  ClassificationResult r ( scores.maxElement(), scores );
  r.uncertainty = uncertainty;

  return r;
}

KCGPRegOneVsAll* KCGPRegOneVsAll::clone(void) const
{
  KCGPRegOneVsAll *classifier = new KCGPRegOneVsAll( *this );
  return classifier;
}

void KCGPRegOneVsAll::clear()
{
  //nothing to clear
}

void KCGPRegOneVsAll::restore(std::istream& ifs, int type)
{
  ifs.precision (numeric_limits<double>::digits10 + 1);
  ifs >> maxClassNo;
  ifs >> computeUncertainty;
  ifs >> calibrateProbabilities;

  if (calibrateProbabilities || computeUncertainty)
  {
    ifs >> choleskyMatrix;
  }

  int size;
  ifs >> size;

  for (int i = 0; i < size; i++)
  {
    int tmp;
    ifs >> tmp;
    RegGaussianProcess *classifier;
    classifier = prototype->clone();
    classifier->restore(ifs);
    classifiers.push_back ( pair<int, RegGaussianProcess*> (tmp, classifier) );
  }

  KernelClassifier::restore(ifs, type);
}

void KCGPRegOneVsAll::store(std::ostream& ofs, int type) const
{
  ofs.precision (numeric_limits<double>::digits10 + 1);

  ofs << maxClassNo << endl;
  ofs << computeUncertainty << endl;
  ofs << calibrateProbabilities << endl;


  if (calibrateProbabilities || computeUncertainty)
  {
    ofs << choleskyMatrix << endl;
  }

  ofs << classifiers.size() << endl;
  for (uint i = 0; i < classifiers.size(); i++)
  {
    ofs << classifiers[i].first << endl;
    classifiers[i].second->store(ofs);
    ofs << endl;
  }
  KernelClassifier::store(ofs, type);
}