/**
 * @file KCGPRegOneVsAll.cpp
 * @brief One vs. All interface for kernel classifiers
 * @author Erik Rodner
 * @date 12/10/2009
 */
#include <iostream>
#include <sstream>

#ifdef NICE_USELIB_OPENMP
#include <omp.h>
#endif

#include "core/vector/Algorithms.h"
#include "core/optimization/limun/OptimizationAlgorithmFirst.h"
#include "core/optimization/limun/FirstOrderTrustRegion.h"
#include "core/optimization/limun/FirstOrderRasmussen.h"

#include "vislearning/regression/gpregression/GPRegressionOptimizationProblem.h"

#include "core/algebra/CholeskyRobust.h"
#include "core/algebra/CholeskyRobustAuto.h"

#include "KCGPRegOneVsAll.h"

#include <limits>

#undef DEBUG

using namespace NICE;
using namespace std;
using namespace OBJREC;

KCGPRegOneVsAll::KCGPRegOneVsAll ( const Config *conf, Kernel *kernelFunction, const string & section )
    : KernelClassifier ( conf, kernelFunction )
{
  this->maxClassNo = 0;
  this->verbose = conf->gB ( section, "verbose", false );
  this->optimizeParameters = ( kernelFunction == NULL ) ? false : conf->gB ( section, "optimize_parameters", true );
  this->maxIterations = conf->gI ( section, "optimization_maxiterations", 500 );
  this->numSamplesCalibration = conf->gI ( section, "calibrated_probabilities_numsamples", 2000 );
  this->calibrateProbabilities = conf->gB ( section, "calibrated_probabilities", false );

  if ( verbose && this->calibrateProbabilities )
    cerr << "KCGPRegOneVsAll: probability calibration is turned on; this can result in a massive additional load!" << endl;

  // We perform joint optimization, therefore the single classifiers (cloned
  // from the prototype) are not allowed to optimize their parameters themselves.
  Config confNoOptimize ( *conf );
  confNoOptimize.sB ( section, "optimize_parameters", false );
  this->prototype = new RegGaussianProcess ( &confNoOptimize, kernelFunction, section );

  // Do not use this option unless you know what you are doing!
  // It is only necessary for hyperparameter optimization with really large
  // kernel matrices of Kronecker structure.
  bool approximateTraceTerm = conf->gB ( section, "approximate_trace_term", false );
  if ( approximateTraceTerm )
    traceApproximation = new TraceApproximation ( conf, section );
  else
    traceApproximation = NULL;

  // select the final hyperparameters according to leave-one-out
  useLooParameters = conf->gB ( section, "use_loo_parameters", false );

  // select the model selection criterion
  modelselcrit = NULL;
  if ( useLooParameters )
  {
    string modelselcrit_s = conf->gS ( section, "loo_crit", "loo_pred_prob" );
    modelselcrit = GenericGPModelSelection::selectModelSel ( conf, modelselcrit_s );
    cerr << "KCGPRegOneVsAll: using additional model selection criterion " << modelselcrit_s << endl;
  }

  // do we want to compute the uncertainty of the estimate?
  computeUncertainty = conf->gB ( section, "compute_uncertainty", false );
}
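/* A minimal sketch of the configuration keys read by the constructor above,
 * with the default values from the code. The section name is passed in by the
 * caller; "KCGPRegOneVsAll" is only an assumed example here:
 *
 *   [KCGPRegOneVsAll]
 *   verbose = false
 *   optimize_parameters = true
 *   optimization_maxiterations = 500
 *   calibrated_probabilities = false
 *   calibrated_probabilities_numsamples = 2000
 *   approximate_trace_term = false
 *   use_loo_parameters = false
 *   loo_crit = loo_pred_prob
 *   compute_uncertainty = false
 */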
KCGPRegOneVsAll::KCGPRegOneVsAll ( const KCGPRegOneVsAll & vcova )
    : KernelClassifier ( vcova )
{
  prototype = vcova.prototype->clone();
  optimizeParameters = vcova.optimizeParameters;
  verbose = vcova.verbose;
  maxIterations = vcova.maxIterations;
  useLooParameters = vcova.useLooParameters;
  computeUncertainty = vcova.computeUncertainty;
  choleskyMatrix = vcova.choleskyMatrix;
  calibrateProbabilities = vcova.calibrateProbabilities;
  numSamplesCalibration = vcova.numSamplesCalibration;

  for ( int i = 0; i < (int)vcova.classifiers.size(); i++ )
  {
    classifiers.push_back ( pair<int, RegGaussianProcess *> ( vcova.classifiers[i].first, vcova.classifiers[i].second->clone() ) );
  }

  if ( vcova.traceApproximation == NULL )
    traceApproximation = NULL;
  else
    traceApproximation = new TraceApproximation ( *vcova.traceApproximation );

  if ( vcova.modelselcrit == NULL )
    modelselcrit = NULL;
  else
    modelselcrit = new GPMSCLooLikelihoodRegression ( *vcova.modelselcrit );
}

KCGPRegOneVsAll::~KCGPRegOneVsAll()
{
  if ( traceApproximation != NULL )
    delete traceApproximation;
  if ( modelselcrit != NULL )
    delete modelselcrit;
}

void KCGPRegOneVsAll::teach ( KernelData *kernelData, const NICE::Vector & y )
{
  maxClassNo = (int)y.Max();

  set<int> classesUsed;
  for ( uint i = 0 ; i < y.size(); i++ )
    classesUsed.insert ( (int)y[i] );

  classifiers.clear();

  VVector ySet;
  VVector ySetZeroMean;
  for ( set<int>::const_iterator it = classesUsed.begin();
        it != classesUsed.end(); it++ )
  {
    int i = *it;
    NICE::Vector ySub ( y.size() );
    NICE::Vector ySubZeroMean ( y.size() );
    for ( size_t j = 0 ; j < ySub.size() ; j++ )
    {
      // binary labels for class i: {0,1} targets for the regression,
      // {-1,1} (zero mean) targets for the hyperparameter optimization
      ySub[j] = ( (int)y[j] == i ) ? 1 : 0;
      ySubZeroMean[j] = ( (int)y[j] == i ) ? 1 : -1;
    }
    ySet.push_back ( ySub );
    ySetZeroMean.push_back ( ySubZeroMean );
  }
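  /* Worked example of the label encoding above: for y = [0, 2, 1, 0] and
   * class i = 0 we obtain
   *   ySub         = [1, 0, 0, 1]    (0/1 regression targets)
   *   ySubZeroMean = [1, -1, -1, 1]  (zero-mean optimization targets)
   * and one such pair of vectors is built for every class occurring in y.
   */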
  // hyperparameter optimization
  if ( optimizeParameters )
  {
    ParameterizedKernel *kernelPara = dynamic_cast< ParameterizedKernel * > ( kernelFunction );
    if ( kernelPara == NULL )
    {
      fthrow ( Exception, "KCGPRegOneVsAll: you have to specify a parameterized kernel!" );
    }
    GPRegressionOptimizationProblem gpopt ( kernelData, ySetZeroMean, kernelPara, verbose, modelselcrit, traceApproximation );

    // the trust region optimizer is better for my large collection of
    // one-vs-all classification problems :)
    // FirstOrderRasmussen optimizer;
    FirstOrderTrustRegion optimizer;
    optimizer.setMaxIterations ( maxIterations );
    optimizer.setEpsilonG ( 0.01 );

    cout << "KCGPRegOneVsAll: Hyperparameter optimization ..." << endl;
    optimizer.optimizeFirst ( gpopt );
    cout << "KCGPRegOneVsAll: Hyperparameter optimization ...done" << endl;

    if ( useLooParameters )
    {
      cerr << "KCGPRegOneVsAll: using best loo parameters" << endl;
      gpopt.useLooParameters();
    }

    gpopt.update();

    Vector parameters;
    kernelPara->getParameters ( parameters );
    cout << "KCGPRegOneVsAll: Optimization finished: " << parameters << endl << endl;
  }
  else
  {
    kernelData->updateCholeskyFactorization();
  }

  // special case of binary problems (currently disabled by the "&& false")
  if ( classesUsed.size() == 2 && false )
  {
    set<int>::const_iterator it = classesUsed.begin();
    int classno = *it;
    it++;
    int classno2 = *it;
    const Vector & ySub = ySet[0];
    RegGaussianProcess *classifier;
    classifier = prototype->clone();
    if ( verbose )
      fprintf ( stderr, "KCGPRegOneVsAll: training classifier class %d <-> %d\n", classno, classno2 );
    classifier->teach ( kernelData, ySub );
    classifiers.push_back ( pair<int, RegGaussianProcess *> ( classno, classifier ) );
    classifiers.push_back ( pair<int, RegGaussianProcess *> ( classno2, classifier ) );
  }
  else
  {
    int i = 0;
    for ( set<int>::const_iterator it = classesUsed.begin();
          it != classesUsed.end(); it++, i++ )
    {
      int classno = *it;
      const Vector & ySub = ySet[i];
      RegGaussianProcess *classifier;
      classifier = prototype->clone();
      if ( verbose )
        fprintf ( stderr, "KCGPRegOneVsAll: training classifier class %d <-> remainder\n", classno );
      classifier->teach ( kernelData, ySub );
      classifiers.push_back ( pair<int, RegGaussianProcess *> ( classno, classifier ) );
    }
  }

  if ( computeUncertainty || calibrateProbabilities )
    choleskyMatrix = kernelData->getCholeskyMatrix();
}

/**
 * @brief Completely the same, only using a different data structure to store the labels
 * @author Alexander Lütz
 * @date 18/08/2011
 */
void KCGPRegOneVsAll::teach ( KernelData *kernelData, const std::vector<double> & y )
{
  // FIXME: Do we really need this function? (erik)
  Vector y_nice ( y );
  teach ( kernelData, y_nice );
}

ClassificationResult KCGPRegOneVsAll::classifyKernel ( const NICE::Vector & kernelVector, double kernelSelf ) const
{
  if ( classifiers.size() <= 0 )
    fthrow ( Exception, "The classifier was not trained with training data (use teach(...))" );

  FullVector scores ( maxClassNo + 1 );
  scores.set ( 0 );

  // special case of binary problems (currently disabled by the "&& false")
  if ( classifiers.size() == 2 && false )
  {
    int classno = classifiers[0].first;
    int classno2 = classifiers[1].first;
    RegGaussianProcess *classifier = classifiers[0].second;
    double yEstimate = classifier->predictKernel ( kernelVector, kernelSelf );
    scores[classno] = yEstimate;
    scores[classno2] = 0; // 1 - yEstimate;
    // cout << "i: " << 0 << ": " << scores[classno] << endl;
    // cout << "i: " << 1 << ": " << scores[classno2] << endl;
  }
  else
  {
    #pragma omp parallel for
    for ( int classifierIndex = 0 ; classifierIndex < (int)classifiers.size(); classifierIndex++ )
    {
      int classno = classifiers[(uint)classifierIndex].first;
      RegGaussianProcess *classifier = classifiers[(uint)classifierIndex].second;
      double yEstimate = classifier->predictKernel ( kernelVector, kernelSelf );
      scores[classno] += yEstimate;
      // cout << "i: " << classifierIndex << ": " << scores[classno] << endl;
    }
  }

  double uncertainty = 0.0;
  if ( computeUncertainty || calibrateProbabilities )
  {
    Vector tmp;
    // tmp = K^{-1} k*
    choleskySolveLargeScale ( choleskyMatrix, kernelVector, tmp );
    uncertainty = kernelSelf - kernelVector.scalarProduct ( tmp );

    /* if ( uncertainty < 0.0 )
         uncertainty = 0.0; */

    if ( calibrateProbabilities )
    {
      /*********************************************************************
       * Calibration of probabilities is based on the following idea:
       *
       * The scores \mu_i (i.e. scores[i]) and the uncertainty r.uncertainty
       * are the mean and the variance of the predictive distribution
       * p(y_i | ...). So we actually do not know the correct value of y_i,
       * and simply taking the maximum score ignores this uncertainty
       * completely! What we might want instead is the probability of each
       * class k being the winner:
       *    p_k = p( k = argmax_i y_i )
       *
       * Calculating this probability for n > 2 is intractable, so we have
       * to resort to the Monte Carlo estimate performed in the following
       * code.
       *
       * An alternative would be to approximate p_k with
       *    p( mu_k >= max_{i != k} y_i ) = prod_{i != k} F_i(mu_k),
       * with F_i being the cumulative distribution function of the normal
       * distribution i.
       *
       * Details: Erik Rodner, "Learning with Few Examples for Visual
       * Recognition Problems", PhD thesis
       *********************************************************************/
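      /* A sketch of the closed-form alternative mentioned above (not what is
       * implemented below), assuming a shared variance for all classes and
       * using the standard normal CDF Phi(x) = 0.5 * erfc(-x / sqrt(2)):
       *
       *   double sigma = sqrt ( uncertainty );
       *   // for each class k: p_k = prod_{i != k} F_i(mu_k)
       *   //                       = prod_{i != k} Phi((mu_k - mu_i) / sigma)
       *   p_k *= 0.5 * erfc ( ( scores[i] - scores[k] ) / ( sigma * sqrt ( 2.0 ) ) );
       */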
      double stddev = sqrt ( uncertainty );
      FullVector calibratedScores ( maxClassNo + 1 );
      calibratedScores.set ( 0.0 );
      for ( uint i = 0 ; i < numSamplesCalibration ; i++ )
      {
        uint cmaxClassno = 0;
        double cmax = - std::numeric_limits<double>::max();
        for ( int classifierIndex = 0 ; classifierIndex < (int)classifiers.size(); classifierIndex++ )
        {
          int classno = classifiers[(uint)classifierIndex].first;
          // draw a sample from the predictive distribution of this class
          double s = randGaussDouble ( stddev ) + scores[classno];
          if ( s > cmax )
          {
            cmax = s;
            cmaxClassno = classno;
          }
        }
        calibratedScores[cmaxClassno]++;
      }
      calibratedScores.normalize();

      // calibrating probabilities should not affect our hard
      // decision for a specific class
      if ( verbose )
      {
        if ( scores.maxElement() != calibratedScores.maxElement() )
          cerr << "Calibration of probabilities affected the hard decision; you should increase calibrated_probabilities_numsamples!" << endl;
      }

      scores = calibratedScores;
    }
  }

  ClassificationResult r ( scores.maxElement(), scores );
  r.uncertainty = uncertainty;

  return r;
}

KCGPRegOneVsAll *KCGPRegOneVsAll::clone ( void ) const
{
  KCGPRegOneVsAll *classifier = new KCGPRegOneVsAll ( *this );
  return classifier;
}

void KCGPRegOneVsAll::clear()
{
  // nothing to clear
}

void KCGPRegOneVsAll::restore ( std::istream & ifs, int type )
{
  ifs.precision ( numeric_limits<double>::digits10 + 1 );
  ifs >> maxClassNo;
  ifs >> computeUncertainty;
  ifs >> calibrateProbabilities;
  if ( calibrateProbabilities || computeUncertainty )
  {
    ifs >> choleskyMatrix;
  }

  int size;
  ifs >> size;
  for ( int i = 0; i < size; i++ )
  {
    int tmp;
    ifs >> tmp;
    RegGaussianProcess *classifier;
    classifier = prototype->clone();
    classifier->restore ( ifs );
    classifiers.push_back ( pair<int, RegGaussianProcess *> ( tmp, classifier ) );
  }

  KernelClassifier::restore ( ifs, type );
}

void KCGPRegOneVsAll::store ( std::ostream & ofs, int type ) const
{
  ofs.precision ( numeric_limits<double>::digits10 + 1 );
  ofs << maxClassNo << endl;
  ofs << computeUncertainty << endl;
  ofs << calibrateProbabilities << endl;
  if ( calibrateProbabilities || computeUncertainty )
  {
    ofs << choleskyMatrix << endl;
  }

  ofs << classifiers.size() << endl;
  for ( uint i = 0; i < classifiers.size(); i++ )
  {
    ofs << classifiers[i].first << endl;
    classifiers[i].second->store ( ofs );
    ofs << endl;
  }

  KernelClassifier::store ( ofs, type );
}
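/* A minimal usage sketch, assuming a kernel matrix wrapped in a KernelData
 * object and integer class labels in a NICE::Vector. The construction of
 * kernelData, the test kernel vector kStar, and the self-similarity kSelf
 * depend on the surrounding framework and are only indicated here:
 *
 *   KCGPRegOneVsAll classifier ( &conf, kernelFunction );
 *   classifier.teach ( kernelData, y );                      // y[i] = class number
 *   ClassificationResult r = classifier.classifyKernel ( kStar, kSelf );
 *   int predictedClass = r.classno;                          // argmax of the scores
 */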