/**
* @file GPLikelihoodApprox.cpp
* @brief GP likelihood approximation as a cost function (Implementation)
* @author Erik Rodner, Alexander Freytag
* @date 02/09/2012
*/

#include <iostream>
#include <limits>

#include <core/algebra/CholeskyRobust.h>
#include <core/vector/Algorithms.h>
#include <core/vector/Eigen.h>
#include <core/basics/Timer.h>
#include <core/algebra/ILSConjugateGradients.h>

#include "kernels/GeneralizedIntersectionKernelFunction.h"
#include "kernels/IntersectionKernelFunction.h"

#include "GPLikelihoodApprox.h"
#include "IKMLinearCombination.h"
#include "GMHIKernel.h"
#include "algebra/LogDetApproxBaiAndGolub.h"

using namespace std;
using namespace NICE;
using namespace OPTIMIZATION;
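
/** Sets up the likelihood approximation as a cost function for hyperparameter
 * optimization. Note that the implicit kernel matrix, the iterative linear
 * solver, and the eigenvalue solver are only referenced here, not owned by
 * this object (see the destructor). */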
GPLikelihoodApprox::GPLikelihoodApprox( const map<int, Vector> & binaryLabels,
                                        ImplicitKernelMatrix *ikm,
                                        IterativeLinearSolver *linsolver,
                                        EigValues *eig,
                                        bool verifyApproximation,
                                        int _nrOfEigenvaluesToConsider )
    : CostFunction( ikm->getNumParameters() )
{
  this->binaryLabels = binaryLabels;
  this->ikm = ikm;
  this->linsolver = linsolver;
  this->eig = eig;

  // a single binary label vector still corresponds to a two-class problem
  if ( binaryLabels.size() == 1 )
    this->nrOfClasses = 2;
  else
    this->nrOfClasses = binaryLabels.size();

  this->min_nlikelihood = std::numeric_limits<double>::max();
  this->verifyApproximation = verifyApproximation;
  this->nrOfEigenvaluesToConsider = _nrOfEigenvaluesToConsider;

  this->lastAlphas = NULL;
  this->verbose = false;
  this->debug = false;
  this->usePreviousAlphas = true;
}
GPLikelihoodApprox::~GPLikelihoodApprox()
{
  // we do not own lastAlphas (its content is stored elsewhere),
  // so we only reset the pointer and never delete what it points to
  if ( lastAlphas != NULL )
    lastAlphas = NULL;
}
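
/** Ground-truth check of the approximation: computes the kernel matrix
 * explicitly with the intersection kernel, factorizes it with a robust
 * Cholesky decomposition, and prints the exact negative log-likelihood
 * terms (prefixed with OPTGT) for comparison with the approximated ones.
 * Only feasible for small data sets, since the dense Cholesky step is
 * cubic in the number of examples. */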
void GPLikelihoodApprox::calculateLikelihood ( double mypara, const FeatureMatrix & f, const std::map< int, NICE::Vector > & yset, double noise, double lambdaMax )
{
  // robust Cholesky routine without additional regularization noise
  CholeskyRobust cr ( true /*verbose*/, 0.0, false /*useCuda*/ );

  Timer t;
  t.start();
  cerr << "VERIFY: Calculating kernel matrix ..." << endl;

  Matrix K;
  IntersectionKernelFunction<double> hik;
  // old version, no longer needed - we exploit sparsity now
  // K = hik.computeKernelMatrix(data_matrix, noise); // = K + sigma^2 I
  K = hik.computeKernelMatrix(f, noise);
  t.stop();
  cerr << "VERIFY: Time used for calculating kernel matrix is: " << t.getLast() << endl;

  cerr << "K is a " << K.rows() << " x " << K.cols() << " matrix" << endl;
  if ( K.containsNaN() )
    fthrow(Exception, "NaN values in the kernel matrix");

  cerr << "VERIFY: Computing likelihood ..." << endl;
  t.start();
  Matrix choleskyMatrix;
  cr.robustChol ( K, choleskyMatrix ); // K = choleskyMatrix^T * choleskyMatrix
  double gt_logdet = (yset.size()) * cr.getLastLogDet();
  cerr << "chol * chol^T: " << ( choleskyMatrix * choleskyMatrix.transpose() )(0,0,4,4) << endl;

  double gt_dataterm = 0.0;
  for ( map< int, NICE::Vector >::const_iterator i = yset.begin(); i != yset.end(); i++ )
  {
    const Vector & y = i->second;
    Vector gt_alpha;
    choleskySolve ( choleskyMatrix, y, gt_alpha );
    cerr << "cholesky error: " << (K*gt_alpha - y).normL2() << endl;
    gt_dataterm += y.scalarProduct ( gt_alpha );
  }
  //cerr << "linsolve error: " << (tmp - y).normL2() << endl;
  t.stop();
  cerr << "VERIFY: Time used for calculating likelihood: " << t.getLast() << endl;

  cerr << "Something of K: " << K(0, 0, 4, 4) << endl;
  cerr << "frob norm: gt:" << K.frobeniusNorm() << endl;

  /*try {
    Vector *eigenv = eigenvalues ( K );
    cerr << "lambda_max: gt:" << eigenv->Max() << " est:" << lambdaMax << endl;
    delete eigenv;
  } catch (...) {
    cerr << "NICE eigenvalues function failed!" << endl;
  }*/

  double gt_nlikelihood = gt_logdet + gt_dataterm;
  cerr << "OPTGT: " << mypara << " " << gt_nlikelihood << " " << gt_logdet << " " << gt_dataterm << endl;
}
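
/** Evaluates the negative approximate GP log-likelihood for a parameter vector x.
 * The log-determinant term is approximated from the leading eigenvalues of the
 * implicit kernel matrix (Bai & Golub bounds), while every data term
 * y^T (K + sigma^2 I)^{-1} y is computed with the iterative linear solver.
 * Results are cached, keyed by a hash of the parameter vector. */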
double GPLikelihoodApprox::evaluate(const OPTIMIZATION::matrix_type & x)
{
  Vector xv;
  xv.resize ( x.rows() );
  for ( uint i = 0 ; i < x.rows(); i++ )
    xv[i] = x(i,0);

  // check whether we have been here before
  unsigned long hashValue = xv.getHashValue();
  if ( verbose )
    std::cerr << "Current parameter: " << xv << " (hash value is " << hashValue << ")" << std::endl;

  map<unsigned long, double>::const_iterator k = alreadyVisited.find(hashValue);
  if ( k != alreadyVisited.end() )
  {
    if ( verbose )
      std::cerr << "Using cached value: " << k->second << std::endl;
    // already computed, simply return the cached value
    return k->second;
  }

  // set the parameter value and check the lower and upper bounds of pf
  if ( ikm->outOfBounds(xv) )
  {
    if ( verbose )
      std::cerr << "Parameters are out of bounds" << std::endl;
    return numeric_limits<double>::max();
  }

  ikm->setParameters ( xv );
  if ( verbose )
    std::cerr << "setParameters xv: " << xv << std::endl;
  // --- calculate the likelihood
  // (a) logdet(K + sI)
  Timer t;
  if ( verbose )
    std::cerr << "Calculating eigendecomposition " << ikm->rows() << " x " << ikm->cols() << std::endl;
  t.start();

  Vector eigenmax;
  Matrix eigenmaxvectors;
  int rank = nrOfEigenvaluesToConsider;

  /** calculate the largest eigenvalues */
  // We use an Arnoldi solver and not a specialized one.
  // We might also think about a coordinate-descent solver for the Arnoldi iterations;
  // however, the current implementation converges very quickly.
  // old version: just use the first eigenvalue

  // NOTE: in theory, we already have these values at hand, since we computed them
  // in FMKGPHypOpt. Think about whether to pass them as input to this function or not.
  eig->getEigenvalues( *ikm, eigenmax, eigenmaxvectors, rank );
  if ( verbose )
    std::cerr << "eigenmax: " << eigenmax << std::endl;
  t.stop();

  SparseVector binaryDataterms;
  Vector diagonalElements;
  ikm->getDiagonalElements ( diagonalElements );

  // set a simple Jacobi preconditioner: rescaling the system with the inverse
  // of the kernel diagonal typically speeds up the convergence of conjugate gradients
  ILSConjugateGradients *linsolver_cg = dynamic_cast<ILSConjugateGradients *> ( linsolver );
  if ( linsolver_cg != NULL )
    linsolver_cg->setJacobiPreconditioner ( diagonalElements );
  // all alpha vectors will be stored, so that the solutions belonging to the
  // best parameter setting (min_alphas) survive the optimization
  map<int, Vector> alphas;

  // this has to be done m times for the multi-class case
  if ( verbose )
    std::cerr << "run ILS for every binary label. binaryLabels.size(): " << binaryLabels.size() << std::endl;

  for ( map<int, Vector>::const_iterator j = binaryLabels.begin(); j != binaryLabels.end() ; j++)
  {
    // (b) y^T (K+sI)^{-1} y
    int classCnt = j->first;
    if ( verbose )
    {
      std::cerr << "Solving linear equation system for class " << classCnt << " ..." << std::endl;
      std::cerr << "Size of the kernel matrix " << ikm->rows() << std::endl;
    }

    /** About finding a good initial solution
     * K~ = K + sigma^2 I
     *
     * (0) if we have already estimated alpha for a previous hyperparameter,
     *     we should use it as an initial estimate. According to my quick
     *     tests this really helps!
     * (1) K~ \approx lambda_max v v^T
     *     \lambda_max v v^T * alpha = y | multiply with v^T from the left
     *     => \lambda_max v^T alpha = v^T y
     *     => alpha = y / lambda_max could be a good initial start
     *     If we put everything back into the first equation, this gives us
     *     v = y ... which is a somewhat weird assumption (cf. Kernel PCA)
     *     This reduces the number of iterations by 5 or 8
     */
    Vector alpha;
    if ( usePreviousAlphas && (lastAlphas != NULL) )
    {
      alpha = (*lastAlphas)[classCnt];
    }
    else
    {
      alpha = (binaryLabels[classCnt] * (1.0 / eigenmax[0]) );
    }

    Vector initialAlpha;
    if ( verbose )
    {
      initialAlpha = alpha;
      cerr << "Using the standard solver ..." << endl;
    }

    t.start();
    linsolver->solveLin ( *ikm, binaryLabels[classCnt], alpha );
    t.stop();

    // TODO: this is only important for the incremental learning stuff
    //     if ( verbose )
    //     {
    //       double initialAlphaNorm ( initialAlpha.normL1() );
    //       // compute the difference
    //       initialAlpha -= alpha;
    //       // take the abs of the differences
    //       initialAlpha.absInplace();
    //       // and compute a final score using a suitable norm
    //       // double difference( initialAlpha.normInf() );
    //       double difference( initialAlpha.normL1() );
    //       std::cerr << "debug -- last entry of new alpha: " << abs(alpha[alpha.size() -1 ]) << std::endl;
    //       std::cerr << "debug -- difference using the L1 norm: " << difference << std::endl;
    //       std::cerr << "debug -- relative difference using the L1 norm: " << difference / initialAlphaNorm << std::endl;
    //     }

    if ( verbose )
      std::cerr << "Time used for solving (K + sigma^2 I)^{-1} y: " << t.getLast() << std::endl;

    // this term is not approximated at all
    double dataterm = binaryLabels[classCnt].scalarProduct(alpha);
    binaryDataterms[classCnt] = dataterm;
    alphas[classCnt] = alpha;
  }
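
  /* The Bai & Golub bounds estimate logdet(A) of a symmetric positive definite
   * matrix from three cheap quantities: the trace tr(A), the squared Frobenius
   * norm ||A||_F^2 = sum_i lambda_i^2, and a bound on the largest eigenvalue.
   * Here ||A||_F^2 itself is lower-bounded using the leading eigenvalues computed above. */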
  // approximation stuff
  if ( verbose )
    cerr << "Approximating logdet(K) ..." << endl;
  t.start();
  LogDetApproxBaiAndGolub la;
  la.setVerbose(this->verbose);

  // NOTE: this is already the squared Frobenius norm that we are looking for
  double frobNormSquared(0.0);

  // ------------- LOWER BOUND, THAT IS USED --------------------
  // frobNormSquared ~ \sum \lambda_i^2 <-- LOWER BOUND
  for (int idx = 0; idx < rank; idx++)
  {
    frobNormSquared += (eigenmax[idx] * eigenmax[idx]);
  }

  if ( verbose )
  {
    cerr << " frob norm squared: est:" << frobNormSquared << endl;
    std::cerr << "trace: " << diagonalElements.Sum() << std::endl;
  }

  double logdet = la.getLogDetApproximationUpperBound( diagonalElements.Sum(), /* trace = n only for non-transformed features */
                                                       frobNormSquared, /* use a rough approximation of the Frobenius norm */
                                                       eigenmax[0], /* upper bound for the eigenvalues */
                                                       ikm->rows() /* = n */
                                                     );
  t.stop();
  if ( verbose )
    cerr << "Time used for approximating logdet(K): " << t.getLast() << endl;

  // (c) adding the two terms:
  // nlikelihood = m * logdet(K + sigma^2 I) + sum_c y_c^T (K + sigma^2 I)^{-1} y_c
  double nlikelihood = nrOfClasses * logdet;
  double dataterm = binaryDataterms.sum();
  nlikelihood += dataterm;

  if ( verbose )
    cerr << "OPT: " << xv << " " << nlikelihood << " " << logdet << " " << dataterm << endl;

  // keep track of the best parameter setting seen so far
  if ( nlikelihood < min_nlikelihood )
  {
    min_nlikelihood = nlikelihood;
    ikm->getParameters ( min_parameter );
    min_alphas = alphas;
  }

  // cache the result; the key type has to match the map's unsigned long key
  alreadyVisited.insert ( pair<unsigned long, double> ( hashValue, nlikelihood ) );
  return nlikelihood;
}
void GPLikelihoodApprox::setParameterLowerBound(const double & _parameterLowerBound)
{
  parameterLowerBound = _parameterLowerBound;
}

void GPLikelihoodApprox::setParameterUpperBound(const double & _parameterUpperBound)
{
  parameterUpperBound = _parameterUpperBound;
}

// note: the object does not take ownership of _lastAlphas; the caller
// remains responsible for the lifetime of the pointed-to map
void GPLikelihoodApprox::setLastAlphas(std::map<int, NICE::Vector> * _lastAlphas)
{
  lastAlphas = _lastAlphas;
}

void GPLikelihoodApprox::setBinaryLabels(const std::map<int, Vector> & _binaryLabels)
{
  binaryLabels = _binaryLabels;
}

void GPLikelihoodApprox::setUsePreviousAlphas( const bool & _usePreviousAlphas )
{
  this->usePreviousAlphas = _usePreviousAlphas;
}

void GPLikelihoodApprox::setVerbose( const bool & _verbose )
{
  this->verbose = _verbose;
}

void GPLikelihoodApprox::setDebug( const bool & _debug )
{
  this->debug = _debug;
}