/**
 * @file GPLikelihoodApprox.cpp
 * @brief GP likelihood approximation as a cost function (Implementation)
 * @author Erik Rodner, Alexander Freytag
 * @date 02/09/2012
 */
#include <iostream>

#include <core/algebra/CholeskyRobust.h>
#include <core/vector/Algorithms.h>
#include <core/vector/Eigen.h>
#include <core/basics/Timer.h>
#include <core/algebra/ILSConjugateGradients.h>

#include "kernels/GeneralizedIntersectionKernelFunction.h"
#include "kernels/IntersectionKernelFunction.h"
#include "GPLikelihoodApprox.h"
#include "IKMLinearCombination.h"
#include "GMHIKernel.h"
#include "algebra/LogDetApproxBaiAndGolub.h"

using namespace std;
using namespace NICE;
using namespace OPTIMIZATION;
GPLikelihoodApprox::GPLikelihoodApprox ( const map<int, Vector> & binaryLabels,
                                         ImplicitKernelMatrix *ikm,
                                         IterativeLinearSolver *linsolver,
                                         EigValues *eig,
                                         bool verifyApproximation,
                                         int _nrOfEigenvaluesToConsider )
    : CostFunction ( ikm->getNumParameters() )
{
  this->binaryLabels = binaryLabels;
  this->ikm = ikm;
  this->linsolver = linsolver;
  this->eig = eig;

  // a single binary label vector still corresponds to a two-class problem
  if ( binaryLabels.size() == 1 )
    this->nrOfClasses = 2;
  else
    this->nrOfClasses = binaryLabels.size();

  this->min_nlikelihood = std::numeric_limits<double>::max();
  this->verifyApproximation = verifyApproximation;
  this->nrOfEigenvaluesToConsider = _nrOfEigenvaluesToConsider;

  this->lastAlphas = NULL;
  this->verbose = false;
  this->debug = false;
  this->usePreviousAlphas = true;
}
GPLikelihoodApprox::~GPLikelihoodApprox()
{
  // reset the pointer, but do not delete the content (it is stored elsewhere)
  if ( lastAlphas != NULL )
    lastAlphas = NULL;
}
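
/**
 * Brute-force verification: build the dense intersection kernel matrix,
 * factorize it with a robust Cholesky decomposition, and print the exact
 * logdet and data terms so they can be compared against the approximations
 * computed in evaluate().
 */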
void GPLikelihoodApprox::calculateLikelihood ( double mypara, const FeatureMatrix & f, const std::map< int, NICE::Vector > & yset, double noise, double lambdaMax )
{
  // robust Cholesky routine without additional noise
  CholeskyRobust cr ( true /*verbose*/, 0.0, false /*useCuda*/ );

  Timer t;
  t.start();
  cerr << "VERIFY: Calculating kernel matrix ..." << endl;

  Matrix K;
  IntersectionKernelFunction<double> hik;
  //old version, not needed anymore - we exploit sparsity
  //K = hik.computeKernelMatrix(data_matrix, noise); // = K + sigma^2 I
  K = hik.computeKernelMatrix(f, noise);
  t.stop();

  cerr << "VERIFY: Time used for calculating kernel matrix is: " << t.getLast() << endl;
  cerr << "K is a " << K.rows() << " x " << K.cols() << " matrix" << endl;

  if ( K.containsNaN() )
    fthrow(Exception, "NaN values in the kernel matrix");

  cerr << "VERIFY: Computing likelihood ..." << endl;
  t.start();
  Matrix choleskyMatrix;
  cr.robustChol ( K, choleskyMatrix ); // K = choleskyMatrix^T * choleskyMatrix
  double gt_logdet = yset.size() * cr.getLastLogDet();
  cerr << "chol * chol^T: " << ( choleskyMatrix * choleskyMatrix.transpose() )(0, 0, 4, 4) << endl;

  double gt_dataterm = 0.0;
  for ( map< int, NICE::Vector >::const_iterator i = yset.begin(); i != yset.end(); i++ )
  {
    const Vector & y = i->second;
    Vector gt_alpha;
    choleskySolve ( choleskyMatrix, y, gt_alpha );
    cerr << "cholesky error: " << (K * gt_alpha - y).normL2() << endl;
    gt_dataterm += y.scalarProduct ( gt_alpha );
  }
  //cerr << "linsolve error: " << (tmp - y).normL2() << endl;
  t.stop();
  cerr << "VERIFY: Time used for calculating likelihood: " << t.getLast() << endl;

  cerr << "Upper left block of K: " << K(0, 0, 4, 4) << endl;
  cerr << "frob norm: gt:" << K.frobeniusNorm() << endl;

  /*try {
    Vector *eigenv = eigenvalues ( K );
    cerr << "lambda_max: gt:" << eigenv->Max() << " est:" << lambdaMax << endl;
    delete eigenv;
  } catch (...) {
    cerr << "NICE eigenvalues function failed!" << endl;
  }*/

  double gt_nlikelihood = gt_logdet + gt_dataterm;
  cerr << "OPTGT: " << mypara << " " << gt_nlikelihood << " " << gt_logdet << " " << gt_dataterm << endl;
}
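
/**
 * Solve (K + sigma^2 I) alpha_c = y_c with the iterative linear solver for
 * every binary task c, using the current hyperparameters of the implicit
 * kernel matrix; the resulting alpha vectors are stored in min_alphas.
 */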
void GPLikelihoodApprox::computeAlphaDirect(const OPTIMIZATION::matrix_type & x)
{
  Timer t;

  // NICE::Vector diagonalElements;
  // ikm->getDiagonalElements ( diagonalElements );
  // set simple Jacobi pre-conditioning
  // ILSConjugateGradients *linsolver_cg = dynamic_cast<ILSConjugateGradients *> ( linsolver );
  // //TODO why do we need this?
  // if ( linsolver_cg != NULL )
  //   linsolver_cg->setJacobiPreconditioner ( diagonalElements );

  // all alpha vectors will be stored!
  std::map<int, NICE::Vector> alphas;

  // this has to be done m times for the multi-class case
  if ( verbose )
    std::cerr << "run ILS for every binary label. binaryLabels.size(): " << binaryLabels.size() << std::endl;

  for ( std::map<int, NICE::Vector>::const_iterator j = binaryLabels.begin(); j != binaryLabels.end(); j++ )
  {
    // (b) y^T (K+sI)^{-1} y
    int classCnt = j->first;
    if ( verbose )
    {
      std::cerr << "Solving linear equation system for class " << classCnt << " ..." << std::endl;
      std::cerr << "Size of the kernel matrix " << ikm->rows() << std::endl;
      std::cerr << "binary label: " << j->second << std::endl;
    }

    // About finding a good initial solution: see the detailed note in
    // GPLikelihoodApprox::evaluate() below (reuse previous alphas, or start
    // from y scaled by 1/lambda_max).
    Vector alpha;
    if ( usePreviousAlphas && (lastAlphas != NULL) )
    {
      alpha = (*lastAlphas)[classCnt];
    }
    else
    {
      //TODO hand over the eigenmax
      alpha = binaryLabels[classCnt]; //* (1.0 / eigenmax[0]) );
    }

    NICE::Vector initialAlpha;
    if ( verbose )
      initialAlpha = alpha;

    if ( verbose )
      std::cerr << "Using the standard solver ..." << std::endl;

    t.start();
    linsolver->solveLin ( *ikm, binaryLabels[classCnt], alpha );
    t.stop();

    alphas.insert( std::pair<int, NICE::Vector> ( classCnt, alpha ) );
  }

  // save the parameter value and alpha vectors
  ikm->getParameters ( min_parameter );
  this->min_alphas = alphas;
}
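
/**
 * Objective evaluated by the hyperparameter optimizer: an approximation of the
 * negative GP log-likelihood. Up to additive constants and a global factor of
 * 1/2, this is
 *   m * logdet( K + sigma^2 I ) + \sum_c y_c^T ( K + sigma^2 I )^{-1} y_c ,
 * where m is the number of binary tasks. The logdet term is approximated via
 * eigenvalue bounds (see below); the data terms are exact up to the tolerance
 * of the iterative linear solver. Results are cached using a hash of the
 * parameter vector.
 */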
double GPLikelihoodApprox::evaluate(const OPTIMIZATION::matrix_type & x)
{
  Vector xv;
  xv.resize ( x.rows() );
  for ( uint i = 0; i < x.rows(); i++ )
    xv[i] = x(i, 0);

  // check whether we have been here before
  unsigned long hashValue = xv.getHashValue();
  if ( verbose )
    std::cerr << "Current parameter: " << xv << " (hash value is " << hashValue << ")" << std::endl;

  map<unsigned long, double>::const_iterator k = alreadyVisited.find(hashValue);
  if ( k != alreadyVisited.end() )
  {
    if ( verbose )
      std::cerr << "Using cached value: " << k->second << std::endl;
    // already computed, simply return the cached value
    return k->second;
  }

  // set the parameter value and check the lower and upper bounds of pf
  if ( ikm->outOfBounds(xv) )
  {
    if ( verbose )
      std::cerr << "Parameters are out of bounds" << std::endl;
    return numeric_limits<double>::max();
  }
  ikm->setParameters ( xv );
  if ( verbose )
    std::cerr << "setParameters xv: " << xv << std::endl;

  // --- calculate the likelihood
  // (a) logdet(K + sI)
  Timer t;
  if ( verbose )
    std::cerr << "Calculating eigendecomposition " << ikm->rows() << " x " << ikm->cols() << std::endl;
  t.start();
  Vector eigenmax;
  Matrix eigenmaxvectors;

  int rank = nrOfEigenvaluesToConsider;

  /** calculate the biggest eigenvalue */
  // We use an Arnoldi solver and not a specialized one.
  // We might also think about a coordinate descent solver for the Arnoldi iterations;
  // however, the current implementation converges very quickly.
  //old version: just use the first eigenvalue

  //NOTE
  // in theory, we already have these values at hand, since we computed them in FMKGPHypOpt.
  // Think about whether to pass them as input to this function or not.
  eig->getEigenvalues( *ikm, eigenmax, eigenmaxvectors, rank );
  if ( verbose )
    std::cerr << "eigenmax: " << eigenmax << std::endl;

  t.stop();
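
  // eigenmax now holds (approximations of) the 'rank' largest eigenvalues; only
  // eigenmax[0] (as a spectral upper bound) and the sum of the squared entries
  // (a lower bound on the squared Frobenius norm) are used below.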
  SparseVector binaryDataterms;
  Vector diagonalElements;
  ikm->getDiagonalElements ( diagonalElements );

  // set simple Jacobi pre-conditioning
  ILSConjugateGradients *linsolver_cg = dynamic_cast<ILSConjugateGradients *> ( linsolver );
  //TODO why do we need this?
  if ( linsolver_cg != NULL )
    linsolver_cg->setJacobiPreconditioner ( diagonalElements );

  // all alpha vectors will be stored!
  map<int, Vector> alphas;

  // this has to be done m times for the multi-class case
  if ( verbose )
    std::cerr << "run ILS for every binary label. binaryLabels.size(): " << binaryLabels.size() << std::endl;

  for ( map<int, Vector>::const_iterator j = binaryLabels.begin(); j != binaryLabels.end(); j++ )
  {
    // (b) y^T (K+sI)^{-1} y
    int classCnt = j->first;
    if ( verbose )
    {
      std::cerr << "Solving linear equation system for class " << classCnt << " ..." << std::endl;
      std::cerr << "Size of the kernel matrix " << ikm->rows() << std::endl;
    }
    /** About finding a good initial solution
     * K~ = K + sigma^2 I
     *
     * (0) if we have already estimated alpha for a previous hyperparameter,
     *     we should use it as an initial estimate. According to our quick
     *     tests this really helps!
     * (1) K~ \approx lambda_max v v^T
     *     \lambda_max v v^T * alpha = y  | multiply with v^T from the left
     *     => \lambda_max (v^T alpha) = v^T y
     *     => alpha = y / lambda_max could be a good initial start
     *     If we plug everything back into the first equation, this gives us
     *     v = y .... which is somehow a weird assumption (cf. Kernel PCA)
     *     This reduces the number of iterations by 5 or 8
     */
    Vector alpha;
    if ( usePreviousAlphas && (lastAlphas != NULL) )
    {
      alpha = (*lastAlphas)[classCnt];
    }
    else
    {
      alpha = binaryLabels[classCnt] * (1.0 / eigenmax[0]);
    }

    Vector initialAlpha;
    if ( verbose )
      initialAlpha = alpha;

    if ( verbose )
      cerr << "Using the standard solver ..." << endl;

    t.start();
    linsolver->solveLin ( *ikm, binaryLabels[classCnt], alpha );
    t.stop();
    //TODO this is only important for the incremental learning stuff
    // if ( verbose )
    // {
    //   double initialAlphaNorm ( initialAlpha.normL1() );
    //   // compute the difference
    //   initialAlpha -= alpha;
    //   // take the absolute values of the differences
    //   initialAlpha.absInplace();
    //   // and compute a final score using a suitable norm
    //   // double difference( initialAlpha.normInf() );
    //   double difference( initialAlpha.normL1() );
    //   std::cerr << "debug -- last entry of new alpha: " << abs(alpha[alpha.size() - 1]) << std::endl;
    //   std::cerr << "debug -- difference using L1 norm: " << difference << std::endl;
    //   std::cerr << "debug -- relative difference using L1 norm: " << difference / initialAlphaNorm << std::endl;
    // }

    if ( verbose )
      std::cerr << "Time used for solving (K + sigma^2 I)^{-1} y: " << t.getLast() << std::endl;

    // this term is not an approximation at all
    double dataterm = binaryLabels[classCnt].scalarProduct(alpha);
    binaryDataterms[classCnt] = dataterm;
    alphas[classCnt] = alpha;
  }
  // approximation stuff
  if ( verbose )
    cerr << "Approximating logdet(K) ..." << endl;

  t.start();
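
  /* Sketch of the underlying bound (a hedged summary of Bai & Golub, 1996):
   * from the first two spectral moments mu_1 = trace(K~) and mu_2 = ||K~||_F^2,
   * together with an upper bound on the largest eigenvalue, Gauss-type
   * quadrature yields computable lower and upper bounds on logdet(K~). Here,
   * mu_2 is itself only under-estimated by the squared top-'rank' eigenvalues,
   * so the resulting value is an approximation rather than a strict bound.
   */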
  LogDetApproxBaiAndGolub la;
  la.setVerbose(this->verbose);

  //NOTE: this is already the squared Frobenius norm that we are looking for
  double frobNormSquared(0.0);

  // ------------- LOWER BOUND, THAT IS USED --------------------
  // frobNormSquared ~ \sum_i \lambda_i^2  <-- LOWER BOUND
  for ( int idx = 0; idx < rank; idx++ )
  {
    frobNormSquared += (eigenmax[idx] * eigenmax[idx]);
  }

  if ( verbose )
    cerr << " frob norm squared: est:" << frobNormSquared << endl;
  if ( verbose )
    std::cerr << "trace: " << diagonalElements.Sum() << std::endl;

  double logdet = la.getLogDetApproximationUpperBound( diagonalElements.Sum(), /* trace = n only for non-transformed features */
                                                       frobNormSquared,        /* rough approximation of the squared Frobenius norm */
                                                       eigenmax[0],            /* upper bound for the eigenvalues */
                                                       ikm->rows()             /* = n */
                                                     );
  t.stop();
  if ( verbose )
    cerr << "Time used for approximating logdet(K): " << t.getLast() << endl;

  // (c) adding the two terms
  double nlikelihood = nrOfClasses * logdet;
  double dataterm = binaryDataterms.sum();
  nlikelihood += dataterm;

  if ( verbose )
    cerr << "OPT: " << xv << " " << nlikelihood << " " << logdet << " " << dataterm << endl;

  if ( nlikelihood < min_nlikelihood )
  {
    min_nlikelihood = nlikelihood;
    ikm->getParameters ( min_parameter );
    min_alphas = alphas;
  }

  alreadyVisited.insert ( pair<unsigned long, double> ( hashValue, nlikelihood ) );
  return nlikelihood;
}
void GPLikelihoodApprox::setParameterLowerBound(const double & _parameterLowerBound)
{
  parameterLowerBound = _parameterLowerBound;
}

void GPLikelihoodApprox::setParameterUpperBound(const double & _parameterUpperBound)
{
  parameterUpperBound = _parameterUpperBound;
}

void GPLikelihoodApprox::setLastAlphas(std::map<int, NICE::Vector> * _lastAlphas)
{
  lastAlphas = _lastAlphas;
}

void GPLikelihoodApprox::setBinaryLabels(const std::map<int, Vector> & _binaryLabels)
{
  binaryLabels = _binaryLabels;
}

void GPLikelihoodApprox::setUsePreviousAlphas( const bool & _usePreviousAlphas )
{
  this->usePreviousAlphas = _usePreviousAlphas;
}

void GPLikelihoodApprox::setVerbose( const bool & _verbose )
{
  this->verbose = _verbose;
}

void GPLikelihoodApprox::setDebug( const bool & _debug )
{
  this->debug = _debug;
}
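
/* Minimal usage sketch (hedged; the construction of the implicit kernel matrix
 * and the eigenvalue solver happens elsewhere in the library, so the concrete
 * types below are assumptions based on the includes above, not verified call
 * sites):
 *
 *   ILSConjugateGradients linsolver;
 *   EigValues *eig = ...;                            // e.g. an Arnoldi-based implementation
 *   ImplicitKernelMatrix *ikm = ...;                 // e.g. a GMHIKernel built from the training features
 *   std::map<int, NICE::Vector> binaryLabels = ...;  // one +1/-1 label vector per binary task
 *
 *   GPLikelihoodApprox nlikelihood ( binaryLabels, ikm, &linsolver, eig, false, 1 );
 *   OPTIMIZATION::matrix_type x ( ikm->getNumParameters(), 1 );  // current hyperparameters
 *   double value = nlikelihood.evaluate ( x );       // approximate negative log-likelihood
 */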