/**
* @file GPLikelihoodApprox.cpp
* @brief GP likelihood approximation as a cost function (Implementation)
* @author Erik Rodner, Alexander Freytag
* @date 02/09/2012
*/

// STL includes
#include <iostream>

// NICE-core includes
#include <core/algebra/CholeskyRobust.h>
#include <core/algebra/ILSConjugateGradients.h>
//
#include <core/basics/Timer.h>
//
#include <core/vector/Algorithms.h>
#include <core/vector/Eigen.h>

// stuff used for verification only
#include "kernels/GeneralizedIntersectionKernelFunction.h"
#include "kernels/IntersectionKernelFunction.h"

// gp-hik-core includes
#include "gp-hik-core/GPLikelihoodApprox.h"
#include "gp-hik-core/IKMLinearCombination.h"
#include "gp-hik-core/GMHIKernel.h"
#include "gp-hik-core/algebra/LogDetApproxBaiAndGolub.h"

using namespace std;
using namespace NICE;
using namespace OPTIMIZATION;
GPLikelihoodApprox::GPLikelihoodApprox( const std::map<uint, NICE::Vector> & _binaryLabels,
                                        ImplicitKernelMatrix *_ikm,
                                        IterativeLinearSolver *_linsolver,
                                        EigValues *_eig,
                                        bool _verifyApproximation,
                                        int _nrOfEigenvaluesToConsider
                                      )
      : CostFunction( _ikm->getNumParameters() )
{
  this->binaryLabels = _binaryLabels;
  this->ikm = _ikm;
  this->linsolver = _linsolver;
  this->eig = _eig;

  // a single binary task still amounts to a two-class problem
  if ( _binaryLabels.size() == 1 )
    this->nrOfClasses = 2;
  else
    this->nrOfClasses = _binaryLabels.size();

  this->min_nlikelihood = std::numeric_limits<double>::max();
  this->verifyApproximation = _verifyApproximation;
  this->nrOfEigenvaluesToConsider = _nrOfEigenvaluesToConsider;

  this->verbose = false;
  this->debug = false;

  this->initialAlphaGuess = NULL;
}
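
// A minimal usage sketch (illustrative only): GPLikelihoodApprox derives from
// OPTIMIZATION::CostFunction, so evaluate() can be minimized by any optimizer
// of that framework. The eigensolver EVArnoldi and the kernel object `ikm`
// below are assumptions made for the sake of the example, not prescribed by
// this file:
//
//   ILSConjugateGradients linsolver;
//   EVArnoldi eig;
//   GPLikelihoodApprox nll ( binaryLabels, &ikm, &linsolver, &eig,
//                            false /*_verifyApproximation*/,
//                            1 /*_nrOfEigenvaluesToConsider*/ );
//   OPTIMIZATION::matrix_type x ( ikm.getNumParameters(), 1 );
//   double negLogLikelihood = nll.evaluate ( x );
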
GPLikelihoodApprox::~GPLikelihoodApprox()
{
  // we do not have to delete the memory here, since it is handled externally...
  // TODO however, if we ever copy the whole vector, then we also have to delete it here accordingly! Check this!
  if ( this->initialAlphaGuess != NULL )
    this->initialAlphaGuess = NULL;
}

const std::map<uint, Vector> & GPLikelihoodApprox::getBestAlphas () const
{
  if ( this->min_alphas.size() > 0 )
  {
    // we already computed a locally optimal solution
    return this->min_alphas;
  }
  else if ( this->initialAlphaGuess != NULL )
  {
    std::cerr << "no known alpha vectors so far, taking the initial guess instead" << std::endl;
    // computation not started yet, but an initial guess was given, so use this one
    return *(this->initialAlphaGuess);
  }

  // nothing known, min_alphas will be empty
  return this->min_alphas;
}
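
// Ground-truth verification path: builds the full HIK kernel matrix
// explicitly, computes the exact logdet via a robust Cholesky factorization
// and the exact data terms y^T K~^{-1} y, and prints them (prefixed "OPTGT")
// so the approximation computed in evaluate() can be checked against them.
// This is only feasible for small training sets, since it requires the dense
// n x n kernel matrix.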
void GPLikelihoodApprox::calculateLikelihood ( double _mypara,
                                               const FeatureMatrix & _f,
                                               const std::map< uint, NICE::Vector > & _yset,
                                               double _noise,
                                               double lambdaMax
                                             )
{
  // robust Cholesky routine without additional regularization noise
  CholeskyRobust cr ( true /*verbose*/, 0.0, false /*useCuda*/ );

  Timer t;
  t.start();
  cerr << "VERIFY: Calculating kernel matrix ..." << endl;

  Matrix K;
  IntersectionKernelFunction<double> hik;
  // old version, not needed anymore - we exploit sparsity
  // K = hik.computeKernelMatrix(data_matrix, _noise); // = K + sigma^2 I
  K = hik.computeKernelMatrix(_f, _noise);
  t.stop();
  cerr << "VERIFY: Time used for calculating kernel matrix is: " << t.getLast() << endl;

  cerr << "K is a " << K.rows() << " x " << K.cols() << " matrix" << endl;

  if ( K.containsNaN() )
    fthrow(Exception, "NaN values in the kernel matrix");

  cerr << "VERIFY: Computing likelihood ..." << endl;
  t.start();
  Matrix choleskyMatrix;
  cr.robustChol ( K, choleskyMatrix ); // K = choleskyMatrix^T * choleskyMatrix
  double gt_logdet = (_yset.size()) * cr.getLastLogDet();
  cerr << "chol * chol^T: " << ( choleskyMatrix * choleskyMatrix.transpose() )(0,0,4,4) << endl;

  double gt_dataterm = 0.0;
  for ( std::map< uint, NICE::Vector >::const_iterator i = _yset.begin(); i != _yset.end(); i++ )
  {
    const NICE::Vector & y = i->second;
    Vector gt_alpha;
    choleskySolve ( choleskyMatrix, y, gt_alpha );
    cerr << "cholesky error: " << (K*gt_alpha - y).normL2() << endl;
    gt_dataterm += y.scalarProduct ( gt_alpha );
  }
  //cerr << "linsolve error: " << (tmp - y).normL2() << endl;
  t.stop();
  cerr << "VERIFY: Time used for calculating likelihood: " << t.getLast() << endl;

  cerr << "Something of K: " << K(0, 0, 4, 4) << endl;
  cerr << "frob norm: gt:" << K.frobeniusNorm() << endl;

  double gt_nlikelihood = gt_logdet + gt_dataterm;
  cerr << "OPTGT: " << _mypara << " " << gt_nlikelihood << " " << gt_logdet << " " << gt_dataterm << endl;
}
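
// Solves (K + sigma^2 I) alpha_c = y_c once for every binary task without
// evaluating the likelihood itself; useful when only the alpha vectors are
// needed, e.g., after the best hyperparameter has already been selected.
// Note that _x is never read here and only the largest eigenvalue
// _eigenValues[0] is used (for the initial guess); the kernel parameters are
// apparently expected to have been set on ikm beforehand.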
void GPLikelihoodApprox::computeAlphaDirect(const OPTIMIZATION::matrix_type & _x,
                                            const NICE::Vector & _eigenValues
                                           )
{
  Timer t;

  NICE::Vector diagonalElements;
  ikm->getDiagonalElements ( diagonalElements );

  // set simple Jacobi preconditioning
  ILSConjugateGradients *linsolver_cg = dynamic_cast<ILSConjugateGradients *> ( linsolver );
  if ( linsolver_cg != NULL )
    linsolver_cg->setJacobiPreconditioner ( diagonalElements );

  // all alpha vectors will be stored!
  std::map<uint, NICE::Vector> alphas;

  // this has to be done m times for the multi-class case
  if ( this->verbose )
    std::cerr << "run ILS for every binary label. binaryLabels.size(): " << binaryLabels.size() << std::endl;
  for ( std::map<uint, NICE::Vector>::const_iterator j = binaryLabels.begin(); j != binaryLabels.end() ; j++)
  {
    // (b) y^T (K+sI)^{-1} y
    uint classCnt = j->first;
    if ( this->verbose )
    {
      std::cerr << "Solving linear equation system for class " << classCnt << " ..." << std::endl;
    }

    /** About finding a good initial solution
     * K~ = K + sigma^2 I
     *
     * (0) If we have already estimated alpha for a previous hyperparameter,
     *     we should use it as an initial estimate. According to our quick
     *     tests this really helps!
     * (1) K~ \approx lambda_max v v^T
     *     lambda_max v v^T alpha = y   | multiply with v^T from the left
     *     => lambda_max v^T alpha = v^T y
     *     => alpha = y / lambda_max could be a good initial start
     *     If we put everything back into the first equation, this gives us
     *     v = y ... which is somewhat of a weird assumption (cf. kernel PCA).
     *     This reduces the number of iterations by about 5 to 8.
     */
    NICE::Vector alpha;
    alpha = (binaryLabels[classCnt] * (1.0 / _eigenValues[0]) );

    if ( verbose )
      std::cerr << "Using the standard solver ..." << std::endl;

    t.start();
    linsolver->solveLin ( *ikm, binaryLabels[classCnt], alpha );
    t.stop();

    alphas.insert( std::pair<uint, NICE::Vector> ( classCnt, alpha) );
  }

  // save the parameter value and alpha vectors
  ikm->getParameters ( min_parameter );
  this->min_alphas = alphas;
}
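
// Evaluates the cost that is minimized during hyperparameter optimization:
// up to constants and the usual factor 1/2 (which do not change the
// minimizer), this is the negative log marginal likelihood of the GP model,
//   nlikelihood = m * logdet(K~) + \sum_c y_c^T K~^{-1} y_c ,   K~ = K + sigma^2 I,
// summed over all m binary tasks. The logdet term is identical for every task
// (hence the factor nrOfClasses below); the data terms are computed exactly
// with the iterative linear solver, while logdet(K~) is approximated from a
// few extreme eigenvalues via the Bai & Golub bounds.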
double GPLikelihoodApprox::evaluate(const OPTIMIZATION::matrix_type & _x)
{
  NICE::Vector xv;
  xv.resize ( _x.rows() );
  for ( uint i = 0 ; i < _x.rows(); i++ )
    xv[i] = _x(i,0);

  // check whether we have been here before (memoization keyed on a hash of the parameter vector)
  unsigned long hashValue = xv.getHashValue();
  if ( this->verbose )
    std::cerr << "Current parameter: " << xv << " (weird hash value is " << hashValue << ")" << std::endl;
  std::map<unsigned long, double>::const_iterator k = alreadyVisited.find(hashValue);
  if ( k != alreadyVisited.end() )
  {
    if ( this->verbose )
      std::cerr << "Using cached value: " << k->second << std::endl;

    // already computed, simply return the cached value
    return k->second;
  }

  // set parameter value and check lower and upper bounds of pf
  if ( ikm->outOfBounds(xv) )
  {
    if ( this->verbose )
      std::cerr << "Parameters are out of bounds" << std::endl;
    return numeric_limits<double>::max();
  }
  ikm->setParameters ( xv );
  if ( this->verbose )
    std::cerr << "setParameters xv: " << xv << std::endl;

  // --- calculate the likelihood
  // (a) logdet(K + sI)
  Timer t;
  if ( this->verbose )
    std::cerr << "Calculating eigendecomposition " << ikm->rows() << " x " << ikm->cols() << std::endl;
  t.start();
  NICE::Vector eigenmax;
  NICE::Matrix eigenmaxvectors;

  int rank = nrOfEigenvaluesToConsider;

  /** calculate the biggest eigenvalue */
  // We use an Arnoldi solver and not a specialized one...
  // We might also think about a coordinate-descent solver for the Arnoldi iterations; however,
  // the current implementation converges very quickly.
  // old version: just use the first eigenvalue

  // we have to re-compute the eigenvalues and eigenvectors in every call, since we changed the hyperparameter and thereby the kernel matrix
  eig->getEigenvalues( *ikm, eigenmax, eigenmaxvectors, rank );
  if ( this->verbose )
    std::cerr << "eigenmax: " << eigenmax << std::endl;

  t.stop();

  SparseVector binaryDataterms;
  Vector diagonalElements;
  ikm->getDiagonalElements ( diagonalElements );

  // set simple Jacobi preconditioning
  ILSConjugateGradients *linsolver_cg = dynamic_cast<ILSConjugateGradients *> ( linsolver );
  // TODO why do we need this?
  if ( linsolver_cg != NULL )
    linsolver_cg->setJacobiPreconditioner ( diagonalElements );
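
  // (Background on the TODO above: a Jacobi preconditioner rescales the
  // system by the inverse of diag(K + sigma^2 I), so CG effectively solves
  // D^{-1} K~ alpha = D^{-1} y; this usually lowers the condition number and
  // thereby the number of CG iterations when the diagonal entries vary
  // strongly.)
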
  // all alpha vectors will be stored!
  std::map<uint, NICE::Vector> alphas;

  // this has to be done m times for the multi-class case
  if ( this->verbose )
    std::cerr << "run ILS for every binary label. binaryLabels.size(): " << binaryLabels.size() << std::endl;
  for ( std::map<uint, NICE::Vector>::const_iterator j = binaryLabels.begin(); j != binaryLabels.end() ; j++)
  {
    // (b) y^T (K+sI)^{-1} y
    uint classCnt = j->first;
    if ( this->verbose )
    {
      std::cerr << "Solving linear equation system for class " << classCnt << " ..." << std::endl;
      std::cerr << "Size of the kernel matrix " << ikm->rows() << std::endl;
    }

    /** About finding a good initial solution:
     * see the detailed derivation in computeAlphaDirect above; either reuse
     * the alpha estimated for a previous hyperparameter, or fall back to the
     * rank-1 guess alpha = y / lambda_max.
     */
    NICE::Vector alpha;
    if ( this->initialAlphaGuess != NULL )
    {
      std::map<uint, NICE::Vector>::iterator myIt = this->initialAlphaGuess->find(classCnt);
      if ( myIt != this->initialAlphaGuess->end() )
        alpha = myIt->second;
      else
      {
        // NOTE this should never happen in theory...
        alpha = (binaryLabels[classCnt] * (1.0 / eigenmax[0]) );
      }
    }
    else
    {
      alpha = (binaryLabels[classCnt] * (1.0 / eigenmax[0]) );
    }

    if ( verbose )
      cerr << "Using the standard solver ..." << endl;

    t.start();
    linsolver->solveLin ( *ikm, binaryLabels[classCnt], alpha );
    t.stop();
    if ( verbose )
      std::cerr << "Time used for solving (K + sigma^2 I)^{-1} y: " << t.getLast() << std::endl;

    // this term is not an approximation at all
    double dataterm = binaryLabels[classCnt].scalarProduct(alpha);
    binaryDataterms[classCnt] = (dataterm);
    alphas[classCnt] = alpha;
  }
  // approximation stuff
  if ( this->verbose )
    cerr << "Approximating logdet(K) ..." << endl;
  t.start();
  LogDetApproxBaiAndGolub la;
  la.setVerbose(this->verbose);

  // NOTE: this is already the squared Frobenius norm that we are looking for.
  double frobNormSquared(0.0);

  // ------------- LOWER BOUND, THAT IS USED --------------------
  // frobNormSquared ~ \sum_i \lambda_i^2   <-- LOWER BOUND
  for (int idx = 0; idx < rank; idx++)
  {
    frobNormSquared += (eigenmax[idx] * eigenmax[idx]);
  }

  if ( this->verbose )
    cerr << " frob norm squared: est:" << frobNormSquared << endl;
  if ( this->verbose )
    std::cerr << "trace: " << diagonalElements.Sum() << std::endl;
  double logdet = la.getLogDetApproximationUpperBound( diagonalElements.Sum(), /* trace = n only for non-transformed features */
                                                       frobNormSquared,       /* use a rough approximation of the Frobenius norm */
                                                       eigenmax[0],           /* upper bound for the eigenvalues */
                                                       ikm->rows()            /* = n */
                                                     );
  t.stop();
  if ( this->verbose )
    cerr << "Time used for approximating logdet(K): " << t.getLast() << endl;

  // (c) adding the two terms
  double nlikelihood = this->nrOfClasses * logdet;
  double dataterm = binaryDataterms.sum();
  nlikelihood += dataterm;

  if ( this->verbose )
    cerr << "OPT: " << xv << " " << nlikelihood << " " << logdet << " " << dataterm << endl;

  // keep track of the best solution found so far
  if ( nlikelihood < min_nlikelihood )
  {
    min_nlikelihood = nlikelihood;
    ikm->getParameters ( min_parameter );
    this->min_alphas = alphas;
  }

  // cache the result for this parameter setting
  this->alreadyVisited.insert ( std::pair<unsigned long, double> ( hashValue, nlikelihood ) );
  return nlikelihood;
}

void GPLikelihoodApprox::setParameterLowerBound(const double & _parameterLowerBound)
{
  this->parameterLowerBound = _parameterLowerBound;
}

void GPLikelihoodApprox::setParameterUpperBound(const double & _parameterUpperBound)
{
  this->parameterUpperBound = _parameterUpperBound;
}

void GPLikelihoodApprox::setInitialAlphaGuess(std::map< uint, NICE::Vector >* _initialAlphaGuess)
{
  this->initialAlphaGuess = _initialAlphaGuess;
}

void GPLikelihoodApprox::setBinaryLabels(const std::map<uint, Vector> & _binaryLabels)
{
  this->binaryLabels = _binaryLabels;
}

void GPLikelihoodApprox::setVerbose( const bool & _verbose )
{
  this->verbose = _verbose;
}

void GPLikelihoodApprox::setDebug( const bool & _debug )
{
  this->debug = _debug;
}