/**
* @file GPLikelihoodApprox.cpp
* @brief GP likelihood approximation as a cost function (Implementation)
* @author Erik Rodner, Alexander Freytag
* @date 02/09/2012
*/

// STL includes
#include <iostream>

// NICE-core includes
#include <core/algebra/CholeskyRobust.h>
#include <core/algebra/ILSConjugateGradients.h>
#include <core/basics/Timer.h>
#include <core/vector/Algorithms.h>
#include <core/vector/Eigen.h>

// kernel functions used for verification only
#include "kernels/GeneralizedIntersectionKernelFunction.h"
#include "kernels/IntersectionKernelFunction.h"

// gp-hik-core includes
#include "gp-hik-core/GPLikelihoodApprox.h"
#include "gp-hik-core/IKMLinearCombination.h"
#include "gp-hik-core/GMHIKernel.h"
#include "gp-hik-core/algebra/LogDetApproxBaiAndGolub.h"

using namespace std;
using namespace NICE;
using namespace OPTIMIZATION;

GPLikelihoodApprox::GPLikelihoodApprox( const std::map<int, NICE::Vector> & binaryLabels,
                                        ImplicitKernelMatrix *ikm,
                                        IterativeLinearSolver *linsolver,
                                        EigValues *eig,
                                        bool verifyApproximation,
                                        int _nrOfEigenvaluesToConsider
                                      )
    : CostFunction( ikm->getNumParameters() )
{
  this->binaryLabels = binaryLabels;
  this->ikm = ikm;
  this->linsolver = linsolver;
  this->eig = eig;

  if ( binaryLabels.size() == 1 )
    this->nrOfClasses = 2;
  else
    this->nrOfClasses = binaryLabels.size();

  this->min_nlikelihood = std::numeric_limits<double>::max();
  this->verifyApproximation = verifyApproximation;
  this->nrOfEigenvaluesToConsider = _nrOfEigenvaluesToConsider;

  this->verbose = false;
  this->debug = false;

  this->initialAlphaGuess = NULL;
}

GPLikelihoodApprox::~GPLikelihoodApprox()
{
  // we do not have to delete the memory here, since it is handled externally...
  // TODO however, if we should ever copy the whole vector, then we also have to delete it here accordingly! Check this!
  if ( this->initialAlphaGuess != NULL )
    this->initialAlphaGuess = NULL;
}

const std::map<int, Vector> & GPLikelihoodApprox::getBestAlphas () const
{
  if ( this->min_alphas.size() > 0 )
  {
    // we have already computed a locally optimal solution
    return this->min_alphas;
  }
  else if ( this->initialAlphaGuess != NULL )
  {
    std::cerr << "no known alpha vectors so far, taking the initial guess instead" << std::endl;
    // the computation has not started yet, but an initial guess was given, so we use it
    return *(this->initialAlphaGuess);
  }

  // nothing known so far, min_alphas will be empty
  return this->min_alphas;
}

void GPLikelihoodApprox::calculateLikelihood ( double mypara, const FeatureMatrix & f, const std::map< int, NICE::Vector > & yset, double noise, double lambdaMax )
{
  // robust Cholesky routine without extra regularization noise
  CholeskyRobust cr ( true /*verbose*/, 0.0, false /*useCuda*/ );

  Timer t;
  t.start();
  cerr << "VERIFY: Calculating kernel matrix ..." << endl;

  Matrix K;
  IntersectionKernelFunction<double> hik;
  // old version, not needed anymore -- we exploit sparsity
  // K = hik.computeKernelMatrix(data_matrix, noise); // = K + sigma^2 I
  K = hik.computeKernelMatrix(f, noise);
  t.stop();

  cerr << "VERIFY: Time used for calculating the kernel matrix: " << t.getLast() << endl;
  cerr << "K is a " << K.rows() << " x " << K.cols() << " matrix" << endl;

  if ( K.containsNaN() )
    fthrow(Exception, "NaN values in the kernel matrix");

  cerr << "VERIFY: Computing likelihood ..." << endl;
  t.start();

  Matrix choleskyMatrix;
  cr.robustChol ( K, choleskyMatrix ); // K = choleskyMatrix^T * choleskyMatrix
  double gt_logdet = (yset.size()) * cr.getLastLogDet();
  cerr << "chol * chol^T: " << ( choleskyMatrix * choleskyMatrix.transpose() )(0,0,4,4) << endl;

  double gt_dataterm = 0.0;
  for ( std::map< int, NICE::Vector >::const_iterator i = yset.begin(); i != yset.end(); i++ )
  {
    const NICE::Vector & y = i->second;
    Vector gt_alpha;
    choleskySolve ( choleskyMatrix, y, gt_alpha );
    cerr << "cholesky error: " << (K*gt_alpha - y).normL2() << endl;
    gt_dataterm += y.scalarProduct ( gt_alpha );
  }
  //cerr << "linsolve error: " << (tmp - y).normL2() << endl;
  t.stop();

  cerr << "VERIFY: Time used for calculating the likelihood: " << t.getLast() << endl;
  cerr << "Upper left block of K: " << K(0, 0, 4, 4) << endl;
  cerr << "frob norm: gt: " << K.frobeniusNorm() << endl;

  double gt_nlikelihood = gt_logdet + gt_dataterm;
  cerr << "OPTGT: " << mypara << " " << gt_nlikelihood << " " << gt_logdet << " " << gt_dataterm << endl;
}
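
/* A minimal sketch (hypothetical helper, not part of the original file): the
 * ground-truth value computed above decomposes, for each binary task y, into
 *   nll(y) = logdet(K + sigma^2 I) + y^T (K + sigma^2 I)^{-1} y .
 * The helper below restates this for a single, already regularized dense
 * matrix, reusing exactly the CholeskyRobust calls of calculateLikelihood().
 */
static double exactNegLogLikelihoodSketch ( const NICE::Matrix & Kreg,
                                            const NICE::Vector & y )
{
  CholeskyRobust cr ( false /*verbose*/, 0.0, false /*useCuda*/ );

  Matrix choleskyMatrix;
  cr.robustChol ( Kreg, choleskyMatrix ); // Kreg = choleskyMatrix^T * choleskyMatrix

  Vector alpha;
  choleskySolve ( choleskyMatrix, y, alpha ); // alpha = Kreg^{-1} y

  // log-determinant term plus data term
  return cr.getLastLogDet() + y.scalarProduct ( alpha );
}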

void GPLikelihoodApprox::computeAlphaDirect(const OPTIMIZATION::matrix_type & x, const NICE::Vector & eigenValues )
{
  Timer t;

  NICE::Vector diagonalElements;
  ikm->getDiagonalElements ( diagonalElements );

  // set up a simple Jacobi preconditioner
  ILSConjugateGradients *linsolver_cg = dynamic_cast<ILSConjugateGradients *> ( linsolver );
  if ( linsolver_cg != NULL )
    linsolver_cg->setJacobiPreconditioner ( diagonalElements );

  // all alpha vectors will be stored!
  std::map<int, NICE::Vector> alphas;

  // this has to be done m times for the multi-class case
  if ( this->verbose )
    std::cerr << "run ILS for every binary label. binaryLabels.size(): " << binaryLabels.size() << std::endl;

  for ( std::map<int, NICE::Vector>::const_iterator j = binaryLabels.begin(); j != binaryLabels.end() ; j++)
  {
    // (b) y^T (K+sI)^{-1} y
    int classCnt = j->first;
    if ( this->verbose )
    {
      std::cerr << "Solving linear equation system for class " << classCnt << " ..." << std::endl;
    }

    /** About finding a good initial solution (see also the sketch after this function)
     * K~ = K + sigma^2 I
     *
     * (0) If we have already estimated alpha for a previous hyperparameter, then
     *     we should use it as an initial estimate. According to quick tests,
     *     this really helps!
     * (1) K~ \approx lambda_max v v^T
     *     lambda_max v v^T alpha = y | multiply with v^T from the left
     *     => lambda_max v^T alpha = v^T y
     *     => alpha = y / lambda_max could be a good initial start.
     *     If we put everything back into the first equation, this gives us
     *     v = y ... which is somehow a weird assumption (cf. kernel PCA).
     *     This reduces the number of iterations by 5 to 8.
     */
    NICE::Vector alpha;
    alpha = (binaryLabels[classCnt] * (1.0 / eigenValues[0]) );

    if ( verbose )
      std::cerr << "Using the standard solver ..." << std::endl;

    t.start();
    linsolver->solveLin ( *ikm, binaryLabels[classCnt], alpha );
    t.stop();

    alphas.insert( std::pair<int, NICE::Vector> ( classCnt, alpha ) );
  }

  // save the parameter value and the alpha vectors
  ikm->getParameters ( min_parameter );
  this->min_alphas = alphas;
}
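
/* A minimal sketch (hypothetical helper, not in the original code) of the
 * rank-1 initialization motivated in the comment above: if K~ is dominated by
 * its largest eigenpair, K~ \approx lambda_max v v^T, then solving
 * K~ alpha = y suggests the start vector alpha_0 = y / lambda_max.
 */
static NICE::Vector initialAlphaSketch ( const NICE::Vector & y,
                                         double lambdaMax )
{
  // fall back to a zero start vector if the largest eigenvalue is degenerate
  if ( lambdaMax <= 0.0 )
    return NICE::Vector ( y.size(), 0.0 );

  // scale the labels by the inverse of the largest eigenvalue
  return y * ( 1.0 / lambdaMax );
}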

double GPLikelihoodApprox::evaluate(const OPTIMIZATION::matrix_type & x)
{
  NICE::Vector xv;
  xv.resize ( x.rows() );
  for ( uint i = 0 ; i < x.rows(); i++ )
    xv[i] = x(i,0);

  // check whether we have been here before
  unsigned long hashValue = xv.getHashValue();
  if ( this->verbose )
    std::cerr << "Current parameter: " << xv << " (hash value is " << hashValue << ")" << std::endl;

  std::map<unsigned long, double>::const_iterator k = alreadyVisited.find(hashValue);
  if ( k != alreadyVisited.end() )
  {
    if ( this->verbose )
      std::cerr << "Using cached value: " << k->second << std::endl;

    // already computed, simply return the cached value
    return k->second;
  }

  // set the parameter value and check the lower and upper bounds of pf
  if ( ikm->outOfBounds(xv) )
  {
    if ( this->verbose )
      std::cerr << "Parameters are out of bounds" << std::endl;
    return numeric_limits<double>::max();
  }

  ikm->setParameters ( xv );
  if ( this->verbose )
    std::cerr << "setParameters xv: " << xv << std::endl;

  // --- calculate the likelihood
  // (a) logdet(K + sI)
  Timer t;
  if ( this->verbose )
    std::cerr << "Calculating eigendecomposition " << ikm->rows() << " x " << ikm->cols() << std::endl;
  t.start();

  NICE::Vector eigenmax;
  NICE::Matrix eigenmaxvectors;
  int rank = nrOfEigenvaluesToConsider;

  /** calculate the largest eigenvalues */
  // We use an Arnoldi solver and not a specialized one.
  // We might also think about a coordinate descent solver for the Arnoldi iterations; however,
  // the current implementation converges very quickly.
  // Note: we have to re-compute the eigenvalues and eigenvectors in all cases, since we change
  // the hyperparameter and thereby the kernel matrix.
  eig->getEigenvalues( *ikm, eigenmax, eigenmaxvectors, rank );
  if ( this->verbose )
    std::cerr << "eigenmax: " << eigenmax << std::endl;
  t.stop();

  SparseVector binaryDataterms;
  Vector diagonalElements;
  ikm->getDiagonalElements ( diagonalElements );

  // set up a simple Jacobi preconditioner
  ILSConjugateGradients *linsolver_cg = dynamic_cast<ILSConjugateGradients *> ( linsolver );
  //TODO why do we need this?
  if ( linsolver_cg != NULL )
    linsolver_cg->setJacobiPreconditioner ( diagonalElements );

  // all alpha vectors will be stored!
  std::map<int, NICE::Vector> alphas;

  // this has to be done m times for the multi-class case
  if ( this->verbose )
    std::cerr << "run ILS for every binary label. binaryLabels.size(): " << binaryLabels.size() << std::endl;

  for ( std::map<int, NICE::Vector>::const_iterator j = binaryLabels.begin(); j != binaryLabels.end() ; j++)
  {
    // (b) y^T (K+sI)^{-1} y
    int classCnt = j->first;
    if ( this->verbose )
    {
      std::cerr << "Solving linear equation system for class " << classCnt << " ..." << std::endl;
      std::cerr << "Size of the kernel matrix " << ikm->rows() << std::endl;
    }

    /** About finding a good initial solution:
     * see the detailed comment in computeAlphaDirect above -- the same rank-1
     * reasoning (alpha_0 = y / lambda_max) applies here, unless a previously
     * estimated alpha is available as an initial guess.
     */
    NICE::Vector alpha;
    if ( this->initialAlphaGuess != NULL )
    {
      std::map<int, NICE::Vector>::iterator myIt = this->initialAlphaGuess->find(classCnt);
      if ( myIt != this->initialAlphaGuess->end() )
        alpha = myIt->second;
      else
      {
        //NOTE this should never happen in theory...
        alpha = (binaryLabels[classCnt] * (1.0 / eigenmax[0]) );
      }
    }
    else
    {
      alpha = (binaryLabels[classCnt] * (1.0 / eigenmax[0]) );
    }

    if ( verbose )
      cerr << "Using the standard solver ..." << endl;

    t.start();
    linsolver->solveLin ( *ikm, binaryLabels[classCnt], alpha );
    t.stop();

    if ( verbose )
      std::cerr << "Time used for solving (K + sigma^2 I)^{-1} y: " << t.getLast() << std::endl;

    // this term is not an approximation at all
    double dataterm = binaryLabels[classCnt].scalarProduct(alpha);
    binaryDataterms[classCnt] = (dataterm);
    alphas[classCnt] = alpha;
  }

  // approximation of logdet(K)
  if ( this->verbose )
    cerr << "Approximating logdet(K) ..." << endl;
  t.start();

  LogDetApproxBaiAndGolub la;
  la.setVerbose(this->verbose);

  //NOTE: this is already the squared Frobenius norm that we are looking for.
  double frobNormSquared(0.0);

  // ------------- LOWER BOUND, THAT IS USED --------------------
  // frobNormSquared ~ \sum_i \lambda_i^2  <-- LOWER BOUND
  for (int idx = 0; idx < rank; idx++)
  {
    frobNormSquared += (eigenmax[idx] * eigenmax[idx]);
  }

  if ( this->verbose )
    cerr << " frob norm squared: est: " << frobNormSquared << endl;
  if ( this->verbose )
    std::cerr << "trace: " << diagonalElements.Sum() << std::endl;

  // Bai & Golub upper bound (see the sketch after this function)
  double logdet = la.getLogDetApproximationUpperBound( diagonalElements.Sum(), /* trace = n only for non-transformed features */
                                                       frobNormSquared, /* use a rough approximation of the Frobenius norm */
                                                       eigenmax[0], /* upper bound for the eigenvalues */
                                                       ikm->rows() /* = n */
                                                     );
  t.stop();
  if ( this->verbose )
    cerr << "Time used for approximating logdet(K): " << t.getLast() << endl;

  // (c) adding the two terms
  double nlikelihood = nrOfClasses * logdet;
  double dataterm = binaryDataterms.sum();
  nlikelihood += dataterm;

  if ( this->verbose )
    cerr << "OPT: " << xv << " " << nlikelihood << " " << logdet << " " << dataterm << endl;

  if ( nlikelihood < min_nlikelihood )
  {
    min_nlikelihood = nlikelihood;
    ikm->getParameters ( min_parameter );
    min_alphas = alphas;
  }

  // note: the key type has to match the map (unsigned long), otherwise the hash value gets truncated
  alreadyVisited.insert ( pair<unsigned long, double> ( hashValue, nlikelihood ) );
  return nlikelihood;
}
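
#include <cmath> // std::log, needed only for the illustrative sketch below

/* A minimal sketch (hypothetical helper, not the library's implementation) of
 * the Bai & Golub (1997) upper bound consumed above via
 * LogDetApproxBaiAndGolub::getLogDetApproximationUpperBound. Given the trace
 * mu1 = tr(K), an estimate mu2 of the squared Frobenius norm ||K||_F^2, an
 * upper bound lambdaMax on the eigenvalues, and the matrix size n, a two-point
 * Gauss-type quadrature with nodes lambdaMax and tbar yields
 *   logdet(K) <= w1 * log(lambdaMax) + w2 * log(tbar).
 */
static double logDetUpperBoundSketch ( double mu1, double mu2,
                                       double lambdaMax, int n )
{
  // the free node tbar is chosen such that the rule also matches mu2
  double tbar = ( lambdaMax * mu1 - mu2 ) / ( lambdaMax * n - mu1 );

  // the weights solve w1 + w2 = n and w1 * lambdaMax + w2 * tbar = mu1
  double w1 = ( mu1 - n * tbar ) / ( lambdaMax - tbar );
  double w2 = n - w1;

  return w1 * std::log ( lambdaMax ) + w2 * std::log ( tbar );
}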

void GPLikelihoodApprox::setParameterLowerBound(const double & _parameterLowerBound)
{
  parameterLowerBound = _parameterLowerBound;
}

void GPLikelihoodApprox::setParameterUpperBound(const double & _parameterUpperBound)
{
  parameterUpperBound = _parameterUpperBound;
}

void GPLikelihoodApprox::setInitialAlphaGuess(std::map< int, NICE::Vector >* _initialAlphaGuess)
{
  this->initialAlphaGuess = _initialAlphaGuess;
}

void GPLikelihoodApprox::setBinaryLabels(const std::map<int, Vector> & _binaryLabels)
{
  binaryLabels = _binaryLabels;
}

void GPLikelihoodApprox::setVerbose( const bool & _verbose )
{
  this->verbose = _verbose;
}

void GPLikelihoodApprox::setDebug( const bool & _debug )
{
  this->debug = _debug;
}
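
// Usage sketch (an assumption, not part of the original file): evaluating the
// approximate negative log-likelihood for one candidate hyperparameter.
// Here, binLabels, ikm, linsolver, and eig are hypothetical stand-ins for
// objects the caller has already set up (e.g., a GMHIKernel, an
// ILSConjugateGradients solver, and an EigValues implementation).
//
//   GPLikelihoodApprox nll ( binLabels, ikm, linsolver, eig,
//                            false /*verifyApproximation*/, 1 /*nrOfEigenvaluesToConsider*/ );
//   OPTIMIZATION::matrix_type x ( ikm->getNumParameters(), 1 );
//   x(0,0) = 1.0; // candidate parameter value
//   double nlikelihood = nll.evaluate ( x );
//   const std::map<int, NICE::Vector> & alphas = nll.getBestAlphas();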