GPLikelihoodApprox.cpp

/**
 * @file GPLikelihoodApprox.cpp
 * @brief GP likelihood approximation as a cost function (Implementation)
 * @author Erik Rodner, Alexander Freytag
 * @date 02/09/2012
 */

#include <iostream>

#include <core/algebra/CholeskyRobust.h>
#include <core/vector/Algorithms.h>
#include <core/vector/Eigen.h>
#include <core/basics/Timer.h>
#include <core/algebra/ILSConjugateGradients.h>

#include "kernels/GeneralizedIntersectionKernelFunction.h"
#include "kernels/IntersectionKernelFunction.h"
#include "GPLikelihoodApprox.h"
#include "IKMLinearCombination.h"
#include "GMHIKernel.h"
#include "algebra/LogDetApproxBaiAndGolub.h"

using namespace std;
using namespace NICE;
using namespace OPTIMIZATION;
GPLikelihoodApprox::GPLikelihoodApprox( const map<int, Vector> & binaryLabels,
                                        ImplicitKernelMatrix *ikm,
                                        IterativeLinearSolver *linsolver,
                                        EigValues *eig,
                                        bool verifyApproximation,
                                        int _nrOfEigenvaluesToConsider
                                      )
    : CostFunction( ikm->getNumParameters() )
{
  this->binaryLabels = binaryLabels;
  this->ikm = ikm;
  this->linsolver = linsolver;
  this->eig = eig;

  // a single binary label vector still corresponds to a two-class problem
  if ( binaryLabels.size() == 1 )
    this->nrOfClasses = 2;
  else
    this->nrOfClasses = binaryLabels.size();

  this->min_nlikelihood = std::numeric_limits<double>::max();
  this->verifyApproximation = verifyApproximation;
  this->nrOfEigenvaluesToConsider = _nrOfEigenvaluesToConsider;

  this->verbose = false;
  this->debug = false;
}
GPLikelihoodApprox::~GPLikelihoodApprox()
{
}
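
/* Added sketch of the quantity verified below (not original author
 * commentary): for the regularized kernel matrix K~ = K + sigma^2 I and
 * binary label vectors y_c, the (unnormalized) negative log-likelihood is
 *
 *   nll = m * logdet(K~) + \sum_c y_c^T K~^{-1} y_c ,
 *
 * where m is the number of binary problems. calculateLikelihood() computes
 * this exactly via a robust Cholesky factorization and serves as a
 * ground-truth check for the approximations used in evaluate(). */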
void GPLikelihoodApprox::calculateLikelihood ( double mypara, const FeatureMatrix & f, const std::map< int, NICE::Vector > & yset, double noise, double lambdaMax )
{
  // robust Cholesky routine without additional regularization noise
  CholeskyRobust cr ( true /*verbose*/, 0.0, false /*useCuda*/ );

  Timer t;
  t.start();
  cerr << "VERIFY: Calculating kernel matrix ..." << endl;

  Matrix K;
  IntersectionKernelFunction<double> hik;
  // old version, not needed anymore; we exploit sparsity now
  // K = hik.computeKernelMatrix(data_matrix, noise); // = K + sigma^2 I
  K = hik.computeKernelMatrix(f, noise);
  t.stop();

  cerr << "VERIFY: Time used for calculating kernel matrix is: " << t.getLast() << endl;
  cerr << "K is a " << K.rows() << " x " << K.cols() << " matrix" << endl;

  if ( K.containsNaN() )
    fthrow(Exception, "NaN values in the kernel matrix");

  cerr << "VERIFY: Computing likelihood ..." << endl;
  t.start();

  Matrix choleskyMatrix;
  cr.robustChol ( K, choleskyMatrix ); // K = choleskyMatrix^T * choleskyMatrix
  double gt_logdet = (yset.size()) * cr.getLastLogDet();
  cerr << "chol * chol^T: " << ( choleskyMatrix * choleskyMatrix.transpose() )(0,0,4,4) << endl;

  double gt_dataterm = 0.0;
  for ( std::map< int, NICE::Vector >::const_iterator i = yset.begin(); i != yset.end(); i++ )
  {
    const NICE::Vector & y = i->second;
    Vector gt_alpha;
    choleskySolve ( choleskyMatrix, y, gt_alpha );
    cerr << "cholesky error: " << (K*gt_alpha - y).normL2() << endl;
    gt_dataterm += y.scalarProduct ( gt_alpha );
  }
  //cerr << "linsolve error: " << (tmp - y).normL2() << endl;
  t.stop();

  cerr << "VERIFY: Time used for calculating likelihood: " << t.getLast() << endl;
  cerr << "Something of K: " << K(0, 0, 4, 4) << endl;
  cerr << "frob norm: gt:" << K.frobeniusNorm() << endl;

  double gt_nlikelihood = gt_logdet + gt_dataterm;
  cerr << "OPTGT: " << mypara << " " << gt_nlikelihood << " " << gt_logdet << " " << gt_dataterm << endl;
}
void GPLikelihoodApprox::computeAlphaDirect(const OPTIMIZATION::matrix_type & x, const NICE::Vector & eigenValues )
{
  Timer t;

  NICE::Vector diagonalElements;
  ikm->getDiagonalElements ( diagonalElements );

  // set simple Jacobi preconditioning
  ILSConjugateGradients *linsolver_cg = dynamic_cast<ILSConjugateGradients *> ( linsolver );
  if ( linsolver_cg != NULL )
    linsolver_cg->setJacobiPreconditioner ( diagonalElements );

  // all alpha vectors will be stored!
  std::map<int, NICE::Vector> alphas;

  // this has to be done m times for the multi-class case
  if (verbose)
    std::cerr << "run ILS for every binary label. binaryLabels.size(): " << binaryLabels.size() << std::endl;

  for ( std::map<int, NICE::Vector>::const_iterator j = binaryLabels.begin(); j != binaryLabels.end() ; j++)
  {
    // (b) y^T (K+sI)^{-1} y
    int classCnt = j->first;
    if (verbose)
    {
      std::cerr << "Solving linear equation system for class " << classCnt << " ..." << std::endl;
    }

    /** About finding a good initial solution
     * K~ = K + sigma^2 I
     *
     * (0) If we have already estimated alpha for a previous hyperparameter,
     *     we should use it as the initial estimate. According to my quick
     *     tests this really helps!
     * (1) K~ \approx lambda_max v v^T
     *     lambda_max v v^T alpha = y  | multiply with v^T from the left
     *     => lambda_max v^T alpha = v^T y
     *     => alpha = y / lambda_max can be a good initial guess.
     *     Plugging this back into the first equation yields v = y,
     *     which is a somewhat odd assumption (cf. kernel PCA).
     *     This initialization reduces the number of iterations by 5 to 8.
     */
    NICE::Vector alpha;
    alpha = (binaryLabels[classCnt] * (1.0 / eigenValues[0]) );

    if ( verbose )
      std::cerr << "Using the standard solver ..." << std::endl;

    t.start();
    linsolver->solveLin ( *ikm, binaryLabels[classCnt], alpha );
    t.stop();

    alphas.insert( std::pair<int, NICE::Vector> ( classCnt, alpha) );
  }
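
  /* Added note, not original author commentary: point (0) of the comment in
   * the loop above is not implemented here; the spectral guess
   * alpha = y / lambda_max is always used. A warm start could reuse the
   * alphas cached in 'min_alphas', e.g. (sketch only, hypothetical):
   *
   *   std::map<int, NICE::Vector>::const_iterator prev = min_alphas.find( classCnt );
   *   if ( prev != min_alphas.end() && prev->second.size() == binaryLabels[classCnt].size() )
   *     alpha = prev->second;                                      // reuse previous solution
   *   else
   *     alpha = binaryLabels[classCnt] * ( 1.0 / eigenValues[0] ); // spectral initialization
   */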
  // save the parameter value and alpha vectors
  ikm->getParameters ( min_parameter );
  this->min_alphas = alphas;
}
double GPLikelihoodApprox::evaluate(const OPTIMIZATION::matrix_type & x)
{
  NICE::Vector xv;
  xv.resize ( x.rows() );
  for ( uint i = 0 ; i < x.rows(); i++ )
    xv[i] = x(i,0);

  // check whether we have been here before
  unsigned long hashValue = xv.getHashValue();
  if (verbose)
    std::cerr << "Current parameter: " << xv << " (hash value is " << hashValue << ")" << std::endl;

  std::map<unsigned long, double>::const_iterator k = alreadyVisited.find(hashValue);
  if ( k != alreadyVisited.end() )
  {
    if (verbose)
      std::cerr << "Using cached value: " << k->second << std::endl;

    // already computed, simply return the cached value
    return k->second;
  }
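
  /* Added note: the cache above is keyed solely by the hash of the parameter
   * vector, so two different parameter vectors with colliding hashes would
   * incorrectly share a cached likelihood value. For the modest number of
   * evaluations in a typical hyperparameter search this is unlikely, but it
   * is an assumption of the scheme, not a guarantee. */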
  // set parameter value and check lower and upper bounds of pf
  if ( ikm->outOfBounds(xv) )
  {
    if (verbose)
      std::cerr << "Parameters are out of bounds" << std::endl;
    return numeric_limits<double>::max();
  }

  ikm->setParameters ( xv );
  if (verbose)
    std::cerr << "setParameters xv: " << xv << std::endl;

  // --- calculate the likelihood
  // (a) logdet(K + sI)
  Timer t;

  if (verbose)
    std::cerr << "Calculating eigendecomposition " << ikm->rows() << " x " << ikm->cols() << std::endl;
  t.start();

  NICE::Vector eigenmax;
  NICE::Matrix eigenmaxvectors;
  int rank = nrOfEigenvaluesToConsider;

  /** calculate the largest eigenvalues */
  // We use an Arnoldi solver and not a specialized one.
  // We might also think about a coordinate descent solver for the Arnoldi iterations;
  // however, the current implementation converges very quickly.
  // old version: just use the first eigenvalue
  // We have to re-compute the eigenvalues and eigenvectors in all cases, since changing
  // the hyperparameters changes the kernel matrix.
  eig->getEigenvalues( *ikm, eigenmax, eigenmaxvectors, rank );
  if (verbose)
    std::cerr << "eigenmax: " << eigenmax << std::endl;
  t.stop();
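
  /* Added note: the 'rank' leading eigenvalues computed here serve two
   * purposes below: eigenmax[0] provides both the spectral initialization
   * for the iterative linear solver and the eigenvalue upper bound for the
   * logdet approximation, while the squared eigenvalues yield a lower bound
   * on the squared Frobenius norm of the kernel matrix. */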
  SparseVector binaryDataterms;

  Vector diagonalElements;
  ikm->getDiagonalElements ( diagonalElements );

  // set simple Jacobi preconditioning
  ILSConjugateGradients *linsolver_cg = dynamic_cast<ILSConjugateGradients *> ( linsolver );
  //TODO why do we need this?
  if ( linsolver_cg != NULL )
    linsolver_cg->setJacobiPreconditioner ( diagonalElements );

  // all alpha vectors will be stored!
  std::map<int, NICE::Vector> alphas;

  // this has to be done m times for the multi-class case
  if (verbose)
    std::cerr << "run ILS for every binary label. binaryLabels.size(): " << binaryLabels.size() << std::endl;

  for ( std::map<int, NICE::Vector>::const_iterator j = binaryLabels.begin(); j != binaryLabels.end() ; j++)
  {
    // (b) y^T (K+sI)^{-1} y
    int classCnt = j->first;
    if (verbose)
    {
      std::cerr << "Solving linear equation system for class " << classCnt << " ..." << std::endl;
      std::cerr << "Size of the kernel matrix " << ikm->rows() << std::endl;
    }

    // About finding a good initial solution, see the detailed comment in
    // computeAlphaDirect(); the same spectral initialization is used here.
    NICE::Vector alpha;
    alpha = (binaryLabels[classCnt] * (1.0 / eigenmax[0]) );

    if ( verbose )
      std::cerr << "Using the standard solver ..." << std::endl;

    t.start();
    linsolver->solveLin ( *ikm, binaryLabels[classCnt], alpha );
    t.stop();

    if ( verbose )
      std::cerr << "Time used for solving (K + sigma^2 I)^{-1} y: " << t.getLast() << std::endl;

    // this term is no approximation at all
    double dataterm = binaryLabels[classCnt].scalarProduct(alpha);
    binaryDataterms[classCnt] = (dataterm);
    alphas[classCnt] = alpha;
  }
  // approximate logdet(K) instead of computing it exactly
  if (verbose)
    cerr << "Approximating logdet(K) ..." << endl;
  t.start();
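
  /* Background (added note): logdet(K~) = trace(log(K~)) is bounded from
   * above using only three cheap quantities: the trace of K~, an estimate of
   * its squared Frobenius norm, and an upper bound on its largest eigenvalue
   * (Bai & Golub style bounds). The squared Frobenius norm is itself
   * lower-bounded by the top eigenvalues computed above, since
   * ||K~||_F^2 = sum_i lambda_i^2 >= sum_{i < rank} lambda_i^2. */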
  LogDetApproxBaiAndGolub la;
  la.setVerbose(this->verbose);

  // NOTE: this is already the squared Frobenius norm that we are looking for
  double frobNormSquared(0.0);

  // ------------- LOWER BOUND, THAT IS USED --------------------
  // frobNormSquared ~ \sum \lambda_i^2 <-- LOWER BOUND
  for (int idx = 0; idx < rank; idx++)
  {
    frobNormSquared += (eigenmax[idx] * eigenmax[idx]);
  }

  if (verbose)
    cerr << " frob norm squared: est:" << frobNormSquared << endl;
  if (verbose)
    std::cerr << "trace: " << diagonalElements.Sum() << std::endl;

  double logdet = la.getLogDetApproximationUpperBound( diagonalElements.Sum(), /* trace = n only for non-transformed features */
                                                       frobNormSquared,        /* use a rough approximation of the Frobenius norm */
                                                       eigenmax[0],            /* upper bound for the eigenvalues */
                                                       ikm->rows()             /* = n */
                                                     );
  t.stop();
  if (verbose)
    cerr << "Time used for approximating logdet(K): " << t.getLast() << endl;

  // (c) adding the two terms
  double nlikelihood = nrOfClasses * logdet;
  double dataterm = binaryDataterms.sum();
  nlikelihood += dataterm;

  if (verbose)
    cerr << "OPT: " << xv << " " << nlikelihood << " " << logdet << " " << dataterm << endl;

  if ( nlikelihood < min_nlikelihood )
  {
    min_nlikelihood = nlikelihood;
    ikm->getParameters ( min_parameter );
    min_alphas = alphas;
  }

  alreadyVisited.insert ( pair<unsigned long, double> ( hashValue, nlikelihood ) );
  return nlikelihood;
}
void GPLikelihoodApprox::setParameterLowerBound(const double & _parameterLowerBound)
{
  parameterLowerBound = _parameterLowerBound;
}

void GPLikelihoodApprox::setParameterUpperBound(const double & _parameterUpperBound)
{
  parameterUpperBound = _parameterUpperBound;
}

void GPLikelihoodApprox::setBinaryLabels(const std::map<int, Vector> & _binaryLabels)
{
  binaryLabels = _binaryLabels;
}

void GPLikelihoodApprox::setVerbose( const bool & _verbose )
{
  this->verbose = _verbose;
}

void GPLikelihoodApprox::setDebug( const bool & _debug )
{
  this->debug = _debug;
}