
/**
 * @file GPLaplaceOptimizationProblem.cpp
 * @brief Hyperparameter Optimization Problem for GP Regression
 * @author Erik Rodner
 * @date 12/09/2009
 */
#include <iostream>

#include "core/vector/Algorithms.h"
#include "core/algebra/CholeskyRobust.h"
#include "core/algebra/CholeskyRobustAuto.h"
#include "vislearning/math/kernels/OptimKernelParameterGradInterface.h"

#include "GPLaplaceOptimizationProblem.h"

using namespace std;
using namespace NICE;
using namespace OBJREC;
GPLaplaceOptimizationProblem::GPLaplaceOptimizationProblem (
      KernelData *_kernelData,
      const NICE::Vector & _y,
      ParameterizedKernel *_kernel,
      const LikelihoodFunction *_likelihoodFunction,
      LaplaceApproximation *_laplaceApproximation,
      bool verbose )
    : OptimizationProblemFirst ( _kernel->getParameterSize() ),
      kernelData(_kernelData),
      kernel(_kernel),
      likelihoodFunction(_likelihoodFunction)
{
  this->verbose = verbose;
  this->y.push_back ( _y );
  this->kernel->getParameters( parameters() );
  this->laplaceApproximation.push_back ( _laplaceApproximation );

  if ( verbose ) {
    cerr.precision(20);
    cerr << "GPLaplaceOptimizationProblem: initial parameters = " << parameters() << endl;
  }

  bestLooParameters.resize ( parameters().size() );
  bestAvgLooError = numeric_limits<double>::max();
}
GPLaplaceOptimizationProblem::GPLaplaceOptimizationProblem ( KernelData *_kernelData, const VVector & _y,
      ParameterizedKernel *_kernel, const LikelihoodFunction *_likelihoodFunction,
      const vector<LaplaceApproximation *> & _laplaceApproximation,
      bool verbose )
    : OptimizationProblemFirst( _kernel->getParameterSize() ),
      kernelData(_kernelData),
      y(_y),
      kernel(_kernel),
      likelihoodFunction(_likelihoodFunction),
      laplaceApproximation(_laplaceApproximation)
{
  this->verbose = verbose;
  this->kernel->getParameters( parameters() );

  if ( verbose ) {
    cerr.precision(20);
    cerr << "GPLaplaceOptimizationProblem: initial parameters = " << parameters() << endl;
  }

  if ( y.size() != laplaceApproximation.size() )
    fthrow(Exception, "Number of label vectors and number of Laplace approximation objects do not correspond.");

  bestLooParameters.resize ( parameters().size() );
  bestAvgLooError = numeric_limits<double>::max();
}
GPLaplaceOptimizationProblem::~GPLaplaceOptimizationProblem()
{
}
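
/* Usage sketch (illustrative only, not part of the original implementation):
 * this problem class is intended to be handed to a gradient-based optimizer
 * working on OptimizationProblemFirst; the concrete optimizer type named
 * below is an assumption.
 *
 *   GPLaplaceOptimizationProblem problem ( &kernelData, y, &kernel,
 *                                          &likelihood, &laplace, true );
 *   FirstOrderTrustRegion optimizer;
 *   optimizer.optimizeFirst ( problem );  // minimize the negative log marginal likelihood
 *   problem.update();                     // re-run the Laplace approximation with the optimized parameters
 */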
void GPLaplaceOptimizationProblem::update ()
{
  kernel->setParameters( parameters() );
  kernel->updateKernelData ( kernelData );
  kernelData->updateCholeskyFactorization();

  for ( uint i = 0 ; i < y.size(); i++ )
    laplaceApproximation[i]->approximate ( kernelData, y[i], likelihoodFunction );
}
double GPLaplaceOptimizationProblem::computeObjective()
{
  if ( verbose ) {
    cerr << "GPLaplaceOptimizationProblem: computeObjective: " << parameters() << endl;
  }

  try {
    update();
  } catch ( Exception & ) {
    // inversion problems
    if ( verbose )
      cerr << "GPLaplaceOptimizationProblem: kernel matrix might be singular" << endl;
    return numeric_limits<double>::max();
  }

  double loglikelihood = 0.0;

  /* Objective = - Laplace Objective + 0.5 logdet(B)
   * Laplace Objective = - 0.5 a^T f + log p(y|f)
   * B = Id + W^0.5 K W^0.5
   */
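  // Note: getCholeskyB() provides the Cholesky factor L with L L^T = B, so
  // logdet(L) = 0.5 * logdet(B); triangleMatrixLogDet(L) is thus expected to
  // contribute exactly the 0.5 logdet(B) term of the objective above.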
  for ( uint k = 0 ; k < laplaceApproximation.size() ; k++ )
    loglikelihood += - laplaceApproximation[k]->getObjective() + triangleMatrixLogDet ( laplaceApproximation[k]->getCholeskyB() );

  if ( verbose ) {
    cerr << "GPLaplaceOptimizationProblem: negative loglikelihood = " << loglikelihood << endl;
  }

  return loglikelihood;
}
void GPLaplaceOptimizationProblem::computeGradient( NICE::Vector& newGradient )
{
  if ( verbose )
    cerr << "GPLaplaceOptimizationProblem: gradient computation : " << parameters() << endl;

  newGradient.resize(parameters().size());
  newGradient.set(0.0);

  for ( uint task = 0 ; task < laplaceApproximation.size(); task++ )
  {
    // NON-OPTIMIZED: some quantities could probably be computed outside of this loop
    // The following code is a straightforward re-implementation of the MATLAB
    // pseudo-code provided in Rasmussen and Williams 2006
    // negative Hessian (diagonal matrix) of log p(y|f)
    const Vector & w = laplaceApproximation[task]->getHessian();
    // L*L^T = B = I + W^0.5 K W^0.5
    const Matrix & L = laplaceApproximation[task]->getCholeskyB();

    // mode found by Laplace approximation
    const Vector & f = laplaceApproximation[task]->getMode();

    // our kernel matrix
    const Matrix & kernelMatrix = kernelData->getKernelMatrix();
    //cerr << "K size: " << kernelMatrix.rows() << " x " << kernelMatrix.cols() << endl;

    // ------------- line 7: R = W^0.5 L^T\(L \ W^0.5)
    // Matrix W is a diagonal matrix
    // R is therefore a scaled variant of the inverse of B
    // actually: R = (W^-1 + K)^-1
    // matlab: Z = repmat(sW,1,n).*solve_chol(L, diag(sW));
    //cerr << "Hessian size: " << w.size() << endl;
    Matrix R ( w.size(), w.size(), 0.0 );
    for ( uint i = 0 ; i < w.size(); i++ )
      R(i,i) = sqrt( w[i] );
    choleskySolveMatrixLargeScale ( L, R );

    // W^0.5 * ... corresponds to a scaling of all rows
    for ( uint i = 0 ; i < R.rows(); i++ )
      for ( uint j = 0 ; j < R.cols(); j++ )
        R(i,j) *= sqrt( w[i] );
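    // At this point R = W^0.5 B^-1 W^0.5, which by the matrix inversion lemma
    // equals (W^-1 + K)^-1 (the matrix called Z in the MATLAB reference).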
    // ------------ line 8: C = L \ (W^0.5 K)
    // matlab: C = L'\(repmat(sW,1,n).*K);
    Matrix C ( kernelMatrix );
    for ( uint i = 0 ; i < C.rows(); i++ )
      for ( uint j = 0 ; j < C.cols(); j++ )
        C(i,j) *= sqrt( w[i] );
    triangleSolveMatrix ( L, C );
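    // Now C = L^-1 W^0.5 K, hence diag(K) - diag(C^T C) = diag(K - K W^0.5 B^-1 W^0.5 K)
    // = diag( (K^-1 + W)^-1 ), the diagonal of the approximate posterior covariance.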
    // ----------- line 9: s2 = - 0.5 diag(diag(K) - diag(C^T C)) nabla^3 p(y|f)
    // but there is a sign error!!!
    // matlab: s2 = 0.5*(diag(K)-sum(C.^2,1)').*d3lp;
    Vector s2 (w.size());
    for ( uint i = 0 ; i < w.size() ; i++ )
    {
      double nabla3 = likelihoodFunction->thirdgrad ( y[task][i], f[i] );
      Vector cColumn = C.getColumnRef(i);
      s2[i] = 0.5 * (kernelMatrix(i,i) - cColumn.scalarProduct(cColumn)) * nabla3;
    }
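    // s2[i] couples the posterior variance at example i with the third derivative of
    // the log likelihood; together with s3 below it captures the implicit dependence
    // of the mode f (and therefore of W) on the kernel parameters.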
    // a = K^{-1} * f (already computed by the Laplace approximation)
    const Vector & a = laplaceApproximation[task]->getAVector();

    // Due to the stationarity of f at the mode we have the consistency
    // equation: f = K gradientL = K * \nabla log p(y|f)
    const Vector & gradientL = laplaceApproximation[task]->getGradient();
    // Both equations above imply a = gradientL
    /* DEBUG: if ( (f - kernelMatrix * a).normL2() > 1e-10 ) */
    /* DEBUG: if ( (f - kernelMatrix * gradientL).normL2() > 1e-10 ) */
    /* DEBUG: if ( (a - gradientL).normL2() > 1e-10 )
        fthrow(Exception, "Oops! The a vector seems to be wrong!"); */
    // for k = 1 .. dim(theta)
    for ( uint k = 0 ; k < parameters().size() ; k++ )
    {
      // C is now reused to store our Jacobi matrix dK/dtheta_k
      // This computation could be optimized by moving it out of the outer
      // task loop, but it is left here for readability.
      //
      // ------------ line 12
      // matlab: dK = feval(covfunc{:}, hyper, x, j);
      kernel->getKernelJacobi ( k, parameters(), kernelData, C );

      // s1 = alpha'*dK*alpha/2-sum(sum(Z.*dK))/2;
      double s1 = C.bilinear ( a );

      // calculate tr(R*C) as the sum of element-wise products
      for ( uint i = 0 ; i < R.rows(); i++ )
        for ( uint j = 0 ; j < R.cols(); j++ )
          s1 -= R(i,j) * C(i,j);
      s1 *= 0.5;
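      // s1 = 0.5 * ( a^T dK a - tr(R dK) ) is the "explicit" part of the derivative,
      // i.e. the derivative of the objective with the mode f held fixed.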
      // ------------ line 13
      // b = dK*dlp;
      Vector b = C * gradientL;

      // ------------ line 14
      // matlab: s3 = b-K*(Z*b);
      Vector s3 = b - kernelMatrix*(R*b);
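      // s3 = (I + K W)^-1 dK gradientL, i.e. the derivative of the mode f with
      // respect to the k-th kernel parameter (the implicit dependence).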
      // In contrast to Rasmussen we have to flip the sign,
      // because we are minimizing the negative log marginal instead
      // of maximizing the log marginal
      // matlab: dnlZ(j) = -s1-s2'*s3;
      newGradient[k] += - s1 - s2.scalarProduct(s3);
    }
  }

  if ( verbose )
    cerr << "GPLaplaceOptimizationProblem: gradient = " << newGradient << endl;
}
void GPLaplaceOptimizationProblem::useLooParameters ( )
{
  parameters() = bestLooParameters;
  update();
}