/**
 * @file GPLaplaceOptimizationProblem.cpp
 * @brief Hyperparameter Optimization Problem for GP Regression
 * @author Erik Rodner
 * @date 12/09/2009
 */
#include <iostream>
#include <limits>

#include "core/vector/Algorithms.h"
#include "core/algebra/CholeskyRobust.h"
#include "core/algebra/CholeskyRobustAuto.h"
#include "vislearning/math/kernels/OptimKernelParameterGradInterface.h"
#include "GPLaplaceOptimizationProblem.h"

using namespace std;
using namespace NICE;
using namespace OBJREC;

GPLaplaceOptimizationProblem::GPLaplaceOptimizationProblem ( KernelData *_kernelData,
    const NICE::Vector & _y,
    ParameterizedKernel *_kernel,
    const LikelihoodFunction *_likelihoodFunction,
    LaplaceApproximation *_laplaceApproximation,
    bool verbose )
  : OptimizationProblemFirst ( _kernel->getParameterSize() ),
    kernelData(_kernelData), kernel(_kernel), likelihoodFunction(_likelihoodFunction)
{
  this->verbose = verbose;
  this->y.push_back ( _y );
  this->kernel->getParameters ( parameters() );
  this->laplaceApproximation.push_back ( _laplaceApproximation );

  if ( verbose ) {
    cerr.precision(20);
    cerr << "GPLaplaceOptimizationProblem: initial parameters = " << parameters() << endl;
  }

  bestLooParameters.resize ( parameters().size() );
  bestAvgLooError = numeric_limits<double>::max();
}

GPLaplaceOptimizationProblem::GPLaplaceOptimizationProblem ( KernelData *_kernelData,
    const VVector & _y,
    ParameterizedKernel *_kernel,
    const LikelihoodFunction *_likelihoodFunction,
    const vector<LaplaceApproximation *> & _laplaceApproximation,
    bool verbose )
  : OptimizationProblemFirst ( _kernel->getParameterSize() ),
    kernelData(_kernelData), y(_y), kernel(_kernel), likelihoodFunction(_likelihoodFunction),
    laplaceApproximation(_laplaceApproximation)
{
  this->verbose = verbose;
  this->kernel->getParameters ( parameters() );

  if ( verbose ) {
    cerr.precision(20);
    cerr << "GPLaplaceOptimizationProblem: initial parameters = " << parameters() << endl;
  }

  if ( y.size() != laplaceApproximation.size() )
    fthrow(Exception, "Number of label vectors and number of Laplace approximation objects do not correspond.");

  bestLooParameters.resize ( parameters().size() );
  bestAvgLooError = numeric_limits<double>::max();
}

GPLaplaceOptimizationProblem::~GPLaplaceOptimizationProblem ()
{
}

void GPLaplaceOptimizationProblem::update ()
{
  kernel->setParameters ( parameters() );
  kernel->updateKernelData ( kernelData );
  kernelData->updateCholeskyFactorization();

  for ( uint i = 0 ; i < y.size(); i++ )
    laplaceApproximation[i]->approximate ( kernelData, y[i], likelihoodFunction );
}

double GPLaplaceOptimizationProblem::computeObjective ()
{
  if ( verbose ) {
    cerr << "GPLaplaceOptimizationProblem: computeObjective: " << parameters() << endl;
  }

  try {
    update();
  } catch ( Exception & ) {
    // inversion problems (e.g. the Cholesky factorization failed)
    if ( verbose )
      cerr << "GPLaplaceOptimizationProblem: kernel matrix might be singular" << endl;
    return numeric_limits<double>::max();
  }

  double loglikelihood = 0.0;

  /* Objective         = - Laplace Objective + 0.5 logdet(B)
     Laplace Objective = - 0.5 a^T f + log p(y|f)
     B                 = Id + W^0.5 K W^0.5
     triangleMatrixLogDet applied to the Cholesky factor L of B corresponds to 0.5 logdet(B) */
  for ( uint k = 0 ; k < laplaceApproximation.size() ; k++ )
    loglikelihood += - laplaceApproximation[k]->getObjective()
                     + triangleMatrixLogDet ( laplaceApproximation[k]->getCholeskyB() );

  if ( verbose ) {
    cerr << "GPLaplaceOptimizationProblem: negative loglikelihood = " << loglikelihood << endl;
  }

  return loglikelihood;
}

void GPLaplaceOptimizationProblem::computeGradient ( NICE::Vector & newGradient )
{
  if ( verbose )
    cerr << "GPLaplaceOptimizationProblem: gradient computation : " << parameters() << endl;

  newGradient.resize ( parameters().size() );
  newGradient.set ( 0.0 );
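
  // The tasks (label vectors) are treated as independent GPs that share the same kernel
  // parameters, so their objectives and gradients simply add up; the loop below
  // accumulates the contribution of each Laplace approximation into newGradient.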
  for ( uint task = 0 ; task < laplaceApproximation.size(); task++ )
  {
    // NON-OPTIMIZED: some things could be computed outside of this loop (maybe)
    // The following code is a simple re-implementation of the MATLAB
    // pseudo-code provided in Rasmussen and Williams 2005

    // negative Hessian (diagonal matrix) of log p(y|f)
    const Vector & w = laplaceApproximation[task]->getHessian();
    // L*L^T = B = I + W^0.5 K W^0.5
    const Matrix & L = laplaceApproximation[task]->getCholeskyB();
    // mode found by the Laplace approximation
    const Vector & f = laplaceApproximation[task]->getMode();
    // our kernel matrix
    const Matrix & kernelMatrix = kernelData->getKernelMatrix();
    //cerr << "K size: " << kernelMatrix.rows() << " x " << kernelMatrix.cols() << endl;

    // ------------- line 7: R = W^0.5 (L^T \ (L \ W^0.5))
    // W is a diagonal matrix, so R is a scaled variant of the inverse of B,
    // actually: R = (W^-1 + K)^-1
    // matlab: Z = repmat(sW,1,n).*solve_chol(L, diag(sW));
    //cerr << "Hessian size: " << w.size() << endl;
    Matrix R ( w.size(), w.size(), 0.0 );
    for ( uint i = 0 ; i < w.size(); i++ )
      R(i,i) = sqrt( w[i] );
    choleskySolveMatrixLargeScale ( L, R );

    // W^0.5 * ... corresponds to a scaling of all rows
    for ( uint i = 0 ; i < R.rows(); i++ )
      for ( uint j = 0 ; j < R.cols(); j++ )
        R(i,j) *= sqrt( w[i] );

    // ------------ line 8: C = L \ (W^0.5 K)
    // matlab: C = L'\(repmat(sW,1,n).*K);
    Matrix C ( kernelMatrix );
    for ( uint i = 0 ; i < C.rows(); i++ )
      for ( uint j = 0 ; j < C.cols(); j++ )
        C(i,j) *= sqrt( w[i] );
    triangleSolveMatrix ( L, C );

    // ----------- line 9: s2 = - 0.5 diag(diag(K) - diag(C^T C)) \nabla^3 log p(y|f)
    // but there is a sign error!!!
    // matlab: s2 = 0.5*(diag(K)-sum(C.^2,1)').*d3lp;
    Vector s2 ( w.size() );
    for ( uint i = 0 ; i < w.size() ; i++ )
    {
      double nabla3 = likelihoodFunction->thirdgrad ( y[task][i], f[i] );
      Vector cColumn = C.getColumnRef(i);
      s2[i] = 0.5 * ( kernelMatrix(i,i) - cColumn.scalarProduct(cColumn) ) * nabla3;
    }

    // a = K^{-1} * f (already computed by the Laplace approximation)
    const Vector & a = laplaceApproximation[task]->getAVector();

    // Due to the stationarity of f at the mode we have the consistency equation
    // f = K * gradientL = K * \nabla log p(y|f)
    const Vector & gradientL = laplaceApproximation[task]->getGradient();
    // Both equations above should lead to a = gradientL.
    /* DEBUG: if ( (f - kernelMatrix * a).normL2() > 1e-10 ) */
    /* DEBUG: if ( (f - kernelMatrix * gradientL).normL2() > 1e-10 ) */
    /* DEBUG: if ( (a - gradientL).normL2() > 1e-10 )
         fthrow(Exception, "Oops! The a vector seems to be wrong!"); */
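
    // For each kernel parameter theta_k, the loop below follows the structure of the
    // GPML hyperparameter-derivative pseudo-code (the "line ..." markers above and
    // below presumably refer to its line numbers): it combines the explicit term
    //   s1 = 0.5 * ( a^T (dK/dtheta_k) a - tr(R * dK/dtheta_k) )
    // with the implicit term s2^T s3, which accounts for the dependence of the mode f
    // on the kernel parameters, and accumulates -s1 - s2^T s3, the derivative of the
    // negative log marginal likelihood.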
    // for k = 1 .. dim(theta)
    for ( uint k = 0 ; k < parameters().size() ; k++ )
    {
      // C is now our Jacobian matrix dK/dtheta_k
      // This one could be optimized by putting it out of this loop,
      // ...but I don't care right now
      //
      // ------------ line 12
      // matlab: dK = feval(covfunc{:}, hyper, x, j);
      kernel->getKernelJacobi ( k, parameters(), kernelData, C );

      // matlab: s1 = alpha'*dK*alpha/2-sum(sum(Z.*dK))/2;
      double s1 = C.bilinear ( a );
      // calculate tr(RC) as the sum of element-wise products
      for ( uint i = 0 ; i < R.rows(); i++ )
        for ( uint j = 0 ; j < R.cols(); j++ )
          s1 -= R(i,j) * C(i,j);
      s1 *= 0.5;

      // ------------ line 13
      // matlab: b = dK*dlp;
      Vector b = C * gradientL;

      // ------------ line 14
      // matlab: s3 = b-K*(Z*b);
      Vector s3 = b - kernelMatrix * ( R * b );

      // In contrast to Rasmussen and Williams we have to flip the sign, because we are
      // minimizing the negative log marginal likelihood instead of maximizing the
      // log marginal likelihood.
      // matlab: dnlZ(j) = -s1-s2'*s3;
      newGradient[k] += - s1 - s2.scalarProduct(s3);
    }
  }

  if ( verbose )
    cerr << "GPLaplaceOptimizationProblem: gradient = " << newGradient << endl;
}

void GPLaplaceOptimizationProblem::useLooParameters ()
{
  parameters() = bestLooParameters;
  update();
}
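
/* Rough usage sketch (hypothetical driver code, not part of this class; construction of
   kernelData, parameterizedKernel, likelihoodFunction and laplace is omitted):

     GPLaplaceOptimizationProblem problem ( &kernelData, binaryLabels, &parameterizedKernel,
                                            &likelihoodFunction, &laplace, true );

     double nlZ = problem.computeObjective();   // refactorizes K, reruns the Laplace
                                                // approximation and returns the negative
                                                // (approximate) log marginal likelihood
     NICE::Vector grad;
     problem.computeGradient ( grad );          // gradient of this objective w.r.t. the
                                                // kernel parameters

   Typically the problem instance is handed to a first-order optimizer operating on
   OptimizationProblemFirst, which alternates these two calls itself. */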