
First init - multi-class gp without parameter optimization

Alexander Freytag 11 years ago
commit 0f054fe75d
12 changed files with 792 additions and 0 deletions
  1. 32 0
      README
  2. 292 0
      evaluateIncrementalLearning.m
  3. 1 0
      init.m
  4. 35 0
      learn_multiclass_gp.m
  5. 51 0
      misc/covSEiso.m
  6. 31 0
      misc/covmin.m
  7. 33 0
      misc/min_kernel.m
  8. 53 0
      misc/sq_dist.m
  9. 61 0
      multiclass_gp_1D_example.m
  10. 110 0
      simulateIncrementalLearning.m
  11. 48 0
      test_multiclass_gp.m
  12. 45 0
      update_multiclass_gp.m

+ 32 - 0
README

@@ -0,0 +1,32 @@
+(1) 'Installing everything'
+This package is self-contained and does not depend on external sources. You only need to add the path accordingly:
+     Setup path: init
+
+
+(2) GP models
+  Get to know GP models on a simple 1D example: multiclass_gp_1D_example
+
+(3) Incremental learning
+  Compare efficient incremental learning and training from scratch: evaluateIncrementalLearning
+
+  What you should expect:
+  - figure 1: computation times for the model updates while samples are added one by one. The blue line (efficient updates)
+     should be below the red one (learning from scratch). Results are averaged over several runs.
+  - figure 2: accuracies obtained while incrementally increasing the training set. Both curves should be identical - if not, something
+     went wrong within the process of model updates. Since the toy example only uses artificial data, the accuracy curve is not
+     expected to look super fancy.
+
+NOTE:
+The technique for incrementally updating GP models updates 'the whole model', although for classification, only the alpha vectors actually matter. Therefore, if you are only interested
+in classifying samples irrespective of their predictive variance, you might want to use more sophisticated tricks for updating
+the alpha vectors only, which can be found exemplarily in
+
+  Alexander Freytag  and Erik Rodner and Paul Bodesheim and Joachim Denzler:
+  "Labeling examples that matter: Relevance-Based Active Learning with Gaussian Processes".
+  Proceedings of the German Conference on Pattern Recognition (GCPR), 2013.
+
+If you are further interested in GPs equipped with histogram intersection kernels, you might want to use our GPHIK implementations instead, which avoid
+explicit covariance construction and are algorithmically much more efficient (e.g., O(n log n) in training compared to O(n^3)). Refer to the following source for further details:
+  Erik Rodner and Alexander Freytag and Paul Bodesheim and Joachim Denzler:
+  " Large-Scale Gaussian Process Classification with Flexible Adaptive Histogram Kernels".
+  Proceedings of the European Conference on Computer Vision (ECCV), 2012.
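
A minimal sketch of the complete workflow described above, assuming MATLAB is started in the package root:

    % make the helper functions in misc/ available
    init;

    % (2) visualize per-task GP means, predicted labels, and variances on 1D toy data
    multiclass_gp_1D_example;

    % (3) compare efficient model updates against batch re-training;
    % called without arguments, the built-in default settings are used
    evaluateIncrementalLearning;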

+ 292 - 0
evaluateIncrementalLearning.m

@@ -0,0 +1,292 @@
+function evaluateIncrementalLearning ( settings )
+% compare multi-class incremental learning with Gaussian processes using
+% either efficient model updates or batch training from scratch
+%
+% Copyright (c) by Alexander Freytag, 2013-11-13.
+
+
+    if ( (nargin < 1) || isempty ( settings) )
+        settings = [];
+    end
+
+    %setup variables for experiments
+
+    if ( ~isfield(  settings, 'i_numDim') || isempty(settings.i_numDim) )
+        settings.i_numDim = 10;
+    end
+    if ( ~isfield(  settings, 'd_mean') || isempty(settings.d_mean) )
+        settings.d_mean = 3.0;
+    end
+    if ( ~isfield(  settings, 'd_var') || isempty(settings.d_var) )
+        settings.d_var = 1.0;
+    end
+
+    if ( ~isfield(  settings, 'offsetSpecific') || isempty(settings.offsetSpecific) )
+        settings.offsetSpecific = [ 0.0, 2.0, 4.0 ];
+    end
+    if ( ~isfield(  settings, 'varSpecific') || isempty(settings.varSpecific) )
+        settings.varSpecific = [2.0, 0.5, 1.0];
+    end
+    if ( ~isfield(  settings, 'dimToChange') || isempty(settings.dimToChange) )
+        % one dimension per class that receives the class-specific offset and variance
+        settings.dimToChange = [ 1, 3, 2, 4, 5 ];
+    end
+
+    if ( ~isfield(  settings, 'i_numClasses') || isempty(settings.i_numClasses) )
+        settings.i_numClasses = 3;
+    end
+    
+    if ( ~isfield(  settings, 'i_numSamplesPerClassTrain') || isempty(settings.i_numSamplesPerClassTrain) )
+        settings.i_numSamplesPerClassTrain = 10;    
+    end
+    if ( ~isfield(  settings, 'i_numSamplesPerClassUnlabeled') || isempty(settings.i_numSamplesPerClassUnlabeled) )
+        settings.i_numSamplesPerClassUnlabeled = 50;
+    end
+    if ( ~isfield(  settings, 'i_numSamplesPerClassTest') || isempty(settings.i_numSamplesPerClassTest) )
+        settings.i_numSamplesPerClassTest = 10;
+    end
+
+    % how many experiments do you want to average over?
+    if ( ~isfield(  settings, 'i_numRepetitions') || isempty(settings.i_numRepetitions) )
+        settings.i_numRepetitions = 10;
+    end
+    i_numRepetitions = settings.i_numRepetitions;
+
+
+    timesBatch = zeros ( i_numRepetitions, (settings.i_numClasses*settings.i_numSamplesPerClassUnlabeled+1) );
+    timesEfficient = zeros ( i_numRepetitions, (settings.i_numClasses*settings.i_numSamplesPerClassUnlabeled+1) );
+    
+    arrBatch = zeros ( i_numRepetitions, (settings.i_numClasses*settings.i_numSamplesPerClassUnlabeled+1) );
+    arrEfficient = zeros ( i_numRepetitions, (settings.i_numClasses*settings.i_numSamplesPerClassUnlabeled+1) );    
+    
+    for i_idxRep = 1:i_numRepetitions
+        
+        %%  (1) sample some features for initial train set, separate test set, and additional separate set to be added incrementally
+
+        % sample training data
+        settings.i_numSamplesPerClass = settings.i_numSamplesPerClassTrain;
+        labeledData = sampleData( settings );
+
+
+        % sample unlabeled data
+        settings.i_numSamplesPerClass = settings.i_numSamplesPerClassUnlabeled;
+        unlabeledData = sampleData( settings );
+
+
+        % sample test data
+        settings.i_numSamplesPerClass = settings.i_numSamplesPerClassTest;
+        testData = sampleData( settings );
+
+        % in which order do we want to add the 'unlabeled' samples?
+        idxToAdd = randperm( length(unlabeledData.y) ); 
+
+
+
+        %%  (2) run both techniques : efficient updates versus batch training
+
+        b_efficientUpdates = false;
+        resultsBatch = simulateIncrementalLearning ( labeledData, unlabeledData, testData, b_efficientUpdates, idxToAdd );
+        
+        timesBatch(i_idxRep, :) = resultsBatch.times;
+        arrBatch(i_idxRep, :) = resultsBatch.ARR;
+        clear ( 'resultsBatch' );
+        
+        
+        
+
+        b_efficientUpdates = true;
+        resultsEfficient = simulateIncrementalLearning ( labeledData, unlabeledData, testData, b_efficientUpdates, idxToAdd );
+        
+        timesEfficient(i_idxRep, :) = resultsEfficient.times;
+        arrEfficient(i_idxRep, :) = resultsEfficient.ARR;
+        clear ( 'resultsEfficient' );
+        
+        
+    end
+
+
+    %%  (3) evaluation
+    
+    % compute relevant values
+    timesBatch = mean ( timesBatch ) ;
+    timesEfficient = mean ( timesEfficient ) ;
+    
+    arrBatch = mean( arrBatch );
+    arrEfficient = mean ( arrEfficient );      
+    
+    % setup variables
+    if ( ( ~isfield(settings,'s_legendLocation'))  || isempty(settings.s_legendLocation) )
+       s_legendLocation = 'NorthEast';
+    else
+       s_legendLocation = settings.s_legendLocation;
+    end     
+  
+    if ( ( ~isfield(settings,'i_fontSize'))  || isempty(settings.i_fontSize) )
+       i_fontSize = 12;
+    else
+       i_fontSize = settings.i_fontSize;
+    end  
+  
+    if ( ( ~isfield(settings,'i_fontSizeAxis'))  || isempty(settings.i_fontSizeAxis) )
+       i_fontSizeAxis = 16;
+    else
+       i_fontSizeAxis = settings.i_fontSizeAxis;
+    end
+    
+    if ( ( ~isfield(settings,'c'))  || isempty(settings.c) )
+        c={[0,0,1], [1,0,0]};
+    else
+        c = settings.c;
+    end  
+  
+    if ( ( ~isfield(settings,'lineStyle'))  || isempty(settings.lineStyle) )
+        lineStyle={'-',   '--' };
+    else
+        lineStyle = settings.lineStyle;
+    end 
+  
+    if ( ( ~isfield(settings,'marker'))  || isempty(settings.marker) )
+        marker={'none', 'none'};
+    else
+        marker = settings.marker;
+    end 
+    
+    if ( ( ~isfield(settings,'linewidth'))  || isempty(settings.linewidth) )
+        linewidth=3;
+    else
+        linewidth = settings.linewidth;
+    end      
+    
+    %% plot computation times (blue should be lower than red)
+    fig_timeComp = figure;
+    set ( fig_timeComp, 'name', 'Computation Times for Model Updates');
+    hold on;
+
+    plot ( timesEfficient, 'Color', c{ 1 }, 'LineStyle', lineStyle{ 1 },  'Marker', marker{ 1 }, ...
+                'LineWidth', linewidth, 'MarkerSize',8 );
+    plot ( timesBatch, 'Color', c{ 2 }, 'LineStyle', lineStyle{ 2 },  'Marker', marker{ 2 }, ...
+                'LineWidth', linewidth, 'MarkerSize',8);
+    
+    leg=legend( {'Efficient', 'Batch'}, 'Location', s_legendLocation,'fontSize', i_fontSize,'LineWidth', 3);
+    xlabel('Number of samples added');
+    ylabel({'Time spent for model update [s]'});    
+    
+    text_h=findobj(gca,'type','text');  
+    set(text_h,'FontSize',i_fontSize);
+    set(gca, 'FontSize', i_fontSize);
+    set(get(gca,'YLabel'), 'FontSize', i_fontSizeAxis);
+    set(get(gca,'XLabel'), 'FontSize', i_fontSizeAxis);    
+
+    hold off;
+
+
+    %% plot accuracies (blue and red should be identical)
+    fig_accuracyComp = figure;
+    set ( fig_accuracyComp, 'name', 'Accuracy over time');
+    hold on;
+
+    plot ( arrEfficient, 'Color', c{ 1 }, 'LineStyle', lineStyle{ 1 },  'Marker', marker{ 1 }, ...
+                'LineWidth', linewidth, 'MarkerSize',8 );
+    plot ( arrBatch, 'Color', c{ 2 }, 'LineStyle', lineStyle{ 2 },  'Marker', marker{ 2 }, ...
+                'LineWidth', linewidth, 'MarkerSize',8);
+    leg=legend( {'Efficient', 'Batch'}, 'Location', s_legendLocation,'fontSize', i_fontSize,'LineWidth', 3);
+    xlabel('Number of samples added');
+    ylabel({'Multi-class accuracy (ARR)'});
+    
+    text_h=findobj(gca,'type','text');  
+    set(text_h,'FontSize',i_fontSize);
+    set(gca, 'FontSize', i_fontSize);
+    set(get(gca,'YLabel'), 'FontSize', i_fontSizeAxis);
+    set(get(gca,'XLabel'), 'FontSize', i_fontSizeAxis);    
+
+    hold off;
+
+
+    pause;
+    close ( fig_timeComp );
+    close ( fig_accuracyComp );
+    
+end    
+    
+function data = sampleData( settings )
+
+    if ( isfield(  settings, 'i_numDim') && ~isempty(settings.i_numDim) )
+        i_numDim = settings.i_numDim;
+    else
+        i_numDim = 5;
+    end
+    
+
+    if ( isfield(  settings, 'd_mean') && ~isempty(settings.d_mean) )
+        d_mean = settings.d_mean;
+    else
+        d_mean = 3.0;
+    end
+    
+
+    if ( isfield(  settings, 'd_var') && ~isempty(settings.d_var) )
+        d_var = settings.d_var;
+    else
+        d_var = 1.0;
+    end
+    
+    if ( isfield(  settings, 'i_numClasses') && ~isempty(settings.i_numClasses) )
+        i_numClasses = settings.i_numClasses;
+    else
+        i_numClasses = 3;
+    end
+    
+    if ( isfield(  settings, 'offsetSpecific') && ~isempty(settings.offsetSpecific) )
+        offsetSpecific = settings.offsetSpecific;
+    else
+        offsetSpecific = [ 0.0, 2.0, 4.0 ];
+    end
+ 
+
+    if ( isfield(  settings, 'varSpecific') && ~isempty(settings.varSpecific) )
+        varSpecific = settings.varSpecific;
+    else
+        varSpecific = [2.0, 0.5, 1.0];
+    end
+    
+
+    if ( isfield(  settings, 'dimToChange') && ~isempty(settings.dimToChange) )
+        dimToChange = settings.dimToChange;
+    else
+        % one dimension per class that receives the class-specific offset and variance
+        dimToChange = [ 1, 3, 2, 4, 5 ];
+    end
+    
+
+
+    
+    if ( isfield(  settings, 'i_numSamplesPerClass') && ~isempty(settings.i_numSamplesPerClass) )
+        i_numSamplesPerClass = settings.i_numSamplesPerClass;
+    else
+        i_numSamplesPerClass = 2;
+    end    
+    
+
+    % randomly compute some features
+    data.X = abs ( d_mean + d_var.*randn(i_numClasses*i_numSamplesPerClass,i_numDim) );
+
+    % disturb features for specific classes
+    for i_clIdx = 1:i_numClasses
+
+        i_idxStart = (i_clIdx-1)*i_numSamplesPerClass+1;
+        i_idxEnd = i_clIdx*i_numSamplesPerClass;
+
+        data.X(  i_idxStart:i_idxEnd,  dimToChange(i_clIdx) ) = ...
+                    abs ( data.X(i_idxStart:i_idxEnd,1)  + ... 
+                    offsetSpecific( i_clIdx) + ... %add class offset
+                    varSpecific ( i_clIdx) .*randn(i_numSamplesPerClass,1) ... % add class variance
+                 );
+    end
+
+
+    % normalize features, thereby simulating histograms, which commonly occur in
+    % computer vision applications
+    data.X = bsxfun(@times, data.X, 1./(sum(data.X, 2)));
+
+    % adapt class labels
+    data.y =  bsxfun(@times, ones(i_numSamplesPerClass,1), 1:i_numClasses );
+    data.y = data.y(:);
+
+end    
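
As a usage sketch for the function above (the field names are the defaults defined in evaluateIncrementalLearning; the values are only illustrative), a smaller and faster configuration can be run like this:

    settings = struct();
    settings.i_numRepetitions              = 3;    % average over fewer runs
    settings.i_numSamplesPerClassTrain     = 5;    % smaller initial training set
    settings.i_numSamplesPerClassUnlabeled = 20;   % fewer incremental additions
    settings.i_numSamplesPerClassTest      = 10;
    evaluateIncrementalLearning ( settings );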

+ 1 - 0
init.m

@@ -0,0 +1 @@
+addpath('misc');

+ 35 - 0
learn_multiclass_gp.m

@@ -0,0 +1,35 @@
+function model = learn_multiclass_gp(K, labels, noise)
+% Learn a multi-class GP model
+% 
+% function model = learn_multiclass_gp(K, labels, noise)
+%
+% INPUT: K -- kernel matrix of all data points
+%        labels -- multi-class labels, e.g., 1, 3, 4, 5, ...
+%        noise -- variance of Gaussian noise model within GP framework
+% Copyright (c) by Alexander Freytag, 2013-11-13.
+
+  % obtain unique class labels (length of this vector equals the number of classes)
+  unique_labels = unique(labels);
+
+  % cholesky factorization of regularized kernel matrix
+  L = chol( K + noise*eye(size(K)) , 'lower');
+
+  % store relevant data
+  model.L = L;
+  model.noise = noise;
+  model.unique_labels = unique_labels;
+
+  % loop over classes and thus over binary one-vs-all tasks
+  for k=1:length(unique_labels)
+
+    % binary label vector for each binary task
+    current_labels = 2*(labels==unique_labels(k))-1;
+
+    % precompute and store alpha vector for each binary task
+    model.alpha{k} = L'\(L\current_labels);
+
+  end
+  
+  model.y = labels;
+
+end
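
A small consistency check (a sketch, not part of the package) that each stored alpha vector solves the regularized linear system of its one-vs-all task; the min kernel from misc/ serves as an arbitrary positive semi-definite kernel here:

    X      = rand(6, 4);                     % 6 samples with 4 dimensions each
    labels = [1; 1; 2; 2; 3; 3];
    noise  = 0.1;

    K     = min_kernel(X', X');              % samples are columns for min_kernel
    model = learn_multiclass_gp(K, labels, noise);

    % for task k, (K + noise*I) * alpha_k should reproduce the +/-1 label vector
    k  = 1;
    yk = 2*(labels == model.unique_labels(k)) - 1;
    residual = norm( (K + noise*eye(size(K))) * model.alpha{k} - yk );   % ~1e-14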

+ 51 - 0
misc/covSEiso.m

@@ -0,0 +1,51 @@
+function K = covSEiso(hyp, x, z, i)
+% borrowed from the gpml toolbox of Rasmussen and Nickisch
+% see http://www.gaussianprocess.org/gpml/code/matlab/doc/
+
+
+% Squared Exponential covariance function with isotropic distance measure. The 
+% covariance function is parameterized as:
+%
+% k(x^p,x^q) = sf^2 * exp(-(x^p - x^q)'*inv(P)*(x^p - x^q)/2) 
+%
+% where the P matrix is ell^2 times the unit matrix and sf^2 is the signal
+% variance. The hyperparameters are:
+%
+% hyp = [ log(ell)
+%         log(sf)  ]
+%
+% For more help on design of covariance functions, try "help covFunctions".
+%
+% Copyright (c) by Carl Edward Rasmussen and Hannes Nickisch, 2010-09-10.
+%
+% See also COVFUNCTIONS.M.
+
+if nargin<2, K = '2'; return; end                  % report number of parameters
+if nargin<3, z = []; end                                   % make sure, z exists
+xeqz = numel(z)==0; dg = strcmp(z,'diag') && numel(z)>0;        % determine mode
+
+ell = exp(hyp(1));                                 % characteristic length scale
+sf2 = exp(2*hyp(2));                                           % signal variance
+
+% precompute squared distances
+if dg                                                               % vector kxx
+  K = zeros(size(x,1),1);
+else
+  if xeqz                                                 % symmetric matrix Kxx
+    K = sq_dist(x'/ell);
+  else                                                   % cross covariances Kxz
+    K = sq_dist(x'/ell,z'/ell);
+  end
+end
+
+if nargin<4                                                        % covariances
+  K = sf2*exp(-K/2);
+else                                                               % derivatives
+  if i==1
+    K = sf2*exp(-K/2).*K;
+  elseif i==2
+    K = 2*sf2*exp(-K/2);
+  else
+    error('Unknown hyperparameter')
+  end
+end
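
The three calling modes of covSEiso used throughout this package, sketched briefly (hyperparameters are log length-scale and log signal standard deviation, as documented above):

    loghyper = [0, 0];                        % ell = 1, sf = 1
    x = (1:5)';                               % 5 training points in 1D
    z = (0:0.5:2)';                           % 5 test points

    K   = covSEiso(loghyper, x);              % 5 x 5 symmetric training kernel
    Ks  = covSEiso(loghyper, x, z);           % 5 x 5 train-test cross kernel
    kss = covSEiso(loghyper, x, 'diag');      % 5 x 1 self-similarities (all sf^2)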

+ 31 - 0
misc/covmin.m

@@ -0,0 +1,31 @@
+function K = covmin(hyp, x, z)
+% See also COVFUNCTIONS.M.
+% borrowed from the gpml toolbox of Rasmussen and Nickisch
+% see http://www.gaussianprocess.org/gpml/code/matlab/doc/
+
+if nargin<2, K = '0'; return; end                  % report number of parameters
+if nargin<3, z = []; end                                   % make sure, z exists
+xeqz = numel(z)==0; dg = strcmp(z,'diag') && numel(x)>0;        % determine mode
+
+ell = exp(0);
+% ell = exp(hyp(1));                              % exponent for histogram entries
+sf2 = exp(0);
+% sf2 = exp(hyp(2));                              % factor for histogram entries
+
+% precompute min kernel
+if dg                                                               % vector kxx
+  K = sum(x,2);
+%    K = ones(size(x,1),1);
+else
+  if xeqz                                                 % symmetric matrix Kxx
+    K = sf2*min_kernel(x'.^ell);
+  else                                                   % cross covariances Kxz
+    K = sf2*min_kernel(x'.^ell,z'.^ell);
+  end
+end
+
+if nargin<4                                                        % covariances
+
+else                                                               % derivatives
+  error('not yet implemented')
+end
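
A brief usage sketch of covmin on L1-normalized features, i.e., the histogram-like inputs produced by sampleData in evaluateIncrementalLearning (samples are rows here, matching how the covariance functions are called in this package):

    X = rand(4, 6);                           % 4 samples with 6 non-negative dimensions
    X = bsxfun(@times, X, 1./sum(X, 2));      % L1-normalize each row (histograms)

    K   = covmin([], X);                      % 4 x 4 training kernel
    kss = covmin([], X, 'diag');              % 4 x 1 self-similarities (all 1 here)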

+ 33 - 0
misc/min_kernel.m

@@ -0,0 +1,33 @@
+function C = min_kernel(a, b)
+% borrowed from the gpml toolbox of Rasmussen and Nickisch
+% see http://www.gaussianprocess.org/gpml/code/matlab/doc/
+
+if nargin<1  || nargin>2 || nargout>1, error('Wrong number of arguments.'); end
+bsx = exist('bsxfun','builtin');      % since Matlab R2007a 7.4.0 and Octave 3.0
+if ~bsx, bsx = exist('bsxfun'); end      % bsxfun is not yet "builtin" in Octave
+[D, n] = size(a);
+
+if nargin==1                                   % only one input: kernel of a with itself
+  b = a; m = n;
+else
+  [d, m] = size(b);
+  if d ~= D, error('Error: column lengths must agree.'); end
+end
+
+if bsx                                               % compute min kernel values
+	C = zeros(n,m);
+	% This code is working but still inefficient :(
+	if ( n > m )
+	  	for i=1:m
+				C(:,i)=sum(bsxfun(@min,a,b(:,i)),1)';
+		end
+	else
+		for j=1:n
+				C(j,:)=sum(bsxfun(@min,a(:,j),b),1);
+		end
+	end
+
+else
+	error('not yet implemented...');
+end
+C = max(C,0);          % numerical noise can make C slightly negative, i.e. C > -1e-14
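
A quick sanity check of min_kernel against its textbook definition k(a,b) = sum_d min(a_d, b_d), using random non-negative vectors (samples are columns, as in sq_dist):

    A = rand(3, 5);                           % 3 dimensions, 5 samples
    B = rand(3, 2);                           % 2 further samples

    C = min_kernel(A, B);                     % 5 x 2 kernel matrix

    % reference: explicit double loop over all sample pairs
    Cref = zeros(5, 2);
    for i = 1:5
      for j = 1:2
        Cref(i,j) = sum( min(A(:,i), B(:,j)) );
      end
    end
    maxDiff = max(abs(C(:) - Cref(:)));       % ~0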

+ 53 - 0
misc/sq_dist.m

@@ -0,0 +1,53 @@
+% sq_dist - a function to compute a matrix of all pairwise squared distances
+% between two sets of vectors, stored in the columns of the two matrices, a
+% (of size D by n) and b (of size D by m). If only a single argument is given
+% or the second matrix is empty, the missing matrix is taken to be identical
+% to the first.
+%
+% Usage: C = sq_dist(a, b)
+%    or: C = sq_dist(a)  or equiv.: C = sq_dist(a, [])
+%
+% Where a is of size Dxn, b is of size Dxm (or empty), C is of size nxm.
+%
+% Copyright (c) by Carl Edward Rasmussen and Hannes Nickisch, 2010-12-13.
+
+function C = sq_dist(a, b)
+% borrowed from the gpml toolbox of Rasmussen and Nickisch
+% see http://www.gaussianprocess.org/gpml/code/matlab/doc/
+
+
+if nargin<1  || nargin>3 || nargout>1, error('Wrong number of arguments.'); end
+bsx = exist('bsxfun','builtin');      % since Matlab R2007a 7.4.0 and Octave 3.0
+if ~bsx, bsx = exist('bsxfun'); end      % bsxfun is not yet "builtin" in Octave
+[D, n] = size(a);
+
+% Computation of a^2 - 2*a*b + b^2 is less stable than (a-b)^2 because numerical
+% precision can be lost when both a and b have very large absolute value and the
+% same sign. For that reason, we subtract the mean from the data beforehand to
+% stabilise the computations. This is OK because the squared error is
+% independent of the mean.
+if nargin==1                                                     % subtract mean
+  mu = mean(a,2);
+  if bsx
+    a = bsxfun(@minus,a,mu);
+  else
+    a = a - repmat(mu,1,size(a,2));  
+  end
+  b = a; m = n;
+else
+  [d, m] = size(b);
+  if d ~= D, error('Error: column lengths must agree.'); end
+  mu = (m/(n+m))*mean(b,2) + (n/(n+m))*mean(a,2);
+  if bsx
+    a = bsxfun(@minus,a,mu); b = bsxfun(@minus,b,mu);
+  else
+    a = a - repmat(mu,1,n);  b = b - repmat(mu,1,m);
+  end
+end
+
+if bsx                                               % compute squared distances
+  C = bsxfun(@plus,sum(a.*a,1)',bsxfun(@minus,sum(b.*b,1),2*a'*b));
+else
+  C = repmat(sum(a.*a,1)',1,m) + repmat(sum(b.*b,1),n,1) - 2*a'*b;
+end
+C = max(C,0);          % numerical noise can make C slightly negative, i.e. C > -1e-14
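
The same kind of sanity check for sq_dist (samples are stored as columns, as documented above):

    a = randn(3, 4);                          % 4 samples in 3 dimensions
    b = randn(3, 2);

    C = sq_dist(a, b);                        % 4 x 2 matrix of squared distances

    % reference: explicit double loop over all sample pairs
    Cref = zeros(4, 2);
    for i = 1:4
      for j = 1:2
        Cref(i,j) = sum( (a(:,i) - b(:,j)).^2 );
      end
    end
    maxDiff = max(abs(C(:) - Cref(:)));       % ~1e-15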

+ 61 - 0
multiclass_gp_1D_example.m

@@ -0,0 +1,61 @@
+function multiclass_gp_1D_example()
+% plot GP model outputs for 1D artificial data
+% play around with param settings to become familiar with GP models
+%
+% Copyright (c) by Alexander Freytag, 2013-11-13.
+
+  % get some artificial 1D data
+  train_data = [1;2.5;4.5];
+  train_labels = [1; 2; 4];
+  test_data = (0.1:0.01:5)';
+
+  % some default settings of hyperparameters
+  loghyper = [-1 0];
+  gpnoise = 0.01;
+
+  
+  % learn multi-class GP model
+  K = feval('covSEiso', loghyper, train_data);
+  model = learn_multiclass_gp(K, train_labels, gpnoise);
+
+  % evaluate model on test data
+  Ks = feval('covSEiso', loghyper, train_data, test_data);
+  Kss = feval('covSEiso', loghyper, test_data, 'diag');  
+  [mu, pred_labels, variance] = test_multiclass_gp(model, Ks, Kss');
+
+  % visualize everything nicely
+  f1 = figure(1);
+  plot(train_data(1), 1, 'bx');
+  title('Mean curve of each binary task');
+
+  hold on
+  plot(train_data(2), 1, 'gx');
+  plot(train_data(3), 1, 'rx');
+
+  plot(test_data, mu(:,1), 'b');
+  plot(test_data, mu(:,2), 'g');
+  plot(test_data, mu(:,3), 'r');
+  hold off
+
+  f2 = figure(2);
+  plot(test_data, pred_labels, 'kx');
+  title('Predicted multi-class labels');
+
+  f3 = figure(3);
+  title('Multi-class posterior mean and variance');
+  colors = {'b', 'g', 'r'};
+  hold on
+  max_mean = max(mu,[],2);
+  lower = max_mean-sqrt(variance);
+  upper = max_mean+sqrt(variance);
+  p = [test_data, lower; flipdim(test_data,1),flipdim(upper,1)];
+  fill(p(:,1), p(:,2), 'y');
+  for k=1:length(model.unique_labels)
+
+    tmp_ID = pred_labels == model.unique_labels(k);
+    plot(test_data(tmp_ID),mu(tmp_ID,k),colors{k}, 'LineWidth', 2);
+
+  end
+  hold off
+
+end

+ 110 - 0
simulateIncrementalLearning.m

@@ -0,0 +1,110 @@
+function results = simulateIncrementalLearning ( labeledData, unlabeledData, testData, b_efficientUpdates, idxToAdd )
+% Copyright (c) by Alexander Freytag, 2013-11-13.
+
+    if ( (nargin < 4) || isempty (b_efficientUpdates) )
+        b_efficientUpdates = true;
+    end
+    
+    if ( (nargin < 5) || isempty (idxToAdd) )
+        idxToAdd = 1:length(unlabeledData.y);
+    end    
+    
+    labeled_X = labeledData.X;
+    labeled_y = labeledData.y;
+    
+    unlabeled_X = unlabeledData.X;
+    unlabeled_y = unlabeledData.y;
+    
+    test_X = testData.X;
+    test_y = testData.y;     
+
+    % output
+    ARRscore = zeros ( 1+length(unlabeled_y ) ,1 );
+    times = zeros ( 1+length(unlabeled_y ) ,1 );
+    
+    %% (2.1) initially train the model
+    
+    % settings
+    gpnoise = 0.1;
+    cov = {'covmin'};
+    loghyper = [];
+ 
+    K = feval(cov{:},loghyper, labeled_X);
+  
+    timeStamp_trainStart = tic;
+    model = learn_multiclass_gp(K, labeled_y, gpnoise);
+    times(1) = toc(timeStamp_trainStart);
+  
+    
+    %% (2.2) test initial model
+    % compute similarities to test samples
+    Ks = feval(cov{:},loghyper, labeled_X, test_X);    
+ 
+    [~, pred_labels] = test_multiclass_gp(model, Ks);
+    
+    % overall accuracy; with the class-balanced test set used here this equals the average per-class recognition rate (ARR)
+    numCorrect = norm(double(pred_labels == test_y), 1);
+    testSize = double(size(test_y,1)); 
+    ARRscore(1) = double(numCorrect)/double(testSize);
+    
+    
+    %% (3) perform incremental updates
+    
+
+    
+    for i_idxAdd = 1:length(unlabeled_y)
+        
+        %%%%%%%%%%%%%%
+        %% update
+        %%%%%%%%%%%%%%
+        
+        % get new sample to add
+        xNew = unlabeled_X( idxToAdd(i_idxAdd), : );
+        % simulate asking of user
+        yNew = unlabeled_y ( idxToAdd(i_idxAdd) );
+        
+        if ( b_efficientUpdates )
+        
+            % compute kernel values
+            kss = feval(cov{:},loghyper, xNew);        
+            ksNewSample = feval(cov{:},loghyper, labeled_X, xNew);
+            
+
+            timeStamp_updateStart = tic;
+            model = update_multiclass_gp(model, kss, ksNewSample, yNew);
+            times(i_idxAdd+1) = toc(timeStamp_updateStart);
+
+            labeled_X = [ labeled_X; xNew];
+        else
+            kss = feval(cov{:},loghyper, xNew);        
+            ksNewSample = feval(cov{:},loghyper, labeled_X, xNew); 
+            K = [K, ksNewSample;ksNewSample',kss];
+            
+            labeled_X = [ labeled_X; xNew];
+            labeled_y = [ labeled_y; yNew];         
+
+            timeStamp_updateStart = tic;
+            model = learn_multiclass_gp(K, labeled_y, gpnoise);
+            times(i_idxAdd+1) = toc(timeStamp_updateStart);
+         
+        end
+        
+        
+        %%%%%%%%%%%%%
+        %% test
+        %%%%%%%%%%%%%
+        
+        Ks = [Ks; feval(cov{:},loghyper, xNew, test_X)]; 
+
+        [~, pred_labels] = test_multiclass_gp(model, Ks);
+
+        % overall accuracy; with the class-balanced test set used here this equals the average per-class recognition rate (ARR)
+        numCorrect = norm(double(pred_labels == test_y), 1);
+        testSize = double(size(test_y,1)); 
+        ARRscore(i_idxAdd+1) = double(numCorrect)/double(testSize);        
+    end
+    
+    results.ARR = ARRscore;
+    results.times = times;
+end
+
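
A hedged sketch of calling simulateIncrementalLearning directly on hand-crafted data structures (the field names X and y are the ones read above; the data is tiny and purely illustrative):

    % three classes with non-negative features; samples are rows of X
    labeledData.X   = abs(randn( 9, 10));   labeledData.y   = repmat((1:3)',  3, 1);
    unlabeledData.X = abs(randn(30, 10));   unlabeledData.y = repmat((1:3)', 10, 1);
    testData.X      = abs(randn(15, 10));   testData.y      = repmat((1:3)',  5, 1);

    % efficient rank-one updates, adding the 'unlabeled' samples in random order
    results = simulateIncrementalLearning ( labeledData, unlabeledData, testData, ...
                                            true, randperm(30) );

    % results.times(1) is the initial training time, results.times(2:end) the update
    % times, and results.ARR the accuracy after each added sample
    plot ( results.ARR );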

+ 48 - 0
test_multiclass_gp.m

@@ -0,0 +1,48 @@
+function [pred_mean, pred_labels, pred_var] = test_multiclass_gp(model, Ks, Kss)
+% Prediction with multi-class GP models, outputs predictive mean and variance
+% 
+% function [pred_mean, pred_labels, pred_var] = test_multiclass_gp(model, Ks, Kss)
+%
+% INPUT: model -- model structure obtained from function "learn_multiclass_gp"
+%        Ks -- n x m matrix of kernel values between n training and m test samples
+%        Kss -- m x 1 vector of self-similarities of test samples
+%
+% OUTPUT: pred_mean -- m x c matrix of predictive mean values for each of the m test samples and each of the c binary tasks
+%         pred_labels -- m x 1 vector containing predicted multi-class label for each test sample
+%         pred_var -- m x 1 vector containing predicted variance for each test sample
+%
+% NOTE: to get the maximum mean value for each test sample out of the c binary tasks, use: max_mean = max(pred_mean,[],2);
+% Copyright (c) by Alexander Freytag, 2013-11-13.
+
+    if ( nargin < 3 )
+        Kss = [];
+    end
+
+  pred_mean = zeros( size(Ks,2),length(model.unique_labels) );
+
+  % loop over classes and thus over binary one-vs-all tasks
+  for k=1:length(model.unique_labels)
+
+    pred_mean(:,k) = Ks'*model.alpha{k};
+  
+  end
+
+  % obtain predicted labels using max-pooling
+  [~, maxID] = max(pred_mean,[],2);
+  pred_labels = model.unique_labels(maxID);
+
+  % compute predictive variances if necessary
+  if nargout > 2
+    
+    v = model.L\Ks;
+    
+    if ( isempty(Kss) ) 
+        disp('No Kss given - break!');
+        return;
+    end
+    
+    pred_var = Kss' - sum(v .* v)' + model.noise;
+
+  end
+
+end
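
A brief sketch of the predictive-variance path (third output), reusing the kernels from the 1D example; note that Kss is expected as a 1 x m row of self-similarities, matching the call in multiclass_gp_1D_example:

    loghyper = [-1 0];   noise = 0.01;
    Xtrain   = [1; 2.5; 4.5];   ytrain = [1; 2; 4];
    Xtest    = (0:0.25:5)';

    K     = covSEiso(loghyper, Xtrain);
    model = learn_multiclass_gp(K, ytrain, noise);

    Ks  = covSEiso(loghyper, Xtrain, Xtest);      % n x m cross kernel
    kss = covSEiso(loghyper, Xtest, 'diag');      % m x 1 self-similarities

    [mu, pred_labels, pred_var] = test_multiclass_gp(model, Ks, kss');
    % mu: m x 3 per-task means, pred_labels: m x 1, pred_var: m x 1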

+ 45 - 0
update_multiclass_gp.m

@@ -0,0 +1,45 @@
+function model = update_multiclass_gp(modelOld, kss, ksNewSample, newLabel)
+% Update a multi-class GP model
+% 
+% function model = update_multiclass_gp(modelOld, kss, ksNewSample, newLabel)
+%
+% INPUT: modelOld    -- the old gp-multiclass model (relevant: noise, L, y)
+%        kss         -- self similarity of new example
+%        ksNewSample -- kernel values between new and old training examples
+%        newLabel    -- label of new example
+% Copyright (c) by Alexander Freytag, 2013-11-13.
+%%
+
+  % Cholesky update to include the selected sample
+  l = modelOld.L\ksNewSample;
+  lss = sqrt(kss+modelOld.noise-l'*l);
+  L = [modelOld.L, zeros(length(l),1); l', lss ];  
+  
+  %obtain new labels in a single vector
+  newLabels = [modelOld.y; newLabel];
+  
+  % copy all relevant data
+  model.noise = modelOld.noise;
+  
+  % store relevant new data
+  model.L = L;
+  
+  % obtain unique class labels (length of this vector equals the number of classes)  
+  % Note: we cannot simply reuse the previously known labels, since newLabel
+  % might belong to a completely new class
+  model.unique_labels = unique(newLabels);
+
+  % loop over classes and thus over binary one-vs-all tasks
+  for k=1:length(model.unique_labels)
+
+    % binary label vector for each binary task
+    current_labels = 2*(newLabels==model.unique_labels(k))-1;
+
+    % precompute and store alpha vector for each binary task
+    model.alpha{k} = L'\(L\current_labels);
+
+  end
+  
+  model.y = newLabels;
+
+end
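
A small consistency check (a sketch) that the rank-one Cholesky update above reproduces a full factorization of the grown kernel matrix:

    X    = rand(5, 3);   y = [1; 1; 2; 2; 3];   noise = 0.1;
    xNew = rand(1, 3);   yNew = 3;

    K     = min_kernel(X', X');
    model = learn_multiclass_gp(K, y, noise);

    kss = min_kernel(xNew', xNew');               % self-similarity of the new sample
    ks  = min_kernel(X', xNew');                  % 5 x 1 similarities to old samples

    modelUp = update_multiclass_gp(model, kss, ks, yNew);

    % reference: factorize the augmented kernel matrix from scratch
    Kaug = [K, ks; ks', kss];
    Lref = chol(Kaug + noise*eye(6), 'lower');
    maxDiff = max(max(abs(modelUp.L - Lref)));    % ~1e-15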