@@ -0,0 +1,345 @@
+/**
+* @file RegRandomForests.cpp
+* @brief implementation of random set forests for regression
+* @author Sven Sickert
+* @date 06/28/2013
+
+*/
+
+#ifdef NICE_USELIB_OPENMP
+#include <omp.h>
+#endif
+
+#include <iostream>
+#include <math.h>   // for fabs() in calcOutOfBagEstimates()
+#include <assert.h>
+
+#include "vislearning/regression/randomforest/RegRandomForests.h"
|
|
|
+#include "vislearning/regression/randomforest/RTBRandom.h"
|
|
|
+#include "vislearning/regression/randomforest/RTBGrid.h"
|
|
|
+#include "vislearning/regression/randomforest/RTBClusterRandom.h"
|
|
|
+#include "vislearning/regression/randomforest/RTBMeanPostImprovement.h"
|
|
|
+
|
|
|
+using namespace OBJREC;
|
|
|
+
|
|
|
+using namespace std;
|
|
|
+
|
|
|
+using namespace NICE;
|
|
|
+
|
|
|
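+/** default constructor: creates an empty forest without a tree builder;
+    teach() asserts that a builder has been set up first */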
+RegRandomForests::RegRandomForests()
+{
+  builder = NULL;
+  minimum_error_reduction = 0.0;
+  enableOutOfBagEstimates = false;
+}
+
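+/** constructor reading all settings from a config section; see the
+    gS/gD/gB/gI calls below for the supported keys and their defaults */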
+RegRandomForests::RegRandomForests( const Config *_conf,
+                                    std::string section ) : conf(_conf)
+{
+  std::string builder_method = conf->gS(section, "builder", "random");
+  minimum_error_reduction = conf->gD(section, "minimum_error_reduction", 10e-3);
+  enableOutOfBagEstimates = conf->gB(section, "enable_out_of_bag_estimates", false);
+
+  confsection = section;
+
+  if ( builder_method == "none" ) {
+    // do not initialize
+    builder = NULL;
+  }
+  else {
+    number_of_trees = conf->gI(section, "number_of_trees", 20 );
+    features_per_tree = conf->gD(section, "features_per_tree", 1.0 );
+    samples_per_tree = conf->gD(section, "samples_per_tree", 0.2 );
+
+    if ( builder_method == "random" )
+    {
+      std::string builder_section = conf->gS(section, "builder_section", "RTBRandom");
+      builder = new RTBRandom ( conf, builder_section );
+    }
+    else if ( builder_method == "grid" )
+    {
+      std::string builder_section = conf->gS(section, "builder_section", "RTBGrid");
+      builder = new RTBGrid ( conf, builder_section );
+    }
+    else if ( builder_method == "cluster_random" )
+    {
+      std::string builder_section = conf->gS(section, "builder_section", "RTBClusterRandom");
+      builder = new RTBClusterRandom ( conf, builder_section );
+    }
+    else if ( builder_method == "mean_post_improvement" )
+    {
+      std::string builder_section = conf->gS(section, "builder_section", "RTBMeanPostImprovement");
+      builder = new RTBMeanPostImprovement ( conf, builder_section );
+    } else {
+      fprintf (stderr, "RegressionTreeBuilder %s not yet implemented !\n", builder_method.c_str() );
+      exit(-1);
+    }
+  }
+}
+
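+/* A typical configuration section for this class could look as follows
+   (a sketch: the section name is whatever the caller passes in, the key
+   names match the calls above, and the values are only illustrative):
+
+     [RegRandomForests]
+     builder                     = random
+     builder_section             = RTBRandom
+     number_of_trees             = 20
+     features_per_tree           = 1.0
+     samples_per_tree            = 0.2
+     minimum_error_reduction     = 1e-2
+     enable_out_of_bag_estimates = true
+*/
+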
+RegRandomForests::~RegRandomForests()
+{
+  for ( vector<RegressionTree *>::iterator it = forest.begin();
+        it != forest.end(); it++ )
+    delete (*it);
+
+  if ( builder != NULL )
+    delete builder;
+}
+
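+/** out-of-bag error estimation as proposed by Breiman: every example is
+    predicted using only the trees that did not train on it, and the
+    resulting (absolute error, true value) pairs are stored in oobResults */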
+void RegRandomForests::calcOutOfBagEstimates (
+    std::vector< std::vector<int> > & outofbagtrees,
+    NICE::VVector x,
+    NICE::Vector y )
+{
+  oobResults.clear();
+
+  // calculate out-of-bag regression results as suggested by Breiman:
+  // out of bag = training data not used to build a single tree
+  // is used as testing data for that tree
+  long index = 0;
+  for ( int i = 0; i < (int)x.size(); i++, index++ )
+  {
+    double trueValue = y[i];
+    const vector<int> & trees = outofbagtrees[index];
+
+    if ( trees.empty() ) continue;
+
+    double predValue = predict ( x[i], trees );
+
+    double predError = fabs( trueValue - predValue );
+    oobResults.push_back ( pair<double, double> ( predError, trueValue ) );
+  }
+}
+
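+/** collect, for a single feature vector x, the leaf node it reaches in
+    every tree of the forest (descending at most to the given depth) */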
+void RegRandomForests::getLeafNodes ( NICE::Vector x,
+                                      std::vector<RegressionNode *> & leafNodes,
+                                      int depth )
+{
+  leafNodes.reserve ( forest.size() );
+  for ( vector<RegressionTree *>::const_iterator it = forest.begin();
+        it != forest.end(); it++ )
+  {
+    RegressionTree & rt = *(*it);
+    RegressionNode *leaf = rt.getLeafNode ( x, depth );
+    leafNodes.push_back ( leaf );
+  }
+}
+
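+/** collect all leaf nodes of all trees; the example indices stored in
+    each leaf are translated from per-tree subset indices back to indices
+    into the original training set using exselection */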
+void RegRandomForests::getAllLeafNodes ( vector<RegressionNode *> & leafNodes)
+{
+  int z = 0;
+  for ( vector<RegressionTree *>::const_iterator it = forest.begin();
+        it != forest.end(); it++, z++ )
+  {
+    RegressionTree & rt = *(*it);
+    vector<RegressionNode *> leaves = rt.getAllLeafNodes();
+    for ( int j = 0; j < (int)leaves.size(); j++ )
+    {
+      for ( int k = 0; k < (int)leaves[j]->trainExamplesIndices.size(); k++ )
+      {
+        leaves[j]->trainExamplesIndices[k] = exselection[z][leaves[j]->trainExamplesIndices[k]];
+      }
+      leafNodes.push_back(leaves[j]);
+    }
+  }
+}
+
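+/** learn the ensemble from training data: for every tree a random subset
+    of the examples (fraction samples_per_tree) is drawn, out-of-bag
+    membership is recorded, and the configured builder grows the tree on
+    that subset */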
+void RegRandomForests::teach ( const NICE::VVector & x, const NICE::Vector & y )
+{
+  cerr << "RegRandomForests::teach()" << endl;
+  assert( builder != NULL );
+
+  int featuresCount = (int) (x[0].size() * features_per_tree );
+  fprintf(stderr, "RegRandomForests: number of features %d\n", (int)x[0].size() );
+
+  vector< vector<int> > outofbagtrees;
+  outofbagtrees.resize( x.size() );
+
+  for ( int k = 0; k < number_of_trees; k++ )
+  {
+    vector<int> tmp;
+    exselection.push_back(tmp);
+  }
+
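+  // build the trees in parallel; the shared containers outofbagtrees and
+  // forest are only modified inside the critical sections below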
+  #pragma omp parallel for
+  for ( int k = 0; k < number_of_trees; k++ )
+  {
+    fprintf( stderr, "[ -- building tree %d/%d -- ]\n", k + 1, number_of_trees);
+
+    vector<int> examples_index;
+    for ( int i = 0; i < (int)x.size(); i++ )
+    {
+      examples_index.push_back( i );
+    }
+
+    int trainingExamples = (int)(examples_index.size() * samples_per_tree);
+    fprintf (stderr, "RegRandomForests: selection of %d examples for each tree\n", trainingExamples );
+
+    if ( (trainingExamples < 3) && ((int)examples_index.size() > trainingExamples) )
+    {
+      fprintf(stderr, "RegRandomForests: number of examples < 3 !! available=%d, trainingExamples=%d\n",
+              (int)x.size(), trainingExamples);
+      trainingExamples = (int)examples_index.size();
+      fprintf(stderr, "RegRandomForests: I will use all %d examples. !!\n", trainingExamples);
+    }
+
+    if ( samples_per_tree < 1.0 )
+      random_shuffle( examples_index.begin(), examples_index.end() );
+
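+    // copy the selected examples and their target values into the
+    // training set of tree k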
+    VVector subset;
+    Vector subval ( trainingExamples );
+    for ( int e = 0; e < trainingExamples; e++ )
+    {
+      exselection[k].push_back( examples_index[e] );
+      subset.push_back( x[ examples_index[e] ] );
+      subval.set( e, y[ examples_index[e] ] );
+    }
+
+    // set out-of-bag trees: remember for every unused example
+    // that tree k has not seen it
+    for ( uint e = trainingExamples; e < examples_index.size(); e++ )
+    {
+      int index = examples_index[e];
+      #pragma omp critical
+      outofbagtrees[index].push_back(k);
+    }
+
+    /******* select a random feature set *******/
+    vector<int> features_subset;
+    for ( int j = 0; j < (int)x[0].size(); j++ )
+      features_subset.push_back( j );
+
+    random_shuffle( features_subset.begin(), features_subset.end() );
+    while ((int)features_subset.size() > featuresCount)
+      features_subset.pop_back();
+
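+    // note: this feature subset is drawn here but not handed to the
+    // builder below, which receives the full feature vectors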
+    /******* training of an individual tree ****/
+    RegressionTree *tree = new RegressionTree( conf );
+
+    builder->build( *tree, subset, subval );
+
+    /******* prune tree using least squares criterion *****/
+    //if ( minimum_error_reduction > 0.0 )
+    //  tree->pruneTreeLeastSquares( minimum_error_reduction );
+
+    /******* add individual tree to ensemble *****/
+    #pragma omp critical
+    forest.push_back(tree);
+  }
+
+  if (enableOutOfBagEstimates)
+    calcOutOfBagEstimates(outofbagtrees, x, y);
+}
+
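+/** prediction restricted to a given selection of trees; the selection
+    must be non-empty (see the assert before the division below) */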
+double RegRandomForests::predict ( const NICE::Vector & x,
|
|
|
+ const vector< int > & outofbagtrees )
|
|
|
+{
|
|
|
+ // predict using only a selection of all trees
|
|
|
+ // contained in outofbagtrees
|
|
|
+
|
|
|
+ double overall_prediction = 0.0;
|
|
|
+ int treecount = 0;
|
|
|
+
|
|
|
+ for ( vector<int>::const_iterator it = outofbagtrees.begin();
|
|
|
+ it != outofbagtrees.end();
|
|
|
+ it++ )
|
|
|
+ {
|
|
|
+ assert ( *it < (int)forest.size() );
|
|
|
+ RegressionTree & rt = *(forest[(*it)]);
|
|
|
+ double predVal;
|
|
|
+ rt.traverse( x, predVal );
|
|
|
+
|
|
|
+ overall_prediction += predVal;
|
|
|
+ treecount++;
|
|
|
+ }
|
|
|
+
|
|
|
+ overall_prediction /= treecount;
|
|
|
+
|
|
|
+ return overall_prediction;
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
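+/** prediction using the complete forest: the estimates of all trees
+    are averaged */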
+double RegRandomForests::predict ( const NICE::Vector & x )
+{
+  double overall_prediction = 0.0;
+  int treecount = 0;
+
+  for ( vector<RegressionTree *>::const_iterator it = forest.begin();
+        it != forest.end();
+        it++ )
+  {
+    RegressionTree & rt = *(*it);
+    double predVal;
+    rt.traverse( x, predVal );
+
+    overall_prediction += predVal;
+    treecount++;
+  }
+
+  // guard against division by zero for an empty forest
+  assert ( treecount > 0 );
+  overall_prediction /= treecount;
+
+  return overall_prediction;
+}
+
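+/** restore the forest from a stream written by store(): trees are read
+    as long as a "TREE <index>" tag is found; restored trees are pruned
+    again if minimum_error_reduction is set */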
+void RegRandomForests::restore(istream & is, int format)
+{
+  std::string tag;
+  int index;
+
+  while ( (is >> tag) && (tag == "TREE") )
+  {
+    is >> index;
+    RegressionTree *rt = new RegressionTree ( conf );
+    rt->restore ( is );
+    if ( minimum_error_reduction > 0.0 )
+      rt->pruneTreeLeastSquares ( minimum_error_reduction );
+
+    forest.push_back(rt);
+  }
+}
+
+void RegRandomForests::store(ostream & os, int format) const
+{
+  int index = 0;
+  for ( vector<RegressionTree *>::const_iterator it = forest.begin();
+        it != forest.end(); it++, index++ )
+  {
+    const RegressionTree & rt = *(*it);
+    os << "TREE " << index << endl;
+    rt.store ( os, format );
+    os << "ENDTREE ";
+  }
+}
+
+void RegRandomForests::clear()
+{
+  for ( vector<RegressionTree *>::iterator it = forest.begin();
+        it != forest.end(); it++ )
+    delete (*it);
+
+  forest.clear();
+}
+
+void RegRandomForests::indexDescendants(
+    map<RegressionNode *, pair<long, int> > & index) const
+{
+  long maxindex = 0;
+  for ( vector<RegressionTree *>::const_iterator it = forest.begin();
+        it != forest.end(); it++ )
+    (*it)->indexDescendants ( index, maxindex );
+}
+
+void RegRandomForests::resetCounters()
+{
+  for ( vector<RegressionTree *>::const_iterator it = forest.begin();
+        it != forest.end(); it++ )
+    (*it)->resetCounters ();
+}
+
+void RegRandomForests::setComplexity(int size)
+{
+  fprintf (stderr, "RegRandomForests: set complexity to %d, overwriting current value %d\n",
+           size, number_of_trees );
+  number_of_trees = size;
+}
+