/**
 * @file RegRandomForests.cpp
 * @brief implementation of random set forests for regression
 * @author Sven Sickert
 * @date 06/28/2013
 */

#ifdef NICE_USELIB_OPENMP
#include <omp.h>
#endif

#include <iostream>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <algorithm>

#include "vislearning/regression/randomforest/RegRandomForests.h"
#include "vislearning/regression/randomforest/RTBRandom.h"
#include "vislearning/regression/randomforest/RTBLinear.h"
#include "vislearning/regression/randomforest/RTBMinDist.h"
#include "vislearning/regression/randomforest/RTBGrid.h"
#include "vislearning/regression/randomforest/RTBClusterRandom.h"
#include "vislearning/regression/randomforest/RTBMeanPostImprovement.h"

using namespace OBJREC;
using namespace std;
using namespace NICE;

RegRandomForests::RegRandomForests()
{
  builder = NULL;
  minimum_error_reduction = 0.0;
  enableOutOfBagEstimates = false;
}

RegRandomForests::RegRandomForests( const Config *_conf, std::string section )
    : conf(_conf)
{
  std::string builder_method = conf->gS(section, "builder", "random");
  minimum_error_reduction = conf->gD(section, "minimum_error_reduction", 10e-3);
  enableOutOfBagEstimates = conf->gB(section, "enable_out_of_bag_estimates", false);

  confsection = section;

  if ( builder_method == "none" )
  {
    // do not initialize
    builder = NULL;
  }
  else
  {
    number_of_trees = conf->gI(section, "number_of_trees", 20 );
    features_per_tree = conf->gD(section, "features_per_tree", 1.0 );
    samples_per_tree = conf->gD(section, "samples_per_tree", 0.2 );

    if ( builder_method == "random" )
    {
      std::string builder_section = conf->gS(section, "builder_section", "RTBRandom");
      builder = new RTBRandom ( conf, builder_section );
    }
    else if ( builder_method == "min_dist" )
    {
      std::string builder_section = conf->gS(section, "builder_section", "RTBMinDist");
      builder = new RTBMinDist ( conf, builder_section );
    }
    else if ( builder_method == "linear" )
    {
      std::string builder_section = conf->gS(section, "builder_section", "RTBRandom");
      builder = new RTBLinear ( conf, builder_section );
    }
    else if ( builder_method == "grid" )
    {
      std::string builder_section = conf->gS(section, "builder_section", "RTBGrid");
      builder = new RTBGrid ( conf, builder_section );
    }
    else if ( builder_method == "cluster_random" )
    {
      std::string builder_section = conf->gS(section, "builder_section", "RTBClusterRandom");
      builder = new RTBClusterRandom ( conf, builder_section );
    }
    else if ( builder_method == "mean_post_improvement" )
    {
      std::string builder_section = conf->gS(section, "builder_section", "RTBMeanPostImprovement");
      builder = new RTBMeanPostImprovement ( conf, builder_section );
    }
    else
    {
      fprintf (stderr, "RegressionTreeBuilder %s not yet implemented!\n",
               builder_method.c_str() );
      exit(-1);
    }
  }
}

RegRandomForests::~RegRandomForests()
{
  for ( vector<RegressionTree *>::iterator it = forest.begin();
        it != forest.end(); it++ )
    delete (*it);

  if ( builder != NULL )
    delete builder;
}

void RegRandomForests::calcOutOfBagEstimates (
    std::vector< std::vector<int> > & outofbagtrees,
    NICE::VVector x,
    NICE::Vector y )
{
  oobResults.clear();

  // Calculate out-of-bag regression results as suggested by Breiman:
  // training data not used to build an individual tree serves as
  // test data for that tree.
  long index = 0;
  for ( int i = 0; i < (int)x.size(); i++, index++ )
  {
    double trueValue = y[i];
    const vector<int> & trees = outofbagtrees[index];

    // skip examples that were used for training in every tree
    if ( trees.size() <= 0 ) continue;

    double predValue = predict ( x[i], trees );
    double predError = fabs( trueValue - predValue );
    oobResults.push_back ( pair<double, double> ( predError, trueValue ) );
  }
}
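/* A minimal sketch (not part of the original interface) of how the collected
   oobResults could be aggregated into a single out-of-bag error estimate,
   e.g. inside a hypothetical member function of this class:

     double sumError = 0.0;
     for ( size_t i = 0; i < oobResults.size(); i++ )
       sumError += oobResults[i].first;   // first = absolute prediction error
     double oobMAE = sumError / oobResults.size();

   The mean absolute out-of-bag error approximates the generalization error
   without requiring a separate validation set. */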
void RegRandomForests::getLeafNodes ( NICE::Vector x,
                                      std::vector<RegressionNode *> & leafNodes,
                                      int depth )
{
  leafNodes.reserve ( forest.size() );
  for ( vector<RegressionTree *>::const_iterator it = forest.begin();
        it != forest.end(); it++ )
  {
    RegressionTree & rt = *(*it);
    RegressionNode *leaf = rt.getLeafNode ( x, depth );
    leafNodes.push_back ( leaf );
  }
}

void RegRandomForests::getAllLeafNodes ( vector<RegressionNode *> & leafNodes )
{
  int z = 0;
  for ( vector<RegressionTree *>::const_iterator it = forest.begin();
        it != forest.end(); it++, z++ )
  {
    RegressionTree & rt = *(*it);
    vector<RegressionNode *> leaves = rt.getAllLeafNodes();
    for ( int j = 0; j < (int)leaves.size(); j++ )
    {
      // map the per-tree training indices back to indices
      // into the original training set
      for ( int k = 0; k < (int)leaves[j]->trainExamplesIndices.size(); k++ )
      {
        leaves[j]->trainExamplesIndices[k] =
          exselection[z][ leaves[j]->trainExamplesIndices[k] ];
      }
      leafNodes.push_back( leaves[j] );
    }
  }
}

void RegRandomForests::teach ( const NICE::VVector & x, const NICE::Vector & y )
{
  cerr << "RegRandomForests::teach()" << endl;
  assert( builder != NULL );

  int featuresCount = (int)( x[0].size() * features_per_tree );
  fprintf(stderr, "RegRandomForests: number of features %d\n", (int)x[0].size() );

  vector< vector<int> > outofbagtrees;
  outofbagtrees.resize( x.size() );

  for ( int k = 0; k < number_of_trees; k++ )
  {
    vector<int> tmp;
    exselection.push_back(tmp);
  }

#pragma omp parallel for
  for ( int k = 0; k < number_of_trees; k++ )
  {
    fprintf( stderr, "[ -- building tree %d/%d -- ]\n", k + 1, number_of_trees );

    vector<int> examples_index;
    for ( int i = 0; i < (int)x.size(); i++ )
      examples_index.push_back( i );

    int trainingExamples = (int)( examples_index.size() * samples_per_tree );
    fprintf( stderr, "RegRandomForests: selection of %d examples for each tree\n",
             trainingExamples );

    if ( (trainingExamples < 3) && ((int)examples_index.size() > trainingExamples) )
    {
      fprintf( stderr, "RegRandomForests: number of examples < 3 !! minExamples=%d, trainingExamples=%d\n",
               (int)x.size(), trainingExamples );
      trainingExamples = examples_index.size();
      fprintf( stderr, "RegRandomForests: I will use all %d examples. !!\n",
               trainingExamples );
    }
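    // Bagging step: when only a fraction of the data is used per tree
    // (samples_per_tree < 1.0), shuffle the index list and keep the first
    // 'trainingExamples' entries; the remaining indices form the
    // out-of-bag set for this tree further below.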
    if ( samples_per_tree < 1.0 )
      random_shuffle( examples_index.begin(), examples_index.end() );

    VVector subset;
    Vector subval ( trainingExamples );
    for ( int e = 0; e < trainingExamples; e++ )
    {
      exselection[k].push_back( examples_index[e] );
      subset.push_back( x[ examples_index[e] ] );
      subval.set( e, y[ examples_index[e] ] );
    }

    // set out-of-bag trees
    for ( uint e = trainingExamples; e < examples_index.size(); e++ )
    {
      int index = examples_index[e];
#pragma omp critical
      outofbagtrees[index].push_back(k);
    }

    /******* select a random feature set *******/
    vector<int> features_subset;
    for ( int j = 0; j < (int)x[0].size(); j++ )
      features_subset.push_back( j );

    random_shuffle( features_subset.begin(), features_subset.end() );
    while ( (int)features_subset.size() > featuresCount )
      features_subset.pop_back();

    /******* training of an individual tree ****/
    RegressionTree *tree = new RegressionTree( conf );
    builder->build( *tree, subset, subval );

    /******* prune tree using least squares criterion *****/
    //if ( minimum_error_reduction > 0.0 )
    //  tree->pruneTreeLeastSquares( minimum_error_reduction );

    /******* add individual tree to ensemble *****/
#pragma omp critical
    forest.push_back(tree);
  }

  if ( enableOutOfBagEstimates )
    calcOutOfBagEstimates( outofbagtrees, x, y );
}

double RegRandomForests::predict ( const NICE::Vector & x,
                                   const vector<int> & outofbagtrees )
{
  // predict using only a selection of all trees,
  // namely those contained in outofbagtrees
  double overall_prediction = 0.0;
  int treecount = 0;

  for ( vector<int>::const_iterator it = outofbagtrees.begin();
        it != outofbagtrees.end(); it++ )
  {
    assert ( *it < (int)forest.size() );
    RegressionTree & rt = *( forest[*it] );
    double predVal;
    rt.traverse( x, predVal );
    overall_prediction += predVal;
    treecount++;
  }
  overall_prediction /= treecount;

  return overall_prediction;
}

double RegRandomForests::predict ( const NICE::Vector & x )
{
  // the forest prediction is the mean of the individual tree predictions
  double overall_prediction = 0.0;
  int treecount = 0;

  for ( vector<RegressionTree *>::const_iterator it = forest.begin();
        it != forest.end(); it++ )
  {
    RegressionTree & rt = *(*it);
    double predVal;
    rt.traverse( x, predVal );
    overall_prediction += predVal;
    treecount++;
  }
  overall_prediction /= treecount;

  return overall_prediction;
}

void RegRandomForests::restore ( istream & is, int format )
{
  std::string tag;
  int index;

  while ( (is >> tag) && (tag == "TREE") )
  {
    is >> index;
    RegressionTree *rt = new RegressionTree ( conf );
    rt->restore ( is );
    if ( minimum_error_reduction > 0.0 )
      rt->pruneTreeLeastSquares ( minimum_error_reduction );
    forest.push_back(rt);
  }
}

void RegRandomForests::store ( ostream & os, int format ) const
{
  int index = 0;
  for ( vector<RegressionTree *>::const_iterator it = forest.begin();
        it != forest.end(); it++, index++ )
  {
    const RegressionTree & rt = *(*it);
    os << "TREE " << index << endl;
    rt.store ( os, format );
    os << "ENDTREE ";
  }
}

void RegRandomForests::clear()
{
  for ( vector<RegressionTree *>::iterator it = forest.begin();
        it != forest.end(); it++ )
    delete (*it);

  forest.clear();
}

void RegRandomForests::indexDescendants (
    map<RegressionNode *, pair<long, int> > & index ) const
{
  long maxindex = 0;
  for ( vector<RegressionTree *>::const_iterator it = forest.begin();
        it != forest.end(); it++ )
    (*it)->indexDescendants ( index, maxindex );
}

void RegRandomForests::resetCounters()
{
  for ( vector<RegressionTree *>::const_iterator it = forest.begin();
        it != forest.end(); it++ )
    (*it)->resetCounters();
}

void RegRandomForests::setComplexity ( int size )
{
  fprintf (stderr, "RegRandomForests: set complexity to %d, overwriting current value %d\n",
           size, number_of_trees );
  number_of_trees = size;
}
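/* A minimal usage sketch (the config file name, section name and data
   filling are hypothetical assumptions, not taken from this file):

     NICE::Config conf ( "regression.conf" );
     OBJREC::RegRandomForests rf ( &conf, "RegRandomForests" );

     NICE::VVector X;   // feature vectors, all of the same dimension
     NICE::Vector  Y;   // one target value per feature vector
     // ... fill X and Y with training data ...

     rf.teach ( X, Y );                  // build the ensemble
     double yhat = rf.predict ( X[0] );  // mean over all tree predictions

   With enable_out_of_bag_estimates set to true in the config section,
   teach() additionally fills oobResults via calcOutOfBagEstimates(). */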