/**
 * @file RTBMinDist.cpp
 * @brief random regression tree; the split criterion is to minimize the mean distance of all examples of an inner node to their prototype
 * @author Frank Prüfer
 * @date 09/17/2013
 */

#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <cmath>
#include <algorithm>

#include "RTBMinDist.h"

using namespace OBJREC;

#undef DEBUGTREE
#undef DETAILTREE

using namespace std;
using namespace NICE;

RTBMinDist::RTBMinDist( const Config *conf, std::string section )
{
  random_split_tests = conf->gI(section, "random_split_tests", 10 );
  random_features = conf->gI(section, "random_features", 500 );
  max_depth = conf->gI(section, "max_depth", 10 );
  min_examples = conf->gI(section, "min_examples", 50 );
  minimum_distance_reduction = conf->gD("RandomForest", "minimum_distance_reduction", 10e-3 );

  if ( conf->gB(section, "start_random_generator", false ) )
    srand(time(NULL));
}

RTBMinDist::~RTBMinDist()
{
}

/** compute the mean absolute distance of all feature values to their prototype
 *  (the mean feature value); the result is accumulated onto dist, so callers
 *  reset dist to zero beforehand */
void RTBMinDist::computeDistanceToPrototype( const vector<double> &fvalues, const int &countEx, double &dist )
{
  double prototype = 0.0;
  for ( int i = 0; i < countEx; i++ ){
    prototype += fvalues[i];
  }
  prototype /= (double)countEx;

  for ( int i = 0; i < countEx; i++ ){
    dist += fabs(prototype - fvalues[i]);
  }
  dist /= (double)countEx;
}

/** split the feature values at threshold and compute the mean distance of the
 *  left and right group to their respective prototypes; returns false if the
 *  split is degenerate or one side has fewer than min_examples examples */
bool RTBMinDist::averageDistanceLeftRight( const vector< pair< double, int > > values,
          double threshold,
          double& avg_dist_left,
          double& avg_dist_right,
          int& count_left,
          int& count_right )
{
  count_left = 0;
  count_right = 0;

  vector<int> selection_left;
  vector<int> selection_right;
  vector<double> values_left;
  vector<double> values_right;

  for ( vector< pair< double, int > >::const_iterator it = values.begin();
        it != values.end(); it++ )
  {
    double value = it->first;
    if ( value < threshold )
    {
      count_left++;
      selection_left.push_back( it->second );
      values_left.push_back( it->first );
    }
    else
    {
      count_right++;
      selection_right.push_back( it->second );
      values_right.push_back( it->first );
    }
  }

  if ( (count_left == 0) || (count_right == 0) )
    return false; // no split

  if ( (count_left < min_examples) || (count_right < min_examples) )
    return false; // no split

  // compute the mean distance of the left and right group to their respective prototype
  computeDistanceToPrototype( values_left, count_left, avg_dist_left );
  computeDistanceToPrototype( values_right, count_right, avg_dist_right );

  return true;
}

RegressionNode *RTBMinDist::buildRecursive ( const NICE::VVector & x,
          const NICE::Vector & y,
          std::vector<int> & selection,
          int depth )
{
#ifdef DEBUGTREE
  fprintf (stderr, "Examples: %d (depth %d)\n", (int)selection.size(), (int)depth);
#endif

  RegressionNode *node = new RegressionNode ();
  node->nodePrediction( y, selection );

  if ( depth > max_depth )
  {
#ifdef DEBUGTREE
    fprintf (stderr, "RTBMinDist: maximum depth reached!\n");
#endif
    node->trainExamplesIndices = selection;
    return node;
  }

  if ( (int)selection.size() < min_examples )
  {
#ifdef DEBUGTREE
    fprintf (stderr, "RTBMinDist: minimum number of examples reached: %d < %d!\n", (int)selection.size(), min_examples );
#endif
    node->trainExamplesIndices = selection;
    return node;
  }

  int best_feature = 0;
  double best_threshold = 0.0;
  double best_reduct = -1.0;
  // vector< pair< double, int > > best_values;
  vector< pair< double, int > > values;
  double distance_left = 0.0;
  double distance_right = 0.0;

  for ( int k = 0; k < random_features; k++ )
  {
#ifdef DETAILTREE
    fprintf (stderr, "calculating random feature %d\n", k );
#endif
    // pick a random feature dimension
    int f = rand() % x[0].size();

    values.clear();
    collectFeatureValues ( x, selection, f, values );

    // mean distance of all examples in this node to their prototype for feature f
    double curDist = 0.0;
    vector<double> fvalues;
    for ( vector< pair< double, int > >::const_iterator it = values.begin();
          it != values.end(); it++ )
    {
      fvalues.push_back( it->first );
    }
    computeDistanceToPrototype( fvalues, (int)values.size(), curDist );

    double minValue = (min_element ( values.begin(), values.end() ))->first;
    double maxValue = (max_element ( values.begin(), values.end() ))->first;

#ifdef DETAILTREE
    fprintf (stderr, "max %f min %f\n", maxValue, minValue );
#endif
    if ( maxValue - minValue < 1e-7 )
      continue;

    for ( int i = 0; i < random_split_tests; i++ )
    {
      // draw a random threshold uniformly from [minValue, maxValue]
      double threshold;
      threshold = rand() * (maxValue - minValue) / RAND_MAX + minValue;

#ifdef DETAILTREE
      fprintf (stderr, "calculating split f/s(f) %d/%d %f\n", k, i, threshold );
#endif
      distance_left = 0.0;
      distance_right = 0.0;
      int count_left, count_right;

      if ( ! averageDistanceLeftRight( values, threshold, distance_left, distance_right, count_left, count_right ) )
        continue;

      //double pl = (count_left) / (count_left + count_right);
      //double errorReduction = lsError - pl*lsError_left - (1-pl)*lsError_right;
      double distReduction = curDist - distance_left - distance_right;

      if ( distReduction > best_reduct )
      {
        best_reduct = distReduction;
        best_threshold = threshold;
        best_feature = f;
#ifdef DETAILTREE
        fprintf (stderr, "t %f for feature %i\n", best_threshold, best_feature );
#endif
      }
    }
  }

  if ( best_reduct < minimum_distance_reduction )
  {
#ifdef DEBUGTREE
    fprintf (stderr, "RTBMinDist: distance reduction too small!\n");
#endif
    node->trainExamplesIndices = selection;
    return node;
  }

  node->f = best_feature;
  node->threshold = best_threshold;

  // recompute examples_left and examples_right for the best split
  vector<int> best_examples_left;
  vector<int> best_examples_right;
  values.clear();
  collectFeatureValues( x, selection, best_feature, values );

  best_examples_left.reserve ( values.size() / 2 );
  best_examples_right.reserve ( values.size() / 2 );

  for ( vector< pair< double, int > >::const_iterator it = values.begin();
        it != values.end(); it++ )
  {
    double value = it->first;
    if ( value < best_threshold )
      best_examples_left.push_back( it->second );
    else
      best_examples_right.push_back( it->second );
  }

  node->left = buildRecursive( x, y, best_examples_left, depth + 1 );
  node->right = buildRecursive( x, y, best_examples_right, depth + 1 );

  return node;
}

RegressionNode *RTBMinDist::build( const NICE::VVector & x,
          const NICE::Vector & y )
{
  int index = 0;

  vector<int> all;
  all.reserve ( y.size() );
  for ( uint i = 0; i < y.size(); i++ )
  {
    all.push_back( index );
    index++;
  }

  return buildRecursive( x, y, all, 0 );
}
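
/*
 * Usage sketch (not part of the original file): a minimal illustration of how
 * this builder might be driven on toy 1-D data. The default-constructed
 * NICE::Config, NICE::Vector(size) and VVector::push_back calls are
 * assumptions about the NICE API made for illustration only; with an empty
 * config, the defaults read in the constructor above apply (e.g.
 * min_examples = 50, max_depth = 10).
 *
 *   #include "RTBMinDist.h"
 *
 *   int main ()
 *   {
 *     NICE::Config conf;                        // empty config -> default parameters
 *     OBJREC::RTBMinDist builder ( &conf, "RTBMinDist" );
 *
 *     NICE::VVector x;                          // one feature vector per example
 *     NICE::Vector y ( 200 );                   // regression targets
 *     for ( uint i = 0; i < 200; i++ )
 *     {
 *       NICE::Vector feat ( 1 );
 *       feat[0] = i / 200.0;
 *       x.push_back ( feat );
 *       y[i] = feat[0] * feat[0];
 *     }
 *
 *     OBJREC::RegressionNode *root = builder.build ( x, y );
 *     // ... traverse or evaluate the tree
 *     return 0;
 *   }
 */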