123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289 |
/**
* @file RTBMeanPostImprovement.cpp
* @brief regression tree builder using the mean posterior improvement (MPI) splitting criterion [Taylor & Jones, 1996]
* @author Sven Sickert
* @date 07/23/2013
*/
- #define _USE_MATH_DEFINES
- #include <iostream>
- #include <math.h>
- #include "RTBMeanPostImprovement.h"
- using namespace OBJREC;
- #undef DEBUGTREE
- #undef DETAILTREE
- using namespace std;
- using namespace NICE;
/**
 * @brief Set up the tree builder from the given configuration section.
 *
 * @param conf    configuration to read the options from
 * @param section section name holding the tree-building options
 */
RTBMeanPostImprovement::RTBMeanPostImprovement( const Config *conf, std::string section )
{
  // number of random thresholds tested per sampled feature
  random_split_tests = conf->gI(section, "random_split_tests", 10 );
  // number of random feature dimensions sampled per node
  random_features = conf->gI(section, "random_features", 500 );
  // stopping criteria for the recursive tree construction
  max_depth = conf->gI(section, "max_depth", 10 );
  min_examples = conf->gI(section, "min_examples", 50);
  // NOTE(review): read from the hard-coded "RandomForest" section, not from
  // `section` like every other option here — confirm this is intentional
  minimum_improvement = conf->gD("RandomForest", "minimum_improvement", 10e-3 );
  save_indices = conf->gB(section, "save_indices", false);
  // spelling "bandwith" matches the config key expected by existing setups
  auto_bandwith = conf->gB(section, "auto_bandwith", true);

  // optionally seed the RNG; rand() drives feature/threshold sampling
  if ( conf->gB(section, "start_random_generator", false ) )
    srand(time(NULL));
}
// Empty destructor: the nodes allocated in buildRecursive() are returned to
// the caller, which owns and releases the resulting tree.
RTBMeanPostImprovement::~RTBMeanPostImprovement()
{
}
- bool RTBMeanPostImprovement::improvementLeftRight(const vector< pair< double, int > > values,
- const Vector & y,
- double threshold,
- vector<double> & empDist_left,
- vector<double> & empDist_right,
- int& count_left,
- int& count_right,
- double& h,
- double& p )
- {
- count_left = 0;
- count_right = 0;
- vector<double> selection_left;
- vector<double> selection_right;
-
- for ( vector< pair< double, int > >::const_iterator it = values.begin();
- it != values.end(); it++ )
- {
- if ( (it->first) < threshold )
- {
- count_left++;
- selection_left.push_back( y[ it->second ] );
- }
- else
- {
- count_right++;
- selection_right.push_back( y[ it->second ] );
- }
- }
-
- if ( (count_left < min_examples) || (count_right < min_examples) )
- return false; // no split
-
- Vector vleft ( selection_left );
- Vector vright ( selection_right );
-
- // empirical distribution [Taylor & Jones, 1996]
- for ( vector< pair< double, int > >::const_iterator it = values.begin();
- it != values.end(); it++ )
- {
- double yval = y[ it->second ];
- int smaller_left = 0;
- int smaller_right = 0;
- for ( int l = 0; l < count_left; l++ )
- {
- if ( selection_left[l] <= yval ) smaller_left++;
- }
- for ( int r = 0; r < count_right; r++ )
- {
- if ( selection_right[r] <= yval ) smaller_right++;
- }
- if ( (it->first) < threshold )
- {
- double emp = (double)(smaller_left)/(double)values.size();
- empDist_left.push_back( emp );
- } else {
- double emp = (double)(smaller_right)/(double)values.size();
- empDist_right.push_back( emp );
- }
- }
-
- // bandwidth parameter [Taylor & Jones, 1996]
- if (auto_bandwith)
- {
- double sigma_hat = sqrt( vleft.StdDev()*vleft.StdDev() + vright.StdDev()*vright.StdDev() );
- double z_hat = (double)( vleft.Mean() - vright.Mean() ) / sigma_hat;
- p = (double)count_left / (double)values.size();
- double tmp = (z_hat*z_hat - 1);
- h = sigma_hat / (double)( 2 * sqrt(M_PI) * p * (1-p) * tmp*tmp * gaussianVal(z_hat, 1.0) );
- }
- else
- h = 1.0;
-
- return true;
- }
- double RTBMeanPostImprovement::gaussianVal ( const double input,
- const double bandwidth )
- {
- return ( 1 / ( sqrt( 2 * M_PI ) * sqrt(2) * bandwidth ) * exp ( -0.25 * input * input ) );
- }
/**
 * @brief Recursively grow the regression tree.
 *
 * Creates a node for the current @p selection of example indices, stops with a
 * leaf when the maximum depth or minimum example count is reached, otherwise
 * samples random (feature, threshold) pairs, scores them with the mean
 * posterior improvement (MPI) criterion and recurses into the best split.
 *
 * @param x         feature vectors of all training examples
 * @param y         regression targets of all training examples
 * @param selection indices of the examples reaching this node
 * @param depth     current depth of this node in the tree
 * @return newly allocated (sub)tree root; ownership passes to the caller
 */
RegressionNode *RTBMeanPostImprovement::buildRecursive ( const NICE::VVector & x,
          const NICE::Vector & y,
          std::vector<int> & selection,
          int depth)
{
#ifdef DEBUGTREE
  fprintf (stderr, "Examples: %d (depth %d)\n", (int)selection.size(),
    (int)depth);
#endif

  RegressionNode *node = new RegressionNode ();
  node->nodePrediction( y, selection );
  // NOTE(review): lsError is fetched but never used below — splits are judged
  // by the MPI score only; confirm this local can be removed
  double lsError = node->lsError;

  // leaf: maximum tree depth reached
  if ( depth > max_depth )
  {
#ifdef DEBUGTREE
    fprintf (stderr, "RTBMeanPostImprovement: maxmimum depth reached !\n");
#endif
    // NOTE(review): indices are stored unconditionally in every leaf; the
    // save_indices option read in the constructor is not consulted here —
    // confirm this is intended
    node->trainExamplesIndices = selection;
    return node;
  }

  // leaf: not enough examples left to attempt a split
  if ( (int)selection.size() < min_examples )
  {
#ifdef DEBUGTREE
    fprintf (stderr, "RTBMeanPostImprovement: minimum examples reached %d < %d !\n",
      (int)selection.size(), min_examples );
#endif
    node->trainExamplesIndices = selection;
    return node;
  }
  // best split found so far (best_improvement == -1.0 means "none yet")
  int best_feature = 0;
  double best_threshold = 0.0;
  double best_improvement = -1.0;
  vector<pair<double, int> > values;

  // sample random feature dimensions and score candidate thresholds on each
  for ( int k = 0; k < random_features; k++ )
  {
#ifdef DETAILTREE
    fprintf (stderr, "calculating random feature %d\n", k );
#endif
    // pick a random feature dimension (assumes all rows of x share one size)
    int f = rand() % x[0].size();

    values.clear();
    collectFeatureValues ( x, selection, f, values );

    // value range of this feature over the node's examples
    double minValue = (min_element ( values.begin(), values.end() ))->first;
    double maxValue = (max_element ( values.begin(), values.end() ))->first;

#ifdef DETAILTREE
    fprintf (stderr, "max %f min %f\n", maxValue, minValue );
    ofstream datafile;
    char buffer [20];
    int n = sprintf(buffer, "detailtree%d.dat", k);
    datafile.open( buffer );
    datafile << "# This file is called detailtree.dat" << endl;
    datafile << "# Data of the Mean Posterior Improvement Criterium" << endl;
    datafile << "# threshold \tI \t\tMPI" << endl;
#endif
    // skip (nearly) constant features — no meaningful threshold exists
    if ( maxValue - minValue < 1e-7 ) continue;

    for ( int i = 0; i < random_split_tests; i++ )
    {
      // draw a uniformly random threshold within [minValue, maxValue]
      double threshold;
      threshold = rand() * (maxValue -minValue ) / RAND_MAX + minValue;
      //double step = (maxValue - minValue) / random_split_tests;
      //threshold = minValue + i*step;

#ifdef DETAILTREE
      fprintf (stderr, "calculating split f/s (t) %d/%d (%f)\n", k, i, threshold );
#endif

      // empirical distributions, counts, bandwidth h and left fraction p
      vector<double> empDist_left, empDist_right;
      int count_left, count_right;
      double h, p;
      if ( ! improvementLeftRight( values, y, threshold, empDist_left,
             empDist_right, count_left, count_right, h, p) )
        continue;

      // mean posterior improvement
      // I_hat: mean pairwise kernel similarity between the left and right
      // empirical distributions (low similarity => good separation)
      double I_hat = 0.0;
      for ( int l = 0; l < count_left; l++ )
      {
        for ( int r = 0; r < count_right; r++ )
        {
          I_hat += gaussianVal( (empDist_left[l] - empDist_right[r]), h );
          //I_hat += (empDist_left[l] - empDist_right[r]);
        }
      }
      I_hat /= ((double)count_left*(double)count_right);
      // MPI score, weighted by the balance p*(1-p) of the split
      double mpi_hat = p * (1-p) * (1-I_hat);
#ifdef DETAILTREE
      fprintf (stderr, "pL=%f, pR=%f, I=%f --> M=%f\n", p, (1-p), I_hat, mpi_hat);
      datafile << threshold << " " << I_hat << " " << mpi_hat << endl;
#endif

      // keep the split with the highest MPI score
      if ( mpi_hat > best_improvement )
      {
        best_improvement = mpi_hat;
        best_threshold = threshold;
        best_feature = f;
      }
    }
#ifdef DETAILTREE
    datafile.close();
#endif
  }
#ifdef DETAILTREE
  fprintf (stderr, "t %f for feature %i\n", best_threshold, best_feature );
#endif

  // leaf: no candidate split improved enough (or none was valid at all)
  if ( best_improvement < minimum_improvement )
  {
#ifdef DEBUGTREE
    fprintf (stderr, "RTBMeanPostImprovement: error reduction to small !\n");
#endif
    node->trainExamplesIndices = selection;
    return node;
  }

  node->f = best_feature;
  node->threshold = best_threshold;

  // re calculating examples_left and examples_right
  // (the feature values of the winning feature are collected again and the
  // examples are partitioned at the winning threshold)
  vector<int> best_examples_left;
  vector<int> best_examples_right;
  values.clear();
  collectFeatureValues( x, selection, best_feature, values);

  best_examples_left.reserve ( values.size() / 2 );
  best_examples_right.reserve ( values.size() / 2 );

  for ( vector< pair < double, int > >::const_iterator it = values.begin();
        it != values.end(); it++ )
  {
    double value = it->first;
    if ( value < best_threshold )
      best_examples_left.push_back( it->second );
    else
      best_examples_right.push_back( it->second );
  }

  // recurse into both children
  node->left = buildRecursive( x, y, best_examples_left, depth+1 );
  node->right = buildRecursive( x, y, best_examples_right, depth+1 );

  return node;
}
- RegressionNode *RTBMeanPostImprovement::build( const NICE::VVector & x,
- const NICE::Vector & y )
- {
- int index = 0;
-
- vector<int> all;
- all.reserve ( y.size() );
- for ( uint i = 0; i < y.size(); i++ )
- {
- all.push_back( index );
- index++;
- }
-
- return buildRecursive( x, y, all, 0);
- }
|