/** * @file RTBMeanPostImprovement.cpp * @brief random regression tree * @author Sven Sickert * @date 07/23/2013 */ #define _USE_MATH_DEFINES #include #include #include "RTBMeanPostImprovement.h" using namespace OBJREC; #undef DEBUGTREE #undef DETAILTREE using namespace std; using namespace NICE; RTBMeanPostImprovement::RTBMeanPostImprovement( const Config *conf, std::string section ) { random_split_tests = conf->gI(section, "random_split_tests", 10 ); random_features = conf->gI(section, "random_features", 500 ); max_depth = conf->gI(section, "max_depth", 10 ); min_examples = conf->gI(section, "min_examples", 50); minimum_improvement = conf->gD("RandomForest", "minimum_improvement", 10e-3 ); save_indices = conf->gB(section, "save_indices", false); auto_bandwith = conf->gB(section, "auto_bandwith", true); if ( conf->gB(section, "start_random_generator", false ) ) srand(time(NULL)); } RTBMeanPostImprovement::~RTBMeanPostImprovement() { } bool RTBMeanPostImprovement::improvementLeftRight(const vector< pair< double, int > > values, const Vector & y, double threshold, vector & empDist_left, vector & empDist_right, int& count_left, int& count_right, double& h, double& p ) { count_left = 0; count_right = 0; vector selection_left; vector selection_right; for ( vector< pair< double, int > >::const_iterator it = values.begin(); it != values.end(); it++ ) { if ( (it->first) < threshold ) { count_left++; selection_left.push_back( y[ it->second ] ); } else { count_right++; selection_right.push_back( y[ it->second ] ); } } if ( (count_left < min_examples) || (count_right < min_examples) ) return false; // no split Vector vleft ( selection_left ); Vector vright ( selection_right ); // empirical distribution [Taylor & Jones, 1996] for ( vector< pair< double, int > >::const_iterator it = values.begin(); it != values.end(); it++ ) { double yval = y[ it->second ]; int smaller_left = 0; int smaller_right = 0; for ( int l = 0; l < count_left; l++ ) { if ( selection_left[l] <= yval ) smaller_left++; } for ( int r = 0; r < count_right; r++ ) { if ( selection_right[r] <= yval ) smaller_right++; } if ( (it->first) < threshold ) { double emp = (double)(smaller_left)/(double)values.size(); empDist_left.push_back( emp ); } else { double emp = (double)(smaller_right)/(double)values.size(); empDist_right.push_back( emp ); } } // bandwidth parameter [Taylor & Jones, 1996] if (auto_bandwith) { double sigma_hat = sqrt( vleft.StdDev()*vleft.StdDev() + vright.StdDev()*vright.StdDev() ); double z_hat = (double)( vleft.Mean() - vright.Mean() ) / sigma_hat; p = (double)count_left / (double)values.size(); double tmp = (z_hat*z_hat - 1); h = sigma_hat / (double)( 2 * sqrt(M_PI) * p * (1-p) * tmp*tmp * gaussianVal(z_hat, 1.0) ); } else h = 1.0; return true; } double RTBMeanPostImprovement::gaussianVal ( const double input, const double bandwidth ) { return ( 1 / ( sqrt( 2 * M_PI ) * sqrt(2) * bandwidth ) * exp ( -0.25 * input * input ) ); } RegressionNode *RTBMeanPostImprovement::buildRecursive ( const NICE::VVector & x, const NICE::Vector & y, std::vector & selection, int depth) { #ifdef DEBUGTREE fprintf (stderr, "Examples: %d (depth %d)\n", (int)selection.size(), (int)depth); #endif RegressionNode *node = new RegressionNode (); node->nodePrediction( y, selection ); double lsError = node->lsError; if ( depth > max_depth ) { #ifdef DEBUGTREE fprintf (stderr, "RTBMeanPostImprovement: maxmimum depth reached !\n"); #endif node->trainExamplesIndices = selection; return node; } if ( (int)selection.size() < min_examples ) { #ifdef DEBUGTREE fprintf (stderr, "RTBMeanPostImprovement: minimum examples reached %d < %d !\n", (int)selection.size(), min_examples ); #endif node->trainExamplesIndices = selection; return node; } int best_feature = 0; double best_threshold = 0.0; double best_improvement = -1.0; vector > values; for ( int k = 0; k < random_features; k++ ) { #ifdef DETAILTREE fprintf (stderr, "calculating random feature %d\n", k ); #endif int f = rand() % x[0].size(); values.clear(); collectFeatureValues ( x, selection, f, values ); double minValue = (min_element ( values.begin(), values.end() ))->first; double maxValue = (max_element ( values.begin(), values.end() ))->first; #ifdef DETAILTREE fprintf (stderr, "max %f min %f\n", maxValue, minValue ); ofstream datafile; char buffer [20]; int n = sprintf(buffer, "detailtree%d.dat", k); datafile.open( buffer ); datafile << "# This file is called detailtree.dat" << endl; datafile << "# Data of the Mean Posterior Improvement Criterium" << endl; datafile << "# threshold \tI \t\tMPI" << endl; #endif if ( maxValue - minValue < 1e-7 ) continue; for ( int i = 0; i < random_split_tests; i++ ) { double threshold; threshold = rand() * (maxValue -minValue ) / RAND_MAX + minValue; //double step = (maxValue - minValue) / random_split_tests; //threshold = minValue + i*step; #ifdef DETAILTREE fprintf (stderr, "calculating split f/s (t) %d/%d (%f)\n", k, i, threshold ); #endif vector empDist_left, empDist_right; int count_left, count_right; double h, p; if ( ! improvementLeftRight( values, y, threshold, empDist_left, empDist_right, count_left, count_right, h, p) ) continue; // mean posterior improvement double I_hat = 0.0; for ( int l = 0; l < count_left; l++ ) { for ( int r = 0; r < count_right; r++ ) { I_hat += gaussianVal( (empDist_left[l] - empDist_right[r]), h ); //I_hat += (empDist_left[l] - empDist_right[r]); } } I_hat /= ((double)count_left*(double)count_right); double mpi_hat = p * (1-p) * (1-I_hat); #ifdef DETAILTREE fprintf (stderr, "pL=%f, pR=%f, I=%f --> M=%f\n", p, (1-p), I_hat, mpi_hat); datafile << threshold << " " << I_hat << " " << mpi_hat << endl; #endif if ( mpi_hat > best_improvement ) { best_improvement = mpi_hat; best_threshold = threshold; best_feature = f; } } #ifdef DETAILTREE datafile.close(); #endif } #ifdef DETAILTREE fprintf (stderr, "t %f for feature %i\n", best_threshold, best_feature ); #endif if ( best_improvement < minimum_improvement ) { #ifdef DEBUGTREE fprintf (stderr, "RTBMeanPostImprovement: error reduction to small !\n"); #endif node->trainExamplesIndices = selection; return node; } node->f = best_feature; node->threshold = best_threshold; // re calculating examples_left and examples_right vector best_examples_left; vector best_examples_right; values.clear(); collectFeatureValues( x, selection, best_feature, values); best_examples_left.reserve ( values.size() / 2 ); best_examples_right.reserve ( values.size() / 2 ); for ( vector< pair < double, int > >::const_iterator it = values.begin(); it != values.end(); it++ ) { double value = it->first; if ( value < best_threshold ) best_examples_left.push_back( it->second ); else best_examples_right.push_back( it->second ); } node->left = buildRecursive( x, y, best_examples_left, depth+1 ); node->right = buildRecursive( x, y, best_examples_right, depth+1 ); return node; } RegressionNode *RTBMeanPostImprovement::build( const NICE::VVector & x, const NICE::Vector & y ) { int index = 0; vector all; all.reserve ( y.size() ); for ( uint i = 0; i < y.size(); i++ ) { all.push_back( index ); index++; } return buildRecursive( x, y, all, 0); }