/**
 * @file RegRandomForests.cpp
 * @brief implementation of random set forests for regression
 * @author Sven Sickert
 * @date 06/28/2013
 */

#ifdef NICE_USELIB_OPENMP
#include <omp.h>
#endif

#include <iostream>
#include <assert.h>
#include <cmath> // for fabs

#include "vislearning/regression/randomforest/RegRandomForests.h"
#include "vislearning/regression/randomforest/RTBRandom.h"
#include "vislearning/regression/randomforest/RTBMinDist.h"
#include "vislearning/regression/randomforest/RTBGrid.h"
#include "vislearning/regression/randomforest/RTBClusterRandom.h"
#include "vislearning/regression/randomforest/RTBMeanPostImprovement.h"

using namespace OBJREC;
using namespace std;
using namespace NICE;
RegRandomForests::RegRandomForests()
{
  builder = NULL;
  minimum_error_reduction = 0.0;
  enableOutOfBagEstimates = false;
}
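
/*
 * Configuration-driven constructor. The regression tree builder is selected
 * via the "builder" key of the given config section. An illustrative sketch
 * of such a section (the keys are the ones actually read below, the values
 * are the defaults used in this file; the section name itself is whatever
 * the caller passes in):
 *
 *   builder                     = random
 *   builder_section             = RTBRandom
 *   number_of_trees             = 20
 *   features_per_tree           = 1.0
 *   samples_per_tree            = 0.2
 *   minimum_error_reduction     = 10e-3
 *   enable_out_of_bag_estimates = false
 */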
RegRandomForests::RegRandomForests( const Config *_conf,
                                    std::string section ) : conf(_conf)
{
  std::string builder_method = conf->gS(section, "builder", "random");
  minimum_error_reduction = conf->gD(section, "minimum_error_reduction", 10e-3);
  enableOutOfBagEstimates = conf->gB(section, "enable_out_of_bag_estimates", false);

  confsection = section;

  if ( builder_method == "none" ) {
    // do not initialize
    builder = NULL;
  }
  else {
    number_of_trees = conf->gI(section, "number_of_trees", 20 );
    features_per_tree = conf->gD(section, "features_per_tree", 1.0 );
    samples_per_tree = conf->gD(section, "samples_per_tree", 0.2 );

    if ( builder_method == "random" )
    {
      std::string builder_section = conf->gS(section, "builder_section", "RTBRandom");
      builder = new RTBRandom ( conf, builder_section );
    }
    else if ( builder_method == "min_dist" )
    {
      std::string builder_section = conf->gS(section, "builder_section", "RTBMinDist");
      builder = new RTBMinDist ( conf, builder_section );
    }
    else if ( builder_method == "grid" )
    {
      std::string builder_section = conf->gS(section, "builder_section", "RTBGrid");
      builder = new RTBGrid ( conf, builder_section );
    }
    else if ( builder_method == "cluster_random" )
    {
      std::string builder_section = conf->gS(section, "builder_section", "RTBClusterRandom");
      builder = new RTBClusterRandom ( conf, builder_section );
    }
    else if ( builder_method == "mean_post_improvement" )
    {
      std::string builder_section = conf->gS(section, "builder_section", "RTBMeanPostImprovement");
      builder = new RTBMeanPostImprovement ( conf, builder_section );
    }
    else {
      fprintf (stderr, "RegressionTreeBuilder %s not yet implemented!\n", builder_method.c_str() );
      exit(-1);
    }
  }
}
RegRandomForests::~RegRandomForests()
{
  for ( vector<RegressionTree *>::iterator it = forest.begin();
        it != forest.end(); it++ )
    delete (*it);

  if ( builder != NULL )
    delete builder;
}
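
/**
 * Compute out-of-bag error estimates as suggested by Breiman: every
 * training example is predicted using only the trees that did not see it
 * during training (outofbagtrees[i]). The results are stored in oobResults
 * as (absolute error, ground truth) pairs; examples without any
 * out-of-bag tree are skipped.
 */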
void RegRandomForests::calcOutOfBagEstimates (
    std::vector< std::vector<int> > & outofbagtrees,
    NICE::VVector x,
    NICE::Vector y )
{
  oobResults.clear();

  // out of bag = training data not used to build
  // a single tree is used as testing data for that tree
  long index = 0;
  for ( int i = 0; i < (int)x.size(); i++, index++ )
  {
    double trueValue = y[i];
    const vector<int> & trees = outofbagtrees[index];

    // skip examples that were in-bag for every tree
    if ( trees.size() <= 0 ) continue;

    double predValue = predict ( x[i], trees );
    double predError = fabs( trueValue - predValue );
    oobResults.push_back ( pair<double, double> ( predError, trueValue ) );
  }
}
void RegRandomForests::getLeafNodes ( NICE::Vector x,
                                      std::vector<RegressionNode *> & leafNodes,
                                      int depth )
{
  leafNodes.reserve ( forest.size() );
  for ( vector<RegressionTree *>::const_iterator it = forest.begin();
        it != forest.end(); it++ )
  {
    RegressionTree & rt = *(*it);
    RegressionNode *leaf = rt.getLeafNode ( x, depth );
    leafNodes.push_back ( leaf );
  }
}
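
/*
 * Collect the leaf nodes of all trees in the forest. Each leaf stores
 * example indices relative to the bootstrap subset of its own tree, so
 * they are mapped back to indices into the original training set (via
 * exselection) before being returned.
 */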
void RegRandomForests::getAllLeafNodes ( vector<RegressionNode *> & leafNodes)
{
  int z = 0;
  for ( vector<RegressionTree *>::const_iterator it = forest.begin();
        it != forest.end(); it++, z++ )
  {
    RegressionTree & rt = *(*it);
    vector<RegressionNode *> leaves = rt.getAllLeafNodes();
    for ( int j = 0; j < (int)leaves.size(); j++ )
    {
      for ( int k = 0; k < (int)leaves[j]->trainExamplesIndices.size(); k++ )
      {
        leaves[j]->trainExamplesIndices[k] = exselection[z][leaves[j]->trainExamplesIndices[k]];
      }
      leafNodes.push_back(leaves[j]);
    }
  }
}
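
/*
 * Train the ensemble: for every tree draw a random subset of the training
 * examples (samples_per_tree controls the fraction, drawn without
 * replacement via random_shuffle), record the remaining examples as
 * out-of-bag cases for that tree, and delegate the actual tree
 * construction to the configured builder. Tree training is parallelized
 * with OpenMP when available.
 */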
void RegRandomForests::teach ( const NICE::VVector & x, const NICE::Vector & y )
{
  cerr << "RegRandomForests::teach()" << endl;
  assert( builder != NULL );

  int featuresCount = (int) (x[0].size() * features_per_tree );
  fprintf(stderr, "RegRandomForests: number of features %d\n", (int)x[0].size() );

  vector< vector<int> > outofbagtrees;
  outofbagtrees.resize( x.size() );

  for ( int k = 0; k < number_of_trees; k++ )
  {
    vector<int> tmp;
    exselection.push_back(tmp);
  }

#pragma omp parallel for
  for ( int k = 0; k < number_of_trees; k++ )
  {
    fprintf( stderr, "[ -- building tree %d/%d -- ]\n", k + 1, number_of_trees);

    vector<int> examples_index;
    for ( int i = 0; i < (int)x.size(); i++ )
    {
      examples_index.push_back( i );
    }

    int trainingExamples = (int)(examples_index.size() * samples_per_tree);
    fprintf (stderr, "RegRandomForests: selection of %d examples for each tree\n", trainingExamples );

    if ( (trainingExamples < 3) && ((int)examples_index.size() > trainingExamples) )
    {
      fprintf(stderr, "RegRandomForests: number of examples < 3 !! minExamples=%d, trainingExamples=%d\n",
              (int)x.size(), trainingExamples);
      trainingExamples = examples_index.size();
      fprintf(stderr, "RegRandomForests: I will use all %d examples. !!\n", trainingExamples);
    }

    if ( samples_per_tree < 1.0 )
      random_shuffle( examples_index.begin(), examples_index.end() );

    VVector subset;
    Vector subval ( trainingExamples );
    for ( int e = 0; e < trainingExamples; e++ )
    {
      exselection[k].push_back( examples_index[e] );
      subset.push_back( x[ examples_index[e] ] );
      subval.set( e, y[ examples_index[e] ] );
    }

    // set out-of-bag trees
    for ( uint e = trainingExamples; e < examples_index.size(); e++ )
    {
      int index = examples_index[e];
#pragma omp critical
      outofbagtrees[index].push_back(k);
    }

    /******* select a random feature set *******/
    vector<int> features_subset;
    for ( int j = 0; j < (int)x[0].size(); j++ )
      features_subset.push_back( j );

    random_shuffle( features_subset.begin(), features_subset.end() );
    while ((int)features_subset.size() > featuresCount)
      features_subset.pop_back();
    // NOTE: the selected feature subset is currently not passed on to the builder

    /******* training of an individual tree ****/
    RegressionTree *tree = new RegressionTree( conf );
    builder->build( *tree, subset, subval );

    /******* prune tree using least squares criterion *****/
    //if ( minimum_error_reduction > 0.0 )
    //  tree->pruneTreeLeastSquares( minimum_error_reduction );

    /******* add individual tree to ensemble *****/
#pragma omp critical
    forest.push_back(tree);
  }

  if (enableOutOfBagEstimates)
    calcOutOfBagEstimates(outofbagtrees, x, y);
}
double RegRandomForests::predict ( const NICE::Vector & x,
                                   const vector< int > & outofbagtrees )
{
  // predict using only a selection of all trees,
  // namely those contained in outofbagtrees
  double overall_prediction = 0.0;
  int treecount = 0;

  for ( vector<int>::const_iterator it = outofbagtrees.begin();
        it != outofbagtrees.end();
        it++ )
  {
    assert ( *it < (int)forest.size() );
    RegressionTree & rt = *(forest[(*it)]);
    double predVal;
    rt.traverse( x, predVal );

    overall_prediction += predVal;
    treecount++;
  }

  overall_prediction /= treecount;
  return overall_prediction;
}
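
/*
 * Ensemble prediction: traverse every tree of the forest with the input
 * vector x and return the mean of the individual tree predictions.
 */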
double RegRandomForests::predict ( const NICE::Vector & x )
{
  double overall_prediction = 0.0;
  int treecount = 0;

  for ( vector<RegressionTree *>::const_iterator it = forest.begin();
        it != forest.end();
        it++ )
  {
    RegressionTree & rt = *(*it);
    double predVal;
    rt.traverse( x, predVal );

    overall_prediction += predVal;
    treecount++;
  }

  overall_prediction /= treecount;
  return overall_prediction;
}
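
/*
 * Persistence: store() writes each tree as a "TREE <index>" block followed
 * by "ENDTREE"; restore() reads blocks in the same format and re-applies
 * least-squares pruning when minimum_error_reduction is positive.
 */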
void RegRandomForests::restore(istream & is, int format)
{
  std::string tag;
  int index;

  while ( (is >> tag) && (tag == "TREE") )
  {
    is >> index;
    RegressionTree *rt = new RegressionTree ( conf );
    rt->restore ( is );

    if ( minimum_error_reduction > 0.0 )
      rt->pruneTreeLeastSquares ( minimum_error_reduction );

    forest.push_back(rt);
  }
}
void RegRandomForests::store(ostream & os, int format) const
{
  int index = 0;
  for ( vector<RegressionTree *>::const_iterator it = forest.begin();
        it != forest.end(); it++, index++ )
  {
    const RegressionTree & rt = *(*it);
    os << "TREE " << index << endl;
    rt.store ( os, format );
    os << "ENDTREE ";
  }
}
void RegRandomForests::clear()
{
  for ( vector<RegressionTree *>::iterator it = forest.begin();
        it != forest.end(); it++ )
    delete (*it);

  forest.clear();
}

void RegRandomForests::indexDescendants(
    map<RegressionNode *, pair<long, int> > & index) const
{
  long maxindex = 0;
  for ( vector<RegressionTree *>::const_iterator it = forest.begin();
        it != forest.end(); it++ )
    (*it)->indexDescendants ( index, maxindex );
}

void RegRandomForests::resetCounters()
{
  for ( vector<RegressionTree *>::const_iterator it = forest.begin();
        it != forest.end(); it++ )
    (*it)->resetCounters ();
}

void RegRandomForests::setComplexity(int size)
{
  fprintf (stderr, "RegRandomForests: set complexity to %d, overwriting current value %d\n",
           size, number_of_trees );
  number_of_trees = size;
}
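
/*
 * Usage sketch (illustrative only; the config file name and section name
 * below are assumptions, not part of this file):
 *
 *   NICE::Config conf ( "regforest.conf" );
 *   OBJREC::RegRandomForests rf ( &conf, "RegRandomForest" );
 *
 *   NICE::VVector x;  // training feature vectors
 *   NICE::Vector y;   // regression targets, y.size() == x.size()
 *   // ... fill x and y ...
 *
 *   rf.teach ( x, y );
 *   double yhat = rf.predict ( x[0] );
 */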