RegRandomForests.cpp
/**
* @file RegRandomForests.cpp
* @brief implementation of random set forests for regression
* @author Sven Sickert
* @date 06/28/2013
*/
#ifdef NICE_USELIB_OPENMP
#include <omp.h>
#endif

#include <assert.h>
#include <cmath>
#include <iostream>

#include "vislearning/regression/randomforest/RegRandomForests.h"
#include "vislearning/regression/randomforest/RTBRandom.h"
#include "vislearning/regression/randomforest/RTBLinear.h"
#include "vislearning/regression/randomforest/RTBMinDist.h"
#include "vislearning/regression/randomforest/RTBGrid.h"
#include "vislearning/regression/randomforest/RTBClusterRandom.h"
#include "vislearning/regression/randomforest/RTBMeanPostImprovement.h"

using namespace OBJREC;
using namespace std;
using namespace NICE;
  22. RegRandomForests::RegRandomForests()
  23. {
  24. builder = NULL;
  25. minimum_error_reduction = 0.0;
  26. enableOutOfBagEstimates = false;
  27. }
  28. RegRandomForests::RegRandomForests( const Config *_conf,
  29. std::string section ) : conf(_conf)
  30. {
  31. std::string builder_method = conf->gS(section, "builder", "random");
  32. minimum_error_reduction = conf->gD(section, "minimum_error_reduction", 10e-3);
  33. enableOutOfBagEstimates = conf->gB(section, "enable_out_of_bag_estimates", false);
  34. confsection = section;
  35. if ( builder_method == "none" ) {
  36. // do not initialize
  37. builder = NULL;
  38. }
  39. else {
  40. number_of_trees = conf->gI(section, "number_of_trees", 20 );
  41. features_per_tree = conf->gD(section, "features_per_tree", 1.0 );
  42. samples_per_tree = conf->gD(section, "samples_per_tree", 0.2 );
  43. if ( builder_method == "random" )
  44. {
  45. std::string builder_section = conf->gS(section, "builder_section", "RTBRandom");
  46. builder = new RTBRandom ( conf, builder_section );
  47. }
  48. else if ( builder_method == "min_dist" )
  49. {
  50. std::string builder_section = conf->gS(section, "builder_section", "RTBMinDist");
  51. builder = new RTBMinDist ( conf, builder_section );
  52. }
  53. else if ( builder_method == "linear" )
  54. {
  55. std::string builder_section = conf->gS(section, "builder_section", "RTBRandom");
  56. builder = new RTBLinear ( conf, builder_section );
  57. }
  58. else if ( builder_method == "grid" )
  59. {
  60. std::string builder_section = conf->gS(section, "builder_section", "RTBGrid");
  61. builder = new RTBGrid ( conf, builder_section );
  62. }
  63. else if ( builder_method == "cluster_random" )
  64. {
  65. std::string builder_section = conf->gS(section, "builder_section", "RTBClusterRandom");
  66. builder = new RTBClusterRandom ( conf, builder_section );
  67. }
  68. else if ( builder_method == "mean_post_improvement" )
  69. {
  70. std::string builder_section = conf->gS(section, "builder_section", "RTBMeanPostImprovement");
  71. builder = new RTBMeanPostImprovement ( conf, builder_section );
  72. } else {
  73. fprintf (stderr, "RegressionTreeBuilder %s not yet implemented !\n", builder_method.c_str() );
  74. exit(-1);
  75. }
  76. }
  77. }
  78. RegRandomForests::~RegRandomForests()
  79. {
  80. for ( vector<RegressionTree *>::iterator it = forest.begin();
  81. it != forest.end(); it++ )
  82. delete (*it);
  83. if ( builder != NULL )
  84. delete builder;
  85. }
  86. void RegRandomForests::calcOutOfBagEstimates (
  87. std::vector< std::vector<int> > & outofbagtrees,
  88. NICE::VVector x,
  89. NICE::Vector y )
  90. {
  91. oobResults.clear();
  92. // calculate out of bag regression results
  93. // as suggested bei Breiman
  94. // out of bag = training data not used to build
  95. // a single tree is used as testing data for the tree
  96. long index = 0;
  97. for ( int i = 0; i < (int)x.size(); i++, index++ )
  98. {
  99. double trueValue = y[i];
  100. const vector<int> & trees = outofbagtrees[index];
  101. if ( trees.size() <= 0 ) continue;
  102. double predValue = predict ( x[i], trees );
  103. double predError = abs( trueValue - predValue );
  104. oobResults.push_back ( pair<double, double> ( predError, trueValue ) );
  105. }
  106. }
  107. void RegRandomForests::getLeafNodes ( NICE::Vector x,
  108. std::vector<RegressionNode *> & leafNodes,
  109. int depth )
  110. {
  111. leafNodes.reserve ( forest.size() );
  112. for ( vector<RegressionTree *>::const_iterator it = forest.begin();
  113. it != forest.end(); it++ )
  114. {
  115. RegressionTree & rt = *(*it);
  116. RegressionNode *leaf = rt.getLeafNode ( x, depth );
  117. leafNodes.push_back ( leaf );
  118. }
  119. }
  120. void RegRandomForests::getAllLeafNodes ( vector<RegressionNode *> & leafNodes)
  121. {
  122. int z = 0;
  123. for ( vector<RegressionTree *>::const_iterator it = forest.begin();
  124. it != forest.end(); it++, z++ )
  125. {
  126. RegressionTree & rt = *(*it);
  127. vector<RegressionNode *> leaves = rt.getAllLeafNodes();
  128. for ( int j = 0; j < (int)leaves.size(); j++ )
  129. {
  130. for ( int k = 0; k < (int)leaves[j]->trainExamplesIndices.size(); k++ )
  131. {
  132. leaves[j]->trainExamplesIndices[k] = exselection[z][leaves[j]->trainExamplesIndices[k]];
  133. }
  134. leafNodes.push_back(leaves[j]);
  135. }
  136. }
  137. }
void RegRandomForests::teach ( const NICE::VVector & x, const NICE::Vector & y )
{
  // Train the ensemble: build number_of_trees regression trees, each on a
  // random subsample of the examples, and record which examples each tree
  // did NOT see (for out-of-bag error estimation).
  cerr << "RegRandomForests::teach()" << endl;
  assert( builder != NULL );

  // number of features offered to each tree (fraction of input dimension)
  int featuresCount = (int) (x[0].size() * features_per_tree );
  fprintf(stderr, "RegRandomForests: number of features %d\n", (int)x[0].size() );

  // outofbagtrees[i] will list the trees that did not train on example i
  vector< vector<int> > outofbagtrees;
  outofbagtrees.resize( x.size() );

  // pre-size exselection so each parallel iteration below only ever
  // touches its own slot exselection[k]
  for ( int k = 0; k < number_of_trees; k++ )
  {
    vector<int> tmp;
    exselection.push_back(tmp);
  }

  #pragma omp parallel for
  for ( int k = 0; k < number_of_trees; k++ )
  {
    fprintf( stderr, "[ -- building tree %d/%d -- ]\n", k + 1, number_of_trees);

    // start from the full index set [0, x.size())
    vector<int> examples_index;
    for ( int i = 0; i < (int)x.size(); i++ )
    {
      examples_index.push_back( i );
    }

    int trainingExamples = (int)(examples_index.size() * samples_per_tree);
    fprintf (stderr, "RegRandomForests: selection of %d examples for each tree\n", trainingExamples );

    // too few samples for a meaningful split -> fall back to all examples
    if ( (trainingExamples < 3) && ((int)examples_index.size() > trainingExamples) )
    {
      fprintf(stderr, "RegRandomForests: number of examples < 3 !! minExamples=%d, trainingExamples=%d\n",
      (int)x.size(), trainingExamples);
      trainingExamples = examples_index.size();
      fprintf(stderr, "RegRandomForests: I will use all %d examples. !!\n", trainingExamples);
    }

    // bagging: shuffle, then take the first trainingExamples entries.
    // NOTE(review): random_shuffle uses a shared RNG; calling it from
    // several OpenMP threads is not guaranteed thread-safe — confirm.
    if ( samples_per_tree < 1.0 )
      random_shuffle( examples_index.begin(), examples_index.end() );

    VVector subset;
    Vector subval ( trainingExamples );
    for ( int e = 0; e < trainingExamples; e++ )
    {
      exselection[k].push_back( examples_index[e] );
      subset.push_back( x[ examples_index[e] ] );
      subval.set( e, y[ examples_index[e] ] );
    }

    // set out of bag trees: the remaining (unused) examples record this
    // tree k as an out-of-bag predictor for them
    for ( uint e = trainingExamples; e < examples_index.size(); e++ )
    {
      int index = examples_index[e];
      #pragma omp critical
      outofbagtrees[index].push_back(k);
    }

    /******* select a random feature set *******/
    // NOTE(review): features_subset is computed but never handed to the
    // builder below — per-tree feature bagging appears to have no effect;
    // confirm against the RegressionTreeBuilder API.
    vector<int> features_subset;
    for ( int j = 0; j < (int)x[0].size(); j++ )
      features_subset.push_back( j );
    random_shuffle( features_subset.begin(), features_subset.end() );
    while ((int)features_subset.size() > featuresCount)
      features_subset.pop_back();

    /******* training of an individual tree ****/
    RegressionTree *tree = new RegressionTree( conf );
    builder->build( *tree, subset, subval );

    /******* prune tree using least squares criterion *****/
    //if ( minimum_error_reduction > 0.0 )
    // tree->pruneTreeLeastSquares( minimum_error_reduction );

    /******* add individual tree to ensemble *****/
    #pragma omp critical
    forest.push_back(tree);
  }

  if (enableOutOfBagEstimates)
    calcOutOfBagEstimates(outofbagtrees, x, y);
}
  206. double RegRandomForests::predict ( const NICE::Vector & x,
  207. const vector< int > & outofbagtrees )
  208. {
  209. // predict using only a selection of all trees
  210. // contained in outofbagtrees
  211. double overall_prediction = 0.0;
  212. int treecount = 0;
  213. for ( vector<int>::const_iterator it = outofbagtrees.begin();
  214. it != outofbagtrees.end();
  215. it++ )
  216. {
  217. assert ( *it < (int)forest.size() );
  218. RegressionTree & rt = *(forest[(*it)]);
  219. double predVal;
  220. rt.traverse( x, predVal );
  221. overall_prediction += predVal;
  222. treecount++;
  223. }
  224. overall_prediction /= treecount;
  225. return overall_prediction;
  226. }
  227. double RegRandomForests::predict ( const NICE::Vector & x )
  228. {
  229. double overall_prediction = 0.0;
  230. int treecount = 0;
  231. for ( vector<RegressionTree *>::const_iterator it = forest.begin();
  232. it != forest.end();
  233. it++ )
  234. {
  235. RegressionTree & rt = *(*it);
  236. double predVal;
  237. rt.traverse( x, predVal );
  238. overall_prediction += predVal;
  239. treecount++;
  240. }
  241. overall_prediction /= treecount;
  242. return overall_prediction;
  243. }
  244. void RegRandomForests::restore(istream & is, int format)
  245. {
  246. std::string tag;
  247. int index;
  248. while ( (is >> tag) && (tag == "TREE") )
  249. {
  250. is >> index;
  251. RegressionTree *rt = new RegressionTree ( conf );
  252. rt->restore ( is );
  253. if ( minimum_error_reduction > 0.0 )
  254. rt->pruneTreeLeastSquares ( minimum_error_reduction );
  255. forest.push_back(rt);
  256. }
  257. }
  258. void RegRandomForests::store(ostream & os, int format) const
  259. {
  260. int index = 0;
  261. for ( vector<RegressionTree *>::const_iterator it = forest.begin();
  262. it != forest.end(); it++, index++ )
  263. {
  264. const RegressionTree & rt = *(*it);
  265. os << "TREE " << index << endl;
  266. rt.store ( os, format );
  267. os << "ENDTREE ";
  268. }
  269. }
  270. void RegRandomForests::clear()
  271. {
  272. for ( vector<RegressionTree *>::iterator it = forest.begin();
  273. it != forest.end(); it++ )
  274. delete (*it);
  275. forest.clear();
  276. }
  277. void RegRandomForests::indexDescendants(
  278. map<RegressionNode *, pair<long, int> > & index) const
  279. {
  280. long maxindex = 0;
  281. for ( vector<RegressionTree *>::const_iterator it = forest.begin();
  282. it != forest.end(); it++ )
  283. (*it)->indexDescendants ( index, maxindex );
  284. }
  285. void RegRandomForests::resetCounters()
  286. {
  287. for ( vector<RegressionTree *>::const_iterator it = forest.begin();
  288. it != forest.end(); it++ )
  289. (*it)->resetCounters ();
  290. }
  291. void RegRandomForests::setComplexity(int size)
  292. {
  293. fprintf (stderr, "RegRandomForests: set complexity to %d, overwriting current value %d\n",
  294. size, number_of_trees );
  295. number_of_trees = size;
  296. }