// RegRandomForests.cpp
/**
* @file RegRandomForests.cpp
* @brief Implementation of random decision forests for regression
* @author Sven Sickert
* @date 06/28/2013
*/
#ifdef NICE_USELIB_OPENMP
#include <omp.h>
#endif

#include <iostream>
#include <assert.h>
#include <cmath>

#include "vislearning/regression/randomforest/RegRandomForests.h"
#include "vislearning/regression/randomforest/RTBRandom.h"
#include "vislearning/regression/randomforest/RTBGrid.h"
#include "vislearning/regression/randomforest/RTBClusterRandom.h"
#include "vislearning/regression/randomforest/RTBMeanPostImprovement.h"

using namespace OBJREC;
using namespace std;
using namespace NICE;
  20. RegRandomForests::RegRandomForests()
  21. {
  22. builder = NULL;
  23. minimum_error_reduction = 0.0;
  24. enableOutOfBagEstimates = false;
  25. }
  26. RegRandomForests::RegRandomForests( const Config *_conf,
  27. std::string section ) : conf(_conf)
  28. {
  29. std::string builder_method = conf->gS(section, "builder", "random");
  30. minimum_error_reduction = conf->gD(section, "minimum_error_reduction", 10e-3);
  31. enableOutOfBagEstimates = conf->gB(section, "enable_out_of_bag_estimates", false);
  32. confsection = section;
  33. if ( builder_method == "none" ) {
  34. // do not initialize
  35. builder = NULL;
  36. }
  37. else {
  38. number_of_trees = conf->gI(section, "number_of_trees", 20 );
  39. features_per_tree = conf->gD(section, "features_per_tree", 1.0 );
  40. samples_per_tree = conf->gD(section, "samples_per_tree", 0.2 );
  41. if ( builder_method == "random" )
  42. {
  43. std::string builder_section = conf->gS(section, "builder_section", "RTBRandom");
  44. builder = new RTBRandom ( conf, builder_section );
  45. }
  46. else if ( builder_method == "grid" )
  47. {
  48. std::string builder_section = conf->gS(section, "builder_section", "RTBGrid");
  49. builder = new RTBGrid ( conf, builder_section );
  50. }
  51. else if ( builder_method == "cluster_random" )
  52. {
  53. std::string builder_section = conf->gS(section, "builder_section", "RTBClusterRandom");
  54. builder = new RTBClusterRandom ( conf, builder_section );
  55. }
  56. else if ( builder_method == "mean_post_improvement" )
  57. {
  58. std::string builder_section = conf->gS(section, "builder_section", "RTBMeanPostImprovement");
  59. builder = new RTBMeanPostImprovement ( conf, builder_section );
  60. } else {
  61. fprintf (stderr, "RegressionTreeBuilder %s not yet implemented !\n", builder_method.c_str() );
  62. exit(-1);
  63. }
  64. }
  65. }
  66. RegRandomForests::~RegRandomForests()
  67. {
  68. for ( vector<RegressionTree *>::iterator it = forest.begin();
  69. it != forest.end(); it++ )
  70. delete (*it);
  71. if ( builder != NULL )
  72. delete builder;
  73. }
  74. void RegRandomForests::calcOutOfBagEstimates (
  75. std::vector< std::vector<int> > & outofbagtrees,
  76. NICE::VVector x,
  77. NICE::Vector y )
  78. {
  79. oobResults.clear();
  80. // calculate out of bag regression results
  81. // as suggested bei Breiman
  82. // out of bag = training data not used to build
  83. // a single tree is used as testing data for the tree
  84. long index = 0;
  85. for ( int i = 0; i < (int)x.size(); i++, index++ )
  86. {
  87. double trueValue = y[i];
  88. const vector<int> & trees = outofbagtrees[index];
  89. if ( trees.size() <= 0 ) continue;
  90. double predValue = predict ( x[i], trees );
  91. double predError = abs( trueValue - predValue );
  92. oobResults.push_back ( pair<double, double> ( predError, trueValue ) );
  93. }
  94. }
  95. void RegRandomForests::getLeafNodes ( NICE::Vector x,
  96. std::vector<RegressionNode *> & leafNodes,
  97. int depth )
  98. {
  99. leafNodes.reserve ( forest.size() );
  100. for ( vector<RegressionTree *>::const_iterator it = forest.begin();
  101. it != forest.end(); it++ )
  102. {
  103. RegressionTree & rt = *(*it);
  104. RegressionNode *leaf = rt.getLeafNode ( x, depth );
  105. leafNodes.push_back ( leaf );
  106. }
  107. }
  108. void RegRandomForests::getAllLeafNodes ( vector<RegressionNode *> & leafNodes)
  109. {
  110. int z = 0;
  111. for ( vector<RegressionTree *>::const_iterator it = forest.begin();
  112. it != forest.end(); it++, z++ )
  113. {
  114. RegressionTree & rt = *(*it);
  115. vector<RegressionNode *> leaves = rt.getAllLeafNodes();
  116. for ( int j = 0; j < (int)leaves.size(); j++ )
  117. {
  118. for ( int k = 0; k < (int)leaves[j]->trainExamplesIndices.size(); k++ )
  119. {
  120. leaves[j]->trainExamplesIndices[k] = exselection[z][leaves[j]->trainExamplesIndices[k]];
  121. }
  122. leafNodes.push_back(leaves[j]);
  123. }
  124. }
  125. }
void RegRandomForests::teach ( const NICE::VVector & x, const NICE::Vector & y )
{
  // Train the ensemble: build number_of_trees regression trees, each on a
  // random bootstrap subset of the examples, and optionally compute
  // out-of-bag error estimates afterwards.
  cerr << "RegRandomForests::teach()" << endl;
  assert( builder != NULL );
  // number of features each tree may use (fraction of the input dimension)
  int featuresCount = (int) (x[0].size() * features_per_tree );
  fprintf(stderr, "RegRandomForests: number of features %d\n", (int)x[0].size() );
  // outofbagtrees[i] = indices of trees that did NOT train on example i
  vector< vector<int> > outofbagtrees;
  outofbagtrees.resize( x.size() );
  // pre-size exselection so each parallel iteration writes only its own slot
  for ( int k = 0; k < number_of_trees; k++ )
  {
    vector<int> tmp;
    exselection.push_back(tmp);
  }
#pragma omp parallel for
  for ( int k = 0; k < number_of_trees; k++ )
  {
    fprintf( stderr, "[ -- building tree %d/%d -- ]\n", k + 1, number_of_trees);
    vector<int> examples_index;
    for ( int i = 0; i < (int)x.size(); i++ )
    {
      examples_index.push_back( i );
    }
    // bootstrap size = fraction samples_per_tree of the training set
    int trainingExamples = (int)(examples_index.size() * samples_per_tree);
    fprintf (stderr, "RegRandomForests: selection of %d examples for each tree\n", trainingExamples );
    if ( (trainingExamples < 3) && ((int)examples_index.size() > trainingExamples) )
    {
      // too few samples for a meaningful tree: fall back to all examples
      fprintf(stderr, "RegRandomForests: number of examples < 3 !! minExamples=%d, trainingExamples=%d\n",
      (int)x.size(), trainingExamples);
      trainingExamples = examples_index.size();
      fprintf(stderr, "RegRandomForests: I will use all %d examples. !!\n", trainingExamples);
    }
    // NOTE(review): random_shuffle uses the shared global RNG; under OpenMP
    // the resulting subsets are not reproducible across runs — confirm this
    // is acceptable.
    if ( samples_per_tree < 1.0 )
      random_shuffle( examples_index.begin(), examples_index.end() );
    VVector subset;
    Vector subval ( trainingExamples );
    // first trainingExamples entries of the shuffled index list form the
    // bootstrap sample of tree k; remember them for getAllLeafNodes()
    for ( int e = 0; e < trainingExamples; e++ )
    {
      exselection[k].push_back( examples_index[e] );
      subset.push_back( x[ examples_index[e] ] );
      subval.set( e, y[ examples_index[e] ] );
    }
    // set out of bag trees
    for ( uint e = trainingExamples; e < examples_index.size(); e++ )
    {
      int index = examples_index[e];
#pragma omp critical
      outofbagtrees[index].push_back(k);
    }
    /******* select a random feature set *******/
    // NOTE(review): features_subset (and featuresCount) is computed here but
    // never passed to builder->build() — the random feature selection has no
    // effect. Either dead code or a missing builder parameter; verify intent.
    vector<int> features_subset;
    for ( int j = 0; j < (int)x[0].size(); j++ )
      features_subset.push_back( j );
    random_shuffle( features_subset.begin(), features_subset.end() );
    while ((int)features_subset.size() > featuresCount)
      features_subset.pop_back();
    /******* training of an individual tree ****/
    RegressionTree *tree = new RegressionTree( conf );
    builder->build( *tree, subset, subval );
    /******* prune tree using least squares criterion *****/
    //if ( minimum_error_reduction > 0.0 )
    // tree->pruneTreeLeastSquares( minimum_error_reduction );
    /******* add individual tree to ensemble *****/
#pragma omp critical
    forest.push_back(tree);
  }
  if (enableOutOfBagEstimates)
    calcOutOfBagEstimates(outofbagtrees, x, y);
}
  194. double RegRandomForests::predict ( const NICE::Vector & x,
  195. const vector< int > & outofbagtrees )
  196. {
  197. // predict using only a selection of all trees
  198. // contained in outofbagtrees
  199. double overall_prediction = 0.0;
  200. int treecount = 0;
  201. for ( vector<int>::const_iterator it = outofbagtrees.begin();
  202. it != outofbagtrees.end();
  203. it++ )
  204. {
  205. assert ( *it < (int)forest.size() );
  206. RegressionTree & rt = *(forest[(*it)]);
  207. double predVal;
  208. rt.traverse( x, predVal );
  209. overall_prediction += predVal;
  210. treecount++;
  211. }
  212. overall_prediction /= treecount;
  213. return overall_prediction;
  214. }
  215. double RegRandomForests::predict ( const NICE::Vector & x )
  216. {
  217. double overall_prediction = 0.0;
  218. int treecount = 0;
  219. for ( vector<RegressionTree *>::const_iterator it = forest.begin();
  220. it != forest.end();
  221. it++ )
  222. {
  223. RegressionTree & rt = *(*it);
  224. double predVal;
  225. rt.traverse( x, predVal );
  226. overall_prediction += predVal;
  227. treecount++;
  228. }
  229. overall_prediction /= treecount;
  230. return overall_prediction;
  231. }
void RegRandomForests::restore(istream & is, int format)
{
  // Deserialize the forest from a stream written by store(): a sequence of
  // "TREE <index>" tags, each followed by the serialized tree. Restored
  // trees are optionally pruned before being appended to the forest.
  std::string tag;
  int index;
  while ( (is >> tag) && (tag == "TREE") )
  {
    is >> index; // index written by store(); not used beyond stream position
    RegressionTree *rt = new RegressionTree ( conf );
    rt->restore ( is );
    // NOTE(review): store() writes an "ENDTREE " tag after each tree; this
    // loop only continues while the next token is "TREE", so it presumably
    // relies on RegressionTree::restore consuming the ENDTREE marker —
    // verify against that implementation.
    // NOTE(review): the 'format' parameter is not forwarded to rt->restore().
    if ( minimum_error_reduction > 0.0 )
      rt->pruneTreeLeastSquares ( minimum_error_reduction );
    forest.push_back(rt);
  }
}
  246. void RegRandomForests::store(ostream & os, int format) const
  247. {
  248. int index = 0;
  249. for ( vector<RegressionTree *>::const_iterator it = forest.begin();
  250. it != forest.end(); it++, index++ )
  251. {
  252. const RegressionTree & rt = *(*it);
  253. os << "TREE " << index << endl;
  254. rt.store ( os, format );
  255. os << "ENDTREE ";
  256. }
  257. }
  258. void RegRandomForests::clear()
  259. {
  260. for ( vector<RegressionTree *>::iterator it = forest.begin();
  261. it != forest.end(); it++ )
  262. delete (*it);
  263. forest.clear();
  264. }
  265. void RegRandomForests::indexDescendants(
  266. map<RegressionNode *, pair<long, int> > & index) const
  267. {
  268. long maxindex = 0;
  269. for ( vector<RegressionTree *>::const_iterator it = forest.begin();
  270. it != forest.end(); it++ )
  271. (*it)->indexDescendants ( index, maxindex );
  272. }
  273. void RegRandomForests::resetCounters()
  274. {
  275. for ( vector<RegressionTree *>::const_iterator it = forest.begin();
  276. it != forest.end(); it++ )
  277. (*it)->resetCounters ();
  278. }
  279. void RegRandomForests::setComplexity(int size)
  280. {
  281. fprintf (stderr, "RegRandomForests: set complexity to %d, overwriting current value %d\n",
  282. size, number_of_trees );
  283. number_of_trees = size;
  284. }