FPCRandomForests.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476
  1. /**
  2. * @file FPCRandomForests.cpp
  3. * @brief implementation of random set forests
  4. * @author Erik Rodner
  5. * @date 04/24/2008
  6. */
  7. #ifdef NICE_USELIB_OPENMP
  8. #include <omp.h>
  9. #endif
  10. #include <iostream>
  11. #include "core/image/ImageT.h"
  12. #include "core/imagedisplay/ImageDisplay.h"
  13. #include "vislearning/classifier/fpclassifier/randomforest/FPCRandomForests.h"
  14. #include "vislearning/classifier/fpclassifier/randomforest/DTBStandard.h"
  15. #include "vislearning/classifier/fpclassifier/randomforest/DTBRandom.h"
  16. #include "vislearning/classifier/fpclassifier/randomforest/DTBClusterRandom.h"
  17. #include "vislearning/classifier/fpclassifier/randomforest/DTBRandomOblique.h"
  18. #include "vislearning/cbaselib/FeaturePool.h"
  19. using namespace OBJREC;
  20. using namespace std;
  21. using namespace NICE;
  22. FPCRandomForests::FPCRandomForests()
  23. {
  24. builder = NULL;
  25. minimum_entropy = 0.0;
  26. enableOutOfBagEstimates = false;
  27. maxClassNo = -1;
  28. }
  29. FPCRandomForests::FPCRandomForests(const Config *_conf, std::string section) : conf(_conf)
  30. {
  31. std::string builder_method = conf->gS(section, "builder", "random");
  32. minimum_entropy = conf->gD(section, "minimum_entropy", 0.0);
  33. enableOutOfBagEstimates = conf->gB(section, "enable_out_of_bag_estimates", false);
  34. maxClassNo = -1;
  35. confsection = section;
  36. if ( builder_method == "none" ) {
  37. // do not initialize
  38. builder = NULL;
  39. } else {
  40. number_of_trees = conf->gI(section, "number_of_trees", 20 );
  41. features_per_tree = conf->gD(section, "features_per_tree", 1.0 );
  42. samples_per_tree = conf->gD(section, "samples_per_tree", 0.2 );
  43. use_simple_balancing = conf->gB(section, "use_simple_balancing", false);
  44. weight_examples = conf->gB(section, "weight_examples", false);
  45. memory_efficient = conf->gB(section, "memory_efficient", false);
  46. std::string builder_section = conf->gS(section, "builder_section", "DTBRandom");
  47. if ( builder_method == "standard" )
  48. builder = new DTBStandard ( conf, builder_section );
  49. else if (builder_method == "random" )
  50. builder = new DTBRandom ( conf, builder_section );
  51. else if (builder_method == "cluster_random" )
  52. builder = new DTBClusterRandom ( conf, builder_section );
  53. else if (builder_method == "random_oblique" )
  54. builder = new DTBRandomOblique ( conf, builder_section );
  55. else {
  56. fprintf (stderr, "DecisionTreeBuilder %s not yet implemented !\n", builder_method.c_str() );
  57. exit(-1);
  58. }
  59. }
  60. }
  61. FPCRandomForests::~FPCRandomForests()
  62. {
  63. for ( vector<DecisionTree *>::iterator i = forest.begin();
  64. i != forest.end();
  65. i++ )
  66. delete (*i);
  67. if ( builder != NULL )
  68. delete builder;
  69. }
  70. void FPCRandomForests::calcOutOfBagEstimates (
  71. vector< vector<int> > & outofbagtrees,
  72. Examples & examples )
  73. {
  74. oobResults.clear ();
  75. // calculate out of bag classification results
  76. // as suggested by Breiman
  77. // out of bag = training data not used to build
  78. // a single tree is used as testing data for this tree
  79. long index = 0;
  80. for ( Examples::iterator k = examples.begin();
  81. k != examples.end(); k++, index++ )
  82. {
  83. int classno_groundtruth = k->first;
  84. const vector<int> & trees = outofbagtrees[index];
  85. if ( trees.size() <= 0 ) continue;
  86. ClassificationResult r = classify ( k->second, trees );
  87. // FIXME: assumption negative class dst is 0
  88. double score = r.scores.get( 0 /*negativeClassDST*/);
  89. oobResults.push_back ( pair<double, int> ( score, classno_groundtruth ) );
  90. }
  91. }
  92. void FPCRandomForests::getAllLeafNodes ( vector<DecisionNode *> & leafNodes)
  93. {
  94. //leafNodes.reserve ( forest.size() );
  95. int z = 0;
  96. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  97. i != forest.end();
  98. i++,z++ )
  99. {
  100. DecisionTree & dt = *(*i);
  101. vector<DecisionNode *> leaves = dt.getAllLeafNodes();
  102. for(int j = 0; j < (int)leaves.size(); j++)
  103. {
  104. for(int k = 0; k < leaves[j]->trainExamplesIndices.size(); k++)
  105. {
  106. leaves[j]->trainExamplesIndices[k] = exselection[z][leaves[j]->trainExamplesIndices[k]];
  107. }
  108. leafNodes.push_back(leaves[j]);
  109. }
  110. }
  111. }
  112. void FPCRandomForests::getLeafNodes ( Example & pce,
  113. vector<DecisionNode *> & leafNodes,
  114. int depth )
  115. {
  116. leafNodes.reserve ( forest.size() );
  117. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  118. i != forest.end();
  119. i++ )
  120. {
  121. DecisionTree & dt = *(*i);
  122. DecisionNode *leaf = dt.getLeafNode ( pce, depth );
  123. leafNodes.push_back ( leaf );
  124. }
  125. }
  126. ClassificationResult FPCRandomForests::classify ( Example & pce,
  127. const vector<int> & outofbagtrees )
  128. {
  129. // classify using only a selection of all trees
  130. // contained in outofbagtrees
  131. FullVector overall_distribution;
  132. for ( vector<int>::const_iterator i = outofbagtrees.begin();
  133. i != outofbagtrees.end();
  134. i++ )
  135. {
  136. assert ( *i < (int)forest.size() );
  137. DecisionTree & dt = *(forest[(*i)]);
  138. FullVector distribution;
  139. dt.traverse ( pce, distribution );
  140. distribution.normalize();
  141. if ( overall_distribution.empty() )
  142. overall_distribution = distribution;
  143. else
  144. overall_distribution.add ( distribution );
  145. }
  146. overall_distribution.normalize();
  147. int classno = overall_distribution.maxElement();
  148. return ClassificationResult(classno, overall_distribution);
  149. }
  150. ClassificationResult FPCRandomForests::classify(Example & pce)
  151. {
  152. FullVector overall_distribution;
  153. for (vector<DecisionTree *>::const_iterator i = forest.begin();
  154. i != forest.end();
  155. i++)
  156. {
  157. DecisionTree & dt = *(*i);
  158. FullVector distribution;
  159. dt.traverse(pce, distribution);
  160. distribution.normalize();
  161. if (overall_distribution.empty())
  162. overall_distribution = distribution;
  163. else
  164. overall_distribution.add(distribution);
  165. }
  166. overall_distribution.normalize();
  167. int classno = overall_distribution.maxElement();
  168. return ClassificationResult(classno, overall_distribution);
  169. }
  170. int FPCRandomForests::classify_optimize(Example & pce)
  171. {
  172. FullVector overall_distribution;
  173. for (vector<DecisionTree *>::const_iterator i = forest.begin();
  174. i != forest.end();
  175. i++)
  176. {
  177. DecisionTree & dt = *(*i);
  178. FullVector distribution;
  179. dt.traverse(pce, distribution);
  180. if (overall_distribution.empty())
  181. overall_distribution = distribution;
  182. else
  183. overall_distribution.add(distribution);
  184. }
  185. return overall_distribution.maxElement();
  186. }
/**
 * Train the random forest: build number_of_trees decision trees, each on
 * a per-class bootstrap subset of the examples and a random subset of the
 * feature pool. Trees are built in parallel (OpenMP). Optionally computes
 * out-of-bag estimates afterwards.
 *
 * @param fp       feature pool to sample per-tree feature subsets from
 * @param examples training examples (weights may be mutated if
 *                 weight_examples is set)
 */
void FPCRandomForests::train(FeaturePool & fp, Examples & examples)
{
    // a builder must exist (i.e. not constructed with builder "none")
    assert(builder != NULL);

    if (maxClassNo < 0)
        maxClassNo = examples.getMaxClassNo();

    // histogram of examples per class and per-class index lists
    FullVector example_distribution(maxClassNo + 1);
    map<int, vector<int> > class_examples;
    long index = 0;
    for (Examples::const_iterator i = examples.begin(); i != examples.end(); i++, index++)
    {
        int classno = i->first;
        example_distribution[classno]++;
        class_examples[classno].push_back(index);
    }

    // optional inverse-frequency weighting of the examples
    // NOTE(review): the stray index++ here is a leftover (index is not
    // used in this loop); also examples.size() / example_distribution[...]
    // mixes integer and double operands — presumably intended as a
    // floating point ratio, verify FullVector element type
    if (weight_examples)
    {
        for ( Examples::iterator i = examples.begin();
              i != examples.end(); i++, index++ )
            i->second.weight = examples.size() / example_distribution[i->first];
    }

    // find the smallest non-empty class (used for simple balancing)
    double minExamples = (double)examples.size();
    int minExamplesClassNo = 0;
    for (int i = 0 ; i < example_distribution.size() ; i++)
    {
        double val = example_distribution[i];
        if (minExamples > val && val != 0.0)
        {
            minExamples = val;
            minExamplesClassNo = i;
        }
    }
    fprintf(stderr, "FPCRandomForests: minimum number of examples: %f (classno: %d)\n", minExamples, minExamplesClassNo);

    // number of features each tree sees
    int featuresCount = (int)(fp.size() * features_per_tree);
    fprintf(stderr, "FPCRandomForests: number of features %d\n", (int)fp.size());

    // outofbagtrees[e] = indices of trees that did NOT train on example e
    vector< vector<int> > outofbagtrees;
    outofbagtrees.resize(examples.size());

    // pre-size exselection so each parallel iteration owns its own slot
    for (int k = 0 ; k < number_of_trees ; k++)
    {
        vector<int> tmp;
        exselection.push_back(tmp);
    }

    // build the trees in parallel; shared containers are either indexed
    // per-tree (exselection[k]) or guarded by critical sections below
    #pragma omp parallel for
    for (int k = 0 ; k < number_of_trees ; k++)
    {
        fprintf(stderr, "[ -- building tree %d/%d -- ]\n", k + 1, number_of_trees);

        FeaturePool fp_subset;
        Examples examples_subset;

        // ---- per-class bootstrap sampling of the training examples ----
        for (map<int, vector<int> >::const_iterator j = class_examples.begin();
             j != class_examples.end(); j++)
        {
            // copy so the shuffle below does not disturb the shared list
            vector<int> examples_index ( j->second );

            // how many examples of this class go into this tree
            int trainingExamples;
            if (use_simple_balancing)
                trainingExamples = (int)(minExamples * samples_per_tree);
            else
                trainingExamples = (int)(examples_index.size() * samples_per_tree);
            fprintf (stderr, "FPCRandomForests: selection of %d examples for each tree (classno: %d)\n", trainingExamples, j->first );

            // guard against degenerate subsets: fall back to all examples
            if ( (trainingExamples < 3) && ((int)examples_index.size() > trainingExamples) )
            {
                fprintf(stderr, "FPCRandomForests: number of examples < 3 !! minExamples=%f, trainingExamples=%d\n",
                        minExamples, trainingExamples);
                trainingExamples = examples_index.size();
                fprintf(stderr, "FPCRandomForests: I will use all %d examples of this class !!\n", trainingExamples);
            }

            // TODO: optional include examples weights
            // NOTE(review): random_shuffle typically uses rand(), which is
            // not guaranteed thread-safe inside this parallel region —
            // confirm whether reproducibility/thread-safety matters here
            if(samples_per_tree < 1.0)
                random_shuffle(examples_index.begin(), examples_index.end());

            examples_subset.reserve(examples_subset.size() + trainingExamples);
            for (int e = 0 ; e < trainingExamples ; e++)
            {
                examples_subset.push_back(examples[examples_index[e]]);
                // remember global index for getAllLeafNodes() translation
                exselection[k].push_back(examples_index[e]);
            }

            // set out of bag trees
            for (uint e = trainingExamples; e < examples_index.size() ; e++)
            {
                int index = examples_index[e];  // shadows the outer 'index'
                // outofbagtrees is shared across threads
                #pragma omp critical
                outofbagtrees[index].push_back(k);
            }
        }

        /******* select a random feature set *******/
        FeaturePool fpTree ( fp );
        int featuresCountT = featuresCount;
        if (featuresCountT >= (int)fpTree.size()) featuresCountT = fpTree.size();

        random_shuffle(fpTree.begin(), fpTree.end());
        if (featuresCountT < (int)fpTree.size())
        {
            fp_subset.insert(fp_subset.begin(), fpTree.begin(), fpTree.begin() + featuresCountT);
        }
        else
        {
            fp_subset = fpTree;
        }
        fp_subset.initRandomFeatureSelection();

        /******* training of an individual tree ****/
        DecisionTree *tree = new DecisionTree(conf, maxClassNo);
        builder->build(*tree, fp_subset, examples_subset, maxClassNo);

        /******* prune tree using a simple minimum entropy criterion *****/
        if (minimum_entropy != 0.0)
            tree->pruneTreeEntropy(minimum_entropy);

        /******* drop some precalculated data if memory should be saved **/
        // critical: CachedExample objects may be shared between trees
        #pragma omp critical
        if (memory_efficient)
        {
            set<CachedExample *> alreadyDropped;
            for (Examples::iterator i = examples_subset.begin();
                 i != examples_subset.end();
                 i++)
            {
                CachedExample *ce = i->second.ce;
                if (alreadyDropped.find(ce) == alreadyDropped.end())
                {
                    ce->dropPreCached();
                    alreadyDropped.insert(ce);
                }
            }
        }

        /******* add individual tree to ensemble *****/
        // forest is shared; guard the push_back
        #pragma omp critical
        forest.push_back(tree);
    }

    if (enableOutOfBagEstimates)
        calcOutOfBagEstimates(outofbagtrees, examples);
}
/**
 * Restore a previously store()d forest from a stream. Expects a leading
 * "FOREST" tag followed by "TREE <index>" entries; each tree is restored
 * via DecisionTree::restore and (optionally) entropy-pruned.
 *
 * @param is     input stream positioned at the "FOREST" tag
 * @param format unused here; store()/restore of trees may interpret it
 *
 * NOTE(review): the loop continues only while the next tag is "TREE" —
 * this assumes DecisionTree::restore consumes everything up to and
 * including its "ENDTREE" marker; verify against DecisionTree::restore.
 */
void FPCRandomForests::restore(istream & is, int format)
{
    std::string tag;
    int index;  // tree index as written by store(); read but not used

    if( (is >> tag) && (tag == "FOREST") )
    {
        while ( (is >> tag) && (tag == "TREE") )
        {
            is >> index;
            DecisionTree *dt = new DecisionTree ( conf, maxClassNo );
            dt->restore ( is );
            // apply the same pruning a freshly trained tree would get
            if ( minimum_entropy != 0.0 )
                dt->pruneTreeEntropy ( minimum_entropy );
            forest.push_back(dt);
        }
    }
}
  329. void FPCRandomForests::store(ostream & os, int format) const
  330. {
  331. os << "FOREST " << endl;
  332. int index = 0;
  333. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  334. i != forest.end();
  335. i++, index++ )
  336. {
  337. const DecisionTree & dt = *(*i);
  338. os << "TREE " << index << endl;
  339. dt.store ( os, format );
  340. os << "ENDTREE ";
  341. }
  342. os << endl;
  343. os << "ENDFOREST " << endl;
  344. }
  345. void FPCRandomForests::clear()
  346. {
  347. for ( vector<DecisionTree *>::iterator i = forest.begin();
  348. i != forest.end();
  349. i++ )
  350. delete (*i);
  351. forest.clear();
  352. }
  353. void FPCRandomForests::indexDescendants(map<DecisionNode *, pair<long, int> > & index) const
  354. {
  355. long maxindex = 0;
  356. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  357. i != forest.end();
  358. i++ )
  359. (*i)->indexDescendants ( index, maxindex );
  360. }
  361. void FPCRandomForests::resetCounters()
  362. {
  363. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  364. i != forest.end();
  365. i++ )
  366. (*i)->resetCounters ();
  367. }
  368. FeaturePoolClassifier *FPCRandomForests::clone() const
  369. {
  370. FPCRandomForests *o = new FPCRandomForests(conf, confsection);
  371. o->maxClassNo = maxClassNo;
  372. return o;
  373. }
  374. void FPCRandomForests::setComplexity(int size)
  375. {
  376. fprintf (stderr, "FPCRandomForests: set complexity to %d, overwriting current value %d\n",
  377. size, number_of_trees );
  378. number_of_trees = size;
  379. }