// FPCRandomForests.cpp
  1. /**
  2. * @file FPCRandomForests.cpp
  3. * @brief implementation of random set forests
  4. * @author Erik Rodner
  5. * @date 04/24/2008
  6. */
  7. #ifdef NICE_USELIB_OPENMP
  8. #include <omp.h>
  9. #endif
  10. #include <iostream>
  11. #include "core/image/ImageT.h"
  12. #include "core/imagedisplay/ImageDisplay.h"
  13. #include "vislearning/classifier/fpclassifier/randomforest/FPCRandomForests.h"
  14. #include "vislearning/classifier/fpclassifier/randomforest/DTBStandard.h"
  15. #include "vislearning/classifier/fpclassifier/randomforest/DTBRandom.h"
  16. #include "vislearning/classifier/fpclassifier/randomforest/DTBClusterRandom.h"
  17. #include "vislearning/cbaselib/FeaturePool.h"
  18. using namespace OBJREC;
  19. using namespace std;
  20. using namespace NICE;
  21. FPCRandomForests::FPCRandomForests()
  22. {
  23. builder = NULL;
  24. minimum_entropy = 0.0;
  25. enableOutOfBagEstimates = false;
  26. maxClassNo = -1;
  27. }
  28. FPCRandomForests::FPCRandomForests(const Config *_conf, std::string section) : conf(_conf)
  29. {
  30. std::string builder_method = conf->gS(section, "builder", "random");
  31. minimum_entropy = conf->gD(section, "minimum_entropy", 0.0);
  32. enableOutOfBagEstimates = conf->gB(section, "enable_out_of_bag_estimates", false);
  33. maxClassNo = -1;
  34. confsection = section;
  35. if ( builder_method == "none" ) {
  36. // do not initialize
  37. builder = NULL;
  38. } else {
  39. number_of_trees = conf->gI(section, "number_of_trees", 20 );
  40. features_per_tree = conf->gD(section, "features_per_tree", 1.0 );
  41. samples_per_tree = conf->gD(section, "samples_per_tree", 0.2 );
  42. use_simple_balancing = conf->gB(section, "use_simple_balancing", false);
  43. weight_examples = conf->gB(section, "weight_examples", false);
  44. memory_efficient = conf->gB(section, "memory_efficient", false);
  45. std::string builder_section = conf->gS(section, "builder_section", "DTBRandom");
  46. if ( builder_method == "standard" )
  47. builder = new DTBStandard ( conf, builder_section );
  48. else if (builder_method == "random" )
  49. builder = new DTBRandom ( conf, builder_section );
  50. else if (builder_method == "cluster_random" )
  51. builder = new DTBClusterRandom ( conf, builder_section );
  52. else {
  53. fprintf (stderr, "DecisionTreeBuilder %s not yet implemented !\n", builder_method.c_str() );
  54. exit(-1);
  55. }
  56. }
  57. }
  58. FPCRandomForests::~FPCRandomForests()
  59. {
  60. for ( vector<DecisionTree *>::iterator i = forest.begin();
  61. i != forest.end();
  62. i++ )
  63. delete (*i);
  64. if ( builder != NULL )
  65. delete builder;
  66. }
  67. void FPCRandomForests::calcOutOfBagEstimates (
  68. vector< vector<int> > & outofbagtrees,
  69. Examples & examples )
  70. {
  71. oobResults.clear ();
  72. // calculate out of bag classification results
  73. // as suggested by Breiman
  74. // out of bag = training data not used to build
  75. // a single tree is used as testing data for this tree
  76. long index = 0;
  77. for ( Examples::iterator k = examples.begin();
  78. k != examples.end(); k++, index++ )
  79. {
  80. int classno_groundtruth = k->first;
  81. const vector<int> & trees = outofbagtrees[index];
  82. if ( trees.size() <= 0 ) continue;
  83. ClassificationResult r = classify ( k->second, trees );
  84. // FIXME: assumption negative class dst is 0
  85. double score = r.scores.get( 0 /*negativeClassDST*/);
  86. oobResults.push_back ( pair<double, int> ( score, classno_groundtruth ) );
  87. }
  88. }
  89. void FPCRandomForests::getAllLeafNodes ( vector<DecisionNode *> & leafNodes)
  90. {
  91. //leafNodes.reserve ( forest.size() );
  92. int z = 0;
  93. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  94. i != forest.end();
  95. i++,z++ )
  96. {
  97. DecisionTree & dt = *(*i);
  98. vector<DecisionNode *> leaves = dt.getAllLeafNodes();
  99. for(int j = 0; j < (int)leaves.size(); j++)
  100. {
  101. for(int k = 0; k < leaves[j]->trainExamplesIndices.size(); k++)
  102. {
  103. leaves[j]->trainExamplesIndices[k] = exselection[z][leaves[j]->trainExamplesIndices[k]];
  104. }
  105. leafNodes.push_back(leaves[j]);
  106. }
  107. }
  108. }
  109. void FPCRandomForests::getLeafNodes ( Example & pce,
  110. vector<DecisionNode *> & leafNodes,
  111. int depth )
  112. {
  113. leafNodes.reserve ( forest.size() );
  114. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  115. i != forest.end();
  116. i++ )
  117. {
  118. DecisionTree & dt = *(*i);
  119. DecisionNode *leaf = dt.getLeafNode ( pce, depth );
  120. leafNodes.push_back ( leaf );
  121. }
  122. }
  123. ClassificationResult FPCRandomForests::classify ( Example & pce,
  124. const vector<int> & outofbagtrees )
  125. {
  126. // classify using only a selection of all trees
  127. // contained in outofbagtrees
  128. FullVector overall_distribution;
  129. for ( vector<int>::const_iterator i = outofbagtrees.begin();
  130. i != outofbagtrees.end();
  131. i++ )
  132. {
  133. assert ( *i < (int)forest.size() );
  134. DecisionTree & dt = *(forest[(*i)]);
  135. FullVector distribution;
  136. dt.traverse ( pce, distribution );
  137. distribution.normalize();
  138. if ( overall_distribution.empty() )
  139. overall_distribution = distribution;
  140. else
  141. overall_distribution.add ( distribution );
  142. }
  143. overall_distribution.normalize();
  144. int classno = overall_distribution.maxElement();
  145. return ClassificationResult(classno, overall_distribution);
  146. }
  147. ClassificationResult FPCRandomForests::classify(Example & pce)
  148. {
  149. FullVector overall_distribution;
  150. for (vector<DecisionTree *>::const_iterator i = forest.begin();
  151. i != forest.end();
  152. i++)
  153. {
  154. DecisionTree & dt = *(*i);
  155. FullVector distribution;
  156. dt.traverse(pce, distribution);
  157. distribution.normalize();
  158. if (overall_distribution.empty())
  159. overall_distribution = distribution;
  160. else
  161. overall_distribution.add(distribution);
  162. }
  163. overall_distribution.normalize();
  164. int classno = overall_distribution.maxElement();
  165. return ClassificationResult(classno, overall_distribution);
  166. }
  167. int FPCRandomForests::classify_optimize(Example & pce)
  168. {
  169. FullVector overall_distribution;
  170. for (vector<DecisionTree *>::const_iterator i = forest.begin();
  171. i != forest.end();
  172. i++)
  173. {
  174. DecisionTree & dt = *(*i);
  175. FullVector distribution;
  176. dt.traverse(pce, distribution);
  177. if (overall_distribution.empty())
  178. overall_distribution = distribution;
  179. else
  180. overall_distribution.add(distribution);
  181. }
  182. return overall_distribution.maxElement();
  183. }
void FPCRandomForests::train(FeaturePool & fp, Examples & examples)
{
    // Trains the forest: builds number_of_trees decision trees in parallel,
    // each on a per-class random sample of the examples and a random subset
    // of the features; optionally computes out-of-bag estimates afterwards.
    fprintf (stderr,"FPCRandomForests::train()\n");
    assert(builder != NULL);

    if (maxClassNo < 0)
        maxClassNo = examples.getMaxClassNo();

    // count the examples per class and remember which example indices
    // belong to which class
    FullVector example_distribution(maxClassNo + 1);
    map<int, vector<int> > class_examples;
    long index = 0;
    for (Examples::const_iterator i = examples.begin(); i != examples.end(); i++, index++)
    {
        int classno = i->first;
        example_distribution[classno]++;
        class_examples[classno].push_back(index);
    }

    // optional inverse-frequency weighting to counter class imbalance
    if (weight_examples)
    {
        for (Examples::iterator i = examples.begin();
                i != examples.end();
                i++, index++)
            i->second.weight = examples.size() / example_distribution[i->first];
    }

    // determine the size of the smallest non-empty class (used when
    // use_simple_balancing draws the same number of samples per class)
    double minExamples = (double)examples.size();
    int minExamplesClassNo = 0;
    for (int i = 0 ; i < example_distribution.size() ; i++)
    {
        double val = example_distribution[i];
        if (minExamples > val && val != 0.0)
        {
            minExamples = val;
            minExamplesClassNo = i;
        }
    }
    fprintf(stderr, "FPCRandomForests: minimum number of examples: %f (classno: %d)\n", minExamples, minExamplesClassNo);

    // number of features each individual tree gets to see
    int featuresCount = (int)(fp.size() * features_per_tree);
    fprintf(stderr, "FPCRandomForests: number of features %d\n", (int)fp.size());

    // outofbagtrees[e] lists the trees that did NOT train on example e
    vector< vector<int> > outofbagtrees;
    outofbagtrees.resize(examples.size());

    // pre-create the per-tree selection vectors so that the parallel loop
    // below can write to exselection[k] without resizing the outer vector
    for (int k = 0 ; k < number_of_trees ; k++)
    {
        vector<int> tmp;
        exselection.push_back(tmp);
    }

    #pragma omp parallel for
    for (int k = 0 ; k < number_of_trees ; k++)
    {
        fprintf(stderr, "[ -- building tree %d/%d -- ]\n", k + 1, number_of_trees);

        FeaturePool fp_subset;
        Examples examples_subset;

        // draw a per-class sample of the training examples for this tree
        for (map<int, vector<int> >::const_iterator j = class_examples.begin();
                j != class_examples.end(); j++)
        {
            vector<int> examples_index ( j->second );
            int trainingExamples;
            if (use_simple_balancing)
                // same sample size for every class, based on the smallest one
                trainingExamples = (int)(minExamples * samples_per_tree);
            else
                trainingExamples = (int)(examples_index.size() * samples_per_tree);
            fprintf (stderr, "FPCRandomForests: selection of %d examples for each tree\n", trainingExamples );

            if ( (trainingExamples < 3) && ((int)examples_index.size() > trainingExamples) )
            {
                // sample would be degenerate: fall back to the whole class
                fprintf(stderr, "FPCRandomForests: number of examples < 3 !! minExamples=%f, trainingExamples=%d\n",
                        minExamples, trainingExamples);
                trainingExamples = examples_index.size();
                fprintf(stderr, "FPCRandomForests: I will use all %d examples of this class !!\n", trainingExamples);
            }

            // TODO: optional include examples weights
            // NOTE(review): random_shuffle draws from a shared RNG; calling it
            // from inside this omp parallel loop may not be thread-safe -- confirm
            if(samples_per_tree < 1.0)
                random_shuffle(examples_index.begin(), examples_index.end());

            examples_subset.reserve(examples_subset.size() + trainingExamples);
            for (int e = 0 ; e < trainingExamples ; e++)
            {
                examples_subset.push_back(examples[examples_index[e]]);
                // remember the selection for index translation in getAllLeafNodes()
                exselection[k].push_back(examples_index[e]);
            }

            // set out of bag trees: all remaining examples of this class were
            // not used by tree k
            for (uint e = trainingExamples; e < examples_index.size() ; e++)
            {
                int index = examples_index[e];
                #pragma omp critical
                outofbagtrees[index].push_back(k);
            }
        }

        /******* select a random feature set *******/
        FeaturePool fpTree ( fp );
        int featuresCountT = featuresCount;
        if (featuresCountT >= (int)fpTree.size()) featuresCountT = fpTree.size();

        random_shuffle(fpTree.begin(), fpTree.end());
        if (featuresCountT < (int)fpTree.size())
        {
            fp_subset.insert(fp_subset.begin(), fpTree.begin(), fpTree.begin() + featuresCountT);
        }
        else
        {
            fp_subset = fpTree;
        }
        fp_subset.initRandomFeatureSelection();

        /******* training of an individual tree ****/
        DecisionTree *tree = new DecisionTree(conf, maxClassNo);
        builder->build(*tree, fp_subset, examples_subset, maxClassNo);

        /******* prune tree using a simple minimum entropy criterion *****/
        if (minimum_entropy != 0.0)
            tree->pruneTreeEntropy(minimum_entropy);

        /******* drop some precalculated data if memory should be saved **/
        #pragma omp critical
        if (memory_efficient)
        {
            // drop each cached example only once, even if it backs several examples
            set<CachedExample *> alreadyDropped;
            for (Examples::iterator i = examples_subset.begin();
                    i != examples_subset.end();
                    i++)
            {
                CachedExample *ce = i->second.ce;
                if (alreadyDropped.find(ce) == alreadyDropped.end())
                {
                    ce->dropPreCached();
                    alreadyDropped.insert(ce);
                }
            }
        }

        /******* add individual tree to ensemble *****/
        #pragma omp critical
        forest.push_back(tree);
    }

    if (enableOutOfBagEstimates)
        calcOutOfBagEstimates(outofbagtrees, examples);
}
void FPCRandomForests::restore(istream & is, int format)
{
    // Reads a forest previously written by store().
    // NOTE(review): 'format' is not forwarded to DecisionTree::restore; also
    // the while-condition re-reads a tag after each tree, which only advances
    // past multiple trees if DecisionTree::restore consumes the trailing
    // "ENDTREE" marker written by store() -- TODO confirm.
    std::string tag;
    int index;  // tree index read from the stream (value itself unused)
    if( (is >> tag) && (tag == "FOREST") )
    {
        while ( (is >> tag) && (tag == "TREE") )
        {
            is >> index;
            DecisionTree *dt = new DecisionTree ( conf, maxClassNo );
            dt->restore ( is );
            // apply the same entropy pruning as after training
            if ( minimum_entropy != 0.0 )
                dt->pruneTreeEntropy ( minimum_entropy );
            forest.push_back(dt);
        }
    }
}
  328. void FPCRandomForests::store(ostream & os, int format) const
  329. {
  330. os << "FOREST " << endl;
  331. int index = 0;
  332. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  333. i != forest.end();
  334. i++, index++ )
  335. {
  336. const DecisionTree & dt = *(*i);
  337. os << "TREE " << index << endl;
  338. dt.store ( os, format );
  339. os << "ENDTREE ";
  340. }
  341. os << endl;
  342. os << "ENDFOREST " << endl;
  343. }
  344. void FPCRandomForests::clear()
  345. {
  346. for ( vector<DecisionTree *>::iterator i = forest.begin();
  347. i != forest.end();
  348. i++ )
  349. delete (*i);
  350. forest.clear();
  351. }
  352. void FPCRandomForests::indexDescendants(map<DecisionNode *, pair<long, int> > & index) const
  353. {
  354. long maxindex = 0;
  355. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  356. i != forest.end();
  357. i++ )
  358. (*i)->indexDescendants ( index, maxindex );
  359. }
  360. void FPCRandomForests::resetCounters()
  361. {
  362. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  363. i != forest.end();
  364. i++ )
  365. (*i)->resetCounters ();
  366. }
FeaturePoolClassifier *FPCRandomForests::clone() const
{
    // Creates a fresh classifier with the same configuration and maxClassNo.
    // NOTE(review): the trained trees are NOT copied -- the clone must be
    // trained (or restored) again; presumably intentional, confirm with callers.
    FPCRandomForests *o = new FPCRandomForests(conf, confsection);
    o->maxClassNo = maxClassNo;
    return o;
}
void FPCRandomForests::setComplexity(int size)
{
    // The "complexity" of a random forest is its number of trees; the new
    // value only affects subsequent calls to train().
    fprintf (stderr, "FPCRandomForests: set complexity to %d, overwriting current value %d\n",
             size, number_of_trees );
    number_of_trees = size;
}