  1. /**
  2. * @file FPCRandomForests.cpp
  3. * @brief implementation of random set forests
  4. * @author Erik Rodner
  5. * @date 04/24/2008
  6. */
  7. #ifdef NICE_USELIB_OPENMP
  8. #include <omp.h>
  9. #endif
  10. #include <iostream>
  11. #include "core/image/ImageT.h"
  12. #include "core/imagedisplay/ImageDisplay.h"
  13. #include "vislearning/classifier/fpclassifier/randomforest/FPCRandomForests.h"
  14. #include "vislearning/classifier/fpclassifier/randomforest/DTBStandard.h"
  15. #include "vislearning/classifier/fpclassifier/randomforest/DTBRandom.h"
  16. #include "vislearning/classifier/fpclassifier/randomforest/DTBClusterRandom.h"
  17. #include "vislearning/classifier/fpclassifier/randomforest/DTBRandomOblique.h"
  18. #include "vislearning/cbaselib/FeaturePool.h"
  19. using namespace OBJREC;
  20. using namespace std;
  21. using namespace NICE;
  22. FPCRandomForests::FPCRandomForests()
  23. {
  24. builder = NULL;
  25. minimum_entropy = 0.0;
  26. enableOutOfBagEstimates = false;
  27. maxClassNo = -1;
  28. }
  29. FPCRandomForests::FPCRandomForests(const Config *_conf, std::string section) : conf(_conf)
  30. {
  31. std::string builder_method = conf->gS(section, "builder", "random");
  32. minimum_entropy = conf->gD(section, "minimum_entropy", 0.0);
  33. enableOutOfBagEstimates = conf->gB(section, "enable_out_of_bag_estimates", false);
  34. maxClassNo = -1;
  35. confsection = section;
  36. if ( builder_method == "none" ) {
  37. // do not initialize
  38. builder = NULL;
  39. } else {
  40. number_of_trees = conf->gI(section, "number_of_trees", 20 );
  41. features_per_tree = conf->gD(section, "features_per_tree", 1.0 );
  42. samples_per_tree = conf->gD(section, "samples_per_tree", 0.2 );
  43. use_simple_balancing = conf->gB(section, "use_simple_balancing", false);
  44. weight_examples = conf->gB(section, "weight_examples", false);
  45. memory_efficient = conf->gB(section, "memory_efficient", false);
  46. std::string builder_section = conf->gS(section, "builder_section", "DTBRandom");
  47. if ( builder_method == "standard" )
  48. builder = new DTBStandard ( conf, builder_section );
  49. else if (builder_method == "random" )
  50. builder = new DTBRandom ( conf, builder_section );
  51. else if (builder_method == "cluster_random" )
  52. builder = new DTBClusterRandom ( conf, builder_section );
  53. else if (builder_method == "random_oblique" )
  54. builder = new DTBRandomOblique ( conf, builder_section );
  55. else {
  56. fprintf (stderr, "DecisionTreeBuilder %s not yet implemented !\n", builder_method.c_str() );
  57. exit(-1);
  58. }
  59. }
  60. }
  61. FPCRandomForests::~FPCRandomForests()
  62. {
  63. for ( vector<DecisionTree *>::iterator i = forest.begin();
  64. i != forest.end();
  65. i++ )
  66. delete (*i);
  67. if ( builder != NULL )
  68. delete builder;
  69. }
  70. void FPCRandomForests::calcOutOfBagEstimates (
  71. vector< vector<int> > & outofbagtrees,
  72. Examples & examples )
  73. {
  74. oobResults.clear ();
  75. // calculate out of bag classification results
  76. // as suggested by Breiman
  77. // out of bag = training data not used to build
  78. // a single tree is used as testing data for this tree
  79. long index = 0;
  80. for ( Examples::iterator k = examples.begin();
  81. k != examples.end(); k++, index++ )
  82. {
  83. int classno_groundtruth = k->first;
  84. const vector<int> & trees = outofbagtrees[index];
  85. if ( trees.size() <= 0 ) continue;
  86. ClassificationResult r = classify ( k->second, trees );
  87. // FIXME: assumption negative class dst is 0
  88. double score = r.scores.get( 0 /*negativeClassDST*/);
  89. oobResults.push_back ( pair<double, int> ( score, classno_groundtruth ) );
  90. }
  91. }
  92. void FPCRandomForests::getAllLeafNodes ( vector<DecisionNode *> & leafNodes)
  93. {
  94. //leafNodes.reserve ( forest.size() );
  95. int z = 0;
  96. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  97. i != forest.end();
  98. i++,z++ )
  99. {
  100. DecisionTree & dt = *(*i);
  101. vector<DecisionNode *> leaves = dt.getAllLeafNodes();
  102. for(int j = 0; j < (int)leaves.size(); j++)
  103. {
  104. for(int k = 0; k < leaves[j]->trainExamplesIndices.size(); k++)
  105. {
  106. leaves[j]->trainExamplesIndices[k] = exselection[z][leaves[j]->trainExamplesIndices[k]];
  107. }
  108. leafNodes.push_back(leaves[j]);
  109. }
  110. }
  111. }
  112. void FPCRandomForests::getLeafNodes ( Example & pce,
  113. vector<DecisionNode *> & leafNodes,
  114. int depth )
  115. {
  116. leafNodes.reserve ( forest.size() );
  117. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  118. i != forest.end();
  119. i++ )
  120. {
  121. DecisionTree & dt = *(*i);
  122. DecisionNode *leaf = dt.getLeafNode ( pce, depth );
  123. leafNodes.push_back ( leaf );
  124. }
  125. }
  126. ClassificationResult FPCRandomForests::classify ( Example & pce,
  127. const vector<int> & outofbagtrees )
  128. {
  129. // classify using only a selection of all trees
  130. // contained in outofbagtrees
  131. FullVector overall_distribution;
  132. for ( vector<int>::const_iterator i = outofbagtrees.begin();
  133. i != outofbagtrees.end();
  134. i++ )
  135. {
  136. assert ( *i < (int)forest.size() );
  137. DecisionTree & dt = *(forest[(*i)]);
  138. FullVector distribution;
  139. dt.traverse ( pce, distribution );
  140. distribution.normalize();
  141. if ( overall_distribution.empty() )
  142. overall_distribution = distribution;
  143. else
  144. overall_distribution.add ( distribution );
  145. }
  146. overall_distribution.normalize();
  147. int classno = overall_distribution.maxElement();
  148. return ClassificationResult(classno, overall_distribution);
  149. }
  150. ClassificationResult FPCRandomForests::classify(Example & pce)
  151. {
  152. FullVector overall_distribution;
  153. for (vector<DecisionTree *>::const_iterator i = forest.begin();
  154. i != forest.end();
  155. i++)
  156. {
  157. DecisionTree & dt = *(*i);
  158. FullVector distribution;
  159. dt.traverse(pce, distribution);
  160. distribution.normalize();
  161. if (overall_distribution.empty())
  162. overall_distribution = distribution;
  163. else
  164. overall_distribution.add(distribution);
  165. }
  166. overall_distribution.normalize();
  167. int classno = overall_distribution.maxElement();
  168. return ClassificationResult(classno, overall_distribution);
  169. }
  170. int FPCRandomForests::classify_optimize(Example & pce)
  171. {
  172. FullVector overall_distribution;
  173. for (vector<DecisionTree *>::const_iterator i = forest.begin();
  174. i != forest.end();
  175. i++)
  176. {
  177. DecisionTree & dt = *(*i);
  178. FullVector distribution;
  179. dt.traverse(pce, distribution);
  180. if (overall_distribution.empty())
  181. overall_distribution = distribution;
  182. else
  183. overall_distribution.add(distribution);
  184. }
  185. return overall_distribution.maxElement();
  186. }
/**
 * Trains the random forest: builds number_of_trees decision trees, each
 * on a random subset of the examples (bagging, per class) and a random
 * subset of the feature pool. Optionally applies inverse-frequency
 * example weighting, simple class balancing, memory-efficient dropping
 * of cached data, and out-of-bag estimation afterwards.
 *
 * @param fp pool of candidate features
 * @param examples training examples (classno / example pairs)
 */
void FPCRandomForests::train(FeaturePool & fp, Examples & examples)
{
    fprintf (stderr,"FPCRandomForests::train()\n");
    assert(builder != NULL);

    if (maxClassNo < 0)
        maxClassNo = examples.getMaxClassNo();

    // count examples per class and remember each example's global index
    FullVector example_distribution(maxClassNo + 1);
    map<int, vector<int> > class_examples;
    long index = 0;
    for (Examples::const_iterator i = examples.begin(); i != examples.end(); i++, index++)
    {
        int classno = i->first;
        example_distribution[classno]++;
        class_examples[classno].push_back(index);
    }

    // optional inverse-frequency weighting to counter class imbalance
    if (weight_examples)
    {
        for (Examples::iterator i = examples.begin();
                i != examples.end();
                i++, index++)
            i->second.weight = examples.size() / example_distribution[i->first];
    }

    // find the smallest non-empty class; used by simple balancing below
    double minExamples = (double)examples.size();
    int minExamplesClassNo = 0;
    for (int i = 0 ; i < example_distribution.size() ; i++)
    {
        double val = example_distribution[i];
        if (minExamples > val && val != 0.0)
        {
            minExamples = val;
            minExamplesClassNo = i;
        }
    }
    fprintf(stderr, "FPCRandomForests: minimum number of examples: %f (classno: %d)\n", minExamples, minExamplesClassNo);

    // number of features each tree may see
    int featuresCount = (int)(fp.size() * features_per_tree);
    fprintf(stderr, "FPCRandomForests: number of features %d\n", (int)fp.size());

    // outofbagtrees[i] collects the trees that did NOT train on example i
    vector< vector<int> > outofbagtrees;
    outofbagtrees.resize(examples.size());

    // pre-create the per-tree selection bookkeeping so the parallel loop
    // below can write to exselection[k] without resizing the outer vector
    for (int k = 0 ; k < number_of_trees ; k++)
    {
        vector<int> tmp;
        exselection.push_back(tmp);
    }

#pragma omp parallel for
    for (int k = 0 ; k < number_of_trees ; k++)
    {
        fprintf(stderr, "[ -- building tree %d/%d -- ]\n", k + 1, number_of_trees);

        FeaturePool fp_subset;
        Examples examples_subset;

        // ---- bagging: draw a (possibly balanced) example subset per class
        for (map<int, vector<int> >::const_iterator j = class_examples.begin();
                j != class_examples.end(); j++)
        {
            vector<int> examples_index ( j->second );
            int trainingExamples;
            if (use_simple_balancing)
                trainingExamples = (int)(minExamples * samples_per_tree);
            else
                trainingExamples = (int)(examples_index.size() * samples_per_tree);
            fprintf (stderr, "FPCRandomForests: selection of %d examples for each tree\n", trainingExamples );

            if ( (trainingExamples < 3) && ((int)examples_index.size() > trainingExamples) )
            {
                // too few samples would degenerate the tree: fall back to
                // using every available example of this class
                fprintf(stderr, "FPCRandomForests: number of examples < 3 !! minExamples=%f, trainingExamples=%d\n",
                        minExamples, trainingExamples);
                trainingExamples = examples_index.size();
                fprintf(stderr, "FPCRandomForests: I will use all %d examples of this class !!\n", trainingExamples);
            }

            // TODO: optional include examples weights
            // shuffle only when subsampling; a full draw needs no shuffle
            if(samples_per_tree < 1.0)
                random_shuffle(examples_index.begin(), examples_index.end());

            examples_subset.reserve(examples_subset.size() + trainingExamples);
            for (int e = 0 ; e < trainingExamples ; e++)
            {
                examples_subset.push_back(examples[examples_index[e]]);
                exselection[k].push_back(examples_index[e]);
            }

            // set out of bag trees: every example NOT drawn for this tree
            // may later be used to evaluate it (shared vector -> critical)
            for (uint e = trainingExamples; e < examples_index.size() ; e++)
            {
                int index = examples_index[e];
#pragma omp critical
                outofbagtrees[index].push_back(k);
            }
        }

        /******* select a random feature set *******/
        FeaturePool fpTree ( fp );

        int featuresCountT = featuresCount;
        if (featuresCountT >= (int)fpTree.size()) featuresCountT = fpTree.size();

        random_shuffle(fpTree.begin(), fpTree.end());
        if (featuresCountT < (int)fpTree.size())
        {
            fp_subset.insert(fp_subset.begin(), fpTree.begin(), fpTree.begin() + featuresCountT);
        }
        else
        {
            fp_subset = fpTree;
        }

        // recompute the random-selection distribution over the subset
        fp_subset.initRandomFeatureSelection();

        /******* training of an individual tree ****/
        DecisionTree *tree = new DecisionTree(conf, maxClassNo);
        builder->build(*tree, fp_subset, examples_subset, maxClassNo);

        /******* prune tree using a simple minimum entropy criterion *****/
        if (minimum_entropy != 0.0)
            tree->pruneTreeEntropy(minimum_entropy);

        /******* drop some precalculated data if memory should be saved **/
#pragma omp critical
        if (memory_efficient)
        {
            // a CachedExample may be shared by several examples: drop each
            // cache exactly once
            set<CachedExample *> alreadyDropped;
            for (Examples::iterator i = examples_subset.begin();
                    i != examples_subset.end();
                    i++)
            {
                CachedExample *ce = i->second.ce;
                if (alreadyDropped.find(ce) == alreadyDropped.end())
                {
                    ce->dropPreCached();
                    alreadyDropped.insert(ce);
                }
            }
        }

        /******* add individual tree to ensemble *****/
#pragma omp critical
        forest.push_back(tree);
    }

    if (enableOutOfBagEstimates)
        calcOutOfBagEstimates(outofbagtrees, examples);
}
  314. void FPCRandomForests::restore(istream & is, int format)
  315. {
  316. std::string tag;
  317. int index;
  318. if( (is >> tag) && (tag == "FOREST") )
  319. {
  320. while ( (is >> tag) && (tag == "TREE") )
  321. {
  322. is >> index;
  323. DecisionTree *dt = new DecisionTree ( conf, maxClassNo );
  324. dt->restore ( is );
  325. if ( minimum_entropy != 0.0 )
  326. dt->pruneTreeEntropy ( minimum_entropy );
  327. forest.push_back(dt);
  328. }
  329. }
  330. }
  331. void FPCRandomForests::store(ostream & os, int format) const
  332. {
  333. os << "FOREST " << endl;
  334. int index = 0;
  335. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  336. i != forest.end();
  337. i++, index++ )
  338. {
  339. const DecisionTree & dt = *(*i);
  340. os << "TREE " << index << endl;
  341. dt.store ( os, format );
  342. os << "ENDTREE ";
  343. }
  344. os << endl;
  345. os << "ENDFOREST " << endl;
  346. }
  347. void FPCRandomForests::clear()
  348. {
  349. for ( vector<DecisionTree *>::iterator i = forest.begin();
  350. i != forest.end();
  351. i++ )
  352. delete (*i);
  353. forest.clear();
  354. }
  355. void FPCRandomForests::indexDescendants(map<DecisionNode *, pair<long, int> > & index) const
  356. {
  357. long maxindex = 0;
  358. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  359. i != forest.end();
  360. i++ )
  361. (*i)->indexDescendants ( index, maxindex );
  362. }
  363. void FPCRandomForests::resetCounters()
  364. {
  365. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  366. i != forest.end();
  367. i++ )
  368. (*i)->resetCounters ();
  369. }
  370. FeaturePoolClassifier *FPCRandomForests::clone() const
  371. {
  372. FPCRandomForests *o = new FPCRandomForests(conf, confsection);
  373. o->maxClassNo = maxClassNo;
  374. return o;
  375. }
  376. void FPCRandomForests::setComplexity(int size)
  377. {
  378. fprintf (stderr, "FPCRandomForests: set complexity to %d, overwriting current value %d\n",
  379. size, number_of_trees );
  380. number_of_trees = size;
  381. }