// FPCRandomForests.cpp
  1. /**
  2. * @file FPCRandomForests.cpp
  3. * @brief implementation of random set forests
  4. * @author Erik Rodner
  5. * @date 04/24/2008
  6. */
  7. #ifdef NICE_USELIB_OPENMP
  8. #include <omp.h>
  9. #endif
  10. #include <iostream>
  11. #include "core/image/ImageT.h"
  12. #include "core/imagedisplay/ImageDisplay.h"
  13. #include "vislearning/classifier/fpclassifier/randomforest/FPCRandomForests.h"
  14. #include "vislearning/classifier/fpclassifier/randomforest/DTBStandard.h"
  15. #include "vislearning/classifier/fpclassifier/randomforest/DTBRandom.h"
  16. #include "vislearning/classifier/fpclassifier/randomforest/DTBClusterRandom.h"
  17. #include "vislearning/classifier/fpclassifier/randomforest/DTBObliqueLS.h"
  18. #include "vislearning/cbaselib/FeaturePool.h"
  19. using namespace OBJREC;
  20. using namespace std;
  21. using namespace NICE;
  22. FPCRandomForests::FPCRandomForests()
  23. {
  24. builder = NULL;
  25. minimum_entropy = 0.0;
  26. enableOutOfBagEstimates = false;
  27. maxClassNo = -1;
  28. }
  29. FPCRandomForests::FPCRandomForests(const Config *_conf, std::string section) : conf(_conf)
  30. {
  31. std::string builder_method = conf->gS(section, "builder", "random");
  32. minimum_entropy = conf->gD(section, "minimum_entropy", 0.0);
  33. enableOutOfBagEstimates = conf->gB(section, "enable_out_of_bag_estimates", false);
  34. maxClassNo = -1;
  35. confsection = section;
  36. if ( builder_method == "none" ) {
  37. // do not initialize
  38. builder = NULL;
  39. } else {
  40. number_of_trees = conf->gI(section, "number_of_trees", 20 );
  41. features_per_tree = conf->gD(section, "features_per_tree", 1.0 );
  42. samples_per_tree = conf->gD(section, "samples_per_tree", 0.2 );
  43. use_simple_balancing = conf->gB(section, "use_simple_balancing", false);
  44. weight_examples = conf->gB(section, "weight_examples", false);
  45. memory_efficient = conf->gB(section, "memory_efficient", false);
  46. std::string builder_section = conf->gS(section, "builder_section", "DTBRandom");
  47. if ( builder_method == "standard" )
  48. builder = new DTBStandard ( conf, builder_section );
  49. else if (builder_method == "random" )
  50. builder = new DTBRandom ( conf, builder_section );
  51. else if (builder_method == "cluster_random" )
  52. builder = new DTBClusterRandom ( conf, builder_section );
  53. else if (builder_method == "oblique_ls" )
  54. builder = new DTBObliqueLS ( conf, builder_section );
  55. else {
  56. fprintf (stderr, "DecisionTreeBuilder %s not yet implemented !\n", builder_method.c_str() );
  57. exit(-1);
  58. }
  59. }
  60. }
  61. FPCRandomForests::~FPCRandomForests()
  62. {
  63. for ( vector<DecisionTree *>::iterator i = forest.begin();
  64. i != forest.end();
  65. i++ )
  66. delete (*i);
  67. if ( builder != NULL )
  68. delete builder;
  69. }
  70. void FPCRandomForests::calcOutOfBagEstimates (
  71. vector< vector<int> > & outofbagtrees,
  72. Examples & examples )
  73. {
  74. oobResults.clear ();
  75. // calculate out of bag classification results
  76. // as suggested by Breiman
  77. // out of bag = training data not used to build
  78. // a single tree is used as testing data for this tree
  79. long index = 0;
  80. for ( Examples::iterator k = examples.begin();
  81. k != examples.end(); k++, index++ )
  82. {
  83. int classno_groundtruth = k->first;
  84. const vector<int> & trees = outofbagtrees[index];
  85. if ( trees.size() <= 0 ) continue;
  86. ClassificationResult r = classify ( k->second, trees );
  87. // FIXME: assumption negative class dst is 0
  88. double score = r.scores.get( 0 /*negativeClassDST*/);
  89. oobResults.push_back ( pair<double, int> ( score, classno_groundtruth ) );
  90. }
  91. }
  92. void FPCRandomForests::getAllLeafNodes ( vector<DecisionNode *> & leafNodes)
  93. {
  94. //leafNodes.reserve ( forest.size() );
  95. int z = 0;
  96. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  97. i != forest.end();
  98. i++,z++ )
  99. {
  100. DecisionTree & dt = *(*i);
  101. vector<DecisionNode *> leaves = dt.getAllLeafNodes();
  102. for(int j = 0; j < (int)leaves.size(); j++)
  103. {
  104. for(int k = 0; k < leaves[j]->trainExamplesIndices.size(); k++)
  105. {
  106. leaves[j]->trainExamplesIndices[k] = exselection[z][leaves[j]->trainExamplesIndices[k]];
  107. }
  108. leafNodes.push_back(leaves[j]);
  109. }
  110. }
  111. }
  112. void FPCRandomForests::getLeafNodes ( Example & pce,
  113. vector<DecisionNode *> & leafNodes,
  114. int depth )
  115. {
  116. leafNodes.reserve ( forest.size() );
  117. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  118. i != forest.end();
  119. i++ )
  120. {
  121. DecisionTree & dt = *(*i);
  122. DecisionNode *leaf = dt.getLeafNode ( pce, depth );
  123. leafNodes.push_back ( leaf );
  124. }
  125. }
  126. ClassificationResult FPCRandomForests::classify ( Example & pce,
  127. const vector<int> & outofbagtrees )
  128. {
  129. // classify using only a selection of all trees
  130. // contained in outofbagtrees
  131. FullVector overall_distribution;
  132. for ( vector<int>::const_iterator i = outofbagtrees.begin();
  133. i != outofbagtrees.end();
  134. i++ )
  135. {
  136. assert ( *i < (int)forest.size() );
  137. DecisionTree & dt = *(forest[(*i)]);
  138. FullVector distribution;
  139. dt.traverse ( pce, distribution );
  140. distribution.normalize();
  141. if ( overall_distribution.empty() )
  142. overall_distribution = distribution;
  143. else
  144. overall_distribution.add ( distribution );
  145. }
  146. overall_distribution.normalize();
  147. int classno = overall_distribution.maxElement();
  148. return ClassificationResult(classno, overall_distribution);
  149. }
  150. ClassificationResult FPCRandomForests::classify(Example & pce)
  151. {
  152. FullVector overall_distribution;
  153. for (vector<DecisionTree *>::const_iterator i = forest.begin();
  154. i != forest.end();
  155. i++)
  156. {
  157. DecisionTree & dt = *(*i);
  158. FullVector distribution;
  159. dt.traverse(pce, distribution);
  160. distribution.normalize();
  161. if (overall_distribution.empty())
  162. overall_distribution = distribution;
  163. else
  164. overall_distribution.add(distribution);
  165. }
  166. overall_distribution.normalize();
  167. int classno = overall_distribution.maxElement();
  168. return ClassificationResult(classno, overall_distribution);
  169. }
  170. int FPCRandomForests::classify_optimize(Example & pce)
  171. {
  172. FullVector overall_distribution;
  173. for (vector<DecisionTree *>::const_iterator i = forest.begin();
  174. i != forest.end();
  175. i++)
  176. {
  177. DecisionTree & dt = *(*i);
  178. FullVector distribution;
  179. dt.traverse(pce, distribution);
  180. if (overall_distribution.empty())
  181. overall_distribution = distribution;
  182. else
  183. overall_distribution.add(distribution);
  184. }
  185. return overall_distribution.maxElement();
  186. }
/**
 * Train the random forest: builds number_of_trees trees, each on a random
 * (optionally class-balanced) subsample of the examples and a random subset
 * of the feature pool. Optionally computes out-of-bag estimates afterwards.
 * @param fp pool of candidate features
 * @param examples training examples (weights may be modified if
 *                 weight_examples is enabled)
 */
void FPCRandomForests::train(FeaturePool & fp, Examples & examples)
{
  assert(builder != NULL);

  if (maxClassNo < 0)
    maxClassNo = examples.getMaxClassNo();

  FullVector example_distribution(maxClassNo + 1);
  map<int, vector<int> > class_examples;

  // histogram of class frequencies + per-class lists of example indices
  long index = 0;
  for (Examples::const_iterator i = examples.begin(); i != examples.end(); i++, index++)
  {
    int classno = i->first;
    example_distribution[classno]++;
    class_examples[classno].push_back(index);
  }

  if (weight_examples)
  {
    // inverse-frequency weighting: rare classes get larger weights
    // NOTE(review): the "index++" here is a leftover from the loop above and
    // has no effect on this loop's behavior
    for ( Examples::iterator i = examples.begin();
          i != examples.end(); i++, index++ )
      i->second.weight = examples.size() / example_distribution[i->first];
  }

  // find the smallest non-empty class (used for simple balancing)
  double minExamples = (double)examples.size();
  int minExamplesClassNo = 0;
  for (int i = 0 ; i < example_distribution.size() ; i++)
  {
    double val = example_distribution[i];
    if (minExamples > val && val != 0.0)
    {
      minExamples = val;
      minExamplesClassNo = i;
    }
  }

  fprintf(stderr, "FPCRandomForests: minimum number of examples: %d (classno: %d)\n", (int)minExamples, minExamplesClassNo);

  int featuresCount = (int)(fp.size() * features_per_tree);
  fprintf(stderr, "FPCRandomForests: number of features %d\n", (int)fp.size());

  // outofbagtrees[e] = indices of trees that did NOT train on example e
  vector< vector<int> > outofbagtrees;
  outofbagtrees.resize(examples.size());

  // pre-size exselection so the parallel loop below can index it by tree
  for (int k = 0 ; k < number_of_trees ; k++)
  {
    vector<int> tmp;
    exselection.push_back(tmp);
  }

#pragma omp parallel for
  for (int k = 0 ; k < number_of_trees ; k++)
  {
    fprintf(stderr, "FPCRandomForests: [ -- building tree %d/%d -- ]\n", k + 1, number_of_trees);

    FeaturePool fp_subset;
    Examples examples_subset;

    // draw a per-class subsample of the training examples for this tree
    for (map<int, vector<int> >::const_iterator j = class_examples.begin();
         j != class_examples.end(); j++)
    {
      vector<int> examples_index ( j->second );
      int trainingExamples;
      if (use_simple_balancing)
        // same number of samples for every class, derived from the
        // smallest class
        trainingExamples = (int)(minExamples * samples_per_tree);
      else
        trainingExamples = (int)(examples_index.size() * samples_per_tree);

      fprintf (stderr, "FPCRandomForests: selection of %d examples for each tree (classno: %d)\n", trainingExamples, j->first );

      if ( (trainingExamples < 3) && ((int)examples_index.size() > trainingExamples) )
      {
        // too few samples for a sensible split: fall back to all examples
        // of this class
        fprintf(stderr, "FPCRandomForests: number of examples < 3 !! minExamples=%f, trainingExamples=%d\n",
                minExamples, trainingExamples);
        trainingExamples = examples_index.size();
        fprintf(stderr, "FPCRandomForests: I will use all %d examples of this class !!\n", trainingExamples);
      }

      // TODO: optional include examples weights
      if(samples_per_tree < 1.0)
        random_shuffle(examples_index.begin(), examples_index.end());

      examples_subset.reserve(examples_subset.size() + trainingExamples);
      for (int e = 0 ; e < trainingExamples ; e++)
      {
        examples_subset.push_back(examples[examples_index[e]]);
        // remember which global example fed this tree (for getAllLeafNodes)
        exselection[k].push_back(examples_index[e]);
      }

      // set out of bag trees: the remaining (unselected) examples are
      // out-of-bag for tree k
      for (uint e = trainingExamples; e < examples_index.size() ; e++)
      {
        int index = examples_index[e];
        // NOTE(review): unnamed critical — shares one lock with the other
        // unnamed criticals below
#pragma omp critical
        outofbagtrees[index].push_back(k);
      }
    }

    /******* select a random feature set *******/
    FeaturePool fpTree ( fp );

    int featuresCountT = featuresCount;
    if (featuresCountT >= (int)fpTree.size()) featuresCountT = fpTree.size();

    random_shuffle(fpTree.begin(), fpTree.end());
    if (featuresCountT < (int)fpTree.size())
    {
      fp_subset.insert(fp_subset.begin(), fpTree.begin(), fpTree.begin() + featuresCountT);
    }
    else
    {
      fp_subset = fpTree;
    }
    fp_subset.initRandomFeatureSelection();

    /******* training of an individual tree ****/
    DecisionTree *tree = new DecisionTree(conf, maxClassNo);

    // NOTE(review): building inside a critical section serializes tree
    // construction — presumably because builder is not thread-safe; confirm
#pragma omp critical
    builder->build(*tree, fp_subset, examples_subset, maxClassNo);

    /******* prune tree using a simple minimum entropy criterion *****/
    if (minimum_entropy != 0.0)
      tree->pruneTreeEntropy(minimum_entropy);

    /******* drop some precalculated data if memory should be saved **/
#pragma omp critical
    if (memory_efficient)
    {
      // drop each CachedExample only once, even if shared by several examples
      set<CachedExample *> alreadyDropped;
      for (Examples::iterator i = examples_subset.begin();
           i != examples_subset.end();
           i++)
      {
        CachedExample *ce = i->second.ce;
        if (alreadyDropped.find(ce) == alreadyDropped.end())
        {
          ce->dropPreCached();
          alreadyDropped.insert(ce);
        }
      }
    }

    /******* add individual tree to ensemble *****/
#pragma omp critical
    forest.push_back(tree);
  }

  if (enableOutOfBagEstimates)
    calcOutOfBagEstimates(outofbagtrees, examples);
}
  313. void FPCRandomForests::restore(istream & is, int format)
  314. {
  315. std::string tag;
  316. int index;
  317. if( (is >> tag) && (tag == "FOREST") )
  318. {
  319. while ( (is >> tag) && (tag == "TREE") )
  320. {
  321. is >> index;
  322. DecisionTree *dt = new DecisionTree ( conf, maxClassNo );
  323. dt->restore ( is, format );
  324. if ( minimum_entropy != 0.0 )
  325. dt->pruneTreeEntropy ( minimum_entropy );
  326. forest.push_back(dt);
  327. }
  328. }
  329. }
  330. void FPCRandomForests::store(ostream & os, int format) const
  331. {
  332. os << "FOREST " << endl;
  333. int index = 0;
  334. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  335. i != forest.end();
  336. i++, index++ )
  337. {
  338. const DecisionTree & dt = *(*i);
  339. os << "TREE " << index << endl;
  340. dt.store ( os, format );
  341. os << "ENDTREE ";
  342. }
  343. os << endl;
  344. os << "ENDFOREST " << endl;
  345. }
  346. void FPCRandomForests::clear()
  347. {
  348. for ( vector<DecisionTree *>::iterator i = forest.begin();
  349. i != forest.end();
  350. i++ )
  351. delete (*i);
  352. forest.clear();
  353. }
  354. void FPCRandomForests::indexDescendants(map<DecisionNode *, pair<long, int> > & index) const
  355. {
  356. long maxindex = 0;
  357. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  358. i != forest.end();
  359. i++ )
  360. (*i)->indexDescendants ( index, maxindex );
  361. }
  362. void FPCRandomForests::resetCounters()
  363. {
  364. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  365. i != forest.end();
  366. i++ )
  367. (*i)->resetCounters ();
  368. }
  369. FeaturePoolClassifier *FPCRandomForests::clone() const
  370. {
  371. FPCRandomForests *o = new FPCRandomForests(conf, confsection);
  372. o->maxClassNo = maxClassNo;
  373. return o;
  374. }
  375. void FPCRandomForests::setComplexity(int size)
  376. {
  377. fprintf (stderr, "FPCRandomForests: set complexity to %d, overwriting current value %d\n",
  378. size, number_of_trees );
  379. number_of_trees = size;
  380. }