// FPCRandomForests.cpp
  1. /**
  2. * @file FPCRandomForests.cpp
  3. * @brief implementation of random set forests
  4. * @author Erik Rodner
  5. * @date 04/24/2008
  6. */
  7. #ifdef NICE_USELIB_OPENMP
  8. #include <omp.h>
  9. #endif
  10. #include <iostream>
  11. #include "vislearning/classifier/fpclassifier/randomforest/FPCRandomForests.h"
  12. #include "vislearning/classifier/fpclassifier/randomforest/DTBStandard.h"
  13. #include "vislearning/classifier/fpclassifier/randomforest/DTBRandom.h"
  14. #include "vislearning/classifier/fpclassifier/randomforest/DTBClusterRandom.h"
  15. #include "vislearning/cbaselib/FeaturePool.h"
  16. using namespace OBJREC;
  17. using namespace std;
  18. using namespace NICE;
  19. FPCRandomForests::FPCRandomForests()
  20. {
  21. builder = NULL;
  22. minimum_entropy = 0.0;
  23. enableOutOfBagEstimates = false;
  24. maxClassNo = -1;
  25. }
  26. FPCRandomForests::FPCRandomForests(const Config *_conf, std::string section) : conf(_conf)
  27. {
  28. std::string builder_method = conf->gS(section, "builder", "random");
  29. minimum_entropy = conf->gD(section, "minimum_entropy", 0.0);
  30. enableOutOfBagEstimates = conf->gB(section, "enable_out_of_bag_estimates", false);
  31. maxClassNo = -1;
  32. confsection = section;
  33. if ( builder_method == "none" ) {
  34. // do not initialize
  35. builder = NULL;
  36. } else {
  37. number_of_trees = conf->gI(section, "number_of_trees", 20 );
  38. features_per_tree = conf->gD(section, "features_per_tree", 1.0 );
  39. samples_per_tree = conf->gD(section, "samples_per_tree", 0.2 );
  40. use_simple_balancing = conf->gB(section, "use_simple_balancing", false);
  41. weight_examples = conf->gB(section, "weight_examples", false);
  42. memory_efficient = conf->gB(section, "memory_efficient", false);
  43. std::string builder_section = conf->gS(section, "builder_section", "DTBRandom");
  44. if ( builder_method == "standard" )
  45. builder = new DTBStandard ( conf, builder_section );
  46. else if (builder_method == "random" )
  47. builder = new DTBRandom ( conf, builder_section );
  48. else if (builder_method == "cluster_random" )
  49. builder = new DTBClusterRandom ( conf, builder_section );
  50. else {
  51. fprintf (stderr, "DecisionTreeBuilder %s not yet implemented !\n", builder_method.c_str() );
  52. exit(-1);
  53. }
  54. }
  55. }
  56. FPCRandomForests::~FPCRandomForests()
  57. {
  58. for ( vector<DecisionTree *>::iterator i = forest.begin();
  59. i != forest.end();
  60. i++ )
  61. delete (*i);
  62. if ( builder != NULL )
  63. delete builder;
  64. }
  65. void FPCRandomForests::calcOutOfBagEstimates (
  66. vector< vector<int> > & outofbagtrees,
  67. Examples & examples )
  68. {
  69. oobResults.clear ();
  70. // calculate out of bag classification results
  71. // as suggested by Breiman
  72. // out of bag = training data not used to build
  73. // a single tree is used as testing data for this tree
  74. long index = 0;
  75. for ( Examples::iterator k = examples.begin();
  76. k != examples.end(); k++, index++ )
  77. {
  78. int classno_groundtruth = k->first;
  79. const vector<int> & trees = outofbagtrees[index];
  80. if ( trees.size() <= 0 ) continue;
  81. ClassificationResult r = classify ( k->second, trees );
  82. // FIXME: assumption negative class dst is 0
  83. double score = r.scores.get( 0 /*negativeClassDST*/);
  84. oobResults.push_back ( pair<double, int> ( score, classno_groundtruth ) );
  85. }
  86. }
  87. void FPCRandomForests::getAllLeafNodes ( vector<DecisionNode *> & leafNodes)
  88. {
  89. //leafNodes.reserve ( forest.size() );
  90. int z = 0;
  91. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  92. i != forest.end();
  93. i++,z++ )
  94. {
  95. DecisionTree & dt = *(*i);
  96. vector<DecisionNode *> leaves = dt.getAllLeafNodes();
  97. for(int j = 0; j < (int)leaves.size(); j++)
  98. {
  99. for(int k = 0; k < leaves[j]->trainExamplesIndices.size(); k++)
  100. {
  101. leaves[j]->trainExamplesIndices[k] = exselection[z][leaves[j]->trainExamplesIndices[k]];
  102. }
  103. leafNodes.push_back(leaves[j]);
  104. }
  105. }
  106. }
  107. void FPCRandomForests::getLeafNodes ( Example & pce,
  108. vector<DecisionNode *> & leafNodes,
  109. int depth )
  110. {
  111. leafNodes.reserve ( forest.size() );
  112. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  113. i != forest.end();
  114. i++ )
  115. {
  116. DecisionTree & dt = *(*i);
  117. DecisionNode *leaf = dt.getLeafNode ( pce, depth );
  118. leafNodes.push_back ( leaf );
  119. }
  120. }
  121. ClassificationResult FPCRandomForests::classify ( Example & pce,
  122. const vector<int> & outofbagtrees )
  123. {
  124. // classify using only a selection of all trees
  125. // contained in outofbagtrees
  126. FullVector overall_distribution;
  127. for ( vector<int>::const_iterator i = outofbagtrees.begin();
  128. i != outofbagtrees.end();
  129. i++ )
  130. {
  131. assert ( *i < (int)forest.size() );
  132. DecisionTree & dt = *(forest[(*i)]);
  133. FullVector distribution;
  134. dt.traverse ( pce, distribution );
  135. distribution.normalize();
  136. if ( overall_distribution.empty() )
  137. overall_distribution = distribution;
  138. else
  139. overall_distribution.add ( distribution );
  140. }
  141. overall_distribution.normalize();
  142. int classno = overall_distribution.maxElement();
  143. return ClassificationResult(classno, overall_distribution);
  144. }
  145. ClassificationResult FPCRandomForests::classify(Example & pce)
  146. {
  147. FullVector overall_distribution;
  148. for (vector<DecisionTree *>::const_iterator i = forest.begin();
  149. i != forest.end();
  150. i++)
  151. {
  152. DecisionTree & dt = *(*i);
  153. FullVector distribution;
  154. dt.traverse(pce, distribution);
  155. distribution.normalize();
  156. if (overall_distribution.empty())
  157. overall_distribution = distribution;
  158. else
  159. overall_distribution.add(distribution);
  160. }
  161. overall_distribution.normalize();
  162. int classno = overall_distribution.maxElement();
  163. return ClassificationResult(classno, overall_distribution);
  164. }
  165. int FPCRandomForests::classify_optimize(Example & pce)
  166. {
  167. FullVector overall_distribution;
  168. for (vector<DecisionTree *>::const_iterator i = forest.begin();
  169. i != forest.end();
  170. i++)
  171. {
  172. DecisionTree & dt = *(*i);
  173. FullVector distribution;
  174. dt.traverse(pce, distribution);
  175. if (overall_distribution.empty())
  176. overall_distribution = distribution;
  177. else
  178. overall_distribution.add(distribution);
  179. }
  180. return overall_distribution.maxElement();
  181. }
/**
 * Trains the random forest: builds number_of_trees decision trees, each on
 * a random subset of the examples (bagging) and a random subset of the
 * feature pool. Optionally computes out-of-bag estimates afterwards.
 *
 * @param fp       pool of candidate features
 * @param examples labeled training examples (pairs of classno / Example)
 *
 * Tree construction is parallelized with OpenMP; shared containers
 * (outofbagtrees, forest) are protected with critical sections.
 */
void FPCRandomForests::train(FeaturePool & fp, Examples & examples)
{
	cerr << "FPCRandomForests::train()" << endl;
	// a tree builder must have been configured (builder != "none")
	assert(builder != NULL);
	if (maxClassNo < 0)
		maxClassNo = examples.getMaxClassNo();
	// per-class example counts and per-class lists of example indices
	FullVector example_distribution(maxClassNo + 1);
	map<int, vector<int> > class_examples;
	long index = 0;
	for (Examples::const_iterator i = examples.begin(); i != examples.end(); i++, index++)
	{
		int classno = i->first;
		example_distribution[classno]++;
		class_examples[classno].push_back(index);
	}
	if (weight_examples)
	{
		// inverse-frequency weighting: rare classes get larger weights
		for (Examples::iterator i = examples.begin();
			 i != examples.end();
			 i++, index++)
			i->second.weight = examples.size() / example_distribution[i->first];
	}
	// find the smallest non-empty class (used for simple balancing)
	double minExamples = (double)examples.size();
	int minExamplesClassNo = 0;
	for (int i = 0 ; i < example_distribution.size() ; i++)
	{
		double val = example_distribution[i];
		if (minExamples > val && val != 0.0)
		{
			minExamples = val;
			minExamplesClassNo = i;
		}
	}
	fprintf(stderr, "FPCRandomForests: minimum number of examples: %f (classno: %d)\n", minExamples, minExamplesClassNo);
	// number of features handed to each tree
	int featuresCount = (int)(fp.size() * features_per_tree);
	fprintf(stderr, "FPCRandomForests: number of features %d\n", (int)fp.size());
	// outofbagtrees[e] = indices of trees that did NOT train on example e
	vector< vector<int> > outofbagtrees;
	outofbagtrees.resize(examples.size());
	// pre-size exselection so the parallel loop below can write
	// exselection[k] without further synchronization
	for (int k = 0 ; k < number_of_trees ; k++)
	{
		vector<int> tmp;
		exselection.push_back(tmp);
	}
	#pragma omp parallel for
	for (int k = 0 ; k < number_of_trees ; k++)
	{
		fprintf(stderr, "[ -- building tree %d/%d -- ]\n", k + 1, number_of_trees);
		FeaturePool fp_subset;
		Examples examples_subset;
		// draw the training subset class by class
		for (map<int, vector<int> >::const_iterator j = class_examples.begin();
			 j != class_examples.end(); j++)
		{
			vector<int> examples_index ( j->second );
			int trainingExamples;
			if (use_simple_balancing)
				// same number of samples for every class
				trainingExamples = (int)(minExamples * samples_per_tree);
			else
				trainingExamples = (int)(examples_index.size() * samples_per_tree);
			fprintf (stderr, "FPCRandomForests: selection of %d examples for each tree\n", trainingExamples );
			if ( (trainingExamples < 3) && ((int)examples_index.size() > trainingExamples) )
			{
				// too few samples would make the split statistics useless;
				// fall back to using the whole class
				fprintf(stderr, "FPCRandomForests: number of examples < 3 !! minExamples=%f, trainingExamples=%d\n",
						minExamples, trainingExamples);
				trainingExamples = examples_index.size();
				fprintf(stderr, "FPCRandomForests: I will use all %d examples of this class !!\n", trainingExamples);
			}
			// TODO: optional include examples weights
			if(samples_per_tree < 1.0)
				random_shuffle(examples_index.begin(), examples_index.end());
			examples_subset.reserve(examples_subset.size() + trainingExamples);
			// first trainingExamples entries (after shuffling) go into the
			// bag; remember the global indices in exselection[k]
			for (int e = 0 ; e < trainingExamples ; e++)
			{
				examples_subset.push_back(examples[examples_index[e]]);
				exselection[k].push_back(examples_index[e]);
			}
			// set out of bag trees
			// (critical: outofbagtrees is shared between the parallel trees)
			for (uint e = trainingExamples; e < examples_index.size() ; e++)
			{
				int index = examples_index[e];
				#pragma omp critical
				outofbagtrees[index].push_back(k);
			}
		}
		/******* select a random feature set *******/
		FeaturePool fpTree ( fp );
		int featuresCountT = featuresCount;
		if (featuresCountT >= (int)fpTree.size()) featuresCountT = fpTree.size();
		random_shuffle(fpTree.begin(), fpTree.end());
		if (featuresCountT < (int)fpTree.size())
		{
			fp_subset.insert(fp_subset.begin(), fpTree.begin(), fpTree.begin() + featuresCountT);
		}
		else
		{
			fp_subset = fpTree;
		}
		fp_subset.initRandomFeatureSelection();
		/******* training of an individual tree ****/
		DecisionTree *tree = new DecisionTree(conf, maxClassNo);
		builder->build(*tree, fp_subset, examples_subset, maxClassNo);
		/******* prune tree using a simple minimum entropy criterion *****/
		if (minimum_entropy != 0.0)
			tree->pruneTreeEntropy(minimum_entropy);
		/******* drop some precalculated data if memory should be saved **/
		// NOTE(review): the critical section presumably guards concurrent
		// dropPreCached() calls on CachedExamples shared between trees
		#pragma omp critical
		if (memory_efficient)
		{
			set<CachedExample *> alreadyDropped;
			for (Examples::iterator i = examples_subset.begin();
				 i != examples_subset.end();
				 i++)
			{
				CachedExample *ce = i->second.ce;
				if (alreadyDropped.find(ce) == alreadyDropped.end())
				{
					ce->dropPreCached();
					alreadyDropped.insert(ce);
				}
			}
		}
		/******* add individual tree to ensemble *****/
		#pragma omp critical
		forest.push_back(tree);
	}
	if (enableOutOfBagEstimates)
		calcOutOfBagEstimates(outofbagtrees, examples);
}
  309. void FPCRandomForests::restore(istream & is, int format)
  310. {
  311. std::string tag;
  312. int index;
  313. while ( (is >> tag) && (tag == "TREE") )
  314. {
  315. is >> index;
  316. DecisionTree *dt = new DecisionTree ( conf, maxClassNo );
  317. dt->restore ( is );
  318. if ( minimum_entropy != 0.0 )
  319. dt->pruneTreeEntropy ( minimum_entropy );
  320. forest.push_back(dt);
  321. }
  322. }
  323. void FPCRandomForests::store(ostream & os, int format) const
  324. {
  325. int index = 0;
  326. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  327. i != forest.end();
  328. i++, index++ )
  329. {
  330. const DecisionTree & dt = *(*i);
  331. os << "TREE " << index << endl;
  332. dt.store ( os, format );
  333. os << "ENDTREE ";
  334. }
  335. }
  336. void FPCRandomForests::clear()
  337. {
  338. for ( vector<DecisionTree *>::iterator i = forest.begin();
  339. i != forest.end();
  340. i++ )
  341. delete (*i);
  342. forest.clear();
  343. }
  344. void FPCRandomForests::indexDescendants(map<DecisionNode *, pair<long, int> > & index) const
  345. {
  346. long maxindex = 0;
  347. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  348. i != forest.end();
  349. i++ )
  350. (*i)->indexDescendants ( index, maxindex );
  351. }
  352. void FPCRandomForests::resetCounters()
  353. {
  354. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  355. i != forest.end();
  356. i++ )
  357. (*i)->resetCounters ();
  358. }
  359. FeaturePoolClassifier *FPCRandomForests::clone() const
  360. {
  361. FPCRandomForests *o = new FPCRandomForests(conf, confsection);
  362. o->maxClassNo = maxClassNo;
  363. return o;
  364. }
  365. void FPCRandomForests::setComplexity(int size)
  366. {
  367. fprintf (stderr, "FPCRandomForests: set complexity to %d, overwriting current value %d\n",
  368. size, number_of_trees );
  369. number_of_trees = size;
  370. }