// FPCRandomForests.cpp
  1. /**
  2. * @file FPCRandomForests.cpp
  3. * @brief implementation of random set forests
  4. * @author Erik Rodner
  5. * @date 04/24/2008
  6. */
  7. #ifdef NICE_USELIB_OPENMP
  8. #include <omp.h>
  9. #endif
  10. #include <vislearning/nice.h>
  11. #include <iostream>
  12. #include "vislearning/classifier/fpclassifier/randomforest/FPCRandomForests.h"
  13. #include "vislearning/classifier/fpclassifier/randomforest/DTBStandard.h"
  14. #include "vislearning/classifier/fpclassifier/randomforest/DTBRandom.h"
  15. #include "vislearning/classifier/fpclassifier/randomforest/DTBClusterRandom.h"
  16. #include "vislearning/cbaselib/FeaturePool.h"
  17. using namespace OBJREC;
  18. using namespace std;
  19. using namespace NICE;
  20. FPCRandomForests::FPCRandomForests()
  21. {
  22. builder = NULL;
  23. minimum_entropy = 0.0;
  24. enableOutOfBagEstimates = false;
  25. maxClassNo = -1;
  26. }
  27. FPCRandomForests::FPCRandomForests(const Config *_conf, std::string section) : conf(_conf)
  28. {
  29. std::string builder_method = conf->gS(section, "builder", "random");
  30. minimum_entropy = conf->gD(section, "minimum_entropy", 0.0);
  31. enableOutOfBagEstimates = conf->gB(section, "enable_out_of_bag_estimates", false);
  32. maxClassNo = -1;
  33. confsection = section;
  34. if ( builder_method == "none" ) {
  35. // do not initialize
  36. builder = NULL;
  37. } else {
  38. number_of_trees = conf->gI(section, "number_of_trees", 20 );
  39. features_per_tree = conf->gD(section, "features_per_tree", 1.0 );
  40. samples_per_tree = conf->gD(section, "samples_per_tree", 0.2 );
  41. use_simple_balancing = conf->gB(section, "use_simple_balancing", false);
  42. weight_examples = conf->gB(section, "weight_examples", false);
  43. memory_efficient = conf->gB(section, "memory_efficient", false);
  44. std::string builder_section = conf->gS(section, "builder_section", "DTBRandom");
  45. if ( builder_method == "standard" )
  46. builder = new DTBStandard ( conf, builder_section );
  47. else if (builder_method == "random" )
  48. builder = new DTBRandom ( conf, builder_section );
  49. else if (builder_method == "cluster_random" )
  50. builder = new DTBClusterRandom ( conf, builder_section );
  51. else {
  52. fprintf (stderr, "DecisionTreeBuilder %s not yet implemented !\n", builder_method.c_str() );
  53. exit(-1);
  54. }
  55. }
  56. }
  57. FPCRandomForests::~FPCRandomForests()
  58. {
  59. for ( vector<DecisionTree *>::iterator i = forest.begin();
  60. i != forest.end();
  61. i++ )
  62. delete (*i);
  63. if ( builder != NULL )
  64. delete builder;
  65. }
  66. void FPCRandomForests::calcOutOfBagEstimates (
  67. vector< vector<int> > & outofbagtrees,
  68. Examples & examples )
  69. {
  70. oobResults.clear ();
  71. // calculate out of bag classification results
  72. // as suggested by Breiman
  73. // out of bag = training data not used to build
  74. // a single tree is used as testing data for this tree
  75. long index = 0;
  76. for ( Examples::iterator k = examples.begin();
  77. k != examples.end(); k++, index++ )
  78. {
  79. int classno_groundtruth = k->first;
  80. const vector<int> & trees = outofbagtrees[index];
  81. if ( trees.size() <= 0 ) continue;
  82. ClassificationResult r = classify ( k->second, trees );
  83. // FIXME: assumption negative class dst is 0
  84. double score = r.scores.get( 0 /*negativeClassDST*/);
  85. oobResults.push_back ( pair<double, int> ( score, classno_groundtruth ) );
  86. }
  87. }
  88. void FPCRandomForests::getAllLeafNodes ( vector<DecisionNode *> & leafNodes)
  89. {
  90. //leafNodes.reserve ( forest.size() );
  91. int z = 0;
  92. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  93. i != forest.end();
  94. i++,z++ )
  95. {
  96. DecisionTree & dt = *(*i);
  97. vector<DecisionNode *> leaves = dt.getAllLeafNodes();
  98. for(int j = 0; j < (int)leaves.size(); j++)
  99. {
  100. for(int k = 0; k < leaves[j]->trainExamplesIndices.size(); k++)
  101. {
  102. leaves[j]->trainExamplesIndices[k] = exselection[z][leaves[j]->trainExamplesIndices[k]];
  103. }
  104. leafNodes.push_back(leaves[j]);
  105. }
  106. }
  107. }
  108. void FPCRandomForests::getLeafNodes ( Example & pce,
  109. vector<DecisionNode *> & leafNodes,
  110. int depth )
  111. {
  112. leafNodes.reserve ( forest.size() );
  113. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  114. i != forest.end();
  115. i++ )
  116. {
  117. DecisionTree & dt = *(*i);
  118. DecisionNode *leaf = dt.getLeafNode ( pce, depth );
  119. leafNodes.push_back ( leaf );
  120. }
  121. }
  122. ClassificationResult FPCRandomForests::classify ( Example & pce,
  123. const vector<int> & outofbagtrees )
  124. {
  125. // classify using only a selection of all trees
  126. // contained in outofbagtrees
  127. FullVector overall_distribution;
  128. for ( vector<int>::const_iterator i = outofbagtrees.begin();
  129. i != outofbagtrees.end();
  130. i++ )
  131. {
  132. assert ( *i < (int)forest.size() );
  133. DecisionTree & dt = *(forest[(*i)]);
  134. FullVector distribution;
  135. dt.traverse ( pce, distribution );
  136. distribution.normalize();
  137. if ( overall_distribution.empty() )
  138. overall_distribution = distribution;
  139. else
  140. overall_distribution.add ( distribution );
  141. }
  142. overall_distribution.normalize();
  143. int classno = overall_distribution.maxElement();
  144. return ClassificationResult(classno, overall_distribution);
  145. }
  146. ClassificationResult FPCRandomForests::classify(Example & pce)
  147. {
  148. FullVector overall_distribution;
  149. for (vector<DecisionTree *>::const_iterator i = forest.begin();
  150. i != forest.end();
  151. i++)
  152. {
  153. DecisionTree & dt = *(*i);
  154. FullVector distribution;
  155. dt.traverse(pce, distribution);
  156. distribution.normalize();
  157. if (overall_distribution.empty())
  158. overall_distribution = distribution;
  159. else
  160. overall_distribution.add(distribution);
  161. }
  162. overall_distribution.normalize();
  163. int classno = overall_distribution.maxElement();
  164. return ClassificationResult(classno, overall_distribution);
  165. }
  166. int FPCRandomForests::classify_optimize(Example & pce)
  167. {
  168. FullVector overall_distribution;
  169. for (vector<DecisionTree *>::const_iterator i = forest.begin();
  170. i != forest.end();
  171. i++)
  172. {
  173. DecisionTree & dt = *(*i);
  174. FullVector distribution;
  175. dt.traverse(pce, distribution);
  176. if (overall_distribution.empty())
  177. overall_distribution = distribution;
  178. else
  179. overall_distribution.add(distribution);
  180. }
  181. return overall_distribution.maxElement();
  182. }
/**
 * Trains the random forest: builds number_of_trees decision trees in
 * parallel (OpenMP), each on a random subset of the examples (per class)
 * and a random subset of the feature pool. Optionally computes
 * out-of-bag estimates afterwards.
 *
 * @param fp pool of candidate features
 * @param examples training examples (pairs of class number and example)
 */
void FPCRandomForests::train(FeaturePool & fp, Examples & examples)
{
    cerr << "FPCRandomForests::train()" << endl;
    assert(builder != NULL);

    if (maxClassNo < 0)
        maxClassNo = examples.getMaxClassNo();

    // count examples per class and remember each class' example indices
    FullVector example_distribution(maxClassNo + 1);
    map<int, vector<int> > class_examples;
    long index = 0;
    for (Examples::const_iterator i = examples.begin(); i != examples.end(); i++, index++)
    {
        int classno = i->first;
        example_distribution[classno]++;
        class_examples[classno].push_back(index);
    }

    // optional inverse-frequency weighting of the examples
    if (weight_examples)
    {
        for (Examples::iterator i = examples.begin();
                i != examples.end();
                i++, index++)
            i->second.weight = examples.size() / example_distribution[i->first];
    }

    // find the smallest non-empty class (used for simple balancing)
    double minExamples = (double)examples.size();
    int minExamplesClassNo = 0;
    for (int i = 0 ; i < example_distribution.size() ; i++)
    {
        double val = example_distribution[i];
        if (minExamples > val && val != 0.0)
        {
            minExamples = val;
            minExamplesClassNo = i;
        }
    }
    fprintf(stderr, "FPCRandomForests: minimum number of examples: %f (classno: %d)\n", minExamples, minExamplesClassNo);

    int featuresCount = (int)(fp.size() * features_per_tree);
    fprintf(stderr, "FPCRandomForests: number of features %d\n", (int)fp.size());

    // outofbagtrees[i] collects the trees that did NOT train on example i
    vector< vector<int> > outofbagtrees;
    outofbagtrees.resize(examples.size());

    // pre-size exselection so the parallel loop below can write to
    // exselection[k] without synchronization (one slot per tree)
    for (int k = 0 ; k < number_of_trees ; k++)
    {
        vector<int> tmp;
        exselection.push_back(tmp);
    }

    // NOTE(review): random_shuffle below uses a shared RNG; calling it from
    // parallel iterations may not be thread-safe — confirm.
    #pragma omp parallel for
    for (int k = 0 ; k < number_of_trees ; k++)
    {
        fprintf(stderr, "[ -- building tree %d/%d -- ]\n", k + 1, number_of_trees);

        FeaturePool fp_subset;
        Examples examples_subset;

        // draw a per-class random sample of the training examples
        for (map<int, vector<int> >::const_iterator j = class_examples.begin();
             j != class_examples.end(); j++)
        {
            vector<int> examples_index ( j->second );
            int trainingExamples;
            if (use_simple_balancing)
                // every class contributes the same number of examples
                trainingExamples = (int)(minExamples * samples_per_tree);
            else
                trainingExamples = (int)(examples_index.size() * samples_per_tree);
            fprintf (stderr, "FPCRandomForests: selection of %d examples for each tree\n", trainingExamples );

            // guard against degenerate samples: fall back to all examples
            if ( (trainingExamples < 3) && ((int)examples_index.size() > trainingExamples) )
            {
                fprintf(stderr, "FPCRandomForests: number of examples < 3 !! minExamples=%f, trainingExamples=%d\n",
                        minExamples, trainingExamples);
                trainingExamples = examples_index.size();
                fprintf(stderr, "FPCRandomForests: I will use all %d examples of this class !!\n", trainingExamples);
            }

            // TODO: optional include examples weights
            if(samples_per_tree < 1.0)
                random_shuffle(examples_index.begin(), examples_index.end());

            examples_subset.reserve(examples_subset.size() + trainingExamples);
            for (int e = 0 ; e < trainingExamples ; e++)
            {
                examples_subset.push_back(examples[examples_index[e]]);
                // remember which global examples tree k was trained on
                exselection[k].push_back(examples_index[e]);
            }

            // set out of bag trees: the remaining examples of this class
            // were not used by tree k
            for (uint e = trainingExamples; e < examples_index.size() ; e++)
            {
                int index = examples_index[e];
                #pragma omp critical
                outofbagtrees[index].push_back(k);
            }
        }

        /******* select a random feature set *******/
        FeaturePool fpTree ( fp );
        int featuresCountT = featuresCount;
        if (featuresCountT >= (int)fpTree.size()) featuresCountT = fpTree.size();

        random_shuffle(fpTree.begin(), fpTree.end());
        if (featuresCountT < (int)fpTree.size())
        {
            fp_subset.insert(fp_subset.begin(), fpTree.begin(), fpTree.begin() + featuresCountT);
        }
        else
        {
            fp_subset = fpTree;
        }
        fp_subset.initRandomFeatureSelection();

        /******* training of an individual tree ****/
        DecisionTree *tree = new DecisionTree(conf, maxClassNo);
        builder->build(*tree, fp_subset, examples_subset, maxClassNo);

        /******* prune tree using a simple minimum entropy criterion *****/
        if (minimum_entropy != 0.0)
            tree->pruneTreeEntropy(minimum_entropy);

        /******* drop some precalculated data if memory should be saved **/
        #pragma omp critical
        if (memory_efficient)
        {
            // drop each cached example only once, even if shared
            set<CachedExample *> alreadyDropped;
            for (Examples::iterator i = examples_subset.begin();
                 i != examples_subset.end();
                 i++)
            {
                CachedExample *ce = i->second.ce;
                if (alreadyDropped.find(ce) == alreadyDropped.end())
                {
                    ce->dropPreCached();
                    alreadyDropped.insert(ce);
                }
            }
        }

        /******* add individual tree to ensemble *****/
        #pragma omp critical
        forest.push_back(tree);
    }

    if (enableOutOfBagEstimates)
        calcOutOfBagEstimates(outofbagtrees, examples);
}
  310. void FPCRandomForests::restore(istream & is, int format)
  311. {
  312. std::string tag;
  313. int index;
  314. while ( (is >> tag) && (tag == "TREE") )
  315. {
  316. is >> index;
  317. DecisionTree *dt = new DecisionTree ( conf, maxClassNo );
  318. dt->restore ( is );
  319. if ( minimum_entropy != 0.0 )
  320. dt->pruneTreeEntropy ( minimum_entropy );
  321. forest.push_back(dt);
  322. }
  323. }
  324. void FPCRandomForests::store(ostream & os, int format) const
  325. {
  326. int index = 0;
  327. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  328. i != forest.end();
  329. i++, index++ )
  330. {
  331. const DecisionTree & dt = *(*i);
  332. os << "TREE " << index << endl;
  333. dt.store ( os, format );
  334. os << "ENDTREE ";
  335. }
  336. }
  337. void FPCRandomForests::clear()
  338. {
  339. for ( vector<DecisionTree *>::iterator i = forest.begin();
  340. i != forest.end();
  341. i++ )
  342. delete (*i);
  343. forest.clear();
  344. }
  345. void FPCRandomForests::indexDescendants(map<DecisionNode *, pair<long, int> > & index) const
  346. {
  347. long maxindex = 0;
  348. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  349. i != forest.end();
  350. i++ )
  351. (*i)->indexDescendants ( index, maxindex );
  352. }
  353. void FPCRandomForests::resetCounters()
  354. {
  355. for ( vector<DecisionTree *>::const_iterator i = forest.begin();
  356. i != forest.end();
  357. i++ )
  358. (*i)->resetCounters ();
  359. }
  360. FeaturePoolClassifier *FPCRandomForests::clone() const
  361. {
  362. FPCRandomForests *o = new FPCRandomForests(conf, confsection);
  363. o->maxClassNo = maxClassNo;
  364. return o;
  365. }
  366. void FPCRandomForests::setComplexity(int size)
  367. {
  368. fprintf (stderr, "FPCRandomForests: set complexity to %d, overwriting current value %d\n",
  369. size, number_of_trees );
  370. number_of_trees = size;
  371. }