/**
 * @file DTBStandard.cpp
 * @brief standard decision tree builder (CART-like threshold splits)
 * @author Erik Rodner
 * @date 05/06/2008
 */
#include <iostream>
#include <cassert>
#include <cmath>

#include "vislearning/classifier/fpclassifier/randomforest/DTBStandard.h"

using namespace OBJREC;
using namespace std;
using namespace NICE;

#undef DEBUG_CART
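/*
 * Configuration keys read by the constructor below (both from the given
 * config section): "minimum_information_gain" is the smallest information
 * gain for which a node is still split (smaller gains turn the node into a
 * leaf), and "min_examples" is the minimal number of examples a node must
 * contain before a further split is attempted.
 */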
DTBStandard::DTBStandard ( const Config *_conf, std::string section )
{
  minimum_information_gain = _conf->gD ( section,
      "minimum_information_gain", 10e-7 );
  minExamples = _conf->gD ( section, "min_examples", 10 );
}

DTBStandard::~DTBStandard()
{
}
DecisionNode *DTBStandard::build ( const FeaturePool & fp, const Examples & examples, int maxClassNo )
{
  FeatureStorage featureStorage;
  DecisionNode *root;

  fprintf (stderr, "DecisionTree: Precalculating ALL features (This is time and memory consuming !!)\n");

  // accumulate the (weighted) class distribution of all training examples
  FullVector example_count ( maxClassNo + 1 );
  int example_index = 0;
  for ( vector< pair<int, Example> >::const_iterator example_i = examples.begin();
        example_i != examples.end() ; example_i++, example_index++ )
  {
    int classno = example_i->first;
    example_count[classno] += example_i->second.weight;
  }

  // evaluate every feature on every example once and store the results
  long count_features = 0;
  int feature_index = 0;
  for ( FeaturePool::const_iterator feature_i = fp.begin();
        feature_i != fp.end() ; feature_i++ )
  {
    const Feature *f = feature_i->second;
    int example_index = 0;
    for ( vector< pair<int, Example> >::const_iterator example_i = examples.begin();
          example_i != examples.end() ; example_i++, example_index++ )
    {
      int classno = example_i->first;
      const Example & ce = example_i->second;
      double value = f->val ( &ce );
      // flush denormalized values to zero
      if ( value > 0.0 && value < 1e-300 )
        value = 0.0;
      featureStorage[feature_index].insert ( quadruplet<double, int, int, double> ( value, classno, example_index, ce.weight ) );
      count_features++;
    }
    feature_index++;
  }

  fprintf (stderr, "DecisionTree: Precalculating done !\n");
  fprintf (stderr, "Feature Value Count : %ld\n", count_features );
  fprintf (stderr, "Examples : %d\n", (int)examples.size() );

  root = buildnode_allfeatures ( fp, featureStorage, example_count );

  return root;
}
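/*
 * FeatureStorage, as filled in build() above, maps each feature index to a
 * sorted set of quadruplets (feature value, class number, example index,
 * example weight). Since the sets are ordered by value, the recursive node
 * construction below can evaluate all candidate thresholds of a feature in a
 * single sweep.
 */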
DecisionNode *DTBStandard::buildnode_allfeatures ( const FeaturePool & features,
    FeatureStorage & featureStorage,
    const FullVector & distribution )
{
  DecisionNode *node = NULL;
  assert ( ! featureStorage.empty() );

  node = new DecisionNode();
  node->distribution = distribution;

  double max = 0;
  double sumweight = 0;
  for ( int i = 0 ; i < distribution.size() ; i++ )
  {
    sumweight += distribution[i];
    if ( max < distribution[i] ) max = distribution[i];
  }

  if ( max == sumweight )
  {
    // leaf node reached, because distribution is a simple peak distribution
    // (all weight belongs to a single class)
    vector<int> exsel;
    for ( FeatureStorage::iterator ft = featureStorage.begin(); ft != featureStorage.end(); ft++ )
    {
      FeatureValues & featureSet = ft->second;
      for ( FeatureValues::iterator k = featureSet.begin(); k != featureSet.end() ; k++ )
      {
        exsel.push_back ( k->third );
      }
    }
    node->trainExamplesIndices = exsel;
    return node;
  }

  double optimal_threshold = 0.0;
  int optimal_feature = 0;
  double optimal_ig = 0.0;
  double overall_entropy = 0.0;

  if ( featureStorage.begin()->second.size() < minExamples )
  {
    // too few examples left: turn this node into a leaf as well
    vector<int> exsel;
    for ( FeatureStorage::iterator ft = featureStorage.begin(); ft != featureStorage.end(); ft++ )
    {
      FeatureValues & featureSet = ft->second;
      for ( FeatureValues::iterator k = featureSet.begin(); k != featureSet.end() ; k++ )
      {
        exsel.push_back ( k->third );
      }
    }
    node->trainExamplesIndices = exsel;
    return node;
  }
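  /*
   * Entropy of the current node: H = - sum_k (n_k / N) * log (n_k / N), where
   * n_k is the accumulated weight of class k and N the total weight sumweight.
   * Every candidate threshold t of a feature is then scored by the information
   * gain IG(t) = H - (l / N) * H_left - (r / N) * H_right, with l and r the
   * weight mass falling left and right of t. The loops below compute exactly
   * these quantities.
   */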
  for ( int i = 0 ; i < distribution.size() ; i++ )
  {
    double nk = distribution.data[i];
    if ( nk != 0 ) {
      overall_entropy -= nk / sumweight * ( log (nk) - log (sumweight) );
    }
  }

  if ( overall_entropy <= 0.0 )
  {
    // a non-pure node must have positive entropy; dump the distribution and abort
    distribution.store ( cerr );
    exit ( -1 );
  }
  // scan all features and all candidate thresholds for the best split
  int c = 0;
  for ( FeatureStorage::const_iterator ft = featureStorage.begin(); ft != featureStorage.end(); ft++, c++ )
  {
    int feature_index = ft->first;
    const FeatureValues & featureSet = ft->second;
    double information_gain = 0.0;
    FullVector lkappa ( distribution.size() );
    double best_information_gain = 0.0;
    double best_threshold = 0.0;
    double l = 0;
    double weight_old = 0.0;

    double first_value = featureSet.begin()->first;
    FeatureValues::const_iterator lastElement = featureSet.end();
    lastElement--;
    double last_value = lastElement->first;
    double weight = lastElement->fourth;
    int previous_class = lastElement->second;
    double previous_value = lastElement->first;

    // sweep through the sorted feature values, starting at the largest one
    for ( FeatureValues::const_iterator k = lastElement;
          k != featureSet.begin();
          k--, weight += k->fourth )
    {
      int current_class = k->second;
      double current_value = k->first;
      assert ( ! isnan (current_value) );
      double m = weight - weight_old;
      l = weight_old;
      double r = sumweight - l;

      /*if ( (k != featureSet.begin()) && (current_class != previous_class)
        // this leads to problems when using really weak classifiers !!
        && ( (current_value != previous_value) ||
             ( (current_value != first_value) &&
               (current_value != last_value)
             )
           )
         )*/
      if ( (current_value != previous_value) && (current_value != last_value) )
      {
        if ( (l <= 10e-7) || (r <= 10e-7) )
          continue;
        assert ( r > 10e-7 );
        assert ( l > 10e-7 );
        double left_entropy = 0.0;
        double right_entropy = 0.0;
        for ( int i = 0 ; i < distribution.size() ; i++ )
        {
          double lk = lkappa[i];
          double nk = distribution.data[i];
          double rk = nk - lk;
          if ( lk > 10e-7 )
          {
            double ql = lk / (double)l;
            left_entropy -= ql * log (ql);
          }
          if ( rk > 10e-7 )
          {
            double qr = rk / (double)r;
            right_entropy -= qr * log (qr);
          }
        }
        information_gain = overall_entropy - l * left_entropy / sumweight - r * right_entropy / sumweight;

#ifdef DEBUG_CART
        //fprintf (stderr, "ig %f t %f\n sumweight %f e %f le %f re %f\n", information_gain, 0.5 * (current_value + previous_value),
        //  sumweight, overall_entropy, left_entropy, right_entropy );
        fprintf (stderr, "f: %i ig %f sumweight %f e %f le %f re %f l %f r %f\n", c, information_gain, sumweight, overall_entropy, left_entropy, right_entropy, l, r );
#endif
        if ( information_gain > best_information_gain )
        {
          best_information_gain = information_gain;
          // place the threshold halfway between the two adjacent feature values
          best_threshold = 0.5 * (current_value + previous_value);
        }
      }
      weight_old = weight;
      lkappa[current_class] += m;
      previous_class = current_class;
      previous_value = current_value;
    }

    if ( best_information_gain >= optimal_ig )
    {
      optimal_threshold = best_threshold;
      optimal_feature = feature_index;
      optimal_ig = best_information_gain;
    }
  }
#ifdef DEBUG_CART
  fprintf (stderr, "Feature optimization finished: information gain %f threshold %f feature index %d\n",
      optimal_ig, optimal_threshold, optimal_feature );
#endif

  if ( optimal_ig == 0.0 )
  {
    fprintf (stderr, "WARNING: zero mutual information! have a look at your feature values\n");
    FeatureStorage::const_iterator first = featureStorage.begin();
    if ( first == featureStorage.end() )
    {
      fprintf (stderr, "ERROR: no features given at all !\n");
      exit ( -1 );
    }
    const FeatureValues & featureSet = first->second;
    const int max_debug_values = 50;
    int l = 0;
    for ( FeatureValues::const_iterator kd = featureSet.begin();
          (kd != featureSet.end()) && (l < max_debug_values) ; kd++, l++ )
      fprintf (stderr, "(%d,%f) ", kd->second, kd->first );
    fprintf (stderr, "\n[...] reverse:\n" );
    l = 0;
    for ( FeatureValues::const_reverse_iterator kd = featureSet.rbegin();
          (kd != featureSet.rend()) && (l < max_debug_values) ; kd++, l++ )
      fprintf (stderr, "(%d,%f) ", kd->second, kd->first );
    fprintf (stderr, "\n" );
  }
  // split feature set according to optimal threshold
  if ( optimal_ig <= minimum_information_gain )
  {
    // gain too small: do not split any further and return a leaf node
    vector<int> exsel;
    for ( FeatureStorage::iterator ft = featureStorage.begin(); ft != featureStorage.end(); ft++ )
    {
      FeatureValues & featureSet = ft->second;
      for ( FeatureValues::iterator k = featureSet.begin(); k != featureSet.end() ; k++ )
      {
        exsel.push_back ( k->third );
      }
    }
    node->trainExamplesIndices = exsel;
    return node;
  }
  node->f = features[optimal_feature].second->clone();
  node->threshold = optimal_threshold;

  // could be optimized, since the set is already sorted !!
  set<int> examples_left;
  FullVector distribution_left ( distribution.size() );
  FullVector distribution_right ( distribution.size() );
  int sum_left = 0;
  for ( FeatureValues::const_iterator k = featureStorage[optimal_feature].begin();
        k != featureStorage[optimal_feature].end();
        k++ )
  {
    int example_index = k->third;
    int classno = k->second;
    double val = k->first;
    if ( val < optimal_threshold ) {
      examples_left.insert ( example_index );
      sum_left++;
      distribution_left[classno]++;
    } else {
      distribution_right[classno]++;
    }
  }

  if ( sum_left == 0 ) {
    // seldom case of equal feature values
    // unable to split !!
    fprintf (stderr, "WARNING: equal feature values, splitting impossible\n");
    return node;
  }

#ifdef DEBUG_CART
  node->f->store (cerr);
  cerr << endl;
#endif

  // partition the precomputed feature values according to the chosen threshold
  FeatureStorage left;
  for ( FeatureStorage::iterator ft = featureStorage.begin();
        ft != featureStorage.end(); ft++ )
  {
    int feature_index = ft->first;
    FeatureValues & featureSet = ft->second;
    for ( FeatureValues::iterator k = featureSet.begin();
          k != featureSet.end() ; )
    {
      int example_index = k->third;
      if ( examples_left.find (example_index) != examples_left.end() )
      {
        left[feature_index].insert ( quadruplet<double, int, int, double> ( *k ) );
        featureSet.erase ( k++ );
      } else {
        ++k;
      }
    }
  }
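  /*
   * Note: entries with values below the threshold have been moved into the
   * new storage 'left' and erased from featureStorage, so the remaining
   * entries directly form the right partition that is recursed on below.
   * This avoids copying the right-hand side.
   */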
#ifdef DEBUG_CART
  for ( int i = 0 ; i < distribution_left.size() ; i++ )
  {
    fprintf (stderr, "DecisionTree: split of class %d (%f <-> %f)\n", i,
        distribution_left[i], distribution_right[i] );
  }
#endif

  node->left = buildnode_allfeatures ( features,
      left,
      distribution_left );
  node->right = buildnode_allfeatures ( features,
      featureStorage,
      distribution_right );

  return node;
}
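
/*
 * Minimal usage sketch (hypothetical, for illustration only; the exact way a
 * FeaturePool and the Examples are assembled depends on the surrounding
 * vislearning framework and is not shown here):
 *
 *   Config conf ( "classifier.conf" );            // hypothetical config file
 *   DTBStandard builder ( &conf, "DTBStandard" ); // hypothetical section name
 *   FeaturePool fp;        // pairs whose second element is a Feature*
 *   Examples examples;     // vector of (class number, Example) pairs
 *   int maxClassNo = ...;  // highest class index occurring in the examples
 *   DecisionNode *root = builder.build ( fp, examples, maxClassNo );
 */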