/**
 * @file DTBStandard.cpp
 * @brief standard decision tree builder
 * @author Erik Rodner
 * @date 05/06/2008
 */
#include <vislearning/nice_nonvis.h>

#include <cassert>
#include <cmath>
#include <cstdio>
#include <iostream>
#include <set>

#include "vislearning/classifier/fpclassifier/randomforest/DTBStandard.h"

using namespace OBJREC;
using namespace std;
using namespace NICE;

#undef DEBUG_CART
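
// Reads the two parameters that control tree growth from the given config
// section: "minimum_information_gain" (splits with a smaller gain turn the
// node into a leaf) and "min_examples" (nodes with fewer examples are not
// split any further).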
DTBStandard::DTBStandard( const Config *_conf, std::string section )
{
    minimum_information_gain = _conf->gD (section,
        "minimum_information_gain", 10e-7 );
    minExamples = _conf->gD (section, "min_examples", 10);
}
DTBStandard::~DTBStandard()
{
}
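
// Builds a decision tree for the given examples. As a first step, every
// feature of the pool is evaluated on every example and the results are
// stored in a FeatureStorage: for each feature index, a set of
// (value, class, example index, weight) quadruplets sorted by value.
// This precalculation is time and memory consuming (it stores
// #features * #examples values), but lets the recursive split search
// operate on already sorted feature responses only.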
DecisionNode *DTBStandard::build ( const FeaturePool & fp, const Examples & examples, int maxClassNo )
{
    FeatureStorage featureStorage;
    DecisionNode *root;

    fprintf (stderr, "DecisionTree: Precalculating ALL features (This is time and memory consuming !!)\n");

    // accumulate the (weighted) class distribution of all examples
    FullVector example_count ( maxClassNo+1 );
    int example_index = 0;
    for ( vector< pair<int, Example> >::const_iterator example_i = examples.begin();
          example_i != examples.end() ; example_i++, example_index++ )
    {
        int classno = example_i->first;
        example_count[classno] += example_i->second.weight;
    }

    // evaluate every feature on every example and store the sorted results
    long count_features = 0;
    int feature_index = 0;
    for ( FeaturePool::const_iterator feature_i = fp.begin();
          feature_i != fp.end() ; feature_i++ )
    {
        const Feature *f = feature_i->second;
        int example_index = 0;
        for ( vector< pair<int, Example> >::const_iterator example_i = examples.begin();
              example_i != examples.end() ; example_i++, example_index++ )
        {
            int classno = example_i->first;
            const Example & ce = example_i->second;
            double value = f->val ( &ce );
            // flush denormalized values to zero
            if ( value > 0.0 && value < 1e-300 )
                value = 0.0;
            featureStorage[feature_index].insert ( quadruplet<double, int, int, double> ( value, classno, example_index, ce.weight ) );
            count_features++;
        }
        feature_index++;
    }

    fprintf (stderr, "DecisionTree: Precalculating done !\n");
    fprintf (stderr, "Feature Value Count : %ld\n", count_features );
    fprintf (stderr, "Examples : %d\n", (int)examples.size() );

    root = buildnode_allfeatures ( fp, featureStorage, example_count );

    return root;
}
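
// Recursively grows the tree. For the current node the class distribution is
// stored and the best (feature, threshold) pair is searched by maximizing the
// information gain of the induced split. A leaf is created when the
// distribution is pure, when fewer than minExamples examples are left, or
// when the best achievable gain does not exceed minimum_information_gain.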
DecisionNode *DTBStandard::buildnode_allfeatures ( const FeaturePool & features,
    FeatureStorage & featureStorage,
    const FullVector & distribution )
{
    DecisionNode *node = NULL;
    assert ( ! featureStorage.empty() );
    node = new DecisionNode();
    node->distribution = distribution;

    double max = 0;
    double sumweight = 0;
    for ( int i = 0 ; i < distribution.size() ; i++ )
    {
        sumweight += distribution[i];
        if ( max < distribution[i] ) max = distribution[i];
    }

    if ( max == sumweight )
    {
        // leaf node reached, because distribution is a simple peak distribution
        vector<int> exsel;
        for ( FeatureStorage::iterator ft = featureStorage.begin(); ft != featureStorage.end(); ft++ )
        {
            FeatureValues & featureSet = ft->second;
            for ( FeatureValues::iterator k = featureSet.begin(); k != featureSet.end() ; k++ )
            {
                exsel.push_back(k->third);
            }
        }
        node->trainExamplesIndices = exsel;
        return node;
    }
    double optimal_threshold = 0.0;
    int optimal_feature = 0;
    double optimal_ig = 0.0;
    double overall_entropy = 0.0;

    // too few examples left: do not split any further
    if ( featureStorage.begin()->second.size() < minExamples )
    {
        vector<int> exsel;
        for ( FeatureStorage::iterator ft = featureStorage.begin(); ft != featureStorage.end(); ft++ )
        {
            FeatureValues & featureSet = ft->second;
            for ( FeatureValues::iterator k = featureSet.begin(); k != featureSet.end() ; k++ )
            {
                exsel.push_back(k->third);
            }
        }
        node->trainExamplesIndices = exsel;
        return node;
    }
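
    // Shannon entropy of the class distribution of this node:
    //   H = - sum_k p_k * log(p_k)   with   p_k = n_k / sumweight,
    // written below as -n_k/sumweight * (log(n_k) - log(sumweight)).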
    for ( int i = 0 ; i < distribution.size() ; i++ )
    {
        double nk = distribution.data[i];
        if ( nk != 0 ) {
            overall_entropy -= nk/sumweight * ( log (nk) - log(sumweight) );
        }
    }

    if ( overall_entropy <= 0.0 )
    {
        // an impure distribution must have positive entropy; dump it and abort
        distribution.store(cerr);
        exit(-1);
    }
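
    // Exhaustive threshold search: for every feature the examples (already
    // sorted by feature value) are swept from the largest value downwards.
    // lkappa accumulates the class histogram of the examples passed so far
    // during the sweep, so for every candidate threshold the information gain
    //   IG = H - (l/sumweight) * H_left - (r/sumweight) * H_right
    // can be computed incrementally; candidate thresholds are placed halfway
    // between the current and the previous distinct feature value.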
    int c = 0;
    for ( FeatureStorage::const_iterator ft = featureStorage.begin(); ft != featureStorage.end(); ft++ , c++)
    {
        int feature_index = ft->first;
        const FeatureValues & featureSet = ft->second;
        double information_gain = 0.0;
        FullVector lkappa ( distribution.size() );
        double best_information_gain = 0.0;
        double best_threshold = 0.0;
        double l = 0;
        double weight_old = 0.0;

        double first_value = featureSet.begin()->first;
        FeatureValues::const_iterator lastElement = featureSet.end();
        lastElement--;
        double last_value = lastElement->first;
        double weight = lastElement->fourth;
        int previous_class = lastElement->second;
        double previous_value = lastElement->first;

        for ( FeatureValues::const_iterator k = lastElement;
              k != featureSet.begin();
              k--, weight += k->fourth )
        {
            int current_class = k->second;
            double current_value = k->first;
            assert ( ! isnan(current_value) );
            double m = weight - weight_old;
            l = weight_old;
            double r = sumweight - l;

            /*if ( (k != featureSet.begin()) && (current_class != previous_class)
                // this leads to problems when using really weak classifiers !!
                && ( (current_value != previous_value) ||
                     ( (current_value != first_value) &&
                       (current_value != last_value)
                     )
                   )
               )*/
            if ( (current_value != previous_value) && (current_value != last_value) )
            {
                if ( (l <= 10e-7) || (r <= 10e-7) )
                    continue;

                assert ( r > 10e-7 );
                assert ( l > 10e-7 );

                double left_entropy = 0.0;
                double right_entropy = 0.0;
                for ( int i = 0 ; i < distribution.size() ; i++ )
                {
                    double lk = lkappa[i];
                    double nk = distribution.data[i];
                    double rk = nk - lk;
                    if ( lk > 10e-7 )
                    {
                        double ql = lk / (double)l;
                        left_entropy -= ql * log(ql);
                    }
                    if ( rk > 10e-7 )
                    {
                        double qr = rk / (double)r;
                        right_entropy -= qr * log(qr);
                    }
                }
                information_gain = overall_entropy - l*left_entropy/sumweight - r*right_entropy/sumweight;

#ifdef DEBUG_CART
                //fprintf (stderr, "ig %f t %f\n sumweight %f e %f le %f re %f\n", information_gain, 0.5 * (current_value + previous_value ),
                //    sumweight, overall_entropy, left_entropy, right_entropy );
                fprintf (stderr, "f: %i ig %f sumweight %f e %f le %f re %f l %f r %f\n", c, information_gain, sumweight, overall_entropy, left_entropy, right_entropy, l, r );
#endif

                if ( information_gain > best_information_gain )
                {
                    best_information_gain = information_gain;
                    best_threshold = 0.5 * (current_value + previous_value);
                }
            }
            weight_old = weight;
            lkappa[current_class] += m;
            previous_class = current_class;
            previous_value = current_value;
        }

        if ( best_information_gain >= optimal_ig )
        {
            optimal_threshold = best_threshold;
            optimal_feature = feature_index;
            optimal_ig = best_information_gain;
        }
    }
#ifdef DEBUG_CART
    fprintf (stderr, "Feature optimization finished: information gain %f threshold %f feature index %d\n",
        optimal_ig, optimal_threshold, optimal_feature );
#endif

    if ( optimal_ig == 0.0 )
    {
        fprintf (stderr, "WARNING: zero mutual information! have a look at your feature values\n");
        FeatureStorage::const_iterator first = featureStorage.begin();
        if ( first == featureStorage.end() )
        {
            fprintf (stderr, "ERROR: no features given at all !\n");
            exit(-1);
        }
        const FeatureValues & featureSet = first->second;
        const int max_debug_values = 50;
        int l = 0;
        for ( FeatureValues::const_iterator kd = featureSet.begin();
              (kd != featureSet.end()) && (l < max_debug_values) ; kd++, l++ )
            fprintf (stderr, "(%d,%f) ", kd->second, kd->first );
        fprintf (stderr, "\n[...] reverse:\n" );
        l = 0;
        for ( FeatureValues::const_reverse_iterator kd = featureSet.rbegin();
              (kd != featureSet.rend()) && (l < max_debug_values) ; kd++, l++ )
            fprintf (stderr, "(%d,%f) ", kd->second, kd->first );
        fprintf (stderr, "\n" );
    }
    // split feature set according to optimal threshold
    if ( optimal_ig <= minimum_information_gain )
    {
        // best achievable gain is too small: turn this node into a leaf
        vector<int> exsel;
        for ( FeatureStorage::iterator ft = featureStorage.begin(); ft != featureStorage.end(); ft++ )
        {
            FeatureValues & featureSet = ft->second;
            for ( FeatureValues::iterator k = featureSet.begin(); k != featureSet.end() ; k++ )
            {
                exsel.push_back(k->third);
            }
        }
        node->trainExamplesIndices = exsel;
        return node;
    }
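
    // A sufficiently good split was found: store the chosen feature and
    // threshold at this node and partition the examples, with feature values
    // below the threshold going to the left branch.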
    node->f = features[optimal_feature].second->clone();
    node->threshold = optimal_threshold;

    // could be optimized, since the set is already sorted !!
    set<int> examples_left;

    FullVector distribution_left ( distribution.size() );
    FullVector distribution_right ( distribution.size() );
    int sum_left = 0;
    for ( FeatureValues::const_iterator k =
              featureStorage[optimal_feature].begin();
          k != featureStorage[optimal_feature].end();
          k++ )
    {
        int example_index = k->third;
        int classno = k->second;
        double val = k->first;
        if ( val < optimal_threshold ) {
            examples_left.insert ( example_index );
            sum_left ++;
            distribution_left[classno]++;
        } else {
            distribution_right[classno]++;
        }
    }

    if ( sum_left == 0 ) {
        // seldom case of equal feature values
        // unable to split !!
        fprintf (stderr, "WARNING: equal feature values, splitting impossible\n");
        return node;
    }

#ifdef DEBUG_CART
    node->f->store (cerr);
    cerr << endl;
#endif
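
    // Split the feature storage itself: entries of examples that go to the
    // left branch are moved into a new FeatureStorage, while the remaining
    // entries stay in featureStorage, which is reused for the right branch.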
    FeatureStorage left;
    for ( FeatureStorage::iterator ft = featureStorage.begin();
          ft != featureStorage.end(); ft++ )
    {
        int feature_index = ft->first;
        FeatureValues & featureSet = ft->second;
        for ( FeatureValues::iterator k = featureSet.begin();
              k != featureSet.end() ; )
        {
            int example_index = k->third;
            if ( examples_left.find(example_index) != examples_left.end() )
            {
                left[feature_index].insert ( quadruplet<double, int, int, double> ( *k ) );
                featureSet.erase ( k++ );
            } else {
                ++k;
            }
        }
    }
#ifdef DEBUG_CART
    for ( int i = 0 ; i < distribution_left.size() ; i++ )
    {
        fprintf (stderr, "DecisionTree: split of class %d (%f <-> %f)\n", i,
            distribution_left[i], distribution_right[i] );
    }
#endif

    node->left = buildnode_allfeatures ( features,
        left,
        distribution_left );
    node->right = buildnode_allfeatures ( features,
        featureStorage,
        distribution_right );

    return node;
}
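
/*
 * Minimal usage sketch (not part of the library): how a tree might be built
 * with this class. The config file name, the section name "DTBStandard", and
 * the way the FeaturePool and Examples are filled are assumptions for
 * illustration only.
 *
 *   NICE::Config conf ( "train.conf" );
 *   OBJREC::DTBStandard builder ( &conf, "DTBStandard" );
 *
 *   OBJREC::FeaturePool fp;        // assumed to be filled with features
 *   OBJREC::Examples examples;     // assumed to be filled with (class, Example) pairs
 *   int maxClassNo = 9;            // hypothetical highest class index
 *
 *   OBJREC::DecisionNode *root = builder.build ( fp, examples, maxClassNo );
 */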