RTBMinDist.cpp 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250
  1. /**
  2. * @file RTBMinDist.cpp
  3. * @brief random regression tree; split criterion is to minimize mean distance of all examples of an inner node
  4. * @author Frank Prüfer
  5. * @date 09/17/2013
  6. */
  7. #include <iostream>
  8. #include "RTBMinDist.h"
  9. using namespace OBJREC;
  10. #undef DEBUGTREE
  11. #undef DETAILTREE
  12. using namespace std;
  13. using namespace NICE;
  14. RTBMinDist::RTBMinDist( const Config *conf, std::string section )
  15. {
  16. random_split_tests = conf->gI(section, "random_split_tests", 10 );
  17. random_features = conf->gI(section, "random_features", 500 );
  18. max_depth = conf->gI(section, "max_depth", 10 );
  19. min_examples = conf->gI(section, "min_examples", 50);
  20. minimum_distance_reduction = conf->gD("RandomForest", "minimum_distance_reduction", 10e-3 );
  21. if ( conf->gB(section, "start_random_generator", false ) )
  22. srand(time(NULL));
  23. }
  24. RTBMinDist::~RTBMinDist()
  25. {
  26. }
  27. void RTBMinDist::computeDistanceToPrototype( const vector<double> &fvalues,
  28. const int &countEx,
  29. double &dist )
  30. {
  31. double prototype = 0.0;
  32. for ( int i = 0; i < countEx; i++ ){
  33. prototype += fvalues[i];
  34. }
  35. prototype /= (double)countEx;
  36. for ( int i = 0; i < countEx; i++ ){
  37. dist += abs(prototype - fvalues[i]);
  38. }
  39. dist /= (double)countEx;
  40. }
  41. bool RTBMinDist::averageDistanceLeftRight(const vector< pair< double, int > > values,
  42. double threshold,
  43. double& avg_dist_left,
  44. double& avg_dist_right,
  45. int& count_left,
  46. int& count_right)
  47. {
  48. count_left = 0;
  49. count_right = 0;
  50. vector<int> selection_left;
  51. vector<int> selection_right;
  52. vector<double> values_left;
  53. vector<double> values_right;
  54. for ( vector< pair< double, int > >::const_iterator it = values.begin();
  55. it != values.end(); it++ )
  56. {
  57. double value = it->first;
  58. if ( value < threshold )
  59. {
  60. count_left++;
  61. selection_left.push_back( it->second );
  62. values_left.push_back( it->first );
  63. }
  64. else
  65. {
  66. count_right++;
  67. selection_right.push_back( it->second );
  68. values_right.push_back( it->first );
  69. }
  70. }
  71. if ( (count_left == 0) || (count_right == 0) )
  72. return false; // no split
  73. if ( (count_left < min_examples) || (count_right < min_examples) )
  74. return false; // no split
  75. //compute mean distance of left and right group to respective prototype
  76. computeDistanceToPrototype( values_left, count_left, avg_dist_left);
  77. computeDistanceToPrototype( values_right, count_right, avg_dist_right);
  78. return true;
  79. }
  80. RegressionNode *RTBMinDist::buildRecursive ( const NICE::VVector & x,
  81. const NICE::Vector & y,
  82. std::vector<int> & selection,
  83. int depth)
  84. {
  85. #ifdef DEBUGTREE
  86. fprintf (stderr, "Examples: %d (depth %d)\n", (int)selection.size(),
  87. (int)depth);
  88. #endif
  89. RegressionNode *node = new RegressionNode ();
  90. node->nodePrediction( y, selection );
  91. if ( depth > max_depth )
  92. {
  93. #ifdef DEBUGTREE
  94. fprintf (stderr, "RTBMinDist: maxmimum depth reached !\n");
  95. #endif
  96. node->trainExamplesIndices = selection;
  97. return node;
  98. }
  99. if ( (int)selection.size() < min_examples )
  100. {
  101. #ifdef DEBUGTREE
  102. fprintf (stderr, "RTBMinDist: minimum examples reached %d < %d !\n",
  103. (int)selection.size(), min_examples );
  104. #endif
  105. node->trainExamplesIndices = selection;
  106. return node;
  107. }
  108. int best_feature = 0;
  109. double best_threshold = 0.0;
  110. double best_reduct = -1.0;
  111. // vector<pair<double, int> > best_values;
  112. vector<pair<double, int> > values;
  113. double distance_left = 0.0;
  114. double distance_right = 0.0;
  115. for ( int k = 0; k < random_features; k++ )
  116. {
  117. #ifdef DETAILTREE
  118. fprintf (stderr, "calculating random feature %d\n", k );
  119. #endif
  120. int f = rand() % x[0].size();
  121. values.clear();
  122. collectFeatureValues ( x, selection, f, values );
  123. double curDist = 0.0;
  124. vector<double> fvalues;
  125. for ( vector< pair< double, int > >::const_iterator it = values.begin();
  126. it != values.end(); it++ )
  127. {
  128. fvalues.push_back(it->first);
  129. }
  130. computeDistanceToPrototype( fvalues, (int)values.size(), curDist );
  131. double minValue = (min_element ( values.begin(), values.end() ))->first;
  132. double maxValue = (max_element ( values.begin(), values.end() ))->first;
  133. #ifdef DETAILTREE
  134. fprintf (stderr, "max %f min %f\n", maxValue, minValue );
  135. #endif
  136. if ( maxValue - minValue < 1e-7 ) continue;
  137. for ( int i = 0; i < random_split_tests; i++ )
  138. {
  139. double threshold;
  140. threshold = rand() * (maxValue -minValue ) / RAND_MAX + minValue;
  141. #ifdef DETAILTREE
  142. fprintf (stderr, "calculating split f/s(f) %d/%d %f\n", k, i, threshold );
  143. #endif
  144. distance_left = 0.0;
  145. distance_right = 0.0;
  146. int count_left, count_right;
  147. if ( ! averageDistanceLeftRight( values, threshold, distance_left,
  148. distance_right, count_left, count_right) )
  149. continue;
  150. //double pl = (count_left) / (count_left +count_right);
  151. //double errorReduction = lsError - pl*lsError_left - (1-pl)*lsError_right;
  152. double distReduction = curDist - distance_left - distance_right;
  153. if ( distReduction > best_reduct )
  154. {
  155. best_reduct = distReduction;
  156. best_threshold = threshold;
  157. best_feature = f;
  158. #ifdef DETAILTREE
  159. fprintf (stderr, "t %f for feature %i\n", best_threshold, best_feature );
  160. #endif
  161. }
  162. }
  163. }
  164. if ( best_reduct < minimum_distance_reduction )
  165. {
  166. #ifdef DEBUGTREE
  167. fprintf (stderr, "RTBMinDist: distance reduction to small !\n");
  168. #endif
  169. node->trainExamplesIndices = selection;
  170. return node;
  171. }
  172. node->f = best_feature;
  173. node->threshold = best_threshold;
  174. // re calculating examples_left and examples_right
  175. vector<int> best_examples_left;
  176. vector<int> best_examples_right;
  177. values.clear();
  178. collectFeatureValues( x, selection, best_feature, values);
  179. best_examples_left.reserve ( values.size() / 2 );
  180. best_examples_right.reserve ( values.size() / 2 );
  181. for ( vector< pair < double, int > >::const_iterator it = values.begin();
  182. it != values.end(); it++ )
  183. {
  184. double value = it->first;
  185. if ( value < best_threshold )
  186. best_examples_left.push_back( it->second );
  187. else
  188. best_examples_right.push_back( it->second );
  189. }
  190. node->left = buildRecursive( x, y, best_examples_left, depth+1 );
  191. node->right = buildRecursive( x, y, best_examples_right, depth+1 );
  192. return node;
  193. }
  194. RegressionNode *RTBMinDist::build( const NICE::VVector & x,
  195. const NICE::Vector & y )
  196. {
  197. int index = 0;
  198. vector<int> all;
  199. all.reserve ( y.size() );
  200. for ( uint i = 0; i < y.size(); i++ )
  201. {
  202. all.push_back( index );
  203. index++;
  204. }
  205. return buildRecursive( x, y, all, 0);
  206. }