/**
 * @file DTBStandard.cpp
 * @brief normal decision tree
 * @author Erik Rodner
 * @date 05/06/2008
 */
#include <iostream>
#include <cassert>
#include <cmath>
#include <set>

#include "vislearning/classifier/fpclassifier/randomforest/DTBStandard.h"

using namespace OBJREC;
using namespace std;
using namespace NICE;

#undef DEBUG_CART

DTBStandard::DTBStandard( const Config *_conf, std::string section )
{
    minimum_information_gain = _conf->gD (section, "minimum_information_gain", 10e-7 );
    minExamples = _conf->gD (section, "min_examples", 10);
}

DTBStandard::~DTBStandard()
{
}

DecisionNode *DTBStandard::build ( const FeaturePool & fp, const Examples & examples, int maxClassNo )
{
    FeatureStorage featureStorage;
    DecisionNode *root;

    fprintf (stderr, "DecisionTree: Precalculating ALL features (This is time and memory consuming !!)\n");

    // accumulate the weighted class distribution of all training examples
    FullVector example_count ( maxClassNo+1 );
    int example_index = 0;
    for ( vector< pair<int, Example> >::const_iterator example_i = examples.begin();
          example_i != examples.end() ; example_i++, example_index++ )
    {
        int classno = example_i->first;
        example_count[classno] += example_i->second.weight;
    }

    // precompute every feature value for every example and store it,
    // sorted by value, in featureStorage
    long count_features = 0;
    int feature_index = 0;
    for ( FeaturePool::const_iterator feature_i = fp.begin(); feature_i != fp.end() ; feature_i++ )
    {
        const Feature *f = feature_i->second;
        int example_index = 0;
        for ( vector< pair<int, Example> >::const_iterator example_i = examples.begin();
              example_i != examples.end() ; example_i++, example_index++ )
        {
            int classno = example_i->first;
            const Example & ce = example_i->second;
            double value = f->val ( &ce );
            // clamp denormalized values to zero
            if ( value > 0.0 && value < 1e-300 )
                value = 0.0;
            featureStorage[feature_index].insert (
                quadruplet<double, int, int, double> ( value, classno, example_index, ce.weight ) );
            count_features++;
        }
        feature_index++;
    }

    fprintf (stderr, "DecisionTree: Precalculating done !\n");
    fprintf (stderr, "Feature Value Count : %ld\n", count_features );
    fprintf (stderr, "Examples : %d\n", (int)examples.size() );

    root = buildnode_allfeatures ( fp, featureStorage, example_count );

    return root;
}

DecisionNode *DTBStandard::buildnode_allfeatures ( const FeaturePool & features,
        FeatureStorage & featureStorage,
        const FullVector & distribution )
{
    DecisionNode *node = NULL;
    assert ( ! featureStorage.empty() );
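    // Each node keeps the weighted class distribution of the examples that
    // reached it. A node becomes a leaf if this distribution is a single
    // peak, if too few examples are left, or if the best split does not
    // yield enough information gain (see the checks below).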
    node = new DecisionNode();
    node->distribution = distribution;

    double max = 0;
    double sumweight = 0;
    for ( int i = 0 ; i < distribution.size() ; i++ )
    {
        sumweight += distribution[i];
        if ( max < distribution[i] )
            max = distribution[i];
    }

    if ( max == sumweight )
    {
        // leaf node reached, because distribution is a simple peak distribution
        vector<int> exsel;
        for ( FeatureStorage::iterator ft = featureStorage.begin(); ft != featureStorage.end(); ft++ )
        {
            FeatureValues & featureSet = ft->second;
            for ( FeatureValues::iterator k = featureSet.begin(); k != featureSet.end() ; k++ )
            {
                exsel.push_back(k->third);
            }
        }
        node->trainExamplesIndices = exsel;
        return node;
    }

    double optimal_threshold = 0.0;
    int optimal_feature = 0;
    double optimal_ig = 0.0;
    double overall_entropy = 0.0;

    if ( featureStorage.begin()->second.size() < minExamples )
    {
        // leaf node reached, because not enough examples are left
        vector<int> exsel;
        for ( FeatureStorage::iterator ft = featureStorage.begin(); ft != featureStorage.end(); ft++ )
        {
            FeatureValues & featureSet = ft->second;
            for ( FeatureValues::iterator k = featureSet.begin(); k != featureSet.end() ; k++ )
            {
                exsel.push_back(k->third);
            }
        }
        node->trainExamplesIndices = exsel;
        return node;
    }

    // entropy of the current class distribution
    for ( int i = 0 ; i < distribution.size() ; i++ )
    {
        double nk = distribution.data[i];
        if ( nk != 0 )
            overall_entropy -= nk/sumweight * ( log (nk) - log(sumweight) );
    }

    if ( overall_entropy <= 0.0 )
    {
        // this should not happen for a non-peak distribution
        distribution.store(cerr);
        exit(-1);
    }

    int c = 0;
    for ( FeatureStorage::const_iterator ft = featureStorage.begin(); ft != featureStorage.end(); ft++ , c++ )
    {
        int feature_index = ft->first;
        const FeatureValues & featureSet = ft->second;

        double information_gain = 0.0;
        FullVector lkappa ( distribution.size() );
        double best_information_gain = 0.0;
        double best_threshold = 0.0;
        double l = 0;
        double weight_old = 0.0;

        double first_value = featureSet.begin()->first;
        FeatureValues::const_iterator lastElement = featureSet.end();
        lastElement--;

        double last_value = lastElement->first;
        double weight = lastElement->fourth;
        int previous_class = lastElement->second;
        double previous_value = lastElement->first;

        // sweep backwards through the sorted feature values and evaluate
        // every candidate threshold
        for ( FeatureValues::const_iterator k = lastElement;
              k != featureSet.begin(); k--, weight += k->fourth )
        {
            int current_class = k->second;
            double current_value = k->first;
            assert (! isnan(current_value));

            double m = weight - weight_old;
            l = weight_old;
            double r = sumweight - l;
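            // l is the total weight of the examples swept so far (feature
            // values above the candidate threshold), r the weight of the
            // remaining ones; lkappa holds the corresponding per-class
            // weights. The information gain is symmetric, so the left/right
            // naming of the two branches does not affect the result.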
            /*
            if ( (k != featureSet.begin()) && (current_class != previous_class)
                 // this leads to problems when using really weak classifiers !!
                 && ( (current_value != previous_value) ||
                      ( (current_value != first_value) && (current_value != last_value) ) ) )
            */
            if ( (current_value != previous_value) && (current_value != last_value) )
            {
                if ( (l <= 10e-7) || ( r <= 10e-7 ) )
                    continue;

                assert ( r > 10e-7 );
                assert ( l > 10e-7 );

                double left_entropy = 0.0;
                double right_entropy = 0.0;
                for ( int i = 0 ; i < distribution.size() ; i++ )
                {
                    double lk = lkappa[i];
                    double nk = distribution.data[i];
                    double rk = nk - lk;
                    if ( lk > 10e-7 )
                    {
                        double ql = lk / (double)l;
                        left_entropy -= ql * log(ql);
                    }
                    if ( rk > 10e-7 )
                    {
                        double qr = rk / (double)r;
                        right_entropy -= qr * log(qr);
                    }
                }

                information_gain = overall_entropy - l*left_entropy/sumweight - r*right_entropy/sumweight;

#ifdef DEBUG_CART
                //fprintf (stderr, "ig %f t %f\n sumweight %f e %f le %f re %f\n", information_gain,
                //    0.5 * (current_value + previous_value ), sumweight, overall_entropy, left_entropy, right_entropy );
                fprintf (stderr, "f: %i ig %f sumweight %f e %f le %f re %f l %f r %f\n", c, information_gain,
                    sumweight, overall_entropy, left_entropy, right_entropy, l, r );
#endif

                if ( information_gain > best_information_gain )
                {
                    best_information_gain = information_gain;
                    best_threshold = 0.5 * (current_value + previous_value);
                }
            }

            weight_old = weight;
            lkappa[current_class] += m;
            previous_class = current_class;
            previous_value = current_value;
        }

        if ( best_information_gain >= optimal_ig )
        {
            optimal_threshold = best_threshold;
            optimal_feature = feature_index;
            optimal_ig = best_information_gain;
        }
    }

#ifdef DEBUG_CART
    fprintf (stderr, "Feature optimization finished: information gain %f threshold %f feature index %d\n",
        optimal_ig, optimal_threshold, optimal_feature );
#endif

    if ( optimal_ig == 0.0 )
    {
        fprintf (stderr, "WARNING: zero mutual information! have a look at your feature values\n");
        FeatureStorage::const_iterator first = featureStorage.begin();
        if ( first == featureStorage.end() )
        {
            fprintf (stderr, "ERROR: no features given at all !\n");
            exit(-1);
        }
        const FeatureValues & featureSet = first->second;
        const int max_debug_values = 50;
        int l = 0;
        for ( FeatureValues::const_iterator kd = featureSet.begin();
              (kd != featureSet.end()) && (l < max_debug_values) ; kd++, l++ )
            fprintf (stderr, "(%d,%f) ", kd->second, kd->first );
        fprintf (stderr, "\n[...] reverse:\n" );
        l = 0;
        for ( FeatureValues::const_reverse_iterator kd = featureSet.rbegin();
              (kd != featureSet.rend()) && (l < max_debug_values) ; kd++, l++ )
            fprintf (stderr, "(%d,%f) ", kd->second, kd->first );
        fprintf (stderr, "\n" );
    }

    // split feature set according to optimal threshold
    if ( optimal_ig <= minimum_information_gain )
    {
        // leaf node reached, because the best split is not informative enough
        vector<int> exsel;
        for ( FeatureStorage::iterator ft = featureStorage.begin(); ft != featureStorage.end(); ft++ )
        {
            FeatureValues & featureSet = ft->second;
            for ( FeatureValues::iterator k = featureSet.begin(); k != featureSet.end() ; k++ )
            {
                exsel.push_back(k->third);
            }
        }
        node->trainExamplesIndices = exsel;
        return node;
    }

    node->f = features[optimal_feature].second->clone();
    node->threshold = optimal_threshold;

    // could be optimized, since the set is already sorted !!
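    // Determine which examples fall below the threshold for the selected
    // feature and accumulate the class distributions of both branches.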
    set<int> examples_left;
    FullVector distribution_left ( distribution.size() );
    FullVector distribution_right ( distribution.size() );
    int sum_left = 0;
    for ( FeatureValues::const_iterator k = featureStorage[optimal_feature].begin();
          k != featureStorage[optimal_feature].end(); k++ )
    {
        int example_index = k->third;
        int classno = k->second;
        double val = k->first;
        if ( val < optimal_threshold )
        {
            examples_left.insert ( example_index );
            sum_left ++;
            distribution_left[classno]++;
        } else {
            distribution_right[classno]++;
        }
    }

    if ( sum_left == 0 )
    {
        // seldom case of equal feature values
        // unable to split !!
        fprintf (stderr, "WARNING: equal feature values, splitting impossible\n");
        return node;
    }

#ifdef DEBUG_CART
    node->f->store (cerr);
    cerr << endl;
#endif

    // move the feature values of all examples that go to the left branch into
    // a separate FeatureStorage; the remaining values stay in featureStorage
    // and are reused for the right branch
    FeatureStorage left;
    for ( FeatureStorage::iterator ft = featureStorage.begin(); ft != featureStorage.end(); ft++ )
    {
        int feature_index = ft->first;
        FeatureValues & featureSet = ft->second;
        for ( FeatureValues::iterator k = featureSet.begin(); k != featureSet.end() ; )
        {
            int example_index = k->third;
            if ( examples_left.find(example_index) != examples_left.end() )
            {
                left[feature_index].insert ( quadruplet<double, int, int, double> ( *k ) );
                featureSet.erase ( k++ );
            } else {
                ++k;
            }
        }
    }

#ifdef DEBUG_CART
    for ( int i = 0 ; i < distribution_left.size() ; i++ )
    {
        fprintf (stderr, "DecisionTree: split of class %d (%f <-> %f)\n", i,
            distribution_left[i], distribution_right[i] );
    }
#endif

    node->left = buildnode_allfeatures ( features, left, distribution_left );
    node->right = buildnode_allfeatures ( features, featureStorage, distribution_right );

    return node;
}