/**
 * @file DTBStandard.cpp
 * @brief standard (CART-like) decision tree builder
 * @author Erik Rodner
 * @date 05/06/2008
 */
#include <iostream>
#include <cassert>
#include <cmath>

#include "vislearning/classifier/fpclassifier/randomforest/DTBStandard.h"

using namespace OBJREC;
using namespace std;
using namespace NICE;

#undef DEBUG_CART
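// Read the tree-building hyperparameters from the given config section:
//  - minimum_information_gain: stop splitting once the best split gain falls below this value
//  - min_examples: stop splitting once a node holds fewer examples than this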
DTBStandard::DTBStandard( const Config *_conf, std::string section )
{
  minimum_information_gain = _conf->gD ( section, "minimum_information_gain", 10e-7 );
  minExamples = _conf->gD ( section, "min_examples", 10 );
}
DTBStandard::~DTBStandard()
{
}
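/* Typical usage (sketch only; assumes a NICE Config and a prepared
 * FeaturePool / Examples as used elsewhere in vislearning):
 *
 *   Config conf ( "config.conf" );
 *   DTBStandard builder ( &conf, "DTBStandard" );
 *   DecisionNode *root = builder.build ( fp, examples, maxClassNo );
 *
 * build() precomputes every feature value for every example, so memory
 * grows with |features| * |examples|.
 */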
DecisionNode *DTBStandard::build ( const FeaturePool & fp, const Examples & examples, int maxClassNo )
{
  FeatureStorage featureStorage;
  DecisionNode *root;

  fprintf (stderr, "DecisionTree: Precalculating ALL features (this is time- and memory-consuming!)\n");

  FullVector example_count ( maxClassNo + 1 );
  for ( vector< pair<int, Example> >::const_iterator example_i = examples.begin();
        example_i != examples.end() ; example_i++ )
  {
    int classno = example_i->first;
    example_count[classno] += example_i->second.weight;
  }
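
  // Precompute all feature responses once: featureStorage maps each feature
  // index to a sorted set of (value, class, example index, weight) quadruplets,
  // so the later split search only has to walk sorted values.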
  long count_features = 0;
  int feature_index = 0;
  for ( FeaturePool::const_iterator feature_i = fp.begin();
        feature_i != fp.end() ; feature_i++, feature_index++ )
  {
    const Feature *f = feature_i->second;

    int example_index = 0;
    for ( vector< pair<int, Example> >::const_iterator example_i = examples.begin();
          example_i != examples.end() ; example_i++, example_index++ )
    {
      int classno = example_i->first;
      const Example & ce = example_i->second;
      double value = f->val ( &ce );
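      // clamp subnormal responses to exactly zero; they otherwise yield
      // meaningless threshold candidates during the split search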
      if ( (value > 0.0) && (value < 1e-300) )
        value = 0.0;

      featureStorage[feature_index].insert ( quadruplet<double, int, int, double> ( value, classno, example_index, ce.weight ) );
      count_features++;
    }
  }

  fprintf (stderr, "DecisionTree: Precalculating done!\n");
  fprintf (stderr, "Feature Value Count : %ld\n", count_features );
  fprintf (stderr, "Examples : %d\n", (int)examples.size() );

  root = buildnode_allfeatures ( fp, featureStorage, example_count );

  return root;
}
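// Recursively grow the tree on the precalculated feature storage: pick the
// feature/threshold pair with maximal information gain, split the storage
// in place and recurse; stop on pure nodes, tiny nodes, or negligible gain.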
DecisionNode *DTBStandard::buildnode_allfeatures ( const FeaturePool & features,
                                                   FeatureStorage & featureStorage,
                                                   const FullVector & distribution )
{
  assert ( ! featureStorage.empty() );

  DecisionNode *node = new DecisionNode();
  node->distribution = distribution;

  double max = 0;
  double sumweight = 0;
  for ( int i = 0 ; i < distribution.size() ; i++ )
  {
    sumweight += distribution[i];
    if ( max < distribution[i] ) max = distribution[i];
  }
  if ( max == sumweight )
  {
    // leaf node reached: the distribution is concentrated on a single class
    vector<int> exsel;
    for ( FeatureStorage::iterator ft = featureStorage.begin(); ft != featureStorage.end(); ft++ )
    {
      FeatureValues & featureSet = ft->second;
      for ( FeatureValues::iterator k = featureSet.begin(); k != featureSet.end() ; k++ )
        exsel.push_back ( k->third );
    }
    node->trainExamplesIndices = exsel;
    return node;
  }
  double optimal_threshold = 0.0;
  int optimal_feature = 0;
  double optimal_ig = 0.0;
  double overall_entropy = 0.0;
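  // stopping criterion: do not split nodes with fewer than minExamples examples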
  if ( featureStorage.begin()->second.size() < minExamples )
  {
    vector<int> exsel;
    for ( FeatureStorage::iterator ft = featureStorage.begin(); ft != featureStorage.end(); ft++ )
    {
      FeatureValues & featureSet = ft->second;
      for ( FeatureValues::iterator k = featureSet.begin(); k != featureSet.end() ; k++ )
        exsel.push_back ( k->third );
    }
    node->trainExamplesIndices = exsel;
    return node;
  }
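
  // overall entropy H = -sum_k p_k * log(p_k) with p_k = n_k / sumweight,
  // written here as -sum_k p_k * ( log(n_k) - log(sumweight) )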
  for ( int i = 0 ; i < distribution.size() ; i++ )
  {
    double nk = distribution.data[i];
    if ( nk != 0 )
      overall_entropy -= nk/sumweight * ( log(nk) - log(sumweight) );
  }

  if ( overall_entropy <= 0.0 )
  {
    fprintf (stderr, "ERROR: overall entropy is not positive, unable to split\n");
    distribution.store ( cerr );
    exit(-1);
  }
  int c = 0;
  for ( FeatureStorage::const_iterator ft = featureStorage.begin(); ft != featureStorage.end(); ft++, c++ )
  {
    int feature_index = ft->first;
    const FeatureValues & featureSet = ft->second;

    double information_gain = 0.0;
    FullVector lkappa ( distribution.size() );

    double best_information_gain = 0.0;
    double best_threshold = 0.0;
    double l = 0;
    double weight_old = 0.0;

    double first_value = featureSet.begin()->first;
    FeatureValues::const_iterator lastElement = featureSet.end();
    lastElement--;

    double last_value = lastElement->first;
    double weight = lastElement->fourth;
    int previous_class = lastElement->second;
    double previous_value = lastElement->first;
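
    // scan the sorted feature values from largest to smallest; "weight"
    // accumulates the example weight seen so far, and every change in value
    // yields a candidate threshold halfway between neighbouring values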
    for ( FeatureValues::const_iterator k = lastElement;
          k != featureSet.begin();
          k--, weight += k->fourth )
    {
      int current_class = k->second;
      double current_value = k->first;
      assert ( ! isnan(current_value) );

      double m = weight - weight_old;
      l = weight_old;
      double r = sumweight - l;
      /* The stricter condition
       *   (k != featureSet.begin()) && (current_class != previous_class)
       *     && ( (current_value != previous_value) ||
       *          ( (current_value != first_value) && (current_value != last_value) ) )
       * leads to problems when using really weak classifiers.
       */
      if ( (current_value != previous_value) && (current_value != last_value) )
      {
        if ( (l <= 10e-7) || (r <= 10e-7) )
          continue;

        assert ( r > 10e-7 );
        assert ( l > 10e-7 );
        double left_entropy = 0.0;
        double right_entropy = 0.0;
        for ( int i = 0 ; i < distribution.size() ; i++ )
        {
          double lk = lkappa[i];
          double nk = distribution.data[i];
          double rk = nk - lk;
          if ( lk > 10e-7 )
          {
            double ql = lk / l;
            left_entropy -= ql * log(ql);
          }
          if ( rk > 10e-7 )
          {
            double qr = rk / r;
            right_entropy -= qr * log(qr);
          }
        }

        information_gain = overall_entropy - l*left_entropy/sumweight - r*right_entropy/sumweight;
#ifdef DEBUG_CART
        fprintf (stderr, "f: %i ig %f sumweight %f e %f le %f re %f l %f r %f\n",
                 c, information_gain, sumweight, overall_entropy, left_entropy, right_entropy, l, r );
#endif
        if ( information_gain > best_information_gain )
        {
          best_information_gain = information_gain;
          best_threshold = 0.5 * (current_value + previous_value);
        }
      }

      weight_old = weight;
      lkappa[current_class] += m;
      previous_class = current_class;
      previous_value = current_value;
    }
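
    // remember the best (feature, threshold) pair over all features;
    // ">=" keeps the later feature on exact ties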
    if ( best_information_gain >= optimal_ig )
    {
      optimal_threshold = best_threshold;
      optimal_feature = feature_index;
      optimal_ig = best_information_gain;
    }
  }
#ifdef DEBUG_CART
  fprintf (stderr, "Feature optimization finished: information gain %f threshold %f feature index %d\n",
           optimal_ig, optimal_threshold, optimal_feature );
#endif

  if ( optimal_ig == 0.0 )
  {
    fprintf (stderr, "WARNING: zero mutual information! Have a look at your feature values.\n");
    FeatureStorage::const_iterator first = featureStorage.begin();
    if ( first == featureStorage.end() )
    {
      fprintf (stderr, "ERROR: no features given at all!\n");
      exit(-1);
    }
    const FeatureValues & featureSet = first->second;
    const int max_debug_values = 50;
    int l = 0;
    for ( FeatureValues::const_iterator kd = featureSet.begin();
          (kd != featureSet.end()) && (l < max_debug_values) ; kd++, l++ )
      fprintf (stderr, "(%d,%f) ", kd->second, kd->first );
    fprintf (stderr, "\n[...] reverse:\n" );
    l = 0;
    for ( FeatureValues::const_reverse_iterator kd = featureSet.rbegin();
          (kd != featureSet.rend()) && (l < max_debug_values) ; kd++, l++ )
      fprintf (stderr, "(%d,%f) ", kd->second, kd->first );
    fprintf (stderr, "\n" );
  }
  // stop splitting if the best achievable information gain is negligible
  if ( optimal_ig <= minimum_information_gain )
  {
    vector<int> exsel;
    for ( FeatureStorage::iterator ft = featureStorage.begin(); ft != featureStorage.end(); ft++ )
    {
      FeatureValues & featureSet = ft->second;
      for ( FeatureValues::iterator k = featureSet.begin(); k != featureSet.end() ; k++ )
        exsel.push_back ( k->third );
    }
    node->trainExamplesIndices = exsel;
    return node;
  }
  // split the feature sets according to the optimal threshold
  // (could be optimized further, since each set is already sorted)
  set<int> examples_left;
  FullVector distribution_left ( distribution.size() );
  FullVector distribution_right ( distribution.size() );
  int sum_left = 0;
  for ( FeatureValues::const_iterator k = featureStorage[optimal_feature].begin();
        k != featureStorage[optimal_feature].end();
        k++ )
  {
    int example_index = k->third;
    int classno = k->second;
    double val = k->first;
    if ( val < optimal_threshold )
    {
      examples_left.insert ( example_index );
      sum_left++;
      distribution_left[classno]++;
    } else {
      distribution_right[classno]++;
    }
  }

  if ( sum_left == 0 )
  {
    // rare case of equal feature values: unable to split, return a leaf
    fprintf (stderr, "WARNING: equal feature values, splitting impossible\n");
    return node;
  }
  node->f = features[optimal_feature].second->clone();
  node->threshold = optimal_threshold;

#ifdef DEBUG_CART
  node->f->store (cerr);
  cerr << endl;
#endif

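  // move every example below the threshold into the new storage "left";
  // the remaining entries of featureStorage form the right branch, so no
  // second copy is needed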
  FeatureStorage left;
  for ( FeatureStorage::iterator ft = featureStorage.begin();
        ft != featureStorage.end(); ft++ )
  {
    int feature_index = ft->first;
    FeatureValues & featureSet = ft->second;
    for ( FeatureValues::iterator k = featureSet.begin();
          k != featureSet.end() ; )
    {
      int example_index = k->third;
      if ( examples_left.find(example_index) != examples_left.end() )
      {
        left[feature_index].insert ( quadruplet<double, int, int, double> ( *k ) );
        featureSet.erase ( k++ );
      } else {
        ++k;
      }
    }
  }
#ifdef DEBUG_CART
  for ( int i = 0 ; i < distribution_left.size() ; i++ )
    fprintf (stderr, "DecisionTree: split of class %d (%f <-> %f)\n", i,
             distribution_left[i], distribution_right[i] );
#endif

  node->left = buildnode_allfeatures ( features, left, distribution_left );
  node->right = buildnode_allfeatures ( features, featureStorage, distribution_right );

  return node;
}