소스 검색

added own class for splitting criteria of decision trees

Sven Sickert 8 년 전
부모
커밋
10a60e8cde

+ 149 - 0
classifier/fpclassifier/randomforest/SCGiniIndex.cpp

@@ -0,0 +1,149 @@
+/**
+ * @file SCGiniIndex.cpp
+ * @brief the Gini index splitting criterion
+ * @author Sven Sickert
+ * @date 01/16/2017
+
+*/
+#include "SCGiniIndex.h"
+
+using namespace OBJREC;
+
+/* default constructor: base-class defaults, all split statistics cleared */
+SCGiniIndex::SCGiniIndex()
+    : SplittingCriterion (),
+      gini_left ( 0.0 ),
+      gini_right ( 0.0 ),
+      count_left ( 0.0 ),
+      count_right ( 0.0 )
+{
+}
+
+/* simple constructor: forwards the minimum example count to the base class
+ * and clears all split statistics */
+SCGiniIndex::SCGiniIndex( int _min_examples )
+    : SplittingCriterion ( _min_examples ),
+      gini_left ( 0.0 ),
+      gini_right ( 0.0 ),
+      count_left ( 0.0 ),
+      count_right ( 0.0 )
+{
+}
+
+/* config constructor: the base class reads its settings from conf,
+ * the split statistics start at zero */
+SCGiniIndex::SCGiniIndex( const NICE::Config *conf )
+    : SplittingCriterion ( conf ),
+      gini_left ( 0.0 ),
+      gini_right ( 0.0 ),
+      count_left ( 0.0 ),
+      count_right ( 0.0 )
+{
+}
+
+/* copy constructor
+ *
+ * The base-class part (min_examples, min_entropy, min_purity, entropy_cur)
+ * is copied by the SplittingCriterion copy constructor instead of being
+ * default-constructed and re-assigned member by member here. */
+SCGiniIndex::SCGiniIndex( const SCGiniIndex &obj )
+    : SplittingCriterion ( obj ),
+      gini_left ( obj.gini_left ),
+      gini_right ( obj.gini_right ),
+      count_left ( obj.count_left ),
+      count_right ( obj.count_right )
+{
+}
+
+/* destructor: nothing to release, the class holds plain values only */
+SCGiniIndex::~SCGiniIndex() {}
+
+/* cloning function: returns a copy of this object allocated with new */
+SplittingCriterion* SCGiniIndex::clone()
+{
+    return new SCGiniIndex( *this );
+}
+
+/**
+ * Gini index 1 - sum_j p_j^2 of a class histogram, with
+ * p_j = distribution[j] / count.
+ *
+ * @param distribution accumulated value per class, read at indices 0..maxClassNo
+ * @param count normalisation constant (evaluateSplit passes the summed sample weights)
+ * @param maxClassNo largest class index (inclusive)
+ * @return Gini index; 0.0 if count is not positive, so that an empty
+ *         node does not produce 0/0 = NaN
+ */
+double SCGiniIndex::computeGiniIndex(
+        const double* distribution,
+        const double count,
+        const int maxClassNo )
+{
+    // written as a negated comparison so that a NaN count is rejected as well
+    if ( !(count > 0.0) )
+        return 0.0;
+
+    double g_sum = 0.0;
+
+    for ( int j = 0 ; j <= maxClassNo ; j++ )
+    {
+        const double p = distribution[j] / count;
+        g_sum += p*p;
+    }
+
+    return (1-g_sum);
+}
+
+/**
+ * Distribute all samples to the left (value < threshold) or right child,
+ * accumulate the weighted class histograms and, if both children contain
+ * at least min_examples samples, store the current entropy and the Gini
+ * index of each child.
+ *
+ * distribution_left / distribution_right are only added to, never reset
+ * here. count_left / count_right are always overwritten; entropy_cur,
+ * gini_left and gini_right only when true is returned.
+ *
+ * @return true if both children hold at least min_examples (unweighted) samples
+ */
+bool SCGiniIndex::evaluateSplit(
+        const FeatureValuesUnsorted & values,
+        double threshold,
+        double* distribution_left,
+        double* distribution_right,
+        int maxClassNo )
+{
+    this->count_left  = 0;
+    this->count_right = 0;
+    int n_left  = 0;
+    int n_right = 0;
+
+    // weighted class histogram of the node before splitting
+    double *hist_total = new double [maxClassNo+1];
+    for ( int c = 0; c <= maxClassNo; c++ )
+        hist_total[c] = 0.0;
+
+    FeatureValuesUnsorted::const_iterator it = values.begin();
+    while ( it != values.end() )
+    {
+        const int    classno = it->second;
+        const double value   = it->first;
+        const double weight  = it->fourth;
+
+        hist_total[classno] += weight;
+        if ( value < threshold )
+        {
+            distribution_left[classno] += weight;
+            this->count_left += weight;
+            ++n_left;
+        }
+        else
+        {
+            distribution_right[classno] += weight;
+            this->count_right += weight;
+            ++n_right;
+        }
+
+        ++it;
+    }
+
+    // the split is admissible only if neither child is too small
+    const bool admissible =  (n_left  >= this->min_examples)
+                          && (n_right >= this->min_examples);
+
+    if ( admissible )
+    {
+        // entropy of the unsplit node
+        this->entropy_cur = computeEntropy( hist_total, this->count_left+this->count_right, maxClassNo );
+
+        // Gini index of the left and right child
+        this->gini_left   = computeGiniIndex( distribution_left, this->count_left, maxClassNo );
+        this->gini_right  = computeGiniIndex( distribution_right, this->count_right, maxClassNo );
+    }
+
+    delete [] hist_total;
+    return admissible;
+}
+
+/**
+ * Purity of the last evaluated split: one minus the Gini indices of both
+ * children averaged with their weight fractions.
+ *
+ * @return 1 - ( p_left*gini_left + (1-p_left)*gini_right ); 0.0 if the
+ *         total weight is not positive (e.g. no split evaluated yet),
+ *         which would otherwise give 0/0 = NaN
+ */
+double SCGiniIndex::computePurity() const
+{
+    const double count_total = this->count_left + this->count_right;
+
+    // negated comparison also rejects NaN
+    if ( !(count_total > 0.0) )
+        return 0.0;
+
+    const double p_left = (this->count_left) / count_total;
+
+    // computing Gini impurity
+    const double gi = p_left*this->gini_left + (1-p_left)*this->gini_right;
+
+    return (1-gi);
+}

+ 83 - 0
classifier/fpclassifier/randomforest/SCGiniIndex.h

@@ -0,0 +1,83 @@
+/**
+ * @file SCGiniIndex.h
+ * @brief the Gini index splitting criterion
+ * @author Sven Sickert
+ * @date 01/16/2017
+
+*/
+#ifndef SCGiniIndexINCLUDE
+#define SCGiniIndexINCLUDE
+
+#include "SplittingCriterion.h"
+
+namespace OBJREC {
+
+/** splitting criterion based on the Gini index of both child nodes */
+class SCGiniIndex : public SplittingCriterion
+{
+  protected:
+
+    /** Gini index of the left child, written by evaluateSplit() */
+    double gini_left;
+
+    /** Gini index of the right child, written by evaluateSplit() */
+    double gini_right;
+
+    /** summed sample weight of the left child */
+    double count_left;
+
+    /** summed sample weight of the right child */
+    double count_right;
+
+    /**
+     * @brief Gini index of a class distribution
+     * @param distribution accumulated value per class (indices 0..maxClassNo)
+     * @param count normalisation constant for the distribution
+     * @param maxClassNo largest class index
+     * @return 1 minus the sum of squared class fractions
+     */
+    double computeGiniIndex(
+        const double* distribution,
+        const double count ,
+        const int maxClassNo );
+
+  public:
+
+    /** default constructor */
+    SCGiniIndex();
+
+    /** constructor taking the minimum number of examples per child */
+    SCGiniIndex( int _min_examples );
+
+    /** constructor taking its settings from a config object */
+    SCGiniIndex( const NICE::Config *conf );
+
+    /** copy constructor */
+    SCGiniIndex( const SCGiniIndex &obj );
+
+    /** destructor */
+    virtual ~SCGiniIndex();
+
+    /** create a copy of this object, allocated with new */
+    virtual SplittingCriterion* clone();
+
+    /**
+     * @brief evaluate a split and report whether it is admissible
+     * @param values unsorted feature values of one feature dimension
+     * @param threshold samples with a value below it go to the left child
+     * @param distribution_left class distribution of the left child (added to)
+     * @param distribution_right class distribution of the right child (added to)
+     * @param maxClassNo largest class index
+     * @return true if both children contain enough examples
+     */
+    virtual bool evaluateSplit(
+        const FeatureValuesUnsorted & values,
+        double threshold,
+        double* distribution_left,
+        double* distribution_right,
+        int maxClassNo );
+
+    /**
+     * @brief purity of the most recently evaluated split
+     * @return one minus the weighted Gini index of both children
+     */
+    double computePurity() const;
+
+};
+
+} // namespace
+
+
+#endif

+ 143 - 0
classifier/fpclassifier/randomforest/SCInformationGain.cpp

@@ -0,0 +1,143 @@
+/**
+ * @file SCInformationGain.cpp
+ * @brief the information gain splitting criterion
+ * @author Sven Sickert
+ * @date 01/12/2017
+
+*/
+#include "SCInformationGain.h"
+
+using namespace OBJREC;
+
+/* default constructor: base-class defaults, cleared split statistics,
+ * normalisation switched off */
+SCInformationGain::SCInformationGain()
+    : SplittingCriterion (),
+      entropy_left ( 0.0 ),
+      entropy_right ( 0.0 ),
+      count_left ( 0.0 ),
+      count_right ( 0.0 ),
+      use_shannon_entropy ( false )
+{
+}
+
+/* simple constructor: forwards the minimum example count to the base class,
+ * cleared split statistics, normalisation switched off */
+SCInformationGain::SCInformationGain( int _min_examples )
+    : SplittingCriterion ( _min_examples ),
+      entropy_left ( 0.0 ),
+      entropy_right ( 0.0 ),
+      count_left ( 0.0 ),
+      count_right ( 0.0 ),
+      use_shannon_entropy ( false )
+{
+}
+
+/* config constructor: the base class reads its settings from conf; the flag
+ * "use_shannon_entropy" is read from section "SplittingCriterion"
+ * (default: false) */
+SCInformationGain::SCInformationGain( const NICE::Config *conf )
+    : SplittingCriterion ( conf ),
+      entropy_left ( 0.0 ),
+      entropy_right ( 0.0 ),
+      count_left ( 0.0 ),
+      count_right ( 0.0 ),
+      use_shannon_entropy ( conf->gB ( "SplittingCriterion",
+                                       "use_shannon_entropy",
+                                       false ) )
+{
+}
+
+/* copy constructor
+ *
+ * The base-class part (min_examples, min_entropy, min_purity, entropy_cur)
+ * is copied by the SplittingCriterion copy constructor instead of being
+ * default-constructed and re-assigned member by member here. */
+SCInformationGain::SCInformationGain( const SCInformationGain &obj )
+    : SplittingCriterion ( obj ),
+      entropy_left ( obj.entropy_left ),
+      entropy_right ( obj.entropy_right ),
+      count_left ( obj.count_left ),
+      count_right ( obj.count_right ),
+      use_shannon_entropy ( obj.use_shannon_entropy )
+{
+}
+
+/* destructor: nothing to release, the class holds plain values only */
+SCInformationGain::~SCInformationGain() {}
+
+/* cloning function: returns a copy of this object allocated with new */
+SplittingCriterion* SCInformationGain::clone()
+{
+    return new SCInformationGain( *this );
+}
+
+/**
+ * Distribute all samples to the left (value < threshold) or right child,
+ * accumulate the weighted class histograms and, if both children contain
+ * at least min_examples samples, store the entropy of the unsplit node
+ * and of each child.
+ *
+ * distribution_left / distribution_right are only added to, never reset
+ * here. count_left / count_right are always overwritten; entropy_cur,
+ * entropy_left and entropy_right only when true is returned.
+ *
+ * @return true if both children hold at least min_examples (unweighted) samples
+ */
+bool SCInformationGain::evaluateSplit(
+        const FeatureValuesUnsorted & values,
+        double threshold,
+        double* distribution_left,
+        double* distribution_right,
+        int maxClassNo )
+{
+    this->count_left  = 0;
+    this->count_right = 0;
+    int n_left  = 0;
+    int n_right = 0;
+
+    // weighted class histogram of the node before splitting
+    double *hist_total = new double [maxClassNo+1];
+    for ( int c = 0; c <= maxClassNo; c++ )
+        hist_total[c] = 0.0;
+
+    FeatureValuesUnsorted::const_iterator it = values.begin();
+    while ( it != values.end() )
+    {
+        const int    classno = it->second;
+        const double value   = it->first;
+        const double weight  = it->fourth;
+
+        hist_total[classno] += weight;
+        if ( value < threshold )
+        {
+            distribution_left[classno] += weight;
+            this->count_left += weight;
+            ++n_left;
+        }
+        else
+        {
+            distribution_right[classno] += weight;
+            this->count_right += weight;
+            ++n_right;
+        }
+
+        ++it;
+    }
+
+    // the split is admissible only if neither child is too small
+    const bool admissible =  (n_left  >= this->min_examples)
+                          && (n_right >= this->min_examples);
+
+    if ( admissible )
+    {
+        // entropy of the unsplit node
+        this->entropy_cur  = computeEntropy( hist_total, this->count_left+this->count_right, maxClassNo );
+
+        // entropy of the left and right child
+        this->entropy_left = computeEntropy( distribution_left, this->count_left, maxClassNo );
+        this->entropy_right = computeEntropy( distribution_right, this->count_right, maxClassNo );
+    }
+
+    delete [] hist_total;
+    return admissible;
+}
+
+/**
+ * Information gain of the last evaluated split:
+ *   ig = entropy_cur - p_left*entropy_left - (1-p_left)*entropy_right
+ * with p_left = count_left / (count_left + count_right).
+ *
+ * If use_shannon_entropy is set, the gain is normalised to
+ *   2*ig / ( entropy_cur + entropy_split ),
+ * where entropy_split is the entropy of the left/right partition itself.
+ *
+ * Degenerate inputs return 0.0 instead of NaN: a non-positive total
+ * weight (0/0), and a non-positive normaliser. A child with zero weight
+ * contributes 0 to entropy_split (the limit of p*log(p) for p -> 0)
+ * rather than 0*log(0) = NaN.
+ *
+ * @return (optionally normalised) information gain
+ */
+double SCInformationGain::computePurity() const
+{
+    const double count_total = this->count_left + this->count_right;
+
+    // negated comparison also rejects NaN
+    if ( !(count_total > 0.0) )
+        return 0.0;
+
+    const double p_left  = (this->count_left) / count_total;
+    const double p_right = 1-p_left;
+    double ig = this->entropy_cur - p_left*this->entropy_left - p_right*this->entropy_right;
+
+    if ( use_shannon_entropy )
+    {
+        double entropy_split = 0.0;
+        if ( p_left > 0.0 )
+            entropy_split -= p_left*log(p_left);
+        if ( p_right > 0.0 )
+            entropy_split -= p_right*log(p_right);
+
+        const double normaliser = this->entropy_cur + entropy_split;
+        if ( !(normaliser > 0.0) )
+            return 0.0;
+
+        ig = 2*ig / normaliser;
+    }
+
+    return ig;
+}

+ 73 - 0
classifier/fpclassifier/randomforest/SCInformationGain.h

@@ -0,0 +1,73 @@
+/**
+ * @file SCInformationGain.h
+ * @brief the information gain splitting criterion
+ * @author Sven Sickert
+ * @date 01/12/2017
+
+*/
+#ifndef SCInformationGainINCLUDE
+#define SCInformationGainINCLUDE
+
+#include "SplittingCriterion.h"
+
+namespace OBJREC {
+
+/** splitting criterion based on the information gain of a split */
+class SCInformationGain : public SplittingCriterion
+{
+  protected:
+
+    /** entropy of the left child, written by evaluateSplit() */
+    double entropy_left;
+
+    /** entropy of the right child, written by evaluateSplit() */
+    double entropy_right;
+
+    /** summed sample weight of the left child */
+    double count_left;
+
+    /** summed sample weight of the right child */
+    double count_right;
+
+    /** if set, computePurity() returns the normalised information gain */
+    bool   use_shannon_entropy;
+
+  public:
+
+    /** default constructor */
+    SCInformationGain();
+
+    /** constructor taking the minimum number of examples per child */
+    SCInformationGain( int _min_examples );
+
+    /** constructor taking its settings from a config object */
+    SCInformationGain( const NICE::Config *conf );
+
+    /** copy constructor */
+    SCInformationGain( const SCInformationGain &obj );
+
+    /** destructor */
+    virtual ~SCInformationGain();
+
+    /** create a copy of this object, allocated with new */
+    virtual SplittingCriterion* clone();
+
+    /**
+     * @brief evaluate a split and report whether it is admissible
+     * @param values unsorted feature values of one feature dimension
+     * @param threshold samples with a value below it go to the left child
+     * @param distribution_left class distribution of the left child (added to)
+     * @param distribution_right class distribution of the right child (added to)
+     * @param maxClassNo largest class index
+     * @return true if both children contain enough examples
+     */
+    virtual bool evaluateSplit(
+        const FeatureValuesUnsorted & values,
+        double threshold,
+        double* distribution_left,
+        double* distribution_right,
+        int maxClassNo );
+
+    /**
+     * @brief purity of the most recently evaluated split
+     * @return information gain, normalised if use_shannon_entropy is set
+     */
+    double computePurity() const;
+
+};
+    
+} // namespace
+
+
+#endif

+ 69 - 0
classifier/fpclassifier/randomforest/SplittingCriterion.cpp

@@ -0,0 +1,69 @@
+/**
+ * @file SplittingCriterion.cpp
+ * @brief abstract interface for splitting criteria
+ * @author Sven Sickert
+ * @date 01/12/2017
+
+*/
+#include "SplittingCriterion.h"
+
+using namespace OBJREC;
+
+/* default constructor: 50 examples minimum, thresholds 10e-5 (entropy)
+ * and 10e-7 (purity); note that 10e-5 equals 1e-4 and 10e-7 equals 1e-6 */
+SplittingCriterion::SplittingCriterion()
+    : min_examples ( 50 ),
+      entropy_cur ( 0.0 ),
+      min_entropy ( 10e-5 ),
+      min_purity ( 10e-7 )
+{
+}
+
+/* simple constructor: caller-supplied minimum example count, default
+ * thresholds 10e-5 (entropy) and 10e-7 (purity) */
+SplittingCriterion::SplittingCriterion( int _min_examples )
+    : min_examples ( _min_examples ),
+      entropy_cur ( 0.0 ),
+      min_entropy ( 10e-5 ),
+      min_purity ( 10e-7 )
+{
+}
+
+/* config constructor: reads min_examples, min_entropy and min_purity from
+ * section "SplittingCriterion", falling back to 50, 10e-5 and 10e-7 */
+SplittingCriterion::SplittingCriterion( const NICE::Config *conf )
+    : min_examples ( conf->gI ( "SplittingCriterion", "min_examples", 50 ) ),
+      entropy_cur ( 0.0 ),
+      min_entropy ( conf->gD ( "SplittingCriterion", "min_entropy", 10e-5 ) ),
+      min_purity ( conf->gD ( "SplittingCriterion", "min_purity", 10e-7 ) )
+{
+}
+
+/* copy constructor: member-wise copy of all four settings/statistics */
+SplittingCriterion::SplittingCriterion( const SplittingCriterion &obj )
+    : min_examples ( obj.min_examples ),
+      entropy_cur ( obj.entropy_cur ),
+      min_entropy ( obj.min_entropy ),
+      min_purity ( obj.min_purity )
+{
+}
+
+/* destructor: nothing to release, the class holds plain values only */
+SplittingCriterion::~SplittingCriterion() {}
+
+/**
+ * Entropy (natural logarithm) of an unnormalised class histogram.
+ *
+ * Computed as log(count) - (1/count) * sum_j d_j*log(d_j), which equals
+ * -sum_j p_j*log(p_j) for p_j = d_j/count when count is the sum of all d_j.
+ *
+ * @param distribution accumulated value d_j per class, read at indices 0..maxClassNo
+ * @param count normalisation constant (callers pass the summed sample weights)
+ * @param maxClassNo largest class index (inclusive)
+ * @return entropy; 0.0 if count is not positive, since dividing by zero
+ *         and taking log(0) would otherwise produce NaN
+ */
+double SplittingCriterion::computeEntropy(
+        const double* distribution,
+        const double count,
+        const int maxClassNo )
+{
+    // negated comparison also rejects NaN
+    if ( !(count > 0.0) )
+        return 0.0;
+
+    double e = 0.0;
+
+    // empty bins are skipped: log(0) is -inf and 0*(-inf) would be NaN
+    for ( int j = 0 ; j <= maxClassNo ; j++ )
+        if ( distribution[j] != 0 )
+            e -= distribution[j] * log(distribution[j]);
+
+    e /= count;
+    e += log(count);
+
+    return e;
+}

+ 115 - 0
classifier/fpclassifier/randomforest/SplittingCriterion.h

@@ -0,0 +1,115 @@
+/**
+ * @file SplittingCriterion.h
+ * @brief abstract interface for splitting criteria
+ * @author Sven Sickert
+ * @date 01/12/2017
+
+*/
+#ifndef SplittingCriterionINCLUDE
+#define SplittingCriterionINCLUDE
+
+#include "core/basics/Config.h"
+#include "vislearning/cbaselib/Feature.h"
+
+namespace OBJREC {
+    
+/** abstract base class of all splitting criteria */
+class SplittingCriterion
+{
+  protected:
+    /** minimum number of examples each child node must contain */
+    int min_examples;
+
+    /** entropy of the node evaluated last (before splitting) */
+    double entropy_cur;
+
+    /** minimum entropy setting, exposed via getMinimumEntropy() */
+    double min_entropy;
+
+    /** minimum purity setting, exposed via getMinimumPurity() */
+    double min_purity;
+
+    /**
+     * @brief entropy of a class distribution
+     * @param distribution accumulated value per class (indices 0..maxClassNo)
+     * @param count normalisation constant for the distribution
+     * @param maxClassNo largest class index
+     * @return entropy value
+     */
+    double computeEntropy(
+        const double* distribution,
+        const double count ,
+        const int maxClassNo );
+
+  public:
+
+    /** default constructor */
+    SplittingCriterion( );
+
+    /** constructor taking the minimum number of examples per child */
+    SplittingCriterion( int _min_examples );
+
+    /** constructor taking its settings from a config object */
+    SplittingCriterion( const NICE::Config *conf );
+
+    /** copy constructor */
+    SplittingCriterion( const SplittingCriterion &obj );
+
+    /** destructor */
+    virtual ~SplittingCriterion();
+
+    /** cloning function, to be implemented by every concrete criterion */
+    virtual SplittingCriterion* clone() = 0;
+
+    /**
+     * @brief evaluate a split and report whether it is admissible
+     * @param values unsorted feature values of one feature dimension
+     * @param threshold threshold for the current feature dimension
+     * @param distribution_left class distribution of the left child after splitting
+     * @param distribution_right class distribution of the right child after splitting
+     * @param maxClassNo largest class index
+     * @return whether the split is possible
+     */
+    virtual bool evaluateSplit(
+        const FeatureValuesUnsorted & values,
+        double threshold,
+        double* distribution_left,
+        double* distribution_right,
+        int maxClassNo ) = 0;
+
+    /**
+     * @brief purity of the most recently evaluated split
+     * @return purity value
+     */
+    virtual double computePurity() const = 0;
+
+    /**
+     * @brief entropy of the node evaluated last
+     */
+    double getEntropy() const { return this->entropy_cur; }
+
+    /**
+     * @brief minimum allowed entropy value
+     */
+    double getMinimumEntropy() const { return this->min_entropy; }
+
+    /**
+     * @brief target purity value
+     */
+    double getMinimumPurity() const { return this->min_purity; }
+
+    /**
+     * @brief minimum allowed number of examples
+     */
+    int getMinimumExamples() const { return this->min_examples; }
+
+};
+    
+}  // namespace
+
+#endif