|
@@ -0,0 +1,421 @@
|
|
|
+/**
|
|
|
+ * @file KMedian.cpp
|
|
|
+ * @brief KMedian (aka K-medoid)
|
|
|
+* @author Alexander Freytag
|
|
|
+* @date 23-04-2013 (dd-mm-yyyy)
|
|
|
+
|
|
|
+ */
|
|
|
+
|
|
|
+#ifdef NICE_USELIB_OPENMP
|
|
|
+#include <omp.h>
|
|
|
+#endif
|
|
|
+
|
|
|
+#include <iostream>
|
|
|
+#include <map>
|
|
|
+#include <algorithm> //to easily find the smallest value in a map
|
|
|
+
|
|
|
+#include "vislearning/math/cluster/KMedian.h"
|
|
|
+#include "vislearning/math/distances/genericDistance.h"
|
|
|
+
|
|
|
+#include <set>
|
|
|
+
|
|
|
+using namespace OBJREC;
|
|
|
+
|
|
|
+using namespace std;
|
|
|
+
|
|
|
+using namespace NICE;
|
|
|
+
|
|
|
+
|
|
|
+typedef std::pair<int, double> MyPairType;
|
|
|
+struct CompareSecond
|
|
|
+{
|
|
|
+ bool operator()(const MyPairType& left, const MyPairType& right) const
|
|
|
+ {
|
|
|
+ return left.second < right.second;
|
|
|
+ }
|
|
|
+};
|
|
|
+
|
|
|
+#undef DEBUG_KMEDIAN_ASSIGNMENTS
|
|
|
+// #define DEBUG_KMEDIAN_ASSIGNMENTS
|
|
|
+
|
|
|
+#undef DEBUG_KMEDIAN_PROTOCOMP
|
|
|
+// #define DEBUG_KMEDIAN_PROTOCOMP
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+KMedian::KMedian(int _noClasses, string _distanceType) :
|
|
|
+ noClasses(_noClasses), distanceType(_distanceType)
|
|
|
+{
|
|
|
+ //srand(time(NULL));
|
|
|
+ distancefunction = GenericDistanceSelection::selectDistance(distanceType);
|
|
|
+
|
|
|
+ this->d_minDelta = 1e-5;
|
|
|
+ this->i_maxIterations = 200;
|
|
|
+}
|
|
|
+
|
|
|
+KMedian::~KMedian()
|
|
|
+{
|
|
|
+}
|
|
|
+
|
|
|
+void KMedian::initial_guess(const VVector & features, VVector & prototypes)
|
|
|
+{
|
|
|
+ int j = 0;
|
|
|
+ std::set<int, std::greater<int> > mark;
|
|
|
+
|
|
|
+ for (VVector::iterator i = prototypes.begin(); i != prototypes.end(); i++, j++)
|
|
|
+ {
|
|
|
+ int k;
|
|
|
+
|
|
|
+ do
|
|
|
+ {
|
|
|
+ k = rand() % features.size();
|
|
|
+ } while (mark.find(k) != mark.end());
|
|
|
+
|
|
|
+ mark.insert(mark.begin(), k);
|
|
|
+
|
|
|
+ *i = features[k];
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+int KMedian::compute_prototypes(const VVector & features, VVector & prototypes,
|
|
|
+ std::vector<double> & weights, const std::vector<int> & assignment)
|
|
|
+{
|
|
|
+
|
|
|
+ #ifdef DEBUG_KMEDIAN_PROTOCOMP
|
|
|
+ std::cerr << "initial assignments: ";
|
|
|
+ for (std::vector<int>::const_iterator assignIt = assignment.begin(); assignIt != assignment.end(); assignIt++)
|
|
|
+ {
|
|
|
+ std::cerr << " " << *assignIt;
|
|
|
+ }
|
|
|
+ std::cerr << std::endl;
|
|
|
+ #endif
|
|
|
+
|
|
|
+ //initialization
|
|
|
+ for (int k = 0; k < noClasses; k++)
|
|
|
+ {
|
|
|
+ prototypes[k].set(0);
|
|
|
+ weights[k] = 0;
|
|
|
+ }
|
|
|
+
|
|
|
+ NICE::VectorT<int> numberOfCurrentAssignments ( noClasses ) ;
|
|
|
+ numberOfCurrentAssignments.set ( 0 );
|
|
|
+
|
|
|
+ int exCnt = 0;
|
|
|
+ //how many examples are assigned to the current clusters?
|
|
|
+ for (VVector::const_iterator i = features.begin(); i != features.end(); i++, exCnt++)
|
|
|
+ {
|
|
|
+ int k = assignment[exCnt];
|
|
|
+ //increase counter for assigned cluster
|
|
|
+ numberOfCurrentAssignments[ k ] ++;
|
|
|
+ }
|
|
|
+
|
|
|
+ #ifdef DEBUG_KMEDIAN_PROTOCOMP
|
|
|
+ std::cerr << "k-median -- current assignmens: " << numberOfCurrentAssignments << std::endl << "noClasses: " << noClasses << std::endl;
|
|
|
+ #endif
|
|
|
+
|
|
|
+ //compute the median for every cluster
|
|
|
+ #pragma omp parallel for
|
|
|
+ for (int clusterCnt = 0; clusterCnt < noClasses; clusterCnt++)
|
|
|
+ {
|
|
|
+ NICE::Vector overallDistances ( numberOfCurrentAssignments[ clusterCnt ] );
|
|
|
+ VVector::const_iterator lastExampleWorkedOn = features.begin();
|
|
|
+ int i_idxOfLastExampleWorkedOn ( 0 );
|
|
|
+ uint i_exCntInt ( 0 );
|
|
|
+
|
|
|
+ //this map will contain overall distances of all examples within the current clusters
|
|
|
+ //we need separate maps for every cluster to allow parallelization
|
|
|
+ std::map<int,double> distancesWithinCluster;
|
|
|
+ for (VVector::const_iterator featIt = features.begin(); featIt != features.end(); featIt++, i_exCntInt++)
|
|
|
+ {
|
|
|
+ int k = assignment[i_exCntInt];
|
|
|
+
|
|
|
+ //only considere examples currently assigned to cluster clusterCnt
|
|
|
+ if ( k != clusterCnt)
|
|
|
+ {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ uint exCntIntTmp ( i_idxOfLastExampleWorkedOn ); //idx going over all features
|
|
|
+ for (VVector::const_iterator j = lastExampleWorkedOn ; j != features.end(); j++, exCntIntTmp++)
|
|
|
+ {
|
|
|
+ int kTmp;
|
|
|
+ if ( exCntIntTmp < assignment.size() )
|
|
|
+ kTmp = assignment[exCntIntTmp];
|
|
|
+ else
|
|
|
+ {
|
|
|
+ //actually, this will be never be reached :)
|
|
|
+ std::cerr << "ERROR: exCntIntTmp >= assignment.size() " << exCntIntTmp << " " << assignment.size() << std::endl;
|
|
|
+ }
|
|
|
+
|
|
|
+ //only considere examples currently assigned to cluster clusterCnt
|
|
|
+ if ( kTmp != clusterCnt)
|
|
|
+ continue;
|
|
|
+
|
|
|
+
|
|
|
+ double dist ( distancefunction->calculate( *featIt, *j) );
|
|
|
+ if ( i_exCntInt < features.size() )
|
|
|
+ {
|
|
|
+ distancesWithinCluster[ i_exCntInt ] += dist;
|
|
|
+ #ifdef DEBUG_KMEDIAN_PROTOCOMP
|
|
|
+ std::cerr << "increase " << i_exCntInt << " by " << dist << " for " <<*featIt << " and " << *j << std::endl;
|
|
|
+ #endif
|
|
|
+ }
|
|
|
+ else
|
|
|
+ {
|
|
|
+ //actually, this will be never be reached :)
|
|
|
+ std::cerr << "ERROR: i_exCntInt >= features.size() " << i_exCntInt << " " << features.size() << std::endl;
|
|
|
+ }
|
|
|
+
|
|
|
+ if ( i_exCntInt != exCntIntTmp )
|
|
|
+ {
|
|
|
+ if (exCntIntTmp < features.size() )
|
|
|
+ {
|
|
|
+ distancesWithinCluster[ exCntIntTmp ] += dist;
|
|
|
+ #ifdef DEBUG_KMEDIAN_PROTOCOMP
|
|
|
+ std::cerr << "increase also " << exCntIntTmp << " by " << dist << std::endl;
|
|
|
+ #endif
|
|
|
+ }
|
|
|
+ else
|
|
|
+ std::cerr << "ERROR: exCntIntTmp >= features.size() " << exCntIntTmp << " " << features.size() << std::endl;
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+ //inc by one to avoid calculating some distances twice
|
|
|
+ if ( ( featIt != features.end()) && ( (featIt +1 ) != features.end()) )
|
|
|
+ {
|
|
|
+ lastExampleWorkedOn = ( featIt + 1 );
|
|
|
+ i_idxOfLastExampleWorkedOn = i_exCntInt+1;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ #ifdef DEBUG_KMEDIAN_PROTOCOMP
|
|
|
+ std::cerr << "distances for cluster " << clusterCnt << " ";
|
|
|
+ for(std::map<int,double>::const_iterator distIt = distancesWithinCluster.begin(); distIt != distancesWithinCluster.end(); distIt++)
|
|
|
+ {
|
|
|
+ std::cerr << distIt->first << " " << distIt->second << " ";
|
|
|
+ }
|
|
|
+ std::cerr << std::endl;
|
|
|
+ #endif
|
|
|
+
|
|
|
+ //now compute the index of example with min overall distance
|
|
|
+ int idxOfClusterMedian ( (min_element(distancesWithinCluster.begin(), distancesWithinCluster.end(), CompareSecond()))->first );
|
|
|
+
|
|
|
+ #pragma omp critical
|
|
|
+ prototypes[clusterCnt] = features[idxOfClusterMedian];
|
|
|
+
|
|
|
+ //finished computations for cluster k
|
|
|
+ }
|
|
|
+
|
|
|
+ #ifdef DEBUG_KMEDIAN_PROTOCOMP
|
|
|
+ std::cerr << " ---- prototypes after current iteration: --- " << std::endl;
|
|
|
+ for (NICE::VVector::const_iterator protoIt = prototypes.begin(); protoIt != prototypes.end(); protoIt++)
|
|
|
+ {
|
|
|
+ std::cerr << *protoIt << " ";
|
|
|
+ }
|
|
|
+
|
|
|
+ std::cerr << std::endl;
|
|
|
+ #endif
|
|
|
+
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
+double KMedian::compute_delta(const VVector & oldprototypes,
|
|
|
+ const VVector & prototypes)
|
|
|
+{
|
|
|
+ double distance = 0;
|
|
|
+
|
|
|
+ for (uint k = 0; k < oldprototypes.size(); k++)
|
|
|
+ {
|
|
|
+ distance += distancefunction->calculate(oldprototypes[k], prototypes[k]);
|
|
|
+
|
|
|
+ #ifdef DEBUG_KMEDIAN_ASSIGNMENTS
|
|
|
+ fprintf(stderr, "KMedian::compute_delta: Distance:",
|
|
|
+ distancefunction->calculate(oldprototypes[k], prototypes[k]));
|
|
|
+ #endif
|
|
|
+ }
|
|
|
+ return distance;
|
|
|
+}
|
|
|
+
|
|
|
+double KMedian::compute_assignments(const VVector & features,
|
|
|
+ const VVector & prototypes,
|
|
|
+ std::vector<int> & assignment)
|
|
|
+{
|
|
|
+ int index = 0;
|
|
|
+ for (VVector::const_iterator i = features.begin(); i != features.end(); i++, index++)
|
|
|
+ {
|
|
|
+
|
|
|
+ const NICE::Vector & x = *i;
|
|
|
+ double mindist = std::numeric_limits<double>::max();
|
|
|
+ int minclass = 0;
|
|
|
+
|
|
|
+ int c = 0;
|
|
|
+
|
|
|
+ #ifdef DEBUG_KMEDIAN_ASSIGNMENTS
|
|
|
+
|
|
|
+ fprintf(stderr, "computing nearest prototype for std::vector %d\n",
|
|
|
+ index);
|
|
|
+ #endif
|
|
|
+
|
|
|
+ for (VVector::const_iterator j = prototypes.begin(); j
|
|
|
+ != prototypes.end(); j++, c++)
|
|
|
+ {
|
|
|
+
|
|
|
+ const NICE::Vector & p = *j;
|
|
|
+ double distance = distancefunction->calculate(p, x);
|
|
|
+
|
|
|
+ #ifdef DEBUG_KMEDIAN_ASSIGNMENTS
|
|
|
+ fprintf(stderr, "KMedian::compute_delta: Distance: %f\n",
|
|
|
+ distancefunction->calculate(p, x));
|
|
|
+ #endif
|
|
|
+
|
|
|
+ #ifdef DEBUG_KMEDIAN_ASSIGNMENTS
|
|
|
+ cerr << p << endl;
|
|
|
+ cerr << x << endl;
|
|
|
+ fprintf(stderr, "distance to prototype %d is %f\n", c, distance);
|
|
|
+ #endif
|
|
|
+
|
|
|
+ if (distance < mindist)
|
|
|
+ {
|
|
|
+ minclass = c;
|
|
|
+ mindist = distance;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ assignment[index] = minclass;
|
|
|
+ }
|
|
|
+
|
|
|
+ return 0.0;
|
|
|
+}
|
|
|
+
|
|
|
+double KMedian::compute_weights(const VVector & features,
|
|
|
+ std::vector<double> & weights,
|
|
|
+ std::vector<int> & assignment)
|
|
|
+{
|
|
|
+ for (int k = 0; k < noClasses; k++)
|
|
|
+ weights[k] = 0;
|
|
|
+
|
|
|
+ int j = 0;
|
|
|
+
|
|
|
+ for (VVector::const_iterator i = features.begin(); i != features.end(); i++, j++)
|
|
|
+ {
|
|
|
+ int k = assignment[j];
|
|
|
+ weights[k]++;
|
|
|
+ }
|
|
|
+
|
|
|
+ for (int k = 0; k < noClasses; k++)
|
|
|
+ weights[k] = weights[k] / features.size();
|
|
|
+
|
|
|
+ return 0.0;
|
|
|
+}
|
|
|
+
|
|
|
+void KMedian::cluster(const NICE::VVector & features,
|
|
|
+ NICE::VVector & prototypes,
|
|
|
+ std::vector<double> & weights,
|
|
|
+ std::vector<int> & assignment)
|
|
|
+{
|
|
|
+ NICE::VVector oldprototypes;
|
|
|
+
|
|
|
+ prototypes.clear();
|
|
|
+ weights.clear();
|
|
|
+ assignment.clear();
|
|
|
+ weights.resize(noClasses, 0);
|
|
|
+ assignment.resize(features.size(), 0);
|
|
|
+
|
|
|
+ int dimension;
|
|
|
+
|
|
|
+ if ((int) features.size() >= noClasses)
|
|
|
+ dimension = features[0].size();
|
|
|
+ else
|
|
|
+ {
|
|
|
+ fprintf(stderr,
|
|
|
+ "FATAL ERROR: Not enough feature vectors provided for kMeans\n");
|
|
|
+ exit(-1);
|
|
|
+ }
|
|
|
+
|
|
|
+ for (int k = 0; k < noClasses; k++)
|
|
|
+ {
|
|
|
+ prototypes.push_back( NICE::Vector(dimension) );
|
|
|
+ prototypes[k].set(0);
|
|
|
+ }
|
|
|
+
|
|
|
+ bool successKMedian ( false );
|
|
|
+ int iterations ( 0 );
|
|
|
+ double delta ( std::numeric_limits<double>::max() );
|
|
|
+
|
|
|
+ while ( !successKMedian )
|
|
|
+ {
|
|
|
+ //we assume that this run will be successful
|
|
|
+ successKMedian = true;
|
|
|
+
|
|
|
+ this->initial_guess(features, prototypes);
|
|
|
+
|
|
|
+ iterations = 0;
|
|
|
+ delta = std::numeric_limits<double>::max();
|
|
|
+
|
|
|
+ //until-loop over iterations
|
|
|
+ do
|
|
|
+ {
|
|
|
+ iterations++;
|
|
|
+
|
|
|
+ #ifdef DEBUG_KMEDIAN_ASSIGNMENTS
|
|
|
+ std::cerr << "k-median iteration " << iterations << std::endl;
|
|
|
+ #endif
|
|
|
+
|
|
|
+ this->compute_assignments( features, prototypes, assignment );
|
|
|
+
|
|
|
+ if (iterations > 1)
|
|
|
+ oldprototypes = prototypes;
|
|
|
+
|
|
|
+ #ifdef DEBUG_KMEDIAN_ASSIGNMENTS
|
|
|
+ fprintf(stderr, "KMedian::cluster compute_prototypes\n");
|
|
|
+ #endif
|
|
|
+
|
|
|
+ if ( this->compute_prototypes( features, prototypes, weights, assignment ) < 0 )
|
|
|
+ {
|
|
|
+ fprintf(stderr, "KMedian::cluster restart\n");
|
|
|
+ successKMedian = false;
|
|
|
+ break;
|
|
|
+ }
|
|
|
+
|
|
|
+ #ifdef DEBUG_KMEDIAN_ASSIGNMENTS
|
|
|
+ fprintf(stderr, "KMedian::cluster compute_delta\n");
|
|
|
+ #endif
|
|
|
+
|
|
|
+ if (iterations > 1)
|
|
|
+ delta = this->compute_delta( oldprototypes, prototypes );
|
|
|
+
|
|
|
+ #ifdef DEBUG_KMEDIAN_ASSIGNMENTS
|
|
|
+ this->print_iteration( iterations, prototypes, delta );
|
|
|
+ #endif
|
|
|
+
|
|
|
+ } while ((delta > d_minDelta) && (iterations < i_maxIterations));
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+ std::cerr << "ended optimization -- delta: " << delta << " of d_minDelta: " << d_minDelta << " --- and iterations: " << iterations << " of i_maxIterations: " << i_maxIterations << std::endl;
|
|
|
+
|
|
|
+ #ifdef DEBUG_KMEDIAN_ASSIGNMENTS
|
|
|
+ fprintf(stderr, "KMedian::cluster: iterations = %d, delta = %f\n",
|
|
|
+ iterations, delta);
|
|
|
+ #endif
|
|
|
+
|
|
|
+ this->compute_weights( features, weights, assignment );
|
|
|
+}
|
|
|
+
|
|
|
+void KMedian::print_iteration( int iterations, VVector & prototypes, double delta )
|
|
|
+{
|
|
|
+ if (iterations > 1)
|
|
|
+ fprintf(stderr, "KMedian::cluster: iteration=%d delta=%f\n", iterations,
|
|
|
+ delta);
|
|
|
+ else
|
|
|
+ fprintf(stderr, "KMedian::cluster: iteration=%d\n", iterations);
|
|
|
+
|
|
|
+ int k = 0;
|
|
|
+
|
|
|
+ for (VVector::const_iterator i = prototypes.begin(); i != prototypes.end(); i++, k++)
|
|
|
+ {
|
|
|
+ fprintf(stderr, "class (%d)\n", k);
|
|
|
+ cerr << "prototype = " << (*i) << endl;
|
|
|
+ }
|
|
|
+}
|