/** * @file KMeansHeuristic.cpp * @brief K-Means * @author Erik Rodner, Michael Koch, Michael Trummer * @date 02/04/2011 */ #include #include "vislearning/math/cluster/KMeansHeuristic.h" #include "vislearning/math/distances/genericDistance.h" #include using namespace OBJREC; using namespace std; using namespace NICE; #undef DEBUG_KMeansHeuristic ///////////////////// ///////////////////// ///////////////////// // CONSTRUCTORS / DESTRUCTORS ///////////////////// ///////////////////// ///////////////////// KMeansHeuristic::KMeansHeuristic() : ClusterAlgorithm() { this->noClusters = 20; this->distanceType = "euclidean"; this->distancefunction = NULL; } KMeansHeuristic::KMeansHeuristic(int _noClusters, string _distanceType) : noClusters(_noClusters), distanceType(_distanceType) { //srand(time(NULL)); this->distancefunction = GenericDistanceSelection::selectDistance(distanceType); } KMeansHeuristic::KMeansHeuristic( const NICE::Config * _conf, const std::string & _confSection) { this->initFromConfig( _conf, _confSection ); } KMeansHeuristic::~KMeansHeuristic() { if ( this->distancefunction != NULL ) { delete this->distancefunction; this->distancefunction = NULL ; } } void KMeansHeuristic::initFromConfig( const NICE::Config* _conf, const std::string& _confSection ) { this->noClusters = _conf->gI( _confSection, "noClusters", 20); this->distanceType = _conf->gS( _confSection, "distanceType", "euclidean" ); this->distancefunction = OBJREC::GenericDistanceSelection::selectDistance( this->distanceType ); } ///////////////////// ///////////////////// ///////////////////// // CLUSTERING STUFF ///////////////////// ///////////////////// ////////////////// void KMeansHeuristic::initial_guess(const VVector & features, VVector & prototypes) { int j = 0; std::set > mark; for (VVector::iterator i = prototypes.begin(); i != prototypes.end(); i++, j++) { int k; do { k = rand() % features.size(); } while (mark.find(k) != mark.end()); mark.insert(mark.begin(), k); *i = features[k]; } } // re-init cluster means int KMeansHeuristic::robust_prototypes(const VVector &features, VVector &prototypes, std::vector & weights, const std::vector & assignment) { if (features.size() > 0) { int dim = features[0].size(); weights.assign(weights.size(), 0.0); int m = 0; for (VVector::iterator i = prototypes.begin(); i != prototypes.end(); i++, m++) { NICE::Vector & p = *i; if (NICE::isNaN(p[0])) { continue; } int clustersize = 0; vector clusterassign(features.size(), 0); for (int a = 0; a < (int) assignment.size(); a++) { if (assignment[a] == m) { clusterassign[a] = 1; clustersize++; } } //cout << "size " << clustersize << endl; int cnt = 0; while (cnt < clustersize/4) { //find feature with largest distance to the cluster mean double dist = 0, maxdist = 0; int maxdistind = 0; for (int a = 0; a < (int) clusterassign.size(); a++) { if (clusterassign[a] == 1) { dist = 0; dist += distancefunction->calculate(p, features[a]); if (dist > maxdist) { maxdist = dist; maxdistind = a; } } } //detach max-dist feature from the cluster clusterassign[maxdistind] = 0; cnt++; } //recalculate the cluster mean p=0.0; for (int a = 0; a < (int) clusterassign.size(); a++) { if (clusterassign[a] == 1) { if (NICE::isNaN(features[a][0])) continue; p += features[a]; weights[m]++; } } if (weights[m] <= 0) { return -1; } for (int d = 0; d < dim; d++) { p[d] /= weights[m]; } #ifdef DEBUG_KMeansHeuristic cerr << "prototype for class" << m << ":" << p << endl; #endif } } return 0; } double KMeansHeuristic::compute_delta(const VVector & oldprototypes, const VVector & prototypes) { double distance = 0; for (uint k = 0; k < oldprototypes.size(); k++) distance += distancefunction->calculate(oldprototypes[k], prototypes[k]); return distance; } double KMeansHeuristic::compute_assignments(const VVector & features, const VVector & prototypes, std::vector & assignment) { int index = 0; for (VVector::const_iterator i = features.begin(); i != features.end(); i++, index++) { const NICE::Vector & x = *i; double mindist = std::numeric_limits::max(); int minclass = 0; int c = 0; #ifdef DEBUG_KMeansHeuristic fprintf(stderr, "computing nearest prototype for std::vector %d\n", index); #endif for (VVector::const_iterator j = prototypes.begin(); j != prototypes.end(); j++, c++) { const NICE::Vector & p = *j; double distance = distancefunction->calculate(p, x); #ifdef DEBUG_KMeansHeuristic fprintf(stderr, "distance to prototype %d is %f\n", c, distance); #endif if (distance < mindist) { minclass = c; mindist = distance; } } assignment[index] = minclass; } return 0.0; } double KMeansHeuristic::compute_weights(const VVector & features, std::vector< double> & weights, std::vector & assignment) { for (int k = 0; k < noClusters; k++) weights[k] = 0; int j = 0; for (VVector::const_iterator i = features.begin(); i != features.end(); i++, j++) { int k = assignment[j]; weights[k]++; } for (int k = 0; k < noClusters; k++) weights[k] = weights[k] / features.size(); return 0.0; } void KMeansHeuristic::cluster(const VVector & features, VVector & prototypes, std::vector & weights, std::vector & assignment) { VVector oldprototypes; prototypes.clear(); weights.clear(); assignment.clear(); weights.resize(noClusters, 0); assignment.resize(features.size(), 0); int dimension; if ((int) features.size() >= noClusters) dimension = features[0].size(); else { fprintf(stderr, "FATAL ERROR: Not enough feature vectors provided for KMeansHeuristic\n"); exit(-1); } for (int k = 0; k < noClusters; k++) { prototypes.push_back(Vector(dimension)); prototypes[k].set(0); } KMeansHeuristic_Restart: initial_guess(features, prototypes); int iterations = 0; double delta = std::numeric_limits::max(); const double minDelta = 1e-5; const int maxIterations = 200; do { iterations++; compute_assignments(features, prototypes, assignment); if (iterations > 1) oldprototypes = prototypes; #ifdef DEBUG_KMeansHeuristic fprintf(stderr, "KMeansHeuristic::cluster compute_prototypes\n"); #endif //if (compute_prototypes(features, prototypes, weights, assignment) < 0) if (robust_prototypes(features, prototypes, weights, assignment) < 0) { fprintf(stderr, "KMeansHeuristic::cluster restart\n"); goto KMeansHeuristic_Restart; } #ifdef DEBUG_KMeansHeuristic fprintf(stderr, "KMeansHeuristic::cluster compute_delta\n"); #endif if (iterations > 1) delta = compute_delta(oldprototypes, prototypes); #ifdef DEBUG_KMeansHeuristic print_iteration(iterations, prototypes, delta); #endif } while ((delta > minDelta) && (iterations < maxIterations)); #ifdef DEBUG_KMeansHeuristic fprintf(stderr, "KMeansHeuristic::cluster: iterations = %d, delta = %f\n", iterations, delta); #endif compute_weights(features, weights, assignment); } void KMeansHeuristic::print_iteration(int iterations, VVector & prototypes, double delta) { if (iterations > 1) fprintf(stderr, "KMeansHeuristic::cluster: iteration=%d delta=%f\n", iterations, delta); else fprintf(stderr, "KMeansHeuristic::cluster: iteration=%d\n", iterations); int k = 0; for (VVector::const_iterator i = prototypes.begin(); i != prototypes.end(); i++, k++) { fprintf(stderr, "class (%d)\n", k); cerr << "prototype = " << (*i) << endl; } } ///////////////////// INTERFACE PERSISTENT ///////////////////// // interface specific methods for store and restore ///////////////////// INTERFACE PERSISTENT ///////////////////// void KMeansHeuristic::restore ( std::istream & is, int format ) { //delete everything we knew so far... this->clear(); if ( is.good() ) { std::string tmp; is >> tmp; //class name if ( ! this->isStartTag( tmp, "KMeansHeuristic" ) ) { std::cerr << " WARNING - attempt to restore KMeansHeuristic, but start flag " << tmp << " does not match! Aborting... " << std::endl; throw; } bool b_endOfBlock ( false ) ; while ( !b_endOfBlock ) { is >> tmp; // start of block if ( this->isEndTag( tmp, "KMeansHeuristic" ) ) { b_endOfBlock = true; continue; } tmp = this->removeStartTag ( tmp ); if ( tmp.compare("noClusters") == 0 ) { is >> this->noClusters; is >> tmp; // end of block tmp = this->removeEndTag ( tmp ); } else if ( tmp.compare("distanceType") == 0 ) { is >> this->distanceType; //TODO fixme this->distancefunction = OBJREC::GenericDistanceSelection::selectDistance( this->distanceType ); is >> tmp; // end of block tmp = this->removeEndTag ( tmp ); } else if ( tmp.compare("distancefunction") == 0 ) { //TODO is >> this->distancefunction; is >> tmp; // end of block tmp = this->removeEndTag ( tmp ); } else { std::cerr << "WARNING -- unexpected KMeansHeuristic object -- " << tmp << " -- for restoration... aborting" << std::endl; throw; } } } else { std::cerr << "KMeansHeuristic::restore -- InStream not initialized - restoring not possible!" << std::endl; throw; } } void KMeansHeuristic::store ( std::ostream & os, int format ) const { if (os.good()) { // show starting point os << this->createStartTag( "KMeansHeuristic" ) << std::endl; os << this->createStartTag( "noClusters" ) << std::endl; os << this->noClusters << std::endl; os << this->createEndTag( "noClusters" ) << std::endl; os << this->createStartTag( "distanceType" ) << std::endl; os << this->distanceType << std::endl; os << this->createEndTag( "distanceType" ) << std::endl; os << this->createStartTag( "distancefunction" ) << std::endl; //TODO os << this->distancefunction << std::endl; os << this->createEndTag( "distancefunction" ) << std::endl; // done os << this->createEndTag( "KMeansHeuristic" ) << std::endl; } else { std::cerr << "OutStream not initialized - storing not possible!" << std::endl; } } void KMeansHeuristic::clear () { if ( this->distancefunction != NULL ) { delete this->distancefunction; this->distancefunction = NULL ; } }