/** * @file KMeans.cpp * @brief K-Means * @author Erik Rodner, Alexander Freytag * @date 29-10-2007 (dd-mm-yyyy) */ #ifdef NICE_USELIB_OPENMP #include #endif #include #include #include #include "vislearning/math/cluster/KMeans.h" #include "vislearning/math/distances/genericDistance.h" using namespace OBJREC; using namespace std; using namespace NICE; #undef DEBUG_KMEANS ///////////////////// ///////////////////// ///////////////////// // CONSTRUCTORS / DESTRUCTORS ///////////////////// ///////////////////// ///////////////////// KMeans::KMeans() : ClusterAlgorithm() { this->noClusters = 20; this->distanceType = "euclidean"; this->distancefunction = NULL; this->d_minDelta = 1e-5; this->i_maxIterations = 200; } KMeans::KMeans(const int & _noClusters, const std::string & _distanceType) : noClusters(_noClusters), distanceType(_distanceType) { //srand(time(NULL)); this->distancefunction = GenericDistanceSelection::selectDistance(distanceType); this->d_minDelta = 1e-5; this->i_maxIterations = 200; } KMeans::KMeans( const NICE::Config * _conf, const std::string & _confSection) { this->initFromConfig( _conf, _confSection ); } KMeans::~KMeans() { if ( this->distancefunction != NULL ) { delete this->distancefunction; this->distancefunction = NULL ; } } void KMeans::initFromConfig( const NICE::Config* _conf, const std::string& _confSection ) { this->noClusters = _conf->gI( _confSection, "noClusters", 20); this->distanceType = _conf->gS( _confSection, "distanceType", "euclidean" ); this->distancefunction = OBJREC::GenericDistanceSelection::selectDistance( this->distanceType ); this->d_minDelta = _conf->gD( _confSection, "minDelta", 1e-5 ); this->i_maxIterations = _conf->gI( _confSection, "maxIterations", 200); } ///////////////////// ///////////////////// ///////////////////// // CLUSTERING STUFF ///////////////////// ///////////////////// ////////////////// void KMeans::initial_guess(const NICE::VVector & features, NICE::VVector & prototypes) { int j = 0; std::set > mark; for (VVector::iterator i = prototypes.begin(); i != prototypes.end(); i++, j++) { int k; do { k = rand() % features.size(); } while (mark.find(k) != mark.end()); mark.insert(mark.begin(), k); *i = features[k]; } } int KMeans::compute_prototypes(const VVector & features, VVector & prototypes, std::vector & weights, const std::vector & assignment) { int j = 0; // fprintf (stderr, "KMeans::compute_prototypes: init noClusters=%d\n", noClusters); for (int k = 0; k < this->noClusters; k++) { prototypes[k].set(0); weights[k] = 0; } // fprintf (stderr, "KMeans::compute_prototypes: compute means\n"); for (VVector::const_iterator i = features.begin(); i != features.end(); i++, j++) { int k = assignment[j]; NICE::Vector & p = prototypes[k]; const NICE::Vector & x = *i; #ifdef DEBUG_KMEANS fprintf( stderr, "KMeans::compute_prototypes: std::vector %d has assignment %d\n", j, k); #endif p += x; #ifdef DEBUG_KMEANS std::cerr << "vector was : " << x << std::endl; std::cerr << "prototype for this class is now : " << p << std::endl; #endif weights[k]++; } // fprintf (stderr, "KMeans::compute_prototypes: scaling\n"); #pragma omp parallel for for (int k = 0; k < this->noClusters; k++) { NICE::Vector & p = prototypes[k]; #ifdef DEBUG_KMEANS std::cerr << "prototype for this class before scaling : " << p << std::endl; #endif // if (weights[k] <= 0) // { // return -1; // } if (weights[k] > 0) { p *= (1.0 / weights[k]); weights[k] = weights[k] / features.size(); #ifdef DEBUG_KMEANS std::cerr << "prototype for this class after scaling with " << weights[k] << " : " << p << std::endl; #endif } } return 0; } double KMeans::compute_delta(const NICE::VVector & oldprototypes, const NICE::VVector & prototypes) { double distance = 0; for (uint k = 0; k < oldprototypes.size(); k++) { distance += this->distancefunction->calculate(oldprototypes[k], prototypes[k]); #ifdef DEBUG_KMEANS fprintf(stderr, "KMeans::compute_delta: Distance: %f", distancefunction->calculate(oldprototypes[k], prototypes[k])); #endif } return distance; } double KMeans::compute_assignments(const NICE::VVector & features, const NICE::VVector & prototypes, std::vector & assignment) { // int index = 0; // for (VVector::const_iterator i = features.begin(); i != features.end(); i++, index++) uint noFeatures ( features.size() ); #pragma omp parallel for for (int index = 0; index < noFeatures; index++) { // const NICE::Vector & x = *i; const NICE::Vector & x = features[index]; double mindist = std::numeric_limits::max(); int minclass = 0; int c = 0; #ifdef DEBUG_KMEANS fprintf(stderr, "computing nearest prototype for std::vector %d\n", index); #endif for (VVector::const_iterator j = prototypes.begin(); j != prototypes.end(); j++, c++) { const NICE::Vector & p = *j; double distance = this->distancefunction->calculate(p, x); #ifdef DEBUG_KMEANS fprintf(stderr, "KMeans::compute_delta: Distance: %f", this->distancefunction->calculate(p, x)); #endif #ifdef DEBUG_KMEANS std::cerr << p << std::endl; std::cerr << x << std::endl; fprintf(stderr, "distance to prototype %d is %f\n", c, distance); #endif if (distance < mindist) { minclass = c; mindist = distance; } } assignment[index] = minclass; } return 0.0; } double KMeans::compute_weights(const NICE::VVector & features, std::vector & weights, std::vector & assignment) { for (int k = 0; k < this->noClusters; k++) weights[k] = 0; int j = 0; for (NICE::VVector::const_iterator i = features.begin(); i != features.end(); i++, j++) { int k = assignment[j]; weights[k]++; } #pragma omp parallel for for (int k = 0; k < this->noClusters; k++) weights[k] = weights[k] / features.size(); return 0.0; } void KMeans::cluster(const NICE::VVector & features, NICE::VVector & prototypes, std::vector & weights, std::vector & assignment) { NICE::VVector oldprototypes; prototypes.clear(); weights.clear(); assignment.clear(); weights.resize(noClusters, 0); assignment.resize(features.size(), 0); int dimension; if ((int) features.size() >= this->noClusters) dimension = features[0].size(); else { std::ostringstream stringStream; stringStream << "FATAL ERROR: Not enough feature vectors provided for kMeans \n k: " << this->noClusters << " numberOfFeatures: " << features.size() << "\n"; std::string errormessage ( stringStream.str() ); throw NICE::Exception( errormessage ); } for (int k = 0; k < this->noClusters; k++) { prototypes.push_back(Vector(dimension)); prototypes[k].set(0); } bool successKMeans ( false ); int iterations ( 0 ); double delta ( std::numeric_limits::max() ); while ( !successKMeans ) { //we assume that this run will be successful successKMeans = true; this->initial_guess(features, prototypes); iterations = 0; delta = std::numeric_limits::max(); do { iterations++; this->compute_assignments(features, prototypes, assignment); if (iterations > 1) oldprototypes = prototypes; #ifdef DEBUG_KMEANS fprintf(stderr, "KMeans::cluster compute_prototypes\n"); #endif if ( this->compute_prototypes(features, prototypes, weights, assignment) < 0 ) { fprintf(stderr, "KMeans::cluster restart\n"); successKMeans = false; break; } #ifdef DEBUG_KMEANS fprintf(stderr, "KMeans::cluster compute_delta\n"); #endif if (iterations > 1) delta = this->compute_delta(oldprototypes, prototypes); #ifdef DEBUG_KMEANS print_iteration(iterations, prototypes, delta); #endif } while ((delta > d_minDelta) && (iterations < i_maxIterations)); } #ifdef DEBUG_KMEANS fprintf(stderr, "KMeans::cluster: iterations = %d, delta = %f\n", iterations, delta); #endif this->compute_weights(features, weights, assignment); } void KMeans::print_iteration(int iterations, NICE::VVector & prototypes, double delta) { if (iterations > 1) fprintf(stderr, "KMeans::cluster: iteration=%d delta=%f\n", iterations, delta); else fprintf(stderr, "KMeans::cluster: iteration=%d\n", iterations); int k = 0; for (NICE::VVector::const_iterator i = prototypes.begin(); i != prototypes.end(); i++, k++) { fprintf(stderr, "class (%d)\n", k); std::cerr << "prototype = " << (*i) << std::endl; } } ///////////////////// INTERFACE PERSISTENT ///////////////////// // interface specific methods for store and restore ///////////////////// INTERFACE PERSISTENT ///////////////////// void KMeans::restore ( std::istream & is, int format ) { //delete everything we knew so far... this->clear(); if ( is.good() ) { std::string tmp; is >> tmp; //class name if ( ! this->isStartTag( tmp, "KMeans" ) ) { std::cerr << " WARNING - attempt to restore KMeans, but start flag " << tmp << " does not match! Aborting... " << std::endl; throw; } bool b_endOfBlock ( false ) ; while ( !b_endOfBlock ) { is >> tmp; // start of block if ( this->isEndTag( tmp, "KMeans" ) ) { b_endOfBlock = true; continue; } tmp = this->removeStartTag ( tmp ); if ( tmp.compare("noClusters") == 0 ) { is >> this->noClusters; is >> tmp; // end of block tmp = this->removeEndTag ( tmp ); } else if ( tmp.compare("distanceType") == 0 ) { is >> this->distanceType; //TODO fixme this->distancefunction = OBJREC::GenericDistanceSelection::selectDistance( this->distanceType ); is >> tmp; // end of block tmp = this->removeEndTag ( tmp ); } else if ( tmp.compare("distancefunction") == 0 ) { //TODO is >> this->distancefunction; is >> tmp; // end of block tmp = this->removeEndTag ( tmp ); } else if ( tmp.compare("d_minDelta") == 0 ) { is >> this->d_minDelta; is >> tmp; // end of block tmp = this->removeEndTag ( tmp ); } else if ( tmp.compare("i_maxIterations") == 0 ) { is >> this->i_maxIterations; is >> tmp; // end of block tmp = this->removeEndTag ( tmp ); } else { std::cerr << "WARNING -- unexpected KMeans object -- " << tmp << " -- for restoration... aborting" << std::endl; throw; } } } else { std::cerr << "KMeans::restore -- InStream not initialized - restoring not possible!" << std::endl; throw; } } void KMeans::store ( std::ostream & os, int format ) const { if (os.good()) { // show starting point os << this->createStartTag( "KMeans" ) << std::endl; os << this->createStartTag( "noClusters" ) << std::endl; os << this->noClusters << std::endl; os << this->createEndTag( "noClusters" ) << std::endl; os << this->createStartTag( "distanceType" ) << std::endl; os << this->distanceType << std::endl; os << this->createEndTag( "distanceType" ) << std::endl; os << this->createStartTag( "distancefunction" ) << std::endl; //TODO os << this->distancefunction << std::endl; os << this->createEndTag( "distancefunction" ) << std::endl; os << this->createStartTag( "d_minDelta" ) << std::endl; os << this->d_minDelta << std::endl; os << this->createEndTag( "d_minDelta" ) << std::endl; os << this->createStartTag( "i_maxIterations" ) << std::endl; os << this->i_maxIterations << std::endl; os << this->createEndTag( "i_maxIterations" ) << std::endl; // done os << this->createEndTag( "KMeans" ) << std::endl; } else { std::cerr << "OutStream not initialized - storing not possible!" << std::endl; } } void KMeans::clear () { if ( this->distancefunction != NULL ) { delete this->distancefunction; this->distancefunction = NULL ; } }