/** * @file KMedian.cpp * @brief KMedian (aka K-medoid) * @author Alexander Freytag * @date 23-04-2013 (dd-mm-yyyy) */ #ifdef NICE_USELIB_OPENMP #include #endif #include #include #include //to easily find the smallest value in a map #include "vislearning/math/cluster/KMedian.h" #include "vislearning/math/distances/genericDistance.h" #include using namespace OBJREC; using namespace std; using namespace NICE; typedef std::pair MyPairType; struct CompareSecond { bool operator()(const MyPairType& left, const MyPairType& right) const { return left.second < right.second; } }; #undef DEBUG_KMEDIAN_ASSIGNMENTS // #define DEBUG_KMEDIAN_ASSIGNMENTS #undef DEBUG_KMEDIAN_PROTOCOMP // #define DEBUG_KMEDIAN_PROTOCOMP KMedian::KMedian(const int & _noClasses, const std::string & _distanceType) : noClasses(_noClasses), distanceType(_distanceType) { //srand(time(NULL)); distancefunction = GenericDistanceSelection::selectDistance(distanceType); this->d_minDelta = 1e-5; this->i_maxIterations = 200; } KMedian::KMedian( const NICE::Config *conf, const std::string & _section) { this->distanceType = conf->gS( _section, "distanceType", "euclidean" ); this->distancefunction = GenericDistanceSelection::selectDistance(distanceType); this->d_minDelta = conf->gD( _section, "minDelta", 1e-5 ); this->i_maxIterations = conf->gI( _section, "maxIterations", 200); this->noClasses = conf->gI( _section, "noClasses", 20); } KMedian::~KMedian() { } void KMedian::initial_guess(const VVector & features, VVector & prototypes) { int j = 0; std::set > mark; for (VVector::iterator i = prototypes.begin(); i != prototypes.end(); i++, j++) { int k; do { k = rand() % features.size(); } while (mark.find(k) != mark.end()); mark.insert(mark.begin(), k); *i = features[k]; } } int KMedian::compute_prototypes(const VVector & features, VVector & prototypes, std::vector & weights, const std::vector & assignment) { #ifdef DEBUG_KMEDIAN_PROTOCOMP std::cerr << "initial assignments: "; for (std::vector::const_iterator assignIt = assignment.begin(); assignIt != assignment.end(); assignIt++) { std::cerr << " " << *assignIt; } std::cerr << std::endl; #endif //initialization for (int k = 0; k < noClasses; k++) { prototypes[k].set(0); weights[k] = 0; } NICE::VectorT numberOfCurrentAssignments ( noClasses ) ; numberOfCurrentAssignments.set ( 0 ); int exCnt = 0; //how many examples are assigned to the current clusters? for (VVector::const_iterator i = features.begin(); i != features.end(); i++, exCnt++) { int k = assignment[exCnt]; //increase counter for assigned cluster numberOfCurrentAssignments[ k ] ++; } #ifdef DEBUG_KMEDIAN_PROTOCOMP std::cerr << "k-median -- current assignmens: " << numberOfCurrentAssignments << std::endl << "noClasses: " << noClasses << std::endl; #endif //compute the median for every cluster #pragma omp parallel for for (int clusterCnt = 0; clusterCnt < noClasses; clusterCnt++) { NICE::Vector overallDistances ( numberOfCurrentAssignments[ clusterCnt ] ); VVector::const_iterator lastExampleWorkedOn = features.begin(); int i_idxOfLastExampleWorkedOn ( 0 ); uint i_exCntInt ( 0 ); //this map will contain overall distances of all examples within the current clusters //we need separate maps for every cluster to allow parallelization std::map distancesWithinCluster; for (VVector::const_iterator featIt = features.begin(); featIt != features.end(); featIt++, i_exCntInt++) { int k = assignment[i_exCntInt]; //only considere examples currently assigned to cluster clusterCnt if ( k != clusterCnt) { continue; } uint exCntIntTmp ( i_idxOfLastExampleWorkedOn ); //idx going over all features for (VVector::const_iterator j = lastExampleWorkedOn ; j != features.end(); j++, exCntIntTmp++) { int kTmp; if ( exCntIntTmp < assignment.size() ) kTmp = assignment[exCntIntTmp]; else { //actually, this will be never be reached :) std::cerr << "ERROR: exCntIntTmp >= assignment.size() " << exCntIntTmp << " " << assignment.size() << std::endl; } //only considere examples currently assigned to cluster clusterCnt if ( kTmp != clusterCnt) continue; double dist ( distancefunction->calculate( *featIt, *j) ); if ( i_exCntInt < features.size() ) { distancesWithinCluster[ i_exCntInt ] += dist; #ifdef DEBUG_KMEDIAN_PROTOCOMP std::cerr << "increase " << i_exCntInt << " by " << dist << " for " <<*featIt << " and " << *j << std::endl; #endif } else { //actually, this will be never be reached :) std::cerr << "ERROR: i_exCntInt >= features.size() " << i_exCntInt << " " << features.size() << std::endl; } if ( i_exCntInt != exCntIntTmp ) { if (exCntIntTmp < features.size() ) { distancesWithinCluster[ exCntIntTmp ] += dist; #ifdef DEBUG_KMEDIAN_PROTOCOMP std::cerr << "increase also " << exCntIntTmp << " by " << dist << std::endl; #endif } else std::cerr << "ERROR: exCntIntTmp >= features.size() " << exCntIntTmp << " " << features.size() << std::endl; } } //inc by one to avoid calculating some distances twice if ( ( featIt != features.end()) && ( (featIt +1 ) != features.end()) ) { lastExampleWorkedOn = ( featIt + 1 ); i_idxOfLastExampleWorkedOn = i_exCntInt+1; } } #ifdef DEBUG_KMEDIAN_PROTOCOMP std::cerr << "distances for cluster " << clusterCnt << " "; for(std::map::const_iterator distIt = distancesWithinCluster.begin(); distIt != distancesWithinCluster.end(); distIt++) { std::cerr << distIt->first << " " << distIt->second << " "; } std::cerr << std::endl; #endif //now compute the index of example with min overall distance int idxOfClusterMedian ( (min_element(distancesWithinCluster.begin(), distancesWithinCluster.end(), CompareSecond()))->first ); #pragma omp critical prototypes[clusterCnt] = features[idxOfClusterMedian]; //finished computations for cluster k } #ifdef DEBUG_KMEDIAN_PROTOCOMP std::cerr << " ---- prototypes after current iteration: --- " << std::endl; for (NICE::VVector::const_iterator protoIt = prototypes.begin(); protoIt != prototypes.end(); protoIt++) { std::cerr << *protoIt << " "; } std::cerr << std::endl; #endif return 0; } double KMedian::compute_delta(const VVector & oldprototypes, const VVector & prototypes) { double distance = 0; for (uint k = 0; k < oldprototypes.size(); k++) { distance += distancefunction->calculate(oldprototypes[k], prototypes[k]); #ifdef DEBUG_KMEDIAN_ASSIGNMENTS fprintf(stderr, "KMedian::compute_delta: Distance:", distancefunction->calculate(oldprototypes[k], prototypes[k])); #endif } return distance; } double KMedian::compute_assignments(const VVector & features, const VVector & prototypes, std::vector & assignment) { int index = 0; for (VVector::const_iterator i = features.begin(); i != features.end(); i++, index++) { const NICE::Vector & x = *i; double mindist = std::numeric_limits::max(); int minclass = 0; int c = 0; #ifdef DEBUG_KMEDIAN_ASSIGNMENTS fprintf(stderr, "computing nearest prototype for std::vector %d\n", index); #endif for (VVector::const_iterator j = prototypes.begin(); j != prototypes.end(); j++, c++) { const NICE::Vector & p = *j; double distance = distancefunction->calculate(p, x); #ifdef DEBUG_KMEDIAN_ASSIGNMENTS fprintf(stderr, "KMedian::compute_delta: Distance: %f\n", distancefunction->calculate(p, x)); #endif #ifdef DEBUG_KMEDIAN_ASSIGNMENTS cerr << p << endl; cerr << x << endl; fprintf(stderr, "distance to prototype %d is %f\n", c, distance); #endif if (distance < mindist) { minclass = c; mindist = distance; } } assignment[index] = minclass; } return 0.0; } double KMedian::compute_weights(const VVector & features, std::vector & weights, std::vector & assignment) { for (int k = 0; k < noClasses; k++) weights[k] = 0; int j = 0; for (VVector::const_iterator i = features.begin(); i != features.end(); i++, j++) { int k = assignment[j]; weights[k]++; } for (int k = 0; k < noClasses; k++) weights[k] = weights[k] / features.size(); return 0.0; } void KMedian::cluster(const NICE::VVector & features, NICE::VVector & prototypes, std::vector & weights, std::vector & assignment) { NICE::VVector oldprototypes; prototypes.clear(); weights.clear(); assignment.clear(); weights.resize(noClasses, 0); assignment.resize(features.size(), 0); int dimension; if ((int) features.size() >= noClasses) dimension = features[0].size(); else { fprintf(stderr, "FATAL ERROR: Not enough feature vectors provided for kMeans\n"); exit(-1); } for (int k = 0; k < noClasses; k++) { prototypes.push_back( NICE::Vector(dimension) ); prototypes[k].set(0); } bool successKMedian ( false ); int iterations ( 0 ); double delta ( std::numeric_limits::max() ); while ( !successKMedian ) { //we assume that this run will be successful successKMedian = true; this->initial_guess(features, prototypes); iterations = 0; delta = std::numeric_limits::max(); //until-loop over iterations do { iterations++; // #ifdef DEBUG_KMEDIAN_ASSIGNMENTS std::cerr << "k-median iteration " << iterations << std::endl; // #endif this->compute_assignments( features, prototypes, assignment ); if (iterations > 1) oldprototypes = prototypes; #ifdef DEBUG_KMEDIAN_ASSIGNMENTS fprintf(stderr, "KMedian::cluster compute_prototypes\n"); #endif if ( this->compute_prototypes( features, prototypes, weights, assignment ) < 0 ) { fprintf(stderr, "KMedian::cluster restart\n"); successKMedian = false; break; } #ifdef DEBUG_KMEDIAN_ASSIGNMENTS fprintf(stderr, "KMedian::cluster compute_delta\n"); #endif if (iterations > 1) delta = this->compute_delta( oldprototypes, prototypes ); #ifdef DEBUG_KMEDIAN_ASSIGNMENTS this->print_iteration( iterations, prototypes, delta ); #endif } while ((delta > d_minDelta) && (iterations < i_maxIterations)); } std::cerr << "ended optimization -- delta: " << delta << " of d_minDelta: " << d_minDelta << " --- and iterations: " << iterations << " of i_maxIterations: " << i_maxIterations << std::endl; #ifdef DEBUG_KMEDIAN_ASSIGNMENTS fprintf(stderr, "KMedian::cluster: iterations = %d, delta = %f\n", iterations, delta); #endif this->compute_weights( features, weights, assignment ); } void KMedian::print_iteration( int iterations, VVector & prototypes, double delta ) { if (iterations > 1) fprintf(stderr, "KMedian::cluster: iteration=%d delta=%f\n", iterations, delta); else fprintf(stderr, "KMedian::cluster: iteration=%d\n", iterations); int k = 0; for (VVector::const_iterator i = prototypes.begin(); i != prototypes.end(); i++, k++) { fprintf(stderr, "class (%d)\n", k); cerr << "prototype = " << (*i) << endl; } }