/** * @file KMeansHeuristic.cpp * @brief K-Means * @author Erik Rodner, Michael Koch, Michael Trummer * @date 02/04/2011 */ #include #include "vislearning/math/cluster/KMeansHeuristic.h" #include "vislearning/math/distances/genericDistance.h" #include using namespace OBJREC; using namespace std; using namespace NICE; #undef DEBUG_KMeansHeuristic KMeansHeuristic::KMeansHeuristic(int _noClasses, string _distanceType) : noClasses(_noClasses), distanceType(_distanceType) { //srand(time(NULL)); distancefunction = GenericDistanceSelection::selectDistance(distanceType); } KMeansHeuristic::~KMeansHeuristic() { } void KMeansHeuristic::initial_guess(const VVector & features, VVector & prototypes) { int j = 0; std::set > mark; for (VVector::iterator i = prototypes.begin(); i != prototypes.end(); i++, j++) { int k; do { k = rand() % features.size(); } while (mark.find(k) != mark.end()); mark.insert(mark.begin(), k); *i = features[k]; } } // re-init cluster means int KMeansHeuristic::robust_prototypes(const VVector &features, VVector &prototypes, std::vector & weights, const std::vector & assignment) { if (features.size() > 0) { int dim = features[0].size(); weights.assign(weights.size(), 0.0); int m = 0; for (VVector::iterator i = prototypes.begin(); i != prototypes.end(); i++, m++) { NICE::Vector & p = *i; if (isnan(p[0])) { continue; } int clustersize = 0; vector clusterassign(features.size(), 0); for (int a = 0; a < (int) assignment.size(); a++) { if (assignment[a] == m) { clusterassign[a] = 1; clustersize++; } } //cout << "size " << clustersize << endl; int cnt = 0; while (cnt < clustersize/4) { //find feature with largest distance to the cluster mean double dist = 0, maxdist = 0; int maxdistind = 0; for (int a = 0; a < (int) clusterassign.size(); a++) { if (clusterassign[a] == 1) { dist = 0; dist += distancefunction->calculate(p, features[a]); if (dist > maxdist) { maxdist = dist; maxdistind = a; } } } //detach max-dist feature from the cluster clusterassign[maxdistind] = 0; cnt++; } //recalculate the cluster mean p=0.0; for (int a = 0; a < (int) clusterassign.size(); a++) { if (clusterassign[a] == 1) { if (isnan(features[a][0])) continue; p += features[a]; weights[m]++; } } if (weights[m] <= 0) { return -1; } for (int d = 0; d < dim; d++) { p[d] /= weights[m]; } #ifdef DEBUG_KMeansHeuristic cerr << "prototype for class" << m << ":" << p << endl; #endif } } return 0; } double KMeansHeuristic::compute_delta(const VVector & oldprototypes, const VVector & prototypes) { double distance = 0; for (uint k = 0; k < oldprototypes.size(); k++) distance += distancefunction->calculate(oldprototypes[k], prototypes[k]); return distance; } double KMeansHeuristic::compute_assignments(const VVector & features, const VVector & prototypes, std::vector & assignment) { int index = 0; for (VVector::const_iterator i = features.begin(); i != features.end(); i++, index++) { const NICE::Vector & x = *i; double mindist = std::numeric_limits::max(); int minclass = 0; int c = 0; #ifdef DEBUG_KMeansHeuristic fprintf(stderr, "computing nearest prototype for std::vector %d\n", index); #endif for (VVector::const_iterator j = prototypes.begin(); j != prototypes.end(); j++, c++) { const NICE::Vector & p = *j; double distance = distancefunction->calculate(p, x); #ifdef DEBUG_KMeansHeuristic fprintf(stderr, "distance to prototype %d is %f\n", c, distance); #endif if (distance < mindist) { minclass = c; mindist = distance; } } assignment[index] = minclass; } return 0.0; } double KMeansHeuristic::compute_weights(const VVector & features, std::vector< double> & weights, std::vector & assignment) { for (int k = 0; k < noClasses; k++) weights[k] = 0; int j = 0; for (VVector::const_iterator i = features.begin(); i != features.end(); i++, j++) { int k = assignment[j]; weights[k]++; } for (int k = 0; k < noClasses; k++) weights[k] = weights[k] / features.size(); return 0.0; } void KMeansHeuristic::cluster(const VVector & features, VVector & prototypes, std::vector & weights, std::vector & assignment) { VVector oldprototypes; prototypes.clear(); weights.clear(); assignment.clear(); weights.resize(noClasses, 0); assignment.resize(features.size(), 0); int dimension; if ((int) features.size() >= noClasses) dimension = features[0].size(); else { fprintf(stderr, "FATAL ERROR: Not enough feature vectors provided for KMeansHeuristic\n"); exit(-1); } for (int k = 0; k < noClasses; k++) { prototypes.push_back(Vector(dimension)); prototypes[k].set(0); } KMeansHeuristic_Restart: initial_guess(features, prototypes); int iterations = 0; double delta = std::numeric_limits::max(); const double minDelta = 1e-5; const int maxIterations = 200; do { iterations++; compute_assignments(features, prototypes, assignment); if (iterations > 1) oldprototypes = prototypes; #ifdef DEBUG_KMeansHeuristic fprintf(stderr, "KMeansHeuristic::cluster compute_prototypes\n"); #endif //if (compute_prototypes(features, prototypes, weights, assignment) < 0) if (robust_prototypes(features, prototypes, weights, assignment) < 0) { fprintf(stderr, "KMeansHeuristic::cluster restart\n"); goto KMeansHeuristic_Restart; } #ifdef DEBUG_KMeansHeuristic fprintf(stderr, "KMeansHeuristic::cluster compute_delta\n"); #endif if (iterations > 1) delta = compute_delta(oldprototypes, prototypes); #ifdef DEBUG_KMeansHeuristic print_iteration(iterations, prototypes, delta); #endif } while ((delta > minDelta) && (iterations < maxIterations)); #ifdef DEBUG_KMeansHeuristic fprintf(stderr, "KMeansHeuristic::cluster: iterations = %d, delta = %f\n", iterations, delta); #endif compute_weights(features, weights, assignment); } void KMeansHeuristic::print_iteration(int iterations, VVector & prototypes, double delta) { if (iterations > 1) fprintf(stderr, "KMeansHeuristic::cluster: iteration=%d delta=%f\n", iterations, delta); else fprintf(stderr, "KMeansHeuristic::cluster: iteration=%d\n", iterations); int k = 0; for (VVector::const_iterator i = prototypes.begin(); i != prototypes.end(); i++, k++) { fprintf(stderr, "class (%d)\n", k); cerr << "prototype = " << (*i) << endl; } }