/** * @file HierarchicalCluster.cpp * @brief Hierarchical Clustering wrapper * @author Michael Koch * @date 10/25/2010 */ #include #include #include "vislearning/math/cluster/HierarchicalCluster.h" #include "vislearning/math/cluster/cluster.h" #include #undef max #undef min #include using namespace OBJREC; using namespace std; using namespace NICE; #undef DEBUG_HierarchicalCluster HierarchicalCluster::HierarchicalCluster(int _noClasses, char _method, int _distanceType) : noClasses(_noClasses), method(_method), distanceType(_distanceType) { srand(time(NULL)); } HierarchicalCluster::~HierarchicalCluster() { } void HierarchicalCluster::cluster(const VVector & features, VVector & prototypes, std::vector & weights, std::vector & assignment) { double double_max = std::numeric_limits::max(); if (features.size() <= 1) { prototypes=features; } else { int featureamount = features.size(); int dimension = features[0].size(); double **data; int **mask; VVectorToDoubleArray(features, data, mask); double * weightpointer = new double[dimension]; for (uint i = 0; i < dimension; i++) { if (i < weights.size()) { weightpointer[i] = weights[i]; } else { weightpointer[i] = 1.0; } } int transpose = 0; double ** distmatrix = NULL; Node * nodes = treecluster(featureamount, dimension, data, mask, weightpointer, transpose, distanceFunction(distanceType), method, distmatrix); std::vector distances; for (int i = 0; i < featureamount; i++) { Node currentnode = nodes[i]; if (currentnode.distance < double_max) { distances.push_back(currentnode.distance); } } //TODO cluster estimation refactor without cluster.h in HierarchicalCluster.h if (noClasses <= 0) { std::sort(distances.begin(), distances.end()); std::vector gradient; std::vector cuts; size_t count = 0; double last_element; for (size_t i = 0; i < distances.size(); i++) { if (count > 0) { gradient.push_back(distances[i] - last_element); } last_element = distances[i]; count++; } size_t pos_max = 0; double max; if (gradient.size() > 0) { max = gradient[0]; for (size_t i = 1; i < gradient.size(); i++) { if (gradient[i] > max) { max = gradient[i]; pos_max = i; } } } size_t pos_reject = pos_max + 1; double distance_reject = distances[pos_reject]; for (size_t i = 0; i < featureamount; i++) { if (nodes[i].distance < double_max && nodes[i].distance >= distance_reject) { cuts.push_back(2); } else { cuts.push_back(0); } } bool change = true; while (change) { change = false; for (size_t i = 0; i < featureamount; i++) { if (cuts[i] > 0 && nodes[i].distance < double_max && nodes[i].distance >= distance_reject) { if (nodes[i].left < 0) { int pos_left = -nodes[i].left - 1; if (cuts[pos_left] > 0) { if (cuts[i] > 0) { cuts[i]--; change = true; } } } if (nodes[i].right < 0) { int pos_right = -nodes[i].right - 1; if (cuts[pos_right] > 0) { if (cuts[i] > 0) { cuts[i]--; change = true; } } } } } } noClasses = 0; for (int i = 0; i < featureamount; i++) { noClasses += cuts[i]; } } int clusterassignment[featureamount]; cuttree(featureamount, nodes, noClasses, clusterassignment); // assignment.clear(); for (int i = 0; i < featureamount; i++) { if (clusterassignment[i] >= 0) { assignment.push_back(clusterassignment[i]); } } prototypes.clear(); for (int k = 0; k < noClasses; k++) { prototypes.push_back(Vector(dimension)); prototypes[k].set(0); weights.push_back(0); } compute_prototypes(features, prototypes, weights, assignment); } } void HierarchicalCluster::VVectorToDoubleArray(const VVector &data, double ** &doublearray, int ** &mask) { if (data.size() > 0) { int featureamount = data.size(); int dimension = data[0].size(); doublearray = new double*[featureamount]; mask = new int*[featureamount]; for (int i = 0; i < featureamount; ++i) { doublearray[i] = new double[dimension]; mask[i] = new int[dimension]; } for (int row = 0; row < featureamount; row++) { for (int col = 0; col < dimension; col++) { doublearray[row][col] = data[row][col]; if (isNaN(data[row][col])) { mask[row][col] = 0; } else { mask[row][col] = 1; } } } } } //TODO find better space for that one and refactor with enum char HierarchicalCluster::distanceFunction(int distancetype) { // dist (input) char // Defines which distance measure is used, as given by the table: // dist=='e': Euclidean distance // dist=='b': City-block distance // dist=='c': correlation // dist=='a': absolute value of the correlation // dist=='u': uncentered correlation // dist=='x': absolute uncentered correlation // dist=='s': Spearman's rank correlation // dist=='k': Kendall's tau // For other values of dist, the default (Euclidean distance) is used. char type = 'e'; switch (distancetype) { case 0: { type = 'e'; } break; case 1: { type = 'b'; } break; case 2: { type = 'c'; } break; case 3: { type = 'a'; } break; case 4: { type = 'u'; } break; case 5: { type = 'x'; } break; case 6: { type = 's'; } break; case 7: { type = 'k'; } break; default: type = 'e'; break; } return type; } int HierarchicalCluster::compute_prototypes(const VVector & features, VVector & prototypes, std::vector & weights, const std::vector< int> & assignment) { int j = 0; for (int k = 0; k < noClasses; k++) { prototypes[k].set(0); weights[k] = 0; } for (VVector::const_iterator i = features.begin(); i != features.end(); i++, j++) { int k = assignment[j]; NICE::Vector & p = prototypes[k]; const NICE::Vector & x = *i; p += x; weights[k]++; } for (int k = 0; k < noClasses; k++) { NICE::Vector & p = prototypes[k]; if (weights[k] <= 0) { return -1; } p *= (1.0 / weights[k]); weights[k] = weights[k] / features.size(); } return 0; }