Преглед на файлове

Implemented simple k-Nearest-Neighbor regression.

Frank Prüfer преди 11 години
родител
ревизия
00e06b09a0
променени са 3 файла, в които са добавени 433 реда и са изтрити 0 реда
  1. 93 0
      regression/npregression/RegKNN.cpp
  2. 55 0
      regression/npregression/RegKNN.h
  3. 285 0
      regression/progs/testNPRegression.cpp

+ 93 - 0
regression/npregression/RegKNN.cpp

@@ -0,0 +1,93 @@
+/**
+* @file RegKNN.cpp
+* @brief Implementation of k-Nearest-Neighbor algorithm for regression purposes
+* @author Frank Prüfer
+* @date 08/29/2013
+
+*/  
#include <algorithm>
#include <cmath>
#include <iostream>

#include "vislearning/regression/npregression/RegKNN.h"

#include "vislearning/math/mathbase/FullVector.h"
+
+using namespace OBJREC;
+
+using namespace std;
+using namespace NICE;
+
+
/**
 * Constructor: reads the neighbor count K from config section "RegKNN"
 * (key "K", default 1). If no distance functor is supplied, a Euclidean
 * distance is allocated as the default.
 *
 * NOTE(review): the internally allocated EuclidianDistance is never freed
 * in ~RegKNN() — a small one-off leak; fixing it safely needs an ownership
 * flag, since callers may pass (and own) their own functor.
 */
RegKNN::RegKNN ( const Config *_conf, NICE::VectorDistance<double> *_distancefunc ) : distancefunc (_distancefunc)
{
    K = _conf->gI("RegKNN", "K", 1 );
    if ( _distancefunc == NULL )
		distancefunc = new EuclidianDistance<double>();
}
+
// Destructor — intentionally empty; dataSet/labelSet clean up themselves.
// NOTE(review): distancefunc is NOT deleted here, so the default
// EuclidianDistance allocated in the ctor leaks. It cannot be deleted
// unconditionally because callers may pass a functor they own.
RegKNN::~RegKNN()
{
}
+
+void RegKNN::teach ( const NICE::VVector & _dataSet, const NICE::Vector & _labelSet)
+{
+    fprintf (stderr, "teach using all !\n");
+    //NOTE this is crucial if we clear _teachSet afterwards!
+    //therefore, take care NOT to call _techSet.clear() somewhere out of this method
+    this->dataSet = _dataSet;
+    this->labelSet = _labelSet;
+    
+    std::cerr << "number of known training samples: " << this->dataSet.size() << std::endl;   
+    
+}
+
+// void RegKNN::teach ( const NICE::Vector & x, const double & y )
+// {
+//     std::cerr << "RegKNN::teach one new example" << std::endl;
+//     
+//     for ( size_t i = 0 ; i < x.size() ; i++ )
+//       if ( isnan(x[i]) ) 
+//       {
+//           fprintf (stderr, "There is a NAN value in within this vector: x[%d] = %f\n", (int)i, x[i]);
+//           cerr << x << endl;
+//           exit(-1);
+//       }
+// 
+//     dataSet.push_back ( x );
+//     labelSet.push_back ( y );
+//     
+//     std::cerr << "number of known training samples: " << dataSet.size()<< std::endl;
+// }
+
+double RegKNN::predict ( const NICE::Vector & x )
+{
+    FullVector distances(dataSet.size());
+
+    if ( dataSet.size() <= 0 ) {
+		fprintf (stderr, "RegKNN: please train this classifier before classifying\n");
+		exit(-1);
+    }
+
+    for(uint i = 0; i < dataSet.size(); i++){
+    
+      double distance = distancefunc->calculate (x,dataSet[i]);
+      
+      if ( isnan(distance) )
+      {
+          fprintf (stderr, "RegKNN::classify: NAN value found !!\n");
+          cerr << x << endl;
+          cerr << dataSet[i] << endl;
+      }
+      distances[i] = distance;
+      
+    }
+    std::vector<int> ind;
+    distances.getSortedIndices(ind);
+    
+    double response = 0.0;  
+    
+    for(uint i = 0; i < K; i++){
+      response += labelSet[ind[i]];
+    }
+       
+    return (response / (double) K);
+}

+ 55 - 0
regression/npregression/RegKNN.h

@@ -0,0 +1,55 @@
+/**
+* @file RegKNN.h
+* @brief Implementation of k-Nearest-Neighbor algorithm for regression purposes
+* @author Frank Prüfer
+* @date 08/29/2013
+
+*/ 
+#ifndef REGKNNINCLUDE
+#define REGKNNINCLUDE
+
+#include "core/vector/VectorT.h"
+#include "core/vector/VVector.h"
+#include "core/vector/MatrixT.h"
+
+#include "core/basics/Config.h"
+
+#include <core/vector/Distance.h>
+
+#include "vislearning/regression/regressionbase/RegressionAlgorithm.h"
+
+namespace OBJREC
+{ 
/** k-Nearest-Neighbor regression: predict() returns the mean response of
 *  the K stored training samples closest to the query point under the
 *  configured distance measure. */
class RegKNN : public RegressionAlgorithm
{
  protected:
    /** number of neighbors averaged in predict() (config "RegKNN::K", default 1) */
    int K;
    
    /** set of data points */
    NICE::VVector dataSet;
    
    /** set of responses according to dataset */
    NICE::Vector labelSet;
    
    /** used distance function; defaults to Euclidean if none is supplied */
    NICE::VectorDistance<double> *distancefunc;
  
  public:
    /** simple constructor; reads K from conf, NULL distancefunc selects Euclidean distance */
    RegKNN( const NICE::Config *conf, NICE::VectorDistance<double> *distancefunc = NULL );
    
    /** simple destructor */
    virtual ~RegKNN();
    
    /** predict response using simple vector */
    double predict ( const NICE::Vector & x );
    
    /** teach whole set at once (deep copies both arguments) */
    void teach ( const NICE::VVector & dataSet, const NICE::Vector & labelSet );

//     /** teach one data point at a time */
//     void teach ( const NICE::Vector & x, const double & y );
};
+}	//namespace
+
+#endif

+ 285 - 0
regression/progs/testNPRegression.cpp

@@ -0,0 +1,285 @@
+/**
+* @file testNPRegression.cpp
+* @brief test of non-parametric (k-Nearest-Neighbor) regression
+* @author Frank Prüfer
+* @date 08/29/2013
+
+*/
+
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <vector>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "core/basics/Config.h"
+#include "core/vector/VectorT.h"
+#include "core/vector/VVector.h"
+
+#include "vislearning/baselib/ICETools.h"
+
+#include "vislearning/regression/npregression/RegKNN.h"
+
+using namespace OBJREC;
+using namespace NICE;
+using namespace std;
+
/**
* @brief Split one line of a CSV file into its fields.
*
* Supports double-quoted fields: delimiters inside quotes are literal,
* and an escaped quote ("") inside a quoted field resolves to a single
* double quote. Parsing stops at an embedded \r or \n.
*
* @param record    output — the parsed fields (cleared first)
* @param line      raw CSV line
* @param delimiter field separator, e.g. ','
*/
void csvline_populate ( vector<string> &record,
                       const string& line,
                       char delimiter )
{
  size_t linepos = 0;                 // was int — avoid signed/unsigned mix
  bool inquotes = false;              // was int — use a real bool
  const size_t linemax = line.length();
  string curstring;
  record.clear();

  // Check the bound BEFORE indexing: the original tested line[linepos]
  // first, indexing one-past-the-end on the final iteration (UB pre-C++11).
  while ( linepos < linemax && line[linepos] != 0 )
  {
    const char c = line[linepos];

    if (!inquotes && curstring.length()==0 && c=='"')
    {
      // opening quote of a quoted field
      inquotes = true;
    }
    else if (inquotes && c=='"')
    {
      if ( (linepos+1 < linemax) && (line[linepos+1]=='"') )
      {
        // two double quotes in a row resolve to one literal double quote
        curstring.push_back(c);
        linepos++;
      }
      else
      {
        // closing quote of the field
        inquotes = false;
      }
    }
    else if (!inquotes && c==delimiter)
    {
      // end of field
      record.push_back( curstring );
      curstring = "";
    }
    else if (!inquotes && (c=='\r' || c=='\n') )
    {
      // line terminator: emit the final field and stop
      record.push_back( curstring );
      return;
    }
    else
    {
      curstring.push_back(c);
    }
    linepos++;
  }

  record.push_back( curstring );
}
+
+void loadData( NICE::VVector &Data,
+               NICE::Vector &y,
+               const string &path,
+               const string &xdat,
+               const string &ydat )
+{
+
+  vector<string> row;
+  string line;
+
+  cerr<<"Preloading Data...";
+  ifstream in( (path+xdat).c_str() );
+  if ( in.fail() )
+  {
+    cout << "File not found" <<endl;
+    exit(EXIT_FAILURE);
+  }
+
+  int numData = 0;
+
+  while ( getline(in, line)  && in.good() )
+  {
+    csvline_populate(row, line, ',');
+    vector<double> vec;
+    for (int i = 0; i < (int)row.size(); i++)
+    {
+      double dval = 0.0;
+      dval = atof(row[i].data() );
+      vec.push_back(dval);
+    }
+    NICE::Vector nvec(vec);
+    Data.push_back(nvec);
+    numData++;
+  }
+  in.close();
+
+  cerr<<"Finished."<<endl<<"Starting to get preloaded Labels...";
+
+  in.open( (path+ydat).c_str() );
+  if ( in.fail() )
+  {
+    cout << "File not found! Setting default value 0.0..." <<endl;
+    y.resize(numData);
+    y.set(0.0);
+  }
+  else
+  {
+    y.resize(numData);
+    int count = 0;
+    while(getline(in, line)  && in.good() )
+    {
+      csvline_populate(row, line, ',');
+      for ( int i = 0; i < (int)row.size(); i++ )
+      {
+        double dval = 0.0;
+        dval = atof(row[i].data() );
+        y.set(count,dval);
+        count++;
+      }
+    }
+    in.close();
+  }
+
+  cerr<<"Finished."<<endl;
+}
+
+void testFrame (  Config conf,
+		  NICE::VVector &xdata,
+		  NICE::Vector &y )
+{
+  cerr<<"\nStarting test framework..."<<endl;
+  
+  /*------------Initialize Variables-----------*/
+  ofstream storeEvalData;
+  double trainRatio = conf.gD( "debug", "training_ratio", .9 );
+  
+  int trainingSize = (int)(trainRatio*xdata.size());
+  int testingSize = xdata.size() - trainingSize;
+  
+  vector<int> indices;
+  for ( int i = 0; i < (int)xdata.size(); i++ )
+    indices.push_back(i);
+  
+  int nfolds = conf.gI( "debug", "nfolds", 10 );
+  Vector mef_v ( nfolds );
+  Vector corr_v ( nfolds );
+  Vector resub_v ( nfolds );
+  Vector diff_v ( nfolds );
+
+  bool saveConfig = conf.gB( "debug", "save_config", false );
+  
+  /*------------Store Configuration------------*/
+  string filename = conf.gS( "debug", "filename" );
+  
+  if ( saveConfig )
+  {
+    cout << "Configuration will be stored in: " << filename << "_config" << endl;
+    
+    storeEvalData.open ( (filename+"_config").c_str() );
+
+    storeEvalData.close();
+  } else
+  {
+    cout << "Configuration will not be stored." << endl;
+  }
+  
+  /*------------Setting up NPRegression--------------*/
+  for ( int k = 0; k < nfolds; k++)
+  {
+    string fold;
+    ostringstream convert;
+    convert << k;
+    fold = convert.str();
+    
+    cout << "\nFOLD " << k << ":\n======" << endl;
+    
+
+    cerr << "Initializing NPRegression...";
+    RegKNN *knn = new RegKNN (&conf, NULL);
+    cerr << "Finished." << endl;
+    
+    cerr << "Teaching the NPRegression algorithm...";
+    NICE::VVector trainData, testData;
+    NICE::Vector trainVals ( trainingSize );
+    NICE::Vector testVals ( testingSize );
+    random_shuffle( indices.begin(), indices.end() );
+    for ( int i = 0; i < trainingSize; i++ )
+    {
+      trainData.push_back ( xdata[ indices[i] ] );
+      trainVals.set( i, y[ indices[i] ] );
+    }
+    for ( int j = 0; j < testingSize; j++ )
+    {
+      testData.push_back ( xdata[ indices[j+trainingSize] ] );
+      testVals.set( j, y[ indices[j+trainingSize] ] );
+    }
+    
+    knn->teach ( trainData, trainVals );
+    cerr << "Finished." << endl;
+    
+    /*-------------Testing RDF-GP--------------*/
+
+    cerr << "\nGetting prediction values for all data points...";
+    NICE::Vector predictionValues( testingSize );
+    predictionValues.set ( 0.0 );
+    for ( int j = 0; j < testingSize; j++ )
+    {
+      predictionValues[j] = knn->predict( testData[j] );
+    }
+    cerr << "Finished." << endl;
+    
+    /*---------------Evaluation----------------*/
+    NICE::Vector diff = testVals - predictionValues;
+    
+    double mod_var = diff.StdDev()*diff.StdDev();
+    double tar_var = testVals.StdDev()*testVals.StdDev();
+    mef_v.set( k, (1-mod_var/tar_var) );
+    
+    NICE::Vector meanv( predictionValues.size() );
+    meanv.set( diff.Mean() );
+    NICE::Vector lhs = diff - meanv;
+    meanv.set( testVals.Mean() );
+    NICE::Vector rhs = testVals - meanv;
+    lhs *= rhs;
+    double corr = lhs.Mean() / sqrt( diff.StdDev()*diff.StdDev()*testVals.StdDev()*testVals.StdDev() );
+    corr_v.set( k, corr );
+    
+    diff *= diff;
+    diff_v.set( k, diff.Mean());
+    resub_v.set( k, (diff.Mean() / tar_var) );
+  }
+  
+  /*------------------Output-------------------*/
+  cout << "\nSimple Cross Validation Stats:\n==============================" << endl;
+  cout << "  Modelling Efficiency: " << mef_v.Mean() << endl;
+  cout << "  Correlation: " << corr_v.Mean() << endl;
+  cout << "  Mean Square Error: " << diff_v.Mean() << endl;
+  cout << "  Standardized MSE: " << resub_v.Mean() << endl;
+}
+
+
/**
* @brief Program entry point: builds the config from the command line,
*        loads <dataset>_x.csv / <dataset>_y.csv from <path> (config keys
*        "debug::path", "debug::dataset") and runs the cross-validation
*        test frame.
*/
int main (int argc, char **argv) {

  Config conf ( argc, argv );   //get config from user input
  
  string path = conf.gS( "debug", "path", "." );
  string dataset = conf.gS( "debug", "dataset", "flux" );

  NICE::VVector xdata;
  NICE::Vector y;

  loadData(xdata, y, path, (dataset+"_x.csv"), (dataset+"_y.csv") ); //load all data
  
  testFrame( conf, xdata, y );

  return 0;
}
+
+
+