/** * @file testRegressionGP.cpp * @brief test of GP * @author Sven Sickert * @date 07/11/2013 */ #include <sstream> #include <iostream> #include <fstream> #include <sstream> #include <string> #include <vector> #include <stdlib.h> #include <assert.h> #include "core/basics/Config.h" #include "core/vector/VectorT.h" #include "core/vector/VVector.h" //#include "vislearning/baselib/ICETools.h" #include "vislearning/regression/gpregression/RegGaussianProcess.h" #include "vislearning/math/kernels/KernelExp.h" using namespace OBJREC; using namespace NICE; using namespace std; void csvline_populate ( vector<string> &record, const string& line, char delimiter ) { int linepos=0; int inquotes=false; char c; int linemax=line.length(); string curstring; record.clear(); while(line[linepos]!=0 && linepos < linemax) { c = line[linepos]; if (!inquotes && curstring.length()==0 && c=='"') { //beginquotechar inquotes=true; } else if (inquotes && c=='"') { //quotechar if ( (linepos+1 <linemax) && (line[linepos+1]=='"') ) { //encountered 2 double quotes in a row (resolves to 1 double quote) curstring.push_back(c); linepos++; } else { //endquotechar inquotes=false; } } else if (!inquotes && c==delimiter) { //end of field record.push_back( curstring ); curstring=""; } else if (!inquotes && (c=='\r' || c=='\n') ) { record.push_back( curstring ); return; } else { curstring.push_back(c); } linepos++; } record.push_back( curstring ); } void loadData( NICE::VVector &Data, NICE::Vector &y, const string &path, const string &xdat, const string &ydat ) { vector<string> row; string line; cerr<<"Preloading Data..."; ifstream in( (path+xdat).c_str() ); if ( in.fail() ) { cout << "File not found" <<endl; exit(EXIT_FAILURE); } int dim = 10; //TODO fixed data dimension int numData = 0; while ( getline(in, line) && in.good() ) { csvline_populate(row, line, ','); NICE::Vector vec(dim); for (int i = 0; i < (int)row.size(); i++) { double dval = 0.0; dval = atof(row[i].data() ); vec.set(i,dval); } Data.push_back(vec); numData++; } in.close(); cerr<<"Finished."<<endl<<"Starting to get preloaded Labels..."; in.open( (path+ydat).c_str() ); if ( in.fail() ) { cout << "File not found! Setting default value 0.0..." <<endl; y.resize(numData); y.set(0.0); } else { y.resize(numData); int count = 0; while(getline(in, line) && in.good() ) { csvline_populate(row, line, ','); for ( int i = 0; i < (int)row.size(); i++ ) { double dval = 0.0; dval = atof(row[i].data() ); y.set(count,dval); count++; } } in.close(); } cerr<<"Finished."<<endl; } void testFrame ( Config confRDF, NICE::VVector &xdata, NICE::Vector &y ) { cerr<<"\nStarting test framework..."<<endl; /*------------Initialize Variables-----------*/ ofstream storeEvalData; int trainingSize = (int)(.5*xdata.size()); int testingSize = xdata.size() - trainingSize; vector<int> indices; for ( int i = 0; i < (int)xdata.size(); i++ ) indices.push_back(i); int nfolds = confRDF.gI( "debug", "nfolds", 10 ); Vector mef_v ( nfolds ); Vector corr_v ( nfolds ); Vector resub_v ( nfolds ); Vector diff_v ( nfolds ); KernelExp *kernel_template = new KernelExp ( confRDF.gD("Kernel", "log_rbf_gamma", -2.5), 0.0 ); /*--------------Setting up GP----------------*/ for ( int k = 0; k < nfolds; k++) { string fold; ostringstream convert; convert << k; fold = convert.str(); cout << "\nFOLD " << k << ":\n======" << endl; cerr << "Initializing GP regression..."; Kernel *kernel_function = NULL; kernel_function = new KernelExp ( *(kernel_template) ); RegGaussianProcess *regGP = new RegGaussianProcess( &confRDF, kernel_function, "GPRegression" ); NICE::VVector trainData, testData; NICE::Vector trainVals ( trainingSize ); NICE::Vector testVals ( testingSize ); random_shuffle( indices.begin(), indices.end() ); for ( int i = 0; i < trainingSize; i++ ) { trainData.push_back ( xdata[ indices[i] ] ); trainVals.set( i, y[ indices[i] ] ); } for ( int j = 0; j < testingSize; j++ ) { testData.push_back ( xdata[ indices[j+trainingSize] ] ); testVals.set( j, y[ indices[j+trainingSize] ] ); } cerr << "Finished." << endl; cerr << "Teaching the GP regression..."; regGP->teach( trainData, trainVals ); cerr << "Finished." << endl; /*---------------Testing GP----------------*/ cerr << "\nGetting prediction values for all data points..."; NICE::Vector predictionValues( testingSize ); predictionValues.set ( 0.0 ); for ( int j = 0; j < testingSize; j++ ) { predictionValues[j] = regGP->predict( testData[j] ); } cerr << "Finished." << endl; /*---------------Evaluation----------------*/ NICE::Vector diff = testVals - predictionValues; for (int j = 0; j < testingSize; j++) cerr << testVals[j] << " " << predictionValues[j] << endl; double mod_var = diff.StdDev()*diff.StdDev(); double tar_var = testVals.StdDev()*testVals.StdDev(); mef_v.set( k, (1-mod_var/tar_var) ); NICE::Vector meanv( predictionValues.size() ); meanv.set( diff.Mean() ); NICE::Vector lhs = diff - meanv; meanv.set( testVals.Mean() ); NICE::Vector rhs = testVals - meanv; lhs *= rhs; double corr = lhs.Mean() / sqrt( diff.StdDev()*diff.StdDev()*testVals.StdDev()*testVals.StdDev() ); corr_v.set( k, corr ); diff *= diff; diff_v.set( k, diff.Mean()); resub_v.set( k, (diff.Mean() / tar_var) ); } /*------------------Output-------------------*/ cout << "\nSimple Cross Validation Stats:\n==============================" << endl; cout << " Modelling Efficiency: " << mef_v.Mean() << endl; cout << " Correlation: " << corr_v.Mean() << endl; cout << " Mean Square Error: " << diff_v.Mean() << endl; cout << " Standardized MSE: " << resub_v.Mean() << endl; } int main (int argc, char **argv) { string path = "/home/sickert/data/cosre-MPI/regression-fluxcom/_DATA/"; Config confRDF(path+"config.conf"); //Config for RDF NICE::VVector xdata; NICE::Vector y; /*----------Load dataset---------*/ loadData(xdata, y, path, "flux_x.csv", "flux_y.csv"); //load all data testFrame( confRDF, xdata, y ); return 0; }