/** * @file testLinRegression.cpp * @brief test of linear regression * @author Frank Prüfer * @date 08/13/2013 */ #include <sstream> #include <iostream> #include <fstream> #include <sstream> #include <string> #include <vector> #include <stdlib.h> #include <assert.h> #include "core/basics/Config.h" #include "core/vector/VectorT.h" #include "core/vector/VVector.h" #include "vislearning/baselib/ICETools.h" #include "vislearning/regression/linregression/LinRegression.h" using namespace OBJREC; using namespace NICE; using namespace std; void csvline_populate ( vector<string> &record, const string& line, char delimiter ) { int linepos=0; int inquotes=false; char c; int linemax=line.length(); string curstring; record.clear(); while(line[linepos]!=0 && linepos < linemax) { c = line[linepos]; if (!inquotes && curstring.length()==0 && c=='"') { //beginquotechar inquotes=true; } else if (inquotes && c=='"') { //quotechar if ( (linepos+1 <linemax) && (line[linepos+1]=='"') ) { //encountered 2 double quotes in a row (resolves to 1 double quote) curstring.push_back(c); linepos++; } else { //endquotechar inquotes=false; } } else if (!inquotes && c==delimiter) { //end of field record.push_back( curstring ); curstring=""; } else if (!inquotes && (c=='\r' || c=='\n') ) { record.push_back( curstring ); return; } else { curstring.push_back(c); } linepos++; } record.push_back( curstring ); } void loadData( NICE::VVector &Data, NICE::Vector &y, const string &path, const string &xdat, const string &ydat ) { vector<string> row; string line; cerr<<"Preloading Data..."; ifstream in( (path+xdat).c_str() ); if ( in.fail() ) { cout << "File not found" <<endl; exit(EXIT_FAILURE); } int numData = 0; while ( getline(in, line) && in.good() ) { csvline_populate(row, line, ','); vector<double> vec; for (int i = 0; i < (int)row.size(); i++) { double dval = 0.0; dval = atof(row[i].data() ); vec.push_back(dval); } NICE::Vector nvec(vec); Data.push_back(nvec); numData++; } in.close(); cerr<<"Finished."<<endl<<"Starting to get preloaded Labels..."; in.open( (path+ydat).c_str() ); if ( in.fail() ) { cout << "File not found! Setting default value 0.0..." <<endl; y.resize(numData); y.set(0.0); } else { y.resize(numData); int count = 0; while(getline(in, line) && in.good() ) { csvline_populate(row, line, ','); for ( int i = 0; i < (int)row.size(); i++ ) { double dval = 0.0; dval = atof(row[i].data() ); y.set(count,dval); count++; } } in.close(); } cerr<<"Finished."<<endl; } void testFrame ( Config conf, NICE::VVector &xdata, NICE::Vector &y ) { cerr<<"\nStarting test framework..."<<endl; /*------------Initialize Variables-----------*/ ofstream storeEvalData; double trainRatio = conf.gD( "debug", "training_ratio", .9 ); int trainingSize = (int)(trainRatio*xdata.size()); int testingSize = xdata.size() - trainingSize; vector<int> indices; for ( int i = 0; i < (int)xdata.size(); i++ ) indices.push_back(i); int nfolds = conf.gI( "debug", "nfolds", 10 ); Vector mef_v ( nfolds ); Vector corr_v ( nfolds ); Vector resub_v ( nfolds ); Vector diff_v ( nfolds ); bool saveConfig = conf.gB( "debug", "save_config", false ); /*------------Store Configuration------------*/ string filename = conf.gS( "debug", "filename" ); if ( saveConfig ) { cout << "Configuration will be stored in: " << filename << "_config" << endl; storeEvalData.open ( (filename+"_config").c_str() ); storeEvalData.close(); } else { cout << "Configuration will not be stored." << endl; } /*------------Setting up PreRDF--------------*/ for ( int k = 0; k < nfolds; k++) { string fold; ostringstream convert; convert << k; fold = convert.str(); cout << "\nFOLD " << k << ":\n======" << endl; cerr << "Initializing LinRegression..."; LinRegression *linReg = new LinRegression (); cerr << "Finished." << endl; cerr << "Teaching the LinRegression algorithm..."; NICE::VVector trainData, testData; NICE::Vector trainVals ( trainingSize ); NICE::Vector testVals ( testingSize ); random_shuffle( indices.begin(), indices.end() ); for ( int i = 0; i < trainingSize; i++ ) { trainData.push_back ( xdata[ indices[i] ] ); trainVals.set( i, y[ indices[i] ] ); } for ( int j = 0; j < testingSize; j++ ) { testData.push_back ( xdata[ indices[j+trainingSize] ] ); testVals.set( j, y[ indices[j+trainingSize] ] ); } linReg->teach ( trainData, trainVals ); cerr << "Finished." << endl; /*-------------Testing RDF-GP--------------*/ cerr << "\nGetting prediction values for all data points..."; NICE::Vector predictionValues( testingSize ); predictionValues.set ( 0.0 ); for ( int j = 0; j < testingSize; j++ ) { predictionValues[j] = linReg->predict( testData[j] ); } cerr << "Finished." << endl; /*---------------Evaluation----------------*/ NICE::Vector diff = testVals - predictionValues; double mod_var = diff.StdDev()*diff.StdDev(); double tar_var = testVals.StdDev()*testVals.StdDev(); mef_v.set( k, (1-mod_var/tar_var) ); NICE::Vector meanv( predictionValues.size() ); meanv.set( diff.Mean() ); NICE::Vector lhs = diff - meanv; meanv.set( testVals.Mean() ); NICE::Vector rhs = testVals - meanv; lhs *= rhs; double corr = lhs.Mean() / sqrt( diff.StdDev()*diff.StdDev()*testVals.StdDev()*testVals.StdDev() ); corr_v.set( k, corr ); diff *= diff; diff_v.set( k, diff.Mean()); resub_v.set( k, (diff.Mean() / tar_var) ); } /*------------------Output-------------------*/ cout << "\nSimple Cross Validation Stats:\n==============================" << endl; cout << " Modelling Efficiency: " << mef_v.Mean() << endl; cout << " Correlation: " << corr_v.Mean() << endl; cout << " Mean Square Error: " << diff_v.Mean() << endl; cout << " Standardized MSE: " << resub_v.Mean() << endl; } int main (int argc, char **argv) { Config conf ( argc, argv ); //get config from user input string path = conf.gS( "debug", "path", "." ); string dataset = conf.gS( "debug", "dataset", "flux" ); NICE::VVector xdata; NICE::Vector y; loadData(xdata, y, path, (dataset+"_x.csv"), (dataset+"_y.csv") ); //load all data testFrame( conf, xdata, y ); return 0; }