/**
 * @file testRegressionRDF.cpp
 * @brief test of RDF with arbitrary leaf regression method
 * @author Sven Sickert
 * @date 07/02/2013
 */

#include <cstdlib>
#include <cmath>
#include <ctime>
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <algorithm>

#include "core/basics/Config.h"
#include "core/vector/VectorT.h"
#include "core/vector/VVector.h"

#include "vislearning/baselib/ICETools.h"

#include "vislearning/regression/regcombination/RegPreRandomForests.h"
#include "vislearning/regression/gpregression/RegGaussianProcess.h"
#include "vislearning/regression/linregression/LinRegression.h"
#include "vislearning/regression/npregression/RegKNN.h"

#include "vislearning/math/kernels/KernelExp.h"

using namespace OBJREC;
using namespace NICE;
using namespace std;

void csvline_populate ( vector<string> &record,
                        const string &line,
                        char delimiter )
{
  int linepos = 0;
  bool inquotes = false;
  char c;
  int linemax = line.length();
  string curstring;
  record.clear();

  while ( line[linepos] != 0 && linepos < linemax )
  {
    c = line[linepos];

    if ( !inquotes && curstring.length() == 0 && c == '"' ) // beginquotechar
    {
      inquotes = true;
    }
    else if ( inquotes && c == '"' ) // quotechar
    {
      if ( ( linepos+1 < linemax ) && ( line[linepos+1] == '"' ) )
      {
        // two double quotes in a row resolve to one double quote
        curstring.push_back( c );
        linepos++;
      }
      else // endquotechar
      {
        inquotes = false;
      }
    }
    else if ( !inquotes && c == delimiter ) // end of field
    {
      record.push_back( curstring );
      curstring = "";
    }
    else if ( !inquotes && ( c == '\r' || c == '\n' ) ) // end of line
    {
      record.push_back( curstring );
      return;
    }
    else
    {
      curstring.push_back( c );
    }
    linepos++;
  }

  record.push_back( curstring );
}

void loadData ( NICE::VVector &Data,
                NICE::Vector &y,
                const string &path,
                const string &xdat,
                const string &ydat )
{
  vector<string> row;
  string line;

  cerr << "Preloading Data...";
  ifstream in ( (path+xdat).c_str() );
  if ( in.fail() )
  {
    cout << "File not found" << endl;
    exit ( EXIT_FAILURE );
  }

  int numData = 0;
  while ( getline ( in, line ) && in.good() )
  {
    csvline_populate ( row, line, ',' );
    vector<double> vec;
    for ( int i = 0; i < (int)row.size(); i++ )
    {
      double dval = 0.0;
      dval = atof ( row[i].data() );
      vec.push_back ( dval );
    }
    NICE::Vector nvec ( vec );
    Data.push_back ( nvec );
    numData++;
  }
  in.close();
  cerr << "Finished." << endl;

  cerr << "Preloading Labels...";
  ifstream ind ( (path+ydat).c_str() );
  if ( ind.fail() )
  {
    cout << "File not found" << endl;
    exit ( EXIT_FAILURE );
  }

  y.resize ( numData );
  int count = 0;
  while ( getline ( ind, line ) && ind.good() )
  {
    y.set ( count, atof ( line.c_str() ) );
    count++;
  }
  ind.close();
  cerr << "Finished." << endl;
}

void testFrame ( Config confRDF,
                 NICE::VVector &xdata,
                 NICE::Vector &y )
{
  cerr << "\nStarting test framework..." << endl;

  /*------------Initialize Variables-----------*/
  ofstream storeEvalData;

  // train/test split of the shuffled data (config key and default value assumed)
  double trainRatio = confRDF.gD ( "debug", "training_ratio", .9 );
  int trainingSize = (int) ( trainRatio * xdata.size() );
  int testingSize = xdata.size() - trainingSize;

  vector<int> indices;
  for ( int i = 0; i < (int)xdata.size(); i++ )
    indices.push_back ( i );

  int nfolds = confRDF.gI ( "debug", "nfolds", 10 );
  Vector mef_v ( nfolds );
  Vector corr_v ( nfolds );
  Vector resub_v ( nfolds );
  Vector diff_v ( nfolds );

  bool saveForest = confRDF.gB ( "debug", "save_forest", false );
  string leafReg = confRDF.gS ( "PreRandomForest", "leaf_regression", "gp" );

  KernelExp *kernel_template = new KernelExp ( confRDF.gD ( "Kernel", "log_rbf_gamma", -2.5 ), 0.0 );

  /*------------Store Configuration------------*/
  string filename = confRDF.gS ( "debug", "filename" );

  if ( saveForest )
  {
    cout << "Configuration will be stored in: " << filename << "_config" << endl;

    storeEvalData.open ( (filename+"_config").c_str() );
    storeEvalData << "random_split_tests=" << confRDF.gI ( "RTBRandom", "random_split_tests" ) << endl;
    storeEvalData << "random_features=" << confRDF.gI ( "RTBRandom", "random_features" ) << endl;
    storeEvalData << "max_depth=" << confRDF.gI ( "RTBRandom", "max_depth" ) << endl;
    storeEvalData << "random_split_mode=" << confRDF.gS ( "RTBRandom", "random_split_mode" ) << endl;
    storeEvalData << "min_examples=" << confRDF.gI ( "RTBRandom", "min_examples" ) << endl;
    storeEvalData << "number_of_trees=" << confRDF.gI ( "RandomForest", "number_of_trees" ) << endl;
    storeEvalData << "features_per_tree=" << confRDF.gD ( "RandomForest", "features_per_tree" ) << endl;
    storeEvalData << "samples_per_tree=" << confRDF.gD ( "RandomForest", "samples_per_tree" ) << endl;
    storeEvalData << "builder=" << confRDF.gS ( "RandomForest", "builder" ) << endl;
    storeEvalData << "minimum_error_reduction=" << confRDF.gD ( "RandomForest", "minimum_error_reduction" ) << endl;
    storeEvalData << "log_rbf_gamma=" << confRDF.gD ( "Kernel", "log_rbf_gamma" ) << endl;
    storeEvalData.close();
  }
  else
  {
    cout << "Configuration will not be stored." << endl;
  }
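
  /* Illustrative NICE::Config file for this test -- a sketch only, covering the
   * sections and keys read above; the concrete values are assumptions and not
   * part of the original code:
   *
   *   [debug]
   *   nfolds = 10
   *   save_forest = false
   *   filename = /tmp/rdf_eval
   *
   *   [PreRandomForest]
   *   leaf_regression = GaussProcess   (alternatives: Linear, KNN, none)
   *
   *   [RandomForest]
   *   number_of_trees = 20
   *   samples_per_tree = 0.2
   *
   *   [Kernel]
   *   log_rbf_gamma = -2.5
   */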

  /*------------Setting up PreRDF--------------*/
  for ( int k = 0; k < nfolds; k++ )
  {
    string fold;
    ostringstream convert;
    convert << k;
    fold = convert.str();

    cout << "\nFOLD " << k << ":\n======" << endl;

    cerr << "Initializing leaf regression method " << leafReg << "...";
    RegressionAlgorithm *leafRegression = NULL;
    Kernel *kernel_function = NULL;
    if ( leafReg == "GaussProcess" )
    {
      kernel_function = new KernelExp ( *(kernel_template) );
      leafRegression = new RegGaussianProcess ( &confRDF, kernel_function, "GPRegression" );
    }
    else if ( leafReg == "Linear" )
      leafRegression = new LinRegression ();
    else if ( leafReg == "KNN" )
      leafRegression = new RegKNN ( &confRDF, NULL );
    else if ( leafReg == "none" )
    {
      cerr << "\ntestRegressionRDFGP::testFrame: No leaf regression method set! Using RandomForest prediction..." << endl;
    }
    else
    {
      cerr << "\ntestRegressionRDFGP::testFrame: No valid leaf regression method set! Aborting..." << endl;
      exit ( -1 );
    }
    cerr << "Finished." << endl;

    cerr << "Initializing PreRDF for regression...";
    RegPreRandomForests *prf = new RegPreRandomForests ( &confRDF, "PreRandomForest", leafRegression );
    cerr << "Finished." << endl;

    cerr << "Teaching the PreRDF for regression...";
    NICE::VVector trainData, testData;
    NICE::Vector trainVals ( trainingSize );
    NICE::Vector testVals ( testingSize );
    random_shuffle ( indices.begin(), indices.end() );
    for ( int i = 0; i < trainingSize; i++ )
    {
      trainData.push_back ( xdata[ indices[i] ] );
      trainVals.set ( i, y[ indices[i] ] );
    }
    for ( int j = 0; j < testingSize; j++ )
    {
      testData.push_back ( xdata[ indices[j+trainingSize] ] );
      testVals.set ( j, y[ indices[j+trainingSize] ] );
    }

    prf->teach ( trainData, trainVals );
    cerr << "Finished." << endl;

    /*-------------Testing RDF-GP--------------*/
    cerr << "\nGetting prediction values for all data points...";
    NICE::Vector predictionValues ( testingSize );
    predictionValues.set ( 0.0 );
    for ( int j = 0; j < testingSize; j++ )
    {
      predictionValues[j] = prf->predict ( testData[j] );
    }
    cerr << "Finished." << endl;

    /*---------------Evaluation----------------*/
    NICE::Vector diff = testVals - predictionValues;

    double mod_var = diff.StdDev()*diff.StdDev();
    double tar_var = testVals.StdDev()*testVals.StdDev();
    mef_v.set ( k, ( 1 - mod_var/tar_var ) );

    NICE::Vector meanv ( predictionValues.size() );
    meanv.set ( diff.Mean() );
    NICE::Vector lhs = diff - meanv;
    meanv.set ( testVals.Mean() );
    NICE::Vector rhs = testVals - meanv;
    lhs *= rhs;
    double corr = lhs.Mean() / sqrt ( diff.StdDev()*diff.StdDev()*testVals.StdDev()*testVals.StdDev() );
    corr_v.set ( k, corr );

    diff *= diff;
    diff_v.set ( k, diff.Mean() );
    resub_v.set ( k, ( diff.Mean() / tar_var ) );

    if ( kernel_function != NULL )
      delete kernel_function;
  }

  /*------------------Output-------------------*/
  cout << "\nSimple Cross Validation Stats:\n==============================" << endl;
  cout << "  Modelling Efficiency: " << mef_v.Mean() << endl;
  cout << "  Correlation:          " << corr_v.Mean() << endl;
  cout << "  Mean Square Error:    " << diff_v.Mean() << endl;
  cout << "  Standardized MSE:     " << resub_v.Mean() << endl;

  /*-----------------Cleaning------------------*/
  delete kernel_template;
}

int main ( int argc, char **argv )
{
  Config conf ( argc, argv ); // Config for RFGP

  string path = conf.gS ( "debug", "path", "." );
  string dataset = conf.gS ( "debug", "dataset", "flux" );

  NICE::VVector xdata;
  NICE::Vector y;

  loadData ( xdata, y, path, ( dataset+"_x.csv" ), ( dataset+"_y.csv" ) ); // load all data
  testFrame ( conf, xdata, y );

  return 0;
}
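
/* Example input layout -- illustrative only. The file names follow the
 * "<dataset>_x.csv" / "<dataset>_y.csv" convention used in main (default
 * dataset "flux"); the numbers below are made up:
 *
 *   flux_x.csv   one comma-separated feature vector per line:
 *     0.31,1.27,4.80
 *     0.12,0.98,5.02
 *
 *   flux_y.csv   one regression target per line, same row order as flux_x.csv:
 *     2.4
 *     1.9
 *
 * All settings are taken from the NICE::Config built from the command-line
 * arguments in main (see the illustrative config sketch in testFrame above).
 */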