- /**
- * @file testRegressionRDF.cpp
- * @brief Test of a random decision forest (RDF) with an arbitrary leaf regression method
- * @author Sven Sickert
- * @date 07/02/2013
- */
- #include <algorithm>
- #include <cassert>
- #include <cmath>
- #include <cstdlib>
- #include <fstream>
- #include <iostream>
- #include <sstream>
- #include <string>
- #include <vector>
- #include "core/basics/Config.h"
- #include "core/vector/VectorT.h"
- #include "core/vector/VVector.h"
- #include "vislearning/baselib/ICETools.h"
- #include "vislearning/regression/regcombination/RegPreRandomForests.h"
- #include "vislearning/regression/gpregression/RegGaussianProcess.h"
- #include "vislearning/regression/linregression/LinRegression.h"
- #include "vislearning/regression/npregression/RegKNN.h"
- #include "vislearning/math/kernels/KernelExp.h"
- using namespace OBJREC;
- using namespace NICE;
- using namespace std;
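- /**
- * Split a single CSV line into fields and append them to 'record'.
- * Double-quoted fields are supported; an escaped quote ("" inside a
- * quoted field) resolves to a single double quote.
- */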
- void csvline_populate ( vector<string> &record,
- const string& line,
- char delimiter )
- {
- int linepos = 0;
- bool inquotes = false;
- char c;
- int linemax = line.length();
- string curstring;
- record.clear();
- while ( linepos < linemax && line[linepos] != 0 )
- {
- c = line[linepos];
- if (!inquotes && curstring.length()==0 && c=='"')
- {
- //beginquotechar
- inquotes=true;
- }
- else if (inquotes && c=='"')
- {
- //quotechar
- if ( (linepos+1 <linemax) && (line[linepos+1]=='"') )
- {
- //encountered 2 double quotes in a row (resolves to 1 double quote)
- curstring.push_back(c);
- linepos++;
- }
- else
- {
- //endquotechar
- inquotes=false;
- }
- }
- else if (!inquotes && c==delimiter)
- {
- //end of field
- record.push_back( curstring );
- curstring="";
- }
- else if (!inquotes && (c=='\r' || c=='\n') )
- {
- record.push_back( curstring );
- return;
- }
- else
- {
- curstring.push_back(c);
- }
- linepos++;
- }
-
- record.push_back( curstring );
- }
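- /**
- * Load feature vectors from the CSV file <path+xdat> and target values
- * from <path+ydat>. If the label file cannot be opened, all targets are
- * set to the default value 0.0.
- */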
- void loadData( NICE::VVector &Data,
- NICE::Vector &y,
- const string &path,
- const string &xdat,
- const string &ydat )
- {
- vector<string> row;
- string line;
- cerr<<"Preloading Data...";
- ifstream in( (path+xdat).c_str() );
- if ( in.fail() )
- {
- cout << "File not found" <<endl;
- exit(EXIT_FAILURE);
- }
- int numData = 0;
- while ( getline(in, line) && in.good() )
- {
- csvline_populate(row, line, ',');
- vector<double> vec;
- for (int i = 0; i < (int)row.size(); i++)
- {
- double dval = atof( row[i].c_str() );
- vec.push_back( dval );
- }
- NICE::Vector nvec(vec);
- Data.push_back(nvec);
- numData++;
- }
- in.close();
- cerr<<"Finished."<<endl<<"Starting to get preloaded Labels...";
- in.open( (path+ydat).c_str() );
- if ( in.fail() )
- {
- cout << "File not found! Setting default value 0.0..." <<endl;
- y.resize(numData);
- y.set(0.0);
- }
- else
- {
- y.resize(numData);
- int count = 0;
- while(getline(in, line) && in.good() )
- {
- csvline_populate(row, line, ',');
- for ( int i = 0; i < (int)row.size(); i++ )
- {
- double dval = atof( row[i].c_str() );
- if ( count < numData ) // guard against surplus labels
- y.set( count, dval );
- count++;
- }
- }
- in.close();
- }
- cerr<<"Finished."<<endl;
- }
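- /**
- * Run a repeated random-split evaluation of the pre-random-forest
- * regression with the configured leaf regression method and report
- * modelling efficiency, correlation, mean square error and
- * standardized MSE averaged over all folds.
- */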
- void testFrame ( Config &confRDF,
- NICE::VVector &xdata,
- NICE::Vector &y )
- {
- cerr<<"\nStarting test framework..."<<endl;
-
- /*------------Initialize Variables-----------*/
- ofstream storeEvalData;
- double trainRatio = confRDF.gD( "debug", "training_ratio", .9 );
-
- int trainingSize = (int)(trainRatio*xdata.size());
- int testingSize = xdata.size() - trainingSize;
-
- vector<int> indices;
- for ( int i = 0; i < (int)xdata.size(); i++ )
- indices.push_back(i);
-
- int nfolds = confRDF.gI( "debug", "nfolds", 10 );
- Vector mef_v ( nfolds );
- Vector corr_v ( nfolds );
- Vector resub_v ( nfolds );
- Vector diff_v ( nfolds );
- bool saveForest = confRDF.gB( "debug", "save_forest", false );
- string leafReg = confRDF.gS( "PreRandomForest", "leaf_regression", "GaussProcess" );
-
- KernelExp *kernel_template = new KernelExp ( confRDF.gD("Kernel", "log_rbf_gamma", -2.5), 0.0 );
-
- /*------------Store Configuration------------*/
- string filename = confRDF.gS( "debug", "filename" );
-
- if ( saveForest )
- {
- cout << "Configuration will be stored in: " << filename << "_config" << endl;
-
- storeEvalData.open ( (filename+"_config").c_str() );
- storeEvalData << "random_split_tests=" << confRDF.gI ( "RTBRandom", "random_split_tests" ) << endl;
- storeEvalData << "random_features=" << confRDF.gI ( "RTBRandom", "random_features" ) << endl;
- storeEvalData << "max_depth=" << confRDF.gI ( "RTBRandom", "max_depth" ) << endl;
- storeEvalData << "random_split_mode=" << confRDF.gS ( "RTBRandom", "random_split_mode" ) << endl;
- storeEvalData << "min_examples=" << confRDF.gI ( "RTBRandom", "min_examples" ) << endl;
- storeEvalData << "number_of_trees=" << confRDF.gI ( "RandomForest", "number_of_trees" ) << endl;
- storeEvalData << "features_per_tree=" << confRDF.gD ( "RandomForest", "features_per_tree" ) << endl;
- storeEvalData << "samples_per_tree=" << confRDF.gD ( "RandomForest", "samples_per_tree" ) << endl;
- storeEvalData << "builder=" << confRDF.gS ( "RandomForest", "builder" ) << endl;
- storeEvalData << "minimum_error_reduction=" << confRDF.gD ( "RandomForest", "minimum_error_reduction" ) << endl;
- storeEvalData << "log_rbf_gamma=" << confRDF.gD ( "Kernel", "log_rbf_gamma" ) << endl;
- storeEvalData.close();
- } else
- {
- cout << "Configuration will not be stored." << endl;
- }
-
- /*------------Setting up PreRDF--------------*/
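- // Note: each "fold" draws a fresh random training/test split of size
- // trainRatio / (1 - trainRatio), i.e. repeated random subsampling
- // rather than a disjoint k-fold partition.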
- for ( int k = 0; k < nfolds; k++)
- {
- string fold;
- ostringstream convert;
- convert << k;
- fold = convert.str();
-
- cout << "\nFOLD " << k << ":\n======" << endl;
-
- cerr << "Initializing leaf regression method " << leafReg << "...";
- RegressionAlgorithm *leafRegression = NULL;
- Kernel *kernel_function = NULL;
- if ( leafReg == "GaussProcess" )
- {
- kernel_function = new KernelExp ( *(kernel_template) );
- leafRegression = new RegGaussianProcess( &confRDF, kernel_function, "GPRegression" );
- }
- else if ( leafReg == "Linear" )
- leafRegression = new LinRegression ();
- else if ( leafReg == "KNN" )
- leafRegression = new RegKNN ( &confRDF, NULL);
- else if ( leafReg == "none" ) {
- cerr << "\ntestRegressionRDFGP::testFrame: No leaf regression method set! Using RandomForest prediction..." << endl;
- } else {
- cerr << "\ntestRegressionRDFGP::testFrame: No valid leaf regression method set! Aborting..." << endl;
- exit(-1);
- }
- cerr << "Finished." << endl;
- cerr << "Initializing PreRDF for regression...";
- RegPreRandomForests *prf = new RegPreRandomForests ( &confRDF, "PreRandomForest", leafRegression );
- cerr << "Finished." << endl;
-
- cerr << "Teaching the PreRDF for regression...";
- NICE::VVector trainData, testData;
- NICE::Vector trainVals ( trainingSize );
- NICE::Vector testVals ( testingSize );
- random_shuffle( indices.begin(), indices.end() );
- for ( int i = 0; i < trainingSize; i++ )
- {
- trainData.push_back ( xdata[ indices[i] ] );
- trainVals.set( i, y[ indices[i] ] );
- }
- for ( int j = 0; j < testingSize; j++ )
- {
- testData.push_back ( xdata[ indices[j+trainingSize] ] );
- testVals.set( j, y[ indices[j+trainingSize] ] );
- }
-
- prf->teach ( trainData, trainVals );
- cerr << "Finished." << endl;
-
- /*-------------Testing RDF-GP--------------*/
- cerr << "\nGetting prediction values for all data points...";
- NICE::Vector predictionValues( testingSize );
- predictionValues.set ( 0.0 );
- for ( int j = 0; j < testingSize; j++ )
- {
- predictionValues[j] = prf->predict( testData[j] );
- }
- cerr << "Finished." << endl;
-
- /*---------------Evaluation----------------*/
- NICE::Vector diff = testVals - predictionValues;
-
- double mod_var = diff.StdDev()*diff.StdDev(); // variance of the residuals
- double tar_var = testVals.StdDev()*testVals.StdDev(); // variance of the targets
- mef_v.set( k, (1.0 - mod_var/tar_var) ); // modelling efficiency
-
- // Pearson correlation between predicted and observed values
- NICE::Vector meanv( predictionValues.size() );
- meanv.set( predictionValues.Mean() );
- NICE::Vector lhs = predictionValues - meanv;
- meanv.set( testVals.Mean() );
- NICE::Vector rhs = testVals - meanv;
- lhs *= rhs;
- double corr = lhs.Mean() / ( predictionValues.StdDev() * testVals.StdDev() );
- corr_v.set( k, corr );
-
- diff *= diff;
- diff_v.set( k, diff.Mean() ); // mean square error
- resub_v.set( k, diff.Mean() / tar_var ); // standardized MSE
-
- if (kernel_function != NULL)
- delete kernel_function;
- }
-
- /*------------------Output-------------------*/
- cout << "\nSimple Cross Validation Stats:\n==============================" << endl;
- cout << " Modelling Efficiency: " << mef_v.Mean() << endl;
- cout << " Correlation: " << corr_v.Mean() << endl;
- cout << " Mean Square Error: " << diff_v.Mean() << endl;
- cout << " Standardized MSE: " << resub_v.Mean() << endl;
-
- /*-----------------Cleaning------------------*/
- delete kernel_template;
- }
- int main (int argc, char **argv) {
- Config conf ( argc, argv ); //Config for RFGP
-
- string path = conf.gS( "debug", "path", "." );
- string dataset = conf.gS( "debug", "dataset", "flux" );
- NICE::VVector xdata;
- NICE::Vector y;
- loadData(xdata, y, path, (dataset+"_x.csv"), (dataset+"_y.csv") ); //load all data
-
- testFrame( conf, xdata, y );
- return 0;
- }
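- 
- /*
- Illustrative configuration sketch: these are the keys read by this test.
- The section layout assumes the INI-style format of NICE::Config and all
- values below are placeholders, not recommended settings.
- 
- [debug]
- path = ./data/
- dataset = flux
- training_ratio = 0.9
- nfolds = 10
- save_forest = false
- filename = rdf_eval
- 
- [PreRandomForest]
- leaf_regression = GaussProcess
- 
- [RandomForest]
- number_of_trees = 50
- features_per_tree = 1.0
- samples_per_tree = 1.0
- builder = random
- minimum_error_reduction = 0.0
- 
- [RTBRandom]
- random_split_tests = 10
- random_features = 5
- max_depth = 10
- random_split_mode = uniform
- min_examples = 5
- 
- [Kernel]
- log_rbf_gamma = -2.5
- */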