/**
 * @file testRegressionRDF.cpp
 * @brief test of RDF with arbitrary leaf regression method
 * @author Sven Sickert
 * @date 07/02/2013
 */
#include <assert.h>
#include <stdlib.h>

#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include "core/basics/Config.h"
#include "core/vector/VVector.h"
#include "core/vector/VectorT.h"

#include "vislearning/baselib/ICETools.h"
#include "vislearning/math/kernels/KernelExp.h"
#include "vislearning/regression/gpregression/RegGaussianProcess.h"
#include "vislearning/regression/linregression/LinRegression.h"
#include "vislearning/regression/npregression/RegKNN.h"
#include "vislearning/regression/regcombination/RegPreRandomForests.h"

using namespace OBJREC;
using namespace NICE;
using namespace std;
  27. void csvline_populate ( vector<string> &record,
  28. const string& line,
  29. char delimiter )
  30. {
  31. int linepos=0;
  32. int inquotes=false;
  33. char c;
  34. int linemax=line.length();
  35. string curstring;
  36. record.clear();
  37. while(line[linepos]!=0 && linepos < linemax)
  38. {
  39. c = line[linepos];
  40. if (!inquotes && curstring.length()==0 && c=='"')
  41. {
  42. //beginquotechar
  43. inquotes=true;
  44. }
  45. else if (inquotes && c=='"')
  46. {
  47. //quotechar
  48. if ( (linepos+1 <linemax) && (line[linepos+1]=='"') )
  49. {
  50. //encountered 2 double quotes in a row (resolves to 1 double quote)
  51. curstring.push_back(c);
  52. linepos++;
  53. }
  54. else
  55. {
  56. //endquotechar
  57. inquotes=false;
  58. }
  59. }
  60. else if (!inquotes && c==delimiter)
  61. {
  62. //end of field
  63. record.push_back( curstring );
  64. curstring="";
  65. }
  66. else if (!inquotes && (c=='\r' || c=='\n') )
  67. {
  68. record.push_back( curstring );
  69. return;
  70. }
  71. else
  72. {
  73. curstring.push_back(c);
  74. }
  75. linepos++;
  76. }
  77. record.push_back( curstring );
  78. }
  79. void loadData( NICE::VVector &Data,
  80. NICE::Vector &y,
  81. const string &path,
  82. const string &xdat,
  83. const string &ydat )
  84. {
  85. vector<string> row;
  86. string line;
  87. cerr<<"Preloading Data...";
  88. ifstream in( (path+xdat).c_str() );
  89. if ( in.fail() )
  90. {
  91. cout << "File not found" <<endl;
  92. exit(EXIT_FAILURE);
  93. }
  94. int numData = 0;
  95. while ( getline(in, line) && in.good() )
  96. {
  97. csvline_populate(row, line, ',');
  98. vector<double> vec;
  99. for (int i = 0; i < (int)row.size(); i++)
  100. {
  101. double dval = 0.0;
  102. dval = atof(row[i].data() );
  103. vec.push_back(dval);
  104. }
  105. NICE::Vector nvec(vec);
  106. Data.push_back(nvec);
  107. numData++;
  108. }
  109. in.close();
  110. cerr<<"Finished."<<endl<<"Starting to get preloaded Labels...";
  111. in.open( (path+ydat).c_str() );
  112. if ( in.fail() )
  113. {
  114. cout << "File not found! Setting default value 0.0..." <<endl;
  115. y.resize(numData);
  116. y.set(0.0);
  117. }
  118. else
  119. {
  120. y.resize(numData);
  121. int count = 0;
  122. while(getline(in, line) && in.good() )
  123. {
  124. csvline_populate(row, line, ',');
  125. for ( int i = 0; i < (int)row.size(); i++ )
  126. {
  127. double dval = 0.0;
  128. dval = atof(row[i].data() );
  129. y.set(count,dval);
  130. count++;
  131. }
  132. }
  133. in.close();
  134. }
  135. cerr<<"Finished."<<endl;
  136. }
  137. void testFrame ( Config confRDF,
  138. NICE::VVector &xdata,
  139. NICE::Vector &y )
  140. {
  141. cerr<<"\nStarting test framework..."<<endl;
  142. /*------------Initialize Variables-----------*/
  143. ofstream storeEvalData;
  144. double trainRatio = confRDF.gD( "debug", "training_ratio", .9 );
  145. int trainingSize = (int)(trainRatio*xdata.size());
  146. int testingSize = xdata.size() - trainingSize;
  147. vector<int> indices;
  148. for ( int i = 0; i < (int)xdata.size(); i++ )
  149. indices.push_back(i);
  150. int nfolds = confRDF.gI( "debug", "nfolds", 10 );
  151. Vector mef_v ( nfolds );
  152. Vector corr_v ( nfolds );
  153. Vector resub_v ( nfolds );
  154. Vector diff_v ( nfolds );
  155. bool saveForest = confRDF.gB( "debug", "save_forest", false );
  156. string leafReg = confRDF.gS( "PreRandomForest", "leaf_regression", "gp" );
  157. KernelExp *kernel_template = new KernelExp ( confRDF.gD("Kernel", "log_rbf_gamma", -2.5), 0.0 );
  158. /*------------Store Configuration------------*/
  159. string filename = confRDF.gS( "debug", "filename" );
  160. if ( saveForest )
  161. {
  162. cout << "Configuration will be stored in: " << filename << "_config" << endl;
  163. storeEvalData.open ( (filename+"_config").c_str() );
  164. storeEvalData << "random_split_tests=" << confRDF.gI ( "RTBRandom", "random_split_tests" ) << endl;
  165. storeEvalData << "random_features=" << confRDF.gI ( "RTBRandom", "random_features" ) << endl;
  166. storeEvalData << "max_depth=" << confRDF.gI ( "RTBRandom", "max_depth" ) << endl;
  167. storeEvalData << "random_split_mode=" << confRDF.gS ( "RTBRandom", "random_split_mode" ) << endl;
  168. storeEvalData << "min_examples=" << confRDF.gI ( "RTBRandom", "min_examples" ) << endl;
  169. storeEvalData << "number_of_trees=" << confRDF.gI ( "RandomForest", "number_of_trees" ) << endl;
  170. storeEvalData << "features_per_tree=" << confRDF.gD ( "RandomForest", "features_per_tree" ) << endl;
  171. storeEvalData << "samples_per_tree=" << confRDF.gD ( "RandomForest", "samples_per_tree" ) << endl;
  172. storeEvalData << "builder=" << confRDF.gS ( "RandomForest", "builder" ) << endl;
  173. storeEvalData << "minimum_error_reduction=" << confRDF.gD ( "RandomForest", "minimum_error_reduction" ) << endl;
  174. storeEvalData << "log_rbf_gamma=" << confRDF.gD ( "Kernel", "log_rbf_gamma" ) << endl;
  175. storeEvalData.close();
  176. } else
  177. {
  178. cout << "Configuration will not be stored." << endl;
  179. }
  180. /*------------Setting up PreRDF--------------*/
  181. for ( int k = 0; k < nfolds; k++)
  182. {
  183. string fold;
  184. ostringstream convert;
  185. convert << k;
  186. fold = convert.str();
  187. cout << "\nFOLD " << k << ":\n======" << endl;
  188. cerr << "Initializing leaf regression method " << leafReg << "...";
  189. RegressionAlgorithm *leafRegression = NULL;
  190. Kernel *kernel_function = NULL;
  191. if ( leafReg == "GaussProcess" )
  192. {
  193. kernel_function = new KernelExp ( *(kernel_template) );
  194. leafRegression = new RegGaussianProcess( &confRDF, kernel_function, "GPRegression" );
  195. }
  196. else if ( leafReg == "Linear" )
  197. leafRegression = new LinRegression ();
  198. else if ( leafReg == "KNN" )
  199. leafRegression = new RegKNN ( &confRDF, NULL);
  200. else if ( leafReg == "none" ) {
  201. cerr << "\ntestRegressionRDFGP::testFrame: No leaf regression method set! Using RandomForest prediction..." << endl;
  202. } else {
  203. cerr << "\ntestRegressionRDFGP::testFrame: No valid leaf regression method set! Aborting..." << endl;
  204. exit(-1);
  205. }
  206. cerr << "Finished." << endl;
  207. cerr << "Initializing PreRDF for regression...";
  208. RegPreRandomForests *prf = new RegPreRandomForests ( &confRDF, "PreRandomForest", leafRegression );
  209. cerr << "Finished." << endl;
  210. cerr << "Teaching the PreRDF for regression...";
  211. NICE::VVector trainData, testData;
  212. NICE::Vector trainVals ( trainingSize );
  213. NICE::Vector testVals ( testingSize );
  214. random_shuffle( indices.begin(), indices.end() );
  215. for ( int i = 0; i < trainingSize; i++ )
  216. {
  217. trainData.push_back ( xdata[ indices[i] ] );
  218. trainVals.set( i, y[ indices[i] ] );
  219. }
  220. for ( int j = 0; j < testingSize; j++ )
  221. {
  222. testData.push_back ( xdata[ indices[j+trainingSize] ] );
  223. testVals.set( j, y[ indices[j+trainingSize] ] );
  224. }
  225. prf->teach ( trainData, trainVals );
  226. cerr << "Finished." << endl;
  227. /*-------------Testing RDF-GP--------------*/
  228. cerr << "\nGetting prediction values for all data points...";
  229. NICE::Vector predictionValues( testingSize );
  230. predictionValues.set ( 0.0 );
  231. for ( int j = 0; j < testingSize; j++ )
  232. {
  233. predictionValues[j] = prf->predict( testData[j] );
  234. }
  235. cerr << "Finished." << endl;
  236. /*---------------Evaluation----------------*/
  237. NICE::Vector diff = testVals - predictionValues;
  238. double mod_var = diff.StdDev()*diff.StdDev();
  239. double tar_var = testVals.StdDev()*testVals.StdDev();
  240. mef_v.set( k, (1-mod_var/tar_var) );
  241. NICE::Vector meanv( predictionValues.size() );
  242. meanv.set( diff.Mean() );
  243. NICE::Vector lhs = diff - meanv;
  244. meanv.set( testVals.Mean() );
  245. NICE::Vector rhs = testVals - meanv;
  246. lhs *= rhs;
  247. double corr = lhs.Mean() / sqrt( diff.StdDev()*diff.StdDev()*testVals.StdDev()*testVals.StdDev() );
  248. corr_v.set( k, corr );
  249. diff *= diff;
  250. diff_v.set( k, diff.Mean());
  251. resub_v.set( k, (diff.Mean() / tar_var) );
  252. if (kernel_function != NULL)
  253. delete kernel_function;
  254. }
  255. /*------------------Output-------------------*/
  256. cout << "\nSimple Cross Validation Stats:\n==============================" << endl;
  257. cout << " Modelling Efficiency: " << mef_v.Mean() << endl;
  258. cout << " Correlation: " << corr_v.Mean() << endl;
  259. cout << " Mean Square Error: " << diff_v.Mean() << endl;
  260. cout << " Standardized MSE: " << resub_v.Mean() << endl;
  261. /*-----------------Cleaning------------------*/
  262. delete kernel_template;
  263. }
  264. int main (int argc, char **argv) {
  265. Config conf ( argc, argv ); //Config for RFGP
  266. string path = conf.gS( "debug", "path", "." );
  267. string dataset = conf.gS( "debug", "dataset", "flux" );
  268. NICE::VVector xdata;
  269. NICE::Vector y;
  270. loadData(xdata, y, path, (dataset+"_x.csv"), (dataset+"_y.csv") ); //load all data
  271. testFrame( conf, xdata, y );
  272. return 0;
  273. }