testRegressionGP.cpp 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
  1. /**
  2. * @file testRegressionGP.cpp
  3. * @brief test of GP
  4. * @author Sven Sickert
  5. * @date 07/11/2013
  6. */
  #include <stdlib.h>
  #include <assert.h>

  #include <algorithm>
  #include <cmath>
  #include <fstream>
  #include <iostream>
  #include <sstream>
  #include <string>
  #include <vector>

  #include "core/basics/Config.h"
  #include "core/vector/VectorT.h"
  #include "core/vector/VVector.h"
  //#include "vislearning/baselib/ICETools.h"
  #include "vislearning/regression/gpregression/RegGaussianProcess.h"
  #include "vislearning/math/kernels/KernelExp.h"
  21. using namespace OBJREC;
  22. using namespace NICE;
  23. using namespace std;
  /**
   * @brief Split a single CSV line into its fields.
   *
   * The previous content of @p record is discarded. A field that starts with a
   * double quote is read in quoted mode: delimiters and line breaks inside it
   * are kept literally and two consecutive double quotes stand for one.
   * Outside of quotes, the first '\r' or '\n' ends the line. Scanning also
   * stops at an embedded NUL character.
   *
   * @param record    receives the fields (always at least one, possibly empty)
   * @param line      the text of one CSV line
   * @param delimiter the field separator
   */
  void csvline_populate ( std::vector<std::string> &record,
                          const std::string &line,
                          char delimiter )
  {
    record.clear();

    const std::size_t length = line.length();
    std::string field;
    bool quoted = false;

    for ( std::size_t pos = 0; pos < length && line[pos] != '\0'; ++pos )
    {
      const char ch = line[pos];

      if ( quoted )
      {
        if ( ch != '"' )
        {
          // everything but a quote is taken verbatim inside a quoted field
          field.push_back( ch );
        }
        else if ( pos + 1 < length && line[pos + 1] == '"' )
        {
          // doubled quote: emit one quote and step over its twin
          field.push_back( ch );
          ++pos;
        }
        else
        {
          // closing quote
          quoted = false;
        }
        continue;
      }

      if ( ch == '"' && field.empty() )
      {
        // a quote opens quoted mode only at the very start of a field
        quoted = true;
      }
      else if ( ch == delimiter )
      {
        // field complete
        record.push_back( field );
        field.clear();
      }
      else if ( ch == '\r' || ch == '\n' )
      {
        // unquoted line break: the record ends here
        record.push_back( field );
        return;
      }
      else
      {
        field.push_back( ch );
      }
    }

    // the last field is not followed by a delimiter
    record.push_back( field );
  }
  76. void loadData( NICE::VVector &Data,
  77. NICE::Vector &y,
  78. const string &path,
  79. const string &xdat,
  80. const string &ydat )
  81. {
  82. vector<string> row;
  83. string line;
  84. cerr<<"Preloading Data...";
  85. ifstream in( (path+xdat).c_str() );
  86. if ( in.fail() )
  87. {
  88. cout << "File not found" <<endl;
  89. exit(EXIT_FAILURE);
  90. }
  91. int dim = 10; //TODO fixed data dimension
  92. int numData = 0;
  93. while ( getline(in, line) && in.good() )
  94. {
  95. csvline_populate(row, line, ',');
  96. NICE::Vector vec(dim);
  97. for (int i = 0; i < (int)row.size(); i++)
  98. {
  99. double dval = 0.0;
  100. dval = atof(row[i].data() );
  101. vec.set(i,dval);
  102. }
  103. Data.push_back(vec);
  104. numData++;
  105. }
  106. in.close();
  107. cerr<<"Finished."<<endl<<"Starting to get preloaded Labels...";
  108. in.open( (path+ydat).c_str() );
  109. if ( in.fail() )
  110. {
  111. cout << "File not found! Setting default value 0.0..." <<endl;
  112. y.resize(numData);
  113. y.set(0.0);
  114. }
  115. else
  116. {
  117. y.resize(numData);
  118. int count = 0;
  119. while(getline(in, line) && in.good() )
  120. {
  121. csvline_populate(row, line, ',');
  122. for ( int i = 0; i < (int)row.size(); i++ )
  123. {
  124. double dval = 0.0;
  125. dval = atof(row[i].data() );
  126. y.set(count,dval);
  127. count++;
  128. }
  129. }
  130. in.close();
  131. }
  132. cerr<<"Finished."<<endl;
  133. }
  134. void testFrame ( Config confRDF,
  135. NICE::VVector &xdata,
  136. NICE::Vector &y )
  137. {
  138. cerr<<"\nStarting test framework..."<<endl;
  139. /*------------Initialize Variables-----------*/
  140. ofstream storeEvalData;
  141. int trainingSize = (int)(.5*xdata.size());
  142. int testingSize = xdata.size() - trainingSize;
  143. vector<int> indices;
  144. for ( int i = 0; i < (int)xdata.size(); i++ )
  145. indices.push_back(i);
  146. int nfolds = confRDF.gI( "debug", "nfolds", 10 );
  147. Vector mef_v ( nfolds );
  148. Vector corr_v ( nfolds );
  149. Vector resub_v ( nfolds );
  150. Vector diff_v ( nfolds );
  151. KernelExp *kernel_template = new KernelExp ( confRDF.gD("Kernel", "log_rbf_gamma", -2.5), 0.0 );
  152. /*--------------Setting up GP----------------*/
  153. for ( int k = 0; k < nfolds; k++)
  154. {
  155. string fold;
  156. ostringstream convert;
  157. convert << k;
  158. fold = convert.str();
  159. cout << "\nFOLD " << k << ":\n======" << endl;
  160. cerr << "Initializing GP regression...";
  161. Kernel *kernel_function = NULL;
  162. kernel_function = new KernelExp ( *(kernel_template) );
  163. RegGaussianProcess *regGP = new RegGaussianProcess( &confRDF, kernel_function, "GPRegression" );
  164. NICE::VVector trainData, testData;
  165. NICE::Vector trainVals ( trainingSize );
  166. NICE::Vector testVals ( testingSize );
  167. random_shuffle( indices.begin(), indices.end() );
  168. for ( int i = 0; i < trainingSize; i++ )
  169. {
  170. trainData.push_back ( xdata[ indices[i] ] );
  171. trainVals.set( i, y[ indices[i] ] );
  172. }
  173. for ( int j = 0; j < testingSize; j++ )
  174. {
  175. testData.push_back ( xdata[ indices[j+trainingSize] ] );
  176. testVals.set( j, y[ indices[j+trainingSize] ] );
  177. }
  178. cerr << "Finished." << endl;
  179. cerr << "Teaching the GP regression...";
  180. regGP->teach( trainData, trainVals );
  181. cerr << "Finished." << endl;
  182. /*---------------Testing GP----------------*/
  183. cerr << "\nGetting prediction values for all data points...";
  184. NICE::Vector predictionValues( testingSize );
  185. predictionValues.set ( 0.0 );
  186. for ( int j = 0; j < testingSize; j++ )
  187. {
  188. predictionValues[j] = regGP->predict( testData[j] );
  189. }
  190. cerr << "Finished." << endl;
  191. /*---------------Evaluation----------------*/
  192. NICE::Vector diff = testVals - predictionValues;
  193. for (int j = 0; j < testingSize; j++)
  194. cerr << testVals[j] << " " << predictionValues[j] << endl;
  195. double mod_var = diff.StdDev()*diff.StdDev();
  196. double tar_var = testVals.StdDev()*testVals.StdDev();
  197. mef_v.set( k, (1-mod_var/tar_var) );
  198. NICE::Vector meanv( predictionValues.size() );
  199. meanv.set( diff.Mean() );
  200. NICE::Vector lhs = diff - meanv;
  201. meanv.set( testVals.Mean() );
  202. NICE::Vector rhs = testVals - meanv;
  203. lhs *= rhs;
  204. double corr = lhs.Mean() / sqrt( diff.StdDev()*diff.StdDev()*testVals.StdDev()*testVals.StdDev() );
  205. corr_v.set( k, corr );
  206. diff *= diff;
  207. diff_v.set( k, diff.Mean());
  208. resub_v.set( k, (diff.Mean() / tar_var) );
  209. }
  210. /*------------------Output-------------------*/
  211. cout << "\nSimple Cross Validation Stats:\n==============================" << endl;
  212. cout << " Modelling Efficiency: " << mef_v.Mean() << endl;
  213. cout << " Correlation: " << corr_v.Mean() << endl;
  214. cout << " Mean Square Error: " << diff_v.Mean() << endl;
  215. cout << " Standardized MSE: " << resub_v.Mean() << endl;
  216. }
  217. int main (int argc, char **argv) {
  218. string path = "/home/sickert/data/cosre-MPI/regression-fluxcom/_DATA/";
  219. Config confRDF(path+"config.conf"); //Config for RDF
  220. NICE::VVector xdata;
  221. NICE::Vector y;
  222. /*----------Load dataset---------*/
  223. loadData(xdata, y, path, "flux_x.csv", "flux_y.csv"); //load all data
  224. testFrame( confRDF, xdata, y );
  225. return 0;
  226. }