/**
* @file testLinRegression.cpp
* @brief test of linear regression
* @author Frank Prüfer
* @date 08/13/2013

*/

#include <sstream>
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <stdlib.h>
#include <assert.h>

#include "core/basics/Config.h"
#include "core/vector/VectorT.h"
#include "core/vector/VVector.h"

#include "vislearning/baselib/ICETools.h"

#include "vislearning/regression/linregression/LinRegression.h"

using namespace OBJREC;
using namespace NICE;
using namespace std;

/**
 * @brief Split one CSV line into its fields.
 *
 * Fields are separated by @p delimiter. A field whose first character is a
 * double quote is read in quoted mode: delimiters and line breaks inside it
 * are taken literally and two consecutive double quotes yield one double
 * quote. Outside of quotes a '\r' or '\n' ends the record. Scanning also
 * stops at the first NUL character or at the end of the string.
 *
 * @param record    output; cleared first, then filled with the fields
 *                  (always at least one entry, possibly empty)
 * @param line      the text of the line to split
 * @param delimiter the field separator
 */
void csvline_populate ( vector<string> &record,
                       const string& line,
                       char delimiter )
{
  const int length = (int)line.length();
  bool quoted = false;
  string field;

  record.clear();

  for ( int pos = 0; pos < length && line[pos] != 0; ++pos )
  {
    const char ch = line[pos];

    if ( quoted )
    {
      if ( ch != '"' )
      {
        // everything except a quote is literal inside a quoted field
        field.push_back( ch );
      }
      else if ( pos + 1 < length && line[pos + 1] == '"' )
      {
        // doubled quote -> one literal quote; skip the second one
        field.push_back( ch );
        ++pos;
      }
      else
      {
        // closing quote
        quoted = false;
      }
      continue;
    }

    // --- unquoted mode ---
    if ( ch == '"' && field.empty() )
    {
      // a quote at the very start of a field opens quoted mode
      quoted = true;
    }
    else if ( ch == delimiter )
    {
      record.push_back( field );
      field.clear();
    }
    else if ( ch == '\r' || ch == '\n' )
    {
      // line break terminates the record; the rest of the input is ignored
      record.push_back( field );
      return;
    }
    else
    {
      field.push_back( ch );
    }
  }

  // flush the last (possibly empty) field
  record.push_back( field );
}

/**
 * @brief Load feature vectors and target values from two CSV files.
 *
 * Every line of the feature file is split at ',' and each field is converted
 * with atof(); the resulting vector is appended to @p Data (entries already
 * present in @p Data are kept). Afterwards @p y is resized to the number of
 * lines read by this call, zero-filled, and overwritten with the values found
 * in the target file in reading order. Values beyond that size are ignored.
 *
 * If the feature file cannot be opened the process is terminated with
 * EXIT_FAILURE. If the target file cannot be opened all targets stay 0.0.
 *
 * @param Data receives one vector per line of the feature file
 * @param y    receives the target values
 * @param path directory of both files; a '/' is appended when it does not
 *             already end in a path separator (an empty path adds nothing)
 * @param xdat file name of the feature CSV
 * @param ydat file name of the target CSV
 */
void loadData( NICE::VVector &Data,
               NICE::Vector &y,
               const string &path,
               const string &xdat,
               const string &ydat )
{

  vector<string> row;
  string line;

  // Treat path as a directory. Plain concatenation turned the default
  // path "." into ".<file>" instead of "./<file>".
  string dir = path;
  if ( !dir.empty() )
  {
    const char last = dir[dir.size() - 1];
    if ( last != '/' && last != '\\' )
      dir += '/';
  }

  cerr<<"Preloading Data...";
  ifstream in( (dir+xdat).c_str() );
  if ( in.fail() )
  {
    cout << "File not found" <<endl;
    exit(EXIT_FAILURE);
  }

  int numData = 0;

  // Only the result of getline() is tested: it succeeds (while setting
  // eofbit) for a last line without trailing newline, so an additional
  // in.good() check would silently drop that line.
  while ( getline(in, line) )
  {
    csvline_populate(row, line, ',');
    vector<double> vec;
    vec.reserve( row.size() );
    for ( size_t i = 0; i < row.size(); i++ )
    {
      vec.push_back( atof( row[i].c_str() ) );
    }
    NICE::Vector nvec(vec);
    Data.push_back(nvec);
    numData++;
  }
  in.close();
  // Reading to the end left eofbit/failbit set; reset them explicitly since
  // open() is only guaranteed to clear the state from C++11 on.
  in.clear();

  cerr<<"Finished."<<endl<<"Starting to get preloaded Labels...";

  // Default every target to 0.0 so that entries not covered by the label
  // file have a defined value.
  y.resize(numData);
  y.set(0.0);

  in.open( (dir+ydat).c_str() );
  if ( in.fail() )
  {
    cout << "File not found! Setting default value 0.0..." <<endl;
  }
  else
  {
    int count = 0;
    // Stop once y is full: surplus values in the label file must not be
    // written past the end of y.
    while ( count < numData && getline(in, line) )
    {
      csvline_populate(row, line, ',');
      for ( size_t i = 0; i < row.size() && count < numData; i++ )
      {
        y.set( count, atof( row[i].c_str() ) );
        count++;
      }
    }
    in.close();
  }

  cerr<<"Finished."<<endl;
}

/**
 * @brief Evaluate LinRegression on repeated random train/test splits.
 *
 * In each round the sample indices are shuffled; the first trainingSize
 * samples are used to teach a fresh LinRegression object and the remaining
 * samples are predicted. Per round the modelling efficiency, a correlation
 * value, the mean squared error and the MSE divided by the target variance
 * are recorded; their means over all rounds are printed to stdout.
 *
 * Values read from section "debug" of @p conf:
 *  - training_ratio (default .9): fraction of samples used for teaching
 *  - nfolds (default 10): number of rounds
 *  - save_config (default false): whether the "<filename>_config" file is created
 *  - filename: read without a default value
 *
 * @param conf  configuration object
 * @param xdata feature vectors, one per sample
 * @param y     target values, indexed like @p xdata
 */
void testFrame (  Config conf,
                  NICE::VVector &xdata,
                  NICE::Vector &y )
{
  cerr<<"\nStarting test framework..."<<endl;

  /*------------Initialize Variables-----------*/
  ofstream storeEvalData;
  double trainRatio = conf.gD( "debug", "training_ratio", .9 );

  int trainingSize = (int)(trainRatio*xdata.size());
  int testingSize = xdata.size() - trainingSize;

  vector<int> indices;
  indices.reserve( xdata.size() );
  for ( int i = 0; i < (int)xdata.size(); i++ )
    indices.push_back(i);

  int nfolds = conf.gI( "debug", "nfolds", 10 );
  Vector mef_v ( nfolds );
  Vector corr_v ( nfolds );
  Vector resub_v ( nfolds );
  Vector diff_v ( nfolds );

  bool saveConfig = conf.gB( "debug", "save_config", false );

  /*------------Store Configuration------------*/
  string filename = conf.gS( "debug", "filename" );

  if ( saveConfig )
  {
    cout << "Configuration will be stored in: " << filename << "_config" << endl;

    // NOTE(review): the file is opened and closed without anything being
    // written, so it ends up empty - confirm whether content is missing here.
    storeEvalData.open ( (filename+"_config").c_str() );

    storeEvalData.close();
  } else
  {
    cout << "Configuration will not be stored." << endl;
  }

  /*------------Run the validation rounds------*/
  for ( int k = 0; k < nfolds; k++)
  {
    cout << "\nFOLD " << k << ":\n======" << endl;

    cerr << "Initializing LinRegression...";
    // Automatic storage: the regressor is destroyed at the end of every
    // round (the former heap allocation was never released).
    LinRegression linReg;
    cerr << "Finished." << endl;

    cerr << "Teaching the LinRegression algorithm...";
    NICE::VVector trainData, testData;
    NICE::Vector trainVals ( trainingSize );
    NICE::Vector testVals ( testingSize );

    // New random split: first trainingSize indices train, the rest test.
    random_shuffle( indices.begin(), indices.end() );
    for ( int i = 0; i < trainingSize; i++ )
    {
      trainData.push_back ( xdata[ indices[i] ] );
      trainVals.set( i, y[ indices[i] ] );
    }
    for ( int j = 0; j < testingSize; j++ )
    {
      testData.push_back ( xdata[ indices[j+trainingSize] ] );
      testVals.set( j, y[ indices[j+trainingSize] ] );
    }

    linReg.teach ( trainData, trainVals );
    cerr << "Finished." << endl;

    /*----------Testing LinRegression----------*/

    cerr << "\nGetting prediction values for all data points...";
    NICE::Vector predictionValues( testingSize );
    predictionValues.set ( 0.0 );
    for ( int j = 0; j < testingSize; j++ )
    {
      predictionValues[j] = linReg.predict( testData[j] );
    }
    cerr << "Finished." << endl;

    /*---------------Evaluation----------------*/
    NICE::Vector diff = testVals - predictionValues;

    // Standard deviations are needed several times; query them once.
    const double diffStd = diff.StdDev();
    const double tarStd = testVals.StdDev();

    // Modelling efficiency: 1 - residual variance / target variance.
    double mod_var = diffStd*diffStd;
    double tar_var = tarStd*tarStd;
    mef_v.set( k, (1-mod_var/tar_var) );

    // NOTE(review): this correlates the residuals (diff) with the targets,
    // not the predictions with the targets - confirm that this is intended.
    NICE::Vector meanv( predictionValues.size() );
    meanv.set( diff.Mean() );
    NICE::Vector lhs = diff - meanv;
    meanv.set( testVals.Mean() );
    NICE::Vector rhs = testVals - meanv;
    lhs *= rhs;
    double corr = lhs.Mean() / sqrt( diffStd*diffStd*tarStd*tarStd );
    corr_v.set( k, corr );

    // From here on diff is multiplied by itself, so its mean is the MSE.
    diff *= diff;
    const double mse = diff.Mean();
    diff_v.set( k, mse );
    resub_v.set( k, (mse / tar_var) );
  }

  /*------------------Output-------------------*/
  cout << "\nSimple Cross Validation Stats:\n==============================" << endl;
  cout << "  Modelling Efficiency: " << mef_v.Mean() << endl;
  cout << "  Correlation: " << corr_v.Mean() << endl;
  cout << "  Mean Square Error: " << diff_v.Mean() << endl;
  cout << "  Standardized MSE: " << resub_v.Mean() << endl;
}


/**
 * @brief Program entry: read the configuration, load the data set and run
 *        the evaluation.
 *
 * Values read from section "debug": "path" (default ".") and "dataset"
 * (default "flux"). The files "<dataset>_x.csv" and "<dataset>_y.csv" are
 * handed to loadData() together with the path.
 */
int main (int argc, char **argv)
{
  // configuration built from the command-line arguments
  Config conf ( argc, argv );

  const string path = conf.gS( "debug", "path", "." );
  const string dataset = conf.gS( "debug", "dataset", "flux" );

  // file names derived from the data set name
  const string featureFile = dataset + "_x.csv";
  const string targetFile = dataset + "_y.csv";

  NICE::VVector xdata;
  NICE::Vector y;
  loadData( xdata, y, path, featureFile, targetFile );

  testFrame( conf, xdata, y );

  return 0;
}