// $Id: tfreg.cpp 1369 2024-11-29 14:20:00Z ge $
/// \file tfreg.cpp
/// \brief Calculates temperatures using the regression method
/// \author Gerald Weber <gweberbh@gmail.com>
/// $Revision: 1369 $

#include "Options.h"
#include "NeighbourSequence.h"
#include "Regression.h"
#include <unistd.h>
#include "HeterogenousTM.h"
//#include "Actions.h"
#include "RegressionModel.h"
#include "CPUtime.h"
#include "OutStream.h"
#include "ErrorCodes.h"

namespace gbc
  {
  /// \brief Specialisation of ActionAverageY::action for HeterogenousTM<>.
  ///
  /// Stores the size of tf.pSI->BP_reduced_duplex in number_of_basepairs, then
  /// asks \a tf to retrieve its eigensystem and to retrieve or calculate its
  /// matrices, and finally returns whatever tf.average_y() yields.
  ///
  /// \param tf the HeterogenousTM<> instance to operate on; tf.pSI is
  ///           dereferenced unconditionally, so it has to point to a valid
  ///           object before this function is called
  /// \param N  not used by this specialisation
  /// \return the value returned by tf.average_y()
  template<>
  std::valarray<HeterogenousTM<>::value_type> ActionAverageY::action(HeterogenousTM<>& tf,size_t /*N*/)
    {
    // The duplex size is recorded before any of the retrieval calls below.
    auto& reduced_duplex=tf.pSI->BP_reduced_duplex;
    number_of_basepairs=reduced_duplex.size();

    tf.retrieve_eigensystem();
    tf.retrieve_or_calculate_matrices();

    return tf.average_y();
    }
  }

// main() below refers to both project (gbc) and standard-library names
// without qualification, so both namespaces are opened at file scope.
using namespace gbc;
using namespace std;

/// Floating-point type used to instantiate the project templates in main().
using value_type = double;

/// \brief Entry point of tfreg.
///
/// The function proceeds in the following stages:
///  -# print a banner with version/build information;
///  -# declare every command-line option and let OptionMap scan the arguments;
///  -# tighten or relax option requirements depending on the selected -res mode;
///  -# extend the nucleotide dictionary (U, terminal character, -dict entries);
///  -# configure the HeterogenousTM object and the RegressionModel built on it;
///  -# read regression coefficients, rules and data files as requested;
///  -# run the calculation selected with -res for a single sequence (-seq)
///     and/or for the data files (-data) and write the result files.
///
/// \param argc number of command-line arguments, handed to OptionMap::arguments
/// \param argv command-line arguments, handed to OptionMap::arguments
/// \return 0 on normal completion; the nncheck mode leaves through exit(0)
int main(int argc,char *argv[])
  {

  // Fallback values for the build-information macros, used only when the
  // build system did not define them.
#ifndef SVNVERSION
#define SVNVERSION "$Revision: 1369 $"
#endif
#ifndef LINUXSYSTEM
#define LINUXSYSTEM "UNKNOWN"
#endif
#ifndef BUILDTIME
#define BUILDTIME "UNKNOWN"
#endif
#ifndef COMPILERNAME
#ifdef __INTEL_COMPILER
#define COMPILERNAME "icpc"
#else
#define COMPILERNAME "g++"
#endif
#endif

  OptionMap::env_id="TFREG";

  std::cout << "*** tfreg version=" << std::string(SVNVERSION)
            << " system=" << std::string(LINUXSYSTEM)
            << " builddate=" << std::string(BUILDTIME)
            << " compiler=" << std::string(COMPILERNAME) << std::endl;

  // ------------------------------------------------------------------
  // Option declarations. The declaration order is kept exactly as it was.
  // ------------------------------------------------------------------
  Option<std::string>    output_name("o","output files basename",OptionMap::necessary);
  Option<std::string>    reg_name("reg","regression parameter file",OptionMap::optional);

  Option<std::string>    result("res","select which type of results",OptionMap::optional,"regression");
  result.add_restricted_option("nncheck","interprets the sequences in the -data file and exits without performing calculations");
  result.add_restricted_option("zyfy","calculates Zy and free energy");
  result.add_restricted_option("prediction","performs Tm prediction from given files in -data and -reg");
  result.add_restricted_option("sprediction","same as prediction but reads/calculates sequentially from file using less memory");
  result.add_restricted_option("averagey","calculates <y>");
  result.add_restricted_option("regression","makes a regression between Tm and melting index");

  Option<std::string>    duplextype("duplextype","type of duplex, e.g. DNA or RNA",OptionMap::optional,"DNA");
  duplextype.add_restricted_option("RNA","RNA");

  Option<std::deque<std::string> > par_file("par","list of parameter file",OptionMap::necessary);
  Option<std::string>    expand("expand","expand on this base",OptionMap::optional,"CG_CG");
  Option<std::string>    matrixbase("matrix","location of the matrices",OptionMap::optional);
  Option<std::deque<std::string> > datafile("data","list of sequence data files",OptionMap::optional);
  OptionMap::add_synonymn("xml","data");//For compatibility with var-par

  Option<std::string>    sequence("seq","sequence (5' to 3')",OptionMap::optional);
  Option<std::string>    csequence("cseq","complementary sequence (3' to 5')",OptionMap::optional);
  Option<int>            cutoff("cutoff","number of eigenvalues at which to cutoff a summation",OptionMap::optional,0);

  Option<bool>           periodic_bc("pbc","periodic boundary condition",OptionMap::optional,false);
  periodic_bc.add_restricted_option(false,"open boundary condition");
  periodic_bc.add_restricted_option(true,"periodic boundary condition");

  Option<std::deque<std::string> > debug("debug","select debuging flags",OptionMap::optional);

  Option<std::string>    model("m","model",OptionMap::optional,"pb");
  model.add_restricted_option("pb","Peyrard-Bishop model");
  model.add_restricted_option("dpb","Dauxois-Peyrard-Bishop  model");
  model.add_restricted_option("hms","Harmonic-Morse-Solvent model");
  model.add_restricted_option("jb","finite enthalpy model");
  model.add_restricted_option("mes","Morse-exact stacking model");
  model.add_restricted_option("test","Reserved");
  model.add_restricted_option("pb3DA","approximated 3D Peyrard-Bishop");
  model.add_restricted_option("pcla","entropic barrier y³");
  model.add_restricted_option("pclj","entropic hump potential");
  model.add_restricted_option("trmf","gaussian barrier");

  Option<double>         temperature("t","temperature in Kelvin",OptionMap::optional,370.0);
  Option<Range<double> > temperature_range("tr","temperature range for looping in Kelvin",OptionMap::optional);

  // Default integration range offered by -int: from -1.0 to 30.0 with steps(100).
  Range<double> integration_limits(-1.0,30.0);
  integration_limits.steps(100);
  Option<Range<double> > integration("int","integration range",OptionMap::optional,integration_limits);

  Option<double>         exp_err("ee","experimental error",OptionMap::optional,0.0);
  Option<int>            ran_seed("rs","random_seed",OptionMap::optional,1);
  Option<int>            min_length_regression("mlr","minimum duplex length to be included in regression",OptionMap::optional,0);
  Option<int>            min_a_regression("mar","minimum amount of data points for a given N in regression",OptionMap::optional,3);

  Option<int>            prediction_method("pm","prediction method",OptionMap::optional,2);
  prediction_method.add_restricted_option(1,"Single regression, by melting index for each group of length N");
  prediction_method.add_restricted_option(2,"Double regression, by melting index and group of length N");
  prediction_method.add_restricted_option(3,"Triple regression, by melting index, group of length, and group of salt concentration");
  prediction_method.add_restricted_option(-1,"Regression considering all Tm as a single group");
  prediction_method.add_restricted_option(-2,"Regression considering specific types of group, use together with -pg");

  Option<std::string>    prediction_group("pg","prediction group for method=-2, either salt, ct or key",OptionMap::optional,"salt");
  prediction_group.add_restricted_option("salt","group by salt concentrations");
  prediction_group.add_restricted_option("ct","group by Ct concentrations");
  prediction_group.add_restricted_option("key","group by key (see documentation for details)");

  Option<double>         salt("salt","salt concentration (mM)", OptionMap::optional,69);
  Option<double>         target_salt("targetsalt","target salt concentration (for converting Tm), if zero gets it from the data file",OptionMap::optional,0);
  Option<bool>           prediction_with_salt_correction("predsaltcorr","apply salt correction to predicted Tm (default: apply to dataset Tm)",OptionMap::optional,false);
  prediction_with_salt_correction.add_restricted_option(false,"apply to dataset Tm");
  prediction_with_salt_correction.add_restricted_option(true,"apply salt correction to predicted Tm");

  Option<std::string>    salt_scheme("saltscheme","type of salt conversion scheme",OptionMap::optional,"nocorrection");
  salt_scheme.add_restricted_option("tm_ln","generic tm vs ln");
  salt_scheme.add_restricted_option("rec_tm_ln","generic 1/tm vs ln");
  salt_scheme.add_restricted_option("owczarzy04eq19","owczarzy04eq19 for DNA");
  salt_scheme.add_restricted_option("owczarzy04eq20","owczarzy04eq20 for DNA");
  salt_scheme.add_restricted_option("owczarzy04eq21","owczarzy04eq21 for DNA");
  salt_scheme.add_restricted_option("owczarzy04eq22","owczarzy04eq22 for DNA");
  salt_scheme.add_restricted_option("chen13eq19","chen13eq19 for RNA");
  salt_scheme.add_restricted_option("chen13eq20","chen13eq20 for RNA");
  salt_scheme.add_restricted_option("chen13eq21","chen13eq21 for RNA");
  salt_scheme.add_restricted_option("chen13eq22","chen13eq22 for RNA");
  salt_scheme.add_restricted_option("schildkraut65","old correction for DNA");
  salt_scheme.add_restricted_option("nakano99","nakano99 for DNA/RNA only for 1000 to 100 Na+");

  Option<std::map<std::string,std::string> > dict("dict","Additions to nucleotide dictionary",OptionMap::optional);
  Option<std::string>    terminal_char("terminal","which character should represent the terminal of a sequence",OptionMap::optional,std::string("_"));

  //CG_equivalent and AT_equivalent will be compared to BP_reduced_duplex in sequencestructure/SequenceInfo.h
  Option<std::deque<std::string> > CG_equivalent("cgeq","Additional base pairs equivalent to CG, used for calculating fCG",OptionMap::optional);
  Option<std::deque<std::string> > AT_equivalent("ateq","Additional base pairs equivalent to AT, used for Is_AT_only",OptionMap::optional);

  Option<bool>           printusedpar("printusedpar","print parameters that were used with file extension .usedpar",OptionMap::optional,false);

  // ------------------------------------------------------------------
  // Scan the command line, then adjust which options are required
  // according to the selected result type before the final check.
  // ------------------------------------------------------------------
  OptionMap::arguments(argc,argv);
  OptionMap::scan_arguments();

  if (result.check("nncheck"))
    {
    // nncheck only inspects sequences: no output basename or parameter files
    // are required, but some input (either -seq or -data) is.
    output_name.turn_optional();
    par_file.turn_optional();
    if (!sequence.provided()) datafile.turn_necessary();
    }

  if (result.check("zyfy"))
    {
    sequence.turn_necessary();
    temperature_range.turn_necessary();
    }

  if (result.check("prediction") or result.check("sprediction"))
    {
    reg_name.turn_necessary();
    }

  if (prediction_method.check(-2))
    {
    prediction_group.turn_necessary();
    }

  if (target_salt.provided() or prediction_with_salt_correction.check(true))
    {
    salt_scheme.turn_necessary(true);
    if (sequence.provided()) salt.turn_necessary(true);
    }

  OptionMap::check_if_all_given();
  OptionMap::show_options();

  if (debug.provided())
    {
    for (const auto& flag : debug.Value) DebugControl::enable(flag);
    }

  if (ran_seed.provided()) srand(ran_seed);

  // ------------------------------------------------------------------
  // Nucleotide dictionary: U is always added, A-U pairing only for RNA.
  // ------------------------------------------------------------------
  Nucleotide<>::dictionary.add('U','U');
  if (duplextype.check("RNA"))
    {
    Nucleotide<>::dictionary.complementary_pair('A','U');
    }

  // Only the first character of -terminal is used.
  Nucleotide<>::dictionary.add(terminal_char.Value[0],terminal_char.Value[0]);

  if (dict.provided())
    {
    // Each -dict entry maps a new nucleotide to its complementary one; only the
    // first character of the key and of the value is used.
    for (const auto& entry : dict.Value)
      {
      const char nucleotide=entry.first[0];
      const char complement=entry.second[0];
      std::cout << "Adding new nucleotide to dictionary= "  << nucleotide << " and its complementary= " << complement << std::endl;
      Nucleotide<>::dictionary.add(nucleotide,nucleotide,true);
      Nucleotide<>::dictionary.add(complement,complement,true);
      Nucleotide<>::dictionary.complementary_pair(complement,nucleotide);
      }
    }

  // ------------------------------------------------------------------
  // Set up the HeterogenousTM object.
  // ------------------------------------------------------------------
  HeterogenousTM<value_type> HTM(expand);
  HTM.matrix_base_directory(matrixbase);

  if (periodic_bc.provided()) HTM.Periodic_boundary_condition=periodic_bc;
  HTM.select_hamiltonian(static_cast<std::string>(model));

  //This only populates the ParameterMap, actual values are only attributed when
  //sequences are passed to HTM
  HTM.get_parameters_from_list_of_files(par_file.Value);

  ApplyLimit::apply(HTM,integration);

  if (cutoff.provided()) HTM.eigenvalue_cutoff(static_cast<int>(cutoff));

  HTM.temperature(temperature.Value);

  // ------------------------------------------------------------------
  // Set up the regression model on top of HTM.
  // ------------------------------------------------------------------
  using regression_model_type = RegressionModel<value_type>;
  regression_model_type RM(HTM);

  if (CG_equivalent.provided())
    {
    for (const auto& cg : CG_equivalent.Value) RM.dataset.CG_equivalent_set.insert(cg);
    }
  if (AT_equivalent.provided())
    {
    for (const auto& at : AT_equivalent.Value) RM.dataset.AT_equivalent_set.insert(at);
    }

  RM.prediction_method=prediction_method;
  if (prediction_group.provided())
    {
    std::cout << "*** Prediction group set to " << prediction_group.Value << std::endl;
    if      (prediction_group.check("salt")) RM.prediction_group=regression_model_type::Salt;
    else if (prediction_group.check("ct"))   RM.prediction_group=regression_model_type::Ct;
    else if (prediction_group.check("key"))  RM.prediction_group=regression_model_type::Key;

    if (RM.prediction_method != -2) std::cout << "!!!Warning: setting option -pg only has effect with -pm=-2" << std::endl;
    }

  RM.Min_length_regression=min_length_regression;
  RM.Min_a_regression=min_a_regression;

  //Defines all files, they will be created on first direct write or explicit stream_open
  const std::string output_base=static_cast<std::string>(output_name);
  OutStream outfile(output_base + "|dat");
  OutStream verfile(output_base + "|ver");
  OutStream texfile(output_base + "|tex");
  OutStream regfile(output_base + "|reg");
  OutStream nncheckfile(output_base + "|nncheck");
  OutStream used_par_file(output_base + "|usedpar");

  //Needs to come first as the Ct is set at this stage
  if (reg_name.provided())
    {
    std::cout << "Reading regression parameters from " << reg_name.Value << std::endl;
    RM.read_c(reg_name);
    }

  RM.dataset.read_rules_from_list_of_files(par_file.Value);

  // NOTE(review): the salt correction is only configured when -data is given,
  // although -salt is made necessary above when -seq is used together with
  // -targetsalt/-predsaltcorr. Confirm that single-sequence runs are meant to
  // skip this step.
  if (salt_scheme.provided() and datafile.provided())
    {
    RM.dataset.select_salt_correction(salt_scheme);
    RM.dataset.Target_salt_ct=target_salt.Value;
    if (RM.dataset.Target_salt_ct == value_type())
      std::cout << "Info: -targetsalt=0, using salt concentration in dataset as target for salt correction" << std::endl;
    RM.dataset.Prediction_with_salt_correction=prediction_with_salt_correction.Value;

    std::cout << "Info: applying salt correction scheme " << salt_scheme.Value;
    if (RM.dataset.Prediction_with_salt_correction)
      std::cout << " to predicted Tm" << std::endl;
    else
      std::cout << " to dataset" << std::endl;
    }

  //Read the dataset
  if (datafile.provided())
    {
    // sprediction only registers the file names, every other mode reads the
    // files right away.
    if (result.check("sprediction"))
      RM.dataset.set_list_of_data_files(datafile.Value);
    else
      RM.dataset.read_list_of_data_files(datafile.Value);
    }

  // Here we check for the existence of a complementary strand given by -cseq
  // If -cseq was not given we try to work out the complementary strand by the rule given in Nucleotide<>::dictionary
  regression_model_type::sequence_info_type si;
  std::string mseq;
  std::string cseq;
  if (sequence.provided())
    {
    mseq=static_cast<std::string>(sequence);
    if (csequence.provided())
      {
      cseq=static_cast<std::string>(csequence);
      }
    else
      {
      NucleotideSequence<> seq(sequence.Value.length());
      NucleotideSequence<> comp(sequence.Value.length());
      seq << sequence.Value;
      comp = seq;
      comp.complementary();
      cseq = static_cast<std::string>(comp);
      std::cout << "*** cseq not provided, worked out as " << cseq <<  std::endl;
      }
    std::cout << "seq=" << mseq << " cseq=" << cseq << std::endl;

    //Individual sequences will have to be inserted into RM
    si=RM.add_temperature(mseq,cseq,salt.Value,0.0,0.0);
    // HTM keeps a pointer to si, which lives until the end of main().
    HTM.pSI=&si;
    }

  // ------------------------------------------------------------------
  // nncheck: print the interpreted sequences and stop.
  // ------------------------------------------------------------------
  if (result.check("nncheck"))
    {
    if (output_name.provided())
      {
      nncheckfile.stream_open();
      std::cout << "Info: Writing nncheck analysis to file " << nncheckfile.formatted_name() << std::endl;
      RM.dataset.print_temperatures(nncheckfile,true);
      nncheckfile.stream_close();
      }
    else
      {
      RM.dataset.print_temperatures(std::cout,true);
      }
    // exit() terminates without running the destructors of the local objects
    // above and without reaching print_cpu(); the nncheck file has already
    // been closed explicitly at this point.
    exit(0);
    }

  // ------------------------------------------------------------------
  // Calculations for a single sequence given with -seq.
  // ------------------------------------------------------------------
  if (sequence.provided())
    {
    if (result.check("averagey"))
      {
      outfile.stream_open();
      std::cout << "*** Calculating " << static_cast<std::string>(result) << " for sequence " <<  sequence.Value << std::endl;
      Looping<ActionAverageY>::print_result(HTM,0,outfile);
      }
    if (result.check("zyfy"))
      {
      outfile.stream_open();
      std::cout << "*** Calculating " << static_cast<std::string>(result) << " for sequence " <<  sequence.Value << std::endl;
      Looping<ActionZyFy,ApplyTemperature>::loop(HTM,temperature_range.Value,outfile,0);
      }
    if (result.check("prediction"))
      {
      // NOTE(review): -reg is turned necessary for prediction above, so this
      // branch looks unreachable once check_if_all_given() has passed -- confirm.
      if (not reg_name.provided()) regfile.stream_open();
      outfile.stream_open();
      std::cout << "*** Calculating " << static_cast<std::string>(result) << " for sequence " <<  sequence.Value << std::endl;
      RM.predict(si,prediction_method.Value);
      const double temp=si.temperature.predicted;
      const double omega=si.thermal_equivalence;
      std::cout << "omega=" << omega << " Tm=" << temp << std::endl;
      RM.dataset.update(si);
      RM.get_partition_function_and_helmholtz_energy();
      RM.dataset.print_temperatures(outfile);
      }
    }

  // ------------------------------------------------------------------
  // Prediction / regression over the data files given with -data.
  // ------------------------------------------------------------------
  if (datafile.provided() and result.check("prediction","sprediction","regression"))
    {
    if ((result.check("prediction","sprediction")) and (not reg_name.provided()) ) regfile.stream_open();
    if (result.check("regression")) regfile.stream_open();
    outfile.stream_open();
    verfile.stream_open();
    texfile.stream_open();
    std::cout << "*** Calculating " << static_cast<std::string>(result) << " for data file(s) ";
    std::copy(datafile.Value.begin(),datafile.Value.end(),std::ostream_iterator<std::string>(std::cout, ","));
    std::cout << std::endl;

    if (result.check("sprediction"))
      {
      RM.predict_and_print_sequentially(outfile);
      RM.print_verify(verfile);
      }
    else
      {
      if (exp_err.provided())
        {
        // NOTE(review): the -ee value is only printed here and is not passed to
        // randomise_dataset(); presumably it is picked up elsewhere -- confirm.
        RM.dataset.randomise_dataset();
        std::cout << "*** Randomising dataset with -ee=" << exp_err.Value << std::endl;
        }

      RM.recalculate_all_thermal_equivalences();
      if (result.check("regression"))
        {
        RM.calculate_regression_and_save(regfile); //otherwise reads file
        std::cout << "*** Calculating regression " << std::endl;
        }
      RM.predict_all();
      RM.calculate_and_print_verify(verfile);
      RM.dataset.print_temperatures(outfile);
      }
    RM.print_tex(texfile);
    }

  // ------------------------------------------------------------------
  // <y> for every entry of the data files.
  // ------------------------------------------------------------------
  if (datafile.provided() and result.check("averagey"))
    {
    outfile.stream_open();
    std::cout << "*** Calculating " << static_cast<std::string>(result) << " for data file(s) ";
    std::copy(datafile.Value.begin(),datafile.Value.end(),std::ostream_iterator<std::string>(std::cout, ","));
    std::cout << std::endl;
    if (exp_err.provided())
      {
      RM.dataset.randomise_dataset();//ATTENTION this may not be necessary
      std::cout << "*** Randomising dataset with -ee=" << exp_err.Value << std::endl;
      }
    RM.recalculate_all_thermal_equivalences();
    RM.calculate_print_all_average_y(outfile);
    }

  if (printusedpar.check(true))
    {
    used_par_file.stream_open();
    HTM.parameter_map.print_parameters_and_stats(used_par_file,"parameters");
    used_par_file.stream_close();
    }

  outfile.stream_close();
  print_cpu();

  return 0;
  }

