// $Id: SequenceDataset.h 1369 2024-11-29 14:20:00Z ge $
/// \file SequenceDataset.h
/// \brief This class manages datasets of experimental sequence information
///
/// $Revision: 1369 $
/// \author Gerald Weber <gweberbh@gmail.com>
#ifndef GBC_SEQUENCEDATASET_H
#define GBC_SEQUENCEDATASET_H "$Id: SequenceDataset.h 1369 2024-11-29 14:20:00Z ge $"

#include "SequenceInfo.h"
#include <deque>
#include <vector>
#include "MathAux.h"
#include<boost/algorithm/string.hpp>    
#include<boost/algorithm/string/regex.hpp>                                      
#include <boost/filesystem.hpp>
#include "RegexAux.h"
#include "ErrorCodes.h"
#include <algorithm>
#include "Regression.h"
#include "RegexPattern.h"

namespace gbc
{
/// \brief Plain record holding summary statistics for one named quantity.
///
/// Both constructors value-initialise every numeric member. Only error,
/// average, std_deviation, sqr_diff and relative_sqr_diff are written by
/// print_name() and operator<<; diff_deviation and sqrt_diff2 are stored but
/// never printed by this class.
template<class _Tp=double>
class DataStatistics
  {
  typedef _Tp value_type;
  public:
    std::string name;             ///< Prefix used for the column labels written by print_name()
    value_type error;
    value_type average;
    value_type diff_deviation;    ///< Not printed by print_name()/operator<<
    value_type std_deviation;
    value_type sqr_diff;
    value_type relative_sqr_diff;
    value_type sqrt_diff2;        ///< Not printed by print_name()/operator<<

  /// Creates an unnamed record with all values zero-initialised.
  DataStatistics(void)
    : name(), error(), average(), diff_deviation(), 
      std_deviation(), sqr_diff(), relative_sqr_diff(), sqrt_diff2() {}

  /// Creates a record labelled \a nm with all values zero-initialised.
  DataStatistics(std::string nm)
    : name(nm), error(), average(), diff_deviation(), 
      std_deviation(), sqr_diff(), relative_sqr_diff(), sqrt_diff2() {}

  /// \brief Writes the five column labels, separated by single spaces.
  ///
  /// The labels are in the same order as the values written by operator<<,
  /// so a header line and a data line have matching columns.
  inline void print_name(std::ostream &out) const
    {
    out << name << ".error " 
        << name << ".average "
        << name << ".std_deviation "
        << name << ".sqr_diff "          // the trailing space keeps this label apart from the next one
        << name << ".relative_sqr_diff"; 
    }

  /// \brief Writes the five printed values, separated by single spaces.
  ///
  /// Takes a const reference so that const records can be streamed as well.
  inline friend std::ostream& operator<<(std::ostream &out, const DataStatistics& dt)
    {
    out << dt.error << " " << dt.average << " " << dt.std_deviation << " " << dt.sqr_diff << " " << dt.relative_sqr_diff;
    return out;
    }
  };

template<class _SequenceInfo>
class SequenceDataset
  {
  public:
    typedef _SequenceInfo                        sequence_info_type;
    typedef typename sequence_info_type::sequence_type
                                                 sequence_type;
    typedef size_t                               sequence_length_type;
    typedef typename sequence_info_type::value_type   
                                                 value_type;
    typedef typename sequence_info_type::equivalence_map_type    equivalence_map_type;
    typedef typename sequence_info_type::matched_equivalence_map_type    matched_equivalence_map_type;
    typedef typename sequence_info_type::duplex_type    duplex_type;
    typedef typename sequence_info_type::base_pair_type base_pair_type;
    typedef typename sequence_info_type::salt_correction_scheme_type salt_correction_scheme_type;
    typedef typename sequence_info_type::string_set_type string_set_type;

    typedef value_type                           salt_concentration_type;
    typedef std::set<salt_concentration_type>    salt_set_type;
    typedef std::set<value_type>                 length_set_type;
    typedef std::deque<sequence_info_type>       sequence_info_deque_type;
    typedef std::deque<sequence_info_type*>      sequence_info_deque_ptype;
    typedef std::map<sequence_length_type,
                     sequence_info_deque_ptype>  length_dataset_type;
    typedef std::map<salt_concentration_type,
                     length_dataset_type>        salt_length_dataset_type;
    typedef std::deque<value_type>               species_concentration_deque_type;
    typedef std::deque<value_type>               value_deque_type;
    typedef typename salt_length_dataset_type::iterator 
                                                 iterator;
    typedef std::valarray<value_type>            vector_type;
    typedef DataStatistics<value_type>           datastatistics_type;
    typedef ReferenceSet<Duplex<> >                    duplex_ref_type;
    typedef ReferenceSet<NeighbourSequence<> >         neighbours_ref_type;
    typedef ReferenceSet<StrandPairSequence<> >        strandpair_ref_type;
    typedef std::map<std::string,std::pair<value_type,value_type> >       neighbours_value_ref_type;
    typedef std::map<std::string,std::pair<value_type,value_type> >       basepairs_value_ref_type;
    typedef std::deque<std::string>                    string_deque_type;
    typedef std::map<duplex_type,int>             trimers_map_type;
    typedef std::map<base_pair_type,int>          base_pair_count_type;
    typedef Regression<value_type>                linear_regression_type;


  std::string Dataset_identfifier;
  std::string Dataset_type;
  std::string File_comment;
  bool Uniform_species_concentration;
  bool Species_concentration_necessary;
  value_type Species_concentration_symmetry_factor;       ///< default = 1.0
  species_concentration_deque_type Species_concentration;
  bool Ct_externaly_provided;
  bool Uniform_salt_concentration;
  value_type Salt_concentration;                          ///< base salt concentration of dataset, if not given will be the one of the first sequence
  value_type Target_salt_ct;                              ///< if given, all salt concentrations of dataset will be adjusted
  std::string Dataset_filename;
  sequence_info_deque_type  Raw_dataset;
  sequence_info_deque_type  Original_raw_dataset;
  salt_set_type             Salt_concentration_set;          ///< Set of salt concentrations
  length_set_type           Lengths;                      ///< Set of lenght values

  length_dataset_type       Length_dataset;               ///< A map indexed by length
  salt_length_dataset_type  Salt_length_dataset;          ///< a map index by salt and length
  salt_length_dataset_type  Salt_dataset;                 ///< a map index by salt at any length by setting length=0, we do this to reuse existing code for Salt_length_dataset
  salt_length_dataset_type  Ct_dataset;                   ///< a map index by Ct at any length
  salt_length_dataset_type  Key_dataset;                  ///< a map index by a given Key at any length
  salt_length_dataset_type  Original_salt_length_dataset;

  datastatistics_type Temperature;
  datastatistics_type Entropy;
  datastatistics_type Enthalpy;
  datastatistics_type Gibbs;
  value_deque_type diff_Tm, diff_H, diff_S,  diff_G, rdiff_Tm, rdiff_H, rdiff_S;

  linear_regression_type DeltaG_X_Tm_reg; ///< Holds the linear regression of DeltaG X Tm
  
  int prediction_method; //Used in RegressionModel
  size_t Number_of_sequences_read;  //How man sequences were read, this number may differ from Raw_dataset.size()
  
  equivalence_map_type EquivalenceMap; ///< Map which collects base-pair equivalences 
  equivalence_map_type ReMap; ///< Map which collects base-pair equivalences in regex form
  matched_equivalence_map_type  MatchedEquivalenceMap; ///< Holds matched equivalences
  duplex_ref_type        Basepair_set;              ///< The set of different base pairs
  neighbours_ref_type    Reduced_neighbours_set;          ///< The set of different base pair neighbours
  neighbours_ref_type    Exact_neighbours_set;    ///< The set of different base pair neighbours
  neighbours_value_ref_type StatNNTemperature; ///< Average tempeature for NN type se Eq.(14) in martins24
  basepairs_value_ref_type  StatBPTemperature; ///< Average tempeature for BB type
  strandpair_ref_type    StrandPair_set;          ///< The set of different strand pair neighbours

  neighbours_ref_type    NPNeighbours_set;          ///< The set of different base pair neighbours, non-periodic
  neighbours_ref_type    PNeighbours_set;          ///< The set of different base pair neighbours, periodic
  neighbours_ref_type    ExclusivePNeighbours_set;  ///< Bbase pair neighbours that only exists because of periodicity
  strandpair_ref_type    NPStrandPair_set;          ///< The set of different strand pair neighbours, non-periodic
  trimers_map_type       Trimers_map;
  base_pair_count_type   Terminal5_bp_count;
  base_pair_count_type   Terminal3_bp_count;
  base_pair_count_type   Terminal5_bp_reduced_count;
  base_pair_count_type   Terminal3_bp_reduced_count;
  salt_correction_scheme_type Salt_correction_scheme;
  bool                   Prediction_with_salt_correction; ///< if true applies salt correction on predicted Tm, default is to apply on input Tm
  bool                   Debug;
  bool                   ParameterReadjust;       ///< Performs calculate_av_tm_nn_type
  string_set_type        CG_equivalent_set;       ///< String list of BP that should be considered equivalent do CG, this list should be symmetry reduced
  string_set_type        AT_equivalent_set;       ///< String list of BP that should be considered equivalent do CG, this list should be symmetry reduced
  string_deque_type      Data_file_deque;         ///< List of files
  std::string PrintFlags;           ///< Flags that control what is going to be printed

  boost::regex Regex_temperature, Regex_gibbs, Regex_tm_gibbs, Regex_comment, Regex_comment_line, Regex_group, Regex_full_key, Fields5,  Fields7;

  
  /// \brief Builds an empty dataset, optionally labelled with an identifier.
  ///
  /// Sets the default flags, compiles the regular expressions used when
  /// parsing data files and seeds the CG/AT equivalence lists.
  /// The initialisers are written in member declaration order, which is the
  /// order in which they are executed anyway, and prediction_method is now
  /// value-initialised instead of being left indeterminate.
  /// \param id stored in Dataset_identfifier
  SequenceDataset(const std::string id=std::string()): Dataset_identfifier(id), Dataset_type(),
    Uniform_species_concentration(true), Species_concentration_necessary(true), Species_concentration_symmetry_factor(1.0), Species_concentration(),
    Ct_externaly_provided(false), Uniform_salt_concentration(true), Salt_concentration(), Target_salt_ct(),
    prediction_method(), Number_of_sequences_read(),
    Salt_correction_scheme(sequence_info_type::nocorrection), Prediction_with_salt_correction(false), Debug(false), ParameterReadjust(true), 
    Regex_temperature("^(temperature$|temperature\\s+#.*$)"), //Identifies files holding Tm
    Regex_gibbs("^(gibbs$|gibbs\\s+#.*$)"),                   //Identifies files holding DeltaH and DeltaS
    Regex_tm_gibbs("^(temperature\\+gibbs|temperature\\+gibbs\\s+#.*$)"), //Identifies files holding Tm, DeltaH and DeltaS
    Regex_comment("#\\s*([^\\s].*)$"),                        //captures the field after # that is used as comment, stripping spaces between # and the first character
    Regex_comment_line("^\\s*#"),                             //Identifies a line starting with #, leading spaces are allowed
    Regex_group("&(\\d+)"),                                   //inside the comment field there may be group key assignments of the form &1, &2 etc
    Regex_full_key("\\s*&\\d+\\s*"),                          //used for removing the key and the extra spaces that may exist around it
    Fields5("^([^\\s]+)\\s+([^\\s]+)\\s+([^\\s]+)\\s+([^\\s]+)\\s+([^\\s]+)(\\s*$|\\s+#.+$)"),
    Fields7("^([^\\s]+)\\s+([^\\s]+)\\s+([^\\s]+)\\s+([^\\s]+)\\s+([^\\s]+)\\s+([^\\s]+)\\s+([^\\s]+)(\\s*$|\\s+#.+$)")

      {
      CG_equivalent_set.insert("CG");
      CG_equivalent_set.insert("dCrG"); //DNA-RNA
      CG_equivalent_set.insert("dGrC"); //DNA-RNA
      AT_equivalent_set.insert("AT"); 
      AT_equivalent_set.insert("AU");         
      }
  
   // This function collects the different salt concentrations in the dataset, and for each makes
   // a list of available sequence lengths. This is used for the regression method of the
   // Peyrard-Bishop thermodynamic equivalence.
   void make_salt_length_map(void)
      {
      Salt_length_dataset.clear();
      Salt_dataset.clear();
      Ct_dataset.clear();
      Key_dataset.clear();
      typename sequence_info_deque_type::iterator si=Raw_dataset.begin();
      for(si=Raw_dataset.begin(); si != Raw_dataset.end(); si++)
        { 
        if (Salt_length_dataset.find(si->salt_concentration["Na+"]) == Salt_length_dataset.end())
          {
          Salt_concentration_set.insert(si->salt_concentration["Na+"]);
          Salt_length_dataset.insert(typename salt_length_dataset_type::value_type(si->salt_concentration["Na+"],length_dataset_type()));
          }

        if (Salt_length_dataset[si->salt_concentration["Na+"]].find(si->length) == Salt_length_dataset[si->salt_concentration["Na+"]].end())
          {
          Lengths.insert(si->length);
          Salt_length_dataset[si->salt_concentration["Na+"]].insert(typename length_dataset_type::value_type(si->length,sequence_info_deque_ptype()));
          }
      
        // Populate the map.
        Salt_length_dataset[si->salt_concentration["Na+"]][si->length].push_back(&(*si));
        Salt_dataset[si->salt_concentration["Na+"]][0].push_back(&(*si));//always at lenght=0
        Ct_dataset[si->species_concentration][0].push_back(&(*si));//always at lenght=0
        if (si->group_key) Key_dataset[si->group_key][0].push_back(&(*si));//always at lenght=0, only for group_key != 0
        }
      }

    inline void select_salt_correction(std::string scheme)
      {
      typename std::map<std::string,salt_correction_scheme_type> 
        available_schemes = { {"nocorrection",   sequence_info_type::nocorrection}, 
                              {"schildkraut65",  sequence_info_type::schildkraut65},
                              {"santalucia98",   sequence_info_type::santalucia98},
                              {"owczarzy04eq19", sequence_info_type::owczarzy04eq19},
                              {"owczarzy04eq20", sequence_info_type::owczarzy04eq20},
                              {"owczarzy04eq21", sequence_info_type::owczarzy04eq21},
                              {"owczarzy04eq22", sequence_info_type::owczarzy04eq22}, 
                              {"chen13eq19",     sequence_info_type::chen13eq19},
                              {"chen13eq20",     sequence_info_type::chen13eq20},
                              {"chen13eq21",     sequence_info_type::chen13eq21},
                              {"chen13eq22",     sequence_info_type::chen13eq22},
                              {"nakano99",       sequence_info_type::nakano99},
                              {"tm_ln",          sequence_info_type::tm_ln},
                              {"rec_tm_ln",      sequence_info_type::rec_tm_ln}
                           };
      
      if (available_schemes.find(scheme) == available_schemes.end())
        {
        CERR_ERROR(ERRSCSDNE) << " Error: Salt correction scheme " << scheme << " does not exists" << std::endl;
        CERR << "Available schemes are: "; 
        typename std::map<std::string,salt_correction_scheme_type>::const_iterator it;
        for(it=available_schemes.begin(); it!=available_schemes.end(); it++)
          CERR << it->first << " ";
        CERR << std::endl;
        CERR_TERM
        }
      else Salt_correction_scheme=available_schemes[scheme];
      
      }

    // This is the main function that interprets and adds information
    /// \brief Builds a sequence record from raw values and appends it to Raw_dataset.
    ///
    /// The first call fixes the reference species and salt concentrations; later
    /// calls clear the uniformity flags when a different value is seen. The
    /// dataset-wide sets and counters (base pairs, neighbours, strand pairs,
    /// trimers, terminal base pairs) are updated from the new record.
    /// \return a copy of the record that was appended to Raw_dataset
    sequence_info_type insert_si(const std::string& seq,        ///< The sequence string 
             const std::string& comp,       ///< The complimentary sequence string 
             const value_type& salt_conc,   ///< The salt concentration
             const value_type& temperature, ///< The experimental melting temperature
             const value_type& enthalpy,    ///< The experimental enthalpy
             const value_type& entropy,     ///< The experimental entropy
             const value_type& spec_conc,   ///< Species concentration.
             const std::string& from_file=std::string(), ///< Optional file origin
             const std::string& identification=std::string(),
             const int& group_key=0)
      {
      // If there is no reference species concentration yet, then use the current one.
      if (Species_concentration.empty()) 
        {
        Species_concentration.push_back(spec_conc);
        COUT_INFO(INFOSRSC) << " Setting reference species concentration to " <<  spec_conc << " from sequence " << seq << std::endl;
        }
      // Check if species concentration is uniform for dataset
      if (Uniform_species_concentration)
        {
        if (spec_conc != Species_concentration[0]) 
          {
          Uniform_species_concentration=false;
          COUT_INFO(INFODHMSC) << " dataset " << Dataset_filename << " has multiple species concentrations" << std::endl;
          }
        }

      // If there is no reference salt concentration yet, then use the current one.
      if (Salt_concentration == value_type()) 
        {
        Salt_concentration=salt_conc;
        COUT_INFO(INFOSBSCT) << " Setting reference salt concentration to " <<  Salt_concentration << " from sequence " << seq << std::endl;
        }
      // Check if salt concentration is uniform for dataset
      if (Uniform_salt_concentration)
        {
        if (salt_conc != Salt_concentration) 
          {
          Uniform_salt_concentration=false;
          CERR_WARN(WDHMSC) << " dataset " << Dataset_filename << " has multiple salt concentrations" << std::endl;
          }
        }

      // Hand the dataset-wide settings to the new record before it analyses the sequence.
      sequence_info_type si;
      si.PrintFlags=PrintFlags;
      si.CG_equivalent_set=CG_equivalent_set;
      si.AT_equivalent_set=AT_equivalent_set; 

      si.pEquivalenceMap=&EquivalenceMap;//get hold of the trimer list equivalence map from read_xml
      si.pReMap=&ReMap;
      si.pMatchedEquivalenceMap=&MatchedEquivalenceMap;


      si.sequence=sequence_type(seq,comp); //this will become obsolete
      si.insert_sequence(seq,comp);
      si.group_key=group_key;
      si.identification=identification;
      si.from_file=from_file;

      //Updates dataset statistics from SequenceInfo analysis
      Basepair_set.insert_from_map(si.Basepair_set);
      Reduced_neighbours_set.insert_from_map(si.Reduced_neighbours_set);
      Exact_neighbours_set.insert_from_map(si.Exact_neighbours_set);
      StrandPair_set.insert_from_map(si.StrandPair_set);
      if (si.Periodic)
        {
        NPNeighbours_set.insert_from_map(si.NPNeighbours_set);
        PNeighbours_set.reference_insert(si.Periodicity_NN);
        // Rebuild the list of neighbours that are present in the periodic set only.
        ExclusivePNeighbours_set.clear();
        typename neighbours_ref_type::const_iterator pn;
        for (pn=PNeighbours_set.begin(); pn != PNeighbours_set.end(); pn++)
          {
          if (NPNeighbours_set.count(pn->first) == 0)
            {
            ExclusivePNeighbours_set.reference_insert(pn->first,pn->second);
            }
          }
        // Consistency check; a mismatch is reported here and processing continues.
        if ( (ExclusivePNeighbours_set.size()+NPNeighbours_set.size()) != Reduced_neighbours_set.size())
          {
          CERR_ERROR(ERRESBER) 
          << "ExclusivePNeighbours_set.size()+NPNeighbours_set.size() should be equal to Reduced_neighbours_set.size() but we got " << ExclusivePNeighbours_set.size() << "+" << NPNeighbours_set.size() << "!=" << Reduced_neighbours_set.size() << std::endl;
          CERR << "This error indicates a problem in the interpretation of the sequence dataset" << std::endl;
          }
        NPStrandPair_set.insert_from_map(si.NPStrandPair_set);
        }

      // Accumulate the trimer counts of this record into the dataset totals.
      for(typename trimers_map_type::iterator trim=si.Trimers_map.begin(); trim != si.Trimers_map.end(); trim++)
        {
        Trimers_map[trim->first] += trim->second;
        }
  
        
      Terminal5_bp_count[si.Terminal5_bp]++;
      Terminal3_bp_count[si.Terminal3_bp]++;
      Terminal5_bp_reduced_count[si.Terminal5_bp_reduced]++;
      Terminal3_bp_reduced_count[si.Terminal3_bp_reduced]++;
      
      
      si.determine_alpha();
      
      // A zero Ct falls back to the reference Ct. Species_concentration cannot be
      // empty at this point (it is seeded at the top of this function), so the
      // else branch is kept as a safeguard only.
      if (spec_conc == value_type()) 
        {
        if (not Species_concentration.empty()) 
          {
          CERR_WARN(WSHNCURC) << " Sequence " << seq << "/" << comp << " has no Ct, using the reference Ct=" <<  Species_concentration[0] << std::endl;
          si.species_concentration=Species_concentration[0];
          }
        else
          {
          CERR_WARN(WSHNCTCMF) << " Sequence " << seq << "/" << comp << " has no Ct, Tm calculation may fail" << std::endl;
          si.species_concentration=value_type();
          }
        }
      else si.species_concentration=spec_conc;

      if (si.Self_complementary and Species_concentration_symmetry_factor != 1.0) 
        {
        CERR_DEBUG(DSED_SISCMC) << " Sequence is self-complementary, multiplying Ct=" << si.species_concentration << " by " << Species_concentration_symmetry_factor << std::endl;
        si.species_concentration *= Species_concentration_symmetry_factor;
        }

      
      si.temperature.adjusted=si.temperature.measured=temperature;
      si.enthalpy.measured=enthalpy;
      si.entropy.measured=entropy;
      if (not Species_concentration.empty()) si.Ref_concentration=Species_concentration[0];//Ref_concentration not in use

      si.Salt_correction_scheme=Salt_correction_scheme;
      si.Prediction_with_salt_correction=Prediction_with_salt_correction;
      si.Target_salt_ct=Target_salt_ct;
      si.salt_concentration["Na+"]=salt_conc;
      // By default the correction is applied to the input Tm; it is skipped here
      // when it has been requested for the predicted Tm instead.
      if (Salt_correction_scheme != sequence_info_type::nocorrection and not Prediction_with_salt_correction)
        si.apply_salt_correction();


      Raw_dataset.push_back(si);
      return si;
      }
      
    /// Update
    void update(sequence_info_type &si)
      {
      // Overwrite every stored record whose sequence compares equal to si.sequence.
      for (sequence_info_type &stored : Raw_dataset)
        {
        if (si.sequence == stored.sequence) stored = si;
        }
      }


    /// \brief Adds a new sequence and its experimental data to a map.
    ///
    /// This is one of the main functions of this class. It adds a new sequence
    /// and its experimental melting temperature, together with salt and species
    /// concentration to a map. 
    sequence_info_type add_temperature(const std::string& seq,        ///< The sequence string 
             const std::string& comp,       ///< The complementary sequence string 
             const value_type& salt_conc,   ///< The salt concentration in mmol/L
             const value_type& temperature, ///< The experimental melting temperature in K
             const value_type& spec_conc,   ///< Species concentration in 1e-6 M
             const std::string& from_file=std::string(), ///< Optional file origin
             const std::string& identification=std::string(), ///< Optional sequence identification
             const int& group_key=0)
      {
      const bool ct_is_zero = (spec_conc == value_type());
      if (ct_is_zero and Species_concentration_necessary)
        CERR_WARN(WSCIZ) << " warning: species concentration is zero (Ct=0) " << std::endl;

      // No enthalpy/entropy for this kind of record: both are passed as zero.
      insert_si(seq,comp,salt_conc,temperature,0.0,0.0,spec_conc,from_file,identification,group_key);

      // Hand back a copy of the record that insert_si just appended.
      return Raw_dataset.back();
      }

    /// \brief Adds a new sequence and its experimental data to a map.
    ///
    /// This is one of the main functions of this class. It adds a new sequence
    /// and its experimental enthalpy/entropy, together with salt and species
    /// concentration to a map. 
    sequence_info_type add_gibbs(const std::string& seq,        ///< The sequence string 
             const std::string& comp,       ///< The complementary sequence string 
             const value_type& salt_conc,   ///< The salt concentration
             const value_type& enthalpy,    ///< The experimental enthalpy
             const value_type& entropy,     ///< The experimental entropy
             const value_type& spec_conc,   ///< Species concentration in 1e-6 M
             const std::string& from_file=std::string(), ///< Optional file origin
             const std::string& identification=std::string(), ///< Optional sequence identification
             const int& group_key=0)
      {
      const bool ct_given = (spec_conc != value_type());
      if (not ct_given)
         CERR_WARN(WSCIZ) << " warning: species concentration is zero (Ct=0) " << std::endl;

      // No measured temperature for this kind of record: it is passed as zero.
      insert_si(seq,comp,salt_conc,0.0,enthalpy,entropy,spec_conc,from_file,identification,group_key);

      // Work on the stored record itself so the derived temperature is kept in Raw_dataset.
      sequence_info_type &stored = Raw_dataset.back();
      if (ct_given) stored.set_temperature_from_gibbs();
      return stored;
      }
      
    /// \brief Adds a new sequence and all of its experimental data to the dataset.
    ///
    /// This is one of the main functions of this class. It adds a new sequence
    /// with both its experimental melting temperature and its enthalpy/entropy,
    /// together with salt and species concentration, to the dataset.
    sequence_info_type add_all(const std::string& seq,        ///< The sequence string 
             const std::string& comp,       ///< The complementary sequence string 
             const value_type& salt_conc,   ///< The salt concentration
             const value_type& temperature, ///< The experimental melting temperature in K
             const value_type& enthalpy,    ///< The experimental enthalpy
             const value_type& entropy,     ///< The experimental entropy
             const value_type& spec_conc,   ///< Species concentration in 1e-6 M
             const std::string& from_file=std::string(), ///< Optional file origin
             const std::string& identification=std::string(), ///< Optional sequence identification
             const int& group_key=0)
      {
      const bool ct_is_zero = (spec_conc == value_type());
      if (ct_is_zero)
         CERR_WARN(WSCIZ) << " warning: species concentration is zero (Ct=0) " << std::endl;

      // Temperature, enthalpy and entropy are all forwarded as measured values.
      insert_si(seq,comp,salt_conc,temperature,enthalpy,entropy,spec_conc,from_file,identification,group_key);

      // Hand back a copy of the record that insert_si just appended.
      return Raw_dataset.back();
      }

    /// Despite its name no xml is read, it is an ordinary data file organized in columns  
    /// Reads one data file, appends its records to Raw_dataset and rebuilds the lookup maps.
    /// \param xmlfile    path of the data file
    /// \param spconc     species concentrations to use; when empty the dataset's own
    ///                   Species_concentration is used, otherwise Ct_externaly_provided is set
    /// \param multspconc factor handed to read_xml_line for scaling species concentrations
    void read_xml(std::string xmlfile, 
                  species_concentration_deque_type spconc=species_concentration_deque_type(), 
                  double multspconc=1)
      {
      if (spconc.empty()) spconc=Species_concentration;
      else Ct_externaly_provided=true;

      std::ifstream xml;
      
      open_and_read_xml_header(xml,xmlfile);
 
      // good() instead of !eof(): a stream that fails without reaching end-of-file
      // would otherwise never set eofbit and this loop would spin forever.
      while(xml.good()) read_xml_line(xml,xmlfile,spconc,multspconc);
          
      make_salt_length_map();
      // Keep a snapshot of the records as they were right after reading.
      Original_raw_dataset=Raw_dataset;
      }
    
    //Open and reads header info xml file
    inline void open_and_read_xml_header(std::ifstream &xml, const std::string& xmlfile)
      {
      // Dataset_filename accumulates the names of all files read, comma-separated.
      if (not Dataset_filename.empty()) Dataset_filename += "," + xmlfile;
      else Dataset_filename=xmlfile;

      if ( not boost::filesystem::exists(xmlfile.c_str()) ) 
        {
        CERR_ERROR(ERRFXDNE) << "File " << xmlfile << " does not exist" << std::endl;
        CERR_TERM
        }
      else
        xml.open(xmlfile.c_str());

      // The first line names the kind of data the file holds.
      std::string header_line;
      std::getline (xml,header_line);
      if      (boost::regex_match(header_line,Regex_temperature)) Dataset_type="temperature";
      else if (boost::regex_match(header_line,Regex_gibbs))       Dataset_type="gibbs";
      else if (boost::regex_match(header_line,Regex_tm_gibbs))    Dataset_type="temperature+gibbs";
      else
        {
        CERR_ERROR(ERRFDFXNR) << " Format of data file " << xmlfile.c_str() << " not recognized, make sure first line is either \'temperature\', \'gibbs\' or \'temperature+gibbs\'" << std::endl;
        CERR_TERM
        }

      COUT_INFO(INFORDFL) << " Reading data file " <<  xmlfile.c_str() << " of type \"" << Dataset_type << "\""<< std::endl;

      // Whatever follows '#' on the header line is kept as the file comment.
      File_comment=reg_match_string(header_line,Regex_comment);
      }
    
    //Reads single line from xml file
    inline void read_xml_line(std::ifstream &xml, const std::string& from_file, 
                       species_concentration_deque_type &spconc, double &multspconc)
      {
      boost::regex fields;
      if (Dataset_type == "temperature" or Dataset_type == "gibbs") fields=Fields5; 
      else fields=Fields7; //Dataset_type == "temperature+gibbs"
      
      CERR_DEBUG(DSED_RXML) << "Dataset type=" << Dataset_type << std::endl;

      //Field that need to be assigned to SequenceInformation
      int group_key=0;
      std::string seq, comp, id, identification;
      double temp=0.0, salt=0.0, concentration=0.0, enthalpy=0.0, entropy=0.0;
      
      //Read the line
      std::string line;
      std::getline (xml,line);
      
      if ( (line.length() > 0) && (line[0] != '#') ) //also ignore lines starting with #
        {
        boost::smatch found;
        if (boost::regex_search(line,found,fields))
          {
          if (found.size() == 7)
            {
            seq=found[1];
            comp=found[2];
            id=found[6];
            if (Dataset_type == "temperature")
              {
              assign(temp,found[3]);//in C
              assign(salt,found[4]);//in mM
              assign(concentration,found[5]);//\mu M
              }
            else //Dataset_type == "gibbs"
              {
              assign(enthalpy,found[3]);//in kcal/mol
              assign(entropy,found[4]);//in cal/(mol.K) or eu
              assign(salt,found[5]);//in mM or mmol/L
              }
            }
          if (found.size() == 9) //Dataset_type == "temperature+gibbs"
            {
            seq=found[1];
            comp=found[2];
            assign(temp,found[3]);//in C
            assign(salt,found[4]);//in mM
            assign(concentration,found[5]);//\mu M
            assign(enthalpy,found[6]);//in kcal/mol
            assign(entropy,found[7]);//in cal/(mol.K) or eu
            id=found[8];
            }
              
          //Parse the extra comment in line
          boost::smatch fcmd;
          if (boost::regex_search(id,fcmd,Regex_comment)) 
            {
            identification=fcmd[1];
            boost::smatch fk;
            if (boost::regex_search(identification,fk,Regex_group)) 
              {
              assign(group_key,fk[1]);
              std::string fmt{""};
              identification=boost::regex_replace(identification, Regex_full_key, fmt); //removes the group_key
              }
            }
            
          if (!xml.eof())
            {
            if (Dataset_type == "temperature" or Dataset_type == "temperature+gibbs")
              {
              if (not spconc.empty() and Ct_externaly_provided) 
                {
                COUT_INFO(INFORSCW) << " Replacing sequence Ct " <<  concentration << " with " << spconc[0] << " from file" << std::endl;
                concentration = spconc[0]; //replace species concentration of the file
                }
              if (multspconc != 1.0) concentration *= multspconc; //multiply species concentration of the file
              if (Dataset_type == "temperature") 
                add_temperature(seq,comp,salt,temp,concentration,from_file,identification,group_key);
              if (Dataset_type == "temperature+gibbs") 
                add_all(seq,comp,salt,temp,enthalpy,entropy,concentration,from_file,identification,group_key);
              }
            if (Dataset_type == "gibbs")
              {
              if (spconc.size() > 0)
                {
                // Here we add the sequence for each Ct in the list
                for (auto ctlist : spconc) add_gibbs(seq,comp,salt,enthalpy,entropy,(ctlist)*multspconc,from_file,identification,group_key);
                }
              else
                {
                //adding with zero species_concentration
                add_gibbs(seq,comp,salt,enthalpy,entropy,0.0,from_file,identification,group_key);
                }
              }
            }
          Number_of_sequences_read++;
          }
        else
          {
          if (not boost::regex_search(line,found,Regex_comment_line))
            {
            CERR_ERROR(ERRUTPL) << "unable to parse line [" << line << "]" << std::endl; 
            CERR_TERM
            }
          }
        }
      }
     
      
    inline void set_list_of_data_files(string_deque_type &data_file_deque)
      {
      // Replace the stored list of data files with a copy of the given one.
      this->Data_file_deque = data_file_deque;
      }

    inline void read_list_of_data_files(string_deque_type &data_file_deque, 
                  species_concentration_deque_type spconc=species_concentration_deque_type(), 
                  double multspconc=1)
      {
      // Store the list first, then read every file in it.
      set_list_of_data_files(data_file_deque);
      read_all_data_files(spconc,multspconc);
      }
      
    /// \brief Passes every entry of Data_file_deque to read_xml, in order.
    ///
    /// Reports ERRNOSEQR if Number_of_sequences_read is still zero afterwards.
    /// \param spconc species concentrations forwarded to read_xml
    /// \param multspconc species concentration multiplier forwarded to read_xml
    inline void read_all_data_files( 
                  species_concentration_deque_type spconc=species_concentration_deque_type(), 
                  double multspconc=1)
      {
      const auto last_file=Data_file_deque.end();
      for (auto file_it=Data_file_deque.begin(); file_it != last_file; ++file_it)
        {
        auto file_name=*file_it; // read_xml receives a copy, not the stored entry
        read_xml(file_name,spconc,multspconc);
        }

      if (Number_of_sequences_read != 0) return;

      CERR_ERROR(ERRNOSEQR) << "no sequences found in datafiles: " << boost::algorithm::join(Data_file_deque,", ")  << std::endl;
      CERR_TERM
      }
      
      
    void read_rules(std::string rulefile)
      {
      std::ifstream rules;
      if ( boost::filesystem::exists(rulefile) ) 
        rules.open(rulefile.c_str());
      else 
        {
        CERR_ERROR(ERRFRDNE) << "File " << rulefile << " does not exist" << std::endl;
        CERR_TERM
        }
      if (!rules.is_open())
        {
        CERR_ERROR(ERRCNRR) << " Could not read " << rulefile << std::endl;
        CERR_TERM
        }

      int new_equivalence_rules=0;
      while(!rules.eof())
        {
        std::string line;
        std::getline(rules,line);
        if (reg_match(line,"^\\+"))
          {
          new_equivalence_rules++;
          std::string bp=reg_match_string(line,SEQUENCEDATASET_BP_MATCH_PATTERN);
          std::string equiv=reg_match_string(line,SEQUENCEDATASET_EQUIV_MATCH_PATTERN);
          if (bp.empty() or equiv.empty())
            {
            CERR_ERROR(ERRCNIRL) << "Could not interpret rule line [" << line << "] check for formatting mistakes" << std::endl
                                 << "The first part should start with + and contain a base pair in notation BP^X, found=" << bp << std::endl
                                 << "The second part a list of trimers comma-separated (spaces allowed), found=" << equiv << std::endl;
            CERR_TERM
            }
          std::deque<std::string> extracted;
          boost::algorithm::split_regex(extracted, equiv, boost::regex("\\s*,\\s*"));
          for(auto ei : extracted)
            {
            CERR_DDEBUG(DSED_EXSPLIT) << "extracted split = [" << ei << "]" << std::endl;
            std::string el = reg_rm_white_space(ei);
            std::string re=reg_match_string(el,"^/(.+)/$");
            if (not re.empty()) ReMap[re]=bp; 
            else 
              {
              //Non-regex trimers have a fixed structure like AUA/UGU and therefore can be checked for various problems
              if (el.length() < 7) 
                {
                CERR_ERROR(ERRPMT) << " Possibly malformed trimer " << el << " has less than 7 characters, check for formatting mistakes" << std::endl;
                CERR_TERM
                }
                
              //check if the reverse precedes lexically
              std::string le=el;
              std::reverse(le.begin(),le.end());
              if (le < el)
                {
                CERR_WARN(WCTITPA) << "Check trimer \"" << el << "\" from rule " << bp << ", its reverse \"" << le << "\" precedes alphabetically, therefore it may never be matched to any existing trimer" << std::endl; 
                }
                
              //Check if same trimer appears in other rules, may not be a mistake but may cause unexpected results
              auto fnd=EquivalenceMap.find(el);
              if (fnd != EquivalenceMap.end())
                {
                CERR_WARN(WTFRAEFAR) << "Trimer \"" << el << "\" for rule " << bp << " already exists for a another rule " << fnd->second << ", check your trimer rules for duplicates" << std::endl; 
                }
                
              //here we test if the central base-pair of a trimer matches with the base-pair from the rule
              base_pair_type bsp(bp);
              bsp.reduce_to_smallest_symmetry();
              duplex_type trimer(el,base_pair_type::simplify_symmetry);
              if (bsp.formatted_string_no_version() != trimer[1].formatted_string_no_version())
                {
                CERR_WARN(WCBPTWDNMR) << "Central base pair of trimer \"" << el << "\" is " << trimer[1].formatted_string_no_version() << " which does not match rule " << bp << ", check your trimer rules" << std::endl;
                }
                
              EquivalenceMap[el]=bp;
              }
            }
          }
        }
      typename equivalence_map_type::iterator eq=EquivalenceMap.begin();
      if ( (eq != EquivalenceMap.end()) and (new_equivalence_rules > 0) )
        {
        COUT_INFO(INFOTRRF) << new_equivalence_rules << " trimer rules read from file " << rulefile << std::endl;
        }
      int elements = EquivalenceMap.size() + ReMap.size();
      if (  elements  < 50)
        {
        for(auto rm : ReMap)          COUT_INFO(INFOTRMAT) << " " << rm.first << " <regex> " << rm.second << std::endl;
        for(auto em : EquivalenceMap) COUT_INFO(INFOTRMAT) << " " << em.first << " <=> "     << em.second << std::endl;
        }
      else
        {
        COUT_INFO(INFOTWMTL) << "Trimer equivalence map too long (" << elements << " elements), output suppressed" << std::endl;
        }
      }
      
    /// \brief Calls read_rules once for every entry of the given list, in order.
    /// \param par_file_deque list of rule file names
    void read_rules_from_list_of_files(string_deque_type &par_file_deque)
      {
      const auto last_file=par_file_deque.end();
      for (auto file_it=par_file_deque.begin(); file_it != last_file; ++file_it)
        {
        this->read_rules(*file_it);
        }
      }


    void randomise_dataset(void)
      {
      Raw_dataset=Original_raw_dataset;
      make_salt_length_map();

      std::vector<value_type> temperature_change,enthalpy_change,entropy_change;
      typename sequence_info_deque_type::iterator sequence_info_it=Raw_dataset.begin();
      for(sequence_info_it=Raw_dataset.begin(); sequence_info_it != Raw_dataset.end(); sequence_info_it++)
        {
         if (Dataset_type == std::string("temperature"))
           {
           temperature_change.push_back(gauss_ran(Temperature.error));
           sequence_info_it->temperature.measured += temperature_change.back();
           sequence_info_it->temperature.adjusted=sequence_info_it->temperature.measured;
           }
         if (Dataset_type == std::string("gibbs"))
           {
           enthalpy_change.push_back(gauss_ran(sequence_info_it->enthalpy.measured*Enthalpy.error*0.01));
           entropy_change.push_back(gauss_ran(sequence_info_it->entropy.measured*Entropy.error*0.01));
           sequence_info_it->enthalpy.measured += enthalpy_change.back();
           sequence_info_it->entropy.measured += entropy_change.back() ;
           value_type tmpch=sequence_info_it->temperature.measured;
           sequence_info_it->set_temperature_from_gibbs();
           temperature_change.push_back(tmpch-sequence_info_it->temperature.measured);
           }
         }
      Temperature.std_deviation=standard_deviation(temperature_change);
      Enthalpy.std_deviation=standard_deviation(enthalpy_change);
      Entropy.std_deviation=standard_deviation(entropy_change);
      }

    /// \brief Appends the absolute and relative prediction differences of one sequence.
    ///
    /// The temperature difference |predicted - adjusted| is always recorded in diff_Tm,
    /// and its relative value in rdiff_Tm (0 when the adjusted temperature is zero).
    /// Enthalpy and entropy differences are recorded in diff_H/rdiff_H and
    /// diff_S/rdiff_S only when the respective measured value is non-zero.
    /// \param si sequence whose predicted and measured values are compared
    inline void collect_tm_difference(sequence_info_type &si)
      {
      // Temperature: always collected
      const value_type abs_tm=fabs(si.temperature.predicted-si.temperature.adjusted);
      diff_Tm.push_back(abs_tm);
      value_type rel_tm=0.0;
      if (si.temperature.adjusted != 0.0) 
        {
        rel_tm=fabs(abs_tm/si.temperature.adjusted);
        }
      rdiff_Tm.push_back(rel_tm);

      // Enthalpy: only when a measured value is present (non-zero), so the division is safe
      if (si.enthalpy.measured != 0.0)
        {
        const value_type abs_h=fabs(si.enthalpy.predicted - si.enthalpy.measured);
        diff_H.push_back(abs_h);
        const value_type rel_h=fabs(abs_h/si.enthalpy.measured);
        rdiff_H.push_back(rel_h);
        }

      // Entropy: same rule as enthalpy
      if (si.entropy.measured != 0.0)
        {
        const value_type abs_s=fabs(si.entropy.predicted - si.entropy.measured);
        diff_S.push_back(abs_s);
        const value_type rel_s=fabs(abs_s/si.entropy.measured);
        rdiff_S.push_back(rel_s);
        }
      }
      
    inline void calculate_tm_statistics(void)
      {
      size_t number_of_sequences=Number_of_sequences_read;
      if ( (Number_of_sequences_read == 0) and (Raw_dataset.size() !=0) ) number_of_sequences=Raw_dataset.size();
        
        
      Temperature.average=average(diff_Tm);
      Temperature.diff_deviation=standard_deviation(diff_Tm);
      Temperature.sqr_diff=sqr_sum(diff_Tm);
      Temperature.relative_sqr_diff=sqr_sum(rdiff_Tm);
      Temperature.sqrt_diff2=sqrt(Temperature.sqr_diff/number_of_sequences);
      if (not diff_H.empty())
        {
        Enthalpy.average=average(diff_H);
        Enthalpy.diff_deviation=standard_deviation(diff_H);
        Enthalpy.sqr_diff=sqr_sum(diff_H);
        Enthalpy.relative_sqr_diff=sqr_sum(rdiff_H);
        Enthalpy.sqrt_diff2=sqrt(Enthalpy.sqr_diff/number_of_sequences);
        }
      if (not diff_S.empty())
        {
        Entropy.average=average(diff_S);
        Entropy.diff_deviation=standard_deviation(diff_S);
        Entropy.sqr_diff=sqr_sum(diff_S);
        Entropy.relative_sqr_diff=sqr_sum(rdiff_S);
        Entropy.sqrt_diff2=sqrt(Entropy.sqr_diff/number_of_sequences);
        }
      if (not diff_G.empty())
        {
        Gibbs.average=average(diff_G);
        Gibbs.diff_deviation=standard_deviation(diff_G);
        Gibbs.sqr_diff=sqr_sum(diff_G);
        Gibbs.sqrt_diff2=sqrt(Gibbs.sqr_diff/number_of_sequences);
        }
      if (ParameterReadjust) 
        {
        calculate_av_tm_nn_type();
        calculate_av_tm_bp_type();
        }
      }

    /// \brief Calculates, for each nearest-neighbour (NN) key, the average adjusted temperature
    /// of the sequences containing it.
    ///
    /// The keys are taken from Reduced_neighbours_set of every sequence in Raw_dataset.
    /// StatNNTemperature is cleared first; afterwards each key holds the average (first)
    /// and the standard deviation (second).
    /// See Eq(14) martins24
    inline void calculate_av_tm_nn_type(void)
      {
      StatNNTemperature.clear();

      // Adjusted temperatures grouped by NN key
      std::map<std::string,std::deque<value_type> > temperatures_by_nn; 
      for(auto &entry : Raw_dataset)
        {
        for(auto &neighbour: entry.Reduced_neighbours_set)
          {
          temperatures_by_nn[std::string(neighbour.first)].push_back(entry.temperature.adjusted);
          }
        }

      for(auto &group: temperatures_by_nn) 
        {
        auto &stat=StatNNTemperature[group.first];
        stat.first =average(group.second);
        stat.second=standard_deviation(group.second);
        }
      }

    /// \brief Calculates, for each base-pair (BP) key, the average adjusted temperature
    /// of the sequences containing it.
    ///
    /// The keys are taken from Basepair_set of every sequence in Raw_dataset.
    /// StatBPTemperature is cleared first; afterwards each key holds the average (first)
    /// and the standard deviation (second).
    inline void calculate_av_tm_bp_type(void)
      {
      StatBPTemperature.clear();

      // Adjusted temperatures grouped by BP key
      std::map<std::string,std::deque<value_type> > temperatures_by_bp; 
      for(auto &entry : Raw_dataset)
        {
        for(auto &base_pair: entry.Basepair_set)
          {
          temperatures_by_bp[std::string(base_pair.first)].push_back(entry.temperature.adjusted);
          }
        }

      for(auto &group: temperatures_by_bp) 
        {
        auto &stat=StatBPTemperature[group.first];
        stat.first =average(group.second);
        stat.second=standard_deviation(group.second);
        }
      }
      
      
    /// \brief Recomputes the prediction differences and their statistics for the whole dataset.
    ///
    /// All difference containers are emptied, collect_tm_difference is applied to every
    /// sequence of Raw_dataset, and calculate_tm_statistics is called on the result.
    inline void calculate_tm_difference_entire_set(void) //old name difference
      {
      // Absolute differences
      diff_Tm.clear();
      diff_H.clear();
      diff_S.clear();
      diff_G.clear();
      // Relative differences
      rdiff_Tm.clear();
      rdiff_H.clear();
      rdiff_S.clear();

      for(auto &entry : Raw_dataset)
        {
        collect_tm_difference(entry);
        }

      calculate_tm_statistics();
      }
      
    /// \brief Calculates the regression between DeltaG and Tm, that is, how well DeltaG predicts Tm in a linear model
    ///
    /// calculate_gibbs_free_energy is called on every sequence of Raw_dataset, then the
    /// predicted free energies and predicted temperatures are passed to DeltaG_X_Tm_reg.
    /// \return the chisq member of DeltaG_X_Tm_reg after the regression
    inline value_type calculate_deltag_tm_regression(void)
      {
      int n=Raw_dataset.size();
      std::valarray<value_type> predicted_tm(n), predicted_dg(n);

      int i=0;
      for(auto it=Raw_dataset.begin(); it != Raw_dataset.end(); ++it, ++i) 
        {
        it->calculate_gibbs_free_energy();
        predicted_tm[i]=it->temperature.predicted;
        predicted_dg[i]=it->gibbs_free_energy.predicted;
        CERR_DEBUG_CODE(DSED_TMDG) << " DeltaG[" << i << "]=" << predicted_dg[i] << " Tm[" << i << "]=" << predicted_tm[i] << std::endl;
        }

      // Free energy is the first argument, temperature the second
      DeltaG_X_Tm_reg.calculate_regression(predicted_dg,predicted_tm);

      if (BOOL_DEBUG(DSED_TMDGREG))
        {
        CERR_DEBUG(DSED_TMDGREG) << " regression parameters " << std::endl;
        DeltaG_X_Tm_reg.print_regression_info(CERR);
        }
      if (BOOL_DEBUG(DSED_TMDGVEC))
        {
        CERR_DEBUG(DSED_TMDGVEC) << " regression vectors " << std::endl;
        DeltaG_X_Tm_reg.print_vectors(CERR);
        }
      
      return DeltaG_X_Tm_reg.chisq;
      }
      
     
    inline void print_nncheck(std::ostream& out)
      {
      out << std::endl << "**************************** Data set analysis ********************" << std::endl << std::endl; 
      out << "File_name: " << Dataset_filename << std::endl;
      out << "File_comment: " << File_comment << std::endl; 
      out << this->Basepair_set.size() << " unique base-pairs in dataset (BP symmetry reduced)" <<  std::endl;
      out << "Base-pair set      : " << this->Basepair_set << std::endl;
      out << this->Reduced_neighbours_set.size() << " unique nearest-neighbours in dataset  (NP+P, NN symmetry reduced)" <<  std::endl;
      out << "Neighbours set     : " << this->Reduced_neighbours_set << std::endl;
      out << this->Exact_neighbours_set.size() << " unique nearest-neighbours in dataset  (NP+P, not symmetry reduced)" <<  std::endl;
      out << "Neighbours set     : " << this->Exact_neighbours_set << std::endl;
      if (this->NPNeighbours_set.size() > 0) 
        {
        out << this->NPNeighbours_set.size() << " unique NP nearest-neighbours in dataset  (NP:non-periodic, NN symmetry reduced)" <<  std::endl;
        out << "Neighbours set (NP): " << this->NPNeighbours_set << std::endl;
        }
                    
      if (this->PNeighbours_set.size() > 0) 
        {
        out << this->PNeighbours_set.size() << " unique P nearest-neighbours in dataset  (P:periodic-only last->first, NN symmetry reduced)" <<  std::endl;
        out << "Neighbours set (P): " << this->PNeighbours_set << std::endl;
        }
          
      if (this->ExclusivePNeighbours_set.size() > 0) 
        {
        out << this->ExclusivePNeighbours_set.size() << " unique exclusive-P nearest-neighbours in dataset  (P:periodic-only last->first, NN symmetry reduced)" <<  std::endl;
        out << "Neighbours set (ExclusiveP): " << this->ExclusivePNeighbours_set << std::endl;
        }

      out << this->StrandPair_set.size() << " unique strand-pair-neighbours in dataset (5'->3' direction)" <<  std::endl;
      out << "StrandPair set     : " << this->StrandPair_set << std::endl;
      if (this->NPStrandPair_set.size()) 
        {
        out << this->NPStrandPair_set.size() << " unique strand-pair-neighbours in dataset (non-periodic, 5'->3' direction)" <<  std::endl;
        out << "StrandPair set (NP): " << this->NPStrandPair_set << std::endl;
        }
      typename trimers_map_type::iterator trim;
      out << Trimers_map.size() << " unique trimers in dataset (non-periodic, trimer symmetry reduced, 5'->3'/3'->5')" <<  std::endl;
      out << "Trimers_set    (NP): (" << Trimers_map.size() << ") ";
      for(trim=Trimers_map.begin(); trim != Trimers_map.end(); trim++)
        {
        out << (std::string)(trim->first) << "=" << trim->second << " ";
        }
      out << "NP=non-periodic" << std::endl << std::endl;
      
      
      if (MatchedEquivalenceMap.size())
        {
        out << "Regex Matched Trimer Equivalences" <<  std::endl;
        std::list<std::string> matched;
        for(auto mat : MatchedEquivalenceMap)
          {
          out << mat.first << ": +" << ReMap[mat.first] << " "; 
          mat.second.sort();
          out << boost::algorithm::join(mat.second,",") << std::endl;
          for (auto tr: mat.second) matched.push_back(tr);
          }
        out << std::endl;
      
        std::list<std::string> unmatched;
        auto found=matched.begin();
        for(auto trim : Trimers_map)  if (std::find(matched.begin(),matched.end(),(std::string)(trim.first)) ==  matched.end()) unmatched.push_back((std::string)(trim.first));
          
        if (unmatched.size()) out << "Trimers not Covered by Equivalences  " <<  std::endl << boost::algorithm::join(unmatched,",") <<  std::endl <<  std::endl;
        
        }
      else out << "No Regex Matched Trimers" <<  std::endl <<  std::endl;
      
      typename base_pair_count_type::iterator term_it;
        
      out << Terminal5_bp_count.size() << " unique terminal-5' base pairs " << std::endl << "Terminal-5': ";
      for (term_it=Terminal5_bp_count.begin(); term_it != Terminal5_bp_count.end(); term_it++)
        {
        out << (std::string)(term_it->first) << "=" << term_it->second << " ";
        }
      out << std::endl << std::endl;
        
      out << Terminal5_bp_reduced_count.size() << " unique terminal-5' base pairs (BP symmetry reduced)" << std::endl << "Terminal-5' (BP symmetry reduced): ";
      for (term_it=Terminal5_bp_reduced_count.begin(); term_it != Terminal5_bp_reduced_count.end(); term_it++)
        {
        out << (std::string)(term_it->first) << "=" << term_it->second << " ";
        }
      out << std::endl << std::endl;

      out << Terminal3_bp_count.size() << " unique terminal-3' base pairs " << std::endl << "Terminal-3': ";
      for (term_it=Terminal3_bp_count.begin(); term_it != Terminal3_bp_count.end(); term_it++)
        {
        out << (std::string)(term_it->first) << "=" << term_it->second << " ";
        }
      out << std::endl << std::endl;
        
      out << Terminal3_bp_reduced_count.size() << " unique terminal-3' base pairs  (BP symmetry reduced)" << std::endl << "Terminal-3'  (BP symmetry reduced): ";
      for (term_it=Terminal3_bp_reduced_count.begin(); term_it != Terminal3_bp_reduced_count.end(); term_it++)
        {
        out << (std::string)(term_it->first) << "=" << term_it->second << " ";
        }
      out << std::endl << std::endl;
        
      out << std::endl << "**************************** Regression Groups analysis ********************" << std::endl << std::endl; 
      typename salt_length_dataset_type::iterator group_it;
      out << "Salt/Length groups: (" << Salt_length_dataset.size() << ")" << std::endl;
      for (group_it=Salt_length_dataset.begin(); group_it != Salt_length_dataset.end(); ++group_it)
        {
        typename length_dataset_type::iterator length_it;
        for (length_it=group_it->second.begin(); length_it != group_it->second.end(); ++length_it)
          {
          out << "Salt: " << group_it->first << " length: " << length_it->first
              << " (" << length_it->second.size() << ") ";
          typename sequence_info_deque_ptype::iterator psi;
          for (psi=length_it->second.begin(); psi != length_it->second.end(); ++psi) out << (*psi)->identification << " ";
          out << std::endl;
          }
        out << std::endl;
        }

      out << std::endl << "Salt groups: (" << Salt_dataset.size() << ")" << std::endl;
      for (group_it=Salt_dataset.begin(); group_it != Salt_dataset.end(); ++group_it)
        {
        typename length_dataset_type::iterator length_it=group_it->second.begin();//This group has only 1 set of length=0
        out << "Salt: " << group_it->first
            << " (" << length_it->second.size() << ") ";
        typename sequence_info_deque_ptype::iterator psi;
        for (psi=length_it->second.begin(); psi != length_it->second.end(); ++psi)
          out << (*psi)->identification << " ";
        out << std::endl;
        }
          
      out << std::endl << "Ct groups: (" << Ct_dataset.size() << ")" << std::endl;
      for (group_it=Ct_dataset.begin(); group_it != Ct_dataset.end(); ++group_it)
        {
        typename length_dataset_type::iterator length_it=group_it->second.begin();//This group has only 1 set of length=0
        out << "Ct: " << group_it->first
            << " (" << length_it->second.size() << ") ";
        typename sequence_info_deque_ptype::iterator psi;
        for (psi=length_it->second.begin(); psi != length_it->second.end(); ++psi)
          out << (*psi)->identification << " ";
        out << std::endl;
        }

      out << std::endl << "Key groups: (" << Key_dataset.size() << ")" << std::endl;
      for (group_it=Key_dataset.begin(); group_it != Key_dataset.end(); ++group_it)
        {
        typename length_dataset_type::iterator length_it=group_it->second.begin();//This group has only 1 set of length=0
        out << "Key: " << group_it->first
            << " (" << length_it->second.size() << ") ";
        typename sequence_info_deque_ptype::iterator psi;
        for (psi=length_it->second.begin(); psi != length_it->second.end(); ++psi)
          out << (*psi)->identification << " ";
        out << std::endl;
        }
        
      if (ParameterReadjust)
        {
        calculate_av_tm_nn_type();
        out << std::endl << "**************************** Average TM for each type of NN ********************" << std::endl << std::endl; 
        for (auto av:StatNNTemperature) out << av.first << "=" << av.second.first << " +/- " << av.second.second << std::endl;

        calculate_av_tm_bp_type();
        out << std::endl << "**************************** Average TM for each type of BP ********************" << std::endl << std::endl; 
        for (auto av:StatBPTemperature) out << av.first << "=" << av.second.first << " +/- " << av.second.second << std::endl;
        }

      out << std::endl << std::endl;
       
      }
        
        
    /// \brief Prints out the experimental and calculated temperatures.
    inline void print_temperatures(std::ostream& out, bool analyse=false)
      {
      typename sequence_info_deque_type::iterator sequence_info_it=Raw_dataset.begin();
      sequence_info_it->PrintFlags=PrintFlags;
      CERR_DEBUG(DSED_PRTFLG) << " print_temperatures PrintFlags=[" << PrintFlags << "]" << std::endl;
      if (not analyse) sequence_info_it->print_head(out);
      for(sequence_info_it=Raw_dataset.begin(); sequence_info_it != Raw_dataset.end(); sequence_info_it++)
        {
        sequence_info_it->PrintFlags=PrintFlags;
        sequence_info_it->print(out,analyse);
        }
      if (analyse) print_nncheck(out);
      else
        {
        out << "StdTempDiff "     << Temperature.std_deviation << std::endl;
        out << "StdEnthalpyDiff " << Enthalpy.std_deviation    << std::endl;
        out << "StdEntropyDiff "  << Entropy.std_deviation     << std::endl;
        }

      }
      
    /// \brief Prints out a new dataset using temperature.adjusted
    ///
    /// The first line is the word "temperature"; then print_salt_adjusted_tm is called
    /// for every sequence of Raw_dataset.
    /// \param out stream receiving the dataset
    inline void print_adjusted_dataset(std::ostream& out)
      {
      out << "temperature" << std::endl;
      for(auto &entry : Raw_dataset)
        {
        entry.print_salt_adjusted_tm(out);
        }        
      }


   inline void calculate_and_print_verify(std::ostream& out) //old name verify
      {
      calculate_tm_difference_entire_set();
      print_verify(out);
      }
      
   inline void print_verify(std::ostream& out)
      {
      size_t number_of_sequences=Number_of_sequences_read;
      if ( (Number_of_sequences_read == 0) and (Raw_dataset.size() !=0) ) number_of_sequences=Raw_dataset.size();
      out << "average diff_deviation sqr_diff sqrt_diff2 relative_sqr_diff N" << std::endl;
      out << "Tm " << Temperature.average << " " << Temperature.diff_deviation << " " << Temperature.sqr_diff << " " << Temperature.sqrt_diff2 << " " << Temperature.relative_sqr_diff << " " << number_of_sequences << std::endl;
      if (Enthalpy.average != 0.0)
        out << "DeltaH "  << Enthalpy.average << " " << Enthalpy.diff_deviation << " " << Enthalpy.sqr_diff << " " << Enthalpy.sqrt_diff2 << " " << Enthalpy.relative_sqr_diff << std::endl;
      if (Entropy.average != 0.0)
        out << "DeltaS " << Entropy.average << " " << Entropy.diff_deviation << " " << Entropy.sqr_diff << " " << Entropy.sqrt_diff2 << " " << Entropy.relative_sqr_diff << std::endl;
      if (Gibbs.average != 0.0)
        out << "DeltaG " << Gibbs.average << " " << Gibbs.diff_deviation << " " << Gibbs.sqr_diff << " " << Gibbs.sqrt_diff2 << " " << Gibbs.relative_sqr_diff << std::endl;
      }

   /// \brief Prints the main statistics as TeX macro calls, one per line.
   ///
   /// Written in this order: \\SqrDiff, \\AvDiff, \\StdDiff and \\StdTempDiff from
   /// Temperature, then \\StdEnthalpyDiff and \\StdEntropyDiff.
   /// \param out stream receiving the macro calls
   inline void print_tex(std::ostream& out)
     {
     // Differences between predicted and adjusted temperatures
     out << "\\SqrDiff{" << Temperature.sqr_diff   << "}" << std::endl
         << "\\AvDiff{"  << Temperature.average   << "}" << std::endl
         << "\\StdDiff{" << Temperature.diff_deviation << "}" << std::endl;
     // Stored std_deviation of each quantity
     out << "\\StdTempDiff{" << Temperature.std_deviation << "}" << std::endl
         << "\\StdEnthalpyDiff{" << Enthalpy.std_deviation << "}" << std::endl
         << "\\StdEntropyDiff{" << Entropy.std_deviation << "}" << std::endl;
     }

    /// \brief Build a vector of lengths for a given salt concentration
    /// \param salt_it iterator to a salt group; the keys of its length map are copied
    /// \return vector holding one length per entry of the group, in map order
    inline vector_type length_vector(typename salt_length_dataset_type::iterator& salt_it)
      {
      vector_type lengths(salt_it->second.size());
      size_t slot=0;
      for (auto &length_entry : salt_it->second)
        {
        lengths[slot]=length_entry.first;
        ++slot;
        }
      return lengths;
      }

    /// \brief Build a vector of thermal_equivalences for a given length
    /// \param length_it iterator to a length group holding pointers to sequences
    /// \return vector with the thermal_equivalence of each sequence of the group, in order
    inline vector_type thermal_equivalence_vector(typename length_dataset_type::iterator& length_it)
      {
      vector_type equivalences(length_it->second.size());
      size_t slot=0;
      for (auto &sequence_ptr : length_it->second)
        {
        equivalences[slot]=sequence_ptr->thermal_equivalence;
        ++slot;
        }
      return equivalences;
      }

    /// \brief Build a vector of all thermal_equivalences
    /// \return vector with the thermal_equivalence of every sequence of Raw_dataset, in order
    inline vector_type thermal_equivalence_vector(void)
      {
      vector_type equivalences(Raw_dataset.size());
      size_t slot=0;
      for(auto &entry : Raw_dataset)
        {
        equivalences[slot]=entry.thermal_equivalence;
        ++slot;
        }
      return equivalences;
      }

   /// \brief Build a vector of temperatures for a given length
   /// \param length_it iterator to a length group holding pointers to sequences
   /// \return vector with temperature.adjusted of each sequence of the group, in order
   inline  vector_type temperature_vector(typename length_dataset_type::iterator& length_it)
      {
      vector_type temperatures(length_it->second.size());
      size_t slot=0;
      for (auto &sequence_ptr : length_it->second)
        {
        temperatures[slot]=sequence_ptr->temperature.adjusted;
        ++slot;
        }
      return temperatures;
      }

   /// \brief Build a vector of all temperatures
   /// \return vector with temperature.adjusted of every sequence of Raw_dataset, in order
   inline  vector_type temperature_vector(void)
      {
      vector_type temperatures(Raw_dataset.size());
      size_t slot=0;
      for(auto &entry : Raw_dataset)
        {
        temperatures[slot]=entry.temperature.adjusted;
        ++slot;
        }
      return temperatures;
      }
  };
};
#endif
