// $Id: NucleotideSequence.h 1367 2024-08-30 21:00:18Z ge $
/// \file NucleotideSequence.h
/// \brief Contains the definition of the class NucleotideSequence
///
/// $Revision: 1367 $
/// \author Gerald Weber <gweberbh@gmail.com>
#ifndef NUCLEOTIDESEQUENCE_H
#define NUCLEOTIDESEQUENCE_H "$Id: NucleotideSequence.h 1367 2024-08-30 21:00:18Z ge $"
#include <algorithm>
#include <deque>
#include <iterator>
#include <ostream>
#include <set>
#include <string>

#include <boost/regex.hpp>
#include <boost/algorithm/string.hpp>

#include "Nucleotide.h"
#include "RegexPattern.h"
#include "ErrorCodes.h"

namespace gbc {
	/// extends deque class
template<class _InternalTp=char>
/// \brief Sequence (std::deque) of Nucleotides.
///
/// Holds a sequence of Nucleotides of fixed size in a std::deque.
class NucleotideSequence:public std::deque<Nucleotide<_InternalTp> >
  {
  public:
  typedef _InternalTp                       internal_type;
  typedef Nucleotide<internal_type>         nucleotide_type;
  typedef std::deque<nucleotide_type>       deque_nucleotide_type; //only when using deque
  typedef NucleotideSequence<internal_type> nucleotide_sequence_type;
  typedef unsigned int window_type;                 ///< type of the window_size
  typedef typename nucleotide_type::sugar_type sugar_type;
  typedef std::multiset<sugar_type>         sugar_set_type;
  
  enum direction_type { dir53=53, dir35=35, dir32=32, dir23=23 };       ///< strand direction 5'->3' or 3'->5'
  window_type window_size;                          ///< size of the window
  window_type keep_from_previous_read;              ///< amount of nucleotides to be kept from previous readings
  bool shift_read;                                  ///< \brief true if reading should be a sliding window.
                                                    ///<
                                                    ///< The first Nucleotide will
	                                            ///< be discarded and a new one will be appended.
  bool shifted;                                     ///< indicates if last read was a simple shift
  bool cleared;                                     ///< indicates if before last read the sequence was cleared
  direction_type StrandDirection;                   ///< Holds the direction to which the strand is represented, dir53 is default
  bool Extended_strand_interpretation;
  sugar_type MainSugar;                             ///< The type of sugar that is most abundant in sequence
  bool UniformSugar;                                ///< True if all nucleotides have the same ribose type
  sugar_set_type SugarSet;                          ///< MultiSet with all types of sugar

  /// \brief The void (default) constructor.
  ///
  /// Creates an empty sequence with window_size 1 and shift_read true.
  /// The direction is dir53, extended strand interpretation is enabled and
  /// MainSugar is deoxyribose.
  NucleotideSequence(void)
   : deque_nucleotide_type(),
     window_size(1),
     keep_from_previous_read(0),
     shift_read(true),
     shifted(false),
     cleared(false),
     StrandDirection(dir53),
     Extended_strand_interpretation(true),
     MainSugar(nucleotide_type::deoxyribose),
     UniformSugar(true),
     SugarSet()
   {
   }

  /// \brief Constructor which sets the window size.
  ///
  /// Apart from window_size every member receives the same initial value as
  /// in the void constructor.
  NucleotideSequence(int ws) ///< size of the window
   : deque_nucleotide_type(),
     window_size(ws),
     keep_from_previous_read(0),
     shift_read(true),
     shifted(false),
     cleared(false),
     StrandDirection(dir53),
     Extended_strand_interpretation(true),
     MainSugar(nucleotide_type::deoxyribose),
     UniformSugar(true),
     SugarSet()
    {
    }

  /// \brief Constructor which fills the sequence from a string.
  ///
  /// All members start with the same values as in the void constructor,
  /// then the string is handed to assign(), which replaces the content and
  /// adjusts window_size to the number of nucleotides read.
  NucleotideSequence(const std::string& st) ///< string to be assigned
   : deque_nucleotide_type(),
     window_size(1),
     keep_from_previous_read(0),
     shift_read(true),
     shifted(false),
     cleared(false),
     StrandDirection(dir53),
     Extended_strand_interpretation(true),
     MainSugar(nucleotide_type::deoxyribose),
     UniformSugar(true),
     SugarSet()
   {
   this->assign(st);
   }

   /// Checks if the sequence is completely populated
  inline bool full(void) {return (this->size()==window_size);}

  /// Checks if our sequence is a good sequence.
  inline bool good(void) {return full();}

  template<class _AddTp> ///< a class _AddTp
  /// \brief Pushes a Nucleotide of type _AddTp at the end of the sequence.
  ///
  /// If the sequence is already full the first Nucleotide is popped if
  /// shift_read is true (default=true), otherwise the sequence is completely erased
  /// and only then we add the new Nucleotide.
  /// The flags shifted and cleared report which of the two happened for this call.
  /// If you want to add unlimited Nucleotides you should specify very large window_size.
  ///
  /// The type of Nucleotide to be added may be different from the Nucleotides in the Sequence.
  /// \return reference to this sequence, so that insertions can be chained
  NucleotideSequence& operator <<(_AddTp nw) ///< _AddTp to be placed at the end.
    {
    shifted=cleared=false;
    if (full())
      {
      if (shift_read)
        {
        // With window_size==0 an empty sequence counts as full; popping from
        // an empty deque is undefined, so only shift when there is something to drop.
        if (!this->empty()) {deque_nucleotide_type::pop_front(); shifted=true;}
        }
      else {deque_nucleotide_type::clear(); cleared=true;}
      }
    // NOTE(review): this function-local static is shared by every sequence of the
    // same instantiation; whether any state survives between calls depends on
    // Nucleotide::operator=, which is not visible here.
    static Nucleotide<_InternalTp> nuc;
    nuc=nw;
    this->push_back(nuc);
    return *this;
    }

  /// \brief Appends every Nucleotide of another NucleotideSequence.
  ///
  /// The argument is taken by value and its elements are pushed one by one
  /// with operator<<(_AddTp), so the window rules of that operator apply
  /// to each element.
  template<class _OtherTp> ///< a class NucleotideSequence<_OtherTp>
  NucleotideSequence& operator <<(NucleotideSequence<_OtherTp> ns) ///< NucleotideSequence to be placed at the end.
    {
    for(auto& element : ns)
      {
      *this << element;
      }
    return *this;
    }
    
  void invert_strand_direction(void)
    {
    if (StrandDirection == dir53) StrandDirection=dir35; else StrandDirection=dir53;
    }

  /// \brief Assigns a string to the sequence.
  ///
  /// This function fully replaces the content of NucleotideSequence with the string st;
  /// the window_size is recalculated to the actual size of the sequence (it is left
  /// unchanged when no nucleotide could be read from st).
  /// When Extended_strand_interpretation is true the string is first matched against
  /// NUCLEOTIDESEQUENCE_ASSIGN_PATTERN: a direction marker starting with '3' or '5'
  /// selects dir35 or dir53, and a sugar given there becomes the default sugar for
  /// nucleotides that carry no sugar of their own. Without a match the default sugar is 'd'.
  /// SugarSet, UniformSugar and MainSugar are rebuilt from the nucleotides that were read.
  /// IMPORTANT this function may need to take into account both numbers 5'-3' due to 3'-2'
  /// \return reference to this sequence
  NucleotideSequence& assign(const std::string& st) ///< string to insert
    {
    CERR_DEBUG(DNUS_ASSIGN) << "sequence=" << st << std::endl;
    std::string argstr(st);
    char default_sugar='d';
    boost::smatch found;
    nucleotide_type nuc;
    // The extended pattern is only compiled and searched when it can have an effect.
    if (Extended_strand_interpretation)
      {
      const boost::regex pattern(NUCLEOTIDESEQUENCE_ASSIGN_PATTERN);//try to find something like d(TGCATGCA) or r(ACGTACGT)
      if (boost::regex_search(st,found,pattern))
        {
        if (found[1].length())
          {
          switch (found[1].str()[0])
            {
            case '3': StrandDirection=dir35; break;
            case '5': StrandDirection=dir53; break;
            }
          }

        if (found[2].length())
          {
          nuc.set_sugar(found[2].str()[0]);
          default_sugar=found[2].str()[0];
          }
        argstr=found[3];
        CERR_DEBUG(DNUS_ASSIGN) << "extended strand main sequence=" << argstr << " sugar=" << default_sugar << std::endl;
        }
      }

    const boost::regex split_pattern(NUCLEOTIDESEQUENCE_SPLIT_PATTERN);
    boost::sregex_iterator result(argstr.begin(),argstr.end(),split_pattern);
    boost::sregex_iterator end;

    const boost::regex sugar_pattern(NUCLEOTIDESEQUENCE_SUGAR_PATTERN);

    this->clear(); cleared=true;
    SugarSet.clear();
    // Start from "uniform" again, otherwise a previous mixed-sugar assignment
    // would leave the flag false forever.
    UniformSugar=true;
    for(; result != end; ++result)
      {
      const std::string res=(*result)[0].str();
      CERR_DEBUG(DNUS_RESULT) << "result string=" << res << std::endl;

      if (boost::regex_search(res,found,sugar_pattern))
        {
        CERR_DEBUG(DNUS_RESULT) << "found[1]=" << found[1].str() << " found[2]=" << found[2].str()<< std::endl;
        nuc.set_sugar(found[1].str()[0]);
        SugarSet.insert(found[1].str()[0]);
        nuc.set_nucleobase(found[2].str()[0]);
        }
      else
        {
        CERR_DEBUG(DNUS_RESULT) << "default sugar=" << default_sugar << std::endl;
        nuc.set_sugar(default_sugar);
        SugarSet.insert(default_sugar);
        nuc.set_nucleobase(res[0]);
        }
      this->push_back(nuc);
      }
    if (!this->empty())
      {
      // Same result as updating inside the loop: window_size only changes
      // when at least one nucleotide was read.
      window_size=deque_nucleotide_type::size();
      // determine_main_sugar() inspects the first nucleotide, so it is only
      // called when there is one.
      determine_main_sugar();
      }
    return *this;
    }

  /// \brief Inserts a string into the sequence.
  ///
  /// This is a non-template overload for std::string; every character of st
  /// is pushed, in order, with the template NucleotideSequence& operator <<(_AddTp nw).
  NucleotideSequence& operator<<(std::string st) ///< string to insert
    {
    for(char& symbol : st)
      {
      *this << symbol;
      }
    return *this;
    }

   /// \brief Inserts a const char* string into the sequence.
   ///
   /// This function simply calls NucleotideSequence& operator<<(std::string st).
   /// A null pointer is ignored and leaves the sequence untouched, because
   /// building a std::string from a null pointer is undefined.
   NucleotideSequence& operator<<(const char* st) ///< string to insert
     {
     if (st == nullptr) return *this;
     *this << std::string(st);
     return *this;
     }

   /// \brief Conversion to a string of symbols representing the sequence.
   ///
   /// \return the text produced by nucleobases_string()
  operator std::string(void) const
    {
    std::string text(nucleobases_string());
    return text;
    }
    
  /// \brief Determines the most abundant sugar and whether the sugar is uniform.
  ///
  /// MainSugar starts as the sugar of the first nucleotide (it is left as it is
  /// for an empty sequence) and is then replaced by the sugar with the highest
  /// count in SugarSet; on equal counts the sugar that comes first in SugarSet wins.
  /// UniformSugar is recomputed on every call: it ends up false as soon as one
  /// sugar in SugarSet differs from the current MainSugar, true otherwise.
  /// \return the updated MainSugar
  sugar_type determine_main_sugar(void)
    {
    UniformSugar=true;   // recomputed from scratch, the old value must not leak in
    // begin() must not be dereferenced on an empty sequence.
    if (!this->empty()) MainSugar=this->begin()->sugar_char();
    typename sugar_set_type::size_type best_count=0;
    // upper_bound() jumps to the next distinct sugar, so each sugar is counted once
    // instead of once per occurrence.
    for(auto sugar=SugarSet.begin(); sugar != SugarSet.end(); sugar=SugarSet.upper_bound(*sugar))
      {
      if (MainSugar != *sugar) UniformSugar=false;
      const typename sugar_set_type::size_type occurrences=SugarSet.count(*sugar);
      if (occurrences > best_count)
        {
        MainSugar=*sugar;
        best_count=occurrences;
        }
      }
    return MainSugar;
    }

    
  /// \brief Builds the string of nucleobase symbols of the sequence.
  ///
  /// A nucleotide whose sugar equals MainSugar contributes its symbol();
  /// any other nucleotide contributes its nucleoside_string() enclosed in "<" and ">".
  /// \return the concatenated text, empty for an empty sequence
  std::string nucleobases_string(void) const
    {
    std::string text;
    for(auto nucleotide : *this)
      {
      const bool other_sugar=(MainSugar != nucleotide.sugar_char());
      if (other_sugar)
        {
        text += "<" + nucleotide.nucleoside_string() + ">";
        }
      else
        {
        text += nucleotide.symbol();
        }
      }
    return text;
    }

  /// \brief Builds the sequence string wrapped by its main sugar.
  ///
  /// \return MainSugar as a single character, followed by "(", the result of
  ///         nucleobases_string() and ")"
  std::string sugar_sequence_string(void) const
    {
    std::string text(1,MainSugar);
    text += std::string("(");
    text += nucleobases_string();
    text += std::string(")");
    return text;
    }

  /// \brief Operator which returns an integer representing the whole sequence.
  ///
  /// It is calculated according to \f$ f=\sum_{i=0}^{N-1}4^in_i \f$ where the sequence
  /// is \f$ S=\left\{n_0,n_1,n_2,\ldots,n_{N-1}\right\} \f$.
  /// In the code \f$ n_i \f$ is representation-1 of a Nucleotide<unsigned int>
  /// that was assigned the symbol of the i-th nucleotide.
  operator unsigned long int(void) const
    {
    Nucleotide<unsigned int> digit;
    unsigned long int total=0;
    unsigned long int weight=1;   // 4^i for the current position
    for(const auto& nucleotide : *this)
      {
      digit=nucleotide.symbol();
      total += (digit.representation-1)*weight;
      weight*=4;
      }
    return total;
    }



  /// \brief Stream insertion which prints the whole sequence.
  ///
  /// The sequence is converted to std::string and written to out.
  /// \return the stream out
  inline friend std::ostream& operator<<(std::ostream &out,const NucleotideSequence &nts)
    {
    return out << static_cast<std::string>(nts);
    }

  /// \brief Converts this NucleotideSequence to its complementary (\f$A\leftrightarrow T\f$ and \f$C\leftrightarrow G\f$ )
  ///
  /// Calls complementary() on every Nucleotide, in place; the order of the
  /// nucleotides does not change.
  void complementary(void)
    {
    for(auto& nucleotide : *this)
      {
      nucleotide.complementary();
      }
    }

  /// \brief Reverse NucleotideSequence, also inverts Strand_direction
  ///
  /// \attention This function only acts on Nucleotide::representation
  inline void reverse(void)
    {
    std::reverse(this->begin(),this->end());
    invert_strand_direction();
    }

  /// Converts this NucleotideSequence to its reverse/complementary (\f$A\leftrightarrow T\f$ and \f$C\leftrightarrow G\f$ )
  inline void reverse_complementary(void)
    {
    reverse(); complementary();
    }

  /// \brief Performs n cylic permutations on the sequence.
  ///
  /// A cyclic permutation is to take the first element in the sequence and place it at the end.
  inline void cyclic_permutation(int n=1) ///< Number of permutations
    {
    rotate(this->begin(),this->begin()+n,this->end());
    }

  /// \brief Returns the n and n+1 Nucleotides as a sequence of size 2.
  ///
  /// First position is n=0. The function is cyclic, e.g. if the sequence is of
  /// size 4 and you say neighbours(3) you will get the last nucleotide followed by the first.
  /// The wrap-around uses the actual number of stored nucleotides, so it is also
  /// safe when the sequence is not full. For an empty sequence an empty
  /// sequence of window size 2 is returned.
  inline nucleotide_sequence_type neighbours(window_type n=0) ///< Position of First Neighbour
    {
    typedef typename deque_nucleotide_type::size_type size_type;
    nucleotide_sequence_type SubSequence(2);
    const size_type length=this->size();
    if (length == 0) return SubSequence;   // nothing to pair up
    const size_type first=n % length;
    const size_type second=(first+1) % length;
    SubSequence << (*this)[first] << (*this)[second];
    return SubSequence;
    }

  /// \brief Returns a slice, i.e. a subsequence of Nucleotides.
  ///
  /// The slice holds the nucleotides at positions start, start+1, ..., end-1 and
  /// its window size is end-start. end=0 means "up to the end of the sequence".
  /// An end beyond the sequence is clamped to its size and a start beyond end
  /// yields an empty slice, so no position outside the sequence is ever read.
  inline nucleotide_sequence_type slice(window_type start=0, window_type end=0)
    {
    const window_type length=static_cast<window_type>(this->size());
    if (end==0 || end > length) end=length;
    if (start > end) start=end;
    nucleotide_sequence_type SubSequence(end-start);
    for (window_type i=start; i < end; ++i) SubSequence << (*this)[i];
    return SubSequence;
    }

  
  /// \brief Swap all Nucleotides of type X and Y, i.e, \f$X\leftrightarrow Y\f$
  ///
  /// Every nucleotide whose symbol() equals nuc1 is assigned nuc2 and every
  /// nucleotide whose symbol() equals nuc2 is assigned nuc1; all others are kept.
  void nucleotide_swap(char nuc1, ///< type X
                       char nuc2) ///< type Y
    {
    for(auto& nucleotide : *this)
      {
      if (nucleotide.symbol()==nuc1)
        {
        nucleotide=nuc2;
        }
      else if (nucleotide.symbol()==nuc2)
        {
        nucleotide=nuc1;
        }
      }
    }

  /// \brief Find the first occurrence of the given NucleotideSequence in this Sequence
  ///
  /// The search starts at findFrom, which has to be an iterator of this sequence.
  /// Nucleotides are compared with operator==, as compare_first() does.
  /// A search sequence longer than the remaining part is simply not found.
  /// \return iterator to the first nucleotide of the first occurrence, findFrom for
  ///         an empty search sequence, or end() if there is no occurrence
  typename nucleotide_sequence_type::iterator find(nucleotide_sequence_type search,
                                                          typename nucleotide_sequence_type::iterator findFrom) {

    // std::search never reads past end(), which also covers a search sequence
    // that is longer than this sequence, and needs no temporary slices.
    return std::search(findFrom, this->end(), search.begin(), search.end(),
                       [](const nucleotide_type &left, const nucleotide_type &right)
                         {
                         return left == right;
                         });
  }

  /// Compares two NucleotideSequence from the start up to length len
  inline friend bool compare_first(const NucleotideSequence &nuc1, ///< first NucleotideSequence
                                   const NucleotideSequence &nuc2, ///< second NucleotideSequence
                                   const int len)                  ///< Number of positions to compare
    {
    return std::equal(nuc1.begin(),nuc1.begin()+len,nuc2.begin());
    }

  /// Compares two NucleotideSequence from the end up to length len (backwards)
  inline friend bool compare_last(const NucleotideSequence &nuc1, ///< first NucleotideSequence
                                  const NucleotideSequence &nuc2, ///< second NucleotideSequence
                                  const int len)                  ///< Number of positions to compare
    {
    return std::equal(nuc1.rbegin(),nuc1.rbegin()+len,nuc2.rbegin());
    }
	
}; //class
}; // namespace

#endif
