// $Id: Nucleotide.h 1367 2024-08-30 21:00:18Z ge $
// Copyright 2006 Gerald Weber
/// \file Nucleotide.h
/// \brief Contains the definition of the class Nucleotide
///
/// $Revision: 1367 $
/// \author Gerald Weber <gweberbh@gmail.com>
#ifndef GBC_NUCLEOTIDE_H
#define GBC_NUCLEOTIDE_H "$Id: Nucleotide.h 1367 2024-08-30 21:00:18Z ge $"
#include "SequencePosition.h"
#include "NucleotideDictionary.h"
#include <fstream>
#include <boost/regex.hpp>
#include "ErrorCodes.h"
#include "RegexPattern.h"

namespace gbc 
  {
  // enum sugar_type {deoxyribose,ribose,threose};
  /// \brief Basic definition of a Nucleotide.
  ///
  /// The basic class which holds the information about a nucleotide in a sequence:
  /// its nucleobase (member representation), its sugar (member Sugar), a validity
  /// flag (member Good) and its position (member Sequence_position).
  /// _RepTp (default=char) is the way the nucleobase will be actually stored,
  /// _IndexTp (default=char) is the type of the symbols looked up in the dictionary.
  ///
  /// Nucleotide uses an internal dictionary to convert from symbols to its internal
  /// representation given by _RepTp. Some are predefined, see file Nucleotide.cpp.
  template<class _RepTp=char, class _IndexTp=char>
  class Nucleotide 
    {
    public:
      typedef Nucleotide<_RepTp,_IndexTp> NucleotideType;
      typedef NucleotideDictionary<_RepTp,_IndexTp> DictionaryType;
      typedef char sugar_type;
      _RepTp representation;         ///< the internal representation
      sugar_type Sugar;              ///< one-character tag of the sugar, see the constants below
      static const sugar_type deoxyribose='d', ribose='r', threose='t';
      
      bool Good;                          ///< Flags which tells us if Nucleotide has read a valid value.
      static DictionaryType  dictionary;  ///< Dictionary which translates file symbols to internal representation
                                          ///< This object is static because it makes sense that all objects of type Nucleotide use the same
                                          ///< conversion rules. See file NucleotideDictionary.cpp for predefined dictionaries.
      SequencePosition Sequence_position; ///< Position of this Nucleotide in a file

      /// \brief Void constructor.
      ///
      /// The sugar defaults to deoxyribose and Good is false.
      Nucleotide(void): representation(), Sugar(deoxyribose), Good(false), Sequence_position()
        {}

      /// \brief Constructor which initializes from a dictionary symbol.
      ///
      /// The symbol is looked up in the dictionary, Good tells if it was found.
      Nucleotide(const _IndexTp cr): representation(), Sugar(deoxyribose), Good(false), Sequence_position()
        {
        (*this)=cr;
        }

      /// \brief Constructor which initializes from a string, see string_interpretation().
      Nucleotide(const std::string& smb): representation(), Sugar(deoxyribose), Good(false), Sequence_position()
        {
        string_interpretation(smb);
        }
        
      /// \brief Constructor which initializes with a specific nucleotide.
      Nucleotide(const NucleotideType& nuc)
        : representation(nuc.representation), Sugar(nuc.Sugar), Good(nuc.Good), Sequence_position(nuc.Sequence_position)
        {}

      /// \brief Constructor which initializes with a specific dictionary.
      ///
      /// Note that the dictionary needs to be fully specified beforehand.
      /// ATTENTION as dictionary is a static member, this change affects every object of type Nucleotide<Tp1,Tp2>
      Nucleotide(DictionaryType &dict) ///< the object of type DictionaryType
        : representation(), Sugar(deoxyribose), Good(false), Sequence_position()
        {
        dictionary=dict;
        }
        
      /// \brief Converts a string representation into its nucleobase and sugar components.
      ///
      /// For example "rU" will be broken down as sugar=r and nucleobase=U.
      /// The first capture group of the pattern, when not empty, sets the sugar;
      /// the first character of the second group is passed to set_nucleobase().
      /// \return true if the pattern matched and the nucleobase was found in the dictionary.
      inline bool string_interpretation (const std::string& smb)
        {
        // Compiled once and shared by all calls instead of being rebuilt for every string.
        static const boost::regex pattern(NUCLEOTIDE_STRING_INTERPRETATION_PATTERN);
        boost::smatch found;
        CERR_DEBUG(DNUC_STRINT) << "input string=" << smb << std::endl;

        if (!boost::regex_search(smb,found,pattern)) return false;

        if (found[1].length()) Sugar=found[1].str()[0];
        set_nucleobase(found[2].str()[0]);
        return Good;
        }

      /// Returns the symbol corresponding to its internal representation (dictionary lookup).
      inline _IndexTp symbol (void) const {return dictionary.invert(representation);}

      /// Non-const overload of symbol(). It is kept as a separate overload because
      /// Nucleotide<char> specializes it after the class definition.
      inline _IndexTp symbol (void) {return dictionary.invert(representation);}

      /// Returns the symbol of the nucleobase only, i.e. without the sugar.
      inline _IndexTp nucleobase_char (void) const {return dictionary.invert(representation);}
      
      /// Returns the sugar as a plain char.
      inline char sugar_char (void) const {return Sugar;}
        
      /// Returns the sugartype
      inline sugar_type sugar (void) const {return Sugar;}

      /// Returns the sugar as a one-character string.
      inline std::string sugar_string(void) const {return std::string(1,sugar_char());}

      /// Returns the sugar character followed by the nucleobase symbol, e.g. "rU".
      inline std::string nucleoside_string (void) const
        {
        return std::string(1,sugar_char())+std::string(1,nucleobase_char());
        }

      inline void set_ribose (void) {Sugar=ribose;}

      inline void set_deoxyribose (void) {Sugar=deoxyribose;}
      
      inline void set_threose (void)     {Sugar=threose;}
      
      /// \brief Sets the sugar from its one-character tag.
      ///
      /// No validation is performed, the character is stored as given.
      inline void set_sugar (char sg)
        {
        Sugar=sg;
        }

      /// \brief Returns the content of the internal representation.
      ///
      /// E.g., if _RepTp=int then you can write (int)nc where nc is of type Nucleotide.
      /// NOTE(review): not const-qualified, therefore not usable on const objects. Adding
      /// const would make this conversion a candidate in more overload resolutions
      /// (e.g. comparing a const Nucleotide with a plain symbol), so it is left as is.
      operator _RepTp() {return representation;}

      template<class _AddTp>
      /// \brief Assigns _AddTp into Nucleotide<_RepTp>.
      ///
      /// There must be a way to convert _AddTp into _RepTp, i.e.,
      /// _RepTp=_AddTp must be possible.
      /// \attention The flag Good is always set to true, no dictionary lookup takes place.
      Nucleotide& operator = (const _AddTp it) ///< the _RepTp to be assigned
        {Good=true; representation=it; return *this;}
        
      /// \brief Sets the nucleobase from a dictionary symbol and reports unknown symbols.
      ///
      /// In contrast to operator=(const _IndexTp) a symbol which is not in the dictionary
      /// is reported through CERR_ERROR followed by CERR_TERM.
      /// \return the flag Good, i.e. true if s is contained in the dictionary.
      bool set_nucleobase (const _IndexTp s)
        {
        auto dit=dictionary.index_map.find(s);
        Good=dit != dictionary.index_map.end();
        // The local throw/catch is kept as the error path because CERR_TERM is defined
        // elsewhere and is only ever used from inside a handler in this file.
        try
          {
          if (Good) representation=dit->second;
          else      
            {
            representation=_RepTp();
            throw s;
            }
          }
        catch (const _IndexTp& errs) 
          {CERR_ERROR(ERRCNIB) << "could not interpret BasePair=" << errs << std::endl; CERR_TERM}
        return Good;
        }

      /// \brief Allows you to assign a char to Nucleotide.
      ///
      /// This is a specialization of Nucleotide& operator = (_AddTp it)
      /// Sets flag Good=true if assignment was successful, i.e., if s is contained in the dictionary.
      /// Otherwise the representation is reset to _RepTp() and Good=false, nothing is reported.
      Nucleotide& operator = (const _IndexTp s) ///< the char to translate and assign
        {
        auto dit=dictionary.index_map.find(s);
        Good=dit != dictionary.index_map.end();
        if (Good) representation=dit->second;
        else      representation=_RepTp();
        return *this;
        }

      template<class _RepTp2>
      /// \brief Assigns Nucleotides with different internal representations.
      ///
      /// This is accomplished by using the symbol() function and assigning as a symbol.
      /// Because of this there will always be a lookup at the dictionary making this a slow
      /// conversion.
      /// Good is true only if nt2 was good and its symbol exists in this dictionary.
      Nucleotide& operator = (const Nucleotide<_RepTp2,_IndexTp>& nt2)
        {
        (*this)=nt2.symbol();                 // sets representation and Good from the lookup
        Sugar=nt2.Sugar;
        Sequence_position=nt2.Sequence_position;
        Good=Good && nt2.Good;                // do not let nt2.Good hide a failed lookup
        return *this;
        }

      /// \brief Assigns two Nucleotides with the same internal representation.
      ///
      /// This is a specialization of Nucleotide& operator = (Nucleotide<_RepTp2>& nt2)
      /// which takes advantage of the fact that _RepTp is the same for both and no
      /// dictionary lookup is necessary.
      Nucleotide& operator = (const Nucleotide& nt2)
        {
        representation=nt2.representation;
        Sugar=nt2.Sugar;
        Sequence_position=nt2.Sequence_position;
        Good=nt2.Good;
        return *this;
        }

      /// \brief Returns the complementary symbol of this Nucleotide.
      /// \attention Function untested.
      inline char complementary_symbol(void) const
        {
        return dictionary.complementary_symbol(symbol());
        }

      /// Converts this Nucleotide to its complementary (\f$A\leftrightarrow T\f$ and \f$C\leftrightarrow G\f$ )
      inline void complementary(void) {representation=dictionary.complementary(representation);}

      /// Tells us if Nucleotide was assigned a valid value.
      inline bool good(void) const {return Good;}

      /// Inserter for printing Nucleotide symbols, only the nucleobase symbol is written.
      friend std::ostream& operator<<(std::ostream &out,const Nucleotide &nt)
        {
        out << nt.symbol();
        return out;
        }

      /// Equal operator, both the internal representation and the sugar have to agree.
      inline friend bool operator==(const Nucleotide<_RepTp,_IndexTp>& nuc1,const  Nucleotide<_RepTp,_IndexTp>& nuc2)
        {
        return (nuc1.representation==nuc2.representation) and (nuc1.Sugar == nuc2.Sugar);
        }

      /// Equal operator ignoring sugar
      inline friend bool eq_ns(const Nucleotide<_RepTp,_IndexTp>& nuc1,const  Nucleotide<_RepTp,_IndexTp>& nuc2)
        {
        return (nuc1.representation==nuc2.representation);
        }
        
      /// Less than operator. Compares the symbols if the sugars are equal,
      /// otherwise compares the strings given by nucleoside_string().
      inline friend  bool operator<(const Nucleotide& nuc1,const Nucleotide& nuc2)
        {
        if (nuc1.Sugar == nuc2.Sugar) return nuc1.symbol() < nuc2.symbol();
        else return nuc1.nucleoside_string() < nuc2.nucleoside_string();
        }

      /// Greater than operator, the mirror image of operator<.
      inline friend  bool operator>(const Nucleotide& nuc1,const Nucleotide& nuc2)
        {
        return nuc2 < nuc1;
        }


  }; // end class definition
	

  /// \brief Specialization of the non-const symbol() for Nucleotide<char>.
  ///
  /// Returns the stored representation itself instead of inverting it through
  /// the dictionary. Only the non-const overload is specialized here.
  template<>
  inline char Nucleotide<char>::symbol(void)
    {
    return representation;
    }

  /// \brief Free function returning the symbol of a nucleotide-like object.
  ///
  /// Works with any type providing a member symbol() whose result converts to char.
  /// The argument is taken by value, so symbol() is called on a non-const copy.
  template<class _NucleotideTp>
  inline char NucleotideSymbol(_NucleotideTp nuc)
    {
    const char smb=nuc.symbol();
    return smb;
    }

}// end namespace

#endif
