// $Id: NucleotideDictionary.h 1312 2022-07-08 21:37:59Z ge $
// Copyright 2006 Gerald Weber
/// \file NucleotideDictionary.h
/// \brief Class which handles conversions of symbols to internal representations
///
/// $Revision: 1312 $
/// \author Gerald Weber <gweberbh@gmail.com>
#ifndef GBC_NUCLEOTIDEDICTIONARY_H
#define GBC_NUCLEOTIDEDICTIONARY_H "$Id: NucleotideDictionary.h 1312 2022-07-08 21:37:59Z ge $"
#include <fstream>
#include <map>
#include <list>
#include <complex>
#include "ErrorCodes.h"

namespace std {
/// \brief Ordering for std::complex so that it can be used as a key of std::map.
///
/// std::complex has no natural ordering, therefore the values are compared
/// lexicographically: first by real part, then by imaginary part.
/// Unlike a comparison which always returns true, this is a strict weak
/// ordering (irreflexive and asymmetric), which is what std::map requires for
/// find(), operator[] and insert() to behave consistently.
/// \attention The operator lives in namespace std so that std::less can find it
/// through argument dependent lookup. It carries no mathematical meaning.
/// \return true if cx1 is ordered before cx2
template<class _Tp>
bool operator<(const std::complex<_Tp>& cx1, const std::complex<_Tp>& cx2)
  {
  if (cx1.real() < cx2.real()) return true;
  if (cx2.real() < cx1.real()) return false;
  return cx1.imag() < cx2.imag(); // real parts are equivalent, imaginary part decides
  }}

namespace gbc {
  
/// \brief Predefined symbol sets which NucleotideDictionary::populate can load.
///
/// In the <int,char> and <char,char> specializations of populate:
/// empty adds nothing, generic and DNA share the same rules (A,C,G,T),
/// RNA uses U in place of T, IUPAC adds the ambiguity codes (R,Y,S,W,K,M,B,D,H,V,N),
/// and Math is reported as not implemented.
enum class NucleotideCode { empty, generic, DNA, RNA, IUPAC, Math };


/// \brief Conversion dictionary between symbols and internal representation of Nucleotide.
///
/// Implements a conversion dictionary for the symbols which are being read
/// from the data files to some internal representation _RepTp.
/// In the simplest case _RepTp could be char or int, but it can also be some
/// more elaborate structure.
/// Each _IndexTp symbol appears only once, while several symbols may share the same _RepTp value.
/// For instance both 'A' and 'a' could be converted to, say, 1. Example: add('A',1) and afterwards add('a',1).
/// But 1 will always be converted back to the symbol that was defined first, in this case 'A',
/// unless add() is called with replace_inverse=true.
/// In summary, 'A' -> 1, 'a' -> 1, inverse: 1 -> 'A' but never 'a'.
  
template<class _RepTp=char, class _IndexTp=char>
class NucleotideDictionary
  {
  public:
    typedef _RepTp rep_type;                                     ///< Internal representation of a nucleotide
    typedef _IndexTp    index_type;                              ///< Symbol (index) as read from input, usually char
    typedef std::map<index_type,rep_type> index_map_type;        ///< index -> representation
    typedef std::map<rep_type,index_type> rep_map_type;          ///< representation -> index
    typedef std::map<rep_type,rep_type> complementary_map_type;  ///< representation -> complementary representation
    typedef NucleotideDictionary<rep_type,index_type> DictionaryType;
    typedef std::list<index_type> char_list_type;

    index_map_type index_map; ///< Forward map, holds one entry per known index
    rep_map_type rep_map; ///< Map for efficient inverse lookup
    complementary_map_type complementary_map; ///< Map for symbols that are complementary

    bool Debug; ///< Debug flag, set to true for seeing more information (not read by any method of this class)

    /// \brief Void constructor, calls populate() with its default argument.
    NucleotideDictionary(void) : Debug(false) { populate(); }

    /// \brief Constructor which populates the dictionary from a predefined NucleotideCode.
    NucleotideDictionary(NucleotideCode NC) : Debug(false) { populate(NC); }

//     /// \brief Void constructor.
//     NucleotideDictionary(bool t) : Debug(false) {};

    /// \brief Constructor which reads a dictionary file.
    ///
    /// The dictionary file is read immediately and only once.
    NucleotideDictionary(const char dict_name[]) : Debug(false)
      {
      read_dictionary(dict_name);
      }

    /// \brief Constructor which defines the dictionary conversions.
    ///
    /// This constructor gets rid of the need for a dictionary file.
    /// A:T and C:G are registered as complementary pairs.
    NucleotideDictionary(const rep_type& N, ///< value for unknown (char='#')
                         const rep_type& A, ///< value for adenine
                         const rep_type& C, ///< value for cytosine
                         const rep_type& G, ///< value for guanine
                         const rep_type& T) ///< value for thymine
                         : Debug(false)
      {
      add('#',N);
      add('A',A); add('C',C); add('G',G); add('T',T);
      add('a',A); add('c',C); add('g',G); add('t',T); //This guarantees that lowercase input is converted to uppercase
      set_complementary_rep(A,T);
      set_complementary_rep(C,G);
      }

    /// \brief Fills the dictionary with a predefined set of rules.
    ///
    /// The generic template adds nothing, the argument is ignored.
    inline void populate(NucleotideCode NC=NucleotideCode::empty) {}

    /// \brief Adds new conversion rules to the dictionary.
    ///
    /// The rule index -> representation is always stored, replacing (with a warning)
    /// a previous rule for the same index. The inverse rule representation -> index
    /// is stored only if the representation had no inverse yet, or if replace_inverse is true.
    inline void add(const index_type& smb,       //Indexing, usually char
                    const rep_type& cnv,    //Representation
                    bool  replace_inverse=false //If true, inverse will be replaced if it already exists
                    )
      {
      CERR_DDEBUG(DNUD_SMBCNV) << "index=" << smb << " rep=" << cnv << std::endl;

      // Forward rule, a single lookup serves both the warning and the update
      const auto known=index_map.find(smb);
      if (known == index_map.end()) index_map[smb]=cnv;
      else
        {
        if (known->second != cnv)
          {
          CERR_WARN(WRIIWR) << "replacing internal index " << smb << " with representation " << cnv
                            << " previous representation was " << known->second << std::endl;
          }
        known->second=cnv;
        }

      // Inverse rule, the first index defined for a representation wins unless told otherwise
      const auto inverse=rep_map.find(cnv);
      if (inverse == rep_map.end()) rep_map[cnv]=smb;
      else if (replace_inverse)
        {
        if (inverse->second != smb)
          {
          CERR_WARN(WRIIWR) << "replacing representation " << cnv << " with internal index " << smb
                            << " previous internal index was " << inverse->second << std::endl;
          }
        inverse->second=smb;
        }
      }

    /// \brief Removes all conversion rules and all complementary pairs.
    inline void clear_all(void)
      {
      index_map.clear();
      rep_map.clear();
      complementary_map.clear();
      }

    /// \brief Adds new pair of complementary representations.
    // This assumes that the complementarity is symmetric, e.g., A is complementary to T, while T is complementary to A.
    // However, this does not work for hybrids like DNA/RNA.
    // ATTENTION existing complementarities will be replaced
    inline void set_complementary_rep(const rep_type& fst, ///< First internal symbol
                                      const rep_type& snd) ///< Complementary internal symbol
      {
      rep_type previous=rep_type();
      if (overrides_complementary(fst,snd,previous))
        {
        CERR_WARN(WRCF) << "replacing complementary for " << fst << ":" << snd
                        << " previous pair was " << fst << ":" << previous <<  std::endl;
        }

      complementary_map[fst]=snd;
      complementary_map[snd]=fst;
      }

    /// \brief Adds new pair of complementary symbols given as indices.
    ///
    /// Both indices are converted with index2rep and the resulting representations
    /// are stored symmetrically. Existing complementarities will be replaced.
    inline void set_complementary_index(const index_type& fst, ///< First index symbol
                                        const index_type& snd) ///< Complementary index symbol
      {
      const rep_type rep_fst=index2rep(fst);
      const rep_type rep_snd=index2rep(snd);

      rep_type previous=rep_type();
      if (overrides_complementary(rep_fst,rep_snd,previous))
        {
        CERR_WARN(WRCF) << "replacing complementary for " << fst << ":" << snd
                        << " previous pair was " << fst << ":" << previous << std::endl;
        }

      complementary_map[rep_fst]=rep_snd;
      complementary_map[rep_snd]=rep_fst;
      }

    // Legacy function, use set_complementary_rep instead
    inline void complementary_pair(const rep_type& fst, ///< First internal symbol
                                   const rep_type& snd) ///< Complementary internal symbol
      {
      set_complementary_rep(fst,snd);
      }


    /// \brief Conversion to complementary symbols found in complementary_map.
    ///
    /// \return Returns the complementary symbol of type char.
    inline char complementary_symbol(const char& fst) const ///< Symbol to convert
      {
      return rep2index(complementary(index2rep(fst)));
      }

    /// \brief Looks up the complementary of a representation.
    ///
    /// \return The complementary representation, or rep_type() if there is none.
    inline rep_type complementary(const rep_type& cnv,            ///< Symbol for which to find the complementary
                                bool  complain=true) const    ///< If true, issue a warning when complementary not found
      {
      const auto it=complementary_map.find(cnv);
      if (it != complementary_map.end()) return it->second;

      if (complain)
        {
        CERR_WARN(WCRNF) << "complementary representation for \"" << cnv <<  "\" not found " << std::endl;
        }
      return rep_type();
      }

    /// \brief Removes conversion rules from the dictionary
    ///
    /// Removes the rule for smb, the inverse rule of its representation and the
    /// complementary pair that representation takes part in.
    /// A symbol which is not in the dictionary is ignored; it must not be resolved
    /// through the '#' fallback of index2rep, as this would remove the rules of '#'.
    inline void rm(const char& smb)   ///< Symbol to be removed
      {
      const auto entry=index_map.find(smb);
      if (entry == index_map.end()) return;

      const rep_type fst=entry->second;

      // The complementary map is keyed by representation, so the partner has to
      // be erased by its representation and not by its index.
      const auto partner=complementary_map.find(fst);
      if (partner != complementary_map.end())
        {
        const rep_type snd=partner->second;
        complementary_map.erase(partner);
        complementary_map.erase(snd); // no-op if snd==fst or if there is no back reference
        }

      rep_map.erase(fst);
      index_map.erase(entry);
      }

    /// \brief Returns the index_type of an rep_type
    ///
    /// Uses rep_map if it holds any entry, otherwise searches index_map linearly.
    /// \return The index, or index_type() if the representation is unknown.
    inline index_type rep2index(const rep_type& cnv) const ///< object of type rep_type to be converted
      {
      if (not rep_map.empty()) //Not all types of NucleotideDictionary do have an inverse map
        {
        const auto found=rep_map.find(cnv);
        if (found != rep_map.end()) return found->second;
        CERR_DDEBUG(DNUD_NISFF) << "no inverse symbol found for \"" << cnv << "\" " << std::endl;
        }
      else
        {
        for (const auto& rule : index_map)
          {
          if (rule.second==cnv) return rule.first;
          }
        }
      return index_type();
      }

    // Legacy function, calls rep2index
    inline index_type invert(const rep_type& cnv) const ///< object of type rep_type to be converted
      {
      return rep2index(cnv);
      }

    /// \brief True if rep has an entry in the inverse map rep_map.
    inline bool exists_rep(const rep_type& rep) const
      {
      return rep_map.find(rep) != rep_map.end();
      }

    /// \brief True if ind has a conversion rule in index_map.
    inline bool exists_index(const index_type& ind) const
      {
      return index_map.find(ind) != index_map.end();
      }

    /// \brief Adds an index which is not yet known, choosing a free representation for it.
    ///
    /// The representation is the first one, counting up from 'A', which is not in rep_map.
    /// \return false if the index already exists (nothing is changed), true otherwise.
    inline bool add_new_index(const index_type& ind)
      {
      if (exists_index(ind)) return false;

      rep_type rep='A';
      while (exists_rep(rep)) {rep++;}
      add(ind,rep);
      CERR_DDEBUG(DNUD_ADDNEW) << "added new representation=[" << rep << "] as index=[" << ind << "]" << std::endl;
      return true;
      }

    /// \brief Translate _IndexT to the rep_type
    ///
    /// This looks up the translation map, if not found returns the representation of '#',
    /// and if '#' is not defined either returns rep_type().
    inline rep_type index2rep(const index_type& cr) const ///< char to be translated
      {
      auto fnd=index_map.find(cr);
      if (fnd != index_map.end()) return fnd->second;

      fnd=index_map.find('#');
      if (fnd != index_map.end()) return fnd->second;

      return rep_type();
      }

    /// \brief Legacy function, calls index2rep
    inline rep_type translate(const index_type& cr) const ///< char to be translated
      {
      return index2rep(cr);
      }

    /// \brief Extractor which loads a whole dictionary from file.
    ///
    /// Use like: file >> dict;
    /// Pairs of index and representation are read until an extraction fails.
    /// A rule is added only if both values were actually read, so trailing
    /// whitespace or an incomplete last record never adds a rule.
    friend std::istream& operator>>(std::istream &fl, DictionaryType &dict)
      {
      index_type symbol=index_type();
      rep_type conversion=rep_type();
      while (fl >> symbol >> conversion)
        {
        dict.add(symbol,conversion);
        }
      return fl;
      }

    /// \brief Writes the contents of the dictionary to ostream.
    ///
    /// One line per rule of index_map: index, a blank, representation.
    friend std::ostream& operator<<(std::ostream& os, const DictionaryType &dict)
      {
      for (const auto& rule : dict.index_map)
        os << rule.first << " " << rule.second << std::endl;
      return os;
      }

    /// \brief Reads a dictionary file
    ///
    /// Failure to open the file is reported through CERR_IERROR. Stream exceptions
    /// are enabled only while opening, the rules are read with exceptions disabled.
    void read_dictionary(const char dict_name[]) ///< name of the dictionary file
      {
      std::ifstream dictionary_file;
      dictionary_file.exceptions(std::ios_base::failbit | std::ios_base::badbit);
      try {dictionary_file.open(dict_name);}
      catch (const std::ios_base::failure& error)      {CERR_IERROR(IERRSIBF) << error.what() << std::endl;}
      catch (...)                       {CERR_IERROR(IERRRDUE) << "Unknown exception" << std::endl;}
      dictionary_file.exceptions(std::ios_base::goodbit);
      dictionary_file >> (*this);
      dictionary_file.close();
      }

    /// \brief Lists all characters known to be valid.
    ///
    /// For every rule except '#', the index obtained by converting the representation
    /// back with rep2index is collected. The list is sorted and has no duplicates.
    char_list_type valid_characters(void) const
      {
      char_list_type char_list;
      for (const auto& rule : index_map)
        {
        if (rule.first != '#') char_list.push_back(rep2index(rule.second));
        }
      char_list.sort();
      char_list.unique();
      return char_list;
      }

  private:
    /// \brief Tells if pairing fst with snd changes an existing complementarity.
    ///
    /// If neither fst nor snd has an entry in complementary_map nothing is overridden.
    /// Otherwise previous receives the current complementary of fst, or rep_type()
    /// if only snd has an entry.
    /// \return true if an entry exists and previous differs from snd
    bool overrides_complementary(const rep_type& fst, const rep_type& snd, rep_type& previous) const
      {
      const auto had_fst=complementary_map.find(fst);
      const bool fst_known=(had_fst != complementary_map.end());
      if (not fst_known and complementary_map.find(snd) == complementary_map.end()) return false;

      previous=fst_known ? had_fst->second : rep_type();
      return previous != snd;
      }

  };
  
  //Specializations
  
   /// \brief Specialization of add_new_index for int representations.
   ///
   /// The new index receives the smallest integer, counting up from 1, which is
   /// not yet present in rep_map.
   /// \return false if the index already exists (nothing is changed), true otherwise.
   template<>
    inline bool NucleotideDictionary<int,char>::add_new_index(const index_type& ind)
      {
      // Known indices are left untouched
      if (exists_index(ind)) return false;

      rep_type candidate=1;
      for (; exists_rep(candidate); candidate++) {}

      add(ind,candidate);
      CERR_DDEBUG(DNUD_ADDNEW) << "added new representation=[" << candidate << "] as index=[" << ind << "]" << std::endl;
      return true;
      }
   
   /// \brief Specialization of populate for int representations indexed by char.
   ///
   /// - empty: adds nothing.
   /// - generic, DNA: '#'=0, A=1, C=2, G=3, T=4, with A:T and C:G complementary.
   /// - RNA: same as DNA with U in place of T.
   /// - IUPAC: A=1, C=10, G=100, T=1000, and every ambiguity code is the sum of
   ///   the bases it stands for, e.g. R (A or G) is 101 and N (any base) is 1111.
   /// - Math: not implemented, an error is reported.
   ///
   /// Lowercase symbols always receive the representation of their uppercase
   /// counterpart. Uppercase is added first and therefore is the symbol which
   /// rep2index returns.
   template<>
   inline void NucleotideDictionary<int,char>::populate(NucleotideCode NC)
     {
     switch (NC)
       {
       case NucleotideCode::empty   : break;
       case NucleotideCode::generic :
       case NucleotideCode::DNA     :
         add('#',0);
         add('A',1); add('C',2); add('G',3); add('T',4);
         add('a',1); add('c',2); add('g',3); add('t',4); //This guarantees that lowercase input is converted to uppercase
         set_complementary_index('A','T');
         set_complementary_index('C','G');
         break;

       case NucleotideCode::RNA :
         add('#',0);
         add('A',1); add('C',2); add('G',3); add('U',4);
         add('a',1); add('c',2); add('g',3); add('u',4); //This guarantees that lowercase input is converted to uppercase
         set_complementary_index('A','U');
         set_complementary_index('C','G');
         break;

       case NucleotideCode::IUPAC :
         {
         // One decimal digit per base, sums therefore identify the set of bases
         const int A=1, C=10, G=100, T=1000;

         // The lowercase symbol must get the representation of the uppercase one.
         // Looking it up with rep2index would treat the character as a
         // representation, find nothing and map every lowercase symbol to 0.
         add('A',A);       add('a',A);
         add('C',C);       add('c',C);
         add('G',G);       add('g',G);
         add('T',T);       add('t',T);
         add('R',A+G);     add('r',A+G);     //A or G
         add('Y',C+T);     add('y',C+T);     //C or T
         add('S',G+C);     add('s',G+C);     //G or C
         add('W',A+T);     add('w',A+T);     //A or T
         add('K',G+T);     add('k',G+T);     //G or T
         add('M',A+C);     add('m',A+C);     //A or C
         add('B',C+G+T);   add('b',C+G+T);   //C or G or T
         add('D',A+G+T);   add('d',A+G+T);   //A or G or T
         add('H',A+C+T);   add('h',A+C+T);   //A or C or T
         add('V',A+C+G);   add('v',A+C+G);   //A or C or G
         add('N',A+C+G+T); add('n',A+C+G+T); //any base

         set_complementary_index('A','T');
         set_complementary_index('C','G');
         set_complementary_index('R','Y');
         set_complementary_index('W','S');
         set_complementary_index('K','M');
         set_complementary_index('B','V');
         set_complementary_index('D','H');
         set_complementary_index('N','N');
         break;
         }

       case NucleotideCode::Math :
         CERR_IERROR(IERRNNMIF) << "no NucleotideCode::Math implementation for NucleotideDictionary<int,char> " << std::endl;
         CERR_TERM

       default: break;
       }
     }
  
   /// \brief Specialization of populate for char representations indexed by char.
   ///
   /// Every uppercase symbol is represented by itself and every lowercase symbol
   /// by its uppercase counterpart. For generic, DNA and RNA the symbol '#' is
   /// represented by 'N'. Nothing is added for empty, and Math is reported as
   /// not implemented.
   template<>
   inline void NucleotideDictionary<char,char>::populate(NucleotideCode NC)
     {
     // Adds the rule symbols[i] -> reps[i] for each character, both strings have the same length
     const auto add_each=[this](const char* symbols, const char* reps)
       {
       for (; *symbols != '\0'; ++symbols, ++reps) add(*symbols,*reps);
       };

     // Registers firsts[i] and seconds[i] as complementary, both strings have the same length
     const auto pair_each=[this](const char* firsts, const char* seconds)
       {
       for (; *firsts != '\0'; ++firsts, ++seconds) set_complementary_index(*firsts,*seconds);
       };

     switch (NC)
       {
       case NucleotideCode::empty :
         break;

       case NucleotideCode::generic :
       case NucleotideCode::DNA :
         add('#','N');
         add_each("ACGT","ACGT");
         add_each("acgt","ACGT"); //This guarantees that lowercase input is converted to uppercase
         pair_each("AC","TG");
         break;

       case NucleotideCode::RNA :
         add('#','N');
         add_each("ACGU","ACGU");
         add_each("acgu","ACGU"); //This guarantees that lowercase input is converted to uppercase
         pair_each("AC","UG");
         break;

       //https://www.bioinformatics.org/sms/iupac.html
       case NucleotideCode::IUPAC :
         {
         // Bases A,C,G,T followed by the ambiguity codes:
         // R=A|G Y=C|T S=G|C W=A|T K=G|T M=A|C B=C|G|T D=A|G|T H=A|C|T V=A|C|G N=any
         const char upper[]="ACGTRYSWKMBDHVN";
         const char lower[]="acgtryswkmbdhvn";
         for (const char *up=upper, *lo=lower; *up != '\0'; ++up, ++lo)
           {
           add(*up,*up); // uppercase first, it becomes the inverse of the representation
           add(*lo,*up);
           }
         pair_each("ACRWKBDN","TGYSMVHN");
         break;
         }

       case NucleotideCode::Math :
         CERR_IERROR(IERRNNMIF) << "Error: no NucleotideCode::Math implementation for NucleotideDictionary<char,char> " << std::endl;
         CERR_TERM

       default: break;
       }
     }
     
   /// \brief Specialization of add for type complex<int>.
   ///
   /// Only the forward rule index -> representation is stored in index_map.
   /// The inverse map is not touched and replace_inverse is ignored.
   template<>
   inline void NucleotideDictionary<std::complex<int>,char>::add(const char& smb,const std::complex<int>& cnv, bool /*replace_inverse*/)
     {
     const index_map_type::iterator slot=index_map.find(smb);
     if (slot != index_map.end()) slot->second=cnv;                   // known index, overwrite
     else index_map.insert(index_map_type::value_type(smb,cnv));      // new index
     }

   /// \brief Specialization of add for type complex<double>.
   ///
   /// Only the forward rule index -> representation is stored in index_map.
   /// The inverse map is not touched and replace_inverse is ignored.
   template<>
   inline void NucleotideDictionary<std::complex<double>,char>::add(const char& smb,const std::complex<double>& cnv, bool /*replace_inverse*/)
     {
     // Try to insert first, and overwrite if the index was already there
     const auto outcome=index_map.insert(index_map_type::value_type(smb,cnv));
     if (not outcome.second) outcome.first->second=cnv;
     }

     /// \brief Specialization of complementary for type complex<int>.
     ///
     /// The complementary is the complex conjugate of the representation.
     /// complementary_map is not consulted and complain is ignored.
     /// \return conj(cnv)
     template<>
     inline std::complex<int> NucleotideDictionary<std::complex<int>,char>::complementary(const std::complex<int>& cnv, bool /*complain*/) const
       {
       const std::complex<int> conjugate=conj(cnv);
       return conjugate;
       }
      

}
#endif
