// $Id: GenbankFormat.h 1101 2020-04-21 18:21:41Z ge $
/// \file GenbankFormat.h
/// \brief Contains the base class GenbankFormat
///
/// $Revision: 1101 $
/// \author Gerald Weber <gweberbh@gmail.com>
#ifndef GENBANKFORMAT_H
#define GENBANKFORMAT_H "$Id: GenbankFormat.h 1101 2020-04-21 18:21:41Z ge $"
#include "FileFormat.h"

namespace gbc {

/// \brief Parses a file in Genbank flat file format.
///
/// The file format is outlined in http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html
/// and http://www.ncbi.nlm.nih.gov/projects/collab/FT/index.html
class GenbankFormat: public FileFormat
  {
  public:
  std::streampos Genbank_origin;   ///< The actual position in the file where the keyword ORIGIN is found.
  std::streampos Features_pos;     ///< The actual position in the file where the keyword FEATURES is found.
  std::string Organism;            ///< The value of the ORGANISM keyword.

  /// \brief The void constructor. Also sets the genbank keywords (starting with /).
  GenbankFormat(void);

  /// \brief Constructor which takes the filename as argument.
  ///
  /// \attention NOTE(review): the statement in the body constructs and immediately
  /// discards a temporary GenbankFormat; it does not run the void constructor on this
  /// object. If the keyword setup done by the void constructor is stored per object,
  /// it is therefore missing here. It is kept unchanged because the void constructor
  /// is defined elsewhere and may have side effects on shared state -- confirm and
  /// replace with a shared initialisation helper.
  GenbankFormat(const std::string &fn) ///< the file name
    : FileFormat(fn) {GenbankFormat();}

  /// \brief Checks the file type by looking for a LOCUS word at the beginning of the file.
  virtual bool check_file_type(void);

  /// \brief Prepares for reading by finding the position of the keyword ORIGIN and jumping to the next line.
  virtual void prepare_reading(void);

  /// \brief Goes to the position of the keyword ORIGIN and jumps to the next line.
  void goto_origin(void);

  /// \brief Goes back to the logical beginning of the genome file, i.e. calls goto_origin().
  virtual void start_over(void) {goto_origin();}

  /// \brief NOTE(review): undocumented in the original header. Judging by the signature
  /// it presumably populates \a lcs with the sections found for \a feature -- confirm
  /// against the implementation.
  void get_section(std::list<ContigSection> &lcs, std::string& feature );

  /// \brief Get the location where the Contig starts/ends.
  ///
  /// This function gets the location of a ContigSection by parsing the line where the keyword is located.
  /// The location is given in terms of relative Nucleotide position.
  /// A contig can have several sections which need to be joined.
  /// This process is currently not entirely safe.
  /// \todo Currently this function can not parse a join keyword if the locations span more than one
  /// line.
  void get_location(std::list<ContigSection> &lcs, ///< list of ContigSection which is populated as new sections are found
                    bool &complementary);     ///< will be true if the section is located in the complementary strand

  /// \brief Finds the next occurrence of a Genbank keyword and returns the file
  /// position immediately after this keyword.
  ///
  /// This function does not change the position of the file pointer.
  std::streampos find_keyword(const std::string &keyword);  ///< the keyword of a Genbank section, for example ``ORIGIN'' or ``Gene''

  /// \brief Get any feature key located within the first 20 columns.
  ///
  /// \attention This function scans the current line only.
  std::string feature_key(void);

  /// \brief Maps all contigs in a file and places them into FileParser::Contig_table.
  virtual void map_contigs(const std::string key=std::string()); ///< the key may be a specific feature of the genbank file, if empty all features are selected

  /// \brief Checks if we are at the begin of a new contig and sets Contig_begin=true if this is the case.
  ///
  /// \attention This function is only meaningful after calling map_contigs()
  virtual void verify_contig_begin(void);

  /// \brief Specialized function for reading files in genbank flat file format.
  ///
  /// Simply forwards to genbank_read().
  virtual void read(Nucleotide<> &nt) ///< Nucleotide to be assigned to
    {genbank_read(nt);}

  /// \brief Specialized function for reading files in genbank flat file format.
  ///
  /// Reads characters from the current line, skipping its first 10 positions, until
  /// one of the following happens:
  /// - a character known to \a nt.dictionary is found: it is assigned to \a nt,
  ///   \a nt.Good becomes true and the sequence position is advanced. If Contig_table
  ///   is not empty, positions for which Contig_table.ignore() returns true are
  ///   skipped and verify_contig_begin() is called for the accepted one;
  /// - any other non-blank character is found: \a nt becomes '#' with \a nt.Good false;
  /// - eof() becomes true: \a nt.Good stays false.
  template<class NucleotideValue>
  void genbank_read(Nucleotide<NucleotideValue> &nt) ///< Nucleotide to be assigned to
    {
    const std::streampos sequence_column(10); // we skip 10 positions at the start of each line

    Contig_begin=false;
    bool isgood=false;
    nt.Good=false;
    if (the_line.eof()) {get_next_line();}
    while (!eof() && !isgood)
      {
      char cr=' '; // stays blank when the extraction below fails
      if (the_line.tellg()<=sequence_column) {the_line.seekg(sequence_column);}
      the_line >> cr;
      // A line shorter than 10 characters makes seekg() fail, which sets failbit but not
      // eofbit; testing eof() alone would then never advance and this loop would not end.
      // This assumes get_next_line() resets the state of the_line, as it already has to
      // after a failed extraction at the end of a line.
      if (the_line.eof() || the_line.fail()) {get_next_line();}
      if (cr==' ') {continue;} // we skip blanks

      if (nt.dictionary.index_map.find(cr) != nt.dictionary.index_map.end())
        {
        nt=cr; nt.Good=true;
        Last_sequence_position.stream_position(tellg());
        Last_sequence_position.Nucleotide_position++;
        nt.Sequence_position=Last_sequence_position;
        if (Contig_table.empty())
          {
          isgood=true;
          Contig_begin=false;
          }
        else
          {
          // positions outside of the mapped contigs are read but not handed to the caller
          if (!Contig_table.ignore(Last_sequence_position.Nucleotide_position))
            {
            isgood=true;
            verify_contig_begin();
            }
          }
        }
      else
        {
        nt='#'; // everything else becomes a '#'
        nt.Good=false;
        nt.Sequence_position.stream_position(tellg());
        isgood=true;
        }
      }
    }


  /// \brief Check for the end of the file.
  ///
  /// Returns true if the file handle reports end-of-file, or if the current line
  /// starts with //. Only the first two characters of the line are tested.
  virtual bool eof(void)
    {
    if (file_handle.eof()) {return true;}
    // compare() clamps the range to the line length, so shorter lines are safe
    return the_line.str().compare(0,2,"//")==0;
    }
 };
};

#endif






