// $Id: FileParser.h 628 2011-08-31 15:42:16Z ge $
/// \file FileParser.h
/// \brief Contains the FileParser class definition
///
/// $Revision: 628 $
/// \author Gerald Weber <gweberbh@gmail.com>
#ifndef FILEPARSER_H
#define FILEPARSER_H "$Id: FileParser.h 628 2011-08-31 15:42:16Z ge $"
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include "FileFormat.h"
#include "FastaFormat.h"
#include "GenbankFormat.h"
#include "IntegerFormat.h"
#include "NucleotideSequence.h"
#include "SequenceSet.h"

namespace gbc {

/// \brief Parses a file containing a sequence of type Nucleotide.
///
/// FileParser connects FileFormats with Nucleotides. The class itself knows nothing
/// about formats, these are handled by FileFormat (generic), FastaFormat, GenbankFormat,
/// IntegerFormat etc. All formats are stored in a std::map called format_map which
/// is initialized by the constructors in FileFormat.cpp.
/// When a file is opened each format is tested until the file
/// is identified by FileParser::analyse_file_type() and the file format string is stored
/// in FileParser::file_type.
/// If you need to access specific functions of a given file type, you need to access
/// them like this:
/// GenbankFormat* gen_point=static_cast<GenbankFormat*>(format("genbank"))
class FileParser
  {
  public:
    std::string file_type;                   ///< The file type, autodetected by FileParser::analyse_file_type.
    bool read_contig_at_once;                ///< Indicates if read should continue until the end of contig.
    bool ignore_contig_breaks;               ///< Indicates if the file should be read as if there were no contig breaks of any kind.
  protected:
    bool Stop_at_contig_end;                 ///< Indicates if we should stop reading when a contig break is found
  public:
    unsigned int Number_of_reads;             ///< Number of nucleotides successfully read
    typedef std::map<std::string,void*> format_map;
  protected:
    FileFormat*    generic_file;             ///< Points to an object of class FileFormat, may point also to any derived class.
    format_map file_formats;                 ///< Stores pointer to all format classes.
    
  public:
  /// The void constructor
  FileParser(void)
    :file_type("unknown"), read_contig_at_once(false), ignore_contig_breaks(false), Stop_at_contig_end(false)
    {
    setup_formats();
    generic_file=static_cast<FileFormat*>(format("fasta"));
    }

  /// Constructor which takes the file name as argument.
  FileParser(std::string fn)  ///< the file name
    :file_type("unknown"), read_contig_at_once(false), ignore_contig_breaks(false), Stop_at_contig_end(false)
    {
    setup_formats();
    generic_file=static_cast<FileFormat*>(format("fasta"));
    file_name(fn);
    }

  /// Setup all formats and stores them in FileParser::file_formats.
  void setup_formats(void);

  inline void format(std::string fname,
              void* fpointer)
    {
    file_formats[fname]=fpointer;
    }

  inline void* format(std::string fname)
    {
    return file_formats[fname];
    }
  
  inline FileFormat* generic(void)
    {
    return generic_file;
    }

  /// \brief Indicates that the file should be read backwards, and get the complementary of Nucleotide
  ///
  /// \attention Currently only FastaFormat supports this
  inline void reverse_strand(void)
    {
    generic()->forward_reading=false;
    generic()->get_complementary=true;
    }

  /// Sets the name of the file to be read.
  inline void file_name(const std::string &fn) ///< the file name
    {
    format_map::iterator fm;
    for(fm=file_formats.begin(); fm != file_formats.end(); fm ++) static_cast<FileFormat*>(fm->second)->file_name(fn);
    }

  /// \brief Gets the name of the file.
  /// \return A string with the file name.
  inline std::string file_name(void) {return generic()->file_name();}

  /// Go back to the beginning of the file.
  inline void rewind(void) {generic()->rewind();}

  /// Go back to the logical beginning of the genome file.
  inline void start_over(void) {generic()->start_over();}
  
  /// Returns the good() function of a file handle.
  inline bool good(void) {return generic()->good();}

  /// Returns the eof() function of a file handle.
  inline bool eof(void) {return generic()->eof();}

  /// Close the file.
  inline void close(void) {generic()->close();}
  
  /// Check if file is open
  inline bool is_open(void) {return generic()->is_open();}

  /// Get the stream position.
  inline std::streampos tellg(void) {return generic()->tellg();}

  /// Returns a pointer to FileFormat::Contig_table.
  inline ContigTable* contig_table(void) {return &(generic()->Contig_table);}

  /// \brief Informs if the begining of a new contig was detected.
  ///
  /// If FileParser::ignore_contig_breaks is set it will return awlays false.
  /// \returns true if at the beginning of a new contig, false if not or if FileParser::ignore_contig_breaks=true
  inline bool contig_begin(void) 
    {
    if (!ignore_contig_breaks) return generic()->Contig_begin;
    else return false;
    }

   /// Set to ignore all contigs.
  inline void ignore_all_contigs(void) {generic()->ignore_all_contigs();}

  /// Selects the contig to ignore.
  inline void ignore_contig(const std::string &tp, const std::string &st) ///< the contig description, must be exact
    {generic()->ignore_contig(tp,st);}

  /// Selects the contig to read.
  inline void select_contig(const std::string &tp, const std::string &st) ///< the contig description, must be exact
    {generic()->select_contig(tp,st);}

  /// Selects if we should read beyond the end of a contig or not.
  inline void stop_at_contig_end(bool s) {Stop_at_contig_end=s;}

  /// Informs if we should read beyond the end of a contig or not.
  inline bool stop_at_contig_end(void) const {return Stop_at_contig_end;}

  /// Checks if last read was inside a contig
  inline bool inside_contig(void)
    {
    return generic()->Contig_table.Contig_inside;
    }

  /// Checks the format of the input file and may retrieve a number of important parameters about the file.
  void analyse_file_type(void)
    {
    format_map::iterator fm;
    for(fm=file_formats.begin(); fm != file_formats.end(); fm ++) 
      {
      FileFormat* ff=static_cast<FileFormat*>(fm->second);
      ff->open();
      if (ff->check_file_type()) 
        {
        file_type=fm->first;
        generic_file=ff;
        break;
        }
      else {ff->close();}
      }
    if(fm == file_formats.end())
      {
      std::cerr << std::endl << "Format of " << generic()->file_name() << " unknown, exiting" << std::endl;
      exit(1);
      }
    }

  /// Opens the File, checks its type and prepares for reading.
  inline void open(void)
    {
    generic()->open();
    analyse_file_type();
    generic()->prepare_reading();
    }

  /// Same as open() but also sets the filename  
  inline void open(std::string filename)
    {
    file_name(filename);
    open();
    }

  /// Reads and parses the file already open by FileParser and puts the new information into Nucleotide.
  template<class _InternalTp>
  friend FileParser& operator>>(FileParser& fl,               ///< FileParser to read from
                                Nucleotide<_InternalTp> &nt)  ///< Nucleotide to be assigned to
    {
    fl.generic()->file_read(nt);
    return fl;
    }

  /// \brief This enables the contig-at-once read mode
  ///
  /// This functions sets the read_contig_at_once to true and performs
  /// immediately a mapping of contigs.
  ///
  /// \attention This is very experimental, but in any case make sure you
  /// are calling this function after opening the file.
  inline void contig_at_once(void)
    {
    read_contig_at_once=true;
    generic()->map_contigs();
    contig_table()->Current=contig_table()->begin();
    }

  template<class _InternalTp>
  inline void read_one_contig(NucleotideSequence<_InternalTp> &nts)
    {
    Number_of_reads=0;
    nts.shifted=false;
    nts.clear(); nts.cleared=true;
    nts.window_size=contig_table()->current()->Contig_section.size();
    Nucleotide<_InternalTp> nuc;
    while(!nts.full() || Stop_at_contig_end)
      {
      generic()->file_read(nuc);
      if (contig_begin())
        {
        if (Stop_at_contig_end && (Number_of_reads > 0)) break; //Stop reading.
        }
      if (nuc.good())
        {nts << nuc; Number_of_reads++;}
      else
        {break; nts.window_size=Number_of_reads;} //Nothing valid was read, stop here.
      }
    contig_table()->Current++;
    }

  template<class _InternalTp>
  /// \brief Extracts a NucleotideSequence
  ///
  friend FileParser& operator>>(FileParser& fl,             ///< the _FileTp from which to extract a sequence
                             NucleotideSequence<_InternalTp> &nts) ///< the sequence into which to save the readings
    {
    fl.Number_of_reads=0;
    nts.shifted=nts.cleared=false;
    if (fl.read_contig_at_once)
      {
      fl.read_one_contig(nts);
      return fl;
      }
    if (nts.full())
      {
      if (nts.shift_read) {nts.pop_front(); nts.shifted=true;}
      else                {nts.clear(); nts.cleared=true;}
      }
    Nucleotide<_InternalTp> nuc;
    while(!nts.full() || fl.Stop_at_contig_end)
      {
      fl >> nuc;
      if (fl.contig_begin())
        {
        if (fl.Stop_at_contig_end && (fl.Number_of_reads > 0)) break; //Stop reading.
        // if a new contig begins, throw away everything we've read before
        nts.clear(); nts.shifted=false;
        }
      if (nuc.good())
        {nts << nuc; fl.Number_of_reads++;}
      else
        {nts.clear(); nts.shifted=false; break;} //Nothing valid was read, stop here.
      }
    return fl;
    }

  template<class _Sequence, class _Container>
  /// \brief Extracts all NucleotideSequence in file into SequenceSet.
  ///
  friend FileParser& operator>>(FileParser& fl,                        ///< the FileParser from which to extract a sequence
                                SequenceSet<_Sequence,_Container> &st) ///< the SequenceSet into which to save the readings
    {
    typename SequenceSet<_Sequence,_Container>::sequence_type ns=st.DefaultNS;
    if (fl.read_contig_at_once)
      {
      while(fl.contig_table()->Current!=fl.contig_table()->end())
        {
          fl >> ns;
          if(fl.contig_begin()) ns.window_size=fl.contig_table()->current()->Contig_section.size();
          if (ns.good()) st << ns;
        }
     }
    else
      {
      while(!fl.eof())
        {
        fl >> ns;
        if (ns.good()) st << ns; 
        }
      }
    return fl;
    }
 };
};

#endif
