/*
Copyright 2013 Cameron Palmer

This file is a part of Genezip.

Genezip is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Genezip is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Genezip.  If not, see <http://www.gnu.org/licenses/>
*/

/*!
  \file genezip_dataset.h
  \brief public interface to genezip library
 */
#ifndef __GENEZIP__GENEZIP_DATASET_H__
#define __GENEZIP__GENEZIP_DATASET_H__

#include <vector>
#include <stdexcept>
#include <sstream>
#include <cmath>
#include "genezip/config.h"
#include "genezip/compression_handler.h"
#include "genezip/file_io_handler.h"
#include "genezip/imputed_data_file_format.h"
#include "genezip/individual.h"
#include "genezip/smart_pointers.h"
#include "genezip/snp.h"

//! genezip namespace wrapper to prevent type conflicts
namespace genezip {
  //! enumerates the on-disk layouts of imputed data this library recognizes
  typedef enum {
    //! probability triplets in .gen-type file
    IMPUTE2 = 0,
    //! two alleles per type per sample in .mlgeno-type file
    MACH_MLGENO = 1,
    //! single dosage per type per sample in .mldose-type file
    MACH_MLDOSE = 2,
    //! two probabilities per type per sample in .mlprob-type file
    MACH_MLPROB = 3,
    //! two alleles per type per sample, phased
    BEAGLE_PHASED = 4,
    //! single dosage per type per sample
    BEAGLE_DOSE = 5,
    //! probability triplets
    BEAGLE_GPROBS = 6,
    //! two alleles per type per sample, unphased
    PLINK_PED = 7,
    //! NULL-type value for uninitialized data
    UNKNOWN_FILE_FORMAT = 8
  } IMPUTED_DATA_FILE_FORMAT;
  //! translate a file format enumerator into its formal format specification
  //! \param f enumerator naming the file format
  //! \return object holding the corresponding column/format specifications
  imputed_data_file_format
  disambiguate_file_format(IMPUTED_DATA_FILE_FORMAT f);

  //! enumerates the ways genotype data may be represented in memory
  typedef enum {
    //! keep one genotype per entry: the max type when it is >= THRESHOLD,
    //! missing otherwise
    THRESHOLDED_GENOTYPES = 0,
    //! keep one genotype per entry, obtained by rounding the dosage to
    //! the nearest genotype
    ROUNDED_GENOTYPES = 1,
    //! keep 2*P1 + P2, the double-precision number of ALLELE1
    //! at a site for a sample
    DOSAGE = 2,
    //! keep the first two probabilities (H-A1 and HET) of the posterior
    //! for each site/sample
    PROBABILITIES = 3,
    //! NULL-type value for uninitialized data
    UNKNOWN_STORAGE_FORMAT = 4
  } IMPUTED_DATA_STORAGE_FORMAT;
  //! public interface class to genezip library
  class dataset {
  public:
    /*! convenience typedef of vector of unspooled imputed data */
    typedef std::vector<double> imputed_vector;
    /*! convenience typedef of const iterator across unspooled data vector */
    typedef imputed_vector::const_iterator const_imputed_iterator;
    /*! convenience typedef of iterator across unspooled data vector */
    typedef imputed_vector::iterator imputed_iterator;
      
    //! default constructor
    dataset()
      : _storage_format(UNKNOWN_STORAGE_FORMAT),
      _decimal_precision(4),
      _posterior_call_threshold(0.9),
      _getline_current_line(0),
      _genotypes_row_dimension(0),
      _genotypes_column_dimension(0) {}
    //! constructor
    //! \param decimal_precision number of digits after decimal
    //! to be stored
    dataset(unsigned decimal_precision)
      : _storage_format(UNKNOWN_STORAGE_FORMAT),
      _decimal_precision(decimal_precision),
      _posterior_call_threshold(0.9),
      _getline_current_line(0),
      _genotypes_row_dimension(0),
      _genotypes_column_dimension(0) {}
    //! destructor
    ~dataset() throw() {}

    //! get a line; starts at beginning of file, walks through to end
    //! \param target vector destination for extracted line
    //! \return whether a line was successfully found (false for EOF)
    bool getline(std::vector<double> &target) {
      if (_getline_current_line >= genotype_row_dimension())
	return false;
      unspool(_getline_current_line);
      ++_getline_current_line;
      target = _temporary_object;
      return true;
    }
    //! queue up the dataset to a particular row of input data
    /*!
      \param index vector index on [0, nrows) to which the dataset
      should be queued
     */
    void unspool(unsigned index);
    //! get the entire current unspooled row for external processing 
    //! \return copy of the current unspooled row
    imputed_vector get_current_row() const {
      return _temporary_object;
    }
    //! access first element of unspooled data vector (const)
    //! \return const_iterator to the beginning of the current unspooled row
    const_imputed_iterator first_element() const {
      return _temporary_object.begin();
    }
    //! access first element of unspooled data vector
    //! \return iterator to the beginning of the current unspooled row
    imputed_iterator first_element() {
      return _temporary_object.begin();
    }
    //! get end bound of unspooled data vector (const)
    //! \return const_iterator to the end of the current unspooled row
    const_imputed_iterator last_element() const {
      return _temporary_object.end();
    }
    //! get end bound of unspooled data vector
    //! \return iterator to the end of the current unspooled row
    imputed_iterator last_element() {return _temporary_object.end();}

    //! read imputed data from file into memory in compressed format
    /*!
      \param filestem prefix of files to be read
      \param file_format description of file format, from a list of possible
      accepted formats
      \param storage_format how should the data be stored in memory, balancing
      precision with RAM requirements
      \param genotypes_gz whether the input genotype data are gzipped on disk
      \param legends_gz whether the input SNP metadata are gzipped on disk
     */
    void read_data_from_file(const std::string          &filestem,
			     IMPUTED_DATA_FILE_FORMAT    file_format,
			     IMPUTED_DATA_STORAGE_FORMAT storage_format,
			     bool                        genotypes_gz = true,
			     bool                        legends_gz = false);
    //! read imputed data from file into memory in compressed format
    /*!
      \param filestem prefix of files to be read
      \param file_format description of file format, class version
      \param storage_format how should the data be stored in memory,
      balancing precision with RAM requirements
      \param genotypes_gz whether the input genotype data are gzipped on disk
      \param legends_gz whether the input SNP metadata are gzipped on disk
     */
    void read_data_from_file(const std::string          &filestem,
			     imputed_data_file_format    file_format,
			     IMPUTED_DATA_STORAGE_FORMAT storage_format,
			     bool                        genotypes_gz = true,
			     bool                        legends_gz = false);
    //! write imputed data to file from memory in compressed format
    /*!
      \param filestem prefix of files to be written
      \param file_format description of desired file format, from a list of
      possible accepted formats
      \param genotypes_gz whether the output genotype data should be gzipped
      when written
      \param legends_gz whether the output SNP metadata should be gzipped
      when written
     */
    void write_data_to_file(const std::string           &filestem,
			    IMPUTED_DATA_FILE_FORMAT     file_format,
			    bool                         genotypes_gz = true,
			    bool                         legends_gz = false)
      const;
    //! write imputed data to file from memory in compressed format
    /*!
      \param filestem prefix of files to be written
      \param file_format description of desired file format,
      from a list of possible accepted formats
      \param genotypes_gz whether the output genotype data should be 
      gzipped when written
      \param legends_gz whether the output SNP metadata should be gzipped 
      when written
     */
    void write_data_to_file(const std::string           &filestem,
			    imputed_data_file_format     file_format,
			    bool                         genotypes_gz = true,
			    bool                         legends_gz = false)
      const;
    //! get the format of the currently stored/compressed data
    inline IMPUTED_DATA_STORAGE_FORMAT get_stored_data_format() const {
      return _storage_format;
    }
    //! get the decimal precision of stored dosage or probability data
    inline unsigned get_decimal_precision() const {
      return _decimal_precision;
    }
    //! get the threshold for calling a posterior probability a genotype
    inline double get_posterior_call_threshold() const {
      return _posterior_call_threshold;
    }
    //! clear all internally stored data, resetting object 
    //! to a newly-constructed state
    void clear();
    //! get the orientation of the stored genotypes (may differ from 
    //! input format)
    //! \return enum describing orientation of stored genotypes
    inline TYPES_ORIENTATION types_orientation() const {
      return _orientation;
    }
    //! determine whether the stored genotypes are snp major oriented
    //! \return whether the stored genotypes are snp major oriented
    inline bool is_snp_major() const {
      return types_orientation() == SNP_MAJOR;
    }
    //! determine whether the stored genotypes are individual major oriented
    //! \return whether the stored genotypes are individual major oriented
    inline bool is_individual_major() const {
      return types_orientation() == INDIVIDUAL_MAJOR;
    }
    //! get the number of rows in the genotype matrix
    //! \return the number of rows in the genotype matrix
    inline unsigned genotype_row_dimension() const {
      return _genotypes_row_dimension;
    }
    //! get the number of columns in the genotype matrix
    //! \return the number of columns in the genotype matrix
    //! \warning column count is the number of types, NOT
    //! the number of actual doubles stored
    //! \sa values_per_genotype()
    inline unsigned genotype_column_dimension() const {
      return _genotypes_column_dimension;
    }
    //! get the number of doubles required to represent a genotype
    //! \return the number of doubles required to store a genotype
    inline unsigned values_per_genotype() const {
      return _storage_format == PROBABILITIES ? 2 : 1;
    }
    //! get approximate number of bytes used by current object
    //! \return approximate number of bytes used by current object
    inline unsigned bytes_of_compressed_memory() const {
      return (_handler.get() ? _handler->bytes_of_compressed_memory() : 0);
    }
  private:
    //! set the dimensions of the genotype matrix, with metadata removed
    //! \param rowcount number of rows
    //! \param colcount number of columns
    //! \warning column count should be the number of types, NOT
    //! the number of actual doubles stored
    inline void genotype_dimensions(unsigned rowcount, unsigned colcount) {
      _genotypes_row_dimension = rowcount;
      _genotypes_column_dimension = colcount;
    }
    //! set the orientation of the stored genotypes
    //! \param orientation the new orientation of the stored genotypes
    inline void types_orientation(TYPES_ORIENTATION orientation) {
      _orientation = orientation;
    }
    //! initialize the compression handler, providing it with
    //! the number of bits required to store a single value
    inline void initialize_handler(unsigned nbits) {
      if (_handler.get())
	throw std::domain_error("genezip_dataset::initialize_handler: "
				"called on existing dataset.  Are you trying "
				"to store multiple files in the same dataset? "
				"This is not currently supported; give each "
				"set of files its own object");
      genezip_utils::compression_handler *handler = 
	new genezip_utils::compression_handler(25, nbits);
      _handler.set(handler);
    }
    //! add a vector of processed genotypes to the compressed data
    //! \param preproc_data a vector of preprocessed genotypes (as unsigneds)
    inline void enqueue_data(const std::vector<unsigned> &preproc_data) {
      if (!_handler.get())
	throw std::domain_error("genezip_dataset::enqueue_data: "
				"must first initialize compression handler "
				"with storage format bits required!");
      _handler->enqueue_data(preproc_data);
    }
    //! inform the compressed data that no further data will come
    inline void finalize_compression() {
      if (!_handler.get())
	throw std::domain_error("genezip_dataset::finalize_compression: "
				"must first initialize compression handler "
				"with storage format bits required!");
      _handler->finalize();
    }
    //! find a code within the bit width specified by the decimal precision
    //! that is above the maximum number of valid values to be stored;
    //! this will indicate a missing value
    unsigned get_valid_error_code() const;
    //! read a list of genotypes in a particular format from a stringstream
    //! \param strm1 stream where all remaining values are formatted genotypes
    //! \param format description of format of genotypes
    //! \param colcount for the first call, this will store the number
    //! of columns in the current line; for all later calls, this will
    //! store the current estimate of the number of columns in the file,
    //! so the number of observed columns in the current call can be
    //! checked against the dimensions of the rest of the file
    void handle_genotypes_from_file(std::istringstream      &strm1,
				    imputed_data_file_format format,
				    unsigned                &colcount);
    //! process a genotype based on this object's format specification
    //! \param u count of allele 1 of a given marker for a given sample
    //! \param target vector to which result will be added
    void process_genotype(unsigned u,
			  std::vector<unsigned> &target) const;
    //! process a dosage based on this object's format specification
    //! \param d dosage of first allele
    //! \param target vector to which result will be added
    void process_dosage(const double &d,
			std::vector<unsigned> &target) const;
    //! process a posterior probability based on this object's format spec
    //! \param d1 first probability of posterior [P(h-A1)]
    //! \param d2 second probability of posterior [P(het)]
    //! \param target vector to which result will be added
    void process_posterior(const double &d1,
			   const double &d2,
			   std::vector<unsigned> &target) const;
    //! wrap the compression particulars
    genezip_utils::smart_pointer<genezip_utils::compression_handler> _handler;
    //! possibly the stored individual information, if provided
    std::vector<genezip_utils::smart_pointer<genezip_utils::individual> >
      _individuals;
    //! possibly the stored snp information, if provided
    std::vector<genezip_utils::smart_pointer<genezip_utils::snp> > _snps;
    //! format description of current compressed dataset
    IMPUTED_DATA_STORAGE_FORMAT _storage_format;
    //! orientation (SNP-major, indiv-major) of stored data
    TYPES_ORIENTATION _orientation;
    //! precision to maintain in compressed data
    unsigned _decimal_precision;
    //! threshold for calling a posterior entry as a called genotype
    double _posterior_call_threshold;
    //! temporary placeholder for unspooler object
    imputed_vector _temporary_object;
    //! counter for automated walk through entire dataset, 0 is NA
    unsigned _getline_current_line;
    //! number of rows of genotype matrix
    unsigned _genotypes_row_dimension;
    //! number of genotypes per genotype matrix row (NOT column number)
    unsigned _genotypes_column_dimension;
  };
}
#endif //__GENEZIP__GENEZIP_DATASET_H__
