/*
  Copyright 2013 Cameron Palmer
  
  This file is a part of Genezip.
  
  Genezip is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  
  Genezip is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  
  You should have received a copy of the GNU General Public License
  along with Genezip.  If not, see <http://www.gnu.org/licenses/>
*/

/*!
  \file imputed_data_file_format.h
  
  This file contains the class definition for imputed_data_file_format,
  in the genezip namespace. Instances of this class may fully define a set
  of sample, SNP metadata, and genotype files that make up an input imputed
  data format. No specific file formats are defined in this file.
 */

#ifndef __GENEZIP__IMPUTED_DATA_FILE_FORMAT_H__
#define __GENEZIP__IMPUTED_DATA_FILE_FORMAT_H__

#include <string>
#include <vector>
#include <sstream>
#include <utility>
#include <stdexcept>
#include "genezip/file_io_handler.h"
#include "genezip/helper_functions.h"

namespace genezip {
  //! whether each row of the genotype data holds a snp or a sample
  enum TYPES_ORIENTATION {
    SNP_MAJOR = 0, //!< rows are snps; individuals run across the columns
    INDIVIDUAL_MAJOR = 1, //!< rows are individuals; snps run across the columns
    UNKNOWN_TYPE_ORIENTATION = 2 //!< format code is missing or was never set
  };
  //! which input file the allele annotations are read from
  enum ALLELE_SOURCE_FILE {
    SNP_FILE = 0, //!< the snp file carries the alleles
    TYPES_FILE = 1, //!< the types file carries the alleles
    UNKNOWN_ALLELE_SOURCE = 2 //!< format code is missing or was never set
  };
  //! how genotypes are written in the input data
  enum IMPUTED_DATA_INPUT_REPRESENTATION {
    GENOTYPES_AS_LETTERS = 0, //!< allele pairs, i.e. PLINK ped file format
    GENOTYPES_AS_NUMBERS = 1, //!< allele pairs written with 1234 allele codes
    GENOTYPES_PHASED = 2, //!< allele pairs in which the order matters
    DOSAGES_FIRSTALLELE = 3, //!< decimal count of the first allele
    DOSAGES_BOTHALLELES = 4, //!< decimal count of first allele, then second
    PROBABILITIES_TWOPOSTERIOR = 5, //!< P(homozygote-A1) P(het)
    PROBABILITIES_FULLPOSTERIOR = 6, //!< P(hom-A1) P(het) P(hom-A2)
    UNKNOWN_DATA_REPRESENTATION = 7 //!< format code is missing or was never set
  };
  /*!
    \class imputed_data_file_format
    
    A class containing sufficient flags and column markers to fully specify
    an input imputed data file format.

    A default-constructed object is in the state produced by set_defaults();
    the set_as_*() members and the individual setters then change that state.

    Copy construction, copy assignment and destruction are deliberately left
    to the compiler. Every data member is a value type (std::string, bool,
    unsigned, enum, std::vector<unsigned>), so the implicit memberwise
    operations are correct and cannot fall out of sync when a member is added.

    Column accessors use 0 as the 'DNE' value mentioned in their warnings
    (presumably "does not exist", i.e. no such column -- confirm against
    the definition of set_defaults()).
   */
  class imputed_data_file_format {
  public:
    //! default constructor: nullifies input data to default state
    //! by delegating to set_defaults()
    imputed_data_file_format() {
      set_defaults();
    }
    // NOTE: no user-provided copy constructor, copy assignment operator or
    // destructor. The implicit ones perform the same memberwise copy the
    // previous hand-written copy constructor did, without the risk of a
    // forgotten member and without the deprecated "user-provided copy
    // constructor + implicit operator=" combination.

    //! set all input data to default, 'unset' state
    void set_defaults();



    // ///////////////////////standard initializers//////////////////////////
    //! set the internal state for that of IMPUTE v2 .gen/.sample files
    void set_as_impute_v2();
    //! set the internal state for that of MACH mlgeno/mlinfo files
    void set_as_mach_mlgeno();
    //! set the internal state for that of MACH mldose/mlinfo files
    void set_as_mach_mldose();
    //! set the internal state for that of MACH mlprob/mlinfo files
    void set_as_mach_mlprob();
    //! set the internal state for that of BEAGLE .phased/.r2 files
    void set_as_beagle_phased();
    //! set the internal state for that of BEAGLE .dose/.r2 files
    void set_as_beagle_dose();
    //! set the internal state for that of BEAGLE .gprobs/.r2 files
    void set_as_beagle_gprobs();
    //! set the internal state for that of PLINK .ped/.map (nonimputed) files
    void set_as_plink_ped();

    // ///////////////////////accessors//////////////////////////////////////
    //! get the name of this file format
    //! \return the name of this file format
    inline const std::string &format_identifier() const {
      return _format_identifier;
    }
    //! set the name of this file format
    //! \param s new format identifier
    inline void format_identifier(const std::string &s) {
      _format_identifier = s;
    }
    //! get the standard suffix for this format's sample file
    //! \return the standard suffix for this format's sample file
    inline const std::string &sample_file_suffix() const {
      return _sample_file_suffix;
    }
    //! set the standard suffix for this format's sample file
    //! \param s the standard suffix for this format's sample file
    inline void sample_file_suffix(const std::string &s) {
      _sample_file_suffix = s;
    }
    //! get the standard suffix for this format's snp file
    //! \return the standard suffix for this format's snp file
    inline const std::string &snp_file_suffix() const {
      return _snp_file_suffix;
    }
    //! set the standard suffix for this format's snp file
    //! \param s the standard suffix for this format's snp file
    inline void snp_file_suffix(const std::string &s) {
      _snp_file_suffix = s;
    }
    //! get the standard suffix for this format's types file
    //! \return the standard suffix for this format's types file
    inline const std::string &types_file_suffix() const {
      return _types_file_suffix;
    }
    //! set the standard suffix for this format's types file
    //! \param s the standard suffix for this format's types file
    inline void types_file_suffix(const std::string &s) {
      _types_file_suffix = s;
    }
    //! get whether the format has a sample file specified with it
    //! \return whether the format specifies a sample file
    inline bool has_sample_file() const {return _has_sample_file;}
    //! set whether the format has a sample file specified with it
    //! \param b whether the format should have a sample file specified with it
    inline void has_sample_file(bool b) {_has_sample_file = b;}
    //! get whether the sample file contains the imputed genotypes
    //! \return whether the sample file contains imputed genotypes
    inline bool sample_file_has_types() const {
      return _sample_file_has_types;
    }
    //! set whether the sample file contains the imputed genotypes
    //! \param b whether the sample file contains the imputed genotypes
    inline void sample_file_has_types(bool b) {
      _sample_file_has_types = b;
    }
    //! get whether the sample file contains a dedicated header row
    //! \return whether the sample file has a dedicated header row
    inline bool sample_file_has_header() const {
      return _sample_file_has_header;
    }
    //! set whether the sample file contains a dedicated header row
    //! \param b whether the sample file should have a dedicated header row
    inline void sample_file_has_header(bool b) {
      _sample_file_has_header = b;
    }
    //! get whether the sample file contains a subheader row
    //! \return whether the sample file has a subheader row
    inline bool sample_file_has_subheader() const {
      return _sample_file_has_subheader;
    }
    //! set whether the sample file has a subheader row
    //! \param b whether the sample file should have a subheader row
    inline void sample_file_has_subheader(bool b) {
      _sample_file_has_subheader = b;
    }
    //! get whether the format has a snp metadata file specified with it
    //! \return whether the format specifies a snp metadata file
    inline bool has_snp_file() const {return _has_snp_file;}
    //! set whether the format has a snp metadata file specified with it
    //! \param b whether the format should have a snp metadata file
    inline void has_snp_file(bool b) {_has_snp_file = b;}
    //! get whether the snp metadata file contains the imputed genotypes
    //! \return whether the snp metadata file contains imputed genotypes
    inline bool snp_file_has_types() const {
      return _snp_file_has_types;
    }
    //! set whether the snp metadata file contains the imputed genotypes
    //! \param b whether the snp metadata file contains imputed genotypes
    inline void snp_file_has_types(bool b) {
      _snp_file_has_types = b;
    }
    //! get whether the snp metadata file has a header row
    //! \return whether the snp metadata file has a header row
    inline bool snp_file_has_header() const {
      return _snp_file_has_header;
    }
    //! set whether the snp metadata file has a header row
    //! \param b whether the snp metadata file should have a header row
    inline void snp_file_has_header(bool b) {
      _snp_file_has_header = b;
    }
    //! get whether the format has a dedicated imputed genotype file
    //! \return whether the format has a dedicated imputed genotype file
    inline bool has_types_file() const {return _has_types_file;}
    //! set whether the format has a dedicated imputed genotype file
    //! \param b whether the format should have a dedicated imputed type file
    inline void has_types_file(bool b) {_has_types_file = b;}
    //! get whether the type file has a header row
    //! \return whether the type file has a header row
    inline bool types_file_has_header() const {
      return _types_file_has_header;
    }
    //! set whether the type file has a header row
    //! \param b whether the type file should have a header row
    inline void types_file_has_header(bool b) {
      _types_file_has_header = b;
    }
    //! get the column of the snp metadata file containing the rsid
    //! \return the column of the snp metadata file with the rsid
    //! \warning 0 return value should be DNE
    inline unsigned snp_file_rsid_col() const {
      return _snp_file_rsid_col;
    }
    //! set the column of the snp metadata file containing the rsid
    //! \param u the column of the snp metadata file containing the rsid
    //! \warning 0 parameter value should be DNE
    inline void snp_file_rsid_col(unsigned u) {
      _snp_file_rsid_col = u;
    }
    //! get the column of the snp metadata file containing the chromosome
    //! \return the column of the snp metadata file with the chromosome
    //! \warning 0 return value should be DNE
    inline unsigned snp_file_chromosome_col() const {
      return _snp_file_chromosome_col;
    }
    //! set the column of the snp metadata file containing the chromosome
    //! \param u the column of the snp metadata file containing the chromosome
    //! \warning 0 parameter value should be DNE
    inline void snp_file_chromosome_col(unsigned u) {
      _snp_file_chromosome_col = u;
    }
    //! get the column of the snp metadata file with physical position
    //! \return the column of the snp metadata file with physical position
    //! \warning 0 return value should be DNE
    inline unsigned snp_file_position_col() const {
      return _snp_file_position_col;
    }
    //! set the column of the snp metadata file with physical position
    //! \param u the column of the snp metadata file with physical position
    //! \warning 0 parameter value should be DNE
    inline void snp_file_position_col(unsigned u) {
      _snp_file_position_col = u;
    }
    //! get the column of the snp metadata file with genetic position
    //! \return the column of the snp metadata file with genetic position
    //! \warning 0 return value should be DNE
    inline unsigned snp_file_gposition_col() const {
      return _snp_file_gposition_col;
    }
    //! set the column of the snp metadata file with genetic position
    //! \param u the column of the snp metadata file with genetic position
    //! \warning 0 parameter value should be DNE
    inline void snp_file_gposition_col(unsigned u) {
      _snp_file_gposition_col = u;
    }
    //! get the column of the snp metadata file with per-SNP missingness
    //! \return the column of the snp metadata file with per-SNP missingness
    //! \warning 0 return value should be DNE
    inline unsigned snp_file_missing_col() const {
      return _snp_file_missing_col;
    }
    //! set the column of the snp metadata file with per-SNP missingness
    //! \param u the column of the snp metadata file with per-SNP missingness
    //! \warning 0 parameter value should be DNE
    inline void snp_file_missing_col(unsigned u) {
      _snp_file_missing_col = u;
    }
    //! get the column of the A1 frequency in the snp metadata file
    //! \return the column of the A1 frequency in the snp metadata file
    //! \warning 0 return value should be DNE
    inline unsigned snp_file_frequency_col() const {
      return _snp_file_frequency_col;
    }
    //! set the column of the A1 frequency in the snp metadata file
    //! \param u the column of the A1 frequency in the snp metadata file
    //! \warning 0 parameter value should be DNE
    inline void snp_file_frequency_col(unsigned u) {
      _snp_file_frequency_col = u;
    }
    //! get the column of the snp quality metric in the snp metadata file
    //! \return the column of the snp quality metric in the snp metadata file
    //! \warning 0 return value should be DNE
    inline unsigned snp_file_quality_col() const {
      return _snp_file_quality_col;
    }
    //! set the column of the snp quality metric in the snp metadata file
    //! \param u the column of the snp quality metric in the snp metadata file
    //! \warning 0 parameter value should be DNE
    inline void snp_file_quality_col(unsigned u) {
      _snp_file_quality_col = u;
    }
    //! get the column where genotypes begin in the snp metadata file
    //! \return the column at which genotypes start in the snp metadata file
    //! \warning 0 return value should be DNE
    inline unsigned snp_file_start_types_col() const {
      return _snp_file_start_types_col;
    }
    //! set the column where genotypes begin in the snp metadata file
    //! \param u the column at which types start in the snp metadata file
    //! \warning 0 parameter value should be DNE
    inline void snp_file_start_types_col(unsigned u) {
      _snp_file_start_types_col = u;
    }
    //! get the column of the sample file with FID code
    //! \return the column of the sample file with FID code
    //! \warning 0 return value should be DNE
    inline unsigned sample_file_fid_col() const {
      return _sample_file_fid_col;
    }
    //! set the column of the sample file with FID code
    //! \param u the column of the sample file with FID code
    //! \warning 0 parameter value should be DNE
    inline void sample_file_fid_col(unsigned u) {
      _sample_file_fid_col = u;
    }
    //! get the column of the sample file with IID code
    //! \return the column of the sample file with IID code
    //! \warning 0 return value should be DNE
    inline unsigned sample_file_iid_col() const {
      return _sample_file_iid_col;
    }
    //! set the column of the sample file with IID code
    //! \param u the column of the sample file with IID code
    //! \warning 0 parameter value should be DNE
    inline void sample_file_iid_col(unsigned u) {
      _sample_file_iid_col = u;
    }
    //! get the column of the sample file with PAT code
    //! \return the column of the sample file with PAT code
    //! \warning 0 return value should be DNE
    inline unsigned sample_file_pat_col() const {
      return _sample_file_pat_col;
    }
    //! set the column of the sample file with PAT code
    //! \param u the column of the sample file with PAT code
    //! \warning 0 parameter value should be DNE
    inline void sample_file_pat_col(unsigned u) {
      _sample_file_pat_col = u;
    }
    //! get the column of the sample file with MAT code
    //! \return the column of the sample file with MAT code
    //! \warning 0 return value should be DNE
    inline unsigned sample_file_mat_col() const {
      return _sample_file_mat_col;
    }
    //! set the column of the sample file with MAT code
    //! \param u the column of the sample file with MAT code
    //! \warning 0 parameter value should be DNE
    inline void sample_file_mat_col(unsigned u) {
      _sample_file_mat_col = u;
    }
    //! get the column of the sample file with SEX code
    //! \return the column of the sample file with SEX code
    //! \warning 0 return value should be DNE
    inline unsigned sample_file_sex_col() const {
      return _sample_file_sex_col;
    }
    //! set the column of the sample file with SEX code
    //! \param u the column of the sample file with SEX code
    //! \warning 0 parameter value should be DNE
    inline void sample_file_sex_col(unsigned u) {
      _sample_file_sex_col = u;
    }
    //! get the column of the sample file with PHENO code
    //! \return the column of the sample file with PHENO code
    //! \warning 0 return value should be DNE
    inline unsigned sample_file_pheno_col() const {
      return _sample_file_pheno_col;
    }
    //! set the column of the sample file with PHENO code
    //! \param u the column of the sample file with PHENO code
    //! \warning 0 parameter value should be DNE
    inline void sample_file_pheno_col(unsigned u) {
      _sample_file_pheno_col = u;
    }
    //! get the column of the sample file with per-sample missingness
    //! \return the column of the sample file with per-sample missingness
    //! \warning 0 return value should be DNE
    inline unsigned sample_file_missing_col() const {
      return _sample_file_missing_col;
    }
    //! set the column of the sample file with per-sample missingness
    //! \param u the column of the sample file with per-sample missingness
    //! \warning 0 parameter value should be DNE
    inline void sample_file_missing_col(unsigned u) {
      _sample_file_missing_col = u;
    }
    //! get the column of the sample file where genotypes start
    //! \return the column of the sample file where genotypes start
    //! \warning 0 return value should be DNE
    inline unsigned sample_file_start_types_col() const {
      return _sample_file_start_types_col;
    }
    //! set the column of the sample file where genotypes start
    //! \param u the column of the sample file where genotypes start
    //! \warning 0 parameter value should be DNE
    inline void sample_file_start_types_col(unsigned u) {
      _sample_file_start_types_col = u;
    }
    //! get the orientation of the dedicated types file
    //! \return the orientation of the dedicated types file
    //! \warning 'unset' for this enum is UNKNOWN_TYPE_ORIENTATION
    inline TYPES_ORIENTATION types_file_orientation() const {
      return _types_file_orientation;
    }
    //! set the orientation of the dedicated types file
    //! \param t the desired file orientation
    //! \warning 'unset' for this enum is UNKNOWN_TYPE_ORIENTATION
    inline void types_file_orientation(TYPES_ORIENTATION t) {
      _types_file_orientation = t;
    }
    //! get the first column of types in the types file
    //! \return the first column of types in the types file
    //! \warning 0 return value should be DNE
    inline unsigned types_file_start_types_col() const {
      return _types_file_start_types_col;
    }
    //! set the first column of types in the types file
    //! \param u the first column of types in the types file
    //! \warning 0 parameter value should be DNE
    inline void types_file_start_types_col(unsigned u) {
      _types_file_start_types_col = u;
    }
    //! get the genotype representation format for this file format
    //! \return the genotype representation format for this file format
    //! \warning 'unset' for this enum is UNKNOWN_DATA_REPRESENTATION
    inline IMPUTED_DATA_INPUT_REPRESENTATION types_format() const {
      return _types_format;
    }
    //! set the genotype representation format for this file format
    //! \param t the desired genotype representation format
    //! \warning 'unset' for this enum is UNKNOWN_DATA_REPRESENTATION
    inline void types_format(IMPUTED_DATA_INPUT_REPRESENTATION t) {
      _types_format = t;
    }
    //! get the column of the ith allele
    //! \param i index of desired allele, on [1, numalleles]
    //! \return the column of the ith allele
    //! \warning 0 return value should be DNE
    unsigned allele_col(unsigned i) const;
    //! set the column of the ith allele
    //! \param i index of desired allele, on [1, numalleles]
    //! \param v column of ith allele
    //! \warning 0 parameter value should be DNE
    void allele_col(unsigned i, unsigned v);
    //! get the type of file in which the allele annotations are found
    //! \return the type of file in which the allele annotations are found
    //! \warning 'unset' for this enum is UNKNOWN_ALLELE_SOURCE
    inline ALLELE_SOURCE_FILE allele_source() const {
      return _allele_source;
    }
    //! set the type of file in which the allele annotation are found
    //! \param f the type of file in which the allele annotations are found
    //! \warning 'unset' for this enum is UNKNOWN_ALLELE_SOURCE
    inline void allele_source(ALLELE_SOURCE_FILE f) {
      _allele_source = f;
    }
    //! clear out any stored allele annotations within current object:
    //! empties the allele column list and resets the allele source to
    //! UNKNOWN_ALLELE_SOURCE
    inline void clear_stored_alleles() {
      _allele_cols.clear();
      _allele_source = UNKNOWN_ALLELE_SOURCE;
    }
    //! report the settings of the format to a stream
    //! \param out output stream for report
    void report(std::ostream &out) const;
  private:
    std::string _format_identifier; //!< name of this file format
    std::string _sample_file_suffix; //!< standard suffix for sample file
    std::string _snp_file_suffix; //!< standard suffix for snp file
    std::string _types_file_suffix; //!< standard suffix for types file
    
    bool _has_sample_file; //!< whether the format has a sample file
    bool _sample_file_has_types; //!< whether the sample file has genotypes
    bool _sample_file_has_header; //!< whether the sample file has a header row
    bool _sample_file_has_subheader; //!< the sample file has a 2nd header row
    bool _has_snp_file; //!< whether the format has a snp metadata file
    bool _snp_file_has_types; //!< whether the snp file has genotypes
    bool _snp_file_has_header; //!< whether the snp file has a header row
    bool _has_types_file; //!< whether the format has a genotype file
    bool _types_file_has_header; //!< whether the types file has a header row
    
    unsigned _snp_file_rsid_col; //!< column of RSID in snp metadata file
    unsigned _snp_file_chromosome_col; //!< column of CHR in snp metadata file
    unsigned _snp_file_position_col; //!< column of phys. pos. in snp file
    unsigned _snp_file_gposition_col; //!< column of genet. pos in snp file
    unsigned _snp_file_missing_col; //!< column of missingness in snp file
    unsigned _snp_file_frequency_col; //!< column of A1 frequency in snp file
    unsigned _snp_file_quality_col; //!< column of call quality in snp file
    unsigned _snp_file_start_types_col; //!< first column for types in snp file

    unsigned _sample_file_fid_col;//!< column of FID in sample file
    unsigned _sample_file_iid_col;//!< column of IID in sample file
    unsigned _sample_file_pat_col;//!< column of PAT ID in sample file
    unsigned _sample_file_mat_col;//!< column of MAT ID in sample file
    unsigned _sample_file_sex_col;//!< column of SEX code in sample file
    unsigned _sample_file_pheno_col;//!< column of PHENO code in sample file
    unsigned _sample_file_missing_col;//!< column w/missingness in sample file
    unsigned _sample_file_start_types_col;//!< first col w/types in sample file

    TYPES_ORIENTATION _types_file_orientation;//!< orientation of types file
    unsigned _types_file_start_types_col;//!< first col for types in types file
    IMPUTED_DATA_INPUT_REPRESENTATION _types_format;//!< genotype format

    std::vector<unsigned> _allele_cols; //!< allele cols in snp file
    ALLELE_SOURCE_FILE _allele_source;//!< source of allele annotations
  };
  //! parse a configuration file into a file format specification
  //! \param filename name of the configuration file to parse
  //! \return the file format specification described by the file
  imputed_data_file_format 
    parse_imputed_data_format_from_file(const std::string &filename);
}

#endif //__GENEZIP__IMPUTED_DATA_FILE_FORMAT_H__
