/*
Copyright 2013 Cameron Palmer

This file is a part of Genezip.

Genezip is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Genezip is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Genezip.  If not, see <http://www.gnu.org/licenses/>
*/

/*!
  \file genezip_dataset.cc
 */

#include "genezip/genezip_dataset.h"

/*!
  \brief convert a file format enumerator into a configured format descriptor
  \param f enumerated input file format
  \return a descriptor configured through the matching set_as_* call
  \throws std::domain_error if f is UNKNOWN_FILE_FORMAT, or if f is any
  other value that this function has no branch for
 */
genezip::imputed_data_file_format
genezip::disambiguate_file_format(IMPUTED_DATA_FILE_FORMAT f) {
  imputed_data_file_format format;
  if (f == IMPUTE2) {
    format.set_as_impute_v2();
  } else if (f == MACH_MLGENO) {
    format.set_as_mach_mlgeno();
  } else if (f == MACH_MLDOSE) {
    format.set_as_mach_mldose();
  } else if (f == MACH_MLPROB) {
    format.set_as_mach_mlprob();
  } else if (f == BEAGLE_PHASED) {
    format.set_as_beagle_phased();
  } else if (f == BEAGLE_DOSE) {
    format.set_as_beagle_dose();
  } else if (f == BEAGLE_GPROBS) {
    format.set_as_beagle_gprobs();
  } else if (f == PLINK_PED) {
    format.set_as_plink_ped();
  } else if (f == UNKNOWN_FILE_FORMAT) {
    //the caller never chose a format: there is nothing to convert
    throw std::domain_error("disambiguate_file_format: format unset "
                            "and cannot be converted into a class");
  } else {
    //a real enumerator that has no branch above
    throw std::domain_error("disambiguate_file_format: format "
                            "not handled by method");
  }
  return format;
}

/*!
  \brief fetch one stored vector from the compression handler and place
  its values in _temporary_object
  \param index index of the vector to request from the compression handler
  \throws std::domain_error if no compression handler is initialized, or if
  the dataset is neither snp-major nor individual-major

  The handler is asked for genotype_column_dimension() *
  values_per_genotype() entries. For DOSAGE and PROBABILITIES storage each
  entry is divided by 10^(decimal precision - 1); for every other storage
  format the entry is copied unchanged.
 */
void genezip::dataset::unspool(unsigned index) {
  if (!_handler.get())
    throw std::domain_error("genezip_dataset::unspool: called on dataset "
                            "without an initialized compression handler; "
                            "have you read data yet?");
  if (!is_snp_major() && !is_individual_major())
    throw std::domain_error("genezip_dataset::unspool: called on dataset "
                            "without a recognized file orientation; "
                            "have you read data yet?");
  //call the compression handler to deal with this business
  std::vector<unsigned> temp;
  _handler->get_vector_of_data(index,
                               genotype_column_dimension() *
                               values_per_genotype(),
                               temp);

  //neither the storage format nor the divisor changes between elements,
  //so both are determined once instead of once per value
  const bool rescale = (get_stored_data_format() == DOSAGE ||
                        get_stored_data_format() == PROBABILITIES);
  double divisor = 1.0;
  if (rescale)
    divisor = pow(10, get_decimal_precision() - 1);

  _temporary_object.resize(temp.size());
  for (unsigned i = 0; i < temp.size(); ++i) {
    if (rescale) {
      _temporary_object.at(i) = static_cast<double>(temp.at(i)) / divisor;
    } else {
      _temporary_object.at(i) = temp.at(i);
    }
  }
}

/*!
  \brief discard all loaded content and restore the default settings

  The compression handler pointer is replaced with a default-constructed
  smart pointer, the individual and snp containers and the temporary
  buffer are emptied, storage format and orientation become unknown,
  decimal precision is set to 4, the posterior call threshold to 0.9,
  and the line/dimension counters to zero.
 */
void genezip::dataset::clear() {
  //the handler is not cleared in place; the pointer itself is overwritten
  //if (_handler.get()) _handler->clear();
  _handler =
    genezip_utils::smart_pointer<genezip_utils::compression_handler>();

  //stored metadata and scratch space
  _individuals.clear();
  _snps.clear();
  _temporary_object.clear();

  //configuration defaults
  _storage_format = UNKNOWN_STORAGE_FORMAT;
  _orientation = UNKNOWN_TYPE_ORIENTATION;
  _decimal_precision = 4;
  _posterior_call_threshold = 0.9;

  //position and dimension counters
  _genotypes_column_dimension = 0;
  _genotypes_row_dimension = 0;
  _getline_current_line = 0;
}

void genezip::dataset::read_data_from_file(const std::string &filestem,
					   IMPUTED_DATA_FILE_FORMAT file_format,
					   IMPUTED_DATA_STORAGE_FORMAT storage_format,
					   bool genotypes_gz,
					   bool legends_gz) {
  clear();
  _storage_format = storage_format;
  read_data_from_file(filestem,
		      disambiguate_file_format(file_format),
		      storage_format,
		      genotypes_gz,
		      legends_gz);
}

void genezip::dataset::read_data_from_file(const std::string &filestem,
					   imputed_data_file_format
					   file_format,
					   IMPUTED_DATA_STORAGE_FORMAT
					   storage_format,
					   bool genotypes_gz,
					   bool legends_gz) {
  clear();
  _storage_format = storage_format;
  _orientation = file_format.sample_file_has_types() ? INDIVIDUAL_MAJOR :
    (file_format.snp_file_has_types() ? SNP_MAJOR 
     : file_format.types_file_orientation());

  //inform the compression handler that genotypes are forthcoming;
  //give it the number of bits required to store packed genotypes
  //for preprocessing
  //for genotypes, 2 is enough
  unsigned n_bits_required_preprocessed = 2;
  //for dosages, ceiling(log_2(2000)) = 11
  if (_storage_format == DOSAGE)
    n_bits_required_preprocessed = 11;
  //for probabilities, ceiling(log_2(1000)) = 10
  //(per value; then two will be added per type)
  else if (_storage_format == PROBABILITIES)
    n_bits_required_preprocessed = 10;
  initialize_handler(n_bits_required_preprocessed);

  std::string line = "", catcher = "", value = "";
  genezip_utils::file_io_handler input;
  unsigned line_count = 0, col_count = 1;

  unsigned data_matrix_row_count = 0;
  unsigned data_matrix_col_count = 0;
  //this process is somewhat complicated by the need to error check
  //let's start with sample file, if present
  if (file_format.has_sample_file()) {
    input.open(filestem + file_format.sample_file_suffix() +
	       (genotypes_gz && file_format.sample_file_has_types() ?
		".gz" : ""),
	       genotypes_gz && file_format.sample_file_has_types() ?
	       genezip_utils::file_io_handler::READGZ : 
	       genezip_utils::file_io_handler::READ);
    //enqueue all applicable columns from file format
    
    while (input.readline(line)) {
      if (file_format.sample_file_has_header() && !line_count) {
	++line_count;
	continue;
      }
      if (file_format.sample_file_has_subheader() && line_count == 1) {
	++line_count;
	continue;
      }

      std::istringstream strm1(line);
      
      genezip_utils::smart_pointer<genezip_utils::individual> ptr
	(new genezip_utils::individual);

      while ((!file_format.sample_file_has_types() ||
	      file_format.sample_file_start_types_col() > col_count) &&
	     strm1 >> catcher) {
	if (file_format.sample_file_fid_col() == col_count) {
	  ptr->fid(catcher);
	} else if (file_format.sample_file_iid_col() == col_count) {
	  ptr->iid(catcher);
	} else if (file_format.sample_file_pat_col() == col_count) {
	  ptr->pat(catcher);
	} else if (file_format.sample_file_mat_col() == col_count) {
	  ptr->mat(catcher);
	} else if (file_format.sample_file_sex_col() == col_count) {
	  ptr->sex(catcher);
	} else if (file_format.sample_file_pheno_col() == col_count) {
	  ptr->phenotype(catcher);
	} else if (file_format.sample_file_missing_col() == col_count) {
	  ptr->missingness(genezip_utils::from_string<double>(catcher));
	}
	++col_count;
      }
      //need to handle trailing genotype data, if present
      if (file_format.sample_file_has_types()) {
	handle_genotypes_from_file(strm1, file_format, data_matrix_col_count);
      }
      col_count = 1;
      ++line_count;
    }
    input.close();
  }
  if (file_format.sample_file_has_types()) {
    data_matrix_row_count = line_count;
    //adjust for possible headers
    if (file_format.sample_file_has_header()) --data_matrix_row_count;
    if (file_format.sample_file_has_subheader()) --data_matrix_row_count;
  }

  line_count = 0;
  //then the snps file, if present
  if (file_format.has_snp_file()) {
    input.open(filestem + file_format.snp_file_suffix() +
               (genotypes_gz && file_format.snp_file_has_types() ?
                ".gz" : ""),
               genotypes_gz && file_format.snp_file_has_types() ?
               genezip_utils::file_io_handler::READGZ :
               genezip_utils::file_io_handler::READ);
    while (input.readline(line)) {
      if (file_format.snp_file_has_header() && !line_count) {
	++line_count;
	continue;
      }

      std::istringstream strm1(line);
      
      genezip_utils::smart_pointer<genezip_utils::snp> ptr
	(new genezip_utils::snp);

      while ((!file_format.snp_file_has_types() ||
	      file_format.snp_file_start_types_col() > col_count) &&
	     strm1 >> catcher) {
	if (file_format.snp_file_rsid_col() == col_count) {
	  ptr->rsid(catcher);
	} else if (file_format.snp_file_chromosome_col() == col_count) {
	  ptr->chromosome(catcher);
	} else if (file_format.snp_file_position_col() == col_count) {
	  ptr->position(catcher);
	} else if (file_format.snp_file_gposition_col() == col_count) {
	  ptr->gposition(catcher);
	} else if (file_format.snp_file_missing_col() == col_count) {
	  ptr->missingness(genezip_utils::from_string<double>(catcher));
	} else if (file_format.snp_file_frequency_col() == col_count) {
	  ptr->frequency(genezip_utils::from_string<double>(catcher));
	} else if (file_format.snp_file_quality_col() == col_count) {
	  ptr->quality(genezip_utils::from_string<double>(catcher));
	}
	++col_count;
      }
      //need to handle trailing genotype data, if present
      if (file_format.snp_file_has_types()) {
	handle_genotypes_from_file(strm1, file_format, data_matrix_col_count);
      }
      col_count = 1;
      ++line_count;
    }
    input.close();    
  }
  if (file_format.snp_file_has_types()) {
    data_matrix_row_count = line_count;
    //handle possible header
    if (file_format.snp_file_has_header()) --data_matrix_row_count;
  }

  line_count = 0;
  //then the types file, if present
  if (file_format.has_types_file()) {
        input.open(filestem + file_format.types_file_suffix() +
		   (genotypes_gz ? ".gz" : ""),
		   genotypes_gz ?
		   genezip_utils::file_io_handler::READGZ :
		   genezip_utils::file_io_handler::READ);
    while (input.readline(line)) {
      if (file_format.types_file_has_header() && !line_count) {
	++line_count;
	continue;
      }

      std::istringstream strm1(line);

      while (file_format.types_file_start_types_col() > col_count &&
	     strm1 >> catcher) {
	++col_count;
      }
      
      //need to handle genotype data
      handle_genotypes_from_file(strm1, file_format, data_matrix_col_count);
      col_count = 1;
      ++line_count;
    }
    input.close();    
  }
  if (file_format.has_types_file()) {
    data_matrix_row_count = line_count;
    //handle possible header
    if (file_format.types_file_has_header()) --data_matrix_row_count;
  }

  //this dataset is done.  make sure the data are compressed
  std::cout << "finalizing compression" << std::endl;
  finalize_compression();
  genotype_dimensions(data_matrix_row_count, data_matrix_col_count);
}

void genezip::dataset::write_data_to_file(const std::string &filestem,
					  IMPUTED_DATA_FILE_FORMAT file_format,
					  bool genotypes_gz,
					  bool legends_gz)
  const {
  write_data_to_file(filestem,
		     disambiguate_file_format(file_format),
		     genotypes_gz,
		     legends_gz);
}

/*!
  \brief placeholder for descriptor-based output; not implemented
  \param filename unused
  \param file_format unused
  \param genotypes_gz unused
  \param legends_gz unused
  \throws std::domain_error always
 */
void genezip::dataset::write_data_to_file(const std::string &filename,
                                          imputed_data_file_format file_format,
                                          bool genotypes_gz,
                                          bool legends_gz) const {
  //none of the arguments is consulted until writing is implemented
  static_cast<void>(filename);
  static_cast<void>(file_format);
  static_cast<void>(genotypes_gz);
  static_cast<void>(legends_gz);
  throw std::domain_error("TODO: implement write_data_to_file");
}

/*!
  \brief parse the genotype portion of one input line and enqueue it
  \param strm1 stream positioned at the first genotype token of the line;
  read until extraction fails
  \param format descriptor whose types_format() selects how tokens are
  grouped into genotypes
  \param col_count in/out number of genotypes per line: if zero on entry
  it is set from this line, otherwise this line must contain exactly that
  many genotypes
  \throws std::domain_error if a genotype is only partially present, if
  the input representation is phased, unset or unhandled, or if the
  number of genotypes differs from col_count
 */
void genezip::dataset::handle_genotypes_from_file(std::istringstream &strm1,
                                                  imputed_data_file_format
                                                  format,
                                                  unsigned &col_count) {
  char allele1 = '0', allele2 = '0';
  double v1 = 0.0, v2 = 0.0, v3 = 0.0;
  std::vector<unsigned> preprocessed_data;
  //each pass consumes one genotype; a failed read of the FIRST token of
  //a genotype is the normal end of line, a failed read of a later token
  //is an error
  while (true) {
    if (format.types_format() == GENOTYPES_AS_LETTERS ||
        format.types_format() == GENOTYPES_AS_NUMBERS) {
      if (!(strm1 >> allele1)) break;
      if (!(strm1 >> allele2))
        //name the representation actually being parsed
        throw std::domain_error(std::string("genezip::dataset::"
                                            "handle_genotypes_from_file:"
                                            " in ") +
                                (format.types_format() ==
                                 GENOTYPES_AS_NUMBERS ?
                                 "GENOTYPES_AS_NUMBERS" :
                                 "GENOTYPES_AS_LETTERS") +
                                " format, genotype asymmetry");
      if (format.types_format() == GENOTYPES_AS_NUMBERS) {
        //convert the alleles
        allele1 = genezip_utils::allele_numbers_to_letters(allele1);
        allele2 = genezip_utils::allele_numbers_to_letters(allele2);
      }
      //need to report these alleles as metadata to snp vector
      //NOTE(review): the alleles are read but nothing is appended to
      //preprocessed_data in this branch -- confirm whether a call to
      //process_genotype is still to be added
    } else if (format.types_format() == GENOTYPES_PHASED) {
      throw std::domain_error("TODO: determine phased data storage format");
    } else if (format.types_format() == DOSAGES_FIRSTALLELE) {
      if (!(strm1 >> v1)) break;
      process_dosage(v1, preprocessed_data);
    } else if (format.types_format() == DOSAGES_BOTHALLELES) {
      if (!(strm1 >> v1)) break;
      if (!(strm1 >> v2))
        throw std::domain_error("genezip::dataset::handle_genotypes_from_file:"
                                " in DOSAGES_BOTHALLELES format, genotype"
                                " asymmetry");
      //only the first of the two dosages is stored
      process_dosage(v1, preprocessed_data);
    } else if (format.types_format() == PROBABILITIES_TWOPOSTERIOR) {
      if (!(strm1 >> v1)) break;
      if (!(strm1 >> v2))
        throw std::domain_error("genezip::dataset::handle_genotypes_from_file:"
                                " in PROBABILITIES_TWOPOSTERIOR format, "
                                "genotype asymmetry");
      process_posterior(v1, v2, preprocessed_data);
    } else if (format.types_format() == PROBABILITIES_FULLPOSTERIOR) {
      if (!(strm1 >> v1)) break;
      if (!(strm1 >> v2))
        throw std::domain_error("genezip::dataset::handle_genotypes_from_file:"
                                " in PROBABILITIES_FULLPOSTERIOR format, "
                                "genotype asymmetry (position 2)");
      if (!(strm1 >> v3))
        throw std::domain_error("genezip::dataset::handle_genotypes_from_file:"
                                " in PROBABILITIES_FULLPOSTERIOR format, "
                                "genotype asymmetry (position 3)");
      //the third probability is read only to advance the stream
      process_posterior(v1, v2, preprocessed_data);
    } else if (format.types_format() == UNKNOWN_DATA_REPRESENTATION) {
      throw std::domain_error("genezip::dataset::handle_genotypes_from_file: "
                              "this format has an unset input genotype "
                              "format");
    } else {
      throw std::domain_error("genezip::dataset::handle_genotypes_from_file: "
                              "unhandled input format!");
    }
  }
  enqueue_data(preprocessed_data);

  //number of genotypes found on this line
  const std::vector<unsigned>::size_type n_genotypes =
    preprocessed_data.size() / values_per_genotype();
  if (col_count) {
    if (col_count != n_genotypes)
      throw std::domain_error("genezip::dataset::handle_genotypes_from_file: "
                              "column count in input genotypes file does "
                              "not stay consistent between lines");
  } else {
    //first line seen: it defines the expected width
    col_count = static_cast<unsigned>(n_genotypes);
  }
}

/*!
  \brief convert one input genotype into its stored form and append it
  \param u input genotype value
  \param target vector receiving the stored value
  \throws std::domain_error if the storage format is PROBABILITIES,
  UNKNOWN_STORAGE_FORMAT or unhandled, or if the storage format is DOSAGE
  and the decimal precision is below 1

  THRESHOLDED_GENOTYPES and ROUNDED_GENOTYPES store u unchanged; DOSAGE
  stores u scaled by 10^(decimal precision - 1).
 */
void genezip::dataset::process_genotype(unsigned               u,
                                        std::vector<unsigned> &target) const {
  if (get_stored_data_format() == THRESHOLDED_GENOTYPES ||
      get_stored_data_format() == ROUNDED_GENOTYPES) {
    //both genotype storage modes keep the value as given
    target.push_back(u);
  } else if (get_stored_data_format() == DOSAGE) {
    //compatible, but silly
    if (get_decimal_precision() < 1)
      throw std::domain_error("genezip::dataset::process_genotype: "
                              "invalid decimal precision: "
                              + genezip_utils::to_string
                              <unsigned>(get_decimal_precision()));
    //round instead of truncating so an inexact pow() result cannot
    //produce an off-by-one value; same convention as process_dosage
    target.push_back(static_cast<unsigned>
                     (u * pow(10, get_decimal_precision() - 1) + 0.5));
  } else if (get_stored_data_format() == PROBABILITIES) {
    throw std::domain_error("genezip::dataset::process_genotype: "
                            "cannot deconvolute input genotypes "
                            "into posterior probabilities");
  } else if (get_stored_data_format() == UNKNOWN_STORAGE_FORMAT) {
    throw std::domain_error("genezip::dataset::process_genotype: "
                            "specified storage format 'unknown' "
                            "is not a valid option");
  } else {
    throw std::domain_error("genezip::dataset::process_genotype: "
                            "unhandled data storage format");
  }
}

/*!
  \brief convert one input dosage into its stored form and append it
  \param d input dosage
  \param target vector receiving the stored value
  \throws std::domain_error if the storage format is THRESHOLDED_GENOTYPES,
  PROBABILITIES, UNKNOWN_STORAGE_FORMAT or unhandled, or if the decimal
  precision is below 1 for ROUNDED_GENOTYPES or DOSAGE

  With s = 10^(decimal precision - 1):
  DOSAGE stores d * s rounded to the nearest integer;
  ROUNDED_GENOTYPES stores whichever of 0, s and 2s is nearest to d * s.
 */
void genezip::dataset::process_dosage(const double          &d,
                                      std::vector<unsigned> &target) const {
  if (get_stored_data_format() == THRESHOLDED_GENOTYPES) {
    throw std::domain_error("genezip::dataset::process_dosage: "
                            "THRESHOLDED_GENOTYPES storage method "
                            "requires posterior probabilities as "
                            "input data, not dosages");
  } else if (get_stored_data_format() == ROUNDED_GENOTYPES) {
    //TODO: allow user-specified rounding regimes?
    //because this one favors hets....
    if (get_decimal_precision() < 1)
      throw std::domain_error("genezip::dataset::process_dosage: "
                              "invalid decimal precision: "
                              + genezip_utils::to_string
                              <unsigned>(get_decimal_precision()));
    const double scale = pow(10, get_decimal_precision() - 1);
    const long het = static_cast<long>(scale + 0.5);
    const long hom = het * 2;
    //scaled dosage, truncated toward zero
    const long precval = static_cast<long>(d * scale);
    //distances are taken in signed arithmetic: subtracting an unsigned
    //scaled dosage from a smaller target wraps around to a huge value and
    //sends dosages between one and one and a half copies to two copies
    const long dist_zero = precval < 0 ? -precval : precval;
    const long dist_het = precval < het ? het - precval : precval - het;
    const long dist_hom = precval < hom ? hom - precval : precval - hom;
    //exact ties go to the homozygous value (a scaled dosage of s/2 is
    //stored as 0 and 3s/2 as 2s), as they did before this rewrite
    unsigned res = 0;
    if (dist_hom <= dist_het && dist_hom < dist_zero) {
      res = static_cast<unsigned>(hom);
    } else if (dist_het < dist_hom && dist_het < dist_zero) {
      res = static_cast<unsigned>(het);
    }
    target.push_back(res);
  } else if (get_stored_data_format() == DOSAGE) {
    if (get_decimal_precision() < 1)
      throw std::domain_error("genezip::dataset::process_dosage: "
                              "invalid decimal precision: "
                              + genezip_utils::to_string
                              <unsigned>(get_decimal_precision()));
    //+ 0.5 rounds to nearest before the truncating cast
    target.push_back(static_cast<unsigned>
                     (d * pow(10, get_decimal_precision() - 1) + 0.5));
  } else if (get_stored_data_format() == PROBABILITIES) {
    throw std::domain_error("genezip::dataset::process_dosage: "
                            "cannot deconvolute input dosages "
                            "into posterior probabilities");
  } else if (get_stored_data_format() == UNKNOWN_STORAGE_FORMAT) {
    throw std::domain_error("genezip::dataset::process_dosage: "
                            "specified storage format 'unknown' "
                            "is not a valid option");
  } else {
    throw std::domain_error("genezip::dataset::process_dosage: "
                            "unhandled data storage format");
  }
}

void genezip::dataset::process_posterior(const double          &d1,
					 const double          &d2,
					 std::vector<unsigned> &target) const {
  if (get_stored_data_format() == THRESHOLDED_GENOTYPES) {
    if (d1 >= get_posterior_call_threshold()) {
      target.push_back(2);
    } else if (d2 >= get_posterior_call_threshold()) {
      target.push_back(1);
    } else if (1.0 - d1 - d2 >= get_posterior_call_threshold()) {
      target.push_back(0);
    } else {
      target.push_back(get_valid_error_code());
    }
  } else if (get_stored_data_format() == ROUNDED_GENOTYPES ||
	     get_stored_data_format() == DOSAGE) {
    process_dosage(2.0 * d1 + d2, target);
  } else if (get_stored_data_format() == PROBABILITIES) {
    target.push_back(static_cast<unsigned>
		     (d1 * pow(10, get_decimal_precision() - 1) + 0.5));
    target.push_back(static_cast<unsigned>
		     (d2 * pow(10, get_decimal_precision() - 1) + 0.5));
  } else if (get_stored_data_format() == UNKNOWN_STORAGE_FORMAT) {
    throw std::domain_error("genezip::dataset::process_genotype: "
			    "specified storage format 'unknown' "
			    "is not a valid option");
  } else {
    throw std::domain_error("genezip::dataset::process_genotype: "
			    "unhandled data storage format");
  }
}

/*!
  \brief compute a stored value that can serve as an error/missing code
  \return n + 1, where n is 10^(decimal precision - 1) for PROBABILITIES
  storage and twice that for every other storage format
  \throws std::domain_error if the decimal precision is below 1, or if
  the smallest power of two not below n does not exceed n
 */
unsigned genezip::dataset::get_valid_error_code() const {
  if (get_decimal_precision() < 1)
    throw std::domain_error("genezip::dataset::get_valid_error_code: "
                            "invalid decimal precision: "
                            + genezip_utils::to_string
                            <unsigned>(get_decimal_precision()));
  //largest scaled data value; + 0.5 guards against an inexact pow()
  unsigned n = static_cast<unsigned>
    (pow(10, get_decimal_precision() - 1) + 0.5);
  if (get_stored_data_format() != PROBABILITIES)
    n <<= 1;

  //m = 2^ceil(log2(n)): the number of distinct values available in
  //ceil(log2(n)) bits. (raising the bit count to the power 2 instead
  //gives e.g. 121 for n = 2000, which made the check below always fail)
  unsigned m = static_cast<unsigned>
    (pow(2.0, ceil(log2(n))) + 0.5);

  if (m <= n)
    throw std::domain_error("genezip::dataset::get_valid_error_code: "
                            "no valid codes found for this parameter "
                            "combination: " +
                            genezip_utils::to_string<unsigned>(m) +
                            " " +
                            genezip_utils::to_string<unsigned>(n));
  return n+1;
}
