#!/usr/bin/perl -w

# Copyright (C) 2013,2014 Ole Tange, Mike DeGiorgio, Anna-Sapfo
# Malaspinas, Jose Victor Moreno-Mayar, Yong Wang and Free Software
# Foundation, Inc.
# 
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License,
# or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

use strict;
use local::lib;
use Getopt::Long;

$Global::progname = "bamdamage";
$Global::version = "20140328";

Getopt::Long::Configure("bundling");
my @retval = GetOptions
    ("debug|D" => \$opt::debug,
     "verbose|v" => \$opt::verbose,
     "help|h" => \$opt::help,
     "mapquality|m=i" => \$opt::mapq,
     "basequality|b=i" => \$opt::baseq,
     "sample|s=s" => \$opt::sample,
     "version|V" => \$opt::version,
    ) or usage(1);

if($opt::help) { usage(0); }

if($opt::version) {
  version();
  exit(0);
}

my $mapqual_lb = $opt::mapq || 30; # Default minimum mapping quality 30
my $seqqual_ub = 50; # Ignore qualities > 50
my $seqqual_os = 33; # quality score offset
my $seqqual_lb = $opt::baseq || 20; # Default minimum base quality 20
my $length_lb = 20;
my $length_ub = 100; # Max sequence length
my $position_ub = 80; # Max position
my $damage_ub = 20;

my $file = $ARGV[0];
my @out;
if($file and -r $file) {
    open(IN,"-|","samtools","view","-h",$file) or error("Cannot read $file\n");
    if($opt::debug) {
	open(OUT,">",$file.".stat") or error("Cannot write ${file}.stat\n");
    } else {
	open(OUT,">/dev/null") or error("Cannot write ${file}.stat\n");
    }
} else {
    usage(1);
    exit(1);
}
my $outfile = $file;
$outfile =~ s:.*/::; # Remove full path
$outfile =~ s/\.....?$//; # Remove extension if any
$outfile .= ".dam.pdf";

my %ACGT = ("A" => 0, "C" => 1, "G" => 2, "T" => 3);
my @count_by_length;
my @count_by_qual;
my @count_by_mut5;
my @count_by_mut3;
my @count_by_damage;

for(my $i=$seqqual_lb;$i<$seqqual_ub;$i++) {
	$count_by_qual[$i]=0;
}

for(my $i=$length_lb;$i<$length_ub;$i++) {
	$count_by_length[$i]=0;
}

for(my $i=0;$i<$damage_ub;$i++) {
	$count_by_damage[$i]=0;
}

for(my $i=0;$i<4;$i++) {
    for (my $j=0;$j<4;$j++) {
	for(my $k=0;$k<$position_ub;$k++){
	    $count_by_mut5[$i][$j][$k]=0;
	    $count_by_mut3[$i][$j][$k]=0;
	}
    }
}


my $line;
my $linecount = 0;
my $curres;
my $unparsable_warning_printed = 0;
my $unexpected_md_warning_printed = 0;
while ($line=<IN>) {
    if ($line=~/^@/) {
	if ($opt::debug) {print $line;}
	next;
    }
    #===== Extract Info =======#
    if($line =~ /^\S+\t(\S+)\t\S+\t\S+\t(\S+)\t(\S+)\t\S+\t\S+\t\S+\t(\S+)\t(\S+)\t/) {
	# The line parses: Go on
    } else {
	if($opt::verbose) {
	    warning("This line is not parsable: $line");
	} else {
	    if($unparsable_warning_printed++) {
		# skip
	    } else {
		warning("The bam file contains unparsable lines. Use -v to see them.\n");
	    }
	}
	next;
    }
    my $mapflag = $1;
    my $mapqual = $2;
    my $CIGAR = $3;
    my $read = $4;
    my $qual_string = $5;
    my $reverseflag = 0;
    if ($mapflag & 16) {
	$reverseflag=1;
    } else {
	$reverseflag=0;
    }

    # Maping quality too low
    if ($mapqual < $mapqual_lb) { next; }

    # Count length
    my $len = length($qual_string);
    my $hlen = $len;
    if ($hlen >= $length_ub) { $hlen = $length_ub - 1; }
    $count_by_length[$hlen]++;

    # Count sequencing quality
    my @usesite;
    for (my $i = 0; $i<$len; $i++) {
	my $ch = substr($qual_string,$i,1);
	my $qual = ord($ch) - $seqqual_os;
	if ($qual >= $seqqual_lb) {
	    $usesite[$i]=1;
	} else {
	    $usesite[$i]=0;
	}
	if ($qual >= $seqqual_ub) { $qual = $seqqual_ub-1; }
	$count_by_qual[$qual]++;
    }

    #===== Pharse CIGAR string to create a draft reference (with INDEL)  ========#
    my $CIGARBK = $CIGAR;
    my $ref = "";
    my $pos = 0;
    my $check = 0;
    my $nDAM = 0;

    my $cigar_expanded = "";

    while (length($CIGAR)>0) {
	$CIGAR =~ s/^(\d+)([A-Z])//;
	my $seglen = $1;
	my $sym = uc($2);	
	if ($sym eq "M") {
	    $ref .= substr($read,$pos,$seglen);
	    $cigar_expanded = $cigar_expanded . ("M" x $seglen);
	    $pos += $seglen;
	} elsif ($sym eq "I") {
	    $ref .= "-" x $seglen;
	    $cigar_expanded = $cigar_expanded . ("I" x $seglen);
	    $pos += $seglen;
	} elsif ($sym eq "D") {
	    # skip
	} elsif ($sym eq "N") {
	    $ref .= "N" x $seglen;
	    $cigar_expanded = $cigar_expanded . ("N" x $seglen);
	    $pos += $seglen;
	    $check = 1;
	} elsif ($sym eq "P") {
	    $check = 1;
	} elsif ($sym eq "H") {
	    $check = 1;
	} elsif ($sym eq "S") {
	    $ref .= "N" x $seglen;
	    $cigar_expanded = $cigar_expanded . ("S" x $seglen);
	    $pos += $seglen;
	    $check = 1;
	} else {
	    $check = 1;
	}
    }

    #==== Pharse MD string to reverse mutation back to create final reference ======#
    #==== and counting the number of all matches/mismatches and damages ====#	
    $line =~ /MD:Z:(.*?)[\s\n]/;
    my $mismstr = $1;
    my $mismstrbk = $1;
    
    $mismstr=~s/^(\d+)//;
    my $matchl=$1;
    $pos = 0;
    my $count_CG=0;

    for(my $i=0; $i<$matchl;) {
	$curres = uc(substr($ref,$pos,1));
	#unless ($curres eq "-" || $curres eq "N") {$i++;}
	unless (substr($cigar_expanded, $pos, 1) ne "M") {$i++;}
	if ($usesite[$pos]) {
	    if (($curres eq "C" && $reverseflag == 0) || ($curres eq "G" && $reverseflag == 1)) {
		$count_CG++;
	    }
	    if ($curres=~/[ACGT]/) {
		my $refno = $ACGT{$curres};
		if ($pos<$position_ub && $refno>=0) {
		    unless ($reverseflag) {
			$count_by_mut5[$refno][$refno][$pos]++;
		    } else {
			$count_by_mut3[3-$refno][3-$refno][$pos]++;
		    }
		}
		my $pos3 = $len-$pos-1;
		if ($pos3<$position_ub && $pos3>=0 && $refno>=0) {
		    unless ($reverseflag) {
			$count_by_mut3[$refno][$refno][$pos3]++;
		    } else {
			$count_by_mut5[3-$refno][3-$refno][$pos3]++;
		    }
		}
	    }
	}
	$pos++;
    }

    while (length($mismstr)>0) {
	$mismstr=~s/^(.*?)(\d+)//;
	my $curstr=$1;
	my $matchl=$2;
	if ($curstr=~/\^/) {
	    # skip
	} else {
	    my $refres=uc(substr($ref,$pos,1));
	    while(substr($cigar_expanded, $pos, 1) ne "M") {
		$pos++;
		$refres=uc(substr($ref,$pos,1));
	    }	
	    $refres = uc($curstr);
	    #added the option to have an N nucleotide
	    unless ($refres=~/[ACGTN]/) {
		if($unexpected_md_warning_printed++) {
		    if($opt::verbose) {
			warning("unexpected MD string $mismstrbk $curstr $refres\n");
		    }
		} else {
		    warning("The bam file contains unexpected MD strings. Use -v to see them.\nIf you have used GATK to realign your reads followed by samtools calmd, this is to be expected.\n");
		}
	    }
	    my $readres = uc(substr($read,$pos,1));
	    #added the option to have an N nucleotide
	    unless ($readres=~/[ACGTN]/) {
		if($unexpected_md_warning_printed++) {
		    if($opt::verbose) {
			warning("unexpected MD string $mismstrbk $read $readres\n");
		    }
		} else {
		    warning("The bam file contains unexpected MD strings. Use -v to see them.\nIf you have used GATK to realign your reads followed by samtools calmd, this is to be expected.\n");
		}
	    }
				
	    if ($usesite[$pos]) {
		if ( ($readres eq "T" && $refres eq "C" && $reverseflag == 0) 
		     ||
		     ($readres eq "    A" && $refres eq "G" && $reverseflag == 1)) {
		    $nDAM++;
		}
		if (($curres eq "C" && $reverseflag==0)
		    ||
		    ($curres eq "G" && $reverseflag==1)) {
		    $count_CG++;
		}
		my $readno = $ACGT{$readres};
		my $refno = $ACGT{$refres};
		if (defined($refno) and $pos < $position_ub && $refno >= 0 && $readno >= 0) {
		    unless ($reverseflag) {
			$count_by_mut5[$readno][$refno][$pos]++;
		    } else {
			$count_by_mut3[3-$readno][3-$refno][$pos]++;
		    }
		}
		my $pos3 = $len-$pos-1;
		if (defined($refno) and $pos3<$position_ub && $pos3>=0 && $refno>=0 && $readno>=0) {
		    unless ($reverseflag) {
			$count_by_mut3[$readno][$refno][$pos3]++;
		    } else {
			$count_by_mut5[3-$readno][3-$refno][$pos3]++;
		    }
		}
	    }		
	    substr($ref,$pos,1)=$refres;
	    $pos++;
	}
	for(my $i=0; $i<$matchl;) {
	    $curres=uc(substr($ref,$pos,1));	
	    unless (substr($cigar_expanded, $pos, 1) ne "M") {$i++;}
	    if ($usesite[$pos]) {
		if (($curres eq "C" && $reverseflag==0)||($curres eq "G" && $reverseflag==1)) {
		    $count_CG++;
		}
		if ($curres =~ /[ACGT]/) {
		    my $refno = $ACGT{$curres};
		    if ($pos<$position_ub && $refno>=0) {
			unless ($reverseflag) {
			    $count_by_mut5[$refno][$refno][$pos]++;
			} else {
			    $count_by_mut3[3-$refno][3-$refno][$pos]++;
			}
		    }
		    my $pos3=$len-$pos-1;
		    if ($pos3<$position_ub && $pos3>=0 && $refno>=0) {
			unless ($reverseflag) {
			    $count_by_mut3[$refno][$refno][$pos3]++;
			} else {
			    $count_by_mut5[3-$refno][3-$refno][$pos3]++;
			}
		    }
		}
	    }
	    $pos++;
	}
    }

    #=== Print New SAM file with Reference ====#
    chomp($line);
    if ($opt::debug) {
	if ($nDAM>0) {
	    warning("$line\t","$ref\n");	
	}
	if ($check==1) {
	    warning("$CIGARBK\n",
		    "$mismstrbk\n",
		    "$read\n",
		    "$ref\n\n",);
	}
    }

    #=== Calculate damage rate ======#
    if ($count_CG>0) {
	$nDAM = int($nDAM*100/$count_CG);
	if ($nDAM >= $damage_ub) { $nDAM = $damage_ub-1; }
	$count_by_damage[$nDAM]++;
    }        

    #=== On screen output: how many lines processed ====#
    $linecount++;
    if($opt::verbose) {
	if (int($linecount/10000)*10000 == $linecount) {
	    print STDERR "$linecount lines processed\n";
	}
    }
}

for (my $i=$seqqual_lb;$i<$seqqual_ub;$i++) {
	print OUT "$i ";
	push @out, "$i ";
}
print OUT "\n";
push @out, "\n";
for (my $i=$seqqual_lb;$i<$seqqual_ub;$i++) {
	print OUT "$count_by_qual[$i] ";
	push @out, "$count_by_qual[$i] ";
}
print OUT "\n";
push @out, "\n";
#=== Output Read length Distribution =====#
for (my $i=$length_lb;$i<$length_ub;$i++) {
	print OUT "$i ";
	push @out, "$i ";
}
print OUT "\n";
push @out, "\n";
for (my $i=$length_lb;$i<$length_ub;$i++) {
	print OUT "$count_by_length[$i] ";
	push @out, "$count_by_length[$i] ";
}
print OUT "\n";
push @out, "\n";

#=== Output Damage Rate Distribution =====#
for (my $i=0;$i<$damage_ub;$i++) {
	print OUT "$i ";
	push @out, "$i ";
}
print OUT "\n";
push @out, "\n";
for (my $i=0;$i<$damage_ub;$i++) {
	print OUT "$count_by_damage[$i] ";
	push @out, "$count_by_damage[$i] ";
}
print OUT "\n";
push @out, "\n";

#=== Output Mismatch Patterns =====#
for (my $i=0;$i<4;$i++) {
    for (my $j=0;$j<4;$j++) {
	for(my $k=0;$k<$position_ub;$k++) {
	    my $mutrate;
	    my $totalres = $count_by_mut5[0][$j][$k] + $count_by_mut5[1][$j][$k] 
		+ $count_by_mut5[2][$j][$k] + $count_by_mut5[3][$j][$k];
	    if ($totalres==0) {
		$mutrate=0;
	    } else {
		$mutrate=$count_by_mut5[$i][$j][$k]/$totalres;
	    }
	    print OUT "$mutrate ";
	    push @out, "$mutrate ";
	}
	print OUT "\n";
	push @out, "\n";
    }
}

for (my $i=0;$i<4;$i++) {
    for (my $j=0;$j<4;$j++) {
	for(my $k=0;$k<$position_ub;$k++) {
	    my $mutrate;
	    my $totalres = $count_by_mut3[0][$j][$k] + $count_by_mut3[1][$j][$k]
		+ $count_by_mut3[2][$j][$k] + $count_by_mut3[3][$j][$k];
	    if ($totalres == 0) {
		$mutrate = 0;
	    } else {
		$mutrate = $count_by_mut3[$i][$j][$k]/$totalres;
	    }
	    print OUT "$mutrate ";
	    push @out, "$mutrate ";
	}
	print OUT "\n";
	push @out, "\n";
    }
}

close(IN);
close(OUT);

my $file_no_bam = $file;
$file_no_bam =~ s:.*/::; # Remove full path
$file_no_bam =~ s/\.bam$//i;
open(R,"|-","R --vanilla > /dev/null") || die;
print R R_script($opt::sample||$file_no_bam, $outfile, @out);
close R;
print STDERR "Saved $outfile\n";

sub debug {
    # Returns: N/A
    $opt::debug or return;
    @_ = grep { defined $_ ? $_ : "" } @_;
    if($Global::fd{1}) {
	# Original stdout was saved
	my $stdout = $Global::fd{1};
        print $stdout @_;
    } else {
        print @_;
    }
}

sub version {
    # Returns: N/A
    print join("\n",
               "$Global::progname $Global::version",
               "Copyright (C) 2013,2014 Yong Wang, Ole Tange and Free Software Foundation, Inc.",
               "License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>",
               "This is free software: you are free to change and redistribute it.",
               "GNU $Global::progname comes with no warranty.",
               "",
               "Web site: http://www.nongnu.org/software/bammds\n",
	       "When using $Global::progname to process data for publication please cite:\n",
	       "<<INSERT ARTICLE HERE>>\n",
        );
}

sub usage {
    my $exit = shift;
    version();
    print "\nUsage:\n\n",
    "  bamdamage [--mapquality qual] [--basequality qual] [--sample name] [file.bam]\n",
    "  bamdamage [-m qual] [-b qual] [-s name] file.bam\n";
    exit($exit);
}

sub error {
    my @w = @_;
    my $fh = $Global::original_stderr || *STDERR;
    my $prog = $Global::progname || "bamdamage";
    print $fh $prog, ": Error: ", @w;
    exit(1);
}

sub warning {
    my @w = @_;
    my $fh = $Global::original_stderr || *STDERR;
    my $prog = $Global::progname || "bamdamage";
    print $fh $prog, ": Warning: ", @w;
}

sub R_script {
    my $sample_name = shift;
    my $pdffile = shift;
    my @values = @_;
    
    my @script =
	(
	 qq(
           sample_name <- "$sample_name"
           pdfname <- "$pdffile"
           str <- "@values"),
	 q(
open_plot_file <- function(mds_file) {
  if(grepl(".pdf$", mds_file, ignore.case = T)) {
    pdf(mds_file, height=7, width=14);
  } else if(grepl(".png$", mds_file, ignore.case = T)) {
    save_file <- gsub("(....)$", ".%1d\\1", mds_file)
    png(save_file, height=1000, width=2000);
    mds_file <<- gsub("(....)$", ".*\\1", mds_file)
  } else if(grepl(".svg$", mds_file, ignore.case = T)) {
    save_file <- gsub("(....)$", ".%1d\\1", mds_file)
    svg(save_file, height=7, width=14);
    mds_file <<- gsub("(....)$", ".*\\1", mds_file)
  } else if(grepl(".jpg$", mds_file, ignore.case = T)) {
    save_file <- gsub("(....)$", ".%1d\\1", mds_file)
    jpeg(save_file, height=1000, width=2000);
    mds_file <<- gsub("(....)$", ".*\\1", mds_file)
  } else if(grepl(".jpeg$", mds_file, ignore.case = T)) {
    save_file <- gsub("(.....)$", ".%1d\\1", mds_file)
    jpeg(save_file, height=1000, width=2000);
    mds_file <<- gsub("(.....)$", ".*\\1", mds_file)
  } else if(grepl(".tif$", mds_file, ignore.case = T)) {
    save_file <- gsub("(....)$", ".%1d\\1", mds_file)
    tiff(save_file, height=1000, width=2000);
    mds_file <<- gsub("(....)$", ".*\\1", mds_file)
  } else if(grepl(".tiff$", mds_file, ignore.case = T)) {
    save_file <- gsub("(.....)$", ".%1d\\1", mds_file)
    tiff(save_file, height=1000, width=2000);
    mds_file <<- gsub("(.....)$", ".*\\1", mds_file)
  } else {
    ## Unknown format
    error <- paste("Unknown plot format:",mds_file);
    write(error, stderr());
    quit("no",1);
  }
}

open_plot_file(pdfname);

## plot the quality distribution: remove!
# ifn1 <- textConnection(str);
# data1<-read.table(ifn1,header=F,nrows=2,skip=0)
# data1[2,]<-data1[2,]/sum(as.numeric(data1[2,]))
# mat1<-as.matrix(data1[2,])

#barplot(mat1,main="sequencing quality distribution",xlab="quality score",ylab="frequency",col="dodgerblue4",names.arg=data1[1,],legend=sample_name)

##plot of the read length distribution
par(mfrow=c(1,2))
ifn1 <- textConnection(str);
data1<-read.table(ifn1,header=F,nrows=2,skip=2)
values = data1[1,]
counts = data1[2,]

data1[2,]<-data1[2,]/sum(as.numeric(data1[2,]))
freqs = data1[2,]
average = sum(as.vector(values)*as.vector(freqs))
mat2<-as.matrix(data1[2,])

barplot(mat2,main="read length distribution",xlab="read length",ylab="frequency",col="dodgerblue4",names.arg=data1[1,])
legend("topleft",paste(sample_name,", average: ",round(average,2),sep=""),fill="dodgerblue4")

## plot of the damage distribution
#ifn1 <- textConnection(str);
#data1<-read.table(ifn1,header=F,nrows=2,skip=4)
#data1[2,]<-data1[2,]/sum(as.numeric(data1[2,]))
#mat3<-as.matrix(data1[2,])
#barplot(mat3,main="damage distribution",xlab="damage %",ylab="frequency",col="dodgerblue4",names.arg=data1[1,],legend=sample_name)
  
## plot damage from 5 end
par(mfrow=c(1,2))

ifn1 <- textConnection(str);
data1<-read.table(ifn1,header=F,nrows=16,skip=6)
tmp <- t(data1)
dam <- cbind(rowSums(tmp),tmp[,1:16])
nda <- dam
yma <- max(nda[1:25,c(3:6,8:11,13:16)])
yma <- round(yma*1000+0.5)/1000

Colors = c("olivedrab3","deepskyblue1","violetred","olivedrab3","gold","darkorange","deepskyblue1","gold","darkblue","violetred","darkorange","darkblue")
Linetypes = c(1,1,1,2,1,2,2,2,1,1,1,2)
plot(nda[,3], main=paste("Damage Pattern 5' end",sample_name,sep=" "),xlab="Position from 5 end",ylab="Frequency",ylim=c(0,yma),col=2,type="l",lwd=1);
points(nda[,4],type="l",col="deepskyblue1",lwd=3);
points(nda[,15],type="l",col="darkorange",lwd=3);
count = 0;
for (i in 5:16) {
        if (i!=7 && i!=12 && i!=15) {
          count=count+1;
          points(nda[,i],type="l",col=Colors[count],lty=Linetypes[count]);
        }
}

legend("topright",legend=c("C->A","G->A","T->A","A->C","G->C","T->C","A->G","C->G","T->G","A->T","C->T","G->T"),col=Colors,lwd=c(1,3,1,1,1,1,1,1,1,1,3,1),lty=c(1,1,1,2,1,2,2,2,1,1,1,2),cex=0.5);

ifn1 <- textConnection(str);
data1<-read.table(ifn1,header=F,nrows=16,skip=22)
tmp <- t(data1)
dam <- cbind(rowSums(tmp),tmp[,1:16])
nda <- dam;
yma <- max(nda[1:25,c(3:6,8:11,13:16)]);
yma <- round(yma*1000+0.5)/1000;

plot(nda[,3], main=paste("Damage Pattern 3' end",sample_name,sep=" "),xlab="Position from 3 end",ylab="Frequency",ylim=c(0,yma),col=2,type="l",lwd=1);
points(nda[,4],type="l",col="deepskyblue1",lwd=3);
points(nda[,15],type="l",,col="darkorange",lwd=3);
count = 0;

for (i in 5:16) {
        if (i!=7 && i!=12 && i!=15) {
          count=count+1;
          points(nda[,i],type="l",col=Colors[count],lty=Linetypes[count]);
        }
}
legend("topright",legend=c("C->A","G->A","T->A","A->C","G->C","T->C","A->G","C->G","T->G","A->T","C->T","G->T"),col=c("olivedrab3","deepskyblue1","violetred","olivedrab3","gold","darkorange","deepskyblue1","gold","darkblue","violetred","darkorange","darkblue"),lwd=c(1,3,1,1,1,1,1,1,1,1,3,1),lty=c(1,1,1,2,1,2,2,2,1,1,1,2),cex=0.5);


dev.off()

    ));

    return @script;
}
