#!/usr/bin/perl

###############################################################
#  programmer:		bradley e. slaven
#
#  description:		see help area below.
#
#
################################################################

use Getopt::Long;
use Carp qw(carp croak);
use strict;

local *IN_FILE;
local *LOG_FILE;
local *QTRIM_FILE;

my %PROB_ERROR_HASH = ();
my $MAX_HASH_INDEX = 100;
my @QUAL_ARRAY = ();
my $QUAL_ARRAY_LEN = 0;
my @ERROR_ARRAY = ();
my $ERROR_ARRAY_LEN = 0;
my $MIN_SEQ_LENGTH = 50;
my $LONGEST_QUALITY_LENGTH = 0;
my $LONGEST_QUALITY_SCORE = 1.0;


my $LENGTH = 0;
my $ERROR = 1.0;

my $opt_h;
my $opt_l;
my $opt_e;
my $opt_q;
my $opt_f;

MAIN:
{  #beg Main

Getopt::Long::config('no_ignore_case');
use Carp qw(carp croak);
GetOptions('h!'		=> \$opt_h,
	   'help!'	=> \$opt_h,
	   'l=s'	=> \$opt_l,
	   'e=s'	=> \$opt_e,
	   'q=s'	=> \$opt_q,
	   );

$opt_h and usage('help');
$opt_q or usage();
$opt_l or usage();
$opt_e or usage();


$LENGTH = $opt_l;
$MIN_SEQ_LENGTH = $LENGTH -1;
$ERROR = $opt_e;

#$opt_f or usage();

my $qFile = $opt_q;
#my $fFile = $opt_f;

my $logFileName = "$qFile" . "\.TRIMLOG";
open (LOG_FILE, ">$logFileName") or croak "Can Not open $logFileName: $!\n";

my $qTrimFileName = "$qFile" . "\.QUAL_TRIM";
open (QTRIM_FILE, ">$qTrimFileName") or croak "Can Not open $qTrimFileName: $!\n";

my $tLocationFileName = "$qFile" . "\.TRIM_LOC";
open (TRIM_LOC, ">$tLocationFileName") or croak "Can Not open $tLocationFileName: $!\n";

print "NEW QUALITY FILENAME = $qFile\n";
print "LOG FILENAME = $logFileName\n";
print "TRIM LOCATION FILENAME = $tLocationFileName\n";


calculateProbErrorHash();
#printProbErrorHash();
readQualFilename($qFile);

close(QTRIM_FILE);
close(LOG_FILE);
close(TRIM_LOC);

}  #end Main



sub printProbErrorHash() {

	print("PROB_ERROR_HASH:\n");
	print("================\n");
	for (my $i = 0; $i <= $MAX_HASH_INDEX; $i++) {
		#print("PROB_ERROR_HASH{$i}=$PROB_ERROR_HASH{$i}\n");
		my $prob = $PROB_ERROR_HASH{$i};
		print("PROB_ERROR_HASH[$i]=$prob\n");
	}
	print("================\n");

} #end printProbErrorHash




sub calculateProbErrorHash() {

	#error calculation:
	#probError = 10 ** (-qualScore/10)

	for (my $i = 0; $i <= $MAX_HASH_INDEX; $i++) {
		#print("i=$i\t");
		my $exp = $i / 10;
		my $power = 10 ** $exp;
		my $probError = 1 / $power;
		#print("probError=$probError\n");
		$PROB_ERROR_HASH{$i} = $probError;
		#print("PROB_ERROR_HASH{$i}=$PROB_ERROR_HASH{$i}\n");
	} #end for

} #end calculateProbErrorHash

#sub readFastaFilename() {
#
#	my $fname = $_[0];
#	open (IN_FILE, $fname) or croak "Can Not open $fname: $!\n";
#
#	my $seq = "";
#	my $line = "";
#	my $name = "";
#	my $numSeqs = 0;
#
#	while (<IN_FILE>) {
#
#		chomp;
#		my $line = $_;
#
#		if ($line =~ /^>/) {
#			$name = $line;
#			$name =~ s/^>//;
#			$name =~ s/\s.*//;
#			#print("name=$name\n");
#			if ($numSeqs > 0) {
#			#process record
#
#				my $begEndRecord = $BEG_END_HASH{"$name");
#				my @begEndArray = split(/\t/, $begEndRecord);
#				my $beg = $begEndArray[0];
#				my $end = $begEndArray[1];
#				printAnnotatedFastaSequence($name, $beg, $end, $seq);
#				
#			} #end if numSeqs
#			$seq = "";
#			++$numSeqs;
#		}
#		else {
#			my $len = length($line);
#			if ($len > 0) {
#				$seq = $seq . " " . $line;
#			} #end len GT 0
#		} #end else
#	
#	} #end IN_FILE while
#
#} #end readFastaFilename

sub readQualFilename() {

	my $fname = $_[0];
	open (IN_FILE, $fname) or croak "Can Not open $fname: $!\n";

	my $seq = "";
	my $line = "";
	my $name = "";
	my $numSeqs = 0;
	my $numPassedSequences = 0;
	my $numFailedSequences = 0;
	print("READ_NAME\tSTATUS\tBEG\tLEN\tE-SCORE\n");

	while (<IN_FILE>) {

		chomp;
		my $line = $_;

		if ($line =~ /^>/) {

			if ($numSeqs > 0) {
				@QUAL_ARRAY = loadQualArray($seq);
				$QUAL_ARRAY_LEN = $#QUAL_ARRAY;

				@ERROR_ARRAY = convertQualToProbErrors();
				$ERROR_ARRAY_LEN = $#ERROR_ARRAY;

				$LONGEST_QUALITY_LENGTH = 0;
				$LONGEST_QUALITY_SCORE = 1.0;
				my $begLongestQualSeq = getMaxSeqLength();
				if ($begLongestQualSeq >= 0) {
				  print TRIM_LOC ("$name\tACCEPT\t$begLongestQualSeq\t$LONGEST_QUALITY_LENGTH\t$LONGEST_QUALITY_SCORE\n");
				  print ("$name\tACCEPT\t$begLongestQualSeq\t$LONGEST_QUALITY_LENGTH\t$LONGEST_QUALITY_SCORE\n");
				  printAnnotatedQualSequence($name, $begLongestQualSeq, $LONGEST_QUALITY_LENGTH);
				  printAnnotatedQualLog($name, $begLongestQualSeq, $LONGEST_QUALITY_LENGTH);
				  ++$numPassedSequences
				}
				else {
				  print TRIM_LOC ("$name\tREJECT\t$begLongestQualSeq\t$LONGEST_QUALITY_LENGTH\t$LONGEST_QUALITY_SCORE\n");
				  print ("$name\tREJECT\t$begLongestQualSeq\t$LONGEST_QUALITY_LENGTH\t$LONGEST_QUALITY_SCORE\n");
				  print LOG_FILE ("**$name: NO SEQ GT $LENGTH BASE AND QUAL LESS THAN $ERROR **\n");
				  ++$numFailedSequences
				}
				#print("ERROR ARRAY:\n");
				#for (my $i = 1; $i <= $ERROR_ARRAY_LEN; $i++) {
				#	print("$ERROR_ARRAY[$i]\n");
				#}

			} #end if numSeqs
			$name = $line;
			$name =~ s/^>//;
			$name =~ s/\s.*//;
			#print("name=$name\n");

			$seq = "";
			++$numSeqs;
		} #end if
		else {
			my $len = length($line);
			if ($len > 0) {
				$seq = $seq . " " . $line;
			} #end len GT 0
		} #end else
	
	} #end IN_FILE while

	close (IN_FILE);

	print LOG_FILE ("numberOfPassedSequences=$numPassedSequences\n");
	print LOG_FILE ("numberOfFailedSequences=$numFailedSequences\n");

}  #end readQualFilename




sub printAnnotatedQualLog() {

	my $name = $_[0];
	my $beg = $_[1];
	my $len = $_[2];

	my $end = $beg + $len;

	print LOG_FILE (">$name\n");

	for (my $i = 1; $i <= $QUAL_ARRAY_LEN; $i++) {
		if ($i == $beg) { print LOG_FILE ("["); }
		if ($QUAL_ARRAY[$i] < 10) {
		  print LOG_FILE ("$QUAL_ARRAY[$i]  ");
		}
		else {
		  print LOG_FILE ("$QUAL_ARRAY[$i] ");
		}
		if ($i == $end) { print LOG_FILE ("]"); };
		if (($i % 25) == 0) { print LOG_FILE ("\n"); }
	}
	print LOG_FILE ("\n");

} #end printAnnotatedQualLog



sub printAnnotatedQualSequence() {

	my $name = $_[0];
	my $beg = $_[1];
	my $len = $_[2];

	my $end = $beg + $len;

	print QTRIM_FILE (">$name\n");

	for (my $i = $beg; $i <= $end; $i++) {
		#if ($i == $begBest) { print("["); }
		if ($QUAL_ARRAY[$i] < 10) {
			print QTRIM_FILE ("$QUAL_ARRAY[$i]  ");
		}
		else {
			print QTRIM_FILE ("$QUAL_ARRAY[$i] ");
		}
		#if ($i == $endBest) { print("]"); };
		if (($i % 25) == 0) { print QTRIM_FILE ("\n"); }
	}
	print QTRIM_FILE ("\n");

} #end printAnnotatedQualSequence




sub getMaxSeqLength() {

	$LONGEST_QUALITY_LENGTH = 0;
	$LONGEST_QUALITY_SCORE = 1.0;

	my $begQualityLength = -1;

	my $beg = 1;
	my $end = $MIN_SEQ_LENGTH + $beg;

	#print("beg at $beg to $end\n");

	my $shiftOneBaseRight = $ERROR_ARRAY_LEN - $end + 2;
	my $subArraySum = 1.0;
	my $subArrayLen = -1;

	while ($beg < $shiftOneBaseRight) {
		#print("beg at $beg\n");
		my @subArray = ();
		#get first qual string in sequence
		@subArray = substrErrorArray($beg, $end);
		$subArrayLen = $#subArray - 1;
		$subArraySum = $subArray[$#subArray];
		if (($subArraySum <= $ERROR) && 
                    ($LONGEST_QUALITY_LENGTH < $subArrayLen))  {
			$LONGEST_QUALITY_LENGTH = $subArrayLen;
			$LONGEST_QUALITY_SCORE = $subArraySum;
			$begQualityLength = $beg;
		}
	
		#print("subArray: sum=$subArraySum\n");
		#for (my $i = 0; $i <= $subArrayLen; $i++) {
		#	print("subArray[$i]=$subArray[$i]\n");
		#}
		#print("\n");

		my $extendSeqEnd = $end + 1;

		#print ("extendedSeqEnd = $extendSeqEnd\tend=$end\n");
		#print ("ERROR_ARRAY_LEN=$ERROR_ARRAY_LEN\tERROR=$ERROR\n");
		#print ("subArraySum=$subArraySum\n");

		while (($extendSeqEnd < $ERROR_ARRAY_LEN) && 
                       ($subArraySum <= $ERROR)) {

			#print("$beg TO $extendSeqEnd\n");
			@subArray = ();
			@subArray = substrErrorArray($beg, $extendSeqEnd);
			$subArrayLen = $#subArray - 1;
			$subArraySum = $subArray[$#subArray];
			if (($LONGEST_QUALITY_LENGTH < $subArrayLen) &&
			    ($subArraySum <= $ERROR)) {
				$LONGEST_QUALITY_LENGTH = $subArrayLen;
				$LONGEST_QUALITY_SCORE = $subArraySum;
				$begQualityLength = $beg;
			}
			#print("subArray: sum=$subArraySum\n");
			#for (my $i = 0; $i <= $subArrayLen; $i++) {
			#	print("subArray[$i]=$subArray[$i]\n");
			#}
			#print("\n");

			++$extendSeqEnd;
		}

		#print("\n");
		++$beg;
		$end = $MIN_SEQ_LENGTH + $beg;
	} #end while

	#return
	$begQualityLength;

} #end getMaxSeqLength


sub substrErrorArray () {
	my $beg = $_[0];
	my $end = $_[1];
	my $sum = 0.0;

	my @sArray = ();
	my $index = 0;
	for (my $i = $beg; $i <= $end; $i++) {
		$sArray[$index] = $ERROR_ARRAY[$i];
		$sum += $ERROR_ARRAY[$i];
		++$index;
	} #end for
	$sArray[$index] = $sum;  #place sum in last elmt of array

	#return - last elmt of sArray is the SUM!!
	@sArray;
	
} #end substrErrorArray


sub convertQualToProbErrors() {

	my @prob_array = ();
	for (my $i = 0; $i <= $QUAL_ARRAY_LEN; $i++) {
		#print("$QUAL_ARRAY[$i] ");
		my $prob = 1.0;
		if ($QUAL_ARRAY[$i] > $MAX_HASH_INDEX) {
			print ("ERROR:QUAL_ARRAY[$i]=$QUAL_ARRAY[$i]\n");
			$prob = 1.0;
			$prob_array[$i] = $prob;
		} #end if
		else {
			$prob = $PROB_ERROR_HASH{$QUAL_ARRAY[$i]};
			$prob_array[$i] = $prob;
		} #end else
	} #end for

	#return prob_array
	@prob_array;

} #end convertQualToProbScores


sub loadQualArray() {

	my $qString = $_[0];
	my @qual_array = ();
	#print ("qString=$qString\n");
	@qual_array = split(/\s+/, $qString);

	#return qual score array
	@qual_array;

} #end loadQualArray




sub usage {
my $help = shift || 0;
print "Usage: $0 [-options]\n";
print "-h help\n";
print "-q quality score fasta file\n";
#print "-f sequence base fasta file\n";
print "-l length of sequence required\n";
print "-e max error\n";
help() if ($help);
exit;
}



sub help {
print qq~
===============================================================
$0: DOCUMENTATION
---------------------------------------------------------------
This program reads a quality file and it's associated fasta sequence
trimming each read to the arachne assembler specification.

usage example:

	$0 -q qualFilename -l min_length -e max_error

~
}
