#!/usr/local/bin/perl5
#
# build-trimming-info: given an alignment, and given a target sequence
# in that alignment, this script gets the posterior decoding column cost
# of that sequence in the alignment, as can be used for alignment trimming.
#
# Output is in a fasta-inspired format.  The first line is a fasta-like label
# for the target sequence.  Second line is the posterior decoding data, in a
# comma-separated list.  There is one entry per position in the target 
# sequence, whether or not the position is aligned.  Positions not aligned
# are given an out-of-range value, 99999.
#
#
# Inputs:
#     - align: name of an alignment to specify
#     - target: name of the target sequence
#
# Usage:
#    build-trimming-info -align 1arb-1svpA-samA-w0.5-t99-pcef-global.a2m.gz \
#                        -target 1svpA
#
#
    use strict;

    my ($candidateName, $detailsFileName, $template, $target, $seedType, 
		$seedAlign, $buildScript, $candidateDetail, $seedColumnIndex,
	    @targetResidueDetail, $rdbFilename);

    #
    # First off, we set some selected building parameters, ad set the paths
    # so that we'll be able to find our selected building scripts.
    #
    $buildScript = "w0.5";    
    setPaths();


    #
    # Step 1: parse the name of the alignment and the target sequence
    # name from the command line.
    #
    ($target, $candidateName) = parseCmdline(\@ARGV);

    #
    # Step 2: Build a seed alignment by removing the target sequence
    #         from the candidate alignment.
    #
    $seedAlign = makeSeedAlign($candidateName, $target);

    #
    # Step 3: build an RDB file listing what positions in the
    #         target sequence are aligned to which positions in
    #         the alignment, then read that into memory.
    $detailsFileName = baseDetailsFile($target, $candidateName);
    ($candidateDetail, $seedColumnIndex) 
	               = findOutHowTargetAligns($detailsFileName);

    #
    # Step 4: build a simple, quick array indexed by target sequence
    #         residue and listing the following:
    #         - the residue (amino acid)
    #         - its index position in candidateDetail
    #         - the alignment column it appears in, or a default
    #           value if it is not aligned.
    #
    @targetResidueDetail = describeTargetAlignment($candidateDetail);

    #
    # Step 5: Calculate the posterior decoding posteriors for each
    #         target residue included in the alignment.
    #
    posteriorDecodingValues($seedAlign, $candidateName, $target, 
			    $buildScript, \@targetResidueDetail,
			    $candidateDetail, $seedColumnIndex);


    #
    # Step 6: print out to stdout in fasta-like format
    #
    $rdbFilename = outputToStdout($candidateDetail,$target);

    system("rm $seedAlign $detailsFileName");


   #
   # parseCmdline: parse the command line for the names of the
   # alignment to read, and the target sequence to pay special attention 
   # to.  
   # Called with: the array of command line tokens
   # Return with: name of the input alignment and target sequence.
   #
   sub parseCmdline {
       my ($ii, $target, $candidateName, $args);

       $args = shift;

	   $candidateName = "";
	   $target = "";
       for ($ii = 0; $ii <= $#$args; $ii++) {
		   if ($$args[$ii] =~ /^-align/) {
	           $candidateName = $$args[$ii+1];
			   $ii++;
		   }
		   if ($$args[$ii] =~ /^-target/) {
			   $target = $$args[$ii+1];
			   $ii++;
		   }
       }
       return( ($target, $candidateName) );
    }  

  
    #
    # baseDetailsFile: build the baseline details file, listing the
    # template and target residues, the column subscore, and if the
    # column was included in the optimal subalignment.
    #
    sub baseDetailsFile {
	    my ($target, $candidateName, $detailsFile, $cmdline);


	    $target = shift;
		$candidateName = shift;

        $detailsFile = $candidateName;
		$detailsFile =~ s/.a2m.gz/.details.rdb/;
        $cmdline = "measure_shift -reference " . $candidateName
		    . " -candidate " . $candidateName 
		    . " -template " . $target . " -target " . $target
		    . " -details " . $detailsFile . " >/dev/null 2>&1";
		system($cmdline);
		return $detailsFile;
    }

    #
    # makeSeedAlign: make a seed alignment from the candidate alignment
    #                by reading the alignment, throwing out the target
    #                sequence, and printing out the rest of the alignment.
    # 
	# called with: name of the input alignment and target sequence
    # return with: name of the seed alignment (name includes the process ID
	#              to make it pseudo-unique).
    #
    sub makeSeedAlign {
	    my ($candidateName, $target, 
			$seedAlign, %candidateAlign, $cmdline);

		$candidateName = shift;
		$target = shift;

		%candidateAlign = getAlignment($candidateName);
		delete_sequence(\%candidateAlign, $target);
        $seedAlign = "seed." . $$ . ".a2m";
		printAlignment($template, $seedAlign, \%candidateAlign); 
		system("gzip $seedAlign");
		$seedAlign .= ".gz";

        return $seedAlign;
    }

   #
   # getAlignment - read the specified alignment
   #
   # This function reads the specified alignment into a hash of
   # arrays.  Each sequence uses the sequence label as the hash
   # key.  The hash key points to an array of strings which comprise
   # the sequence.
   #
   sub getAlignment{
       my ($alignFilename, %alignment, @sequence, $sequenceLength,
		   $line, @tokens, $label);

       $alignFilename = shift;

       open ALIGNFILE, "zcat $alignFilename |"
		   or die "Could not open $alignFilename for reading";
       %alignment = ();
       @sequence = ();
       $sequenceLength = 0;
       while ($line = <ALIGNFILE>) {
		   chop $line;
		   #
		   # If we find a new label, then either we're reading
		   # data on a new sequence or we're at the very start
		   # of the file.  If the last sequence is not of zero
		   # size, then add the last sequence and last label
		   # to the hash.
		   if ($line =~ />/) {
			   @tokens = split /[> ,:]/, $line;
			   if ($sequenceLength > 0) {
				   $alignment{$label} = [ @sequence ];
			   }
			   $label = $tokens[1];
			   @sequence = [];
			   $sequenceLength = 0;
		   } else {
			   $sequence[$sequenceLength] = $line;
			   $sequenceLength++;
		   }
       }
       close ALIGNFILE;
       if ($sequenceLength > 0) {
		   $alignment{$label} = [ @sequence ];
       }
       return %alignment;
    }


    #
    # delete_sequence - delete the specified sequence from the alignment
    #
    sub delete_sequence {
        my ($alignment, $sequenceName);
	
		$alignment = shift;
		$sequenceName = shift;
	
		delete $alignment->{$sequenceName};
    }


    #
    # printAlignment - save the specified alignment to a file, with the 
    #                  template sequence appearing first in the file.
    #
    sub printAlignment {

	    my ($templateSeq, $alignFilename, $alignmentData, $ii, $label);

		$templateSeq = shift;
		$alignFilename = shift;
		$alignmentData = shift;

        open ALIGNFILE, ">$alignFilename" 
	        or die("$0: Cannot create $alignFilename");

		#
		# What exactly is going on here?  We need to get the template
		# sequence at the front of the file.  Unfortunately, we can't
		# guarantee the label of the template sequence, only that it should
		# start with the template sequence name as a prefix.  So, print the
		# likely match at the front of the file; there should be one.
		#
		foreach $label (keys %{ $alignmentData}) {
	        if ($label =~ /^$templateSeq/) {
			    print ALIGNFILE ">$label\n";
				for ($ii = 0; $ii <= $#{ $alignmentData->{$label}}; $ii++) {
				    print ALIGNFILE "$alignmentData->{$label}[$ii]\n";
				}
				delete $alignmentData->{$label};
			}
        }
		foreach $label (keys %{ $alignmentData}) {
		    print ALIGNFILE ">$label\n";
            for ($ii = 0; $ii <= $#{ $alignmentData->{$label}}; $ii++) {
			    print ALIGNFILE "$alignmentData->{$label}[$ii]\n";
			}
		}
		close ALIGNFILE;
    }


    #
    # findOutHowTargetAligns - get a list of what residues align to what
    # columns in the candidate alignment, and what columns in the seed
    # alignment they correspond to.
    #
    sub findOutHowTargetAligns {
	    my ($detailsFile, @candidateDetail, %rdbLine,
			$line, $col, $targetResidue, $candidateColumn, 
			$seedColumn, @seedColumnIndex );

		$detailsFile = shift;

		open RDBFILE, "column col target < $detailsFile |" 
	               or die "$0: Cannot read $detailsFile";
        
	    #
		# Skip over the two header lines, then read the body of the file
		# inside below while loop.  Note that the rdb column should be
		# monotonically increasing from zero.  Rather than just assume
		# that, I'll keep my own index and test it, thank you very much!
		#
		<RDBFILE>;
        <RDBFILE>;
		$candidateColumn = $seedColumn = 0;
		@candidateDetail = ();
		@seedColumnIndex = ();
        while ($line = <RDBFILE>) {
		    chop $line;
			($col, $targetResidue) = split /[\t ]/, $line;
			if ($col != $candidateColumn) { 
			      die "$0: $detailsFile broken: got $col when expecting $candidateColumn";
			} else {
			    $candidateDetail[$candidateColumn]{col} = $candidateColumn;
				$candidateDetail[$candidateColumn]{target} = $targetResidue;
				$candidateDetail[$candidateColumn]{seedColumn} = $seedColumn;
				$seedColumnIndex[$seedColumn] = $candidateColumn;
				# 
				# Initialize the target sequence position to 0.  We'll
				# modify it in the next subroutine.
				#
				$candidateDetail[$candidateColumn]{targetIdx} = -1;

				#
				# Keep track of what columns in the seed alignment all
				# of this corresponds to.  We are assuming a one-to-one
				# correspondence between columns in the seed alignment
				# and nodes in the model.  Therefore, if the target
				# residue is either aligned, or the target character for
				# the column is a deletion character, it means we've
				# seen the data for one more seed alignment column, and
				# can advance the counter.
				#
				if ($targetResidue =~ /[A-Z]|-/) {
			        $seedColumn++;
				} else {
				    $candidateDetail[$candidateColumn]{seedColumn} = -1;
				}
				$candidateColumn++;
			}
		}
        close RDBFILE;
        return (\@candidateDetail, \@seedColumnIndex);
    }

    
    #
    # describeTargetAlignment - using the information in candidateDetail,
    #                 build a list indexed by target residue position
    #                 describing amino acid, index position in 
    #                 candidate detail, and seed column aligned to (or
    #                 a default value if not aligned)
    sub describeTargetAlignment {
        my ($candidateDetail, @targetDetail, $ii, $targetSeqPosition);

        $candidateDetail = shift;
		@targetDetail = ();
        $targetSeqPosition = 0;
	    for ($ii = 0; $ii <= $#{ $candidateDetail }; $ii++) {
	        if ($$candidateDetail[$ii]{target} =~ /[A-Za-z]/) {

	            #
			    # The next target sequence residue appears in 
			    # candidateDetail at index position $ii.  Mark the position,
			    # and pull some salient information out of candidateDetail.
			    # Note that the residue might not be aligned here: it might
			    # be in an indel.  But that information should appear correctly
			    # in candidateDetail{seedColumn}
			    #
			    $targetDetail[$targetSeqPosition]{candidateIndex} = $ii;
				$targetDetail[$targetSeqPosition]{seedColumn}
		            = $$candidateDetail[$ii]{seedColumn};
				$targetDetail[$targetSeqPosition]{target}
		            = $$candidateDetail[$ii]{target};
				$$candidateDetail[$ii]{targetIdx} = $targetSeqPosition;
				$targetSeqPosition++;
			}
		}      
		return(@targetDetail);
        
    }

    #
    # posteriorDecodingValues
    #
    sub posteriorDecodingValues {
        my ($seedAlign, $candidateAlign,$targetId, $buildScript, $cmd,
			$outputName, $ii, $line, @tokens, $targetPosition, 
			$posteriorOdds, $seedColumn, $candidateIndex, 
			$targetResidueDetail, $candidateDetail, 
			$node, $column, $seedColumnIndex, 
            $readingPosteriors);

		$seedAlign = shift;
		$candidateAlign = shift;
		$targetId = shift;
        $buildScript = shift;
		$targetResidueDetail = shift;
		$candidateDetail = shift;
		$seedColumnIndex = shift;
		
		$outputName =  "recordPdoc_" . $$;
		#
		# Step 1: use the buildscript to build a weighted model of the 
		#         seed alignment
		#
		$cmd = "$buildScript $seedAlign $outputName.mod > /dev/null 2>&1";
		if (system($cmd) != 0) { die "$0: cannot execute $cmd"; }

	
        #
		# Step 2: use the grabdp program to generate a .pdoc file
		#         detailing the posterior decoding of the target
        #         sequence to the model.  The contents of this
        #         file include a huge array for which (<residue>,<column>)
		#         reflects the posterior probablity of aligning 
        #         target sequence residue <residue> to seed alignment
		#         column <column>, given all paths through the model.
        #       
		$cmd = "grabdp $outputName -dpstyle 2 -i $outputName.mod"
		        . " -db $candidateAlign -id $targetId > /dev/null 2>&1";
		if (system($cmd) != 0) { die "$0: Cannot execute $cmd"; }

		#
		# Step 3: Initialize the posterior decoding column cost to
		#         a default value.  This value is an out-of-range
        #         large value.
		for ($ii = 0; $ii <= $#{ $candidateDetail }; $ii++) {
		    $$candidateDetail[$ii]{charPosteriorMatch} = 99999;
		}

        #
		# Step 4: Grab the posteriors out of the *.pdoc file.  
        #         To get to the rows of interest, we'll scan down through
        #         the file past the keyword POSTERIORS.  Following that,
        #         there are three lines per residue.
        #         1. transition posteriors for all columns
        #         2. character-based match posteriors for all columns
        #         3. character-based insert posteriors for all columns.
		#         It's only the second line that we care about.
        #
        #         The format of the line as as follws:
        #             (Char F, index 4, Match) ...
        #         indicating that we're looking at the match distribution for
        #         Character F, the fourth character in the alphabet.  The
        #         format is the same after that: starts with the posterior
        #         for node 0, then goes to node 1, etc.  So in total, the
        #         posterior for the character at the 1st node (the first
        #         alignment column) is the 7th character in the sequence.
        #
		open PDOCVALUES, "$outputName.pdoc" 
              or die "$0: Cannot read $outputName";
		$readingPosteriors = 0;
        while (!$readingPosteriors) {
            $line = <PDOCVALUES>;
            if ($line =~ /POSTERIORS/) { $readingPosteriors = 1; }
		}
		while ($line = <PDOCVALUES>) {

            #
            # We're reading one line of transition stuff.  Mostly skip on 
            # to the line of character-based stuff.
            #
			@tokens = split / /, $line;

            #
            # At the bottom of the pdoc file is a line pertaining to the end
            # node, denoted as Node -1 for probably historic reasons.  There's
            # no information we need in that line, and indexing a node with
            # a negative number would only screw up the array reading.  If
            # this is node -1, don't bother with the line.
            # (also note that the last thing in the pdoc file is transition
            # data for Node -1).
            #
            if ($tokens[1] >= 0) {
			    $targetPosition = $tokens[1];
				$seedColumn 
			         = $$targetResidueDetail[$targetPosition]{seedColumn};
                if ($seedColumn >= 0) {
					$candidateIndex 
					  = $$targetResidueDetail[$targetPosition]{candidateIndex};
				}

		
				#
				# Now read one line of character-based match stuff
				$line = <PDOCVALUES>;
				chop $line;
				@tokens = split / /, $line;
                if ($seedColumn >= 0) {
				    $posteriorOdds = $tokens[$seedColumn+6];
					$$candidateDetail[$candidateIndex]{charPosteriorMatch} 
					                                   = $posteriorOdds;
				}

				#
				# Now one line of character-based insert stuff (skip)
				#
				$line = <PDOCVALUES>;

            }
		}
		$cmd = "rm " . $outputName . "*";
		system $cmd;
    }


    #
    # outputToStdout: send to stdout a record of this in a fasta-like
    #                 format.
    #
    sub outputToStdout {
        my ($target, $ii, $candidateDetail);

        $candidateDetail = shift;
		$target = shift;

        print ">$target\n";
        for ($ii = 0; $ii <= $#$candidateDetail; $ii++) {
		    if ($$candidateDetail[$ii]{target} =~ /[A-Za-z]/) {
                print "$$candidateDetail[$ii]{charPosteriorMatch},";
			}
        }
        print "\n";
    }


    sub setPaths
    {
	    $ENV{PATH} .= ":/projects/compbio/experiments/models.97/scripts99"
	              . "/projects/compbio/experiments/models.97/scripts";
		$ENV{PATH} .= ":/projects/compbio/bin:/projects/compbio/bin/alpha";
		$ENV{PATH} .= ":/projects/compbio/bin/scripts";
    }