#!/usr/local/bin/perl5 # # build-trimming-info: given an alignment, and given a target sequence # in that alignment, this script gets the posterior decoding column cost # of that sequence in the alignment, as can be used for alignment trimming. # # Output is in a fasta-inspired format. The first line is a fasta-like label # for the target sequence. Second line is the posterior decoding data, in a # comma-separated list. There is one entry per position in the target # sequence, whether or not the position is aligned. Positions not aligned # are given an out-of-range value, 99999. # # # Inputs: # - align: name of an alignment to specify # - target: name of the target sequence # # Usage: # build-trimming-info -align 1arb-1svpA-samA-w0.5-t99-pcef-global.a2m.gz \ # -target 1svpA # # use strict; my ($candidateName, $detailsFileName, $template, $target, $seedType, $seedAlign, $buildScript, $candidateDetail, $seedColumnIndex, @targetResidueDetail, $rdbFilename); # # First off, we set some selected building parameters, ad set the paths # so that we'll be able to find our selected building scripts. # $buildScript = "w0.5"; setPaths(); # # Step 1: parse the name of the alignment and the target sequence # name from the command line. # ($target, $candidateName) = parseCmdline(\@ARGV); # # Step 2: Build a seed alignment by removing the target sequence # from the candidate alignment. # $seedAlign = makeSeedAlign($candidateName, $target); # # Step 3: build an RDB file listing what positions in the # target sequence are aligned to which positions in # the alignment, then read that into memory. $detailsFileName = baseDetailsFile($target, $candidateName); ($candidateDetail, $seedColumnIndex) = findOutHowTargetAligns($detailsFileName); # # Step 4: build a simple, quick array indexed by target sequence # residue and listing the following: # - the residue (amino acid) # - its index position in candidateDetail # - the alignment column it appears in, or a default # value if it is not aligned. # @targetResidueDetail = describeTargetAlignment($candidateDetail); # # Step 5: Calculate the posterior decoding posteriors for each # target residue included in the alignment. # posteriorDecodingValues($seedAlign, $candidateName, $target, $buildScript, \@targetResidueDetail, $candidateDetail, $seedColumnIndex); # # Step 6: print out to stdout in fasta-like format # $rdbFilename = outputToStdout($candidateDetail,$target); system("rm $seedAlign $detailsFileName"); # # parseCmdline: parse the command line for the names of the # alignment to read, and the target sequence to pay special attention # to. # Called with: the array of command line tokens # Return with: name of the input alignment and target sequence. # sub parseCmdline { my ($ii, $target, $candidateName, $args); $args = shift; $candidateName = ""; $target = ""; for ($ii = 0; $ii <= $#$args; $ii++) { if ($$args[$ii] =~ /^-align/) { $candidateName = $$args[$ii+1]; $ii++; } if ($$args[$ii] =~ /^-target/) { $target = $$args[$ii+1]; $ii++; } } return( ($target, $candidateName) ); } # # baseDetailsFile: build the baseline details file, listing the # template and target residues, the column subscore, and if the # column was included in the optimal subalignment. # sub baseDetailsFile { my ($target, $candidateName, $detailsFile, $cmdline); $target = shift; $candidateName = shift; $detailsFile = $candidateName; $detailsFile =~ s/.a2m.gz/.details.rdb/; $cmdline = "measure_shift -reference " . $candidateName . " -candidate " . $candidateName . " -template " . $target . " -target " . $target . " -details " . $detailsFile . " >/dev/null 2>&1"; system($cmdline); return $detailsFile; } # # makeSeedAlign: make a seed alignment from the candidate alignment # by reading the alignment, throwing out the target # sequence, and printing out the rest of the alignment. # # called with: name of the input alignment and target sequence # return with: name of the seed alignment (name includes the process ID # to make it pseudo-unique). # sub makeSeedAlign { my ($candidateName, $target, $seedAlign, %candidateAlign, $cmdline); $candidateName = shift; $target = shift; %candidateAlign = getAlignment($candidateName); delete_sequence(\%candidateAlign, $target); $seedAlign = "seed." . $$ . ".a2m"; printAlignment($template, $seedAlign, \%candidateAlign); system("gzip $seedAlign"); $seedAlign .= ".gz"; return $seedAlign; } # # getAlignment - read the specified alignment # # This function reads the specified alignment into a hash of # arrays. Each sequence uses the sequence label as the hash # key. The hash key points to an array of strings which comprise # the sequence. # sub getAlignment{ my ($alignFilename, %alignment, @sequence, $sequenceLength, $line, @tokens, $label); $alignFilename = shift; open ALIGNFILE, "zcat $alignFilename |" or die "Could not open $alignFilename for reading"; %alignment = (); @sequence = (); $sequenceLength = 0; while ($line = ) { chop $line; # # If we find a new label, then either we're reading # data on a new sequence or we're at the very start # of the file. If the last sequence is not of zero # size, then add the last sequence and last label # to the hash. if ($line =~ />/) { @tokens = split /[> ,:]/, $line; if ($sequenceLength > 0) { $alignment{$label} = [ @sequence ]; } $label = $tokens[1]; @sequence = []; $sequenceLength = 0; } else { $sequence[$sequenceLength] = $line; $sequenceLength++; } } close ALIGNFILE; if ($sequenceLength > 0) { $alignment{$label} = [ @sequence ]; } return %alignment; } # # delete_sequence - delete the specified sequence from the alignment # sub delete_sequence { my ($alignment, $sequenceName); $alignment = shift; $sequenceName = shift; delete $alignment->{$sequenceName}; } # # printAlignment - save the specified alignment to a file, with the # template sequence appearing first in the file. # sub printAlignment { my ($templateSeq, $alignFilename, $alignmentData, $ii, $label); $templateSeq = shift; $alignFilename = shift; $alignmentData = shift; open ALIGNFILE, ">$alignFilename" or die("$0: Cannot create $alignFilename"); # # What exactly is going on here? We need to get the template # sequence at the front of the file. Unfortunately, we can't # guarantee the label of the template sequence, only that it should # start with the template sequence name as a prefix. So, print the # likely match at the front of the file; there should be one. # foreach $label (keys %{ $alignmentData}) { if ($label =~ /^$templateSeq/) { print ALIGNFILE ">$label\n"; for ($ii = 0; $ii <= $#{ $alignmentData->{$label}}; $ii++) { print ALIGNFILE "$alignmentData->{$label}[$ii]\n"; } delete $alignmentData->{$label}; } } foreach $label (keys %{ $alignmentData}) { print ALIGNFILE ">$label\n"; for ($ii = 0; $ii <= $#{ $alignmentData->{$label}}; $ii++) { print ALIGNFILE "$alignmentData->{$label}[$ii]\n"; } } close ALIGNFILE; } # # findOutHowTargetAligns - get a list of what residues align to what # columns in the candidate alignment, and what columns in the seed # alignment they correspond to. # sub findOutHowTargetAligns { my ($detailsFile, @candidateDetail, %rdbLine, $line, $col, $targetResidue, $candidateColumn, $seedColumn, @seedColumnIndex ); $detailsFile = shift; open RDBFILE, "column col target < $detailsFile |" or die "$0: Cannot read $detailsFile"; # # Skip over the two header lines, then read the body of the file # inside below while loop. Note that the rdb column should be # monotonically increasing from zero. Rather than just assume # that, I'll keep my own index and test it, thank you very much! # ; ; $candidateColumn = $seedColumn = 0; @candidateDetail = (); @seedColumnIndex = (); while ($line = ) { chop $line; ($col, $targetResidue) = split /[\t ]/, $line; if ($col != $candidateColumn) { die "$0: $detailsFile broken: got $col when expecting $candidateColumn"; } else { $candidateDetail[$candidateColumn]{col} = $candidateColumn; $candidateDetail[$candidateColumn]{target} = $targetResidue; $candidateDetail[$candidateColumn]{seedColumn} = $seedColumn; $seedColumnIndex[$seedColumn] = $candidateColumn; # # Initialize the target sequence position to 0. We'll # modify it in the next subroutine. # $candidateDetail[$candidateColumn]{targetIdx} = -1; # # Keep track of what columns in the seed alignment all # of this corresponds to. We are assuming a one-to-one # correspondence between columns in the seed alignment # and nodes in the model. Therefore, if the target # residue is either aligned, or the target character for # the column is a deletion character, it means we've # seen the data for one more seed alignment column, and # can advance the counter. # if ($targetResidue =~ /[A-Z]|-/) { $seedColumn++; } else { $candidateDetail[$candidateColumn]{seedColumn} = -1; } $candidateColumn++; } } close RDBFILE; return (\@candidateDetail, \@seedColumnIndex); } # # describeTargetAlignment - using the information in candidateDetail, # build a list indexed by target residue position # describing amino acid, index position in # candidate detail, and seed column aligned to (or # a default value if not aligned) sub describeTargetAlignment { my ($candidateDetail, @targetDetail, $ii, $targetSeqPosition); $candidateDetail = shift; @targetDetail = (); $targetSeqPosition = 0; for ($ii = 0; $ii <= $#{ $candidateDetail }; $ii++) { if ($$candidateDetail[$ii]{target} =~ /[A-Za-z]/) { # # The next target sequence residue appears in # candidateDetail at index position $ii. Mark the position, # and pull some salient information out of candidateDetail. # Note that the residue might not be aligned here: it might # be in an indel. But that information should appear correctly # in candidateDetail{seedColumn} # $targetDetail[$targetSeqPosition]{candidateIndex} = $ii; $targetDetail[$targetSeqPosition]{seedColumn} = $$candidateDetail[$ii]{seedColumn}; $targetDetail[$targetSeqPosition]{target} = $$candidateDetail[$ii]{target}; $$candidateDetail[$ii]{targetIdx} = $targetSeqPosition; $targetSeqPosition++; } } return(@targetDetail); } # # posteriorDecodingValues # sub posteriorDecodingValues { my ($seedAlign, $candidateAlign,$targetId, $buildScript, $cmd, $outputName, $ii, $line, @tokens, $targetPosition, $posteriorOdds, $seedColumn, $candidateIndex, $targetResidueDetail, $candidateDetail, $node, $column, $seedColumnIndex, $readingPosteriors); $seedAlign = shift; $candidateAlign = shift; $targetId = shift; $buildScript = shift; $targetResidueDetail = shift; $candidateDetail = shift; $seedColumnIndex = shift; $outputName = "recordPdoc_" . $$; # # Step 1: use the buildscript to build a weighted model of the # seed alignment # $cmd = "$buildScript $seedAlign $outputName.mod > /dev/null 2>&1"; if (system($cmd) != 0) { die "$0: cannot execute $cmd"; } # # Step 2: use the grabdp program to generate a .pdoc file # detailing the posterior decoding of the target # sequence to the model. The contents of this # file include a huge array for which (,) # reflects the posterior probablity of aligning # target sequence residue to seed alignment # column , given all paths through the model. # $cmd = "grabdp $outputName -dpstyle 2 -i $outputName.mod" . " -db $candidateAlign -id $targetId > /dev/null 2>&1"; if (system($cmd) != 0) { die "$0: Cannot execute $cmd"; } # # Step 3: Initialize the posterior decoding column cost to # a default value. This value is an out-of-range # large value. for ($ii = 0; $ii <= $#{ $candidateDetail }; $ii++) { $$candidateDetail[$ii]{charPosteriorMatch} = 99999; } # # Step 4: Grab the posteriors out of the *.pdoc file. # To get to the rows of interest, we'll scan down through # the file past the keyword POSTERIORS. Following that, # there are three lines per residue. # 1. transition posteriors for all columns # 2. character-based match posteriors for all columns # 3. character-based insert posteriors for all columns. # It's only the second line that we care about. # # The format of the line as as follws: # (Char F, index 4, Match) ... # indicating that we're looking at the match distribution for # Character F, the fourth character in the alphabet. The # format is the same after that: starts with the posterior # for node 0, then goes to node 1, etc. So in total, the # posterior for the character at the 1st node (the first # alignment column) is the 7th character in the sequence. # open PDOCVALUES, "$outputName.pdoc" or die "$0: Cannot read $outputName"; $readingPosteriors = 0; while (!$readingPosteriors) { $line = ; if ($line =~ /POSTERIORS/) { $readingPosteriors = 1; } } while ($line = ) { # # We're reading one line of transition stuff. Mostly skip on # to the line of character-based stuff. # @tokens = split / /, $line; # # At the bottom of the pdoc file is a line pertaining to the end # node, denoted as Node -1 for probably historic reasons. There's # no information we need in that line, and indexing a node with # a negative number would only screw up the array reading. If # this is node -1, don't bother with the line. # (also note that the last thing in the pdoc file is transition # data for Node -1). # if ($tokens[1] >= 0) { $targetPosition = $tokens[1]; $seedColumn = $$targetResidueDetail[$targetPosition]{seedColumn}; if ($seedColumn >= 0) { $candidateIndex = $$targetResidueDetail[$targetPosition]{candidateIndex}; } # # Now read one line of character-based match stuff $line = ; chop $line; @tokens = split / /, $line; if ($seedColumn >= 0) { $posteriorOdds = $tokens[$seedColumn+6]; $$candidateDetail[$candidateIndex]{charPosteriorMatch} = $posteriorOdds; } # # Now one line of character-based insert stuff (skip) # $line = ; } } $cmd = "rm " . $outputName . "*"; system $cmd; } # # outputToStdout: send to stdout a record of this in a fasta-like # format. # sub outputToStdout { my ($target, $ii, $candidateDetail); $candidateDetail = shift; $target = shift; print ">$target\n"; for ($ii = 0; $ii <= $#$candidateDetail; $ii++) { if ($$candidateDetail[$ii]{target} =~ /[A-Za-z]/) { print "$$candidateDetail[$ii]{charPosteriorMatch},"; } } print "\n"; } sub setPaths { $ENV{PATH} .= ":/projects/compbio/experiments/models.97/scripts99" . "/projects/compbio/experiments/models.97/scripts"; $ENV{PATH} .= ":/projects/compbio/bin:/projects/compbio/bin/alpha"; $ENV{PATH} .= ":/projects/compbio/bin/scripts"; }