#!/usr/local/bin/perl -w #Usage:gather_best_align_scores numtopredict working_dir target topalignrdb topscorerdb { use FileHandle; STDERR->autoflush(1); use English; use File::Basename; use lib dirname($PROGRAM_NAME); use READ_RDB; if ($#ARGV < 3) { &print_usage_exit; } ($n, $WORKDIR, $TARGET, $topalignrdb, $topscorerdb) = @ARGV; #results will be stored in this hash where key is #file name and value is the "E-value of the alignment", i.e. #the weighted sum of (log of) E-value of the sequence that was not #used to build the HMM and (log of) E-value sequence got when we #averaged over best socres in best-scores.rdb %alignmentscores = (); #hash of scores from best_scores.rdb #these are the averaged E-values from all models used %avgscores = (); #hash of candidates (keys) and their FSSP reps (values) %fssphash = (); #we'll be computing a weighted geometric average of the #pairwise alignment and best-scores averaged E-value with #the following weights $weight{"pairwise-alignment"} = 1.0; $weight{"best-scores-average"} = 5.0; $total_weight = $weight{"pairwise-alignment"} + $weight{"best-scores-average"}; $FSSP_seed_bonus = 2.0; open(TOPALIGNS, "$topalignrdb") || die "Can't open $topalignrdb for reading\n"; read_rdb_header(*TOPALIGNS{IO}); $idcolnum = $col_num{"Template"}; $FSSPcolnum = $col_num{"FSSP_rep"}; while($line = ) { chomp $line; @cols = split("\t", $line); $fssphash{$cols[$idcolnum]} = $cols[$FSSPcolnum]; } close(TOPALIGNS); #hash of candidates (keys) and their SCOP domains (values) %scophash = (); open(TOPSCORES, "$topscorerdb") || die "Can't open $topscorerdb for reading\n"; read_rdb_header(*TOPSCORES{IO}); $idcolnum = $col_num{"Sequence_ID"}; # $lencolnum = $col_num{"Length"}; # $BEcolnum = $col_num{"Best_Evalue"}; #unused in html output $Ecolnum = $col_num{"Evalue"}; # $FSSPcolnum = $col_num{"FSSP-rep"}; $SCOPcolnum = $col_num{"SCOP_domain"}; # $suidcolnum = $col_num{"SCOP_suid"}; while($line = ) { next if ($line =~ /^\s+/); chomp $line; @cols = split("\t", $line); $scophash{$cols[$idcolnum]} = $cols[$SCOPcolnum]; $avgscores{$cols[$idcolnum]} = $cols[$Ecolnum]; } close(TOPSCORES); #we also need the name of the target open(SEEDA2M, "$WORKDIR/$TARGET.a2m") || die "Can't open $WORKDIR/$TARGET.a2m file for reading\n"; while($line = ) { if ($line =~ /^>/) { $line =~ />(\S+)/; $targetid = $1; last; } } close(SEEDA2M); foreach $candidate (keys %fssphash) { #enter directory where candidate's alignments are stored chdir "$WORKDIR/$candidate" || die "Can't cd to $WORKDIR/$candidate\n"; open(THEFILES, "ls | "); while($filename = ) { chomp $filename; undef $chainID; #flag that identifies whether it's a TP or PT alignment $isTP = -1; if ($filename =~ /^(\w+)-(\w+)-.*.dist$/) { $pre1 = $1; $pre2 = $2; #if this is a "TP" or target-template alignment, target is the model #and we'll get the E-value of the template #vice versa for "PT" alignments if ($filename =~ /^$candidate/) { # filename will be of this form: 1gpt-seed.foo.dist # if candidate's name is first, it's a "PT" alignment $isTP = 0; $chainID = $pre1; } if ($filename =~ /^$TARGET/) { # filename will be of this form: seed-1gpt.foo.dist # if target's name is first, it's a "TP" alignment $isTP = 1; $chainID = $pre2; } if (!(defined $chainID)) { die "Error: Couldn't get chainID name from file:$filename\n"; } open(DISTFILE, "$filename") || die "Can't open $filename for reading\n"; #For a TP alignment we want the E-value of the template #For a PT alignment we want the E-value of the target while($l = ) { next if $l =~ /^\%/; @cols = split(" ", $l); if ( (($cols[0] eq "$candidate") && ($isTP == 1)) || (($cols[0] eq "$targetid") && ($isTP == 0)) ) { $alignment_logE = $cols[4]<=0? -999: log($cols[4]); if(defined $avgscores{$chainID}) { $raw_best = $avgscores{$chainID}; } else { die "Error: No average E-value for $chainID in average scores hash\n"; } $bestscores_logE = $raw_best<=0? -999: log($raw_best); #put STR two-track alignments in their own hash if($filename =~ /\S-STR-.*/){ $STR_alignmentscores{$filename} = (($bestscores_logE * $weight{"best-scores-average"}) + ($alignment_logE * $weight{"pairwise-alignment"})) /$total_weight; } else { $alignmentscores{$filename} = (($bestscores_logE * $weight{"best-scores-average"}) + ($alignment_logE * $weight{"pairwise-alignment"})) /$total_weight; } if($filename =~ /\w+-\w+-fssp-*./) { $alignmentscores{$filename} -= $FSSP_seed_bonus; } } } close(DISTFILE); } } } #Simple scheme: we just output the candidate alignments #sorted by Evalue #now sort the hash by value and print out to an RDB file #also make sure we get STR two-track alignments since these #are the ones we plan to report in the "top alignments" $count = 0; print "Alignment\tEvalue\tFSSP-rep\tSCOP-domain\n"; print "70S\t10N\t5S\t5S\n"; @STRkeys = sort {$STR_alignmentscores{$a} <=> $STR_alignmentscores{$b} } keys %STR_alignmentscores; @otherkeys = sort {$alignmentscores{$a} <=> $alignmentscores{$b} } keys %alignmentscores; foreach $dist (@STRkeys) { $a2mfile = $dist; $a2mfile =~ s/.dist/.a2m.gz/; #only have target-template alignments from STR two-track $dist =~ /^\w+-(\w+)-.*/; $templateid = $1; $prettyScore = sprintf "%5.4e", exp($STR_alignmentscores{$dist}); print "$a2mfile\t$prettyScore\t"; #get fssp-rep $fssprep = $fssphash{$templateid}; print $fssprep if (defined $fssprep); print "\t"; #get scop domain NOTE: NOT GETTING MULTIPLE DOMAINS HERE $scopdom = $scophash{$templateid}; print $scopdom if (defined $scopdom); print "\n"; $count++; last if $count == $n; } foreach $dist (@otherkeys) { $a2mfile = $dist; $a2mfile =~ s/.dist/.a2m.gz/; if ($dist =~ /^$TARGET/) { #case of target-template alignment $dist =~ /^\w+-(\w+)-.*/; $templateid = $1; } else { #case of template-target alignment #get the template id $dist =~ /^(\w+)-.*/; $templateid = $1; } $prettyScore = sprintf "%5.4e", exp($alignmentscores{$dist}); print "$a2mfile\t$prettyScore\t"; #get fssp-rep $fssprep = $fssphash{$templateid}; print $fssprep if (defined $fssprep); print "\t"; #get scop domain NOTE: NOT GETTING MULTIPLE DOMAINS HERE $scopdom = $scophash{$templateid}; print $scopdom if (defined $scopdom); print "\n"; $count++; last if $count == $n; } } sub print_usage_exit { print "Usage:gather_best_align_scores numtopredict working_dir target topalignrdb topscorerdb \n"; exit(-1); }