#!/usr/bin/perl -w # To do: ADD POD documentation! # Move table of BestAlignmentTypes to an external file. # Sun Dec 30 17:06:32 PST 2007 Kevin Karplus # removed all use of FSSP. # Sun Apr 6 11:33:56 PDT 2008 Kevin Karplus # Commented out the check for duplicate hits, so that all get reported. # Wed Apr 23 17:35:54 PDT 2008 Kevin Karplus # fixed bug in priority setting, so that t06 preferred over t04. use strict; use FileHandle; STDERR->autoflush(1); use English; use File::Basename; use Getopt::Long; use Pod::Usage; # prototype declarations: sub read_scores($\@\%\%\%); sub read_rdb_header(*\@\%); sub find_best_alignment($$$\%\%); sub is_better_alignment($$$$); # preferences for alignment types based on observations # of alignments in tests %::BestAlignmentType = ( "local-str2[+]near-backbone-11-0.8[+]0.6[+]0.8-adpstyle5" =>5 , "local-str2[+]CB_burial_14_7-1.0[+]0.4[+]0.4-adpstyle5" =>4 , "local-str2[+]CB_burial_14_7-0.4[+]0.4-adpstyle5" =>4 # old style name , "local-str2-1.0[+]0.3-adpstyle5" =>3 , "local-str2-0.1-adpstyle5" =>3 , "local-str-0.3-adpstyle5" => 2 , "fssp-local-adpstyle5" => 1 ); #main { my $target; # name of target sequence my $scores_file; # name of rdb file to find E_values in # defaults to $target.best-scores.rdb my $max_align=9999; #maximum number of alignments to output my $select_re="."; # alignment name must match this regular-expression to be included GetOptions( "target=s" => \$target , "scores_file=s" => \$scores_file , "max_align=n" => \$max_align , "select_re=s" => \$select_re , "help|?" => sub {pod2usage("verbose"=>1);} , "man" => sub {pod2usage("verbose"=>2);} ) or pod2usage("verbose"=>0); if (! defined($target)) { print STDERR "Error: $PROGRAM_NAME requires -target argument\n"; pod2usage("verbose"=>0); } $scores_file = $target. ".best-scores.rdb" if (!defined($scores_file)); if (! -e $scores_file) { die "Error: $PROGRAM_NAME can't fnd scores file $scores_file\n"; } my @templates = (); # name of templates from scores file in order by e-value # hashes indexed by template name my %SCOP = (); # SCOP ids from scores file # (fake if begins with "fake") my %SUID = (); # SCOP su-ids from scores file # (fake if begins with "fake") my %EVALUES = (); # evalues for templates read_scores($scores_file,@templates,%SCOP,%SUID,%EVALUES); # hashes indexed by template name my %best_align = (); # best alignment for template my %cost_for_best_align = (); # alignment e-value for best alignment # find the alignment to use for each template: foreach my $id (@templates) { find_best_alignment($target, $id, $select_re, %best_align, %cost_for_best_align); } my $num_printed = 0; # How many alignments were printed? print "Alignment Evalue Align_Evalue SCOP SUID\n"; print "50S 10N 10N 50S 10S\n"; # Now reduce redundancy by removing extra hits to same family # Check each template to see if there is a different template with # SCOP family that has a better alignment. CHECK: foreach my $template (@templates) { next CHECK if (!defined($best_align{$template})); my $s = $SCOP{$template}; my $s_fake =($s =~ /^fake/); # Sun Apr 6 11:33:56 PDT 2008 Kevin Karplus # Commented out the check for duplicate hits, so that all get reported. # # check for duplicate hit to same SCOP domain # OTHER: foreach my $other (@templates) # { next OTHER if ($template eq $other); # don't supress self # # # don't suppress if SCOP is mismatch or both are blank # my $so = $SCOP{$other}; # next OTHER if ($s ne $so || $s_fake); # # if (is_better_alignment($best_align{$other}, $EVALUES{$other}, # $best_align{$template}, $EVALUES{$template})) # { # print STDERR "DEBUG: $template suppressed in favor of $other:\n" # # . "DEBUG: $best_align{$other} => $EVALUES{$other}\n" # # . "DEBUG: $best_align{$template} => $EVALUES{$template}\n"; # next CHECK; # } # } if ($num_printed < $max_align) { # $template not suppressed, so print it print "$best_align{$template}\t$EVALUES{$template}\t$cost_for_best_align{$template}\t" . ($s !~ /^fake/? $s:"") . "\t" . (defined($SUID{$template})? $SUID{$template}: "") . "\n"; $num_printed++; } } } # read in a scores file in RDB format, saving the names of the # templates in order in @$templates_ref # Store the SCOP domains in %$scop_ref as a comma-separated list # Store the SCOP suids in %$suid_ref as a comma-separated list # Store the E-values in %$evalue_ref sub read_scores($\@\%\%\%) { my ($scores_file, $templates_ref, $scop_ref, $suid_ref, $evalue_ref) = @_; open(SCORES_FILE, $scores_file) || die "Error: $PROGRAM_NAME can't open $scores_file for reading.\n"; my @col_names; my %col_num; read_rdb_header(*SCORES_FILE{IO}, @col_names, %col_num); my $idcolnum = $col_num{"Sequence_ID"}; # $lencolnum = $col_num{"Length"}; # $BEcolnum = $col_num{"Best_Evalue"}; #unused in html output my $Ecolnum = $col_num{"Evalue"}; my $SCOPcolnum = $col_num{"SCOP_domain"}; my $SUIDcolnum = $col_num{"SCOP_suid"}; my $id ; my $num_templates = 0; while(my $line = ) { chomp $line; my @cols = split('\t', $line); if ($cols[$idcolnum] =~ /\w+/) { $id = $cols[$idcolnum]; $$templates_ref[$num_templates] = $id; $num_templates++; $$scop_ref{$id} = (defined($cols[$SCOPcolnum]) && ($cols[$SCOPcolnum] ne ""))? $cols[$SCOPcolnum] : ("fake" . $num_templates); $$evalue_ref{$id} = $cols[$Ecolnum]; $$suid_ref{$id} = defined($cols[$SUIDcolnum])? $cols[$SUIDcolnum]: " "; } else { my $SCOPc = defined($cols[$SCOPcolnum])? $cols[$SCOPcolnum] : ""; $$scop_ref{$id} .= ",".$SCOPc; $$suid_ref{$id} .= "," . (defined($cols[$SUIDcolnum])? $cols[$SUIDcolnum] : " "); } # possible bug: the SCOP ids for multiple domains may be in DIFFERENT # orders in different templates, even if they have the same structure. # Check 1aipE vs 1efcA? } close(SCORES_FILE); print STDERR "$PROGRAM_NAME read $num_templates templates\n"; } # read in the header of an RDB file and store the column names in @$col_names_ref # with a reversed index in %$col_num_ref. sub read_rdb_header(*\@\%) { my ($FILE,$col_names_ref,$col_num_ref) = @_; # skip the comments at the beginning while (<$FILE> ) { last if /.+/ && !(/^\s*#/); } # read the column names chomp; @$col_names_ref = split(/\t/); # index the column names for(my $i = 0; $i; } # find the best alignment for $template and save its name in $$best_id_ref{$template} # and its e-value in $$cost_best_ref{$template} (of the sequence with poorer score) sub find_best_alignment($$$\%\%) { my ($target, $template, $select_re, $best_id_ref, $cost_best_ref) = @_; #find directory where candidate's alignments are stored if (! -d $template) { print STDERR "$PROGRAM_NAME Warning: no directory for $template\n"; return; } open(THEFILES, "ls $template | "); while(my $filename = ) { chomp $filename; next if ($filename !~ /[.]dist([.]gz)?$/); next if ($filename !~ /$select_re/); my $full_filename= "$template/$filename"; my $align_name = $full_filename; $align_name =~ s/[.]dist([.]gz)?/.a2m/; if (! -e $align_name) { my $align_name_gz = $align_name . ".gz"; if (! -e $align_name_gz) { print STDERR "$PROGRAM_NAME: Warning: neither $align_name nor $align_name_gz exist\n"; next; } $align_name = $align_name_gz; } if ($filename =~ /[.]dist[.]gz$/) { open(DISTFILE, "gunzip -c $full_filename|") || die "Error: $PROGRAM_NAME can't open $full_filename for reading\n"; } else { open(DISTFILE, "<$full_filename") || die "Error: $PROGRAM_NAME can't open $full_filename for reading\n"; } my $template_cost=9999999999999999; my $target_cost=9999999999999999; while(my $l = ) { next if $l =~ /^[%#]/; my @cols = split(" ", $l); if ($cols[0] eq $template) { $template_cost= $cols[4]; } if ($cols[0] eq $target) { $target_cost= $cols[4]; } } close(DISTFILE); # Fri Apr 18 14:06:48 PDT 2008 Kevin Karplus # Changed warning to only complain when both sequences are # missing from dist file, since new alignment method for # target models only runs template sequence, not both. if (!defined($target_cost) && !defined($template_cost)) { print STDERR "$PROGRAM_NAME Warning: no $target in $full_filename\n"; print STDERR "$PROGRAM_NAME Warning: no $template in $full_filename\n"; next; } # the cost is the more expensive of the two sequences my $cost = ($target_cost<$template_cost)? $template_cost: $target_cost; if (is_better_alignment($align_name, $cost, $$best_id_ref{$template}, $$cost_best_ref{$template})) { $$best_id_ref{$template} = $align_name; $$cost_best_ref{$template} = $cost; } } close(THEFILES); # print STDERR "DEBUG: Best alignment for $template is $$best_id_ref{$template}\n"; } # compare alignment with name1 and cost1 against # alignment with name2 and cost2 # return 1 if name1 to be preferred, 0 if name2 to be preferred sub is_better_alignment($$$$) { my ($name1, $cost1, $name2, $cost2) = @_; return 1 if (!defined($name2) || !defined($cost2)); # print STDERR "DEBUG: name1=$name1 name2=$name2\n"; # get priorities for alignments my $priority1=0; my $priority2=0; foreach my $al_type (keys %::BestAlignmentType) { my $match1 = ($name1 =~ /$al_type/); $priority1 = $::BestAlignmentType{$al_type} if $match1; # print STDERR "DEBUG: $name1 ". ($match1? "matches ": "doesn't match ") # .$al_type."\n"; my $match2 = ($name2 =~ /$al_type/); # print STDERR "DEBUG: $name2 ". ($match2? "matches ": "doesn't match ") # .$al_type."\n"; $priority2 = $::BestAlignmentType{$al_type} if $match2; } $priority1 += 1.5 if ($name1 =~ /-t04-/); $priority2 += 1.5 if ($name2 =~ /-t04-/); $priority1 += 2.5 if ($name1 =~ /-t06-/); $priority2 += 2.5 if ($name2 =~ /-t06-/); print STDERR "DEBUG: $name1 has priority $priority1 $name2 has priority $priority2\n"; return 1 if ($priority1 > $priority2); return 0 if ($priority1 < $priority2); # if they have the same priority, the lower cost is better return $cost1 < $cost2; }