#!/usr/local/bin/perl -w # Usage: # best_scores seed.t2k-100-30-dssp-ebghstl-scores.rdb . . . > seed.combined-scores.rdb # # optional arguments: # -num 30 minimum number of hits to output # -E 0.10 report any hit with this Evalue or better in ANY of the input files # Read in all the score files. # Output a "best hits" list (sorted by Evlaue) in RDB format that includes # Sequence_ID Length Best_Evalue Evalue FSSP-rep SCOP_domain SCOP_suid # The evalue is calculated as a weighted geometric average of the values # in the different RDB values. # There is a default E value for missing values, and special treatment for # hits that are not part of the template library. # prototypes sub print_usage_exit (); sub process_command_line(); sub print_best($$); #main { use FileHandle; STDERR->autoflush(1); use English; use File::Basename; use lib dirname($PROGRAM_NAME); use IdChecker; use READ_RDB; &process_command_line(); $weight{"t2k-100-30-dssp-ebghstl"} = 2.0; $weight{"t2k-100-30-stride-ebghtl"} = 2.0; $weight{"t2k-100-30-str"} = 3.0; $weight{"t2k-100-30-alpha"} = 2.0; $weight{"t2k-100-30-dssp_ehl2"} = 2.0; $weight{"t2k-w0.5"} = 2.0; $weight{"template-lib"} = 5.0; # if a chain is not one of our template id's, we report # the one-track target E-value, averaged with the default Evalue. # for the default Evalue for missing scores, take the geometric mean of # the best unreported score (assuming -Emax 40.) and the library size $LIBSIZE = 6014; $LOG_DEFAULT_EVALUE = (log(40.)+log($LIBSIZE))/2; # build a hash of all ID's in our template library # that we can use to check membership fo ID's in the files $T2K_ID_FILE = "/projects/compbio/experiments/models.97/indexes/t2k.ids"; %t2k_ids = IdChecker::ReadIDs($T2K_ID_FILE); # each chainID will be the key in a variety of hashes # that we'll use to keep track of all the information # our hashes will be: %len = (); %BestLogEvalue = (); %logEvalue = (); %total_weight = (); %FSSP_rep = (); %SCOP_dom_info = (); # we will have multiple entries under each key # for many of these. For instance, each key that # is in our template library will have multiple (weighted) # Evalues, keys may have several SCOP_domains, etc. # Weights for the E-values are set by which file we get the id from. # $desired_total_weight will be the sum of the weights for all files. $desired_total_weight = 0; foreach $scorefile (@rdb_files) { # print STDERR "reading $scorefile\n"; # we want to know which file we're in to assign E-value weights $scorefile =~ /[^.]+[.](\S+)-scores.rdb/; $score_type = $1; if (!defined ($weight{$score_type})) { die "Invalid input file: $scorefile, unrecognized type $score_type\n"; } $score_wt = $weight{$score_type}; $desired_total_weight += $score_wt; open (SCOREFILE, $scorefile); read_rdb_header(*SCOREFILE{IO}); $idcolnum = $col_num{"Sequence_ID"}; $idcolnum = $col_num{"Template_ID"} if !defined($idcolnum); $lencolnum = $col_num{"Length"}; $lencolnum = $col_num{"Template_Length"} if !defined($lencolnum); $Ecolnum = $col_num{"Target_Evalue"}; $Ecolnum = $col_num{"Template_Evalue"} if !defined($Ecolnum); $FSSPcolnum = $col_num{"FSSP-rep"}; $SCOPcolnum = $col_num{"SCOP_domain"}; $suidcolnum = $col_num{"SCOP_suid"}; while($line=) { chomp($line); @cols = split( /\t/, $line ) ; $id = $cols[$idcolnum]; $length = $cols[$lencolnum]; $Evalue = $cols[$Ecolnum]; $FSSP = $cols[$FSSPcolnum]; $SCOP = $cols[$SCOPcolnum]; $SCOP_suid = $cols[$suidcolnum]; # line which gives only additional scop domains if (!($id =~ /\w+/)) { # no chainID found in the first column (use id from previous line) if ( ($SCOP =~ /^\w+/) && ($SCOP_suid =~ /^\w+/) ) { # if SCOP defined $SCOP_dom_info{$lastID}{"$SCOP:$SCOP_suid"} = 1; } next; } $FSSP_rep{$id} = $FSSP; push @{$len{$id}}, $length; # add the log E-value, multiplied by the scaling factor $logE = $Evalue<=0? -999: log($Evalue); if (!defined($logEvalue{$id})) { $logEvalue{$id} = $logE*$score_wt; $total_weight{$id} = $score_wt; } else { $logEvalue{$id} += $logE*$score_wt; $total_weight{$id} += $score_wt; } if (!defined($BestLogEvalue{$id}) || $logE < $BestLogEvalue{$id}) { $BestLogEvalue{$id} = $logE; } # load up SCOP domain info if it's there if ($#cols>=5 && ($SCOP =~ /^\w+/) && ($SCOP_suid =~ /^\w+/) ) { $SCOP_dom_info{$id}{"$SCOP:$SCOP_suid"} = 1; } $lastID = $id; } close(SCOREFILE); } # compute the Evals with our current rules for missing weights. foreach $id (keys %len) { if(defined ($t2k_ids{$id})) { # id is in the template library, compute geometric average # of weighted scores. Missing scores get default Evalue $MissingWeight = $desired_total_weight - $total_weight{$id}; $logscore = $logEvalue{$id} +$MissingWeight*$LOG_DEFAULT_EVALUE; $logscore /= $desired_total_weight; } else { $logscore = ($logEvalue{$id} + $LOG_DEFAULT_EVALUE) / ($total_weight{$id}+1.); } $logEvalue{$id} = $logscore; } print_best($num_best, $E_thresh); } sub print_usage_exit () { print "Usage: best_scores [-num 30] [-E 0.10] seed.t2k-100-30-dssp-ebghstl-scores.rdb \\\n"; print " seed.t2k-100-30-stride-ebghtl-scores.rdb seed-template-lib-scores.rdb . . . > tmp.best.rdb\n"; exit(-1); } # print out the top N sequences, sorted by Evalue # also any with BestEvalue < threshold sub print_best($$) { my ($num_best, $Ethresh) = @_; # sort ids by increasing $logEvalue @sorted_ids = sort {$logEvalue{$a} <=> $logEvalue{$b}} keys(%logEvalue); # print headers print "Sequence_ID\tLength\tBest_Evalue\tEvalue\tFSSP-rep\tSCOP_domain\tSCOP_suid\n"; print "5S\t5N\t12N\t12N\t8S\t12S\t12N\n"; my $logthresh = log($Ethresh); my $lines_printed = 0; foreach $id (@sorted_ids) { next if $lines_printed>= $num_best && $BestLogEvalue{$id} > $logthresh; $lines_printed ++; $bestScore = sprintf "%5.4e", exp($BestLogEvalue{$id}); $prettyScore = sprintf "%5.4e", exp($logEvalue{$id}); $FSSP_id = defined($FSSP_rep{$id})? $FSSP_rep{$id} : ""; if (defined $SCOP_dom_info{$id}) { my $first_line=1; foreach $scopDom (keys %{$SCOP_dom_info{$id}}) { # parse the domain name and the sunid @scopInfo = split(":", $scopDom); print $first_line? "$id\t$len{$id}->[0]\t$bestScore\t$prettyScore\t$FSSP_id\t$scopInfo[0]\t$scopInfo[1]\n" : "\t\t\t\t\t$scopInfo[0]\t$scopInfo[1]\n"; $first_line=0; } } else { # print line without any domain information print "$id\t$len{$id}->[0]\t$bestScore\t$prettyScore\t$FSSP_id\t\t\n"; } } } sub process_command_line () { $PROGRAM_NAME = $0; # defaults $num_best = 30; $E_thresh = 0.10; undef @rdb_files; # Get user specified values for(my $i=0; $i <= $#ARGV; ) { $_ = $ARGV[$i++]; if (/^-num/) {$num_best = $ARGV[$i++]; } elsif (/^-E/) {$E_thresh = $ARGV[$i++]; } else { push @rdb_files, $_; } } &print_usage_exit if($#rdb_files < 1) ; }