#!/usr/bin/perl -w # Usage: # best_scores seed.t2k-100-30-dssp-ebghstl-scores.rdb . . . > seed.combined-scores.rdb # # optional arguments: # -num 10 minimum number of hits to output # -maxnum 100 maximum number of hits to output # -E 0.10 report any hit with this Evalue or better in ANY of the input files # Read in all the score files. # Output a "best hits" list (sorted by Evlaue) in RDB format that includes # Sequence_ID Length Best_Evalue Evalue FSSP-rep SCOP_domain SCOP_suid # The evalue is calculated as a weighted geometric average of the values # in the different RDB values. # There is a default E value for missing values, and special treatment for # hits that are not part of the template library. # Sun Jan 2 09:10:56 PST 2005 Kevin Karplus # Added -maxnum parameter, and changed default num to 10 # Thu Jul 13 17:32:01 PDT 2006 Kevin Karplus # Added weights for o_notor2, n_sep, and o_sep alphabets. # Adjusted weights to rely less on 3-track HMMs # Fixed bug that did not check whether template was in t06 library. # prototypes sub print_usage_exit (); sub print_best($$$\%); sub read_scop_suid($\%); sub read_rdb_header(*\@\%); #main { use FileHandle; STDERR->autoflush(1); use Getopt::Long; use English; use File::Basename; use lib dirname($PROGRAM_NAME); use ChainIds; # defaults my $min_num_best = 10; my $max_num_best = 100; my $E_thresh = 0.10; my $LIBSIZE = 9000; undef @rdb_files; my $scop_file = "/projects/compbio/data/scop/dir.cla.scop.txt.gz"; GetOptions( "num=n" => \$min_num_best , "maxnum=n" => \$max_num_best , "E=f" => \$E_thresh , "lib_size=n" => \$LIBSIZE , "scop_file=s" => \$scop_file ); @rdb_files = @ARGV; &print_usage_exit if(scalar(@rdb_files)<2) ; $weight{"t2k-100-30-dssp-ebghstl"} = 2.0; $weight{"t2k-100-30-stride-ebghtl"} = 2.0; $weight{"t2k-100-30-bys"} = 1.0; $weight{"t2k-100-30-str"} = 1.0; $weight{"t2k-100-30-str2"} = 3.0; $weight{"t2k-100-30-alpha"} = 2.0; $weight{"t2k-100-30-n_notor2"} = 1.5; $weight{"t2k-100-30-o_notor2"} = 1.5; $weight{"t2k-100-30-n_sep"} = 1.5; $weight{"t2k-100-30-o_sep"} = 1.5; $weight{"t2k-100-30-dssp-ehl2"} = 3.0; $weight{"t2k-100-30-near-backbone-11"} = 3.0; $weight{"t2k-100-30-CB_burial_14_7"} = 2.0; $weight{"t2k-100-40-40-str+CB_burial_14_7"} = 2.0; $weight{"t2k-100-40-40-str2+CB_burial_14_7"} = 2.0; $weight{"t2k-80-60-80-str2+near-backbone-11"} = 2.0; $weight{"t2k-w0.5"} = 2.0; $weight{"t04-100-30-dssp-ebghstl"} = 2.0; $weight{"t04-100-30-stride-ebghtl"} = 2.0; $weight{"t04-100-30-bys"} = 1.0; $weight{"t04-100-30-str"} = 1.0; $weight{"t04-100-30-str2"} = 4.0; $weight{"t04-100-30-alpha"} = 2.0; $weight{"t04-100-30-dssp-ehl2"} = 3.0; $weight{"t04-100-30-n_notor2"} = 1.5; $weight{"t04-100-30-o_notor2"} = 1.5; $weight{"t04-100-30-n_sep"} = 1.5; $weight{"t04-100-30-o_sep"} = 1.5; $weight{"t04-100-30-near-backbone-11"} = 4.0; $weight{"t04-100-30-CB_burial_14_7"} = 2.0; $weight{"t04-100-40-40-str+CB_burial_14_7"} = 2.0; $weight{"t04-100-40-40-str2+CB_burial_14_7"} = 2.0; $weight{"t04-80-60-80-str2+near-backbone-11"} = 2.0; $weight{"t04-w0.5"} = 3.0; $weight{"t06-100-30-dssp-ebghstl"} = 2.0; $weight{"t06-100-30-stride-ebghtl"} = 2.0; $weight{"t06-100-30-bys"} = 1.0; $weight{"t06-100-30-str"} = 1.0; $weight{"t06-100-30-str2"} = 4.0; $weight{"t06-100-30-alpha"} = 2.0; $weight{"t06-100-30-n_notor2"} = 1.5; $weight{"t06-100-30-o_notor2"} = 1.5; $weight{"t06-100-30-n_sep"} = 1.5; $weight{"t06-100-30-o_sep"} = 1.5; $weight{"t06-100-30-dssp-ehl2"} = 3.0; $weight{"t06-100-30-near-backbone-11"} = 4.0; $weight{"t06-100-30-CB_burial_14_7"} = 2.0; $weight{"t06-100-40-40-str2+CB_burial_14_7"} = 2.0; $weight{"t06-80-60-80-str2+near-backbone-11"} = 2.0; $weight{"t06-w0.5"} = 3.0; $weight{"template-lib"} = 6.0; $weight{"t2k-template-lib"} = 6.0; $weight{"t04-template-lib"} = 8.0; $weight{"t06-template-lib"} = 9.0; # if a chain is not one of our template id's, we report # the one-track target E-value, averaged with the default Evalue. # for the default Evalue for missing scores, take the geometric mean of # the best unreported score (assuming -Emax 40.) and the library size, # but weight the emax value more my $threshold_weight= 3.0; $LOG_DEFAULT_EVALUE = ($threshold_weight*log(40.)+log($LIBSIZE))/ ($threshold_weight+1.0); # build a hash of all ID's in our template library # that we can use to check membership for ID's in the files $T2K_ID_FILE = "/projects/compbio/experiments/models.97/indexes/t2k.ids"; ReadIDs($T2K_ID_FILE, %t2k_ids); # build a hash of all ID's in our template library # that we can use to check membership for ID's in the files $T04_ID_FILE = "/projects/compbio/experiments/models.97/indexes/t04.ids"; ReadIDs($T04_ID_FILE, %t04_ids); # build a hash of all ID's in our template library # that we can use to check membership for ID's in the files $T06_ID_FILE = "/projects/compbio/experiments/models.97/indexes/t06.ids"; ReadIDs($T06_ID_FILE, %t06_ids); # each chainID will be the key in a variety of hashes # that we'll use to keep track of all the information # our hashes will be: %len = (); %BestLogEvalue = (); %logEvalue = (); %total_weight = (); %FSSP_rep = (); %SUID = (); # comma-separated list of SUIDs for chain # mapping from SUID values to SCOP codes. my %SCOP_from_SUID; read_scop_suid($scop_file,%SCOP_from_SUID); # we will have multiple entries under each key # for many of these. For instance, each key that # is in our template library will have multiple (weighted) # Evalues, keys may have several SCOP_domains, etc. # Weights for the E-values are set by which file we get the id from. # $desired_total_weight will be the sum of the weights for all files. my $desired_total_weight = 0; # how much weight is available due to membership in template libraries my $total_t2k_weight=0; my $total_t04_weight=0; my $total_t06_weight=0; foreach $scorefile (@rdb_files) { # print STDERR "reading $scorefile\n"; # we want to know which file we're in to assign E-value weights $scorefile =~ /\S+[.](t\S+)-scores.rdb/; $score_type = $1; if (!defined ($weight{$score_type})) { die "Invalid input file: $scorefile, unrecognized type $score_type\n"; } $score_wt = $weight{$score_type}; $desired_total_weight += $score_wt; $total_t2k_weight += $score_wt if (($scorefile =~ /t2k/) && ($scorefile !~ /t2k-w0.5/)); $total_t04_weight += $score_wt if (($scorefile =~ /t04/) && ($scorefile !~ /t04-w0.5/)); $total_t06_weight += $score_wt if (($scorefile =~ /t06/) && ($scorefile !~ /t06-w0.5/)); open (SCOREFILE, $scorefile); my @col_names; my %col_num; read_rdb_header(*SCOREFILE{IO}, @col_names, %col_num); $idcolnum = $col_num{"Sequence_ID"}; $idcolnum = $col_num{"Template_ID"} if !defined($idcolnum); $idcolnum = $col_num{"HMM"} if !defined($idcolnum); $lencolnum = $col_num{"Length"}; $lencolnum = $col_num{"Template_Length"} if !defined($lencolnum); $lencolnum = $col_num{"HMMLEN"} if !defined($lencolnum); $Ecolnum = $col_num{"Target_Evalue"}; $Ecolnum = $col_num{"Template_Evalue"} if !defined($Ecolnum); $Ecolnum = $col_num{"EVALUE"} if !defined($Ecolnum); $FSSPcolnum = $col_num{"FSSP-rep"}; $SCOPcolnum = $col_num{"SCOP_domain"}; $suidcolnum = $col_num{"SCOP_suid"}; while($line=) { chomp($line); @cols = split( /\t/, $line ) ; $id = $cols[$idcolnum]; $length = $cols[$lencolnum]; $Evalue = $cols[$Ecolnum]; $FSSP = $cols[$FSSPcolnum] if defined($FSSPcolnum); $SCOP = $cols[$SCOPcolnum] if defined($SCOPcolnum); $SCOP_suid = $cols[$suidcolnum] if defined($suidcolnum); # line which gives only additional scop domains if (!($id =~ /\S+/)) { # no chainID found in the first column (use id from previous line) if ( ($SCOP =~ /^\S+/) && ($SCOP_suid =~ /^\S+/) ) { # if SCOP defined $SUID{$lastID} .= ",$SCOP_suid"; } next; } $FSSP_rep{$id} = $FSSP if defined($FSSPcolnum); push @{$len{$id}}, $length; # add the log E-value, multiplied by the scaling factor $logE = $Evalue<=0? -999: log($Evalue); if (!defined($logEvalue{$id})) { $logEvalue{$id} = $logE*$score_wt; $total_weight{$id} = $score_wt; } else { $logEvalue{$id} += $logE*$score_wt; $total_weight{$id} += $score_wt; } if (!defined($BestLogEvalue{$id}) || $logE < $BestLogEvalue{$id}) { $BestLogEvalue{$id} = $logE; } # load up SCOP domain info if it's there if (defined($SCOPcolnum) && defined($SCOP_suid) && ($SCOP =~ /^\S+/) && ($SCOP_suid =~ /^\S+/) ) { $SUID{$id}=$SCOP_suid; } $lastID = $id; } close(SCOREFILE); } # print STDERR "DEBUG: desired weight= $desired_total_weight\n"; # compute the Evals with our current rules for missing weights. foreach $id (keys %len) { # how much of the possible weight did this id get my $MissingWeight = $desired_total_weight - $total_weight{$id}; $MissingWeight -= $total_t2k_weight if (!defined ($t2k_ids{$id})); $MissingWeight -= $total_t04_weight if (!defined ($t04_ids{$id})); $MissingWeight -= $total_t06_weight if (!defined ($t06_ids{$id})); # penalty for sequences not in template library $MissingWeight+= 1.0 if (!defined($t2k_ids{$id}) && !defined($t04_ids{$id}) && !defined($t06_ids{$id}) ); $MissingWeight = 0 if ($MissingWeight < 0); my $avg=$logEvalue{$id}/$total_weight{$id}; # print STDERR "DEBUG: $id total_weight= $total_weight{$id} missing_weight= $MissingWeight, $avg => "; $logEvalue{$id} = ($logEvalue{$id} +$MissingWeight*$LOG_DEFAULT_EVALUE)/ ($total_weight{$id} + $MissingWeight); # print STDERR "$logEvalue{$id}\n"; } print_best($min_num_best, $max_num_best, $E_thresh, %SCOP_from_SUID); } sub print_usage_exit () { print "Usage: best_scores [-num 30] [-E 0.10] seed.t2k-100-30-dssp-ebghstl-scores.rdb \\\n"; print " seed.t2k-100-30-stride-ebghtl-scores.rdb seed-template-lib-scores.rdb . . . > tmp.best.rdb\n"; print "Must have at least 2 rdb files to combine, or there's no point.\n"; exit(-1); } # print out the top N sequences, sorted by Evalue # also any with BestEvalue < threshold sub print_best($$$\%) { my ($min_num_best, $max_num_best, $Ethresh,$scop_ref ) = @_; # sort ids by increasing $logEvalue @sorted_ids = sort {$logEvalue{$a} <=> $logEvalue{$b}} keys(%logEvalue); # print headers print "Sequence_ID\tLength\tBest_Evalue\tEvalue\tFSSP-rep\tSCOP_domain\tSCOP_suid\n"; print "5S\t5N\t12N\t12N\t8S\t12S\t12N\n"; my $logthresh = log($Ethresh); my $lines_printed = 0; foreach $id (@sorted_ids) { last if $lines_printed >= $max_num_best; next if $lines_printed>= $min_num_best && $BestLogEvalue{$id} > $logthresh; $lines_printed ++; $bestScore = sprintf "%5.4e", exp($BestLogEvalue{$id}); $prettyScore = sprintf "%5.4e", exp($logEvalue{$id}); $FSSP_id = defined($FSSP_rep{$id})? $FSSP_rep{$id} : ""; if (defined($SUID{$id})) { my $suid_string = $SUID{$id}; my @suids = sort(split(',', $suid_string)); $suid_string = join(',', @suids); my $scop_string = join(',', map($$scop_ref{$_}, @suids)); print"$id\t$len{$id}->[0]\t$bestScore\t$prettyScore\t$FSSP_id\t$scop_string\t$suid_string\n"; } else { # print line without any domain information print "$id\t$len{$id}->[0]\t$bestScore\t$prettyScore\t$FSSP_id\t\t\n"; } } } # read in the SCOP data base, creating a mapping from SUID->SCOP sub read_scop_suid($\%) { my ($scop_file, $scop_ref) = @_; open (SCOP, "gunzip -c $scop_file |") || die "best_scores can't unzip $scop_file"; while () { next if /^#/; my @fields = split('\t'); $$scop_ref{$fields[4]} = $fields[3]; } close SCOP; } # read in the header of an RDB file and store the column names in @$col_names_ref # with a reversed index in %$col_num_ref. sub read_rdb_header(*\@\%) { my ($FILE,$col_names_ref,$col_num_ref) = @_; # skip the comments at the beginning while (<$FILE> ) { last if /.+/ && !(/^\s*#/); } # read the column names chomp; @$col_names_ref = split(/\t/); # index the column names for(my $i = 0; $i; }