#!/usr/local/bin/perl5 -w # # double-blast -q query.fasta -db db.fasta # [-interdb nrp.fasta] [-intere 5.0e-5] [-finale 0.02] # > output_rdb_file # # Does a two-step search for homologs, using wu-blast. # The first step searches interdb, the second uses the hits # found there to search db. # # 30 Dec 1997---differs from older versions in that # each blast hit in an intermediate is searched separately, # rather than merging them together. # # 24 July 1998---adds the original query to the set of intermediates, # in case it isn't in NRP to be found. # # Arguments: # -q query sequence file in fasta format. # May have multiple queries. # -db Database to report homologs from. # -interdb Database to do first step in. # Default = NRP # -intere E threshold for first step. # -finale E threshold for second step. # -extend how much to extend blast hit in each direction # # The output has 4 columns: # TARGET the query sequence name # HMM the sequence IDs for the homologs # SCORE not the raw score, but log(max(interE,finalE)) # interE how good was match at first step # finalE how good was match at second step # intermediate the sequence IDs for the intermediate that found this # # The "HMM" name is unfortunate, but is chosen for compatibility with # the output of score-lib. # 25 May 2000, changed temp directory sub absolute($); $PDB = "/projects/compbio/data/pdb/all-protein"; $NRP = "/projects/compbio/data/nrp/NRP_ALL.PDB"; # $NRP_SIZE = 77000000; # approx numbers of residues in NRP $tmpdir = "/projects/compbio/tmp/double-blast-$$"; make_directories($tmpdir); &process_command_line; # process_command_line copies the (or uncompresses) the queries to # $tmpdir/queries chdir $tmpdir; print "TARGET\tHMM\tSCORE\tINTERE\tFINALE\tintermediate\n"; print "30S\t30S\t10N\t10N\t10N\t26S\n"; open(QUERIES, "< queries"); open(ONE, "> one-query"); $id = ""; while () { $save = $_; if (/^>\s*(\S+?)([, ]|$)/) { $newid = $1; if ($id ne "") { close ONE; get_all_hits ($id); open(ONE, "> one-query"); } $id = $newid; } print ONE $save; } close(ONE); get_all_hits($id) if ($id ne ""); close(QUERIES); system "rm -rf $tmpdir"; exit; #------------------------ subroutines -------------------------------- sub get_all_hits($) { my ($queryid) = @_; my $blast_out = "blast-out-$$"; print STDERR "\nstarting $queryid with E=$intere\n"; blast_query_to_database("wu-blastp", "one-query", $interdbfile, $blast_out, $intere); accumulate_blast_hits($blast_out); unlink $blast_out; # try searching the final database directly with the query. rename("one-query", "second-query"); second_level_blast($queryid, $queryid, 0); open(INTERDB , "< $interdbfile") || die "Can't read sequences from intermediate DB $interdbfile\n"; $copying=0; while () { if (/^>\s*(\S+?)([, ]|$)/) { my $id = $1; if ($copying) { do_second_level_blasts($queryid,$secondid,$seq); } $copying = defined($first_sub{$id}); if ($copying) { $secondid = $id; $seq= ""; } } elsif ($copying) { chomp; tr/- . //d; # remove whitespace, ., and - $seq .= $_; } } if ($copying) { do_second_level_blasts($queryid, $secondid, $seq); } close INTERDB; # May want to move to after outer loop my @scores = values(%best_score); my @unsorted_keys = keys(%best_score); my @sorted_keys = @unsorted_keys [ sort { $scores[$a] <=> $scores[$b] } 0..$#scores ]; for $key (@sorted_keys) { printf "%s\t%.6g\t%.5g\t%.5g\t%s\n", $key, $best_score{$key}, $first_e{$key}, $second_e{$key}, $via{$key}; } undef %via; undef %best_score; undef %first_e; undef %second_e; } sub do_second_level_blasts($$$) { my ($queryid,$secondid,$seq) =@_; for ($sub= $first_sub{$secondid}; $sub<=$last_sub{$secondid}; $sub++) { print_sub_seq($seq, $secondid, $starts[$sub], $ends[$sub]); second_level_blast($queryid, "$secondid\_$starts[$sub]:$ends[$sub]", $expects[$sub]); } } sub print_sub_seq($$$$) { my ($seq, $secondid, $start, $end) = @_; open (TO, "> second-query") || die "Can't write file second_query\n"; print TO ">$secondid\_$start:$end \n"; print TO substr($seq, $start, $end-$start+1); close TO; } sub second_level_blast($$$) { my($queryid, $secondid, $interexpect) = @_; print STDERR "$queryid $secondid blasting:\n"; # print STDERR "Expected number for hit was $interexpect\n"; # open (TWO_IN, "< second-query"); # while () # { print STDERR $_; # } # close TWO_IN; my $second_out = "blast-second-$$"; blast_query_to_database("wu-blastp", "second-query", $dbfile, $second_out, $finale); record_hits($second_out, $queryid, $secondid, $interexpect); unlink $second_out; } # # sub blast_query_to_database($blast_prog, # $query_file, $DatabaseFile, # $blast_out_file, $e); # sub blast_query_to_database($$$$$) { my($blast_prog, $query_file, $database_file, $blast_out_file,$e) = @_; $blast_params = " E=$e B=10000 V=10000 -gi"; my($blast_cmd) = "$blast_prog $database_file $query_file" . " $blast_params > $blast_out_file"; # print STDERR "\n$blast_cmd\n"; system($blast_cmd); } sub save_hit() { #save old hit push @expects, $expect; $start -= $extend; $end += $extend; push @starts, $start; push @ends, $end; $start = 999999; $end = 0; $expect = 999999; $last_sub{$hitseq_id} = $#starts; } # # accumulate_blast_hits(blast_output,) # reads the blast_output file and collects all hits # Records the expected number of hits this good, # the low and high subscript of the hit in # @expects, @starts, @ends, # And records the subscripts for those arrays in hashes # $first_sub{$hitseq_id} $last_sub{hitseq_id} # sub accumulate_blast_hits($) { my($blast_out_file) = @_; # clear the arrays undef @starts; undef @ends; undef @expects; undef %first_sub; undef %last_sub; open(BLASTOUT_IN, "<$blast_out_file") || die "Error: can't open file $blast_out_file\n"; $hitseq_id = ""; $start=999999; $end=0; $expect=999999; while () { if (/^>(\S+) /) { # found a new hit save_hit() if ($start<=$end); $hitseq_id = $1; $first_sub{$hitseq_id} = $#starts +1; } elsif (/Expect = (\S+), .*P.* = (\S+)/) { save_hit() if ($start<=$end); $expect = $1 if $1 < $expect; # $p= $2; } elsif (/Sbjct:\s+(\d+)\s+\S+\s+(\d+)/) { $start = $1 if ($1 < $start); $end = $2 if ($2 > $end); } elsif (/^Parameters:/) { save_hit() if ($start<=$end); last; } } close BLASTOUT_IN; } # # record_hits(blast_output) # reads the blast_output file and records all hits in hashes # %best_score, %via, %first_e, %second_e # sub record_hits($) { my($blast_out_file, $queryid, $secondid, $interexpect) = @_; open(BLASTOUT_IN, "<$blast_out_file") || die "Error: can't open file $blast_out_file\n"; $hitseq_id = ""; while () { if (/^>(\S+) /) { # found a new hit if ($hitseq_id ne "") { add_hit( $queryid, $hitseq_id, $interexpect, $expect,$secondid); } $hitseq_id = $1; $expect=999999; } elsif (/Expect = (\S+), .*P.* = (\S+)/) { $expect = $1 if $1 < $expect; } elsif (/^Parameters:/) { if ($hitseq_id ne "") { add_hit( $queryid, $hitseq_id, $interexpect, $expect,$secondid); } last; } } close BLASTOUT_IN; } sub add_hit($$$$$$) { my ($queryid, $hitseq_id, $interexpect, $expect, $secondid) = @_; my $maxexpect = $expect < $interexpect? $interexpect: $expect; my $score = $maxexpect<=0? -1300: log($maxexpect); my $key="$queryid\t$hitseq_id"; if (! defined $best_score{$key} || $score < $best_score{$key}) { $best_score{$key} = $score; $via{$key} = $secondid; $first_e{$key} = $interexpect; $second_e{$key} = $expect; } } sub process_command_line { local($i) = 0; # Check for even number of command line words if ($#ARGV%2 != 1) { print "ERROR: Command line error.\n"; exit (-1); } # Set default values $dbfile = $PDB; $interdbfile = $NRP; $intere = 5.0e-5; $finale = 0.02; $extend = 0; # Get user specified values while ($i <= $#ARGV) { $_ = $ARGV[$i++]; SWITCH: { if (/^-q/) {$queryfile = absolute( $ARGV[$i++]); last SWITCH; } if (/^-db/) { $dbfile = absolute( $ARGV[$i++]); (-e $dbfile) || die "$dbfile doesn't exist.\n"; last SWITCH; } if (/^-interdb/) { $interdbfile = absolute( $ARGV[$i++]); (-e $interdbfile) || die "$interdbfile doesn't exist.\n"; last SWITCH; } if (/^-intere/) {$intere = $ARGV[$i++]; last SWITCH;} if (/^-finale/) {$finale = $ARGV[$i++]; last SWITCH;} if (/^-extend/) {$extend = $ARGV[$i++]; last SWITCH;} } } # Make certain that there are now values for those parameters with non- # default values (defined($queryfile)) || die "ERROR: Nothing specified for -q queryfile\n"; $BASE_A2M = absolute(($queryfile =~ /(\S+)[.]gz$/ )? $1 : $queryfile); $tmpqueries= "$tmpdir/queries"; if (-e $BASE_A2M) { system "cp $BASE_A2M $tmpqueries"; } elsif (-e "$BASE_A2M.gz") { system "zcat $BASE_A2M.gz > $tmpqueries"; } else { die "$0 failed: can't find $BASE_A2M or $BASE_A2M.gz\n"; } # force $intere and $finale into fixed-point format # since I'm unsure that BLAST understands exponents $intere = sprintf("%f",$intere); $finale = sprintf("%f",$finale); return; } # prepend relative path names with current working directory # usage $absolute_file = absolute($file) sub absolute($) { my ($file) = @_; return $file if ($file =~ /^\/.*/); my $wd = `pwd`; chomp($wd); if ($wd=~/\/auto(.*)/) {return "$1/$file"}; return "$wd/$file"; } # make all directories along path to specified directory sub make_directories ($) { my ($dir) = @_; @pieces = split /\//, $dir; for (my $i = 0; $i <=$#pieces; $i++) { my $path = join ('/', @pieces[0 .. $i]); next if $path eq ""; if (! -d $path) { mkdir $path, 0775; chmod 0775, $path; system "chgrp protein $path"; } } }