#!/usr/local/bin/perl5 -w # # single-blast -q query.fasta -db db.fasta # [-e 10.0] [-prog wu-blastp] # > output_rdb_file # # Does a one-step search for homologs, using wu-blastp # (or specified program). # # Arguments: # -q query sequence file in fasta format. # May have multiple queries. # -db Database to report homologs from. # -e E threshold for blast # # The output has 3 columns: # TARGET the query sequence name # HMM the sequence IDs for the homologs # SCORE not the raw score, but log(P value) # # The "HMM" name is unfortunate, but is chosen for compatibility with # the output of score-lib. # 22 Dec 1997, modified to use expected number rather than P-value # so that scores greater than 0 are possible (too much piles up on zero). # 25 May 2000, changed temp directory sub absolute($); $PDB = "/projects/compbio/data/pdb/all-protein"; $tmpdir = "/projects/compbio/tmp/single-blast-$$"; make_directories($tmpdir); &process_command_line; # process_command_line copies the (or uncompresses) the queries to # $tmpdir/queries chdir $tmpdir; print "TARGET\tHMM\tSCORE\n"; print "30S\t30S\t10N\n"; open(QUERIES, "< queries"); open(ONE, "> one-query"); $id = ""; while () { $save = $_; if (/^>\s*(\S+?)([, ]|$)/) { $newid = $1; if ($id ne "") { close ONE; get_all_hits ($id); open(ONE, "> one-query"); } $id = $newid; } print ONE $save; } close(ONE); get_all_hits($id) if ($id ne ""); close(QUERIES); system "rm -rf $tmpdir"; exit; #------------------------ subroutines -------------------------------- sub get_all_hits($) { my ($queryid) = @_; my $blast_out = "blast-out-$$"; print STDERR "\nstarting $queryid with E=$e\n"; blast_query_to_database("$prog", "one-query", $dbfile, $blast_out, $e); accumulate_blast_hits($blast_out); unlink "one-query"; unlink $blast_out; } # # sub blast_query_to_database($blast_prog, # $query_file, $DatabaseFile, # $blast_out_file, $e); # sub blast_query_to_database($$$$$) { my($blast_prog, $query_file, $database_file, $blast_out_file,$e) = @_; $blast_params = " E=$e V=10000 B=100000 -gi"; my($blast_cmd) = "$blast_prog $database_file $query_file" . " $blast_params > $blast_out_file"; # print STDERR "\n$blast_cmd\n"; system($blast_cmd); } # # accumulate_blast_hits(blast_output) # reads the blast_output file and prints all hits # # # accumulate_blast_hits(blast_output,) # reads the blast_output file and collects all hits # Records the expected number of hits this good, # the low and high subscript of the hit in # hashes %blast_expect, %blast_start, %blast_end # sub accumulate_blast_hits($) { my($blast_out_file) = @_; open(BLASTOUT_IN, "<$blast_out_file") || die "Error: can't open file $blast_out_file\n"; # # seek Query line # my($blastout_in_line); $blastout_in_line = ; while ($blastout_in_line and $blastout_in_line !~ /^Query=/) { $blastout_in_line = ; } if ($blastout_in_line and $blastout_in_line =~ /^Query=\s+(\S+?)([, ]|$)/) { $query_id = $1; } else { print STDERR "Error: file $blast_out_file has no Query line\n"; return; } $hitseq_id = ""; while () { if (/^>(\S+) /) { # found a new hit save_hit() if ($hitseq_id ne ""); $hitseq_id = $1; $start=999999; $end=0; $expect=999999; } elsif (/Expect = (\S+), .*P.* = (\S+)/) { $expect = $1 if $1 < $expect; # $p= $2; } elsif (/Sbjct:\s+(\d+)\s+\S+\s+(\d+)/) { $start = $1 if ($1 < $start); $end = $2 if ($2 > $end); } elsif (/^Parameters:/) { save_hit() if ($hitseq_id ne ""); last; } } close BLASTOUT_IN; } sub save_hit() { $pvalue = $expect; $log_p = ($pvalue <= 0)? -1300: log($pvalue); printf "%s\t%s\t%.8g\n", $query_id,$hitseq_id,$log_p; } sub process_command_line { local($i) = 0; # Check for even number of command line words if ($#ARGV%2 != 1) { print "ERROR: Command line error.\n"; exit (-1); } # Set default values $dbfile = $PDB; $e = 10.0; $prog = "wu-blastp"; # Get user specified values while ($i <= $#ARGV) { $_ = $ARGV[$i++]; SWITCH: { if (/^-q/) {$queryfile = absolute( $ARGV[$i++]); last SWITCH; } if (/^-db/) { $dbfile = absolute( $ARGV[$i++]); (-e $dbfile) || die "$dbfile doesn't exist.\n"; last SWITCH; } if (/^-e/) {$e = $ARGV[$i++]; last SWITCH;} if (/^-prog/) {$prog = $ARGV[$i++]; last SWITCH;} } } # Make certain that there are now values for those parameters with non- # default values (defined($queryfile)) || die "ERROR: Nothing specified for -q queryfile\n"; $BASE_A2M = absolute(($queryfile =~ /(\S+)[.]gz$/ )? $1 : $queryfile); $tmpqueries= "$tmpdir/queries"; if (-e $BASE_A2M) { system "cp $BASE_A2M $tmpqueries"; } elsif (-e "$BASE_A2M.gz") { system "zcat $BASE_A2M.gz > $tmpqueries"; } else { die "$0 failed: can't find $BASE_A2M or $BASE_A2M.gz\n"; } return; } # prepend relative path names with current working directory # usage $absolute_file = absolute($file) sub absolute($) { my ($file) = @_; return $file if ($file =~ /^\/.*/); my $wd = `pwd`; chomp($wd); if ($wd=~/\/auto(.*)/) {return "$1/$file"}; return "$wd/$file"; } # make all directories along path to specified directory sub make_directories ($) { my ($dir) = @_; @pieces = split /\//, $dir; my $i; for ($i = 0; $i <=$#pieces; $i++) { my $path = join ('/', @pieces[0 .. $i]); next if $path eq ""; if (! -d $path) { mkdir $path, 0775; chmod 0775, $path; system "chgrp protein $path"; } } }