#!/usr/bin/perl -w #Usage: annotate_target_scores modelname < scores.dist #takes a SAM dist file of scores for protein chains listed by PDB ID #outputs the list as an rdb table with these columns: # Sequence ID Length E-value FSSP-rep SCOP domains #where there are multiple SCOP domains, we will have one row per domain #include commented explanation at the top of the file use FileHandle; STDERR->autoflush(1); use Getopt::Long; use English; use File::Basename; #SCOP list of chains and domains $scop_file = "/projects/compbio/data/scop/dir.cla.scop.txt.gz"; #fssp list of chains and representatives $table2 = "/projects/compbio/data/fssp/TABLE2.gz"; GetOptions( "scop=s" => \$scop_file , "fssp=s" => \$table2 ); if ($#ARGV < 0 ) { &print_usage_exit; } $modelname = join(" ",@ARGV); #print comment text here print "# List of top-scoring protein chains for $modelname hidden Markov model.\n"; print "# HMM and HMM E-value scores computed by SAM (c) 1992-2001 Regents of the University of California, Santa Cruz\n"; print "# Sequence Alignment and Modeling Software System\n"; print "# http://www.cse.ucsc.edu/research/compbio/sam.html\n"; print "# ----------------- Citations (SAM, SAM-T99, HMMs) --------------------\n"; print "# R. Hughey, A. Krogh, Hidden Markov models for sequence analysis:\n"; print "# Extension and analysis of the basic method, CABIOS 12:95-107, 1996.\n"; print "# K. Karplus, C. Barrett, R. Hughey, Hidden Markov models for detecting\n"; print "# remote protein homologies, Bioinformatics 14(10):846-856, 1998.\n"; print "# A. Krogh et al., Hidden Markov models in computational biology:\n"; print "# Applications to protein modeling, JMB 235:1501-1531, Feb 1994.\n"; print "#\n"; print "# FSSP representative for each protein chain taken from FSSP:\n"; print "# ------------------Citations (FSSP) -----------------------------------\n"; print "# L. Holm and C. Sander (1996) Mapping the protein universe. Science 273:595-602.\n"; print "# \n"; print "#SCOP domains found in each protein chain taken from SCOP:\n"; print "# ------------------Citations (SCOP) -----------------------------------\n"; print "#Murzin A. G., Brenner S. E., Hubbard T., Chothia C. (1995).\n"; print "#SCOP: a structural classification of proteins database for the investigation of sequences and structures.\n"; print "#J. Mol. Biol. 247, 536-540.\n"; #print RDB column head and column format text here print "Sequence_ID\tLength\tTarget_Evalue\tFSSP-rep\tSCOP_domain\tSCOP_suid\n"; print "5S\t5N\t12S\t6S\t12S\t10N\n"; #open TABLE2 and scop_file for reading open(TABLE2, "gunzip -c $table2|") || die "Can't open $table2 for reading\n"; #Note: we will need multiple searches through this table. Since we can't rewind #from a pipe, we'll read the PDBid and their reps into memory #key is the PDBid and value is its rep # could add additional values if we want (such as RMSD to rep . . .) %fsspReps = (); # skip header while() { last if (/^PDBid/); } # read FSSP representatitives while() { if (/(\S+)\s+(\S+)/) { $fsspReps{$1} = $2; } } close(TABLE2); #likewise for this table #key is chain name and value is comma-separated SCOP suid-list, my %suids = (); my %scop_from_suid = (); # map suid to scop name open(SCOP_FILE, "gunzip -c $scop_file|") || die "Can't open $scop_file for reading\n"; while($line = ) { next if $line =~ /^\#/; @cols = split(" ", $line); if ($cols[2] =~ /^[A-Z]{1,1}:/) { # chain letter $chainID = $cols[1] . substr($cols[2], 0, 1); } else { $chainID = $cols[1]; } # get the domain name i.e. a.1.1.1 and unique scop id for the # domain (sunid) i.e. 14982 and map 14982->a.1.1.1 and # chainID -> comma-separated list of suids. $scop_from_suid{$cols[4]} = $cols[3]; $suids{$chainID} = defined($suids{$chainID})? $suids{$chainID}.",".$cols[4] : $cols[4]; } close(SCOP_FILE); while($line = ) { next if $line =~ /^%/; chomp $line; @cols = split(" ", $line); # $cols[0] is sequence id # $cols[1] is length # $cols[4] is E-value print "$cols[0]\t$cols[1]\t$cols[4]\t" . (defined($fsspReps{$cols[0]})? $fsspReps{$cols[0]}:"") . "\t"; #we'll report all SCOP domains, comma-separated #(since the "hit" chain may contain several SCOP domains) #also consider that there was no scop domain for this #chain in our version of the scop index if (defined ($suids{$cols[0]})) { #parse the domain name and the sunid # sorting the suids to get a canonical order my @suidlist = sort(split(",", $suids{$cols[0]})); my @scoplist = map($scop_from_suid{$_}, @suidlist); print join(",", @scoplist) ."\t" . join(",", @suidlist) . "\n"; } else { print "\t\n"; } } sub print_usage_exit { print "Usage: annotate_target_scores modelname < distfile\n"; print " Options: -scop /projects/compbio/data/scop/dir.cla.scop.txt.gz\n"; print " Options: -fssp /projects/compbio/data/fssp/TABLE2.gz\n"; exit(-1); }