#!/usr/bin/perl -w #Usage: annotate_target_scores modelname < scores.dist #takes a SAM dist file of scores for protein chains listed by PDB ID #outputs the list as an rdb table with these columns: # Sequence ID Length E-value SCOP domains #where there are multiple SCOP domains, we will have one row per domain #include commented explanation at the top of the file # Sun Dec 30 16:57:50 PST 2007 Kevin Karplus # Removed FSSP columns and reading of FSSP file use FileHandle; STDERR->autoflush(1); use Getopt::Long; use English; use File::Basename; #SCOP list of chains and domains $scop_file = "/projects/compbio/data/scop/dir.cla.scop.txt.gz"; GetOptions( "scop=s" => \$scop_file ); if ($#ARGV < 0 ) { &print_usage_exit; } $modelname = join(" ",@ARGV); #print comment text here print "# List of top-scoring protein chains for $modelname hidden Markov model.\n"; print "# HMM and HMM E-value scores computed by SAM (c) 1992-2001 Regents of the University of California, Santa Cruz\n"; print "# Sequence Alignment and Modeling Software System\n"; print "# http://www.cse.ucsc.edu/research/compbio/sam.html\n"; print "# ----------------- Citations (SAM, SAM-T99, HMMs) --------------------\n"; print "# R. Hughey, A. Krogh, Hidden Markov models for sequence analysis:\n"; print "# Extension and analysis of the basic method, CABIOS 12:95-107, 1996.\n"; print "# K. Karplus, C. Barrett, R. Hughey, Hidden Markov models for detecting\n"; print "# remote protein homologies, Bioinformatics 14(10):846-856, 1998.\n"; print "# A. Krogh et al., Hidden Markov models in computational biology:\n"; print "# Applications to protein modeling, JMB 235:1501-1531, Feb 1994.\n"; print "#\n"; print "#SCOP domains found in each protein chain taken from SCOP:\n"; print "# ------------------Citations (SCOP) -----------------------------------\n"; print "#Murzin A. G., Brenner S. E., Hubbard T., Chothia C. (1995).\n"; print "#SCOP: a structural classification of proteins database for the investigation of sequences and structures.\n"; print "#J. Mol. Biol. 247, 536-540.\n"; #print RDB column head and column format text here print "Sequence_ID\tLength\tTarget_Evalue\tSCOP_domain\tSCOP_suid\n"; print "5S\t5N\t12S\t12S\t10N\n"; #open scop_file for reading #Note: we will need multiple searches through this table. Since we can't rewind #from a pipe, we'll read the PDBid and their reps into memory #key is chain name and value is comma-separated SCOP suid-list, my %suids = (); my %scop_from_suid = (); # map suid to scop name open(SCOP_FILE, "gunzip -c $scop_file|") || die "Can't open $scop_file for reading\n"; while($line = ) { next if $line =~ /^\#/; @cols = split(" ", $line); if ($cols[2] =~ /^[A-Z]{1,1}:/) { # chain letter $chainID = $cols[1] . substr($cols[2], 0, 1); } else { $chainID = $cols[1]; } # get the domain name i.e. a.1.1.1 and unique scop id for the # domain (sunid) i.e. 14982 and map 14982->a.1.1.1 and # chainID -> comma-separated list of suids. $scop_from_suid{$cols[4]} = $cols[3]; $suids{$chainID} = defined($suids{$chainID})? $suids{$chainID}.",".$cols[4] : $cols[4]; } close(SCOP_FILE); while($line = ) { next if $line =~ /^%/; chomp $line; @cols = split(" ", $line); # $cols[0] is sequence id # $cols[1] is length # $cols[4] is E-value print "$cols[0]\t$cols[1]\t$cols[4]\t"; #we'll report all SCOP domains, comma-separated #(since the "hit" chain may contain several SCOP domains) #also consider that there was no scop domain for this #chain in our version of the scop index if (defined ($suids{$cols[0]})) { #parse the domain name and the sunid # sorting the suids to get a canonical order my @suidlist = sort(split(",", $suids{$cols[0]})); my @scoplist = map($scop_from_suid{$_}, @suidlist); print join(",", @scoplist) ."\t" . join(",", @suidlist) . "\n"; } else { print "\t\n"; } } sub print_usage_exit { print "Usage: annotate_target_scores modelname < distfile\n"; print " Options: -scop /projects/compbio/data/scop/dir.cla.scop.txt.gz\n"; exit(-1); }