#!/usr/bin/perl -w #Usage: annotate_template_scores < template_scores_rdbfile #takes an rdb file of SAM dist scores for the target sequence with #respect to a template library. Note: SAM outputs an rdb file of #template scores rather than a dist file and here we work with #a preprocessed version of that file, in which comments have been #stripped out and full path names of the models shortened to chainID name. #We output the list of scores as an rdb table with these columns: # Template_ID Template_Length E-value FSSP-rep SCOP domains #where there are multiple SCOP domains, we will have one row per domain #include commented explanation at the top of the file use FileHandle; STDERR->autoflush(1); use Getopt::Long; use English; use File::Basename; #SCOP list of chains and domains $scop_file = "/projects/compbio/data/scop/dir.cla.scop.txt.gz"; #fssp list of chains and representatives $table2 = "/projects/compbio/data/fssp/TABLE2.gz"; GetOptions( "scop=s" => \$scop_file , "fssp=s" => \$table2 ); #print comment text here print "# List of template models which gave the target the best scores.\n"; print "# HMM and HMM E-value scores computed by SAM (c) 1992-2001 Regents of the University of California, Santa Cruz\n"; print "# Sequence Alignment and Modeling Software System\n"; print "# http://www.soe.ucsc.edu/research/compbio/sam.html\n"; print "# ----------------- Citations (SAM, SAM-T99, HMMs) --------------------\n"; print "# R. Hughey, A. Krogh, Hidden Markov models for sequence analysis:\n"; print "# Extension and analysis of the basic method, CABIOS 12:95-107, 1996.\n"; print "# K. Karplus, C. Barrett, R. Hughey, Hidden Markov models for detecting\n"; print "# remote protein homologies, Bioinformatics 14(10):846-856, 1998.\n"; print "# A. Krogh et al., Hidden Markov models in computational biology:\n"; print "# Applications to protein modeling, JMB 235:1501-1531, Feb 1994.\n"; print "#\n"; print "# FSSP representative for each protein chain taken from FSSP:\n"; print "# ------------------Citations (FSSP) -----------------------------------\n"; print "# L. Holm and C. Sander (1996) Mapping the protein universe. Science 273:595-602.\n"; print "# \n"; print "#SCOP domains found in each protein chain taken from SCOP:\n"; print "# ------------------Citations (SCOP) -----------------------------------\n"; print "#Murzin A. G., Brenner S. E., Hubbard T., Chothia C. (1995).\n"; print "#SCOP: a structural classification of proteins database for the investigation of sequences and structures.\n"; print "#J. Mol. Biol. 247, 536-540.\n"; #print RDB column head and column format text here print "Template_ID\tTemplate_Length\tTemplate_Evalue\tFSSP-rep\tSCOP_domain\tSCOP_suid\n"; print "5S\t5N\t12S\t6S\t12S\t10N\n"; #open TABLE2 for reading open(TABLE2, "gunzip -c $table2|") || die "Can't open $table2 for reading\n"; #Note: we will need multiple searches through this table. Since we can't rewind #from a pipe, we'll read the PDBid and their reps into memory #key is the PDBid and value is its rep # could add additional values if we want (such as RMSD to rep . . .) %fsspReps = (); # skip header while() { last if (/^PDBid/); } # read FSSP representatitives while() { if (/(\S+)\s+(\S+)/) { $fsspReps{$1} = $2; } } close(TABLE2); #likewise for this table #key is chain name and value is comma-separated SCOP suid-list, my %suids = (); my %scop_from_suid = (); # map suid to scop name open(SCOP_FILE, "gunzip -c $scop_file|") || die "Can't open $scop_file for reading\n"; while($line = ) { next if $line =~ /^\#/; @cols = split(" ", $line); if ($cols[2] =~ /^[A-Z]{1,1}:/) { # chain letter $chainID = $cols[1] . substr($cols[2], 0, 1); } else { $chainID = $cols[1]; } # get the domain name i.e. a.1.1.1 and unique scop id for the # domain (sunid) i.e. 14982 and map 14982->a.1.1.1 and # chainID -> comma-separated list of suids. $scop_from_suid{$cols[4]} = $cols[3]; $suids{$chainID} = defined($suids{$chainID})? $suids{$chainID}.",".$cols[4] : $cols[4]; } close(SCOP_FILE); #skip rdb column names and formats while() { # print STDERR "DEBUG: checking header: $_"; last if /^\d+S\s+/; } while($line = ) { chomp $line; @cols = split(" ", $line); # $cols[0] is template id # $cols[1] is hmm length # $cols[7] is E-value # print STDERR "DEBUG: id=$cols[0] len=$cols[1] E_val=$cols[7]\n"; print "$cols[0]\t$cols[1]\t$cols[7]\t" . (defined($fsspReps{$cols[0]})? $fsspReps{$cols[0]}:"") . "\t"; if (defined ($suids{$cols[0]})) { #parse the domain name and the sunid # sorting the suids to get a canonical order my @suidlist = sort(split(",", $suids{$cols[0]})); my @scoplist = map($scop_from_suid{$_}, @suidlist); print join(",", @scoplist) ."\t" . join(",", @suidlist) . "\n"; } else { print "\t\n"; } } #sub print_usage_exit { # print "Usage: annotate_template_scores modelname < template-scores-rdbfile\n"; # exit(-1); #}