#!/usr/bin/perl
# Usage: create_summary_html id foo.fasta > summary.html
# Create a SAM_T02 summary html page
#       with links to the CASP6 web page
#
# The script writes the opening part of a summary page to STDOUT: a title,
# a heading in which the identifier is linked to an external protein
# database (when a link can be built), and the header row of a results table.
# Full user documentation is in the POD after __END__.

use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;

# Forward declaration so that calls compiled before the definition are
# checked against the prototype.
sub make_url($$$);

# Escape the characters that are special in HTML text and attribute values.
# Returns undef unchanged so callers can pass optional values straight through.
sub escape_html {
    my ($text) = @_;
    return $text if !defined($text);
    $text =~ s/&/&amp;/g;    # must be first, or the entities below get re-escaped
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}

# Search a FASTA file for an id line ">ID rest-of-line" whose ID equals $id
# and return the rest of that line.  If several lines match, the last one
# wins.  Returns undef when nothing matches or the file cannot be opened
# (a warning is issued in the latter case; the page is still generated).
sub find_fasta_comment {
    my ($fasta, $id) = @_;

    my $fh;
    if (!open($fh, '<', $fasta)) {
        warn "Warning: cannot open fasta file '$fasta': $!\n";
        return;
    }

    my $found;
    while (my $line = <$fh>) {
        if ($line =~ /^>(\S+) (.*)/ && $1 eq $id) {
            $found = $2;
        }
    }
    close($fh);
    return $found;
}

# Parse the command line, work out the comment and the database link, and
# print the beginning of the summary page to STDOUT.
sub main {
    my $url_type;
    my $id;
    my $fasta;
    my $comment;

    GetOptions(
        "url_type=s" => \$url_type,
        "id=s"       => \$id,
        "fasta=s"    => \$fasta,
        "comment=s"  => \$comment,
        "help|?"     => sub { pod2usage("verbose" => 1); },
        "man"        => sub { pod2usage("verbose" => 2); },
    ) or pod2usage("verbose" => 0);

    # parse leftover arguments: id first, then an existing file as the
    # fasta file, then the comment
    foreach my $arg (@ARGV) {
        if (!defined($id)) {
            $id = $arg;
        }
        elsif (!defined($fasta) && -e $arg) {
            $fasta = $arg;
        }
        elsif (!defined($comment)) {
            $comment = $arg;
        }
        else {
            print STDERR "Error: too many unlabeled arguments to create_summary_html\n";
            pod2usage("verbose" => 0);
        }
    }

    pod2usage("verbose" => 0) if (!defined($id));

    # look for the id in the fasta file, and save the comment
    if (defined($fasta) && !defined($comment)) {
        $comment = find_fasta_comment($fasta, $id);
    }

    my $url     = make_url($id, $comment, $url_type);
    my $safe_id = escape_html($id);
    my $link    = defined($url)
        ? '<a href="' . escape_html($url) . '">' . $safe_id . '</a>'
        : $safe_id;

    # NOTE(review): the copy of this script that was reviewed had all of its
    # HTML tags stripped, so the markup below is reconstructed from the text
    # that survived (title, two headings, and the three table column names).
    # Confirm the exact tags against a pristine copy.  The table and body are
    # deliberately left open: this script only writes the beginning of the
    # page.
    print "<html>\n<head>\n";
    print "<title>SAM_T02 Results for $safe_id</title>\n";
    print "</head>\n\n";
    print "<body>\n\n";
    print "<h1>SAM_T02 Results for $link</h1>\n";
    print "<hr>\n";
    print "<h2>$link";
    print " " . escape_html($comment) if defined($comment);
    print "</h2>\n";
    print "<hr>\n";
    print "<table border=\"1\">\n";
    print "<tr><th>Description</th><th>Link to file</th><th>Date</th></tr>\n";
    return;
}

# Create a URL from a protein name.
#
#   $id        the protein identifier
#   $comment   accepted for interface compatibility; not used here
#   $url_type  one of casp6, yeast, ncbi, swissprot (case-insensitive), or
#              undef to guess the type from the shape of $id
#
# Returns the URL string, or nothing (undef in scalar context) after a
# warning when the type cannot be guessed or $id does not fit the requested
# type.  An unknown $url_type prints an error and exits through pod2usage.
sub make_url($$$) {
    my ($id, $comment, $url_type) = @_;
    my $lc_id = lc($id);

    # try to see id as an NCBI identifier
    my ($ncbi_id) = $id =~ /^gi[|](\d+)[|]/;

    # try to see id as a Swissprot identifier: either an "sp|ACCESSION|"
    # field at the start of the id or after another pipe-separated field
    # (e.g. gi|...|sp|P36544|...), or a bare entry name such as ACH7_HUMAN
    my ($sprot_id) = $id =~ /(?:^|[|])sp[|]([^|]+)[|]/;
    if (!defined($sprot_id) && $id =~ /^[A-Za-z0-9]+_[A-Za-z0-9]+$/) {
        $sprot_id = $id;
    }

    my $looks_like_yeast = ($id =~ /^[A-Za-z][A-Za-z][A-Za-z]\d\d\d[CW]/);
    my $looks_like_casp  = ($id =~ /^[tT]\d+$/);

    if (!defined($url_type)) {
        # try to guess what sort of id this is
        if    (defined($ncbi_id))  { $url_type = "ncbi"; }
        elsif (defined($sprot_id)) { $url_type = "swissprot"; }
        elsif ($looks_like_yeast)  { $url_type = "yeast"; }
        elsif ($looks_like_casp)   { $url_type = "casp6"; }
        # TO DO: add genome browser guesses?
        else {
            warn "Warning: -url_type not specified, and can't guess from $id\n";
            return;
        }
    }
    $url_type = lc($url_type);

    # casp6 and yeast links are built from the raw id whatever it looks like;
    # ncbi and swissprot need a field parsed out of the id, and map to undef
    # when that field is missing.
    my %url_map = (
        "casp6" => "http://predictioncenter.llnl.gov/casp6/targets/templates/$lc_id.doc.html",
        "yeast" => "http://genome-www4.stanford.edu/cgi-bin/SGD/singlepageformat?locus=$id",
        "ncbi"  => (defined($ncbi_id)
            ? "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=protein&list_uids=$ncbi_id&dopt=GenPept"
            : undef),
        "swissprot" => (defined($sprot_id)
            ? "http://www.expasy.org/cgi-bin/niceprot.pl?$sprot_id"
            : undef),
        # TO DO: add genome browser
    );

    if (!exists($url_map{$url_type})) {
        print STDERR "Error: url_type $url_type not understod.  Use one of\n\t"
            . join(" ", keys(%url_map)) . "\n";
        pod2usage("verbose" => 0);
    }

    my $url = $url_map{$url_type};
    if (!defined($url)) {
        warn "Warning: -url_type $url_type specified or guessed,\n"
            . "    but '$id' not of appropriate type.\n";
        return;
    }
    return $url;
}

# Run only when executed as a script, so the subroutines above can be loaded
# (e.g. by a test) without generating a page.
main() unless caller;

__END__

=pod

=head1 NAME

create_summary_html -- create the beginning part of a summary.html file

=head1 SYNOPSIS

  create_summary_html -id YKL149C -comment "DBR1 lariat-debranching enzyme" > summary.html

  Options:
    -help      brief help
    -man       detailed help
    -id        The identifier to create a page for.
    -comment   Used for adding a comment further identifying the protein.
    -fasta     Specifies a fasta file which will be searched for the id,
               to provide a comment from the rest of the fasta id line.
    -url_type  specifies which database should be linked to for the id.

=head1 OPTIONS

=over 4

=item B<-help>

Prints a brief help message and exits.

=item B<-man>

Prints the manual page and exits.

=item B<-id> YKL149C

The identifier that is to be used for the header and for creating links.
This is a required field, but the "-id" can be dropped.

=item B<-comment> "further description"

Further description can be added to the file, by providing a comment option.

=item B<-fasta> foo.a2m

If the -fasta option is provided, and the -comment option is not,
then the fasta file will be searched for the id, and the comment
from the fasta id line used as the comment.

=item B<-url_type> yeast

Specifies what database to link to.
If not provided, create_summary_html will try to guess, based on the
format of the id, but the guesses are often terrible.
Currently known url types are

=over 4

=item B<yeast>

link to the SGD database from a name like YKL149C.

=item B<ncbi>

link to the NR database in Entrez at NCBI from a name like gi|...

=item B<swissprot>

link to the Swissprot database from a name like ACH7_HUMAN, though
an ncbi identifier that includes ...|sp|... can also be parsed.

=item B<casp6>

link to the casp6 data base from a name like T0196

=back

=back

=head1 DESCRIPTION

create_summary_html creates the beginning of an html page for
fold-recognition results.
It needs to be told what protein it is creating a page for (giving an id).
It will use that id to create the title and header for the HTML page.
In addition, it will try to create a link to a protein database that has
more information about the protein.

If unlabeled arguments are given to the program, it will assume that
they are -id, then -fasta (if the argument names an existing file),
then -comment (unless those have already been defined).

=head1 BUGS

No links to the genome.ucsc.edu browser are created.

No links to PDB are created.

Only casp6 is used for CASP targets, not earlier casp target lists.

Only one link is created, even if the sequence is in multiple useful
databases.

=cut