)
{ if (/^>(\S+) (.*)/ && $1 eq $id)
{ $comment = $2;
}
}
close(FASTA);
}
my $url = make_url($id,$comment,$url_type);
my $link = (defined($url)? "$id"
: $id);
print "\n\n";
print "SAM_T02 Results for $id\n";
print "\n\n";
print "\n\n";
print "SAM_T02 Results for $link
\n";
print "
\n";
print "\n";
print "$link";
print " $comment" if defined($comment);
print "
\n";
print "
\n";
print "\n\n";
print "\n";
print "Description | \n";
print "Link to file | \n";
print "Date | \n";
print "
\n";
print "
\n";
print "
\n";
}
# make_url($id, $comment, $url_type)
#
# Create a URL for an external database page from a protein name.
#
#   $id        protein identifier, e.g. YKL149C, gi|123|..., ACH7_HUMAN, T0196
#   $comment   free-text description; accepted for interface compatibility
#              but not needed to build any of the current URLs
#   $url_type  "casp6", "yeast", "ncbi" or "swissprot" (case-insensitive),
#              or undef to guess the type from the shape of $id
#
# Returns the URL string.  Returns undef, after a warning, when the type
# cannot be guessed or when $id does not contain the identifier that the
# requested type needs.  An unknown $url_type prints an error on STDERR and
# calls pod2usage(), which by default terminates the program.
sub make_url($$$)
{
    my ($id, $comment, $url_type) = @_;
    my $lc_id = lc($id);

    # Placeholder interpolated into a URL when the id lacks the piece that
    # URL needs; its presence in the final URL marks the URL as unusable.
    my $missing = "**NOT_DEFINED**";

    # try to see id as an NCBI identifier
    my $ncbi_id;
    ($id =~ /^gi[|](\d+)[|]/) && ($ncbi_id = $1);

    # try to see id as a Swissprot identifier: either an "sp|ACCESSION|"
    # field anywhere in a |-separated id (so gi|...|sp|...|... works),
    # or a bare NAME_SPECIES style name
    my $sprot_id;
    ($id =~ /(?:^|[|])sp[|]([^|]+)[|]/) && ($sprot_id = $1);
    if (!defined($sprot_id))
    {
        ($id =~ /^[A-Za-z0-9]+_[A-Za-z0-9]+$/) && ($sprot_id = $id);
    }

    # yeast ORF names and CASP targets are recognized only for guessing;
    # their URLs are built from the raw id, so no validation happens when
    # those url types are requested explicitly
    my $looks_like_yeast = ($id =~ /^[A-Za-z][A-Za-z][A-Za-z]\d\d\d[CW]/);
    my $looks_like_casp  = ($id =~ /^[tT]\d+$/);

    if (!defined($url_type))
    {
        # try to guess what sort of id this is
        if    (defined($ncbi_id))  { $url_type = "ncbi"; }
        elsif (defined($sprot_id)) { $url_type = "swissprot"; }
        elsif ($looks_like_yeast)  { $url_type = "yeast"; }
        elsif ($looks_like_casp)   { $url_type = "casp6"; }
        # TO DO: add genome browser guesses?
        else
        {
            warn "Warning: -url_type not specified, and can't guess from $id\n";
            return undef;
        }
    }

    $ncbi_id  = $missing if (!defined($ncbi_id));
    $sprot_id = $missing if (!defined($sprot_id));
    $url_type = lc($url_type);

    my %url_map = (
        "casp6" => "http://predictioncenter.llnl.gov/casp6/targets/templates/$lc_id.doc.html"
        , "yeast" => "http://genome-www4.stanford.edu/cgi-bin/SGD/singlepageformat?locus=$id"
        , "ncbi" => "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=protein&list_uids=$ncbi_id&dopt=GenPept"
        , "swissprot" => "http://www.expasy.org/cgi-bin/niceprot.pl?$sprot_id"
        # TO DO: add genome browser
    );

    my $url = $url_map{$url_type};
    if (!defined($url))
    {
        # sorted so the list of choices is printed in a stable order
        print STDERR "Error: url_type $url_type not understood. Use one of\n\t"
            . join(" ", sort(keys(%url_map))) . "\n";
        pod2usage("verbose" => 0);
        # only reached if pod2usage has been told not to exit
        return undef;
    }

    if ($url =~ /\Q$missing\E/)
    {
        warn "Warning: -url_type $url_type specified or guessed,\n"
            . " but '$id' not of appropriate type.\n";
        return undef;
    }
    return $url;
}
__END__
=pod
=head1 NAME
create_summary_html -- create the beginning part of a summary.html file
=head1 SYNOPSIS
create_summary_html -id YKL149C
-comment "DBR1 lariat-debranching enzyme" > summary.html
Options:
-help brief help
-man detailed help
-id The identifier to create a page for.
-comment Used for adding a comment further identifying the protein.
-fasta Specifies a fasta file which will be searched for the id,
to provide a comment from the rest of the fasta id line.
-url_type specifies which database should be linked to
for the id.
=head1 OPTIONS
=over 4
=item B<-help>
Prints a brief help message and exits.
=item B<-man>
Prints the manual page and exits.
=item B<-id> YKL149C
The identifier that is to be used for the header and for creating links.
This is a required field, but the "-id" can be dropped.
=item B<-comment> "further description"
Further description can be added to the file, by providing a comment option.
=item B<-fasta> foo.a2m
If the -fasta option is provided, and the -comment option is not, then
the fasta file will be searched for the id, and the comment from the
fasta id line used as the comment.
=item B<-url_type> yeast
Specifies what database to link to. If not provided,
create_summary_html will try to guess, based on the format of the id,
but the guesses are often terrible.
Currently known url types are
=over 4
=item B<yeast>
link to the SGD database from a name like YKL149C.
=item B<ncbi>
link to the NR database in Entrez at NCBI from a name like gi|...
=item B<swissprot>
link to the Swissprot database from a name like ACH7_HUMAN, though an
ncbi identifier that includes ...|sp|... can also be parsed.
=item B<casp6>
link to the casp6 database from a name like T0196
=back
=back
=head1 DESCRIPTION
create_summary_html creates the beginning of an html page for
fold-recognition results.
It needs to be told what protein it is creating a page for (giving an id).
It will use that id to create the title and header for the HTML page.
In addition, it will try to create a link to a protein database that has
more information about the protein.
If unlabeled arguments are given to the program, it will assume that
they are -id then -fasta (unless those have already been defined).
=head1 BUGS
No links to the genome.ucsc.edu browser are created.
No links to PDB are created.
Only casp6 is used for CASP targets, not earlier casp target lists.
Only one link is created, even if the sequence is in multiple useful
databases.
=cut