#!/usr/bin/perl -w # Sat Oct 29 10:33:40 PDT 2005 Kevin Karplus # split-into-domains < foo.a2m # creates a number of subdirectories, each getting a Makefile and an # a2m file. The a2m file has a portion of the sequence read from # stdin, and the Makefile is set up to do a structure prediction for # that portion. =pod =head1 NAME split-into-domains -- splitting a sequence into overlapping pieces for separate prediction =head1 SYNOPSIS split-into-domains --length 240 --advance 100 --nomake < foo.a2m Options: -help brief help -man detailed help -length 240 how long is each piece (except last) -advance 100 how much to advance between beginnings of pieces =head1 OPTIONS =over 4 =item B<-help> Print a brief help message and exits. =item B<-man> Prints the manual page and exits. =item B<-length> 240 Specifies how long each piece (except the last one) is. If the sequence is shorter than the length, no subparts are created, unless -first is greater than 1. The default value is 240, which is long enough for most common domains. =item B<-advance> 100 Specifies how far forward to move for each subsequent piece. =item B<-first> 1 Specifies which residue to start with (1-based numbering) for the first part. Note: to select a single domain, specify -first and -length, and make -advance be larger than the total chain length. =item B<-min_length> 100 Specifies the minimum length to accept for a subpart being created. If not specified, it defaults to the smaller of -advance and -length. =item B<-make> If --make is set, calls para-trickle-make to start make in each created subdirectory. Default is --nomake. =back =head1 DESCRIPTION split-into-domains is misleadingly named---it does not actually look for domains in a protein sequence, but splits the protein into overlapping pieces of a fixed length for separate prediction. The input (from STDIN) is a fasta-formatted file containing a single protein sequence. For each piece, a subdirectory of the current directory is created, with a name that reflects the range of the piece. Within the new directory, three files are created: a Makefile, a fasta file that contains the piece, and a README file that gives the date and how the piece was created. The name for the fasta file that is created is taken from the name of the sequence that is being split up, and the sequence in the fasta file has the same name as the original sequence, except that the name has all all special characters converted to underbars, for safety. =cut # CHANGE LOG # Fri Dec 9 06:07:18 PST 2005 Kevin Karplus # Modified min_length computation so that single chain can be # created if --first >1 and --min_length not specified. use strict; use FileHandle; STDERR->autoflush(1); use English; use File::Basename; use Getopt::Long; use Pod::Usage; # function prototypes sub read_sequence(); sub create_part($$$); #hardwired filenames my $Make_main = "/projects/compbio/experiments/protein-predict/casp7/starter-directory/Make.main"; my $para_trickle = "/projects/compbio/experiments/models.97/scripts/para-trickle-make"; my $fixmode = "/projects/compbio/bin/fixmode"; #main { my $part_length=240; # how long is each part (except last few) my $part_advance=100; # how far to advance for each piece my $part_first = 1; # first subscript (1-based numbering) my $min_length; # don't bother with fragments shorter than $min_length my $make=0; # set to start a make for each subdirectory on the default cluster GetOptions( "advance=n" => \$part_advance , "length=n" => \$part_length , "minlength=n" => \$min_length , "first=n" => \$part_first , "make!" => \$make , "help|?" => sub {pod2usage("verbose"=>1);} , "man" => sub {pod2usage("verbose"=>2);} ) or pod2usage("verbose"=>0); my $id; # name of id sequence my $sequence; # sequence of amino acids for the protein ($id, $sequence) = read_sequence(); my $len = length($sequence); if (!defined($min_length)) { $min_length = $part_advance<$part_length? $part_advance: $part_length; if ($part_first >1 && $min_length > $len-$part_first +1) { $min_length = $len-$part_first +1; } } for (my $start=$part_first-1; $start+$min_length<=$len; $start+= $part_advance) { my $dir=create_part($id, substr($sequence, $start, $part_length), $start+1); system("$fixmode $dir"); if ($make) { system("cd $dir; $para_trickle -rmvdir -se2log " ."-quick \'make -k >& make.log; gzip -9f make.log\' " )==0 || die "Error: can't run $para_trickle\n"; print STDERR "Started in $dir\n"; } else { print STDERR "Created $dir, but no make started\n"; } } } # Read a single fasta sequence from STDIN. # Return the ID for the sequence and the sequence itself. sub read_sequence() { my $id; # name of id sequence my $sequence; # sequence of amino acids for the protein while() { chomp; # Mon Jan 23 07:40:22 PST 2006 Kevin Karplus tr/\r//d; # remove WINDOZE ^M charaters if (/^>([^, ]+)/) { # found an id line last if (defined($id)); # first sequence ended $id=$1; next; } next if (/^\s*#/); # comment line (non-standard) ignored tr/A-Za-z//cd; # eliminate everything except letters if (defined($sequence)) { $sequence .= $_; } else { $sequence = $_; } } # print STDERR "DEBUG: id='$id'\n"; $id =~ tr/A-Za-z0-9_/_/c; #replace all non-alphamerics with _ for safety return($id, $sequence); } # Create a subdirectory for the specified sequence # The first argument is the name of the sequence. # The second argument is the subsequence to use. # The third argument is the number (1-based) of the residue that # starts the sequence. # Return the name of the created directory (not the full path). sub create_part($$$) { my ($id, $seq, $first) = @_; my $len = length($seq); # construct a directory name from the range of residues my $dirname = substr($seq,0,1).$first."-".substr($seq,-1).($first+$len-1); mkdir $dirname; open(FASTA, ">$dirname/$id.a2m") || die "Error: couldn't open $dirname/$id.a2m for writing\n"; print FASTA ">$id\n"; for(my $s=0; $s<$len; $s+=50) { print FASTA substr($seq,$s,50)."\n"; } close FASTA; open(README, ">$dirname/README") || die "Error: couldn't open $dirname/README for writing\n"; print README localtime() . "\n\n"; print README "split-into-domains created subdirectory for $dirname of $id\n\n"; close README; open(MAKEFILE, ">$dirname/Makefile") || die "Error: couldn't open $dirname/Makefile for writing\n"; print MAKEFILE "TARGET:=$id\n" ."START_COL:=$first\n\n" ."include $Make_main\n"; close MAKEFILE; return $dirname; }