#!/usr/bin/perl -w

# Sat Oct 29 10:33:40 PDT 2005 Kevin Karplus

# split-into-domains < foo.a2m
# creates a number of subdirectories, each getting a Makefile and an
# a2m file.  The a2m file has a portion of the sequence read from
# stdin, and the Makefile is set up to do a structure prediction for
# that portion.

=pod

=head1 NAME

split-into-domains -- splitting a sequence into overlapping pieces for separate prediction

=head1 SYNOPSIS

split-into-domains --length 240 --advance 100 --nomake < foo.a2m

 Options:
 	-help	brief help 
	-man	detailed help
	-length	240 how long is each piece (except last)
	-advance 100 how much to advance between beginnings of pieces

=head1 OPTIONS

=over 4

=item B<-help>

Print a brief help message and exits.

=item B<-man>

Prints the manual page and exits.

=item B<-length> 240

Specifies how long each piece (except the last one) is.
If the sequence is shorter than the length, no subparts are created,
unless -first is greater than 1.
The default value is 240, which is long enough for most common domains.

=item B<-advance> 100

Specifies how far forward to move for each subsequent piece.

=item B<-first> 1

Specifies which residue to start with (1-based numbering) for the
first part.  Note: to select a single domain, specify -first and
-length, and make -advance be larger than the total chain length.

=item B<-min_length> 100

Specifies the minimum length to accept for a subpart being created.
If not specified, it defaults to the smaller of -advance and -length.

=item B<-make>

If --make is set, calls para-trickle-make to start make in each
created subdirectory.  Default is --nomake.

=back

=head1 DESCRIPTION

split-into-domains is misleadingly named---it does not actually
look for domains in a protein sequence, but splits the protein into
overlapping pieces of a fixed length for separate prediction.
The input (from STDIN) is a fasta-formatted file containing a single
protein sequence.

For each piece, a subdirectory of the current directory is created,
with a name that reflects the range of the piece.  Within the new
directory, three files are created: a Makefile, a fasta file that
contains the piece, and a README file that gives the date and how the
piece was created.

The name for the fasta file that is created is taken from the name of
the sequence that is being split up, and the sequence in the fasta
file has the same name as the original sequence, except that the name
has all all special characters converted to underbars, for safety.

=cut

# CHANGE LOG
# Fri Dec  9 06:07:18 PST 2005 Kevin Karplus
#	Modified min_length computation so that single chain can be
# 	created if --first >1 and --min_length not specified.


use strict;

use FileHandle;
STDERR->autoflush(1);
use English;
use File::Basename;
use Getopt::Long;
use Pod::Usage;

# function prototypes
sub read_sequence();
sub create_part($$$);

#hardwired filenames

my $Make_main = "/projects/compbio/experiments/protein-predict/casp7/starter-directory/Make.main";
my $para_trickle = "/projects/compbio/experiments/models.97/scripts/para-trickle-make";
my $fixmode = "/projects/compbio/bin/fixmode";

#main
{    
    my $part_length=240;	# how long is each part (except last few)
    my $part_advance=100;	# how far to advance for each piece
    my $part_first = 1;		# first subscript (1-based numbering)
    my $min_length;		# don't bother with fragments shorter than $min_length
    my $make=0;			# set to start a make for each subdirectory on the default cluster
    GetOptions(
    	      "advance=n"	=> \$part_advance
	    , "length=n"	=> \$part_length
	    , "minlength=n"	=> \$min_length
	    , "first=n"		=> \$part_first
	    , "make!"		=> \$make
	    , "help|?"	=>	sub {pod2usage("verbose"=>1);}
	    , "man"		=> 	sub {pod2usage("verbose"=>2);}
	    )
	    or pod2usage("verbose"=>0);
    

    my $id;		# name of id sequence
    my $sequence;	# sequence of amino acids for the protein
    ($id, $sequence) = read_sequence();    
    my $len = length($sequence);
    
    if (!defined($min_length))
    {	$min_length = $part_advance<$part_length? $part_advance: $part_length;
	if ($part_first >1 && $min_length > $len-$part_first +1)
	{   $min_length = $len-$part_first +1;
	}
    }

    
    for (my $start=$part_first-1; $start+$min_length<=$len; $start+= $part_advance)
    {   my $dir=create_part($id, substr($sequence, $start, $part_length), $start+1);
    	system("$fixmode $dir");
	if ($make)
	{    system("cd $dir; $para_trickle -rmvdir -se2log "
		."-quick \'make -k  >& make.log; gzip -9f make.log\' "
		)==0 || die "Error: can't run $para_trickle\n";
	    print STDERR "Started in $dir\n";
	}
	else
	{    print STDERR "Created $dir, but no make started\n";
	}
    }
}


# Read a single fasta sequence from STDIN.
# Return the ID for the sequence and the sequence itself.
sub read_sequence()
{
    my $id;		# name of id sequence
    my $sequence;	# sequence of amino acids for the protein
    while(<STDIN>)
    {   chomp;
        		# Mon Jan 23 07:40:22 PST 2006 Kevin Karplus
	tr/\r//d;	# remove WINDOZE ^M charaters
	
	if (/^>([^,	 ]+)/)
    	{   # found an id line
	    last if (defined($id));		# first sequence ended
	    $id=$1;
	    next;
	}
	next if (/^\s*#/);	# comment line (non-standard) ignored
	tr/A-Za-z//cd;		# eliminate everything except letters
	if (defined($sequence)) 
	{    $sequence .= $_;
	}
	else
	{    $sequence = $_;
	}
    }
#    print STDERR "DEBUG: id='$id'\n";
    $id =~ tr/A-Za-z0-9_/_/c;	#replace all non-alphamerics with _ for safety
    return($id, $sequence);
}


# Create a subdirectory for the specified sequence
#	The first argument is the name of the sequence.
#	The second argument is the subsequence to use.
#	The third argument is the number (1-based) of the residue that
#	starts the sequence.
# Return the name of the created directory (not the full path).
sub create_part($$$)
{   my ($id, $seq, $first) = @_;
    my $len = length($seq);
    
    # construct a directory name from the range of residues
    my $dirname = substr($seq,0,1).$first."-".substr($seq,-1).($first+$len-1);
    mkdir $dirname;
    
    open(FASTA, ">$dirname/$id.a2m")
    	|| die "Error: couldn't open $dirname/$id.a2m for writing\n";
    print FASTA ">$id\n";
    for(my $s=0; $s<$len; $s+=50)
    {    print FASTA substr($seq,$s,50)."\n";
    }
    close FASTA;
    
    open(README, ">$dirname/README")
    	|| die "Error: couldn't open $dirname/README for writing\n";
    print README localtime() . "\n\n";
    print README "split-into-domains created subdirectory for $dirname of $id\n\n";
    close README;
    
    open(MAKEFILE, ">$dirname/Makefile")
    	|| die "Error: couldn't open $dirname/Makefile for writing\n";
    print MAKEFILE "TARGET:=$id\n"
    	."START_COL:=$first\n\n"
	."include $Make_main\n";
    close MAKEFILE;

    return $dirname;
}