#!/usr/bin/perl -w


# To do:  ADD POD documentation!
#	Move table of BestAlignmentTypes to an external file.

# Sun Dec 30 17:06:32 PST 2007 Kevin Karplus
#	removed all use of FSSP.
# Sun Apr  6 11:33:56 PDT 2008 Kevin Karplus
#	Commented out the check for duplicate hits, so that all get reported.
# Wed Apr 23 17:35:54 PDT 2008 Kevin Karplus
#	fixed bug in priority setting, so that t06 preferred over t04.

use strict;

use FileHandle;
STDERR->autoflush(1);
use English;
use File::Basename;
use Getopt::Long;
use Pod::Usage;

# prototype declarations:
sub read_scores($\@\%\%\%);
sub read_rdb_header(*\@\%);
sub find_best_alignment($$$\%\%);
sub is_better_alignment($$$$);

    # preferences for alignment types based on observations
    #	of alignments in tests
    %::BestAlignmentType =
    	 (
	  "local-str2[+]near-backbone-11-0.8[+]0.6[+]0.8-adpstyle5" =>5
	, "local-str2[+]CB_burial_14_7-1.0[+]0.4[+]0.4-adpstyle5" =>4
	, "local-str2[+]CB_burial_14_7-0.4[+]0.4-adpstyle5" =>4     # old style name
	, "local-str2-1.0[+]0.3-adpstyle5" =>3
	, "local-str2-0.1-adpstyle5" =>3
	, "local-str-0.3-adpstyle5" => 2
	, "fssp-local-adpstyle5" => 1
	);

#main
{    
    my $target;		# name of target sequence
    my $scores_file;	# name of rdb file to find E_values in
    			# defaults to $target.best-scores.rdb
    my $max_align=9999;	#maximum number of alignments to output
    my $select_re=".";	# alignment name must match this regular-expression to be included
    GetOptions(
    	      "target=s"	=> \$target
	    , "scores_file=s"	=> \$scores_file
	    , "max_align=n"	=> \$max_align
	    , "select_re=s"	=> \$select_re
	    , "help|?"	=>	sub {pod2usage("verbose"=>1);}
	    , "man"		=> 	sub {pod2usage("verbose"=>2);}
	    )
	    or pod2usage("verbose"=>0);
    if (! defined($target))
    {   print STDERR "Error: $PROGRAM_NAME requires -target argument\n";
        pod2usage("verbose"=>0);
    }
    
    $scores_file = $target. ".best-scores.rdb" if (!defined($scores_file));
    if (! -e $scores_file)
    {    die "Error: $PROGRAM_NAME can't fnd scores file $scores_file\n";
    }
        
    my @templates = ();		# name of templates from scores file in order by e-value
    
    # hashes indexed by template name
    my %SCOP = ();	# SCOP ids from scores file
    			# (fake if begins with "fake")
    my %SUID = ();	# SCOP su-ids from scores file
    			# (fake if begins with "fake")
    my %EVALUES = ();	# evalues for templates

    read_scores($scores_file,@templates,%SCOP,%SUID,%EVALUES);
    
    # hashes indexed by template name
    my %best_align = ();	# best alignment for template
    my %cost_for_best_align = ();	# alignment e-value for best alignment

    # find the alignment to use for each template:
    foreach my $id (@templates)
    {    find_best_alignment($target, $id, $select_re,
    		%best_align,    %cost_for_best_align);
    }
    
    
    my $num_printed = 0;	# How many alignments were printed?
    print "Alignment	Evalue	Align_Evalue	SCOP	SUID\n";
    print "50S	10N	10N	50S	10S\n";
    
    # Now reduce redundancy by removing extra hits to same family
    # Check each template to see if there is a different template with
    #  SCOP family that has a better alignment.
    CHECK: foreach my $template (@templates)
    {   next CHECK if (!defined($best_align{$template}));
	my $s = $SCOP{$template};
	my $s_fake =($s =~ /^fake/);
	
# Sun Apr  6 11:33:56 PDT 2008 Kevin Karplus
# Commented out the check for duplicate hits, so that all get reported.
#	# check for duplicate hit to same SCOP domain
#	OTHER: foreach my $other (@templates)
#	{   next OTHER if ($template eq $other);	# don't supress self
#
#	    # don't suppress if SCOP is mismatch or both are blank
#	    my $so = $SCOP{$other};
#	    next OTHER if ($s ne $so || $s_fake);
#
#	    if (is_better_alignment($best_align{$other}, $EVALUES{$other},
#			$best_align{$template}, $EVALUES{$template}))
#	    {   # print STDERR "DEBUG: $template suppressed in favor of $other:\n"
#		#	. "DEBUG:	$best_align{$other} => $EVALUES{$other}\n"
#		#	. "DEBUG:	$best_align{$template} => $EVALUES{$template}\n";
#		next CHECK;
#	    }
#	}

	if ($num_printed < $max_align)
	{   # $template not suppressed, so print it
	    print "$best_align{$template}\t$EVALUES{$template}\t$cost_for_best_align{$template}\t"
		. ($s !~ /^fake/? $s:"") . "\t"
		. (defined($SUID{$template})? $SUID{$template}: "")
		. "\n";
    	    $num_printed++;
	}
    }
    

}


# read in a scores file in RDB format, saving the names of the
#	templates in order in @$templates_ref
#	Store the SCOP domains in %$scop_ref as a comma-separated list
#	Store the SCOP suids in %$suid_ref as a comma-separated list
#	Store the E-values in %$evalue_ref
sub read_scores($\@\%\%\%)
{    my ($scores_file, $templates_ref, $scop_ref, $suid_ref, $evalue_ref) = @_;
    open(SCORES_FILE, $scores_file) 
    	|| die "Error: $PROGRAM_NAME can't open $scores_file for reading.\n";
    
    my @col_names;
    my %col_num;
    read_rdb_header(*SCORES_FILE{IO}, @col_names, %col_num);
    my	$idcolnum = $col_num{"Sequence_ID"};
    #	$lencolnum = $col_num{"Length"};
    #	$BEcolnum = $col_num{"Best_Evalue"};	#unused in html output
    my $Ecolnum = $col_num{"Evalue"};
    my	$SCOPcolnum = $col_num{"SCOP_domain"};
    my	$SUIDcolnum = $col_num{"SCOP_suid"};

    my $id ;
    my $num_templates = 0;
    while(my $line = <SCORES_FILE>)
    {   chomp $line;
	my @cols = split('\t', $line);
	if ($cols[$idcolnum] =~ /\w+/)
	{   $id = $cols[$idcolnum];
	    $$templates_ref[$num_templates] = $id;
	    $num_templates++;
	    $$scop_ref{$id} = (defined($cols[$SCOPcolnum]) 
		    	&& ($cols[$SCOPcolnum] ne ""))? 
		    $cols[$SCOPcolnum] 
		    : ("fake" . $num_templates);
	    $$evalue_ref{$id} = $cols[$Ecolnum];
	    $$suid_ref{$id} = defined($cols[$SUIDcolnum])?
		    $cols[$SUIDcolnum]: " ";

	}
	else
	{   my $SCOPc = defined($cols[$SCOPcolnum])? $cols[$SCOPcolnum] : "";
	    $$scop_ref{$id} .= ",".$SCOPc;
	    $$suid_ref{$id} .= "," .
		    (defined($cols[$SUIDcolnum])? $cols[$SUIDcolnum] : " ");
	}
	# possible bug: the SCOP ids for multiple domains may be in DIFFERENT
	# orders in different templates, even if they have the same structure.
	# Check 1aipE vs 1efcA?
    }
    close(SCORES_FILE);
    print STDERR "$PROGRAM_NAME read $num_templates templates\n";

}

# read in the header of an RDB file and store the column names in @$col_names_ref
# with a reversed index in %$col_num_ref.
sub read_rdb_header(*\@\%)
{   my ($FILE,$col_names_ref,$col_num_ref) = @_;
    # skip the comments at the beginning
    while (<$FILE> )
    {	last if /.+/ && !(/^\s*#/);
    }
    
    # read the column names
    chomp;
    @$col_names_ref = split(/\t/);

    # index the column names
    for(my $i = 0; $i<scalar(@$col_names_ref); $i++)
    {   $$col_num_ref{$$col_names_ref[$i]} = $i;
        # print STDERR "column $i is $$col_names_ref[$i]\n";
    }
    
    # skip the field-length definitions
    <$FILE>;
}


# find the best alignment for $template and save its name in $$best_id_ref{$template}
# and its e-value in $$cost_best_ref{$template} (of the sequence with poorer score)
sub find_best_alignment($$$\%\%)
{   my ($target, $template, $select_re, $best_id_ref, $cost_best_ref) = @_;
    
    #find directory where candidate's alignments are stored
    if (! -d $template)
    {   print STDERR "$PROGRAM_NAME  Warning: no directory for $template\n";
        return;
    }
    
    open(THEFILES, "ls $template | ");
    while(my $filename = <THEFILES>)
    {   chomp $filename;
	next if ($filename !~ /[.]dist([.]gz)?$/);
	next if ($filename !~ /$select_re/);

	my $full_filename= "$template/$filename";

        my $align_name = $full_filename;
	$align_name =~ s/[.]dist([.]gz)?/.a2m/;
	if (! -e $align_name)
	{   my $align_name_gz = $align_name . ".gz";
	    if (! -e $align_name_gz)
	    {   print STDERR "$PROGRAM_NAME: Warning: neither $align_name nor $align_name_gz exist\n";
	    	next;
	    }
	    $align_name = $align_name_gz;
	}
	
	if ($filename =~ /[.]dist[.]gz$/)
	{     open(DISTFILE, "gunzip -c $full_filename|") 
		|| die "Error: $PROGRAM_NAME can't open $full_filename for reading\n";
	}
	else
	{     open(DISTFILE, "<$full_filename") 
		|| die "Error: $PROGRAM_NAME can't open $full_filename for reading\n";
	}
	
	my $template_cost=9999999999999999;
	my $target_cost=9999999999999999;
	while(my $l = <DISTFILE>)
	{   next if $l =~ /^[%#]/;
	
	    my @cols = split(" ", $l);
	    
	    if ($cols[0] eq $template) 
	    {	$template_cost= $cols[4];
	    }
	    if ($cols[0] eq $target) 
	    {	$target_cost= $cols[4];
	    }
	}
	close(DISTFILE);
	
	# Fri Apr 18 14:06:48 PDT 2008 Kevin Karplus
	# Changed warning to only complain when both sequences are
	# missing from dist file, since new alignment method for
	# target models only runs template sequence, not both.
	if (!defined($target_cost) && !defined($template_cost))
	{   print STDERR "$PROGRAM_NAME  Warning: no $target in $full_filename\n";
	    print STDERR "$PROGRAM_NAME  Warning: no $template in $full_filename\n";
	    next;
	}
	
	# the cost is the more expensive of the two sequences
	my $cost = ($target_cost<$template_cost)? $template_cost: $target_cost;
	
	if (is_better_alignment($align_name, $cost, 
			$$best_id_ref{$template}, $$cost_best_ref{$template}))
	{   $$best_id_ref{$template} = $align_name;
	    $$cost_best_ref{$template} = $cost;
	}
    }
    close(THEFILES);
    
    # print STDERR "DEBUG: Best alignment for $template is $$best_id_ref{$template}\n";
}


# compare alignment with name1 and cost1 against
#	alignment with name2 and cost2
# return 1 if name1 to be preferred, 0 if name2 to be preferred
sub is_better_alignment($$$$)
{   my ($name1, $cost1, $name2, $cost2) = @_;
    return 1 if (!defined($name2) || !defined($cost2));
    
    # print STDERR "DEBUG: name1=$name1 name2=$name2\n";    
    
    # get priorities for alignments
    my $priority1=0;
    my $priority2=0;
    foreach my $al_type (keys %::BestAlignmentType)
    {   my $match1 = ($name1 =~ /$al_type/);
        $priority1 = $::BestAlignmentType{$al_type} if $match1;
#	print STDERR "DEBUG: $name1 ". ($match1? "matches ": "doesn't match ")
#		.$al_type."\n";
	my $match2 = ($name2 =~ /$al_type/);
#	print STDERR "DEBUG: $name2 ". ($match2? "matches ": "doesn't match ")
#		.$al_type."\n";
	$priority2 = $::BestAlignmentType{$al_type} if $match2;
    }
    $priority1 += 1.5 if ($name1 =~ /-t04-/);
    $priority2 += 1.5 if ($name2 =~ /-t04-/);
    
    $priority1 += 2.5 if ($name1 =~ /-t06-/);
    $priority2 += 2.5 if ($name2 =~ /-t06-/);
    
     print STDERR "DEBUG: $name1 has priority $priority1  $name2 has priority $priority2\n";
    
    return 1 if ($priority1 > $priority2);
    return 0 if ($priority1 < $priority2);
    # if they have the same priority, the lower cost is better
    return $cost1 < $cost2;
}