#!/usr/bin/perl -w

# Usage: 
#	best_scores seed.t2k-100-30-dssp-ebghstl-scores.rdb  . . . > seed.combined-scores.rdb
#
#   optional arguments:
#	-num 10		minimum number of hits to output
#	-maxnum 100	maximum number of hits to output
#	-E  0.10	report any hit with this Evalue or better in ANY of the input files

# Read in all the score files.
# Output a "best hits" list (sorted by Evlaue) in RDB format that includes
# Sequence_ID Length Best_Evalue  Evalue FSSP-rep SCOP_domain SCOP_suid

# The evalue is calculated as a weighted geometric average of the values
# in the different RDB values.
# There is a default E value for missing values, and special treatment for
#	hits that are not part of the template library.

    
# Sun Jan  2 09:10:56 PST 2005 Kevin Karplus
# 	Added -maxnum parameter, and changed default num to 10
# Thu Jul 13 17:32:01 PDT 2006 Kevin Karplus
#	Added weights for o_notor2, n_sep, and o_sep alphabets.
#	Adjusted weights to rely less on 3-track HMMs
#	Fixed bug that did not check whether template was in t06 library.
    
# prototypes
sub print_usage_exit ();
sub print_best($$$\%);
sub read_scop_suid($\%);
sub read_rdb_header(*\@\%);

#main
{    
    use FileHandle;
    STDERR->autoflush(1);
    use Getopt::Long;
    use English;
    use File::Basename;
    
    use lib dirname($PROGRAM_NAME);
    use ChainIds;
    
    # defaults
    my $min_num_best = 10;
    my $max_num_best = 100;
    my $E_thresh = 0.10;
    my $LIBSIZE = 9000;
    undef @rdb_files;
    my $scop_file = "/projects/compbio/data/scop/dir.cla.scop.txt.gz";

    GetOptions(	  "num=n" => \$min_num_best
    		, "maxnum=n" => \$max_num_best
		, "E=f"	=> \$E_thresh
		, "lib_size=n" => \$LIBSIZE
		, "scop_file=s" => \$scop_file
		);
    @rdb_files = @ARGV;
    &print_usage_exit if(scalar(@rdb_files)<2) ;
    
    $weight{"t2k-100-30-dssp-ebghstl"} = 2.0;
    $weight{"t2k-100-30-stride-ebghtl"} = 2.0;
    $weight{"t2k-100-30-bys"} = 1.0;
    $weight{"t2k-100-30-str"} = 1.0;
    $weight{"t2k-100-30-str2"} = 3.0;
    $weight{"t2k-100-30-alpha"} = 2.0;
    $weight{"t2k-100-30-n_notor2"} = 1.5;
    $weight{"t2k-100-30-o_notor2"} = 1.5;
    $weight{"t2k-100-30-n_sep"} = 1.5;
    $weight{"t2k-100-30-o_sep"} = 1.5;
    $weight{"t2k-100-30-dssp-ehl2"} = 3.0;
    $weight{"t2k-100-30-near-backbone-11"} = 3.0;
    $weight{"t2k-100-30-CB_burial_14_7"} = 2.0;
    $weight{"t2k-100-40-40-str+CB_burial_14_7"} = 2.0;
    $weight{"t2k-100-40-40-str2+CB_burial_14_7"} = 2.0;
    $weight{"t2k-80-60-80-str2+near-backbone-11"} = 2.0;
    $weight{"t2k-w0.5"} = 2.0;
    
    $weight{"t04-100-30-dssp-ebghstl"} = 2.0;
    $weight{"t04-100-30-stride-ebghtl"} = 2.0;
    $weight{"t04-100-30-bys"} = 1.0;
    $weight{"t04-100-30-str"} = 1.0;
    $weight{"t04-100-30-str2"} = 4.0;
    $weight{"t04-100-30-alpha"} = 2.0;
    $weight{"t04-100-30-dssp-ehl2"} = 3.0;
    $weight{"t04-100-30-n_notor2"} = 1.5;
    $weight{"t04-100-30-o_notor2"} = 1.5;
    $weight{"t04-100-30-n_sep"} = 1.5;
    $weight{"t04-100-30-o_sep"} = 1.5;
    $weight{"t04-100-30-near-backbone-11"} = 4.0;
    $weight{"t04-100-30-CB_burial_14_7"} = 2.0;
    $weight{"t04-100-40-40-str+CB_burial_14_7"} = 2.0;
    $weight{"t04-100-40-40-str2+CB_burial_14_7"} = 2.0;
    $weight{"t04-80-60-80-str2+near-backbone-11"} = 2.0;
    $weight{"t04-w0.5"} = 3.0;

    $weight{"t06-100-30-dssp-ebghstl"} = 2.0;
    $weight{"t06-100-30-stride-ebghtl"} = 2.0;
    $weight{"t06-100-30-bys"} = 1.0;
    $weight{"t06-100-30-str"} = 1.0;
    $weight{"t06-100-30-str2"} = 4.0;
    $weight{"t06-100-30-alpha"} = 2.0;
    $weight{"t06-100-30-n_notor2"} = 1.5;
    $weight{"t06-100-30-o_notor2"} = 1.5;
    $weight{"t06-100-30-n_sep"} = 1.5;
    $weight{"t06-100-30-o_sep"} = 1.5;
    $weight{"t06-100-30-dssp-ehl2"} = 3.0;
    $weight{"t06-100-30-near-backbone-11"} = 4.0;
    $weight{"t06-100-30-CB_burial_14_7"} = 2.0;
    $weight{"t06-100-40-40-str2+CB_burial_14_7"} = 2.0;
    $weight{"t06-80-60-80-str2+near-backbone-11"} = 2.0;
    $weight{"t06-w0.5"} = 3.0;
    
    $weight{"template-lib"} = 6.0;
    $weight{"t2k-template-lib"} = 6.0;
    $weight{"t04-template-lib"} = 8.0;
    $weight{"t06-template-lib"} = 9.0;

    # if a chain is not one of our template id's, we report
    # the one-track target E-value, averaged with the default Evalue.
    #  for the default Evalue for missing scores, take the geometric mean of
    #  the best unreported score (assuming -Emax 40.) and the library size,
    #  but weight the emax value more
    my $threshold_weight= 3.0;
    $LOG_DEFAULT_EVALUE = ($threshold_weight*log(40.)+log($LIBSIZE))/ ($threshold_weight+1.0);

    # build a hash of all ID's in our template library
    # that we can use to check membership for ID's in the files
    $T2K_ID_FILE = "/projects/compbio/experiments/models.97/indexes/t2k.ids";
    ReadIDs($T2K_ID_FILE, %t2k_ids);

    # build a hash of all ID's in our template library
    # that we can use to check membership for ID's in the files
    $T04_ID_FILE = "/projects/compbio/experiments/models.97/indexes/t04.ids";
    ReadIDs($T04_ID_FILE, %t04_ids);

    # build a hash of all ID's in our template library
    # that we can use to check membership for ID's in the files
    $T06_ID_FILE = "/projects/compbio/experiments/models.97/indexes/t06.ids";
    ReadIDs($T06_ID_FILE, %t06_ids);


    # each chainID will be the key in a variety of hashes
    # that we'll use to keep track of all the information
    # our hashes will be:
    %len = ();
    %BestLogEvalue = ();
    %logEvalue = ();
    %total_weight = ();
    %FSSP_rep = ();
    %SUID = ();	# comma-separated list of SUIDs for chain
    
    # mapping from SUID values to SCOP codes.
    my %SCOP_from_SUID;
    read_scop_suid($scop_file,%SCOP_from_SUID);
    

    #  we will have multiple entries under each key
    #  for many of these.  For instance, each key that
    #  is in our template library will have multiple (weighted)
    #  Evalues, keys may have several SCOP_domains, etc.
    # Weights for the E-values are set by which file we get the id from.

    # $desired_total_weight will be the sum of the weights for all files.
    my $desired_total_weight = 0;
    
    # how much weight is available due to membership in template libraries
    my $total_t2k_weight=0;
    my $total_t04_weight=0;
    my $total_t06_weight=0;


    foreach $scorefile (@rdb_files)
    {
	# print STDERR "reading $scorefile\n";
	# we want to know which file we're in to assign E-value weights
	$scorefile =~ /\S+[.](t\S+)-scores.rdb/;
	$score_type = $1;
	if (!defined ($weight{$score_type}))
	{	die "Invalid input file: $scorefile, unrecognized type $score_type\n";
	}
	$score_wt = $weight{$score_type};
	$desired_total_weight += $score_wt;
        $total_t2k_weight += $score_wt if (($scorefile =~ /t2k/) && ($scorefile !~ /t2k-w0.5/));
        $total_t04_weight += $score_wt if (($scorefile =~ /t04/) && ($scorefile !~ /t04-w0.5/));
        $total_t06_weight += $score_wt if (($scorefile =~ /t06/) && ($scorefile !~ /t06-w0.5/));

	open (SCOREFILE, $scorefile);
	my @col_names;
	my %col_num;
	read_rdb_header(*SCOREFILE{IO}, @col_names, %col_num);
	$idcolnum = $col_num{"Sequence_ID"};
	$idcolnum = $col_num{"Template_ID"} if !defined($idcolnum);
	$idcolnum = $col_num{"HMM"} if !defined($idcolnum);
	
	$lencolnum = $col_num{"Length"};
	$lencolnum = $col_num{"Template_Length"} if !defined($lencolnum);
	$lencolnum = $col_num{"HMMLEN"} if !defined($lencolnum);
	
	$Ecolnum = $col_num{"Target_Evalue"};
	$Ecolnum = $col_num{"Template_Evalue"} if !defined($Ecolnum);
	$Ecolnum = $col_num{"EVALUE"} if !defined($Ecolnum);
	
	$FSSPcolnum = $col_num{"FSSP-rep"};
	$SCOPcolnum = $col_num{"SCOP_domain"};
	$suidcolnum = $col_num{"SCOP_suid"};
	
	while($line=<SCOREFILE>)
	{   chomp($line);
	    @cols = split( /\t/, $line ) ; 
	    $id = $cols[$idcolnum];
	    $length = $cols[$lencolnum];
	    $Evalue = $cols[$Ecolnum];
	    $FSSP = $cols[$FSSPcolnum] if defined($FSSPcolnum);
	    $SCOP = $cols[$SCOPcolnum] if defined($SCOPcolnum);
	    $SCOP_suid = $cols[$suidcolnum] if defined($suidcolnum);

	    # line which gives only additional scop domains
	    if (!($id =~ /\S+/))
	    {  	# no chainID found in the first column (use id from previous line)
		if ( ($SCOP =~ /^\S+/) && ($SCOP_suid  =~ /^\S+/) )
		{   # if SCOP defined
		    $SUID{$lastID} .= ",$SCOP_suid";
		}
		next;
	    }
	    $FSSP_rep{$id} = $FSSP if defined($FSSPcolnum);
	    push @{$len{$id}}, $length;

	    #  add the log E-value, multiplied by the scaling factor
	    $logE = $Evalue<=0? -999: log($Evalue);
	    if (!defined($logEvalue{$id})) 
	    {
		$logEvalue{$id} = $logE*$score_wt;
		$total_weight{$id} = $score_wt;
	    }
	    else 
	    {
		$logEvalue{$id} += $logE*$score_wt;
		$total_weight{$id} += $score_wt;
	    }

	    if (!defined($BestLogEvalue{$id}) || $logE < $BestLogEvalue{$id})
	    {   $BestLogEvalue{$id} = $logE;
	    } 

	    # load up SCOP domain info if it's there
	    if (defined($SCOPcolnum) && defined($SCOP_suid)
	    	&& ($SCOP =~ /^\S+/) && ($SCOP_suid  =~ /^\S+/) )
	    {    $SUID{$id}=$SCOP_suid;
	    }
	    $lastID = $id;
	}
	close(SCOREFILE);
    }


#    print STDERR "DEBUG: desired weight= $desired_total_weight\n";
    # compute the Evals with our current rules for missing weights.
    foreach $id (keys %len)
    {
        # how much of the possible weight did this id get
	my $MissingWeight = $desired_total_weight - $total_weight{$id};
	$MissingWeight -= $total_t2k_weight if (!defined ($t2k_ids{$id}));
	$MissingWeight -= $total_t04_weight if (!defined ($t04_ids{$id}));
	$MissingWeight -= $total_t06_weight if (!defined ($t06_ids{$id}));
	
	# penalty for sequences not in template library
	$MissingWeight+= 1.0 if (!defined($t2k_ids{$id})
				 && !defined($t04_ids{$id})
				 && !defined($t06_ids{$id})
				 );
        
	$MissingWeight = 0 if ($MissingWeight < 0);

	my $avg=$logEvalue{$id}/$total_weight{$id};
#	print STDERR "DEBUG: $id total_weight= $total_weight{$id} missing_weight= $MissingWeight, $avg => ";
	$logEvalue{$id} = ($logEvalue{$id} +$MissingWeight*$LOG_DEFAULT_EVALUE)/
		($total_weight{$id} + $MissingWeight);
#	print STDERR "$logEvalue{$id}\n";
    }
    
    print_best($min_num_best, $max_num_best, $E_thresh, %SCOP_from_SUID);
}

sub print_usage_exit ()
{
    print "Usage: best_scores [-num 30] [-E 0.10] seed.t2k-100-30-dssp-ebghstl-scores.rdb  \\\n";
    print "	seed.t2k-100-30-stride-ebghtl-scores.rdb seed-template-lib-scores.rdb . . . > tmp.best.rdb\n";
    print "Must have at least 2 rdb files to combine, or there's no point.\n";
    exit(-1);
}


# print out the top N sequences, sorted by Evalue
#	also any with BestEvalue < threshold
sub print_best($$$\%)
{    my ($min_num_best, $max_num_best, $Ethresh,$scop_ref ) = @_;
    # sort ids by increasing $logEvalue
    @sorted_ids = sort {$logEvalue{$a} <=> $logEvalue{$b}} keys(%logEvalue);
    
    # print headers
    print "Sequence_ID\tLength\tBest_Evalue\tEvalue\tFSSP-rep\tSCOP_domain\tSCOP_suid\n";
    print "5S\t5N\t12N\t12N\t8S\t12S\t12N\n";
    
    my $logthresh = log($Ethresh);
    my $lines_printed = 0;
    
    foreach $id (@sorted_ids)
    {   last if $lines_printed >= $max_num_best;
        next if $lines_printed>= $min_num_best && $BestLogEvalue{$id} > $logthresh;
	$lines_printed ++;
	$bestScore   = sprintf "%5.4e", exp($BestLogEvalue{$id});
	$prettyScore = sprintf "%5.4e", exp($logEvalue{$id});
	$FSSP_id = defined($FSSP_rep{$id})? $FSSP_rep{$id} : "";

	
	if (defined($SUID{$id}))
	{   my $suid_string =  $SUID{$id};
	    my @suids = sort(split(',', $suid_string));
	    $suid_string = join(',', @suids);
	    my $scop_string = join(',', map($$scop_ref{$_}, @suids));
	
    	    print"$id\t$len{$id}->[0]\t$bestScore\t$prettyScore\t$FSSP_id\t$scop_string\t$suid_string\n";
	}
	else
	{	# print line without any domain information
	    print "$id\t$len{$id}->[0]\t$bestScore\t$prettyScore\t$FSSP_id\t\t\n";
	}

    }


}


# read in the SCOP data base, creating a mapping from SUID->SCOP
sub read_scop_suid($\%)
{
    my ($scop_file, $scop_ref) = @_;
    open (SCOP, "gunzip -c $scop_file |")
    	|| die "best_scores can't unzip $scop_file";
    while (<SCOP>)
    {   next if /^#/;
    	my @fields = split('\t');
	$$scop_ref{$fields[4]} = $fields[3];
    }
    
    close SCOP;
}

# read in the header of an RDB file and store the column names in @$col_names_ref
# with a reversed index in %$col_num_ref.
sub read_rdb_header(*\@\%)
{   my ($FILE,$col_names_ref,$col_num_ref) = @_;
    # skip the comments at the beginning
    while (<$FILE> )
    {	last if /.+/ && !(/^\s*#/);
    }
    
    # read the column names
    chomp;
    @$col_names_ref = split(/\t/);

    # index the column names
    for(my $i = 0; $i<scalar(@$col_names_ref); $i++)
    {   $$col_num_ref{$$col_names_ref[$i]} = $i;
        # print STDERR "column $i is $$col_names_ref[$i]\n";
    }
    
    # skip the field-length definitions
    <$FILE>;
}