#!/usr/bin/perl =head1 NAME cfneval.pl - evaluates one or more cost functions =head1 SYNOPSIS cfneval.pl [I] [I] =head1 DESCRIPTION Parses a set of cost function evaluation files (one for each target). For each target and cost function, an evaluation metric (Kendall's tau by default) is computed between the cost function and some measure of quality (GDT by default). The result is printed as an RDB file with columns for each cost function and rows for each target. E.g., target burial_cfn hbond_cfn S N N targ1 0.143 0.114 tart2 0.892 0.215 targ3 0.520 0.223 =head2 ARGUMENTS I, if specified, is the RDB file to which the results are written. If no I is given, results will be printed to STDOUT. =head2 OPTIONS =over 4 =item B<-h, --help> displays the usage, arguments, and options section of this program's documentation to STDOUT. =item B<-m, --man> displays full man-page style documentation (similar to the man command). =item B<-v, --verbose> displays verbose messages. =item B<-c >I, B<--cost_fcn=>I specifies the cost function to examine; this option may be specified multiple times to examine multiple cost functions. If not specified, the script will process all cost functions. =item B<-e >I, B<--eval_dir=>I specifies the directory where the evaluation is done. By default, this is "/projects/compbio/experiments/protein-predict/casp7/". =item B<-t >I, B<--target_file=>I specifies a file containing a list of targets, one per line, which contain the names of directories in I that contain a I to be evaluated. By default this is, "/projects/compbio/experiments/protein-predict/casp7/target_list.txt" =item B<-s >I, B<--score_file=>I specifies the relative directory to the RDB score file from the target directory in the evaluation directory. By default this is, "decoys/evaluate.anglevector.rdb". =item B<-q >I, B<--quality_metric=>I specifies the quality metric used to assess model quality. By default this is "GDT". =item B<-r >I, B<--cor_metric=>I specifies the correlation metric to use when comparing the cost function to the cheating cost function. By default this is "tau" for Kendall's tau but it may also be set to "btr" for the porportion of decoys scoring better-than-real or better than the real structure. =item B<-f >I, B<--filter_mis_atom=>I specifies that only models with no more than I missing atoms will be used when computing the statistics. =item B<-a >I, B<--alpha=>I when computing tau, weight points according to their cost function ranks using the formula: -exp(-alpha * r / (n - 1)) where alpha is I, r is the rank of the point, and n is the total number of points. Positive values of alpha will result in points with lower cost being weighted more. A more detailed description of the mathematics will be written elsewhere. Kevin Karplus has suggested using alpha=1 for weighting, as this value seems to produce more reasonable results than alpha=0, which is standard Kendall's tau. However, no justifiable method was used to determine an ideal value for alpha--just the subjective evaluation of cost function versus quality metric plots. The default is alpha=0. =item B<-u>, B<--make_score_file> if specified, instead of an RDB file, commands required to create the Undertaker score files will be outputted to the standard output. This command list is suitable for piping to tcsh or para-trickle-make -command ' ' -max_jobs 5 =back =head1 AUTHOR Created by: John Archie Created on: 2007-08-18 RCS/CVS Information: $Id$ $Source$ =head1 BUGS This script does not check the input files for validity. The "btr" measure assumes that the model with the smallest cheating cost function score is the real structure. Hopefully, this is correct for your data! =cut use strict; use warnings; use Getopt::Long qw(:config gnu_getopt); use Pod::Usage; use List::Util qw(min); ###################################################### ## Function forward declarations ###################################################### sub kendall(\@\@); sub btr(\@\@$); ###################################################### ## Define constants and initialize global variables ###################################################### my( $verbose, # if true, print verbose output $filter_mis_atom, # if true, filter-out missing atoms @cost_fcn, # the list of cost functions to evaluate $make_score_file, # the path to a cost function script ); my $alpha = 0; # alpha controls the weighting function for computing tau my $eval_dir = "/projects/compbio/experiments/protein-predict/casp7/"; my $score_file = "decoys/evaluate.anglevector.rdb"; my $target_file = $eval_dir . "target_list.txt"; my $quality_metric = "GDT"; my $cor_metric = "tau"; # columns which are cheating cost functions or are not cost functions my %ignore_cols = map { $_ => 1 } ( "name", "length", "real_hbond", "real_hbond_u", "decoy_hbond", "decoy_hbond_u", "real_NO_hbond", "real_NO_hbond_u", "decoy_NO_hbond", "decoy_NO_hbond_u", "knot", "clens", "rmsd", "log_rmsd", "rmsd_ca", "log_rmsd_ca", "GDT", "smooth_GDT", "missing_atoms", "real_cost", ); #################################### ## process command line arguments #################################### GetOptions "v|verbose" => \$verbose, "h|help" => sub { pod2usage -verbose => 1 }, "m|man" => sub { pod2usage -verbose => 2 }, "c|cost_fcn=s" => \@cost_fcn, "e|eval_dir=s" => \$eval_dir, "s|score_file=s" => \$score_file, "t|target_file=s" => \$target_file, "q|quality_metric=s" => \$quality_metric, "r|cor_metric=s" => \$cor_metric, "f|filter_mis_atom=i" => \$filter_mis_atom, "u|make_score_file" => \$make_score_file, "a|alpha=f" => \$alpha, or die "Try the -h option for program documentation.\n"; pod2usage -verbose => 1 if @ARGV > 1; ######################################################### ## It's time to actually do something. ######################################################### #TODO remove this line (it's for debugging) $| = 1; # open the output file, if necessary if(@ARGV) { open OUTFILE, ">$ARGV[0]" or die "unable to open $ARGV[0]: $!\n"; select OUTFILE; } # print the RDB header, if cost functions are known # (if not, this header will be printed later) if(@cost_fcn) { print "target\t", join("\t", @cost_fcn), "\n", "S\t", join("\t", map 'N', @cost_fcn), "\n"; } # Get the list of targets open TARGETS, "$target_file" or die "unable to open $target_file: $!\n"; my @targets = ; chomp @targets; close TARGETS; # Go through each target... foreach my $target (@targets) { if($make_score_file) { print "cd $eval_dir/$target; make $score_file\n"; next; } # open the file of scores for the target my $score_file_full = $eval_dir . "/" . $target . "/" . $score_file; open SCORES, $score_file_full or die "unable to open $score_file_full: $!\n"; # put the data into memory my %score_colname2idx; my @score_data; my $datalineno = 0; # the line number (not including comment lines) while() { next if /^#/; ++$datalineno; chomp; my @fields = split /\t/; if($datalineno == 1) { %score_colname2idx = map { $fields[$_] => $_ } 0..$#fields; } elsif($datalineno > 2) { push @score_data, \@fields; } } # get the scores from the cheating cost function my $real_cfn_colno = $score_colname2idx{$quality_metric} or die "unable find $quality_metric column for target $target\n"; my @y = map $score_data[$_][$real_cfn_colno], 0..$#score_data; # print an RDB header if the list of @cost_fcn is to be automagically # determined from the first file if(!@cost_fcn) { @cost_fcn = grep !defined $ignore_cols{$_}, keys %score_colname2idx; print "target\t", join("\t", @cost_fcn), "\n", "S\t", join("\t", map 'N', @cost_fcn), "\n"; } # compute the subset of data points in which we are interested my $missing_atoms_colno = $score_colname2idx{'missing_atoms'}; my @model_subset = 0..$#score_data; if(defined $filter_mis_atom && !$missing_atoms_colno) { die "unable to find missing_atoms column\n"; } # find the index of the smallest value in @y my $y_min = min @y; my($y_min_idx) = grep $y[$_] == $y_min, 0..(@y-1); @model_subset = grep { (! defined $filter_mis_atom || $score_data[$_][$missing_atoms_colno] <= $filter_mis_atom) && $_ != $y_min_idx } @model_subset; my @y_sub = @y[@model_subset]; # finally, actually compute and print the statistics print "$target"; foreach(@cost_fcn) { my $cost_fcn_colno = $score_colname2idx{$_} or die "unable find $_ column for target $target\n"; my @x = map $score_data[$_][$cost_fcn_colno], 0..$#score_data; my @x_sub = @x[@model_subset]; # get the real structure's score my $real_score = $x[$y_min_idx]; if($cor_metric eq "tau") { printf "\t%.3f", kendall(@x_sub, @y_sub); } elsif($cor_metric eq "btr") { printf "\t%.3f", btr(@x_sub, @y_sub, $real_score); } else { die "unrecognized value given for --cor_metric"; } } print "\n"; close SCORES; } ######################################################### ## kendall(@x, @y) ## returns Kendall's tau among the points $x and $y ## ## The complexity is O(n log n) when there are no ties ## ## This algorithom is based on the one described in ## William R. Knight, A Computer Method for Calculating ## Kendall's Tau with Ungrouped Data, ## JASA, Vol 61, No 314, Part 1 ## ## The approach has been generalized to support point ## weighting via the global variable $alpha. ######################################################### sub kendall(\@\@) { my($x, $y) = @_; die "unequal number of x and y points when computing tau\n" if @$x != @$y; die "not enough points for computing tau\n" if @$x < 2; my($fromArray, $toArray, $tmpArray); # @$fromArray will contain @$x and @$y indexes, but sorted by @$x $fromArray = [ sort { $x->[$a] <=> $x->[$b] || $y->[$a] <=> $y->[$b] } 0..(@$x-1) ]; my $firstSameX = 0; my $totalWeight = exp(0); my @w; # the weights given to each individual point for(my $i = 1; $i <= @$x; ++$i) { if($i == @$x || $x->[$fromArray->[$firstSameX]] != $x->[$fromArray->[$i]]) { for(my $j = $firstSameX; $j < $i; ++$j) { $w[$fromArray->[$j]] = $totalWeight / ($i - $firstSameX); } $firstSameX = $i; $totalWeight = 0; } $totalWeight += exp(-$alpha * $i / (@$x - 1)); } # count ties in x and subtract off ties in both x and y for each point # (ties in both x and y will be counted again when ties in y are added) my @ties; my $firstSameXY = $firstSameX = 0; for(my $i = 1; $i <= @$x; ++$i) { if($i == @$x || $x->[$fromArray->[$firstSameX]] != $x->[$fromArray->[$i]]) { for(my $j = $firstSameX; $j < $i; ++$j) { $ties[$fromArray->[$j]] += $i - $firstSameX - 1; } for(my $j = $firstSameXY; $j < $i; ++$j) { $ties[$fromArray->[$j]] -= $i - $firstSameXY - 1; } $firstSameXY = $firstSameX = $i; } elsif($y->[$fromArray->[$firstSameXY]] != $y->[$fromArray->[$i]]) { for(my $j = $firstSameXY; $j < $i; ++$j) { $ties[$fromArray->[$j]] -= $i - $firstSameXY - 1; } $firstSameXY = $i; } } # count discordant pairs via merge sort my @disPairs; for(my $psize = 1; $psize < @$x; $psize *= 2) { my $bend = 0; # we lie about the last bend to make the loop start at a=0 while($bend < @$x) { my $a = $bend; # [a, aend) will be merged with [b, bend) my $b = my $aend = $a + $psize; $bend = ($b + $psize < @$x ? $b + $psize : @$x); for(my $i = $a; $i < $bend; ++$i) { if($b >= $bend || $a < $aend && $y->[$fromArray->[$a]] <= $y->[$fromArray->[$b]]) { $toArray->[$i] = $fromArray->[$a]; $disPairs[$toArray->[$i]] += $i - $a++; } else { $toArray->[$i] = $fromArray->[$b]; $disPairs[$toArray->[$i]] -= $i - $b++; } } } $tmpArray = $fromArray; $fromArray = $toArray; $toArray = $tmpArray; } # add up the ties in @$y my $firstSameY = 0; for(my $i = 1; $i <= @$x; ++$i) { if($i == @$x || $y->[$fromArray->[$firstSameY]] != $y->[$fromArray->[$i]]) { for(my $j = $firstSameY; $j < $i; ++$j) { $ties[$fromArray->[$j]] += $i - $firstSameY - 1; } $firstSameY = $i; } } my $totalDis = $totalWeight = 0; for(my $i = 0; $i < @$x; ++$i) { $totalWeight += $w[$i]; $totalDis += $w[$i] * $disPairs[$i] + 0.5 * $w[$i] * $ties[$i]; } $totalWeight *= @$x - 1; # each point is part of @$x - 1 pairs 1 - 2 * $totalDis / $totalWeight; } ######################################################### ## btr(@x, @y, $real_score) ## returns the "better-than-real" metric between ## the cost function scores @x and the cheating ## cost function scores @y. "better-than-real" is the ## porportion of structures scoring better than the actual ## structure. The structure with the lowest cheating ## cost function score is assumed to be the real ## structure. ######################################################### sub btr(\@\@$) { my($x, $y, $real_score) = @_; die "unequal number of x and y points when computing btr\n" if @$x != @$y; die "not enough points for computing btr\n" if @$x < 2; # count the number of structures scoring "better-than-real" my $btr_count = grep $_ < $real_score, @$x; # return the better-than-real score $btr_count / @$x; }