#!/usr/local/bin/perl -w
#
# Library of routines for managing RDB files
#

package RDBtable;
require Exporter;
@ISA = qw(Exporter);
@EXPORT = qw( set_name
              get_name
              get_numrows
              get_numcols
              set_numrows
              set_numcols
              get_row
              get_cell
              copy
              init_from_file
              init_from_array
              get_column_index
              get_column
              get_colnames
              append_column
              append_body
              print
              print_to_file
       ); 
use strict;


sub new 
{
	my ($class) = @_;
	bless {
                _comments	=>  [],
		_colnames	=>  [],
		_coldefs	=>  [],
		_body		=>  [],
                _colhash        =>  {},
                _numcols        =>  0,
                _numrows        =>  0,
                _name           =>  undef,
            }, $class;
}

sub set_name($)
{
    my($self, $name) = @_;
    $self->{_name} = $name;
}

sub set_numrows($$)
{
    my($self, $num) = @_;
    $self->{_numrows} = $num;
}

sub set_numcols($$)
{
    my($self, $num) = @_;
    $self->{_numcols} = $num;
}

sub get_name($)
{
    my($self) = @_;
    return $self->{_name};
}

sub get_numrows($)
{
    my($self) = @_;
    return $self->{_numrows};
}

 sub get_numcols($)
{
    my($self) = @_;
    return $self->{_numcols};
}

#returns an array of the column names in the file
sub get_colnames($)
{   
    my($self) = @_;
    return $self->{_colnames};
}

# Given a row number, return an array with row data
#Note: rows are indexed starting with 0
sub get_row($$)
{
    my($self, $rownum) = @_;
#error if rownum out of bounds
    if ($rownum > $self->{_numrows} - 1 )
    {
        print STDERR "Error: request for row number $rownum but RDBtable $self->{_name} has $self->{_numrows} rows\n";
        exit(1);
    }
    return  $self->{_body}->[$rownum]; 

}

#Given a row number and column number, return the data in the cell
#Note: rows and columns are indexed starting with 0
sub get_cell($$$)
{
    my($self,$rownum,$colnum) = @_;
#error if rownum or colnum out of bounds
    if ( ($rownum > $self->{_numrows} - 1 ) || 
         ($colnum > $self->{_numcols} - 1 ) )
    {
        print STDERR "Error: request for cell $rownum:$colnum but RDBtable $self->{_name} has $self->{_numrows} rows and $self->{_numcols} columns\n";
        exit(1);
    }

    return  $self->{_body}->[$rownum][$colnum]; 
}

#copy all fields of the target object to self (the caller)
sub copy($)
{
    my($self,$target) = @_;
    if (defined $target->{_comments})
    {
        foreach my $commentline (@{$target->{_comments}})
        {
            push @{$self->{_comments}}, $commentline;
        }
    }

    foreach my $cn (@{$target->{_colnames}})
    {
        push @{$self->{_colnames}}, $cn;
#also copy the colhash
        ${$self->{_colhash}}{$cn}=${$target->{_colhash}}{$cn};
    } 

    foreach my $cd (@{$target->{_coldefs}})
    {
        push @{$self->{_coldefs}}, $cd;
    } 

    foreach my $row (@{$target->{_body}})
    {
        push @{$self->{_body}}, $row;
    }

    $self->{_numcols} = $target->{_numcols};
    $self->{_numrows} = $target->{_numrows};
    $self->{_name} = $target->{_name};
}

#initialize from a file
#given a filename, loads the comments, colnames, coldefs and body
#constructs a colhash that maps column names to numbers 
sub init_from_file ($$)
{
    my($self, $RDBfilename) = @_;
    open(RDBFILE, "$RDBfilename") || die "Can't open $RDBfilename for reading\n";
    
    my @RDBlines = <RDBFILE>; 
    
    my $lln = 0;
    my $rowcount = 0;
    my $colcount = 0;
 
    $self->{_name} = $RDBfilename;

    foreach my $line (@RDBlines)
    {
# print "line:$line\n";
        chomp $line;
        if ($line =~ /^\#/)
        {
            push @{$self->{_comments}}, $line;
            next;
        }
        
        $lln++ ;

        if( $lln == 1 ) 
        { 
            my @colnames = split( /\t/, $line ) ; # column names
            foreach my $cn (@colnames)
            {
                push @{$self->{_colnames}}, $cn;
#set up the column hash to get a column number from its name
                ${$self->{_colhash}}{$cn}=$colcount++;
            }
        }
        elsif( $lln == 2 )
        {   
            $self->{_numcols} = $colcount;

            my @coldefs = split( /\t/, $line ) ; # data definitions
            foreach my $cd(@coldefs)
            {
                push @{$self->{_coldefs}}, $cd;
            } 
        }
        else
        {
#body is array of arrays with each entry an array of the three column fields
            my @coldata = split( /\t/, $line ); # grab the data
            push @{$self->{_body}}, [ @coldata ];
            $rowcount++;
        }
    }
    $self->{_numrows} = $rowcount;

}


#initialize from an array.  Array must be have the same form as an rdb file:
# comment lines begin with #, first non-# line is col names, second non-# line
# is col defs.  Will accept either a "single column".  If multiple columns are
# used, each array entry should be a tab-separated "row"
sub init_from_array($@$)
{
    my ($self, $RDBlines, $name) = @_;
    my $lln = 0;
    my $rowcount = 0;
    my $colcount = 0;
 
    $self->{_name} = $name;

    foreach my $line (@$RDBlines)
    {
# print "line:$line\n";
        chomp $line;
        if ($line =~ /^\#/)
        {
            push @{$self->{_comments}}, $line;
            next;
        }
        
        $lln++ ;

        if( $lln == 1 ) 
        { 
            my @colnames = split( /\t/, $line ) ; # column names
            foreach my $cn (@colnames)
            {
                push @{$self->{_colnames}}, $cn;
#set up the column hash to get a column number from its name
                ${$self->{_colhash}}{$cn}=$colcount++;
            }
        }
        elsif( $lln == 2 )
        {   
            $self->{_numcols} = $colcount;

            my @coldefs = split( /\t/, $line ) ; # data definitions
            foreach my $cd(@coldefs)
            {
                push @{$self->{_coldefs}}, $cd;
            } 
        }
        else
        {
#body is array of arrays with each entry an array of the three column fields
            my @coldata = split( /\t/, $line ); # grab the data
            push @{$self->{_body}}, [ @coldata ];
            $rowcount++;
        }
    }
    $self->{_numrows} = $rowcount;

}

#given a column name, return its index
sub get_column_index($$)
{
    my($self, $colname) = @_;
    return ${$self->{_colhash}}{$colname};
}

#retrieve a column of data (header info plus body)
#given a column name, return it in an array
#Note: columns are indexed starting with 0
sub get_column($$)
{
    my($self, $colname) = @_;
    my $colnum = ${$self->{_colhash}}{$colname};
    my @column;

    push @column, $colname;
    my $coldef = ${$self->{_coldefs}}[$colnum];
    push @column, $coldef;

    my $i = 0;

     foreach my $row (@{$self->{_body}})
    {
        my $data = $row->[$colnum];
        push @column, $data;
    }
    return @column;
}

#append a column of data (header info plus body)
#inserts a new column to the right of existing ones
sub append_column($$$@)
{
    my($self, $colname, $coldef, $coldata) = @_;


#error checking
    my $numerrs = 0;

  #check that we got all the arguments
    if ($#_ < 3 )
    {
        print STDERR "Error: append column for $self->{_name} requires 3 arguments, got $#_\n";
        $numerrs++;
    }

#check that data we want to append is the correct length
    if ( ($#{$coldata} + 1) !=  $self->{_numrows}  )
    {
        my $coldatalines = $#{$coldata} + 1;
       print STDERR "Error: $self->{_name} length mismatch.  Master has $self->{_numrows} lines and slave has $coldatalines lines\n";
       $numerrs++;
    }
    
    die if ($numerrs > 0);

    my $colnum = $self->{_numcols};
    ${$self->{_colnames}}[$colnum] = $colname;
    ${$self->{_coldefs}}[$colnum] = $coldef;
  
#add to the array hash
    ${$self->{_colhash}}{$colname}=$colnum;

    my $lastrow = $self->{_numrows} - 1;
    for my $i ( 0..$lastrow )
    {
        $self->{_body}->[$i][$colnum] = $$coldata[$i]; 
    }
    $self->{_numcols}++;
}


#appends body of the RDB in the argument to the body of the caller RDB
#requires that they have the same number of columns and that
#column names, definitions and orderings are the same in both
sub append_body($)
{
    my($self, $RDBtocat) = @_;

    my $errorcount = 0;

# error checking
# do both have all non-null fields?
# do the two RDBs match ?

    if ($self->{_numcols} != $RDBtocat->{_numcols})
    {
      print STDERR "$self->{_name} has $self->{_numcols} columns but $RDBtocat->{_name} has $RDBtocat->{_numcols}\n";  
      $errorcount++;
    }

for(my $i = 0; $i < $self->{_numcols}; $i++) 
{
    if (${$self->{_colnames}}[$i] ne ${$RDBtocat->{_colnames}}[$i])
    {
      print STDERR "$self->{_name} column " . $i . " name=${$self->{_colnames}}[$i] but $RDBtocat->{_name} column " . $i . " name=${$RDBtocat->{_colnames}}[$i] \n";  
      $errorcount++;
    }
    if (${$self->{_coldefs}}[$i] ne ${$RDBtocat->{_coldefs}}[$i])
    {
      print STDERR "$self->{_name} column " . $i . " definition=${$self->{_coldefs}}[$i] but $RDBtocat->{_name} column " . $i . " definition=${$RDBtocat->{_coldefs}}[$i] \n";  
      $errorcount++;
    }
}
# exit with error if any test failures
    die if ($errorcount > 0);


    foreach my $row (@{$RDBtocat->{_body}})
    {
        push @{$self->{_body}}, $row;
        $self->{_numrows}++;
    }

}

#caller and target RDB objects are joined but second occurrences of values in
#the joined columns are ignored
#returns the resulting "joined" RDB table
#special purpose routine designed to construct a "canonical" dssp sequence

#PRECONDITION:  both tables must be sorted on the joining columns
#prior to calling join_on_first_occurrence
sub join_on_first_occurrence($$$$)
{
    my($self, $colHead1, $target, $colHead2) = @_;

#get numeric values of desired column heads
    my $cH1 = $self->{_colhash}{$colHead1};
    my $cH2 = $target->{_colhash}{$colHead2};

    my $result = new RDBtable;
#union of self and target column heads are column heads for the result table
#also set up the column hash for the result table
    my $colcount = 0;
    foreach my $cn (@{$self->{_colnames}}){
        push @{$result->{_colnames}}, $cn;
        ${$result->{_colhash}}{$cn}=$colcount++;
    }
    foreach my $cn (@{$target->{_colnames}}){
        push @{$result->{_colnames}}, $cn;
        ${$result->{_colhash}}{$cn}=$colcount++;
      }

    $result->{_numcols} = $colcount;
#now get the column defs
    foreach my $cd (@{$self->{_coldefs}}){
        push @{$result->{_coldefs}}, $cd;
    }
    foreach my $cd (@{$target->{_coldefs}}){
        push @{$result->{_coldefs}}, $cd;
    }


#find and assemble rows to put in result table body
#check for a match (ignore second occurrence)
#between values in $colHead1 and $colHead2
#if there's a match, get the rows from each table, cat together and
#add to result table.  Maintain a row count as we go.
    my $s;   #index into rows of self table
    my $t = 0;   #index into rows of target table
    my $catted_row =0;
    my %dupeCheck = (); 

    for($s = 0; $s < $self->{_numrows}; $s++)
    {
       my $foundFlag = 0;
       my $sVal = $self->get_cell($s, $cH1);
       next if $sVal eq "\t" || $sVal eq "";
        while($foundFlag==0 && $t < $target->{_numrows})
        { 
            my $tVal = $target->get_cell($t, $cH2);

    #       print "sval:$sVal\ttval:$tVal\n";

            if ($sVal eq $tVal)
            {
                $foundFlag = 1;
                next if (defined($dupeCheck{$s}));
                $catted_row = $self->cat_arrays($self->{_body}->[$s], $target->{_body}->[$t]);
                push @{$result->{_body}}, $catted_row;
#enter this item into the dupeCheck hash
                $dupeCheck{$s} = 1;
                $result->{_numrows}++;
           }
            $t++;  
        }
       if($foundFlag==0 && ($t == $target->{_numrows}))
       { # value in cH1 column did not have a match in cH2 
         # reset the index in the target table to 0
         # more efficient would be to memorize its position at the last
         # successful match (except for special case of first search) and start again from there
         # but since these are small tables and this "not found" condition is rare, 
         # for now rewind all the way
           $t = 0;
       }
   }

#give result table a name
    my $sName = $self->get_name();
    my $tName = $target->get_name();
    my $rName = $sName . $tName;
    $result->set_name($rName);
    return $result;
}
#given two array references, return a reference to an array
#which is a concatenated version of both arrays 
sub cat_arrays($$$)
{
    my($self, $array1, $array2) = @_;

    unless (ref($array1) eq 'ARRAY' && ref($array2) eq 'ARRAY') {
        die "usage: cat_arrays ARRAYREF1 ARRAYREF2";
    }

    my @catted_array;

    for (my $i = 0; $i < @$array1; $i++)
    {
        push @catted_array, $array1->[$i];
    }

    for (my $i = 0; $i < @$array2; $i++)
    {
        push @catted_array, $array2->[$i];
    }

    return \@catted_array;
}


#print method
sub print()
{
    my($self) = @_;
    if (defined $self->{_comments})
    {
        foreach my $commentline (@{$self->{_comments}})
        {
            print "$commentline\n";
        }
    }
    my $j = 0;
    foreach my $cn (@{$self->{_colnames}})
    {
        print "$cn";
        $j++;
        if($j < $self->{_numcols}){ print "\t";}
    } 
    print "\n";

    $j = 0;
    foreach my $cd (@{$self->{_coldefs}})
    {
        print "$cd";
        $j++;
        if($j < $self->{_numcols}){ print "\t";}
    } 
    print "\n";

    foreach my $row (@{$self->{_body}})
    {
        for (my $i = 0; $i < $self->{_numcols}; $i++)
        {
            print "$row->[$i]";
            if ($i < ($self->{_numcols} - 1) ) { print "\t"; }
            else { print "\n"; }
        }
    }

}


#print to a file, given a file name (which does not have to exist yet)
sub print_to_file($)
{
    my($self, $filename) = @_;

    open (OUTFILE, ">$filename") || die "Can't open $filename for writing.\n";

    if (defined $self->{_comments})
    {
        foreach my $commentline (@{$self->{_comments}})
        {
            print OUTFILE "$commentline\n";
        }
    }
    my $j = 0;
    foreach my $cn (@{$self->{_colnames}})
    {
        print OUTFILE "$cn";
        $j++;
        if($j < $self->{_numcols}){ print OUTFILE "\t";}
    } 
    print OUTFILE "\n";

    $j = 0;
    foreach my $cd (@{$self->{_coldefs}})
    {
        print OUTFILE "$cd";
        $j++;
        if($j < $self->{_numcols}){ print OUTFILE "\t";}
    } 
    print OUTFILE "\n";

    foreach my $row (@{$self->{_body}})
    {
        for (my $i = 0; $i < $self->{_numcols}; $i++)
        {
            print OUTFILE "$row->[$i]";
            if ($i < ($self->{_numcols} - 1) ) { print OUTFILE "\t"; }
            else { print OUTFILE "\n"; }
        }
    }

    close(OUTFILE);
}


# packages must end with a true value---a restriction that seems
# to be poorly documented.
1;