#!/usr/local/bin/perl -w # # Library of routines for managing RDB files # package RDBtable; require Exporter; @ISA = qw(Exporter); @EXPORT = qw( set_name get_name get_numrows get_numcols set_numrows set_numcols get_row get_cell copy init_from_file init_from_array get_column_index get_column get_colnames append_column append_body print print_to_file ); use strict; sub new { my ($class) = @_; bless { _comments => [], _colnames => [], _coldefs => [], _body => [], _colhash => {}, _numcols => 0, _numrows => 0, _name => undef, }, $class; } sub set_name($) { my($self, $name) = @_; $self->{_name} = $name; } sub set_numrows($$) { my($self, $num) = @_; $self->{_numrows} = $num; } sub set_numcols($$) { my($self, $num) = @_; $self->{_numcols} = $num; } sub get_name($) { my($self) = @_; return $self->{_name}; } sub get_numrows($) { my($self) = @_; return $self->{_numrows}; } sub get_numcols($) { my($self) = @_; return $self->{_numcols}; } #returns an array of the column names in the file sub get_colnames($) { my($self) = @_; return $self->{_colnames}; } # Given a row number, return an array with row data #Note: rows are indexed starting with 0 sub get_row($$) { my($self, $rownum) = @_; #error if rownum out of bounds if ($rownum > $self->{_numrows} - 1 ) { print STDERR "Error: request for row number $rownum but RDBtable $self->{_name} has $self->{_numrows} rows\n"; exit(1); } return $self->{_body}->[$rownum]; } #Given a row number and column number, return the data in the cell #Note: rows and columns are indexed starting with 0 sub get_cell($$$) { my($self,$rownum,$colnum) = @_; #error if rownum or colnum out of bounds if ( ($rownum > $self->{_numrows} - 1 ) || ($colnum > $self->{_numcols} - 1 ) ) { print STDERR "Error: request for cell $rownum:$colnum but RDBtable $self->{_name} has $self->{_numrows} rows and $self->{_numcols} columns\n"; exit(1); } return $self->{_body}->[$rownum][$colnum]; } #copy all fields of the target object to self (the caller) sub copy($) { my($self,$target) = @_; if (defined $target->{_comments}) { foreach my $commentline (@{$target->{_comments}}) { push @{$self->{_comments}}, $commentline; } } foreach my $cn (@{$target->{_colnames}}) { push @{$self->{_colnames}}, $cn; #also copy the colhash ${$self->{_colhash}}{$cn}=${$target->{_colhash}}{$cn}; } foreach my $cd (@{$target->{_coldefs}}) { push @{$self->{_coldefs}}, $cd; } foreach my $row (@{$target->{_body}}) { push @{$self->{_body}}, $row; } $self->{_numcols} = $target->{_numcols}; $self->{_numrows} = $target->{_numrows}; $self->{_name} = $target->{_name}; } #initialize from a file #given a filename, loads the comments, colnames, coldefs and body #constructs a colhash that maps column names to numbers sub init_from_file ($$) { my($self, $RDBfilename) = @_; open(RDBFILE, "$RDBfilename") || die "Can't open $RDBfilename for reading\n"; my @RDBlines = ; my $lln = 0; my $rowcount = 0; my $colcount = 0; $self->{_name} = $RDBfilename; foreach my $line (@RDBlines) { # print "line:$line\n"; chomp $line; if ($line =~ /^\#/) { push @{$self->{_comments}}, $line; next; } $lln++ ; if( $lln == 1 ) { my @colnames = split( /\t/, $line ) ; # column names foreach my $cn (@colnames) { push @{$self->{_colnames}}, $cn; #set up the column hash to get a column number from its name ${$self->{_colhash}}{$cn}=$colcount++; } } elsif( $lln == 2 ) { $self->{_numcols} = $colcount; my @coldefs = split( /\t/, $line ) ; # data definitions foreach my $cd(@coldefs) { push @{$self->{_coldefs}}, $cd; } } else { #body is array of arrays with each entry an array of the three column fields my @coldata = split( /\t/, $line ); # grab the data push @{$self->{_body}}, [ @coldata ]; $rowcount++; } } $self->{_numrows} = $rowcount; } #initialize from an array. Array must be have the same form as an rdb file: # comment lines begin with #, first non-# line is col names, second non-# line # is col defs. Will accept either a "single column". If multiple columns are # used, each array entry should be a tab-separated "row" sub init_from_array($@$) { my ($self, $RDBlines, $name) = @_; my $lln = 0; my $rowcount = 0; my $colcount = 0; $self->{_name} = $name; foreach my $line (@$RDBlines) { # print "line:$line\n"; chomp $line; if ($line =~ /^\#/) { push @{$self->{_comments}}, $line; next; } $lln++ ; if( $lln == 1 ) { my @colnames = split( /\t/, $line ) ; # column names foreach my $cn (@colnames) { push @{$self->{_colnames}}, $cn; #set up the column hash to get a column number from its name ${$self->{_colhash}}{$cn}=$colcount++; } } elsif( $lln == 2 ) { $self->{_numcols} = $colcount; my @coldefs = split( /\t/, $line ) ; # data definitions foreach my $cd(@coldefs) { push @{$self->{_coldefs}}, $cd; } } else { #body is array of arrays with each entry an array of the three column fields my @coldata = split( /\t/, $line ); # grab the data push @{$self->{_body}}, [ @coldata ]; $rowcount++; } } $self->{_numrows} = $rowcount; } #given a column name, return its index sub get_column_index($$) { my($self, $colname) = @_; return ${$self->{_colhash}}{$colname}; } #retrieve a column of data (header info plus body) #given a column name, return it in an array #Note: columns are indexed starting with 0 sub get_column($$) { my($self, $colname) = @_; my $colnum = ${$self->{_colhash}}{$colname}; my @column; push @column, $colname; my $coldef = ${$self->{_coldefs}}[$colnum]; push @column, $coldef; my $i = 0; foreach my $row (@{$self->{_body}}) { my $data = $row->[$colnum]; push @column, $data; } return @column; } #append a column of data (header info plus body) #inserts a new column to the right of existing ones sub append_column($$$@) { my($self, $colname, $coldef, $coldata) = @_; #error checking my $numerrs = 0; #check that we got all the arguments if ($#_ < 3 ) { print STDERR "Error: append column for $self->{_name} requires 3 arguments, got $#_\n"; $numerrs++; } #check that data we want to append is the correct length if ( ($#{$coldata} + 1) != $self->{_numrows} ) { my $coldatalines = $#{$coldata} + 1; print STDERR "Error: $self->{_name} length mismatch. Master has $self->{_numrows} lines and slave has $coldatalines lines\n"; $numerrs++; } die if ($numerrs > 0); my $colnum = $self->{_numcols}; ${$self->{_colnames}}[$colnum] = $colname; ${$self->{_coldefs}}[$colnum] = $coldef; #add to the array hash ${$self->{_colhash}}{$colname}=$colnum; my $lastrow = $self->{_numrows} - 1; for my $i ( 0..$lastrow ) { $self->{_body}->[$i][$colnum] = $$coldata[$i]; } $self->{_numcols}++; } #appends body of the RDB in the argument to the body of the caller RDB #requires that they have the same number of columns and that #column names, definitions and orderings are the same in both sub append_body($) { my($self, $RDBtocat) = @_; my $errorcount = 0; # error checking # do both have all non-null fields? # do the two RDBs match ? if ($self->{_numcols} != $RDBtocat->{_numcols}) { print STDERR "$self->{_name} has $self->{_numcols} columns but $RDBtocat->{_name} has $RDBtocat->{_numcols}\n"; $errorcount++; } for(my $i = 0; $i < $self->{_numcols}; $i++) { if (${$self->{_colnames}}[$i] ne ${$RDBtocat->{_colnames}}[$i]) { print STDERR "$self->{_name} column " . $i . " name=${$self->{_colnames}}[$i] but $RDBtocat->{_name} column " . $i . " name=${$RDBtocat->{_colnames}}[$i] \n"; $errorcount++; } if (${$self->{_coldefs}}[$i] ne ${$RDBtocat->{_coldefs}}[$i]) { print STDERR "$self->{_name} column " . $i . " definition=${$self->{_coldefs}}[$i] but $RDBtocat->{_name} column " . $i . " definition=${$RDBtocat->{_coldefs}}[$i] \n"; $errorcount++; } } # exit with error if any test failures die if ($errorcount > 0); foreach my $row (@{$RDBtocat->{_body}}) { push @{$self->{_body}}, $row; $self->{_numrows}++; } } #caller and target RDB objects are joined but second occurrences of values in #the joined columns are ignored #returns the resulting "joined" RDB table #special purpose routine designed to construct a "canonical" dssp sequence #PRECONDITION: both tables must be sorted on the joining columns #prior to calling join_on_first_occurrence sub join_on_first_occurrence($$$$) { my($self, $colHead1, $target, $colHead2) = @_; #get numeric values of desired column heads my $cH1 = $self->{_colhash}{$colHead1}; my $cH2 = $target->{_colhash}{$colHead2}; my $result = new RDBtable; #union of self and target column heads are column heads for the result table #also set up the column hash for the result table my $colcount = 0; foreach my $cn (@{$self->{_colnames}}){ push @{$result->{_colnames}}, $cn; ${$result->{_colhash}}{$cn}=$colcount++; } foreach my $cn (@{$target->{_colnames}}){ push @{$result->{_colnames}}, $cn; ${$result->{_colhash}}{$cn}=$colcount++; } $result->{_numcols} = $colcount; #now get the column defs foreach my $cd (@{$self->{_coldefs}}){ push @{$result->{_coldefs}}, $cd; } foreach my $cd (@{$target->{_coldefs}}){ push @{$result->{_coldefs}}, $cd; } #find and assemble rows to put in result table body #check for a match (ignore second occurrence) #between values in $colHead1 and $colHead2 #if there's a match, get the rows from each table, cat together and #add to result table. Maintain a row count as we go. my $s; #index into rows of self table my $t = 0; #index into rows of target table my $catted_row =0; my %dupeCheck = (); for($s = 0; $s < $self->{_numrows}; $s++) { my $foundFlag = 0; my $sVal = $self->get_cell($s, $cH1); next if $sVal eq "\t" || $sVal eq ""; while($foundFlag==0 && $t < $target->{_numrows}) { my $tVal = $target->get_cell($t, $cH2); # print "sval:$sVal\ttval:$tVal\n"; if ($sVal eq $tVal) { $foundFlag = 1; next if (defined($dupeCheck{$s})); $catted_row = $self->cat_arrays($self->{_body}->[$s], $target->{_body}->[$t]); push @{$result->{_body}}, $catted_row; #enter this item into the dupeCheck hash $dupeCheck{$s} = 1; $result->{_numrows}++; } $t++; } if($foundFlag==0 && ($t == $target->{_numrows})) { # value in cH1 column did not have a match in cH2 # reset the index in the target table to 0 # more efficient would be to memorize its position at the last # successful match (except for special case of first search) and start again from there # but since these are small tables and this "not found" condition is rare, # for now rewind all the way $t = 0; } } #give result table a name my $sName = $self->get_name(); my $tName = $target->get_name(); my $rName = $sName . $tName; $result->set_name($rName); return $result; } #given two array references, return a reference to an array #which is a concatenated version of both arrays sub cat_arrays($$$) { my($self, $array1, $array2) = @_; unless (ref($array1) eq 'ARRAY' && ref($array2) eq 'ARRAY') { die "usage: cat_arrays ARRAYREF1 ARRAYREF2"; } my @catted_array; for (my $i = 0; $i < @$array1; $i++) { push @catted_array, $array1->[$i]; } for (my $i = 0; $i < @$array2; $i++) { push @catted_array, $array2->[$i]; } return \@catted_array; } #print method sub print() { my($self) = @_; if (defined $self->{_comments}) { foreach my $commentline (@{$self->{_comments}}) { print "$commentline\n"; } } my $j = 0; foreach my $cn (@{$self->{_colnames}}) { print "$cn"; $j++; if($j < $self->{_numcols}){ print "\t";} } print "\n"; $j = 0; foreach my $cd (@{$self->{_coldefs}}) { print "$cd"; $j++; if($j < $self->{_numcols}){ print "\t";} } print "\n"; foreach my $row (@{$self->{_body}}) { for (my $i = 0; $i < $self->{_numcols}; $i++) { print "$row->[$i]"; if ($i < ($self->{_numcols} - 1) ) { print "\t"; } else { print "\n"; } } } } #print to a file, given a file name (which does not have to exist yet) sub print_to_file($) { my($self, $filename) = @_; open (OUTFILE, ">$filename") || die "Can't open $filename for writing.\n"; if (defined $self->{_comments}) { foreach my $commentline (@{$self->{_comments}}) { print OUTFILE "$commentline\n"; } } my $j = 0; foreach my $cn (@{$self->{_colnames}}) { print OUTFILE "$cn"; $j++; if($j < $self->{_numcols}){ print OUTFILE "\t";} } print OUTFILE "\n"; $j = 0; foreach my $cd (@{$self->{_coldefs}}) { print OUTFILE "$cd"; $j++; if($j < $self->{_numcols}){ print OUTFILE "\t";} } print OUTFILE "\n"; foreach my $row (@{$self->{_body}}) { for (my $i = 0; $i < $self->{_numcols}; $i++) { print OUTFILE "$row->[$i]"; if ($i < ($self->{_numcols} - 1) ) { print OUTFILE "\t"; } else { print OUTFILE "\n"; } } } close(OUTFILE); } # packages must end with a true value---a restriction that seems # to be poorly documented. 1;