# # Library of useful routines for I/O of Fasta files # package libFasta; @ISA = qw(Exporter); @EXPORT = qw( &read_fasta_file &read_fasta_gz_file &read_a2m_as_fasta_file &read_a2m_as_fasta_gz_file &print_fasta_file &read_fasta_file_keep_newlines ); # # Remember to precede all names with the desired Perl namespace modifier # in the EXPORT list # # # Subroutines # # # read_fasta_file( $fasta_file, @names_return, @seqs_return ) # # $fasta_file, name of file to read, in fasta format # @names_return, list to return sequence names in, # $names_return[$i] corresponds to $seqs_return[$i] # @seqs_return, list to return sequences in # sub read_fasta_file($\@\@) { use strict 'vars'; my($fasta_in_file, $names_ref, $seqs_ref) = @_; if ($fasta_in_file =~ /.gz$/){ open(FASTA_IN, "gunzip -c $fasta_in_file|") || die "Error: can't open $fasta_in_file for reading\n"; } else { open(FASTA_IN, "<$fasta_in_file") || die "Error: can't open $fasta_in_file\n"; } my($fasta_in_line); $fasta_in_line = ; while($fasta_in_line and $fasta_in_line !~ /^>/) { $fasta_in_line = ; } @$names_ref = (); @$seqs_ref = (); #first way gets the whole annotation of the sequence #second way just gets the first identifying word like 'T0103' # while ($fasta_in_line && $fasta_in_line =~ /^>(.*)/) while ($fasta_in_line && $fasta_in_line =~ /^>(\S+)\s+.*/) { my($name, $seq); $name = $1; $seq = ''; $fasta_in_line = ; while ($fasta_in_line and $fasta_in_line !~ /^>/) { chomp($fasta_in_line); $seq .= $fasta_in_line; $fasta_in_line = ; } push(@$names_ref, $name); push(@$seqs_ref, $seq); } close(FASTA_IN); } # # read_fasta_gz_file( $fasta_file, @names_return, @seqs_return ) # # $fasta_file, name of file to read, in fasta format # @names_return, list to return sequence names in, # $names_return[$i] corresponds to $seqs_return[$i] # @seqs_return, list to return sequences in # sub read_fasta_gz_file($\@\@) { use strict 'vars'; my($fasta_in_file, $names_ref, $seqs_ref) = @_; if ($fasta_in_file =~ /\.gz/) { my($tmp_gzcat_out) = "tmp-gzcat-out-$$"; my($gzcat_cmd) = "gunzip -c $fasta_in_file > $tmp_gzcat_out"; system($gzcat_cmd); read_fasta_file($tmp_gzcat_out, @$names_ref, @$seqs_ref); my($rm_cmd) = "rm $tmp_gzcat_out"; system($rm_cmd); } else { read_fasta_file($fasta_in_file, @$names_ref, @$seqs_ref); } } # # read_a2m_as_fasta_file( $fasta_file, @names_return, @seqs_return ) # # $fasta_file, name of file to read, in fasta format # @names_return, list to return sequence names in, # $names_return[$i] corresponds to $seqs_return[$i] # @seqs_return, list to return sequences in # sub read_a2m_as_fasta_file($\@\@) { use strict 'vars'; my($fasta_in_file, $names_ref, $seqs_ref) = @_; if ($fasta_in_file =~ /.gz$/){ open(FASTA_IN, "gunzip -c $fasta_in_file|") || die "Error: can't open $fasta_in_file for reading\n"; } else { open(FASTA_IN, "<$fasta_in_file") || die "Error: can't open $fasta_in_file\n"; } my($fasta_in_line); $fasta_in_line = ; while($fasta_in_line and $fasta_in_line !~ /^>/) { $fasta_in_line = ; } @$names_ref = (); @$seqs_ref = (); while ($fasta_in_line && $fasta_in_line =~ /^>(.*)/) { my($name, $seq); $name = $1; $seq = ''; $fasta_in_line = ; while ($fasta_in_line and $fasta_in_line !~ /^>/) { chomp($fasta_in_line); $seq .= $fasta_in_line; $fasta_in_line = ; } #get rid of all gaps in converting a2m to plain fasta $seq =~ s/-//g; $seq =~ s/\.//g; push(@$names_ref, $name); push(@$seqs_ref, $seq); } close(FASTA_IN); } # # read_a2m_as_fasta_gz_file( $fasta_file, @names_return, @seqs_return ) # # $fasta_file, name of file to read, in fasta format # @names_return, list to return sequence names in, # $names_return[$i] corresponds to $seqs_return[$i] # @seqs_return, list to return sequences in # sub read_a2m_as_fasta_gz_file($\@\@) { use strict 'vars'; my($fasta_in_file, $names_ref, $seqs_ref) = @_; if ($fasta_in_file =~ /\.gz/) { my($tmp_gzcat_out) = "tmp-gzcat-out-$$"; my($gzcat_cmd) = "gunzip -c $fasta_in_file > $tmp_gzcat_out"; system($gzcat_cmd); read_a2m_as_fasta_file($tmp_gzcat_out, @$names_ref, @$seqs_ref); my($rm_cmd) = "rm $tmp_gzcat_out"; system($rm_cmd); } else { read_a2m_as_fasta_file($fasta_in_file, @$names_ref, @$seqs_ref); } } # # print_fasta_file( @names, @seqs, $fasta_file ) # # $fasta_file, name of file to print, in fasta format # @names_return, list of sequence names to print, # $names_return[$i] corresponds to $seqs_return[$i] # @seqs_return, list of sequences to print # sub print_fasta_file(\@\@$) { use strict 'vars'; my($names_ref, $seqs_ref, $fasta_out_file) = @_; open(FASTA_OUT, ">$fasta_out_file") || die "Error: can't open $fasta_out_file\n"; my($i); $i = 0; while ($i < int(@$seqs_ref)) { my($name, $seq); $name = $$names_ref[$i]; $seq = $$seqs_ref[$i]; print FASTA_OUT ">$name\n$seq\n"; $i++; } close(FASTA_OUT); } # # read_fasta_file_keep_newlines( $fasta_file, @names_return, @seqs_return ) # # $fasta_file, name of file to read, in fasta format # @names_return, list to return sequence names in, # $names_return[$i] corresponds to $seqs_return[$i] # @seqs_return, list to return sequences in # sub read_fasta_file_keep_newlines($\@\@) { use strict 'vars'; my($fasta_in_file, $names_ref, $seqs_ref) = @_; if ($fasta_in_file =~ /.gz$/){ open(FASTA_IN, "gunzip -c $fasta_in_file|") || die "Error: can't open $fasta_in_file for reading\n"; } else { open(FASTA_IN, "<$fasta_in_file") || die "Error: can't open $fasta_in_file\n"; } my($fasta_in_line); $fasta_in_line = ; while($fasta_in_line and $fasta_in_line !~ /^>/) { $fasta_in_line = ; } @$names_ref = (); @$seqs_ref = (); while ($fasta_in_line && $fasta_in_line =~ /^>(.*)/) { my($name, $seq); $name = $1; $seq = ''; $fasta_in_line = ; while ($fasta_in_line and $fasta_in_line !~ /^>/) { $seq .= $fasta_in_line; $fasta_in_line = ; } chomp($seq); #take only the last newline off so it will print properly push(@$names_ref, $name); push(@$seqs_ref, $seq); } close(FASTA_IN); }