#!/usr/bin/perl -w # # a2m-to-caspal.perl arg_file # # The options are read from arg_file rather than the command line # so that they can contain spaces and stuff without needing quote marks or # backslashes. # # Options for arg_file (one per line): # -infasta input_fasta_file # -outcaspal output_casp_al_file # -author author_name # -target target_name # -template template_name # -method method_file_name # -modelnum model_number # -score evalue # # Chris Dragon, 9-9-03: # In error outputs, changed "Error:" to "Error in $PROGRAM_NAME:". # Fixed a bug in find_seq() and in the error message it prints when # it fails. # use FileHandle; STDERR->autoflush(1); use English; use File::Basename; use lib dirname($PROGRAM_NAME); use ServerPaths; use libFasta; $nr_args = int(@ARGV); if ($nr_args != 1) { print STDERR "Error in $PROGRAM_NAME: wrong number of arguments.\n"; print STDERR "a2m-to-caspal.perl arg_file\n"; exit(1); } # # Initialization and defaults # $ENV{PATH} = "${ServerPaths::server_bin_dir}" .":${ServerPaths::server_bin_scripts_dir}" .":${ServerPaths::server_bin_arch_dir}:" . $ENV{PATH}; $get_pdb_prog = "pdb-get-summary"; sub read_arg_file($); sub process_arg_file($); sub run_prog_and_return($@); sub run_prog($@); sub find_seq($$$); sub get_alicol_to_strpos_map($); sub get_alicol_to_seqpos_map($); sub get_seq_from_str($); sub get_pdb_summary_info($); $debug = 0; process_arg_file($ARGV[0]); if ($debug) { print STDERR "a2m-to-caspal args: \n" ."InFasta: $InFasta\n" ."OutCasp: $OutCasp\n" ."Target: $Target\n" ."Template: $Template\n" ."Author: $Author\n" ."Method: $Method\n" ."Score: $Evalue\n"; } open(CASP, ">$OutCasp") or die "Error in $PROGRAM_NAME: can't open file $OutCasp\n"; print CASP "PFRMAT AL\n"; print CASP "TARGET $Target\n"; print CASP "AUTHOR $Author\n"; if (defined($Method)) { open(METH, "<$Method") or die "Error in $PROGRAM_NAME: can't open file $Method\n"; my(@method_lines) = ; my($method_line); for $method_line (@method_lines) { print CASP "METHOD $method_line"; } } print CASP "MODEL $ModelNumber\n"; if ($Template =~ /^\s*(\S+)/) { $template_pdb_id = $1; } else { $template_pdb_id = $Template; } print CASP "PARENT $template_pdb_id\n"; if(defined $Evalue) { print CASP "REMARK SCORE $Evalue\n"; } @names = (); @seqs = (); read_fasta_file($InFasta, @names, @seqs); ($target_name, $target_str) = find_seq($Target, \@names, \@seqs); ($template_name, $template_str) = find_seq($Template, \@names, \@seqs); if ($target_name eq "") { die "Error in $PROGRAM_NAME: target $Target not found in $InFasta.\n"; } if ($template_name eq "") { die "Error in $PROGRAM_NAME: template $Template not found in $InFasta.\n"; } $target_seq = get_seq_from_str($target_str); $template_seq = get_seq_from_str($template_str); if ($debug) { print STDERR "target_name: $target_name\n"; print STDERR "target_str: $target_str\n"; print STDERR "target_seq: $target_seq\n"; print STDERR "template_name: $template_name\n"; print STDERR "template_str: $template_str\n"; print STDERR "template_seq: $template_seq\n"; } #$target_alicol_to_strpos_map_ref = get_alicol_to_strpos_map($target_str); $target_alicol_to_seqpos_map_ref = get_alicol_to_seqpos_map($target_str); if ($debug) { #print STDERR "target_alicol_to_strpos_map_ref: (length " # . int(@$target_alicol_to_strpos_map_ref) # . ") " # . join(',', @$target_alicol_to_strpos_map_ref) # . "\n"; print STDERR "target_alicol_to_seqpos_map_ref: (length " . int(@$target_alicol_to_seqpos_map_ref) . ") " . join(',', @$target_alicol_to_seqpos_map_ref) . "\n"; } #template_alicol_to_strpos_map_ref = get_alicol_to_strpos_map($template_str); $template_alicol_to_seqpos_map_ref = get_alicol_to_seqpos_map($template_str); if ($debug) { #print STDERR "template_alicol_to_strpos_map_ref: (length " # . int(@$template_alicol_to_strpos_map_ref) # . ") " # . join(',', @$template_alicol_to_strpos_map_ref) # . "\n"; print STDERR "template_alicol_to_seqpos_map_ref: (length " . int(@$template_alicol_to_seqpos_map_ref) . ") " . join(',', @$template_alicol_to_seqpos_map_ref) . "\n"; } ($template_pdb_seqres, $template_pdb_seqseq_ref) = get_pdb_summary_info($template_pdb_id); if ($debug) { print STDERR "template_pdb_seqres: $template_pdb_seqres\n"; print STDERR "template_pdb_seqseq_ref: (length " . int(@$template_pdb_seqseq_ref) . ") " . join(',', @$template_pdb_seqseq_ref) . "\n"; } if (int(@$template_alicol_to_seqpos_map_ref) != int(@$target_alicol_to_seqpos_map_ref)) { print STDERR "Error in $PROGRAM_NAME: in file $InFasta: target sequence has " . int(@$target_alicol_to_seqpos_map_ref) . " alignment cols, but template sequence has " . int(@$template_alicol_to_seqpos_map_ref) . " alignment cols.\n"; exit(-1); } if ($template_seq ne $template_pdb_seqres) { print STDERR "Warning: template sequence in file $InFasta does not match template SEQRES record for pdb id $template_pdb_id\n"; } $alicol_i = 0; while ($alicol_i < int(@$template_alicol_to_seqpos_map_ref)) { $target_seqpos = ${$target_alicol_to_seqpos_map_ref}[$alicol_i]; $template_seqpos = ${$template_alicol_to_seqpos_map_ref}[$alicol_i]; if ($target_seqpos != -1 && $template_seqpos != -1) { $target_ch = substr($target_seq, $target_seqpos, 1); $target_resnum = $target_seqpos + 1; $template_ch = substr($template_seq, $template_seqpos, 1); $template_resnum = ${$template_pdb_seqseq_ref}[$template_seqpos]; if ($template_resnum ne "_") { print CASP "$target_ch $target_resnum $template_ch $template_resnum\n"; } } $alicol_i++; } print CASP "TER\n"; print CASP "END\n"; close(CASP); exit(0); sub read_arg_file($) { my($arg_file) = @_; my($arg_names_ref) = []; my($arg_vals_ref) = []; my($debug) = 0; open(ARGS_IN, "<$arg_file") or die "Error in $PROGRAM_NAME: process_arg_file: can't open file $arg_file\n"; my($args_in_line); $args_in_line = ; while ($args_in_line) { print STDERR "read line: $args_in_line" if ($debug); if ($args_in_line =~ /^\s*(\S+)\s+(.*)$/) { push(@{$arg_names_ref}, $1); push(@{$arg_vals_ref}, $2); print STDERR "pushed args: $1 $2\n" if ($debug); } $args_in_line = ; } close(ARGS_IN); return ($arg_names_ref, $arg_vals_ref); } sub process_arg_file($) { my($debug) = 0; my($arg_file) = @_; my($arg_names_ref, $arg_vals_ref) = read_arg_file($arg_file); my($arg_i) = 0; undef $InFasta; undef $OutCasp; undef $Target; undef $Template; $Author = "None"; undef $Method; undef $Evalue; while ($arg_i < int(@{$arg_names_ref})) { my($cur_arg_name) = ${$arg_names_ref}[$arg_i]; my($cur_arg_val) = ${$arg_vals_ref}[$arg_i]; $arg_i++; if ($debug) { print STDERR "cur_arg_name: $cur_arg_name cur_arg_val: $cur_arg_val\n"; } if ($cur_arg_name eq "-infasta") { $InFasta = $cur_arg_val; } elsif ($cur_arg_name eq "-outcaspal") { $OutCasp = $cur_arg_val; } elsif ($cur_arg_name eq "-author") { $Author = $cur_arg_val; } elsif ($cur_arg_name eq "-target") { $Target = $cur_arg_val; } elsif ($cur_arg_name eq "-template") { $Template = $cur_arg_val; } elsif ($cur_arg_name eq "-method") { $Method = $cur_arg_val; } elsif ($cur_arg_name eq "-modelnum") { $ModelNumber = $cur_arg_val; } elsif ($cur_arg_name eq "-score") { $Evalue = $cur_arg_val; } else { die "Error in $PROGRAM_NAME: unknown option: $cur_arg_name\n"; } } $ModelNumber=1 if !(defined($ModelNumber)); if (!defined($InFasta)) { print STDERR "Error in $PROGRAM_NAME: input fasta file must be specified\n"; print STDERR $UsageStr; exit(1); } if (!defined($OutCasp)) { print STDERR "Error in $PROGRAM_NAME: output casp ss file must be specified\n"; print STDERR $UsageStr; exit(1); } if (!defined($Target)) { print STDERR "Error in $PROGRAM_NAME: target name must be specified\n"; print STDERR $UsageStr; exit(1); } if (!defined($Template)) { print STDERR "Error in $PROGRAM_NAME: template name must be specified\n"; print STDERR $UsageStr; exit(1); } } sub run_prog($@) { my($cmd, $errmsg) = @_; if (system($cmd) != 0) { if (!defined($errmsg)) { $errmsg = "command failed: $cmd"; } print STDERR "$errmsg\n"; exit(1); } } sub run_prog_and_return($@) { use strict; my($cmd, $errmsg) = @_; my($sys_ret); $sys_ret = system($cmd); if ($sys_ret != 0) { if (!defined($errmsg)) { $errmsg = "command failed: $cmd"; } print STDERR "$errmsg\n"; } return $sys_ret; } sub find_seq($$$) { my($name_to_find, $names_ref, $seqs_ref) = @_; my($name_i) = 0; my($ret_name, $ret_seq) = ("", ""); # Chris Dragon, 9-9-03: Was matching $name_to_find (such as '1h2gB') # against the full name of each a2m sequence including the comment # (such as '1h2gB 557 XRAY 2.00 0.152 0.194 PENICILLIN G ACYLASE BETA SUBUNIT' # This obviously wouldn't match. Changed code to match only up to the # first space in the a2m sequence name. while ($name_i < int(@$names_ref) and ${$names_ref}[$name_i] !~ m/^$name_to_find(?:\s|$)/) # and $name_to_find ne ${$names_ref}[$name_i]) { $name_i++; } if ($name_i < int(@$names_ref)) { $ret_name = ${$names_ref}[$name_i]; $ret_seq = ${$seqs_ref}[$name_i]; } return ($ret_name, $ret_seq); } sub get_alicol_to_strpos_map($) { use strict; my($str) = @_; my($map_ref) = []; my($strpos) = 0; while ($strpos < length($str)) { my($ch) = substr($str, $strpos, 1); if ($ch ne "." && ((uc($ch) eq $ch) || ($ch eq "-"))) { push(@$map_ref, $strpos); } $strpos++; } return $map_ref; } sub get_alicol_to_seqpos_map($) { use strict; my($str) = @_; my($map_ref) = []; my($seqpos) = 0; my($strpos) = 0; while ($strpos < length($str)) { my($ch) = substr($str, $strpos, 1); if ($ch =~ /[A-Z]/ or $ch =~/[a-z]/) { if ($ch =~ /[A-Z]/) { push(@$map_ref, $seqpos); } $seqpos++; } elsif ($ch eq "-") { push(@$map_ref, -1); } $strpos++; } return $map_ref; } sub get_seq_from_str($) { my($str) = @_; $str =~ s/-//g; $str =~ s/\.//g; $str = uc($str); return $str; } sub get_pdb_summary_info($) { use strict; my($pdb_id) = @_; my($debug) = 0; my($ret_seqres) = ""; my($ret_seqseq_ref) = []; my($pdbsum_file) = "tmp-pdbsum-$$"; my($pdbsum_cmd) = "$main::get_pdb_prog -cat $pdb_id > $pdbsum_file"; my($max_tries) = 8; my($tries) = 0; my($pdbsum_ret) = -1; while ($pdbsum_ret != 0 && $tries < $max_tries) { $pdbsum_ret = run_prog_and_return($pdbsum_cmd); $tries++; } die "Error in $PROGRAM_NAME: can't pdb-get-summary for $pdb_id\n" if ($pdbsum_ret != 0); my($seqseq_line) = ""; my($seqres_line) = ""; open(PDBSUM, "<$pdbsum_file") or die "Error in $PROGRAM_NAME: can't open file $pdbsum_file\n"; my($pdbsum_line); $pdbsum_line = ; while ($pdbsum_line) { if ($pdbsum_line =~ /^SEQSEQ/) { $seqseq_line = $pdbsum_line; } if ($pdbsum_line =~ /^SEQRES/) { $seqres_line = $pdbsum_line; } $pdbsum_line = ; } close(PDBSUM); if ($seqres_line =~ /^SEQRES\s+(\S+)/) { $ret_seqres = $1; } if ($seqseq_line =~ /^SEQSEQ\s+(\S+)/) { my($pdbnum_str) = $1; my(@split_pdbnum_str) = split /,/ , $pdbnum_str; my($elem); for $elem (@split_pdbnum_str) { if ($elem ne ',') { push(@$ret_seqseq_ref, $elem); } } } run_prog("rm -f $pdbsum_file") if (!$debug); return ($ret_seqres, $ret_seqseq_ref); }