/* * fasta2al - convert a FASTA alignment into the CASP3 AL format. * * Update history: * 6/3/98 cline author, copied code from talign2fasta * 6/9/98 cline moved all the PDB summary reading functions * to pdbsummary.c */ #include #include #include #include #include "alignsummary.h" #include "compute_shift.h" #include "pdbsummary.h" /* * These #defines tell readAlignment how small an alignment we're * interested in reading. */ #define SINGLE_SEQ 1 #define PAIRWISE_ALIGN 2 /* * Prefix for the pathname of the target sequence files */ #define TARGET_PATHNAME_PREFIX "/projects/compbio/experiments/casp3" /* * TYPEDEFS */ /* * PROTOTYPES */ int checkArguments( cmdlineInfo *parameters); /* called with: the command line * parameters, as parsed by parseCmdline. */ void getTargetSeqPathname( char *targetName, /* called with: name of the target seq */ char *targetSeqFilename); /* return with: pathname of the seq * file under the casp3 directory */ int matchedResiduesIdentical( AlignSummary *templateSeqFromPdb, /* called with: the alignsummary * version of the sequence, * possibly with inserts added */ char *seqReadFromPdb); /* called with: sequence as it * was originally read from PDB */ int targetSeqConsistencyCheck( char *targetName, /* called with: name of the target seq */ AlignSummary *candidate, /* called with: pointer to the candidate * alignment */ char *checkTargetFilename); /* called with: filename of uncorrupted * target seq */ /* * CODE */ /* * getTargetSeqPathname - get the pathname of the .seq file */ void getTargetSeqPathname( char *officialTargetName, /* called with: name of the target seq */ char *targetSeqFilename) /* return with: pathname of the seq * file under the casp3 directory */ { int ii, jj; int length; int trailingZero; char unofficialTargetName[MAX_LABEL_LENGTH + 1]; /* * Take the target name, remove the padding zeros, and convert * the alphabetic letters to lowercase. For example, T0047 becomes * t47, but T0060 becomes t60. */ strcpy(unofficialTargetName, officialTargetName); trailingZero = TRUE; for (ii = strlen(unofficialTargetName) - 1; ii >= 0; ii--) { if (unofficialTargetName[ii] != '0') { trailingZero = FALSE; if (isalpha(unofficialTargetName[ii])) { unofficialTargetName[ii] = tolower(unofficialTargetName[ii]); } } else { /* * Found a padding (non-trailing) zero. Copy all characters * from strlen to ii backwards by one position. Note that * by extending this copy to strlen, the '\0' will be copied * with everything else. */ if (!trailingZero) { for (jj = ii; jj < strlen(unofficialTargetName); jj++) { unofficialTargetName[jj] = unofficialTargetName[jj+1]; } } } } /* * Build a pathname starting with the prefix of the experiment * directory. Append the unofficial target name, directory delimiter, * unofficial target name again, and suffix ".seq". This will yield * //.seq */ sprintf(targetSeqFilename, "%s/%s/%s.seq", TARGET_PATHNAME_PREFIX, unofficialTargetName, unofficialTargetName); } /* * checkArguments - check that all the required arguments were given * * Return value: TRUE if all the required arguments were specified, * FALSE otherwise */ int checkArguments( cmdlineInfo *parameters) /* called with: the command line * parameters, as parsed by parseCmdline. */ { int is_okay; if (parameters->candidateA2mName != NULL && parameters->templateName != NULL) { is_okay = TRUE; } else { is_okay = FALSE; fprintf(stderr, "Error: missing required argument(s) "); if (parameters->candidateA2mName == NULL) { fprintf(stderr, "%s ", CANDIDATE_STRING); } if (parameters->templateName == NULL) { fprintf(stderr, "%s ", TEMPLATE_STRING); } fprintf(stderr, "\n"); } return(is_okay); } /* * targetSeqConsistencyCheck - check the target sequence against the * copy of the sequence in the casp3 * directory. * * Return value: TRUE if the sequences are the same, FALSE if they * are different or if the consistency check blew up. */ int targetSeqConsistencyCheck( char *targetName, /* called with: name of the target seq */ AlignSummary *candidate, /* called with: pointer to the candidate * alignment */ char *checkTargetFilename) /* called with: name of the uncorrupted * target sequence file */ { /* char targetSeqFilename[PATHNAME_LENGTH]; */ AlignSummary *targetSeqOnly = NULL; int isOkay = FALSE; char *buffer = NULL; /* getTargetSeqPathname(targetName, targetSeqFilename); * isOkay = readAlignment(targetSeqFilename, &buffer, &targetName, * SINGLE_SEQ, &targetSeqOnly); */ isOkay = readAlignment(checkTargetFilename, &buffer, &targetName, SINGLE_SEQ, &targetSeqOnly); if (!isOkay) { fprintf(stderr, "Error cannot find file %s for target consistency check\n", checkTargetFilename); } else if (findSequenceIndex(targetName, targetSeqOnly) == NOT_FOUND) { fprintf(stderr, "Error: cannot perform target consistency check, %s not found in %s\n", targetName, checkTargetFilename); } else { if (!sequencesIdentical(candidate, targetName, targetSeqOnly, targetName)) { fprintf(stderr, "Error: target sequence %s differs between %s and %s\n", targetName, candidate->filename, checkTargetFilename); } else { isOkay = TRUE; } } if (targetSeqOnly != NULL) freeAlignSummary(targetSeqOnly); return(isOkay); } /* * matchedResiduesIdentical - check that the matched residues in * the specified alignment are the same as the residues in * the specified sequence, that any changes made to the alignment * result in added insert states only. * * Return value: TRUE if the sequence of match states are the same, * FALSE otherwise. */ int matchedResiduesIdentical( AlignSummary *templateSeqFromPdb, /* called with: the alignsummary * version of the sequence, * possibly with inserts added */ char *seqReadFromPdb) /* called with: sequence as it * was originally read from PDB */ { int matchedResiduesSame = TRUE; int ii; int seqIndex; seqIndex = 0; for (ii = 0; matchedResiduesSame && ii < templateSeqFromPdb->rows; ii++) { if (isupper(templateSeqFromPdb->sequence[ii][0])) { if (templateSeqFromPdb->sequence[ii][0] != seqReadFromPdb[seqIndex]) { matchedResiduesSame = FALSE; } seqIndex++; } } return(matchedResiduesSame); } /* * printALRecord - print the specified alignment in CASP3 AL format * * Print the alignment to STDOUT in a long list of AL records. For each * target residue aligned to the template sequence, specify the target * residue by amino acid and sequence position and the template residue * by amino acid and PDB number. */ void printALRecord( AlignSummary *candidate, /* called with: the alignment to * convert. */ AlignSummary *templateSeq, /* called with: the template sequence as * read from PDB, with a direct correspondence * between match states and PDB numbers */ char **pdbNumberList) /* called with: list of PDB numbers */ { int templateSeqIndex, targetSeqIndex; int pdbNumberIndex, templateResidueIndex, targetResidueIndex; int ii; char chain; /* * first, figure out the row indices of the target and template * sequences in the candidate alignment */ templateSeqIndex = findSequenceIndex(templateSeq->label[0], candidate); assert(templateSeqIndex >= 0); if (templateSeqIndex == 0) { targetSeqIndex = 1; } else { targetSeqIndex = 0; } /* * Print out the header info */ printf("MODEL 1\n"); if (strlen(templateSeq->label[0]) > 4) { chain = templateSeq->label[0][4]; templateSeq->label[0][4] = '\0'; printf("PARENT %s_%c\n", templateSeq->label[0], chain); } else { printf("PARENT %s\n", templateSeq->label[0]); } /* * For each column in the candidate alignment, a target residue is * aligned to a candidate residue, print the amino acid and sequence * index of that target residue. Print the amino acid of the * candidate residue, and using the templateSeq alignment as a * reference, print the corresponding PDB number. If at any point * we align to a residue that has no PDB number, print a warning * message but print the residue data anyway. * * Note that the target residue counter starts at 1 while the * others start at 0. That's because the format expects the * first target residue to be indexed at 1. */ pdbNumberIndex = templateResidueIndex = 0; targetResidueIndex = 1; for (ii = 0; ii < candidate->columns; ii++) { if (isupper(candidate->sequence[ii][targetSeqIndex]) && isupper(candidate->sequence[ii][templateSeqIndex])) { printf("%c %d %c %s\n", candidate->sequence[ii][targetSeqIndex], targetResidueIndex, candidate->sequence[ii][templateSeqIndex], pdbNumberList[pdbNumberIndex]); } if (isalpha(candidate->sequence[ii][targetSeqIndex])) { targetResidueIndex++; } if (isalpha(candidate->sequence[ii][templateSeqIndex])) { if (toupper(candidate->sequence[ii][templateSeqIndex]) != templateSeq->sequence[templateResidueIndex][0]) { printf("candidate position %d pdbnum %s residue %c, template %d residue |%c|\n", ii, pdbNumberList[pdbNumberIndex], candidate->sequence[ii][templateSeqIndex], templateResidueIndex, templateSeq->sequence[templateResidueIndex][0]); } assert(toupper(candidate->sequence[ii][templateSeqIndex]) == templateSeq->sequence[templateResidueIndex][0]); if (isupper(templateSeq->sequence[templateResidueIndex][0])) { pdbNumberIndex++; } templateResidueIndex++; } } printf("TER\n"); } enum { CHECK_TARGET_CONSISTENCY = 0 }; int main(int argc, char *argv[]) { int isOkay; cmdlineInfo parameters; char *seqReadFromPdb; char **pdbNumberList; int pdbListSize; AlignSummary *candidate; AlignSummary *templateSeqFromPdb; /* * Parse the command line and read the candidate alignment, the one * to be used in the prediction. The user must specify the template * name on the command line, but doesn't have to supply the target * name. If the alignment is pairwise, we'll figure out the * target name. If we still don't know the target name after * reading the alignment, it means that the alignment wasn't * pairwise and we couldn't deduce it; the user will have to * specify it. */ if (parseCmdline(argc, argv, NULL, NULL, &checkArguments, ¶meters)) { isOkay = readAlignment(parameters.candidateA2mName, ¶meters.templateName, ¶meters.targetName, PAIRWISE_ALIGN, &candidate); if (!isOkay) exit(-1); if (parameters.targetName == NULL) { fprintf(stderr, "Error: cannot deduce the target from the alignment\n"); exit(-1); } /* * Integrity check #1: make sure that we haven't damaged the * target sequence by comparing it to the sequence in * pce/casp3//.seq */ if (parameters.checkTargetFilename != NULL && !targetSeqConsistencyCheck(parameters.targetName, candidate, parameters.checkTargetFilename)) { fprintf(stderr, "Target sequence consistency check failed\n"); exit(-1); } /* * Read the pdb summary file for the template sequence. * The file will be read into a special "alignment" in which * one sequence is the template sequence as seen in the PDB * SEQRES records and the other is the string of PDB numbers * as represeted in the SEQSEQ record. */ isOkay = getPdbsumAlignment(parameters.templateName, &pdbNumberList, &pdbListSize, &seqReadFromPdb, &templateSeqFromPdb); if (!isOkay) { fprintf(stderr, "Cannot translate alignment positions to PDB numbers\n"); } else { if (!sequencesIdentical(candidate, parameters.templateName, templateSeqFromPdb, parameters.templateName) ) { fprintf(stderr, "Warning: reconciling two different versions of sequence %s\n", parameters.templateName); alignSequencesReviseAlignments(templateSeqFromPdb, candidate, parameters.templateName, parameters.templateName); /* * Integrity check #2: I believe that at this point, any * residues added to templateSeqFromPdb should be inserts, * so there should be a 1 to 1 correspondence between * match characters in templateSeqFromPdb and the PDB numbers. * Verify this by comparing the sequence of match characters * to the sequence as it was read from PDB. */ if (!matchedResiduesIdentical(templateSeqFromPdb, seqReadFromPdb)) { fprintf(stderr, "Error: template sequence changed beyond recognition\n"); isOkay = FALSE; } } /* * If the user asked for a working version of the candidate * alignment, the one we have when any sequences differences * are resolved, print it to the specified file. */ if (parameters.workingCandidateFileName != NULL) { printAlign(candidate, parameters.workingCandidateFileName); } if (isOkay) { printALRecord(candidate, templateSeqFromPdb, pdbNumberList); } } } if (isOkay) { return(0); } else { return(-1); } }