/* * al2fasta - convert a CASP3 AL format alignment back to FASTA * * Update history * 6/9/98 cline author */ #include #include #include #include #include "alignsummary.h" #include "compute_shift.h" #include "pdbsummary.h" /* * These #defines tell readAlignment how small an alignment we're * interested in reading. */ #define SINGLE_SEQ 1 /* * We don't expect a target name to be more than 5 characters long, * since they only show a 4 digit number on their web page. As for the * template name, they have a PDB ID of 4 characters plus an optional * chain of 1 character, and a terminating null character of course. */ #define MAX_TARGET_NAME_LENGTH 6 #define MAX_TEMPLATE_NAME_LENGTH 6 /* * Prefix for the pathname of the target sequence files */ #define TARGET_PATHNAME_PREFIX "/projects/compbio/experiments/casp3/" /* * #defines for some of the recognizable strings in a CASP3 prediction */ #define CASP3_TARGET_STRING "TARGET" #define CASP3_REMARK_STRING "REMARK" #define CASP3_PARENT_STRING "PARENT" #define CASP3_TER_STRING "TER" /* * define some rows in the output alignment for the target and template * sequences */ #define TARGET_ROW 0 #define TEMPLATE_ROW 1 /* * below are a couple #defines used in command line parsing */ #define OUTPUT_STRING "-output" #define TEMPLATE_NAME_STRING "-template" /* * PROTOTYPES */ void addToAlignment( int targetSeqStart, /* called with: index in the target * sequence that we start copying from. */ int targetSeqEnd, /* called with: last index position to * copy in the target sequence */ int templateSeqStart, /* called with: index in the template * sequence that we start copying from */ int templateSeqEnd, /* called with: index in the template * sequence after which we stop copying */ alignedSequence *target, /* called with: structure describing how * the target aligns in the next argument */ alignedSequence *template, /* called with: structure describing how * the template sequence aligns in the * next argument */ AlignSummary *alignWithTarget, /* called with: alignment containing * the target sequence */ AlignSummary *alignWithTemplate, /* called with: an alignment containing * the template sequence */ int isMatch, /* TRUE if the residues are to be added * as matched residues, FALSE if they're * to be added as inserts */ int *alignColumnIdx, /* called with: current column in the * alignment. Return with: updated * current column, when we're done here */ AlignSummary *alignTargetTemplate); /* return with: updated alignment * of the target and template seqs, * reflecting this new segment */ void emptyAlignSummary( char *targetName, /* called with: name of the target * sequence */ char *templateName, /* called with: name of the template * sequence */ int targetLength, /* called with: length of the target * sequence */ int templateLength, /* called with: length of the template * sequence */ AlignSummary **newAlign); /* return with: pointer to a pointer to * the new AlignSummary struct. */ int getTargetAlignment( char *targetName, /* called with: name of the * target sequence */ char *checkTargetFileName, /* file name of file with target sequence in * a2m format */ AlignSummary **targetSeq); /* return with: pointer^2 to an * AlignSummary representation of the * target sequence */ int getTargetName( char *targetName); /* called with: empty string array, * allocated to the max target size. * return with: completed target name */ void getTargetSeqPathname( char *targetName, /* called with: name of the target seq */ char *targetSeqFilename); /* return with: pathname of the seq * file under the casp3 directory */ int getTemplateName( char *templateName); /* called with: empty string array, * allocated to the max target size. * return with: completed target name */ int makeNewAlignment( AlignSummary *targetSeq, /* called with: alignment * containing only the target * seq */ AlignSummary *templateSeq, /* called with: alignment * containing only the template * seq */ alignedSequence *target, /* called with: structure * detailing the target sequence * within its alignment */ alignedSequence *template, /* called with: structure * detailing the template * sequence within its alignment */ char *targetName, /* called with: name of the * target sequence */ char *templateName, /* called with: name of the * template sequence */ char **pdbNumList, /* called with: list of the PDB * numbers aligning at each * position */ int pdbListSize, /* called with: the number of * PDB numbers in the list */ AlignSummary **alignTargetTemplate); /* return with: alignment of the * target and template seqs */ int pdbnumToIndex( char *thisPdbnum, /* called with: pdb number we're * looking for */ int sequenceLength, /* called with: length of the sequence, * also length of the pdbnum array */ char **pdbList); /* called with: list of the pdb * numbes, by index position */ /* * CODE */ /* * getTargetSeqPathname - get the pathname of the .seq file */ void getTargetSeqPathname( char *officialTargetName, /* called with: name of the target seq */ char *targetSeqFilename) /* return with: pathname of the seq * file under the casp3 directory */ { int ii, jj; int length; int trailingZero; char unofficialTargetName[MAX_LABEL_LENGTH + 1]; /* * Take the target name, remove the padding zeros, and convert * the alphabetic letters to lowercase. For example, T0047 becomes * t47, but T0060 becomes t60. */ strcpy(unofficialTargetName, officialTargetName); trailingZero = TRUE; for (ii = strlen(unofficialTargetName) - 1; ii >= 0; ii--) { if (unofficialTargetName[ii] != '0') { trailingZero = FALSE; if (isalpha(unofficialTargetName[ii])) { unofficialTargetName[ii] = tolower(unofficialTargetName[ii]); } } else { /* * Found a padding (non-trailing) zero. Copy all characters * from strlen to ii backwards by one position. Note that * by extending this copy to strlen, the '\0' will be copied * with everything else. */ if (!trailingZero) { for (jj = ii; jj < strlen(unofficialTargetName); jj++) { unofficialTargetName[jj] = unofficialTargetName[jj+1]; } } } } /* * Build a pathname starting with the prefix of the experiment * directory. Append the unofficial target name, directory delimiter, * unofficial target name again, and suffix ".seq". This will yield * //.seq */ sprintf(targetSeqFilename, "%s/%s/%s.seq", TARGET_PATHNAME_PREFIX, unofficialTargetName, unofficialTargetName); } /* * getTargetName - get the name of the target from the prediction * * Scan through the prediction until finding the target sequence * identifier. Change the leading T to lowercase, drop the excess * zeros, and return with the name. * * Return value: TRUE if we read the target name with no I/O error, * FALSE otherwise. */ int getTargetName( char *targetName) /* called with: empty string array, * allocated to the max target size. * return with: completed target name */ { int ii, jj; int targetLabelFound = FALSE; char input_buffer[INPUT_BUFFER_LENGTH]; int isOkay = TRUE; /* * scan down to the line containing the target seq name */ while (isOkay && !targetLabelFound) { if (feof(stdin)) { fprintf(stderr, "Error: waiting for %s, found EOF\n", CASP3_TARGET_STRING); isOkay = FALSE; } else { gets(input_buffer); if (strncasecmp(input_buffer, CASP3_TARGET_STRING, strlen(CASP3_TARGET_STRING)) == 0) { targetLabelFound = TRUE; } } } /* * pull the target name out of the line. The line will * be of the format "TARGET T00XX". */ if (isOkay) { for (ii = 0; !isspace(input_buffer[ii]); ii++); for (; isspace(input_buffer[ii]); ii++); for (jj = 0; ii < strlen(input_buffer); ii++ ) { if (isalnum(input_buffer[ii])) { targetName[jj] = input_buffer[ii]; jj++; } } targetName[jj] = '\0'; } return(isOkay); } /* * getTargetAlignment - get an "alignment" of the CASP3-released target seq * * Return value: TRUE if the file I/O and memory allocation went okay, * FALSE else. */ int getTargetAlignment( char *targetName, /* called with: name of the * target sequence */ char *checkTargetFileName, /* file name of file with target sequence in * a2m format */ AlignSummary **targetSeq) /* return with: pointer^2 to an * AlignSummary representation of the * target sequence */ { int isOkay = TRUE; char targetSeqFilename[PATHNAME_LENGTH]; char buffer[PATHNAME_LENGTH]; /* getTargetSeqPathname(targetName, targetSeqFilename); */ isOkay = readAlignment(checkTargetFileName, (char **) &buffer, &targetName, SINGLE_SEQ, targetSeq); return(isOkay); } /* * getTemplateName - get the name of the template structure from the * CASP3 prediction. * * Return value: TRUE if we found the name without an unexpected I/O * error, FALSE otherwise. */ int getTemplateName( char *templateName) /* called with: empty string array, * allocated to the max target size. * return with: completed target name */ { int templateLabelFound = FALSE; char input_buffer[INPUT_BUFFER_LENGTH]; int isOkay = TRUE; char *delimitPtr; int ii, jj; /* * The template is identified on the line with the keyword PARENT */ while (isOkay && !templateLabelFound) { if (feof(stdin)) { fprintf(stderr, "Error: EOF reached before template declaration\n"); isOkay = FALSE; } else { gets(input_buffer); if (strncasecmp(input_buffer, CASP3_PARENT_STRING, strlen(CASP3_PARENT_STRING)) == 0) { templateLabelFound = TRUE; } } } /* * The template line declaration will be of the form * "PARENT " where, if struct includes a chain * declaration, it is of the form XXXX_Y. Copy everything * but the underscore into the template name. */ if (isOkay) { for (ii = 0; !isspace(input_buffer[ii]); ii++); for (; isspace(input_buffer[ii]); ii++); jj = 0; for (; ii < strlen(input_buffer); ii++) { if (isalnum(input_buffer[ii])) { templateName[jj] = input_buffer[ii]; jj++; } templateName[jj] = '\0'; } } return(isOkay); } /* * emptyAlignSummary - make an empty AlignSummary structure to hold * the new alignment. * * This function creates an empty AlignSummary structure for the * two sequences to be aligned: target and template. We cannot know * how long the alignment will be, so instead we use an upper bound: * it won't be longer than the sum of the length of the two sequences. */ void emptyAlignSummary( char *targetName, /* called with: name of the target * sequence */ char *templateName, /* called with: name of the template * sequence */ int targetLength, /* called with: length of the target * sequence */ int templateLength, /* called with: length of the template * sequence */ AlignSummary **newAlign) /* return with: pointer to a pointer to * the new AlignSummary struct. */ { AlignSummary *alignTargetTemplate = NULL; int ii; alignTargetTemplate = malloc(sizeof(AlignSummary)); assert(alignTargetTemplate != NULL); alignTargetTemplate->columns = targetLength + templateLength; alignTargetTemplate->rows = 2; alignTargetTemplate->label = calloc(alignTargetTemplate->rows, sizeof(char *)); assert(alignTargetTemplate->label != NULL); alignTargetTemplate->label[TARGET_ROW] = strdup(targetName); alignTargetTemplate->label[TEMPLATE_ROW] = strdup(templateName); alignTargetTemplate->sequence = calloc(alignTargetTemplate->columns, sizeof(char *)); assert(alignTargetTemplate->sequence != NULL); for (ii = 0; ii < alignTargetTemplate->columns; ii++) { alignTargetTemplate->sequence[ii] = calloc(alignTargetTemplate->rows, sizeof(char)); assert(alignTargetTemplate->sequence[ii] != NULL); } *newAlign = alignTargetTemplate; } /* * pdbnumToIndex - given a PDB number, find the index position * it refers to * * This function takes a PDB number and finds which sequence * position has that PDB number. The index returned is in * the range 0:maxindex-1, not 1:maxindex. * * Return value: the index position with the specified PDB number, * if found. Else, it returns -1. */ int pdbnumToIndex( char *thisPdbnum, /* called with: pdb number we're * looking for */ int sequenceLength, /* called with: length of the sequence, * also length of the pdbnum array */ char **pdbList) /* called with: list of the pdb * numbes, by index position */ { int whereFound = -1; int ii; for (ii = 0; whereFound == -1 && ii < sequenceLength; ii++) { if (strcasecmp(thisPdbnum, pdbList[ii]) == 0) { whereFound = ii; } } return(whereFound); } /* * addToAlignment: copy the specified residues into the alignment, * in either insert states or in match states as indicated. * Return with an updated index on what alignment column we're on. */ void addToAlignment( int targetSeqStart, /* called with: index in the target * sequence that we start copying from. */ int targetSeqEnd, /* called with: last index position to * copy in the target sequence */ int templateSeqStart, /* called with: index in the template * sequence that we start copying from */ int templateSeqEnd, /* called with: index in the template * sequence after which we stop copying */ alignedSequence *target, /* called with: structure describing how * the target aligns in the next argument */ alignedSequence *template, /* called with: structure describing how * the template sequence aligns in the * next argument */ AlignSummary *alignWithTarget, /* called with: alignment containing * the target sequence */ AlignSummary *alignWithTemplate, /* called with: an alignment containing * the template sequence */ int isMatch, /* TRUE if the residues are to be added * as matched residues, FALSE if they're * to be added as inserts */ int *alignColumnIdx, /* called with: current column in the * alignment. Return with: updated * current column, when we're done here */ AlignSummary *alignTargetTemplate) /* return with: updated alignment * of the target and template seqs, * reflecting this new segment */ { int ii; int seqIndex; char nextResidue; int targetSegmentLength, templateSegmentLength; targetSegmentLength = targetSeqEnd - targetSeqStart + 1; templateSegmentLength = templateSeqEnd - templateSeqStart + 1; if (isMatch) assert(targetSegmentLength == templateSegmentLength); /* * copy the residues from the target sequence into row TARGET. */ for (seqIndex = 0, ii = targetSeqStart; ii <= targetSeqEnd; ii++, seqIndex++) { nextResidue = alignWithTarget->sequence[target->seq_to_column[ii]][target->index]; assert(isalpha(nextResidue)); if (isMatch) { alignTargetTemplate->sequence[*alignColumnIdx+seqIndex][TARGET_ROW] = toupper(nextResidue); } else { alignTargetTemplate->sequence[*alignColumnIdx+seqIndex][TARGET_ROW] = tolower(nextResidue); } } /* * If we're in insert mode, the two segments can be of different * lengths. If the target segment is shorter than the template * segment, we need to pad the target segment with dots to bring * it up to the same length as the template segment. */ if (!isMatch) { for (; seqIndex <= templateSegmentLength; seqIndex++, ii++) { alignTargetTemplate->sequence[*alignColumnIdx+seqIndex][TARGET_ROW] = '.'; } } /* * Repeat the whole operation for the template sequence. */ for (seqIndex = 0, ii = templateSeqStart; ii <= templateSeqEnd; ii++, seqIndex++) { nextResidue = alignWithTemplate->sequence[template->seq_to_column[ii]][template->index]; assert(isalpha(nextResidue)); if (isMatch) { alignTargetTemplate->sequence[*alignColumnIdx+seqIndex][TEMPLATE_ROW] = toupper(nextResidue); } else { alignTargetTemplate->sequence[*alignColumnIdx+seqIndex][TEMPLATE_ROW] = tolower(nextResidue); } } if (!isMatch) { for (; seqIndex <= targetSegmentLength; seqIndex++, ii++) { alignTargetTemplate->sequence[*alignColumnIdx+seqIndex][TEMPLATE_ROW] = '.'; } } if (targetSegmentLength > templateSegmentLength) { *alignColumnIdx += targetSegmentLength; } else { *alignColumnIdx += templateSegmentLength; } } /* * makeNewAlignment - make a new alignment between the target and * template sequences as specified by the AL record * * This function creates an alignment between the target and template * sequences as described by the AL records. It creates the alignment * by reading the next AL record, deteriming the next match column, and * either adding that match column or adding an insert region before the * match column. * * Return value: boolean indicating if the operation went okay. */ int makeNewAlignment( AlignSummary *targetSeq, /* called with: alignment * containing only the target * seq */ AlignSummary *templateSeq, /* called with: alignment * containing only the template * seq */ alignedSequence *target, /* called with: structure * detailing the target sequence * within its alignment */ alignedSequence *template, /* called with: structure * detailing the template * sequence within its alignment */ char *targetName, /* called with: name of the * target sequence */ char *templateName, /* called with: name of the * template sequence */ char **pdbNumList, /* called with: list of the PDB * numbers aligning at each * position */ int pdbListSize, /* called with: the number of * PDB numbers in the list */ AlignSummary **alignTargetTemplate) /* return with: alignment of the * target and template seqs */ { char input_buffer[INPUT_BUFFER_LENGTH]; int nextInsertColumnTarget, nextInsertColumnTemplate; int alignedColumnTarget, alignedColumnTemplate; char alignedColumnPdbNumber[PDB_NUMBER_LENGTH]; char alignedTargetResidue, alignedTemplateResidue; char testTargetResidue, testTemplateResidue; int alignColumnIndex; int testColumn; int isOkay = TRUE; /* * Step 1: create an empty structure to hold the alignment. We * can't predict the length of the alignment, but can assume that * it won't be longer than the combined lengths of the two sequences */ emptyAlignSummary(targetName, templateName, target->sequence_length, template->sequence_length, alignTargetTemplate); assert(*alignTargetTemplate != NULL); nextInsertColumnTarget = nextInsertColumnTemplate = 0; alignColumnIndex = 0; gets(input_buffer); /* * Step 2: read the next AL record. Skip over any REMARK records. * In the AL record, parse out the specification of the next pair * of residues aligned: target amino acid, target sequence index, * template amino acid, template pdb number. Do a quick comparison * of the target and template amino acids with the amino acids in * the corresponding sequence positions. Beware that the target * seq position is in the range 1:length when it rolls out of the * input file, NOT 0:length-1. * Once we reach a TER record, we're done. */ while (isOkay && !feof(stdin) && strncasecmp(input_buffer, CASP3_TER_STRING, strlen(CASP3_TER_STRING)) != 0) { if (strncasecmp(input_buffer, CASP3_REMARK_STRING, strlen(CASP3_REMARK_STRING)) == 0) { gets(input_buffer); continue; } sscanf(input_buffer, "%c %d %c %s", &alignedTargetResidue, &alignedColumnTarget, &alignedTemplateResidue, alignedColumnPdbNumber); alignedColumnTarget--; alignedColumnTemplate = pdbnumToIndex(alignedColumnPdbNumber, pdbListSize, pdbNumList); testColumn = target->seq_to_column[alignedColumnTarget]; testTargetResidue = targetSeq->sequence[testColumn][target->index]; assert(toupper(alignedTargetResidue) == toupper(testTargetResidue)); testColumn = template->seq_to_column[alignedColumnTemplate]; testTemplateResidue = templateSeq->sequence[testColumn][template->index]; assert(toupper(alignedTemplateResidue) == toupper(testTemplateResidue)); /* * Step 3: Given the index positions of the next pair of * aligning residues, add to the alignment any residues before * this pair and since the last match or beginning of alignment. */ addToAlignment(nextInsertColumnTarget, alignedColumnTarget - 1, nextInsertColumnTemplate, alignedColumnTemplate - 1, target, template, targetSeq, templateSeq, FALSE, &alignColumnIndex, *alignTargetTemplate); /* * Step 4: with the insert done, add the matching column to the * alignment. */ addToAlignment(alignedColumnTarget, alignedColumnTarget, alignedColumnTemplate, alignedColumnTemplate, target, template, targetSeq, templateSeq, TRUE, &alignColumnIndex, *alignTargetTemplate); nextInsertColumnTarget = alignedColumnTarget+1; nextInsertColumnTemplate = alignedColumnTemplate+1; gets(input_buffer); } /* * Step 5: add any remaining residues to the end of the alignment * as an insert. Note that we're specifying columns in the range * 0:length-1, not 1:length. */ addToAlignment(nextInsertColumnTarget, target->sequence_length-1, nextInsertColumnTemplate, template->sequence_length-1, target, template, targetSeq, templateSeq, FALSE, &alignColumnIndex, *alignTargetTemplate); (*alignTargetTemplate)->columns = alignColumnIndex + 1; return(isOkay); } int main(int argc, char *argv[]) { AlignSummary *templateSeq = NULL; AlignSummary *targetSeq = NULL; AlignSummary *alignTargetTemplate = NULL; char *seqReadFromPdb; char **pdbNumList; int pdbListSize; alignedSequence *target = NULL; alignedSequence *template = NULL; int isOkay = TRUE; int ii; char templateName[MAX_TEMPLATE_NAME_LENGTH] = ""; char targetName[MAX_TARGET_NAME_LENGTH] = ""; char *outputFileName = ""; char *desiredTemplateName = ""; char *checkTargetFileName = ""; if (argc == 1) { fprintf(stderr, "Usage: %s <%s output_align_name> [%s template_name] %s target_seq_filename %s\n", argv[0], OUTPUT_STRING, TEMPLATE_NAME_STRING, CHECKTARGET_STRING, "< "); exit(-1); } else { for (ii = 1; ii < argc; ii++) { if (strncasecmp(argv[ii], OUTPUT_STRING, strlen(argv[ii])) == 0) { outputFileName = argv[ii+1]; ii++; } if (strncasecmp(argv[ii], TEMPLATE_NAME_STRING, strlen(argv[ii])) == 0) { desiredTemplateName = argv[ii+1]; ii++; } if (strncasecmp(argv[ii], CHECKTARGET_STRING, strlen(argv[ii])) == 0) { checkTargetFileName = argv[ii+1]; ii++; } } /* * First, parse the target name out of the AL file. Next, read * the target sequence from checkTargetFileName and build * an alignedSequence struct for the target sequence. */ isOkay = getTargetName(targetName); if (isOkay) isOkay = getTargetAlignment(targetName, checkTargetFileName, &targetSeq); if (isOkay) { target = alignedSequenceStruct(targetSeq, targetName); if (target == NULL) isOkay = FALSE; } /* * If the template name was given, read down to the AL record for * the desired template name. Else, parse the template name * out of the AL record. Finally, use the pdb summary tool to get * the PDB guide sequence and the list of PDB numbers at each guide * sequence position. */ if (isOkay) do { isOkay = getTemplateName(templateName); } while (isOkay && (strncasecmp(desiredTemplateName, templateName, strlen(desiredTemplateName)) != 0)); if (isOkay) { isOkay = getPdbsumAlignment(templateName, &pdbNumList, &pdbListSize, &seqReadFromPdb, &templateSeq); if (isOkay) { template = alignedSequenceStruct(templateSeq, templateName); if (template == NULL) isOkay = FALSE; } } if (isOkay) isOkay = makeNewAlignment(targetSeq, templateSeq, target, template, targetName, templateName, pdbNumList, pdbListSize, &alignTargetTemplate); if (isOkay) printAlign(alignTargetTemplate, outputFileName); if (target != NULL) freeAlignedSequence(target); if (template != NULL) freeAlignedSequence(template); if (targetSeq != NULL) freeAlignSummary(targetSeq); if (templateSeq != NULL) freeAlignSummary(templateSeq); if (alignTargetTemplate != NULL) freeAlignSummary(alignTargetTemplate); } if (isOkay) { return(0); } else { return(-1); } }