/* * alignsummary.c - contains functions to process the alignSummay struct. * * This module contains functions to create, delete, and process the * alignSummary structure. The global entry points to this module are * as follows: * alignedSequenceStruct * alignSequencesReviseAlignments * findSequenceIndex * freeAlignSummary * printAlign * readAlignment * readReferenceCandidate * sequencesIdentical * * Modifications: * 12/9/97 cline author, moved from measure_shift.c * 2/5/98 cline added readReferenceCandidate * 2/11/98 added removeAlignmentColumn * added restoreAlignmentColumn * 2/16/98 replaced getTemplate and getTarget with * alignedSequenceStruct * 3/5/98 karplus added echo of alignment summary info to stderr * Tried to fix bug and simplify fixupColumns, * but the code is so wrong it may be hopeless. * Threw it out and did the conversion correctly * in copyFromSeqNodeToAlignSeq. * Eliminated MaxSeqLength * Added count_match * 3/8/98 karplus Changed strcmp to strcasecmp so that names needn't * be converted to uppercase. * 3/9/98 karplus Fixed my bugs in copyFromSeqNodeToAlignSeq. * 3/10/98 cline moved printFinalA2m from a2mtrim, renamed as * printAlign. * 4/7/98 cline Fixed a bug in sequencesIdentical that was causing * residues tacked on the end of one sequence to be * missed. Fixed readReferenceCandidate to check * for the presence of the template and target * residues in the two sequences before erroneously * assuming that they're there. * 4/16/98 cline Fixed a bug in the error message that appears * when the alignment doesn't have the minumum * number of sequences to be interestint; stopped * the core dump. * 4/20/98 cline Revised readAlignment and readReferenceCandidate * to reflect the new handling of the command line * parameters. * 4/21/98 cline Added functionality to handle gzipped alignments * 5/20/98 cline Removed bug that caused files to be left on /tmp * after the program was run. * 5/22/98 cline Trying again to fix the bug of 5/20. * 5/26/98 cline Back again on this problem of temp file removal, * renamed the temp files to make them distinct * and traceable to a set of parameters. * 5/27/98 cline Discovered the problem of the temp file removal. * The files were not being removed because the * parent program was dumping core. This seems to * be due to a memory management bug. Changing the * strcpy in nextSequence to strncpy seems to work * around the problem. * 6/5/98 cline Removed the unused isTemplate argument from * alignSequencesReviseAlignments. * 6/8/98 cline Added functionality in readNextSequence to * remove whitespace added to the string for * readability. * 6/9/98 cline added function readStringFromFile */ #include #include #include #include #include "alignsummary.h" /* * #DEFINES */ #define BUFSIZE 4095 #define LABEL_MARKER '>' /* * TYEPDEFS * /* * the aligned sequence data is initially read into a linked list, * a list of seqNode structures. This is a convenient structure to * read the data into until we know details such as the number of * sequences and their maximum length, and can copy the whole * thing into the AlignSummary struct. */ typedef struct seqData{ char label[MAX_LABEL_LENGTH]; /* sequence label */ char *sequence; /* the aligned sequence itself */ struct seqData *next; /* next node in the list */ } seqNode; /* * PROTOTYPES FOR LOCAL FUNCTIONS */ int count_match(AlignSummary *align); /* count the number of match columns in the first sequence of alignment */ void addResidueToAlignedSeq( AlignSummary *align, /* called with: the alignment to * modify */ int sequenceToModify, /* called with: index of the sequence * to be modified */ int beforeSeqPosition, /* called with: the sequence position * (in sequenceToModify) before which * the new column should be added */ char newResidue); /* called with: the new residue to * add at this position. */ int copyFromSeqNodeToAlignedSeq( seqNode *seqList, /* called with: head of the linked list * of seqNode structures */ int numberSequences, /* called with: number of sequences in the * structure */ AlignSummary *thisAlign); /* called with: target data structure, * allocated but with empty alignment fields. * return with: structure with multiple * alignment fields filled in */ AlignSummary *getAlignSummary( char *a2mFilename, /* called with: filename of the * alignment file */ char *templateName, /* called with: name of the template * sequence */ char *targetName); /* called with: anme of the target * sequence */ int getMultipleAlignment( char *filename, /* called with: name of the file * containing the alignment data */ int minNumberAlignedSeqs, /* the smallest number of aligned * sequences expected by the caller */ AlignSummary *thisAlign); /* called with: pointer to a struct * already allocated but with no alignment. * return with: struct with the * alignment fields filled in */ int getSequenceAlignment( AlignSummary *align1, /* called with: first alignment */ AlignSummary *align2, /* called with: second alignment */ int sequenceIndex1, /* called with: index of the sequence * of interest within the first * alignment */ int sequenceIndex2, /* called with: index of the sequence * of interest within the second * alignment */ char *structName, /* called with: name of the structure * of interest */ AlignSummary *sequenceAlignment); /* return with: pointer to the * alignment of the specified * sequences */ int inputs_okay( int argc, char *argv[]); int hasSuffix(char *filename, char *suffix); void makeSequenceAlignment( FILE *alignFile, AlignSummary *align1, int templateSequenceIndex1, AlignSummary *align2, int templateSequenceIndex2); void makeSequenceFile( FILE *alignFile, AlignSummary *align1, int templateSequenceIndex1); int readMultipleAlignment( char *filename, /* called with: name of the file with the * alignment data */ int minNumberAlignedSeqs, /* called with: smallest number of * aligned sequences we could return * with, and still have an alignment * of interest to the caller */ seqNode **seqList, /* return with: a linked list * representation of the sequence data */ int *numberSequences); /* return with: number of sequences read */ int readNextSeq( FILE *fp, /* called with: the stream with the sequence * data, open for reading */ char *filename, /* called with: name of the file, used in * error messages */ int seqNumber, /* called with; what sequence we're on, used * in error messages */ seqNode *nextSeq); /* called with: allocated but empty seqNode structure. * Return with: structure with fields filled in. */ void writeSequenceToFile( FILE *seqFile, /* called with: file to write to */ AlignSummary *align, /* called with: alignment containing * the sequence of interest */ int sequenceIndex); /* called with: row index of the * sequence of interest within * the alignment */ /* * CODE */ /* * hasSuffix - determine if the specified filename has the specified suffix */ int hasSuffix(char *filename, char *suffix) { int ii; int suffixLength, filenameLength; int isSuffix = TRUE; filenameLength = strlen(filename); suffixLength = strlen(suffix); if (filenameLength < suffixLength) isSuffix = FALSE; for (ii = 0; ii < suffixLength && isSuffix; ii++) { if (toupper(filename[filenameLength - suffixLength + ii]) != toupper(suffix[ii])) { isSuffix = FALSE; } } return(isSuffix); } /* count_match * count the number of match columns in the first sequence of alignment */ int count_match(AlignSummary *align) { int num_match; int col; char c; if (!align || align->rows==0) return 0; num_match=0; for (col=0; colcolumns; col++) { c = align->sequence[col][0]; if (isupper(c) || c=='-') num_match++; } return num_match; } /* * alignedSequenceStruct - get all the gory information relating * to the specified aligned sequence * in the alignment. * * This function allocates and fills in the alignedSequence * struct for the specified sequence in the specified alignment. * If the sequnce is not found, it fills in the index as NOT_ALIGNED * and returns the otherwise empty structure. * * Return value - a pointer to the structure if all the mallocs and * everything went okay, else NULL. */ alignedSequence *alignedSequenceStruct( AlignSummary *alignment, /* called with: the alignment in question */ char *sequenceName) /* called with: name of the sequence * in question */ { alignedSequence *sequenceData = NULL; int ii; int sequenceIndex, residueIndex; /* * first off, the overhead. Allocate the structure and its * various substructures. Look to see if this sequence is * even part of this alignment. */ sequenceIndex = findSequenceIndex(sequenceName, alignment); if (sequenceIndex == NOT_ALIGNED) { fprintf(stderr, "Error: sequence %s not found in alignment %s\n", sequenceName, alignment->filename); } else { sequenceData = malloc(sizeof(alignedSequence)); assert(sequenceData != NULL); sequenceData->index = sequenceIndex; sequenceData->seq_to_column = calloc(alignment->columns, sizeof(int)); assert(sequenceData->seq_to_column != NULL); sequenceData->column_to_seq = calloc(alignment->columns, sizeof(int)); assert(sequenceData->column_to_seq != NULL); /* * Now we begin with the two arrays to reference the alignment * data. This alignment structure is a big matrix containing * inserts and deletions as well as matching columns. For any * given residue in the sequence, we want something that tells * us where it is in the alignment. For any given residue in * the alignment, we want to know which position it occupies in * the sequence. */ residueIndex = 0; for (ii = 0; ii < alignment->columns; ii++) { if (isalpha(alignment->sequence[ii][sequenceData->index])) { sequenceData->seq_to_column[residueIndex] = ii; sequenceData->column_to_seq[ii] = residueIndex; residueIndex++; } else { sequenceData->column_to_seq[ii] = NOTHING_ALIGNED; } } sequenceData->sequence_length = residueIndex; } return(sequenceData); } /* * freeAlignSummary - free the align summary structure and its * substructures. */ void freeAlignSummary( AlignSummary *alignData) /* called with: structure to be freed */ { int ii; for (ii = 0; ii < alignData->rows; ii++) free(alignData->label[ii]); for (ii = 0; ii < alignData->columns; ii++) free(alignData->sequence[ii]); free(alignData->sequence); free(alignData); } /* * freeAlignedSequence - free an aligned sequence struct */ void freeAlignedSequence( alignedSequence *sequenceStruct) /* called with: a pointer * to an allocated * alignedSequence struct. */ { if (sequenceStruct != NULL) { free(sequenceStruct->seq_to_column); free(sequenceStruct->column_to_seq); free(sequenceStruct); } } /* * sequencesIndentical - check if two sequences are the same in the two * alignments. * * This function checks to see whether or not the two versions of the * sequence from the two alignments are the same. It checks all * amino acids, and if the amino acid sequence is different, it returns * FALSE. If the amino acid sequence is the same, it returns TRUE. It * doesn't matter if the characters are upper or lowercase, or * interspersed with inserts and so forth. */ int sequencesIdentical( AlignSummary *align1, /* called with: first alignment */ char *name1, /* called with: name of the * sequence of interest */ AlignSummary *align2, /* called with: second alignment */ char *name2) /* called with: name of the * sequence of interest */ { int identical = TRUE; int ii; int alignIndex1, alignIndex2; int seqlength1, seqlength2; int index1, index2; index1 = findSequenceIndex(name1, align1); if (index1 ==NOT_ALIGNED ) { fprintf(stderr, "Error: can't find sequence named %s in %s\n", name1, align1->filename); } assert(index1 >= 0); index2 = findSequenceIndex(name2, align2); if (index2 ==NOT_ALIGNED ) { fprintf(stderr, "Error: can't find sequence named %s in %s\n", name2, align2->filename); } assert(index2 >= 0); seqlength1 = seqlength2 = 0; for (alignIndex1 = alignIndex2 = 0; identical && alignIndex1 < align1->columns && alignIndex2 < align2->columns; alignIndex1++, alignIndex2++) { /* * In both alignment 1 and alignment 2, advance to either the * next residue or the end of the alignment, whichever comes * first. */ while ((alignIndex1 < align1->columns) && (!isalpha(align1->sequence[alignIndex1][index1]))) { alignIndex1++; } while ((alignIndex2 < align2->columns) && (!isalpha(align2->sequence[alignIndex2][index2]))) { alignIndex2++; } /* * After the while loops above, both sequence indices will * either be at a residue or past the end of the alignment. * If they're both at a residue, check whether the residues * are the same. */ if (alignIndex1 < align1->columns && alignIndex2 < align2->columns) { if (toupper(align1->sequence[alignIndex1][index1]) != toupper(align2->sequence[alignIndex2][index2])) { identical = FALSE; } seqlength1++; seqlength2++; } else if (alignIndex1 < align1->columns) { identical = FALSE; } else if (alignIndex2 < align2->columns) { identical = FALSE; } } /* * At this point, either we've found differences in the sequences, * or we've reached the end of one of the alignments. If we haven't * found any differences in the sequences yet, make sure that we * don't have a fe residues tacked on the end of one sequence but * not the other. Check for this by seeing if there are any * residues left in either sequence. */ if (identical) { while (alignIndex1 < align1->columns) { if (isalpha(align1->sequence[alignIndex1][index1])) { identical = FALSE; } alignIndex1++; } while (alignIndex2 < align2->columns) { if (isalpha(align2->sequence[alignIndex2][index2])) { identical = FALSE; } alignIndex2++; } } return(identical); } /* * findSequenceIndex - find the index of the specified sequence * in the alignment. * * This function looks through the labels of the specified alignment * and looks for the specified sequence label. If it finds it, it * returns its row index. Otherwise, it returns NOT_FOUND. */ int findSequenceIndex( char *target_label, /* called with: the label we're * looking for */ AlignSummary *align) /* called with: where we're * looking for it */ { int ii; int target_index = NOT_FOUND; for (ii = 0; ii < align->rows && target_index == NOT_FOUND; ii++) { if (strcasecmp(target_label, align->label[ii]) == 0) { target_index = ii; } } return(target_index); } /* * writeSequenceToFile - write the sequence data of the specified * sequence to a .seq or .a2m file. * * This function writes out the sequence data from the indicated * aligned sequence to the specified file. The sequence data it * writes out are the residues only, uppercased. Note that this * is not the same as the aligned sequence. */ void writeSequenceToFile( FILE *seqFile, /* called with: file to write to */ AlignSummary *align, /* called with: alignment containing * the sequence of interest */ int sequenceIndex) /* called with: row index of the * sequence of interest within * the alignment */ { int ii; fprintf(seqFile, ">%s1\n", align->label[sequenceIndex]); for (ii = 0; ii < align->columns; ii++) { if (isalpha(align->sequence[ii][sequenceIndex])) { fputc(toupper(align->sequence[ii][sequenceIndex]), seqFile); } } fputc('\n', seqFile); } /* * makeSequenceFile - to start the alignment process, make a file of * only one sequence, to which it and the other * sequence will be aligned. * * This function writes out the data for the starting point of * modelfromalign - a copy of one of the sequences. It doesn't * matter which sequence is used, as the other one should be * similar and should align to it easily. This function differs from * makeAlignFile in that it only records one of the two sequences; the * other function makes the database which will later be used for * aligning both sequences to the model. NOTE: modelfromalign won't * work on an .a2m file with a single sequence. To get around that, * write out two copies of the same sequence. */ void makeSequenceFile( FILE *seqFile, AlignSummary *align1, int sequenceIndex) { writeSequenceToFile(seqFile, align1, sequenceIndex); writeSequenceToFile(seqFile, align1, sequenceIndex); } /* * makeSequenceAlignment - make an input alignment of the two template * sequences. * * This routine creates a dump alignment of the two template sequences * by writing out the sequence data for each sequence in .a2m format. * This alignment is to be used as input by such things as modelfromalign. */ void makeSequenceAlignment( FILE *alignFile, AlignSummary *align1, int sequenceIndex1, AlignSummary *align2, int sequenceIndex2) { int ii; int seq1Length, seq2Length; writeSequenceToFile(alignFile, align1, sequenceIndex1); writeSequenceToFile(alignFile, align2, sequenceIndex2); } /* * getSequenceAlignment - get the realignment of the two sequences * * This function takes as input two alignments containing two different * versions of the same sequence, and returns an alignment of these two * different version to each other. * * return value: a boolean flag indicating if the operation was successful */ int getSequenceAlignment( AlignSummary *align1, /* called with: first alignment */ AlignSummary *align2, /* called with: second alignment */ int sequenceIndex1, /* called with: index of the sequence * of interest within the first * alignment */ int sequenceIndex2, /* called with: index of the sequence * of interest within the second * alignment */ char *structName, /* called with: name of the structure * of interest */ AlignSummary *sequenceAlignment) /* return with: pointer to the * alignment of the specified * sequences */ { FILE *alignFile; FILE *seqFile; char alignFilename[FILENAME_LENGTH + 1]; char seqFilename[FILENAME_LENGTH + 1]; char finalAlignFilename[FILENAME_LENGTH + 1]; char modFileName[FILENAME_LENGTH + 1]; char commandBuffer[COMMAND_LENGTH + 1]; int returnValue; /* * First step: build a sequence file containing only * one of the two sequences. Which one it is doesn't * matter. This will be a starting point for the alignment * process. */ sprintf(seqFilename, "/tmp/%s.init.a2m", structName); seqFile = fopen(seqFilename, "w"); assert(seqFile != NULL); makeSequenceFile(seqFile, align1, sequenceIndex1); fclose(seqFile); /* * Next step: build an alignment of the two template sequences. * This requires finding which sequences are the template sequences, * opening a file, and writing out the sequence data. */ sprintf(alignFilename, "/tmp/%s.%s.a2m", structName, structName); alignFile = fopen(alignFilename, "w"); assert(alignFile != NULL); makeSequenceAlignment(alignFile, align1, sequenceIndex1, align2, sequenceIndex2); fclose(alignFile); /* * construct and run the modelfromalign command */ sprintf(modFileName, "/tmp/%s.%s", structName, structName); sprintf(commandBuffer, "modelfromalign %s -alignfile %s", modFileName, seqFilename); strcat(modFileName, ".mod"); returnValue = system(commandBuffer); if (returnValue == 0) { /* * now, construct and run the align2model command. When done, * delete the alignments and models. */ sprintf(finalAlignFilename, "/tmp/%s.%s.final", structName, structName); sprintf(commandBuffer, "align2model %s -i %s -db %s > /dev/null", finalAlignFilename, modFileName, alignFilename); returnValue = system(commandBuffer); if (returnValue == 0) { strcat(finalAlignFilename, ".a2m"); returnValue = getMultipleAlignment(finalAlignFilename, 2, sequenceAlignment ); returnValue = unlink(finalAlignFilename); assert(returnValue == 0); } returnValue = unlink(modFileName); assert(returnValue == 0); } returnValue = unlink(alignFilename); assert(returnValue == 0); returnValue = unlink(seqFilename); assert(returnValue == 0); return(returnValue); } /* * addResidueToAlignedSeq - this function is called to add a residue to * an aligned sequence in the specified alignment. * * This function modifies an alignment by adding a residue to an aligned * sequence. In most cases, this results in a new column added to the * alignment. The particulars of what happens depend on the other sequence * of interest, and what if anything it has in the adjacent columns as the * column to be added: * - there's a deletion in that section of the sequence to be modified, and * an aligned residue in the other sequence. In that case, the new * residue in the sequence to be modified is aligned to the residue in the * other sequence. * - else, a new column is added to the alignment, and the other sequence * has a deletion in this column. */ void addResidueToAlignedSeq( AlignSummary *align, /* called with: the alignment to * modify */ int sequenceToModify, /* called with: index of the sequence * to be modified */ int beforeSeqPosition, /* called with: the sequence position * (in sequenceToModify) before which * the new column should be added */ char newResidue) /* called with: the new residue to * add at this position. */ { int seqPosition; int columnToAddBefore, columnToAddAfter; int ii; int columnFound; char **newData; /* * First, find what alignment columns this sequence position * corresponds to. As an exception case, if this sequence position * is at the end of the alignment, set the second marker for just * past the end of the alignment. */ columnToAddAfter = -1; for (seqPosition = 0; seqPosition < beforeSeqPosition; ) { columnToAddAfter++; if (isalpha(align->sequence[columnToAddAfter][sequenceToModify])) { seqPosition++; } } columnToAddBefore = columnToAddAfter; while (seqPosition <= beforeSeqPosition && columnToAddBefore < align->columns-1) { columnToAddBefore++; if (isalpha(align->sequence[columnToAddBefore][sequenceToModify])) { seqPosition++; } } if (columnToAddBefore == align->columns-1 && !isalpha(align->sequence[align->columns-1][sequenceToModify])) { columnToAddBefore = align->columns; } /* * It's time to add a column. This alignment is in an array * that's indexed first by column and then by row. The column field * is an array of pointers to all the row fields. So, we've got to * allocate a new column field, one column larger, and move all the * pieces of row data into it, keeping a spot empty for the new column. */ newData = calloc(align->columns+1, sizeof(char*)); assert(newData != NULL); for (ii = 0; ii <= columnToAddAfter; ii++) { newData[ii] = align->sequence[ii]; } newData[ii] = calloc(align->rows, sizeof(char)); for (; ii < align->columns; ii++) { newData[ii+1] = align->sequence[ii]; } free(align->sequence); align->sequence = newData; for (ii = 0; ii < align->rows; ii++) { align->sequence[columnToAddAfter+1][ii] = '-'; } align->sequence[columnToAddAfter+1][sequenceToModify] = newResidue; align->columns++; } /* * removeAlignmentColumn - remove a column from the specified alignment * * This function "removes" a column by converting it to an insert. All * aligned residues in the column are converted to inserts, */ void removeAlignmentColumn( AlignSummary *thisAlign, /* called with: structure detailing the * alignment to update */ int column) /* called with: column index for the * column to delete */ { int ii; for (ii = 0; ii < thisAlign->rows; ii++) { assert(isupper(thisAlign->sequence[column][ii]) || thisAlign->sequence[column][ii] == '-'); if (isupper(thisAlign->sequence[column][ii])) { thisAlign->sequence[column][ii] = tolower(thisAlign->sequence[column][ii]); } else { thisAlign->sequence[column][ii] = '.'; } } return; } /* * restoreAlignmentColumn - restore a column that was deleted previously * * This function "restores" a column that had been deleted previously * by converting it from an indel column to a match column. */ void restoreAlignmentColumn( AlignSummary *thisAlign, /* called with: structure detailing the * alignment to update */ int column) /* called with: column index for the * column to delete */ { int ii; for (ii = 0; ii < thisAlign->rows; ii++) { assert(islower(thisAlign->sequence[column][ii]) || thisAlign->sequence[column][ii] == '.'); if (islower(thisAlign->sequence[column][ii])) { thisAlign->sequence[column][ii] = toupper(thisAlign->sequence[column][ii]); } else { thisAlign->sequence[column][ii] = '-'; } } } /* * alignSequencesReviseAlignments - align the two copies of the one sequence * to each other, and insert any missing pieces into * the alignment. * * This function goes into the two alignments and looks at the two versions * of the specified sequence. If the two versions are different, it creates * an alignment of the two sequences. Then, using this alignment, it * identifies any regions missing from one of the input alignments. It adds * these regions to the input alignment as additional columns: the sequence * specified matches these columns, all others have a delete in that position. */ void alignSequencesReviseAlignments( AlignSummary *align1, /* called with: one of the two alignments */ AlignSummary *align2, /* called with: the other of the two alignments */ char *structName1, /* called with: name in the first alignment * of the sequence whose alignment is being * checked */ char *structName2) /* called with: name in the second alignment * of the sequence whose alignment is being * checked */ { AlignSummary *sameSeqAlignment; int column_index, seq_index; int is_okay = TRUE; int sequenceIndex1, sequenceIndex2; sequenceIndex1 = findSequenceIndex(structName1, align1); assert(sequenceIndex1 >= 0); sequenceIndex2 = findSequenceIndex(structName2, align2); assert(sequenceIndex2 >= 0); /* * First off, we want to know if the template sequences are * identical. If they are, we can shortcut this process * significantly. */ if (!sequencesIdentical(align1, structName1, align2, structName2)) { sameSeqAlignment = malloc(sizeof(AlignSummary)); assert(sameSeqAlignment != NULL); is_okay = getSequenceAlignment( align1, align2, sequenceIndex1, sequenceIndex2, structName1, sameSeqAlignment); assert(is_okay = TRUE); /* * Go through this alignment. Wherever the sequence in align1 has * positions missing from align2 (or vise versa), add the missing * positions to the alignment as new columns. The way the missing * positions will be noted is this. In the same-sequence alignment, * there will be three possibilities: * (1) both sequences will have the same residue. All is well, * we continue. * (2) Both sequence have residues, but they're different * residues. What's necessary then is to take the residue * from the first sequence and add it to the second, then * take the residue from the second sequence and add it to * the first. * (2) One seqeunce will insert or delete a residue relative * to the other sequence. In that case, one sequence will * have an alphabetic character, and the other won't. The * character will be inserted in whatever sequence doesn't * have it. */ seq_index = 0; for (column_index = 0; column_index < sameSeqAlignment->columns; column_index++) { if (isalpha(sameSeqAlignment->sequence[column_index][0]) && !isalpha(sameSeqAlignment->sequence[column_index][1])) { addResidueToAlignedSeq(align2, sequenceIndex2, seq_index, toupper(sameSeqAlignment->sequence[column_index][0])); } else if (isalpha(sameSeqAlignment->sequence[column_index][1]) && !isalpha(sameSeqAlignment->sequence[column_index][0])) { addResidueToAlignedSeq(align1, sequenceIndex1, seq_index, toupper(sameSeqAlignment->sequence[column_index][1])); } else if (toupper(sameSeqAlignment->sequence[column_index][0]) != toupper(sameSeqAlignment->sequence[column_index][1])) { addResidueToAlignedSeq(align2, sequenceIndex2, seq_index, toupper(sameSeqAlignment->sequence[column_index][0])); seq_index++; addResidueToAlignedSeq(align1, sequenceIndex1, seq_index, toupper(sameSeqAlignment->sequence[column_index][1])); } seq_index++; } freeAlignSummary(sameSeqAlignment); } return; } /* * readStringFromFile - read from the specified file a string of * unknown length, allocating memory as you go. * * This function reads and allocates memory for a string terminated by * the carriage return character. * * return value: TRUE if the read went okay, FALSE if there was an * an I/O or memory error. */ int readStringFromFile( FILE *fp, /* called with: the input file, open for * reading, and advanced to the beginning * of the string. */ char *filename, /* name of the file, for error reporting * purposes */ char **stringFromFile) /* return with: the string as read * from the file */ { int stringLength, oldStringLength; int readingString; char *oldString = NULL; char *newString = ""; char buffer[BUFSIZE]; char *crPtr; int is_okay = TRUE; readingString = TRUE; stringLength = 0; while (is_okay && readingString) { if (fgets(buffer, BUFSIZE, fp) == NULL) { fprintf(stderr, "Error: premature EOF in file %s\n", filename); is_okay = FALSE; } else { if ((crPtr = index(buffer, '\n')) != NULL) { *crPtr = '\0'; } if (strlen(buffer) > 0) { oldStringLength = stringLength; stringLength += strlen(buffer); if (oldString != NULL) free(oldString); oldString = newString; newString = calloc(stringLength, sizeof(char)); assert(newString != NULL); strncpy(newString, oldString, oldStringLength); strcat(newString, buffer); } if (strlen(buffer) < BUFSIZE - 1) { readingString = FALSE; } } } *stringFromFile = newString; return(is_okay); } /* * readNextSeq - read the data on the next sequence * * This function reads the data on the next sequence from the * specified stream, and fills in the data in the passed seqNode * struct. * * return value: boolean flag set to TRUE if the data was read * successfully, FALSE if any errors were encountered. */ int readNextSeq( FILE *fp, /* called with: the stream with the sequence * data, open for reading */ char *filename, /* called with: name of the file, used in * error messages */ int seqNumber, /* called with; what sequence we're on, used * in error messages */ seqNode *nextSeq) /* called with: allocated but empty seqNode * structure. * Return with: structure with fields filled in. */ { char buffer[BUFSIZE]; int is_okay = TRUE; int ii; char *spacePtr; char *oldSeq, *newSeq, *nextSubSeq; int seqLength, oldSeqLength; char nextChar; /* * read the label line. Check that it is a label line, and extract * and save the label. */ if (fgets(buffer, BUFSIZE-1, fp) == NULL) { fprintf(stderr, "Error: premature EOF on sequence %d, file %s\n", seqNumber, filename); is_okay = FALSE; } else { if (buffer[0] != LABEL_MARKER) { fprintf(stderr, "Error: alignment file %s is missing sequence label %d\n", filename, seqNumber); is_okay = FALSE; } else { for (ii = 1; isalnum(buffer[ii]); ii++) { nextSeq->label[ii-1] = buffer[ii]; } nextSeq->label[ii-1] = '\0'; } } /* * read the next sequence from the input file. The sequence * will be one or more strings of indefinite length, each terminated * by a carriage return. When the next character in the file is * finally a '>', then we've reached the end of the sequence. */ newSeq = oldSeq = NULL; seqLength = 0; nextChar = fgetc(fp); ungetc(nextChar, fp); while (is_okay && !feof(fp) && nextChar != '>') { if (oldSeq != NULL) free(oldSeq); oldSeq = newSeq; oldSeqLength = seqLength; is_okay = readStringFromFile(fp, filename, &nextSubSeq); if (is_okay) { seqLength += strlen(nextSubSeq); newSeq = calloc(seqLength, sizeof(char)); strncpy(newSeq, oldSeq, oldSeqLength); strcat(newSeq, nextSubSeq); free(nextSubSeq); nextChar = fgetc(fp); if (!feof(fp)) ungetc(nextChar, fp); } } nextSeq->sequence = newSeq; /* * sometimes, a string is delimited with some blanks for readability. * If the string has any spaces in it, remove them. */ while ((spacePtr = index(nextSeq->sequence, ' ')) != NULL) { strcpy(spacePtr, spacePtr+1); } return(is_okay); } /* * readMultipleAlignment - read the multiple alignment data from * the specified file into a linked list structure * * This function reads a FASTA-format multiple alignment from * the specified file into a linked list structure. Additionally, * this module checks that there is a minimum number of aligned * sequences, the number specified by the calling program. * * Return value: flag set to TRUE if the data could be read * successfully, the file was in the expected format, and * there were at least two aligned sequences; * FALSE otherwise. */ int readMultipleAlignment( char *filename, /* called with: name of the file with the * alignment data, used in error messages */ int minNumberAlignedSeqs, /* called with: smallest number of * aligned sequences we could return * with, and still have an alignment * of interest to the caller */ seqNode **seqList, /* return with: a linked list * representation of the sequence * data */ int *numberSequences) /* return with: number of sequences * read */ { FILE *fp; int isOkay = TRUE; seqNode *firstSeq, *lastSeq, *nextSeq; int numberSeqsRead = 0; fp = fopen(filename, "r"); if (fp == NULL) { fprintf(stderr, "Error: could not read alignment file %s\n", filename); isOkay = FALSE; } firstSeq = lastSeq = (seqNode *) NULL; while (isOkay && !feof(fp)) { nextSeq = (seqNode *) malloc(sizeof(seqNode)); assert(nextSeq != NULL); nextSeq->sequence = NULL; nextSeq->next = (seqNode *) NULL; isOkay = readNextSeq(fp, filename, numberSeqsRead+1, nextSeq); /* * now for the bookkeeping: update the linked list, the number * of sequences, and the length of the longest sequence */ if (isOkay) { numberSeqsRead++; if (lastSeq == NULL) { firstSeq = lastSeq = nextSeq; } else { lastSeq->next = nextSeq; lastSeq = lastSeq->next; } } } /* * and now for the cleanup. If the file was opened, close it and * look at how many sequences were read. If this number is less * than the minimum number to make this alignment interesting, * print and error message and set the return value to indicate * an error. */ if (fp != NULL) { fclose(fp); if (numberSeqsRead < minNumberAlignedSeqs) { fprintf(stderr, "Error: alignment %s has only %d sequences, at least %d expected\n", filename, numberSeqsRead, minNumberAlignedSeqs); isOkay = FALSE; } } *seqList = firstSeq; *numberSequences = numberSeqsRead; return(isOkay); } /* * copyFromSeqNodeToAlignedSeq - copy the alignment data from a linked * list of seqNode structures to the fields of the alignedSeq struct. * * Make sure all sequences have same number of match columns * and figure out where they go in the final alignment. * * copy the data from the temporary linked list to the permanent data * structure, and free the nodes of the linked list. * * * return value: boolean value set to TRUE if the operation was successful, * FALSE otherwise. */ int copyFromSeqNodeToAlignedSeq( seqNode *seqList, /* called with: head of the linked list * of seqNode structures */ int numberSequences, /* called with: number of sequences in the * structure */ AlignSummary *thisAlign) /* called with: target data structure, * allocated but with empty alignment fields. * return with: structure with multiple * alignment fields filled in */ { seqNode *lastSeqNode, *nextSeqNode; int ii, jj; int is_okay = TRUE; int row, col; /* counters for sequence and position in sequence */ char *seq; int num_match=0; /* number of match columns in first sequence */ int check_match; /* number of match columns in sequence to * compare to num_match */ int num_before; /* number of insertions immediately before current * position in current row. */ int *before_match; /* before_match[i] is max number of insertions * immediately before match column i. * One extra position for final inserts. */ int *where_match; /* where_match[i] is location for ith match column * (cumulative sum of before_match +i) * One extra position for full alignment width. */ int TotalColumns; /* number of columns needed when insertions * are all padded to make match columns align. */ /* count the number of match columns in the first sequence */ assert(seqList); seq = seqList->sequence; assert(seq); for (col=0; seq[col]; col++) { if (isupper(seq[col]) || seq[col] == '-') num_match++; } /* every sequence must have num_match match columns */ before_match = calloc(num_match+1, sizeof(int)); where_match = calloc(num_match+1, sizeof(int)); /* verify number of match columns * count insertions before each match position for each sequence */ for (nextSeqNode = seqList, row = 0; nextSeqNode != NULL; nextSeqNode = nextSeqNode->next, row++) { seq = nextSeqNode->sequence; assert(seq); check_match=0; num_before=0; for (col=0; seq[col]; col++) { if (islower(seq[col]) || seq[col] == '.') num_before++; else if (isupper(seq[col]) || seq[col] == '-') { if (check_match >= num_match) { fprintf(stderr, "Error: row %d (%s) has more match columns than row 0\n", row, nextSeqNode->label); return FALSE; } if (num_before>before_match[check_match]) before_match[check_match] = num_before; num_before=0; check_match++; } } if (check_match!=num_match) { fprintf(stderr, "Error: row %d (%s) has fewer match columns than row 0\n", row, nextSeqNode->label); return FALSE; } if (num_before>before_match[check_match]) before_match[check_match] = num_before; } /* Now have number of insertions needed before each match column * Add them up to figure out where matches go. */ num_before=0; /* cumulative number before match */ for(col=0; col<=num_match; col++) { where_match[col] = num_before + before_match[col]; num_before = where_match[col] + 1; } /* Note: where_match[num_match] is the total number of columns needed */ TotalColumns = where_match[num_match]; /* * now, allocate the target fields */ thisAlign->label = calloc(numberSequences, sizeof(char *)); assert(thisAlign->label != NULL); thisAlign->sequence = calloc(TotalColumns, sizeof(char *)); assert(thisAlign->sequence != NULL); for (ii = 0; ii < TotalColumns; ii++) { thisAlign->sequence[ii] = calloc(numberSequences, sizeof(char)); assert(thisAlign->sequence[ii] != NULL); } /* * now, go through the nodes of the linked list. At each node, * copy the data into the target struct and free the node. */ for (nextSeqNode = seqList, row = 0; nextSeqNode != NULL; row++) { assert(row < numberSequences); thisAlign->label[row] = strdup(nextSeqNode->label); assert(thisAlign->label[row] != NULL); /* go through sequence filling in position */ col=0; check_match=0; for(seq = nextSeqNode->sequence; *seq; seq++) { assert(col <= TotalColumns); if (islower(*seq) || *seq == '.') { thisAlign->sequence[col++][row] = *seq; } else if (isupper(*seq) || *seq == '-') { while (colsequence[col++][row] = '.'; } thisAlign->sequence[col++][row] = *seq; check_match++; } } while (colsequence[col++][row] = '.'; } /* make sure that the entire sequence properly copied */ assert(col==TotalColumns); assert(check_match == num_match); lastSeqNode = nextSeqNode; nextSeqNode = nextSeqNode->next; free(lastSeqNode->sequence); free(lastSeqNode); } assert(row == numberSequences); thisAlign->columns = TotalColumns; thisAlign->rows = numberSequences; free(before_match); free(where_match); return(is_okay); } /* * getMultipleAlignment - read a multiple alignment into the * alignStruct fields * * This function reads an alignment from the specified input file, * and fills in the relevant fields in the alignSummary * struct. It assumes that the alignment is in FASTA format. * * Return value: boolean flag set to TRUE if the operation was * successful, FALSE otherwise. */ int getMultipleAlignment( char *filename, /* called with: name of the file * containing the alignment data */ int minNumberAlignedSeqs, /* the smallest number of aligned * sequences expected by the caller */ AlignSummary *thisAlign) /* called with: pointer to a struct * already allocated but with no alignment. * return with: struct with the * alignment fields filled in */ { int is_okay = FALSE; int numberSequences = 0; seqNode *seqList = NULL; seqNode *seqPtr, *nextSeqPtr; /* * a linked list makes sequence data easy to read, but hard to * work with afterwards. An array is easy to work with, but hard * to read into in the case of sequence data. So, we will read * the sequence data into a linked list structure, and then * copy the data into an array structure and delete the linked * list. */ is_okay = readMultipleAlignment(filename, minNumberAlignedSeqs, &seqList, &numberSequences); if (is_okay) { is_okay = copyFromSeqNodeToAlignedSeq(seqList, numberSequences, thisAlign); } return(is_okay); } /* * readAlignment - read the alignment, filling in the target and * template fields if specified. * * This function takes as input a filename containing an alignment * and returns with an AlignSummary struct representing the alignment. * If the gotTemplate and gotTarget flags are TRUE, it will * require that the specified target and template sequences exist in * the alignment, and will fill in the target and template fields in * the structure. If the flags are FALSE, it will leave the * target and template structure fields NULL. * * If template and/or target is not specified, the code will check * if either alignment is pairwise and will assign template and/or * target by the sequence names in the pairwise alignment. If neither * template nor target were specified and the alignment is pairwise, * it will choose one at random to be target and one to be template * */ int readAlignment( char *alignFilename, /* called with: file containing the * alignment */ char **templateName, /* called with: pointer to the name of * the template sequence, or to NULL if * the sequence name is not yet known */ char **targetName, /* called with: pointer to the name of * the target sequence, or to NULL if * the sequence name is not yet known. */ int minNumberAlignedSeqs, /* called with: fewest number of sequences * expected in the alignment by the caller */ AlignSummary **align) /* return with: pointer to the * filled-in alignment struct */ { AlignSummary *thisAlign; int is_okay = TRUE; int returnValue; char *baseFilenamePtr; char baseFilename[FILENAME_LENGTH + 1]; char tmpFilename[FILENAME_LENGTH + 1]; char cmdBuffer[COMMAND_LENGTH + 1]; thisAlign = malloc(sizeof(AlignSummary)); assert(thisAlign != NULL); thisAlign->rows = 0; thisAlign->columns = 0; thisAlign->label = NULL; thisAlign->sequence = NULL; strcpy(thisAlign->filename, alignFilename); if (hasSuffix(alignFilename, ".gz")) { /* * If the alignment is gzipped, make a gunzipped copy of it on * /tmp. Identify it with the process ID to make it unique * and the filename of the alignment, to make it traceable * if something blows up. Note that we want the filename but * not the pathname, so strip out any directory specification. */ if (strchr(alignFilename, '/') != NULL) { baseFilenamePtr = strrchr(alignFilename, '/'); baseFilenamePtr++; strcpy(baseFilename, baseFilenamePtr); } else { strcpy(baseFilename, alignFilename); } sprintf(tmpFilename, "/tmp/%s.%d", baseFilename, getpid()); sprintf(cmdBuffer, "gunzip -c %s > %s", alignFilename, tmpFilename); system(cmdBuffer); is_okay = getMultipleAlignment(tmpFilename, minNumberAlignedSeqs, thisAlign); returnValue = unlink(tmpFilename); assert(returnValue == 0); } else { is_okay = getMultipleAlignment(alignFilename, minNumberAlignedSeqs, thisAlign); } if (is_okay) { /* * If the template and/or target names were not specified, * if the alignment is pairwise, then deduce the template * and target names from the sequence names. */ if (*templateName == NULL) { if (thisAlign->rows == 2) { if (*targetName == NULL) { *templateName = strdup(thisAlign->label[0]); } else if (strcasecmp(thisAlign->label[0], *targetName) == 0) { *templateName = strdup(thisAlign->label[1]); } else { *templateName = strdup(thisAlign->label[0]); } } } if (*targetName == NULL) { if (thisAlign->rows == 2) { if (*templateName == NULL) { *targetName = strdup(thisAlign->label[0]); } else if (strcasecmp(thisAlign->label[0], *templateName)==0) { *targetName = strdup(thisAlign->label[1]); } else { *targetName = strdup(thisAlign->label[0]); } } } } if (!is_okay) freeAlignSummary(thisAlign); *align = thisAlign; return(is_okay); } /* * readReferenceCandidate - read the reference and candidate alignments, * make the alignments consistent with each other * if necessary, and return with two aligned * sequence structs. * * This function makes calls to read the reference and candidate * alignments, given their pathnames and optionally given the * template and target names. * * return value: TRUE if the entire operation went okay, FALSE otherwise. */ int readReferenceCandidate( char *referenceA2mName, /* called with: filename of the * reference alignment */ char *candidateA2mName, /* called with: filename of the * candidate alignment */ char **templateName, /* called with: pointer to the name of * the template sequence, or to NULL if * the sequence name is not yet known */ char **targetName, /* called with: pointer to the name of * the target sequence, or to NULL if * the sequence name is not yet known. */ char *workingCandidateFileName, /* called with: name of a file for * printing out the working version of * the candidate alignment, NULL if * no owrking version is requested */ int minNumberAlignedSeqs, /* called with: the fewest aligned * sequences that would still make this * alignment interesting to the caller */ AlignSummary **reference, /* return with: reference alignment data */ AlignSummary **candidate) /* return with: candidate alignment data */ { int is_okay; is_okay = readAlignment(referenceA2mName, templateName, targetName, minNumberAlignedSeqs, reference); if (is_okay) { is_okay = readAlignment(candidateA2mName, templateName, targetName, minNumberAlignedSeqs, candidate); /* * If we're claiming to have a target and / or template * sequence in both alignments, make sure that the sequence * truly exists in both alignments. */ if (is_okay && *templateName != NULL) { if (findSequenceIndex(*templateName, *reference) == NOT_FOUND) { fprintf(stderr, "Error: sequence %s not found in alignment %s\n", *templateName, (*reference)->filename); is_okay = FALSE; } if (findSequenceIndex(*templateName, *candidate) == NOT_FOUND) { fprintf(stderr, "Error: sequence %s not found in alignment %s\n", *templateName, (*candidate)->filename); is_okay = FALSE; } } if (is_okay && *targetName != NULL) { if (findSequenceIndex(*targetName, *reference) == NOT_FOUND) { fprintf(stderr, "Error: sequence %s not found in alignment %s\n", *targetName, (*reference)->filename); is_okay = FALSE; } if (findSequenceIndex(*targetName, *candidate) == NOT_FOUND) { fprintf(stderr, "Error: sequence %s not found in alignment %s\n", *targetName, (*candidate)->filename); is_okay = FALSE; } } /* * check that the two versions of the target and template * sequences are identical, and resolve any differences. * but first, make sure we know what the target and template * sequences are. If we don't, we need to go yell at the * user about the arguments specified. */ if (is_okay && *targetName != NULL && *templateName != NULL) { if (!sequencesIdentical(*reference, *templateName, *candidate, *templateName)) { alignSequencesReviseAlignments(*reference, *candidate, *templateName, *templateName); } if (!sequencesIdentical(*reference, *targetName, *candidate, *targetName)) { alignSequencesReviseAlignments(*reference, *candidate, *targetName, *targetName); } /* * At this point, we have modified the candidate alignment * to resolve any differences between the two copies of the * template and target sequences. If the caller has * requested a dump of the working version of the candidate * alignment, here's the spot. */ if (workingCandidateFileName != NULL) { printAlign(*candidate, workingCandidateFileName); } } } #ifdef DEBUG if (is_okay) { fprintf(stderr, "target=%s, template=%s\n", *targetName, *templateName); fprintf(stderr, "reference alignment has %d columns (%d match)\n", (*reference)->columns, count_match(*reference)); fprintf(stderr, "candidate alignment has %d columns (%d match)\n", (*candidate)->columns, count_match(*candidate)); } #endif return(is_okay); } /* * printAlign - print out the alignment in an AlignSummary struct, * in FASTA format. */ void printAlign( AlignSummary *thisAlign, /* called with: alignment in question */ char *a2mName) /* called with: filename for the alignment * file */ { FILE *fp; int ii,jj; fp = fopen(a2mName, "w"); if (fp == NULL) { fprintf(stderr, "Error: cannot open %s for writing!", a2mName); } else { for (ii = 0; ii < thisAlign->rows; ii++) { fprintf(fp, ">%s\n", thisAlign->label[ii]); for (jj = 0; jj < thisAlign->columns; jj++) { fputc(thisAlign->sequence[jj][ii], fp); if ((jj + 1) % RESIDUES_PER_LINE == 0) { fputc('\n', fp); } } fprintf(fp, "\n"); } fclose(fp); } }