/*
 * alignsummary.h - contains typedefs and prototypes
 *
 * This module contains functions to create, delete, and process the
 * alignSummary structure.  The global entry points to this module are
 * as follows:
 *      alignedSequenceStruct
 *      alignSequencesReviseAlignments
 *      findSequenceIndex
 *      freeAlignedSequence
 *      freeAlignSummary
 *      printAlign
 *      readAlignment
 *      readReferenceCandidate
 *      readStringFromFile
 *      removeAlignmentColumn
 *      restoreAlignmentColumn
 *      sequencesIdentical
 *
 * Modifications:
 *   12/9/97  cline  author, moved from measure_shift.c
 *   2/5/98          added readReferenceCandidate
 *   2/11/98         added removeAlignmentColumn
 *                   added restoreAlignmentColumn
 *   2/16/98         replaced getTemplate and getTarget with
 *                   alignedSequenceStruct
 *   3/10/98  cline  moved printFinalA2m from a2mtrim, renamed as
 *                   printAlign.
 *   4/20/98  cline  Revised readAlignment and readReferenceCandidate
 *                   to reflect the new handling of the command line
 *                   parameters.
 *   4/26/98  cline  moved prototype for printAlign from alignsummary.c
 *   6/5/98   cline  Removed the unused isTemplate argument from
 *                   alignSequencesReviseAlignments.
 *   6/9/98   cline  added function readStringFromFile
 */

/* Include guard: the typedefs below must not be seen twice in one
 * translation unit. */
#ifndef ALIGNSUMMARY_H
#define ALIGNSUMMARY_H

/* FILE is used by the readStringFromFile prototype, so pull it in here
 * to keep this header self-contained. */
#include <stdio.h>

/*
 * #DEFINES
 */

/* Only define the boolean constants if nobody else already has. */
#ifndef TRUE
#define TRUE 1
#endif

#ifndef FALSE
#define FALSE 0
#endif

/* space set aside for a system command */
#define COMMAND_LENGTH 256

/* maximum expected length of a SAM filename */
#define FILENAME_LENGTH 300 /* 5 March 1998, changed from 80 to 300,
                             * Kevin Karplus */

/* conservatively defines the longest sequence label we're expecting */
#define MAX_LABEL_LENGTH 200 /* 5 March 1998, changed from 40 to 200,
                              * Kevin Karplus */

/* Value that indicates the specified label was not found in the alignment */
#define NOT_FOUND (-1)

/* when looking at what residue aligns where, this value denotes nowhere */
#define NOT_ALIGNED (-1)

/* for a column in an aligned sequence, denotes no residue aligned there */
#define NOTHING_ALIGNED (-2)

/* the maximum number of residues per line for the output a2m file */
#define RESIDUES_PER_LINE 50

/*
 * TYPEDEFS
 */

/*
 * this structure has all the gory details regarding the alignment of a
 * single sequence to the alignment.
 */
typedef struct {
    int index;            /* indexes the specified sequence in the
                           * multiple alignment array */
    int *seq_to_column;   /* for each position ii in the sequence,
                           * seq_to_column[ii] tells what alignment
                           * column (if any) it's in. */
    int *column_to_seq;   /* for each column jj in the alignment,
                           * column_to_seq[jj] tells us what
                           * residue in the sequence (if any) is
                           * aligned there */
    int sequence_length;  /* gives the length of the raw sequence,
                           * as opposed to the length of the
                           * unaligned sequence.
                           * NOTE(review): "unaligned" is presumably
                           * meant to read "aligned" - confirm against
                           * alignsummary.c */
} alignedSequence;

/*
 * This structure contains all the data we need for one alignment
 * of reference to template sequence
 */
typedef struct {
    char filename[FILENAME_LENGTH]; /* name of the file that this
                                     * alignment was read from */
    int columns;                    /* number of columns in the
                                     * alignment */
    int rows;                       /* number of rows (sequences)
                                     * in the alignment */
    char **label;                   /* label[x] contains the label
                                     * of aligned sequence # x */
    char **sequence;                /* sequence[x] contains the alignment
                                     * data for column x.  sequence[x][y]
                                     * contains the alignment character
                                     * in column x, sequence y. */
} AlignSummary;

/*
 * PROTOTYPES
 */

/*
 * alignedSequenceStruct - get all the gory information relating
 *                         to the specified aligned sequence
 *                         in the alignment.
 *
 * This function allocates and fills in the alignedSequence
 * struct for the specified sequence in the specified alignment.
 * If the sequence is not found, it fills in the index as NOT_ALIGNED
 * and returns the otherwise empty structure.
 *
 * Return value - a pointer to the structure if all the mallocs and
 *                everything went okay, else NULL.
 */
alignedSequence *alignedSequenceStruct(
    AlignSummary *alignment,  /* called with: the alignment in question */
    char *sequenceName);      /* called with: name of the sequence
                               * in question */

/*
 * alignSequencesReviseAlignments - align the two copies of the one sequence
 *                     to each other, and insert any missing pieces into
 *                     the alignment.
 *
 * This function goes into the two alignments and looks at the two versions
 * of the specified sequence.  If the two versions are different, it creates
 * an alignment of the two sequences.  Then, using this alignment, it
 * identifies any regions missing from one of the input alignments.  It adds
 * these regions to the input alignment as additional columns: the sequence
 * specified matches these columns, all others have a delete in that position.
 */
void alignSequencesReviseAlignments(
    AlignSummary *align1,  /* called with: one of the two alignments */
    AlignSummary *align2,  /* called with: the other of the two alignments */
    char *structName1,     /* called with: name in the first alignment
                            * of the sequence whose alignment is being
                            * checked */
    char *structName2);    /* called with: name in the second alignment
                            * of the sequence whose alignment is being
                            * checked */

/*
 * findSequenceIndex - find the index of the specified sequence
 *                     in the alignment.
 *
 * This function looks through the labels of the specified alignment
 * and looks for the specified sequence label.  If it finds it, it
 * returns its row index.  Otherwise, it returns NOT_FOUND.
 */
int findSequenceIndex(
    char *target_label,    /* called with: the label we're
                            * looking for */
    AlignSummary *align);  /* called with: where we're
                            * looking for it */

/*
 * freeAlignedSequence - free an aligned sequence struct
 */
void freeAlignedSequence(
    alignedSequence *sequenceStruct);  /* called with: a pointer
                                        * to an allocated
                                        * alignedSequence struct. */

/*
 * freeAlignSummary - free the align summary structure and its
 *                    substructures.
 */
void freeAlignSummary(
    AlignSummary *alignData);  /* called with: structure to be freed */

/*
 * printAlign - print out the alignment in an AlignSummary struct,
 *              in FASTA format.
 */
void printAlign(
    AlignSummary *thisAlign,  /* called with: alignment in question */
    char *a2mName);           /* called with: filename for the alignment
                               * file */

/*
 * readAlignment - read the alignment, filling in the target and
 *                 template fields if specified.
 *
 * This function takes as input a filename containing an alignment
 * and returns with an AlignSummary struct representing the alignment.
 * If the gotTemplate and gotTarget flags are TRUE, it will
 * require that the specified target and template sequences exist in
 * the alignment, and will fill in the target and template fields in
 * the structure.  If the flags are FALSE, it will leave the
 * target and template structure fields NULL.
 *
 * If template and/or target is not specified, the code will check
 * if either alignment is pairwise and will assign template and/or
 * target by the sequence names in the pairwise alignment.  If neither
 * template nor target were specified and the alignment is pairwise,
 * it will choose one at random to be target and one to be template
 *
 * NOTE(review): the gotTemplate/gotTarget flags and the target/template
 * structure fields described above appear neither in this prototype nor
 * in AlignSummary; presumably "specified" now means *templateName /
 * *targetName is non-NULL (see the 4/20/98 revision) - confirm against
 * alignsummary.c.
 * NOTE(review): the return value is not documented here; presumably
 * TRUE on success and FALSE on failure, like readReferenceCandidate -
 * confirm against alignsummary.c.
 */
int readAlignment(
    char *alignFilename,       /* called with: file containing the
                                * alignment */
    char **templateName,       /* called with: pointer to the name of
                                * the template sequence, or to NULL if
                                * the sequence name is not yet known */
    char **targetName,         /* called with: pointer to the name of
                                * the target sequence, or to NULL if
                                * the sequence name is not yet known. */
    int minNumberAlignedSeqs,  /* called with: fewest number of sequences
                                * expected in the alignment by the caller */
    AlignSummary **align);     /* return with: pointer to the
                                * filled-in alignment struct */

/*
 * readStringFromFile - read from the specified file a string of
 *                      unknown length, allocating memory as you go.
 *
 * This function reads and allocates memory for a string terminated by
 * the carriage return character.
 *
 * return value: TRUE if the read went okay, FALSE if there was an
 *               I/O or memory error.
 */
int readStringFromFile(
    FILE *fp,                /* called with: the input file, open for
                              * reading, and advanced to the beginning
                              * of the string. */
    char *filename,          /* name of the file, for error reporting
                              * purposes */
    char **stringFromFile);  /* return with: the string as read
                              * from the file */

/*
 * readReferenceCandidate - read the reference and candidate alignments,
 *                          make the alignments consistent with each other
 *                          if necessary, and return with two aligned
 *                          sequence structs.
 *
 * This function makes calls to read the reference and candidate
 * alignments, given their pathnames and optionally given the
 * template and target names.
 *
 * return value: TRUE if the entire operation went okay, FALSE otherwise.
 */
int readReferenceCandidate(
    char *referenceA2mName,          /* called with: filename of the
                                      * reference alignment */
    char *candidateA2mName,          /* called with: filename of the
                                      * candidate alignment */
    char **templateName,             /* called with: pointer to the name of
                                      * the template sequence, or to NULL if
                                      * the sequence name is not yet known */
    char **targetName,               /* called with: pointer to the name of
                                      * the target sequence, or to NULL if
                                      * the sequence name is not yet known. */
    char *workingCandidateFileName,  /* called with: name of a file for
                                      * printing out the working version of
                                      * the candidate alignment, NULL if
                                      * no working version is requested */
    int minNumberAlignedSeqs,        /* called with: the fewest aligned
                                      * sequences that would still make this
                                      * alignment interesting to the caller */
    AlignSummary **reference,        /* return with: reference alignment
                                      * data */
    AlignSummary **candidate);       /* return with: candidate alignment
                                      * data */

/*
 * removeAlignmentColumn - remove a column from the specified alignment
 *
 * This function "removes" a column by converting it to an insert.  All
 * aligned residues in the column are converted to inserts.
 */
void removeAlignmentColumn(
    AlignSummary *thisAlign,  /* called with: structure detailing the
                               * alignment to update */
    int column);              /* called with: column index for the
                               * column to delete */

/*
 * restoreAlignmentColumn - restore a column that was deleted previously
 *
 * This function "restores" a column that had been deleted previously
 * by converting it from an indel column to a match column.
 */
void restoreAlignmentColumn(
    AlignSummary *thisAlign,  /* called with: structure detailing the
                               * alignment to update */
    int column);              /* called with: column index for the
                               * column to restore */

/*
 * sequencesIdentical - check if two sequences are the same in the two
 *                      alignments.
 *
 * This function checks to see whether or not the two versions of the
 * sequence from the two alignments are the same.  It checks all
 * amino acids, and if the amino acid sequence is different, it returns
 * FALSE.  If the amino acid sequence is the same, it returns TRUE.  It
 * doesn't matter if the characters are upper or lowercase, or
 * interspersed with inserts and so forth.
 */
int sequencesIdentical(
    AlignSummary *align1,  /* called with: first alignment */
    char *name1,           /* called with: name of the
                            * sequence of interest */
    AlignSummary *align2,  /* called with: second alignment */
    char *name2);          /* called with: name of the
                            * sequence of interest */

#endif /* ALIGNSUMMARY_H */