/* compute_shift - measure the shift in a structure-structure alignment * relative to the FSSP structure-structure alignment * for the same structures. * * This module looks at a structure-structure alignment, the * corresponding FSSP struct-struct alignment, and computes how much * the first alignment is shifted in each position relative to the * fssp alignment. Then, it computes a shift score relating to * their differences. * * Update history * 2/11/98 cline author, actually started recording updates. * 2/24/98 cline added -briefcaption switch * 3/5/98 karplus Added extra test to make sure templateColumn or * targetColumn non-negative before use as subscript. * Added min and max shift to verbose report with mean * 3/6/98 karplus Added shiftSummary so that min/max/mean can * be returned from trimAligment. * Added printRDBresults. * 3/7/98 stu fixed bug in trim_alignment so that the var done * would be set to TRUE whenever the shift score * stopped improving * 3/8/98 karplus Removed uppercasing of template and target names. * 3/9/98 karplus Added num_pairs_ref, num_pairs_cand, num_pairs_opt * to shiftSummary * 3/10/98 karplus Added positionsCounted to shiftDetail, and * pos_counted_cand and pos_counted_opt to * shiftSummary. * 4/20/98 cline moved the function printUsage from * measure_shift.c, a2mtrim.c, and shift_figure.c. * It's time to have one all-purpose function. * 4/20/98 cline Added code for printing out detailed shift info * 4/26/98 cline Added support for the -workingcandidate switch. * 4/27/98 cline Added support for the -matchonly switch. */ #include #include #include #include #include "alignsummary.h" #include "compute_shift.h" /* WARNING this def of min and max will compute x or y twice! */ #define min(x,y) ((x)<(y)? (x): (y)) #define max(x,y) ((x)>(y)? (x): (y)) /* * PROTOTYPES */ int computeShiftInfo( AlignSummary *reference, /* called with: details on the * reference alignment */ AlignSummary *candidate, /* called with: details on the * candidate alignment */ char *template, /* called with: name of the * template sequence */ char *target, /* called with: name of the * target sequence */ shiftDetail **shiftData); /* called with: pointer to an * allocated shift detail struct. * return with: struct with fields * filled in reflecting the current * alignments */ float computeShiftInfoStats( shiftDetail *shiftInfo, /* called with: structure detailing * the shift between the two alignments */ char *templateName, /* called with: name of the template * sequence */ char *targetName, /* called with: name of the target sequence */ float epsilon, /* called with: parameter value for the * shift score */ int verbose); /* called with: flag indicating if output * should be seen */ int findRemoveWorstShift( AlignSummary *reference, /* called with: structure detailing * the reference alignment */ AlignSummary *candidate, /* called with: structure detailing * the candidate alignment. * return with: structure detailing * the revised alignment */ int verbose, /* called with: flag indicating * if output should be echoed * back to the user */ int *shiftRemoved, /* called with: the larger magnitudes * of the shift removed */ shiftDetail *targetToTemplate, /* called with: structure describing * shift of the target sequence * relative to template */ shiftDetail *templateToTarget); /* called with: structure describing * shift of the template sequence * relative to target */ int findWorstShift( shiftDetail *shiftInfo); /* called with: struct detailing the * shift per position for some alignment */ void recordShiftColumn( AlignSummary *reference, /* called with: the reference alignment */ AlignSummary *candidate, /* called with: the candidate alignment */ alignedSequence *rTemplate, /* called with: template sequence as * aligned in the reference alignment */ alignedSequence *cTemplate, /* called with: template sequence as * aligned in the candidate alignment */ alignedSequence *rTarget, /* called with: target sequence as * aligned in the reference alignment */ alignedSequence *cTarget, /* called with: target sequence as * aligned in the candidate alignment */ shiftDetail *shiftInfo); /* called with: shiftDetail struct * fully allocated. * Return with: struct with the per * position fields filled in */ void zeroCmdlineInfo( cmdlineInfo *parameters); /* called with: pointer to an * empty, uninitialized struct. * Return with: all fields in * the structure initialized. */ /* * CODE */ /* * zeroCmdlineInfo - zero the contents of the command line info struct. */ void zeroCmdlineInfo( cmdlineInfo *parameters) /* called with: pointer to an * empty, uninitialized struct. * Return with: all fields in * the structure initialized. */ { parameters->referenceA2mName = parameters->candidateA2mName = NULL; parameters->templateName = parameters->targetName = NULL; parameters->RDBFilename = parameters->OutputFileName = NULL; parameters->shiftDetailFileName = NULL; parameters->workingCandidateFileName = NULL; parameters->briefcaption = FALSE; parameters->dumpRelativeMagnitudes = FALSE; parameters->color = FALSE; parameters->matchOnly = FALSE; parameters->verbose = DEFAULT_VERBOSE; parameters->epsilon = DEFAULT_EPSILON; } /* * parseCmdline - get the argument valus from the command line. * * This function takes as input the command line arguments in the * argc and argv variables. It looks for the arguments, parses * the argument value, checks to see if all required arguments * were given, and calls checkUsage if no arguments were given. * * return value: TRUE if all the required arguments were present, * FALSE otherwise. */ int parseCmdline( int argc, /* called with: number of command * line arguments */ char *argv[], /* called with: array containing the * command line arguments */ void additional_arglist(), /* called with: a function to list any * command line arguments or switches * particular to the calling program */ void additional_arginfo(), /* called with; a function to describe * those command line arguments or switches */ int checkCmdline(cmdlineInfo *paramters), /* called with: a function to check * whether all the required arguments * are given */ cmdlineInfo *parameters) /* return with: pointer to a structure * detailing all the relevant information * from the command line. * called with: pointer to an empty struct. */ { int ii, jj; int is_okay = FALSE; int haveReference, haveCandidate; if (argc == 1) { printUsage(argv, additional_arglist, additional_arginfo); } else { zeroCmdlineInfo(parameters); for (ii = 1; ii < argc; ii++) { if (strlen(argv[ii]) > 1) { if (ii < argc-1 && strncasecmp(REFERENCE_STRING, argv[ii], strlen(argv[ii])) == 0) { parameters->referenceA2mName = strdup(argv[ii+1]); } else if (ii < argc-1 && strncasecmp(CANDIDATE_STRING, argv[ii], strlen(argv[ii])) == 0) { parameters->candidateA2mName = strdup(argv[ii+1]); } else if (ii < argc-1 && strncasecmp(TARGET_STRING, argv[ii], strlen(argv[ii])) == 0) { parameters->targetName = strdup(argv[ii+1]); } else if (ii < argc-1 && strncasecmp(TEMPLATE_STRING, argv[ii], strlen(argv[ii])) == 0) { parameters->templateName = strdup(argv[ii+1]); } else if (ii < argc-1 && strncasecmp(OUTPUTFILE_STRING, argv[ii], strlen(argv[ii])) == 0) { parameters->OutputFileName = strdup(argv[ii+1]); } else if (ii < argc-1 && strncasecmp(RDB_STRING, argv[ii], strlen(argv[ii])) == 0) { parameters->RDBFilename = strdup(argv[ii+1]); } else if (ii < argc-1 && strncasecmp(DETAILS_STRING, argv[ii], strlen(argv[ii])) == 0) { parameters->shiftDetailFileName = strdup(argv[ii+1]); } else if (ii < argc-1 && strncasecmp(WORKINGCANDIDATE_STRING, argv[ii], strlen(argv[ii])) == 0) { parameters->workingCandidateFileName = strdup(argv[ii+1]); } else if (strncasecmp(EPSILON_STRING, argv[ii], strlen(argv[ii])) == 0) { parameters->epsilon = atof(argv[ii+1]); } else if (strncasecmp(VERBOSE_STRING, argv[ii], strlen(argv[ii])) == 0) { parameters->verbose = TRUE; } else if (strncasecmp(COLOR_STRING, argv[ii], strlen(argv[ii])) == 0) { parameters->color = TRUE; } else if (strncasecmp(BRIEFCAPTION_STRING, argv[ii], strlen(argv[ii])) == 0) { parameters->briefcaption = TRUE; } else if (strncasecmp(RELATIVEMAGNITUDE_STRING, argv[ii], strlen(argv[ii])) == 0) { parameters->dumpRelativeMagnitudes = TRUE; } else if (strncasecmp(MATCHONLY_STRING, argv[ii], strlen(argv[ii])) == 0) { parameters->matchOnly = TRUE; } } } if ((*checkCmdline)(parameters)) { is_okay = TRUE; } } return(is_okay); } /* * printUsage - print a usage summary */ void printUsage( char *argv[], /* called with: command line tokens */ void additional_arglist(void), /* called with: a function to list * any arguments particular to this main */ void additional_arginfo(void)) /* called with: a function to describe * any arguments particular to this main */ { char ch; printf( "Usage: %s <%s CandidateA2m> <%s ReferenceA2m> \\ \n", argv[0], CANDIDATE_STRING, REFERENCE_STRING); printf("\t[%s TargetName] [%s TemplateName] [%s Epsilon] \\ \n", TARGET_STRING, TEMPLATE_STRING, EPSILON_STRING); printf("\t[%s OutputFile] [%s OutputFile]", RDB_STRING, DETAILS_STRING); if (additional_arglist != NULL) (*additional_arglist)(); printf("\t[%s] [%s]\n", VERBOSE_STRING, RELATIVEMAGNITUDE_STRING); printf("Would you like more detail\n"); ch = getchar(); if (toupper(ch) == 'Y') { printf("where:\tReferenceA2m is the filename of the reference\n"); printf("\t\talignment, the one that is shown in the shift figure.\n"); printf("\t\tThis alignment is usually produced by a structural\n"); printf("\t\talignment program\n"); printf("\tCandidateA2m is the filename of the candidate\n"); printf("\t\talignment, the one reflected in the shift lines.\n"); printf("\t\tThis alignment is usually a predicted alignment\n"); printf("\tTemplateName is the name of the template sequence,\n"); printf("\t\tthe one relative to which shift is measured.\n"); printf("\tTargetName is the name of the target sequence, the one\n"); printf("\t\tfor which the shift is measured.\n"); printf("\tEpsilon is the value of the epsilon parameter (default %f)\n", DEFAULT_EPSILON); printf("\t%s specifies an optional filename for RDB format summary\n", RDB_STRING); printf("\t%s turns on verbose output\n", VERBOSE_STRING); printf("\t%s specifies an output file for a detailed dump of the \n", DETAILS_STRING); printf("\t\tper-position shift information. That switch can be\n"); printf("\t\taugmented by %s, which causes the shift data \n", RELATIVEMAGNITUDE_STRING); printf("\t\tdumped to heflect the magnitude of the shift relative to\n"); printf("\t\tthe smallest shift removed to acheive an optimal subalign.\n"); if (additional_arginfo != NULL) (*additional_arginfo)(); printf("\n"); printf("\tNote that if either alignment is pairwise, the TargetName\n"); printf("\tand TemplateName arguments are optional and will be\n"); printf("\tinferred from the alignment. The first sequence in the\n"); printf("\tpairwise alignment will become the template sequence, and\n"); printf("\tthe second will become the target sequence. If both\n"); printf("\talignments are pairwise, the candidate alignment is.\n"); printf("\tlooked at first\n"); } return; } /* * printNoTemplateError - print a message saying that no template * sequence was specified * * In many programs, the template sequence switch is optional, but a * template sequence must be specified somehow. If either alingment * is pairwise, the program will assign the template sequence to be * the first sequence in the pairwise alignment, looking at the * reference alignment first. If neither alignment is pairwise, * the template switch must be used. If both alignments had more * than two sequences and no teplate switch was given, this program * is called to tell the user not to do it again. */ void printNoTemplateError() { fprintf(stderr, "Error: the template sequence was not specified, and could\n"); fprintf(stderr, "not be deduced from the data.\n"); return; } /* * printNoTargetError - print a message saying that no target * sequence was specified * * In many programs, the target sequence switch is optional, but a * target sequence must be specified somehow. If either alingment is * pairwise, the program will assign the target sequence to be the * second sequence in the pairwise alignment, checking the reference * alignment first. If neither alignment is pairwise, the target * switch must be used. If both alignments had more than two sequences * and no target switch was given, this program is called to tell the * user not to do it again. */ void printNoTargetError() { fprintf(stderr, "Error: the target sequence was not specified, and could not be\n"); fprintf(stderr, "deduced from the data.\n"); return; } /* * recordShiftColumn - fill in the column array of the * shift detail structure. * * This function takes as input an allocated shiftDetail structure, * two alignments, and representations of the target and template * sequences within each alignment. For each alignment column, * it records the residues, if there is a shift, and how many residues * and column positions in the shift. */ void recordShiftColumn( AlignSummary *reference, /* called with: the reference alignment */ AlignSummary *candidate, /* called with: the candidate alignment */ alignedSequence *rTemplate, /* called with: template sequence as * aligned in the reference alignment */ alignedSequence *cTemplate, /* called with: template sequence as * aligned in the candidate alignment */ alignedSequence *rTarget, /* called with: target sequence as * aligned in the reference alignment */ alignedSequence *cTarget, /* called with: target sequence as * aligned in the candidate alignment */ shiftDetail *shiftInfo) /* called with: shiftDetail struct * fully allocated. * Return with: struct with the * column array filled in */ { int ii; char ch1, ch2; char chtarg, chtargcand, chtempl, chtemplcand; int templateSeqPosition, targetSeqPosition; int cTemplateCol, cTargetCol; int templateColSeqPosition; /* * first, record the residues from the candidate alignment, and * initialize all positions to no data available. */ shiftInfo->column = calloc(candidate->columns, sizeof(shiftPerColumn)); assert(shiftInfo->column != NULL); for (ii = 0; ii < candidate->columns; ii++) { shiftInfo->column[ii].haveValue = FALSE; shiftInfo->column[ii].templateChar = candidate->sequence[ii][cTemplate->index]; shiftInfo->column[ii].targetChar = candidate->sequence[ii][cTarget->index]; } /* * Now, record the shift data for each position where there is shift * data. Shift values are determined by the following steps: * For each pair of residues (temp,targ) aligned in candidate * alignment: * if targ is aligned to some template sequence residue (temp') * in the reference alignment: * shift(targ) = temp - temp' * So, that shift value is recorded for the target residue in the * reference alignment. */ #ifdef OLDCODE for (ii = 0; ii < reference->columns; ii++) { if (isupper(reference->sequence[ii][rTemplate->index]) && isupper(reference->sequence[ii][rTarget->index])) { /* * found an aligned pair. Note their positions in template * and target sequences, respectively. */ templateSeqPosition = rTemplate->column_to_seq[ii]; targetSeqPosition = rTarget->column_to_seq[ii]; /* * check to see if the target residue is aligned to some * template sequence residue in the reference alignment. */ cTargetCol = cTarget->seq_to_column[targetSeqPosition]; if (isupper( candidate->sequence[cTargetCol][cTemplate->index]) && isupper(candidate->sequence[cTargetCol][cTarget->index])) { /* * the shift is the difference in template sequence * positions between the template residue and the residue * aligned to the target residue. */ templateColSeqPosition = cTemplate->column_to_seq[cTargetCol]; cTemplateCol = cTemplate->seq_to_column[templateSeqPosition]; shiftInfo->column[cTargetCol].residuesShifted = templateSeqPosition - templateColSeqPosition; shiftInfo->column[cTargetCol].columnsShifted = cTemplateCol - cTargetCol; shiftInfo->column[cTargetCol].haveValue = TRUE; ch1 = reference->sequence[ii][rTemplate->index]; ch2 = candidate->sequence[cTemplateCol][cTemplate->index]; assert(toupper(ch1) == toupper(ch2)); ch1 = reference->sequence[ii][rTarget->index]; ch2 = candidate->sequence[cTargetCol][cTarget->index]; assert(toupper(ch1) == toupper(ch2)); } } } #else templateSeqPosition=templateColSeqPosition=cTemplateCol = -1; for (ii = 0; ii < reference->columns; ii++) { chtarg=reference->sequence[ii][rTarget->index]; if (!isalpha(chtarg)) { /* nowhere to store any information */ continue; } targetSeqPosition = rTarget->column_to_seq[ii]; cTargetCol = cTarget->seq_to_column[targetSeqPosition]; chtargcand = candidate->sequence[cTargetCol][cTarget->index]; assert(toupper(chtarg) == toupper(chtargcand)); shiftInfo->column[cTargetCol].reftargetChar = chtarg; chtempl=reference->sequence[ii][rTemplate->index]; shiftInfo->column[cTargetCol].reftemplateChar = chtempl; if (isupper(chtempl)) { templateSeqPosition = rTemplate->column_to_seq[ii]; cTemplateCol = cTemplate->seq_to_column[templateSeqPosition]; chtemplcand = candidate->sequence[cTemplateCol][cTemplate->index]; assert(toupper(chtempl) == toupper(chtemplcand)); shiftInfo->column[cTargetCol].columnsShifted = cTemplateCol - cTargetCol; } else { shiftInfo->column[cTargetCol].columnsShifted = -99999; } if (isupper(chtempl) && isupper(candidate->sequence[cTargetCol][cTemplate->index])) { templateColSeqPosition = cTemplate->column_to_seq[cTargetCol]; shiftInfo->column[cTargetCol].residuesShifted = templateSeqPosition - templateColSeqPosition; } else { shiftInfo->column[cTargetCol].residuesShifted = -99999; } shiftInfo->column[cTargetCol].haveValue = (isupper(chtarg) && isupper(chtempl) && isupper(chtargcand) && isupper(candidate->sequence[cTargetCol][cTemplate->index])); } #endif } /* * computeShiftInfo - compute the shift at each position * * for each position in the candidate alignment, either compute how far * the target sequence residue has drifted from the template sequence * residue to which it's aligned in the reference alignment, or specify * that there's no shift data for that alignment position. * * Return value: a flag specifying if the operation went okay. */ int computeShiftInfo( AlignSummary *reference, /* called with: details on the * reference alignment */ AlignSummary *candidate, /* called with: details on the * candidate alignment */ char *template, /* called with: name of the * template sequence */ char *target, /* called with: name of the * target sequence */ shiftDetail **shiftData) /* called with: pointer to an * allocated shift detail struct. * return with: struct with fields * filled in reflecting the current * alignments */ { shiftDetail *shiftInfo = NULL; alignedSequence *referenceTarget, *referenceTemplate; alignedSequence *candidateTarget, *candidateTemplate; int ii; int is_okay = TRUE; referenceTemplate = alignedSequenceStruct(reference, template); if (referenceTemplate == NULL) is_okay = FALSE; referenceTarget = alignedSequenceStruct(reference, target); if (referenceTarget == NULL) is_okay = FALSE; candidateTemplate = alignedSequenceStruct(candidate, template); if (candidateTemplate == NULL) is_okay = FALSE; candidateTarget = alignedSequenceStruct(candidate, target); if (candidateTarget == NULL) is_okay = FALSE; if (is_okay) { shiftInfo = malloc(sizeof(shiftDetail)); assert(shiftInfo != NULL); shiftInfo->numberColumns = candidate->columns; shiftInfo->template = strdup(template); shiftInfo->target = strdup(target); shiftInfo->referenceFilename = strdup(reference->filename); shiftInfo->candidateFilename = strdup(candidate->filename); recordShiftColumn(reference, candidate, referenceTemplate, candidateTemplate, referenceTarget, candidateTarget, shiftInfo); /* * Finally, record the number of columns aligned in each alignment. */ shiftInfo->alignedPairsReference = 0; for (ii = 0; ii columns; ii++) { if (isupper(reference->sequence[ii][referenceTemplate->index]) && isupper(reference->sequence[ii][referenceTarget->index])) { shiftInfo->alignedPairsReference++; } } shiftInfo->alignedPairsCandidate = 0; for (ii = 0; ii < candidate->columns; ii++) { if (isupper(candidate->sequence[ii][candidateTemplate->index]) && isupper(candidate->sequence[ii][candidateTarget->index])) { shiftInfo->alignedPairsCandidate++; } } } if (referenceTemplate != NULL) freeAlignedSequence(referenceTemplate); if (candidateTemplate != NULL) freeAlignedSequence(candidateTemplate); if (referenceTarget != NULL) freeAlignedSequence(referenceTarget); if (candidateTarget != NULL) freeAlignedSequence(candidateTarget); *shiftData = shiftInfo; return(is_okay); } /* * computeShiftInfoStats - compute statistics on the shift data * * This function takes the shift data and computes and prints * shift-related statistics. It computes and prints mean shift. * It also computes half of the normalized shift statistic: the half * that relates to this particular assignment of template and target. * It returns this half of the normalized shift statistic. */ float computeShiftInfoStats( shiftDetail *shiftInfo, /* called with: structure detailing * the shift between the two alignments */ char *templateName, /* called with: name of the template * sequence */ char *targetName, /* called with: name of the target sequence */ float epsilon, /* called with: parameter value for the * shift score */ int verbose) /* called with: flag indicating if output * should be seen */ { int ii; float sum; float this_term; int positionsCounted; int shift; int minShift, maxShift; float meanShift; float normalizedShiftMeasure; if (verbose) { printf("Template sequence %s, target %s:\n", templateName, targetName); } /* * First, compute the interesting statistic */ if (shiftInfo->alignedPairsReference == 0 && shiftInfo->alignedPairsCandidate == 0) { if (verbose) { printf( "No positions available - normalized shift measure undefined\n"); } } else { sum = 0.0; for (ii = 0; ii < shiftInfo->numberColumns; ii++) { if (shiftInfo->column[ii].haveValue) { shift = abs(shiftInfo->column[ii].residuesShifted); this_term = (1.0 + epsilon) / (1.0 + (float) shift) - epsilon; sum += this_term; } } normalizedShiftMeasure = sum /((float) shiftInfo->alignedPairsReference + shiftInfo->alignedPairsCandidate); if (verbose) { printf("Portion of shift measure = %f\n", normalizedShiftMeasure); } } /* * Next, compute the easy one - mean shift. */ sum = 0.0; positionsCounted = 0; shiftInfo->minShift=999999; shiftInfo->maxShift=0; for (ii = 0; ii < shiftInfo->numberColumns; ii++) { if (shiftInfo->column[ii].haveValue) { shift = abs(shiftInfo->column[ii].residuesShifted); sum += (float) shift; if (shift < shiftInfo->minShift) shiftInfo->minShift=shift; if (shift > shiftInfo->maxShift) shiftInfo->maxShift=shift; positionsCounted++; } } if (positionsCounted == 0) { shiftInfo->meanShift = 9999; if (verbose) { printf("No positions available - mean shift undefined\n"); } } else { shiftInfo->meanShift = sum / ((float) positionsCounted); if (verbose) { printf("Num columns=%d min(|shift|)=%d max(|shift|)=%d mean=%f\n", positionsCounted, shiftInfo->minShift, shiftInfo->maxShift, shiftInfo->meanShift); } } shiftInfo->positionsCounted = positionsCounted; return(normalizedShiftMeasure); } /* * dumpShiftDetail - dump the per-position detail of the candidate * alignment into the indicated file. * * This function is called when the user wants details - namely the * details of by how much each alignment column is shifted, etc. * For each column in the candidate alignment, it lists template and * target residues, columns shifted, and residues shifted. Residues * shifted can take one of two flavors. By default, it's the pure, * unaltered number of sequence positions. Alternatively, it can be * the magnitude of the shift relative to the smallest shift removed * from the alignment in order to acheive an optimal subalignment. * That apparently-sounding quirky information is useful for determining * if a shift is large for this alignment. */ void dumpShiftDetail( char *shiftDetailFileName, /* called with: name of the * output file */ shiftDetail *candidateInfo, /* called with: the gross detail * on the candidate alignment */ int matchOnly, /* called with: TRUE if only the * matched positions should have * detail dumped, FALSE if all * positions should be detailed */ int dumpRelativeMagnitude, /* called with: flag indicating * if we're to give the residues * shifted in terms of magnitude * relative to the smallest * shift removed */ int lastShiftRemovedMagnitude) /* called with: magnitude of the * smallest shift removed */ { FILE *fp; int ii; fp = fopen(shiftDetailFileName, "w"); if (fp == NULL) { fprintf(stderr, "Error: could not open %s for writing!\n", shiftDetailFileName); } else { fprintf(fp, "templat\ttarget\tdefined\tcol\tresidues\n"); for (ii = 0; ii < candidateInfo->numberColumns; ii++) { if (!matchOnly || ((isupper(candidateInfo->column[ii].templateChar) || candidateInfo->column[ii].templateChar == '-') && (isupper(candidateInfo->column[ii].targetChar) || candidateInfo->column[ii].targetChar == '-'))) { fprintf(fp, "%c\t %c\t%d\t%d\t", candidateInfo->column[ii].templateChar, candidateInfo->column[ii].targetChar, (candidateInfo->column[ii].haveValue == TRUE), candidateInfo->column[ii].columnsShifted); if (dumpRelativeMagnitude) { if (candidateInfo->column[ii].haveValue) { if (lastShiftRemovedMagnitude == 0) { fprintf(fp, "%d\n", -1 - abs(candidateInfo->column[ii].residuesShifted)); } else { fprintf(fp, "%d\n", lastShiftRemovedMagnitude - abs(candidateInfo->column[ii].residuesShifted)); } } else { fprintf(fp, "%d\n", abs(candidateInfo->column[ii].residuesShifted)); } } else { fprintf(fp, "%d\n", candidateInfo->column[ii].residuesShifted); } } } } } /* * freeShiftDetail - free the shiftInfo struct */ void freeShiftDetail( shiftDetail *shiftInfo) /* called with: a filled-in shift detail struct */ { free(shiftInfo->referenceFilename); free(shiftInfo->candidateFilename); free(shiftInfo->template); free(shiftInfo->target); free(shiftInfo->column); free(shiftInfo); } /* * compute_shift - measure and output shift data * * this function does the mechanics of measuring the shift with one * assignment of template and target sequence, and with one assignment * of reference and candidate alignment. The return value is the * half of the normalized shift measure corresponding to this template * and target assignment. The function returns with the shiftDetail * struct filled in pertaining to the candidate alignment. */ float compute_shift( AlignSummary *reference, /* called with: structure detailing the * reference alignment, the one that * the shiftDetail struct does not * pertain to directly. */ AlignSummary *candidate, /* called with: structure detailing the * candidate alignment, the one that the * shiftDetail struct pertains to */ char *templateName, /* called with: name of the template * structure */ char *targetName, /* called with: name of the target * structure */ int verbose, /* called with: flag indicating whether * or not to print verbose output */ float epsilon, /* called with: the value to use for * the epsilon parameter in the shift score */ shiftDetail **shiftInfo) /* return with: a pointer to the shift * detail struct filled in here. */ { float normalizedShiftSubmeasure; int is_okay; is_okay = computeShiftInfo(reference, candidate, templateName, targetName, shiftInfo); normalizedShiftSubmeasure = computeShiftInfoStats(*shiftInfo, templateName, targetName, epsilon, verbose); return(normalizedShiftSubmeasure); } /* * findWorstShift - identify the position with the worst shift * in the indicated shiftInfo struct. * * Return value: index of the worst position, or -1 if there is no * aligned position or shifted position. */ int findWorstShift( shiftDetail *shiftInfo) /* called with: struct detailing the * shift per position for some alignment */ { int worstShift = 0; int worstShiftPosition = -1; int ii; for (ii = 0; ii < shiftInfo->numberColumns; ii++) { if (shiftInfo->column[ii].haveValue) { if (abs(shiftInfo->column[ii].residuesShifted) > worstShift) { worstShift = abs(shiftInfo->column[ii].residuesShifted); worstShiftPosition = ii; } } } return(worstShiftPosition); } /* * findRemoveWorstShift - find the most shifted position in the candidate * alignment and remove it. * * This function looks for the worst shift in both versions of the alignment * (target to template, and template to target). If the position it finds * has a nonzero shift, then it removes the culprit position from the * candidate alignment by turning the matches into inserts. * * Return value: If a column is trimmed, the index of the column trimmed * is returned. Otherwise, the return value is -1. * -1 otherwise */ int findRemoveWorstShift( AlignSummary *reference, /* called with: structure detailing * the reference alignment */ AlignSummary *candidate, /* called with: structure detailing * the candidate alignment * return with: structure detailing * the revised alignment */ int verbose, /* called with: flag indicating * if output should be echoed * back to the user */ int *shiftRemoved, /* called with: the larger magnitudes * of the shift removed */ shiftDetail *targetToTemplate, /* called with: structure describing * shift of the target sequence * relative to template */ shiftDetail *templateToTarget) /* called with: structure describing * shift of the template sequence * relative to target */ { int return_value = -1; int targetColumn, templateColumn; int targetSeqPosition, templateSeqPosition; int referenceCol, candidateCol; int templateRow, targetRow; int ii; targetRow = findSequenceIndex(targetToTemplate->target, candidate); templateRow = findSequenceIndex(targetToTemplate->template, candidate); targetColumn = findWorstShift(targetToTemplate); templateColumn = findWorstShift(templateToTarget); if (targetColumn >= 0 || templateColumn >= 0) { /* * shift is reported at the target sequence position in the * candidate alignment. To get rid of that shift, cast the * corresponding column in the alignment to lowercase. */ if (targetColumn<0 || !targetToTemplate->column[targetColumn].haveValue) { candidateCol = templateColumn; *shiftRemoved = abs(templateToTarget->column[templateColumn].residuesShifted); } else if (templateColumn <0 || !templateToTarget->column[templateColumn].haveValue) { candidateCol = targetColumn; *shiftRemoved = abs(targetToTemplate->column[targetColumn].residuesShifted); } else if (abs(targetToTemplate->column[targetColumn].residuesShifted) > abs(templateToTarget->column[templateColumn].residuesShifted)) { candidateCol = targetColumn; *shiftRemoved = abs(targetToTemplate->column[targetColumn].residuesShifted); } else { candidateCol = templateColumn; *shiftRemoved = abs(templateToTarget->column[templateColumn].residuesShifted); } assert(candidateCol >=0); assert(isupper( candidate->sequence[candidateCol][targetRow]) && isupper( candidate->sequence[candidateCol][templateRow])); if (verbose) { printf("Removing column %d, shifts %d and %d\n", candidateCol, targetToTemplate->column[candidateCol].residuesShifted, templateToTarget->column[candidateCol].residuesShifted); } removeAlignmentColumn(candidate, candidateCol); return_value = candidateCol; } return(return_value); } /* * trim_alignment - trim the candidate alignment to its optimal subalignment * * This function takes as input a reference and candidate alignment, amoung * other things. It determines the subalignment of the candidate alignment * that receives the optimal shift score. It returns with the final shift * score, and the number of iterations required to obtain it. * * Return value: TRUE if everything worked okay, FALSE otherwise */ int trim_alignment( AlignSummary *reference, /* called with: structure describing * the reference alignment */ AlignSummary *candidate, /* called with: structure describing * the candidate alignment */ char *templateName, /* called with: name of the template sequence */ char *targetName, /* called with: name of the target sequence */ int verbose, /* called with: flag set to TRUE * for verbose output */ float epsilon, /* called with: parameter for the * shift score computation */ int thisFunctionVerbose, /* called with: flag set to TRUE if we want * to see the output messages from this * function */ shiftSummary *final) /* return with: the shift score * of the optimal subalignment */ { float ShiftScore; int is_okay = TRUE; shiftDetail *shiftTemplateTarget, *shiftTargetTemplate; int column_removed = -1; int done = FALSE; int iteration; int printOutput; int thisShiftMagnitude, previousShiftMagnitude; assert(final); /* must be somewhere to put the results */ /* * For each iteration of the loop below, find the position with the * single greatest shift (whether target-template or template-target) * and remove it from the candidate alignment. The loop finishes when * the normalized shift score no longer improves, or there are no more * aligned or shifted positions to remove. */ thisShiftMagnitude = previousShiftMagnitude = 0; printOutput = verbose || thisFunctionVerbose; final->shift_score_opt = -1 * epsilon; final->num_cols_ref = reference->columns; final->num_cols_cand = candidate->columns; final->num_match_ref = count_match(reference); final->num_match_cand = count_match(candidate); iteration = 0; while (!done) { iteration++; ShiftScore = compute_shift(reference, candidate, templateName, targetName, verbose, epsilon, &shiftTargetTemplate); ShiftScore += compute_shift(reference, candidate, targetName, templateName, verbose, epsilon, &shiftTemplateTarget); if (iteration == 1) { final->shift_score_cand = ShiftScore; final->min_shift_cand = min(shiftTargetTemplate->minShift, shiftTemplateTarget->minShift); final->max_shift_cand = max(shiftTargetTemplate->maxShift, shiftTemplateTarget->maxShift); final->mean_shift_cand = (shiftTargetTemplate->meanShift + shiftTemplateTarget->meanShift)/2; final->num_pairs_ref = shiftTemplateTarget->alignedPairsReference; final->num_pairs_cand = shiftTemplateTarget->alignedPairsCandidate; final->pos_counted_cand = shiftTemplateTarget->positionsCounted +shiftTargetTemplate->positionsCounted; } if (printOutput) printf("\n%d: Shift score %f ", iteration, ShiftScore); if (final->shift_score_opt < ShiftScore) { final->shift_score_opt = ShiftScore; final->min_shift_opt = min(shiftTargetTemplate->minShift, shiftTemplateTarget->minShift); final->max_shift_opt = max(shiftTargetTemplate->maxShift, shiftTemplateTarget->maxShift); final->mean_shift_opt = (shiftTargetTemplate->meanShift + shiftTemplateTarget->meanShift)/2; final->num_pairs_opt = shiftTemplateTarget->alignedPairsCandidate; final->pos_counted_opt = shiftTemplateTarget->positionsCounted +shiftTargetTemplate->positionsCounted; /* try removing another */ previousShiftMagnitude = thisShiftMagnitude; column_removed = findRemoveWorstShift(reference, candidate, printOutput, &thisShiftMagnitude, shiftTargetTemplate, shiftTemplateTarget); } else { if (column_removed >=0) { /* removed one too many */ if (printOutput) { printf("Restoring column %d\n", column_removed); printf("Final shift score %f, initial %f, %d columns removed\n", final->shift_score_opt, final->shift_score_cand, iteration-2); } restoreAlignmentColumn(candidate, column_removed); } done = TRUE; } freeShiftDetail(shiftTargetTemplate); freeShiftDetail(shiftTemplateTarget); } final->last_shift_removed_magnitude = previousShiftMagnitude; final->num_match_opt = count_match(candidate); return(is_okay); } /* printRDBresults * * print results in format of an RDB database file * * return TRUE if printing goes ok. * */ int printRDBresults( char * RDBFilename, /* if 0, then print to stdout */ shiftSummary *results, /* most information comes from here */ char *templateName, /* name of template sequence */ char *targetName /* name of target sequence */ ) { FILE *RDBFile; if (RDBFilename) { RDBFile = fopen(RDBFilename, "w"); if (RDBFile == NULL) { fprintf(stderr, "Could not open %s for writing\n", RDBFilename); return FALSE; } } else RDBFile=stdout; fprintf(RDBFile, "template\ttarget\tref_match\tcand_match\tcand_score\tcand_min\tcand_max\tcand_mean\topt_match\topt_score\topt_min\topt_max\topt_mean\n"); fprintf(RDBFile, "30S\t30S\t10N\t10N\t10N\t10N\t10N\t10N\t10N\t10N\t10N\t10N\t10N\n"); fprintf(RDBFile, "%s\t%s\t%d\t%d\t%.5f\t%d\t%d\t%.3f\t%d\t%.5f\t%d\t%d\t%.3f\n", templateName, targetName, results->num_match_ref, results->num_match_cand, results->shift_score_cand, results->min_shift_cand, results->max_shift_cand, results->mean_shift_cand, results->num_match_opt, results->shift_score_opt, results->min_shift_opt, results->max_shift_opt, results->mean_shift_opt); fclose(RDBFile); return TRUE; }