// SeqWeight.cc
// Kevin Karplus
// 23 Oct 1995
// Sequence weighting techniques applied to alignment (no insertions)

#include <assert.h>
#define __USE_ISOC99 1          // for isfinite() in math.h
#include <math.h>
#include <iostream.h>

#include "Utilities/abs.h"
#include "Utilities/minmax.h"
#include "Regularizer/MLZReg.h"
#include "SeqWeight.h"
#include "alignment.h"          // for the alignment data structure
#include "Globals.h"            // for default parameters to "execute"
#include "NeuralNet.h"
#include "InterfaceDescription.h"

NameToPtr* SequenceWeightObject::WeighterTable = 0;

void SequenceWeightObject::execute(alignment& b, int interface_num,
        ostream* weight_file) const
{
    const InterfaceDescription* ifd = Globals::nn()->interface(interface_num);
    float bits_to_save = ifd->SequenceWeightBitsToSave;
    Regularizer* r = ifd->WeightingRegularizer;
    float extra_param = ifd->SequenceWeightParam;
    float clip_exponent = ifd->ClipExponent;
    (*Function)(b, bits_to_save, weight_file, r, extra_param,
        clip_exponent >= 0 ? pow(b.num_seqs, clip_exponent) : 0.0);
}

static void AdjustWeights(alignment& b,
        float bits_to_save,
        ostream* weight_file,
        Regularizer* given_weighting_reg,
        float relative_weight_power,
        float clip_to)
{
    if (b.num_seqs <= 0 || !b.weights) return;
    int alphabet_size = b.alphabet_size();

    ostream* errors_to = &cerr;
    if (weight_file)
    {
        (*weight_file) << "# weighting_entropy attempting to save "
            << bits_to_save << " bits/position\n";
        if (relative_weight_power)
            (*weight_file) << "# relative weights are encoding cost^"
                << relative_weight_power << "\n";
        else
            (*weight_file) << "# relative weights same as initial weights\n";
        if (clip_to > 0)
            (*weight_file) << "# clipping total weight to <= "
                << clip_to << "\n";
        errors_to = weight_file;
    }

    if (bits_to_save < 0)
    {
        (*errors_to) << "# requested less than 0 bits saved, using initial weights\n";
        if (weight_file) b.report_weights(*weight_file);
        return;
    }

    Regularizer* weighting_reg = given_weighting_reg ? given_weighting_reg
        : new MLZReg(b.alphabet, 1.e-8, "tiny_zero-offset_regularizer");
    //  (*errors_to) << "# using regularizer " << weighting_reg->name()
    //      << "\n";

    float* num_char = new float[alphabet_size]; // weighted number of each char.
    float* probs = new float[alphabet_size];    // regularized probability
    int c;
    for (c = alphabet_size-1; c >= 0; c--)
        num_char[c] = 0;
    weighting_reg->get_probs(num_char, probs);  // estimate background probs
    double max_entropy = 0.;
    for (c = alphabet_size-1; c >= 0; c--)
        max_entropy -= probs[c] * log(probs[c]);

    float nats_to_save = bits_to_save * M_LN2;
    float target_in_nats = max_entropy - nats_to_save;
    if (target_in_nats <= 0.)
    {
        (*errors_to) << "# AdjustWeights can't possibly save "
            << bits_to_save << " bits/column,\n"
            << "# since even with zero weights it only takes "
            << max_entropy*M_LOG2E << " bits/column.\n"
            << flush;
        double sum = 0.;
        for (int s = b.num_seqs-1; s >= 0; s--)
            sum += b.weights[s];
        for (int s = b.num_seqs-1; s >= 0; s--)
            b.weights[s] *= (clip_to > 0 && sum > 0) ? clip_to/sum : 1.e6;
        if (weight_file) b.report_weights(*weight_file);
        delete [] num_char;
        delete [] probs;
        return;
    }
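    // Illustrative numbers (added commentary, not from the original code):
    // the default MLZReg background with a tiny zero-offset is essentially
    // flat, so for a 20-letter alphabet max_entropy = ln(20) ~= 3.00 nats
    // ~= 4.32 bits/column.  Asking to save bits_to_save = 3.0 bits gives
    // nats_to_save ~= 2.08 and target_in_nats ~= 0.92, i.e. the iteration
    // below scales the weights until the average column entropy drops to
    // about 0.92 nats (1.32 bits).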
    double* entropy = new double[b.num_seqs];
        // encoding cost for each sequence
    double* rel_weight = new double[b.num_seqs];
        // relative weight of seqs
    int* num_normal = new int[b.num_seqs];
        // how many normal characters are there in each sequence?
    int total_num_normal = 0;   // total number of normal characters
    int num_real_seqs = b.num_seqs;
        // how many of the sequences have some normal characters?

    int s;
    for (s = b.num_seqs-1; s >= 0; s--)
    {
        num_normal[s] = 0;
        for (int col = b.width-1; col >= 0; col--)
        {
            Base bs = b.data[s][col];
            if (bs.is_normal())
            {
                num_normal[s]++;
                total_num_normal++;
            }
        }
        if (num_normal[s] == 0)
        {
            b.weights[s] = 0;   // don't count all-gaps
            num_real_seqs--;
        }
    }

    double total_real_entropy = -999;   // sum of true entropy for columns
    int num_real_cols = b.width;        // number of columns that aren't empty
    double old_tot_weight = 0;
    double new_tot_weight = b.total_weight();

    const int max_iter = 46;
    int last_iter = 0;          // set during the last iteration
    double eta = 0.7;           // interpolation weight
    double* char_entropy = new double[alphabet_size];

    // keep track of two previous iterations
    float older_nats_saved = -999;
    float old_nats_saved = -999;
    float nats_saved = -999;

    int iter;   // not inside for, so that actual iter count can be reported.
    for (iter = 0;
         iter < max_iter && new_tot_weight > 1.e-20 && !last_iter;
         iter++)
    {
        last_iter = (iter+1 == max_iter)
            || kk::abs(target_in_nats - total_real_entropy/num_real_cols)
                < 1.0e-5 * target_in_nats;

        for (s = b.num_seqs-1; s >= 0; s--)
            entropy[s] = 0;
        total_real_entropy = 0;

        // compute entropy for each character in each column,
        // and sum for each sequence, also sum total for all columns
        double total_seq_entropy = 0;   // sum of encoding cost of sequences
        num_real_cols = 0;
        for (int col = b.width-1; col >= 0; col--)
        {
            float total_num_char = 0;
            int c;
            for (c = alphabet_size-1; c >= 0; c--)
                num_char[c] = 0;
            for (s = b.num_seqs-1; s >= 0; s--)
            {
                Base bs = b.data[s][col];
                if (bs.is_normal())
                {
                    num_char[b.alphabet->index(bs)] += b.weights[s];
                    total_num_char += b.weights[s];
                }
            }
            if (total_num_char <= 0.) continue;
            num_real_cols++;
            weighting_reg->get_probs(num_char, probs);
            for (c = alphabet_size-1; c >= 0; c--)
            {
                if (probs[c] > 0)
                {
                    char_entropy[c] = -log(probs[c]);
                    total_real_entropy += probs[c]*char_entropy[c];
                }
                else char_entropy[c] = 99999.;
            }
            for (s = b.num_seqs-1; s >= 0; s--)
            {
                Base bs = b.data[s][col];
                if (bs.is_normal())
                    entropy[s] += char_entropy[b.alphabet->index(bs)];
            }
        }

        double total_rel_weight = 0;
        for (s = b.num_seqs-1; s >= 0; s--)
        {
            rel_weight[s] = relative_weight_power ?
                (num_normal[s] == 0 ? 0
                    : pow(static_cast<double>(entropy[s])/num_normal[s],
                        static_cast<double>(relative_weight_power)))
                : b.weights[s];
            total_rel_weight += rel_weight[s];
            if (num_normal[s])
                total_seq_entropy += entropy[s]/num_normal[s];
        }
        for (s = b.num_seqs-1; s >= 0; s--)
        {
            rel_weight[s] /= total_rel_weight;
        }

        // NEW METHOD: make average entropy=param
        // NOTE: only works if there is a weighting regularizer, and preferably
        // a Dirichlet mixture or pseudocount one!
        older_nats_saved = old_nats_saved;
        old_nats_saved = nats_saved;
        nats_saved = max_entropy - total_real_entropy/num_real_cols;

        // assume (without justification) that savings are
        // proportional to sequence weight ^ power.
        const double power = 1.0;
        double ideal_weight = nats_saved <= 0 ? 1.0
            : (new_tot_weight
                * pow(static_cast<double>(nats_to_save)/nats_saved,
                    static_cast<double>(1.0)/power));
        if (clip_to > 0 && ideal_weight > clip_to)
            ideal_weight = clip_to;

        // set interpolation weight for new estimate
        // if we seem to be moving in one direction, use almost all new weights.
        // if we seem to be oscillating, damp heavily.
        eta = older_nats_saved < 0 ? 0.7
            : (older_nats_saved-old_nats_saved)*(old_nats_saved-nats_saved) > 0 ?
                min(0.8, eta*1.2)
                : min(eta * 0.5, 0.5);
        if (clip_to > 0 && old_tot_weight > clip_to)
            eta = 1.0;

        double damped_ideal_weight =
            exp(eta*log(ideal_weight) + (1-eta)*log(new_tot_weight));
#ifdef DEBUG
        assert(isfinite(damped_ideal_weight));
#endif

        // Damp the relative weights also.
        // Linear interpolation preserves sum=1.
        // Note: MUST have 0 <= eta <= 1.0 to keep weights non-negative.
        if (relative_weight_power)
        {
            for (s = b.num_seqs-1; s >= 0; s--)
            {
                rel_weight[s] = eta*rel_weight[s]
                    + (1-eta)*b.weights[s]/new_tot_weight;
            }
        }

        // desired weight of sequence s proportional to rel_weight[s]
        for (s = b.num_seqs-1; s >= 0; s--)
        {
            b.weights[s] = damped_ideal_weight * rel_weight[s];
        }
        old_tot_weight = new_tot_weight;
        new_tot_weight = b.total_weight();
    }
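    // Added commentary (an illustrative trace of the loop above, not from
    // the original code): suppose nats_to_save = 2.0 and successive
    // iterations save 1.5, 2.3, then 1.8 nats.  The differences (1.5-2.3)
    // and (2.3-1.8) have opposite signs, so the product test detects
    // oscillation and eta is halved, shrinking the geometric step
    // exp(eta*log(ideal) + (1-eta)*log(current)) toward the current total
    // weight; had both differences shared a sign, eta would instead grow
    // by a factor of 1.2 (capped at 0.8) to speed convergence.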
    (*errors_to) << "# " << b.alignmentID
        << " with " << b.num_seqs << " sequences,"
        << " total weight= " << b.total_weight()
        << " avg weight= " << b.total_weight()/num_real_seqs
        << ((clip_to > 0 && kk::abs(b.total_weight()-clip_to) < 0.0001) ?
            " clipped " : " ")
        << iter << " iterations"
        << "\n";

    if (kk::abs(target_in_nats - total_real_entropy/num_real_cols)
            > 0.0001 * target_in_nats)
    {
        (*errors_to) << "# AdjustWeights couldn't save exactly "
            << bits_to_save << " bits/position, saving "
            << (max_entropy - total_real_entropy/num_real_cols) * M_LOG2E
            << " bits.\n" << flush;
    }
    if (weight_file) b.report_weights(*weight_file);

    if (!given_weighting_reg) delete weighting_reg;
    delete [] char_entropy;
    delete [] num_normal;
    delete [] rel_weight;
    delete [] entropy;
    delete [] num_char;
    delete [] probs;
}

// set the weights of b equal, so that
//      total_weight() = pow(num_seqs, pow_weight)
static void InitFlatWeight(alignment& b, float pow_weight,
        ostream* weight_file)
{
    float weight = pow(b.num_seqs, pow_weight) / b.num_seqs;
    if (weight_file)
    {
        (*weight_file) << "# flat weighting with "
            << b.num_seqs << " sequences\n"
            << "# initial total weight = num_seqs^" << pow_weight
            << " = " << weight*b.num_seqs << "\n";
    }
    for (int i = b.num_seqs-1; i >= 0; i--)
        b.weights[i] = weight;
}

static void FlatWeight(alignment& b, float bits_to_save,
        ostream* weight_file,
        Regularizer* reg,
        float pow_weight,
        float clip_to)
{
    if (b.num_seqs <= 0 || !b.weights) return;
    InitFlatWeight(b, pow_weight, weight_file);
    AdjustWeights(b, bits_to_save, weight_file, reg, 0, clip_to);
}

static SequenceWeightObject Flat("FlatWeight", FlatWeight,
"Sets the weights so that the average entropy per column saves the\n\
specified number of bits over the background given by the\n\
WeightingRegularizer. If no WeightingRegularizer is specified, a small\n\
zero-offset is used, giving a flat background distribution.\n\
\n\
Sets the weights of all sequences in a block equal.\n\
\n\
If a negative savings is specified, the initial weighting is used\n\
without attempting to adjust the bits saved. The total weight for the\n\
alignment is then set to pow(num_seqs, parameter). With the default\n\
parameter value of 1, this sets the total weight to the number of\n\
sequences. With parameter=0, the total weight is 1.\n\
\n\
Wildcards and other non-normal characters are ignored.");

// sets the weights using the Henikoffs' position-specific weighting
// scheme to set the relative weights,
// but sets the total weight to pow(num_seqs, pow_weight)
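// A worked example (added commentary): for a column containing A, A, C
// across three sequences, there are 2 distinct characters, so each A
// contributes 1/(2*2) = 0.25 and the C contributes 1/(2*1) = 0.5 to its
// sequence's weight; rarer characters get more of the column's unit of
// weight, which is the point of the Henikoffs' scheme.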
static void InitHenikoffWeight(alignment& b, float pow_weight,
        ostream* weight_file)
{
    if (b.num_seqs <= 0 || !b.weights) return;
    int alphabet_size = b.alphabet_size();

    int* num_normal = new int[b.num_seqs];
        // how many normal characters are there in each sequence?
    int s;
    for (s = b.num_seqs-1; s >= 0; s--)
    {
        b.weights[s] = 0.;
        num_normal[s] = 0;
    }

    int* num_char = new int[alphabet_size];
    for (int col = b.width-1; col >= 0; col--)
    {
        for (int c = alphabet_size-1; c >= 0; c--)
            num_char[c] = 0;
        int num_different_char = 0;
        for (s = b.num_seqs-1; s >= 0; s--)
        {
            Base bs = b.data[s][col];
            if (bs.is_normal())
            {
                int subscript = b.alphabet->index(bs);
                if (!num_char[subscript]) num_different_char++;
                num_char[subscript]++;
                num_normal[s]++;
            }
        }
        for (s = b.num_seqs-1; s >= 0; s--)
        {
            Base bs = b.data[s][col];
            if (bs.is_normal())
                b.weights[s] +=
                    1. / (num_different_char * num_char[b.alphabet->index(bs)]);
        }
    }
    delete [] num_char;

    for (s = b.num_seqs-1; s >= 0; s--)
        if (num_normal[s]) b.weights[s] /= num_normal[s];
    // each sequence is now weighted with its average share of weight
    delete [] num_normal;

    double scale = pow(b.num_seqs, pow_weight) / b.total_weight();
    for (s = b.num_seqs-1; s >= 0; s--)
        b.weights[s] *= scale;

    if (weight_file)
    {
        (*weight_file) << "# Initially Henikoff position-specific weighting with "
            << b.num_seqs << " sequences\n"
            << "# total_weight = num_seqs^" << pow_weight
            << " = " << b.total_weight() << "\n";
    }
}

static void HenikoffWeight(alignment& b, float bits_to_save,
        ostream* weight_file,
        Regularizer* reg,
        float pow_weight,
        float clip_to)
{
    if (b.num_seqs <= 0 || !b.weights) return;
    InitHenikoffWeight(b, pow_weight, weight_file);
    AdjustWeights(b, bits_to_save, weight_file, reg, 0, clip_to);
}

static SequenceWeightObject Henikoff("HenikoffWeight", HenikoffWeight,
"Sets the weights so that the average entropy per column saves the\n\
specified number of bits over the background given by the\n\
WeightingRegularizer. If no WeightingRegularizer is specified, a small\n\
zero-offset is used, giving a flat background distribution.\n\
\n\
Relative weights of sequences are set proportional to the Henikoffs'\n\
position-specific weighting scheme (J. Mol. Biol. 243:574-578, Nov 1994).\n\
\n\
If a negative savings is specified, the initial weighting is used\n\
without attempting to adjust the bits saved. The total weight for the\n\
alignment is then set to pow(num_seqs, parameter). With the default\n\
parameter value of 1, this sets the total weight to the number of\n\
sequences. With parameter=0, the Henikoffs' scheme (summing to 1 for\n\
the block) is used.\n\
\n\
Wildcards and other non-normal characters are ignored.");

// set the weights so that
//      average entropy = entropy at weight=0 - bits_to_save
// and individual sequences are weighted proportional to the entropy
// for that sequence raised to the entropy_power
static void EntropyWeight(alignment& b, float bits_to_save,
        ostream* weight_file,
        Regularizer* weighting_reg,
        float entropy_power,
        float clip_to)
{
    if (b.num_seqs <= 0 || !b.weights) return;
    if (entropy_power)
        InitHenikoffWeight(b, 0.5, weight_file);
    else
        InitFlatWeight(b, 0.5, weight_file);
    AdjustWeights(b, bits_to_save, weight_file, weighting_reg,
        entropy_power, clip_to);
}

static SequenceWeightObject Entropy("EntropyWeight", EntropyWeight,
"Sets the weights so that the average entropy per column saves the\n\
specified number of bits over the background given by the\n\
WeightingRegularizer. If no WeightingRegularizer is specified, a small\n\
zero-offset is used, giving a flat background distribution.\n\
If a negative savings is specified, the initial weighting is used\n\
without attempting to adjust the bits saved.\n\
\n\
Relative weights of sequences are set proportional to their encoding\n\
costs using the final distribution raised to a power (default=1).\n\
\n\
Wildcards and other non-normal characters are ignored.");
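// Added illustration (not from the original code): with entropy_power=1,
// AdjustWeights sets rel_weight[s] proportional to entropy[s]/num_normal[s],
// so a sequence whose residues cost 4.0 nats each on average gets twice
// the relative weight of one costing 2.0 nats; divergent sequences, which
// are expensive to encode under the weighted profile, are up-weighted
// relative to redundant ones.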
// Change Log:
// (earlier changes not recorded)
// 28 April 1996 Kevin Karplus
//      Made all weighters take bits-to-save as a parameter,
//      and separated out AdjustWeights procedure to do it.
// 29 July 1996 Kevin Karplus
//      Changed TargetRegularizer to WeightingRegularizer
//      Changed estimate of ideal weight to assume that nats saved are
//      roughly proportional to sequence weight.
// 7 Dec 1996 Kevin Karplus
//      Changed multi-line strings and local arrays to match CXX's
//      subset of c++
// 23 March 1997 Kevin Karplus
//      increased number of iterations to 40
//      started EntropyWeight with total weight = sqrt(num_seq)
// 20 February 2000 Kevin Karplus
//      Copied clip_to parameter from estimate-dist
// 9 October 2001 Kevin Karplus
//      Added interface_num to SequenceWeigher::execute()
// 9 April 2004 Kevin Karplus
//      Changed from abs.h to Utilities/abs.h
// 26 May 2004 Kevin Karplus
//      Added isfinite assertions, protected by ifdef DEBUG
// Fri Jun 24 20:44:10 PDT 2005 Kevin Karplus
//      Replaced Globals::NN with Globals::nn()