// OneChain.cc
// Kevin Karplus
// 21 July 1997

#include <iostream.h>	// assumed; cerr and endl are used below
#include <string.h>	// for strcpy
#include <fstream.h>	// assumed; for tellg() and seekg()
#include "OneChain.h"
#include "TrainSet.h"
#include "Regularizer/MLZReg.h"
#include "Regularizer/BackgroundProbs.h"
#include "AlphabetTuple/AlphabetTuple.h"
#include "Alphabet/Alphabet.h"
#include "Utilities/Random.h"
#include "gen_sequence/GenSequence.h"
#include "Alphabet/CombinedAlphabet.h"
#include "Alphabet/GaussianAlphabet.h"
#include "Globals.h"

// To force linking of CombinedAlphabet and GaussianAlphabet
static IdObject* tmp_ga = GaussianAlphabet::classID();
static IdObject* tmp_ca = CombinedAlphabet::classID();

const float HasInsertProb=0.15;
	// taken from M->I probability of fssp-trained-regularizer
	// This is a very crude estimate of how many columns have an
	// insertion somewhere in the column, and should probably be
	// re-estimated.

const float HasDeleteProb=0.75;
	// This is entirely a guess, has no basis in anything but being
	// a placeholder, and should be reestimated.

float* OneChain::ZeroProbs=0;
float* OneChain::ZeroCounts=0;
float* OneChain::ZeroComponentProbs=0;

void OneChain::set_zero_counts(Regularizer *r)
{
    const InterfaceDescription *ifd = NN->interface(0);

    // We've got a lot of sizes here, so let's do a quick run-down on what
    // each of them is.  Recall that the alignment interface can
    // include any of (amino acid probs, insert probs, delete probs,
    // guide sequence, component probs).
    // - num_units is the number of units for this layer.  It's not
    //   directly tied to the contents of the layer, though the sizes
    //   should correspond.  See the definition of NumUnits in InterfaceDescription.h.
    // - num_alph is the number of normal characters in the alphabet.
    // - extended_alph_size is num_alph plus 1 position each for inserts and deletes.
    // - max_num is the max of num_units and num_alph.
    // - num_comp is the number of components in the Dirichlet regularizer.

    // NOTE: may have to modify what follows to use ifd->TupleStates
    int num_units = ifd->num_units();
    DirichletReg* dirch = (r==NULL || !r->is_a(DirichletReg::classID()))?
        NULL : static_cast<DirichletReg*>(r);
    int num_comp = (dirch==NULL)? 0: dirch->num_components();
    int num_alph = (ifd->Alpha!=NULL)? ifd->Alpha->num_normal() : num_units;
    int max_num = (num_alph>num_units? num_alph: num_units);

    //cerr << "NumAlph is: " << num_alph << endl << flush;
    //cerr << "NumUnits is: " << num_units << endl << flush;

    // If NetRegularizer is specified for the interface description,
    // UseComponentProbs and ReRegularizers for that interface have to
    // be off.  If they are on, they are turned off.
    assert(ifd->num_units_ok());

    double entropy = log(num_alph+0.0);

    // ZeroCounts, ZeroProbs, and data[layer]->ProbVectors are all
    // packed in the same order.  To see in what order data goes into
    // these arrays, see the comments above ProbVector in the ChainData
    // class declaration.
    if (!ZeroCounts) {
        ZeroCounts = new float[max_num];
        if (!ZeroProbs) ZeroProbs = new float[max_num];
        for (int c=max_num-1; c>=0; c--) {
            ZeroCounts[c]=ZeroProbs[c]=0;
        }
    }

    if (r) {
        float *dummyZeroArray = new float[num_alph];
        for (int jj = 0; jj < num_alph; jj++)
            dummyZeroArray[jj] = 0.0;

        // Just in case you find it confusing that an array dimensioned
        // to max_num is the input and an array dimensioned to num_alph
        // is the output, here's what's going on.  When get_probs is
        // called below with a zeroed array for input, it will compute
        // probabilities for each of the num_alph alphabet characters
        // of seeing that character when having seen no counts.  So, in
        // ZeroCounts, get_probs pays attention to only the first
        // num_alph array cells.
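        // In effect, dummyZeroArray ends up holding the regularizer's
        // background distribution (its estimate given zero observed counts);
        // this is the same distribution that ZeroProbs carries into the
        // UseLogOdds computation in RegularizeAlignment below.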
        r->get_probs(ZeroCounts, dummyZeroArray);

        entropy=0;
        for (int jj = 0; jj < num_alph; jj++) {
            float p = dummyZeroArray[jj];
            entropy -= (p<=0? 0: (p* log(p)));
        }

        if (!ZeroComponentProbs && num_comp>0) {
            ZeroComponentProbs = new float[num_comp];
            assert(dirch);
            const double * double_component_probs = dirch->component_probs();
            // note: double_component_probs belongs to dirch(=r)
            for (int c=num_comp-1; c>=0; c--)
                ZeroComponentProbs[c] = double_component_probs[c];
        }

        // Pack ZeroProbs according to what we're using: amino acid
        // probs, inserts, deletes, guide sequence, component probs.
        ifd->fill_vector(ZeroProbs, dummyZeroArray,
                HasInsertProb, HasDeleteProb,
                dummyZeroArray, ZeroComponentProbs,
                entropy, 0);

        delete [] dummyZeroArray;
    }
}

// This section builds the data needed for input to the given layer
// based on the alignment.
void OneChain::RegularizeAlignment(DirichletReg *r, const alignment& align,
        int layer)
{
    const InterfaceDescription *ifd = NN->interface(layer);

    // We've got a lot of sizes here, so let's do a quick run-down on what
    // each of them is.  Recall that the alignment interface can
    // include any of (amino acid probs, insert probs, delete probs,
    // guide sequence, component probs).
    // - num_units is the number of units for this layer.  It's not
    //   directly tied to the contents of the layer, though the sizes
    //   should correspond.  See the definition of NumUnits in InterfaceDescription.h.
    // - num_alph is the number of normal characters in the alphabet.
    // - extended_alph_size is num_alph plus 1 position each for inserts and deletes.
    // - max_num is the max of num_units and num_alph.
    // - num_comp is the number of components in the Dirichlet regularizer.
    // - NumCols is the number of columns in this alignment, and is the
    //   orthogonal dimension to everything discussed here.

    // cerr << "# Regularizing alignment for " << align.Filename << "\n" << flush;
    // cerr << "# this = " << this << "\n" << flush;

    // NOTE: may have to modify what follows to use ifd->TupleStates
    int num_units = ifd->num_units();
    int num_comp = r!=NULL? r->num_components(): 0;
    int num_alph = (ifd->Alpha!=NULL)? ifd->Alpha->num_normal() : num_units;

    //cerr << "NumAlph is: " << num_alph << endl << flush;
    //cerr << "NumUnits is: " << num_units << endl << flush;

    // If NetRegularizer is specified for the interface description,
    // UseComponentProbs and ReRegularizers for that interface have to
    // be off.  If they are on, they are turned off.
    assert(ifd->num_units_ok());

    int extended_alph_size = num_alph + 2;

    data[layer] = new ChainData(NumCols);
    // cerr << "# data[layer] = " << data[layer] << "\n" << flush;

    // Set the guide sequence for the layer
    // because it will be needed to create GuideVectors if UseGuide.
    // NOTE: data[layer]->GuideSequence now OWNS the new copy of the first_sequence
    data[layer]->GuideSequence = align.first_sequence();
    data[layer]->GuideAlphabet = align.alphabet;

    set_zero_counts(r);

    typedef float* floatptr;
    data[layer]->ProbVectors = new floatptr[NumCols];
    data[layer]->Counts = new floatptr[NumCols];
    data[layer]->PreviousStep = new floatptr[NumCols];

    // Here's how the (temporary) AminoAcidProbs are indexed,
    // while calculating for a particular column.
    // The first num_alph cells collect the probability for
    // the num_alph characters in the alphabet.  This happens
    // whether or not the amino acid probs are selected for input.
    // Why?  We'd have to do the same work anyway if we used insert
    // or delete probs, so skipping would only save time if we used
    // only component probs.
    // The insert probs are calculated and held in cell num_alph,
    // the first one after the end of the alphabet.
    // The delete probs are calculated and held in cell num_alph + 1.
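    // For example, with the standard 20-letter amino-acid alphabet
    // (num_alph==20, extended_alph_size==22) the temporary layout for
    // one column is:
    //    AminoAcidProbs[0..19]  regularized residue probabilities
    //    AminoAcidProbs[20]     weighted insert probability
    //    AminoAcidProbs[21]     weighted delete probability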
    float *AminoAcidProbs = new float[extended_alph_size];

    // Only the counts for the amino acids are stored for a column.
    float *Counts = new float[num_alph];

    float *ComponentProbs = 0;
    if (r && ifd->UseComponentProbs && num_comp>0)
        ComponentProbs = new float[num_comp];

    // The GuideVector is a one-hot (or zero-hot for deletes) representation
    // of a single column in the guide sequence.
    float *GuideVector = 0;
    if (ifd->UseGuide)
        GuideVector = new float[num_alph];

    // create a null regularizer for use if r does not exist
    MLZReg *NullReg = r? 0: new MLZReg(ifd->Alpha, 1.e-20, "null_regularizer");

    // When we encounter gap characters in the alignment, we want to
    // know if they're real gaps or not.  If a gap appears before the
    // first matching position in a sequence or after the last matching
    // position, it's not considered a "real" gap; it probably just indicates
    // that a smaller sequence has been aligned to the alignment.  To
    // determine whether or not a gap represents a gap in the alignment,
    // we'll want to know where the alignment of each sequence begins
    // and ends.
    int *first_match = new int[align.num_seqs];
    int *last_match = new int[align.num_seqs];
    for (int seqIdx = 0; seqIdx < align.num_seqs; seqIdx++) {
        first_match[seqIdx] = NumCols-1;
        last_match[seqIdx] = 0;
        for (int colIdx = 0; colIdx < NumCols; colIdx++) {
            if (colIdx < first_match[seqIdx]) {
                if (align.data[seqIdx][colIdx].is_normal()) {
                    first_match[seqIdx] = colIdx;
                }
            }
            if (colIdx > last_match[seqIdx]) {
                if (align.data[seqIdx][colIdx].is_normal()) {
                    last_match[seqIdx] = colIdx;
                }
            }
        }
    }

    // Step through the columns, performing sequence-weighted counting
    // of the AminoAcids, Inserts, and Deletes.  Create a GuideVector for the column
    // from the guide sequence if needed.  For each column, regularize
    // the alphabet and/or component probability vector and convert to logodds
    // if desired.  Finally, assemble the pieces to store in ProbVector and Counts
    // for the column in this layer.
    for (int col=0; col<NumCols; col++) {
        // zero the counts for this column
        for (int a=num_alph-1; a>=0; a--)
            Counts[a]=0.0;

        //cerr << "Num Seq: " << align.num_seqs << endl << flush;

        float TotalWeight=0, InsertWeight=0, DeleteWeight=0;

        // collect the weighted counts for the column
        for (int seq=align.num_seqs-1; seq>=0; seq--) {
            float weight = align.weights[seq];
            TotalWeight+=weight;
            if (align.insert_before[seq][col]) {
                InsertWeight+=weight;
            } else if (align.data[seq][col].is_null()) {
                if (col > first_match[seq] && col < last_match[seq]) {
                    DeleteWeight+=weight;
                }
            }

            Base b = align.data[seq][col];
            if (b.is_normal()) {
                Counts[align.alphabet->index(b)] += weight;
            } else {
                // null or wild-card
                const int num = align.alphabet->num_matches(b);
                const Base* allb = align.alphabet->matches(b);
                while (allb->is_normal()) {
                    // this (rare) loop should be for wild cards only
                    assert(b.is_wild());	// excludes is_null()
                    assert(num);
                    // divide the weight among the wild-card matches
                    Counts[align.alphabet->index(*allb)] += weight/num;
                    if (! allb->no_wc_match(b)) {
                        //cerr << "For " << align.alphabet->to_char(*allb) << " matching "
                        //	<< align.alphabet->to_char(b)
                        //	<< " incrementing " << align.alphabet->index(*allb)
                        //	<< "\n";
                    }
                    allb++;
                }
            }
        }

        //cerr << "Counts::" << endl << flush;
        //for(int i=0; i < num_alph; i++)
        //    cerr << i << ": " << Counts[i] << endl << flush;

        // The insert and delete probs are simple calculations.
        AminoAcidProbs[num_alph] = InsertWeight/TotalWeight;
        AminoAcidProbs[num_alph+1] = DeleteWeight/TotalWeight;

        // Regularize the column for AminoAcid and Component probs, if applicable.
        if (r) {
            r->get_probs(Counts, AminoAcidProbs);
            //cerr << "AminoAcidProbs (Reg)::" << endl << flush;
            //for(int i=0; i < num_alph; i++)
            //    cerr << i << ": " << AminoAcidProbs[i] << endl << flush;

            if (ComponentProbs) {
                const double * double_component_probs = r->component_probs();
                for (int c=r->num_components()-1; c>=0; c--)
                    ComponentProbs[c] = double_component_probs[c];
            }
        } else {
            NullReg->get_probs(Counts, AminoAcidProbs);
            // cerr << "AminoAcidProbs (Null Reg)::" << endl << flush;
            // for(int i=0; i < num_alph; i++)
            //    cerr << i << ": " << AminoAcidProbs[i] << endl << flush;
        }

        float entropy=0;
        for(int i=0; i < num_alph; i++) {
            float p = AminoAcidProbs[i];
            entropy -= (p<=0? 0.0: p*log(p));
        }

        // Convert AminoAcid and Component probs to log odds if desired.
        if (ifd->UseLogOdds) {
            assert(r!=NULL);	// need regularizer if log-odds-ratio is to be used.
            const double MAXODDS = 4;
            for(int a=num_alph-1; a>=0; a--) {
                double log_odds = log(AminoAcidProbs[a] / ZeroProbs[a]);
                AminoAcidProbs[a] = log_odds>MAXODDS? MAXODDS
                        : log_odds < -MAXODDS? -MAXODDS
                        : log_odds;
            }
            if (ComponentProbs) {
                for (int c=num_comp-1; c>=0; c--) {
                    double log_odds = log(ComponentProbs[c] / ZeroComponentProbs[c]);
                    ComponentProbs[c] = log_odds>MAXODDS? MAXODDS
                            : log_odds < -MAXODDS? -MAXODDS
                            : log_odds;
                }
            }
        }

        float ProbOfGuide=0;
        // Calculate the GuideVector for this column, if desired.
        if (ifd->UseGuide) {
            for (int i=0; i < num_alph; i++)
                GuideVector[i] = 0.0;
            Base b = data[layer]->GuideSequence[col];
            if (b.is_normal()) {
                GuideVector[align.alphabet->index(b)] = 1.0;
                ProbOfGuide = AminoAcidProbs[align.alphabet->index(b)];
            } else {
                // for a possible wildcard, divide it into its normal matches
                float fnum = static_cast<float>(align.alphabet->num_matches(b));
                const Base* allb = align.alphabet->matches(b);	// Null-terminated list
                ProbOfGuide=0;
                while (allb->is_normal()) {
                    GuideVector[align.alphabet->index(*allb)] = 1.0/fnum;
                    ProbOfGuide += AminoAcidProbs[align.alphabet->index(*allb)];
                    allb++;
                }
            }
        }

        // Assign the layer's storage for ProbVectors and Counts for this column.
        if (ifd->NetRegularizer) {
            data[layer]->ProbVectors[col] = new float[num_alph];
            data[layer]->PreviousStep[col] = new float[num_alph];
            data[layer]->Counts[col] = new float[num_alph];
        } else {
            data[layer]->ProbVectors[col] = new float[num_units];
            data[layer]->PreviousStep[col] = new float[num_units];
            data[layer]->Counts[col] = new float[num_units];
        }

        // Counts is simply a copy of the AA counts for this column.
        for (int CountIndex=0; CountIndex < num_alph; CountIndex++) {
            data[layer]->Counts[col][CountIndex] = Counts[CountIndex];
        }

        // ProbVectors must be assembled from the optional pieces.
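        // fill_vector copies in only the pieces selected by this interface's
        // Use* flags (amino-acid probs, insert prob, delete prob, guide
        // one-hot vector, component probs, entropy, ProbOfGuide); see
        // InterfaceDescription::fill_vector for the exact packing order.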
        ifd->fill_vector(data[layer]->ProbVectors[col],
                AminoAcidProbs,
                AminoAcidProbs[num_alph], AminoAcidProbs[num_alph+1],
                GuideVector, ComponentProbs,
                entropy, ProbOfGuide);
    }

    // cerr << "# about to cleanup after regularizing alignment\n" << flush;
    // cerr << "# this = " << this << "\n" << flush;
    // cerr << "# AminoAcidProbs = " << AminoAcidProbs << "\n" << flush;

    delete [] AminoAcidProbs;
    delete [] Counts;
    // cerr << "# AminoAcidProbs deleted\n" << flush;
    if (ComponentProbs) delete [] ComponentProbs;
    if (GuideVector) delete [] GuideVector;
    delete NullReg;
    delete [] first_match;
    delete [] last_match;
}


OneChain::OneChain(DirichletReg *r, const alignment& align,
        const NeuralNet *nn, char *ID)
{
    // const InterfaceDescription *ifd;

    NumCols = align.width;

    // OneChain's data field relies on the network's InterfaceDescriptions.
    if (!nn) {
        cerr << "ERROR: The neural network structure must be established\n"
             << "before sequences can be added to training sets.  Exiting.\n"
             << flush;
    }
    NN = nn;

    // Set chain's ID
    if (ID!=0) {
        ChainID = new char[strlen(ID)+1];
        strcpy(ChainID, ID);
    } else {
        ChainID=0;
    }

    assert(align.Filename!=NULL);
    Filename = new char[strlen(align.Filename)+1];
    strcpy(Filename, align.Filename);

    set_name(align.names[0]);

    int NumLayers = nn->num_layers();
    int NumInterface = NumLayers+1;

    // Now set the chain's ChainData based on what's needed, as
    // specified by each layer's InterfaceDescription
    data = new ChainData*[NumInterface];
    // AlphabetTuple *at=0;

    // BAD CODE: assumes that the first interface always uses the alignment.
    // This will cause problems when we try to predict the distribution
    // from a single sequence.
    RegularizeAlignment(r,align,0);

    // Why does this start at 1?---because RegularizeAlignment
    // set up data[0].
    for (int l=1; l<NumInterface; l++) {
        data[l] = new ChainData(NumCols);
    }
}

// Constructor for an empty OneChain of num_cols columns with no
// alignment data (added by Jes Frellsen); the input layer is filled
// in by RandomInput below.
OneChain::OneChain(const NeuralNet *nn, int num_cols, char *filename, char *ID)
{
    if (num_cols<=0) {
        if (Globals::SequenceGenerator) {
            vector<Base> random_seq;
            while (num_cols<1) {
                Globals::SequenceGenerator->generate(random_seq);
                num_cols=random_seq.size();
            }
        } else {
            num_cols=50;
        }
    }

    // const InterfaceDescription *ifd;
    NumCols = num_cols;

    // OneChain's data field relies on the network's InterfaceDescriptions.
    if (!nn) {
        cerr << "ERROR: The neural network structure must be established\n"
             << "before sequences can be added to training sets.  Exiting.\n"
             << flush;
    }
    NN = nn;

    // Set chain's ID
    ChainID=NULL;
    if (ID!=0) {
        ChainID = new char[strlen(ID)+1];
        strcpy(ChainID, ID);
    }

    Filename=NULL;
    if (filename) {
        Filename = new char[strlen(filename)+1];
        strcpy(Filename, filename);
    }

    int NumLayers = nn->num_layers();
    int NumInterface = NumLayers+1;

    // Now set the chain's ChainData based on what's needed, as
    // specified by each layer's InterfaceDescription
    data = new ChainData*[NumInterface];
    // AlphabetTuple *at=0;
    for (int l=0; l<NumInterface; l++) {
        data[l] = new ChainData(NumCols);
    }

    DirichletReg *r = NN->interface(0)->ReRegularizer;
    if (!r) {
        const InterfaceDescription *ifd = NN->interface(0);
        MLZReg nullr(ifd->Alpha, 1.e-20, "null_regularizer");
        set_zero_counts(&nullr);
    } else {
        set_zero_counts(r);
    }

    typedef float* floatptr;
    data[0]->ProbVectors = new floatptr[NumCols];
    data[0]->PreviousStep = new floatptr[NumCols];
    for (int col = 0; col < NumCols; col++) {
        data[0]->ProbVectors[col] = new float[NN->interface(0)->num_units()];
        data[0]->PreviousStep[col] = new float[NN->interface(0)->num_units()];
    }

    // We can only use neural nets with single-alphabet inputs
    // for this constructor, since we need to set up a GuideAlphabet,
    // and that is not currently designed to work with AlphabetTuples.
    const AlphabetTuple* nn_alpha = nn->interface(0)->Alpha;
    assert(nn_alpha->num_alphabets()==1);
    data[0]->GuideAlphabet = (*nn_alpha)[0];
    set_guide(0);	// clear the guide sequence for interface 0

    RandomInput(NN->interface(0)->ReRegularizer);
}


OneChain::~OneChain(void)
{
    for (int c=0; c<=NN->num_layers(); c++)
        delete data[c];
    delete [] data;
    delete [] ChainID;
    delete [] Filename;
}

short int *OneChain::osec(int layer) const
{
    assert(data[layer+1]->Structure != NULL);
    return (data[layer+1]->Structure);
}

int OneChain::correct_value(int lay, int pos) const
{
    assert(pos>=0 && pos<NumCols);
    assert(lay>=0 && lay<=NN->num_layers());
    return (data[lay]->Structure[pos]);
}

const float *OneChain::probs_for_layer(int lay, int col) const
{
    // cerr << "Interface ::" << lay << " Col ::" << col << endl << flush;
    // cerr << data[lay]->ProbVectors << endl << flush;
    if (col>=0 && col<NumCols) {
        return data[lay]->ProbVectors[col];
    } else {
        return NN->interface(lay)->UseLogOdds? ZeroCounts: ZeroProbs;
    }
}

// Use the ReRegularizer to set the profile from counts
void OneChain::set_profile_from_counts(int col, int inter)
{
    const InterfaceDescription *ifd = NN->interface(inter);
    assert(data[inter] && data[inter]->Counts && data[inter]->Counts[col]);
    assert(data[inter]->ProbVectors && data[inter]->ProbVectors[col]);

    const BackgroundProbs* NullProbs = Globals::background_probs(ifd->Alpha);
    assert(NullProbs);

    Regularizer *r = ifd->ReRegularizer;
    if (!r) {
        for (int a=0; a<ifd->Alpha->num_normal(); a++) {
            profile(col,inter)[a] = NullProbs->probs()[a];
        }
    } else {
        r->get_probs(data[inter]->Counts[col], profile(col,inter));
    }

    // require that profile numbering and PreviousStep
    // numbering be the same:
    assert(ifd->profile_first_num()==0);

    // Initially, set the PreviousStep to be a constant*ln(prob/background)
    for (int a=0; a<ifd->Alpha->num_normal(); a++) {
        data[inter]->PreviousStep[col][a] =
            0.03* log(profile(col)[a]/NullProbs->probs()[a]);
    }
}

// Added by Frellsen
// Creates a random input layer at the given layer
void OneChain::RandomInput(DirichletReg *r, int layer)
{
    const InterfaceDescription *ifd = NN->interface(layer);

    // Basic assumptions about the net
    assert( ifd
        && !ifd->UseComponentProbs
        && !ifd->NetRegularizer
        && !ifd->UseLogOdds );

    int num_units = ifd->num_units();
    int num_alph = (ifd->Alpha!=NULL) ? ifd->Alpha->num_normal() : num_units;

    assert(ifd->num_units_ok());
    assert(! ifd->UseComponentProbs);
    assert(ifd->Alpha!=NULL);

    // We need to have a single alphabet, since setting the guide sequence
    // uses the Base class.
    assert(ifd->Alpha->num_alphabets()==1);

    if (! data[layer])
        data[layer] = new ChainData(NumCols);

    // Create:
    //  * arrays of pointers that point to the arrays of
    //    the probability vector and the counts
    //  * the arrays to hold the probability vector and the counts
    typedef float* floatptr;
    assert(data[layer]->ProbVectors);
    assert(data[layer]->Counts);

    clear_probvectors();

    // Generate a random guide sequence for the layer
    if (Globals::SequenceGenerator) {
        assert(Globals::SequenceGenerator->alphabet()->same_as(ifd->Alpha));
        vector<Base> seq;
        Globals::SequenceGenerator->generate(seq,NumCols);
        for (int col = 0; col < NumCols; col++) {
            set_guide_base(0,col,seq[col]);	// sets ProbVectors also!
        }
    } else {
        for (int col = 0; col < NumCols; col++) {
            Base b;
            b.set_int(irandom(0,num_alph-1));
            set_guide_base(0,col,b);		// sets ProbVectors also!
        }
    }
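    // Turn the one-hot guide sequence into pseudo-counts (weight 1.0 per
    // column), then re-regularize those counts into the input profile.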
    set_counts_from_guide(1.0);
    for (int col = 0; col < NumCols; col++) {
        set_profile_from_counts(col);
    }
}

// Added by Frellsen
// Sample the Guide Sequence from the Amino Acid probability
// distribution for each column
void OneChain::SampleGuideFromAAProbs(int layer)
{
    const InterfaceDescription *ifd = NN->interface(layer);

    // Basic assumptions about the net
    // The net needs to use the Guide Sequence
    assert( ifd
        && ifd->UseAminoAcidProbs
        && ifd->UseGuide
        && !ifd->UseComponentProbs
        && !ifd->NetRegularizer
        && !ifd->UseLogOdds );

    int num_units = ifd->num_units();
    int num_alph = (ifd->Alpha!=NULL) ? ifd->Alpha->num_normal() : num_units;

    // Do the sampling for each column
    for (int col = 0; col < NumCols; col++) {
        int sampleIndex =
            get_random_given_weights(num_alph, data[layer]->ProbVectors[col]);
        assert(sampleIndex < num_alph);

        Base b;
        b.set_int(sampleIndex);
        set_guide_base(0,col,b);
    }
}

// Added by Frellsen
// Adjusts the input layer according to the gradient of the error.
void OneChain::AdjustToGradientDescent(float learning_rate, double **InputPartials)
{
    const InterfaceDescription *ifd = NN->interface(0);
    assert(ifd->Alpha);
    int num_alph = ifd->Alpha->num_normal();

    for (int col=0; col<NumCols; col++) {
        // step each input opposite its error partial, scaled by the learning rate
        for (int i=0; i<num_alph; i++) {
            prev_step(col)[i] = -learning_rate * InputPartials[col][i];
        }
        AdjustProfileByPreviousStep(col);
    }
}

// Apply prev_step(col) to the input profile for column col by
// exponentiating and renormalizing.
// Shared by AdjustToGradientDescent and AdjustWithRprop.
void OneChain::AdjustProfileByPreviousStep(int col)
{
    // Leave user-frozen columns untouched.
    if (Globals::FreezeTheseColumns.size()>0 && Globals::FreezeTheseColumns[col]) {
        return;
    }

    const InterfaceDescription *ifd = NN->interface(0);
    assert(ifd->Alpha);
    int num_alph = ifd->Alpha->num_normal();

    // adjusted steps, shifted so that exp() won't blow up
    double adjusted_step[num_alph];

    // Now, subtract a constant from previous_step, so that
    // the adjusted_step is always negative,
    // to keep exp(adjusted_step) from blowing up.
    // This will rescale all the exp_adj, which is ok, since they
    // are normalized to produce the final ProbVectors.
    double max_prev_step = prev_step(col)[0];
    for (int i=1; i<num_alph; i++) {
        if (prev_step(col)[i]>max_prev_step) {
            max_prev_step=prev_step(col)[i];
        }
    }
    assert(isfinite(max_prev_step));

    for (int i=0; i<num_alph; i++) {
        adjusted_step[i] = prev_step(col)[i] - max_prev_step;
    }

    // multiply the old profile values by exp(adjusted_step) and renormalize
    double exp_adj[num_alph];
    double sum_exp_adj=0;
    for (int i=0; i<num_alph; i++) {
        double input_i = profile(col)[i];
        assert(input_i>=0);
        if (input_i < 1.e-06) input_i = 1.e-06;
        exp_adj[i] = input_i * exp(adjusted_step[i]);
        assert(isfinite(exp_adj[i]));
        sum_exp_adj += exp_adj[i];
    }
    assert(sum_exp_adj>0);

    // Finally propagate this result to the input layer
    for (int i=0; i<num_alph; i++) {
        profile(col)[i] = exp_adj[i]/sum_exp_adj;
    }
}

// Adjust the input profile with the rprop update rule, using only the
// sign of the error gradient to grow or shrink each step.
void OneChain::AdjustWithRprop(double **InputPartials)
{
    const InterfaceDescription *ifd = NN->interface(0);
    assert(ifd->Alpha);
    int num_alph = ifd->Alpha->num_normal();

    const float MIN_STEP=0.0005;
    const float MAX_STEP=1.0;

    for (int col=0; col<NumCols; col++) {
        for (int i=0; i<num_alph; i++) {
            // desired_dir is the direction of change that reduces the error
            double desired_dir = -InputPartials[col][i];
            float old_step = prev_step(col)[i];
            float new_step = (desired_dir*old_step > 0 )? 1.3 * old_step : -0.5 *old_step;
            // new_step is now in desired direction,
            // bigger than before if same as old step,
            // smaller than before if different from old step
            assert(desired_dir * new_step >= 0);
            if (new_step < -MAX_STEP) new_step = -MAX_STEP;
            if (MAX_STEP < new_step) new_step = MAX_STEP;
            if (-MIN_STEP < new_step && new_step<MIN_STEP) {
                new_step = desired_dir>0? MIN_STEP: -MIN_STEP;
            }
            prev_step(col)[i] = new_step;
        }
        AdjustProfileByPreviousStep(col);
    }
}


// CHANGE LOG:
// 10 May 1998 Melissa Cline
//	Extended support for the variables UseInsert, UseDelete,
//	UseComponentProbs, and UseAminoAcidProbs.  These variables
//	all specify alignment attributes, and the user can use any
//	or all of them.
// 2 May 1998 Melissa Cline
//	When applying the regularizer to an alignment, make use of
//	the knowledge of when a column contains a "real" gap.  When
//	there's a deletion character at the beginning or end of the
//	alignment of some sequence, it's not a real gap; it just means
//	the sequence isn't in the process of aligning to the other stuff.
//	When the sequence has a deletion between aligned residues, that's
//	a real gap - it indicates that something's missing in this sequence.
// 21 March 1998 Kevin Karplus
//	Fixed erroneous deletion of data (delete [] should have been delete)
//	Restructured constructor to separate out the handling of the
//	alignment.
//	Add NN to OneChain
// 5 March 1998 Kevin Karplus
//	Added UseLogProb to translating alignment
// 9 April 1998 Kevin Karplus
//	Eliminated last dependencies on input being amino acids.
// 25 May 1998 Kevin Karplus
//	Fixed inefficient (and incorrect) allocations of AminoAcidProbs
// 5 June 1998 Kevin Karplus
//	Fixed bug which caused ZeroCounts to be allocated too small when
//	num_comp < num_alph
// 15 September 1999 Sugato Basu
//	Added code to handle NetRegularizer to predict mixture component
//	probs
// 15 September 1999 Sugato Basu
//	Corrected typo data[1]->assign_guide(first_alignment_sequence) to
//	data[0]->assign_guide(first_alignment_sequence)
// 21 April 2004 Sol Katzman
//	Added support for UseGuide
//	Updated comments and rearranged code for improved understanding.
// 29 April 2004 Sol Katzman
//	Added assertions for mutual exclusion of UseAminoAcidProbs and NetRegularizer
//	(should be ok, since NetRegularizer is an option for the
//	output interface).
// 19 May 2004 Sol Katzman
//	Added assertions to check that matches() and num_matches() return
//	consistent values. (wildcard-loop in RegularizeAlignment)
// Mon Jun 13 04:13:49 PDT 2005 Kevin Karplus
//	Picked up Jes Frellsen's additions of
//	constructor for empty OneChain
//	RandomInput
//	SampleGuideFromAAProbs
//	AdjustToGradientDescent
// Mon Jun 13 05:36:42 PDT 2005 Kevin Karplus
//	Added isfinite assertions to AdjustToGradientDescent
// Mon Jun 13 05:47:50 PDT 2005 Kevin Karplus
//	Added prescaling of dE in AdjustToGradientDescent
//	to prevent exp from blowing up.
// Mon Jun 13 06:55:42 PDT 2005 Kevin Karplus
//	Added guide_sequence_0 to sampleGuideFromAAProbs and
//	RandomInput
// Mon Jun 13 08:27:38 PDT 2005 Kevin Karplus
//	Removed deletion of data[layer] in RandomInput
// Mon Jun 13 09:49:09 PDT 2005 Kevin Karplus
//	Initialized GuideAlphabet in constructor for empty OneChain
//	(as first alphabet of NeuralNet input)
// Tue Jun 14 10:11:26 PDT 2005 Kevin Karplus
//	Using set_guide to clear guide sequence in constructor for empty OneChain
//	Separated choosing guide sequence from setting up arrays in
//	RandomInput (in preparation for separating out fully setting
//	the guide_sequence and initializing the One-Hot codes)
// Tue Jun 14 15:33:28 PDT 2005 Kevin Karplus
//	Modified SampleGuideFromAAProbs to use set_guide_base
// Wed Jun 15 12:56:32 PDT 2005 Kevin Karplus
//	Allow empty filename for empty OneChain
// Wed Jun 15 13:53:00 PDT 2005 Kevin Karplus
//	Simplified RandomInput to use clear_counts and set_profile_from_counts
// Sat Jul 9 08:07:12 PDT 2005 Kevin Karplus
//	Swapped order of arguments in set_profile_from_counts
// Mon Jul 11 13:44:39 PDT 2005 Kevin Karplus
//	Removed some old commented-out code.
//	Used set_counts_from_guide in RandomInput.
// Thu Jul 21 18:04:00 PDT 2005 Kevin Karplus
//	Added SequenceGenerator to RandomInput.
// Fri Jul 22 10:05:29 PDT 2005 Kevin Karplus
//	Modified no-data constructor to use RandomInput to set the inputs,
//	and changed order of inputs.
// Fri Jul 22 14:48:38 PDT 2005 Kevin Karplus
//	Made counts_for_layer inline
// Fri Aug 12 15:25:23 PDT 2005 Kevin Karplus
//	Used new InterfaceDescription::num_units_ok to reduce uses of
//	the InterfaceDescription::Use Insert/Delete variables
// Fri Aug 12 17:17:36 PDT 2005 Kevin Karplus
//	Used new InterfaceDescription::fill_vector to eliminate uses of
//	Use Insert/Delete
// Fri Aug 12 23:16:43 PDT 2005 Kevin Karplus
//	Changed fill_vector calls to use entropy and ProbOfGuide
// Sun Feb 5 04:55:49 PST 2006 Kevin Karplus
//	Added some assertions in AdjustToGradientDescent
// Mon Mar 13 10:30:19 PST 2006 Kevin Karplus
//	Added AdjustWithRprop
//	Added PreviousStep creation wherever ProbVectors was created.
// Mon Mar 13 11:07:13 PST 2006 Kevin Karplus
//	Created AdjustProfileByPreviousStep to share code between
//	AdjustWithRprop and AdjustToGradientDescent
// Mon Mar 13 13:45:00 PST 2006 Kevin Karplus
//	Added checks to keep probabilities from going to zero in rprop
// Mon Mar 13 15:25:14 PST 2006 Kevin Karplus
//	Added Globals::FreezeTheseColumns to AdjustProfileByPreviousStep
// Wed Mar 15 10:26:55 PST 2006 Kevin Karplus
//	Moved set_profile_from_counts from .h to .cc
// Wed Mar 15 16:16:46 PST 2006 Kevin Karplus
//	Reduced multiplier for initial PreviousStep from log(prob/background) to
//	0.01
// Wed Mar 15 16:56:39 PST 2006 Kevin Karplus
//	Reduced MIN_STEP and MAX_STEP in rprop, reduced growth rate
//	from 1.3 to 1.2
// Thu Mar 16 11:24:08 PST 2006 Kevin Karplus
//	Increased multiplier for initial PreviousStep from log(prob/background) to 0.03
//	Increased growth rate in rprop to 1.3
// Wed Jun 17 16:57:58 PDT 2009 Kevin Karplus
//	Added tmp_ga and tmp_ca to force linking of GaussianAlphabet
//	and CombinedAlphabet