// OneChain.h // 21 July 1997 // Kevin Karplus #ifndef OneChain_H #define OneChain_H #include #include #include "alignment.h" #include "Regularizer/DirichletReg.h" #include "InterfaceDescription.h" #include "AlphabetTuple/AlphabetTuple.h" #include "NeuralNet.h" // This class was created for generalization of the data that a // OneChain object can have as input and train to. // There should be one of these layers for each Interface in the // Network that can have a file associated with it. // Most members in a ChainData object point to a list // indexed by column number (conceptually in the parent OneChain object): // ProbVectors[col] is the probability vector for the interface // (input or output) for the column. // Counts[col] stores the count values before probvectors are produced // by regularization // Structure[col] is a single correct value for the (output) interface // GuideSequence[col] is a single character of the (input) guide sequence // PreviousStep[col][a] stores the last step size for // ln(ProbVectors[col][a]) when doing RPROP for protein design. class ChainData { private: ChainData(int cols=0) { NumCols=cols; ProbVectors=NULL; Counts = NULL; Structure=NULL; GuideSequence=NULL; GuideAlphabet=NULL; PreviousStep=NULL; }; ~ChainData(void) { if (Structure) delete [] Structure; if (ProbVectors) { for (int k=0; kProbVectors: // 1. Amino Acid probabilities, if UseAminoAcidProbs // 2. Insert probability, if UseInsertProb // 3. Delete probablity, if UseDeleteProb // 4. One-hot GuideVector, if UseGuide // 5. Component probabilities, if UseComponentProbs. float **ProbVectors; // Is currently AA probability vector and/or // a component prob vector, depending on context. // Context is determined by the InterfaceDescription float ** Counts; // Stores the counts corresponding to the ProbVectors // before regularization float ** PreviousStep; // PreviousStep[col][a] stores the // last step size for // ln(ProbVectors[col][a]) when doing // RPROP for protein design. short int *Structure; // BaseTuples indexed for the AlphabetTuple given // in the layer's InterfaceDescription Base *GuideSequence; // If input to network was a sequence or multiple // alignment, the only/first raw sequence is saved // here. const Alphabet *GuideAlphabet; friend class OneChain; }; class OneChain: public NamedObject { private: void RegularizeAlignment(DirichletReg *r, const alignment& align, int layer); // convert the data in the alignment into probabilities and / or // Component probabilities, as specified in the InterfaceDescription // for the input of layer (or output of layer-1). // ZeroCounts and ZeroProbs are packed in the same way as // *ProbVectors, see comment above. At this code writing, // ZeroComponentProbs isn't really needed. static float *ZeroProbs; // regularized ZeroCounts static float *ZeroCounts; // all zeros, big enough // for counts,probs, and/or components. static float *ZeroComponentProbs; const NeuralNet *NN; // need to know the neural net // to be able to interpret the data. int NumCols; double TotalSeqWeight; // total weight of sequences in alignment char* ChainID; char* Filename; ChainData **data; // data for each interface stored here friend class TrainSet; void AdjustProfileByPreviousStep(int col); // Adjust the profile as if it were computed from a softmax unit, // whose inputs were incremented by prev_step(col). // Used in AdjustToGradientDescent() and AdjustWithRprop() public: OneChain(DirichletReg *reg, const alignment& align, const NeuralNet *nn, char *ID=0 ); // Create an instance of OneChain, with a random input chain. // If num_cols <0, then make up a length (based on Globals::SequenceGenerator). OneChain(const NeuralNet *nn, int num_cols=-1, char *filename=0, char *ID = 0); ~OneChain(void); inline const char *get_ChainID(void) const {return ChainID;} inline const char* filename(void) const {return Filename;} inline void set_structure(int inter, short int* str) { assert(data[inter]->Structure == 0); data[inter]->Structure = str; } inline const short int* structure(int inter) const { return data[inter]->Structure; } inline short int structure(int inter, int col) const { assert(data[inter]->Structure !=NULL); return data[inter]->Structure[col]; } inline void clear_probvectors(int inter=0) { assert(data[inter]->ProbVectors != 0); int num_units = NN->interface(inter)->num_units(); for (int c=0; cProbVectors[c][u]=0; } } } inline void copy_probvectors(const OneChain*from, int inter=0) { assert(data[inter]->ProbVectors != 0); assert(from->data[inter]->ProbVectors != 0); int num_units = NN->interface(inter)->num_units(); assert (num_units== from->NN->interface(inter)->num_units()); for (int c=0; cProbVectors[c][u]= from->data[inter]->ProbVectors[c][u]; } } } inline void set_probvectors(int inter, const alignment &align,DirichletReg *r) { assert(data[inter]->ProbVectors == 0); // cerr << "In set_probvectors interface is:" << inter << endl << flush; RegularizeAlignment(r, align, inter); } const Alphabet* guide_alphabet(int inter=0) const { return data[inter]->GuideAlphabet; } inline void set_guide_base(int inter,int col, Base b) { const InterfaceDescription *ifd= NN->interface(inter); ChainData* dat=data[inter]; if (!dat->GuideSequence) { dat->GuideSequence = new Base[NumCols]; } dat->GuideSequence[col] =b; if (ifd->UseGuide) { assert(dat->ProbVectors !=NULL); for (int g=ifd->guide_first_num(); g<=ifd->guide_last_num(); g++) dat->ProbVectors[col][g]=0.0; int index=b.raw_int()+(ifd->guide_first_num()); assert(indexnum_units()); dat->ProbVectors[col][index] = 1.0; } } inline void set_guide(int inter, int col, int base_int) { set_guide_base(inter,col, guide_alphabet(inter)->unindex(base_int)); } inline void set_guide(int inter, const Base *b=NULL) { if (b!=NULL) { for (int c=0; c(NULL)); } } // set profile for a column to values of an externally provided array inline void set_profile(int col, const float* prof, int inter=0) { const InterfaceDescription *ifd= NN->interface(inter); assert(data[inter]->ProbVectors && data[inter]->ProbVectors[col]); for (int a=0; aAlpha->num_normal(); a++) { data[inter]->ProbVectors[col][ifd->profile_first_num()+a]=prof[a]; } } // Use the ReRegularizer to set the profile from counts void set_profile_from_counts(int col, int inter=0); // return a pointer to the input profile for the given column inline const float* profile(int col, int inter=0) const { const InterfaceDescription *ifd= NN->interface(inter); assert(data[inter]); assert(data[inter]->ProbVectors && data[inter]->ProbVectors[col]); return (data[inter]->ProbVectors[col]) + (ifd->profile_first_num()); } // return a pointer to the input profile for the given column inline float* profile(int col, int inter=0) { const InterfaceDescription *ifd= NN->interface(inter); assert(data[inter]); assert(data[inter]->ProbVectors && data[inter]->ProbVectors[col]); return (data[inter]->ProbVectors[col]) + (ifd->profile_first_num()); } // return a pointer to the previous steps for the given column inline const float* prev_step(int col, int inter=0) const { const InterfaceDescription *ifd= NN->interface(inter); assert(data[inter]); assert(data[inter]->PreviousStep && data[inter]->PreviousStep[col]); return (data[inter]->PreviousStep[col]) + (ifd->profile_first_num()); } // return a pointer to the previous steps for the given column inline float* prev_step(int col, int inter=0) { return const_cast( const_cast(this)->prev_step(col,inter)); } inline void set_counts_from_guide(int col,float weight) { clear_counts(0,col); int index = guide_sequence_0(col).raw_int(); // Set the Counts for the column data[0]->Counts[col][index] = weight; } inline void set_counts_from_guide(float weight=1.0) { clear_counts(0); for (int col = 0; col < NumCols; col++) { set_counts_from_guide(col,weight); } } short int *osec(int layer) const; // Returns the correct (observed) output stucture // as an array of indexes into the AlphabetTuple. int correct_value(int lay, int pos) const; // single correct value for input to layer lay // or output of layer lay-1 // (equivalent to osec(layer)[pos], but with range checks). // Get a single character for the guide sequence from interface 0. inline char guide_sequence_ifd0_char(int pos) const { assert(pos>=0 && posto_char(data[0]->GuideSequence[pos]); } inline const Base* guide_sequence_0(void) const { return data[0]->GuideSequence; } // return a reference to the guide sequence of input layer in position pos inline Base& guide_sequence_0(int pos) { assert(pos>=0 && posGuideSequence !=NULL); return data[0]->GuideSequence[pos]; } inline Base guide_sequence_0(int pos) const { assert(pos>=0 && posGuideSequence !=NULL); return data[0]->GuideSequence[pos]; } void print_guide_sequence_0(ostream &out) const { assert (data[0]->GuideSequence !=NULL); for (int col=0; colto_char(data[0]->GuideSequence[col]); } out << "\n"; } inline const NeuralNet* nn(void) const {return NN;} inline int num_cols() const {return NumCols;} inline double total_sequence_weight(void) const {return TotalSeqWeight;} inline void clear_counts(int layer, int col) { assert(data[layer]); assert(data[layer]->Counts); assert(data[layer]->Counts[col]); const InterfaceDescription *ifd = NN->interface(layer); int num_units= ifd->num_units(); for (int u=0; uCounts[col][u] = 0.0; } } inline void clear_counts(int layer) { const InterfaceDescription *ifd = NN->interface(layer); int num_units= ifd->num_units(); assert(data[layer]); if (!data[layer]->Counts) { data[layer]->Counts = new float*[NumCols]; for (int col = 0; col < NumCols; col++) data[layer]->Counts[col] = new float[num_units]; } for(int col=0; colCounts[col][u] = 0.0; } } } void set_zero_counts(Regularizer *r); // set up ZeroCounts and ZeroProbs const float **probs_for_layer(int lay) const { return const_cast(data[lay]->ProbVectors); } const float *probs_for_layer(int lay, int col) const; // probablities for input to layer lay // or output of layer lay-1 // counts for input to layer lay or output of layer lay-1 inline float *counts_for_layer(int lay, int col) { return (col>=0 && colCounts[col] : NN->interface(lay)->UseLogOdds? ZeroCounts:ZeroProbs; } inline const float *counts_for_layer(int lay, int col) const { return (col>=0 && colCounts[col] : NN->interface(lay)->UseLogOdds? ZeroCounts:ZeroProbs; } // Added by Frellsen void RandomInput(DirichletReg *r, int layer=0); // Creates a random inputlayer at layer // Added by Frellsen void SampleGuideFromAAProbs(int layer); // Sample the Guide Sequence from the Amino Acid probability // distribution for each column // Added by Frellsen void AdjustToGradientDescent(float learningrate, double **InputPartials); // Adjusts the input layer according to the gradient of the error. void AdjustWithRprop(double **InputPartials); // Adjusts the profile in the input layer according to the gradient of // the error, using the RPROP algorithm. // Updates PreviousStep. }; // CHANGE LOG: // 23 July 1997 Kevin Karplus // Added insert before position to end of Probs vectors. // 21 March 1998 Kevin Karplus // Fixed description of data to refer to Interfaces, not layers. // 27 March 1998 Kevin Karplus // Simplified constructor considerably, moving out lots of // junk to ReadCommands.cc // 10 May 1998 Melissa Cline // Revised the comments on ProbVectors, reflecting how we can // now use amino acid and / or component probs. // 15 September 1999 Sugato Basu // Added code to store the count values before probvectors are // produced by regularization - useful when NetRegularizers are // used to predict mixture component probs // 15 September 1999 Sugato Basu // Added code to handle output format of ALIGNMENT // 21 April 2004 Sol Katzman // Added UseGuide support. Rename data member guide_sequence to GuideSequence. // Changed GuideSequence to Base* from char* in ChainData, // and added GuideAlphabet to qualify it. // Renamed guide_sequence_char to guide_sequence_ifd0_char, // since it returns a char from the Interface 0 guide sequence // 24 May 2004 Kevin Karplus // inlined simple functions // Mon Jun 13 04:20:23 PDT 2005 Kevin Karplus // Picked up Jes Frellsen's additions // constructor for empty OneChain // RandomInput // SampleGuideFromAAProbs // AdjustToGradientDescent // Mon Jun 13 06:37:49 PDT 2005 Kevin Karplus // Made GuideSequence not be const, so that it could be modified // in designs. // Added guide_sequence_0(pos) member. // Mon Jun 13 09:38:26 PDT 2005 Kevin Karplus // Added print_guide_sequence_0 // Tue Jun 14 09:09:08 PDT 2005 Kevin Karplus // Eliminated assign_guide() // Tue Jun 14 10:11:08 PDT 2005 Kevin Karplus // Added set_guide // Tue Jun 14 15:31:25 PDT 2005 Kevin Karplus // Added set_guide_base and rewrote set_guide to use it //Wed Jun 15 12:55:57 PDT 2005 Kevin Karplus // Default to null filename for empty OneChain. // Wed Jun 15 13:14:48 PDT 2005 Kevin Karplus // Added clear_counts // Wed Jun 15 13:55:46 PDT 2005 Kevin Karplus // Added set_profile_from_counts // Wed Jun 15 14:41:18 PDT 2005 Kevin Karplus // Added set_zero_counts // Fri Jun 17 12:41:03 PDT 2005 Kevin Karplus // Added nn() member // Fri Jun 17 15:41:14 PDT 2005 Kevin Karplus // Added structure() member functions. // 24 June 2005 Kevin Karplus // Added profile() member function. // Sat Jul 9 07:21:13 PDT 2005 Kevin Karplus // Added guide_sequence_0(void) // Sat Jul 9 08:07:12 PDT 2005 Kevin Karplus // Swapped order of arguments in set_profile_from_counts // Added set_profile // Sat Jul 9 15:56:06 PDT 2005 Kevin Karplus // Added set_guide(inter,col, int) // Sat Jul 9 16:08:22 PDT 2005 Kevin Karplus // Added guide_alphabet() // Mon Jul 11 13:42:55 PDT 2005 Kevin Karplus // removed irrelevant assertion from set_profile // Mon Jul 11 13:47:54 PDT 2005 Kevin Karplus // added set_guide from short int array. // added set_counts_from_guide // added clear_prob_vectors // Thu Jul 14 21:33:28 PDT 2005 Kevin Karplus // added column-specific set_counts_from_guide and clear_counts // Sun Jul 17 09:44:02 PDT 2005 Kevin Karplus // added copy_probvectors // Fri Jul 22 10:05:29 PDT 2005 Kevin Karplus // Modified no-data constructor to use RandomInput to set the inputs, // and changed order of inputs. // Fri Jul 22 14:07:46 PDT 2005 Kevin Karplus // Made probs_for_layer return const float *, and have both const // varieties of counts_for_layer // Fri Jul 22 14:48:38 PDT 2005 Kevin Karplus // Made counts_for_layer inline // Sun Feb 5 03:02:54 PST 2006 Kevin Karplus // Added non-const version of profile() for forcing change in design // Mon Mar 13 10:30:19 PST 2006 Kevin Karplus // Added AdjustWithRprop // Added PreviousStep and prev_step(). // Initialize PreviousStep in set_profile_from_counts. // Mon Mar 13 16:32:28 PST 2006 Kevin Karplus // Reduced initial value of PreviousStep to 0.3*log(prob) // Wed Mar 15 10:17:33 PST 2006 Kevin Karplus // Removed MLZReg, using BackgroundProbs instead in // set_profile_from_counts. // Changed initital value of PreviousStep to 0.2*log(prob/background) // Wed Mar 15 10:26:55 PST 2006 Kevin Karplus // Moved set_profile_from_counts from .h to .cc // Mon Dec 17 12:44:54 PST 2007 Mark Diekhans // Removed extra qualification on OneChain and AdjustToGradientDescent, // which is an error in g++ v4. #endif