// InterfaceDescription.h
// copyright 30 July 1997 Kevin Karplus

#ifndef InterfaceDescription_H
#define InterfaceDescription_H

#include <iostream>
#include <cassert>
using namespace std;

#include "NamedClass/NamedObject.h"
#include "NamedClass/NamedClass.h"
#include "NamedClass/NameToPtr.h"
#include "SeqWeight.h"
#include "Regularizer/DirichletReg.h"
#include "AlphabetTuple/AlphabetTuple.h"

class InterfaceDescription;     // forward declaration

// To do:
//     make some parts private, so that consistency of Alpha and regularizers
//     and NumUnits and Alpha can be maintained.

class IFInputCommand: public NamedObject
{
    typedef int (*fcn)(istream &in, InterfaceDescription *chg,
            IFInputCommand *self);
    fcn CommandFunction;    // function to execute when the keyword is found,
            // reading from "in" into "chg".
            // "this" is passed down to the function as the 3rd arg,
            // so it can report errors using self->name().
            // Returns 1 if input should continue, 0 on error or end of input.
  public:
    IFInputCommand(const char *nm, fcn c=0)
    {
        set_name(nm);
        CommandFunction = c;
    }
    inline int execute(istream &in, InterfaceDescription *chg)
    {
        return (*CommandFunction)(in, chg, this);
    }
};

// function for IFInputCommand that treats the keyword as a comment and
// skips to the end of the line
int ReadComment(istream &in, InterfaceDescription *chg, IFInputCommand *self);

// UnitName is a simple NamedObject for naming the units of a layer
class UnitName: public NamedObject
{
  public:
    int number;
};

// InterfaceDescription provides information about how the input or output
// of the neural net is to be handled
class InterfaceDescription: public NamedClass, public NamedObject
{
    static IdObject ID;                 // for NamedClass
    static NameToPtr *CommandTable;     // for I/O

    virtual int read_knowing_type(istream &in);
    virtual void write_knowing_type(ostream &out) const;

    int NumUnits;   // number of units
        //  == UseAminoAcidProbs * size(Alphabet)^(1+TupleStop-TupleStart)
        //      + UseInsert + UseDelete
        //      + UseEntropy + UseProbOfGuide
        //      + UseGuide*size(Alphabet)
        //      + size(ComponentProbs)
        // (see compute_num_units below)

    bool UseInsert;     // 1 => neural net input or output includes the
                // probability that there is an insert before the position
    bool UseDelete;     // 1 => neural net input or output includes the
                // probability that there is a deletion in the position
    bool UseEntropy;    // 1 => neural net input includes the
                // entropy of the position
    bool UseProbOfGuide;    // 1 => neural net input includes the probability
                // from the profile of the character of the guide sequence

    bool TrainTo;       // set to 1 if this is an output layer with training data
    bool HideTemporarily;   // set to 1 if this layer should
                // (temporarily) ignore the TrainTo info

    UnitName *Names;        // Names[i] is the name of the ith unit
    NameToPtr UnitNumber;   // hash table to look up names

    const char *generate_unit_name(int i) const;

    inline int compute_num_units(void) const
    {
        assert(Alpha || (!UseAminoAcidProbs && !UseGuide));
        assert(ReRegularizer || !UseComponentProbs);
        assert(!(ReRegularizer && NetRegularizer));
        return (UseAminoAcidProbs? TupleStates: 0)
            + UseInsert + UseDelete
            + UseEntropy + UseProbOfGuide
            + (UseGuide? Alpha->num_normal(): 0)
            + (UseComponentProbs? ReRegularizer->num_components(): 0)
            + (NetRegularizer? NetRegularizer->num_components(): 0);
    }
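    // Worked example (illustrative only; all settings hypothetical):
    // with a 20-letter amino-acid alphabet, TupleStart==TupleStop==0
    // (so TupleStates==20), UseAminoAcidProbs, UseInsert, UseDelete,
    // and UseGuide all set, and a ReRegularizer with 9 components and
    // UseComponentProbs set (hence no NetRegularizer, per the asserts),
    // compute_num_units() returns
    //      20 + 1 + 1 + 0 + 0 + 20 + 9 + 0 == 51.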
  public:
    // On input or output, the data may be amino acid probabilities
    // and probabilities of insertion (before the position).
    // Or they may be from any other alphabet (such as secondary structure).
    // Or they may be simply numbered.

    AlphabetTuple *Alpha;   // 0 if purely numeric
            // Alphabet for each position in sequence

    int TupleStart, TupleStop;  // normally both 0
            // Instead of a single position (i),
            // can use a tuple (i+TupleStart .. i+TupleStop).
            // Currently not compatible with UseComponentProbs.
    int DefaultBaseIndex;
            // If TupleStart or TupleStop is non-zero, then we need to
            // know what label to impute to missing positions at the
            // ends of the sequence.
    int TupleStates;    // Alphabet size ^ (1+TupleStop-TupleStart):
            // the number of states the tuple of Alpha characters can have

    void set_unit_name(int i, const char *nm);
    inline const char *unit_name(int u) const
    {
        if (u<0 || u>=NumUnits) return "illegal_unit";
        if (Names) return Names[u].name();
        return generate_unit_name(u);
    }
    int unit_number(const char *nm);

    int UseLogOdds;     // 1 => use log(P(x|data)/P(x)) instead of P(x).
            // Note: currently only supported for input to the network;
            // will eventually be extended to hidden layers.

    // function for weighting sequences, and its parameters
    const SequenceWeightObject *SequenceWeighter;
    float SequenceWeightBitsToSave;
    float SequenceWeightParam;
    float ClipExponent;     // clip total weights to numseq^ClipExponent
            // (unless ClipExponent<0, in which case turn off clipping)

    Regularizer *WeightingRegularizer;
            // NOTE: NOT OWNED BY InterfaceDescription

    // The best estimate of the true probability is
    // computed from the observed weighted counts using
    // ReRegularizer.  If the pointer is 0, just normalize to sum to 1.
    // NOTE: NOT OWNED BY InterfaceDescription
    DirichletReg *ReRegularizer;
            // On input, inputs can be recoded to use component probabilities
            // of ReRegularizer instead of (or in addition to) amino acid
            // probabilities.
            // ?? Not designed or tested for use with TupleStart or TupleStop
            // non-zero.
            //
            // On output, neural net probabilities are used to set
            // mixture coefficients, and the modified DirichletReg is used
            // with the input counts (not probabilities, but counts)
            // to get probabilities for the outputs.

    // SUGATO: 7/1/99
    DirichletReg *NetRegularizer;
            // The output layer trains to the mixture coefficients of
            // the NetRegularizer.  Cost of encoding is calculated by
            // the cost_from_mix function, which takes as arguments the
            // input counts to the net, the output_counts from the training
            // set for the last layer, and the mixture coefficients output
            // from the net.

    int UseComponentProbs;  // 0 => inputs/outputs are probabilities
                            // 1 => use component probs
    int UseAminoAcidProbs;  // 1 => neural net input or output includes
            // probabilities over an alphabet.  On input,
            // the probabilities are calculated from a
            // multiple alignment over the alphabet.
    int UseGuide;   // 1 => neural net input includes
            // a guide sequence, which for each column
            // is a (nominally) one-hot vector over the
            // 'AminoAcidProbs' alphabet.  For columns with
            // a delete in the guide sequence, the vector is
            // all zeroes.  Inserts in the guide sequence
            // are ignored.
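    // Sketch (hypothetical caller code, not part of this class): building
    // the one-hot guide vector described above, assuming the caller knows
    // the number of normal alphabet characters A and the guide character's
    // index g in that alphabet (g<0 meaning a delete in the guide):
    //
    //      for (int a=0; a<A; a++)
    //          guide_probs[a] = 0.0;
    //      if (g >= 0)
    //          guide_probs[g] = 1.0;   // one-hot; stays all-zero for a delete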
    typedef enum {ALIGNMENT, SEQUENCE, VECTORS, NUMBERS} FormatType;
    FormatType InputFormat;

    InterfaceDescription(void);
    InterfaceDescription(const InterfaceDescription *old);      // copy
    ~InterfaceDescription(void);

    inline void hide(int h=1)   {HideTemporarily = h;}
    inline void unhide(void)    {HideTemporarily = 0;}
    inline int is_hidden(void) const
    {
        return HideTemporarily || !TrainTo;
    }
    inline int is_TrainTo(void) const   {return TrainTo;}
    inline int train_to_unique(void) const
    {
        return TrainTo && (InputFormat==SEQUENCE || InputFormat==NUMBERS);
    }

    inline int num_units(void) const    {return NumUnits;}

    // use assert(num_units_ok()) to check that NumUnits is properly set
    inline bool num_units_ok(void) const
    {
        return NumUnits == compute_num_units();
    }

    // fill the vector vect with the appropriate
    // values (depending on the Use... variables)
    void fill_vector(float *vect,
            const float *aa_probs,
            float insert_prob=0.0,
            float delete_prob=0.0,
            const float *guide_probs=NULL,
            const float *component_probs=NULL,
            float entropy=0.0,
            float prob_of_guide=0.0) const;

    // Functions to get the first and last unit numbers of the guide vector
    inline int guide_first_num(void) const
    {
        assert(UseGuide);
        return UseAminoAcidProbs*TupleStates + UseInsert + UseDelete;
    }
    inline int guide_last_num(void) const
    {
        assert(UseGuide);
        assert(Alpha);
        return guide_first_num() + Alpha->num_normal() - 1;
    }
    inline int profile_first_num(void) const
    {
        assert(UseAminoAcidProbs);
        return 0;
    }
    inline int profile_last_num(void) const
    {
        assert(UseAminoAcidProbs);
        return TupleStates-1;
    }

    // Functions for the input commands
    static NameToPtr *command_table(void)   {return CommandTable;}
    static void init_command_table(void);

    // NamedClass functions
    inline static IdObject *classID(void)   {return &ID;}
    virtual IdObject *type(void) const      {return &ID;}

    // Input commands that need to access private structure
    friend int ReadIntParam(istream &in, int &param,
            InterfaceDescription *chg, IFInputCommand *self);
    friend int ReadFloatParam(istream &in, float &param,
            InterfaceDescription *chg, IFInputCommand *self);

    typedef int IFCommandfcn(istream &in, InterfaceDescription *chg,
            IFInputCommand *self);
    friend IFCommandfcn ReadName;
    friend IFCommandfcn ReadTrainTo;
    friend IFCommandfcn ReadInsertUse;
    friend IFCommandfcn ReadDeleteUse;
    friend IFCommandfcn ReadEntropyUse;
    friend IFCommandfcn ReadGuideProbUse;
    friend IFCommandfcn ReadNumUnits;
    friend IFCommandfcn ReadCompUse;
    friend IFCommandfcn ReadAAUse;
    friend IFCommandfcn ReadGuideUse;
    friend IFCommandfcn ReadInputFormat;
    friend IFCommandfcn ReadAlphabetTuple;

    // Although the InterfaceDescription doesn't own the Regularizers,
    // in some cases all other pointers to them may be lost, so there
    // is an explicit way to delete the regularizers, which should NOT
    // be part of the normal destructor for InterfaceDescription.
    inline void delete_regularizers(void)
    {
        delete WeightingRegularizer;
        if (ReRegularizer != WeightingRegularizer)
            delete ReRegularizer;
        if (NetRegularizer != WeightingRegularizer
                && NetRegularizer != ReRegularizer)
            delete NetRegularizer;
        WeightingRegularizer = ReRegularizer = NetRegularizer = 0;
    }
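    // Sketch (hypothetical caller code; ifd, aa_probs, insert_prob,
    // delete_prob, and guide_probs are caller-supplied): the unit order
    // implied by the accessors above is profile units first, then the
    // insert and delete units, then the guide vector.  For example, with
    // UseAminoAcidProbs, UseInsert, UseDelete, and UseGuide all set:
    //
    //      float *vect = new float[ifd->num_units()];
    //      ifd->fill_vector(vect, aa_probs, insert_prob, delete_prob,
    //              guide_probs);
    //      assert(ifd->profile_first_num() == 0);
    //      assert(ifd->guide_first_num() == ifd->profile_last_num() + 3);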
    // check whether this is compatible with an existing interface
    // (useful for checking whether two neural nets can share a
    // common input).
    // Report a message if not compatible and err_log is not NULL,
    // with the names of the networks provided by old_name and new_name.
    bool is_compatible(const InterfaceDescription *old_ifd,
            const char *old_name="",
            const char *new_name="",
            ostream *err_log=NULL) const;

    // estimate approximate total weight of an input vector for
    // this Interface
    inline double num_ranges(void) const
    {
        if (Alpha==NULL) return 1.0;    // believed to be output of softmax
        double num_r = 0;
        if (UseAminoAcidProbs)  num_r++;
        if (UseGuide)           num_r++;
        if (NetRegularizer)     num_r++;
        if (UseComponentProbs)  num_r++;
        if (UseDelete)          num_r += 0.01;
        if (UseInsert)          num_r += 0.01;
        if (UseEntropy)         num_r += 0.9;
        if (UseProbOfGuide)     num_r += 0.3;
        return num_r;
    }
};

#endif

// CHANGE LOG:
// 25 March 1998 Kevin Karplus
//      Added UseLogOdds.
// 14 April 1998 Kevin Karplus
//      Added copy constructor.
// 10 May 1998 Melissa Cline
//      Modified the comments on the input probabilities, reflecting
//      how component probs and amino acid probs can both be used.
//      Added UseAminoAcidProbs.  Added function ReadAAUse.
// 27 December 1999 Kevin Karplus
//      Added TupleStart, TupleStop, and DefaultBaseIndex.
// 19 April 2004 Sol Katzman
//      Added UseGuide.
// 20 May 2004 Sol Katzman
//      Corrected guide_first_num, and added support for the case of
//      not UseAminoAcidProbs.
// 24 May 2004 Kevin Karplus
//      Inlined simple functions.
// Wed Jun 15 13:32:36 PDT 2005 Kevin Karplus
//      Added profile_first_num and profile_last_num.
// Fri Aug 12 14:46:15 PDT 2005 Kevin Karplus
//      Created is_compatible() from existing code in Globals::add_neural_net.
// Fri Aug 12 15:06:25 PDT 2005 Kevin Karplus
//      Created num_ranges() from code in NeuralLayer::center_weights.
// Fri Aug 12 15:26:28 PDT 2005 Kevin Karplus
//      Created num_units_ok() from code in OneChain.
// Fri Aug 12 17:06:20 PDT 2005 Kevin Karplus
//      Created fill_vector to fill an input vector in the right positions.
// Fri Aug 12 17:18:43 PDT 2005 Kevin Karplus
//      Made UseInsert and UseDelete private.
// Fri Aug 12 17:38:05 PDT 2005 Kevin Karplus
//      Added compute_num_units and used it in num_units_ok.
// Fri Aug 12 20:43:32 PDT 2005 Kevin Karplus
//      Added UseEntropy and UseProbOfGuide.
//      Changed several "Use" parameters to bool.
// Tue Oct 25 13:29:21 PDT 2005 Kevin Karplus
//      Added generate_unit_name and rewrote unit_name to be inline
//      and use generate_unit_name.
// Thu Jul 16 13:40:00 PDT 2009 Kevin Karplus
//      Made command_table and init_command_table static, as they
//      should have been all along.
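// Usage sketch (hypothetical caller, appended for illustration; the
// pointers stand in for interfaces obtained elsewhere): checking whether
// two networks can share a common input layer via is_compatible.
//
//      const InterfaceDescription *old_ifd = /* existing net's input */;
//      const InterfaceDescription *new_ifd = /* new net's input */;
//      if (! new_ifd->is_compatible(old_ifd, "old_net", "new_net", &cerr))
//      {
//          // incompatibility already reported to cerr with both names
//      }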