// NeuralLayer.h
// 23 July 1997 Kevin Karplus

#ifndef NeuralLayer_H
#define NeuralLayer_H

#include <iostream.h>
#include <assert.h>
#include "NamedClass/NamedObject.h"
#include "NamedClass/NamedClass.h"

// forward declarations
class NeuralLayer;
class TrainSet;
class InterfaceDescription;

// class for keywords that can be used in NeuralLayer input
class NLInputCommand: public NamedObject
{
    typedef int (*fcn)(istream &in, NeuralLayer *change, NLInputCommand *self);
    fcn CommandFunction;
        // function to execute when keyword found.
        // Reading from "in" into "change".
        // Pass this down to function as 3rd arg,
        // so it can report error using self->name().
        // Return 1 if input should continue, 0 if error or end of input.
  public:
    NLInputCommand(const char *nm, fcn c=0)
    {
        set_name(nm);
        CommandFunction = c;
    }
    inline int execute(istream &in, NeuralLayer *change)
    {
        return (*CommandFunction)(in, change, this);
    }
};

// NeuralLayer represents one layer of a neural network.
// Only the permanent structure is recorded here, not the activation
// for particular inputs.
//
// Each layer has a "2-dimensional" input---a fixed-size window
// of fixed-length input vectors (the window is a subset of the full
// sequence of input vectors that is derived from a training or
// prediction chain).
// A typical input vector is a probability distribution over an alphabet,
// but may include other unrelated elements.
// All input values are expected to be scaled to the range [0,1].
// The window size and the length of the input vectors may be
// different for each layer.
//
// The outputs are treated as a probability vector---that is, they
// will be scaled to sum to one (using a new variant of soft-max,
// developed by Kevin Karplus).
// All inputs from the window are connected to all output units.
// Each output unit also has a bias input, which is used just to
// keep the unit in the high-gain region.
// There is an additional pseudocount for each unit (used just before
// the normalization to sum to one) that allows the network to get
// background probabilities correct.
//
// For output o, window position w, and input element i:
//      X[o]   = bias[o] + gain[o] * ( sum_{i,w} weights[o,i,w] * in[i,w] )
//      out[o] = (exp(X[o]) + pseudo[o]) / sum (exp(X) + pseudo)
//
// Note: the Gain parameter is redundant with the weights, and
// normalization of the network can set the gain to 1 by rescaling
// the weights.

class ActivationRecord;     // forward reference
class NeuralNet;            // forward reference

class NeuralLayer: public NamedClass, public NamedObject
{
  private:
    // For NamedClass
    static IdObject ID;
    virtual int read_knowing_type(istream &in);
    virtual void write_knowing_type(ostream &out) const;

    static NameToPtr *CommandTable;     // for reading input

    NeuralNet *Owner;
    NeuralLayer *NextTrainableLayer;
    int MyLayerNumber;

    int IsFrozen;       // set to 1 if you don't want to update
                        // during training
    int UseMultUpdate;  // set to 1 if you want to use multiplicative
                        // updates instead of additive ones.
                        // (requires all positive weights)

    int NumInputs;      // number of inputs in each position of sequence
    int WindowSize;     // how big is window
                        // True number of inputs is NumInputs*WindowSize
    int NumWeights;     // NumInputs*WindowSize
    int NumOutputs;     // how many outputs
    int Overhang;       // How far does layer extend before beginning
                        // and after end of sequence?
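    // The Weights array declared below is stored flat, with the input
    // element index varying fastest, then the window position, then the
    // output unit:
    //      Weights[in + NumInputs*(wind + WindowSize*out)]
    // (the same indexing used by the weight() and set_weight() accessors).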
    float *Weights;     // array of NumInputs*WindowSize*NumOutputs weights
    float *Bias;        // array of NumOutputs bias inputs
    float *Gain;        // array of NumOutputs gain parameters
    float *Pseudo;      // array of NumOutputs pseudocounts

    void Alloc(int in, int wind, int out);
    void Alloc_Weights(void);
    void Alloc_Pseudo(void);
    void Alloc_Gain(void);
    void Alloc_Bias(void);
    void Dealloc(void);

    // procedures for input commands.
    virtual NameToPtr *command_table(void) {return CommandTable;}
    virtual void init_command_table(void);

    inline float& weight(int in, int wind, int out)
    {
        return Weights[in+NumInputs*(wind + WindowSize*out)];
    }

    // input commands need access to private structure
    friend int ReadSizeParam(istream &in, int &param, NeuralLayer *change,
                NLInputCommand *self);
    typedef int NLcommandfcn(istream &in, NeuralLayer *change,
                NLInputCommand *self);
    friend NLcommandfcn ReadNumInputs;
    friend NLcommandfcn ReadWindowSize;
    friend NLcommandfcn ReadNumOutputs;
    friend NLcommandfcn ReadOverhang;
    friend NLcommandfcn ReadUseMultUpdate;
    friend NLcommandfcn ReadWeights;
    friend NLcommandfcn ReadBias;
    friend NLcommandfcn ReadGain;
    friend NLcommandfcn ReadPseudo;

  public:
    // Learning rates for the different types of weights.
    double WeightRate, BiasRate, GainRate, PseudoRate;

    double DesiredSq;   // target for Sums[o]*Sums[o] to
                        // keep range meaningful. (default=1)

    // multipliers for gain, bias, and pseudocount on each update
    double GainExtinction, BiasExtinction, PseudoExtinction;

    NeuralLayer(int in=0, int wind=0, int out=0)
    {
        IsFrozen=0;
        UseMultUpdate=0;
        Owner=0;
        MyLayerNumber=0;
        Weights=0;  Bias=0;  Pseudo=0;  Gain=0;
        Alloc(in,wind,out);
    }
    NeuralLayer(const NeuralLayer *old);        // copy constructor
    ~NeuralLayer(void)
    {
        Dealloc();
    }

    void copy_weights(const NeuralLayer *src);

    void initialize_learning_rates(int num_sequences, int distance_to_training);
        // Set the learning rates to appropriate defaults based on the
        // size of the training set and how far to the next trainable layer.
        // Network should be fully constructed before this initialization.

    inline void set_layer_number(int lay) {MyLayerNumber=lay;}
    inline int layer_number(void) const {return MyLayerNumber;}

    // NamedClass functions
    inline static IdObject* classID(void) {return &ID;}
    virtual IdObject* type(void) const {return &ID;}

    inline const NeuralNet *owner(void) const {return Owner;}
    inline void set_owner(NeuralNet *net) {Owner=net;}

    const InterfaceDescription *input_interface(void) const;
    const InterfaceDescription *output_interface(void) const;

    inline int num_in(void) const {return NumInputs;}
    inline int num_wind(void) const {return WindowSize;}
    inline int num_out(void) const {return NumOutputs;}

    inline int degrees_freedom(void) const
    {
        // degrees of freedom does not count
        //    Gain (can be subsumed into weights),
        //    Bias (can be subsumed into weights, given that a known
        //        subset of inputs sums to 1---true except for Dummy units),
        //    Pseudocounts, if they are all 0,
        //    One of the outputs (because of softmax rescaling).
        int numdeg=(NumOutputs-1)*WindowSize*NumInputs;
        for (int o=NumOutputs-1; o>=0; o--)
        {
            if (Pseudo[o]!=0) return numdeg+NumOutputs;
        }
        return numdeg;
    }

    inline int overhang(void) const {return Overhang;}
    inline void set_overhang(int o)
    {
        assert(o>=0);
        Overhang=o;
    }

    inline void freeze(int f=1) {IsFrozen=f;}
    inline void unfreeze(void) {IsFrozen=0;}
    inline int is_frozen(void) const {return IsFrozen;}

    inline void set_weight(int in, int wind, int out, float f)
    {
        Weights[in+NumInputs*(wind + WindowSize*out)] = f;
    }
    inline float weight(int in, int wind, int out) const
    {
        return Weights[in+NumInputs*(wind + WindowSize*out)];
    }

    inline float pseudo(int out) const {return Pseudo[out];}
    inline float bias(int out) const {return Bias[out];}
    inline float gain(int out) const {return Gain[out];}

    inline void set_bias(int out, float b) {Bias[out] = b;}
    inline void set_gain(int out, float b) {Gain[out] = b;}
    inline void set_pseudo(int out, float b) {Pseudo[out] = b;}

    // Initialization routines
    void initialize_weights(ostream &logfile);
    void initialize_pseudo(ostream &logfile, const TrainSet *training);
    void initialize_bias(ostream &logfile);
    void initialize_gain(ostream &logfile);

    inline void set_DesiredSq(int square)
    {
        DesiredSq = square;
    }

    // Set gains to 1,
    // adjusting weights to preserve meaning.
    void normalize(void);

    // For each set of inputs known to sum to one,
    // adjust weights for those inputs to sum to zero
    // (adjusting bias as needed to preserve meaning).
    void center_weights_input_1(void);

    // Center weights for each input position independently.
    // Meaning is NOT quite preserved, unless pseudocounts are 0.
    // Biases are adjusted to keep meaning roughly the same---do
    // center_biases after center_weights.
    void center_weights(void);

    // Adjust biases so min(bias) = -max(bias), adjusting
    // pseudocounts to preserve meaning.
    void center_biases(void);

    // Do updates based on an ActivationRecord that has already
    // had its share computed.
    // Dependencies:
    //    compute share and propagate to inputs before any updates
    //    update weights before gain or pseudo
    //    update gain before bias
    void update_weights(const ActivationRecord *act);
    void update_pseudo(const ActivationRecord *act);
    void update_gain(const ActivationRecord *act);
    void update_bias(const ActivationRecord *act);

    // Functions for changing the learning rates on the above parameters.
    void change_WeightRate(double old_TEC, double new_TEC);
    void change_BiasRate(double old_TEC, double new_TEC,
                double old_SS2, double new_SS2);
    void change_GainRate(double old_TEC, double new_TEC,
                double old_SS2MDes2, double new_SS2MDes2);
    void change_PseudoRate(double old_TEC, double new_TEC);

    // Functions to return the rate values
    inline double weight_rate(void) const {return WeightRate;}
    inline double bias_rate(void) const {return BiasRate;}
    inline double gain_rate(void) const {return GainRate;}
    inline double pseudo_rate(void) const {return PseudoRate;}
};

#endif

// CHANGE LOG:
// 23 March 1998 Kevin Karplus
//      Added Owner
// 26 March 1998 Kevin Karplus
//      Changed normalization to remove bias, as well as gain,
//      based on assumption that inputs of each windowframe sum to 1.
//      (Commented changes out again, as they didn't seem to help.)
// 30 March 1998 Kevin Karplus
//      Added IsFrozen to stop learning in individual layers.
// 14 April 1998 Kevin Karplus
//      Added copy constructor
// 13 May 1998 Kevin Karplus
//      Added input_interface and output_interface.
// 18 May 1998 Kevin Karplus
//      Added center_weights()
// 23 November 1999 Kevin Karplus
//      Changed num_param() so it doesn't count gain or bias.
// 7 January 2000 Kevin Karplus
//      Added center_biases
// 26 Nov 2001 Kevin Karplus
//      Changed num_params to degrees_freedom (and changed meaning)
// 21 Apr 2004 Sol Katzman
//      Improved comments.
// 24 May 2004 Sol Katzman
//      Inlined numerous simple functions.
// 25 May 2004 Kevin Karplus
//      Inlined more of the simple functions.
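
// ----------------------------------------------------------------------
// Illustrative sketch (not used by the library): one way to compute the
// layer output described in the comment block at the top of this file,
// using plain arrays.  The helper below and its argument names are
// hypothetical; nIn, nWind, and nOut stand in for NumInputs, WindowSize,
// and NumOutputs, the weights are indexed the same way as the weight()
// accessor, and the windowed input "in" is assumed to be laid out as
// in[i + nIn*w].  The per-input activation itself is kept outside
// NeuralLayer (see the ActivationRecord forward reference above).

#include <math.h>

static void example_layer_output(int nIn, int nWind, int nOut,
                                 const float *weights, const float *bias,
                                 const float *gain, const float *pseudo,
                                 const float *in, float *out)
{
    double total = 0.0;
    for (int o = 0; o < nOut; o++)
    {
        // X[o] = bias[o] + gain[o] * sum_{i,w} weights[o,i,w] * in[i,w]
        double dot = 0.0;
        for (int w = 0; w < nWind; w++)
        {
            for (int i = 0; i < nIn; i++)
                dot += weights[i + nIn*(w + nWind*o)] * in[i + nIn*w];
        }
        double x = bias[o] + gain[o] * dot;

        // unnormalized output: exp(X[o]) + pseudo[o]
        out[o] = exp(x) + pseudo[o];
        total += out[o];
    }
    // scale so the outputs sum to one (the soft-max variant with
    // pseudocounts described at the top of this file)
    for (int o = 0; o < nOut; o++)
        out[o] /= total;
}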