// NeuralLayer.cc
// copyright Kevin Karplus
// 28 July 1997

// NOTE: the system-header names were lost from this copy (angle brackets
// stripped); the includes below are reconstructed from what the code uses.
#include <ieeefp.h>	// needed for isfinite() check on old compiler
#define __USE_ISOC99 1	// for isfinite() in math.h
#include <assert.h>
#include <math.h>	// for exp
#include <stdlib.h>	// for exit, atof

#include "Input/Input.h"
#include "EqualStrings/EqualStrings.h"
#include "Utilities/IOSmacros.h"
#include "Utilities/Random.h"
#include "NeuralLayer.h"
#include "ActivationRecord.h"
#include "TrainSet.h"
#include "LearningParam.h"
#include "InterfaceDescription.h"

// information for the NamedClass NeuralLayer
static NamedClass *create_neural_layer(void) {return new NeuralLayer;}

IdObject NeuralLayer::ID("NeuralLayer", create_neural_layer, 0,
    "NeuralLayer is a single-layer neural network whose outputs are\n\
normalized to sum to one (a probability vector). Related classes are\n\
ActivationRecord (for recording one activation of a NeuralLayer) and\n\
NeuralNetwork (for multi-layer networks).\n");

NameToPtr* NeuralLayer::CommandTable = 0;

NeuralLayer::NeuralLayer(const NeuralLayer *old)
{
    IsFrozen = old->IsFrozen;
    UseMultUpdate = old->UseMultUpdate;
    Owner = old->Owner;
    MyLayerNumber = old->MyLayerNumber;
    Overhang = old->Overhang;
    Weights=0;  Bias=0;  Pseudo=0;  Gain=0;
    Alloc(old->NumInputs, old->WindowSize, old->NumOutputs);

    WeightRate = old->WeightRate;
    BiasRate = old->BiasRate;
    GainRate = old->GainRate;
    PseudoRate = old->PseudoRate;
    DesiredSq = old->DesiredSq;
    GainExtinction = old->GainExtinction;
    BiasExtinction = old->BiasExtinction;
    PseudoExtinction = old->PseudoExtinction;

    copy_weights(old);
    for (int o=0; o<NumOutputs; o++)	// loop bound reconstructed
    {   Bias[o] = old->Bias[o];
        Pseudo[o] = old->Pseudo[o];
        Gain[o] = old->Gain[o];
    }
}

void NeuralLayer::copy_weights(const NeuralLayer *old)
{
    assert(NumInputs == old->NumInputs);
    assert(WindowSize == old->WindowSize);
    assert(NumOutputs == old->NumOutputs);
    int total = NumInputs*WindowSize*NumOutputs;
    for (int i=total-1; i>=0; i--)
        Weights[i] = old->Weights[i];
}

void NeuralLayer::Alloc(int in, int wind, int out)
{
    NumInputs = in;
    WindowSize = wind;
    NumOutputs = out;
    NumWeights = in*wind;
    Overhang = 0;

    // These values may not be known at the time of the layer's creation
    if (NumInputs>0 && WindowSize>0 && NumOutputs>0)
    {   Alloc_Weights();
        Alloc_Pseudo();
        Alloc_Gain();
        Alloc_Bias();
    }
    else
    {   Weights=NULL;
        Bias=Gain=Pseudo=NULL;
    }

    // rather arbitrary initial learning rates,
    // not taken from LearningParam, since Owner may
    // not be known.  Use initialize_learning_rates once
    // Owner is known to set parameters properly.
    WeightRate = 0.01;
    PseudoRate = 0.001;
    // bias and gain are less important
    BiasRate = 0.0001;
    GainRate = 0.00001;

    DesiredSq = 1.0;
    GainExtinction=BiasExtinction=PseudoExtinction=1.0;
}

void NeuralLayer::Alloc_Weights()
{
    if (NumInputs<=0 || WindowSize<=0 || NumOutputs<=0)
    {   cerr << "#ERROR: NumInputs, WindowSize, and NumOutputs must all ";
        cerr << " be correctly specified before setting weights.\n";
        exit(-1);
    }
    NumWeights = NumInputs*WindowSize;
    int num_allocate = NumWeights*NumOutputs;
    if (Weights) {delete [] Weights;}
    Weights = new float[num_allocate];
    for (int j=0; j<num_allocate; j++)	// loop completion reconstructed
        Weights[j] = 0;
}

// [The original text between the loop above and the loop below was lost:
//  the tail of Alloc_Weights (reconstructed above as zero-initialization)
//  and the header of the next function.  The surviving code folds the
//  per-output Gain into the Weights, leaving Gain == 1; the function name
//  and signature are reconstructed guesses.]
void NeuralLayer::fold_gain_into_weights(void)
{
    for (int o=NumOutputs-1; o>=0; o--)
    {   if (Gain[o] == 1) continue;
        for (int w=WindowSize-1; w>=0; w--)
        {   for (int i=NumInputs-1; i>=0; i--)
                weight(i,w,o) *= Gain[o];
        }
        Gain[o] = 1;
    }
}

// for each set of inputs known to sum to one,
// adjust weights for those inputs to sum to zero,
// (adjusting bias as needed to preserve meaning).
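//
// A sketch of why this centering is safe (an editorial note, assuming, as
// the update code below suggests, that each raw output sum has the form
//     S_o = Bias[o] + Gain[o] * sum_{w,i} weight(i,w,o) * in_w[i] ):
// if the inputs in a range satisfy sum_i in_w[i] == 1, then subtracting a
// constant a from weight(i,w,o) over that range changes S_o by exactly
// -Gain[o]*a for every input vector, so adding Gain[o]*a back into Bias[o]
// leaves each S_o, and hence the output probabilities, unchanged.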
void NeuralLayer::center_weights_input_1(void)
{
    const int MAX_ranges=3;	// maximum number of ranges of inputs
				// that are known to sum to 1
    int start[MAX_ranges], stop[MAX_ranges];
	// for each distinct set, s, of inputs to be centered
	// the range is [start[s], stop[s]) i.e. open on the right

    const InterfaceDescription *ifd_in = input_interface();

    // set up ranges:
    int num_ranges=0;	// actual number of ranges.
    if (ifd_in->Alpha == NULL)
    {   start[num_ranges] = 0;
        stop[num_ranges] = NumInputs;
        num_ranges++;
    }
    else
    {   if (ifd_in->UseAminoAcidProbs)
        {   start[num_ranges] = 0;
            stop[num_ranges] = ifd_in->TupleStates;
            num_ranges++;
        }
        if (ifd_in->UseGuide)
        {   start[num_ranges] = ifd_in->guide_first_num();
            stop[num_ranges] = ifd_in->guide_last_num() +1;
            num_ranges++;
        }
        if (ifd_in->NetRegularizer)
        {   start[num_ranges] = NumInputs - ifd_in->NetRegularizer->num_components();
            stop[num_ranges] = NumInputs;
            num_ranges++;
        }
        if (ifd_in->UseComponentProbs)
        {   start[num_ranges] = NumInputs - ifd_in->ReRegularizer->num_components();
            stop[num_ranges] = NumInputs;
            num_ranges++;
        }
        assert (num_ranges <= MAX_ranges);
    }

    for (int o=NumOutputs-1; o>=0; o--)
    {   for (int w=WindowSize-1; w>=0; w--)
        {   for (int r=0; r<num_ranges; r++)
            {   // [parts of this loop body were lost; the sum/average
                //  computation and the bias adjustment are reconstructed]
                double sum=0;
                for (int i=start[r]; i<stop[r]; i++)
                    sum += weight(i,w,o);
                double average = sum/(stop[r]-start[r]);
                assert(stop[r] > start[r]);
                for (int i=start[r]; i<stop[r]; i++)
                    weight(i,w,o) -= average;
                Bias[o] += Gain[o]*average;	// inputs in the range sum to 1
            }
        }
    }
}

// [the original comment and header for this function were lost; from the
//  surviving body it centers the weights across outputs for each input
//  position, then adjusts the biases to keep the overall scaling roughly
//  the same.  The name is a reconstructed guess.]
void NeuralLayer::center_weights(void)
{
    double total_correction=0;
    for (int w=WindowSize-1; w>=0; w--)
    {   for (int i=NumInputs-1; i>=0; i--)
        {   // center weights for position w,i
            double sum=0;
            for (int o=NumOutputs-1; o>=0; o--)
            {   sum += weight(i,w,o);
            }
            double average = sum/NumOutputs;
            for (int o=NumOutputs-1; o>=0; o--)
            {   weight(i,w,o) -= average;
            }
            total_correction += average;
        }
    }
    // Now make a crude estimate of how much the output units have
    // been scaled by.
    double num_ranges = input_interface()->num_ranges();
    double average_correction = total_correction*num_ranges/(NumInputs*WindowSize);
    // adjust biases to keep overall scaling roughly the same.
    for (int o=NumOutputs-1; o>=0; o--)
    {   Bias[o] += average_correction;
    }
}

// Center the biases around zero, modifying pseudocounts to preserve
// meaning.
// Note: centering is done so min(bias) = - max(bias),
//	not so that sum is 0.
void NeuralLayer::center_biases(void)
{
    double max_bias=Bias[0];
    double min_bias=Bias[0];
    for (int o=1; o<NumOutputs; o++)
    {   if (Bias[o]>max_bias) max_bias=Bias[o];
        if (Bias[o]<min_bias) min_bias=Bias[o];
    }
    // [the rest of the body was lost; reconstructed to match the comment
    //  above: shift every bias by the midpoint so that min = -max, and
    //  rescale the pseudocounts to compensate for the uniform shift
    //  (assuming outputs are proportional to exp(sum)+pseudo)]
    double center = (max_bias+min_bias)/2;
    for (int o=NumOutputs-1; o>=0; o--)
    {   Bias[o] -= center;
        Pseudo[o] *= exp(-center);
    }
}
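
// The next function sets the per-parameter learning rates from the owner's
// LearningParam.  A purely illustrative example (hypothetical values): with
// LayBaseTimesSeq = 0.1, num_sequences = 50, and distance_to_training = 2,
// BaseRate = (2+1)*0.1/50 = 0.006 (then capped at LayMaxBaseRate), and each
// of WeightRate, PseudoRate, BiasRate, and GainRate is BaseRate times its
// own Lay*OverBase (or Lay*OverBaseForMult, when UseMultUpdate is set)
// multiplier.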
// [the signature of this function was lost with the preceding span;
//  parameter names and types below are reconstructed from their use]
void NeuralLayer::initialize_learning_rates(int distance_to_training,
		int num_sequences)
{
    const LearningParam *lp = Owner->learning_params();
    float BaseRate = (distance_to_training+1) * lp->LayBaseTimesSeq / num_sequences;
    if (BaseRate > lp->LayMaxBaseRate)	BaseRate = lp->LayMaxBaseRate;
    WeightRate = BaseRate*(UseMultUpdate? lp->LayWeightOverBaseForMult
		:lp->LayWeightOverBase);
    PseudoRate = BaseRate*(UseMultUpdate? lp->LayPseudoOverBaseForMult
		:lp->LayPseudoOverBase);
    BiasRate = BaseRate*(UseMultUpdate? lp->LayBiasOverBaseForMult
		:lp->LayBiasOverBase);
    GainRate = BaseRate*(UseMultUpdate? lp->LayGainOverBaseForMult
		:lp->LayGainOverBase);
    GainExtinction=BiasExtinction=PseudoExtinction=1.0;
    DesiredSq = NumWeights * lp->LayDesiredSqOverNumWeights;
}

// first some generic commands for input, borrowed from
// the Regularizer classes

static int ReadName(istream &in, NeuralLayer *change, NLInputCommand* self)
{
    char word[500];
    get_word(in, word);
    change->set_name(word);
    return 1;
}

static int ReadComment(istream &in, NeuralLayer *change, NLInputCommand* self)
{
    SkipSeparators(in, 1, '\n');
    return 1;
}

static int VerifyClassName(istream &in, NeuralLayer *change, NLInputCommand* self)
{
    char word[100];
    get_word(in, word);
    const IdObject *end_id = IdObject::id(word);
    if (end_id != change->type())
    {   cerr << "Warning: " << self->name() << word
	     << " doesn't match " << change->type()->name()
	     << "\n" << flush;
    }
    // continue if "ClassName", stop if "EndClassName"
    return EqualStrings(self->name(), "ClassName", 1);
}

// Now some keywords specific to NeuralLayer

// input format
//	initial keywords:
//		NumInputs = #
//		WindowSize = #	(defaults to 1)
//		NumOutputs = #

static int ReadIntParam(istream &in, int &param, NeuralLayer *change,
	NLInputCommand* self)
{
    in >> param;
    return 1;
}

int ReadSizeParam(istream &in, int &param, NeuralLayer *change,
	NLInputCommand* self)
{
    if (change->Weights)
    {   cerr << "Error: can't change " << self->name()
	     << " after allocation has been done---\n"
	     << "specify all size parameters first\n";
	return 0;
    }
    int tmp;
    in >> tmp;
    if (tmp <0)
    {   cerr << "Error: must have " << self->name() << " >0\n";
	return 0;
    }
    param=tmp;
    return 1;
}

int ReadNumInputs(istream &in, NeuralLayer *change, NLInputCommand* self)
{
    return ReadSizeParam(in, change->NumInputs, change, self);
}

int ReadWindowSize(istream &in, NeuralLayer *change, NLInputCommand* self)
{
    return ReadSizeParam(in, change->WindowSize, change, self);
}

int ReadNumOutputs(istream &in, NeuralLayer *change, NLInputCommand* self)
{
    return ReadSizeParam(in, change->NumOutputs, change, self);
}

int ReadOverhang(istream &in, NeuralLayer *change, NLInputCommand* self)
{
    return ReadIntParam(in, change->Overhang, change, self);
}

int ReadUseMultUpdate(istream &in, NeuralLayer *change, NLInputCommand* self)
{
    return ReadIntParam(in, change->UseMultUpdate, change, self);
}
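
// Note on the reader protocol above: each NLInputCommand handler returns 1
// to keep reading keywords and 0 to stop; read_knowing_type (below) treats
// a 0 return as the end of this layer's description, which is how
// "EndClassName" (and an error in a size parameter) terminates the read.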
// input commands once allocation is fixed.

static int ReadOutParam(istream &in, float *param, NeuralLayer *change,
	NLInputCommand* self)
{
    assert (param!=NULL);
    for (int i=0; i<change->num_out(); i++)	// loop bound reconstructed
	in >> param[i];
    return 1;
}

int ReadBias(istream &in, NeuralLayer *chg, NLInputCommand* self)
{
    chg->Alloc_Bias();
    return ReadOutParam(in, chg->Bias, chg, self);
}

int ReadGain(istream &in, NeuralLayer *chg, NLInputCommand* self)
{
    chg->Alloc_Gain();
    return ReadOutParam(in, chg->Gain, chg, self);
}

int ReadPseudo(istream &in, NeuralLayer *chg, NLInputCommand* self)
{
    chg->Alloc_Pseudo();
    return ReadOutParam(in, chg->Pseudo, chg, self);
}

int ReadWeights(istream &in, NeuralLayer *chg, NLInputCommand* self)
{
    chg->Alloc_Weights();
    char word[100];
    for (int o=0; o<chg->num_out(); o++)	// loop bounds reconstructed
    {	for (int w=0; w<chg->num_wind(); w++)
	{   for (int i=0; i<chg->num_in(); i++)
	    {	get_word(in,word);	// use get_word to allow comments
		chg->weight(i,w,o) = atof(word);
	    }
	}
    }
    return 1;
}

void NeuralLayer::init_command_table()
{
    assert(!CommandTable);
    CommandTable = new NameToPtr(13);
    CommandTable->ignore_case();
    CommandTable->AddName(new NLInputCommand("Name", ReadName));
    CommandTable->AddName(new NLInputCommand("NumInputs", ReadNumInputs));
    CommandTable->AddName(new NLInputCommand("NumOutputs", ReadNumOutputs));
    CommandTable->AddName(new NLInputCommand("WindowSize", ReadWindowSize));
    CommandTable->AddName(new NLInputCommand("Overhang", ReadOverhang));
    CommandTable->AddName(new NLInputCommand("UseMultUpdate", ReadUseMultUpdate));
    CommandTable->AddName(new NLInputCommand("Bias", ReadBias));
    CommandTable->AddName(new NLInputCommand("Gain", ReadGain));
    CommandTable->AddName(new NLInputCommand("Pseudo", ReadPseudo));
    CommandTable->AddName(new NLInputCommand("Weights", ReadWeights));
    CommandTable->AddName(new NLInputCommand("Comment", ReadComment));
    CommandTable->AddName(new NLInputCommand("ClassName", VerifyClassName));
    CommandTable->AddName(new NLInputCommand("EndClassName", VerifyClassName));
}
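
// A minimal, purely hypothetical example of the text format accepted by
// read_knowing_type below (keywords from the command table above; the size
// parameters must come before Bias/Gain/Pseudo/Weights, since those
// keywords trigger allocation):
//
//	ClassName = NeuralLayer
//	Name = example_layer
//	NumInputs = 3
//	WindowSize = 1
//	NumOutputs = 2
//	Bias =	 0.0  0.0
//	Weights =
//	 0.1 -0.2  0.3
//	 0.0  0.4 -0.1
//	EndClassName = NeuralLayer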
int NeuralLayer::read_knowing_type(istream &in)
{
    Dealloc();		// Should this be a conditional deallocation?
    WindowSize=1;	// set default window size
    if (!command_table())
	init_command_table();

    char word[300];
    while (in.good())
    {   get_word(in, word, '=');
	NLInputCommand *comm = dynamic_cast<NLInputCommand*>
		(command_table()->FindOldName(word, ZeroIfNew));
	if (!comm)
	    comm = dynamic_cast<NLInputCommand*>
		(NeuralLayer::CommandTable->FindOldName(word, ZeroIfNew));
	if (comm)
	{   if (!comm->execute(in, this))
		return 1;
	}
	else
	{   cerr << "Unrecognized keyword: " << word
		 << " for type " << type()->name()
		 << " " << name()
		 << "\n" << flush;
	}
    }
    return 0;
}

void NeuralLayer::write_knowing_type(ostream &out) const
{
    int o;	// which output
    long oldprecision = out.precision(4);
    out << IOSFIXED;
    if (name())	out << "Name = " << name() << "\n";
    out << "NumInputs = " << NumInputs << "\n"
	<< "WindowSize = " << WindowSize << "\n"
	<< "NumOutputs = " << NumOutputs << "\n"
	<< "Overhang = " << Overhang << "\n"
	<< "UseMultUpdate = " << UseMultUpdate << "\n";
    // [much of the output code below was lost from this copy; the Bias,
    //  Gain, and Pseudo blocks and parts of the Weights block are
    //  reconstructed to write what ReadOutParam and ReadWeights expect:
    //  whitespace-separated values, one window position per line for the
    //  weights, with the gain folded into the printed weights.  The exact
    //  original formatting may differ.]
    if (Bias)
    {   out << "Bias =\n";
	for (o=0; o<NumOutputs; o++)
	    out << " " << Bias[o];
	out << "\n";
    }
    if (Gain)
    {   out << "Gain =\n";
	for (o=0; o<NumOutputs; o++)
	    out << " " << Gain[o];
	out << "\n";
    }
    if (Pseudo)
    {   out << "Pseudo =\n";
	for (o=0; o<NumOutputs; o++)
	    out << " " << Pseudo[o];
	out << "\n";
    }
    if (Weights)
    {   out << "Weights =\n";
	for (o=0; o<NumOutputs; o++)
	{   out << "# weights for output "
		<< output_interface()->unit_name(o) << "\n#";
	    out << IOSLEFT;
	    for (int i=0; i<NumInputs; i++)
		out << " " << input_interface()->unit_name(i) ;
	    out << IOSRIGHT;
	    out << "\n";
	    float g= gain(o);
	    for (int w=0; w<WindowSize; w++)
	    {   for (int i=0; i<NumInputs; i++)
		    out << " " << weight(i,w,o)*g;
		out << "\n";
	    }
	}
    }
    out.precision(oldprecision);
}
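
// Notes on the weight update that follows (a sketch, not from the original
// source): act->exp_share(o) appears to play the role of the derivative of
// the encoding cost with respect to the raw sum S_o (for a softmax output
// this is P_o - 1 for the correct class and P_o otherwise), so with
//     scaled = exp_share(o) * main_objective_weight * WeightRate * gain(o)
// the additive branch  wg -= scaled*in  is ordinary gradient descent on the
// weight, while the multiplicative branch  wg *= exp(-scaled*wg*in)  is the
// same gradient step taken on log|wg| (d cost/d log wg = wg * d cost/d wg),
// which keeps each weight from changing sign.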
// [the first line of this comment was partly lost; reconstructed]
// update the weights to reduce the encoding cost + Owner->center_weight()*sum^2
//	 + Owner->range_weight()*((sum-bias)^2 - DesiredSq)^2
// Dependency: update weight BEFORE Pseudo or Gain
void NeuralLayer::update_weights(const ActivationRecord *act)
{
    if (is_frozen()) return;
    if (!act->in()) return;	// no update for dummy record

    const int num_o_m1 = num_out() -1;
    const int num_i_m1 = num_in() -1;
    const int num_w_m1 = num_wind() -1;
    const int num_w_m1_by2 = num_w_m1/2;

    const double MAX_CHANGE=Owner->learning_params()->LayMaxWeightChange;
    const double MAX_WEIGHT=Owner->learning_params()->LayMaxWeight;
    const double main_objective_weight = UseMultUpdate?
		Owner->learning_params()->LayMainObjWeightForMult:
		Owner->learning_params()->LayMainObjWeight;
    const double SHAPE_EXP = Owner->learning_params()->LayWindowShapeExp;

    // for performance purposes, declare and compute (outside the output
    // loop) the windowshape array, which is independent of output unit.
    double WindowShape[50];
    assert (50 > num_w_m1);
    for (int w=num_w_m1; w>=0; w--)
    {   int offset = w - num_w_m1_by2;	// how far from midpoint?
	if (offset <0) offset = 0-offset;
	// do faster weight decay as you move away from center.
	WindowShape[w] = pow(GainExtinction, SHAPE_EXP*offset);
    }

    for (int o= num_o_m1; o>=0; o--)
    {   double scaled = act->exp_share(o) *main_objective_weight;
#ifdef DEBUG_UPDATE
	double sum = act->sums()[o];
	assert (-99999 < sum);
	assert (sum < 99999);
//	scaled += Owner->center_weight() * 2.0 * sum;
//	double sum_bias=sum-Bias[o];
//	double ssmd = sum_bias*sum_bias - DesiredSq;
//	double deriv = 4*ssmd*sum_bias;	// derivative w.r.t. sum(g*w*in)
//	scaled += Owner->range_weight() * deriv;
#endif
	scaled *= WeightRate * gain(o);
#ifdef DEBUG_UPDATE
	if (o==0)
	{   cerr << "exp_share= " << act->exp_share(o)
		 << " sum= " << sum
		 << " deriv= " << deriv
		 << " change= " << -scaled*act->in(0)[0]
		 << " old_weight= " << weight(0,0,0)
		 << "\n" << flush;
	}
#endif
	// Limit change:
	if (scaled > MAX_CHANGE) scaled=MAX_CHANGE;
	else if (scaled < - MAX_CHANGE) scaled = 0-MAX_CHANGE;

	for (int w=num_w_m1; w>=0; w--)
	{   const float* one_in = act->in(w);
	    const double WindowShape_w = WindowShape[w];
	    // separate inner loop cases for performance (not clarity) purposes
	    if (UseMultUpdate)
	    {	for (int i=num_i_m1; i>=0; i--)
		{   double wg = weight(i,w,o);
		    wg *= exp(-scaled*wg*one_in[i]);	// UseMultUpdate
#ifdef DEBUG
		    assert(isfinite(wg));
#endif
		    wg *= WindowShape_w;
		    if (wg > MAX_WEIGHT) wg = MAX_WEIGHT;
		    else if (wg < -MAX_WEIGHT) wg = 0-MAX_WEIGHT;
		    weight(i,w,o)=wg;
		}
	    }
	    else
	    {	for (int i=num_i_m1; i>=0; i--)
		{   double wg = weight(i,w,o);
		    wg -= scaled * one_in[i];		// not UseMultUpdate
		    wg *= WindowShape_w;
		    if (wg > MAX_WEIGHT) wg = MAX_WEIGHT;
		    else if (wg < -MAX_WEIGHT) wg = 0-MAX_WEIGHT;
		    weight(i,w,o)=wg;
		}
	    }
	}
    }
}

// Pseudocount must remain positive, so update is multiplicative
// rather than additive.
// Constrain growth to no more than a factor of 2.
void NeuralLayer::update_pseudo(const ActivationRecord *act)
{
    if (is_frozen()) return;

    double MAX_CHANGE=Owner->learning_params()->LayMaxPseudoChange;
    double MAX_PSEUDO=Owner->learning_params()->LayMaxPseudo;
    double main_objective_weight = UseMultUpdate?
		Owner->learning_params()->LayMainObjPseudoForMult:
		Owner->learning_params()->LayMainObjPseudo;

    for (int o= num_out()-1; o>=0; o--)
    {   double mult = exp(-PseudoRate* Pseudo[o] * act->share()[o]
			* main_objective_weight);
#ifdef DEBUG
	assert(isfinite(mult));
#endif
	if (mult > MAX_CHANGE) mult = MAX_CHANGE;
	else if (mult *MAX_CHANGE < 1.0) mult = 1.0/MAX_CHANGE;
	Pseudo[o] *= mult * PseudoExtinction;
	if (Pseudo[o] > MAX_PSEUDO) Pseudo[o] = MAX_PSEUDO;
    }
}

// update bias to minimize sum_vectors sum_o Sums[o]^2;
// This is independent of the normal error function, and
// does not propagate back to inputs.
void NeuralLayer::update_bias(const ActivationRecord *act)
{
    if (is_frozen()) return;
    if (!act->in()) return;	// no update for dummy record

    double MAX_CHANGE=Owner->learning_params()->LayMaxBiasChange;
    double MAX_BIAS=Owner->learning_params()->LayMaxBias;
    double main_objective_weight = UseMultUpdate?
		Owner->learning_params()->LayMainObjBiasForMult:
		Owner->learning_params()->LayMainObjBias;

    for (int o= num_out()-1; o>=0; o--)
    {   double scaled = act->exp_share(o);
	// artificially reduce importance of encoding cost for bias
	scaled *= main_objective_weight;

	double sum = act->sums()[o];
	double deriv = 2.0 * sum;
	scaled += Owner->center_weight() * deriv;
	scaled *= BiasRate;
#ifdef DEBUG_UPDATE
	if (o==0)
	{   cerr << "exp_share= " << act->exp_share(o)
		 << " sum= " << sum
		 << " deriv= " << deriv
		 << " change= " << -scaled
		 << " old_bias= " << Bias[o]
		 << "\n" << flush;
	}
#endif
	if (scaled > MAX_CHANGE) scaled=MAX_CHANGE;
	else if (scaled < -MAX_CHANGE) scaled=-MAX_CHANGE;
	Bias[o] -= scaled;
	Bias[o] *= BiasExtinction;
	if (Bias[o] > MAX_BIAS) Bias[o]=MAX_BIAS;
	else if (Bias[o] < -MAX_BIAS) Bias[o] = -MAX_BIAS;
    }
}
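
// A sketch of the gain derivative used below (an editorial note, assuming
// S_o = Bias[o] + Gain[o]*W_o, where W_o is the weighted window sum): with
// sum_bias = S_o - Bias[o] = Gain[o]*W_o, we have
// d(sum_bias)/d log(Gain[o]) = sum_bias, so
//     d/d log(Gain[o]) of ((S_o-Bias[o])^2 - DesiredSq)^2
//	 = 2*(sum_bias^2 - DesiredSq) * 2*sum_bias * sum_bias
//	 = 4*ssmd*sum_bias*sum_bias,
// which is exactly the "deriv" computed in update_gain.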
// update gain to minimize sum_o ((Sums[o]-bias)^2 - DesiredSq)^2 ;
// Note: this is independent of the normal error function,
//	and does not propagate back to inputs.
// Gain must remain positive so use multiplicative update
//	(that is, use log(Gain) as parameter)
// Dependency: Update Gain BEFORE Bias.
void NeuralLayer::update_gain(const ActivationRecord *act)
{
    if (is_frozen()) return;
    if (!act->in()) return;	// no update for dummy record

    double MAX_CHANGE=Owner->learning_params()->LayMaxGainChange;
    double MIN_CHANGE=1.00/MAX_CHANGE;
    double MAX_GAIN=Owner->learning_params()->LayMaxGain;
    double MIN_GAIN=1.00/MAX_GAIN;
    double main_objective_weight = UseMultUpdate?
		Owner->learning_params()->LayMainObjGainForMult:
		Owner->learning_params()->LayMainObjGain;

    for (int o= num_out()-1; o>=0; o--)
    {   double scaled = act->exp_share(o) * main_objective_weight;
	double sum_bias=act->sums()[o]-Bias[o];
	double ssmd = sum_bias*sum_bias - DesiredSq;
	double deriv = 4*ssmd*sum_bias*sum_bias;	// derivative w.r.t log(gain)
	if (scaled *deriv >0)
	{   // both encoding cost and range control push in same
	    // direction, so use both
	    scaled += Owner->range_weight() * deriv;
	}
	else
	{   // conflict, don't change gain
	    continue;
	}
	scaled *= GainRate;
	double mult = exp(-scaled) * GainExtinction;
#ifdef DEBUG
	assert(isfinite(mult));
#endif
	Gain[o] *= mult>MAX_CHANGE? MAX_CHANGE
		: mult<MIN_CHANGE? MIN_CHANGE
		: mult;
	if (Gain[o] > MAX_GAIN) Gain[o]=MAX_GAIN;
	if (Gain[o] < MIN_GAIN) Gain[o] = MIN_GAIN;
    }
}

// The next four "change" functions adjust the various learning rates.

void NeuralLayer::change_WeightRate(double old_cost, double new_cost)
{
    assert(old_cost!=0 && new_cost!=0);
    double factor = old_cost/new_cost;
    const LearningParam* lp=Owner->learning_params();
    double MAX_FACTOR=lp->LayMaxWeightRateFactor;
    double RATEDECAY=lp->LayWeightRateDecay;
    if (factor > MAX_FACTOR) factor=MAX_FACTOR;
    WeightRate*= factor*RATEDECAY;	// gradually slow down learning

    GainExtinction = exp(-WeightRate *
	(UseMultUpdate? lp->LayGainExtConstForMult: lp->LayGainExtConst));
#ifdef DEBUG
    assert(isfinite(GainExtinction));
#endif
    BiasExtinction = exp(-WeightRate *
	(UseMultUpdate? lp->LayBiasExtConstForMult: lp->LayBiasExtConst));
#ifdef DEBUG
    assert(isfinite(BiasExtinction));
#endif
    PseudoExtinction = exp(-WeightRate *
	(UseMultUpdate? lp->LayPseudoExtConstForMult: lp->LayPseudoExtConst));
#ifdef DEBUG
    assert(isfinite(PseudoExtinction));
#endif
}

void NeuralLayer::change_PseudoRate(double old_cost, double new_cost)
{
    assert(old_cost!=0 && new_cost!=0);
    double factor = old_cost/new_cost;
    const LearningParam* lp=Owner->learning_params();
    double MAX_FACTOR=lp->LayMaxPseudoRateFactor;
    double RATEDECAY=lp->LayPseudoRateDecay;
    if (factor > MAX_FACTOR) factor=MAX_FACTOR;
    PseudoRate*= factor*RATEDECAY;	// gradually slow down learning
}

void NeuralLayer::change_BiasRate(double old_cost, double new_cost,
	double old_rms, double new_rms)
{
    assert(old_rms!=0 && new_rms!=0);
    // increase learning rate when sum of squares gets bigger
    double factor = new_rms/old_rms;
    const LearningParam* lp=Owner->learning_params();
    double MAX_FACTOR=lp->LayMaxBiasRateFactor;
    double RATEDECAY=(old_cost < new_cost)?
	lp->LayBiasRateDecayForHigherCost:
	lp->LayBiasRateDecayForLowerCost;
    if (factor > MAX_FACTOR) factor=MAX_FACTOR;
    BiasRate*= factor*RATEDECAY;	// gradually slow down learning
}
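
// Note on the rate-adjustment heuristics above and below (editorial): for
// the weight and pseudocount rates, factor = old_cost/new_cost, so a drop
// in encoding cost (say from 2.0 to 1.6) gives factor 1.25 and the rate
// grows by 1.25*RATEDECAY, while a rising cost shrinks it; the Lay*RateDecay
// constants are presumably a little below 1 (per the "gradually slow down
// learning" comments), so rates drift downward when the cost is flat.  The
// bias rate instead grows with the rms of the sums, and the gain rate with
// the distance from the desired sum of squares.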
void NeuralLayer::change_GainRate(double old_cost, double new_cost,
	double old_rmsMDes2, double new_rmsMDes2)
{
    // learn faster when farther from desired sum of squares
    double factor = new_rmsMDes2/old_rmsMDes2;
    const LearningParam* lp=Owner->learning_params();
    double MAX_FACTOR=lp->LayMaxGainRateFactor;
    double RATEDECAY=(old_cost < new_cost)?
	lp->LayGainRateDecayForHigherCost:
	lp->LayGainRateDecayForLowerCost;
    if (factor > MAX_FACTOR) factor=MAX_FACTOR;
    GainRate*= factor*RATEDECAY;	// gradually slow down learning
}

// Only initializes if weights have not already been set
void NeuralLayer::initialize_weights(ostream &logfile)
{
    const LearningParam* lp=Owner->learning_params();
    double mult = lp->LayRangeMult;
    double denom = lp->LayRangeDenom;
    double min_mult = lp->LayRangeMinForMult;

    if (!Weights)
    {   logfile << "# Initializing Weights for " << this->name() << endl;
	Alloc_Weights();
	for (int w=0; w