// NeuralNet.cc
// Kevin Karplus and Christian Barrett
// 14 Nov. 1997

#include <iostream>     // assumed; original system include not recoverable
#include <math.h>       // assumed; original system include not recoverable
#include "Utilities/IOSmacros.h"
#include "Utilities/Random.h"
#include "Regularizer/BackgroundProbs.h"
#include "NeuralNet.h"
#include "NeuralLayer.h"
#include "ActivationRecord.h"
#include "InterfaceDescription.h"
#include "TrainSet.h"
#include "NetActivation.h"
#include "NetQuality.h"
#include "OneChain.h"
#include "LearningParam.h"
#include "Input/Input.h"
#include "EqualStrings/EqualStrings.h"

// information for the IdObject of NamedClass NeuralNet
static NamedClass *create_neural_net(void) {return new NeuralNet;}

IdObject NeuralNet::ID("NeuralNet", create_neural_net, 0,
        "NeuralNet is an arbitrary-layer neural network.\n");

NameToPtr* NeuralNet::CommandTable = 0;

NeuralNet::NeuralNet(void)
{
    NumLayers = AllocLayers = 0;
    layer_counter = ifd_counter = 0;
    MyNetActivation = NULL;
    Layers = NULL;
    Interfaces = NULL;
    EpochCounter = 0;
    LearningParams = 0;
    set_learning_params(new LearningParam);
}

// Copy everything EXCEPT MyNetActivation
NeuralNet::NeuralNet(const NeuralNet *old)
{
    NumLayers = AllocLayers = 0;
    MyNetActivation = NULL;
    Layers = NULL;
    Interfaces = NULL;
    Alloc(old->NumLayers);
    layer_counter = old->layer_counter;
    ifd_counter = old->ifd_counter;
    EpochCounter = old->EpochCounter;
    CenterWeight = old->CenterWeight;
    RangeWeight = old->RangeWeight;
    LearningParams = new LearningParam;
    (*LearningParams) = (*(old->LearningParams));
    NumLayers = old->NumLayers;
    for (int lay=0; lay < NumLayers; lay++)
    {   Layers[lay] = new NeuralLayer(old->Layers[lay]);
    }
    for (int i=0; i < NumLayers+1; i++)
    {   Interfaces[i] = new InterfaceDescription(old->Interfaces[i]);
    }
}

void NeuralNet::Alloc(int num)
{
    AllocLayers = num;
    assert(AllocLayers > 0);
    typedef NeuralLayer* nlp;
    Layers = new nlp[AllocLayers];
    typedef InterfaceDescription *nid;
    Interfaces = new nid[AllocLayers+1];
    typedef unsigned int ui;
    // (reconstructed loop body: clear the newly allocated arrays to NULL)
    for (int i=0; i < AllocLayers+1; i++)
    {   Interfaces[i] = NULL;
        if (i < AllocLayers) Layers[i] = NULL;
    }
}

// Copy weights, biases, pseudocounts, and gains from another network
// with the same architecture.
// (Signature and loop header reconstructed; the original declaration was garbled.)
void NeuralNet::copy_weights_from(const NeuralNet *old_net)
{
    assert(NumLayers == old_net->NumLayers);
    for (int lay=0; lay < NumLayers; lay++)
    {   NeuralLayer *layer = Layers[lay];
        const NeuralLayer *old = old_net->Layers[lay];
        assert(layer->num_in() == old->num_in());
        assert(layer->num_wind() == old->num_wind());
        assert(layer->num_out() == old->num_out());
        layer->copy_weights(old);
        for (int o=layer->num_out()-1; o>=0; o--)
        {   layer->set_bias(o, old->bias(o));
            layer->set_pseudo(o, old->pseudo(o));
            layer->set_gain(o, old->gain(o));
        }
    }
}

// Remove MyNetActivation, so that network can be safely
// resized (say, changing the overhangs).
// Need to call initialize_net() again after resizing
void NeuralNet::remove_activation(void)
{
    delete MyNetActivation;
    MyNetActivation = NULL;
}
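// A minimal resize sketch (hypothetical driver code, not part of this file;
// only remove_activation() and initialize_net() are real members here):
//
//     net->remove_activation();                // drop the stale NetActivation
//     // ... change window overhangs or other layer sizes here ...
//     net->initialize_net(logfile, training);  // rebuilds MyNetActivation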
// Initializes any of the network parameters that *have not* been set.
// These are currently the weights, pseudo, bias, and gain values.
void NeuralNet::initialize_net(ostream &logfile, const TrainSet *training)
{
    NeuralLayer *curr_lay;
    // int windsize, numin;

    // Reallocate and define the NetActivation.
    delete MyNetActivation;
    MyNetActivation = new NetActivation(this);
    assert(MyNetActivation != NULL);

    // For each layer, if they aren't set, initialize weights, bias,
    // gain, and pseudo.
    for (int i=NumLayers-1; i>=0; --i)
    {   curr_lay = Layers[i];
        curr_lay->initialize_weights(logfile);
        curr_lay->initialize_pseudo(logfile, training);
        curr_lay->initialize_bias(logfile);
        curr_lay->initialize_gain(logfile);

        const InterfaceDescription* ifd = curr_lay->output_interface();
        if (ifd->Alpha && !Globals::background_probs(ifd->Alpha, ZeroIfNew))
        {   BackgroundProbs *bp = new BackgroundProbs(ifd->Alpha);
            cerr << "WARNING: BackgroundProbs for " << ifd->Alpha->name()
                 << " not read in before initializing network.\n";
            const TrainSet* use_for_background = NULL;
            if (Globals::training && Globals::training->num_cols()>0)
            {   use_for_background = Globals::training;
                cerr << "Setting BackgroundProbs for " << ifd->Alpha->name()
                     << " based on training set\n";
            }
            else if (Globals::cross_training && Globals::cross_training->num_cols()>0)
            {   use_for_background = Globals::cross_training;
                cerr << "Setting BackgroundProbs for " << ifd->Alpha->name()
                     << " based on cross-training set\n";
            }
            else if (Globals::testing && Globals::testing->num_cols()>0)
            {   use_for_background = Globals::testing;
                cerr << "Setting BackgroundProbs for " << ifd->Alpha->name()
                     << " based on testing set\n";
            }
            if (use_for_background)
            {   for (int out=0; out < curr_lay->num_out(); ++out)
                {   double cnt = use_for_background->output_count(i,out);
                    (*bp)[out] = cnt;
                }
                bp->normalize();
                bp->set_name(ifd->Alpha->name());
                Globals::BackgroundProbsByName.AddName(bp);
            }
        }
    }
    logfile << "# Network initialization done\n";
}
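// A minimal sketch of how a driver might use the routines in this file
// (hypothetical code: the file name, stream setup, and TrainSet loading are
// assumptions; only read_new(), initialize_net(), and learning_loop() come
// from this file):
//
//     ifstream netin("start.net");
//     NeuralNet *net = NeuralNet::read_new(netin);
//     net->initialize_net(cout, training);          // training: TrainSet*
//     net->learning_loop(100, training,
//             &train_report, 0,                     // training summary / per-chain
//             cross_training, &cross_report, 0,     // cross-training reports
//             0, 0);                                // unit-usage, Q-vs-Phat off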
// This is the function that starts the show.  Since I don't know what
// to use for convergence/stop criterion, the net just cycles through
// num_iterations estimation cycles.
void NeuralNet::learning_loop(int num_iterations,
        TrainSet *training,
        ofstream *report_training,
        ofstream *report_training_indiv,
        TrainSet *cross_training,
        ofstream *report_cross_training,
        ofstream *report_cross_training_indiv,
        ofstream *report_unit_usage,
        ofstream *report_Q_vs_Phat)
{
    int lay;            // frequently used counter for layers.

    NetQuality *TrainNQ = 0;
    NetQuality *NewNQ = new NetQuality(this);
    NetQuality *OldNQ = new NetQuality(this);

    if (!cross_training || cross_training->num_cols()==0)
        cross_training = training;
    assert(training != NULL || (num_iterations==0 && cross_training!=NULL));
    assert(layer_counter == NumLayers);

    if (EpochCounter==0 && num_iterations>0)
    {   int distance_to_training=0;     // how far to next layer that
                                        // has output to train to?
        for (lay=num_layers()-1; lay>=0; lay--)
        {   distance_to_training = is_layer_hidden(lay)? distance_to_training+1 : 0;
            Layers[lay]->initialize_learning_rates(training->num_chains(),
                    distance_to_training);
        }
    }

    // Initialize, and see where we have to improve from
    for (lay=NumLayers-1; lay>=0; lay--)
    {   Layers[lay]->normalize();
        // Layers[lay]->center_weights();
        Layers[lay]->center_biases();
    }

    if (report_cross_training && cross_training
            && cross_training->num_cols()>0 && num_iterations>0)
    {   NewNQ->print_data_header(*report_cross_training);
    }
    test(cross_training, NewNQ, report_cross_training, report_cross_training_indiv);

    if (report_unit_usage)
    {   NewNQ->print_unit_usage(*report_unit_usage, EpochCounter);
    }
    if (report_Q_vs_Phat)
    {   NewNQ->print_Q_vs_Phat(*report_Q_vs_Phat, EpochCounter);
    }

    if (num_iterations<=0)
    {   delete NewNQ;
        delete OldNQ;
        return;
    }

    NeuralNet *BestNN = new NeuralNet(this);
    NetQuality *BestNQ = new NetQuality(NewNQ);
    double BestObjective = NewNQ->objective(NumLayers-1);
    int best_last_epoch = EpochCounter;

    if (training && training->num_cols()>0)
    {   TrainNQ = training==cross_training?
                new NetQuality(NewNQ) : new NetQuality(this);
        if (report_training)
        {   TrainNQ->print_data_header(*report_training);
            (*report_training) << flush;
        }
    }

    bool do_shuffle = 1;
    double PrevObjective = BestObjective;

    for (int i=1; i<=num_iterations; ++i)
    {   EpochCounter++;
        cerr << "Epoch: " << EpochCounter << " " << flush;

        // Save the current point, in case we decide NOT
        // to accept the training epoch.
        NeuralNet *PrevNN = new NeuralNet(this);

        train(training, TrainNQ, report_training, report_training_indiv, do_shuffle);

        for (lay=NumLayers-1; lay>=0; lay--)
        {   Layers[lay]->normalize();
            Layers[lay]->center_weights();
            Layers[lay]->center_biases();
        }

        // Cost changed to be reported as bits instead of nats
        if (TrainNQ != NULL)
        {   float cost_in_bits =
                TrainNQ->record(NumLayers-1)->encoding_cost() * M_LOG2E;
            cerr << cost_in_bits;
        }

        NetQuality *switch_tmp = OldNQ;
        OldNQ = NewNQ;
        NewNQ = switch_tmp;

        test(cross_training, NewNQ, report_cross_training, report_cross_training_indiv);

        if (report_unit_usage)
        {   NewNQ->print_unit_usage(*report_unit_usage, EpochCounter);
        }
        if (report_Q_vs_Phat)
        {   NewNQ->print_Q_vs_Phat(*report_Q_vs_Phat, EpochCounter);
        }

        update_rates(OldNQ, NewNQ);

        double Objective = NewNQ->objective(NumLayers-1);
        cerr << " cross_bits="
             << NewNQ->record(NumLayers-1)->encoding_cost() * M_LOG2E
             << " objective=" << Objective;

        if (Objective > BestObjective)
        {   // save the new best
            delete BestNQ;      BestNQ = new NetQuality(NewNQ);
            delete BestNN;      BestNN = new NeuralNet(this);
            best_last_epoch = EpochCounter;
            BestObjective = Objective;
            // shuffle 10% of time, if new best
            do_shuffle = (drandom() <= 0.10);
            cerr << " new best" << (do_shuffle? ", reshuffling": "");
        }
        else if (EpochCounter-best_last_epoch >= learning_params()->NetResetAfter
                || i==num_iterations)
        {   // reset from best so far.
            best_last_epoch = EpochCounter;
            delete NewNQ;
            NewNQ = new NetQuality(BestNQ);
            copy_weights_from(BestNN);
            if (report_cross_training)
            {   NewNQ->print_data(*report_cross_training);
                (*report_cross_training) << flush;
            }
            do_shuffle = 1;
            Objective = BestObjective;
            cerr << " reset to old best (" << Objective << "), reshuffling";
        }
        else if (Objective - PrevObjective
                < learning_params()->NetTemperature * log(drandom()))
        {   // Reject this change and go back to previous network.
            // That is if change is negative, accept with probability
            // exp(change/temperature).
            delete NewNQ;
            NewNQ = new NetQuality(OldNQ);
            copy_weights_from(PrevNN);
            Objective = PrevObjective;
            do_shuffle = 1;
            cerr << " worse network rejected, reshuffling";
        }
        else
        {   do_shuffle = (Objective <= PrevObjective)? 1
                : (drandom() <= 0.4);       // shuffle 40% of time if better
            cerr << (Objective>PrevObjective? " improvement" : " worse network")
                 << " accepted"
                 << (do_shuffle? ", reshuffling": "");
        }
        cerr << "\n" << flush;

        CenterWeight *= LearningParams->NetCenterDecay;
        RangeWeight  *= LearningParams->NetRangeDecay;

        PrevObjective = Objective;
        delete PrevNN;
    }

    delete OldNQ;
    delete NewNQ;
    delete BestNQ;
    delete BestNN;
    delete TrainNQ;
}
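// The accept/reject test in learning_loop() above is the standard Metropolis
// rule written in log form.  With D = Objective - PrevObjective,
// T = NetTemperature, and u = drandom() uniform on (0,1), the epoch is
// rejected when D < T*log(u), i.e. when u > exp(D/T).  An equivalent way to
// write the acceptance test (a sketch only; the code does not use this form):
//
//     bool accept = drandom() <= exp((Objective - PrevObjective)
//                                    / learning_params()->NetTemperature);
//
// so an improvement (D >= 0) is always kept, and a worse epoch survives
// with probability exp(D/T).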
void NeuralNet::test_or_train(bool do_training,
        const TrainSet *test_set,
        NetQuality *nq,
        ofstream* report_summary,
        ofstream* report_individually,
        bool shuffle)
{
    assert(nq != NULL);

    // Note: individual chain reporting and SOV computations are
    // currently done only for the final output layer.
    // They should probably be done for all layers that have training data.

    for (int lay=NumLayers-1; lay>=0; --lay)
    {   // Make certain the quality records are zero'd out
        nq->record(lay)->clear();
        //  // reset null cost
        //  nq->record(lay)->set_average_null_cost(test_set->null_cost(lay));
        if (do_training)
        {   // reset weight and gain so all gains are 1.
            Layers[lay]->normalize();
        }
    }

    if (do_training && shuffle)
        test_set->shuffle_training_order();

    QualityRecord last_qr(layer(NumLayers-1));
    // last_qr is the quality record for the last layer of the current chain.
    // It is used for setting the chain weight and for individual chain
    // reporting.
    // last_qr.set_average_null_cost(test_set->null_cost(NumLayers-1));

    if (report_individually)
    {   (*report_individually) << "# " << name()
            << " (" << degrees_freedom() << " degrees of freedom)" << "\n"
            << "# chainID ";
        last_qr.print_header(*report_individually);
    }

    int numchains = test_set->num_chains();
    for (int chain_ind=0; chain_ind < numchains; chain_ind++)
    {   // (type of curr_chain assumed: const OneChain*)
        const OneChain *curr_chain = (do_training && shuffle)?
                test_set->get_shuffled_chain(chain_ind) :
                test_set->get_chain(chain_ind);

        MyNetActivation->assign_chain(curr_chain);
        MyNetActivation->activate();
        MyNetActivation->test_outputs();

        if (report_individually || do_training)
            last_qr.clear();
        // last_qr.set_average_null_cost(test_set->null_cost(NumLayers-1));

        for (int la=NumLayers-1; la>=0; --la)
        {   QualityRecord *qualrecord = nq->record(la);
            qualrecord->compile_unit_usage_data(MyNetActivation, curr_chain);
            int num_actrecords = MyNetActivation->layer_length(la);
            for (int n=0; n < num_actrecords; n++)
            {   qualrecord->add(MyNetActivation->record(la,n));
            }
            if (interface(la+1)->train_to_unique())
            {   const short int *osec = curr_chain->osec(la);
                short int *psec = MyNetActivation->psec(la);
                qualrecord->addSOV(curr_chain->num_cols(), osec, psec);
                if (report_individually || do_training)
                {   for (int n=0; n < num_actrecords; n++)
                    {   last_qr.add(MyNetActivation->record(la,n));
                    }
                    last_qr.addSOV(curr_chain->num_cols(), osec, psec);
                }
                delete [] psec;
            }
        }

        if (report_individually)
        {   (*report_individually) << IOSLEFT << std::setw(20)
                << curr_chain->name() << IOSRIGHT
                << " " << last_qr << flush;
        }

        if (do_training)
        {   MyNetActivation->back_propagate(
                pow(1.0-last_qr.q(), LearningParams->NetFractionWrongExponent));
            reest_weights(MyNetActivation);
        }
    }

    if (report_summary)
    {   nq->print_data(*report_summary);
        (*report_summary) << flush;
    }
}

// Changes the learning rates of the net based on its improvement,
// or lack of it, in quality.
void NeuralNet::update_rates(const NetQuality *old_nq, const NetQuality *new_nq)
{
    double old_cost, new_cost;
    double old_rms, new_rms;
    double old_rmsMDes2, new_rmsMDes2;

    // These are output layer values that are used to effect changes in
    // all of the layers.  It will have to be reworked if more than one
    // output layer exists in the network.
    int op = NumLayers-1;
    old_cost = old_nq->encoding_cost(op);
    new_cost = new_nq->encoding_cost(op);

    for (int k=NumLayers-1; k>=0; --k)
    {   old_rms = old_nq->rms_sum(k);
        new_rms = new_nq->rms_sum(k);
        old_rmsMDes2 = old_nq->rms_sum2_minus_des(k);
        new_rmsMDes2 = new_nq->rms_sum2_minus_des(k);

        NeuralLayer *l = Layers[k];
        if (l->is_frozen()) continue;
        l->change_WeightRate(old_cost, new_cost);
        l->change_BiasRate(old_cost, new_cost, old_rms, new_rms);
        l->change_GainRate(old_cost, new_cost, old_rmsMDes2, new_rmsMDes2);
        l->change_PseudoRate(old_cost, new_cost);
        l->normalize();
    }
}

NeuralLayer* NeuralNet::next_TrainTo_layer(int start) const
{
    assert(start>=0 && start < NumLayers);
    for (int i=start; i < NumLayers; i++)
    {   if (Layers[i]->is_TrainTo())
            return Layers[i];
    }
    return NULL;
}
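// Note on the chain weighting used by test_or_train() above: each chain's
// error signal is scaled before back-propagation by
//
//     pow(1.0 - last_qr.q(), LearningParams->NetFractionWrongExponent)
//
// where last_qr.q() is the chain's fraction-correct score for the output
// layer (so 1-q is the fraction wrong, matching the parameter name).
// For example (illustrative numbers only): q = 0.75 with exponent 2 gives
// weight 0.25^2 = 0.0625, so poorly predicted chains dominate the updates.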
// For a NetActivation record that has already had both
// activate and back_propagate done, update the weights in the network.
void NeuralNet::reest_weights(const NetActivation *app)
{
    int k, m;
    int num_units;
    NeuralLayer *nl;
    ActivationRecord *curr_record;

    int numlayers = app->num_layers();
    assert(numlayers == this->NumLayers);

    // Update weights, pseudo, gain, and bias parameters in the neural
    // net, beginning with the output layer
    for (k=numlayers-1; k>=0; --k)
    {   nl = Layers[k];
        num_units = app->layer_length(k);
        for (m=num_units-1; m>=0; --m)
        {   curr_record = app->record(k,m);
            nl->update_weights(curr_record);
            nl->update_pseudo(curr_record);
            nl->update_gain(curr_record);
            nl->update_bias(curr_record);
        }
    }
}

// Reads the NeuralNet basic structure, then assumes that the individual
// layers' NeuralLayer and InterfaceDescription definitions follow.
// These are also read in and added to the structure.
// As a side effect, this function also creates the global NetActivation
// structure.
NeuralNet* NeuralNet::read_new(istream& in)
{
    NeuralNet *retnet;
    // NamedClass *q;
    // int input_error=0;
    // int assign_error=0;
    int layers_in_net;
    int lay;

    NamedClass *p = NamedClass::read_new(in);
    if (!p) return 0;
    if (!p->is_a(NeuralNet::classID()))
    {   cerr << "Error while attempting to read a NeuralNet.\n "
             << "Found " << p->type()->name() << " instead.\n";
        return 0;
    }
    retnet = dynamic_cast<NeuralNet*>(p);
    layers_in_net = retnet->NumLayers;

    // This routine now checks for consistency between the
    // specified layers and interfaces.
    if (retnet->layer_counter != retnet->NumLayers)
    {   cerr << "Error: inconsistent number of layers "
             << retnet->layer_counter << " and " << retnet->NumLayers
             << " reading " << retnet->name() << endl;
        delete retnet;
        return 0;
    }
    if (retnet->ifd_counter != retnet->NumLayers+1)
    {   cerr << "Error: inconsistent number of interfaces ("
             << retnet->ifd_counter << ") and layers ("
             << retnet->NumLayers << ") reading " << retnet->name() << endl;
        delete retnet;
        return 0;
    }
    for (lay=0; lay < retnet->NumLayers; lay++)
    {   NeuralLayer *layer = retnet->Layers[lay];
        InterfaceDescription *before = retnet->Interfaces[lay];
        InterfaceDescription *after = retnet->Interfaces[lay+1];
        if (layer->num_in() != before->num_units())
        {   cerr << "Error: inconsistent number of units "
                 << " at input to layer " << lay
                 << " (" << before->num_units() << " and "
                 << layer->num_in() << ")\n";
            delete retnet;
            return 0;
        }
        if (layer->num_out() != after->num_units())
        {   cerr << "Error: inconsistent number of units "
                 << " at output to layer " << lay
                 << " (" << layer->num_out() << " and "
                 << after->num_units() << ")\n";
            delete retnet;
            return 0;
        }
    }
    return retnet;
}

// Returns 1 on error, 0 otherwise
int NeuralNet::add_interface(InterfaceDescription *descr)
{
    assert(ifd_counter < AllocLayers+1);
    if (!Interfaces)
    {   cerr << "Error adding " << descr->type() << " to\n"
             << type() << name()
             << ". Must first allocate for member \"Interfaces\".\n";
        return 1;
    }
    Interfaces[ifd_counter++] = descr;
    return 0;
}

// Returns 1 on error, 0 otherwise
int NeuralNet::add_layer(NeuralLayer *layer)
{
    assert(layer_counter < AllocLayers);
    if (!Layers)
    {   cerr << "Error adding " << layer->type() << " to\n"
             << type() << name()
             << ". Must first allocate for member \"Layers\".\n";
        return 1;
    }
    layer->set_owner(this);
    layer->set_layer_number(layer_counter);
    Layers[layer_counter++] = layer;
    return 0;
}
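// Layout reminder for read_new(), add_interface(), and add_layer() above:
// a net with NumLayers layers always carries NumLayers+1 interface
// descriptions,
//
//     Interfaces[0] -> Layers[0] -> Interfaces[1] -> ... -> Layers[N-1] -> Interfaces[N]
//
// so layer i must have num_in() == Interfaces[i]->num_units() and
// num_out() == Interfaces[i+1]->num_units(), which is what read_new() checks.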
// General purpose function for reading integer values for NeuralNet
// member variables
int ReadIntParam(istream &in, int &param, NeuralNet *change, NNInputCommand* self)
{
    if (change->Layers)
    {   cerr << "ERROR: Can't change " << self->name()
             << " after allocation has been done---\n"
             << "    specify NumLayers first\n";
        return 1;
    }
    int tmp;
    in >> tmp;
    if (tmp <= 0)
    {   cerr << "ERROR: must have " << self->name() << " >0\n";
        return 0;
    }
    param = tmp;
    return 1;
}

int ReadComment(istream &in, NeuralNet *change, NNInputCommand* self)
{
    SkipSeparators(in, 1, '\n');
    return 1;
}

int ReadName(istream &in, NeuralNet *chg, NNInputCommand *self)
{
    assert(chg!=NULL);
    char word[500];
    get_word(in, word);
    chg->set_name(word);
    return 1;
}

int ReadNumLayers(istream &in, NeuralNet *chg, NNInputCommand *self)
{
    assert(chg!=NULL);
    if (ReadIntParam(in, chg->NumLayers, chg, self))
    {   assert(chg->NumLayers>0);
        chg->Alloc(chg->NumLayers);
        return 1;
    }
    else
    {   return 0;
    }
}

int ReadLayerOrInterface(istream &in, NeuralNet *chg, NNInputCommand *self)
{
    assert(chg!=NULL);
    NamedClass *q = NamedClass::read_new_after_classname(in);
    if (q->is_a(InterfaceDescription::classID()))
        return ! chg->add_interface(dynamic_cast<InterfaceDescription*>(q));
    else if (q->is_a(NeuralLayer::classID()))
        return ! chg->add_layer(dynamic_cast<NeuralLayer*>(q));

    cerr << "Unrecognized object in NeuralNet: " << q->type()->name() << "\n";
    return 0;
}

int VerifyClassName(istream &in, NeuralNet *change, NNInputCommand *self)
{
    assert(change!=NULL);
    char word[100];
    get_word(in, word);
    const IdObject *end_id = IdObject::id(word);
    if (end_id != change->type())
    {   cerr << "Warning: " << self->name() << " " << word
             << " doesn't match " << change->type()->name()
             << endl << flush;
    }
    // continue if "ClassName", stop if "EndClassName"
    return EqualStrings(self->name(), "ClassName", 1);
}

void NeuralNet::init_command_table()
{
    assert(!CommandTable);
    CommandTable = new NameToPtr(10);
    CommandTable->ignore_case();
    CommandTable->AddName(new NNInputCommand("Name", ReadName));
    CommandTable->AddName(new NNInputCommand("Comment", ReadComment));
    CommandTable->AddName(new NNInputCommand("NumLayers", ReadNumLayers));
    CommandTable->AddName(new NNInputCommand("ClassName", ReadLayerOrInterface));
    CommandTable->AddName(new NNInputCommand("EndClassName", VerifyClassName));
}

int NeuralNet::read_knowing_type(istream &in)
{
    if (! command_table())
    {   init_command_table();
    }
    char word[300];
    while (in.good())
    {   get_word(in, word, '=');
        NNInputCommand *comm = dynamic_cast<NNInputCommand*>
                (command_table()->FindOldName(word, ZeroIfNew));
        if (comm)
        {   if (!comm->execute(in, this))
                return 1;
        }
        else
        {   cerr << "Unrecognized keyword: " << word
                 << " for type " << type()->name()
                 << " " << name() << endl;
        }
    }
    return 0;
}

void NeuralNet::write_knowing_type(ostream &out) const
{
    out << "Name = " << name() << endl
        << "NumLayers = " << NumLayers << endl
        << endl;
    int numlayers = num_layers();
    for (int i=0; i < numlayers; i++)
    {   interface(i)->write(out);
        out << endl;
        layer(i)->write(out);
        out << endl;
    }
    interface(numlayers)->write(out);
    out << endl;
}
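// A rough sketch of the file layout implied by write_knowing_type() above
// and by the command table (Name, Comment, NumLayers, ClassName/EndClassName).
// The network name and the "..." bodies are placeholders only; the real
// contents of each interface and layer block are written and parsed by
// InterfaceDescription and NeuralLayer themselves, not by this file:
//
//     Name = my_net
//     NumLayers = 2
//
//     ClassName = InterfaceDescription
//     ...                                  (interface 0)
//     ClassName = NeuralLayer
//     ...                                  (layer 0)
//     ...                                  (interface 1, layer 1, interface 2)
//     EndClassName = NeuralNet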
// Using the usage statistics in NQ, adjust the biases of each layer
// and the weights of the next layer to get approximately equal usage of
// each hidden unit.
// Do not adjust the biases for layers with training data for the
// outputs, nor for the layer before a frozen layer.
void NeuralNet::equilibrate(NetQuality *NQ)
{
    for (int i=0; i < NumLayers-1; i++)
    {   NeuralLayer *lay = layer(i);
        if (!lay->output_interface()->is_hidden() || layer(i+1)->is_frozen())
            continue;   // skip layers that can't be equilibrated.
        NeuralLayer *next_lay = layer(i+1);
        int next_num_w = next_lay->num_wind();
        int next_num_o = next_lay->num_out();
        int num_out = lay->num_out();
        for (int o=num_out-1; o>=0; o--)
        {   double phat = NQ->record(i)->phati(o);  // usage of unit o
            double scale = phat * num_out;          // how much to shrink unit
            double log_scale = (scale <= 0.01)? -4.60517 : log(scale);
            lay->set_bias(o, lay->bias(o) - log_scale);
            for (int w=0; w < next_num_w; w++)
            {   for (int no=0; no < next_num_o; no++)
                {   next_lay->set_weight(o, w, no,
                        const_cast<const NeuralLayer*>(next_lay)->weight(o,w,no)
                            * scale);
                }
            }
        }
    }
}

// CHANGE LOG:
// 12 March 1998 Kevin Karplus
//      Added initialize_learning_rate to learning_loop
// 13 April 1998 Kevin Karplus
//      Improved output to include Name of network.
// 14 April 1998 Kevin Karplus
//      Added keeping best and restarting from it after 30 worse epochs.
// 18 May 1998 Kevin Karplus
//      Added normalize and center_weights to training loop.
// 9 June 1998 Kevin Karplus
//      Modified learning_loop so that cross_training set not
//      required (uses training set to choose best if cross_training missing).
// 20 July 1998 Kevin Karplus
//      Added individual sequence reporting option to test() and train()
//      Moved summary printing from learning_loop to test() and train().
//      Merged test() and train() into test_or_train()
//      Fixed bug in set_learning_params which caused CenterWeight and
//      RangeWeight not to be reset properly.
//      Added comment to NetQuality file about what layers are frozen.
// 25 July 1998 Kevin Karplus
//      Added print_Q_vs_Phat to learning_loop
// 15 September 1999 Sugato Basu
//      Added code to handle output format ALIGNMENT
// 5 Nov 1999 Kevin Karplus
//      Changed format of networks to have interfaces and layers
//      inside the NeuralNet instead of outside.
// 10 Dec 1999 Kevin Karplus
//      Added equilibrate()
// 7 Jan 2000 Kevin Karplus
//      Added center_biases after each center_weights
// 28 Feb 2000 Kevin Karplus
//      Modified learning_loop so that training set not shuffled after new
//      best and only shuffled 30% of time after an improvement.
// 8 Jan 2001 Kevin Karplus
//      Added null_cost initialization and negated objective function.
// 22 Nov 2001 Kevin Karplus
//      Commented out center_weights in learning_loop---now need to use
//      explicit CenterWeights command.
// 12 Dec 2001 Kevin Karplus
//      Added NetTemperature and copy_weights_from()
//      Added extra info to log file on training iterations.
// 16 Aug 2003 George Shackelford
//      Replaced deprecated 'form(...)' with proper formatting
// 01 June 2004 Sol Katzman
//      Replaced drand48() with utilities from ultimate library.
// Sat Jun 18 22:13:17 PDT 2005 Kevin Karplus
//      Added initialization of BackgroundProbs from cross_training in learning_loop
// Sun Jun 19 07:16:21 PDT 2005 Kevin Karplus
//      Moved initialization of BackgroundProbs to initialize_net
// Thu Jul  7 04:44:17 PDT 2005 Kevin Karplus
//      Removed a couple of spaces before print_header()
// Thu Jul  7 13:45:46 PDT 2005 Kevin Karplus
//      Added check for empty training/cross-training sets,
//      to existing checks for null pointers.
// Sat Jul 23 17:32:14 PDT 2005 Kevin Karplus
//      Fixed do_training and shuffle parameters of test_or_train to
//      be bool (not int).