// DesignCommands.cc
// Sun Jul 10 16:55:29 PDT 2005 Kevin Karplus
// Moved from TrainingCommands.cc

// Note: there is no .h file for this .cc file, since the commands
// are automatically entered in the command table when they
// are declared.

// (The system-header names here were lost in extraction; the headers
//  below are the ones this file's code clearly needs.)
#include <vector>       // vector
#include <iomanip>      // setw, setprecision
#include <math.h>       // log, pow, M_LOG2E
#include <stdlib.h>     // atoi, atof, drand48

#include "Command/Command.h"
#include "Input/Input.h"
#include "Utilities/Random.h"
#include "Filenames/Filenames.h"
#include "Regularizer/DirichletReg.h"
#include "gen_sequence/GenSequence.h"
#include "NeuralNet.h"
#include "OneChain.h"
#include "NetActivation.h"
#include "NetQuality.h"
#include "Globals.h"
#include "ActivationRecord.h"

// do one forward propagation with output evaluation and return objective
static double forward_prop_one_design(const NeuralNet *NN,
        OneChain *design_chain,
        QualityRecord& last_qr)
{
    last_qr.clear();
    NetActivation *netact = NN->net_activation();
    int num_lay = NN->num_layers();
//  AlphabetTuple *alpha_out = NN->interface(num_lay)->Alpha;
//  last_qr.set_average_null_cost(log(static_cast<double>(alpha_out->num_normal())));

    // do forward propagation and evaluate
    netact->activate();
    netact->test_outputs();

    if (NN->interface(num_lay)->train_to_unique())
    {
        const short int *osec = design_chain->osec(num_lay-1);
        short int *psec = netact->psec(num_lay-1);
        int num_actrecords = netact->layer_length(num_lay-1);
        for (int n=0; n<num_actrecords; n++)
        {   // (the call in this loop body was garbled in extraction; it
            //  evidently accumulates per-position quality into last_qr,
            //  but the method name is an assumption)
            last_qr.add_quality(osec[n], netact->record(num_lay-1,n));
        }
        last_qr.addSOV(design_chain->num_cols(), osec, psec);
        delete [] psec;
    }
    return last_qr.objective();
}

static void fix_profiles_for_changes(void)
{
    if (Globals::ChangeTheseColumns.size() <= 0) return;

    OneChain *chain0 = Globals::design_chains[0];
    int num_cols = chain0->num_cols();
    int num_alph = chain0->guide_alphabet()->norm_length();
    assert(num_alph);

    // check for positions that must be forced to change
    for (int col = 0; col < num_cols; col++)
    {
        if (!Globals::ChangeTheseColumns[col]) continue;
        // column col is required to change, so set it to 0
        // after spreading its weight to the other units
        int index = chain0->structure(0,col);
        float reassign = chain0->profile(col)[index] / (num_alph-1);
        for (int a=0; a<num_alph; a++)
        {   chain0->profile(col)[a] += reassign;
        }
        chain0->profile(col)[index] = 0.0;

        if (chain0->guide_sequence_0(col).raw_int() == index)
        {   // resample this position
            int sampleIndex = get_random_given_weights(num_alph, chain0->profile(col));
            assert(sampleIndex < num_alph);
            assert(sampleIndex != index);
            Base b;
            b.set_int(sampleIndex);
            chain0->set_guide_base(0,col,b);
//          cerr << "DEBUG: col " << col+1
//              << " set to " << Globals::design_chains[0]->guide_alphabet()->to_char(b)
//              << "\n" << flush;
        }
    }

    // propagate input to all design_chains
    for (int n=1; n< Globals::num_nns(); n++)
    {   Globals::design_chains[n]->set_guide(0, chain0->guide_sequence_0());
        Globals::design_chains[n]->copy_probvectors(chain0);
    }
}

// try "forward_samples" different sequences in design_chain,
// scoring each one and keeping the best in design_chain.
// If the best is better than best_objective_for_start,
// then update best_seq_for_start.
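// (Illustration: forward_samples=1.3 always draws one sample, plus a
//  second with probability 0.3; do_most_probable additionally scores the
//  single most-probable sequence before any sampling.)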
static void forward_sampling(vector<QualityRecord *> last_qrs,
        bool do_most_probable,
        double forward_samples,
        Base best_seq_for_start[],
        double& best_objective_for_start)
{
    OneChain *chain0 = Globals::design_chains[0];
    assert(chain0);
    int num_cols = chain0->num_cols();
    int num_alph = chain0->guide_alphabet()->norm_length();
    assert(num_alph);

    Base best_seq_for_iter[num_cols];
    double best_objective = -99999.;
    int best_at = -1;
    double objective = -9999999.;

    fix_profiles_for_changes();

    // The fractional part of forward_samples gives a probabilistic extra
    // sample.  (This computation and the opening of the most-probable
    // branch were lost in extraction; the reconstruction below, including
    // the use of drand48(), is an assumption.)
    int num_samples = static_cast<int>(forward_samples);
    if (drand48() < forward_samples - num_samples) num_samples++;

    int s;
    for (s=(do_most_probable?-1:0); s<num_samples; s++)
    {
        if (s<0)
        {   // use the most probable residue in each column
            for (int c=0; c<num_cols; c++)
            {
                int most_at = -1;
                double most = -1.;
                for (int a=0; a<num_alph; a++)
                {
                    if (Globals::ChangeTheseColumns.size()>0
                            && Globals::ChangeTheseColumns[c]
                            && a==Globals::design_chains[0]->structure(0,c))
                        continue;
                    if (chain0->profile(c)[a] > most)
                    {   most_at=a;
                        most=chain0->profile(c)[a];
                    }
                }
                if (Globals::FreezeTheseColumns.size()>0 && Globals::FreezeTheseColumns[c])
                {   most_at = Globals::design_chains[0]->structure(0,c);
                }
                assert(0<=most_at && most_at < num_alph);
                chain0->set_guide(0,c, most_at);
            }
        }
        else
        {   // Sample profile to 1-hot input,
            chain0->SampleGuideFromAAProbs(0);
        }
        // setting all design_chains to same guide sequence.
        for (int n=1; n< Globals::num_nns(); n++)
        {   Globals::design_chains[n]->set_guide(0, chain0->guide_sequence_0());
        }

        objective=0;
        for (int n=0; n< Globals::num_nns(); n++)
        {   objective += forward_prop_one_design(Globals::nn(n),
                    Globals::design_chains[n], *(last_qrs[n]));
        }

        if (s==0 || best_objective<objective)
        {   best_objective=objective;
            best_at=s;
            for (int c=0; c<num_cols; c++)
            {   best_seq_for_iter[c] = chain0->guide_sequence_0(c);
            }
        }
    }

    if (best_at != s-1)
    {   // best seen was not the last one, so reset sequence to best
        // and redo forward propagation, so that back propagation can be applied
        objective=0;
        for (int n=0; n< Globals::num_nns(); n++)
        {   Globals::design_chains[n]->set_guide(0, best_seq_for_iter);
            objective += forward_prop_one_design(Globals::nn(n),
                    Globals::design_chains[n], *(last_qrs[n]));
        }
    }

    if (best_objective_for_start < objective)
    {   best_objective_for_start=objective;
        for (int c=0; c<num_cols; c++)
        {   best_seq_for_start[c] = chain0->guide_sequence_0(c);
        }
    }
}

// return expected number of residues recovered
static double print_quality(ostream &out,
        const OneChain* design_chain,
        const QualityRecord & last_qr,
        double start_cputime,
        double iter_start_cputime,
        int start_num,
        int iter)
{
    double new_cputime = Globals::user_seconds();

    int residues_recovered=0;
    double expected_residues_recovered=0;
    int num_cols = design_chain->num_cols();
    double input_encoding_cost=0.;
    if (design_chain->structure(0) != NULL)
    {
        for (int col=0; col<num_cols; col++)
        {
            if (design_chain->guide_sequence_0(col).raw_int() == design_chain->structure(0,col))
            {   residues_recovered++;
            }
            double p_correct = design_chain->profile(col)[design_chain->structure(0,col)];
            input_encoding_cost -= log(p_correct);
            expected_residues_recovered += p_correct;
        }
    }
    out << setw(5) << start_num+1 << " "
        << setw(5) << iter+1 << " "
        << setw(7) << setprecision(3) << (new_cputime-iter_start_cputime) << " "
        << setw(7) << setprecision(1) << (new_cputime-start_cputime) << " "
        << setw(5) << residues_recovered << " "
        << setw(6) << setprecision(2) << ((residues_recovered *100.)/num_cols) << " "
        << setw(6) << setprecision(2) << ((expected_residues_recovered *100.)/num_cols) << " "
        << setw(6) << setprecision(3) << (M_LOG2E*input_encoding_cost/num_cols) << " "
        << last_qr;
    return expected_residues_recovered;
}

static void create_starting_point(
        int num_alph,           // size of input alphabet
        float cheating_start,   // look up right answer if >0
        bool do_random,         // generate random seq
        const OneChain& reset_counts,   // reset profile from counts, but leave sequence alone
        double weight           // weight to multiply counts by
        )
{
    int num_cols = Globals::design_chains[0]->num_cols();

    if (cheating_start>0)
    {   // copy the right answer into the guide sequence
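        // (cheating_start serves as the count weight below, so larger
        //  values make the starting profile more sharply peaked on the
        //  true sequence.)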
        Globals::design_chains[0]->clear_probvectors();
        Globals::design_chains[0]->set_guide(0, Globals::design_chains[0]->structure(0));
        Globals::design_chains[0]->set_counts_from_guide(cheating_start);
        for (int col = 0; col < num_cols; col++)
        {   Globals::design_chains[0]->set_profile_from_counts(col);
        }
    }
    else if (do_random)
    {   // Make a random starting point
        Globals::design_chains[0]->RandomInput(Globals::nn(0)->interface(0)->ReRegularizer, 0);
    }
    else
    {   // on second round, use profile from previous starts
        for (int c=0; c<num_cols; c++)
        {   for (int a=0; a<num_alph; a++)
            {   Globals::design_chains[0]->counts_for_layer(0,c)[a] =
                    reset_counts.counts_for_layer(0,c)[a] * weight;
            }
            Globals::design_chains[0]->set_profile_from_counts(c);
        }
    }

    fix_profiles_for_changes();

    // check for frozen values
    if (Globals::FreezeTheseColumns.size()>0)
    {
        for (int col = 0; col < num_cols; col++)
        {
            if (!Globals::FreezeTheseColumns[col]) continue;
            // column col is frozen, so set it to the right answer.
            Globals::design_chains[0]->set_guide(0, col, Globals::design_chains[0]->structure(0,col));
            Globals::design_chains[0]->set_counts_from_guide(col, 100.);
            Globals::design_chains[0]->set_profile_from_counts(col);
        }
    }

    // propagate input to all design_chains
    for (int n=1; n< Globals::num_nns(); n++)
    {   Globals::design_chains[n]->set_guide(0, Globals::design_chains[0]->guide_sequence_0());
        Globals::design_chains[n]->copy_probvectors(Globals::design_chains[0]);
    }
}

// Added by Frellsen
// This is the command for designing primary structure.
// It should be called after everything has been set up.
static int design1st_alg(
        float cheating_start,   // set >0 to start profile from true sequence on first iter
        int num_startpoints,
        int iterations,
        bool do_most_probable,
        double forward_samples, // number of samples per back-propagate
                                // if non-integer, do probabilistic extra sample
        float learning_rate,    // multiplier for gradient in gradient descent
        bool use_rprop,         // use RPROP algorithm instead of gradient descent
        bool add_guide_partials,// add partials with respect to guide sequence
                                // to partials with respect to profile
        float pseudocount,      // add a small flat background to prevent
                                // probabilities from getting too small
        int num_second_round,
        int num_outer_iter,
        ostream &logfile,
        ostream *design_out,
        ostream *predict_out,
        ostream *quality_out)
{
    assert(design_out);
    int num_nets = Globals::num_nns();
    assert(num_nets>0);
    assert(static_cast<int>(Globals::design_chains.size()) == num_nets);

    const LearningParam *lp = Globals::nn(0)->learning_params();
    assert(lp);
    const InterfaceDescription *ifd = Globals::nn(0)->interface(0);

    int num_alph=0;     // size of input alphabet
    int num_units=0;    // size of input to networks
    for (int n=0; n<num_nets; n++)
    {
        NeuralNet *NN = Globals::nn(n);
        NN->initialize_net(logfile, NULL);

        // Count input size
        const InterfaceDescription *ifd = NN->interface(0);
        int num_units_n = ifd->num_units();
        assert(n==0 || num_units_n==num_units);
        num_units=num_units_n;
        int num_alph_n = (ifd->Alpha!=NULL) ? ifd->Alpha->num_normal() : num_units;
        assert(n==0 || num_alph_n==num_alph);
        num_alph=num_alph_n;
    }

    // Set up the array holding the partial derivative of the error with
    // respect to the input of the first layer
    int num_cols = Globals::design_chains[0]->num_cols();
    double **InputPartials = new double*[num_cols];
    for (int col=0; col<num_cols; col++)
    {   InputPartials[col] = new double[num_units];
    }

    // assign each design chain to its network
    for (int n=0; n<num_nets; n++)
    {   NetActivation *netact = Globals::nn(n)->net_activation();
        assert(netact);
        netact->assign_chain(Globals::design_chains[n]);
    }

    // create a record to keep track of the quality of the predictions.
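    // (one QualityRecord per network, sized for that network's output
    //  layer; these are freed at the end of this function)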
    vector<QualityRecord *> last_qrs;
    for (int n=0; n< num_nets; n++)
    {   last_qrs.push_back(new QualityRecord(Globals::nn(n)->layer(Globals::nn(n)->num_layers()-1)));
        assert(last_qrs[n]);
    }

    if (quality_out)
    {   // print header for quality output file
        (*quality_out)
            << "# QRBestCostMult = " << lp->QRBestCostMult << "\n"
            << "# QRBestQMult = " << lp->QRBestQMult << "\n"
            << "# QRBestSOVMult = " << lp->QRBestSOVMult << "\n"
            << "\n"
            << "# NetActWrongWeight = " << lp->NetActWrongWeight << "\n"
            << "# NetActChangeCorrectWeight = " << lp->NetActChangeCorrectWeight << "\n"
            << "# LayWeightRateDecay = " << lp->LayWeightRateDecay << "\n"
            << "\n"
            << "# " << num_startpoints << " starting points\n"
            << "# " << iterations << " back propagations per start\n"
            << "# " << forward_samples << " forward propagates per back\n"
            << "# " << learning_rate << " initial learning rate\n"
            << "\n";
        (*quality_out) << "# sequence-recovery\n";
        for (int n=0; n<num_nets; n++)
        {   last_qrs[n]->print_header(*quality_out);
        }
    }

    double start_cputime = Globals::user_seconds();
    int num_starts = num_startpoints + num_second_round;

    for (int outer_iter=0; outer_iter<num_outer_iter; outer_iter++)
    {
        // (The declarations opening this loop were lost in extraction and
        //  are reconstructed here: second_round accumulates weighted counts
        //  from completed starts, for use as a restart profile.  The
        //  OneChain copy constructor is an assumption.)
        OneChain second_round(*Globals::design_chains[0]);
        for (int col=0; col<num_cols; col++)
        {   for (int a=0; a<num_alph; a++)
            {   second_round.counts_for_layer(0,col)[a] = 0;
            }
        }
        double sum_second_round_counts=0;

        for (int i=0; i<num_starts; i++)
        {
            double iter_start_cputime = Globals::user_seconds();
            bool use_profile = (i>=num_startpoints);
            assert(!use_profile || sum_second_round_counts>0);
            create_starting_point(num_alph, i==0? cheating_start: 0,
                    !use_profile, second_round,
                    use_profile? i/sum_second_round_counts: 0);

            // keep track of the best sequence on any iteration
            Base best_seq_for_start[num_cols];
            double best_objective_for_start=-99999.;

            // reset the learning rate to the initial learning rate
            double decaying_rate=learning_rate;

            for (int iter=0; iter < iterations; iter++)
            {
                forward_sampling(last_qrs, do_most_probable, forward_samples,
                        best_seq_for_start, best_objective_for_start);

                if (quality_out)
                {   double expected_residues_recovered=0;
                    for (int n=0; n< num_nets; n++)
                    {   expected_residues_recovered =
                            print_quality(*quality_out, Globals::design_chains[n],
                                    *(last_qrs[n]),
                                    start_cputime, iter_start_cputime, i, iter);
                    }
                    (*quality_out) << "### outer= " << outer_iter+1
                        << " start= " << i+1
                        << " iter= " << iter+1
                        << " E(%recovered)= " << 100.*expected_residues_recovered/num_cols
                        << " objective= " << best_objective_for_start
                        << "\n" << flush;
                }

                // Adjust the profile
                Globals::nn(0)->net_activation()->clear_InputPartials(InputPartials);
                for (int n=0; n< num_nets; n++)
                {   NetActivation *netact = Globals::nn(n)->net_activation();
                    netact->back_propagate(1.0);
                    netact->add_InputPartials(InputPartials);
                }

                if (ifd->UseGuide && ifd->UseAminoAcidProbs && add_guide_partials)
                {   // add partials for the 1-hot guide-sequence inputs to the
                    // partials for the profiles.
                    for (int c=0; c<num_cols; c++)
                    {   for (int a=0; a<num_alph; a++)
                        {   InputPartials[c][a+ifd->profile_first_num()] +=
                                InputPartials[c][a+ifd->guide_first_num()];
                        }
                    }
                }

                if (use_rprop)
                {   Globals::design_chains[0]->AdjustWithRprop(InputPartials);
//                  logfile << "DEBUG: profile adjusted by rprop\n" << flush;
                }
                else
                {   Globals::design_chains[0]->AdjustToGradientDescent(decaying_rate, InputPartials);
//                  logfile << "DEBUG: profile adjusted by backprop\n" << flush;
                }

                if (pseudocount>0)
                {   // add the pseudocount to every profile entry and renormalize
                    for (int c=0; c<num_cols; c++)
                    {   double sum=0;
                        for (int a=0; a<num_alph; a++)
                        {   Globals::design_chains[0]->profile(c)[a] += pseudocount;
                            sum += Globals::design_chains[0]->profile(c)[a];
                        }
                        for (int a=0; a<num_alph; a++)
                        {   Globals::design_chains[0]->profile(c)[a] /= sum;
                        }
                    }
                }

                // copy profiles to other design chains.
                for (int n=0; n<num_nets; n++)
                {   for (int c=0; c<num_cols; c++)
                    {   Globals::design_chains[n]->set_profile(c, Globals::design_chains[0]->profile(c));
                    }
                }

                decaying_rate *= lp->LayWeightRateDecay;
            }

            // add to second-round profile, based on objective
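            // (Completed starts are folded in with weight proportional to
            //  their objective, and pow(1.2, i-num_startpoints) grows
            //  geometrically, e.g. the 4th restart counts about
            //  1.2^4 ~ 2.07 times a first-round start, so later,
            //  presumably better, sequences dominate the restart profile.)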
            double growing_weight = i>num_startpoints? pow(1.2, i-num_startpoints): 1;
            double add_weight = best_objective_for_start<=0?
                    0.001: best_objective_for_start*growing_weight;
            for (int col=0; col<num_cols; col++)
            {   // (the body of this loop was lost in extraction; it
                //  evidently adds the best sequence for this start,
                //  weighted by add_weight, to the second-round counts)
                second_round.counts_for_layer(0,col)[best_seq_for_start[col].raw_int()]
                    += add_weight;
            }
            sum_second_round_counts += add_weight;

            Globals::design_chains[0]->set_guide(0, best_seq_for_start);
            if (quality_out || predict_out)
            {   for (int n=0; n< num_nets; n++)
                {   forward_prop_one_design(Globals::nn(n),
                            Globals::design_chains[n], *(last_qrs[n]));
                }
            }

            // (a log line beginning here was lost in extraction; it
            //  presumably echoed the outer iteration and start number)
            logfile << "# outer " << outer_iter+1 << " start " << i+1;
            logfile << " objective= " << best_objective_for_start << "\n" << flush;

            if (num_outer_iter<=1 || i==num_starts-1 || num_second_round==0)
            {   // Output 1-hot input
                *design_out << ">Starting_point_" << outer_iter+1 << "_" << i+1 << endl;
                Globals::design_chains[0]->print_guide_sequence_0(*design_out);
                *design_out << flush;
            }

            if (quality_out)
            {   for (int n=0; n< num_nets; n++)
                {   print_quality(*quality_out, Globals::design_chains[n], *(last_qrs[n]),
                            start_cputime, iter_start_cputime, i, iterations);
                }
                (*quality_out) << "\n###\n" << flush;
            }

            if (predict_out)
            {   // Output the predicted sequences
                for (int n=0; n< num_nets; n++)
                {   NeuralNet * NN=Globals::nn(n);
                    NetActivation *netact = NN->net_activation();
                    int num_lay= NN->num_layers();
                    AlphabetTuple *alpha_out = NN->interface(num_lay)->Alpha;

                    *predict_out << ">Starting_point_" << i+1 << "_net_" << n << endl;
                    for (int col=0; col<num_cols; col++)
                    {   int index = netact->record(num_lay-1,col)->highest_prob_output();
                        alpha_out->print_unindex(*predict_out, index);
                        if (col%50==0 && col>0) {*predict_out << endl;}
                    }
                    *predict_out << endl << flush;
                }
            }
        }
    }

    // Clean up the mess :)
    for (int col=0; col<num_cols; col++)
    {   delete [] InputPartials[col];
    }
    delete [] InputPartials;
    for (int n=0; n<num_nets; n++)
    {   delete last_qrs[n];
    }
    return 1;
}

// Parse the Design1st command's keyword arguments, validate them, open
// the output files, and run design1st_alg.
// (The opening of this function, through the first keyword test, was
//  lost in extraction; it is reconstructed here from the help string
//  and the echo code below.)
static int Design1st(istream& in, Command* self, ostream &logfile)
{
    char buffer[500];
    char filename_design[500];  filename_design[0]=0;
    char filename_quality[500]; filename_quality[0]=0;
    char filename_pred[500];    filename_pred[0]=0;

    // defaults as documented in the help string
    int num_startpoints=5;
    int num_iterations=10;
    int num_restart=20;
    int num_outer_iter=1;
    float cheating_start=0;
    bool do_most_probable=false;
    bool add_guide_partials=false;
    bool use_rprop=false;
    float pseudocount=0.;
    double forward_samples=1.0;
    float learning_rate=4.0;

    for (get_word(in,buffer,'\n',0); buffer[0]; get_word(in,buffer,'\n',0))
    {
        if (EqualStrings(buffer,"start",1)) {in>>num_startpoints;}
        else if (EqualStrings(buffer,"iter",1)) {in>>num_iterations;}
        else if (EqualStrings(buffer,"cheating_start",1)) {in>>cheating_start;}
        else if (EqualStrings(buffer,"most_probable",1)) {do_most_probable=true;}
        else if (EqualStrings(buffer,"add_guide_partials",1)) {add_guide_partials=true;}
        else if (EqualStrings(buffer,"rprop",1)) {use_rprop=true;}
        else if (EqualStrings(buffer,"backprop",1)) {use_rprop=false;}
        else if (EqualStrings(buffer,"pseudocount",1)) {in>>pseudocount;}
        else if (EqualStrings(buffer,"forward",1)) {in>>forward_samples;}
        else if (EqualStrings(buffer,"restart",1)) {in>>num_restart;}
        else if (EqualStrings(buffer,"outer_iter",1)) {in>>num_outer_iter;}
        else if (EqualStrings(buffer,"rate",1)) {in>>learning_rate;}
        else if (EqualStrings(buffer,"seqs",1)) {get_word(in,filename_design,'\n',0);}
        else if (EqualStrings(buffer,"qual",1)) {get_word(in,filename_quality,'\n',0);}
        else if (EqualStrings(buffer,"pred",1)) {get_word(in,filename_pred,'\n',0);}
        else if (EqualStrings(buffer,"\\",1)) {SkipSeparators(in,1,'\n');}
        else
        {   logfile << "Error: in " << self->name()
                << " unknown keyword '" << buffer
                << "'----skipping command.\n";
            SkipSeparators(in, 1, '\n');
            return 1;
        }
    }
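    // (The statements below echo the parsed arguments back to the log in
    //  a form that could be re-read as a command, using a backslash
    //  continuation before the filename arguments.)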
" add_guide_partials" : "") << " pseudocount " << pseudocount << " \\\n" << "# "; if (filename_design[0]!=0) logfile << " seqs " << filename_design; if (filename_pred[0]!=0) logfile << " pred " << filename_pred; if (filename_quality[0]!=0) logfile << " qual " << filename_quality; logfile << "\n" << flush; // Validate the input if ( num_outer_iter<=0 || (num_startpoints <= 0 && cheating_start<=0) || num_iterations <= 0 || (!do_most_probable && forward_samples<=0) ) { logfile << "# ERROR: The number of outer iterations (" << num_outer_iter << "), "; if (!cheating_start) logfile << "start points (" << num_startpoints << "), "; logfile << "iterations (" << num_iterations << "), "; if (!do_most_probable) logfile << "forward samples (" << forward_samples << ")\n" << "# must all be greater than zero.\n"; return 1; } if ( learning_rate <= 0 ) { logfile << "# ERROR: The learning rate (" << learning_rate << ") must be a positive floating point value.\n" << flush; return 1; } if (!Globals::nn()) { logfile << "# ERROR: A neural network must be read " << "before designing primary structure!\n"; return 1; } if (Globals::design_chains.size()==0) { logfile << "# ERROR: A secondary structure must be read " << "before designing primary structure! (use DesignTo)\n"; return 1; } // Check that the net meets the assertions in the further calls const InterfaceDescription *ifd = Globals::nn()->interface(0); if ( !ifd && ifd->Alpha==NULL && !ifd->UseAminoAcidProbs && !ifd->UseGuide && ifd->UseComponentProbs && ifd->NetRegularizer && ifd->UseLogOdds ) { logfile << "# ERROR: The neural net must meet the following requirements for the first layer:\n" << "# * A alphabet is used\n" << "# * Probabilities and a guidesequence over the input alphabet is used\n" << "# * A net regularizer, component probabilities, and logodds are not used\n" ; return 1; } // Open the filestreams ofstream *design_out= NULL, *quality_out=NULL, *pred_out=NULL; if (filename_design[0]!=0) { design_out = Filenames::open_output(filename_design); if (design_out==NULL) { logfile << "# ERROR: Couldn't open " << filename_design << " for design output.\n"; } } if (filename_pred[0]!=0) { pred_out = Filenames::open_output(filename_pred); if (pred_out==NULL) { logfile << "# ERROR: Couldn't open " << filename_pred << " for predicted output.\n"; } } if (filename_quality[0]!=0) { quality_out = Filenames::open_output(filename_quality); if (quality_out==NULL) { logfile << "# ERROR: Couldn't open " << filename_quality << " for quality output.\n"; } } int result = design1st_alg(cheating_start, num_startpoints, num_iterations, do_most_probable, forward_samples, learning_rate, use_rprop, add_guide_partials, pseudocount, num_restart, num_outer_iter, logfile, design_out==NULL? 
(&logfile): design_out, pred_out, quality_out); delete design_out; delete quality_out; delete pred_out; return result; } static Command Design1stCommand("Design1st", Design1st, "The command for designing primary structure.\n\ The arguments are keywords:\n\ outer_iter 1 # of independent optimizations run\n\ cheating_start 0 if non-zero, do one run starting from the correct sequence\n\ with weight of sequence = to argument\n\ start 5 # of random starting points for each optimization\n\ restart 20 number of starts based on previous results\n\ (after the random starts are done)\n\ iter 10 # of iterations of back propagation\n\ for each start\n\ (note: the number of back propagations is outer_iter*(start+restart)*iter\n\ or outer_iter*(1+start+restart)*iter, if cheating_start)\n\ forward 1.0 # of sampled forward propagations\n\ for each back propagation\n\ most_probable do the most probable sequence based on the profile\n\ (in addition to the forward samples)\n\ rate 4.0 learning rate for back propagation\n\ seqs filename file for FASTA file of sequences\n\ (defaults to the log file)\n\ qual filename file for quality report on each back propagation\n\ (defaults to no output)\n\ pred filename file for FASTA file of predicted outputs\n\ (defaults to no output)\n\ backprop use standard gradient-descent backpropagation \n\ algorithm (default)\n\ rprop use resilient backpropagation algorithm\n\ add_guide_partials add the partials for the 1-hot guide inputs\n\ to the partials for the profile inputs\n\ pseudocount 0.0 Add a small flat background to the profiles,\n\ to prevent extinction of probabilties\n\ \n\ A network must first have been read with 'ReadNeuralNet'.\n\ A desired output must have been read with the command 'DesignTo'.\n\ The learning rate is allowed to decay on each iteration, \n\ multiplied by the LayWeightRateDecay learning parameter.\n\ \n\ The initial learning rate * the learning parameter NetActWrongWeight\n\ should probably be about 4.\n\ \n\ The number of forward samples per iteration may be non-integer---\n\ any fractional part results in probabilistic decisions on whether to\n\ do an extra sample.\n\ \n\ "); static int FreezeDesign(istream& in, Command* self, ostream &logfile) { char buffer[500]; if (Globals::design_chains.size()==0) { logfile << "# ERROR: A secondary structure must be read " << " before " << self->name() << " (use DesignTo)\n"; SkipSeparators(in, 1, '\n'); return 1; } int num_cols = Globals::design_chains[0]->num_cols(); Globals::initialize_freeze(); logfile << "# Freezing columns "; for(get_word(in,buffer, '\n', 0); buffer[0]; get_word(in,buffer, '\n', 0)) { if (EqualStrings(buffer,"\\",1)) {SkipSeparators(in,1,'\n');} else { int col = atoi(buffer)-1; if (col <0 || col >= num_cols) { logfile << "# ERROR: " << self->name() << " col " << col-1 << " out of range;" << " design sequence has " << num_cols << "\n"; } else { Globals::FreezeTheseColumns[col] = true; logfile << " " << Globals::design_chains[0]->guide_alphabet()->unindex( Globals::design_chains[0]->structure(0,col)) << col+1 ; } } } logfile << "\n"; return 1; } static Command FreezeDesignCommand("FreezeDesign", FreezeDesign, "This command takes any number of numeric arguments.\n\ Each argument specifies one column that should be copied from the\n\ 'correct' sequence read in by the preceding DesignTo command.\n\ The first column is numbered 1.\n\ The command may be continued onto subsequent lines by using a \\ \n\ at the end of the line.\n\ With no arguments, clears previous FreezeDesign setting.\n\ "); 
static int ChangeDesign(istream& in, Command* self, ostream &logfile) { char buffer[500]; if (Globals::design_chains.size()==0) { logfile << "# ERROR: A secondary structure must be read " << " before " << self->name() << " (use DesignTo)\n"; SkipSeparators(in, 1, '\n'); return 1; } Globals::initialize_change(); logfile << "# Changing columns "; int num_cols = Globals::design_chains[0]->num_cols(); for(get_word(in,buffer, '\n', 0); buffer[0]; get_word(in,buffer, '\n', 0)) { if (EqualStrings(buffer,"\\",1)) {SkipSeparators(in,1,'\n');} else { int col = atoi(buffer)-1; if (col <0 || col >= num_cols) { logfile << "# ERROR: " << self->name() << " col " << col-1 << " out of range;" << " design sequence has " << num_cols << "\n"; } else { Globals::ChangeTheseColumns[col] = true; logfile << " " << Globals::design_chains[0]->guide_alphabet()->unindex( Globals::design_chains[0]->structure(0,col)) << col+1 ; } } } logfile << "\n"; return 1; } static Command ChangeDesignCommand("ChangeDesign", ChangeDesign, "This command takes any number of numeric arguments.\n\ Each argument specifies one column that must be changed from the\n\ 'correct' sequence read in by the preceding DesignTo command.\n\ The first column is numbered 1.\n\ The command may be continued onto subsequent lines by using a \\ \n\ at the end of the line.\n\ With no arguments, clears previous ChangeDesign setting.\n\ "); static int ReadSequenceGenerator(istream& in, Command* self, ostream &logfile) { delete Globals::SequenceGenerator; Globals::SequenceGenerator = NULL; char filename[500]; get_word(in, filename, '\n'); if (!filename[0]) { logfile << "# Not using a sequence generator\n"; return 1; } gzifstream *reg_in = Filenames::open_input(filename); if (!reg_in) return 0; Regularizer *r= Regularizer::read_new(*reg_in); delete reg_in; if (!r || !r->is_a(DirichletReg::classID())) { logfile << "# ERROR: no DirichletReg found in " << filename << "\n" << "# Not using a sequence generator\n"; return 1; } double mean=237; double std_dev=175; get_word(in, filename, '\n'); if (filename[0] !=0) { mean=atof(filename); logfile << "# DEBUG: Setting mean to " << filename << " = " << mean << "\n" << flush; get_word(in, filename, '\n'); if (filename[0] !=0) { std_dev = atof(filename); } } Globals::SequenceGenerator = new GenSequence( static_cast(r), mean,std_dev); delete r; logfile << "# SequenceGenerator set up\n"; return 1; } static Command ReadSequenceGeneratorCommand("ReadSequenceGenerator", ReadSequenceGenerator, "This command takes up to three arguments:\n\ a filename for reading in a Dirichlet mixture regularizer (required)\n\ a mean sequence length (optional, default=237)\n\ a standard deviation of sequence length (optional, default=175)\n\ The regularizer should be one trained to model the compositions of\n\ sequences over the input alphabet of the neural nets.\n\ The regularizer (and the optional parameters) are used to create a\n\ random-sequence generator which can mimic the biased compositions\n\ found in real sequences.\n\ "); // CHANGE LOG: // Sun Jul 10 17:02:42 PDT 2005 Kevin Karplus // Moved routines from TrainingCommands.cc // Sun Jul 10 17:10:34 PDT 2005 Kevin Karplus // Changed second round from a single run to cumulative profile accumulation. 
// Sun Jul 10 17:20:23 PDT 2005 Kevin Karplus
//      Added output of objective to logfile
// Sun Jul 10 17:30:30 PDT 2005 Kevin Karplus
//      Added weighting (by objective) for creating profile of
//      second-round counts
// Sun Jul 10 21:32:55 PDT 2005 Kevin Karplus
//      added growing weight for objective when creating second-round profile
// Mon Jul 11 08:11:47 PDT 2005 Kevin Karplus
//      added cumulative cpu_time as well as cpu_time for iteration to print_quality
// Mon Jul 11 11:15:03 PDT 2005 Kevin Karplus
//      Added check for negative objective in creating second-round profile
// Mon Jul 11 14:09:35 PDT 2005 Kevin Karplus
//      added cheating_start to design1st
// Thu Jul 14 18:01:04 PDT 2005 Kevin Karplus
//      added outer_iter
// Thu Jul 14 21:13:12 PDT 2005 Kevin Karplus
//      rearranged code to separate out create_starting_point
// Thu Jul 14 21:37:45 PDT 2005 Kevin Karplus
//      added Globals::FreezeTheseColumns to create_starting_point
//      added FreezeDesign command
// Thu Jul 14 23:23:41 PDT 2005 Kevin Karplus
//      Replaced clock() calls with Globals::user_seconds, which doesn't
//      wrap around.
// Fri Jul 15 12:30:08 PDT 2005 Kevin Karplus
//      Increased growth rate for adding to second round, so that more
//      recent sequences have more importance.
//      Decreased weight of counts for making profile from second round,
//      so that profiles don't get too specialized.
// Fri Jul 15 21:41:55 PDT 2005 Kevin Karplus
//      Changed check on validity to allow 0 starts if cheating_start.
// Sun Jul 17 09:45:04 PDT 2005 Kevin Karplus
//      fixed create_starting_point to use copy_probvectors
// Tue Jul 19 11:28:56 PDT 2005 Kevin Karplus
//      Improved feedback from design1st error-checking (echoing bad parameters)
// Thu Jul 21 18:33:18 PDT 2005 Kevin Karplus
//      Added ReadSequenceGenerator
// Sun Feb  5 02:16:57 PST 2006 Kevin Karplus
//      Added sequence output on every start if num_second_round==0
// Sun Feb  5 02:22:38 PST 2006 Kevin Karplus
//      Added output of overall objective as summary to quality_out
// Sun Feb  5 02:59:20 PST 2006 Kevin Karplus
//      Added ChangeDesign command
// Sun Feb  5 03:27:10 PST 2006 Kevin Karplus
//      Added freezing and changing to forward_sampling (not just create_starting_point)
// Sun Feb  5 04:24:26 PST 2006 Kevin Karplus
//      Echoed frozen and changed columns in FreezeDesign and ChangeDesign
// Sun Feb  5 12:02:22 PST 2006 Kevin Karplus
//      Created fix_profiles_for_changes to remove duplicate code and
//      correct handling of changes in forward_sampling.
//      Also, fixed bug in which position in profile was zeroed.
// Mon Mar 13 13:32:30 PST 2006 Kevin Karplus
//      Added "rprop" option to Design1st
// Wed Mar 15 17:07:05 PST 2006 Kevin Karplus
//      added E(%residues recovered) to summary output comments
// Thu Mar 16 17:00:07 PST 2006 Kevin Karplus
//      Doubled weight for second round counts in call to create_starting_point.
// Sat Mar 18 15:29:59 PST 2006 Kevin Karplus
//      Added add_guide_partials
// Sat Mar 18 16:20:37 PST 2006 Kevin Karplus
//      Added echo of rprop and add_guide_partials
// Sat Mar 18 16:28:34 PST 2006 Kevin Karplus
//      Added pseudocount parameter to design1st