// ReadCommands.cc
// Kevin Karplus
// 22 Nov 1995
// Commands for reading alignments and correct outputs for neural nets.

// Note: there is no .h file for this .cc file, since the commands
//	are automatically entered in the command table when they
//	are declared.

#include <iostream>
#include <stdlib.h>	// for atof
#include "AlphabetTuple/AlphabetTuple.h"
#include "Command/Command.h"
#include "Input/Input.h"
#include "PrintInColumns/PrintInColumns.h"
#include "Filenames/Filenames.h"
#include "Regularizer/BackgroundProbs.h"
#include "OneChain.h"
#include "SeqWeight.h"
#include "Globals.h"
#include "InterfaceDescription.h"
#include "alignment.h"
#include "TrainSet.h"
#include "NeuralNet.h"
#include "NetActivation.h"
#include "ActivationRecord.h"

// To Do:
//	Fix read_sequence to be able to read sequences of unknown length,
//	then fix DesignTo so that it does not need to know the length.

// Sat Jun 25 06:36:40 PDT 2005 Kevin Karplus
//	I believe that WeightingRegularizer, ReRegularizer, and SequenceWeight
//	commands are all obsolete---these features of the input interface
//	should be obtained from a seed network read in with ReadNeuralNet

#ifdef WEIGHTING_REGULARIZER_CODE
static int EstWeightingRegularizer(istream& in, Command* self, ostream& logfile)
{
    char filename[500];
    get_word(in, filename, '\n');
    if (!filename[0])
    {   Globals::nn()->interface(0)->WeightingRegularizer = 0;
        logfile << "# Not using a weighting regularizer\n";
        return 1;
    }
    gzifstream *reg_in = Filenames::open_input(filename);
    if (!reg_in) return 0;
    Regularizer *r = Regularizer::read_new(*reg_in);
    delete reg_in;
    if (!r) return 0;
    if (!r->name()) r->set_name(filename);
    Globals::nn()->interface(0)->WeightingRegularizer = r;
    logfile << "# Weighting regularizer set to " << r->name() << "\n";
    return 1;
}

static Command WeightingRegularizerCommand("WeightingRegularizer",
        EstWeightingRegularizer,
"takes one optional argument, the filename for a regularizer.\n\
\n\
This is an obsolete command, supported in the newer version of the code\n\
to maintain compatibility with old scripts.\n\
In the newer code version, weighting regularizers are specified in the\n\
Interface descriptions.\n\
This regularizer will be used while setting sequence weights, and\n\
so should be a Dirichlet mixture or pseudocounts regularizer\n\
(DirichletReg, MLZPReg, MLZReg).\n\
\n\
If the command is given with no argument, then no weighting regularizer\n\
is used, which makes the EntropyWeight option unusable.\n\
HenikoffWeight and FlatWeight are still usable, but only if the first\n\
argument is set negative to indicate that weighting is not based on\n\
bits saved.\n\
");
#endif
#ifdef REREGULARIZER_CODE
static int PredReRegularizer(istream& in, Command* self, ostream& logfile)
{
    char filename[500];
    get_word(in, filename, '\n');
    if (!filename[0])
    {   delete Globals::nn()->interface(0)->ReRegularizer;
        Globals::nn()->interface(0)->ReRegularizer = 0;
        logfile << "# Not using a re-encoding regularizer\n";
        return 1;
    }
    gzifstream *reg_in = Filenames::open_input(filename);
    if (!reg_in) return 0;
    Regularizer *r = Regularizer::read_new(*reg_in);
    delete reg_in;
    if (!r) return 0;
    // Add a test HERE that r really is a DirichletReg
    if (!r->name()) r->set_name(filename);
    delete Globals::nn()->interface(0)->ReRegularizer;
    Globals::nn()->interface(0)->ReRegularizer = (DirichletReg*) r;
    logfile << "# Re-encoding regularizer set to " << r->name() << "\n";
    return 1;
}

static Command ReRegularizerCommand("ReRegularizer", PredReRegularizer,
"takes one optional argument, the filename for a regularizer.\n\
\n\
This regularizer will be used for producing the \"correct\" distribution from\n\
a count vector and for re-encoding inputs into component probabilities.\n\
The regularizer must be a Dirichlet mixture.\n\
\n\
If the command is given with no argument, then no re-regularizer is\n\
used, and the \"correct\" distribution from a count vector is then taken\n\
to be just the normalization of the count vector to sum to 1.\n\
");
#endif

#ifdef SEQUENCE_WEIGHT_CODE
static int EstSequenceWeight(istream& in, Command* self, ostream& logfile)
{
    char word[100];
    get_word(in, word);
    const SequenceWeightObject *w = SequenceWeightObject::find_name(word);
    if (!w)
    {   cerr << "Error: Unknown SequenceWeight scheme " << word << "\n"
            << "Known schemes are " << "\n";
        const char* Names[100];
        int num = AccumulateNames(SequenceWeightObject::weighter_table(), Names);
        PrintNames(cerr, Names, num);
        cerr << flush;
        return 0;
    }
    Globals::nn()->interface(0)->SequenceWeighter = w;

    get_word(in, word, '\n', 0);
    Globals::nn()->interface(0)->SequenceWeightBitsToSave =
        word[0]? atof(word) : 1.0;
    if (word[0]) get_word(in, word, '\n', 0);
    Globals::nn()->interface(0)->SequenceWeightParam =
        word[0]? atof(word) : 1.0;

    logfile << "# Sequence weighting on input set to " << w->name()
        << "(" << Globals::nn()->interface(0)->SequenceWeightBitsToSave
        << ", " << Globals::nn()->interface(0)->SequenceWeightParam
        << ")\n";
    return 1;
}

static Command SequenceWeightCommand("SequenceWeight", EstSequenceWeight,
"has one required argument: a weighting scheme name,\n\
and two optional arguments used as parameters for the weighting scheme.\n\
\n\
This is an obsolete command, supported in the newer version of the code\n\
to maintain compatibility with old scripts.\n\
In the newer code version, sequence weights are specified in the\n\
Interface descriptions.\n\
The first optional argument is the number of bits to save compared to\n\
the background distribution specified by the weighting regularizer (flat\n\
distribution if no weighting regularizer).\n\
\n\
If the bits to save is negative, then the weights are not adjusted to\n\
save a specified number of bits, but the initial weighting is used\n\
without adjustment.\n\
\n\
Weighting schemes:\n\
    FlatWeight      gives same weight for every sequence in a block\n\
    HenikoffWeight  uses Henikoff position-specific weighting\n\
    EntropyWeight   weights relative to entropy of characters\n\
                    in alignment columns\n\
\n\
For FlatWeight and HenikoffWeight, initial total weight for a block is\n\
pow(number_of_sequences, second param), so 1.0 sets the total weight\n\
to the number of sequences and 0.0 sets the total weight for the block\n\
to one.\n\
\n\
For EntropyWeight, the second param specifies the power of the entropy\n\
to be used for relative weighting.\n\
\n\
If no weighting scheme is specified with a SequenceWeight command,\n\
then flat weights of 1 are used.");
#endif
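// Example (script syntax inferred from the help string above; the
// numbers are illustrative only):
//	SequenceWeight HenikoffWeight 1.0 0.5
// requests Henikoff position-specific weighting, saving 1.0 bits
// relative to the background distribution, with initial total block
// weight pow(number_of_sequences, 0.5), e.g. a 16-sequence block
// starts with total weight pow(16, 0.5) = 4.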
static int PredReportWeights(istream& in, Command* self, ostream& logfile)
{
    char filename[500];
    get_word(in, filename, '\n');
    if (!filename[0])
    {   Globals::ReportSequenceWeightsTo = 0;
        logfile << "# Not reporting sequence weights\n";
        return 1;
    }
    if (Globals::ReportSequenceWeightsTo)
    {   Globals::ReportSequenceWeightsTo->close();
        delete Globals::ReportSequenceWeightsTo;
        Globals::ReportSequenceWeightsTo = 0;
    }
    ofstream *reg_out = Filenames::open_output(filename);
    if (!reg_out) return 0;
    Globals::ReportSequenceWeightsTo = reg_out;
    logfile << "# Sequence weights will be reported to " << filename << "\n";
    return 1;
}

static Command ReportWeightsCommand("ReportWeights", PredReportWeights,
"takes one optional argument, the filename to output sequence weights to.\n\
\n\
Note: the sequence weights are reported for all SUBSEQUENT a2m\n\
files that are read, since sequence weighting is done on input\n\
to get the count columns and then discarded.\n\
\n\
If the command is given with no argument, then no sequence weights\n\
will be output.\n\
");

static void PrintWeightingScheme(ostream &logfile)
{
    if (Globals::nn()->interface(0)->SequenceWeighter)
    {   logfile << "# Using SequenceWeight "
            << Globals::nn()->interface(0)->SequenceWeighter->name()
            << "(" << Globals::nn()->interface(0)->SequenceWeightBitsToSave
            << ", " << Globals::nn()->interface(0)->SequenceWeightParam
            << ")\n";
        if (Globals::nn()->interface(0)->WeightingRegularizer)
            logfile << "# Using regularizer "
                << Globals::nn()->interface(0)->WeightingRegularizer->name()
                << " for sequence weight regularizer.\n";
    }
    else
    {   logfile << "# No weighting\n";
    }
    logfile << flush;
}

static int PredReadToWhichSet(istream& in, Command* self, ostream& logfile)
{
    char word[500];
    get_word(in, word);
    if (EqualStrings(word, "train", 1))
    {   if (!Globals::training) Globals::training = new TrainSet;
        Globals::which_set_to_add_to = Globals::training;
    }
    else if (EqualStrings(word, "cross-train", 1))
    {   if (!Globals::cross_training) Globals::cross_training = new TrainSet;
        Globals::which_set_to_add_to = Globals::cross_training;
    }
    else if (EqualStrings(word, "test", 1))
    {   if (!Globals::testing) Globals::testing = new TrainSet;
        Globals::which_set_to_add_to = Globals::testing;
    }
    else if (EqualStrings(word, "predict", 1))
    {   logfile << "# Warning: deprecated argument predict for ReadToSet "
            << "associated with deprecated command ReadInput. "
            << "Instead, use ReadForPredict.\n" << flush;
        return 1;
    }
    else
    {   logfile << "# Error: can't choose " << word << " to add input to, only "
            << "train, cross-train, or test.\n"
            << "Possibly you wanted deprecated default value predict, "
            << "for deprecated command ReadInput. Instead, use ReadForPredict.\n"
            << flush;
        return 0;
    }

    // Output counts are accumulated in the TrainSet class and here is
    // where the accumulation data structure is initialized.
    Globals::which_set_to_add_to->initialize_accumulation(Globals::nn());
    return 1;
}

static Command ReadToSetCommand("ReadToSet", PredReadToWhichSet,
"takes one argument, one of\n\
    train\n\
    cross-train\n\
    test\n\
to specify which data set subsequent ReadTrain commands\n\
will be directed to.\n\
The train data is used to set parameters of the neural network.\n\
The cross-train data is used to monitor learning and control learning-rate\n\
parameters.\n\
The test data is used to validate the network, but is not used to train it.\n\
\n\
Note: the following usage is deprecated:  ReadToSet predict; ReadInput\n\
instead use: ReadForPredict\n\
The predict data does not have known correct answers, and is used for\n\
making predictions.\n\
");
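// Example (script syntax inferred from the help string above; the
// filenames are hypothetical):
//	ReadToSet cross-train
//	ReadTrain t0281.a2m t0281.ehl
// directs the alignment and its training data to the cross-training
// set instead of the default training set.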
static int PredReadForPredict(istream& in, Command* self, ostream& logfile)
{
    char filename[500];
    get_word(in, filename);
    char *full_name = Filenames::full_in_filename(filename);
    logfile << "# Reading A2M format from " << full_name << "\n" << flush;
    alignment A;
    A.read_a2m(full_name, Globals::nn()->interface(0)->Alpha);
    assert(A.Filename != NULL);

    PrintWeightingScheme(logfile);
    if (Globals::nn()->interface(0)->SequenceWeighter)
    {   Globals::nn()->interface(0)->SequenceWeighter->execute(A, 0,
            Globals::ReportSequenceWeightsTo);
    }
    if (!Globals::predicting) Globals::predicting = new TrainSet;
    Globals::predicting->add_chain(new OneChain(
            Globals::nn()->interface(0)->ReRegularizer,
            A, Globals::nn(), A.names[0]));
    logfile << "# After reading " << filename
        << ", have " << Globals::predicting->num_cols()
        << " columns in " << Globals::predicting->num_chains()
        << " chains\n";
    delete [] full_name;
    return 1;
}

static Command ReadForPredictCommand("ReadForPredict", PredReadForPredict,
"takes one argument, a filename for an A2M format file.\n\
\n\
The alignment is added to the predicting TrainSet.\n\
To clear the predicting TrainSet, see the ClearPrediction command.\n\
\n\
All commands determining how the input is to be handled should be\n\
given before the ReadForPredict command, since the alignment is discarded after\n\
the necessary information has been extracted.\n\
See commands WeightingRegularizer, SequenceWeight, ReportWeights.\n\
\n\
The A2M format is a FASTA file (names are the first word on a line\n\
starting with '>') in which uppercase letters and '-' are treated as\n\
alignment columns, and lowercase letters and '.' are treated as\n\
insertion columns.  Only the alignment columns are read, though the\n\
number of sequences with an insertion is noted.\n\
\n\
Note: unlike the deprecated ReadInput command, ReadForPredict is NOT preceded\n\
by a ReadToSet command.\n\
");

static int PredReadInput(istream& in, Command* self, ostream& logfile)
{
    logfile << "# Warning: deprecated command ReadInput. Use ReadForPredict.\n"
        << flush;
    return PredReadForPredict(in, self, logfile);
}

static Command ReadInputCommand("ReadInput", PredReadInput,
"This command is deprecated.  Instead use ReadForPredict.\n\
\n\
The old usage equivalent to ReadForPredict required two commands:\n\
    ReadToSet predict\n\
    ReadInput a2mfilename\n\
\n\
");

static int PredReadA2m(istream& in, Command* self, ostream& logfile)
{
    logfile << "# Warning: deprecated command ReadA2m. Use ReadForPredict.\n"
        << flush;
    return PredReadForPredict(in, self, logfile);
}

static Command ReadA2mCommand("ReadA2m", PredReadA2m,
"This command is deprecated.  Instead use ReadForPredict.\n\
\n\
The old usage equivalent to ReadForPredict was:\n\
    ReadA2m a2mfilename\n\
");
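// Illustrative A2M fragment (made up for this comment) showing which
// characters are read as alignment columns:
//	>seq1 example
//	MKV-laD.
//	>seq2 example
//	MRV-..De
// Both sequences contribute the same 5 alignment columns (the uppercase
// letters and '-'); the lowercase letters and '.' are insertion
// columns, which are only counted, not read.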
// Reads a character string from filename and interprets it as a
// string of training data with one BaseTuple per position.
// There should be "length" tuples for AlphabetTuple at.
// For secondary structure, "at" is usually the EHL alphabet.
//
static short int *read_sequence(const char* filename, AlphabetTuple *at,
        int length)
{
    char c;
    int num_read = 0;
    BaseTuple bt(*at);
    short int *ret_array = new short int[length];
    // int pre_peek;

    gzifstream *in = Filenames::open_input(filename);
    if (in==NULL || !in->good())
    {   cerr << "Error: Can't open structure file " << filename
            << " for input\n" << flush;
        delete [] ret_array;
        delete in;
        return 0;
    }

    for ((*in) >> c; in->good() && !in->eof(); (*in) >> c)
    {   switch (c)
        {   case ' ':
            case '\t':
            case '\n':
                continue;       // ignore white space
            case '>':
            case '#':           // skip id lines and comments
                SkipSeparators((*in), 1, '\n');
                continue;
            default:
                // UGLY CODE, reads character both as character
                // and as BaseTuple:
                if (c=='x')
                {   // unaligned wild card, ignore it.
                }
                else if (islower(c))
                {   // unaligned, not wild card
#ifdef WARN_LOWER_CASE
                    cerr << "WARNING: skipping " << c
                        << ", since it is lower case.\n";
#endif
                }
                else if (num_read >= length)
                {   num_read++; // don't write past end of array
                }
                else
                {   // Real character or gap.
                    in->putback(c);
                    (*in) >> bt;
                    // If we come across a gap character in the structure,
                    // it tells us that that position in the structure has
                    // no correct answer - that it's a position to be skipped
                    // over in both training and testing.  Mark it with a
                    // negative value.
                    // Treat wildcards as unpredictable also.
                    ret_array[num_read++] = bt.is_normal()? at->index(bt) : -1;
                }
        }
    }
    delete in;

    if (num_read != length)
    {   cerr << "Error: " << filename << " was " << num_read
            << " not " << length << "\n"
            << "    Reading structure string from " << filename << ",\n"
            << "    should have " << length << " columns (based on alignment),"
            << " but read " << num_read << " instead.\n"
            << flush;
        delete [] ret_array;
        return 0;
    }
    // cerr << "OK: " << filename << " read as sequence\n" << flush;
    return ret_array;
}

// Read a sequence or alignment into an interface layer of a OneChain record.
// Returns 1 if read ok (or nothing to read for this layer).
// Returns 0 on any error.
static bool ReadTrainingData(istream& in, Command* self, ostream& logfile,
        OneChain *chain, int inter,
        InterfaceDescription::FormatType input_format)
{
    const NeuralNet *NN = chain->nn();
    const InterfaceDescription *ifd = NN->interface(inter);
    if (inter>0 && ifd->is_hidden())
        return 1;       // nothing to train to on hidden layer

    char filename[500];
    // skip any "\" continuation marks before the filename
    for (get_word(in, filename); EqualStrings(filename, "\\");
            get_word(in, filename))
    {}

    if (!ifd->Alpha)
    {   logfile << "# Error: training chain requires an AlphabetTuple\n"
            << "# to interpret sequence, but none supplied for interface layer "
            << inter << "\n"
            << "# Chain not properly constructed.\n";
        return 0;
    }

    switch (input_format)
    {
    case InterfaceDescription::SEQUENCE:
    {   logfile << "# Reading sequence for layer " << inter
            << " from " << filename << "\n" << flush;
        short int *sequence =
            read_sequence(filename, ifd->Alpha, chain->num_cols());
        if (!sequence)
        {   logfile << "# Error: Skipping " << filename
                << " because of problems reading sequence.\n";
            return 0;
        }

        short int *tuple_sequence = sequence;
        if (ifd->TupleStart!=0 || ifd->TupleStop!=0)
        {   // We need to redefine tuple_sequence, since we are
            // taking tuples that are not just one-long in
            // the standard position.
            int start = ifd->TupleStart;
            int stop  = ifd->TupleStop;
            int alphsize = ifd->Alpha->num_normal();
            int num_cols = chain->num_cols();
            tuple_sequence = new short int[num_cols];
            for (int k=0; k<chain->num_cols(); k++)
            {   int tuple=0;
                for (int kt = k+start; kt<=k+stop; kt++)
                {   tuple *= alphsize;
                    if (kt<0 || kt>=num_cols)
                        tuple += ifd->DefaultBaseIndex;
                    else if (sequence[kt]<0)
                    {   tuple = -1;     // gap or wildcard in sequence
                        break;
                    }
                    else tuple += sequence[kt];
                }
                assert(tuple < ifd->TupleStates);
                tuple_sequence[k] = static_cast<short int>(tuple);
            }
            delete [] sequence;
        }
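        // Worked example of the tuple encoding above (illustrative
        // numbers): with TupleStart=-1, TupleStop=1, and a 3-letter
        // alphabet (alphsize=3), the tuple for column k combines
        // columns k-1, k, k+1 in base 3:
        //	tuple = 9*s[k-1] + 3*s[k] + s[k+1]
        // so s[k-1]=2, s[k]=0, s[k+1]=1 gives tuple 19.
        // Out-of-range columns contribute DefaultBaseIndex, and a gap
        // or wildcard (negative index) makes the whole tuple -1.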
        // BAD CODE: counting and initialization for pseudocounts should
        // be done in a subsequent pass over the entire training set,
        // not here!
        if (Globals::which_set_to_add_to
                && Globals::which_set_to_add_to->layer_needs_counts(inter-1))
        {   for (int k=0; k<chain->num_cols(); k++)
                Globals::which_set_to_add_to->add_count(inter-1,
                        tuple_sequence[k]);
            // These counts are used to initialize Pseudo
        }
        chain->set_structure(inter, tuple_sequence);
        return 1;
    }
    case InterfaceDescription::ALIGNMENT:
    {   char *full_name = Filenames::full_in_filename(filename);
        logfile << "# Reading Input A2M format from " << full_name
            << "\n" << flush;
        assert(ifd->Alpha->num_alphabets()==1);
        alignment A;
        A.read_a2m(full_name, ifd->Alpha);
        if (!A.data)
        {   logfile << "# Skipping " << full_name
                << " because of problems reading desired output\n";
            delete [] full_name;
            return 0;
        }
        delete [] full_name;

        if (ifd->SequenceWeighter)
        {   // cerr << "# ClipExponent= " << ifd->ClipExponent
            //     << " for interface " << inter << "\n" << flush;
            ifd->SequenceWeighter->execute(A, inter);
        }
        // cerr << "Setting Probvectors for layer :: " << inter << endl << flush;
        chain->set_probvectors(inter, A, ifd->ReRegularizer);

        if (Globals::which_set_to_add_to
                && Globals::which_set_to_add_to->layer_needs_counts(inter-1))
        {   for (int k=0; k<chain->num_cols(); k++)
            {   // cerr << "Trying to read from interface :" << inter << "\n";
                const float* prob = chain->probs_for_layer(inter, k);
                for (int num=0; num < ifd->num_units(); num++)
                {   Globals::which_set_to_add_to->add_count(inter-1, num,
                            prob[num]);
                }
            }
        }
        return 1;
    }
    default:
    {   logfile << "Error: only training to single sequences"
            << " and alignments is understood now.\n"
            << "    Training to vectors will be added later.\n";
        return 0;
    }
    }
    return 1;
}
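// Illustrative structure file for the SEQUENCE format read by
// read_sequence above (contents made up): for the usual EHL alphabet,
//	> 1abc secondary structure
//	HHHHLLEEE-EEHHH
// where '-' marks a position with no correct answer (stored as -1 and
// skipped in both training and testing), '>' lines are skipped, and
// lower-case characters and 'x' wild cards are ignored entirely.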
static int PredReadTrain(istream& in, Command* self, ostream& logfile)
{
    char filename[500];
    // char structfile[500];
    int numlayers;
    // const InterfaceDescription **ifd;
    // int structure_needed=0;
    // char *structure;
    // int a2m_needed=1;
    char *full_name;

    NeuralNet *NN = Globals::nn();
    if (!NN)
    {   logfile << "ERROR: Must first create the neural network before "
            << "making the training set.  Training data not read.\n";
        return 0;
    }
    assert(NN->num_defined_layers() == NN->num_layers());
    assert(NN->num_interfaces() == NN->num_layers()+1);
    numlayers = NN->num_layers();

    if (!Globals::which_set_to_add_to)
    {   if (!Globals::training)
        {   Globals::training = new TrainSet;
            Globals::training->initialize_accumulation(Globals::nn());
        }
        Globals::which_set_to_add_to = Globals::training;
    }

    // Read in the required files data structures in the order
    // input file then each layer that needs training.
    // BAD CODE: should check the first InterfaceDescription to
    //     determine what format inputs are in!!
    // SUGATO: For present use, code supports just alignments as input format
    assert(NN->interface(0)->Alpha->num_alphabets()==1);
    alignment A;
    get_word(in, filename);
    full_name = Filenames::full_in_filename(filename);
    logfile << "# Reading Input A2M format from " << full_name << "\n" << flush;
    A.read_a2m(full_name, Globals::nn()->interface(0)->Alpha);
    if (A.width==0 || A.num_seqs==0)
    {   logfile << "# Error: alignment from " << full_name
            << " not usable---had " << A.num_seqs
            << " sequences, each " << A.width << " long.\n" << flush;
        delete [] full_name;
        return 1;       // don't abort, just ignore.
    }
    delete [] full_name;

    // PrintWeightingScheme(logfile);
    if (NN->interface(0)->SequenceWeighter)
        NN->interface(0)->SequenceWeighter->
            execute(A, 0, Globals::ReportSequenceWeightsTo);

    OneChain *chain = new OneChain(NN->interface(0)->ReRegularizer,
            A, NN, A.names[0]);

    // cerr << "Interface 1 IsHidden: " << NN->interface(1)->is_hidden()
    //     << endl << flush;
    for (int inter=1; inter<NN->num_interfaces(); inter++)
    {   if (!ReadTrainingData(in, self, logfile, chain, inter,
                NN->interface(inter)->InputFormat))
        {   delete chain;
            chain = NULL;
            SkipSeparators(in, 1, '\n');
            return 0;
        }
    }

    Globals::which_set_to_add_to->add_chain(chain);
    logfile << "# After reading " << filename
        << ", have " << Globals::which_set_to_add_to->num_cols()
        << " columns in " << Globals::which_set_to_add_to->num_chains()
        << " chains\n";
    return 1;
}

static Command ReadTrainCommand("ReadTrain", PredReadTrain,
"takes a filename for the input interface and each trainable interface\n\
of the network.\n\
Currently only two formats are understood:\n\
    1) A2M alignments for the input interface\n\
    2) Sequence over a specified alphabet for trainable interfaces.\n\
Eventually other formats will be accepted (probability vectors, for example).\n\
\n\
Order of the filenames in the input script must correspond to their\n\
order in the network---input, then one or more trainable layers.\n\
\n\
This command is affected by the most recently issued ReadToSet,\n\
and if no ReadToSet command has been issued, ReadToSet train is assumed.\n\
");
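// Example (script syntax inferred from the help string above; the
// filenames are hypothetical): for a net with an A2M input interface
// and a single trainable EHL output interface,
//	ReadTrain t0281.a2m t0281.ehl
// reads the input alignment first, then one training file for each
// trainable interface, in network order.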
static int PredReadNeuralNet(istream& in, Command* self, ostream& logfile)
{
    char filename[500];
    filename[0] = 0;
    get_word(in, filename, '\n');
    if (!filename[0])
    {   logfile << "# No file specified for neural network description\n";
        return 1;
    }
    gzifstream *net_in = Filenames::open_input(filename);
    if (!net_in) return 0;

    // Can't delete neural net if there are any OneChain records
    // that might be using it!
    // We'll just accept some storage leakage here, since the alternative
    // is a messy garbage collection or reference counting mechanism.
    Globals::add_neural_net(NeuralNet::read_new(*net_in));
    delete net_in;
    if (!Globals::nn())
    {   logfile << "# Neural net description not read from " << filename << endl;
        return 0;
    }
    if (!Globals::nn()->name())
        Globals::nn()->set_name(filename);
    logfile << "# Neural network set to " << Globals::nn()->name() << endl;
    return 1;
}

static Command ReadNeuralNet("ReadNeuralNet", PredReadNeuralNet,
"takes one required argument, the filename for a neural network\n\
description.\n\
Reads and sets the global neural network structure.\n\
Note: some old networks have an incorrect format---the current format\n\
begins with ClassName = NeuralNet and ends with EndClassName = NeuralNet,\n\
with all the NeuralLayer and InterfaceDescription records inside.\n\
\n\
The command actually adds a new neural network to a list of neural nets,\n\
with subsequent commands referring to just the last neural net read.\n\
Eventually, the Design1st and DesignTo commands will refer to the entire\n\
list of neural nets.\n\
All neural nets read in should have identical interface descriptions\n\
for the inputs, so that the back-propagation in Design1st can work.\n\
");
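// Sketch of the current network description format, as described in the
// help string above (record contents omitted; not a complete file):
//	ClassName = NeuralNet
//	    ... InterfaceDescription and NeuralLayer records ...
//	EndClassName = NeuralNet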
static int EstNameNet(istream& in, Command* self, ostream& logfile)
{
    char netname[300];
    get_word(in, netname, '\n');
    if (!Globals::nn())
    {   logfile << "# Error: no neural net to name " << netname
            << endl << flush;
        return 0;
    }
    if (!netname[0])
    {   logfile << "# Warning: no name specified for network in "
            << self->name() << endl << flush;
        return 0;
    }
    Globals::nn()->set_name(netname);
    logfile << "# Neural network set to " << Globals::nn()->name()
        << endl << flush;
    return 1;
}

static Command NameNet("NameNet", EstNameNet,
"takes one required argument, the name to give the current neural network,\n\
which should have already been read by ReadNeuralNet.\n");

static Command NameNeuralNet("NameNeuralNet", &NameNet);

// OBSOLETE CODE:
// static int ReadAlphabet(istream &in, Command* self, ostream& logfile)
// {   char filename[500];
//     get_word(in, filename, '\n');
//     if (!filename[0])
//     {   logfile << "# Error: no file specified for " << self->name() <<"\n";
//         return 1;
//     }
//
//     int num_read=Alphabet::load_alphabet_file(filename);
//     logfile << "# Read "<< num_read
//         << " alphabets from " << filename << "\n" << flush;
//
//     return 1;
// }
//

static int ReadAlphabetOrBackgroundProbs(istream &in, Command* self,
        ostream& logfile)
{
    char filename[500];
    get_word(in, filename, '\n');
    if (!filename[0])
    {   logfile << "# Error: no file specified for " << self->name() << "\n";
        return 1;
    }

    vector<NamedClass*> probs_set;
    int num_read = NamedClass::read_many_new(probs_set, filename, NULL);
    int NumAlphabets = 0;
    int NumBackgrounds = 0;
    for (int i=0; i<num_read; i++)
    {   if (probs_set[i]->is_a(Alphabet::classID()))
        {   NumAlphabets++;
        }
        else if (probs_set[i]->is_a(BackgroundProbs::classID()))
        {   BackgroundProbs *bp = dynamic_cast<BackgroundProbs*>(probs_set[i]);
            const AlphabetTuple *al = bp->alphabet_tuple();
            if (!al)
            {   // dummy BackgroundProbs that never found an Alphabet
                delete bp;
                probs_set[i] = NULL;
                continue;
            }
            NumBackgrounds++;
            bp->set_name(al->name());
            NamedObject *old_bp =
                Globals::BackgroundProbsByName.FindOldName(bp->name(), ZeroIfNew);
            if (old_bp && old_bp!=bp)
            {   logfile << "# Warning: replacing old background probs for Alphabet "
                    << bp->name() << "\n" << flush;
                Globals::BackgroundProbsByName.DeleteName(old_bp->name());
                delete old_bp;
            }
            Globals::BackgroundProbsByName.AddName(bp);
        }
        else
        {   logfile << "Warning: in " << self->name()
                << " read an object of type " << probs_set[i]->type()->name()
                << " which is neither an Alphabet nor a BackgroundProbs\n"
                << flush;
        }
    }
    logfile << "# Read " << NumAlphabets << " alphabets and "
        << NumBackgrounds << " BackgroundProbs from " << filename
        << "\n" << flush;
    return 1;
}

static Command ReadBackgroundProbsCommand("ReadBackgroundProbs",
        ReadAlphabetOrBackgroundProbs,
"ReadBackgroundProbs filename\n\
reads a file defining new BackgroundProbs probability vectors for use\n\
in computing information gain in networks.\n\
Alphabets may also be defined in the same file, but must be defined\n\
(not necessarily in the same file) before the BackgroundProbs using the alphabet.\n\
Note: ReadBackgroundProbs and ReadAlphabet are now synonymous.\n\
");

static Command ReadAlphabetCommand("ReadAlphabet",
        ReadAlphabetOrBackgroundProbs,
"ReadAlphabet filename\n\
reads a file defining new alphabets for use in networks.\n\
BackgroundProbs may also be defined in the same file, but the Alphabet\n\
must be defined (not necessarily in the same file)\n\
before the BackgroundProbs using the alphabet.\n\
Note: ReadBackgroundProbs and ReadAlphabet are now synonymous.\n\
");

// Added by Frellsen
static int PredReadDesignTo(istream& in, Command* self, ostream& logfile)
{
    if (!Globals::nn())
    {   logfile << "# ERROR: A neural network must be read before\n"
            << "#    reading data to design for.  DesignTo data not read.\n";
        return 1;
    }

    // Assert that the neural nets are well-defined
    for (int n=0; n<Globals::num_nns(); n++)
    {   NeuralNet *NN = Globals::nn(n); // n-th net (accessor name assumed;
                                        // original lost in transcription)
        assert(NN->num_defined_layers() == NN->num_layers());
        assert(NN->num_interfaces() == NN->num_layers()+1);
    }

    // If any design_chains exist, delete them
    Globals::clear_design_chains();

    // Read the number of columns from in and create an empty chain
    int num_cols;
    in >> num_cols;
    if (num_cols <= 0)
    {   logfile << "# ERROR: The number of columns must be greater than 0.\n"
            << "#    You entered " << num_cols << ".\n";
        return 1;
    }
    for (int n=0; n<Globals::num_nns(); n++)
    {   // one empty design chain per net
        // (constructor arguments assumed; original call lost in transcription)
        Globals::design_chains.push_back(new OneChain(num_cols, Globals::nn(n)));
    }
    assert(static_cast<int>(Globals::design_chains.size()) == Globals::num_nns());

    Globals::initialize_freeze();
    Globals::initialize_change();

    // Set the "correct" sequence in the "Structure" field of layer 0,
    // for use in evaluating sequence-recovery.
    // This can be set to an arbitrary sequence of the right length,
    // if real design is being done.
    if (!ReadTrainingData(in, self, logfile, Globals::design_chains[0], 0,
                InterfaceDescription::SEQUENCE))
    {   Globals::clear_design_chains();
        SkipSeparators(in, 1, '\n');
        return 0;
    }

    // copy interface 0 from design_chains[0] to rest of design_chains.
    const short int* copy_structure_from =
        Globals::design_chains[0]->structure(0);
    for (int n=1; n<Globals::num_nns(); n++)
    {   // (copy loop body reconstructed from context)
        short int *tuple_sequence = new short int[num_cols];
        for (int k=0; k<num_cols; k++)
            tuple_sequence[k] = copy_structure_from[k];
        Globals::design_chains[n]->set_structure(0, tuple_sequence);
    }

    // Read sequences to design to for each trainable interface
    for (int n=0; n<Globals::num_nns(); n++)
    {   NeuralNet *NN = Globals::nn(n);
        for (int inter=1; inter<NN->num_interfaces(); inter++)
        {   if (!ReadTrainingData(in, self, logfile, Globals::design_chains[n],
                        inter, NN->interface(inter)->InputFormat))
            {   Globals::clear_design_chains();
                SkipSeparators(in, 1, '\n');
                return 0;
            }
        }
    }
    return 1;
}

// Added by Frellsen
static Command ReadDesignToCommand("DesignTo", PredReadDesignTo,
"Sets up the desired target outputs for a Design1st command.\n\
The arguments are\n\
    1st :     The number of columns in the designed sequence.\n\
    2nd :     filename for the sequence of the actual protein, used for\n\
              sequence-recovery tests.  This can be a dummy sequence\n\
              of the right length for real designs.\n\
    3rd-nth : The sequence to design to (one for each trainable layer)\n\
              (Format: Sequence over a specified alphabet for\n\
              trainable interfaces.)\n\
\n\
A neural net must first have been read with 'ReadNeuralNet'.\n\
The order of the filenames in the input script must correspond to their\n\
order in the network---input, then one for each trainable layer.\n\
If multiple neural nets have been read in, there must be outputs for\n\
all the trainable layers, in the order that the neural nets were read.\n\
");
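// Example (script syntax inferred from the help string above; the
// filenames are hypothetical): for a 120-column design with one neural
// net that has a single trainable output layer,
//	DesignTo 120 real_protein.seq target.ehl
// gives the column count, the true sequence for sequence-recovery
// tests, and one design target per trainable layer.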
// CHANGE LOG:
// 21 July 1997 Kevin Karplus
//	copied from estimate-dist and removed MSSP and BLOCKS formats
//	(also ReadCounts and ClearCounts)
// 23 July 1997 Kevin Karplus
//	modified ReadA2M to try open_input first to gunzip if needed
// 21 March 1998 Kevin Karplus
//	added NameNet command.
// 5 March 1998 Kevin Karplus
//	fixed bug in PredReadTrain that caused the weighting not to be done.
// 25 March 1998 Kevin Karplus
//	initialize network (if necessary) before printing it.
// 30 March 1998 Kevin Karplus
//	Moved some stuff out to TrainingCommands.cc
//	Fixed read_sequence to accept standard fasta format (with single
//	sequence)
// 5 May 1998 Melissa Cline
//	Added support for gap characters in the structure files.  When a
//	gap character is encountered, it indicates that the
//	corresponding input position should not be used for training or
//	testing.
// 11 May 1998 Melissa Cline
//	Moved the regularizer reading to the neural net interface.
// 31 May 1998 Kevin Karplus
//	Improved error messages, and fixed structure string reading to
//	use Filenames::open_input.
// 4 June 1998 Kevin Karplus
//	Modified PredReadTrain to ignore alignments with missing
//	or mismatching training data.
// 20 July 1998 Kevin Karplus
//	Removed initialization of network from PredReadNeuralNet
//	(may need to add it elsewhere, but I'm not sure where---the
//	obvious places seemed to have it already)
// 15 September 1999 Sugato Basu
//	Removed code for reading weighting regularizer and sequence
//	weight into the interface description.  Commands for reading
//	weighting regularizer and sequence weight are still supported
//	to maintain compatibility with old scripts, but these commands
//	are obsolete wrt the new version of the code
// 15 September 1999 Sugato Basu
//	Added code to handle ALIGNMENT as an output format
// 16 August 2001 Kevin Karplus
//	Added ReadAlphabet command.
// 9 October 2001 Kevin Karplus
//	Fixed sequence weighting for layers other than 0.
// 30 March 2004 Sol Katzman
//	Deprecated ReadInput, ReadA2m in favor of ReadForPredict
// 15 April 2004 Kevin Karplus
//	Added missing \n\ to ReadInput help string.
// 01 June 2004 Sol Katzman
//	Fatal error if invalid ReadToSet parameter is specified.
// Mon Jun 13 04:32:02 PDT 2005 Kevin Karplus
//	Picked up Jes Frellsen's additions
//	PredReadDesignTo
//	ReadDesignToCommand
// Mon Jun 13 04:59:10 PDT 2005 Kevin Karplus
//	Improved help string for ReadDesignToCommand
// Wed Jun 15 12:57:43 PDT 2005 Kevin Karplus
//	Eliminate filename for OneChain constructor in ReadDesignTo
// Wed Jun 15 20:32:45 PDT 2005 Kevin Karplus
//	Fixed incorrect delete (instead of delete[]) in ReadAlphabet
// Fri Jun 17 12:55:05 PDT 2005 Kevin Karplus
//	Created ReadTrainingData, and used it to eliminate duplicated code.
// Fri Jun 17 15:28:56 PDT 2005 Kevin Karplus
//	Added true protein sequence input for DesignTo, for
//	sequence-recovery tests.
// 23 June 2005 Kevin Karplus
//	Removed extra full_in_filename from ReadAlphabet
// Fri Jun 24 20:44:10 PDT 2005 Kevin Karplus
//	Replaced Globals::NN with Globals::nn()
//	Added Globals::add_neural_net() for reading neural net.
// Sat Jun 25 06:42:51 PDT 2005 Kevin Karplus
//	fixed help string for ReadNeuralNet
// Thu Jul 7 03:52:17 PDT 2005 Kevin Karplus
//	ReadAlphabet and ReadBackgroundProbs merged.
// Thu Jul 7 17:45:40 PDT 2005 Kevin Karplus
//	Added warning for duplicate BackgroundProbs.
// Thu Jul 7 18:32:31 PDT 2005 Kevin Karplus
//	Added check for BackgroundProbs that has no alphabet.
// Sat Jul 9 04:52:17 PDT 2005 Kevin Karplus
//	Added reading of multiple design outputs in ReadDesignTo
// Sat Jul 9 05:33:16 PDT 2005 Kevin Karplus
//	added copy of first layer in ReadDesignTo, which should
//	complete ReadDesignTo for multiple neural nets.
// Sat Jul 9 19:34:50 PDT 2005 Kevin Karplus
//	added check for \ at end of line in ReadDesignTo
// Thu Jul 14 21:54:57 PDT 2005 Kevin Karplus
//	added Globals::initialize_freeze to DesignTo
// Fri Jul 22 14:40:41 PDT 2005 Kevin Karplus
//	Made const float * for return from probs_for_layer
// Sun Feb 5 03:10:11 PST 2006 Kevin Karplus
//	Added initialize_change after initialize_freeze
// Mon Dec 27 15:51:40 PST 2010 Kevin Karplus
//	Put the warning for lower-case characters inside an ifdef
//	so that it is usually not provided.  I may want to make
//	this user-controlled.