// InterfaceDescription.cc // copyright Kevin Karplus // 30 July 1997 #include #include // included for atoi #include // for sprintf #include "InterfaceDescription.h" #include "Input/Input.h" #include "EqualStrings/EqualStrings.h" #include "AlphabetTuple/AlphabetTuple.h" #include "PrintInColumns/PrintInColumns.h" #include "Filenames/Filenames.h" // information for the NamedClass InterfaceDescription static NamedClass *create_interface(void) {return new InterfaceDescription;} IdObject InterfaceDescription::ID("InterfaceDescription", create_interface, 0, "InterfaceDescription provides information about how the input or output\n\ of a NeuralLayer is to be handled.\n"); NameToPtr* InterfaceDescription::CommandTable=0; static const char* FormatNames[] = { "ALIGNMENT", "SEQUENCE", "VECTORS", "NUMBERS", 0 }; InterfaceDescription::InterfaceDescription(void) { Alpha=0; TupleStart=TupleStop=0; DefaultBaseIndex=0; TupleStates=0; Names=0; UseInsert=UseDelete=0; UseEntropy=UseProbOfGuide=0; UseLogOdds=0; NumUnits=0; TrainTo=0; HideTemporarily = 0; SequenceWeighter=0; SequenceWeightBitsToSave=1.0; SequenceWeightParam=1.0; ClipExponent=-1; // turn off clipping by default. WeightingRegularizer=0; ReRegularizer=0; NetRegularizer = 0; UseComponentProbs=0; UseAminoAcidProbs=1; UseGuide=0; InputFormat = ALIGNMENT; } InterfaceDescription::InterfaceDescription(const InterfaceDescription*old) { NumUnits= old->NumUnits; TrainTo= old->TrainTo; HideTemporarily = old->HideTemporarily; Alpha= old->Alpha? 
new AlphabetTuple(* (old->Alpha)): 0; TupleStart=old->TupleStart; TupleStop=old->TupleStop; DefaultBaseIndex=old->DefaultBaseIndex; TupleStates= old->TupleStates; if (old->Names) { Names = new UnitName[NumUnits]; for (int i=0; iunit_name(i)); } } else Names=0; UseLogOdds= old->UseLogOdds; UseInsert= old->UseInsert; UseDelete= old->UseDelete; UseEntropy= old->UseEntropy; UseProbOfGuide= old->UseProbOfGuide; SequenceWeighter= old->SequenceWeighter; SequenceWeightBitsToSave= old->SequenceWeightBitsToSave; SequenceWeightParam= old->SequenceWeightParam; ClipExponent = old->ClipExponent; WeightingRegularizer= old->WeightingRegularizer; ReRegularizer= old->ReRegularizer; NetRegularizer = old->NetRegularizer; UseComponentProbs= old->UseComponentProbs; UseAminoAcidProbs= old->UseAminoAcidProbs; UseGuide= old->UseGuide; InputFormat = old->InputFormat; } InterfaceDescription::~InterfaceDescription(void) { if (Alpha) delete Alpha; if (Names) delete [] Names; } void InterfaceDescription::set_unit_name(int unit_number, const char*nm) { assert(unit_number < NumUnits); if (!Names) Names=new UnitName[NumUnits]; Names[unit_number].set_name(nm); Names[unit_number].number = unit_number; UnitNumber.AddName(&(Names[unit_number]), ErrorIfOld); } const char* InterfaceDescription::generate_unit_name(int u) const { if (u<0 || u >=NumUnits) return "illegal_unit"; static char temporary_space[30]; if (Alpha && (is_TrainTo() || (UseAminoAcidProbs && unum_alphabets()==1); // can't handle tuples of tuples yet int tmpu = u; for (int i=TupleStop-TupleStart; i>=0; i--) { temporary_space[i] = (*Alpha)[0] -> unindex_to_char( tmpu% Alpha->num_normal()); tmpu /= Alpha->num_normal(); } assert(tmpu==0); temporary_space[TupleStop-TupleStart+1] = 0; } else if (Alpha) { // insertion, deletion, net_reg, or component prob assert(!UseAminoAcidProbs || u>=TupleStates); int alph_size = ((UseAminoAcidProbs || UseGuide) ? 
Alpha->num_normal() : 0); int tmpu=u+1 - UseAminoAcidProbs*TupleStates; if (UseInsert) tmpu--; if (tmpu==0) return("insert"); if (UseDelete) tmpu--; if (tmpu==0) return("delete"); if (UseGuide) { if (tmpu <= alph_size) { sprintf(temporary_space,"guide_%d_%c",tmpu-1,(*Alpha)[0] -> unindex_to_char(tmpu-1)); return temporary_space; } else { tmpu -= alph_size; } } if (UseEntropy) tmpu --; if (tmpu==0) return "entropy"; if (UseProbOfGuide) tmpu --; if (tmpu==0) return "prob_of_guide"; if (NetRegularizer) { sprintf(temporary_space,"netreg_%d",tmpu-1); } else if (UseComponentProbs) { sprintf(temporary_space,"comp_%d",tmpu-1); } else { cerr << "DEBUG: u,tmpu,alph_size,TupleStates,UseAA,UseIns,UseDel,UseGuide,NetReg,UseCC " << u << "," << tmpu << "," << alph_size << "," << TupleStates << UseAminoAcidProbs << "," << UseInsert << "," << UseDelete << "," << UseGuide << "," << NetRegularizer << "," << UseComponentProbs << "\n" ; assert(0); // unreachable---indicates unit_number out of range } } else // !Alpha { sprintf(temporary_space, "%d", u); } // cerr << name() << ".unit_name(" << u << ")=" // <(UnitNumber.FindOldName(nm)); return u->number; } int ReadName(istream &in, InterfaceDescription *chg, IFInputCommand *self) { char word[500]; get_word(in, word); chg->set_name(word); return 1; } int VerifyClassName(istream &in, InterfaceDescription *change, IFInputCommand* self) { char word[100]; get_word(in, word); const IdObject *end_id = IdObject::id(word); if (end_id != change->type()) { cerr << "Warning: " << self->name() << word << " doesn't match " << change->type()->name() << "\n" << flush; } // continue if "ClassName", stop if "EndClassName" return EqualStrings(self->name(), "ClassName", 1); } int ReadPosIntParam(istream &in, int ¶m, InterfaceDescription *change, IFInputCommand* self) { int tmp; in >> tmp; if (tmp <0) { cerr << "Error: must have " << self->name() << " >=0\n"; return 0; } param=tmp; return 1; } int ReadFloatParam(istream &in, float ¶m, InterfaceDescription 
*change, IFInputCommand* self) { float tmp; in >> tmp; if (tmp <0) { cerr << "Error: must have " << self->name() << " >=0\n"; return 0; } param=tmp; return 1; } int ReadInputFormat(istream &in, InterfaceDescription *chg, IFInputCommand *self) { char word[500]; get_word(in, word); for(int i=0; FormatNames[i]; i++) { if (EqualStrings(word, FormatNames[i], 1)) { chg->InputFormat = static_cast (i); return 1; } } cerr << "Error: " << word << " not a recognized input format\n"; return 0; } int ReadTupleStart(istream &in, InterfaceDescription *chg, IFInputCommand *self) { in >> chg->TupleStart; return 1; } int ReadTupleStop(istream &in, InterfaceDescription *chg, IFInputCommand *self) { in >> chg->TupleStop; return 1; } int ReadDefaultBaseIndex(istream &in, InterfaceDescription *chg, IFInputCommand *self) { return ReadPosIntParam(in, chg->DefaultBaseIndex, chg, self); } int ReadDefaultBase(istream &in, InterfaceDescription *chg, IFInputCommand *self) { assert(chg->Alpha != NULL); BaseTuple bt(* (chg->Alpha)); in >> bt; chg->DefaultBaseIndex = chg->Alpha->index(bt); return 1; } int ReadUnitNames(istream &in, InterfaceDescription *chg, IFInputCommand *self) { // must have already set up NumUnits assert(chg->num_units()>0); char nm[500]; for (int i=0; inum_units(); i++) { get_word(in, nm); chg->set_unit_name(i,nm); } return 1; } int ReadInsertUse(istream &in, InterfaceDescription *chg, IFInputCommand *self) { in>>chg->UseInsert; return 1; } int ReadDeleteUse(istream &in, InterfaceDescription *chg, IFInputCommand *self) { in>>chg->UseDelete; return 1; } int ReadGuideUse(istream &in, InterfaceDescription *chg, IFInputCommand *self) { in >> chg->UseGuide; return 1; } int ReadEntropyUse(istream &in, InterfaceDescription *chg, IFInputCommand *self) { in>>chg->UseEntropy; return 1; } int ReadGuideProbUse(istream &in, InterfaceDescription *chg, IFInputCommand *self) { in>>chg->UseProbOfGuide; return 1; } int ReadNumUnits(istream &in, InterfaceDescription *chg, IFInputCommand *self) 
{ return ReadPosIntParam(in, chg->NumUnits, chg, self); } int ReadCompUse(istream &in, InterfaceDescription *chg, IFInputCommand *self) { return ReadPosIntParam(in, chg->UseComponentProbs, chg, self); } int ReadAAUse(istream &in, InterfaceDescription *chg, IFInputCommand *self) { return ReadPosIntParam(in, chg->UseAminoAcidProbs, chg, self); } int ReadUseLogOdds(istream &in, InterfaceDescription *chg, IFInputCommand *self) { return ReadPosIntParam(in, chg->UseLogOdds, chg, self); } int ReadTrainTo(istream &in, InterfaceDescription *chg, IFInputCommand *self) { in>> chg->TrainTo; return 1; } int ReadAlphabetTuple(istream &in, InterfaceDescription *chg, IFInputCommand *self) { AlphabetTuple *tmp = 0; const char*word=self->name(); if (EqualStrings(word, "Alphabet",1)) tmp= read_AlphabetTuple(in,1); else if (EqualStrings(word, "AlphabetPair",1)) tmp= read_AlphabetTuple(in,2); else if (EqualStrings(word, "AlphabetTriple",1)) tmp= read_AlphabetTuple(in,3); else if (EqualStrings(word, "AlphabetTuple",1)) tmp= read_AlphabetTuple(in); if (!tmp) return 0; chg->Alpha = tmp; return 1; } static int read_regularizer(istream& in,Regularizer *&d,int is_dirichlet) { char filename[500]; get_word(in, filename, '\n'); if (!filename[0] || EqualStrings(filename,"0")) { delete d; d=0; return 1; } gzifstream *reg_in = Filenames::open_input_fullname(filename); if (!reg_in) return 0; Regularizer *r= Regularizer::read_new(*reg_in); delete reg_in; if (!r) return 0; r->set_name(filename); // force the name to match the filename if (is_dirichlet && !r->is_a(DirichletReg::classID())) { cerr << "Regularizer " << r->name() << " is not a DirichletReg\n" <<" can't use anything else as a ReRegularizer.\n" << flush; return 0; } delete d; d=r; return 1; } int ReadWeightingRegularizer(istream& in, InterfaceDescription * chg, IFInputCommand * self ) { return read_regularizer(in,chg->WeightingRegularizer,0); } int ReadReRegularizer(istream& in, InterfaceDescription *chg, IFInputCommand *self) { 
Regularizer * reg=0; int ret = read_regularizer(in,reg,1); chg->ReRegularizer= dynamic_cast (reg); return ret; } int ReadNetRegularizer(istream& in, InterfaceDescription *chg, IFInputCommand *self) { Regularizer * reg=0; int ret = read_regularizer(in,reg,1); chg->NetRegularizer= dynamic_cast (reg); return ret; } int ReadSequenceWeight(istream& in, InterfaceDescription *chg, IFInputCommand *self) { char word[100]; get_word(in, word); const SequenceWeightObject * w=SequenceWeightObject::find_name(word); if (!w) { cerr << "Error: Unknown SequenceWeight scheme " << word << "\n" << "Known schemes are " << "\n"; const char* Names[100]; int num=AccumulateNames(SequenceWeightObject::weighter_table(), Names); PrintNames(cerr, Names,num); cerr << flush; return 0; } chg->SequenceWeighter=w; get_word(in,word, '\n', 0); chg->SequenceWeightBitsToSave = word[0]? atof(word): 1.0; if(word[0]) get_word(in,word, '\n', 0); chg->SequenceWeightParam = word[0]? atof(word): 1.0; if(word[0]) get_word(in,word, '\n', 0); chg->ClipExponent = word[0]? atof(word): -1.0; return 1; } int ReadSequenceWeightBitsToSave(istream& in, InterfaceDescription *chg, IFInputCommand *self) { char word[100]; get_word(in,word, '\n', 0); chg->SequenceWeightBitsToSave = word[0]? atof(word): 1.0; return 1; } int ReadSequenceWeightParam(istream& in, InterfaceDescription *chg, IFInputCommand *self) { char word[100]; get_word(in,word, '\n', 0); chg->SequenceWeightParam = word[0]? atof(word): 1.0; return 1; } int ReadClipExponent(istream& in, InterfaceDescription *chg, IFInputCommand *self) { char word[100]; get_word(in,word, '\n', 0); chg->ClipExponent = word[0]? atof(word): -1.0; return 1; } int InterfaceDescription::read_knowing_type(istream &in) { if (! 
command_table()) init_command_table(); int have_read_EndClass =0; char word[300]; while (in.good()) { get_word(in, word, '='); IFInputCommand *comm = dynamic_cast (command_table()->FindOldName(word, ZeroIfNew)); if (!comm) comm = dynamic_cast (InterfaceDescription::CommandTable-> FindOldName(word, ZeroIfNew)); if (comm) { if (!comm->execute(in, this)) { have_read_EndClass=1; break; } } else { cerr << "Unrecognized keyword: " << word << " for type " << type()->name() << " " << name() << "\n" << flush; SkipSeparators(in, 1, '\n'); } } // do some consistency checks: if (Alpha==NULL) { UseAminoAcidProbs=UseGuide=UseComponentProbs=UseInsert=UseDelete=UseEntropy=UseProbOfGuide=0; return have_read_EndClass; } assert(TupleStop >= TupleStart); int alph_size = ((UseAminoAcidProbs || UseGuide) ? Alpha->num_normal() : 0); TupleStates=1; for (int ts= TupleStart; ts<=TupleStop; ts++) TupleStates *= alph_size; if(NetRegularizer && UseComponentProbs) { cerr << "In interface " << name() << " UseComponentProbs cannot be set if NetRegularizer is set\n" << "Turning UseComponentProbs off\n"; UseComponentProbs = 0; } else if (UseComponentProbs && ReRegularizer==NULL) { cerr << "In interface " << name() << " UseComponentProbs specified without a ReRegularizer\n" << "Turning UseComponentProbs off\n"; UseComponentProbs = 0; } int real_num_units = compute_num_units(); if (real_num_units!=NumUnits) { if (NumUnits) cerr << "WARNING: In interface " << name() << " number of units should be " << real_num_units << " not " << NumUnits << "\n" << flush; NumUnits = real_num_units; } if (is_TrainTo()) { // This is a trainable (output) layer, so // the unit names need to agree with the letters of the alphabet. 
assert(Alpha); // earlier test returned if Alpha not specified if (NumUnits != TupleStates) { cerr << "ERROR: In output interface " << name() << " the number of units should be the size of the alphabet, " << TupleStates << ", not " << NumUnits << "\n" << flush; assert(NumUnits==TupleStates); } for (int u=0; uAddName(new IFInputCommand("ClassName", VerifyClassName)); CommandTable->AddName(new IFInputCommand("UseInsert", ReadInsertUse)); CommandTable->AddName(new IFInputCommand("UseDelete", ReadDeleteUse)); CommandTable->AddName(new IFInputCommand("UseEntropy", ReadEntropyUse)); CommandTable->AddName(new IFInputCommand("UseProbOfGuide", ReadGuideProbUse)); CommandTable->AddName(new IFInputCommand("NumUnits", ReadNumUnits)); CommandTable->AddName(new IFInputCommand("TrainTo", ReadTrainTo)); CommandTable->AddName(new IFInputCommand("InputFormat", ReadInputFormat)); CommandTable->AddName(new IFInputCommand("UseComponentProbs", ReadCompUse)); CommandTable->AddName(new IFInputCommand("UseAminoAcidProbs", ReadAAUse)); CommandTable->AddName(new IFInputCommand("UseGuide", ReadGuideUse)); CommandTable->AddName(new IFInputCommand("UseLogOdds", ReadUseLogOdds)); CommandTable->AddName(new IFInputCommand("TupleStart", ReadTupleStart)); CommandTable->AddName(new IFInputCommand("TupleStop", ReadTupleStop)); CommandTable->AddName(new IFInputCommand("DefaultBaseIndex", ReadDefaultBaseIndex)); CommandTable->AddName(new IFInputCommand("DefaultBase", ReadDefaultBase)); CommandTable->AddName(new IFInputCommand("UnitNames", ReadUnitNames)); CommandTable->AddName(new IFInputCommand("Alphabet", ReadAlphabetTuple)); CommandTable->AddName(new IFInputCommand("AlphabetPair", ReadAlphabetTuple)); CommandTable->AddName(new IFInputCommand("AlphabetTriple", ReadAlphabetTuple)); CommandTable->AddName(new IFInputCommand("AlphabetTuple", ReadAlphabetTuple)); CommandTable->AddName(new IFInputCommand("NetRegularizer", ReadNetRegularizer)); CommandTable->AddName(new IFInputCommand("ReRegularizer", 
ReadReRegularizer)); CommandTable->AddName(new IFInputCommand("WeightingRegularizer", ReadWeightingRegularizer)); CommandTable->AddName(new IFInputCommand("SequenceWeight", ReadSequenceWeight)); CommandTable->AddName(new IFInputCommand("SequenceWeightParam", ReadSequenceWeightParam)); CommandTable->AddName(new IFInputCommand("SequenceWeightBitsToSave", ReadSequenceWeightBitsToSave)); CommandTable->AddName(new IFInputCommand("ClipExponent", ReadClipExponent)); CommandTable->AddName(new IFInputCommand("EndClassName", VerifyClassName)); } // check whether this is compatible with an existing interface // (useful for checking whether two neural nets can share a // common input). // Report message if not compatible and err_log not NULL, // with names of networks provided by old_name and new_name bool InterfaceDescription::is_compatible(const InterfaceDescription* old_ifd, const char* old_name, const char*new_name, ostream *err_log) const { if (old_ifd->TupleStates != TupleStates) { if (err_log) { *(err_log) << "Error: old network " << old_name << " had alphabet size " << old_ifd->TupleStates << ", but new network " << new_name << " has " << TupleStates << "\n" << flush; } return 0; } if (old_ifd->UseInsert != UseInsert || old_ifd->UseDelete!=UseDelete) { if (err_log) { *(err_log) << "Error: old network " << old_name << " and new network " << new_name << " have different UseInsert or UseDelete settings" << "\n" << flush; } return 0; } if (old_ifd->UseEntropy != UseEntropy || old_ifd->UseProbOfGuide!=UseProbOfGuide) { if (err_log) { *(err_log) << "Error: old network " << old_name << " and new network " << new_name << " have different UseEntropy or UseProbOfGuide settings" << "\n" << flush; } return 0; } if (old_ifd->num_units() != num_units()) { if (err_log) { *(err_log) << "Error: old network " << old_name << " has " << old_ifd->num_units() << " input units" << " and new network " << new_name << " has " << num_units() << "\n" << flush; } return 0; } //BUG: still need to 
check UseLogOdds, ReRegularizer, UseComponentProbs, // UseAminoAcidProbs, UseGuide, // and maybe // SequenceWeighter, SequenceWeightBitsToSave, SequenceWeightParam, ClipExponent} return 1; } // fill the vector vect with the appropriate // values (depending on the Use... variables) // Make sure that order agrees with unit_name()! // Don't change order, as this will invalidate existing networks. void InterfaceDescription::fill_vector(float *vect, const float* aa_probs, float insert_prob, float delete_prob, const float* guide_probs, const float* component_probs, float entropy, float prob_of_guide ) const { int num_comp = ReRegularizer!=NULL? ReRegularizer->num_components(): 0; int num_alph = (Alpha!=NULL)? Alpha->num_normal() : num_units(); // BUG: forgot to include TupleStop and TupleStart! int ProbIndex = 0; // POSSIBLE BUG: // NetRegularizer may not be being filled correctly! if (UseAminoAcidProbs || NetRegularizer) { assert (!UseAminoAcidProbs || !NetRegularizer); // assume mutually exclusive for (int ii=0; ii < num_alph; ii++) { vect[ProbIndex++] = aa_probs[ii]; } } if (UseInsert) { vect[ProbIndex++] = insert_prob; } if (UseDelete) { vect[ProbIndex++] = delete_prob; } if (UseGuide) { for (int ii=0; ii < num_alph; ii++) { vect[ProbIndex++] = guide_probs[ii]; } } if (UseEntropy) vect[ProbIndex++] = entropy; if (UseProbOfGuide) vect[ProbIndex++] = prob_of_guide; if (UseComponentProbs) { for (int ii=0; ii < num_comp; ii++) { vect[ProbIndex++] = component_probs[ii]; } } assert(ProbIndex==num_units()); } // CHANGE LOG: // 30 March 1998 Kevin Karplus // eliminated UseUniqueSequence and UseVectors // in favor of InputFormat // 10 May 1998 Melissa Cline // Added support for UseAminoAcidProbs and new functionality that // permits use of Amino acid probabilities and / or component // probabilities. // 11 May 1998 Melissa Cline // Added ReadReRegularizer, moved from ReadCommands::PredReRegularizer. 
// 15 September 1999 Sugato Basu
//      Added ReadWeightingRegularizer, ReadSequenceWeight,
//      ReadSequenceWeightParam and ReadSequenceWeightBitsToSave,
//      moved from ReadCommands
// 15 September 1999 Sugato Basu
//      Added ReadNetRegularizer to read in the NetRegularizer.  When the
//      NetRegularizer is used, the output layer trains to the mixture
//      coefficients of the NetRegularizer.
// 27 December 1999 Kevin Karplus
//      Added TupleStart, TupleStop, and DefaultBaseIndex to I/O
// 20 February 2000 Kevin Karplus
//      Added ClipExponent
// 19 April 2004 Sol Katzman
//      Added UseGuide
// 19 May 2004 Sol Katzman
//      Corrected calculation of NumUnits when UseGuide and !UseAminoAcidProbs
// Fri Aug 12 14:53:17 PDT 2005 Kevin Karplus
//      Created is_compatible() from existing code in Globals::add_neural_net
// Fri Aug 12 17:06:20 PDT 2005 Kevin Karplus
//      Created fill_vector to fill an input vector in the right positions
// Fri Aug 12 17:37:16 PDT 2005 Kevin Karplus
//      Used compute_num_units in read_knowing_type
// Fri Aug 12 20:46:28 PDT 2005 Kevin Karplus
//      Added entropy and prob_of_guide to fill_vector
//      Added ReadEntropyUse and ReadGuideProbUse
//      Changed several "Use" parameters to bool.
// Tue Oct 25 13:23:15 PDT 2005 Kevin Karplus
//      Modified unit_name to generate names for output layers as well
//      as input layers
//      Renamed unit_name to be generate_unit_name.
//      Added test and correction of unit names for output layers.