// create small examples of what different regularizers do #include "SubstPseudoReg.h" #include "GribskovReg.h" #include "DirichletReg.h" #include "MLZReg.h" #include "MLPReg.h" #include "../Alphabet/Alph.h" #include "../Utilities/IOSmacros.h" #include #include void PrintProbs(Regularizer **regs, const float counts[20], int which_varies) { float probs[20]; int num_regs=0; Regularizer **r; for (r=regs; *r; r++) num_regs++; cout << "\\begin{table}\n" << "\\begin{center}\n" << "\\begin{tabular}{|l" ; for (r=regs; *r; r++) cout << "|l"; cout << "|}\n"; cout << "\\hline\n" << "&\\multicolumn{" << num_regs << "}{c|}{probabilities given " << counts[which_varies] << " " << Alph::ExtAA().full_name(Alph::ExtAA().unindex(which_varies)) << (counts[which_varies]>1 ? "s": "") ; for (int letter=0; letter<20; letter++) { if (letter==which_varies || counts[letter]==0) continue; cout << ", " << counts[letter] << " " << Alph::ExtAA().full_name(Alph::ExtAA().unindex(letter)) << (counts[letter]>1 ? "s": "") ; } cout << "} \\\\ \n" << "\\hline\n" ; Regularizer ** prev_reg=regs; for (r=regs; *r; r++) { if ((*r)->type() != (*prev_reg)->type()) { cout << "&\\multicolumn{" << (r-prev_reg) << "}{c|}{" << (*prev_reg)->type()->name() << "}"; prev_reg = r; } } cout << "&\\multicolumn{" << (r-prev_reg) << "}{c|}{" << (*prev_reg)->type()->name() << "}"; cout << "\\\\ \n" << "\\hline\n"; for (r=regs; *r; r++) { const char* nm = (*r)->name(); cout << "& " ; for (const char*x=nm; *x; x++) { if (!isalnum(*x)) cout << "\\"; cout << *x; } } cout << "\\\\ \n"; const char *BoldFaceThese = "ILV"; // note: rather inefficient to recompute the regularized // probabilities 20 times, but who cares? for(int i=0; i<20; i++) { if (i%5==0) cout << "\\hline\n"; int DoBoldFace=0; char lett= Alph::ExtAA().to_char(Alph::ExtAA().unindex(i)) ; for (const char *c= BoldFaceThese; *c; c++) { if (*c==lett) { DoBoldFace=1; break; } } cout << (DoBoldFace? "\\bf ": "") << lett << " "; for (r=regs; *r; r++) { (*r)->get_probs(counts, probs); cout << (DoBoldFace? "&\\bf ": "& ") << IOSFIXED << setprecision(3) << probs[i] ; } cout << "\\\\ \n"; } cout << "\\hline\n" << "\\end{tabular}\n" << "\\end{center}\n" << "\\end{table}\n" << "\\clearpage\n" << "\n\n" ; } void PrintComponentProbs(Regularizer **regs, const float counts[20]) { int max_comp=0; // maximum number of components Regularizer **r; for (r=regs; *r; r++) { if ((*r)->type() == DirichletReg::classID() && max_comp < (dynamic_cast(*r))->num_components()) max_comp = (dynamic_cast(*r))->num_components(); } double *comp_probs = new double[max_comp]; // component probabilities for (r=regs; *r; r++) { if ((*r)->type() != DirichletReg::classID()) continue; DirichletReg *d = dynamic_cast(*r); double SumCounts; d->component_probs(counts, SumCounts, comp_probs); cout << "% Probabilities for the components of " << d->name() << ":\n% "; for(int i=0; inum_components(); i++) cout << setw(7) << setprecision(4) << IOSFIXED << comp_probs[i]; cout << "\n"; cout << "% encoding cost (-log2(probability(column)) with " << d->name() << " is " << -log2(d->Probability(counts)) << " bits\n"; cout << "\n"; } delete [] comp_probs; } Regularizer* read_reg(const char* filename, const char* name, IdObject* reg_type) { ifstream in(filename); if (!in.good()) { cerr << "Error: can't open " << filename << " to read a regularizer\n" << flush; return 0; } NamedClass* x= NamedClass::read_new(in); assert(x->is_a(Regularizer::classID())); assert(x->is_a(reg_type)); Regularizer* r = dynamic_cast(x); r->set_name(name); return r; } int main(int, char**) { Alphabet::set_default(Alph::ExtAA()); DirichletReg* OneComp = dynamic_cast (read_reg("../estimate-dist/data/BLOCKS/all/all-opt.1comp", "1-comp", DirichletReg::classID())); DirichletReg* CoinTwo = dynamic_cast (read_reg("../estimate-dist/data/coin-flip.2comp", "coin-flip", DirichletReg::classID())); DirichletReg* NineComp = dynamic_cast (read_reg("../estimate-dist/data/unweight.9comp", "9-comp", DirichletReg::classID())); DirichletReg* ThirtyComp = dynamic_cast (read_reg("../estimate-dist/data/unweight-30comp.plib", "30-comp", DirichletReg::classID())); MLZReg AddOne(& Alph::ExtAA(), 1.0, "add_one"); MLZReg AddShare(& Alph::ExtAA(), 0.05, "add_share"); GribskovReg *Blosum = dynamic_cast (read_reg("../estimate-dist/data/blosum62.score", "blosum62", GribskovReg::classID())); SubstPseudoReg* Subst = dynamic_cast (read_reg("../estimate-dist/data/BLOCKS/all/all-w0-2.subst", "subst", SubstPseudoReg::classID())); Regularizer* regs[] = { Blosum, Subst, // &AddOne, &AddShare, OneComp, NineComp, // ThirtyComp, CoinTwo, 0}; const int isoleucine=Alph::ExtAA().index(Alph::ExtAA().to_base('I')); const int his=Alph::ExtAA().index(Alph::ExtAA().to_base('H')); const int thr=Alph::ExtAA().index(Alph::ExtAA().to_base('T')); const int trypt=Alph::ExtAA().index(Alph::ExtAA().to_base('W')); const int val=Alph::ExtAA().index(Alph::ExtAA().to_base('V')); float NumCounts[20] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; cout << "% The regularizers are \n" << "% " << Blosum->name() << " does Gribskov average score using the\n" << "% " << "\tBlosum-62 matrix (natural log base, 3 decimal places)\n" << "% " << Subst->name() << " does matrix multiply with an optimized matrix\n" << "% " << AddOne.name() << " adds 1 to each count\n" << "% " << AddShare.name() << " adds 0.05 to each count\n" << "% " << OneComp->name() << " adds pseudocounts, optimized for blocks database\n" << "% " << NineComp->name() << " uses a 9-component Dirichlet mixture\n" << "% " << ThirtyComp->name() << " uses a 30-component Dirichlet mixture\n" << "% " << CoinTwo->name() << " coin-flip biased+unbiased\n" << "\n\n"; for (float num_heads=0.; num_heads<1.1 ; num_heads+=0.1) { cout << "% For " << num_heads << " heads" << num_heads <<", probabilities are:\n"; NumCounts[his] = NumCounts[thr] = num_heads; PrintProbs(regs, NumCounts, his); PrintComponentProbs(regs, NumCounts); cout << "\n\f"; } NumCounts[his] = NumCounts[thr] = 0; int num_isoleu; for (num_isoleu=0; num_isoleu<=20; num_isoleu++) { cout << "% For " << num_isoleu << " isoleucine" << (num_isoleu==1? "": "s") <<", probabilities are:\n"; NumCounts[isoleucine] = num_isoleu; PrintProbs(regs, NumCounts, isoleucine); PrintComponentProbs(regs, NumCounts); cout << "\n\f"; } cout << "\n"; DirichletReg* post=NineComp->posterior_mixture(NumCounts); post->set_name("posterior_20I"); post->write(cout); NumCounts[trypt] =1; for (num_isoleu=0; num_isoleu<=20; num_isoleu++) { cout << "% For " << num_isoleu << " isoleucine" << (num_isoleu==1? "": "s") <<" and one tryptophan, probabilities are:\n"; NumCounts[isoleucine] = num_isoleu; PrintProbs(regs, NumCounts, isoleucine); PrintComponentProbs(regs, NumCounts); cout << "\n\f"; } NumCounts[trypt] =0; NumCounts[val] = 1; for (num_isoleu=0; num_isoleu<=20; num_isoleu++) { cout << "% For " << num_isoleu << " isoleucine" << (num_isoleu==1? "": "s") <<" and one valine, probabilities are:\n"; NumCounts[isoleucine] = num_isoleu; PrintProbs(regs, NumCounts, isoleucine); PrintComponentProbs(regs, NumCounts); cout << "\n\f"; } // for (Regularizer **r=regs; *r; r++) // { cout << "\n\f\n"; // (*r)->write(cout); // } } // CHANGE LOG: // Fri Dec 14 11:52:29 PST 2007 Kevin Karplus // Added IOSmacros.h and removed form // Fixed old-style casts