// Kevin karplus // 3 March 1996 (borrowed heavily from old example.cc) // test a couple of different ideas about how to create different // mixtures when subfamilies are known // USage: // test-sub subfamily_count_file < test_file #include "DirichletReg.h" #include "Alphabet.h" #include "Alph.h" #include #include #include // Build a bunch of new Dirichlet mixture regularizers, // based on dir and the subfamily counts seen in "in" // // The input should start with the number of subfamilies, // then have a 20-long count vector for each subfamily. // // For each subfamily, 2 regularizers are created, // one using the mixture coefficients from the sum of the subfamilies, // but the counts from the subfamily // the other using a weighted sum of the subfamilies. // In addition, a regularizer using the full set of data is created. Regularizer **SubFamilies(DirichletReg *dir, istream& in) { int num_subfamilies; in >> num_subfamilies; float sub_count[num_subfamilies][20]; float total[20]={0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0}; cout << "The " << num_subfamilies << " subfamilies are\n"; for (int fam=0; fam> sub_count[fam][letter]; cout << " " << sub_count[fam][letter]; total[letter] += sub_count[fam][letter]; } cout << "\n"; } cout << "New mixtures will be built from " << dir->name() << "\n"; typedef Regularizer *rp; Regularizer **ret = new rp[num_subfamilies*6+4]; int NumRegs=0; ret[NumRegs++] = dir; DirichletReg *FullCol= dir->posterior_mixture(total); FullCol->set_name("full_col"); ret[NumRegs++] = FullCol; DirichletReg *DavidsEmpty= (DirichletReg*)dir->copy(); DavidsEmpty->set_name("David's-empty"); for (int fam=0; famProbability(sub_count[fam]); for (int c=DavidsEmpty->num_components()-1; c>=0; c--) DavidsEmpty->set_mixture(c, DavidsEmpty->component_probs()[c]); } ret[NumRegs++] = DavidsEmpty; float weighted[20]; for (int fam=0; famposterior_mixture(sub_count[fam]); Davids->set_name("David's"); for (int c=Davids->num_components()-1; c>=0; c--) Davids->set_mixture(c, DavidsEmpty->mixture_coeff(c)); ret[NumRegs++] = Davids; } for (int fam=0; famposterior_mixture(weighted); Kevins->set_name("Kevin's(0.1)"); ret[NumRegs++]=Kevins; } for (int fam=0; famposterior_mixture(weighted); Kevins->set_name("Kevin's(0.01)"); ret[NumRegs++]=Kevins; } for (int fam=0; famposterior_mixture(sub_count[fam]); pure_sub->set_name("Kevin's(0.0)"); ret[NumRegs++]=pure_sub; } for (int fam=0; famposterior_mixture(sub_count[fam]); Full_part->set_name("full_part"); for (int c=Full_part->num_components()-1; c>=0; c--) Full_part->set_mixture(c, FullCol->mixture_coeff(c)); ret[NumRegs++] = Full_part; } for (int fam=0; famposterior_mixture(sub_count[fam]); alpha_only->set_name("alpha_only"); for (int c=alpha_only->num_components()-1; c>=0; c--) alpha_only->set_mixture(c, dir->mixture_coeff(c)); ret[NumRegs++] = alpha_only; } ret[NumRegs++] =0; return ret; } void PrintProbs(Regularizer **regs, const float counts[20]) { float probs[20]; int num_regs=0; Regularizer **r; for (r=regs; *r; r++) num_regs++; cout << "\\begin{table}\n" << "\\begin{center}\n" << "\\begin{tabular}{|l" ; for (r=regs; *r; r++) cout << "|l"; cout << "|}\n"; cout << "\\hline\n" << "&\\multicolumn{" << num_regs << "}{c|}{probabilities given "; for (int letter=0; letter<20; letter++) { if (counts[letter]==0) continue; cout << " " << counts[letter] << " " << Alph::ExtAA().full_name(Alph::ExtAA().unindex(letter)) << (counts[letter]>1 ? "s": "") ; } cout << "} \\\\ \n" << "\\hline\n" ; Regularizer ** prev_reg=regs; for (r=regs; *r; r++) { if ((*r)->type() != (*prev_reg)->type()) { cout << "&\\multicolumn{" << (r-prev_reg) << "}{c|}{" << (*prev_reg)->type()->name() << "}"; prev_reg = r; } } cout << "&\\multicolumn{" << (r-prev_reg) << "}{c|}{" << (*prev_reg)->type()->name() << "}"; cout << "\\\\ \n" << "\\hline\n"; for (r=regs; *r; r++) { const char* nm = (*r)->name(); cout << "& " ; for (const char*x=nm; *x; x++) { if (!isalnum(*x)) { switch(*x) {case '\'': case '.': case '-': break; default: cout << "\\"; break; } } cout << *x; } } cout << "\\\\ \n"; const char *BoldFaceThese = "ILV"; // note: rather inefficient to recompute the regularized // probabilities 20 times, but who cares? for(int i=0; i<20; i++) { if (i%5==0) cout << "\\hline\n"; int DoBoldFace=0; char lett= Alph::ExtAA().to_char(Alph::ExtAA().unindex(i)) ; for (const char *c= BoldFaceThese; *c; c++) { if (*c==lett) { DoBoldFace=1; break; } } cout << (DoBoldFace? "\\bf ": "") << lett << " "; for (r=regs; *r; r++) { (*r)->get_probs(counts, probs); cout << (DoBoldFace? "&\\bf ": "& ") << form("%.4f", probs[i]); } cout << "\\\\ \n"; } cout << "\\hline\n" << "\\end{tabular}\n" << "\\end{center}\n" << "\\end{table}\n" << "\\clearpage\n" << "\n\n" ; } void PrintComponentProbs(Regularizer **regs, const float counts[20]) { int max_comp=0; // maximum number of components Regularizer **r; for (r=regs; *r; r++) { if ((*r)->type() == DirichletReg::classID() && max_comp < ((DirichletReg*)(*r))->num_components()) max_comp = ((DirichletReg*)(*r))->num_components(); } double comp_probs[max_comp]; // component probabilities Regularizer *best_reg=0, *worst_reg=0; double low_bits=999999, hi_bits=-1; for (r=regs; *r; r++) { if ((*r)->type() != DirichletReg::classID()) continue; DirichletReg *d = (DirichletReg*)(*r); double SumCounts; d->component_probs(counts, SumCounts, comp_probs); cout << "% Probabilities for the components of " << d->name() << ":\n% "; for(int i=0; inum_components(); i++) cout << form(" %7.4f", comp_probs[i]); cout << "\n"; double bits=-log2(d->Probability(counts)); if (bits< low_bits) { low_bits=bits; best_reg = *r; } if (bits> hi_bits) { hi_bits=bits; worst_reg = *r; } cout << "% encoding cost (-log2(probability(column)) with " << d->name() << " is " << bits << " bits\n"; cout << "\n"; } cout << "% lowest encoding cost " << low_bits << " using " << best_reg->name() << "\n"; cout << "% highest encoding cost " << hi_bits << " using " << worst_reg->name() << "\n"; } Regularizer* read_reg(const char* filename, const char* name, IdObject* reg_type) { ifstream in(filename); if (!in.good()) { cerr << "Error: can't open " << filename << " to read a regularizer\n" << flush; return 0; } NamedClass* x= NamedClass::read_new(in); assert(x->is_a(Regularizer::classID())); assert(x->is_a(reg_type)); Regularizer* r = (Regularizer*) x; r->set_name(name? name:filename); return r; } main(int arc, char**argv) { Alph::set_default(Alph::ExtAA()); DirichletReg* NineComp = (DirichletReg*) read_reg("../estimate-dist/data/BLOCKS/all/mix-all-reopt.9comp", "mix-all-reopt.9comp", DirichletReg::classID()); ifstream in_fam(argv[1]); Regularizer** regs=SubFamilies(NineComp, in_fam); float NumCounts[20]; while (cin.good()) { for (int letter=0; letter< 20; letter++) cin >> NumCounts[letter]; if (!cin.good()) break; PrintProbs(regs, NumCounts); PrintComponentProbs(regs, NumCounts); cout << "\n\f"; } cout << "\n"; } // control information for gnuemacs to simplify compilation // Local Variables: // compile-command: "rsh apache '(cd dna/src/Regularizer; make test-sub; test-sub subfams.count test-sub.out)'" // End: