// entropy.cc // Kevin Karplus // 7 Oct 1995 // compute the entropy of a substitution matrix or other regularizer // entropy defined as // sum_{i,j} p(i | j) p(j) log_2(p(i|j) / p(i)) // with sums over the amino acids // Note: this formula comes from Altschul, and is copied from // the Henikoff's matrix.c comparison program // Compare with "expected entropy" // sum_{i,j} p(i) p(j) log_2(p(i|j) / p(i)) // The background probabilities are read from files--- // currently dayhoff, jones, and blosum62 frequencies are used. #include #include "SubstPseudoReg.h" #include "GribskovReg.h" #include "DirichletReg.h" #include "MLZReg.h" #include "MLPReg.h" #include "Alphabet/Alphabet.h" #include "Alphabet/Alph.h" #include "Utilities/log2.h" #include "Utilities/IOSmacros.h" const float Zero[20] ={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; // read an MLPReg regularizer file and use it to get background frequencies const float *ReadFreq(const char* filename) { ifstream in(filename); MLPReg reg(&Alph::ExtAA(), in, "reg"); in.close(); float *freq = new float[20]; reg.get_probs(Zero, freq); return const_cast(freq); } void print_entropy(Regularizer *r, const float* freq, double scale=1.) { // array to regularize (one non-zero count, set to scale) float counts[20]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; float probs[20]; // regularized probabilities p(i|j) double ent=0.0; // sum p(i,j) log (p(i|j)/ p(i)) double expected_ent=0.0; // sum p(i)p(j) log (p(i|j)/ p(i)) double self_ent=0.0; // -sum p(i) log (p(i|i)) for (int j=19; j>=0; j--) { counts[j] = scale; r->get_probs(counts,probs); counts[j] = 0; self_ent -= freq[j]*log2(probs[j]); for(int i=19; i>=0; i--) { double score = log2(probs[i]/freq[i]); ent += probs[i] * freq[j] * score; expected_ent += freq[i] * freq[j] * score; } } cout << r->name() << "\t" << IOSFIXED << setw(8) << setprecision(4) << scale << " " << setw(8) << setprecision(5) << ent << " " << setw(8) << setprecision(5) << expected_ent << " " << setw(8) << setprecision(5) << ent-expected_ent << " " << setw(8) << setprecision(5) << self_ent << "\n"; } void compare_entropies(const float* freq, Regularizer* reg1, Regularizer* reg2) { cout << "# name\tscale\tentropy(1st-2nd)\texpected entropy" << "\tscore gain\tentropy(self)\n"; print_entropy(reg1, freq, 1.0); for (double scale=0.001; scale<101; scale*=10) { for (int k=1; k<=9; k++) { print_entropy(reg2, freq, k*scale); if (k==1) print_entropy(reg2, freq, 1.5*scale); } } } Regularizer* read_reg(const char* filename, const char* name, IdObject* reg_type) { ifstream in(filename); if (!in.good()) { cerr << "Error: can't open " << filename << " to read a regularizer\n" << flush; return 0; } NamedClass* x= NamedClass::read_new(in); assert(x->is_a(Regularizer::classID())); assert(x->is_a(reg_type)); Regularizer* r = dynamic_cast(x); r->set_name(name); return r; } int main(int, char**) { Alph::set_default(Alph::ExtAA()); const float *DayhoffFreq=ReadFreq("../estimate-dist/data/dayhoff.counts"); const float *JonesFreq=ReadFreq("../estimate-dist/data/jones.counts"); const float *Blosum62Freq=ReadFreq("../estimate-dist/data/blosum62.counts"); Regularizer* NineComp = read_reg("../estimate-dist/data/mix-9comp.plib", "mix-9-comp", DirichletReg::classID()); Regularizer* Blosum62= read_reg("../estimate-dist/data/blosum62.freq","blosum62", SubstPseudoReg::classID()); cout << "# Print entropy in bits for different regularizers\n" << "# assuming different background frequencies\n" << "\n" << "# entropy(1st-2nd) is sum p(i,j) log (p(i|j)/ p(i))\n" << "# expected entropy is sum p(i)p(j) log (p(i|j)/ p(i))\n" << "# score gain is entropy(1st-2nd) - expected entropy\n" << "# self-entropy is -sum p(i) log (p(i|i))\n" << "\n"; float NineCompFreq[20]; NineComp->get_probs(Zero, NineCompFreq); cout << "# Using NineComp frequencies\n"; compare_entropies(NineCompFreq, Blosum62, NineComp); cout << "#\f\n"; cout << "# Using Blosum62 frequencies\n"; compare_entropies(Blosum62Freq, Blosum62, NineComp); cout << "#\f\n"; cout << "# Using Dayhoff frequencies\n"; compare_entropies(DayhoffFreq, Blosum62, NineComp); cout << "#\f\n"; cout << "# Using Jones frequencies\n"; compare_entropies(JonesFreq, Blosum62, NineComp); } //CHANGE LOG: // 9 April 2004 Kevin Karplus // Changed include because log2.h moved to Utilities/. // Fri Dec 14 12:03:01 PST 2007 Kevin Karplus // Changed old-style cast // Made main() declaration more complete // Fri Dec 14 12:08:14 PST 2007 Kevin Karplus // Added IOSmacros and removed form()