// ContingencyTable.cc // Kevin Karplus // 14 October 2003 #include #include #include "Utilities/Random.h" #include "incomplete-gamma.h" #include "ContingencyTable.h" #include "Utilities/IOSmacros.h" // A ContingencyTable is a 2-dimensional array of counts of // co-occurences of events. // Some common operations include // getting the marginal counts (for either dimension) // getting joint, marginal, and conditional probabilties // getting log-odds log ( P(x,y)/ (P(x)P(y)) ) // computing mutual information // computing other measures of dependence between x and y // Create a new contingency table, and set all counts to 0. ContingencyTable::ContingencyTable(int x, int y) { assert(x>0 && y>0); XNum=x; YNum=y; TotalCounts=0; XCounts = new int[x]; for(int i=0; i=0; i--) Counts[i] = 0; invalidate_cache(); } void ContingencyTable::compute_statistics(void) { XEntropy=0; for (int x=0; x0) XEntropy -= prob * log(prob); } YEntropy=0; for (int y=0; y0) YEntropy -= prob * log(prob); } XYEntropy = raw_xy_entropy(Counts); MI = raw_mutual_information(Counts); CHI2 = raw_chi2(Counts); NegativeLogProb = raw_negative_log_prob(Counts); clear_if_no_samples(); StatisticsComputed=true; } // return the entropy for the joint distribution // for the supplied array (which must have same marginal counts as Counts) double ContingencyTable::raw_xy_entropy(const int* cnts) const { if (TotalCounts==0) return 0; double tc = static_cast(TotalCounts); double ent=0; for (int x=0; x 0 ) { double prob= count/tc; ent -= prob * log(prob); } } } return ent; } // return mutual information (in nats) without small-sample corrections // for the supplied array (which must have same marginal counts as Counts) double ContingencyTable::raw_mutual_information(const int* cnts) const { double mi=0.; if (TotalCounts==0) return mi; for (int x=XNum-1; x>=0; x--) { for (int y=YNum-1; y>=0; y--) { int count = cnts[subscript(x,y)]; if (count==0) continue; double indep = XCounts[x]*YCounts[y]; assert(indep >0); mi += count/ static_cast(TotalCounts) *log(count*(TotalCounts/indep)); } } return mi; } // Compute the chi^2 statistic for the contingency table. // chi^2 = sum (Counts_x,y - IndepCounts_x,y)^2/(IndepCounts_x,y) double ContingencyTable::raw_chi2(const int* cnts) const { double sum=0; for (int x=0; x0); double diff = cnts[subscript(x,y)] - ind_xy; sum += diff*diff / ind_xy; } } return sum; } double ContingencyTable::raw_negative_log_prob(const int* cnts) const { // Probability of table = prod_x(XCounts[x]!) prod_y(YCounts[y]!) / // (TotalCounts! prod_x,y(Counts[x,y]!)) double sum=lgamma(TotalCounts+1); for (int x=0; x 1 ) sum -= lgamma(XCounts[x] + 1); } for (int y=0; y 1 ) sum -= lgamma(YCounts[y] + 1); } for (int x=XNum-1; x>=0; x--) { for (int y=YNum-1; y>=0; y--) { int count = cnts[subscript(x,y)]; if (count>1) sum += lgamma(count+1); } } return sum; } // set scrambled_counts (a pre-allocated array with the same // size and interpretation as Counts) to a random contingency table // with the same marginals as this. void ContingencyTable::scramble(int *scrambled_counts) const { // This routine creates a random contingency table with fixed // marginal counts, so forms the core of the small-sample corrections. // It needs to be accurate and fairly efficient. // clear the array. for(int i=XNum*YNum-1; i>=0; i--) scrambled_counts[i] = 0; // one approach: keep track of "remaining" Y events // for each X event, do XCounts[x] random draws (without // replacement) from remaining Y events. // This requires TotalCounts calls to a random number generator, // and up to YNum comparisons for each. int * RemainingY = new int[YNum]; for (int y=YNum-1; y>=0; y--) RemainingY[y] = YCounts[y]; int RemainingCounts = TotalCounts; for (int x=0; x0); int rand = irandom(RemainingCounts); int y; for (y=0; y=RemainingY[y]; y++) { rand -= RemainingY[y]; } assert( y=0; i--) { int c=cnts[i]; if (c==0) continue; if (c>1) all_different=false; num_nonzero += 1; if (num_nonzero>1 && !all_different) return false; } // #ifdef DEBUG // cerr << "DEBUG: num_nonzero= " << num_nonzero // << " all_different= " << all_different // << "\n"; // #endif return num_nonzero<=1 || all_different; } bool ContingencyTable::is_trivial(void) const { return all_same_or_all_different(XCounts,XNum) || all_same_or_all_different(YCounts,YNum) ; } // Add more samples to the sampling done and compute the sums // of powers of the statistics. // If out is not NULL, then output lines containing the sample values of // mutual information, chi^2, negative_log_prob, and xy_entropy. void ContingencyTable::sample_tables(int num_samples, ostream *out) { // First get the observed values and make sure that the summation // tables have been cleared if there are no counts. if (!StatisticsComputed) { compute_statistics(); } if (is_trivial()) return; // don't do scrambles for trivial tables int * scrambled_counts = new int[XNum*YNum]; if (out) { (*out) << "# mi\tchi2\tneg_log_prob\txy_entropy\n"; } for (int s=0; s=1); NumSamples ++; MI_Sum[0]++; MI_Sum[1]+=mi; CHI2_Sum[0]++; CHI2_Sum[1]+=c2; NLP_Sum[0]++; NLP_Sum[1]+=nlp; XYEntropy_Sum[0]++; XYEntropy_Sum[1]+=xy; for(int m=2; m<=MaxMoment; m++) { MI_Sum[m] += pow(mi, m); CHI2_Sum[m] += pow(c2, m); NLP_Sum[m] += pow(nlp, m); XYEntropy_Sum[m] += pow(xy, m); } if (mi >= MI) NumMIAsGood ++; if (c2 >= CHI2) NumCHI2AsGood ++; if (nlp >= NegativeLogProb) NumNLPAsGood ++; if (xy <= XYEntropy) NumXYEntropyAsGood ++; } delete [] scrambled_counts; } // Computes the mutual information of a table (in nats). // Scrambles a copy of the contingency table repeatedly to get // a small-sample correction. The scrambles have to maintain // the marginal counts. // It does at least min_scrambles and at most max_scrambles scrambles // (actually, does a multiple of min_scrambles, but not less than // max_scrambles+min_scrambles). // Early termination can be caused by getting a random MI higher than the // observed MI for the real table. // Returns // the corrected mutual information, which is the raw // minus the mean of the MI for the scrambled tables // the raw mutual information (uncorrected) // the probability of seeing such a high MI from a scrambled table. // If out is not NULL, then random mutual information values will // be printed one per line to out. double ContingencyTable::corrected_mutual_information( double& raw, double& P_value, double& log_P_value, int max_scrambles, int min_scrambles, std::ostream *out) { log_P_value = NAN; // P_value undefined if no scrambles done. P_value = NAN; // P_value undefined if no scrambles done. raw = raw_mutual_information(Counts); #ifdef DEBUG cerr << "DEBUG: min_scrambles=" << min_scrambles << " max_scrambles=" << max_scrambles << "\n" ; #endif if (is_trivial()) { log_P_value = 0.0; P_value = 1.0; #ifdef DEBUG cerr << "DEBUG: trivial table\n"; #endif return 0; } // I could have a test here for very small raw mutual information // (like raw < 1.e-6), but would it ever be useful---how often // does that happen on non-trivial tables? // Do at least min_scrambles more than before, // up to max_scrambles. // This version does not stop immediately on seeing // a better "random" table, but always does a multiple of // min_scrambles tests. sample_tables(min_scrambles, out); while (NumSamples < max_scrambles && NumMIAsGood==0) { sample_tables(min_scrambles, out); } if (NumSamples <=0) { return raw; } assert(MI_Sum[0] == NumSamples); assert(NumSamples > 0); double mean_random = MI_Sum[1]/NumSamples; #ifdef DEBUG cerr << "DEBUG: raw=" << raw << " NumSamples=" << NumSamples << " mean_random=" << mean_random << " corrected=" << raw-mean_random << "\n" ; #endif if (NumSamples<10 || mean_random==0 || raw<= 0.0001*mean_random) { // variance/mean undefined, because insufficient scrambles // or raw mutual information indicates (near)perfect indepdence, log_P_value = 0.0; P_value = 1.0; return raw-mean_random; } double var_random = (MI_Sum[2]*NumSamples- MI_Sum[1]*MI_Sum[1]) /(NumSamples*(NumSamples-1)); double sigma = var_random/mean_random; log_P_value = sigma<=1.e-6? 0.0: LogGammaQ(mean_random/sigma, raw/sigma); P_value = exp(log_P_value); #ifdef DEBUG cerr << "\tDEBUG: P_value=" << P_value << "\n" ; #endif return raw-mean_random; } // print the counts as a tab-separated table, // with x and y axes labeled (include row and column totals) ostream& ContingencyTable::print_counts(ostream &out) const { print_x_header(out); for (int y=0; y