// ContingencyTable
// Kevin Karplus
// 14 October 2003

#ifndef CONTINGENCYTABLE_H
#define CONTINGENCYTABLE_H

#include <iostream>
using namespace std;

#include <math.h>
#include <assert.h>

#ifndef NAN
#define NAN (0.0/0.0)   // added to handle stupid Linux setups that
                        // omitted NAN from math.h
#endif

// A ContingencyTable is a 2-dimensional array of counts of
// co-occurrences of events.
// Some common operations include
//      getting the marginal counts (for either dimension)
//      getting joint, marginal, and conditional probabilities
//      getting log-odds  log( P(x,y) / (P(x)P(y)) )
//      computing mutual information
//          including corrections and significance for
//          small-sample effects

// STILL TO DO:
//      Input for contingency tables.
//      Computing other measures of dependence between x and y.
//      Rearrange code so that small-sample correction can be done on
//          several statistics at the same time.
//          Herbie Lee recommends looking at "Categorical Data Analysis"
//          by Alan Agresti.
//      Create a derived class AlphabetContingencyTable
//          in which both axes are defined by Alphabets.

class ContingencyTable
{
  protected:
    // data for the contingency table itself
    int XNum, YNum;             // number of events for x and y dimensions
    int *Counts;                // array of XNum*YNum counts
    int *XCounts, *YCounts;     // marginal counts
    int TotalCounts;            // total counts

  protected:
    // computed statistics for the observed table
    bool StatisticsComputed;

    // statistics that depend only on marginals:
    double XEntropy, YEntropy;  // entropy of marginals

    // statistics that depend on whole joint table:
    double XYEntropy;           // entropy of joint
    double MI;                  // mutual information
    double CHI2;                // chi^2
    double NegativeLogProb;     // -log P(Table | marginal counts)
        // encoding cost of table, using hypergeometric distribution
        // (assumes all permutations of input pairings equally likely)

  protected:
    // Sampling information for table.
    int NumSamples;     // if NumSamples is zero, rest of this
        // section is junk and needs to be cleared before using.
    bool FullPopulation;        // if FullPopulation is set, then the
        // numbers in this section are not for a sample, but for
        // the entire population of tables with these marginal counts.

    static const int MaxMoment=4;
    // a_Sum is the sums of a^i
    // (note: a_Sum[0] should be the same for all statistics,
    //  since it is just the number of samples)
    double MI_Sum[MaxMoment+1];         // sums for mutual information
    double CHI2_Sum[MaxMoment+1];       // sums for chi^2
    double NLP_Sum[MaxMoment+1];        // sums for negative log probability
    double XYEntropy_Sum[MaxMoment+1];  // sums for joint entropy

    // The following numbers can be used for estimating P-values
    // of the statistics, when the P-values are fairly large.
    // If the Num*AsGood counts are too small, it is probably
    // better to estimate P-values from moment-matching a
    // probability distribution.
    int NumMIAsGood;    // how many times has random MI >= MI been seen?
    int NumCHI2AsGood;  // how many times has random CHI2 >= CHI2 been seen?
    int NumNLPAsGood;   // how many times has random NLP >= NegativeLogProb been seen?
    int NumXYEntropyAsGood;     // how many times has random XYEntropy <= XYEntropy been seen?

  protected:
    inline int subscript(int x, int y) const
    {   return x*YNum + y;
    }

    // invalidate the cache for statistics, since the table has
    // been modified.
    inline void invalidate_cache(void)
    {   StatisticsComputed=false;
        NumSamples = 0;
        FullPopulation=false;
    }
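    // Illustrative sketch (these helpers are NOT part of the original
    // interface): the power sums and Num*AsGood counters above are meant
    // to be used roughly as follows.  Since MI_Sum[i] holds the sum of
    // MI^i over the sampled scrambles (so MI_Sum[0]==NumSamples), the
    // sample mean and variance of MI fall out of the sums, and the
    // empirical P-value of the observed MI is the fraction of scrambles
    // that did at least as well.
    inline double sampled_MI_mean_sketch(void) const
    {   return NumSamples<=0? NAN : MI_Sum[1]/MI_Sum[0];
    }
    inline double sampled_MI_variance_sketch(void) const
    {   if (NumSamples<=0)      return NAN;
        double mean = MI_Sum[1]/MI_Sum[0];
        return MI_Sum[2]/MI_Sum[0] - mean*mean;
    }
    inline double empirical_MI_pvalue_sketch(void) const
    {   return NumSamples<=0? NAN
            : NumMIAsGood/static_cast<double>(NumSamples);
    }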
    // if the sampling count is 0, then clear the summation tables.
    inline void clear_if_no_samples(void)
    {   if (NumSamples==0)
        {   for(int m=0; m<=MaxMoment; m++)
            {   MI_Sum[m] = CHI2_Sum[m] = NLP_Sum[m] = XYEntropy_Sum[m] = 0;
            }
            NumMIAsGood=0;
            NumCHI2AsGood=0;
            NumNLPAsGood=0;
            NumXYEntropyAsGood=0;
        }
    }

    // compute the statistics that are cached with the table.
    void compute_statistics();

    // return the entropy for the joint distribution
    // for the supplied array (which must have same marginal counts as Counts)
    double raw_xy_entropy(const int* counts) const;

    // return mutual information (in nats) without small-sample corrections
    // for the supplied array (which must have same marginal counts as Counts)
    double raw_mutual_information(const int* counts) const;

    // return the chi^2 statistic for the supplied array
    // (which must have same marginal counts as Counts)
    double raw_chi2(const int* counts) const;

    // return the negative log probability that a table has exactly the given counts.
    // (array must have same marginal counts as Counts)
    double raw_negative_log_prob(const int* counts) const;

    // set scrambled_counts (a pre-allocated array with the same
    // size and interpretation as Counts) to a random contingency table
    // with the same marginals as this.
    void scramble(int *scrambled_counts) const;

    virtual ostream & print_x_label(ostream& out, int x) const
    {   return out << x;
    }
    virtual ostream & print_y_label(ostream& out, int y) const
    {   return out << y;
    }

    // print a header line for x
    inline ostream &print_x_header(ostream &out, bool print_total=true) const
    {   for (int x=0; x<XNum; x++)
        {   out << "\t";
            print_x_label(out, x);
        }
        if (print_total)        out << "\ttotal";
        return out << "\n";
    }

  public:
    // construct an xnum-by-ynum table with all counts zero
    ContingencyTable(int xnum, int ynum);
    virtual ~ContingencyTable(void);

    // accessors for counts
    inline int total_counts(void) const {return TotalCounts;}
    inline int x_count(int x) const
    {   assert(x>=0 && x<XNum);
        return XCounts[x];
    }
    inline int y_count(int y) const
    {   assert(y>=0 && y<YNum);
        return YCounts[y];
    }
    inline int xy_count(int x, int y) const
    {   assert(x>=0 && x<XNum && y>=0 && y<YNum);
        return Counts[subscript(x,y)];
    }

    // probabilities estimated directly from the counts
    inline double xy_prob(int x, int y) const
    {   assert(x>=0 && x<XNum && y>=0 && y<YNum);
        return TotalCounts<=0? NAN
            : (Counts[subscript(x,y)]/static_cast<double>(TotalCounts));
    }
    inline double x_prob(int x) const
    {   return TotalCounts<=0? NAN
            : (XCounts[x]/static_cast<double>(TotalCounts));
    }
    inline double y_prob(int y) const
    {   return TotalCounts<=0? NAN
            : (YCounts[y]/static_cast<double>(TotalCounts));
    }

    // ln( P(x,y)/ (P(x)P(y)) )
    inline double xy_log_odds(int x, int y) const
    {   double joint=xy_count(x,y);
        double indep = XCounts[x]*YCounts[y];
        return indep<=0? NAN: log(joint*TotalCounts / indep);
    }

    // Add to a cell of the contingency table.
    // Return the count in that cell.
    // Note: one can add multiple counts at once, or even negative counts,
    // but no cell should ever be allowed to go negative.
    inline int add_count(int x, int y, int add=1)
    {   assert(x>=0 && x<XNum && y>=0 && y<YNum);
        invalidate_cache();
        int sub=subscript(x,y);
        Counts[sub] += add;
        assert(Counts[sub] >= 0);
        TotalCounts += add;
        XCounts[x] += add;
        YCounts[y] += add;
        return Counts[sub];
    }

    // Return the expected number of counts for a cell if
    // the two random variables are independent.
    // Make double to avoid possible overflow in the computation.
    inline double xy_independent_counts(int x, int y) const
    {   return TotalCounts<=0? 0.0
            : static_cast<double>(XCounts[x]) * static_cast<double>(YCounts[y])
                / static_cast<double>(TotalCounts);
    }

    // Check for trivial tables.
    // A table is trivial if any table with those marginal counts
    // can be mapped to the table simply by renumbering the rows
    // and columns.
    // For trivial tables, all scrambles will have the same statistics.
    bool is_trivial(void) const;

    // Entropy measurements

    // Entropy of marginal probabilities for first variable,
    // in nats.
    inline double x_entropy(void)
    {   if (!StatisticsComputed) { compute_statistics(); }
        return XEntropy;
    }

    // Entropy of marginal probabilities for second variable,
    // in nats.
    inline double y_entropy(void)
    {   if (!StatisticsComputed) { compute_statistics(); }
        return YEntropy;
    }
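    // Illustrative sketch (this helper is NOT in the original header):
    // the plug-in mutual information is just the usual identity
    //     I(X;Y) = H(X) + H(Y) - H(X,Y),
    // so it can be recovered from the three entropy accessors; it should
    // agree with mutual_information() below up to rounding.
    inline double mutual_information_from_entropies_sketch(void)
    {   return x_entropy() + y_entropy() - xy_entropy();
    }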
    // Entropy of joint probability in nats.
    inline double xy_entropy(void)
    {   if (!StatisticsComputed) { compute_statistics(); }
        return XYEntropy;
    }

    // Dependence measures:

    // mutual information
    inline double mutual_information(void)
    {   if (!StatisticsComputed) { compute_statistics(); }
        return MI;
    }

    // Wilks' G^2 = 2 * TotalCounts * mutual information
    // chi^2
    // asymptotically, both of these are supposed to
    // follow a chi^2 distribution with (XNum-1)*(YNum-1) degrees
    // of freedom.
    //
    // According to Alan Agresti [Categorical Data Analysis p. 49]
    // "The chi-squared approximation is usually poor for G^2 when
    // n/IJ < 5.  When I or J is large, it can be decent for chi^2
    // for n/IJ as small as 1, if the table does not contain both
    // very small and moderately large expected frequencies."
    inline double wilks_g2(void)
    {   return TotalCounts<=0? 0.0: 2*TotalCounts*mutual_information();
    }

    // Compute the chi^2 statistic for the contingency table.
    // chi^2 = sum (Counts_x,y - IndepCounts_x,y)^2/(IndepCounts_x,y)
    inline double chi2(void)
    {   if (!StatisticsComputed) { compute_statistics(); }
        return CHI2;
    }

    // Cramer's V^2 = chi^2/(TotalCounts*min(XNum-1, YNum-1))
    inline double Cramer_V2(double chi2_value) const
    {   int min_minus_one = (XNum<YNum? XNum : YNum) - 1;
        return (TotalCounts<=0 || min_minus_one<=0)? 0.0
            : chi2_value/(TotalCounts*min_minus_one);
    }
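    // Example usage (sketch only; assumes the two-argument constructor
    // declared above, with a hypothetical 2x2 table of counts):
    //     ContingencyTable t(2,2);
    //     t.add_count(0,0,30);  t.add_count(0,1,10);
    //     t.add_count(1,0,5);   t.add_count(1,1,25);
    //     double mi = t.mutual_information();  // in nats
    //     double g2 = t.wilks_g2();            // = 2*TotalCounts*MI
    //     double x2 = t.chi2();
    //     double v2 = t.Cramer_V2(x2);         // normalized to [0,1]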