// Feature Partition method as a Regularizer

#ifndef FEATUREReg_H
#define FEATUREReg_H

#include "Regularizer.h"
#include <assert.h>	// for assert() in the inline partition bookkeeping below

class FeaturePartition
{
    const AlphabetTuple *alph;	// not owned by FeaturePartition
    int *WhichFeature;		// WhichFeature[x] is feature number for
				// xth letter of alphabet
    int NumFeatures;		// 0 <= feature number < NumFeatures
    int NumUndefined;		// number of letters with WhichFeature[x]
				// still undefined
public:
    FeaturePartition(const AlphabetTuple *a);
    FeaturePartition(const FeaturePartition& p);
    ~FeaturePartition() { delete [] WhichFeature; }

    int OK(void) const {return NumUndefined==0;}

    const AlphabetTuple* alphabet_tuple(void) const {return alph;}

    int which_feature(int i) const {return WhichFeature[i];}
    void set_feature(int letter, int which);
    int& which_feature(const BaseTuple bt)
    {	return WhichFeature[alph->index(bt)];
    }
    int num_features(void) const {return NumFeatures;}

    // fill in count array "reduced" for the reduced partition
    void ReduceCounts(const float* counts, float ZeroOffset, float *reduced) const
    {	for (int j=num_features()-1; j>=0; j--)
	    reduced[j] = ZeroOffset;
	for (int k=alph->num_normal()-1; k>=0; k--)
	    reduced[which_feature(k)] += counts[k];
    }

    void print(ostream &out) const;	// alphabet not output
    void read(istream &in);		// alphabet must already be known!
};

inline ostream& operator << (ostream &out, const FeaturePartition& f)
{    f.print(out);
     return out;
}

// Try to find a feature partition to add to the Regularizer that will
// improve its performance on predicting probabilities from
// samples of size one.
// Require min_features <= number of features <= max_features.
// Summary[i*alphabet_size() + j] is the frequency of character
// i, having seen a sample containing character j.
// Also return the zero-offset to use through best_z.
extern FeaturePartition* best_feature_partition(Regularizer *d,
	const float *Summary,
	int min_features, int max_features,
	float &best_z);
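// Example of how a FeaturePartition reduces per-letter counts to per-feature
// counts (a sketch only, not part of the library interface).  It assumes a
// 4-letter DNA alphabet in the order A,C,G,T, and assumes that set_feature()
// records the assignment and updates NumFeatures/NumUndefined accordingly;
// the pointer "dna" is a hypothetical AlphabetTuple set up elsewhere.
//
//	FeaturePartition purine_pyrimidine(dna);
//	purine_pyrimidine.set_feature(0, 0);	// A -> feature 0 (purine)
//	purine_pyrimidine.set_feature(2, 0);	// G -> feature 0 (purine)
//	purine_pyrimidine.set_feature(1, 1);	// C -> feature 1 (pyrimidine)
//	purine_pyrimidine.set_feature(3, 1);	// T -> feature 1 (pyrimidine)
//	assert(purine_pyrimidine.OK());		// every letter now has a feature
//
//	float counts[4] = {3, 1, 2, 0};		// observed A,C,G,T counts
//	float reduced[2];
//	purine_pyrimidine.ReduceCounts(counts, 0.5, reduced);
//	// reduced[0] == 0.5 + 3 + 2 == 5.5	(purines plus the zero offset)
//	// reduced[1] == 0.5 + 1 + 0 == 1.5	(pyrimidines plus the zero offset)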
class FeatureReg : public Regularizer
{
    enum { MAX_ALPHS=100 };
    FeaturePartition* parts[MAX_ALPHS];
		// array of (up to MAX_ALPHS) feature partitions
    int NumAlphs;		// number of feature partitions actually used
    float ZeroOffset[MAX_ALPHS];
    double SumReducedCounts[MAX_ALPHS];
		// cache of the sum of counts (reduced for each feature partition)
		// for the last set of counts presented to get_modified_counts

    static IdObject ID;
    static NameToPtr* CommandTable;

    void write_knowing_type(ostream &out) const;
    void init_command_table(void);
    NameToPtr *command_table(void) {return CommandTable;}
public:
    FeatureReg(void) : Regularizer() {NumAlphs=0;}
    FeatureReg(const Alphabet *a, istream &in, const char *nm)
	: Regularizer(a,nm)
    {	read_knowing_type(in);
    }
    FeatureReg(const Alphabet *a, const char* nm=0)
	: Regularizer(a,nm)
    {	NumAlphs=0;
    }
    FeatureReg(const AlphabetTuple *a, const char* nm=0)
	: Regularizer(a,nm)
    {	NumAlphs=0;
    }
    Regularizer *copy(void) const;
    ~FeatureReg()
    {	for (int i=NumAlphs-1; i>=0; i--)
	    delete parts[i];
    }

    static IdObject* classID(void) {return &ID;}
    virtual IdObject* type(void) const {return &ID;}

    void print_info(ostream &out) const
    {	Regularizer::print_info(out);
	out << " (" << NumAlphs << " partitions)";
    }

    void get_modified_counts(
	const float* TrainCounts,	// what you use as counts
	float* probs);			// you fill this in with probs.

    int num_alphs(void) const {return NumAlphs;}
    void set_zero_offset(int i, float f) {ZeroOffset[i] = f;}
    float zero_offset(int i) const {return ZeroOffset[i];}
    const FeaturePartition *partition(int i) const
    {	return parts[i];
    }

    void add_partition(FeaturePartition *f, float z=1.)
	// Adds partition--does NOT make copy.
	// Consider the FeatureReg the owner of the object.
    {	assert(NumAlphs < MAX_ALPHS);
	ZeroOffset[NumAlphs] = z;
	parts[NumAlphs++] = f;
    }

    FeaturePartition* remove_partition(int delete_this)
	// Removes partition delete_this and returns it.
	// The caller becomes the owner of the returned object.
    {	assert(delete_this >= 0 && delete_this < NumAlphs);
	FeaturePartition *f = parts[delete_this];
	NumAlphs--;
	// move last partition into position of popped one
	parts[delete_this] = parts[NumAlphs];
	ZeroOffset[delete_this] = ZeroOffset[NumAlphs];
	return f;
    }

    void add_best_partition(const float *Summary,
	    int min_features=2, int max_features=0);
	// Try to find a feature partition to add to the Regularizer that will
	// improve its performance on predicting probabilities from
	// samples of size one.
	// min_features <= number of features <= max_features
	// (unless max_features==0, then min_features <= number of features)
	// Summary[i*alphabet_size() + j] is the frequency of character
	// i, having seen a sample containing character j.

protected:
    int num_parameters(void) const { return NumAlphs; }
    float parameter(int i) const { return ZeroOffset[i]; }
    void set_parameter(int i, float p) { ZeroOffset[i] = p; }
    float max_parameter(int i) const { return 1000.0; }
    void partials1(float *part1, int letter, const float* Counts);
    void partials2(float *part1, float *part2, int letter, const float* Counts);
};

// CHANGE LOG:
// 2 May 1995 Kevin Karplus
//	Renamed FeatureAlphabet FeaturePartition
// 5 Dec 1995 Kevin Karplus
//	Fixed void constructor to make NumAlphs 0
// 5 January 1996 Kevin Karplus
//	Added SumReducedCounts cache, partials2, removed zero_second_deriv,
//	since new normalization scheme for modified counts does not have
//	a zero second partial (neither did the old, really).
// 17 Oct 1996 Spencer Tu
//	Changed FeatureReg::MAX_ALPHS from type (const int) member
//	to an element of an enum type.

#endif
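// Example use of FeatureReg (a sketch only; it assumes an Alphabet* "aa" and a
// Summary table built elsewhere, and that the inherited Regularizer interface
// works over an alphabet of 20 letters here -- none of that is defined in this
// header, so treat the names and sizes as hypothetical):
//
//	FeatureReg reg(aa, "my-feature-reg");
//	// Summary[i*20 + j] = frequency of character i, having seen a
//	// sample containing character j (gathered from training data).
//	reg.add_best_partition(Summary);	// min_features=2, no upper bound
//	reg.add_best_partition(Summary, 2, 4);	// second partition, at most 4 features
//
//	float counts[20];	// observed counts for one column/sample
//	float probs[20];	// filled in by get_modified_counts
//	// ... fill counts from the data, then:
//	reg.get_modified_counts(counts, probs);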