/* * gen_sequence.cc program for generating random sequences of * amino acids that have lengths and compositions * similar to sequences in real protein databases. * Copyright (C) 2000,2005 Kevin Karplus * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; * version 2.1 of the License. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * See http://www.gnu.org/copyleft/lesser.html for the license details, * or write to the Free Software Foundation, Inc., 59 Temple Place, Suite * 330, Boston, MA 02111-1307 USA */ /* gen_sequence.c * Kevin Karplus 7 Nov 2000 * * This program generates random amino acid sequences. * It takes one command-line argument, the number of sequences to * generate, and puts them out in FASTA format to standard out. * * The length of each sequence is taken from a discretized log-normal * distribution that was fit to the sequences in RSDB-60 [rsdb]. * * The amino acids of a sequence are generated by an independent, * identically distributed process. The probabilities for that * distribution are selected from a mixture of Dirichlet densities. * The mixture of Dirichlet densities was also trained on RSDB-60. * The particular mixture chosen here is not the best fit, but a * compromise between the number of components and the fit. * * A slightly more sophisticated generator (that accepts parameters for * different lognormal distributions and different Dirichlet mixtures * for the composition) will be included in a future release of the SAM * software suite. * * *[rsdb] Park J, Holm L, Heger A, Chothia C * RSDB: representative protein sequence databases have high * information content * Bioinformatics 2000 May;16(5):458-64 * ftp://ftp.ebi.ac.uk/pub/contrib/jong/RSDB/Ver_1999_8_30/ */ #include #include #include #include /* used for getpid to initialize random */ #include using namespace std; #include "Utilities/Random.h" #include "gen_norm.h" #include "gen_dirch_mix.h" const double mean_log_length= 5.4151; const double stddev_log_length= 1.03632564; const char *AA= "ACDEFGHIKLMNPQRSTVWY"; double mix_coeff[6] = {0.272155, 0.259417, 0.170403, 0.139492, 0.0813267, 0.0772063}; double comp0[20]={ 44.1267, 8.54778, 34.3287, 45.012, 25.4885, 40.1706, 13.8165, 37.9695, 40.2287, 57.0279, 15.7816, 24.6325, 26.6716, 21.0461, 33.2734, 39.015, 31.1269, 43.4722, 6.95188, 19.8187}; double comp1[20]={ 12.9502, 3.56352, 9.54209, 10.7789, 7.22886, 11.7739, 4.45173, 9.25549, 10.113, 16.1048, 4.26184, 8.89566, 10.1378, 8.39929, 9.83732, 16.2176, 11.391, 11.4088, 2.43706, 5.69262}; double comp2[20]={ 17.2777, 5.02666, 20.633, 27.1002, 18.6022, 16.5669, 7.46735, 29.459, 33.201, 36.5843, 8.33165, 23.2366, 12.2568, 12.8126, 15.8195, 26.7285, 18.5344, 20.7154, 3.5349, 14.591}; double comp3[20]={ 36.82, 3.69397, 16.532, 17.1343, 9.09368, 25.9803, 7.44929, 11.6098, 7.16141, 31.5673, 6.2929, 6.50351, 17.932, 9.98346, 24.4647, 17.2535, 16.1794, 23.996, 4.379, 6.65627}; double comp4[20]={ 2.4706, 0.588751, 1.33572, 1.65825, 1.08549, 2.23149, 0.722458, 1.4641, 1.70528, 2.49194, 0.900071, 1.33377, 1.84355, 1.25755, 1.65407, 2.63801, 1.88732, 1.84009, 0.399507, 0.92818}; double comp5[20]={ 25.0801, 4.5497, 8.56496, 10.2153, 24.5674, 21.9557, 5.42381, 29.6721, 13.1884, 43.4575, 10.6261, 11.4957, 13.0346, 8.10884, 11.8077, 25.3051, 18.1739, 25.4028, 5.88753, 12.6309}; double *comps[6] = {comp0, comp1, comp2, comp3, comp4, comp5}; int main(int argc, const char **argv) { int count=1; // number of sequences to generate double probs[20]; /* probability mass function for AAs */ double cum_probs[20]; /* probability distribution * (cumulative sum of probs) */ if (argc!=2) { cerr << "ERROR: test_gen_sequence requires exactly one argument,\n" << "the number of random sequences to generate.\n"; } else count=atoi(argv[1]); set_random(getpid()); gen_dirch_mix comp_gen(20, 6,mix_coeff, const_cast(comps)); for (int s=0; srseq_" << s << "\n"; int length; // length of seqence length = static_cast(exp(mean_log_length +stddev_log_length*gen_norm())); if (length==0) length=1; comp_gen.generate(probs); double sum = 0.; /* temporary for computing cum_probs */ /* convert probs to cumulative probabilities */ for (int a=0; a<20; a++) { sum += probs[a]; cum_probs[a] =sum; } for (int p=0; p< length; p++) { int aa = get_random_given_cum_weights(20, cum_probs); cout << AA[aa]; if (p%50 == 49) cout << "\n"; } cout << "\n"; } return 0; } // CHANGE LOG: // Wed Jul 20 18:54:08 PDT 2005 Kevin Karplus // Converted from c to c++ // Thu Jul 21 10:49:25 PDT 2005 Kevin Karplus // changed to remove extra value at beginning of component arrays.