/* * test_GenSequence.cc program for generating random sequences of * amino acids that have lengths and compositions * similar to sequences in real protein databases. * Copyright (C) 2000,2005 Kevin Karplus * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; * version 2.1 of the License. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * See http://www.gnu.org/copyleft/lesser.html for the license details, * or write to the Free Software Foundation, Inc., 59 Temple Place, Suite * 330, Boston, MA 02111-1307 USA */ /* gen_sequence.c * Kevin Karplus 7 Nov 2000 * * This program generates random amino acid sequences. * It takes one command-line argument, the number of sequences to * generate, and puts them out in FASTA format to standard out. * * The length of each sequence is taken from a discretized log-normal * distribution that was fit to the sequences in RSDB-60 [rsdb]. * * The amino acids of a sequence are generated by an independent, * identically distributed process. The probabilities for that * distribution are selected from a mixture of Dirichlet densities. * The mixture of Dirichlet densities was also trained on RSDB-60. * The particular mixture chosen here is not the best fit, but a * compromise between the number of components and the fit. * * A slightly more sophisticated generator (that accepts parameters for * different lognormal distributions and different Dirichlet mixtures * for the composition) will be included in a future release of the SAM * software suite. * * *[rsdb] Park J, Holm L, Heger A, Chothia C * RSDB: representative protein sequence databases have high * information content * Bioinformatics 2000 May;16(5):458-64 * ftp://ftp.ebi.ac.uk/pub/contrib/jong/RSDB/Ver_1999_8_30/ */ #include #include #include #include /* used for getpid to initialize random */ #include using namespace std; #include "Filenames/Filenames.h" #include "Alphabet/Alph.h" // get ExtAA alphabet #include "Regularizer/DirichletReg.h" #include "Utilities/Random.h" #include "GenSequence.h" int main(int argc, const char **argv) { if (argc!=3) { cerr << "ERROR: test_gen_sequence requires exactly 2 arguments:\n" << "\tthe number of random sequences to generate and\n" << "\ta file with the Dirichlet regularizer for the composition.\n"; return 1; } int count=atoi(argv[1]); // number of sequences to generate Alphabet::set_default(Alph::ExtAA()); gzifstream *reg_in = Filenames::open_input(argv[2]); if (!reg_in) return 1; Regularizer *r= Regularizer::read_new(*reg_in); delete reg_in; if (!r || !r->is_a(DirichletReg::classID())) { cerr << "ERROR: no DirichletReg found in " << argv[2] << "\n"; return 1; } GenSequence gen(static_cast(r), 237., 175.); set_random(getpid()); vector seq; for (int s=0; sseq_"<< s<< "\n"; for (int p=0; p< length; p++) { cout << seq[p]; if (p%50 == 49) cout << "\n"; } cout << "\n"; } return 0; } // CHANGE LOG: // Thu Jul 21 10:35:59 PDT 2005 Kevin Karplus // New program (similer to old test_gen_sequence.cc) // to test GenSequence class