// alignment.cc #include #include // for atoi #include #include #include "Input/Input.h" #include "alignment.h" #include "EqualStrings/EqualStrings.h" #include "Filenames/Filenames.h" static char *copy_string(const char *s) { char *copy; if (s==0) { copy= new char[1]; copy[0] = 0; return copy; } copy=new char[strlen(s)+1]; strcpy(copy, s); return copy; } void alignment::delete_all(void) { delete [] alignmentID; delete [] Filename; if (names) { for (int i=num_seqs-1; i>=0; i--) delete [] names[i]; delete [] names; } if (weights) delete [] weights; if (data) { for (int i=num_seqs-1; i>=0; i--) delete [] data[i]; delete [] data; } if (insert_before) { for (int i=num_seqs-1; i>=0; i--) delete [] insert_before[i]; delete [] insert_before; } clear(); } void alignment::alloc(void) { assert(names==0); // not already allocated if (width==0 || num_seqs==0) { cerr << "Error: trying to allocate a block for " << num_seqs << " sequences, each " << width << " long.\n" << flush; return; } typedef char* charStar; names = new charStar[num_seqs]; weights = new float[num_seqs]; typedef Base* BaseStar; data = new BaseStar[num_seqs]; insert_before = new charStar[num_seqs]; int i; for (i=num_seqs-1; i>=0; i--) { names[i] =0; data[i] = new Base[width]; assert(data[i] !=NULL); insert_before[i] = new char[width+1]; assert(insert_before[i] !=NULL); for (int j=0; j<=width; ++j) {insert_before[i][j] = 0;} } } void alignment::alloc_for_a2m(istream &in) { char buf[39999]; char c=0; delete_all(); for(;;) { if (c!='>') SkipSeparators(in, 1, '>'); // find a > character if (!in.good()) break; get_word(in,buf,'\n',1); // buf has the sequence name (plus possibly ",junk") for (int w=0; buf[w]; w++) { if (buf[w] == ',') { buf[w] = 0; break; } } int this_seq_length=0, gap_length=0; for(in >> c; in.good() && c!='>' && c!=';' ; in>>c) { if (c=='-' || isupper(c)) this_seq_length++; if (c=='-') gap_length++; } if (this_seq_length==0) { cerr << "Sequence named " << buf << " has no alignment columns." << "\nIt will be ignored.\n" << flush; continue; } if (this_seq_length==gap_length) { cerr << "Sequence named " << buf << " has all gap columns." << "\nIt will be ignored.\n" << flush; continue; } if (num_seqs >0 && this_seq_length!=width) { cerr << "Sequence " << buf << " has " << this_seq_length << " alignment columns,\n" << "but previously alignment width set to " << width << "\n" << flush; } if (width< this_seq_length) width=this_seq_length; num_seqs++; } alloc(); } // Read an alignment in SAM's a2m format // That is, a fasta file in which // match states are indicated by uppercase letters or -, // and insert states by lowercase letters or . // Ignore the insert states. // Like with MSSP files, the file is read twice---the first time to // get the size, the second time to read the data. void alignment::read_a2m(const char*filename, const AlphabetTuple *alphat) { assert(alphat!=NULL); assert(alphat->num_alphabets()==1); alphabet = (*alphat)[0]; gzifstream *in1p = Filenames::open_input_fullname(filename); if (in1p==NULL || !in1p->good()) { cerr << "Can't open A2M file " << filename << " for input\n" << flush; delete in1p; delete_all(); return; } alloc_for_a2m((*in1p)); in1p->close(); delete in1p; if (width<=0 || num_seqs<=0) { cerr << "A2M file not well formed: " << num_seqs << " sequences " << width << " long" << "\n" << flush; delete_all(); return; } Filename = copy_string(filename); // remove any directory names and extension from filename to get ID char *tail = strrchr(filename, '/'); alignmentID = copy_string(tail? tail+1: filename); tail = strrchr(alignmentID, '.'); if (tail) *tail='\0'; gzifstream *inp = Filenames::open_input_fullname(filename); char buf[39999]; char c=0; int s; for (s=0; s'); // find a > character if (!inp->good()) break; get_word((*inp),buf,'\n',1); // buf has the sequence name (plus possibly ",junk") tail =strchr(buf,','); if (tail) *tail='\0'; names[s] = copy_string(buf); int len=0; for((*inp) >> c; inp->good() && c!='>' && c!=';' ; (*inp)>>c) { if (c=='-') // <-remove 2nd data[s][len++] = Base::null(); else if (isupper(c)) data[s][len++] = alphabet->to_base(c); else if (islower(c)) insert_before[s][len]=1; } if (len==0) s--; // that was an empty sequence, ignore it. else { while(lengood() = " << inp->good() << "\n" << flush; } inp->close(); delete inp; } // Report the weights in a format suitable for SAM void alignment::report_weights(ostream &out) const { out << "% total sequence weight = " << total_weight() << "\n"; float minw=1e25, maxw=-1e25, sum=0; int s; for (s=0; s maxw) maxw=weights[s]; } out << "% sequence weight min= " << minw << " ave= " << (num_seqs? sum/num_seqs: 1.) << " max= " << maxw << "\n"; out << "Sequence weights computed by estimate-dist\n"; out << num_seqs << " 1\n"; for (s=0; snull(); return base_seq; } // CHANGE LOG // 20 July 1997 Kevin Karplus // borrowed from estimate-dist, removed MSSP and BLOCKS formats // added num_insert counter, renamed data type. // 23 July 1997 Kevin Karplus // Replaced num_insert[width+1] with insert_before[seqs][width+1] // flagging which sequences have inserts. // 3 August 1997 Kevin Karplus // Fixed bug in reading sequence name followed by new line. // 30 April 1998 Kevin Karplus // Eliminated accessibility from alignment. // 21 April 2004 Sol Katzman // Changed first_sequence to Base* from char*