00001 #ifndef BIOSEQ_H
00002 #define BIOSEQ_H
00003
00012 #include "codon.h"
00013 #include <string>
00014 #include <iostream>
00015 #include <map>
00016 #include <algorithm>
00017 #include <list>
00018 #include "Range.h"
00019 #include <vector>
00020 #include "Interval.h"
00021
00022 using namespace std;
00023
00024 class greaterRangeLength {
00025 public:
00026 bool operator()(const Range &r1, const Range &r2) const {
00027 return r1.length() > r2.length();
00028 }
00029 };
00030
00034 class bioseqexception : public exception {
00035 public:
00036 bioseqexception() throw() : errstr("bioseq error") { }
00037 bioseqexception(const string &msg) throw() : errstr(msg) { }
00038 ~bioseqexception() throw() { }
00039 const char* what() const throw() { return errstr.c_str(); }
00040 private:
00041 string errstr;
00042 };
00043
00044
00050 void printFasta(ostream &ous, const string& seq, int width=70);
00052 string reverseComplement(const string& seq);
00054 void reverseComplementInPlace(string& seq);
00061 bool loadFastaIntoMap(const string &file, map<string,string> &store);
00065 enum sequenceType {PROTEINSEQ=1, DNASEQ=2, RNASEQ=4, NUCLEIC_ACID=6, GENERIC=7, UNKNOWN=0};
00084 void translate(string &pep, const string &seq, int begin, int end=-1);
00088 int countInternalStops(const string &seq);
00099 void longestORFPlus(const string &rna, string &pep, int &b, int &e, int &f);
00100 void longestORFPlus(const string &rna, string &pep, int &b, int &e);
00102 void longestORFPlusSuffix(const string &rna, string &pep, int &b, int &e);
00104 void longestORFPlusPrefix(const string &rna, string &pep, int &b, int &e);
00126 bool longestNoStartORFPlus(const string &rna, pair<int,int> &nterm, string &npep,
00127 pair<int,int> &full, string &pep);
00128 bool longestNoStopORFPlus(const string &rna, pair<int,int> &cterm, string &cpep,
00129 pair<int,int> &full, string &pep);
00130
00131
00132
00140 Range P2Rindex(const Range &ofb, const int frame);
00141 Interval P2Rindex(const Interval &ofb, const int frame);
00142 int P2Rindex(const int pos, const int frame);
00143
00144
00146 bool overlayRVR(const Range &r, const vector<Range> &vr);
00162 vector<Range> findAllORFIndex(const string &rna, int base=1,
00163 int pep_lencut=22, int HHcut=160, int HTcut=120, int TTcut=10);
00182 void findAllPepORFIndex(list<Range> &orfrange, const string &ss, int cutoff);
00183
00184
00188 class bioseq {
00189 public:
00190 bioseq() : seq(), name(), title(), code(0) {}
00191 bioseq(const bioseq &s) : seq(s.seq), name(s.name),
00192 title(s.title), code(0) { }
00193 bioseq(const string &s) : seq(s), name(),title(), code(0) {}
00197 bioseq(const string &n, const string &s)
00198 : seq(s), name(n), title(), code(0) {}
00203 bioseq(const string &n, const string &s, const string &t)
00204 : seq(s), name(n), title(t), code(0) {}
00205
00206 ~bioseq() { if (code != 0) delete[] code; }
00207
00208 void clear() { if (code != 0) delete[] code; code=0; seq.clear(); name.clear(); title.clear(); }
00209
00212 void assign(const string &s) {
00213 seq=s;
00214 if (code != 0) {
00215 delete[] code;
00216 code=0;
00217 }
00218 }
00220
00226 void read(const string &file);
00227
00244 bool read(istream &ins, string &header) throw (bioseqexception);
00245
00251 string& importFasta(const string &fas);
00252
00254
00260 const string& toString() const { return seq; }
00261
00264 void printFastaWithHeader(ostream &ous, int width=70);
00266 void printFasta(ostream &ous, int width=70);
00271 friend ostream& operator<<(ostream &ous, const bioseq &s);
00272
00277 string substr(int b, int e) const;
00278 string substring(const int b, const int len) const throw (bioseqexception)
00279 { if (b>=length() || b < 0) throw bioseqexception("bioseq::substring start index out of range"); }
00280 string substring(const int b) const throw (bioseqexception)
00281 { if (b>=length() || b < 0) throw bioseqexception("bioseq::substring start index out of range");
00282 return seq.substr(b); }
00283
00287 bioseq subseq(int b, int e) const throw (bioseqexception);
00288
00293 bioseq subsequence(int b, int len=-1) const throw (bioseqexception);
00294 void setName(const string &n) { name=n; }
00295 void setTitle(const string &t) { title=t; }
00296 const string& getName() const { return name; }
00297 const string& getTitle() const { return title; }
00301 const string& getSequence() const { return seq; }
00302
00306 void rvc();
00310 map<char, double> getFrequency() const;
00311
00315 pair<double,double> computeEntropy() const {
00316 return entropy(seq); }
00320 pair<double,double> computeEntropy(int b, int e) const throw(bioseqexception) {
00321 if (b<0 || e>seq.length()-1) {
00322 cerr << "index error in computeEntropy()";
00323 throw bioseqexception("index out of range inside bioseq::computeEntropy");
00324
00325 }
00326 return entropy(seq.substr(b,e-b+1)); }
00327
00332 static pair<double,double> entropy(const string &seq);
00333 int length() const { return seq.length(); }
00340 const int* getcode() const;
00341
00345 char operator[](const int i) const { return seq[i]; }
00346 bioseq& operator=(const bioseq &s);
00347 bioseq& operator+=(const bioseq &s) { seq += s.seq; return *this; }
00348 bioseq& operator=(const string &str);
00350 bool empty() { if (seq.empty()) return true; else return false; }
00351 void randomize() {
00352 random_shuffle(seq.begin(), seq.end());
00353 delete[] code;
00354 code=0;
00355 }
00357 void toLowerCase();
00358 void toUpperCase();
00359 virtual sequenceType getSequenceType() const { return GENERIC; }
00360
00361 protected:
00362 string seq;
00363 string name;
00364 string title;
00368 mutable int* code;
00369 };
00370
00372 int aachar2num(char a);
00374 char aanum2char(int c);
00375
00379 class Protein : public bioseq {
00380 public:
00381 Protein() : bioseq() { codefreq[0]=-1; }
00382 Protein(const Protein &s) : bioseq(s) {
00383 for (int i=0; i<numcodes; i++) codefreq[i]=s.codefreq[i];
00384 }
00385 Protein(const string &str) : bioseq(str) { codefreq[0]=-1; }
00386 Protein(const bioseq &s) : bioseq(s) { codefreq[0]=-1; }
00387 Protein(const string &n, const string &s)
00388 : bioseq (n,s) { codefreq[0]=-1; }
00389 Protein subseq(int b, int e) const {
00390 return bioseq::subseq(b,e); }
00391 ~Protein() { }
00392 Protein& operator=(const Protein &p);
00393 Protein& operator=(const string &str);
00394
00395 static const char* threeLetterCode(const char A)
00396 { return symbols[2*(toupper(A)-'A')]; }
00397 static const char* aminoAcidFullName(const char A)
00398 { return symbols[2*(toupper(A)-'A')+1]; }
00399
00401 double relativeEntropy() const;
00402 double relativeEntropyUniform() const;
00408
00411
00415 const double* getCodeFrequence();
00416 static const int numcodes=27;
00417 static const char* oneLetterSymbols;
00420 sequenceType getSequenceType() const { return PROTEINSEQ; }
00421 bool hasStart() const { return seq[0] == 'M'; }
00422 bool hasStop() const { return seq[seq.length()-1] == '*'; }
00423 bool hasInternalStop() const;
00424 int countInternalStops();
00425
00426 private:
00435 mutable double codefreq[27];
00436
00441 static const char* symbols[];
00443 static const double aafreq[];
00446
00447 };
00448
00453 class DNA : public bioseq {
00454 public:
00455 DNA() : bioseq() { }
00456 DNA(const DNA &s) : bioseq(s) { }
00457 DNA(const string &s) : bioseq(s) { }
00458 DNA(const string &n, const string &s)
00459 : bioseq(n,s) {}
00460 DNA(const string &n, const string &s, const string &t)
00461 : bioseq(n,s,t) { }
00462
00463 DNA(const bioseq &s) : bioseq(s) { }
00464
00465
00466 ~DNA() { }
00467 DNA& operator=(const DNA &s);
00468
00471 DNA& operator=(const string &str);
00472
00477 Protein translate() const;
00484 Protein translate(int begin, int end=-1) const;
00485 DNA subseq(int b, int e) const {
00486 return bioseq::subseq(b,e); }
00492 const int* getcodeNuc() const;
00493 sequenceType getSequenceType() const { return DNASEQ; }
00496 double GCContent() const;
00502 double GCContent(long int &A, long int &C, long int &G, long int &T, long int &N) const;
00503
00506 void revcomp() { reverseComplementInPlace(seq); }
00512 Protein longestORFForward(int &b, int &e) const;
00516 void longestORFForward(Protein &p, int &b, int &e) const;
00519 Protein longestORFReverse(int &b, int &e) const;
00520 void longestORFReverse(Protein &p, int &b, int &e) const;
00521 Protein longestORF(char strand, int &b, int &e) const;
00522 Protein longestORF(int &b, int &e) const;
00523 void longestORF(Protein &p, int &b, int &e) const;
00524
00525 static const codon& getCodonTable() { return codontable; }
00526 static void setCodonTable(const int ctid) { codontable.use(ctid); }
00527
00528 protected:
00535 static codon codontable;
00536
00537 };
00538
00541 class mRNA : public DNA {
00542 public:
00543 mRNA() : DNA() { }
00546 mRNA(const string &s);
00547
00548 private:
00551 int cdsb;
00553 int cdse;
00554 Protein prt;
00555 };
00556
00557
00558 #endif