bioseq.h

Go to the documentation of this file.
00001 #ifndef BIOSEQ_H
00002 #define BIOSEQ_H
00003 
00012 #include "codon.h"
00013 #include <string>
00014 #include <iostream>
00015 #include <map>
00016 #include <algorithm>
00017 #include <list>
00018 #include "Range.h"
00019 #include <vector>
00020 #include "Interval.h"
00021 
00022 using namespace std;
00023 
00024 class greaterRangeLength {
00025    public:
00026       bool operator()(const Range &r1, const Range &r2) const {
00027          return r1.length() > r2.length();
00028       }
00029 };
00030 
00034 class bioseqexception : public exception {
00035    public:
00036       bioseqexception() throw() : errstr("bioseq error") { }
00037       bioseqexception(const string &msg) throw() : errstr(msg) { }
00038       ~bioseqexception() throw() { }
00039       const char* what() const throw() { return errstr.c_str(); }
00040    private:
00041       string errstr;
00042 };
00043 
00044 // some helper functions
00050 void printFasta(ostream &ous, const string& seq, int width=70);
00052 string reverseComplement(const string& seq);
00054 void reverseComplementInPlace(string& seq);
00061 bool loadFastaIntoMap(const string &file, map<string,string> &store);
00065 enum sequenceType {PROTEINSEQ=1, DNASEQ=2, RNASEQ=4, NUCLEIC_ACID=6, GENERIC=7, UNKNOWN=0};
00084 void translate(string &pep, const string &seq, int begin, int end=-1);
00088 int countInternalStops(const string &seq);
00099 void longestORFPlus(const string &rna, string &pep, int &b, int &e, int &f);
00100 void longestORFPlus(const string &rna, string &pep, int &b, int &e);
00102 void longestORFPlusSuffix(const string &rna, string &pep, int &b, int &e);
00104 void longestORFPlusPrefix(const string &rna, string &pep, int &b, int &e);
00126 bool longestNoStartORFPlus(const string &rna, pair<int,int> &nterm, string &npep,
00127       pair<int,int> &full, string &pep);
00128 bool longestNoStopORFPlus(const string &rna, pair<int,int> &cterm, string &cpep,
00129       pair<int,int> &full, string &pep);
00130 
00131 //pair<int,int> longestORFIndex(const Protein &p);
00132 
00140 Range P2Rindex(const Range &ofb, const int frame);
00141 Interval P2Rindex(const Interval &ofb, const int frame);
00142 int P2Rindex(const int pos, const int frame);
00143 //{ return 3*pos+frame; }
00144 
00146 bool overlayRVR(const Range &r, const vector<Range> &vr);
00162 vector<Range> findAllORFIndex(const string &rna, int base=1,
00163       int pep_lencut=22, int HHcut=160, int HTcut=120, int TTcut=10);
00182 void findAllPepORFIndex(list<Range> &orfrange, const string &ss, int cutoff);
00183 
00184 
00188 class bioseq {
00189    public:
00190       bioseq() : seq(), name(), title(), code(0) {}
00191       bioseq(const bioseq &s) : seq(s.seq), name(s.name),
00192                                 title(s.title), code(0) { }
00193       bioseq(const string &s) : seq(s), name(),title(), code(0) {}
00197       bioseq(const string &n, const string &s) 
00198          : seq(s), name(n), title(), code(0) {}
00203       bioseq(const string &n, const string &s, const string &t) 
00204          : seq(s), name(n), title(t), code(0) {}
00205       //virtual ~bioseq() { 
00206       ~bioseq() { if (code != 0) delete[] code; }
00207 
00208       void clear() { if (code != 0) delete[] code; code=0; seq.clear(); name.clear(); title.clear(); }
00209 
00212       void assign(const string &s) { 
00213          seq=s; 
00214          if (code != 0) {
00215             delete[] code;
00216             code=0;
00217          } 
00218       }
00220       
00226       void read(const string &file);
00227 
00244       bool read(istream &ins, string &header) throw (bioseqexception);
00245 
00251       string& importFasta(const string &fas);
00252 
00254 
00260       const string& toString() const { return seq; }
00261 
00264       void printFastaWithHeader(ostream &ous, int width=70);
00266       void printFasta(ostream &ous, int width=70);
00271       friend ostream& operator<<(ostream &ous, const bioseq &s);
00272 
00277       string substr(int b, int e) const; 
00278       string substring(const int b, const int len) const throw (bioseqexception)
00279       { if (b>=length() || b < 0) throw bioseqexception("bioseq::substring start index out of range"); }
00280       string substring(const int b) const throw (bioseqexception)
00281       { if (b>=length() || b < 0) throw bioseqexception("bioseq::substring start index out of range");
00282          return seq.substr(b); }
00283 
00287       bioseq subseq(int b, int e) const throw (bioseqexception);
00288 
00293       bioseq subsequence(int b, int len=-1) const throw (bioseqexception);
00294       void setName(const string &n) { name=n; }
00295       void setTitle(const string &t) { title=t; }
00296       const string& getName() const { return name; }
00297       const string& getTitle() const { return title; }
00301       const string& getSequence() const { return seq; }
00302 
00306       void rvc();
00310       map<char, double> getFrequency() const;
00311 
00315       pair<double,double> computeEntropy() const {
00316          return entropy(seq); }
00320       pair<double,double> computeEntropy(int b, int e) const throw(bioseqexception) {
00321          if (b<0 || e>seq.length()-1) { 
00322             cerr << "index error in computeEntropy()"; 
00323             throw bioseqexception("index out of range inside bioseq::computeEntropy");
00324             //exit(1);
00325          }
00326          return entropy(seq.substr(b,e-b+1)); }
00327 
00332       static pair<double,double> entropy(const string &seq);
00333       int length() const { return seq.length(); }
00340       const int* getcode() const;
00341 
00345       char operator[](const int i) const { return seq[i]; }
00346       bioseq& operator=(const bioseq &s);
00347       bioseq& operator+=(const bioseq &s) { seq += s.seq; return *this; }
00348       bioseq& operator=(const string &str);
00350       bool empty() { if (seq.empty()) return true; else return false; }
00351       void randomize() { 
00352          random_shuffle(seq.begin(), seq.end()); 
00353          delete[] code;
00354          code=0; 
00355       }
00357       void toLowerCase();
00358       void toUpperCase();
00359       virtual sequenceType getSequenceType() const { return GENERIC; }
00360 
00361    protected:
00362       string seq;
00363       string name;
00364       string title;
00368       mutable int* code;
00369 };
00370 
00372 int aachar2num(char a);
00374 char aanum2char(int c);
00375 
00379 class Protein : public bioseq {
00380    public:
00381       Protein() : bioseq() { codefreq[0]=-1; }
00382       Protein(const Protein &s) : bioseq(s)  { 
00383          for (int i=0; i<numcodes; i++) codefreq[i]=s.codefreq[i]; 
00384       }
00385       Protein(const string &str) : bioseq(str) { codefreq[0]=-1; }
00386       Protein(const bioseq &s) : bioseq(s)  { codefreq[0]=-1; }
00387       Protein(const string &n, const string &s) 
00388          : bioseq (n,s)  { codefreq[0]=-1; }
00389       Protein subseq(int b, int e) const {
00390          return bioseq::subseq(b,e); }
00391       ~Protein() { /* parent destructor is enough */ }
00392       Protein& operator=(const Protein &p);
00393       Protein& operator=(const string &str);
00394 
00395       static const char* threeLetterCode(const char A) 
00396       { return symbols[2*(toupper(A)-'A')]; }
00397       static const char* aminoAcidFullName(const char A) 
00398       { return symbols[2*(toupper(A)-'A')+1]; }
00399 
00401       double relativeEntropy() const;
00402       double relativeEntropyUniform() const;
00408       //const int* encode() const;
00411       //const int* getcode() const { if (code == 0) encode(); return code; }
00415       const double* getCodeFrequence();
00416       static const int numcodes=27; // including *
00417       static const char* oneLetterSymbols;
00420       sequenceType getSequenceType() const { return PROTEINSEQ; }
00421       bool hasStart() const { return seq[0] == 'M'; }
00422       bool hasStop() const { return seq[seq.length()-1] == '*'; }
00423       bool hasInternalStop() const;
00424       int countInternalStops();
00425 
00426    private:
00435       mutable double codefreq[27];
00436       //mutable int* code;
00441       static const char* symbols[];
00443       static const double aafreq[];
00446       //static const double aafrequniform[];
00447 };
00448 
00453 class DNA : public bioseq {
00454    public:
00455       DNA() : bioseq() { }
00456       DNA(const DNA &s) : bioseq(s) { }
00457       DNA(const string &s) : bioseq(s)  { }
00458       DNA(const string &n, const string &s) 
00459          : bioseq(n,s)  {}
00460       DNA(const string &n, const string &s, const string &t) 
00461          : bioseq(n,s,t)  { }
00462 
00463       DNA(const bioseq &s) : bioseq(s)  { }
00464       //~DNA() { if (code != 0) delete[] code; }
00465       //~DNA() { bioseq::~bioseq(); }
00466       ~DNA() { /* parent destructor is enought */ }
00467       DNA& operator=(const DNA &s);
00468       //DNA& operator+=(const DNA &s) { return bioseq::operator+=(s); }
00471       DNA& operator=(const string &str);
00472 
00477       Protein translate() const;
00484       Protein translate(int begin, int end=-1) const;
00485       DNA subseq(int b, int e) const {
00486          return bioseq::subseq(b,e); }
00492       const int* getcodeNuc() const;
00493       sequenceType getSequenceType() const { return DNASEQ; }
00496       double GCContent() const;
00502       double GCContent(long int &A, long int &C, long int &G, long int &T, long int &N) const;
00503       //void revcomp() { seq=reverseComplement(seq); }
00506       void revcomp() { reverseComplementInPlace(seq); }
00512       Protein longestORFForward(int &b, int &e) const;
00516       void longestORFForward(Protein &p, int &b, int &e) const;
00519       Protein longestORFReverse(int &b, int &e) const;
00520       void longestORFReverse(Protein &p, int &b, int &e) const;
00521       Protein longestORF(char strand, int &b, int &e) const;
00522       Protein longestORF(int &b, int &e) const;
00523       void longestORF(Protein &p, int &b, int &e) const;
00524 
00525       static const codon& getCodonTable() { return codontable; }
00526       static void setCodonTable(const int ctid) { codontable.use(ctid); }
00527 
00528    protected:
00535       static codon codontable;
00536       //mutable int* code;
00537 };
00538 
00541 class mRNA : public DNA {
00542    public: 
00543       mRNA() : DNA() { }
00546       mRNA(const string &s);
00547 
00548    private:
00551       int cdsb;
00553       int cdse;
00554       Protein prt;
00555 };
00556 
00557 
00558 #endif

Generated on Wed Aug 10 11:56:48 2011 for Softwares from Orpara by  doxygen 1.5.6