00001 #ifndef GENMODEL_H
00002 #define GENMODEL_H
00003
00004 #include <string>
00005 #include <vector>
00006 #include "Range.h"
00007 #include <iostream>
00008 #include <cassert>
00009 #include "bioseq.h"
00010 #include <map>
00011 #include <set>
00012
00013 using namespace std;
00014
00017 class PointOutChain : public exception {
00018 private:
00019 string message;
00020
00021 public:
00022 PointOutChain() throw() : message("Point outside of Chain") { }
00023 PointOutChain(const char* msg) throw() : message(msg) { }
00024 explicit PointOutChain(const string &msg) throw() : message(msg) { }
00025 ~PointOutChain() throw() { }
00026 virtual const char* what() const throw() {
00027 return message.c_str(); }
00028 };
00029
00030 class Badinput : public exception {
00031 private:
00032 string message;
00033
00034 public:
00035 Badinput() throw() : message("Coordinates bad") { }
00036 Badinput(const char msg[]) throw() : message(msg) { }
00037 explicit Badinput(const string &msg) throw() : message(msg) { }
00038 ~Badinput() throw() { }
00039 virtual const char* what() const throw() {
00040 return message.c_str();
00041 }
00042 };
00043
00044 class InvalidModel : public exception {
00045 private:
00046 string message;
00047 public:
00048 InvalidModel() throw() : message("Invalid Model") { }
00049 InvalidModel(const char msg[]) throw() : message(msg) { }
00050 ~InvalidModel() throw() { }
00051 virtual const char* what() const throw() {
00052 return message.c_str();
00053 }
00054 };
00055
00056
00057 void string2Vector(vector<int> &v, const string &str);
00058 bool nonincreasingVector(const vector<int> &v);
00059 bool nondecreasingVector(const vector<int> &v);
00060 bool allLessVector(const vector<int> &v1, const vector<int> &v2);
00062
00069 class Noschain : public Range {
00070 public:
00071 Noschain() : Range(), exons() { }
00078 Noschain(const string &exstarts, const string &exends, char strand) throw(Badinput);
00085 Noschain(const string &exstr) throw(Badinput);
00086
00088 Noschain(const Noschain &eg);
00089
00090
00091
00092
00093
00098 void fixAndBuild(const vector<int> &bb, const vector<int> &ee, char strand);
00099 ~Noschain() { }
00100 Noschain& operator=(const Noschain &eg);
00104 Noschain& operator+=(const Range &r);
00106 Noschain operator+(int c) const;
00107 Noschain operator-(int c) const;
00108 friend Noschain operator+(int c, const Noschain& chain) {
00109 return chain+c; }
00112 friend Noschain operator-(int c, const Noschain& chain);
00113 Noschain& operator-=(int c);
00114 Noschain& operator+=(int c);
00115
00119 Range& operator[](int idx) { return exons[idx]; }
00122 const Range& operator[](int idx) const { return exons[idx]; }
00123 Range firstExon() const { return exons[0]; }
00124 Range lastExon() const { return exons.back(); }
00125
00127 bool operator==(const Noschain &nc) const { return exons == nc.exons; }
00128 bool operator!=(const Noschain &nc) const { return exons != nc.exons; }
00143 bool operator<(const Noschain &nc) const;
00147 int compareByDirection(const Noschain &nc) const;
00153 Noschain commonExons(const Noschain &eg) const;
00159 Noschain commonIntrons(const Noschain &nc) const;
00160
00164 bool isNull() const { return Range::isNull() && exons.empty(); }
00165 bool empty() const { return exons.empty(); }
00166 void clear() { exons.clear(); Range::setNull(); }
00168 void setNull() { Range::setNull(); exons.clear(); }
00170 int exonLength() const;
00172 int intronLength() const;
00175 Range maxIntron() const;
00176 int maxIntronLength() const { return maxIntron().length(); }
00178 Range minIntron() const;
00179 int minIntronLength() const { return minIntron().length(); }
00183 vector<Range> introns() const;
00189 bool containIntron(int p1, int p2) const {
00190 int i1=exonIndex(p1); int i2=exonIndex(p2);
00191 return i1 != -1 && i2 != -1 && i1 != i2;
00192 }
00197 Range outerRange() const {
00198
00199 return Range(b,e);
00200 }
00207 vector<Range> intronsInside(const int len) const;
00208
00209 int numberOfRanges() const { return exons.size(); }
00214 Noschain subchainBeforePoint(int p) const throw (PointOutChain);
00229 Noschain subchainAfterPoint(int p) const throw (PointOutChain);
00236 void trimAfterPoint(const int p) throw (PointOutChain);
00241 void trimBeforePoint(const int p) throw (PointOutChain);
00246 Noschain subchain(const Range &r) const throw (PointOutChain);
00247 Noschain subchain(int bg, int ed) const throw (PointOutChain)
00248 { return subchain(Range(bg,ed)); }
00249 const vector<Range>& getExons() const { return exons; }
00250 void setExons(const vector<Range>& ex) { exons=ex; updateOuterRange(); }
00255 void setChain(const Noschain &chain) { setExons(chain.exons); }
00258 void updateOuterRange() { Range::set(exons[0].begin(), exons.back().end()); }
00264 void set(const Range &r) throw (Badinput);
00272 void setEnd(int ne) throw (PointOutChain);
00273 void setBegin(int nb);
00280 void growEnd(int icr);
00289 int advancePosOnExon(int pos, unsigned int d) const;
00291 int retreatPosOnExon(int pos, unsigned int d) const;
00296 int exonIndex(int p) const;
00297 int exonIndex(const Range &r) const;
00301 bool insideExon(int p) const { return exonIndex(p) != -1; }
00310 bool insideExon(const Range &r, bool samedir=false) const;
00311
00312
00313
00316 virtual ostream& show(ostream &ous=cout) const;
00320 friend ostream& operator<<(ostream &ous, const Noschain &nc)
00321 { return nc.print(ous); }
00326 virtual ostream& print(ostream &ous) const;
00331 string toString() const;
00337 pair<string,string> startEnd(char sep=',') const;
00342 string jgiformat(char sep='\t') const;
00346 string sfExonsGenomic() const;
00347 string sfExonsTranscript() const;
00348
00355 Noschain onewayChain() const;
00359 void onewayChain(Noschain &owc) const;
00369 void exonOnlyChain(Noschain &eoc) const;
00371 Noschain exonOnlyChain() const;
00376 string asDelimitedString(const char sep=':', const char rsep[]=",") const;
00377 string asDelimitedString(const char sep=':', const char rsep=',') const;
00378
00394
00395 bool exonContain(const Noschain &nc, bool samedirection=false) const;
00417 int compareChainAndFix(Noschain &nc, int &edit);
00443 bool exonOverlap(const Noschain &nc) const;
00459 int exonOverlapLength(const Noschain &nc) const;
00467 int exonIntersectLength(const Noschain &nc) const;
00470 pair<float, float> exonIntersectFraction(const Noschain &nc) const;
00492 bool append(Noschain &nc);
00498 void extendEnd(const Noschain &nc);
00499
00506 Noschain& reverse();
00516 void subsequence(string &sub, const string &gs) const throw(Badinput);
00522 bool isDangle(const int intronlen=900, const int exonlen=27) const;
00528 bool directionAgree() const;
00529
00530 protected:
00540 vector<Range> exons;
00544
00545 };
00546
00548 void errorMessageFix1(Noschain &c1, Noschain &c2, int status, int end);
00549
00550 extern int FUZZYMARGIN;
00566 int comparePlusChainFirstBefore(Noschain &c1, Noschain &c2);
00567 int comparePlusChainIdenticalLeft(Noschain &c1, Noschain &c2);
00568 int compareMinusChainFirstBefore(Noschain &c1, Noschain &c2);
00569 int compareMinusChainMinusIdenticalRight(Noschain &c1, Noschain &c2);
00570
00574 class lessByChainDirectionPtr {
00575 public:
00576 bool operator()(const Noschain *c1, const Noschain *c2) const {
00577 return c1->compareByDirection(*c2) == -1;
00578 }
00579 };
00580
00581 class lessByChainDirection {
00582 public:
00583 bool operator()(const Noschain &c1, const Noschain &c2) const {
00584 return c1.compareByDirection(c2) == -1;
00585 }
00586 };
00587
00591 class lessChainPtr {
00592 public:
00593 bool operator()(const Noschain *c1, const Noschain *c2) const {
00594 return *c1 < *c2;
00595 }
00596 };
00597
00602 class Alnchain : public Noschain {
00603 public:
00604 Alnchain(const string &exstr)
00605 : Noschain(exstr), iden(100), cov(1) { }
00608 Alnchain(int begin, const string &cigar, int nmismatch);
00609 Alnchain(int begin, const string &cigar, char strand, int nmismatch);
00610 Alnchain() : Noschain(), iden(100), cov(1) { }
00611 Alnchain(double id, double cv) : Noschain(), iden(id), cov(cv) { }
00612 Alnchain(const string &exons, double identity, double coverage) : Noschain(exons), iden(identity), cov(coverage) {}
00613
00614 double identity() const { return iden; }
00615 double coverage() const { return cov; }
00616 void setIdentity(double newiden) { iden=newiden; }
00617 void setCoverage(double newcov) { cov=newcov; }
00622 string toString(const char sep='\t') const;
00623 Alnchain& operator=(const Alnchain &ch);
00625 ostream& print(ostream &ous) const;
00626
00627
00628 protected:
00630 double iden;
00631 double cov;
00632 };
00633
00638 class Alnchainid : public Alnchain {
00639 public:
00640 Alnchainid() : Alnchain(), estids() { }
00647 Alnchainid(double seqiden, double cv, const string &estid)
00648 : Alnchain(seqiden, cv), estids(1, estid) { }
00654 Alnchainid(int begin, const string &cigar, int nmismatch, const string &estid)
00655 : Alnchain(begin,cigar,nmismatch), estids(1,estid) { }
00657 Alnchainid(int begin, const string &cigar, char strand, int nmismatch, const string &estid)
00658 : Alnchain(begin,cigar,strand,nmismatch), estids(1,estid) { }
00661 Alnchainid(const string &exons, double identity, double coverage,
00662 const string &estidstr) : Alnchain(exons, identity, coverage) { setESTIds(estidstr); }
00663
00665 void addESTId(const string &estid) { estids.push_back(estid); }
00670 void setESTIds(const string &idstr);
00675 int numEST() const { return estids.size(); }
00676 const vector<string>& getESTIds() const { return estids; }
00677 vector<string>& getESTIds() { return estids; }
00688 string toString(const char sep='\t') const;
00691 string ESTIdString(const char sep=',') const;
00692 Alnchainid& operator=(const Alnchainid &ch);
00696 void absorbESTIds(const Alnchainid &ch) { estids.insert(estids.end(), ch.estids.begin(), ch.estids.end()); }
00706 void averageWith(const Alnchainid &chn);
00708 ostream& print(ostream &ous) const;
00709
00710 private:
00714 vector<string> estids;
00715 };
00716
00718
00727 void readandstoreGmap(const string &inputFile,
00728 map<string, set<Alnchainid*, lessChainPtr> > &alnseg,
00729 map<int,int> &pathstat);
00731 void readandstoreGmap(istream &ins, map<string, set<Alnchainid*, lessChainPtr> > &alnseg,
00732 map<int,int> &pathstat);
00733
00742 void archiveAln(map<string, set<Alnchainid*, lessChainPtr> > &alnseg, const string &file);
00744 void archiveAln(map<string, set<Alnchainid*, lessChainPtr> > &alnseg, ostream &ous);
00745
00755 class GenModel : public Noschain {
00756 public:
00757 GenModel() : Noschain(), cdsphase(0), numstop(0),
00758 hasstart(true), hasstop(true),
00759 pep(0) { }
00771 GenModel(const string &exstarts, const string &exends,
00772 char strand, int cdsb, int cdse, const string &gi,
00773 int ii, const string &si) throw(InvalidModel);
00774
00781 GenModel(const string &exstarts, const string &exends,
00782 char strand, int cdsb, int cdse, const string &gi,
00783 int ii, const string &si, const string &track, int trackid) throw(InvalidModel);
00784 ~GenModel();
00785 ostream& show(ostream &ous) const;
00786 Noschain FivePrimeUTR() const { return subchainBeforePoint(cds.begin()); }
00789 Noschain ThreePrimeUTR() const { return subchainAfterPoint(cds.end()); }
00790 int num5NoncodingExons() const { return FivePrimeUTR().numberOfRanges(); }
00791 int num3NoncodingExons() const { return ThreePrimeUTR().numberOfRanges(); }
00792 Noschain CDS() const throw (PointOutChain) { return subchain(cds); }
00793 Range CDSRange() const { return cds; }
00794 int FivePrimeUTRLength() const { return subchainBeforePoint(cds.begin()).exonLength(); }
00795 int ThreePrimeUTRLength() const { return subchainAfterPoint(cds.end()).exonLength(); }
00796 int UTRLength() const { return FivePrimeUTRLength() + ThreePrimeUTRLength(); }
00797 int CDSLength() const throw (PointOutChain) { return CDS().exonLength(); }
00799 double CDSFraction() const { return static_cast<double>(CDSLength())/length(); }
00800
00822
00823 int valid();
00829 bool valid(const string &gs);
00831
00832
00833
00834 const string& genomicId() const { return gid; }
00837 const string& modelName() const { return name; }
00838 const string& getName() const { return name; }
00839 int getId() const { return id; }
00843 string CDSSeq(const string &gs) const throw (PointOutChain);
00847 void growEnd(int icr);
00850 void setEnd(int ne) { Noschain::setEnd(ne); cds.setEnd(ne); }
00857 string toString(char sep='\t') const;
00858 string toSQLString(char sep=',') const;
00863 string toJGIModel(char sep='\t') const;
00880 ostream& printAllmod(ostream &ous, char sep='\t') const;
00881 bool operator==(const GenModel &gm) const { return Noschain::operator==(gm) && cds==gm.cds; }
00882 bool sameExons(const GenModel &gm) const { return Noschain::operator==(gm); }
00883 void makeProtein(const string &gs);
00884 const string& pepstr() const { return pep->getSequence(); }
00885 bool hasBegin() const { return hasstart; }
00886 bool hasEnd() const { return hasstop; }
00887
00888 private:
00895 string gid;
00901 int id;
00906 string name;
00912 Range cds;
00913
00916 int cdsphase;
00918 int numstop;
00919 bool hasstart, hasstop;
00920 Protein *pep;
00923 string track;
00925 int trackid;
00926
00930 static const int minModelLen=100;
00933 static const int minCDSLen=80;
00934 };
00935
00936 #endif