GenModel.h

Go to the documentation of this file.
00001 #ifndef GENMODEL_H
00002 #define GENMODEL_H
00003 
00004 #include <string>
00005 #include <vector>
00006 #include "Range.h"
00007 #include <iostream>
00008 #include <cassert>
00009 #include "bioseq.h"
00010 #include <map>
00011 #include <set>
00012 
00013 using namespace std;
00014 
00017 class PointOutChain : public exception {
00018    private:
00019       string message;
00020 
00021    public:
00022       PointOutChain() throw() : message("Point outside of Chain") { }
00023       PointOutChain(const char* msg) throw() : message(msg) { }
00024       explicit PointOutChain(const string &msg) throw() : message(msg) { }
00025       ~PointOutChain() throw() { }
00026       virtual const char* what() const throw() { 
00027          return message.c_str(); }
00028 };
00029 
00030 class Badinput : public exception {
00031    private:
00032       string message;
00033 
00034    public:
00035       Badinput() throw() : message("Coordinates bad") { }
00036       Badinput(const char msg[]) throw() : message(msg) { }
00037       explicit Badinput(const string &msg) throw() : message(msg) { }
00038       ~Badinput() throw() { }
00039       virtual const char* what() const throw() {
00040          return message.c_str();
00041       }
00042 };
00043 
00044 class InvalidModel : public exception {
00045    private: 
00046       string message;
00047    public:
00048       InvalidModel() throw() : message("Invalid Model") { }
00049       InvalidModel(const char msg[]) throw() : message(msg) { }
00050       ~InvalidModel() throw() { }
00051       virtual const char* what() const throw() {
00052          return message.c_str();
00053       }
00054 };
00055 
00056 // helper functions ////////
00057 void string2Vector(vector<int> &v, const string &str);
00058 bool nonincreasingVector(const vector<int> &v);
00059 bool nondecreasingVector(const vector<int> &v);
00060 bool allLessVector(const vector<int> &v1, const vector<int> &v2);
00062 
00069 class Noschain : public Range {
00070    public:
00071       Noschain() : Range(), exons() { }
00078       Noschain(const string &exstarts, const string &exends, char strand) throw(Badinput);
00085       Noschain(const string &exstr) throw(Badinput);
00086       //Noschain(const string &exstarts, const string &exends, unsigned char strand);
00088       Noschain(const Noschain &eg);
00089       /* Assume strand is +;  only match string such as 38M 
00090        * Cannot handle Hard clipping. Never seen this yet. */
00091       //Noschain(int begin, const string &cigar);
00092       /* strand is the direction of the alignment in mRNA direction. */
00093       //Noschain(int begin, const string &cigar, char strand);
00098       void fixAndBuild(const vector<int> &bb, const vector<int> &ee, char strand);
00099       ~Noschain() { }
00100       Noschain& operator=(const Noschain &eg);
00104       Noschain& operator+=(const Range &r); 
00106       Noschain operator+(int c) const;
00107       Noschain operator-(int c) const;
00108       friend Noschain operator+(int c, const Noschain& chain) {
00109          return chain+c; }
00112       friend Noschain operator-(int c, const Noschain& chain);
00113       Noschain& operator-=(int c);
00114       Noschain& operator+=(int c);
00115 
00119       Range& operator[](int idx) { return exons[idx]; }
00122       const Range& operator[](int idx) const { return exons[idx]; }
00123       Range firstExon() const { return exons[0]; }
00124       Range lastExon() const { return exons.back(); }
00125 
00127       bool operator==(const Noschain &nc) const { return exons == nc.exons; }
00128       bool operator!=(const Noschain &nc) const { return exons != nc.exons; }
00143       bool operator<(const Noschain &nc) const;
00147       int compareByDirection(const Noschain &nc) const;
00153       Noschain commonExons(const Noschain &eg) const;
00159       Noschain commonIntrons(const Noschain &nc) const;
00160 
00164       bool isNull() const { return Range::isNull() && exons.empty(); }
00165       bool empty() const { return exons.empty(); }
00166       void clear() { exons.clear(); Range::setNull(); }
00168       void setNull() { Range::setNull(); exons.clear(); }
00170       int exonLength() const;
00172       int intronLength() const;
00175       Range maxIntron() const;
00176       int maxIntronLength() const { return maxIntron().length(); }
00178       Range minIntron() const;
00179       int minIntronLength() const { return minIntron().length(); }
00183       vector<Range> introns() const;
00189       bool containIntron(int p1, int p2) const {
00190          int i1=exonIndex(p1); int i2=exonIndex(p2);
00191          return i1 != -1 && i2 != -1 && i1  != i2; 
00192       }
00197       Range outerRange() const {
00198          //return Range(exons.front().begin(), exons.back().end());
00199          return Range(b,e);
00200       }
00207       vector<Range> intronsInside(const int len) const;
00208 
00209       int numberOfRanges() const { return exons.size(); }
00214       Noschain subchainBeforePoint(int p) const throw (PointOutChain);
00229       Noschain subchainAfterPoint(int p) const throw (PointOutChain);
00236       void trimAfterPoint(const int p) throw (PointOutChain);
00241       void trimBeforePoint(const int p) throw (PointOutChain);
00246       Noschain subchain(const Range &r) const throw (PointOutChain);
00247       Noschain subchain(int bg, int ed) const throw (PointOutChain)
00248       { return subchain(Range(bg,ed)); }
00249       const vector<Range>& getExons() const { return exons; }
00250       void setExons(const vector<Range>& ex) { exons=ex; updateOuterRange(); }
00255       void setChain(const Noschain &chain) { setExons(chain.exons); }
00258       void updateOuterRange() { Range::set(exons[0].begin(), exons.back().end()); }
00264       void set(const Range &r) throw (Badinput);
00272       void setEnd(int ne) throw (PointOutChain);
00273       void setBegin(int nb);
00280       void growEnd(int icr);
00289       int advancePosOnExon(int pos, unsigned int d) const;
00291       int retreatPosOnExon(int pos, unsigned int d) const;
00296       int exonIndex(int p) const;
00297       int exonIndex(const Range &r) const;
00301       bool insideExon(int p) const { return exonIndex(p) != -1; }
00310       bool insideExon(const Range &r, bool samedir=false) const;
00311       
00312       /*** Output functions *****/
00313 
00316       virtual ostream& show(ostream &ous=cout) const;
00320       friend ostream& operator<<(ostream &ous, const Noschain &nc) 
00321          { return nc.print(ous); }
00326       virtual ostream& print(ostream &ous) const;
00331       string toString() const;
00337       pair<string,string> startEnd(char sep=',') const;
00342       string jgiformat(char sep='\t') const;
00346       string sfExonsGenomic() const;
00347       string sfExonsTranscript() const;
00348 
00355       Noschain onewayChain() const;
00359       void onewayChain(Noschain &owc) const;
00369       void exonOnlyChain(Noschain &eoc) const;
00371       Noschain exonOnlyChain() const;
00376       string asDelimitedString(const char sep=':', const char rsep[]=",") const;
00377       string asDelimitedString(const char sep=':', const char rsep=',') const;
00378 
00394       //bool exonContain(const Noschain &nc, bool samedirection=false) const throw(PointOutChain);
00395       bool exonContain(const Noschain &nc, bool samedirection=false) const;
00417       int compareChainAndFix(Noschain &nc, int &edit);
00443       bool exonOverlap(const Noschain &nc) const;
00459       int exonOverlapLength(const Noschain &nc) const;
00467       int exonIntersectLength(const Noschain &nc) const;
00470       pair<float, float> exonIntersectFraction(const Noschain &nc) const;
00492       bool append(Noschain &nc);
00498       void extendEnd(const Noschain &nc);
00499 
00506       Noschain& reverse();
00516       void subsequence(string &sub, const string &gs) const throw(Badinput);
00522       bool isDangle(const int intronlen=900, const int exonlen=27) const;
00528       bool directionAgree() const;
00529 
00530    protected:
00540       vector<Range> exons;
00544       //Range cds;
00545 };
00546 
00548 void errorMessageFix1(Noschain &c1, Noschain &c2, int status, int end);
00549 
00550 extern int FUZZYMARGIN; //=16;
00566 int comparePlusChainFirstBefore(Noschain &c1, Noschain &c2);
00567 int comparePlusChainIdenticalLeft(Noschain &c1, Noschain &c2);
00568 int compareMinusChainFirstBefore(Noschain &c1, Noschain &c2);
00569 int compareMinusChainMinusIdenticalRight(Noschain &c1, Noschain &c2);
00570 
00574 class lessByChainDirectionPtr {
00575    public:
00576       bool operator()(const Noschain *c1, const Noschain *c2) const {
00577          return c1->compareByDirection(*c2) == -1;
00578       }
00579 };
00580 
00581 class lessByChainDirection {
00582    public:
00583       bool operator()(const Noschain &c1, const Noschain &c2) const {
00584          return c1.compareByDirection(c2) == -1;
00585       }
00586 };
00587 
00591 class lessChainPtr {
00592    public:
00593       bool operator()(const Noschain *c1, const Noschain *c2) const {
00594          return *c1 < *c2;
00595       }
00596 };
00597 
00602 class Alnchain : public Noschain {
00603    public:
00604       Alnchain(const string &exstr) 
00605          : Noschain(exstr), iden(100), cov(1) { }
00608       Alnchain(int begin, const string &cigar, int nmismatch);
00609       Alnchain(int begin, const string &cigar, char strand, int nmismatch);
00610       Alnchain() : Noschain(), iden(100), cov(1) { }
00611       Alnchain(double id, double cv) : Noschain(), iden(id), cov(cv) { }
00612       Alnchain(const string &exons, double identity, double coverage) : Noschain(exons), iden(identity), cov(coverage) {}
00613 
00614       double identity() const { return iden; }
00615       double coverage() const { return cov; }
00616       void setIdentity(double newiden) { iden=newiden; }
00617       void setCoverage(double newcov) { cov=newcov; }
00622       string toString(const char sep='\t') const;
00623       Alnchain& operator=(const Alnchain &ch);
00625       ostream& print(ostream &ous) const;
00626       //friend ostream& operator<<(ostream &ous, const Alnchain &chn);
00627 
00628    protected:
00630       double iden;
00631       double cov;
00632 };
00633 
00638 class Alnchainid : public Alnchain {
00639    public:
00640       Alnchainid() : Alnchain(), estids() { }
00647       Alnchainid(double seqiden, double cv, const string &estid) 
00648          : Alnchain(seqiden, cv), estids(1, estid) { }
00654       Alnchainid(int begin, const string &cigar, int nmismatch, const string &estid)
00655          : Alnchain(begin,cigar,nmismatch), estids(1,estid) { }
00657       Alnchainid(int begin, const string &cigar, char strand, int nmismatch, const string &estid)
00658          : Alnchain(begin,cigar,strand,nmismatch), estids(1,estid) { }
00661       Alnchainid(const string &exons, double identity, double coverage,
00662               const string &estidstr) : Alnchain(exons, identity, coverage) { setESTIds(estidstr); }
00663 
00665       void addESTId(const string &estid) { estids.push_back(estid); }
00670       void setESTIds(const string &idstr);
00675       int numEST() const { return estids.size(); }
00676       const vector<string>& getESTIds() const { return estids; }
00677       vector<string>& getESTIds() { return estids; }
00688       string toString(const char sep='\t') const;
00691       string ESTIdString(const char sep=',') const;
00692       Alnchainid& operator=(const Alnchainid &ch);
00696       void absorbESTIds(const Alnchainid &ch) { estids.insert(estids.end(), ch.estids.begin(), ch.estids.end()); }
00706       void averageWith(const Alnchainid &chn);
00708       ostream& print(ostream &ous) const;
00709 
00710    private:
00714       vector<string> estids;
00715 };
00716 
00718 
00727 void readandstoreGmap(const string &inputFile,  
00728       map<string, set<Alnchainid*, lessChainPtr> > &alnseg,
00729       map<int,int> &pathstat);
00731 void readandstoreGmap(istream &ins,  map<string, set<Alnchainid*, lessChainPtr> > &alnseg,
00732       map<int,int> &pathstat);
00733 
00742 void archiveAln(map<string, set<Alnchainid*, lessChainPtr> > &alnseg, const string &file);
00744 void archiveAln(map<string, set<Alnchainid*, lessChainPtr> > &alnseg, ostream &ous);
00745 
00755 class GenModel : public Noschain {
00756    public:
00757       GenModel() : Noschain(), cdsphase(0), numstop(0),
00758                hasstart(true), hasstop(true),
00759                pep(0) { }
00771       GenModel(const string &exstarts, const string &exends, 
00772             char strand, int cdsb, int cdse, const string &gi, 
00773             int ii, const string &si) throw(InvalidModel);
00774 
00781       GenModel(const string &exstarts, const string &exends, 
00782             char strand, int cdsb, int cdse, const string &gi, 
00783             int ii, const string &si, const string &track, int trackid) throw(InvalidModel);
00784       ~GenModel();
00785       ostream& show(ostream &ous) const;
00786       Noschain FivePrimeUTR() const { return subchainBeforePoint(cds.begin()); }
00789       Noschain ThreePrimeUTR() const { return subchainAfterPoint(cds.end()); }
00790       int num5NoncodingExons() const { return FivePrimeUTR().numberOfRanges(); }
00791       int num3NoncodingExons() const { return ThreePrimeUTR().numberOfRanges(); }
00792       Noschain CDS() const throw (PointOutChain) { return subchain(cds); }
00793       Range CDSRange() const { return cds; }
00794       int FivePrimeUTRLength() const { return subchainBeforePoint(cds.begin()).exonLength(); }
00795       int ThreePrimeUTRLength() const { return subchainAfterPoint(cds.end()).exonLength(); }
00796       int UTRLength() const { return FivePrimeUTRLength() + ThreePrimeUTRLength(); }
00797       int CDSLength() const throw (PointOutChain) { return CDS().exonLength(); }
00799       double CDSFraction() const { return static_cast<double>(CDSLength())/length(); }
00800 
00822       //bool valid();
00823       int valid();
00829       bool valid(const string &gs); 
00831       //bool good() const;
00832       // information
00833       //string& genomicId() const { return gid; }
00834       const string& genomicId() const { return gid; }
00837       const string& modelName() const { return name; }
00838       const string& getName() const { return name; }
00839       int getId() const { return id; }
00843       string CDSSeq(const string &gs) const throw (PointOutChain);
00847       void growEnd(int icr);
00850       void setEnd(int ne) { Noschain::setEnd(ne); cds.setEnd(ne); }
00857       string toString(char sep='\t') const;
00858       string toSQLString(char sep=',') const;
00863       string toJGIModel(char sep='\t') const;
00880       ostream& printAllmod(ostream &ous, char sep='\t') const;
00881       bool operator==(const GenModel &gm) const { return Noschain::operator==(gm) && cds==gm.cds; }
00882       bool sameExons(const GenModel &gm) const { return Noschain::operator==(gm); }
00883       void makeProtein(const string &gs);
00884       const string& pepstr() const { return pep->getSequence(); }
00885       bool hasBegin() const { return hasstart; }
00886       bool hasEnd() const { return hasstop; }
00887 
00888    private:
00895       string gid;  
00901       int id;
00906       string name;
00912       Range cds;
00913 
00916       int cdsphase;
00918       int numstop;
00919       bool hasstart, hasstop;
00920       Protein *pep;
00923       string track;
00925       int trackid;
00926 
00930       static const int minModelLen=100;
00933       static const int minCDSLen=80;
00934 };
00935 
00936 #endif

Generated on Wed Aug 10 11:56:52 2011 for Softwares from Orpara by  doxygen 1.5.6