Match.h

Go to the documentation of this file.
00001 #ifndef MATCH_H
00002 #define MATCH_H
00003 
00004 #include <string>
00005 #include <vector>
00006 #include "Range.h"
00007 #include <iostream>
00008 #include <list>
00009 #include <set>
00010 #include "bioseq.h"
00011 
00012 using namespace std;
00013 
00014 // Match.h file
00015 
00025 class Match {
00026    public:
00028       Match() : q(), t() {}
00029       Match(const Match &m) : q(m.q), t(m.t) { }
00039       Match(int qb, int qe, int tb, int te) : q(qb,qe), t(tb,te) {}
00040       ~Match() {}
00041       Match& operator=(const Match &m);
00063       bool operator<(const Match &m) const;
00065       bool operator==(const Match &m) const { return q==m.q && t==m.t; }
00066 
00068 
00070       const Range& queryRange() const { return q; }
00071       const Range& targetRange() const { return t; }
00072       int targetBegin() const { return t.begin(); }
00073       int targetEnd() const { return t.end(); }
00074       int queryBegin() const { return q.begin(); }
00075       int queryEnd() const { return q.end(); }
00076       int queryLength() const { return q.length(); }
00077       int targetLength() const { return t.length(); }
00078       char targetDirection() const { return t.direction(); }
00079 
00084       bool onPlus() const { return t.direction() == '+' && q.direction() == '+'; }
00085       bool onMinus() const { return q.direction() == '-' && t.direction() == '-'; }
00086       /* the protein is always in the + direction.
00087        * the DNA can be + or -
00088        * The following methods will probe the direction of the match on DNA.
00089        */
00090       bool targetOnPlus() const { return t.direction() == '+'; }
00091       bool targetOnMinus() const { return t.direction() == '-'; }
00093       bool isNull() const { return q.isNull() && t.isNull(); }
00094 
00097       
00099       int qoverlap(const Match &m, bool sameDirection=true) const {
00100          return q.overlap(m.q, sameDirection); }
00102       int toverlap(const Match &m, bool sameDirection=true) const {
00103          return t.overlap(m.t, sameDirection); }
00111       pair<int, int> overlap(const Match &m, bool sd=true) const
00112       { return make_pair(q.overlap(m.q, sd), t.overlap(m.t, sd)); }
00113 
00115       pair<int, int> overlay(const Match &m) const
00116       { return make_pair(q.overlay(m.q), t.overlay(m.t)); }
00117       pair<float,float> overlayFraction(const Match &m) const {
00118          return make_pair(q.overlayFraction(m.q), t.overlayFraction(m.t)); }
00119 
00120       /* the overlap/max(L1,L2) where L1 is the range1's length
00121        * L2 is range2's length
00122        */
00123       //pair<float, float> overlapFraction(const Match &m, bool sameDirection=true) const;
00124       
00130       pair<float, float> queryOverlapFraction(const Match &m, bool sameDirection=true) const;
00131       float queryOverlapFractionMax(const Match &m, bool sameDirection=true) const { pair<float,float> f=queryOverlapFraction(m,sameDirection); 
00132          return f.first > f.second? f.first : f.second;
00133       }
00139       float queryOverlapFractionMerge(const Match &m, bool sameDirection=true) const { return q.overlapFraction(m.q, sameDirection); }
00140       // return the target overlap fractions relative to this and m object
00141       pair<float, float> targetOverlapFraction(const Match &m, bool sameDirection=true) const;
00142       float targetOverlapFractionMax(const Match &m, bool sameDirection=true) const {
00143          pair<float,float> f=targetOverlapFraction(m,sameDirection);
00144          return f.first>f.second? f.first : f.second; 
00145       }
00152       float targetOverlapFractionMerge(const Match &m, bool sameDirection=true) const {
00153          return t.overlapFraction(m.t, sameDirection); 
00154       }
00155 
00160       bool contain(const Match &m, bool sameDirection=true) const {
00161          return q.contain(m.q, sameDirection) && t.contain(m.t, sameDirection);
00162       }
00163       bool qcontain(const Match &m, bool sd=true) const {
00164          return q.contain(m.q,sd);
00165       }
00166       bool tcontain(const Match &m, bool sd=true) const {
00167          return t.contain(m.t,sd);
00168       }
00169       bool similar(const Match &m, bool sd=true) const {
00170          return q.similar(m.q,sd) && t.similar(m.t,sd); }
00171       bool coveredBy(const Match &m, bool sd=true, float fraction=0.8) const;
00172 
00176       bool plusContain(const Match &m) const { 
00177          return q.plusContain(m.q) && t.plusContain(m.t); }
00178       bool qplustminusContain(const Match &m) const { 
00179          return q.plusContain(m.q) && t.minusContain(m.t); }
00180 
00181       int queryDistance(const Match &m) { 
00182          return q.distance(m.q); }
00183       int targetDistance(const Match &m) { 
00184          return t.distance(m.t); }
00185 
00187 
00193       pair<int,int> combine(const Match &m) {
00194          return make_pair(q.combine(m.q), t.combine(m.t));
00195       }
00196       void setNull() { q=Range(); t=Range(); }
00197       void setTargetRange(int b, int e) { t.set(b,e); }
00198       void set(int qb, int qe, int tb, int te) {
00199          q.set(qb,qe); t.set(tb, te); }
00200 
00202 
00203       friend ostream& operator<<(ostream &ous, const Match &m) {
00204          ous << m.q << " x " << m.t; return ous; }
00207       ostream& output(ostream &ous, const char delimiter[]="\t") const {
00208          q.output(ous,delimiter) << delimiter;
00209          t.output(ous,delimiter);
00210          return ous;
00211       }
00213       string asDelimitedString(const char delimiter[]=",") const {
00214          return q.asDelimitedString(delimiter) + delimiter
00215             + t.asDelimitedString(delimiter); }
00216 
00217    protected:
00218       Range q, t;
00219 };
00220 
00230 class M8Match : public Match {
00231    public:
00232       M8Match() : Match(), iden(0), len(0), mis(0), gap(0), e(9999), _score(0) {}
00233       M8Match(const M8Match &m) : Match(m), iden(m.iden), len(m.len), 
00234                                   mis(m.mis), gap(m.gap), e(m.e), _score(m._score) { }
00235       M8Match(float identity, int alnlen, int mismatches, int gaps, 
00236             int qbegin, int qend, int tbegin, int tend, double expect, 
00237             double score)
00238          : Match(qbegin,qend,tbegin,tend), iden(identity), len(alnlen), 
00239            mis(mismatches), gap(gaps),e(expect), _score(score) {}
00240       M8Match& operator=(const M8Match &m);
00241       float identity() const { return iden; }
00242       int alnlen() const { return len; }
00243       double expect() const { return e; }
00244       double score() const { return _score; }
00245 
00248       friend ostream& operator<<(ostream &ous, M8Match &m);
00249 
00254       ostream& output(ostream &ous, const char delimiter[]="\t") const;
00257       string asDelimitedString(const char delimiter[]=",") const;
00258       
00263       bool contain(const M8Match& m, bool sameDirection=true, float factor=0.8) const;
00264       void combine(const M8Match &m);
00265 
00266    protected:
00268       float iden; 
00270       int len;   
00271       int mis, gap;
00272       double e, _score;
00273 };
00274 
00276 
00278 class cmpMatchTbeginInt {
00279    public:
00280       bool operator()(const Match &m, int x) { return m.targetBegin() < x; }
00281       bool operator()(int x, const Match &m) { return x < m.targetBegin(); }
00282 };
00284 class cmpMatchTbeginIntPtr {
00285    public:
00286       // according to SGI, equal_range use comp(*k, value)
00287       // in template <class ForwardIterator, class T, class StrictWeakOrdering>
00288       // pair<ForwardIterator, ForwardIterator>
00289       // equal_range(ForwardIterator first, ForwardIterator last, 
00290       // const T& value, StrictWeakOrdering comp);
00291       bool operator()(const Match *m, int x) { return m->targetBegin() < x; }
00292       // so the following version was not used by the STL
00293       // but some place in the document, it seemes to require both
00294       // versions
00295       bool operator()(int x, const Match *m) { return x < m->targetBegin(); }
00296 };
00297 class cmpMatchTendInt {
00298    public:
00299       bool operator()(const Match &m, int x) { return m.targetEnd() < x; }
00300       bool operator()(int x, const Match &m) { return x < m.targetEnd(); }
00301 };
00302 class cmpMatchTendIntPtr {
00303    public:
00304       bool operator()(const Match* m, int x) { return m->targetEnd() < x; }
00305       bool operator()(int x, const Match* m) { return x < m->targetEnd(); }
00306 };
00307 /*
00308 class MatchTbeginLessThan {
00309    public:
00310       MatchTbeginLessThan(int x) : xpoint(x) { }
00311       bool operator()(const Match &m) { return m.queryBegin()<xpoint; }
00312    private:
00313       int xpoint;
00314 };
00315 */
00316 
00319 class cmpQueryPtr {
00320    public:
00321    bool operator()(const Match* m1, const Match* m2) const {
00322       if (m1->queryRange() == m2->queryRange())
00323          return m1->targetRange() < m2->targetRange();
00324       else return m1->queryRange() < m2->queryRange(); }
00325 };
00326 
00329 class cmpTargetPtr {
00330    public:
00331    bool operator()(const Match* m1, const Match* m2) const {
00332       if (m1->targetRange() == m2->targetRange())
00333          return m1->queryRange() < m2->queryRange();
00334       else return m1->targetRange() < m2->targetRange(); }
00335 };
00344 class cmpTargetDirectionalPtr {
00345    public:
00346    bool operator()(const Match* m1, const Match* m2) const {
00347       int cr= m1->targetRange().compareDirectional(m2->targetRange());
00348       if (cr == -1) return true;
00349       else if (cr == 1) return false;
00350       else return m1->queryRange().directionalLess(m2->queryRange());
00351    }
00352 };
00353 
00354 class beforeMatchQuery {
00355    public:
00356       beforeMatchQuery(double fr=0.2) : ovlpfr(fr) { }
00357       bool operator()(const Match &m1, const Match &m2) const {
00358          return m1.queryRange() < m2.queryRange() && m1.queryOverlapFractionMerge(m2) < ovlpfr;
00359       }
00360    private:
00361       double ovlpfr;
00362 };
00366 class afterMatchQuery {
00367    public:
00368       afterMatchQuery(double fr=0.2) : ovlpfr(fr) { }
00369       bool operator()(const Match &m1, const Match &m2) const {
00370          return m1.queryRange() > m2.queryRange() && m1.targetOverlapFractionMerge(m2) < ovlpfr;
00371       }
00372    private:
00373       double ovlpfr;
00374 };
00377 class notlessMatchQueryBegin {
00378    public:
00379       bool operator()(const Match &m1, const Match &m2) const {
00380          return m1.queryBegin() >= m2.queryBegin();
00381       }
00382 };
00383 
00388 class M8MatchEX : public M8Match {
00389    public:
00390       friend class Boxchain;
00391       M8MatchEX() : M8Match(), maxScore(0), before(0), after(0) { }
00392       M8MatchEX(const M8MatchEX &m) : M8Match(m), maxScore(m.maxScore), before(m.before), after(m.after) { }
00393 
00394       /* assign the maxScore to the score of regular blast */
00395       M8MatchEX(const M8Match &m) : M8Match(m), maxScore(m.score()), before(0), after(0) { }
00396       M8MatchEX& operator=(const M8MatchEX &m);
00397       /* maxScore initialized to m._score */
00398       M8MatchEX& operator=(const M8Match &m);
00399       double getMaxScore() const { return maxScore; }
00400       void setMaxScore(double sc) { maxScore=sc; }
00401       /* only link to previous nodes */
00402       void attachTo(M8MatchEX &m) {
00403          maxScore=m.maxScore + score(); before=&m; }
00404       friend ostream& operator<<(ostream &ous, const M8MatchEX &m);
00405       M8MatchEX* previous() const { return before; }
00406       M8MatchEX* next() const { return after; }
00407       void setPrevious(M8MatchEX* p) { before=p; }
00408       void setNext(M8MatchEX* p) { after=p; }
00409       void setDefault() { before=0; maxScore=score(); }
00410 
00411 
00412    private:
00414       double maxScore;
00416       M8MatchEX* before;
00417       M8MatchEX* after; // for the chaining algorithm
00418 };
00420 // must be defined after M8MatchEX
00421 class MatchQueryLessThan {
00422    public:
00423       MatchQueryLessThan(const M8MatchEX& m, double fr=0.2) 
00424          : mm(m), ovlpfr(fr) { }
00425       MatchQueryLessThan(const M8MatchEX* m, double fr=0.2) 
00426          : mm(*m), ovlpfr(fr) { }
00427       bool operator()(const M8MatchEX *m) const {
00428          //return m->queryRange() < mm->queryRange() && m->queryOverlapFractionMerge(*mm) < ovlpfr;
00429          return m->queryRange() < mm.queryRange() && m->queryOverlapFractionMerge(mm) < ovlpfr;
00430       }
00431    private:
00432       const M8MatchEX& mm;
00433       double ovlpfr;
00434 };
00435 
00436 class ltMatchQueryBeginPtr {
00437    public:
00438       bool operator()(const M8MatchEX *p1, const M8MatchEX *p2) const {
00439          // this version will keep multiple scores for the 
00440          // same queryBegin location, we don't want this
00441          /*
00442          if (p1->queryBegin() == p2->queryBegin()) 
00443             return p1->getMaxScore() < p1->getMaxScore();
00444          else return p1->queryBegin() < p2->queryBegin();
00445          */
00446          return p1->queryBegin() < p2->queryBegin();
00447       }
00448 };
00449 
00450 class Boxchain;
00455 class M8MatchChain {
00456    public:
00457       friend class Tblastn;
00458       M8MatchChain() : chain(), _first(true), _last(true), sumiden(0), 
00459                        sumaln(0), sumscore(0) {}
00465       //M8MatchChain(Boxchain &ch);
00466       M8MatchChain(M8Match *m) 
00467          : chain(), _first(true), _last(true), 
00468            sumiden(m->identity()*m->alnlen()),
00469            sumaln(m->alnlen()), sumscore(m->score()) { 
00470             chain.push_back(m); 
00471             qchain.add(m->queryRange(), true); }
00472 
00476       void add(M8Match *m);
00477       void nofirst() { _first=false; }
00478       void nolast() { _last=false; }
00479       bool hasFirst() const { return _first; }
00480       bool hasLast() const { return _last; }
00481 
00482       // begining of the model, is on the plus strand, the smallest 
00483       // if - strand the largest number
00484       int getTargetBegin() const;
00485       int getTargetEnd() const;
00486       int getQueryBegin() const;
00487       int getQueryEnd() const;
00488 
00489       // this method only make sense if the chain is 
00490       // not empty, if empty return ' '
00491       char direction() const;
00492       int queryOverlap() const { return qchain.getOverlap(); }
00493       int queryLength() const { return qchain.length().first; }
00494 
00495       // use the M8Match operator <<
00496       friend ostream& operator<<(ostream &ous, const M8MatchChain &c);
00497       // not useful for output
00500       ostream& output(ostream &ous, char delimiter[]="\t") const;
00501 
00505       float avgIdentity() const { return (float)(sumiden/sumaln); }
00506       double sumScore() const { return sumscore; }
00507       int size() const { return chain.size(); }
00508 
00509    private:
00510       vector<M8Match*> chain; // sorted on genomic from small to large
00511 
00512      /* record completness regarding the matches
00513       * available. Not the true start or end
00514       */
00515       bool _first; // contain or overlap first match
00516       bool _last;  // contain or overlap last match
00517       RangeChain qchain; // query range chain
00518       // not needed void computestat();
00519 
00520       double sumiden;
00521       int sumaln;
00522       double sumscore;
00523       //float avgiden;
00524 };
00525 
00529 class Tblastn {
00530    public:
00531       Tblastn() : qid(), tid(), matches(), matchesQS(), sortedByQuery(false), sortedByTarget(false) {}
00532       Tblastn(const string &query, const string &target, int queryLen, int targetLen)
00533          : qid(query), tid(target), qlen(queryLen), tlen(targetLen), 
00534             matches(), matchesQS(), sortedByQuery(false),
00535             sortedByTarget(false) {}
00540       Tblastn(const Tblastn& tbn);
00541       Tblastn& operator=(const Tblastn& tbn);
00542       ~Tblastn();
00543       void clear() { removeAll(); }
00544 
00546 
00550       void addMatch(float identity, int alnlen, int mismatches, int gaps,
00551             int qbegin, int qend, int tbegin, int tend, double expect, double score);
00552       void addMatch(M8Match *mptr) { matches.push_back(new M8Match(*mptr)); }
00553       //void removeMatch(M8Match *mptr) { delete mptr; matches.remove(mptr); }
00554       void add(const Tblastn& tbn);
00555 
00564       void setSortedByTarget(bool yes=true) { sortedByTarget=yes; }
00565       bool isSortedByTarget() const { return sortedByTarget; }
00566 
00568 
00569       // dump all rows to ostream ous
00570       ostream& output(ostream& ous, char delimiter[]="\t") const;
00571 
00574       void outputModel(const M8MatchChain &amodel, int &id, 
00575             ostream& ouss, ostream &ousd, char delimiter[]="\t") const;
00598       void outputModel(const Boxchain &amodel, int &id, 
00599             ostream& ouss, ostream &ousd, char delimiter[]="\t") const;
00600       // for human to read
00601       friend ostream& operator<<(ostream &ous, const Tblastn &b);
00602       // debug show
00603       void show(ostream &ous) const;
00604 
00606       
00634       //int removeJunk(ostream &ous, list<string> &rows, 
00635        //     const Protein &qseq, const DNA &tseq, float factor=0.8);
00636       int removeJunk(const Protein &qseq, const DNA &tseq, float factor=0.8);
00637 
00646       void findFootprint(ostream &SUMM, ostream &DETA, float covcut=0.4);
00647       /* use the sparse dynamic programming to select the best
00648        * models.  Could be more than one models
00649        */
00650       //void bestModel();
00651       //
00654       vector<Match> findSections();
00655 
00657 
00658       bool empty() { return matches.empty(); }
00659       const string& query() const { return qid; }
00660       const string& target() const { return tid; }
00661 
00665       int numMatches() const { return matches.size(); }
00666       const vector<M8Match*>& getMatches() const { return matches; }
00667       int queryLength() const { return qlen; }
00668       int targetLength() const { return tlen; }
00669       const Range& firstQueryRange() const;
00670       const Range& lastQueryRange() const;
00675       float qcov() const;
00676 
00681       bool isFirstQueryMatch(const M8Match* m) const;
00682 
00687       bool overlapFirstQueryMatch(const M8Match* m, int cutoff=9) const;
00688       bool isLastQueryMatch(const M8Match* m) const;
00689       bool overlapLastQueryMatch(const M8Match* m, int cutoff=9) const;
00690 
00692       //
00703       //vector<M8Match*> checkContain() const;
00704       int checkContain(ostream &ous, float factor=0.8) const;
00705       bool sameQTPair(const Tblastn& tbn) {
00706          return qid==tbn.qid && tid == tbn.tid; }
00707 
00709 
00726       void sortMatchByTarget();
00727 
00729       void directionalSortMatchByTarget();
00730 
00732       static void readConfig(const string &conf);
00733       static double getIntronProbility(int len);
00734 
00735    private: 
00736       string qid, tid; // query and target id
00737       int qlen, tlen; // query and target length
00746       vector<M8Match*> matches;
00747       bool sortedByTarget; // state variable for matches
00753       void erase(int b, int e);
00759       int eraseMatch(const Range& r);
00762       void removeNullMatch();
00763 
00773       mutable vector<M8Match*> matchesQS; // sorted by query
00777       mutable bool sortedByQuery;
00778       void sortMatchByQuery() const;
00780       int removeAll(list<string> &rmlist, ostream &ous);
00785       int removeAll();
00789       //vector<M8MatchChain*> models;
00790 
00791       static float qcovCutoff;
00795       //static vector<pair<int, double> > intronProb;
00796       static vector<double> intronProb;
00797 };
00798 
00804 class Boxchain {
00805    public:
00806       //Boxchain() : chain() { }
00807       Boxchain() : begin(0), end(0), chlen(0), iden(0), qolp(-1) { }
00810       ~Boxchain();
00824       Boxchain(M8MatchEX* p);
00829       Boxchain(const Boxchain &bc);
00830       //Boxchain& operator=(M8MatchEX* end);
00832       Boxchain& operator=(const Boxchain& bc);
00837       void show(ostream &ous) const;
00838       const M8MatchEX* last() const { return end; }
00839       const M8MatchEX* first() const { return begin; }
00840       M8MatchEX* first() { return begin; }
00841       M8MatchEX* last() { return end; }
00842       int numMatch() const { return chlen; }
00843       char direction() const { return begin->targetDirection(); }
00844       // oputput functions 
00845       int queryBegin() const { return begin->queryBegin(); }
00846       int queryEnd() const { return end->queryEnd(); }
00849       int targetBegin() const { return begin->targetBegin(); }
00850       int targetEnd() const { return end->targetEnd(); }
00854       double sumScore() const { return end->getMaxScore(); }
00858       double maxScore() const { return end->getMaxScore(); }
00859       double avgIdentity() const;
00862       int queryOverlap() const;
00869       ostream& summaryOutput(ostream &ous, const char dl[]="\t") const;
00870       ostream& detailOutput(ostream &ous, const string &qid, const string &tid,
00871             const int &id, const char dl[]="\t") const;
00872 
00877       int queryCovered() const { return end->queryEnd()-begin->queryBegin()+1; }
00881       int querySeqCovered() const;
00882 
00883 
00884    private:
00885       //list<M8MatchEX*> chain;
00889       M8MatchEX* begin;
00890       M8MatchEX* end;
00893       int chlen;  // number of segments
00895       mutable double iden;
00898       mutable int qolp;
00899 };
00900 
00907 class Linkmatch {
00908    public:
00909       ~Linkmatch();
00910       //TblastnEx(const vector<M8Match*> &match);
00916       Linkmatch(Tblastn &tb);
00917       typedef multiset<M8MatchEX*, ltMatchQueryBeginPtr>::iterator bestiterator;
00918       typedef multiset<M8MatchEX*, ltMatchQueryBeginPtr>::reverse_iterator bestreverse_iterator;
00919 
00920       /* the following two functions reduce performance
00921        * but they look nicer
00922        */
00923       //vector<const M8MatchEX*> startAt(int x) const;
00924       //vector<M8MatchEX*> endAt(int x) const;
00937       template<class ForwardIterator> 
00938          M8MatchEX* bestModel(ForwardIterator first, ForwardIterator last, const Match &box);
00943       M8MatchEX* bestModel(const Match &sec);
00948       //void bestModel(vector<Boxchain>& topmodels);
00949       void bestModel(vector<Boxchain*>& topmodels);  // pointer version
00950       //void nextBestModel();
00951       //void bestModel(const Range& sec);
00952       //
00956       void showBest(ostream &ous) const;
00958       void show_best(ostream &ous) const;
00959       void show_end2match(ostream &ous) const;
00960       void show_start2match(ostream &ous) const;
00961       void show_sections(ostream &ous) const;
00970       //vector<Range> findSections() const;
00977       void removeChain(M8MatchEX* p, Match &box);
00978 
00979    private:
00984       //vector<M8MatchEX> elem;
00985       //list<int> xpoints;
00986       set<int> xpoints;
00987       // vector is hard to implement search of Ranges
00988       multimap<int, M8MatchEX*> start2match;
00989       multimap<int, M8MatchEX*> end2match;
00994       multiset<M8MatchEX*, ltMatchQueryBeginPtr> best;
01001       void cleanUpBest(bestiterator bi);
01002       vector<Match> sections;
01003 };
01004 
01005 #endif

Generated on Wed Aug 10 11:56:56 2011 for Softwares from Orpara by  doxygen 1.5.6