00001 #ifndef MATCH_H
00002 #define MATCH_H
00003
00004 #include <string>
00005 #include <vector>
00006 #include "Range.h"
00007 #include <iostream>
00008 #include <list>
00009 #include <set>
00010 #include "bioseq.h"
00011
00012 using namespace std;
00013
00014
00015
00025 class Match {
00026 public:
00028 Match() : q(), t() {}
00029 Match(const Match &m) : q(m.q), t(m.t) { }
00039 Match(int qb, int qe, int tb, int te) : q(qb,qe), t(tb,te) {}
00040 ~Match() {}
00041 Match& operator=(const Match &m);
00063 bool operator<(const Match &m) const;
00065 bool operator==(const Match &m) const { return q==m.q && t==m.t; }
00066
00068
00070 const Range& queryRange() const { return q; }
00071 const Range& targetRange() const { return t; }
00072 int targetBegin() const { return t.begin(); }
00073 int targetEnd() const { return t.end(); }
00074 int queryBegin() const { return q.begin(); }
00075 int queryEnd() const { return q.end(); }
00076 int queryLength() const { return q.length(); }
00077 int targetLength() const { return t.length(); }
00078 char targetDirection() const { return t.direction(); }
00079
00084 bool onPlus() const { return t.direction() == '+' && q.direction() == '+'; }
00085 bool onMinus() const { return q.direction() == '-' && t.direction() == '-'; }
00086
00087
00088
00089
00090 bool targetOnPlus() const { return t.direction() == '+'; }
00091 bool targetOnMinus() const { return t.direction() == '-'; }
00093 bool isNull() const { return q.isNull() && t.isNull(); }
00094
00097
00099 int qoverlap(const Match &m, bool sameDirection=true) const {
00100 return q.overlap(m.q, sameDirection); }
00102 int toverlap(const Match &m, bool sameDirection=true) const {
00103 return t.overlap(m.t, sameDirection); }
00111 pair<int, int> overlap(const Match &m, bool sd=true) const
00112 { return make_pair(q.overlap(m.q, sd), t.overlap(m.t, sd)); }
00113
00115 pair<int, int> overlay(const Match &m) const
00116 { return make_pair(q.overlay(m.q), t.overlay(m.t)); }
00117 pair<float,float> overlayFraction(const Match &m) const {
00118 return make_pair(q.overlayFraction(m.q), t.overlayFraction(m.t)); }
00119
00120
00121
00122
00123
00124
00130 pair<float, float> queryOverlapFraction(const Match &m, bool sameDirection=true) const;
00131 float queryOverlapFractionMax(const Match &m, bool sameDirection=true) const { pair<float,float> f=queryOverlapFraction(m,sameDirection);
00132 return f.first > f.second? f.first : f.second;
00133 }
00139 float queryOverlapFractionMerge(const Match &m, bool sameDirection=true) const { return q.overlapFraction(m.q, sameDirection); }
00140
00141 pair<float, float> targetOverlapFraction(const Match &m, bool sameDirection=true) const;
00142 float targetOverlapFractionMax(const Match &m, bool sameDirection=true) const {
00143 pair<float,float> f=targetOverlapFraction(m,sameDirection);
00144 return f.first>f.second? f.first : f.second;
00145 }
00152 float targetOverlapFractionMerge(const Match &m, bool sameDirection=true) const {
00153 return t.overlapFraction(m.t, sameDirection);
00154 }
00155
00160 bool contain(const Match &m, bool sameDirection=true) const {
00161 return q.contain(m.q, sameDirection) && t.contain(m.t, sameDirection);
00162 }
00163 bool qcontain(const Match &m, bool sd=true) const {
00164 return q.contain(m.q,sd);
00165 }
00166 bool tcontain(const Match &m, bool sd=true) const {
00167 return t.contain(m.t,sd);
00168 }
00169 bool similar(const Match &m, bool sd=true) const {
00170 return q.similar(m.q,sd) && t.similar(m.t,sd); }
00171 bool coveredBy(const Match &m, bool sd=true, float fraction=0.8) const;
00172
00176 bool plusContain(const Match &m) const {
00177 return q.plusContain(m.q) && t.plusContain(m.t); }
00178 bool qplustminusContain(const Match &m) const {
00179 return q.plusContain(m.q) && t.minusContain(m.t); }
00180
00181 int queryDistance(const Match &m) {
00182 return q.distance(m.q); }
00183 int targetDistance(const Match &m) {
00184 return t.distance(m.t); }
00185
00187
00193 pair<int,int> combine(const Match &m) {
00194 return make_pair(q.combine(m.q), t.combine(m.t));
00195 }
00196 void setNull() { q=Range(); t=Range(); }
00197 void setTargetRange(int b, int e) { t.set(b,e); }
00198 void set(int qb, int qe, int tb, int te) {
00199 q.set(qb,qe); t.set(tb, te); }
00200
00202
00203 friend ostream& operator<<(ostream &ous, const Match &m) {
00204 ous << m.q << " x " << m.t; return ous; }
00207 ostream& output(ostream &ous, const char delimiter[]="\t") const {
00208 q.output(ous,delimiter) << delimiter;
00209 t.output(ous,delimiter);
00210 return ous;
00211 }
00213 string asDelimitedString(const char delimiter[]=",") const {
00214 return q.asDelimitedString(delimiter) + delimiter
00215 + t.asDelimitedString(delimiter); }
00216
00217 protected:
00218 Range q, t;
00219 };
00220
00230 class M8Match : public Match {
00231 public:
00232 M8Match() : Match(), iden(0), len(0), mis(0), gap(0), e(9999), _score(0) {}
00233 M8Match(const M8Match &m) : Match(m), iden(m.iden), len(m.len),
00234 mis(m.mis), gap(m.gap), e(m.e), _score(m._score) { }
00235 M8Match(float identity, int alnlen, int mismatches, int gaps,
00236 int qbegin, int qend, int tbegin, int tend, double expect,
00237 double score)
00238 : Match(qbegin,qend,tbegin,tend), iden(identity), len(alnlen),
00239 mis(mismatches), gap(gaps),e(expect), _score(score) {}
00240 M8Match& operator=(const M8Match &m);
00241 float identity() const { return iden; }
00242 int alnlen() const { return len; }
00243 double expect() const { return e; }
00244 double score() const { return _score; }
00245
00248 friend ostream& operator<<(ostream &ous, M8Match &m);
00249
00254 ostream& output(ostream &ous, const char delimiter[]="\t") const;
00257 string asDelimitedString(const char delimiter[]=",") const;
00258
00263 bool contain(const M8Match& m, bool sameDirection=true, float factor=0.8) const;
00264 void combine(const M8Match &m);
00265
00266 protected:
00268 float iden;
00270 int len;
00271 int mis, gap;
00272 double e, _score;
00273 };
00274
00276
00278 class cmpMatchTbeginInt {
00279 public:
00280 bool operator()(const Match &m, int x) { return m.targetBegin() < x; }
00281 bool operator()(int x, const Match &m) { return x < m.targetBegin(); }
00282 };
00284 class cmpMatchTbeginIntPtr {
00285 public:
00286
00287
00288
00289
00290
00291 bool operator()(const Match *m, int x) { return m->targetBegin() < x; }
00292
00293
00294
00295 bool operator()(int x, const Match *m) { return x < m->targetBegin(); }
00296 };
00297 class cmpMatchTendInt {
00298 public:
00299 bool operator()(const Match &m, int x) { return m.targetEnd() < x; }
00300 bool operator()(int x, const Match &m) { return x < m.targetEnd(); }
00301 };
00302 class cmpMatchTendIntPtr {
00303 public:
00304 bool operator()(const Match* m, int x) { return m->targetEnd() < x; }
00305 bool operator()(int x, const Match* m) { return x < m->targetEnd(); }
00306 };
00307
00308
00309
00310
00311
00312
00313
00314
00315
00316
00319 class cmpQueryPtr {
00320 public:
00321 bool operator()(const Match* m1, const Match* m2) const {
00322 if (m1->queryRange() == m2->queryRange())
00323 return m1->targetRange() < m2->targetRange();
00324 else return m1->queryRange() < m2->queryRange(); }
00325 };
00326
00329 class cmpTargetPtr {
00330 public:
00331 bool operator()(const Match* m1, const Match* m2) const {
00332 if (m1->targetRange() == m2->targetRange())
00333 return m1->queryRange() < m2->queryRange();
00334 else return m1->targetRange() < m2->targetRange(); }
00335 };
00344 class cmpTargetDirectionalPtr {
00345 public:
00346 bool operator()(const Match* m1, const Match* m2) const {
00347 int cr= m1->targetRange().compareDirectional(m2->targetRange());
00348 if (cr == -1) return true;
00349 else if (cr == 1) return false;
00350 else return m1->queryRange().directionalLess(m2->queryRange());
00351 }
00352 };
00353
00354 class beforeMatchQuery {
00355 public:
00356 beforeMatchQuery(double fr=0.2) : ovlpfr(fr) { }
00357 bool operator()(const Match &m1, const Match &m2) const {
00358 return m1.queryRange() < m2.queryRange() && m1.queryOverlapFractionMerge(m2) < ovlpfr;
00359 }
00360 private:
00361 double ovlpfr;
00362 };
00366 class afterMatchQuery {
00367 public:
00368 afterMatchQuery(double fr=0.2) : ovlpfr(fr) { }
00369 bool operator()(const Match &m1, const Match &m2) const {
00370 return m1.queryRange() > m2.queryRange() && m1.targetOverlapFractionMerge(m2) < ovlpfr;
00371 }
00372 private:
00373 double ovlpfr;
00374 };
00377 class notlessMatchQueryBegin {
00378 public:
00379 bool operator()(const Match &m1, const Match &m2) const {
00380 return m1.queryBegin() >= m2.queryBegin();
00381 }
00382 };
00383
00388 class M8MatchEX : public M8Match {
00389 public:
00390 friend class Boxchain;
00391 M8MatchEX() : M8Match(), maxScore(0), before(0), after(0) { }
00392 M8MatchEX(const M8MatchEX &m) : M8Match(m), maxScore(m.maxScore), before(m.before), after(m.after) { }
00393
00394
00395 M8MatchEX(const M8Match &m) : M8Match(m), maxScore(m.score()), before(0), after(0) { }
00396 M8MatchEX& operator=(const M8MatchEX &m);
00397
00398 M8MatchEX& operator=(const M8Match &m);
00399 double getMaxScore() const { return maxScore; }
00400 void setMaxScore(double sc) { maxScore=sc; }
00401
00402 void attachTo(M8MatchEX &m) {
00403 maxScore=m.maxScore + score(); before=&m; }
00404 friend ostream& operator<<(ostream &ous, const M8MatchEX &m);
00405 M8MatchEX* previous() const { return before; }
00406 M8MatchEX* next() const { return after; }
00407 void setPrevious(M8MatchEX* p) { before=p; }
00408 void setNext(M8MatchEX* p) { after=p; }
00409 void setDefault() { before=0; maxScore=score(); }
00410
00411
00412 private:
00414 double maxScore;
00416 M8MatchEX* before;
00417 M8MatchEX* after;
00418 };
00420
00421 class MatchQueryLessThan {
00422 public:
00423 MatchQueryLessThan(const M8MatchEX& m, double fr=0.2)
00424 : mm(m), ovlpfr(fr) { }
00425 MatchQueryLessThan(const M8MatchEX* m, double fr=0.2)
00426 : mm(*m), ovlpfr(fr) { }
00427 bool operator()(const M8MatchEX *m) const {
00428
00429 return m->queryRange() < mm.queryRange() && m->queryOverlapFractionMerge(mm) < ovlpfr;
00430 }
00431 private:
00432 const M8MatchEX& mm;
00433 double ovlpfr;
00434 };
00435
00436 class ltMatchQueryBeginPtr {
00437 public:
00438 bool operator()(const M8MatchEX *p1, const M8MatchEX *p2) const {
00439
00440
00441
00442
00443
00444
00445
00446 return p1->queryBegin() < p2->queryBegin();
00447 }
00448 };
00449
00450 class Boxchain;
00455 class M8MatchChain {
00456 public:
00457 friend class Tblastn;
00458 M8MatchChain() : chain(), _first(true), _last(true), sumiden(0),
00459 sumaln(0), sumscore(0) {}
00465
00466 M8MatchChain(M8Match *m)
00467 : chain(), _first(true), _last(true),
00468 sumiden(m->identity()*m->alnlen()),
00469 sumaln(m->alnlen()), sumscore(m->score()) {
00470 chain.push_back(m);
00471 qchain.add(m->queryRange(), true); }
00472
00476 void add(M8Match *m);
00477 void nofirst() { _first=false; }
00478 void nolast() { _last=false; }
00479 bool hasFirst() const { return _first; }
00480 bool hasLast() const { return _last; }
00481
00482
00483
00484 int getTargetBegin() const;
00485 int getTargetEnd() const;
00486 int getQueryBegin() const;
00487 int getQueryEnd() const;
00488
00489
00490
00491 char direction() const;
00492 int queryOverlap() const { return qchain.getOverlap(); }
00493 int queryLength() const { return qchain.length().first; }
00494
00495
00496 friend ostream& operator<<(ostream &ous, const M8MatchChain &c);
00497
00500 ostream& output(ostream &ous, char delimiter[]="\t") const;
00501
00505 float avgIdentity() const { return (float)(sumiden/sumaln); }
00506 double sumScore() const { return sumscore; }
00507 int size() const { return chain.size(); }
00508
00509 private:
00510 vector<M8Match*> chain;
00511
00512
00513
00514
00515 bool _first;
00516 bool _last;
00517 RangeChain qchain;
00518
00519
00520 double sumiden;
00521 int sumaln;
00522 double sumscore;
00523
00524 };
00525
00529 class Tblastn {
00530 public:
00531 Tblastn() : qid(), tid(), matches(), matchesQS(), sortedByQuery(false), sortedByTarget(false) {}
00532 Tblastn(const string &query, const string &target, int queryLen, int targetLen)
00533 : qid(query), tid(target), qlen(queryLen), tlen(targetLen),
00534 matches(), matchesQS(), sortedByQuery(false),
00535 sortedByTarget(false) {}
00540 Tblastn(const Tblastn& tbn);
00541 Tblastn& operator=(const Tblastn& tbn);
00542 ~Tblastn();
00543 void clear() { removeAll(); }
00544
00546
00550 void addMatch(float identity, int alnlen, int mismatches, int gaps,
00551 int qbegin, int qend, int tbegin, int tend, double expect, double score);
00552 void addMatch(M8Match *mptr) { matches.push_back(new M8Match(*mptr)); }
00553
00554 void add(const Tblastn& tbn);
00555
00564 void setSortedByTarget(bool yes=true) { sortedByTarget=yes; }
00565 bool isSortedByTarget() const { return sortedByTarget; }
00566
00568
00569
00570 ostream& output(ostream& ous, char delimiter[]="\t") const;
00571
00574 void outputModel(const M8MatchChain &amodel, int &id,
00575 ostream& ouss, ostream &ousd, char delimiter[]="\t") const;
00598 void outputModel(const Boxchain &amodel, int &id,
00599 ostream& ouss, ostream &ousd, char delimiter[]="\t") const;
00600
00601 friend ostream& operator<<(ostream &ous, const Tblastn &b);
00602
00603 void show(ostream &ous) const;
00604
00606
00634
00635
00636 int removeJunk(const Protein &qseq, const DNA &tseq, float factor=0.8);
00637
00646 void findFootprint(ostream &SUMM, ostream &DETA, float covcut=0.4);
00647
00648
00649
00650
00651
00654 vector<Match> findSections();
00655
00657
00658 bool empty() { return matches.empty(); }
00659 const string& query() const { return qid; }
00660 const string& target() const { return tid; }
00661
00665 int numMatches() const { return matches.size(); }
00666 const vector<M8Match*>& getMatches() const { return matches; }
00667 int queryLength() const { return qlen; }
00668 int targetLength() const { return tlen; }
00669 const Range& firstQueryRange() const;
00670 const Range& lastQueryRange() const;
00675 float qcov() const;
00676
00681 bool isFirstQueryMatch(const M8Match* m) const;
00682
00687 bool overlapFirstQueryMatch(const M8Match* m, int cutoff=9) const;
00688 bool isLastQueryMatch(const M8Match* m) const;
00689 bool overlapLastQueryMatch(const M8Match* m, int cutoff=9) const;
00690
00692
00703
00704 int checkContain(ostream &ous, float factor=0.8) const;
00705 bool sameQTPair(const Tblastn& tbn) {
00706 return qid==tbn.qid && tid == tbn.tid; }
00707
00709
00726 void sortMatchByTarget();
00727
00729 void directionalSortMatchByTarget();
00730
00732 static void readConfig(const string &conf);
00733 static double getIntronProbility(int len);
00734
00735 private:
00736 string qid, tid;
00737 int qlen, tlen;
00746 vector<M8Match*> matches;
00747 bool sortedByTarget;
00753 void erase(int b, int e);
00759 int eraseMatch(const Range& r);
00762 void removeNullMatch();
00763
00773 mutable vector<M8Match*> matchesQS;
00777 mutable bool sortedByQuery;
00778 void sortMatchByQuery() const;
00780 int removeAll(list<string> &rmlist, ostream &ous);
00785 int removeAll();
00789
00790
00791 static float qcovCutoff;
00795
00796 static vector<double> intronProb;
00797 };
00798
00804 class Boxchain {
00805 public:
00806
00807 Boxchain() : begin(0), end(0), chlen(0), iden(0), qolp(-1) { }
00810 ~Boxchain();
00824 Boxchain(M8MatchEX* p);
00829 Boxchain(const Boxchain &bc);
00830
00832 Boxchain& operator=(const Boxchain& bc);
00837 void show(ostream &ous) const;
00838 const M8MatchEX* last() const { return end; }
00839 const M8MatchEX* first() const { return begin; }
00840 M8MatchEX* first() { return begin; }
00841 M8MatchEX* last() { return end; }
00842 int numMatch() const { return chlen; }
00843 char direction() const { return begin->targetDirection(); }
00844
00845 int queryBegin() const { return begin->queryBegin(); }
00846 int queryEnd() const { return end->queryEnd(); }
00849 int targetBegin() const { return begin->targetBegin(); }
00850 int targetEnd() const { return end->targetEnd(); }
00854 double sumScore() const { return end->getMaxScore(); }
00858 double maxScore() const { return end->getMaxScore(); }
00859 double avgIdentity() const;
00862 int queryOverlap() const;
00869 ostream& summaryOutput(ostream &ous, const char dl[]="\t") const;
00870 ostream& detailOutput(ostream &ous, const string &qid, const string &tid,
00871 const int &id, const char dl[]="\t") const;
00872
00877 int queryCovered() const { return end->queryEnd()-begin->queryBegin()+1; }
00881 int querySeqCovered() const;
00882
00883
00884 private:
00885
00889 M8MatchEX* begin;
00890 M8MatchEX* end;
00893 int chlen;
00895 mutable double iden;
00898 mutable int qolp;
00899 };
00900
00907 class Linkmatch {
00908 public:
00909 ~Linkmatch();
00910
00916 Linkmatch(Tblastn &tb);
00917 typedef multiset<M8MatchEX*, ltMatchQueryBeginPtr>::iterator bestiterator;
00918 typedef multiset<M8MatchEX*, ltMatchQueryBeginPtr>::reverse_iterator bestreverse_iterator;
00919
00920
00921
00922
00923
00924
00937 template<class ForwardIterator>
00938 M8MatchEX* bestModel(ForwardIterator first, ForwardIterator last, const Match &box);
00943 M8MatchEX* bestModel(const Match &sec);
00948
00949 void bestModel(vector<Boxchain*>& topmodels);
00950
00951
00952
00956 void showBest(ostream &ous) const;
00958 void show_best(ostream &ous) const;
00959 void show_end2match(ostream &ous) const;
00960 void show_start2match(ostream &ous) const;
00961 void show_sections(ostream &ous) const;
00970
00977 void removeChain(M8MatchEX* p, Match &box);
00978
00979 private:
00984
00985
00986 set<int> xpoints;
00987
00988 multimap<int, M8MatchEX*> start2match;
00989 multimap<int, M8MatchEX*> end2match;
00994 multiset<M8MatchEX*, ltMatchQueryBeginPtr> best;
01001 void cleanUpBest(bestiterator bi);
01002 vector<Match> sections;
01003 };
01004
01005 #endif