alnrange.h

Go to the documentation of this file.
00001 #ifndef ALNRANGE_H
00002 #define ALNRANGE_H
00003 
00004 #include <iostream>
00005 #include <string>
00006 #include <vector>
00007 #include <list>
00008 
00009 #ifdef HAVE_PG
00010 #include "libpq++.h"
00011 #endif
00012 
00013 using namespace std;
00014 
00015 // detect chimera sequence given a cluster number
00016 
/**
 * One aligned segment described by a begin/end index pair plus three
 * quality measures (score, ngidentity, coverage).
 *
 * length() counts both end points, i.e. the interval is treated as
 * inclusive on both sides.
 */
class alnrange {
public:
    /** Creates a range with every field set to zero. */
    alnrange()
        : begin(0), end(0), score(0), ngidentity(0), cov(0)
    {}

    /** Copy constructor (defined out of line). */
    alnrange(const alnrange &r);

    /**
     * Field-wise constructor.
     * @param b  value stored in begin
     * @param e  value stored in end
     * @param s  value stored in score
     * @param ng value stored in ngidentity
     * @param c  value stored in cov
     */
    alnrange(int b, int e, double s, double ng, double c)
        : begin(b), end(e), score(s), ngidentity(ng), cov(c)
    {}

#ifdef HAVE_PG
    /**
     * Database-backed constructor, only declared when HAVE_PG is defined.
     * Defined out of line; NOTE(review): the meaning of i is not visible
     * in this header - confirm against the implementation.
     */
    alnrange(PgDatabase* db, int i);
#endif

    virtual ~alnrange() {}

    /** Copy assignment (defined out of line). */
    alnrange& operator=(const alnrange& r);

    /**
     * Tests whether this range overlaps r, using the supplied margin.
     * This method has known defects and should only be applied to
     * large ranges (longer than margin).
     */
    bool overlap(const alnrange &r, const int margin);

    /**
     * Overlap test that relies on the class-wide parameters instead of
     * an explicit margin (presumably the values given to setmargin() -
     * confirm against the implementation).
     */
    bool overlap(const alnrange &r);

    /** Number of positions covered, both end points included. */
    int length() const {
        return (end - begin) + 1;
    }

    // Plain read-only accessors.
    int getBegin() const {
        return begin;
    }
    int getEnd() const {
        return end;
    }
    double getScore() const {
        return score;
    }
    double getNg() const {
        return ngidentity;
    }
    double getCov() const {
        return cov;
    }

    /**
     * Outputs all fields as one string, separator "\t".
     * Documented field list: begin, end, ngidentity, cov.
     */
    virtual string asTabedString() const;

    /** Same idea as asTabedString() with a caller-chosen separator. */
    virtual string asDelimitedString(char sep[]=",") const;

    /** Defined out of line; presumably the field names of the row - TODO confirm. */
    static string fields();

    /**
     * Produces only the essential information, for debugging.
     * Tab delimited: begin, end, ngidentity.
     */
    virtual string essentialInfo() const;

    friend ostream& operator<<(ostream &ous, const alnrange& r);

    /** Stores the two class-wide overlap parameters. */
    static void setmargin(int olpcut, float olpfrac) {
        ovlpcut = olpcut;
        ovlpfraction = olpfrac;
    }

protected:
    int begin;          // match begin index, usually 1-based
    int end;            // match end index
    double score;       // keeps the average score
    double ngidentity;
    double cov;         // coverage relative to query

    // Class-wide parameters written by setmargin().
    static int ovlpcut;
    static float ovlpfraction;
    //double identity;
};
00104 
00111 class rangePair : public alnrange {
00112         public:
00113                 rangePair() : alnrange(), tprtid(), tcov(0), tmodelid(0),
00114                         tgenomicid(), tstrand(), tstart(-1), tend(-1) {}
00115                 rangePair(const rangePair& r);
00124                 rangePair (int b, int e, double s, double ng, 
00125                                 double c, const string &tpi, double tc, 
00126                                 int tmi, const string &tgi, char tst, 
00127                                 int ts, int te) 
00128                         : alnrange(b,e,s,ng,c), tprtid(tpi), tcov(tc),
00129                         tmodelid(tmi), tgenomicid(tgi), tstrand(tst), 
00130                         tstart(ts), tend(te) {}
00131                 double getTcov() const { return tcov; }
00132                 string getTprtid() const { return tprtid; }
00139                 string asTabedString() const;
00140                 string asDelimitedString(char sep[]=",") const;
00141                 string genomicInfo(char sep[]="\t") const { 
00142                         return tgenomicid + sep + tstrand; }
00143                 string getGenomic() const { return tgenomicid; }
00144                 char getStrand() const { return tstrand; }
00145                 static string fields();
00146 
00150                 string essentialInfo() const;
00151 
00152                 ~rangePair() {}
00153 
00154                 bool sameGene(const rangePair &r) const;
00161                 //list<string> fieldsAsList() const;
00162 
00163                 // before using this class, this parameter must be
00164                 // set to a proper value
00165                 static void setIntronLimit(int length) { distance_cut=length; }
00166 
00167         private:
00168                 /* add target spcific staff */
00169                 string tprtid; // target id
00170                 double tcov; // target coverage
00171                 int tmodelid;
00172                 string tgenomicid;
00173                 char tstrand; // + or -
00174                 int tstart;  // genomic start
00175                 int tend;   // genomic end
00176 
00177                 static float ngdiff_cut;
00178 
00182                 static int distance_cut;
00183 };
00184 
00185 class SplitResult {
00186         private:
00187                 string guide;
00188                 int guideLen;
00189                 list< pair<rangePair,rangePair> > joins;
00190 
00191         public:
00192                 SplitResult() {}
00193                 void setGuide(const string &g, int len) { guide=g; guideLen=len; }
00194                 void add(const rangePair *left, const rangePair *right) {
00195                         joins.push_back(make_pair(*left, *right)); }
00196                 bool empty() const { return joins.empty(); }
00197                 // for constructing SQL use sep=",", for table dump use \t
00198                 list<string> outputRow(char sep[]=",") const;
00199 };
00200 
00201 // begin and end become the outer-most value
00207 class avgrange {
00208         public:
00209                 // default constructor, build an empty object
00210                 avgrange() 
00211                         : n(0), sumscore(0), sumng(0), sumcov(0), sumbegin(0), 
00212                           sumend(0), begin(999999), end(0), covs(), sorted(false), members() { }
00213 
00214                 /* construct an avgrange object out of r
00215                  * avgrang is the same as range if there is only one member.
00216                  * It makes a copy from r.
00217                  * r must be created by the new operator.
00218                  * */
00219                 avgrange(const alnrange &r);
00220                 avgrange(const alnrange *rp);
00221                 ~avgrange();
00222                 bool overlap(const alnrange &r, const int margin=10);
00223 
00224                 // this is the most useful method, for accumulating
00225                 // overlapping ranges
00226                 //void merge(const range &r);
00227                 void merge(const alnrange *r);
00228                 /* do pointer manipulation, no object copying
00229                  */
00230                 void merge(const avgrange *ar);
00231 
00232                 double length() const { return (sumend-sumbegin+n)/static_cast<double>(n); }
00233                 double getBegin() const { return sumbegin/static_cast<double>(n); }
00234                 double getEnd() const { return sumend/static_cast<double>(n); }
00235                 double getScore() const { return sumscore/sumcov; }
00236                 double getNg() const { return sumng/sumcov; }
00237                 double getCov() const { return sumcov/n; }
00238                 int maxlength() const { return end-begin+1; }
00239                 int minbegin() const { return begin; }
00240                 int maxend() const { return end; }
00241                 int getCount() const { return members.size(); }
00242                 const vector<const alnrange* > & getMembers() const { return members; }
00243                 // for human to read
00244                 friend ostream& operator<<(ostream &ous, const avgrange &ar);
00245 
00246         /* output one line in table format, not line terminator, 
00247                  * The fields are defined in the colheaders() method
00248                  **/
00249                 ostream& writeTable(ostream &ous) const;
00253                 string asDelimitedString(const char sep[]=",") const;
00254 
00255                 /* with SQL comment format, out put range information
00256                  * */
00257                 ostream& sqlinfo(ostream &ous);
00258 
00259                 // assume covs is sorted
00260                 double getMedianCov() const;
00261 
00262                 /* at testing stage write output to stdout
00263                  * This function test all pair-wise split genes, 
00264                  * 3 or more split is rare, but is present can be 
00265                  * easily detected. for example
00266                  * A-B B-C will be combined to A-B-C
00267                  * When we say it is a split gene, you have to make
00268                  * sure that the standard is not a chimera!
00269                  * */
00270                 string checkSplit_debug(const avgrange &r) const;
00271                 list<string> checkSplit(const avgrange &r, char sep[]=",") const;
00272                 SplitResult testSplit(const avgrange &r, char sep[]=",") const;
00273 
00274                 // for chimera detection
00275         static string colheaders() {
00276             return "min_begin\tmax_end\tavg_begin\tavg_end\tn\tavg_score\tavg_ngidentity\tavg_coverage\tmedian_coverage";
00277         }
00278 
00279         private:
00280                 int begin, end; // outer most value
00281                 int sumbegin, sumend;
00282                 double sumscore, sumng, sumcov;
00283                 //double sumweight;
00284                 int n;
00285 
00286                 mutable vector<double> covs; 
00287                 /* used to compute median coverage
00288                  * Only after sorting, median can be computed
00289                  * This is a state tag for covs
00290                  */
00291                 mutable bool sorted;
00292 
00293                 //int seqlength;  // length fo the complete sequence
00294                 //string id;  // sequence identifier
00295                 // as oppose to matchlen
00296 
00300                 vector<const alnrange*> members;
00301 };
00302 
00319 
00329 #endif

Generated on Wed Aug 10 11:56:49 2011 for Softwares from Orpara by  doxygen 1.5.6