alignment.h

Go to the documentation of this file.
00001 #ifndef ALIGNMENT_H
00002 #define ALIGNMENT_H
00003 
00004 // file: alignment.h
00005 
#include <iostream>
//#include <fstream.h>
#include <list>
#include <string>
#include <utility>
#include <vector>
00010 
00011 using namespace std;
00012 
// Quality grade assigned to an alignment.
enum quality {
    best,          // good identity and the alignment has no gaps
    good,          // good identity over all non-gapped columns
    partial_good,  // good identity over all columns whose gap-character fraction is < 0.4
    ok,            // not used yet
    bad            // failed all of the tests above
};
00019 
// Alignment method tag.
// NOTE(review): presumably names the program that produced the alignment
// file (ClustalW vs. DIALIGN) - confirm against the readers.
enum alnmethod {
    clustal,
    dialign
};
00021 
// Declaration only; the definition lives outside this header.
// Needs <list>, <string> and <utility> for the types in its signature.
//
// NOTE(review): judging by the name and signature this presumably returns
// the (start, end) index pairs of the runs of gapChar found in s - confirm
// against the definition.
//
//   s        sequence text to examine
//   gapChar  character treated as a gap; defaults to '-'
std::list<std::pair<int, int> > listgap(const std::string &s, char gapChar = '-');
00028 
// Position table for an alignment.
//
// Holds a raw int** (posmap) together with two dimensions.  Storage is set
// up by allocate() and torn down by deallocate(); both, like the copy
// operations, are defined outside this header.
class alignpos {
public:
    // Stores the two dimensions, then calls allocate().
    alignpos(int l, int w)
        : length(l),
          width(w)
    {
        allocate();
    }

    // User-declared copy operations (defined elsewhere).
    // NOTE(review): presumably deep copies of posmap - confirm in the .cpp.
    alignpos(const alignpos &alp);
    alignpos& operator=(const alignpos &alp);

    // Calls deallocate().
    ~alignpos()
    {
        deallocate();
    }

private:
    void allocate();
    void deallocate();

    int **posmap;  // NOTE(review): presumably sized from length and width - confirm in allocate()
    int length;
    int width;
};
00043 
00044 class alignment {
00045         public:
00046                 alignment() : idenCnt(0), nogapCol(0), nogapsimCnt(0), simCnt(0) {}
00047 
00048                 /* reading will refresh member values to default 
00049                  * will automatically call readDia or readClustal depending on
00050                  * the header of the sequence alignemnt file.*/
00051                 void read(istream &in);   
00052 
00054                 void readDia(istream &in);
00056                 void readClustal(istream &in);
00057 
00059                 void printPolymorphic(ostream &ous) const;
00060 
00061                 alignpos getPosition() const;
00062 
00063                 /* output the alignment model in gapinsert format*/
00064                 friend ostream& operator<<(ostream &ou, const alignment &aln);
00065                 string getalnmodel() const;
00066                 //string lengthToString() const;
00067                 //char getQualityAsChar() const;
00068 
00069                 void dumpaln();
00070                 bool highQuality(float cut=0.2);
00071 
00072                 /* length of the alignment */
00073                 int length() const { return seqarr[0].length(); }
00074                 int nogapLength() const { return nogapCol; }
00075 
00076                 /* identity of the multiple alignment */
00077                 float identity() const { return (float)idenCnt/length(); }
00078 
00079                 float similarity() const { return (idenCnt+simCnt)/(float)length(); }
00080                 int getIdentityCount() const { return idenCnt; }
00081                 int getSimilarityCount() const { return simCnt; }
00082                 bool goodSegment();
00083                 float nogapIdentity() const { return (float)idenCnt/nogapCol; }
00084                 float nogapSimilarity() { return (float)(idenCnt + nogapsimCnt)/nogapCol; }
00085                 int seqCount() const { return seqarr.size(); }
00086                 int size() const { return seqarr.size(); }  // number of sequences
00087                 ostream& dumpQuality(ostream &ou);
00088 
00089                 /* column gap char proportion < 0.49 will be considered */
00090                 float partialCount(float gapcut = 0.51); // returns partial identity
00091                 bool partialGood(float base);
00092 
00093                 quality getqual(float base = 0.08, float top = 1.2);  // return the quality values
00094 
00095                 static const int nogapcolcut;  // set to 8
00096                 float idenCut;
00097                 alnmethod method;
00098                 string clusterid;
00099 
00100 
00101         protected:
00102                 float calcut(float top, float n, int iden);
00103                 void count();
00104                 void countDia();
00105 
00106                 /* the core model structure, 
00107                  * as two array of strings: the name and the actual sequence
00108                  * */
00109                 vector<string> seqarr, namearr;
00112                 string consensus;
00113                 int idenCnt, simCnt;  // simCnt is the similarity count of 
00114                 // nongapped region only
00115                 int nogapsimCnt, nogapCol;  // number of nongapped columns
00116                 int partiden, partcol;   // only needed when failed QC
00117                 float avgThick;  // similar to size()
00118 };
00119 
00120 
00121 #endif

Generated on Wed Aug 10 11:56:47 2011 for Softwares from Orpara by  doxygen 1.5.6