dynaln.h

Go to the documentation of this file.
00001 #ifndef DYNALN_H
00002 #define DYNALN_H
00003 
00004 #include "bioseq.h"
00005 #include "matrix.h"
00006 #include <list>
00007 #include <vector>
00008 
00009 /* experimental, might not be needed 
00010 struct Alignedcol {
00011    Alignedcol() { }
00012    Alignedcol(const char t, const char m, const char b) 
00013       : top(t), middle(m), buttom(b) { }
00014    char top;
00015    char middle;
00016    char buttom;
00017 };
00018 
00019 struct Alnstring {
00020    string top;
00021    string middle;
00022    string buttom;
00023 };
00024 
00025 struct Alnsummary {
00026    int score;
00027    int idencnt;
00028    int simcnt;  // excluding identical residues
00029    int numgaps1;
00030    int numgaps2;
00031    int gaplen1;
00032    int gaplen2;
00033 };
00034 */
00035 enum ALNTYPE {GLOBAL, LOCAL};
00036 //enum POINTER {diag, top, left}; // for traceback
00037 
00046 class Dynaln {
00047    public:
00048       Dynaln() : ST(), gapi(-8), gape(-2), seq1(0),
00049                  seq2(0),alnidx(), S(0), M(0),
00050                  IX(0), IY(0), Smax(0), idencnt(0), simcnt(0), numgaps1(0),
00051                  numgaps2(0), gaplen1(0), gaplen2(0),
00052                  seq1begin(-1), seq1end(-1), seq2begin(-1), seq2end(-1),
00053                  topaln(), middle(), buttomaln() { }
00071       Dynaln(const bioseq &s1, const bioseq &s2) 
00072          : ST(), gapi(-8), gape(-2), seq1(&s1), seq2(&s2), 
00073             alnidx(), S(0), M(0), IX(0), IY(0),
00074             Smax(0), idencnt(0), simcnt(0), numgaps1(0),
00075             numgaps2(0), gaplen1(0), gaplen2(0),
00076             seq1begin(-1), seq1end(-1), seq2begin(-1), seq2end(-1),
00077             topaln(), middle(), buttomaln()
00078             { if ( (s1.getSequenceType() & NUCLEIC_ACID & s2.getSequenceType()) > 0)
00079                ST.setMatrix("NUC.4.4");
00080                   }
00081       Dynaln(const bioseq &s1, const bioseq &s2, const string &ma) 
00082          : ST(ma), seq1(&s1), seq2(&s2), 
00083             alnidx(), S(0), M(0), IX(0), IY(0),
00084             Smax(0), idencnt(0), simcnt(0), numgaps1(0),
00085             numgaps2(0), gaplen1(0), gaplen2(0),
00086             seq1begin(-1), seq1end(-1), seq2begin(-1), seq2end(-1),
00087             topaln(), middle(), buttomaln()
00088             { 
00089                gapi=2*ST.getMinScore(); 
00090                gape=ST.getMinScore()/3; 
00091                if (gape > -1) gape = -1; 
00092                if (gape < -3) gape = -3; }
00100       int global();
00104       int runglobal() {
00105          int s=global(); buildResult(); return s; }
00106       int local();
00107       int runlocal() {
00108          int s=local(); buildResult(); return s; }
00109       //void globalTraceBack();
00110       ~Dynaln();
00111 
00112       /* This methods build addional information from the
00113        * initial phase of the algorithm.
00114        *    1. call traceback.
00115        *    2. clear existing result (call clearResult())
00116        *    3. then use the alnidx list to update other information:
00117        *       idencnt: counts for identical resudues
00118        *       simcnt: similar residues
00119        *       numgaps1, numgaps2: number of gaps in seq1 and seq2
00120        *       gaplen1, gaplen2: total gaps in both sequences.
00121        *       topaln: aligned string of seq1
00122        *       buttomaln: aligned string of seq2
00123        *       middle: alignment middle line (| for identical : for similar)
00124        * This method create additional redundant information for
00125        * easy usage and user interface.
00126        */
00127       void buildResult();
00131       void buildAlnInfo();
00132 
00134       void printAlign(ostream &ous, const int w=70) const;
00135       const string& getTopAln() const { return topaln; }
00136       const string& getButtomAln() const { return buttomaln; }
00146       string toDelimitedString(const string &dl=",") const;
00147 
00148       void showseq() const {
00149          cout << seq1->toString() << "\n"
00150             << seq2->toString() << endl; }
00151 
00153       int getScore() { return Smax; }
00154       int getAlnlen() const { return alnidx.size(); }
00155       float getIdentity() const { return (float)idencnt/getAlnlen(); }
00156       float getSimilarity() const { return (float)(idencnt+simcnt)/getAlnlen(); }
00157       /* count the number of gaps in the two sequences
00158        */
00159       pair<int,int> numgaps() const { 
00160          return make_pair(numgaps1, numgaps2); }
00161       static string headers(const string &dl);
00162       //const Matrix& getScoringMatrix() const { return ST; }
00163       const Matrix* getScoringMatrix() const { return &ST; }
00164       int topBeginIndex() const { return seq1begin; }
00165       int buttomBeginIndex() const { return seq2begin; }
00166       int topEndIndex() const { return seq1end; }
00167       int buttomEndIndex() const { return seq2end; }
00168 
00170       void setMatrix(const string &mat) { ST.setMatrix(mat); }
00174       void setMatrix(const Matrix &mat) { ST=mat; }
00175 
00179       void setSeq1(const bioseq &sq) { seq1 = &sq;  }
00180       void setSeq2(const bioseq &sq) { seq2 = &sq; }
00181       void setSeq(const bioseq &sq1, const bioseq &sq2) {
00182          seq1=&sq1; seq2=&sq2; }
00187       void setseq(const bioseq &sq1, const bioseq &sq2) {
00188          seq1=&sq1; seq2=&sq2; }
00189       //void setSeq(const string &sq1, const string &sq2) {
00190        //  *seq1=sq1; *seq2=sq2; }
00191 
00192       void setGapInsert(int ins) { gapi=ins; }
00193       void setGapExtend(int ext) { gape=ext; }
00194       void setGapParameter(const int go, const int ge) {
00195          gapi=go; gape=ge; }
00196       void setAllParameter(const Matrix& mat, const int go, const int ge)
00197       { ST=mat; gapi=go; gape=ge; }
00198 
00199 
00200 
00201       static const char gapchar='-';
00202       static const char idenchar='|';
00203       static const char simchar=':';
00204       static const int simcut=0;
00205       /* use & | operator to figure out which direction
00206        * store the traceback pointers in an integer
00207        * multiple pointers can be stored as sum of the 
00208        * individual pointers
00209        * DIAG+TOP=3 etc...
00210        */
00211       static const int PTRNULL=0;
00212       static const int PTRDIAG=1;
00213       static const int PTRTOP=2; // Ix
00214       static const int PTRLEFT=4; // Iy
00215 
00216       /* debug functions */
00217       void debug_showmatrix(ostream &ous) const;
00218 
00219    protected:
00220       void countnumgaps();
00225       void clearResult();
00226       //void allocmemLS();
00227       
00228       /* Nr is the number of row (seq1 length + 1)
00229        * Nc is the number of columns (seq2 length + 1)
00230        */
00231       //void allocmem(const int Nr, const int Nc);
00232       void allocmem();
00233       //void traceback(int &i, int &j, int *&Sc, bool local=false);
00234       
00235       /*
00236        * the traceback function produce the alignment result
00237        * and put it into the alnidx list structure.
00238        * It also set the begin,end of each sequence that.
00239        * For global alignment, it will traceback the begining
00240        * gaps. The ending gap will also be removed from the
00241        * end of the sequence.
00242        *
00243        * i, and j are the matrix location index 0-based
00244        */
00245       //void traceback(int &i, int &j, bool local=false);
00246       //void traceback(int &i, int &j);
00247       /* this version works with the pointers
00248        * and only affect the alnidx list.
00249        */
00250       void tracepointer(int &i, int &j);
00251       void findAlnBoundary();
00252       //void initialize();
00253       int gapi, gape;  // should be negative numbers
00254       //Matrix ST; // scoring table, use protein BLOSUM50 as default.
00255       //We should use pointers
00259       Matrix ST; // scoring table, use protein BLOSUM50 as default.
00260       // for DNA use NUC.4.4
00261       const bioseq *seq1;
00262       const bioseq *seq2;
00263       // the code were assigned when allocating memory
00264       const int* C1; // needs to be updated properly when sequence got updated
00265       const int* C2; // code for the sequence
00266       int* S;    // simulate 2-D array, 
00267                  //size (seq1.length()+1)*(seq2.length()+1)
00268       int Ssize; // for the next round, memorize the last state
00269       int *M, *IX, *IY; // array for computing the scores
00270       //int* Itop;  // for temporary usage
00271       int numcol;  // for the next round
00272       //int* P; // for trace back pointer
00273       //int Ileft;  // for temporary usage
00274       //stack<pair<int, int> > alnidx;
00275       
00276       /* alnidx contains final alignment result
00277        * the numbers are the index in the two sequences.
00278        * -1 for a gap
00279        * we can print this result in many different ways
00280        */
00281       list<pair<int, int> > alnidx;
00282       //int score;  // assigned in the trace back step.
00283       int Smax;  // assigned before the trace back step.
00284       // in local it is use during the build up phase
00285       // in global it is the last cell m,n
00286       /* the two numbers are use to record the
00287        * traceback starting point in terms of index
00288        * in the input sequences.
00289        * For global, it is seq1len-1, seq2len-1
00290        */
00291       int Smaxi, Smaxj;  // for local dynamic square memory
00292 
00293       // alignment summary and detailed information
00294       int idencnt;
00295       int simcnt;
00296       int numgaps1;
00297       int numgaps2;
00298       int gaplen1;
00299       int gaplen2;
00300 
00301       /* start and end of alignment, if local then
00302        * it is the aligned part, if global, it 
00303        * excludes the begining and ending gaps
00304        * 0-based index, inclusive
00305       */
00306       int seq1begin, seq1end, seq2begin, seq2end;
00307       //int alnlen;
00308       //vector<Alignedcol> alnresult;
00309       string topaln, middle, buttomaln;  // aligned sequences
00310       ALNTYPE alntype;
00311 };
00312 
00313 class LSDynaln : public Dynaln {
00314    public:
00315       LSDynaln() : Dynaln() { }
00316       LSDynaln(const bioseq &s1, const bioseq &s2) 
00317          : Dynaln(s1,s2) { }
00318       LSDynaln(const bioseq &s1, const bioseq &s2, const string &mat) 
00319          : Dynaln(s1,s2,mat) { }
00320       ~LSDynaln();
00321       /* the linear space version
00322        * b1: begin index of sequence1 (0-based index)
00323        * e1: one-passed the end index of sequence1 (0-based index)
00324        * return: the best score
00325        */
00326       int global();
00327       int local();
00328       int runlocal() {
00329          int s=local(); buildResult(); return s; }
00330       //void global();
00331       /* globalLS in reverse direction
00332        */
00333       //int globalR(int b1, int e1, int b2, int e2);
00334       
00335       /* b1, e1, b2, and e2 are 0-based index of the 
00336        * first and the second sequences respectively.
00337        * if (b1<e1 and b2<e2)
00338        * then it does forward computation and store
00339        * the result in S.
00340        * when b1>e1 and b2>e2 then it does reverse
00341        * computation and store the result in SR
00342        *
00343        * return a pair of pointers to the top and buttom
00344        *    arrays respectively.
00345        *    This version is incorrect in detail.
00346        */
00347       //pair< int*, int*> computeScore(int b1, int e1, int b2, int e2);
00348       /* return the start index of the top and buttom rows
00349        * in the four matrices
00350        * This can be used to retrieve the top/buttom rows of the
00351        * three scoring matrices: M, IX, IY
00352        */
00353       pair<int, int> computeScoreFW(int b1, int e1, int b2, int e2);
00354       /* works on the reverse matrices
00355        * b1>e1 | b2>e2
00356        */
00357       pair<int, int> computeScoreBW(int b1, int e1, int b2, int e2);
00358 
00359       /* arguments:
00360          * 0-based index of sequences
00361          *  b1, e1:  begin and end of sequence 1
00362          *  b2, e2: begin and end of sequence 2
00363          *  Here b1<e1 and b2<e2
00364          *
00365          *  The result will be consolidated in the alnidx list.
00366        */
00367       int path(int b1, int e1, int b2, int e2);
00368 
00369       /* Completly different from the full dynamic program traceback.
00370        * Only the last two rows of the result matrix are in memory!
00371        *
00372        * given the two arrays M.first(TOP row), M.second (BUTTOM row)
00373        * and the 0-based index r, c in terms of the matrix
00374        * len1 is the length of the range of sequence 1
00375        * len2 is that of the sequence 2
00376        * if len1 and len2 are given, then it will use the 
00377        * reverse of the sequence for tracing.
00378        * Arguments: 
00379        *    r row index
00380        *    c column index, r and c specify the starting point
00381        *    for traceback and it is the cell with the best score.
00382        */
00383       //list<pair<int,int> > traceback(pair<int, int> mrow, int b1, int &e1, int b2, int &e2, bool rev=false);
00384       /* arguments: 
00385        *    mrow  index of the last two rows of the matrix (top,buttom)
00386        *    This function will decrement e1 and e2.  If they have become
00387        *    smaller than b1 and b2 respectively, then the job has been
00388        *    finished and there is no need to do another recursion call.
00389        *    The caller (path()) function uses this information.
00390        * return: path in forward direction
00391        */
00392       list<pair<int,int> > tracebackFW(pair<int,int> mrow, int b1, int &e1, int b2, int &e2);
00393       /*
00394        * arguments:
00395        *    mrow is the matrix row, a pair of integers for the top and buttom
00396        *       array starting point.  This information was used to locate the
00397        *       starting point for the MR, IXR, and IYR arrays.  The first
00398        *       integer starts the top array, and the second integer starts the
00399        *       buttom array.  MR + mrow.first --> top array.
00400        * Return the path in the forward direction, there is no need to reverse it
00401        */
00402       list<pair<int,int> > tracebackBW(pair<int,int> mrow, int b1, int &e1, int b2, int &e2);
00403       //void showaln();  // testing function
00404 
00405       /* totally overwrite the base class version.
00406        * There are some commonality, need to be factored out
00407        * in the future.
00408        */
00409       void buildResult();
00410 
00411       void debug_showmatrixForward(pair<int,int> &rows, const int col);
00412       void debug_showmatrixBackward(pair<int,int> &rows, const int col);
00413 
00414    private:
00415       /* allocate memory use two rows for top problem
00416        * two rows for buttom problem
00417        * S and SR have only one row to store the 
00418        * score of the last row.
00419        */
00420       void allocmem();
00421       int *MR, *IXR, *IYR;
00422       //int *SR;  // for LOCAL this is used to memorize the traceback pointer
00423       //int *ItopR; // not used in local
00424 };
00425 
00426 #endif

Generated on Wed Aug 10 11:56:55 2011 for Softwares from Orpara by  doxygen 1.5.6