group.h

Go to the documentation of this file.
00001 #ifndef GROUP_H
00002 #define GROUP_H
00003 
00005 //#define HAVE_NAMESPACE_STD
00006 
00007 #include <iostream>
00008 #include <cstring>
00009 #include <utility>
00010 //#include <libpq++.h>  // for database input
00011 #include <pqxx>  // for database input
00012 // change to the libpqxx interface
00013 #include <string>
00014 #include <vector>
00015 #include <fstream>
00016 #include "gconst.h"
00017 
00018 using namespace pqxx;
00019 
00020 //#define DIV 11    // fr to 11 other dbs comparison; this is stored
00021 //inside divisions vector
00022 #define FEAT 8  // we use 8 features to characterize each division
00023 
00024 class inputend {};  // for signaling
00025 
00045 class divstat
00046 {
00047         public:
00048                 divstat();  // default constructor
00049 
00050                 /* Construct object from from a stream. Input pointer read to read 
00051                  * div, qcov, tcov, score, identity, ngidentity, similarity, 
00052                  * matchlen, and nogaplen.
00053                  * All caller should use the interface functions to use
00054                  * this object.
00055                  * */
00056                 divstat(istream &in);  // use a get function
00057                 void read(istream &in);
00058 
00062                 //divstat(PgDatabase &pgdb, int row);
00063                 divstat(result &qres, int row);
00064 
00065                 /* copy constructor */
00066                 divstat(const divstat &d);
00067 
00068                 string getid() { return string(id); }
00069 
00070                 /* testing the div name */
00071                 bool operator==(const char s[]) { return !strcmp(s, id); }
00072                 friend ostream& operator<<(ostream &o, const divstat &ds);
00073                 divstat& operator=(const divstat &d);
00074                 bool operator>(const divstat &ds) const;
00075                 bool operator<(const divstat &ds) const;
00076 
00077                 /* this object is smaller than d after d is multiplied by k.
00078                  * if d is 0 or null, then returns false. */
00079                 bool smallerThan(const divstat &d, double k) const;
00080                 bool smallerThan(const divstat *const d, const double k) const;
00081                 void scale(double f);
00082                 /* use the following functions for flexibility */
00083                 double getQcov() const { return feat[0]; }
00084                 double getTcov() const { return feat[1]; }
00085                 double getScore() const { return feat[2]; }
00086                 double getIdentity() const { return feat[3]; }
00087                 double getNgidentity() const { return feat[4]; }
00088                 double getSimilarity() const { return feat[5]; }
00089                 double getMatchlen() const { return feat[6]; }
00090                 double getNogaplen() const { return feat[7]; }
00091                 ostream& dump(ostream &ou);
00092                 /* the following functions return the index of features in the 
00093                  * features array
00094                  * */
00095                 static int idxqcov() { return 0; }
00096                 static int idxtcov() { return 1; }
00097                 static int idxscore() { return 2; }
00098                 static int idxidentity() { return 3; }
00099                 static int idxngidentity() { return 4; }
00100                 static int idxsimilarity() { return 5; }
00101                 static int idxmatchlen() { return 6; }
00102                 static int idxnogaplen() { return 7; }
00103 
00104         //private:
00105                 char id[5];         // the target div name such as hs, bony, etc.
00106                 double feat[FEAT];  // feature values, fixed according to the above order
00107                 static const char* features[FEAT];  // feature string
00108 };
00109 
00110 /* A class to represent one query match against all target divisions
00111  * This program needs the guid file to instruct what is the query div
00112  * and what are target divisions. 
00113  *
00114  * The first action of this class is to read and initialize data from the
00115  * guid file*/
00116 class group : public gconst
00117 {
00118         public:
00119                 friend class gstat;
00120 
00121                 /* default constructor, allocate space, initialize to impossible
00122                  * values*/
00123                 group();
00124                 group(const group &g);
00125                 group& operator=(const group &g);
00126                 ~group() { for(int i=0; i<divisions.size(); i++) delete divm[i]; }
00127 
00128                 //this way of noticing the end of input stopped working 
00129                 group(int &q, istream &in) throw(inputend);
00130                 // replacement for special constructor, return true if more
00131                 // objects in the input stream, q is the query hint
00132                 bool next(int &q, istream &in);
00133 
00135                 
00136                 /* for human to read */
00137                 friend ostream& operator<<(ostream &o, const group &g);
00138 
00139                 /* output the raw data as tab delimited, for database entry */
00140                 void dumpAsTable(ostream &ou);
00141 
00146                 void dumpKey(ostream &ou);
00147 
00148                 // dump everything to ou for computer to read
00149                 ostream& dump(ostream &ou);
00150                 // output the stat matrix
00151                 void dumpStat(ostream &ou);
00152                 void dumpAllSMratio(ostream &ou);
00153 
00155                 // the first is index, second is the value of max tcov
00156                 pair<int, double> maxqcov() const { return max(divstat::idxqcov()); }
00157                 pair<int, double> maxtcov() const { return max(divstat::idxtcov()); }
00158                 pair<int, double> mintcov() const;
00159                 pair<int, double> maxidentity() const { return max(divstat::idxidentity()); }
00160                 pair<int, double> maxngidentity() const { return max(divstat::idxngidentity()); }
00161                 pair<int, double> maxsimilarity() const { return max(divstat::idxsimilarity()); }
00162                 pair<int, double> maxmatchlen() const { return max(divstat::idxmatchlen()); }
00163                 pair<int, double> maxnogaplen() const { return max(divstat::idxnogaplen()); }
00164                 pair<int, double> maxscore() const { return max(divstat::idxscore()); }
00165                 pair<int, double> max(int f) const;
00166 
00167                 /* obtain the minimum target sequence coverage, set the division index
00168                  * in the divm array.  */
00169                 double getMinTcov(int &div) const;
00170 
00171                 int getDivCnt() const { return divCnt; }
00172                 // obtain the current anchor index
00173                 int getAnchor() const { return anchor; }
00174                 // obtain the next anchor, returns -1 if no more anchor available
00175                 // for example, the first anchor human is bad, we want to get the 
00176                 // next one. If we have only two divisions, and we want the next again
00177                 // we will be running out of anchors!
00178                 // s is the starting poing for anchor
00179                 int nextAnchor();
00180 
00181                 /* SM means standard deviation/mean ratio */
00182                 double getIdenSMratio() const { return stat[divstat::idxidentity()][1]/stat[divstat::idxidentity()][0]; }
00183                 double getMlenSMratio() const { return stat[divstat::idxmatchlen()][1]/stat[divstat::idxmatchlen()][0]; }
00184                 double getQcovSMratio() const { return stat[divstat::idxqcov()][1]/stat[divstat::idxqcov()][0]; }
00185                 double getTcovSMratio() const { return stat[divstat::idxtcov()][1]/stat[divstat::idxtcov()][0]; }
00186                 double getScoreSMratio() const { return stat[divstat::idxscore()][1]/stat[divstat::idxscore()][0]; }
00187 
00188                 /* If identities are higher than 0.85 and SMratio < 0.1 or SMratio < 0.2 for
00189                  * bony and vrt 
00190                  * as defined in *.istr file under
00191                  * CONSERVED 0.85 0.1 0.2 bony vrt
00192                  * */
00193                 bool isConserved() const;
00194 
00196 
00197                 /* Remove a target division identified by index position i
00198                  * from the group output information to ou for debuging. 
00199                  * Recalculate stat. If anchor removed, reposition anchor.
00200                  * */
00201                 void rmbranch(int i, ostream &ou); // debug version
00202                 void rmbranch(int i);
00203                 
00204         protected:
00205                 bool highVarDivPresent() const;
00206                 /* this function compares divisions i with division b to e
00207                  * assuming i is mostly related.   This one is not very useful
00208                  * because it is based on human as query and the targets are
00209                  * in order mostly related to human */
00210                 bool checkAndRemove(int i, int b, int e, ostream &ou);
00211 
00212                 /* initialize or recalculate stat[5][2]; call this one after
00213                  * the object has been changed by removing a branch of division.
00214                  *
00215                  * Stattistics of each feature over all existing matching target
00216                  * divisions
00217                  * */
00218                 void dostat();
00219 
00220                 int query;  // the cds_id of a particular query
00221                 /* the divm array has a fixed order as defined in the guid file
00222                  * It hold the max values from each matching division
00223                  * */
00224                 vector<divstat*> divm;
00225                 int divCnt;  // the number of target divisions (including anchor) for this query
00226                 int anchor;       //index for division picked for normalization 
00227                 /* (avg,std) of features over divisions in each group */
00228                 double stat[FEAT][2]; 
00229 };
00230 
00235 class gdiagnosis : public group {
00236         public:
00237                 /* Needs to allocate space for copying, zcut set to 1 */
00238                 gdiagnosis();
00239                 gdiagnosis(double cut);
00240                 gdiagnosis(const gdiagnosis &gd);
00241                 // construct objects from in
00242                 //gdiagnosis(int &q, istream &in) throw(inputend);
00243                 
00244                 // for examining out put in the context of statistics
00245                 ostream&  dumpWithZval(ostream &ou) const;
00246                 //void normalize();  // use internally stored guid
00247                 bool next(int &q, istream &in);
00248 
00249                 void setzcut(double c) { zcut=c; }
00250                 double getzcut() const { return zcut; }
00251 
00253 
00254                 gdiagnosis& operator=(const gdiagnosis& g);
00255                 friend ostream& operator<<(ostream &ou, const gdiagnosis &g);
00256                 //ostream& printRemove(ostream &ou);
00257                 
00259                 // the most vigorous test, all features z-values are smaller
00260                 // than zcut
00261 
00262                 // there is no low and high != sum (anchor should not be low)
00263                 bool qcovpass() { return hgrm[0][divstat::idxqcov()] == 0 && hgrm[2][divstat::idxqcov()] != hgrm[3][divstat::idxqcov()]; }
00264                 bool scorepass() { return hgrm[0][divstat::idxscore()] == 0 && hgrm[2][divstat::idxscore()] != hgrm[3][divstat::idxscore()]; }
00265                 // any of the identity passed, and the anchor should also pass
00266                 bool qualitypass();
00267 
00268                 bool passed(const double zcut) const;
00269                 /* divCnt == 1 is good; decision should be made outside this class
00270                  * test divCnt == 1 or conserved or passedAvg
00271                  * */
00272                 bool goodQuality(double zcut) const;
00273 
00274                 bool passedAvg(const double zcut) const;
00275                 bool passedQcov(const double zcut) const; // only test qcov
00276                 /* check all three identity (identity, nogap, similarity), if anyone
00277                  * passed then passed */
00278                 bool passedIdentity(const double zcut) const;
00279                 /* check for nogaplength, or matchlength, if anyone passed then passed */
00280                 bool passedLength(const double zcut) const;
00281 
00282                 bool passedall(const double zcut) const {
00283                         return (passed(zcut) || goodQuality(zcut) || passedAvg(zcut) ||
00284                                 passedQcov(zcut) || passedIdentity(zcut) || passedLength(zcut));
00285                 }
00286 
00287                 /* some of the low scores is due to partial sequence, that we name
00288                  * coverageDefect */
00289                 bool coverageDefect(double zcut) const;
00290                 bool lastTest() const;
00291 
00293                 void calzval();   // helper function, initializes zval and norm
00294 
00295                 // remove divisions that are low, recalculate all derived matrices
00296                 // return a list of removed divisions (index value in divm vector)
00297                 vector<int> rmlow(double lowercut=-3, double avgzc=3); // check the quality of each division, load remove array
00298 
00299                 /* return the average of all zvalues. It is signed. 
00300                  **/
00301                 double getAvgZval() const { return zval[divisions.size()][FEAT]/(FEAT*(divCnt-1));}
00302                 /* same as above just easier to type */
00303                 double avgzval() const { return zval[divisions.size()][FEAT]/(FEAT*(divCnt-1));}
00304 
00305                 bool targetPartial(int i) const;  // division i has partial sequence
00306                 bool anchorPartial() const;
00307                 
00308                 //bool improve(ostream &ou);
00309                 bool trimAndTest(ostream &ou, double zcut);
00310 
00311                 void trimByIden(); // this method cannot be applied to 
00312                 // other situations than hs->fr hs->non-fish divs
00313 
00314                 /* d if for debug.  It contains the fixed object */
00315                 bool fixScoreAndTest(double szcut, gdiagnosis &d) const;
00316 
00318                 static void readGuid(istream &in);   // needs to be removed
00319                 //static double guid[8][5][2];
00320                 /* the guid is the result from trainer class */
00321 
00322         private:
00323                 static vector< vector< vector<double> > > guid;
00324                 /* standard normal z value as compared to the guid statistics
00325                  * dimension: (numdiv+1, numfeat+1) or (divisions.size()+1, FEAT+1) 
00326                  * Calculated after construction from input stream.
00327                  * the bottom row is the sum(fabs(z))/divCnt-1
00328                  * the last column is the sum of fabs(z) of all features for each div
00329                  * Some rows may not be populated (missing divisions).
00330                  * */
00331                 vector< vector<double> > zval; //division.size()+1 rows, FEAT+1 columns  
00332 
00333                 /* normalized features of populated divisions, with divisions.size()
00334                  * row, and FEAT columns */
00335                 vector< vector<double> > norm;
00336 
00337                 int hgrm[4][FEAT];  // histogram of low, passed, high, and sum counts, 
00338                 double zcut;        // blow this value is considered good
00339                 //vector<bool> remove;    // division is removed or not
00340                 //vector<int> poordiv;  // list of div needs to be removed
00341                 void getspace(); // helper function to allocate space for zval
00342                             //  and norm, all elements has value 0
00343 };
00344 
00349 class gstat : public gconst {
00350         public:
00351                 gstat();
00352 
00353                 /* for training the picker only
00354                  * training is done with some averaging of previous data points
00355                  * this may cause big errors.  Repeated training may be implemented
00356                  * in the future to increase the accuracy of the training. 
00357                  * */
00358                 void accumulate(const group &g);
00359                 /* for repeated training from the same data 
00360                  * REMEMBER to clear the matrices and cnt vector before calling this
00361                  * function after either calling accumulate or accumulateWithGuid
00362                  * */
00363                 void accumulateWithGuid(const group &g);
00364 
00365                 /* copy the guid matrix from the sum matrix that already contain
00366                  * the mean,std pair
00367                  * will overwrite the old values in guid  matrix
00368                  * return the sum of differences of old guid and new guid 
00369                  * matrix
00370                  * */
00371                 double calguid();
00372                 /* zero the sum matrix and the cnt vector for next round of training
00373                  * */
00374                 void zerosum();
00375                 /* It seem not necessary to call zeroguid */
00376                 void zeroguid(); // call this for a new round of training
00377                 /* dump the simple statics to a file for use as guid 
00378                  * if gs have not produced the guid matrix this function will
00379                  * do the calculation and thus modifying the gs object.*/
00380                 // not needed zerosum does the same void clear();
00381                 friend ostream& operator<<(ostream &o, gstat &gs);
00382                 bool guid_produced;  // the guid matrix is produced or not
00383 
00384                 //static void setNumdiv(int nd) { numdiv=nd; }
00385                 static int getNumdiv() { return divisions.size(); }
00386                 //static void setPivotDiv(string pdv) { pivotDiv=pdv; }
00387 
00388                 static const int pivotvalue = 10;  // smaller the better
00389 
00390         private: 
00391                 /* add info from normalizing g by Di */
00392                 //bool suck(const group &g, int Di);
00393                 vector<int> cnt;         // for each bin of divisions
00394                 //double sum[8][5][2];   // for calculating mean and variance
00395                 //double*** sum;   // 3-d array for calculating mean and variance
00396                 /* A lazy version of 3D array for 
00397                  * division, feature, (mean, standard deviation)
00398                  * this is used for computing the mean and std
00399                  * */
00400                 vector< vector< vector<double> > > sum;
00401                 // my coding time
00402                 //double sumsqr[8][5];
00403                 vector< vector< vector<double> > > guid;
00404                 int groupCount;         // total groups used
00405 
00407                 //static int numdiv;      // total number of target divisions in input data
00408 //              static string pivotDiv; // the div whose value is set to 100
00409                 // this value is redundant with the group::pivotdiv 
00410                 // need to remove this member in the future
00411 };
00412 
00413 /* pure training, will not discard low uising the crude method 
00414  * If the training dataset is not large, you can add a complete
00415  * set of training data to the beginning of the training data.  This
00416  * way, every division will be represented.  For example, there are
00417  * very little sequence information available for the cartilaginous
00418  * fish.  */
00419 class trainer {
00420         public:
00421                 /* numdiv is the number of target divisions
00422                  * pivotdiv is the pivot division used for normalization
00423                  * they have the exace meaning as defined in gstat class.  
00424                  * They are used to set the static member of the gstat class
00425                  * */
00426                 //trainer(int numdiv, const string &pivotdiv);
00427                 trainer();
00428                 /* inf contains the training data, 
00429                  * where * is the query division 
00430                  * the training result will be written to the STDOUT and file
00431                  * picker.guid
00432                  * */
00433                 //bool train(const string &inf, const string &guidfile);
00434                 bool train(const string &inf);
00435         private:
00436                 gstat model, csvdmodel;
00437 };
00438 
00439 #endif

Generated on Wed Aug 10 11:56:50 2011 for Softwares from Orpara by  doxygen 1.5.6