pfeature.h

Go to the documentation of this file.
00001 //file: pfeature.h
00002 
00003 /* A new feature class that is going to replace the old feature class.
00004  * Initially used for GenBank protein features.
00005  * All the operators need to be defined so that it can be used as
00006  * an element of containers.
00007  * */
00008 
00009 #ifndef PFEATURE_H
00010 #define PFEATURE_H
00011 
00012 #include <string>
00013 #include <vector>
00014 #include <iostream>
00015 #include <map>
00016 #include <set>
00017 #include "strformat.h"
00018 #include "gbfn.h"
00019 #include "featureError.h"
00020 #include "gbseq.h"
00021 
00022 #define NO_CLONE
00023 //#define NO_GENE   // no gene object made for genomic DNA
00024 // also remove Gene, STS from sequence objects
00025 
00026 using namespace std;
00027 
00028 class gbdnaseq;
00029 class gbprtseq;
00030 
00031 /* an integer representation, better than string
00032  * but could not represent segmented sequences such as 
00033  *      gene            order(AF019954.1:<279..586,AF019955.1:1..1103,
00034  *                      AF019956.1:1..640,AF019957.1:1..1111,AF019958.1:1..707,
00035  *                      AF019959.1:1..918,AF019960.1:1..974,AF019961.1:1..749,
00036  *                      1..>423)
00037  *
00038  * */
00039 class locseg {
00040         public:
00041         friend class feature;
00042         locseg() : begin(0), end(0), rangeType('-'), fuzzy_begin(0), fuzzy_end(0), complement(false) { }
00043         locseg(int bb, int ee) : begin(bb), end(ee), rangeType('-'), fuzzy_begin(0), fuzzy_end(0), complement(false) { }
00044         locseg(int bb, int ee, char rt) : begin(bb), end(ee), rangeType(rt), fuzzy_begin(0), fuzzy_end(0), complement(false) { }
00045         locseg(const locseg &ls);
00046 
00051         locseg(const string &lstr) throw(featLocErr);
00052 
00053         virtual locseg& operator=(const locseg &ls);
00054 
00055         /* for 99.99% of the locations the returned value would
00056          * be an empty string
00057          * returns operation as to how to mark fuzzy ends
00058          * */
00059         string getOperation() const;
00060 
00061         bool isAbnormal() const { 
00062                 return rangeType != '-' || fuzzy_begin != 0 || fuzzy_end != 0 || complement; }
00063 
00064         virtual ostream& writeRange(ostream &ous) const { 
00065                 ous << begin << "\t" << end; return ous; }
00066 
00071         virtual ostream& outRange(ostream &ous) const;
00072 
00073         bool isFuzzy() const { 
00074                 return fuzzy_begin != 0 || fuzzy_end != 0 || rangeType != '-'; }
00075 
00076         int getFuzzyBegin() const { return complement ? fuzzy_end : fuzzy_begin; }
00077         int getFuzzyEnd() const { return complement ? fuzzy_begin : fuzzy_end; }
00078         int getBegin() const { return complement ? end : begin; }
00079         int getEnd() const { return complement ? begin : end; }
00080         char getRangeType() const { return rangeType; }
00081 
00082         protected:
00083                 void parse(string &str) throw(featLocErr);
00084                 int begin, end;
00085 
00093                 char rangeType;
00094 
00095                 /* end operation information
00096                  *default set to 0, nothing or normal
00097                  * -1 for < beyond first; len one_of
00098                  * positive meand begin is between begin and fuzzy_begin
00099                  * Needs to look at methods using this field
00100                  * meaning redefined
00101                 */
00102                 int fuzzy_begin;  
00103                 // should be all one_of in the range
00104 
00109                 int fuzzy_end;    
00110 
00111                 bool complement;   // default is false
00112                 //string name;       // optional name for the segment, 
00113                                    // usually for order()
00114 };
00115 class namedlocseg : public locseg {
00116         public:
00117         friend class feature;
00118         namedlocseg() : locseg(), name() {}
00119         namedlocseg(const string &nn, int bb, int ee) : locseg(bb,ee), name(nn) {}
00120         namedlocseg(const namedlocseg& loc) : locseg(loc) { name=loc.name; }
00121         namedlocseg(const locseg& loc) : locseg(loc) { }
00122         namedlocseg(const string &lstr) throw(featLocErr);
00123         namedlocseg& operator=(const namedlocseg& loc);
00124         const string& getName() const { return name; }  // for reading only
00125 
00130         ostream& writeRange(ostream &ous) const {
00131                 ous << name << "\t"; return locseg::writeRange(ous); }
00132 
00133         protected:
00134                 string name;
00135 };
00136 
00137 /* feature needs to know some information about the master
00138  * object sequence
00139  * We can include the main sequence as a member or passed 
00140  * the main sequence as an arguments in all the functions.
00141  * */
00142 class feature {
00143         public:
00144                 enum Segrelation {none=0, join, order};  // none is default
00145 
00146                 feature() : complement(false), segop(none), onone(true) { }
00147                 ~feature() { clear(); }
00148 
00149                 /* return true if it has read a feature successfully
00150                  * return false if could read any more
00151                  * Repeated qualifiers will be concatenated with ' | '
00152                  * */
00153                 bool next(string &ln, istream &ins); // construct the next object
00154                 feature(const feature &feat);
00155                 feature& operator=(const feature &feat);
00156                 void writeAceProtein(ostream &ous, gbprtseq &prt);  // protein features
00157                 void writeAceDNA(ostream &ous, ostream &sub, ostream &snp, gbdnaseq& seq) throw(featErr);  // for nucleic acids features
00158                 string getName() const { return name; }
00159                 //ostream& dumpRange(ostream &ous) { ous << loc.getBeginStr() << " " << loc.getEndStr(); return ous; }
00160 
00161                 /* string dump for debuging purposes */
00162                 friend ostream& operator<<(ostream &ous, const feature &feat);
00163 
00164                 /* concatenate all dbxref into one string */
00165                 string getDbxrefString() const;
00166 
00167                 /* concatenate all qualifiers into one single string */
00168                 //string getQualifierString() const;
00169                 string getAllQualifiers() const;
00170                 //ostream& writeRange(ostream &ous) const { loc.writeRange(ous); return ous; }
00171 
00175                 void clear();
00176 
00180                 int getBegin() const { return locs[0]->begin; }
00181                 int getEnd() const { return locs[locs.size()-1]->end; }
00182 
00183                 int getNumSeg() const { return locs.size(); }
00184                 //bool isJoin() const { return locs.size() > 1; }
00185                 //should look at the operation string: oder, join 
00186                 //friend ostream& operator<<(ostream &ous, const location &loc);
00187                 const string& getOperation() const { return locop; }
00188                 Segrelation getSegop() const { return segop; }  // replaces getOperation
00189 
00190                 /* this function uses integer begin and end, 
00191                  * has no sense of orientation.  It is safe to use this function for
00192                  * protein features which is always from small to large.
00193                  * */
00194                 ostream& writeRange(ostream &ous) const { 
00195                         ous << getBegin() << " " << getEnd(); return ous; }
00196                 /* this one know the direction of the gene, always from 5' to 3' */
00197                 ostream& outRange(ostream &ous) const;
00198                 //void clear();
00199 
00204                 int outSeg(ostream &ous) const;
00206                 void outSegMultiple(ostream &ous, const gbdnaseq &seq) const;
00207 
00211                 string getLocationString() const { return locstr; }
00212 
00217                 string getTaxid() const;
00218                 bool hasDbxref() const { return !dbxref.empty(); }
00219 
00220                 bool isComplement() const { return complement; }
00221 
00223                 int getBeginEndinfo() const;
00224                 int getEndEndinfo() const;
00225 
00226                 ostream& endInfo(ostream &ous) const;
00227                 // complement translated into large begin small end
00228                 bool nostart() const;
00229                 bool noend() const;
00230                 bool hasQualifier(const string &key) const { 
00231                         return qualifiers.find(key) != qualifiers.end(); }
00232                 // returns "" if qualifiers not found
00233                 string getQualifierValue(const string &qkey) const;
00234 
00235                 // this has not been fully used yet.
00236                 static void readValidQualifier(const string &file);
00237 
00243                 static void loadSubseq(const string &file);
00244                 static void dumpSubseq(const string &file);
00245                 // the default is 0
00246                 static void addSubseq(const string &seq) { subseq.insert(make_pair(seq,0)); }
00247                 static bool existSubseq(const string &seq) { return subseq.find(seq) != subseq.end(); }
00248 
00254                 static void loadGeneName(const string &file);
00255                 static void dumpGeneName(const string &file);
00256 
00261                 static string getGeneSymbol(const string &gene, string &allele);
00262                 // new version.  Given the gene, returns a <symbol,allele>pair
00263                 static pair<string,string> getGeneSymbol(const string &gene);
00264 
00265                 static const int QUAL = 5;
00266                 static const int QUAL_VAL = 21;
00267                 static const string QUAL_SPACE;
00268                 static bool PRTOUT;   // specify policy
00269                 // for genome annotation PRTOUT=false, for regular GB = true
00270 
00271         private:
00279                 static string addGene(const string &gene, string &allele);
00280                 //static bool isGeneSymbol(const string &gene);
00281                 // breaks up gene by char "*" gene*allele
00282                 static pair<string,string> separateAllele(const string &gene);
00283                 // make a pair of symbol,allele from gene
00284                 static pair<string,string> makeGeneSymbol(const string &gene);
00285                 // insertion only
00286                 static pair<string,string> insertGeneSymbol(const string& gene, const string &symbol);
00287                 // will check the presence of allele, do the actuall insertion
00288                 // return <symbol,allele> pair
00289                 static pair<string,string> insertGeneAsSymbol(const string& gene);
00290 
00294                 static string geneOfSymbol(const string& sym);
00295                 static string cleanGene(const string& gene, const string wd);
00296                 static string nojunkGene(const string &gene);
00297                 // use a counter to make sure returned symbol is unique
00298                 static string nextGeneSymbol();
00299                 static void goodGeneSymbol(const string &gene, string &symbol);
00300                 static bool isGoodGeneSymbol(const string &gene, const string &symbol);
00301                 static string acronymWithAllDigits(const string& str, const int n=1);
00302                 // extract hidden gene symbol, separated by sep
00303                 // if found one the return it, if not returns an empty string
00304                 static string xHiddenGeneSymbol(const string &str, const string &sep);
00305                 // helper function fro xHiddenGeneSymbol
00306                 static string shortIsGeneSymbol(const string &str, string::size_type idx, const int seplen);
00307                 // only test up to 8, because this is the numbers
00308                 // used for naming genes
00309                 static bool isRomanNumber(const string &str);
00310                 // only test the most commonly used by biologiest
00311                 static bool isGreek(const string &str);
00312 
00313                 // a helper function for next()
00314                 // after this function call ln will contain the line
00315                 // next to this qualifier.
00316                 void nextQualifier(istream &ins, string &ln); 
00317 
00318                 // extract the gene,allele pair from this feature
00319                 pair<string, string> geneallele() const;
00320 
00325                 void outgeneline(ostream &ous, const gbdnaseq &seq, const string &tag) const;
00326                 // return the iterator pointing to the longest 
00327                 // qualifier of five selected qualifiers: 
00328                 // gene, stantard_name, function, product, phenotype
00329                 map<string,string>::iterator composeTitle();
00330 
00331                 // a helper function for nextQualifier
00332                 void insertQualifier(const string& key, string& value);
00333                 void writeSource(ostream &ous, ostream &sub, gbdnaseq &seq) throw(featErr);
00334                 void writemRNA(ostream &ous, ostream &sub, const gbdnaseq &seq);
00335                 void writeSegmRNA(ostream &sub, const gbdnaseq &seq);
00336                 void writeCDS(ostream &ous, ostream &sub, const gbdnaseq &seq);
00337                 void writeSegCDS(ostream &sub, const gbdnaseq &seq);
00344                 void subCDS(ostream &ous, ostream &sub, const gbdnaseq &seq, const string &cdsKey, const string &prtKey, const string &title) const;
00345                 void writeGene(ostream &ous, ostream &sub, gbdnaseq &seq) const;
00346 
00348                 void writeProtein(ostream &sub, const gbdnaseq &seq, const string &key, const string &title) const;
00349                 //tRNA, snRNA, rRNA, etc
00350                 void writeRNA(ostream &sout, ostream &subout, gbdnaseq &seq);
00351                 void RNAdump(ostream &ous, const gbdnaseq &seq) const;
00352                 void writeFeature(ostream &ous, const gbdnaseq &seq) const throw(featErr);
00353                 bool writeFeatureOfWholeSeq(ostream &ous, const gbdnaseq &seq) const;
00354                 void writeImmuno(ostream &ous, ostream &sub, const gbdnaseq &seq) const;
00355                 void writeSNP(ostream &snp, gbdnaseq &seq) const;
00356                 // for Protein Objects
00357                 // site feature of protein objects
00358                 void writePrtSite(ostream &ous, gbprtseq &prt) const;
00359                 void writePrtRegion(ostream &ous, gbprtseq &prt) const;
00360 
00361                 string name;  // current feature name
00362 
00363                 string locstr;  // the location string in GenBank format
00364 
00368                 vector<locseg*> locs;  // location information
00369                 bool onone;  // location on one sequence as to one multiple seq
00370                 bool complement;
00371                 Segrelation segop;    // how to construct the sequence from segments
00372                 string locop;  // default is no operation, just a,b,
00373                 // join, order (only two possibilities, now)
00374                 void parseLoc() throw(featLocErr);
00375 
00376                 map<string, string> qualifiers;
00377                 vector<string> dbxref;
00378 
00379                 static set<string> validqual;  // valid qualifiers
00380 
00381            /* store all subsequence key
00382                  * During parsing, this class knows all subsequences, so that when
00383                  * duplicated subsequence names occur it can detect them.
00384                  * Known subsequence names can be loaded before the parsing.
00385                  * To control multiple sequences with the same name, changed
00386                  * it to a map that keeps track of how many has been named after
00387                  * this identifier.
00388                  *
00389                  * For example, Genome G1 has a model mRNA pointing to mRNA m1
00390                  * Genomic G2 also has a model pointing to mRNA m1.  In most cases
00391                  * there is only onel-model-to-one-mRNA.  But there exceptions
00392                  * make it necessary to name model other than the m1_mod
00393                  * */
00394                 //static set<string> subseq;    
00395                 static map<string, int> subseq;
00396 
00403                 static map<string,string> genen2s;  // name to symbol
00404                 static set<string> genesbl;  // all second from geneName
00405                 static int genecnt;  // for generating a gene symbol
00406 
00407 };
00408 
00409 #endif

Generated on Wed Aug 10 11:56:58 2011 for Softwares from Orpara by  doxygen 1.5.6