gbseq.h

Go to the documentation of this file.
00001 // file gbseq.h (this comment previously read gbmodel.h, matching the GBMODEL_H include guard)
00002 // A generalized GenBank entry model for both DNA and protein records.
00003 // Right now this is mainly used for processing protein entries.
00004 
00005 #ifndef GBMODEL_H
00006 #define GBMODEL_H
00007 
00008 #include <string>
00009 #include <vector>
00010 #include <iostream>
00011 #include "gbfn.h"
00012 //#include "ref.h"
00013 //#include "Reference.h"
00014 #include "Refreader.h"
00015 #include "strformat.h"
00016 #include "pfeature.h"
00017 
00018 using namespace std;
00019 
00020 class feature;  // this one not needed
00021 
00022 //enum Moltype { genomic, protein, mRNA, unknown };
00023 //enum fileType { dna, protein };
00024 
00025 class gbseq {
00026         public:
00027                 //gbseq() : seqLength(-1), _moltype(unknown)  { line.reserve(84); sequence.reserve(5000); }
00028                 gbseq() : seqLength(-1), features(), seg(0), segtotal(0), orgacronym()  
00029                 { line.reserve(84); sequence.reserve(5000); }
00030 
00031                 virtual ~gbseq() { for (int i=0; i<references.size(); i++) delete references[i]; }
00032 
00033                 /* Read one record.  Return true if not end of file
00034                  * It will remove the ending period of certain fields:
00035                  *      Definition, Author, taxonomy,   Keyword
00036                  */
00037                 virtual bool read(istream &ins);
00038                 // add snp file
00039                 virtual void writeAce(ostream &ous, ostream &sub, ostream &snp) = 0;
00040 
00044                 void writeSpecies(ostream& ous);
00045                  
00050                 int getLength() const; 
00056                 int getSeqlen() const { return sequence.length(); }
00057 
00058                 string getDate() const { return locus.substr(56, 11); }
00059                 // clear the base part, derived class have to clear
00060                 // additional members.
00061                 void clear();  // for repeated read this must be called
00062                 string getKey() const { return accession[0]; }
00063                 string getLocusName() const { return firstword(locus); }
00064                 string getSegmentLocus() const;
00065 
00069                 string getType() const { string tmp= locus.substr(35,6); trim(tmp); return tmp; }
00075                 string getMolType() const;  // from the source feature
00076                 // Moltype getMoltype() const { return _moltype; }
00077                 // void setMoltype(Moltype mtyp) { _moltype = mtyp; }
00083                 string getOrgAcronym() const;
00084                 string getOrganism() const { return organism; }
00085                 // feature exist at a particular location(b,e)
00086                 bool hasFeature(const string &f, int b, int e) const;
00087                 bool isSegment() const { return seg>0; }
00088 
00089                 static const int VAL_START = 12;  // value field start index
00090                 /*
00091                  * longText from GB are not exactly the same, they are very 
00092                  * similiar being generated by computer.
00093                 static map<string, int> lt2id;
00094                 static void setLongtextid(int i) { longtextid = i; }
00095                 static int nextLongtextid() { return longtextid++; }  // run the sequence
00096                 static int currLongtextid() { return longtextid; }
00097                 static void loadLongtext(string &inf);
00098                 */
00099 
00102                 static void init();
00104                 static void loadOrgmap(const string &file);
00106                 static void dumpOrgmap(const string &file);
00107                 //static getOrgKey(const string& org);
00108 
00109         protected:
00110                 //static int longtextid;
00111 
00112                 // raw data then produce secondary data
00113                 string locus;  // the whole locus line
00114                 mutable int seqLength;
00115                 //mutable Moltype _moltype;  // this one is redundant with the locus line
00116                 // future version should consolidate this one
00117 /*
00118 Positions  Contents
00119 ---------  --------
00120 01-05      'LOCUS'
00121 06-12      spaces
00122 13-28      Locus name
00123 29-29      space
00124 30-40      Length of sequence, right-justified
00125 41-41      space
00126 42-43      bp
00127 44-44      space
00128 45-47      spaces, ss- (single-stranded), ds- (double-stranded), or
00129            ms- (mixed-stranded)
00130 48-53      NA, DNA, RNA, tRNA (transfer RNA), rRNA (ribosomal RNA), 
00131            mRNA (messenger RNA), uRNA (small nuclear RNA), snRNA,
00132            snoRNA. Left justified.
00133 54-55      space
00134 56-63      'linear' followed by two spaces, or 'circular'
00135 64-64      space
00136 65-67      The division code (see Section 3.3)
00137 68-68      space
00138 69-79      Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991)
00139 */
00140                 string definition;
00141                 vector<string> accession; // one or more
00142                 string version[2];        // version GI:int_id
00143                 string dbsource;          // only in protein record
00144                 string keywords;
00145                 // organism information
00146                 int seg, segtotal;   // segment # and total, optional
00147                 string source;
00148            string organism;
00149                 string taxonomy;    // the taxon tree branch leading to root
00150 
00151       //vector<Reference> references;
00152       vector<Ref*> references;
00153       string comment;               // optional
00154                 vector<feature> features;
00155                 string sequence;  // could be DNA or Protein
00156 
00157                 /* buffer for performance */
00158                 string line;
00159                 mutable string orgacronym; // organism acronym == key
00160 
00169                 static map<string,string> orgmap;  // maps organism name to key
00170                 static set<string> orgacronyms;       // assistance for search
00171                 static set<string> taxons;   // all taxons, including species
00172 };
00173 
00174 class gbprtseq : public gbseq {
00175         public:
00178                 bool read(istream &ins);
00179                 //void writeAce(ostream &ous, ostream &sub, ostream &aut);
00180                 // the snp seems to be useless for protein object
00181                 void writeAce(ostream &ous, ostream &sub, ostream &snp);
00182 };
00183 
00184 class gbdnaseq : public gbseq {
00185         public:
00186                 gbdnaseq() : gbseq(), subcnt(1) {}
00187                 bool read(istream &ins);
00188                 //void writeAce(ostream &ous, ostream &sub, ostream &aut);
00189                 void writeAce(ostream &ous, ostream &sub, ostream &snp);
00190                 void clear(); // only clear the added part
00191                 string getStrand() const { return locus.substr(32, 3); } // ds- by default, ss-, or ms-
00192                 int nextsub() const { return subcnt++; }
00193                 int currsub() const { return subcnt; }
00194 
00195         private:
00196                 int A, C, G, T, O;
00197                 mutable int subcnt;  // for all subsequences
00198 };
00199 
00200 #endif

Generated on Wed Aug 10 11:56:58 2011 for Softwares from Orpara by  doxygen 1.5.6