nrep.cpp File Reference

#include "GenModel.h"
#include "TranscriptExon.h"
#include <mysql++.h>
#include <dbinfo.h>
#include <fstream>
#include <map>
#include <set>
#include "hatrees.h"
#include "ModelFactory.h"
#include "strformat.h"
#include <cstdio>

Classes

class  Progparam

Functions

vector< GenModel * > removePregnantModels (vector< GenModel * > &mds, ostream &ous, vector< GenModel * > &preg)
vector< GenModel * > removeChimeraModels (vector< GenModel * > &mds, ostream &ous, vector< GenModel * > &chim, const Progparam &par)
vector< GenModel * > removeLowCDSModels (vector< GenModel * > &mds, ostream &ous, vector< GenModel * > &lwcds)
vector< GenModel * > pickGoodModel (vector< GenModel * > &mds, ostream &ouslog, vector< GenModel * > &bad, const Progparam &par)
vector< GenModel * > removeIdentical (const vector< GenModel * > &mds)
void showRow (const Row &row, ostream &ous)
vector< GenModel * > buildGeneCluster (const vector< GenModel * > &mds, map< int, int > &memrep, ostream &logos)
void checkUTRChimera (const GenModel *i, const GenModel *j, RangeChain &uc)
void memoryRelease (vector< GenModel * > &m)
void writeModel (vector< GenModel * > &srcmod, ostream &model, ostream &exon)
void storeModel (const vector< GenModel * > &mod, Query &mq, const string &modtab, const string &extab)
void createModelTable (Query &q, const string &tab)
void createGenericTable (Query &q, const string &tab)
void createExonTable (Query &q, const string &tab)
void constructJGIModel (const string &jmod, const string &mod, const string &ex, Connection &conn, const string &label, const string &allmod, int version, map< string, string > &trackfeat)
void storeModelCluster (const string &modcltab, Connection &conn, const map< int, int > &modcl, const string &modtab)
void colorRepModels (Connection &conn, const string &mod, const string &gen)
void constructJGIGene (Connection &conn, const string &tab, const string &jgimodel, const string &modcltab, int version)
void checkAndBuildInput (const Progparam &par, Connection &conn)
void writeResultToFile (Progparam &par, ModelFactory &modfac, ostream &ouslog)
void writeResultToDatabase (Progparam &par, ModelFactory &modfac, Connection &conn2, ostream &OU)
void createModelClusterTable (Connection &conn, const string &tabname)
void createModelAndExonTables (Connection &conn, Progparam &par)
void constructDerivedTables (Connection &conn, Progparam &par)
void loadPrimaryTables (Connection &conn, const Progparam &par)
void file2table (const string &file, const string &table, const Progparam &par)
void addSingletonGene (Connection &conn, const Progparam &par)
void removeDoneFile ()
void usage (const Progparam &par)
string getlinktab (Connection &conn)
int main (int argc, char *argv[])

Variables

const int chimera_intron_len = 500
const int minutr_len = 120
const int min_CDS_len = 1200
const int commonex_len = 170
int exonid = 1

Function Documentation

void addSingletonGene ( Connection &  conn,
const Progparam &  par 
)

vector< GenModel * > buildGeneCluster ( const vector< GenModel * > &  mds,
map< int, int > &  memrep,
ostream &  logos 
)

member => rep with the longest CDS; if the CDS are identical, then use the longest exon length.

Only genes with more than one model are output. The rest are single exon genes. The user needs to figure them out with set operations.

Parameters:
mds input vector of model pointers
memrep output. map of member -> rep. Rep has the longest CDS.
Returns:
vector of pointers to good gene models. Models that are missing are those that are contained in good models.
1. Discard contained models. 2. Build the overlap relationship. 3. Cluster into a member => rep relationship. 4. Pick the longest model with the longest CDS as rep, so the rep should have most of the information.
Parameters:
memrep is the output or result of this function. It contains the member => rep relationship.

References Noschain::exonLength(), hatrees< T >::getCluster(), good, Noschain::numberOfRanges(), and hatrees< T >::readFromMap().

Referenced by writeResultToDatabase(), and writeResultToFile().

void checkAndBuildInput ( const Progparam &  par,
Connection &  conn 
)

Ideally, the input table should be made beforehand. If it is not available, this function will generate a table from all existing tracks.

References Progparam::database, Progparam::getAllmodelTable(), and Progparam::host.

Referenced by main().

void checkUTRChimera ( const GenModel *  i,
const GenModel *  j,
RangeChain &  uc 
)

void colorRepModels ( Connection &  conn,
const string &  mod,
const string &  gen 
)

Referenced by constructDerivedTables().

void constructDerivedTables ( Connection &  conn,
Progparam &  par 
)

void constructJGIGene ( Connection &  conn,
const string &  tab,
const string &  jgimodel,
const string &  modcltab,
int  version 
)

References createGenericTable().

Referenced by constructDerivedTables().

void constructJGIModel ( const string &  jmod,
const string &  mod,
const string &  ex,
Connection &  conn,
const string &  label,
const string &  allmod,
int  version,
map< string, string > &  trackfeat 
)

Right now label is good or bad

References getlinktab(), and itos().

Referenced by constructDerivedTables().

void createExonTable ( Query &  q,
const string &  tab 
)

void createGenericTable ( Query &  q,
const string &  tab 
)

Referenced by constructJGIGene().

void createModelAndExonTables ( Connection &  conn,
Progparam &  par 
)

void createModelClusterTable ( Connection &  conn,
const string &  tabname 
)

modelid => geneid. Creates a table of two columns; the first one is a primary key.

Referenced by loadPrimaryTables(), and storeModelCluster().

void createModelTable ( Query &  q,
const string &  tab 
)

create an empty table tab with the following schema:

id serial primary key, genomic varchar(48), name varchar(200), exons text, mb integer, me integer (model boundary), cdsb integer, cdse integer (CDS boundary), cdsphase integer (0, 1, or 2), hasstart boolean, hasstop boolean

Referenced by createModelAndExonTables().

void file2table ( const string &  file,
const string &  table,
const Progparam &  par 
)

string getlinktab ( Connection &  conn  ) 

This is also defined in gathercaiwegene. In the future this function should be put into a MySQL helper header; for now it is copied and pasted.

void loadPrimaryTables ( Connection &  conn,
const Progparam &  par 
)

int main ( int  argc,
char *  argv[] 
)

void memoryRelease ( vector< GenModel * > &  m  ) 

vector< GenModel * > pickGoodModel ( vector< GenModel * > &  mds,
ostream &  ouslog,
vector< GenModel * > &  bad,
const Progparam &  par 
)

Returns a vector of pointers to good models, but some models may be contained in other models.

Parameters:
mds vector of input model pointers
bad vector for containing the bad models. This function simply appends to the end of this container without clearing it first. The log output (presumably the ouslog stream) gives detailed information on why a model is bad; this is mainly for debugging purposes.
Given a set of overlapping models, this method tries to separate the set into good + bad models.

References commonex_len, Noschain::exonLength(), good, length, Progparam::maxintronlen, min_CDS_len, Noschain::numberOfRanges(), removeChimeraModels(), removeLowCDSModels(), and removePregnantModels().

Referenced by writeResultToDatabase(), and writeResultToFile().

vector< GenModel * > removeChimeraModels ( vector< GenModel * > &  mds,
ostream &  ous,
vector< GenModel * > &  chim,
const Progparam &  par 
)

void removeDoneFile (  ) 

Referenced by main().

vector< GenModel * > removeIdentical ( const vector< GenModel * > &  mds  ) 

References good.

Referenced by writeResultToDatabase(), and writeResultToFile().

vector< GenModel * > removeLowCDSModels ( vector< GenModel * > &  mds,
ostream &  ous,
vector< GenModel * > &  lwcds 
)

also remove models with too many non-coding exons

References good.

Referenced by pickGoodModel().

vector< GenModel * > removePregnantModels ( vector< GenModel * > &  mds,
ostream &  ous,
vector< GenModel * > &  preg 
)

Always returns a valid model pointer, unless it runs out of input from res.

Returns:
a vector of pointers of good models
Parameters:
mds input models as a vector of pointers. Details for logged information (presumably written to ous), for debug-stage usage. The output vector of pointers to pregnant models (presumably preg); this function will simply append to it.
Pregnant models are models whose introns contain one or more other models. Technically, we use the largest intron of the model to do this test. We also use 2 or more other models as the cutoff. We allow models pregnant with one model because this could be a valid situation in biology.

References Range::contain(), good, and Range::length().

Referenced by pickGoodModel().

void showRow ( const Row &  row,
ostream &  ous 
)

void storeModel ( const vector< GenModel * > &  mod,
Query &  mq,
const string &  modtab,
const string &  extab 
)

This is equivalent to writeModel(), directly store information into database table. This method is 100x slower than writeModel().

Parameters:
mod input gene model as vector of pointers
mq query object used to insert result
modtab model table name
extab exon table name
Columns of the table (id,genomic,name, exons, mb, me, cdsb, cdse, cdsphase, hasstart, hasstop, pep)

Referenced by writeResultToDatabase().

void storeModelCluster ( const string &  modcltab,
Connection &  conn,
const map< int, int > &  modcl,
const string &  modtab 
)

Parameters:
modcl is the input of member -> rep table.
modcltab is the output table name.
modtab was used to generate singleton clusters by set difference operation: modtab - modcltab (clusters with more than one member)
The model cluster implements the gene concept.

Parameters:
modcltab model cluster table with genes that have more than one member (no singletons). This is the database table name.
modcl model cluster information, member => rep; rep is the id of one of the members.
modtab model table name used to extract singleton genes by set operation. input: modcl, output modcltab

References createModelClusterTable().

Referenced by writeResultToDatabase().

void usage ( const Progparam &  par  ) 

void writeModel ( vector< GenModel * > &  srcmod,
ostream &  model,
ostream &  exon 
)

this is much faster than the database interactive method storeModel()

Parameters:
srcmod. Input models as a vector of pointers to GenModel
model. The output stream for model with columns: (id,genomic,name, exons, mb, me, cdsb, cdse, cdsphase, hasstart, hasstop, pep) Should have the same column as the storeModel() method.
exon. Exon output stream.

References exonid.

Referenced by writeResultToFile().

void writeResultToDatabase ( Progparam &  par,
ModelFactory &  modfac,
Connection &  conn2,
ostream &  OU 
)

void writeResultToFile ( Progparam &  par,
ModelFactory &  modfac,
ostream &  ouslog 
)

This is the main method through the file mechanism. It is much faster than the direct insert into database. The buffering of file system is a lot faster than the database system.

Parameters:
ouslog is the log file stream.
modfac is the model stream as input. This method first collects overlapping models (50nt or more) into a buffer. Then it calls pickGoodModel() to get good models.
In all the operations, file names are controlled by par.

References buildGeneCluster(), Range::combine(), GenModel::genomicId(), Progparam::getBadExonFile(), Progparam::getBadModelFile(), Progparam::getGoodExonFile(), Progparam::getGoodModelFile(), Progparam::getModelClusterFile(), Progparam::goodmod, memoryRelease(), GenModel::modelName(), ModelFactory::next(), Range::overlay(), pickGoodModel(), removeIdentical(), Progparam::showevery, and writeModel().

Referenced by main().


Variable Documentation

const int chimera_intron_len = 500

This is the cutoff value for checking whether an intron is likely to bridge two models into one chimeric model.

Referenced by checkUTRChimera().

const int commonex_len = 170

int exonid = 1

Referenced by writeModel().

const int min_CDS_len = 1200

const int minutr_len = 120

Minimal UTR length: the UTR length that is more likely to be true. This is dependent on the organism. Some organisms have longer UTRs than others. Small genomes tend to have short UTRs.

Referenced by checkUTRChimera().


Generated on Wed Aug 10 11:57:02 2011 for Softwares from Orpara by  doxygen 1.5.6