doublestop.cpp File Reference

#include <iostream>
#include <mysql++.h>
#include "dbinfo.h"
#include <cstring>
#include <string>
#include <set>
#include <map>
#include <vector>
#include "bioseq.h"
#include <cmath>
#include <boost/math/distributions.hpp>
#include <RNAModel.h>

Classes

class  Progparam
class  Conpos
class  Conseq

Functions

bool loadGenomic (map< string, string > &gstore, Connection &conn, const string &host, const string &database)
bool checkstop (const string &host, const string &dbname, const MysqlDBInfo &mydb, ostream &ous, ostream &txou, const Progparam &par)
bool isStop (const string &codon)
int stops (const string &seq)
bool hasTable (Connection &conn, const string &tab)
void readconf (const string &file, map< string, string > &dbtab, set< string > &ignore, map< string, int > &maxintron)
double distanceBaseFreq (double bf1[4], double bf2[4])
void checkBadStopIndex (const string &host, const string &dbname, const MysqlDBInfo &mydb, ostream &ous, ostream &osl, map< string, int > &maxintron, const Progparam &par)
void analyzeStop (const string &host, const vector< string > &dbs, const MysqlDBInfo &mydb, const Progparam &par, bool appendres=false)
void fixStopIndex (const string &host, vector< string > &dbs, const MysqlDBInfo &mydb, map< string, int > &maxintron, const Progparam &par)
void refineAllmodels (Connection &conn, const string &fixedfile, const string &badfile, const string &refinedallmod)
void getCandidateDatabase (vector< string > &dbnames, Connection &conn)
void discardIgnored (vector< string > &dbnames, const set< string > &ignored)
void getDbsFromFile (vector< string > &dbnames, const string &dblf)
void usage ()
int main (int argc, char *argv[])
ostream & operator<< (ostream &ous, const Conpos &pos)
ostream & operator<< (ostream &ous, const Conseq &cons)
void countbase (int bs[4], const string &seq)
double expectedStop (double bf[4], int n)
void stoplocation (const string &seq, vector< int > &sloc)
int count2freq (int bc[4], double bf[4])
template<class T>
ostream & writeBase (T bf[4], ostream &ous, const char *sep="\t")
bool loadGenomicFromScaffoldSeq (map< string, string > &gstore, Connection &conn)
ostream & writeRow (Row row, ostream &ous)
ostream & writeResult (ostream &ous, const Row &row, const JGIModel &mod)

Function Documentation

void analyzeStop ( const string &  host,
const vector< string > &  dbs,
const MysqlDBInfo &  mydb,
const Progparam &  par,
bool  appendres = false 
)

References checkstop().

Referenced by main().

void checkBadStopIndex ( const string &  host,
const string &  dbname,
const MysqlDBInfo &  mydb,
ostream &  ous,
ostream &  osl,
map< string, int > &  maxintron,
const Progparam &  par 
)

bool checkstop ( const string &  host,
const string &  dbname,
const MysqlDBInfo &  mydb,
ostream &  ous,
ostream &  txou,
const Progparam &  par 
)

Analyze the stop situation for one database.

Parameters:
ous contains the summary result for the stop region
Returns:
true if processing was successful; false if it failed for some reason.

References mRNAModel::CDSLength(), mRNAModel::CDSSequence(), count2freq(), countbase(), Range::direction(), distanceBaseFreq(), expectedStop(), mRNAModel::genomicCDSEnd(), RNAModel::getOid(), MysqlDBInfo::getPassword(), MysqlDBInfo::getUser(), hasTable(), L, loadGenomic(), Progparam::nreptab, reverseComplement(), stoplocation(), string(), and writeBase().

Referenced by analyzeStop().

int count2freq ( int  bc[4],
double  bf[4] 
)

Referenced by checkstop().

void countbase ( int  bs[4],
const string &  seq 
)

References toupper().

Referenced by checkstop().

void discardIgnored ( vector< string > &  dbnames,
const set< string > &  ignored 
)

References good.

Referenced by main().

double distanceBaseFreq ( double  bf1[4],
double  bf2[4] 
)

Euclidean distance between two base coverage profiles

Referenced by checkstop().

double expectedStop ( double  bf[4],
int  n 
)

Since stop codons are not independent in different frames, we need to do a small correction even for random sequences. If frame 1 has a stop, then the other two frames will not be a stop codon for sure. So the absolute frequency of a stop codon given any random codon is the sum of the frequencies of the three stop codons. In a long sequence this needs to be corrected by the following equation: Pcorrected = p - (4np^2)/(3n-2)

Parameters:
bf base frequency of A, C, G, and T
n number of codons in the sequence, i.e. seqlen/3.

Referenced by checkstop().

void fixStopIndex ( const string &  host,
vector< string > &  dbs,
const MysqlDBInfo &  mydb,
map< string, int > &  maxintron,
const Progparam &  par 
)

This is the top-level method; it can iterate through multiple databases. It uses checkBadStopIndex to work on each individual database.

References checkBadStopIndex().

Referenced by main().

void getCandidateDatabase ( vector< string > &  dbnames,
Connection &  conn 
)

References hasTable().

Referenced by main().

void getDbsFromFile ( vector< string > &  dbnames,
const string &  dblf 
)

References ifstream().

Referenced by main().

bool hasTable ( Connection &  conn,
const string &  tab 
)

bool isStop ( const string &  codon  ) 

Referenced by stoplocation(), and stops().

bool loadGenomic ( map< string, string > &  gstore,
Connection &  conn,
const string &  host,
const string &  database 
)

bool loadGenomicFromScaffoldSeq ( map< string, string > &  gstore,
Connection &  conn 
)

References hasTable(), and string().

Referenced by loadGenomic().

int main ( int  argc,
char *  argv[] 
)

ostream& operator<< ( ostream &  ous,
const Conseq &  cons 
)

References Conseq::cseq.

ostream& operator<< ( ostream &  ous,
const Conpos &  pos 
)

output frequency like the following: {A0.347526,C0.107838,G0.328182,T0.216454}

References Conpos::bases, Conpos::bc, H, and Conpos::height().

void readconf ( const string &  file,
map< string, string > &  dbtab,
set< string > &  ignore,
map< string, int > &  maxintron 
)

References dissect(), and ifstream().

Referenced by main().

void refineAllmodels ( Connection &  conn,
const string &  fixedfile,
const string &  badfile,
const string &  refinedallmod 
)

allmodels {remove bad, update stop-fixed } => allfixed

Parameters:
refinedallmod the result table containing allmodels: 1. without bad models. 2. stop fixed. This table will not store the actual protein sequence, only the proteinid. But the pepseq in the protein table may be wrong.
fixedfile models whose coordinates were fixed and are good. This is a subset of the allmodels. This file is loaded into a table. This schema has the combined schema of model|transcript|protein, so it has a lot of columns.
badfile contains all bad models; they will be removed from the final model set (refinedallmod)
Need to actually fix coordinates if they are bad, together with transcript, and protein table.

load bad models into table

Referenced by checkBadStopIndex().

void stoplocation ( const string &  seq,
vector< int > &  sloc 
)

References isStop().

Referenced by checkstop().

int stops ( const string &  seq  ) 

References isStop().

void usage (  ) 

given gmap summary format, this program converts it into combest archive format (*.car)

It can be used as a pipe, or given specific file names.

this is a helper program to count distinct estids in the ESTId column of the combest result. So it is the actual number of ESTs mapped. This could be lower for deeply covered genome because of the coverage depth-dependent filtering.

given gmap summary format, this program converts it into combest archive format (*.car)

It can be used as a pipe, or given specific file names.

this is a helper program to count distinct estids in the ESTId column of the combest result. So it is the actual number of ESTs mapped. This could be lower for deeply covered genome because of the coverage depth-dependent filtering.

template<class T>
ostream& writeBase ( T  bf[4],
ostream &  ous,
const char *  sep = "\t" 
) [inline]

References sep.

Referenced by checkstop().

ostream& writeResult ( ostream &  ous,
const Row &  row,
const JGIModel &  mod 
)

ostream& writeRow ( Row  row,
ostream &  ous 
)


Generated on Wed Aug 10 11:57:06 2011 for Softwares from Orpara by  doxygen 1.5.6