chimeradetect_mysql.cpp File Reference

#include "alnrange.h"
#include "ChainAvgrange.h"
#include <stdlib.h>
#include <iostream>
#include <string>
#include <vector>
#include <fstream>
#include <sstream>
#include <mysql++.h>
#include "strformat.h"
#include "bioseq.h"
#include "dynaln.h"
#include <cmath>
#include <iomanip>

Classes

struct  Cutoff
class  Progparam
class  seqelem

Functions

void createNREPSW (Connection &conn, const string &model, const string &prtsw, const string &subsetaln)
int getinslen (string str)
bool tableExists (Connection &conn, const string &tab)
bool mysqlTableExists (Connection &conn, const string &tab)
void checkchimera (mysqlpp::Connection *dbh, int olpmg, const string &matchtab, const string &db, const string &tabname)
void checksplit (mysqlpp::Connection *dbh, int olpmg, const string &matchtab, const string &db, const string &tabname, const string &modeltab)
void cleanBySpecies (Connection *dbh, const string &matab)
bool create_gapcount (Connection *dbh, const string &gaptab, const string &source)
void create_modelaln (Connection *dbh, const string &matab, const string &taxids, const Cutoff &cut, const string &sourceTable)
bool create_alnsplit (Connection *dbh, const string &matab, const string &alntoptab, const Cutoff &cut)
bool create_alnchimera (Connection *dbh, const string &matab, const string &alnchimera, const Cutoff &cut)
void split2tab (const list< string > &rows, const string &tabname, mysqlpp::Connection *dbh)
void storeInChimeraTable (const list< string > &input, const string &tabname, mysqlpp::Connection *dbh)
void toTable (const list< string > &rows, const string &tabname, mysqlpp::Connection *dbh)
void usage (const Cutoff &cut)
void removeSimpleMatch (Connection &conn, const string &alnintab)
pair< double, double > computeZvalue (Protein &p1, Protein &p2, int n)
string readUnwantedTaxid (const string &taxidsFile)
void bufferQueryProteins (Connection &conn, const string &alntab, map< int, string > &id2prt)
void bufferQueryProteinsVector (Connection &conn, const string &alntab, vector< pair< int, string > > &store, int offset, int rowcount)
void bufferTargetProteins (Connection &conn, const string &alntab, map< string, string > &id2prt, int maxcount=100000)
void bufferTargetProteins2 (Connection &conn, const string &alntab, map< string, seqelem > &id2prt, int maxcount)
void bufferAheadTargetProteins (Connection &conn, const string &alntab, map< string, string > &tpepstore, int offset, int maxcount)
bool getTargetProtein (Connection &conn, const string &pid, string &seq)
bool fetchTargetProtein (map< string, seqelem > &cache, const string &id, string &seqstr, Connection &conn)
void printBanner ()
bool prepare (const Progparam &par, Connection &conn)
int main (int argc, char *argv[])

Function Documentation

void bufferAheadTargetProteins ( Connection &  conn,
const string &  alntab,
map< string, string > &  tpepstore,
int  offset,
int  maxcount 
)

Input: conn is the database connection; alntab is the input alignment table; tpepstore is the input map to hold results; offset is the offset in alntab at which to start buffering; maxcount is the maximum number of rows from alntab to include. This function takes the protein sequences from table proteinHitDesc and stores them into tpepstore.

Because the alignment table may have multiple rows for each target protein, the actual number of target proteins is usually fewer than the number of rows in alntab.

References string().

Referenced by removeSimpleMatch().

void bufferQueryProteins ( Connection &  conn,
const string &  alntab,
map< int, string > &  id2prt 
)

References bad, and string().

Referenced by removeSimpleMatch().

void bufferQueryProteinsVector ( Connection &  conn,
const string &  alntab,
vector< pair< int, string > > &  store,
int  offset,
int  rowcount 
)

References string().

void bufferTargetProteins ( Connection &  conn,
const string &  alntab,
map< string, string > &  id2prt,
int  maxcount = 100000 
)

References string().

void bufferTargetProteins2 ( Connection &  conn,
const string &  alntab,
map< string, seqelem > &  id2prt,
int  maxcount 
)

References string().

void checkchimera ( mysqlpp::Connection *  dbh,
int  olpmg,
const string &  matchtab,
const string &  db,
const string &  tabname 
)

void checksplit ( mysqlpp::Connection *  dbh,
int  olpmg,
const string &  matchtab,
const string &  db,
const string &  tabname,
const string &  modeltab 
)

Use the matchtab to check split genes. This requires more information than split gene and does more checking. matchtab is usually the topaln table. Needs the modelsrep table for input.

Output: written to database table tabname

References ChainAvgrange::addRange(), ChainAvgrange::checkSplits(), rangePair::fields(), ChainAvgrange::isChimera(), alnrange::setmargin(), split2tab(), and string().

Referenced by main().

void cleanBySpecies ( Connection *  dbh,
const string &  matab 
)

Referenced by create_modelaln().

pair< double, double > computeZvalue ( Protein &  p1,
Protein &  p2,
int  n 
)

return a pair of z-score for (score,identity)

References Dynaln::getIdentity(), bioseq::length(), Dynaln::printAlign(), bioseq::randomize(), Dynaln::runglobal(), and Dynaln::setSeq().

Referenced by main(), and removeSimpleMatch().

bool create_alnchimera ( Connection *  dbh,
const string &  matab,
const string &  alnchimera,
const Cutoff &  cut 
)

Create the alignment input for chimera detection from the main alignment input table.

Pick good matches for chimera detection from the modelaln table (matab): ng > 0.34 and qcov > 0.3 and tcov > 0.4.

alnchimera is the output table which is the input for chimera detection.

return true if operation is successful.

References Cutoff::chimera_cov, Cutoff::iden, and Cutoff::ng.

Referenced by main().

bool create_alnsplit ( Connection *  dbh,
const string &  matab,
const string &  alnsplit,
const Cutoff &  cut 
)

create the input table for split detection from the main alignment table

Create the aln table for input of split checking. Return true if the operation is successful.

References Cutoff::idenString(), Cutoff::ngString(), Cutoff::split_cov, and Cutoff::split_covString().

Referenced by main().

bool create_gapcount ( Connection *  dbh,
const string &  gaptab,
const string &  source 
)

Input: source. Output: gaptab. Converts the longtext columns hitInsertion and modelInsertion into integer types that represent the total number of gaps and the total gap length.

References getinslen().

Referenced by create_modelaln().

void create_modelaln ( Connection *  dbh,
const string &  matab,
const string &  taxids,
const Cutoff &  cut,
const string &  sourceTable 
)

set up the input data

matab is the input table which can be either a subset of proteinSW or modified version of proteinSWX (with sequence removed) The proteinSWX table helps to increase performance.

We can choose a subset from proteinSWX based on modelsrep table.

Input: sourceTable: proteinSW or proteinSWX, and the taxids: a list of multiple taxids to be eliminated including the target organism itself. Used to construct sql string.

Output: Build the matab (default name modelaln) table

This is a filtering process, essentially removing the sequence part of the alignment. Furthermore, it marks the quality of each alignment. The most important quality is 20, which stands for the top 20% of each query and target. Return the (maxqtab, maxttab) pair for use by the next program: maxqtab = matab + "_maxq"; maxttab = matab + "_maxt". This will add flexibility for the future to return any names.

If matab exists, then this program will drop it first then recreate it.

References Cutoff::alnlen, cleanBySpecies(), create_gapcount(), Cutoff::e, Cutoff::hitlen, Cutoff::iden, itos(), Cutoff::modellen, Cutoff::ng, and removeSimpleMatch().

Referenced by main().

void createNREPSW ( Connection &  conn,
const string &  model,
const string &  prtsw,
const string &  subsetaln 
)

Input: model (suggested name modelsrep), prtsw (suggested name is either proteinSW or proteinSWX). Output: subsetaln (suggested name nrepswx).

model, prtsw, and subsetaln are all table names.

Referenced by main(), and prepare().

bool fetchTargetProtein ( map< string, seqelem > &  cache,
const string &  id,
string &  seqstr,
Connection &  conn 
)

References getTargetProtein().

int getinslen ( string  str  ) 

Helper function. Input: column modelInsertion or hitInsertion of the proteinSW table (example: 62:58,78:17,154:81,273:7,). Output: sum of the lengths of all insertions.

References dissect().

Referenced by create_gapcount().

bool getTargetProtein ( Connection &  conn,
const string &  pid,
string &  seq 
)

return true if successful

get one target protein from the store table: proteinHitDesc

Parameters:
pid hitId to look for from table proteinHitDesc
seq is a string object that will contain the new sequence.
This method is not efficient and is not used for bulk loading.

Referenced by fetchTargetProtein().

int main ( int  argc,
char *  argv[] 
)

bool mysqlTableExists ( Connection &  conn,
const string &  tab 
)

Referenced by main().

bool prepare ( const Progparam &  par,
Connection &  conn 
)

This function prepares for this program; it should either overwrite existing tables or just use the old ones if they exist.

References createNREPSW(), Progparam::database, Progparam::host, Progparam::model, and Progparam::rawalntab.

Referenced by main().

void printBanner (  ) 

Referenced by main().

string readUnwantedTaxid ( const string &  taxidsFile  ) 

References dissect(), and ifstream().

Referenced by main().

void removeSimpleMatch ( Connection &  conn,
const string &  alnintab 
)

Removes simple matches from the aln table. This step is not very fast.

Parameters will be hard coded at this point. Input: alnintab. Output: alnouttab without simple alignments.

References bufferAheadTargetProteins(), bufferQueryProteins(), bioseq::computeEntropy(), computeZvalue(), min, and string().

Referenced by create_modelaln().

void split2tab ( const list< string > &  rows,
const string &  tabname,
mysqlpp::Connection *  dbh 
)

References toTable().

Referenced by checksplit().

void storeInChimeraTable ( const list< string > &  input,
const string &  tabname,
mysqlpp::Connection *  dbh 
)

References toTable().

Referenced by checkchimera().

bool tableExists ( Connection &  conn,
const string &  tab 
)

void toTable ( const list< string > &  rows,
const string &  tabname,
mysqlpp::Connection *  dbh 
)

Referenced by split2tab(), and storeInChimeraTable().

void usage ( const Cutoff &  cut  ) 


Generated on Wed Aug 10 11:57:01 2011 for Softwares from Orpara by  doxygen 1.5.6