#include "alnrange.h"
#include "ChainAvgrange.h"
#include <stdlib.h>
#include <iostream>
#include <string>
#include <vector>
#include <fstream>
#include <sstream>
#include <mysql++.h>
#include "strformat.h"
#include "bioseq.h"
#include "dynaln.h"
#include <cmath>
#include <iomanip>
Classes | |
| struct | Cutoff |
| class | Progparam |
| class | seqelem |
Functions | |
| void | createNREPSW (Connection &conn, const string &model, const string &prtsw, const string &subsetaln) |
| int | getinslen (string str) |
| bool | tableExists (Connection &conn, const string &tab) |
| bool | mysqlTableExists (Connection &conn, const string &tab) |
| void | checkchimera (mysqlpp::Connection *dbh, int olpmg, const string &matchtab, const string &db, const string &tabname) |
| void | checksplit (mysqlpp::Connection *dbh, int olpmg, const string &matchtab, const string &db, const string &tabname, const string &modeltab) |
| void | cleanBySpecies (Connection *dbh, const string &matab) |
| bool | create_gapcount (Connection *dbh, const string &gaptab, const string &source) |
| void | create_modelaln (Connection *dbh, const string &matab, const string &taxids, const Cutoff &cut, const string &sourceTable) |
| bool | create_alnsplit (Connection *dbh, const string &matab, const string &alntoptab, const Cutoff &cut) |
| bool | create_alnchimera (Connection *dbh, const string &matab, const string &alnchimera, const Cutoff &cut) |
| void | split2tab (const list< string > &rows, const string &tabname, mysqlpp::Connection *dbh) |
| void | storeInChimeraTable (const list< string > &input, const string &tabname, mysqlpp::Connection *dbh) |
| void | toTable (const list< string > &rows, const string &tabname, mysqlpp::Connection *dbh) |
| void | usage (const Cutoff &cut) |
| void | removeSimpleMatch (Connection &conn, const string &alnintab) |
| pair< double, double > | computeZvalue (Protein &p1, Protein &p2, int n) |
| string | readUnwantedTaxid (const string &taxidsFile) |
| void | bufferQueryProteins (Connection &conn, const string &alntab, map< int, string > &id2prt) |
| void | bufferQueryProteinsVector (Connection &conn, const string &alntab, vector< pair< int, string > > &store, int offset, int rowcount) |
| void | bufferTargetProteins (Connection &conn, const string &alntab, map< string, string > &id2prt, int maxcount=100000) |
| void | bufferTargetProteins2 (Connection &conn, const string &alntab, map< string, seqelem > &id2prt, int maxcount) |
| void | bufferAheadTargetProteins (Connection &conn, const string &alntab, map< string, string > &tpepstore, int offset, int maxcount) |
| bool | getTargetProtein (Connection &conn, const string &pid, string &seq) |
| bool | fetchTargetProtein (map< string, seqelem > &cache, const string &id, string &seqstr, Connection &conn) |
| void | printBanner () |
| bool | prepare (const Progparam &par, Connection &conn) |
| int | main (int argc, char *argv[]) |
| void bufferAheadTargetProteins | ( | Connection & | conn, | |
| const string & | alntab, | |||
| map< string, string > & | tpepstore, | |||
| int | offset, | |||
| int | maxcount | |||
| ) |
Input: conn is the database connection; alntab is the input alignment table; tpepstore is the input map to hold results; offset is the offset in alntab to start buffering; maxcount is the max number of rows from alntab to include. This function will take the protein sequences from table proteinHitDesc and store them into tpepstore.
Because the alignment table may have multiple rows for each target protein, the actual number of target proteins is usually fewer than the number of rows in alntab.
References string().
Referenced by removeSimpleMatch().
| void bufferQueryProteins | ( | Connection & | conn, | |
| const string & | alntab, | |||
| map< int, string > & | id2prt | |||
| ) |
| void bufferQueryProteinsVector | ( | Connection & | conn, | |
| const string & | alntab, | |||
| vector< pair< int, string > > & | store, | |||
| int | offset, | |||
| int | rowcount | |||
| ) |
References string().
| void bufferTargetProteins | ( | Connection & | conn, | |
| const string & | alntab, | |||
| map< string, string > & | id2prt, | |||
| int | maxcount = 100000 | |||
| ) |
References string().
| void bufferTargetProteins2 | ( | Connection & | conn, | |
| const string & | alntab, | |||
| map< string, seqelem > & | id2prt, | |||
| int | maxcount | |||
| ) |
References string().
| void checkchimera | ( | mysqlpp::Connection * | dbh, | |
| int | olpmg, | |||
| const string & | matchtab, | |||
| const string & | db, | |||
| const string & | tabname | |||
| ) |
| void checksplit | ( | mysqlpp::Connection * | dbh, | |
| int | olpmg, | |||
| const string & | matchtab, | |||
| const string & | db, | |||
| const string & | tabname, | |||
| const string & | modeltab | |||
| ) |
Use matchtab to check for split genes. This requires more information than split gene and does more checking. matchtab is usually the topaln table; this function also needs the modelsrep table for input.
Output: written to database table tabname
References ChainAvgrange::addRange(), ChainAvgrange::checkSplits(), rangePair::fields(), ChainAvgrange::isChimera(), alnrange::setmargin(), split2tab(), and string().
Referenced by main().
| void cleanBySpecies | ( | Connection * | dbh, | |
| const string & | matab | |||
| ) |
Referenced by create_modelaln().
Returns a pair of z-scores for (score, identity).
References Dynaln::getIdentity(), bioseq::length(), Dynaln::printAlign(), bioseq::randomize(), Dynaln::runglobal(), and Dynaln::setSeq().
Referenced by main(), and removeSimpleMatch().
| bool create_alnchimera | ( | Connection * | dbh, | |
| const string & | matab, | |||
| const string & | alnchimera, | |||
| const Cutoff & | cut | |||
| ) |
Create the alignment input for chimera detection from the main alignment input table.
pick good matches, for chimera detection, from modelaln table: matab ng > 0.34 and qcov>0.3 and tcov>0.4
alnchimera is the output table which is the input for chimera detection.
return true if operation is successful.
References Cutoff::chimera_cov, Cutoff::iden, and Cutoff::ng.
Referenced by main().
| bool create_alnsplit | ( | Connection * | dbh, | |
| const string & | matab, | |||
| const string & | alnsplit, | |||
| const Cutoff & | cut | |||
| ) |
create the input table for split detection from the main alignment table
Create the aln table for input of split checking. Returns true if the operation is successful.
References Cutoff::idenString(), Cutoff::ngString(), Cutoff::split_cov, and Cutoff::split_covString().
Referenced by main().
| bool create_gapcount | ( | Connection * | dbh, | |
| const string & | gaptab, | |||
| const string & | source | |||
| ) |
Input: source. Output: gaptab. Converts the longtext columns hitInsertion and modelInsertion into integer type values that represent the total number of gaps and the total gap length.
References getinslen().
Referenced by create_modelaln().
| void create_modelaln | ( | Connection * | dbh, | |
| const string & | matab, | |||
| const string & | taxids, | |||
| const Cutoff & | cut, | |||
| const string & | sourceTable | |||
| ) |
set up the input data
matab is the input table, which can be either a subset of proteinSW or a modified version of proteinSWX (with sequence removed). The proteinSWX table helps to increase performance.
We can choose a subset from proteinSWX based on modelsrep table.
Input: sourceTable: proteinSW or proteinSWX, and the taxids: a list of multiple taxids to be eliminated including the target organism itself. Used to construct sql string.
Output: Build the matab (default name modelaln) table
This is a filtering process, essentially removing the sequence part of the alignment. Furthermore, it marks the quality of each alignment. The most important quality is 20, which stands for the top 20% of each query and target. Return the (maxqtab, maxttab) pair for use by the next program: maxqtab = matab + "_maxq"; maxttab = matab + "_maxt". This will add flexibility for the future to return any names.
If matab exists, then this program will drop it first then recreate it.
References Cutoff::alnlen, cleanBySpecies(), create_gapcount(), Cutoff::e, Cutoff::hitlen, Cutoff::iden, itos(), Cutoff::modellen, Cutoff::ng, and removeSimpleMatch().
Referenced by main().
| void createNREPSW | ( | Connection & | conn, | |
| const string & | model, | |||
| const string & | prtsw, | |||
| const string & | subsetaln | |||
| ) |
| bool fetchTargetProtein | ( | map< string, seqelem > & | cache, | |
| const string & | id, | |||
| string & | seqstr, | |||
| Connection & | conn | |||
| ) |
References getTargetProtein().
| int getinslen | ( | string | str | ) |
Helper function. Input: column modelInsertion or hitInsertion of the proteinSW table (example: 62:58,78:17,154:81,273:7,). Output: sum of the length of all insertions.
References dissect().
Referenced by create_gapcount().
| bool getTargetProtein | ( | Connection & | conn, | |
| const string & | pid, | |||
| string & | seq | |||
| ) |
return true if successful
get one target protein from the store table: proteinHitDesc
| pid | hitId to look for from table proteinHitDesc | |
| seq | is a string object that will contain the new sequence. |
Referenced by fetchTargetProtein().
| int main | ( | int | argc, | |
| char * | argv[] | |||
| ) |
References Cutoff::alnlen, checkchimera(), checksplit(), Cutoff::chimera_cov, create_alnchimera(), create_alnsplit(), create_modelaln(), createNREPSW(), Progparam::database, Cutoff::e, Cutoff::hitlen, Progparam::host, Cutoff::iden, Progparam::model, Cutoff::modellen, mysqlTableExists(), Cutoff::ng, passwd, prepare(), printBanner(), Progparam::rawalntab, readUnwantedTaxid(), Cutoff::split_cov, Progparam::taxids, usage, and user.
| bool mysqlTableExists | ( | Connection & | conn, | |
| const string & | tab | |||
| ) |
Referenced by main().
| bool prepare | ( | const Progparam & | par, | |
| Connection & | conn | |||
| ) |
This function prepares for this program; it should either overwrite existing tables or just use old ones if they exist.
References createNREPSW(), Progparam::database, Progparam::host, Progparam::model, and Progparam::rawalntab.
Referenced by main().
| void printBanner | ( | ) |
Referenced by main().
| string readUnwantedTaxid | ( | const string & | taxidsFile | ) |
| void removeSimpleMatch | ( | Connection & | conn, | |
| const string & | alnintab | |||
| ) |
Removes simple matches from the aln table. This step is not very fast.
Parameters will be hard coded at this point. Input: alnintab. Output: alnouttab without simple alignments.
References bufferAheadTargetProteins(), bufferQueryProteins(), bioseq::computeEntropy(), computeZvalue(), min, and string().
Referenced by create_modelaln().
| void split2tab | ( | const list< string > & | rows, | |
| const string & | tabname, | |||
| mysqlpp::Connection * | dbh | |||
| ) |
| void storeInChimeraTable | ( | const list< string > & | input, | |
| const string & | tabname, | |||
| mysqlpp::Connection * | dbh | |||
| ) |
| bool tableExists | ( | Connection & | conn, | |
| const string & | tab | |||
| ) |
Referenced by bufferGenomic(), bufferProtein(), createBootTable(), and createTranscriptTable().
| void toTable | ( | const list< string > & | rows, | |
| const string & | tabname, | |||
| mysqlpp::Connection * | dbh | |||
| ) |
Referenced by split2tab(), and storeInChimeraTable().
| void usage | ( | const Cutoff & | cut | ) |
1.5.6