#include "Range.h"#include "Match.h"#include <mysql++.h>#include <iostream>#include <map>#include <fstream>#include "bioseq.h"#include "dbinfo.h"Classes | |
| class | Tblastnmy |
| struct | PfogParameters |
Functions | |
| void | loadGenomicInfo (mysqlpp::Connection *dbh, map< string, int > &genomicinfo, const string &gentab) |
| void | loadProteinInfo (Query &query, map< string, string > &prtinfo, Connection &nrdb, const string &nrtab) |
| int | bufferProteins (Connection &conn, map< string, string > &pepstore, int offset, Connection &nrdb, const PfogParameters &par) |
| int | bufferProteinsFromFile (const string &infile, map< string, string > &store) |
| void | storeDeleteRows (Connection *dbh, list< string > &rows, const string &table) |
| void | fetchGenomic (Connection &dbh, const string &seqid, DNA &seq) |
| void | fetchProtein (Connection &dbh, const string &seqid, Protein &seq) |
| void | createRemoveTable (Connection &dbh, const string &table) |
| Tblastn & | processOneBatch (Connection &conn, Connection &conn2, Connection &nrconn, const PfogParameters &par, int offset, ostream &SUM, ostream &DET, int &rplcount, map< string, int > &genoinfo, Tblastn <bn) |
| void | storeBestModels (Tblastn &tbn, ostream &SUM, ostream &DET, const PfogParameters &par) |
| void | usage (const PfogParameters ¶m) |
| int | main (int argc, char *argv[]) |
| int bufferProteins | ( | Connection & | conn, | |
| map< string, string > & | pepstore, | |||
| int | offset, | |||
| Connection & | nrdb, | |||
| const PfogParameters & | par | |||
| ) |
References PfogParameters::chunk_size, PfogParameters::identitycut, PfogParameters::input_table, PfogParameters::nr, and string().
Referenced by processOneBatch().
| int bufferProteinsFromFile | ( | const string & | infile, | |
| map< string, string > & | store | |||
| ) |
this function buffers all the proteins from the input file into a map of <string,string> pid -> sequence string Not fully coded yet.
References bioseq::getName(), bioseq::getSequence(), ifstream(), and bioseq::read().
Referenced by main().
| void createRemoveTable | ( | Connection & | dbh, | |
| const string & | table | |||
| ) |
| void fetchGenomic | ( | Connection & | dbh, | |
| const string & | seqid, | |||
| DNA & | seq | |||
| ) |
assume that the target database has genomic table
Referenced by processAll(), and processOneBatch().
| void fetchProtein | ( | Connection & | dbh, | |
| const string & | seqid, | |||
| Protein & | seq | |||
| ) |
| void loadGenomicInfo | ( | mysqlpp::Connection * | dbh, | |
| map< string, int > & | genomicinfo, | |||
| const string & | gentab | |||
| ) |
Referenced by main().
| void loadProteinInfo | ( | Query & | query, | |
| map< string, string > & | prtinfo, | |||
| Connection & | nrdb, | |||
| const string & | nrtab | |||
| ) |
| int main | ( | int | argc, | |
| char * | argv[] | |||
| ) |
References PfogParameters::chunk_size, MysqlDBInfo::getAuthenInfo(), MysqlDBInfo::getPassword(), MysqlDBInfo::getUser(), PfogParameters::identitycut, PfogParameters::input_table, loadGenomicInfo(), processOneBatch(), PfogParameters::prtinfile, PfogParameters::qcovcut, Tblastn::readConfig(), storeBestModels(), and user.
| Tblastn & processOneBatch | ( | Connection & | conn, | |
| Connection & | conn2, | |||
| Connection & | nrconn, | |||
| const PfogParameters & | par, | |||
| int | offset, | |||
| ostream & | SUM, | |||
| ostream & | DET, | |||
| int & | rplcount, | |||
| map< string, int > & | genoinfo, | |||
| Tblastn & | ltbn | |||
| ) |
This is the major working horse
process a subset from the whole table. In this particular case, I am picking a Reverse transcriptase scanning against different databases. I used RT from SwissProt and searched against a dozen fungal gnomic databases using tblastn. Because this program only deals with the whole table. I have the choice to write a wrapper to use a dozen tables or use a sql to select a portion of the tables.
proteinSource Tblastn& processAll(Connection &conn, Connection &conn2, Connection &nrconn, const PfogParameters &par, int offset, ostream &SUM, ostream &DET, int &rplcount, map<string,int> &genoinfo, Tblastn <bn) { cerr << "Processing recoreds of the input table: " << par.input_table << "\n";
map<string,string> pepstore; bufferProteins(conn, pepstore, offset, nrconn, par); par.input_table, offset, par.chunk_size, nrconn, par.nr);
try { Query query=conn.query(); query << "select * from tbn_kznr2Aniger order by target,query limit 5000"; 0 1 2 3 4 5 6 query << "select query, target, identity, alnlen, mismatches, gaps, qbegin, " << " qend, tbegin, tend, E, score " << " from " << par.input_table << " where identity > " << par.identitycut << " order by target,query,tbegin,tend,qbegin,qend limit " << offset << "," << par.chunk_size; Result res=query.store(); if (res.num_rows() == 0) { return ltbn; } int showEvery=5000; cerr << res.num_rows() << " rows to process\n"; int nfields = res.num_fields(); Row arow; list<string> removedRows; // for removing junks Result::const_iterator rit=res.begin(); arow=res.fetch_row(); int rplcount=0; int cnt=0; DNA genomicSeq; Protein proteinSeq; string lastTarget; while (arow) { arow=*rit; while (rit != res.end()) { cnt++; string query=string(arow[0]); string query=string((*rit).at(0)); string target=string((*rit).at(1));
if (target != lastTarget) { fetchGenomic(conn2, target, genomicSeq); lastTarget=target; } the protein is always different fetchProtein(conn2, query, proteinSeq); map<string,string>::const_iterator it = pepstore.find(query); query protein not found in pepstore if (it == pepstore.end()) { ifdef DEBUG cerr << query << " removed from the nr database, ignoring this row\n"; endif discarding all rows with the query missing from nr while (rit != res.end() && string((*rit).at(0)) == query && string((*rit).at(1)) == target) { ++rit; arow=res.fetch_row(); } continue; } proteinSeq=it->second; ifdef DEBUG cout << "\n********* Working on " << query << " <-> " << target << " # " << cnt << " ...\n"; cerr << "\n********* Working on " << query << " <-> " << target << " # " << cnt << " ...\n"; else if (cnt % par.showevery == 0) { cerr << query << " <-> " << target << " # " << cnt << " pairs done ...\n"; } endif Tblastnmy tbn(query, target, proteinSeq.length(), genoinfo[target]); if (!ltbn.empty()) { if (tbn.sameQTPair(ltbn)) { cerr << "combining tbln with last tbn result\n" << "last tbn:\n" << ltbn << "\n===========================================\nthis tbn:\n" << tbn << endl; tbn.add(ltbn); } else { cerr << "Store away last tbln from previous batch\n" << ltbn << endl; storeBestModels(ltbn, SUM, DET, par); } ltbn.clear(); // make sure this information used only once. } tbn.setSortedByTarget(true); tbn.setSortedByTarget(false); tbn.addMatch((*rit)); arow=res.fetch_row(); ++rit; while (rit != res.end() && string((*rit).at(0)) == query && string((*rit).at(1)) == target) { tbn.addMatch((*rit)); ++rit; arow=res.fetch_row(); } rplcount += tbn.removeJunk(proteinSeq, genomicSeq, 0.8); if (!arow) ltbn=tbn; // the last tbn of the last chunk if (rit == res.end()) ltbn=tbn; // the last tbn of the last chunk needs to be processed outside of this function. else storeBestModels(tbn, SUM, DET, par); } cerr << cnt << " QxT pairs of this chunk of " << res.num_rows() << " matches processed\n"; } catch (exception &err) { cerr << err.what() << endl << " failed to process " << offset << " chunk\n"; exit(1); } return ltbn; } When the number of rows is too large we needs to buffer the result into segments this function will process one segment of 5 million rows offset is 0-bases
This function has one problem: when query x target have multiple matches, it limit is more likely not at the right boundary between different query x target pairs. Future version of this function should be able to do that. Postpone the processing of the last blast.
return true when done.
References bufferProteins(), PfogParameters::chunk_size, Tblastn::clear(), Tblastn::empty(), fetchGenomic(), PfogParameters::identitycut, PfogParameters::input_table, bioseq::length(), PfogParameters::showevery, storeBestModels(), and string().
Referenced by main().
| void storeBestModels | ( | Tblastn & | tbn, | |
| ostream & | SUM, | |||
| ostream & | DET, | |||
| const PfogParameters & | par | |||
| ) |
helper function used by processOneBatch and Main
References Linkmatch::bestModel(), Tblastn::numMatches(), Tblastn::outputModel(), PfogParameters::qcovcut, PfogParameters::qcovlencut, and Tblastn::queryLength().
Referenced by main(), processAll(), and processOneBatch().
| void storeDeleteRows | ( | Connection * | dbh, | |
| list< string > & | rows, | |||
| const string & | table | |||
| ) |
| void usage | ( | const PfogParameters & | param | ) |
this describe how to use this program After calling this function the program will terminate.
1.5.6