pfog.cpp File Reference

#include "Range.h"
#include "Match.h"
#include <mysql++.h>
#include <iostream>
#include <map>
#include <fstream>
#include "bioseq.h"
#include "dbinfo.h"

Classes

class  Tblastnmy
struct  PfogParameters

Functions

void loadGenomicInfo (mysqlpp::Connection *dbh, map< string, int > &genomicinfo, const string &gentab)
void loadProteinInfo (Query &query, map< string, string > &prtinfo, Connection &nrdb, const string &nrtab)
int bufferProteins (Connection &conn, map< string, string > &pepstore, int offset, Connection &nrdb, const PfogParameters &par)
int bufferProteinsFromFile (const string &infile, map< string, string > &store)
void storeDeleteRows (Connection *dbh, list< string > &rows, const string &table)
void fetchGenomic (Connection &dbh, const string &seqid, DNA &seq)
void fetchProtein (Connection &dbh, const string &seqid, Protein &seq)
void createRemoveTable (Connection &dbh, const string &table)
TblastnprocessOneBatch (Connection &conn, Connection &conn2, Connection &nrconn, const PfogParameters &par, int offset, ostream &SUM, ostream &DET, int &rplcount, map< string, int > &genoinfo, Tblastn &ltbn)
void storeBestModels (Tblastn &tbn, ostream &SUM, ostream &DET, const PfogParameters &par)
void usage (const PfogParameters &param)
int main (int argc, char *argv[])

Function Documentation

int bufferProteins ( Connection &  conn,
map< string, string > &  pepstore,
int  offset,
Connection &  nrdb,
const PfogParameters par 
)

int bufferProteinsFromFile ( const string &  infile,
map< string, string > &  store 
)

this function buffers all the proteins from the input file into a map of <string,string> pid -> sequence string Not fully coded yet.

References bioseq::getName(), bioseq::getSequence(), ifstream(), and bioseq::read().

Referenced by main().

void createRemoveTable ( Connection &  dbh,
const string &  table 
)

void fetchGenomic ( Connection &  dbh,
const string &  seqid,
DNA seq 
)

assume that the target database has genomic table

Referenced by processAll(), and processOneBatch().

void fetchProtein ( Connection &  dbh,
const string &  seqid,
Protein seq 
)

void loadGenomicInfo ( mysqlpp::Connection *  dbh,
map< string, int > &  genomicinfo,
const string &  gentab 
)

Referenced by main().

void loadProteinInfo ( Query &  query,
map< string, string > &  prtinfo,
Connection &  nrdb,
const string &  nrtab 
)

References string().

Referenced by main().

int main ( int  argc,
char *  argv[] 
)

Tblastn & processOneBatch ( Connection &  conn,
Connection &  conn2,
Connection &  nrconn,
const PfogParameters par,
int  offset,
ostream &  SUM,
ostream &  DET,
int &  rplcount,
map< string, int > &  genoinfo,
Tblastn ltbn 
)

This is the major working horse

process a subset from the whole table. In this particular case, I am picking a Reverse transcriptase scanning against different databases. I used RT from SwissProt and searched against a dozen fungal gnomic databases using tblastn. Because this program only deals with the whole table. I have the choice to write a wrapper to use a dozen tables or use a sql to select a portion of the tables.

proteinSource Tblastn& processAll(Connection &conn, Connection &conn2, Connection &nrconn, const PfogParameters &par, int offset, ostream &SUM, ostream &DET, int &rplcount, map<string,int> &genoinfo, Tblastn &ltbn) { cerr << "Processing recoreds of the input table: " << par.input_table << "\n";

map<string,string> pepstore; bufferProteins(conn, pepstore, offset, nrconn, par); par.input_table, offset, par.chunk_size, nrconn, par.nr);

try { Query query=conn.query(); query << "select * from tbn_kznr2Aniger order by target,query limit 5000"; 0 1 2 3 4 5 6 query << "select query, target, identity, alnlen, mismatches, gaps, qbegin, " << " qend, tbegin, tend, E, score " << " from " << par.input_table << " where identity > " << par.identitycut << " order by target,query,tbegin,tend,qbegin,qend limit " << offset << "," << par.chunk_size; Result res=query.store(); if (res.num_rows() == 0) { return ltbn; } int showEvery=5000; cerr << res.num_rows() << " rows to process\n"; int nfields = res.num_fields(); Row arow; list<string> removedRows; // for removing junks Result::const_iterator rit=res.begin(); arow=res.fetch_row(); int rplcount=0; int cnt=0; DNA genomicSeq; Protein proteinSeq; string lastTarget; while (arow) { arow=*rit; while (rit != res.end()) { cnt++; string query=string(arow[0]); string query=string((*rit).at(0)); string target=string((*rit).at(1));

if (target != lastTarget) { fetchGenomic(conn2, target, genomicSeq); lastTarget=target; } the protein is always different fetchProtein(conn2, query, proteinSeq); map<string,string>::const_iterator it = pepstore.find(query); query protein not found in pepstore if (it == pepstore.end()) { ifdef DEBUG cerr << query << " removed from the nr database, ignoring this row\n"; endif discarding all rows with the query missing from nr while (rit != res.end() && string((*rit).at(0)) == query && string((*rit).at(1)) == target) { ++rit; arow=res.fetch_row(); } continue; } proteinSeq=it->second; ifdef DEBUG cout << "\n********* Working on " << query << " <-> " << target << " # " << cnt << " ...\n"; cerr << "\n********* Working on " << query << " <-> " << target << " # " << cnt << " ...\n"; else if (cnt % par.showevery == 0) { cerr << query << " <-> " << target << " # " << cnt << " pairs done ...\n"; } endif Tblastnmy tbn(query, target, proteinSeq.length(), genoinfo[target]); if (!ltbn.empty()) { if (tbn.sameQTPair(ltbn)) { cerr << "combining tbln with last tbn result\n" << "last tbn:\n" << ltbn << "\n===========================================\nthis tbn:\n" << tbn << endl; tbn.add(ltbn); } else { cerr << "Store away last tbln from previous batch\n" << ltbn << endl; storeBestModels(ltbn, SUM, DET, par); } ltbn.clear(); // make sure this information used only once. } tbn.setSortedByTarget(true); tbn.setSortedByTarget(false); tbn.addMatch((*rit)); arow=res.fetch_row(); ++rit; while (rit != res.end() && string((*rit).at(0)) == query && string((*rit).at(1)) == target) { tbn.addMatch((*rit)); ++rit; arow=res.fetch_row(); } rplcount += tbn.removeJunk(proteinSeq, genomicSeq, 0.8); if (!arow) ltbn=tbn; // the last tbn of the last chunk if (rit == res.end()) ltbn=tbn; // the last tbn of the last chunk needs to be processed outside of this function. else storeBestModels(tbn, SUM, DET, par); } cerr << cnt << " QxT pairs of this chunk of " << res.num_rows() << " matches processed\n"; } catch (exception &err) { cerr << err.what() << endl << " failed to process " << offset << " chunk\n"; exit(1); } return ltbn; } When the number of rows is too large we needs to buffer the result into segments this function will process one segment of 5 million rows offset is 0-bases

This function has one problem: when query x target have multiple matches, it limit is more likely not at the right boundary between different query x target pairs. Future version of this function should be able to do that. Postpone the processing of the last blast.

return true when done.

References bufferProteins(), PfogParameters::chunk_size, Tblastn::clear(), Tblastn::empty(), fetchGenomic(), PfogParameters::identitycut, PfogParameters::input_table, bioseq::length(), PfogParameters::showevery, storeBestModels(), and string().

Referenced by main().

void storeBestModels ( Tblastn tbn,
ostream &  SUM,
ostream &  DET,
const PfogParameters par 
)

void storeDeleteRows ( Connection *  dbh,
list< string > &  rows,
const string &  table 
)

void usage ( const PfogParameters param  ) 

this describe how to use this program After calling this function the program will terminate.


Generated on Wed Aug 10 11:57:05 2011 for Softwares from Orpara by  doxygen 1.5.6