hatrees.h

Go to the documentation of this file.
00001 // file hatrees.h  replaces forest.h
00002 // template class;  no *.cpp file associated with it
00003 #ifndef DISET_H
00004 #define DISET_H
00005 //#define HAVE_NAMESPACE_STD
00006 
00007 #include <string>
00008 #include <map>
00009 //#include <hash_map.h>
00010 //#include <multimap.h> included by <map>
00011 #include <utility>   // contains pair.h
00012 #include <fstream>
00013 #include <sstream>
00014 //#include "libpq++.h"
00015 //#include <pqxx>
00016 #include <set>
00017 #include <vector>
00018 #include <iostream>
00019 #include <cstdlib>
00020 
00021 //#include <istream.h>
00022 //
00031 //#define USE_HASH_MAP  // will insert duplicate key
00032 
00033 // there are a lot std utilites used in this 
00034 // header, the following statement polute the namespace
00035 using namespace std;
00036 //using namespace pqxx;
00037 
00038 template<class T> struct node
00039 {
00040         node() :  rank(0), key() { parent=this; }  // point to self
00041         node(const T &k) : rank(0), key(k) { parent=this; }
00042         node(const T &k, node* p) : parent(p), rank(0), key(k) { }
00043         /* set parent = this is essential */
00044         node(const node &n) { key = n.key; parent = this;  rank = n.rank; }
00045         // if use parent = n.parent then segmentation fault
00046         ~node() { /*nothing needs to be done*/ }
00047         bool operator==(const node& n) const { return key==n.key; }
00048         bool operator<(const node &n) const { return key < n.key; }
00049         //node& operator=(const node &n) { if (&n != this) { key = n.key; parent = n.parent; rank = n.rank; } return *this; }
00050         node& operator=(const node &n) { if (&n != this) { key = n.key; parent = this; rank = n.rank; } return *this; }
00051 
00052         T key;   // stored in the map<key, value>
00053         node<T>* parent;
00054         int rank;
00055 };
00056 
00057 /* tree operations */
00059 /* x is not altered */
00060 template<class T> node<T>* getRoot(node<T> *x) {
00061         if (x != x->parent)  x->parent = getRoot(x->parent); 
00062         return x->parent;
00063 }
00064 template<class T> void mergeRoot(node<T> *&r1, node<T> *&r2) {
00065         if(r1 == r2) return;  // if both are the same set
00066         if (r1->rank > r2->rank) r2->parent = r1;
00067         else {
00068                 r1->parent = r2;
00069                 if (r1->rank == r2->rank) r2->rank++;
00070         }
00071 }
00072 template<class T> void join(node<T> &x, node<T> &y) {
00073         node<T> *r1 = getRoot(&x);  // this is necessary for the compiler
00074         node<T> *r2 = getRoot(&y);
00075         mergeRoot(r1, r2);
00076         //mergeRoot(getRoot(&x), getRoot(&y));  // compiler got confused 
00077 }
00078 template<class T> void join(node<T>* x, node<T>* y) {
00079         node<T> *r1 = getRoot(x);  // this is necessary for the compiler
00080         node<T> *r2 = getRoot(y);
00081         mergeRoot(r1, r2);
00082         //mergeRoot(getRoot(x), getRoot(y));  // compiler got confused 
00083 }
00084 
00086 /* for hash_map of string as key, hash is not working
00087  * should be implemented later */
00088 /*
00089 template<> struct hash<string> {
00090         size_t operator()(const string &key) const {
00091                 size_t res=0;
00092                 typedef string::const_iterator CI;
00093                 CI  p = key.begin();
00094                 CI end= key.end();
00095 
00096                 while (*p) {
00097                         res = (res<<4) + *p++;
00098                         size_t g = res & 0xF0000000L;
00099                         if (g) res ^= g >> 24;
00100                         res &= ~g;
00101                 }
00102                 return res;
00103         }
00104 };
00105 */
00107 //   hatrees class definition /////////////////////////
00115 template<class T> class hatrees
00116 {
00117         public:
00118                 hatrees() { }
00122                 hatrees(const std::multimap<T,T> &mm) { readFromMap(mm); }
00123                 ~hatrees();
00124                 void clear() { nodes.clear(); result.clear(); }
00126                 void readFromMap(const std::multimap<T, T> &m);
00127                 void readFromFile(const string &file);
00128       /* only gecods.cpp needs this function
00129        * I am not compiling gecods.cpp anymore
00130        */
00131                 //void readFromDB(PgDatabase &db);
00132                 //void readFromDB(pqxx::result &qres);
00133 
00134                 /********** output ********************/
00136                 //void loadTable(PgDatabase &db, string tab);
00137                 //void loadTable(pqxx::connection &db, string tab);
00138 
00140                 void showStore(ostream &os) const;  
00141 
00143                 int getNodeCount() const { return nodes.size(); }
00146                 void showCluster(ostream &os, bool reverse= true);
00151                 void showClusterIntId(ostream &ous, int &id);
00152 
00154                 int showClusterByLine(ostream &os);
00158                 set<T> keyset() const;
00159                 vector<T> keyarray() const;
00160 
00166                 void clusterArray(vector<set<T> > &vecset);
00167 
00173                 std::multimap<T,T>& getCluster() { if (result.empty()) transform(); return result; }
00174 
00175 #ifdef USE_HASH_MAP
00176         //typename hash_map<T, node<T>* >::iterator niterator;
00177         //typename hash_map<T, node<T>* >::const_iterator const_niterator;
00178         typedef typename hash_map<T, node<T>* >::iterator niterator;
00179         typedef typename hash_map<T, node<T>* >::const_iterator const_niterator;
00180 #else
00181         typedef typename map<T, node<T>* >::iterator niterator;
00182         typedef typename map<T, node<T>* >::const_iterator const_niterator;
00183         //typename map<T, node<T>* >::iterator niterator;
00184         //typename map<T, node<T>* >::const_iterator const_niterator;
00185 #endif
00186 
00187         private:
00188 #ifdef USE_HASH_MAP 
00189                 hash_map<T, node<T>* > nodes;
00190 #else
00191                 map<T, node<T>* > nodes;
00192 #endif
00193 
00196                 //multimap<T, T> cluster;  // sorted result, loaded after
00197                 std::multimap<T, T> result;  // sorted result in map format
00198 
00199                 /* transform the nodStore cluster into mutimaped cluster
00200                  * in a table format cluster_id -> members */
00201                 void transform();
00202 };
00203 
00205 template<class T> hatrees<T>::~hatrees() { 
00206         niterator MI = nodes.begin();
00207         while (MI != nodes.end()) {
00208                 delete MI->second;
00209                 MI++;
00210         }
00211 }
00212 
00213 template<class T> void hatrees<T>::readFromMap(const std::multimap<T, T> &m) {
00214         pair<niterator, bool> p1, p2;
00215         T f1, f2; 
00216         typename std::multimap<T, T>::const_iterator imi = m.begin();
00217 
00218         while (imi != m.end()) {
00219                 f1 = imi->first;
00220                 f2 = imi->second;
00221                 p1 = nodes.insert(pair<T, node<T>* >(f1, new node<T>(f1)));
00222                 p2 = nodes.insert(pair<T, node<T>* >(f2, new node<T>(f2)));
00223                 join(p1.first->second, p2.first->second);
00224                 imi++;
00225         }
00226 }
00227 
00228 template<class T> void hatrees<T>::readFromFile(const string &file) {
00229         ifstream IN(file.c_str());
00230         if (IN.fail()) {
00231                 cerr << "reading from " << file << "failed\n";
00232                 exit(1);
00233         }
00234         string ln;
00235         pair<niterator, bool> p1, p2;
00236         T f1, f2; 
00237         //hash_map<T, T> xx;   // hash_map inserts duplicate key under template
00238         //hash_map<T, int> hm;
00239         //map<T, T> yy;
00240         int id=0;
00241 
00242         getline(IN, ln);
00243         while (!IN.eof()) {
00244                 istringstream ist(ln);
00245                 ist >> f1 >> f2;
00246                 /*
00247                 xx.insert(pair<T, T>(f1, f2));
00248                 yy.insert(pair<T, T>(f1, f2));
00249                 if (hm.find(f1) == hm.end()) {
00250                         hm.insert(pair<T, int>(f1, id++));
00251                 }
00252                 if (hm.find(f2) == hm.end()) {
00253                         hm.insert(pair<T, int>(f2, id++));
00254                 }
00255                 */
00256                 p1 = nodes.insert(pair<T, node<T>* >(f1, new node<T>(f1)));
00257                 p2 = nodes.insert(pair<T, node<T>* >(f2, new node<T>(f2)));
00258                 join(p1.first->second, p2.first->second);
00259                 getline(IN, ln);
00260         }
00261         /*
00262         cout << "Size of hash_map<string, string> is " << xx.size() << endl;
00263         cout << "Size of map<string, string> is " << yy.size() << endl;
00264         cout << "Size of hash_map<string, int> is " << hm.size() << endl;
00265         hash_map<T, T>::iterator xxi = xx.begin();
00266         map<T,T>::iterator yyi = yy.begin();
00267         while (xxi != xx.end()) {
00268                 cout << xxi->first << "\t" << xxi->second << endl;
00269                 xxi++;
00270         }
00271         cout << "================================\n";
00272         while (yyi != yy.end()) {
00273                 cout << yyi->first << "\t" << yyi->second << endl;
00274                 yyi++;
00275         }
00276         cout << "-----------------------------\n";
00277         hash_map<T, int>::iterator hmi = hm.begin();
00278         while (hmi != hm.end()) {
00279                 cout << hmi->first << "\t" << hmi->second << endl;
00280                 hmi++;
00281         }
00282         */
00283 }
00284 
00285 /* db has the result ready; will use field 1, and 2 as input */
00286 //template<class T> void hatrees<T>::readFromDB(PgDatabase &db) {
00287 /* removing this function so that this library will not
00288  * dependent on postgres
00289 template<class T> void hatrees<T>::readFromDB(pqxx::result &qres) {
00290         pair<niterator, bool> p1, p2;
00291         T f1, f2; 
00292         string ln, tmp;
00293 
00294         int i;
00295         for (i=0; i< qres.size(); i++) {
00296                 //ln = db.GetValue(i,0);
00297                 qres[i][0].to(ln);
00298       qres[i][1].to(tmp);
00299                 //ln.append("\t").append(db.GetValue(i,1));
00300                 ln.append("\t").append(tmp);
00301       
00302                 istringstream ist(ln);
00303                 ist >> f1 >> f2;
00304                 p1 = nodes.insert(pair<T, node<T>* >(f1, new node<T>(f1)));
00305                 p2 = nodes.insert(pair<T, node<T>* >(f2, new node<T>(f2)));
00306                 join(p1.first->second, p2.first->second);
00307         }
00308 }
00309 */
00310 /* load into relational table(member_key, member_representative) */
00311 //template<class T> void hatrees<T>::loadTable(PgDatabase &db,string tab) {
00312 /* disable to make this library less dependent on postgress 
00313  * installation
00314 template<class T> void hatrees<T>::loadTable(pqxx::connection &db,string tab) {
00315         string queryBase = "insert into " + tab + " values(";
00316         
00317         niterator it = nodes.begin();
00318         node<T>* root;
00319         cerr << "Loading into table member_key\tgroup_representative\n";
00320    work Xaction(db, "forinsertion");
00321 
00322         while (it != nodes.end()) {
00323                 root = getRoot(it->second->parent);
00324                 string query = queryBase;
00325                 ostringstream ous;
00326                 ous << '\'' << it->first << "','" << root->key << "')";
00327                 query += ous.str();
00328       try {
00329          Xaction.exec(query);
00330       }
00331       catch (exception &err) {
00332          cerr << err.what() << endl;
00333          exit(1);
00334       }
00335       Xaction.commit();
00336       */
00337       /*
00338                 if (!db.ExecCommandOk(query.c_str())) {
00339                         cerr << query << " failed\n";
00340                         exit(1);
00341                 }
00342       */
00343 /*
00344                 it++;
00345         }
00346         cerr << "Total number of unique keys is " << nodes.size() << endl;
00347 }
00348 */
00349 
00350 /* members->cluster_id sorted according to members if using <map>
00351  * if using hash_map then not in any order */
00352 template<class T> void hatrees<T>::showStore(ostream &os) const {
00353         const_niterator it = nodes.begin();
00354         node<T>* root;
00355         os << "key\tcluster_representative\n";
00356         while (it != nodes.end()) {
00357                 root = getRoot(it->second->parent);
00358                 os << it->first << "\t" << root->key << endl;
00359                 it++;
00360         }
00361         os << "Total number of unique keys is " << nodes.size() << endl;
00362         cerr << "Total number of unique keys is " << nodes.size() << endl;
00363 }
00364 
00365 /* return the key as a set<T> */
00366 template<class T> set<T> hatrees<T>::keyset() const {
00367         set<T> tmpset;
00368         const_niterator it = nodes.begin();
00369         while (it != nodes.end()) {
00370                 tmpset.insert(it->first);
00371                 it++;
00372         }
00373         return tmpset;
00374 }
00375 
00376 template<class T> vector<T> hatrees<T>::keyarray()  const {
00377         vector<T> tmp;
00378         const_niterator it = nodes.begin();
00379         while (it != nodes.end()) {
00380                 //cout << it->first << "   ";  //debug
00381                 tmp.push_back(it->first);
00382                 it++;
00383         }
00384         return tmp;
00385 }
00386 
00387 /* return a vector of clusters as sets of members */
00388 template<class T> void hatrees<T>::clusterArray(vector< set<T> > &vecset) {
00389         if (result.empty()) transform();
00390         vecset.clear();  // just in case it has something already in it
00391 
00392         typename std::multimap<T, T>::const_iterator mi = result.begin();
00393         while (mi != result.end()) {
00394                 set<T> tmpset;
00395                 tmpset.insert(mi->second);
00396                 //os << mi->second;
00397                 T cluster_id = mi->first;
00398                 mi++;
00399                 while (mi != result.end() && mi->first == cluster_id) {
00400                         //os << '\t' << mi->second;
00401                         tmpset.insert(mi->second);
00402                         mi++;
00403                 }
00404                 vecset.push_back(tmpset);
00405                 //os << endl;
00406         }
00407 }
00408 
00409 // you can provide a inital id to name the clusters
00410 // This function will increment this id for each cluster
00411 template<class T> void hatrees<T>::showClusterIntId(ostream &ous, int &id) {
00412         if (result.empty()) transform();
00413         typename std::multimap<T, T>::const_iterator mi = result.begin();
00414         while (mi != result.end()) {
00415                 set<T> tmpset;
00416                 ous << mi->second << '\t' << id << endl;
00417                 //tmpset.insert(mi->second);
00418                 T cluster_id = mi->first;
00419                 mi++;
00420                 while (mi != result.end() && mi->first == cluster_id) {
00421                         //tmpset.insert(mi->second);
00422                         ous << mi->second << '\t' << id << endl;
00423                         mi++;
00424                 }
00425                 //vecset.push_back(tmpset);
00426                 //os << endl;
00427                 ++id;
00428         }
00429 }
00430 
00431 /* output in cluster_id -->members 
00432  * Table format, good for relational database
00433  * */
00434 template<class T> void hatrees<T>::showCluster(ostream &os, bool reverse) {
00435         if (result.empty()) transform();
00436 
00437         typename std::multimap<T, T>::const_iterator mi = result.begin();
00438         if (reverse) {
00439                 while (mi != result.end()) {
00440                         os << mi->second << '\t' << mi->first << endl;
00441                         mi++;
00442                 }
00443         }
00444         else {
00445                 while (mi != result.end()) {
00446                         os << mi->first << '\t' << mi->second << endl;
00447                         mi++;
00448                 }
00449         }
00450         //os << "Total members: " << result.size() << endl;
00451         //for db loading, this line causes trouble
00452         cerr << "Total members: " << result.size() << endl;
00453 }
00454 
00455 template<class T> void hatrees<T>::transform() {
00456         niterator it = nodes.begin();
00457         node<T>* root;
00458         while (it != nodes.end()) {
00459                 root = getRoot(it->second->parent);
00460                 result.insert(pair<T, T>(root->key, it->first));
00461                 it++;
00462         }
00463 }
00464 
00467 template<class T> int hatrees<T>::showClusterByLine(ostream &os) {
00468         if (result.empty()) transform();
00469         os << "Output format: one cluster per line\n\n";
00470         int clusterCnt=0;
00471         typename std::multimap<T, T>::const_iterator mi = result.begin();
00472         while (mi != result.end()) {
00473                 clusterCnt++;
00474                 os << mi->second;
00475                 T cluster_id = mi->first;
00476                 mi++;
00477                 while (mi != result.end() && mi->first == cluster_id) {
00478                         os << '\t' << mi->second;
00479                         mi++;
00480                 }
00481                 os << "\n\n";
00482         }
00483         os << "Total " << clusterCnt << " clusters\n";
00484         return clusterCnt;
00485 }
00486 
00487 #endif

Generated on Wed Aug 10 11:56:50 2011 for Softwares from Orpara by  doxygen 1.5.6