SparseIntVect.h

Go to the documentation of this file.
00001 // $Id: SparseIntVect.h 640 2008-04-24 19:03:34Z glandrum $
00002 //
00003 //  Copyright (C) 2007-2008 Greg Landrum
00004 //
00005 //  @@ All Rights Reserved @@
00006 //
00007 #ifndef __RD_SPARSE_INT_VECT_20070921__
00008 #define __RD_SPARSE_INT_VECT_20070921__
00009 
00010 #include <map>
00011 #include <string>
00012 #include <RDGeneral/Invariant.h>
00013 #include <sstream>
00014 #include <RDBoost/Exceptions.h>
00015 
00016 const int ci_SPARSEINTVECT_VERSION=0x0001; //!< version number to use in pickles
00017 namespace RDKit{
00018   //! a class for efficiently storing sparse vectors of ints
00019   template <typename IndexType>
00020   class SparseIntVect {
00021   public:
00022     typedef std::map<IndexType,int> StorageType;
00023   
00024     SparseIntVect() : d_length(0) {};
00025 
00026     //! initialize with a particular length
00027     SparseIntVect(IndexType length) : d_length(length) {};
00028 
00029     //! Copy constructor
00030     SparseIntVect(const SparseIntVect<IndexType> &other){
00031       d_length=other.d_length;
00032       d_data.insert(other.d_data.begin(),other.d_data.end());
00033     }
00034 
00035     //! constructor from a pickle
00036     SparseIntVect(const std::string pkl){
00037       initFromText(pkl.c_str(),pkl.size());
00038     };
00039     //! constructor from a pickle
00040     SparseIntVect(const char *pkl,const unsigned int len){
00041       initFromText(pkl,len);
00042     };
00043 
00044     //! destructor (doesn't need to do anything)
00045     ~SparseIntVect() {}
00046 
00047     //! return the value at an index
00048     int getVal(IndexType idx) const {
00049       if(idx<0||idx>=d_length){
00050         throw IndexErrorException(static_cast<int>(idx));
00051       }
00052       int res=0;
00053       typename StorageType::const_iterator iter=d_data.find(idx);
00054       if(iter!=d_data.end()){
00055         res=iter->second;
00056       }
00057       return res;
00058     };
00059     int operator[] (IndexType idx) const { return getVal(idx); };
00060 
00061     //! set the value at an index
00062     void setVal(IndexType idx, int val){
00063       if(idx<0||idx>=d_length){
00064         throw IndexErrorException(static_cast<int>(idx));
00065       }
00066       if(val!=0){
00067         d_data[idx]=val;
00068       } else {
00069         d_data.erase(idx);
00070       }
00071     };
00072     //! returns the length
00073     IndexType getLength() const { return d_length; };
00074 
00075     //! returns the sum of all the elements in the vect
00076     int getTotalVal() const {
00077       int res=0;
00078       typename StorageType::const_iterator iter;
00079       for(iter=d_data.begin();iter!=d_data.end();++iter){
00080         res+=iter->second;
00081       }
00082       return res;
00083     };
00084 
00085 
00086     //! returns our nonzero elements as a map(IndexType->int)
00087     const StorageType &getNonzeroElements() const {
00088       return d_data;
00089     }
00090 
00091 
00092     //! this is a "fuzzy" intesection, the final value
00093     //! of each element is equal to the minimum from
00094     //! the two vects.
00095     SparseIntVect<IndexType> &
00096     operator&= (const SparseIntVect<IndexType> &other) {
00097       if(other.d_length!=d_length){
00098         throw ValueErrorException("SparseIntVect size mismatch");
00099       }
00100 
00101       typename StorageType::iterator iter=d_data.begin();
00102       typename StorageType::const_iterator oIter=other.d_data.begin();
00103       while(iter!=d_data.end()){
00104         // we're relying on the fact that the maps are sorted:
00105         while(oIter!=other.d_data.end() && oIter->first < iter->first){
00106           ++oIter;
00107         }
00108         if(oIter!=other.d_data.end() && oIter->first==iter->first){
00109           // found it:
00110           if(oIter->second<iter->second){
00111             iter->second=oIter->second;
00112           }
00113           ++oIter;
00114           ++iter;
00115         } else {
00116           // not there; our value is zero, which means
00117           // we should remove this value:
00118           typename StorageType::iterator tmpIter=iter;
00119           ++tmpIter;
00120           d_data.erase(iter);
00121           iter=tmpIter;
00122         }
00123       }
00124       return *this;
00125     };
00126     const SparseIntVect<IndexType> 
00127     operator& (const SparseIntVect<IndexType> &other) const {
00128       SparseIntVect<IndexType> res(*this);
00129       return res&=other;
00130     }
00131 
00132     //! this is a "fuzzy" union, the final value
00133     //! of each element is equal to the maximum from
00134     //! the two vects.
00135     SparseIntVect<IndexType> &
00136     operator|= (const SparseIntVect<IndexType> &other) {
00137       if(other.d_length!=d_length){
00138         throw ValueErrorException("SparseIntVect size mismatch");
00139       }
00140 
00141       typename StorageType::iterator iter=d_data.begin();
00142       typename StorageType::const_iterator oIter=other.d_data.begin();
00143       while(iter!=d_data.end()){
00144         // we're relying on the fact that the maps are sorted:
00145         while(oIter!=other.d_data.end() &&
00146               oIter->first < iter->first){
00147           d_data[oIter->first]=oIter->second;
00148           ++oIter;
00149         }
00150         if(oIter!=other.d_data.end() && oIter->first==iter->first){
00151           // found it:
00152           if(oIter->second>iter->second){
00153             iter->second=oIter->second;
00154           }
00155           ++oIter;
00156         }
00157         ++iter;
00158       }
00159       // finish up the other vect:
00160       while(oIter!=other.d_data.end()){
00161         d_data[oIter->first]=oIter->second;
00162         ++oIter;
00163       }
00164       return *this;
00165     };
00166     const SparseIntVect<IndexType> 
00167     operator| (const SparseIntVect<IndexType> &other) const {
00168       SparseIntVect<IndexType> res(*this);
00169       return res|=other;
00170     }
00171 
00172     SparseIntVect<IndexType> &
00173     operator+= (const SparseIntVect<IndexType> &other) {
00174       if(other.d_length!=d_length){
00175         throw ValueErrorException("SparseIntVect size mismatch");
00176       }
00177       typename StorageType::iterator iter=d_data.begin();
00178       typename StorageType::const_iterator oIter=other.d_data.begin();
00179       while(oIter!=other.d_data.end()){
00180         while(iter!=d_data.end() &&
00181               iter->first < oIter->first){
00182           ++iter;
00183         }
00184         if(iter!=d_data.end() && oIter->first==iter->first){
00185           // found it:
00186           iter->second+=oIter->second;
00187           if(!iter->second){
00188             typename StorageType::iterator tIter=iter;
00189             ++tIter;
00190             d_data.erase(iter);
00191             iter=tIter;
00192           } else {
00193             ++iter;
00194           }
00195         } else {
00196           d_data[oIter->first]=oIter->second;
00197         }
00198         ++oIter;
00199       }
00200       return *this;
00201     };
00202     const SparseIntVect<IndexType> 
00203     operator+ (const SparseIntVect<IndexType> &other) const {
00204       SparseIntVect<IndexType> res(*this);
00205       return res+=other;
00206     }
00207 
00208     SparseIntVect<IndexType> &
00209     operator-= (const SparseIntVect<IndexType> &other) {
00210       if(other.d_length!=d_length){
00211         throw ValueErrorException("SparseIntVect size mismatch");
00212       }
00213       typename StorageType::iterator iter=d_data.begin();
00214       typename StorageType::const_iterator oIter=other.d_data.begin();
00215       while(oIter!=other.d_data.end()){
00216         while(iter!=d_data.end() &&
00217               iter->first < oIter->first){
00218           ++iter;
00219         }
00220         if(iter!=d_data.end() && oIter->first==iter->first){
00221           // found it:
00222           iter->second-=oIter->second;
00223           if(!iter->second){
00224             typename StorageType::iterator tIter=iter;
00225             ++tIter;
00226             d_data.erase(iter);
00227             iter=tIter;
00228           } else {
00229             ++iter;
00230           }
00231         } else {
00232           d_data[oIter->first]=-oIter->second;
00233         }
00234         ++oIter;
00235       }
00236       return *this;
00237     };
00238     const SparseIntVect<IndexType> 
00239     operator- (const SparseIntVect<IndexType> &other) const {
00240       SparseIntVect<IndexType> res(*this);
00241       return res-=other;
00242     }
00243 
00244     bool operator==(const SparseIntVect<IndexType> &v2){
00245       if(d_length!=v2.d_length){
00246         return false;
00247       }
00248       return d_data==v2.d_data;
00249     }
00250     bool operator!=(const SparseIntVect<IndexType> &v2){
00251       return !(*this==v2);
00252     }
00253 
00254     //! returns a binary string representation (pickle)
00255     const std::string toString() const {
00256       std::stringstream ss(std::ios_base::binary|std::ios_base::out|std::ios_base::in);
00257       ss.write((const char *)&(ci_SPARSEINTVECT_VERSION),sizeof(ci_SPARSEINTVECT_VERSION));
00258       unsigned int pieceSize=sizeof(IndexType);
00259       ss.write((const char *)&pieceSize,sizeof(pieceSize));
00260       ss.write((const char *)&d_length,sizeof(d_length));
00261       IndexType nEntries=d_data.size();
00262       ss.write((const char *)&nEntries,sizeof(nEntries));
00263 
00264       typename StorageType::const_iterator iter=d_data.begin();
00265       while(iter!=d_data.end()){
00266         ss.write((const char *)&iter->first,sizeof(iter->first));
00267         ss.write((const char *)&iter->second,sizeof(iter->second));
00268         ++iter;
00269       }
00270       return ss.str();
00271     };
00272 
00273     void fromString(std::string &txt) {
00274       initFromText(txt.c_str(),txt.length());
00275     }
00276 
00277   private:
00278     IndexType d_length;
00279     StorageType d_data;
00280     
00281     void initFromText(const char *pkl,const unsigned int len) {
00282       d_data.clear();
00283       std::stringstream ss(std::ios_base::binary|std::ios_base::out|std::ios_base::in);
00284       ss.write(pkl,len);
00285       
00286       int vers;
00287       ss.read((char *)&vers,sizeof(vers));
00288       if(vers==0x0001){
00289         unsigned int idxSize;
00290         ss.read((char *)&idxSize,sizeof(idxSize));
00291         if(idxSize>sizeof(IndexType)){
00292           throw ValueErrorException("IndexType cannot accomodate index size in SparseIntVect pickle");
00293         }
00294         switch(idxSize){
00295         case sizeof(char):
00296           readVals<unsigned char>(ss);break;
00297         case sizeof(int):
00298           readVals<unsigned int>(ss);break;
00299         case sizeof(long long):
00300           readVals<unsigned long long>(ss);break;
00301         default:
00302           throw ValueErrorException("unreadable format");
00303         }
00304       } else {
00305         throw ValueErrorException("bad version in SparseIntVect pickle");
00306       }
00307     };
00308     template <typename T>
00309     void readVals(std::stringstream &ss){
00310       PRECONDITION(sizeof(T)<=sizeof(IndexType),"invalid size");
00311       T tVal;
00312       ss.read((char *)&tVal,sizeof(T));
00313       d_length=tVal;
00314       T nEntries;
00315       ss.read((char *)&nEntries,sizeof(T));
00316       for(T i=0;i<nEntries;++i){
00317         ss.read((char *)&tVal,sizeof(tVal));
00318         int val;
00319         ss.read((char *)&val,sizeof(val));
00320         d_data[tVal]=val;
00321       }
00322     }
00323   };
00324 
00325   template <typename IndexType, typename SequenceType>
00326   void updateFromSequence(SparseIntVect<IndexType> &vect,
00327                           const SequenceType &seq){
00328     typename SequenceType::const_iterator seqIt;
00329     for(seqIt=seq.begin();seqIt!=seq.end();++seqIt){
00330       // EFF: probably not the most efficient approach
00331       IndexType idx=*seqIt;
00332       vect.setVal(idx,vect.getVal(idx)+1);
00333     }
00334   }
00335 
00336   template <typename IndexType>
00337   double DiceSimilarity(const SparseIntVect<IndexType> &v1,
00338                         const SparseIntVect<IndexType> &v2,
00339                         bool returnDistance=false,
00340                         double bounds=0.0){
00341     if(v1.getLength()!=v2.getLength()){
00342       throw ValueErrorException("SparseIntVect size mismatch");
00343     }
00344     double v1Sum=v1.getTotalVal();
00345     double v2Sum=v2.getTotalVal();
00346     double denom=v1Sum+v2Sum;
00347     if(fabs(denom)<1e-6){
00348       if(returnDistance){
00349         return 1.0;
00350       } else {
00351         return 0.0;
00352       }
00353     }
00354     if(!returnDistance && bounds>0.0){
00355       double minV=v1Sum<v2Sum?v1Sum:v2Sum;
00356       if(2.*minV/denom<bounds){
00357         return 0.0;
00358       }
00359     }
00360 
00361     double numer=0.0;
00362     // we're doing : (v1&v2).getTotalVal(), but w/o generating
00363     // the other vector:
00364     typename SparseIntVect<IndexType>::StorageType::const_iterator iter1,iter2;
00365     iter1=v1.getNonzeroElements().begin();
00366     iter2=v2.getNonzeroElements().begin();
00367     while(iter1 != v1.getNonzeroElements().end()){
00368       while(iter2!=v2.getNonzeroElements().end() && iter2->first < iter1->first){
00369         ++iter2;
00370       }
00371       if(iter2!=v2.getNonzeroElements().end()){
00372         if(iter2->first == iter1->first){
00373           if(iter2->second<iter1->second){
00374             numer += iter2->second;
00375           } else {
00376             numer += iter1->second;
00377           }
00378           ++iter2;
00379         }
00380         ++iter1;
00381       } else {
00382         break;
00383       }
00384     }
00385     double sim=2.*numer/denom;
00386     if(returnDistance) sim = 1.-sim;
00387     return sim;
00388   }
00389 } 
00390 
00391 
00392 
00393 #endif

Generated on Sat May 24 08:36:32 2008 for RDCode by  doxygen 1.5.3