SparseIntVect.h

Go to the documentation of this file.
00001 // $Id: SparseIntVect.h 926 2008-12-11 16:11:24Z glandrum $
00002 //
00003 //  Copyright (C) 2007-2008 Greg Landrum
00004 //
00005 //  @@ All Rights Reserved @@
00006 //
00007 #ifndef __RD_SPARSE_INT_VECT_20070921__
00008 #define __RD_SPARSE_INT_VECT_20070921__
00009 
00010 #include <map>
00011 #include <string>
00012 #include <RDGeneral/Invariant.h>
00013 #include <sstream>
00014 #include <RDBoost/Exceptions.h>
00015 #include <RDGeneral/StreamOps.h>
00016 #include <boost/cstdint.hpp>
00017 
00018 
00019 const int ci_SPARSEINTVECT_VERSION=0x0001; //!< version number to use in pickles
00020 namespace RDKit{
00021   //! a class for efficiently storing sparse vectors of ints
00022   template <typename IndexType>
00023   class SparseIntVect {
00024   public:
00025     typedef std::map<IndexType,int> StorageType;
00026   
00027     SparseIntVect() : d_length(0) {};
00028 
00029     //! initialize with a particular length
00030     SparseIntVect(IndexType length) : d_length(length) {};
00031 
00032     //! Copy constructor
00033     SparseIntVect(const SparseIntVect<IndexType> &other){
00034       d_length=other.d_length;
00035       d_data.insert(other.d_data.begin(),other.d_data.end());
00036     }
00037 
00038     //! constructor from a pickle
00039     SparseIntVect(const std::string pkl){
00040       initFromText(pkl.c_str(),pkl.size());
00041     };
00042     //! constructor from a pickle
00043     SparseIntVect(const char *pkl,const unsigned int len){
00044       initFromText(pkl,len);
00045     };
00046 
00047     //! destructor (doesn't need to do anything)
00048     ~SparseIntVect() {}
00049 
00050     //! return the value at an index
00051     int getVal(IndexType idx) const {
00052       if(idx<0||idx>=d_length){
00053         throw IndexErrorException(static_cast<int>(idx));
00054       }
00055       int res=0;
00056       typename StorageType::const_iterator iter=d_data.find(idx);
00057       if(iter!=d_data.end()){
00058         res=iter->second;
00059       }
00060       return res;
00061     };
00062     int operator[] (IndexType idx) const { return getVal(idx); };
00063 
00064     //! set the value at an index
00065     void setVal(IndexType idx, int val){
00066       if(idx<0||idx>=d_length){
00067         throw IndexErrorException(static_cast<int>(idx));
00068       }
00069       if(val!=0){
00070         d_data[idx]=val;
00071       } else {
00072         d_data.erase(idx);
00073       }
00074     };
00075     //! returns the length
00076     IndexType getLength() const { return d_length; };
00077 
00078     //! returns the sum of all the elements in the vect
00079     int getTotalVal() const {
00080       int res=0;
00081       typename StorageType::const_iterator iter;
00082       for(iter=d_data.begin();iter!=d_data.end();++iter){
00083         res+=iter->second;
00084       }
00085       return res;
00086     };
00087 
00088 
00089     //! returns our nonzero elements as a map(IndexType->int)
00090     const StorageType &getNonzeroElements() const {
00091       return d_data;
00092     }
00093 
00094 
00095     //! this is a "fuzzy" intesection, the final value
00096     //! of each element is equal to the minimum from
00097     //! the two vects.
00098     SparseIntVect<IndexType> &
00099     operator&= (const SparseIntVect<IndexType> &other) {
00100       if(other.d_length!=d_length){
00101         throw ValueErrorException("SparseIntVect size mismatch");
00102       }
00103 
00104       typename StorageType::iterator iter=d_data.begin();
00105       typename StorageType::const_iterator oIter=other.d_data.begin();
00106       while(iter!=d_data.end()){
00107         // we're relying on the fact that the maps are sorted:
00108         while(oIter!=other.d_data.end() && oIter->first < iter->first){
00109           ++oIter;
00110         }
00111         if(oIter!=other.d_data.end() && oIter->first==iter->first){
00112           // found it:
00113           if(oIter->second<iter->second){
00114             iter->second=oIter->second;
00115           }
00116           ++oIter;
00117           ++iter;
00118         } else {
00119           // not there; our value is zero, which means
00120           // we should remove this value:
00121           typename StorageType::iterator tmpIter=iter;
00122           ++tmpIter;
00123           d_data.erase(iter);
00124           iter=tmpIter;
00125         }
00126       }
00127       return *this;
00128     };
00129     const SparseIntVect<IndexType> 
00130     operator& (const SparseIntVect<IndexType> &other) const {
00131       SparseIntVect<IndexType> res(*this);
00132       return res&=other;
00133     }
00134 
00135     //! this is a "fuzzy" union, the final value
00136     //! of each element is equal to the maximum from
00137     //! the two vects.
00138     SparseIntVect<IndexType> &
00139     operator|= (const SparseIntVect<IndexType> &other) {
00140       if(other.d_length!=d_length){
00141         throw ValueErrorException("SparseIntVect size mismatch");
00142       }
00143 
00144       typename StorageType::iterator iter=d_data.begin();
00145       typename StorageType::const_iterator oIter=other.d_data.begin();
00146       while(iter!=d_data.end()){
00147         // we're relying on the fact that the maps are sorted:
00148         while(oIter!=other.d_data.end() &&
00149               oIter->first < iter->first){
00150           d_data[oIter->first]=oIter->second;
00151           ++oIter;
00152         }
00153         if(oIter!=other.d_data.end() && oIter->first==iter->first){
00154           // found it:
00155           if(oIter->second>iter->second){
00156             iter->second=oIter->second;
00157           }
00158           ++oIter;
00159         }
00160         ++iter;
00161       }
00162       // finish up the other vect:
00163       while(oIter!=other.d_data.end()){
00164         d_data[oIter->first]=oIter->second;
00165         ++oIter;
00166       }
00167       return *this;
00168     };
00169     const SparseIntVect<IndexType> 
00170     operator| (const SparseIntVect<IndexType> &other) const {
00171       SparseIntVect<IndexType> res(*this);
00172       return res|=other;
00173     }
00174 
00175     SparseIntVect<IndexType> &
00176     operator+= (const SparseIntVect<IndexType> &other) {
00177       if(other.d_length!=d_length){
00178         throw ValueErrorException("SparseIntVect size mismatch");
00179       }
00180       typename StorageType::iterator iter=d_data.begin();
00181       typename StorageType::const_iterator oIter=other.d_data.begin();
00182       while(oIter!=other.d_data.end()){
00183         while(iter!=d_data.end() &&
00184               iter->first < oIter->first){
00185           ++iter;
00186         }
00187         if(iter!=d_data.end() && oIter->first==iter->first){
00188           // found it:
00189           iter->second+=oIter->second;
00190           if(!iter->second){
00191             typename StorageType::iterator tIter=iter;
00192             ++tIter;
00193             d_data.erase(iter);
00194             iter=tIter;
00195           } else {
00196             ++iter;
00197           }
00198         } else {
00199           d_data[oIter->first]=oIter->second;
00200         }
00201         ++oIter;
00202       }
00203       return *this;
00204     };
00205     const SparseIntVect<IndexType> 
00206     operator+ (const SparseIntVect<IndexType> &other) const {
00207       SparseIntVect<IndexType> res(*this);
00208       return res+=other;
00209     }
00210 
00211     SparseIntVect<IndexType> &
00212     operator-= (const SparseIntVect<IndexType> &other) {
00213       if(other.d_length!=d_length){
00214         throw ValueErrorException("SparseIntVect size mismatch");
00215       }
00216       typename StorageType::iterator iter=d_data.begin();
00217       typename StorageType::const_iterator oIter=other.d_data.begin();
00218       while(oIter!=other.d_data.end()){
00219         while(iter!=d_data.end() &&
00220               iter->first < oIter->first){
00221           ++iter;
00222         }
00223         if(iter!=d_data.end() && oIter->first==iter->first){
00224           // found it:
00225           iter->second-=oIter->second;
00226           if(!iter->second){
00227             typename StorageType::iterator tIter=iter;
00228             ++tIter;
00229             d_data.erase(iter);
00230             iter=tIter;
00231           } else {
00232             ++iter;
00233           }
00234         } else {
00235           d_data[oIter->first]=-oIter->second;
00236         }
00237         ++oIter;
00238       }
00239       return *this;
00240     };
00241     const SparseIntVect<IndexType> 
00242     operator- (const SparseIntVect<IndexType> &other) const {
00243       SparseIntVect<IndexType> res(*this);
00244       return res-=other;
00245     }
00246 
00247     bool operator==(const SparseIntVect<IndexType> &v2) const{
00248       if(d_length!=v2.d_length){
00249         return false;
00250       }
00251       return d_data==v2.d_data;
00252     }
00253     bool operator!=(const SparseIntVect<IndexType> &v2) const {
00254       return !(*this==v2);
00255     }
00256 
00257     //! returns a binary string representation (pickle)
00258     std::string toString() const {
00259       std::stringstream ss(std::ios_base::binary|std::ios_base::out|std::ios_base::in);
00260       boost::uint32_t tInt;
00261       tInt=ci_SPARSEINTVECT_VERSION;
00262       streamWrite(ss,tInt);
00263       tInt=sizeof(IndexType);
00264       streamWrite(ss,tInt);
00265       streamWrite(ss,d_length);
00266       IndexType nEntries=d_data.size();
00267       streamWrite(ss,nEntries);
00268 
00269       typename StorageType::const_iterator iter=d_data.begin();
00270       while(iter!=d_data.end()){
00271         streamWrite(ss,iter->first);
00272         boost::int32_t tInt=iter->second;
00273         streamWrite(ss,tInt);
00274         ++iter;
00275       }
00276       return ss.str();
00277     };
00278 
00279     void fromString(const std::string &txt) {
00280       initFromText(txt.c_str(),txt.length());
00281     }
00282 
00283   private:
00284     IndexType d_length;
00285     StorageType d_data;
00286     
00287     void initFromText(const char *pkl,const unsigned int len) {
00288       d_data.clear();
00289       std::stringstream ss(std::ios_base::binary|std::ios_base::out|std::ios_base::in);
00290       ss.write(pkl,len);
00291       
00292       boost::uint32_t vers;
00293       streamRead(ss,vers);
00294       if(vers==0x0001){
00295         boost::uint32_t tInt;
00296         streamRead(ss,tInt);
00297         if(tInt>sizeof(IndexType)){
00298           throw ValueErrorException("IndexType cannot accomodate index size in SparseIntVect pickle");
00299         }
00300         switch(tInt){
00301         case sizeof(char):
00302           readVals<unsigned char>(ss);break;
00303         case sizeof(boost::int32_t):
00304           readVals<boost::uint32_t>(ss);break;
00305         case sizeof(boost::int64_t):
00306           readVals<boost::uint64_t>(ss);break;
00307         default:
00308           throw ValueErrorException("unreadable format");
00309         }
00310       } else {
00311         throw ValueErrorException("bad version in SparseIntVect pickle");
00312       }
00313     };
00314     template <typename T>
00315     void readVals(std::stringstream &ss){
00316       PRECONDITION(sizeof(T)<=sizeof(IndexType),"invalid size");
00317       T tVal;
00318       streamRead(ss,tVal);
00319       d_length=tVal;
00320       T nEntries;
00321       streamRead(ss,nEntries);
00322       for(T i=0;i<nEntries;++i){
00323         streamRead(ss,tVal);
00324         boost::int32_t val;
00325         streamRead(ss,val);
00326         d_data[tVal]=val;
00327       }
00328     }
00329   };
00330 
00331   template <typename IndexType, typename SequenceType>
00332   void updateFromSequence(SparseIntVect<IndexType> &vect,
00333                           const SequenceType &seq){
00334     typename SequenceType::const_iterator seqIt;
00335     for(seqIt=seq.begin();seqIt!=seq.end();++seqIt){
00336       // EFF: probably not the most efficient approach
00337       IndexType idx=*seqIt;
00338       vect.setVal(idx,vect.getVal(idx)+1);
00339     }
00340   }
00341 
00342   template <typename IndexType>
00343   double DiceSimilarity(const SparseIntVect<IndexType> &v1,
00344                         const SparseIntVect<IndexType> &v2,
00345                         bool returnDistance=false,
00346                         double bounds=0.0){
00347     if(v1.getLength()!=v2.getLength()){
00348       throw ValueErrorException("SparseIntVect size mismatch");
00349     }
00350     double v1Sum=0.0;
00351     double v2Sum=0.0;
00352     if(!returnDistance && bounds>0.0){
00353       v1Sum=v1.getTotalVal();
00354       v2Sum=v2.getTotalVal();
00355       double denom=v1Sum+v2Sum;
00356       if(fabs(denom)<1e-6){
00357         if(returnDistance){
00358           return 1.0;
00359         } else {
00360           return 0.0;
00361         }
00362       }
00363       double minV=v1Sum<v2Sum?v1Sum:v2Sum;
00364       if(2.*minV/denom<bounds){
00365         return 0.0;
00366       }
00367       v1Sum=0.0;
00368       v2Sum=0.0;
00369     }
00370 
00371     double numer=0.0;
00372     // we're doing : (v1&v2).getTotalVal(), but w/o generating
00373     // the other vector:
00374     typename SparseIntVect<IndexType>::StorageType::const_iterator iter1,iter2;
00375     iter1=v1.getNonzeroElements().begin();
00376     if(iter1!=v1.getNonzeroElements().end()) v1Sum+=iter1->second; 
00377     iter2=v2.getNonzeroElements().begin();
00378     if(iter2!=v2.getNonzeroElements().end()) v2Sum+=iter2->second; 
00379     while(iter1 != v1.getNonzeroElements().end()){
00380       while(iter2!=v2.getNonzeroElements().end() && iter2->first < iter1->first){
00381         ++iter2;
00382         if(iter2!=v2.getNonzeroElements().end()) v2Sum+=iter2->second; 
00383       }
00384       if(iter2!=v2.getNonzeroElements().end()){
00385         if(iter2->first == iter1->first){
00386           if(iter2->second<iter1->second){
00387             numer += iter2->second;
00388           } else {
00389             numer += iter1->second;
00390           }
00391           ++iter2;
00392           if(iter2!=v2.getNonzeroElements().end()) v2Sum+=iter2->second; 
00393         }
00394         ++iter1;
00395         if(iter1!=v1.getNonzeroElements().end()) v1Sum+=iter1->second; 
00396       } else {
00397         break;
00398       }
00399     }
00400     if(iter1 != v1.getNonzeroElements().end()){
00401       ++iter1;
00402       while(iter1!=v1.getNonzeroElements().end()){
00403         v1Sum+=iter1->second;
00404         ++iter1;
00405       }
00406     }
00407 
00408     if(iter2!=v2.getNonzeroElements().end()){
00409       ++iter2;
00410       while(iter2!=v2.getNonzeroElements().end()){
00411         v2Sum+=iter2->second;
00412         ++iter2;
00413       }
00414     }
00415     double denom=v1Sum+v2Sum;
00416     double sim;
00417     if(fabs(denom)<1e-6){
00418       sim=0.0;
00419     } else {
00420       sim=2.*numer/denom;
00421     }
00422     if(returnDistance) sim = 1.-sim;
00423     //std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
00424     return sim;
00425   }
00426 } 
00427 
00428 
00429 
00430 #endif

Generated on Fri Apr 3 06:03:02 2009 for RDCode by  doxygen 1.5.6