00001
00002
00003
00004
00005
00006
00007 #ifndef __RD_SPARSE_INT_VECT_20070921__
00008 #define __RD_SPARSE_INT_VECT_20070921__
00009
00010 #include <map>
00011 #include <string>
00012 #include <RDGeneral/Invariant.h>
00013 #include <sstream>
00014 #include <RDBoost/Exceptions.h>
00015
00016 const int ci_SPARSEINTVECT_VERSION=0x0001;
00017 namespace RDKit{
00018
00019 template <typename IndexType>
00020 class SparseIntVect {
00021 public:
00022 typedef std::map<IndexType,int> StorageType;
00023
00024 SparseIntVect() : d_length(0) {};
00025
00026
00027 SparseIntVect(IndexType length) : d_length(length) {};
00028
00029
00030 SparseIntVect(const SparseIntVect<IndexType> &other){
00031 d_length=other.d_length;
00032 d_data.insert(other.d_data.begin(),other.d_data.end());
00033 }
00034
00035
00036 SparseIntVect(const std::string pkl){
00037 initFromText(pkl.c_str(),pkl.size());
00038 };
00039
00040 SparseIntVect(const char *pkl,const unsigned int len){
00041 initFromText(pkl,len);
00042 };
00043
00044
00045 ~SparseIntVect() {}
00046
00047
00048 int getVal(IndexType idx) const {
00049 if(idx<0||idx>=d_length){
00050 throw IndexErrorException(static_cast<int>(idx));
00051 }
00052 int res=0;
00053 typename StorageType::const_iterator iter=d_data.find(idx);
00054 if(iter!=d_data.end()){
00055 res=iter->second;
00056 }
00057 return res;
00058 };
00059 int operator[] (IndexType idx) const { return getVal(idx); };
00060
00061
00062 void setVal(IndexType idx, int val){
00063 if(idx<0||idx>=d_length){
00064 throw IndexErrorException(static_cast<int>(idx));
00065 }
00066 if(val!=0){
00067 d_data[idx]=val;
00068 } else {
00069 d_data.erase(idx);
00070 }
00071 };
00072
00073 IndexType getLength() const { return d_length; };
00074
00075
00076 int getTotalVal() const {
00077 int res=0;
00078 typename StorageType::const_iterator iter;
00079 for(iter=d_data.begin();iter!=d_data.end();++iter){
00080 res+=iter->second;
00081 }
00082 return res;
00083 };
00084
00085
00086
00087 const StorageType &getNonzeroElements() const {
00088 return d_data;
00089 }
00090
00091
00092
00093
00094
00095 SparseIntVect<IndexType> &
00096 operator&= (const SparseIntVect<IndexType> &other) {
00097 if(other.d_length!=d_length){
00098 throw ValueErrorException("SparseIntVect size mismatch");
00099 }
00100
00101 typename StorageType::iterator iter=d_data.begin();
00102 typename StorageType::const_iterator oIter=other.d_data.begin();
00103 while(iter!=d_data.end()){
00104
00105 while(oIter!=other.d_data.end() && oIter->first < iter->first){
00106 ++oIter;
00107 }
00108 if(oIter!=other.d_data.end() && oIter->first==iter->first){
00109
00110 if(oIter->second<iter->second){
00111 iter->second=oIter->second;
00112 }
00113 ++oIter;
00114 ++iter;
00115 } else {
00116
00117
00118 typename StorageType::iterator tmpIter=iter;
00119 ++tmpIter;
00120 d_data.erase(iter);
00121 iter=tmpIter;
00122 }
00123 }
00124 return *this;
00125 };
00126 const SparseIntVect<IndexType>
00127 operator& (const SparseIntVect<IndexType> &other) const {
00128 SparseIntVect<IndexType> res(*this);
00129 return res&=other;
00130 }
00131
00132
00133
00134
00135 SparseIntVect<IndexType> &
00136 operator|= (const SparseIntVect<IndexType> &other) {
00137 if(other.d_length!=d_length){
00138 throw ValueErrorException("SparseIntVect size mismatch");
00139 }
00140
00141 typename StorageType::iterator iter=d_data.begin();
00142 typename StorageType::const_iterator oIter=other.d_data.begin();
00143 while(iter!=d_data.end()){
00144
00145 while(oIter!=other.d_data.end() &&
00146 oIter->first < iter->first){
00147 d_data[oIter->first]=oIter->second;
00148 ++oIter;
00149 }
00150 if(oIter!=other.d_data.end() && oIter->first==iter->first){
00151
00152 if(oIter->second>iter->second){
00153 iter->second=oIter->second;
00154 }
00155 ++oIter;
00156 }
00157 ++iter;
00158 }
00159
00160 while(oIter!=other.d_data.end()){
00161 d_data[oIter->first]=oIter->second;
00162 ++oIter;
00163 }
00164 return *this;
00165 };
00166 const SparseIntVect<IndexType>
00167 operator| (const SparseIntVect<IndexType> &other) const {
00168 SparseIntVect<IndexType> res(*this);
00169 return res|=other;
00170 }
00171
00172 SparseIntVect<IndexType> &
00173 operator+= (const SparseIntVect<IndexType> &other) {
00174 if(other.d_length!=d_length){
00175 throw ValueErrorException("SparseIntVect size mismatch");
00176 }
00177 typename StorageType::iterator iter=d_data.begin();
00178 typename StorageType::const_iterator oIter=other.d_data.begin();
00179 while(oIter!=other.d_data.end()){
00180 while(iter!=d_data.end() &&
00181 iter->first < oIter->first){
00182 ++iter;
00183 }
00184 if(iter!=d_data.end() && oIter->first==iter->first){
00185
00186 iter->second+=oIter->second;
00187 if(!iter->second){
00188 typename StorageType::iterator tIter=iter;
00189 ++tIter;
00190 d_data.erase(iter);
00191 iter=tIter;
00192 } else {
00193 ++iter;
00194 }
00195 } else {
00196 d_data[oIter->first]=oIter->second;
00197 }
00198 ++oIter;
00199 }
00200 return *this;
00201 };
00202 const SparseIntVect<IndexType>
00203 operator+ (const SparseIntVect<IndexType> &other) const {
00204 SparseIntVect<IndexType> res(*this);
00205 return res+=other;
00206 }
00207
00208 SparseIntVect<IndexType> &
00209 operator-= (const SparseIntVect<IndexType> &other) {
00210 if(other.d_length!=d_length){
00211 throw ValueErrorException("SparseIntVect size mismatch");
00212 }
00213 typename StorageType::iterator iter=d_data.begin();
00214 typename StorageType::const_iterator oIter=other.d_data.begin();
00215 while(oIter!=other.d_data.end()){
00216 while(iter!=d_data.end() &&
00217 iter->first < oIter->first){
00218 ++iter;
00219 }
00220 if(iter!=d_data.end() && oIter->first==iter->first){
00221
00222 iter->second-=oIter->second;
00223 if(!iter->second){
00224 typename StorageType::iterator tIter=iter;
00225 ++tIter;
00226 d_data.erase(iter);
00227 iter=tIter;
00228 } else {
00229 ++iter;
00230 }
00231 } else {
00232 d_data[oIter->first]=-oIter->second;
00233 }
00234 ++oIter;
00235 }
00236 return *this;
00237 };
00238 const SparseIntVect<IndexType>
00239 operator- (const SparseIntVect<IndexType> &other) const {
00240 SparseIntVect<IndexType> res(*this);
00241 return res-=other;
00242 }
00243
00244 bool operator==(const SparseIntVect<IndexType> &v2){
00245 if(d_length!=v2.d_length){
00246 return false;
00247 }
00248 return d_data==v2.d_data;
00249 }
00250 bool operator!=(const SparseIntVect<IndexType> &v2){
00251 return !(*this==v2);
00252 }
00253
00254
00255 const std::string toString() const {
00256 std::stringstream ss(std::ios_base::binary|std::ios_base::out|std::ios_base::in);
00257 ss.write((const char *)&(ci_SPARSEINTVECT_VERSION),sizeof(ci_SPARSEINTVECT_VERSION));
00258 unsigned int pieceSize=sizeof(IndexType);
00259 ss.write((const char *)&pieceSize,sizeof(pieceSize));
00260 ss.write((const char *)&d_length,sizeof(d_length));
00261 IndexType nEntries=d_data.size();
00262 ss.write((const char *)&nEntries,sizeof(nEntries));
00263
00264 typename StorageType::const_iterator iter=d_data.begin();
00265 while(iter!=d_data.end()){
00266 ss.write((const char *)&iter->first,sizeof(iter->first));
00267 ss.write((const char *)&iter->second,sizeof(iter->second));
00268 ++iter;
00269 }
00270 return ss.str();
00271 };
00272
00273 void fromString(std::string &txt) {
00274 initFromText(txt.c_str(),txt.length());
00275 }
00276
00277 private:
00278 IndexType d_length;
00279 StorageType d_data;
00280
00281 void initFromText(const char *pkl,const unsigned int len) {
00282 d_data.clear();
00283 std::stringstream ss(std::ios_base::binary|std::ios_base::out|std::ios_base::in);
00284 ss.write(pkl,len);
00285
00286 int vers;
00287 ss.read((char *)&vers,sizeof(vers));
00288 if(vers==0x0001){
00289 unsigned int idxSize;
00290 ss.read((char *)&idxSize,sizeof(idxSize));
00291 if(idxSize>sizeof(IndexType)){
00292 throw ValueErrorException("IndexType cannot accomodate index size in SparseIntVect pickle");
00293 }
00294 switch(idxSize){
00295 case sizeof(char):
00296 readVals<unsigned char>(ss);break;
00297 case sizeof(int):
00298 readVals<unsigned int>(ss);break;
00299 case sizeof(long long):
00300 readVals<unsigned long long>(ss);break;
00301 default:
00302 throw ValueErrorException("unreadable format");
00303 }
00304 } else {
00305 throw ValueErrorException("bad version in SparseIntVect pickle");
00306 }
00307 };
00308 template <typename T>
00309 void readVals(std::stringstream &ss){
00310 PRECONDITION(sizeof(T)<=sizeof(IndexType),"invalid size");
00311 T tVal;
00312 ss.read((char *)&tVal,sizeof(T));
00313 d_length=tVal;
00314 T nEntries;
00315 ss.read((char *)&nEntries,sizeof(T));
00316 for(T i=0;i<nEntries;++i){
00317 ss.read((char *)&tVal,sizeof(tVal));
00318 int val;
00319 ss.read((char *)&val,sizeof(val));
00320 d_data[tVal]=val;
00321 }
00322 }
00323 };
00324
00325 template <typename IndexType, typename SequenceType>
00326 void updateFromSequence(SparseIntVect<IndexType> &vect,
00327 const SequenceType &seq){
00328 typename SequenceType::const_iterator seqIt;
00329 for(seqIt=seq.begin();seqIt!=seq.end();++seqIt){
00330
00331 IndexType idx=*seqIt;
00332 vect.setVal(idx,vect.getVal(idx)+1);
00333 }
00334 }
00335
00336 template <typename IndexType>
00337 double DiceSimilarity(const SparseIntVect<IndexType> &v1,
00338 const SparseIntVect<IndexType> &v2,
00339 bool returnDistance=false,
00340 double bounds=0.0){
00341 if(v1.getLength()!=v2.getLength()){
00342 throw ValueErrorException("SparseIntVect size mismatch");
00343 }
00344 double v1Sum=v1.getTotalVal();
00345 double v2Sum=v2.getTotalVal();
00346 double denom=v1Sum+v2Sum;
00347 if(fabs(denom)<1e-6){
00348 if(returnDistance){
00349 return 1.0;
00350 } else {
00351 return 0.0;
00352 }
00353 }
00354 if(!returnDistance && bounds>0.0){
00355 double minV=v1Sum<v2Sum?v1Sum:v2Sum;
00356 if(2.*minV/denom<bounds){
00357 return 0.0;
00358 }
00359 }
00360
00361 double numer=0.0;
00362
00363
00364 typename SparseIntVect<IndexType>::StorageType::const_iterator iter1,iter2;
00365 iter1=v1.getNonzeroElements().begin();
00366 iter2=v2.getNonzeroElements().begin();
00367 while(iter1 != v1.getNonzeroElements().end()){
00368 while(iter2!=v2.getNonzeroElements().end() && iter2->first < iter1->first){
00369 ++iter2;
00370 }
00371 if(iter2!=v2.getNonzeroElements().end()){
00372 if(iter2->first == iter1->first){
00373 if(iter2->second<iter1->second){
00374 numer += iter2->second;
00375 } else {
00376 numer += iter1->second;
00377 }
00378 ++iter2;
00379 }
00380 ++iter1;
00381 } else {
00382 break;
00383 }
00384 }
00385 double sim=2.*numer/denom;
00386 if(returnDistance) sim = 1.-sim;
00387 return sim;
00388 }
00389 }
00390
00391
00392
00393 #endif