00001 // 00002 // Copyright (C) 2001-2006 Rational Discovery LLC 00003 // 00004 // @@ All Rights Reserved @@ 00005 // 00006 #ifndef _RD_MOL_OPS_H_ 00007 #define _RD_MOL_OPS_H_ 00008 00009 #include <RDGeneral/types.h> 00010 #include <boost/tuple/tuple.hpp> 00011 00012 extern const int ci_LOCAL_INF; 00013 namespace RDKit{ 00014 class ROMol; 00015 class RWMol; 00016 class Atom; 00017 class Bond; 00018 typedef std::vector<double> INVAR_VECT; 00019 typedef INVAR_VECT::iterator INVAR_VECT_I; 00020 typedef INVAR_VECT::const_iterator INVAR_VECT_CI; 00021 00022 //! used to request particular types of atom invariants 00023 typedef enum { 00024 ComprehensiveInvariants, //!< for general atomic ranking 00025 ChiralSearchInvariants, //!< for ranking atoms for calculating CIP codes 00026 } AtomInvariantType; 00027 00028 //! used to return atomic discriminators (three doubles) 00029 typedef boost::tuples::tuple<double,double,double> DiscrimTuple; 00030 00031 00032 //! \brief Groups a variety of molecular query and transformation operations. 00033 namespace MolOps { 00034 //! find fragments (disconnected components of the molecular graph) 00035 /*! 00036 00037 \param mol the molecule of interest 00038 \param mapping used to return the mapping of Atoms->fragments. 00039 On return \c mapping will be <tt>mol->getNumAtoms()</tt> long 00040 and will contain the fragment assignment for each Atom 00041 00042 \return the number of fragments found. 00043 00044 */ 00045 unsigned int getMolFrags(const ROMol &mol,INT_VECT &mapping); 00046 //! find fragments (disconnected components of the molecular graph) 00047 /*! 00048 00049 \param mol the molecule of interest 00050 \param frags used to return the Atoms in each fragment 00051 On return \c mapping will be \c numFrags long, and each entry 00052 will contain the indices of the Atoms in that fragment. 00053 00054 \return the number of fragments found. 00055 00056 */ 00057 unsigned int getMolFrags(const ROMol &mol, VECT_INT_VECT &frags); 00058 00059 00060 //! returns a copy of a molecule with hydrogens added in as explicit Atoms 00061 /*! 00062 \param mol the molecule to add Hs to 00063 \param explicitOnly (optional) if this \c true, only explicit Hs will be added 00064 \param addCoords (optional) If this is true, estimates for the atomic coordinates 00065 of the added Hs will be used. 00066 00067 \return the new molecule 00068 00069 <b>Notes:</b> 00070 - it makes no sense to use the \c addCoords option if the molecule's heavy 00071 atoms don't already have coordinates. 00072 - the caller is responsible for <tt>delete</tt>ing the pointer this returns. 00073 */ 00074 ROMol *addHs(const ROMol &mol,bool explicitOnly=false,bool addCoords=false); 00075 00076 00077 //! returns a copy of a molecule with hydrogens removed 00078 /*! 00079 \param mol the molecule to remove Hs from 00080 \param implicitOnly (optional) if this \c true, only implicit Hs will be removed 00081 \param updateExplicitCount (optional) If this is \c true, when explicit Hs are removed 00082 from the graph, the heavy atom to which they are bound will have its counter of 00083 explicit Hs increased. 00084 \param sanitize: (optional) If this is \c true, the final molecule will be 00085 sanitized 00086 00087 \return the new molecule 00088 00089 <b>Notes:</b> 00090 - Hydrogens which aren't connected to a heavy atom will not be 00091 removed. This prevents molecules like <tt>"[H][H]"</tt> from having 00092 all atoms removed. 00093 - Labelled hydrogen (e.g. atoms with atomic number=1, but mass > 1), 00094 will not be removed. 00095 00096 - the caller is responsible for <tt>delete</tt>ing the pointer this returns. 00097 */ 00098 ROMol *removeHs(const ROMol &mol,bool implicitOnly=false, 00099 bool updateExplicitCount=false,bool sanitize=true); 00100 00101 //! returns a copy of a molecule with hydrogens removed and added as queries 00102 //! to the heavy atoms to which they are bound. 00103 /*! 00104 This is really intended to be used with molecules that contain QueryAtoms 00105 00106 \param mol the molecule to remove Hs from 00107 00108 \return the new molecule 00109 00110 <b>Notes:</b> 00111 - Atoms that do not already have hydrogen count queries will have one 00112 added, other H-related queries will not be touched. Examples: 00113 - C[H] -> [C;!H0] 00114 - [C;H1][H] -> [C;H1] 00115 - [C;H2][H] -> [C;H2] 00116 - Hydrogens which aren't connected to a heavy atom will not be 00117 removed. This prevents molecules like <tt>"[H][H]"</tt> from having 00118 all atoms removed. 00119 - the caller is responsible for <tt>delete</tt>ing the pointer this returns. 00120 00121 */ 00122 ROMol *mergeQueryHs(const ROMol &mol); 00123 00124 //! return the number of electrons available on an atom to donate for aromaticity 00125 /*! 00126 The result is determined using the default valency, number of lone pairs, 00127 number of bonds and the formal charge. Note that the atom may not donate 00128 all of these electrons to a ring for aromaticity (also used in Conjugation 00129 and hybridization code). 00130 00131 \param at the atom of interest 00132 00133 \return the number of electrons 00134 */ 00135 int countAtomElec(const Atom *at); 00136 00137 //! Sets up the aromaticity for a molecule 00138 /*! 00139 00140 This is what happens here: 00141 -# find all the simple rings by calling the findSSSR function 00142 -# loop over all the Atoms in each ring and mark them if they are candidates 00143 for aromaticity. A ring atom is a candidate if it can spare electrons 00144 to the ring and if it's from the first two rows of the periodic table. 00145 -# ased on the candidate atoms, mark the rings to be either candidates 00146 or non-candidates. A ring is a candidate only if all its atoms are candidates 00147 -# apply Hueckel rule to each of the candidate rings to check if the ring can be 00148 aromatic 00149 00150 \param mol the RWMol of interest 00151 00152 \return 1 on succes, 0 otherwise 00153 00154 <b>Assumptions:</b> 00155 - Kekulization has been done (i.e. \c MolOps::Kekulize() has already 00156 been called) 00157 00158 */ 00159 int setAromaticity(RWMol &mol); 00160 00161 00162 //! Designed to be called by the sanitizer to handle special cases before anything is done. 00163 /*! 00164 00165 Currently this: 00166 - modifies nitro groups, so that the nitrogen does not have a unreasonable 00167 valence of 5, as follows: 00168 - the nitrogen gets a positve charge 00169 - one of the oxygens gets a negative chage and the double bond to this 00170 oxygen is changed to a single bond 00171 The net result is that nitro groups can be counted on to be: 00172 \c "[N+](=O)[O-]" 00173 00174 \param mol the molecule of interest 00175 00176 */ 00177 void cleanUp(RWMol &mol); 00178 00179 00180 //! adjust the number of implicit and explicit Hs for special cases 00181 /*! 00182 00183 Currently this: 00184 - modifies aromatic nitrogens so that, when appropriate, they have an 00185 explicit H marked (e.g. so that we get things like \c "c1cc[nH]cc1" 00186 00187 \param mol the molecule of interest 00188 00189 <b>Assumptions</b> 00190 - this is called after the molecule has been sanitized, 00191 aromaticity has been perceived, and the implicit valence of 00192 everything has been calculated. 00193 00194 */ 00195 void adjustHs(RWMol &mol); 00196 00197 //! finds a molecule's Smallest Set of Smallest Rings 00198 /*! 00199 Currently this implements a modified form of Figueras algorithm 00200 (JCICS - Vol. 36, No. 5, 1996, 986-991) 00201 00202 \param mol the molecule of interest 00203 \param res used to return the vector of rings. Each entry is a vector with 00204 atom indices. This information is also stored in the molecule's 00205 RingInfo structure, so this argument is optional (see overload) 00206 00207 \return number of smallest rings found 00208 00209 Base algorithm: 00210 - The original algorithm starts by finding representative degree 2 00211 nodes. 00212 - Representative because if a series of deg 2 nodes are found only 00213 one of them is picked. 00214 - The smallest ring around each of them is found. 00215 - The bonds that connect to this degree 2 node are them chopped off, yielding 00216 new deg two nodes 00217 - The process is repeated on the new deg 2 nodes. 00218 - If no deg 2 nodes are found, a deg 3 node is picked. The smallest ring 00219 with it is found. A bond from this is "carefully" (look in the paper) 00220 selected and chopped, yielding deg 2 nodes. The process is same as 00221 above once this is done. 00222 00223 Our Modifications: 00224 - If available, more than one smallest ring around a representative deg 2 00225 node will be computed and stored 00226 - Typically 3 rings are found around a degree 3 node (when no deg 2s are available) 00227 and all the bond to that node are chopped. 00228 - The extra rings that were found in this process are removed after all the nodes 00229 have been covered. 00230 00231 These changes were motivated by several factors: 00232 - We believe the original algorithm fails to find the correct SSSR 00233 (finds the correct number of them but the wrong ones) on some sample mols 00234 - Since SSSR may not be unique, a post-SSSR step to symmetrize may be done. 00235 The extra rings this process adds can be quite useful. 00236 */ 00237 int findSSSR(const ROMol &mol, VECT_INT_VECT &res); 00238 //! \overload 00239 int findSSSR(const ROMol &mol, VECT_INT_VECT *res=0); 00240 00241 //! symmetrize the molecule's Smallest Set of Smallest Rings 00242 /*! 00243 SSSR rings obatined from "findSSSR" can be non-unique in some case. 00244 For example, cubane has five SSSR rings, not six as one would hope. 00245 00246 This function adds additional rings to the SSSR list if necessary 00247 to make the list symmetric, e.g. all atoms in cubane will be part of the same number 00248 of SSSRs. This function choses these extra rings from the extra rings computed 00249 and discarded during findSSSR. The new ring are chosen such that: 00250 - replacing a same sized ring in the SSSR list with an extra ring yields 00251 the same union of bond IDs as the orignal SSSR list 00252 00253 \param mol - the molecule of interest 00254 \param res used to return the vector of rings. Each entry is a vector with 00255 atom indices. This information is also stored in the molecule's 00256 RingInfo structure, so this argument is optional (see overload) 00257 00258 \return the total number of rings = (new rings + old SSSRs) 00259 00260 <b>Notes:</b> 00261 - if no SSSR rings are found on the molecule - MolOps::findSSSR() is called first 00262 */ 00263 int symmetrizeSSSR(ROMol &mol, VECT_INT_VECT &res); 00264 00265 //! Kekulizes the molecule 00266 /*! 00267 00268 \param mol the molecule of interest 00269 \param markAtomsBonds if this is set to true, \c isAromatic boolean settings 00270 on both the Bonds and Atoms are turned to false following 00271 the Kekulization, otherwise they are left alone in their 00272 original state. 00273 \param maxBackTracks the maximum number of attempts at back-tracking. The algorithm 00274 uses a back-tracking procedure to revist a previous setting of 00275 double bond if we hit a wall in the kekulization process 00276 00277 <b>Notes:</b> 00278 - even if \c markAtomsBonds is \c false the \c BondType for all aromatic 00279 bonds will be changed from \c RDKit::Bond::AROMATIC to \c RDKit::Bond::SINGLE 00280 or RDKit::Bond::DOUBLE during Kekulization. 00281 00282 */ 00283 void Kekulize(RWMol &mol, bool markAtomsBonds=true, unsigned int maxBackTracks=100); 00284 00285 //! flags the molecule's conjugated bonds 00286 void setConjugation(ROMol &mol); 00287 00288 //! returns whether or not the given Atom is involved in a conjugated bond 00289 bool atomHasConjugatedBond(const Atom *at); 00290 00291 00292 //! calculates and sets the hybridization of all a molecule's Stoms 00293 void setHybridization(ROMol &mol); 00294 00295 00296 //! \brief carries out a collection of tasks for cleaning up a molecule and ensuring 00297 //! that it makes "chemical sense" 00298 /*! 00299 This functions calls the following in sequence 00300 -# MolOps::cleanUp() 00301 -# MolOps::Kekulize() 00302 -# MolOps::setAromaticity() 00303 -# MolOps::setConjugation() 00304 -# MolOps::setHybridization() 00305 -# MolOps::cleanupChirality() 00306 -# MolOps::adjustHs() 00307 00308 \param mol the RWMol to be cleaned 00309 00310 <b>Notes:</b> 00311 - If there is a failure in the sanitization, a \c SanitException 00312 will be thrown. 00313 - in general the user of this function should cast the molecule following this 00314 function to a ROMol, so that new atoms and bonds cannot be added to the 00315 molecule and screw up the sanitizing that has been done here 00316 */ 00317 void sanitizeMol(RWMol &mol); 00318 00319 //! returns a molecule's adjacency matrix 00320 /*! 00321 \param mol the molecule of interest 00322 \param useBO toggles use of bond orders in the matrix 00323 \param emptyVal sets the empty value (for non-adjacent atoms) 00324 \param force forces calculation of the matrix, even if already computed 00325 \param propNamePrefix used to set the cached property name 00326 00327 \return the adjacency matrix. 00328 00329 <b>Notes</b> 00330 - The result of this is cached in the molecule's local property dictionary, 00331 which will handle deallocation. Do the caller should <b>not</b> \c delete 00332 this pointer. 00333 00334 */ 00335 double * getAdjacencyMatrix(const ROMol &mol, 00336 bool useBO=false, 00337 int emptyVal=0, 00338 bool force=false, 00339 const char *propNamePrefix=0); 00340 00341 //! Computes the molecule's topological distance matrix 00342 /*! 00343 Uses the Floyd-Warshall all-pairs-shortest-paths algorithm. 00344 00345 \param mol the molecule of interest 00346 \param useBO toggles use of bond orders in the matrix 00347 \param useAtomWts sets the diagonal elements of the result to 00348 6.0/(atomic number) so that the matrix can be used to calculate 00349 Balaban J values. This does not affect the bond weights. 00350 \param force forces calculation of the matrix, even if already computed 00351 \param propNamePrefix used to set the cached property name 00352 00353 \return the distance matrix. 00354 00355 <b>Notes</b> 00356 - The result of this is cached in the molecule's local property dictionary, 00357 which will handle deallocation. Do the caller should <b>not</b> \c delete 00358 this pointer. 00359 00360 00361 */ 00362 double *getDistanceMat(const ROMol &mol, 00363 bool useBO=false, 00364 bool useAtomWts=false, 00365 bool force=false, 00366 const char *propNamePrefix=0); 00367 00368 00369 //! Computes the molecule's topological distance matrix 00370 /*! 00371 Uses the Floyd-Warshall all-pairs-shortest-paths algorithm. 00372 00373 \param mol the molecule of interest 00374 \param activeAtoms only elements corresponding to these atom indices 00375 will be included in the calculation 00376 \param bonds only bonds found in this list will be included in the 00377 calculation 00378 \param useBO toggles use of bond orders in the matrix 00379 \param useAtomWts sets the diagonal elements of the result to 00380 6.0/(atomic number) so that the matrix can be used to calculate 00381 Balaban J values. This does not affect the bond weights. 00382 00383 \return the distance matrix. 00384 00385 <b>Notes</b> 00386 - The results of this call are not cached, the caller <b>should</b> \c delete 00387 this pointer. 00388 00389 00390 */ 00391 double *getDistanceMat(const ROMol &mol, 00392 const std::vector<int> &activeAtoms, 00393 const std::vector<const Bond *> &bonds, 00394 bool useBO=false, 00395 bool useAtomWts=false); 00396 00397 00398 //! Find the shortest path between two atoms 00399 /*! 00400 Uses the Bellman-Ford algorithm 00401 00402 \param mol molecule of interest 00403 \param aid1 index of the first atom 00404 \param aid2 index of the second atom 00405 00406 \return an std::list with the indices of the atoms along the shortest 00407 path 00408 00409 <b>Notes:</b> 00410 - the starting and end atoms are not include in the path 00411 00412 */ 00413 INT_LIST getShortestPath(const ROMol &mol, int aid1, int aid2); 00414 00415 //! calculates a set of molecular discriminators from the distance matrix 00416 /*! 00417 Computes: 00418 -# BalabanJ 00419 -# the first eigenvalue of the distance matrix 00420 -# the last but one eigenvalue of the distance matrix 00421 00422 \param mol the molecule of interest 00423 \param useBO toggles inclusion of the bond order in the discriminators 00424 (when false, the discriminators are purely topological) 00425 \param force forces the calculation (instead of using cached results) 00426 00427 \return a \c DiscrimTuple with the results 00428 00429 */ 00430 DiscrimTuple computeDiscriminators(const ROMol &mol, 00431 bool useBO=true, 00432 bool force=false); 00433 //! \brief Same as MolOps::computeDiscriminators(const ROMol &mol), 00434 //! except that this directly uses the user-supplied distance matrix 00435 DiscrimTuple computeDiscriminators(double *distMat, int nb, int na); 00436 00437 00438 //! calculates Balaban's J index for the molecule 00439 /*! 00440 \param mol the molecule of interest 00441 \param useBO toggles inclusion of the bond order in the calculation 00442 (when false, we're not really calculating the J value) 00443 \param force forces the calculation (instead of using cached results) 00444 \param bondPath when included, only paths using bonds whose indices occur 00445 in this vector will be included in the calculation 00446 \param cacheIt If this is true, the calculated value will be cached 00447 as a property on the molecule 00448 \return the J index 00449 00450 */ 00451 double computeBalabanJ(const ROMol &mol, 00452 bool useBO=true, 00453 bool force=false, 00454 const std::vector<int> *bondPath=0, 00455 bool cacheIt=true); 00456 00457 00458 //! finds a molecule's minimium spanning tree (MST) 00459 /*! 00460 \param mol the molecule of interest 00461 \param mst used to return the MST as a vector of bond indices 00462 */ 00463 void findSpanningTree(const ROMol &mol,INT_VECT &mst); 00464 00465 //! assign a canonical ordering to a molecule's atoms 00466 /*! 00467 The algorithm used here is a modification of the published Daylight canonical 00468 smiles algorithm (i.e. it uses atom invariants and products of primes). 00469 00470 \param mol the molecule of interest 00471 \param ranks used to return the ranks 00472 \param invariantType the type of invariant to be computed 00473 \param breakTies toggles breaking of ties (see below) 00474 \param rankHistory used to return the rank history (see below) 00475 \param includeChirality toggles inclusion of chirality in the atom ranking 00476 (see below) 00477 00478 <b>Notes:</b> 00479 - Tie breaking should be done when it's important to have a full ordering 00480 of the atoms (e.g. when generating canonical traversal trees). If it's 00481 acceptable to have ties between symmetry-equivalent atoms (e.g. when 00482 generating CIP codes), tie breaking can/should be skipped. 00483 - if the \c rankHistory argument is provided, the evolution of the ranks of 00484 individual atoms will be tracked. The \c rankHistory pointer should be 00485 to a VECT_INT_VECT that has at least \c mol.getNumAtoms() elements. 00486 - the \c includeChirality argument only makes sense when generating full 00487 invariants (i.e. ComprehensiveInvariant) 00488 */ 00489 void rankAtoms(const ROMol &mol,INT_VECT &ranks, 00490 AtomInvariantType invariantType=ComprehensiveInvariants, 00491 bool breakTies=true,VECT_INT_VECT *rankHistory=0, 00492 bool includeChirality=false); 00493 00494 //! calculates a set of atom invariants 00495 /*! 00496 These invariants include terms for: 00497 -# degree 00498 -# explicit valence 00499 -# atomic number 00500 -# isotope 00501 -# number of hs 00502 -# size of rings an atom is involved in 00503 -# formal charge 00504 -# chiral code (optional) 00505 -# whether of not the atom determines the stereochemistry of a 00506 double bond. (optional) 00507 00508 \param mol the molecule of interest 00509 \param res used to return the results 00510 \param includeChirality toggles inclusion of chirality, see MolOps::rankAtoms() 00511 for an explanation. 00512 */ 00513 void buildAtomInvariants(const ROMol &mol,INVAR_VECT &res, 00514 bool includeChirality=false); 00515 //! calculates a set of chiral atom invariants 00516 /*! 00517 These are based upon Labute's proposal in the article: 00518 "An Efficient Algorithm for the Determination of Topological 00519 RS Chirality" <i>Journal of the CCG</i> (1996) 00520 and include terms for: 00521 -# atomic number 00522 -# isotope 00523 -# number of multiple bonds from/to this atom 00524 -# degree 00525 -# number of hydrogens 00526 00527 \param mol the molecule of interest 00528 \param res used to return the results 00529 */ 00530 void buildChiralAtomInvariants(const ROMol &mol, 00531 INVAR_VECT &res); 00532 00533 //! calculate CIP (R/S) codes for a molecule's chiral centers 00534 /*! 00535 \param mol the molecule of interest 00536 \param cleanIt toggles removal of chiral flags from atoms that do 00537 not have four unique neighbors 00538 \param force forces the calculation to be repeated even if it has 00539 already been done 00540 00541 <b>Notes:</b> 00542 - The codes are stored on atoms as a string property named 00543 "_CIPCode". 00544 - All atoms also gain a property "_CIPRank" with their overall 00545 CIP ranking. 00546 - Throughout we assume that we're working with a hydrogen-suppressed 00547 graph. 00548 */ 00549 void assignAtomChiralCodes(ROMol &mol,bool cleanIt=false,bool force=false); 00550 //! calculate CIP (Z/E) codes for a molecule's double bonds 00551 /*! 00552 \param mol the molecule of interest 00553 \param cleanIt toggles removal of stereo flags from double bonds that can 00554 not have stereochemistry 00555 \param force forces the calculation to be repeated even if it has 00556 already been done 00557 00558 <b>Notes:M</b> 00559 - Throughout we assume that we're working with a hydrogen-suppressed 00560 graph. 00561 00562 */ 00563 void assignBondStereoCodes(ROMol &mol,bool cleanIt=false,bool force=false); 00564 //! \brief finds bonds that could be cis/trans in a molecule and mark them as 00565 //! Bond::STEREONONE 00566 /*! 00567 \param mol the molecule of interest 00568 \param cleanIt toggles removal of stereo flags from double bonds that can 00569 not have stereochemistry 00570 00571 This function is usefuly in two situations 00572 - when parsing a mol file; for the bonds marked here, coordinate informations 00573 on the neighbors can be used to indentify cis or trans states 00574 - when writing a mol file; bonds that can be cis/trans but not marked as either 00575 need to be specially marked in the mol file 00576 */ 00577 void findPotentialStereoBonds(ROMol &mol,bool cleanIt=false); 00578 //! sums up all atomic formal charges and returns the result 00579 int getFormalCharge(const ROMol &mol); 00580 00581 //! removes bogus chirality markers (those on non-sp3 centers): 00582 void cleanupChirality(RWMol &mol); 00583 00584 }; // end of namespace MolOps 00585 }; // end of namespace RDKit 00586 00587 #endif
1.5.3