00001 // 00002 // Copyright (C) 2001-2009 Greg Landrum and Rational Discovery LLC 00003 // 00004 // @@ All Rights Reserved @@ 00005 // 00006 #ifndef _RD_MOL_OPS_H_ 00007 #define _RD_MOL_OPS_H_ 00008 00009 #include <RDGeneral/types.h> 00010 #include <boost/tuple/tuple.hpp> 00011 #include <boost/smart_ptr.hpp> 00012 00013 extern const int ci_LOCAL_INF; 00014 namespace RDKit{ 00015 class ROMol; 00016 class RWMol; 00017 class Atom; 00018 class Bond; 00019 typedef std::vector<double> INVAR_VECT; 00020 typedef INVAR_VECT::iterator INVAR_VECT_I; 00021 typedef INVAR_VECT::const_iterator INVAR_VECT_CI; 00022 00023 00024 //! used to return atomic discriminators (three doubles) 00025 typedef boost::tuples::tuple<double,double,double> DiscrimTuple; 00026 00027 //! \brief Groups a variety of molecular query and transformation operations. 00028 namespace MolOps { 00029 00030 //! return the number of electrons available on an atom to donate for aromaticity 00031 /*! 00032 The result is determined using the default valency, number of lone pairs, 00033 number of bonds and the formal charge. Note that the atom may not donate 00034 all of these electrons to a ring for aromaticity (also used in Conjugation 00035 and hybridization code). 00036 00037 \param at the atom of interest 00038 00039 \return the number of electrons 00040 */ 00041 int countAtomElec(const Atom *at); 00042 00043 //! sums up all atomic formal charges and returns the result 00044 int getFormalCharge(const ROMol &mol); 00045 00046 //! returns whether or not the given Atom is involved in a conjugated bond 00047 bool atomHasConjugatedBond(const Atom *at); 00048 00049 //! find fragments (disconnected components of the molecular graph) 00050 /*! 00051 00052 \param mol the molecule of interest 00053 \param mapping used to return the mapping of Atoms->fragments. 00054 On return \c mapping will be <tt>mol->getNumAtoms()</tt> long 00055 and will contain the fragment assignment for each Atom 00056 00057 \return the number of fragments found. 00058 00059 */ 00060 unsigned int getMolFrags(const ROMol &mol,INT_VECT &mapping); 00061 //! find fragments (disconnected components of the molecular graph) 00062 /*! 00063 00064 \param mol the molecule of interest 00065 \param frags used to return the Atoms in each fragment 00066 On return \c mapping will be \c numFrags long, and each entry 00067 will contain the indices of the Atoms in that fragment. 00068 00069 \return the number of fragments found. 00070 00071 */ 00072 unsigned int getMolFrags(const ROMol &mol, VECT_INT_VECT &frags); 00073 00074 //! splits a molecule into its component fragments 00075 // (disconnected components of the molecular graph) 00076 /*! 00077 00078 \param mol the molecule of interest 00079 \param sanitizeFrags toggles sanitization of the fragments after 00080 they are built 00081 \param frags used to return the mapping of Atoms->fragments. 00082 if provided, \c frags will be <tt>mol->getNumAtoms()</tt> long 00083 on return and will contain the fragment assignment for each Atom 00084 00085 \return a vector of the fragments as smart pointers to ROMols 00086 00087 */ 00088 std::vector<boost::shared_ptr<ROMol> > getMolFrags(const ROMol &mol, 00089 bool sanitizeFrags=true, 00090 INT_VECT *frags=0); 00091 00092 #if 0 00093 //! finds a molecule's minimium spanning tree (MST) 00094 /*! 00095 \param mol the molecule of interest 00096 \param mst used to return the MST as a vector of bond indices 00097 */ 00098 void findSpanningTree(const ROMol &mol,INT_VECT &mst); 00099 #endif 00100 00101 //! calculates a set of molecular discriminators from the distance matrix 00102 /*! 00103 Computes: 00104 -# BalabanJ 00105 -# the first eigenvalue of the distance matrix 00106 -# the last but one eigenvalue of the distance matrix 00107 00108 \param mol the molecule of interest 00109 \param useBO toggles inclusion of the bond order in the discriminators 00110 (when false, the discriminators are purely topological) 00111 \param force forces the calculation (instead of using cached results) 00112 00113 \return a \c DiscrimTuple with the results 00114 00115 */ 00116 DiscrimTuple computeDiscriminators(const ROMol &mol, 00117 bool useBO=true, 00118 bool force=false); 00119 //! \brief Same as MolOps::computeDiscriminators(const ROMol &mol), 00120 //! except that this directly uses the user-supplied distance matrix 00121 DiscrimTuple computeDiscriminators(double *distMat, unsigned int nb, unsigned int na); 00122 00123 00124 //! calculates Balaban's J index for the molecule 00125 /*! 00126 \param mol the molecule of interest 00127 \param useBO toggles inclusion of the bond order in the calculation 00128 (when false, we're not really calculating the J value) 00129 \param force forces the calculation (instead of using cached results) 00130 \param bondPath when included, only paths using bonds whose indices occur 00131 in this vector will be included in the calculation 00132 \param cacheIt If this is true, the calculated value will be cached 00133 as a property on the molecule 00134 \return the J index 00135 00136 */ 00137 double computeBalabanJ(const ROMol &mol, 00138 bool useBO=true, 00139 bool force=false, 00140 const std::vector<int> *bondPath=0, 00141 bool cacheIt=true); 00142 00143 00144 00145 //! \name Dealing with hydrogens 00146 //{@ 00147 00148 //! returns a copy of a molecule with hydrogens added in as explicit Atoms 00149 /*! 00150 \param mol the molecule to add Hs to 00151 \param explicitOnly (optional) if this \c true, only explicit Hs will be added 00152 \param addCoords (optional) If this is true, estimates for the atomic coordinates 00153 of the added Hs will be used. 00154 00155 \return the new molecule 00156 00157 <b>Notes:</b> 00158 - it makes no sense to use the \c addCoords option if the molecule's heavy 00159 atoms don't already have coordinates. 00160 - the caller is responsible for <tt>delete</tt>ing the pointer this returns. 00161 */ 00162 ROMol *addHs(const ROMol &mol,bool explicitOnly=false,bool addCoords=false); 00163 00164 00165 //! returns a copy of a molecule with hydrogens removed 00166 /*! 00167 \param mol the molecule to remove Hs from 00168 \param implicitOnly (optional) if this \c true, only implicit Hs will be removed 00169 \param updateExplicitCount (optional) If this is \c true, when explicit Hs are removed 00170 from the graph, the heavy atom to which they are bound will have its counter of 00171 explicit Hs increased. 00172 \param sanitize: (optional) If this is \c true, the final molecule will be 00173 sanitized 00174 00175 \return the new molecule 00176 00177 <b>Notes:</b> 00178 - Hydrogens which aren't connected to a heavy atom will not be 00179 removed. This prevents molecules like <tt>"[H][H]"</tt> from having 00180 all atoms removed. 00181 - Labelled hydrogen (e.g. atoms with atomic number=1, but mass > 1), 00182 will not be removed. 00183 00184 - the caller is responsible for <tt>delete</tt>ing the pointer this returns. 00185 */ 00186 ROMol *removeHs(const ROMol &mol,bool implicitOnly=false, 00187 bool updateExplicitCount=false,bool sanitize=true); 00188 00189 //! returns a copy of a molecule with hydrogens removed and added as queries 00190 //! to the heavy atoms to which they are bound. 00191 /*! 00192 This is really intended to be used with molecules that contain QueryAtoms 00193 00194 \param mol the molecule to remove Hs from 00195 00196 \return the new molecule 00197 00198 <b>Notes:</b> 00199 - Atoms that do not already have hydrogen count queries will have one 00200 added, other H-related queries will not be touched. Examples: 00201 - C[H] -> [C;!H0] 00202 - [C;H1][H] -> [C;H1] 00203 - [C;H2][H] -> [C;H2] 00204 - Hydrogens which aren't connected to a heavy atom will not be 00205 removed. This prevents molecules like <tt>"[H][H]"</tt> from having 00206 all atoms removed. 00207 - the caller is responsible for <tt>delete</tt>ing the pointer this returns. 00208 00209 */ 00210 ROMol *mergeQueryHs(const ROMol &mol); 00211 //@} 00212 00213 //! \name Sanitization 00214 //@{ 00215 00216 //! \brief carries out a collection of tasks for cleaning up a molecule and ensuring 00217 //! that it makes "chemical sense" 00218 /*! 00219 This functions calls the following in sequence 00220 -# MolOps::cleanUp() 00221 -# MolOps::Kekulize() 00222 -# MolOps::setAromaticity() 00223 -# MolOps::setConjugation() 00224 -# MolOps::setHybridization() 00225 -# MolOps::cleanupChirality() 00226 -# MolOps::adjustHs() 00227 00228 \param mol the RWMol to be cleaned 00229 00230 <b>Notes:</b> 00231 - If there is a failure in the sanitization, a \c SanitException 00232 will be thrown. 00233 - in general the user of this function should cast the molecule following this 00234 function to a ROMol, so that new atoms and bonds cannot be added to the 00235 molecule and screw up the sanitizing that has been done here 00236 */ 00237 void sanitizeMol(RWMol &mol); 00238 00239 //! Sets up the aromaticity for a molecule 00240 /*! 00241 00242 This is what happens here: 00243 -# find all the simple rings by calling the findSSSR function 00244 -# loop over all the Atoms in each ring and mark them if they are candidates 00245 for aromaticity. A ring atom is a candidate if it can spare electrons 00246 to the ring and if it's from the first two rows of the periodic table. 00247 -# ased on the candidate atoms, mark the rings to be either candidates 00248 or non-candidates. A ring is a candidate only if all its atoms are candidates 00249 -# apply Hueckel rule to each of the candidate rings to check if the ring can be 00250 aromatic 00251 00252 \param mol the RWMol of interest 00253 00254 \return 1 on succes, 0 otherwise 00255 00256 <b>Assumptions:</b> 00257 - Kekulization has been done (i.e. \c MolOps::Kekulize() has already 00258 been called) 00259 00260 */ 00261 int setAromaticity(RWMol &mol); 00262 00263 00264 //! Designed to be called by the sanitizer to handle special cases before anything is done. 00265 /*! 00266 00267 Currently this: 00268 - modifies nitro groups, so that the nitrogen does not have a unreasonable 00269 valence of 5, as follows: 00270 - the nitrogen gets a positve charge 00271 - one of the oxygens gets a negative chage and the double bond to this 00272 oxygen is changed to a single bond 00273 The net result is that nitro groups can be counted on to be: 00274 \c "[N+](=O)[O-]" 00275 00276 \param mol the molecule of interest 00277 00278 */ 00279 void cleanUp(RWMol &mol); 00280 00281 00282 //! adjust the number of implicit and explicit Hs for special cases 00283 /*! 00284 00285 Currently this: 00286 - modifies aromatic nitrogens so that, when appropriate, they have an 00287 explicit H marked (e.g. so that we get things like \c "c1cc[nH]cc1" 00288 00289 \param mol the molecule of interest 00290 00291 <b>Assumptions</b> 00292 - this is called after the molecule has been sanitized, 00293 aromaticity has been perceived, and the implicit valence of 00294 everything has been calculated. 00295 00296 */ 00297 void adjustHs(RWMol &mol); 00298 00299 //! Kekulizes the molecule 00300 /*! 00301 00302 \param mol the molecule of interest 00303 \param markAtomsBonds if this is set to true, \c isAromatic boolean settings 00304 on both the Bonds and Atoms are turned to false following 00305 the Kekulization, otherwise they are left alone in their 00306 original state. 00307 \param maxBackTracks the maximum number of attempts at back-tracking. The algorithm 00308 uses a back-tracking procedure to revist a previous setting of 00309 double bond if we hit a wall in the kekulization process 00310 00311 <b>Notes:</b> 00312 - even if \c markAtomsBonds is \c false the \c BondType for all aromatic 00313 bonds will be changed from \c RDKit::Bond::AROMATIC to \c RDKit::Bond::SINGLE 00314 or RDKit::Bond::DOUBLE during Kekulization. 00315 00316 */ 00317 void Kekulize(RWMol &mol, bool markAtomsBonds=true, unsigned int maxBackTracks=100); 00318 00319 //! flags the molecule's conjugated bonds 00320 void setConjugation(ROMol &mol); 00321 00322 //! calculates and sets the hybridization of all a molecule's Stoms 00323 void setHybridization(ROMol &mol); 00324 00325 00326 // @} 00327 00328 //! \name Ring finding and SSSR 00329 //@{ 00330 00331 //! finds a molecule's Smallest Set of Smallest Rings 00332 /*! 00333 Currently this implements a modified form of Figueras algorithm 00334 (JCICS - Vol. 36, No. 5, 1996, 986-991) 00335 00336 \param mol the molecule of interest 00337 \param res used to return the vector of rings. Each entry is a vector with 00338 atom indices. This information is also stored in the molecule's 00339 RingInfo structure, so this argument is optional (see overload) 00340 00341 \return number of smallest rings found 00342 00343 Base algorithm: 00344 - The original algorithm starts by finding representative degree 2 00345 nodes. 00346 - Representative because if a series of deg 2 nodes are found only 00347 one of them is picked. 00348 - The smallest ring around each of them is found. 00349 - The bonds that connect to this degree 2 node are them chopped off, yielding 00350 new deg two nodes 00351 - The process is repeated on the new deg 2 nodes. 00352 - If no deg 2 nodes are found, a deg 3 node is picked. The smallest ring 00353 with it is found. A bond from this is "carefully" (look in the paper) 00354 selected and chopped, yielding deg 2 nodes. The process is same as 00355 above once this is done. 00356 00357 Our Modifications: 00358 - If available, more than one smallest ring around a representative deg 2 00359 node will be computed and stored 00360 - Typically 3 rings are found around a degree 3 node (when no deg 2s are available) 00361 and all the bond to that node are chopped. 00362 - The extra rings that were found in this process are removed after all the nodes 00363 have been covered. 00364 00365 These changes were motivated by several factors: 00366 - We believe the original algorithm fails to find the correct SSSR 00367 (finds the correct number of them but the wrong ones) on some sample mols 00368 - Since SSSR may not be unique, a post-SSSR step to symmetrize may be done. 00369 The extra rings this process adds can be quite useful. 00370 */ 00371 int findSSSR(const ROMol &mol, VECT_INT_VECT &res); 00372 //! \overload 00373 int findSSSR(const ROMol &mol, VECT_INT_VECT *res=0); 00374 00375 //! symmetrize the molecule's Smallest Set of Smallest Rings 00376 /*! 00377 SSSR rings obatined from "findSSSR" can be non-unique in some case. 00378 For example, cubane has five SSSR rings, not six as one would hope. 00379 00380 This function adds additional rings to the SSSR list if necessary 00381 to make the list symmetric, e.g. all atoms in cubane will be part of the same number 00382 of SSSRs. This function choses these extra rings from the extra rings computed 00383 and discarded during findSSSR. The new ring are chosen such that: 00384 - replacing a same sized ring in the SSSR list with an extra ring yields 00385 the same union of bond IDs as the orignal SSSR list 00386 00387 \param mol - the molecule of interest 00388 \param res used to return the vector of rings. Each entry is a vector with 00389 atom indices. This information is also stored in the molecule's 00390 RingInfo structure, so this argument is optional (see overload) 00391 00392 \return the total number of rings = (new rings + old SSSRs) 00393 00394 <b>Notes:</b> 00395 - if no SSSR rings are found on the molecule - MolOps::findSSSR() is called first 00396 */ 00397 int symmetrizeSSSR(ROMol &mol, VECT_INT_VECT &res); 00398 //! \overload 00399 int symmetrizeSSSR(ROMol &mol); 00400 00401 //@} 00402 00403 //! \name Shortest paths and other matrices 00404 //@{ 00405 00406 //! returns a molecule's adjacency matrix 00407 /*! 00408 \param mol the molecule of interest 00409 \param useBO toggles use of bond orders in the matrix 00410 \param emptyVal sets the empty value (for non-adjacent atoms) 00411 \param force forces calculation of the matrix, even if already computed 00412 \param propNamePrefix used to set the cached property name 00413 00414 \return the adjacency matrix. 00415 00416 <b>Notes</b> 00417 - The result of this is cached in the molecule's local property dictionary, 00418 which will handle deallocation. Do the caller should <b>not</b> \c delete 00419 this pointer. 00420 00421 */ 00422 double * getAdjacencyMatrix(const ROMol &mol, 00423 bool useBO=false, 00424 int emptyVal=0, 00425 bool force=false, 00426 const char *propNamePrefix=0); 00427 00428 //! Computes the molecule's topological distance matrix 00429 /*! 00430 Uses the Floyd-Warshall all-pairs-shortest-paths algorithm. 00431 00432 \param mol the molecule of interest 00433 \param useBO toggles use of bond orders in the matrix 00434 \param useAtomWts sets the diagonal elements of the result to 00435 6.0/(atomic number) so that the matrix can be used to calculate 00436 Balaban J values. This does not affect the bond weights. 00437 \param force forces calculation of the matrix, even if already computed 00438 \param propNamePrefix used to set the cached property name 00439 00440 \return the distance matrix. 00441 00442 <b>Notes</b> 00443 - The result of this is cached in the molecule's local property dictionary, 00444 which will handle deallocation. Do the caller should <b>not</b> \c delete 00445 this pointer. 00446 00447 00448 */ 00449 double *getDistanceMat(const ROMol &mol, 00450 bool useBO=false, 00451 bool useAtomWts=false, 00452 bool force=false, 00453 const char *propNamePrefix=0); 00454 00455 00456 //! Computes the molecule's topological distance matrix 00457 /*! 00458 Uses the Floyd-Warshall all-pairs-shortest-paths algorithm. 00459 00460 \param mol the molecule of interest 00461 \param activeAtoms only elements corresponding to these atom indices 00462 will be included in the calculation 00463 \param bonds only bonds found in this list will be included in the 00464 calculation 00465 \param useBO toggles use of bond orders in the matrix 00466 \param useAtomWts sets the diagonal elements of the result to 00467 6.0/(atomic number) so that the matrix can be used to calculate 00468 Balaban J values. This does not affect the bond weights. 00469 00470 \return the distance matrix. 00471 00472 <b>Notes</b> 00473 - The results of this call are not cached, the caller <b>should</b> \c delete 00474 this pointer. 00475 00476 00477 */ 00478 double *getDistanceMat(const ROMol &mol, 00479 const std::vector<int> &activeAtoms, 00480 const std::vector<const Bond *> &bonds, 00481 bool useBO=false, 00482 bool useAtomWts=false); 00483 00484 00485 //! Find the shortest path between two atoms 00486 /*! 00487 Uses the Bellman-Ford algorithm 00488 00489 \param mol molecule of interest 00490 \param aid1 index of the first atom 00491 \param aid2 index of the second atom 00492 00493 \return an std::list with the indices of the atoms along the shortest 00494 path 00495 00496 <b>Notes:</b> 00497 - the starting and end atoms are included in the path 00498 - if no path is found, an empty path is returned 00499 00500 */ 00501 INT_LIST getShortestPath(const ROMol &mol, int aid1, int aid2); 00502 00503 //@} 00504 00505 //! \name Canonicalization 00506 //@{ 00507 00508 //! assign a canonical ordering to a molecule's atoms 00509 /*! 00510 The algorithm used here is a modification of the published Daylight canonical 00511 smiles algorithm (i.e. it uses atom invariants and products of primes). 00512 00513 \param mol the molecule of interest 00514 \param ranks used to return the ranks 00515 \param breakTies toggles breaking of ties (see below) 00516 \param rankHistory used to return the rank history (see below) 00517 00518 <b>Notes:</b> 00519 - Tie breaking should be done when it's important to have a full ordering 00520 of the atoms (e.g. when generating canonical traversal trees). If it's 00521 acceptable to have ties between symmetry-equivalent atoms (e.g. when 00522 generating CIP codes), tie breaking can/should be skipped. 00523 - if the \c rankHistory argument is provided, the evolution of the ranks of 00524 individual atoms will be tracked. The \c rankHistory pointer should be 00525 to a VECT_INT_VECT that has at least \c mol.getNumAtoms() elements. 00526 */ 00527 void rankAtoms(const ROMol &mol,INT_VECT &ranks, 00528 bool breakTies=true,VECT_INT_VECT *rankHistory=0); 00529 00530 // @} 00531 00532 //! \name Stereochemistry 00533 //@{ 00534 00535 //! removes bogus chirality markers (those on non-sp3 centers): 00536 void cleanupChirality(RWMol &mol); 00537 00538 //! \brief Uses a conformer to assign ChiralType to a molecule's atoms 00539 /*! 00540 \param mol the molecule of interest 00541 \param confId the conformer to use 00542 \param replaceExistingTags if this flag is true, any existing atomic chiral 00543 tags will be replaced 00544 00545 If the conformer provided is not a 3D conformer, nothing will be done. 00546 */ 00547 void assignChiralTypesFrom3D(ROMol &mol,int confId=-1,bool replaceExistingTags=true); 00548 00549 //! Assign stereochemistry tags to atoms (i.e. R/S) and bonds (i.e. Z/E) 00550 /*! 00551 00552 \param mol the molecule of interest 00553 \param cleanIt toggles removal of stereo flags from double bonds that can 00554 not have stereochemistry 00555 \param force forces the calculation to be repeated even if it has 00556 already been done 00557 00558 <b>Notes:M</b> 00559 - Throughout we assume that we're working with a hydrogen-suppressed 00560 graph. 00561 00562 */ 00563 void assignStereochemistry(ROMol &mol,bool cleanIt=false,bool force=false); 00564 //! Removes all stereochemistry information from atoms (i.e. R/S) and bonds (i.e. Z/E) 00565 /*! 00566 00567 \param mol the molecule of interest 00568 */ 00569 void removeStereochemistry(ROMol &mol); 00570 00571 //! \deprecated Use assignStereochemistry() instead 00572 static void assignAtomChiralCodes(ROMol &mol,bool cleanIt=false,bool force=false){ 00573 assignStereochemistry(mol,cleanIt,force); 00574 }; 00575 //! \deprecated Use assignStereochemistry() instead 00576 static void assignBondStereoCodes(ROMol &mol,bool cleanIt=false,bool force=false){ 00577 assignStereochemistry(mol,cleanIt,force); 00578 }; 00579 00580 00581 //! \brief finds bonds that could be cis/trans in a molecule and mark them as 00582 //! Bond::STEREONONE 00583 /*! 00584 \param mol the molecule of interest 00585 \param cleanIt toggles removal of stereo flags from double bonds that can 00586 not have stereochemistry 00587 00588 This function is usefuly in two situations 00589 - when parsing a mol file; for the bonds marked here, coordinate informations 00590 on the neighbors can be used to indentify cis or trans states 00591 - when writing a mol file; bonds that can be cis/trans but not marked as either 00592 need to be specially marked in the mol file 00593 */ 00594 void findPotentialStereoBonds(ROMol &mol,bool cleanIt=false); 00595 //@} 00596 00597 }; // end of namespace MolOps 00598 }; // end of namespace RDKit 00599 00600 #endif
1.5.6