MolOps.h

Go to the documentation of this file.
00001 //
00002 //  Copyright (C) 2001-2006 Rational Discovery LLC
00003 //
00004 //   @@ All Rights Reserved  @@
00005 //
00006 #ifndef _RD_MOL_OPS_H_
00007 #define _RD_MOL_OPS_H_
00008 
00009 #include <RDGeneral/types.h>
00010 #include <boost/tuple/tuple.hpp>
00011 
00012 extern const int ci_LOCAL_INF;
00013 namespace RDKit{
00014   class ROMol;
00015   class RWMol;
00016   class Atom;
00017   class Bond;
00018   typedef std::vector<double> INVAR_VECT;
00019   typedef INVAR_VECT::iterator INVAR_VECT_I;
00020   typedef INVAR_VECT::const_iterator INVAR_VECT_CI;
00021 
00022   //! used to request particular types of atom invariants
00023   typedef enum {
00024     ComprehensiveInvariants, //!< for general atomic ranking
00025     ChiralSearchInvariants,  //!< for ranking atoms for calculating CIP codes
00026   } AtomInvariantType;
00027 
00028   //! used to return atomic discriminators (three doubles)
00029   typedef boost::tuples::tuple<double,double,double> DiscrimTuple;
00030 
00031 
00032   //! \brief Groups a variety of molecular query and transformation operations.
00033   namespace MolOps {
00034     //! find fragments (disconnected components of the molecular graph)
00035     /*!
00036 
00037       \param mol     the molecule of interest
00038       \param mapping used to return the mapping of Atoms->fragments.
00039          On return \c mapping will be <tt>mol->getNumAtoms()</tt> long
00040          and will contain the fragment assignment for each Atom
00041 
00042       \return the number of fragments found.     
00043       
00044     */
00045     unsigned int getMolFrags(const ROMol &mol,INT_VECT &mapping);
00046     //! find fragments (disconnected components of the molecular graph)
00047     /*!
00048 
00049       \param mol    the molecule of interest
00050       \param frags  used to return the Atoms in each fragment
00051          On return \c mapping will be \c numFrags long, and each entry
00052          will contain the indices of the Atoms in that fragment.
00053 
00054       \return the number of fragments found.     
00055       
00056     */
00057     unsigned int getMolFrags(const ROMol &mol, VECT_INT_VECT &frags);
00058 
00059 
00060     //! returns a copy of a molecule with hydrogens added in as explicit Atoms
00061     /*!
00062         \param mol          the molecule to add Hs to
00063         \param explicitOnly (optional) if this \c true, only explicit Hs will be added
00064         \param addCoords    (optional) If this is true, estimates for the atomic coordinates
00065                     of the added Hs will be used.
00066      
00067         \return the new molecule 
00068 
00069         <b>Notes:</b>
00070            - it makes no sense to use the \c addCoords option if the molecule's heavy
00071              atoms don't already have coordinates.
00072            - the caller is responsible for <tt>delete</tt>ing the pointer this returns.
00073      */
00074     ROMol *addHs(const ROMol &mol,bool explicitOnly=false,bool addCoords=false);
00075 
00076 
00077     //! returns a copy of a molecule with hydrogens removed
00078     /*!
00079         \param mol          the molecule to remove Hs from
00080         \param implicitOnly (optional) if this \c true, only implicit Hs will be removed
00081         \param updateExplicitCount  (optional) If this is \c true, when explicit Hs are removed
00082              from the graph, the heavy atom to which they are bound will have its counter of
00083              explicit Hs increased.
00084         \param sanitize:  (optional) If this is \c true, the final molecule will be 
00085        sanitized
00086      
00087         \return the new molecule 
00088 
00089         <b>Notes:</b>
00090            - Hydrogens which aren't connected to a heavy atom will not be
00091              removed.  This prevents molecules like <tt>"[H][H]"</tt> from having
00092              all atoms removed.
00093            - Labelled hydrogen (e.g. atoms with atomic number=1, but mass > 1),
00094              will not be removed.
00095 
00096            - the caller is responsible for <tt>delete</tt>ing the pointer this returns.
00097     */
00098     ROMol *removeHs(const ROMol &mol,bool implicitOnly=false,
00099                            bool updateExplicitCount=false,bool sanitize=true);
00100 
00101     //! returns a copy of a molecule with hydrogens removed and added as queries
00102     //!  to the heavy atoms to which they are bound.
00103     /*!
00104       This is really intended to be used with molecules that contain QueryAtoms
00105       
00106       \param mol the molecule to remove Hs from
00107      
00108       \return the new molecule 
00109 
00110       <b>Notes:</b>
00111         - Atoms that do not already have hydrogen count queries will have one
00112                 added, other H-related queries will not be touched. Examples:
00113               - C[H] -> [C;!H0]
00114               - [C;H1][H] -> [C;H1]
00115               - [C;H2][H] -> [C;H2]
00116         - Hydrogens which aren't connected to a heavy atom will not be
00117           removed.  This prevents molecules like <tt>"[H][H]"</tt> from having
00118           all atoms removed.
00119               - the caller is responsible for <tt>delete</tt>ing the pointer this returns.
00120         
00121     */
00122     ROMol *mergeQueryHs(const ROMol &mol);
00123 
00124     //! return the number of electrons available on an atom to donate for aromaticity
00125     /*!
00126        The result is determined using the default valency, number of lone pairs,
00127        number of bonds and the formal charge. Note that the atom may not donate
00128        all of these electrons to a ring for aromaticity (also used in Conjugation
00129        and hybridization code).
00130 
00131        \param at the atom of interest
00132 
00133        \return the number of electrons
00134     */
00135     int countAtomElec(const Atom *at);
00136 
00137     //! Sets up the aromaticity for a molecule
00138     /*!
00139 
00140       This is what happens here:
00141          -# find all the simple rings by calling the findSSSR function
00142          -# loop over all the Atoms in each ring and mark them if they are candidates
00143             for aromaticity. A ring atom is a candidate if it can spare electrons
00144             to the ring and if it's from the first two rows of the periodic table.
00145          -# ased on the candidate atoms, mark the rings to be either candidates 
00146             or non-candidates. A ring is a candidate only if all its atoms are candidates
00147          -# apply Hueckel rule to each of the candidate rings to check if the ring can be
00148             aromatic
00149 
00150       \param mol the RWMol of interest
00151       
00152       \return 1 on succes, 0 otherwise
00153       
00154       <b>Assumptions:</b>
00155         - Kekulization has been done (i.e. \c MolOps::Kekulize() has already
00156           been called)
00157        
00158     */
00159     int setAromaticity(RWMol &mol);
00160 
00161 
00162     //! Designed to be called by the sanitizer to handle special cases before anything is done.
00163     /*!
00164 
00165         Currently this:
00166          - modifies nitro groups, so that the nitrogen does not have a unreasonable
00167            valence of 5, as follows:
00168              - the nitrogen gets a positve charge
00169              - one of the oxygens gets a negative chage and the double bond to this 
00170                oxygen is changed to a single bond
00171            The net result is that nitro groups can be counted on to be:
00172              \c "[N+](=O)[O-]"
00173 
00174        \param mol    the molecule of interest
00175      
00176     */
00177     void cleanUp(RWMol &mol);
00178 
00179 
00180     //! adjust the number of implicit and explicit Hs for special cases
00181     /*!
00182 
00183         Currently this:
00184          - modifies aromatic nitrogens so that, when appropriate, they have an
00185            explicit H marked (e.g. so that we get things like \c "c1cc[nH]cc1"
00186 
00187         \param mol    the molecule of interest
00188         
00189         <b>Assumptions</b>
00190            - this is called after the molecule has been sanitized,
00191              aromaticity has been perceived, and the implicit valence of
00192              everything has been calculated.
00193 
00194     */
00195     void adjustHs(RWMol &mol);
00196 
00197     //! finds a molecule's Smallest Set of Smallest Rings
00198     /*!
00199       Currently this implements a modified form of Figueras algorithm
00200         (JCICS - Vol. 36, No. 5, 1996, 986-991)
00201 
00202       \param mol the molecule of interest
00203       \param res used to return the vector of rings. Each entry is a vector with
00204           atom indices.  This information is also stored in the molecule's
00205           RingInfo structure, so this argument is optional (see overload)
00206           
00207       \return number of smallest rings found
00208 
00209       Base algorithm:
00210         - The original algorithm starts by finding representative degree 2
00211           nodes. 
00212         - Representative because if a series of deg 2 nodes are found only
00213           one of them is picked.
00214         - The smallest ring around each of them is found. 
00215         - The bonds that connect to this degree 2 node are them chopped off, yielding
00216           new deg two nodes 
00217         - The process is repeated on the new deg 2 nodes.
00218         - If no deg 2 nodes are found, a deg 3 node is picked. The smallest ring
00219           with it is found. A bond from this is "carefully" (look in the paper)
00220           selected and chopped, yielding deg 2 nodes. The process is same as 
00221           above once this is done.
00222       
00223       Our Modifications:
00224         - If available, more than one smallest ring around a representative deg 2
00225           node will be computed and stored
00226         - Typically 3 rings are found around a degree 3 node (when no deg 2s are available)
00227           and all the bond to that node are chopped.
00228         - The extra rings that were found in this process are removed after all the nodes
00229           have been covered.
00230 
00231       These changes were motivated by several factors:
00232         - We believe the original algorithm fails to find the correct SSSR
00233           (finds the correct number of them but the wrong ones) on some sample mols
00234         - Since SSSR may not be unique, a post-SSSR step to symmetrize may be done.
00235           The extra rings this process adds can be quite useful.
00236     */
00237     int findSSSR(const ROMol &mol, VECT_INT_VECT &res);
00238     //! \overload
00239     int findSSSR(const ROMol &mol, VECT_INT_VECT *res=0);
00240 
00241     //! symmetrize the molecule's Smallest Set of Smallest Rings
00242     /*!
00243        SSSR rings obatined from "findSSSR" can be non-unique in some case.
00244        For example, cubane has five SSSR rings, not six as one would hope.
00245        
00246        This function adds additional rings to the SSSR list if necessary 
00247        to make the list symmetric, e.g. all atoms in cubane will be part of the same number 
00248        of SSSRs. This function choses these extra rings from the extra rings computed
00249        and discarded during findSSSR. The new ring are chosen such that:
00250         - replacing a same sized ring in the SSSR list with an extra ring yields
00251           the same union of bond IDs as the orignal SSSR list
00252      
00253       \param mol - the molecule of interest
00254       \param res used to return the vector of rings. Each entry is a vector with
00255           atom indices.  This information is also stored in the molecule's
00256           RingInfo structure, so this argument is optional (see overload)
00257       
00258       \return the total number of rings = (new rings + old SSSRs)
00259      
00260       <b>Notes:</b>
00261        - if no SSSR rings are found on the molecule - MolOps::findSSSR() is called first
00262     */
00263     int symmetrizeSSSR(ROMol &mol, VECT_INT_VECT &res);
00264 
00265     //! Kekulizes the molecule
00266     /*!
00267 
00268        \param mol             the molecule of interest
00269        \param markAtomsBonds  if this is set to true, \c isAromatic boolean settings
00270                               on both the Bonds and Atoms are turned to false following
00271                               the Kekulization, otherwise they are left alone in their 
00272                               original state.
00273        \param maxBackTracks   the maximum number of attempts at back-tracking. The algorithm 
00274                               uses a back-tracking procedure to revist a previous setting of 
00275                               double bond if we hit a wall in the kekulization process
00276                               
00277        <b>Notes:</b>
00278          - even if \c markAtomsBonds is \c false the \c BondType for all aromatic
00279            bonds will be changed from \c RDKit::Bond::AROMATIC to \c RDKit::Bond::SINGLE
00280            or RDKit::Bond::DOUBLE during Kekulization.
00281 
00282     */
00283     void Kekulize(RWMol &mol, bool markAtomsBonds=true, unsigned int maxBackTracks=100);
00284 
00285     //! flags the molecule's conjugated bonds
00286     void setConjugation(ROMol &mol);
00287 
00288     //! returns whether or not the given Atom is involved in a conjugated bond
00289     bool atomHasConjugatedBond(const Atom *at);
00290 
00291 
00292     //! calculates and sets the hybridization of all a molecule's Stoms
00293     void setHybridization(ROMol &mol);
00294 
00295 
00296     //! \brief carries out a collection of tasks for cleaning up a molecule and ensuring
00297     //! that it makes "chemical sense"
00298     /*!
00299        This functions calls the following in sequence
00300          -# MolOps::cleanUp()
00301          -# MolOps::Kekulize()
00302          -# MolOps::setAromaticity()
00303          -# MolOps::setConjugation()
00304          -# MolOps::setHybridization()
00305          -# MolOps::cleanupChirality()
00306          -# MolOps::adjustHs()
00307          
00308        \param mol the RWMol to be cleaned
00309 
00310        <b>Notes:</b>
00311         - If there is a failure in the sanitization, a \c SanitException
00312           will be thrown.
00313         - in general the user of this function should cast the molecule following this 
00314           function to a ROMol, so that new atoms and bonds cannot be added to the 
00315           molecule and screw up the sanitizing that has been done here
00316     */
00317     void sanitizeMol(RWMol &mol);
00318 
00319     //! returns a molecule's adjacency matrix
00320     /*!
00321       \param mol             the molecule of interest
00322       \param useBO           toggles use of bond orders in the matrix
00323       \param emptyVal        sets the empty value (for non-adjacent atoms)
00324       \param force           forces calculation of the matrix, even if already computed
00325       \param propNamePrefix  used to set the cached property name
00326 
00327       \return the adjacency matrix.
00328 
00329       <b>Notes</b>
00330         - The result of this is cached in the molecule's local property dictionary,
00331           which will handle deallocation. Do the caller should <b>not</b> \c delete
00332           this pointer.
00333           
00334     */
00335     double * getAdjacencyMatrix(const ROMol &mol,
00336                                        bool useBO=false,
00337                                        int emptyVal=0,
00338                                        bool force=false,
00339                                        const char *propNamePrefix=0);
00340 
00341     //! Computes the molecule's topological distance matrix
00342     /*!
00343        Uses the Floyd-Warshall all-pairs-shortest-paths algorithm.
00344       
00345       \param mol             the molecule of interest
00346       \param useBO           toggles use of bond orders in the matrix
00347       \param useAtomWts      sets the diagonal elements of the result to
00348                6.0/(atomic number) so that the matrix can be used to calculate
00349                Balaban J values.  This does not affect the bond weights. 
00350       \param force           forces calculation of the matrix, even if already computed
00351       \param propNamePrefix  used to set the cached property name
00352 
00353       \return the distance matrix.
00354 
00355       <b>Notes</b>
00356         - The result of this is cached in the molecule's local property dictionary,
00357           which will handle deallocation. Do the caller should <b>not</b> \c delete
00358           this pointer.
00359           
00360      
00361     */
00362     double *getDistanceMat(const ROMol &mol,
00363                                   bool useBO=false,
00364                                   bool useAtomWts=false,
00365                                   bool force=false,
00366                                   const char *propNamePrefix=0);
00367 
00368 
00369     //! Computes the molecule's topological distance matrix
00370     /*!
00371        Uses the Floyd-Warshall all-pairs-shortest-paths algorithm.
00372       
00373       \param mol             the molecule of interest
00374       \param activeAtoms     only elements corresponding to these atom indices
00375                              will be included in the calculation
00376       \param bonds           only bonds found in this list will be included in the
00377                              calculation
00378       \param useBO           toggles use of bond orders in the matrix
00379       \param useAtomWts      sets the diagonal elements of the result to
00380                6.0/(atomic number) so that the matrix can be used to calculate
00381                Balaban J values.  This does not affect the bond weights. 
00382 
00383       \return the distance matrix.
00384 
00385       <b>Notes</b>
00386         - The results of this call are not cached, the caller <b>should</b> \c delete
00387           this pointer.
00388           
00389      
00390     */
00391     double *getDistanceMat(const ROMol &mol,
00392                                   const std::vector<int> &activeAtoms,
00393                                   const std::vector<const Bond *> &bonds,
00394                                   bool useBO=false,
00395                                   bool useAtomWts=false);
00396 
00397 
00398     //! Find the shortest path between two atoms
00399     /*!
00400       Uses the Bellman-Ford algorithm
00401 
00402      \param mol  molecule of interest
00403      \param aid1 index of the first atom
00404      \param aid2 index of the second atom
00405 
00406      \return an std::list with the indices of the atoms along the shortest
00407         path
00408 
00409      <b>Notes:</b>
00410        - the starting and end atoms are not include in the path
00411 
00412     */
00413     INT_LIST getShortestPath(const ROMol &mol, int aid1, int aid2);
00414 
00415     //! calculates a set of molecular discriminators from the distance matrix
00416     /*!
00417       Computes:
00418         -# BalabanJ 
00419         -# the first eigenvalue of the distance matrix 
00420         -# the last but one eigenvalue of the distance matrix 
00421 
00422       \param mol    the molecule of interest
00423       \param useBO  toggles inclusion of the bond order in the discriminators
00424                     (when false, the discriminators are purely topological)
00425       \param force  forces the calculation (instead of using cached results)
00426         
00427       \return a \c DiscrimTuple with the results
00428       
00429     */
00430     DiscrimTuple computeDiscriminators(const ROMol &mol, 
00431                                               bool useBO=true, 
00432                                               bool force=false);
00433     //! \brief Same as MolOps::computeDiscriminators(const ROMol &mol),
00434     //! except that this directly uses the user-supplied distance matrix
00435     DiscrimTuple computeDiscriminators(double *distMat, int nb, int na);
00436 
00437 
00438     //! calculates Balaban's J index for the molecule
00439     /*!
00440       \param mol      the molecule of interest
00441       \param useBO    toggles inclusion of the bond order in the calculation
00442                       (when false, we're not really calculating the J value)
00443       \param force    forces the calculation (instead of using cached results)
00444       \param bondPath when included, only paths using bonds whose indices occur
00445                       in this vector will be included in the calculation
00446       \param cacheIt  If this is true, the calculated value will be cached
00447                       as a property on the molecule
00448       \return the J index
00449       
00450     */
00451     double computeBalabanJ(const ROMol &mol, 
00452                                   bool useBO=true,
00453                                   bool force=false,
00454                                   const std::vector<int> *bondPath=0,
00455                                   bool cacheIt=true);
00456                                 
00457 
00458     //! finds a molecule's minimium spanning tree (MST)
00459     /*!
00460       \param mol  the molecule of interest
00461       \param mst  used to return the MST as a vector of bond indices
00462     */
00463     void findSpanningTree(const ROMol &mol,INT_VECT &mst);
00464 
00465     //! assign a canonical ordering to a molecule's atoms
00466     /*!
00467       The algorithm used here is a modification of the published Daylight canonical
00468       smiles algorithm (i.e. it uses atom invariants and products of primes).
00469       
00470       \param mol               the molecule of interest
00471       \param ranks             used to return the ranks
00472       \param invariantType     the type of invariant to be computed 
00473       \param breakTies         toggles breaking of ties (see below)
00474       \param rankHistory       used to return the rank history (see below)
00475       \param includeChirality  toggles inclusion of chirality in the atom ranking
00476                                (see below)
00477 
00478       <b>Notes:</b>
00479         - Tie breaking should be done when it's important to have a full ordering
00480           of the atoms (e.g. when generating canonical traversal trees). If it's
00481                 acceptable to have ties between symmetry-equivalent atoms (e.g. when
00482                 generating CIP codes), tie breaking can/should be skipped.
00483               - if the \c rankHistory argument is provided, the evolution of the ranks of
00484                 individual atoms will be tracked.  The \c rankHistory pointer should be
00485                 to a VECT_INT_VECT that has at least \c mol.getNumAtoms() elements.
00486         - the \c includeChirality argument only makes sense when generating full
00487           invariants (i.e. ComprehensiveInvariant)
00488     */
00489     void rankAtoms(const ROMol &mol,INT_VECT &ranks,
00490                           AtomInvariantType invariantType=ComprehensiveInvariants,
00491                           bool breakTies=true,VECT_INT_VECT *rankHistory=0,
00492                           bool includeChirality=false);
00493 
00494     //! calculates a set of atom invariants
00495     /*!
00496       These invariants include terms for:
00497         -# degree
00498         -# explicit valence
00499         -# atomic number
00500         -# isotope
00501         -# number of hs
00502         -# size of rings an atom is involved in
00503         -# formal charge
00504         -# chiral code (optional)
00505         -# whether of not the atom determines the stereochemistry of a
00506                    double bond. (optional)
00507       
00508       \param mol               the molecule of interest
00509       \param res               used to return the results
00510       \param includeChirality  toggles inclusion of chirality, see MolOps::rankAtoms()
00511                                 for an explanation.
00512     */
00513     void buildAtomInvariants(const ROMol &mol,INVAR_VECT &res,
00514                                     bool includeChirality=false);
00515     //! calculates a set of chiral atom invariants
00516     /*!
00517       These are based upon Labute's proposal in the article:
00518       "An Efficient Algorithm for the Determination of Topological
00519       RS Chirality" <i>Journal of the CCG</i> (1996)
00520       and include terms for:
00521         -# atomic number
00522         -# isotope
00523         -# number of multiple bonds from/to this atom
00524         -# degree
00525         -# number of hydrogens
00526 
00527       \param mol               the molecule of interest
00528       \param res               used to return the results
00529     */
00530     void buildChiralAtomInvariants(const ROMol &mol,
00531                                           INVAR_VECT &res);
00532 
00533     //! calculate CIP (R/S) codes for a molecule's chiral centers
00534     /*!
00535       \param mol     the molecule of interest
00536       \param cleanIt toggles removal of chiral flags from atoms that do
00537                      not have four unique neighbors
00538       \param force   forces the calculation to be repeated even if it has 
00539                      already been done 
00540 
00541       <b>Notes:</b>
00542         - The codes are stored on atoms as a string property named
00543           "_CIPCode".
00544         - All atoms also gain a property "_CIPRank" with their overall
00545           CIP ranking.
00546         - Throughout we assume that we're working with a hydrogen-suppressed
00547           graph.
00548     */
00549     void assignAtomChiralCodes(ROMol &mol,bool cleanIt=false,bool force=false);
00550     //! calculate CIP (Z/E) codes for a molecule's double bonds
00551     /*!
00552       \param mol     the molecule of interest
00553       \param cleanIt toggles removal of stereo flags from double bonds that can
00554                      not have stereochemistry
00555       \param force   forces the calculation to be repeated even if it has 
00556                      already been done 
00557 
00558       <b>Notes:M</b>
00559         - Throughout we assume that we're working with a hydrogen-suppressed
00560           graph.
00561 
00562     */
00563     void assignBondStereoCodes(ROMol &mol,bool cleanIt=false,bool force=false);
00564     //! \brief finds bonds that could be cis/trans in a molecule and mark them as
00565     //!  Bond::STEREONONE
00566     /*!
00567       \param mol     the molecule of interest
00568       \param cleanIt toggles removal of stereo flags from double bonds that can
00569                      not have stereochemistry
00570 
00571       This function is usefuly in two situations
00572         - when parsing a mol file; for the bonds marked here, coordinate informations 
00573           on the neighbors can be used to indentify cis or trans states
00574         - when writing a mol file; bonds that can be cis/trans but not marked as either 
00575           need to be specially marked in the mol file
00576     */
00577     void findPotentialStereoBonds(ROMol &mol,bool cleanIt=false);
00578     //! sums up all atomic formal charges and returns the result
00579     int getFormalCharge(const ROMol &mol);
00580 
00581     //! removes bogus chirality markers (those on non-sp3 centers):
00582     void cleanupChirality(RWMol &mol);
00583 
00584   }; // end of namespace MolOps
00585 }; // end of namespace RDKit
00586 
00587 #endif

Generated on Sat May 24 08:36:32 2008 for RDCode by  doxygen 1.5.3