Package Chem :: Module GraphDescriptors
[hide private]
[frames] | no frames]

Source Code for Module Chem.GraphDescriptors

  1  # $Id: GraphDescriptors.py 294 2007-07-20 03:39:13Z glandrum $ 
  2  # 
  3  # Copyright (C) 2003-2006 greg Landrum and Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved  @@ 
  6  # 
  7  """ Calculation of topological/topochemical descriptors. 
  8   
  9   
 10   
 11  """ 
 12  import Chem 
 13  from Chem import Graphs 
 14  from Chem import rdchem 
 15  # FIX: remove this dependency here and below 
 16  from Chem import pyPeriodicTable as PeriodicTable 
 17  from Numeric import * 
 18  from ML.InfoTheory import entropy 
 19   
 20  periodicTable = rdchem.GetPeriodicTable() 
 21   
 22  _log2val = log(2) 
23 -def _log2(x):
24 return log(x) / _log2val
25
26 -def _VertexDegrees(mat,onlyOnes=0):
27 """ *Internal Use Only* 28 29 this is just a row sum of the matrix... simple, neh? 30 31 """ 32 if not onlyOnes: 33 res = sum(mat) 34 else: 35 res = sum(equal(mat,1)) 36 return res
37
38 -def _NumAdjacencies(mol,dMat):
39 """ *Internal Use Only* 40 41 """ 42 res = mol.GetNumBonds() 43 return res
44
45 -def _GetCountDict(arr):
46 """ *Internal Use Only* 47 48 """ 49 res = {} 50 for v in arr: 51 res[v] = res.get(v,0)+1 52 return res
53 54
55 -def HallKierAlpha(m):
56 """ calculate the Hall-Kier alpha value for a molecule 57 58 From equations (58) of Rev. Comp. Chem. vol 2, 367-422, (1991) 59 60 """ 61 alphaSum = 0.0 62 rC = PeriodicTable.nameTable['C'][5] 63 for atom in m.GetAtoms(): 64 atNum=atom.GetAtomicNum() 65 if not atNum: continue 66 symb = atom.GetSymbol() 67 alphaV = PeriodicTable.hallKierAlphas.get(symb,None) 68 if alphaV is not None: 69 hyb = atom.GetHybridization()-2 70 if(hyb<len(alphaV)): 71 alpha = alphaV[hyb] 72 if alpha is None: 73 alpha = alphaV[-1] 74 else: 75 alpha = alphaV[-1] 76 else: 77 rA = PeriodicTable.nameTable[symb][5] 78 alpha = rA/rC - 1 79 alphaSum += alpha 80 return alphaSum
81 HallKierAlpha.version="1.0.2" 82
83 -def Ipc(mol, avg = 0, dMat = None, forceDMat = 0):
84 """This returns the information content of the coefficients of the characteristic 85 polynomial of the adjacency matrix of a hydrogen-suppressed graph of a molecule. 86 87 'avg = 1' returns the information content divided by the total population. 88 89 From D. Bonchev & N. Trinajstic, J. Chem. Phys. vol 67, 4517-4533 (1977) 90 91 """ 92 if forceDMat or dMat is None: 93 if forceDMat: 94 dMat = Chem.GetDistanceMatrix(mol,0) 95 mol._adjMat = dMat 96 else: 97 try: 98 dMat = mol._adjMat 99 except AttributeError: 100 dMat = Chem.GetDistanceMatrix(mol,0) 101 mol._adjMat = dMat 102 103 adjMat = equal(dMat,1) 104 cPoly = abs(Graphs.CharacteristicPolynomial(mol, adjMat)) 105 if avg: 106 return entropy.InfoEntropy(cPoly) 107 else: 108 return sum(cPoly)*entropy.InfoEntropy(cPoly)
109 Ipc.version="1.0.0" 110 111 112
113 -def Kappa1(mol):
114 """ Hall-Kier Kappa1 value 115 116 From equations (58) and (59) of Rev. Comp. Chem. vol 2, 367-422, (1991) 117 118 """ 119 P1 = mol.GetNumBonds(1) 120 A = mol.GetNumAtoms(onlyHeavy=1) 121 alpha = HallKierAlpha(mol) 122 denom = P1 + alpha 123 if denom: 124 kappa = (A + alpha)*(A + alpha - 1)**2 / denom**2 125 else: 126 kappa = 0.0 127 return kappa
128 Kappa1.version="1.0.0" 129 130
131 -def Kappa2(mol):
132 """ Hall-Kier Kappa2 value 133 134 From equations (58) and (60) of Rev. Comp. Chem. vol 2, 367-422, (1991) 135 136 """ 137 P2 = len(Chem.FindAllPathsOfLengthN(mol,2)) 138 A = mol.GetNumAtoms(onlyHeavy=1) 139 alpha = HallKierAlpha(mol) 140 denom = (P2 + alpha)**2 141 if denom: 142 kappa = (A + alpha - 1)*(A + alpha - 2)**2 / denom 143 else: 144 kappa = 0 145 return kappa
146 Kappa2.version="1.0.0" 147
148 -def Kappa3(mol):
149 """ Hall-Kier Kappa3 value 150 151 From equations (58), (61) and (62) of Rev. Comp. Chem. vol 2, 367-422, (1991) 152 153 """ 154 P3 = len(Chem.FindAllPathsOfLengthN(mol,3)) 155 A = mol.GetNumAtoms(1) 156 alpha = HallKierAlpha(mol) 157 denom = (P3 + alpha)**2 158 if denom: 159 if A % 2 == 1: 160 kappa = (A + alpha - 1)*(A + alpha - 3)**2 / denom 161 else: 162 kappa = (A + alpha - 2)*(A + alpha - 3)**2 / denom 163 else: 164 kappa = 0 165 return kappa
166 Kappa3.version="1.0.0" 167
168 -def Chi0(mol):
169 """ From equations (1),(9) and (10) of Rev. Comp. Chem. vol 2, 367-422, (1991) 170 171 """ 172 deltas = [x.GetDegree() for x in mol.GetAtoms()] 173 while 0 in deltas: 174 deltas.remove(0) 175 deltas = array(deltas,Float) 176 res = sum(sqrt(1./deltas)) 177 return res
178 Chi0.version="1.0.0" 179 180
181 -def Chi1(mol):
182 """ From equations (1),(11) and (12) of Rev. Comp. Chem. vol 2, 367-422, (1991) 183 184 """ 185 c1s = [x.GetBeginAtom().GetDegree()*x.GetEndAtom().GetDegree() for x in mol.GetBonds()] 186 while 0 in c1s: 187 c1s.remove(0) 188 c1s = array(c1s,Float) 189 res = sum(sqrt(1./c1s)) 190 return res
191 Chi1.version="1.0.0" 192
193 -def _nVal(atom):
194 return periodicTable.GetNOuterElecs(atom.GetAtomicNum())-atom.GetTotalNumHs()
195
196 -def _hkDeltas(mol,skipHs=1):
197 global periodicTable 198 res = [] 199 for atom in mol.GetAtoms(): 200 n = atom.GetAtomicNum() 201 if n>1: 202 nV = periodicTable.GetNOuterElecs(n) 203 nHs = atom.GetTotalNumHs() 204 if n <= 10: 205 # first row 206 res.append(float(nV-nHs)) 207 else: 208 # second row and up 209 res.append(float(nV-nHs)/float(n-nV-1)) 210 elif not skipHs: 211 res.append(0.0) 212 return res
213
214 -def Chi0v(mol):
215 """ From equations (5),(9) and (10) of Rev. Comp. Chem. vol 2, 367-422, (1991) 216 217 """ 218 deltas = _hkDeltas(mol) 219 while 0 in deltas: 220 deltas.remove(0) 221 res = sum(sqrt(1./array(deltas))) 222 return res
223 Chi0v.version="1.0.0" 224
225 -def Chi1v(mol):
226 """ From equations (5),(11) and (12) of Rev. Comp. Chem. vol 2, 367-422, (1991) 227 228 """ 229 deltas = array(_hkDeltas(mol,skipHs=0)) 230 res = 0.0 231 for bond in mol.GetBonds(): 232 v = deltas[bond.GetBeginAtomIdx()]*deltas[bond.GetEndAtomIdx()] 233 if v != 0.0: 234 res += sqrt(1./v) 235 return res
236 Chi1v.version="1.0.0" 237
238 -def ChiNv_(mol,order=2):
239 """ From equations (5),(15) and (16) of Rev. Comp. Chem. vol 2, 367-422, (1991) 240 241 **NOTE**: because the current path finding code does, by design, 242 detect rings as paths (e.g. in C1CC1 there is *1* atom path of 243 length 3), values of ChiNv with N >= 3 may give results that differ 244 from those provided by the old code in molecules that have rings of 245 size 3. 246 247 """ 248 deltas = array(_hkDeltas(mol,skipHs=0)) 249 accum = 0.0 250 #print 'DELTAS',deltas 251 for path in Chem.FindAllPathsOfLengthN(mol,order+1,useBonds=0): 252 ats = [] 253 cAccum = 1.0 254 #print 'PATH:',path 255 for idx in path: 256 cAccum *= deltas[idx] 257 if cAccum: 258 accum += 1./sqrt(cAccum) 259 return accum
260
261 -def Chi2v(mol):
262 """ From equations (5),(15) and (16) of Rev. Comp. Chem. vol 2, 367-422, (1991) 263 264 """ 265 return ChiNv_(mol,2)
266 Chi2v.version="1.0.0" 267
268 -def Chi3v(mol):
269 """ From equations (5),(15) and (16) of Rev. Comp. Chem. vol 2, 367-422, (1991) 270 271 """ 272 return ChiNv_(mol,3)
273 Chi3v.version="1.0.0" 274
275 -def Chi4v(mol):
276 """ From equations (5),(15) and (16) of Rev. Comp. Chem. vol 2, 367-422, (1991) 277 278 **NOTE**: because the current path finding code does, by design, 279 detect rings as paths (e.g. in C1CC1 there is *1* atom path of 280 length 3), values of Chi4v may give results that differ from those 281 provided by the old code in molecules that have 3 rings. 282 283 """ 284 return ChiNv_(mol,4)
285 Chi4v.version="1.0.0" 286
287 -def Chi0n(mol):
288 """ Similar to Hall Kier Chi0v, but uses nVal instead of valence 289 This makes a big difference after we get out of the first row. 290 291 """ 292 deltas = [_nVal(x) for x in mol.GetAtoms()] 293 while deltas.count(0): 294 deltas.remove(0) 295 deltas = array(deltas,Float) 296 res = sum(sqrt(1./deltas)) 297 return res
298 Chi0n.version="1.0.0" 299
300 -def Chi1n(mol):
301 """ Similar to Hall Kier Chi1v, but uses nVal instead of valence 302 303 """ 304 delts = array([_nVal(x) for x in mol.GetAtoms()],Float) 305 res = 0.0 306 for bond in mol.GetBonds(): 307 v = delts[bond.GetBeginAtomIdx()]*delts[bond.GetEndAtomIdx()] 308 if v != 0.0: 309 res += sqrt(1./v) 310 return res
311 Chi1n.version="1.0.0"
312 -def ChiNn_(mol,order=2):
313 """ Similar to Hall Kier ChiNv, but uses nVal instead of valence 314 This makes a big difference after we get out of the first row. 315 316 **NOTE**: because the current path finding code does, by design, 317 detect rings as paths (e.g. in C1CC1 there is *1* atom path of 318 length 3), values of ChiNn with N >= 3 may give results that differ 319 from those provided by the old code in molecules that have rings of 320 size 3. 321 322 """ 323 deltas = array([_nVal(x) for x in mol.GetAtoms()],Float) 324 accum = 0.0 325 for path in Chem.FindAllPathsOfLengthN(mol,order+1,useBonds=0): 326 cAccum = 1.0 327 for idx in path: 328 cAccum *= deltas[idx] 329 if cAccum: 330 accum += 1./sqrt(cAccum) 331 return accum
332
333 -def Chi2n(mol):
334 """ Similar to Hall Kier Chi2v, but uses nVal instead of valence 335 This makes a big difference after we get out of the first row. 336 337 """ 338 return ChiNn_(mol,2)
339 Chi2n.version="1.0.0" 340
341 -def Chi3n(mol):
342 """ Similar to Hall Kier Chi3v, but uses nVal instead of valence 343 This makes a big difference after we get out of the first row. 344 345 """ 346 return ChiNn_(mol,3)
347 Chi3n.version="1.0.0" 348
349 -def Chi4n(mol):
350 """ Similar to Hall Kier Chi4v, but uses nVal instead of valence 351 This makes a big difference after we get out of the first row. 352 353 354 **NOTE**: because the current path finding code does, by design, 355 detect rings as paths (e.g. in C1CC1 there is *1* atom path of 356 length 3), values of Chi4n may give results that differ from those 357 provided by the old code in molecules that have 3 rings. 358 359 """ 360 return ChiNn_(mol,4)
361 Chi4n.version="1.0.0" 362 363
364 -def BalabanJ(mol,dMat=None,forceDMat=0):
365 """ Calculate Balaban's J value for a molecule 366 367 **Arguments** 368 369 - mol: a molecule 370 371 - dMat: (optional) a distance/adjacency matrix for the molecule, if this 372 is not provide, one will be calculated 373 374 - forceDMat: (optional) if this is set, the distance/adjacency matrix 375 will be recalculated regardless of whether or not _dMat_ is provided 376 or the molecule already has one 377 378 **Returns** 379 380 - a float containing the J value 381 382 We follow the notation of Balaban's paper: 383 Chem. Phys. Lett. vol 89, 399-404, (1982) 384 385 """ 386 # if no dMat is passed in, calculate one ourselves 387 if forceDMat or dMat is None: 388 if forceDMat: 389 # FIX: should we be using atom weights here or not? 390 dMat = Chem.GetDistanceMatrix(mol,useBO=1,useAtomWts=0,force=1) 391 mol._balabanMat = dMat 392 adjMat = Chem.GetAdjacencyMatrix(mol,useBO=0,emptyVal=0,force=0,prefix="NoBO") 393 mol._adjMat = adjMat 394 else: 395 try: 396 # first check if the molecule already has one 397 dMat = mol._balabanMat 398 except AttributeError: 399 # nope, gotta calculate one 400 dMat = Chem.GetDistanceMatrix(mol,useBO=1,useAtomWts=0,force=0,prefix="Balaban") 401 # now store it 402 mol._balabanMat = dMat 403 try: 404 adjMat = mol._adjMat 405 except AttributeError: 406 adjMat = Chem.GetAdjacencyMatrix(mol,useBO=0,emptyVal=0,force=0,prefix="NoBO") 407 mol._adjMat = adjMat 408 else: 409 adjMat = Chem.GetAdjacencyMatrix(mol,useBO=0,emptyVal=0,force=0,prefix="NoBO") 410 411 s = _VertexDegrees(dMat) 412 q = _NumAdjacencies(mol,dMat) 413 n = mol.GetNumAtoms() 414 mu = q - n + 1 415 416 sum = 0. 417 nS = len(s) 418 for i in range(nS): 419 si = s[i] 420 for j in range(i,nS): 421 if adjMat[i,j] == 1: 422 sum += 1./sqrt(si*s[j]) 423 424 if mu+1 != 0: 425 J = float(q) / float(mu + 1) * sum 426 else: 427 J = 0 428 429 return J
430 BalabanJ.version="1.0.0" 431 432 433 #------------------------------------------------------------------------ 434 # 435 # Start block of BertzCT stuff. 436 # 437 438
439 -def _AssignSymmetryClasses(mol, vdList, bdMat, forceBDMat, numAtoms, cutoff):
440 """ 441 Used by BertzCT 442 443 vdList: the number of neighbors each atom has 444 bdMat: "balaban" distance matrix 445 446 """ 447 if forceBDMat: 448 bdMat = Chem.GetDistanceMatrix(mol,useBO=1,useAtomWts=0,force=1, 449 prefix="Balaban") 450 mol._balabanMat = bdMat 451 452 atomIdx = 0 453 keysSeen = [] 454 symList = [0]*numAtoms 455 for i in range(numAtoms): 456 tmpList = bdMat[i].tolist() 457 tmpList.sort() 458 theKey = tuple(['%.4f'%x for x in tmpList[:cutoff]]) 459 try: 460 idx = keysSeen.index(theKey) 461 except ValueError: 462 idx = len(keysSeen) 463 keysSeen.append(theKey) 464 symList[i] = idx+1 465 return tuple(symList)
466
467 -def _LookUpBondOrder(atom1Id, atom2Id, bondDic):
468 """ 469 Used by BertzCT 470 """ 471 if atom1Id < atom2Id: 472 theKey = (atom1Id,atom2Id) 473 else: 474 theKey = (atom2Id,atom1Id) 475 tmp = bondDic[theKey] 476 if tmp == Chem.BondType.AROMATIC: 477 tmp = 1.5 478 else: 479 tmp = float(tmp) 480 #tmp = int(tmp) 481 return tmp
482 483
484 -def _CalculateEntropies(connectionDict, atomTypeDict, numAtoms):
485 """ 486 Used by BertzCT 487 """ 488 connectionList = connectionDict.values() 489 totConnections = sum(connectionList) 490 connectionIE = totConnections*(entropy.InfoEntropy(array(connectionList)) + 491 log(totConnections)/_log2val) 492 atomTypeList = atomTypeDict.values() 493 atomTypeIE = numAtoms*entropy.InfoEntropy(array(atomTypeList)) 494 return atomTypeIE + connectionIE
495
496 -def _CreateBondDictEtc(mol, numAtoms):
497 """ _Internal Use Only_ 498 Used by BertzCT 499 500 """ 501 bondDict = {} 502 nList = [None]*numAtoms 503 vdList = [0]*numAtoms 504 for aBond in mol.GetBonds(): 505 atom1=aBond.GetBeginAtomIdx() 506 atom2=aBond.GetEndAtomIdx() 507 if atom1>atom2: atom2,atom1=atom1,atom2 508 if not aBond.GetIsAromatic(): 509 bondDict[(atom1,atom2)] = aBond.GetBondType() 510 else: 511 # mark Kekulized systems as aromatic 512 bondDict[(atom1,atom2)] = Chem.BondType.AROMATIC 513 if nList[atom1] is None: 514 nList[atom1] = [atom2] 515 elif atom2 not in nList[atom1]: 516 nList[atom1].append(atom2) 517 if nList[atom2] is None: 518 nList[atom2] = [atom1] 519 elif atom1 not in nList[atom2]: 520 nList[atom2].append(atom1) 521 522 for i,element in enumerate(nList): 523 try: 524 element.sort() 525 vdList[i] = len(element) 526 except: 527 vdList[i] = 0 528 return bondDict, nList, vdList
529
530 -def BertzCT(mol, cutoff = 100, dMat = None, forceDMat = 1):
531 """ A topological index meant to quantify "complexity" of molecules. 532 533 Consists of a sum of two terms, one representing the complexity 534 of the bonding, the other representing the complexity of the 535 distribution of heteroatoms. 536 537 From S. H. Bertz, J. Am. Chem. Soc., vol 103, 3599-3601 (1981) 538 539 "cutoff" is an integer value used to limit the computational 540 expense. A cutoff value tells the program to consider vertices 541 topologically identical if their distance vectors (sets of 542 distances to all other vertices) are equal out to the "cutoff"th 543 nearest-neighbor. 544 545 **NOTE** The original implementation had the following comment: 546 > this implementation treats aromatic rings as the 547 > corresponding Kekule structure with alternating bonds, 548 > for purposes of counting "connections". 549 Upon further thought, this is the WRONG thing to do. It 550 results in the possibility of a molecule giving two different 551 CT values depending on the kekulization. For example, in the 552 old implementation, these two SMILES: 553 CC2=CN=C1C3=C(C(C)=C(C=N3)C)C=CC1=C2C 554 CC3=CN=C2C1=NC=C(C)C(C)=C1C=CC2=C3C 555 which correspond to differentk kekule forms, yield different 556 values. 557 The new implementation uses consistent (aromatic) bond orders 558 for aromatic bonds. 559 560 THIS MEANS THAT THIS IMPLEMENTATION IS NOT BACKWARDS COMPATIBLE. 561 562 Any molecule containing aromatic rings will yield different 563 values with this implementation. The new behavior is the correct 564 one, so we're going to live with the breakage. 565 566 **NOTE** this barfs if the molecule contains a second (or 567 nth) fragment that is one atom. 568 569 """ 570 atomTypeDict = {} 571 connectionDict = {} 572 numAtoms = mol.GetNumAtoms() 573 if forceDMat or dMat is None: 574 if forceDMat: 575 # nope, gotta calculate one 576 dMat = Chem.GetDistanceMatrix(mol,useBO=0,useAtomWts=0,force=1) 577 mol._adjMat = dMat 578 else: 579 try: 580 dMat = mol._adjMat 581 except AttributeError: 582 dMat = Chem.GetDistanceMatrix(mol,useBO=0,useAtomWts=0,force=1) 583 mol._adjMat = dMat 584 585 if numAtoms < 2: 586 return 0 587 588 bondDict, neighborList, vdList = _CreateBondDictEtc(mol, numAtoms) 589 symmetryClasses = _AssignSymmetryClasses(mol, vdList, dMat, forceDMat, numAtoms, cutoff) 590 #print 'Symmm Classes:',symmetryClasses 591 for atomIdx in range(numAtoms): 592 hingeAtomNumber = mol.GetAtomWithIdx(atomIdx).GetAtomicNum() 593 atomTypeDict[hingeAtomNumber] = atomTypeDict.get(hingeAtomNumber,0)+1 594 595 hingeAtomClass = symmetryClasses[atomIdx] 596 numNeighbors = vdList[atomIdx] 597 for i in range(numNeighbors): 598 neighbor_iIdx = neighborList[atomIdx][i] 599 NiClass = symmetryClasses[neighbor_iIdx] 600 bond_i_order = _LookUpBondOrder(atomIdx, neighbor_iIdx, bondDict) 601 #print '\t',atomIdx,i,hingeAtomClass,NiClass,bond_i_order 602 if (bond_i_order > 1) and (neighbor_iIdx > atomIdx): 603 numConnections = bond_i_order*(bond_i_order - 1)/2 604 connectionKey = (min(hingeAtomClass, NiClass), max(hingeAtomClass, NiClass)) 605 connectionDict[connectionKey] = connectionDict.get(connectionKey,0)+numConnections 606 607 for j in range(i+1, numNeighbors): 608 neighbor_jIdx = neighborList[atomIdx][j] 609 NjClass = symmetryClasses[neighbor_jIdx] 610 bond_j_order = _LookUpBondOrder(atomIdx, neighbor_jIdx, bondDict) 611 numConnections = bond_i_order*bond_j_order 612 connectionKey = (min(NiClass, NjClass), hingeAtomClass, max(NiClass, NjClass)) 613 connectionDict[connectionKey] = connectionDict.get(connectionKey,0)+numConnections 614 615 if not connectionDict: 616 connectionDict = {'a':1} 617 618 ks = connectionDict.keys() 619 ks.sort() 620 return _CalculateEntropies(connectionDict, atomTypeDict, numAtoms)
621 BertzCT.version="2.0.0" 622 # Recent Revisions: 623 # 1.0.0 -> 2.0.0: 624 # - force distance matrix updates properly (Fixed as part of Issue 125) 625 # - handle single-atom fragments (Issue 136) 626 627 628 # 629 # End block of BertzCT stuff. 630 # 631 #------------------------------------------------------------------------ 632