Package rdkit :: Package Chem :: Package MolDb :: Module Loader
[hide private]
[frames | no frames]

Source Code for Module rdkit.Chem.MolDb.Loader

  1  # $Id: Loader.py 997 2009-02-25 06:12:43Z glandrum $ 
  2  # 
  3  #  Copyright (C) 2007-2008 Greg Landrum 
  4  #   @@ All Rights Reserved @@ 
  5  # 
  6  from rdkit import Chem 
  7  from rdkit.Chem import AllChem 
  8  from rdkit.Chem import Lipinski,Descriptors,Crippen 
  9  from rdkit.Dbase.DbConnection import DbConnect 
 10  from rdkit.Dbase import DbModule 
 11  import re 
 12   
 13  #set up the logger: 
 14  import rdkit.RDLogger as logging 
 15  logger = logging.logger() 
 16  logger.setLevel(logging.INFO) 
 17   
def ProcessMol(mol, typeConversions, globalProps, nDone, nameProp='_Name',
               nameCol='compound_id', redraw=False, keepHs=False,
               skipProps=False, addComputedProps=False, skipSmiles=False,
               uniqNames=None, namesSeen=None):
    """Turn a single molecule into a raw database row.

    Arguments:
      - mol: the molecule to process; a false value raises ValueError.
      - typeConversions: mapping from an integer type code to a sequence
        whose element [1] is a callable that converts a property string.
        Codes are tried from high to low; code 0 is the final fallback.
      - globalProps: dict mapping property name -> type code. It is updated
        in place with the code that worked for each property of this
        molecule (a previously recorded code can only be lowered).
      - nDone: running molecule count, used to build the fallback name
        'Mol_<nDone>' when the molecule has no (or an empty) name property.
      - nameProp: molecule property that holds the compound name.
      - nameCol: name of the name column; a molecule property whose name
        matches it (case-insensitively) is not stored as a property.
      - redraw: if true, 2D coordinates are recomputed for the molecule.
      - keepHs: if true, Chem.SanitizeMol() is called on the molecule.
      - skipProps: if true, no molecule properties are collected.
      - addComputedProps: if true, DonorCount, AcceptorCount,
        RotatableBondCount, AMW and MolLogP are computed and set on the
        molecule so that they are collected like any other property.
      - skipSmiles: if true, no SMILES entry is added to the row.
      - uniqNames: if true, a molecule whose name is already in namesSeen
        is skipped (an error is logged and None is returned).
      - namesSeen: set of names processed so far; the name of this molecule
        is added to it. May be None, in which case no name tracking (and
        therefore no duplicate detection) is done.

    Returns:
      [name, smiles, pickle, propDict], or [name, pickle, propDict] when
      skipSmiles is true; None when the molecule was skipped as a duplicate.
      propDict maps property name -> converted value.

    Raises:
      ValueError if mol is false.
    """
    if not mol:
        raise ValueError('no molecule')
    if keepHs:
        Chem.SanitizeMol(mol)

    try:
        nm = mol.GetProp(nameProp)
    except KeyError:
        nm = None
    if not nm:
        nm = 'Mol_%d' % nDone

    # namesSeen defaults to None; only track names when a container was given.
    if namesSeen is not None:
        if uniqNames and nm in namesSeen:
            logger.error('duplicate compound id (%s) encountered. second instance skipped.' % nm)
            return None
        namesSeen.add(nm)

    row = [nm]
    pD = {}
    if not skipProps:
        if addComputedProps:
            # Stored as string properties so they go through the same type
            # detection as the properties that came with the molecule.
            mol.SetProp('DonorCount', str(Lipinski.NumHDonors(mol)))
            mol.SetProp('AcceptorCount', str(Lipinski.NumHAcceptors(mol)))
            mol.SetProp('RotatableBondCount', str(Lipinski.NumRotatableBonds(mol)))
            mol.SetProp('AMW', str(Descriptors.MolWt(mol)))
            mol.SetProp('MolLogP', str(Crippen.MolLogP(mol)))

        nameColLower = nameCol.lower()
        for pn in list(mol.GetPropNames()):
            if pn.lower() == nameColLower:
                continue
            pv = mol.GetProp(pn).strip()
            if '>' in pv or '<' in pv:
                # Values containing '<' or '>' are kept verbatim and do not
                # take part in column type detection.
                pD[pn] = pv
                continue

            # Start from the code recorded so far (2 for a new property) and
            # step down until a converter accepts the value.
            colTyp = globalProps.get(pn, 2)
            while colTyp > 0:
                try:
                    val = typeConversions[colTyp][1](pv)
                except Exception:
                    colTyp -= 1
                else:
                    break
            else:
                # No higher code worked (or none was left to try): use the
                # fallback converter without guarding it.
                val = typeConversions[colTyp][1](pv)
            globalProps[pn] = colTyp
            pD[pn] = val

    if redraw:
        AllChem.Compute2DCoords(mol)
    if not skipSmiles:
        row.append(Chem.MolToSmiles(mol, True))
    row.append(DbModule.binaryHolder(mol.ToBinary()))
    row.append(pD)
    return row
78
def ConvertRows(rows, globalProps, defaultVal, skipSmiles):
    """Flatten raw rows, in place, into the column order used for inserts.

    Each incoming row starts with the name and ends with a property dict;
    in between it holds one entry (when skipSmiles is true) or two entries.
    The replacement row is:

        [name, <one value per name in globalProps>, row[1], (row[2])]

    where a property missing from the row's dict is replaced by defaultVal
    and row[2] is only copied when skipSmiles is false.

    Nothing is returned; the entries of rows are replaced one by one.
    """
    for idx, rawRow in enumerate(rows):
        flat = [rawRow[0]]
        propDict = rawRow[-1]
        flat.extend(propDict.get(propName, defaultVal) for propName in globalProps)
        flat.append(rawRow[1])
        if not skipSmiles:
            flat.append(rawRow[2])
        rows[idx] = flat
91
def _ReportFailedMol(suppl, idx, errorsTo):
    """Send the raw text of unreadable supplier entry idx to errorsTo, if any."""
    if not errorsTo:
        return
    if hasattr(suppl, 'GetItemText'):
        errorsTo.write(suppl.GetItemText(idx))
    else:
        logger.warning('full error file support not complete')


def LoadDb(suppl, dbName, nameProp='_Name', nameCol='compound_id', silent=False,
           redraw=False, errorsTo=None, keepHs=False, defaultVal='N/A',
           skipProps=False, regName='molecules', skipSmiles=False,
           maxRowsCached=-1, uniqNames=False, addComputedProps=False,
           lazySupplier=False):
    """Load the molecules from a supplier into a database table.

    The table regName is dropped (if that succeeds) and re-created. Its
    columns are: the name column, one column per molecule property found
    while filling the first batch of rows, a SMILES column (unless
    skipSmiles is true) and a 'molpkl' column holding the binary molecule.

    Arguments:
      - suppl: iterable of molecules. len(suppl) is called unless
        lazySupplier is true. If it has a GetItemText() method, that is used
        to fetch the text of entries that could not be read.
      - dbName: passed to DbConnect() to open the database.
      - nameProp, nameCol, redraw, keepHs, skipProps, addComputedProps,
        skipSmiles, uniqNames: forwarded to ProcessMol(); nameCol is also
        the name of the name column and uniqNames additionally makes that
        column 'unique'.
      - silent: if true, progress messages are not logged.
      - errorsTo: optional object with a write() method that receives the
        text of entries that could not be read.
      - defaultVal: value stored when a molecule lacks a property.
      - regName: name of the table to create.
      - maxRowsCached: number of rows collected before they are inserted
        and committed. With the default of -1 the batch size is never
        reached, so all rows are inserted in one go at the end of the
        first pass.
      - lazySupplier: if true, len(suppl) is not called.

    The set and order of property columns is fixed when the table is
    created. A property that shows up for the first time in a later batch
    has no column; a warning is logged and its values are not stored.
    """
    if not lazySupplier:
        nMols = len(suppl)
    else:
        nMols = -1
    if not silent:
        logger.info("Generating molecular database in file %s" % dbName)
        if not lazySupplier:
            logger.info(" Processing %d molecules" % nMols)

    rows = []
    globalProps = {}
    namesSeen = set()
    nDone = 0
    typeConversions = {0: ('varchar', str), 1: ('float', float), 2: ('int', int)}
    procArgs = dict(nameProp=nameProp, nameCol=nameCol, redraw=redraw,
                    keepHs=keepHs, skipProps=skipProps,
                    addComputedProps=addComputedProps, skipSmiles=skipSmiles,
                    uniqNames=uniqNames, namesSeen=namesSeen)

    # A single iterator is shared by both passes so that the second pass
    # continues where the first one stopped, also for plain sequences.
    molIter = iter(suppl)

    # First pass: collect rows until the batch is full (or the supplier is
    # exhausted); the properties seen here define the table columns.
    for m in molIter:
        nDone += 1
        if not m:
            _ReportFailedMol(suppl, nDone - 1, errorsTo)
            continue
        row = ProcessMol(m, typeConversions, globalProps, nDone, **procArgs)
        if row is None:
            continue
        rows.append(row)
        if not silent and not nDone % 100:
            logger.info(' done %d' % nDone)
        if len(rows) == maxRowsCached:
            break

    # Freeze the property columns: globalProps keeps changing while later
    # molecules are processed, but the table layout cannot.
    propNames = list(globalProps)

    nameDef = '%s varchar not null' % nameCol
    if uniqNames:
        nameDef += ' unique'
    typs = [nameDef]
    for pn in propNames:
        # Anything that is not a word character is not usable in a column name.
        addNm = re.sub(r'[\W]', '_', pn)
        typs.append('%s %s' % (addNm, typeConversions[globalProps[pn]][0]))

    if not skipSmiles:
        # Avoid clashing with a molecule property that is itself called "smiles".
        if 'smiles' not in set(pn.lower() for pn in propNames):
            typs.append('smiles varchar')
        else:
            typs.append('cansmiles varchar')
    typs.append('molpkl %s' % (DbModule.binaryTypeName))

    conn = DbConnect(dbName)
    curs = conn.GetCursor()
    try:
        curs.execute('drop table %s' % regName)
    except Exception:
        # Best effort: the drop is allowed to fail (e.g. no such table yet).
        pass
    curs.execute('create table %s (%s)' % (regName, ','.join(typs)))
    qs = ','.join([DbModule.placeHolder] * len(typs))
    insertCmd = 'insert into %s values (%s)' % (regName, qs)

    def _flush(batch):
        # Flatten the raw rows to the frozen column order, insert, commit.
        ConvertRows(batch, propNames, defaultVal, skipSmiles)
        curs.executemany(insertCmd, batch)
        conn.Commit()

    _flush(rows)

    # Second pass: whatever the supplier still has, inserted batch by batch.
    rows = []
    knownProps = set(propNames)
    while True:
        nDone += 1
        try:
            m = next(molIter)
        except StopIteration:
            break
        if not m:
            _ReportFailedMol(suppl, nDone - 1, errorsTo)
            continue
        row = ProcessMol(m, typeConversions, globalProps, nDone, **procArgs)
        if row is None:
            continue
        if len(globalProps) > len(knownProps):
            for pn in globalProps:
                if pn not in knownProps:
                    knownProps.add(pn)
                    logger.warning('property (%s) first seen after the table was created. its values are not stored.' % pn)
        rows.append(row)
        if not silent and not nDone % 100:
            logger.info(' done %d' % nDone)
        if len(rows) == maxRowsCached:
            _flush(rows)
            rows = []
    if rows:
        _flush(rows)
196