Package rdkit :: Package Dbase :: Module DbUtils
[hide private]
[frames] | no frames]

Source Code for Module rdkit.Dbase.DbUtils

  1  # $Id: DbUtils.py 997 2009-02-25 06:12:43Z glandrum $ 
  2  # 
  3  #  Copyright (C) 2000-2006  greg Landrum and Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved  @@ 
  6  # 
  7  """ a set of functions for interacting with databases 
  8   
  9   When possible, it's probably preferable to use a _DbConnection.DbConnect_ object 
 10   
 11  """ 
 12  from rdkit import RDConfig 
 13  try: 
 14    from rdkit.Excel import ExcelWrapper 
 15  except: 
 16    haveExcel = 0 
 17  else: 
 18    haveExcel = 1 
 19   
 20  from rdkit.Dbase.DbResultSet import DbResultSet,RandomAccessDbResultSet 
21 -def _take(fromL,what):
22 return map(lambda x,y=fromL:y[x],what)
23 24 from rdkit.Dbase import DbModule 25 import sys,types,string 26 from rdkit.Dbase import DbInfo 27
28 -def GetColumns(dBase,table,fieldString,user='sysdba',password='masterkey', 29 join='',cn=None):
30 """ gets a set of data from a table 31 32 **Arguments** 33 34 - dBase: database name 35 36 - table: table name 37 38 - fieldString: a string with the names of the fields to be extracted, 39 this should be a comma delimited list 40 41 - user and password: 42 43 - join: a join clause (omit the verb 'join') 44 45 46 **Returns** 47 48 - a list of the data 49 50 """ 51 if not cn: 52 cn = DbModule.connect(dBase,user,password) 53 c = cn.cursor() 54 cmd = 'select %s from %s'%(fieldString,table) 55 if join: 56 if join.strip().find('join') != 0: 57 join = 'join %s'%(join) 58 cmd +=' ' + join 59 c.execute(cmd) 60 return c.fetchall()
61
62 -def GetData(dBase,table,fieldString='*',whereString='',user='sysdba',password='masterkey', 63 removeDups=-1,join='',forceList=0,transform=None,randomAccess=1,extras=None,cn=None):
64 """ a more flexible method to get a set of data from a table 65 66 **Arguments** 67 68 - fields: a string with the names of the fields to be extracted, 69 this should be a comma delimited list 70 71 - where: the SQL where clause to be used with the DB query 72 73 - removeDups indicates the column which should be used to screen 74 out duplicates. Only the first appearance of a duplicate will 75 be left in the dataset. 76 77 **Returns** 78 79 - a list of the data 80 81 82 **Notes** 83 84 - EFF: this isn't particularly efficient 85 86 """ 87 if not cn: 88 cn = DbModule.connect(dBase,user,password) 89 c = cn.cursor() 90 cmd = 'select %s from %s'%(fieldString,table) 91 if join: 92 if join.strip().find('join') != 0: 93 join = 'join %s'%(join) 94 cmd += ' ' + join 95 if whereString: 96 if whereString.strip().find('where')!=0: 97 whereString = 'where %s'%(whereString) 98 cmd += ' ' + whereString 99 100 if forceList: 101 try: 102 if not extras: 103 c.execute(cmd) 104 else: 105 c.execute(cmd,extras) 106 except: 107 sys.stderr.write('the command "%s" generated errors:\n'%(cmd)) 108 import traceback 109 traceback.print_exc() 110 return None 111 if transform is not None: 112 raise ValueError,'forceList and transform arguments are not compatible' 113 if not randomAccess: 114 raise ValueError,'when forceList is set, randomAccess must also be used' 115 data = c.fetchall() 116 if removeDups>0: 117 seen = [] 118 for entry in data[:]: 119 if entry[removeDups] in seen: 120 data.remove(entry) 121 else: 122 seen.append(entry[removeDups]) 123 else: 124 if randomAccess: 125 klass = RandomAccessDbResultSet 126 else: 127 klass = DbResultSet 128 129 data = klass(c,cn,cmd,removeDups=removeDups,transform=transform,extras=extras) 130 131 return data
132 133
134 -def DatabaseToExcel(dBase,table,fields='*',join='',where='',wrapper=None, 135 user='sysdba',password='masterkey',lowMemory=False,cn=None):
136 """ Pulls the contents of a database and puts them in an Excel worksheet 137 138 **Arguments** 139 - dBase: the name of the DB file to be used 140 141 - table: the name of the table to query 142 143 - fields: the fields to select with the SQL query 144 145 - join: the join clause of the SQL query 146 (e.g. 'join foo on foo.bar=base.bar') 147 148 - where: the where clause of the SQL query 149 (e.g. 'where foo = 2' or 'where bar > 17.6') 150 151 - wrapper: an _Excel.ExcelWrapper.ExcelWrapper_ to be used 152 in interacting with Excel 153 154 - user: the username for DB access 155 156 - password: the password to be used for DB access 157 158 """ 159 if not haveExcel: 160 return 161 if wrapper is None: 162 wrapper = ExcelWrapper.ExcelWrapper() 163 wrapper.Visible = 1 164 if len(where) and where.strip().find('where') != 0: 165 where = 'where %s'%(where) 166 if len(join) and join.strip().find('join') != 0: 167 join = 'join %s'%(join) 168 169 sqlCommand = 'select %s from %s %s %s'%(fields,table,join,where) 170 if not cn: 171 cn = DbModule.connect(dBase,user,password) 172 c = cn.cursor() 173 try: 174 c.execute(sqlCommand) 175 except: 176 print 'problems executing SQL statement %s'%(repr(sqlCommand)) 177 import sys,traceback 178 traceback.print_exc() 179 return 180 headers = [] 181 colsToTake = [] 182 strCols = [] 183 # the description field of the cursor carries around info about the columns 184 # of the table 185 for i in range(len(c.description)): 186 item = c.description[i] 187 if item[1] not in DbInfo.sqlBinTypes: 188 colsToTake.append(i) 189 headers.append(item[0]) 190 if item[1] in DbInfo.sqlTextTypes: 191 strCols.append(len(colsToTake)-1) 192 wrapper.Workbooks.Add() 193 194 r = wrapper.GetRange(1,1,1,len(headers)) 195 # add the headers 196 r.Value = headers 197 # and make them bold 198 r.Font.Bold = 1 199 200 # now just insert the data... easy as pie 201 results = c.fetchall() 202 row = 2 203 fullData = [] 204 for res in results: 205 vs = [res[x] for x in colsToTake] 206 for col in strCols: 207 vs[col] = "'%s"%vs[col] 208 if not lowMemory: 209 fullData.append(vs) 210 else: 211 wrapper[row,1:len(headers)]=vs 212 row+=1 213 if not lowMemory: 214 wrapper[row:row+len(fullData),1:len(headers)] = fullData
215 216 217
218 -def DatabaseToText(dBase,table,fields='*',join='',where='',wrapper=None, 219 user='sysdba',password='masterkey',delim=',',cn=None):
220 """ Pulls the contents of a database and makes a deliminted text file from them 221 222 **Arguments** 223 - dBase: the name of the DB file to be used 224 225 - table: the name of the table to query 226 227 - fields: the fields to select with the SQL query 228 229 - join: the join clause of the SQL query 230 (e.g. 'join foo on foo.bar=base.bar') 231 232 - where: the where clause of the SQL query 233 (e.g. 'where foo = 2' or 'where bar > 17.6') 234 235 - wrapper: an _Excel.ExcelWrapper.ExcelWrapper_ to be used 236 in interacting with Excel 237 238 - user: the username for DB access 239 240 - password: the password to be used for DB access 241 242 **Returns** 243 244 - the CSV data (as text) 245 246 """ 247 if len(where) and where.strip().find('where')==-1: 248 where = 'where %s'%(where) 249 if len(join) and join.strip().find('join') == -1: 250 join = 'join %s'%(join) 251 sqlCommand = 'select %s from %s %s %s'%(fields,table,join,where) 252 if not cn: 253 cn = DbModule.connect(dBase,user,password) 254 c = cn.cursor() 255 c.execute(sqlCommand) 256 headers = [] 257 colsToTake = [] 258 # the description field of the cursor carries around info about the columns 259 # of the table 260 for i in range(len(c.description)): 261 item = c.description[i] 262 if item[1] not in DbInfo.sqlBinTypes: 263 colsToTake.append(i) 264 headers.append(item[0]) 265 266 lines = [] 267 lines.append(delim.join(headers)) 268 269 # grab the data 270 results = c.fetchall() 271 for res in results: 272 d = _take(res,colsToTake) 273 lines.append(delim.join(map(str,d))) 274 275 return '\n'.join(lines)
276 277
278 -def TypeFinder(data,nRows,nCols,nullMarker=None):
279 """ 280 281 finds the types of the columns in _data_ 282 283 if nullMarker is not None, elements of the data table which are 284 equal to nullMarker will not count towards setting the type of 285 their columns. 286 287 """ 288 priorities={types.FloatType:3,types.IntType:2,types.StringType:1,-1:-1} 289 res = [None]*nCols 290 for col in xrange(nCols): 291 typeHere = [-1,1] 292 for row in xrange(nRows): 293 d = data[row][col] 294 if d is not None: 295 locType = type(d) 296 if locType != types.FloatType and locType != types.IntType: 297 locType = types.StringType 298 try: 299 d = str(d) 300 except UnicodeError,msg: 301 print 'cannot convert text from row %d col %d to a string'%(row+2,col) 302 print '\t>%s'%(repr(d)) 303 raise UnicodeError,msg 304 else: 305 typeHere[1] = max(typeHere[1],len(str(d))) 306 if locType == types.StringType: 307 if nullMarker is None or d != nullMarker: 308 l = max(len(d),typeHere[1]) 309 typeHere = [types.StringType,l] 310 else: 311 try: 312 fD = float(int(d)) 313 except OverflowError: 314 locType = types.FloatType 315 else: 316 if fD == d: 317 locType = types.IntType 318 if typeHere[0]!=types.StringType and \ 319 priorities[locType] > priorities[typeHere[0]]: 320 typeHere[0] = locType 321 res[col] = typeHere 322 return res
323
324 -def DetermineColTypes(wrapper,nullMarker=None):
325 """This is kind of crude hack to automagically determine the types 326 of columns in the active Excel sheet 327 328 **Arguments** 329 330 - wrapper: the _ExcelWrapper_ to be used in interacting with Excel 331 332 - nullMarker: (optional) if this is not None, elements of the 333 data table which are equal to nullMarker will not count towards 334 setting the type of their columns. 335 336 **Returns** 337 338 - a list of the types of each column 339 340 **Note** 341 342 - we make the assumption that there are only three possible types: int, 343 float and string. 344 345 """ 346 nCols = wrapper.FindLastCol(1,1) 347 nRows = wrapper.FindLastRow(1,1) 348 dList = wrapper[2:nRows,1:nCols] 349 res = TypeFinder(dList,nRows-1,nCols,nullMarker=nullMarker) 350 return res
351 352
353 -def _AdjustColHeadings(colHeadings,maxColLabelLen):
354 """ *For Internal Use* 355 356 removes illegal characters from column headings 357 and truncates those which are too long. 358 359 """ 360 for i in xrange(len(colHeadings)): 361 # replace unallowed characters and strip extra white space 362 colHeadings[i] = string.strip(colHeadings[i]) 363 colHeadings[i] = string.replace(colHeadings[i],' ','_') 364 colHeadings[i] = string.replace(colHeadings[i],'-','_') 365 colHeadings[i] = string.replace(colHeadings[i],'.','_') 366 367 if len(colHeadings[i]) > maxColLabelLen: 368 # interbase (at least) has a limit on the maximum length of a column name 369 newHead = string.replace(colHeadings[i],'_','') 370 newHead = newHead[:maxColLabelLen] 371 print '\tHeading %s too long, changed to %s'%(colHeadings[i],newHead) 372 colHeadings[i] = newHead 373 return colHeadings
374
375 -def GetTypeStrings(colHeadings,colTypes,keyCol=None):
376 """ returns a list of SQL type strings 377 """ 378 typeStrs=[] 379 for i in xrange(len(colTypes)): 380 type = colTypes[i] 381 if type[0] == types.FloatType: 382 typeStrs.append('%s double precision'%colHeadings[i]) 383 elif type[0] == types.IntType: 384 typeStrs.append('%s integer'%colHeadings[i]) 385 else: 386 typeStrs.append('%s varchar(%d)'%(colHeadings[i],type[1])) 387 if colHeadings[i] == keyCol: 388 typeStrs[-1] = '%s not null primary key'%(typeStrs[-1]) 389 return typeStrs
390
391 -def _insertBlock(conn,sqlStr,block,silent=False):
392 try: 393 conn.cursor().executemany(sqlStr,block) 394 except: 395 res = 0 396 conn.commit() 397 for row in block: 398 try: 399 conn.cursor().execute(sqlStr,tuple(row)) 400 res += 1 401 except: 402 if not silent: 403 import traceback 404 traceback.print_exc() 405 print 'insert failed:',sqlStr 406 print '\t',repr(row) 407 else: 408 conn.commit() 409 else: 410 res = len(block) 411 return res
412
413 -def _AddDataToDb(dBase,table,user,password,colDefs,colTypes,data, 414 nullMarker=None,blockSize=100,cn=None):
415 """ *For Internal Use* 416 417 (drops and) creates a table and then inserts the values 418 419 """ 420 if not cn: 421 cn = DbModule.connect(dBase,user,password) 422 c = cn.cursor() 423 try: 424 c.execute('drop table %s'%(table)) 425 except: 426 print 'cannot drop table %s'%(table) 427 try: 428 sqlStr = 'create table %s (%s)'%(table,colDefs) 429 c.execute(sqlStr) 430 except: 431 print 'create table failed: ', sqlStr 432 print 'here is the exception:' 433 import traceback 434 traceback.print_exc() 435 return 436 cn.commit() 437 c = None 438 439 block = [] 440 entryTxt = [DbModule.placeHolder]*len(data[0]) 441 dStr = ','.join(entryTxt) 442 sqlStr = 'insert into %s values (%s)'%(table,dStr) 443 nDone = 0 444 for row in data: 445 entries = [None]*len(row) 446 for col in xrange(len(row)): 447 if row[col] is not None and \ 448 (nullMarker is None or row[col] != nullMarker): 449 if colTypes[col][0] == types.FloatType: 450 entries[col] = float(row[col]) 451 elif colTypes[col][0] == types.IntType: 452 entries[col] = int(row[col]) 453 else: 454 entries[col] = str(row[col]) 455 else: 456 entries[col] = None 457 block.append(tuple(entries)) 458 if len(block)>=blockSize: 459 nDone += _insertBlock(cn,sqlStr,block) 460 if not hasattr(cn,'autocommit') or not cn.autocommit: 461 cn.commit() 462 block = [] 463 if len(block): 464 nDone += _insertBlock(cn,sqlStr,block) 465 if not hasattr(cn,'autocommit') or not cn.autocommit: 466 cn.commit()
467 468 469
470 -def ExcelToDatabase(dBase,table,wrapper=None,user='sysdba',password='masterkey', 471 maxColLabelLen=31,keyCol=None,nullMarker=None,force=0):
472 """convert the active excel worksheet into a database. 473 474 this isn't as smooth or slick as the conversion the other way... sad. 475 476 **Arguments** 477 478 - dBase: the name of the DB to use 479 480 - table: the name of the table to create/overwrite 481 482 - wrapper: the _ExcelWrapper_ to use 483 484 - user: the user name to use in connecting to the DB 485 486 - password: the password to use in connecting to the DB 487 488 - maxColLabelLen: the maximum length a column label should be 489 allowed to have (truncation otherwise) 490 491 - keyCol: the column to be used as an index for the db 492 493 **Notes** 494 495 - if _table_ already exists, it is destroyed before we write 496 the new data 497 498 """ 499 if not haveExcel: 500 return 501 table.replace('-','_') 502 table.replace(' ','_') 503 if not force: 504 tblNames = [x.strip() for x in DbInfo.GetTableNames(dBase)] 505 tmp = table.upper() 506 if tmp in tblNames: 507 resp = raw_input('Table %s already exists, overwrite it? '%(table)) 508 if not resp or resp[0] not in ['Y','y']: 509 print 'cancelled' 510 return 511 512 if wrapper is None: 513 wrapper = ExcelWrapper.ExcelWrapper() 514 colHeadings = wrapper.GetHeadings() 515 _AdjustColHeadings(colHeadings,maxColLabelLen) 516 nCols = len(colHeadings) 517 nRows = wrapper.FindLastRow(1,1) 518 data = wrapper[2:nRows,1:nCols] 519 # determine the types of each column 520 colTypes = TypeFinder(data,nRows-1,nCols,nullMarker=nullMarker) 521 typeStrs = GetTypeStrings(colHeadings,colTypes,keyCol=keyCol) 522 colDefs=','.join(typeStrs) 523 524 _AddDataToDb(dBase,table,user,password,colDefs,colTypes,data,nullMarker=nullMarker)
525
526 -def TextFileToDatabase(dBase,table,inF,delim=',', 527 user='sysdba',password='masterkey', 528 maxColLabelLen=31,keyCol=None,nullMarker=None):
529 """loads the contents of the text file into a database. 530 531 **Arguments** 532 533 - dBase: the name of the DB to use 534 535 - table: the name of the table to create/overwrite 536 537 - inF: the file like object from which the data should 538 be pulled (must support readline()) 539 540 - delim: the delimiter used to separate fields 541 542 - user: the user name to use in connecting to the DB 543 544 - password: the password to use in connecting to the DB 545 546 - maxColLabelLen: the maximum length a column label should be 547 allowed to have (truncation otherwise) 548 549 - keyCol: the column to be used as an index for the db 550 551 **Notes** 552 553 - if _table_ already exists, it is destroyed before we write 554 the new data 555 556 - we assume that the first row of the file contains the column names 557 558 """ 559 table.replace('-','_') 560 table.replace(' ','_') 561 562 colHeadings = inF.readline().split(delim) 563 _AdjustColHeadings(colHeadings,maxColLabelLen) 564 nCols = len(colHeadings) 565 data = [] 566 inL = inF.readline() 567 while inL: 568 inL = inL.replace('\r','') 569 inL = inL.replace('\n','') 570 splitL = inL.split(delim) 571 if len(splitL)!=nCols: 572 print '>>>',repr(inL) 573 assert len(splitL)==nCols,'unequal length' 574 tmpVect = [] 575 for entry in splitL: 576 try: 577 val = int(entry) 578 except: 579 try: 580 val = float(entry) 581 except: 582 val = entry 583 tmpVect.append(val) 584 data.append(tmpVect) 585 inL = inF.readline() 586 nRows = len(data) 587 588 # determine the types of each column 589 colTypes = TypeFinder(data,nRows,nCols,nullMarker=nullMarker) 590 typeStrs = GetTypeStrings(colHeadings,colTypes,keyCol=keyCol) 591 colDefs=','.join(typeStrs) 592 593 _AddDataToDb(dBase,table,user,password,colDefs,colTypes,data, 594 nullMarker=nullMarker)
595 596
597 -def DatabaseToDatabase(fromDb,fromTbl,toDb,toTbl, 598 fields='*',join='',where='', 599 user='sysdba',password='masterkey',keyCol=None,nullMarker='None'):
600 """ 601 602 FIX: at the moment this is a hack 603 604 """ 605 import cStringIO 606 io = cStringIO.StringIO() 607 io.write(DatabaseToText(fromDb,fromTbl,fields=fields,join=join,where=where, 608 user=user,password=password)) 609 io.seek(-1) 610 TextFileToDatabase(toDb,toTbl,io,user=user,password=password,keyCol=keyCol, 611 nullMarker=nullMarker)
612 613 614 if __name__=='__main__': 615 import cStringIO 616 617 io = cStringIO.StringIO() 618 io.write('foo,bar,baz\n') 619 io.write('1,2,3\n') 620 io.write('1.1,4,5\n') 621 io.write('4,foo,6\n') 622 io.seek(0) 623 from rdkit import RDConfig 624 import os 625 dirLoc = os.path.join(RDConfig.RDCodeDir,'Dbase','TEST.GDB') 626 627 TextFileToDatabase(dirLoc,'fromtext',io) 628