Package Dbase :: Module DbUtils
[hide private]
[frames] | no frames]

Source Code for Module Dbase.DbUtils

  1  # $Id: DbUtils.py 2 2006-05-06 22:54:39Z glandrum $ 
  2  # 
  3  #  Copyright (C) 2000-2006  greg Landrum and Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved  @@ 
  6  # 
  7  """ a set of functions for interacting with databases 
  8   
  9   When possible, it's probably preferable to use a _DbConnection.DbConnect_ object 
 10   
 11  """ 
 12  import RDConfig 
 13  try: 
 14    from Excel import ExcelWrapper 
 15  except: 
 16    haveExcel = 0 
 17  else: 
 18    haveExcel = 1 
 19   
 20  from Dbase.DbResultSet import DbResultSet,RandomAccessDbResultSet 
21 -def _take(fromL,what):
22 return map(lambda x,y=fromL:y[x],what)
23 24 from Dbase import DbModule 25 import sys,types,string 26 from Dbase import DbInfo 27
28 -def GetColumns(dBase,table,fieldString,user='sysdba',password='masterkey', 29 join=''):
30 """ gets a set of data from a table 31 32 **Arguments** 33 34 - dBase: database name 35 36 - table: table name 37 38 - fieldString: a string with the names of the fields to be extracted, 39 this should be a comma delimited list 40 41 - user and password: 42 43 - join: a join clause (omit the verb 'join') 44 45 46 **Returns** 47 48 - a list of the data 49 50 """ 51 cn = DbModule.connect(dBase,user,password) 52 c = cn.cursor() 53 cmd = 'select %s from %s'%(fieldString,table) 54 if join: 55 if join.strip().find('join') != 0: 56 join = 'join %s'%(join) 57 cmd +=' ' + join 58 c.execute(cmd) 59 return c.fetchall()
60
61 -def GetData(dBase,table,fieldString='*',whereString='',user='sysdba',password='masterkey', 62 removeDups=-1,join='',forceList=0,transform=None,randomAccess=1,extras=None):
63 """ a more flexible method to get a set of data from a table 64 65 **Arguments** 66 67 - fields: a string with the names of the fields to be extracted, 68 this should be a comma delimited list 69 70 - where: the SQL where clause to be used with the DB query 71 72 - removeDups indicates the column which should be used to screen 73 out duplicates. Only the first appearance of a duplicate will 74 be left in the dataset. 75 76 **Returns** 77 78 - a list of the data 79 80 81 **Notes** 82 83 - EFF: this isn't particularly efficient 84 85 """ 86 cn = DbModule.connect(dBase,user,password) 87 c = cn.cursor() 88 cmd = 'select %s from %s'%(fieldString,table) 89 if join: 90 if join.strip().find('join') != 0: 91 join = 'join %s'%(join) 92 cmd += ' ' + join 93 if whereString: 94 if whereString.strip().find('where')!=0: 95 whereString = 'where %s'%(whereString) 96 cmd += ' ' + whereString 97 98 if forceList: 99 try: 100 if not extras: 101 c.execute(cmd) 102 else: 103 c.execute(cmd,extras) 104 except: 105 sys.stderr.write('the command "%s" generated errors:\n'%(cmd)) 106 import traceback 107 traceback.print_exc() 108 return None 109 if transform is not None: 110 raise ValueError,'forceList and transform arguments are not compatible' 111 if not randomAccess: 112 raise ValueError,'when forceList is set, randomAccess must also be used' 113 data = c.fetchall() 114 if removeDups>0: 115 seen = [] 116 for entry in data[:]: 117 if entry[removeDups] in seen: 118 data.remove(entry) 119 else: 120 seen.append(entry[removeDups]) 121 else: 122 if randomAccess: 123 klass = RandomAccessDbResultSet 124 else: 125 klass = DbResultSet 126 127 data = klass(c,cn,cmd,removeDups=removeDups,transform=transform,extras=extras) 128 129 return data
130 131
132 -def DatabaseToExcel(dBase,table,fields='*',join='',where='',wrapper=None, 133 user='sysdba',password='masterkey',lowMemory=False):
134 """ Pulls the contents of a database and puts them in an Excel worksheet 135 136 **Arguments** 137 - dBase: the name of the DB file to be used 138 139 - table: the name of the table to query 140 141 - fields: the fields to select with the SQL query 142 143 - join: the join clause of the SQL query 144 (e.g. 'join foo on foo.bar=base.bar') 145 146 - where: the where clause of the SQL query 147 (e.g. 'where foo = 2' or 'where bar > 17.6') 148 149 - wrapper: an _Excel.ExcelWrapper.ExcelWrapper_ to be used 150 in interacting with Excel 151 152 - user: the username for DB access 153 154 - password: the password to be used for DB access 155 156 """ 157 if not haveExcel: 158 return 159 if wrapper is None: 160 wrapper = ExcelWrapper.ExcelWrapper() 161 wrapper.Visible = 1 162 if len(where) and where.strip().find('where') != 0: 163 where = 'where %s'%(where) 164 if len(join) and join.strip().find('join') != 0: 165 join = 'join %s'%(join) 166 167 sqlCommand = 'select %s from %s %s %s'%(fields,table,join,where) 168 cn = DbModule.connect(dBase,user,password) 169 c = cn.cursor() 170 try: 171 c.execute(sqlCommand) 172 except: 173 print 'problems executing SQL statement %s'%(repr(sqlCommand)) 174 import sys,traceback 175 traceback.print_exc() 176 return 177 headers = [] 178 colsToTake = [] 179 strCols = [] 180 # the description field of the cursor carries around info about the columns 181 # of the table 182 for i in range(len(c.description)): 183 item = c.description[i] 184 if item[1] not in DbInfo.sqlBinTypes: 185 colsToTake.append(i) 186 headers.append(item[0]) 187 if item[1] in DbInfo.sqlTextTypes: 188 strCols.append(len(colsToTake)-1) 189 wrapper.Workbooks.Add() 190 191 r = wrapper.GetRange(1,1,1,len(headers)) 192 # add the headers 193 r.Value = headers 194 # and make them bold 195 r.Font.Bold = 1 196 197 # now just insert the data... easy as pie 198 results = c.fetchall() 199 row = 2 200 fullData = [] 201 for res in results: 202 vs = [res[x] for x in colsToTake] 203 for col in strCols: 204 vs[col] = "'%s"%vs[col] 205 if not lowMemory: 206 fullData.append(vs) 207 else: 208 wrapper[row,1:len(headers)]=vs 209 row+=1 210 if not lowMemory: 211 wrapper[row:row+len(fullData),1:len(headers)] = fullData
212 213 214
215 -def DatabaseToText(dBase,table,fields='*',join='',where='',wrapper=None, 216 user='sysdba',password='masterkey',delim=','):
217 """ Pulls the contents of a database and makes a deliminted text file from them 218 219 **Arguments** 220 - dBase: the name of the DB file to be used 221 222 - table: the name of the table to query 223 224 - fields: the fields to select with the SQL query 225 226 - join: the join clause of the SQL query 227 (e.g. 'join foo on foo.bar=base.bar') 228 229 - where: the where clause of the SQL query 230 (e.g. 'where foo = 2' or 'where bar > 17.6') 231 232 - wrapper: an _Excel.ExcelWrapper.ExcelWrapper_ to be used 233 in interacting with Excel 234 235 - user: the username for DB access 236 237 - password: the password to be used for DB access 238 239 **Returns** 240 241 - the CSV data (as text) 242 243 """ 244 if len(where) and where.strip().find('where')==-1: 245 where = 'where %s'%(where) 246 if len(join) and join.strip().find('join') == -1: 247 join = 'join %s'%(join) 248 sqlCommand = 'select %s from %s %s %s'%(fields,table,join,where) 249 cn = DbModule.connect(dBase,user,password) 250 c = cn.cursor() 251 c.execute(sqlCommand) 252 headers = [] 253 colsToTake = [] 254 # the description field of the cursor carries around info about the columns 255 # of the table 256 for i in range(len(c.description)): 257 item = c.description[i] 258 if item[1] not in DbInfo.sqlBinTypes: 259 colsToTake.append(i) 260 headers.append(item[0]) 261 262 lines = [] 263 lines.append(delim.join(headers)) 264 265 # grab the data 266 results = c.fetchall() 267 for res in results: 268 d = _take(res,colsToTake) 269 lines.append(delim.join(map(str,d))) 270 271 return '\n'.join(lines)
272 273
274 -def TypeFinder(data,nRows,nCols,nullMarker=None):
275 """ 276 277 finds the types of the columns in _data_ 278 279 if nullMarker is not None, elements of the data table which are 280 equal to nullMarker will not count towards setting the type of 281 their columns. 282 283 """ 284 priorities={types.FloatType:3,types.IntType:2,types.StringType:1,-1:-1} 285 res = [None]*nCols 286 for col in xrange(nCols): 287 typeHere = [-1,1] 288 for row in xrange(nRows): 289 d = data[row][col] 290 if d is not None: 291 locType = type(d) 292 if locType != types.FloatType and locType != types.IntType: 293 locType = types.StringType 294 try: 295 d = str(d) 296 except UnicodeError,msg: 297 print 'cannot convert text from row %d col %d to a string'%(row+2,col) 298 print '\t>%s'%(repr(d)) 299 raise UnicodeError,msg 300 else: 301 typeHere[1] = max(typeHere[1],len(str(d))) 302 if locType == types.StringType: 303 if nullMarker is None or d != nullMarker: 304 l = max(len(d),typeHere[1]) 305 typeHere = [types.StringType,l] 306 else: 307 try: 308 fD = float(int(d)) 309 except OverflowError: 310 locType = types.FloatType 311 else: 312 if fD == d: 313 locType = types.IntType 314 if typeHere[0]!=types.StringType and \ 315 priorities[locType] > priorities[typeHere[0]]: 316 typeHere[0] = locType 317 res[col] = typeHere 318 return res
319
320 -def DetermineColTypes(wrapper,nullMarker=None):
321 """This is kind of crude hack to automagically determine the types 322 of columns in the active Excel sheet 323 324 **Arguments** 325 326 - wrapper: the _ExcelWrapper_ to be used in interacting with Excel 327 328 - nullMarker: (optional) if this is not None, elements of the 329 data table which are equal to nullMarker will not count towards 330 setting the type of their columns. 331 332 **Returns** 333 334 - a list of the types of each column 335 336 **Note** 337 338 - we make the assumption that there are only three possible types: int, 339 float and string. 340 341 """ 342 nCols = wrapper.FindLastCol(1,1) 343 nRows = wrapper.FindLastRow(1,1) 344 dList = wrapper[2:nRows,1:nCols] 345 res = TypeFinder(dList,nRows-1,nCols,nullMarker=nullMarker) 346 return res
347 348
349 -def _AdjustColHeadings(colHeadings,maxColLabelLen):
350 """ *For Internal Use* 351 352 removes illegal characters from column headings 353 and truncates those which are too long. 354 355 """ 356 for i in xrange(len(colHeadings)): 357 # replace unallowed characters and strip extra white space 358 colHeadings[i] = string.strip(colHeadings[i]) 359 colHeadings[i] = string.replace(colHeadings[i],' ','_') 360 colHeadings[i] = string.replace(colHeadings[i],'-','_') 361 362 if len(colHeadings[i]) > maxColLabelLen: 363 # interbase (at least) has a limit on the maximum length of a column name 364 newHead = string.replace(colHeadings[i],'_','') 365 newHead = newHead[:maxColLabelLen] 366 print '\tHeading %s too long, changed to %s'%(colHeadings[i],newHead) 367 colHeadings[i] = newHead 368 return colHeadings
369
370 -def GetTypeStrings(colHeadings,colTypes,keyCol=None):
371 """ returns a list of SQL type strings 372 """ 373 typeStrs=[] 374 for i in xrange(len(colTypes)): 375 type = colTypes[i] 376 if type[0] == types.FloatType: 377 typeStrs.append('%s double precision'%colHeadings[i]) 378 elif type[0] == types.IntType: 379 typeStrs.append('%s integer'%colHeadings[i]) 380 else: 381 typeStrs.append('%s varchar(%d)'%(colHeadings[i],type[1])) 382 if colHeadings[i] == keyCol: 383 typeStrs[-1] = '%s not null primary key'%(typeStrs[-1]) 384 return typeStrs
385
386 -def _insertBlock(conn,sqlStr,block,silent=False):
387 try: 388 conn.cursor().executemany(sqlStr,block) 389 except: 390 res = 0 391 conn.commit() 392 for row in block: 393 try: 394 conn.cursor().execute(sqlStr,tuple(row)) 395 res += 1 396 except: 397 if not silent: 398 import traceback 399 traceback.print_exc() 400 print 'insert failed:',sqlStr 401 print '\t',repr(row) 402 else: 403 conn.commit() 404 else: 405 res = len(block) 406 return res
407
408 -def _AddDataToDb(dBase,table,user,password,colDefs,colTypes,data, 409 nullMarker=None,blockSize=100):
410 """ *For Internal Use* 411 412 (drops and) creates a table and then inserts the values 413 414 """ 415 cn = DbModule.connect(dBase,user,password) 416 c = cn.cursor() 417 try: 418 c.execute('drop table %s'%(table)) 419 except: 420 print 'cannot drop table %s'%(table) 421 try: 422 sqlStr = 'create table %s (%s)'%(table,colDefs) 423 c.execute(sqlStr) 424 except: 425 print 'create table failed: ', sqlStr 426 print 'here is the exception:' 427 import traceback 428 traceback.print_exc() 429 return 430 cn.commit() 431 c = None 432 433 block = [] 434 entryTxt = [DbModule.placeHolder]*len(data[0]) 435 dStr = ','.join(entryTxt) 436 sqlStr = 'insert into %s values (%s)'%(table,dStr) 437 nDone = 0 438 for row in data: 439 entries = [None]*len(row) 440 for col in xrange(len(row)): 441 if row[col] is not None and \ 442 (nullMarker is None or row[col] != nullMarker): 443 if colTypes[col][0] == types.FloatType: 444 entries[col] = float(row[col]) 445 elif colTypes[col][0] == types.IntType: 446 entries[col] = int(row[col]) 447 else: 448 entries[col] = str(row[col]) 449 else: 450 entries[col] = None 451 block.append(tuple(entries)) 452 if len(block)>=blockSize: 453 nDone += _insertBlock(cn,sqlStr,block) 454 if not hasattr(cn,'autocommit') or not cn.autocommit: 455 cn.commit() 456 block = [] 457 if len(block): 458 nDone += _insertBlock(cn,sqlStr,block) 459 if not hasattr(cn,'autocommit') or not cn.autocommit: 460 cn.commit()
461 462 463
464 -def ExcelToDatabase(dBase,table,wrapper=None,user='sysdba',password='masterkey', 465 maxColLabelLen=31,keyCol=None,nullMarker=None,force=0):
466 """convert the active excel worksheet into a database. 467 468 this isn't as smooth or slick as the conversion the other way... sad. 469 470 **Arguments** 471 472 - dBase: the name of the DB to use 473 474 - table: the name of the table to create/overwrite 475 476 - wrapper: the _ExcelWrapper_ to use 477 478 - user: the user name to use in connecting to the DB 479 480 - password: the password to use in connecting to the DB 481 482 - maxColLabelLen: the maximum length a column label should be 483 allowed to have (truncation otherwise) 484 485 - keyCol: the column to be used as an index for the db 486 487 **Notes** 488 489 - if _table_ already exists, it is destroyed before we write 490 the new data 491 492 """ 493 if not haveExcel: 494 return 495 table.replace('-','_') 496 table.replace(' ','_') 497 if not force: 498 tblNames = [x.strip() for x in DbInfo.GetTableNames(dBase)] 499 tmp = table.upper() 500 if tmp in tblNames: 501 resp = raw_input('Table %s already exists, overwrite it? '%(table)) 502 if not resp or resp[0] not in ['Y','y']: 503 print 'cancelled' 504 return 505 506 if wrapper is None: 507 wrapper = ExcelWrapper.ExcelWrapper() 508 colHeadings = wrapper.GetHeadings() 509 _AdjustColHeadings(colHeadings,maxColLabelLen) 510 nCols = len(colHeadings) 511 nRows = wrapper.FindLastRow(1,1) 512 data = wrapper[2:nRows,1:nCols] 513 # determine the types of each column 514 colTypes = TypeFinder(data,nRows-1,nCols,nullMarker=nullMarker) 515 typeStrs = GetTypeStrings(colHeadings,colTypes,keyCol=keyCol) 516 colDefs=','.join(typeStrs) 517 518 _AddDataToDb(dBase,table,user,password,colDefs,colTypes,data,nullMarker=nullMarker)
519
520 -def TextFileToDatabase(dBase,table,inF,delim=',', 521 user='sysdba',password='masterkey', 522 maxColLabelLen=31,keyCol=None,nullMarker=None):
523 """loads the contents of the text file into a database. 524 525 **Arguments** 526 527 - dBase: the name of the DB to use 528 529 - table: the name of the table to create/overwrite 530 531 - inF: the file like object from which the data should 532 be pulled (must support readline()) 533 534 - delim: the delimiter used to separate fields 535 536 - user: the user name to use in connecting to the DB 537 538 - password: the password to use in connecting to the DB 539 540 - maxColLabelLen: the maximum length a column label should be 541 allowed to have (truncation otherwise) 542 543 - keyCol: the column to be used as an index for the db 544 545 **Notes** 546 547 - if _table_ already exists, it is destroyed before we write 548 the new data 549 550 - we assume that the first row of the file contains the column names 551 552 """ 553 table.replace('-','_') 554 table.replace(' ','_') 555 556 colHeadings = inF.readline().split(delim) 557 _AdjustColHeadings(colHeadings,maxColLabelLen) 558 nCols = len(colHeadings) 559 data = [] 560 inL = inF.readline() 561 while inL: 562 inL = inL.replace('\r','') 563 inL = inL.replace('\n','') 564 splitL = inL.split(delim) 565 if len(splitL)!=nCols: 566 print '>>>',repr(inL) 567 assert len(splitL)==nCols,'unequal length' 568 tmpVect = [] 569 for entry in splitL: 570 try: 571 val = int(entry) 572 except: 573 try: 574 val = float(entry) 575 except: 576 val = entry 577 tmpVect.append(val) 578 data.append(tmpVect) 579 inL = inF.readline() 580 nRows = len(data) 581 582 # determine the types of each column 583 colTypes = TypeFinder(data,nRows,nCols,nullMarker=nullMarker) 584 typeStrs = GetTypeStrings(colHeadings,colTypes,keyCol=keyCol) 585 colDefs=','.join(typeStrs) 586 587 _AddDataToDb(dBase,table,user,password,colDefs,colTypes,data, 588 nullMarker=nullMarker)
589 590
591 -def DatabaseToDatabase(fromDb,fromTbl,toDb,toTbl, 592 fields='*',join='',where='', 593 user='sysdba',password='masterkey',keyCol=None,nullMarker='None'):
594 """ 595 596 FIX: at the moment this is a hack 597 598 """ 599 import cStringIO 600 io = cStringIO.StringIO() 601 io.write(DatabaseToText(fromDb,fromTbl,fields=fields,join=join,where=where, 602 user=user,password=password)) 603 io.seek(-1) 604 TextFileToDatabase(toDb,toTbl,io,user=user,password=password,keyCol=keyCol, 605 nullMarker=nullMarker)
606 607 608 if __name__=='__main__': 609 import cStringIO 610 611 io = cStringIO.StringIO() 612 io.write('foo,bar,baz\n') 613 io.write('1,2,3\n') 614 io.write('1.1,4,5\n') 615 io.write('4,foo,6\n') 616 io.seek(0) 617 import RDConfig,os 618 dirLoc = os.path.join(RDConfig.RDCodeDir,'Dbase','TEST.GDB') 619 620 TextFileToDatabase(dirLoc,'fromtext',io) 621