1
2
3
4
5
6
7 """ a set of functions for interacting with databases
8
9 When possible, it's probably preferable to use a _DbConnection.DbConnect_ object
10
11 """
12 from rdkit import RDConfig
13 try:
14 from rdkit.Excel import ExcelWrapper
15 except:
16 haveExcel = 0
17 else:
18 haveExcel = 1
19
20 from rdkit.Dbase.DbResultSet import DbResultSet,RandomAccessDbResultSet
22 return map(lambda x,y=fromL:y[x],what)
23
24 from rdkit.Dbase import DbModule
25 import sys,types,string
26 from rdkit.Dbase import DbInfo
27
28 -def GetColumns(dBase,table,fieldString,user='sysdba',password='masterkey',
29 join='',cn=None):
30 """ gets a set of data from a table
31
32 **Arguments**
33
34 - dBase: database name
35
36 - table: table name
37
38 - fieldString: a string with the names of the fields to be extracted,
39 this should be a comma delimited list
40
41 - user and password:
42
43 - join: a join clause (omit the verb 'join')
44
45
46 **Returns**
47
48 - a list of the data
49
50 """
51 if not cn:
52 cn = DbModule.connect(dBase,user,password)
53 c = cn.cursor()
54 cmd = 'select %s from %s'%(fieldString,table)
55 if join:
56 if join.strip().find('join') != 0:
57 join = 'join %s'%(join)
58 cmd +=' ' + join
59 c.execute(cmd)
60 return c.fetchall()
61
62 -def GetData(dBase,table,fieldString='*',whereString='',user='sysdba',password='masterkey',
63 removeDups=-1,join='',forceList=0,transform=None,randomAccess=1,extras=None,cn=None):
64 """ a more flexible method to get a set of data from a table
65
66 **Arguments**
67
68 - fields: a string with the names of the fields to be extracted,
69 this should be a comma delimited list
70
71 - where: the SQL where clause to be used with the DB query
72
73 - removeDups indicates the column which should be used to screen
74 out duplicates. Only the first appearance of a duplicate will
75 be left in the dataset.
76
77 **Returns**
78
79 - a list of the data
80
81
82 **Notes**
83
84 - EFF: this isn't particularly efficient
85
86 """
87 if not cn:
88 cn = DbModule.connect(dBase,user,password)
89 c = cn.cursor()
90 cmd = 'select %s from %s'%(fieldString,table)
91 if join:
92 if join.strip().find('join') != 0:
93 join = 'join %s'%(join)
94 cmd += ' ' + join
95 if whereString:
96 if whereString.strip().find('where')!=0:
97 whereString = 'where %s'%(whereString)
98 cmd += ' ' + whereString
99
100 if forceList:
101 try:
102 if not extras:
103 c.execute(cmd)
104 else:
105 c.execute(cmd,extras)
106 except:
107 sys.stderr.write('the command "%s" generated errors:\n'%(cmd))
108 import traceback
109 traceback.print_exc()
110 return None
111 if transform is not None:
112 raise ValueError,'forceList and transform arguments are not compatible'
113 if not randomAccess:
114 raise ValueError,'when forceList is set, randomAccess must also be used'
115 data = c.fetchall()
116 if removeDups>0:
117 seen = []
118 for entry in data[:]:
119 if entry[removeDups] in seen:
120 data.remove(entry)
121 else:
122 seen.append(entry[removeDups])
123 else:
124 if randomAccess:
125 klass = RandomAccessDbResultSet
126 else:
127 klass = DbResultSet
128
129 data = klass(c,cn,cmd,removeDups=removeDups,transform=transform,extras=extras)
130
131 return data
132
133
134 -def DatabaseToExcel(dBase,table,fields='*',join='',where='',wrapper=None,
135 user='sysdba',password='masterkey',lowMemory=False,cn=None):
136 """ Pulls the contents of a database and puts them in an Excel worksheet
137
138 **Arguments**
139 - dBase: the name of the DB file to be used
140
141 - table: the name of the table to query
142
143 - fields: the fields to select with the SQL query
144
145 - join: the join clause of the SQL query
146 (e.g. 'join foo on foo.bar=base.bar')
147
148 - where: the where clause of the SQL query
149 (e.g. 'where foo = 2' or 'where bar > 17.6')
150
151 - wrapper: an _Excel.ExcelWrapper.ExcelWrapper_ to be used
152 in interacting with Excel
153
154 - user: the username for DB access
155
156 - password: the password to be used for DB access
157
158 """
159 if not haveExcel:
160 return
161 if wrapper is None:
162 wrapper = ExcelWrapper.ExcelWrapper()
163 wrapper.Visible = 1
164 if len(where) and where.strip().find('where') != 0:
165 where = 'where %s'%(where)
166 if len(join) and join.strip().find('join') != 0:
167 join = 'join %s'%(join)
168
169 sqlCommand = 'select %s from %s %s %s'%(fields,table,join,where)
170 if not cn:
171 cn = DbModule.connect(dBase,user,password)
172 c = cn.cursor()
173 try:
174 c.execute(sqlCommand)
175 except:
176 print 'problems executing SQL statement %s'%(repr(sqlCommand))
177 import sys,traceback
178 traceback.print_exc()
179 return
180 headers = []
181 colsToTake = []
182 strCols = []
183
184
185 for i in range(len(c.description)):
186 item = c.description[i]
187 if item[1] not in DbInfo.sqlBinTypes:
188 colsToTake.append(i)
189 headers.append(item[0])
190 if item[1] in DbInfo.sqlTextTypes:
191 strCols.append(len(colsToTake)-1)
192 wrapper.Workbooks.Add()
193
194 r = wrapper.GetRange(1,1,1,len(headers))
195
196 r.Value = headers
197
198 r.Font.Bold = 1
199
200
201 results = c.fetchall()
202 row = 2
203 fullData = []
204 for res in results:
205 vs = [res[x] for x in colsToTake]
206 for col in strCols:
207 vs[col] = "'%s"%vs[col]
208 if not lowMemory:
209 fullData.append(vs)
210 else:
211 wrapper[row,1:len(headers)]=vs
212 row+=1
213 if not lowMemory:
214 wrapper[row:row+len(fullData),1:len(headers)] = fullData
215
216
217
218 -def DatabaseToText(dBase,table,fields='*',join='',where='',wrapper=None,
219 user='sysdba',password='masterkey',delim=',',cn=None):
220 """ Pulls the contents of a database and makes a deliminted text file from them
221
222 **Arguments**
223 - dBase: the name of the DB file to be used
224
225 - table: the name of the table to query
226
227 - fields: the fields to select with the SQL query
228
229 - join: the join clause of the SQL query
230 (e.g. 'join foo on foo.bar=base.bar')
231
232 - where: the where clause of the SQL query
233 (e.g. 'where foo = 2' or 'where bar > 17.6')
234
235 - wrapper: an _Excel.ExcelWrapper.ExcelWrapper_ to be used
236 in interacting with Excel
237
238 - user: the username for DB access
239
240 - password: the password to be used for DB access
241
242 **Returns**
243
244 - the CSV data (as text)
245
246 """
247 if len(where) and where.strip().find('where')==-1:
248 where = 'where %s'%(where)
249 if len(join) and join.strip().find('join') == -1:
250 join = 'join %s'%(join)
251 sqlCommand = 'select %s from %s %s %s'%(fields,table,join,where)
252 if not cn:
253 cn = DbModule.connect(dBase,user,password)
254 c = cn.cursor()
255 c.execute(sqlCommand)
256 headers = []
257 colsToTake = []
258
259
260 for i in range(len(c.description)):
261 item = c.description[i]
262 if item[1] not in DbInfo.sqlBinTypes:
263 colsToTake.append(i)
264 headers.append(item[0])
265
266 lines = []
267 lines.append(delim.join(headers))
268
269
270 results = c.fetchall()
271 for res in results:
272 d = _take(res,colsToTake)
273 lines.append(delim.join(map(str,d)))
274
275 return '\n'.join(lines)
276
277
279 """
280
281 finds the types of the columns in _data_
282
283 if nullMarker is not None, elements of the data table which are
284 equal to nullMarker will not count towards setting the type of
285 their columns.
286
287 """
288 priorities={types.FloatType:3,types.IntType:2,types.StringType:1,-1:-1}
289 res = [None]*nCols
290 for col in xrange(nCols):
291 typeHere = [-1,1]
292 for row in xrange(nRows):
293 d = data[row][col]
294 if d is not None:
295 locType = type(d)
296 if locType != types.FloatType and locType != types.IntType:
297 locType = types.StringType
298 try:
299 d = str(d)
300 except UnicodeError,msg:
301 print 'cannot convert text from row %d col %d to a string'%(row+2,col)
302 print '\t>%s'%(repr(d))
303 raise UnicodeError,msg
304 else:
305 typeHere[1] = max(typeHere[1],len(str(d)))
306 if locType == types.StringType:
307 if nullMarker is None or d != nullMarker:
308 l = max(len(d),typeHere[1])
309 typeHere = [types.StringType,l]
310 else:
311 try:
312 fD = float(int(d))
313 except OverflowError:
314 locType = types.FloatType
315 else:
316 if fD == d:
317 locType = types.IntType
318 if typeHere[0]!=types.StringType and \
319 priorities[locType] > priorities[typeHere[0]]:
320 typeHere[0] = locType
321 res[col] = typeHere
322 return res
323
325 """This is kind of crude hack to automagically determine the types
326 of columns in the active Excel sheet
327
328 **Arguments**
329
330 - wrapper: the _ExcelWrapper_ to be used in interacting with Excel
331
332 - nullMarker: (optional) if this is not None, elements of the
333 data table which are equal to nullMarker will not count towards
334 setting the type of their columns.
335
336 **Returns**
337
338 - a list of the types of each column
339
340 **Note**
341
342 - we make the assumption that there are only three possible types: int,
343 float and string.
344
345 """
346 nCols = wrapper.FindLastCol(1,1)
347 nRows = wrapper.FindLastRow(1,1)
348 dList = wrapper[2:nRows,1:nCols]
349 res = TypeFinder(dList,nRows-1,nCols,nullMarker=nullMarker)
350 return res
351
352
354 """ *For Internal Use*
355
356 removes illegal characters from column headings
357 and truncates those which are too long.
358
359 """
360 for i in xrange(len(colHeadings)):
361
362 colHeadings[i] = string.strip(colHeadings[i])
363 colHeadings[i] = string.replace(colHeadings[i],' ','_')
364 colHeadings[i] = string.replace(colHeadings[i],'-','_')
365 colHeadings[i] = string.replace(colHeadings[i],'.','_')
366
367 if len(colHeadings[i]) > maxColLabelLen:
368
369 newHead = string.replace(colHeadings[i],'_','')
370 newHead = newHead[:maxColLabelLen]
371 print '\tHeading %s too long, changed to %s'%(colHeadings[i],newHead)
372 colHeadings[i] = newHead
373 return colHeadings
374
376 """ returns a list of SQL type strings
377 """
378 typeStrs=[]
379 for i in xrange(len(colTypes)):
380 type = colTypes[i]
381 if type[0] == types.FloatType:
382 typeStrs.append('%s double precision'%colHeadings[i])
383 elif type[0] == types.IntType:
384 typeStrs.append('%s integer'%colHeadings[i])
385 else:
386 typeStrs.append('%s varchar(%d)'%(colHeadings[i],type[1]))
387 if colHeadings[i] == keyCol:
388 typeStrs[-1] = '%s not null primary key'%(typeStrs[-1])
389 return typeStrs
390
392 try:
393 conn.cursor().executemany(sqlStr,block)
394 except:
395 res = 0
396 conn.commit()
397 for row in block:
398 try:
399 conn.cursor().execute(sqlStr,tuple(row))
400 res += 1
401 except:
402 if not silent:
403 import traceback
404 traceback.print_exc()
405 print 'insert failed:',sqlStr
406 print '\t',repr(row)
407 else:
408 conn.commit()
409 else:
410 res = len(block)
411 return res
412
413 -def _AddDataToDb(dBase,table,user,password,colDefs,colTypes,data,
414 nullMarker=None,blockSize=100,cn=None):
415 """ *For Internal Use*
416
417 (drops and) creates a table and then inserts the values
418
419 """
420 if not cn:
421 cn = DbModule.connect(dBase,user,password)
422 c = cn.cursor()
423 try:
424 c.execute('drop table %s'%(table))
425 except:
426 print 'cannot drop table %s'%(table)
427 try:
428 sqlStr = 'create table %s (%s)'%(table,colDefs)
429 c.execute(sqlStr)
430 except:
431 print 'create table failed: ', sqlStr
432 print 'here is the exception:'
433 import traceback
434 traceback.print_exc()
435 return
436 cn.commit()
437 c = None
438
439 block = []
440 entryTxt = [DbModule.placeHolder]*len(data[0])
441 dStr = ','.join(entryTxt)
442 sqlStr = 'insert into %s values (%s)'%(table,dStr)
443 nDone = 0
444 for row in data:
445 entries = [None]*len(row)
446 for col in xrange(len(row)):
447 if row[col] is not None and \
448 (nullMarker is None or row[col] != nullMarker):
449 if colTypes[col][0] == types.FloatType:
450 entries[col] = float(row[col])
451 elif colTypes[col][0] == types.IntType:
452 entries[col] = int(row[col])
453 else:
454 entries[col] = str(row[col])
455 else:
456 entries[col] = None
457 block.append(tuple(entries))
458 if len(block)>=blockSize:
459 nDone += _insertBlock(cn,sqlStr,block)
460 if not hasattr(cn,'autocommit') or not cn.autocommit:
461 cn.commit()
462 block = []
463 if len(block):
464 nDone += _insertBlock(cn,sqlStr,block)
465 if not hasattr(cn,'autocommit') or not cn.autocommit:
466 cn.commit()
467
468
469
470 -def ExcelToDatabase(dBase,table,wrapper=None,user='sysdba',password='masterkey',
471 maxColLabelLen=31,keyCol=None,nullMarker=None,force=0):
472 """convert the active excel worksheet into a database.
473
474 this isn't as smooth or slick as the conversion the other way... sad.
475
476 **Arguments**
477
478 - dBase: the name of the DB to use
479
480 - table: the name of the table to create/overwrite
481
482 - wrapper: the _ExcelWrapper_ to use
483
484 - user: the user name to use in connecting to the DB
485
486 - password: the password to use in connecting to the DB
487
488 - maxColLabelLen: the maximum length a column label should be
489 allowed to have (truncation otherwise)
490
491 - keyCol: the column to be used as an index for the db
492
493 **Notes**
494
495 - if _table_ already exists, it is destroyed before we write
496 the new data
497
498 """
499 if not haveExcel:
500 return
501 table.replace('-','_')
502 table.replace(' ','_')
503 if not force:
504 tblNames = [x.strip() for x in DbInfo.GetTableNames(dBase)]
505 tmp = table.upper()
506 if tmp in tblNames:
507 resp = raw_input('Table %s already exists, overwrite it? '%(table))
508 if not resp or resp[0] not in ['Y','y']:
509 print 'cancelled'
510 return
511
512 if wrapper is None:
513 wrapper = ExcelWrapper.ExcelWrapper()
514 colHeadings = wrapper.GetHeadings()
515 _AdjustColHeadings(colHeadings,maxColLabelLen)
516 nCols = len(colHeadings)
517 nRows = wrapper.FindLastRow(1,1)
518 data = wrapper[2:nRows,1:nCols]
519
520 colTypes = TypeFinder(data,nRows-1,nCols,nullMarker=nullMarker)
521 typeStrs = GetTypeStrings(colHeadings,colTypes,keyCol=keyCol)
522 colDefs=','.join(typeStrs)
523
524 _AddDataToDb(dBase,table,user,password,colDefs,colTypes,data,nullMarker=nullMarker)
525
526 -def TextFileToDatabase(dBase,table,inF,delim=',',
527 user='sysdba',password='masterkey',
528 maxColLabelLen=31,keyCol=None,nullMarker=None):
529 """loads the contents of the text file into a database.
530
531 **Arguments**
532
533 - dBase: the name of the DB to use
534
535 - table: the name of the table to create/overwrite
536
537 - inF: the file like object from which the data should
538 be pulled (must support readline())
539
540 - delim: the delimiter used to separate fields
541
542 - user: the user name to use in connecting to the DB
543
544 - password: the password to use in connecting to the DB
545
546 - maxColLabelLen: the maximum length a column label should be
547 allowed to have (truncation otherwise)
548
549 - keyCol: the column to be used as an index for the db
550
551 **Notes**
552
553 - if _table_ already exists, it is destroyed before we write
554 the new data
555
556 - we assume that the first row of the file contains the column names
557
558 """
559 table.replace('-','_')
560 table.replace(' ','_')
561
562 colHeadings = inF.readline().split(delim)
563 _AdjustColHeadings(colHeadings,maxColLabelLen)
564 nCols = len(colHeadings)
565 data = []
566 inL = inF.readline()
567 while inL:
568 inL = inL.replace('\r','')
569 inL = inL.replace('\n','')
570 splitL = inL.split(delim)
571 if len(splitL)!=nCols:
572 print '>>>',repr(inL)
573 assert len(splitL)==nCols,'unequal length'
574 tmpVect = []
575 for entry in splitL:
576 try:
577 val = int(entry)
578 except:
579 try:
580 val = float(entry)
581 except:
582 val = entry
583 tmpVect.append(val)
584 data.append(tmpVect)
585 inL = inF.readline()
586 nRows = len(data)
587
588
589 colTypes = TypeFinder(data,nRows,nCols,nullMarker=nullMarker)
590 typeStrs = GetTypeStrings(colHeadings,colTypes,keyCol=keyCol)
591 colDefs=','.join(typeStrs)
592
593 _AddDataToDb(dBase,table,user,password,colDefs,colTypes,data,
594 nullMarker=nullMarker)
595
596
597 -def DatabaseToDatabase(fromDb,fromTbl,toDb,toTbl,
598 fields='*',join='',where='',
599 user='sysdba',password='masterkey',keyCol=None,nullMarker='None'):
600 """
601
602 FIX: at the moment this is a hack
603
604 """
605 import cStringIO
606 io = cStringIO.StringIO()
607 io.write(DatabaseToText(fromDb,fromTbl,fields=fields,join=join,where=where,
608 user=user,password=password))
609 io.seek(-1)
610 TextFileToDatabase(toDb,toTbl,io,user=user,password=password,keyCol=keyCol,
611 nullMarker=nullMarker)
612
613
614 if __name__=='__main__':
615 import cStringIO
616
617 io = cStringIO.StringIO()
618 io.write('foo,bar,baz\n')
619 io.write('1,2,3\n')
620 io.write('1.1,4,5\n')
621 io.write('4,foo,6\n')
622 io.seek(0)
623 from rdkit import RDConfig
624 import os
625 dirLoc = os.path.join(RDConfig.RDCodeDir,'Dbase','TEST.GDB')
626
627 TextFileToDatabase(dirLoc,'fromtext',io)
628