Package Dbase :: Package Pubmed :: Module Searches
[hide private]
[frames] | no frames]

Source Code for Module Dbase.Pubmed.Searches

  1  # $Id: Searches.py 486 2008-01-19 14:19:19Z glandrum $ 
  2  # 
  3  # Copyright (C) 2003-2006 Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved  @@ 
  6  # 
  7  """ Tools for doing PubMed searches and processing the results 
  8   
  9  NOTE: much of the example code in the documentation here uses XML 
 10  files from the test_data directory in order to avoid having to call 
 11  out to PubMed itself.  Actual calls to the functions would not include 
 12  the _conn_ argument. 
 13   
 14  """ 
 15  import RDConfig 
 16  import QueryParams,Records 
 17  import urllib,urllib2 
 18  from xml.etree import ElementTree 
 19   
def GetNumHits(query,url=QueryParams.searchBase):
    """ returns the number of PubMed hits (an int) for the query provided

    Arguments:
      - query: mapping of search parameters (e.g. from QueryParams.details());
        it is not modified by this call
      - url: address the search request is sent to

    Returns the integer parsed from the <Count> element of the response,
    or 0 if that element is missing or empty.

    To do a search, we need a query object:
    >>> query = QueryParams.details()

    set up the search parameters:
    >>> query['term'] = 'penzotti je AND grootenhuis pd'
    >>> query['field'] = 'auth'

    now get the number of hits:
    >>> counts = GetNumHits(query)
    >>> counts
    2

    alternately, we can search using field specifiers:
    >>> query = QueryParams.details()
    >>> query['term'] = 'penzotti je[au] AND hydrogen bonding[mh]'
    >>> counts = GetNumHits(query)
    >>> counts
    3


    """
    # Work on a shallow copy: writing 'rettype' into the caller's query would
    # make any later search that reuses the same object return counts only.
    params = dict(query)
    params['rettype'] = 'count'
    conn = urllib2.urlopen(url, urllib.urlencode(params))
    try:
        pubmed = ElementTree.parse(conn)
    finally:
        # the response is fully parsed at this point; release the socket
        conn.close()
    countText = pubmed.findtext('Count')
    if countText:
        return int(countText)
    return 0
53 54
def GetSearchIds(query,url=QueryParams.searchBase):
    """ returns a tuple of pubmed ids (strings) for the query provided

    Arguments:
      - query: mapping of search parameters (e.g. from QueryParams.details())
      - url: address the search request is sent to

    The ids are the text of every <Id> element in the response, in
    document order.

    To do a search, we need a query object:
    >>> query = QueryParams.details()

    set up the search parameters:
    >>> query['term'] = 'penzotti je AND grootenhuis pd'
    >>> query['field'] = 'auth'

    now get the search ids:
    >>> ids = GetSearchIds(query)
    >>> len(ids)
    2
    >>> ids[0]
    '11960484'
    >>> ids[1]
    '10893315'


    """
    conn = urllib2.urlopen(url, urllib.urlencode(query))
    try:
        pubmed = ElementTree.parse(conn)
    finally:
        # the response is fully parsed at this point; release the socket
        conn.close()
    # 'node' rather than 'id' so the builtin id() is not shadowed
    return tuple([node.text for node in pubmed.getiterator('Id')])
80
def GetSummaries(ids,query=None,url=QueryParams.summaryBase,conn=None):
    """ gets a set of document summary records for the ids provided

    Arguments:
      - ids: a single id or a sequence of ids; each one is converted with
        str() before use.  The object passed in is never modified.
      - query: optional mapping of extra request parameters; when omitted
        QueryParams.details() is used.  It is not modified by this call.
      - url: address the summary request is sent to
      - conn: optional file-like object holding the XML to parse; when it is
        provided no request is made and it is left open for the caller.

    Returns a tuple of Records.SummaryRecord objects, one for each <DocSum>
    element whose PubMedId is among the requested ids.  Each requested id
    is matched at most once.

    >>> ids = ['11960484']
    >>> summs = GetSummaries(ids,conn=open(os.path.join(testDataDir,'summary.xml'),'r'))
    >>> len(summs)
    1
    >>> rec = summs[0]
    >>> isinstance(rec,Records.SummaryRecord)
    1
    >>> rec.PubMedId
    '11960484'
    >>> rec.Authors
    'Penzotti JE, Lamb ML, Evensen E, Grootenhuis PD'
    >>> rec.Title
    'A computational ensemble pharmacophore model for identifying substrates of P-glycoprotein.'
    >>> rec.Source
    'J Med Chem'
    >>> rec.Volume
    '45'
    >>> rec.Pages
    '1737-40'
    >>> rec.HasAbstract
    '1'

    """
    # iter() succeeds on strings, so a lone string id has to be caught
    # explicitly or it would be split into single characters.
    if isinstance(ids, basestring):
        ids = [ids]
    else:
        try:
            iter(ids)
        except TypeError:
            ids = [ids]
    # Private list of string ids: used both to build the request and to
    # filter the response, whether or not conn was supplied.  Entries are
    # removed as they are matched, so this must not alias the caller's list.
    pending = [str(x) for x in ids]

    opened = None
    if not conn:
        if not query:
            params = QueryParams.details()
        else:
            # shallow copy so the caller's query does not pick up an 'id' key
            params = dict(query)
        params['id'] = ','.join(pending)
        conn = opened = urllib2.urlopen(url, urllib.urlencode(params))
    try:
        pubmed = ElementTree.parse(conn)
    finally:
        # only close what was opened here; a supplied conn belongs to the caller
        if opened is not None:
            opened.close()

    res = []
    for summary in pubmed.getiterator('DocSum'):
        rec = Records.SummaryRecord(summary)
        if rec.PubMedId in pending:
            res.append(rec)
            pending.remove(rec.PubMedId)

    return tuple(res)
126
def GetRecords(ids,query=None,url=QueryParams.fetchBase,conn=None):
    """ gets a set of full journal article records for the ids provided

    Arguments:
      - ids: a single id or a sequence of ids; each one is converted with
        str() before use
      - query: optional mapping of extra request parameters; when omitted
        QueryParams.details() is used.  It is not modified by this call.
      - url: address the fetch request is sent to
      - conn: optional file-like object holding the XML to parse; when it is
        provided no request is made and it is left open for the caller.

    Returns a tuple of Records.JournalArticleRecord objects, one for each
    <PubmedArticle> element whose PubMedId is among the requested ids.

    >>> ids = ['11960484']
    >>> recs = GetRecords(ids,conn=open(os.path.join(testDataDir,'records.xml'),'r'))
    >>> len(recs)
    1
    >>> rec = recs[0]
    >>> rec.PubMedId
    '11960484'
    >>> rec.Authors
    u'Penzotti JE, Lamb ML, Evensen E, Grootenhuis PD'
    >>> rec.Title
    u'A computational ensemble pharmacophore model for identifying substrates of P-glycoprotein.'
    >>> rec.Source
    u'J Med Chem'
    >>> rec.Volume
    '45'
    >>> rec.Pages
    '1737-40'
    >>> rec.PubYear
    '2002'
    >>> rec.Abstract[:10]
    u'P-glycopro'

    We've also got access to keywords:
    >>> str(rec.keywords[0])
    'Combinatorial Chemistry Techniques'
    >>> str(rec.keywords[3])
    'Indinavir / chemistry'

    and chemicals:
    >>> rec.chemicals[0]
    'P-Glycoprotein'
    >>> rec.chemicals[2]
    'Nicardipine <55985-32-5>'


    """
    # iter() succeeds on strings, so a lone string id has to be caught
    # explicitly or it would be split into single characters.
    if isinstance(ids, basestring):
        ids = [ids]
    else:
        try:
            iter(ids)
        except TypeError:
            ids = [ids]
    # Record ids are compared as strings below, so normalise up front;
    # otherwise integer ids would be fetched and then filtered out again.
    idStrs = [str(x) for x in ids]
    wanted = set(idStrs)

    opened = None
    if not conn:
        if not query:
            params = QueryParams.details()
        else:
            # shallow copy so the caller's query does not pick up an 'id' key
            params = dict(query)
        params['id'] = ','.join(idStrs)
        conn = opened = urllib2.urlopen(url, urllib.urlencode(params))
    try:
        pubmed = ElementTree.parse(conn)
    finally:
        # only close what was opened here; a supplied conn belongs to the caller
        if opened is not None:
            opened.close()

    res = []
    for article in pubmed.getiterator('PubmedArticle'):
        rec = Records.JournalArticleRecord(article)
        if rec.PubMedId in wanted:
            res.append(rec)
    return tuple(res)
183 205 233 234 235 #------------------------------------ 236 # 237 # doctest boilerplate 238 #
239 -def _test():
240 import doctest,sys 241 return doctest.testmod(sys.modules["__main__"])
if __name__ == '__main__':
    # Both names below must be module-level globals: the doctests in this file
    # call os.path.join(testDataDir, ...) and doctest.testmod() evaluates them
    # against this module's namespace.
    import sys
    import os.path
    testDataDir = os.path.join(RDConfig.RDCodeDir,'Dbase','Pubmed','test_data')
    # _test() returns the doctest.testmod() result, whose first item is the
    # number of failed examples; use it as the exit status (0 == all passed).
    failed = _test()[0]
    sys.exit(failed)