1
2
3
4
5
6
7 """ Tools for doing PubMed searches and processing the results
8
9 NOTE: much of the example code in the documentation here uses XML
10 files from the test_data directory in order to avoid having to call
11 out to PubMed itself. Actual calls to the functions would not include
12 the _conn_ argument.
13
14 """
15 from rdkit import RDConfig
16 import QueryParams,Records
17 import urllib,urllib2
18 from xml.etree import ElementTree
19
def openURL(url,args):
    """ opens a connection to _url_, sending _args_ as the request data

    The request goes through an opener built around a ProxyHandler with
    an empty proxy mapping, i.e. proxy autodetection is disabled for it.

    Arguments:
      - url: the URL to open
      - args: the already-encoded request data (the callers in this
        module pass the output of urllib.urlencode())

    Returns: the file-like response object produced by the opener

    """
    proxy_support = urllib2.ProxyHandler({})
    opener = urllib2.build_opener(proxy_support)
    # Use the opener built above. Calling urllib2.urlopen() here would go
    # through the default opener and silently ignore proxy_support.
    conn = opener.open(url,args)
    return conn
25
27 """ returns the number of hits (an integer) for the query provided
28
29 To do a search, we need a query object:
30 >>> query = QueryParams.details()
31
32 set up the search parameters:
33 >>> query['term'] = 'penzotti je AND grootenhuis pd'
34 >>> query['field'] = 'auth'
35
36 now get the number of hits:
37 >>> counts = GetNumHits(query)
38 >>> counts
39 2
40
41 alternately, we can search using field specifiers:
42 >>> query = QueryParams.details()
43 >>> query['term'] = 'penzotti je[au] AND hydrogen bonding[mh]'
44 >>> counts = GetNumHits(query)
45 >>> counts
46 3
47
48
49 """
50 query['rettype']='count'
51 conn = openURL(url,urllib.urlencode(query))
52 pubmed = ElementTree.parse(conn)
53 countText = pubmed.findtext('Count')
54 if countText:
55 res = int(countText)
56 else:
57 res = 0
58 return res
59
60
62 """ returns a tuple of pubmed ids (strings) for the query provided
63
64 To do a search, we need a query object:
65 >>> query = QueryParams.details()
66
67 set up the search parameters:
68 >>> query['term'] = 'penzotti je AND grootenhuis pd'
69 >>> query['field'] = 'auth'
70
71 now get the search ids:
72 >>> ids = GetSearchIds(query)
73 >>> len(ids)
74 2
75 >>> ids[0]
76 '11960484'
77 >>> ids[1]
78 '10893315'
79
80
81 """
82 conn = openURL(url,urllib.urlencode(query))
83 pubmed = ElementTree.parse(conn)
84 res = [id.text for id in pubmed.getiterator('Id')]
85 return tuple(res)
86
88 """ gets a set of document summary records for the ids provided
89
90 >>> ids = ['11960484']
91 >>> summs = GetSummaries(ids,conn=open(os.path.join(testDataDir,'summary.xml'),'r'))
92 >>> len(summs)
93 1
94 >>> rec = summs[0]
95 >>> isinstance(rec,Records.SummaryRecord)
96 1
97 >>> rec.PubMedId
98 '11960484'
99 >>> rec.Authors
100 'Penzotti JE, Lamb ML, Evensen E, Grootenhuis PD'
101 >>> rec.Title
102 'A computational ensemble pharmacophore model for identifying substrates of P-glycoprotein.'
103 >>> rec.Source
104 'J Med Chem'
105 >>> rec.Volume
106 '45'
107 >>> rec.Pages
108 '1737-40'
109 >>> rec.HasAbstract
110 '1'
111
112 """
113 if not conn:
114 try:
115 iter(ids)
116 except TypeError:
117 ids = [ids,]
118 if not query:
119 query = QueryParams.details()
120 ids = map(str,ids)
121 query['id'] = ','.join(ids)
122 conn = openURL(url,urllib.urlencode(query))
123 pubmed = ElementTree.parse(conn)
124 res = []
125 for summary in pubmed.getiterator('DocSum'):
126 rec = Records.SummaryRecord(summary)
127 if rec.PubMedId in ids:
128 res.append(rec)
129 ids.remove(rec.PubMedId)
130
131 return tuple(res)
132
134 """ gets a set of journal article records for the ids provided
135
136 >>> ids = ['11960484']
137 >>> recs = GetRecords(ids,conn=open(os.path.join(testDataDir,'records.xml'),'r'))
138 >>> len(recs)
139 1
140 >>> rec = recs[0]
141 >>> rec.PubMedId
142 '11960484'
143 >>> rec.Authors
144 u'Penzotti JE, Lamb ML, Evensen E, Grootenhuis PD'
145 >>> rec.Title
146 u'A computational ensemble pharmacophore model for identifying substrates of P-glycoprotein.'
147 >>> rec.Source
148 u'J Med Chem'
149 >>> rec.Volume
150 '45'
151 >>> rec.Pages
152 '1737-40'
153 >>> rec.PubYear
154 '2002'
155 >>> rec.Abstract[:10]
156 u'P-glycopro'
157
158 We've also got access to keywords:
159 >>> str(rec.keywords[0])
160 'Combinatorial Chemistry Techniques'
161 >>> str(rec.keywords[3])
162 'Indinavir / chemistry'
163
164 and chemicals:
165 >>> rec.chemicals[0]
166 'P-Glycoprotein'
167 >>> rec.chemicals[2]
168 'Nicardipine <55985-32-5>'
169
170
171 """
172 if not conn:
173 try:
174 iter(ids)
175 except TypeError:
176 ids = [ids,]
177 if not query:
178 query = QueryParams.details()
179 query['id'] = ','.join(map(str,ids))
180 conn = openURL(url,urllib.urlencode(query))
181
182 pubmed = ElementTree.parse(conn)
183 res = []
184 for article in pubmed.getiterator('PubmedArticle'):
185 rec = Records.JournalArticleRecord(article)
186 if rec.PubMedId in ids:
187 res.append(rec)
188 return tuple(res)
189
191 if not conn:
192 try:
193 iter(ids)
194 except TypeError:
195 ids = [ids,]
196 if not query:
197 query = QueryParams.details()
198 query['id'] = ','.join(map(str,ids))
199 conn = openURL(url,urllib.urlencode(query))
200 query['cmd'] = 'ncheck'
201 pubmed = ElementTree.parse(conn)
202
203 checklist = pubmed.find('LinkSet/IdCheckList')
204 recs = [Records.LinkRecord(x) for x in checklist.getiterator('Id')]
205
206 res = {}
207 for rec in recs:
208 id = rec.PubMedId
209 res[id] = rec.HasNeighbor
210 return res
211
213 if not conn:
214 try:
215 iter(ids)
216 except TypeError:
217 ids = [ids,]
218 if not query:
219 query = QueryParams.details()
220 query['id'] = ','.join(map(str,ids))
221 conn = openURL(url,urllib.urlencode(query))
222 query['cmd'] = 'neighbor'
223
224 pubmed = ElementTree.parse(conn)
225 linkset = pubmed.find('LinkSet/LinkSetDb')
226 scores = []
227 scoreNorm = 1.0
228 for link in linkset.getiterator('Link'):
229 id = link.findtext('Id')
230 score = float(link.findtext('Score'))
231 scores.append([id,score])
232
233 if id == ids[0]:
234 scoreNorm = score
235 for i in range(len(scores)):
236 id,score = scores[i]
237 scores[i] = id,score/scoreNorm
238 return tuple(scores)
239
240
241
242
243
244
def _test():
    """ runs the doctests of the module registered as "__main__"

    Returns: the result of doctest.testmod() for that module

    """
    import sys
    import doctest
    mainModule = sys.modules["__main__"]
    return doctest.testmod(mainModule)
248
# Script entry point: run the doctests and report the outcome through the
# process exit status.
if __name__ == '__main__':
    import os.path
    import sys
    # the doctest examples refer to this name when opening their XML files
    testDataDir = os.path.join(RDConfig.RDCodeDir,'Dbase','Pubmed','test_data')
    failed,tried = _test()
    # the failure count returned by _test() becomes the exit status
    sys.exit(failed)
254
255
256
257
258
259
260 ids = ['11666868','11169640']
261 if 0:
262 summs = GetSummaries(ids,conn=open('summary.xml','r'))
263 print 'summs:',summs
264 for summary in summs:
265 print summary.Authors
266 print '\t',summary.Title
267 print '\t',summary.Source,
268 print summary.Volume,
269 print summary.Pages,
270 print summary.PubDate
271
272 if 0:
273 ids = ['11666868']
274 res = GetRecords(ids,conn=open('records.xml','r'))
275 for record in res:
276 print record.Authors
277 print '\t',record.Title
278 print '\t',record.Journal,
279 print record.Volume,
280 print record.Pages,
281 print record.PubYear
282 print
283
284 if 0:
285 ids = ['11666868','11169640']
286 res = CheckForLinks(ids,conn=open('haslinks.xml','r'))
287 print res
288
289 if 0:
290 ids = ['11666868']
291 res = GetLinks(ids,conn=open('links.xml','r'))
292
293 for id,score in res[:10]:
294 print id,score
295