1
2
3
4
5
6
7 """ Tools for doing PubMed searches and processing the results
8
9 NOTE: much of the example code in the documentation here uses XML
10 files from the test_data directory in order to avoid having to call
11 out to PubMed itself. Actual calls to the functions would not include
12 the _conn_ argument.
13
14 """
15 import RDConfig
16 import QueryParams,Records
17 import urllib,urllib2
18 from xml.etree import ElementTree
19
def GetNumHits(query, url=None):
    """ returns the number of hits (an int) that PubMed reports for the query provided

    To do a search, we need a query object:
    >>> query = QueryParams.details()

    set up the search parameters:
    >>> query['term'] = 'penzotti je AND grootenhuis pd'
    >>> query['field'] = 'auth'

    now get the number of hits:
    >>> counts = GetNumHits(query)
    >>> counts
    2

    alternately, we can search using field specifiers:
    >>> query = QueryParams.details()
    >>> query['term'] = 'penzotti je[au] AND hydrogen bonding[mh]'
    >>> counts = GetNumHits(query)
    >>> counts
    3

    Arguments:
      - query: the search parameters; it is url-encoded and sent as the
        request body.  The object passed in is left unmodified.
      - url: the address the request is sent to.

    Returns 0 if the response has no (or an empty) Count element.

    """
    import copy

    # NOTE(review): the def line of this function was missing from the listing
    # this was recovered from.  The signature is reconstructed from the names
    # the body uses; the default endpoint is presumed to be
    # QueryParams.searchBase -- confirm against QueryParams.
    if url is None:
        url = QueryParams.searchBase

    # work on a copy: setting 'rettype' on the caller's query would silently
    # turn any later id search that reuses the same object into a count query
    params = copy.copy(query)
    params['rettype'] = 'count'

    conn = urllib2.urlopen(url, urllib.urlencode(params))
    try:
        pubmed = ElementTree.parse(conn)
    finally:
        # we opened the connection, so we are responsible for closing it
        conn.close()

    countText = pubmed.findtext('Count')
    return int(countText) if countText else 0
53
54
def GetSearchIds(query, url=None):
    """ returns a tuple of pubmed ids (strings) for the query provided

    To do a search, we need a query object:
    >>> query = QueryParams.details()

    set up the search parameters:
    >>> query['term'] = 'penzotti je AND grootenhuis pd'
    >>> query['field'] = 'auth'

    now get the search ids:
    >>> ids = GetSearchIds(query)
    >>> len(ids)
    2
    >>> ids[0]
    '11960484'
    >>> ids[1]
    '10893315'

    Arguments:
      - query: the search parameters; it is url-encoded and sent as the
        request body.
      - url: the address the request is sent to.

    The ids are the text of every Id element in the response, in document
    order.

    """
    # NOTE(review): the def line of this function was missing from the listing
    # this was recovered from.  The signature is reconstructed from the names
    # the body uses; the default endpoint is presumed to be
    # QueryParams.searchBase -- confirm against QueryParams.
    if url is None:
        url = QueryParams.searchBase

    conn = urllib2.urlopen(url, urllib.urlencode(query))
    try:
        pubmed = ElementTree.parse(conn)
    finally:
        # we opened the connection, so we are responsible for closing it
        conn.close()

    # iter() supersedes the deprecated getiterator(); fall back to the old
    # name on interpreters that predate iter()
    walk = getattr(pubmed, 'iter', None) or pubmed.getiterator
    return tuple(node.text for node in walk('Id'))
80
def GetSummaries(ids, query=None, url=None, conn=None):
    """ gets a set of document summary records for the ids provided

    >>> ids = ['11960484']
    >>> summs = GetSummaries(ids,conn=open(os.path.join(testDataDir,'summary.xml'),'r'))
    >>> len(summs)
    1
    >>> rec = summs[0]
    >>> isinstance(rec,Records.SummaryRecord)
    1
    >>> rec.PubMedId
    '11960484'
    >>> rec.Authors
    'Penzotti JE, Lamb ML, Evensen E, Grootenhuis PD'
    >>> rec.Title
    'A computational ensemble pharmacophore model for identifying substrates of P-glycoprotein.'
    >>> rec.Source
    'J Med Chem'
    >>> rec.Volume
    '45'
    >>> rec.Pages
    '1737-40'
    >>> rec.HasAbstract
    '1'

    Arguments:
      - ids: a single id or an iterable of ids; each is compared as a string.
        The object passed in is not modified.
      - query: optional query object; a fresh QueryParams.details() is used
        if none is provided.  Its 'id' entry is set to the requested ids.
      - url: the address the request is sent to (ignored if conn is given).
      - conn: optional file-like object to read the XML from instead of
        contacting the server.  It is left open for the caller to close.

    Returns a tuple of Records.SummaryRecord objects, one for each DocSum in
    the response whose id was requested (each requested id is used once).

    """
    # NOTE(review): the def line of this function was missing from the listing
    # this was recovered from.  The signature is reconstructed from the names
    # the body uses and from the doctest call; the default endpoint is
    # presumed to be QueryParams.summaryBase -- confirm against QueryParams.

    # a bare string is a single id, not a sequence of one-character ids;
    # (str, type(u'')) covers both string types on Python 2
    if isinstance(ids, (str, type(u''))):
        ids = [ids]
    else:
        try:
            iter(ids)
        except TypeError:
            ids = [ids]
    # normalise on both code paths so that the filter below behaves the same
    # whether or not a connection was supplied
    wanted = [str(x) for x in ids]

    openedHere = False
    if not conn:
        if url is None:
            url = QueryParams.summaryBase
        if not query:
            query = QueryParams.details()
        query['id'] = ','.join(wanted)
        conn = urllib2.urlopen(url, urllib.urlencode(query))
        openedHere = True
    try:
        pubmed = ElementTree.parse(conn)
    finally:
        # only close what we opened; a caller-supplied conn stays theirs
        if openedHere:
            conn.close()

    # iter() supersedes the deprecated getiterator(); fall back to the old
    # name on interpreters that predate iter()
    walk = getattr(pubmed, 'iter', None) or pubmed.getiterator

    # private working copy: ids are crossed off as they are matched, and the
    # caller's sequence must not shrink as a side effect
    remaining = list(wanted)
    res = []
    for summary in walk('DocSum'):
        rec = Records.SummaryRecord(summary)
        if rec.PubMedId in remaining:
            res.append(rec)
            remaining.remove(rec.PubMedId)

    return tuple(res)
126
def GetRecords(ids, query=None, url=None, conn=None):
    """ gets a set of journal article records for the ids provided

    >>> ids = ['11960484']
    >>> recs = GetRecords(ids,conn=open(os.path.join(testDataDir,'records.xml'),'r'))
    >>> len(recs)
    1
    >>> rec = recs[0]
    >>> rec.PubMedId
    '11960484'
    >>> rec.Authors
    u'Penzotti JE, Lamb ML, Evensen E, Grootenhuis PD'
    >>> rec.Title
    u'A computational ensemble pharmacophore model for identifying substrates of P-glycoprotein.'
    >>> rec.Source
    u'J Med Chem'
    >>> rec.Volume
    '45'
    >>> rec.Pages
    '1737-40'
    >>> rec.PubYear
    '2002'
    >>> rec.Abstract[:10]
    u'P-glycopro'

    We've also got access to keywords:
    >>> str(rec.keywords[0])
    'Combinatorial Chemistry Techniques'
    >>> str(rec.keywords[3])
    'Indinavir / chemistry'

    and chemicals:
    >>> rec.chemicals[0]
    'P-Glycoprotein'
    >>> rec.chemicals[2]
    'Nicardipine <55985-32-5>'

    Arguments:
      - ids: a single id or an iterable of ids; each is compared as a string.
      - query: optional query object; a fresh QueryParams.details() is used
        if none is provided.  Its 'id' entry is set to the requested ids.
      - url: the address the request is sent to (ignored if conn is given).
      - conn: optional file-like object to read the XML from instead of
        contacting the server.  It is left open for the caller to close.

    Returns a tuple of Records.JournalArticleRecord objects, one for each
    PubmedArticle in the response whose id was requested.

    """
    # NOTE(review): the def line of this function was missing from the listing
    # this was recovered from.  The signature is reconstructed from the names
    # the body uses and from the doctest call; the default endpoint is
    # presumed to be QueryParams.fetchBase -- confirm against QueryParams.

    # a bare string is a single id, not a sequence of one-character ids;
    # (str, type(u'')) covers both string types on Python 2
    if isinstance(ids, (str, type(u''))):
        ids = [ids]
    else:
        try:
            iter(ids)
        except TypeError:
            ids = [ids]
    # record ids are strings, so the requested ids have to be strings too or
    # integer ids would never match and every record would be dropped
    idStrs = [str(x) for x in ids]
    wanted = set(idStrs)

    openedHere = False
    if not conn:
        if url is None:
            url = QueryParams.fetchBase
        if not query:
            query = QueryParams.details()
        query['id'] = ','.join(idStrs)
        conn = urllib2.urlopen(url, urllib.urlencode(query))
        openedHere = True
    try:
        pubmed = ElementTree.parse(conn)
    finally:
        # only close what we opened; a caller-supplied conn stays theirs
        if openedHere:
            conn.close()

    # iter() supersedes the deprecated getiterator(); fall back to the old
    # name on interpreters that predate iter()
    walk = getattr(pubmed, 'iter', None) or pubmed.getiterator

    res = []
    for article in walk('PubmedArticle'):
        rec = Records.JournalArticleRecord(article)
        if rec.PubMedId in wanted:
            res.append(rec)
    return tuple(res)
183
def CheckForLinks(ids, query=None, url=None, conn=None):
    """ reports, for each id, whether or not it has neighbor links

    Arguments:
      - ids: a single id or an iterable of ids.
      - query: optional query object; a fresh QueryParams.details() is used
        if none is provided.  Its 'id' and 'cmd' entries are set here.
      - url: the address the request is sent to (ignored if conn is given).
      - conn: optional file-like object to read the XML from instead of
        contacting the server.  It is left open for the caller to close.

    Returns a dictionary keyed by PubMedId holding the HasNeighbor value of
    the Records.LinkRecord built from each Id element of the response's
    LinkSet/IdCheckList.  The dictionary is empty if the response has no
    such list.

    """
    # NOTE(review): the def line of this function was missing from the listing
    # this was recovered from.  The signature is reconstructed from the names
    # the body uses and from a call elsewhere in the file; the default
    # endpoint is presumed to be QueryParams.linkBase -- confirm against
    # QueryParams.
    openedHere = False
    if not conn:
        # a bare string is a single id, not a sequence of one-character ids;
        # (str, type(u'')) covers both string types on Python 2
        if isinstance(ids, (str, type(u''))):
            ids = [ids]
        else:
            try:
                iter(ids)
            except TypeError:
                ids = [ids]
        if url is None:
            url = QueryParams.linkBase
        if not query:
            query = QueryParams.details()
        query['id'] = ','.join(map(str, ids))
        # the command has to be in place *before* the query is encoded and
        # sent, otherwise the server never sees it
        query['cmd'] = 'ncheck'
        conn = urllib2.urlopen(url, urllib.urlencode(query))
        openedHere = True
    try:
        pubmed = ElementTree.parse(conn)
    finally:
        # only close what we opened; a caller-supplied conn stays theirs
        if openedHere:
            conn.close()

    checklist = pubmed.find('LinkSet/IdCheckList')
    if checklist is None:
        # nothing to report rather than an AttributeError on None
        return {}

    # iter() supersedes the deprecated getiterator(); fall back to the old
    # name on interpreters that predate iter()
    walk = getattr(checklist, 'iter', None) or checklist.getiterator

    res = {}
    for node in walk('Id'):
        rec = Records.LinkRecord(node)
        res[rec.PubMedId] = rec.HasNeighbor
    return res
205
def GetLinks(ids, query=None, url=None, conn=None):
    """ gets the ids linked to the ids provided, along with normalized scores

    Arguments:
      - ids: a single id or an iterable of ids; each is compared as a string.
      - query: optional query object; a fresh QueryParams.details() is used
        if none is provided.  Its 'id' and 'cmd' entries are set here.
      - url: the address the request is sent to (ignored if conn is given).
      - conn: optional file-like object to read the XML from instead of
        contacting the server.  It is left open for the caller to close.

    Returns a tuple of (id, score) 2-tuples, one per Link element found in
    the response's LinkSet/LinkSetDb, in document order.  Scores are divided
    by the score of the link whose id equals the first query id; if that id
    is not among the links the scores are returned as they are.  The tuple
    is empty if the response has no LinkSetDb.

    """
    # NOTE(review): the def line of this function was missing from the listing
    # this was recovered from.  The signature is reconstructed from the names
    # the body uses and from a call elsewhere in the file; the default
    # endpoint is presumed to be QueryParams.linkBase -- confirm against
    # QueryParams.

    # a bare string is a single id, not a sequence of one-character ids;
    # (str, type(u'')) covers both string types on Python 2
    if isinstance(ids, (str, type(u''))):
        ids = [ids]
    else:
        try:
            iter(ids)
        except TypeError:
            ids = [ids]
    # normalise on both code paths: link ids come out of the XML as strings,
    # so the reference id has to be a string for the comparison below to work
    idStrs = [str(x) for x in ids]

    openedHere = False
    if not conn:
        if url is None:
            url = QueryParams.linkBase
        if not query:
            query = QueryParams.details()
        query['id'] = ','.join(idStrs)
        # the command has to be in place *before* the query is encoded and
        # sent, otherwise the server never sees it
        query['cmd'] = 'neighbor'
        conn = urllib2.urlopen(url, urllib.urlencode(query))
        openedHere = True
    try:
        pubmed = ElementTree.parse(conn)
    finally:
        # only close what we opened; a caller-supplied conn stays theirs
        if openedHere:
            conn.close()

    linkset = pubmed.find('LinkSet/LinkSetDb')
    if linkset is None:
        # no links at all rather than an AttributeError on None
        return ()

    # iter() supersedes the deprecated getiterator(); fall back to the old
    # name on interpreters that predate iter()
    walk = getattr(linkset, 'iter', None) or linkset.getiterator

    # we normalize scores by the score for the first of the query ids
    refId = idStrs[0] if idStrs else None
    scoreNorm = 1.0
    scores = []
    for link in walk('Link'):
        linkId = link.findtext('Id')
        score = float(link.findtext('Score'))
        scores.append((linkId, score))
        if linkId == refId:
            scoreNorm = score

    return tuple((linkId, score / scoreNorm) for linkId, score in scores)
233
234
235
236
237
238
240 import doctest,sys
241 return doctest.testmod(sys.modules["__main__"])
242
if __name__ == '__main__':
    # the doctests refer to `os` and `testDataDir` as module-level names, so
    # both have to be bound here before the tests are run
    import sys
    import os.path
    testDataDir = os.path.join(RDConfig.RDCodeDir, 'Dbase', 'Pubmed', 'test_data')
    failed, tried = _test()
    # the number of failed doctests becomes the exit status
    sys.exit(failed)
248
249
250
251
252
253
# Scratch examples that run the query functions against local XML files
# instead of the live server (each call passes conn=open(...)).
# Every section below is wrapped in `if 0:`, so none of it executes.
# NOTE(review): indentation was lost in this listing; presumably this code
# follows sys.exit() inside the __main__ section, which would make it
# unreachable as well -- confirm against the real file.
# NOTE(review): the print statements are Python 2 syntax; a trailing comma
# keeps the following print on the same output line.
254 ids = ['11666868','11169640']
# disabled: print author/title/source details for each summary record
255 if 0:
256 summs = GetSummaries(ids,conn=open('summary.xml','r'))
257 print 'summs:',summs
258 for summary in summs:
259 print summary.Authors
260 print '\t',summary.Title
261 print '\t',summary.Source,
262 print summary.Volume,
263 print summary.Pages,
264 print summary.PubDate
265
# disabled: print the same kind of details for each full record
266 if 0:
267 ids = ['11666868']
268 res = GetRecords(ids,conn=open('records.xml','r'))
269 for record in res:
270 print record.Authors
271 print '\t',record.Title
272 print '\t',record.Journal,
273 print record.Volume,
274 print record.Pages,
275 print record.PubYear
276 print
277
# disabled: print the dictionary returned by CheckForLinks
278 if 0:
279 ids = ['11666868','11169640']
280 res = CheckForLinks(ids,conn=open('haslinks.xml','r'))
281 print res
282
# disabled: print the first ten (id, score) pairs returned by GetLinks
283 if 0:
284 ids = ['11666868']
285 res = GetLinks(ids,conn=open('links.xml','r'))
286
287 for id,score in res[:10]:
288 print id,score
289