RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
python_streambuf.h
Go to the documentation of this file.
1//
2// This file is part of the CCTBX distribution:
3// http://cctbx.sourceforge.net/
4// Downloaded from here:
5// http://cctbx.svn.sourceforge.net/viewvc/cctbx/trunk/boost_adaptbx/python_streambuf.h?revision=13619
6//
7// Copyright (c) 2006, The Regents of the University of
8// California, through Lawrence Berkeley National Laboratory (subject to
9// receipt of any required approvals from the U.S. Dept. of Energy). All
10// rights reserved.
11//
12// The license is here:
13// http://cctbx.svn.sourceforge.net/viewvc/cctbx/trunk/boost_adaptbx/LICENSE_2_0.txt?revision=5148
14//
15#include <RDGeneral/export.h>
16#ifndef BOOST_ADAPTBX_PYTHON_STREAMBUF_H
17#define BOOST_ADAPTBX_PYTHON_STREAMBUF_H
19#include <boost/python/object.hpp>
20#include <boost/python/str.hpp>
21#include <boost/python/extract.hpp>
22
23#include <boost/optional.hpp>
24#include <boost/utility/typed_in_place_factory.hpp>
26
27#include <RDGeneral/Invariant.h>
29
30#include <streambuf>
31#include <iostream>
32
33namespace boost_adaptbx {
34namespace python {
35
36namespace bp = boost::python;
37
38/// A stream buffer getting data from and putting data into a Python file object
39/** The aims are as follows:
40
41 - Given a C++ function acting on a standard stream, e.g.
42
43 \code
44 void read_inputs(std::istream& input) {
45 ...
46 input >> something >> something_else;
47 }
48 \endcode
49
50 and given a piece of Python code which creates a file-like object,
51 to be able to pass this file object to that C++ function, e.g.
52
53 \code
54 import gzip
55 gzip_file_obj = gzip.GzipFile(...)
56 read_inputs(gzip_file_obj)
57 \endcode
58
59 and have the standard stream pull data from and put data into the Python
60 file object.
61
62 - When Python \c read_inputs() returns, the Python object is able to
63 continue reading or writing where the C++ code left off.
64
65 - Operations in C++ on mere files should be competitively fast compared
66 to the direct use of \c std::fstream.
67
68
69 \b Motivation
70
71 - the standard Python library offer of file-like objects (files,
72 compressed files and archives, network, ...) is far superior to the
73 offer of streams in the C++ standard library and Boost C++ libraries.
74
75 - i/o code involves a fair amount of text processing which is more
76 efficiently prototyped in Python but then one may need to rewrite
77 a time-critical part in C++, in as seamless a manner as possible.
78
79 \b Usage
80
81 This is 2-step:
82
83 - a trivial wrapper function
84
85 \code
86 using boost_adaptbx::python::streambuf;
87 void read_inputs_wrapper(streambuf& input)
88 {
89 streambuf::istream is(input);
90 read_inputs(is);
91 }
92
93 def("read_inputs", read_inputs_wrapper);
94 \endcode
95
96 which has to be written every time one wants a Python binding for
97 such a C++ function.
98
99 - the Python side
100
101 \code
102 from boost.python import streambuf
103 read_inputs(streambuf(python_file_obj=obj, buffer_size=1024))
104 \endcode
105
106 \c buffer_size is optional. See also: \c default_buffer_size
107
108 Note: references are to the C++ standard (the numbers between parentheses
109 at the end of references are margin markers).
110*/
111class streambuf : public std::basic_streambuf<char> {
112 private:
113 typedef std::basic_streambuf<char> base_t;
114
115 public:
116 /* The syntax
117 using base_t::char_type;
118 would be nicer but Visual Studio C++ 8 chokes on it
119 */
120 typedef base_t::char_type char_type;
121 typedef base_t::int_type int_type;
122 typedef base_t::pos_type pos_type;
123 typedef base_t::off_type off_type;
124 typedef base_t::traits_type traits_type;
125
126 // work around Visual C++ 7.1 problem
127 inline static int traits_type_eof() { return traits_type::eof(); }
128
129 /// The default size of the read and write buffer.
130 /** They are respectively used to buffer data read from and data written to
131 the Python file object. It can be modified from Python.
132 */
133 const static std::size_t default_buffer_size = 1024;
134
135 /// Construct from a Python file object
136 /** if buffer_size is 0 the current default_buffer_size is used.
137 */
138 streambuf(bp::object &python_file_obj, std::size_t buffer_size_ = 0)
139 : py_read(getattr(python_file_obj, "read", bp::object())),
140 py_write(getattr(python_file_obj, "write", bp::object())),
141 py_seek(getattr(python_file_obj, "seek", bp::object())),
142 py_tell(getattr(python_file_obj, "tell", bp::object())),
143 buffer_size(buffer_size_ != 0 ? buffer_size_ : default_buffer_size),
144 write_buffer(nullptr),
145 pos_of_read_buffer_end_in_py_file(0),
146 pos_of_write_buffer_end_in_py_file(buffer_size),
147 farthest_pptr(nullptr) {
148 TEST_ASSERT(buffer_size != 0);
149 /* Some Python file objects (e.g. sys.stdout and sys.stdin)
150 have non-functional seek and tell. If so, assign None to
151 py_tell and py_seek.
152 */
153 if (py_tell != bp::object()) {
154 try {
155 off_type py_pos = bp::extract<off_type>(py_tell());
156 if (py_seek != bp::object()) {
157 /* Make sure we can actually seek.
158 bzip2 readers from python have a seek method, but it fails
159 when they are in write mode.
160 */
161 py_seek(py_pos);
162 }
163 } catch (bp::error_already_set &) {
164 py_tell = bp::object();
165 py_seek = bp::object();
166 /* Boost.Python does not do any Python exception handling whatsoever
167 So we need to catch it by hand like so.
168 */
169 PyErr_Clear();
170 }
171 }
172
173 if (py_write != bp::object()) {
174 // C-like string to make debugging easier
175 write_buffer = new char[buffer_size + 1];
176 write_buffer[buffer_size] = '\0';
177 setp(write_buffer, write_buffer + buffer_size); // 27.5.2.4.5 (5)
178 farthest_pptr = pptr();
179 } else {
180 // The first attempt at output will result in a call to overflow
181 setp(nullptr, nullptr);
182 }
183
184 if (py_tell != bp::object()) {
185 off_type py_pos = bp::extract<off_type>(py_tell());
186 pos_of_read_buffer_end_in_py_file = py_pos;
187 pos_of_write_buffer_end_in_py_file = py_pos;
188 }
189 }
190
191 /// constructor to enforce a mode (binary or text)
192 streambuf(bp::object &python_file_obj, char mode,
193 std::size_t buffer_size_ = 0)
194 : streambuf(python_file_obj, buffer_size_) {
195#if 1
196 bp::object io_mod = bp::import("io");
197 CHECK_INVARIANT(io_mod, "module not found");
198 bp::object iobase = io_mod.attr("TextIOBase");
199 CHECK_INVARIANT(iobase, "base class not found");
200#else
201 // using statics to save an undetermined amount of time results in
202 // alarming seg faults on windows. so we don't do it. Keep this here
203 // for the moment though in case someone manages to figure that out in
204 // the future
205 static bp::object io_mod = bp::object();
206 static bp::object iobase = bp::object();
207 if (!io_mod) io_mod = bp::import("io");
208 if (io_mod && !iobase) iobase = io_mod.attr("TextIOBase");
209 CHECK_INVARIANT(io_mod, "module not found");
210 CHECK_INVARIANT(iobase, "base class not found");
211#endif
212
213 df_isTextMode = PyObject_IsInstance(python_file_obj.ptr(), iobase.ptr());
214 switch (mode) {
215 case 's': /// yeah, is redundant, but it is somehow natural to do "s"
216 case 't':
217 if (!df_isTextMode) {
219 "Need a text mode file object like StringIO or a file opened "
220 "with mode 't'");
221 }
222 break;
223 case 'b':
224 if (df_isTextMode) {
226 "Need a binary mode file object like BytesIO or a file opened "
227 "with mode 'b'");
228 }
229 break;
230 default:
231 throw std::invalid_argument("bad mode character");
232 }
233 }
234
235 /// Mundane destructor freeing the allocated resources
236 ~streambuf() override {
237 if (write_buffer) {
238 delete[] write_buffer;
239 }
240 }
241
242 /// C.f. C++ standard section 27.5.2.4.3
243 /** It is essential to override this virtual function for the stream
244 member function readsome to work correctly (c.f. 27.6.1.3, alinea 30)
245 */
246 std::streamsize showmanyc() override {
247 int_type const failure = traits_type::eof();
248 int_type status = underflow();
249 if (status == failure) {
250 return -1;
251 }
252 return egptr() - gptr();
253 }
254
255 /// C.f. C++ standard section 27.5.2.4.3
256 int_type underflow() override {
257 int_type const failure = traits_type::eof();
258 if (py_read == bp::object()) {
259 throw std::invalid_argument(
260 "That Python file object has no 'read' attribute");
261 }
262 read_buffer = py_read(buffer_size);
263 char *read_buffer_data;
264 bp::ssize_t py_n_read;
265 if (PyBytes_AsStringAndSize(read_buffer.ptr(), &read_buffer_data,
266 &py_n_read) == -1) {
267 setg(nullptr, nullptr, nullptr);
268 throw std::invalid_argument(
269 "The method 'read' of the Python file object "
270 "did not return a string.");
271 }
272 off_type n_read = (off_type)py_n_read;
273 pos_of_read_buffer_end_in_py_file += n_read;
274 setg(read_buffer_data, read_buffer_data, read_buffer_data + n_read);
275 // ^^^27.5.2.3.1 (4)
276 if (n_read == 0) {
277 return failure;
278 }
279 return traits_type::to_int_type(read_buffer_data[0]);
280 }
281
282 /// C.f. C++ standard section 27.5.2.4.5
284 if (py_write == bp::object()) {
285 throw std::invalid_argument(
286 "That Python file object has no 'write' attribute");
287 }
288 farthest_pptr = std::max(farthest_pptr, pptr());
289 off_type n_written = (off_type)(farthest_pptr - pbase());
290 off_type orig_n_written = n_written;
291 const unsigned int STD_ASCII = 0x7F;
292 if (df_isTextMode && static_cast<unsigned int>(c) > STD_ASCII) {
293 // we're somewhere in the middle of a utf8 block. If we
294 // only write part of it we'll end up with an exception,
295 // so push everything that could be utf8 into the next block
296 while (n_written > 0 && static_cast<unsigned int>(
297 write_buffer[n_written - 1]) > STD_ASCII) {
298 --n_written;
299 }
300 }
301 bp::str chunk(pbase(), pbase() + n_written);
302 py_write(chunk);
303
304 if ((!df_isTextMode || static_cast<unsigned int>(c) <= STD_ASCII) &&
305 !traits_type::eq_int_type(c, traits_type::eof())) {
306 py_write(traits_type::to_char_type(c));
307 n_written++;
308 }
309
310 setp(pbase(), epptr());
311 // ^^^ 27.5.2.4.5 (5)
312 farthest_pptr = pptr();
313 if (n_written) {
314 pos_of_write_buffer_end_in_py_file += n_written;
315 if (df_isTextMode && static_cast<unsigned int>(c) > STD_ASCII &&
316 !traits_type::eq_int_type(c, traits_type::eof())) {
317 size_t n_to_copy = orig_n_written - n_written;
318
319 for (size_t i = 0; i < n_to_copy; ++i) {
320 sputc(write_buffer[n_written + i]);
321 ++farthest_pptr;
322 }
323 sputc(c);
324 ++farthest_pptr;
325 }
326 }
327 return traits_type::eq_int_type(c, traits_type::eof())
328 ? traits_type::not_eof(c)
329 : c;
330 }
331
332 /// Update the python file to reflect the state of this stream buffer
333 /** Empty the write buffer into the Python file object and set the seek
334 position of the latter accordingly (C++ standard section 27.5.2.4.2).
335 If there is no write buffer or it is empty, but there is a non-empty
336 read buffer, set the Python file object seek position to the
337 seek position in that read buffer.
338 */
339 int sync() override {
340 int result = 0;
341 farthest_pptr = std::max(farthest_pptr, pptr());
342 if (farthest_pptr && farthest_pptr > pbase()) {
343 off_type delta = pptr() - farthest_pptr;
344 int_type status = overflow();
345 if (traits_type::eq_int_type(status, traits_type::eof())) {
346 result = -1;
347 }
348 if (py_seek != bp::object()) {
349 py_seek(delta, 1);
350 }
351 } else if (gptr() && gptr() < egptr()) {
352 if (py_seek != bp::object()) {
353 py_seek(gptr() - egptr(), 1);
354 }
355 }
356 return result;
357 }
358
359 /// C.f. C++ standard section 27.5.2.4.2
360 /** This implementation is optimised to look whether the position is within
361 the buffers, so as to avoid calling Python seek or tell. It is
362 important for many applications that the overhead of calling into Python
363 is avoided as much as possible (e.g. parsers which may do a lot of
364 backtracking)
365 */
366 pos_type seekoff(off_type off, std::ios_base::seekdir way,
367 std::ios_base::openmode which =
368 std::ios_base::in | std::ios_base::out) override {
369 /* In practice, "which" is either std::ios_base::in or out
370 since we end up here because either seekp or seekg was called
371 on the stream using this buffer. That simplifies the code
372 in a few places.
373 */
374 int const failure = off_type(-1);
375
376 if (py_seek == bp::object()) {
377 throw std::invalid_argument(
378 "That Python file object has no 'seek' attribute");
379 }
380
381 // we need the read buffer to contain something!
382 if (which == std::ios_base::in && !gptr()) {
383 if (traits_type::eq_int_type(underflow(), traits_type::eof())) {
384 return failure;
385 }
386 }
387
388 // compute the whence parameter for Python seek
389 int whence;
390 switch (way) {
391 case std::ios_base::beg:
392 whence = 0;
393 break;
394 case std::ios_base::cur:
395 whence = 1;
396 break;
397 case std::ios_base::end:
398 whence = 2;
399 break;
400 default:
401 return failure;
402 }
403
404 // Let's have a go
405 boost::optional<off_type> result =
406 seekoff_without_calling_python(off, way, which);
407 if (!result) {
408 // we need to call Python
409 if (which == std::ios_base::out) {
410 overflow();
411 }
412 if (way == std::ios_base::cur) {
413 if (which == std::ios_base::in) {
414 off -= egptr() - gptr();
415 } else if (which == std::ios_base::out) {
416 off += pptr() - pbase();
417 }
418 }
419 py_seek(off, whence);
420 result = off_type(bp::extract<off_type>(py_tell()));
421 if (which == std::ios_base::in) {
422 underflow();
423 }
424 }
425 return *result;
426 }
427
428 /// C.f. C++ standard section 27.5.2.4.2
430 std::ios_base::openmode which =
431 std::ios_base::in | std::ios_base::out) override {
432 return streambuf::seekoff(sp, std::ios_base::beg, which);
433 }
434
435 private:
436 bp::object py_read, py_write, py_seek, py_tell;
437
438 std::size_t buffer_size;
439
440 /* This is actually a Python string and the actual read buffer is
441 its internal data, i.e. an array of characters. We use a Boost.Python
442 object so as to hold on it: as a result, the actual buffer can't
443 go away.
444 */
445 bp::object read_buffer;
446
447 /* A mere array of char's allocated on the heap at construction time and
448 de-allocated only at destruction time.
449 */
450 char *write_buffer;
451 bool df_isTextMode;
452
453 off_type pos_of_read_buffer_end_in_py_file,
454 pos_of_write_buffer_end_in_py_file;
455
456 // the farthest place the buffer has been written into
457 char *farthest_pptr;
458
459 boost::optional<off_type> seekoff_without_calling_python(
460 off_type off, std::ios_base::seekdir way, std::ios_base::openmode which) {
461 boost::optional<off_type> const failure = off_type(-1);
462
463 // Buffer range and current position
464 off_type buf_begin, buf_end, buf_cur, upper_bound;
465 off_type pos_of_buffer_end_in_py_file;
466 if (which == std::ios_base::in) {
467 pos_of_buffer_end_in_py_file = pos_of_read_buffer_end_in_py_file;
468 buf_begin = reinterpret_cast<std::streamsize>(eback());
469 buf_cur = reinterpret_cast<std::streamsize>(gptr());
470 buf_end = reinterpret_cast<std::streamsize>(egptr());
471 upper_bound = buf_end;
472 } else if (which == std::ios_base::out) {
473 pos_of_buffer_end_in_py_file = pos_of_write_buffer_end_in_py_file;
474 buf_begin = reinterpret_cast<std::streamsize>(pbase());
475 buf_cur = reinterpret_cast<std::streamsize>(pptr());
476 buf_end = reinterpret_cast<std::streamsize>(epptr());
477 farthest_pptr = std::max(farthest_pptr, pptr());
478 upper_bound = reinterpret_cast<std::streamsize>(farthest_pptr) + 1;
479 } else {
480 CHECK_INVARIANT(0, "unreachable code");
481 }
482
483 // Sought position in "buffer coordinate"
484 off_type buf_sought;
485 if (way == std::ios_base::cur) {
486 buf_sought = buf_cur + off;
487 } else if (way == std::ios_base::beg) {
488 buf_sought = buf_end + (off - pos_of_buffer_end_in_py_file);
489 } else if (way == std::ios_base::end) {
490 return failure;
491 } else {
492 CHECK_INVARIANT(0, "unreachable code");
493 }
494
495 // if the sought position is not in the buffer, give up
496 if (buf_sought < buf_begin || buf_sought >= upper_bound) {
497 return failure;
498 }
499
500 // we are in wonderland
501 if (which == std::ios_base::in) {
502 gbump(buf_sought - buf_cur);
503 } else if (which == std::ios_base::out) {
504 pbump(buf_sought - buf_cur);
505 }
506 return pos_of_buffer_end_in_py_file + (buf_sought - buf_end);
507 }
508
509 public:
510 class istream : public std::istream {
511 public:
512 istream(streambuf &buf) : std::istream(&buf) {
513 exceptions(std::ios_base::badbit);
514 }
515
516 ~istream() override {
517 // do nothing.
518 // This used to do:
519 // if (this->good()) this->sync();
520 // but that caused problems if the underlying file had been closed
521 // (see github #579) and really doesn't seem necessary for what we're
522 // doing.
523 }
524 };
525
526 class ostream : public std::ostream {
527 public:
528 ostream(streambuf &buf) : std::ostream(&buf) {
529 exceptions(std::ios_base::badbit);
530 }
531
532 // overload that takes ownership of the streambuf ptr
533 ostream(streambuf *buf) : std::ostream(buf), m_buf(buf) {
534 exceptions(std::ios_base::badbit);
535 }
536
537 ~ostream() override {
538 if (this->good()) {
539 this->flush();
540 }
541 delete m_buf;
542 }
543
544 private:
545 streambuf *m_buf = nullptr;
546 };
547};
548
549// std::size_t streambuf::default_buffer_size = 1024;
550
553
554 streambuf_capsule(bp::object &python_file_obj, std::size_t buffer_size = 0)
555 : python_streambuf(python_file_obj, buffer_size) {}
556};
557
559 ostream(bp::object &python_file_obj, std::size_t buffer_size = 0)
560 : streambuf_capsule(python_file_obj, buffer_size),
562
563 ~ostream() noexcept override {
564 if (this->good()) {
565 this->flush();
566 }
567 }
568};
569} // namespace python
570} // namespace boost_adaptbx
571
572#endif // GUARD
#define TEST_ASSERT(expr)
Definition Invariant.h:152
#define CHECK_INVARIANT(expr, mess)
Definition Invariant.h:101
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition Exceptions.h:40
A stream buffer getting data from and putting data into a Python file object.
~streambuf() override
Mundane destructor freeing the allocated resources.
static const std::size_t default_buffer_size
The default size of the read and write buffer.
pos_type seekpos(pos_type sp, std::ios_base::openmode which=std::ios_base::in|std::ios_base::out) override
C.f. C++ standard section 27.5.2.4.2.
std::streamsize showmanyc() override
C.f. C++ standard section 27.5.2.4.3.
pos_type seekoff(off_type off, std::ios_base::seekdir way, std::ios_base::openmode which=std::ios_base::in|std::ios_base::out) override
C.f. C++ standard section 27.5.2.4.2.
streambuf(bp::object &python_file_obj, char mode, std::size_t buffer_size_=0)
constructor to enforce a mode (binary or text)
int sync() override
Update the python file to reflect the state of this stream buffer.
int_type overflow(int_type c=traits_type_eof()) override
C.f. C++ standard section 27.5.2.4.5.
int_type underflow() override
C.f. C++ standard section 27.5.2.4.3.
streambuf(bp::object &python_file_obj, std::size_t buffer_size_=0)
Construct from a Python file object.
ostream(bp::object &python_file_obj, std::size_t buffer_size=0)
streambuf_capsule(bp::object &python_file_obj, std::size_t buffer_size=0)