RDKit
Open-source cheminformatics and machine learning.
python_streambuf.h
Go to the documentation of this file.
1 //
2 // This file is part of the CCTBX distribution:
3 // http://cctbx.sourceforge.net/
4 // Downloaded from here:
5 // http://cctbx.svn.sourceforge.net/viewvc/cctbx/trunk/boost_adaptbx/python_streambuf.h?revision=13619
6 //
7 // Copyright (c) 2006, The Regents of the University of
8 // California, through Lawrence Berkeley National Laboratory (subject to
9 // receipt of any required approvals from the U.S. Dept. of Energy). All
10 // rights reserved.
11 //
12 // The license is here:
13 // http://cctbx.svn.sourceforge.net/viewvc/cctbx/trunk/boost_adaptbx/LICENSE_2_0.txt?revision=5148
14 //
15 #ifndef BOOST_ADAPTBX_PYTHON_STREAMBUF_H
16 #define BOOST_ADAPTBX_PYTHON_STREAMBUF_H
18 #include <boost/python/object.hpp>
19 #include <boost/python/str.hpp>
20 #include <boost/python/extract.hpp>
21 
22 #include <boost/optional.hpp>
23 #include <boost/utility/typed_in_place_factory.hpp>
25 
26 //#include <tbxx/error_utils.hpp>
27 #include <RDGeneral/Invariant.h>
28 
29 #include <streambuf>
30 #include <iostream>
31 
32 namespace boost_adaptbx {
33 namespace python {
34 
35 namespace bp = boost::python;
36 
37 /// A stream buffer getting data from and putting data into a Python file object
38 /** The aims are as follows:
39 
40  - Given a C++ function acting on a standard stream, e.g.
41 
42  \code
43  void read_inputs(std::istream& input) {
44  ...
45  input >> something >> something_else;
46  }
47  \endcode
48 
49  and given a piece of Python code which creates a file-like object,
50  to be able to pass this file object to that C++ function, e.g.
51 
52  \code
53  import gzip
54  gzip_file_obj = gzip.GzipFile(...)
55  read_inputs(gzip_file_obj)
56  \endcode
57 
58  and have the standard stream pull data from and put data into the Python
59  file object.
60 
61  - When Python \c read_inputs() returns, the Python object is able to
62  continue reading or writing where the C++ code left off.
63 
64  - Operations in C++ on mere files should be competitively fast compared
65  to the direct use of \c std::fstream.
66 
67 
68  \b Motivation
69 
70  - the range of file-like objects offered by the Python standard library
71  (files, compressed files and archives, network, ...) is far superior to
72  the streams offered by the C++ standard library and Boost C++ libraries.
73 
74  - i/o code involves a fair amount of text processing which is more
75  efficiently prototyped in Python but then one may need to rewrite
76  a time-critical part in C++, in as seamless a manner as possible.
77 
78  \b Usage
79 
80  This is a two-step process:
81 
82  - a trivial wrapper function
83 
84  \code
85  using boost_adaptbx::python::streambuf;
86  void read_inputs_wrapper(streambuf& input)
87  {
88  streambuf::istream is(input);
89  read_inputs(is);
90  }
91 
92  def("read_inputs", read_inputs_wrapper);
93  \endcode
94 
95  which has to be written every time one wants a Python binding for
96  such a C++ function.
97 
98  - the Python side
99 
100  \code
101  from boost.python import streambuf
102  read_inputs(streambuf(python_file_obj=obj, buffer_size=1024))
103  \endcode
104 
105  \c buffer_size is optional. See also: \c default_buffer_size
106 
107  Note: references are to the C++ standard (the numbers between parentheses
108  at the end of references are margin markers).
109 */
110 class streambuf : public std::basic_streambuf<char> {
111  private:
112  typedef std::basic_streambuf<char> base_t;
113 
114  public:
115  /* The syntax
116  using base_t::char_type;
117  would be nicer but Visual Studio C++ 8 chokes on it
118  */
119  typedef base_t::char_type char_type;
120  typedef base_t::int_type int_type;
121  typedef base_t::pos_type pos_type;
122  typedef base_t::off_type off_type;
123  typedef base_t::traits_type traits_type;
124 
125  // work around Visual C++ 7.1 problem
126  inline static int traits_type_eof() { return traits_type::eof(); }
127 
128  /// The default size of the read and write buffer.
129  /** They are respectively used to buffer data read from and data written to
130  the Python file object. It can be modified from Python.
131  */
132  const static std::size_t default_buffer_size = 1024;
133 
134  /// Construct from a Python file object
135  /** if buffer_size is 0 the current default_buffer_size is used.
136  */
137  streambuf(bp::object& python_file_obj, std::size_t buffer_size_ = 0)
138  : py_read(getattr(python_file_obj, "read", bp::object())),
139  py_write(getattr(python_file_obj, "write", bp::object())),
140  py_seek(getattr(python_file_obj, "seek", bp::object())),
141  py_tell(getattr(python_file_obj, "tell", bp::object())),
142  buffer_size(buffer_size_ != 0 ? buffer_size_ : default_buffer_size),
143  write_buffer(0),
144  pos_of_read_buffer_end_in_py_file(0),
145  pos_of_write_buffer_end_in_py_file(buffer_size),
146  farthest_pptr(0) {
147  TEST_ASSERT(buffer_size != 0);
148  /* Some Python file objects (e.g. sys.stdout and sys.stdin)
149  have non-functional seek and tell. If so, assign None to
150  py_tell and py_seek.
151  */
152  if (py_tell != bp::object()) {
153  try {
154  off_type py_pos = bp::extract<off_type>(py_tell());
155  if (py_seek != bp::object()) {
156  /* Make sure we can actually seek.
157  bzip2 readers from python have a seek method, but it fails
158  when they are in write mode.
159  */
160  py_seek(py_pos);
161  }
162  } catch (bp::error_already_set&) {
163  py_tell = bp::object();
164  py_seek = bp::object();
165  /* Boost.Python does not do any Python exception handling whatsoever
166  So we need to catch it by hand like so.
167  */
168  PyErr_Clear();
169  }
170  }
171 
172  if (py_write != bp::object()) {
173  // C-like string to make debugging easier
174  write_buffer = new char[buffer_size + 1];
175  write_buffer[buffer_size] = '\0';
176  setp(write_buffer, write_buffer + buffer_size); // 27.5.2.4.5 (5)
177  farthest_pptr = pptr();
178  } else {
179  // The first attempt at output will result in a call to overflow
180  setp(0, 0);
181  }
182 
183  if (py_tell != bp::object()) {
184  off_type py_pos = bp::extract<off_type>(py_tell());
185  pos_of_read_buffer_end_in_py_file = py_pos;
186  pos_of_write_buffer_end_in_py_file = py_pos;
187  }
188  }
189 
190  /// Mundane destructor freeing the allocated resources
191  virtual ~streambuf() {
192  if (write_buffer) delete[] write_buffer;
193  }
194 
195  /// C.f. C++ standard section 27.5.2.4.3
196  /** It is essential to override this virtual function for the stream
197  member function readsome to work correctly (c.f. 27.6.1.3, alinea 30)
198  */
199  virtual std::streamsize showmanyc() {
200  int_type const failure = traits_type::eof();
201  int_type status = underflow();
202  if (status == failure) return -1;
203  return egptr() - gptr();
204  }
205 
206  /// C.f. C++ standard section 27.5.2.4.3
207  virtual int_type underflow() {
208  int_type const failure = traits_type::eof();
209  if (py_read == bp::object()) {
210  throw std::invalid_argument(
211  "That Python file object has no 'read' attribute");
212  }
213  read_buffer = py_read(buffer_size);
214  char* read_buffer_data;
215  bp::ssize_t py_n_read;
216  if (PyBytes_AsStringAndSize(read_buffer.ptr(), &read_buffer_data,
217  &py_n_read) == -1) {
218  setg(0, 0, 0);
219  throw std::invalid_argument(
220  "The method 'read' of the Python file object "
221  "did not return a string.");
222  }
223  off_type n_read = (off_type)py_n_read;
224  pos_of_read_buffer_end_in_py_file += n_read;
225  setg(read_buffer_data, read_buffer_data, read_buffer_data + n_read);
226  // ^^^27.5.2.3.1 (4)
227  if (n_read == 0) return failure;
228  return traits_type::to_int_type(read_buffer_data[0]);
229  }
230 
231  /// C.f. C++ standard section 27.5.2.4.5
232  virtual int_type overflow(int_type c = traits_type_eof()) {
233  if (py_write == bp::object()) {
234  throw std::invalid_argument(
235  "That Python file object has no 'write' attribute");
236  }
237  farthest_pptr = std::max(farthest_pptr, pptr());
238  off_type n_written = (off_type)(farthest_pptr - pbase());
239  bp::str chunk(pbase(), farthest_pptr);
240  py_write(chunk);
241  if (!traits_type::eq_int_type(c, traits_type::eof())) {
242  py_write(traits_type::to_char_type(c));
243  n_written++;
244  }
245  if (n_written) {
246  pos_of_write_buffer_end_in_py_file += n_written;
247  setp(pbase(), epptr());
248  // ^^^ 27.5.2.4.5 (5)
249  farthest_pptr = pptr();
250  }
251  return traits_type::eq_int_type(c, traits_type::eof())
252  ? traits_type::not_eof(c)
253  : c;
254  }
255 
256  /// Update the python file to reflect the state of this stream buffer
257  /** Empty the write buffer into the Python file object and set the seek
258  position of the latter accordingly (C++ standard section 27.5.2.4.2).
259  If there is no write buffer or it is empty, but there is a non-empty
260  read buffer, set the Python file object seek position to the
261  seek position in that read buffer.
262  */
263  virtual int sync() {
264  int result = 0;
265  farthest_pptr = std::max(farthest_pptr, pptr());
266  if (farthest_pptr && farthest_pptr > pbase()) {
267  off_type delta = pptr() - farthest_pptr;
268  int_type status = overflow();
269  if (traits_type::eq_int_type(status, traits_type::eof())) result = -1;
270  if (py_seek != bp::object()) py_seek(delta, 1);
271  } else if (gptr() && gptr() < egptr()) {
272  if (py_seek != bp::object()) py_seek(gptr() - egptr(), 1);
273  }
274  return result;
275  }
276 
277  /// C.f. C++ standard section 27.5.2.4.2
278  /** This implementation is optimised to look whether the position is within
279  the buffers, so as to avoid calling Python seek or tell. It is
280  important for many applications that the overhead of calling into Python
281  is avoided as much as possible (e.g. parsers which may do a lot of
282  backtracking)
283  */
284  virtual pos_type seekoff(off_type off, std::ios_base::seekdir way,
285  std::ios_base::openmode which = std::ios_base::in |
286  std::ios_base::out) {
287  /* In practice, "which" is either std::ios_base::in or out
288  since we end up here because either seekp or seekg was called
289  on the stream using this buffer. That simplifies the code
290  in a few places.
291  */
292  int const failure = off_type(-1);
293 
294  if (py_seek == bp::object()) {
295  throw std::invalid_argument(
296  "That Python file object has no 'seek' attribute");
297  }
298 
299  // we need the read buffer to contain something!
300  if (which == std::ios_base::in && !gptr()) {
301  if (traits_type::eq_int_type(underflow(), traits_type::eof())) {
302  return failure;
303  }
304  }
305 
306  // compute the whence parameter for Python seek
307  int whence;
308  switch (way) {
309  case std::ios_base::beg:
310  whence = 0;
311  break;
312  case std::ios_base::cur:
313  whence = 1;
314  break;
315  case std::ios_base::end:
316  whence = 2;
317  break;
318  default:
319  return failure;
320  }
321 
322  // Let's have a go
323  boost::optional<off_type> result =
324  seekoff_without_calling_python(off, way, which);
325  if (!result) {
326  // we need to call Python
327  if (which == std::ios_base::out) overflow();
328  if (way == std::ios_base::cur) {
329  if (which == std::ios_base::in)
330  off -= egptr() - gptr();
331  else if (which == std::ios_base::out)
332  off += pptr() - pbase();
333  }
334  py_seek(off, whence);
335  result = off_type(bp::extract<off_type>(py_tell()));
336  if (which == std::ios_base::in) underflow();
337  }
338  return *result;
339  }
340 
341  /// C.f. C++ standard section 27.5.2.4.2
342  virtual pos_type seekpos(pos_type sp,
343  std::ios_base::openmode which = std::ios_base::in |
344  std::ios_base::out) {
345  return streambuf::seekoff(sp, std::ios_base::beg, which);
346  }
347 
348  private:
349  bp::object py_read, py_write, py_seek, py_tell;
350 
351  std::size_t buffer_size;
352 
353  /* This is actually a Python string and the actual read buffer is
354  its internal data, i.e. an array of characters. We use a Boost.Python
355  object so as to hold on it: as a result, the actual buffer can't
356  go away.
357  */
358  bp::object read_buffer;
359 
360  /* A mere array of char's allocated on the heap at construction time and
361  de-allocated only at destruction time.
362  */
363  char* write_buffer;
364 
365  off_type pos_of_read_buffer_end_in_py_file,
366  pos_of_write_buffer_end_in_py_file;
367 
368  // the farthest place the buffer has been written into
369  char* farthest_pptr;
370 
371  boost::optional<off_type> seekoff_without_calling_python(
372  off_type off, std::ios_base::seekdir way, std::ios_base::openmode which) {
373  boost::optional<off_type> const failure;
374 
375  // Buffer range and current position
376  off_type buf_begin, buf_end, buf_cur, upper_bound;
377  off_type pos_of_buffer_end_in_py_file;
378  if (which == std::ios_base::in) {
379  pos_of_buffer_end_in_py_file = pos_of_read_buffer_end_in_py_file;
380  buf_begin = reinterpret_cast<std::streamsize>(eback());
381  buf_cur = reinterpret_cast<std::streamsize>(gptr());
382  buf_end = reinterpret_cast<std::streamsize>(egptr());
383  upper_bound = buf_end;
384  } else if (which == std::ios_base::out) {
385  pos_of_buffer_end_in_py_file = pos_of_write_buffer_end_in_py_file;
386  buf_begin = reinterpret_cast<std::streamsize>(pbase());
387  buf_cur = reinterpret_cast<std::streamsize>(pptr());
388  buf_end = reinterpret_cast<std::streamsize>(epptr());
389  farthest_pptr = std::max(farthest_pptr, pptr());
390  upper_bound = reinterpret_cast<std::streamsize>(farthest_pptr) + 1;
391  } else {
392  CHECK_INVARIANT(0, "unreachable code");
393  }
394 
395  // Sought position in "buffer coordinate"
396  off_type buf_sought;
397  if (way == std::ios_base::cur) {
398  buf_sought = buf_cur + off;
399  } else if (way == std::ios_base::beg) {
400  buf_sought = buf_end + (off - pos_of_buffer_end_in_py_file);
401  } else if (way == std::ios_base::end) {
402  return failure;
403  } else {
404  CHECK_INVARIANT(0, "unreachable code");
405  }
406 
407  // if the sought position is not in the buffer, give up
408  if (buf_sought < buf_begin || buf_sought >= upper_bound) return failure;
409 
410  // we are in wonderland
411  if (which == std::ios_base::in)
412  gbump(buf_sought - buf_cur);
413  else if (which == std::ios_base::out)
414  pbump(buf_sought - buf_cur);
415  return pos_of_buffer_end_in_py_file + (buf_sought - buf_end);
416  }
417 
418  public:
419  class istream : public std::istream {
420  public:
421  istream(streambuf& buf) : std::istream(&buf) {
422  exceptions(std::ios_base::badbit);
423  }
424 
426  // do nothing.
427  // This used to do:
428  // if (this->good()) this->sync();
429  // but that caused problems if the underlying file had been closed
430  // (see github #579) and really doesn't seem necessary for what we're
431  // doing.
432  }
433  };
434 
435  class ostream : public std::ostream {
436  public:
437  ostream(streambuf& buf) : std::ostream(&buf) {
438  exceptions(std::ios_base::badbit);
439  }
440 
442  if (this->good()) this->flush();
443  }
444  };
445 };
446 
447 // std::size_t streambuf::default_buffer_size = 1024;
448 
451 
452  streambuf_capsule(bp::object& python_file_obj, std::size_t buffer_size = 0)
453  : python_streambuf(python_file_obj, buffer_size) {}
454 };
455 
457  ostream(bp::object& python_file_obj, std::size_t buffer_size = 0)
458  : streambuf_capsule(python_file_obj, buffer_size),
459  streambuf::ostream(python_streambuf) {}
460 
462  try {
463  if (this->good()) this->flush();
464  } catch (bp::error_already_set&) {
465  PyErr_Clear();
466  throw std::runtime_error(
467  "Problem closing python ostream.\n"
468  " Known limitation: the error is unrecoverable. Sorry.\n"
469  " Suggestion for programmer: add ostream.flush() before"
470  " returning.");
471  }
472  }
473 };
474 }
475 } // boost_adaptbx::python
476 
477 #endif // GUARD
streambuf_capsule(bp::object &python_file_obj, std::size_t buffer_size=0)
virtual int sync()
Update the python file to reflect the state of this stream buffer.
ostream(bp::object &python_file_obj, std::size_t buffer_size=0)
static const std::size_t default_buffer_size
The default size of the read and write buffer.
#define CHECK_INVARIANT(expr, mess)
Definition: Invariant.h:99
STL namespace.
streambuf(bp::object &python_file_obj, std::size_t buffer_size_=0)
Construct from a Python file object.
virtual int_type underflow()
C.f. C++ standard section 27.5.2.4.3.
virtual int_type overflow(int_type c=traits_type_eof())
C.f. C++ standard section 27.5.2.4.5.
#define TEST_ASSERT(expr)
Definition: Invariant.h:150
virtual pos_type seekoff(off_type off, std::ios_base::seekdir way, std::ios_base::openmode which=std::ios_base::in|std::ios_base::out)
C.f. C++ standard section 27.5.2.4.2.
virtual pos_type seekpos(pos_type sp, std::ios_base::openmode which=std::ios_base::in|std::ios_base::out)
C.f. C++ standard section 27.5.2.4.2.
virtual std::streamsize showmanyc()
C.f. C++ standard section 27.5.2.4.3.
virtual ~streambuf()
Mundane destructor freeing the allocated resources.
A stream buffer getting data from and putting data into a Python file object.