Added parameter float_precision to CSV parser #8044


Merged: 1 commit, Sep 19, 2014
24 changes: 23 additions & 1 deletion doc/source/io.rst
@@ -176,7 +176,12 @@ They can take a number of arguments:
- ``mangle_dupe_cols``: boolean, default True; duplicate columns will be specified
as 'X.0'...'X.N', rather than 'X'...'X'
- ``tupleize_cols``: boolean, default False, if False, convert a list of tuples
to a multi-index of columns, otherwise, leave the column index as a list of tuples
- ``float_precision`` : string, default None. Specifies which converter the C
engine should use for floating-point values. The options are None for the
ordinary converter, 'high' for the high-precision converter, and
'round_trip' for the round-trip converter.

.. ipython:: python
:suppress:
@@ -512,6 +517,23 @@ data columns:
specify `index_col` as a column label rather than as an index on the resulting frame.


Specifying method for floating-point conversion
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The parameter ``float_precision`` can be specified in order to use
a specific floating-point converter during parsing with the C engine.
The options are the ordinary converter, the high-precision converter, and
the round-trip converter (which is guaranteed to round-trip values after
writing to a file). For example:

.. ipython:: python

val = '0.3066101993807095471566981359501369297504425048828125'
data = 'a,b,c\n1,2,{0}'.format(val)
abs(pd.read_csv(StringIO(data), engine='c', float_precision=None)['c'][0] - float(val))
abs(pd.read_csv(StringIO(data), engine='c', float_precision='high')['c'][0] - float(val))
abs(pd.read_csv(StringIO(data), engine='c', float_precision='round_trip')['c'][0] - float(val))


Date Parsing Functions
~~~~~~~~~~~~~~~~~~~~~~
Finally, the parser allows you to specify a custom ``date_parser`` function to
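
The ipython block above can also be run as a plain script; a minimal sketch (not part of the diff), assuming a pandas build that includes this change and importing StringIO explicitly since the doc's setup code is suppressed:

from io import StringIO

import pandas as pd

val = '0.3066101993807095471566981359501369297504425048828125'
data = 'a,b,c\n1,2,{0}'.format(val)

# Compare each converter's result against the decimal literal above.
for precision in (None, 'high', 'round_trip'):
    parsed = pd.read_csv(StringIO(data), engine='c',
                         float_precision=precision)['c'][0]
    print(precision, abs(parsed - float(val)))

With the round-trip converter the printed difference should be exactly zero, since that converter reproduces float(val) bit for bit.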
2 changes: 2 additions & 0 deletions doc/source/v0.15.0.txt
@@ -631,6 +631,8 @@ Enhancements
- Added support for ``c``, ``colormap`` and ``colorbar`` arguments for
``DataFrame.plot`` with ``kind='scatter'`` (:issue:`7780`)

- ``read_csv`` now has a keyword parameter ``float_precision`` which specifies which floating-point
converter the C engine should use during parsing; see :ref:`io` (:issue:`8002`, :issue:`8044`)

- ``PeriodIndex`` supports ``resolution``, the same as ``DatetimeIndex`` (:issue:`7708`)
- ``pandas.tseries.holiday`` has added support for additional holidays and ways to observe holidays (:issue:`7070`)
10 changes: 9 additions & 1 deletion pandas/io/parsers.py
@@ -303,7 +303,8 @@ def _read(filepath_or_buffer, kwds):
'error_bad_lines': True,
'warn_bad_lines': True,
'dtype': None,
'decimal': b'.'
'decimal': b'.',
'float_precision': None
}

_fwf_defaults = {
@@ -369,6 +370,7 @@ def parser_f(filepath_or_buffer,
date_parser=None,

memory_map=False,
float_precision=None,
nrows=None,
iterator=False,
chunksize=None,
@@ -437,6 +439,7 @@ def parser_f(filepath_or_buffer,
encoding=encoding,
squeeze=squeeze,
memory_map=memory_map,
float_precision=float_precision,

na_filter=na_filter,
compact_ints=compact_ints,
@@ -1264,6 +1267,11 @@ def TextParser(*args, **kwds):
If True and `parse_dates` is True for a column, try to infer the
datetime format based on the first datetime string. If the format
can be inferred, there often will be a large parsing speed-up.
float_precision : string, default None
Specifies which converter the C engine should use for floating-point
values. The options are None for the ordinary converter,
'high' for the high-precision converter, and 'round_trip' for the
round-trip converter.
"""
kwds['engine'] = 'python'
return TextFileReader(*args, **kwds)
25 changes: 25 additions & 0 deletions pandas/io/tests/test_parsers.py
@@ -2523,6 +2523,12 @@ def test_verbose_import(self):
finally:
sys.stdout = sys.__stdout__

def test_float_precision_specified(self):
    # Should raise an error if float_precision (C parser option) is specified
    with tm.assertRaisesRegexp(ValueError, "The 'float_precision' option "
                                           "is not supported with the 'python' engine"):
        self.read_csv(StringIO('a,b,c\n1,2,3'), float_precision='high')

def test_iteration_open_handle(self):
if PY3:
raise nose.SkipTest("won't work in Python 3 {0}".format(sys.version_info))
@@ -3088,6 +3094,25 @@ def test_compact_ints(self):
ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)])
self.assertEqual(result.dtype, ex_dtype)

def test_precise_conversion(self):
Review comment (Contributor): why is this test added twice?

Reply (Contributor Author): For the high-memory and low-memory C reading test suites; I could take one out, though.

    # GH #8002
    from decimal import Decimal
    normal_errors = []
    precise_errors = []
    for num in np.linspace(1., 2., num=500):  # test numbers between 1 and 2
        text = 'a\n{0:.25}'.format(num)  # 25 decimal digits of precision
        normal_val = float(self.read_csv(StringIO(text))['a'][0])
        precise_val = float(self.read_csv(StringIO(text), float_precision='high')['a'][0])
        roundtrip_val = float(self.read_csv(StringIO(text), float_precision='round_trip')['a'][0])
        actual_val = Decimal(text[2:])

        def error(val):
            return abs(Decimal('{0:.100}'.format(val)) - actual_val)

        normal_errors.append(error(normal_val))
        precise_errors.append(error(precise_val))
        self.assertEqual(roundtrip_val, float(text[2:]))  # round-trip should match float()

    self.assertTrue(sum(precise_errors) < sum(normal_errors))
    self.assertTrue(max(precise_errors) < max(normal_errors))

def test_pass_dtype(self):
data = """\
one,two
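
For experimenting with the precision comparison outside the test suite, a condensed standalone sketch along the lines of test_precise_conversion (it calls pd.read_csv directly rather than the suite's self.read_csv wrapper and samples 50 values instead of 500):

from decimal import Decimal
from io import StringIO

import numpy as np
import pandas as pd

normal_errors = []
precise_errors = []
for num in np.linspace(1., 2., num=50):      # sample values between 1 and 2
    text = 'a\n{0:.25}'.format(num)          # 25 significant digits
    exact = Decimal(text[2:])

    def error(val):
        # absolute error against the exact decimal text
        return abs(Decimal('{0:.100}'.format(val)) - exact)

    normal = float(pd.read_csv(StringIO(text))['a'][0])
    precise = float(pd.read_csv(StringIO(text), float_precision='high')['a'][0])
    normal_errors.append(error(normal))
    precise_errors.append(error(precise))

# The high-precision converter should accumulate no more error overall.
print(sum(precise_errors) <= sum(normal_errors))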
35 changes: 27 additions & 8 deletions pandas/parser.pyx
@@ -62,6 +62,9 @@ cdef extern from "headers/stdint.h":
cdef extern from "headers/portable.h":
pass

cdef extern from "errno.h":
int errno

try:
basestring
except NameError:
@@ -155,6 +158,7 @@ cdef extern from "parser/tokenizer.h":

void *skipset
int skip_footer
double (*converter)(const char *, char **, char, char, char, int)

# error handling
char *warn_msg
@@ -189,8 +193,13 @@ cdef extern from "parser/tokenizer.h":
int64_t int_max, int *error, char tsep)
uint64_t str_to_uint64(char *p_item, uint64_t uint_max, int *error)

inline int to_double(char *item, double *p_value,
char sci, char decimal, char thousands)
double xstrtod(const char *p, char **q, char decimal, char sci,
char tsep, int skip_trailing)
double precise_xstrtod(const char *p, char **q, char decimal, char sci,
char tsep, int skip_trailing)
double round_trip(const char *p, char **q, char decimal, char sci,
char tsep, int skip_trailing)

inline int to_complex(char *item, double *p_real,
double *p_imag, char sci, char decimal)
inline int to_longlong(char *item, long long *p_value)
@@ -315,7 +324,8 @@ cdef class TextReader:
skip_footer=0,
verbose=False,
mangle_dupe_cols=True,
tupleize_cols=False):
tupleize_cols=False,
float_precision=None):

self.parser = parser_new()
self.parser.chunksize = tokenize_chunksize
@@ -415,6 +425,11 @@ cdef class TextReader:

self.verbose = verbose
self.low_memory = low_memory
self.parser.converter = xstrtod
if float_precision == 'high':
self.parser.converter = precise_xstrtod
elif float_precision == 'round_trip':
self.parser.converter = round_trip

# encoding
if encoding is not None:
@@ -1018,7 +1033,7 @@ cdef class TextReader:

elif dtype[1] == 'f':
result, na_count = _try_double(self.parser, i, start, end,
na_filter, na_hashset, na_flist)

if dtype[1:] != 'f8':
result = result.astype(dtype)
@@ -1415,12 +1430,14 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
size_t i, lines
coliter_t it
char *word
char *p_end
double *data
double NA = na_values[np.float64]
ndarray result
khiter_t k
bint use_na_flist = len(na_flist) > 0

global errno
lines = line_end - line_start
result = np.empty(lines, dtype=np.float64)
data = <double *> result.data
@@ -1436,8 +1453,9 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
na_count += 1
data[0] = NA
else:
error = to_double(word, data, parser.sci, parser.decimal, parser.thousands)
if error != 1:
data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci,
parser.thousands, 1)
if errno != 0 or p_end[0] or p_end == word:
if strcasecmp(word, cinf) == 0:
data[0] = INF
elif strcasecmp(word, cneginf) == 0:
@@ -1452,8 +1470,9 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
else:
for i in range(lines):
word = COLITER_NEXT(it)
error = to_double(word, data, parser.sci, parser.decimal, parser.thousands)
if error != 1:
data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci,
parser.thousands, 1)
if errno != 0 or p_end[0] or p_end == word:
if strcasecmp(word, cinf) == 0:
data[0] = INF
elif strcasecmp(word, cneginf) == 0:
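
The core of the Cython change is the new ``converter`` function pointer on the parser struct: it is chosen once in TextReader.__cinit__ and then called for every floating-point field in _try_double. A rough Python analogue of that dispatch (the converter bodies are placeholders, not the real C implementations):

# Placeholder converters standing in for the C routines xstrtod,
# precise_xstrtod and round_trip; the real implementations live in the
# C tokenizer, not here.
def xstrtod(text):
    return float(text)        # ordinary converter

def precise_xstrtod(text):
    return float(text)        # high-precision converter

def round_trip(text):
    return float(text)        # round-trip converter

def select_converter(float_precision=None):
    """Mirror the TextReader.__cinit__ logic above: default to xstrtod and
    override only for the 'high' and 'round_trip' options."""
    converter = xstrtod
    if float_precision == 'high':
        converter = precise_xstrtod
    elif float_precision == 'round_trip':
        converter = round_trip
    return converter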