Added parameter float_precision to CSV parser #8044


Merged: 1 commit, Sep 19, 2014
24 changes: 23 additions & 1 deletion doc/source/io.rst
@@ -176,7 +176,12 @@ They can take a number of arguments:
- ``mangle_dupe_cols``: boolean, default True; duplicate columns will be specified
as 'X.0'...'X.N', rather than 'X'...'X'
- ``tupleize_cols``: boolean, default False, if False, convert a list of tuples
to a multi-index of columns, otherwise, leave the column index as a list of tuples
- ``float_precision`` : string, default None. Specifies which converter the C
engine should use for floating-point values. The options are None for the
ordinary converter, 'high' for the high-precision converter, and
'round_trip' for the round-trip converter.

.. ipython:: python
:suppress:
@@ -512,6 +517,23 @@ data columns:
specify `index_col` as a column label rather than as an index on the resulting frame.


Specifying method for floating-point conversion
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The parameter ``float_precision`` can be specified in order to use
a specific floating-point converter during parsing with the C engine.
The options are the ordinary converter, the high-precision converter, and
the round-trip converter (which is guaranteed to round-trip values after
writing to a file). For example:

.. ipython:: python

val = '0.3066101993807095471566981359501369297504425048828125'
data = 'a,b,c\n1,2,{0}'.format(val)
abs(pd.read_csv(StringIO(data), engine='c', float_precision=None)['c'][0] - float(val))
abs(pd.read_csv(StringIO(data), engine='c', float_precision='high')['c'][0] - float(val))
abs(pd.read_csv(StringIO(data), engine='c', float_precision='round_trip')['c'][0] - float(val))


Date Parsing Functions
~~~~~~~~~~~~~~~~~~~~~~
Finally, the parser allows you to specify a custom ``date_parser`` function to
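
The ipython block above can also be run as a plain script; a minimal sketch (not part of the diff), assuming a pandas build that includes this change and importing StringIO explicitly since the doc's setup code is suppressed:

from io import StringIO

import pandas as pd

val = '0.3066101993807095471566981359501369297504425048828125'
data = 'a,b,c\n1,2,{0}'.format(val)

# Compare each converter's result against the decimal literal above.
for precision in (None, 'high', 'round_trip'):
    parsed = pd.read_csv(StringIO(data), engine='c',
                         float_precision=precision)['c'][0]
    print(precision, abs(parsed - float(val)))

With the round-trip converter the printed difference should be exactly zero, since that converter reproduces float(val) bit for bit.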
2 changes: 2 additions & 0 deletions doc/source/v0.15.0.txt
@@ -631,6 +631,8 @@ Enhancements
- Added support for ``c``, ``colormap`` and ``colorbar`` arguments for
``DataFrame.plot`` with ``kind='scatter'`` (:issue:`7780`)

- ``read_csv`` now has a keyword parameter ``float_precision`` which specifies which floating-point
converter the C engine should use during parsing; see :ref:`io` (:issue:`8002`, :issue:`8044`)

- ``PeriodIndex`` supports ``resolution``, the same as ``DatetimeIndex`` (:issue:`7708`)
- ``pandas.tseries.holiday`` has added support for additional holidays and ways to observe holidays (:issue:`7070`)
10 changes: 9 additions & 1 deletion pandas/io/parsers.py
@@ -303,7 +303,8 @@ def _read(filepath_or_buffer, kwds):
'error_bad_lines': True,
'warn_bad_lines': True,
'dtype': None,
'decimal': b'.'
'decimal': b'.',
'float_precision': None
}

_fwf_defaults = {
@@ -369,6 +370,7 @@ def parser_f(filepath_or_buffer,
date_parser=None,

memory_map=False,
float_precision=None,
nrows=None,
iterator=False,
chunksize=None,
@@ -437,6 +439,7 @@ def parser_f(filepath_or_buffer,
encoding=encoding,
squeeze=squeeze,
memory_map=memory_map,
float_precision=float_precision,

na_filter=na_filter,
compact_ints=compact_ints,
@@ -1264,6 +1267,11 @@ def TextParser(*args, **kwds):
If True and `parse_dates` is True for a column, try to infer the
datetime format based on the first datetime string. If the format
can be inferred, there often will be a large parsing speed-up.
float_precision : string, default None
Specifies which converter the C engine should use for floating-point
values. The options are None for the ordinary converter,
'high' for the high-precision converter, and 'round_trip' for the
round-trip converter.
"""
kwds['engine'] = 'python'
return TextFileReader(*args, **kwds)
25 changes: 25 additions & 0 deletions pandas/io/tests/test_parsers.py
@@ -2523,6 +2523,12 @@ def test_verbose_import(self):
finally:
sys.stdout = sys.__stdout__

def test_float_precision_specified(self):
    # Should raise an error if float_precision (C parser option) is specified
    with tm.assertRaisesRegexp(ValueError, "The 'float_precision' option "
                                           "is not supported with the 'python' engine"):
        self.read_csv(StringIO('a,b,c\n1,2,3'), float_precision='high')

def test_iteration_open_handle(self):
if PY3:
raise nose.SkipTest("won't work in Python 3 {0}".format(sys.version_info))
@@ -3088,6 +3094,25 @@ def test_compact_ints(self):
ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)])
self.assertEqual(result.dtype, ex_dtype)

def test_precise_conversion(self):
Review comment (Contributor): why is this test added twice?

Reply (Contributor Author): For the high-memory and low-memory C reading test suites; I could take one out, though.

    # GH #8002
    from decimal import Decimal
    normal_errors = []
    precise_errors = []
    for num in np.linspace(1., 2., num=500):  # test numbers between 1 and 2
        text = 'a\n{0:.25}'.format(num)  # 25 decimal digits of precision
        normal_val = float(self.read_csv(StringIO(text))['a'][0])
        precise_val = float(self.read_csv(StringIO(text), float_precision='high')['a'][0])
        roundtrip_val = float(self.read_csv(StringIO(text), float_precision='round_trip')['a'][0])
        actual_val = Decimal(text[2:])

        def error(val):
            return abs(Decimal('{0:.100}'.format(val)) - actual_val)

        normal_errors.append(error(normal_val))
        precise_errors.append(error(precise_val))
        self.assertEqual(roundtrip_val, float(text[2:]))  # round-trip should match float()

    self.assertTrue(sum(precise_errors) < sum(normal_errors))
    self.assertTrue(max(precise_errors) < max(normal_errors))

def test_pass_dtype(self):
data = """\
one,two
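
For experimenting with the precision comparison outside the test suite, a condensed standalone sketch along the lines of test_precise_conversion (it calls pd.read_csv directly rather than the suite's self.read_csv wrapper and samples 50 values instead of 500):

from decimal import Decimal
from io import StringIO

import numpy as np
import pandas as pd

normal_errors = []
precise_errors = []
for num in np.linspace(1., 2., num=50):      # sample values between 1 and 2
    text = 'a\n{0:.25}'.format(num)          # 25 significant digits
    exact = Decimal(text[2:])

    def error(val):
        # absolute error against the exact decimal text
        return abs(Decimal('{0:.100}'.format(val)) - exact)

    normal = float(pd.read_csv(StringIO(text))['a'][0])
    precise = float(pd.read_csv(StringIO(text), float_precision='high')['a'][0])
    normal_errors.append(error(normal))
    precise_errors.append(error(precise))

# The high-precision converter should accumulate no more error overall.
print(sum(precise_errors) <= sum(normal_errors))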
35 changes: 27 additions & 8 deletions pandas/parser.pyx
@@ -62,6 +62,9 @@ cdef extern from "headers/stdint.h":
cdef extern from "headers/portable.h":
pass

cdef extern from "errno.h":
int errno

try:
basestring
except NameError:
@@ -155,6 +158,7 @@ cdef extern from "parser/tokenizer.h":

void *skipset
int skip_footer
double (*converter)(const char *, char **, char, char, char, int)

# error handling
char *warn_msg
@@ -189,8 +193,13 @@ cdef extern from "parser/tokenizer.h":
int64_t int_max, int *error, char tsep)
uint64_t str_to_uint64(char *p_item, uint64_t uint_max, int *error)

inline int to_double(char *item, double *p_value,
char sci, char decimal, char thousands)
double xstrtod(const char *p, char **q, char decimal, char sci,
char tsep, int skip_trailing)
double precise_xstrtod(const char *p, char **q, char decimal, char sci,
char tsep, int skip_trailing)
double round_trip(const char *p, char **q, char decimal, char sci,
char tsep, int skip_trailing)

inline int to_complex(char *item, double *p_real,
double *p_imag, char sci, char decimal)
inline int to_longlong(char *item, long long *p_value)
@@ -315,7 +324,8 @@ cdef class TextReader:
skip_footer=0,
verbose=False,
mangle_dupe_cols=True,
tupleize_cols=False):
tupleize_cols=False,
float_precision=None):

self.parser = parser_new()
self.parser.chunksize = tokenize_chunksize
@@ -415,6 +425,11 @@ cdef class TextReader:

self.verbose = verbose
self.low_memory = low_memory
self.parser.converter = xstrtod
if float_precision == 'high':
self.parser.converter = precise_xstrtod
elif float_precision == 'round_trip':
self.parser.converter = round_trip

# encoding
if encoding is not None:
@@ -1018,7 +1033,7 @@ cdef class TextReader:

elif dtype[1] == 'f':
result, na_count = _try_double(self.parser, i, start, end,
na_filter, na_hashset, na_flist)

if dtype[1:] != 'f8':
result = result.astype(dtype)
@@ -1415,12 +1430,14 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
size_t i, lines
coliter_t it
char *word
char *p_end
double *data
double NA = na_values[np.float64]
ndarray result
khiter_t k
bint use_na_flist = len(na_flist) > 0

global errno
lines = line_end - line_start
result = np.empty(lines, dtype=np.float64)
data = <double *> result.data
@@ -1436,8 +1453,9 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
na_count += 1
data[0] = NA
else:
error = to_double(word, data, parser.sci, parser.decimal, parser.thousands)
if error != 1:
data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci,
parser.thousands, 1)
if errno != 0 or p_end[0] or p_end == word:
if strcasecmp(word, cinf) == 0:
data[0] = INF
elif strcasecmp(word, cneginf) == 0:
@@ -1452,8 +1470,9 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
else:
for i in range(lines):
word = COLITER_NEXT(it)
error = to_double(word, data, parser.sci, parser.decimal, parser.thousands)
if error != 1:
data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci,
parser.thousands, 1)
if errno != 0 or p_end[0] or p_end == word:
if strcasecmp(word, cinf) == 0:
data[0] = INF
elif strcasecmp(word, cneginf) == 0:
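
The core of the Cython change is the new ``converter`` function pointer on the parser struct: it is chosen once in TextReader.__cinit__ and then called for every floating-point field in _try_double. A rough Python analogue of that dispatch (the converter bodies are placeholders, not the real C implementations):

# Placeholder converters standing in for the C routines xstrtod,
# precise_xstrtod and round_trip; the real implementations live in the
# C tokenizer, not here.
def xstrtod(text):
    return float(text)        # ordinary converter

def precise_xstrtod(text):
    return float(text)        # high-precision converter

def round_trip(text):
    return float(text)        # round-trip converter

def select_converter(float_precision=None):
    """Mirror the TextReader.__cinit__ logic above: default to xstrtod and
    override only for the 'high' and 'round_trip' options."""
    converter = xstrtod
    if float_precision == 'high':
        converter = precise_xstrtod
    elif float_precision == 'round_trip':
        converter = round_trip
    return converter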