Skip to content

ENH:Add EA types to read CSV #23255

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 22 commits into from
Jan 2, 2019
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -362,16 +362,16 @@ columns:

.. ipython:: python

data = ('a,b,c\n'
'1,2,3\n'
'4,5,6\n'
'7,8,9')
data = ('a,b,c,d\n'
'1,2,3,4\n'
'5,6,7,8\n'
'9,10,11')
print(data)

df = pd.read_csv(StringIO(data), dtype=object)
df
df['a'][0]
df = pd.read_csv(StringIO(data), dtype={'b': object, 'c': np.float64})
df = pd.read_csv(StringIO(data), dtype={'b': object, 'c': np.float64, 'd': 'Int64'})
df.dtypes

Fortunately, pandas offers more than one way to ensure that your column(s)
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ New features
- :func:`read_feather` now accepts ``columns`` as an argument, allowing the user to specify which columns should be read. (:issue:`24025`)
- :func:`DataFrame.to_html` now accepts ``render_links`` as an argument, allowing the user to generate HTML with links to any URLs that appear in the DataFrame.
See the :ref:`section on writing HTML <io.html>` in the IO docs for example usage. (:issue:`2679`)
- :func:`pandas.read_csv` now supports pandas extension types as an argument to ``dtype``, allowing the user to use pandas extension types when reading CSVs. (:issue:`23228`)
- :meth:`DataFrame.shift` :meth:`Series.shift`, :meth:`ExtensionArray.shift`, :meth:`SparseArray.shift`, :meth:`Period.shift`, :meth:`GroupBy.shift`, :meth:`Categorical.shift`, :meth:`NDFrame.shift` and :meth:`Block.shift` now accept `fill_value` as an argument, allowing the user to specify a value which will be used instead of NA/NaT in the empty periods. (:issue:`15486`)

.. _whatsnew_0240.values_api:
Expand Down
30 changes: 25 additions & 5 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ from pandas.core.dtypes.common import (
is_integer_dtype, is_float_dtype,
is_bool_dtype, is_object_dtype,
is_datetime64_dtype,
pandas_dtype)
pandas_dtype, is_extension_array_dtype)
from pandas.core.arrays import Categorical
from pandas.core.dtypes.concat import union_categoricals
import pandas.io.common as icom
Expand Down Expand Up @@ -983,7 +983,6 @@ cdef class TextReader:
footer=footer,
upcast_na=True)
self._end_clock('Type conversion')

self._start_clock()
if len(columns) > 0:
rows_read = len(list(columns.values())[0])
Expand Down Expand Up @@ -1123,7 +1122,9 @@ cdef class TextReader:
if na_filter:
self._free_na_set(na_hashset)

if upcast_na and na_count > 0:
# don't try to upcast EAs
try_upcast = upcast_na and na_count > 0
if try_upcast and not is_extension_array_dtype(col_dtype):
col_res = _maybe_upcast(col_res)

if col_res is None:
Expand Down Expand Up @@ -1215,6 +1216,22 @@ cdef class TextReader:
cats, codes, dtype, true_values=true_values)
return cat, na_count

elif is_extension_array_dtype(dtype):
result, na_count = self._string_convert(i, start, end, na_filter,
na_hashset)
array_type = dtype.construct_array_type()
try:
# use _from_sequence_of_strings if the class defines it
result = array_type._from_sequence_of_strings(result,
dtype=dtype)
except NotImplementedError:
raise NotImplementedError(
"Extension Array: {ea} must implement "
"_from_sequence_of_strings in order "
"to be used in parser methods".format(ea=array_type))

return result, na_count

elif is_integer_dtype(dtype):
try:
result, na_count = _try_int64(self.parser, i, start,
Expand All @@ -1240,7 +1257,6 @@ cdef class TextReader:
if result is not None and dtype != 'float64':
result = result.astype(dtype)
return result, na_count

elif is_bool_dtype(dtype):
result, na_count = _try_bool_flex(self.parser, i, start, end,
na_filter, na_hashset,
Expand Down Expand Up @@ -2173,7 +2189,11 @@ def _concatenate_chunks(list chunks):
result[name] = union_categoricals(arrs,
sort_categories=sort_categories)
else:
result[name] = np.concatenate(arrs)
if is_extension_array_dtype(dtype):
array_type = dtype.construct_array_type()
result[name] = array_type._concat_same_type(arrs)
else:
result[name] = np.concatenate(arrs)

if warning_columns:
warning_names = ','.join(warning_columns)
Expand Down
24 changes: 24 additions & 0 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,30 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
"""
raise AbstractMethodError(cls)

@classmethod
def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
"""Construct a new ExtensionArray from a sequence of strings.

.. versionadded:: 0.24.0

Parameters
----------
strings : Sequence
Each element is expected to be a string (for example, a field
read from a CSV file) that should be converted to an element of
this array.
dtype : dtype, optional
Construct for this particular dtype. This should be a Dtype
compatible with the ExtensionArray.
copy : boolean, default False
If True, copy the underlying data.

Returns
-------
ExtensionArray

Raises
------
AbstractMethodError
Always, in this base implementation. Subclasses that want to be
constructible from strings are expected to override this method.
"""
raise AbstractMethodError(cls)

@classmethod
def _from_factorized(cls, values, original):
"""
Expand Down
6 changes: 6 additions & 0 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from pandas.core import nanops
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
from pandas.core.tools.numeric import to_numeric


class _IntegerDtype(ExtensionDtype):
Expand Down Expand Up @@ -261,6 +262,11 @@ def __init__(self, values, mask, copy=False):
def _from_sequence(cls, scalars, dtype=None, copy=False):
return integer_array(scalars, dtype=dtype, copy=copy)

@classmethod
def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
    """Build an array of this type from a sequence of numeric strings.

    The strings are parsed with ``to_numeric`` using ``errors="raise"``,
    and the parsed values are handed to ``_from_sequence`` together with
    ``dtype`` and ``copy``.
    """
    return cls._from_sequence(
        to_numeric(strings, errors="raise"), dtype, copy)

@classmethod
def _from_factorized(cls, values, original):
return integer_array(values, dtype=original.dtype)
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/indexes/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
from pandas.util._decorators import Appender, cache_readonly

from pandas.core.dtypes.common import (
is_bool, is_bool_dtype, is_dtype_equal, is_float, is_integer_dtype,
is_scalar, needs_i8_conversion, pandas_dtype)
is_bool, is_bool_dtype, is_dtype_equal, is_float,
is_integer_dtype, is_scalar, needs_i8_conversion, pandas_dtype)
import pandas.core.dtypes.concat as _concat
from pandas.core.dtypes.missing import isna

Expand Down
33 changes: 26 additions & 7 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
from pandas.core.dtypes.cast import astype_nansafe
from pandas.core.dtypes.common import (
ensure_object, is_bool_dtype, is_categorical_dtype, is_dtype_equal,
is_float, is_integer, is_integer_dtype, is_list_like, is_object_dtype,
is_scalar, is_string_dtype)
is_extension_array_dtype, is_float, is_integer, is_integer_dtype,
is_list_like, is_object_dtype, is_scalar, is_string_dtype, pandas_dtype)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.missing import isna

Expand Down Expand Up @@ -134,7 +134,8 @@
'X'...'X'. Passing in False will cause data to be overwritten if there
are duplicate names in the columns.
dtype : Type name or dict of column -> type, optional
Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32}}
Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
'c': 'Int64'}}
Use `str` or `object` together with suitable `na_values` settings
to preserve and not interpret dtype.
If converters are specified, they will be applied INSTEAD
Expand Down Expand Up @@ -1659,16 +1660,20 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
values, set(col_na_values) | col_na_fvalues,
try_num_bool=False)
else:
is_str_or_ea_dtype = (is_string_dtype(cast_type)
or is_extension_array_dtype(cast_type))
# skip inference if specified dtype is object
try_num_bool = not (cast_type and is_string_dtype(cast_type))
# or casting to an EA
try_num_bool = not (cast_type and is_str_or_ea_dtype)

# general type inference and conversion
cvals, na_count = self._infer_types(
values, set(col_na_values) | col_na_fvalues,
try_num_bool)

# type specified in dtype param
if cast_type and not is_dtype_equal(cvals, cast_type):
# type specified in dtype param or cast_type is an EA
if cast_type and (not is_dtype_equal(cvals, cast_type)
or is_extension_array_dtype(cast_type)):
try:
if (is_bool_dtype(cast_type) and
not is_categorical_dtype(cast_type)
Expand Down Expand Up @@ -1765,6 +1770,20 @@ def _cast_types(self, values, cast_type, column):
cats, cats.get_indexer(values), cast_type,
true_values=self.true_values)

# use the EA's implementation of casting
elif is_extension_array_dtype(cast_type):
# ensure cast_type is an actual dtype and not a string
cast_type = pandas_dtype(cast_type)
array_type = cast_type.construct_array_type()
try:
return array_type._from_sequence_of_strings(values,
dtype=cast_type)
except NotImplementedError:
raise NotImplementedError(
"Extension Array: {ea} must implement "
"_from_sequence_of_strings in order "
"to be used in parser methods".format(ea=array_type))

else:
try:
values = astype_nansafe(values, cast_type,
Expand Down Expand Up @@ -2174,8 +2193,8 @@ def __init__(self, f, **kwds):

self.verbose = kwds['verbose']
self.converters = kwds['converters']
self.dtype = kwds['dtype']

self.dtype = kwds['dtype']
self.thousands = kwds['thousands']
self.decimal = kwds['decimal']

Expand Down
1 change: 1 addition & 0 deletions pandas/tests/extension/base/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,4 @@ class TestMyDtype(BaseDtypeTests):
from .missing import BaseMissingTests # noqa
from .reshaping import BaseReshapingTests # noqa
from .setitem import BaseSetitemTests # noqa
from .io import BaseParsingTests # noqa
24 changes: 24 additions & 0 deletions pandas/tests/extension/base/io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import numpy as np
import pytest

from pandas.compat import StringIO

import pandas as pd
import pandas.testing as tm

from .base import BaseExtensionTests


class BaseParsingTests(BaseExtensionTests):
    """Round-trip tests for reading extension-typed columns via read_csv."""

    @pytest.mark.parametrize('engine', ['c', 'python'])
    def test_EA_types(self, engine, data):
        """Write ``data`` to CSV and read it back with its dtype name."""
        dtype_name = str(data.dtype)
        expected = pd.DataFrame({
            'with_dtype': pd.Series(data, dtype=dtype_name)
        })

        # Serialise without the index, then parse the text again with the
        # requested engine, asking for the same dtype by name.
        csv_text = expected.to_csv(index=False, na_rep=np.nan)
        result = pd.read_csv(StringIO(csv_text),
                             dtype={'with_dtype': dtype_name},
                             engine=engine)

        tm.assert_frame_equal(result, expected)
5 changes: 5 additions & 0 deletions pandas/tests/extension/decimal/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,11 @@ def dtype(self):
def _from_sequence(cls, scalars, dtype=None, copy=False):
return cls(scalars)

@classmethod
def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
    """Build an array from strings by converting each to ``Decimal``.

    Every element of ``strings`` is passed to ``decimal.Decimal`` and the
    resulting list is forwarded to ``_from_sequence`` with ``dtype`` and
    ``copy``.
    """
    parsed = list(map(decimal.Decimal, strings))
    return cls._from_sequence(parsed, dtype, copy)

@classmethod
def _from_factorized(cls, values, original):
return cls(values)
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/extension/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,3 +237,7 @@ def _compare_other(self, s, data, op_name, other):
else:
with pytest.raises(TypeError):
op(data, other)


# Runs the shared parsing tests inherited from base.BaseParsingTests
# unchanged; nothing is overridden in this test module.
class TestParsing(base.BaseParsingTests):
pass
4 changes: 4 additions & 0 deletions pandas/tests/extension/test_integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,3 +218,7 @@ class TestBooleanReduce(base.BaseBooleanReduceTests):

class TestPrinting(base.BasePrintingTests):
pass


# Runs the shared parsing tests inherited from base.BaseParsingTests
# unchanged; nothing is overridden in this test module.
class TestParsing(base.BaseParsingTests):
pass
8 changes: 8 additions & 0 deletions pandas/tests/extension/test_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,11 @@ class TestPrinting(BaseInterval, base.BasePrintingTests):
@pytest.mark.skip(reason="custom repr")
def test_array_repr(self, data, size):
pass


class TestParsing(BaseInterval, base.BaseParsingTests):
    """Parsing tests for this module; the CSV round trip must raise."""

    @pytest.mark.parametrize('engine', ['c', 'python'])
    def test_EA_types(self, engine, data):
        # The inherited round-trip test is expected to fail with the
        # parser's "must implement _from_sequence_of_strings" error.
        with pytest.raises(
                NotImplementedError,
                match=r'.*must implement _from_sequence_of_strings.*'):
            super(TestParsing, self).test_EA_types(engine, data)
4 changes: 4 additions & 0 deletions pandas/tests/extension/test_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,3 +210,7 @@ def test_concat_mixed_dtypes(self, data):

class TestSetitem(BaseNumPyTests, base.BaseSetitemTests):
pass


# Runs the shared parsing tests inherited from base.BaseParsingTests
# (combined with BaseNumPyTests) unchanged; nothing is overridden here.
class TestParsing(BaseNumPyTests, base.BaseParsingTests):
pass
8 changes: 8 additions & 0 deletions pandas/tests/extension/test_period.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,3 +156,11 @@ class TestGroupby(BasePeriodTests, base.BaseGroupbyTests):

class TestPrinting(BasePeriodTests, base.BasePrintingTests):
pass


class TestParsing(BasePeriodTests, base.BaseParsingTests):
    """Parsing tests for this module; the CSV round trip must raise."""

    @pytest.mark.parametrize('engine', ['c', 'python'])
    def test_EA_types(self, engine, data):
        # The inherited round-trip test is expected to fail with the
        # parser's "must implement _from_sequence_of_strings" error.
        with pytest.raises(
                NotImplementedError,
                match=r'.*must implement _from_sequence_of_strings.*'):
            super(TestParsing, self).test_EA_types(engine, data)
9 changes: 9 additions & 0 deletions pandas/tests/extension/test_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,3 +359,12 @@ class TestPrinting(BaseSparseTests, base.BasePrintingTests):
@pytest.mark.xfail(reason='Different repr', strict=True)
def test_array_repr(self, data, size):
super(TestPrinting, self).test_array_repr(data, size)


class TestParsing(BaseSparseTests, base.BaseParsingTests):
    """Parsing tests for this module; the CSV round trip must raise."""

    @pytest.mark.parametrize('engine', ['c', 'python'])
    def test_EA_types(self, engine, data):
        # The inherited round-trip test is expected to fail with the
        # parser's "must implement _from_sequence_of_strings" error.
        with pytest.raises(
                NotImplementedError,
                match=r'.*must implement _from_sequence_of_strings.*'):
            super(TestParsing, self).test_EA_types(engine, data)

Empty file.