Skip to content

Commit c9e5bf4

Browse files
gfyoung authored and jorisvandenbossche committed
BUG: Patch read_csv NA values behaviour
Patches the following behaviour when `na_values` is passed in as a dictionary:

1. Prevent aliasing in case `na_values` was defined in a broader scope.
2. Respect column indices as keys when doing NA conversions.

Closes pandas-dev#14203.

Author: gfyoung <[email protected]>

Closes pandas-dev#14751 from gfyoung/csv-na-values-patching and squashes the following commits:

cac422c [gfyoung] BUG: Respect column indices for dict-like na_values
1439c27 [gfyoung] BUG: Prevent aliasing of dict na_values

(cherry picked from commit dd8cba2)
1 parent c520b25 commit c9e5bf4

File tree

4 files changed

+62
-13
lines changed

4 files changed

+62
-13
lines changed

doc/source/whatsnew/v0.19.2.txt

+2
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,8 @@ Bug Fixes
4040

4141
- Compat with ``dateutil==2.6.0``; segfault reported in the testing suite (:issue:`14621`)
4242
- Allow ``nanoseconds`` in ``Timestamp.replace`` as a kwarg (:issue:`14621`)
43+
- Bug in ``pd.read_csv`` in which aliasing was being done for ``na_values`` when passed in as a dictionary (:issue:`14203`)
44+
- Bug in ``pd.read_csv`` in which column indices for a dict-like ``na_values`` were not being respected (:issue:`14203`)
4345
- Bug in ``pd.read_csv`` where reading files fails, if the number of headers is equal to the number of lines in the file (:issue:`14515`)
4446
- Bug in ``pd.read_csv`` for the Python engine in which an unhelpful error message was being raised when multi-char delimiters were not being respected with quotes (:issue:`14582`)
4547
- Fix bugs (:issue:`14734`, :issue:`13654`) in ``pd.read_sas`` and ``pandas.io.sas.sas7bdat.SAS7BDATReader`` that caused problems when reading a SAS file incrementally.

pandas/io/parsers.py

+22-2
Original file line number | Diff line number | Diff line change
@@ -2040,8 +2040,27 @@ def _convert_data(self, data):
20402040
col = self.orig_names[col]
20412041
clean_conv[col] = f
20422042

2043-
return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues,
2044-
self.verbose, clean_conv)
2043+
# Apply NA values.
2044+
clean_na_values = {}
2045+
clean_na_fvalues = {}
2046+
2047+
if isinstance(self.na_values, dict):
2048+
for col in self.na_values:
2049+
na_value = self.na_values[col]
2050+
na_fvalue = self.na_fvalues[col]
2051+
2052+
if isinstance(col, int) and col not in self.orig_names:
2053+
col = self.orig_names[col]
2054+
2055+
clean_na_values[col] = na_value
2056+
clean_na_fvalues[col] = na_fvalue
2057+
else:
2058+
clean_na_values = self.na_values
2059+
clean_na_fvalues = self.na_fvalues
2060+
2061+
return self._convert_to_ndarrays(data, clean_na_values,
2062+
clean_na_fvalues, self.verbose,
2063+
clean_conv)
20452064

20462065
def _to_recarray(self, data, columns):
20472066
dtypes = []
@@ -2749,6 +2768,7 @@ def _clean_na_values(na_values, keep_default_na=True):
27492768
na_values = []
27502769
na_fvalues = set()
27512770
elif isinstance(na_values, dict):
2771+
na_values = na_values.copy() # Prevent aliasing.
27522772
if keep_default_na:
27532773
for k, v in compat.iteritems(na_values):
27542774
if not is_list_like(v):

pandas/io/tests/parser/na_values.py

+23
Original file line number | Diff line number | Diff line change
@@ -266,3 +266,26 @@ def test_na_values_scalar(self):
266266
out = self.read_csv(StringIO(data), names=names,
267267
na_values={'a': 2, 'b': 1})
268268
tm.assert_frame_equal(out, expected)
269+
270+
def test_na_values_dict_aliasing(self):
271+
na_values = {'a': 2, 'b': 1}
272+
na_values_copy = na_values.copy()
273+
274+
names = ['a', 'b']
275+
data = '1,2\n2,1'
276+
277+
expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names)
278+
out = self.read_csv(StringIO(data), names=names, na_values=na_values)
279+
280+
tm.assert_frame_equal(out, expected)
281+
tm.assert_dict_equal(na_values, na_values_copy)
282+
283+
def test_na_values_dict_col_index(self):
284+
# see gh-14203
285+
286+
data = 'a\nfoo\n1'
287+
na_values = {0: 'foo'}
288+
289+
out = self.read_csv(StringIO(data), na_values=na_values)
290+
expected = DataFrame({'a': [np.nan, 1]})
291+
tm.assert_frame_equal(out, expected)

pandas/parser.pyx

+15-11
Original file line number | Diff line number | Diff line change
@@ -1243,19 +1243,23 @@ cdef class TextReader:
12431243
return None, set()
12441244

12451245
if isinstance(self.na_values, dict):
1246+
key = None
12461247
values = None
1248+
12471249
if name is not None and name in self.na_values:
1248-
values = self.na_values[name]
1249-
if values is not None and not isinstance(values, list):
1250-
values = list(values)
1251-
fvalues = self.na_fvalues[name]
1252-
if fvalues is not None and not isinstance(fvalues, set):
1253-
fvalues = set(fvalues)
1254-
else:
1255-
if i in self.na_values:
1256-
return self.na_values[i], self.na_fvalues[i]
1257-
else:
1258-
return _NA_VALUES, set()
1250+
key = name
1251+
elif i in self.na_values:
1252+
key = i
1253+
else: # No na_values provided for this column.
1254+
return _NA_VALUES, set()
1255+
1256+
values = self.na_values[key]
1257+
if values is not None and not isinstance(values, list):
1258+
values = list(values)
1259+
1260+
fvalues = self.na_fvalues[key]
1261+
if fvalues is not None and not isinstance(fvalues, set):
1262+
fvalues = set(fvalues)
12591263

12601264
return _ensure_encoded(values), fvalues
12611265
else:

0 commit comments

Comments
 (0)