diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index 3ff98b7b5a9b5..c061102fbaddc 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -355,19 +355,38 @@ cdef class {{name}}HashTable(HashTable):
 
         return np.asarray(locs)
 
-    def factorize(self, {{dtype}}_t values):
-        uniques = {{name}}Vector()
-        labels = self.get_labels(values, uniques, 0, 0)
-        return uniques.to_array(), labels
-
     @cython.boundscheck(False)
-    def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
-                   Py_ssize_t count_prior, Py_ssize_t na_sentinel,
+    @cython.wraparound(False)
+    def _factorize(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
+                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                    object na_value=None):
+        """
+        Calculate unique values and labels (no sorting); ignores all NA-values
+
+        Parameters
+        ----------
+        values : ndarray[{{dtype}}]
+            Array of values of which unique will be calculated
+        uniques : {{name}}Vector
+            Vector into which uniques will be written
+        count_prior : Py_ssize_t, default 0
+            Number of existing entries in uniques
+        na_sentinel : Py_ssize_t, default -1
+            Sentinel value used for all NA-values in inverse
+        na_value : object, default None
+            Value to identify as missing. If na_value is None, then
+            any value satisfying val!=val is considered missing.
+
+        Returns
+        -------
+        uniques : ndarray[{{dtype}}]
+            Unique values of input, not sorted
+        labels : ndarray[int64]
+            The labels from values to uniques
+        """
         cdef:
-            Py_ssize_t i, n = len(values)
+            Py_ssize_t i, idx, count = count_prior, n = len(values)
             int64_t[:] labels
-            Py_ssize_t idx, count = count_prior
             int ret = 0
             {{dtype}}_t val, na_value2
             khiter_t k
@@ -399,9 +418,11 @@ cdef class {{name}}HashTable(HashTable):
 
             k = kh_get_{{dtype}}(self.table, val)
             if k != self.table.n_buckets:
+                # k falls into a previous bucket
                 idx = self.table.vals[k]
                 labels[i] = idx
             else:
+                # k hasn't been seen yet
                 k = kh_put_{{dtype}}(self.table, val, &ret)
                 self.table.vals[k] = count
 
@@ -418,6 +439,19 @@ cdef class {{name}}HashTable(HashTable):
 
         return np.asarray(labels)
 
+    def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
+                  object na_value=None):
+        uniques = {{name}}Vector()
+        labels = self._factorize(values, uniques=uniques,
+                                 na_sentinel=na_sentinel, na_value=na_value)
+        return labels, uniques.to_array()
+
+    def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
+                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+                   object na_value=None):
+        return self._factorize(values, uniques, count_prior=count_prior,
+                               na_sentinel=na_sentinel, na_value=na_value)
+
     @cython.boundscheck(False)
     def get_labels_groupby(self, const {{dtype}}_t[:] values):
         cdef:
@@ -464,7 +498,21 @@ cdef class {{name}}HashTable(HashTable):
         return np.asarray(labels), arr_uniques
 
     @cython.boundscheck(False)
+    @cython.wraparound(False)
     def unique(self, const {{dtype}}_t[:] values):
+        """
+        Calculate unique values without sorting
+
+        Parameters
+        ----------
+        values : ndarray[{{dtype}}]
+            Array of values of which unique will be calculated
+
+        Returns
+        -------
+        uniques : ndarray[{{dtype}}]
+            Unique values of input, not sorted
+        """
         cdef:
             Py_ssize_t i, n = len(values)
             int ret = 0
@@ -567,7 +615,21 @@ cdef class StringHashTable(HashTable):
         return labels
 
     @cython.boundscheck(False)
+    @cython.wraparound(False)
     def unique(self, ndarray[object] values):
+        """
+        Calculate unique values without sorting
+
+        Parameters
+        ----------
+        values : ndarray[object]
+            Array of values of which unique will be calculated
+
+        Returns
+        -------
+        uniques : ndarray[object]
+            Unique values of input, not sorted
+        """
         cdef:
             Py_ssize_t i, count, n = len(values)
             int64_t[:] uindexer
@@ -602,11 +664,6 @@ cdef class StringHashTable(HashTable):
                 uniques.append(values[uindexer[i]])
         return uniques.to_array()
 
-    def factorize(self, ndarray[object] values):
-        uniques = ObjectVector()
-        labels = self.get_labels(values, uniques, 0, 0)
-        return uniques.to_array(), labels
-
     @cython.boundscheck(False)
     def lookup(self, ndarray[object] values):
         cdef:
@@ -669,14 +726,37 @@ cdef class StringHashTable(HashTable):
         free(vecs)
 
     @cython.boundscheck(False)
-    def get_labels(self, ndarray[object] values, ObjectVector uniques,
-                   Py_ssize_t count_prior, int64_t na_sentinel,
+    @cython.wraparound(False)
+    def _factorize(self, ndarray[object] values, ObjectVector uniques,
+                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                    object na_value=None):
+        """
+        Calculate unique values and labels (no sorting); ignores all NA-values
+
+        Parameters
+        ----------
+        values : ndarray[object]
+            Array of values of which unique will be calculated
+        uniques : ObjectVector
+            Vector into which uniques will be written
+        count_prior : Py_ssize_t, default 0
+            Number of existing entries in uniques
+        na_sentinel : Py_ssize_t, default -1
+            Sentinel value used for all NA-values in inverse
+        na_value : object, default None
+            Value to identify as missing
+
+        Returns
+        -------
+        uniques : ndarray[object]
+            Unique values of input, not sorted
+        labels : ndarray[int64]
+            The labels from values to uniques
+        """
         cdef:
-            Py_ssize_t i, n = len(values)
+            Py_ssize_t i, idx, count = count_prior, n = len(values)
             int64_t[:] labels
             int64_t[:] uindexer
-            Py_ssize_t idx, count = count_prior
             int ret = 0
             object val
             const char *v
@@ -684,19 +764,17 @@ cdef class StringHashTable(HashTable):
             khiter_t k
             bint use_na_value
 
-        # these by-definition *must* be strings
         labels = np.zeros(n, dtype=np.int64)
         uindexer = np.empty(n, dtype=np.int64)
         use_na_value = na_value is not None
 
-        # pre-filter out missing
-        # and assign pointers
+        # assign pointers and pre-filter out missing
        vecs = <const char **>malloc(n * sizeof(char *))
         for i in range(n):
             val = values[i]
 
-            if ((PyUnicode_Check(val) or PyString_Check(val)) and
-                    not (use_na_value and val == na_value)):
+            if ((PyUnicode_Check(val) or PyString_Check(val))
+                    and not (use_na_value and val == na_value)):
                 v = util.get_c_string(val)
                 vecs[i] = v
             else:
@@ -711,9 +789,11 @@ cdef class StringHashTable(HashTable):
                     v = vecs[i]
                     k = kh_get_str(self.table, v)
                     if k != self.table.n_buckets:
+                        # k falls into a previous bucket
                         idx = self.table.vals[k]
                         labels[i] = idx
                     else:
+                        # k hasn't been seen yet
                         k = kh_put_str(self.table, v, &ret)
                         self.table.vals[k] = count
                         uindexer[count] = i
@@ -728,6 +808,19 @@ cdef class StringHashTable(HashTable):
 
         return np.asarray(labels)
 
+    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
+                  object na_value=None):
+        uniques = ObjectVector()
+        labels = self._factorize(values, uniques=uniques,
+                                 na_sentinel=na_sentinel, na_value=na_value)
+        return labels, uniques.to_array()
+
+    def get_labels(self, ndarray[object] values, ObjectVector uniques,
+                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+                   object na_value=None):
+        return self._factorize(values, uniques, count_prior=count_prior,
+                               na_sentinel=na_sentinel, na_value=na_value)
+
 
 cdef class PyObjectHashTable(HashTable):
 
@@ -814,7 +907,22 @@ cdef class PyObjectHashTable(HashTable):
 
         return np.asarray(locs)
 
+    @cython.boundscheck(False)
+    @cython.wraparound(False)
     def unique(self, ndarray[object] values):
+        """
+        Calculate unique values without sorting
+
+        Parameters
+        ----------
+        values : ndarray[object]
+            Array of values of which unique will be calculated
+
+        Returns
+        -------
+        uniques : ndarray[object]
+            Unique values of input, not sorted
+        """
         cdef:
             Py_ssize_t i, n = len(values)
             int ret = 0
@@ -832,13 +940,38 @@ cdef class PyObjectHashTable(HashTable):
 
         return uniques.to_array()
 
-    def get_labels(self, ndarray[object] values, ObjectVector uniques,
-                   Py_ssize_t count_prior, int64_t na_sentinel,
+    @cython.boundscheck(False)
+    @cython.wraparound(False)
+    def _factorize(self, ndarray[object] values, ObjectVector uniques,
+                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                    object na_value=None):
+        """
+        Calculate unique values and labels (no sorting); ignores all NA-values
+
+        Parameters
+        ----------
+        values : ndarray[object]
+            Array of values of which unique will be calculated
+        uniques : ObjectVector
+            Vector into which uniques will be written
+        count_prior : Py_ssize_t, default 0
+            Number of existing entries in uniques
+        na_sentinel : Py_ssize_t, default -1
+            Sentinel value used for all NA-values in inverse
+        na_value : object, default None
+            Value to identify as missing. If na_value is None, then None _plus_
+            any value satisfying val!=val is considered missing.
+
+        Returns
+        -------
+        uniques : ndarray[object]
+            Unique values of input, not sorted
+        labels : ndarray[int64]
+            The labels from values to uniques
+        """
         cdef:
-            Py_ssize_t i, n = len(values)
+            Py_ssize_t i, idx, count = count_prior, n = len(values)
             int64_t[:] labels
-            Py_ssize_t idx, count = count_prior
             int ret = 0
             object val
             khiter_t k
@@ -851,16 +984,18 @@ cdef class PyObjectHashTable(HashTable):
             val = values[i]
             hash(val)
 
-            if ((val != val or val is None) or
-                    (use_na_value and val == na_value)):
+            if ((val != val or val is None)
+                    or (use_na_value and val == na_value)):
                 labels[i] = na_sentinel
                 continue
 
             k = kh_get_pymap(self.table, val)
             if k != self.table.n_buckets:
+                # k falls into a previous bucket
                 idx = self.table.vals[k]
                 labels[i] = idx
             else:
+                # k hasn't been seen yet
                 k = kh_put_pymap(self.table, val, &ret)
                 self.table.vals[k] = count
                 uniques.append(val)
@@ -868,3 +1003,16 @@ cdef class PyObjectHashTable(HashTable):
                 count += 1
 
         return np.asarray(labels)
+
+    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
+                  object na_value=None):
+        uniques = ObjectVector()
+        labels = self._factorize(values, uniques=uniques,
+                                 na_sentinel=na_sentinel, na_value=na_value)
+        return labels, uniques.to_array()
+
+    def get_labels(self, ndarray[object] values, ObjectVector uniques,
+                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+                   object na_value=None):
+        return self._factorize(values, uniques, count_prior=count_prior,
+                               na_sentinel=na_sentinel, na_value=na_value)
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index cb9ffc4bd0fd5..0f1eb12883fd5 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -467,15 +467,13 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None,
     -------
     labels, uniques : ndarray
     """
-    (hash_klass, vec_klass), values = _get_data_algo(values, _hashtables)
+    (hash_klass, _), values = _get_data_algo(values, _hashtables)
     table = hash_klass(size_hint or len(values))
-    uniques = vec_klass()
-    labels = table.get_labels(values, uniques, 0, na_sentinel,
-                              na_value=na_value)
+    labels, uniques = table.factorize(values, na_sentinel=na_sentinel,
+                                      na_value=na_value)
 
     labels = ensure_platform_int(labels)
-    uniques = uniques.to_array()
     return labels, uniques
 
 
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 1fd801c68fdde..557669260604a 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -15,7 +15,6 @@
 from pandas import compat
 from pandas._libs import (groupby as libgroupby, algos as libalgos,
                           hashtable as ht)
-from pandas._libs.hashtable import unique_label_indices
 from pandas.compat import lrange, range
 import pandas.core.algorithms as algos
 import pandas.core.common as com
@@ -228,19 +227,53 @@ def test_complex_sorting(self):
 
         pytest.raises(TypeError, algos.factorize, x17[::-1], sort=True)
 
+    def test_float64_factorize(self, writable):
+        data = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64)
+        data.setflags(write=writable)
+        exp_labels = np.array([0, 1, 0, 2, 1, 0], dtype=np.intp)
+        exp_uniques = np.array([1.0, 1e8, 1e-8], dtype=np.float64)
+
+        labels, uniques = algos.factorize(data)
+        tm.assert_numpy_array_equal(labels, exp_labels)
+        tm.assert_numpy_array_equal(uniques, exp_uniques)
+
     def test_uint64_factorize(self, writable):
-        data = np.array([2**63, 1, 2**63], dtype=np.uint64)
+        data = np.array([2**64 - 1, 1, 2**64 - 1], dtype=np.uint64)
         data.setflags(write=writable)
         exp_labels = np.array([0, 1, 0], dtype=np.intp)
-        exp_uniques = np.array([2**63, 1], dtype=np.uint64)
+        exp_uniques = np.array([2**64 - 1, 1], dtype=np.uint64)
 
         labels, uniques = algos.factorize(data)
         tm.assert_numpy_array_equal(labels, exp_labels)
         tm.assert_numpy_array_equal(uniques, exp_uniques)
 
-        data = np.array([2**63, -1, 2**63], dtype=object)
+    def test_int64_factorize(self, writable):
+        data = np.array([2**63 - 1, -2**63, 2**63 - 1], dtype=np.int64)
+        data.setflags(write=writable)
         exp_labels = np.array([0, 1, 0], dtype=np.intp)
-        exp_uniques = np.array([2**63, -1], dtype=object)
+        exp_uniques = np.array([2**63 - 1, -2**63], dtype=np.int64)
+
+        labels, uniques = algos.factorize(data)
+        tm.assert_numpy_array_equal(labels, exp_labels)
+        tm.assert_numpy_array_equal(uniques, exp_uniques)
+
+    def test_string_factorize(self, writable):
+        data = np.array(['a', 'c', 'a', 'b', 'c'],
+                        dtype=object)
+        data.setflags(write=writable)
+        exp_labels = np.array([0, 1, 0, 2, 1], dtype=np.intp)
+        exp_uniques = np.array(['a', 'c', 'b'], dtype=object)
+
+        labels, uniques = algos.factorize(data)
+        tm.assert_numpy_array_equal(labels, exp_labels)
+        tm.assert_numpy_array_equal(uniques, exp_uniques)
+
+    def test_object_factorize(self, writable):
+        data = np.array(['a', 'c', None, np.nan, 'a', 'b', pd.NaT, 'c'],
+                        dtype=object)
+        data.setflags(write=writable)
+        exp_labels = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp)
+        exp_uniques = np.array(['a', 'c', 'b'], dtype=object)
 
         labels, uniques = algos.factorize(data)
         tm.assert_numpy_array_equal(labels, exp_labels)
@@ -1262,41 +1295,107 @@ def test_get_unique(self):
         exp = np.array([1, 2, 2**63], dtype=np.uint64)
         tm.assert_numpy_array_equal(s.unique(), exp)
 
-    def test_vector_resize(self, writable):
+    @pytest.mark.parametrize('nvals', [0, 10])  # resizing to 0 is special case
+    @pytest.mark.parametrize('htable, uniques, dtype, safely_resizes', [
+        (ht.PyObjectHashTable, ht.ObjectVector, 'object', False),
+        (ht.StringHashTable, ht.ObjectVector, 'object', True),
+        (ht.Float64HashTable, ht.Float64Vector, 'float64', False),
+        (ht.Int64HashTable, ht.Int64Vector, 'int64', False),
+        (ht.UInt64HashTable, ht.UInt64Vector, 'uint64', False)])
+    def test_vector_resize(self, writable, htable, uniques, dtype,
+                           safely_resizes, nvals):
         # Test for memory errors after internal vector
-        # reallocations (pull request #7157)
-
-        def _test_vector_resize(htable, uniques, dtype, nvals, safely_resizes):
-            vals = np.array(np.random.randn(1000), dtype=dtype)
-            # GH 21688 ensure we can deal with readonly memory views
-            vals.setflags(write=writable)
-            # get_labels may append to uniques
-            htable.get_labels(vals[:nvals], uniques, 0, -1)
-            # to_array() set an external_view_exists flag on uniques.
-            tmp = uniques.to_array()
-            oldshape = tmp.shape
-            # subsequent get_labels() calls can no longer append to it
-            # (for all but StringHashTables + ObjectVector)
-            if safely_resizes:
+        # reallocations (GH 7157)
+        vals = np.array(np.random.randn(1000), dtype=dtype)
+
+        # GH 21688 ensures we can deal with read-only memory views
+        vals.setflags(write=writable)
+
+        # initialise instances; cannot initialise in parametrization,
+        # as otherwise external views would be held on the array (which is
+        # one of the things this test is checking)
+        htable = htable()
+        uniques = uniques()
+
+        # get_labels may append to uniques
+        htable.get_labels(vals[:nvals], uniques, 0, -1)
+        # to_array() sets an external_view_exists flag on uniques.
+        tmp = uniques.to_array()
+        oldshape = tmp.shape
+
+        # subsequent get_labels() calls can no longer append to it
+        # (except for StringHashTables + ObjectVector)
+        if safely_resizes:
+            htable.get_labels(vals, uniques, 0, -1)
+        else:
+            with tm.assert_raises_regex(ValueError, 'external reference.*'):
                 htable.get_labels(vals, uniques, 0, -1)
-            else:
-                with pytest.raises(ValueError) as excinfo:
-                    htable.get_labels(vals, uniques, 0, -1)
-                assert str(excinfo.value).startswith('external reference')
-            uniques.to_array()  # should not raise here
-            assert tmp.shape == oldshape
-
-        test_cases = [
-            (ht.PyObjectHashTable, ht.ObjectVector, 'object', False),
-            (ht.StringHashTable, ht.ObjectVector, 'object', True),
-            (ht.Float64HashTable, ht.Float64Vector, 'float64', False),
-            (ht.Int64HashTable, ht.Int64Vector, 'int64', False),
-            (ht.UInt64HashTable, ht.UInt64Vector, 'uint64', False)]
-
-        for (tbl, vect, dtype, safely_resizes) in test_cases:
-            # resizing to empty is a special case
-            _test_vector_resize(tbl(), vect(), dtype, 0, safely_resizes)
-            _test_vector_resize(tbl(), vect(), dtype, 10, safely_resizes)
+
+        uniques.to_array()  # should not raise here
+        assert tmp.shape == oldshape
+
+    @pytest.mark.parametrize('htable, tm_dtype', [
+        (ht.PyObjectHashTable, 'String'),
+        (ht.StringHashTable, 'String'),
+        (ht.Float64HashTable, 'Float'),
+        (ht.Int64HashTable, 'Int'),
+        (ht.UInt64HashTable, 'UInt')])
+    def test_hashtable_unique(self, htable, tm_dtype, writable):
+        # output of maker has guaranteed unique elements
+        maker = getattr(tm, 'make' + tm_dtype + 'Index')
+        s = Series(maker(1000))
+        if htable == ht.Float64HashTable:
+            # add NaN for float column
+            s.loc[500] = np.nan
+        elif htable == ht.PyObjectHashTable:
+            # use different NaN types for object column
+            s.loc[500:502] = [np.nan, None, pd.NaT]
+
+        # create duplicated selection
+        s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
+        s_duplicated.values.setflags(write=writable)
+
+        # drop_duplicates has own cython code (hash_table_func_helper.pxi)
+        # and is tested separately; keeps first occurrence like ht.unique()
+        expected_unique = s_duplicated.drop_duplicates(keep='first').values
+        result_unique = htable().unique(s_duplicated.values)
+        tm.assert_numpy_array_equal(result_unique, expected_unique)
+
+    @pytest.mark.parametrize('htable, tm_dtype', [
+        (ht.PyObjectHashTable, 'String'),
+        (ht.StringHashTable, 'String'),
+        (ht.Float64HashTable, 'Float'),
+        (ht.Int64HashTable, 'Int'),
+        (ht.UInt64HashTable, 'UInt')])
+    def test_hashtable_factorize(self, htable, tm_dtype, writable):
+        # output of maker has guaranteed unique elements
+        maker = getattr(tm, 'make' + tm_dtype + 'Index')
+        s = Series(maker(1000))
+        if htable == ht.Float64HashTable:
+            # add NaN for float column
+            s.loc[500] = np.nan
+        elif htable == ht.PyObjectHashTable:
+            # use different NaN types for object column
+            s.loc[500:502] = [np.nan, None, pd.NaT]
+
+        # create duplicated selection
+        s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
+        s_duplicated.values.setflags(write=writable)
+        na_mask = s_duplicated.isna().values
+
+        result_inverse, result_unique = htable().factorize(s_duplicated.values)
+
+        # drop_duplicates has own cython code (hash_table_func_helper.pxi)
+        # and is tested separately; keeps first occurrence like ht.factorize()
+        # since factorize removes all NaNs, we do the same here
+        expected_unique = s_duplicated.dropna().drop_duplicates().values
+        tm.assert_numpy_array_equal(result_unique, expected_unique)
+
+        # reconstruction can only succeed if the inverse is correct. Since
+        # factorize removes the NaNs, those have to be excluded here as well
+        result_reconstruct = result_unique[result_inverse[~na_mask]]
+        expected_reconstruct = s_duplicated.dropna().values
+        tm.assert_numpy_array_equal(result_reconstruct, expected_reconstruct)
 
 
 def test_quantile():
@@ -1311,14 +1410,14 @@ def test_unique_label_indices():
 
     a = np.random.randint(1, 1 << 10, 1 << 15).astype('i8')
 
-    left = unique_label_indices(a)
+    left = ht.unique_label_indices(a)
     right = np.unique(a, return_index=True)[1]
     tm.assert_numpy_array_equal(left, right,
                                 check_dtype=False)
 
    a[np.random.choice(len(a), 10)] = -1
 
-    left = unique_label_indices(a)
+    left = ht.unique_label_indices(a)
     right = np.unique(a, return_index=True)[1][1:]
     tm.assert_numpy_array_equal(left, right,
                                 check_dtype=False)
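
For illustration (not part of the patch): after this change the hashtable factorize methods return (labels, uniques) in that order, accept na_sentinel/na_value keywords, and keep get_labels as a thin wrapper around the shared _factorize implementation, which is what _factorize_array in pandas/core/algorithms.py now relies on. A minimal usage sketch, assuming a pandas build with this patch applied:

    import numpy as np
    import pandas._libs.hashtable as ht

    values = np.array([1.0, 1e8, np.nan, 1.0], dtype=np.float64)

    # labels come first, uniques second (matching pandas.factorize);
    # NaN is mapped to na_sentinel (-1 by default) and excluded from uniques
    labels, uniques = ht.Float64HashTable().factorize(values)
    # labels:  [ 0  1 -1  0]
    # uniques: [1.e+00 1.e+08]

    # na_value marks an additional value as missing, here treating 1.0
    # as a missing-value marker on top of NaN
    labels2, uniques2 = ht.Float64HashTable().factorize(values, na_value=1.0)
    # labels2:  [-1  0 -1 -1]
    # uniques2: [1.e+08]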