Skip to content

Commit ef61d18

Browse files
authored
Move encode_array to utils module from sgkit-plink (#55)
1 parent 9c46ac6 commit ef61d18

File tree

2 files changed

+63
-3
lines changed

2 files changed

+63
-3
lines changed

sgkit/tests/test_utils.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
1+
from typing import Any, List
2+
13
import numpy as np
24
import pytest
35

4-
from sgkit.utils import check_array_like
6+
from sgkit.typing import ArrayLike
7+
from sgkit.utils import check_array_like, encode_array
58

69

710
def test_check_array_like():
@@ -18,3 +21,27 @@ def test_check_array_like():
1821
check_array_like(a, ndim=2)
1922
with pytest.raises(ValueError):
2023
check_array_like(a, ndim={2, 3})
24+
25+
26+
def test_encode_array():
    """Check the codes and unique values produced by ``encode_array``.

    Each case is ``(input, expected codes, expected unique values)``. Codes
    are expected to follow order of first appearance in the input rather
    than sorted order.
    """

    def assert_encoding(x: ArrayLike, values: ArrayLike, names: List[Any]) -> None:
        # Encode once, then compare both halves of the result.
        codes, uniques = encode_array(x)
        np.testing.assert_equal(codes, values)
        np.testing.assert_equal(uniques, names)

    cases: List[Any] = [
        ([], [], []),
        (["a"], [0], ["a"]),
        (["a", "b"], [0, 1], ["a", "b"]),
        (["b", "a"], [0, 1], ["b", "a"]),
        (["a", "b", "b"], [0, 1, 1], ["a", "b"]),
        (["b", "b", "a"], [0, 0, 1], ["b", "a"]),
        (["b", "b", "a", "a"], [0, 0, 1, 1], ["b", "a"]),
        (["c", "a", "a", "b"], [0, 1, 1, 2], ["c", "a", "b"]),
        (
            ["b", "b", "c", "c", "c", "a", "a"],
            [0, 0, 1, 1, 1, 2, 2],
            ["b", "c", "a"],
        ),
        (["b", "c", "b", "c", "a"], [0, 1, 0, 1, 2], ["b", "c", "a"]),
        # Integer and float inputs are both compared against float uniques.
        (
            [2, 2, 1, 3, 1, 5, 5, 1],
            [0, 0, 1, 2, 1, 3, 3, 1],
            [2.0, 1.0, 3.0, 5.0],
        ),
        (
            [2.0, 2.0, 1.0, 3.0, 1.0, 5.0, 5.0, 1.0],
            [0, 0, 1, 2, 1, 3, 3, 1],
            [2.0, 1.0, 3.0, 5.0],
        ),
    ]
    for x, values, names in cases:
        assert_encoding(x, values, names)

sgkit/utils.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
from typing import Any, Set, Union
1+
from typing import Any, List, Set, Tuple, Union
22

33
import numpy as np
44

5-
from .typing import DType
5+
from .typing import ArrayLike, DType
66

77

88
def check_array_like(
@@ -31,3 +31,36 @@ def check_array_like(
3131
raise ValueError
3232
elif ndim != a.ndim:
3333
raise ValueError
34+
35+
36+
def encode_array(x: ArrayLike) -> Tuple[ArrayLike, List[Any]]:
    """Encode array values as integers indexing unique values.

    The codes created for each unique element in the array correspond
    to order of appearance, not the natural sort order for the array
    dtype.

    Parameters
    ----------
    x : array-like
        Array of elements to encode of any type. Typically 1-D with
        shape (M,); for higher-dimensional input, order of appearance
        is taken over the flattened array.

    Returns
    -------
    indexes : ndarray
        Encoded values as integer indices into ``values``, with the
        same shape as ``x``.
    values : ndarray
        Unique values in original array in order of appearance.

    Examples
    --------
    >>> encode_array(['c', 'a', 'a', 'b'])
    (array([0, 1, 1, 2]), array(['c', 'a', 'b'], dtype='<U1'))
    """
    # argsort not implemented in dask: https://github.com/dask/dask/issues/4368
    names, index, inverse = np.unique(x, return_index=True, return_inverse=True)
    # `index` is the first position of each (sorted) unique value in `x`;
    # sorting those positions yields the unique values in appearance order.
    order = np.argsort(index)
    # Invert that permutation so rank[i] is the appearance-order code of the
    # i-th sorted unique value.
    rank = np.empty_like(order)
    rank[order] = np.arange(len(order))
    # For N-d input np.unique returns a flat inverse on NumPy 1.x and an
    # input-shaped one on NumPy 2.x; reshape so the result does not depend
    # on the installed version (a no-op for 1-D input).
    codes = rank[inverse].reshape(np.shape(x))
    return codes, names[order]

0 commit comments

Comments
 (0)