Skip to content

Commit dead46e

Browse files
committed
TILEDB_STRING_ASCII Now Displaying As UTF-8 / str Everywhere
* Previously, `TILEDB_STRING_ASCII` data was inconsistently displayed as `bytes`.
* It needs to be coerced to `str` everywhere because (1) the resulting dataframe previously displayed ASCII as `bytes` with PyArrow disabled but as `str` with PyArrow enabled, and (2) this fix removes the need to copy large amounts of data to convert back and forth in the TileDB-SingleCell Python API.
* A warning is now emitted telling the user to pass `dtype="ascii"` for string dim types in lieu of `np.bytes_` or `np.str_`, for clarity. All three still work and, under the hood, use `np.str_` and `TILEDB_STRING_ASCII`.
* The `repr` of a string dimension now always displays `dtype="ascii"`. Calling `.dtype()` will return `np.dtype('U')`, because the return signature of `dtype` requires a NumPy dtype.
1 parent fe5fb33 commit dead46e

12 files changed

+132
-83
lines changed

HISTORY.md

+3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
# In Progress
22

3+
## API Changes
4+
* `TILEDB_STRING_ASCII` data is now displayed as `str` instead of `bytes` [#1304](https://github.com/TileDB-Inc/TileDB-Py/pull/1304)
5+
36
## Misc Updates
47
* Wheels will no longer be supported for macOS 10.15 Catalina; the minimum supported macOS version is now 11 Big Sur [#1300](https://github.com/TileDB-Inc/TileDB-Py/pull/1300)
58
* Wheels will no longer be supported for Python 3.6 [#1300](https://github.com/TileDB-Inc/TileDB-Py/pull/1300)

tiledb/core.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -246,10 +246,10 @@ py::dtype tiledb_dtype(tiledb_datatype_t type, uint32_t cell_val_num) {
246246
std::string base_str;
247247
switch (type) {
248248
case TILEDB_CHAR:
249-
case TILEDB_STRING_ASCII:
250249
base_str = "|S";
251250
break;
252251
case TILEDB_STRING_UTF8:
252+
case TILEDB_STRING_ASCII:
253253
base_str = "|U";
254254
break;
255255
default:

tiledb/dataframe_.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -273,9 +273,8 @@ def dim_for_column(name, values, dtype, tile, full_domain=False, dim_filters=Non
273273
return tiledb.Dim(
274274
name=name,
275275
domain=(dim_min, dim_max),
276-
# libtiledb only supports TILEDB_ASCII dimensions, so we must use
277-
# nb.bytes_ which will force encoding on write
278-
dtype=np.bytes_ if dtype == np.str_ else dtype,
276+
# TileDB only supports TILEDB_STRING_ASCII dimensions for strings
277+
dtype="ascii" if dtype in (np.bytes_, np.str_) else dtype,
279278
tile=tile,
280279
filters=dim_filters,
281280
)

tiledb/libmetadata.pyx

+2-2
Original file line numberDiff line numberDiff line change
@@ -82,10 +82,10 @@ cdef object unpack_metadata_val(
8282
):
8383
assert value_num != 0, "internal error: unexpected value_num==0"
8484

85-
if value_type == TILEDB_STRING_UTF8:
85+
if value_type == TILEDB_STRING_UTF8 or value_type == TILEDB_STRING_ASCII:
8686
return value_ptr[:value_num].decode('UTF-8') if value_ptr != NULL else ''
8787

88-
if value_type == TILEDB_CHAR or value_type == TILEDB_STRING_ASCII:
88+
if value_type == TILEDB_CHAR:
8989
return value_ptr[:value_num] if value_ptr != NULL else b''
9090

9191
if value_ptr == NULL:

tiledb/libtiledb.pyx

+29-23
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ _tiledb_dtype_to_numpy_typeid_convert ={
110110
TILEDB_INT16: np.NPY_INT16,
111111
TILEDB_UINT16: np.NPY_UINT16,
112112
TILEDB_CHAR: np.NPY_STRING,
113-
TILEDB_STRING_ASCII: np.NPY_STRING,
113+
TILEDB_STRING_ASCII: np.NPY_UNICODE,
114114
TILEDB_STRING_UTF8: np.NPY_UNICODE,
115115
}
116116
IF LIBTILEDB_VERSION_MAJOR >= 2:
@@ -133,7 +133,7 @@ _tiledb_dtype_to_numpy_dtype_convert = {
133133
TILEDB_INT16: np.int16,
134134
TILEDB_UINT16: np.uint16,
135135
TILEDB_CHAR: np.dtype('S1'),
136-
TILEDB_STRING_ASCII: np.dtype('S'),
136+
TILEDB_STRING_ASCII: np.dtype('U'),
137137
TILEDB_STRING_UTF8: np.dtype('U1'),
138138
}
139139
IF LIBTILEDB_VERSION_MAJOR >= 2:
@@ -1824,10 +1824,8 @@ cdef class Attr(object):
18241824
filters_str += repr(f) + ", "
18251825
filters_str += "])"
18261826

1827-
attr_dtype = "ascii" if self.isascii else self.dtype
1828-
18291827
# filters_str must be last with no spaces
1830-
return (f"""Attr(name={repr(self.name)}, dtype='{attr_dtype!s}', """
1828+
return (f"""Attr(name={repr(self.name)}, dtype='{self.dtype!s}', """
18311829
f"""var={self.isvar!s}, nullable={self.isnullable!s}"""
18321830
f"""{filters_str})""")
18331831

@@ -1852,7 +1850,7 @@ cdef class Attr(object):
18521850

18531851
output.write("<tr>")
18541852
output.write(f"<td>{self.name}</td>")
1855-
output.write(f"<td>{'ascii' if self.isascii else self.dtype}</td>")
1853+
output.write(f"<td>{self.isascii}</td>")
18561854
output.write(f"<td>{self.isvar}</td>")
18571855
output.write(f"<td>{self.isnullable}</td>")
18581856
output.write(f"<td>{self.filters._repr_html_()}</td>")
@@ -1903,8 +1901,12 @@ cdef class Dim(object):
19031901
if not ctx:
19041902
ctx = default_ctx()
19051903

1904+
is_string = (
1905+
isinstance(dtype, str) and dtype == "ascii"
1906+
) or np.dtype(dtype) in (np.str_, np.bytes_)
1907+
19061908
if var is not None:
1907-
if var and np.dtype(dtype) not in (np.str_, np.bytes_):
1909+
if var and not is_string:
19081910
raise TypeError("'var=True' specified for non-str/bytes dtype")
19091911

19101912
if domain is not None and len(domain) != 2:
@@ -1919,12 +1921,14 @@ cdef class Dim(object):
19191921
cdef void* tile_size_ptr = NULL
19201922
cdef np.dtype domain_dtype
19211923

1922-
if ((isinstance(dtype, str) and dtype == "ascii") or
1923-
dtype == np.dtype('S')):
1924+
if is_string:
19241925
# Handle var-len domain type
19251926
# (currently only TILEDB_STRING_ASCII)
19261927
# The dimension's domain is implicitly formed as
19271928
# coordinates are written.
1929+
if dtype != "ascii":
1930+
warnings.warn("Use 'ascii' for string dimensions.")
1931+
dtype = np.dtype("|U0")
19281932
dim_datatype = TILEDB_STRING_ASCII
19291933
else:
19301934
if domain is None or len(domain) != 2:
@@ -1985,17 +1989,19 @@ cdef class Dim(object):
19851989
self.ptr = dim_ptr
19861990

19871991
def __repr__(self):
1988-
filters_str = ""
1992+
filters = ""
19891993
if self.filters:
1990-
filters_str = ", filters=FilterList(["
1994+
filters = ", filters=FilterList(["
19911995
for f in self.filters:
1992-
filters_str += repr(f) + ", "
1993-
filters_str += "])"
1996+
filters += repr(f) + ", "
1997+
filters += "])"
1998+
1999+
dtype = "ascii" if self._get_type() == TILEDB_STRING_ASCII else self.dtype
19942000

19952001
# for consistency, print `var=True` for string-like types
1996-
varlen = "" if not self.dtype in (np.str_, np.bytes_) else ", var=True"
1997-
return "Dim(name={0!r}, domain={1!s}, tile={2!r}, dtype='{3!s}'{4}{5})" \
1998-
.format(self.name, self.domain, self.tile, self.dtype, varlen, filters_str)
2002+
varlen = "" if dtype != "ascii" else ", var=True"
2003+
return f"Dim(name={self.name!r}, domain={self.domain}, tile={self.tile!r}, dtype='{dtype}'{varlen}{filters})"
2004+
19992005

20002006
def _repr_html_(self) -> str:
20012007
output = io.StringIO()
@@ -2022,7 +2028,7 @@ cdef class Dim(object):
20222028
output.write(f"<td>{self.domain}</td>")
20232029
output.write(f"<td>{self.tile}</td>")
20242030
output.write(f"<td>{self.dtype}</td>")
2025-
output.write(f"<td>{self.dtype in (np.str_, np.bytes_)}</td>")
2031+
output.write(f"<td>{self.dtype == 'ascii'}</td>")
20262032
output.write(f"<td>{self.filters._repr_html_()}</td>")
20272033
output.write("</tr>")
20282034

@@ -2222,7 +2228,7 @@ cdef class Dim(object):
22222228
:rtype: tuple(numpy scalar, numpy scalar)
22232229
22242230
"""
2225-
if self.dtype == np.dtype('S'):
2231+
if self.dtype == np.dtype('U'):
22262232
return None, None
22272233
cdef const void* domain_ptr = NULL
22282234
check_error(self.ctx,
@@ -3864,9 +3870,8 @@ cdef class Array(object):
38643870
results.append((None, None))
38653871
continue
38663872

3867-
buf_dtype = 'S'
3868-
start_buf = np.empty(start_size, 'S' + str(start_size))
3869-
end_buf = np.empty(end_size, 'S' + str(end_size))
3873+
start_buf = np.empty(start_size, f"S{start_size}")
3874+
end_buf = np.empty(end_size, f"S{end_size}")
38703875
start_buf_ptr = np.PyArray_DATA(start_buf)
38713876
end_buf_ptr = np.PyArray_DATA(end_buf)
38723877
else:
@@ -3884,7 +3889,8 @@ cdef class Array(object):
38843889
return None
38853890

38863891
if start_size > 0 and end_size > 0:
3887-
results.append((start_buf.item(0), end_buf.item(0)))
3892+
results.append((start_buf.item(0).decode("UTF-8"),
3893+
end_buf.item(0).decode("UTF-8")))
38883894
else:
38893895
results.append((None, None))
38903896
else:
@@ -4918,7 +4924,7 @@ def index_domain_coords(dom: Domain, idx: tuple, check_ndim: bool):
49184924
# ensure strings contain only ASCII characters
49194925
domain_coords.append(np.array(sel, dtype=np.bytes_, ndmin=1))
49204926
except Exception as exc:
4921-
raise TileDBError(f'Dim\' strings may only contain ASCII characters')
4927+
raise TileDBError('Dimension strings may only contain ASCII characters')
49224928
else:
49234929
domain_coords.append(np.array(sel, dtype=dim.dtype, ndmin=1))
49244930

tiledb/tests/test_filters.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ def test_dictionary_encoding(self):
9797
schema = tiledb.ArraySchema(domain=dom, attrs=[attr], sparse=True)
9898
tiledb.Array.create(path, schema)
9999

100-
data = [b"x" * i for i in np.random.randint(1, 10, size=10)]
100+
data = ["x" * i for i in np.random.randint(1, 10, size=10)]
101101

102102
with tiledb.open(path, "w") as A:
103103
A[np.arange(10)] = data

tiledb/tests/test_fragments.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def test_array_fragments_var(self):
7575

7676
uri = self.path("test_array_fragments_var")
7777
dom = tiledb.Domain(
78-
tiledb.Dim(name="dim", domain=(None, None), tile=None, dtype=np.bytes_)
78+
tiledb.Dim(name="dim", domain=(None, None), tile=None, dtype="ascii")
7979
)
8080
schema = tiledb.ArraySchema(
8181
domain=dom,
@@ -285,22 +285,22 @@ def test_nonempty_domain_date(self):
285285
def test_nonempty_domain_strings(self):
286286
uri = self.path("test_nonempty_domain_strings")
287287
dom = tiledb.Domain(
288-
tiledb.Dim(name="x", domain=(None, None), dtype=np.bytes_),
289-
tiledb.Dim(name="y", domain=(None, None), dtype=np.bytes_),
288+
tiledb.Dim(name="x", domain=(None, None), dtype="ascii"),
289+
tiledb.Dim(name="y", domain=(None, None), dtype="ascii"),
290290
)
291291
att = tiledb.Attr()
292292
schema = tiledb.ArraySchema(sparse=True, domain=dom, attrs=(att,))
293293

294294
tiledb.SparseArray.create(uri, schema)
295295

296296
with tiledb.SparseArray(uri, mode="w") as T:
297-
x_dims = [b"a", b"b", b"c", b"d"]
298-
y_dims = [b"e", b"f", b"g", b"h"]
297+
x_dims = ["a", "b", "c", "d"]
298+
y_dims = ["e", "f", "g", "h"]
299299
T[x_dims, y_dims] = np.array([1, 2, 3, 4])
300300

301301
with tiledb.SparseArray(uri, mode="w") as T:
302-
x_dims = [b"a", b"b"]
303-
y_dims = [b"e", b"f"]
302+
x_dims = ["a", "b"]
303+
y_dims = ["e", "f"]
304304
T[x_dims, y_dims] = np.array([1, 2])
305305

306306
fragment_info = PyFragmentInfo(uri, schema, False, tiledb.default_ctx())

0 commit comments

Comments
 (0)