Skip to content

Commit d56dddd

Browse files
authored
Allow non-string typed values in table properties (#469)
* property accept int https://stackoverflow.com/questions/77304167/using-pydantic-to-change-int-to-string https://docs.pydantic.dev/latest/concepts/validators/\#field-validators * add tests * add integration tests * pr feedback * make validator reusable * show key when none
1 parent 6708a6e commit d56dddd

File tree

8 files changed

+144
-11
lines changed

8 files changed

+144
-11
lines changed

pyiceberg/catalog/rest.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
Union,
2929
)
3030

31-
from pydantic import Field, ValidationError
31+
from pydantic import Field, ValidationError, field_validator
3232
from requests import HTTPError, Session
3333
from tenacity import RetryCallState, retry, retry_if_exception_type, stop_after_attempt
3434

@@ -69,6 +69,7 @@
6969
)
7070
from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder, assign_fresh_sort_order_ids
7171
from pyiceberg.typedef import EMPTY_DICT, UTF8, IcebergBaseModel
72+
from pyiceberg.types import transform_dict_value_to_str
7273

7374
if TYPE_CHECKING:
7475
import pyarrow as pa
@@ -147,6 +148,8 @@ class CreateTableRequest(IcebergBaseModel):
147148
write_order: Optional[SortOrder] = Field(alias="write-order")
148149
stage_create: bool = Field(alias="stage-create", default=False)
149150
properties: Properties = Field(default_factory=dict)
151+
# validators
152+
transform_properties_dict_value_to_str = field_validator('properties', mode='before')(transform_dict_value_to_str)
150153

151154

152155
class RegisterTableRequest(IcebergBaseModel):
@@ -234,9 +237,9 @@ def _create_session(self) -> Session:
234237

235238
# Sets the client side and server side SSL cert verification, if provided as properties.
236239
if ssl_config := self.properties.get(SSL):
237-
if ssl_ca_bundle := ssl_config.get(CA_BUNDLE): # type: ignore
240+
if ssl_ca_bundle := ssl_config.get(CA_BUNDLE):
238241
session.verify = ssl_ca_bundle
239-
if ssl_client := ssl_config.get(CLIENT): # type: ignore
242+
if ssl_client := ssl_config.get(CLIENT):
240243
if all(k in ssl_client for k in (CERT, KEY)):
241244
session.cert = (ssl_client[CERT], ssl_client[KEY])
242245
elif ssl_client_cert := ssl_client.get(CERT):

pyiceberg/table/metadata.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
Union,
2929
)
3030

31-
from pydantic import Field, model_validator
31+
from pydantic import Field, field_validator, model_validator
3232
from pydantic import ValidationError as PydanticValidationError
3333
from typing_extensions import Annotated
3434

@@ -49,6 +49,7 @@
4949
IcebergRootModel,
5050
Properties,
5151
)
52+
from pyiceberg.types import transform_dict_value_to_str
5253
from pyiceberg.utils.datetime import datetime_to_millis
5354

5455
CURRENT_SNAPSHOT_ID = "current-snapshot-id"
@@ -218,6 +219,9 @@ class TableMetadataCommonFields(IcebergBaseModel):
218219
There is always a main branch reference pointing to the
219220
current-snapshot-id even if the refs map is null."""
220221

222+
# validators
223+
transform_properties_dict_value_to_str = field_validator('properties', mode='before')(transform_dict_value_to_str)
224+
221225
def snapshot_by_id(self, snapshot_id: int) -> Optional[Snapshot]:
222226
"""Get the snapshot by snapshot_id."""
223227
return next((snapshot for snapshot in self.snapshots if snapshot.snapshot_id == snapshot_id), None)

pyiceberg/typedef.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def __missing__(self, key: K) -> V:
7373

7474

7575
Identifier = Tuple[str, ...]
76-
Properties = Dict[str, str]
76+
Properties = Dict[str, Any]
7777
RecursiveDict = Dict[str, Union[str, "RecursiveDict"]]
7878

7979
# Represents the literal value

pyiceberg/types.py

+9
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
from typing import (
3838
Any,
3939
ClassVar,
40+
Dict,
4041
Literal,
4142
Optional,
4243
Tuple,
@@ -61,6 +62,14 @@
6162
FIXED_PARSER = ParseNumberFromBrackets(FIXED)
6263

6364

65+
def transform_dict_value_to_str(dict: Dict[str, Any]) -> Dict[str, str]:
66+
"""Transform all values in the dictionary to string. Raise an error if any value is None."""
67+
for key, value in dict.items():
68+
if value is None:
69+
raise ValueError(f"None type is not a supported value in properties: {key}")
70+
return {k: str(v) for k, v in dict.items()}
71+
72+
6473
def _parse_decimal_type(decimal: Any) -> Tuple[int, int]:
6574
if isinstance(decimal, str):
6675
matches = DECIMAL_REGEX.search(decimal)

tests/catalog/test_base.py

+20-2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
import pyarrow as pa
2828
import pytest
29+
from pydantic_core import ValidationError
2930
from pytest_lazyfixture import lazy_fixture
3031

3132
from pyiceberg.catalog import (
@@ -255,13 +256,16 @@ def catalog() -> InMemoryCatalog:
255256
NAMESPACE_NOT_EMPTY_ERROR = "Namespace is not empty: \\('com', 'organization', 'department'\\)"
256257

257258

258-
def given_catalog_has_a_table(catalog: InMemoryCatalog) -> Table:
259+
def given_catalog_has_a_table(
260+
catalog: InMemoryCatalog,
261+
properties: Properties = EMPTY_DICT,
262+
) -> Table:
259263
return catalog.create_table(
260264
identifier=TEST_TABLE_IDENTIFIER,
261265
schema=TEST_TABLE_SCHEMA,
262266
location=TEST_TABLE_LOCATION,
263267
partition_spec=TEST_TABLE_PARTITION_SPEC,
264-
properties=TEST_TABLE_PROPERTIES,
268+
properties=properties or TEST_TABLE_PROPERTIES,
265269
)
266270

267271

@@ -661,3 +665,17 @@ def test_add_column_with_statement(catalog: InMemoryCatalog) -> None:
661665
def test_catalog_repr(catalog: InMemoryCatalog) -> None:
662666
s = repr(catalog)
663667
assert s == "test.in.memory.catalog (<class 'test_base.InMemoryCatalog'>)"
668+
669+
670+
def test_table_properties_int_value(catalog: InMemoryCatalog) -> None:
671+
# table properties can be set to int, but still serialized to string
672+
property_with_int = {"property_name": 42}
673+
given_table = given_catalog_has_a_table(catalog, properties=property_with_int)
674+
assert isinstance(given_table.properties["property_name"], str)
675+
676+
677+
def test_table_properties_raise_for_none_value(catalog: InMemoryCatalog) -> None:
678+
property_with_none = {"property_name": None}
679+
with pytest.raises(ValidationError) as exc_info:
680+
_ = given_catalog_has_a_table(catalog, properties=property_with_none)
681+
assert "None type is not a supported value in properties: property_name" in str(exc_info.value)

tests/catalog/test_sql.py

+38-1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
import pyarrow as pa
2323
import pytest
24+
from pydantic_core import ValidationError
2425
from pytest_lazyfixture import lazy_fixture
2526
from sqlalchemy.exc import ArgumentError, IntegrityError
2627

@@ -640,7 +641,7 @@ def test_create_namespace_with_null_properties(catalog: SqlCatalog, database_nam
640641
catalog.create_namespace(namespace=database_name, properties={None: "value"}) # type: ignore
641642

642643
with pytest.raises(IntegrityError):
643-
catalog.create_namespace(namespace=database_name, properties={"key": None}) # type: ignore
644+
catalog.create_namespace(namespace=database_name, properties={"key": None})
644645

645646

646647
@pytest.mark.parametrize(
@@ -915,3 +916,39 @@ def test_write_and_evolve(catalog: SqlCatalog, format_version: int) -> None:
915916
with txn.update_snapshot().fast_append() as snapshot_update:
916917
for data_file in _dataframe_to_data_files(table_metadata=txn.table_metadata, df=pa_table_with_column, io=tbl.io):
917918
snapshot_update.append_data_file(data_file)
919+
920+
921+
@pytest.mark.parametrize(
922+
'catalog',
923+
[
924+
lazy_fixture('catalog_memory'),
925+
lazy_fixture('catalog_sqlite'),
926+
lazy_fixture('catalog_sqlite_without_rowcount'),
927+
],
928+
)
929+
def test_table_properties_int_value(catalog: SqlCatalog, table_schema_simple: Schema, random_identifier: Identifier) -> None:
930+
# table properties can be set to int, but still serialized to string
931+
database_name, _table_name = random_identifier
932+
catalog.create_namespace(database_name)
933+
property_with_int = {"property_name": 42}
934+
table = catalog.create_table(random_identifier, table_schema_simple, properties=property_with_int)
935+
assert isinstance(table.properties["property_name"], str)
936+
937+
938+
@pytest.mark.parametrize(
939+
'catalog',
940+
[
941+
lazy_fixture('catalog_memory'),
942+
lazy_fixture('catalog_sqlite'),
943+
lazy_fixture('catalog_sqlite_without_rowcount'),
944+
],
945+
)
946+
def test_table_properties_raise_for_none_value(
947+
catalog: SqlCatalog, table_schema_simple: Schema, random_identifier: Identifier
948+
) -> None:
949+
database_name, _table_name = random_identifier
950+
catalog.create_namespace(database_name)
951+
property_with_none = {"property_name": None}
952+
with pytest.raises(ValidationError) as exc_info:
953+
_ = catalog.create_table(random_identifier, table_schema_simple, properties=property_with_none)
954+
assert "None type is not a supported value in properties: property_name" in str(exc_info.value)

tests/integration/test_writes.py

+37-2
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import pytest
2929
import pytz
3030
from pyarrow.fs import S3FileSystem
31+
from pydantic_core import ValidationError
3132
from pyspark.sql import SparkSession
3233
from pytest_mock.plugin import MockerFixture
3334

@@ -403,7 +404,7 @@ def test_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_w
403404

404405

405406
@pytest.mark.integration
406-
@pytest.mark.parametrize("format_version", ["1", "2"])
407+
@pytest.mark.parametrize("format_version", [1, 2])
407408
@pytest.mark.parametrize(
408409
"properties, expected_compression_name",
409410
[
@@ -419,7 +420,7 @@ def test_write_parquet_compression_properties(
419420
spark: SparkSession,
420421
session_catalog: Catalog,
421422
arrow_table_with_null: pa.Table,
422-
format_version: str,
423+
format_version: int,
423424
properties: Dict[str, Any],
424425
expected_compression_name: str,
425426
) -> None:
@@ -654,3 +655,37 @@ def test_write_and_evolve(session_catalog: Catalog, format_version: int) -> None
654655
with txn.update_snapshot().fast_append() as snapshot_update:
655656
for data_file in _dataframe_to_data_files(table_metadata=txn.table_metadata, df=pa_table_with_column, io=tbl.io):
656657
snapshot_update.append_data_file(data_file)
658+
659+
660+
@pytest.mark.integration
661+
@pytest.mark.parametrize("format_version", [1, 2])
662+
def test_table_properties_int_value(
663+
session_catalog: Catalog,
664+
arrow_table_with_null: pa.Table,
665+
format_version: int,
666+
) -> None:
667+
# table properties can be set to int, but still serialized to string
668+
property_with_int = {"property_name": 42}
669+
identifier = "default.test_table_properties_int_value"
670+
671+
tbl = _create_table(
672+
session_catalog, identifier, {"format-version": format_version, **property_with_int}, [arrow_table_with_null]
673+
)
674+
assert isinstance(tbl.properties["property_name"], str)
675+
676+
677+
@pytest.mark.integration
678+
@pytest.mark.parametrize("format_version", [1, 2])
679+
def test_table_properties_raise_for_none_value(
680+
session_catalog: Catalog,
681+
arrow_table_with_null: pa.Table,
682+
format_version: int,
683+
) -> None:
684+
property_with_none = {"property_name": None}
685+
identifier = "default.test_table_properties_raise_for_none_value"
686+
687+
with pytest.raises(ValidationError) as exc_info:
688+
_ = _create_table(
689+
session_catalog, identifier, {"format-version": format_version, **property_with_none}, [arrow_table_with_null]
690+
)
691+
assert "None type is not a supported value in properties: property_name" in str(exc_info.value)

tests/table/test_init.py

+28-1
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,11 @@
1717
# pylint:disable=redefined-outer-name
1818
import uuid
1919
from copy import copy
20-
from typing import Dict
20+
from typing import Any, Dict
2121

2222
import pyarrow as pa
2323
import pytest
24+
from pydantic import ValidationError
2425
from sortedcontainers import SortedList
2526

2627
from pyiceberg.catalog.noop import NoopCatalog
@@ -1081,3 +1082,29 @@ def test_schema_mismatch_additional_field(table_schema_simple: Schema) -> None:
10811082

10821083
with pytest.raises(ValueError, match=expected):
10831084
_check_schema(table_schema_simple, other_schema)
1085+
1086+
1087+
def test_table_properties(example_table_metadata_v2: Dict[str, Any]) -> None:
1088+
# metadata properties are all strings
1089+
for k, v in example_table_metadata_v2["properties"].items():
1090+
assert isinstance(k, str)
1091+
assert isinstance(v, str)
1092+
metadata = TableMetadataV2(**example_table_metadata_v2)
1093+
for k, v in metadata.properties.items():
1094+
assert isinstance(k, str)
1095+
assert isinstance(v, str)
1096+
1097+
# property can be set to int, but still serialized as string
1098+
property_with_int = {"property_name": 42}
1099+
new_example_table_metadata_v2 = {**example_table_metadata_v2, "properties": property_with_int}
1100+
assert isinstance(new_example_table_metadata_v2["properties"]["property_name"], int)
1101+
new_metadata = TableMetadataV2(**new_example_table_metadata_v2)
1102+
assert isinstance(new_metadata.properties["property_name"], str)
1103+
1104+
1105+
def test_table_properties_raise_for_none_value(example_table_metadata_v2: Dict[str, Any]) -> None:
1106+
property_with_none = {"property_name": None}
1107+
example_table_metadata_v2 = {**example_table_metadata_v2, "properties": property_with_none}
1108+
with pytest.raises(ValidationError) as exc_info:
1109+
TableMetadataV2(**example_table_metadata_v2)
1110+
assert "None type is not a supported value in properties: property_name" in str(exc_info.value)

0 commit comments

Comments
 (0)