Skip to content

Commit d6dce6d

Browse files
Clean up old metadata (#1607)
Implements property `write.metadata.delete-after-commit.enabled` from https://iceberg.apache.org/docs/1.5.1/maintenance/#remove-old-metadata-files. Closes #1199 --------- Co-authored-by: Kevin Liu <[email protected]>
1 parent df2e16a commit d6dce6d

File tree

4 files changed

+79
-0
lines changed

4 files changed

+79
-0
lines changed

mkdocs/docs/configuration.md

+1
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ Iceberg tables support table properties to configure table behavior.
6363
| `write.parquet.page-row-limit` | Number of rows | 20000 | Set a target threshold for the maximum number of rows within a column chunk |
6464
| `write.parquet.dict-size-bytes` | Size in bytes | 2MB | Set the dictionary page size limit per row group |
6565
| `write.metadata.previous-versions-max` | Integer | 100 | The max number of previous version metadata files to keep before deleting after commit. |
66+
| `write.metadata.delete-after-commit.enabled` | Boolean | False | Whether to automatically delete old *tracked* metadata files after each table commit. It will retain a number of the most recent metadata files, which can be set using property `write.metadata.previous-versions-max`. |
6667
| `write.object-storage.enabled` | Boolean | True | Enables the [`ObjectStoreLocationProvider`](configuration.md#object-store-location-provider) that adds a hash component to file paths. Note: the default value of `True` differs from Iceberg's Java implementation |
6768
| `write.object-storage.partitioned-paths` | Boolean | True | Controls whether [partition values are included in file paths](configuration.md#partition-exclusion) when object storage is enabled |
6869
| `write.py-location-provider.impl` | String of form `module.ClassName` | null | Optional, [custom `LocationProvider`](configuration.md#loading-a-custom-location-provider) implementation |

pyiceberg/catalog/__init__.py

+17
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
CreateTableTransaction,
5656
StagedTable,
5757
Table,
58+
TableProperties,
5859
)
5960
from pyiceberg.table.metadata import TableMetadata, TableMetadataV1, new_table_metadata
6061
from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder
@@ -72,6 +73,7 @@
7273
from pyiceberg.utils.config import Config, merge_config
7374
from pyiceberg.utils.deprecated import deprecated as deprecated
7475
from pyiceberg.utils.deprecated import deprecation_message
76+
from pyiceberg.utils.properties import property_as_bool
7577

7678
if TYPE_CHECKING:
7779
import pyarrow as pa
@@ -757,6 +759,21 @@ def _convert_schema_if_needed(schema: Union[Schema, "pa.Schema"]) -> Schema:
757759
pass
758760
raise ValueError(f"{type(schema)=}, but it must be pyiceberg.schema.Schema or pyarrow.Schema")
759761

762+
@staticmethod
def _delete_old_metadata(io: FileIO, base: TableMetadata, metadata: TableMetadata) -> None:
    """Delete metadata files that dropped out of the metadata log, when enabled.

    The table property ``TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED`` is read
    from ``metadata.properties``; when it resolves to false nothing happens. Otherwise,
    every file referenced by ``base.metadata_log`` that is no longer referenced by
    ``metadata.metadata_log`` is handed to ``delete_files``.

    Args:
        io: File IO passed through to ``delete_files``.
        base: Table metadata whose metadata log lists the candidate files for removal.
        metadata: Table metadata whose properties decide whether to delete, and whose
            metadata log lists the files that must be kept.
    """
    cleanup_enabled: bool = property_as_bool(
        metadata.properties,
        TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED,
        TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED_DEFAULT,
    )
    if not cleanup_enabled:
        return

    # Files still present in the new log are tracked and must survive.
    still_tracked: set[str] = {entry.metadata_file for entry in metadata.metadata_log}
    stale_files: set[str] = {
        entry.metadata_file for entry in base.metadata_log if entry.metadata_file not in still_tracked
    }
    delete_files(io, stale_files, METADATA)
776+
760777
def __repr__(self) -> str:
761778
"""Return the string representation of the Catalog class."""
762779
return f"{self.name} ({self.__class__})"

pyiceberg/table/__init__.py

+13
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,9 @@ class TableProperties:
221221
METADATA_PREVIOUS_VERSIONS_MAX = "write.metadata.previous-versions-max"
222222
METADATA_PREVIOUS_VERSIONS_MAX_DEFAULT = 100
223223

224+
METADATA_DELETE_AFTER_COMMIT_ENABLED = "write.metadata.delete-after-commit.enabled"
225+
METADATA_DELETE_AFTER_COMMIT_ENABLED_DEFAULT = False
226+
224227
MAX_SNAPSHOT_AGE_MS = "history.expire.max-snapshot-age-ms"
225228
MAX_SNAPSHOT_AGE_MS_DEFAULT = 5 * 24 * 60 * 60 * 1000 # 5 days
226229

@@ -1181,6 +1184,16 @@ def refs(self) -> Dict[str, SnapshotRef]:
11811184

11821185
def _do_commit(self, updates: Tuple[TableUpdate, ...], requirements: Tuple[TableRequirement, ...]) -> None:
    """Commit the changes through the catalog and adopt the metadata it returns.

    After the catalog accepts the commit, old metadata files are cleaned up on a
    best-effort basis: a failure there is reported as a warning and never aborts
    the commit. Finally ``self.metadata`` and ``self.metadata_location`` are
    replaced with the values from the commit response.

    Args:
        updates: Table updates forwarded to ``catalog.commit_table``.
        requirements: Table requirements forwarded to ``catalog.commit_table``.
    """
    commit_response = self.catalog.commit_table(self, requirements, updates)

    # https://github.com/apache/iceberg/blob/f6faa58/core/src/main/java/org/apache/iceberg/CatalogUtil.java#L527
    # When METADATA_DELETE_AFTER_COMMIT_ENABLED is set to true, old metadata files are deleted.
    # TableProperties.METADATA_PREVIOUS_VERSIONS_MAX determines how many previous versions are
    # kept; everything else is removed.
    # This must run before self.metadata is reassigned below, because the pre-commit
    # metadata is passed in as the base to compare against.
    try:
        self.catalog._delete_old_metadata(self.io, self.metadata, commit_response.metadata)
    except Exception as e:
        warnings.warn(f"Failed to delete old metadata after commit: {e}")

    self.metadata = commit_response.metadata
    self.metadata_location = commit_response.metadata_location
11861199

tests/catalog/test_sql.py

+48
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
from pyiceberg.io.pyarrow import _dataframe_to_data_files, schema_to_pyarrow
5151
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC
5252
from pyiceberg.schema import Schema
53+
from pyiceberg.table import TableProperties
5354
from pyiceberg.table.snapshots import Operation
5455
from pyiceberg.table.sorting import (
5556
NullOrder,
@@ -1613,3 +1614,50 @@ def test_merge_manifests_local_file_system(catalog: SqlCatalog, arrow_table_with
16131614
tbl.append(arrow_table_with_null)
16141615

16151616
assert len(tbl.scan().to_arrow()) == 5 * len(arrow_table_with_null)
1617+
1618+
1619+
@pytest.mark.parametrize(
    "catalog",
    [
        lazy_fixture("catalog_memory"),
        lazy_fixture("catalog_sqlite"),
        lazy_fixture("catalog_sqlite_without_rowcount"),
    ],
)
def test_delete_metadata_multiple(catalog: SqlCatalog, table_schema_nested: Schema, random_table_identifier: str) -> None:
    """Check that old metadata files are deleted once delete-after-commit is enabled."""

    def exists_on_disk(location: str) -> bool:
        # Drop the leading "file://" so the location can be checked as a local path.
        return os.path.exists(location[len("file://") :])

    catalog.create_namespace(Catalog.namespace_from(random_table_identifier))
    table = catalog.create_table(random_table_identifier, table_schema_nested)

    original_metadata_location = table.metadata_location

    # Five schema changes, each committed in its own transaction.
    for i in range(5):
        with table.transaction() as txn, txn.update_schema() as schema_update:
            schema_update.add_column(path=f"new_column_{i}", field_type=IntegerType())

    assert len(table.metadata.metadata_log) == 5
    assert exists_on_disk(original_metadata_location)

    # Keep at most 2 previous versions and turn on deletion after commit.
    cleanup_properties = {
        TableProperties.METADATA_PREVIOUS_VERSIONS_MAX: "2",
        TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED: "true",
    }

    with table.transaction() as txn:
        txn.set_properties(properties=cleanup_properties)

    # Only the most recent metadata log entries remain.
    assert len(table.metadata.metadata_log) == 2
    updated_metadata_1, updated_metadata_2 = table.metadata.metadata_log

    # One more commit adds a log entry, so the oldest remaining entry is dropped.
    with table.transaction() as txn, txn.update_schema() as schema_update:
        schema_update.add_column(path="new_column_x", field_type=IntegerType())

    assert len(table.metadata.metadata_log) == 2
    assert not exists_on_disk(original_metadata_location)
    assert not exists_on_disk(updated_metadata_1.metadata_file)
    assert exists_on_disk(updated_metadata_2.metadata_file)

0 commit comments

Comments
 (0)