Commit 9b15c86

Table statistics

1 parent 384e229 commit 9b15c86

File tree

7 files changed: +215 -45 lines changed

dev/provision.py (+24)

@@ -399,3 +399,27 @@
     )
     spark.sql(f"ALTER TABLE {catalog_name}.default.test_empty_scan_ordered_str WRITE ORDERED BY id")
     spark.sql(f"INSERT INTO {catalog_name}.default.test_empty_scan_ordered_str VALUES 'a', 'c'")
+
+    spark.sql(
+        f"""
+        CREATE OR REPLACE TABLE {catalog_name}.default.test_table_statistics_operations (
+            number integer
+        )
+        USING iceberg
+        TBLPROPERTIES (
+            'format-version'='2'
+        );
+    """
+    )
+    spark.sql(
+        f"""
+        INSERT INTO {catalog_name}.default.test_table_statistics_operations
+        VALUES (1)
+    """
+    )
+    spark.sql(
+        f"""
+        INSERT INTO {catalog_name}.default.test_table_statistics_operations
+        VALUES (2)
+    """
+    )
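
The two single-row INSERTs are intentional: each one creates its own snapshot, and the integration test in this commit attaches a statistics file to each of those snapshots. A quick sanity check of the provisioned table could look like the sketch below; the catalog name `default` and its configuration are assumptions, not part of this commit.

```python
from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")  # assumed catalog name/configuration
tbl = catalog.load_table("default.test_table_statistics_operations")

# The two INSERTs above each produce a snapshot for the test to target.
assert len(tbl.history()) >= 2
```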

mkdocs/docs/api.md (+22)

@@ -1129,6 +1129,28 @@ with table.manage_snapshots() as ms:
     ms.create_branch(snapshot_id1, "Branch_A").create_tag(snapshot_id2, "tag789")
 ```

+## Table Statistics Management
+
+Manage table statistics with operations through the `Table` API:
+
+```python
+# To run a specific operation
+table.update_statistics().set_statistics(snapshot_id, statistics_file).commit()
+# To run multiple operations
+table.update_statistics()
+    .set_statistics(snapshot_id1, statistics_file1)
+    .remove_statistics(snapshot_id2)
+# Operations are applied on commit.
+```
+
+You can also use context managers to make more changes:
+
+```python
+with table.update_statistics() as update:
+    update.set_statistics(1, statistics_file)
+    update.remove_statistics(2)
+```
+
 ## Query the data

 To query a table, a table scan is needed. A table scan accepts a filter, columns, optionally a limit and a snapshot ID:
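
As the trailing comment in the docs snippet says, chained operations only take effect once `commit()` is called. Spelled out as a single valid expression, the multi-operation form could be written as below; `snapshot_id1`, `snapshot_id2`, and `statistics_file1` are assumed to already exist.

```python
(
    table.update_statistics()
    .set_statistics(snapshot_id1, statistics_file1)
    .remove_statistics(snapshot_id2)
    .commit()
)
```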

pyiceberg/table/__init__.py (+18 -39)

@@ -84,7 +84,6 @@
     SnapshotLogEntry,
 )
 from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder
-from pyiceberg.table.statistics import StatisticsFile
 from pyiceberg.table.update import (
     AddPartitionSpecUpdate,
     AddSchemaUpdate,
@@ -95,14 +94,12 @@
     AssertTableUUID,
     AssignUUIDUpdate,
     RemovePropertiesUpdate,
-    RemoveStatisticsUpdate,
     SetCurrentSchemaUpdate,
     SetDefaultSortOrderUpdate,
     SetDefaultSpecUpdate,
     SetLocationUpdate,
     SetPropertiesUpdate,
     SetSnapshotRefUpdate,
-    SetStatisticsUpdate,
     TableRequirement,
     TableUpdate,
     UpdatesAndRequirements,
@@ -119,6 +116,7 @@
     _OverwriteFiles,
 )
 from pyiceberg.table.update.spec import UpdateSpec
+from pyiceberg.table.update.statistics import UpdateStatistics
 from pyiceberg.typedef import (
     EMPTY_DICT,
     IcebergBaseModel,
@@ -666,42 +664,6 @@ def update_location(self, location: str) -> Transaction:
         """
         raise NotImplementedError("Not yet implemented")

-    def set_statistics(self, snapshot_id: int, statistics_file: StatisticsFile) -> Transaction:
-        """Set the statistics for a snapshot.
-
-        Args:
-            snapshot_id: The snapshot ID to set the statistics for.
-            statistics_file: The statistics file to set.
-
-        Returns:
-            The alter table builder.
-        """
-        updates = (
-            SetStatisticsUpdate(
-                snapshot_id=snapshot_id,
-                statistics=statistics_file,
-            ),
-        )
-
-        return self._apply(updates, ())
-
-    def remove_statistics(self, snapshot_id: int) -> Transaction:
-        """Remove the statistics for a snapshot.
-
-        Args:
-            snapshot_id: The snapshot ID to remove the statistics for.
-
-        Returns:
-            The alter table builder.
-        """
-        updates = (
-            RemoveStatisticsUpdate(
-                snapshot_id=snapshot_id,
-            ),
-        )
-
-        return self._apply(updates, ())
-
     def commit_transaction(self) -> Table:
         """Commit the changes to the catalog.

@@ -1021,6 +983,23 @@ def manage_snapshots(self) -> ManageSnapshots:
         """
         return ManageSnapshots(transaction=Transaction(self, autocommit=True))

+    def update_statistics(self) -> UpdateStatistics:
+        """
+        Shorthand to run statistics management operations like add statistics and remove statistics.
+
+        Use table.update_statistics().<operation>().commit() to run a specific operation.
+        Use table.update_statistics().<operation-one>().<operation-two>().commit() to run multiple operations.
+
+        Pending changes are applied on commit.
+
+        We can also use context managers to make more changes. For example:
+
+        with table.update_statistics() as update:
+            update.set_statistics(snapshot_id=1, statistics_file=statistics_file)
+            update.remove_statistics(snapshot_id=2)
+        """
+        return UpdateStatistics(transaction=Transaction(self, autocommit=True))
+
     def update_schema(self, allow_incompatible_changes: bool = False, case_sensitive: bool = True) -> UpdateSchema:
         """Create a new UpdateSchema to alter the columns of this table.

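The net effect in this file is that statistics changes move off `Transaction` and onto the builder returned by `Table.update_statistics()`. A before/after sketch, where `tbl`, `snapshot_id`, and `statistics_file` are placeholders assumed to exist:

```python
# Removed by this commit: statistics were set directly on a Transaction.
# tbl.transaction().set_statistics(snapshot_id, statistics_file).commit_transaction()

# Added by this commit: an autocommitting UpdateStatistics builder.
tbl.update_statistics().set_statistics(snapshot_id, statistics_file).commit()
```
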
pyiceberg/table/statistics.py (+8)

@@ -38,4 +38,12 @@ class StatisticsFile(IcebergBaseModel):
     statistics_path: str = Field(alias="statistics-path")
     file_size_in_bytes: int = Field(alias="file-size-in-bytes")
     file_footer_size_in_bytes: int = Field(alias="file-footer-size-in-bytes")
+    key_metadata: Optional[str] = Field(alias="key-metadata", default=None)
     blob_metadata: List[BlobMetadata] = Field(alias="blob-metadata")
+
+
+def reject_statistics(
+    statistics: List[StatisticsFile],
+    reject_snapshot_id: int,
+) -> List[StatisticsFile]:
+    return [stat for stat in statistics if stat.snapshot_id != reject_snapshot_id]
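
The new `reject_statistics` helper simply filters a list of statistics files by snapshot id. A small, self-contained illustration; the paths and sizes are made-up placeholder values:

```python
from pyiceberg.table.statistics import StatisticsFile, reject_statistics

stats = [
    StatisticsFile(
        snapshot_id=sid,
        statistics_path=f"s3://bucket/warehouse/stats-{sid}.puffin",  # placeholder path
        file_size_in_bytes=100,
        file_footer_size_in_bytes=10,
        key_metadata=None,
        blob_metadata=[],
    )
    for sid in (1, 2)
]

# Dropping snapshot 1 leaves only the entry for snapshot 2.
assert [s.snapshot_id for s in reject_statistics(stats, reject_snapshot_id=1)] == [2]
```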

pyiceberg/table/update/__init__.py (+5 -6)

@@ -37,7 +37,7 @@
     SnapshotLogEntry,
 )
 from pyiceberg.table.sorting import SortOrder
-from pyiceberg.table.statistics import StatisticsFile
+from pyiceberg.table.statistics import StatisticsFile, reject_statistics
 from pyiceberg.typedef import (
     IcebergBaseModel,
     Properties,
@@ -496,19 +496,18 @@ def _(update: SetStatisticsUpdate, base_metadata: TableMetadata, context: _TableMetadataUpdateContext) -> TableMetadata:
     if update.snapshot_id != update.statistics.snapshot_id:
         raise ValueError("Snapshot id in statistics does not match the snapshot id in the update")

-    rest_statistics = [stat for stat in base_metadata.statistics if stat.snapshot_id != update.snapshot_id]
-
+    statistics = reject_statistics(base_metadata.statistics, update.snapshot_id)
     context.add_update(update)
-    return base_metadata.model_copy(update={"statistics": rest_statistics + [update.statistics]})
+
+    return base_metadata.model_copy(update={"statistics": statistics + [update.statistics]})


 @_apply_table_update.register(RemoveStatisticsUpdate)
 def _(update: RemoveStatisticsUpdate, base_metadata: TableMetadata, context: _TableMetadataUpdateContext) -> TableMetadata:
     if not any(stat.snapshot_id == update.snapshot_id for stat in base_metadata.statistics):
         raise ValueError(f"Statistics with snapshot id {update.snapshot_id} does not exist")

-    statistics = [stat for stat in base_metadata.statistics if stat.snapshot_id != update.snapshot_id]
-
+    statistics = reject_statistics(base_metadata.statistics, update.snapshot_id)
     context.add_update(update)

     return base_metadata.model_copy(update={"statistics": statistics})
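
With the helper in place, both handlers share the same shape: `SetStatisticsUpdate` behaves like an upsert (any existing entry for that snapshot is dropped before the new file is appended), while `RemoveStatisticsUpdate` requires the entry to exist. A semantics-only sketch of the upsert step, not the dispatch code itself:

```python
from typing import List

from pyiceberg.table.statistics import StatisticsFile, reject_statistics


def upsert_statistics(existing: List[StatisticsFile], new_file: StatisticsFile) -> List[StatisticsFile]:
    # Mirrors the SetStatisticsUpdate handler: reject any old entry, then append.
    return reject_statistics(existing, new_file.snapshot_id) + [new_file]
```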

pyiceberg/table/update/statistics.py (new file, +75)

@@ -0,0 +1,75 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from typing import TYPE_CHECKING, Tuple
+
+from pyiceberg.table.statistics import StatisticsFile
+from pyiceberg.table.update import (
+    RemoveStatisticsUpdate,
+    SetStatisticsUpdate,
+    TableUpdate,
+    UpdatesAndRequirements,
+    UpdateTableMetadata,
+)
+
+if TYPE_CHECKING:
+    from pyiceberg.table import Transaction
+
+
+class UpdateStatistics(UpdateTableMetadata["UpdateStatistics"]):
+    """
+    Run statistics management operations using APIs.
+
+    APIs include set_statistics and remove statistics operations.
+
+    Use table.update_statistics().<operation>().commit() to run a specific operation.
+    Use table.update_statistics().<operation-one>().<operation-two>().commit() to run multiple operations.
+
+    Pending changes are applied on commit.
+
+    We can also use context managers to make more changes. For example:
+
+    with table.update_statistics() as update:
+        update.set_statistics(snapshot_id=1, statistics_file=statistics_file)
+        update.remove_statistics(snapshot_id=2)
+    """
+
+    _updates: Tuple[TableUpdate, ...] = ()
+
+    def __init__(self, transaction: "Transaction") -> None:
+        super().__init__(transaction)
+
+    def set_statistics(self, snapshot_id: int, statistics_file: StatisticsFile) -> "UpdateStatistics":
+        self._updates += (
+            SetStatisticsUpdate(
+                snapshot_id=snapshot_id,
+                statistics=statistics_file,
+            ),
+        )
+
+        return self
+
+    def remove_statistics(self, snapshot_id: int) -> "UpdateStatistics":
+        self._updates += (
+            RemoveStatisticsUpdate(
+                snapshot_id=snapshot_id,
+            ),
+        )
+
+        return self
+
+    def _commit(self) -> UpdatesAndRequirements:
+        return self._updates, ()
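
Both entry points into the builder land here. A usage sketch, assuming `tbl` is an already-loaded `Table` and `snapshot_id`, `old_snapshot_id`, and `stats_file` are placeholders:

```python
# Chained form: each call stages a TableUpdate; commit() applies them because
# the builder wraps an autocommitting Transaction.
tbl.update_statistics().set_statistics(snapshot_id, stats_file).commit()

# Context-manager form: the staged updates are committed when the block exits.
with tbl.update_statistics() as update:
    update.set_statistics(snapshot_id, stats_file)
    update.remove_statistics(old_snapshot_id)
```
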
New integration test file (path not shown in this view) (new file, +63)

@@ -0,0 +1,63 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import pytest
+
+from pyiceberg.catalog import Catalog
+from pyiceberg.table.statistics import BlobMetadata, StatisticsFile
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
+def test_manage_statistics(catalog: Catalog) -> None:
+    identifier = "default.test_table_statistics_operations"
+    tbl = catalog.load_table(identifier)
+
+    add_snapshot_id_1 = tbl.history()[0].snapshot_id
+    add_snapshot_id_2 = tbl.history()[1].snapshot_id
+
+    def create_statistics_file(snapshot_id: int) -> StatisticsFile:
+        blob_metadata = BlobMetadata(
+            type="boring-type",
+            snapshot_id=snapshot_id,
+            sequence_number=2,
+            fields=[1],
+            properties={"prop-key": "prop-value"},
+        )
+
+        statistics_file = StatisticsFile(
+            snapshot_id=snapshot_id,
+            statistics_path="s3://bucket/warehouse/stats.puffin",
+            file_size_in_bytes=124,
+            file_footer_size_in_bytes=27,
+            blob_metadata=[blob_metadata],
+        )
+
+        return statistics_file
+
+    statistics_file_snap_1 = create_statistics_file(add_snapshot_id_1)
+    statistics_file_snap_2 = create_statistics_file(add_snapshot_id_2)
+
+    with tbl.update_statistics() as update:
+        update.set_statistics(add_snapshot_id_1, statistics_file_snap_1)
+        update.set_statistics(add_snapshot_id_2, statistics_file_snap_2)
+
+    assert len(tbl.metadata.statistics) == 2
+
+    with tbl.update_statistics() as update:
+        update.remove_statistics(add_snapshot_id_1)
+
+    assert len(tbl.metadata.statistics) == 1
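
The test leans on the two snapshots created by the provisioning change above. A hypothetical follow-up check in the same style (not part of the committed test) would confirm the upsert behaviour: re-setting statistics for a snapshot that already has an entry keeps a single entry rather than adding a duplicate.

```python
# Hypothetical extension of test_manage_statistics, reusing its local helper.
with tbl.update_statistics() as update:
    update.set_statistics(add_snapshot_id_2, create_statistics_file(add_snapshot_id_2))

# Snapshot 2 already had statistics, so the count stays at one entry.
assert len(tbl.metadata.statistics) == 1
```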
