Skip to content

Commit adfc971

Browse files
committed
Add table statistics update
1 parent 3b58011 commit adfc971

File tree

11 files changed

+495
-2
lines changed

11 files changed

+495
-2
lines changed

dev/provision.py

+24
Original file line numberDiff line numberDiff line change
@@ -401,3 +401,27 @@
401401
)
402402
spark.sql(f"ALTER TABLE {catalog_name}.default.test_empty_scan_ordered_str WRITE ORDERED BY id")
403403
spark.sql(f"INSERT INTO {catalog_name}.default.test_empty_scan_ordered_str VALUES 'a', 'c'")
404+
405+
spark.sql(
406+
f"""
407+
CREATE OR REPLACE TABLE {catalog_name}.default.test_table_statistics_operations (
408+
number integer
409+
)
410+
USING iceberg
411+
TBLPROPERTIES (
412+
'format-version'='2'
413+
);
414+
"""
415+
)
416+
spark.sql(
417+
f"""
418+
INSERT INTO {catalog_name}.default.test_table_statistics_operations
419+
VALUES (1)
420+
"""
421+
)
422+
spark.sql(
423+
f"""
424+
INSERT INTO {catalog_name}.default.test_table_statistics_operations
425+
VALUES (2)
426+
"""
427+
)

mkdocs/docs/api.md

+23
Original file line numberDiff line numberDiff line change
@@ -1250,6 +1250,29 @@ with table.manage_snapshots() as ms:
12501250
ms.create_branch(snapshot_id1, "Branch_A").create_tag(snapshot_id2, "tag789")
12511251
```
12521252

1253+
## Table Statistics Management
1254+
1255+
Manage table statistics with operations through the `Table` API:
1256+
1257+
```python
1258+
# To run a specific operation
1259+
table.update_statistics().set_statistics(snapshot_id, statistics_file).commit()
1260+
# To run multiple operations
1261+
table.update_statistics()
1262+
.set_statistics(snapshot_id1, statistics_file1)
1263+
.remove_statistics(snapshot_id2)
1264+
.commit()
1265+
# Operations are applied on commit.
1266+
```
1267+
1268+
You can also use context managers to make more changes:
1269+
1270+
```python
1271+
with table.update_statistics() as update:
1272+
update.set_statistics(snaphsot_id1, statistics_file)
1273+
update.remove_statistics(snapshot_id2)
1274+
```
1275+
12531276
## Query the data
12541277

12551278
To query a table, a table scan is needed. A table scan accepts a filter, columns, optionally a limit and a snapshot ID:

pyiceberg/table/__init__.py

+18
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@
118118
_FastAppendFiles,
119119
)
120120
from pyiceberg.table.update.spec import UpdateSpec
121+
from pyiceberg.table.update.statistics import UpdateStatistics
121122
from pyiceberg.transforms import IdentityTransform
122123
from pyiceberg.typedef import (
123124
EMPTY_DICT,
@@ -1035,6 +1036,23 @@ def manage_snapshots(self) -> ManageSnapshots:
10351036
"""
10361037
return ManageSnapshots(transaction=Transaction(self, autocommit=True))
10371038

1039+
def update_statistics(self) -> UpdateStatistics:
1040+
"""
1041+
Shorthand to run statistics management operations like add statistics and remove statistics.
1042+
1043+
Use table.update_statistics().<operation>().commit() to run a specific operation.
1044+
Use table.update_statistics().<operation-one>().<operation-two>().commit() to run multiple operations.
1045+
1046+
Pending changes are applied on commit.
1047+
1048+
We can also use context managers to make more changes. For example:
1049+
1050+
with table.update_statistics() as update:
1051+
update.set_statistics(snapshot_id=1, statistics_file=statistics_file)
1052+
update.remove_statistics(snapshot_id=2)
1053+
"""
1054+
return UpdateStatistics(transaction=Transaction(self, autocommit=True))
1055+
10381056
def update_schema(self, allow_incompatible_changes: bool = False, case_sensitive: bool = True) -> UpdateSchema:
10391057
"""Create a new UpdateSchema to alter the columns of this table.
10401058

pyiceberg/table/metadata.py

+9
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
SortOrder,
4545
assign_fresh_sort_order_ids,
4646
)
47+
from pyiceberg.table.statistics import StatisticsFile
4748
from pyiceberg.typedef import (
4849
EMPTY_DICT,
4950
IcebergBaseModel,
@@ -221,6 +222,14 @@ class TableMetadataCommonFields(IcebergBaseModel):
221222
There is always a main branch reference pointing to the
222223
current-snapshot-id even if the refs map is null."""
223224

225+
statistics: List[StatisticsFile] = Field(default_factory=list)
226+
"""A optional list of table statistics files.
227+
Table statistics files are valid Puffin files. Statistics are
228+
informational. A reader can choose to ignore statistics
229+
information. Statistics support is not required to read the
230+
table correctly. A table can contain many statistics files
231+
associated with different table snapshots."""
232+
224233
# validators
225234
@field_validator("properties", mode="before")
226235
def transform_properties_dict_value_to_str(cls, properties: Properties) -> Dict[str, str]:

pyiceberg/table/statistics.py

+49
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
from typing import (
18+
Dict,
19+
List,
20+
Optional,
21+
)
22+
23+
from pydantic import Field
24+
25+
from pyiceberg.typedef import IcebergBaseModel
26+
27+
28+
class BlobMetadata(IcebergBaseModel):
29+
type: str
30+
snapshot_id: int = Field(alias="snapshot-id")
31+
sequence_number: int = Field(alias="sequence-number")
32+
fields: List[int]
33+
properties: Optional[Dict[str, str]] = None
34+
35+
36+
class StatisticsFile(IcebergBaseModel):
37+
snapshot_id: int = Field(alias="snapshot-id")
38+
statistics_path: str = Field(alias="statistics-path")
39+
file_size_in_bytes: int = Field(alias="file-size-in-bytes")
40+
file_footer_size_in_bytes: int = Field(alias="file-footer-size-in-bytes")
41+
key_metadata: Optional[str] = Field(alias="key-metadata", default=None)
42+
blob_metadata: List[BlobMetadata] = Field(alias="blob-metadata")
43+
44+
45+
def filter_statistics_by_snapshot_id(
46+
statistics: List[StatisticsFile],
47+
reject_snapshot_id: int,
48+
) -> List[StatisticsFile]:
49+
return [stat for stat in statistics if stat.snapshot_id != reject_snapshot_id]

pyiceberg/table/update/__init__.py

+36
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
SnapshotLogEntry,
3737
)
3838
from pyiceberg.table.sorting import SortOrder
39+
from pyiceberg.table.statistics import StatisticsFile, filter_statistics_by_snapshot_id
3940
from pyiceberg.typedef import (
4041
IcebergBaseModel,
4142
Properties,
@@ -174,6 +175,17 @@ class RemovePropertiesUpdate(IcebergBaseModel):
174175
removals: List[str]
175176

176177

178+
class SetStatisticsUpdate(IcebergBaseModel):
179+
action: Literal["set-statistics"] = Field(default="set-statistics")
180+
snapshot_id: int = Field(alias="snapshot-id")
181+
statistics: StatisticsFile
182+
183+
184+
class RemoveStatisticsUpdate(IcebergBaseModel):
185+
action: Literal["remove-statistics"] = Field(default="remove-statistics")
186+
snapshot_id: int = Field(alias="snapshot-id")
187+
188+
177189
TableUpdate = Annotated[
178190
Union[
179191
AssignUUIDUpdate,
@@ -191,6 +203,8 @@ class RemovePropertiesUpdate(IcebergBaseModel):
191203
SetLocationUpdate,
192204
SetPropertiesUpdate,
193205
RemovePropertiesUpdate,
206+
SetStatisticsUpdate,
207+
RemoveStatisticsUpdate,
194208
],
195209
Field(discriminator="action"),
196210
]
@@ -475,6 +489,28 @@ def _(
475489
return base_metadata.model_copy(update={"default_sort_order_id": new_sort_order_id})
476490

477491

492+
@_apply_table_update.register(SetStatisticsUpdate)
493+
def _(update: SetStatisticsUpdate, base_metadata: TableMetadata, context: _TableMetadataUpdateContext) -> TableMetadata:
494+
if update.snapshot_id != update.statistics.snapshot_id:
495+
raise ValueError("Snapshot id in statistics does not match the snapshot id in the update")
496+
497+
statistics = filter_statistics_by_snapshot_id(base_metadata.statistics, update.snapshot_id)
498+
context.add_update(update)
499+
500+
return base_metadata.model_copy(update={"statistics": statistics + [update.statistics]})
501+
502+
503+
@_apply_table_update.register(RemoveStatisticsUpdate)
504+
def _(update: RemoveStatisticsUpdate, base_metadata: TableMetadata, context: _TableMetadataUpdateContext) -> TableMetadata:
505+
if not any(stat.snapshot_id == update.snapshot_id for stat in base_metadata.statistics):
506+
raise ValueError(f"Statistics with snapshot id {update.snapshot_id} does not exist")
507+
508+
statistics = filter_statistics_by_snapshot_id(base_metadata.statistics, update.snapshot_id)
509+
context.add_update(update)
510+
511+
return base_metadata.model_copy(update={"statistics": statistics})
512+
513+
478514
def update_table_metadata(
479515
base_metadata: TableMetadata,
480516
updates: Tuple[TableUpdate, ...],

pyiceberg/table/update/statistics.py

+75
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
from typing import TYPE_CHECKING, Tuple
18+
19+
from pyiceberg.table.statistics import StatisticsFile
20+
from pyiceberg.table.update import (
21+
RemoveStatisticsUpdate,
22+
SetStatisticsUpdate,
23+
TableUpdate,
24+
UpdatesAndRequirements,
25+
UpdateTableMetadata,
26+
)
27+
28+
if TYPE_CHECKING:
29+
from pyiceberg.table import Transaction
30+
31+
32+
class UpdateStatistics(UpdateTableMetadata["UpdateStatistics"]):
33+
"""
34+
Run statistics management operations using APIs.
35+
36+
APIs include set_statistics and remove statistics operations.
37+
38+
Use table.update_statistics().<operation>().commit() to run a specific operation.
39+
Use table.update_statistics().<operation-one>().<operation-two>().commit() to run multiple operations.
40+
41+
Pending changes are applied on commit.
42+
43+
We can also use context managers to make more changes. For example:
44+
45+
with table.update_statistics() as update:
46+
update.set_statistics(snapshot_id=1, statistics_file=statistics_file)
47+
update.remove_statistics(snapshot_id=2)
48+
"""
49+
50+
_updates: Tuple[TableUpdate, ...] = ()
51+
52+
def __init__(self, transaction: "Transaction") -> None:
53+
super().__init__(transaction)
54+
55+
def set_statistics(self, snapshot_id: int, statistics_file: StatisticsFile) -> "UpdateStatistics":
56+
self._updates += (
57+
SetStatisticsUpdate(
58+
snapshot_id=snapshot_id,
59+
statistics=statistics_file,
60+
),
61+
)
62+
63+
return self
64+
65+
def remove_statistics(self, snapshot_id: int) -> "UpdateStatistics":
66+
self._updates = (
67+
RemoveStatisticsUpdate(
68+
snapshot_id=snapshot_id,
69+
),
70+
)
71+
72+
return self
73+
74+
def _commit(self) -> UpdatesAndRequirements:
75+
return self._updates, ()

0 commit comments

Comments
 (0)