
Commit 1ec5edd

Merge branch 'main' into manifest_compaction

2 parents 8510f71 + 94e8a98

File tree

8 files changed: +258 -188 lines

mkdocs/requirements.txt

+1 -1

@@ -23,6 +23,6 @@ mkdocstrings-python==1.10.3
 mkdocs-literate-nav==0.6.1
 mkdocs-autorefs==1.0.1
 mkdocs-gen-files==0.5.0
-mkdocs-material==9.5.25
+mkdocs-material==9.5.26
 mkdocs-material-extensions==1.3.1
 mkdocs-section-index==0.3.9

poetry.lock

+146 -147 (generated lockfile; diff not rendered by default)

pyiceberg/io/pyarrow.py

+16 -9
@@ -469,15 +469,18 @@ def __setstate__(self, state: Dict[str, Any]) -> None:
         self.fs_by_scheme = lru_cache(self._initialize_fs)


-def schema_to_pyarrow(schema: Union[Schema, IcebergType], metadata: Dict[bytes, bytes] = EMPTY_DICT) -> pa.schema:
-    return visit(schema, _ConvertToArrowSchema(metadata))
+def schema_to_pyarrow(
+    schema: Union[Schema, IcebergType], metadata: Dict[bytes, bytes] = EMPTY_DICT, include_field_ids: bool = True
+) -> pa.schema:
+    return visit(schema, _ConvertToArrowSchema(metadata, include_field_ids))


 class _ConvertToArrowSchema(SchemaVisitorPerPrimitiveType[pa.DataType]):
     _metadata: Dict[bytes, bytes]

-    def __init__(self, metadata: Dict[bytes, bytes] = EMPTY_DICT) -> None:
+    def __init__(self, metadata: Dict[bytes, bytes] = EMPTY_DICT, include_field_ids: bool = True) -> None:
         self._metadata = metadata
+        self._include_field_ids = include_field_ids

     def schema(self, _: Schema, struct_result: pa.StructType) -> pa.schema:
         return pa.schema(list(struct_result), metadata=self._metadata)
@@ -486,13 +489,17 @@ def struct(self, _: StructType, field_results: List[pa.DataType]) -> pa.DataType
         return pa.struct(field_results)

     def field(self, field: NestedField, field_result: pa.DataType) -> pa.Field:
+        metadata = {}
+        if field.doc:
+            metadata[PYARROW_FIELD_DOC_KEY] = field.doc
+        if self._include_field_ids:
+            metadata[PYARROW_PARQUET_FIELD_ID_KEY] = str(field.field_id)
+
         return pa.field(
             name=field.name,
             type=field_result,
             nullable=field.optional,
-            metadata={PYARROW_FIELD_DOC_KEY: field.doc, PYARROW_PARQUET_FIELD_ID_KEY: str(field.field_id)}
-            if field.doc
-            else {PYARROW_PARQUET_FIELD_ID_KEY: str(field.field_id)},
+            metadata=metadata,
         )

     def list(self, list_type: ListType, element_result: pa.DataType) -> pa.DataType:
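A minimal sketch, not part of this diff, of what the new include_field_ids flag controls, using a hypothetical one-column schema: by default the Parquet field id is attached as Arrow field metadata, and include_field_ids=False leaves it off.

from pyiceberg.io.pyarrow import schema_to_pyarrow
from pyiceberg.schema import Schema
from pyiceberg.types import NestedField, StringType

# Illustrative one-column Iceberg schema (not from the repository).
schema = Schema(NestedField(field_id=1, name="foo", field_type=StringType(), required=False))

with_ids = schema_to_pyarrow(schema)  # include_field_ids defaults to True
print(with_ids.field("foo").metadata)  # {b'PARQUET:field_id': b'1'}

without_ids = schema_to_pyarrow(schema, include_field_ids=False)
print(without_ids.field("foo").metadata)  # no PARQUET:field_id entry attached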
@@ -1130,7 +1137,7 @@ def project_table(
     tables = [f.result() for f in completed_futures if f.result()]

     if len(tables) < 1:
-        return pa.Table.from_batches([], schema=schema_to_pyarrow(projected_schema))
+        return pa.Table.from_batches([], schema=schema_to_pyarrow(projected_schema, include_field_ids=False))

     result = pa.concat_tables(tables)
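With this change the empty-scan branch builds its schema without field ids, so an empty result's schema matches the schema of non-empty results. A hedged standalone sketch; the schema below is illustrative, not from the repository:

import pyarrow as pa

from pyiceberg.io.pyarrow import schema_to_pyarrow
from pyiceberg.schema import Schema
from pyiceberg.types import IntegerType, NestedField

projected_schema = Schema(NestedField(field_id=1, name="id", field_type=IntegerType(), required=False))

# Mirrors the len(tables) < 1 branch: an empty Arrow table with the projected schema.
empty = pa.Table.from_batches([], schema=schema_to_pyarrow(projected_schema, include_field_ids=False))
assert empty.num_rows == 0
assert empty.schema.names == ["id"]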

@@ -1161,7 +1168,7 @@ def __init__(self, file_schema: Schema):
     def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
         file_field = self.file_schema.find_field(field.field_id)
         if field.field_type.is_primitive and field.field_type != file_field.field_type:
-            return values.cast(schema_to_pyarrow(promote(file_field.field_type, field.field_type)))
+            return values.cast(schema_to_pyarrow(promote(file_field.field_type, field.field_type), include_field_ids=False))
         return values

     def _construct_field(self, field: NestedField, arrow_type: pa.DataType) -> pa.Field:
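For context, _cast_if_needed upcasts primitive columns when the table schema has promoted a type since the file was written. A rough sketch of the int-to-long case, under the same assumptions as the diff:

import pyarrow as pa

from pyiceberg.io.pyarrow import schema_to_pyarrow
from pyiceberg.schema import promote
from pyiceberg.types import IntegerType, LongType

values = pa.array([1, 2, 3], type=pa.int32())  # column as read from the data file
promoted = promote(IntegerType(), LongType())  # allowed int -> long promotion

# As in the diff: resolve the promoted Iceberg type to Arrow without field ids, then cast.
cast_values = values.cast(schema_to_pyarrow(promoted, include_field_ids=False))
assert cast_values.type == pa.int64()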
@@ -1188,7 +1195,7 @@ def struct(
             field_arrays.append(array)
             fields.append(self._construct_field(field, array.type))
         elif field.optional:
-            arrow_type = schema_to_pyarrow(field.field_type)
+            arrow_type = schema_to_pyarrow(field.field_type, include_field_ids=False)
             field_arrays.append(pa.nulls(len(struct_array), type=arrow_type))
             fields.append(self._construct_field(field, arrow_type))
         else:

pyiceberg/table/__init__.py

+12
@@ -1359,6 +1359,18 @@ def snapshot_by_name(self, name: str) -> Optional[Snapshot]:
             return self.snapshot_by_id(ref.snapshot_id)
         return None

+    def snapshot_as_of_timestamp(self, timestamp_ms: int, inclusive: bool = True) -> Optional[Snapshot]:
+        """Get the snapshot that was current as of or right before the given timestamp, or None if there is no matching snapshot.
+
+        Args:
+            timestamp_ms: Find a snapshot that was current at or before this timestamp.
+            inclusive: When True, include timestamp_ms itself in the search; when False, match only strictly earlier entries.
+        """
+        for log_entry in reversed(self.history()):
+            if (inclusive and log_entry.timestamp_ms <= timestamp_ms) or log_entry.timestamp_ms < timestamp_ms:
+                return self.snapshot_by_id(log_entry.snapshot_id)
+        return None
+
     def history(self) -> List[SnapshotLogEntry]:
         """Get the snapshot history of this table."""
         return self.metadata.snapshot_log
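A hedged usage sketch for the new time-travel lookup; tbl stands in for any loaded Table instance:

import time

# Most recent snapshot that was current at or before "now".
snapshot = tbl.snapshot_as_of_timestamp(int(time.time() * 1000))

# With inclusive=False, a snapshot logged at exactly that timestamp is skipped.
if snapshot is not None:
    older = tbl.snapshot_as_of_timestamp(snapshot.timestamp_ms, inclusive=False)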

pyiceberg/table/snapshots.py

+15 -1
@@ -14,17 +14,22 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+from __future__ import annotations
+
 import time
 from collections import defaultdict
 from enum import Enum
-from typing import Any, DefaultDict, Dict, List, Mapping, Optional
+from typing import TYPE_CHECKING, Any, DefaultDict, Dict, Iterable, List, Mapping, Optional

 from pydantic import Field, PrivateAttr, model_serializer

 from pyiceberg.io import FileIO
 from pyiceberg.manifest import DataFile, DataFileContent, ManifestFile, read_manifest_list
 from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
 from pyiceberg.schema import Schema
+
+if TYPE_CHECKING:
+    from pyiceberg.table.metadata import TableMetadata
 from pyiceberg.typedef import IcebergBaseModel

 ADDED_DATA_FILES = "added-data-files"
@@ -412,3 +417,12 @@ def _update_totals(total_property: str, added_property: str, removed_property: s
 def set_when_positive(properties: Dict[str, str], num: int, property_name: str) -> None:
     if num > 0:
         properties[property_name] = str(num)
+
+
+def ancestors_of(current_snapshot: Optional[Snapshot], table_metadata: TableMetadata) -> Iterable[Snapshot]:
+    """Get the ancestors of and including the given snapshot."""
+    if current_snapshot:
+        yield current_snapshot
+        if current_snapshot.parent_snapshot_id is not None:
+            if parent := table_metadata.snapshot_by_id(current_snapshot.parent_snapshot_id):
+                yield from ancestors_of(parent, table_metadata)
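And a sketch of walking the lineage with the new generator (tbl is again a hypothetical loaded Table):

from pyiceberg.table.snapshots import ancestors_of

# Yields the current snapshot first, then each parent back to the root.
for snapshot in ancestors_of(tbl.current_snapshot(), tbl.metadata):
    print(snapshot.snapshot_id, "->", snapshot.parent_snapshot_id)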

pyproject.toml

+2 -2
@@ -60,7 +60,7 @@ zstandard = ">=0.13.0,<1.0.0"
 tenacity = ">=8.2.3,<9.0.0"
 pyarrow = { version = ">=9.0.0,<17.0.0", optional = true }
 pandas = { version = ">=1.0.0,<3.0.0", optional = true }
-duckdb = { version = ">=0.5.0,<1.0.0", optional = true }
+duckdb = { version = ">=0.5.0,<2.0.0", optional = true }
 ray = { version = ">=2.0.0,<2.10.0", optional = true }
 python-snappy = { version = ">=0.6.0,<1.0.0", optional = true }
 thrift = { version = ">=0.13.0,<1.0.0", optional = true }
@@ -82,7 +82,7 @@ fastavro = "1.9.4"
 coverage = { version = "^7.4.2", extras = ["toml"] }
 requests-mock = "1.12.1"
 moto = { version = "^5.0.2", extras = ["server"] }
-typing-extensions = "4.12.0"
+typing-extensions = "4.12.2"
 pytest-mock = "3.14.0"
 pyspark = "3.5.1"
 cython = "3.0.10"

tests/io/test_pyarrow.py

+29 -28
@@ -344,7 +344,7 @@ def test_deleting_hdfs_file_not_found() -> None:
     assert "Cannot delete file, does not exist:" in str(exc_info.value)


-def test_schema_to_pyarrow_schema(table_schema_nested: Schema) -> None:
+def test_schema_to_pyarrow_schema_include_field_ids(table_schema_nested: Schema) -> None:
     actual = schema_to_pyarrow(table_schema_nested)
     expected = """foo: string
 -- field metadata --
@@ -402,6 +402,30 @@ def test_schema_to_pyarrow_schema(table_schema_nested: Schema) -> None:
     assert repr(actual) == expected


+def test_schema_to_pyarrow_schema_exclude_field_ids(table_schema_nested: Schema) -> None:
+    actual = schema_to_pyarrow(table_schema_nested, include_field_ids=False)
+    expected = """foo: string
+bar: int32 not null
+baz: bool
+qux: list<element: string not null> not null
+  child 0, element: string not null
+quux: map<string, map<string, int32>> not null
+  child 0, entries: struct<key: string not null, value: map<string, int32> not null> not null
+      child 0, key: string not null
+      child 1, value: map<string, int32> not null
+          child 0, entries: struct<key: string not null, value: int32 not null> not null
+              child 0, key: string not null
+              child 1, value: int32 not null
+location: list<element: struct<latitude: float, longitude: float> not null> not null
+  child 0, element: struct<latitude: float, longitude: float> not null
+      child 0, latitude: float
+      child 1, longitude: float
+person: struct<name: string, age: int32 not null>
+  child 0, name: string
+  child 1, age: int32 not null"""
+    assert repr(actual) == expected
+
+
 def test_fixed_type_to_pyarrow() -> None:
     length = 22
     iceberg_type = FixedType(length)
@@ -945,23 +969,13 @@ def test_projection_add_column(file_int: str) -> None:
         == """id: int32
 list: list<element: int32>
   child 0, element: int32
-  -- field metadata --
-  PARQUET:field_id: '21'
 map: map<int32, string>
   child 0, entries: struct<key: int32 not null, value: string> not null
       child 0, key: int32 not null
-      -- field metadata --
-      PARQUET:field_id: '31'
       child 1, value: string
-      -- field metadata --
-      PARQUET:field_id: '32'
 location: struct<lat: double, lon: double>
   child 0, lat: double
-  -- field metadata --
-  PARQUET:field_id: '41'
-  child 1, lon: double
-  -- field metadata --
-  PARQUET:field_id: '42'"""
+  child 1, lon: double"""
     )
@@ -1014,11 +1028,7 @@ def test_projection_add_column_struct(schema_int: Schema, file_int: str) -> None
         == """id: map<int32, string>
   child 0, entries: struct<key: int32 not null, value: string> not null
       child 0, key: int32 not null
-      -- field metadata --
-      PARQUET:field_id: '3'
-      child 1, value: string
-      -- field metadata --
-      PARQUET:field_id: '4'"""
+      child 1, value: string"""
     )
@@ -1062,12 +1072,7 @@ def test_projection_concat_files(schema_int: Schema, file_int: str) -> None:
 def test_projection_filter(schema_int: Schema, file_int: str) -> None:
     result_table = project(schema_int, [file_int], GreaterThan("id", 4))
     assert len(result_table.columns[0]) == 0
-    assert (
-        repr(result_table.schema)
-        == """id: int32
--- field metadata --
-PARQUET:field_id: '1'"""
-    )
+    assert repr(result_table.schema) == """id: int32"""


 def test_projection_filter_renamed_column(file_int: str) -> None:
@@ -1304,11 +1309,7 @@ def test_projection_nested_struct_different_parent_id(file_struct: str) -> None
         repr(result_table.schema)
         == """location: struct<lat: double, long: double>
   child 0, lat: double
-  -- field metadata --
-  PARQUET:field_id: '41'
-  child 1, long: double
-  -- field metadata --
-  PARQUET:field_id: '42'"""
+  child 1, long: double"""
     )

tests/table/test_init.py

+37
@@ -76,6 +76,7 @@
     Snapshot,
     SnapshotLogEntry,
     Summary,
+    ancestors_of,
 )
 from pyiceberg.table.sorting import (
     NullOrder,
@@ -204,6 +205,42 @@ def test_snapshot_by_id(table_v2: Table) -> None:
     )


+def test_snapshot_by_timestamp(table_v2: Table) -> None:
+    assert table_v2.snapshot_as_of_timestamp(1515100955770) == Snapshot(
+        snapshot_id=3051729675574597004,
+        parent_snapshot_id=None,
+        sequence_number=0,
+        timestamp_ms=1515100955770,
+        manifest_list="s3://a/b/1.avro",
+        summary=Summary(Operation.APPEND),
+        schema_id=None,
+    )
+    assert table_v2.snapshot_as_of_timestamp(1515100955770, inclusive=False) is None
+
+
+def test_ancestors_of(table_v2: Table) -> None:
+    assert list(ancestors_of(table_v2.current_snapshot(), table_v2.metadata)) == [
+        Snapshot(
+            snapshot_id=3055729675574597004,
+            parent_snapshot_id=3051729675574597004,
+            sequence_number=1,
+            timestamp_ms=1555100955770,
+            manifest_list="s3://a/b/2.avro",
+            summary=Summary(Operation.APPEND),
+            schema_id=1,
+        ),
+        Snapshot(
+            snapshot_id=3051729675574597004,
+            parent_snapshot_id=None,
+            sequence_number=0,
+            timestamp_ms=1515100955770,
+            manifest_list="s3://a/b/1.avro",
+            summary=Summary(Operation.APPEND),
+            schema_id=None,
+        ),
+    ]
+
+
 def test_snapshot_by_id_does_not_exist(table_v2: Table) -> None:
     assert table_v2.snapshot_by_id(-1) is None
