Skip to content

Commit 5aa451d

Browse files
authored
Rename data_sequence_number to sequence_number in ManifestEntry (apache#900)
1 parent 77a07c9 commit 5aa451d

File tree

4 files changed

+48
-54
lines changed

4 files changed

+48
-54
lines changed

pyiceberg/manifest.py

+26-32
Original file line numberDiff line numberDiff line change
@@ -377,7 +377,7 @@ def __eq__(self, other: Any) -> bool:
377377
2: Schema(
378378
NestedField(0, "status", IntegerType(), required=True),
379379
NestedField(1, "snapshot_id", LongType(), required=False),
380-
NestedField(3, "data_sequence_number", LongType(), required=False),
380+
NestedField(3, "sequence_number", LongType(), required=False),
381381
NestedField(4, "file_sequence_number", LongType(), required=False),
382382
NestedField(2, "data_file", DATA_FILE_TYPE[2], required=True),
383383
),
@@ -394,10 +394,10 @@ def manifest_entry_schema_with_data_file(format_version: TableVersion, data_file
394394

395395

396396
class ManifestEntry(Record):
397-
__slots__ = ("status", "snapshot_id", "data_sequence_number", "file_sequence_number", "data_file")
397+
__slots__ = ("status", "snapshot_id", "sequence_number", "file_sequence_number", "data_file")
398398
status: ManifestEntryStatus
399399
snapshot_id: Optional[int]
400-
data_sequence_number: Optional[int]
400+
sequence_number: Optional[int]
401401
file_sequence_number: Optional[int]
402402
data_file: DataFile
403403

@@ -408,43 +408,39 @@ def _wrap(
408408
self,
409409
new_status: ManifestEntryStatus,
410410
new_snapshot_id: Optional[int],
411-
new_data_sequence_number: Optional[int],
411+
new_sequence_number: Optional[int],
412412
new_file_sequence_number: Optional[int],
413413
new_file: DataFile,
414414
) -> ManifestEntry:
415415
self.status = new_status
416416
self.snapshot_id = new_snapshot_id
417-
self.data_sequence_number = new_data_sequence_number
417+
self.sequence_number = new_sequence_number
418418
self.file_sequence_number = new_file_sequence_number
419419
self.data_file = new_file
420420
return self
421421

422422
def _wrap_append(
423-
self, new_snapshot_id: Optional[int], new_data_sequence_number: Optional[int], new_file: DataFile
423+
self, new_snapshot_id: Optional[int], new_sequence_number: Optional[int], new_file: DataFile
424424
) -> ManifestEntry:
425-
return self._wrap(ManifestEntryStatus.ADDED, new_snapshot_id, new_data_sequence_number, None, new_file)
425+
return self._wrap(ManifestEntryStatus.ADDED, new_snapshot_id, new_sequence_number, None, new_file)
426426

427427
def _wrap_delete(
428428
self,
429429
new_snapshot_id: Optional[int],
430-
new_data_sequence_number: Optional[int],
430+
new_sequence_number: Optional[int],
431431
new_file_sequence_number: Optional[int],
432432
new_file: DataFile,
433433
) -> ManifestEntry:
434-
return self._wrap(
435-
ManifestEntryStatus.DELETED, new_snapshot_id, new_data_sequence_number, new_file_sequence_number, new_file
436-
)
434+
return self._wrap(ManifestEntryStatus.DELETED, new_snapshot_id, new_sequence_number, new_file_sequence_number, new_file)
437435

438436
def _wrap_existing(
439437
self,
440438
new_snapshot_id: Optional[int],
441-
new_data_sequence_number: Optional[int],
439+
new_sequence_number: Optional[int],
442440
new_file_sequence_number: Optional[int],
443441
new_file: DataFile,
444442
) -> ManifestEntry:
445-
return self._wrap(
446-
ManifestEntryStatus.EXISTING, new_snapshot_id, new_data_sequence_number, new_file_sequence_number, new_file
447-
)
443+
return self._wrap(ManifestEntryStatus.EXISTING, new_snapshot_id, new_sequence_number, new_file_sequence_number, new_file)
448444

449445

450446
PARTITION_FIELD_SUMMARY_TYPE = StructType(
@@ -665,10 +661,10 @@ def _inherit_from_manifest(entry: ManifestEntry, manifest: ManifestFile) -> Mani
665661
if entry.snapshot_id is None:
666662
entry.snapshot_id = manifest.added_snapshot_id
667663

668-
# in v1 tables, the data sequence number is not persisted and can be safely defaulted to 0
669-
# in v2 tables, the data sequence number should be inherited iff the entry status is ADDED
670-
if entry.data_sequence_number is None and (manifest.sequence_number == 0 or entry.status == ManifestEntryStatus.ADDED):
671-
entry.data_sequence_number = manifest.sequence_number
664+
# in v1 tables, the sequence number is not persisted and can be safely defaulted to 0
665+
# in v2 tables, the sequence number should be inherited iff the entry status is ADDED
666+
if entry.sequence_number is None and (manifest.sequence_number == 0 or entry.status == ManifestEntryStatus.ADDED):
667+
entry.sequence_number = manifest.sequence_number
672668

673669
# in v1 tables, the file sequence number is not persisted and can be safely defaulted to 0
674670
# in v2 tables, the file sequence number should be inherited iff the entry status is ADDED
@@ -695,7 +691,7 @@ class ManifestWriter(ABC):
695691
_existing_rows: int
696692
_deleted_files: int
697693
_deleted_rows: int
698-
_min_data_sequence_number: Optional[int]
694+
_min_sequence_number: Optional[int]
699695
_partitions: List[Record]
700696
_reused_entry_wrapper: ManifestEntry
701697

@@ -712,7 +708,7 @@ def __init__(self, spec: PartitionSpec, schema: Schema, output_file: OutputFile,
712708
self._existing_rows = 0
713709
self._deleted_files = 0
714710
self._deleted_rows = 0
715-
self._min_data_sequence_number = None
711+
self._min_sequence_number = None
716712
self._partitions = []
717713
self._reused_entry_wrapper = ManifestEntry()
718714

@@ -774,7 +770,7 @@ def to_manifest_file(self) -> ManifestFile:
774770
"""Return the manifest file."""
775771
# once the manifest file is generated, no more entries can be added
776772
self.closed = True
777-
min_sequence_number = self._min_data_sequence_number or UNASSIGNED_SEQ
773+
min_sequence_number = self._min_sequence_number or UNASSIGNED_SEQ
778774
return ManifestFile(
779775
manifest_path=self._output_file.location,
780776
manifest_length=len(self._writer.output_file),
@@ -812,35 +808,33 @@ def add_entry(self, entry: ManifestEntry) -> ManifestWriter:
812808

813809
if (
814810
(entry.status == ManifestEntryStatus.ADDED or entry.status == ManifestEntryStatus.EXISTING)
815-
and entry.data_sequence_number is not None
816-
and (self._min_data_sequence_number is None or entry.data_sequence_number < self._min_data_sequence_number)
811+
and entry.sequence_number is not None
812+
and (self._min_sequence_number is None or entry.sequence_number < self._min_sequence_number)
817813
):
818-
self._min_data_sequence_number = entry.data_sequence_number
814+
self._min_sequence_number = entry.sequence_number
819815

820816
self._writer.write_block([self.prepare_entry(entry)])
821817
return self
822818

823819
def add(self, entry: ManifestEntry) -> ManifestWriter:
824-
if entry.data_sequence_number is not None and entry.data_sequence_number >= 0:
825-
self.add_entry(
826-
self._reused_entry_wrapper._wrap_append(self._snapshot_id, entry.data_sequence_number, entry.data_file)
827-
)
820+
if entry.sequence_number is not None and entry.sequence_number >= 0:
821+
self.add_entry(self._reused_entry_wrapper._wrap_append(self._snapshot_id, entry.sequence_number, entry.data_file))
828822
else:
829823
self.add_entry(self._reused_entry_wrapper._wrap_append(self._snapshot_id, None, entry.data_file))
830824
return self
831825

832826
def delete(self, entry: ManifestEntry) -> ManifestWriter:
833827
self.add_entry(
834828
self._reused_entry_wrapper._wrap_delete(
835-
self._snapshot_id, entry.data_sequence_number, entry.file_sequence_number, entry.data_file
829+
self._snapshot_id, entry.sequence_number, entry.file_sequence_number, entry.data_file
836830
)
837831
)
838832
return self
839833

840834
def existing(self, entry: ManifestEntry) -> ManifestWriter:
841835
self.add_entry(
842836
self._reused_entry_wrapper._wrap_existing(
843-
entry.snapshot_id, entry.data_sequence_number, entry.file_sequence_number, entry.data_file
837+
entry.snapshot_id, entry.sequence_number, entry.file_sequence_number, entry.data_file
844838
)
845839
)
846840
return self
@@ -885,7 +879,7 @@ def _meta(self) -> Dict[str, str]:
885879
}
886880

887881
def prepare_entry(self, entry: ManifestEntry) -> ManifestEntry:
888-
if entry.data_sequence_number is None:
882+
if entry.sequence_number is None:
889883
if entry.snapshot_id is not None and entry.snapshot_id != self._snapshot_id:
890884
raise ValueError(f"Found unassigned sequence number for an entry from snapshot: {entry.snapshot_id}")
891885
if entry.status != ManifestEntryStatus.ADDED:

pyiceberg/table/__init__.py

+12-12
Original file line numberDiff line numberDiff line change
@@ -1888,7 +1888,7 @@ def _open_manifest(
18881888
]
18891889

18901890

1891-
def _min_data_file_sequence_number(manifests: List[ManifestFile]) -> int:
1891+
def _min_sequence_number(manifests: List[ManifestFile]) -> int:
18921892
try:
18931893
return min(
18941894
manifest.min_sequence_number or INITIAL_SEQUENCE_NUMBER
@@ -1949,11 +1949,11 @@ def _build_partition_evaluator(self, spec_id: int) -> Callable[[DataFile], bool]
19491949
# shared instance across multiple threads.
19501950
return lambda data_file: expression_evaluator(partition_schema, partition_expr, self.case_sensitive)(data_file.partition)
19511951

1952-
def _check_sequence_number(self, min_data_sequence_number: int, manifest: ManifestFile) -> bool:
1952+
def _check_sequence_number(self, min_sequence_number: int, manifest: ManifestFile) -> bool:
19531953
"""Ensure that no manifests are loaded that contain deletes that are older than the data.
19541954
19551955
Args:
1956-
min_data_sequence_number (int): The minimal sequence number.
1956+
min_sequence_number (int): The minimal sequence number.
19571957
manifest (ManifestFile): A ManifestFile that can be either data or deletes.
19581958
19591959
Returns:
@@ -1962,7 +1962,7 @@ def _check_sequence_number(self, min_data_sequence_number: int, manifest: Manife
19621962
return manifest.content == ManifestContent.DATA or (
19631963
# Not interested in deletes that are older than the data
19641964
manifest.content == ManifestContent.DELETES
1965-
and (manifest.sequence_number or INITIAL_SEQUENCE_NUMBER) >= min_data_sequence_number
1965+
and (manifest.sequence_number or INITIAL_SEQUENCE_NUMBER) >= min_sequence_number
19661966
)
19671967

19681968
def plan_files(self) -> Iterable[FileScanTask]:
@@ -1994,10 +1994,10 @@ def plan_files(self) -> Iterable[FileScanTask]:
19941994
self.table_metadata.schema(), self.row_filter, self.case_sensitive, self.options.get("include_empty_files") == "true"
19951995
).eval
19961996

1997-
min_data_sequence_number = _min_data_file_sequence_number(manifests)
1997+
min_sequence_number = _min_sequence_number(manifests)
19981998

19991999
data_entries: List[ManifestEntry] = []
2000-
positional_delete_entries = SortedList(key=lambda entry: entry.data_sequence_number or INITIAL_SEQUENCE_NUMBER)
2000+
positional_delete_entries = SortedList(key=lambda entry: entry.sequence_number or INITIAL_SEQUENCE_NUMBER)
20012001

20022002
executor = ExecutorFactory.get_or_create()
20032003
for manifest_entry in chain(
@@ -2011,7 +2011,7 @@ def plan_files(self) -> Iterable[FileScanTask]:
20112011
metrics_evaluator,
20122012
)
20132013
for manifest in manifests
2014-
if self._check_sequence_number(min_data_sequence_number, manifest)
2014+
if self._check_sequence_number(min_sequence_number, manifest)
20152015
],
20162016
)
20172017
):
@@ -3150,7 +3150,7 @@ def _write_added_manifest() -> List[ManifestFile]:
31503150
ManifestEntry(
31513151
status=ManifestEntryStatus.ADDED,
31523152
snapshot_id=self._snapshot_id,
3153-
data_sequence_number=None,
3153+
sequence_number=None,
31543154
file_sequence_number=None,
31553155
data_file=data_file,
31563156
)
@@ -3353,7 +3353,7 @@ def _copy_with_new_status(entry: ManifestEntry, status: ManifestEntryStatus) ->
33533353
return ManifestEntry(
33543354
status=status,
33553355
snapshot_id=entry.snapshot_id,
3356-
data_sequence_number=entry.data_sequence_number,
3356+
sequence_number=entry.sequence_number,
33573357
file_sequence_number=entry.file_sequence_number,
33583358
data_file=entry.data_file,
33593359
)
@@ -3537,7 +3537,7 @@ def _existing_manifests(self) -> List[ManifestFile]:
35373537
ManifestEntry(
35383538
status=ManifestEntryStatus.EXISTING,
35393539
snapshot_id=entry.snapshot_id,
3540-
data_sequence_number=entry.data_sequence_number,
3540+
sequence_number=entry.sequence_number,
35413541
file_sequence_number=entry.file_sequence_number,
35423542
data_file=entry.data_file,
35433543
)
@@ -3568,7 +3568,7 @@ def _get_entries(manifest: ManifestFile) -> List[ManifestEntry]:
35683568
ManifestEntry(
35693569
status=ManifestEntryStatus.DELETED,
35703570
snapshot_id=entry.snapshot_id,
3571-
data_sequence_number=entry.data_sequence_number,
3571+
sequence_number=entry.sequence_number,
35723572
file_sequence_number=entry.file_sequence_number,
35733573
data_file=entry.data_file,
35743574
)
@@ -4016,7 +4016,7 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType:
40164016
entries.append({
40174017
"status": entry.status.value,
40184018
"snapshot_id": entry.snapshot_id,
4019-
"sequence_number": entry.data_sequence_number,
4019+
"sequence_number": entry.sequence_number,
40204020
"file_sequence_number": entry.file_sequence_number,
40214021
"data_file": {
40224022
"content": entry.data_file.content,

tests/avro/test_file.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ def test_write_manifest_entry_with_iceberg_read_with_fastavro_v1() -> None:
140140
entry = ManifestEntry(
141141
status=ManifestEntryStatus.ADDED,
142142
snapshot_id=8638475580105682862,
143-
data_sequence_number=0,
143+
sequence_number=0,
144144
file_sequence_number=0,
145145
data_file=data_file,
146146
)
@@ -173,7 +173,7 @@ def test_write_manifest_entry_with_iceberg_read_with_fastavro_v1() -> None:
173173
v2_entry = todict(entry)
174174

175175
# These are not written in V1
176-
del v2_entry["data_sequence_number"]
176+
del v2_entry["sequence_number"]
177177
del v2_entry["file_sequence_number"]
178178
del v2_entry["data_file"]["content"]
179179
del v2_entry["data_file"]["equality_ids"]
@@ -206,7 +206,7 @@ def test_write_manifest_entry_with_iceberg_read_with_fastavro_v2() -> None:
206206
entry = ManifestEntry(
207207
status=ManifestEntryStatus.ADDED,
208208
snapshot_id=8638475580105682862,
209-
data_sequence_number=0,
209+
sequence_number=0,
210210
file_sequence_number=0,
211211
data_file=data_file,
212212
)
@@ -263,7 +263,7 @@ def test_write_manifest_entry_with_fastavro_read_with_iceberg(format_version: in
263263
entry = ManifestEntry(
264264
status=ManifestEntryStatus.ADDED,
265265
snapshot_id=8638475580105682862,
266-
data_sequence_number=0,
266+
sequence_number=0,
267267
file_sequence_number=0,
268268
data_file=data_file,
269269
)
@@ -305,7 +305,7 @@ def test_write_manifest_entry_with_fastavro_read_with_iceberg(format_version: in
305305
status=ManifestEntryStatus.ADDED,
306306
snapshot_id=8638475580105682862,
307307
# Not part of v1
308-
data_sequence_number=None,
308+
sequence_number=None,
309309
file_sequence_number=None,
310310
data_file=v1_datafile,
311311
)

tests/utils/test_manifest.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def test_read_manifest_entry(generated_manifest_entry_file: str) -> None:
6666

6767
assert manifest_entry.status == ManifestEntryStatus.ADDED
6868
assert manifest_entry.snapshot_id == 8744736658442914487
69-
assert manifest_entry.data_sequence_number == 0
69+
assert manifest_entry.sequence_number == 0
7070
assert isinstance(manifest_entry.data_file, DataFile)
7171

7272
data_file = manifest_entry.data_file
@@ -250,7 +250,7 @@ def test_read_manifest_v1(generated_manifest_file_file_v1: str) -> None:
250250

251251
entry = entries[0]
252252

253-
assert entry.data_sequence_number == 0
253+
assert entry.sequence_number == 0
254254
assert entry.file_sequence_number == 0
255255
assert entry.snapshot_id == 8744736658442914487
256256
assert entry.status == ManifestEntryStatus.ADDED
@@ -300,7 +300,7 @@ def test_read_manifest_v2(generated_manifest_file_file_v2: str) -> None:
300300

301301
entry = entries[0]
302302

303-
assert entry.data_sequence_number == 3
303+
assert entry.sequence_number == 3
304304
assert entry.file_sequence_number == 3
305305
assert entry.snapshot_id == 8744736658442914487
306306
assert entry.status == ManifestEntryStatus.ADDED
@@ -379,7 +379,7 @@ def test_write_manifest(
379379

380380
assert manifest_entry.status == ManifestEntryStatus.ADDED
381381
assert manifest_entry.snapshot_id == 8744736658442914487
382-
assert manifest_entry.data_sequence_number == -1 if format_version == 1 else 3
382+
assert manifest_entry.sequence_number == -1 if format_version == 1 else 3
383383
assert isinstance(manifest_entry.data_file, DataFile)
384384

385385
data_file = manifest_entry.data_file
@@ -556,7 +556,7 @@ def test_write_manifest_list(
556556

557557
entry = entries[0]
558558

559-
assert entry.data_sequence_number == 0 if format_version == 1 else 3
559+
assert entry.sequence_number == 0 if format_version == 1 else 3
560560
assert entry.file_sequence_number == 0 if format_version == 1 else 3
561561
assert entry.snapshot_id == 8744736658442914487
562562
assert entry.status == ManifestEntryStatus.ADDED

0 commit comments

Comments
 (0)