Skip to content

Commit c273f59

Browse files
emmatyping, AA-Turner, gpshead, tomasr8, Rogdham
authored
gh-132983: Add the compression.zstd package and tests (#133365)
Co-authored-by: Adam Turner <[email protected]> Co-authored-by: Gregory P. Smith <[email protected]> Co-authored-by: Tomas R. <[email protected]> Co-authored-by: Rogdham <[email protected]>
1 parent 793402e commit c273f59

File tree

15 files changed

+3357
-99
lines changed

15 files changed

+3357
-99
lines changed

Lib/compression/zstd/__init__.py

+234
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
"""Python bindings to the Zstandard (zstd) compression library (RFC-8878)."""
2+
3+
__all__ = (
    # Names defined in compression.zstd itself
    'COMPRESSION_LEVEL_DEFAULT',
    'compress',
    'CompressionParameter',
    'decompress',
    'DecompressionParameter',
    'finalize_dict',
    'get_frame_info',
    'Strategy',
    'train_dict',

    # Names imported from compression.zstd._zstdfile
    'open',
    'ZstdFile',

    # Names pulled in from the _zstd module
    'get_frame_size',
    'zstd_version',
    'zstd_version_info',
    'ZstdCompressor',
    'ZstdDecompressor',
    'ZstdDict',
    'ZstdError',
)
28+
29+
import _zstd
30+
import enum
31+
from _zstd import *
32+
from compression.zstd._zstdfile import ZstdFile, open, _nbytes
33+
34+
COMPRESSION_LEVEL_DEFAULT = _zstd._compressionLevel_values[0]
35+
"""The default compression level for Zstandard, currently '3'."""
36+
37+
38+
class FrameInfo:
    """Read-only record describing a Zstandard frame.

    The two attributes, 'decompressed_size' and 'dictionary_id', are stored
    once at construction time.  Any later attribute assignment on an instance
    raises AttributeError.
    """

    __slots__ = ('decompressed_size', 'dictionary_id')

    def __init__(self, decompressed_size, dictionary_id):
        # Assignment through this class is rejected by __setattr__ below, so
        # the fields are written through the parent implementation instead.
        store = super().__setattr__
        store('decompressed_size', decompressed_size)
        store('dictionary_id', dictionary_id)

    def __repr__(self):
        size = self.decompressed_size
        dict_id = self.dictionary_id
        return (f'FrameInfo(decompressed_size={size}, '
                f'dictionary_id={dict_id})')

    def __setattr__(self, name, _):
        # Unconditionally refuse: instances are meant to stay unchanged.
        raise AttributeError(f"can't set attribute {name!r}")
52+
53+
54+
def get_frame_info(frame_buffer):
    """Return a FrameInfo built from the header of a Zstandard frame.

    *frame_buffer* is a bytes-like object positioned at the start of a
    frame.  It must contain at least the frame header (6 to 18 bytes).

    The result carries two attributes:

    'decompressed_size' is the number of bytes the frame's data occupies
    once decompressed, or None if that size is unknown.

    'dictionary_id' is a non-negative int below 2**32.  The special value 0
    means no dictionary ID was recorded in the frame header: the frame may
    or may not need a dictionary to be decoded, and the ID of such a
    dictionary is not specified.
    """
    header_fields = _zstd._get_frame_info(frame_buffer)
    return FrameInfo(*header_fields)
69+
70+
71+
def train_dict(samples, dict_size):
    """Train a Zstandard dictionary on *samples* and return it as a ZstdDict.

    *samples* is an iterable of bytes-like objects, each one representing a
    file.

    *dict_size* is the maximum size of the dictionary, in bytes.

    Raises TypeError if *dict_size* is not an int, and ValueError if the
    samples hold no data at all.
    """
    if not isinstance(dict_size, int):
        type_name = type(dict_size).__qualname__
        raise TypeError(
            f'dict_size must be an int object, not {type_name!r}.'
        )

    # Materialize first: the samples are walked twice, once to join them and
    # once to record the length of each one.
    sample_seq = tuple(samples)
    joined = b''.join(sample_seq)
    sizes = tuple(_nbytes(item) for item in sample_seq)
    if not joined:
        raise ValueError("samples contained no data; can't train dictionary.")

    trained = _zstd._train_dict(joined, sizes, dict_size)
    return ZstdDict(trained)
90+
91+
92+
def finalize_dict(zstd_dict, /, samples, dict_size, level):
    """Return a ZstdDict representing a finalized Zstandard dictionary.

    Given a custom content as a basis for dictionary, and a set of samples,
    finalize *zstd_dict* by adding headers and statistics according to the
    Zstandard dictionary format.

    You may compose an effective dictionary content by hand, which is used as
    basis dictionary, and use some samples to finalize a dictionary. The basis
    dictionary may be a "raw content" dictionary. See *is_raw* in ZstdDict.

    *samples* is an iterable of samples, where a sample is a bytes-like object
    representing a file.
    *dict_size* is the dictionary's maximum size, in bytes.
    *level* is the expected compression level. The statistics for each
    compression level differ, so tuning the dictionary to the compression level
    can provide improvements.

    Raises TypeError if *zstd_dict* is not a ZstdDict or if *dict_size* or
    *level* is not an int, and ValueError if the samples hold no data.
    """

    if not isinstance(zstd_dict, ZstdDict):
        raise TypeError('zstd_dict argument should be a ZstdDict object.')
    if not isinstance(dict_size, int):
        raise TypeError('dict_size argument should be an int object.')
    if not isinstance(level, int):
        raise TypeError('level argument should be an int object.')

    # Materialize first: the samples are walked twice, once to join them and
    # once to record the length of each one.
    samples = tuple(samples)
    chunks = b''.join(samples)
    chunk_sizes = tuple(_nbytes(sample) for sample in samples)
    if not chunks:
        # The two literals are concatenated implicitly, so the first one must
        # end with a space to keep "the dictionary" as separate words.
        raise ValueError("The samples are empty content, can't finalize the "
                         "dictionary.")
    dict_content = _zstd._finalize_dict(zstd_dict.dict_content,
                                        chunks, chunk_sizes,
                                        dict_size, level)
    return ZstdDict(dict_content)
128+
129+
def compress(data, level=None, options=None, zstd_dict=None):
    """Compress *data* in one shot and return the Zstandard result as bytes.

    *level* is an int giving the compression level; it defaults to
    COMPRESSION_LEVEL_DEFAULT ('3').
    *options* is a dict object holding advanced compression parameters.
    See CompressionParameter for more on options.
    *zstd_dict* is a ZstdDict object, a pre-trained Zstandard dictionary.
    See the function train_dict for how to train a ZstdDict on sample data.

    For incremental compression, use a ZstdCompressor instead.
    """
    compressor = ZstdCompressor(
        level=level,
        options=options,
        zstd_dict=zstd_dict,
    )
    # All of the input is handed over in a single call using FLUSH_FRAME mode.
    frame_mode = ZstdCompressor.FLUSH_FRAME
    return compressor.compress(data, mode=frame_mode)
143+
144+
def decompress(data, zstd_dict=None, options=None):
    """Decompress one or more frames of Zstandard compressed *data*.

    *zstd_dict* is a ZstdDict object, a pre-trained Zstandard dictionary. See
    the function train_dict for how to train a ZstdDict on sample data.
    *options* is a dict object that contains advanced decompression
    parameters. See DecompressionParameter for more on options.

    Returns the decompressed output of all frames joined into one bytes
    object.  Raises ZstdError if *data* ends before a decompressor reports
    end of stream.

    For incremental decompression, use a ZstdDecompressor instead.
    """
    results = []
    while True:
        # Each round uses a fresh decompressor; whatever it leaves in
        # unused_data is fed to the next round.
        decomp = ZstdDecompressor(options=options, zstd_dict=zstd_dict)
        results.append(decomp.decompress(data))
        if not decomp.eof:
            raise ZstdError("Compressed data ended before the "
                            "end-of-stream marker was reached")
        data = decomp.unused_data
        if not data:
            break
    return b"".join(results)
165+
166+
167+
class CompressionParameter(enum.IntEnum):
    """Identifiers of the advanced compression parameters.

    Each member's value is the matching ``_zstd._ZSTD_c_*`` constant.
    """

    # Level and match-finder settings
    compression_level = _zstd._ZSTD_c_compressionLevel
    window_log = _zstd._ZSTD_c_windowLog
    hash_log = _zstd._ZSTD_c_hashLog
    chain_log = _zstd._ZSTD_c_chainLog
    search_log = _zstd._ZSTD_c_searchLog
    min_match = _zstd._ZSTD_c_minMatch
    target_length = _zstd._ZSTD_c_targetLength
    strategy = _zstd._ZSTD_c_strategy

    # Long distance matching (ldm) settings
    enable_long_distance_matching = _zstd._ZSTD_c_enableLongDistanceMatching
    ldm_hash_log = _zstd._ZSTD_c_ldmHashLog
    ldm_min_match = _zstd._ZSTD_c_ldmMinMatch
    ldm_bucket_size_log = _zstd._ZSTD_c_ldmBucketSizeLog
    ldm_hash_rate_log = _zstd._ZSTD_c_ldmHashRateLog

    # Flags
    content_size_flag = _zstd._ZSTD_c_contentSizeFlag
    checksum_flag = _zstd._ZSTD_c_checksumFlag
    dict_id_flag = _zstd._ZSTD_c_dictIDFlag

    # Worker and job settings
    nb_workers = _zstd._ZSTD_c_nbWorkers
    job_size = _zstd._ZSTD_c_jobSize
    overlap_log = _zstd._ZSTD_c_overlapLog

    def bounds(self):
        """Return the (lower, upper) int bounds of a compression parameter.

        Both ends of the range are inclusive.
        """
        param_id = self.value
        return _zstd._get_param_bounds(param_id, is_compress=True)
199+
200+
201+
class DecompressionParameter(enum.IntEnum):
    """Identifiers of the advanced decompression parameters.

    Each member's value is the matching ``_zstd._ZSTD_d_*`` constant.
    """

    window_log_max = _zstd._ZSTD_d_windowLogMax

    def bounds(self):
        """Return the (lower, upper) int bounds of a decompression parameter.

        Both ends of the range are inclusive.
        """
        param_id = self.value
        return _zstd._get_param_bounds(param_id, is_compress=False)
212+
213+
214+
class Strategy(enum.IntEnum):
    """Compression strategies, ordered from fastest to strongest.

    More strategies might be added in the future.  Only the relative order
    (fast to strong) is guaranteed; the numeric value of a member might
    change.
    """

    # Keep the members in this order: it runs from fastest to strongest.
    fast = _zstd._ZSTD_fast
    dfast = _zstd._ZSTD_dfast
    greedy = _zstd._ZSTD_greedy
    lazy = _zstd._ZSTD_lazy
    lazy2 = _zstd._ZSTD_lazy2
    btlazy2 = _zstd._ZSTD_btlazy2
    btopt = _zstd._ZSTD_btopt
    btultra = _zstd._ZSTD_btultra
    btultra2 = _zstd._ZSTD_btultra2
231+
232+
233+
# Check validity of the CompressionParameter & DecompressionParameter types
234+
_zstd._set_parameter_types(CompressionParameter, DecompressionParameter)

0 commit comments

Comments
 (0)