Skip to content

Commit 5375fa0

Browse files
committed
Feat: Add "auto" checksum option and make default (#1383)
1 parent 7b6c9a0 commit 5375fa0

File tree

17 files changed

+275
-222
lines changed

17 files changed

+275
-222
lines changed

README.rst

+11
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,17 @@ setup.py file. Applications which do not import directly from
7272
`google-resumable-media` can safely disregard this dependency. This backwards
7373
compatibility feature will be removed in a future major version update.
7474

75+
Checksum Defaults
76+
~~~~~~~~~~~~~~~~~
77+
78+
In Python Storage 3.0, uploads and downloads now use a checksum default of "auto" where
79+
applicable. "Auto" will use crc32c checksums, except for unusual cases where the
80+
fast (C extension) crc32c implementation is not available, in which case it will
81+
use md5 instead. Before Python Storage 3.0, the default was md5 for most
82+
downloads and None for most uploads. Note that ranged downloads ("start" or
83+
"end" set) still do not support any checksumming, and some features in
84+
`transfer_manager.py` still support crc32c only.
85+
7586
Miscellaneous
7687
~~~~~~~~~~~~~
7788

google/cloud/storage/_media/_download.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -130,16 +130,28 @@ class Download(DownloadBase):
130130
appropriate checksum (for instance in the case of transcoded or
131131
ranged downloads where the remote service does not know the
132132
correct checksum) an INFO-level log will be emitted. Supported
133-
values are "md5", "crc32c" and None.
133+
values are "md5", "crc32c", "auto" and None. The default is "auto",
134+
which will try to detect if the C extension for crc32c is installed
135+
and fall back to md5 otherwise.
134136
"""
135137

136138
def __init__(
137-
self, media_url, stream=None, start=None, end=None, headers=None, checksum="md5"
139+
self,
140+
media_url,
141+
stream=None,
142+
start=None,
143+
end=None,
144+
headers=None,
145+
checksum="auto",
138146
):
139147
super(Download, self).__init__(
140148
media_url, stream=stream, start=start, end=end, headers=headers
141149
)
142150
self.checksum = checksum
151+
if self.checksum == "auto":
152+
self.checksum = (
153+
"crc32c" if _helpers._is_crc32c_available_and_fast() else "md5"
154+
)
143155
self._bytes_downloaded = 0
144156
self._expected_checksum = None
145157
self._checksum_object = None

google/cloud/storage/_media/_helpers.py

+22-43
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
import hashlib
2121
import logging
2222
import random
23-
import warnings
2423

2524
from urllib.parse import parse_qs
2625
from urllib.parse import urlencode
@@ -142,43 +141,6 @@ def calculate_retry_wait(base_wait, max_sleep, multiplier=2.0):
142141
return new_base_wait, new_base_wait + 0.001 * jitter_ms
143142

144143

145-
def _get_crc32c_object():
146-
"""Get crc32c object
147-
Attempt to use the Google-CRC32c package. If it isn't available, try
148-
to use CRCMod. CRCMod might be using a 'slow' varietal. If so, warn...
149-
"""
150-
try:
151-
import google_crc32c # type: ignore
152-
153-
crc_obj = google_crc32c.Checksum()
154-
except ImportError:
155-
try:
156-
import crcmod # type: ignore
157-
158-
crc_obj = crcmod.predefined.Crc("crc-32c")
159-
_is_fast_crcmod()
160-
161-
except ImportError:
162-
raise ImportError("Failed to import either `google-crc32c` or `crcmod`")
163-
164-
return crc_obj
165-
166-
167-
def _is_fast_crcmod():
168-
# Determine if this is using the slow form of crcmod.
169-
nested_crcmod = __import__(
170-
"crcmod.crcmod",
171-
globals(),
172-
locals(),
173-
["_usingExtension"],
174-
0,
175-
)
176-
fast_crc = getattr(nested_crcmod, "_usingExtension", False)
177-
if not fast_crc:
178-
warnings.warn(_SLOW_CRC32C_WARNING, RuntimeWarning, stacklevel=2)
179-
return fast_crc
180-
181-
182144
def _get_metadata_key(checksum_type):
183145
if checksum_type == "md5":
184146
return "md5Hash"
@@ -231,10 +193,7 @@ def _get_expected_checksum(response, get_headers, media_url, checksum_type):
231193
_LOGGER.info(msg)
232194
checksum_object = _DoNothingHash()
233195
else:
234-
if checksum_type == "md5":
235-
checksum_object = hashlib.md5()
236-
else:
237-
checksum_object = _get_crc32c_object()
196+
checksum_object = _get_checksum_object(checksum_type)
238197
else:
239198
expected_checksum = None
240199
checksum_object = _DoNothingHash()
@@ -331,13 +290,33 @@ def _get_checksum_object(checksum_type):
331290
if checksum_type == "md5":
332291
return hashlib.md5()
333292
elif checksum_type == "crc32c":
334-
return _get_crc32c_object()
293+
# In order to support platforms that don't have google_crc32c
294+
# support, only perform the import on demand.
295+
import google_crc32c
296+
297+
return google_crc32c.Checksum()
335298
elif checksum_type is None:
336299
return None
337300
else:
338301
raise ValueError("checksum must be ``'md5'``, ``'crc32c'`` or ``None``")
339302

340303

304+
def _is_crc32c_available_and_fast():
305+
"""Return True if the google_crc32c C extension is installed.
306+
307+
Return False if either the package is not installed, or if only the
308+
pure-Python version is installed.
309+
"""
310+
try:
311+
import google_crc32c
312+
313+
if google_crc32c.implementation == "c":
314+
return True
315+
except Exception:
316+
pass
317+
return False
318+
319+
341320
def _parse_generation_header(response, get_headers):
342321
"""Parses the generation header from an ``X-Goog-Generation`` value.
343322

google/cloud/storage/_media/_upload.py

+30-13
Original file line numberDiff line numberDiff line change
@@ -249,19 +249,25 @@ class MultipartUpload(UploadBase):
249249
upload_url (str): The URL where the content will be uploaded.
250250
headers (Optional[Mapping[str, str]]): Extra headers that should
251251
be sent with the request, e.g. headers for encrypted data.
252-
checksum (Optional([str])): The type of checksum to compute to verify
252+
checksum (Optional([str])): The type of checksum to compute to verify
253253
the integrity of the object. The request metadata will be amended
254254
to include the computed value. Using this option will override a
255-
manually-set checksum value. Supported values are "md5", "crc32c"
256-
and None. The default is None.
255+
manually-set checksum value. Supported values are "md5",
256+
"crc32c", "auto", and None. The default is "auto", which will try
257+
to detect if the C extension for crc32c is installed and fall back
258+
to md5 otherwise.
257259
258260
Attributes:
259261
upload_url (str): The URL where the content will be uploaded.
260262
"""
261263

262-
def __init__(self, upload_url, headers=None, checksum=None):
264+
def __init__(self, upload_url, headers=None, checksum="auto"):
263265
super(MultipartUpload, self).__init__(upload_url, headers=headers)
264266
self._checksum_type = checksum
267+
if self._checksum_type == "auto":
268+
self._checksum_type = (
269+
"crc32c" if _helpers._is_crc32c_available_and_fast() else "md5"
270+
)
265271

266272
def _prepare_request(self, data, metadata, content_type):
267273
"""Prepare the contents of an HTTP request.
@@ -355,13 +361,15 @@ class ResumableUpload(UploadBase):
355361
chunk_size (int): The size of each chunk used to upload the resource.
356362
headers (Optional[Mapping[str, str]]): Extra headers that should
357363
be sent with every request.
358-
checksum (Optional([str])): The type of checksum to compute to verify
364+
checksum (Optional([str])): The type of checksum to compute to verify
359365
the integrity of the object. After the upload is complete, the
360-
server-computed checksum of the resulting object will be read
366+
server-computed checksum of the resulting object will be checked
361367
and google.cloud.storage.exceptions.DataCorruption will be raised on
362368
a mismatch. The corrupted file will not be deleted from the remote
363-
host automatically. Supported values are "md5", "crc32c" and None.
364-
The default is None.
369+
host automatically. Supported values are "md5", "crc32c", "auto",
370+
and None. The default is "auto", which will try to detect if the C
371+
extension for crc32c is installed and fall back to md5 otherwise.
372+
365373
366374
Attributes:
367375
upload_url (str): The URL where the content will be uploaded.
@@ -371,7 +379,7 @@ class ResumableUpload(UploadBase):
371379
:data:`.UPLOAD_CHUNK_SIZE`.
372380
"""
373381

374-
def __init__(self, upload_url, chunk_size, checksum=None, headers=None):
382+
def __init__(self, upload_url, chunk_size, checksum="auto", headers=None):
375383
super(ResumableUpload, self).__init__(upload_url, headers=headers)
376384
if chunk_size % UPLOAD_CHUNK_SIZE != 0:
377385
raise ValueError(
@@ -383,6 +391,10 @@ def __init__(self, upload_url, chunk_size, checksum=None, headers=None):
383391
self._bytes_uploaded = 0
384392
self._bytes_checksummed = 0
385393
self._checksum_type = checksum
394+
if self._checksum_type == "auto":
395+
self._checksum_type = (
396+
"crc32c" if _helpers._is_crc32c_available_and_fast() else "md5"
397+
)
386398
self._checksum_object = None
387399
self._total_bytes = None
388400
self._resumable_url = None
@@ -1185,9 +1197,10 @@ class XMLMPUPart(UploadBase):
11851197
be sent with every request.
11861198
checksum (Optional([str])): The type of checksum to compute to verify
11871199
the integrity of the object. The request headers will be amended
1188-
to include the computed value. Supported values are "md5", "crc32c"
1189-
and None. The default is None.
1190-
1200+
to include the computed value. Supported values are "md5", "crc32c",
1201+
"auto" and None. The default is "auto", which will try to detect if
1202+
the C extension for crc32c is installed and fall back to md5
1203+
otherwise.
11911204
Attributes:
11921205
upload_url (str): The URL of the object (without query parameters).
11931206
upload_id (str): The ID of the upload from the initialization response.
@@ -1208,7 +1221,7 @@ def __init__(
12081221
end,
12091222
part_number,
12101223
headers=None,
1211-
checksum=None,
1224+
checksum="auto",
12121225
):
12131226
super().__init__(upload_url, headers=headers)
12141227
self._filename = filename
@@ -1218,6 +1231,10 @@ def __init__(
12181231
self._part_number = part_number
12191232
self._etag = None
12201233
self._checksum_type = checksum
1234+
if self._checksum_type == "auto":
1235+
self._checksum_type = (
1236+
"crc32c" if _helpers._is_crc32c_available_and_fast() else "md5"
1237+
)
12211238
self._checksum_object = None
12221239

12231240
@property

google/cloud/storage/_media/requests/download.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,9 @@ class Download(_request_helpers.RequestsMixin, _download.Download):
6969
appropriate checksum (for instance in the case of transcoded or
7070
ranged downloads where the remote service does not know the
7171
correct checksum) an INFO-level log will be emitted. Supported
72-
values are "md5", "crc32c" and None. The default is "md5".
72+
values are "md5", "crc32c", "auto" and None. The default is "auto",
73+
which will try to detect if the C extension for crc32c is installed
74+
and fall back to md5 otherwise.
7375
7476
Attributes:
7577
media_url (str): The URL containing the media to be downloaded.
@@ -263,7 +265,9 @@ class RawDownload(_request_helpers.RawRequestsMixin, _download.Download):
263265
appropriate checksum (for instance in the case of transcoded or
264266
ranged downloads where the remote service does not know the
265267
correct checksum) an INFO-level log will be emitted. Supported
266-
values are "md5", "crc32c" and None. The default is "md5".
268+
values are "md5", "crc32c", "auto" and None. The default is "auto",
269+
which will try to detect if the C extension for crc32c is installed
270+
and fall back to md5 otherwise.
267271
Attributes:
268272
media_url (str): The URL containing the media to be downloaded.
269273
start (Optional[int]): The first byte in a range to be downloaded.

google/cloud/storage/_media/requests/upload.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,9 @@ class MultipartUpload(_request_helpers.RequestsMixin, _upload.MultipartUpload):
9898
the integrity of the object. The request metadata will be amended
9999
to include the computed value. Using this option will override a
100100
manually-set checksum value. Supported values are "md5",
101-
"crc32c" and None. The default is None.
101+
"crc32c", "auto", and None. The default is "auto", which will try
102+
to detect if the C extension for crc32c is installed and fall back
103+
to md5 otherwise.
102104
103105
Attributes:
104106
upload_url (str): The URL where the content will be uploaded.
@@ -334,8 +336,9 @@ class ResumableUpload(_request_helpers.RequestsMixin, _upload.ResumableUpload):
334336
server-computed checksum of the resulting object will be checked
335337
and google.cloud.storage.exceptions.DataCorruption will be raised on
336338
a mismatch. The corrupted file will not be deleted from the remote
337-
host automatically. Supported values are "md5", "crc32c" and None.
338-
The default is None.
339+
host automatically. Supported values are "md5", "crc32c", "auto",
340+
and None. The default is "auto", which will try to detect if the C
341+
extension for crc32c is installed and fall back to md5 otherwise.
339342
340343
Attributes:
341344
upload_url (str): The URL where the content will be uploaded.

0 commit comments

Comments
 (0)