@@ -856,7 +856,7 @@ def _get_resolved_checkpoint_files(
 ) -> Tuple[Optional[List[str]], Optional[Dict]]:
     """Get all the checkpoint filenames based on `pretrained_model_name_or_path`, and optional metadata if the
     checkpoints are sharded.
-    This function will download the data if necesary.
+    This function will download the data if necessary.
     """
     is_sharded = False
@@ -3296,7 +3296,7 @@ def save_pretrained(
                 the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
             save_peft_format (`bool`, *optional*, defaults to `True`):
                 For backward compatibility with PEFT library, in case adapter weights are attached to the model, all
-                keys of the state dict of adapters needs to be pre-pended with `base_model.model`. Advanced users can
+                keys of the state dict of adapters needs to be prepended with `base_model.model`. Advanced users can
                 disable this behaviours by setting `save_peft_format` to `False`.
             kwargs (`Dict[str, Any]`, *optional*):
                 Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
@@ -3400,7 +3400,7 @@ def save_pretrained(
 
         if save_peft_format:
             logger.info(
-                "To match the expected format of the PEFT library, all keys of the state dict of adapters will be pre-pended with `base_model.model`."
+                "To match the expected format of the PEFT library, all keys of the state dict of adapters will be prepended with `base_model.model`."
             )
             peft_state_dict = {}
             for key, value in state_dict.items():
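For context on the hunk above: the loop that follows it builds `peft_state_dict` by prefixing every adapter key with `base_model.model`. A minimal sketch of that key-prefixing step, assuming a plain state dict in and out (the helper name and exact body are illustrative, not the library's code):

def to_peft_format(state_dict: dict) -> dict:
    # Prefix every adapter key so it matches the layout the PEFT library expects,
    # e.g. "lora_A.weight" -> "base_model.model.lora_A.weight".
    return {f"base_model.model.{key}": value for key, value in state_dict.items()}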
@@ -5887,14 +5887,14 @@ def is_accelerator_device(device: Union[str, int, torch.device]) -> bool:
 def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: Dict, hf_quantizer: Optional[HfQuantizer]):
     """This function warm-ups the caching allocator based on the size of the model tensors that will reside on each
     device. It allows to have one large call to Malloc, instead of recursively calling it later when loading
-    the model, which is actually the loading speed botteneck.
+    the model, which is actually the loading speed bottleneck.
     Calling this function allows to cut the model loading time by a very large margin.
 
     A few facts related to loading speed (taking into account the use of this function):
     - When loading a model the first time, it is usually slower than the subsequent times, because the OS is very likely
-      to cache the different state dicts (if enough ressources/RAM are available)
+      to cache the different state dicts (if enough resources/RAM are available)
     - Trying to force the OS to cache the files in advance (by e.g. accessing a small portion of them) is really hard,
-      and not a good idea in general as this is low level OS optimizations that depend on ressource usage anyway
+      and not a good idea in general as this is low level OS optimizations that depend on resource usage anyway
     - As of 18/03/2025, loading a Llama 70B model with TP takes ~1 min without file cache, and ~13s with full file cache.
       The baseline, i.e. only loading the tensor shards on device and adjusting dtype (i.e. copying them) is ~5s with full cache.
       These numbers are reported for TP on 4 H100 GPUs.
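A rough illustration of the warm-up idea this docstring describes, not the function's actual body: the per-device byte accounting and names here are assumed.

import torch

def warmup_caching_allocator(bytes_per_device: dict) -> None:
    # One large allocation per accelerator makes torch's caching allocator grow
    # its pool in a single cudaMalloc, instead of many small ones while the
    # checkpoint shards are copied in later.
    for device, n_bytes in bytes_per_device.items():
        # The tensor is discarded right away; the reserved memory stays in the
        # caching allocator's pool and is reused for the real parameters.
        torch.empty(n_bytes, dtype=torch.uint8, device=device)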
@@ -5935,7 +5935,7 @@ def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: Dict,
         index = device.index if device.index is not None else torch.cuda.current_device()
         device_memory = torch.cuda.mem_get_info(index)[0]
         # Allow up to (max device memory - 1.2 GiB) in resource-constrained hardware configurations. Trying to reserve more
-        # than that amount might sometimes lead to unecesary cuda OOM, if the last parameter to be loaded on the device is large,
+        # than that amount might sometimes lead to unnecessary cuda OOM, if the last parameter to be loaded on the device is large,
         # and the remaining reserved memory portion is smaller than the param size -> torch will then try to fully re-allocate all
         # the param size, instead of using the remaining reserved part, and allocating only the difference, which can lead
         # to OOM. See https://github.com/huggingface/transformers/issues/37436#issuecomment-2808982161 for more details.
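The headroom rule in the comment above could be sketched as follows; the 1.2 GiB margin comes from the diff, while the helper itself is an assumption for illustration only.

import torch

def warmup_budget(device: torch.device, needed_bytes: int) -> int:
    # Cap the warm-up allocation at (currently free memory - 1.2 GiB) so a large
    # final parameter can still be placed without torch having to re-allocate
    # its full size outside the reserved pool.
    index = device.index if device.index is not None else torch.cuda.current_device()
    free_bytes = torch.cuda.mem_get_info(index)[0]
    headroom = int(1.2 * 1024**3)  # the 1.2 GiB margin mentioned above
    return min(needed_bytes, max(free_bytes - headroom, 0))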