@@ -856,7 +856,7 @@ def _get_resolved_checkpoint_files(
 ) -> Tuple[Optional[List[str]], Optional[Dict]]:
     """Get all the checkpoint filenames based on `pretrained_model_name_or_path`, and optional metadata if the
     checkpoints are sharded.
-    This function will download the data if necesary.
+    This function will download the data if necessary.
     """
     is_sharded = False
@@ -3296,7 +3296,7 @@ def save_pretrained(
                 the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
             save_peft_format (`bool`, *optional*, defaults to `True`):
                 For backward compatibility with PEFT library, in case adapter weights are attached to the model, all
-                keys of the state dict of adapters needs to be pre-pended with `base_model.model`. Advanced users can
+                keys of the state dict of adapters needs to be prepended with `base_model.model`. Advanced users can
                 disable this behaviours by setting `save_peft_format` to `False`.
             kwargs (`Dict[str, Any]`, *optional*):
                 Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
@@ -3400,7 +3400,7 @@ def save_pretrained(
 
         if save_peft_format:
             logger.info(
-                "To match the expected format of the PEFT library, all keys of the state dict of adapters will be pre-pended with `base_model.model`."
+                "To match the expected format of the PEFT library, all keys of the state dict of adapters will be prepended with `base_model.model`."
             )
             peft_state_dict = {}
             for key, value in state_dict.items():
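For context on the hunk above: the loop that follows it builds `peft_state_dict` by prefixing every adapter key with `base_model.model`. A minimal sketch of that key-prefixing step, assuming a plain state dict in and out (the helper name and exact body are illustrative, not the library's code):

def to_peft_format(state_dict: dict) -> dict:
    # Prefix every adapter key so it matches the layout the PEFT library expects,
    # e.g. "lora_A.weight" -> "base_model.model.lora_A.weight".
    return {f"base_model.model.{key}": value for key, value in state_dict.items()}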
@@ -5887,14 +5887,14 @@ def is_accelerator_device(device: Union[str, int, torch.device]) -> bool:
 def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: Dict, hf_quantizer: Optional[HfQuantizer]):
     """This function warm-ups the caching allocator based on the size of the model tensors that will reside on each
     device. It allows to have one large call to Malloc, instead of recursively calling it later when loading
-    the model, which is actually the loading speed botteneck.
+    the model, which is actually the loading speed bottleneck.
     Calling this function allows to cut the model loading time by a very large margin.
 
     A few facts related to loading speed (taking into account the use of this function):
     - When loading a model the first time, it is usually slower than the subsequent times, because the OS is very likely
-      to cache the different state dicts (if enough ressources/RAM are available)
+      to cache the different state dicts (if enough resources/RAM are available)
     - Trying to force the OS to cache the files in advance (by e.g. accessing a small portion of them) is really hard,
-      and not a good idea in general as this is low level OS optimizations that depend on ressource usage anyway
+      and not a good idea in general as this is low level OS optimizations that depend on resource usage anyway
     - As of 18/03/2025, loading a Llama 70B model with TP takes ~1 min without file cache, and ~13s with full file cache.
       The baseline, i.e. only loading the tensor shards on device and adjusting dtype (i.e. copying them) is ~5s with full cache.
       These numbers are reported for TP on 4 H100 GPUs.
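A rough illustration of the warm-up idea this docstring describes, not the function's actual body: the per-device byte accounting and names here are assumed.

import torch

def warmup_caching_allocator(bytes_per_device: dict) -> None:
    # One large allocation per accelerator makes torch's caching allocator grow
    # its pool in a single cudaMalloc, instead of many small ones while the
    # checkpoint shards are copied in later.
    for device, n_bytes in bytes_per_device.items():
        # The tensor is discarded right away; the reserved memory stays in the
        # caching allocator's pool and is reused for the real parameters.
        torch.empty(n_bytes, dtype=torch.uint8, device=device)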
@@ -5935,7 +5935,7 @@ def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: Dict,
         index = device.index if device.index is not None else torch.cuda.current_device()
         device_memory = torch.cuda.mem_get_info(index)[0]
         # Allow up to (max device memory - 1.2 GiB) in resource-constrained hardware configurations. Trying to reserve more
-        # than that amount might sometimes lead to unecesary cuda OOM, if the last parameter to be loaded on the device is large,
+        # than that amount might sometimes lead to unnecessary cuda OOM, if the last parameter to be loaded on the device is large,
         # and the remaining reserved memory portion is smaller than the param size -> torch will then try to fully re-allocate all
         # the param size, instead of using the remaining reserved part, and allocating only the difference, which can lead
         # to OOM. See https://github.com/huggingface/transformers/issues/37436#issuecomment-2808982161 for more details.
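The headroom rule in the comment above could be sketched as follows; the 1.2 GiB margin comes from the diff, while the helper itself is an assumption for illustration only.

import torch

def warmup_budget(device: torch.device, needed_bytes: int) -> int:
    # Cap the warm-up allocation at (currently free memory - 1.2 GiB) so a large
    # final parameter can still be placed without torch having to re-allocate
    # its full size outside the reserved pool.
    index = device.index if device.index is not None else torch.cuda.current_device()
    free_bytes = torch.cuda.mem_get_info(index)[0]
    headroom = int(1.2 * 1024**3)  # the 1.2 GiB margin mentioned above
    return min(needed_bytes, max(free_bytes - headroom, 0))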