
Commit 0302aa1

Fix typos in comments (#37694)
Signed-off-by: co63oc <[email protected]>
1 parent af000ce

35 files changed: +82 -82 lines changed

benchmark/benchmark.py (+1 -1)

@@ -90,7 +90,7 @@ def summarize(run_dir, metrics, expand_metrics=False):
 
     model = benchmark.config.backend["model"]
 
-    # Ths looks like `benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5`.
+    # This looks like `benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5`.
     # (we rely on the usage of hydra's `${hydra.job.override_dirname}`.)
     benchmark_name = re.sub(f"backend.model={model},*", "", report_dir)
     benchmark_name = str(Path(benchmark_name).parts[-1])
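For context, the hunk above strips the `backend.model=...` override from a hydra-style report directory name before using the last path component as the benchmark name. A minimal standalone sketch, with a made-up directory and model name:

import re
from pathlib import Path

# Hypothetical values; a real report_dir comes from hydra's `${hydra.job.override_dirname}`.
model = "meta-llama/Llama-2-7b"
report_dir = f"runs/backend.model={model},benchmark.input_shapes.batch_size=1"

# Drop the `backend.model=...,` segment, then keep the last path component.
benchmark_name = re.sub(f"backend.model={model},*", "", report_dir)
benchmark_name = str(Path(benchmark_name).parts[-1])
print(benchmark_name)  # benchmark.input_shapes.batch_size=1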

benchmark/llama.py (+1 -1)

@@ -293,7 +293,7 @@ def decode_one_token(model, cur_token, cache_position, past_key_values):
     max_cache_len=seq_length + 128,
 )
 
-# 3nd call
+# 3rd call
 start = perf_counter()
 output = model.generate(**inputs, past_key_values=past_key_values)
 end = perf_counter()
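The benchmark labels this the 3rd call: in compile-and-cache benchmarks the earlier calls typically pay one-off compilation and allocation costs, so only later calls measure steady-state latency. A hypothetical helper capturing the `perf_counter` pattern used here:

from time import perf_counter

def timed(fn, *args, **kwargs):
    """Run fn once and return (result, elapsed_seconds)."""
    start = perf_counter()
    result = fn(*args, **kwargs)
    end = perf_counter()
    return result, end - start

# e.g. output, seconds = timed(model.generate, **inputs, past_key_values=past_key_values)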

src/transformers/audio_utils.py (+3 -3)

@@ -37,15 +37,15 @@ def load_audio(audio: Union[str, np.ndarray], sampling_rate=16000, timeout=None)
 
     Args:
         audio (`str` or `np.ndarray`):
-            The audio to be laoded to the numpy array format.
+            The audio to be loaded to the numpy array format.
         sampling_rate (`int`, *optional*, defaults to 16000):
-            The samlping rate to be used when loading the audio. It should be same as the
+            The sampling rate to be used when loading the audio. It should be same as the
             sampling rate the model you will be using further was trained with.
         timeout (`float`, *optional*):
             The timeout value in seconds for the URL request.
 
     Returns:
-        `np.ndarray`: A numpy artay representing the audio.
+        `np.ndarray`: A numpy array representing the audio.
     """
     requires_backends(load_audio, ["librosa"])

src/transformers/cache_utils.py (+3 -3)

@@ -1919,7 +1919,7 @@ def _sliding_update(self, cache_position, layer_idx, key_states, value_states, k
         full_key_states = torch.cat((k_out[:, :, 1:, :], key_states), dim=-2)
         full_value_states = torch.cat((v_out[:, :, 1:, :], value_states), dim=-2)
         # Fast decoding path -> here as the effective size is still sliding window, it is extremely important
-        # to return `self.key_cache[layer_idx]` and `self.value_cache[layer_idx]`, as they have the fixed adress
+        # to return `self.key_cache[layer_idx]` and `self.value_cache[layer_idx]`, as they have the fixed address
         # in memory (the values are the same as the full states, but not the address!!)
         if key_states.shape[-2] == 1:
             self.key_cache[layer_idx].copy_(full_key_states)

@@ -2031,7 +2031,7 @@ def __init__(
         self.active_device_layer = 0
 
     def initialise_cache_layer(self, layer_idx, key_states):
-        """Overriden to use the correct device if offloaded layer (and pin memory)."""
+        """Overridden to use the correct device if offloaded layer (and pin memory)."""
         if len(self.key_cache) > layer_idx:
             return
 
@@ -2243,7 +2243,7 @@ class OffloadedStaticCache(StaticCache):
             The device to offload to. Defaults to CPU.
         layer_device_map (`Dict[int, Union[str, torch.device, int]]`, *optional*):
             Mapping between the layers and its device. This is required when you are manually initializing the cache
-            and the model is splitted between differents gpus. You can know which layers mapped to which device by
+            and the model is split between different gpus. You can know which layers mapped to which device by
             checking the associated device_map: `model.hf_device_map`.
 
     Example:
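The first hunk's comment is about pointer identity: the sliding-window cache must hand back the same pre-allocated buffer every step, updated in place, rather than a freshly concatenated tensor. A standalone sketch with made-up shapes:

import torch

window, head_dim = 4, 8
k_cache = torch.zeros(1, 1, window, head_dim)   # fixed-address cache buffer
new_key = torch.randn(1, 1, 1, head_dim)        # one decoded token

ptr_before = k_cache.data_ptr()
full_keys = torch.cat((k_cache[:, :, 1:, :], new_key), dim=-2)  # drop oldest slot
k_cache.copy_(full_keys)                        # in-place update
assert k_cache.data_ptr() == ptr_before         # same storage, new values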

src/transformers/debug_utils.py (+1 -1)

@@ -80,7 +80,7 @@ class DebugUnderflowOverflow:
     You can see here, that `T5DenseGatedGeluDense.forward` resulted in output activations, whose absolute max value was
     around 62.7K, which is very close to fp16's top limit of 64K. In the next frame we have `Dropout` which
     renormalizes the weights, after it zeroed some of the elements, which pushes the absolute max value to more than
-    64K, and we get an overlow.
+    64K, and we get an overflow.
 
     As you can see it's the previous frames that we need to look into when the numbers start going into very large for
     fp16 numbers.
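The numbers in this docstring are easy to verify: fp16's maximum finite value is 65504, so activations around 62.7K are one small multiplication away from overflow. A quick check:

import torch

x = torch.tensor([62700.0], dtype=torch.float16)
print(x.item(), torch.finfo(torch.float16).max)  # 62688.0 65504.0
print((x * 1.1).item())  # inf: the overflow described above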

src/transformers/modeling_tf_utils.py (+6 -6)

@@ -848,7 +848,7 @@ def load_tf_shard(model, model_layer_map, resolved_archive_file, ignore_mismatch
                 f"Unable to load weights from TF checkpoint file for '{resolved_archive_file}' "
                 f"at '{resolved_archive_file}'. "
                 "If you tried to load a TF model from a sharded checkpoint, you should try converting the model "
-                "by loading it in pytorch and saving it locally. A convertion script should be released soon."
+                "by loading it in pytorch and saving it locally. A conversion script should be released soon."
             )
 
 
@@ -980,10 +980,10 @@ def load_tf_weights_from_h5(model, resolved_archive_file, ignore_mismatched_size
         for symbolic_weight in symbolic_weights:
             # TF names always start with the model name so we ignore it
             if _prefix is not None:
-                delimeter = len(_prefix.split("/"))
+                delimiter = len(_prefix.split("/"))
                 symbolic_weight_name = "/".join(
-                    symbolic_weight.name.split("/")[:delimeter]
-                    + symbolic_weight.name.split("/")[delimeter + 1 :]
+                    symbolic_weight.name.split("/")[:delimiter]
+                    + symbolic_weight.name.split("/")[delimiter + 1 :]
                 )
             else:
                 symbolic_weight_name = "/".join(symbolic_weight.name.split("/")[1:])

@@ -2042,7 +2042,7 @@ def _v2_resized_token_embeddings(self, new_num_tokens: Optional[int] = None) ->
         return model_embeds
 
     def _get_word_embedding_weight(model, embedding_layer):
-        # TODO (joao): flagged for delection due to embeddings refactor
+        # TODO (joao): flagged for detection due to embeddings refactor
 
         # If the variable holds the weights themselves, return them
         if isinstance(embedding_layer, tf.Tensor):

@@ -3312,7 +3312,7 @@ class TFSharedEmbeddings(keras.layers.Layer):
            Additional keyword arguments passed along to the `__init__` of `keras.layers.Layer`.
     """
 
-    # TODO (joao): flagged for delection due to embeddings refactor
+    # TODO (joao): flagged for detection due to embeddings refactor
 
     def __init__(self, vocab_size: int, hidden_size: int, initializer_range: Optional[float] = None, **kwargs):
         super().__init__(**kwargs)
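In the second hunk, `delimiter` counts the `/`-separated segments of `_prefix`, and the segment immediately after the prefix (the model name TF inserts into weight names) is dropped. A toy illustration with a made-up prefix and weight name:

_prefix = "decoder/block_0"
weight_name = "decoder/block_0/tf_model/dense/kernel:0"  # hypothetical

delimiter = len(_prefix.split("/"))  # 2 prefix segments
parts = weight_name.split("/")
# Keep the prefix segments, skip the one segment right after them.
symbolic_weight_name = "/".join(parts[:delimiter] + parts[delimiter + 1:])
print(symbolic_weight_name)  # decoder/block_0/dense/kernel:0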

src/transformers/modeling_utils.py (+7 -7)

@@ -856,7 +856,7 @@ def _get_resolved_checkpoint_files(
 ) -> Tuple[Optional[List[str]], Optional[Dict]]:
     """Get all the checkpoint filenames based on `pretrained_model_name_or_path`, and optional metadata if the
     checkpoints are sharded.
-    This function will download the data if necesary.
+    This function will download the data if necessary.
     """
     is_sharded = False
 
@@ -3296,7 +3296,7 @@ def save_pretrained(
                 the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
             save_peft_format (`bool`, *optional*, defaults to `True`):
                 For backward compatibility with PEFT library, in case adapter weights are attached to the model, all
-                keys of the state dict of adapters needs to be pre-pended with `base_model.model`. Advanced users can
+                keys of the state dict of adapters needs to be prepended with `base_model.model`. Advanced users can
                 disable this behaviours by setting `save_peft_format` to `False`.
             kwargs (`Dict[str, Any]`, *optional*):
                 Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.

@@ -3400,7 +3400,7 @@ def save_pretrained(
 
         if save_peft_format:
             logger.info(
-                "To match the expected format of the PEFT library, all keys of the state dict of adapters will be pre-pended with `base_model.model`."
+                "To match the expected format of the PEFT library, all keys of the state dict of adapters will be prepended with `base_model.model`."
             )
             peft_state_dict = {}
             for key, value in state_dict.items():

@@ -5887,14 +5887,14 @@ def is_accelerator_device(device: Union[str, int, torch.device]) -> bool:
 def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: Dict, hf_quantizer: Optional[HfQuantizer]):
     """This function warm-ups the caching allocator based on the size of the model tensors that will reside on each
     device. It allows to have one large call to Malloc, instead of recursively calling it later when loading
-    the model, which is actually the loading speed botteneck.
+    the model, which is actually the loading speed bottleneck.
     Calling this function allows to cut the model loading time by a very large margin.
 
     A few facts related to loading speed (taking into account the use of this function):
     - When loading a model the first time, it is usually slower than the subsequent times, because the OS is very likely
-    to cache the different state dicts (if enough ressources/RAM are available)
+    to cache the different state dicts (if enough resources/RAM are available)
     - Trying to force the OS to cache the files in advance (by e.g. accessing a small portion of them) is really hard,
-    and not a good idea in general as this is low level OS optimizations that depend on ressource usage anyway
+    and not a good idea in general as this is low level OS optimizations that depend on resource usage anyway
     - As of 18/03/2025, loading a Llama 70B model with TP takes ~1 min without file cache, and ~13s with full file cache.
     The baseline, i.e. only loading the tensor shards on device and adjusting dtype (i.e. copying them) is ~5s with full cache.
     These numbers are reported for TP on 4 H100 GPUs.

@@ -5935,7 +5935,7 @@ def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: Dict,
             index = device.index if device.index is not None else torch.cuda.current_device()
             device_memory = torch.cuda.mem_get_info(index)[0]
             # Allow up to (max device memory - 1.2 GiB) in resource-constrained hardware configurations. Trying to reserve more
-            # than that amount might sometimes lead to unecesary cuda OOM, if the last parameter to be loaded on the device is large,
+            # than that amount might sometimes lead to unnecessary cuda OOM, if the last parameter to be loaded on the device is large,
             # and the remaining reserved memory portion is smaller than the param size -> torch will then try to fully re-allocate all
             # the param size, instead of using the remaining reserved part, and allocating only the difference, which can lead
             # to OOM. See https://github.com/huggingface/transformers/issues/37436#issuecomment-2808982161 for more details.
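The warmup in the last two hunks works because PyTorch's CUDA caching allocator keeps freed blocks reserved: one large allocation up front lets later per-tensor loads reuse cached memory instead of issuing many expensive cudaMalloc calls. A conceptual sketch, not the library's actual implementation:

import torch

def warmup_device(device: torch.device, num_bytes: int) -> None:
    # One big allocation, freed when the function returns; the caching
    # allocator keeps the block, so subsequent loads avoid fresh cudaMalloc calls.
    _ = torch.empty(num_bytes // 2, dtype=torch.float16, device=device)

if torch.cuda.is_available():
    free_bytes, _total = torch.cuda.mem_get_info(0)
    warmup_device(torch.device("cuda:0"), min(free_bytes // 2, 1 << 30))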

src/transformers/models/convnext/image_processing_convnext.py (+6 -6)

@@ -56,24 +56,24 @@ class ConvNextImageProcessor(BaseImageProcessor):
 
     Args:
         do_resize (`bool`, *optional*, defaults to `True`):
-            Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be overriden
+            Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden
             by `do_resize` in the `preprocess` method.
         size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 384}`):
             Resolution of the output image after `resize` is applied. If `size["shortest_edge"]` >= 384, the image is
             resized to `(size["shortest_edge"], size["shortest_edge"])`. Otherwise, the smaller edge of the image will
             be matched to `int(size["shortest_edge"]/crop_pct)`, after which the image is cropped to
             `(size["shortest_edge"], size["shortest_edge"])`. Only has an effect if `do_resize` is set to `True`. Can
-            be overriden by `size` in the `preprocess` method.
+            be overridden by `size` in the `preprocess` method.
         crop_pct (`float` *optional*, defaults to 224 / 256):
             Percentage of the image to crop. Only has an effect if `do_resize` is `True` and size < 384. Can be
-            overriden by `crop_pct` in the `preprocess` method.
+            overridden by `crop_pct` in the `preprocess` method.
         resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
-            Resampling filter to use if resizing the image. Can be overriden by `resample` in the `preprocess` method.
+            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
         do_rescale (`bool`, *optional*, defaults to `True`):
-            Whether to rescale the image by the specified scale `rescale_factor`. Can be overriden by `do_rescale` in
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
             the `preprocess` method.
         rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
-            Scale factor to use if rescaling the image. Can be overriden by `rescale_factor` in the `preprocess`
+            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
             method.
         do_normalize (`bool`, *optional*, defaults to `True`):
             Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
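The `size`/`crop_pct` interplay documented above is easy to work through: for the default `crop_pct = 224 / 256`, a requested `shortest_edge` of 224 means resizing the smaller edge to 256 and then center-cropping to 224. A sketch of just that rule:

def resize_target(shortest_edge: int, crop_pct: float = 224 / 256) -> int:
    """Edge length to resize to before any crop, per the docstring above."""
    if shortest_edge >= 384:
        return shortest_edge                  # resized directly, no crop
    return int(shortest_edge / crop_pct)      # then center-crop to shortest_edge

print(resize_target(384))  # 384
print(resize_target(224))  # 256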

src/transformers/models/data2vec/configuration_data2vec_audio.py (+2 -2)

@@ -91,7 +91,7 @@ class Data2VecAudioConfig(PretrainedConfig):
         mask_time_prob (`float`, *optional*, defaults to 0.05):
             Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
             procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
-            reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
+            reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
             masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
         mask_time_length (`int`, *optional*, defaults to 10):
             Length of vector span along the time axis.

@@ -102,7 +102,7 @@ class Data2VecAudioConfig(PretrainedConfig):
         mask_feature_prob (`float`, *optional*, defaults to 0.0):
             Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
             masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
-            the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
+            the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
             span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
             may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
             True`.
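The masking formula in these docstrings is worth a quick sanity check: with `mask_time_prob = 0.05`, `mask_time_length = 10`, and an illustrative 1000-frame time axis, the procedure draws 0.05 * 1000 / 10 = 5 independent spans, covering about 50 frames (5%) when no spans overlap:

mask_time_prob, mask_time_length, num_frames = 0.05, 10, 1000
num_masks = int(mask_time_prob * num_frames / mask_time_length)
masked_fraction = num_masks * mask_time_length / num_frames  # ignores overlap
print(num_masks, masked_fraction)  # 5 0.05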

src/transformers/models/deprecated/tvlt/image_processing_tvlt.py (+1 -1)

@@ -79,7 +79,7 @@ class TvltImageProcessor(BaseImageProcessor):
             `do_resize` parameter in the `preprocess` method.
         size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
             Size of the output image after resizing. The shortest edge of the image will be resized to
-            `size["shortest_edge"]` while maintaining the aspect ratio of the original image. Can be overriden by
+            `size["shortest_edge"]` while maintaining the aspect ratio of the original image. Can be overridden by
             `size` in the `preprocess` method.
         patch_size (`List[int]` *optional*, defaults to [16,16]):
             The patch size of image patch embedding.

src/transformers/models/hubert/configuration_hubert.py (+2 -2)

@@ -107,7 +107,7 @@ class HubertConfig(PretrainedConfig):
         mask_time_prob (`float`, *optional*, defaults to 0.05):
             Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
             procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
-            reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
+            reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
             masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
             actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
         mask_time_length (`int`, *optional*, defaults to 10):

@@ -119,7 +119,7 @@ class HubertConfig(PretrainedConfig):
         mask_feature_prob (`float`, *optional*, defaults to 0.0):
             Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
             masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
-            the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
+            the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
             span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
             may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
             True`.

src/transformers/models/idefics/processing_idefics.py (+1 -1)

@@ -377,7 +377,7 @@ def __call__(
         add_eos_token = output_kwargs["text_kwargs"].pop("add_eos_token", False)
         add_end_of_utterance_token = output_kwargs["text_kwargs"].pop("add_end_of_utterance_token", None)
 
-        # if the value isn't overriden by the user, check if the tokenizer was trained with this token and then use it
+        # if the value isn't overridden by the user, check if the tokenizer was trained with this token and then use it
         if add_end_of_utterance_token is None:
             add_end_of_utterance_token = self.tokenizer_was_trained_with_end_of_utterance_token
         # turn non-batched prompts into batched

src/transformers/models/imagegpt/image_processing_imagegpt.py (+1 -1)

@@ -66,7 +66,7 @@ class ImageGPTImageProcessor(BaseImageProcessor):
 
     Args:
         clusters (`np.ndarray` or `List[List[int]]`, *optional*):
-            The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overriden by `clusters`
+            The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters`
             in `preprocess`.
         do_resize (`bool`, *optional*, defaults to `True`):
             Whether to resize the image's dimensions to `(size["height"], size["width"])`. Can be overridden by
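Color quantization with a `(n_clusters, 3)` palette, as this docstring describes, maps each RGB pixel to the index of its nearest cluster color. A sketch with a made-up three-color palette rather than ImageGPT's actual clusters:

import numpy as np

clusters = np.array([[0, 0, 0], [255, 255, 255], [255, 0, 0]])     # (n_clusters, 3)
pixels = np.array([[10, 10, 10], [250, 240, 245], [200, 20, 30]])  # (n_pixels, 3)

# Squared distance from every pixel to every cluster color, then argmin.
dists = ((pixels[:, None, :] - clusters[None, :, :]) ** 2).sum(-1)
tokens = dists.argmin(axis=1)
print(tokens)  # [0 1 2]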
