
Commit 0c3f25a

Merge branch 'main' of github.com:huggingface/new-model-addition-meta into final-version

2 parents 90d5876 + 3249c5d

437 files changed: +13763 additions, -6063 deletions


.github/ISSUE_TEMPLATE/bug-report.yml

Lines changed: 2 additions & 2 deletions
@@ -48,11 +48,11 @@ body:
  - pipelines: @Rocketknight1
  - tensorflow: @gante and @Rocketknight1
  - tokenizers: @ArthurZucker and @itazap
- - trainer: @muellerzr @SunMarc
+ - trainer: @zach-huggingface @SunMarc

  Integrations:

- - deepspeed: HF Trainer/Accelerate: @muellerzr
+ - deepspeed: HF Trainer/Accelerate: @SunMarc @zach-huggingface
  - ray/raytune: @richardliaw, @amogkam
  - Big Model Inference: @SunMarc
  - quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber

.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 2 additions & 2 deletions
@@ -51,12 +51,12 @@ Library:
  - pipelines: @Rocketknight1
  - tensorflow: @gante and @Rocketknight1
  - tokenizers: @ArthurZucker
- - trainer: @muellerzr and @SunMarc
+ - trainer: @zach-huggingface and @SunMarc
  - chat templates: @Rocketknight1

  Integrations:

- - deepspeed: HF Trainer/Accelerate: @muellerzr
+ - deepspeed: HF Trainer/Accelerate: @SunMarc @zach-huggingface
  - ray/raytune: @richardliaw, @amogkam
  - Big Model Inference: @SunMarc
  - quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber

.github/scripts/codeowners_for_review_action

Lines changed: 3 additions & 3 deletions
@@ -14,7 +14,7 @@ docs/ @stevhliu
 # Owners of subsections of the library
 /src/transformers/generation/ @gante
 /src/transformers/pipeline/ @Rocketknight1 @yonigozlan
-/src/transformers/integrations/ @SunMarc @MekkCyber @muellerzr
+/src/transformers/integrations/ @SunMarc @MekkCyber @zach-huggingface
 /src/transformers/quantizers/ @SunMarc @MekkCyber
 tests/ @ydshieh
 tests/generation/ @gante
@@ -27,8 +27,8 @@ tests/generation/ @gante
 # Specific files come after the sections/globs, so they take priority
 /.circleci/config.yml @ArthurZucker @ydshieh
 /utils/tests_fetcher.py @ydshieh
-trainer.py @muellerzr @SunMarc
-trainer_utils.py @muellerzr @SunMarc
+trainer.py @zach-huggingface @SunMarc
+trainer_utils.py @zach-huggingface @SunMarc
 /utils/modular_model_converter.py @Cyrilvallez @ArthurZucker

 # Owners of individual models are specific / high priority, and so they come last

benchmark/README.md

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,

 ## Writing metrics to the database

-`MetricRecorder` is thread-safe, in the sense of the python [`Thread`](https://docs.python.org/3/library/threading.html#threading.Thread). This means you can start a background thread to do the readings on the device measurements while not blocking the main thread to execute the model measurements.
+`MetricsRecorder` is thread-safe, in the sense of the python [`Thread`](https://docs.python.org/3/library/threading.html#threading.Thread). This means you can start a background thread to do the readings on the device measurements while not blocking the main thread to execute the model measurements.

 cf [`llama.py`](./llama.py) to see an example of this in practice.
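The sentence fixed above describes the intended usage pattern: device readings run on a background thread while the main thread times the model. Below is a minimal sketch of that pattern, not the benchmark's actual code; the `collect_device_measurements` method name is an assumption for illustration.

```python
import threading
import time
from time import perf_counter

def read_device_measurements(recorder, stop_event, interval_s=0.1):
    # Background thread: poll device readings until asked to stop.
    # `collect_device_measurements` is a hypothetical, thread-safe method.
    while not stop_event.is_set():
        recorder.collect_device_measurements()
        time.sleep(interval_s)

def run_with_background_readings(recorder, run_model):
    # Main thread: time the model while the reader thread runs concurrently.
    stop_event = threading.Event()
    reader = threading.Thread(
        target=read_device_measurements, args=(recorder, stop_event), daemon=True
    )
    reader.start()
    start = perf_counter()
    run_model()
    elapsed = perf_counter() - start
    stop_event.set()
    reader.join()
    return elapsed
```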

benchmark/benchmarks_entrypoint.py

Lines changed: 0 additions & 1 deletion
@@ -3,7 +3,6 @@
 import logging
 import os
 from typing import Dict
-import psycopg2
 import sys

 from psycopg2.extras import Json

benchmark/llama.py

Lines changed: 8 additions & 8 deletions
@@ -204,7 +204,7 @@ def decode_one_token(model, cur_token, cache_position, past_key_values):
 time_to_first_token = end - start
 logger.info(f"completed first compile generation in: {time_to_first_token}s")
 cache_position += 1
-all_generated_tokens += next_token.clone().detach().cpu().tolist()
+all_generated_tokens += next_token.tolist()

 cache_position = torch.tensor([seq_length], device=device)
 ### First compile, decoding
@@ -215,9 +215,9 @@ def decode_one_token(model, cur_token, cache_position, past_key_values):
 torch.cuda.synchronize()
 end = perf_counter()
 time_to_second_token = end - start
-logger.info(f"completed second compile generation in: {time_to_first_token}s")
+logger.info(f"completed second compile generation in: {time_to_second_token}s")
 cache_position += 1
-all_generated_tokens += next_token.clone().detach().cpu().tolist()
+all_generated_tokens += next_token.tolist()

 ### Second compile, decoding
 start = perf_counter()
@@ -227,15 +227,15 @@ def decode_one_token(model, cur_token, cache_position, past_key_values):
 torch.cuda.synchronize()
 end = perf_counter()
 time_to_third_token = end - start
-logger.info(f"completed third compile forward in: {time_to_first_token}s")
+logger.info(f"completed third compile forward in: {time_to_third_token}s")
 cache_position += 1
-all_generated_tokens += next_token.clone().detach().cpu().tolist()
+all_generated_tokens += next_token.tolist()

 ### Using cuda graphs decoding

 start = perf_counter()
 for _ in range(1, num_tokens_to_generate):
-    all_generated_tokens += next_token.clone().detach().cpu().tolist()
+    all_generated_tokens += next_token.tolist()
     next_token = decode_one_token(
         model, next_token.clone(), cache_position=cache_position, past_key_values=past_key_values
     )
@@ -298,7 +298,7 @@ def decode_one_token(model, cur_token, cache_position, past_key_values):
 output = model.generate(**inputs, past_key_values=past_key_values)
 end = perf_counter()
 third_compile_generate_time = end - start
-logger.info(f"completed second compile generation in: {third_compile_generate_time}s")
+logger.info(f"completed third compile generation in: {third_compile_generate_time}s")
 logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")

 past_key_values = StaticCache(
@@ -313,7 +313,7 @@ def decode_one_token(model, cur_token, cache_position, past_key_values):
 output = model.generate(**inputs, past_key_values=past_key_values)
 end = perf_counter()
 fourth_compile_generate_time = end - start
-logger.info(f"completed second compile generation in: {fourth_compile_generate_time}s")
+logger.info(f"completed fourth compile generation in: {fourth_compile_generate_time}s")
 logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")

 metrics_recorder.collect_model_measurements(
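The recurring change from `.clone().detach().cpu().tolist()` to `.tolist()` above works because `torch.Tensor.tolist()` already copies the values to host Python objects, including for CUDA tensors; a quick sanity-check sketch:

```python
import torch

t = torch.arange(3)
if torch.cuda.is_available():
    t = t.cuda()  # the benchmark runs on GPU, but the equivalence holds either way

# .tolist() detaches and moves data to host on its own, so the longer chain is redundant.
assert t.tolist() == t.clone().detach().cpu().tolist() == [0, 1, 2]
```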

conftest.py

Lines changed: 0 additions & 4 deletions
@@ -46,10 +46,6 @@
     "test_keep_in_fp32_modules",
     "test_gradient_checkpointing_backward_compatibility",
     "test_gradient_checkpointing_enable_disable",
-    "test_save_load_fast_init_from_base",
-    "test_fast_init_context_manager",
-    "test_fast_init_tied_embeddings",
-    "test_save_load_fast_init_to_base",
     "test_torch_save_load",
     "test_initialization",
     "test_forward_signature",

docs/source/en/_toctree.yml

Lines changed: 6 additions & 0 deletions
@@ -415,6 +415,8 @@
   title: DeBERTa
 - local: model_doc/deberta-v2
   title: DeBERTa-v2
+- local: model_doc/deepseek_v3
+  title: DeepSeek-V3
 - local: model_doc/dialogpt
   title: DialoGPT
 - local: model_doc/diffllama
@@ -603,6 +605,10 @@
   title: Qwen2
 - local: model_doc/qwen2_moe
   title: Qwen2MoE
+- local: model_doc/qwen3
+  title: Qwen3
+- local: model_doc/qwen3_moe
+  title: Qwen3MoE
 - local: model_doc/rag
   title: RAG
 - local: model_doc/realm

docs/source/en/attention_interface.md

Lines changed: 30 additions & 8 deletions
@@ -23,13 +23,13 @@ supported models.
 Most recent models can now switch from one attention function used in the Attention layer to the other, thanks to a simple mapping.
 By default, we provide the implementation for [`sdpa`](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html),
 [`flash_attention_2`](https://github.com/Dao-AILab/flash-attention) and [`flex_attention`](https://pytorch.org/docs/stable/nn.attention.flex_attention.html#module-torch.nn.attention.flex_attention)
-as well as `eager`, which is simple matrix multiplication without any optimization on top.
+as well as `eager`, which is a simple matrix multiplication without any optimization on top.
 This is the setting you can usually choose when instantiating a model:

 ```python
 from transformers import AutoModelForCausalLM

-model_id = "meta-llama/Llama-3.2-1B
+model_id = "meta-llama/Llama-3.2-1B"

 # Here, using flash attention as an example
 model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="flash_attention_2")
@@ -43,7 +43,7 @@ from transformers import AutoModelForCausalLM, AttentionInterface
 from transformers.integrations.sdpa_attention import sdpa_attention_forward
 import torch

-model_id = "meta-llama/Llama-3.2-1B
+model_id = "meta-llama/Llama-3.2-1B"

 def my_new_sdpa(*args, **kwargs):
     print("I just entered the attention computation")
@@ -56,7 +56,7 @@ model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="my_n
 model(torch.ones(1, 5, dtype=int))
 ```

-You will see it prints "I just entered the attention computation" as many times as there are layers in the model (with this example, 16 times.
+You will see it prints "I just entered the attention computation" as many times as there are layers in the model (with this example, 16 times).

 ## Dynamically switching attention function

@@ -70,12 +70,12 @@ model(torch.ones(1, 5, dtype=int))
 ```

 and it will stop printing the statements, as it now uses the `sdpa` attention.
-This allows to quickly change attention function, without needing to reload the model!
+This allows to quickly change an attention function, without needing to reload the model!

-## What about new args needed in my custom function?
+## What about new args needed in my custom attention function?

 But indeed, what if the new function requires a new arg to be properly used? It's no issue! Models supporting the
-`AttentionInterface` propagates kwargs all the way to the Attention layers, and to the attention function used. That way,
+`AttentionInterface` propagate kwargs all the way to the Attention layers, and to the used attention function. That way,
 you can simply pass the arg (as a kwargs, i.e. you need to qualify the name of the arg) in the model's forward, and it will be correctly used in the attention. However, custom attention functions have some limitations. In particular, it must follow the signature and return format of other attention functions, i.e.

 ```python
@@ -103,4 +103,26 @@ model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="cust
 model(torch.ones(1, 5, dtype=int), a_new_kwargs=..., another_new_kwargs=...)
 ```

-If in doubt about what args/kwargs a given model sends to the attention function, simply check that model's modeling code on [GitHub](https://github.com/huggingface/transformers/tree/main/src/transformers/models)!
+If in doubt about what args/kwargs a given model sends to the attention function, simply check that model's modeling code on [GitHub](https://github.com/huggingface/transformers/tree/main/src/transformers/models)!
+
+## Accessing current available implementations
+
+Most of the time, you will simply need to `register` a new function. If, however, you need to access an existing one,
+and/or perform a few checks, the prefered way is to use the global `ALL_ATTENTION_FUNCTIONS`. It behaves the same way you
+would expect from a usual Python dictionary:
+
+```python
+>>> from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
+
+>>> list(ALL_ATTENTION_FUNCTIONS.keys())
+>>> ['flash_attention_2', 'flex_attention', 'sdpa']
+
+>>> ALL_ATTENTION_FUNCTIONS["sdpa"]
+>>> <function transformers.integrations.sdpa_attention.sdpa_attention_forward>
+
+>>> ALL_ATTENTION_FUNCTIONS.get("sdpa", None)
+>>> <function transformers.integrations.sdpa_attention.sdpa_attention_forward>
+
+# You can also globally `register` a new function directly on it
+>>> ALL_ATTENTION_FUNCTIONS.register("new_func", new_func)
+```
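For reference, the registration flow documented in this file, assembled from the snippets in the diff above into one runnable sketch (illustrative only; it assumes the `AttentionInterface.register` API and the `my_new_sdpa` example shown earlier in this file):

```python
import torch
from transformers import AutoModelForCausalLM, AttentionInterface
from transformers.integrations.sdpa_attention import sdpa_attention_forward

def my_new_sdpa(*args, **kwargs):
    # Wrap the stock SDPA implementation and add a side effect.
    print("I just entered the attention computation")
    return sdpa_attention_forward(*args, **kwargs)

# Register globally, then select it per model via `attn_implementation`.
AttentionInterface.register("my_new_sdpa", my_new_sdpa)

model_id = "meta-llama/Llama-3.2-1B"
model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="my_new_sdpa")
model(torch.ones(1, 5, dtype=int))  # prints once per attention layer
```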

docs/source/en/index.md

Lines changed: 0 additions & 1 deletion
@@ -43,4 +43,3 @@ Transformers is designed for developers and machine learning engineers and resea
 </a>
 </div>

-Join us on the Hugging Face [Hub](https://huggingface.co/), [Discord](https://discord.com/invite/JfAtkvEtRb), or [forum](https://discuss.huggingface.co/) to collaborate and build models, datasets, and applications together.
