
Commit 2902839

Merge pull request #54 from huggingface/fix-tp-pipeline
Fix tp pipeline
2 parents: fb495fd + 83282a1 · commit 2902839

File tree: 4 files changed, +37 -9 lines changed

src/transformers/models/auto/tokenization_auto.py
src/transformers/models/llama4/modeling_llama4.py
src/transformers/pipelines/base.py
src/transformers/quantizers/base.py


src/transformers/models/auto/tokenization_auto.py

Lines changed: 14 additions & 1 deletion
@@ -292,7 +292,20 @@
                 "LlamaTokenizerFast" if is_tokenizers_available() else None,
             ),
         ),
-        ("llama4", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
+        (
+            "llama4",
+            (
+                "LlamaTokenizer" if is_sentencepiece_available() else None,
+                "LlamaTokenizerFast" if is_tokenizers_available() else None,
+            ),
+        ),
+        (
+            "llama4_text",
+            (
+                "LlamaTokenizer" if is_sentencepiece_available() else None,
+                "LlamaTokenizerFast" if is_tokenizers_available() else None,
+            ),
+        ),
         ("llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
         ("llava_next", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
         ("llava_next_video", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),

src/transformers/models/llama4/modeling_llama4.py

Lines changed: 6 additions & 3 deletions
@@ -746,7 +746,10 @@ def _update_causal_mask(
     ):
         if self.config._attn_implementation == "flash_attention_2":
             if attention_mask is not None and (attention_mask == 0.0).any():
-                return attention_mask, attention_mask  # flash does not support chunked attn
+                return attention_mask, attention_mask  # flash does not support chunked attn TODO support flash
+            return None, None
+
+        if self.config._attn_implementation not in ["sdpa", "flex_attention", "eager"]:
             return None, None

         sequence_length = input_tensor.shape[1]
@@ -808,8 +811,8 @@ def _update_causal_mask(
             if sequence_length == 1:
                 chunked_attention_mask = chunked_attention_mask[-1:]
             if self.config._attn_implementation == "eager":
-                chunked_attention_mask = chunked_attention_mask[None,None,:,:].to(dtype).masked_fill(
-                    chunked_attention_mask, min_dtype
+                chunked_attention_mask = (
+                    chunked_attention_mask[None, None, :, :].to(dtype).masked_fill(chunked_attention_mask, min_dtype)
                 )

         if (
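For the eager path above, the boolean chunked mask is converted into an additive float mask. A small standalone sketch of that conversion, with toy shapes and illustrative names:

# Toy reproduction of the reformatted eager-path expression; shapes and names are illustrative.
import torch

dtype = torch.float32
min_dtype = torch.finfo(dtype).min
# Boolean mask over [query_len, key_len]; True marks the positions masked_fill will overwrite.
chunked_attention_mask = torch.tensor([[False, True], [False, False]])

additive_mask = (
    chunked_attention_mask[None, None, :, :].to(dtype).masked_fill(chunked_attention_mask, min_dtype)
)
print(additive_mask.shape)  # torch.Size([1, 1, 2, 2])
print(additive_mask)        # True positions hold min_dtype (effectively -inf), the rest hold 0.0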

src/transformers/pipelines/base.py

Lines changed: 2 additions & 0 deletions
@@ -981,6 +981,8 @@ def __init__(
         else:
             self.device = device if device is not None else -1

+        if torch.distributed.is_initialized():
+            self.device = self.model.device
         logger.warning(f"Device set to use {self.device}")

         self.binary_output = binary_output
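The net effect, sketched below: in a torch.distributed run (for example a tensor-parallel model launched with torchrun), the pipeline stops guessing a device and adopts the device of the already-placed model. The model id is only a placeholder.

# Sketch only: "gpt2" is a placeholder model id; any model already placed by
# tensor parallelism or a device_map would behave the same way.
import torch
from transformers import pipeline

pipe = pipeline("text-generation", model="gpt2")
if torch.distributed.is_initialized():
    # With this change, the pipeline's device tracks the model's device.
    assert pipe.device == pipe.model.device
print(pipe.device)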

src/transformers/quantizers/base.py

Lines changed: 15 additions & 5 deletions
@@ -294,26 +294,35 @@ def is_serializable(self, safe_serialization=None): ...
     @property
     @abstractmethod
     def is_trainable(self): ...
-
+
     def _convert_model_for_quantization(self, model):
         from accelerate import init_empty_weights
+
         for name, module in model.named_modules():
             module_class_name = module.__class__.__name__
-            if module_class_name in MODULES_TO_PATCH_FOR_QUANTIZATION.keys() and self.quantization_config.quant_method == QuantizationMethod.COMPRESSED_TENSORS:
+            if (
+                module_class_name in MODULES_TO_PATCH_FOR_QUANTIZATION.keys()
+                and self.quantization_config.quant_method == QuantizationMethod.COMPRESSED_TENSORS
+            ):
                 with init_empty_weights():
                     parent_module, name = get_module_from_name(model, name)
-                    parent_module._modules[name] = MODULES_TO_PATCH_FOR_QUANTIZATION[module_class_name](model.config.get_text_config())
+                    parent_module._modules[name] = MODULES_TO_PATCH_FOR_QUANTIZATION[module_class_name](
+                        model.config.get_text_config()
+                    )
+

 class SequentialLlama4TextExperts(torch.nn.ModuleList):
     """
     A module that implements a compressed version of a list of expert modules.
     This is specifically designed to work with Llama4TextExperts in MoE layers.
     """
+
     def __init__(self, config):
         from transformers.models.llama4.modeling_llama4 import Llama4TextMLP
+
         super().__init__([Llama4TextMLP(config) for _ in range(config.num_local_experts)])
         self.num_experts = config.num_local_experts
-
+
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -324,4 +333,5 @@ def forward(
             routed_out[expert_idx] = self[expert_idx](hidden_states[expert_idx])
         return routed_out

-MODULES_TO_PATCH_FOR_QUANTIZATION = { "Llama4TextExperts": SequentialLlama4TextExperts }
+
+MODULES_TO_PATCH_FOR_QUANTIZATION = {"Llama4TextExperts": SequentialLlama4TextExperts}
