
Commit 35ff071

Merge pull request huggingface#9 from huggingface/raushan-working
Raushan address PR comments
2 parents ae80685 + aa9d141 commit 35ff071

16 files changed: +760 −389 lines changed

docs/source/en/model_doc/gemma3.md

+38
@@ -26,13 +26,51 @@ This model was contributed by [INSERT](INSERT).
 
 ## Usage tips
 
+
 - For image+text and image-only inputs use `Gemma3ForConditionalGeneration`.
 - For text-only inputs use `Gemma3ForCausalLM` for generation to avoid loading the vision tower.
 - Each sample can contain multiple images, and the number of images can vary between samples. However, make sure to pass correctly batched images to the processor, where each batch is a list of one or more images.
 - The text passed to the processor should have the `"<start_of_image>"` token where the images should be inserted.
 - The processor has its own `apply_chat_template` method to convert chat messages to text that can then be passed as text to the processor. You can also get a vectorized output from `apply_chat_template`. See the examples below for more details on how to use it.
 
 
+### Image cropping for high resolution images
+
+`do_pan_and_scan`
+
+The model supports cropping images into smaller patches when the image aspect ratio exceeds a certain value. By default the images are not cropped and only the base image is forwarded to the model. Users can set `do_pan_and_scan=True` to obtain several crops per image along with the base image, to improve the quality in DocVQA or similar tasks requiring higher-resolution images.
+
+```python
+processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it", padding_side="left")
+
+url = "https://media.istockphoto.com/id/1192867753/photo/cow-in-berchida-beach-siniscola.jpg?s=612x612&w=0&k=20&c=v0hjjniwsMNfJSuKWZuIn8pssmD5h5bSN1peBd1CmH4="
+messages = [
+    {
+        "role": "system",
+        "content": [
+            {"type": "text", "text": "You are a helpful assistant."}
+        ]
+    },
+    {
+        "role": "user", "content": [
+            {"type": "image", "url": url},
+            {"type": "text", "text": "What is shown in this image?"},
+        ]
+    },
+]
+inputs = processor.apply_chat_template(
+    messages,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt",
+    add_generation_prompt=True,
+    do_pan_and_scan=True,
+).to(model.device)
+```
+
 ## Usage Example
 
 ### Single-image Inference
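
The doc snippet added above builds `inputs` but does not show the model being loaded or generation being run. Below is a minimal end-to-end sketch of how those pan-and-scan inputs would typically be consumed; the checkpoint name comes from the doc snippet, but the model-loading and `generate`/decode code is an illustrative assumption, not part of this commit.

```python
# Hedged sketch: consuming the `do_pan_and_scan` inputs shown in the doc diff above.
# Model loading and generation are assumptions for illustration, not part of this commit.
import torch
from transformers import AutoProcessor, Gemma3ForConditionalGeneration

model_id = "google/gemma-3-4b-it"  # same checkpoint as in the doc snippet
model = Gemma3ForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_id, padding_side="left")

messages = [
    {"role": "user", "content": [
        {"type": "image", "url": "https://media.istockphoto.com/id/1192867753/photo/cow-in-berchida-beach-siniscola.jpg?s=612x612&w=0&k=20&c=v0hjjniwsMNfJSuKWZuIn8pssmD5h5bSN1peBd1CmH4="},
        {"type": "text", "text": "What is shown in this image?"},
    ]},
]
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
    add_generation_prompt=True,
    do_pan_and_scan=True,  # enables the extra crops described in the usage tip
).to(model.device)

with torch.inference_mode():
    generated = model.generate(**inputs, max_new_tokens=50)

# Decode only the newly generated tokens.
print(processor.batch_decode(generated[:, inputs["input_ids"].shape[-1]:], skip_special_tokens=True)[0])
```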

src/transformers/__init__.py

+4 −3
@@ -1260,7 +1260,7 @@
     _import_structure["models.emu3"].append("Emu3ImageProcessor")
     _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"])
     _import_structure["models.fuyu"].extend(["FuyuImageProcessor", "FuyuProcessor"])
-    _import_structure["models.gemma3"].append("Gemma3ImageProcessor")
+    _import_structure["models.gemma3"].extend(("Gemma3ImageProcessor", "Gemma3ImageProcessorFast"))
     _import_structure["models.glpn"].extend(["GLPNFeatureExtractor", "GLPNImageProcessor"])
     _import_structure["models.got_ocr2"].extend(["GotOcr2ImageProcessor"])
     _import_structure["models.grounding_dino"].extend(["GroundingDinoImageProcessor"])
@@ -2458,9 +2458,9 @@
         [
             "Gemma3ForCausalLM",
             "Gemma3ForConditionalGeneration",
-            "Gemma3Model",
             "Gemma3PreTrainedModel",
             "Gemma3Processor",
+            "Gemma3TextModel",
         ]
     )
     _import_structure["models.git"].extend(
@@ -6548,6 +6548,7 @@
         from .models.deit import DeiTImageProcessorFast
         from .models.depth_pro import DepthProImageProcessorFast
         from .models.detr import DetrImageProcessorFast
+        from .models.gemma3 import Gemma3ImageProcessorFast
         from .models.got_ocr2 import GotOcr2ImageProcessorFast
         from .models.llava import LlavaImageProcessorFast
         from .models.llava_next import LlavaNextImageProcessorFast
@@ -7477,9 +7478,9 @@
         from .models.gemma3 import (
             Gemma3ForCausalLM,
             Gemma3ForConditionalGeneration,
-            Gemma3Model,
             Gemma3PreTrainedModel,
             Gemma3Processor,
+            Gemma3TextModel,
         )
         from .models.git import (
             GitForCausalLM,
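
Taken together, the hunks above rename the exported text backbone from `Gemma3Model` to `Gemma3TextModel` and register the new `Gemma3ImageProcessorFast` alongside the slow processor. A minimal sketch of the resulting public imports, assuming a `transformers` build that includes this PR:

```python
# Sketch of the public API after this change (assumes a transformers build containing this PR).
from transformers import (
    Gemma3ForCausalLM,
    Gemma3ForConditionalGeneration,
    Gemma3ImageProcessor,
    Gemma3ImageProcessorFast,  # new export added in this hunk (requires torchvision)
    Gemma3TextModel,           # replaces the former Gemma3Model export
)
```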

src/transformers/models/auto/image_processing_auto.py

+1 −1
@@ -86,7 +86,7 @@
         ("flava", ("FlavaImageProcessor",)),
         ("focalnet", ("BitImageProcessor",)),
         ("fuyu", ("FuyuImageProcessor",)),
-        ("gemma3", ("SiglipImageProcessor", "SiglipImageProcessorFast")),
+        ("gemma3", ("Gemma3ImageProcessor", "Gemma3ImageProcessorFast")),
         ("git", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
         ("glpn", ("GLPNImageProcessor",)),
         ("got_ocr2", ("GotOcr2ImageProcessor", "GotOcr2ImageProcessorFast")),

src/transformers/models/auto/modeling_auto.py

+1 −1
@@ -118,7 +118,7 @@
         ("funnel", ("FunnelModel", "FunnelBaseModel")),
         ("gemma", "GemmaModel"),
         ("gemma2", "Gemma2Model"),
-        ("gemma3_text", "Gemma3Model"),
+        ("gemma3_text", "Gemma3TextModel"),
         ("git", "GitModel"),
         ("glm", "GlmModel"),
         ("glpn", "GLPNModel"),

src/transformers/models/gemma3/__init__.py

+1
@@ -20,6 +20,7 @@
 if TYPE_CHECKING:
     from .configuration_gemma3 import *
     from .image_processing_gemma3 import *
+    from .image_processing_gemma3_fast import *
     from .modeling_gemma3 import *
     from .processing_gemma3 import *
 else:

src/transformers/models/gemma3/configuration_gemma3.py

+98 −80
@@ -32,17 +32,16 @@
 
 
 class Gemma3TextConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`Gemma3Model`]. It is used to instantiate a Gemma3
+    This is the configuration class to store the configuration of a [`Gemma3TextModel`]. It is used to instantiate a Gemma3Text
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    defaults will yield a similar configuration to that of the Gemma3-4B.
-    e.g. [google/gemma-3-4b](https://huggingface.co/google/gemma-3-4b)
+    defaults will yield a similar configuration to that of the Gemma3Text-7B.
+    e.g. [google/gemma3_text-7b](https://huggingface.co/google/gemma3_text-7b)
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
-
     Args:
-        vocab_size (`int`, *optional*, defaults to 262144):
-            Vocabulary size of the Gemma3 model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`Gemma3Model`]
+        vocab_size (`int`, *optional*, defaults to 262208):
+            Vocabulary size of the Gemma3Text model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Gemma3TextModel`]
         hidden_size (`int`, *optional*, defaults to 2304):
             Dimension of the hidden representations.
         intermediate_size (`int`, *optional*, defaults to 9216):
@@ -61,14 +60,43 @@ class Gemma3TextConfig(PretrainedConfig):
             `num_attention_heads`.
         head_dim (`int`, *optional*, defaults to 256):
             The attention head dimension.
-        sliding_window (`int`, *optional*, defaults to 4096): in Gemma3, every other layer uses sliding window
-            attention. This is the size of the sliding window.
-        query_pre_attn_scalar (`float`, *optional*):
-            The scaling factor used on the attention scores, not that
+        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
+            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
+        max_position_embeddings (`int`, *optional*, defaults to 131072):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        eos_token_id (`int`, *optional*, defaults to 1):
+            End of stream token id.
+        bos_token_id (`int`, *optional*, defaults to 2):
+            Beginning of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
         rope_theta (`float`, *optional*, defaults to 1000000.0):
-            The base period of the RoPE embeddings used for global attention.
+            The base period of the RoPE embeddings.
+        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
+            Scaling factor used on the attention scores
+        sliding_window (`int`, *optional*, defaults to 4096): in Gemma3Text, every other layer uses sliding window attention. This is the
+            size of the sliding window.
+        final_logit_softcapping (`float`, *optional*):
+            Scaling factor when applying tanh softcapping on the logits.
+        attn_logit_softcapping (`float`, *optional*):
+            Scaling factor when applying tanh softcapping on the attention scores.
+        cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.
         rope_scaling (`Dict`, *optional*):
-            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. NOTE: if you apply new rope type
             and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
             accordingly.
             Expected contents:
@@ -108,79 +136,68 @@ class Gemma3TextConfig(PretrainedConfig):
             The base period of the RoPE embeddings for local attention.
         sliding_window_pattern (`int`, *optional*, defaults to 6):
             Pattern for the sliding window attention.
-        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
-            The epsilon used by the rms normalization layers.
-        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
-            The non-linear activation function (function or string) in the decoder. Will default to
-            `"gelu_pytorch_tanh"` if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"`
-            activation function.
-        pad_token_id (`int`, *optional*, defaults to 0):
-            Padding token id.
-        eos_token_id (`int`, *optional*, defaults to 1):
-            End of stream token id.
-        bos_token_id (`int`, *optional*, defaults to 2):
-            Beginning of stream token id.
-        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
-            Whether to tie weight embeddings
-        max_position_embeddings (`int`, *optional*, defaults to 131072):
-            The maximum sequence length that this model might ever be used with.
-        initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        attention_bias (`bool`, *optional*, defaults to `False`):
-            Whether to use a bias in the query, key, value and output projection layers during self-attention.
-        attention_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for the attention probabilities.
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if `config.is_decoder=True`.
-        final_logit_softcapping (`bool`, *optional*, defaults to `True`):
-            Whether to apply logit softcapping or nor
-        attn_logit_softcapping (`float`, *optional*, defaults to 50.0):
-            Scaling factor when applying tanh soft-capping on the attention scorexs.
-        cache_implementation (`str`, *optional*, defaults to `"hybrid"`):
-            The cache type to be used with `generate`.
 
     ```python
-    >>> from transformers import Gemma3Model, Gemma3TextConfig
-    >>> # Initializing a Gemma3 gemma3-4b style configuration
-    >>> configuration = Gemma3Config()
-    >>> # Initializing a model from the gemma3-4b style configuration
-    >>> model = Gemma3Model(configuration)
+    >>> from transformers import Gemma3TextModel, Gemma3TextConfig
+    >>> # Initializing a Gemma3Text gemma3_text-7b style configuration
+    >>> configuration = Gemma3TextConfig()
+    >>> # Initializing a model from the gemma3_text-7b style configuration
+    >>> model = Gemma3TextModel(configuration)
     >>> # Accessing the model configuration
     >>> configuration = model.config
-    ```"""
+    ```
+        rope_local_base_freq (float, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings for local attention.
+        sliding_window_pattern (`int`, *optional*, defaults to 6):
+            Pattern for the sliding window attention.
+    """
 
     model_type = "gemma3_text"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
 
     def __init__(
         self,
-        vocab_size: int = 262_144,
-        hidden_size: int = 2304,
-        intermediate_size: int = 9216,
-        num_hidden_layers: int = 26,
-        num_attention_heads: int = 8,
-        num_key_value_heads: int = 4,
-        head_dim: int = 256,
-        sliding_window: int = 4096,
-        query_pre_attn_scalar: Optional[float] = None,
-        rope_theta: float = 1_000_000.0,
-        rope_scaling=None,
-        rope_local_base_freq: float = 10_000.0,
-        sliding_window_pattern: int = 6,
-        rms_norm_eps: float = 1e-6,
-        hidden_activation: str = "gelu_pytorch_tanh",
-        pad_token_id: int = 0,
-        eos_token_id: int = 1,
-        bos_token_id: int = 2,
-        tie_word_embeddings: bool = True,
-        max_position_embeddings: int = 131_072,
-        initializer_range: float = 0.02,
-        attention_bias: bool = False,
-        attention_dropout: float = 0.0,
-        use_cache: bool = True,
+        vocab_size=262_208,
+        hidden_size=2304,
+        intermediate_size=9216,
+        num_hidden_layers=26,
+        num_attention_heads=8,
+        num_key_value_heads=4,
+        head_dim=256,
+        hidden_activation="gelu_pytorch_tanh",
+        max_position_embeddings=131_072,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        eos_token_id=1,
+        bos_token_id=2,
+        tie_word_embeddings=True,
+        rope_theta=1_000_000.0,
+        attention_bias=False,
+        attention_dropout=0.0,
+        query_pre_attn_scalar=256,
+        sliding_window=4096,
         final_logit_softcapping=None,
         attn_logit_softcapping=None,
-        cache_implementation: str = "hybrid",
+        cache_implementation="hybrid",
+        rope_scaling=None,
+        rope_local_base_freq=10_000.0,
+        sliding_window_pattern=6,
         **kwargs,
     ):
         super().__init__(
@@ -190,7 +207,6 @@ def __init__(
             tie_word_embeddings=tie_word_embeddings,
             **kwargs,
         )
-
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.hidden_size = hidden_size
@@ -203,10 +219,6 @@ def __init__(
         self.rms_norm_eps = rms_norm_eps
         self.use_cache = use_cache
         self.rope_theta = rope_theta
-        self.rope_scaling = rope_scaling
-        self.rope_local_base_freq = rope_local_base_freq
-        # For configuring HybridCache to work with 5:1 attention pattern
-        self.sliding_window_pattern = sliding_window_pattern
         self.attention_bias = attention_bias
         self.attention_dropout = attention_dropout
         self.hidden_activation = hidden_activation
@@ -215,6 +227,11 @@ def __init__(
         self.final_logit_softcapping = final_logit_softcapping
         self.attn_logit_softcapping = attn_logit_softcapping
         self.cache_implementation = cache_implementation
+
+        self.rope_local_base_freq = rope_local_base_freq
+        # For configuring HybridCache to work with 5:1 attention pattern
+        self.sliding_window_pattern = sliding_window_pattern
+        self.rope_scaling = rope_scaling
         rope_config_validation(self)
 
 
@@ -245,6 +262,7 @@ class Gemma3Config(PretrainedConfig):
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
 
+
     Example:
 
     ```python