 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
 from ...utils import (
     add_start_docstrings,
+    add_start_docstrings_to_model_forward,
     logging,
+    replace_return_docstrings,
 )
 from .configuration_mllama import MllamaConfig, MllamaTextConfig, MllamaVisionConfig
 
@@ -175,16 +177,6 @@ def _prepare_aspect_ratio_attention_mask(
     return attention_mask
 
 
-@dataclass
-class MllamaOutput(ModelOutput):
-    loss: torch.FloatTensor = None
-    logits: torch.FloatTensor = None
-    past_key_values: List[List[torch.FloatTensor]] = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
-    image_hidden_states: Optional[torch.FloatTensor] = None
-
-
 class MllamaPrecomputedAspectRatioEmbedding(nn.Module):
     def __init__(self, config: MllamaVisionConfig, is_gated: bool = True):
         super().__init__()
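The deleted `MllamaOutput` dataclass duplicates the library's generic `CausalLMOutputWithPast`, which the later hunks adopt as the `forward` return type and in `replace_return_docstrings`. A minimal comparison sketch, not part of this diff:

```python
# Not part of the diff: field names of the standard output class that replaces
# the deleted MllamaOutput; the old dataclass had the same fields plus
# image_hidden_states.
from transformers.modeling_outputs import CausalLMOutputWithPast

print(list(CausalLMOutputWithPast.__dataclass_fields__))
# ['loss', 'logits', 'past_key_values', 'hidden_states', 'attentions']
```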
@@ -1505,6 +1497,87 @@ def prepare_inputs_for_generation(
         return model_inputs
 
 
+MLLAMA_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, max_num_images, max_num_tiles, channels, image_size, image_size)`):
+            The tensors corresponding to the input images. Pixel values can be obtained using
+            [`AutoImageProcessor`]. See [`MllamaImageProcessor.__call__`] for details ([`MllamaProcessor`] uses
+            [`MllamaImageProcessor`] for processing images).
+        aspect_ratio_mask: Optional[List[List[int]]] = None, # TODO
+        aspect_ratio_ids: Optional[torch.Tensor] = None, # TODO
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
+            `past_key_values`).
+
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        cross_attention_mask: Optional[torch.Tensor] = None, # TODO
+        cross_attention_states: Optional[torch.Tensor] = None, # TODO
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
+            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+            Two formats are allowed:
+            - a [`~cache_utils.Cache`] instance, see our
+              [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
+            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+              shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
+              cache format.
+
+            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
+            legacy cache format will be returned.
+
+            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+            of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
+            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
+            the complete sequence length.
+"""
 
 
 @add_start_docstrings(
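The new `MLLAMA_INPUTS_DOCSTRING` documents the multimodal inputs, with the aspect-ratio and cross-attention arguments still marked TODO. As a hedged sketch of how those arguments are produced in practice (the checkpoint name is a placeholder and the exact set of returned keys is an assumption based on the argument list above, not something this diff guarantees), the processor call from the doctest further down would surface them like this:

```python
# Hedged sketch: inspect the processor outputs that feed MllamaForConditionalGeneration.forward.
from PIL import Image
import requests
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("<mllama-checkpoint>")  # placeholder checkpoint
url = "https://www.ilankelman.org/stopsigns/australia.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(text="<|image|><|begin_of_text|>Describe this image", images=image, return_tensors="pt")

for name, tensor in inputs.items():
    # Expected entries (assumption): input_ids, attention_mask, pixel_values,
    # aspect_ratio_ids, aspect_ratio_mask, cross_attention_mask.
    print(name, tuple(tensor.shape))
```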
@@ -1556,11 +1629,13 @@ def get_decoder(self):
     def tie_weights(self):
         return self.language_model.tie_weights()
 
+    @add_start_docstrings_to_model_forward(MLLAMA_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        pixel_values: torch.FloatTensor = None,  # shape: [batch_size, num_images, num_tiles, channels, height, width]
-        aspect_ratio_mask: Optional[List[List[int]]] = None,  # shape: [batch_size, num_images]; num tiles per image
+        pixel_values: Optional[torch.FloatTensor] = None,
+        aspect_ratio_mask: Optional[List[List[int]]] = None,
         aspect_ratio_ids: Optional[torch.Tensor] = None,
         attention_mask: Optional[List[List[List[int]]]] = None,
         cross_attention_mask: Optional[torch.Tensor] = None,
@@ -1575,7 +1650,44 @@ def forward(
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         num_logits_to_keep: int = 0,
-    ) -> MllamaOutput:
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+            num_logits_to_keep (`int`, *optional*):
+                Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
+                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+                token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, MllamaForConditionalGeneration
+
+        >>> model = MllamaForConditionalGeneration.from_pretrained("<mllama-checkpoint>")
+        >>> processor = AutoProcessor.from_pretrained("<mllama-checkpoint>")
+
+        >>> prompt = "<|image|><|begin_of_text|>If I had to write a haiku for this one"
+        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "TODO: fill this out"
+        ```"""
+
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError(
                 "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"