Commit 3e677c9

Add initial docstrings
1 parent 9decb19 commit 3e677c9

File tree: 1 file changed (+125, -13 lines changed)

src/transformers/models/mllama/modeling_mllama.py (+125, -13)

@@ -32,7 +32,9 @@
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
 from ...utils import (
     add_start_docstrings,
+    add_start_docstrings_to_model_forward,
     logging,
+    replace_return_docstrings,
 )
 from .configuration_mllama import MllamaConfig, MllamaTextConfig, MllamaVisionConfig

@@ -175,16 +177,6 @@ def _prepare_aspect_ratio_attention_mask(
     return attention_mask


-@dataclass
-class MllamaOutput(ModelOutput):
-    loss: torch.FloatTensor = None
-    logits: torch.FloatTensor = None
-    past_key_values: List[List[torch.FloatTensor]] = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
-    image_hidden_states: Optional[torch.FloatTensor] = None
-
-
 class MllamaPrecomputedAspectRatioEmbedding(nn.Module):
     def __init__(self, config: MllamaVisionConfig, is_gated: bool = True):
         super().__init__()
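
The ad-hoc `MllamaOutput` dataclass is deleted here; the later hunks in this commit annotate `forward` as returning the standard `CausalLMOutputWithPast` instead. A minimal sketch of what downstream code looks like against that standard output, assuming a loaded `model` and processed `inputs` (neither is defined in this diff):

```python
# Minimal sketch; `model` (an MllamaForConditionalGeneration) and `inputs`
# (a processor output) are assumed to exist and are not defined in this diff.
import torch
from transformers.modeling_outputs import CausalLMOutputWithPast

with torch.no_grad():
    outputs = model(**inputs, use_cache=True, return_dict=True)

# Per the new return annotation further down, this is a CausalLMOutputWithPast,
# so the familiar fields (loss, logits, past_key_values, hidden_states, attentions) apply.
assert isinstance(outputs, CausalLMOutputWithPast)
next_token_logits = outputs.logits[:, -1, :]
cache = outputs.past_key_values
```
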
@@ -1505,6 +1497,87 @@ def prepare_inputs_for_generation(
         return model_inputs


+MLLAMA_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, max_num_images, max_num_tiles, channels, image_size, image_size)`):
+            The tensors corresponding to the input images. Pixel values can be obtained using
+            [`AutoImageProcessor`]. See [`MllamaImageProcessor.__call__`] for details ([`MllamaProcessor`] uses
+            [`MllamaImageProcessor`] for processing images).
+        aspect_ratio_mask: Optional[List[List[int]]] = None,  # TODO
+        aspect_ratio_ids: Optional[torch.Tensor] = None,  # TODO
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
+            `past_key_values`).
+
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        cross_attention_mask: Optional[torch.Tensor] = None,  # TODO
+        cross_attention_states: Optional[torch.Tensor] = None,  # TODO
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
+            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+            Two formats are allowed:
+            - a [`~cache_utils.Cache`] instance, see our
+              [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
+            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+              shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`. This is also known as the legacy
+              cache format.
+
+            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
+            legacy cache format will be returned.
+
+            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+            of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
+            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
+            the complete sequence length.
+"""


 @add_start_docstrings(
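
The new `MLLAMA_INPUTS_DOCSTRING` above describes two accepted `past_key_values` formats. Here is a hedged sketch of the difference, again assuming `model` and `inputs` such as those built in the generation example further down; `DynamicCache` is one concrete `Cache` implementation from `transformers`:

```python
# Sketch only: `model` and `inputs` are assumed to exist (see the generation
# example later in this diff); DynamicCache is one concrete Cache implementation.
import torch
from transformers import DynamicCache

with torch.no_grad():
    # Format 1: a Cache instance. The docstring says the model returns the same
    # cache format it was fed, so the returned cache is still a Cache object.
    out = model(**inputs, past_key_values=DynamicCache(), use_cache=True)
    cache = out.past_key_values

    # Format 2 (legacy): tuple(tuple(torch.FloatTensor)) of length config.n_layers,
    # each inner tuple holding key/value tensors of shape
    # (batch_size, num_heads, sequence_length, embed_size_per_head). Per the
    # docstring, this is what comes back when no past_key_values are passed.
    out_legacy = model(**inputs, use_cache=True)
    legacy_cache = out_legacy.past_key_values
```
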
@@ -1556,11 +1629,13 @@ def get_decoder(self):
     def tie_weights(self):
         return self.language_model.tie_weights()

+    @add_start_docstrings_to_model_forward(MLLAMA_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        pixel_values: torch.FloatTensor = None,  # shape: [batch_size, num_images, num_tiles, channels, height, width]
-        aspect_ratio_mask: Optional[List[List[int]]] = None,  # shape: [batch_size, num_images]; num tiles per image
+        pixel_values: Optional[torch.FloatTensor] = None,
+        aspect_ratio_mask: Optional[List[List[int]]] = None,
         aspect_ratio_ids: Optional[torch.Tensor] = None,
         attention_mask: Optional[List[List[List[int]]]] = None,
         cross_attention_mask: Optional[torch.Tensor] = None,
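
For context, the two decorators added above are standard transformers documentation utilities: `add_start_docstrings_to_model_forward` injects the shared inputs docstring into `forward.__doc__`, and `replace_return_docstrings` fills the `Returns:` section from the given output class. A rough, self-contained sketch of the pattern; the `Toy*` names are illustrative, not part of this commit:

```python
# Rough sketch of the decorator pattern used above; ToyModel, ToyConfig and
# TOY_INPUTS_DOCSTRING are illustrative names, not part of this commit.
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.utils import add_start_docstrings_to_model_forward, replace_return_docstrings

TOY_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.
"""


class ToyModel:
    @add_start_docstrings_to_model_forward(TOY_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class="ToyConfig")
    def forward(self, input_ids=None):
        r"""
        Returns:
        """
        ...


# After decoration, ToyModel.forward.__doc__ contains the shared inputs section
# plus an auto-generated description of CausalLMOutputWithPast.
print(ToyModel.forward.__doc__)
```
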
@@ -1575,7 +1650,44 @@ def forward(
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         num_logits_to_keep: int = 0,
-    ) -> MllamaOutput:
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+            num_logits_to_keep (`int`, *optional*):
+                Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
+                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+                token can save memory, which becomes pretty significant for long sequences or a large vocabulary size.
+
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, MllamaForConditionalGeneration
+
+        >>> model = MllamaForConditionalGeneration.from_pretrained("<mllama-checkpoint>")
+        >>> processor = AutoProcessor.from_pretrained("<mllama-checkpoint>")
+
+        >>> prompt = "<|image|><|begin_of_text|>If I had to write a haiku for this one"
+        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "TODO: fill this out"
+        ```"""
+
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError(
                 "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"

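The `num_logits_to_keep` note added above has a direct shape consequence; a hedged illustration, once more assuming `model` and `inputs` as in the docstring example:

```python
# Hedged illustration of the num_logits_to_keep note above; `model` and `inputs`
# are assumed to come from the processor/generation example in the docstring.
import torch

with torch.no_grad():
    full = model(**inputs).logits                        # (batch_size, sequence_length, vocab_size)
    last = model(**inputs, num_logits_to_keep=1).logits  # (batch_size, 1, vocab_size): last position only

print(full.shape, last.shape)
```
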
0 commit comments
