Adding more docs and references

datumbox · datumbox · commit 1330d9ce98b3 · 2022-05-30T13:06:04.000+01:00
diff --git a/docs/source/models.rst b/docs/source/models.rst
@@ -459,6 +459,7 @@ pre-trained weights:
 .. toctree::
    :maxdepth: 1
 
+   models/video_mvitv2
    models/video_resnet
 
 |
diff --git a/docs/source/models/video_mvitv2.rst b/docs/source/models/video_mvitv2.rst
@@ -0,0 +1,28 @@
+Video MViTV2
+============
+
+.. currentmodule:: torchvision.models.video
+
+The MViTv2 model is based on the
+`MViTv2: Improved Multiscale Vision Transformers for Classification and Detection
+<https://arxiv.org/abs/2112.01526>`__ and `Multiscale Vision Transformers
+<https://arxiv.org/abs/2104.11227>`__ papers.
+
+
+Model builders
+--------------
+
+The following model builders can be used to instantiate an MViTV2 model, with or
+without pre-trained weights. All the model builders internally rely on the
+``torchvision.models.video.MViTv2`` base class. Please refer to the `source
+code
+<https://github.com/pytorch/vision/blob/main/torchvision/models/video/mvitv2.py>`_ for
+more details about this class.
+
+.. autosummary::
+    :toctree: generated/
+    :template: function.rst
+
+    mvitv2_t
+    mvitv2_s
+    mvitv2_b
diff --git a/torchvision/models/video/mvitv2.py b/torchvision/models/video/mvitv2.py
@@ -468,7 +468,8 @@ def mvitv2_t(*, weights: Optional[MViTV2_T_Weights] = None, progress: bool = Tru
     """
     Constructs a tiny MViTv2 architecture from
     `MViTv2: Improved Multiscale Vision Transformers for Classification and Detection
-    <https://arxiv.org/abs/2112.01526>`__.
+    <https://arxiv.org/abs/2112.01526>`__ and `Multiscale Vision Transformers
+    <https://arxiv.org/abs/2104.11227>`__.
 
     Args:
         weights (:class:`~torchvision.models.video.MViTV2_T_Weights`, optional): The
@@ -503,9 +504,10 @@ def mvitv2_t(*, weights: Optional[MViTV2_T_Weights] = None, progress: bool = Tru
 
 def mvitv2_s(*, weights: Optional[MViTV2_S_Weights] = None, progress: bool = True, **kwargs: Any) -> MViTv2:
     """
-    Constructs a tiny MViTv2 architecture from
+    Constructs a small MViTv2 architecture from
     `MViTv2: Improved Multiscale Vision Transformers for Classification and Detection
-    <https://arxiv.org/abs/2112.01526>`__.
+    <https://arxiv.org/abs/2112.01526>`__ and `Multiscale Vision Transformers
+    <https://arxiv.org/abs/2104.11227>`__.
 
     Args:
         weights (:class:`~torchvision.models.video.MViTV2_S_Weights`, optional): The
@@ -540,9 +542,10 @@ def mvitv2_s(*, weights: Optional[MViTV2_S_Weights] = None, progress: bool = Tru
 
 def mvitv2_b(*, weights: Optional[MViTV2_B_Weights] = None, progress: bool = True, **kwargs: Any) -> MViTv2:
     """
-    Constructs a tiny MViTv2 architecture from
+    Constructs a base MViTv2 architecture from
     `MViTv2: Improved Multiscale Vision Transformers for Classification and Detection
-    <https://arxiv.org/abs/2112.01526>`__.
+    <https://arxiv.org/abs/2112.01526>`__ and `Multiscale Vision Transformers
+    <https://arxiv.org/abs/2104.11227>`__.
 
     Args:
         weights (:class:`~torchvision.models.video.MViTV2_B_Weights`, optional): The

Original file line number	Diff line number	Diff line change
`@@ -459,6 +459,7 @@ pre-trained weights:`
`459`	`459`	`.. toctree::`
`460`	`460`	`:maxdepth: 1`
`461`	`461`
	`462`	`+ models/video_mvitv2`
`462`	`463`	`models/video_resnet`
`463`	`464`
`464`	`465`	`\|`