Merge branch 'main' into add-regnet-16-32-swag

YosuaMichael · YosuaMichael · commit 99043583ba28 · 2022-04-01T20:34:08.000+01:00
diff --git a/README.rst b/README.rst
@@ -185,3 +185,10 @@ Disclaimer on Datasets
 This is a utility library that downloads and prepares public datasets. We do not host or distribute these datasets, vouch for their quality or fairness, or claim that you have license to use the dataset. It is your responsibility to determine whether you have permission to use the dataset under the dataset's license.
 
 If you're a dataset owner and wish to update any part of it (description, citation, etc.), or do not want your dataset to be included in this library, please get in touch through a GitHub issue. Thanks for your contribution to the ML community!
+
+Pre-trained Model License
+=========================
+
+The pre-trained models provided in this library may have their own licenses or terms and conditions derived from the dataset used for training. It is your responsibility to determine whether you have permission to use the models for your use case.
+
+More specifically, SWAG models are released under the CC-BY-NC 4.0 license. See `SWAG LICENSE <https://github.com/facebookresearch/SWAG/blob/main/LICENSE>`_ for additional details.
diff --git a/test/test_extended_models.py b/test/test_extended_models.py
@@ -115,7 +115,8 @@ def test_schema_meta_validation(model_fn):
                     incorrect_params.append(w)
         else:
             if w.meta.get("num_params") != weights_enum.DEFAULT.meta.get("num_params"):
-                incorrect_params.append(w)
+                if w.meta.get("num_params") != sum(p.numel() for p in model_fn(weights=w).parameters()):
+                    incorrect_params.append(w)
         if not w.name.isupper():
             bad_names.append(w)
 
diff --git a/test/test_video_reader.py b/test/test_video_reader.py
@@ -1225,7 +1225,7 @@ def test_invalid_file(self):
 
     @pytest.mark.parametrize("test_video", test_videos.keys())
     @pytest.mark.parametrize("backend", ["video_reader", "pyav"])
-    @pytest.mark.parametrize("start_offset", [0, 1000])
+    @pytest.mark.parametrize("start_offset", [0, 500])
     @pytest.mark.parametrize("end_offset", [3000, None])
     def test_audio_present_pts(self, test_video, backend, start_offset, end_offset):
         """Test if audio frames are returned with pts unit."""
diff --git a/torchvision/datasets/utils.py b/torchvision/datasets/utils.py
@@ -11,6 +11,7 @@
 import urllib
 import urllib.error
 import urllib.request
+import warnings
 import zipfile
 from typing import Any, Callable, List, Iterable, Optional, TypeVar, Dict, IO, Tuple, Iterator
 from urllib.parse import urlparse
@@ -24,22 +25,31 @@
     _is_remote_location_available,
 )
 
-
 USER_AGENT = "pytorch/vision"
 
 
-def _urlretrieve(url: str, filename: str, chunk_size: int = 1024) -> None:
-    with open(filename, "wb") as fh:
-        with urllib.request.urlopen(urllib.request.Request(url, headers={"User-Agent": USER_AGENT})) as response:
-            with tqdm(total=response.length) as pbar:
-                for chunk in iter(lambda: response.read(chunk_size), ""):
-                    if not chunk:
-                        break
-                    pbar.update(chunk_size)
-                    fh.write(chunk)
+def _save_response_content(
+    content: Iterator[bytes],
+    destination: str,
+    length: Optional[int] = None,
+) -> None:
+    with open(destination, "wb") as fh, tqdm(total=length) as pbar:
+        for chunk in content:
+            # skip empty keep-alive chunks; they carry no payload to write
+            if not chunk:
+                continue
+
+            fh.write(chunk)
+            pbar.update(len(chunk))
+
+
+def _urlretrieve(url: str, filename: str, chunk_size: int = 1024 * 32) -> None:
+    with urllib.request.urlopen(urllib.request.Request(url, headers={"User-Agent": USER_AGENT})) as response:
+        _save_response_content(iter(lambda: response.read(chunk_size), b""), filename, length=response.length)
 
 
 def gen_bar_updater() -> Callable[[int, int, int], None]:
+    warnings.warn("The function `gen_bar_updater` is deprecated since 0.13 and will be removed in 0.15.")
     pbar = tqdm(total=None)
 
     def bar_update(count, block_size, total_size):
@@ -184,11 +194,20 @@ def list_files(root: str, suffix: str, prefix: bool = False) -> List[str]:
     return files
 
 
-def _quota_exceeded(first_chunk: bytes) -> bool:
+def _extract_gdrive_api_response(response, chunk_size: int = 32 * 1024) -> Tuple[bytes, Iterator[bytes]]:
+    content = response.iter_content(chunk_size)
+    first_chunk = None
+    # skip empty keep-alive chunks until the first non-empty chunk arrives
+    while not first_chunk:
+        first_chunk = next(content)
+    content = itertools.chain([first_chunk], content)
+
     try:
-        return "Google Drive - Quota exceeded" in first_chunk.decode()
+        match = re.search("<title>Google Drive - (?P<api_response>.+?)</title>", first_chunk.decode())
+        api_response = match["api_response"] if match is not None else None
     except UnicodeDecodeError:
-        return False
+        api_response = None
+    return api_response, content
 
 
 def download_file_from_google_drive(file_id: str, root: str, filename: Optional[str] = None, md5: Optional[str] = None):
@@ -202,70 +221,41 @@ def download_file_from_google_drive(file_id: str, root: str, filename: Optional[
     """
     # Based on https://stackoverflow.com/questions/38511444/python-download-files-from-google-drive-using-url
 
-    url = "https://docs.google.com/uc?export=download"
-
     root = os.path.expanduser(root)
     if not filename:
         filename = file_id
     fpath = os.path.join(root, filename)
 
     os.makedirs(root, exist_ok=True)
 
-    if os.path.isfile(fpath) and check_integrity(fpath, md5):
-        print("Using downloaded and verified file: " + fpath)
-    else:
-        session = requests.Session()
-
-        response = session.get(url, params={"id": file_id}, stream=True)
-        token = _get_confirm_token(response)
-
-        if token:
-            params = {"id": file_id, "confirm": token}
-            response = session.get(url, params=params, stream=True)
-
-        # Ideally, one would use response.status_code to check for quota limits, but google drive is not consistent
-        # with their own API, refer https://github.com/pytorch/vision/issues/2992#issuecomment-730614517.
-        # Should this be fixed at some place in future, one could refactor the following to no longer rely on decoding
-        # the first_chunk of the payload
-        response_content_generator = response.iter_content(32768)
-        first_chunk = None
-        while not first_chunk:  # filter out keep-alive new chunks
-            first_chunk = next(response_content_generator)
-
-        if _quota_exceeded(first_chunk):
-            msg = (
-                f"The daily quota of the file {filename} is exceeded and it "
-                f"can't be downloaded. This is a limitation of Google Drive "
-                f"and can only be overcome by trying again later."
-            )
-            raise RuntimeError(msg)
-
-        _save_response_content(itertools.chain((first_chunk,), response_content_generator), fpath)
-        response.close()
+    if check_integrity(fpath, md5):
+        print(f"Using downloaded {'and verified ' if md5 else ''}file: {fpath}")
 
+    url = "https://drive.google.com/uc"
+    params = dict(id=file_id, export="download")
+    with requests.Session() as session:
+        response = session.get(url, params=params, stream=True)
 
-def _get_confirm_token(response: requests.models.Response) -> Optional[str]:
-    for key, value in response.cookies.items():
-        if key.startswith("download_warning"):
-            return value
+        for key, value in response.cookies.items():
+            if key.startswith("download_warning"):
+                token = value
+                break
+        else:
+            api_response, content = _extract_gdrive_api_response(response)
+            token = "t" if api_response == "Virus scan warning" else None
 
-    return None
+        if token is not None:
+            response = session.get(url, params=dict(params, confirm=token), stream=True)
+            api_response, content = _extract_gdrive_api_response(response)
 
+        if api_response == "Quota exceeded":
+            raise RuntimeError(
+                f"The daily quota of the file {filename} is exceeded and it "
+                f"can't be downloaded. This is a limitation of Google Drive "
+                f"and can only be overcome by trying again later."
+            )
 
-def _save_response_content(
-    response_gen: Iterator[bytes],
-    destination: str,
-) -> None:
-    with open(destination, "wb") as f:
-        pbar = tqdm(total=None)
-        progress = 0
-
-        for chunk in response_gen:
-            if chunk:  # filter out keep-alive new chunks
-                f.write(chunk)
-                progress += len(chunk)
-                pbar.update(progress - pbar.n)
-        pbar.close()
+        _save_response_content(content, fpath)
 
 
 def _extract_tar(from_path: str, to_path: str, compression: Optional[str]) -> None:
diff --git a/torchvision/io/_video_opt.py b/torchvision/io/_video_opt.py
@@ -423,16 +423,6 @@ def _probe_video_from_memory(
     return info
 
 
-def _convert_to_sec(
-    start_pts: Union[float, Fraction], end_pts: Union[float, Fraction], pts_unit: str, time_base: Fraction
-) -> Tuple[Union[float, Fraction], Union[float, Fraction], str]:
-    if pts_unit == "pts":
-        start_pts = float(start_pts * time_base)
-        end_pts = float(end_pts * time_base)
-        pts_unit = "sec"
-    return start_pts, end_pts, pts_unit
-
-
 def _read_video(
     filename: str,
     start_pts: Union[float, Fraction] = 0,
@@ -452,38 +442,28 @@ def _read_video(
 
     has_video = info.has_video
     has_audio = info.has_audio
-    video_pts_range = (0, -1)
-    video_timebase = default_timebase
-    audio_pts_range = (0, -1)
-    audio_timebase = default_timebase
-    time_base = default_timebase
-
-    if has_video:
-        video_timebase = Fraction(info.video_timebase.numerator, info.video_timebase.denominator)
-        time_base = video_timebase
-
-    if has_audio:
-        audio_timebase = Fraction(info.audio_timebase.numerator, info.audio_timebase.denominator)
-        time_base = time_base if time_base else audio_timebase
-
-    # video_timebase is the default time_base
-    start_pts_sec, end_pts_sec, pts_unit = _convert_to_sec(start_pts, end_pts, pts_unit, time_base)
 
     def get_pts(time_base):
-        start_offset = start_pts_sec
-        end_offset = end_pts_sec
+        start_offset = start_pts
+        end_offset = end_pts
         if pts_unit == "sec":
-            start_offset = int(math.floor(start_pts_sec * (1 / time_base)))
+            start_offset = int(math.floor(start_pts * (1 / time_base)))
             if end_offset != float("inf"):
-                end_offset = int(math.ceil(end_pts_sec * (1 / time_base)))
+                end_offset = int(math.ceil(end_pts * (1 / time_base)))
         if end_offset == float("inf"):
             end_offset = -1
         return start_offset, end_offset
 
+    video_pts_range = (0, -1)
+    video_timebase = default_timebase
     if has_video:
+        video_timebase = Fraction(info.video_timebase.numerator, info.video_timebase.denominator)
         video_pts_range = get_pts(video_timebase)
 
+    audio_pts_range = (0, -1)
+    audio_timebase = default_timebase
     if has_audio:
+        audio_timebase = Fraction(info.audio_timebase.numerator, info.audio_timebase.denominator)
         audio_pts_range = get_pts(audio_timebase)
 
     vframes, aframes, info = _read_video_from_file(
diff --git a/torchvision/io/video.py b/torchvision/io/video.py
@@ -287,13 +287,6 @@ def read_video(
         with av.open(filename, metadata_errors="ignore") as container:
             if container.streams.audio:
                 audio_timebase = container.streams.audio[0].time_base
-            time_base = _video_opt.default_timebase
-            if container.streams.video:
-                time_base = container.streams.video[0].time_base
-            elif container.streams.audio:
-                time_base = container.streams.audio[0].time_base
-            # video_timebase is the default time_base
-            start_pts, end_pts, pts_unit = _video_opt._convert_to_sec(start_pts, end_pts, pts_unit, time_base)
             if container.streams.video:
                 video_frames = _read_from_stream(
                     container,
diff --git a/torchvision/models/vision_transformer.py b/torchvision/models/vision_transformer.py
@@ -1,7 +1,7 @@
 import math
 from collections import OrderedDict
 from functools import partial
-from typing import Any, Callable, List, NamedTuple, Optional
+from typing import Any, Callable, List, NamedTuple, Optional, Sequence
 
 import torch
 import torch.nn as nn
@@ -284,10 +284,21 @@ def _vision_transformer(
     progress: bool,
     **kwargs: Any,
 ) -> VisionTransformer:
-    image_size = kwargs.pop("image_size", 224)
-
     if weights is not None:
         _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+        if isinstance(weights.meta["size"], int):
+            _ovewrite_named_param(kwargs, "image_size", weights.meta["size"])
+        elif isinstance(weights.meta["size"], Sequence):
+            if len(weights.meta["size"]) != 2 or weights.meta["size"][0] != weights.meta["size"][1]:
+                raise ValueError(
+                    f'size: {weights.meta["size"]} is not valid! Currently we only support a 2-dimensional square and width = height'
+                )
+            _ovewrite_named_param(kwargs, "image_size", weights.meta["size"][0])
+        else:
+            raise ValueError(
+                f'weights.meta["size"]: {weights.meta["size"]} is not valid, the type should be either an int or a Sequence[int]'
+            )
+    image_size = kwargs.pop("image_size", 224)
 
     model = VisionTransformer(
         image_size=image_size,
@@ -313,6 +324,14 @@ def _vision_transformer(
     "interpolation": InterpolationMode.BILINEAR,
 }
 
+_COMMON_SWAG_META = {
+    **_COMMON_META,
+    "publication_year": 2022,
+    "recipe": "https://github.com/facebookresearch/SWAG",
+    "license": "https://github.com/facebookresearch/SWAG/blob/main/LICENSE",
+    "interpolation": InterpolationMode.BICUBIC,
+}
+
 
 class ViT_B_16_Weights(WeightsEnum):
     IMAGENET1K_V1 = Weights(
@@ -328,6 +347,23 @@ class ViT_B_16_Weights(WeightsEnum):
             "acc@5": 95.318,
         },
     )
+    IMAGENET1K_SWAG_V1 = Weights(
+        url="https://download.pytorch.org/models/vit_b_16_swag-9ac1b537.pth",
+        transforms=partial(
+            ImageClassification,
+            crop_size=384,
+            resize_size=384,
+            interpolation=InterpolationMode.BICUBIC,
+        ),
+        meta={
+            **_COMMON_SWAG_META,
+            "num_params": 86859496,
+            "size": (384, 384),
+            "min_size": (384, 384),
+            "acc@1": 85.304,
+            "acc@5": 97.650,
+        },
+    )
     DEFAULT = IMAGENET1K_V1
 
 
@@ -362,6 +398,23 @@ class ViT_L_16_Weights(WeightsEnum):
             "acc@5": 94.638,
         },
     )
+    IMAGENET1K_SWAG_V1 = Weights(
+        url="https://download.pytorch.org/models/vit_l_16_swag-4f3808c9.pth",
+        transforms=partial(
+            ImageClassification,
+            crop_size=512,
+            resize_size=512,
+            interpolation=InterpolationMode.BICUBIC,
+        ),
+        meta={
+            **_COMMON_SWAG_META,
+            "num_params": 305174504,
+            "size": (512, 512),
+            "min_size": (512, 512),
+            "acc@1": 88.064,
+            "acc@5": 98.512,
+        },
+    )
     DEFAULT = IMAGENET1K_V1