Requirements update #17998

Merged
47 commits merged into master from resumable_loading on Jul 7, 2023. The changes shown below are from 35 of the 47 commits.

Commits:
48b3d0f - initial_commit (justusschock, Jul 6, 2023)
f94c4b4 - update (justusschock, Jul 6, 2023)
c0dfad6 - empty commit (justusschock, Jul 6, 2023)
e3522b6 - Merge branch 'master' into resumable_loading (justusschock, Jul 6, 2023)
b239b84 - fix import (justusschock, Jul 6, 2023)
3d4d9ea - update test import (justusschock, Jul 6, 2023)
10526fa - [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jul 6, 2023)
3b9a488 - fix import (justusschock, Jul 6, 2023)
6ce77ca - Merge branch 'resumable_loading' of https://github.com/lightning-ai/l… (justusschock, Jul 6, 2023)
3c772d5 - [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jul 6, 2023)
39e5ba4 - imports (justusschock, Jul 6, 2023)
a84f0b1 - Merge branch 'resumable_loading' of https://github.com/lightning-ai/l… (justusschock, Jul 6, 2023)
aa6cb70 - [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jul 6, 2023)
062be88 - stupid import (justusschock, Jul 6, 2023)
405deed - Merge branch 'resumable_loading' of https://github.com/lightning-ai/l… (justusschock, Jul 6, 2023)
d84391d - update mocks (justusschock, Jul 6, 2023)
94b4d77 - typing (justusschock, Jul 6, 2023)
7184872 - resolve filepath (justusschock, Jul 6, 2023)
5dbc205 - update (justusschock, Jul 6, 2023)
31299f5 - update timeout (justusschock, Jul 6, 2023)
953721b - timeouts (justusschock, Jul 6, 2023)
eaed15a - Merge branch 'resumable_loading' of https://github.com/lightning-ai/l… (justusschock, Jul 6, 2023)
7f0d507 - mypy (justusschock, Jul 6, 2023)
de0ee11 - don't always run all tests (justusschock, Jul 6, 2023)
c618cf4 - add conftest (justusschock, Jul 6, 2023)
37d3843 - update (justusschock, Jul 6, 2023)
27756f9 - run it not everywhere, only linux, others are slooooooow (justusschock, Jul 6, 2023)
3273ae5 - mypy (justusschock, Jul 6, 2023)
b120d43 - Update src/lightning/fabric/utilities/spike.py (justusschock, Jul 7, 2023)
5663790 - Update src/lightning/fabric/utilities/spike.py (justusschock, Jul 7, 2023)
5429c00 - [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jul 7, 2023)
820b77a - remove spike (justusschock, Jul 7, 2023)
f3c3059 - chlog (justusschock, Jul 7, 2023)
bef2667 - [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jul 7, 2023)
301c6bf - Merge branch 'master' into resumable_loading (justusschock, Jul 7, 2023)
251070c - Update src/lightning/data/datasets/iterable.py (justusschock, Jul 7, 2023)
5b6eda5 - formatting and docs (justusschock, Jul 7, 2023)
753f4a1 - index update (justusschock, Jul 7, 2023)
c04161f - formatting and docs (justusschock, Jul 7, 2023)
a4f527e - [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jul 7, 2023)
6310b18 - update (justusschock, Jul 7, 2023)
9e2b0a0 - [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jul 7, 2023)
ff21ac7 - messed up (justusschock, Jul 7, 2023)
5f27c8f - [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jul 7, 2023)
185eaa0 - typing (justusschock, Jul 7, 2023)
9a1dc6c - Merge branch 'master' into resumable_loading (justusschock, Jul 7, 2023)
b968c6c - [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jul 7, 2023)
2 changes: 1 addition & 1 deletion .github/workflows/ci-tests-data.yml
@@ -97,7 +97,7 @@ jobs:
# NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003
run: |
python -m coverage run --source lightning \
-    -m pytest -v --timeout=30 --durations=50
+    -m pytest -v --timeout=60 --durations=60

- name: Statistics
if: success()
3 changes: 2 additions & 1 deletion src/lightning/__init__.py
@@ -22,7 +22,7 @@
from lightning.app.perf import pdb # noqa: E402
from lightning.app.utilities.packaging.build_config import BuildConfig # noqa: E402
from lightning.app.utilities.packaging.cloud_compute import CloudCompute # noqa: E402
-from lightning.data import LightningDataset  # noqa: E402
+from lightning.data import LightningDataset, LightningIterableDataset  # noqa: E402
from lightning.fabric.fabric import Fabric # noqa: E402
from lightning.fabric.utilities.seed import seed_everything # noqa: E402
from lightning.pytorch.callbacks import Callback # noqa: E402
@@ -44,6 +44,7 @@
"CloudCompute",
"Trainer",
"LightningDataset",
"LightningIterableDataset",
"LightningDataModule",
"LightningModule",
"Callback",
2 changes: 2 additions & 0 deletions src/lightning/data/CHANGELOG.md
@@ -9,3 +9,5 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
### Added

- Added `LightningDataset` for optimized data loading including fast loading for S3 buckets. ([#17743](https://github.com/Lightning-AI/lightning/pull/17743))

- Added `LightningIterableDataset` for resumable dataloading with iterable datasets. ([#17998](https://github.com/Lightning-AI/lightning/pull/17998))
4 changes: 2 additions & 2 deletions src/lightning/data/__init__.py
@@ -1,3 +1,3 @@
-from lightning.data.dataset import LightningDataset
+from lightning.data.datasets import LightningDataset, LightningIterableDataset

__all__ = ["LightningDataset"]
__all__ = ["LightningDataset", "LightningIterableDataset"]
4 changes: 4 additions & 0 deletions src/lightning/data/datasets/__init__.py
@@ -0,0 +1,4 @@
from lightning.data.datasets.iterable import LightningIterableDataset
from lightning.data.datasets.mapping import LightningDataset

__all__ = ["LightningDataset", "LightningIterableDataset"]
37 changes: 37 additions & 0 deletions src/lightning/data/datasets/base.py
@@ -0,0 +1,37 @@
from typing import Any

from torch.utils.data import Dataset as TorchDataset

from lightning.data.backends import _DatasetBackend, LocalDatasetBackend, S3DatasetBackend
from lightning.data.fileio import OpenCloudFileObj


class _Dataset(TorchDataset):
    """Base dataset class for streaming data from cloud storage.

    Args:
        backend: Storage location of the data source. Current options are "s3" or "local".
    """

    def __init__(self, backend: str = "local"):
        self.backend = self._init_backend(backend=backend)

        assert isinstance(self.backend, _DatasetBackend)

    def _init_backend(self, backend: str) -> _DatasetBackend:
        """Picks the correct backend handler."""
        if backend == "s3":
            return S3DatasetBackend()
        if backend == "local":
            return LocalDatasetBackend()
        raise ValueError(f"Unsupported backend {backend}")

    def open(self, file: str, mode: str = "r", kwargs_for_open: Any = {}, **kwargs: Any) -> OpenCloudFileObj:
        """Opens a stream for the given file.

        Returns:
            A stream object of the file.
        """
        return OpenCloudFileObj(
            path=file, mode=mode, kwargs_for_open={**self.backend.credentials(), **kwargs_for_open}, **kwargs
        )
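To illustrate how subclasses are meant to use `_Dataset`, here is a minimal sketch. It is not part of this PR: the `TextFileDataset` class is hypothetical, and it assumes the returned `OpenCloudFileObj` behaves like a file object with `read()`/`close()`, which this diff does not show.

    from typing import List

    from lightning.data.datasets.base import _Dataset


    class TextFileDataset(_Dataset):
        # Hypothetical subclass for illustration only.
        def __init__(self, files: List[str], backend: str = "local"):
            super().__init__(backend=backend)  # resolves to LocalDatasetBackend or S3DatasetBackend
            self.files = files

        def __len__(self) -> int:
            return len(self.files)

        def __getitem__(self, index: int) -> str:
            # `open` merges the backend credentials into `kwargs_for_open`
            # and returns an `OpenCloudFileObj` stream for the file.
            stream = self.open(self.files[index], mode="r")
            try:
                return stream.read()  # assumes a file-like stream object
            finally:
                stream.close()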
147 changes: 147 additions & 0 deletions src/lightning/data/datasets/env.py
@@ -0,0 +1,147 @@
from typing import Optional

import torch
from torch.utils.data import get_worker_info


class DistributedEnv:
    """The environment of the distributed training.

    Args:
        world_size: The total number of distributed training processes
        global_rank: The rank of the current process within this pool of training processes
    """

    def __init__(self, world_size: int, global_rank: int):
        self.world_size = world_size
        self.global_rank = global_rank

    @classmethod
    def detect(cls) -> "DistributedEnv":
        """Tries to automatically detect the distributed environment parameters.

        Note:
            This detection may not work in processes spawned by the distributed processes (e.g. DataLoader workers),
            as the distributed framework won't be initialized there.
            It will default to 1 distributed process in this case.
        """
        if torch.distributed.is_available() and torch.distributed.is_initialized():
            world_size = torch.distributed.get_world_size()
            global_rank = torch.distributed.get_rank()
        else:
            world_size = None
            global_rank = 0

        if world_size is None or world_size == -1:
            world_size = 1

        return cls(world_size=world_size, global_rank=global_rank)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(world_size: {self.world_size}, global_rank: {self.global_rank})"

    def __str__(self) -> str:
        return repr(self)


class WorkerEnv:
    """Contains the environment for the current dataloader within the current training process.

    Args:
        world_size: The number of dataloader workers for the current training process
        rank: The rank of the current worker within the number of workers
    """

    def __init__(self, world_size: int, rank: int):
        self.world_size = world_size
        self.rank = rank

    @classmethod
    def detect(cls) -> "WorkerEnv":
        """Automatically detects the number of workers and the current rank.

        Note:
            This only works reliably within a dataloader worker, as otherwise the necessary information
            won't be present. In that case it will default to 1 worker.
        """
        worker_info = get_worker_info()
        num_workers = worker_info.num_workers if worker_info is not None else 1
        current_worker_rank = worker_info.id if worker_info is not None else 0

        return cls(world_size=num_workers, rank=current_worker_rank)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(world_size: {self.world_size}, rank: {self.rank})"

    def __str__(self) -> str:
        return repr(self)


class Environment:
    """Contains the compute environment. If the sub-environments are not passed, they can be detected automatically.

    Args:
        dist_env: The distributed environment (distributed world size and global rank)
        worker_env: The worker environment (number of workers, worker rank)
    """

    def __init__(self, dist_env: Optional[DistributedEnv], worker_env: Optional[WorkerEnv]):
        self.worker_env = worker_env
        self.dist_env = dist_env

    @classmethod
    def from_args(
        cls,
        dist_world_size: int,
        global_rank: int,
        num_workers: int,
        current_worker_rank: int,
    ) -> "Environment":
        """Creates the Environment from explicitly given arguments instead of detecting them.

        Args:
            dist_world_size: The world size used for distributed training (= total number of distributed processes)
            global_rank: The distributed global rank of the current process
            num_workers: The number of workers per distributed training process
            current_worker_rank: The rank of the current worker within the number of workers of
                the current training process
        """
        dist_env = DistributedEnv(dist_world_size, global_rank)
        worker_env = WorkerEnv(num_workers, current_worker_rank)
        return cls(dist_env=dist_env, worker_env=worker_env)

    @property
    def num_shards(self) -> int:
        """Returns the total number of shards.

        Note:
            This may not be accurate in a non-dataloader-worker process like the main training process,
            as it doesn't necessarily know about the number of dataloader workers.
        """
        assert self.worker_env is not None
        assert self.dist_env is not None
        return self.worker_env.world_size * self.dist_env.world_size

    @property
    def shard_rank(self) -> int:
        """Returns the rank of the current process with respect to the total number of shards.

        Note:
            This may not be accurate in a non-dataloader-worker process like the main training process,
            as it doesn't necessarily know about the number of dataloader workers.
        """
        assert self.worker_env is not None
        assert self.dist_env is not None
        return self.dist_env.global_rank * self.worker_env.world_size + self.worker_env.rank

    def __repr__(self) -> str:
        dist_env_repr = repr(self.dist_env)
        worker_env_repr = repr(self.worker_env)

        return (
            f"{self.__class__.__name__}(\n\tdist_env: {dist_env_repr},\n\tworker_env: "
            + f"{worker_env_repr}\n\tnum_shards: {self.num_shards},\n\tshard_rank: {self.shard_rank})"
        )

    def __str__(self) -> str:
        return repr(self)
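A quick sketch of how these pieces compose (illustrative, not part of the PR's tests): outside `torch.distributed` and outside a DataLoader worker, detection falls back to a single shard, while `from_args` makes the sharding arithmetic explicit.

    from lightning.data.datasets.env import DistributedEnv, Environment, WorkerEnv

    # In a plain process (torch.distributed not initialized, not a dataloader worker),
    # both detections fall back to world_size=1 / rank=0.
    env = Environment(dist_env=DistributedEnv.detect(), worker_env=WorkerEnv.detect())
    assert env.num_shards == 1 and env.shard_rank == 0

    # Explicit construction: 2 distributed processes x 3 workers each = 6 shards.
    env = Environment.from_args(dist_world_size=2, global_rank=1, num_workers=3, current_worker_rank=2)
    assert env.num_shards == 6
    # shard_rank = global_rank * num_workers + worker_rank = 1 * 3 + 2 = 5
    assert env.shard_rank == 5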