Skip to content

Commit 8d97fbb

Browse files
authored
Make parallel mapper work on dataset specific transformations
Differential Revision: D73187135 Pull Request resolved: #1486
1 parent ac259e1 commit 8d97fbb

File tree

2 files changed

+77
-0
lines changed

2 files changed

+77
-0
lines changed

test/nodes/test_multi_node_weighted_sampler.py

+65
Original file line numberDiff line numberDiff line change
@@ -308,3 +308,68 @@ def test_multi_node_weighted_large_sample_size_with_prefetcher(self, midpoint, s
308308
stop_criteria,
309309
)
310310
run_test_save_load_state(self, node, midpoint)
311+
312+
def test_multi_node_weighted_sampler_tag_output_dict_items(self) -> None:
    """With tag_output=True, dict items gain a 'dataset_key' entry naming their source dataset."""
    sampler = MultiNodeWeightedSampler(
        self.datasets,
        self.weights,
        tag_output=True,
    )
    output = list(sampler)

    # Valid tags are the dataset names the fixture registered.
    expected_tags = [f"ds{i}" for i in range(self._num_datasets)]

    # Each yielded dict must carry the tag, keep its original payload
    # fields, and the tag must match the item's own 'name' field.
    for item in output:
        self.assertIn("dataset_key", item)
        tag = item["dataset_key"]
        self.assertIn(tag, expected_tags)
        self.assertIn("name", item)
        self.assertIn("test_tensor", item)
        self.assertEqual(tag, item["name"])
333+
334+
def test_multi_node_weighted_sampler_tag_output_non_dict_items(self) -> None:
    """With tag_output=True, non-dict items are wrapped into {'data': ..., 'dataset_key': ...} dicts."""
    # Build datasets that yield plain ints rather than dicts.
    plain_sources = {}
    for idx in range(self._num_datasets):
        plain_sources[f"ds{idx}"] = IterableWrapper(range(idx * 10, (idx + 1) * 10))

    sampler = MultiNodeWeightedSampler(
        plain_sources,
        self.weights,
        tag_output=True,
    )
    output = list(sampler)

    expected_tags = [f"ds{i}" for i in range(self._num_datasets)]

    # Every yielded item must have been promoted to a dict holding the
    # original value under 'data' and a valid tag under 'dataset_key'.
    for item in output:
        self.assertIsInstance(item, dict)
        self.assertIn("data", item)
        self.assertIn("dataset_key", item)
        self.assertIn(item["dataset_key"], expected_tags)
358+
359+
def test_multi_node_weighted_sampler_tag_output_false(self) -> None:
    """Test MultiNodeWeightedSampler with tag_output=False (default behavior).

    With tagging disabled, no tag key is injected and the items pass
    through unchanged.
    """
    node = MultiNodeWeightedSampler(
        self.datasets,
        self.weights,
        tag_output=False,
    )

    results = list(node)

    # The sampler injects its tag under the key "dataset_key"; when
    # tag_output is False that exact key must be absent. (The previous
    # assertion checked the unrelated key "dataset", which would pass
    # even if tagging incorrectly leaked into the output.)
    for result in results:
        self.assertNotIn("dataset_key", result)

        # Check that the original data is preserved
        self.assertIn("name", result)
        self.assertIn("test_tensor", result)

torchdata/nodes/samplers/multi_node_weighted_sampler.py

+12
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ class MultiNodeWeightedSampler(BaseNode[T]):
4848
world_size (int): The world size of the distributed environment. Default is None, in
4949
which case the world size will be obtained from the distributed environment.
5050
seed (int): The seed for the random number generator. Default is 0.
51+
tag_output (bool): Whether to tag the output with the dataset name. Default is False.
5152
"""
5253

5354
DATASET_NODE_STATES_KEY = "dataset_node_states"
@@ -64,6 +65,7 @@ def __init__(
6465
rank: Optional[int] = None,
6566
world_size: Optional[int] = None,
6667
seed: int = 0,
68+
tag_output: bool = False,
6769
) -> None:
6870
super().__init__()
6971

@@ -74,6 +76,7 @@ def __init__(
7476
self._num_yielded = 0
7577
self._started = False
7678
self.seed = seed
79+
self.tag_output = tag_output
7780

7881
# Setup rank and world size
7982
if rank is None or world_size is None:
@@ -194,8 +197,17 @@ def next(self) -> T:
194197

195198
# If we didn't throw StopIteration, increment the number of items yielded and return the item
196199
self._num_yielded += 1
200+
201+
# If tag_output is True, add the dataset key to the output
202+
if self.tag_output:
203+
if isinstance(item, dict): # type: ignore[used-before-def]
204+
item["dataset_key"] = key # type: ignore[used-before-def]
205+
else:
206+
item = {"dataset_key": key, "data": item}
207+
197208
return item
198209

210+
199211
def get_state(self) -> Dict[str, Any]:
200212
return {
201213
self.DATASETS_EXHAUSTED_KEY: copy.deepcopy(self._datasets_exhausted),

0 commit comments

Comments
 (0)