[c10d][fr] Fix script for uneven reduce scatter and update test cases (pytorch#151475)

fduwjj · pytorchmergebot · commit 6f9ffaa9916c · 2025-04-17T02:11:08.000Z
The type string for reduce scatter is "REDUCE_SCATTER", not "REDUCESCATTER". This PR fixes the flight recorder analysis script accordingly and adds more test cases. Differential Revision: [D73141245](https://our.internmc.facebook.com/intern/diff/D73141245) Pull Request resolved: pytorch#151475 Approved by: https://github.com/fegin
diff --git a/test/distributed/flight_recorder/test_fr_analysis.py b/test/distributed/flight_recorder/test_fr_analysis.py
@@ -150,17 +150,16 @@ def test_all_events(self):
             output_sizes = [[4, 4]]
             expectedState = MatchState.FULLY_MATCHED
             if collective in [
+                "reduce_scatter",
                 "_reduce_scatter_base",
                 "reduce_scatter_tensor_coalesced",
-                "REDUCESCATTER_coalesced",
             ]:
                 input_sizes = [[4, 4]]
                 output_sizes = [[input_sizes[0][0] * 2]]
             if collective in [
                 "all_gather",
                 "_all_gather_base",
                 "all_gather_into_tensor_coalesced",
-                "ALLGATHER_coalesced",
             ]:
                 output_sizes = [[math.prod(input_sizes[0]) * 2]]
             if collective == "all_to_all":
@@ -295,7 +294,7 @@ def testBuildDB(self):
             create_one_entry(1, "_broadcast_oop", [[5, 5]], [[5, 5]])
         )
         details4["dump_file_rank_0"]["entries"].append(
-            create_one_entry(2, "coalesced", [[]], [[]])
+            create_one_entry(2, "ALLGATHER_coalesced", [[]], [[]])
         )
         details4["dump_file_rank_1"]["entries"].append(
             create_one_entry(0, "_broadcast_oop", [[4, 4]], [[4, 4]])
@@ -304,13 +303,41 @@ def testBuildDB(self):
             create_one_entry(1, "_broadcast_oop", [[4, 4]], [[4, 4]])
         )
         details4["dump_file_rank_1"]["entries"].append(
-            create_one_entry(2, "coalesced", [[]], [[]])
+            create_one_entry(2, "ALLGATHER_coalesced", [[]], [[]])
         )
         db = build_db(details4, args, version)
         self.assertEqual(len(db.collectives), 1)
         self.assertEqual(db.collectives[0].record_id, 1)
         self.assertEqual(db.collectives[0].collective_name, "nccl:_broadcast_oop")
         self.assertEqual(db.collectives[0].pass_check, False)
+        # Test case 5: matched uneven reduce scatter case.
+        details5 = copy.deepcopy(LOADED_FR_DETAIL_TEMPLATE)
+        # sequence ID should not increase for coalesced collectives
+        details5["dump_file_rank_0"]["entries"].append(
+            create_one_entry(0, "_reduce_oop", [[4, 4]], [[4, 4]])
+        )
+        details5["dump_file_rank_0"]["entries"].append(
+            create_one_entry(1, "_reduce_oop", [[4, 4]], [[4, 4]])
+        )
+        details5["dump_file_rank_0"]["entries"].append(
+            create_one_entry(2, "REDUCE_SCATTER_coalesced", [[]], [[]])
+        )
+        details5["dump_file_rank_1"]["entries"].append(
+            create_one_entry(0, "_reduce_oop", [[4, 4]], [[4, 4]])
+        )
+        details5["dump_file_rank_1"]["entries"].append(
+            create_one_entry(1, "_reduce_oop", [[4, 4]], [[4, 4]])
+        )
+        details5["dump_file_rank_1"]["entries"].append(
+            create_one_entry(2, "REDUCE_SCATTER_coalesced", [[]], [[]])
+        )
+        db = build_db(details5, args, version)
+        self.assertEqual(len(db.collectives), 1)
+        self.assertEqual(db.collectives[0].record_id, 2)
+        self.assertEqual(
+            db.collectives[0].collective_name, "nccl:REDUCE_SCATTER_coalesced"
+        )
+        self.assertEqual(db.collectives[0].pass_check, True)
 
 
 if __name__ == "__main__":
diff --git a/tools/flight_recorder/components/types.py b/tools/flight_recorder/components/types.py
@@ -194,6 +194,7 @@ class Database(NamedTuple):
     "all_reduce",
     "_all_gather_base",
     "all_gather_into_tensor_coalesced",
+    "reduce_scatter",
     "reduce_scatter_tensor_coalesced",
     "_reduce_scatter_base",
     "gather",
@@ -202,7 +203,7 @@ class Database(NamedTuple):
     "all_reduce_barrier",
     "allreduce_coalesced",
     "ALLGATHER_coalesced",
-    "REDUCESCATTER_coalesced",
+    "REDUCE_SCATTER_coalesced",
 }
 
 P2P = {
@@ -552,7 +553,6 @@ def match(self, other: "Op") -> MatchInfo:
                 "all_gather",
                 "all_gather_base",
                 "all_gather_into_tensor_coalesced",
-                "ALLGATHER_coalesced",
             ] and not (
                 math.prod(other.output_sizes[0])
                 == math.prod(self.input_sizes[0]) * self.pg_size
@@ -566,7 +566,6 @@ def match(self, other: "Op") -> MatchInfo:
                 "reduce_scatter",
                 "_reduce_scatter_base",
                 "reduce_scatter_tensor_coalesced",
-                "REDUCESCATTER_coalesced",
             ] and not (
                 math.prod(other.input_sizes[0])
                 == math.prod(self.output_sizes[0]) * self.pg_size
@@ -576,10 +575,14 @@ def match(self, other: "Op") -> MatchInfo:
                     f"Found input numel '{math.prod(other.input_sizes[0])}' does not match output numel "
                     f"'{math.prod(other.output_sizes[0])} * pg size {self.pg_size}'",
                 )
-        elif self.type == "coalesced":
+        elif self.type in [
+            "coalesced",
+            "ALLGATHER_coalesced",
+            "REDUCE_SCATTER_coalesced",
+        ]:
             return (
                 MatchInfo(MatchState.FULLY_MATCHED)
-                if (other.type == "coalesced")
+                if (other.type == self.type)
                 else MatchInfo(MatchState.SIZE_OR_SYNTAX_MISMATCH)
             )
         return MatchInfo(MatchState.FULLY_MATCHED)
diff --git a/tools/flight_recorder/components/utils.py b/tools/flight_recorder/components/utils.py
@@ -601,7 +601,8 @@ def find_coalesced_group_with_non_p2p(
             break
 
     if len(found) > 1:
-        if found[-1][1]["profiling_name"] != "nccl:coalesced":
+        name = found[-1][1]["profiling_name"]
+        if name.startswith("nccl:") and not name.endswith("_coalesced"):
             logger.error("Rank %s does not have a coalesced end.", rank)
         return found
     return []