Skip to content

Commit 6f9ffaa

Browse files
fduwjj authored and pytorchmergebot committed
[c10d][fr] Fix script for uneven reduce scatter and update test cases (pytorch#151475)
The type string for reduce scatter is "REDUCE_SCATTER", not "REDUCESCATTER". This PR fixes the script to use the correct string and adds more test cases. Differential Revision: [D73141245](https://our.internmc.facebook.com/intern/diff/D73141245) Pull Request resolved: pytorch#151475 Approved by: https://github.com/fegin
1 parent cd1db55 commit 6f9ffaa

File tree

3 files changed

+41
-10
lines changed

3 files changed

+41
-10
lines changed

test/distributed/flight_recorder/test_fr_analysis.py

+31-4
Original file line numberDiff line numberDiff line change
@@ -150,17 +150,16 @@ def test_all_events(self):
150150
output_sizes = [[4, 4]]
151151
expectedState = MatchState.FULLY_MATCHED
152152
if collective in [
153+
"reduce_scatter",
153154
"_reduce_scatter_base",
154155
"reduce_scatter_tensor_coalesced",
155-
"REDUCESCATTER_coalesced",
156156
]:
157157
input_sizes = [[4, 4]]
158158
output_sizes = [[input_sizes[0][0] * 2]]
159159
if collective in [
160160
"all_gather",
161161
"_all_gather_base",
162162
"all_gather_into_tensor_coalesced",
163-
"ALLGATHER_coalesced",
164163
]:
165164
output_sizes = [[math.prod(input_sizes[0]) * 2]]
166165
if collective == "all_to_all":
@@ -295,7 +294,7 @@ def testBuildDB(self):
295294
create_one_entry(1, "_broadcast_oop", [[5, 5]], [[5, 5]])
296295
)
297296
details4["dump_file_rank_0"]["entries"].append(
298-
create_one_entry(2, "coalesced", [[]], [[]])
297+
create_one_entry(2, "ALLGATHER_coalesced", [[]], [[]])
299298
)
300299
details4["dump_file_rank_1"]["entries"].append(
301300
create_one_entry(0, "_broadcast_oop", [[4, 4]], [[4, 4]])
@@ -304,13 +303,41 @@ def testBuildDB(self):
304303
create_one_entry(1, "_broadcast_oop", [[4, 4]], [[4, 4]])
305304
)
306305
details4["dump_file_rank_1"]["entries"].append(
307-
create_one_entry(2, "coalesced", [[]], [[]])
306+
create_one_entry(2, "ALLGATHER_coalesced", [[]], [[]])
308307
)
309308
db = build_db(details4, args, version)
310309
self.assertEqual(len(db.collectives), 1)
311310
self.assertEqual(db.collectives[0].record_id, 1)
312311
self.assertEqual(db.collectives[0].collective_name, "nccl:_broadcast_oop")
313312
self.assertEqual(db.collectives[0].pass_check, False)
313+
# Test case 5: matched uneven reduce scatter case.
314+
details5 = copy.deepcopy(LOADED_FR_DETAIL_TEMPLATE)
315+
# sequence ID should not increase for coalesced collectives
316+
details5["dump_file_rank_0"]["entries"].append(
317+
create_one_entry(0, "_reduce_oop", [[4, 4]], [[4, 4]])
318+
)
319+
details5["dump_file_rank_0"]["entries"].append(
320+
create_one_entry(1, "_reduce_oop", [[4, 4]], [[4, 4]])
321+
)
322+
details5["dump_file_rank_0"]["entries"].append(
323+
create_one_entry(2, "REDUCE_SCATTER_coalesced", [[]], [[]])
324+
)
325+
details5["dump_file_rank_1"]["entries"].append(
326+
create_one_entry(0, "_reduce_oop", [[4, 4]], [[4, 4]])
327+
)
328+
details5["dump_file_rank_1"]["entries"].append(
329+
create_one_entry(1, "_reduce_oop", [[4, 4]], [[4, 4]])
330+
)
331+
details5["dump_file_rank_1"]["entries"].append(
332+
create_one_entry(2, "REDUCE_SCATTER_coalesced", [[]], [[]])
333+
)
334+
db = build_db(details5, args, version)
335+
self.assertEqual(len(db.collectives), 1)
336+
self.assertEqual(db.collectives[0].record_id, 2)
337+
self.assertEqual(
338+
db.collectives[0].collective_name, "nccl:REDUCE_SCATTER_coalesced"
339+
)
340+
self.assertEqual(db.collectives[0].pass_check, True)
314341

315342

316343
if __name__ == "__main__":

tools/flight_recorder/components/types.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,7 @@ class Database(NamedTuple):
194194
"all_reduce",
195195
"_all_gather_base",
196196
"all_gather_into_tensor_coalesced",
197+
"reduce_scatter",
197198
"reduce_scatter_tensor_coalesced",
198199
"_reduce_scatter_base",
199200
"gather",
@@ -202,7 +203,7 @@ class Database(NamedTuple):
202203
"all_reduce_barrier",
203204
"allreduce_coalesced",
204205
"ALLGATHER_coalesced",
205-
"REDUCESCATTER_coalesced",
206+
"REDUCE_SCATTER_coalesced",
206207
}
207208

208209
P2P = {
@@ -552,7 +553,6 @@ def match(self, other: "Op") -> MatchInfo:
552553
"all_gather",
553554
"all_gather_base",
554555
"all_gather_into_tensor_coalesced",
555-
"ALLGATHER_coalesced",
556556
] and not (
557557
math.prod(other.output_sizes[0])
558558
== math.prod(self.input_sizes[0]) * self.pg_size
@@ -566,7 +566,6 @@ def match(self, other: "Op") -> MatchInfo:
566566
"reduce_scatter",
567567
"_reduce_scatter_base",
568568
"reduce_scatter_tensor_coalesced",
569-
"REDUCESCATTER_coalesced",
570569
] and not (
571570
math.prod(other.input_sizes[0])
572571
== math.prod(self.output_sizes[0]) * self.pg_size
@@ -576,10 +575,14 @@ def match(self, other: "Op") -> MatchInfo:
576575
f"Found input numel '{math.prod(other.input_sizes[0])}' does not match output numel "
577576
f"'{math.prod(other.output_sizes[0])} * pg size {self.pg_size}'",
578577
)
579-
elif self.type == "coalesced":
578+
elif self.type in [
579+
"coalesced",
580+
"ALLGATHER_coalesced",
581+
"REDUCE_SCATTER_coalesced",
582+
]:
580583
return (
581584
MatchInfo(MatchState.FULLY_MATCHED)
582-
if (other.type == "coalesced")
585+
if (other.type == self.type)
583586
else MatchInfo(MatchState.SIZE_OR_SYNTAX_MISMATCH)
584587
)
585588
return MatchInfo(MatchState.FULLY_MATCHED)

tools/flight_recorder/components/utils.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -601,7 +601,8 @@ def find_coalesced_group_with_non_p2p(
601601
break
602602

603603
if len(found) > 1:
604-
if found[-1][1]["profiling_name"] != "nccl:coalesced":
604+
name = found[-1][1]["profiling_name"]
605+
if name.startswith("nccl:") and name.endswith("_coalesced"):
605606
logger.error("Rank %s does not have a coalesced end.", rank)
606607
return found
607608
return []

0 commit comments

Comments
 (0)