Skip to content

Commit b5c36b5

Browse files
t4c1taozha2
authored andcommitted
Add a tool for debugging copies (codeplaysoftware#224)
Add a tool for understanding various copy operations that prints out layout of the copied data, which should be useful for debugging.
1 parent bf666fa commit b5c36b5

File tree

3 files changed

+180
-0
lines changed

3 files changed

+180
-0
lines changed

tools/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@ cmake_policy(SET CMP0112 NEW)
3030

3131
add_subdirectory(util)
3232

33+
if (CUTLASS_ENABLE_SYCL)
34+
add_subdirectory(copy_debug)
35+
endif()
36+
3337
if (CUTLASS_ENABLE_LIBRARY)
3438
add_subdirectory(library)
3539
endif()

tools/copy_debug/CMakeLists.txt

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved.
2+
# SPDX-License-Identifier: BSD-3-Clause
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions are met:
6+
#
7+
# 1. Redistributions of source code must retain the above copyright notice, this
8+
# list of conditions and the following disclaimer.
9+
#
10+
# 2. Redistributions in binary form must reproduce the above copyright notice,
11+
# this list of conditions and the following disclaimer in the documentation
12+
# and/or other materials provided with the distribution.
13+
#
14+
# 3. Neither the name of the copyright holder nor the names of its
15+
# contributors may be used to endorse or promote products derived from
16+
# this software without specific prior written permission.
17+
#
18+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21+
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22+
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23+
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24+
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25+
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26+
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28+
29+
if (CUTLASS_ENABLE_SYCL AND SYCL_INTEL_TARGET)
30+
cutlass_add_executable(
31+
copy_debug
32+
copy_debug.cpp
33+
)
34+
35+
target_link_libraries(
36+
copy_debug
37+
PRIVATE
38+
CUTLASS
39+
cutlass_tools_util_includes
40+
)
41+
42+
add_sycl_to_target(TARGET copy_debug)
43+
44+
endif()

tools/copy_debug/copy_debug.cpp

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
/***************************************************************************************************
2+
* Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved.
3+
* SPDX-License-Identifier: BSD-3-Clause
4+
*
5+
* Redistribution and use in source and binary forms, with or without
6+
* modification, are permitted provided that the following conditions are met:
7+
*
8+
* 1. Redistributions of source code must retain the above copyright notice, this
9+
* list of conditions and the following disclaimer.
10+
*
11+
* 2. Redistributions in binary form must reproduce the above copyright notice,
12+
* this list of conditions and the following disclaimer in the documentation
13+
* and/or other materials provided with the distribution.
14+
*
15+
* 3. Neither the name of the copyright holder nor the names of its
16+
* contributors may be used to endorse or promote products derived from
17+
* this software without specific prior written permission.
18+
*
19+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22+
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23+
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24+
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25+
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26+
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27+
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28+
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29+
*
30+
**************************************************************************************************/
31+
32+
#include <cute/tensor.hpp>
33+
#include <sycl/sycl.hpp>
34+
#include <syclcompat.hpp>
35+
36+
#include <cutlass/util/device_memory.h>
37+
#include <syclcompat/syclcompat.hpp>
38+
#include <cutlass/cutlass.h>
39+
40+
using namespace syclcompat::experimental;
41+
using namespace cute;
42+
43+
#define SUBGROUP_SIZE (16)
44+
45+
template <class CopyInstruction, class TensorS, class fragment_size>
46+
void copy_kernel(TensorS S) {
47+
using namespace cute;
48+
using Element = typename TensorS::value_type;
49+
50+
// initialization
51+
if(thread(0)){
52+
for(int i=0; i<size(S); i++){
53+
S(i) = static_cast<Element>(i);
54+
}
55+
}
56+
syncthreads();
57+
58+
using CopyThreadShape = Shape<_1, Int<SUBGROUP_SIZE>>;
59+
using traits_load = Copy_Traits<CopyInstruction, decltype(S)>;
60+
using Atom_load = Copy_Atom<traits_load, Element>;
61+
auto tiled_copy_load = make_tiled_copy(Atom_load{}.with(S),
62+
Layout<CopyThreadShape>{},
63+
make_layout(shape_div(typename traits_load::BlockShape{}, CopyThreadShape{})));
64+
65+
auto thr_copy_load = tiled_copy_load.get_slice(ThreadIdxX());
66+
67+
using actual_fragment_size = std::conditional_t<std::is_same_v<fragment_size, void>, C<Atom_load::NumValDst>, fragment_size>;
68+
Tensor fragment = make_tensor<Element>(make_shape(actual_fragment_size{},_1{},_1{}));
69+
clear(fragment);
70+
71+
static_assert(actual_fragment_size::value >= Atom_load::NumValDst, "fragment is too small to hold all results!");
72+
Tensor fragment_copy_view = make_tensor(fragment.data(), make_shape(C<Atom_load::NumValDst>{},_1{},_1{}));
73+
auto blk_load_S = tiled_copy_load.get_pvc_tensor(S.shape());
74+
copy(tiled_copy_load, blk_load_S, fragment_copy_view);
75+
76+
if(thread(0)){
77+
print("fragment: "); print(fragment); print("\n");
78+
}
79+
80+
for(int i=0;i<SUBGROUP_SIZE;i++){
81+
if(thread(i)){
82+
print("thread ");
83+
if(i<10) print(" ");
84+
print(i); print(" ");
85+
}
86+
}
87+
for(int i = 0;i < size(fragment); i++){
88+
if(thread(0)){
89+
print("\n ");
90+
}
91+
for(int j=0;j<SUBGROUP_SIZE;j++){
92+
if(thread(j)){
93+
if(fragment(i)<10) print(" ");
94+
if(fragment(i)<100) print(" ");
95+
if(fragment(i)<1000) print(" ");
96+
print(static_cast<int>(fragment(i))); print(" ");
97+
}
98+
}
99+
}
100+
if(thread(0)){
101+
print("\n");
102+
}
103+
}
104+
105+
// by default select fragment size to match copy size. This can be set manually to a bigger value as copy size might be wrong
106+
template <class CopyInstruction, class dtype, class fragment_size = void>
107+
void copy(int global_M, int global_N) {
108+
using namespace cute;
109+
110+
auto tensor_shape = make_shape(global_M, global_N, 1);
111+
int tensor_size = size(tensor_shape);
112+
cutlass::DeviceAllocation<dtype> src(tensor_size);
113+
114+
Tensor tensor_S = make_tensor(make_gmem_ptr(src.get()), make_layout(tensor_shape, LayoutLeft{}));
115+
116+
auto gridDim = syclcompat::dim3(1);
117+
auto blockDim = syclcompat::dim3(SUBGROUP_SIZE);
118+
launch<copy_kernel<CopyInstruction, decltype(tensor_S), fragment_size>>(
119+
launch_policy{gridDim, blockDim,
120+
kernel_properties{sycl_exp::sub_group_size<SUBGROUP_SIZE>}},
121+
tensor_S);
122+
123+
syclcompat::wait_and_throw();
124+
}
125+
126+
int main(){
127+
// for 16b copies use integers as floating point types could lose precision for bigger indices
128+
// for 8b copies you have to work with overflow
129+
copy<XE_2D_U16x32x32_LD_V, int16_t>(256, 256);
130+
copy<XE_2D_U16x32x32_LD_N, int16_t>(256, 256);
131+
return 0;
132+
}

0 commit comments

Comments
 (0)