|
| 1 | +/*************************************************************************************************** |
| 2 | + * Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved. |
| 3 | + * SPDX-License-Identifier: BSD-3-Clause |
| 4 | + * |
| 5 | + * Redistribution and use in source and binary forms, with or without |
| 6 | + * modification, are permitted provided that the following conditions are met: |
| 7 | + * |
| 8 | + * 1. Redistributions of source code must retain the above copyright notice, this |
| 9 | + * list of conditions and the following disclaimer. |
| 10 | + * |
| 11 | + * 2. Redistributions in binary form must reproduce the above copyright notice, |
| 12 | + * this list of conditions and the following disclaimer in the documentation |
| 13 | + * and/or other materials provided with the distribution. |
| 14 | + * |
| 15 | + * 3. Neither the name of the copyright holder nor the names of its |
| 16 | + * contributors may be used to endorse or promote products derived from |
| 17 | + * this software without specific prior written permission. |
| 18 | + * |
| 19 | + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 20 | + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 21 | + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| 22 | + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE |
| 23 | + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 24 | + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
| 25 | + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
| 26 | + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| 27 | + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 28 | + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 29 | + * |
| 30 | + **************************************************************************************************/ |
| 31 | + |
| 32 | +#include <cute/tensor.hpp> |
| 33 | +#include <sycl/sycl.hpp> |
| 34 | +#include <syclcompat.hpp> |
| 35 | + |
| 36 | +#include <cutlass/util/device_memory.h> |
| 37 | +#include <syclcompat/syclcompat.hpp> |
| 38 | +#include <cutlass/cutlass.h> |
| 39 | + |
| 40 | +using namespace syclcompat::experimental; |
| 41 | +using namespace cute; |
| 42 | + |
| 43 | +#define SUBGROUP_SIZE (16) |
| 44 | + |
| 45 | +template <class CopyInstruction, class TensorS, class fragment_size> |
| 46 | +void copy_kernel(TensorS S) { |
| 47 | + using namespace cute; |
| 48 | + using Element = typename TensorS::value_type; |
| 49 | + |
| 50 | + // initialization |
| 51 | + if(thread(0)){ |
| 52 | + for(int i=0; i<size(S); i++){ |
| 53 | + S(i) = static_cast<Element>(i); |
| 54 | + } |
| 55 | + } |
| 56 | + syncthreads(); |
| 57 | + |
| 58 | + using CopyThreadShape = Shape<_1, Int<SUBGROUP_SIZE>>; |
| 59 | + using traits_load = Copy_Traits<CopyInstruction, decltype(S)>; |
| 60 | + using Atom_load = Copy_Atom<traits_load, Element>; |
| 61 | + auto tiled_copy_load = make_tiled_copy(Atom_load{}.with(S), |
| 62 | + Layout<CopyThreadShape>{}, |
| 63 | + make_layout(shape_div(typename traits_load::BlockShape{}, CopyThreadShape{}))); |
| 64 | + |
| 65 | + auto thr_copy_load = tiled_copy_load.get_slice(ThreadIdxX()); |
| 66 | + |
| 67 | + using actual_fragment_size = std::conditional_t<std::is_same_v<fragment_size, void>, C<Atom_load::NumValDst>, fragment_size>; |
| 68 | + Tensor fragment = make_tensor<Element>(make_shape(actual_fragment_size{},_1{},_1{})); |
| 69 | + clear(fragment); |
| 70 | + |
| 71 | + static_assert(actual_fragment_size::value >= Atom_load::NumValDst, "fragment is too small to hold all results!"); |
| 72 | + Tensor fragment_copy_view = make_tensor(fragment.data(), make_shape(C<Atom_load::NumValDst>{},_1{},_1{})); |
| 73 | + auto blk_load_S = tiled_copy_load.get_pvc_tensor(S.shape()); |
| 74 | + copy(tiled_copy_load, blk_load_S, fragment_copy_view); |
| 75 | + |
| 76 | + if(thread(0)){ |
| 77 | + print("fragment: "); print(fragment); print("\n"); |
| 78 | + } |
| 79 | + |
| 80 | + for(int i=0;i<SUBGROUP_SIZE;i++){ |
| 81 | + if(thread(i)){ |
| 82 | + print("thread "); |
| 83 | + if(i<10) print(" "); |
| 84 | + print(i); print(" "); |
| 85 | + } |
| 86 | + } |
| 87 | + for(int i = 0;i < size(fragment); i++){ |
| 88 | + if(thread(0)){ |
| 89 | + print("\n "); |
| 90 | + } |
| 91 | + for(int j=0;j<SUBGROUP_SIZE;j++){ |
| 92 | + if(thread(j)){ |
| 93 | + if(fragment(i)<10) print(" "); |
| 94 | + if(fragment(i)<100) print(" "); |
| 95 | + if(fragment(i)<1000) print(" "); |
| 96 | + print(static_cast<int>(fragment(i))); print(" "); |
| 97 | + } |
| 98 | + } |
| 99 | + } |
| 100 | + if(thread(0)){ |
| 101 | + print("\n"); |
| 102 | + } |
| 103 | +} |
| 104 | + |
| 105 | +// by default select fragment size to match copy size. This can be set manually to a bigger value as copy size might be wrong |
| 106 | +template <class CopyInstruction, class dtype, class fragment_size = void> |
| 107 | +void copy(int global_M, int global_N) { |
| 108 | + using namespace cute; |
| 109 | + |
| 110 | + auto tensor_shape = make_shape(global_M, global_N, 1); |
| 111 | + int tensor_size = size(tensor_shape); |
| 112 | + cutlass::DeviceAllocation<dtype> src(tensor_size); |
| 113 | + |
| 114 | + Tensor tensor_S = make_tensor(make_gmem_ptr(src.get()), make_layout(tensor_shape, LayoutLeft{})); |
| 115 | + |
| 116 | + auto gridDim = syclcompat::dim3(1); |
| 117 | + auto blockDim = syclcompat::dim3(SUBGROUP_SIZE); |
| 118 | + launch<copy_kernel<CopyInstruction, decltype(tensor_S), fragment_size>>( |
| 119 | + launch_policy{gridDim, blockDim, |
| 120 | + kernel_properties{sycl_exp::sub_group_size<SUBGROUP_SIZE>}}, |
| 121 | + tensor_S); |
| 122 | + |
| 123 | + syclcompat::wait_and_throw(); |
| 124 | +} |
| 125 | + |
| 126 | +int main(){ |
| 127 | + // for 16b copies use integers as floating point types could lose precision for bigger indices |
| 128 | + // for 8b copies you have to work with overflow |
| 129 | + copy<XE_2D_U16x32x32_LD_V, int16_t>(256, 256); |
| 130 | + copy<XE_2D_U16x32x32_LD_N, int16_t>(256, 256); |
| 131 | + return 0; |
| 132 | +} |
0 commit comments