@@ -60,9 +60,9 @@ template <typename inputT, typename outputT> struct NonZeroIndicator
60
60
61
61
outputT operator ()(const inputT &val) const
62
62
{
63
- constexpr outputT out_one (1 );
64
- constexpr outputT out_zero (0 );
65
- constexpr inputT val_zero (0 );
63
+ static constexpr outputT out_one (1 );
64
+ static constexpr outputT out_zero (0 );
65
+ static constexpr inputT val_zero (0 );
66
66
67
67
return (val == val_zero) ? out_zero : out_one;
68
68
}
@@ -583,7 +583,7 @@ sycl::event update_local_chunks_1d(sycl::queue &exec_q,
583
583
cgh.depends_on (dependent_event);
584
584
cgh.use_kernel_bundle (kb);
585
585
586
- constexpr nwiT updates_per_wi = n_wi;
586
+ static constexpr nwiT updates_per_wi = n_wi;
587
587
const std::size_t n_items =
588
588
ceiling_quotient<std::size_t >(src_size, sg_size * n_wi) * sg_size;
589
589
@@ -594,8 +594,8 @@ sycl::event update_local_chunks_1d(sycl::queue &exec_q,
594
594
cgh.parallel_for <UpdateKernelName>(
595
595
ndRange,
596
596
[chunk_size, src, src_size, local_scans](sycl::nd_item<1 > ndit) {
597
- constexpr ScanOpT scan_op{};
598
- constexpr outputT identity =
597
+ static constexpr ScanOpT scan_op{};
598
+ static constexpr outputT identity =
599
599
su_ns::Identity<ScanOpT, outputT>::value;
600
600
601
601
const std::uint32_t lws = ndit.get_local_range (0 );
@@ -640,16 +640,17 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q,
640
640
std::vector<sycl::event> &host_tasks,
641
641
const std::vector<sycl::event> &depends = {})
642
642
{
643
- constexpr ScanOpT scan_op{};
644
- constexpr outputT identity = su_ns::Identity<ScanOpT, outputT>::value;
643
+ static constexpr ScanOpT scan_op{};
644
+ static constexpr outputT identity =
645
+ su_ns::Identity<ScanOpT, outputT>::value;
645
646
646
- constexpr std::size_t _iter_nelems = 1 ;
647
+ static constexpr std::size_t _iter_nelems = 1 ;
647
648
648
649
using IterIndexerT = dpctl::tensor::offset_utils::TwoZeroOffsets_Indexer;
649
- constexpr IterIndexerT _no_op_iter_indexer{};
650
+ static constexpr IterIndexerT _no_op_iter_indexer{};
650
651
651
652
using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
652
- constexpr NoOpIndexerT _no_op_indexer{};
653
+ static constexpr NoOpIndexerT _no_op_indexer{};
653
654
654
655
std::size_t n_groups;
655
656
sycl::event inc_scan_phase1_ev =
@@ -687,7 +688,7 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q,
687
688
outputT *local_scans = temp;
688
689
689
690
using NoOpTransformerT = NoOpTransformer<outputT>;
690
- constexpr NoOpTransformerT _no_op_transformer{};
691
+ static constexpr NoOpTransformerT _no_op_transformer{};
691
692
std::size_t size_to_update = n_elems;
692
693
while (n_groups_ > 1 ) {
693
694
@@ -761,16 +762,16 @@ accumulate_1d_contig_impl(sycl::queue &q,
761
762
dstT *dst_data_ptr = reinterpret_cast <dstT *>(dst);
762
763
763
764
using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
764
- constexpr NoOpIndexerT flat_indexer{};
765
- constexpr transformerT transformer{};
765
+ static constexpr NoOpIndexerT flat_indexer{};
766
+ static constexpr transformerT transformer{};
766
767
767
- constexpr std::size_t s0 = 0 ;
768
- constexpr std::size_t s1 = 1 ;
768
+ static constexpr std::size_t s0 = 0 ;
769
+ static constexpr std::size_t s1 = 1 ;
769
770
770
771
sycl::event comp_ev;
771
772
const sycl::device &dev = q.get_device ();
772
773
if (dev.has (sycl::aspect::cpu)) {
773
- constexpr nwiT n_wi_for_cpu = 8 ;
774
+ static constexpr nwiT n_wi_for_cpu = 8 ;
774
775
const std::uint32_t wg_size = 256 ;
775
776
comp_ev = inclusive_scan_iter_1d<srcT, dstT, n_wi_for_cpu, NoOpIndexerT,
776
777
transformerT, AccumulateOpT,
@@ -779,7 +780,7 @@ accumulate_1d_contig_impl(sycl::queue &q,
779
780
flat_indexer, transformer, host_tasks, depends);
780
781
}
781
782
else {
782
- constexpr nwiT n_wi_for_gpu = 4 ;
783
+ static constexpr nwiT n_wi_for_gpu = 4 ;
783
784
// base_scan_striped algorithm does not execute correctly
784
785
// on HIP device with wg_size > 64
785
786
const std::uint32_t wg_size =
@@ -829,7 +830,7 @@ sycl::event final_update_local_chunks(sycl::queue &exec_q,
829
830
const std::uint32_t sg_size = krn.template get_info <
830
831
sycl::info::kernel_device_specific::max_sub_group_size>(dev);
831
832
832
- constexpr nwiT updates_per_wi = n_wi;
833
+ static constexpr nwiT updates_per_wi = n_wi;
833
834
const std::size_t updates_per_sg = sg_size * updates_per_wi;
834
835
const std::size_t update_nelems =
835
836
ceiling_quotient (src_size, updates_per_sg) * sg_size;
@@ -845,8 +846,8 @@ sycl::event final_update_local_chunks(sycl::queue &exec_q,
845
846
cgh.parallel_for <UpdateKernelName>(
846
847
ndRange, [chunk_size, src_size, local_stride, src, local_scans,
847
848
out_iter_indexer, out_indexer](sycl::nd_item<2 > ndit) {
848
- constexpr ScanOpT scan_op{};
849
- constexpr outputT identity =
849
+ static constexpr ScanOpT scan_op{};
850
+ static constexpr outputT identity =
850
851
su_ns::Identity<ScanOpT, outputT>::value;
851
852
852
853
const std::uint32_t lws = ndit.get_local_range (1 );
@@ -898,8 +899,8 @@ sycl::event update_local_chunks(sycl::queue &exec_q,
898
899
std::size_t local_stride,
899
900
sycl::event dependent_event)
900
901
{
901
- constexpr NoOpIndexer out_indexer{};
902
- constexpr NoOpIndexer iter_out_indexer{};
902
+ static constexpr NoOpIndexer out_indexer{};
903
+ static constexpr NoOpIndexer iter_out_indexer{};
903
904
904
905
return final_update_local_chunks<UpdateKernelName, outputT, n_wi,
905
906
NoOpIndexer, NoOpIndexer, ScanOpT>(
@@ -933,8 +934,9 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,
933
934
std::vector<sycl::event> &host_tasks,
934
935
const std::vector<sycl::event> &depends = {})
935
936
{
936
- constexpr ScanOpT scan_op{};
937
- constexpr outputT identity = su_ns::Identity<ScanOpT, outputT>::value;
937
+ static constexpr ScanOpT scan_op{};
938
+ static constexpr outputT identity =
939
+ su_ns::Identity<ScanOpT, outputT>::value;
938
940
939
941
using IterIndexerT =
940
942
dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
@@ -977,9 +979,9 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,
977
979
outputT *local_scans = temp;
978
980
979
981
using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
980
- constexpr NoOpIndexerT _no_op_indexer{};
982
+ static constexpr NoOpIndexerT _no_op_indexer{};
981
983
using NoOpTransformerT = NoOpTransformer<outputT>;
982
- constexpr NoOpTransformerT _no_op_transformer{};
984
+ static constexpr NoOpTransformerT _no_op_transformer{};
983
985
std::size_t size_to_update = acc_nelems;
984
986
985
987
{
@@ -1142,15 +1144,15 @@ accumulate_strided_impl(sycl::queue &q,
1142
1144
iter_shape_strides,
1143
1145
iter_shape_strides + 2 * iter_nd};
1144
1146
1145
- constexpr transformerT transformer{};
1147
+ static constexpr transformerT transformer{};
1146
1148
1147
- constexpr std::size_t s0 = 0 ;
1148
- constexpr std::size_t s1 = 1 ;
1149
+ static constexpr std::size_t s0 = 0 ;
1150
+ static constexpr std::size_t s1 = 1 ;
1149
1151
1150
1152
const sycl::device &dev = q.get_device ();
1151
1153
sycl::event comp_ev;
1152
1154
if (dev.has (sycl::aspect::cpu)) {
1153
- constexpr nwiT n_wi_for_cpu = 8 ;
1155
+ static constexpr nwiT n_wi_for_cpu = 8 ;
1154
1156
const std::uint32_t wg_size = 256 ;
1155
1157
comp_ev =
1156
1158
inclusive_scan_iter<srcT, dstT, n_wi_for_cpu, InpIndexerT,
@@ -1161,7 +1163,7 @@ accumulate_strided_impl(sycl::queue &q,
1161
1163
out_axis_indexer, transformer, host_tasks, depends);
1162
1164
}
1163
1165
else {
1164
- constexpr nwiT n_wi_for_gpu = 4 ;
1166
+ static constexpr nwiT n_wi_for_gpu = 4 ;
1165
1167
// base_scan_striped algorithm does not execute correctly
1166
1168
// on HIP device with wg_size > 64
1167
1169
const std::uint32_t wg_size =
@@ -1198,18 +1200,18 @@ std::size_t cumsum_val_contig_impl(sycl::queue &q,
1198
1200
cumsumT *cumsum_data_ptr = reinterpret_cast <cumsumT *>(cumsum);
1199
1201
1200
1202
using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
1201
- constexpr NoOpIndexerT flat_indexer{};
1202
- constexpr transformerT transformer{};
1203
+ static constexpr NoOpIndexerT flat_indexer{};
1204
+ static constexpr transformerT transformer{};
1203
1205
1204
- constexpr std::size_t s0 = 0 ;
1205
- constexpr std::size_t s1 = 1 ;
1206
- constexpr bool include_initial = false ;
1206
+ static constexpr std::size_t s0 = 0 ;
1207
+ static constexpr std::size_t s1 = 1 ;
1208
+ static constexpr bool include_initial = false ;
1207
1209
using AccumulateOpT = sycl::plus<cumsumT>;
1208
1210
1209
1211
sycl::event comp_ev;
1210
1212
const sycl::device &dev = q.get_device ();
1211
1213
if (dev.has (sycl::aspect::cpu)) {
1212
- constexpr nwiT n_wi_for_cpu = 8 ;
1214
+ static constexpr nwiT n_wi_for_cpu = 8 ;
1213
1215
const std::uint32_t wg_size = 256 ;
1214
1216
comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_cpu,
1215
1217
NoOpIndexerT, transformerT,
@@ -1218,7 +1220,7 @@ std::size_t cumsum_val_contig_impl(sycl::queue &q,
1218
1220
flat_indexer, transformer, host_tasks, depends);
1219
1221
}
1220
1222
else {
1221
- constexpr nwiT n_wi_for_gpu = 4 ;
1223
+ static constexpr nwiT n_wi_for_gpu = 4 ;
1222
1224
// base_scan_striped algorithm does not execute correctly
1223
1225
// on HIP device with wg_size > 64
1224
1226
const std::uint32_t wg_size =
@@ -1313,17 +1315,17 @@ cumsum_val_strided_impl(sycl::queue &q,
1313
1315
1314
1316
using StridedIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
1315
1317
const StridedIndexerT strided_indexer{nd, 0 , shape_strides};
1316
- constexpr transformerT transformer{};
1318
+ static constexpr transformerT transformer{};
1317
1319
1318
- constexpr std::size_t s0 = 0 ;
1319
- constexpr std::size_t s1 = 1 ;
1320
- constexpr bool include_initial = false ;
1320
+ static constexpr std::size_t s0 = 0 ;
1321
+ static constexpr std::size_t s1 = 1 ;
1322
+ static constexpr bool include_initial = false ;
1321
1323
using AccumulateOpT = sycl::plus<cumsumT>;
1322
1324
1323
1325
const sycl::device &dev = q.get_device ();
1324
1326
sycl::event comp_ev;
1325
1327
if (dev.has (sycl::aspect::cpu)) {
1326
- constexpr nwiT n_wi_for_cpu = 8 ;
1328
+ static constexpr nwiT n_wi_for_cpu = 8 ;
1327
1329
const std::uint32_t wg_size = 256 ;
1328
1330
comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_cpu,
1329
1331
StridedIndexerT, transformerT,
@@ -1332,7 +1334,7 @@ cumsum_val_strided_impl(sycl::queue &q,
1332
1334
strided_indexer, transformer, host_tasks, depends);
1333
1335
}
1334
1336
else {
1335
- constexpr nwiT n_wi_for_gpu = 4 ;
1337
+ static constexpr nwiT n_wi_for_gpu = 4 ;
1336
1338
// base_scan_striped algorithm does not execute correctly
1337
1339
// on HIP device with wg_size > 64
1338
1340
const std::uint32_t wg_size =
0 commit comments