From c6bc9904b051578707166d959d11431f34b67def Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Sun, 1 Jun 2025 23:49:24 -0700 Subject: [PATCH 1/2] make namespace-scope constexpr variables inline in headers this gives these constants external linkage and prevents duplication --- dpctl/tensor/libtensor/include/kernels/alignment.hpp | 2 +- dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp | 2 +- dpctl/tensor/libtensor/include/utils/type_utils.hpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/alignment.hpp b/dpctl/tensor/libtensor/include/kernels/alignment.hpp index a6dc93b6c7..1b92deb7ce 100644 --- a/dpctl/tensor/libtensor/include/kernels/alignment.hpp +++ b/dpctl/tensor/libtensor/include/kernels/alignment.hpp @@ -30,7 +30,7 @@ namespace kernels namespace alignment_utils { -static constexpr std::size_t required_alignment = 64UL; +inline constexpr std::size_t required_alignment = 64UL; template bool is_aligned(Ptr p) { diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp b/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp index 4394d9a4b1..b1308184fc 100644 --- a/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp +++ b/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp @@ -56,7 +56,7 @@ enum class typenum_t : int CFLOAT, CDOUBLE, // 13 }; -constexpr int num_types = 14; // number of elements in typenum_t +inline constexpr int num_types = 14; // number of elements in typenum_t template diff --git a/dpctl/tensor/libtensor/include/utils/type_utils.hpp b/dpctl/tensor/libtensor/include/utils/type_utils.hpp index e9493aebfe..41c42476b6 100644 --- a/dpctl/tensor/libtensor/include/utils/type_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/type_utils.hpp @@ -51,7 +51,7 @@ struct is_complex< { }; -template constexpr bool is_complex_v = is_complex::value; +template inline constexpr bool is_complex_v = is_complex::value; template dstTy convert_impl(const srcTy &v) { From 6edcc218c63949cf4a448078bfa1307766ffc10e Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 4 Jun 2025 14:12:36 -0700 Subject: [PATCH 2/2] Make constexpr variables static --- dpctl/_host_task_util.hpp | 6 +- .../include/kernels/accumulators.hpp | 92 +++---- .../kernels/boolean_advanced_indexing.hpp | 22 +- .../tensor/libtensor/include/kernels/clip.hpp | 10 +- .../include/kernels/constructors.hpp | 2 +- .../include/kernels/copy_and_cast.hpp | 23 +- .../include/kernels/copy_as_contiguous.hpp | 32 +-- .../kernels/elementwise_functions/abs.hpp | 4 +- .../kernels/elementwise_functions/acos.hpp | 9 +- .../kernels/elementwise_functions/acosh.hpp | 12 +- .../kernels/elementwise_functions/add.hpp | 8 +- .../kernels/elementwise_functions/angle.hpp | 8 +- .../kernels/elementwise_functions/asin.hpp | 9 +- .../kernels/elementwise_functions/asinh.hpp | 9 +- .../kernels/elementwise_functions/atan.hpp | 9 +- .../kernels/elementwise_functions/atan2.hpp | 4 +- .../kernels/elementwise_functions/atanh.hpp | 7 +- .../elementwise_functions/bitwise_and.hpp | 8 +- .../elementwise_functions/bitwise_invert.hpp | 4 +- .../bitwise_left_shift.hpp | 17 +- .../elementwise_functions/bitwise_or.hpp | 8 +- .../bitwise_right_shift.hpp | 13 +- .../elementwise_functions/bitwise_xor.hpp | 8 +- .../elementwise_functions/cabs_impl.hpp | 4 +- .../kernels/elementwise_functions/cbrt.hpp | 4 +- .../kernels/elementwise_functions/ceil.hpp | 4 +- .../kernels/elementwise_functions/common.hpp | 18 +- 
.../elementwise_functions/common_inplace.hpp | 6 +- .../kernels/elementwise_functions/conj.hpp | 4 +- .../elementwise_functions/copysign.hpp | 4 +- .../kernels/elementwise_functions/cos.hpp | 7 +- .../kernels/elementwise_functions/cosh.hpp | 7 +- .../kernels/elementwise_functions/equal.hpp | 4 +- .../kernels/elementwise_functions/exp.hpp | 7 +- .../kernels/elementwise_functions/exp2.hpp | 7 +- .../kernels/elementwise_functions/expm1.hpp | 4 +- .../kernels/elementwise_functions/floor.hpp | 4 +- .../elementwise_functions/floor_divide.hpp | 8 +- .../kernels/elementwise_functions/greater.hpp | 4 +- .../elementwise_functions/greater_equal.hpp | 4 +- .../kernels/elementwise_functions/hypot.hpp | 4 +- .../kernels/elementwise_functions/imag.hpp | 4 +- .../elementwise_functions/isfinite.hpp | 4 +- .../kernels/elementwise_functions/isinf.hpp | 4 +- .../kernels/elementwise_functions/isnan.hpp | 4 +- .../kernels/elementwise_functions/less.hpp | 4 +- .../elementwise_functions/less_equal.hpp | 4 +- .../kernels/elementwise_functions/log.hpp | 4 +- .../kernels/elementwise_functions/log10.hpp | 4 +- .../kernels/elementwise_functions/log1p.hpp | 4 +- .../kernels/elementwise_functions/log2.hpp | 4 +- .../elementwise_functions/logaddexp.hpp | 4 +- .../elementwise_functions/logical_and.hpp | 4 +- .../elementwise_functions/logical_not.hpp | 4 +- .../elementwise_functions/logical_or.hpp | 4 +- .../elementwise_functions/logical_xor.hpp | 4 +- .../kernels/elementwise_functions/maximum.hpp | 4 +- .../kernels/elementwise_functions/minimum.hpp | 4 +- .../elementwise_functions/multiply.hpp | 8 +- .../elementwise_functions/negative.hpp | 4 +- .../elementwise_functions/nextafter.hpp | 4 +- .../elementwise_functions/not_equal.hpp | 4 +- .../elementwise_functions/positive.hpp | 4 +- .../kernels/elementwise_functions/pow.hpp | 8 +- .../kernels/elementwise_functions/proj.hpp | 4 +- .../kernels/elementwise_functions/real.hpp | 4 +- .../elementwise_functions/reciprocal.hpp | 4 +- .../elementwise_functions/remainder.hpp | 8 +- .../kernels/elementwise_functions/round.hpp | 4 +- .../kernels/elementwise_functions/rsqrt.hpp | 4 +- .../kernels/elementwise_functions/sign.hpp | 4 +- .../kernels/elementwise_functions/signbit.hpp | 4 +- .../kernels/elementwise_functions/sin.hpp | 7 +- .../kernels/elementwise_functions/sinh.hpp | 4 +- .../kernels/elementwise_functions/sqrt.hpp | 4 +- .../kernels/elementwise_functions/square.hpp | 4 +- .../elementwise_functions/subtract.hpp | 8 +- .../kernels/elementwise_functions/tan.hpp | 7 +- .../kernels/elementwise_functions/tanh.hpp | 7 +- .../elementwise_functions/true_divide.hpp | 8 +- .../kernels/elementwise_functions/trunc.hpp | 4 +- .../kernels/integer_advanced_indexing.hpp | 4 +- .../kernels/linalg_functions/dot_product.hpp | 48 ++-- .../include/kernels/linalg_functions/gemm.hpp | 241 +++++++++--------- .../libtensor/include/kernels/reductions.hpp | 142 ++++++----- .../libtensor/include/kernels/repeat.hpp | 4 +- .../include/kernels/sorting/merge_sort.hpp | 2 +- .../include/kernels/sorting/radix_sort.hpp | 138 +++++----- .../include/kernels/sorting/searchsorted.hpp | 8 +- .../include/kernels/sorting/sort_utils.hpp | 10 +- .../include/kernels/sorting/topk.hpp | 6 +- .../libtensor/include/kernels/where.hpp | 10 +- .../include/utils/indexing_utils.hpp | 28 +- .../libtensor/include/utils/math_utils.hpp | 2 +- .../include/utils/sycl_alloc_utils.hpp | 2 +- .../libtensor/include/utils/sycl_utils.hpp | 16 +- .../tensor/libtensor/source/accumulators.cpp | 9 +- .../accumulators/accumulate_over_axis.hpp | 
2 +- .../accumulators/cumulative_logsumexp.cpp | 8 +- .../source/accumulators/cumulative_prod.cpp | 8 +- .../source/accumulators/cumulative_sum.cpp | 8 +- .../source/boolean_advanced_indexing.cpp | 12 +- .../libtensor/source/copy_as_contig.cpp | 6 +- .../tensor/libtensor/source/copy_for_roll.cpp | 6 +- .../copy_numpy_ndarray_into_usm_ndarray.cpp | 2 +- .../elementwise_functions.hpp | 4 +- .../elementwise_functions/true_divide.cpp | 16 +- dpctl/tensor/libtensor/source/full_ctor.cpp | 2 +- .../libtensor/source/linalg_functions/dot.cpp | 4 +- .../reductions/reduction_atomic_support.hpp | 4 +- .../source/reductions/reduction_over_axis.hpp | 19 +- dpctl/tensor/libtensor/source/repeat.cpp | 6 +- .../source/sorting/py_sort_common.hpp | 2 +- .../libtensor/source/sorting/searchsorted.cpp | 16 +- dpctl/tensor/libtensor/source/zeros_ctor.cpp | 2 +- dpctl/utils/src/sequential_order_keeper.hpp | 5 +- .../source/dpctl_sycl_device_manager.cpp | 2 +- .../dpctl_sycl_device_selector_interface.cpp | 2 +- .../dpctl_sycl_kernel_bundle_interface.cpp | 4 +- .../tests/test_sycl_device_aspects.cpp | 4 +- .../tests/test_sycl_queue_submit.cpp | 12 +- ...t_sycl_queue_submit_local_accessor_arg.cpp | 8 +- .../test_sycl_queue_submit_raw_kernel_arg.cpp | 6 +- ...ycl_queue_submit_work_group_memory_arg.cpp | 6 +- .../tests/test_sycl_usm_interface.cpp | 2 +- 125 files changed, 764 insertions(+), 701 deletions(-) diff --git a/dpctl/_host_task_util.hpp b/dpctl/_host_task_util.hpp index 33a01a2536..4db07e1cb1 100644 --- a/dpctl/_host_task_util.hpp +++ b/dpctl/_host_task_util.hpp @@ -77,19 +77,19 @@ DPCTLSyclEventRef async_dec_ref(DPCTLSyclQueueRef QRef, }); }); - constexpr int result_ok = 0; + static constexpr int result_ok = 0; *status = result_ok; auto e_ptr = new sycl::event(ht_ev); return wrap(e_ptr); } catch (const std::exception &e) { - constexpr int result_std_exception = 1; + static constexpr int result_std_exception = 1; *status = result_std_exception; return nullptr; } - constexpr int result_other_abnormal = 2; + static constexpr int result_other_abnormal = 2; *status = result_other_abnormal; return nullptr; diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index f0f72aee82..407ea9e19a 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -60,9 +60,9 @@ template struct NonZeroIndicator outputT operator()(const inputT &val) const { - constexpr outputT out_one(1); - constexpr outputT out_zero(0); - constexpr inputT val_zero(0); + static constexpr outputT out_one(1); + static constexpr outputT out_zero(0); + static constexpr inputT val_zero(0); return (val == val_zero) ? 
out_zero : out_one; } @@ -583,7 +583,7 @@ sycl::event update_local_chunks_1d(sycl::queue &exec_q, cgh.depends_on(dependent_event); cgh.use_kernel_bundle(kb); - constexpr nwiT updates_per_wi = n_wi; + static constexpr nwiT updates_per_wi = n_wi; const std::size_t n_items = ceiling_quotient(src_size, sg_size * n_wi) * sg_size; @@ -594,8 +594,8 @@ sycl::event update_local_chunks_1d(sycl::queue &exec_q, cgh.parallel_for( ndRange, [chunk_size, src, src_size, local_scans](sycl::nd_item<1> ndit) { - constexpr ScanOpT scan_op{}; - constexpr outputT identity = + static constexpr ScanOpT scan_op{}; + static constexpr outputT identity = su_ns::Identity::value; const std::uint32_t lws = ndit.get_local_range(0); @@ -640,16 +640,17 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q, std::vector &host_tasks, const std::vector &depends = {}) { - constexpr ScanOpT scan_op{}; - constexpr outputT identity = su_ns::Identity::value; + static constexpr ScanOpT scan_op{}; + static constexpr outputT identity = + su_ns::Identity::value; - constexpr std::size_t _iter_nelems = 1; + static constexpr std::size_t _iter_nelems = 1; using IterIndexerT = dpctl::tensor::offset_utils::TwoZeroOffsets_Indexer; - constexpr IterIndexerT _no_op_iter_indexer{}; + static constexpr IterIndexerT _no_op_iter_indexer{}; using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr NoOpIndexerT _no_op_indexer{}; + static constexpr NoOpIndexerT _no_op_indexer{}; std::size_t n_groups; sycl::event inc_scan_phase1_ev = @@ -687,7 +688,7 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q, outputT *local_scans = temp; using NoOpTransformerT = NoOpTransformer; - constexpr NoOpTransformerT _no_op_transformer{}; + static constexpr NoOpTransformerT _no_op_transformer{}; std::size_t size_to_update = n_elems; while (n_groups_ > 1) { @@ -761,16 +762,16 @@ accumulate_1d_contig_impl(sycl::queue &q, dstT *dst_data_ptr = reinterpret_cast(dst); using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr NoOpIndexerT flat_indexer{}; - constexpr transformerT transformer{}; + static constexpr NoOpIndexerT flat_indexer{}; + static constexpr transformerT transformer{}; - constexpr std::size_t s0 = 0; - constexpr std::size_t s1 = 1; + static constexpr std::size_t s0 = 0; + static constexpr std::size_t s1 = 1; sycl::event comp_ev; const sycl::device &dev = q.get_device(); if (dev.has(sycl::aspect::cpu)) { - constexpr nwiT n_wi_for_cpu = 8; + static constexpr nwiT n_wi_for_cpu = 8; const std::uint32_t wg_size = 256; comp_ev = inclusive_scan_iter_1d 64 const std::uint32_t wg_size = @@ -829,7 +830,7 @@ sycl::event final_update_local_chunks(sycl::queue &exec_q, const std::uint32_t sg_size = krn.template get_info< sycl::info::kernel_device_specific::max_sub_group_size>(dev); - constexpr nwiT updates_per_wi = n_wi; + static constexpr nwiT updates_per_wi = n_wi; const std::size_t updates_per_sg = sg_size * updates_per_wi; const std::size_t update_nelems = ceiling_quotient(src_size, updates_per_sg) * sg_size; @@ -845,8 +846,8 @@ sycl::event final_update_local_chunks(sycl::queue &exec_q, cgh.parallel_for( ndRange, [chunk_size, src_size, local_stride, src, local_scans, out_iter_indexer, out_indexer](sycl::nd_item<2> ndit) { - constexpr ScanOpT scan_op{}; - constexpr outputT identity = + static constexpr ScanOpT scan_op{}; + static constexpr outputT identity = su_ns::Identity::value; const std::uint32_t lws = ndit.get_local_range(1); @@ -898,8 +899,8 @@ sycl::event update_local_chunks(sycl::queue &exec_q, std::size_t local_stride, 
sycl::event dependent_event) { - constexpr NoOpIndexer out_indexer{}; - constexpr NoOpIndexer iter_out_indexer{}; + static constexpr NoOpIndexer out_indexer{}; + static constexpr NoOpIndexer iter_out_indexer{}; return final_update_local_chunks( @@ -933,8 +934,9 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q, std::vector &host_tasks, const std::vector &depends = {}) { - constexpr ScanOpT scan_op{}; - constexpr outputT identity = su_ns::Identity::value; + static constexpr ScanOpT scan_op{}; + static constexpr outputT identity = + su_ns::Identity::value; using IterIndexerT = dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< @@ -977,9 +979,9 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q, outputT *local_scans = temp; using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr NoOpIndexerT _no_op_indexer{}; + static constexpr NoOpIndexerT _no_op_indexer{}; using NoOpTransformerT = NoOpTransformer; - constexpr NoOpTransformerT _no_op_transformer{}; + static constexpr NoOpTransformerT _no_op_transformer{}; std::size_t size_to_update = acc_nelems; { @@ -1142,15 +1144,15 @@ accumulate_strided_impl(sycl::queue &q, iter_shape_strides, iter_shape_strides + 2 * iter_nd}; - constexpr transformerT transformer{}; + static constexpr transformerT transformer{}; - constexpr std::size_t s0 = 0; - constexpr std::size_t s1 = 1; + static constexpr std::size_t s0 = 0; + static constexpr std::size_t s1 = 1; const sycl::device &dev = q.get_device(); sycl::event comp_ev; if (dev.has(sycl::aspect::cpu)) { - constexpr nwiT n_wi_for_cpu = 8; + static constexpr nwiT n_wi_for_cpu = 8; const std::uint32_t wg_size = 256; comp_ev = inclusive_scan_iter 64 const std::uint32_t wg_size = @@ -1198,18 +1200,18 @@ std::size_t cumsum_val_contig_impl(sycl::queue &q, cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr NoOpIndexerT flat_indexer{}; - constexpr transformerT transformer{}; + static constexpr NoOpIndexerT flat_indexer{}; + static constexpr transformerT transformer{}; - constexpr std::size_t s0 = 0; - constexpr std::size_t s1 = 1; - constexpr bool include_initial = false; + static constexpr std::size_t s0 = 0; + static constexpr std::size_t s1 = 1; + static constexpr bool include_initial = false; using AccumulateOpT = sycl::plus; sycl::event comp_ev; const sycl::device &dev = q.get_device(); if (dev.has(sycl::aspect::cpu)) { - constexpr nwiT n_wi_for_cpu = 8; + static constexpr nwiT n_wi_for_cpu = 8; const std::uint32_t wg_size = 256; comp_ev = inclusive_scan_iter_1d 64 const std::uint32_t wg_size = @@ -1313,17 +1315,17 @@ cumsum_val_strided_impl(sycl::queue &q, using StridedIndexerT = dpctl::tensor::offset_utils::StridedIndexer; const StridedIndexerT strided_indexer{nd, 0, shape_strides}; - constexpr transformerT transformer{}; + static constexpr transformerT transformer{}; - constexpr std::size_t s0 = 0; - constexpr std::size_t s1 = 1; - constexpr bool include_initial = false; + static constexpr std::size_t s0 = 0; + static constexpr std::size_t s1 = 1; + static constexpr bool include_initial = false; using AccumulateOpT = sycl::plus; const sycl::device &dev = q.get_device(); sycl::event comp_ev; if (dev.has(sycl::aspect::cpu)) { - constexpr nwiT n_wi_for_cpu = 8; + static constexpr nwiT n_wi_for_cpu = 8; const std::uint32_t wg_size = 256; comp_ev = inclusive_scan_iter_1d 64 const std::uint32_t wg_size = diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp 
b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp index 948a0229eb..01ba0b21c1 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -227,11 +227,11 @@ std::size_t _get_lws_impl(std::size_t n) } } -std::size_t get_lws(std::size_t n) +inline std::size_t get_lws(std::size_t n) { - constexpr std::size_t lws0 = 256u; - constexpr std::size_t lws1 = 128u; - constexpr std::size_t lws2 = 64u; + static constexpr std::size_t lws0 = 256u; + static constexpr std::size_t lws1 = 128u; + static constexpr std::size_t lws2 = 64u; return _get_lws_impl(n); } @@ -261,9 +261,9 @@ sycl::event masked_extract_all_slices_contig_impl( ssize_t dst_stride, const std::vector &depends = {}) { - constexpr TwoZeroOffsets_Indexer orthog_src_dst_indexer{}; + static constexpr TwoZeroOffsets_Indexer orthog_src_dst_indexer{}; - constexpr NoOpIndexer masked_src_indexer{}; + static constexpr NoOpIndexer masked_src_indexer{}; const Strided1DIndexer masked_dst_indexer(/* size */ dst_size, /* step */ dst_stride); @@ -339,7 +339,7 @@ sycl::event masked_extract_all_slices_strided_impl( ssize_t dst_stride, const std::vector &depends = {}) { - constexpr TwoZeroOffsets_Indexer orthog_src_dst_indexer{}; + static constexpr TwoZeroOffsets_Indexer orthog_src_dst_indexer{}; /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const * *_packed_shape_strides) */ @@ -578,7 +578,7 @@ sycl::event masked_place_all_slices_strided_impl( ssize_t rhs_stride, const std::vector &depends = {}) { - constexpr TwoZeroOffsets_Indexer orthog_dst_rhs_indexer{}; + static constexpr TwoZeroOffsets_Indexer orthog_dst_rhs_indexer{}; /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const * *_packed_shape_strides) */ @@ -589,7 +589,7 @@ sycl::event masked_place_all_slices_strided_impl( TwoZeroOffsets_Indexer, StridedIndexer, Strided1DCyclicIndexer, dataT, indT>; - constexpr std::size_t nominal_lws = 256; + static constexpr std::size_t nominal_lws = 256; const std::size_t masked_extent = iteration_size; const std::size_t lws = std::min(masked_extent, nominal_lws); @@ -685,7 +685,7 @@ sycl::event masked_place_some_slices_strided_impl( TwoOffsets_StridedIndexer, StridedIndexer, Strided1DCyclicIndexer, dataT, indT>; - constexpr std::size_t nominal_lws = 256; + static constexpr std::size_t nominal_lws = 256; const std::size_t orthog_extent = orthog_nelems; const std::size_t masked_extent = masked_nelems; const std::size_t lws = std::min(masked_extent, nominal_lws); @@ -788,7 +788,7 @@ sycl::event non_zero_indexes_impl(sycl::queue &exec_q, const indT1 *cumsum_data = reinterpret_cast(cumsum_cp); indT2 *indexes_data = reinterpret_cast(indexes_cp); - constexpr std::size_t nominal_lws = 256u; + static constexpr std::size_t nominal_lws = 256u; const std::size_t masked_extent = iter_size; const std::size_t lws = std::min(masked_extent, nominal_lws); diff --git a/dpctl/tensor/libtensor/include/kernels/clip.hpp b/dpctl/tensor/libtensor/include/kernels/clip.hpp index 20c50c23e7..815ebbcbc9 100644 --- a/dpctl/tensor/libtensor/include/kernels/clip.hpp +++ b/dpctl/tensor/libtensor/include/kernels/clip.hpp @@ -106,7 +106,7 @@ class ClipContigFunctor void operator()(sycl::nd_item<1> ndit) const { - constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz; + static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz; using dpctl::tensor::type_utils::is_complex; if constexpr (is_complex::value || !enable_sg_loadstore) { @@ -202,8 +202,8 @@ sycl::event 
clip_contig_impl(sycl::queue &q, cgh.depends_on(depends); std::size_t lws = 64; - constexpr std::uint8_t vec_sz = 4; - constexpr std::uint8_t n_vecs = 2; + static constexpr std::uint8_t vec_sz = 4; + static constexpr std::uint8_t n_vecs = 2; const std::size_t n_groups = ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); const auto gws_range = sycl::range<1>(n_groups * lws); @@ -214,7 +214,7 @@ sycl::event clip_contig_impl(sycl::queue &q, is_aligned(max_cp) && is_aligned(dst_cp)) { - constexpr bool enable_sg_loadstore = true; + static constexpr bool enable_sg_loadstore = true; using KernelName = clip_contig_kernel; using Impl = ClipContigFunctor; @@ -224,7 +224,7 @@ sycl::event clip_contig_impl(sycl::queue &q, Impl(nelems, x_tp, min_tp, max_tp, dst_tp)); } else { - constexpr bool disable_sg_loadstore = false; + static constexpr bool disable_sg_loadstore = false; using InnerKernelName = clip_contig_kernel; using KernelName = disabled_sg_loadstore_wrapper_krn; diff --git a/dpctl/tensor/libtensor/include/kernels/constructors.hpp b/dpctl/tensor/libtensor/include/kernels/constructors.hpp index c95c9ae114..e734f30497 100644 --- a/dpctl/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl/tensor/libtensor/include/kernels/constructors.hpp @@ -468,7 +468,7 @@ sycl::event tri_impl(sycl::queue &exec_q, const std::vector &depends, const std::vector &additional_depends) { - constexpr int d2 = 2; + static constexpr int d2 = 2; ssize_t src_s = nd; ssize_t dst_s = 2 * nd; ssize_t nd_1 = nd - 1; diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp index 1db06a914a..f19dcb7c8c 100644 --- a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp +++ b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp @@ -233,9 +233,9 @@ class ContigCopyFunctor void operator()(sycl::nd_item<1> ndit) const { - CastFnT fn{}; + static constexpr CastFnT fn{}; - constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; + static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; using dpctl::tensor::type_utils::is_complex; if constexpr (!enable_sg_loadstore || is_complex::value || @@ -338,8 +338,8 @@ sycl::event copy_and_cast_contig_impl(sycl::queue &q, dstTy *dst_tp = reinterpret_cast(dst_cp); std::size_t lws = 64; - constexpr std::uint32_t vec_sz = 4; - constexpr std::uint32_t n_vecs = 2; + static constexpr std::uint32_t vec_sz = 4; + static constexpr std::uint32_t n_vecs = 2; const std::size_t n_groups = ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); const auto gws_range = sycl::range<1>(n_groups * lws); @@ -348,7 +348,7 @@ sycl::event copy_and_cast_contig_impl(sycl::queue &q, if (is_aligned(src_cp) && is_aligned(dst_cp)) { - constexpr bool enable_sg_loadstore = true; + static constexpr bool enable_sg_loadstore = true; using KernelName = copy_cast_contig_kernel; @@ -359,7 +359,7 @@ sycl::event copy_and_cast_contig_impl(sycl::queue &q, dst_tp)); } else { - constexpr bool disable_sg_loadstore = false; + static constexpr bool disable_sg_loadstore = false; using InnerKernelName = copy_cast_contig_kernel; using KernelName = @@ -724,9 +724,10 @@ void copy_and_cast_from_host_contig_impl( sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only); using IndexerT = TwoOffsets_CombinedIndexer; - constexpr NoOpIndexer src_indexer{}; - constexpr NoOpIndexer dst_indexer{}; - constexpr TwoOffsets_CombinedIndexer indexer{src_indexer, dst_indexer}; + static constexpr NoOpIndexer src_indexer{}; + static constexpr NoOpIndexer 
dst_indexer{}; + static constexpr TwoOffsets_CombinedIndexer indexer{src_indexer, + dst_indexer}; dstTy *dst_tp = reinterpret_cast(dst_p) + dst_offset; @@ -1124,12 +1125,12 @@ sycl::event copy_for_roll_contig_impl(sycl::queue &q, sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - constexpr NoOpIndexer src_indexer{}; + static constexpr NoOpIndexer src_indexer{}; const LeftRolled1DTransformer roller{shift, nelems}; const CompositionIndexer left_rolled_src_indexer{src_indexer, roller}; - constexpr NoOpIndexer dst_indexer{}; + static constexpr NoOpIndexer dst_indexer{}; using KernelName = copy_for_roll_contig_kernel; diff --git a/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp b/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp index f3182707ad..a27a8c78d3 100644 --- a/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp +++ b/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp @@ -74,7 +74,7 @@ class CopyAsCContigFunctor static_assert(vec_sz > 0); static_assert(n_vecs > 0); - constexpr std::uint8_t elems_per_wi = vec_sz * n_vecs; + static constexpr std::uint8_t elems_per_wi = vec_sz * n_vecs; using dpctl::tensor::type_utils::is_complex; if constexpr (!enable_sg_loadstore || is_complex::value) { @@ -151,7 +151,7 @@ sycl::event submit_c_contiguous_copy(sycl::queue &exec_q, static_assert(vec_sz > 0); static_assert(n_vecs > 0); - constexpr std::size_t preferred_lws = 256; + static constexpr std::size_t preferred_lws = 256; const auto &kernel_id = sycl::get_kernel_id(); @@ -168,7 +168,7 @@ sycl::event submit_c_contiguous_copy(sycl::queue &exec_q, const std::size_t lws = ((preferred_lws + max_sg_size - 1) / max_sg_size) * max_sg_size; - constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz; + static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz; const std::size_t nelems_per_group = nelems_per_wi * lws; const std::size_t n_groups = @@ -214,8 +214,8 @@ as_c_contiguous_array_generic_impl(sycl::queue &exec_q, using IndexerT = dpctl::tensor::offset_utils::StridedIndexer; const IndexerT src_indexer(nd, ssize_t(0), shape_and_strides); - constexpr std::uint8_t vec_sz = 4u; - constexpr std::uint8_t n_vecs = 2u; + static constexpr std::uint8_t vec_sz = 4u; + static constexpr std::uint8_t n_vecs = 2u; using dpctl::tensor::kernels::alignment_utils:: disabled_sg_loadstore_wrapper_krn; @@ -224,7 +224,7 @@ as_c_contiguous_array_generic_impl(sycl::queue &exec_q, sycl::event copy_ev; if (is_aligned(dst_p)) { - constexpr bool enable_sg_load = true; + static constexpr bool enable_sg_load = true; using KernelName = as_contig_krn; copy_ev = submit_c_contiguous_copy; using KernelName = disabled_sg_loadstore_wrapper_krn; @@ -287,19 +287,19 @@ sycl::event as_c_contiguous_batch_of_square_matrices_impl( const T *src_tp = reinterpret_cast(src_p); T *dst_tp = reinterpret_cast(dst_p); - constexpr std::uint16_t private_tile_size = 4; - constexpr std::uint16_t n_lines = 2; - constexpr std::uint16_t block_size = + static constexpr std::uint16_t private_tile_size = 4; + static constexpr std::uint16_t n_lines = 2; + static constexpr std::uint16_t block_size = n_lines * private_tile_size * private_tile_size; - constexpr std::uint16_t lws0 = block_size; - constexpr std::uint16_t lws1 = n_lines; - constexpr std::uint16_t nelems_per_wi = (block_size / lws1); + static constexpr std::uint16_t lws0 = block_size; + static constexpr std::uint16_t lws1 = n_lines; + static constexpr std::uint16_t nelems_per_wi = (block_size / lws1); 
static_assert(nelems_per_wi * lws1 == block_size); static_assert(nelems_per_wi == private_tile_size * private_tile_size); - constexpr std::uint32_t lws = lws0 * lws1; + static constexpr std::uint32_t lws = lws0 * lws1; const std::size_t n_tiles = (n + block_size - 1) / block_size; @@ -384,7 +384,7 @@ sycl::event as_c_contiguous_batch_of_square_matrices_impl( // 0 <= lid_lin < lws0 * lws1 == // (block_size * block_size / nelems_per_wi) == // (block_size/private_tile_size)**2 - constexpr std::uint16_t n_private_tiles_per_axis = + static constexpr std::uint16_t n_private_tiles_per_axis = block_size / private_tile_size; const std::uint16_t local_tile_id0 = lid_lin / n_private_tiles_per_axis; @@ -594,7 +594,7 @@ sycl::event as_c_contiguous_nd_batch_of_square_matrices_impl( using dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer; using BatchIndexerT = TwoOffsets_CombinedIndexer; - constexpr ssize_t zero_offset{0}; + static constexpr ssize_t zero_offset{0}; const SrcIndexerT src_batch_indexer{batch_nd, zero_offset, src_batch_shape_strides}; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp index 48c8e3e4dd..d600175cf6 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp @@ -154,8 +154,8 @@ sycl::event abs_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using AbsHS = hyperparam_detail::AbsContigHyperparameterSet; - constexpr std::uint8_t vec_sz = AbsHS::vec_sz; - constexpr std::uint8_t n_vec = AbsHS::n_vecs; + static constexpr std::uint8_t vec_sz = AbsHS::vec_sz; + static constexpr std::uint8_t n_vec = AbsHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, AbsOutputType, AbsContigFunctor, abs_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp index 7dbfb6618c..c59a283ded 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp @@ -71,7 +71,8 @@ template struct AcosFunctor if constexpr (is_complex::value) { using realT = typename argT::value_type; - constexpr realT q_nan = std::numeric_limits::quiet_NaN(); + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); const realT x = std::real(in); const realT y = std::imag(in); @@ -103,7 +104,7 @@ template struct AcosFunctor /* * For large x or y including acos(+-Inf + I*+-Inf) */ - constexpr realT r_eps = + static constexpr realT r_eps = realT(1) / std::numeric_limits::epsilon(); if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) { using sycl_complexT = exprm_ns::complex; @@ -188,8 +189,8 @@ sycl::event acos_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using AcosHS = hyperparam_detail::AcosContigHyperparameterSet; - constexpr std::uint8_t vec_sz = AcosHS::vec_sz; - constexpr std::uint8_t n_vec = AcosHS::n_vecs; + static constexpr std::uint8_t vec_sz = AcosHS::vec_sz; + static constexpr std::uint8_t n_vec = AcosHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, AcosOutputType, AcosContigFunctor, acos_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp index a81ff3da99..12c20525f5 100644 --- 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp @@ -71,7 +71,8 @@ template struct AcoshFunctor if constexpr (is_complex::value) { using realT = typename argT::value_type; - constexpr realT q_nan = std::numeric_limits::quiet_NaN(); + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); /* * acosh(in) = I*acos(in) or -I*acos(in) * where the sign is chosen so Re(acosh(in)) >= 0. @@ -92,7 +93,8 @@ template struct AcoshFunctor } else if (std::isnan(y)) { /* acos(+-Inf + I*NaN) = NaN + I*opt(-)Inf */ - constexpr realT inf = std::numeric_limits::infinity(); + static constexpr realT inf = + std::numeric_limits::infinity(); if (std::isinf(x)) { acos_in = resT{q_nan, -inf}; @@ -107,7 +109,7 @@ template struct AcoshFunctor } } - constexpr realT r_eps = + static constexpr realT r_eps = realT(1) / std::numeric_limits::epsilon(); /* * For large x or y including acos(+-Inf + I*+-Inf) @@ -216,8 +218,8 @@ sycl::event acosh_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using AcoshHS = hyperparam_detail::AcoshContigHyperparameterSet; - constexpr std::uint8_t vec_sz = AcoshHS::vec_sz; - constexpr std::uint8_t n_vec = AcoshHS::n_vecs; + static constexpr std::uint8_t vec_sz = AcoshHS::vec_sz; + static constexpr std::uint8_t n_vec = AcoshHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, AcoshOutputType, AcoshContigFunctor, acosh_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp index 476e7b52b9..d65c06500c 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp @@ -273,8 +273,8 @@ sycl::event add_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using AddHS = hyperparam_detail::AddContigHyperparameterSet; - constexpr auto vec_sz = AddHS::vec_sz; - constexpr auto n_vecs = AddHS::n_vecs; + static constexpr auto vec_sz = AddHS::vec_sz; + static constexpr auto n_vecs = AddHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, AddOutputType, AddContigFunctor, add_contig_kernel, @@ -551,9 +551,9 @@ add_inplace_contig_impl(sycl::queue &exec_q, ssize_t res_offset, const std::vector &depends = {}) { - constexpr auto vec_sz = + static constexpr auto vec_sz = hyperparam_detail::AddContigHyperparameterSet::vec_sz; - constexpr auto n_vecs = + static constexpr auto n_vecs = hyperparam_detail::AddContigHyperparameterSet::n_vecs; return elementwise_common::binary_inplace_contig_impl< diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp index 726f90ba81..9af8970376 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp @@ -115,8 +115,8 @@ template struct AngleContigHyperparameterSet using value_type = typename std::disjunction>; - constexpr static auto vec_sz = value_type::vec_sz; - constexpr static auto n_vecs = value_type::n_vecs; + static constexpr auto vec_sz = value_type::vec_sz; + static constexpr auto n_vecs = value_type::n_vecs; }; } // end of namespace hyperparam_detail @@ -132,8 +132,8 @@ sycl::event angle_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using AngleHS =
hyperparam_detail::AngleContigHyperparameterSet; - constexpr std::uint8_t vec_sz = AngleHS::vec_sz; - constexpr std::uint8_t n_vec = AngleHS::n_vecs; + static constexpr std::uint8_t vec_sz = AngleHS::vec_sz; + static constexpr std::uint8_t n_vec = AngleHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, AngleOutputType, AngleContigFunctor, angle_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp index 70b48895b4..3ba00c4198 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp @@ -71,7 +71,8 @@ template struct AsinFunctor if constexpr (is_complex::value) { using realT = typename argT::value_type; - constexpr realT q_nan = std::numeric_limits::quiet_NaN(); + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); /* * asin(in) = I * conj( asinh(I * conj(in)) ) @@ -117,7 +118,7 @@ template struct AsinFunctor * because Im(asinh(in)) = sign(x)*atan2(sign(x)*y, fabs(x)) + * O(y/in^3) as in -> infinity, uniformly in y */ - constexpr realT r_eps = + static constexpr realT r_eps = realT(1) / std::numeric_limits::epsilon(); if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) { using sycl_complexT = exprm_ns::complex; @@ -209,8 +210,8 @@ sycl::event asin_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using AddHS = hyperparam_detail::AsinContigHyperparameterSet; - constexpr std::uint8_t vec_sz = AddHS::vec_sz; - constexpr std::uint8_t n_vec = AddHS::n_vecs; + static constexpr std::uint8_t vec_sz = AddHS::vec_sz; + static constexpr std::uint8_t n_vec = AddHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, AsinOutputType, AsinContigFunctor, asin_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp index 420ba3246c..7441e51ae7 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp @@ -71,7 +71,8 @@ template struct AsinhFunctor if constexpr (is_complex::value) { using realT = typename argT::value_type; - constexpr realT q_nan = std::numeric_limits::quiet_NaN(); + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); const realT x = std::real(in); const realT y = std::imag(in); @@ -105,7 +106,7 @@ template struct AsinhFunctor * because Im(asinh(in)) = sign(x)*atan2(sign(x)*y, fabs(x)) + * O(y/in^3) as in -> infinity, uniformly in y */ - constexpr realT r_eps = + static constexpr realT r_eps = realT(1) / std::numeric_limits::epsilon(); if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) { @@ -192,8 +193,8 @@ sycl::event asinh_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using AsinhHS = hyperparam_detail::AsinhContigHyperparameterSet; - constexpr std::uint8_t vec_sz = AsinhHS::vec_sz; - constexpr std::uint8_t n_vec = AsinhHS::n_vecs; + static constexpr std::uint8_t vec_sz = AsinhHS::vec_sz; + static constexpr std::uint8_t n_vec = AsinhHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, AsinhOutputType, AsinhContigFunctor, asinh_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp index 29c4941d76..87fad500ab 100644 --- 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp @@ -75,7 +75,8 @@ template struct AtanFunctor if constexpr (is_complex::value) { using realT = typename argT::value_type; - constexpr realT q_nan = std::numeric_limits::quiet_NaN(); + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); /* * atan(in) = I * conj( atanh(I * conj(in)) ) * so we first calculate w = atanh(I * conj(in)) with @@ -122,7 +123,7 @@ template struct AtanFunctor * The sign of pi/2 depends on the sign of imaginary part of the * input. */ - constexpr realT r_eps = + static constexpr realT r_eps = realT(1) / std::numeric_limits::epsilon(); if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) { const realT pi_half = sycl::atan(realT(1)) * 2; @@ -202,8 +203,8 @@ sycl::event atan_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using AtanHS = hyperparam_detail::AtanContigHyperparameterSet; - constexpr std::uint8_t vec_sz = AtanHS::vec_sz; - constexpr std::uint8_t n_vec = AtanHS::n_vecs; + static constexpr std::uint8_t vec_sz = AtanHS::vec_sz; + static constexpr std::uint8_t n_vec = AtanHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, AtanOutputType, AtanContigFunctor, atan_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp index 32384fc8a9..7e5dbe0bb5 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp @@ -145,8 +145,8 @@ sycl::event atan2_contig_impl(sycl::queue &exec_q, { using Atan2HS = hyperparam_detail::Atan2ContigHyperparameterSet; - constexpr std::uint8_t vec_sz = Atan2HS::vec_sz; - constexpr std::uint8_t n_vecs = Atan2HS::n_vecs; + static constexpr std::uint8_t vec_sz = Atan2HS::vec_sz; + static constexpr std::uint8_t n_vecs = Atan2HS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, Atan2OutputType, Atan2ContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp index 39f11e0f90..4148c1545a 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp @@ -71,7 +71,8 @@ template struct AtanhFunctor { if constexpr (is_complex::value) { using realT = typename argT::value_type; - constexpr realT q_nan = std::numeric_limits::quiet_NaN(); + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); const realT x = std::real(in); const realT y = std::imag(in); @@ -193,8 +194,8 @@ sycl::event atanh_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using AtanhHS = hyperparam_detail::AtanhContigHyperparameterSet; - constexpr std::uint8_t vec_sz = AtanhHS::vec_sz; - constexpr std::uint8_t n_vec = AtanhHS::n_vecs; + static constexpr std::uint8_t vec_sz = AtanhHS::vec_sz; + static constexpr std::uint8_t n_vec = AtanhHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, AtanhOutputType, AtanhContigFunctor, atanh_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp index cdea67f080..9c164ea5a2 100644 --- 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp @@ -203,8 +203,8 @@ bitwise_and_contig_impl(sycl::queue &exec_q, { using BitwiseAndHS = hyperparam_detail::BitwiseAndContigHyperparameterSet; - constexpr std::uint8_t vec_sz = BitwiseAndHS::vec_sz; - constexpr std::uint8_t n_vec = BitwiseAndHS::n_vecs; + static constexpr std::uint8_t vec_sz = BitwiseAndHS::vec_sz; + static constexpr std::uint8_t n_vec = BitwiseAndHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, BitwiseAndOutputType, BitwiseAndContigFunctor, @@ -391,8 +391,8 @@ bitwise_and_inplace_contig_impl(sycl::queue &exec_q, { using BitwiseAndHS = hyperparam_detail::BitwiseAndContigHyperparameterSet; - constexpr std::uint8_t vec_sz = BitwiseAndHS::vec_sz; - constexpr std::uint8_t n_vecs = BitwiseAndHS::n_vecs; + static constexpr std::uint8_t vec_sz = BitwiseAndHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseAndHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, BitwiseAndInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp index d96bf35afe..c292373575 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp @@ -150,8 +150,8 @@ bitwise_invert_contig_impl(sycl::queue &exec_q, { using BitwiseInvertHS = hyperparam_detail::BitwiseInvertContigHyperparameterSet; - constexpr std::uint8_t vec_sz = BitwiseInvertHS::vec_sz; - constexpr std::uint8_t n_vec = BitwiseInvertHS::n_vecs; + static constexpr std::uint8_t vec_sz = BitwiseInvertHS::vec_sz; + static constexpr std::uint8_t n_vec = BitwiseInvertHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, BitwiseInvertOutputType, BitwiseInvertContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp index 3ff2d5e315..6511895537 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp @@ -84,8 +84,9 @@ struct BitwiseLeftShiftFunctor private: resT impl(const argT1 &in1, const argT2 &in2) const { - constexpr argT2 in1_bitsize = static_cast(sizeof(argT1) * 8); - constexpr resT zero = resT(0); + static constexpr argT2 in1_bitsize = + static_cast(sizeof(argT1) * 8); + static constexpr resT zero = resT(0); // bitshift op with second operand negative, or >= bitwidth(argT1) is UB // array API spec mandates 0 @@ -214,8 +215,8 @@ bitwise_left_shift_contig_impl(sycl::queue &exec_q, using BitwiseLSHS = hyperparam_detail::BitwiseLeftShiftContigHyperparameterSet; - constexpr std::uint8_t vec_sz = BitwiseLSHS::vec_sz; - constexpr std::uint8_t n_vecs = BitwiseLSHS::n_vecs; + static constexpr std::uint8_t vec_sz = BitwiseLSHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseLSHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, BitwiseLeftShiftOutputType, @@ -316,8 +317,8 @@ template struct BitwiseLeftShiftInplaceFunctor private: void impl(resT &res, const argT &in) const { - constexpr argT res_bitsize = static_cast(sizeof(resT) * 8); - constexpr resT zero = resT(0); + static 
constexpr argT res_bitsize = static_cast(sizeof(resT) * 8); + static constexpr resT zero = resT(0); // bitshift op with second operand negative, or >= bitwidth(argT1) is UB // array API spec mandates 0 @@ -407,8 +408,8 @@ sycl::event bitwise_left_shift_inplace_contig_impl( using BitwiseLSHS = hyperparam_detail::BitwiseLeftShiftContigHyperparameterSet; - constexpr std::uint8_t vec_sz = BitwiseLSHS::vec_sz; - constexpr std::uint8_t n_vecs = BitwiseLSHS::n_vecs; + static constexpr std::uint8_t vec_sz = BitwiseLSHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseLSHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, BitwiseLeftShiftInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp index f1cbb63afc..4e20b75138 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp @@ -202,8 +202,8 @@ sycl::event bitwise_or_contig_impl(sycl::queue &exec_q, { using BitwiseOrHS = hyperparam_detail::BitwiseOrContigHyperparameterSet; - constexpr std::uint8_t vec_sz = BitwiseOrHS::vec_sz; - constexpr std::uint8_t n_vecs = BitwiseOrHS::n_vecs; + static constexpr std::uint8_t vec_sz = BitwiseOrHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseOrHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, BitwiseOrOutputType, BitwiseOrContigFunctor, @@ -387,8 +387,8 @@ bitwise_or_inplace_contig_impl(sycl::queue &exec_q, using BitwiseOrHS = hyperparam_detail::BitwiseOrContigHyperparameterSet; - constexpr std::uint8_t vec_sz = BitwiseOrHS::vec_sz; - constexpr std::uint8_t n_vecs = BitwiseOrHS::n_vecs; + static constexpr std::uint8_t vec_sz = BitwiseOrHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseOrHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, BitwiseOrInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp index 580ee7d828..047a33cea9 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp @@ -83,8 +83,9 @@ struct BitwiseRightShiftFunctor private: resT impl(const argT1 &in1, const argT2 &in2) const { - constexpr argT2 in1_bitsize = static_cast(sizeof(argT1) * 8); - constexpr resT zero = resT(0); + static constexpr argT2 in1_bitsize = + static_cast(sizeof(argT1) * 8); + static constexpr resT zero = resT(0); // bitshift op with second operand negative, or >= bitwidth(argT1) is UB // array API spec mandates 0 @@ -318,8 +319,8 @@ template struct BitwiseRightShiftInplaceFunctor private: void impl(resT &res, const argT &in) const { - constexpr argT res_bitsize = static_cast(sizeof(resT) * 8); - constexpr resT zero = resT(0); + static constexpr argT res_bitsize = static_cast(sizeof(resT) * 8); + static constexpr resT zero = resT(0); // bitshift op with second operand negative, or >= bitwidth(argT1) is UB // array API spec mandates 0 @@ -413,8 +414,8 @@ sycl::event bitwise_right_shift_inplace_contig_impl( argTy>; // res = OP(res, arg) - constexpr std::uint8_t vec_sz = BitwiseRSHS::vec_sz; - constexpr std::uint8_t n_vecs = BitwiseRSHS::n_vecs; + static constexpr std::uint8_t vec_sz = 
BitwiseRSHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseRSHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, BitwiseRightShiftInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp index 84d1c8ff64..d14a12a248 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp @@ -204,8 +204,8 @@ bitwise_xor_contig_impl(sycl::queue &exec_q, { using BitwiseXorHS = hyperparam_detail::BitwiseXorContigHyperparameterSet; - constexpr std::uint8_t vec_sz = BitwiseXorHS::vec_sz; - constexpr std::uint8_t n_vecs = BitwiseXorHS::n_vecs; + static constexpr std::uint8_t vec_sz = BitwiseXorHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseXorHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, BitwiseXorOutputType, BitwiseXorContigFunctor, @@ -393,8 +393,8 @@ bitwise_xor_inplace_contig_impl(sycl::queue &exec_q, using BitwiseXorHS = hyperparam_detail::BitwiseXorContigHyperparameterSet; - constexpr std::uint8_t vec_sz = BitwiseXorHS::vec_sz; - constexpr std::uint8_t n_vecs = BitwiseXorHS::n_vecs; + static constexpr std::uint8_t vec_sz = BitwiseXorHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseXorHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, BitwiseXorInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp index afa83a64cb..4cf32725b1 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp @@ -54,8 +54,8 @@ template realT cabs(std::complex const &z) const realT x = std::real(z); const realT y = std::imag(z); - constexpr realT q_nan = std::numeric_limits::quiet_NaN(); - constexpr realT p_inf = std::numeric_limits::infinity(); + static constexpr realT q_nan = std::numeric_limits::quiet_NaN(); + static constexpr realT p_inf = std::numeric_limits::infinity(); const realT res = std::isinf(x) diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp index d857e1563c..a26c0b6875 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp @@ -124,8 +124,8 @@ sycl::event cbrt_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using CbrtHS = hyperparam_detail::CbrtContigHyperparameterSet; - constexpr std::uint8_t vec_sz = CbrtHS::vec_sz; - constexpr std::uint8_t n_vecs = CbrtHS::n_vecs; + static constexpr std::uint8_t vec_sz = CbrtHS::vec_sz; + static constexpr std::uint8_t n_vecs = CbrtHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, CbrtOutputType, CbrtContigFunctor, cbrt_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp index c587ba6767..eb9d576db1 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp @@ -146,8 +146,8 @@ sycl::event ceil_contig_impl(sycl::queue 
&exec_q, const std::vector &depends = {}) { using CeilHS = hyperparam_detail::CeilContigHyperparameterSet; - constexpr std::uint8_t vec_sz = CeilHS::vec_sz; - constexpr std::uint8_t n_vecs = CeilHS::n_vecs; + static constexpr std::uint8_t vec_sz = CeilHS::vec_sz; + static constexpr std::uint8_t n_vecs = CeilHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, CeilOutputType, CeilContigFunctor, ceil_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp index 961a373890..1836b61d0e 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp @@ -77,7 +77,7 @@ struct UnaryContigFunctor void operator()(sycl::nd_item<1> ndit) const { - constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; + static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; UnaryOperatorT op{}; /* Each work-item processes vec_sz elements, contiguous in memory */ /* NOTE: work-group size must be divisible by sub-group size */ @@ -94,7 +94,7 @@ struct UnaryContigFunctor elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + sg.get_group_id()[0] * sgSize); if (base + elems_per_wi * sgSize < nelems_) { - constexpr sycl::vec res_vec(const_val); + static constexpr sycl::vec res_vec(const_val); #pragma unroll for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { const std::size_t offset = base + it * sgSize; @@ -275,7 +275,7 @@ SizeT select_lws(const sycl::device &, SizeT n_work_items_needed) // TODO: make the decision based on device descriptors // constexpr SizeT few_threshold = (SizeT(1) << 17); - constexpr SizeT med_threshold = (SizeT(1) << 21); + static constexpr SizeT med_threshold = (SizeT(1) << 21); const SizeT lws = (n_work_items_needed <= med_threshold ? 
SizeT(128) : SizeT(256)); @@ -302,7 +302,7 @@ sycl::event unary_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; + static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; const std::size_t n_work_items_needed = nelems / elems_per_wi; const std::size_t lws = select_lws(exec_q.get_device(), n_work_items_needed); @@ -324,7 +324,7 @@ sycl::event unary_contig_impl(sycl::queue &exec_q, if (is_aligned(arg_p) && is_aligned(res_p)) { - constexpr bool enable_sg_loadstore = true; + static constexpr bool enable_sg_loadstore = true; using KernelName = BaseKernelName; using Impl = ContigFunctorT; @@ -334,7 +334,7 @@ sycl::event unary_contig_impl(sycl::queue &exec_q, Impl(arg_tp, res_tp, nelems)); } else { - constexpr bool disable_sg_loadstore = false; + static constexpr bool disable_sg_loadstore = false; using KernelName = disabled_sg_loadstore_wrapper_krn; using Impl = ContigFunctorT ndit) const { - constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; + static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; BinaryOperatorT op{}; /* Each work-item processes vec_sz elements, contiguous in memory */ /* NOTE: work-group size must be divisible by sub-group size */ @@ -817,7 +817,7 @@ sycl::event binary_contig_impl(sycl::queue &exec_q, is_aligned(arg2_tp) && is_aligned(res_tp)) { - constexpr bool enable_sg_loadstore = true; + static constexpr bool enable_sg_loadstore = true; using KernelName = BaseKernelName; using Impl = BinaryContigFunctorT; @@ -827,7 +827,7 @@ sycl::event binary_contig_impl(sycl::queue &exec_q, Impl(arg1_tp, arg2_tp, res_tp, nelems)); } else { - constexpr bool disable_sg_loadstore = false; + static constexpr bool disable_sg_loadstore = false; using KernelName = disabled_sg_loadstore_wrapper_krn; using Impl = BinaryContigFunctorT ndit) const { BinaryInplaceOperatorT op{}; - constexpr std::uint8_t elems_per_wi = vec_sz * n_vecs; + static constexpr std::uint8_t elems_per_wi = vec_sz * n_vecs; /* Each work-item processes vec_sz elements, contiguous in memory */ /* NB: Workgroup size must be divisible by sub-group size */ @@ -337,7 +337,7 @@ binary_inplace_contig_impl(sycl::queue &exec_q, if (is_aligned(arg_tp) && is_aligned(res_tp)) { - constexpr bool enable_sg_loadstore = true; + static constexpr bool enable_sg_loadstore = true; using KernelName = kernel_name; using Impl = BinaryInplaceContigFunctorT; using KernelName = disabled_sg_loadstore_wrapper_krn; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp index 19a95df5a1..6bf9658fc6 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp @@ -152,8 +152,8 @@ sycl::event conj_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using ConjHS = hyperparam_detail::ConjContigHyperparameterSet; - constexpr std::uint8_t vec_sz = ConjHS::vec_sz; - constexpr std::uint8_t n_vecs = ConjHS::n_vecs; + static constexpr std::uint8_t vec_sz = ConjHS::vec_sz; + static constexpr std::uint8_t n_vecs = ConjHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, ConjOutputType, ConjContigFunctor, conj_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp index ecfbf99c28..5762053821 100644 --- 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp @@ -160,8 +160,8 @@ sycl::event copysign_contig_impl(sycl::queue &exec_q, { using CopySignHS = hyperparam_detail::CopysignContigHyperparameterSet; - constexpr std::uint8_t vec_sz = CopySignHS::vec_sz; - constexpr std::uint8_t n_vecs = CopySignHS::n_vecs; + static constexpr std::uint8_t vec_sz = CopySignHS::vec_sz; + static constexpr std::uint8_t n_vecs = CopySignHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, CopysignOutputType, CopysignContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp index 5940315c62..81608346a7 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp @@ -71,7 +71,8 @@ template <typename argT, typename resT> struct CosFunctor if constexpr (is_complex<argT>::value) { using realT = typename argT::value_type; - constexpr realT q_nan = std::numeric_limits<realT>::quiet_NaN(); + static constexpr realT q_nan = + std::numeric_limits<realT>::quiet_NaN(); realT const &in_re = std::real(in); realT const &in_im = std::imag(in); @@ -225,8 +226,8 @@ sycl::event cos_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using CosHS = hyperparam_detail::CosContigHyperparameterSet; - constexpr std::uint8_t vec_sz = CosHS::vec_sz; - constexpr std::uint8_t n_vecs = CosHS::n_vecs; + static constexpr std::uint8_t vec_sz = CosHS::vec_sz; + static constexpr std::uint8_t n_vecs = CosHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, CosOutputType, CosContigFunctor, cos_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp index 59468428d1..be3e76cb2b 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp @@ -71,7 +71,8 @@ template <typename argT, typename resT> struct CoshFunctor if constexpr (is_complex<argT>::value) { using realT = typename argT::value_type; - constexpr realT q_nan = std::numeric_limits<realT>::quiet_NaN(); + static constexpr realT q_nan = + std::numeric_limits<realT>::quiet_NaN(); const realT x = std::real(in); const realT y = std::imag(in); @@ -214,8 +215,8 @@ sycl::event cosh_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using CoshHS = hyperparam_detail::CoshContigHyperparameterSet; - constexpr std::uint8_t vec_sz = CoshHS::vec_sz; - constexpr std::uint8_t n_vecs = CoshHS::n_vecs; + static constexpr std::uint8_t vec_sz = CoshHS::vec_sz; + static constexpr std::uint8_t n_vecs = CoshHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, CoshOutputType, CoshContigFunctor, cosh_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp index a53f6412de..682e9f397e 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp @@ -231,8 +231,8 @@ sycl::event equal_contig_impl(sycl::queue &exec_q, { using EqualHS = hyperparam_detail::EqualContigHyperparameterSet; - constexpr std::uint8_t vec_sz = EqualHS::vec_sz; - constexpr std::uint8_t n_vecs = EqualHS::n_vecs; + static constexpr 
std::uint8_t vec_sz = EqualHS::vec_sz; + static constexpr std::uint8_t n_vecs = EqualHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, EqualOutputType, EqualContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp index 00f8213251..770229f804 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp @@ -70,7 +70,8 @@ template <typename argT, typename resT> struct ExpFunctor if constexpr (is_complex<argT>::value) { using realT = typename argT::value_type; - constexpr realT q_nan = std::numeric_limits<realT>::quiet_NaN(); + static constexpr realT q_nan = + std::numeric_limits<realT>::quiet_NaN(); const realT x = std::real(in); const realT y = std::imag(in); @@ -183,8 +184,8 @@ sycl::event exp_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using ExpHS = hyperparam_detail::ExpContigHyperparameterSet; - constexpr std::uint8_t vec_sz = ExpHS::vec_sz; - constexpr std::uint8_t n_vecs = ExpHS::n_vecs; + static constexpr std::uint8_t vec_sz = ExpHS::vec_sz; + static constexpr std::uint8_t n_vecs = ExpHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, ExpOutputType, ExpContigFunctor, exp_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp index 22291101ca..44cef5fc9e 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp @@ -73,7 +73,8 @@ template <typename argT, typename resT> struct Exp2Functor const argT tmp = in * sycl::log(realT(2)); - constexpr realT q_nan = std::numeric_limits<realT>::quiet_NaN(); + static constexpr realT q_nan = + std::numeric_limits<realT>::quiet_NaN(); const realT x = std::real(tmp); const realT y = std::imag(tmp); @@ -186,8 +187,8 @@ sycl::event exp2_contig_impl(sycl::queue &exec_q, { using Exp2HS = hyperparam_detail::Exp2ContigHyperparameterSet; - constexpr std::uint8_t vec_sz = Exp2HS::vec_sz; - constexpr std::uint8_t n_vecs = Exp2HS::n_vecs; + static constexpr std::uint8_t vec_sz = Exp2HS::vec_sz; + static constexpr std::uint8_t n_vecs = Exp2HS::n_vecs; return elementwise_common::unary_contig_impl< argTy, Exp2OutputType, Exp2ContigFunctor, exp2_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp index d1d64f4904..de0bf515b4 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp @@ -198,8 +198,8 @@ sycl::event expm1_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using Expm1HS = hyperparam_detail::Expm1ContigHyperparameterSet; - constexpr std::uint8_t vec_sz = Expm1HS::vec_sz; - constexpr std::uint8_t n_vecs = Expm1HS::n_vecs; + static constexpr std::uint8_t vec_sz = Expm1HS::vec_sz; + static constexpr std::uint8_t n_vecs = Expm1HS::n_vecs; return elementwise_common::unary_contig_impl< argTy, Expm1OutputType, Expm1ContigFunctor, expm1_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp index 5bc6b888ef..b80c1f3b54 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp +++ 
b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp @@ -146,8 +146,8 @@ sycl::event floor_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using FloorHS = hyperparam_detail::FloorContigHyperparameterSet; - constexpr std::uint8_t vec_sz = FloorHS::vec_sz; - constexpr std::uint8_t n_vecs = FloorHS::n_vecs; + static constexpr std::uint8_t vec_sz = FloorHS::vec_sz; + static constexpr std::uint8_t n_vecs = FloorHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, FloorOutputType, FloorContigFunctor, floor_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp index d290aa66a6..21b9304e53 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp @@ -245,8 +245,8 @@ floor_divide_contig_impl(sycl::queue &exec_q, { using FloorDivideHS = hyperparam_detail::FloorDivideContigHyperparameterSet; - constexpr std::uint8_t vec_sz = FloorDivideHS::vec_sz; - constexpr std::uint8_t n_vecs = FloorDivideHS::n_vecs; + static constexpr std::uint8_t vec_sz = FloorDivideHS::vec_sz; + static constexpr std::uint8_t n_vecs = FloorDivideHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, FloorDivideOutputType, FloorDivideContigFunctor, @@ -472,8 +472,8 @@ floor_divide_inplace_contig_impl(sycl::queue &exec_q, using FloorDivideHS = hyperparam_detail::FloorDivideContigHyperparameterSet; - constexpr std::uint8_t vec_sz = FloorDivideHS::vec_sz; - constexpr std::uint8_t n_vecs = FloorDivideHS::n_vecs; + static constexpr std::uint8_t vec_sz = FloorDivideHS::vec_sz; + static constexpr std::uint8_t n_vecs = FloorDivideHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, FloorDivideInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp index 4d0b7fb94f..bdde27e175 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp @@ -234,8 +234,8 @@ sycl::event greater_contig_impl(sycl::queue &exec_q, using GreaterHS = hyperparam_detail::GreaterContigHyperparameterSet; - constexpr std::uint8_t vec_sz = GreaterHS::vec_sz; - constexpr std::uint8_t n_vecs = GreaterHS::n_vecs; + static constexpr std::uint8_t vec_sz = GreaterHS::vec_sz; + static constexpr std::uint8_t n_vecs = GreaterHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, GreaterOutputType, GreaterContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp index b149158ee0..e5599b5a14 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp @@ -235,8 +235,8 @@ greater_equal_contig_impl(sycl::queue &exec_q, { using GreaterEqHS = hyperparam_detail::GreaterEqualContigHyperparameterSet; - constexpr std::uint8_t vec_sz = GreaterEqHS::vec_sz; - constexpr std::uint8_t n_vecs = GreaterEqHS::n_vecs; + static constexpr std::uint8_t vec_sz = GreaterEqHS::vec_sz; + static constexpr std::uint8_t n_vecs = GreaterEqHS::n_vecs; return 
elementwise_common::binary_contig_impl< argTy1, argTy2, GreaterEqualOutputType, GreaterEqualContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp index faa92c2f2d..12b67481f0 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp @@ -161,8 +161,8 @@ sycl::event hypot_contig_impl(sycl::queue &exec_q, { using HypotHS = hyperparam_detail::HypotContigHyperparameterSet; - constexpr std::uint8_t vec_sz = HypotHS::vec_sz; - constexpr std::uint8_t n_vecs = HypotHS::n_vecs; + static constexpr std::uint8_t vec_sz = HypotHS::vec_sz; + static constexpr std::uint8_t n_vecs = HypotHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, HypotOutputType, HypotContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp index c5e0feea12..e4d615f87e 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp @@ -150,8 +150,8 @@ sycl::event imag_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using ImagHS = hyperparam_detail::ImagContigHyperparameterSet; - constexpr std::uint8_t vec_sz = ImagHS::vec_sz; - constexpr std::uint8_t n_vecs = ImagHS::n_vecs; + static constexpr std::uint8_t vec_sz = ImagHS::vec_sz; + static constexpr std::uint8_t n_vecs = ImagHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, ImagOutputType, ImagContigFunctor, imag_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp index b0651a4d8b..4615f9ce93 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp @@ -151,8 +151,8 @@ sycl::event isfinite_contig_impl(sycl::queue &exec_q, { using IsFiniteHS = hyperparam_detail::IsFiniteContigHyperparameterSet; - constexpr std::uint8_t vec_sz = IsFiniteHS::vec_sz; - constexpr std::uint8_t n_vecs = IsFiniteHS::n_vecs; + static constexpr std::uint8_t vec_sz = IsFiniteHS::vec_sz; + static constexpr std::uint8_t n_vecs = IsFiniteHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, IsFiniteOutputType, IsFiniteContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp index ec78746143..9291eeeb72 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp @@ -150,8 +150,8 @@ sycl::event isinf_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using IsInfHS = hyperparam_detail::IsInfContigHyperparameterSet; - constexpr std::uint8_t vec_sz = IsInfHS::vec_sz; - constexpr std::uint8_t n_vecs = IsInfHS::n_vecs; + static constexpr std::uint8_t vec_sz = IsInfHS::vec_sz; + static constexpr std::uint8_t n_vecs = IsInfHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, IsInfOutputType, IsInfContigFunctor, isinf_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp 
b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp index fbf6ef9383..397037dbae 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp @@ -148,8 +148,8 @@ sycl::event isnan_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using IsNanHS = hyperparam_detail::IsNanContigHyperparameterSet; - constexpr std::uint8_t vec_sz = IsNanHS::vec_sz; - constexpr std::uint8_t n_vecs = IsNanHS::n_vecs; + static constexpr std::uint8_t vec_sz = IsNanHS::vec_sz; + static constexpr std::uint8_t n_vecs = IsNanHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, IsNanOutputType, IsNanContigFunctor, isnan_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp index 523410a161..f323e28a93 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp @@ -230,8 +230,8 @@ sycl::event less_contig_impl(sycl::queue &exec_q, { using LessHS = hyperparam_detail::LessContigHyperparameterSet; - constexpr std::uint8_t vec_sz = LessHS::vec_sz; - constexpr std::uint8_t n_vecs = LessHS::n_vecs; + static constexpr std::uint8_t vec_sz = LessHS::vec_sz; + static constexpr std::uint8_t n_vecs = LessHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, LessOutputType, LessContigFunctor, less_contig_kernel, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp index 5827d350a3..8ae6f236fc 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp @@ -232,8 +232,8 @@ sycl::event less_equal_contig_impl(sycl::queue &exec_q, { using LessEqHS = hyperparam_detail::LessEqualContigHyperparameterSet; - constexpr std::uint8_t vec_sz = LessEqHS::vec_sz; - constexpr std::uint8_t n_vecs = LessEqHS::n_vecs; + static constexpr std::uint8_t vec_sz = LessEqHS::vec_sz; + static constexpr std::uint8_t n_vecs = LessEqHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, LessEqualOutputType, LessEqualContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp index 84471a5ef4..b6a934677d 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp @@ -140,8 +140,8 @@ sycl::event log_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using LogHS = hyperparam_detail::LogContigHyperparameterSet; - constexpr std::uint8_t vec_sz = LogHS::vec_sz; - constexpr std::uint8_t n_vecs = LogHS::n_vecs; + static constexpr std::uint8_t vec_sz = LogHS::vec_sz; + static constexpr std::uint8_t n_vecs = LogHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, LogOutputType, LogContigFunctor, log_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp index d308c85ac9..bf27099e6a 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp +++ 
b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp @@ -159,8 +159,8 @@ sycl::event log10_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using Log10HS = hyperparam_detail::Log10ContigHyperparameterSet; - constexpr std::uint8_t vec_sz = Log10HS::vec_sz; - constexpr std::uint8_t n_vecs = Log10HS::n_vecs; + static constexpr std::uint8_t vec_sz = Log10HS::vec_sz; + static constexpr std::uint8_t n_vecs = Log10HS::n_vecs; return elementwise_common::unary_contig_impl< argTy, Log10OutputType, Log10ContigFunctor, log10_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp index b8d993dd94..be14d459ee 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp @@ -164,8 +164,8 @@ sycl::event log1p_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using Log1pHS = hyperparam_detail::Log1pContigHyperparameterSet; - constexpr std::uint8_t vec_sz = Log1pHS::vec_sz; - constexpr std::uint8_t n_vecs = Log1pHS::n_vecs; + static constexpr std::uint8_t vec_sz = Log1pHS::vec_sz; + static constexpr std::uint8_t n_vecs = Log1pHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, Log1pOutputType, Log1pContigFunctor, log1p_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp index 42c837cfa3..0548c61f9d 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp @@ -160,8 +160,8 @@ sycl::event log2_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using Log2HS = hyperparam_detail::Log2ContigHyperparameterSet; - constexpr std::uint8_t vec_sz = Log2HS::vec_sz; - constexpr std::uint8_t n_vecs = Log2HS::n_vecs; + static constexpr std::uint8_t vec_sz = Log2HS::vec_sz; + static constexpr std::uint8_t n_vecs = Log2HS::n_vecs; return elementwise_common::unary_contig_impl< argTy, Log2OutputType, Log2ContigFunctor, log2_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp index 0076a0bbe3..cd1dc11e9c 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp @@ -177,8 +177,8 @@ sycl::event logaddexp_contig_impl(sycl::queue &exec_q, { using LogAddExpHS = hyperparam_detail::LogAddExpContigHyperparameterSet; - constexpr std::uint8_t vec_sz = LogAddExpHS::vec_sz; - constexpr std::uint8_t n_vecs = LogAddExpHS::n_vecs; + static constexpr std::uint8_t vec_sz = LogAddExpHS::vec_sz; + static constexpr std::uint8_t n_vecs = LogAddExpHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, LogAddExpOutputType, LogAddExpContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp index 0aa1f61b90..6b91ff6915 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp @@ -203,8 +203,8 @@ 
logical_and_contig_impl(sycl::queue &exec_q, { using LogicalAndHS = hyperparam_detail::LogicalAndContigHyperparameterSet; - constexpr std::uint8_t vec_sz = LogicalAndHS::vec_sz; - constexpr std::uint8_t n_vecs = LogicalAndHS::n_vecs; + static constexpr std::uint8_t vec_sz = LogicalAndHS::vec_sz; + static constexpr std::uint8_t n_vecs = LogicalAndHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, LogicalAndOutputType, LogicalAndContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp index e6d3e6d5ad..d4f9ec671f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp @@ -125,8 +125,8 @@ logical_not_contig_impl(sycl::queue &exec_q, { using LogicalNotHS = hyperparam_detail::LogicalNotContigHyperparameterSet; - constexpr std::uint8_t vec_sz = LogicalNotHS::vec_sz; - constexpr std::uint8_t n_vecs = LogicalNotHS::n_vecs; + static constexpr std::uint8_t vec_sz = LogicalNotHS::vec_sz; + static constexpr std::uint8_t n_vecs = LogicalNotHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, LogicalNotOutputType, LogicalNotContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp index 1fdcd84f60..fdf3d134de 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp @@ -201,8 +201,8 @@ sycl::event logical_or_contig_impl(sycl::queue &exec_q, { using LogicalOrHS = hyperparam_detail::LogicalOrContigHyperparameterSet; - constexpr std::uint8_t vec_sz = LogicalOrHS::vec_sz; - constexpr std::uint8_t n_vecs = LogicalOrHS::n_vecs; + static constexpr std::uint8_t vec_sz = LogicalOrHS::vec_sz; + static constexpr std::uint8_t n_vecs = LogicalOrHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, LogicalOrOutputType, LogicalOrContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp index 0ef3b17dff..b1521a238a 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp @@ -204,8 +204,8 @@ logical_xor_contig_impl(sycl::queue &exec_q, { using LogicalXorHS = hyperparam_detail::LogicalXorContigHyperparameterSet; - constexpr std::uint8_t vec_sz = LogicalXorHS::vec_sz; - constexpr std::uint8_t n_vecs = LogicalXorHS::n_vecs; + static constexpr std::uint8_t vec_sz = LogicalXorHS::vec_sz; + static constexpr std::uint8_t n_vecs = LogicalXorHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, LogicalXorOutputType, LogicalXorContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp index 799cbb1d8c..6fd8faf648 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp @@ -235,8 +235,8 @@ sycl::event maximum_contig_impl(sycl::queue &exec_q, { using MaxHS = hyperparam_detail::MaximumContigHyperparameterSet; - constexpr 
std::uint8_t vec_sz = MaxHS::vec_sz; - constexpr std::uint8_t n_vecs = MaxHS::n_vecs; + static constexpr std::uint8_t vec_sz = MaxHS::vec_sz; + static constexpr std::uint8_t n_vecs = MaxHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, MaximumOutputType, MaximumContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp index 9a672e539f..37b43eb0a0 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp @@ -235,8 +235,8 @@ sycl::event minimum_contig_impl(sycl::queue &exec_q, { using MinHS = hyperparam_detail::MinimumContigHyperparameterSet; - constexpr std::uint8_t vec_sz = MinHS::vec_sz; - constexpr std::uint8_t n_vecs = MinHS::n_vecs; + static constexpr std::uint8_t vec_sz = MinHS::vec_sz; + static constexpr std::uint8_t n_vecs = MinHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, MinimumOutputType, MinimumContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp index ca24383b44..798610f445 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp @@ -227,8 +227,8 @@ sycl::event multiply_contig_impl(sycl::queue &exec_q, { using MulHS = hyperparam_detail::MultiplyContigHyperparameterSet; - constexpr std::uint8_t vec_sz = MulHS::vec_sz; - constexpr std::uint8_t n_vecs = MulHS::n_vecs; + static constexpr std::uint8_t vec_sz = MulHS::vec_sz; + static constexpr std::uint8_t n_vecs = MulHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, MultiplyOutputType, MultiplyContigFunctor, @@ -513,8 +513,8 @@ multiply_inplace_contig_impl(sycl::queue &exec_q, { using MulHS = hyperparam_detail::MultiplyContigHyperparameterSet; - constexpr std::uint8_t vec_sz = MulHS::vec_sz; - constexpr std::uint8_t n_vecs = MulHS::n_vecs; + static constexpr std::uint8_t vec_sz = MulHS::vec_sz; + static constexpr std::uint8_t n_vecs = MulHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, MultiplyInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp index 47707a5f04..4a8fc76bcc 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp @@ -130,8 +130,8 @@ sycl::event negative_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using NegHS = hyperparam_detail::NegativeContigHyperparameterSet; - constexpr std::uint8_t vec_sz = NegHS::vec_sz; - constexpr std::uint8_t n_vecs = NegHS::n_vecs; + static constexpr std::uint8_t vec_sz = NegHS::vec_sz; + static constexpr std::uint8_t n_vecs = NegHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, NegativeOutputType, NegativeContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp index 4a54a115f3..f68e4e8295 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp +++ 
b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp @@ -160,8 +160,8 @@ sycl::event nextafter_contig_impl(sycl::queue &exec_q, { using NextafterHS = hyperparam_detail::NextafterContigHyperparameterSet; - constexpr std::uint8_t vec_sz = NextafterHS::vec_sz; - constexpr std::uint8_t n_vecs = NextafterHS::n_vecs; + static constexpr std::uint8_t vec_sz = NextafterHS::vec_sz; + static constexpr std::uint8_t n_vecs = NextafterHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, NextafterOutputType, NextafterContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp index f21bc678fd..f8f436ec4f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp @@ -216,8 +216,8 @@ sycl::event not_equal_contig_impl(sycl::queue &exec_q, { using NotEqHS = hyperparam_detail::NotEqualContigHyperparameterSet; - constexpr std::uint8_t vec_sz = NotEqHS::vec_sz; - constexpr std::uint8_t n_vecs = NotEqHS::n_vecs; + static constexpr std::uint8_t vec_sz = NotEqHS::vec_sz; + static constexpr std::uint8_t n_vecs = NotEqHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, NotEqualOutputType, NotEqualContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp index df6c04021f..9bc9e5782a 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp @@ -145,8 +145,8 @@ sycl::event positive_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using PosHS = hyperparam_detail::PositiveContigHyperparameterSet; - constexpr std::uint8_t vec_sz = PosHS::vec_sz; - constexpr std::uint8_t n_vecs = PosHS::n_vecs; + static constexpr std::uint8_t vec_sz = PosHS::vec_sz; + static constexpr std::uint8_t n_vecs = PosHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, PositiveOutputType, PositiveContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp index d7b0ed909e..326d83f412 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp @@ -277,8 +277,8 @@ sycl::event pow_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using PowHS = hyperparam_detail::PowContigHyperparameterSet; - constexpr std::uint8_t vec_sz = PowHS::vec_sz; - constexpr std::uint8_t n_vecs = PowHS::n_vecs; + static constexpr std::uint8_t vec_sz = PowHS::vec_sz; + static constexpr std::uint8_t n_vecs = PowHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, PowOutputType, PowContigFunctor, pow_contig_kernel, @@ -522,8 +522,8 @@ pow_inplace_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using PowHS = hyperparam_detail::PowContigHyperparameterSet; - constexpr std::uint8_t vec_sz = PowHS::vec_sz; - constexpr std::uint8_t n_vecs = PowHS::n_vecs; + static constexpr std::uint8_t vec_sz = PowHS::vec_sz; + static constexpr std::uint8_t n_vecs = PowHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, PowInplaceContigFunctor, 
pow_inplace_contig_kernel, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp index df5edface1..a9d630e762 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp @@ -149,8 +149,8 @@ sycl::event proj_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using ProjHS = hyperparam_detail::ProjContigHyperparameterSet; - constexpr std::uint8_t vec_sz = ProjHS::vec_sz; - constexpr std::uint8_t n_vecs = ProjHS::n_vecs; + static constexpr std::uint8_t vec_sz = ProjHS::vec_sz; + static constexpr std::uint8_t n_vecs = ProjHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, ProjOutputType, ProjContigFunctor, proj_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp index 9ecb822a20..a8b5c719a0 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp @@ -149,8 +149,8 @@ sycl::event real_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using RealHS = hyperparam_detail::RealContigHyperparameterSet; - constexpr std::uint8_t vec_sz = RealHS::vec_sz; - constexpr std::uint8_t n_vecs = RealHS::n_vecs; + static constexpr std::uint8_t vec_sz = RealHS::vec_sz; + static constexpr std::uint8_t n_vecs = RealHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, RealOutputType, RealContigFunctor, real_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp index 0e46acba39..a320aa3181 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp @@ -145,8 +145,8 @@ sycl::event reciprocal_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using RecipHS = hyperparam_detail::ReciprocalContigHyperparameterSet; - constexpr std::uint8_t vec_sz = RecipHS::vec_sz; - constexpr std::uint8_t n_vecs = RecipHS::n_vecs; + static constexpr std::uint8_t vec_sz = RecipHS::vec_sz; + static constexpr std::uint8_t n_vecs = RecipHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, ReciprocalOutputType, ReciprocalContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp index 6d3baf0d1f..028b078c6b 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp @@ -262,8 +262,8 @@ sycl::event remainder_contig_impl(sycl::queue &exec_q, { using RemHS = hyperparam_detail::RemainderContigHyperparameterSet; - constexpr std::uint8_t vec_sz = RemHS::vec_sz; - constexpr std::uint8_t n_vecs = RemHS::n_vecs; + static constexpr std::uint8_t vec_sz = RemHS::vec_sz; + static constexpr std::uint8_t n_vecs = RemHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, RemainderOutputType, RemainderContigFunctor, @@ -495,8 +495,8 @@ remainder_inplace_contig_impl(sycl::queue &exec_q, { using RemHS = hyperparam_detail::RemainderContigHyperparameterSet; - constexpr 
std::uint8_t vec_sz = RemHS::vec_sz; - constexpr std::uint8_t n_vecs = RemHS::n_vecs; + static constexpr std::uint8_t vec_sz = RemHS::vec_sz; + static constexpr std::uint8_t n_vecs = RemHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, RemainderInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp index 7fbb20ae32..f83349d6e5 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp @@ -156,8 +156,8 @@ sycl::event round_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using RoundHS = hyperparam_detail::RoundContigHyperparameterSet; - constexpr std::uint8_t vec_sz = RoundHS::vec_sz; - constexpr std::uint8_t n_vecs = RoundHS::n_vecs; + static constexpr std::uint8_t vec_sz = RoundHS::vec_sz; + static constexpr std::uint8_t n_vecs = RoundHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, RoundOutputType, RoundContigFunctor, round_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp index 0450246807..ba78e19fa9 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp @@ -127,8 +127,8 @@ sycl::event rsqrt_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using RsqrtHS = hyperparam_detail::RsqrtContigHyperparameterSet; - constexpr std::uint8_t vec_sz = RsqrtHS::vec_sz; - constexpr std::uint8_t n_vecs = RsqrtHS::n_vecs; + static constexpr std::uint8_t vec_sz = RsqrtHS::vec_sz; + static constexpr std::uint8_t n_vecs = RsqrtHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, RsqrtOutputType, RsqrtContigFunctor, rsqrt_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp index baa224942f..97c3305c7f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp @@ -168,8 +168,8 @@ sycl::event sign_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using SignHS = hyperparam_detail::SignContigHyperparameterSet; - constexpr std::uint8_t vec_sz = SignHS::vec_sz; - constexpr std::uint8_t n_vecs = SignHS::n_vecs; + static constexpr std::uint8_t vec_sz = SignHS::vec_sz; + static constexpr std::uint8_t n_vecs = SignHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, SignOutputType, SignContigFunctor, sign_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp index 9020bceb7e..01d25453cd 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp @@ -134,8 +134,8 @@ sycl::event signbit_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using SignbitHS = hyperparam_detail::SignbitContigHyperparameterSet; - constexpr std::uint8_t vec_sz = SignbitHS::vec_sz; - constexpr std::uint8_t n_vecs = SignbitHS::n_vecs; + static constexpr std::uint8_t vec_sz = SignbitHS::vec_sz; + 
static constexpr std::uint8_t n_vecs = SignbitHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, SignbitOutputType, SignbitContigFunctor, signbit_contig_kernel, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp index e075a90a88..eede1e82c9 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp @@ -70,7 +70,8 @@ template <typename argT, typename resT> struct SinFunctor if constexpr (is_complex<argT>::value) { using realT = typename argT::value_type; - constexpr realT q_nan = std::numeric_limits<realT>::quiet_NaN(); + static constexpr realT q_nan = + std::numeric_limits<realT>::quiet_NaN(); realT const &in_re = std::real(in); realT const &in_im = std::imag(in); @@ -247,8 +248,8 @@ sycl::event sin_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using SinHS = hyperparam_detail::SinContigHyperparameterSet; - constexpr std::uint8_t vec_sz = SinHS::vec_sz; - constexpr std::uint8_t n_vecs = SinHS::n_vecs; + static constexpr std::uint8_t vec_sz = SinHS::vec_sz; + static constexpr std::uint8_t n_vecs = SinHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, SinOutputType, SinContigFunctor, sin_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp index 23b3588a3b..e26631d5dc 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp @@ -216,8 +216,8 @@ sycl::event sinh_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using SinhHS = hyperparam_detail::SinhContigHyperparameterSet; - constexpr std::uint8_t vec_sz = SinhHS::vec_sz; - constexpr std::uint8_t n_vecs = SinhHS::n_vecs; + static constexpr std::uint8_t vec_sz = SinhHS::vec_sz; + static constexpr std::uint8_t n_vecs = SinhHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, SinhOutputType, SinhContigFunctor, sinh_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp index b83ff72495..ed80521acd 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp @@ -142,8 +142,8 @@ sycl::event sqrt_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using SqrtHS = hyperparam_detail::SqrtContigHyperparameterSet; - constexpr std::uint8_t vec_sz = SqrtHS::vec_sz; - constexpr std::uint8_t n_vecs = SqrtHS::n_vecs; + static constexpr std::uint8_t vec_sz = SqrtHS::vec_sz; + static constexpr std::uint8_t n_vecs = SqrtHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, SqrtOutputType, SqrtContigFunctor, sqrt_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp index f9d9d848c0..7aa12452cc 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp @@ -167,8 +167,8 @@ sycl::event square_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using SquareHS = hyperparam_detail::SquareContigHyperparameterSet; - constexpr 
std::uint8_t vec_sz = SquareHS::vec_sz; - constexpr std::uint8_t n_vecs = SquareHS::n_vecs; + static constexpr std::uint8_t vec_sz = SquareHS::vec_sz; + static constexpr std::uint8_t n_vecs = SquareHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, SquareOutputType, SquareContigFunctor, square_contig_kernel, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp index 51a3955142..ee817c2941 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp @@ -213,8 +213,8 @@ sycl::event subtract_contig_impl(sycl::queue &exec_q, { using SubHS = hyperparam_detail::SubtractContigHyperparameterSet; - constexpr std::uint8_t vec_sz = SubHS::vec_sz; - constexpr std::uint8_t n_vecs = SubHS::n_vecs; + static constexpr std::uint8_t vec_sz = SubHS::vec_sz; + static constexpr std::uint8_t n_vecs = SubHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, SubtractOutputType, SubtractContigFunctor, @@ -511,8 +511,8 @@ subtract_inplace_contig_impl(sycl::queue &exec_q, { using SubHS = hyperparam_detail::SubtractContigHyperparameterSet; - constexpr std::uint8_t vec_sz = SubHS::vec_sz; - constexpr std::uint8_t n_vecs = SubHS::n_vecs; + static constexpr std::uint8_t vec_sz = SubHS::vec_sz; + static constexpr std::uint8_t n_vecs = SubHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, SubtractInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp index 770518f918..fc67ca9a25 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp @@ -73,7 +73,8 @@ template <typename argT, typename resT> struct TanFunctor using realT = typename argT::value_type; - constexpr realT q_nan = std::numeric_limits<realT>::quiet_NaN(); + static constexpr realT q_nan = + std::numeric_limits<realT>::quiet_NaN(); /* * since tan(in) = -I * tanh(I * in), for special cases, * we calculate real and imaginary parts of z = tanh(I * in) and @@ -191,8 +192,8 @@ sycl::event tan_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using TanHS = hyperparam_detail::TanContigHyperparameterSet; - constexpr std::uint8_t vec_sz = TanHS::vec_sz; - constexpr std::uint8_t n_vecs = TanHS::n_vecs; + static constexpr std::uint8_t vec_sz = TanHS::vec_sz; + static constexpr std::uint8_t n_vecs = TanHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, TanOutputType, TanContigFunctor, tan_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp index 1d06fd3c4f..dda2e2914f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp @@ -73,7 +73,8 @@ template <typename argT, typename resT> struct TanhFunctor if constexpr (is_complex<argT>::value) { using realT = typename argT::value_type; - constexpr realT q_nan = std::numeric_limits<realT>::quiet_NaN(); + static constexpr realT q_nan = + std::numeric_limits<realT>::quiet_NaN(); const realT x = std::real(in); const realT y = std::imag(in); @@ -185,8 +186,8 @@ sycl::event tanh_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using TanhHS = 
hyperparam_detail::TanhContigHyperparameterSet; - constexpr std::uint8_t vec_sz = TanhHS::vec_sz; - constexpr std::uint8_t n_vecs = TanhHS::n_vecs; + static constexpr std::uint8_t vec_sz = TanhHS::vec_sz; + static constexpr std::uint8_t n_vecs = TanhHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, TanhOutputType, TanhContigFunctor, tanh_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp index de6c9a8723..1aa3fbd482 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp @@ -221,8 +221,8 @@ true_divide_contig_impl(sycl::queue &exec_q, { using DivHS = hyperparam_detail::TrueDivideContigHyperparameterSet; - constexpr std::uint8_t vec_sz = DivHS::vec_sz; - constexpr std::uint8_t n_vecs = DivHS::n_vecs; + static constexpr std::uint8_t vec_sz = DivHS::vec_sz; + static constexpr std::uint8_t n_vecs = DivHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, TrueDivideOutputType, TrueDivideContigFunctor, @@ -540,8 +540,8 @@ true_divide_inplace_contig_impl(sycl::queue &exec_q, { using DivHS = hyperparam_detail::TrueDivideContigHyperparameterSet; - constexpr std::uint8_t vec_sz = DivHS::vec_sz; - constexpr std::uint8_t n_vecs = DivHS::vec_sz; + static constexpr std::uint8_t vec_sz = DivHS::vec_sz; + static constexpr std::uint8_t n_vecs = DivHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, TrueDivideInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp index 4c776e3560..61a37ca1b2 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp @@ -143,8 +143,8 @@ sycl::event trunc_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using TruncHS = hyperparam_detail::TruncContigHyperparameterSet; - constexpr std::uint8_t vec_sz = TruncHS::vec_sz; - constexpr std::uint8_t n_vecs = TruncHS::n_vecs; + static constexpr std::uint8_t vec_sz = TruncHS::vec_sz; + static constexpr std::uint8_t n_vecs = TruncHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, TruncOutputType, TruncContigFunctor, trunc_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp index 285ae3575a..75807d3446 100644 --- a/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -96,7 +96,7 @@ class TakeFunctor ssize_t src_offset = orthog_offsets.get_first_offset(); ssize_t dst_offset = orthog_offsets.get_second_offset(); - constexpr ProjectorT proj{}; + static constexpr ProjectorT proj{}; for (int axis_idx = 0; axis_idx < k_; ++axis_idx) { indT *ind_data = reinterpret_cast(ind_[axis_idx]); @@ -242,7 +242,7 @@ class PutFunctor ssize_t dst_offset = orthog_offsets.get_first_offset(); ssize_t val_offset = orthog_offsets.get_second_offset(); - constexpr ProjectorT proj{}; + static constexpr ProjectorT proj{}; for (int axis_idx = 0; axis_idx < k_; ++axis_idx) { indT *ind_data = reinterpret_cast(ind_[axis_idx]); diff --git 
a/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp b/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp index 71e2c15b6b..ceec04c0a1 100644 --- a/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp +++ b/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp @@ -507,7 +507,7 @@ sycl::event dot_product_impl(sycl::queue &exec_q, reduction_rhs_offset, reduction_shape_stride}; - constexpr std::size_t preferred_reductions_per_wi = + static constexpr std::size_t preferred_reductions_per_wi = 4; // determined experimentally std::size_t reductions_per_wi = (reduction_nelems < preferred_reductions_per_wi * wg) @@ -584,8 +584,8 @@ dot_product_contig_impl(sycl::queue &exec_q, /* step */ reduction_nelems}; const InputOutputBatchIndexerT inp_out_batch_indexer{ inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}}; - constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{}, - NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; sycl::event dot_ev = sequential_dot_product::value; // more than one work-groups is needed, requires a temporary @@ -1053,7 +1053,7 @@ sycl::event dot_product_tree_impl(sycl::queue &exec_q, const RhsIndexerT rhs_indexer( batch_nd, batch_rhs_offset, batch_shape_and_strides, batch_shape_and_strides + 2 * batch_nd); - constexpr ResIndexerT noop_tmp_indexer{}; + static constexpr ResIndexerT noop_tmp_indexer{}; const InputOutputBatchIndexerT in_out_iter_indexer{ lhs_indexer, rhs_indexer, noop_tmp_indexer}; @@ -1093,11 +1093,11 @@ sycl::event dot_product_tree_impl(sycl::queue &exec_q, const InputIndexerT inp_indexer{/* size */ batches, /* step */ reduction_groups_}; - constexpr ResIndexerT res_iter_indexer{}; + static constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; sycl::event partial_reduction_ev = dpctl::tensor::kernels::submit_no_atomic_reduction< @@ -1130,7 +1130,7 @@ sycl::event dot_product_tree_impl(sycl::queue &exec_q, const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; wg = max_wg; reductions_per_wi = std::max( @@ -1198,8 +1198,8 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, /* step */ reduction_nelems}; const InputOutputBatchIndexerT inp_out_batch_indexer{ inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}}; - constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{}, - NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; sycl::event dot_ev = sequential_dot_product::value; // more than one work-groups is needed, requires a temporary @@ -1300,8 +1300,8 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, /* step */ reduction_nelems}; const InputOutputBatchIndexerT inp_out_batch_indexer{ inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}}; - constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{}, - NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{ + NoOpIndexerT{}, NoOpIndexerT{}}; first_reduction_ev = submit_no_atomic_dot_product< lhsTy, rhsTy, resTy, ReductionOpT, InputOutputBatchIndexerT, @@ -1335,11 +1335,11 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, const InputIndexerT inp_indexer{/* size */ batches, /* step */ 
reduction_groups_}; - constexpr ResIndexerT res_iter_indexer{}; + static constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; sycl::event partial_reduction_ev = dpctl::tensor::kernels::submit_no_atomic_reduction< @@ -1365,11 +1365,11 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, const InputIndexerT inp_indexer{/* size */ batches, /* step */ remaining_reduction_nelems}; - constexpr ResIndexerT res_iter_indexer{}; + static constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; wg = max_wg; reductions_per_wi = std::max( diff --git a/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp b/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp index 4ad4eb142a..2b5a42af19 100644 --- a/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp +++ b/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp @@ -59,7 +59,7 @@ void scale_gemm_k_parameters(const std::size_t &local_mem_size, std::size_t &n_wi, std::size_t &delta_n) { - constexpr std::size_t slm_elem_size = sizeof(T) * m_groups; + static constexpr std::size_t slm_elem_size = sizeof(T) * m_groups; while (slm_elem_size * (n_wi + delta_n) * delta_k + reserved_slm_size >= local_mem_size) @@ -79,8 +79,8 @@ void scale_gemm_nm_parameters(const std::size_t &local_mem_size, std::size_t &wg_delta_n, std::size_t &wg_delta_m) { - constexpr std::size_t slm_A_elem_size = sizeof(T); - constexpr std::size_t slm_B_elem_size = sizeof(T) * wi_delta_m; + static constexpr std::size_t slm_A_elem_size = sizeof(T); + static constexpr std::size_t slm_B_elem_size = sizeof(T) * wi_delta_m; while ((wi_delta_n * wg_delta_n * wi_delta_k * slm_A_elem_size) + (wi_delta_k * wg_delta_m * slm_B_elem_size) + @@ -210,8 +210,8 @@ single_reduction_for_gemm_contig(sycl::queue &exec_q, NoOpIndexerT, NoOpIndexerT>; using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; - constexpr InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, - NoOpIndexerT{}}; + static constexpr InputOutputIterIndexerT in_out_iter_indexer{ + NoOpIndexerT{}, NoOpIndexerT{}}; // tmp allocation is a C-contiguous matrix (reduction_nelems, // iter_nelems) and we are reducing by axis 0 const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, @@ -239,8 +239,8 @@ single_reduction_for_gemm_contig(sycl::queue &exec_q, NoOpIndexerT, NoOpIndexerT>; using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; - constexpr InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, - NoOpIndexerT{}}; + static constexpr InputOutputIterIndexerT in_out_iter_indexer{ + NoOpIndexerT{}, NoOpIndexerT{}}; // tmp allocation is a C-contiguous matrix // (reduction_nelems, iter_nelems). 
Reducing along axis 0 const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, @@ -294,8 +294,8 @@ sycl::event tree_reduction_for_gemm(sycl::queue &exec_q, NoOpIndexerT, NoOpIndexerT>; using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; - constexpr InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, - NoOpIndexerT{}}; + static constexpr InputOutputIterIndexerT in_out_iter_indexer{ + NoOpIndexerT{}, NoOpIndexerT{}}; // partially_reduced_tmp is C-contig matrix with shape // (reduction_nelems, iter_nelems). Reducing along axis 0. const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, @@ -331,12 +331,12 @@ sycl::event tree_reduction_for_gemm(sycl::queue &exec_q, const InputIndexerT inp_indexer{/* size */ iter_nelems, /* step */ reduction_groups_}; - constexpr ResIndexerT res_iter_indexer{}; + static constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; sycl::event partial_reduction_ev = dpctl::tensor::kernels::submit_no_atomic_reduction< @@ -369,7 +369,7 @@ sycl::event tree_reduction_for_gemm(sycl::queue &exec_q, const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; wg = max_wg; reductions_per_wi = @@ -416,8 +416,8 @@ tree_reduction_for_gemm_contig(sycl::queue &exec_q, NoOpIndexerT>; using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; - constexpr InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, - NoOpIndexerT{}}; + static constexpr InputOutputIterIndexerT in_out_iter_indexer{ + NoOpIndexerT{}, NoOpIndexerT{}}; const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, /* step */ iter_nelems}; @@ -454,12 +454,12 @@ tree_reduction_for_gemm_contig(sycl::queue &exec_q, // along the stack axis const InputIndexerT inp_indexer{/* size */ iter_nelems, /* step */ reduction_groups_}; - constexpr ResIndexerT res_iter_indexer{}; + static constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; sycl::event partial_reduction_ev = dpctl::tensor::kernels::submit_no_atomic_reduction< @@ -487,11 +487,11 @@ tree_reduction_for_gemm_contig(sycl::queue &exec_q, const InputIndexerT inp_indexer{ /* size */ iter_nelems, /* step */ remaining_reduction_nelems}; - constexpr ResIndexerT res_iter_indexer{}; + static constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; wg = max_wg; reductions_per_wi = std::max( @@ -615,7 +615,7 @@ class GemmBatchFunctorThreadK using accV_t = typename LocAccT::value_type; - constexpr resT identity_ = resT(0); + static constexpr resT identity_ = resT(0); if (local_i == 0) { for (std::size_t q = 0; q < n_wi * delta_k; q += delta_k) { const std::size_t sq = s + q; @@ -651,7 +651,7 @@ class GemmBatchFunctorThreadK std::size_t global_s_offset = i * k + t_shift; accV_t private_sum(identity_); - constexpr accV_t vec_identity_(identity_); + static constexpr accV_t vec_identity_(identity_); for (std::size_t t = local_s; t < 
local_B_block.size(); t += delta_k) { private_sum += ((i < n) && (t + t_shift < k)) @@ -749,7 +749,7 @@ sycl::event _gemm_k_impl(sycl::queue &exec_q, const ResIndexerT &res_indexer, const std::vector &depends) { - constexpr std::size_t m_groups = 4; + static constexpr std::size_t m_groups = 4; const std::size_t delta_k(4); std::size_t n_wi(64); std::size_t delta_n(32); @@ -823,7 +823,7 @@ sycl::event _gemm_small_m_impl(sycl::queue &exec_q, const ResIndexerT &res_indexer, const std::vector &depends) { - constexpr std::size_t m_groups = 1; + static constexpr std::size_t m_groups = 1; const std::size_t delta_k(4); std::size_t n_wi(64); std::size_t delta_n(32); @@ -942,8 +942,9 @@ class GemmBatchFunctorThreadNM_vecm void operator()(sycl::nd_item<1> it) const { - constexpr resT zero_(0); - constexpr std::uint32_t wi_total_delta_m = wi_delta_m_vecs * m_vec_size; + static constexpr resT zero_(0); + static constexpr std::uint32_t wi_total_delta_m = + wi_delta_m_vecs * m_vec_size; const std::size_t gws_per_batch = it.get_group_range(0) / batch_nelems; const std::size_t batch_id = it.get_group_linear_id() / gws_per_batch; @@ -987,7 +988,7 @@ class GemmBatchFunctorThreadNM_vecm // allocate/initialize private matrix C // size ( wi_total_delta_n, wi_total_delta_m ) - constexpr std::uint32_t C_size = wi_delta_n * wi_delta_m_vecs; + static constexpr std::uint32_t C_size = wi_delta_n * wi_delta_m_vecs; std::array private_C{slmB_t{zero_}}; for (std::size_t s = 0; s < k; s += wi_delta_k) { @@ -1281,16 +1282,18 @@ sycl::event _gemm_batch_nm_impl(sycl::queue &exec_q, const ResIndexerT &res_indexer, std::vector const &depends) { - constexpr GemmBatchFunctorThreadNM_vecm_HyperParametersSelector + static constexpr GemmBatchFunctorThreadNM_vecm_HyperParametersSelector< + resTy> selector{}; - constexpr auto hyper_params = selector.get(); + static constexpr auto hyper_params = selector.get(); - constexpr std::uint32_t wi_delta_n = hyper_params.get_wi_delta_n(); - constexpr std::uint32_t wi_delta_m_vecs = + static constexpr std::uint32_t wi_delta_n = hyper_params.get_wi_delta_n(); + static constexpr std::uint32_t wi_delta_m_vecs = hyper_params.get_wi_delta_m_vecs(); - constexpr std::uint32_t m_vec_size = hyper_params.get_m_vec_size(); + static constexpr std::uint32_t m_vec_size = hyper_params.get_m_vec_size(); - constexpr std::uint32_t wi_total_delta_m = wi_delta_m_vecs * m_vec_size; + static constexpr std::uint32_t wi_total_delta_m = + wi_delta_m_vecs * m_vec_size; using KernelName = class gemm_batch_nm_vecm_krn(dev); // Limit work-group size - constexpr std::size_t wg_sz_limit(2048); + static constexpr std::size_t wg_sz_limit(2048); const std::size_t max_wg_sz = std::min(wg_sz_limit, k_wg_sz); const std::uint32_t max_subgroups_per_wg = @@ -1422,9 +1425,9 @@ sycl::event gemm_impl(sycl::queue &exec_q, const OuterInnerIndexerT res_indexer(res_outer_nd, 0, res_shape_strides); using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; - constexpr BatchIndexerT batch_indexer{}; + static constexpr BatchIndexerT batch_indexer{}; - constexpr std::size_t single_batch_nelems = 1; + static constexpr std::size_t single_batch_nelems = 1; const std::size_t min_nm = std::min(n, m); const std::size_t max_nm = std::max(n, m); @@ -1503,14 +1506,14 @@ sycl::event gemm_contig_impl(sycl::queue &exec_q, resTy *res_tp = reinterpret_cast(res_cp); using OuterInnerIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr OuterInnerIndexerT lhs_indexer{}; - constexpr OuterInnerIndexerT rhs_indexer{}; - constexpr 
OuterInnerIndexerT res_indexer{}; + static constexpr OuterInnerIndexerT lhs_indexer{}; + static constexpr OuterInnerIndexerT rhs_indexer{}; + static constexpr OuterInnerIndexerT res_indexer{}; using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; - constexpr BatchIndexerT batch_indexer{}; + static constexpr BatchIndexerT batch_indexer{}; - constexpr std::size_t single_batch_nelems = 1; + static constexpr std::size_t single_batch_nelems = 1; const std::size_t min_nm = std::min(n, m); const std::size_t max_nm = std::max(n, m); @@ -1712,9 +1715,9 @@ sycl::event gemm_batch_contig_impl(sycl::queue &exec_q, resTy *res_tp = reinterpret_cast(res_cp) + res_batch_offset; using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr OuterInnerDimsIndexerT lhs_indexer{}; - constexpr OuterInnerDimsIndexerT rhs_indexer{}; - constexpr OuterInnerDimsIndexerT res_indexer{}; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT res_indexer{}; using dpctl::tensor::offset_utils::Strided1DIndexer; using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; @@ -1941,7 +1944,7 @@ class GemmBatchNoAtomicFunctorThreadNM const std::size_t a_offset = local_i * wi_delta_k * wi_delta_n; const std::size_t b_offset = local_j * wi_delta_k; - constexpr resT identity_(0); + static constexpr resT identity_(0); for (std::uint8_t private_i = 0; private_i < wi_delta_n; ++private_i) { const std::size_t a_pr_offset = private_i * wi_delta_k; @@ -2076,7 +2079,7 @@ class GemmBatchNoAtomicFunctorThreadK using accV_t = typename LocAccT::value_type; - constexpr resT identity_ = resT(0); + static constexpr resT identity_ = resT(0); if (local_i == 0) { for (std::size_t q = 0; q < n_wi * delta_k; q += delta_k) { std::size_t sq = s + q; @@ -2112,7 +2115,7 @@ class GemmBatchNoAtomicFunctorThreadK std::size_t global_s_offset = i * k + t_shift; accV_t private_sum(identity_); - constexpr accV_t vec_identity_(identity_); + static constexpr accV_t vec_identity_(identity_); for (std::size_t t = local_s; t < local_B_block.size(); t += delta_k) { private_sum += ((i < n) && (t + t_shift < k)) @@ -2315,7 +2318,7 @@ gemm_batch_tree_k_impl(sycl::queue &exec_q, typename std::conditional, sycl::logical_or, sycl::plus>::type; - constexpr resTy identity_val = + static constexpr resTy identity_val = sycl::known_identity::value; std::size_t iter_nelems = batch_nelems * n * m; @@ -2329,7 +2332,7 @@ gemm_batch_tree_k_impl(sycl::queue &exec_q, dev.get_info(); std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr std::size_t preferred_reductions_per_wi = 4; + static constexpr std::size_t preferred_reductions_per_wi = 4; std::size_t reductions_per_wi(preferred_reductions_per_wi); std::size_t reduction_groups = @@ -2337,7 +2340,7 @@ gemm_batch_tree_k_impl(sycl::queue &exec_q, (preferred_reductions_per_wi * wg); // max_max_wg prevents running out of resources on CPU - constexpr std::size_t max_max_wg = 2048; + static constexpr std::size_t max_max_wg = 2048; std::size_t max_wg = std::min( max_max_wg, dev.get_info() / 2); @@ -2355,7 +2358,7 @@ gemm_batch_tree_k_impl(sycl::queue &exec_q, inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides); const OuterInnerDimsIndexerT rhs_indexer( inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides); - constexpr TmpIndexerT res_indexer{}; + static constexpr TmpIndexerT res_indexer{}; using dpctl::tensor::offset_utils::Strided1DIndexer; 
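The hunks above and below all repeat one transformation: function-local constants gain static storage duration. A minimal sketch of the distinction, using hypothetical names rather than anything from this patch:

    #include <cstddef>

    // Minimal sketch (not dpctl code): a plain constexpr local is an
    // automatic-storage object created conceptually on every call, and if a
    // SYCL kernel lambda captures it, it can travel with the kernel object
    // into device memory; a static constexpr local is a single
    // constant-initialized object that a lambda can read without capturing.
    std::size_t count_up_plain()
    {
        constexpr std::size_t limit = 64; // new object each invocation
        std::size_t acc = 0;
        for (std::size_t i = 0; i < limit; ++i)
            acc += i;
        return acc;
    }

    std::size_t count_up_static()
    {
        static constexpr std::size_t limit = 64; // one object, static storage
        std::size_t acc = 0;
        for (std::size_t i = 0; i < limit; ++i)
            acc += i;
        return acc;
    }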
using dpctl::tensor::offset_utils::StridedIndexer; @@ -2419,7 +2422,7 @@ gemm_batch_tree_k_impl(sycl::queue &exec_q, inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides); const OuterInnerDimsIndexerT rhs_indexer( inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides); - constexpr TmpIndexerT res_indexer{}; + static constexpr TmpIndexerT res_indexer{}; using dpctl::tensor::offset_utils::Strided1DIndexer; using dpctl::tensor::offset_utils::StridedIndexer; using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; @@ -2564,7 +2567,7 @@ gemm_batch_tree_nm_impl(sycl::queue &exec_q, const ssize_t *res_shape_strides, std::vector const &depends) { - constexpr int wi_delta_n = 2; + static constexpr int wi_delta_n = 2; std::size_t wg_delta_n(16); // rows of A processed in WG std::size_t wg_delta_m(16); // rows of B processed in WG std::size_t wi_delta_k(64); // Elements in K dimension processed by WI @@ -2611,7 +2614,7 @@ gemm_batch_tree_nm_impl(sycl::queue &exec_q, typename std::conditional, sycl::logical_or, sycl::plus>::type; - constexpr resTy identity_val = + static constexpr resTy identity_val = sycl::known_identity::value; std::size_t iter_nelems = batch_nelems * n * m; std::size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k; @@ -2623,7 +2626,7 @@ gemm_batch_tree_nm_impl(sycl::queue &exec_q, dev.get_info(); std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr std::size_t preferred_reductions_per_wi = 4; + static constexpr std::size_t preferred_reductions_per_wi = 4; std::size_t reductions_per_wi(preferred_reductions_per_wi); std::size_t reduction_groups = @@ -2645,7 +2648,7 @@ gemm_batch_tree_nm_impl(sycl::queue &exec_q, inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides); const OuterInnerDimsIndexerT rhs_indexer( inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides); - constexpr TmpIndexerT res_indexer{}; + static constexpr TmpIndexerT res_indexer{}; using dpctl::tensor::offset_utils::Strided1DIndexer; using dpctl::tensor::offset_utils::StridedIndexer; @@ -2708,7 +2711,7 @@ gemm_batch_tree_nm_impl(sycl::queue &exec_q, inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides); const OuterInnerDimsIndexerT rhs_indexer( inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides); - constexpr TmpIndexerT res_indexer{}; + static constexpr TmpIndexerT res_indexer{}; using dpctl::tensor::offset_utils::Strided1DIndexer; using dpctl::tensor::offset_utils::StridedIndexer; @@ -2866,7 +2869,7 @@ sycl::event gemm_batch_tree_impl(sycl::queue &exec_q, using dpctl::tensor::type_utils::is_complex; if constexpr (!is_complex::value) { if (m < 4) { - constexpr std::uint32_t m_groups_one = 1; + static constexpr std::uint32_t m_groups_one = 1; return gemm_batch_tree_k_impl( exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, @@ -2877,7 +2880,7 @@ sycl::event gemm_batch_tree_impl(sycl::queue &exec_q, res_outer_shapes_strides, res_shape_strides, depends); } else { - constexpr std::uint32_t m_groups_four = 4; + static constexpr std::uint32_t m_groups_four = 4; return gemm_batch_tree_k_impl( exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, @@ -2889,7 +2892,7 @@ sycl::event gemm_batch_tree_impl(sycl::queue &exec_q, } } else { - constexpr std::uint32_t m_groups_one = 1; + static constexpr std::uint32_t m_groups_one = 1; return gemm_batch_tree_k_impl( exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_nd, batch_shape_strides, lhs_batch_offset, rhs_batch_offset, @@ -2902,7 +2905,7 @@ sycl::event gemm_batch_tree_impl(sycl::queue 
&exec_q, else { // m > 1, n > k or m > k using dpctl::tensor::type_utils::is_complex; if constexpr (!is_complex::value) { - constexpr std::uint32_t m_groups_four = 4; + static constexpr std::uint32_t m_groups_four = 4; return gemm_batch_tree_nm_impl( exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_nd, batch_shape_strides, lhs_batch_offset, rhs_batch_offset, @@ -2912,7 +2915,7 @@ sycl::event gemm_batch_tree_impl(sycl::queue &exec_q, res_outer_shapes_strides, res_shape_strides, depends); } else { // m > 1, n > k or m > k, resTy complex - constexpr std::uint32_t m_groups_one = 1; + static constexpr std::uint32_t m_groups_one = 1; return gemm_batch_tree_nm_impl( exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_nd, batch_shape_strides, lhs_batch_offset, rhs_batch_offset, @@ -2953,9 +2956,9 @@ gemm_batch_contig_tree_k_impl(sycl::queue &exec_q, if (k <= (delta_k * n_wi)) { using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr OuterInnerDimsIndexerT lhs_indexer{}; - constexpr OuterInnerDimsIndexerT rhs_indexer{}; - constexpr OuterInnerDimsIndexerT res_indexer{}; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT res_indexer{}; using dpctl::tensor::offset_utils::Strided1DIndexer; using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; @@ -2984,7 +2987,7 @@ gemm_batch_contig_tree_k_impl(sycl::queue &exec_q, typename std::conditional, sycl::logical_or, sycl::plus>::type; - constexpr resTy identity_val = + static constexpr resTy identity_val = sycl::known_identity::value; std::size_t iter_nelems = batch_nelems * n * m; @@ -2998,7 +3001,7 @@ gemm_batch_contig_tree_k_impl(sycl::queue &exec_q, dev.get_info(); std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr std::size_t preferred_reductions_per_wi = 4; + static constexpr std::size_t preferred_reductions_per_wi = 4; std::size_t reductions_per_wi(preferred_reductions_per_wi); std::size_t reduction_groups = @@ -3015,9 +3018,9 @@ gemm_batch_contig_tree_k_impl(sycl::queue &exec_q, using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr OuterInnerDimsIndexerT lhs_indexer{}; - constexpr OuterInnerDimsIndexerT rhs_indexer{}; - constexpr OuterInnerDimsIndexerT tmp_indexer{}; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT tmp_indexer{}; using dpctl::tensor::offset_utils::Strided1DIndexer; using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; using BatchDimsIndexerT = @@ -3068,9 +3071,9 @@ gemm_batch_contig_tree_k_impl(sycl::queue &exec_q, using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr OuterInnerDimsIndexerT lhs_indexer{}; - constexpr OuterInnerDimsIndexerT rhs_indexer{}; - constexpr OuterInnerDimsIndexerT tmp_indexer{}; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT tmp_indexer{}; using dpctl::tensor::offset_utils::Strided1DIndexer; using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; using BatchDimsIndexerT = @@ -3120,7 +3123,7 @@ gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q, std::size_t m, std::vector const &depends) { - constexpr int wi_delta_n = 2; + static constexpr int wi_delta_n = 2; std::size_t wg_delta_n(16); // rows of A processed in WG 
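The m_groups branches above reduce to a small rule. A hypothetical condensation (choose_m_groups is not a dpctl function), covering the k-oriented branch of gemm_batch_tree_impl:

    #include <complex>
    #include <cstddef>
    #include <cstdint>
    #include <type_traits>

    // Complex result types always use vector width 1 (sycl::vec cannot hold
    // complex elements); real types use width 1 for very small m and 4
    // otherwise.
    template <typename resTy>
    std::uint32_t choose_m_groups(std::size_t m)
    {
        constexpr bool is_cplx =
            std::is_same_v<resTy, std::complex<float>> ||
            std::is_same_v<resTy, std::complex<double>>;
        if (is_cplx) {
            return 1;
        }
        return (m < 4) ? 1 : 4;
    }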
std::size_t wg_delta_m(16); // rows of B processed in WG std::size_t wi_delta_k(64); // Elements in K dimension processed by WI @@ -3142,9 +3145,9 @@ gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q, // temp memory if only one group is needed if (k <= wi_delta_k) { using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr OuterInnerDimsIndexerT lhs_indexer{}; - constexpr OuterInnerDimsIndexerT rhs_indexer{}; - constexpr OuterInnerDimsIndexerT res_indexer{}; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT res_indexer{}; using dpctl::tensor::offset_utils::Strided1DIndexer; using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; @@ -3172,7 +3175,7 @@ gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q, typename std::conditional, sycl::logical_or, sycl::plus>::type; - constexpr resTy identity_val = + static constexpr resTy identity_val = sycl::known_identity::value; std::size_t iter_nelems = batch_nelems * n * m; std::size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k; @@ -3184,7 +3187,7 @@ gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q, dev.get_info(); std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr std::size_t preferred_reductions_per_wi = 4; + static constexpr std::size_t preferred_reductions_per_wi = 4; std::size_t reductions_per_wi(preferred_reductions_per_wi); std::size_t reduction_groups = @@ -3202,9 +3205,9 @@ gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q, using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr OuterInnerDimsIndexerT lhs_indexer{}; - constexpr OuterInnerDimsIndexerT rhs_indexer{}; - constexpr OuterInnerDimsIndexerT tmp_indexer{}; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT tmp_indexer{}; using dpctl::tensor::offset_utils::Strided1DIndexer; using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; @@ -3256,9 +3259,9 @@ gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q, using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr OuterInnerDimsIndexerT lhs_indexer{}; - constexpr OuterInnerDimsIndexerT rhs_indexer{}; - constexpr OuterInnerDimsIndexerT tmp_indexer{}; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT tmp_indexer{}; using dpctl::tensor::offset_utils::Strided1DIndexer; using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; @@ -3325,9 +3328,9 @@ sycl::event gemm_nm_impl(sycl::queue &exec_q, using BatchDimsIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; - constexpr BatchDimsIndexerT batch_indexer{}; + static constexpr BatchDimsIndexerT batch_indexer{}; - constexpr std::size_t single_batch_nelems = 1; + static constexpr std::size_t single_batch_nelems = 1; sycl::event gemm_ev = gemm_detail::_gemm_batch_nm_impl< lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, @@ -3351,15 +3354,15 @@ gemm_batch_nm_contig_impl(sycl::queue &exec_q, std::vector const &depends = {}) { using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr OuterInnerDimsIndexerT lhs_indexer{}; - constexpr OuterInnerDimsIndexerT rhs_indexer{}; - constexpr OuterInnerDimsIndexerT res_indexer{}; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static 
constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT res_indexer{}; - constexpr std::size_t single_batch_nelems = 1; + static constexpr std::size_t single_batch_nelems = 1; if (batch_nelems == single_batch_nelems) { using BatchDimsIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; - constexpr BatchDimsIndexerT batch_indexer{}; + static constexpr BatchDimsIndexerT batch_indexer{}; sycl::event gemm_ev = gemm_detail::_gemm_batch_nm_impl< lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, @@ -3517,9 +3520,9 @@ sycl::event gemm_tree_k_impl(sycl::queue &exec_q, ); using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; - constexpr BatchIndexerT batch_indexer{}; + static constexpr BatchIndexerT batch_indexer{}; - constexpr std::size_t single_batch_nelems = 1; + static constexpr std::size_t single_batch_nelems = 1; using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer; const OuterInnerDimsIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0, @@ -3543,7 +3546,7 @@ sycl::event gemm_tree_k_impl(sycl::queue &exec_q, typename std::conditional, sycl::logical_or, sycl::plus>::type; - constexpr resTy identity_val = + static constexpr resTy identity_val = sycl::known_identity::value; std::size_t iter_nelems = n * m; @@ -3557,7 +3560,7 @@ sycl::event gemm_tree_k_impl(sycl::queue &exec_q, dev.get_info(); std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr std::size_t preferred_reductions_per_wi = 8; + static constexpr std::size_t preferred_reductions_per_wi = 8; std::size_t reductions_per_wi(preferred_reductions_per_wi); std::size_t reduction_groups = @@ -3574,7 +3577,7 @@ sycl::event gemm_tree_k_impl(sycl::queue &exec_q, resTy *tmp = tmp_owner.get(); using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr ResIndexerT res_indexer{}; + static constexpr ResIndexerT res_indexer{}; sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step< lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, @@ -3610,7 +3613,7 @@ sycl::event gemm_tree_k_impl(sycl::queue &exec_q, partially_reduced_tmp + reduction_nelems * iter_nelems; using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr ResIndexerT res_indexer{}; + static constexpr ResIndexerT res_indexer{}; sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step< lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, @@ -3652,7 +3655,7 @@ sycl::event gemm_tree_nm_impl(sycl::queue &exec_q, const ssize_t *res_shapes_strides, const std::vector &depends) { - constexpr int wi_delta_n = 2; + static constexpr int wi_delta_n = 2; std::size_t wg_delta_n(16); // rows of A processed in WG std::size_t wg_delta_m(16); // rows of B processed in WG std::size_t wi_delta_k(64); // Elements in K dimension processed by WI @@ -3670,9 +3673,9 @@ sycl::event gemm_tree_nm_impl(sycl::queue &exec_q, ); using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; - constexpr BatchIndexerT batch_indexer{}; + static constexpr BatchIndexerT batch_indexer{}; - constexpr std::size_t single_batch_nelems = 1; + static constexpr std::size_t single_batch_nelems = 1; using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer; const OuterInnerDimsIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0, @@ -3697,7 +3700,7 @@ sycl::event gemm_tree_nm_impl(sycl::queue &exec_q, typename std::conditional, sycl::logical_or, sycl::plus>::type; - constexpr resTy identity_val = + static constexpr resTy identity_val = 
sycl::known_identity::value; std::size_t iter_nelems = n * m; @@ -3710,7 +3713,7 @@ sycl::event gemm_tree_nm_impl(sycl::queue &exec_q, dev.get_info(); std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr std::size_t preferred_reductions_per_wi = 8; + static constexpr std::size_t preferred_reductions_per_wi = 8; std::size_t reductions_per_wi(preferred_reductions_per_wi); std::size_t reduction_groups = @@ -3726,7 +3729,7 @@ sycl::event gemm_tree_nm_impl(sycl::queue &exec_q, resTy *tmp = tmp_owner.get(); using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr ResIndexerT res_indexer{}; + static constexpr ResIndexerT res_indexer{}; sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step< lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, @@ -3762,7 +3765,7 @@ sycl::event gemm_tree_nm_impl(sycl::queue &exec_q, partially_reduced_tmp + reduction_nelems * iter_nelems; using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr ResIndexerT res_indexer{}; + static constexpr ResIndexerT res_indexer{}; sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step< lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, @@ -3910,14 +3913,14 @@ sycl::event gemm_contig_tree_k_impl(sycl::queue &exec_q, ); using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr OuterInnerDimsIndexerT lhs_indexer{}; - constexpr OuterInnerDimsIndexerT rhs_indexer{}; - constexpr OuterInnerDimsIndexerT res_indexer{}; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT res_indexer{}; using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; - constexpr BatchIndexerT batch_indexer{}; + static constexpr BatchIndexerT batch_indexer{}; - constexpr std::size_t single_batch_nelems = 1; + static constexpr std::size_t single_batch_nelems = 1; sycl::event gemm_ev; if (k <= (delta_k * n_wi)) { @@ -3933,7 +3936,7 @@ sycl::event gemm_contig_tree_k_impl(sycl::queue &exec_q, typename std::conditional, sycl::logical_or, sycl::plus>::type; - constexpr resTy identity_val = + static constexpr resTy identity_val = sycl::known_identity::value; std::size_t iter_nelems = n * m; @@ -3947,7 +3950,7 @@ sycl::event gemm_contig_tree_k_impl(sycl::queue &exec_q, dev.get_info(); std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr std::size_t preferred_reductions_per_wi = 8; + static constexpr std::size_t preferred_reductions_per_wi = 8; std::size_t reductions_per_wi(preferred_reductions_per_wi); std::size_t reduction_groups = @@ -4029,7 +4032,7 @@ sycl::event gemm_contig_tree_nm_impl(sycl::queue &exec_q, std::size_t m, std::vector const &depends) { - constexpr int wi_delta_n = 2; + static constexpr int wi_delta_n = 2; std::size_t wg_delta_n(16); // rows of A processed in WG std::size_t wg_delta_m(16); // rows of B processed in WG std::size_t wi_delta_k(64); // Elements in K dimension processed by WI @@ -4047,14 +4050,14 @@ sycl::event gemm_contig_tree_nm_impl(sycl::queue &exec_q, ); using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr OuterInnerDimsIndexerT lhs_indexer{}; - constexpr OuterInnerDimsIndexerT rhs_indexer{}; - constexpr OuterInnerDimsIndexerT res_indexer{}; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT res_indexer{}; using BatchIndexerT = 
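Several hunks here hoist identity_val, obtained from sycl::known_identity, into a static constant. For illustration only, the trait names the neutral element of a reduction operator at compile time:

    #include <sycl/sycl.hpp>

    // The neutral element of plus is 0, of logical_or is false -- these are
    // the values the hoisted identity_val constants hold.
    static_assert(sycl::known_identity<sycl::plus<float>, float>::value ==
                  0.0f);
    static_assert(sycl::known_identity<sycl::logical_or<bool>, bool>::value ==
                  false);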
dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; - constexpr BatchIndexerT batch_indexer{}; + static constexpr BatchIndexerT batch_indexer{}; - constexpr std::size_t single_batch_nelems = 1; + static constexpr std::size_t single_batch_nelems = 1; // each group processes delta_k items in a column, // so no need to allocate temp memory if one group needed @@ -4072,7 +4075,7 @@ sycl::event gemm_contig_tree_nm_impl(sycl::queue &exec_q, typename std::conditional, sycl::logical_or, sycl::plus>::type; - constexpr resTy identity_val = + static constexpr resTy identity_val = sycl::known_identity::value; std::size_t iter_nelems = n * m; @@ -4085,7 +4088,7 @@ sycl::event gemm_contig_tree_nm_impl(sycl::queue &exec_q, dev.get_info(); std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr std::size_t preferred_reductions_per_wi = 8; + static constexpr std::size_t preferred_reductions_per_wi = 8; std::size_t reductions_per_wi(preferred_reductions_per_wi); std::size_t reduction_groups = @@ -4175,7 +4178,7 @@ sycl::event gemm_contig_tree_impl(sycl::queue &exec_q, const std::size_t max_nm = std::max(n, m); if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) { - constexpr std::size_t single_batch_nelems = 1; + static constexpr std::size_t single_batch_nelems = 1; return gemm_batch_nm_contig_impl( exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m, depends); diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index f056d246c9..7251e3fad9 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -768,7 +768,8 @@ sycl::event reduction_over_group_with_atomics_strided_impl( const argTy *arg_tp = reinterpret_cast(arg_cp); resTy *res_tp = reinterpret_cast(res_cp); - constexpr resTy identity_val = su_ns::Identity::value; + static constexpr resTy identity_val = + su_ns::Identity::value; const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); @@ -825,7 +826,7 @@ sycl::event reduction_over_group_with_atomics_strided_impl( const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, reduction_shape_stride}; - constexpr std::size_t preferred_reductions_per_wi = 8; + static constexpr std::size_t preferred_reductions_per_wi = 8; std::size_t reductions_per_wi = (reduction_nelems < preferred_reductions_per_wi * wg) ? 
std::max(1, (reduction_nelems + wg - 1) / wg) @@ -879,7 +880,8 @@ sycl::event reduction_axis1_over_group_with_atomics_contig_impl( iter_arg_offset + reduction_arg_offset; resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; - constexpr resTy identity_val = su_ns::Identity::value; + static constexpr resTy identity_val = + su_ns::Identity::value; const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); @@ -897,7 +899,7 @@ sycl::event reduction_axis1_over_group_with_atomics_contig_impl( InputIterIndexerT{/* size */ iter_nelems, /* step */ reduction_nelems}, NoOpIndexerT{}}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; sycl::event comp_ev = sequential_reduction(1, (reduction_nelems + wg - 1) / wg) @@ -968,7 +970,8 @@ sycl::event reduction_axis0_over_group_with_atomics_contig_impl( iter_arg_offset + reduction_arg_offset; resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; - constexpr resTy identity_val = su_ns::Identity::value; + static constexpr resTy identity_val = + su_ns::Identity::value; const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); @@ -1007,14 +1010,14 @@ sycl::event reduction_axis0_over_group_with_atomics_contig_impl( NoOpIndexerT, NoOpIndexerT>; using ReductionIndexerT = ColsIndexerT; - constexpr NoOpIndexerT columns_indexer{}; - constexpr NoOpIndexerT result_indexer{}; + static constexpr NoOpIndexerT columns_indexer{}; + static constexpr NoOpIndexerT result_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, result_indexer}; const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, /* step */ iter_nelems}; - constexpr std::size_t preferred_reductions_per_wi = 8; + static constexpr std::size_t preferred_reductions_per_wi = 8; std::size_t reductions_per_wi = (reduction_nelems < preferred_reductions_per_wi * wg) ? 
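The reductions_per_wi selection recurring in these reduction hunks follows one clamp; a sketch assuming preferred_reductions_per_wi == 8 (some paths use 4):

    #include <algorithm>
    #include <cstddef>

    // If the whole reduction fits in one work-group's worth of work-items,
    // shrink the per-item count to the ceiling of nelems / wg; otherwise
    // keep the preferred value.
    std::size_t pick_reductions_per_wi(std::size_t reduction_nelems,
                                       std::size_t wg)
    {
        constexpr std::size_t preferred = 8;
        return (reduction_nelems < preferred * wg)
                   ? std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg)
                   : preferred;
    }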
std::max(1, (reduction_nelems + wg - 1) / wg) @@ -1144,7 +1147,8 @@ sycl::event reduction_over_group_temps_strided_impl( const argTy *arg_tp = reinterpret_cast(arg_cp); resTy *res_tp = reinterpret_cast(res_cp); - constexpr resTy identity_val = su_ns::Identity::value; + static constexpr resTy identity_val = + su_ns::Identity::value; if (reduction_nelems == 0) { sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { @@ -1196,7 +1200,7 @@ sycl::event reduction_over_group_temps_strided_impl( return comp_ev; } - constexpr std::size_t preferred_reductions_per_wi = 8; + static constexpr std::size_t preferred_reductions_per_wi = 8; // prevents running out of resources on CPU std::size_t max_wg = reduction_detail::get_work_group_size(d); @@ -1272,7 +1276,7 @@ sycl::event reduction_over_group_temps_strided_impl( // inp_indexer const InputIndexerT inp_indexer(iter_nd, iter_arg_offset, iter_shape_and_strides); - constexpr ResIndexerT noop_tmp_indexer{}; + static constexpr ResIndexerT noop_tmp_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, noop_tmp_indexer}; @@ -1317,11 +1321,11 @@ sycl::event reduction_over_group_temps_strided_impl( const InputIndexerT inp_indexer{/* size */ iter_nelems, /* step */ reduction_groups_}; - constexpr ResIndexerT res_iter_indexer{}; + static constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{ inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; partial_reduction_ev = submit_no_atomic_reduction< resTy, resTy, ReductionOpT, InputOutputIterIndexerT, @@ -1354,7 +1358,7 @@ sycl::event reduction_over_group_temps_strided_impl( const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; wg = max_wg; reductions_per_wi = std::max( @@ -1401,7 +1405,8 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( iter_arg_offset + reduction_arg_offset; resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; - constexpr resTy identity_val = su_ns::Identity::value; + static constexpr resTy identity_val = + su_ns::Identity::value; if (reduction_nelems == 0) { sycl::event res_init_ev = exec_q.fill( @@ -1426,7 +1431,7 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( InputIterIndexerT{/* size */ iter_nelems, /* step */ reduction_nelems}, NoOpIndexerT{}}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; sycl::event comp_ev = sequential_reduction( @@ -1631,7 +1636,8 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( iter_arg_offset + reduction_arg_offset; resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; - constexpr resTy identity_val = su_ns::Identity::value; + static constexpr resTy identity_val = + su_ns::Identity::value; if (reduction_nelems == 0) { sycl::event res_init_ev = exec_q.fill( @@ -1667,7 +1673,7 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( return comp_ev; } - constexpr std::size_t preferred_reductions_per_wi = 8; + static constexpr std::size_t preferred_reductions_per_wi = 8; // prevents running out of resources on CPU std::size_t max_wg = reduction_detail::get_work_group_size(d); @@ -1683,8 +1689,8 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( NoOpIndexerT, NoOpIndexerT>; using ReductionIndexerT = ColsIndexerT; - constexpr NoOpIndexerT columns_indexer{}; - 
constexpr NoOpIndexerT result_indexer{}; + static constexpr NoOpIndexerT columns_indexer{}; + static constexpr NoOpIndexerT result_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, result_indexer}; const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, @@ -1741,8 +1747,8 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( NoOpIndexerT, NoOpIndexerT>; using ReductionIndexerT = ColsIndexerT; - constexpr NoOpIndexerT columns_indexer{}; - constexpr NoOpIndexerT noop_tmp_indexer{}; + static constexpr NoOpIndexerT columns_indexer{}; + static constexpr NoOpIndexerT noop_tmp_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, noop_tmp_indexer}; const ReductionIndexerT reduction_indexer{ @@ -1783,11 +1789,11 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( const InputIndexerT inp_indexer{/* size */ iter_nelems, /* step */ reduction_groups_}; - constexpr ResIndexerT res_iter_indexer{}; + static constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; sycl::event partial_reduction_ev = submit_no_atomic_reduction< resTy, resTy, ReductionOpT, InputOutputIterIndexerT, @@ -1812,11 +1818,11 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( const InputIndexerT inp_indexer{/* size */ iter_nelems, /* step */ remaining_reduction_nelems}; - constexpr ResIndexerT res_iter_indexer{}; + static constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; wg = max_wg; reductions_per_wi = std::max( @@ -2508,8 +2514,10 @@ sycl::event search_over_group_temps_strided_impl( const argTy *arg_tp = reinterpret_cast(arg_cp); resTy *res_tp = reinterpret_cast(res_cp); - constexpr argTy identity_val = su_ns::Identity::value; - constexpr resTy idx_identity_val = su_ns::Identity::value; + static constexpr argTy identity_val = + su_ns::Identity::value; + static constexpr resTy idx_identity_val = + su_ns::Identity::value; if (reduction_nelems == 0) { sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { @@ -2567,7 +2575,7 @@ sycl::event search_over_group_temps_strided_impl( return comp_ev; } - constexpr std::size_t preferred_reductions_per_wi = 4; + static constexpr std::size_t preferred_reductions_per_wi = 4; // prevents running out of resources on CPU std::size_t max_wg = reduction_detail::get_work_group_size(d); @@ -2651,7 +2659,7 @@ sycl::event search_over_group_temps_strided_impl( // to be accessed by inp_indexer const InputIndexerT inp_indexer(iter_nd, iter_arg_offset, iter_shape_and_strides); - constexpr ResIndexerT noop_tmp_indexer{}; + static constexpr ResIndexerT noop_tmp_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, noop_tmp_indexer}; @@ -2698,11 +2706,11 @@ sycl::event search_over_group_temps_strided_impl( const InputIndexerT inp_indexer{/* size */ iter_nelems, /* step */ reduction_groups_}; - constexpr ResIndexerT res_iter_indexer{}; + static constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; sycl::event partial_reduction_ev = submit_search_reduction( @@ 
-2800,8 +2808,10 @@ sycl::event search_axis1_over_group_temps_contig_impl( iter_arg_offset + reduction_arg_offset; resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; - constexpr argTy identity_val = su_ns::Identity::value; - constexpr resTy idx_identity_val = su_ns::Identity::value; + static constexpr argTy identity_val = + su_ns::Identity::value; + static constexpr resTy idx_identity_val = + su_ns::Identity::value; if (reduction_nelems == 0) { sycl::event res_init_ev = exec_q.fill( @@ -2826,7 +2836,7 @@ sycl::event search_axis1_over_group_temps_contig_impl( InputIterIndexerT{/* size */ iter_nelems, /* step */ reduction_nelems}, NoOpIndexerT{}}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -2846,7 +2856,7 @@ sycl::event search_axis1_over_group_temps_contig_impl( return comp_ev; } - constexpr std::size_t preferred_reductions_per_wi = 8; + static constexpr std::size_t preferred_reductions_per_wi = 8; // prevents running out of resources on CPU std::size_t max_wg = reduction_detail::get_work_group_size(d); @@ -2865,7 +2875,7 @@ sycl::event search_axis1_over_group_temps_contig_impl( InputIterIndexerT{/* size */ iter_nelems, /* step */ reduction_nelems}, NoOpIndexerT{}}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; if (iter_nelems == 1) { // increase GPU occupancy @@ -2930,7 +2940,7 @@ sycl::event search_axis1_over_group_temps_contig_impl( InputIterIndexerT{/* size */ iter_nelems, /* step */ reduction_nelems}, NoOpIndexerT{}}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; first_reduction_ev = submit_search_reduction( @@ -3060,8 +3070,10 @@ sycl::event search_axis0_over_group_temps_contig_impl( iter_arg_offset + reduction_arg_offset; resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; - constexpr argTy identity_val = su_ns::Identity::value; - constexpr resTy idx_identity_val = su_ns::Identity::value; + static constexpr argTy identity_val = + su_ns::Identity::value; + static constexpr resTy idx_identity_val = + su_ns::Identity::value; if (reduction_nelems == 0) { sycl::event res_init_ev = exec_q.fill( @@ -3109,7 +3121,7 @@ sycl::event search_axis0_over_group_temps_contig_impl( return comp_ev; } - constexpr std::size_t preferred_reductions_per_wi = 8; + static constexpr std::size_t preferred_reductions_per_wi = 8; // prevents running out of resources on CPU std::size_t max_wg = reduction_detail::get_work_group_size(d); @@ -3124,8 +3136,8 @@ sycl::event search_axis0_over_group_temps_contig_impl( NoOpIndexerT, NoOpIndexerT>; using ReductionIndexerT = ColsIndexerT; - constexpr NoOpIndexerT columns_indexer{}; - constexpr NoOpIndexerT result_indexer{}; + static constexpr NoOpIndexerT columns_indexer{}; + static constexpr NoOpIndexerT result_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, result_indexer}; const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, @@ -3190,8 +3202,8 @@ sycl::event search_axis0_over_group_temps_contig_impl( NoOpIndexerT, NoOpIndexerT>; using ReductionIndexerT = ColsIndexerT; - constexpr NoOpIndexerT columns_indexer{}; - constexpr NoOpIndexerT result_indexer{}; + static constexpr NoOpIndexerT columns_indexer{}; + static constexpr NoOpIndexerT result_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, 
result_indexer}; const ReductionIndexerT reduction_indexer{ @@ -3238,11 +3250,11 @@ sycl::event search_axis0_over_group_temps_contig_impl( const InputIndexerT inp_indexer{/* size */ iter_nelems, /* step */ reduction_groups_}; - constexpr ResIndexerT res_iter_indexer{}; + static constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; sycl::event partial_reduction_ev = submit_search_reduction( diff --git a/dpctl/tensor/libtensor/include/kernels/repeat.hpp b/dpctl/tensor/libtensor/include/kernels/repeat.hpp index 54d8ff1797..2d7b897ace 100644 --- a/dpctl/tensor/libtensor/include/kernels/repeat.hpp +++ b/dpctl/tensor/libtensor/include/kernels/repeat.hpp @@ -235,7 +235,7 @@ sycl::event repeat_by_sequence_1d_impl(sycl::queue &q, T *dst_tp = reinterpret_cast(dst_cp); // orthog ndim indexer - constexpr TwoZeroOffsets_Indexer orthog_indexer{}; + static constexpr TwoZeroOffsets_Indexer orthog_indexer{}; // indexers along repeated axis const StridedIndexer src_indexer{src_nd, 0, src_shape_strides}; const Strided1DIndexer dst_indexer{/* size */ dst_shape, @@ -422,7 +422,7 @@ sycl::event repeat_by_scalar_1d_impl(sycl::queue &q, T *dst_tp = reinterpret_cast(dst_cp); // orthog ndim indexer - constexpr TwoZeroOffsets_Indexer orthog_indexer{}; + static constexpr TwoZeroOffsets_Indexer orthog_indexer{}; // indexers along repeated axis const StridedIndexer src_indexer(src_nd, 0, src_shape_strides); const Strided1DIndexer dst_indexer{/* size */ dst_shape, diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp index ea4cd085cc..504b47c5f9 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp @@ -408,7 +408,7 @@ sort_over_work_group_contig_impl(sycl::queue &q, const std::uint64_t nelems_per_slm = (device_local_memory_size - safety_margin) / (2 * sizeof(T)); - constexpr std::uint32_t sub_groups_per_work_group = 4; + static constexpr std::uint32_t sub_groups_per_work_group = 4; const std::uint32_t elems_per_wi = dev.has(sycl::aspect::cpu) ? 
8 : 2;
 
     const std::size_t lws = sub_groups_per_work_group * max_sg_size;
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
index 8af2bc5923..3f4314ea84 100644
--- a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
@@ -143,13 +143,13 @@ std::make_unsigned_t<IntT> order_preserving_cast(IntT val)
     if constexpr (is_ascending) {
         // ascending_mask: 100..0
-        constexpr UIntT ascending_mask =
+        static constexpr UIntT ascending_mask =
             (UIntT(1) << std::numeric_limits<IntT>::digits);
         return (uint_val ^ ascending_mask);
     }
     else {
         // descending_mask: 011..1
-        constexpr UIntT descending_mask =
+        static constexpr UIntT descending_mask =
             (std::numeric_limits<UIntT>::max() >> 1);
         return (uint_val ^ descending_mask);
     }
 
@@ -167,11 +167,11 @@ template <bool is_ascending> std::uint16_t order_preserving_cast(sycl::half val)
     // test the sign bit of the original value
     const bool zero_fp_sign_bit = (UIntT(0) == (uint_val >> 15));
 
-    constexpr UIntT zero_mask = UIntT(0x8000u);
-    constexpr UIntT nonzero_mask = UIntT(0xFFFFu);
+    static constexpr UIntT zero_mask = UIntT(0x8000u);
+    static constexpr UIntT nonzero_mask = UIntT(0xFFFFu);
 
-    constexpr UIntT inv_zero_mask = static_cast<UIntT>(~zero_mask);
-    constexpr UIntT inv_nonzero_mask = static_cast<UIntT>(~nonzero_mask);
+    static constexpr UIntT inv_zero_mask = static_cast<UIntT>(~zero_mask);
+    static constexpr UIntT inv_nonzero_mask = static_cast<UIntT>(~nonzero_mask);
 
     if constexpr (is_ascending) {
         mask = (zero_fp_sign_bit) ? zero_mask : nonzero_mask;
@@ -200,8 +200,8 @@ std::uint32_t order_preserving_cast(FloatT val)
     // test the sign bit of the original value
     const bool zero_fp_sign_bit = (UIntT(0) == (uint_val >> 31));
 
-    constexpr UIntT zero_mask = UIntT(0x80000000u);
-    constexpr UIntT nonzero_mask = UIntT(0xFFFFFFFFu);
+    static constexpr UIntT zero_mask = UIntT(0x80000000u);
+    static constexpr UIntT nonzero_mask = UIntT(0xFFFFFFFFu);
 
     if constexpr (is_ascending)
         mask = (zero_fp_sign_bit) ? zero_mask : nonzero_mask;
@@ -227,8 +227,8 @@ std::uint64_t order_preserving_cast(FloatT val)
     // test the sign bit of the original value
     const bool zero_fp_sign_bit = (UIntT(0) == (uint_val >> 63));
 
-    constexpr UIntT zero_mask = UIntT(0x8000000000000000u);
-    constexpr UIntT nonzero_mask = UIntT(0xFFFFFFFFFFFFFFFFu);
+    static constexpr UIntT zero_mask = UIntT(0x8000000000000000u);
+    static constexpr UIntT nonzero_mask = UIntT(0xFFFFFFFFFFFFFFFFu);
 
     if constexpr (is_ascending)
         mask = (zero_fp_sign_bit) ?
zero_mask : nonzero_mask; @@ -290,8 +290,9 @@ radix_sort_count_submit(sycl::queue &exec_q, const std::vector &dependency_events) { // bin_count = radix_states used for an array storing bucket state counters - constexpr std::uint32_t radix_states = (std::uint32_t(1) << radix_bits); - constexpr std::uint32_t radix_mask = radix_states - 1; + static constexpr std::uint32_t radix_states = + (std::uint32_t(1) << radix_bits); + static constexpr std::uint32_t radix_mask = radix_states - 1; // iteration space info const std::size_t n = n_values; @@ -444,7 +445,8 @@ sycl::event radix_sort_scan_submit(sycl::queue &exec_q, const std::size_t scan_size = n_segments + 1; wg_size = std::min(scan_size, wg_size); - constexpr std::uint32_t radix_states = std::uint32_t(1) << radix_bits; + static constexpr std::uint32_t radix_states = std::uint32_t(1) + << radix_bits; // compilation of the kernel prevents out of resources issue, which may // occur due to usage of collective algorithms such as joint_exclusive_scan @@ -497,8 +499,8 @@ struct empty_storage // Number with `n` least significant bits of uint32_t inline std::uint32_t n_ls_bits_set(std::uint32_t n) noexcept { - constexpr std::uint32_t zero{}; - constexpr std::uint32_t all_bits_set = ~zero; + static constexpr std::uint32_t zero{}; + static constexpr std::uint32_t all_bits_set = ~zero; return ~(all_bits_set << n); } @@ -716,8 +718,9 @@ radix_sort_reorder_submit(sycl::queue &exec_q, using ValueT = InputT; using PeerHelper = peer_prefix_helper; - constexpr std::uint32_t radix_states = std::uint32_t{1} << radix_bits; - constexpr std::uint32_t radix_mask = radix_states - 1; + static constexpr std::uint32_t radix_states = std::uint32_t{1} + << radix_bits; + static constexpr std::uint32_t radix_mask = radix_states - 1; const std::size_t elems_per_segment = (n_values + n_segments - 1) / n_segments; @@ -777,7 +780,7 @@ radix_sort_reorder_submit(sycl::queue &exec_q, OffsetT scanned_bin = 0; /* find cumulative offset */ - constexpr std::uint32_t zero_radix_state_id = 0; + static constexpr std::uint32_t zero_radix_state_id = 0; offset_arr[zero_radix_state_id] = b_offset_ptr[segment_id]; for (std::uint32_t radix_state_id = 1; @@ -980,11 +983,12 @@ struct parallel_radix_sort_iteration_step exec_q.get_device() .template get_info(); - constexpr std::size_t two_mils = (std::size_t(1) << 21); + static constexpr std::size_t two_mils = (std::size_t(1) << 21); std::size_t count_wg_size = ((max_sg_size > 0) && (n_values > two_mils) ? 
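The masks touched in the order_preserving_cast hunks implement the usual radix-sort key trick: map values to unsigned keys whose unsigned order matches the source order. A host-side restatement for the ascending 32-bit cases (int_key and float_key are hypothetical names, not dpctl functions):

    #include <cstdint>
    #include <cstring>

    std::uint32_t int_key(std::int32_t v)
    {
        std::uint32_t u;
        std::memcpy(&u, &v, sizeof(u));
        // ascending_mask 100..0 flips the sign bit, so INT32_MIN -> 0x0,
        // -1 -> 0x7FFFFFFF, 0 -> 0x80000000, INT32_MAX -> 0xFFFFFFFF
        return u ^ (std::uint32_t(1) << 31);
    }

    std::uint32_t float_key(float v)
    {
        std::uint32_t u;
        std::memcpy(&u, &v, sizeof(u));
        const bool non_negative = ((u >> 31) == 0u);
        // positives: set the sign bit (zero_mask); negatives: flip every
        // bit (nonzero_mask), reversing their order so unsigned comparison
        // of keys matches IEEE-754 ordering
        return u ^ (non_negative ? 0x80000000u : 0xFFFFFFFFu);
    }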
128 : max_sg_size); - constexpr std::uint32_t radix_states = std::uint32_t(1) << radix_bits; + static constexpr std::uint32_t radix_states = std::uint32_t(1) + << radix_bits; // correct count_wg_size according to local memory limit in count phase const auto max_count_wg_size = _slm_adjusted_work_group_size( @@ -1022,13 +1026,14 @@ struct parallel_radix_sort_iteration_step sycl::event reorder_ev{}; // subgroup_ballot-based peer algo uses extract_bits to populate // uint32_t mask and hence relies on sub-group to be 32 or narrower - constexpr std::size_t sg32_v = 32u; - constexpr std::size_t sg16_v = 16u; - constexpr std::size_t sg08_v = 8u; + static constexpr std::size_t sg32_v = 32u; + static constexpr std::size_t sg16_v = 16u; + static constexpr std::size_t sg08_v = 8u; if (sg32_v == reorder_sg_size || sg16_v == reorder_sg_size || sg08_v == reorder_sg_size) { - constexpr auto peer_algorithm = peer_prefix_algo::subgroup_ballot; + static constexpr auto peer_algorithm = + peer_prefix_algo::subgroup_ballot; reorder_ev = radix_sort_reorder_submit<_RadixReorderPeerKernel, radix_bits, peer_algorithm>( @@ -1037,7 +1042,7 @@ struct parallel_radix_sort_iteration_step {scan_ev}); } else { - constexpr auto peer_algorithm = + static constexpr auto peer_algorithm = peer_prefix_algo::scan_then_broadcast; reorder_ev = radix_sort_reorder_submit<_RadixReorderKernel, @@ -1089,7 +1094,7 @@ struct subgroup_radix_sort using _SortKernelGlob = radix_sort_one_wg_krn; - constexpr std::size_t max_concurrent_work_groups = 128U; + static constexpr std::size_t max_concurrent_work_groups = 128U; // Choose this to occupy the entire accelerator const std::size_t n_work_groups = @@ -1104,8 +1109,8 @@ struct subgroup_radix_sort switch (SLM_availability) { case temp_allocations::both_in_slm: { - constexpr auto storage_for_values = use_slm_tag{}; - constexpr auto storage_for_counters = use_slm_tag{}; + static constexpr auto storage_for_values = use_slm_tag{}; + static constexpr auto storage_for_counters = use_slm_tag{}; return one_group_submitter<_SortKernelLoc>()( exec_q, n_iters, n_iters, n_to_sort, input_ptr, output_ptr, @@ -1114,8 +1119,8 @@ struct subgroup_radix_sort } case temp_allocations::counters_in_slm: { - constexpr auto storage_for_values = use_global_mem_tag{}; - constexpr auto storage_for_counters = use_slm_tag{}; + static constexpr auto storage_for_values = use_global_mem_tag{}; + static constexpr auto storage_for_counters = use_slm_tag{}; return one_group_submitter<_SortKernelPartGlob>()( exec_q, n_iters, n_batch_size, n_to_sort, input_ptr, output_ptr, @@ -1124,8 +1129,8 @@ struct subgroup_radix_sort } default: { - constexpr auto storage_for_values = use_global_mem_tag{}; - constexpr auto storage_for_counters = use_global_mem_tag{}; + static constexpr auto storage_for_values = use_global_mem_tag{}; + static constexpr auto storage_for_counters = use_global_mem_tag{}; return one_group_submitter<_SortKernelGlob>()( exec_q, n_iters, n_batch_size, n_to_sort, input_ptr, output_ptr, @@ -1186,7 +1191,7 @@ struct subgroup_radix_sort // the kernel is designed for data size <= 64K assert(n <= (SizeT(1) << 16)); - constexpr auto req_slm_size_counters = + static constexpr auto req_slm_size_counters = counter_buf_sz * sizeof(std::uint16_t); const auto &dev = exec_q.get_device(); @@ -1319,7 +1324,7 @@ struct subgroup_radix_sort std::uint16_t wi = ndit.get_local_linear_id(); std::uint16_t begin_bit = 0; - constexpr std::uint16_t end_bit = + static constexpr std::uint16_t end_bit = number_of_bits_in_type(); // copy from input 
array into values @@ -1358,12 +1363,12 @@ struct subgroup_radix_sort { const std::uint16_t id = wi * block_size + i; - constexpr std::uint16_t bin_mask = - bin_count - 1; + static constexpr std::uint16_t + bin_mask = bin_count - 1; // points to the padded element, i.e. id // is in-range - constexpr std::uint16_t + static constexpr std::uint16_t default_out_of_range_bin_id = bin_mask; @@ -1395,12 +1400,12 @@ struct subgroup_radix_sort { const std::uint16_t id = wi * block_size + i; - constexpr std::uint16_t bin_mask = - bin_count - 1; + static constexpr std::uint16_t + bin_mask = bin_count - 1; // points to the padded element, i.e. id // is in-range - constexpr std::uint16_t + static constexpr std::uint16_t default_out_of_range_bin_id = bin_mask; @@ -1549,7 +1554,7 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, // radix bits represent number of processed bits in each value during one // iteration - constexpr std::uint32_t radix_bits = 4; + static constexpr std::uint32_t radix_bits = 4; sycl::event sort_ev{}; @@ -1557,14 +1562,14 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, const auto max_wg_size = dev.template get_info(); - constexpr std::uint16_t ref_wg_size = 64; + static constexpr std::uint16_t ref_wg_size = 64; if (n_to_sort <= 16384 && ref_wg_size * 8 <= max_wg_size) { using _RadixSortKernel = OneWorkGroupRadixSortKernel; if (n_to_sort <= 64 && ref_wg_size <= max_wg_size) { // wg_size * block_size == 64 * 1 * 1 == 64 - constexpr std::uint16_t wg_size = ref_wg_size; - constexpr std::uint16_t block_size = 1; + static constexpr std::uint16_t wg_size = ref_wg_size; + static constexpr std::uint16_t block_size = 1; sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, radix_bits>{}( @@ -1573,8 +1578,8 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, } else if (n_to_sort <= 128 && ref_wg_size * 2 <= max_wg_size) { // wg_size * block_size == 64 * 2 * 1 == 128 - constexpr std::uint16_t wg_size = ref_wg_size * 2; - constexpr std::uint16_t block_size = 1; + static constexpr std::uint16_t wg_size = ref_wg_size * 2; + static constexpr std::uint16_t block_size = 1; sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, radix_bits>{}( @@ -1583,8 +1588,8 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, } else if (n_to_sort <= 256 && ref_wg_size * 2 <= max_wg_size) { // wg_size * block_size == 64 * 2 * 2 == 256 - constexpr std::uint16_t wg_size = ref_wg_size * 2; - constexpr std::uint16_t block_size = 2; + static constexpr std::uint16_t wg_size = ref_wg_size * 2; + static constexpr std::uint16_t block_size = 2; sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, radix_bits>{}( @@ -1593,8 +1598,8 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, } else if (n_to_sort <= 512 && ref_wg_size * 2 <= max_wg_size) { // wg_size * block_size == 64 * 2 * 4 == 512 - constexpr std::uint16_t wg_size = ref_wg_size * 2; - constexpr std::uint16_t block_size = 4; + static constexpr std::uint16_t wg_size = ref_wg_size * 2; + static constexpr std::uint16_t block_size = 4; sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, radix_bits>{}( @@ -1603,8 +1608,8 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, } else if (n_to_sort <= 1024 && ref_wg_size * 2 <= max_wg_size) { // wg_size * block_size == 64 * 2 * 8 == 1024 - constexpr std::uint16_t wg_size = ref_wg_size * 2; - constexpr std::uint16_t block_size = 8; + static constexpr std::uint16_t wg_size = ref_wg_size * 2; + static constexpr 
std::uint16_t block_size = 8; sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, radix_bits>{}( @@ -1613,8 +1618,8 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, } else if (n_to_sort <= 2048 && ref_wg_size * 4 <= max_wg_size) { // wg_size * block_size == 64 * 4 * 8 == 2048 - constexpr std::uint16_t wg_size = ref_wg_size * 4; - constexpr std::uint16_t block_size = 8; + static constexpr std::uint16_t wg_size = ref_wg_size * 4; + static constexpr std::uint16_t block_size = 8; sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, radix_bits>{}( @@ -1623,8 +1628,8 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, } else if (n_to_sort <= 4096 && ref_wg_size * 4 <= max_wg_size) { // wg_size * block_size == 64 * 4 * 16 == 4096 - constexpr std::uint16_t wg_size = ref_wg_size * 4; - constexpr std::uint16_t block_size = 16; + static constexpr std::uint16_t wg_size = ref_wg_size * 4; + static constexpr std::uint16_t block_size = 16; sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, radix_bits>{}( @@ -1633,8 +1638,8 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, } else if (n_to_sort <= 8192 && ref_wg_size * 8 <= max_wg_size) { // wg_size * block_size == 64 * 8 * 16 == 8192 - constexpr std::uint16_t wg_size = ref_wg_size * 8; - constexpr std::uint16_t block_size = 16; + static constexpr std::uint16_t wg_size = ref_wg_size * 8; + static constexpr std::uint16_t block_size = 16; sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, radix_bits>{}( @@ -1643,8 +1648,8 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, } else { // wg_size * block_size == 64 * 8 * 32 == 16384 - constexpr std::uint16_t wg_size = ref_wg_size * 8; - constexpr std::uint16_t block_size = 32; + static constexpr std::uint16_t wg_size = ref_wg_size * 8; + static constexpr std::uint16_t block_size = 32; sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, radix_bits>{}( @@ -1653,12 +1658,13 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, } } else { - constexpr std::uint32_t radix_iters = + static constexpr std::uint32_t radix_iters = number_of_buckets_in_type(radix_bits); - constexpr std::uint32_t radix_states = std::uint32_t(1) << radix_bits; + static constexpr std::uint32_t radix_states = std::uint32_t(1) + << radix_bits; - constexpr std::size_t bound_512k = (std::size_t(1) << 19); - constexpr std::size_t bound_2m = (std::size_t(1) << 21); + static constexpr std::size_t bound_512k = (std::size_t(1) << 19); + static constexpr std::size_t bound_2m = (std::size_t(1) << 21); const auto wg_sz_k = (n_to_sort < bound_512k) ? 8 : (n_to_sort <= bound_2m) ? 
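The dispatch ladder above pairs wg_size and block_size so that their product reaches each n_to_sort threshold. A hypothetical summary (pick_one_wg_config is not a dpctl function; the real code also checks each ref_wg_size multiple against the device's max work-group size):

    #include <cstddef>
    #include <cstdint>

    struct OneWgConfig
    {
        std::uint16_t wg_size, block_size;
    };

    OneWgConfig pick_one_wg_config(std::size_t n_to_sort)
    {
        constexpr std::uint16_t ref = 64;                 // ref_wg_size
        if (n_to_sort <= 64)   return {ref, 1};           //  64 * 1
        if (n_to_sort <= 128)  return {ref * 2, 1};       // 128 * 1
        if (n_to_sort <= 256)  return {ref * 2, 2};       // 128 * 2
        if (n_to_sort <= 512)  return {ref * 2, 4};       // 128 * 4
        if (n_to_sort <= 1024) return {ref * 2, 8};       // 128 * 8
        if (n_to_sort <= 2048) return {ref * 4, 8};       // 256 * 8
        if (n_to_sort <= 4096) return {ref * 4, 16};      // 256 * 16
        if (n_to_sort <= 8192) return {ref * 8, 16};      // 512 * 16
        return {ref * 8, 32};                             // 512 * 32 == 16384
    }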
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/searchsorted.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/searchsorted.hpp
index ddf61471c6..6042f23e03 100644
--- a/dpctl/tensor/libtensor/include/kernels/sorting/searchsorted.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/sorting/searchsorted.hpp
@@ -87,7 +87,7 @@ struct SearchSortedFunctor
         // position of the needle_v in the hay array
         indTy pos{};

-        constexpr std::size_t zero(0);
+        static constexpr std::size_t zero(0);
         if constexpr (left_side) {
             // search in hay in left-closed interval, give `pos` such that
             // hay[pos - 1] < needle_v <= hay[pos]
@@ -155,9 +155,9 @@ sycl::event searchsorted_contig_impl(sycl::queue &exec_q,

         using TrivialIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;

-        constexpr TrivialIndexerT hay_indexer{};
-        constexpr TrivialIndexerT needles_indexer{};
-        constexpr TrivialIndexerT positions_indexer{};
+        static constexpr TrivialIndexerT hay_indexer{};
+        static constexpr TrivialIndexerT needles_indexer{};
+        static constexpr TrivialIndexerT positions_indexer{};

         const auto fnctr = SearchSortedFunctor
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp
--- a/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp
                      &dependent_events)
 {
-    constexpr std::uint32_t lws = 256;
-    constexpr std::uint32_t n_wi = 4;
+    static constexpr std::uint32_t lws = 256;
+    static constexpr std::uint32_t n_wi = 4;

     const std::size_t n_groups = (nelems + n_wi * lws - 1) / (n_wi * lws);

     sycl::range<1> gRange{n_groups * lws};
@@ -72,7 +72,7 @@ sycl::event iota_impl(sycl::queue &exec_q,
             }

             if (offset + n_wi * max_sgSize < nelems) {
-                constexpr auto group_ls_props = syclexp::properties{
+                static constexpr auto group_ls_props = syclexp::properties{
                     syclexp::data_placement_striped
                     // , syclexp::full_group
                 };
@@ -105,8 +105,8 @@ sycl::event map_back_impl(sycl::queue &exec_q,
                           std::size_t row_size,
                           const std::vector &dependent_events)
 {
-    constexpr std::uint32_t lws = 64;
-    constexpr std::uint32_t n_wi = 4;
+    static constexpr std::uint32_t lws = 64;
+    static constexpr std::uint32_t n_wi = 4;

     const std::size_t n_groups = (nelems + lws * n_wi - 1) / (n_wi * lws);

     sycl::range<1> lRange{lws};
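The group_ls_props hunk above swaps a function-local constexpr object of class type for a static constexpr one. A minimal illustration of the distinction, with a hypothetical Properties type standing in for syclexp::properties: a plain constexpr local has automatic storage duration and may be re-materialized on every call, while static constexpr guarantees a single object with static storage duration.

    // Hypothetical stand-in for a compile-time property list such as
    // syclexp::properties{syclexp::data_placement_striped}.
    struct Properties {
        int data_placement;
    };

    inline int placement_of_loads()
    {
        // `constexpr` alone: automatic storage, potentially re-created on
        // each call; `static constexpr`: static storage, one object with
        // one address for the whole program.
        static constexpr Properties group_ls_props{1};
        return group_ls_props.data_placement;
    }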
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp
index fc4e08064d..558cdfc167 100644
--- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp
@@ -83,8 +83,8 @@ sycl::event write_out_impl(sycl::queue &exec_q,
                            IndexTy *inds_tp,
                            const std::vector &depends)
 {
-    constexpr std::uint32_t lws = 64;
-    constexpr std::uint32_t n_wi = 4;
+    static constexpr std::uint32_t lws = 64;
+    static constexpr std::uint32_t n_wi = 4;

     const std::size_t nelems = iter_nelems * k;
     const std::size_t n_groups = (nelems + lws * n_wi - 1) / (n_wi * lws);
@@ -252,7 +252,7 @@ sycl::event topk_merge_impl(
     const std::uint64_t nelems_per_slm =
         (device_local_memory_size - safety_margin) / (2 * sizeof(IndexTy));

-    constexpr std::uint32_t sub_groups_per_work_group = 4;
+    static constexpr std::uint32_t sub_groups_per_work_group = 4;
     const std::uint32_t elems_per_wi = dev.has(sycl::aspect::cpu) ? 8 : 2;

     std::size_t lws = sub_groups_per_work_group * max_sg_size;
diff --git a/dpctl/tensor/libtensor/include/kernels/where.hpp b/dpctl/tensor/libtensor/include/kernels/where.hpp
index 7c103e6ce4..e8309b359c 100644
--- a/dpctl/tensor/libtensor/include/kernels/where.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/where.hpp
@@ -88,7 +88,7 @@ class WhereContigFunctor

     void operator()(sycl::nd_item<1> ndit) const
     {
-        constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz;
+        static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz;

         using dpctl::tensor::type_utils::is_complex;
         if constexpr (!enable_sg_loadstore || is_complex::value ||
@@ -185,8 +185,8 @@ sycl::event where_contig_impl(sycl::queue &q,
         cgh.depends_on(depends);

         std::size_t lws = 64;
-        constexpr std::uint8_t vec_sz = 4u;
-        constexpr std::uint8_t n_vecs = 2u;
+        static constexpr std::uint8_t vec_sz = 4u;
+        static constexpr std::uint8_t n_vecs = 2u;
         const std::size_t n_groups =
             ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
         const auto gws_range = sycl::range<1>(n_groups * lws);
@@ -197,7 +197,7 @@ sycl::event where_contig_impl(sycl::queue &q,
             is_aligned(x2_cp) &&
             is_aligned(dst_cp))
         {
-            constexpr bool enable_sg_loadstore = true;
+            static constexpr bool enable_sg_loadstore = true;
             using KernelName = where_contig_kernel;

             cgh.parallel_for(
@@ -207,7 +207,7 @@ sycl::event where_contig_impl(sycl::queue &q,
                                    x2_tp, dst_tp));
         }
         else {
-            constexpr bool disable_sg_loadstore = false;
+            static constexpr bool disable_sg_loadstore = false;
             using InnerKernelName = where_contig_kernel;
             using KernelName =
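The launch geometry in where_contig_impl above is a ceiling division: every work-item covers n_vecs * vec_sz elements, so the group count rounds nelems up to the next multiple of lws * n_vecs * vec_sz. Restated as a standalone function (the function name is illustrative):

    #include <cstddef>
    #include <cstdint>

    // With vec_sz = 4 and n_vecs = 2, each work-item handles 8 elements,
    // so n_groups = ceil(nelems / (lws * 8)).
    inline std::size_t where_n_groups(std::size_t nelems, std::size_t lws)
    {
        static constexpr std::uint8_t vec_sz = 4u;
        static constexpr std::uint8_t n_vecs = 2u;
        return (nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz);
    }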
diff --git a/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp b/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp
index 2aae899d50..2ff6c0abe7 100644
--- a/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp
@@ -55,16 +55,18 @@ template struct WrapIndex
     ssize_t operator()(ssize_t max_item, IndT ind) const
     {
         ssize_t projected;
-        constexpr ssize_t unit(1);
+        static constexpr ssize_t unit(1);
         max_item = sycl::max(max_item, unit);

-        constexpr std::uintmax_t ind_max = std::numeric_limits::max();
-        constexpr std::uintmax_t ssize_max =
+        static constexpr std::uintmax_t ind_max =
+            std::numeric_limits::max();
+        static constexpr std::uintmax_t ssize_max =
             std::numeric_limits::max();

         if constexpr (std::is_signed_v) {
-            constexpr std::intmax_t ind_min = std::numeric_limits::min();
-            constexpr std::intmax_t ssize_min =
+            static constexpr std::intmax_t ind_min =
+                std::numeric_limits::min();
+            static constexpr std::intmax_t ssize_min =
                 std::numeric_limits::min();

             if constexpr (ind_max <= ssize_max && ind_min >= ssize_min) {
@@ -102,25 +104,27 @@ template struct ClipIndex
     ssize_t operator()(ssize_t max_item, IndT ind) const
     {
         ssize_t projected;
-        constexpr ssize_t unit(1);
+        static constexpr ssize_t unit(1);
         max_item = sycl::max(max_item, unit);

-        constexpr std::uintmax_t ind_max = std::numeric_limits::max();
-        constexpr std::uintmax_t ssize_max =
+        static constexpr std::uintmax_t ind_max =
+            std::numeric_limits::max();
+        static constexpr std::uintmax_t ssize_max =
             std::numeric_limits::max();

         if constexpr (std::is_signed_v) {
-            constexpr std::intmax_t ind_min = std::numeric_limits::min();
-            constexpr std::intmax_t ssize_min =
+            static constexpr std::intmax_t ind_min =
+                std::numeric_limits::min();
+            static constexpr std::intmax_t ssize_min =
                 std::numeric_limits::min();

             if constexpr (ind_max <= ssize_max && ind_min >= ssize_min) {
                 const ssize_t ind_ = static_cast(ind);
-                constexpr ssize_t lb(0);
+                static constexpr ssize_t lb(0);
                 const ssize_t ub = max_item - 1;
                 projected = sycl::clamp(ind_, lb, ub);
             }
             else {
-                constexpr IndT lb(0);
+                static constexpr IndT lb(0);
                 const IndT ub = static_cast(max_item - 1);
                 projected = static_cast(sycl::clamp(ind, lb, ub));
             }
diff --git a/dpctl/tensor/libtensor/include/utils/math_utils.hpp b/dpctl/tensor/libtensor/include/utils/math_utils.hpp
index a49b56b6ba..92b3a3dc56 100644
--- a/dpctl/tensor/libtensor/include/utils/math_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/math_utils.hpp
@@ -124,7 +124,7 @@ template T logaddexp(T x, T y)
     }
     else {
         const T tmp = x - y;
-        constexpr T zero(0);
+        static constexpr T zero(0);

         return (tmp > zero)
                    ? (x + sycl::log1p(sycl::exp(-tmp)))
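The logaddexp hunk above relies on the usual stability rewrite: log(exp(x) + exp(y)) == max(x, y) + log1p(exp(-|x - y|)), which keeps exp() from overflowing for large arguments. A host-side sketch of the same expression (std:: calls standing in for the sycl:: ones):

    #include <cmath>

    template <typename T> T logaddexp_sketch(T x, T y)
    {
        const T tmp = x - y;
        static constexpr T zero(0);
        // Factor out the larger argument; exp() then sees a non-positive
        // exponent and cannot overflow.
        return (tmp > zero) ? (x + std::log1p(std::exp(-tmp)))
                            : (y + std::log1p(std::exp(tmp)));
    }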
diff --git a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
index 73a09011a0..90a6e204ad 100644
--- a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
@@ -181,7 +181,7 @@ sycl::event async_smart_free(sycl::queue &exec_q,
                              const std::vector &depends,
                              UniquePtrTs &&...unique_pointers)
 {
-    constexpr std::size_t n = sizeof...(UniquePtrTs);
+    static constexpr std::size_t n = sizeof...(UniquePtrTs);
     static_assert(
         n > 0, "async_smart_free requires at least one smart pointer argument");
diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
index ece8852643..f78193e614 100644
--- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
@@ -168,9 +168,9 @@ T custom_reduce_over_group(const GroupT &wg,
 {
     // value experimentally tuned to achieve best runtime on Iris Xe,
     // Arc A140V integrated Intel GPUs, and discrete Intel Max GPU.
-    constexpr std::uint32_t low_sz = 8u;
+    static constexpr std::uint32_t low_sz = 8u;
     // maximal work-group size
-    constexpr std::uint32_t high_sz = 1024u;
+    static constexpr std::uint32_t high_sz = 1024u;

     const std::uint32_t wgs = wg.get_local_linear_range();
     const std::uint32_t lid = wg.get_local_linear_id();
@@ -568,7 +568,8 @@ auto sub_group_load(const sycl::sub_group &sg,
 #if (USE_GROUP_LOAD_STORE)
     using ValueT = typename std::remove_cv_t;
     sycl::vec x{};
-    constexpr auto striped = ls_ns::properties{ls_ns::data_placement_striped};
+    static constexpr auto striped =
+        ls_ns::properties{ls_ns::data_placement_striped};
     ls_ns::group_load(sg, m_ptr, x, striped);
     return x;
 #else
@@ -585,7 +586,8 @@ auto sub_group_load(const sycl::sub_group &sg,
 #if (USE_GROUP_LOAD_STORE)
     using ValueT = typename std::remove_cv_t;
     ValueT x{};
-    constexpr auto striped = ls_ns::properties{ls_ns::data_placement_striped};
+    static constexpr auto striped =
+        ls_ns::properties{ls_ns::data_placement_striped};
     ls_ns::group_load(sg, m_ptr, x, striped);
     return x;
 #else
@@ -607,7 +609,8 @@ sub_group_store(const sycl::sub_group &sg,
 {
 #if (USE_GROUP_LOAD_STORE)
     static_assert(std::is_same_v);
-    constexpr auto striped = ls_ns::properties{ls_ns::data_placement_striped};
+    static constexpr auto striped =
+        ls_ns::properties{ls_ns::data_placement_striped};
     ls_ns::group_store(sg, val, m_ptr, striped);
     return;
 #else
@@ -628,7 +631,8 @@ sub_group_store(const sycl::sub_group &sg,
                 sycl::multi_ptr m_ptr)
 {
 #if (USE_GROUP_LOAD_STORE)
-    constexpr auto striped = ls_ns::properties{ls_ns::data_placement_striped};
+    static constexpr auto striped =
+        ls_ns::properties{ls_ns::data_placement_striped};
     ls_ns::group_store(sg, val, m_ptr, striped);
     return;
 #else
diff --git a/dpctl/tensor/libtensor/source/accumulators.cpp b/dpctl/tensor/libtensor/source/accumulators.cpp
index f1e2d4986b..10a9ff8378 100644
--- a/dpctl/tensor/libtensor/source/accumulators.cpp
+++ b/dpctl/tensor/libtensor/source/accumulators.cpp
@@ -148,8 +148,10 @@ std::size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask,
     int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);

     // cumsum must be int32_t/int64_t only
-    constexpr int int32_typeid = static_cast(td_ns::typenum_t::INT32);
-    constexpr int int64_typeid = static_cast(td_ns::typenum_t::INT64);
+    static constexpr int int32_typeid =
+        static_cast(td_ns::typenum_t::INT32);
+    static constexpr int int64_typeid =
+        static_cast(td_ns::typenum_t::INT64);
     if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) {
         throw py::value_error(
             "Cumulative sum array must have int32 or int64 data-type.");
@@ -307,7 +309,7 @@ std::size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src,
     int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);

     // this cumsum must be int64_t only
-    constexpr int int64_typeid = static_cast(td_ns::typenum_t::INT64);
+    static constexpr int int64_typeid =
+        static_cast(td_ns::typenum_t::INT64);
     if (cumsum_typeid != int64_typeid) {
         throw py::value_error(
             "Cumulative sum array must have int64 data-type.");
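py_mask_positions and py_cumsum_1d above validate the cumsum array by casting enum members once into static constexpr ints and comparing them with the runtime lookup id. A compilable miniature of that pattern (the enumerator values below are placeholders, not dpctl's actual typenum ordering):

    #include <stdexcept>

    enum class typenum_t : int { INT32 = 5, INT64 = 7 };

    inline void check_cumsum_typeid(int cumsum_typeid)
    {
        static constexpr int int32_typeid = static_cast<int>(typenum_t::INT32);
        static constexpr int int64_typeid = static_cast<int>(typenum_t::INT64);
        if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) {
            throw std::invalid_argument(
                "Cumulative sum array must have int32 or int64 data-type.");
        }
    }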
diff --git a/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp b/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp
index 2352b6ab30..23e0c28c1a 100644
--- a/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp
+++ b/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp
@@ -250,7 +250,7 @@ std::pair py_accumulate_final_axis_include_initial(
                               "the same array ranks");
     }

-    constexpr int acc_nd = 1;
+    static constexpr int acc_nd = 1;

     int iter_nd = src_nd - acc_nd;
     if (iter_nd < 0) {
diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp
index 85d861dcd7..aa1f4aa091 100644
--- a/dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp
+++ b/dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp
@@ -133,7 +133,7 @@ struct CumLogSumExp1DContigFactory
                                   srcTy, dstTy>::is_defined)
         {
             using ScanOpT = su_ns::LogSumExp;
-            constexpr bool include_initial = false;
+            static constexpr bool include_initial = false;
             if constexpr (std::is_same_v) {
                 using dpctl::tensor::kernels::accumulators::NoOpTransformer;
                 fnT fn = dpctl::tensor::kernels::accumulators::
@@ -166,7 +166,7 @@ struct CumLogSumExp1DIncludeInitialContigFactory
                                   srcTy, dstTy>::is_defined)
         {
             using ScanOpT = su_ns::LogSumExp;
-            constexpr bool include_initial = true;
+            static constexpr bool include_initial = true;
             if constexpr (std::is_same_v) {
                 using dpctl::tensor::kernels::accumulators::NoOpTransformer;
                 fnT fn = dpctl::tensor::kernels::accumulators::
@@ -199,7 +199,7 @@ struct CumLogSumExpStridedFactory
                                   srcTy, dstTy>::is_defined)
         {
             using ScanOpT = su_ns::LogSumExp;
-            constexpr bool include_initial = false;
+            static constexpr bool include_initial = false;
             if constexpr (std::is_same_v) {
                 using dpctl::tensor::kernels::accumulators::NoOpTransformer;
                 fnT fn = dpctl::tensor::kernels::accumulators::
@@ -232,7 +232,7 @@ struct CumLogSumExpIncludeInitialStridedFactory
                                   srcTy, dstTy>::is_defined)
         {
             using ScanOpT = su_ns::LogSumExp;
-            constexpr bool include_initial = true;
+            static constexpr bool include_initial = true;
             if constexpr (std::is_same_v) {
                 using dpctl::tensor::kernels::accumulators::NoOpTransformer;
                 fnT fn = dpctl::tensor::kernels::accumulators::
diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.cpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.cpp
index 045b1b330e..750da77a1b 100644
--- a/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.cpp
+++ b/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.cpp
@@ -146,7 +146,7 @@ struct CumProd1DContigFactory
                                   dstTy>::is_defined)
         {
             using ScanOpT = CumProdScanOpT;
-            constexpr bool include_initial = false;
+            static constexpr bool include_initial = false;
             if constexpr (std::is_same_v) {
                 using dpctl::tensor::kernels::accumulators::NoOpTransformer;
                 fnT fn = dpctl::tensor::kernels::accumulators::
@@ -179,7 +179,7 @@ struct CumProd1DIncludeInitialContigFactory
                                   dstTy>::is_defined)
         {
             using ScanOpT = CumProdScanOpT;
-            constexpr bool include_initial = true;
+            static constexpr bool include_initial = true;
             if constexpr (std::is_same_v) {
                 using dpctl::tensor::kernels::accumulators::NoOpTransformer;
                 fnT fn = dpctl::tensor::kernels::accumulators::
@@ -212,7 +212,7 @@ struct CumProdStridedFactory
                                   dstTy>::is_defined)
         {
             using ScanOpT = CumProdScanOpT;
-            constexpr bool include_initial = false;
+            static constexpr bool include_initial = false;
             if constexpr (std::is_same_v) {
                 using dpctl::tensor::kernels::accumulators::NoOpTransformer;
                 fnT fn = dpctl::tensor::kernels::accumulators::
@@ -245,7 +245,7 @@ struct CumProdIncludeInitialStridedFactory
                                   dstTy>::is_defined)
         {
             using ScanOpT = CumProdScanOpT;
-            constexpr bool include_initial = true;
+            static constexpr bool include_initial = true;
             if constexpr (std::is_same_v) {
                 using dpctl::tensor::kernels::accumulators::NoOpTransformer;
                 fnT fn = dpctl::tensor::kernels::accumulators::
diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.cpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.cpp
index e44678e15f..7136e9005a 100644
--- a/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.cpp
+++ b/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.cpp
@@ -145,7 +145,7 @@ struct CumSum1DContigFactory
                                   dstTy>::is_defined)
         {
             using ScanOpT = CumSumScanOpT;
-            constexpr bool include_initial = false;
+            static constexpr bool include_initial = false;
             if constexpr (std::is_same_v) {
                 using dpctl::tensor::kernels::accumulators::NoOpTransformer;
                 fnT fn = dpctl::tensor::kernels::accumulators::
@@ -178,7 +178,7 @@ struct CumSum1DIncludeInitialContigFactory
                                   dstTy>::is_defined)
         {
             using ScanOpT = CumSumScanOpT;
-            constexpr bool include_initial = true;
+            static constexpr bool include_initial = true;
             if constexpr (std::is_same_v) {
                 using dpctl::tensor::kernels::accumulators::NoOpTransformer;
                 fnT fn = dpctl::tensor::kernels::accumulators::
@@ -211,7 +211,7 @@ struct CumSumStridedFactory
                                   dstTy>::is_defined)
         {
             using ScanOpT = CumSumScanOpT;
-            constexpr bool include_initial = false;
+            static constexpr bool include_initial = false;
             if constexpr (std::is_same_v) {
                 using dpctl::tensor::kernels::accumulators::NoOpTransformer;
                 fnT fn = dpctl::tensor::kernels::accumulators::
@@ -244,7 +244,7 @@ struct CumSumIncludeInitialStridedFactory
                                   dstTy>::is_defined)
         {
             using ScanOpT = CumSumScanOpT;
-            constexpr bool include_initial = true;
+            static constexpr bool include_initial = true;
             if constexpr (std::is_same_v) {
                 using dpctl::tensor::kernels::accumulators::NoOpTransformer;
                 fnT fn = dpctl::tensor::kernels::accumulators::
diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp
index b4ec15c96f..1de4bd74e0 100644
--- a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp
+++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp
@@ -224,8 +224,10 @@ py_extract(const dpctl::tensor::usm_ndarray &src,
     int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
     int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);

-    constexpr int int32_typeid = static_cast(td_ns::typenum_t::INT32);
-    constexpr int int64_typeid = static_cast(td_ns::typenum_t::INT64);
+    static constexpr int int32_typeid =
+        static_cast(td_ns::typenum_t::INT32);
+    static constexpr int int64_typeid =
+        static_cast(td_ns::typenum_t::INT64);
     if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) {
         throw py::value_error("Unexpected data type of cumsum array, expecting "
                               "'int32' or 'int64'");
@@ -550,8 +552,10 @@ py_place(const dpctl::tensor::usm_ndarray &dst,
     int rhs_typeid = array_types.typenum_to_lookup_id(rhs_typenum);
     int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);

-    constexpr int int32_typeid = static_cast(td_ns::typenum_t::INT32);
-    constexpr int int64_typeid = static_cast(td_ns::typenum_t::INT64);
+    static constexpr int int32_typeid =
+        static_cast(td_ns::typenum_t::INT32);
+    static constexpr int int64_typeid =
+        static_cast(td_ns::typenum_t::INT64);
     if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) {
         throw py::value_error("Unexpected data type of cumsum array, expecting "
                               "'int32' or 'int64'");
diff --git a/dpctl/tensor/libtensor/source/copy_as_contig.cpp b/dpctl/tensor/libtensor/source/copy_as_contig.cpp
index 04ddef3495..8b988e73ed 100644
--- a/dpctl/tensor/libtensor/source/copy_as_contig.cpp
+++ b/dpctl/tensor/libtensor/source/copy_as_contig.cpp
@@ -116,7 +116,7 @@ template std::size_t get_nelems(const std::vector &shape)
         return prod * static_cast(term);
     };

-    constexpr std::size_t unit{1};
+    static constexpr std::size_t unit{1};
     const std::size_t nelems =
         std::accumulate(std::begin(shape), std::end(shape), unit, mult_fn);

@@ -185,7 +185,7 @@ py_as_c_contig(const dpctl::tensor::usm_ndarray &src,
     if (src_nd >= 2) {
         auto n = dst_shape_vec.back();
         if (n == dst_shape_vec[src_nd - 2]) {
-            constexpr auto unit_stride = py::ssize_t(1);
+            static constexpr auto unit_stride = py::ssize_t(1);
             if (src_strides_vec[src_nd - 2] == unit_stride) {
                 return py_as_c_contig_f2c(src, dst, exec_q, depends);
             }
@@ -310,7 +310,7 @@ py_as_f_contig(const dpctl::tensor::usm_ndarray &src,
     if (src_nd >= 2) {
         auto n = dst_shape_vec.front();
         if (n == dst_shape_vec[1]) {
-            constexpr auto unit_stride = py::ssize_t(1);
+            static constexpr auto unit_stride = py::ssize_t(1);
             if (src_strides_vec[1] == unit_stride) {
                 return py_as_f_contig_c2f(src, dst, exec_q, depends);
             }
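get_nelems above folds a shape into an element count with std::accumulate; seeding the fold with a static constexpr unit makes an empty shape (a 0-d array) come out as 1. A self-contained restatement (the overflow-conscious cast of the original mult_fn is simplified away here):

    #include <cstddef>
    #include <functional>
    #include <numeric>
    #include <vector>

    inline std::size_t get_nelems_sketch(const std::vector<std::size_t> &shape)
    {
        static constexpr std::size_t unit{1};
        // Product of all extents; an empty shape yields the seed, 1.
        return std::accumulate(shape.begin(), shape.end(), unit,
                               std::multiplies<std::size_t>{});
    }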
diff --git a/dpctl/tensor/libtensor/source/copy_for_roll.cpp b/dpctl/tensor/libtensor/source/copy_for_roll.cpp
index 9bac9db768..ed46fac47d 100644
--- a/dpctl/tensor/libtensor/source/copy_for_roll.cpp
+++ b/dpctl/tensor/libtensor/source/copy_for_roll.cpp
@@ -160,7 +160,7 @@ copy_usm_ndarray_for_roll_1d(const dpctl::tensor::usm_ndarray &src,
         auto fn = copy_for_roll_contig_dispatch_vector[type_id];

         if (fn != nullptr) {
-            constexpr py::ssize_t zero_offset = 0;
+            static constexpr py::ssize_t zero_offset = 0;

             sycl::event copy_for_roll_ev =
                 fn(exec_q, offset, src_nelems, src_data, zero_offset, dst_data,
@@ -331,8 +331,8 @@ copy_usm_ndarray_for_roll_nd(const dpctl::tensor::usm_ndarray &src,
     auto const &dst_strides = dst.get_strides_vector();
     auto const &common_shape = src.get_shape_vector();

-    constexpr py::ssize_t src_offset = 0;
-    constexpr py::ssize_t dst_offset = 0;
+    static constexpr py::ssize_t src_offset = 0;
+    static constexpr py::ssize_t dst_offset = 0;

     auto fn = copy_for_roll_ndshift_dispatch_vector[type_id];
diff --git a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
index 6a53ad6cc0..6b52de7eee 100644
--- a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
+++ b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
@@ -160,7 +160,7 @@ void copy_numpy_ndarray_into_usm_ndarray(
             copy_and_cast_from_host_contig_blocking_dispatch_table
                 [dst_type_id][src_type_id];

-        constexpr py::ssize_t zero_offset(0);
+        static constexpr py::ssize_t zero_offset(0);

         copy_and_cast_from_host_contig_blocking_fn(
             exec_q, src_nelems, src_data, zero_offset, dst_data,
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
index c046321006..59c9c570c9 100644
--- a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
@@ -458,7 +458,7 @@ std::pair py_binary_ufunc(
             std::initializer_list{0, 1};
         static constexpr auto one_zero_strides =
             std::initializer_list{1, 0};
-        constexpr py::ssize_t one{1};
+        static constexpr py::ssize_t one{1};
         // special case of C-contiguous matrix and a row
         if (isEqual(simplified_src2_strides, zero_one_strides) &&
             isEqual(simplified_src1_strides, {simplified_shape[1], one}) &&
@@ -751,7 +751,7 @@ py_binary_inplace_ufunc(const dpctl::tensor::usm_ndarray &lhs,
     if (nd == 2) {
         static constexpr auto one_zero_strides =
             std::initializer_list{1, 0};
-        constexpr py::ssize_t one{1};
+        static constexpr py::ssize_t one{1};
         // special case of C-contiguous matrix and a row
         if (isEqual(simplified_rhs_strides, one_zero_strides) &&
             isEqual(simplified_lhs_strides, {one, simplified_shape[0]}))
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp
index 0e3fb38015..1ad4889fbd 100644
--- a/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp
@@ -298,11 +298,15 @@ py_divide_by_scalar(const dpctl::tensor::usm_ndarray &src,
     const char *src_data = src.get_data();
     char *dst_data = dst.get_data();

-    constexpr int float16_typeid = static_cast(td_ns::typenum_t::HALF);
-    constexpr int float32_typeid = static_cast(td_ns::typenum_t::FLOAT);
-    constexpr int float64_typeid = static_cast(td_ns::typenum_t::DOUBLE);
-    constexpr int complex64_typeid = static_cast(td_ns::typenum_t::CFLOAT);
-    constexpr int complex128_typeid =
+    static constexpr int float16_typeid =
+        static_cast(td_ns::typenum_t::HALF);
+    static constexpr int float32_typeid =
+        static_cast(td_ns::typenum_t::FLOAT);
+    static constexpr int float64_typeid =
+        static_cast(td_ns::typenum_t::DOUBLE);
+    static constexpr int complex64_typeid =
+        static_cast(td_ns::typenum_t::CFLOAT);
+    static constexpr int complex128_typeid =
         static_cast(td_ns::typenum_t::CDOUBLE);

     // statically pre-allocated memory for scalar
@@ -370,7 +374,7 @@ py_divide_by_scalar(const dpctl::tensor::usm_ndarray &src,

     if (nd == 0) {
         // handle 0d array as 1d array with 1 element
-        constexpr py::ssize_t one{1};
+        static constexpr py::ssize_t one{1};
         simplified_shape.push_back(one);
         simplified_src_strides.push_back(one);
         simplified_dst_strides.push_back(one);
diff --git a/dpctl/tensor/libtensor/source/full_ctor.cpp b/dpctl/tensor/libtensor/source/full_ctor.cpp
index fe668422a0..1b3e14b245 100644
--- a/dpctl/tensor/libtensor/source/full_ctor.cpp
+++ b/dpctl/tensor/libtensor/source/full_ctor.cpp
@@ -127,7 +127,7 @@ sycl::event full_contig_impl(sycl::queue &exec_q,
     }

     if (is_zero) {
-        constexpr int memset_val = 0;
+        static constexpr int memset_val = 0;
         fill_ev = exec_q.submit([&](sycl::handler &cgh) {
             cgh.depends_on(depends);
diff --git a/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp b/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp
index ce267baa1b..57bf88c00c 100644
--- a/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp
+++ b/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp
@@ -346,7 +346,7 @@ py_dot(const dpctl::tensor::usm_ndarray &x1,
                     [x2_typeid];
         }
         if (fn != nullptr) {
-            constexpr py::ssize_t zero_offset = 0;
+            static constexpr py::ssize_t zero_offset = 0;
             dot_ev = fn(exec_q, dst_nelems, inner_nelems, x1.get_data(),
                         x2.get_data(), dst.get_data(),
                         zero_offset, // lhs batch offset
@@ -601,7 +601,7 @@ py_dot(const dpctl::tensor::usm_ndarray &x1,
                     [x2_typeid];
         }
         if (fn != nullptr) {
-            constexpr py::ssize_t zero_offset = 0;
+            static constexpr py::ssize_t zero_offset = 0;
             dot_ev = fn(exec_q, x1_data, x2_data, dst_data, batches,
                         x1_outer_nelems, // n
                         inner_nelems,    // k
diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
index 4139cbce19..bc24c71658 100644
--- a/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
+++ b/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
@@ -52,8 +52,8 @@ template bool check_atomic_support(const sycl::queue &exec_q,
                                    sycl::usm::alloc usm_alloc_type)
 {
-    constexpr bool atomic32 = (sizeof(T) == 4);
-    constexpr bool atomic64 = (sizeof(T) == 8);
+    static constexpr bool atomic32 = (sizeof(T) == 4);
+    static constexpr bool atomic64 = (sizeof(T) == 8);
     using dpctl::tensor::type_utils::is_complex;
     if constexpr ((!atomic32 && !atomic64) || is_complex::value) {
         return fixed_decision(exec_q, usm_alloc_type);
diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp
index 9458374482..14abb2eb2e 100644
--- a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp
+++ b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp
@@ -266,7 +266,7 @@ std::pair py_reduction_over_axis(
         if (fn != nullptr) {
             std::size_t iter_nelems = dst_nelems;

-            constexpr py::ssize_t zero_offset = 0;
+            static constexpr py::ssize_t zero_offset = 0;

             sycl::event reduction_over_axis_contig_ev =
                 fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
@@ -299,7 +299,7 @@ std::pair py_reduction_over_axis(
         if (fn != nullptr) {
             std::size_t iter_nelems = dst_nelems;

-            constexpr py::ssize_t zero_offset = 0;
+            static constexpr py::ssize_t zero_offset = 0;

             sycl::event reduction_over_axis_contig_ev =
                 fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
@@ -586,7 +586,7 @@ std::pair py_tree_reduction_over_axis(
         if (fn != nullptr) {
             std::size_t iter_nelems = dst_nelems;

-            constexpr py::ssize_t zero_offset = 0;
+            static constexpr py::ssize_t zero_offset = 0;

             sycl::event reduction_over_axis_contig_ev =
                 fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
@@ -610,7 +610,7 @@ std::pair py_tree_reduction_over_axis(
         if (fn != nullptr) {
             std::size_t iter_nelems = dst_nelems;

-            constexpr py::ssize_t zero_offset = 0;
+            static constexpr py::ssize_t zero_offset = 0;

             sycl::event reduction_over_axis_contig_ev =
                 fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
@@ -863,7 +863,7 @@ std::pair py_search_over_axis(
         if (fn != nullptr) {
             std::size_t iter_nelems = dst_nelems;

-            constexpr py::ssize_t zero_offset = 0;
+            static constexpr py::ssize_t zero_offset = 0;

             sycl::event reduction_over_axis_contig_ev =
                 fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
@@ -885,7 +885,7 @@ std::pair py_search_over_axis(
         if (fn != nullptr) {
             std::size_t iter_nelems = dst_nelems;

-            constexpr py::ssize_t zero_offset = 0;
+            static constexpr py::ssize_t zero_offset = 0;

             sycl::event reduction_over_axis_contig_ev =
                 fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
@@ -1128,7 +1128,8 @@ py_boolean_reduction(const dpctl::tensor::usm_ndarray &src,
     int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
     int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);

-    constexpr int int32_typeid = static_cast(td_ns::typenum_t::INT32);
+    static constexpr int int32_typeid =
+        static_cast(td_ns::typenum_t::INT32);
     if (dst_typeid != int32_typeid) {
         throw py::value_error(
             "Unexpected data type of destination array, expecting 'int32'");
@@ -1152,7 +1153,7 @@ py_boolean_reduction(const dpctl::tensor::usm_ndarray &src,
         (is_src_f_contig && dst_nelems == 0))
     {
         auto fn = axis1_contig_dispatch_vector[src_typeid];
-        constexpr py::ssize_t zero_offset = 0;
+        static constexpr py::ssize_t zero_offset = 0;

         sycl::event red_ev =
             fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, zero_offset,
@@ -1167,7 +1168,7 @@ py_boolean_reduction(const dpctl::tensor::usm_ndarray &src,
              ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous()))
     {
         auto fn = axis0_contig_dispatch_vector[src_typeid];
-        constexpr py::ssize_t zero_offset = 0;
+        static constexpr py::ssize_t zero_offset = 0;

         sycl::event red_ev =
             fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, zero_offset,
diff --git a/dpctl/tensor/libtensor/source/repeat.cpp b/dpctl/tensor/libtensor/source/repeat.cpp
index 25146eac88..9dfc84681e 100644
--- a/dpctl/tensor/libtensor/source/repeat.cpp
+++ b/dpctl/tensor/libtensor/source/repeat.cpp
@@ -198,7 +198,8 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src,
             "Destination array must have the same elemental data type");
     }

-    constexpr int int64_typeid = static_cast(td_ns::typenum_t::INT64);
+    static constexpr int int64_typeid =
+        static_cast(td_ns::typenum_t::INT64);
     if (cumsum_typeid != int64_typeid) {
         throw py::value_error(
             "Unexpected data type of `cumsum` array, expecting "
@@ -425,7 +426,8 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src,
             "Destination array must have the same elemental data type");
     }

-    constexpr int int64_typeid = static_cast(td_ns::typenum_t::INT64);
+    static constexpr int int64_typeid =
+        static_cast(td_ns::typenum_t::INT64);
     if (cumsum_typeid != int64_typeid) {
         throw py::value_error(
             "Unexpected data type of `cumsum` array, expecting "
diff --git a/dpctl/tensor/libtensor/source/sorting/py_sort_common.hpp b/dpctl/tensor/libtensor/source/sorting/py_sort_common.hpp
index 387c1cb521..2a727c4bd9 100644
--- a/dpctl/tensor/libtensor/source/sorting/py_sort_common.hpp
+++ b/dpctl/tensor/libtensor/source/sorting/py_sort_common.hpp
@@ -127,7 +127,7 @@ py_sort(const dpctl::tensor::usm_ndarray &src,
     bool is_dst_c_contig = dst.is_c_contiguous();

     if (is_src_c_contig && is_dst_c_contig) {
-        constexpr py::ssize_t zero_offset = py::ssize_t(0);
+        static constexpr py::ssize_t zero_offset = py::ssize_t(0);

         auto fn = sort_contig_fns[src_typeid];
diff --git a/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp b/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp
index 174214e4c9..fd73c6ecdf 100644
--- a/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp
+++ b/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp
@@ -74,7 +74,7 @@ struct LeftSideSearchSortedContigFactory
         if constexpr (std::is_same_v ||
                       std::is_same_v)
         {
-            constexpr bool left_side_search(true);
+            static constexpr bool left_side_search(true);
             using dpctl::tensor::kernels::searchsorted_contig_impl;

             using Compare = typename AscendingSorter::type;
@@ -98,7 +98,7 @@ struct RightSideSearchSortedContigFactory
         if constexpr (std::is_same_v ||
                       std::is_same_v)
        {
-            constexpr bool right_side_search(false);
+            static constexpr bool right_side_search(false);
             using dpctl::tensor::kernels::searchsorted_contig_impl;

             using Compare = typename AscendingSorter::type;
@@ -130,7 +130,7 @@ struct LeftSideSearchSortedStridedFactory
         if constexpr (std::is_same_v ||
                       std::is_same_v)
        {
-            constexpr bool left_side_search(true);
+            static constexpr bool left_side_search(true);
             using dpctl::tensor::kernels::searchsorted_strided_impl;

             using Compare = typename AscendingSorter::type;
@@ -154,7 +154,7 @@ struct RightSideSearchSortedStridedFactory
         if constexpr (std::is_same_v ||
                       std::is_same_v)
        {
-            constexpr bool right_side_search(false);
+            static constexpr bool right_side_search(false);
             using dpctl::tensor::kernels::searchsorted_strided_impl;

             using Compare = typename AscendingSorter::type;
@@ -321,7 +321,7 @@ py_searchsorted(const dpctl::tensor::usm_ndarray &hay,
                [positions_typeid];

         if (fn) {
-            constexpr py::ssize_t zero_offset(0);
+            static constexpr py::ssize_t zero_offset(0);

             sycl::event comp_ev =
                 fn(exec_q, hay_nelems, needles_nelems, hay_data, zero_offset,
@@ -399,7 +399,7 @@ py_searchsorted(const dpctl::tensor::usm_ndarray &hay,
                 "No implementation for data types of input arrays");
         }

-        constexpr py::ssize_t zero_offset(0);
+        static constexpr py::ssize_t zero_offset(0);

         py::ssize_t hay_step = hay.get_strides_vector()[0];

         const sycl::event &comp_ev = strided_fn(
@@ -429,7 +429,7 @@ py_searchsorted_left(const dpctl::tensor::usm_ndarray &hay,
                      sycl::queue &exec_q,
                      const std::vector &depends)
 {
-    constexpr bool side_left(true);
+    static constexpr bool side_left(true);
     return py_searchsorted(hay, needles, positions, exec_q, side_left,
                            depends);
 }
@@ -443,7 +443,7 @@ py_searchsorted_right(const dpctl::tensor::usm_ndarray &hay,
                       sycl::queue &exec_q,
                       const std::vector &depends)
 {
-    constexpr bool side_right(false);
+    static constexpr bool side_right(false);
     return py_searchsorted(hay, needles, positions, exec_q, side_right,
                            depends);
 }
diff --git a/dpctl/tensor/libtensor/source/zeros_ctor.cpp b/dpctl/tensor/libtensor/source/zeros_ctor.cpp
index 65bfe5bef3..ef74469a8b 100644
--- a/dpctl/tensor/libtensor/source/zeros_ctor.cpp
+++ b/dpctl/tensor/libtensor/source/zeros_ctor.cpp
@@ -78,7 +78,7 @@ sycl::event zeros_contig_impl(sycl::queue &exec_q,
                               const std::vector &depends)
 {

-    constexpr int memset_val(0);
+    static constexpr int memset_val(0);
     sycl::event fill_ev = exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(depends);
diff --git a/dpctl/utils/src/sequential_order_keeper.hpp b/dpctl/utils/src/sequential_order_keeper.hpp
index b29e27a2b9..a6cba662b0 100644
--- a/dpctl/utils/src/sequential_order_keeper.hpp
+++ b/dpctl/utils/src/sequential_order_keeper.hpp
@@ -7,9 +7,10 @@
 namespace
 {

-bool is_event_complete(const sycl::event &e)
+inline bool is_event_complete(const sycl::event &e)
 {
-    constexpr auto exec_complete = sycl::info::event_command_status::complete;
+    static constexpr auto exec_complete =
+        sycl::info::event_command_status::complete;

     const auto status = e.get_info();
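For reference, a self-contained version of the is_event_complete helper above; the get_info query spelled out here uses the standard SYCL 2020 descriptor sycl::info::event::command_execution_status:

    #include <sycl/sycl.hpp>

    inline bool is_event_complete_sketch(const sycl::event &e)
    {
        static constexpr auto exec_complete =
            sycl::info::event_command_status::complete;
        // Query the event's command execution status and compare it
        // against the `complete` state.
        return e.get_info<sycl::info::event::command_execution_status>() ==
               exec_complete;
    }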
diff --git a/libsyclinterface/source/dpctl_sycl_device_manager.cpp b/libsyclinterface/source/dpctl_sycl_device_manager.cpp
index e87f7dd0ab..3bc213fc43 100644
--- a/libsyclinterface/source/dpctl_sycl_device_manager.cpp
+++ b/libsyclinterface/source/dpctl_sycl_device_manager.cpp
@@ -269,7 +269,7 @@ DPCTLDeviceMgr_GetDeviceInfoStr(__dpctl_keep const DPCTLSyclDeviceRef DRef)
 int DPCTLDeviceMgr_GetPositionInDevices(__dpctl_keep DPCTLSyclDeviceRef DRef,
                                         int device_identifier)
 {
-    constexpr int not_found = -1;
+    static constexpr int not_found = -1;
     if (!DRef) {
         return not_found;
     }
diff --git a/libsyclinterface/source/dpctl_sycl_device_selector_interface.cpp b/libsyclinterface/source/dpctl_sycl_device_selector_interface.cpp
index d7c5bcedbc..8e15f694a7 100644
--- a/libsyclinterface/source/dpctl_sycl_device_selector_interface.cpp
+++ b/libsyclinterface/source/dpctl_sycl_device_selector_interface.cpp
@@ -100,7 +100,7 @@ __dpctl_give DPCTLSyclDeviceSelectorRef DPCTLGPUSelector_Create()
 int DPCTLDeviceSelector_Score(__dpctl_keep DPCTLSyclDeviceSelectorRef DSRef,
                               __dpctl_keep DPCTLSyclDeviceRef DRef)
 {
-    constexpr int REJECT_DEVICE_SCORE = -1;
+    static constexpr int REJECT_DEVICE_SCORE = -1;
     if (DSRef && DRef) {
         auto dev = *(unwrap(DRef));
         return (*unwrap(DSRef))(dev);
diff --git a/libsyclinterface/source/dpctl_sycl_kernel_bundle_interface.cpp b/libsyclinterface/source/dpctl_sycl_kernel_bundle_interface.cpp
index 70d89fd4fb..78c714ecbb 100644
--- a/libsyclinterface/source/dpctl_sycl_kernel_bundle_interface.cpp
+++ b/libsyclinterface/source/dpctl_sycl_kernel_bundle_interface.cpp
@@ -72,7 +72,7 @@ static const int clLibLoadFlags = 0;
     case code:                                                                 \
         return std::string(#code) + CodeStringSuffix(code)

-constexpr backend cl_be = backend::opencl;
+static constexpr backend cl_be = backend::opencl;

 struct cl_loader
 {
@@ -339,7 +339,7 @@ static const int zeLibLoadFlags = 0;
 #error "Level Zero program compilation is unavailable for this platform"
 #endif

-constexpr sycl::backend ze_be = sycl::backend::ext_oneapi_level_zero;
+static constexpr sycl::backend ze_be = sycl::backend::ext_oneapi_level_zero;

 struct ze_loader
 {
diff --git a/libsyclinterface/tests/test_sycl_device_aspects.cpp b/libsyclinterface/tests/test_sycl_device_aspects.cpp
index 8c0ffe1f16..fc14ea131f 100644
--- a/libsyclinterface/tests/test_sycl_device_aspects.cpp
+++ b/libsyclinterface/tests/test_sycl_device_aspects.cpp
@@ -99,10 +99,10 @@ auto build_gtest_values(const std::array, N> &params)

 auto build_params()
 {
-    constexpr auto param_1 = get_param_list(
+    static constexpr auto param_1 = get_param_list(
         "opencl:gpu", "opencl:cpu", "level_zero:gpu");

-    constexpr auto param_2 =
+    static constexpr auto param_2 =
         get_param_list>(
             std::make_pair("cpu", sycl::aspect::cpu),
             std::make_pair("gpu", sycl::aspect::gpu),
diff --git a/libsyclinterface/tests/test_sycl_queue_submit.cpp b/libsyclinterface/tests/test_sycl_queue_submit.cpp
index 651c98fd61..ab5b6bef82 100644
--- a/libsyclinterface/tests/test_sycl_queue_submit.cpp
+++ b/libsyclinterface/tests/test_sycl_queue_submit.cpp
@@ -45,7 +45,7 @@
 namespace
 {
-constexpr std::size_t SIZE = 1024;
+static constexpr std::size_t SIZE = 1024;
 static_assert(SIZE % 8 == 0);

 using namespace dpctl::syclinterface;
@@ -59,10 +59,10 @@ void submit_kernel(DPCTLSyclQueueRef QRef,
                    std::string kernelName)
 {
     T scalarVal = 3;
-    constexpr std::size_t NARGS = 4;
-    constexpr std::size_t RANGE_NDIMS_1 = 1;
-    constexpr std::size_t RANGE_NDIMS_2 = 2;
-    constexpr std::size_t RANGE_NDIMS_3 = 3;
+    static constexpr std::size_t NARGS = 4;
+    static constexpr std::size_t RANGE_NDIMS_1 = 1;
+    static constexpr std::size_t RANGE_NDIMS_2 = 2;
+    static constexpr std::size_t RANGE_NDIMS_3 = 3;

     ASSERT_TRUE(DPCTLKernelBundle_HasKernel(KBRef, kernelName.c_str()));
     auto kernel = DPCTLKernelBundle_GetKernel(KBRef, kernelName.c_str());
@@ -373,7 +373,7 @@ TEST_F(TestQueueSubmit, CheckForUnsupportedArgTy)
     int scalarVal = 3;
     std::size_t Range[] = {SIZE};
     std::size_t RANGE_NDIMS = 1;
-    constexpr std::size_t NARGS = 4;
+    static constexpr std::size_t NARGS = 4;

     auto kernel = DPCTLKernelBundle_GetKernel(KBRef, "_ZTS11RangeKernelIdE");
     void *args[NARGS] = {unwrap(nullptr), unwrap(nullptr),
diff --git a/libsyclinterface/tests/test_sycl_queue_submit_local_accessor_arg.cpp b/libsyclinterface/tests/test_sycl_queue_submit_local_accessor_arg.cpp
index a8a9416c67..f6110375bb 100644
--- a/libsyclinterface/tests/test_sycl_queue_submit_local_accessor_arg.cpp
+++ b/libsyclinterface/tests/test_sycl_queue_submit_local_accessor_arg.cpp
@@ -44,7 +44,7 @@
 namespace
 {
-constexpr std::size_t SIZE = 320;
+static constexpr std::size_t SIZE = 320;

 static_assert(SIZE % 10 == 0);

@@ -58,8 +58,8 @@ void submit_kernel(DPCTLSyclQueueRef QRef,
                    DPCTLKernelArgType kernelArgTy,
                    std::string kernelName)
 {
-    constexpr std::size_t NARGS = 2;
-    constexpr std::size_t RANGE_NDIMS = 1;
+    static constexpr std::size_t NARGS = 2;
+    static constexpr std::size_t RANGE_NDIMS = 1;

     ASSERT_TRUE(DPCTLKernelBundle_HasKernel(KBRef, kernelName.c_str()));
     auto kernel = DPCTLKernelBundle_GetKernel(KBRef, kernelName.c_str());
@@ -366,7 +366,7 @@ TEST_F(TestQueueSubmitWithLocalAccessor, CheckForUnsupportedArgTy)
     std::size_t gRange[] = {SIZE};
     std::size_t lRange[] = {SIZE / 10};
     std::size_t RANGE_NDIMS = 1;
-    constexpr std::size_t NARGS = 2;
+    static constexpr std::size_t NARGS = 2;

     auto la = MDLocalAccessor{1, DPCTL_UNSUPPORTED_KERNEL_ARG, SIZE / 10, 1, 1};
     auto kernel = DPCTLKernelBundle_GetKernel(KBRef, "_ZTS14SyclKernel_SLMImE");
diff --git a/libsyclinterface/tests/test_sycl_queue_submit_raw_kernel_arg.cpp b/libsyclinterface/tests/test_sycl_queue_submit_raw_kernel_arg.cpp
index 2d120f7157..f40bc20066 100644
--- a/libsyclinterface/tests/test_sycl_queue_submit_raw_kernel_arg.cpp
+++ b/libsyclinterface/tests/test_sycl_queue_submit_raw_kernel_arg.cpp
@@ -46,7 +46,7 @@
 namespace
 {
-constexpr std::size_t SIZE = 320;
+static constexpr std::size_t SIZE = 320;

 static_assert(SIZE % 10 == 0);

@@ -71,8 +71,8 @@ void submit_kernel(DPCTLSyclQueueRef QRef,
         return;
     }

-    constexpr std::size_t NARGS = 2;
-    constexpr std::size_t RANGE_NDIMS = 1;
+    static constexpr std::size_t NARGS = 2;
+    static constexpr std::size_t RANGE_NDIMS = 1;

     ASSERT_TRUE(DPCTLKernelBundle_HasKernel(KBRef, kernelName.c_str()));
     auto kernel = DPCTLKernelBundle_GetKernel(KBRef, kernelName.c_str());
diff --git a/libsyclinterface/tests/test_sycl_queue_submit_work_group_memory_arg.cpp b/libsyclinterface/tests/test_sycl_queue_submit_work_group_memory_arg.cpp
index 658cca428a..d0f44b7275 100644
--- a/libsyclinterface/tests/test_sycl_queue_submit_work_group_memory_arg.cpp
+++ b/libsyclinterface/tests/test_sycl_queue_submit_work_group_memory_arg.cpp
@@ -46,7 +46,7 @@
 namespace
 {
-constexpr std::size_t SIZE = 320;
+static constexpr std::size_t SIZE = 320;

 static_assert(SIZE % 10 == 0);

@@ -67,8 +67,8 @@ void submit_kernel(DPCTLSyclQueueRef QRef,
         return;
     }

-    constexpr std::size_t NARGS = 2;
-    constexpr std::size_t RANGE_NDIMS = 1;
+    static constexpr std::size_t NARGS = 2;
+    static constexpr std::size_t RANGE_NDIMS = 1;

     ASSERT_TRUE(DPCTLKernelBundle_HasKernel(KBRef, kernelName.c_str()));
     auto kernel = DPCTLKernelBundle_GetKernel(KBRef, kernelName.c_str());
diff --git a/libsyclinterface/tests/test_sycl_usm_interface.cpp b/libsyclinterface/tests/test_sycl_usm_interface.cpp
index e04cbc1ad8..ad693661ff 100644
--- a/libsyclinterface/tests/test_sycl_usm_interface.cpp
+++ b/libsyclinterface/tests/test_sycl_usm_interface.cpp
@@ -42,7 +42,7 @@ using namespace sycl;
 namespace
 {

-constexpr size_t SIZE = 1024;
+static constexpr size_t SIZE = 1024;

 void common_test_body(size_t nbytes,
                       const DPCTLSyclUSMRef Ptr,