IntelPython · ndgrigorian · Jun 2, 2025 · Jun 4, 2025
@@ -77,19 +77,19 @@ DPCTLSyclEventRef async_dec_ref(DPCTLSyclQueueRef QRef,
             });
         });
 
-        constexpr int result_ok = 0;
+        static constexpr int result_ok = 0;
 
         *status = result_ok;
         auto e_ptr = new sycl::event(ht_ev);
         return wrap<sycl::event>(e_ptr);
     } catch (const std::exception &e) {
-        constexpr int result_std_exception = 1;
+        static constexpr int result_std_exception = 1;
 
         *status = result_std_exception;
         return nullptr;
     }
 
-    constexpr int result_other_abnormal = 2;
+    static constexpr int result_other_abnormal = 2;
 
     *status = result_other_abnormal;
     return nullptr;

@@ -60,9 +60,9 @@ template <typename inputT, typename outputT> struct NonZeroIndicator
 
     outputT operator()(const inputT &val) const
     {
-        constexpr outputT out_one(1);
-        constexpr outputT out_zero(0);
-        constexpr inputT val_zero(0);
+        static constexpr outputT out_one(1);
+        static constexpr outputT out_zero(0);
+        static constexpr inputT val_zero(0);
 
         return (val == val_zero) ? out_zero : out_one;
     }
@@ -583,7 +583,7 @@ sycl::event update_local_chunks_1d(sycl::queue &exec_q,
         cgh.depends_on(dependent_event);
         cgh.use_kernel_bundle(kb);
 
-        constexpr nwiT updates_per_wi = n_wi;
+        static constexpr nwiT updates_per_wi = n_wi;
         const std::size_t n_items =
             ceiling_quotient<std::size_t>(src_size, sg_size * n_wi) * sg_size;
 
@@ -594,8 +594,8 @@ sycl::event update_local_chunks_1d(sycl::queue &exec_q,
         cgh.parallel_for<UpdateKernelName>(
             ndRange,
             [chunk_size, src, src_size, local_scans](sycl::nd_item<1> ndit) {
-                constexpr ScanOpT scan_op{};
-                constexpr outputT identity =
+                static constexpr ScanOpT scan_op{};
+                static constexpr outputT identity =
                     su_ns::Identity<ScanOpT, outputT>::value;
 
                 const std::uint32_t lws = ndit.get_local_range(0);
@@ -640,16 +640,17 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q,
                                    std::vector<sycl::event> &host_tasks,
                                    const std::vector<sycl::event> &depends = {})
 {
-    constexpr ScanOpT scan_op{};
-    constexpr outputT identity = su_ns::Identity<ScanOpT, outputT>::value;
+    static constexpr ScanOpT scan_op{};
+    static constexpr outputT identity =
+        su_ns::Identity<ScanOpT, outputT>::value;
 
-    constexpr std::size_t _iter_nelems = 1;
+    static constexpr std::size_t _iter_nelems = 1;
 
     using IterIndexerT = dpctl::tensor::offset_utils::TwoZeroOffsets_Indexer;
-    constexpr IterIndexerT _no_op_iter_indexer{};
+    static constexpr IterIndexerT _no_op_iter_indexer{};
 
     using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-    constexpr NoOpIndexerT _no_op_indexer{};
+    static constexpr NoOpIndexerT _no_op_indexer{};
 
     std::size_t n_groups;
     sycl::event inc_scan_phase1_ev =
@@ -687,7 +688,7 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q,
         outputT *local_scans = temp;
 
         using NoOpTransformerT = NoOpTransformer<outputT>;
-        constexpr NoOpTransformerT _no_op_transformer{};
+        static constexpr NoOpTransformerT _no_op_transformer{};
         std::size_t size_to_update = n_elems;
         while (n_groups_ > 1) {
 
@@ -761,16 +762,16 @@ accumulate_1d_contig_impl(sycl::queue &q,
     dstT *dst_data_ptr = reinterpret_cast<dstT *>(dst);
 
     using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-    constexpr NoOpIndexerT flat_indexer{};
-    constexpr transformerT transformer{};
+    static constexpr NoOpIndexerT flat_indexer{};
+    static constexpr transformerT transformer{};
 
-    constexpr std::size_t s0 = 0;
-    constexpr std::size_t s1 = 1;
+    static constexpr std::size_t s0 = 0;
+    static constexpr std::size_t s1 = 1;
 
     sycl::event comp_ev;
     const sycl::device &dev = q.get_device();
     if (dev.has(sycl::aspect::cpu)) {
-        constexpr nwiT n_wi_for_cpu = 8;
+        static constexpr nwiT n_wi_for_cpu = 8;
         const std::uint32_t wg_size = 256;
         comp_ev = inclusive_scan_iter_1d<srcT, dstT, n_wi_for_cpu, NoOpIndexerT,
                                          transformerT, AccumulateOpT,
@@ -779,7 +780,7 @@ accumulate_1d_contig_impl(sycl::queue &q,
             flat_indexer, transformer, host_tasks, depends);
     }
     else {
-        constexpr nwiT n_wi_for_gpu = 4;
+        static constexpr nwiT n_wi_for_gpu = 4;
         // base_scan_striped algorithm does not execute correctly
         // on HIP device with wg_size > 64
         const std::uint32_t wg_size =
@@ -829,7 +830,7 @@ sycl::event final_update_local_chunks(sycl::queue &exec_q,
     const std::uint32_t sg_size = krn.template get_info<
         sycl::info::kernel_device_specific::max_sub_group_size>(dev);
 
-    constexpr nwiT updates_per_wi = n_wi;
+    static constexpr nwiT updates_per_wi = n_wi;
     const std::size_t updates_per_sg = sg_size * updates_per_wi;
     const std::size_t update_nelems =
         ceiling_quotient(src_size, updates_per_sg) * sg_size;
@@ -845,8 +846,8 @@ sycl::event final_update_local_chunks(sycl::queue &exec_q,
         cgh.parallel_for<UpdateKernelName>(
             ndRange, [chunk_size, src_size, local_stride, src, local_scans,
                       out_iter_indexer, out_indexer](sycl::nd_item<2> ndit) {
-                constexpr ScanOpT scan_op{};
-                constexpr outputT identity =
+                static constexpr ScanOpT scan_op{};
+                static constexpr outputT identity =
                     su_ns::Identity<ScanOpT, outputT>::value;
 
                 const std::uint32_t lws = ndit.get_local_range(1);
@@ -898,8 +899,8 @@ sycl::event update_local_chunks(sycl::queue &exec_q,
                                 std::size_t local_stride,
                                 sycl::event dependent_event)
 {
-    constexpr NoOpIndexer out_indexer{};
-    constexpr NoOpIndexer iter_out_indexer{};
+    static constexpr NoOpIndexer out_indexer{};
+    static constexpr NoOpIndexer iter_out_indexer{};
 
     return final_update_local_chunks<UpdateKernelName, outputT, n_wi,
                                      NoOpIndexer, NoOpIndexer, ScanOpT>(
@@ -933,8 +934,9 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,
                                 std::vector<sycl::event> &host_tasks,
                                 const std::vector<sycl::event> &depends = {})
 {
-    constexpr ScanOpT scan_op{};
-    constexpr outputT identity = su_ns::Identity<ScanOpT, outputT>::value;
+    static constexpr ScanOpT scan_op{};
+    static constexpr outputT identity =
+        su_ns::Identity<ScanOpT, outputT>::value;
 
     using IterIndexerT =
         dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
@@ -977,9 +979,9 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,
         outputT *local_scans = temp;
 
         using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        constexpr NoOpIndexerT _no_op_indexer{};
+        static constexpr NoOpIndexerT _no_op_indexer{};
         using NoOpTransformerT = NoOpTransformer<outputT>;
-        constexpr NoOpTransformerT _no_op_transformer{};
+        static constexpr NoOpTransformerT _no_op_transformer{};
         std::size_t size_to_update = acc_nelems;
 
         {
@@ -1142,15 +1144,15 @@ accumulate_strided_impl(sycl::queue &q,
                                        iter_shape_strides,
                                        iter_shape_strides + 2 * iter_nd};
 
-    constexpr transformerT transformer{};
+    static constexpr transformerT transformer{};
 
-    constexpr std::size_t s0 = 0;
-    constexpr std::size_t s1 = 1;
+    static constexpr std::size_t s0 = 0;
+    static constexpr std::size_t s1 = 1;
 
     const sycl::device &dev = q.get_device();
     sycl::event comp_ev;
     if (dev.has(sycl::aspect::cpu)) {
-        constexpr nwiT n_wi_for_cpu = 8;
+        static constexpr nwiT n_wi_for_cpu = 8;
         const std::uint32_t wg_size = 256;
         comp_ev =
             inclusive_scan_iter<srcT, dstT, n_wi_for_cpu, InpIndexerT,
@@ -1161,7 +1163,7 @@ accumulate_strided_impl(sycl::queue &q,
                 out_axis_indexer, transformer, host_tasks, depends);
     }
     else {
-        constexpr nwiT n_wi_for_gpu = 4;
+        static constexpr nwiT n_wi_for_gpu = 4;
         // base_scan_striped algorithm does not execute correctly
         // on HIP device with wg_size > 64
         const std::uint32_t wg_size =
@@ -1198,18 +1200,18 @@ std::size_t cumsum_val_contig_impl(sycl::queue &q,
     cumsumT *cumsum_data_ptr = reinterpret_cast<cumsumT *>(cumsum);
 
     using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-    constexpr NoOpIndexerT flat_indexer{};
-    constexpr transformerT transformer{};
+    static constexpr NoOpIndexerT flat_indexer{};
+    static constexpr transformerT transformer{};
 
-    constexpr std::size_t s0 = 0;
-    constexpr std::size_t s1 = 1;
-    constexpr bool include_initial = false;
+    static constexpr std::size_t s0 = 0;
+    static constexpr std::size_t s1 = 1;
+    static constexpr bool include_initial = false;
     using AccumulateOpT = sycl::plus<cumsumT>;
 
     sycl::event comp_ev;
     const sycl::device &dev = q.get_device();
     if (dev.has(sycl::aspect::cpu)) {
-        constexpr nwiT n_wi_for_cpu = 8;
+        static constexpr nwiT n_wi_for_cpu = 8;
         const std::uint32_t wg_size = 256;
         comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_cpu,
                                          NoOpIndexerT, transformerT,
@@ -1218,7 +1220,7 @@ std::size_t cumsum_val_contig_impl(sycl::queue &q,
             flat_indexer, transformer, host_tasks, depends);
     }
     else {
-        constexpr nwiT n_wi_for_gpu = 4;
+        static constexpr nwiT n_wi_for_gpu = 4;
         // base_scan_striped algorithm does not execute correctly
         // on HIP device with wg_size > 64
         const std::uint32_t wg_size =
@@ -1313,17 +1315,17 @@ cumsum_val_strided_impl(sycl::queue &q,
 
     using StridedIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
     const StridedIndexerT strided_indexer{nd, 0, shape_strides};
-    constexpr transformerT transformer{};
+    static constexpr transformerT transformer{};
 
-    constexpr std::size_t s0 = 0;
-    constexpr std::size_t s1 = 1;
-    constexpr bool include_initial = false;
+    static constexpr std::size_t s0 = 0;
+    static constexpr std::size_t s1 = 1;
+    static constexpr bool include_initial = false;
     using AccumulateOpT = sycl::plus<cumsumT>;
 
     const sycl::device &dev = q.get_device();
     sycl::event comp_ev;
     if (dev.has(sycl::aspect::cpu)) {
-        constexpr nwiT n_wi_for_cpu = 8;
+        static constexpr nwiT n_wi_for_cpu = 8;
         const std::uint32_t wg_size = 256;
         comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_cpu,
                                          StridedIndexerT, transformerT,
@@ -1332,7 +1334,7 @@ cumsum_val_strided_impl(sycl::queue &q,
             strided_indexer, transformer, host_tasks, depends);
     }
     else {
-        constexpr nwiT n_wi_for_gpu = 4;
+        static constexpr nwiT n_wi_for_gpu = 4;
         // base_scan_striped algorithm does not execute correctly
         // on HIP device with wg_size > 64
         const std::uint32_t wg_size =

@@ -30,7 +30,7 @@ namespace kernels
 namespace alignment_utils
 {
 
-static constexpr std::size_t required_alignment = 64UL;
+inline constexpr std::size_t required_alignment = 64UL;
 
 template <std::uintptr_t alignment, typename Ptr> bool is_aligned(Ptr p)
 {

@@ -227,11 +227,11 @@ std::size_t _get_lws_impl(std::size_t n)
     }
 }
 
-std::size_t get_lws(std::size_t n)
+inline std::size_t get_lws(std::size_t n)
 {
-    constexpr std::size_t lws0 = 256u;
-    constexpr std::size_t lws1 = 128u;
-    constexpr std::size_t lws2 = 64u;
+    static constexpr std::size_t lws0 = 256u;
+    static constexpr std::size_t lws1 = 128u;
+    static constexpr std::size_t lws2 = 64u;
     return _get_lws_impl<lws0, lws1, lws2>(n);
 }
 
@@ -261,9 +261,9 @@ sycl::event masked_extract_all_slices_contig_impl(
     ssize_t dst_stride,
     const std::vector<sycl::event> &depends = {})
 {
-    constexpr TwoZeroOffsets_Indexer orthog_src_dst_indexer{};
+    static constexpr TwoZeroOffsets_Indexer orthog_src_dst_indexer{};
 
-    constexpr NoOpIndexer masked_src_indexer{};
+    static constexpr NoOpIndexer masked_src_indexer{};
     const Strided1DIndexer masked_dst_indexer(/* size */ dst_size,
                                               /* step */ dst_stride);
 
@@ -339,7 +339,7 @@ sycl::event masked_extract_all_slices_strided_impl(
     ssize_t dst_stride,
     const std::vector<sycl::event> &depends = {})
 {
-    constexpr TwoZeroOffsets_Indexer orthog_src_dst_indexer{};
+    static constexpr TwoZeroOffsets_Indexer orthog_src_dst_indexer{};
 
     /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const
      * *_packed_shape_strides) */
@@ -578,7 +578,7 @@ sycl::event masked_place_all_slices_strided_impl(
     ssize_t rhs_stride,
     const std::vector<sycl::event> &depends = {})
 {
-    constexpr TwoZeroOffsets_Indexer orthog_dst_rhs_indexer{};
+    static constexpr TwoZeroOffsets_Indexer orthog_dst_rhs_indexer{};
 
     /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const
      * *_packed_shape_strides) */
@@ -589,7 +589,7 @@ sycl::event masked_place_all_slices_strided_impl(
         TwoZeroOffsets_Indexer, StridedIndexer, Strided1DCyclicIndexer, dataT,
         indT>;
 
-    constexpr std::size_t nominal_lws = 256;
+    static constexpr std::size_t nominal_lws = 256;
     const std::size_t masked_extent = iteration_size;
     const std::size_t lws = std::min(masked_extent, nominal_lws);
 
@@ -685,7 +685,7 @@ sycl::event masked_place_some_slices_strided_impl(
         TwoOffsets_StridedIndexer, StridedIndexer, Strided1DCyclicIndexer,
         dataT, indT>;
 
-    constexpr std::size_t nominal_lws = 256;
+    static constexpr std::size_t nominal_lws = 256;
     const std::size_t orthog_extent = orthog_nelems;
     const std::size_t masked_extent = masked_nelems;
     const std::size_t lws = std::min(masked_extent, nominal_lws);
@@ -788,7 +788,7 @@ sycl::event non_zero_indexes_impl(sycl::queue &exec_q,
     const indT1 *cumsum_data = reinterpret_cast<const indT1 *>(cumsum_cp);
     indT2 *indexes_data = reinterpret_cast<indT2 *>(indexes_cp);
 
-    constexpr std::size_t nominal_lws = 256u;
+    static constexpr std::size_t nominal_lws = 256u;
     const std::size_t masked_extent = iter_size;
     const std::size_t lws = std::min(masked_extent, nominal_lws);
 

@@ -106,7 +106,7 @@ class ClipContigFunctor
 
     void operator()(sycl::nd_item<1> ndit) const
     {
-        constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz;
+        static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz;
 
         using dpctl::tensor::type_utils::is_complex;
         if constexpr (is_complex<T>::value || !enable_sg_loadstore) {
@@ -202,8 +202,8 @@ sycl::event clip_contig_impl(sycl::queue &q,
         cgh.depends_on(depends);
 
         std::size_t lws = 64;
-        constexpr std::uint8_t vec_sz = 4;
-        constexpr std::uint8_t n_vecs = 2;
+        static constexpr std::uint8_t vec_sz = 4;
+        static constexpr std::uint8_t n_vecs = 2;
         const std::size_t n_groups =
             ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
         const auto gws_range = sycl::range<1>(n_groups * lws);
@@ -214,7 +214,7 @@ sycl::event clip_contig_impl(sycl::queue &q,
             is_aligned<required_alignment>(max_cp) &&
             is_aligned<required_alignment>(dst_cp))
         {
-            constexpr bool enable_sg_loadstore = true;
+            static constexpr bool enable_sg_loadstore = true;
             using KernelName = clip_contig_kernel<T, vec_sz, n_vecs>;
             using Impl =
                 ClipContigFunctor<T, vec_sz, n_vecs, enable_sg_loadstore>;
@@ -224,7 +224,7 @@ sycl::event clip_contig_impl(sycl::queue &q,
                 Impl(nelems, x_tp, min_tp, max_tp, dst_tp));
         }
         else {
-            constexpr bool disable_sg_loadstore = false;
+            static constexpr bool disable_sg_loadstore = false;
             using InnerKernelName = clip_contig_kernel<T, vec_sz, n_vecs>;
             using KernelName =
                 disabled_sg_loadstore_wrapper_krn<InnerKernelName>;

@@ -468,7 +468,7 @@ sycl::event tri_impl(sycl::queue &exec_q,
                      const std::vector<sycl::event> &depends,
                      const std::vector<sycl::event> &additional_depends)
 {
-    constexpr int d2 = 2;
+    static constexpr int d2 = 2;
     ssize_t src_s = nd;
     ssize_t dst_s = 2 * nd;
     ssize_t nd_1 = nd - 1;