From c6bc9904b051578707166d959d11431f34b67def Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Sun, 1 Jun 2025 23:49:24 -0700 Subject: [PATCH 1/2] make namespace-scope constexpr variables inline in headers this gives these constants external linkage and prevents duplication --- dpctl/tensor/libtensor/include/kernels/alignment.hpp | 2 +- dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp | 2 +- dpctl/tensor/libtensor/include/utils/type_utils.hpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/alignment.hpp b/dpctl/tensor/libtensor/include/kernels/alignment.hpp index a6dc93b6c7..1b92deb7ce 100644 --- a/dpctl/tensor/libtensor/include/kernels/alignment.hpp +++ b/dpctl/tensor/libtensor/include/kernels/alignment.hpp @@ -30,7 +30,7 @@ namespace kernels namespace alignment_utils { -static constexpr std::size_t required_alignment = 64UL; +inline constexpr std::size_t required_alignment = 64UL; template bool is_aligned(Ptr p) { diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp b/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp index 4394d9a4b1..b1308184fc 100644 --- a/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp +++ b/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp @@ -56,7 +56,7 @@ enum class typenum_t : int CFLOAT, CDOUBLE, // 13 }; -constexpr int num_types = 14; // number of elements in typenum_t +inline constexpr int num_types = 14; // number of elements in typenum_t template diff --git a/dpctl/tensor/libtensor/include/utils/type_utils.hpp b/dpctl/tensor/libtensor/include/utils/type_utils.hpp index e9493aebfe..41c42476b6 100644 --- a/dpctl/tensor/libtensor/include/utils/type_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/type_utils.hpp @@ -51,7 +51,7 @@ struct is_complex< { }; -template constexpr bool is_complex_v = is_complex::value; +template inline constexpr bool is_complex_v = is_complex::value; template dstTy convert_impl(const srcTy &v) { From 6edcc218c63949cf4a448078bfa1307766ffc10e Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 4 Jun 2025 14:12:36 -0700 Subject: [PATCH 2/2] Make constexpr variables static --- dpctl/_host_task_util.hpp | 6 +- .../include/kernels/accumulators.hpp | 92 +++---- .../kernels/boolean_advanced_indexing.hpp | 22 +- .../tensor/libtensor/include/kernels/clip.hpp | 10 +- .../include/kernels/constructors.hpp | 2 +- .../include/kernels/copy_and_cast.hpp | 23 +- .../include/kernels/copy_as_contiguous.hpp | 32 +-- .../kernels/elementwise_functions/abs.hpp | 4 +- .../kernels/elementwise_functions/acos.hpp | 9 +- .../kernels/elementwise_functions/acosh.hpp | 12 +- .../kernels/elementwise_functions/add.hpp | 8 +- .../kernels/elementwise_functions/angle.hpp | 8 +- .../kernels/elementwise_functions/asin.hpp | 9 +- .../kernels/elementwise_functions/asinh.hpp | 9 +- .../kernels/elementwise_functions/atan.hpp | 9 +- .../kernels/elementwise_functions/atan2.hpp | 4 +- .../kernels/elementwise_functions/atanh.hpp | 7 +- .../elementwise_functions/bitwise_and.hpp | 8 +- .../elementwise_functions/bitwise_invert.hpp | 4 +- .../bitwise_left_shift.hpp | 17 +- .../elementwise_functions/bitwise_or.hpp | 8 +- .../bitwise_right_shift.hpp | 13 +- .../elementwise_functions/bitwise_xor.hpp | 8 +- .../elementwise_functions/cabs_impl.hpp | 4 +- .../kernels/elementwise_functions/cbrt.hpp | 4 +- .../kernels/elementwise_functions/ceil.hpp | 4 +- .../kernels/elementwise_functions/common.hpp | 18 +- 
.../elementwise_functions/common_inplace.hpp | 6 +- .../kernels/elementwise_functions/conj.hpp | 4 +- .../elementwise_functions/copysign.hpp | 4 +- .../kernels/elementwise_functions/cos.hpp | 7 +- .../kernels/elementwise_functions/cosh.hpp | 7 +- .../kernels/elementwise_functions/equal.hpp | 4 +- .../kernels/elementwise_functions/exp.hpp | 7 +- .../kernels/elementwise_functions/exp2.hpp | 7 +- .../kernels/elementwise_functions/expm1.hpp | 4 +- .../kernels/elementwise_functions/floor.hpp | 4 +- .../elementwise_functions/floor_divide.hpp | 8 +- .../kernels/elementwise_functions/greater.hpp | 4 +- .../elementwise_functions/greater_equal.hpp | 4 +- .../kernels/elementwise_functions/hypot.hpp | 4 +- .../kernels/elementwise_functions/imag.hpp | 4 +- .../elementwise_functions/isfinite.hpp | 4 +- .../kernels/elementwise_functions/isinf.hpp | 4 +- .../kernels/elementwise_functions/isnan.hpp | 4 +- .../kernels/elementwise_functions/less.hpp | 4 +- .../elementwise_functions/less_equal.hpp | 4 +- .../kernels/elementwise_functions/log.hpp | 4 +- .../kernels/elementwise_functions/log10.hpp | 4 +- .../kernels/elementwise_functions/log1p.hpp | 4 +- .../kernels/elementwise_functions/log2.hpp | 4 +- .../elementwise_functions/logaddexp.hpp | 4 +- .../elementwise_functions/logical_and.hpp | 4 +- .../elementwise_functions/logical_not.hpp | 4 +- .../elementwise_functions/logical_or.hpp | 4 +- .../elementwise_functions/logical_xor.hpp | 4 +- .../kernels/elementwise_functions/maximum.hpp | 4 +- .../kernels/elementwise_functions/minimum.hpp | 4 +- .../elementwise_functions/multiply.hpp | 8 +- .../elementwise_functions/negative.hpp | 4 +- .../elementwise_functions/nextafter.hpp | 4 +- .../elementwise_functions/not_equal.hpp | 4 +- .../elementwise_functions/positive.hpp | 4 +- .../kernels/elementwise_functions/pow.hpp | 8 +- .../kernels/elementwise_functions/proj.hpp | 4 +- .../kernels/elementwise_functions/real.hpp | 4 +- .../elementwise_functions/reciprocal.hpp | 4 +- .../elementwise_functions/remainder.hpp | 8 +- .../kernels/elementwise_functions/round.hpp | 4 +- .../kernels/elementwise_functions/rsqrt.hpp | 4 +- .../kernels/elementwise_functions/sign.hpp | 4 +- .../kernels/elementwise_functions/signbit.hpp | 4 +- .../kernels/elementwise_functions/sin.hpp | 7 +- .../kernels/elementwise_functions/sinh.hpp | 4 +- .../kernels/elementwise_functions/sqrt.hpp | 4 +- .../kernels/elementwise_functions/square.hpp | 4 +- .../elementwise_functions/subtract.hpp | 8 +- .../kernels/elementwise_functions/tan.hpp | 7 +- .../kernels/elementwise_functions/tanh.hpp | 7 +- .../elementwise_functions/true_divide.hpp | 8 +- .../kernels/elementwise_functions/trunc.hpp | 4 +- .../kernels/integer_advanced_indexing.hpp | 4 +- .../kernels/linalg_functions/dot_product.hpp | 48 ++-- .../include/kernels/linalg_functions/gemm.hpp | 241 +++++++++--------- .../libtensor/include/kernels/reductions.hpp | 142 ++++++----- .../libtensor/include/kernels/repeat.hpp | 4 +- .../include/kernels/sorting/merge_sort.hpp | 2 +- .../include/kernels/sorting/radix_sort.hpp | 138 +++++----- .../include/kernels/sorting/searchsorted.hpp | 8 +- .../include/kernels/sorting/sort_utils.hpp | 10 +- .../include/kernels/sorting/topk.hpp | 6 +- .../libtensor/include/kernels/where.hpp | 10 +- .../include/utils/indexing_utils.hpp | 28 +- .../libtensor/include/utils/math_utils.hpp | 2 +- .../include/utils/sycl_alloc_utils.hpp | 2 +- .../libtensor/include/utils/sycl_utils.hpp | 16 +- .../tensor/libtensor/source/accumulators.cpp | 9 +- .../accumulators/accumulate_over_axis.hpp | 
2 +- .../accumulators/cumulative_logsumexp.cpp | 8 +- .../source/accumulators/cumulative_prod.cpp | 8 +- .../source/accumulators/cumulative_sum.cpp | 8 +- .../source/boolean_advanced_indexing.cpp | 12 +- .../libtensor/source/copy_as_contig.cpp | 6 +- .../tensor/libtensor/source/copy_for_roll.cpp | 6 +- .../copy_numpy_ndarray_into_usm_ndarray.cpp | 2 +- .../elementwise_functions.hpp | 4 +- .../elementwise_functions/true_divide.cpp | 16 +- dpctl/tensor/libtensor/source/full_ctor.cpp | 2 +- .../libtensor/source/linalg_functions/dot.cpp | 4 +- .../reductions/reduction_atomic_support.hpp | 4 +- .../source/reductions/reduction_over_axis.hpp | 19 +- dpctl/tensor/libtensor/source/repeat.cpp | 6 +- .../source/sorting/py_sort_common.hpp | 2 +- .../libtensor/source/sorting/searchsorted.cpp | 16 +- dpctl/tensor/libtensor/source/zeros_ctor.cpp | 2 +- dpctl/utils/src/sequential_order_keeper.hpp | 5 +- .../source/dpctl_sycl_device_manager.cpp | 2 +- .../dpctl_sycl_device_selector_interface.cpp | 2 +- .../dpctl_sycl_kernel_bundle_interface.cpp | 4 +- .../tests/test_sycl_device_aspects.cpp | 4 +- .../tests/test_sycl_queue_submit.cpp | 12 +- ...t_sycl_queue_submit_local_accessor_arg.cpp | 8 +- .../test_sycl_queue_submit_raw_kernel_arg.cpp | 6 +- ...ycl_queue_submit_work_group_memory_arg.cpp | 6 +- .../tests/test_sycl_usm_interface.cpp | 2 +- 125 files changed, 764 insertions(+), 701 deletions(-) diff --git a/dpctl/_host_task_util.hpp b/dpctl/_host_task_util.hpp index 33a01a2536..4db07e1cb1 100644 --- a/dpctl/_host_task_util.hpp +++ b/dpctl/_host_task_util.hpp @@ -77,19 +77,19 @@ DPCTLSyclEventRef async_dec_ref(DPCTLSyclQueueRef QRef, }); }); - constexpr int result_ok = 0; + static constexpr int result_ok = 0; *status = result_ok; auto e_ptr = new sycl::event(ht_ev); return wrap(e_ptr); } catch (const std::exception &e) { - constexpr int result_std_exception = 1; + static constexpr int result_std_exception = 1; *status = result_std_exception; return nullptr; } - constexpr int result_other_abnormal = 2; + static constexpr int result_other_abnormal = 2; *status = result_other_abnormal; return nullptr; diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index f0f72aee82..407ea9e19a 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -60,9 +60,9 @@ template struct NonZeroIndicator outputT operator()(const inputT &val) const { - constexpr outputT out_one(1); - constexpr outputT out_zero(0); - constexpr inputT val_zero(0); + static constexpr outputT out_one(1); + static constexpr outputT out_zero(0); + static constexpr inputT val_zero(0); return (val == val_zero) ? 
out_zero : out_one; } @@ -583,7 +583,7 @@ sycl::event update_local_chunks_1d(sycl::queue &exec_q, cgh.depends_on(dependent_event); cgh.use_kernel_bundle(kb); - constexpr nwiT updates_per_wi = n_wi; + static constexpr nwiT updates_per_wi = n_wi; const std::size_t n_items = ceiling_quotient(src_size, sg_size * n_wi) * sg_size; @@ -594,8 +594,8 @@ sycl::event update_local_chunks_1d(sycl::queue &exec_q, cgh.parallel_for( ndRange, [chunk_size, src, src_size, local_scans](sycl::nd_item<1> ndit) { - constexpr ScanOpT scan_op{}; - constexpr outputT identity = + static constexpr ScanOpT scan_op{}; + static constexpr outputT identity = su_ns::Identity::value; const std::uint32_t lws = ndit.get_local_range(0); @@ -640,16 +640,17 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q, std::vector &host_tasks, const std::vector &depends = {}) { - constexpr ScanOpT scan_op{}; - constexpr outputT identity = su_ns::Identity::value; + static constexpr ScanOpT scan_op{}; + static constexpr outputT identity = + su_ns::Identity::value; - constexpr std::size_t _iter_nelems = 1; + static constexpr std::size_t _iter_nelems = 1; using IterIndexerT = dpctl::tensor::offset_utils::TwoZeroOffsets_Indexer; - constexpr IterIndexerT _no_op_iter_indexer{}; + static constexpr IterIndexerT _no_op_iter_indexer{}; using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr NoOpIndexerT _no_op_indexer{}; + static constexpr NoOpIndexerT _no_op_indexer{}; std::size_t n_groups; sycl::event inc_scan_phase1_ev = @@ -687,7 +688,7 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q, outputT *local_scans = temp; using NoOpTransformerT = NoOpTransformer; - constexpr NoOpTransformerT _no_op_transformer{}; + static constexpr NoOpTransformerT _no_op_transformer{}; std::size_t size_to_update = n_elems; while (n_groups_ > 1) { @@ -761,16 +762,16 @@ accumulate_1d_contig_impl(sycl::queue &q, dstT *dst_data_ptr = reinterpret_cast(dst); using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr NoOpIndexerT flat_indexer{}; - constexpr transformerT transformer{}; + static constexpr NoOpIndexerT flat_indexer{}; + static constexpr transformerT transformer{}; - constexpr std::size_t s0 = 0; - constexpr std::size_t s1 = 1; + static constexpr std::size_t s0 = 0; + static constexpr std::size_t s1 = 1; sycl::event comp_ev; const sycl::device &dev = q.get_device(); if (dev.has(sycl::aspect::cpu)) { - constexpr nwiT n_wi_for_cpu = 8; + static constexpr nwiT n_wi_for_cpu = 8; const std::uint32_t wg_size = 256; comp_ev = inclusive_scan_iter_1d 64 const std::uint32_t wg_size = @@ -829,7 +830,7 @@ sycl::event final_update_local_chunks(sycl::queue &exec_q, const std::uint32_t sg_size = krn.template get_info< sycl::info::kernel_device_specific::max_sub_group_size>(dev); - constexpr nwiT updates_per_wi = n_wi; + static constexpr nwiT updates_per_wi = n_wi; const std::size_t updates_per_sg = sg_size * updates_per_wi; const std::size_t update_nelems = ceiling_quotient(src_size, updates_per_sg) * sg_size; @@ -845,8 +846,8 @@ sycl::event final_update_local_chunks(sycl::queue &exec_q, cgh.parallel_for( ndRange, [chunk_size, src_size, local_stride, src, local_scans, out_iter_indexer, out_indexer](sycl::nd_item<2> ndit) { - constexpr ScanOpT scan_op{}; - constexpr outputT identity = + static constexpr ScanOpT scan_op{}; + static constexpr outputT identity = su_ns::Identity::value; const std::uint32_t lws = ndit.get_local_range(1); @@ -898,8 +899,8 @@ sycl::event update_local_chunks(sycl::queue &exec_q, std::size_t local_stride, 
sycl::event dependent_event) { - constexpr NoOpIndexer out_indexer{}; - constexpr NoOpIndexer iter_out_indexer{}; + static constexpr NoOpIndexer out_indexer{}; + static constexpr NoOpIndexer iter_out_indexer{}; return final_update_local_chunks( @@ -933,8 +934,9 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q, std::vector &host_tasks, const std::vector &depends = {}) { - constexpr ScanOpT scan_op{}; - constexpr outputT identity = su_ns::Identity::value; + static constexpr ScanOpT scan_op{}; + static constexpr outputT identity = + su_ns::Identity::value; using IterIndexerT = dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< @@ -977,9 +979,9 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q, outputT *local_scans = temp; using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr NoOpIndexerT _no_op_indexer{}; + static constexpr NoOpIndexerT _no_op_indexer{}; using NoOpTransformerT = NoOpTransformer; - constexpr NoOpTransformerT _no_op_transformer{}; + static constexpr NoOpTransformerT _no_op_transformer{}; std::size_t size_to_update = acc_nelems; { @@ -1142,15 +1144,15 @@ accumulate_strided_impl(sycl::queue &q, iter_shape_strides, iter_shape_strides + 2 * iter_nd}; - constexpr transformerT transformer{}; + static constexpr transformerT transformer{}; - constexpr std::size_t s0 = 0; - constexpr std::size_t s1 = 1; + static constexpr std::size_t s0 = 0; + static constexpr std::size_t s1 = 1; const sycl::device &dev = q.get_device(); sycl::event comp_ev; if (dev.has(sycl::aspect::cpu)) { - constexpr nwiT n_wi_for_cpu = 8; + static constexpr nwiT n_wi_for_cpu = 8; const std::uint32_t wg_size = 256; comp_ev = inclusive_scan_iter 64 const std::uint32_t wg_size = @@ -1198,18 +1200,18 @@ std::size_t cumsum_val_contig_impl(sycl::queue &q, cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr NoOpIndexerT flat_indexer{}; - constexpr transformerT transformer{}; + static constexpr NoOpIndexerT flat_indexer{}; + static constexpr transformerT transformer{}; - constexpr std::size_t s0 = 0; - constexpr std::size_t s1 = 1; - constexpr bool include_initial = false; + static constexpr std::size_t s0 = 0; + static constexpr std::size_t s1 = 1; + static constexpr bool include_initial = false; using AccumulateOpT = sycl::plus; sycl::event comp_ev; const sycl::device &dev = q.get_device(); if (dev.has(sycl::aspect::cpu)) { - constexpr nwiT n_wi_for_cpu = 8; + static constexpr nwiT n_wi_for_cpu = 8; const std::uint32_t wg_size = 256; comp_ev = inclusive_scan_iter_1d 64 const std::uint32_t wg_size = @@ -1313,17 +1315,17 @@ cumsum_val_strided_impl(sycl::queue &q, using StridedIndexerT = dpctl::tensor::offset_utils::StridedIndexer; const StridedIndexerT strided_indexer{nd, 0, shape_strides}; - constexpr transformerT transformer{}; + static constexpr transformerT transformer{}; - constexpr std::size_t s0 = 0; - constexpr std::size_t s1 = 1; - constexpr bool include_initial = false; + static constexpr std::size_t s0 = 0; + static constexpr std::size_t s1 = 1; + static constexpr bool include_initial = false; using AccumulateOpT = sycl::plus; const sycl::device &dev = q.get_device(); sycl::event comp_ev; if (dev.has(sycl::aspect::cpu)) { - constexpr nwiT n_wi_for_cpu = 8; + static constexpr nwiT n_wi_for_cpu = 8; const std::uint32_t wg_size = 256; comp_ev = inclusive_scan_iter_1d 64 const std::uint32_t wg_size = diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp 
b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp index 948a0229eb..01ba0b21c1 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -227,11 +227,11 @@ std::size_t _get_lws_impl(std::size_t n) } } -std::size_t get_lws(std::size_t n) +inline std::size_t get_lws(std::size_t n) { - constexpr std::size_t lws0 = 256u; - constexpr std::size_t lws1 = 128u; - constexpr std::size_t lws2 = 64u; + static constexpr std::size_t lws0 = 256u; + static constexpr std::size_t lws1 = 128u; + static constexpr std::size_t lws2 = 64u; return _get_lws_impl(n); } @@ -261,9 +261,9 @@ sycl::event masked_extract_all_slices_contig_impl( ssize_t dst_stride, const std::vector &depends = {}) { - constexpr TwoZeroOffsets_Indexer orthog_src_dst_indexer{}; + static constexpr TwoZeroOffsets_Indexer orthog_src_dst_indexer{}; - constexpr NoOpIndexer masked_src_indexer{}; + static constexpr NoOpIndexer masked_src_indexer{}; const Strided1DIndexer masked_dst_indexer(/* size */ dst_size, /* step */ dst_stride); @@ -339,7 +339,7 @@ sycl::event masked_extract_all_slices_strided_impl( ssize_t dst_stride, const std::vector &depends = {}) { - constexpr TwoZeroOffsets_Indexer orthog_src_dst_indexer{}; + static constexpr TwoZeroOffsets_Indexer orthog_src_dst_indexer{}; /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const * *_packed_shape_strides) */ @@ -578,7 +578,7 @@ sycl::event masked_place_all_slices_strided_impl( ssize_t rhs_stride, const std::vector &depends = {}) { - constexpr TwoZeroOffsets_Indexer orthog_dst_rhs_indexer{}; + static constexpr TwoZeroOffsets_Indexer orthog_dst_rhs_indexer{}; /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const * *_packed_shape_strides) */ @@ -589,7 +589,7 @@ sycl::event masked_place_all_slices_strided_impl( TwoZeroOffsets_Indexer, StridedIndexer, Strided1DCyclicIndexer, dataT, indT>; - constexpr std::size_t nominal_lws = 256; + static constexpr std::size_t nominal_lws = 256; const std::size_t masked_extent = iteration_size; const std::size_t lws = std::min(masked_extent, nominal_lws); @@ -685,7 +685,7 @@ sycl::event masked_place_some_slices_strided_impl( TwoOffsets_StridedIndexer, StridedIndexer, Strided1DCyclicIndexer, dataT, indT>; - constexpr std::size_t nominal_lws = 256; + static constexpr std::size_t nominal_lws = 256; const std::size_t orthog_extent = orthog_nelems; const std::size_t masked_extent = masked_nelems; const std::size_t lws = std::min(masked_extent, nominal_lws); @@ -788,7 +788,7 @@ sycl::event non_zero_indexes_impl(sycl::queue &exec_q, const indT1 *cumsum_data = reinterpret_cast(cumsum_cp); indT2 *indexes_data = reinterpret_cast(indexes_cp); - constexpr std::size_t nominal_lws = 256u; + static constexpr std::size_t nominal_lws = 256u; const std::size_t masked_extent = iter_size; const std::size_t lws = std::min(masked_extent, nominal_lws); diff --git a/dpctl/tensor/libtensor/include/kernels/clip.hpp b/dpctl/tensor/libtensor/include/kernels/clip.hpp index 20c50c23e7..815ebbcbc9 100644 --- a/dpctl/tensor/libtensor/include/kernels/clip.hpp +++ b/dpctl/tensor/libtensor/include/kernels/clip.hpp @@ -106,7 +106,7 @@ class ClipContigFunctor void operator()(sycl::nd_item<1> ndit) const { - constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz; + static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz; using dpctl::tensor::type_utils::is_complex; if constexpr (is_complex::value || !enable_sg_loadstore) { @@ -202,8 +202,8 @@ sycl::event 
clip_contig_impl(sycl::queue &q, cgh.depends_on(depends); std::size_t lws = 64; - constexpr std::uint8_t vec_sz = 4; - constexpr std::uint8_t n_vecs = 2; + static constexpr std::uint8_t vec_sz = 4; + static constexpr std::uint8_t n_vecs = 2; const std::size_t n_groups = ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); const auto gws_range = sycl::range<1>(n_groups * lws); @@ -214,7 +214,7 @@ sycl::event clip_contig_impl(sycl::queue &q, is_aligned(max_cp) && is_aligned(dst_cp)) { - constexpr bool enable_sg_loadstore = true; + static constexpr bool enable_sg_loadstore = true; using KernelName = clip_contig_kernel; using Impl = ClipContigFunctor; @@ -224,7 +224,7 @@ sycl::event clip_contig_impl(sycl::queue &q, Impl(nelems, x_tp, min_tp, max_tp, dst_tp)); } else { - constexpr bool disable_sg_loadstore = false; + static constexpr bool disable_sg_loadstore = false; using InnerKernelName = clip_contig_kernel; using KernelName = disabled_sg_loadstore_wrapper_krn; diff --git a/dpctl/tensor/libtensor/include/kernels/constructors.hpp b/dpctl/tensor/libtensor/include/kernels/constructors.hpp index c95c9ae114..e734f30497 100644 --- a/dpctl/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl/tensor/libtensor/include/kernels/constructors.hpp @@ -468,7 +468,7 @@ sycl::event tri_impl(sycl::queue &exec_q, const std::vector &depends, const std::vector &additional_depends) { - constexpr int d2 = 2; + static constexpr int d2 = 2; ssize_t src_s = nd; ssize_t dst_s = 2 * nd; ssize_t nd_1 = nd - 1; diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp index 1db06a914a..f19dcb7c8c 100644 --- a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp +++ b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp @@ -233,9 +233,9 @@ class ContigCopyFunctor void operator()(sycl::nd_item<1> ndit) const { - CastFnT fn{}; + static constexpr CastFnT fn{}; - constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; + static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; using dpctl::tensor::type_utils::is_complex; if constexpr (!enable_sg_loadstore || is_complex::value || @@ -338,8 +338,8 @@ sycl::event copy_and_cast_contig_impl(sycl::queue &q, dstTy *dst_tp = reinterpret_cast(dst_cp); std::size_t lws = 64; - constexpr std::uint32_t vec_sz = 4; - constexpr std::uint32_t n_vecs = 2; + static constexpr std::uint32_t vec_sz = 4; + static constexpr std::uint32_t n_vecs = 2; const std::size_t n_groups = ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); const auto gws_range = sycl::range<1>(n_groups * lws); @@ -348,7 +348,7 @@ sycl::event copy_and_cast_contig_impl(sycl::queue &q, if (is_aligned(src_cp) && is_aligned(dst_cp)) { - constexpr bool enable_sg_loadstore = true; + static constexpr bool enable_sg_loadstore = true; using KernelName = copy_cast_contig_kernel; @@ -359,7 +359,7 @@ sycl::event copy_and_cast_contig_impl(sycl::queue &q, dst_tp)); } else { - constexpr bool disable_sg_loadstore = false; + static constexpr bool disable_sg_loadstore = false; using InnerKernelName = copy_cast_contig_kernel; using KernelName = @@ -724,9 +724,10 @@ void copy_and_cast_from_host_contig_impl( sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only); using IndexerT = TwoOffsets_CombinedIndexer; - constexpr NoOpIndexer src_indexer{}; - constexpr NoOpIndexer dst_indexer{}; - constexpr TwoOffsets_CombinedIndexer indexer{src_indexer, dst_indexer}; + static constexpr NoOpIndexer src_indexer{}; + static constexpr NoOpIndexer 
dst_indexer{}; + static constexpr TwoOffsets_CombinedIndexer indexer{src_indexer, + dst_indexer}; dstTy *dst_tp = reinterpret_cast(dst_p) + dst_offset; @@ -1124,12 +1125,12 @@ sycl::event copy_for_roll_contig_impl(sycl::queue &q, sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - constexpr NoOpIndexer src_indexer{}; + static constexpr NoOpIndexer src_indexer{}; const LeftRolled1DTransformer roller{shift, nelems}; const CompositionIndexer left_rolled_src_indexer{src_indexer, roller}; - constexpr NoOpIndexer dst_indexer{}; + static constexpr NoOpIndexer dst_indexer{}; using KernelName = copy_for_roll_contig_kernel; diff --git a/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp b/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp index f3182707ad..a27a8c78d3 100644 --- a/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp +++ b/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp @@ -74,7 +74,7 @@ class CopyAsCContigFunctor static_assert(vec_sz > 0); static_assert(n_vecs > 0); - constexpr std::uint8_t elems_per_wi = vec_sz * n_vecs; + static constexpr std::uint8_t elems_per_wi = vec_sz * n_vecs; using dpctl::tensor::type_utils::is_complex; if constexpr (!enable_sg_loadstore || is_complex::value) { @@ -151,7 +151,7 @@ sycl::event submit_c_contiguous_copy(sycl::queue &exec_q, static_assert(vec_sz > 0); static_assert(n_vecs > 0); - constexpr std::size_t preferred_lws = 256; + static constexpr std::size_t preferred_lws = 256; const auto &kernel_id = sycl::get_kernel_id(); @@ -168,7 +168,7 @@ sycl::event submit_c_contiguous_copy(sycl::queue &exec_q, const std::size_t lws = ((preferred_lws + max_sg_size - 1) / max_sg_size) * max_sg_size; - constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz; + static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz; const std::size_t nelems_per_group = nelems_per_wi * lws; const std::size_t n_groups = @@ -214,8 +214,8 @@ as_c_contiguous_array_generic_impl(sycl::queue &exec_q, using IndexerT = dpctl::tensor::offset_utils::StridedIndexer; const IndexerT src_indexer(nd, ssize_t(0), shape_and_strides); - constexpr std::uint8_t vec_sz = 4u; - constexpr std::uint8_t n_vecs = 2u; + static constexpr std::uint8_t vec_sz = 4u; + static constexpr std::uint8_t n_vecs = 2u; using dpctl::tensor::kernels::alignment_utils:: disabled_sg_loadstore_wrapper_krn; @@ -224,7 +224,7 @@ as_c_contiguous_array_generic_impl(sycl::queue &exec_q, sycl::event copy_ev; if (is_aligned(dst_p)) { - constexpr bool enable_sg_load = true; + static constexpr bool enable_sg_load = true; using KernelName = as_contig_krn; copy_ev = submit_c_contiguous_copy; using KernelName = disabled_sg_loadstore_wrapper_krn; @@ -287,19 +287,19 @@ sycl::event as_c_contiguous_batch_of_square_matrices_impl( const T *src_tp = reinterpret_cast(src_p); T *dst_tp = reinterpret_cast(dst_p); - constexpr std::uint16_t private_tile_size = 4; - constexpr std::uint16_t n_lines = 2; - constexpr std::uint16_t block_size = + static constexpr std::uint16_t private_tile_size = 4; + static constexpr std::uint16_t n_lines = 2; + static constexpr std::uint16_t block_size = n_lines * private_tile_size * private_tile_size; - constexpr std::uint16_t lws0 = block_size; - constexpr std::uint16_t lws1 = n_lines; - constexpr std::uint16_t nelems_per_wi = (block_size / lws1); + static constexpr std::uint16_t lws0 = block_size; + static constexpr std::uint16_t lws1 = n_lines; + static constexpr std::uint16_t nelems_per_wi = (block_size / lws1); 
static_assert(nelems_per_wi * lws1 == block_size); static_assert(nelems_per_wi == private_tile_size * private_tile_size); - constexpr std::uint32_t lws = lws0 * lws1; + static constexpr std::uint32_t lws = lws0 * lws1; const std::size_t n_tiles = (n + block_size - 1) / block_size; @@ -384,7 +384,7 @@ sycl::event as_c_contiguous_batch_of_square_matrices_impl( // 0 <= lid_lin < lws0 * lws1 == // (block_size * block_size / nelems_per_wi) == // (block_size/private_tile_size)**2 - constexpr std::uint16_t n_private_tiles_per_axis = + static constexpr std::uint16_t n_private_tiles_per_axis = block_size / private_tile_size; const std::uint16_t local_tile_id0 = lid_lin / n_private_tiles_per_axis; @@ -594,7 +594,7 @@ sycl::event as_c_contiguous_nd_batch_of_square_matrices_impl( using dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer; using BatchIndexerT = TwoOffsets_CombinedIndexer; - constexpr ssize_t zero_offset{0}; + static constexpr ssize_t zero_offset{0}; const SrcIndexerT src_batch_indexer{batch_nd, zero_offset, src_batch_shape_strides}; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp index 48c8e3e4dd..d600175cf6 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp @@ -154,8 +154,8 @@ sycl::event abs_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using AbsHS = hyperparam_detail::AbsContigHyperparameterSet; - constexpr std::uint8_t vec_sz = AbsHS::vec_sz; - constexpr std::uint8_t n_vec = AbsHS::n_vecs; + static constexpr std::uint8_t vec_sz = AbsHS::vec_sz; + static constexpr std::uint8_t n_vec = AbsHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, AbsOutputType, AbsContigFunctor, abs_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp index 7dbfb6618c..c59a283ded 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp @@ -71,7 +71,8 @@ template struct AcosFunctor if constexpr (is_complex::value) { using realT = typename argT::value_type; - constexpr realT q_nan = std::numeric_limits::quiet_NaN(); + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); const realT x = std::real(in); const realT y = std::imag(in); @@ -103,7 +104,7 @@ template struct AcosFunctor /* * For large x or y including acos(+-Inf + I*+-Inf) */ - constexpr realT r_eps = + static constexpr realT r_eps = realT(1) / std::numeric_limits::epsilon(); if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) { using sycl_complexT = exprm_ns::complex; @@ -188,8 +189,8 @@ sycl::event acos_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using AcosHS = hyperparam_detail::AcosContigHyperparameterSet; - constexpr std::uint8_t vec_sz = AcosHS::vec_sz; - constexpr std::uint8_t n_vec = AcosHS::n_vecs; + static constexpr std::uint8_t vec_sz = AcosHS::vec_sz; + static constexpr std::uint8_t n_vec = AcosHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, AcosOutputType, AcosContigFunctor, acos_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp index a81ff3da99..12c20525f5 100644 --- 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp @@ -71,7 +71,8 @@ template struct AcoshFunctor if constexpr (is_complex::value) { using realT = typename argT::value_type; - constexpr realT q_nan = std::numeric_limits::quiet_NaN(); + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); /* * acosh(in) = I*acos(in) or -I*acos(in) * where the sign is chosen so Re(acosh(in)) >= 0. @@ -92,7 +93,8 @@ template struct AcoshFunctor } else if (std::isnan(y)) { /* acos(+-Inf + I*NaN) = NaN + I*opt(-)Inf */ - constexpr realT inf = std::numeric_limits::infinity(); + static constexpr realT inf = + std::numeric_limits::infinity(); if (std::isinf(x)) { acos_in = resT{q_nan, -inf}; @@ -107,7 +109,7 @@ template struct AcoshFunctor } } - constexpr realT r_eps = + static constexpr realT r_eps = realT(1) / std::numeric_limits::epsilon(); /* * For large x or y including acos(+-Inf + I*+-Inf) @@ -216,8 +218,8 @@ sycl::event acosh_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using AcoshHS = hyperparam_detail::AcoshContigHyperparameterSet; - constexpr std::uint8_t vec_sz = AcoshHS::vec_sz; - constexpr std::uint8_t n_vec = AcoshHS::n_vecs; + static constexpr std::uint8_t vec_sz = AcoshHS::vec_sz; + static constexpr std::uint8_t n_vec = AcoshHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, AcoshOutputType, AcoshContigFunctor, acosh_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp index 476e7b52b9..d65c06500c 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp @@ -273,8 +273,8 @@ sycl::event add_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using AddHS = hyperparam_detail::AddContigHyperparameterSet; - constexpr auto vec_sz = AddHS::vec_sz; - constexpr auto n_vecs = AddHS::n_vecs; + static constexpr auto vec_sz = AddHS::vec_sz; + static constexpr auto n_vecs = AddHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, AddOutputType, AddContigFunctor, add_contig_kernel, @@ -551,9 +551,9 @@ add_inplace_contig_impl(sycl::queue &exec_q, ssize_t res_offset, const std::vector &depends = {}) { - constexpr auto vec_sz = + static constexpr auto vec_sz = hyperparam_detail::AddContigHyperparameterSet::vec_sz; - constexpr auto n_vecs = + static constexpr auto n_vecs = hyperparam_detail::AddContigHyperparameterSet::n_vecs; return elementwise_common::binary_inplace_contig_impl< diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp index 726f90ba81..9af8970376 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp @@ -115,8 +115,8 @@ template struct AngleContigHyperparameterSet using value_type = typename std::disjunction>; - constexpr static auto vec_sz = value_type::vec_sz; - constexpr static auto n_vecs = value_type::n_vecs; + static constexpr auto vec_sz = value_type::vec_sz; + static constexpr auto n_vecs = value_type::n_vecs; }; } // end of namespace hyperparam_detail @@ -132,8 +132,8 @@ sycl::event angle_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using AngleHS =
hyperparam_detail::AngleContigHyperparameterSet; - constexpr std::uint8_t vec_sz = AngleHS::vec_sz; - constexpr std::uint8_t n_vec = AngleHS::n_vecs; + static constexpr std::uint8_t vec_sz = AngleHS::vec_sz; + static constexpr std::uint8_t n_vec = AngleHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, AngleOutputType, AngleContigFunctor, angle_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp index 70b48895b4..3ba00c4198 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp @@ -71,7 +71,8 @@ template struct AsinFunctor if constexpr (is_complex::value) { using realT = typename argT::value_type; - constexpr realT q_nan = std::numeric_limits::quiet_NaN(); + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); /* * asin(in) = I * conj( asinh(I * conj(in)) ) @@ -117,7 +118,7 @@ template struct AsinFunctor * because Im(asinh(in)) = sign(x)*atan2(sign(x)*y, fabs(x)) + * O(y/in^3) as in -> infinity, uniformly in y */ - constexpr realT r_eps = + static constexpr realT r_eps = realT(1) / std::numeric_limits::epsilon(); if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) { using sycl_complexT = exprm_ns::complex; @@ -209,8 +210,8 @@ sycl::event asin_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using AddHS = hyperparam_detail::AsinContigHyperparameterSet; - constexpr std::uint8_t vec_sz = AddHS::vec_sz; - constexpr std::uint8_t n_vec = AddHS::n_vecs; + static constexpr std::uint8_t vec_sz = AddHS::vec_sz; + static constexpr std::uint8_t n_vec = AddHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, AsinOutputType, AsinContigFunctor, asin_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp index 420ba3246c..7441e51ae7 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp @@ -71,7 +71,8 @@ template struct AsinhFunctor if constexpr (is_complex::value) { using realT = typename argT::value_type; - constexpr realT q_nan = std::numeric_limits::quiet_NaN(); + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); const realT x = std::real(in); const realT y = std::imag(in); @@ -105,7 +106,7 @@ template struct AsinhFunctor * because Im(asinh(in)) = sign(x)*atan2(sign(x)*y, fabs(x)) + * O(y/in^3) as in -> infinity, uniformly in y */ - constexpr realT r_eps = + static constexpr realT r_eps = realT(1) / std::numeric_limits::epsilon(); if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) { @@ -192,8 +193,8 @@ sycl::event asinh_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using AsinhHS = hyperparam_detail::AsinhContigHyperparameterSet; - constexpr std::uint8_t vec_sz = AsinhHS::vec_sz; - constexpr std::uint8_t n_vec = AsinhHS::n_vecs; + static constexpr std::uint8_t vec_sz = AsinhHS::vec_sz; + static constexpr std::uint8_t n_vec = AsinhHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, AsinhOutputType, AsinhContigFunctor, asinh_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp index 29c4941d76..87fad500ab 100644 --- 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp @@ -75,7 +75,8 @@ template struct AtanFunctor if constexpr (is_complex::value) { using realT = typename argT::value_type; - constexpr realT q_nan = std::numeric_limits::quiet_NaN(); + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); /* * atan(in) = I * conj( atanh(I * conj(in)) ) * so we first calculate w = atanh(I * conj(in)) with @@ -122,7 +123,7 @@ template struct AtanFunctor * The sign of pi/2 depends on the sign of imaginary part of the * input. */ - constexpr realT r_eps = + static constexpr realT r_eps = realT(1) / std::numeric_limits::epsilon(); if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) { const realT pi_half = sycl::atan(realT(1)) * 2; @@ -202,8 +203,8 @@ sycl::event atan_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using AtanHS = hyperparam_detail::AtanContigHyperparameterSet; - constexpr std::uint8_t vec_sz = AtanHS::vec_sz; - constexpr std::uint8_t n_vec = AtanHS::n_vecs; + static constexpr std::uint8_t vec_sz = AtanHS::vec_sz; + static constexpr std::uint8_t n_vec = AtanHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, AtanOutputType, AtanContigFunctor, atan_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp index 32384fc8a9..7e5dbe0bb5 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp @@ -145,8 +145,8 @@ sycl::event atan2_contig_impl(sycl::queue &exec_q, { using Atan2HS = hyperparam_detail::Atan2ContigHyperparameterSet; - constexpr std::uint8_t vec_sz = Atan2HS::vec_sz; - constexpr std::uint8_t n_vecs = Atan2HS::n_vecs; + static constexpr std::uint8_t vec_sz = Atan2HS::vec_sz; + static constexpr std::uint8_t n_vecs = Atan2HS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, Atan2OutputType, Atan2ContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp index 39f11e0f90..4148c1545a 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp @@ -71,7 +71,8 @@ template struct AtanhFunctor { if constexpr (is_complex::value) { using realT = typename argT::value_type; - constexpr realT q_nan = std::numeric_limits::quiet_NaN(); + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); const realT x = std::real(in); const realT y = std::imag(in); @@ -193,8 +194,8 @@ sycl::event atanh_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using AtanhHS = hyperparam_detail::AtanhContigHyperparameterSet; - constexpr std::uint8_t vec_sz = AtanhHS::vec_sz; - constexpr std::uint8_t n_vec = AtanhHS::n_vecs; + static constexpr std::uint8_t vec_sz = AtanhHS::vec_sz; + static constexpr std::uint8_t n_vec = AtanhHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, AtanhOutputType, AtanhContigFunctor, atanh_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp index cdea67f080..9c164ea5a2 100644 --- 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp @@ -203,8 +203,8 @@ bitwise_and_contig_impl(sycl::queue &exec_q, { using BitwiseAndHS = hyperparam_detail::BitwiseAndContigHyperparameterSet; - constexpr std::uint8_t vec_sz = BitwiseAndHS::vec_sz; - constexpr std::uint8_t n_vec = BitwiseAndHS::n_vecs; + static constexpr std::uint8_t vec_sz = BitwiseAndHS::vec_sz; + static constexpr std::uint8_t n_vec = BitwiseAndHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, BitwiseAndOutputType, BitwiseAndContigFunctor, @@ -391,8 +391,8 @@ bitwise_and_inplace_contig_impl(sycl::queue &exec_q, { using BitwiseAndHS = hyperparam_detail::BitwiseAndContigHyperparameterSet; - constexpr std::uint8_t vec_sz = BitwiseAndHS::vec_sz; - constexpr std::uint8_t n_vecs = BitwiseAndHS::n_vecs; + static constexpr std::uint8_t vec_sz = BitwiseAndHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseAndHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, BitwiseAndInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp index d96bf35afe..c292373575 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp @@ -150,8 +150,8 @@ bitwise_invert_contig_impl(sycl::queue &exec_q, { using BitwiseInvertHS = hyperparam_detail::BitwiseInvertContigHyperparameterSet; - constexpr std::uint8_t vec_sz = BitwiseInvertHS::vec_sz; - constexpr std::uint8_t n_vec = BitwiseInvertHS::n_vecs; + static constexpr std::uint8_t vec_sz = BitwiseInvertHS::vec_sz; + static constexpr std::uint8_t n_vec = BitwiseInvertHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, BitwiseInvertOutputType, BitwiseInvertContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp index 3ff2d5e315..6511895537 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp @@ -84,8 +84,9 @@ struct BitwiseLeftShiftFunctor private: resT impl(const argT1 &in1, const argT2 &in2) const { - constexpr argT2 in1_bitsize = static_cast(sizeof(argT1) * 8); - constexpr resT zero = resT(0); + static constexpr argT2 in1_bitsize = + static_cast(sizeof(argT1) * 8); + static constexpr resT zero = resT(0); // bitshift op with second operand negative, or >= bitwidth(argT1) is UB // array API spec mandates 0 @@ -214,8 +215,8 @@ bitwise_left_shift_contig_impl(sycl::queue &exec_q, using BitwiseLSHS = hyperparam_detail::BitwiseLeftShiftContigHyperparameterSet; - constexpr std::uint8_t vec_sz = BitwiseLSHS::vec_sz; - constexpr std::uint8_t n_vecs = BitwiseLSHS::n_vecs; + static constexpr std::uint8_t vec_sz = BitwiseLSHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseLSHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, BitwiseLeftShiftOutputType, @@ -316,8 +317,8 @@ template struct BitwiseLeftShiftInplaceFunctor private: void impl(resT &res, const argT &in) const { - constexpr argT res_bitsize = static_cast(sizeof(resT) * 8); - constexpr resT zero = resT(0); + static 
constexpr argT res_bitsize = static_cast(sizeof(resT) * 8); + static constexpr resT zero = resT(0); // bitshift op with second operand negative, or >= bitwidth(argT1) is UB // array API spec mandates 0 @@ -407,8 +408,8 @@ sycl::event bitwise_left_shift_inplace_contig_impl( using BitwiseLSHS = hyperparam_detail::BitwiseLeftShiftContigHyperparameterSet; - constexpr std::uint8_t vec_sz = BitwiseLSHS::vec_sz; - constexpr std::uint8_t n_vecs = BitwiseLSHS::n_vecs; + static constexpr std::uint8_t vec_sz = BitwiseLSHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseLSHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, BitwiseLeftShiftInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp index f1cbb63afc..4e20b75138 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp @@ -202,8 +202,8 @@ sycl::event bitwise_or_contig_impl(sycl::queue &exec_q, { using BitwiseOrHS = hyperparam_detail::BitwiseOrContigHyperparameterSet; - constexpr std::uint8_t vec_sz = BitwiseOrHS::vec_sz; - constexpr std::uint8_t n_vecs = BitwiseOrHS::n_vecs; + static constexpr std::uint8_t vec_sz = BitwiseOrHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseOrHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, BitwiseOrOutputType, BitwiseOrContigFunctor, @@ -387,8 +387,8 @@ bitwise_or_inplace_contig_impl(sycl::queue &exec_q, using BitwiseOrHS = hyperparam_detail::BitwiseOrContigHyperparameterSet; - constexpr std::uint8_t vec_sz = BitwiseOrHS::vec_sz; - constexpr std::uint8_t n_vecs = BitwiseOrHS::n_vecs; + static constexpr std::uint8_t vec_sz = BitwiseOrHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseOrHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, BitwiseOrInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp index 580ee7d828..047a33cea9 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp @@ -83,8 +83,9 @@ struct BitwiseRightShiftFunctor private: resT impl(const argT1 &in1, const argT2 &in2) const { - constexpr argT2 in1_bitsize = static_cast(sizeof(argT1) * 8); - constexpr resT zero = resT(0); + static constexpr argT2 in1_bitsize = + static_cast(sizeof(argT1) * 8); + static constexpr resT zero = resT(0); // bitshift op with second operand negative, or >= bitwidth(argT1) is UB // array API spec mandates 0 @@ -318,8 +319,8 @@ template struct BitwiseRightShiftInplaceFunctor private: void impl(resT &res, const argT &in) const { - constexpr argT res_bitsize = static_cast(sizeof(resT) * 8); - constexpr resT zero = resT(0); + static constexpr argT res_bitsize = static_cast(sizeof(resT) * 8); + static constexpr resT zero = resT(0); // bitshift op with second operand negative, or >= bitwidth(argT1) is UB // array API spec mandates 0 @@ -413,8 +414,8 @@ sycl::event bitwise_right_shift_inplace_contig_impl( argTy>; // res = OP(res, arg) - constexpr std::uint8_t vec_sz = BitwiseRSHS::vec_sz; - constexpr std::uint8_t n_vecs = BitwiseRSHS::n_vecs; + static constexpr std::uint8_t vec_sz = 
BitwiseRSHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseRSHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, BitwiseRightShiftInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp index 84d1c8ff64..d14a12a248 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp @@ -204,8 +204,8 @@ bitwise_xor_contig_impl(sycl::queue &exec_q, { using BitwiseXorHS = hyperparam_detail::BitwiseXorContigHyperparameterSet; - constexpr std::uint8_t vec_sz = BitwiseXorHS::vec_sz; - constexpr std::uint8_t n_vecs = BitwiseXorHS::n_vecs; + static constexpr std::uint8_t vec_sz = BitwiseXorHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseXorHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, BitwiseXorOutputType, BitwiseXorContigFunctor, @@ -393,8 +393,8 @@ bitwise_xor_inplace_contig_impl(sycl::queue &exec_q, using BitwiseXorHS = hyperparam_detail::BitwiseXorContigHyperparameterSet; - constexpr std::uint8_t vec_sz = BitwiseXorHS::vec_sz; - constexpr std::uint8_t n_vecs = BitwiseXorHS::n_vecs; + static constexpr std::uint8_t vec_sz = BitwiseXorHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseXorHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, BitwiseXorInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp index afa83a64cb..4cf32725b1 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp @@ -54,8 +54,8 @@ template realT cabs(std::complex const &z) const realT x = std::real(z); const realT y = std::imag(z); - constexpr realT q_nan = std::numeric_limits::quiet_NaN(); - constexpr realT p_inf = std::numeric_limits::infinity(); + static constexpr realT q_nan = std::numeric_limits::quiet_NaN(); + static constexpr realT p_inf = std::numeric_limits::infinity(); const realT res = std::isinf(x) diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp index d857e1563c..a26c0b6875 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp @@ -124,8 +124,8 @@ sycl::event cbrt_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using CbrtHS = hyperparam_detail::CbrtContigHyperparameterSet; - constexpr std::uint8_t vec_sz = CbrtHS::vec_sz; - constexpr std::uint8_t n_vecs = CbrtHS::n_vecs; + static constexpr std::uint8_t vec_sz = CbrtHS::vec_sz; + static constexpr std::uint8_t n_vecs = CbrtHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, CbrtOutputType, CbrtContigFunctor, cbrt_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp index c587ba6767..eb9d576db1 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp @@ -146,8 +146,8 @@ sycl::event ceil_contig_impl(sycl::queue 
&exec_q, const std::vector &depends = {}) { using CeilHS = hyperparam_detail::CeilContigHyperparameterSet; - constexpr std::uint8_t vec_sz = CeilHS::vec_sz; - constexpr std::uint8_t n_vecs = CeilHS::n_vecs; + static constexpr std::uint8_t vec_sz = CeilHS::vec_sz; + static constexpr std::uint8_t n_vecs = CeilHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, CeilOutputType, CeilContigFunctor, ceil_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp index 961a373890..1836b61d0e 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp @@ -77,7 +77,7 @@ struct UnaryContigFunctor void operator()(sycl::nd_item<1> ndit) const { - constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; + static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; UnaryOperatorT op{}; /* Each work-item processes vec_sz elements, contiguous in memory */ /* NOTE: work-group size must be divisible by sub-group size */ @@ -94,7 +94,7 @@ struct UnaryContigFunctor elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + sg.get_group_id()[0] * sgSize); if (base + elems_per_wi * sgSize < nelems_) { - constexpr sycl::vec res_vec(const_val); + static constexpr sycl::vec res_vec(const_val); #pragma unroll for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { const std::size_t offset = base + it * sgSize; @@ -275,7 +275,7 @@ SizeT select_lws(const sycl::device &, SizeT n_work_items_needed) // TODO: make the decision based on device descriptors // constexpr SizeT few_threshold = (SizeT(1) << 17); - constexpr SizeT med_threshold = (SizeT(1) << 21); + static constexpr SizeT med_threshold = (SizeT(1) << 21); const SizeT lws = (n_work_items_needed <= med_threshold ? 
SizeT(128) : SizeT(256)); @@ -302,7 +302,7 @@ sycl::event unary_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; + static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; const std::size_t n_work_items_needed = nelems / elems_per_wi; const std::size_t lws = select_lws(exec_q.get_device(), n_work_items_needed); @@ -324,7 +324,7 @@ sycl::event unary_contig_impl(sycl::queue &exec_q, if (is_aligned(arg_p) && is_aligned(res_p)) { - constexpr bool enable_sg_loadstore = true; + static constexpr bool enable_sg_loadstore = true; using KernelName = BaseKernelName; using Impl = ContigFunctorT; @@ -334,7 +334,7 @@ sycl::event unary_contig_impl(sycl::queue &exec_q, Impl(arg_tp, res_tp, nelems)); } else { - constexpr bool disable_sg_loadstore = false; + static constexpr bool disable_sg_loadstore = false; using KernelName = disabled_sg_loadstore_wrapper_krn; using Impl = ContigFunctorT ndit) const { - constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; + static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; BinaryOperatorT op{}; /* Each work-item processes vec_sz elements, contiguous in memory */ /* NOTE: work-group size must be divisible by sub-group size */ @@ -817,7 +817,7 @@ sycl::event binary_contig_impl(sycl::queue &exec_q, is_aligned(arg2_tp) && is_aligned(res_tp)) { - constexpr bool enable_sg_loadstore = true; + static constexpr bool enable_sg_loadstore = true; using KernelName = BaseKernelName; using Impl = BinaryContigFunctorT; @@ -827,7 +827,7 @@ sycl::event binary_contig_impl(sycl::queue &exec_q, Impl(arg1_tp, arg2_tp, res_tp, nelems)); } else { - constexpr bool disable_sg_loadstore = false; + static constexpr bool disable_sg_loadstore = false; using KernelName = disabled_sg_loadstore_wrapper_krn; using Impl = BinaryContigFunctorT ndit) const { BinaryInplaceOperatorT op{}; - constexpr std::uint8_t elems_per_wi = vec_sz * n_vecs; + static constexpr std::uint8_t elems_per_wi = vec_sz * n_vecs; /* Each work-item processes vec_sz elements, contiguous in memory */ /* NB: Workgroup size must be divisible by sub-group size */ @@ -337,7 +337,7 @@ binary_inplace_contig_impl(sycl::queue &exec_q, if (is_aligned(arg_tp) && is_aligned(res_tp)) { - constexpr bool enable_sg_loadstore = true; + static constexpr bool enable_sg_loadstore = true; using KernelName = kernel_name; using Impl = BinaryInplaceContigFunctorT; using KernelName = disabled_sg_loadstore_wrapper_krn; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp index 19a95df5a1..6bf9658fc6 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp @@ -152,8 +152,8 @@ sycl::event conj_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using ConjHS = hyperparam_detail::ConjContigHyperparameterSet; - constexpr std::uint8_t vec_sz = ConjHS::vec_sz; - constexpr std::uint8_t n_vecs = ConjHS::n_vecs; + static constexpr std::uint8_t vec_sz = ConjHS::vec_sz; + static constexpr std::uint8_t n_vecs = ConjHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, ConjOutputType, ConjContigFunctor, conj_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp index ecfbf99c28..5762053821 100644 --- 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp @@ -160,8 +160,8 @@ sycl::event copysign_contig_impl(sycl::queue &exec_q, { using CopySignHS = hyperparam_detail::CopysignContigHyperparameterSet; - constexpr std::uint8_t vec_sz = CopySignHS::vec_sz; - constexpr std::uint8_t n_vecs = CopySignHS::n_vecs; + static constexpr std::uint8_t vec_sz = CopySignHS::vec_sz; + static constexpr std::uint8_t n_vecs = CopySignHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, CopysignOutputType, CopysignContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp index 5940315c62..81608346a7 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp @@ -71,7 +71,8 @@ template <typename argT, typename resT> struct CosFunctor if constexpr (is_complex<argT>::value) { using realT = typename argT::value_type; - constexpr realT q_nan = std::numeric_limits<realT>::quiet_NaN(); + static constexpr realT q_nan = + std::numeric_limits<realT>::quiet_NaN(); realT const &in_re = std::real(in); realT const &in_im = std::imag(in); @@ -225,8 +226,8 @@ sycl::event cos_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using CosHS = hyperparam_detail::CosContigHyperparameterSet; - constexpr std::uint8_t vec_sz = CosHS::vec_sz; - constexpr std::uint8_t n_vecs = CosHS::n_vecs; + static constexpr std::uint8_t vec_sz = CosHS::vec_sz; + static constexpr std::uint8_t n_vecs = CosHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, CosOutputType, CosContigFunctor, cos_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp index 59468428d1..be3e76cb2b 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp @@ -71,7 +71,8 @@ template <typename argT, typename resT> struct CoshFunctor if constexpr (is_complex<argT>::value) { using realT = typename argT::value_type; - constexpr realT q_nan = std::numeric_limits<realT>::quiet_NaN(); + static constexpr realT q_nan = + std::numeric_limits<realT>::quiet_NaN(); const realT x = std::real(in); const realT y = std::imag(in); @@ -214,8 +215,8 @@ sycl::event cosh_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using CoshHS = hyperparam_detail::CoshContigHyperparameterSet; - constexpr std::uint8_t vec_sz = CoshHS::vec_sz; - constexpr std::uint8_t n_vecs = CoshHS::n_vecs; + static constexpr std::uint8_t vec_sz = CoshHS::vec_sz; + static constexpr std::uint8_t n_vecs = CoshHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, CoshOutputType, CoshContigFunctor, cosh_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp index a53f6412de..682e9f397e 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp @@ -231,8 +231,8 @@ sycl::event equal_contig_impl(sycl::queue &exec_q, { using EqualHS = hyperparam_detail::EqualContigHyperparameterSet; - constexpr std::uint8_t vec_sz = EqualHS::vec_sz; - constexpr std::uint8_t n_vecs = EqualHS::n_vecs; + static constexpr 
std::uint8_t vec_sz = EqualHS::vec_sz; + static constexpr std::uint8_t n_vecs = EqualHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, EqualOutputType, EqualContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp index 00f8213251..770229f804 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp @@ -70,7 +70,8 @@ template <typename argT, typename resT> struct ExpFunctor if constexpr (is_complex<argT>::value) { using realT = typename argT::value_type; - constexpr realT q_nan = std::numeric_limits<realT>::quiet_NaN(); + static constexpr realT q_nan = + std::numeric_limits<realT>::quiet_NaN(); const realT x = std::real(in); const realT y = std::imag(in); @@ -183,8 +184,8 @@ sycl::event exp_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using ExpHS = hyperparam_detail::ExpContigHyperparameterSet; - constexpr std::uint8_t vec_sz = ExpHS::vec_sz; - constexpr std::uint8_t n_vecs = ExpHS::n_vecs; + static constexpr std::uint8_t vec_sz = ExpHS::vec_sz; + static constexpr std::uint8_t n_vecs = ExpHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, ExpOutputType, ExpContigFunctor, exp_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp index 22291101ca..44cef5fc9e 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp @@ -73,7 +73,8 @@ template <typename argT, typename resT> struct Exp2Functor const argT tmp = in * sycl::log(realT(2)); - constexpr realT q_nan = std::numeric_limits<realT>::quiet_NaN(); + static constexpr realT q_nan = + std::numeric_limits<realT>::quiet_NaN(); const realT x = std::real(tmp); const realT y = std::imag(tmp); @@ -186,8 +187,8 @@ sycl::event exp2_contig_impl(sycl::queue &exec_q, { using Exp2HS = hyperparam_detail::Exp2ContigHyperparameterSet; - constexpr std::uint8_t vec_sz = Exp2HS::vec_sz; - constexpr std::uint8_t n_vecs = Exp2HS::n_vecs; + static constexpr std::uint8_t vec_sz = Exp2HS::vec_sz; + static constexpr std::uint8_t n_vecs = Exp2HS::n_vecs; return elementwise_common::unary_contig_impl< argTy, Exp2OutputType, Exp2ContigFunctor, exp2_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp index d1d64f4904..de0bf515b4 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp @@ -198,8 +198,8 @@ sycl::event expm1_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using Expm1HS = hyperparam_detail::Expm1ContigHyperparameterSet; - constexpr std::uint8_t vec_sz = Expm1HS::vec_sz; - constexpr std::uint8_t n_vecs = Expm1HS::n_vecs; + static constexpr std::uint8_t vec_sz = Expm1HS::vec_sz; + static constexpr std::uint8_t n_vecs = Expm1HS::n_vecs; return elementwise_common::unary_contig_impl< argTy, Expm1OutputType, Expm1ContigFunctor, expm1_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp index 5bc6b888ef..b80c1f3b54 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp +++ 
b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp @@ -146,8 +146,8 @@ sycl::event floor_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using FloorHS = hyperparam_detail::FloorContigHyperparameterSet; - constexpr std::uint8_t vec_sz = FloorHS::vec_sz; - constexpr std::uint8_t n_vecs = FloorHS::n_vecs; + static constexpr std::uint8_t vec_sz = FloorHS::vec_sz; + static constexpr std::uint8_t n_vecs = FloorHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, FloorOutputType, FloorContigFunctor, floor_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp index d290aa66a6..21b9304e53 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp @@ -245,8 +245,8 @@ floor_divide_contig_impl(sycl::queue &exec_q, { using FloorDivideHS = hyperparam_detail::FloorDivideContigHyperparameterSet; - constexpr std::uint8_t vec_sz = FloorDivideHS::vec_sz; - constexpr std::uint8_t n_vecs = FloorDivideHS::n_vecs; + static constexpr std::uint8_t vec_sz = FloorDivideHS::vec_sz; + static constexpr std::uint8_t n_vecs = FloorDivideHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, FloorDivideOutputType, FloorDivideContigFunctor, @@ -472,8 +472,8 @@ floor_divide_inplace_contig_impl(sycl::queue &exec_q, using FloorDivideHS = hyperparam_detail::FloorDivideContigHyperparameterSet; - constexpr std::uint8_t vec_sz = FloorDivideHS::vec_sz; - constexpr std::uint8_t n_vecs = FloorDivideHS::n_vecs; + static constexpr std::uint8_t vec_sz = FloorDivideHS::vec_sz; + static constexpr std::uint8_t n_vecs = FloorDivideHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, FloorDivideInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp index 4d0b7fb94f..bdde27e175 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp @@ -234,8 +234,8 @@ sycl::event greater_contig_impl(sycl::queue &exec_q, using GreaterHS = hyperparam_detail::GreaterContigHyperparameterSet; - constexpr std::uint8_t vec_sz = GreaterHS::vec_sz; - constexpr std::uint8_t n_vecs = GreaterHS::n_vecs; + static constexpr std::uint8_t vec_sz = GreaterHS::vec_sz; + static constexpr std::uint8_t n_vecs = GreaterHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, GreaterOutputType, GreaterContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp index b149158ee0..e5599b5a14 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp @@ -235,8 +235,8 @@ greater_equal_contig_impl(sycl::queue &exec_q, { using GreaterEqHS = hyperparam_detail::GreaterEqualContigHyperparameterSet; - constexpr std::uint8_t vec_sz = GreaterEqHS::vec_sz; - constexpr std::uint8_t n_vecs = GreaterEqHS::n_vecs; + static constexpr std::uint8_t vec_sz = GreaterEqHS::vec_sz; + static constexpr std::uint8_t n_vecs = GreaterEqHS::n_vecs; return 
elementwise_common::binary_contig_impl< argTy1, argTy2, GreaterEqualOutputType, GreaterEqualContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp index faa92c2f2d..12b67481f0 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp @@ -161,8 +161,8 @@ sycl::event hypot_contig_impl(sycl::queue &exec_q, { using HypotHS = hyperparam_detail::HypotContigHyperparameterSet; - constexpr std::uint8_t vec_sz = HypotHS::vec_sz; - constexpr std::uint8_t n_vecs = HypotHS::n_vecs; + static constexpr std::uint8_t vec_sz = HypotHS::vec_sz; + static constexpr std::uint8_t n_vecs = HypotHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, HypotOutputType, HypotContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp index c5e0feea12..e4d615f87e 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp @@ -150,8 +150,8 @@ sycl::event imag_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using ImagHS = hyperparam_detail::ImagContigHyperparameterSet; - constexpr std::uint8_t vec_sz = ImagHS::vec_sz; - constexpr std::uint8_t n_vecs = ImagHS::n_vecs; + static constexpr std::uint8_t vec_sz = ImagHS::vec_sz; + static constexpr std::uint8_t n_vecs = ImagHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, ImagOutputType, ImagContigFunctor, imag_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp index b0651a4d8b..4615f9ce93 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp @@ -151,8 +151,8 @@ sycl::event isfinite_contig_impl(sycl::queue &exec_q, { using IsFiniteHS = hyperparam_detail::IsFiniteContigHyperparameterSet; - constexpr std::uint8_t vec_sz = IsFiniteHS::vec_sz; - constexpr std::uint8_t n_vecs = IsFiniteHS::n_vecs; + static constexpr std::uint8_t vec_sz = IsFiniteHS::vec_sz; + static constexpr std::uint8_t n_vecs = IsFiniteHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, IsFiniteOutputType, IsFiniteContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp index ec78746143..9291eeeb72 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp @@ -150,8 +150,8 @@ sycl::event isinf_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using IsInfHS = hyperparam_detail::IsInfContigHyperparameterSet; - constexpr std::uint8_t vec_sz = IsInfHS::vec_sz; - constexpr std::uint8_t n_vecs = IsInfHS::n_vecs; + static constexpr std::uint8_t vec_sz = IsInfHS::vec_sz; + static constexpr std::uint8_t n_vecs = IsInfHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, IsInfOutputType, IsInfContigFunctor, isinf_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp 
b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp index fbf6ef9383..397037dbae 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp @@ -148,8 +148,8 @@ sycl::event isnan_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using IsNanHS = hyperparam_detail::IsNanContigHyperparameterSet; - constexpr std::uint8_t vec_sz = IsNanHS::vec_sz; - constexpr std::uint8_t n_vecs = IsNanHS::n_vecs; + static constexpr std::uint8_t vec_sz = IsNanHS::vec_sz; + static constexpr std::uint8_t n_vecs = IsNanHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, IsNanOutputType, IsNanContigFunctor, isnan_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp index 523410a161..f323e28a93 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp @@ -230,8 +230,8 @@ sycl::event less_contig_impl(sycl::queue &exec_q, { using LessHS = hyperparam_detail::LessContigHyperparameterSet; - constexpr std::uint8_t vec_sz = LessHS::vec_sz; - constexpr std::uint8_t n_vecs = LessHS::n_vecs; + static constexpr std::uint8_t vec_sz = LessHS::vec_sz; + static constexpr std::uint8_t n_vecs = LessHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, LessOutputType, LessContigFunctor, less_contig_kernel, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp index 5827d350a3..8ae6f236fc 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp @@ -232,8 +232,8 @@ sycl::event less_equal_contig_impl(sycl::queue &exec_q, { using LessEqHS = hyperparam_detail::LessEqualContigHyperparameterSet; - constexpr std::uint8_t vec_sz = LessEqHS::vec_sz; - constexpr std::uint8_t n_vecs = LessEqHS::n_vecs; + static constexpr std::uint8_t vec_sz = LessEqHS::vec_sz; + static constexpr std::uint8_t n_vecs = LessEqHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, LessEqualOutputType, LessEqualContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp index 84471a5ef4..b6a934677d 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp @@ -140,8 +140,8 @@ sycl::event log_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using LogHS = hyperparam_detail::LogContigHyperparameterSet; - constexpr std::uint8_t vec_sz = LogHS::vec_sz; - constexpr std::uint8_t n_vecs = LogHS::n_vecs; + static constexpr std::uint8_t vec_sz = LogHS::vec_sz; + static constexpr std::uint8_t n_vecs = LogHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, LogOutputType, LogContigFunctor, log_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp index d308c85ac9..bf27099e6a 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp +++ 
b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp @@ -159,8 +159,8 @@ sycl::event log10_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using Log10HS = hyperparam_detail::Log10ContigHyperparameterSet; - constexpr std::uint8_t vec_sz = Log10HS::vec_sz; - constexpr std::uint8_t n_vecs = Log10HS::n_vecs; + static constexpr std::uint8_t vec_sz = Log10HS::vec_sz; + static constexpr std::uint8_t n_vecs = Log10HS::n_vecs; return elementwise_common::unary_contig_impl< argTy, Log10OutputType, Log10ContigFunctor, log10_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp index b8d993dd94..be14d459ee 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp @@ -164,8 +164,8 @@ sycl::event log1p_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using Log1pHS = hyperparam_detail::Log1pContigHyperparameterSet; - constexpr std::uint8_t vec_sz = Log1pHS::vec_sz; - constexpr std::uint8_t n_vecs = Log1pHS::n_vecs; + static constexpr std::uint8_t vec_sz = Log1pHS::vec_sz; + static constexpr std::uint8_t n_vecs = Log1pHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, Log1pOutputType, Log1pContigFunctor, log1p_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp index 42c837cfa3..0548c61f9d 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp @@ -160,8 +160,8 @@ sycl::event log2_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using Log2HS = hyperparam_detail::Log2ContigHyperparameterSet; - constexpr std::uint8_t vec_sz = Log2HS::vec_sz; - constexpr std::uint8_t n_vecs = Log2HS::n_vecs; + static constexpr std::uint8_t vec_sz = Log2HS::vec_sz; + static constexpr std::uint8_t n_vecs = Log2HS::n_vecs; return elementwise_common::unary_contig_impl< argTy, Log2OutputType, Log2ContigFunctor, log2_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp index 0076a0bbe3..cd1dc11e9c 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp @@ -177,8 +177,8 @@ sycl::event logaddexp_contig_impl(sycl::queue &exec_q, { using LogAddExpHS = hyperparam_detail::LogAddExpContigHyperparameterSet; - constexpr std::uint8_t vec_sz = LogAddExpHS::vec_sz; - constexpr std::uint8_t n_vecs = LogAddExpHS::n_vecs; + static constexpr std::uint8_t vec_sz = LogAddExpHS::vec_sz; + static constexpr std::uint8_t n_vecs = LogAddExpHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, LogAddExpOutputType, LogAddExpContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp index 0aa1f61b90..6b91ff6915 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp @@ -203,8 +203,8 @@ 
logical_and_contig_impl(sycl::queue &exec_q, { using LogicalAndHS = hyperparam_detail::LogicalAndContigHyperparameterSet; - constexpr std::uint8_t vec_sz = LogicalAndHS::vec_sz; - constexpr std::uint8_t n_vecs = LogicalAndHS::n_vecs; + static constexpr std::uint8_t vec_sz = LogicalAndHS::vec_sz; + static constexpr std::uint8_t n_vecs = LogicalAndHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, LogicalAndOutputType, LogicalAndContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp index e6d3e6d5ad..d4f9ec671f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp @@ -125,8 +125,8 @@ logical_not_contig_impl(sycl::queue &exec_q, { using LogicalNotHS = hyperparam_detail::LogicalNotContigHyperparameterSet; - constexpr std::uint8_t vec_sz = LogicalNotHS::vec_sz; - constexpr std::uint8_t n_vecs = LogicalNotHS::n_vecs; + static constexpr std::uint8_t vec_sz = LogicalNotHS::vec_sz; + static constexpr std::uint8_t n_vecs = LogicalNotHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, LogicalNotOutputType, LogicalNotContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp index 1fdcd84f60..fdf3d134de 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp @@ -201,8 +201,8 @@ sycl::event logical_or_contig_impl(sycl::queue &exec_q, { using LogicalOrHS = hyperparam_detail::LogicalOrContigHyperparameterSet; - constexpr std::uint8_t vec_sz = LogicalOrHS::vec_sz; - constexpr std::uint8_t n_vecs = LogicalOrHS::n_vecs; + static constexpr std::uint8_t vec_sz = LogicalOrHS::vec_sz; + static constexpr std::uint8_t n_vecs = LogicalOrHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, LogicalOrOutputType, LogicalOrContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp index 0ef3b17dff..b1521a238a 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp @@ -204,8 +204,8 @@ logical_xor_contig_impl(sycl::queue &exec_q, { using LogicalXorHS = hyperparam_detail::LogicalXorContigHyperparameterSet; - constexpr std::uint8_t vec_sz = LogicalXorHS::vec_sz; - constexpr std::uint8_t n_vecs = LogicalXorHS::n_vecs; + static constexpr std::uint8_t vec_sz = LogicalXorHS::vec_sz; + static constexpr std::uint8_t n_vecs = LogicalXorHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, LogicalXorOutputType, LogicalXorContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp index 799cbb1d8c..6fd8faf648 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp @@ -235,8 +235,8 @@ sycl::event maximum_contig_impl(sycl::queue &exec_q, { using MaxHS = hyperparam_detail::MaximumContigHyperparameterSet; - constexpr 
std::uint8_t vec_sz = MaxHS::vec_sz; - constexpr std::uint8_t n_vecs = MaxHS::n_vecs; + static constexpr std::uint8_t vec_sz = MaxHS::vec_sz; + static constexpr std::uint8_t n_vecs = MaxHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, MaximumOutputType, MaximumContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp index 9a672e539f..37b43eb0a0 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp @@ -235,8 +235,8 @@ sycl::event minimum_contig_impl(sycl::queue &exec_q, { using MinHS = hyperparam_detail::MinimumContigHyperparameterSet; - constexpr std::uint8_t vec_sz = MinHS::vec_sz; - constexpr std::uint8_t n_vecs = MinHS::n_vecs; + static constexpr std::uint8_t vec_sz = MinHS::vec_sz; + static constexpr std::uint8_t n_vecs = MinHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, MinimumOutputType, MinimumContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp index ca24383b44..798610f445 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp @@ -227,8 +227,8 @@ sycl::event multiply_contig_impl(sycl::queue &exec_q, { using MulHS = hyperparam_detail::MultiplyContigHyperparameterSet; - constexpr std::uint8_t vec_sz = MulHS::vec_sz; - constexpr std::uint8_t n_vecs = MulHS::n_vecs; + static constexpr std::uint8_t vec_sz = MulHS::vec_sz; + static constexpr std::uint8_t n_vecs = MulHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, MultiplyOutputType, MultiplyContigFunctor, @@ -513,8 +513,8 @@ multiply_inplace_contig_impl(sycl::queue &exec_q, { using MulHS = hyperparam_detail::MultiplyContigHyperparameterSet; - constexpr std::uint8_t vec_sz = MulHS::vec_sz; - constexpr std::uint8_t n_vecs = MulHS::n_vecs; + static constexpr std::uint8_t vec_sz = MulHS::vec_sz; + static constexpr std::uint8_t n_vecs = MulHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, MultiplyInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp index 47707a5f04..4a8fc76bcc 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp @@ -130,8 +130,8 @@ sycl::event negative_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using NegHS = hyperparam_detail::NegativeContigHyperparameterSet; - constexpr std::uint8_t vec_sz = NegHS::vec_sz; - constexpr std::uint8_t n_vecs = NegHS::n_vecs; + static constexpr std::uint8_t vec_sz = NegHS::vec_sz; + static constexpr std::uint8_t n_vecs = NegHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, NegativeOutputType, NegativeContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp index 4a54a115f3..f68e4e8295 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp +++ 
b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp @@ -160,8 +160,8 @@ sycl::event nextafter_contig_impl(sycl::queue &exec_q, { using NextafterHS = hyperparam_detail::NextafterContigHyperparameterSet; - constexpr std::uint8_t vec_sz = NextafterHS::vec_sz; - constexpr std::uint8_t n_vecs = NextafterHS::n_vecs; + static constexpr std::uint8_t vec_sz = NextafterHS::vec_sz; + static constexpr std::uint8_t n_vecs = NextafterHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, NextafterOutputType, NextafterContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp index f21bc678fd..f8f436ec4f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp @@ -216,8 +216,8 @@ sycl::event not_equal_contig_impl(sycl::queue &exec_q, { using NotEqHS = hyperparam_detail::NotEqualContigHyperparameterSet; - constexpr std::uint8_t vec_sz = NotEqHS::vec_sz; - constexpr std::uint8_t n_vecs = NotEqHS::n_vecs; + static constexpr std::uint8_t vec_sz = NotEqHS::vec_sz; + static constexpr std::uint8_t n_vecs = NotEqHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, NotEqualOutputType, NotEqualContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp index df6c04021f..9bc9e5782a 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp @@ -145,8 +145,8 @@ sycl::event positive_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using PosHS = hyperparam_detail::PositiveContigHyperparameterSet; - constexpr std::uint8_t vec_sz = PosHS::vec_sz; - constexpr std::uint8_t n_vecs = PosHS::n_vecs; + static constexpr std::uint8_t vec_sz = PosHS::vec_sz; + static constexpr std::uint8_t n_vecs = PosHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, PositiveOutputType, PositiveContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp index d7b0ed909e..326d83f412 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp @@ -277,8 +277,8 @@ sycl::event pow_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using PowHS = hyperparam_detail::PowContigHyperparameterSet; - constexpr std::uint8_t vec_sz = PowHS::vec_sz; - constexpr std::uint8_t n_vecs = PowHS::n_vecs; + static constexpr std::uint8_t vec_sz = PowHS::vec_sz; + static constexpr std::uint8_t n_vecs = PowHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, PowOutputType, PowContigFunctor, pow_contig_kernel, @@ -522,8 +522,8 @@ pow_inplace_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using PowHS = hyperparam_detail::PowContigHyperparameterSet; - constexpr std::uint8_t vec_sz = PowHS::vec_sz; - constexpr std::uint8_t n_vecs = PowHS::n_vecs; + static constexpr std::uint8_t vec_sz = PowHS::vec_sz; + static constexpr std::uint8_t n_vecs = PowHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, PowInplaceContigFunctor, 
pow_inplace_contig_kernel, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp index df5edface1..a9d630e762 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp @@ -149,8 +149,8 @@ sycl::event proj_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using ProjHS = hyperparam_detail::ProjContigHyperparameterSet; - constexpr std::uint8_t vec_sz = ProjHS::vec_sz; - constexpr std::uint8_t n_vecs = ProjHS::n_vecs; + static constexpr std::uint8_t vec_sz = ProjHS::vec_sz; + static constexpr std::uint8_t n_vecs = ProjHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, ProjOutputType, ProjContigFunctor, proj_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp index 9ecb822a20..a8b5c719a0 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp @@ -149,8 +149,8 @@ sycl::event real_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using RealHS = hyperparam_detail::RealContigHyperparameterSet; - constexpr std::uint8_t vec_sz = RealHS::vec_sz; - constexpr std::uint8_t n_vecs = RealHS::n_vecs; + static constexpr std::uint8_t vec_sz = RealHS::vec_sz; + static constexpr std::uint8_t n_vecs = RealHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, RealOutputType, RealContigFunctor, real_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp index 0e46acba39..a320aa3181 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp @@ -145,8 +145,8 @@ sycl::event reciprocal_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using RecipHS = hyperparam_detail::ReciprocalContigHyperparameterSet; - constexpr std::uint8_t vec_sz = RecipHS::vec_sz; - constexpr std::uint8_t n_vecs = RecipHS::n_vecs; + static constexpr std::uint8_t vec_sz = RecipHS::vec_sz; + static constexpr std::uint8_t n_vecs = RecipHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, ReciprocalOutputType, ReciprocalContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp index 6d3baf0d1f..028b078c6b 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp @@ -262,8 +262,8 @@ sycl::event remainder_contig_impl(sycl::queue &exec_q, { using RemHS = hyperparam_detail::RemainderContigHyperparameterSet; - constexpr std::uint8_t vec_sz = RemHS::vec_sz; - constexpr std::uint8_t n_vecs = RemHS::n_vecs; + static constexpr std::uint8_t vec_sz = RemHS::vec_sz; + static constexpr std::uint8_t n_vecs = RemHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, RemainderOutputType, RemainderContigFunctor, @@ -495,8 +495,8 @@ remainder_inplace_contig_impl(sycl::queue &exec_q, { using RemHS = hyperparam_detail::RemainderContigHyperparameterSet; - constexpr 
std::uint8_t vec_sz = RemHS::vec_sz; - constexpr std::uint8_t n_vecs = RemHS::n_vecs; + static constexpr std::uint8_t vec_sz = RemHS::vec_sz; + static constexpr std::uint8_t n_vecs = RemHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, RemainderInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp index 7fbb20ae32..f83349d6e5 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp @@ -156,8 +156,8 @@ sycl::event round_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using RoundHS = hyperparam_detail::RoundContigHyperparameterSet; - constexpr std::uint8_t vec_sz = RoundHS::vec_sz; - constexpr std::uint8_t n_vecs = RoundHS::n_vecs; + static constexpr std::uint8_t vec_sz = RoundHS::vec_sz; + static constexpr std::uint8_t n_vecs = RoundHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, RoundOutputType, RoundContigFunctor, round_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp index 0450246807..ba78e19fa9 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp @@ -127,8 +127,8 @@ sycl::event rsqrt_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using RsqrtHS = hyperparam_detail::RsqrtContigHyperparameterSet; - constexpr std::uint8_t vec_sz = RsqrtHS::vec_sz; - constexpr std::uint8_t n_vecs = RsqrtHS::n_vecs; + static constexpr std::uint8_t vec_sz = RsqrtHS::vec_sz; + static constexpr std::uint8_t n_vecs = RsqrtHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, RsqrtOutputType, RsqrtContigFunctor, rsqrt_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp index baa224942f..97c3305c7f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp @@ -168,8 +168,8 @@ sycl::event sign_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using SignHS = hyperparam_detail::SignContigHyperparameterSet; - constexpr std::uint8_t vec_sz = SignHS::vec_sz; - constexpr std::uint8_t n_vecs = SignHS::n_vecs; + static constexpr std::uint8_t vec_sz = SignHS::vec_sz; + static constexpr std::uint8_t n_vecs = SignHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, SignOutputType, SignContigFunctor, sign_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp index 9020bceb7e..01d25453cd 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp @@ -134,8 +134,8 @@ sycl::event signbit_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using SignbitHS = hyperparam_detail::SignbitContigHyperparameterSet; - constexpr std::uint8_t vec_sz = SignbitHS::vec_sz; - constexpr std::uint8_t n_vecs = SignbitHS::n_vecs; + static constexpr std::uint8_t vec_sz = SignbitHS::vec_sz; + 
static constexpr std::uint8_t n_vecs = SignbitHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, SignbitOutputType, SignbitContigFunctor, signbit_contig_kernel, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp index e075a90a88..eede1e82c9 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp @@ -70,7 +70,8 @@ template <typename argT, typename resT> struct SinFunctor if constexpr (is_complex<argT>::value) { using realT = typename argT::value_type; - constexpr realT q_nan = std::numeric_limits<realT>::quiet_NaN(); + static constexpr realT q_nan = + std::numeric_limits<realT>::quiet_NaN(); realT const &in_re = std::real(in); realT const &in_im = std::imag(in); @@ -247,8 +248,8 @@ sycl::event sin_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using SinHS = hyperparam_detail::SinContigHyperparameterSet; - constexpr std::uint8_t vec_sz = SinHS::vec_sz; - constexpr std::uint8_t n_vecs = SinHS::n_vecs; + static constexpr std::uint8_t vec_sz = SinHS::vec_sz; + static constexpr std::uint8_t n_vecs = SinHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, SinOutputType, SinContigFunctor, sin_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp index 23b3588a3b..e26631d5dc 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp @@ -216,8 +216,8 @@ sycl::event sinh_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using SinhHS = hyperparam_detail::SinhContigHyperparameterSet; - constexpr std::uint8_t vec_sz = SinhHS::vec_sz; - constexpr std::uint8_t n_vecs = SinhHS::n_vecs; + static constexpr std::uint8_t vec_sz = SinhHS::vec_sz; + static constexpr std::uint8_t n_vecs = SinhHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, SinhOutputType, SinhContigFunctor, sinh_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp index b83ff72495..ed80521acd 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp @@ -142,8 +142,8 @@ sycl::event sqrt_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using SqrtHS = hyperparam_detail::SqrtContigHyperparameterSet; - constexpr std::uint8_t vec_sz = SqrtHS::vec_sz; - constexpr std::uint8_t n_vecs = SqrtHS::n_vecs; + static constexpr std::uint8_t vec_sz = SqrtHS::vec_sz; + static constexpr std::uint8_t n_vecs = SqrtHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, SqrtOutputType, SqrtContigFunctor, sqrt_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp index f9d9d848c0..7aa12452cc 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp @@ -167,8 +167,8 @@ sycl::event square_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using SquareHS = hyperparam_detail::SquareContigHyperparameterSet; - constexpr 
std::uint8_t vec_sz = SquareHS::vec_sz; - constexpr std::uint8_t n_vecs = SquareHS::n_vecs; + static constexpr std::uint8_t vec_sz = SquareHS::vec_sz; + static constexpr std::uint8_t n_vecs = SquareHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, SquareOutputType, SquareContigFunctor, square_contig_kernel, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp index 51a3955142..ee817c2941 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp @@ -213,8 +213,8 @@ sycl::event subtract_contig_impl(sycl::queue &exec_q, { using SubHS = hyperparam_detail::SubtractContigHyperparameterSet; - constexpr std::uint8_t vec_sz = SubHS::vec_sz; - constexpr std::uint8_t n_vecs = SubHS::n_vecs; + static constexpr std::uint8_t vec_sz = SubHS::vec_sz; + static constexpr std::uint8_t n_vecs = SubHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, SubtractOutputType, SubtractContigFunctor, @@ -511,8 +511,8 @@ subtract_inplace_contig_impl(sycl::queue &exec_q, { using SubHS = hyperparam_detail::SubtractContigHyperparameterSet; - constexpr std::uint8_t vec_sz = SubHS::vec_sz; - constexpr std::uint8_t n_vecs = SubHS::n_vecs; + static constexpr std::uint8_t vec_sz = SubHS::vec_sz; + static constexpr std::uint8_t n_vecs = SubHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, SubtractInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp index 770518f918..fc67ca9a25 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp @@ -73,7 +73,8 @@ template <typename argT, typename resT> struct TanFunctor using realT = typename argT::value_type; - constexpr realT q_nan = std::numeric_limits<realT>::quiet_NaN(); + static constexpr realT q_nan = + std::numeric_limits<realT>::quiet_NaN(); /* * since tan(in) = -I * tanh(I * in), for special cases, * we calculate real and imaginary parts of z = tanh(I * in) and @@ -191,8 +192,8 @@ sycl::event tan_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using TanHS = hyperparam_detail::TanContigHyperparameterSet; - constexpr std::uint8_t vec_sz = TanHS::vec_sz; - constexpr std::uint8_t n_vecs = TanHS::n_vecs; + static constexpr std::uint8_t vec_sz = TanHS::vec_sz; + static constexpr std::uint8_t n_vecs = TanHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, TanOutputType, TanContigFunctor, tan_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp index 1d06fd3c4f..dda2e2914f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp @@ -73,7 +73,8 @@ template <typename argT, typename resT> struct TanhFunctor if constexpr (is_complex<argT>::value) { using realT = typename argT::value_type; - constexpr realT q_nan = std::numeric_limits<realT>::quiet_NaN(); + static constexpr realT q_nan = + std::numeric_limits<realT>::quiet_NaN(); const realT x = std::real(in); const realT y = std::imag(in); @@ -185,8 +186,8 @@ sycl::event tanh_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using TanhHS = 
hyperparam_detail::TanhContigHyperparameterSet; - constexpr std::uint8_t vec_sz = TanhHS::vec_sz; - constexpr std::uint8_t n_vecs = TanhHS::n_vecs; + static constexpr std::uint8_t vec_sz = TanhHS::vec_sz; + static constexpr std::uint8_t n_vecs = TanhHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, TanhOutputType, TanhContigFunctor, tanh_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp index de6c9a8723..1aa3fbd482 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp @@ -221,8 +221,8 @@ true_divide_contig_impl(sycl::queue &exec_q, { using DivHS = hyperparam_detail::TrueDivideContigHyperparameterSet; - constexpr std::uint8_t vec_sz = DivHS::vec_sz; - constexpr std::uint8_t n_vecs = DivHS::n_vecs; + static constexpr std::uint8_t vec_sz = DivHS::vec_sz; + static constexpr std::uint8_t n_vecs = DivHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, TrueDivideOutputType, TrueDivideContigFunctor, @@ -540,8 +540,8 @@ true_divide_inplace_contig_impl(sycl::queue &exec_q, { using DivHS = hyperparam_detail::TrueDivideContigHyperparameterSet; - constexpr std::uint8_t vec_sz = DivHS::vec_sz; - constexpr std::uint8_t n_vecs = DivHS::vec_sz; + static constexpr std::uint8_t vec_sz = DivHS::vec_sz; + static constexpr std::uint8_t n_vecs = DivHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, TrueDivideInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp index 4c776e3560..61a37ca1b2 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp @@ -143,8 +143,8 @@ sycl::event trunc_contig_impl(sycl::queue &exec_q, const std::vector &depends = {}) { using TruncHS = hyperparam_detail::TruncContigHyperparameterSet; - constexpr std::uint8_t vec_sz = TruncHS::vec_sz; - constexpr std::uint8_t n_vecs = TruncHS::n_vecs; + static constexpr std::uint8_t vec_sz = TruncHS::vec_sz; + static constexpr std::uint8_t n_vecs = TruncHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, TruncOutputType, TruncContigFunctor, trunc_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp index 285ae3575a..75807d3446 100644 --- a/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -96,7 +96,7 @@ class TakeFunctor ssize_t src_offset = orthog_offsets.get_first_offset(); ssize_t dst_offset = orthog_offsets.get_second_offset(); - constexpr ProjectorT proj{}; + static constexpr ProjectorT proj{}; for (int axis_idx = 0; axis_idx < k_; ++axis_idx) { indT *ind_data = reinterpret_cast(ind_[axis_idx]); @@ -242,7 +242,7 @@ class PutFunctor ssize_t dst_offset = orthog_offsets.get_first_offset(); ssize_t val_offset = orthog_offsets.get_second_offset(); - constexpr ProjectorT proj{}; + static constexpr ProjectorT proj{}; for (int axis_idx = 0; axis_idx < k_; ++axis_idx) { indT *ind_data = reinterpret_cast(ind_[axis_idx]); diff --git 
a/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp b/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp index 71e2c15b6b..ceec04c0a1 100644 --- a/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp +++ b/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp @@ -507,7 +507,7 @@ sycl::event dot_product_impl(sycl::queue &exec_q, reduction_rhs_offset, reduction_shape_stride}; - constexpr std::size_t preferred_reductions_per_wi = + static constexpr std::size_t preferred_reductions_per_wi = 4; // determined experimentally std::size_t reductions_per_wi = (reduction_nelems < preferred_reductions_per_wi * wg) @@ -584,8 +584,8 @@ dot_product_contig_impl(sycl::queue &exec_q, /* step */ reduction_nelems}; const InputOutputBatchIndexerT inp_out_batch_indexer{ inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}}; - constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{}, - NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; sycl::event dot_ev = sequential_dot_product::value; // more than one work-groups is needed, requires a temporary @@ -1053,7 +1053,7 @@ sycl::event dot_product_tree_impl(sycl::queue &exec_q, const RhsIndexerT rhs_indexer( batch_nd, batch_rhs_offset, batch_shape_and_strides, batch_shape_and_strides + 2 * batch_nd); - constexpr ResIndexerT noop_tmp_indexer{}; + static constexpr ResIndexerT noop_tmp_indexer{}; const InputOutputBatchIndexerT in_out_iter_indexer{ lhs_indexer, rhs_indexer, noop_tmp_indexer}; @@ -1093,11 +1093,11 @@ sycl::event dot_product_tree_impl(sycl::queue &exec_q, const InputIndexerT inp_indexer{/* size */ batches, /* step */ reduction_groups_}; - constexpr ResIndexerT res_iter_indexer{}; + static constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; sycl::event partial_reduction_ev = dpctl::tensor::kernels::submit_no_atomic_reduction< @@ -1130,7 +1130,7 @@ sycl::event dot_product_tree_impl(sycl::queue &exec_q, const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; wg = max_wg; reductions_per_wi = std::max( @@ -1198,8 +1198,8 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, /* step */ reduction_nelems}; const InputOutputBatchIndexerT inp_out_batch_indexer{ inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}}; - constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{}, - NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; sycl::event dot_ev = sequential_dot_product::value; // more than one work-groups is needed, requires a temporary @@ -1300,8 +1300,8 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, /* step */ reduction_nelems}; const InputOutputBatchIndexerT inp_out_batch_indexer{ inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}}; - constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{}, - NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{ + NoOpIndexerT{}, NoOpIndexerT{}}; first_reduction_ev = submit_no_atomic_dot_product< lhsTy, rhsTy, resTy, ReductionOpT, InputOutputBatchIndexerT, @@ -1335,11 +1335,11 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, const InputIndexerT inp_indexer{/* size */ batches, /* step */ 
reduction_groups_}; - constexpr ResIndexerT res_iter_indexer{}; + static constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; sycl::event partial_reduction_ev = dpctl::tensor::kernels::submit_no_atomic_reduction< @@ -1365,11 +1365,11 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, const InputIndexerT inp_indexer{/* size */ batches, /* step */ remaining_reduction_nelems}; - constexpr ResIndexerT res_iter_indexer{}; + static constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; wg = max_wg; reductions_per_wi = std::max( diff --git a/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp b/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp index 4ad4eb142a..2b5a42af19 100644 --- a/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp +++ b/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp @@ -59,7 +59,7 @@ void scale_gemm_k_parameters(const std::size_t &local_mem_size, std::size_t &n_wi, std::size_t &delta_n) { - constexpr std::size_t slm_elem_size = sizeof(T) * m_groups; + static constexpr std::size_t slm_elem_size = sizeof(T) * m_groups; while (slm_elem_size * (n_wi + delta_n) * delta_k + reserved_slm_size >= local_mem_size) @@ -79,8 +79,8 @@ void scale_gemm_nm_parameters(const std::size_t &local_mem_size, std::size_t &wg_delta_n, std::size_t &wg_delta_m) { - constexpr std::size_t slm_A_elem_size = sizeof(T); - constexpr std::size_t slm_B_elem_size = sizeof(T) * wi_delta_m; + static constexpr std::size_t slm_A_elem_size = sizeof(T); + static constexpr std::size_t slm_B_elem_size = sizeof(T) * wi_delta_m; while ((wi_delta_n * wg_delta_n * wi_delta_k * slm_A_elem_size) + (wi_delta_k * wg_delta_m * slm_B_elem_size) + @@ -210,8 +210,8 @@ single_reduction_for_gemm_contig(sycl::queue &exec_q, NoOpIndexerT, NoOpIndexerT>; using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; - constexpr InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, - NoOpIndexerT{}}; + static constexpr InputOutputIterIndexerT in_out_iter_indexer{ + NoOpIndexerT{}, NoOpIndexerT{}}; // tmp allocation is a C-contiguous matrix (reduction_nelems, // iter_nelems) and we are reducing by axis 0 const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, @@ -239,8 +239,8 @@ single_reduction_for_gemm_contig(sycl::queue &exec_q, NoOpIndexerT, NoOpIndexerT>; using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; - constexpr InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, - NoOpIndexerT{}}; + static constexpr InputOutputIterIndexerT in_out_iter_indexer{ + NoOpIndexerT{}, NoOpIndexerT{}}; // tmp allocation is a C-contiguous matrix // (reduction_nelems, iter_nelems). 
Reducing along axis 0 const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, @@ -294,8 +294,8 @@ sycl::event tree_reduction_for_gemm(sycl::queue &exec_q, NoOpIndexerT, NoOpIndexerT>; using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; - constexpr InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, - NoOpIndexerT{}}; + static constexpr InputOutputIterIndexerT in_out_iter_indexer{ + NoOpIndexerT{}, NoOpIndexerT{}}; // partially_reduced_tmp is C-contig matrix with shape // (reduction_nelems, iter_nelems). Reducing along axis 0. const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, @@ -331,12 +331,12 @@ sycl::event tree_reduction_for_gemm(sycl::queue &exec_q, const InputIndexerT inp_indexer{/* size */ iter_nelems, /* step */ reduction_groups_}; - constexpr ResIndexerT res_iter_indexer{}; + static constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; sycl::event partial_reduction_ev = dpctl::tensor::kernels::submit_no_atomic_reduction< @@ -369,7 +369,7 @@ sycl::event tree_reduction_for_gemm(sycl::queue &exec_q, const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; wg = max_wg; reductions_per_wi = @@ -416,8 +416,8 @@ tree_reduction_for_gemm_contig(sycl::queue &exec_q, NoOpIndexerT>; using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; - constexpr InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, - NoOpIndexerT{}}; + static constexpr InputOutputIterIndexerT in_out_iter_indexer{ + NoOpIndexerT{}, NoOpIndexerT{}}; const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, /* step */ iter_nelems}; @@ -454,12 +454,12 @@ tree_reduction_for_gemm_contig(sycl::queue &exec_q, // along the stack axis const InputIndexerT inp_indexer{/* size */ iter_nelems, /* step */ reduction_groups_}; - constexpr ResIndexerT res_iter_indexer{}; + static constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; sycl::event partial_reduction_ev = dpctl::tensor::kernels::submit_no_atomic_reduction< @@ -487,11 +487,11 @@ tree_reduction_for_gemm_contig(sycl::queue &exec_q, const InputIndexerT inp_indexer{ /* size */ iter_nelems, /* step */ remaining_reduction_nelems}; - constexpr ResIndexerT res_iter_indexer{}; + static constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; wg = max_wg; reductions_per_wi = std::max( @@ -615,7 +615,7 @@ class GemmBatchFunctorThreadK using accV_t = typename LocAccT::value_type; - constexpr resT identity_ = resT(0); + static constexpr resT identity_ = resT(0); if (local_i == 0) { for (std::size_t q = 0; q < n_wi * delta_k; q += delta_k) { const std::size_t sq = s + q; @@ -651,7 +651,7 @@ class GemmBatchFunctorThreadK std::size_t global_s_offset = i * k + t_shift; accV_t private_sum(identity_); - constexpr accV_t vec_identity_(identity_); + static constexpr accV_t vec_identity_(identity_); for (std::size_t t = local_s; t < 
local_B_block.size(); t += delta_k) { private_sum += ((i < n) && (t + t_shift < k)) @@ -749,7 +749,7 @@ sycl::event _gemm_k_impl(sycl::queue &exec_q, const ResIndexerT &res_indexer, const std::vector &depends) { - constexpr std::size_t m_groups = 4; + static constexpr std::size_t m_groups = 4; const std::size_t delta_k(4); std::size_t n_wi(64); std::size_t delta_n(32); @@ -823,7 +823,7 @@ sycl::event _gemm_small_m_impl(sycl::queue &exec_q, const ResIndexerT &res_indexer, const std::vector &depends) { - constexpr std::size_t m_groups = 1; + static constexpr std::size_t m_groups = 1; const std::size_t delta_k(4); std::size_t n_wi(64); std::size_t delta_n(32); @@ -942,8 +942,9 @@ class GemmBatchFunctorThreadNM_vecm void operator()(sycl::nd_item<1> it) const { - constexpr resT zero_(0); - constexpr std::uint32_t wi_total_delta_m = wi_delta_m_vecs * m_vec_size; + static constexpr resT zero_(0); + static constexpr std::uint32_t wi_total_delta_m = + wi_delta_m_vecs * m_vec_size; const std::size_t gws_per_batch = it.get_group_range(0) / batch_nelems; const std::size_t batch_id = it.get_group_linear_id() / gws_per_batch; @@ -987,7 +988,7 @@ class GemmBatchFunctorThreadNM_vecm // allocate/initialize private matrix C // size ( wi_total_delta_n, wi_total_delta_m ) - constexpr std::uint32_t C_size = wi_delta_n * wi_delta_m_vecs; + static constexpr std::uint32_t C_size = wi_delta_n * wi_delta_m_vecs; std::array private_C{slmB_t{zero_}}; for (std::size_t s = 0; s < k; s += wi_delta_k) { @@ -1281,16 +1282,18 @@ sycl::event _gemm_batch_nm_impl(sycl::queue &exec_q, const ResIndexerT &res_indexer, std::vector const &depends) { - constexpr GemmBatchFunctorThreadNM_vecm_HyperParametersSelector + static constexpr GemmBatchFunctorThreadNM_vecm_HyperParametersSelector< + resTy> selector{}; - constexpr auto hyper_params = selector.get(); + static constexpr auto hyper_params = selector.get(); - constexpr std::uint32_t wi_delta_n = hyper_params.get_wi_delta_n(); - constexpr std::uint32_t wi_delta_m_vecs = + static constexpr std::uint32_t wi_delta_n = hyper_params.get_wi_delta_n(); + static constexpr std::uint32_t wi_delta_m_vecs = hyper_params.get_wi_delta_m_vecs(); - constexpr std::uint32_t m_vec_size = hyper_params.get_m_vec_size(); + static constexpr std::uint32_t m_vec_size = hyper_params.get_m_vec_size(); - constexpr std::uint32_t wi_total_delta_m = wi_delta_m_vecs * m_vec_size; + static constexpr std::uint32_t wi_total_delta_m = + wi_delta_m_vecs * m_vec_size; using KernelName = class gemm_batch_nm_vecm_krn(dev); // Limit work-group size - constexpr std::size_t wg_sz_limit(2048); + static constexpr std::size_t wg_sz_limit(2048); const std::size_t max_wg_sz = std::min(wg_sz_limit, k_wg_sz); const std::uint32_t max_subgroups_per_wg = @@ -1422,9 +1425,9 @@ sycl::event gemm_impl(sycl::queue &exec_q, const OuterInnerIndexerT res_indexer(res_outer_nd, 0, res_shape_strides); using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; - constexpr BatchIndexerT batch_indexer{}; + static constexpr BatchIndexerT batch_indexer{}; - constexpr std::size_t single_batch_nelems = 1; + static constexpr std::size_t single_batch_nelems = 1; const std::size_t min_nm = std::min(n, m); const std::size_t max_nm = std::max(n, m); @@ -1503,14 +1506,14 @@ sycl::event gemm_contig_impl(sycl::queue &exec_q, resTy *res_tp = reinterpret_cast(res_cp); using OuterInnerIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr OuterInnerIndexerT lhs_indexer{}; - constexpr OuterInnerIndexerT rhs_indexer{}; - constexpr 
OuterInnerIndexerT res_indexer{}; + static constexpr OuterInnerIndexerT lhs_indexer{}; + static constexpr OuterInnerIndexerT rhs_indexer{}; + static constexpr OuterInnerIndexerT res_indexer{}; using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; - constexpr BatchIndexerT batch_indexer{}; + static constexpr BatchIndexerT batch_indexer{}; - constexpr std::size_t single_batch_nelems = 1; + static constexpr std::size_t single_batch_nelems = 1; const std::size_t min_nm = std::min(n, m); const std::size_t max_nm = std::max(n, m); @@ -1712,9 +1715,9 @@ sycl::event gemm_batch_contig_impl(sycl::queue &exec_q, resTy *res_tp = reinterpret_cast(res_cp) + res_batch_offset; using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr OuterInnerDimsIndexerT lhs_indexer{}; - constexpr OuterInnerDimsIndexerT rhs_indexer{}; - constexpr OuterInnerDimsIndexerT res_indexer{}; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT res_indexer{}; using dpctl::tensor::offset_utils::Strided1DIndexer; using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; @@ -1941,7 +1944,7 @@ class GemmBatchNoAtomicFunctorThreadNM const std::size_t a_offset = local_i * wi_delta_k * wi_delta_n; const std::size_t b_offset = local_j * wi_delta_k; - constexpr resT identity_(0); + static constexpr resT identity_(0); for (std::uint8_t private_i = 0; private_i < wi_delta_n; ++private_i) { const std::size_t a_pr_offset = private_i * wi_delta_k; @@ -2076,7 +2079,7 @@ class GemmBatchNoAtomicFunctorThreadK using accV_t = typename LocAccT::value_type; - constexpr resT identity_ = resT(0); + static constexpr resT identity_ = resT(0); if (local_i == 0) { for (std::size_t q = 0; q < n_wi * delta_k; q += delta_k) { std::size_t sq = s + q; @@ -2112,7 +2115,7 @@ class GemmBatchNoAtomicFunctorThreadK std::size_t global_s_offset = i * k + t_shift; accV_t private_sum(identity_); - constexpr accV_t vec_identity_(identity_); + static constexpr accV_t vec_identity_(identity_); for (std::size_t t = local_s; t < local_B_block.size(); t += delta_k) { private_sum += ((i < n) && (t + t_shift < k)) @@ -2315,7 +2318,7 @@ gemm_batch_tree_k_impl(sycl::queue &exec_q, typename std::conditional, sycl::logical_or, sycl::plus>::type; - constexpr resTy identity_val = + static constexpr resTy identity_val = sycl::known_identity::value; std::size_t iter_nelems = batch_nelems * n * m; @@ -2329,7 +2332,7 @@ gemm_batch_tree_k_impl(sycl::queue &exec_q, dev.get_info(); std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr std::size_t preferred_reductions_per_wi = 4; + static constexpr std::size_t preferred_reductions_per_wi = 4; std::size_t reductions_per_wi(preferred_reductions_per_wi); std::size_t reduction_groups = @@ -2337,7 +2340,7 @@ gemm_batch_tree_k_impl(sycl::queue &exec_q, (preferred_reductions_per_wi * wg); // max_max_wg prevents running out of resources on CPU - constexpr std::size_t max_max_wg = 2048; + static constexpr std::size_t max_max_wg = 2048; std::size_t max_wg = std::min( max_max_wg, dev.get_info() / 2); @@ -2355,7 +2358,7 @@ gemm_batch_tree_k_impl(sycl::queue &exec_q, inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides); const OuterInnerDimsIndexerT rhs_indexer( inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides); - constexpr TmpIndexerT res_indexer{}; + static constexpr TmpIndexerT res_indexer{}; using dpctl::tensor::offset_utils::Strided1DIndexer; 
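The hunks above and below all repeat one transformation: function-local constants gain static storage duration. A minimal sketch of the distinction, using hypothetical names rather than anything from this patch:

    #include <cstddef>

    // Minimal sketch (not dpctl code): a plain constexpr local is an
    // automatic-storage object created conceptually on every call, and if a
    // SYCL kernel lambda captures it, it can travel with the kernel object
    // into device memory; a static constexpr local is a single
    // constant-initialized object that a lambda can read without capturing.
    std::size_t count_up_plain()
    {
        constexpr std::size_t limit = 64; // new object each invocation
        std::size_t acc = 0;
        for (std::size_t i = 0; i < limit; ++i)
            acc += i;
        return acc;
    }

    std::size_t count_up_static()
    {
        static constexpr std::size_t limit = 64; // one object, static storage
        std::size_t acc = 0;
        for (std::size_t i = 0; i < limit; ++i)
            acc += i;
        return acc;
    }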
using dpctl::tensor::offset_utils::StridedIndexer; @@ -2419,7 +2422,7 @@ gemm_batch_tree_k_impl(sycl::queue &exec_q, inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides); const OuterInnerDimsIndexerT rhs_indexer( inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides); - constexpr TmpIndexerT res_indexer{}; + static constexpr TmpIndexerT res_indexer{}; using dpctl::tensor::offset_utils::Strided1DIndexer; using dpctl::tensor::offset_utils::StridedIndexer; using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; @@ -2564,7 +2567,7 @@ gemm_batch_tree_nm_impl(sycl::queue &exec_q, const ssize_t *res_shape_strides, std::vector const &depends) { - constexpr int wi_delta_n = 2; + static constexpr int wi_delta_n = 2; std::size_t wg_delta_n(16); // rows of A processed in WG std::size_t wg_delta_m(16); // rows of B processed in WG std::size_t wi_delta_k(64); // Elements in K dimension processed by WI @@ -2611,7 +2614,7 @@ gemm_batch_tree_nm_impl(sycl::queue &exec_q, typename std::conditional, sycl::logical_or, sycl::plus>::type; - constexpr resTy identity_val = + static constexpr resTy identity_val = sycl::known_identity::value; std::size_t iter_nelems = batch_nelems * n * m; std::size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k; @@ -2623,7 +2626,7 @@ gemm_batch_tree_nm_impl(sycl::queue &exec_q, dev.get_info(); std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr std::size_t preferred_reductions_per_wi = 4; + static constexpr std::size_t preferred_reductions_per_wi = 4; std::size_t reductions_per_wi(preferred_reductions_per_wi); std::size_t reduction_groups = @@ -2645,7 +2648,7 @@ gemm_batch_tree_nm_impl(sycl::queue &exec_q, inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides); const OuterInnerDimsIndexerT rhs_indexer( inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides); - constexpr TmpIndexerT res_indexer{}; + static constexpr TmpIndexerT res_indexer{}; using dpctl::tensor::offset_utils::Strided1DIndexer; using dpctl::tensor::offset_utils::StridedIndexer; @@ -2708,7 +2711,7 @@ gemm_batch_tree_nm_impl(sycl::queue &exec_q, inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides); const OuterInnerDimsIndexerT rhs_indexer( inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides); - constexpr TmpIndexerT res_indexer{}; + static constexpr TmpIndexerT res_indexer{}; using dpctl::tensor::offset_utils::Strided1DIndexer; using dpctl::tensor::offset_utils::StridedIndexer; @@ -2866,7 +2869,7 @@ sycl::event gemm_batch_tree_impl(sycl::queue &exec_q, using dpctl::tensor::type_utils::is_complex; if constexpr (!is_complex::value) { if (m < 4) { - constexpr std::uint32_t m_groups_one = 1; + static constexpr std::uint32_t m_groups_one = 1; return gemm_batch_tree_k_impl( exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, @@ -2877,7 +2880,7 @@ sycl::event gemm_batch_tree_impl(sycl::queue &exec_q, res_outer_shapes_strides, res_shape_strides, depends); } else { - constexpr std::uint32_t m_groups_four = 4; + static constexpr std::uint32_t m_groups_four = 4; return gemm_batch_tree_k_impl( exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, @@ -2889,7 +2892,7 @@ sycl::event gemm_batch_tree_impl(sycl::queue &exec_q, } } else { - constexpr std::uint32_t m_groups_one = 1; + static constexpr std::uint32_t m_groups_one = 1; return gemm_batch_tree_k_impl( exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_nd, batch_shape_strides, lhs_batch_offset, rhs_batch_offset, @@ -2902,7 +2905,7 @@ sycl::event gemm_batch_tree_impl(sycl::queue 
&exec_q, else { // m > 1, n > k or m > k using dpctl::tensor::type_utils::is_complex; if constexpr (!is_complex::value) { - constexpr std::uint32_t m_groups_four = 4; + static constexpr std::uint32_t m_groups_four = 4; return gemm_batch_tree_nm_impl( exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_nd, batch_shape_strides, lhs_batch_offset, rhs_batch_offset, @@ -2912,7 +2915,7 @@ sycl::event gemm_batch_tree_impl(sycl::queue &exec_q, res_outer_shapes_strides, res_shape_strides, depends); } else { // m > 1, n > k or m > k, resTy complex - constexpr std::uint32_t m_groups_one = 1; + static constexpr std::uint32_t m_groups_one = 1; return gemm_batch_tree_nm_impl( exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_nd, batch_shape_strides, lhs_batch_offset, rhs_batch_offset, @@ -2953,9 +2956,9 @@ gemm_batch_contig_tree_k_impl(sycl::queue &exec_q, if (k <= (delta_k * n_wi)) { using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr OuterInnerDimsIndexerT lhs_indexer{}; - constexpr OuterInnerDimsIndexerT rhs_indexer{}; - constexpr OuterInnerDimsIndexerT res_indexer{}; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT res_indexer{}; using dpctl::tensor::offset_utils::Strided1DIndexer; using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; @@ -2984,7 +2987,7 @@ gemm_batch_contig_tree_k_impl(sycl::queue &exec_q, typename std::conditional, sycl::logical_or, sycl::plus>::type; - constexpr resTy identity_val = + static constexpr resTy identity_val = sycl::known_identity::value; std::size_t iter_nelems = batch_nelems * n * m; @@ -2998,7 +3001,7 @@ gemm_batch_contig_tree_k_impl(sycl::queue &exec_q, dev.get_info(); std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr std::size_t preferred_reductions_per_wi = 4; + static constexpr std::size_t preferred_reductions_per_wi = 4; std::size_t reductions_per_wi(preferred_reductions_per_wi); std::size_t reduction_groups = @@ -3015,9 +3018,9 @@ gemm_batch_contig_tree_k_impl(sycl::queue &exec_q, using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr OuterInnerDimsIndexerT lhs_indexer{}; - constexpr OuterInnerDimsIndexerT rhs_indexer{}; - constexpr OuterInnerDimsIndexerT tmp_indexer{}; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT tmp_indexer{}; using dpctl::tensor::offset_utils::Strided1DIndexer; using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; using BatchDimsIndexerT = @@ -3068,9 +3071,9 @@ gemm_batch_contig_tree_k_impl(sycl::queue &exec_q, using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr OuterInnerDimsIndexerT lhs_indexer{}; - constexpr OuterInnerDimsIndexerT rhs_indexer{}; - constexpr OuterInnerDimsIndexerT tmp_indexer{}; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT tmp_indexer{}; using dpctl::tensor::offset_utils::Strided1DIndexer; using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; using BatchDimsIndexerT = @@ -3120,7 +3123,7 @@ gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q, std::size_t m, std::vector const &depends) { - constexpr int wi_delta_n = 2; + static constexpr int wi_delta_n = 2; std::size_t wg_delta_n(16); // rows of A processed in WG 
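The m_groups branches above reduce to a small rule. A hypothetical condensation (choose_m_groups is not a dpctl function), covering the k-oriented branch of gemm_batch_tree_impl:

    #include <complex>
    #include <cstddef>
    #include <cstdint>
    #include <type_traits>

    // Complex result types always use vector width 1 (sycl::vec cannot hold
    // complex elements); real types use width 1 for very small m and 4
    // otherwise.
    template <typename resTy>
    std::uint32_t choose_m_groups(std::size_t m)
    {
        constexpr bool is_cplx =
            std::is_same_v<resTy, std::complex<float>> ||
            std::is_same_v<resTy, std::complex<double>>;
        if (is_cplx) {
            return 1;
        }
        return (m < 4) ? 1 : 4;
    }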
std::size_t wg_delta_m(16); // rows of B processed in WG std::size_t wi_delta_k(64); // Elements in K dimension processed by WI @@ -3142,9 +3145,9 @@ gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q, // temp memory if only one group is needed if (k <= wi_delta_k) { using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr OuterInnerDimsIndexerT lhs_indexer{}; - constexpr OuterInnerDimsIndexerT rhs_indexer{}; - constexpr OuterInnerDimsIndexerT res_indexer{}; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT res_indexer{}; using dpctl::tensor::offset_utils::Strided1DIndexer; using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; @@ -3172,7 +3175,7 @@ gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q, typename std::conditional, sycl::logical_or, sycl::plus>::type; - constexpr resTy identity_val = + static constexpr resTy identity_val = sycl::known_identity::value; std::size_t iter_nelems = batch_nelems * n * m; std::size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k; @@ -3184,7 +3187,7 @@ gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q, dev.get_info(); std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr std::size_t preferred_reductions_per_wi = 4; + static constexpr std::size_t preferred_reductions_per_wi = 4; std::size_t reductions_per_wi(preferred_reductions_per_wi); std::size_t reduction_groups = @@ -3202,9 +3205,9 @@ gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q, using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr OuterInnerDimsIndexerT lhs_indexer{}; - constexpr OuterInnerDimsIndexerT rhs_indexer{}; - constexpr OuterInnerDimsIndexerT tmp_indexer{}; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT tmp_indexer{}; using dpctl::tensor::offset_utils::Strided1DIndexer; using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; @@ -3256,9 +3259,9 @@ gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q, using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr OuterInnerDimsIndexerT lhs_indexer{}; - constexpr OuterInnerDimsIndexerT rhs_indexer{}; - constexpr OuterInnerDimsIndexerT tmp_indexer{}; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT tmp_indexer{}; using dpctl::tensor::offset_utils::Strided1DIndexer; using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; @@ -3325,9 +3328,9 @@ sycl::event gemm_nm_impl(sycl::queue &exec_q, using BatchDimsIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; - constexpr BatchDimsIndexerT batch_indexer{}; + static constexpr BatchDimsIndexerT batch_indexer{}; - constexpr std::size_t single_batch_nelems = 1; + static constexpr std::size_t single_batch_nelems = 1; sycl::event gemm_ev = gemm_detail::_gemm_batch_nm_impl< lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, @@ -3351,15 +3354,15 @@ gemm_batch_nm_contig_impl(sycl::queue &exec_q, std::vector const &depends = {}) { using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr OuterInnerDimsIndexerT lhs_indexer{}; - constexpr OuterInnerDimsIndexerT rhs_indexer{}; - constexpr OuterInnerDimsIndexerT res_indexer{}; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static 
constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT res_indexer{}; - constexpr std::size_t single_batch_nelems = 1; + static constexpr std::size_t single_batch_nelems = 1; if (batch_nelems == single_batch_nelems) { using BatchDimsIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; - constexpr BatchDimsIndexerT batch_indexer{}; + static constexpr BatchDimsIndexerT batch_indexer{}; sycl::event gemm_ev = gemm_detail::_gemm_batch_nm_impl< lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, @@ -3517,9 +3520,9 @@ sycl::event gemm_tree_k_impl(sycl::queue &exec_q, ); using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; - constexpr BatchIndexerT batch_indexer{}; + static constexpr BatchIndexerT batch_indexer{}; - constexpr std::size_t single_batch_nelems = 1; + static constexpr std::size_t single_batch_nelems = 1; using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer; const OuterInnerDimsIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0, @@ -3543,7 +3546,7 @@ sycl::event gemm_tree_k_impl(sycl::queue &exec_q, typename std::conditional, sycl::logical_or, sycl::plus>::type; - constexpr resTy identity_val = + static constexpr resTy identity_val = sycl::known_identity::value; std::size_t iter_nelems = n * m; @@ -3557,7 +3560,7 @@ sycl::event gemm_tree_k_impl(sycl::queue &exec_q, dev.get_info(); std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr std::size_t preferred_reductions_per_wi = 8; + static constexpr std::size_t preferred_reductions_per_wi = 8; std::size_t reductions_per_wi(preferred_reductions_per_wi); std::size_t reduction_groups = @@ -3574,7 +3577,7 @@ sycl::event gemm_tree_k_impl(sycl::queue &exec_q, resTy *tmp = tmp_owner.get(); using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr ResIndexerT res_indexer{}; + static constexpr ResIndexerT res_indexer{}; sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step< lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, @@ -3610,7 +3613,7 @@ sycl::event gemm_tree_k_impl(sycl::queue &exec_q, partially_reduced_tmp + reduction_nelems * iter_nelems; using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr ResIndexerT res_indexer{}; + static constexpr ResIndexerT res_indexer{}; sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step< lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, @@ -3652,7 +3655,7 @@ sycl::event gemm_tree_nm_impl(sycl::queue &exec_q, const ssize_t *res_shapes_strides, const std::vector &depends) { - constexpr int wi_delta_n = 2; + static constexpr int wi_delta_n = 2; std::size_t wg_delta_n(16); // rows of A processed in WG std::size_t wg_delta_m(16); // rows of B processed in WG std::size_t wi_delta_k(64); // Elements in K dimension processed by WI @@ -3670,9 +3673,9 @@ sycl::event gemm_tree_nm_impl(sycl::queue &exec_q, ); using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; - constexpr BatchIndexerT batch_indexer{}; + static constexpr BatchIndexerT batch_indexer{}; - constexpr std::size_t single_batch_nelems = 1; + static constexpr std::size_t single_batch_nelems = 1; using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer; const OuterInnerDimsIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0, @@ -3697,7 +3700,7 @@ sycl::event gemm_tree_nm_impl(sycl::queue &exec_q, typename std::conditional, sycl::logical_or, sycl::plus>::type; - constexpr resTy identity_val = + static constexpr resTy identity_val = 
sycl::known_identity::value; std::size_t iter_nelems = n * m; @@ -3710,7 +3713,7 @@ sycl::event gemm_tree_nm_impl(sycl::queue &exec_q, dev.get_info(); std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr std::size_t preferred_reductions_per_wi = 8; + static constexpr std::size_t preferred_reductions_per_wi = 8; std::size_t reductions_per_wi(preferred_reductions_per_wi); std::size_t reduction_groups = @@ -3726,7 +3729,7 @@ sycl::event gemm_tree_nm_impl(sycl::queue &exec_q, resTy *tmp = tmp_owner.get(); using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr ResIndexerT res_indexer{}; + static constexpr ResIndexerT res_indexer{}; sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step< lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, @@ -3762,7 +3765,7 @@ sycl::event gemm_tree_nm_impl(sycl::queue &exec_q, partially_reduced_tmp + reduction_nelems * iter_nelems; using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr ResIndexerT res_indexer{}; + static constexpr ResIndexerT res_indexer{}; sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step< lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, @@ -3910,14 +3913,14 @@ sycl::event gemm_contig_tree_k_impl(sycl::queue &exec_q, ); using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr OuterInnerDimsIndexerT lhs_indexer{}; - constexpr OuterInnerDimsIndexerT rhs_indexer{}; - constexpr OuterInnerDimsIndexerT res_indexer{}; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT res_indexer{}; using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; - constexpr BatchIndexerT batch_indexer{}; + static constexpr BatchIndexerT batch_indexer{}; - constexpr std::size_t single_batch_nelems = 1; + static constexpr std::size_t single_batch_nelems = 1; sycl::event gemm_ev; if (k <= (delta_k * n_wi)) { @@ -3933,7 +3936,7 @@ sycl::event gemm_contig_tree_k_impl(sycl::queue &exec_q, typename std::conditional, sycl::logical_or, sycl::plus>::type; - constexpr resTy identity_val = + static constexpr resTy identity_val = sycl::known_identity::value; std::size_t iter_nelems = n * m; @@ -3947,7 +3950,7 @@ sycl::event gemm_contig_tree_k_impl(sycl::queue &exec_q, dev.get_info(); std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr std::size_t preferred_reductions_per_wi = 8; + static constexpr std::size_t preferred_reductions_per_wi = 8; std::size_t reductions_per_wi(preferred_reductions_per_wi); std::size_t reduction_groups = @@ -4029,7 +4032,7 @@ sycl::event gemm_contig_tree_nm_impl(sycl::queue &exec_q, std::size_t m, std::vector const &depends) { - constexpr int wi_delta_n = 2; + static constexpr int wi_delta_n = 2; std::size_t wg_delta_n(16); // rows of A processed in WG std::size_t wg_delta_m(16); // rows of B processed in WG std::size_t wi_delta_k(64); // Elements in K dimension processed by WI @@ -4047,14 +4050,14 @@ sycl::event gemm_contig_tree_nm_impl(sycl::queue &exec_q, ); using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - constexpr OuterInnerDimsIndexerT lhs_indexer{}; - constexpr OuterInnerDimsIndexerT rhs_indexer{}; - constexpr OuterInnerDimsIndexerT res_indexer{}; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT res_indexer{}; using BatchIndexerT = 
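Several hunks here hoist identity_val, obtained from sycl::known_identity, into a static constant. For illustration only, the trait names the neutral element of a reduction operator at compile time:

    #include <sycl/sycl.hpp>

    // The neutral element of plus is 0, of logical_or is false -- these are
    // the values the hoisted identity_val constants hold.
    static_assert(sycl::known_identity<sycl::plus<float>, float>::value ==
                  0.0f);
    static_assert(sycl::known_identity<sycl::logical_or<bool>, bool>::value ==
                  false);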
dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; - constexpr BatchIndexerT batch_indexer{}; + static constexpr BatchIndexerT batch_indexer{}; - constexpr std::size_t single_batch_nelems = 1; + static constexpr std::size_t single_batch_nelems = 1; // each group processes delta_k items in a column, // so no need to allocate temp memory if one group needed @@ -4072,7 +4075,7 @@ sycl::event gemm_contig_tree_nm_impl(sycl::queue &exec_q, typename std::conditional, sycl::logical_or, sycl::plus>::type; - constexpr resTy identity_val = + static constexpr resTy identity_val = sycl::known_identity::value; std::size_t iter_nelems = n * m; @@ -4085,7 +4088,7 @@ sycl::event gemm_contig_tree_nm_impl(sycl::queue &exec_q, dev.get_info(); std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr std::size_t preferred_reductions_per_wi = 8; + static constexpr std::size_t preferred_reductions_per_wi = 8; std::size_t reductions_per_wi(preferred_reductions_per_wi); std::size_t reduction_groups = @@ -4175,7 +4178,7 @@ sycl::event gemm_contig_tree_impl(sycl::queue &exec_q, const std::size_t max_nm = std::max(n, m); if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) { - constexpr std::size_t single_batch_nelems = 1; + static constexpr std::size_t single_batch_nelems = 1; return gemm_batch_nm_contig_impl( exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m, depends); diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index f056d246c9..7251e3fad9 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -768,7 +768,8 @@ sycl::event reduction_over_group_with_atomics_strided_impl( const argTy *arg_tp = reinterpret_cast(arg_cp); resTy *res_tp = reinterpret_cast(res_cp); - constexpr resTy identity_val = su_ns::Identity::value; + static constexpr resTy identity_val = + su_ns::Identity::value; const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); @@ -825,7 +826,7 @@ sycl::event reduction_over_group_with_atomics_strided_impl( const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, reduction_shape_stride}; - constexpr std::size_t preferred_reductions_per_wi = 8; + static constexpr std::size_t preferred_reductions_per_wi = 8; std::size_t reductions_per_wi = (reduction_nelems < preferred_reductions_per_wi * wg) ? 
std::max(1, (reduction_nelems + wg - 1) / wg) @@ -879,7 +880,8 @@ sycl::event reduction_axis1_over_group_with_atomics_contig_impl( iter_arg_offset + reduction_arg_offset; resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; - constexpr resTy identity_val = su_ns::Identity::value; + static constexpr resTy identity_val = + su_ns::Identity::value; const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); @@ -897,7 +899,7 @@ sycl::event reduction_axis1_over_group_with_atomics_contig_impl( InputIterIndexerT{/* size */ iter_nelems, /* step */ reduction_nelems}, NoOpIndexerT{}}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; sycl::event comp_ev = sequential_reduction(1, (reduction_nelems + wg - 1) / wg) @@ -968,7 +970,8 @@ sycl::event reduction_axis0_over_group_with_atomics_contig_impl( iter_arg_offset + reduction_arg_offset; resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; - constexpr resTy identity_val = su_ns::Identity::value; + static constexpr resTy identity_val = + su_ns::Identity::value; const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); @@ -1007,14 +1010,14 @@ sycl::event reduction_axis0_over_group_with_atomics_contig_impl( NoOpIndexerT, NoOpIndexerT>; using ReductionIndexerT = ColsIndexerT; - constexpr NoOpIndexerT columns_indexer{}; - constexpr NoOpIndexerT result_indexer{}; + static constexpr NoOpIndexerT columns_indexer{}; + static constexpr NoOpIndexerT result_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, result_indexer}; const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, /* step */ iter_nelems}; - constexpr std::size_t preferred_reductions_per_wi = 8; + static constexpr std::size_t preferred_reductions_per_wi = 8; std::size_t reductions_per_wi = (reduction_nelems < preferred_reductions_per_wi * wg) ? 
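The reductions_per_wi selection recurring in these reduction hunks follows one clamp; a sketch assuming preferred_reductions_per_wi == 8 (some paths use 4):

    #include <algorithm>
    #include <cstddef>

    // If the whole reduction fits in one work-group's worth of work-items,
    // shrink the per-item count to the ceiling of nelems / wg; otherwise
    // keep the preferred value.
    std::size_t pick_reductions_per_wi(std::size_t reduction_nelems,
                                       std::size_t wg)
    {
        constexpr std::size_t preferred = 8;
        return (reduction_nelems < preferred * wg)
                   ? std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg)
                   : preferred;
    }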
std::max(1, (reduction_nelems + wg - 1) / wg) @@ -1144,7 +1147,8 @@ sycl::event reduction_over_group_temps_strided_impl( const argTy *arg_tp = reinterpret_cast(arg_cp); resTy *res_tp = reinterpret_cast(res_cp); - constexpr resTy identity_val = su_ns::Identity::value; + static constexpr resTy identity_val = + su_ns::Identity::value; if (reduction_nelems == 0) { sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { @@ -1196,7 +1200,7 @@ sycl::event reduction_over_group_temps_strided_impl( return comp_ev; } - constexpr std::size_t preferred_reductions_per_wi = 8; + static constexpr std::size_t preferred_reductions_per_wi = 8; // prevents running out of resources on CPU std::size_t max_wg = reduction_detail::get_work_group_size(d); @@ -1272,7 +1276,7 @@ sycl::event reduction_over_group_temps_strided_impl( // inp_indexer const InputIndexerT inp_indexer(iter_nd, iter_arg_offset, iter_shape_and_strides); - constexpr ResIndexerT noop_tmp_indexer{}; + static constexpr ResIndexerT noop_tmp_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, noop_tmp_indexer}; @@ -1317,11 +1321,11 @@ sycl::event reduction_over_group_temps_strided_impl( const InputIndexerT inp_indexer{/* size */ iter_nelems, /* step */ reduction_groups_}; - constexpr ResIndexerT res_iter_indexer{}; + static constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{ inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; partial_reduction_ev = submit_no_atomic_reduction< resTy, resTy, ReductionOpT, InputOutputIterIndexerT, @@ -1354,7 +1358,7 @@ sycl::event reduction_over_group_temps_strided_impl( const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; wg = max_wg; reductions_per_wi = std::max( @@ -1401,7 +1405,8 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( iter_arg_offset + reduction_arg_offset; resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; - constexpr resTy identity_val = su_ns::Identity::value; + static constexpr resTy identity_val = + su_ns::Identity::value; if (reduction_nelems == 0) { sycl::event res_init_ev = exec_q.fill( @@ -1426,7 +1431,7 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( InputIterIndexerT{/* size */ iter_nelems, /* step */ reduction_nelems}, NoOpIndexerT{}}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; sycl::event comp_ev = sequential_reduction( @@ -1631,7 +1636,8 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( iter_arg_offset + reduction_arg_offset; resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; - constexpr resTy identity_val = su_ns::Identity::value; + static constexpr resTy identity_val = + su_ns::Identity::value; if (reduction_nelems == 0) { sycl::event res_init_ev = exec_q.fill( @@ -1667,7 +1673,7 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( return comp_ev; } - constexpr std::size_t preferred_reductions_per_wi = 8; + static constexpr std::size_t preferred_reductions_per_wi = 8; // prevents running out of resources on CPU std::size_t max_wg = reduction_detail::get_work_group_size(d); @@ -1683,8 +1689,8 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( NoOpIndexerT, NoOpIndexerT>; using ReductionIndexerT = ColsIndexerT; - constexpr NoOpIndexerT columns_indexer{}; - 
constexpr NoOpIndexerT result_indexer{}; + static constexpr NoOpIndexerT columns_indexer{}; + static constexpr NoOpIndexerT result_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, result_indexer}; const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, @@ -1741,8 +1747,8 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( NoOpIndexerT, NoOpIndexerT>; using ReductionIndexerT = ColsIndexerT; - constexpr NoOpIndexerT columns_indexer{}; - constexpr NoOpIndexerT noop_tmp_indexer{}; + static constexpr NoOpIndexerT columns_indexer{}; + static constexpr NoOpIndexerT noop_tmp_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, noop_tmp_indexer}; const ReductionIndexerT reduction_indexer{ @@ -1783,11 +1789,11 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( const InputIndexerT inp_indexer{/* size */ iter_nelems, /* step */ reduction_groups_}; - constexpr ResIndexerT res_iter_indexer{}; + static constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; sycl::event partial_reduction_ev = submit_no_atomic_reduction< resTy, resTy, ReductionOpT, InputOutputIterIndexerT, @@ -1812,11 +1818,11 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( const InputIndexerT inp_indexer{/* size */ iter_nelems, /* step */ remaining_reduction_nelems}; - constexpr ResIndexerT res_iter_indexer{}; + static constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; wg = max_wg; reductions_per_wi = std::max( @@ -2508,8 +2514,10 @@ sycl::event search_over_group_temps_strided_impl( const argTy *arg_tp = reinterpret_cast(arg_cp); resTy *res_tp = reinterpret_cast(res_cp); - constexpr argTy identity_val = su_ns::Identity::value; - constexpr resTy idx_identity_val = su_ns::Identity::value; + static constexpr argTy identity_val = + su_ns::Identity::value; + static constexpr resTy idx_identity_val = + su_ns::Identity::value; if (reduction_nelems == 0) { sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { @@ -2567,7 +2575,7 @@ sycl::event search_over_group_temps_strided_impl( return comp_ev; } - constexpr std::size_t preferred_reductions_per_wi = 4; + static constexpr std::size_t preferred_reductions_per_wi = 4; // prevents running out of resources on CPU std::size_t max_wg = reduction_detail::get_work_group_size(d); @@ -2651,7 +2659,7 @@ sycl::event search_over_group_temps_strided_impl( // to be accessed by inp_indexer const InputIndexerT inp_indexer(iter_nd, iter_arg_offset, iter_shape_and_strides); - constexpr ResIndexerT noop_tmp_indexer{}; + static constexpr ResIndexerT noop_tmp_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, noop_tmp_indexer}; @@ -2698,11 +2706,11 @@ sycl::event search_over_group_temps_strided_impl( const InputIndexerT inp_indexer{/* size */ iter_nelems, /* step */ reduction_groups_}; - constexpr ResIndexerT res_iter_indexer{}; + static constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; sycl::event partial_reduction_ev = submit_search_reduction( @@ 
-2800,8 +2808,10 @@ sycl::event search_axis1_over_group_temps_contig_impl( iter_arg_offset + reduction_arg_offset; resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; - constexpr argTy identity_val = su_ns::Identity::value; - constexpr resTy idx_identity_val = su_ns::Identity::value; + static constexpr argTy identity_val = + su_ns::Identity::value; + static constexpr resTy idx_identity_val = + su_ns::Identity::value; if (reduction_nelems == 0) { sycl::event res_init_ev = exec_q.fill( @@ -2826,7 +2836,7 @@ sycl::event search_axis1_over_group_temps_contig_impl( InputIterIndexerT{/* size */ iter_nelems, /* step */ reduction_nelems}, NoOpIndexerT{}}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -2846,7 +2856,7 @@ sycl::event search_axis1_over_group_temps_contig_impl( return comp_ev; } - constexpr std::size_t preferred_reductions_per_wi = 8; + static constexpr std::size_t preferred_reductions_per_wi = 8; // prevents running out of resources on CPU std::size_t max_wg = reduction_detail::get_work_group_size(d); @@ -2865,7 +2875,7 @@ sycl::event search_axis1_over_group_temps_contig_impl( InputIterIndexerT{/* size */ iter_nelems, /* step */ reduction_nelems}, NoOpIndexerT{}}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; if (iter_nelems == 1) { // increase GPU occupancy @@ -2930,7 +2940,7 @@ sycl::event search_axis1_over_group_temps_contig_impl( InputIterIndexerT{/* size */ iter_nelems, /* step */ reduction_nelems}, NoOpIndexerT{}}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; first_reduction_ev = submit_search_reduction( @@ -3060,8 +3070,10 @@ sycl::event search_axis0_over_group_temps_contig_impl( iter_arg_offset + reduction_arg_offset; resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; - constexpr argTy identity_val = su_ns::Identity::value; - constexpr resTy idx_identity_val = su_ns::Identity::value; + static constexpr argTy identity_val = + su_ns::Identity::value; + static constexpr resTy idx_identity_val = + su_ns::Identity::value; if (reduction_nelems == 0) { sycl::event res_init_ev = exec_q.fill( @@ -3109,7 +3121,7 @@ sycl::event search_axis0_over_group_temps_contig_impl( return comp_ev; } - constexpr std::size_t preferred_reductions_per_wi = 8; + static constexpr std::size_t preferred_reductions_per_wi = 8; // prevents running out of resources on CPU std::size_t max_wg = reduction_detail::get_work_group_size(d); @@ -3124,8 +3136,8 @@ sycl::event search_axis0_over_group_temps_contig_impl( NoOpIndexerT, NoOpIndexerT>; using ReductionIndexerT = ColsIndexerT; - constexpr NoOpIndexerT columns_indexer{}; - constexpr NoOpIndexerT result_indexer{}; + static constexpr NoOpIndexerT columns_indexer{}; + static constexpr NoOpIndexerT result_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, result_indexer}; const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, @@ -3190,8 +3202,8 @@ sycl::event search_axis0_over_group_temps_contig_impl( NoOpIndexerT, NoOpIndexerT>; using ReductionIndexerT = ColsIndexerT; - constexpr NoOpIndexerT columns_indexer{}; - constexpr NoOpIndexerT result_indexer{}; + static constexpr NoOpIndexerT columns_indexer{}; + static constexpr NoOpIndexerT result_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, 
result_indexer}; const ReductionIndexerT reduction_indexer{ @@ -3238,11 +3250,11 @@ sycl::event search_axis0_over_group_temps_contig_impl( const InputIndexerT inp_indexer{/* size */ iter_nelems, /* step */ reduction_groups_}; - constexpr ResIndexerT res_iter_indexer{}; + static constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; - constexpr ReductionIndexerT reduction_indexer{}; + static constexpr ReductionIndexerT reduction_indexer{}; sycl::event partial_reduction_ev = submit_search_reduction( diff --git a/dpctl/tensor/libtensor/include/kernels/repeat.hpp b/dpctl/tensor/libtensor/include/kernels/repeat.hpp index 54d8ff1797..2d7b897ace 100644 --- a/dpctl/tensor/libtensor/include/kernels/repeat.hpp +++ b/dpctl/tensor/libtensor/include/kernels/repeat.hpp @@ -235,7 +235,7 @@ sycl::event repeat_by_sequence_1d_impl(sycl::queue &q, T *dst_tp = reinterpret_cast(dst_cp); // orthog ndim indexer - constexpr TwoZeroOffsets_Indexer orthog_indexer{}; + static constexpr TwoZeroOffsets_Indexer orthog_indexer{}; // indexers along repeated axis const StridedIndexer src_indexer{src_nd, 0, src_shape_strides}; const Strided1DIndexer dst_indexer{/* size */ dst_shape, @@ -422,7 +422,7 @@ sycl::event repeat_by_scalar_1d_impl(sycl::queue &q, T *dst_tp = reinterpret_cast(dst_cp); // orthog ndim indexer - constexpr TwoZeroOffsets_Indexer orthog_indexer{}; + static constexpr TwoZeroOffsets_Indexer orthog_indexer{}; // indexers along repeated axis const StridedIndexer src_indexer(src_nd, 0, src_shape_strides); const Strided1DIndexer dst_indexer{/* size */ dst_shape, diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp index ea4cd085cc..504b47c5f9 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp @@ -408,7 +408,7 @@ sort_over_work_group_contig_impl(sycl::queue &q, const std::uint64_t nelems_per_slm = (device_local_memory_size - safety_margin) / (2 * sizeof(T)); - constexpr std::uint32_t sub_groups_per_work_group = 4; + static constexpr std::uint32_t sub_groups_per_work_group = 4; const std::uint32_t elems_per_wi = dev.has(sycl::aspect::cpu) ? 
8 : 2;
 
     const std::size_t lws = sub_groups_per_work_group * max_sg_size;
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
index 8af2bc5923..3f4314ea84 100644
--- a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
@@ -143,13 +143,13 @@ std::make_unsigned_t<IntT> order_preserving_cast(IntT val)
     if constexpr (is_ascending) {
         // ascending_mask: 100..0
-        constexpr UIntT ascending_mask =
+        static constexpr UIntT ascending_mask =
             (UIntT(1) << std::numeric_limits<IntT>::digits);
         return (uint_val ^ ascending_mask);
     }
     else {
         // descending_mask: 011..1
-        constexpr UIntT descending_mask =
+        static constexpr UIntT descending_mask =
             (std::numeric_limits<UIntT>::max() >> 1);
         return (uint_val ^ descending_mask);
     }
 
@@ -167,11 +167,11 @@ template <bool is_ascending> std::uint16_t order_preserving_cast(sycl::half val)
     // test the sign bit of the original value
     const bool zero_fp_sign_bit = (UIntT(0) == (uint_val >> 15));
 
-    constexpr UIntT zero_mask = UIntT(0x8000u);
-    constexpr UIntT nonzero_mask = UIntT(0xFFFFu);
+    static constexpr UIntT zero_mask = UIntT(0x8000u);
+    static constexpr UIntT nonzero_mask = UIntT(0xFFFFu);
 
-    constexpr UIntT inv_zero_mask = static_cast<UIntT>(~zero_mask);
-    constexpr UIntT inv_nonzero_mask = static_cast<UIntT>(~nonzero_mask);
+    static constexpr UIntT inv_zero_mask = static_cast<UIntT>(~zero_mask);
+    static constexpr UIntT inv_nonzero_mask = static_cast<UIntT>(~nonzero_mask);
 
     if constexpr (is_ascending) {
         mask = (zero_fp_sign_bit) ? zero_mask : nonzero_mask;
@@ -200,8 +200,8 @@ std::uint32_t order_preserving_cast(FloatT val)
     // test the sign bit of the original value
     const bool zero_fp_sign_bit = (UIntT(0) == (uint_val >> 31));
 
-    constexpr UIntT zero_mask = UIntT(0x80000000u);
-    constexpr UIntT nonzero_mask = UIntT(0xFFFFFFFFu);
+    static constexpr UIntT zero_mask = UIntT(0x80000000u);
+    static constexpr UIntT nonzero_mask = UIntT(0xFFFFFFFFu);
 
     if constexpr (is_ascending)
         mask = (zero_fp_sign_bit) ? zero_mask : nonzero_mask;
@@ -227,8 +227,8 @@ std::uint64_t order_preserving_cast(FloatT val)
     // test the sign bit of the original value
     const bool zero_fp_sign_bit = (UIntT(0) == (uint_val >> 63));
 
-    constexpr UIntT zero_mask = UIntT(0x8000000000000000u);
-    constexpr UIntT nonzero_mask = UIntT(0xFFFFFFFFFFFFFFFFu);
+    static constexpr UIntT zero_mask = UIntT(0x8000000000000000u);
+    static constexpr UIntT nonzero_mask = UIntT(0xFFFFFFFFFFFFFFFFu);
 
     if constexpr (is_ascending)
         mask = (zero_fp_sign_bit) ?
zero_mask : nonzero_mask; @@ -290,8 +290,9 @@ radix_sort_count_submit(sycl::queue &exec_q, const std::vector &dependency_events) { // bin_count = radix_states used for an array storing bucket state counters - constexpr std::uint32_t radix_states = (std::uint32_t(1) << radix_bits); - constexpr std::uint32_t radix_mask = radix_states - 1; + static constexpr std::uint32_t radix_states = + (std::uint32_t(1) << radix_bits); + static constexpr std::uint32_t radix_mask = radix_states - 1; // iteration space info const std::size_t n = n_values; @@ -444,7 +445,8 @@ sycl::event radix_sort_scan_submit(sycl::queue &exec_q, const std::size_t scan_size = n_segments + 1; wg_size = std::min(scan_size, wg_size); - constexpr std::uint32_t radix_states = std::uint32_t(1) << radix_bits; + static constexpr std::uint32_t radix_states = std::uint32_t(1) + << radix_bits; // compilation of the kernel prevents out of resources issue, which may // occur due to usage of collective algorithms such as joint_exclusive_scan @@ -497,8 +499,8 @@ struct empty_storage // Number with `n` least significant bits of uint32_t inline std::uint32_t n_ls_bits_set(std::uint32_t n) noexcept { - constexpr std::uint32_t zero{}; - constexpr std::uint32_t all_bits_set = ~zero; + static constexpr std::uint32_t zero{}; + static constexpr std::uint32_t all_bits_set = ~zero; return ~(all_bits_set << n); } @@ -716,8 +718,9 @@ radix_sort_reorder_submit(sycl::queue &exec_q, using ValueT = InputT; using PeerHelper = peer_prefix_helper; - constexpr std::uint32_t radix_states = std::uint32_t{1} << radix_bits; - constexpr std::uint32_t radix_mask = radix_states - 1; + static constexpr std::uint32_t radix_states = std::uint32_t{1} + << radix_bits; + static constexpr std::uint32_t radix_mask = radix_states - 1; const std::size_t elems_per_segment = (n_values + n_segments - 1) / n_segments; @@ -777,7 +780,7 @@ radix_sort_reorder_submit(sycl::queue &exec_q, OffsetT scanned_bin = 0; /* find cumulative offset */ - constexpr std::uint32_t zero_radix_state_id = 0; + static constexpr std::uint32_t zero_radix_state_id = 0; offset_arr[zero_radix_state_id] = b_offset_ptr[segment_id]; for (std::uint32_t radix_state_id = 1; @@ -980,11 +983,12 @@ struct parallel_radix_sort_iteration_step exec_q.get_device() .template get_info(); - constexpr std::size_t two_mils = (std::size_t(1) << 21); + static constexpr std::size_t two_mils = (std::size_t(1) << 21); std::size_t count_wg_size = ((max_sg_size > 0) && (n_values > two_mils) ? 
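The masks touched in the order_preserving_cast hunks implement the usual radix-sort key trick: map values to unsigned keys whose unsigned order matches the source order. A host-side restatement for the ascending 32-bit cases (int_key and float_key are hypothetical names, not dpctl functions):

    #include <cstdint>
    #include <cstring>

    std::uint32_t int_key(std::int32_t v)
    {
        std::uint32_t u;
        std::memcpy(&u, &v, sizeof(u));
        // ascending_mask 100..0 flips the sign bit, so INT32_MIN -> 0x0,
        // -1 -> 0x7FFFFFFF, 0 -> 0x80000000, INT32_MAX -> 0xFFFFFFFF
        return u ^ (std::uint32_t(1) << 31);
    }

    std::uint32_t float_key(float v)
    {
        std::uint32_t u;
        std::memcpy(&u, &v, sizeof(u));
        const bool non_negative = ((u >> 31) == 0u);
        // positives: set the sign bit (zero_mask); negatives: flip every
        // bit (nonzero_mask), reversing their order so unsigned comparison
        // of keys matches IEEE-754 ordering
        return u ^ (non_negative ? 0x80000000u : 0xFFFFFFFFu);
    }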
128 : max_sg_size); - constexpr std::uint32_t radix_states = std::uint32_t(1) << radix_bits; + static constexpr std::uint32_t radix_states = std::uint32_t(1) + << radix_bits; // correct count_wg_size according to local memory limit in count phase const auto max_count_wg_size = _slm_adjusted_work_group_size( @@ -1022,13 +1026,14 @@ struct parallel_radix_sort_iteration_step sycl::event reorder_ev{}; // subgroup_ballot-based peer algo uses extract_bits to populate // uint32_t mask and hence relies on sub-group to be 32 or narrower - constexpr std::size_t sg32_v = 32u; - constexpr std::size_t sg16_v = 16u; - constexpr std::size_t sg08_v = 8u; + static constexpr std::size_t sg32_v = 32u; + static constexpr std::size_t sg16_v = 16u; + static constexpr std::size_t sg08_v = 8u; if (sg32_v == reorder_sg_size || sg16_v == reorder_sg_size || sg08_v == reorder_sg_size) { - constexpr auto peer_algorithm = peer_prefix_algo::subgroup_ballot; + static constexpr auto peer_algorithm = + peer_prefix_algo::subgroup_ballot; reorder_ev = radix_sort_reorder_submit<_RadixReorderPeerKernel, radix_bits, peer_algorithm>( @@ -1037,7 +1042,7 @@ struct parallel_radix_sort_iteration_step {scan_ev}); } else { - constexpr auto peer_algorithm = + static constexpr auto peer_algorithm = peer_prefix_algo::scan_then_broadcast; reorder_ev = radix_sort_reorder_submit<_RadixReorderKernel, @@ -1089,7 +1094,7 @@ struct subgroup_radix_sort using _SortKernelGlob = radix_sort_one_wg_krn; - constexpr std::size_t max_concurrent_work_groups = 128U; + static constexpr std::size_t max_concurrent_work_groups = 128U; // Choose this to occupy the entire accelerator const std::size_t n_work_groups = @@ -1104,8 +1109,8 @@ struct subgroup_radix_sort switch (SLM_availability) { case temp_allocations::both_in_slm: { - constexpr auto storage_for_values = use_slm_tag{}; - constexpr auto storage_for_counters = use_slm_tag{}; + static constexpr auto storage_for_values = use_slm_tag{}; + static constexpr auto storage_for_counters = use_slm_tag{}; return one_group_submitter<_SortKernelLoc>()( exec_q, n_iters, n_iters, n_to_sort, input_ptr, output_ptr, @@ -1114,8 +1119,8 @@ struct subgroup_radix_sort } case temp_allocations::counters_in_slm: { - constexpr auto storage_for_values = use_global_mem_tag{}; - constexpr auto storage_for_counters = use_slm_tag{}; + static constexpr auto storage_for_values = use_global_mem_tag{}; + static constexpr auto storage_for_counters = use_slm_tag{}; return one_group_submitter<_SortKernelPartGlob>()( exec_q, n_iters, n_batch_size, n_to_sort, input_ptr, output_ptr, @@ -1124,8 +1129,8 @@ struct subgroup_radix_sort } default: { - constexpr auto storage_for_values = use_global_mem_tag{}; - constexpr auto storage_for_counters = use_global_mem_tag{}; + static constexpr auto storage_for_values = use_global_mem_tag{}; + static constexpr auto storage_for_counters = use_global_mem_tag{}; return one_group_submitter<_SortKernelGlob>()( exec_q, n_iters, n_batch_size, n_to_sort, input_ptr, output_ptr, @@ -1186,7 +1191,7 @@ struct subgroup_radix_sort // the kernel is designed for data size <= 64K assert(n <= (SizeT(1) << 16)); - constexpr auto req_slm_size_counters = + static constexpr auto req_slm_size_counters = counter_buf_sz * sizeof(std::uint16_t); const auto &dev = exec_q.get_device(); @@ -1319,7 +1324,7 @@ struct subgroup_radix_sort std::uint16_t wi = ndit.get_local_linear_id(); std::uint16_t begin_bit = 0; - constexpr std::uint16_t end_bit = + static constexpr std::uint16_t end_bit = number_of_bits_in_type(); // copy from input 
array into values @@ -1358,12 +1363,12 @@ struct subgroup_radix_sort { const std::uint16_t id = wi * block_size + i; - constexpr std::uint16_t bin_mask = - bin_count - 1; + static constexpr std::uint16_t + bin_mask = bin_count - 1; // points to the padded element, i.e. id // is in-range - constexpr std::uint16_t + static constexpr std::uint16_t default_out_of_range_bin_id = bin_mask; @@ -1395,12 +1400,12 @@ struct subgroup_radix_sort { const std::uint16_t id = wi * block_size + i; - constexpr std::uint16_t bin_mask = - bin_count - 1; + static constexpr std::uint16_t + bin_mask = bin_count - 1; // points to the padded element, i.e. id // is in-range - constexpr std::uint16_t + static constexpr std::uint16_t default_out_of_range_bin_id = bin_mask; @@ -1549,7 +1554,7 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, // radix bits represent number of processed bits in each value during one // iteration - constexpr std::uint32_t radix_bits = 4; + static constexpr std::uint32_t radix_bits = 4; sycl::event sort_ev{}; @@ -1557,14 +1562,14 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, const auto max_wg_size = dev.template get_info(); - constexpr std::uint16_t ref_wg_size = 64; + static constexpr std::uint16_t ref_wg_size = 64; if (n_to_sort <= 16384 && ref_wg_size * 8 <= max_wg_size) { using _RadixSortKernel = OneWorkGroupRadixSortKernel; if (n_to_sort <= 64 && ref_wg_size <= max_wg_size) { // wg_size * block_size == 64 * 1 * 1 == 64 - constexpr std::uint16_t wg_size = ref_wg_size; - constexpr std::uint16_t block_size = 1; + static constexpr std::uint16_t wg_size = ref_wg_size; + static constexpr std::uint16_t block_size = 1; sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, radix_bits>{}( @@ -1573,8 +1578,8 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, } else if (n_to_sort <= 128 && ref_wg_size * 2 <= max_wg_size) { // wg_size * block_size == 64 * 2 * 1 == 128 - constexpr std::uint16_t wg_size = ref_wg_size * 2; - constexpr std::uint16_t block_size = 1; + static constexpr std::uint16_t wg_size = ref_wg_size * 2; + static constexpr std::uint16_t block_size = 1; sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, radix_bits>{}( @@ -1583,8 +1588,8 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, } else if (n_to_sort <= 256 && ref_wg_size * 2 <= max_wg_size) { // wg_size * block_size == 64 * 2 * 2 == 256 - constexpr std::uint16_t wg_size = ref_wg_size * 2; - constexpr std::uint16_t block_size = 2; + static constexpr std::uint16_t wg_size = ref_wg_size * 2; + static constexpr std::uint16_t block_size = 2; sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, radix_bits>{}( @@ -1593,8 +1598,8 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, } else if (n_to_sort <= 512 && ref_wg_size * 2 <= max_wg_size) { // wg_size * block_size == 64 * 2 * 4 == 512 - constexpr std::uint16_t wg_size = ref_wg_size * 2; - constexpr std::uint16_t block_size = 4; + static constexpr std::uint16_t wg_size = ref_wg_size * 2; + static constexpr std::uint16_t block_size = 4; sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, radix_bits>{}( @@ -1603,8 +1608,8 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, } else if (n_to_sort <= 1024 && ref_wg_size * 2 <= max_wg_size) { // wg_size * block_size == 64 * 2 * 8 == 1024 - constexpr std::uint16_t wg_size = ref_wg_size * 2; - constexpr std::uint16_t block_size = 8; + static constexpr std::uint16_t wg_size = ref_wg_size * 2; + static constexpr 
std::uint16_t block_size = 8; sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, radix_bits>{}( @@ -1613,8 +1618,8 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, } else if (n_to_sort <= 2048 && ref_wg_size * 4 <= max_wg_size) { // wg_size * block_size == 64 * 4 * 8 == 2048 - constexpr std::uint16_t wg_size = ref_wg_size * 4; - constexpr std::uint16_t block_size = 8; + static constexpr std::uint16_t wg_size = ref_wg_size * 4; + static constexpr std::uint16_t block_size = 8; sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, radix_bits>{}( @@ -1623,8 +1628,8 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, } else if (n_to_sort <= 4096 && ref_wg_size * 4 <= max_wg_size) { // wg_size * block_size == 64 * 4 * 16 == 4096 - constexpr std::uint16_t wg_size = ref_wg_size * 4; - constexpr std::uint16_t block_size = 16; + static constexpr std::uint16_t wg_size = ref_wg_size * 4; + static constexpr std::uint16_t block_size = 16; sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, radix_bits>{}( @@ -1633,8 +1638,8 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, } else if (n_to_sort <= 8192 && ref_wg_size * 8 <= max_wg_size) { // wg_size * block_size == 64 * 8 * 16 == 8192 - constexpr std::uint16_t wg_size = ref_wg_size * 8; - constexpr std::uint16_t block_size = 16; + static constexpr std::uint16_t wg_size = ref_wg_size * 8; + static constexpr std::uint16_t block_size = 16; sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, radix_bits>{}( @@ -1643,8 +1648,8 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, } else { // wg_size * block_size == 64 * 8 * 32 == 16384 - constexpr std::uint16_t wg_size = ref_wg_size * 8; - constexpr std::uint16_t block_size = 32; + static constexpr std::uint16_t wg_size = ref_wg_size * 8; + static constexpr std::uint16_t block_size = 32; sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, radix_bits>{}( @@ -1653,12 +1658,13 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, } } else { - constexpr std::uint32_t radix_iters = + static constexpr std::uint32_t radix_iters = number_of_buckets_in_type(radix_bits); - constexpr std::uint32_t radix_states = std::uint32_t(1) << radix_bits; + static constexpr std::uint32_t radix_states = std::uint32_t(1) + << radix_bits; - constexpr std::size_t bound_512k = (std::size_t(1) << 19); - constexpr std::size_t bound_2m = (std::size_t(1) << 21); + static constexpr std::size_t bound_512k = (std::size_t(1) << 19); + static constexpr std::size_t bound_2m = (std::size_t(1) << 21); const auto wg_sz_k = (n_to_sort < bound_512k) ? 8 : (n_to_sort <= bound_2m) ? 
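The dispatch ladder above pairs wg_size and block_size so that their product reaches each n_to_sort threshold. A hypothetical summary (pick_one_wg_config is not a dpctl function; the real code also checks each ref_wg_size multiple against the device's max work-group size):

    #include <cstddef>
    #include <cstdint>

    struct OneWgConfig
    {
        std::uint16_t wg_size, block_size;
    };

    OneWgConfig pick_one_wg_config(std::size_t n_to_sort)
    {
        constexpr std::uint16_t ref = 64;                 // ref_wg_size
        if (n_to_sort <= 64)   return {ref, 1};           //  64 * 1
        if (n_to_sort <= 128)  return {ref * 2, 1};       // 128 * 1
        if (n_to_sort <= 256)  return {ref * 2, 2};       // 128 * 2
        if (n_to_sort <= 512)  return {ref * 2, 4};       // 128 * 4
        if (n_to_sort <= 1024) return {ref * 2, 8};       // 128 * 8
        if (n_to_sort <= 2048) return {ref * 4, 8};       // 256 * 8
        if (n_to_sort <= 4096) return {ref * 4, 16};      // 256 * 16
        if (n_to_sort <= 8192) return {ref * 8, 16};      // 512 * 16
        return {ref * 8, 32};                             // 512 * 32 == 16384
    }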
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/searchsorted.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/searchsorted.hpp
index ddf61471c6..6042f23e03 100644
--- a/dpctl/tensor/libtensor/include/kernels/sorting/searchsorted.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/sorting/searchsorted.hpp
@@ -87,7 +87,7 @@ struct SearchSortedFunctor
         // position of the needle_v in the hay array
         indTy pos{};

-        constexpr std::size_t zero(0);
+        static constexpr std::size_t zero(0);
         if constexpr (left_side) {
             // search in hay in left-closed interval, give `pos` such that
             // hay[pos - 1] < needle_v <= hay[pos]
@@ -155,9 +155,9 @@ sycl::event searchsorted_contig_impl(sycl::queue &exec_q,

         using TrivialIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;

-        constexpr TrivialIndexerT hay_indexer{};
-        constexpr TrivialIndexerT needles_indexer{};
-        constexpr TrivialIndexerT positions_indexer{};
+        static constexpr TrivialIndexerT hay_indexer{};
+        static constexpr TrivialIndexerT needles_indexer{};
+        static constexpr TrivialIndexerT positions_indexer{};

         const auto fnctr = SearchSortedFunctor
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp
--- a/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp
                      &dependent_events)
 {
-    constexpr std::uint32_t lws = 256;
-    constexpr std::uint32_t n_wi = 4;
+    static constexpr std::uint32_t lws = 256;
+    static constexpr std::uint32_t n_wi = 4;

     const std::size_t n_groups = (nelems + n_wi * lws - 1) / (n_wi * lws);

     sycl::range<1> gRange{n_groups * lws};
@@ -72,7 +72,7 @@ sycl::event iota_impl(sycl::queue &exec_q,
             }

             if (offset + n_wi * max_sgSize < nelems) {
-                constexpr auto group_ls_props = syclexp::properties{
+                static constexpr auto group_ls_props = syclexp::properties{
                     syclexp::data_placement_striped
                     // , syclexp::full_group
                 };
@@ -105,8 +105,8 @@ sycl::event map_back_impl(sycl::queue &exec_q,
                           std::size_t row_size,
                           const std::vector &dependent_events)
 {
-    constexpr std::uint32_t lws = 64;
-    constexpr std::uint32_t n_wi = 4;
+    static constexpr std::uint32_t lws = 64;
+    static constexpr std::uint32_t n_wi = 4;

     const std::size_t n_groups = (nelems + lws * n_wi - 1) / (n_wi * lws);

     sycl::range<1> lRange{lws};
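The group_ls_props hunk above swaps a function-local constexpr object of class type for a static constexpr one. A minimal illustration of the distinction, with a hypothetical Properties type standing in for syclexp::properties: a plain constexpr local has automatic storage duration and may be re-materialized on every call, while static constexpr guarantees a single object with static storage duration.

    // Hypothetical stand-in for a compile-time property list such as
    // syclexp::properties{syclexp::data_placement_striped}.
    struct Properties {
        int data_placement;
    };

    inline int placement_of_loads()
    {
        // `constexpr` alone: automatic storage, potentially re-created on
        // each call; `static constexpr`: static storage, one object with
        // one address for the whole program.
        static constexpr Properties group_ls_props{1};
        return group_ls_props.data_placement;
    }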
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp
index fc4e08064d..558cdfc167 100644
--- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp
@@ -83,8 +83,8 @@ sycl::event write_out_impl(sycl::queue &exec_q,
                            IndexTy *inds_tp,
                            const std::vector &depends)
 {
-    constexpr std::uint32_t lws = 64;
-    constexpr std::uint32_t n_wi = 4;
+    static constexpr std::uint32_t lws = 64;
+    static constexpr std::uint32_t n_wi = 4;

     const std::size_t nelems = iter_nelems * k;
     const std::size_t n_groups = (nelems + lws * n_wi - 1) / (n_wi * lws);
@@ -252,7 +252,7 @@ sycl::event topk_merge_impl(
     const std::uint64_t nelems_per_slm =
         (device_local_memory_size - safety_margin) / (2 * sizeof(IndexTy));

-    constexpr std::uint32_t sub_groups_per_work_group = 4;
+    static constexpr std::uint32_t sub_groups_per_work_group = 4;
     const std::uint32_t elems_per_wi = dev.has(sycl::aspect::cpu) ? 8 : 2;

     std::size_t lws = sub_groups_per_work_group * max_sg_size;
diff --git a/dpctl/tensor/libtensor/include/kernels/where.hpp b/dpctl/tensor/libtensor/include/kernels/where.hpp
index 7c103e6ce4..e8309b359c 100644
--- a/dpctl/tensor/libtensor/include/kernels/where.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/where.hpp
@@ -88,7 +88,7 @@ class WhereContigFunctor

     void operator()(sycl::nd_item<1> ndit) const
     {
-        constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz;
+        static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz;

         using dpctl::tensor::type_utils::is_complex;
         if constexpr (!enable_sg_loadstore || is_complex::value ||
@@ -185,8 +185,8 @@ sycl::event where_contig_impl(sycl::queue &q,
         cgh.depends_on(depends);

         std::size_t lws = 64;
-        constexpr std::uint8_t vec_sz = 4u;
-        constexpr std::uint8_t n_vecs = 2u;
+        static constexpr std::uint8_t vec_sz = 4u;
+        static constexpr std::uint8_t n_vecs = 2u;
         const std::size_t n_groups =
             ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
         const auto gws_range = sycl::range<1>(n_groups * lws);
@@ -197,7 +197,7 @@ sycl::event where_contig_impl(sycl::queue &q,
             is_aligned(x2_cp) &&
             is_aligned(dst_cp))
         {
-            constexpr bool enable_sg_loadstore = true;
+            static constexpr bool enable_sg_loadstore = true;
             using KernelName = where_contig_kernel;

             cgh.parallel_for(
@@ -207,7 +207,7 @@ sycl::event where_contig_impl(sycl::queue &q,
                                    x2_tp, dst_tp));
         }
         else {
-            constexpr bool disable_sg_loadstore = false;
+            static constexpr bool disable_sg_loadstore = false;
             using InnerKernelName = where_contig_kernel;
             using KernelName =
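The launch geometry in where_contig_impl above is a ceiling division: every work-item covers n_vecs * vec_sz elements, so the group count rounds nelems up to the next multiple of lws * n_vecs * vec_sz. Restated as a standalone function (the function name is illustrative):

    #include <cstddef>
    #include <cstdint>

    // With vec_sz = 4 and n_vecs = 2, each work-item handles 8 elements,
    // so n_groups = ceil(nelems / (lws * 8)).
    inline std::size_t where_n_groups(std::size_t nelems, std::size_t lws)
    {
        static constexpr std::uint8_t vec_sz = 4u;
        static constexpr std::uint8_t n_vecs = 2u;
        return (nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz);
    }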
diff --git a/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp b/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp
index 2aae899d50..2ff6c0abe7 100644
--- a/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp
@@ -55,16 +55,18 @@ template struct WrapIndex
     ssize_t operator()(ssize_t max_item, IndT ind) const
     {
         ssize_t projected;
-        constexpr ssize_t unit(1);
+        static constexpr ssize_t unit(1);
         max_item = sycl::max(max_item, unit);

-        constexpr std::uintmax_t ind_max = std::numeric_limits::max();
-        constexpr std::uintmax_t ssize_max =
+        static constexpr std::uintmax_t ind_max =
+            std::numeric_limits::max();
+        static constexpr std::uintmax_t ssize_max =
             std::numeric_limits::max();

         if constexpr (std::is_signed_v) {
-            constexpr std::intmax_t ind_min = std::numeric_limits::min();
-            constexpr std::intmax_t ssize_min =
+            static constexpr std::intmax_t ind_min =
+                std::numeric_limits::min();
+            static constexpr std::intmax_t ssize_min =
                 std::numeric_limits::min();

             if constexpr (ind_max <= ssize_max && ind_min >= ssize_min) {
@@ -102,25 +104,27 @@ template struct ClipIndex
     ssize_t operator()(ssize_t max_item, IndT ind) const
     {
         ssize_t projected;
-        constexpr ssize_t unit(1);
+        static constexpr ssize_t unit(1);
         max_item = sycl::max(max_item, unit);

-        constexpr std::uintmax_t ind_max = std::numeric_limits::max();
-        constexpr std::uintmax_t ssize_max =
+        static constexpr std::uintmax_t ind_max =
+            std::numeric_limits::max();
+        static constexpr std::uintmax_t ssize_max =
             std::numeric_limits::max();

         if constexpr (std::is_signed_v) {
-            constexpr std::intmax_t ind_min = std::numeric_limits::min();
-            constexpr std::intmax_t ssize_min =
+            static constexpr std::intmax_t ind_min =
+                std::numeric_limits::min();
+            static constexpr std::intmax_t ssize_min =
                 std::numeric_limits::min();

             if constexpr (ind_max <= ssize_max && ind_min >= ssize_min) {
                 const ssize_t ind_ = static_cast(ind);
-                constexpr ssize_t lb(0);
+                static constexpr ssize_t lb(0);
                 const ssize_t ub = max_item - 1;
                 projected = sycl::clamp(ind_, lb, ub);
             }
             else {
-                constexpr IndT lb(0);
+                static constexpr IndT lb(0);
                 const IndT ub = static_cast(max_item - 1);
                 projected = static_cast(sycl::clamp(ind, lb, ub));
             }
diff --git a/dpctl/tensor/libtensor/include/utils/math_utils.hpp b/dpctl/tensor/libtensor/include/utils/math_utils.hpp
index a49b56b6ba..92b3a3dc56 100644
--- a/dpctl/tensor/libtensor/include/utils/math_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/math_utils.hpp
@@ -124,7 +124,7 @@ template T logaddexp(T x, T y)
     }
     else {
         const T tmp = x - y;
-        constexpr T zero(0);
+        static constexpr T zero(0);

         return (tmp > zero)
                    ? (x + sycl::log1p(sycl::exp(-tmp)))
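The logaddexp hunk above relies on the usual stability rewrite: log(exp(x) + exp(y)) == max(x, y) + log1p(exp(-|x - y|)), which keeps exp() from overflowing for large arguments. A host-side sketch of the same expression (std:: calls standing in for the sycl:: ones):

    #include <cmath>

    template <typename T> T logaddexp_sketch(T x, T y)
    {
        const T tmp = x - y;
        static constexpr T zero(0);
        // Factor out the larger argument; exp() then sees a non-positive
        // exponent and cannot overflow.
        return (tmp > zero) ? (x + std::log1p(std::exp(-tmp)))
                            : (y + std::log1p(std::exp(tmp)));
    }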
diff --git a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
index 73a09011a0..90a6e204ad 100644
--- a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
@@ -181,7 +181,7 @@ sycl::event async_smart_free(sycl::queue &exec_q,
                              const std::vector &depends,
                              UniquePtrTs &&...unique_pointers)
 {
-    constexpr std::size_t n = sizeof...(UniquePtrTs);
+    static constexpr std::size_t n = sizeof...(UniquePtrTs);
     static_assert(
         n > 0, "async_smart_free requires at least one smart pointer argument");
diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
index ece8852643..f78193e614 100644
--- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
@@ -168,9 +168,9 @@ T custom_reduce_over_group(const GroupT &wg,
 {
     // value experimentally tuned to achieve best runtime on Iris Xe,
     // Arc A140V integrated Intel GPUs, and discrete Intel Max GPU.
-    constexpr std::uint32_t low_sz = 8u;
+    static constexpr std::uint32_t low_sz = 8u;
     // maximal work-group size
-    constexpr std::uint32_t high_sz = 1024u;
+    static constexpr std::uint32_t high_sz = 1024u;

     const std::uint32_t wgs = wg.get_local_linear_range();
     const std::uint32_t lid = wg.get_local_linear_id();
@@ -568,7 +568,8 @@ auto sub_group_load(const sycl::sub_group &sg,
 #if (USE_GROUP_LOAD_STORE)
     using ValueT = typename std::remove_cv_t;
     sycl::vec x{};
-    constexpr auto striped = ls_ns::properties{ls_ns::data_placement_striped};
+    static constexpr auto striped =
+        ls_ns::properties{ls_ns::data_placement_striped};
     ls_ns::group_load(sg, m_ptr, x, striped);
     return x;
 #else
@@ -585,7 +586,8 @@ auto sub_group_load(const sycl::sub_group &sg,
 #if (USE_GROUP_LOAD_STORE)
     using ValueT = typename std::remove_cv_t;
     ValueT x{};
-    constexpr auto striped = ls_ns::properties{ls_ns::data_placement_striped};
+    static constexpr auto striped =
+        ls_ns::properties{ls_ns::data_placement_striped};
     ls_ns::group_load(sg, m_ptr, x, striped);
     return x;
 #else
@@ -607,7 +609,8 @@ sub_group_store(const sycl::sub_group &sg,
 {
 #if (USE_GROUP_LOAD_STORE)
     static_assert(std::is_same_v);
-    constexpr auto striped = ls_ns::properties{ls_ns::data_placement_striped};
+    static constexpr auto striped =
+        ls_ns::properties{ls_ns::data_placement_striped};
     ls_ns::group_store(sg, val, m_ptr, striped);
     return;
 #else
@@ -628,7 +631,8 @@ sub_group_store(const sycl::sub_group &sg,
                 sycl::multi_ptr m_ptr)
 {
 #if (USE_GROUP_LOAD_STORE)
-    constexpr auto striped = ls_ns::properties{ls_ns::data_placement_striped};
+    static constexpr auto striped =
+        ls_ns::properties{ls_ns::data_placement_striped};
     ls_ns::group_store(sg, val, m_ptr, striped);
     return;
 #else
diff --git a/dpctl/tensor/libtensor/source/accumulators.cpp b/dpctl/tensor/libtensor/source/accumulators.cpp
index f1e2d4986b..10a9ff8378 100644
--- a/dpctl/tensor/libtensor/source/accumulators.cpp
+++ b/dpctl/tensor/libtensor/source/accumulators.cpp
@@ -148,8 +148,10 @@ std::size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask,
     int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);

     // cumsum must be int32_t/int64_t only
-    constexpr int int32_typeid = static_cast(td_ns::typenum_t::INT32);
-    constexpr int int64_typeid = static_cast(td_ns::typenum_t::INT64);
+    static constexpr int int32_typeid =
+        static_cast(td_ns::typenum_t::INT32);
+    static constexpr int int64_typeid =
+        static_cast(td_ns::typenum_t::INT64);
     if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) {
         throw py::value_error(
             "Cumulative sum array must have int32 or int64 data-type.");
@@ -307,7 +309,7 @@ std::size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src,
     int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);

     // this cumsum must be int64_t only
-    constexpr int int64_typeid = static_cast(td_ns::typenum_t::INT64);
+    static constexpr int int64_typeid =
+        static_cast(td_ns::typenum_t::INT64);
     if (cumsum_typeid != int64_typeid) {
         throw py::value_error(
             "Cumulative sum array must have int64 data-type.");
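py_mask_positions and py_cumsum_1d above validate the cumsum array by casting enum members once into static constexpr ints and comparing them with the runtime lookup id. A compilable miniature of that pattern (the enumerator values below are placeholders, not dpctl's actual typenum ordering):

    #include <stdexcept>

    enum class typenum_t : int { INT32 = 5, INT64 = 7 };

    inline void check_cumsum_typeid(int cumsum_typeid)
    {
        static constexpr int int32_typeid = static_cast<int>(typenum_t::INT32);
        static constexpr int int64_typeid = static_cast<int>(typenum_t::INT64);
        if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) {
            throw std::invalid_argument(
                "Cumulative sum array must have int32 or int64 data-type.");
        }
    }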
diff --git a/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp b/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp
index 2352b6ab30..23e0c28c1a 100644
--- a/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp
+++ b/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp
@@ -250,7 +250,7 @@ std::pair py_accumulate_final_axis_include_initial(
                               "the same array ranks");
     }

-    constexpr int acc_nd = 1;
+    static constexpr int acc_nd = 1;

     int iter_nd = src_nd - acc_nd;
     if (iter_nd < 0) {
diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp
index 85d861dcd7..aa1f4aa091 100644
--- a/dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp
+++ b/dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp
@@ -133,7 +133,7 @@ struct CumLogSumExp1DContigFactory
                                   srcTy, dstTy>::is_defined)
         {
             using ScanOpT = su_ns::LogSumExp;
-            constexpr bool include_initial = false;
+            static constexpr bool include_initial = false;
             if constexpr (std::is_same_v) {
                 using dpctl::tensor::kernels::accumulators::NoOpTransformer;
                 fnT fn = dpctl::tensor::kernels::accumulators::
@@ -166,7 +166,7 @@ struct CumLogSumExp1DIncludeInitialContigFactory
                                   srcTy, dstTy>::is_defined)
         {
             using ScanOpT = su_ns::LogSumExp;
-            constexpr bool include_initial = true;
+            static constexpr bool include_initial = true;
             if constexpr (std::is_same_v) {
                 using dpctl::tensor::kernels::accumulators::NoOpTransformer;
                 fnT fn = dpctl::tensor::kernels::accumulators::
@@ -199,7 +199,7 @@ struct CumLogSumExpStridedFactory
                                   srcTy, dstTy>::is_defined)
         {
             using ScanOpT = su_ns::LogSumExp;
-            constexpr bool include_initial = false;
+            static constexpr bool include_initial = false;
             if constexpr (std::is_same_v) {
                 using dpctl::tensor::kernels::accumulators::NoOpTransformer;
                 fnT fn = dpctl::tensor::kernels::accumulators::
@@ -232,7 +232,7 @@ struct CumLogSumExpIncludeInitialStridedFactory
                                   srcTy, dstTy>::is_defined)
         {
             using ScanOpT = su_ns::LogSumExp;
-            constexpr bool include_initial = true;
+            static constexpr bool include_initial = true;
             if constexpr (std::is_same_v) {
                 using dpctl::tensor::kernels::accumulators::NoOpTransformer;
                 fnT fn = dpctl::tensor::kernels::accumulators::
diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.cpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.cpp
index 045b1b330e..750da77a1b 100644
--- a/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.cpp
+++ b/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.cpp
@@ -146,7 +146,7 @@ struct CumProd1DContigFactory
                                   dstTy>::is_defined)
         {
             using ScanOpT = CumProdScanOpT;
-            constexpr bool include_initial = false;
+            static constexpr bool include_initial = false;
             if constexpr (std::is_same_v) {
                 using dpctl::tensor::kernels::accumulators::NoOpTransformer;
                 fnT fn = dpctl::tensor::kernels::accumulators::
@@ -179,7 +179,7 @@ struct CumProd1DIncludeInitialContigFactory
                                   dstTy>::is_defined)
         {
             using ScanOpT = CumProdScanOpT;
-            constexpr bool include_initial = true;
+            static constexpr bool include_initial = true;
             if constexpr (std::is_same_v) {
                 using dpctl::tensor::kernels::accumulators::NoOpTransformer;
                 fnT fn = dpctl::tensor::kernels::accumulators::
@@ -212,7 +212,7 @@ struct CumProdStridedFactory
                                   dstTy>::is_defined)
         {
             using ScanOpT = CumProdScanOpT;
-            constexpr bool include_initial = false;
+            static constexpr bool include_initial = false;
             if constexpr (std::is_same_v) {
                 using dpctl::tensor::kernels::accumulators::NoOpTransformer;
                 fnT fn = dpctl::tensor::kernels::accumulators::
@@ -245,7 +245,7 @@ struct CumProdIncludeInitialStridedFactory
                                   dstTy>::is_defined)
         {
             using ScanOpT = CumProdScanOpT;
-            constexpr bool include_initial = true;
+            static constexpr bool include_initial = true;
             if constexpr (std::is_same_v) {
                 using dpctl::tensor::kernels::accumulators::NoOpTransformer;
                 fnT fn = dpctl::tensor::kernels::accumulators::
diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.cpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.cpp
index e44678e15f..7136e9005a 100644
--- a/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.cpp
+++ b/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.cpp
@@ -145,7 +145,7 @@ struct CumSum1DContigFactory
                                   dstTy>::is_defined)
         {
             using ScanOpT = CumSumScanOpT;
-            constexpr bool include_initial = false;
+            static constexpr bool include_initial = false;
             if constexpr (std::is_same_v) {
                 using dpctl::tensor::kernels::accumulators::NoOpTransformer;
                 fnT fn = dpctl::tensor::kernels::accumulators::
@@ -178,7 +178,7 @@ struct CumSum1DIncludeInitialContigFactory
                                   dstTy>::is_defined)
         {
             using ScanOpT = CumSumScanOpT;
-            constexpr bool include_initial = true;
+            static constexpr bool include_initial = true;
             if constexpr (std::is_same_v) {
                 using dpctl::tensor::kernels::accumulators::NoOpTransformer;
                 fnT fn = dpctl::tensor::kernels::accumulators::
@@ -211,7 +211,7 @@ struct CumSumStridedFactory
                                   dstTy>::is_defined)
         {
             using ScanOpT = CumSumScanOpT;
-            constexpr bool include_initial = false;
+            static constexpr bool include_initial = false;
             if constexpr (std::is_same_v) {
                 using dpctl::tensor::kernels::accumulators::NoOpTransformer;
                 fnT fn = dpctl::tensor::kernels::accumulators::
@@ -244,7 +244,7 @@ struct CumSumIncludeInitialStridedFactory
                                   dstTy>::is_defined)
         {
             using ScanOpT = CumSumScanOpT;
-            constexpr bool include_initial = true;
+            static constexpr bool include_initial = true;
             if constexpr (std::is_same_v) {
                 using dpctl::tensor::kernels::accumulators::NoOpTransformer;
                 fnT fn = dpctl::tensor::kernels::accumulators::
diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp
index b4ec15c96f..1de4bd74e0 100644
--- a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp
+++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp
@@ -224,8 +224,10 @@ py_extract(const dpctl::tensor::usm_ndarray &src,
     int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
     int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);

-    constexpr int int32_typeid = static_cast(td_ns::typenum_t::INT32);
-    constexpr int int64_typeid = static_cast(td_ns::typenum_t::INT64);
+    static constexpr int int32_typeid =
+        static_cast(td_ns::typenum_t::INT32);
+    static constexpr int int64_typeid =
+        static_cast(td_ns::typenum_t::INT64);
     if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) {
         throw py::value_error("Unexpected data type of cumsum array, expecting "
                               "'int32' or 'int64'");
@@ -550,8 +552,10 @@ py_place(const dpctl::tensor::usm_ndarray &dst,
     int rhs_typeid = array_types.typenum_to_lookup_id(rhs_typenum);
     int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);

-    constexpr int int32_typeid = static_cast(td_ns::typenum_t::INT32);
-    constexpr int int64_typeid = static_cast(td_ns::typenum_t::INT64);
+    static constexpr int int32_typeid =
+        static_cast(td_ns::typenum_t::INT32);
+    static constexpr int int64_typeid =
+        static_cast(td_ns::typenum_t::INT64);
     if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) {
         throw py::value_error("Unexpected data type of cumsum array, expecting "
                               "'int32' or 'int64'");
diff --git a/dpctl/tensor/libtensor/source/copy_as_contig.cpp b/dpctl/tensor/libtensor/source/copy_as_contig.cpp
index 04ddef3495..8b988e73ed 100644
--- a/dpctl/tensor/libtensor/source/copy_as_contig.cpp
+++ b/dpctl/tensor/libtensor/source/copy_as_contig.cpp
@@ -116,7 +116,7 @@ template std::size_t get_nelems(const std::vector &shape)
         return prod * static_cast(term);
     };

-    constexpr std::size_t unit{1};
+    static constexpr std::size_t unit{1};
     const std::size_t nelems =
         std::accumulate(std::begin(shape), std::end(shape), unit, mult_fn);

@@ -185,7 +185,7 @@ py_as_c_contig(const dpctl::tensor::usm_ndarray &src,
     if (src_nd >= 2) {
         auto n = dst_shape_vec.back();
         if (n == dst_shape_vec[src_nd - 2]) {
-            constexpr auto unit_stride = py::ssize_t(1);
+            static constexpr auto unit_stride = py::ssize_t(1);
             if (src_strides_vec[src_nd - 2] == unit_stride) {
                 return py_as_c_contig_f2c(src, dst, exec_q, depends);
             }
@@ -310,7 +310,7 @@ py_as_f_contig(const dpctl::tensor::usm_ndarray &src,
     if (src_nd >= 2) {
         auto n = dst_shape_vec.front();
         if (n == dst_shape_vec[1]) {
-            constexpr auto unit_stride = py::ssize_t(1);
+            static constexpr auto unit_stride = py::ssize_t(1);
             if (src_strides_vec[1] == unit_stride) {
                 return py_as_f_contig_c2f(src, dst, exec_q, depends);
             }
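get_nelems above folds a shape into an element count with std::accumulate; seeding the fold with a static constexpr unit makes an empty shape (a 0-d array) come out as 1. A self-contained restatement (the overflow-conscious cast of the original mult_fn is simplified away here):

    #include <cstddef>
    #include <functional>
    #include <numeric>
    #include <vector>

    inline std::size_t get_nelems_sketch(const std::vector<std::size_t> &shape)
    {
        static constexpr std::size_t unit{1};
        // Product of all extents; an empty shape yields the seed, 1.
        return std::accumulate(shape.begin(), shape.end(), unit,
                               std::multiplies<std::size_t>{});
    }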
diff --git a/dpctl/tensor/libtensor/source/copy_for_roll.cpp b/dpctl/tensor/libtensor/source/copy_for_roll.cpp
index 9bac9db768..ed46fac47d 100644
--- a/dpctl/tensor/libtensor/source/copy_for_roll.cpp
+++ b/dpctl/tensor/libtensor/source/copy_for_roll.cpp
@@ -160,7 +160,7 @@ copy_usm_ndarray_for_roll_1d(const dpctl::tensor::usm_ndarray &src,
         auto fn = copy_for_roll_contig_dispatch_vector[type_id];

         if (fn != nullptr) {
-            constexpr py::ssize_t zero_offset = 0;
+            static constexpr py::ssize_t zero_offset = 0;

             sycl::event copy_for_roll_ev =
                 fn(exec_q, offset, src_nelems, src_data, zero_offset, dst_data,
@@ -331,8 +331,8 @@ copy_usm_ndarray_for_roll_nd(const dpctl::tensor::usm_ndarray &src,
     auto const &dst_strides = dst.get_strides_vector();
     auto const &common_shape = src.get_shape_vector();

-    constexpr py::ssize_t src_offset = 0;
-    constexpr py::ssize_t dst_offset = 0;
+    static constexpr py::ssize_t src_offset = 0;
+    static constexpr py::ssize_t dst_offset = 0;

     auto fn = copy_for_roll_ndshift_dispatch_vector[type_id];
diff --git a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
index 6a53ad6cc0..6b52de7eee 100644
--- a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
+++ b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
@@ -160,7 +160,7 @@ void copy_numpy_ndarray_into_usm_ndarray(
             copy_and_cast_from_host_contig_blocking_dispatch_table
                 [dst_type_id][src_type_id];

-        constexpr py::ssize_t zero_offset(0);
+        static constexpr py::ssize_t zero_offset(0);

         copy_and_cast_from_host_contig_blocking_fn(
             exec_q, src_nelems, src_data, zero_offset, dst_data,
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
index c046321006..59c9c570c9 100644
--- a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
@@ -458,7 +458,7 @@ std::pair py_binary_ufunc(
             std::initializer_list{0, 1};
         static constexpr auto one_zero_strides =
             std::initializer_list{1, 0};
-        constexpr py::ssize_t one{1};
+        static constexpr py::ssize_t one{1};
         // special case of C-contiguous matrix and a row
         if (isEqual(simplified_src2_strides, zero_one_strides) &&
             isEqual(simplified_src1_strides, {simplified_shape[1], one}) &&
@@ -751,7 +751,7 @@ py_binary_inplace_ufunc(const dpctl::tensor::usm_ndarray &lhs,
     if (nd == 2) {
         static constexpr auto one_zero_strides =
             std::initializer_list{1, 0};
-        constexpr py::ssize_t one{1};
+        static constexpr py::ssize_t one{1};
         // special case of C-contiguous matrix and a row
         if (isEqual(simplified_rhs_strides, one_zero_strides) &&
             isEqual(simplified_lhs_strides, {one, simplified_shape[0]}))
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp
index 0e3fb38015..1ad4889fbd 100644
--- a/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp
@@ -298,11 +298,15 @@ py_divide_by_scalar(const dpctl::tensor::usm_ndarray &src,
     const char *src_data = src.get_data();
     char *dst_data = dst.get_data();

-    constexpr int float16_typeid = static_cast(td_ns::typenum_t::HALF);
-    constexpr int float32_typeid = static_cast(td_ns::typenum_t::FLOAT);
-    constexpr int float64_typeid = static_cast(td_ns::typenum_t::DOUBLE);
-    constexpr int complex64_typeid = static_cast(td_ns::typenum_t::CFLOAT);
-    constexpr int complex128_typeid =
+    static constexpr int float16_typeid =
+        static_cast(td_ns::typenum_t::HALF);
+    static constexpr int float32_typeid =
+        static_cast(td_ns::typenum_t::FLOAT);
+    static constexpr int float64_typeid =
+        static_cast(td_ns::typenum_t::DOUBLE);
+    static constexpr int complex64_typeid =
+        static_cast(td_ns::typenum_t::CFLOAT);
+    static constexpr int complex128_typeid =
         static_cast(td_ns::typenum_t::CDOUBLE);

     // statically pre-allocated memory for scalar
@@ -370,7 +374,7 @@ py_divide_by_scalar(const dpctl::tensor::usm_ndarray &src,

     if (nd == 0) {
         // handle 0d array as 1d array with 1 element
-        constexpr py::ssize_t one{1};
+        static constexpr py::ssize_t one{1};
         simplified_shape.push_back(one);
         simplified_src_strides.push_back(one);
         simplified_dst_strides.push_back(one);
diff --git a/dpctl/tensor/libtensor/source/full_ctor.cpp b/dpctl/tensor/libtensor/source/full_ctor.cpp
index fe668422a0..1b3e14b245 100644
--- a/dpctl/tensor/libtensor/source/full_ctor.cpp
+++ b/dpctl/tensor/libtensor/source/full_ctor.cpp
@@ -127,7 +127,7 @@ sycl::event full_contig_impl(sycl::queue &exec_q,
     }

     if (is_zero) {
-        constexpr int memset_val = 0;
+        static constexpr int memset_val = 0;
         fill_ev = exec_q.submit([&](sycl::handler &cgh) {
             cgh.depends_on(depends);
diff --git a/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp b/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp
index ce267baa1b..57bf88c00c 100644
--- a/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp
+++ b/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp
@@ -346,7 +346,7 @@ py_dot(const dpctl::tensor::usm_ndarray &x1,
                     [x2_typeid];
         }
         if (fn != nullptr) {
-            constexpr py::ssize_t zero_offset = 0;
+            static constexpr py::ssize_t zero_offset = 0;
             dot_ev = fn(exec_q, dst_nelems, inner_nelems, x1.get_data(),
                         x2.get_data(), dst.get_data(),
                         zero_offset, // lhs batch offset
@@ -601,7 +601,7 @@ py_dot(const dpctl::tensor::usm_ndarray &x1,
                     [x2_typeid];
         }
         if (fn != nullptr) {
-            constexpr py::ssize_t zero_offset = 0;
+            static constexpr py::ssize_t zero_offset = 0;
             dot_ev = fn(exec_q, x1_data, x2_data, dst_data, batches,
                         x1_outer_nelems, // n
                         inner_nelems,    // k
diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
index 4139cbce19..bc24c71658 100644
--- a/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
+++ b/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
@@ -52,8 +52,8 @@ template bool check_atomic_support(const sycl::queue &exec_q,
                                    sycl::usm::alloc usm_alloc_type)
 {
-    constexpr bool atomic32 = (sizeof(T) == 4);
-    constexpr bool atomic64 = (sizeof(T) == 8);
+    static constexpr bool atomic32 = (sizeof(T) == 4);
+    static constexpr bool atomic64 = (sizeof(T) == 8);
     using dpctl::tensor::type_utils::is_complex;
     if constexpr ((!atomic32 && !atomic64) || is_complex::value) {
         return fixed_decision(exec_q, usm_alloc_type);
diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp
index 9458374482..14abb2eb2e 100644
--- a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp
+++ b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp
@@ -266,7 +266,7 @@ std::pair py_reduction_over_axis(
         if (fn != nullptr) {
             std::size_t iter_nelems = dst_nelems;

-            constexpr py::ssize_t zero_offset = 0;
+            static constexpr py::ssize_t zero_offset = 0;

             sycl::event reduction_over_axis_contig_ev =
                 fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
@@ -299,7 +299,7 @@ std::pair py_reduction_over_axis(
         if (fn != nullptr) {
             std::size_t iter_nelems = dst_nelems;

-            constexpr py::ssize_t zero_offset = 0;
+            static constexpr py::ssize_t zero_offset = 0;

             sycl::event reduction_over_axis_contig_ev =
                 fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
@@ -586,7 +586,7 @@ std::pair py_tree_reduction_over_axis(
         if (fn != nullptr) {
             std::size_t iter_nelems = dst_nelems;

-            constexpr py::ssize_t zero_offset = 0;
+            static constexpr py::ssize_t zero_offset = 0;

             sycl::event reduction_over_axis_contig_ev =
                 fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
@@ -610,7 +610,7 @@ std::pair py_tree_reduction_over_axis(
         if (fn != nullptr) {
             std::size_t iter_nelems = dst_nelems;

-            constexpr py::ssize_t zero_offset = 0;
+            static constexpr py::ssize_t zero_offset = 0;

             sycl::event reduction_over_axis_contig_ev =
                 fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
@@ -863,7 +863,7 @@ std::pair py_search_over_axis(
         if (fn != nullptr) {
             std::size_t iter_nelems = dst_nelems;

-            constexpr py::ssize_t zero_offset = 0;
+            static constexpr py::ssize_t zero_offset = 0;

             sycl::event reduction_over_axis_contig_ev =
                 fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
@@ -885,7 +885,7 @@ std::pair py_search_over_axis(
         if (fn != nullptr) {
             std::size_t iter_nelems = dst_nelems;

-            constexpr py::ssize_t zero_offset = 0;
+            static constexpr py::ssize_t zero_offset = 0;

             sycl::event reduction_over_axis_contig_ev =
                 fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
@@ -1128,7 +1128,8 @@ py_boolean_reduction(const dpctl::tensor::usm_ndarray &src,
     int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
     int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);

-    constexpr int int32_typeid = static_cast(td_ns::typenum_t::INT32);
+    static constexpr int int32_typeid =
+        static_cast(td_ns::typenum_t::INT32);
     if (dst_typeid != int32_typeid) {
         throw py::value_error(
             "Unexpected data type of destination array, expecting 'int32'");
@@ -1152,7 +1153,7 @@ py_boolean_reduction(const dpctl::tensor::usm_ndarray &src,
         (is_src_f_contig && dst_nelems == 0))
     {
         auto fn = axis1_contig_dispatch_vector[src_typeid];
-        constexpr py::ssize_t zero_offset = 0;
+        static constexpr py::ssize_t zero_offset = 0;

         sycl::event red_ev =
             fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, zero_offset,
@@ -1167,7 +1168,7 @@ py_boolean_reduction(const dpctl::tensor::usm_ndarray &src,
              ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous()))
     {
         auto fn = axis0_contig_dispatch_vector[src_typeid];
-        constexpr py::ssize_t zero_offset = 0;
+        static constexpr py::ssize_t zero_offset = 0;

         sycl::event red_ev =
             fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, zero_offset,
diff --git a/dpctl/tensor/libtensor/source/repeat.cpp b/dpctl/tensor/libtensor/source/repeat.cpp
index 25146eac88..9dfc84681e 100644
--- a/dpctl/tensor/libtensor/source/repeat.cpp
+++ b/dpctl/tensor/libtensor/source/repeat.cpp
@@ -198,7 +198,8 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src,
             "Destination array must have the same elemental data type");
     }

-    constexpr int int64_typeid = static_cast(td_ns::typenum_t::INT64);
+    static constexpr int int64_typeid =
+        static_cast(td_ns::typenum_t::INT64);
     if (cumsum_typeid != int64_typeid) {
         throw py::value_error(
             "Unexpected data type of `cumsum` array, expecting "
@@ -425,7 +426,8 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src,
             "Destination array must have the same elemental data type");
     }

-    constexpr int int64_typeid = static_cast(td_ns::typenum_t::INT64);
+    static constexpr int int64_typeid =
+        static_cast(td_ns::typenum_t::INT64);
     if (cumsum_typeid != int64_typeid) {
         throw py::value_error(
             "Unexpected data type of `cumsum` array, expecting "
diff --git a/dpctl/tensor/libtensor/source/sorting/py_sort_common.hpp b/dpctl/tensor/libtensor/source/sorting/py_sort_common.hpp
index 387c1cb521..2a727c4bd9 100644
--- a/dpctl/tensor/libtensor/source/sorting/py_sort_common.hpp
+++ b/dpctl/tensor/libtensor/source/sorting/py_sort_common.hpp
@@ -127,7 +127,7 @@ py_sort(const dpctl::tensor::usm_ndarray &src,
     bool is_dst_c_contig = dst.is_c_contiguous();

     if (is_src_c_contig && is_dst_c_contig) {
-        constexpr py::ssize_t zero_offset = py::ssize_t(0);
+        static constexpr py::ssize_t zero_offset = py::ssize_t(0);

         auto fn = sort_contig_fns[src_typeid];
diff --git a/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp b/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp
index 174214e4c9..fd73c6ecdf 100644
--- a/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp
+++ b/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp
@@ -74,7 +74,7 @@ struct LeftSideSearchSortedContigFactory
         if constexpr (std::is_same_v ||
                       std::is_same_v)
         {
-            constexpr bool left_side_search(true);
+            static constexpr bool left_side_search(true);
             using dpctl::tensor::kernels::searchsorted_contig_impl;

             using Compare = typename AscendingSorter::type;
@@ -98,7 +98,7 @@ struct RightSideSearchSortedContigFactory
         if constexpr (std::is_same_v ||
                       std::is_same_v)
        {
-            constexpr bool right_side_search(false);
+            static constexpr bool right_side_search(false);
             using dpctl::tensor::kernels::searchsorted_contig_impl;

             using Compare = typename AscendingSorter::type;
@@ -130,7 +130,7 @@ struct LeftSideSearchSortedStridedFactory
         if constexpr (std::is_same_v ||
                       std::is_same_v)
        {
-            constexpr bool left_side_search(true);
+            static constexpr bool left_side_search(true);
             using dpctl::tensor::kernels::searchsorted_strided_impl;

             using Compare = typename AscendingSorter::type;
@@ -154,7 +154,7 @@ struct RightSideSearchSortedStridedFactory
         if constexpr (std::is_same_v ||
                       std::is_same_v)
        {
-            constexpr bool right_side_search(false);
+            static constexpr bool right_side_search(false);
             using dpctl::tensor::kernels::searchsorted_strided_impl;

             using Compare = typename AscendingSorter::type;
@@ -321,7 +321,7 @@ py_searchsorted(const dpctl::tensor::usm_ndarray &hay,
                [positions_typeid];

         if (fn) {
-            constexpr py::ssize_t zero_offset(0);
+            static constexpr py::ssize_t zero_offset(0);

             sycl::event comp_ev =
                 fn(exec_q, hay_nelems, needles_nelems, hay_data, zero_offset,
@@ -399,7 +399,7 @@ py_searchsorted(const dpctl::tensor::usm_ndarray &hay,
                 "No implementation for data types of input arrays");
         }

-        constexpr py::ssize_t zero_offset(0);
+        static constexpr py::ssize_t zero_offset(0);

         py::ssize_t hay_step = hay.get_strides_vector()[0];

         const sycl::event &comp_ev = strided_fn(
@@ -429,7 +429,7 @@ py_searchsorted_left(const dpctl::tensor::usm_ndarray &hay,
                      sycl::queue &exec_q,
                      const std::vector &depends)
 {
-    constexpr bool side_left(true);
+    static constexpr bool side_left(true);
     return py_searchsorted(hay, needles, positions, exec_q, side_left,
                            depends);
 }
@@ -443,7 +443,7 @@ py_searchsorted_right(const dpctl::tensor::usm_ndarray &hay,
                       sycl::queue &exec_q,
                       const std::vector &depends)
 {
-    constexpr bool side_right(false);
+    static constexpr bool side_right(false);
     return py_searchsorted(hay, needles, positions, exec_q, side_right,
                            depends);
 }
diff --git a/dpctl/tensor/libtensor/source/zeros_ctor.cpp b/dpctl/tensor/libtensor/source/zeros_ctor.cpp
index 65bfe5bef3..ef74469a8b 100644
--- a/dpctl/tensor/libtensor/source/zeros_ctor.cpp
+++ b/dpctl/tensor/libtensor/source/zeros_ctor.cpp
@@ -78,7 +78,7 @@ sycl::event zeros_contig_impl(sycl::queue &exec_q,
                               const std::vector &depends)
 {

-    constexpr int memset_val(0);
+    static constexpr int memset_val(0);
     sycl::event fill_ev = exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(depends);
diff --git a/dpctl/utils/src/sequential_order_keeper.hpp b/dpctl/utils/src/sequential_order_keeper.hpp
index b29e27a2b9..a6cba662b0 100644
--- a/dpctl/utils/src/sequential_order_keeper.hpp
+++ b/dpctl/utils/src/sequential_order_keeper.hpp
@@ -7,9 +7,10 @@
 namespace
 {

-bool is_event_complete(const sycl::event &e)
+inline bool is_event_complete(const sycl::event &e)
 {
-    constexpr auto exec_complete = sycl::info::event_command_status::complete;
+    static constexpr auto exec_complete =
+        sycl::info::event_command_status::complete;

     const auto status = e.get_info();
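For reference, a self-contained version of the is_event_complete helper above; the get_info query spelled out here uses the standard SYCL 2020 descriptor sycl::info::event::command_execution_status:

    #include <sycl/sycl.hpp>

    inline bool is_event_complete_sketch(const sycl::event &e)
    {
        static constexpr auto exec_complete =
            sycl::info::event_command_status::complete;
        // Query the event's command execution status and compare it
        // against the `complete` state.
        return e.get_info<sycl::info::event::command_execution_status>() ==
               exec_complete;
    }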
diff --git a/libsyclinterface/source/dpctl_sycl_device_manager.cpp b/libsyclinterface/source/dpctl_sycl_device_manager.cpp
index e87f7dd0ab..3bc213fc43 100644
--- a/libsyclinterface/source/dpctl_sycl_device_manager.cpp
+++ b/libsyclinterface/source/dpctl_sycl_device_manager.cpp
@@ -269,7 +269,7 @@ DPCTLDeviceMgr_GetDeviceInfoStr(__dpctl_keep const DPCTLSyclDeviceRef DRef)
 int DPCTLDeviceMgr_GetPositionInDevices(__dpctl_keep DPCTLSyclDeviceRef DRef,
                                         int device_identifier)
 {
-    constexpr int not_found = -1;
+    static constexpr int not_found = -1;
     if (!DRef) {
         return not_found;
     }
diff --git a/libsyclinterface/source/dpctl_sycl_device_selector_interface.cpp b/libsyclinterface/source/dpctl_sycl_device_selector_interface.cpp
index d7c5bcedbc..8e15f694a7 100644
--- a/libsyclinterface/source/dpctl_sycl_device_selector_interface.cpp
+++ b/libsyclinterface/source/dpctl_sycl_device_selector_interface.cpp
@@ -100,7 +100,7 @@ __dpctl_give DPCTLSyclDeviceSelectorRef DPCTLGPUSelector_Create()
 int DPCTLDeviceSelector_Score(__dpctl_keep DPCTLSyclDeviceSelectorRef DSRef,
                               __dpctl_keep DPCTLSyclDeviceRef DRef)
 {
-    constexpr int REJECT_DEVICE_SCORE = -1;
+    static constexpr int REJECT_DEVICE_SCORE = -1;
     if (DSRef && DRef) {
         auto dev = *(unwrap(DRef));
         return (*unwrap(DSRef))(dev);
diff --git a/libsyclinterface/source/dpctl_sycl_kernel_bundle_interface.cpp b/libsyclinterface/source/dpctl_sycl_kernel_bundle_interface.cpp
index 70d89fd4fb..78c714ecbb 100644
--- a/libsyclinterface/source/dpctl_sycl_kernel_bundle_interface.cpp
+++ b/libsyclinterface/source/dpctl_sycl_kernel_bundle_interface.cpp
@@ -72,7 +72,7 @@ static const int clLibLoadFlags = 0;
     case code:                                                                 \
         return std::string(#code) + CodeStringSuffix(code)

-constexpr backend cl_be = backend::opencl;
+static constexpr backend cl_be = backend::opencl;

 struct cl_loader
 {
@@ -339,7 +339,7 @@ static const int zeLibLoadFlags = 0;
 #error "Level Zero program compilation is unavailable for this platform"
 #endif

-constexpr sycl::backend ze_be = sycl::backend::ext_oneapi_level_zero;
+static constexpr sycl::backend ze_be = sycl::backend::ext_oneapi_level_zero;

 struct ze_loader
 {
diff --git a/libsyclinterface/tests/test_sycl_device_aspects.cpp b/libsyclinterface/tests/test_sycl_device_aspects.cpp
index 8c0ffe1f16..fc14ea131f 100644
--- a/libsyclinterface/tests/test_sycl_device_aspects.cpp
+++ b/libsyclinterface/tests/test_sycl_device_aspects.cpp
@@ -99,10 +99,10 @@ auto build_gtest_values(const std::array, N> &params)

 auto build_params()
 {
-    constexpr auto param_1 = get_param_list(
+    static constexpr auto param_1 = get_param_list(
         "opencl:gpu", "opencl:cpu", "level_zero:gpu");

-    constexpr auto param_2 =
+    static constexpr auto param_2 =
         get_param_list>(
             std::make_pair("cpu", sycl::aspect::cpu),
             std::make_pair("gpu", sycl::aspect::gpu),
diff --git a/libsyclinterface/tests/test_sycl_queue_submit.cpp b/libsyclinterface/tests/test_sycl_queue_submit.cpp
index 651c98fd61..ab5b6bef82 100644
--- a/libsyclinterface/tests/test_sycl_queue_submit.cpp
+++ b/libsyclinterface/tests/test_sycl_queue_submit.cpp
@@ -45,7 +45,7 @@
 namespace
 {
-constexpr std::size_t SIZE = 1024;
+static constexpr std::size_t SIZE = 1024;
 static_assert(SIZE % 8 == 0);

 using namespace dpctl::syclinterface;
@@ -59,10 +59,10 @@ void submit_kernel(DPCTLSyclQueueRef QRef,
                    std::string kernelName)
 {
     T scalarVal = 3;
-    constexpr std::size_t NARGS = 4;
-    constexpr std::size_t RANGE_NDIMS_1 = 1;
-    constexpr std::size_t RANGE_NDIMS_2 = 2;
-    constexpr std::size_t RANGE_NDIMS_3 = 3;
+    static constexpr std::size_t NARGS = 4;
+    static constexpr std::size_t RANGE_NDIMS_1 = 1;
+    static constexpr std::size_t RANGE_NDIMS_2 = 2;
+    static constexpr std::size_t RANGE_NDIMS_3 = 3;

     ASSERT_TRUE(DPCTLKernelBundle_HasKernel(KBRef, kernelName.c_str()));
     auto kernel = DPCTLKernelBundle_GetKernel(KBRef, kernelName.c_str());
@@ -373,7 +373,7 @@ TEST_F(TestQueueSubmit, CheckForUnsupportedArgTy)
     int scalarVal = 3;
     std::size_t Range[] = {SIZE};
     std::size_t RANGE_NDIMS = 1;
-    constexpr std::size_t NARGS = 4;
+    static constexpr std::size_t NARGS = 4;

     auto kernel = DPCTLKernelBundle_GetKernel(KBRef, "_ZTS11RangeKernelIdE");
     void *args[NARGS] = {unwrap(nullptr), unwrap(nullptr),
diff --git a/libsyclinterface/tests/test_sycl_queue_submit_local_accessor_arg.cpp b/libsyclinterface/tests/test_sycl_queue_submit_local_accessor_arg.cpp
index a8a9416c67..f6110375bb 100644
--- a/libsyclinterface/tests/test_sycl_queue_submit_local_accessor_arg.cpp
+++ b/libsyclinterface/tests/test_sycl_queue_submit_local_accessor_arg.cpp
@@ -44,7 +44,7 @@
 namespace
 {
-constexpr std::size_t SIZE = 320;
+static constexpr std::size_t SIZE = 320;

 static_assert(SIZE % 10 == 0);

@@ -58,8 +58,8 @@ void submit_kernel(DPCTLSyclQueueRef QRef,
                    DPCTLKernelArgType kernelArgTy,
                    std::string kernelName)
 {
-    constexpr std::size_t NARGS = 2;
-    constexpr std::size_t RANGE_NDIMS = 1;
+    static constexpr std::size_t NARGS = 2;
+    static constexpr std::size_t RANGE_NDIMS = 1;

     ASSERT_TRUE(DPCTLKernelBundle_HasKernel(KBRef, kernelName.c_str()));
     auto kernel = DPCTLKernelBundle_GetKernel(KBRef, kernelName.c_str());
@@ -366,7 +366,7 @@ TEST_F(TestQueueSubmitWithLocalAccessor, CheckForUnsupportedArgTy)
     std::size_t gRange[] = {SIZE};
     std::size_t lRange[] = {SIZE / 10};
     std::size_t RANGE_NDIMS = 1;
-    constexpr std::size_t NARGS = 2;
+    static constexpr std::size_t NARGS = 2;

     auto la = MDLocalAccessor{1, DPCTL_UNSUPPORTED_KERNEL_ARG, SIZE / 10, 1, 1};
     auto kernel = DPCTLKernelBundle_GetKernel(KBRef, "_ZTS14SyclKernel_SLMImE");
diff --git a/libsyclinterface/tests/test_sycl_queue_submit_raw_kernel_arg.cpp b/libsyclinterface/tests/test_sycl_queue_submit_raw_kernel_arg.cpp
index 2d120f7157..f40bc20066 100644
--- a/libsyclinterface/tests/test_sycl_queue_submit_raw_kernel_arg.cpp
+++ b/libsyclinterface/tests/test_sycl_queue_submit_raw_kernel_arg.cpp
@@ -46,7 +46,7 @@
 namespace
 {
-constexpr std::size_t SIZE = 320;
+static constexpr std::size_t SIZE = 320;

 static_assert(SIZE % 10 == 0);

@@ -71,8 +71,8 @@ void submit_kernel(DPCTLSyclQueueRef QRef,
         return;
     }

-    constexpr std::size_t NARGS = 2;
-    constexpr std::size_t RANGE_NDIMS = 1;
+    static constexpr std::size_t NARGS = 2;
+    static constexpr std::size_t RANGE_NDIMS = 1;

     ASSERT_TRUE(DPCTLKernelBundle_HasKernel(KBRef, kernelName.c_str()));
     auto kernel = DPCTLKernelBundle_GetKernel(KBRef, kernelName.c_str());
diff --git a/libsyclinterface/tests/test_sycl_queue_submit_work_group_memory_arg.cpp b/libsyclinterface/tests/test_sycl_queue_submit_work_group_memory_arg.cpp
index 658cca428a..d0f44b7275 100644
--- a/libsyclinterface/tests/test_sycl_queue_submit_work_group_memory_arg.cpp
+++ b/libsyclinterface/tests/test_sycl_queue_submit_work_group_memory_arg.cpp
@@ -46,7 +46,7 @@
 namespace
 {
-constexpr std::size_t SIZE = 320;
+static constexpr std::size_t SIZE = 320;

 static_assert(SIZE % 10 == 0);

@@ -67,8 +67,8 @@ void submit_kernel(DPCTLSyclQueueRef QRef,
         return;
     }

-    constexpr std::size_t NARGS = 2;
-    constexpr std::size_t RANGE_NDIMS = 1;
+    static constexpr std::size_t NARGS = 2;
+    static constexpr std::size_t RANGE_NDIMS = 1;

     ASSERT_TRUE(DPCTLKernelBundle_HasKernel(KBRef, kernelName.c_str()));
     auto kernel = DPCTLKernelBundle_GetKernel(KBRef, kernelName.c_str());
diff --git a/libsyclinterface/tests/test_sycl_usm_interface.cpp b/libsyclinterface/tests/test_sycl_usm_interface.cpp
index e04cbc1ad8..ad693661ff 100644
--- a/libsyclinterface/tests/test_sycl_usm_interface.cpp
+++ b/libsyclinterface/tests/test_sycl_usm_interface.cpp
@@ -42,7 +42,7 @@ using namespace sycl;
 namespace
 {

-constexpr size_t SIZE = 1024;
+static constexpr size_t SIZE = 1024;

 void common_test_body(size_t nbytes,
                       const DPCTLSyclUSMRef Ptr,