From a23f9adcbaaf7be5ee02b8e5a9c59d05ebd13710 Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Wed, 28 Oct 2020 17:41:43 -0700
Subject: [PATCH 01/24] [SYCL] Parallel-for range rounding-up for improved
 group size selection by GPU driver.

Signed-off-by: rdeodhar <rajiv.deodhar@intel.com>
---
 clang/lib/Sema/SemaSYCL.cpp      | 16 ++++++++++
 sycl/include/CL/sycl/handler.hpp | 53 ++++++++++++++++++++++++++------
 sycl/include/CL/sycl/id.hpp      |  2 ++
 sycl/include/CL/sycl/item.hpp    |  2 ++
 sycl/include/CL/sycl/range.hpp   |  3 ++
 5 files changed, 67 insertions(+), 9 deletions(-)
diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp
index d08f4bc9ae614..86984ec8a993a 100644
--- a/clang/lib/Sema/SemaSYCL.cpp
+++ b/clang/lib/Sema/SemaSYCL.cpp
@@ -510,6 +510,22 @@ class MarkDeviceFunction : public RecursiveASTVisitor<MarkDeviceFunction> {
       FunctionDecl *FD = WorkList.back().first;
       FunctionDecl *ParentFD = WorkList.back().second;
 
+      // To implement rounding-up of a parallel-for range (Jira 20239)
+      // a kernel call is modified like this:
+      // auto Wrapper = [=](TransformedArgType Arg) {
+      //  if (Arg[0] >= NumWorkItems[0])
+      //    return;
+      //  Arg.set_allowed_range(NumWorkItems);
+      //  KernelFunc(Arg);
+      // };
+      //
+      // This transformation leads to a condition where a kernel body
+      // function becomes callable from a new kernel body function.
+      // Hence this test.
+      if ((ParentFD == KernelBody) && isSYCLKernelBodyFunction(FD)) {
+        KernelBody = FD;
+      }
+
       if ((ParentFD == SYCLKernel) && isSYCLKernelBodyFunction(FD)) {
         assert(!KernelBody && "inconsistent call graph - only one kernel body "
                               "function can be called");
diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index 890fa1041293d..d86285cf7cd8f 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -728,23 +728,58 @@ class __SYCL_EXPORT handler {
   void parallel_for_lambda_impl(range<Dims> NumWorkItems,
                                 KernelType KernelFunc) {
     throwIfActionIsCreated();
-    using NameT =
-        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
     using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
+
+    // If 1D kernel argument is an integral type, convert it to sycl::item<1>
     using TransformedArgType =
         typename std::conditional<std::is_integral<LambdaArgType>::value &&
                                       Dims == 1,
                                   item<Dims>, LambdaArgType>::type;
+    using NameT =
+        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
+    constexpr size_t GoodLocalSizeX = 32;
+    std::string KName = typeid(NameT *).name();
+    bool DisableRounding =
+        KName.find("SYCL_OPT_PFWGS_DISABLE") != std::string::npos;
+    if (!DisableRounding && NumWorkItems[0] % GoodLocalSizeX != 0) {
+      // Not a multiple
+      size_t NewValX =
+          ((NumWorkItems[0] + GoodLocalSizeX - 1) / GoodLocalSizeX) *
+          GoodLocalSizeX;
+      if (getenv("SYCL_OPT_PFWGS_TRACE") != nullptr)
+        std::cerr << "***** Adjusted size from " << NumWorkItems[0] << " to "
+                  << NewValX << " *****\n";
+      auto Wrapper = [=](TransformedArgType Arg) {
+        if (Arg[0] >= NumWorkItems[0])
+          return;
+        Arg.set_allowed_range(NumWorkItems);
+        KernelFunc(Arg);
+      };
+
+      using NameWT = NameT *;
+      range<Dims> AdjustedRange = NumWorkItems;
+      AdjustedRange.set_range(NewValX);
 #ifdef __SYCL_DEVICE_ONLY__
-    (void)NumWorkItems;
-    kernel_parallel_for<NameT, TransformedArgType>(KernelFunc);
+      kernel_parallel_for<NameWT, TransformedArgType>(Wrapper);
 #else
-    detail::checkValueRange<Dims>(NumWorkItems);
-    MNDRDesc.set(std::move(NumWorkItems));
-    StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
-        std::move(KernelFunc));
-    MCGType = detail::CG::KERNEL;
+      detail::checkValueRange<Dims>(AdjustedRange);
+      MNDRDesc.set(std::move(AdjustedRange));
+      StoreLambda<NameWT, decltype(Wrapper), Dims, TransformedArgType>(
+          std::move(Wrapper));
+      MCGType = detail::CG::KERNEL;
 #endif
+    } else {
+#ifdef __SYCL_DEVICE_ONLY__
+      (void)NumWorkItems;
+      kernel_parallel_for<NameT, TransformedArgType>(KernelFunc);
+#else
+      detail::checkValueRange<Dims>(NumWorkItems);
+      MNDRDesc.set(std::move(NumWorkItems));
+      StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
+          std::move(KernelFunc));
+      MCGType = detail::CG::KERNEL;
+#endif
+    }
   }
 
   /// Defines and invokes a SYCL kernel function for the specified range.
diff --git a/sycl/include/CL/sycl/id.hpp b/sycl/include/CL/sycl/id.hpp
index 16d176b8b698d..79f35aca19c40 100644
--- a/sycl/include/CL/sycl/id.hpp
+++ b/sycl/include/CL/sycl/id.hpp
@@ -94,6 +94,8 @@ template <int dimensions = 1> class id : public detail::array<dimensions> {
     return result;
   }
 
+  void set_allowed_range(range<dimensions> rnwi) { (void)rnwi[0]; }
+
 #ifndef __SYCL_DISABLE_ID_TO_INT_CONV__
   /* Template operator is not allowed because it disables further type
    * conversion. For example, the next code will not work in case of template
diff --git a/sycl/include/CL/sycl/item.hpp b/sycl/include/CL/sycl/item.hpp
index 9d9a879815294..a5409794dfdcf 100644
--- a/sycl/include/CL/sycl/item.hpp
+++ b/sycl/include/CL/sycl/item.hpp
@@ -104,6 +104,8 @@ template <int dimensions = 1, bool with_offset = true> class item {
 
   bool operator!=(const item &rhs) const { return rhs.MImpl != MImpl; }
 
+  void set_allowed_range(const range<dimensions> rnwi) { MImpl.MExtent = rnwi; }
+
 protected:
   template <bool has_offset = with_offset>
   item(detail::enable_if_t<has_offset, const range<dimensions>> &extent,
diff --git a/sycl/include/CL/sycl/range.hpp b/sycl/include/CL/sycl/range.hpp
index 2745a05667a31..8a92bd2c32c4d 100644
--- a/sycl/include/CL/sycl/range.hpp
+++ b/sycl/include/CL/sycl/range.hpp
@@ -62,6 +62,9 @@ template <int dimensions = 1> class range : public detail::array<dimensions> {
     return size;
   }
 
+  // Adjust the first dim of the range
+  void set_range(const size_t dim0) { this->common_array[0] = dim0; }
+
   range(const range<dimensions> &rhs) = default;
   range(range<dimensions> &&rhs) = default;
   range<dimensions> &operator=(const range<dimensions> &rhs) = default;

From 73f50bd487a5305a0a3e34117ada09e96958fa42 Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Tue, 3 Nov 2020 17:47:57 -0800
Subject: [PATCH 02/24] Correction to wrapper kernel name.

---
 sycl/include/CL/sycl/handler.hpp | 69 ++++++++++++++------------------
 1 file changed, 30 insertions(+), 39 deletions(-)

diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index d86285cf7cd8f..35233bc24d4aa 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -120,6 +120,14 @@ template <typename Type> struct get_kernel_name_t<detail::auto_name, Type> {
   using name = Type;
 };
 
+// Used when parallel_for range is rounded-up.
+template <typename Type> class __pf_wrapper_kernel;
+
+template <typename Type> struct get_kernel_wrapper_name_t {
+  using name = __pf_wrapper_kernel<
+      typename get_kernel_name_t<detail::auto_name, Type>::name>;
+};
+
 template <typename, typename T> struct check_fn_signature {
   static_assert(std::integral_constant<T, false>::value,
                 "Second template parameter is required to be of function type");
@@ -738,48 +746,31 @@ class __SYCL_EXPORT handler {
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
     constexpr size_t GoodLocalSizeX = 32;
-    std::string KName = typeid(NameT *).name();
-    bool DisableRounding =
-        KName.find("SYCL_OPT_PFWGS_DISABLE") != std::string::npos;
-    if (!DisableRounding && NumWorkItems[0] % GoodLocalSizeX != 0) {
-      // Not a multiple
-      size_t NewValX =
-          ((NumWorkItems[0] + GoodLocalSizeX - 1) / GoodLocalSizeX) *
-          GoodLocalSizeX;
-      if (getenv("SYCL_OPT_PFWGS_TRACE") != nullptr)
-        std::cerr << "***** Adjusted size from " << NumWorkItems[0] << " to "
-                  << NewValX << " *****\n";
-      auto Wrapper = [=](TransformedArgType Arg) {
-        if (Arg[0] >= NumWorkItems[0])
-          return;
-        Arg.set_allowed_range(NumWorkItems);
-        KernelFunc(Arg);
-      };
-
-      using NameWT = NameT *;
-      range<Dims> AdjustedRange = NumWorkItems;
-      AdjustedRange.set_range(NewValX);
-#ifdef __SYCL_DEVICE_ONLY__
-      kernel_parallel_for<NameWT, TransformedArgType>(Wrapper);
-#else
-      detail::checkValueRange<Dims>(AdjustedRange);
-      MNDRDesc.set(std::move(AdjustedRange));
-      StoreLambda<NameWT, decltype(Wrapper), Dims, TransformedArgType>(
-          std::move(Wrapper));
-      MCGType = detail::CG::KERNEL;
-#endif
-    } else {
+    // Round-up just the first dimension of the range.
+    // Even for multi-dimensional ranges the total number of work items will
+    // be a multiple of the rounding factor since any multiple of a rounded-up
+    // first dimension will also be a rounded-up value.
+    size_t NewValX = ((NumWorkItems[0] + GoodLocalSizeX - 1) / GoodLocalSizeX) *
+                     GoodLocalSizeX;
+    auto Wrapper = [=](TransformedArgType Arg) {
+      if (Arg[0] >= NumWorkItems[0])
+        return;
+      Arg.set_allowed_range(NumWorkItems);
+      KernelFunc(Arg);
+    };
+
+    using NameWT = typename detail::get_kernel_wrapper_name_t<NameT>::name;
+    range<Dims> AdjustedRange = NumWorkItems;
+    AdjustedRange.set_range(NewValX);
 #ifdef __SYCL_DEVICE_ONLY__
-      (void)NumWorkItems;
-      kernel_parallel_for<NameT, TransformedArgType>(KernelFunc);
+    kernel_parallel_for<NameWT, TransformedArgType>(Wrapper);
 #else
-      detail::checkValueRange<Dims>(NumWorkItems);
-      MNDRDesc.set(std::move(NumWorkItems));
-      StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
-          std::move(KernelFunc));
-      MCGType = detail::CG::KERNEL;
+    detail::checkValueRange<Dims>(AdjustedRange);
+    MNDRDesc.set(std::move(AdjustedRange));
+    StoreLambda<NameWT, decltype(Wrapper), Dims, TransformedArgType>(
+        std::move(Wrapper));
+    MCGType = detail::CG::KERNEL;
 #endif
-    }
   }
 
   /// Defines and invokes a SYCL kernel function for the specified range.

From 2aad33d0bd2ee75b8ba93541717aa481de9d683c Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Wed, 4 Nov 2020 18:31:25 -0800
Subject: [PATCH 03/24] Test correction to backend interoperability interface.

---
 sycl/include/CL/sycl/handler.hpp                   | 13 +++++++------
 sycl/test/basic_tests/handler/handler_set_args.cpp |  8 ++++----
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index 35233bc24d4aa..5986c37115f93 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -745,13 +745,17 @@ class __SYCL_EXPORT handler {
                                   item<Dims>, LambdaArgType>::type;
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
-    constexpr size_t GoodLocalSizeX = 32;
+
     // Round-up just the first dimension of the range.
     // Even for multi-dimensional ranges the total number of work items will
     // be a multiple of the rounding factor since any multiple of a rounded-up
     // first dimension will also be a rounded-up value.
+    constexpr size_t GoodLocalSizeX = 32;
+    range<Dims> AdjustedRange = NumWorkItems;
     size_t NewValX = ((NumWorkItems[0] + GoodLocalSizeX - 1) / GoodLocalSizeX) *
                      GoodLocalSizeX;
+    AdjustedRange.set_range(NewValX);
+
     auto Wrapper = [=](TransformedArgType Arg) {
       if (Arg[0] >= NumWorkItems[0])
         return;
@@ -759,15 +763,12 @@ class __SYCL_EXPORT handler {
       KernelFunc(Arg);
     };
 
-    using NameWT = typename detail::get_kernel_wrapper_name_t<NameT>::name;
-    range<Dims> AdjustedRange = NumWorkItems;
-    AdjustedRange.set_range(NewValX);
 #ifdef __SYCL_DEVICE_ONLY__
-    kernel_parallel_for<NameWT, TransformedArgType>(Wrapper);
+    kernel_parallel_for<NameT, TransformedArgType>(Wrapper);
 #else
     detail::checkValueRange<Dims>(AdjustedRange);
     MNDRDesc.set(std::move(AdjustedRange));
-    StoreLambda<NameWT, decltype(Wrapper), Dims, TransformedArgType>(
+    StoreLambda<NameT, decltype(Wrapper), Dims, TransformedArgType>(
         std::move(Wrapper));
     MCGType = detail::CG::KERNEL;
 #endif
diff --git a/sycl/test/basic_tests/handler/handler_set_args.cpp b/sycl/test/basic_tests/handler/handler_set_args.cpp
index 6e15ecc6fa836..3bb984ad831eb 100644
--- a/sycl/test/basic_tests/handler/handler_set_args.cpp
+++ b/sycl/test/basic_tests/handler/handler_set_args.cpp
@@ -115,7 +115,7 @@ int main() {
         getPrebuiltKernel<ParallelForRangeIdFunctor<NoOffset>>(Queue);
 
     checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) {
-      cgh.set_args(acc);
+      cgh.set_args(Range, acc);
       cgh.parallel_for(Range, preBuiltKernel);
     });
   }
@@ -135,7 +135,7 @@ int main() {
         getPrebuiltKernel<ParallelForRangeItemFunctor<NoOffset>>(Queue);
 
     checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) {
-      cgh.set_args(acc);
+      cgh.set_args(Range, acc);
       cgh.parallel_for(Range, preBuiltKernel);
     });
   }
@@ -174,7 +174,7 @@ int main() {
         getPrebuiltKernel<ParallelForRangeIdFunctor<NoOffset>>(Queue);
 
     checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) {
-      cgh.set_args(acc);
+      cgh.set_args(Range, acc);
       cgh.parallel_for<class OtherKernelName2>(
           preBuiltKernel, Range, [=](cl::sycl::id<1> id) { acc[0] = 10; });
     });
@@ -197,7 +197,7 @@ int main() {
         getPrebuiltKernel<ParallelForRangeItemFunctor<NoOffset>>(Queue);
 
     checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) {
-      cgh.set_args(acc);
+      cgh.set_args(Range, acc);
       cgh.parallel_for<class OtherKernelName4>(
           preBuiltKernel, Range, [=](cl::sycl::item<1> item) { acc[0] = 10; });
     });

From ac6bf289e679815f1ceac496cbac09e95aacbbb2 Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Thu, 5 Nov 2020 16:09:43 -0800
Subject: [PATCH 04/24] Environment var control to disable optimization;
 correction to one test.

---
 sycl/include/CL/sycl/handler.hpp              | 28 ++++++++++++++++++-
 .../free_function_queries.cpp                 |  5 +++-
 2 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index 5986c37115f93..b05cbc32b470b 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -732,6 +732,8 @@ class __SYCL_EXPORT handler {
   ///
   /// \param NumWorkItems is a range defining indexing space.
   /// \param KernelFunc is a SYCL kernel function.
+#ifndef __SYCL_PFWG_ROUNDED_SIZE_DISABLE__
+#define __SYCL_PFWG_ROUNDED_SIZE__ 32
   template <typename KernelName, typename KernelType, int Dims>
   void parallel_for_lambda_impl(range<Dims> NumWorkItems,
                                 KernelType KernelFunc) {
@@ -750,7 +752,7 @@ class __SYCL_EXPORT handler {
     // Even for multi-dimensional ranges the total number of work items will
     // be a multiple of the rounding factor since any multiple of a rounded-up
     // first dimension will also be a rounded-up value.
-    constexpr size_t GoodLocalSizeX = 32;
+    constexpr size_t GoodLocalSizeX = __SYCL_PFWG_ROUNDED_SIZE__;
     range<Dims> AdjustedRange = NumWorkItems;
     size_t NewValX = ((NumWorkItems[0] + GoodLocalSizeX - 1) / GoodLocalSizeX) *
                      GoodLocalSizeX;
@@ -773,6 +775,30 @@ class __SYCL_EXPORT handler {
     MCGType = detail::CG::KERNEL;
 #endif
   }
+#else // __SYCL_PFWG_ROUNDED_SIZE_DISABLE__
+  template <typename KernelName, typename KernelType, int Dims>
+  void parallel_for_lambda_impl(range<Dims> NumWorkItems,
+                                KernelType KernelFunc) {
+    throwIfActionIsCreated();
+    using NameT =
+        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
+    using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
+    using TransformedArgType =
+        typename std::conditional<std::is_integral<LambdaArgType>::value &&
+                                      Dims == 1,
+                                  item<Dims>, LambdaArgType>::type;
+#ifdef __SYCL_DEVICE_ONLY__
+    (void)NumWorkItems;
+    kernel_parallel_for<NameT, TransformedArgType>(KernelFunc);
+#else
+    detail::checkValueRange<Dims>(NumWorkItems);
+    MNDRDesc.set(std::move(NumWorkItems));
+    StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
+        std::move(KernelFunc));
+    MCGType = detail::CG::KERNEL;
+#endif
+  }
+#endif // __SYCL_PFWG_ROUNDED_SIZE_DISABLE__
 
   /// Defines and invokes a SYCL kernel function for the specified range.
   ///
diff --git a/sycl/test/basic_tests/free_function_queries/free_function_queries.cpp b/sycl/test/basic_tests/free_function_queries/free_function_queries.cpp
index c410261d06e03..a6f81c98e7a92 100644
--- a/sycl/test/basic_tests/free_function_queries/free_function_queries.cpp
+++ b/sycl/test/basic_tests/free_function_queries/free_function_queries.cpp
@@ -47,7 +47,8 @@ int main() {
 
           auto that_item = sycl::this_item<1>();
           results_acc[1] = that_item.get_id() == i;
-          results_acc[2] = that_item.get_range() == sycl::range<1>(n);
+          results_acc[2] = that_item.get_range() ==
+                           sycl::range<1>(__SYCL_PFWG_ROUNDED_SIZE__);
           acc[i]++;
         });
       });
@@ -81,6 +82,8 @@ int main() {
           auto that_id = sycl::this_id<1>();
           results_acc[0] = i.get_id() == that_id;
           auto that_item = sycl::this_item<1>();
+          i.set_allowed_range(__SYCL_PFWG_ROUNDED_SIZE__);
+          that_item.set_allowed_range(__SYCL_PFWG_ROUNDED_SIZE__);
           results_acc[1] = i == that_item;
           acc[i]++;
         });

From da1a3ab2ffa49f25a06896d6346ca24eb69c9ece Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Mon, 16 Nov 2020 13:59:08 -0800
Subject: [PATCH 05/24] Fixes for set_args and this_item usage, and for review
 comments.

---
 clang/include/clang/Sema/Sema.h             |   6 ++
 clang/lib/Sema/SemaSYCL.cpp                 |  97 ++++++++++++++++++-
 sycl/include/CL/sycl/detail/kernel_desc.hpp |   2 +
 sycl/include/CL/sycl/handler.hpp            | 102 +++++++++++---------
 sycl/include/CL/sycl/id.hpp                 |   6 +-
 sycl/include/CL/sycl/item.hpp               |   5 +-
 sycl/include/CL/sycl/range.hpp              |  11 ++-
 7 files changed, 171 insertions(+), 58 deletions(-)

diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index b1efcf73cf584..4bb5a3716b3c9 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -345,6 +345,9 @@ class SYCLIntegrationHeader {
   /// Registers a specialization constant to emit info for it into the header.
   void addSpecConstant(StringRef IDName, QualType IDType);
 
+  /// Notes that this_item is called within the kernel.
+  void setCallsThisItem(bool B);
+
 private:
   // Kernel actual parameter descriptor.
   struct KernelParamDesc {
@@ -378,6 +381,9 @@ class SYCLIntegrationHeader {
     /// Descriptor of kernel actual parameters.
     SmallVector<KernelParamDesc, 8> Params;
 
+    // Whether kernel calls this_item()
+    bool CallsThisItem;
+
     KernelDesc() = default;
   };
 
diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp
index 71e862eb21ab7..033725e47ef61 100644
--- a/clang/lib/Sema/SemaSYCL.cpp
+++ b/clang/lib/Sema/SemaSYCL.cpp
@@ -510,7 +510,7 @@ class MarkDeviceFunction : public RecursiveASTVisitor<MarkDeviceFunction> {
       FunctionDecl *FD = WorkList.back().first;
       FunctionDecl *ParentFD = WorkList.back().second;
 
-      // To implement rounding-up of a parallel-for range (Jira 20239)
+      // To implement rounding-up of a parallel-for range
       // a kernel call is modified like this:
       // auto Wrapper = [=](TransformedArgType Arg) {
       //  if (Arg[0] >= NumWorkItems[0])
@@ -643,6 +643,40 @@ class FindPFWGLambdaFnVisitor
   const CXXRecordDecl *LambdaObjTy;
 };
 
+// Searches for a call to PF lambda function and captures it.
+class FindPFLambdaFnVisitor
+  : public RecursiveASTVisitor<FindPFLambdaFnVisitor> {
+public:
+  // LambdaObjTy - lambda type of the PF lambda object
+  FindPFLambdaFnVisitor(const CXXRecordDecl* LambdaObjTy)
+    : LambdaFn(nullptr), LambdaObjTy(LambdaObjTy) {}
+
+  bool VisitCallExpr(CallExpr* Call) {
+    auto* M = dyn_cast<CXXMethodDecl>(Call->getDirectCallee());
+    if (!M || (M->getOverloadedOperator() != OO_Call))
+      return true;
+    const int NumPFLambdaArgs = 2; // range and lambda obj
+    if (Call->getNumArgs() != NumPFLambdaArgs)
+      return true;
+    QualType Range = Call->getArg(1)->getType();
+    if (!Util::isSyclType(Range, "id", true /*Tmpl*/) &&
+        !Util::isSyclType(Range, "item", true /*Tmpl*/))
+      return true;
+    if (Call->getArg(0)->getType()->getAsCXXRecordDecl() != LambdaObjTy)
+      return true;
+    LambdaFn = M; // call to PF lambda found - record the lambda
+    return false; // ... and stop searching
+  }
+
+  // Returns the captured lambda function or nullptr;
+  CXXMethodDecl* getLambdaFn() const { return LambdaFn; }
+
+private:
+  CXXMethodDecl* LambdaFn;
+  const CXXRecordDecl* LambdaObjTy;
+};
+
+
 class MarkWIScopeFnVisitor : public RecursiveASTVisitor<MarkWIScopeFnVisitor> {
 public:
   MarkWIScopeFnVisitor(ASTContext &Ctx) : Ctx(Ctx) {}
@@ -2655,13 +2689,61 @@ class SyclKernelIntHeaderCreator : public SyclKernelFieldHandler {
     return !SemaRef.getASTContext().hasSameType(FD->getType(), Ty);
   }
 
+  // Sets a flag if the kernel is a parallel_for that calls the
+  // free function API "this_item".
+  void setThisItemIsCalled(const CXXRecordDecl *KernelObj,
+                           FunctionDecl *KernelFunc) {
+    if (getKernelInvocationKind(KernelFunc) != InvokeParallelFor)
+      return;
+
+    FindPFLambdaFnVisitor V(KernelObj);
+    V.TraverseStmt(KernelFunc->getBody());
+    CXXMethodDecl *WGLambdaFn = V.getLambdaFn();
+    if (!WGLambdaFn)
+      return;
+
+    // The call graph for this translation unit.
+    CallGraph SYCLCG;
+    SYCLCG.addToCallGraph(SemaRef.getASTContext().getTranslationUnitDecl());
+    typedef std::pair<FunctionDecl *, FunctionDecl *> ChildParentPair;
+    llvm::SmallPtrSet<FunctionDecl *, 16> Visited;
+    llvm::SmallVector<ChildParentPair, 16> WorkList;
+    WorkList.push_back({WGLambdaFn, nullptr});
+
+    while (!WorkList.empty()) {
+      FunctionDecl *FD = WorkList.back().first;
+      WorkList.pop_back();
+      if (!Visited.insert(FD).second)
+        continue; // We've already seen this Decl
+
+      if (FD->isFunctionOrMethod() && FD->getIdentifier() &&
+          !FD->getName().empty() && "this_item" == FD->getName()) {
+        Header.setCallsThisItem(true);
+        return;
+      }
+
+      CallGraphNode *N = SYCLCG.getNode(FD);
+      if (!N)
+        continue;
+
+      for (const CallGraphNode *CI : *N) {
+        if (auto *Callee = dyn_cast<FunctionDecl>(CI->getDecl())) {
+          Callee = Callee->getMostRecentDecl();
+          if (!Visited.count(Callee))
+            WorkList.push_back({Callee, FD});
+        }
+      }
+    }
+  }
+
 public:
   static constexpr const bool VisitInsideSimpleContainers = false;
   SyclKernelIntHeaderCreator(Sema &S, SYCLIntegrationHeader &H,
                              const CXXRecordDecl *KernelObj, QualType NameType,
-                             StringRef Name, StringRef StableName)
+                             StringRef Name, StringRef StableName, FunctionDecl* KernelFunc)
       : SyclKernelFieldHandler(S), Header(H) {
     Header.startKernel(Name, NameType, StableName, KernelObj->getLocation());
+    setThisItemIsCalled(KernelObj, KernelFunc);
   }
 
   bool handleSyclAccessorType(const CXXRecordDecl *RD,
@@ -3085,7 +3167,7 @@ void Sema::ConstructOpenCLKernel(FunctionDecl *KernelCallerFunc,
   SyclKernelIntHeaderCreator int_header(
       *this, getSyclIntegrationHeader(), KernelObj,
       calculateKernelNameType(Context, KernelCallerFunc), KernelName,
-      StableName);
+      StableName, KernelCallerFunc);
 
   KernelObjVisitor Visitor{*this};
   Visitor.VisitRecordBases(KernelObj, kernel_decl, kernel_body, int_header);
@@ -3793,6 +3875,9 @@ void SYCLIntegrationHeader::emit(raw_ostream &O) {
     O << "getParamDesc(unsigned i) {\n";
     O << "    return kernel_signatures[i+" << CurStart << "];\n";
     O << "  }\n";
+    O << "  __SYCL_DLL_LOCAL\n";
+    O << "  static constexpr bool callsThisItem() { return ";
+    O << K.CallsThisItem << "; }\n";
     O << "};\n";
     CurStart += N;
   }
@@ -3849,6 +3934,12 @@ void SYCLIntegrationHeader::addSpecConstant(StringRef IDName, QualType IDType) {
   SpecConsts.emplace_back(std::make_pair(IDType, IDName.str()));
 }
 
+void SYCLIntegrationHeader::setCallsThisItem(bool B) {
+  auto *K = getCurKernelDesc();
+  assert(K && "no kernels");
+  K->CallsThisItem = B;
+}
+
 SYCLIntegrationHeader::SYCLIntegrationHeader(DiagnosticsEngine &_Diag,
                                              bool _UnnamedLambdaSupport,
                                              Sema &_S)
diff --git a/sycl/include/CL/sycl/detail/kernel_desc.hpp b/sycl/include/CL/sycl/detail/kernel_desc.hpp
index a7f73d9d19f70..a5153c1fa4da0 100644
--- a/sycl/include/CL/sycl/detail/kernel_desc.hpp
+++ b/sycl/include/CL/sycl/detail/kernel_desc.hpp
@@ -56,6 +56,7 @@ template <class KernelNameType> struct KernelInfo {
     return Dummy;
   }
   static constexpr const char *getName() { return ""; }
+  static bool callsThisItem() { return false; }
 };
 #else
 template <char...> struct KernelInfoData {
@@ -65,6 +66,7 @@ template <char...> struct KernelInfoData {
     return Dummy;
   }
   static constexpr const char *getName() { return ""; }
+  static bool callsThisItem() { return false; }
 };
 
 // C++14 like index_sequence and make_index_sequence
diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index b05cbc32b470b..f5d6993398665 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -121,10 +121,10 @@ template <typename Type> struct get_kernel_name_t<detail::auto_name, Type> {
 };
 
 // Used when parallel_for range is rounded-up.
-template <typename Type> class __pf_wrapper_kernel;
+template <typename Type> class __pf_kernel_wrapper;
 
 template <typename Type> struct get_kernel_wrapper_name_t {
-  using name = __pf_wrapper_kernel<
+  using name = __pf_kernel_wrapper<
       typename get_kernel_name_t<detail::auto_name, Type>::name>;
 };
 
@@ -732,8 +732,6 @@ class __SYCL_EXPORT handler {
   ///
   /// \param NumWorkItems is a range defining indexing space.
   /// \param KernelFunc is a SYCL kernel function.
-#ifndef __SYCL_PFWG_ROUNDED_SIZE_DISABLE__
-#define __SYCL_PFWG_ROUNDED_SIZE__ 32
   template <typename KernelName, typename KernelType, int Dims>
   void parallel_for_lambda_impl(range<Dims> NumWorkItems,
                                 KernelType KernelFunc) {
@@ -748,57 +746,65 @@ class __SYCL_EXPORT handler {
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
 
-    // Round-up just the first dimension of the range.
-    // Even for multi-dimensional ranges the total number of work items will
-    // be a multiple of the rounding factor since any multiple of a rounded-up
-    // first dimension will also be a rounded-up value.
-    constexpr size_t GoodLocalSizeX = __SYCL_PFWG_ROUNDED_SIZE__;
-    range<Dims> AdjustedRange = NumWorkItems;
-    size_t NewValX = ((NumWorkItems[0] + GoodLocalSizeX - 1) / GoodLocalSizeX) *
-                     GoodLocalSizeX;
-    AdjustedRange.set_range(NewValX);
-
-    auto Wrapper = [=](TransformedArgType Arg) {
-      if (Arg[0] >= NumWorkItems[0])
-        return;
-      Arg.set_allowed_range(NumWorkItems);
-      KernelFunc(Arg);
-    };
-
+    // A reasonable choice for rounding up the range is 32.
+    constexpr size_t GoodLocalSizeX = 32;
+
+    // Disable the rounding-up optimizations under these conditions:
+    // 1. The env var SYCL_OPT_PFWGS_DISABLE is set
+    // 2. When the string SYCL_OPT_PFWGS_DISABLE is in the kernel name.
+    // 3. The kernel is created and invoked without an integration header entry.
+    // 4. The API "this_item" is used inside the kernel.
+    // 5. The range is already a multiple of the rounding factor.
+
+    // Get the kernal name to check condition 3.
+    std::string KName = typeid(NameT *).name();
+    using KI = detail::KernelInfo<KernelName>;
+    bool DisableRounding =
+        (getenv("SYCL_OPT_PFWGS_DISABLE") != nullptr) ||
+        (KName.find("SYCL_OPT_PFWGS_DISABLE") != std::string::npos) ||
+        (KI::getName() == nullptr || KI::getName()[0] == '\0') ||
+        (KI::callsThisItem());
+
+    if (!DisableRounding && NumWorkItems[0] % GoodLocalSizeX != 0) {
+      // Range is not a multiple and rounding-up is allowed
+      size_t NewValX =
+          ((NumWorkItems[0] + GoodLocalSizeX - 1) / GoodLocalSizeX) *
+          GoodLocalSizeX;
+      using NameWT = typename detail::get_kernel_wrapper_name_t<NameT>::name;
+      if (getenv("SYCL_OPT_PFWGS_TRACE") != nullptr)
+        std::cerr << "***** Adjusted size from " << NumWorkItems[0] << " to "
+                  << NewValX << " *****\n";
+      auto Wrapper = [=](TransformedArgType Arg) {
+        if (Arg[0] >= NumWorkItems[0])
+          return;
+        Arg.set_allowed_range(NumWorkItems);
+        KernelFunc(Arg);
+      };
+
+      range<Dims> AdjustedRange = NumWorkItems;
+      AdjustedRange.set_range(NewValX);
 #ifdef __SYCL_DEVICE_ONLY__
-    kernel_parallel_for<NameT, TransformedArgType>(Wrapper);
+      kernel_parallel_for<NameWT, TransformedArgType>(Wrapper);
 #else
-    detail::checkValueRange<Dims>(AdjustedRange);
-    MNDRDesc.set(std::move(AdjustedRange));
-    StoreLambda<NameT, decltype(Wrapper), Dims, TransformedArgType>(
-        std::move(Wrapper));
-    MCGType = detail::CG::KERNEL;
+      detail::checkValueRange<Dims>(AdjustedRange);
+      MNDRDesc.set(std::move(AdjustedRange));
+      StoreLambda<NameWT, decltype(Wrapper), Dims, TransformedArgType>(
+          std::move(Wrapper));
+      MCGType = detail::CG::KERNEL;
 #endif
-  }
-#else // __SYCL_PFWG_ROUNDED_SIZE_DISABLE__
-  template <typename KernelName, typename KernelType, int Dims>
-  void parallel_for_lambda_impl(range<Dims> NumWorkItems,
-                                KernelType KernelFunc) {
-    throwIfActionIsCreated();
-    using NameT =
-        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
-    using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
-    using TransformedArgType =
-        typename std::conditional<std::is_integral<LambdaArgType>::value &&
-                                      Dims == 1,
-                                  item<Dims>, LambdaArgType>::type;
+    } else {
 #ifdef __SYCL_DEVICE_ONLY__
-    (void)NumWorkItems;
-    kernel_parallel_for<NameT, TransformedArgType>(KernelFunc);
+      (void)NumWorkItems;
+      kernel_parallel_for<NameT, TransformedArgType>(KernelFunc);
 #else
-    detail::checkValueRange<Dims>(NumWorkItems);
-    MNDRDesc.set(std::move(NumWorkItems));
-    StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
-        std::move(KernelFunc));
-    MCGType = detail::CG::KERNEL;
+      detail::checkValueRange<Dims>(NumWorkItems);
+      MNDRDesc.set(std::move(NumWorkItems));
+      StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
+          std::move(KernelFunc));
+      MCGType = detail::CG::KERNEL;
 #endif
+    }
   }
-#endif // __SYCL_PFWG_ROUNDED_SIZE_DISABLE__
 
   /// Defines and invokes a SYCL kernel function for the specified range.
   ///
diff --git a/sycl/include/CL/sycl/id.hpp b/sycl/include/CL/sycl/id.hpp
index 79f35aca19c40..151657aa661e8 100644
--- a/sycl/include/CL/sycl/id.hpp
+++ b/sycl/include/CL/sycl/id.hpp
@@ -94,8 +94,6 @@ template <int dimensions = 1> class id : public detail::array<dimensions> {
     return result;
   }
 
-  void set_allowed_range(range<dimensions> rnwi) { (void)rnwi[0]; }
-
 #ifndef __SYCL_DISABLE_ID_TO_INT_CONV__
   /* Template operator is not allowed because it disables further type
    * conversion. For example, the next code will not work in case of template
@@ -241,6 +239,10 @@ template <int dimensions = 1> class id : public detail::array<dimensions> {
   __SYCL_GEN_OPT(^=)
 
 #undef __SYCL_GEN_OPT
+
+private:
+  friend class handler;
+  void set_allowed_range(range<dimensions> rnwi) { (void)rnwi[0]; }
 };
 
 namespace detail {
diff --git a/sycl/include/CL/sycl/item.hpp b/sycl/include/CL/sycl/item.hpp
index a5409794dfdcf..a8aa9c8ef09f5 100644
--- a/sycl/include/CL/sycl/item.hpp
+++ b/sycl/include/CL/sycl/item.hpp
@@ -104,8 +104,6 @@ template <int dimensions = 1, bool with_offset = true> class item {
 
   bool operator!=(const item &rhs) const { return rhs.MImpl != MImpl; }
 
-  void set_allowed_range(const range<dimensions> rnwi) { MImpl.MExtent = rnwi; }
-
 protected:
   template <bool has_offset = with_offset>
   item(detail::enable_if_t<has_offset, const range<dimensions>> &extent,
@@ -120,6 +118,9 @@ template <int dimensions = 1, bool with_offset = true> class item {
   friend class detail::Builder;
 
 private:
+  friend class handler;
+  void set_allowed_range(const range<dimensions> rnwi) { MImpl.MExtent = rnwi; }
+
   detail::ItemBase<dimensions, with_offset> MImpl;
 };
 
diff --git a/sycl/include/CL/sycl/range.hpp b/sycl/include/CL/sycl/range.hpp
index 8a92bd2c32c4d..3f08a67e7c207 100644
--- a/sycl/include/CL/sycl/range.hpp
+++ b/sycl/include/CL/sycl/range.hpp
@@ -8,6 +8,7 @@
 
 #pragma once
 #include <CL/sycl/detail/array.hpp>
+#include <CL/sycl/detail/helpers.hpp>
 
 #include <stdexcept>
 #include <type_traits>
@@ -62,9 +63,6 @@ template <int dimensions = 1> class range : public detail::array<dimensions> {
     return size;
   }
 
-  // Adjust the first dim of the range
-  void set_range(const size_t dim0) { this->common_array[0] = dim0; }
-
   range(const range<dimensions> &rhs) = default;
   range(range<dimensions> &&rhs) = default;
   range<dimensions> &operator=(const range<dimensions> &rhs) = default;
@@ -144,6 +142,13 @@ template <int dimensions = 1> class range : public detail::array<dimensions> {
   __SYCL_GEN_OPT(^=)
 
 #undef __SYCL_GEN_OPT
+
+private:
+  friend class handler;
+  friend class detail::Builder;
+
+  // Adjust the first dim of the range
+  void set_range(const size_t dim0) { this->common_array[0] = dim0; }
 };
 
 #ifdef __cpp_deduction_guides

From eaacd8a0faba0e5d6175e1de667efccad6ab43d3 Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Mon, 16 Nov 2020 15:19:28 -0800
Subject: [PATCH 06/24] Formatting changes.

---
 clang/lib/Sema/SemaSYCL.cpp | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp
index 69d53b0074c58..2f7bb6f7669fd 100644
--- a/clang/lib/Sema/SemaSYCL.cpp
+++ b/clang/lib/Sema/SemaSYCL.cpp
@@ -659,14 +659,14 @@ class FindPFWGLambdaFnVisitor
 
 // Searches for a call to PF lambda function and captures it.
 class FindPFLambdaFnVisitor
-  : public RecursiveASTVisitor<FindPFLambdaFnVisitor> {
+    : public RecursiveASTVisitor<FindPFLambdaFnVisitor> {
 public:
   // LambdaObjTy - lambda type of the PF lambda object
-  FindPFLambdaFnVisitor(const CXXRecordDecl* LambdaObjTy)
-    : LambdaFn(nullptr), LambdaObjTy(LambdaObjTy) {}
+  FindPFLambdaFnVisitor(const CXXRecordDecl *LambdaObjTy)
+      : LambdaFn(nullptr), LambdaObjTy(LambdaObjTy) {}
 
-  bool VisitCallExpr(CallExpr* Call) {
-    auto* M = dyn_cast<CXXMethodDecl>(Call->getDirectCallee());
+  bool VisitCallExpr(CallExpr *Call) {
+    auto *M = dyn_cast<CXXMethodDecl>(Call->getDirectCallee());
     if (!M || (M->getOverloadedOperator() != OO_Call))
       return true;
     const int NumPFLambdaArgs = 2; // range and lambda obj
@@ -683,11 +683,11 @@ class FindPFLambdaFnVisitor
   }
 
   // Returns the captured lambda function or nullptr;
-  CXXMethodDecl* getLambdaFn() const { return LambdaFn; }
+  CXXMethodDecl *getLambdaFn() const { return LambdaFn; }
 
 private:
-  CXXMethodDecl* LambdaFn;
-  const CXXRecordDecl* LambdaObjTy;
+  CXXMethodDecl *LambdaFn;
+  const CXXRecordDecl *LambdaObjTy;
 };
 
 
@@ -2754,7 +2754,8 @@ class SyclKernelIntHeaderCreator : public SyclKernelFieldHandler {
   static constexpr const bool VisitInsideSimpleContainers = false;
   SyclKernelIntHeaderCreator(Sema &S, SYCLIntegrationHeader &H,
                              const CXXRecordDecl *KernelObj, QualType NameType,
-                             StringRef Name, StringRef StableName, FunctionDecl* KernelFunc)
+                             StringRef Name, StringRef StableName,
+                             FunctionDecl *KernelFunc)
       : SyclKernelFieldHandler(S), Header(H) {
     Header.startKernel(Name, NameType, StableName, KernelObj->getLocation());
     setThisItemIsCalled(KernelObj, KernelFunc);

From 535745fbe80113f3efaf869725f535e174434120 Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Mon, 16 Nov 2020 15:24:27 -0800
Subject: [PATCH 07/24] Formatting change.

---
 clang/lib/Sema/SemaSYCL.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp
index 2f7bb6f7669fd..4460581df7ef5 100644
--- a/clang/lib/Sema/SemaSYCL.cpp
+++ b/clang/lib/Sema/SemaSYCL.cpp
@@ -690,7 +690,6 @@ class FindPFLambdaFnVisitor
   const CXXRecordDecl *LambdaObjTy;
 };
 
-
 class MarkWIScopeFnVisitor : public RecursiveASTVisitor<MarkWIScopeFnVisitor> {
 public:
   MarkWIScopeFnVisitor(ASTContext &Ctx) : Ctx(Ctx) {}

From 5c6c841ee44231fdf8e5511433bd58970f05cc3c Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Mon, 16 Nov 2020 15:46:37 -0800
Subject: [PATCH 08/24] Removed unneeded files.

---
 .../free_function_queries.cpp                 | 173 -------------
 .../basic_tests/handler/handler_set_args.cpp  | 230 ------------------
 2 files changed, 403 deletions(-)
 delete mode 100644 sycl/test/basic_tests/free_function_queries/free_function_queries.cpp
 delete mode 100644 sycl/test/basic_tests/handler/handler_set_args.cpp

diff --git a/sycl/test/basic_tests/free_function_queries/free_function_queries.cpp b/sycl/test/basic_tests/free_function_queries/free_function_queries.cpp
deleted file mode 100644
index c410261d06e03..0000000000000
--- a/sycl/test/basic_tests/free_function_queries/free_function_queries.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
-// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
-// RUN: %ACC_RUN_PLACEHOLDER %t.out
-// RUN: %CPU_RUN_PLACEHOLDER %t.out
-// RUN: %GPU_RUN_PLACEHOLDER %t.out
-
-// TODO: Support global work offset on Level Zero.
-// XFAIL: level_zero
-
-//==- free_function_queries.cpp - SYCL free function queries test -=//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===------------------------------------------------------------------------===//
-
-#include <CL/sycl.hpp>
-
-#include <cassert>
-#include <iostream>
-#include <type_traits>
-
-int main() {
-  constexpr std::size_t n = 10;
-
-  int data[n]{};
-  int counter{0};
-
-  {
-    constexpr int checks_number{3};
-    int results[checks_number]{};
-    {
-      sycl::buffer<int> buf(data, sycl::range<1>(n));
-      sycl::buffer<int> results_buf(results, sycl::range<1>(checks_number));
-      sycl::queue q;
-      q.submit([&](cl::sycl::handler &cgh) {
-        sycl::accessor<int, 1, sycl::access::mode::write,
-                       sycl::access::target::global_buffer>
-            acc(buf.get_access<sycl::access::mode::write>(cgh));
-        sycl::accessor<int, 1, sycl::access::mode::write,
-                       sycl::access::target::global_buffer>
-            results_acc(results_buf.get_access<sycl::access::mode::write>(cgh));
-        cgh.parallel_for<class IdTest>(n, [=](sycl::id<1> i) {
-          auto that_id = sycl::this_id<1>();
-          results_acc[0] = that_id == i;
-
-          auto that_item = sycl::this_item<1>();
-          results_acc[1] = that_item.get_id() == i;
-          results_acc[2] = that_item.get_range() == sycl::range<1>(n);
-          acc[i]++;
-        });
-      });
-    }
-    ++counter;
-    for (int i = 0; i < n; i++) {
-      assert(data[i] == counter);
-    }
-    for (auto val : results) {
-      assert(val == 1);
-    }
-  }
-
-  {
-    constexpr int checks_number{2};
-    int results[checks_number]{};
-    {
-      sycl::buffer<int> buf(data, sycl::range<1>(n));
-      sycl::buffer<int> results_buf(results, sycl::range<1>(checks_number));
-      sycl::queue q;
-      q.submit([&](cl::sycl::handler &cgh) {
-        sycl::accessor<int, 1, sycl::access::mode::write,
-                       sycl::access::target::global_buffer>
-            acc(buf.get_access<sycl::access::mode::write>(cgh));
-        sycl::accessor<int, 1, sycl::access::mode::write,
-                       sycl::access::target::global_buffer>
-            results_acc(results_buf.get_access<sycl::access::mode::write>(cgh));
-        cgh.parallel_for<class ItemTest>(n, [=](auto i) {
-          static_assert(std::is_same<decltype(i), sycl::item<1>>::value,
-                        "lambda arg type is unexpected");
-          auto that_id = sycl::this_id<1>();
-          results_acc[0] = i.get_id() == that_id;
-          auto that_item = sycl::this_item<1>();
-          results_acc[1] = i == that_item;
-          acc[i]++;
-        });
-      });
-    }
-    ++counter;
-    for (int i = 0; i < n; i++) {
-      assert(data[i] == counter);
-    }
-    for (auto val : results) {
-      assert(val == 1);
-    }
-  }
-
-  {
-    constexpr int checks_number{2};
-    int results[checks_number]{};
-    {
-      sycl::buffer<int> buf(data, sycl::range<1>(n));
-      sycl::buffer<int> results_buf(results, sycl::range<1>(checks_number));
-      sycl::queue q;
-      sycl::id<1> offset(1);
-      q.submit([&](cl::sycl::handler &cgh) {
-        sycl::accessor<int, 1, sycl::access::mode::write,
-                       sycl::access::target::global_buffer>
-            acc(buf.get_access<sycl::access::mode::write>(cgh));
-        sycl::accessor<int, 1, sycl::access::mode::write,
-                       sycl::access::target::global_buffer>
-            results_acc(results_buf.get_access<sycl::access::mode::write>(cgh));
-        cgh.parallel_for<class ItemOffsetTest>(
-            sycl::range<1>{n}, offset, [=](sycl::item<1, true> i) {
-              auto that_id = sycl::this_id<1>();
-              results_acc[0] = i.get_id() == that_id;
-              auto that_item = sycl::this_item<1>();
-              results_acc[1] = i == that_item;
-              acc[that_item.get_linear_id()]++;
-            });
-      });
-    }
-    ++counter;
-    for (int i = 0; i < n; i++) {
-      assert(data[i] == counter);
-    }
-    for (auto val : results) {
-      assert(val == 1);
-    }
-  }
-
-  {
-    constexpr int checks_number{5};
-    int results[checks_number]{};
-    {
-      sycl::buffer<int> buf(data, sycl::range<1>(n));
-      sycl::buffer<int> results_buf(results, sycl::range<1>(checks_number));
-      sycl::queue q;
-      sycl::nd_range<1> NDR(sycl::range<1>{n}, sycl::range<1>{2});
-      q.submit([&](cl::sycl::handler &cgh) {
-        sycl::accessor<int, 1, sycl::access::mode::write,
-                       sycl::access::target::global_buffer>
-            acc(buf.get_access<sycl::access::mode::write>(cgh));
-        sycl::accessor<int, 1, sycl::access::mode::write,
-                       sycl::access::target::global_buffer>
-            results_acc(results_buf.get_access<sycl::access::mode::write>(cgh));
-        cgh.parallel_for<class NdItemTest>(NDR, [=](auto nd_i) {
-          static_assert(std::is_same<decltype(nd_i), sycl::nd_item<1>>::value,
-                        "lambda arg type is unexpected");
-          auto that_nd_item = sycl::this_nd_item<1>();
-          results_acc[0] = that_nd_item == nd_i;
-          auto nd_item_group = that_nd_item.get_group();
-          results_acc[1] = nd_item_group == sycl::this_group<1>();
-          auto nd_item_id = that_nd_item.get_global_id();
-          results_acc[2] = nd_item_id == sycl::this_id<1>();
-          auto that_item = sycl::this_item<1>();
-          results_acc[3] = nd_item_id == that_item.get_id();
-          results_acc[4] =
-              that_nd_item.get_global_range() == that_item.get_range();
-
-          acc[that_nd_item.get_global_id(0)]++;
-        });
-      });
-    }
-    ++counter;
-    for (int i = 0; i < n; i++) {
-      assert(data[i] == counter);
-    }
-    for (auto val : results) {
-      assert(val == 1);
-    }
-  }
-}
diff --git a/sycl/test/basic_tests/handler/handler_set_args.cpp b/sycl/test/basic_tests/handler/handler_set_args.cpp
deleted file mode 100644
index 6e15ecc6fa836..0000000000000
--- a/sycl/test/basic_tests/handler/handler_set_args.cpp
+++ /dev/null
@@ -1,230 +0,0 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
-// RUN: %CPU_RUN_PLACEHOLDER %t.out
-// RUN: %GPU_RUN_PLACEHOLDER %t.out
-// RUN: %ACC_RUN_PLACEHOLDER %t.out
-//==--------------- handler_set_args.cpp -------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#include <CL/sycl.hpp>
-#include <cassert>
-
-constexpr bool UseOffset = true;
-constexpr bool NoOffset = false;
-const cl::sycl::range<1> Range = 1;
-
-using AccessorT = cl::sycl::accessor<int, 1, cl::sycl::access::mode::read_write,
-                                     cl::sycl::access::target::global_buffer>;
-
-struct SingleTaskFunctor {
-  SingleTaskFunctor(AccessorT acc) : MAcc(acc) {}
-
-  void operator()() const { MAcc[0] = 10; }
-
-  AccessorT MAcc;
-};
-
-template <bool useOffset> struct ParallelForRangeIdFunctor {
-  ParallelForRangeIdFunctor(AccessorT acc) : MAcc(acc) {}
-
-  void operator()(cl::sycl::id<1> id) const { MAcc[0] = 10; }
-
-  AccessorT MAcc;
-};
-
-template <bool useOffset> struct ParallelForRangeItemFunctor {
-  ParallelForRangeItemFunctor(AccessorT acc) : MAcc(acc) {}
-
-  void operator()(cl::sycl::item<1> item) const { MAcc[0] = 10; }
-
-  AccessorT MAcc;
-};
-
-struct ParallelForNdRangeFunctor {
-  ParallelForNdRangeFunctor(AccessorT acc) : MAcc(acc) {}
-
-  void operator()(cl::sycl::nd_item<1> ndItem) const { MAcc[0] = 10; }
-
-  AccessorT MAcc;
-};
-
-template <class kernel_name>
-cl::sycl::kernel getPrebuiltKernel(cl::sycl::queue &queue) {
-  cl::sycl::program program(queue.get_context());
-  program.build_with_kernel_type<kernel_name>();
-  return program.get_kernel<kernel_name>();
-}
-
-template <class kernel_wrapper>
-void checkApiCall(cl::sycl::queue &queue, kernel_wrapper &&kernelWrapper) {
-  int result = 0;
-  {
-    auto buf = cl::sycl::buffer<int, 1>(&result, Range);
-    queue.submit([&](cl::sycl::handler &cgh) {
-      auto acc = buf.get_access<cl::sycl::access::mode::read_write>(cgh);
-      kernelWrapper(cgh, acc);
-    });
-  }
-  assert(result == 10);
-}
-
-int main() {
-  cl::sycl::queue Queue;
-  const cl::sycl::id<1> Offset(0);
-  const cl::sycl::nd_range<1> NdRange(Range, Range);
-
-  checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) {
-    cgh.single_task(SingleTaskFunctor(acc));
-  });
-
-  checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) {
-    cgh.parallel_for(Range, ParallelForRangeIdFunctor<NoOffset>(acc));
-  });
-
-  checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) {
-    cgh.parallel_for(Range, Offset, ParallelForRangeIdFunctor<UseOffset>(acc));
-  });
-
-  checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) {
-    cgh.parallel_for(Range, ParallelForRangeItemFunctor<NoOffset>(acc));
-  });
-
-  checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) {
-    cgh.parallel_for(Range, Offset,
-                     ParallelForRangeItemFunctor<UseOffset>(acc));
-  });
-
-  checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) {
-    cgh.parallel_for(NdRange, ParallelForNdRangeFunctor(acc));
-  });
-
-  {
-    auto preBuiltKernel = getPrebuiltKernel<SingleTaskFunctor>(Queue);
-
-    checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) {
-      cgh.set_args(acc);
-      cgh.single_task(preBuiltKernel);
-    });
-  }
-
-  {
-    auto preBuiltKernel =
-        getPrebuiltKernel<ParallelForRangeIdFunctor<NoOffset>>(Queue);
-
-    checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) {
-      cgh.set_args(acc);
-      cgh.parallel_for(Range, preBuiltKernel);
-    });
-  }
-
-  {
-    auto preBuiltKernel =
-        getPrebuiltKernel<ParallelForRangeIdFunctor<UseOffset>>(Queue);
-
-    checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) {
-      cgh.set_args(acc);
-      cgh.parallel_for(Range, Offset, preBuiltKernel);
-    });
-  }
-
-  {
-    auto preBuiltKernel =
-        getPrebuiltKernel<ParallelForRangeItemFunctor<NoOffset>>(Queue);
-
-    checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) {
-      cgh.set_args(acc);
-      cgh.parallel_for(Range, preBuiltKernel);
-    });
-  }
-
-  {
-    auto preBuiltKernel =
-        getPrebuiltKernel<ParallelForRangeItemFunctor<UseOffset>>(Queue);
-
-    checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) {
-      cgh.set_args(acc);
-      cgh.parallel_for(Range, Offset, preBuiltKernel);
-    });
-  }
-
-  {
-    auto preBuiltKernel = getPrebuiltKernel<ParallelForNdRangeFunctor>(Queue);
-
-    checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) {
-      cgh.set_args(acc);
-      cgh.parallel_for(NdRange, preBuiltKernel);
-    });
-  }
-
-  {
-    auto preBuiltKernel = getPrebuiltKernel<SingleTaskFunctor>(Queue);
-
-    checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) {
-      cgh.set_args(acc);
-      cgh.single_task<class OtherKernelName1>(preBuiltKernel,
-                                              [=]() { acc[0] = 10; });
-    });
-  }
-
-  {
-    auto preBuiltKernel =
-        getPrebuiltKernel<ParallelForRangeIdFunctor<NoOffset>>(Queue);
-
-    checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) {
-      cgh.set_args(acc);
-      cgh.parallel_for<class OtherKernelName2>(
-          preBuiltKernel, Range, [=](cl::sycl::id<1> id) { acc[0] = 10; });
-    });
-  }
-
-  {
-    auto preBuiltKernel =
-        getPrebuiltKernel<ParallelForRangeIdFunctor<UseOffset>>(Queue);
-
-    checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) {
-      cgh.set_args(acc);
-      cgh.parallel_for<class OtherKernelName3>(
-          preBuiltKernel, Range, Offset,
-          [=](cl::sycl::id<1> id) { acc[0] = 10; });
-    });
-  }
-
-  {
-    auto preBuiltKernel =
-        getPrebuiltKernel<ParallelForRangeItemFunctor<NoOffset>>(Queue);
-
-    checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) {
-      cgh.set_args(acc);
-      cgh.parallel_for<class OtherKernelName4>(
-          preBuiltKernel, Range, [=](cl::sycl::item<1> item) { acc[0] = 10; });
-    });
-  }
-
-  {
-    auto preBuiltKernel =
-        getPrebuiltKernel<ParallelForRangeItemFunctor<UseOffset>>(Queue);
-
-    checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) {
-      cgh.set_args(acc);
-      cgh.parallel_for<class OtherKernelName5>(
-          preBuiltKernel, Range, Offset,
-          [=](cl::sycl::item<1> item) { acc[0] = 10; });
-    });
-  }
-
-  {
-    auto preBuiltKernel = getPrebuiltKernel<ParallelForNdRangeFunctor>(Queue);
-
-    checkApiCall(Queue, [&](cl::sycl::handler &cgh, AccessorT acc) {
-      cgh.set_args(acc);
-      cgh.parallel_for<class OtherKernelName6>(
-          preBuiltKernel, NdRange,
-          [=](cl::sycl::nd_item<1> ndItem) { acc[0] = 10; });
-    });
-  }
-
-  return 0;
-}

From dc20fa1e1a56deb4920a6e834f637246f2e00a65 Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Mon, 16 Nov 2020 17:34:55 -0800
Subject: [PATCH 09/24] Added some comments requested by reviewers.

---
 sycl/include/CL/sycl/handler.hpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index 16711e3fde354..d99c196fbae0c 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -765,8 +765,13 @@ class __SYCL_EXPORT handler {
         (KI::getName() == nullptr || KI::getName()[0] == '\0') ||
         (KI::callsThisItem());
 
+    // Perform range rounding if rounding-up is enabled
+    // and the user-specified range is not a multiple of a "good" value.
     if (!DisableRounding && NumWorkItems[0] % GoodLocalSizeX != 0) {
-      // Range is not a multiple and rounding-up is allowed
+      // It is sufficient to round up just the first dimension.
+      // Multiplying the rounded-up value of the first dimension
+      // by the values of the remaining dimensions (if any)
+      // will yield a rounded-up value for the total range.
       size_t NewValX =
           ((NumWorkItems[0] + GoodLocalSizeX - 1) / GoodLocalSizeX) *
           GoodLocalSizeX;

From d62e2f1162078ddfe724886eec85775e6e1893af Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Tue, 17 Nov 2020 14:32:58 -0800
Subject: [PATCH 10/24] Added a test for integration header and one execution
 test.

---
 clang/test/CodeGenSYCL/Inputs/sycl.hpp        |  12 ++
 .../CodeGenSYCL/parallel_for_this_item.cpp    |  53 ++++++
 .../parallel_for_range_roundup.cpp            | 153 ++++++++++++++++++
 3 files changed, 218 insertions(+)
 create mode 100755 clang/test/CodeGenSYCL/parallel_for_this_item.cpp
 create mode 100755 sycl/test/basic_tests/parallel_for_range_roundup.cpp

diff --git a/clang/test/CodeGenSYCL/Inputs/sycl.hpp b/clang/test/CodeGenSYCL/Inputs/sycl.hpp
index 402663eafb595..a0f774ad8d243 100644
--- a/clang/test/CodeGenSYCL/Inputs/sycl.hpp
+++ b/clang/test/CodeGenSYCL/Inputs/sycl.hpp
@@ -118,6 +118,18 @@ struct id {
   int Data;
 };
 
+template <int dim> struct item {
+  template <typename... T>
+  item(T... args) {} // fake constructor
+private:
+  // Some fake field added to see using of item arguments in the
+  // kernel wrapper
+  int Data;
+};
+
+template <int Dims> item<Dims>
+this_item() { return item<Dims>{}; }
+
 template <int dim>
 struct range {
   template <typename... T>
diff --git a/clang/test/CodeGenSYCL/parallel_for_this_item.cpp b/clang/test/CodeGenSYCL/parallel_for_this_item.cpp
new file mode 100755
index 0000000000000..dbb62b52b22dd
--- /dev/null
+++ b/clang/test/CodeGenSYCL/parallel_for_this_item.cpp
@@ -0,0 +1,53 @@
+// RUN: %clang_cc1 -fsycl -fsycl-is-device -triple spir64-unknown-unknown-sycldevice -fsycl-int-header=%t.h %s -fsyntax-only
+// RUN: FileCheck -input-file=%t.h %s
+
+// This test checks that compiler generates correct kernel description
+// for parallel_for kernels that use the this_item API.
+
+// CHECK: __SYCL_INLINE_NAMESPACE(cl) {
+// CHECK-NEXT: namespace sycl {
+// CHECK-NEXT: namespace detail {
+
+// CHECK: static constexpr
+// CHECK-NEXT: const char* const kernel_names[] = {
+// CHECK-NEXT:   "_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE3GNU",
+// CHECK-NEXT:   "_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE3EMU",
+// CHECK-NEXT:   "_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE3OWL",
+// CHECK-NEXT:   "_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE3RAT"
+// CHECK-NEXT: };
+
+// CHECK: template <> struct KernelInfo<class GNU> {
+// CHECK: __SYCL_DLL_LOCAL
+// CHECK_NEXT: static constexpr bool callsThisItem() { return 0; }
+
+// CHECK: template <> struct KernelInfo<class EMU> {
+// CHECK: __SYCL_DLL_LOCAL
+// CHECK_NEXT: static constexpr bool callsThisItem() { return 1; }
+
+// CHECK: template <> struct KernelInfo<class OWL> {
+// CHECK: __SYCL_DLL_LOCAL
+// CHECK_NEXT: static constexpr bool callsThisItem() { return 0; }
+
+// CHECK: template <> struct KernelInfo<class RAT> {
+// CHECK: __SYCL_DLL_LOCAL
+// CHECK_NEXT: static constexpr bool callsThisItem() { return 1; }
+
+#include "Inputs/sycl.hpp"
+
+int main() {
+  cl::sycl::queue myQueue;
+  myQueue.submit([&](cl::sycl::handler &cgh) {
+    cgh.parallel_for<class GNU>(cl::sycl::range<1>(1),
+                                [=](cl::sycl::item<1> I) {});
+    cgh.parallel_for<class EMU>(
+        cl::sycl::range<1>(1),
+        [=](cl::sycl::item<1> I) { cl::sycl::this_item<1>(); });
+    cgh.parallel_for<class OWL>(cl::sycl::range<1>(1),
+                                [=](cl::sycl::id<1> I) {});
+    cgh.parallel_for<class RAT>(cl::sycl::range<1>(1), [=](cl::sycl::id<1> I) {
+      cl::sycl::this_item<1>();
+    });
+  });
+
+  return 0;
+}
diff --git a/sycl/test/basic_tests/parallel_for_range_roundup.cpp b/sycl/test/basic_tests/parallel_for_range_roundup.cpp
new file mode 100755
index 0000000000000..52519abb8e0f0
--- /dev/null
+++ b/sycl/test/basic_tests/parallel_for_range_roundup.cpp
@@ -0,0 +1,153 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+#include <CL/sycl.hpp>
+
+using namespace sycl;
+
+class PF_init_item;
+class PF_init_id;
+class PF_init_id1;
+
+struct SizesInfo {
+  range<1> ItemGlobalSize = {0};
+  range<1> RealGlobalSizeX = {0};
+  range<1> RealGlobalSizeY = {0};
+  range<1> RealGlobalSizeZ = {0};
+};
+
+void check(const char *msg, size_t v, size_t ref) {
+  std::cout << msg << v << std::endl;
+  assert(v == ref);
+}
+
+int try_item(size_t size) {
+  size_t RoundedUpSize = (size + 32 - 1) / 32 * 32;
+  SizesInfo SInfo;
+  range<1> Size{size};
+  int Counter = 0;
+  bool OnGpu;
+
+  {
+    buffer<SizesInfo, 1> BufSizes(&SInfo, 1);
+    buffer<int, 1> BufCounter(&Counter, Size);
+    queue myQueue;
+    OnGpu = myQueue.get_device().is_gpu();
+
+    myQueue.submit([&](handler &cgh) {
+      auto AccSizes = BufSizes.get_access<access::mode::read_write>(cgh);
+      auto AccCounter = BufCounter.get_access<access::mode::atomic>(cgh);
+
+      cgh.parallel_for<PF_init_item>(Size, [=](item<1> ITEM) {
+        // cgh.parallel_for<PF_init_item>(Size, [=](int ITEM) {
+        AccCounter[0].fetch_add(1);
+        AccSizes[0].ItemGlobalSize = ITEM.get_range(0);
+#ifdef __SYCL_DEVICE_ONLY__
+        AccSizes[0].RealGlobalSizeX = {__spirv_GlobalSize_x()};
+        AccSizes[0].RealGlobalSizeY = {__spirv_GlobalSize_y()};
+        AccSizes[0].RealGlobalSizeZ = {__spirv_GlobalSize_z()};
+#endif // __SYCL_DEVICE_ONLY__
+      });
+    });
+  }
+
+  std::cout << std::endl;
+  if (OnGpu) {
+    std::cout << "Ran on GPU" << std::endl;
+    check("Real global size X = ", SInfo.RealGlobalSizeX.get(0), RoundedUpSize);
+    check("Real global size Y = ", SInfo.RealGlobalSizeY.get(0), 1);
+    check("Real global size Z = ", SInfo.RealGlobalSizeZ.get(0), 1);
+  }
+  check("Size seen by user = ", SInfo.ItemGlobalSize.get(0), size);
+  check("Counter = ", Counter, size);
+  std::cout << std::endl;
+  return 0;
+}
+
+int try_id(size_t size) {
+  size_t RoundedUpSize = (size + 32 - 1) / 32 * 32;
+  SizesInfo SInfo;
+  range<1> Size{size};
+  int Counter = 0;
+  bool OnGpu;
+
+  {
+    buffer<SizesInfo, 1> BufSizes(&SInfo, 1);
+    buffer<int, 1> BufCounter(&Counter, Size);
+    queue myQueue;
+    OnGpu = myQueue.get_device().is_gpu();
+
+    myQueue.submit([&](handler &cgh) {
+      auto AccSizes = BufSizes.get_access<access::mode::read_write>(cgh);
+      auto AccCounter = BufCounter.get_access<access::mode::atomic>(cgh);
+
+      cgh.parallel_for<class PF_init_id>(Size, [=](id<1> ID) {
+        AccCounter[0].fetch_add(1);
+        AccSizes[0].ItemGlobalSize = ID[0];
+#ifdef __SYCL_DEVICE_ONLY__
+        AccSizes[0].RealGlobalSizeX = {__spirv_GlobalSize_x()};
+        AccSizes[0].RealGlobalSizeY = {__spirv_GlobalSize_y()};
+        AccSizes[0].RealGlobalSizeZ = {__spirv_GlobalSize_z()};
+#endif // __SYCL_DEVICE_ONLY__
+      });
+    });
+  }
+  std::cout << std::endl;
+  if (OnGpu) {
+    std::cout << "Ran on GPU" << std::endl;
+    check("Real global size X = ", SInfo.RealGlobalSizeX.get(0), RoundedUpSize);
+    check("Real global size Y = ", SInfo.RealGlobalSizeY.get(0), 1);
+    check("Real global size Z = ", SInfo.RealGlobalSizeZ.get(0), 1);
+  }
+  check("Counter = ", Counter, size);
+  std::cout << std::endl;
+
+  {
+    buffer<SizesInfo, 1> BufSizes(&SInfo, 1);
+    buffer<int, 1> BufCounter(&Counter, Size);
+    queue myQueue;
+    Counter = 0;
+    OnGpu = myQueue.get_device().is_gpu();
+
+    myQueue.submit([&](handler &cgh) {
+      auto AccSizes = BufSizes.get_access<access::mode::read_write>(cgh);
+      auto AccCounter = BufCounter.get_access<access::mode::atomic>(cgh);
+      cgh.parallel_for<class PF_init_id1>(Size, [=](id<1> ID) {
+        AccCounter[0].fetch_add(1);
+        AccSizes[0].ItemGlobalSize = ID[0];
+#ifdef __SYCL_DEVICE_ONLY__
+        AccSizes[0].RealGlobalSizeX = {__spirv_GlobalSize_x()};
+        AccSizes[0].RealGlobalSizeY = {__spirv_GlobalSize_y()};
+        AccSizes[0].RealGlobalSizeZ = {__spirv_GlobalSize_z()};
+#endif // __SYCL_DEVICE_ONLY__
+      });
+    });
+  }
+  std::cout << std::endl;
+  if (OnGpu) {
+    std::cout << "Ran on GPU" << std::endl;
+    check("Real global size X = ", SInfo.RealGlobalSizeX.get(0), RoundedUpSize);
+    check("Real global size Y = ", SInfo.RealGlobalSizeY.get(0), 1);
+    check("Real global size Z = ", SInfo.RealGlobalSizeZ.get(0), 1);
+  }
+  check("Counter = ", Counter, size);
+  std::cout << std::endl;
+
+  return 0;
+}
+
+int main() {
+  int x;
+
+  x = 10;
+  try_item(x);
+  try_id(x);
+  x = 256;
+  try_item(x);
+  try_id(x);
+
+  return 0;
+}
\ No newline at end of file

From 8677a2b6db7df868cda0cf4996f200bc087aa7ac Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Tue, 17 Nov 2020 16:19:57 -0800
Subject: [PATCH 11/24] Adjustment to test to account for added lines in
 sycl.hpp.

---
 clang/test/CodeGenSYCL/kernel-by-reference.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/test/CodeGenSYCL/kernel-by-reference.cpp b/clang/test/CodeGenSYCL/kernel-by-reference.cpp
index 6502cddf602d8..f5bbac0e75730 100644
--- a/clang/test/CodeGenSYCL/kernel-by-reference.cpp
+++ b/clang/test/CodeGenSYCL/kernel-by-reference.cpp
@@ -15,7 +15,7 @@ int simple_add(int i) {
 int main() {
   queue q;
 #if defined(SYCL2020)
-  // expected-warning@Inputs/sycl.hpp:286 {{Passing kernel functions by value is deprecated in SYCL 2020}}
+  // expected-warning@Inputs/sycl.hpp:298 {{Passing kernel functions by value is deprecated in SYCL 2020}}
   // expected-note@+3 {{in instantiation of function template specialization}}
 #endif
   q.submit([&](handler &h) {
@@ -23,7 +23,7 @@ int main() {
   });
 
 #if defined(SYCL2017)
-  // expected-warning@Inputs/sycl.hpp:281 {{Passing of kernel functions by reference is a SYCL 2020 extension}}
+  // expected-warning@Inputs/sycl.hpp:293 {{Passing of kernel functions by reference is a SYCL 2020 extension}}
   // expected-note@+3 {{in instantiation of function template specialization}}
 #endif
   q.submit([&](handler &h) {

From c340ccf649daed65af5f306a8e464d559bb6f095 Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Tue, 17 Nov 2020 22:55:43 -0800
Subject: [PATCH 12/24] Changed runtime test.

---
 .../parallel_for_range_roundup.cpp            | 64 ++++---------------
 1 file changed, 12 insertions(+), 52 deletions(-)

diff --git a/sycl/test/basic_tests/parallel_for_range_roundup.cpp b/sycl/test/basic_tests/parallel_for_range_roundup.cpp
index 52519abb8e0f0..367b284473bb1 100755
--- a/sycl/test/basic_tests/parallel_for_range_roundup.cpp
+++ b/sycl/test/basic_tests/parallel_for_range_roundup.cpp
@@ -1,8 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
-// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
-// RUN: %ACC_RUN_PLACEHOLDER %t.out
-// RUN: %CPU_RUN_PLACEHOLDER %t.out
-// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: env SYCL_DEVICE_TYPE=HOST SYCL_OPT_PFWGS_TRACE=1 %t.out 2> %t.errout
+// RUN: %ACC_RUN_PLACEHOLDER SYCL_OPT_PFWGS_TRACE=1 %t.out 2> %t.errout
+// RUN: %CPU_RUN_PLACEHOLDER SYCL_OPT_PFWGS_TRACE=1 %t.out 2> %t.errout
+// RUN: %GPU_RUN_PLACEHOLDER SYCL_OPT_PFWGS_TRACE=1 %t.out 2> %t.errout
+// RUN: cat %t.errout | FileCheck %s
 
 #include <CL/sycl.hpp>
 
@@ -29,41 +30,25 @@ int try_item(size_t size) {
   SizesInfo SInfo;
   range<1> Size{size};
   int Counter = 0;
-  bool OnGpu;
 
   {
     buffer<SizesInfo, 1> BufSizes(&SInfo, 1);
-    buffer<int, 1> BufCounter(&Counter, Size);
+    buffer<int, 1> BufCounter(&Counter, 1);
     queue myQueue;
-    OnGpu = myQueue.get_device().is_gpu();
 
     myQueue.submit([&](handler &cgh) {
       auto AccSizes = BufSizes.get_access<access::mode::read_write>(cgh);
       auto AccCounter = BufCounter.get_access<access::mode::atomic>(cgh);
 
       cgh.parallel_for<PF_init_item>(Size, [=](item<1> ITEM) {
-        // cgh.parallel_for<PF_init_item>(Size, [=](int ITEM) {
         AccCounter[0].fetch_add(1);
         AccSizes[0].ItemGlobalSize = ITEM.get_range(0);
-#ifdef __SYCL_DEVICE_ONLY__
-        AccSizes[0].RealGlobalSizeX = {__spirv_GlobalSize_x()};
-        AccSizes[0].RealGlobalSizeY = {__spirv_GlobalSize_y()};
-        AccSizes[0].RealGlobalSizeZ = {__spirv_GlobalSize_z()};
-#endif // __SYCL_DEVICE_ONLY__
       });
     });
   }
 
-  std::cout << std::endl;
-  if (OnGpu) {
-    std::cout << "Ran on GPU" << std::endl;
-    check("Real global size X = ", SInfo.RealGlobalSizeX.get(0), RoundedUpSize);
-    check("Real global size Y = ", SInfo.RealGlobalSizeY.get(0), 1);
-    check("Real global size Z = ", SInfo.RealGlobalSizeZ.get(0), 1);
-  }
   check("Size seen by user = ", SInfo.ItemGlobalSize.get(0), size);
   check("Counter = ", Counter, size);
-  std::cout << std::endl;
   return 0;
 }
 
@@ -76,9 +61,8 @@ int try_id(size_t size) {
 
   {
     buffer<SizesInfo, 1> BufSizes(&SInfo, 1);
-    buffer<int, 1> BufCounter(&Counter, Size);
+    buffer<int, 1> BufCounter(&Counter, 1);
     queue myQueue;
-    OnGpu = myQueue.get_device().is_gpu();
 
     myQueue.submit([&](handler &cgh) {
       auto AccSizes = BufSizes.get_access<access::mode::read_write>(cgh);
@@ -87,30 +71,16 @@ int try_id(size_t size) {
       cgh.parallel_for<class PF_init_id>(Size, [=](id<1> ID) {
         AccCounter[0].fetch_add(1);
         AccSizes[0].ItemGlobalSize = ID[0];
-#ifdef __SYCL_DEVICE_ONLY__
-        AccSizes[0].RealGlobalSizeX = {__spirv_GlobalSize_x()};
-        AccSizes[0].RealGlobalSizeY = {__spirv_GlobalSize_y()};
-        AccSizes[0].RealGlobalSizeZ = {__spirv_GlobalSize_z()};
-#endif // __SYCL_DEVICE_ONLY__
       });
     });
   }
-  std::cout << std::endl;
-  if (OnGpu) {
-    std::cout << "Ran on GPU" << std::endl;
-    check("Real global size X = ", SInfo.RealGlobalSizeX.get(0), RoundedUpSize);
-    check("Real global size Y = ", SInfo.RealGlobalSizeY.get(0), 1);
-    check("Real global size Z = ", SInfo.RealGlobalSizeZ.get(0), 1);
-  }
   check("Counter = ", Counter, size);
-  std::cout << std::endl;
 
   {
     buffer<SizesInfo, 1> BufSizes(&SInfo, 1);
-    buffer<int, 1> BufCounter(&Counter, Size);
+    buffer<int, 1> BufCounter(&Counter, 1);
     queue myQueue;
     Counter = 0;
-    OnGpu = myQueue.get_device().is_gpu();
 
     myQueue.submit([&](handler &cgh) {
       auto AccSizes = BufSizes.get_access<access::mode::read_write>(cgh);
@@ -118,23 +88,10 @@ int try_id(size_t size) {
       cgh.parallel_for<class PF_init_id1>(Size, [=](id<1> ID) {
         AccCounter[0].fetch_add(1);
         AccSizes[0].ItemGlobalSize = ID[0];
-#ifdef __SYCL_DEVICE_ONLY__
-        AccSizes[0].RealGlobalSizeX = {__spirv_GlobalSize_x()};
-        AccSizes[0].RealGlobalSizeY = {__spirv_GlobalSize_y()};
-        AccSizes[0].RealGlobalSizeZ = {__spirv_GlobalSize_z()};
-#endif // __SYCL_DEVICE_ONLY__
       });
     });
   }
-  std::cout << std::endl;
-  if (OnGpu) {
-    std::cout << "Ran on GPU" << std::endl;
-    check("Real global size X = ", SInfo.RealGlobalSizeX.get(0), RoundedUpSize);
-    check("Real global size Y = ", SInfo.RealGlobalSizeY.get(0), 1);
-    check("Real global size Z = ", SInfo.RealGlobalSizeZ.get(0), 1);
-  }
   check("Counter = ", Counter, size);
-  std::cout << std::endl;
 
   return 0;
 }
@@ -145,6 +102,9 @@ int main() {
   x = 10;
   try_item(x);
   try_id(x);
+  // CHECK: ***** Adjusted size from 10 to 32 *****
+  // CHECK: ***** Adjusted size from 10 to 32 *****
+  // CHECK: ***** Adjusted size from 10 to 32 *****
   x = 256;
   try_item(x);
   try_id(x);

From e08a478357e3b83a91aac42f469eeae7296c81b4 Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Wed, 18 Nov 2020 21:13:09 -0800
Subject: [PATCH 13/24] Changes requested by reviewers, and test modifications.

---
 clang/lib/Sema/SemaSYCL.cpp                   | 25 +++--
 .../CodeGenSYCL/parallel_for_this_item.cpp    | 95 ++++++++++++++-----
 sycl/include/CL/sycl/handler.hpp              |  2 +-
 .../parallel_for_range_roundup.cpp            | 30 ++++--
 4 files changed, 107 insertions(+), 45 deletions(-)

diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp
index e7b84368a90b7..1b50daf55b518 100644
--- a/clang/lib/Sema/SemaSYCL.cpp
+++ b/clang/lib/Sema/SemaSYCL.cpp
@@ -80,6 +80,10 @@ class Util {
   /// stream class.
   static bool isSyclStreamType(const QualType &Ty);
 
+  /// Checks whether given clang type is a full specialization of the SYCL
+  /// item class.
+  static bool isSyclItemType(const QualType &Ty);
+
   /// Checks whether given clang type is a full specialization of the SYCL
   /// half class.
   static bool isSyclHalfType(const QualType &Ty);
@@ -511,8 +515,8 @@ class MarkDeviceFunction : public RecursiveASTVisitor<MarkDeviceFunction> {
       FunctionDecl *FD = WorkList.back().first;
       FunctionDecl *ParentFD = WorkList.back().second;
 
-      // To implement rounding-up of a parallel-for range
-      // a kernel call is modified like this:
+      // To implement rounding-up of a parallel-for range the
+      // SYCL header implementation modifies the kernel call like this:
       // auto Wrapper = [=](TransformedArgType Arg) {
       //  if (Arg[0] >= NumWorkItems[0])
       //    return;
@@ -523,9 +527,8 @@ class MarkDeviceFunction : public RecursiveASTVisitor<MarkDeviceFunction> {
       // This transformation leads to a condition where a kernel body
       // function becomes callable from a new kernel body function.
       // Hence this test.
-      if ((ParentFD == KernelBody) && isSYCLKernelBodyFunction(FD)) {
+      if ((ParentFD == KernelBody) && isSYCLKernelBodyFunction(FD))
         KernelBody = FD;
-      }
 
       if ((ParentFD == SYCLKernel) && isSYCLKernelBodyFunction(FD)) {
         assert(!KernelBody && "inconsistent call graph - only one kernel body "
@@ -2752,7 +2755,7 @@ class SyclKernelIntHeaderCreator : public SyclKernelFieldHandler {
     // The call graph for this translation unit.
     CallGraph SYCLCG;
     SYCLCG.addToCallGraph(SemaRef.getASTContext().getTranslationUnitDecl());
-    typedef std::pair<FunctionDecl *, FunctionDecl *> ChildParentPair;
+    using ChildParentPair = std::pair<FunctionDecl *, FunctionDecl *>;
     llvm::SmallPtrSet<FunctionDecl *, 16> Visited;
     llvm::SmallVector<ChildParentPair, 16> WorkList;
     WorkList.push_back({WGLambdaFn, nullptr});
@@ -2764,7 +2767,8 @@ class SyclKernelIntHeaderCreator : public SyclKernelFieldHandler {
         continue; // We've already seen this Decl
 
       if (FD->isFunctionOrMethod() && FD->getIdentifier() &&
-          !FD->getName().empty() && "this_item" == FD->getName()) {
+          !FD->getName().empty() && "this_item" == FD->getName() &&
+          Util::isSyclItemType(FD->getReturnType())) {
         Header.setCallsThisItem(true);
         return;
       }
@@ -3952,9 +3956,10 @@ void SYCLIntegrationHeader::emit(raw_ostream &O) {
     O << "  __SYCL_DLL_LOCAL\n";
     O << "  static constexpr bool isESIMD() { return " << K.IsESIMDKernel
       << "; }\n";
+    O << "  __SYCL_DLL_LOCAL\n";
     O << "  static constexpr bool callsThisItem() { return ";
     O << K.CallsThisItem << "; }\n";
-    O << "} ;\n";
+    O << "};\n";
     CurStart += N;
   }
   O << "\n";
@@ -4013,7 +4018,7 @@ void SYCLIntegrationHeader::addSpecConstant(StringRef IDName, QualType IDType) {
 }
 
 void SYCLIntegrationHeader::setCallsThisItem(bool B) {
-  auto *K = getCurKernelDesc();
+  KernelDesc *K = getCurKernelDesc();
   assert(K && "no kernels");
   K->CallsThisItem = B;
 }
@@ -4039,6 +4044,10 @@ bool Util::isSyclStreamType(const QualType &Ty) {
   return isSyclType(Ty, "stream");
 }
 
+bool Util::isSyclItemType(const QualType &Ty) {
+  return isSyclType(Ty, "item", true /*Tmpl*/);
+}
+
 bool Util::isSyclHalfType(const QualType &Ty) {
   const StringRef &Name = "half";
   std::array<DeclContextDesc, 5> Scopes = {
diff --git a/clang/test/CodeGenSYCL/parallel_for_this_item.cpp b/clang/test/CodeGenSYCL/parallel_for_this_item.cpp
index dbb62b52b22dd..69212e83ea925 100755
--- a/clang/test/CodeGenSYCL/parallel_for_this_item.cpp
+++ b/clang/test/CodeGenSYCL/parallel_for_this_item.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl -fsycl-is-device -triple spir64-unknown-unknown-sycldevice -fsycl-int-header=%t.h %s -fsyntax-only
+// RUN: %clang_cc1 -fsycl -fsycl-is-device -internal-isystem %S/Inputs -triple spir64-unknown-unknown-sycldevice -fsycl-int-header=%t.h %s -fsyntax-only
 // RUN: FileCheck -input-file=%t.h %s
 
 // This test checks that compiler generates correct kernel description
@@ -16,36 +16,79 @@
 // CHECK-NEXT:   "_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE3RAT"
 // CHECK-NEXT: };
 
-// CHECK: template <> struct KernelInfo<class GNU> {
-// CHECK: __SYCL_DLL_LOCAL
-// CHECK_NEXT: static constexpr bool callsThisItem() { return 0; }
+// CHECK:template <> struct KernelInfo<class GNU> {
+// CHECK-NEXT:  __SYCL_DLL_LOCAL
+// CHECK-NEXT:    static constexpr const char* getName() { return "_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE3GNU"; }
+// CHECK-NEXT:  __SYCL_DLL_LOCAL
+// CHECK-NEXT:    static constexpr unsigned getNumParams() { return 0; }
+// CHECK-NEXT:  __SYCL_DLL_LOCAL
+// CHECK-NEXT:    static constexpr const kernel_param_desc_t& getParamDesc(unsigned i) {
+// CHECK-NEXT:    return kernel_signatures[i+0];
+// CHECK-NEXT:  }
+// CHECK-NEXT:  __SYCL_DLL_LOCAL
+// CHECK-NEXT:  static constexpr bool isESIMD() { return 0; }
+// CHECK-NEXT:  __SYCL_DLL_LOCAL
+// CHECK-NEXT:  static constexpr bool callsThisItem() { return 0; }
+// CHECK-NEXT:};
+// CHECK-NEXT:template <> struct KernelInfo<class EMU> {
+// CHECK-NEXT:  __SYCL_DLL_LOCAL
+// CHECK-NEXT:    static constexpr const char* getName() { return "_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE3EMU"; }
+// CHECK-NEXT:  __SYCL_DLL_LOCAL
+// CHECK-NEXT:    static constexpr unsigned getNumParams() { return 0; }
+// CHECK-NEXT:  __SYCL_DLL_LOCAL
+// CHECK-NEXT:    static constexpr const kernel_param_desc_t& getParamDesc(unsigned i) {
+// CHECK-NEXT:    return kernel_signatures[i+0];
+// CHECK-NEXT:  }
+// CHECK-NEXT:  __SYCL_DLL_LOCAL
+// CHECK-NEXT:  static constexpr bool isESIMD() { return 0; }
+// CHECK-NEXT:  __SYCL_DLL_LOCAL
+// CHECK-NEXT:    static constexpr bool callsThisItem() { return 1; }
+// CHECK-NEXT:};
+// CHECK-NEXT:template <> struct KernelInfo<class OWL> {
+// CHECK-NEXT:  __SYCL_DLL_LOCAL
+// CHECK-NEXT:    static constexpr const char* getName() { return "_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE3OWL"; }
+// CHECK-NEXT:  __SYCL_DLL_LOCAL
+// CHECK-NEXT:    static constexpr unsigned getNumParams() { return 0; }
+// CHECK-NEXT:  __SYCL_DLL_LOCAL
+// CHECK-NEXT:    static constexpr const kernel_param_desc_t& getParamDesc(unsigned i) {
+// CHECK-NEXT:    return kernel_signatures[i+0];
+// CHECK-NEXT:  }
+// CHECK-NEXT:  __SYCL_DLL_LOCAL
+// CHECK-NEXT:  static constexpr bool isESIMD() { return 0; }
+// CHECK-NEXT:  __SYCL_DLL_LOCAL
+// CHECK-NEXT:    static constexpr bool callsThisItem() { return 0; }
+// CHECK-NEXT:};
+// CHECK-NEXT:template <> struct KernelInfo<class RAT> {
+// CHECK-NEXT:  __SYCL_DLL_LOCAL
+// CHECK-NEXT:    static constexpr const char* getName() { return "_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE3RAT"; }
+// CHECK-NEXT:  __SYCL_DLL_LOCAL
+// CHECK-NEXT:    static constexpr unsigned getNumParams() { return 0; }
+// CHECK-NEXT:  __SYCL_DLL_LOCAL
+// CHECK-NEXT:    static constexpr const kernel_param_desc_t& getParamDesc(unsigned i) {
+// CHECK-NEXT:    return kernel_signatures[i+0];
+// CHECK-NEXT: }
+// CHECK-NEXT:  __SYCL_DLL_LOCAL
+// CHECK-NEXT:  static constexpr bool isESIMD() { return 0; }
+// CHECK-NEXT: __SYCL_DLL_LOCAL
+// CHECK-NEXT:    static constexpr bool callsThisItem() { return 1; }
+// CHECK-NEXT:};
 
-// CHECK: template <> struct KernelInfo<class EMU> {
-// CHECK: __SYCL_DLL_LOCAL
-// CHECK_NEXT: static constexpr bool callsThisItem() { return 1; }
+#include "sycl.hpp"
 
-// CHECK: template <> struct KernelInfo<class OWL> {
-// CHECK: __SYCL_DLL_LOCAL
-// CHECK_NEXT: static constexpr bool callsThisItem() { return 0; }
-
-// CHECK: template <> struct KernelInfo<class RAT> {
-// CHECK: __SYCL_DLL_LOCAL
-// CHECK_NEXT: static constexpr bool callsThisItem() { return 1; }
-
-#include "Inputs/sycl.hpp"
+using namespace cl::sycl;
 
 int main() {
-  cl::sycl::queue myQueue;
-  myQueue.submit([&](cl::sycl::handler &cgh) {
-    cgh.parallel_for<class GNU>(cl::sycl::range<1>(1),
-                                [=](cl::sycl::item<1> I) {});
+  ::queue myQueue;
+  myQueue.submit([&](::handler &cgh) {
+    cgh.parallel_for<class GNU>(::range<1>(1),
+                                [=](::item<1> I) {});
     cgh.parallel_for<class EMU>(
-        cl::sycl::range<1>(1),
-        [=](cl::sycl::item<1> I) { cl::sycl::this_item<1>(); });
-    cgh.parallel_for<class OWL>(cl::sycl::range<1>(1),
-                                [=](cl::sycl::id<1> I) {});
-    cgh.parallel_for<class RAT>(cl::sycl::range<1>(1), [=](cl::sycl::id<1> I) {
-      cl::sycl::this_item<1>();
+        ::range<1>(1),
+        [=](::item<1> I) { ::this_item<1>(); });
+    cgh.parallel_for<class OWL>(::range<1>(1),
+                                [=](::id<1> I) {});
+    cgh.parallel_for<class RAT>(::range<1>(1), [=](::id<1> I) {
+      ::this_item<1>();
     });
   });
 
diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index 36fd72ac2cc41..af4b8ebdc88de 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -791,7 +791,7 @@ class __SYCL_EXPORT handler {
           GoodLocalSizeX;
       using NameWT = typename detail::get_kernel_wrapper_name_t<NameT>::name;
       if (getenv("SYCL_OPT_PFWGS_TRACE") != nullptr)
-        std::cerr << "***** Adjusted size from " << NumWorkItems[0] << " to "
+        std::cout << "***** Adjusted size from " << NumWorkItems[0] << " to "
                   << NewValX << " *****\n";
       auto Wrapper = [=](TransformedArgType Arg) {
         if (Arg[0] >= NumWorkItems[0])
diff --git a/sycl/test/basic_tests/parallel_for_range_roundup.cpp b/sycl/test/basic_tests/parallel_for_range_roundup.cpp
index 367b284473bb1..a53c48abb29e0 100755
--- a/sycl/test/basic_tests/parallel_for_range_roundup.cpp
+++ b/sycl/test/basic_tests/parallel_for_range_roundup.cpp
@@ -1,9 +1,6 @@
-// RUN: %clangxx -fsycl %s -o %t.out
-// RUN: env SYCL_DEVICE_TYPE=HOST SYCL_OPT_PFWGS_TRACE=1 %t.out 2> %t.errout
-// RUN: %ACC_RUN_PLACEHOLDER SYCL_OPT_PFWGS_TRACE=1 %t.out 2> %t.errout
-// RUN: %CPU_RUN_PLACEHOLDER SYCL_OPT_PFWGS_TRACE=1 %t.out 2> %t.errout
-// RUN: %GPU_RUN_PLACEHOLDER SYCL_OPT_PFWGS_TRACE=1 %t.out 2> %t.errout
-// RUN: cat %t.errout | FileCheck %s
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple  %s -o %t.out
+// RUN: env SYCL_OPT_PFWGS_TRACE=1 %GPU_RUN_PLACEHOLDER %t.out %GPU_CHECK_PLACEHOLDER
+// RUN: env SYCL_OPT_PFWGS_TRACE=1 %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER
 
 #include <CL/sycl.hpp>
 
@@ -45,6 +42,7 @@ int try_item(size_t size) {
         AccSizes[0].ItemGlobalSize = ITEM.get_range(0);
       });
     });
+    myQueue.wait();
   }
 
   check("Size seen by user = ", SInfo.ItemGlobalSize.get(0), size);
@@ -73,6 +71,7 @@ int try_id(size_t size) {
         AccSizes[0].ItemGlobalSize = ID[0];
       });
     });
+    myQueue.wait();
   }
   check("Counter = ", Counter, size);
 
@@ -90,6 +89,7 @@ int try_id(size_t size) {
         AccSizes[0].ItemGlobalSize = ID[0];
       });
     });
+    myQueue.wait();
   }
   check("Counter = ", Counter, size);
 
@@ -102,12 +102,22 @@ int main() {
   x = 10;
   try_item(x);
   try_id(x);
-  // CHECK: ***** Adjusted size from 10 to 32 *****
-  // CHECK: ***** Adjusted size from 10 to 32 *****
-  // CHECK: ***** Adjusted size from 10 to 32 *****
+
   x = 256;
   try_item(x);
   try_id(x);
 
   return 0;
-}
\ No newline at end of file
+}
+
+// CHECK:       ***** Adjusted size from 10 to 32 *****
+// CHECK-NEXT:  Size seen by user = 10
+// CHECK-NEXT:  Counter = 10
+// CHECK-NEXT:  ***** Adjusted size from 10 to 32 *****
+// CHECK-NEXT:  Counter = 10
+// CHECK-NEXT:  ***** Adjusted size from 10 to 32 *****
+// CHECK-NEXT:  Counter = 10
+// CHECK-NEXT:  Size seen by user = 256
+// CHECK-NEXT:  Counter = 256
+// CHECK-NEXT:  Counter = 256
+// CHECK-NEXT:  Counter = 256
\ No newline at end of file

From 0b878dcf688728d180720448958ee3cc9691497f Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Wed, 18 Nov 2020 22:04:41 -0800
Subject: [PATCH 14/24] Fixed EOL.

---
 sycl/test/basic_tests/parallel_for_range_roundup.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/test/basic_tests/parallel_for_range_roundup.cpp b/sycl/test/basic_tests/parallel_for_range_roundup.cpp
index a53c48abb29e0..6ebc376dce1b5 100755
--- a/sycl/test/basic_tests/parallel_for_range_roundup.cpp
+++ b/sycl/test/basic_tests/parallel_for_range_roundup.cpp
@@ -120,4 +120,4 @@ int main() {
 // CHECK-NEXT:  Size seen by user = 256
 // CHECK-NEXT:  Counter = 256
 // CHECK-NEXT:  Counter = 256
-// CHECK-NEXT:  Counter = 256
\ No newline at end of file
+// CHECK-NEXT:  Counter = 256

From d6773cbc2287fbb9be5a3461cca37afea50ea2e9 Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Thu, 19 Nov 2020 12:42:20 -0800
Subject: [PATCH 15/24] Modified a test.

---
 .../CodeGenSYCL/parallel_for_this_item.cpp    | 20 +++++++++----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/clang/test/CodeGenSYCL/parallel_for_this_item.cpp b/clang/test/CodeGenSYCL/parallel_for_this_item.cpp
index 69212e83ea925..40684da91b019 100755
--- a/clang/test/CodeGenSYCL/parallel_for_this_item.cpp
+++ b/clang/test/CodeGenSYCL/parallel_for_this_item.cpp
@@ -77,19 +77,17 @@
 
 using namespace cl::sycl;
 
+SYCL_EXTERNAL item<1> g() { return this_item<1>(); }
+SYCL_EXTERNAL item<1> f() { return g(); }
+
 int main() {
-  ::queue myQueue;
+  queue myQueue;
   myQueue.submit([&](::handler &cgh) {
-    cgh.parallel_for<class GNU>(::range<1>(1),
-                                [=](::item<1> I) {});
-    cgh.parallel_for<class EMU>(
-        ::range<1>(1),
-        [=](::item<1> I) { ::this_item<1>(); });
-    cgh.parallel_for<class OWL>(::range<1>(1),
-                                [=](::id<1> I) {});
-    cgh.parallel_for<class RAT>(::range<1>(1), [=](::id<1> I) {
-      ::this_item<1>();
-    });
+    cgh.parallel_for<class GNU>(range<1>(1), [=](item<1> I) {});
+    cgh.parallel_for<class EMU>(range<1>(1),
+                                [=](::item<1> I) { this_item<1>(); });
+    cgh.parallel_for<class OWL>(range<1>(1), [=](id<1> I) {});
+    cgh.parallel_for<class RAT>(range<1>(1), [=](id<1> I) { f(); });
   });
 
   return 0;

From 59ae7782bad07790a5449085dcfac904504b2342 Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Fri, 20 Nov 2020 13:07:20 -0800
Subject: [PATCH 16/24] Modified this_item search, and env var names.

---
 clang/lib/Sema/SemaSYCL.cpp                   | 44 +++----------------
 sycl/include/CL/sycl/handler.hpp              | 23 +++++++---
 .../parallel_for_range_roundup.cpp            | 10 ++---
 3 files changed, 27 insertions(+), 50 deletions(-)

diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp
index 1b50daf55b518..a3d9f2704615d 100644
--- a/clang/lib/Sema/SemaSYCL.cpp
+++ b/clang/lib/Sema/SemaSYCL.cpp
@@ -661,39 +661,6 @@ class FindPFWGLambdaFnVisitor
   const CXXRecordDecl *LambdaObjTy;
 };
 
-// Searches for a call to PF lambda function and captures it.
-class FindPFLambdaFnVisitor
-    : public RecursiveASTVisitor<FindPFLambdaFnVisitor> {
-public:
-  // LambdaObjTy - lambda type of the PF lambda object
-  FindPFLambdaFnVisitor(const CXXRecordDecl *LambdaObjTy)
-      : LambdaFn(nullptr), LambdaObjTy(LambdaObjTy) {}
-
-  bool VisitCallExpr(CallExpr *Call) {
-    auto *M = dyn_cast<CXXMethodDecl>(Call->getDirectCallee());
-    if (!M || (M->getOverloadedOperator() != OO_Call))
-      return true;
-    const int NumPFLambdaArgs = 2; // range and lambda obj
-    if (Call->getNumArgs() != NumPFLambdaArgs)
-      return true;
-    QualType Range = Call->getArg(1)->getType();
-    if (!Util::isSyclType(Range, "id", true /*Tmpl*/) &&
-        !Util::isSyclType(Range, "item", true /*Tmpl*/))
-      return true;
-    if (Call->getArg(0)->getType()->getAsCXXRecordDecl() != LambdaObjTy)
-      return true;
-    LambdaFn = M; // call to PF lambda found - record the lambda
-    return false; // ... and stop searching
-  }
-
-  // Returns the captured lambda function or nullptr;
-  CXXMethodDecl *getLambdaFn() const { return LambdaFn; }
-
-private:
-  CXXMethodDecl *LambdaFn;
-  const CXXRecordDecl *LambdaObjTy;
-};
-
 class MarkWIScopeFnVisitor : public RecursiveASTVisitor<MarkWIScopeFnVisitor> {
 public:
   MarkWIScopeFnVisitor(ASTContext &Ctx) : Ctx(Ctx) {}
@@ -2746,22 +2713,21 @@ class SyclKernelIntHeaderCreator : public SyclKernelFieldHandler {
     if (getKernelInvocationKind(KernelFunc) != InvokeParallelFor)
       return;
 
-    FindPFLambdaFnVisitor V(KernelObj);
-    V.TraverseStmt(KernelFunc->getBody());
-    CXXMethodDecl *WGLambdaFn = V.getLambdaFn();
+    const CXXMethodDecl *WGLambdaFn = getOperatorParens(KernelObj);
     if (!WGLambdaFn)
       return;
 
     // The call graph for this translation unit.
     CallGraph SYCLCG;
     SYCLCG.addToCallGraph(SemaRef.getASTContext().getTranslationUnitDecl());
-    using ChildParentPair = std::pair<FunctionDecl *, FunctionDecl *>;
-    llvm::SmallPtrSet<FunctionDecl *, 16> Visited;
+    using ChildParentPair =
+        std::pair<const FunctionDecl *, const FunctionDecl *>;
+    llvm::SmallPtrSet<const FunctionDecl *, 16> Visited;
     llvm::SmallVector<ChildParentPair, 16> WorkList;
     WorkList.push_back({WGLambdaFn, nullptr});
 
     while (!WorkList.empty()) {
-      FunctionDecl *FD = WorkList.back().first;
+      const FunctionDecl *FD = WorkList.back().first;
       WorkList.pop_back();
       if (!Visited.insert(FD).second)
         continue; // We've already seen this Decl
diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index af4b8ebdc88de..7c3323778ad46 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -766,16 +766,27 @@ class __SYCL_EXPORT handler {
     // Disable the rounding-up optimizations under these conditions:
     // 1. The env var SYCL_OPT_PFWGS_DISABLE is set
     // 2. When the string SYCL_OPT_PFWGS_DISABLE is in the kernel name.
-    // 3. The kernel is created and invoked without an integration header entry.
+    // 3. The kernel is provided via an interoperability method.
     // 4. The API "this_item" is used inside the kernel.
     // 5. The range is already a multiple of the rounding factor.
+    //
+    // Cases 3 and 4 could be supported with extra effort.
+    // As an optimization for the common case it is an
+    // implementation choice to not support those scenarios.
+    // Note that "this_item" is a free function, i.e. not tied to any
+    // specific id or item. When concurrent parallel_fors are executing
+    // on a device it is difficult to tell which parallel_for the call is
+    // being made from. One could replicate portions of the
+    // call-graph to make this_item calls kernel-specific but this is
+    // not considered worthwhile.
 
     // Get the kernal name to check condition 3.
     std::string KName = typeid(NameT *).name();
     using KI = detail::KernelInfo<KernelName>;
     bool DisableRounding =
-        (getenv("SYCL_OPT_PFWGS_DISABLE") != nullptr) ||
-        (KName.find("SYCL_OPT_PFWGS_DISABLE") != std::string::npos) ||
+        (getenv("SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING") != nullptr) ||
+        (KName.find("SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING") !=
+         std::string::npos) ||
         (KI::getName() == nullptr || KI::getName()[0] == '\0') ||
         (KI::callsThisItem());
 
@@ -790,9 +801,9 @@ class __SYCL_EXPORT handler {
           ((NumWorkItems[0] + GoodLocalSizeX - 1) / GoodLocalSizeX) *
           GoodLocalSizeX;
       using NameWT = typename detail::get_kernel_wrapper_name_t<NameT>::name;
-      if (getenv("SYCL_OPT_PFWGS_TRACE") != nullptr)
-        std::cout << "***** Adjusted size from " << NumWorkItems[0] << " to "
-                  << NewValX << " *****\n";
+      if (getenv("SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE") != nullptr)
+        std::cout << "parallel_for range adjusted from " << NumWorkItems[0]
+                  << " to " << NewValX << std::endl;
       auto Wrapper = [=](TransformedArgType Arg) {
         if (Arg[0] >= NumWorkItems[0])
           return;
diff --git a/sycl/test/basic_tests/parallel_for_range_roundup.cpp b/sycl/test/basic_tests/parallel_for_range_roundup.cpp
index 6ebc376dce1b5..e74e8ce51cc98 100755
--- a/sycl/test/basic_tests/parallel_for_range_roundup.cpp
+++ b/sycl/test/basic_tests/parallel_for_range_roundup.cpp
@@ -1,6 +1,6 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple  %s -o %t.out
-// RUN: env SYCL_OPT_PFWGS_TRACE=1 %GPU_RUN_PLACEHOLDER %t.out %GPU_CHECK_PLACEHOLDER
-// RUN: env SYCL_OPT_PFWGS_TRACE=1 %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER
+// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %GPU_RUN_PLACEHOLDER %t.out %GPU_CHECK_PLACEHOLDER
+// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER
 
 #include <CL/sycl.hpp>
 
@@ -110,12 +110,12 @@ int main() {
   return 0;
 }
 
-// CHECK:       ***** Adjusted size from 10 to 32 *****
+// CHECK:       parallel_for range adjusted from 10 to 32
 // CHECK-NEXT:  Size seen by user = 10
 // CHECK-NEXT:  Counter = 10
-// CHECK-NEXT:  ***** Adjusted size from 10 to 32 *****
+// CHECK-NEXT:  parallel_for range adjusted from 10 to 32
 // CHECK-NEXT:  Counter = 10
-// CHECK-NEXT:  ***** Adjusted size from 10 to 32 *****
+// CHECK-NEXT:  parallel_for range adjusted from 10 to 32
 // CHECK-NEXT:  Counter = 10
 // CHECK-NEXT:  Size seen by user = 256
 // CHECK-NEXT:  Counter = 256

From 81b777cace323d77f39a15d11ff5cd3bd1772b6c Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Fri, 20 Nov 2020 13:09:29 -0800
Subject: [PATCH 17/24] Added env var documentation.

---
 sycl/doc/EnvironmentVariables.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md
index 98480cb3f3dec..8731f6c8774df 100644
--- a/sycl/doc/EnvironmentVariables.md
+++ b/sycl/doc/EnvironmentVariables.md
@@ -29,6 +29,8 @@ subject to change. Do not rely on these variables in production code.
 | SYCL_PI_LEVEL_ZERO_MAX_COMMAND_LIST_CACHE | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availabilty and workload demand. |
 | SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR | Any(\*) | Disable USM allocator in Level Zero plugin (each memory request will go directly to Level Zero runtime) |
 | SYCL_PI_LEVEL_ZERO_BATCH_SIZE | Positive integer | Sets a preferred number of commands to batch into a command list before executing the command list. Values 0 and 1 turn off batching. Default is 4. |
+| SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE | Any(\*) | Enables tracing of parallel_for invocations with rounded-up ranges. |
+| SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING | Any(\*) | Disables automatic rounding-up of parallel_for invocation ranges. |
 
 `(*) Note: Any means this environment variable is effective when set to any non-null value.`
 

From 900aca862030fec8a8db0ac9b5fd4ac8d0eae990 Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Sat, 21 Nov 2020 15:23:20 -0800
Subject: [PATCH 18/24] Restrict rounding-up optimization to GPU devices.

---
 sycl/include/CL/sycl/handler.hpp              | 22 ++++++++++++++-----
 sycl/source/handler.cpp                       |  5 +++++
 .../parallel_for_range_roundup.cpp            |  1 -
 3 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index 7c3323778ad46..5cdc76a05d233 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -760,17 +760,20 @@ class __SYCL_EXPORT handler {
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
 
+    // The work group size preferred by this device.
     // A reasonable choice for rounding up the range is 32.
     constexpr size_t GoodLocalSizeX = 32;
 
     // Disable the rounding-up optimizations under these conditions:
-    // 1. The env var SYCL_OPT_PFWGS_DISABLE is set
-    // 2. When the string SYCL_OPT_PFWGS_DISABLE is in the kernel name.
-    // 3. The kernel is provided via an interoperability method.
-    // 4. The API "this_item" is used inside the kernel.
-    // 5. The range is already a multiple of the rounding factor.
+    // 1. The device is not a GPU. Only GPUs benefit from rounding.
+    // 2. The env var SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is set.
+    // 3. The string SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is in
+    //    the kernel name.
+    // 4. The kernel is provided via an interoperability method.
+    // 5. The API "this_item" is used inside the kernel.
+    // 6. The range is already a multiple of the rounding factor.
     //
-    // Cases 3 and 4 could be supported with extra effort.
+    // Cases 4 and 5 could be supported with extra effort.
     // As an optimization for the common case it is an
     // implementation choice to not support those scenarios.
     // Note that "this_item" is a free function, i.e. not tied to any
@@ -784,6 +787,7 @@ class __SYCL_EXPORT handler {
     std::string KName = typeid(NameT *).name();
     using KI = detail::KernelInfo<KernelName>;
     bool DisableRounding =
+        !is_gpu(MQueue) ||
         (getenv("SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING") != nullptr) ||
         (KName.find("SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING") !=
          std::string::npos) ||
@@ -1949,6 +1953,12 @@ class __SYCL_EXPORT handler {
   /// \param Count is a number of bytes to be prefetched.
   void prefetch(const void *Ptr, size_t Count);
 
+  /// Check if the queue being used is for a GPU device
+  ///
+  /// \param Queue is the queue for this handler.
+  /// \return Whether the device is a GPU.
+  bool is_gpu(shared_ptr_class<sycl::detail::queue_impl> Queue);
+
 private:
   shared_ptr_class<detail::queue_impl> MQueue;
   /// The storage for the arguments passed.
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index b3dc32070f51e..d4f75d2c472c8 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -343,5 +343,10 @@ void handler::prefetch(const void *Ptr, size_t Count) {
   MLength = Count;
   MCGType = detail::CG::PREFETCH_USM;
 }
+
+bool handler::is_gpu(shared_ptr_class<sycl::detail::queue_impl> Queue) {
+  device Dev = Queue->get_device();
+  return Dev.is_gpu();
+}
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/test/basic_tests/parallel_for_range_roundup.cpp b/sycl/test/basic_tests/parallel_for_range_roundup.cpp
index e74e8ce51cc98..cf6749c22d042 100755
--- a/sycl/test/basic_tests/parallel_for_range_roundup.cpp
+++ b/sycl/test/basic_tests/parallel_for_range_roundup.cpp
@@ -1,6 +1,5 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple  %s -o %t.out
 // RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %GPU_RUN_PLACEHOLDER %t.out %GPU_CHECK_PLACEHOLDER
-// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER
 
 #include <CL/sycl.hpp>
 

From 700c056c2590ed50f23da972f730b32cb60c38a4 Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Sat, 21 Nov 2020 17:04:41 -0800
Subject: [PATCH 19/24] Made a method private.

---
 sycl/include/CL/sycl/handler.hpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index 5cdc76a05d233..14b1552324ce6 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -899,6 +899,12 @@ class __SYCL_EXPORT handler {
 
 #endif
 
+  /// Check if the queue being used is for a GPU device
+  ///
+  /// \param Queue is the queue for this handler.
+  /// \return Whether the device is a GPU.
+  bool is_gpu(shared_ptr_class<sycl::detail::queue_impl> Queue);
+
 public:
   handler(const handler &) = delete;
   handler(handler &&) = delete;
@@ -1953,12 +1959,6 @@ class __SYCL_EXPORT handler {
   /// \param Count is a number of bytes to be prefetched.
   void prefetch(const void *Ptr, size_t Count);
 
-  /// Check if the queue being used is for a GPU device
-  ///
-  /// \param Queue is the queue for this handler.
-  /// \return Whether the device is a GPU.
-  bool is_gpu(shared_ptr_class<sycl::detail::queue_impl> Queue);
-
 private:
   shared_ptr_class<detail::queue_impl> MQueue;
   /// The storage for the arguments passed.

From 383ae96f9de5007546ee962ff056da76b556f57c Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Sat, 21 Nov 2020 17:20:42 -0800
Subject: [PATCH 20/24] Merge correction.

---
 sycl/doc/EnvironmentVariables.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md
index 8731f6c8774df..2b3e7d6577879 100644
--- a/sycl/doc/EnvironmentVariables.md
+++ b/sycl/doc/EnvironmentVariables.md
@@ -28,7 +28,7 @@ subject to change. Do not rely on these variables in production code.
 | SYCL_DEVICELIB_NO_FALLBACK | Any(\*) | Disable loading and linking of device library images |
 | SYCL_PI_LEVEL_ZERO_MAX_COMMAND_LIST_CACHE | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availabilty and workload demand. |
 | SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR | Any(\*) | Disable USM allocator in Level Zero plugin (each memory request will go directly to Level Zero runtime) |
-| SYCL_PI_LEVEL_ZERO_BATCH_SIZE | Positive integer | Sets a preferred number of commands to batch into a command list before executing the command list. Values 0 and 1 turn off batching. Default is 4. |
+| SYCL_PI_LEVEL_ZERO_BATCH_SIZE | Integer | Sets a preferred number of commands to batch into a command list before executing the command list. A value of 0 causes the batch size to be adjusted dynamically. A value greater than 0 specifies fixed size batching, with the batch size set to the specified value. The default is 0. |
 | SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE | Any(\*) | Enables tracing of parallel_for invocations with rounded-up ranges. |
 | SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING | Any(\*) | Disables automatic rounding-up of parallel_for invocation ranges. |
 

From e8b0872e4eacd5a3c4386c32116a819275c2102f Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Wed, 25 Nov 2020 18:24:11 -0800
Subject: [PATCH 21/24] Test changes and improved this_item call detection.

---
 clang/lib/Sema/SemaSYCL.cpp                   |  65 ++++---
 .../CodeGenSYCL/parallel_for_this_item.cpp    |  24 ++-
 sycl/include/CL/sycl/handler.hpp              |   2 +-
 sycl/include/CL/sycl/range.hpp                |   2 +-
 .../parallel_for_range_roundup.cpp            | 159 +++++++++++++-----
 5 files changed, 185 insertions(+), 67 deletions(-)

diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp
index 206cbe19fe862..1dd2c8bb64fa8 100644
--- a/clang/lib/Sema/SemaSYCL.cpp
+++ b/clang/lib/Sema/SemaSYCL.cpp
@@ -80,10 +80,6 @@ class Util {
   /// stream class.
   static bool isSyclStreamType(const QualType &Ty);
 
-  /// Checks whether given clang type is a full specialization of the SYCL
-  /// item class.
-  static bool isSyclItemType(const QualType &Ty);
-
   /// Checks whether given clang type is a full specialization of the SYCL
   /// half class.
   static bool isSyclHalfType(const QualType &Ty);
@@ -103,10 +99,23 @@ class Util {
   /// \param Tmpl  whether the class is template instantiation or simple record
   static bool isSyclType(const QualType &Ty, StringRef Name, bool Tmpl = false);
 
+  /// Checks whether given function is a standard SYCL API function with given
+  /// name.
+  /// \param FD    the function being checked.
+  /// \param Name  the function name to be checked against.
+  static bool isSyclFunction(const FunctionDecl *FD, StringRef Name);
+
   /// Checks whether given clang type is a full specialization of the SYCL
   /// specialization constant class.
   static bool isSyclSpecConstantType(const QualType &Ty);
 
+  // Checks declaration context hierarchy.
+  /// \param DC     the context of the item to be checked.
+  /// \param Scopes the declaration scopes leading from the item context to the
+  ///               translation unit (excluding the latter)
+  static bool matchContext(const DeclContext *DC,
+                           ArrayRef<Util::DeclContextDesc> Scopes);
+
   /// Checks whether given clang type is declared in the given hierarchy of
   /// declaration contexts.
   /// \param Ty         the clang type being checked
@@ -2736,9 +2745,8 @@ class SyclKernelIntHeaderCreator : public SyclKernelFieldHandler {
       if (!Visited.insert(FD).second)
         continue; // We've already seen this Decl
 
-      if (FD->isFunctionOrMethod() && FD->getIdentifier() &&
-          !FD->getName().empty() && "this_item" == FD->getName() &&
-          Util::isSyclItemType(FD->getReturnType())) {
+      // Check whether this call is to sycl::this_item().
+      if (Util::isSyclFunction(FD, "this_item")) {
         Header.setCallsThisItem(true);
         return;
       }
@@ -4014,10 +4022,6 @@ bool Util::isSyclStreamType(const QualType &Ty) {
   return isSyclType(Ty, "stream");
 }
 
-bool Util::isSyclItemType(const QualType &Ty) {
-  return isSyclType(Ty, "item", true /*Tmpl*/);
-}
-
 bool Util::isSyclHalfType(const QualType &Ty) {
   const StringRef &Name = "half";
   std::array<DeclContextDesc, 5> Scopes = {
@@ -4064,6 +4068,21 @@ bool Util::isSyclType(const QualType &Ty, StringRef Name, bool Tmpl) {
   return matchQualifiedTypeName(Ty, Scopes);
 }
 
+bool Util::isSyclFunction(const FunctionDecl *FD, StringRef Name) {
+  if (!FD->isFunctionOrMethod() || !FD->getIdentifier() ||
+      FD->getName().empty() || Name != FD->getName())
+    return false;
+
+  const DeclContext *DC = FD->getDeclContext();
+  if (DC->isTranslationUnit())
+    return false;
+
+  std::array<DeclContextDesc, 2> Scopes = {
+      Util::DeclContextDesc{clang::Decl::Kind::Namespace, "cl"},
+      Util::DeclContextDesc{clang::Decl::Kind::Namespace, "sycl"}};
+  return matchContext(DC, Scopes);
+}
+
 bool Util::isAccessorPropertyListType(const QualType &Ty) {
   const StringRef &Name = "accessor_property_list";
   std::array<DeclContextDesc, 4> Scopes = {
@@ -4074,21 +4093,15 @@ bool Util::isAccessorPropertyListType(const QualType &Ty) {
   return matchQualifiedTypeName(Ty, Scopes);
 }
 
-bool Util::matchQualifiedTypeName(const QualType &Ty,
-                                  ArrayRef<Util::DeclContextDesc> Scopes) {
-  // The idea: check the declaration context chain starting from the type
+bool Util::matchContext(const DeclContext *Ctx,
+                        ArrayRef<Util::DeclContextDesc> Scopes) {
+  // The idea: check the declaration context chain starting from the item
   // itself. At each step check the context is of expected kind
   // (namespace) and name.
-  const CXXRecordDecl *RecTy = Ty->getAsCXXRecordDecl();
-
-  if (!RecTy)
-    return false; // only classes/structs supported
-  const auto *Ctx = cast<DeclContext>(RecTy);
   StringRef Name = "";
 
   for (const auto &Scope : llvm::reverse(Scopes)) {
     clang::Decl::Kind DK = Ctx->getDeclKind();
-
     if (DK != Scope.first)
       return false;
 
@@ -4102,7 +4115,7 @@ bool Util::matchQualifiedTypeName(const QualType &Ty,
       Name = cast<NamespaceDecl>(Ctx)->getName();
       break;
     default:
-      llvm_unreachable("matchQualifiedTypeName: decl kind not supported");
+      llvm_unreachable("matchContext: decl kind not supported");
     }
     if (Name != Scope.second)
       return false;
@@ -4110,3 +4123,13 @@ bool Util::matchQualifiedTypeName(const QualType &Ty,
   }
   return Ctx->isTranslationUnit();
 }
+
+bool Util::matchQualifiedTypeName(const QualType &Ty,
+                                  ArrayRef<Util::DeclContextDesc> Scopes) {
+  const CXXRecordDecl *RecTy = Ty->getAsCXXRecordDecl();
+
+  if (!RecTy)
+    return false; // only classes/structs supported
+  const auto *Ctx = cast<DeclContext>(RecTy);
+  return Util::matchContext(Ctx, Scopes);
+}
diff --git a/clang/test/CodeGenSYCL/parallel_for_this_item.cpp b/clang/test/CodeGenSYCL/parallel_for_this_item.cpp
index 40684da91b019..422a1bad33373 100755
--- a/clang/test/CodeGenSYCL/parallel_for_this_item.cpp
+++ b/clang/test/CodeGenSYCL/parallel_for_this_item.cpp
@@ -80,13 +80,33 @@ using namespace cl::sycl;
 SYCL_EXTERNAL item<1> g() { return this_item<1>(); }
 SYCL_EXTERNAL item<1> f() { return g(); }
 
+// This is a similar-looking this_item function but not the real one.
+template <int Dims> item<Dims> this_item(int i) { return item<1>{i}; }
+
+// This is a method named this_item but not the real one.
+class C {
+public:
+  template <int Dims> item<Dims> this_item() { return item<1>{66}; };
+};
+
 int main() {
   queue myQueue;
   myQueue.submit([&](::handler &cgh) {
-    cgh.parallel_for<class GNU>(range<1>(1), [=](item<1> I) {});
+    // This kernel does not call sycl::this_item
+    cgh.parallel_for<class GNU>(range<1>(1),
+                                [=](item<1> I) { this_item<1>(55); });
+
+    // This kernel calls sycl::this_item
     cgh.parallel_for<class EMU>(range<1>(1),
                                 [=](::item<1> I) { this_item<1>(); });
-    cgh.parallel_for<class OWL>(range<1>(1), [=](id<1> I) {});
+
+    // This kernel does not call sycl::this_item
+    cgh.parallel_for<class OWL>(range<1>(1), [=](id<1> I) {
+      class C c;
+      c.this_item<1>();
+    });
+
+    // This kernel calls sycl::this_item
     cgh.parallel_for<class RAT>(range<1>(1), [=](id<1> I) { f(); });
   });
 
diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index 14b1552324ce6..833e9be148514 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -816,7 +816,7 @@ class __SYCL_EXPORT handler {
       };
 
       range<Dims> AdjustedRange = NumWorkItems;
-      AdjustedRange.set_range(NewValX);
+      AdjustedRange.set_range_dim0(NewValX);
 #ifdef __SYCL_DEVICE_ONLY__
       kernel_parallel_for<NameWT, TransformedArgType>(Wrapper);
 #else
diff --git a/sycl/include/CL/sycl/range.hpp b/sycl/include/CL/sycl/range.hpp
index 365c852a389dd..32337109f97a9 100644
--- a/sycl/include/CL/sycl/range.hpp
+++ b/sycl/include/CL/sycl/range.hpp
@@ -148,7 +148,7 @@ template <int dimensions = 1> class range : public detail::array<dimensions> {
   friend class detail::Builder;
 
   // Adjust the first dim of the range
-  void set_range(const size_t dim0) { this->common_array[0] = dim0; }
+  void set_range_dim0(const size_t dim0) { this->common_array[0] = dim0; }
 };
 
 #ifdef __cpp_deduction_guides
diff --git a/sycl/test/basic_tests/parallel_for_range_roundup.cpp b/sycl/test/basic_tests/parallel_for_range_roundup.cpp
index cf6749c22d042..a9d2e883c282e 100755
--- a/sycl/test/basic_tests/parallel_for_range_roundup.cpp
+++ b/sycl/test/basic_tests/parallel_for_range_roundup.cpp
@@ -5,93 +5,147 @@
 
 using namespace sycl;
 
-class PF_init_item;
-class PF_init_id;
-class PF_init_id1;
-
-struct SizesInfo {
-  range<1> ItemGlobalSize = {0};
-  range<1> RealGlobalSizeX = {0};
-  range<1> RealGlobalSizeY = {0};
-  range<1> RealGlobalSizeZ = {0};
-};
+range<1> Range1 = {0};
+range<2> Range2 = {0, 0};
+range<3> Range3 = {0, 0, 0};
 
 void check(const char *msg, size_t v, size_t ref) {
   std::cout << msg << v << std::endl;
   assert(v == ref);
 }
 
-int try_item(size_t size) {
-  size_t RoundedUpSize = (size + 32 - 1) / 32 * 32;
-  SizesInfo SInfo;
+int try_item1(size_t size) {
   range<1> Size{size};
   int Counter = 0;
-
   {
-    buffer<SizesInfo, 1> BufSizes(&SInfo, 1);
+    buffer<range<1>, 1> BufRange(&Range1, 1);
     buffer<int, 1> BufCounter(&Counter, 1);
     queue myQueue;
 
     myQueue.submit([&](handler &cgh) {
-      auto AccSizes = BufSizes.get_access<access::mode::read_write>(cgh);
+      auto AccRange = BufRange.get_access<access::mode::read_write>(cgh);
       auto AccCounter = BufCounter.get_access<access::mode::atomic>(cgh);
-
-      cgh.parallel_for<PF_init_item>(Size, [=](item<1> ITEM) {
+      cgh.parallel_for<class PF_init_item1>(Size, [=](item<1> ITEM) {
         AccCounter[0].fetch_add(1);
-        AccSizes[0].ItemGlobalSize = ITEM.get_range(0);
+        AccRange[0] = ITEM.get_range(0);
       });
     });
     myQueue.wait();
   }
-
-  check("Size seen by user = ", SInfo.ItemGlobalSize.get(0), size);
+  check("Size seen by user = ", Range1.get(0), size);
   check("Counter = ", Counter, size);
   return 0;
 }
 
-int try_id(size_t size) {
-  size_t RoundedUpSize = (size + 32 - 1) / 32 * 32;
-  SizesInfo SInfo;
-  range<1> Size{size};
+int try_item2(size_t size) {
+  range<2> Size{size, size};
   int Counter = 0;
-  bool OnGpu;
-
   {
-    buffer<SizesInfo, 1> BufSizes(&SInfo, 1);
+    buffer<range<2>, 1> BufRange(&Range2, 1);
     buffer<int, 1> BufCounter(&Counter, 1);
     queue myQueue;
 
     myQueue.submit([&](handler &cgh) {
-      auto AccSizes = BufSizes.get_access<access::mode::read_write>(cgh);
+      auto AccRange = BufRange.get_access<access::mode::read_write>(cgh);
       auto AccCounter = BufCounter.get_access<access::mode::atomic>(cgh);
+      cgh.parallel_for<class PF_init_item2>(Size, [=](item<2> ITEM) {
+        AccCounter[0].fetch_add(1);
+        AccRange[0][0] = ITEM.get_range(0);
+      });
+    });
+    myQueue.wait();
+  }
+  check("Size seen by user = ", Range2.get(0), size);
+  check("Counter = ", Counter, size * size);
+  return 0;
+}
+
+int try_item3(size_t size) {
+  range<3> Size{size, size, size};
+  int Counter = 0;
+  {
+    buffer<range<3>, 1> BufRange(&Range3, 1);
+    buffer<int, 1> BufCounter(&Counter, 1);
+    queue myQueue;
 
-      cgh.parallel_for<class PF_init_id>(Size, [=](id<1> ID) {
+    myQueue.submit([&](handler &cgh) {
+      auto AccRange = BufRange.get_access<access::mode::read_write>(cgh);
+      auto AccCounter = BufCounter.get_access<access::mode::atomic>(cgh);
+      cgh.parallel_for<class PF_init_item3>(Size, [=](item<3> ITEM) {
         AccCounter[0].fetch_add(1);
-        AccSizes[0].ItemGlobalSize = ID[0];
+        AccRange[0][0] = ITEM.get_range(0);
       });
     });
     myQueue.wait();
   }
-  check("Counter = ", Counter, size);
+  check("Size seen by user = ", Range3.get(0), size);
+  check("Counter = ", Counter, size * size * size);
+  return 0;
+}
 
+int try_id1(size_t size) {
+  range<1> Size{size};
+  int Counter = 0;
   {
-    buffer<SizesInfo, 1> BufSizes(&SInfo, 1);
+    buffer<range<1>, 1> BufRange(&Range1, 1);
     buffer<int, 1> BufCounter(&Counter, 1);
     queue myQueue;
-    Counter = 0;
 
     myQueue.submit([&](handler &cgh) {
-      auto AccSizes = BufSizes.get_access<access::mode::read_write>(cgh);
+      auto AccRange = BufRange.get_access<access::mode::read_write>(cgh);
       auto AccCounter = BufCounter.get_access<access::mode::atomic>(cgh);
       cgh.parallel_for<class PF_init_id1>(Size, [=](id<1> ID) {
         AccCounter[0].fetch_add(1);
-        AccSizes[0].ItemGlobalSize = ID[0];
+        AccRange[0] = ID[0];
       });
     });
     myQueue.wait();
   }
   check("Counter = ", Counter, size);
+  return 0;
+}
+
+int try_id2(size_t size) {
+  range<2> Size{size, size};
+  int Counter = 0;
+  {
+    buffer<range<2>, 1> BufRange(&Range2, 1);
+    buffer<int, 1> BufCounter(&Counter, 1);
+    queue myQueue;
 
+    myQueue.submit([&](handler &cgh) {
+      auto AccRange = BufRange.get_access<access::mode::read_write>(cgh);
+      auto AccCounter = BufCounter.get_access<access::mode::atomic>(cgh);
+      cgh.parallel_for<class PF_init_id2>(Size, [=](id<2> ID) {
+        AccCounter[0].fetch_add(1);
+        AccRange[0][0] = ID[0];
+      });
+    });
+    myQueue.wait();
+  }
+  check("Counter = ", Counter, size * size);
+  return 0;
+}
+
+int try_id3(size_t size) {
+  range<3> Size{size, size, size};
+  int Counter = 0;
+  {
+    buffer<range<3>, 1> BufRange(&Range3, 1);
+    buffer<int, 1> BufCounter(&Counter, 1);
+    queue myQueue;
+
+    myQueue.submit([&](handler &cgh) {
+      auto AccRange = BufRange.get_access<access::mode::read_write>(cgh);
+      auto AccCounter = BufCounter.get_access<access::mode::atomic>(cgh);
+      cgh.parallel_for<class PF_init_id3>(Size, [=](id<3> ID) {
+        AccCounter[0].fetch_add(1);
+        AccRange[0][0] = ID[0];
+      });
+    });
+    myQueue.wait();
+  }
+  check("Counter = ", Counter, size * size * size);
   return 0;
 }
 
@@ -99,12 +153,20 @@ int main() {
   int x;
 
   x = 10;
-  try_item(x);
-  try_id(x);
+  try_item1(x);
+  try_item2(x);
+  try_item3(x);
+  try_id1(x);
+  try_id2(x);
+  try_id3(x);
 
   x = 256;
-  try_item(x);
-  try_id(x);
+  try_item1(x);
+  try_item2(x);
+  try_item3(x);
+  try_id1(x);
+  try_id2(x);
+  try_id3(x);
 
   return 0;
 }
@@ -113,10 +175,23 @@ int main() {
 // CHECK-NEXT:  Size seen by user = 10
 // CHECK-NEXT:  Counter = 10
 // CHECK-NEXT:  parallel_for range adjusted from 10 to 32
-// CHECK-NEXT:  Counter = 10
+// CHECK-NEXT:  Size seen by user = 10
+// CHECK-NEXT:  Counter = 100
+// CHECK-NEXT:  parallel_for range adjusted from 10 to 32
+// CHECK-NEXT:  Size seen by user = 10
+// CHECK-NEXT:  Counter = 1000
 // CHECK-NEXT:  parallel_for range adjusted from 10 to 32
 // CHECK-NEXT:  Counter = 10
+// CHECK-NEXT:  parallel_for range adjusted from 10 to 32
+// CHECK-NEXT:  Counter = 100
+// CHECK-NEXT:  parallel_for range adjusted from 10 to 32
+// CHECK-NEXT:  Counter = 1000
 // CHECK-NEXT:  Size seen by user = 256
 // CHECK-NEXT:  Counter = 256
+// CHECK-NEXT:  Size seen by user = 256
+// CHECK-NEXT:  Counter = 65536
+// CHECK-NEXT:  Size seen by user = 256
+// CHECK-NEXT:  Counter = 16777216
 // CHECK-NEXT:  Counter = 256
-// CHECK-NEXT:  Counter = 256
+// CHECK-NEXT:  Counter = 65536
+// CHECK-NEXT:  Counter = 16777216

From e6d42a0f2c71a52dac98b894eab108d611bb3142 Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Fri, 27 Nov 2020 09:28:27 -0800
Subject: [PATCH 22/24] Necessary addition to SYCL symbols.

---
 sycl/test/abi/sycl_symbols_linux.dump | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
index e8677277fca0f..9fda77a8d44fb 100644
--- a/sycl/test/abi/sycl_symbols_linux.dump
+++ b/sycl/test/abi/sycl_symbols_linux.dump
@@ -3838,6 +3838,7 @@ _ZN2cl4sycl7handler18extractArgsAndReqsEv
 _ZN2cl4sycl7handler20associateWithHandlerEPNS0_6detail16AccessorBaseHostENS0_6access6targetE
 _ZN2cl4sycl7handler28extractArgsAndReqsFromLambdaEPcmPKNS0_6detail19kernel_param_desc_tE
 _ZN2cl4sycl7handler28extractArgsAndReqsFromLambdaEPcmPKNS0_6detail19kernel_param_desc_tEb
+_ZN2cl4sycl7handler6is_gpuESt10shared_ptrINS0_6detail10queue_implEE
 _ZN2cl4sycl7handler6memcpyEPvPKvm
 _ZN2cl4sycl7handler6memsetEPvim
 _ZN2cl4sycl7handler7barrierERKSt6vectorINS0_5eventESaIS3_EE

From 4b9093eb4be1f51f703e024838c83db90176c21a Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Thu, 3 Dec 2020 13:47:42 -0800
Subject: [PATCH 23/24] Minor corrections.

---
 sycl/include/CL/sycl/handler.hpp                  |  2 +-
 sycl/source/handler.cpp                           |  2 +-
 .../basic_tests/parallel_for_range_roundup.cpp    | 15 +++++----------
 3 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index 833e9be148514..4566aa09fb148 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -903,7 +903,7 @@ class __SYCL_EXPORT handler {
   ///
   /// \param Queue is the queue for this handler.
   /// \return Whether the device is a GPU.
-  bool is_gpu(shared_ptr_class<sycl::detail::queue_impl> Queue);
+  bool is_gpu(const shared_ptr_class<sycl::detail::queue_impl> Queue);
 
 public:
   handler(const handler &) = delete;
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index d4f75d2c472c8..4e7a62d0edd23 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -344,7 +344,7 @@ void handler::prefetch(const void *Ptr, size_t Count) {
   MCGType = detail::CG::PREFETCH_USM;
 }
 
-bool handler::is_gpu(shared_ptr_class<sycl::detail::queue_impl> Queue) {
+bool handler::is_gpu(const shared_ptr_class<sycl::detail::queue_impl> Queue) {
   device Dev = Queue->get_device();
   return Dev.is_gpu();
 }
diff --git a/sycl/test/basic_tests/parallel_for_range_roundup.cpp b/sycl/test/basic_tests/parallel_for_range_roundup.cpp
index a9d2e883c282e..a4a8f45c2ae92 100755
--- a/sycl/test/basic_tests/parallel_for_range_roundup.cpp
+++ b/sycl/test/basic_tests/parallel_for_range_roundup.cpp
@@ -37,7 +37,7 @@ int try_item1(size_t size) {
   return 0;
 }
 
-int try_item2(size_t size) {
+void try_item2(size_t size) {
   range<2> Size{size, size};
   int Counter = 0;
   {
@@ -57,10 +57,9 @@ int try_item2(size_t size) {
   }
   check("Size seen by user = ", Range2.get(0), size);
   check("Counter = ", Counter, size * size);
-  return 0;
 }
 
-int try_item3(size_t size) {
+void try_item3(size_t size) {
   range<3> Size{size, size, size};
   int Counter = 0;
   {
@@ -80,10 +79,9 @@ int try_item3(size_t size) {
   }
   check("Size seen by user = ", Range3.get(0), size);
   check("Counter = ", Counter, size * size * size);
-  return 0;
 }
 
-int try_id1(size_t size) {
+void try_id1(size_t size) {
   range<1> Size{size};
   int Counter = 0;
   {
@@ -102,10 +100,9 @@ int try_id1(size_t size) {
     myQueue.wait();
   }
   check("Counter = ", Counter, size);
-  return 0;
 }
 
-int try_id2(size_t size) {
+void try_id2(size_t size) {
   range<2> Size{size, size};
   int Counter = 0;
   {
@@ -124,10 +121,9 @@ int try_id2(size_t size) {
     myQueue.wait();
   }
   check("Counter = ", Counter, size * size);
-  return 0;
 }
 
-int try_id3(size_t size) {
+void try_id3(size_t size) {
   range<3> Size{size, size, size};
   int Counter = 0;
   {
@@ -146,7 +142,6 @@ int try_id3(size_t size) {
     myQueue.wait();
   }
   check("Counter = ", Counter, size * size * size);
-  return 0;
 }
 
 int main() {

From a2a6ded4565a098d4a576e37e8d410d2d93d110c Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Thu, 10 Dec 2020 11:38:24 -0800
Subject: [PATCH 24/24] Enabled rounding for CPU also.

---
 sycl/include/CL/sycl/handler.hpp      | 20 ++++++--------------
 sycl/source/handler.cpp               |  5 -----
 sycl/test/abi/sycl_symbols_linux.dump |  1 -
 3 files changed, 6 insertions(+), 20 deletions(-)

diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index 4566aa09fb148..f0eb9c4872fc2 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -765,15 +765,14 @@ class __SYCL_EXPORT handler {
     constexpr size_t GoodLocalSizeX = 32;
 
     // Disable the rounding-up optimizations under these conditions:
-    // 1. The device is not a GPU. Only GPUs benefit from rounding.
-    // 2. The env var SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is set.
-    // 3. The string SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is in
+    // 1. The env var SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is set.
+    // 2. The string SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is in
     //    the kernel name.
-    // 4. The kernel is provided via an interoperability method.
-    // 5. The API "this_item" is used inside the kernel.
-    // 6. The range is already a multiple of the rounding factor.
+    // 3. The kernel is provided via an interoperability method.
+    // 4. The API "this_item" is used inside the kernel.
+    // 5. The range is already a multiple of the rounding factor.
     //
-    // Cases 4 and 5 could be supported with extra effort.
+    // Cases 3 and 4 could be supported with extra effort.
     // As an optimization for the common case it is an
     // implementation choice to not support those scenarios.
     // Note that "this_item" is a free function, i.e. not tied to any
@@ -787,7 +786,6 @@ class __SYCL_EXPORT handler {
     std::string KName = typeid(NameT *).name();
     using KI = detail::KernelInfo<KernelName>;
     bool DisableRounding =
-        !is_gpu(MQueue) ||
         (getenv("SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING") != nullptr) ||
         (KName.find("SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING") !=
          std::string::npos) ||
@@ -899,12 +897,6 @@ class __SYCL_EXPORT handler {
 
 #endif
 
-  /// Check if the queue being used is for a GPU device
-  ///
-  /// \param Queue is the queue for this handler.
-  /// \return Whether the device is a GPU.
-  bool is_gpu(const shared_ptr_class<sycl::detail::queue_impl> Queue);
-
 public:
   handler(const handler &) = delete;
   handler(handler &&) = delete;
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index 4e7a62d0edd23..b3dc32070f51e 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -343,10 +343,5 @@ void handler::prefetch(const void *Ptr, size_t Count) {
   MLength = Count;
   MCGType = detail::CG::PREFETCH_USM;
 }
-
-bool handler::is_gpu(const shared_ptr_class<sycl::detail::queue_impl> Queue) {
-  device Dev = Queue->get_device();
-  return Dev.is_gpu();
-}
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
index 9fda77a8d44fb..e8677277fca0f 100644
--- a/sycl/test/abi/sycl_symbols_linux.dump
+++ b/sycl/test/abi/sycl_symbols_linux.dump
@@ -3838,7 +3838,6 @@ _ZN2cl4sycl7handler18extractArgsAndReqsEv
 _ZN2cl4sycl7handler20associateWithHandlerEPNS0_6detail16AccessorBaseHostENS0_6access6targetE
 _ZN2cl4sycl7handler28extractArgsAndReqsFromLambdaEPcmPKNS0_6detail19kernel_param_desc_tE
 _ZN2cl4sycl7handler28extractArgsAndReqsFromLambdaEPcmPKNS0_6detail19kernel_param_desc_tEb
-_ZN2cl4sycl7handler6is_gpuESt10shared_ptrINS0_6detail10queue_implEE
 _ZN2cl4sycl7handler6memcpyEPvPKvm
 _ZN2cl4sycl7handler6memsetEPvim
 _ZN2cl4sycl7handler7barrierERKSt6vectorINS0_5eventESaIS3_EE