diff --git a/sycl/include/sycl/detail/group_sort_impl.hpp b/sycl/include/sycl/detail/group_sort_impl.hpp
index 7410761cac303..6974413492a7c 100644
--- a/sycl/include/sycl/detail/group_sort_impl.hpp
+++ b/sycl/include/sycl/detail/group_sort_impl.hpp
@@ -15,11 +15,43 @@
 #include
 #include
 #include
+#include <sycl/sycl_span.hpp>
+
+#include <memory>
 
 namespace sycl {
 inline namespace _V1 {
 namespace detail {
 
+// Helpers for sorting algorithms
+#ifdef __SYCL_DEVICE_ONLY__
+template <typename T, typename Group>
+static __SYCL_ALWAYS_INLINE T *align_scratch(sycl::span<std::byte> scratch,
+                                             Group g,
+                                             size_t number_of_elements) {
+  // Adjust the scratch pointer based on alignment of the type T.
+  // Per extension specification if scratch size is less than the value
+  // returned by memory_required then behavior is undefined, so we don't check
+  // that the scratch size satisfies the requirement.
+  T *scratch_begin = nullptr;
+  // We must have a barrier here before array placement new because it is
+  // possible that scratch memory is already in use, so we need to synchronize
+  // work items.
+  sycl::group_barrier(g);
+  if (g.leader()) {
+    void *scratch_ptr = scratch.data();
+    size_t space = scratch.size();
+    scratch_ptr = std::align(alignof(T), number_of_elements * sizeof(T),
+                             scratch_ptr, space);
+    scratch_begin = ::new (scratch_ptr) T[number_of_elements];
+  }
+  // Broadcast leader's pointer (the beginning of the scratch) to all work
+  // items in the group.
+  scratch_begin = sycl::group_broadcast(g, scratch_begin);
+  return scratch_begin;
+}
+#endif
+
 // ---- merge sort implementation
 
 // following two functions could be useless if std::[lower|upper]_bound worked
diff --git a/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp b/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp
index e790678e7e884..2885a7673795b 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp
@@ -15,9 +15,10 @@
 #include <sycl/detail/pi.h>             // for PI_ERROR_INVALID_DEVICE
 #include <sycl/exception.hpp>           // for sycl_category, exception
 #include <sycl/ext/oneapi/bfloat16.hpp> // for bfloat16
-#include <sycl/memory_enums.hpp>        // for memory_scope
-#include <sycl/range.hpp>               // for range
-#include <sycl/sycl_span.hpp>           // for span
+#include <sycl/ext/oneapi/properties/properties.hpp>
+#include <sycl/memory_enums.hpp> // for memory_scope
+#include <sycl/range.hpp>        // for range
+#include <sycl/sycl_span.hpp>    // for span
 
 #ifdef __SYCL_DEVICE_ONLY__
 #include <sycl/detail/group_sort_impl.hpp>
@@ -36,6 +37,54 @@ namespace sycl {
 inline namespace _V1 {
 namespace ext::oneapi::experimental {
 
+enum class group_algorithm_data_placement { blocked, striped };
+
+struct input_data_placement_key
+    : detail::compile_time_property_key<detail::PropKind::InputDataPlacement> {
+  template <group_algorithm_data_placement Placement>
+  using value_t =
+      property_value<input_data_placement_key,
+                     std::integral_constant<int, static_cast<int>(Placement)>>;
+};
+
+struct output_data_placement_key
+    : detail::compile_time_property_key<
+          detail::PropKind::OutputDataPlacement> {
+  template <group_algorithm_data_placement Placement>
+  using value_t =
+      property_value<output_data_placement_key,
+                     std::integral_constant<int, static_cast<int>(Placement)>>;
+};
+
+template <group_algorithm_data_placement Placement>
+inline constexpr input_data_placement_key::value_t<Placement>
+    input_data_placement;
+
+template <group_algorithm_data_placement Placement>
+inline constexpr output_data_placement_key::value_t<Placement>
+    output_data_placement;
+
+namespace detail {
+
+template <typename Properties>
+constexpr bool isInputBlocked(Properties properties) {
+  if constexpr (properties.template has_property<input_data_placement_key>())
+    return properties.template get_property<input_data_placement_key>() ==
+           input_data_placement<group_algorithm_data_placement::blocked>;
+  else
+    return true;
+}
+
+template <typename Properties>
+constexpr bool isOutputBlocked(Properties properties) {
+  if constexpr (properties.template has_property<output_data_placement_key>())
+    return properties.template get_property<output_data_placement_key>() ==
+           output_data_placement<group_algorithm_data_placement::blocked>;
+  else
+    return true;
+}
+
+} // namespace detail
+
 // ---- group helpers
 template <typename Group, std::size_t Extent> class group_with_scratchpad {
   Group g;
@@ -63,26 +112,9 @@ template <typename Compare = std::less<>> class default_sorter {
   void operator()([[maybe_unused]] Group g, [[maybe_unused]] Ptr first,
                   [[maybe_unused]] Ptr last) {
 #ifdef __SYCL_DEVICE_ONLY__
-    // Adjust the scratch pointer based on alignment of the type T.
-    // Per extension specification if scratch size is less than the value
-    // returned by memory_required then behavior is undefined, so we don't check
-    // that the scratch size statisfies the requirement.
     using T = typename sycl::detail::GetValueType<Ptr>::type;
-    T *scratch_begin = nullptr;
     size_t n = last - first;
-    // We must have a barrier here before array placement new because it is
-    // possible that scratch memory is already in use, so we need to synchronize
-    // work items.
-    sycl::group_barrier(g);
-    if (g.leader()) {
-      void *scratch_ptr = scratch.data();
-      size_t space = scratch.size();
-      scratch_ptr = std::align(alignof(T), n * sizeof(T), scratch_ptr, space);
-      scratch_begin = ::new (scratch_ptr) T[n];
-    }
-    // Broadcast leader's pointer (the beginning of the scratch) to all work
-    // items in the group.
-    scratch_begin = sycl::group_broadcast(g, scratch_begin);
+    T *scratch_begin = sycl::detail::align_scratch<T>(scratch, g, n);
     sycl::detail::merge_sort(g, first, n, comp, scratch_begin);
 #else
     throw sycl::exception(
@@ -94,29 +126,10 @@ template <typename Compare = std::less<>> class default_sorter {
   template <typename Group, typename T>
   T operator()([[maybe_unused]] Group g, T val) {
 #ifdef __SYCL_DEVICE_ONLY__
-    // Adjust the scratch pointer based on alignment of the type T.
-    // Per extension specification if scratch size is less than the value
-    // returned by memory_required then behavior is undefined, so we don't check
-    // that the scratch size statisfies the requirement.
-    T *scratch_begin = nullptr;
     std::size_t local_id = g.get_local_linear_id();
     auto range_size = g.get_local_range().size();
-    // We must have a barrier here before array placement new because it is
-    // possible that scratch memory is already in use, so we need to synchronize
-    // work items.
-    sycl::group_barrier(g);
-    if (g.leader()) {
-      void *scratch_ptr = scratch.data();
-      size_t space = scratch.size();
-      scratch_ptr =
-          std::align(alignof(T), /* output storage and temporary storage */ 2 *
-                                     range_size * sizeof(T),
-                     scratch_ptr, space);
-      scratch_begin = ::new (scratch_ptr) T[2 * range_size];
-    }
-    // Broadcast leader's pointer (the beginning of the scratch) to all work
-    // items in the group.
-    scratch_begin = sycl::group_broadcast(g, scratch_begin);
+    T *scratch_begin = sycl::detail::align_scratch<T>(
+        scratch, g, /* output storage and temporary storage */ 2 * range_size);
     scratch_begin[local_id] = val;
     sycl::detail::merge_sort(g, scratch_begin, range_size, comp,
                              scratch_begin + range_size);
@@ -252,26 +265,9 @@ template <typename CompareT = std::less<>> class joint_sorter {
   void operator()([[maybe_unused]] Group g, [[maybe_unused]] Ptr first,
                   [[maybe_unused]] Ptr last) {
 #ifdef __SYCL_DEVICE_ONLY__
-    // Adjust the scratch pointer based on alignment of the type T.
-    // Per extension specification if scratch size is less than the value
-    // returned by memory_required then behavior is undefined, so we don't check
-    // that the scratch size statisfies the requirement.
     using T = typename sycl::detail::GetValueType<Ptr>::type;
-    T *scratch_begin = nullptr;
     size_t n = last - first;
-    // We must have a barrier here before array placement new because it is
-    // possible that scratch memory is already in use, so we need to synchronize
-    // work items.
-    sycl::group_barrier(g);
-    if (g.leader()) {
-      void *scratch_ptr = scratch.data();
-      size_t space = scratch.size();
-      scratch_ptr = std::align(alignof(T), n * sizeof(T), scratch_ptr, space);
-      scratch_begin = ::new (scratch_ptr) T[n];
-    }
-    // Broadcast leader's pointer (the beginning of the scratch) to all work
-    // items in the group.
-    scratch_begin = sycl::group_broadcast(g, scratch_begin);
+    T *scratch_begin = sycl::detail::align_scratch<T>(scratch, g, n);
     sycl::detail::merge_sort(g, first, n, comp, scratch_begin);
 #else
     throw sycl::exception(
@@ -300,29 +296,10 @@ class group_sorter {
   template <typename Group>
   T operator()([[maybe_unused]] Group g, T val) {
 #ifdef __SYCL_DEVICE_ONLY__
-    // Adjust the scratch pointer based on alignment of the type T.
-    // Per extension specification if scratch size is less than the value
-    // returned by memory_required then behavior is undefined, so we don't check
-    // that the scratch size statisfies the requirement.
-    T *scratch_begin = nullptr;
     std::size_t local_id = g.get_local_linear_id();
     auto range_size = g.get_local_range().size();
-    // We must have a barrier here before array placement new because it is
-    // possible that scratch memory is already in use, so we need to synchronize
-    // work items.
-    sycl::group_barrier(g);
-    if (g.leader()) {
-      void *scratch_ptr = scratch.data();
-      size_t space = scratch.size();
-      scratch_ptr =
-          std::align(alignof(T), /* output storage and temporary storage */ 2 *
-                                     range_size * sizeof(T),
-                     scratch_ptr, space);
-      scratch_begin = ::new (scratch_ptr) T[2 * range_size];
-    }
-    // Broadcast leader's pointer (the beginning of the scratch) to all work
-    // items in the group.
-    scratch_begin = sycl::group_broadcast(g, scratch_begin);
+    T *scratch_begin = sycl::detail::align_scratch<T>(
+        scratch, g, /* output storage and temporary storage */ 2 * range_size);
     scratch_begin[local_id] = val;
     sycl::detail::merge_sort(g, scratch_begin, range_size, comp,
                              scratch_begin + range_size);
@@ -335,6 +312,34 @@ class group_sorter {
     return val;
   }
 
+  template <typename Group, typename Properties>
+  void operator()([[maybe_unused]] Group g,
+                  [[maybe_unused]] sycl::span<T, ElementsPerWorkItem> values,
+                  [[maybe_unused]] Properties properties) {
+#ifdef __SYCL_DEVICE_ONLY__
+    std::size_t local_id = g.get_local_linear_id();
+    auto wg_size = g.get_local_range().size();
+    auto number_of_elements = wg_size * ElementsPerWorkItem;
+    T *scratch_begin = sycl::detail::align_scratch<T>(
+        scratch, g,
+        /* output storage and temporary storage */ 2 * number_of_elements);
+    for (std::uint32_t i = 0; i < ElementsPerWorkItem; ++i)
+      scratch_begin[local_id * ElementsPerWorkItem + i] = values[i];
+    sycl::detail::merge_sort(g, scratch_begin, number_of_elements, comp,
+                             scratch_begin + number_of_elements);
+
+    std::size_t shift{};
+    for (std::uint32_t i = 0; i < ElementsPerWorkItem; ++i) {
+      if constexpr (detail::isOutputBlocked(properties)) {
+        shift = local_id * ElementsPerWorkItem + i;
+      } else {
+        shift = i * wg_size + local_id;
+      }
+      values[i] = scratch_begin[shift];
+    }
+#endif
+  }
+
   static std::size_t memory_required(sycl::memory_scope scope,
                                      size_t range_size) {
     return 2 * joint_sorter<>::template memory_required<T>(
                    scope, range_size * ElementsPerWorkItem);
@@ -480,6 +485,19 @@ class group_sorter {
 #endif
   }
 
+  template <typename Group, typename Properties>
+  void operator()([[maybe_unused]] Group g,
+                  [[maybe_unused]] sycl::span<T, ElementsPerWorkItem> values,
+                  [[maybe_unused]] Properties properties) {
+#ifdef __SYCL_DEVICE_ONLY__
+    sycl::detail::privateStaticSort<
+        /*is_key_value=*/false, detail::isOutputBlocked(properties),
+        OrderT == sorting_order::ascending, ElementsPerWorkItem, bits>(
+        g, values.data(), /*empty*/ values.data(), scratch.data(), first_bit,
+        last_bit);
+#endif
+  }
+
   static constexpr size_t
   memory_required([[maybe_unused]] sycl::memory_scope scope,
                   size_t range_size) {
diff --git a/sycl/include/sycl/ext/oneapi/experimental/group_sort.hpp b/sycl/include/sycl/ext/oneapi/experimental/group_sort.hpp
index d5b67d0cb4df9..5dece1c54f7c4 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/group_sort.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/group_sort.hpp
@@ -114,6 +114,59 @@ sort_over_group(experimental::group_with_scratchpad<Group, Extent> exec,
                 default_sorters::group_sorter<T>(exec.get_memory()));
 }
 
+template <typename Group, typename T, std::size_t ElementsPerWorkItem,
+          typename Sorter, typename Properties = empty_properties_t>
+std::enable_if_t<sycl::ext::oneapi::experimental::is_property_list_v<
+                     std::decay_t<Properties>>,
+                 void>
+sort_over_group([[maybe_unused]] Group g,
+                [[maybe_unused]] sycl::span<T, ElementsPerWorkItem> values,
+                [[maybe_unused]] Sorter sorter,
+                [[maybe_unused]] Properties properties = {}) {
+#ifdef __SYCL_DEVICE_ONLY__
+  return sorter(g, values, properties);
+#else
+  throw sycl::exception(
+      std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()),
+      "Group algorithms are not supported on host device.");
+#endif
+}
+
+template <typename Group, typename T, std::size_t Extent,
+          std::size_t ElementsPerWorkItem,
+          typename Properties = empty_properties_t>
+std::enable_if_t<sycl::ext::oneapi::experimental::is_property_list_v<
+                     std::decay_t<Properties>>,
+                 void>
+sort_over_group(experimental::group_with_scratchpad<Group, Extent> exec,
+                sycl::span<T, ElementsPerWorkItem> values,
+                Properties properties = {}) {
+  return sort_over_group(
+      exec.get_group(), values,
+      default_sorters::group_sorter<T, std::less<>, ElementsPerWorkItem>(
+          exec.get_memory()),
+      properties);
+}
+
+template <typename Group, typename T, typename Compare, std::size_t Extent,
+          std::size_t ElementsPerWorkItem,
+          typename Properties = empty_properties_t>
+std::enable_if_t<!sycl::ext::oneapi::experimental::is_property_list_v<
+                     std::decay_t<Compare>> &&
+                     sycl::ext::oneapi::experimental::is_property_list_v<
+                         std::decay_t<Properties>>,
+                 void>
+sort_over_group(experimental::group_with_scratchpad<Group, Extent> exec,
+                sycl::span<T, ElementsPerWorkItem> values, Compare comp,
+                Properties properties = {}) {
+  return sort_over_group(
+      exec.get_group(), values,
+      default_sorters::group_sorter<T, Compare, ElementsPerWorkItem>(
+          exec.get_memory(), comp),
+      properties);
+}
+
 // ---- joint_sort
 template <typename Group, typename Ptr, typename Sorter>
 std::enable_if_t<detail::is_sorter<Sorter, Group, Ptr>::value, void>
diff --git a/sycl/include/sycl/ext/oneapi/properties/property.hpp b/sycl/include/sycl/ext/oneapi/properties/property.hpp
index 559dd20c5fe09..3f1bb28268d39 100644
--- a/sycl/include/sycl/ext/oneapi/properties/property.hpp
+++ b/sycl/include/sycl/ext/oneapi/properties/property.hpp
@@ -207,8 +207,10 @@ enum PropKind : uint32_t {
   SingleTaskKernel = 66,
   IndirectlyCallable = 67,
   CallsIndirectly = 68,
+  InputDataPlacement = 69,
+  OutputDataPlacement = 70,
   // PropKindSize must always be the last value.
-  PropKindSize = 69,
+  PropKindSize = 71,
 };
 
 struct property_key_base_tag {};
diff --git a/sycl/test-e2e/GroupAlgorithm/SYCL2020/group_sort/array_input_sort.cpp b/sycl/test-e2e/GroupAlgorithm/SYCL2020/group_sort/array_input_sort.cpp
new file mode 100644
index 0000000000000..b7ced46f10edb
--- /dev/null
+++ b/sycl/test-e2e/GroupAlgorithm/SYCL2020/group_sort/array_input_sort.cpp
@@ -0,0 +1,250 @@
+// REQUIRES: sg-8
+// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
+// RUN: %{run} %t.out
+
+// The test verifies sorting APIs for fixed-size array input from the
+// group_sort extension.
+#include "common.hpp" +#include + +#include +#include +#include +#include +#include + +template +void RunSortOverGroupArray(sycl::queue &Q, const std::vector &DataToSort, + const Compare &Comp, Property Prop) { + + const size_t WorkSize = DataToSort.size() / ElementsPerWorkItem; + const size_t NumSubGroups = WorkSize / ReqSubGroupSize + 1; + + const sycl::nd_range NDRange = [&]() { + if constexpr (Dims == 1) + return sycl::nd_range<1>{{WorkSize}, {WorkSize}}; + else + return sycl::nd_range<2>{{1, WorkSize}, {1, WorkSize}}; + static_assert(Dims < 3, + "Only one and two dimensional kernels are supported"); + }(); + + using DefaultSorterT = + oneapi_exp::default_sorters::group_sorter; + + using RadixSorterT = oneapi_exp::radix_sorters::group_sorter< + typename ConvertToSimpleType::Type, + ConvertToSortingOrder::Type, ElementsPerWorkItem>; + + constexpr bool IsSG = (UseGroup == UseGroupT::SubGroup); + auto Scope = + IsSG ? sycl::memory_scope::sub_group : sycl::memory_scope::work_group; + auto WGSize = NDRange.get_local_range().size(); + auto GroupSize = IsSG ? ReqSubGroupSize : WGSize; + std::size_t LocalMemorySizeDefault = + DefaultSorterT::memory_required(Scope, GroupSize); + std::size_t LocalMemorySizeRadix = + RadixSorterT::memory_required(Scope, GroupSize); + std::array, 4> DataToSortCase = {DataToSort, DataToSort, + DataToSort, DataToSort}; + + // Sort data using 3 different versions of sort_over_group API + { + std::array>, 4> BufToSort; + for (int i = 0; i < 4; i++) + BufToSort[i].reset(new sycl::buffer(DataToSortCase[i].data(), + DataToSortCase[i].size())); + + Q.submit([&](sycl::handler &CGH) { + auto AccToSort0 = sycl::accessor(*BufToSort[0], CGH); + auto AccToSort1 = sycl::accessor(*BufToSort[1], CGH); + auto AccToSort2 = sycl::accessor(*BufToSort[2], CGH); + auto AccToSort3 = sycl::accessor(*BufToSort[3], CGH); + + // Allocate local memory for all sub-groups in a work-group + const size_t TotalLocalMemSizeDefault = + IsSG ? LocalMemorySizeDefault * NumSubGroups + : LocalMemorySizeDefault; + sycl::local_accessor ScratchDefault( + {TotalLocalMemSizeDefault}, CGH); + + const size_t TotalLocalMemSizeRadix = + IsSG ? LocalMemorySizeRadix * NumSubGroups : LocalMemorySizeRadix; + + sycl::local_accessor ScratchRadix({TotalLocalMemSizeRadix}, + CGH); + + CGH.parallel_for( + NDRange, [=](sycl::nd_item id) [[intel::reqd_sub_group_size( + ReqSubGroupSize)]] { + const size_t GlobalLinearID = id.get_global_linear_id(); + using RadixSorterT = oneapi_exp::radix_sorters::group_sorter< + typename ConvertToSimpleType::Type, + ConvertToSortingOrder::Type, ElementsPerWorkItem>; + + auto Group = [&]() { + if constexpr (IsSG) + return id.get_sub_group(); + else + return id.get_group(); + }(); + + // Each sub-group should use its own part of the scratch pad + const size_t ScratchShiftDefault = + IsSG ? 
+                 IsSG ? id.get_sub_group().get_group_linear_id() *
+                            LocalMemorySizeDefault
+                      : 0;
+             std::byte *ScratchPtrDefault =
+                 &ScratchDefault[0] + ScratchShiftDefault;
+
+             T ValsPrivate[ElementsPerWorkItem];
+
+             auto ReadToPrivate = [&](auto Acc) {
+               for (std::size_t I = 0; I < ElementsPerWorkItem; ++I)
+                 ValsPrivate[I] = Acc[GlobalLinearID * ElementsPerWorkItem + I];
+             };
+             auto WriteToGlobal = [&](auto Acc) {
+               for (std::size_t I = 0; I < ElementsPerWorkItem; ++I)
+                 Acc[GlobalLinearID * ElementsPerWorkItem + I] = ValsPrivate[I];
+             };
+
+             auto Scratch =
+                 sycl::span{ScratchPtrDefault, LocalMemorySizeDefault};
+             auto PrivateArr = sycl::span<T, ElementsPerWorkItem>{
+                 ValsPrivate, ValsPrivate + ElementsPerWorkItem};
+             if constexpr (std::is_same_v<Compare, std::less<T>>) {
+               ReadToPrivate(AccToSort0);
+               oneapi_exp::sort_over_group(
+                   oneapi_exp::group_with_scratchpad(Group, Scratch),
+                   PrivateArr, Prop); // (4)
+               WriteToGlobal(AccToSort0);
+             }
+
+             ReadToPrivate(AccToSort1);
+             oneapi_exp::sort_over_group(
+                 oneapi_exp::group_with_scratchpad(Group, Scratch), PrivateArr,
+                 Comp, Prop); // (5)
+             WriteToGlobal(AccToSort1);
+
+             ReadToPrivate(AccToSort2);
+             oneapi_exp::sort_over_group(Group, PrivateArr,
+                                         DefaultSorterT(Scratch), Prop); // (6)
+             WriteToGlobal(AccToSort2);
+
+             // Each sub-group should use its own part of the scratch pad
+             const size_t ScratchShiftRadix =
+                 IsSG ? id.get_sub_group().get_group_linear_id() *
+                            LocalMemorySizeRadix
+                      : 0;
+             std::byte *ScratchPtrRadix = &ScratchRadix[0] + ScratchShiftRadix;
+
+             // Radix doesn't support custom types
+             if constexpr (!std::is_same_v<CustomType, T>) {
+               ReadToPrivate(AccToSort3);
+               oneapi_exp::sort_over_group(
+                   Group, PrivateArr,
+                   RadixSorterT(
+                       sycl::span{ScratchPtrRadix, LocalMemorySizeRadix}),
+                   Prop); // (6)
+               WriteToGlobal(AccToSort3);
+             }
+           });
+     }).wait_and_throw();
+  }
+
+  // Verification
+  {
+    // Emulate independent sorting of each work-group/sub-group
+    const size_t ChunkSize = GroupSize * ElementsPerWorkItem;
+    std::vector<T> TempSorted = DataToSort;
+    auto It = TempSorted.begin();
+    for (; (It + ChunkSize) < TempSorted.end(); It += ChunkSize)
+      std::sort(It, It + ChunkSize, Comp);
+
+    // Sort the remainder
+    std::sort(It, TempSorted.end(), Comp);
+    std::vector<T> DataSorted;
+    DataSorted.resize(TempSorted.size());
+    writeBlockedOrStriped(/*In */ TempSorted, /* Out */ DataSorted, GroupSize,
+                          ElementsPerWorkItem, Prop);
+
+    if constexpr (std::is_same_v<Compare, std::less<T>>)
+      assert(DataToSortCase[0] == DataSorted);
+
+    assert(DataToSortCase[1] == DataSorted);
+    assert(DataToSortCase[2] == DataSorted);
+    // Radix doesn't support custom types
+    if constexpr (!std::is_same_v<CustomType, T>)
+      assert(DataToSortCase[3] == DataSorted);
+  }
+}
+
+template <
+    UseGroupT UseGroup, int Dim, size_t ElementsPerWorkItem, class T,
+    typename Compare,
+    typename Properties = sycl::ext::oneapi::experimental::empty_properties_t>
+void RunOnData(sycl::queue &Q, const std::vector<T> &Data,
+               const Compare &Comparator, Properties Prop = {}) {
+  if constexpr (UseGroup == UseGroupT::SubGroup)
+    if (Q.get_backend() == sycl::backend::ext_oneapi_cuda ||
+        Q.get_backend() == sycl::backend::ext_oneapi_hip) {
+      std::cout << "Note! Skipping sub group testing on CUDA and HIP backends"
+                << std::endl;
+      return;
+    }
+
+  RunSortOverGroupArray<UseGroup, Dim, ElementsPerWorkItem>(Q, Data,
+                                                            Comparator, Prop);
+}
+
+template <class T> void RunOverType(sycl::queue &Q, size_t DataSize) {
+  constexpr size_t PerWI = 4;
+  std::vector<T> ArrayDataRandom(DataSize * PerWI);
+
+  // Fill using random numbers
+  std::default_random_engine generator;
+  std::normal_distribution distribution((10.0), (2.0));
+  for (T &Elem : ArrayDataRandom)
+    Elem = T(distribution(generator));
+
+  auto blocked = oneapi_exp::properties{oneapi_exp::input_data_placement<
+      oneapi_exp::group_algorithm_data_placement::blocked>};
+  auto striped = oneapi_exp::properties{oneapi_exp::input_data_placement<
+      oneapi_exp::group_algorithm_data_placement::striped>};
+
+  RunOnData<UseGroupT::WorkGroup, 1, PerWI, T>(Q, ArrayDataRandom,
+                                               std::less<T>{}, blocked);
+  RunOnData<UseGroupT::WorkGroup, 1, PerWI, T>(Q, ArrayDataRandom,
+                                               std::greater<T>{}, striped);
+  RunOnData<UseGroupT::WorkGroup, 2, PerWI, T>(Q, ArrayDataRandom,
+                                               std::less<T>{}, blocked);
+  RunOnData<UseGroupT::WorkGroup, 2, PerWI, T>(Q, ArrayDataRandom,
+                                               std::greater<T>{}, striped);
+  RunOnData<UseGroupT::SubGroup, 1, PerWI, T>(Q, ArrayDataRandom,
+                                              std::greater<T>{}, blocked);
+  RunOnData<UseGroupT::SubGroup, 1, PerWI, T>(Q, ArrayDataRandom,
+                                              std::less<T>{}, striped);
+  RunOnData<UseGroupT::SubGroup, 2, PerWI, T>(Q, ArrayDataRandom,
+                                              std::greater<T>{}, blocked);
+  RunOnData<UseGroupT::SubGroup, 2, PerWI, T>(Q, ArrayDataRandom,
+                                              std::less<T>{}, striped);
+}
+
+int main() {
+  try {
+    sycl::queue Q;
+
+    static constexpr size_t Size = 18;
+
+    RunOverType<std::int32_t>(Q, Size);
+    RunOverType<CustomType>(Q, Size);
+
+    std::cout << "Test passed." << std::endl;
+    return 0;
+  } catch (std::exception &E) {
+    std::cout << "Test failed" << std::endl;
+    std::cout << E.what() << std::endl;
+    return 1;
+  }
+}
diff --git a/sycl/test-e2e/GroupAlgorithm/SYCL2020/group_sort/common.hpp b/sycl/test-e2e/GroupAlgorithm/SYCL2020/group_sort/common.hpp
index bec228a074b47..8f3317addb6e0 100644
--- a/sycl/test-e2e/GroupAlgorithm/SYCL2020/group_sort/common.hpp
+++ b/sycl/test-e2e/GroupAlgorithm/SYCL2020/group_sort/common.hpp
@@ -48,3 +48,45 @@ constexpr size_t ReqSubGroupSize = 8;
 template <UseGroupT UseGroup, int Dims, class T, class Compare>
 class KernelNameOverGroup;
 template <UseGroupT UseGroup, int Dims, class T, class Compare>
 class KernelNameJoint;
+
+template <typename Properties>
+constexpr bool IsOutputBlocked(Properties properties) {
+  if constexpr (properties.template has_property<
+                    oneapi_exp::output_data_placement_key>())
+    return properties.template get_property<
+               oneapi_exp::output_data_placement_key>() ==
+           oneapi_exp::output_data_placement<
+               oneapi_exp::group_algorithm_data_placement::blocked>;
+  else
+    return true;
+}
+
+// Write data from In to Out in a blocked/striped way.
+template <typename T, typename Properties>
+void writeBlockedOrStriped(const std::vector<T> &In, std::vector<T> &Out,
+                           size_t MaxGroupSize, size_t ElementsPerWorkItem,
+                           Properties Prop) {
+  assert(In.size() == Out.size());
+  size_t index = {};
+  size_t shift = {};
+  auto ChunkSize = MaxGroupSize * ElementsPerWorkItem;
+  for (std::uint32_t ChunkStart = 0; ChunkStart < In.size();
+       ChunkStart += ChunkSize) {
+    auto GroupSize = (In.size() - ChunkStart) >= ChunkSize
+                         ? MaxGroupSize
+                         : (In.size() - ChunkStart) / ElementsPerWorkItem;
+    for (std::uint32_t j = 0; j < GroupSize; ++j) {
+      for (std::uint32_t k = 0; k < ElementsPerWorkItem; ++k) {
+        index = ChunkStart + j * ElementsPerWorkItem + k;
+        if constexpr (IsOutputBlocked(Prop)) {
+          shift = index;
+        } else {
+          shift = ChunkStart + k * GroupSize + j;
+        }
+        if (index < Out.size() && shift < In.size())
+          Out[index] = In[shift];
+      }
+    }
+  }
+}
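
---

For reviewers: below is a minimal, self-contained sketch of how the new span overload of `sort_over_group` added by this diff is meant to be called from user code. It is not part of the PR; the names `WGSize`, `PerWI`, and `N` are illustrative, and it assumes a device whose maximum work-group size is at least 8. Each work item holds `PerWI` private elements, the whole work-group is sorted cooperatively, and the `output_data_placement` property selects striped write-back into the spans.

```cpp
#include <sycl/sycl.hpp>

#include <cstddef>
#include <numeric>
#include <vector>

namespace oneapi_exp = sycl::ext::oneapi::experimental;

int main() {
  sycl::queue Q;
  constexpr size_t WGSize = 8, PerWI = 4, N = WGSize * PerWI;

  std::vector<int> Data(N);
  std::iota(Data.rbegin(), Data.rend(), 0); // descending input

  {
    sycl::buffer<int> Buf(Data.data(), Data.size());
    Q.submit([&](sycl::handler &CGH) {
      auto Acc = sycl::accessor(Buf, CGH);
      // Ask the default sorter how much local scratch it needs for this
      // group size; undersized scratch is UB per the extension spec.
      using SorterT =
          oneapi_exp::default_sorters::group_sorter<int, std::less<>, PerWI>;
      const size_t ScratchSize =
          SorterT::memory_required(sycl::memory_scope::work_group, WGSize);
      sycl::local_accessor<std::byte, 1> Scratch({ScratchSize}, CGH);

      CGH.parallel_for(
          sycl::nd_range<1>{{WGSize}, {WGSize}}, [=](sycl::nd_item<1> It) {
            const size_t LID = It.get_local_linear_id();
            int Vals[PerWI];
            for (size_t I = 0; I < PerWI; ++I) // blocked read
              Vals[I] = Acc[LID * PerWI + I];

            // New overload: sort PerWI elements per work item and write the
            // result back to the span in striped order.
            oneapi_exp::sort_over_group(
                oneapi_exp::group_with_scratchpad(
                    It.get_group(), sycl::span{&Scratch[0], ScratchSize}),
                sycl::span<int, PerWI>{Vals},
                oneapi_exp::properties{oneapi_exp::output_data_placement<
                    oneapi_exp::group_algorithm_data_placement::striped>});

            for (size_t I = 0; I < PerWI; ++I) // striped write-back
              Acc[I * WGSize + LID] = Vals[I];
          });
    });
  } // Buf's destructor copies the sorted data back into Data.
}
```

With striped output placement, work item `LID` receives elements `LID`, `LID + WGSize`, `LID + 2 * WGSize`, ..., so the striped write-back above lands the fully sorted sequence contiguously in global memory; with the default blocked placement the write-back loop would instead use `Acc[LID * PerWI + I]`.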