Skip to content

Commit 16eb5fc

Browse files
committed
[UR][CUDA][HIP] Fix incorrect outputs and improve performance of queue::fill
Update the UR tag to fix queue::fill for the CUDA and HIP backends, which was previously producing incorrect outputs for any pattern size other than 1, 2, or a multiple of 4 bytes. A new optimisation is also added which speeds up the fill greatly if the pattern equals to the first word repeated throughout (e.g. all zeros). Add a new e2e test to validate queue::fill outputs for any pattern size between 1 and 32 bytes. This test fails for CUDA and HIP before the UR change and passes with this PR. Other backends already worked correctly.
1 parent b03552a commit 16eb5fc

File tree

3 files changed

+85
-8
lines changed

3 files changed

+85
-8
lines changed

sycl/cmake/modules/FetchUnifiedRuntime.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ if(SYCL_UR_USE_FETCH_CONTENT)
116116
CACHE PATH "Path to external '${name}' adapter source dir" FORCE)
117117
endfunction()
118118

119-
set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
119+
set(UNIFIED_RUNTIME_REPO "https://github.com/rafbiels/unified-runtime.git")
120120
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/UnifiedRuntimeTag.cmake)
121121

122122
set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES")
Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
1-
# commit 3a5b23c8b475712f9107c1d5ab41f27a1465578e
2-
# Merge: f9f71f17 1696524d
3-
# Author: Piotr Balcer <[email protected]>
4-
# Date: Thu Nov 14 14:38:05 2024 +0100
5-
# Merge pull request #2253 from pbalcer/low-power-events
6-
# add low-power events experimental extension spec
7-
set(UNIFIED_RUNTIME_TAG 3a5b23c8b475712f9107c1d5ab41f27a1465578e)
1+
# commit 6f9d5c5a02502b13e17a7ac5c526722f86b2af47
2+
# Author: Rafal Bielski <[email protected]>
3+
# Date: Thu Oct 31 23:13:30 2024 +0000
4+
# Fix incorrect outputs and improve performance of commonMemSetLargePattern
5+
set(UNIFIED_RUNTIME_TAG rafbiels/improve-memset)

sycl/test-e2e/USM/fill_any_size.cpp

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
// RUN: %{build} -o %t1.out
2+
// RUN: %{run} %t1.out
3+
4+
/**
5+
* Test of the queue::fill interface with a range of pattern sizes and values.
6+
*
7+
* Loops over pattern sizes from 1 to MaxPatternSize bytes and calls queue::fill
8+
* with std::array<uint8_t,Size> for the pattern. Two pattern values are tested,
9+
* all zeros and value=index+42. The output is copied back to host and
10+
* validated.
11+
*/
12+
13+
#include <array>
14+
#include <cstdio>
15+
#include <sycl/detail/core.hpp>
16+
#include <sycl/usm.hpp>
17+
18+
constexpr size_t MaxPatternSize{32}; // bytes
19+
constexpr size_t NumElements{10};
20+
constexpr size_t NumRepeats{1};
21+
constexpr bool verbose{false};
22+
23+
template <size_t PatternSize, bool SameValue>
24+
int test(sycl::queue &q, uint8_t firstValue = 0) {
25+
using T = std::array<uint8_t, PatternSize>;
26+
T value{};
27+
for (unsigned int i{0}; i < PatternSize; ++i) {
28+
if constexpr (SameValue) {
29+
value[0] = firstValue;
30+
} else {
31+
value[i] = firstValue + i;
32+
}
33+
}
34+
35+
T *dptr{sycl::malloc_device<T>(NumElements, q)};
36+
for (size_t repeat{0}; repeat < NumRepeats; ++repeat) {
37+
q.fill(dptr, value, NumElements).wait();
38+
}
39+
40+
std::array<T, NumElements> host{};
41+
q.copy<T>(dptr, host.data(), NumElements).wait();
42+
bool pass{true};
43+
for (unsigned int i{0}; i < NumElements; ++i) {
44+
for (unsigned int j{0}; j < PatternSize; ++j) {
45+
if (host[i][j] != value[j]) {
46+
pass = false;
47+
}
48+
}
49+
}
50+
sycl::free(dptr, q);
51+
52+
if (!pass || verbose) {
53+
printf("Pattern size %3lu bytes, %s values (initial %3u) %s\n", PatternSize,
54+
(SameValue ? " equal" : "varied"), firstValue,
55+
(pass ? "== PASS ==" : "== FAIL =="));
56+
}
57+
58+
return !pass;
59+
}
60+
61+
template <size_t Size> int testOneSize(sycl::queue &q) {
62+
return test<Size, true>(q, 0) + test<Size, false>(q, 42);
63+
}
64+
65+
template <size_t... Sizes>
66+
int testSizes(sycl::queue &q, std::index_sequence<Sizes...>) {
67+
return (testOneSize<1u + Sizes>(q) + ...);
68+
}
69+
70+
int main() {
71+
sycl::queue q{};
72+
int failures = testSizes(q, std::make_index_sequence<MaxPatternSize>{});
73+
if (failures > 0) {
74+
printf("%d / %lu tests failed\n", failures, 2 * MaxPatternSize);
75+
} else {
76+
printf("All %lu tests passed\n", 2 * MaxPatternSize);
77+
}
78+
return failures;
79+
}

0 commit comments

Comments
 (0)