From 9941a6ce1e760862fa4fcff5de15fe2577a644b8 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Wed, 2 Sep 2020 23:19:38 -0700 Subject: [PATCH 1/3] [SYCL] Store stream buffers in the scheduler Stream buffers need to be alive after submitting a kernel because it is executed by the scheduler asynchronosly. For this reason currently stream buffers are stored in an associated stream object. This stream object is passed to the handler and then forwarded further to a commandi group to keep streamm buffers alive for the scheduler. But there is a problem with this approach. A command group cannot be destroyed while stream buffers (which are accessed in this command group) are alive. Stream buffers are destroyed only if the stream is destroyed. Stream object is destrtoyed only if command group is destroyed. So, there is a loop dependcy. Which results in memory leaks. Solution is to store stream buffers in the scheduler for each stream. With this approach resources are released properly. --- sycl/source/detail/scheduler/scheduler.cpp | 8 +++ sycl/source/detail/scheduler/scheduler.hpp | 32 +++++++++++ sycl/source/detail/stream_impl.cpp | 54 ++++++++++++++----- sycl/source/detail/stream_impl.hpp | 25 ++------- sycl/test/abi/sycl_symbols_linux.dump | 3 ++ .../stream/release_resources_test.cpp | 34 ++++++++++++ 6 files changed, 121 insertions(+), 35 deletions(-) create mode 100644 sycl/test/basic_tests/stream/release_resources_test.cpp diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 979e4b7d708c5..6f6953c07a942 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -238,6 +238,14 @@ void Scheduler::enqueueLeavesOfReqUnlocked(const Requirement *const Req) { EnqueueLeaves(Record->MWriteLeaves); } +void Scheduler::allocateStreamBuffers(stream_impl *Impl, + size_t StreamBufferSize, + size_t FlushBufferSize) { + std::lock_guard lock(StreamBuffersPoolMutex); + StreamBuffersPool.insert( + {Impl, StreamBuffers(StreamBufferSize, FlushBufferSize)}); +} + Scheduler::Scheduler() { sycl::device HostDevice; DefaultHostQueue = QueueImplPtr( diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 86a015f978cc0..0e6d7f5a5e74f 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -715,6 +715,38 @@ class Scheduler { friend class Command; friend class DispatchHostTask; + + class StreamBuffers { + public: + StreamBuffers(size_t StreamBufferSize, size_t FlushBufferSize) + // Initialize stream buffer with zeros, this is needed for two reasons: + // 1. We don't need to care about end of line when printing out + // streamed data. + // 2. Offset is properly initialized. + : Data(StreamBufferSize, 0), + Buf(Data.data(), range<1>(StreamBufferSize), + {property::buffer::use_host_ptr()}), + FlushBuf(range<1>(FlushBufferSize)) {} + + // Vector on the host side which is used to initialize the stream + // buffer + std::vector Data; + + // Stream buffer + buffer Buf; + + // Global flush buffer + buffer FlushBuf; + }; + + friend class stream_impl; + + // Protects stream buffers pool + std::mutex StreamBuffersPoolMutex; + std::map StreamBuffersPool; + + // Allocate buffers in the pool for a provided stream + void allocateStreamBuffers(stream_impl *, size_t, size_t); }; } // namespace detail diff --git a/sycl/source/detail/stream_impl.cpp b/sycl/source/detail/stream_impl.cpp index db79b83fd1408..33708c2cbe8bd 100644 --- a/sycl/source/detail/stream_impl.cpp +++ b/sycl/source/detail/stream_impl.cpp @@ -17,18 +17,44 @@ namespace detail { stream_impl::stream_impl(size_t BufferSize, size_t MaxStatementSize, handler &CGH) - : BufferSize_(BufferSize), MaxStatementSize_(MaxStatementSize), - // Allocate additional place for the offset variable and the end of line - // symbol. Initialize buffer with zeros, this is needed for two reasons: - // 1. We don't need to care about end of line when printing out streamed - // data. - // 2. Offset is properly initialized. - Data(BufferSize + OffsetSize + 1, 0), - Buf(Data.data(), range<1>(BufferSize + OffsetSize + 1), - {property::buffer::use_host_ptr()}), - - FlushBuf(range<1>(MaxStatementSize)) {} + : BufferSize_(BufferSize), MaxStatementSize_(MaxStatementSize) { + // We need to store stream buffers in the scheduler because they need to be + // alive after submitting the kernel. They cannot be stored in the stream + // object because it causes loop dependency between objects and results in + // memory leak. + // Allocate additional place in the stream buffer for the offset variable and + // the end of line symbol. + detail::Scheduler::getInstance().allocateStreamBuffers( + this, BufferSize + OffsetSize + 1 /* size of the stream buffer */, + MaxStatementSize /* size of the flush buffer */); +} + +// Method to provide an access to the global stream buffer +GlobalBufAccessorT stream_impl::accessGlobalBuf(handler &CGH) { + return detail::Scheduler::getInstance() + .StreamBuffersPool.find(this) + ->second.Buf.get_access( + CGH, range<1>(BufferSize_), id<1>(OffsetSize)); +} +// Method to provide an accessor to the global flush buffer +GlobalBufAccessorT stream_impl::accessGlobalFlushBuf(handler &CGH) { + return detail::Scheduler::getInstance() + .StreamBuffersPool.find(this) + ->second.FlushBuf.get_access( + CGH, range<1>(MaxStatementSize_), id<1>(0)); +} + +// Method to provide an atomic access to the offset in the global stream +// buffer and offset in the flush buffer +GlobalOffsetAccessorT stream_impl::accessGlobalOffset(handler &CGH) { + auto OffsetSubBuf = buffer( + detail::Scheduler::getInstance().StreamBuffersPool.find(this)->second.Buf, + id<1>(0), range<1>(OffsetSize)); + auto ReinterpretedBuf = OffsetSubBuf.reinterpret(range<1>(2)); + return ReinterpretedBuf.get_access( + CGH, range<1>(2), id<1>(0)); +} size_t stream_impl::get_size() const { return BufferSize_; } size_t stream_impl::get_max_statement_size() const { return MaxStatementSize_; } @@ -36,8 +62,10 @@ size_t stream_impl::get_max_statement_size() const { return MaxStatementSize_; } void stream_impl::flush() { // Access the stream buffer on the host. This access guarantees that kernel is // executed and buffer contains streamed data. - auto HostAcc = Buf.get_access( - range<1>(BufferSize_), id<1>(OffsetSize)); + auto HostAcc = detail::Scheduler::getInstance() + .StreamBuffersPool.find(this) + ->second.Buf.get_access( + range<1>(BufferSize_), id<1>(OffsetSize)); printf("%s", HostAcc.get_pointer()); fflush(stdout); diff --git a/sycl/source/detail/stream_impl.hpp b/sycl/source/detail/stream_impl.hpp index a4c8bdd559e53..e3b548c07bee9 100644 --- a/sycl/source/detail/stream_impl.hpp +++ b/sycl/source/detail/stream_impl.hpp @@ -26,25 +26,14 @@ class __SYCL_EXPORT stream_impl { stream_impl(size_t BufferSize, size_t MaxStatementSize, handler &CGH); // Method to provide an access to the global stream buffer - GlobalBufAccessorT accessGlobalBuf(handler &CGH) { - return Buf.get_access( - CGH, range<1>(BufferSize_), id<1>(OffsetSize)); - } + GlobalBufAccessorT accessGlobalBuf(handler &CGH); // Method to provide an accessor to the global flush buffer - GlobalBufAccessorT accessGlobalFlushBuf(handler &CGH) { - return FlushBuf.get_access( - CGH, range<1>(MaxStatementSize_), id<1>(0)); - } + GlobalBufAccessorT accessGlobalFlushBuf(handler &CGH); // Method to provide an atomic access to the offset in the global stream // buffer and offset in the flush buffer - GlobalOffsetAccessorT accessGlobalOffset(handler &CGH) { - auto OffsetSubBuf = buffer(Buf, id<1>(0), range<1>(OffsetSize)); - auto ReinterpretedBuf = OffsetSubBuf.reinterpret(range<1>(2)); - return ReinterpretedBuf.get_access( - CGH, range<1>(2), id<1>(0)); - } + GlobalOffsetAccessorT accessGlobalOffset(handler &CGH); // Copy stream buffer to the host and print the contents void flush(); @@ -65,14 +54,6 @@ class __SYCL_EXPORT stream_impl { // 2 variables: offset in the stream buffer and offset in the flush buffer. static const size_t OffsetSize = 2 * sizeof(unsigned); - // Vector on the host side which is used to initialize the stream buffer - std::vector Data; - - // Stream buffer - buffer Buf; - - // Global flush buffer - buffer FlushBuf; }; } // namespace detail diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 6bc212fb4a34f..36a994dd94c3d 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -4156,3 +4156,6 @@ _ZNK2cl4sycl9exception11has_contextEv _ZNK2cl4sycl9exception4whatEv __sycl_register_lib __sycl_unregister_lib +_ZN2cl4sycl6detail11stream_impl15accessGlobalBufERNS0_7handlerE +_ZN2cl4sycl6detail11stream_impl20accessGlobalFlushBufERNS0_7handlerE +_ZN2cl4sycl6detail11stream_impl18accessGlobalOffsetERNS0_7handlerE diff --git a/sycl/test/basic_tests/stream/release_resources_test.cpp b/sycl/test/basic_tests/stream/release_resources_test.cpp new file mode 100644 index 0000000000000..d57e21cefbf27 --- /dev/null +++ b/sycl/test/basic_tests/stream/release_resources_test.cpp @@ -0,0 +1,34 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_PI_TRACE=2 %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER +// RUN: env SYCL_PI_TRACE=2 %GPU_RUN_PLACEHOLDER %t.out %GPU_CHECK_PLACEHOLDER +// RUN: env SYCL_PI_TRACE=2 %ACC_RUN_PLACEHOLDER %t.out %ACC_CHECK_PLACEHOLDER + +//==----------------------- release_resources_test.cpp ---------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// Check that buffer used by a stream object is released. + +#include + +using namespace cl::sycl; + +int main() { + { + queue Queue; + + // CHECK:---> piMemRelease + Queue.submit([&](handler &CGH) { + stream Out(1024, 80, CGH); + CGH.parallel_for( + range<1>(2), [=](id<1> i) { Out << "Hello, World!" << endl; }); + }); + Queue.wait(); + } + + return 0; +} From a0bf82066ae45b37e187dae0924ff79d4efc6ffc Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Thu, 3 Sep 2020 23:22:08 -0700 Subject: [PATCH 2/3] Address review comments and fix CI fails --- sycl/source/detail/scheduler/scheduler.cpp | 5 +++++ sycl/source/detail/scheduler/scheduler.hpp | 11 ++++++++--- sycl/source/detail/stream_impl.cpp | 17 +++++++++++------ .../stream/release_resources_test.cpp | 8 -------- 4 files changed, 24 insertions(+), 17 deletions(-) diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 6f6953c07a942..1ca92d93080c4 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -246,6 +246,11 @@ void Scheduler::allocateStreamBuffers(stream_impl *Impl, {Impl, StreamBuffers(StreamBufferSize, FlushBufferSize)}); } +void Scheduler::deallocateStreamBuffers(stream_impl *Impl) { + std::lock_guard lock(StreamBuffersPoolMutex); + StreamBuffersPool.erase(Impl); +} + Scheduler::Scheduler() { sycl::device HostDevice; DefaultHostQueue = QueueImplPtr( diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 0e6d7f5a5e74f..d6dec5e599f8c 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -716,8 +716,10 @@ class Scheduler { friend class Command; friend class DispatchHostTask; - class StreamBuffers { - public: + /// Stream buffers structure. + /// + /// The structure contains all buffers for a stream object. + struct StreamBuffers { StreamBuffers(size_t StreamBufferSize, size_t FlushBufferSize) // Initialize stream buffer with zeros, this is needed for two reasons: // 1. We don't need to care about end of line when printing out @@ -745,8 +747,11 @@ class Scheduler { std::mutex StreamBuffersPoolMutex; std::map StreamBuffersPool; - // Allocate buffers in the pool for a provided stream + /// Allocate buffers in the pool for a provided stream void allocateStreamBuffers(stream_impl *, size_t, size_t); + + /// Deallocate buffers in the pool for a provided stream + void deallocateStreamBuffers(stream_impl *); }; } // namespace detail diff --git a/sycl/source/detail/stream_impl.cpp b/sycl/source/detail/stream_impl.cpp index 33708c2cbe8bd..2cc9db9244f2c 100644 --- a/sycl/source/detail/stream_impl.cpp +++ b/sycl/source/detail/stream_impl.cpp @@ -62,13 +62,18 @@ size_t stream_impl::get_max_statement_size() const { return MaxStatementSize_; } void stream_impl::flush() { // Access the stream buffer on the host. This access guarantees that kernel is // executed and buffer contains streamed data. - auto HostAcc = detail::Scheduler::getInstance() - .StreamBuffersPool.find(this) - ->second.Buf.get_access( - range<1>(BufferSize_), id<1>(OffsetSize)); + { + auto HostAcc = detail::Scheduler::getInstance() + .StreamBuffersPool.find(this) + ->second.Buf.get_access( + range<1>(BufferSize_), id<1>(OffsetSize)); - printf("%s", HostAcc.get_pointer()); - fflush(stdout); + printf("%s", HostAcc.get_pointer()); + fflush(stdout); + } + + // Flushed the stream, can deallocate the buffers now. + detail::Scheduler::getInstance().deallocateStreamBuffers(this); } } // namespace detail } // namespace sycl diff --git a/sycl/test/basic_tests/stream/release_resources_test.cpp b/sycl/test/basic_tests/stream/release_resources_test.cpp index d57e21cefbf27..8b7d2ee73db7b 100644 --- a/sycl/test/basic_tests/stream/release_resources_test.cpp +++ b/sycl/test/basic_tests/stream/release_resources_test.cpp @@ -3,14 +3,6 @@ // RUN: env SYCL_PI_TRACE=2 %GPU_RUN_PLACEHOLDER %t.out %GPU_CHECK_PLACEHOLDER // RUN: env SYCL_PI_TRACE=2 %ACC_RUN_PLACEHOLDER %t.out %ACC_CHECK_PLACEHOLDER -//==----------------------- release_resources_test.cpp ---------------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - // Check that buffer used by a stream object is released. #include From 4708e9617ee6922d30ae40b4750fc29431633ff0 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Fri, 4 Sep 2020 00:44:04 -0700 Subject: [PATCH 3/3] [SYCL] Disable flaky test after uplifting to Level Zero v1.0 --- sycl/test/basic_tests/image_accessor_readsampler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test/basic_tests/image_accessor_readsampler.cpp b/sycl/test/basic_tests/image_accessor_readsampler.cpp index a15e55f2087cd..5c8a4247b7cbc 100644 --- a/sycl/test/basic_tests/image_accessor_readsampler.cpp +++ b/sycl/test/basic_tests/image_accessor_readsampler.cpp @@ -1,4 +1,4 @@ -// UNSUPPORTED: cuda +// UNSUPPORTED: cuda, level_zero // CUDA cannot support SYCL 1.2.1 images. // // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out