Skip to content

Commit 7af800e

Browse files
[SYCL] Fix flaky failure of concurrent cache eviction UT (#16522)
Regression after: #16289 fixes #16515 **Problem** The exception is thrown when one process tries to calculate cache size and another process simultaneously inserted/removed any item from the cache. This might cause std::filesystem::recursive_directory iterator to throw std::filesystem_error. **Solution** We catch the exception and just skip over to the next item. This means that while calculating the size of cache we might not consider the size of any item added/removed from cache in the meanwhile. We calculate the cache size only once (for all processes/threads using the same cache) and that too the first-time persistent cache is used by any process. So, this race is rather rare and just ignoring the exception would work.
1 parent 69da088 commit 7af800e

File tree

4 files changed

+61
-15
lines changed

4 files changed

+61
-15
lines changed

sycl/include/sycl/detail/os_util.hpp

+3-2
Original file line numberDiff line numberDiff line change
@@ -95,15 +95,16 @@ class __SYCL_EXPORT OSUtil {
9595
// exporting them as ABI. They are only used in persistent cache
9696
// implementation and should not be exposed to the end users.
9797
// Get size of directory in bytes.
98-
size_t getDirectorySize(const std::string &Path);
98+
size_t getDirectorySize(const std::string &Path, bool ignoreErrors = false);
9999

100100
// Get size of file in bytes.
101101
size_t getFileSize(const std::string &Path);
102102

103103
// Function to recursively iterate over the directory and execute
104104
// 'Func' on each regular file.
105105
void fileTreeWalk(const std::string Path,
106-
std::function<void(const std::string)> Func);
106+
std::function<void(const std::string)> Func,
107+
bool ignoreErrors = false);
107108

108109
} // namespace detail
109110
} // namespace _V1

sycl/source/detail/os_util.cpp

+24-12
Original file line numberDiff line numberDiff line change
@@ -248,33 +248,45 @@ size_t getFileSize(const std::string &Path) {
248248
// Function to recursively iterate over the directory and execute
249249
// 'Func' on each regular file.
250250
void fileTreeWalk(const std::string Path,
251-
std::function<void(const std::string)> Func) {
251+
std::function<void(const std::string)> Func,
252+
bool ignoreErrors) {
252253

253254
std::error_code EC;
254255
for (auto It = fs::recursive_directory_iterator(Path, EC);
255256
It != fs::recursive_directory_iterator(); It.increment(EC)) {
256257

257258
// Errors can happen if a file was removed/added during the iteration.
258-
if (EC)
259-
throw sycl::exception(
260-
make_error_code(errc::runtime),
261-
"Failed to do File Tree Walk. Ensure that the directory is not "
262-
"getting updated while FileTreeWalk is in progress.: " +
263-
Path + "\n" + EC.message());
264-
265-
if (fs::is_regular_file(It->path()))
266-
Func(It->path().string());
259+
if (EC) {
260+
if (ignoreErrors) {
261+
EC.clear();
262+
continue;
263+
} else
264+
throw sycl::exception(
265+
make_error_code(errc::runtime),
266+
"Failed to do File Tree Walk. Ensure that the directory is not "
267+
"getting updated while FileTreeWalk is in progress.: " +
268+
Path + "\n" + EC.message());
269+
}
270+
271+
try {
272+
if (fs::is_regular_file(It->path()))
273+
Func(It->path().string());
274+
} catch (...) {
275+
// Ignore errors if ignoreErrors is set to true.
276+
if (!ignoreErrors)
277+
throw;
278+
}
267279
}
268280
}
269281

270282
// Get size of a directory in bytes.
271-
size_t getDirectorySize(const std::string &Path) {
283+
size_t getDirectorySize(const std::string &Path, bool ignoreErrors) {
272284
size_t DirSizeVar = 0;
273285

274286
auto CollectFIleSize = [&DirSizeVar](const std::string Path) {
275287
DirSizeVar += getFileSize(Path);
276288
};
277-
fileTreeWalk(Path, CollectFIleSize);
289+
fileTreeWalk(Path, CollectFIleSize, ignoreErrors);
278290

279291
return DirSizeVar;
280292
}

sycl/source/detail/persistent_device_code_cache.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,7 @@ void PersistentDeviceCodeCache::repopulateCacheSizeFile(
241241
// Calculate the size of the cache directory.
242242
// During directory size calculation, do not add anything
243243
// in the cache. Otherwise, we'll get a std::fs_error.
244-
size_t CacheSize = getDirectorySize(CacheRoot);
244+
size_t CacheSize = getDirectorySize(CacheRoot, /*Ignore Error*/ true);
245245

246246
std::ofstream FileStream{CacheSizeFile};
247247
FileStream << CacheSize;

sycl/unittests/kernel-and-program/PersistentDeviceCodeCache.cpp

+33
Original file line numberDiff line numberDiff line change
@@ -653,6 +653,39 @@ TEST_P(PersistentDeviceCodeCache, ConcurentReadWriteCacheEviction) {
653653
ConcurentReadWriteCache(2, 100);
654654
}
655655

656+
// Unit test for ensuring that os_utils::getDirectorySize is thread-safe.
657+
TEST_P(PersistentDeviceCodeCache, ConcurentDirectorySizeCalculation) {
658+
// Cleanup the cache directory.
659+
std::string CacheRoot = detail::PersistentDeviceCodeCache::getRootDir();
660+
ASSERT_NO_ERROR(llvm::sys::fs::remove_directories(CacheRoot));
661+
ASSERT_NO_ERROR(llvm::sys::fs::create_directories(CacheRoot));
662+
663+
// Spawn multiple threads to calculate the size of the directory concurrently
664+
// and adding/removing file simultaneously.
665+
constexpr size_t ThreadCount = 50;
666+
Barrier b(ThreadCount);
667+
{
668+
auto testLambda = [&](std::size_t threadId) {
669+
b.wait();
670+
// Create a file named: test_file_<thread_id>.txt
671+
std::string FileName =
672+
CacheRoot + "/test_file_" + std::to_string(threadId) + ".txt";
673+
std::ofstream FileStream(FileName,
674+
std::ofstream::out | std::ofstream::trunc);
675+
FileStream << "Test file for thread: " << threadId;
676+
FileStream.close();
677+
678+
// Calculate the size of the directory.
679+
[[maybe_unused]] auto i = detail::getDirectorySize(CacheRoot, true);
680+
681+
// Remove the created file.
682+
ASSERT_NO_ERROR(llvm::sys::fs::remove(FileName));
683+
};
684+
685+
ThreadPool MPool(ThreadCount, testLambda);
686+
}
687+
}
688+
656689
INSTANTIATE_TEST_SUITE_P(PersistentDeviceCodeCacheImpl,
657690
PersistentDeviceCodeCache,
658691
::testing::Values(SYCL_DEVICE_BINARY_TYPE_SPIRV,

0 commit comments

Comments
 (0)