Skip to content

Commit b000db8

Browse files
authored
[SYCL][L0] Set minimum pooled USM allocation size on device to 512 (intel#5635)
1 parent a836c87 commit b000db8

File tree

2 files changed

+29
-64
lines changed

2 files changed

+29
-64
lines changed

sycl/doc/EnvironmentVariables.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ variables in production code.</span>
140140
| Environment variable | Values | Description |
141141
| -------------------- | ------ | ----------- |
142142
| `SYCL_PI_LEVEL_ZERO_MAX_COMMAND_LIST_CACHE` | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availabilty and workload demand. |
143-
| `SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR` | EnableBuffers, MaxPoolSize [, MemType, MaxPoolableSize, Capacity, SlabMinSize]... | EnableBuffers enables pooling for SYCL buffers, default false. MaxPoolSize is the maximum size of the pool, default 0. MemType is host, device or shared. Other parameters are values specified as positive integers with optional K, M or G suffix. MaxPoolableSize is the maximum allocation size that may be pooled, default 0 for host and shared, 32KB for device. Capacity is the number of allocations in each size range freed by the program but retained in the pool for reallocation, default 0. Size ranges follow this pattern: 64, 96, 128, 192, and so on, i.e., powers of 2, with one range in between. SlabMinSize is the minimum allocation size, 64KB for host and device, 2MB for shared. |
143+
| `SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR` | [EnableBuffers][;[MaxPoolSize][;[host\|device\|shared:][MaxPoolableSize][,[Capacity][,SlabMinSize]]]...] | EnableBuffers enables pooling for SYCL buffers, default 0, set to 1 to enable. MaxPoolSize is the maximum size of the pool, default 0. MemType is host, device or shared. Other parameters are values specified as positive integers with optional K, M or G suffix. MaxPoolableSize is the maximum allocation size that may be pooled, default 0 for host and shared, 32KB for device. Capacity is the number of allocations in each size range freed by the program but retained in the pool for reallocation, default 0. Size ranges follow this pattern: 64, 96, 128, 192, and so on, i.e., powers of 2, with one range in between. SlabMinSize is the minimum allocation size, 64KB for host and device, 2MB for shared. Example: SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=1;32M;host:1M,4,64K;device:1M,4,64K;shared:0,0,2M|
144144
| `SYCL_PI_LEVEL_ZERO_BATCH_SIZE` | Integer | Sets a preferred number of compute commands to batch into a command list before executing the command list. A value of 0 causes the batch size to be adjusted dynamically. A value greater than 0 specifies fixed size batching, with the batch size set to the specified value. The default is 0. |
145145
| `SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE` | Integer | Sets a preferred number of copy commands to batch into a command list before executing the command list. A value of 0 causes the batch size to be adjusted dynamically. A value greater than 0 specifies fixed size batching, with the batch size set to the specified value. The default is 0. |
146146
| `SYCL_PI_LEVEL_ZERO_FILTER_EVENT_WAIT_LIST` | Integer | When set to 0, disables filtering of signaled events from wait lists when using the Level Zero backend. The default is 1. |

sycl/plugins/level_zero/usm_allocator.cpp

+28-63
Original file line numberDiff line numberDiff line change
@@ -43,58 +43,19 @@
4343
// allocations/deallocations.
4444

4545
namespace settings {
46+
47+
// Buckets for Host use a minimum of the cache line size of 64 bytes.
48+
// This prevents two separate allocations residing in the same cache line.
49+
// Buckets for Device and Shared allocations will use starting size of 512.
50+
// This is because memory compression on newer GPUs makes the
51+
// minimum granularity 512 bytes instead of 64.
52+
static constexpr size_t MinBucketSize[3] = {64, 512, 512};
53+
4654
// The largest size which is allocated via the allocator.
4755
// Allocations with size > CutOff bypass the USM allocator and
4856
// go directly to the runtime.
4957
static constexpr size_t CutOff = (size_t)1 << 31; // 2GB
5058

51-
// Unfortunately we cannot deduce the size of the array, so every change
52-
// to the number of buckets should be reflected here.
53-
using BucketsArrayType = std::array<size_t, 53>;
54-
55-
// Generates a list of bucket sizes used by the allocator.
56-
static constexpr BucketsArrayType generateBucketSizes() {
57-
58-
// In order to make bucket sizes constexpr simply write
59-
// them all. There are some restrictions that doesn't
60-
// allow to write this in a nicer way.
61-
62-
// Simple helper to compute power of 2
63-
#define P(n) (1ULL << n)
64-
65-
BucketsArrayType Sizes = {64, 96,
66-
128, 192,
67-
P(8), P(8) + P(7),
68-
P(9), P(9) + P(8),
69-
P(10), P(10) + P(9),
70-
P(11), P(11) + P(10),
71-
P(12), P(12) + P(11),
72-
P(13), P(13) + P(12),
73-
P(14), P(14) + P(13),
74-
P(15), P(15) + P(14),
75-
P(16), P(16) + P(15),
76-
P(17), P(17) + P(16),
77-
P(18), P(18) + P(17),
78-
P(19), P(19) + P(18),
79-
P(20), P(20) + P(19),
80-
P(21), P(21) + P(20),
81-
P(22), P(22) + P(21),
82-
P(23), P(23) + P(22),
83-
P(24), P(24) + P(23),
84-
P(25), P(25) + P(24),
85-
P(26), P(26) + P(25),
86-
P(27), P(27) + P(26),
87-
P(28), P(28) + P(27),
88-
P(29), P(29) + P(28),
89-
P(30), P(30) + P(29),
90-
CutOff};
91-
#undef P
92-
93-
return Sizes;
94-
}
95-
96-
static constexpr BucketsArrayType BucketSizes = generateBucketSizes();
97-
9859
// Protects the capacity checking of the pool.
9960
static sycl::detail::SpinLock PoolLock;
10061

@@ -117,14 +78,14 @@ static class SetLimits {
11778
size_t CurPoolSize = 0;
11879
size_t CurPoolSizes[3] = {0, 0, 0};
11980

120-
bool EnableBuffers = true;
81+
size_t EnableBuffers = 1;
12182

12283
// Whether to print pool usage statistics
12384
int PoolTrace = 0;
12485

12586
SetLimits() {
12687
// Parse optional parameters of this form:
127-
// SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=[EnableBuffers][;MaxPoolSize][;memtypelimits]...
88+
// SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=[EnableBuffers][;[MaxPoolSize][;memtypelimits]...]
12889
// memtypelimits: [<memtype>:]<limits>
12990
// memtype: host|device|shared
13091
// limits: [MaxPoolableSize][,[Capacity][,SlabMinSize]]
@@ -135,7 +96,7 @@ static class SetLimits {
13596
// Duplicate specifications will result in the right-most taking effect.
13697
//
13798
// EnableBuffers: Apply chunking/pooling to SYCL buffers.
138-
// Default true.
99+
// Default 1.
139100
// MaxPoolSize: Limit on overall unfreed memory.
140101
// Default 16MB.
141102
// MaxPoolableSize: Maximum allocation size subject to chunking/pooling.
@@ -148,7 +109,7 @@ static class SetLimits {
148109
// Example of usage:
149110
// SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=1;32M;host:1M,4,64K;device:1M,4,64K;shared:0,0,2M
150111

151-
auto GetValue = [](std::string &Param, size_t Length) {
112+
auto GetValue = [=](std::string &Param, size_t Length, size_t &Setting) {
152113
size_t Multiplier = 1;
153114
if (tolower(Param[Length - 1]) == 'k') {
154115
Length--;
@@ -163,8 +124,8 @@ static class SetLimits {
163124
Multiplier = 1024 * 1024 * 1024;
164125
}
165126
std::string TheNumber = Param.substr(0, Length);
166-
assert(TheNumber.find_first_not_of("0123456789") == std::string::npos);
167-
return std::stoi(TheNumber) * Multiplier;
127+
if (TheNumber.find_first_not_of("0123456789") == std::string::npos)
128+
Setting = std::stoi(TheNumber) * Multiplier;
168129
};
169130

170131
auto ParamParser = [=](std::string &Params, size_t &Setting,
@@ -177,13 +138,13 @@ static class SetLimits {
177138
size_t Pos = Params.find(',');
178139
if (Pos != std::string::npos) {
179140
if (Pos > 0) {
180-
Setting = GetValue(Params, Pos);
141+
GetValue(Params, Pos, Setting);
181142
ParamWasSet = true;
182143
}
183144
Params.erase(0, Pos + 1);
184145
More = true;
185146
} else {
186-
Setting = GetValue(Params, Params.size());
147+
GetValue(Params, Params.size(), Setting);
187148
ParamWasSet = true;
188149
More = false;
189150
}
@@ -244,13 +205,13 @@ static class SetLimits {
244205
size_t Pos = Params.find(';');
245206
if (Pos != std::string::npos) {
246207
if (Pos > 0) {
247-
EnableBuffers = GetValue(Params, Pos);
208+
GetValue(Params, Pos, EnableBuffers);
248209
}
249210
Params.erase(0, Pos + 1);
250211
size_t Pos = Params.find(';');
251212
if (Pos != std::string::npos) {
252213
if (Pos > 0) {
253-
MaxPoolSize = GetValue(Params, Pos);
214+
GetValue(Params, Pos, MaxPoolSize);
254215
}
255216
Params.erase(0, Pos + 1);
256217
do {
@@ -269,10 +230,10 @@ static class SetLimits {
269230
}
270231
} while (true);
271232
} else {
272-
MaxPoolSize = GetValue(Params, Params.size());
233+
GetValue(Params, Params.size(), MaxPoolSize);
273234
}
274235
} else {
275-
EnableBuffers = GetValue(Params, Params.size());
236+
GetValue(Params, Params.size(), EnableBuffers);
276237
}
277238
}
278239

@@ -514,11 +475,15 @@ class USMAllocContext::USMAllocImpl {
514475
USMAllocImpl(std::unique_ptr<SystemMemory> SystemMemHandle)
515476
: MemHandle{std::move(SystemMemHandle)} {
516477

517-
Buckets.reserve(BucketSizes.size());
518-
519-
for (auto &&Size : BucketSizes) {
520-
Buckets.emplace_back(std::make_unique<Bucket>(Size, *this));
478+
// Generate buckets sized such as: 64, 96, 128, 192, ..., CutOff.
479+
// Powers of 2 and the value halfway between the powers of 2.
480+
auto Size1 = MinBucketSize[MemHandle->getMemType()];
481+
auto Size2 = Size1 + Size1 / 2;
482+
for (; Size2 < CutOff; Size1 *= 2, Size2 *= 2) {
483+
Buckets.push_back(std::make_unique<Bucket>(Size1, *this));
484+
Buckets.push_back(std::make_unique<Bucket>(Size2, *this));
521485
}
486+
Buckets.push_back(std::make_unique<Bucket>(CutOff, *this));
522487
}
523488

524489
void *allocate(size_t Size, size_t Alignment, bool &FromPool);

0 commit comments

Comments
 (0)