10
10
#pragma once
11
11
12
12
#include < ATen/cuda/tunable/Tunable.h>
13
+ #include < ATen/cuda/tunable/StreamTimer.h>
13
14
#include < ATen/cuda/Sleep.h>
14
15
#include < c10/cuda/CUDACachingAllocator.h>
15
16
@@ -38,7 +39,57 @@ class Callable {
38
39
}
39
40
};
40
41
41
- template <typename ParamsT, typename TimerT>
42
namespace {

/** Running min/max/mean/variance accumulator using Welford's online
 *  algorithm (numerically stable single-pass variance).
 *  See: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
 */
class Stats {
 public:
  Stats() {
    _n = 0UL;
    _mean = 0.0;
    _M2 = 0.0;
    _sum = 0.0;
    _min = 0.0;
    _max = 0.0;
  }

  // Fold one observation x into the running statistics.
  void sample_value(const double x) {
    double delta = 0;
    _sum = _sum + x;
    if (0UL == _n) {
      _min = x;
      _max = x;
    }
    else {
      _min = _min < x ? _min : x;
      _max = _max > x ? _max : x;
    }
    _n = _n + 1UL;
    delta = x - _mean;
    _mean = _mean + delta / _n;
    _M2 = _M2 + delta * (x - _mean);
  }

  // Sample variance (Bessel-corrected, divides by n-1).
  // Guard for fewer than two samples: the unguarded expression _M2/(_n-1)
  // divides by zero when _n == 1 and underflows the unsigned _n when
  // _n == 0 (yielding a huge divisor). Zero is the conventional answer
  // when variance is undefined.
  double variance() const {
    if (_n < 2UL) {
      return 0.0;
    }
    return _M2 / (_n - 1);
  }

  // Sample standard deviation; 0.0 for fewer than two samples.
  double stddev() const {
    return std::sqrt(variance());
  }

  unsigned long _n;  // number of samples folded in so far
  double _mean;      // running mean
  double _M2;        // sum of squared deviations from the running mean
  double _sum;       // plain sum of all samples
  double _min;       // smallest sample seen (0.0 until first sample)
  double _max;       // largest sample seen (0.0 until first sample)
};

} // anonymous namespace
91
+
92
+ template <typename ParamsT>
42
93
class TunableOp {
43
94
public:
44
95
TunableOp () = default ;
@@ -99,10 +150,17 @@ class TunableOp {
99
150
}
100
151
}
101
152
102
- static double Profile (Callable<ParamsT> *op, const std::vector<ParamsT*> ¶m, size_t num_iter, size_t &offset) {
153
+ static double ProfileSimple (Callable<ParamsT> *op, const std::vector<ParamsT*> ¶m, size_t num_iter, size_t &offset) {
103
154
TuningContext* ctx = getTuningContext ();
104
155
bool do_flush = ctx->IsICacheFlushEnabled ();
105
- TimerT timer{};
156
+ StreamTimerNoSync timer{};
157
+
158
+ // Small Mandatory Warmup
159
+ // Reduces outliers
160
+ for (size_t i = 0 ; i < 2 ; i++) {
161
+ TORCH_CHECK (op->Call (param[(i+offset++)%param.size ()]) == OK);
162
+ }
163
+
106
164
timer.Start ();
107
165
for (size_t i = 0 ; i < num_iter; i++) {
108
166
if (do_flush) {
@@ -114,6 +172,32 @@ class TunableOp {
114
172
return timer.Duration () / num_iter;
115
173
}
116
174
175
+ static Stats ProfileStats (Callable<ParamsT> *op, const std::vector<ParamsT*> ¶m, size_t num_iter, size_t &offset) {
176
+ TuningContext* ctx = getTuningContext ();
177
+ bool do_flush = ctx->IsICacheFlushEnabled ();
178
+ std::vector<StreamTimerNoSync> timer (num_iter);
179
+
180
+ // Small Mandatory Warmup
181
+ // Reduces outliers
182
+ for (size_t i = 0 ; i < 2 ; i++) {
183
+ TORCH_CHECK (op->Call (param[(i+offset++)%param.size ()]) == OK);
184
+ }
185
+
186
+ for (size_t i = 0 ; i < num_iter; i++) {
187
+ timer[i].Start ();
188
+ TORCH_CHECK (op->Call (param[(i+offset++)%param.size ()]) == OK);
189
+ timer[i].End ();
190
+ if (do_flush) {
191
+ at::cuda::flush_icache ();
192
+ }
193
+ }
194
+ Stats s;
195
+ for (size_t i = 0 ; i < num_iter; i++) {
196
+ s.sample_value (timer[i].Duration ());
197
+ }
198
+ return s;
199
+ }
200
+
117
201
protected:
118
202
virtual ResultEntry FindFastest (const ParamsT* params) {
119
203
TuningContext* ctx = getTuningContext ();
@@ -183,14 +267,25 @@ class TunableOp {
183
267
}
184
268
185
269
// collect a small profile
186
- constexpr const int approx_num_iter = 3 ;
187
- auto approx_duration = Profile (candidate, reusable_params, approx_num_iter, offset);
270
+ int approx_num_iter = 3 ;
271
+ auto s = ProfileStats (candidate, reusable_params, approx_num_iter, offset);
272
+ double approx_duration = s._mean ;
188
273
// bail if too slow
189
- if (approx_duration > 2 * min_duration_ms) {
274
+ if (approx_duration > 1.5 * min_duration_ms) {
190
275
TUNABLE_LOG3 (" ├──skip slow instance id=" , i, " , " , op_sig, ' (' , params_sig, " ) " , op_names_[i]);
191
276
continue ;
192
277
}
193
278
279
+ // 2nd phase skip, more aggressive
280
+ approx_num_iter = 10 ;
281
+ s = ProfileStats (candidate, reusable_params, approx_num_iter, offset);
282
+ approx_duration = s._mean ;
283
+ // bail if too slow
284
+ if (approx_duration > 1.15 * min_duration_ms) {
285
+ TUNABLE_LOG3 (" ├──2nd skip slow instance id=" , i, " , " , op_sig, ' (' , params_sig, " ) " , op_names_[i]);
286
+ continue ;
287
+ }
288
+
194
289
// for warmup does user set max duration, max iters, or both?
195
290
// warmup is allowed to be skipped by setting either iterations or duration to 0
196
291
double max_warmup_duration = ctx->GetMaxWarmupDurationMs ();
@@ -237,12 +332,27 @@ class TunableOp {
237
332
" instance id=" , i, " , " , op_sig, " (" , params_sig, " ) " , op_names_[i]);
238
333
TUNABLE_LOG3 (" ├──offset at " , offset);
239
334
WarmUp (candidate, reusable_params, warmup_iter, offset);
240
- auto duration_ms = Profile (candidate, reusable_params, tuning_iter, offset);
241
- if (duration_ms < min_duration_ms) {
242
- TUNABLE_LOG3 (" ├──found better instance id=" , i, " . " , duration_ms, " ms. " , op_names_[i]);
243
- min_duration_ms = duration_ms;
335
+ s = ProfileStats (candidate, reusable_params, tuning_iter, offset);
336
+ auto s_stddev = s.stddev ();
337
+ // Assume normal distribution.
338
+ // Solution with smallest mean + 2*sigma will be a better solution?
339
+ // if ((s._mean + 2*s_stddev) < (min_duration_ms + 2*min_stddev_ms)) {
340
+ if (s._mean < min_duration_ms) {
341
+ TUNABLE_LOG3 (" ├──found better instance id=" , i, " . " , s._mean , " ms. " , op_names_[i],
342
+ " min " , s._min ,
343
+ " max " , s._max ,
344
+ " mean " , s._mean ,
345
+ " std " , s_stddev);
346
+ min_duration_ms = s._mean ;
244
347
id_name = op_names_[i];
245
348
}
349
+ else {
350
+ TUNABLE_LOG3 (" ├──found slower instance id=" , i, " . " , s._mean , " ms. " , op_names_[i],
351
+ " min " , s._min ,
352
+ " max " , s._max ,
353
+ " mean " , s._mean ,
354
+ " std " , s_stddev);
355
+ }
246
356
}
247
357
248
358
for (size_t i = 0 ; i < reusable_params.size (); i++) {
0 commit comments