 #pragma once
 
 #include <ATen/cuda/tunable/Tunable.h>
+#include <ATen/cuda/tunable/StreamTimer.h>
 #include <ATen/cuda/Sleep.h>
 #include <c10/cuda/CUDACachingAllocator.h>
@@ -35,7 +36,57 @@ class Callable {
   }
 };
 
-template <typename ParamsT, typename TimerT>
+namespace {
+
+/** http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance */
+
+class Stats {
+ public:
+  Stats() {
+    _n = 0UL;
+    _mean = 0.0;
+    _M2 = 0.0;
+    _sum = 0.0;
+    _min = 0.0;
+    _max = 0.0;
+  }
+
+  void sample_value(const double x) {
+    double delta = 0;
+    _sum = _sum + x;
+    if (0UL == _n) {
+      _min = x;
+      _max = x;
+    }
+    else {
+      _min = _min < x ? _min : x;
+      _max = _max > x ? _max : x;
+    }
+    _n = _n + 1UL;
+    delta = x - _mean;
+    _mean = _mean + delta/_n;
+    _M2 = _M2 + delta * (x - _mean);
+  }
+
+  double variance() const {
+    return _M2/(_n-1);
+  }
+
+  double stddev() const {
+    return std::sqrt(variance());
+  }
+
+  unsigned long _n;
+  double _mean;
+  double _M2;
+  double _sum;
+  double _min;
+  double _max;
+};
+
+} // anonymous namespace
+
+template <typename ParamsT>
 class TunableOp {
  public:
   virtual ~TunableOp() = default;
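The new `Stats` helper implements Welford's online algorithm (the Wikipedia link in the patch): each `sample_value` call updates the running mean and the running sum of squared deviations `_M2` in a single pass, and `variance()` then applies Bessel's correction (`_M2/(_n-1)`, so it is only meaningful once at least two samples have been recorded). Dropping the `TimerT` template parameter pairs with the hard-coded `StreamTimerNoSync` further down. As a sanity check, here is a minimal standalone sketch (not part of the patch) showing that the one-pass update agrees with the textbook two-pass sample variance:

```cpp
#include <cassert>
#include <cmath>
#include <vector>

int main() {
  std::vector<double> xs{1.0, 2.0, 4.0, 8.0};

  // One pass, as in the patch: mean += delta/n; M2 += delta*(x - mean).
  double n = 0, mean = 0, M2 = 0;
  for (double x : xs) {
    n += 1;
    double delta = x - mean;
    mean += delta / n;
    M2 += delta * (x - mean);
  }
  double welford_var = M2 / (n - 1);

  // Two passes: compute the mean first, then the squared deviations.
  double m = 0;
  for (double x : xs) m += x;
  m /= xs.size();
  double ss = 0;
  for (double x : xs) ss += (x - m) * (x - m);
  double naive_var = ss / (xs.size() - 1);

  assert(std::abs(welford_var - naive_var) < 1e-12);
  return 0;
}
```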
@@ -100,10 +151,17 @@ class TunableOp {
     }
   }
 
-  static double Profile(Callable<ParamsT> *op, const std::vector<ParamsT*> &param, size_t num_iter, size_t &offset) {
+  static double ProfileSimple(Callable<ParamsT> *op, const std::vector<ParamsT*> &param, size_t num_iter, size_t &offset) {
     TuningContext* ctx = getTuningContext();
     bool do_flush = ctx->IsICacheFlushEnabled();
-    TimerT timer{};
+    StreamTimerNoSync timer{};
+
+    // Small Mandatory Warmup
+    // Reduces outliers
+    for (size_t i = 0; i < 2; i++) {
+      TORCH_CHECK(op->Call(param[(i+offset++)%param.size()]) == OK);
+    }
+
     timer.Start();
     for (size_t i = 0; i < num_iter; i++) {
       if (do_flush) {
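`Profile` becomes `ProfileSimple` and now runs two untimed warmup calls before starting the timer, absorbing one-time costs (lazy kernel loading, cold caches) that would otherwise inflate the averaged duration. Note that `(i+offset++)%param.size()` advances `offset` as a side effect, so consecutive profiling passes rotate through the pre-built parameter pool instead of re-measuring the same buffers. A small standalone demo (the pool size here is a made-up stand-in for `param.size()`) of how that rotation behaves:

```cpp
#include <cstddef>
#include <iostream>

int main() {
  size_t offset = 0;
  const size_t pool = 5;  // stand-in for param.size()
  for (int pass = 0; pass < 2; pass++) {
    for (size_t i = 0; i < 3; i++) {
      // Both i and offset advance, so indices stride by two within a
      // pass, and offset carries the position over to the next pass.
      std::cout << (i + offset++) % pool << ' ';
    }
    std::cout << '\n';  // prints "0 2 4" then "3 0 2"
  }
  return 0;
}
```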
@@ -115,6 +173,32 @@ class TunableOp {
     return timer.Duration() / num_iter;
   }
 
+  static Stats ProfileStats(Callable<ParamsT> *op, const std::vector<ParamsT*> &param, size_t num_iter, size_t &offset) {
+    TuningContext* ctx = getTuningContext();
+    bool do_flush = ctx->IsICacheFlushEnabled();
+    std::vector<StreamTimerNoSync> timer(num_iter);
+
+    // Small Mandatory Warmup
+    // Reduces outliers
+    for (size_t i = 0; i < 2; i++) {
+      TORCH_CHECK(op->Call(param[(i+offset++)%param.size()]) == OK);
+    }
+
+    for (size_t i = 0; i < num_iter; i++) {
+      timer[i].Start();
+      TORCH_CHECK(op->Call(param[(i+offset++)%param.size()]) == OK);
+      timer[i].End();
+      if (do_flush) {
+        at::cuda::flush_icache();
+      }
+    }
+    Stats s;
+    for (size_t i = 0; i < num_iter; i++) {
+      s.sample_value(timer[i].Duration());
+    }
+    return s;
+  }
+
 protected:
   virtual ResultEntry FindFastest(const ParamsT* params) {
     TuningContext* ctx = getTuningContext();
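Where `ProfileSimple` times the whole loop with one timer and divides by `num_iter`, `ProfileStats` gives every iteration its own `StreamTimerNoSync` and aggregates afterwards, so callers get min/max/mean/stddev rather than a single average. Also worth noting: the icache flush happens after `timer[i].End()`, so the flush cost is excluded from each sample, whereas in `ProfileSimple` it sits inside the timed region. A hedged sketch of what the per-iteration aggregation exposes, relying only on the `Stats` class added above (the durations are made-up stand-ins for `timer[i].Duration()`):

```cpp
#include <initializer_list>

// Relies on the Stats class from this patch.
double demo_spread() {
  Stats s;
  for (double duration_ms : {0.52, 0.49, 0.51, 0.98 /* outlier */, 0.50}) {
    s.sample_value(duration_ms);
  }
  // s._mean == 0.60 ms, s._max == 0.98 ms, s.stddev() ~= 0.21 ms:
  // the spread a single averaged measurement would have hidden.
  return s.stddev();
}
```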
@@ -184,14 +268,25 @@ class TunableOp {
       }
 
       // collect a small profile
-      constexpr const int approx_num_iter = 3;
-      auto approx_duration = Profile(candidate, reusable_params, approx_num_iter, offset);
+      int approx_num_iter = 3;
+      auto s = ProfileStats(candidate, reusable_params, approx_num_iter, offset);
+      double approx_duration = s._mean;
       // bail if too slow
-      if (approx_duration > 2 * min_duration_ms) {
+      if (approx_duration > 1.5 * min_duration_ms) {
         TUNABLE_LOG3("├──skip slow instance id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]);
         continue;
       }
 
+      // 2nd phase skip, more aggressive
+      approx_num_iter = 10;
+      s = ProfileStats(candidate, reusable_params, approx_num_iter, offset);
+      approx_duration = s._mean;
+      // bail if too slow
+      if (approx_duration > 1.15 * min_duration_ms) {
+        TUNABLE_LOG3("├──2nd skip slow instance id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]);
+        continue;
+      }
+
       // for warmup does user set max duration, max iters, or both?
       // warmup is allowed to be skipped by setting either iterations or duration to 0
       double max_warmup_duration = ctx->GetMaxWarmupDurationMs();
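The pre-filter is now two-phased: a cheap 3-iteration estimate discards candidates whose mean is more than 1.5x the current best (previously a single check at 2x), and survivors get a 10-iteration estimate with a tighter 1.15x cutoff, so only near-competitive candidates reach the expensive warmup-plus-tuning loop. With `min_duration_ms = 1.0`, for example, a candidate averaging 1.6 ms over 3 iterations is dropped immediately, while one averaging 1.3 ms survives phase 1 but is dropped in phase 2 unless its 10-iteration mean falls to 1.15 ms or below. A hypothetical restatement of the predicate (the helper name is mine; the thresholds are the patch's):

```cpp
// Coarse 3-iteration mean must stay under 1.5x the best time, then a
// steadier 10-iteration mean under 1.15x, before full tuning runs.
bool should_skip(double approx_ms, double best_ms, bool second_phase) {
  const double cutoff = second_phase ? 1.15 : 1.5;
  return approx_ms > cutoff * best_ms;
}
```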
@@ -238,12 +333,27 @@ class TunableOp {
           "instance id=", i, ", ", op_sig, "(", params_sig, ") ", op_names_[i]);
       TUNABLE_LOG3("├──offset at ", offset);
       WarmUp(candidate, reusable_params, warmup_iter, offset);
-      auto duration_ms = Profile(candidate, reusable_params, tuning_iter, offset);
-      if (duration_ms < min_duration_ms) {
-        TUNABLE_LOG3("├──found better instance id=", i, ". ", duration_ms, "ms. ", op_names_[i]);
-        min_duration_ms = duration_ms;
+      s = ProfileStats(candidate, reusable_params, tuning_iter, offset);
+      auto s_stddev = s.stddev();
+      // Assume normal distribution.
+      // Solution with smallest mean + 2*sigma will be a better solution?
+      // if ((s._mean + 2*s_stddev) < (min_duration_ms + 2*min_stddev_ms)) {
+      if (s._mean < min_duration_ms) {
+        TUNABLE_LOG3("├──found better instance id=", i, ". ", s._mean, "ms. ", op_names_[i],
+            " min ", s._min,
+            " max ", s._max,
+            " mean ", s._mean,
+            " std ", s_stddev);
+        min_duration_ms = s._mean;
         id_name = op_names_[i];
       }
+      else {
+        TUNABLE_LOG3("├──found slower instance id=", i, ". ", s._mean, "ms. ", op_names_[i],
+            " min ", s._min,
+            " max ", s._max,
+            " mean ", s._mean,
+            " std ", s_stddev);
+      }
     }
 
     for (size_t i = 0; i < reusable_params.size(); i++) {
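Selection still compares plain means; the commented-out line records the alternative the author considered, ranking candidates by `mean + 2*sigma`, which under the stated normality assumption prefers the kernel whose roughly 97.7th-percentile latency is lowest and so penalizes high-variance candidates. The new else-branch logging at least makes min/max/mean/std visible for the slower candidates too. A standalone sketch of how that alternative criterion would flip a decision (the helper and the numbers are illustrative, not from the patch):

```cpp
// Under a normality assumption, mean + 2*sigma approximates the
// ~97.7th-percentile latency; minimizing it penalizes noisy kernels.
double upper_bound_score(double mean_ms, double stddev_ms) {
  return mean_ms + 2.0 * stddev_ms;
}

int main() {
  // Candidate A: faster on average but noisy; B: slightly slower, stable.
  double a = upper_bound_score(0.50, 0.20);  // 0.90
  double b = upper_bound_score(0.55, 0.02);  // 0.59
  // The mean-only test picks A; the upper-bound criterion picks B.
  return b < a ? 0 : 1;
}
```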