|
10 | 10 | "bento/extensions/theme/main.css": true
|
11 | 11 | },
|
12 | 12 | "kernelspec": {
|
13 |
| - "display_name": "accelerators", |
| 13 | + "display_name": "dper3_pytorch (cinder)", |
14 | 14 | "language": "python",
|
15 |
| - "name": "bento_kernel_accelerators", |
| 15 | + "name": "bento_kernel_dper3_pytorch_cinder", |
16 | 16 | "metadata": {
|
17 |
| - "kernel_name": "bento_kernel_accelerators", |
18 |
| - "nightly_builds": true, |
| 17 | + "kernel_name": "bento_kernel_dper3_pytorch_cinder", |
| 18 | + "nightly_builds": false, |
19 | 19 | "fbpkg_supported": true,
|
20 |
| - "cinder_runtime": false, |
| 20 | + "cinder_runtime": true, |
21 | 21 | "is_prebuilt": true
|
22 | 22 | }
|
23 | 23 | },
|
|
32 | 32 | "nbconvert_exporter": "python",
|
33 | 33 | "pygments_lexer": "ipython3"
|
34 | 34 | },
|
35 |
| - "last_server_session_id": "c6f6ab3c-9274-41e7-8592-b1b583442e00", |
36 |
| - "last_kernel_id": "fcbf3a69-76a4-4730-9b41-bcd0b24729ca", |
37 |
| - "last_base_url": "https://devgpu005.ftw6.facebook.com:8093/", |
38 |
| - "last_msg_id": "e28f842c-f32dde25c1b80ef7d423dfee_407", |
| 35 | + "last_server_session_id": "24a1a10c-29aa-4e2b-a11f-2b5108fc1e58", |
| 36 | + "last_kernel_id": "5f014373-151c-4ee8-8939-4daab994d202", |
| 37 | + "last_base_url": "https://devgpu005.ftw6.facebook.com:8091/", |
| 38 | + "last_msg_id": "687e81e8-4414f32c89cd026dd1ea3fd9_139", |
39 | 39 | "outputWidgetContext": {}
|
40 | 40 | },
|
41 | 41 | "nbformat": 4,
|
|
58 | 58 | {
|
59 | 59 | "cell_type": "code",
|
60 | 60 | "metadata": {
|
61 |
| - "originalKey": "7909785f-b9b4-41dd-82af-c144b879df39", |
| 61 | + "originalKey": "7db2accc-9fa4-4a1e-8142-d887f2947bcd", |
62 | 62 | "showInput": true,
|
63 | 63 | "customInput": null,
|
64 | 64 | "collapsed": false,
|
65 |
| - "requestMsgId": "7db2accc-9fa4-4a1e-8142-d887f2947bcd", |
| 65 | + "requestMsgId": "b5d8efce-0963-4074-bc9d-e8e1a78fd424", |
66 | 66 | "customOutput": null,
|
67 |
| - "executionStartTime": 1656395936225, |
68 |
| - "executionStopTime": 1656395937851 |
| 67 | + "executionStartTime": 1661189891682, |
| 68 | + "executionStopTime": 1661189891856 |
69 | 69 | },
|
70 | 70 | "source": [
|
71 | 71 | "import typing as t\n",
|
|
74 | 74 | "\n",
|
75 | 75 | "import torch\n",
|
76 | 76 | "import torchvision\n",
|
77 |
| - "from torch_tensorrt.fx.lower import lower_to_trt\n", |
| 77 | + "from torch_tensorrt.fx.lower import compile\n", |
78 | 78 | "from torch_tensorrt.fx.utils import LowerPrecision"
|
79 | 79 | ],
|
80 |
| - "execution_count": 4, |
| 80 | + "execution_count": 9, |
81 | 81 | "outputs": []
|
82 | 82 | },
|
83 | 83 | {
|
|
98 | 98 | {
|
99 | 99 | "cell_type": "code",
|
100 | 100 | "metadata": {
|
101 |
| - "originalKey": "a4455135-8633-4d2d-bdd3-6435a4a9f4dd", |
| 101 | + "originalKey": "2835fffa-cc50-479a-9080-c4f7002c0726", |
102 | 102 | "showInput": true,
|
103 | 103 | "customInput": null,
|
104 | 104 | "code_folding": [],
|
105 | 105 | "hidden_ranges": [],
|
106 | 106 | "collapsed": false,
|
107 |
| - "requestMsgId": "2835fffa-cc50-479a-9080-c4f7002c0726", |
| 107 | + "requestMsgId": "6ea72dbf-dbfe-451e-8613-15f87e34a1a5", |
108 | 108 | "customOutput": null,
|
109 |
| - "executionStartTime": 1656398717455, |
110 |
| - "executionStopTime": 1656398717662 |
| 109 | + "executionStartTime": 1661189260550, |
| 110 | + "executionStopTime": 1661189262039 |
111 | 111 | },
|
112 | 112 | "source": [
|
113 | 113 | "@dataclass\n",
|
|
159 | 159 | " f\"Accuracy: {self.accuracy_res} (rtol={self.conf.accuracy_rtol})\"\n",
|
160 | 160 | " )"
|
161 | 161 | ],
|
162 |
| - "execution_count": 22, |
163 |
| - "outputs": [] |
| 162 | + "execution_count": 2, |
| 163 | + "outputs": [ |
| 164 | + { |
| 165 | + "output_type": "stream", |
| 166 | + "name": "stderr", |
| 167 | + "text": [ |
| 168 | + "I0822 102740.872 _utils_internal.py:179] NCCL_DEBUG env var is set to None\n" |
| 169 | + ] |
| 170 | + }, |
| 171 | + { |
| 172 | + "output_type": "stream", |
| 173 | + "name": "stderr", |
| 174 | + "text": [ |
| 175 | + "I0822 102740.873 _utils_internal.py:188] NCCL_DEBUG is INFO from /etc/nccl.conf\n" |
| 176 | + ] |
| 177 | + } |
| 178 | + ] |
164 | 179 | },
|
165 | 180 | {
|
166 | 181 | "cell_type": "markdown",
|
167 | 182 | "metadata": {
|
168 | 183 | "originalKey": "3e462cf6-d282-402d-955b-a3ecb400bf0b",
|
169 |
| - "showInput": true, |
| 184 | + "showInput": false, |
170 | 185 | "customInput": null,
|
171 | 186 | "code_folding": [],
|
172 | 187 | "hidden_ranges": []
|
173 | 188 | },
|
174 | 189 | "source": [
|
175 | 190 | "Run FX path lowering and benchmark the given model according to the specified benchmark configuration. Prints the benchmark result for each configuration at the end of the run. `benchmark_torch_function` is the function that actually runs the given function for a fixed number of iterations and estimates the average time of a single call.\n",
|
176 |
| - "The FX path lowering and TensorRT engine creation is integrated into `low_to_trt()` API which is defined in `fx/lower.py` file.\n", |
| 191 | + "The FX path lowering and TensorRT engine creation are integrated into the `compile()` API, which is defined in the `fx/lower.py` file.\n", |
177 | 192 | "It is worth listing it here to show its usage. It takes the original module, the input, and the lowering settings, and runs the lowering workflow to turn the module into an executable TRT engine.\n",
|
178 | 193 | "```\n",
|
179 |
| - "def lower_to_trt(\n", |
| 194 | + "def compile(\n", |
180 | 195 | " module: nn.Module,\n",
|
181 | 196 | " input: ,\n",
|
182 | 197 | " max_batch_size: int = 2048,\n",
|
|
212 | 227 | {
|
213 | 228 | "cell_type": "code",
|
214 | 229 | "metadata": {
|
215 |
| - "originalKey": "91333212-7f6d-4bde-a248-44d485e83e5e", |
| 230 | + "originalKey": "3002935b-b95a-4a08-a57f-f7a35485af5b", |
216 | 231 | "showInput": true,
|
217 | 232 | "customInput": null,
|
218 | 233 | "code_folding": [],
|
219 | 234 | "hidden_ranges": [],
|
220 | 235 | "collapsed": false,
|
221 |
| - "requestMsgId": "3002935b-b95a-4a08-a57f-f7a35485af5b", |
| 236 | + "requestMsgId": "dc73f2d0-427b-4f71-bec1-b118cc5642d0", |
222 | 237 | "customOutput": null,
|
223 |
| - "executionStartTime": 1656397903207, |
224 |
| - "executionStopTime": 1656397964752 |
| 238 | + "executionStartTime": 1661189697773, |
| 239 | + "executionStopTime": 1661189753875 |
225 | 240 | },
|
226 | 241 | "source": [
|
227 |
| - "test_model = torchvision.models.resnet18(pretrained=True)\n", |
228 |
| - "input = [torch.rand(128, 3, 224, 224)] \n", |
229 |
| - "benchmark(test_model, input, 50, 128)\n", |
230 |
| - "\n", |
231 | 242 | "def benchmark_torch_function(iters: int, f, *args) -> float:\n",
|
232 | 243 | " \"\"\"Estimates the average time duration for a single inference call in second\n",
|
233 | 244 | "\n",
|
|
266 | 277 | " time = benchmark_torch_function(conf.batch_iter, lambda: module(*input))\n",
|
267 | 278 | " elif not conf.jit:\n",
|
268 | 279 | " # Run lowering eager mode benchmark\n",
|
269 |
| - " lowered_module = lower_to_trt(\n", |
| 280 | + " lowered_module = compile(\n", |
270 | 281 | " module,\n",
|
271 | 282 | " input,\n",
|
272 | 283 | " max_batch_size=conf.batch_size,\n",
|
|
279 | 290 | " result = Result(module=module, input=input, conf=conf, time_sec=time)\n",
|
280 | 291 | " return result\n",
|
281 | 292 | "\n",
|
| 293 | + "\n", |
282 | 294 | "@torch.inference_mode()\n",
|
283 | 295 | "def benchmark(\n",
|
284 | 296 | " model,\n",
|
|
315 | 327 | " ),\n",
|
316 | 328 | " ]\n",
|
317 | 329 | "\n",
|
318 |
| - " results = [\n", |
319 |
| - " run_configuration_benchmark(deepcopy(model), inputs, conf_)\n", |
320 |
| - " for conf_ in configurations\n", |
321 |
| - " ]\n", |
| 330 | + " results = [run_configuration_benchmark(deepcopy(model), inputs, conf_) for conf_ in configurations]\n", |
322 | 331 | "\n",
|
323 | 332 | " for res in results:\n",
|
324 |
| - " print(res.format())" |
| 333 | + " print(res.format())\n", |
| 334 | + "\n", |
| 335 | + "\n", |
| 336 | + "test_model = torchvision.models.resnet18(pretrained=True)\n", |
| 337 | + "input = [torch.rand(128, 3, 224, 224)]\n", |
| 338 | + "benchmark(test_model, input, 50, 128)" |
325 | 339 | ],
|
326 |
| - "execution_count": 21, |
| 340 | + "execution_count": 8, |
327 | 341 | "outputs": [
|
| 342 | + { |
| 343 | + "output_type": "stream", |
| 344 | + "name": "stderr", |
| 345 | + "text": [ |
| 346 | + "I0822 103458.189 manifold.py:1435] URL manifold://torchvision/tree/models/resnet18-f37072fd.pth was already cached in /home/wwei6/.torch/iopath_cache/manifold_cache/tree/models/resnet18-f37072fd.pth\n" |
| 347 | + ] |
| 348 | + }, |
328 | 349 | {
|
329 | 350 | "output_type": "stream",
|
330 | 351 | "name": "stdout",
|
|
339 | 360 | "== End benchmark iterations\n=== Running benchmark for: Configuration(batch_iter=50, batch_size=128, name='TRT FP32 Eager', trt=True, jit=False, fp16=False, accuracy_rtol=0.001) green\n"
|
340 | 361 | ]
|
341 | 362 | },
|
| 363 | + { |
| 364 | + "output_type": "stream", |
| 365 | + "name": "stderr", |
| 366 | + "text": [ |
| 367 | + "I0822 103501.297 pass_utils.py:166] == Log pass <function fuse_permute_matmul at 0x7f787a0e08b0> before/after graph to /tmp/tmpe_7p37fq\n" |
| 368 | + ] |
| 369 | + }, |
| 370 | + { |
| 371 | + "output_type": "stream", |
| 372 | + "name": "stderr", |
| 373 | + "text": [ |
| 374 | + "I0822 103501.390 pass_utils.py:166] == Log pass <function fuse_permute_linear at 0x7f787a0e0670> before/after graph to /tmp/tmpg_a347f0\n" |
| 375 | + ] |
| 376 | + }, |
| 377 | + { |
| 378 | + "output_type": "stream", |
| 379 | + "name": "stderr", |
| 380 | + "text": [ |
| 381 | + "I0822 103501.509 lower_pass_manager_builder.py:151] Now lowering submodule _run_on_acc_0\n" |
| 382 | + ] |
| 383 | + }, |
| 384 | + { |
| 385 | + "output_type": "stream", |
| 386 | + "name": "stderr", |
| 387 | + "text": [ |
| 388 | + "I0822 103501.511 lower.py:89] split_name='_run_on_acc_0' self.lower_setting.input_specs=[InputTensorSpec(shape=torch.Size([128, 3, 224, 224]), dtype=torch.float32, device=device(type='cuda', index=0), shape_ranges=[], has_batch_dim=True)]\n" |
| 389 | + ] |
| 390 | + }, |
342 | 391 | {
|
343 | 392 | "output_type": "stream",
|
344 | 393 | "name": "stdout",
|
345 | 394 | "text": [
|
346 |
| - "== Log pass <function fuse_permute_matmul at 0x7fbdfcc9f1f0> before/after graph to /tmp/tmpaayayg72\n== Log pass <function fuse_permute_linear at 0x7fbe36555f70> before/after graph to /tmp/tmpdw_pq71j\n\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float32, 'weight': torch.float32})\nacc_ops.batch_norm: ((), {'input': torch.float32, 'running_mean': torch.float32, 'running_var': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\nacc_ops.relu: ((), {'input': torch.float32})\nacc_ops.max_pool2d: ((), {'input': torch.float32})\nacc_ops.add: ((), {'input': torch.float32, 'other': torch.float32})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float32})\nacc_ops.flatten: ((), {'input': torch.float32})\nacc_ops.linear: ((), {'input': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n" |
| 395 | + "\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float32, 'weight': torch.float32})\nacc_ops.batch_norm: ((), {'input': torch.float32, 'running_mean': torch.float32, 'running_var': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\nacc_ops.relu: ((), {'input': torch.float32})\nacc_ops.max_pool2d: ((), {'input': torch.float32})\nacc_ops.add: ((), {'input': torch.float32, 'other': torch.float32})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float32})\nacc_ops.flatten: ((), {'input': torch.float32})\nacc_ops.linear: ((), {'input': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n" |
347 | 396 | ]
|
348 | 397 | },
|
349 | 398 | {
|
350 | 399 | "output_type": "stream",
|
351 | 400 | "name": "stderr",
|
352 | 401 | "text": [
|
353 |
| - "I0627 233146.650 fx2trt.py:190] Run Module elapsed time: 0:00:00.244369\n" |
| 402 | + "I0822 103503.964 fx2trt.py:204] Run Module elapsed time: 0:00:00.435984\n" |
354 | 403 | ]
|
355 | 404 | },
|
356 | 405 | {
|
357 | 406 | "output_type": "stream",
|
358 | 407 | "name": "stderr",
|
359 | 408 | "text": [
|
360 |
| - "I0627 233206.570 fx2trt.py:241] Build TRT engine elapsed time: 0:00:19.918630\n" |
| 409 | + "I0822 103520.647 fx2trt.py:258] Build TRT engine elapsed time: 0:00:16.681226\n" |
| 410 | + ] |
| 411 | + }, |
| 412 | + { |
| 413 | + "output_type": "stream", |
| 414 | + "name": "stderr", |
| 415 | + "text": [ |
| 416 | + "I0822 103520.658 lower_pass_manager_builder.py:168] Lowering submodule _run_on_acc_0 elapsed time 0:00:19.147071\n" |
361 | 417 | ]
|
362 | 418 | },
|
363 | 419 | {
|
|
374 | 430 | "== End benchmark iterations\n=== Running benchmark for: Configuration(batch_iter=50, batch_size=128, name='TRT FP16 Eager', trt=True, jit=False, fp16=True, accuracy_rtol=0.01) green\n"
|
375 | 431 | ]
|
376 | 432 | },
|
| 433 | + { |
| 434 | + "output_type": "stream", |
| 435 | + "name": "stderr", |
| 436 | + "text": [ |
| 437 | + "I0822 103523.067 pass_utils.py:166] == Log pass <function fuse_permute_matmul at 0x7f787a0e08b0> before/after graph to /tmp/tmpgphlicna\n" |
| 438 | + ] |
| 439 | + }, |
| 440 | + { |
| 441 | + "output_type": "stream", |
| 442 | + "name": "stderr", |
| 443 | + "text": [ |
| 444 | + "I0822 103523.106 pass_utils.py:166] == Log pass <function fuse_permute_linear at 0x7f787a0e0670> before/after graph to /tmp/tmpy9cumddi\n" |
| 445 | + ] |
| 446 | + }, |
| 447 | + { |
| 448 | + "output_type": "stream", |
| 449 | + "name": "stderr", |
| 450 | + "text": [ |
| 451 | + "I0822 103523.173 lower_pass_manager_builder.py:151] Now lowering submodule _run_on_acc_0\n" |
| 452 | + ] |
| 453 | + }, |
| 454 | + { |
| 455 | + "output_type": "stream", |
| 456 | + "name": "stderr", |
| 457 | + "text": [ |
| 458 | + "I0822 103523.174 lower.py:89] split_name='_run_on_acc_0' self.lower_setting.input_specs=[InputTensorSpec(shape=torch.Size([128, 3, 224, 224]), dtype=torch.float16, device=device(type='cuda', index=0), shape_ranges=[], has_batch_dim=True)]\n" |
| 459 | + ] |
| 460 | + }, |
377 | 461 | {
|
378 | 462 | "output_type": "stream",
|
379 | 463 | "name": "stdout",
|
380 | 464 | "text": [
|
381 |
| - "== Log pass <function fuse_permute_matmul at 0x7fbdfcc9f1f0> before/after graph to /tmp/tmpnoeblgd5\n== Log pass <function fuse_permute_linear at 0x7fbe36555f70> before/after graph to /tmp/tmpyb1egsof\n\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float16, 'weight': torch.float16})\nacc_ops.batch_norm: ((), {'input': torch.float16, 'running_mean': torch.float16, 'running_var': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\nacc_ops.relu: ((), {'input': torch.float16})\nacc_ops.max_pool2d: ((), {'input': torch.float16})\nacc_ops.add: ((), {'input': torch.float16, 'other': torch.float16})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float16})\nacc_ops.flatten: ((), {'input': torch.float16})\nacc_ops.linear: ((), {'input': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n" |
| 465 | + "\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float16, 'weight': torch.float16})\nacc_ops.batch_norm: ((), {'input': torch.float16, 'running_mean': torch.float16, 'running_var': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\nacc_ops.relu: ((), {'input': torch.float16})\nacc_ops.max_pool2d: ((), {'input': torch.float16})\nacc_ops.add: ((), {'input': torch.float16, 'other': torch.float16})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float16})\nacc_ops.flatten: ((), {'input': torch.float16})\nacc_ops.linear: ((), {'input': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n" |
| 466 | + ] |
| 467 | + }, |
| 468 | + { |
| 469 | + "output_type": "stream", |
| 470 | + "name": "stderr", |
| 471 | + "text": [ |
| 472 | + "I0822 103523.466 fx2trt.py:204] Run Module elapsed time: 0:00:00.288043\n" |
382 | 473 | ]
|
383 | 474 | },
|
384 | 475 | {
|
385 | 476 | "output_type": "stream",
|
386 | 477 | "name": "stderr",
|
387 | 478 | "text": [
|
388 |
| - "I0627 233208.996 fx2trt.py:190] Run Module elapsed time: 0:00:00.217076\n" |
| 479 | + "I0822 103553.687 fx2trt.py:258] Build TRT engine elapsed time: 0:00:30.220316\n" |
389 | 480 | ]
|
390 | 481 | },
|
391 | 482 | {
|
392 | 483 | "output_type": "stream",
|
393 | 484 | "name": "stderr",
|
394 | 485 | "text": [
|
395 |
| - "I0627 233244.147 fx2trt.py:241] Build TRT engine elapsed time: 0:00:35.150950\n" |
| 486 | + "I0822 103553.698 lower_pass_manager_builder.py:168] Lowering submodule _run_on_acc_0 elapsed time 0:00:30.523791\n" |
396 | 487 | ]
|
397 | 488 | },
|
398 | 489 | {
|
|
406 | 497 | "output_type": "stream",
|
407 | 498 | "name": "stdout",
|
408 | 499 | "text": [
|
409 |
| - "== End benchmark iterations\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='CUDA Eager', trt=False, jit=False, fp16=False, accuracy_rtol=-1)\nBS: 128, Time per iter: 15.00ms, QPS: 8530.72, Accuracy: None (rtol=-1)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP32 Eager', trt=True, jit=False, fp16=False, accuracy_rtol=0.001)\nBS: 128, Time per iter: 7.95ms, QPS: 16098.45, Accuracy: None (rtol=0.001)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP16 Eager', trt=True, jit=False, fp16=True, accuracy_rtol=0.01)\nBS: 128, Time per iter: 4.36ms, QPS: 29365.31, Accuracy: None (rtol=0.01)\n" |
| 500 | + "== End benchmark iterations\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='CUDA Eager', trt=False, jit=False, fp16=False, accuracy_rtol=-1)\nBS: 128, Time per iter: 14.66ms, QPS: 8732.53, Accuracy: None (rtol=-1)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP32 Eager', trt=True, jit=False, fp16=False, accuracy_rtol=0.001)\nBS: 128, Time per iter: 7.27ms, QPS: 17595.70, Accuracy: None (rtol=0.001)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP16 Eager', trt=True, jit=False, fp16=True, accuracy_rtol=0.01)\nBS: 128, Time per iter: 4.49ms, QPS: 28480.34, Accuracy: None (rtol=0.01)\n" |
410 | 501 | ]
|
411 | 502 | }
|
412 | 503 | ]
|
|
0 commit comments