Skip to content

Commit d7fd691

Browse files
author
Wei
authored
update fx notebook (#1297)
1 parent 10325f1 commit d7fd691

File tree

1 file changed

+137
-46
lines changed

1 file changed

+137
-46
lines changed

notebooks/getting_started_with_fx_path_lower_to_trt.ipynb

+137-46
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,14 @@
1010
"bento/extensions/theme/main.css": true
1111
},
1212
"kernelspec": {
13-
"display_name": "accelerators",
13+
"display_name": "dper3_pytorch (cinder)",
1414
"language": "python",
15-
"name": "bento_kernel_accelerators",
15+
"name": "bento_kernel_dper3_pytorch_cinder",
1616
"metadata": {
17-
"kernel_name": "bento_kernel_accelerators",
18-
"nightly_builds": true,
17+
"kernel_name": "bento_kernel_dper3_pytorch_cinder",
18+
"nightly_builds": false,
1919
"fbpkg_supported": true,
20-
"cinder_runtime": false,
20+
"cinder_runtime": true,
2121
"is_prebuilt": true
2222
}
2323
},
@@ -32,10 +32,10 @@
3232
"nbconvert_exporter": "python",
3333
"pygments_lexer": "ipython3"
3434
},
35-
"last_server_session_id": "c6f6ab3c-9274-41e7-8592-b1b583442e00",
36-
"last_kernel_id": "fcbf3a69-76a4-4730-9b41-bcd0b24729ca",
37-
"last_base_url": "https://devgpu005.ftw6.facebook.com:8093/",
38-
"last_msg_id": "e28f842c-f32dde25c1b80ef7d423dfee_407",
35+
"last_server_session_id": "24a1a10c-29aa-4e2b-a11f-2b5108fc1e58",
36+
"last_kernel_id": "5f014373-151c-4ee8-8939-4daab994d202",
37+
"last_base_url": "https://devgpu005.ftw6.facebook.com:8091/",
38+
"last_msg_id": "687e81e8-4414f32c89cd026dd1ea3fd9_139",
3939
"outputWidgetContext": {}
4040
},
4141
"nbformat": 4,
@@ -58,14 +58,14 @@
5858
{
5959
"cell_type": "code",
6060
"metadata": {
61-
"originalKey": "7909785f-b9b4-41dd-82af-c144b879df39",
61+
"originalKey": "7db2accc-9fa4-4a1e-8142-d887f2947bcd",
6262
"showInput": true,
6363
"customInput": null,
6464
"collapsed": false,
65-
"requestMsgId": "7db2accc-9fa4-4a1e-8142-d887f2947bcd",
65+
"requestMsgId": "b5d8efce-0963-4074-bc9d-e8e1a78fd424",
6666
"customOutput": null,
67-
"executionStartTime": 1656395936225,
68-
"executionStopTime": 1656395937851
67+
"executionStartTime": 1661189891682,
68+
"executionStopTime": 1661189891856
6969
},
7070
"source": [
7171
"import typing as t\n",
@@ -74,10 +74,10 @@
7474
"\n",
7575
"import torch\n",
7676
"import torchvision\n",
77-
"from torch_tensorrt.fx.lower import lower_to_trt\n",
77+
"from torch_tensorrt.fx.lower import compile\n",
7878
"from torch_tensorrt.fx.utils import LowerPrecision"
7979
],
80-
"execution_count": 4,
80+
"execution_count": 9,
8181
"outputs": []
8282
},
8383
{
@@ -98,16 +98,16 @@
9898
{
9999
"cell_type": "code",
100100
"metadata": {
101-
"originalKey": "a4455135-8633-4d2d-bdd3-6435a4a9f4dd",
101+
"originalKey": "2835fffa-cc50-479a-9080-c4f7002c0726",
102102
"showInput": true,
103103
"customInput": null,
104104
"code_folding": [],
105105
"hidden_ranges": [],
106106
"collapsed": false,
107-
"requestMsgId": "2835fffa-cc50-479a-9080-c4f7002c0726",
107+
"requestMsgId": "6ea72dbf-dbfe-451e-8613-15f87e34a1a5",
108108
"customOutput": null,
109-
"executionStartTime": 1656398717455,
110-
"executionStopTime": 1656398717662
109+
"executionStartTime": 1661189260550,
110+
"executionStopTime": 1661189262039
111111
},
112112
"source": [
113113
"@dataclass\n",
@@ -159,24 +159,39 @@
159159
" f\"Accuracy: {self.accuracy_res} (rtol={self.conf.accuracy_rtol})\"\n",
160160
" )"
161161
],
162-
"execution_count": 22,
163-
"outputs": []
162+
"execution_count": 2,
163+
"outputs": [
164+
{
165+
"output_type": "stream",
166+
"name": "stderr",
167+
"text": [
168+
"I0822 102740.872 _utils_internal.py:179] NCCL_DEBUG env var is set to None\n"
169+
]
170+
},
171+
{
172+
"output_type": "stream",
173+
"name": "stderr",
174+
"text": [
175+
"I0822 102740.873 _utils_internal.py:188] NCCL_DEBUG is INFO from /etc/nccl.conf\n"
176+
]
177+
}
178+
]
164179
},
165180
{
166181
"cell_type": "markdown",
167182
"metadata": {
168183
"originalKey": "3e462cf6-d282-402d-955b-a3ecb400bf0b",
169-
"showInput": true,
184+
"showInput": false,
170185
"customInput": null,
171186
"code_folding": [],
172187
"hidden_ranges": []
173188
},
174189
"source": [
175190
"Run FX path lowering and benchmark the given model according to the specified benchmark configuration. Prints the benchmark result for each configuration at the end of the run. `benchmark_torch_function` is the function that actually runs the given function for a fixed number of iterations and estimates the average time per call.\n",
176-
"The FX path lowering and TensorRT engine creation is integrated into `low_to_trt()` API which is defined in `fx/lower.py` file.\n",
191+
"The FX path lowering and TensorRT engine creation are integrated into the `compile()` API, which is defined in `fx/lower.py`.\n",
177192
"Its signature is listed below to show how it is used. It takes the original module, the input, and the lowering settings, and runs the lowering workflow to turn the module into an executable TRT engine.\n",
178193
"```\n",
179-
"def lower_to_trt(\n",
194+
"def compile(\n",
180195
" module: nn.Module,\n",
181196
" input: ,\n",
182197
" max_batch_size: int = 2048,\n",
@@ -212,22 +227,18 @@
212227
{
213228
"cell_type": "code",
214229
"metadata": {
215-
"originalKey": "91333212-7f6d-4bde-a248-44d485e83e5e",
230+
"originalKey": "3002935b-b95a-4a08-a57f-f7a35485af5b",
216231
"showInput": true,
217232
"customInput": null,
218233
"code_folding": [],
219234
"hidden_ranges": [],
220235
"collapsed": false,
221-
"requestMsgId": "3002935b-b95a-4a08-a57f-f7a35485af5b",
236+
"requestMsgId": "dc73f2d0-427b-4f71-bec1-b118cc5642d0",
222237
"customOutput": null,
223-
"executionStartTime": 1656397903207,
224-
"executionStopTime": 1656397964752
238+
"executionStartTime": 1661189697773,
239+
"executionStopTime": 1661189753875
225240
},
226241
"source": [
227-
"test_model = torchvision.models.resnet18(pretrained=True)\n",
228-
"input = [torch.rand(128, 3, 224, 224)] \n",
229-
"benchmark(test_model, input, 50, 128)\n",
230-
"\n",
231242
"def benchmark_torch_function(iters: int, f, *args) -> float:\n",
232243
" \"\"\"Estimates the average time duration for a single inference call in second\n",
233244
"\n",
@@ -266,7 +277,7 @@
266277
" time = benchmark_torch_function(conf.batch_iter, lambda: module(*input))\n",
267278
" elif not conf.jit:\n",
268279
" # Run lowering eager mode benchmark\n",
269-
" lowered_module = lower_to_trt(\n",
280+
" lowered_module = compile(\n",
270281
" module,\n",
271282
" input,\n",
272283
" max_batch_size=conf.batch_size,\n",
@@ -279,6 +290,7 @@
279290
" result = Result(module=module, input=input, conf=conf, time_sec=time)\n",
280291
" return result\n",
281292
"\n",
293+
"\n",
282294
"@torch.inference_mode()\n",
283295
"def benchmark(\n",
284296
" model,\n",
@@ -315,16 +327,25 @@
315327
" ),\n",
316328
" ]\n",
317329
"\n",
318-
" results = [\n",
319-
" run_configuration_benchmark(deepcopy(model), inputs, conf_)\n",
320-
" for conf_ in configurations\n",
321-
" ]\n",
330+
" results = [run_configuration_benchmark(deepcopy(model), inputs, conf_) for conf_ in configurations]\n",
322331
"\n",
323332
" for res in results:\n",
324-
" print(res.format())"
333+
" print(res.format())\n",
334+
"\n",
335+
"\n",
336+
"test_model = torchvision.models.resnet18(pretrained=True)\n",
337+
"input = [torch.rand(128, 3, 224, 224)]\n",
338+
"benchmark(test_model, input, 50, 128)"
325339
],
326-
"execution_count": 21,
340+
"execution_count": 8,
327341
"outputs": [
342+
{
343+
"output_type": "stream",
344+
"name": "stderr",
345+
"text": [
346+
"I0822 103458.189 manifold.py:1435] URL manifold://torchvision/tree/models/resnet18-f37072fd.pth was already cached in /home/wwei6/.torch/iopath_cache/manifold_cache/tree/models/resnet18-f37072fd.pth\n"
347+
]
348+
},
328349
{
329350
"output_type": "stream",
330351
"name": "stdout",
@@ -339,25 +360,60 @@
339360
"== End benchmark iterations\n=== Running benchmark for: Configuration(batch_iter=50, batch_size=128, name='TRT FP32 Eager', trt=True, jit=False, fp16=False, accuracy_rtol=0.001) green\n"
340361
]
341362
},
363+
{
364+
"output_type": "stream",
365+
"name": "stderr",
366+
"text": [
367+
"I0822 103501.297 pass_utils.py:166] == Log pass <function fuse_permute_matmul at 0x7f787a0e08b0> before/after graph to /tmp/tmpe_7p37fq\n"
368+
]
369+
},
370+
{
371+
"output_type": "stream",
372+
"name": "stderr",
373+
"text": [
374+
"I0822 103501.390 pass_utils.py:166] == Log pass <function fuse_permute_linear at 0x7f787a0e0670> before/after graph to /tmp/tmpg_a347f0\n"
375+
]
376+
},
377+
{
378+
"output_type": "stream",
379+
"name": "stderr",
380+
"text": [
381+
"I0822 103501.509 lower_pass_manager_builder.py:151] Now lowering submodule _run_on_acc_0\n"
382+
]
383+
},
384+
{
385+
"output_type": "stream",
386+
"name": "stderr",
387+
"text": [
388+
"I0822 103501.511 lower.py:89] split_name='_run_on_acc_0' self.lower_setting.input_specs=[InputTensorSpec(shape=torch.Size([128, 3, 224, 224]), dtype=torch.float32, device=device(type='cuda', index=0), shape_ranges=[], has_batch_dim=True)]\n"
389+
]
390+
},
342391
{
343392
"output_type": "stream",
344393
"name": "stdout",
345394
"text": [
346-
"== Log pass <function fuse_permute_matmul at 0x7fbdfcc9f1f0> before/after graph to /tmp/tmpaayayg72\n== Log pass <function fuse_permute_linear at 0x7fbe36555f70> before/after graph to /tmp/tmpdw_pq71j\n\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float32, 'weight': torch.float32})\nacc_ops.batch_norm: ((), {'input': torch.float32, 'running_mean': torch.float32, 'running_var': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\nacc_ops.relu: ((), {'input': torch.float32})\nacc_ops.max_pool2d: ((), {'input': torch.float32})\nacc_ops.add: ((), {'input': torch.float32, 'other': torch.float32})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float32})\nacc_ops.flatten: ((), {'input': torch.float32})\nacc_ops.linear: ((), {'input': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n"
395+
"\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float32, 'weight': torch.float32})\nacc_ops.batch_norm: ((), {'input': torch.float32, 'running_mean': torch.float32, 'running_var': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\nacc_ops.relu: ((), {'input': torch.float32})\nacc_ops.max_pool2d: ((), {'input': torch.float32})\nacc_ops.add: ((), {'input': torch.float32, 'other': torch.float32})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float32})\nacc_ops.flatten: ((), {'input': torch.float32})\nacc_ops.linear: ((), {'input': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n"
347396
]
348397
},
349398
{
350399
"output_type": "stream",
351400
"name": "stderr",
352401
"text": [
353-
"I0627 233146.650 fx2trt.py:190] Run Module elapsed time: 0:00:00.244369\n"
402+
"I0822 103503.964 fx2trt.py:204] Run Module elapsed time: 0:00:00.435984\n"
354403
]
355404
},
356405
{
357406
"output_type": "stream",
358407
"name": "stderr",
359408
"text": [
360-
"I0627 233206.570 fx2trt.py:241] Build TRT engine elapsed time: 0:00:19.918630\n"
409+
"I0822 103520.647 fx2trt.py:258] Build TRT engine elapsed time: 0:00:16.681226\n"
410+
]
411+
},
412+
{
413+
"output_type": "stream",
414+
"name": "stderr",
415+
"text": [
416+
"I0822 103520.658 lower_pass_manager_builder.py:168] Lowering submodule _run_on_acc_0 elapsed time 0:00:19.147071\n"
361417
]
362418
},
363419
{
@@ -374,25 +430,60 @@
374430
"== End benchmark iterations\n=== Running benchmark for: Configuration(batch_iter=50, batch_size=128, name='TRT FP16 Eager', trt=True, jit=False, fp16=True, accuracy_rtol=0.01) green\n"
375431
]
376432
},
433+
{
434+
"output_type": "stream",
435+
"name": "stderr",
436+
"text": [
437+
"I0822 103523.067 pass_utils.py:166] == Log pass <function fuse_permute_matmul at 0x7f787a0e08b0> before/after graph to /tmp/tmpgphlicna\n"
438+
]
439+
},
440+
{
441+
"output_type": "stream",
442+
"name": "stderr",
443+
"text": [
444+
"I0822 103523.106 pass_utils.py:166] == Log pass <function fuse_permute_linear at 0x7f787a0e0670> before/after graph to /tmp/tmpy9cumddi\n"
445+
]
446+
},
447+
{
448+
"output_type": "stream",
449+
"name": "stderr",
450+
"text": [
451+
"I0822 103523.173 lower_pass_manager_builder.py:151] Now lowering submodule _run_on_acc_0\n"
452+
]
453+
},
454+
{
455+
"output_type": "stream",
456+
"name": "stderr",
457+
"text": [
458+
"I0822 103523.174 lower.py:89] split_name='_run_on_acc_0' self.lower_setting.input_specs=[InputTensorSpec(shape=torch.Size([128, 3, 224, 224]), dtype=torch.float16, device=device(type='cuda', index=0), shape_ranges=[], has_batch_dim=True)]\n"
459+
]
460+
},
377461
{
378462
"output_type": "stream",
379463
"name": "stdout",
380464
"text": [
381-
"== Log pass <function fuse_permute_matmul at 0x7fbdfcc9f1f0> before/after graph to /tmp/tmpnoeblgd5\n== Log pass <function fuse_permute_linear at 0x7fbe36555f70> before/after graph to /tmp/tmpyb1egsof\n\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float16, 'weight': torch.float16})\nacc_ops.batch_norm: ((), {'input': torch.float16, 'running_mean': torch.float16, 'running_var': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\nacc_ops.relu: ((), {'input': torch.float16})\nacc_ops.max_pool2d: ((), {'input': torch.float16})\nacc_ops.add: ((), {'input': torch.float16, 'other': torch.float16})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float16})\nacc_ops.flatten: ((), {'input': torch.float16})\nacc_ops.linear: ((), {'input': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n"
465+
"\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float16, 'weight': torch.float16})\nacc_ops.batch_norm: ((), {'input': torch.float16, 'running_mean': torch.float16, 'running_var': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\nacc_ops.relu: ((), {'input': torch.float16})\nacc_ops.max_pool2d: ((), {'input': torch.float16})\nacc_ops.add: ((), {'input': torch.float16, 'other': torch.float16})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float16})\nacc_ops.flatten: ((), {'input': torch.float16})\nacc_ops.linear: ((), {'input': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n"
466+
]
467+
},
468+
{
469+
"output_type": "stream",
470+
"name": "stderr",
471+
"text": [
472+
"I0822 103523.466 fx2trt.py:204] Run Module elapsed time: 0:00:00.288043\n"
382473
]
383474
},
384475
{
385476
"output_type": "stream",
386477
"name": "stderr",
387478
"text": [
388-
"I0627 233208.996 fx2trt.py:190] Run Module elapsed time: 0:00:00.217076\n"
479+
"I0822 103553.687 fx2trt.py:258] Build TRT engine elapsed time: 0:00:30.220316\n"
389480
]
390481
},
391482
{
392483
"output_type": "stream",
393484
"name": "stderr",
394485
"text": [
395-
"I0627 233244.147 fx2trt.py:241] Build TRT engine elapsed time: 0:00:35.150950\n"
486+
"I0822 103553.698 lower_pass_manager_builder.py:168] Lowering submodule _run_on_acc_0 elapsed time 0:00:30.523791\n"
396487
]
397488
},
398489
{
@@ -406,7 +497,7 @@
406497
"output_type": "stream",
407498
"name": "stdout",
408499
"text": [
409-
"== End benchmark iterations\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='CUDA Eager', trt=False, jit=False, fp16=False, accuracy_rtol=-1)\nBS: 128, Time per iter: 15.00ms, QPS: 8530.72, Accuracy: None (rtol=-1)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP32 Eager', trt=True, jit=False, fp16=False, accuracy_rtol=0.001)\nBS: 128, Time per iter: 7.95ms, QPS: 16098.45, Accuracy: None (rtol=0.001)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP16 Eager', trt=True, jit=False, fp16=True, accuracy_rtol=0.01)\nBS: 128, Time per iter: 4.36ms, QPS: 29365.31, Accuracy: None (rtol=0.01)\n"
500+
"== End benchmark iterations\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='CUDA Eager', trt=False, jit=False, fp16=False, accuracy_rtol=-1)\nBS: 128, Time per iter: 14.66ms, QPS: 8732.53, Accuracy: None (rtol=-1)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP32 Eager', trt=True, jit=False, fp16=False, accuracy_rtol=0.001)\nBS: 128, Time per iter: 7.27ms, QPS: 17595.70, Accuracy: None (rtol=0.001)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP16 Eager', trt=True, jit=False, fp16=True, accuracy_rtol=0.01)\nBS: 128, Time per iter: 4.49ms, QPS: 28480.34, Accuracy: None (rtol=0.01)\n"
410501
]
411502
}
412503
]

0 commit comments

Comments
 (0)