diff --git a/python/models/llama2/LLaMA-2 E2E Notebook.ipynb b/python/models/llama/LLaMA-2 E2E Notebook.ipynb similarity index 100% rename from python/models/llama2/LLaMA-2 E2E Notebook.ipynb rename to python/models/llama/LLaMA-2 E2E Notebook.ipynb diff --git a/python/models/llama/README.md b/python/models/llama/README.md new file mode 100644 index 000000000..34379cd2f --- /dev/null +++ b/python/models/llama/README.md @@ -0,0 +1,145 @@ +# LLaMA-2 + +This folder contains a Jupyter notebook that demonstrates how to export, optimize, and run the LLaMA-2 model with ONNX Runtime. For more details, please see the notebook and the [ORT README](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/transformers/models/llama/README.md). + +## LLaMA-2 7B FP16 CUDA (1 A100 80GB) + +| Engine | Batch Size | Prompt Length | Prompt Processing Latency (ms) | Prompt Processing Throughput (tps) | Average Latency of First 128 Tokens Generated (ms) | Average Throughput of First 128 Tokens Generated (tps) | Average Latency of First 256 Tokens Generated (ms) | Average Throughput of First 256 Tokens Generated (tps) | Wall-Clock Latency (s) | Wall-Clock Throughput (tps) | +|-----------------|----|------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------| +| onnxruntime | 1 | 16 | 11.967659 | 1336.936489 | 10.52479073 | 95.01376562 | 10.54278947 | 94.85155731 | 3.08197999 | 88.25495327 | +| onnxruntime | 1 | 64 | 12.41350174 | 5155.676564 | 10.51662862 | 95.08750721 | 10.55776421 | 94.71702343 | 3.122560978 | 104.3349877 | +| onnxruntime | 1 | 256 | 22.4044323 | 11426.3105 | 10.7493531 | 93.02885402 | 10.78576129 | 92.71482774 | 3.139767647 | 163.0693916 | +| onnxruntime | 1 | 1024 | 75.05702972 | 13642.95928 | 11.31167263 | 88.40425575 | 11.34056505 | 88.17902774 | 3.332163334 | 384.1348313 | +| onnxruntime | 1 | 2048 | 135.2889538 | 15137.96909 | 12.08372787 | 82.75591863 | 12.11640146 | 82.53275559 | 
3.582954168 | 643.044787 | +| onnxruntime | 1 | 3840 | 251.5854836 | 15263.20178 | 13.44519481 | 74.37601419 | 13.48242071 | 74.17065688 | 4.047522068 | 1011.977188 | +| onnxruntime | 4 | 16 | 12.75753975 | 5016.641238 | 10.92023589 | 366.2924539 | 10.99625602 | 363.7601736 | 3.188626289 | 341.2127673 | +| onnxruntime | 4 | 64 | 22.7124691 | 11271.3417 | 11.15260646 | 358.6605531 | 11.19375136 | 357.3422236 | 3.256895304 | 393.0123264 | +| onnxruntime | 4 | 256 | 73.77910614 | 13879.26818 | 11.26689278 | 355.0224609 | 11.35130133 | 352.3825051 | 3.345386028 | 612.186451 | +| onnxruntime | 4 | 1024 | 250.616312 | 16343.7087 | 12.52830587 | 319.2770068 | 12.6034962 | 317.3722542 | 3.847688437 | 1330.669072 | +| onnxruntime | 4 | 2048 | 506.0505867 | 16188.10494 | 14.06471804 | 284.3995869 | 14.14138451 | 282.8577355 | 4.497682095 | 2049.055448 | +| onnxruntime | 4 | 3840 | 978.5776138 | 15696.25115 | 16.76318049 | 238.6182026 | 16.83990005 | 237.531101 | 5.664571524 | 2892.363514 | +| onnxruntime | 16 | 16 | 21.32916451 | 12002.34542 | 11.54885069 | 1385.419245 | 11.97430678 | 1336.194261 | 3.479871035 | 1250.621059 | +| onnxruntime | 16 | 64 | 73.28677177 | 13972.50793 | 11.71741821 | 1365.488516 | 12.04443816 | 1328.413977 | 3.52155304 | 1453.903986 | +| onnxruntime | 16 | 256 | 248.3313084 | 16494.09423 | 12.81819306 | 1248.225855 | 13.13442457 | 1218.172894 | 3.978744745 | 2058.940828 | +| onnxruntime | 16 | 1024 | 975.6298065 | 16793.25487 | 16.74189232 | 955.6864715 | 17.06122886 | 937.7988026 | 5.703416586 | 3590.830109 | +| onnxruntime | 16 | 2048 | 1993.696928 | 16435.79801 | 22.16357179 | 721.9053026 | 22.49017637 | 711.4217219 | 8.114635229 | 4542.902911 | +| onnxruntime | 16 | 3840 | 3924.712181 | 15654.65113 | 31.63040616 | 505.8423822 | 31.95275087 | 500.7393594 | 12.46947217 | 5255.715648 | +| pytorch-eager | 1 | 16 | 32.97473 | 485.2201 | 31.95276 | 31.2962 | 31.8423 | 31.40477 | 8.28506 | 32.83018 | +| pytorch-eager | 1 | 64 | 32.63447 | 1961.117 | 
31.33203 | 31.91622 | 31.36941 | 31.87819 | 8.164876 | 39.19227 | +| pytorch-eager | 1 | 256 | 34.46941 | 7426.875 | 31.69294 | 31.55277 | 31.53167 | 31.71414 | 8.207787 | 62.37979 | +| pytorch-eager | 1 | 1024 | 103.928 | 9852.975 | 31.84283 | 31.40424 | 31.80877 | 31.43787 | 8.408238 | 152.2317 | +| pytorch-eager | 1 | 2048 | 244.3801 | 8380.386 | 32.11394 | 31.13912 | 32.11288 | 31.14015 | 8.720115 | 264.2167 | +| pytorch-eager | 1 | 3840 | 611.0726 | 6284.032 | 32.04668 | 31.20448 | 32.02001 | 31.23048 | 9.293344 | 440.7455 | +| pytorch-eager | 4 | 16 | 32.7481 | 1954.312 | 31.60442 | 126.5646 | 31.45407 | 127.1696 | 8.18083 | 132.9938 | +| pytorch-eager | 4 | 64 | 33.18802 | 7713.626 | 31.20292 | 128.1931 | 31.26663 | 127.9319 | 8.132635 | 157.3906 | +| pytorch-eager | 4 | 256 | 89.22571 | 11476.51 | 31.29607 | 127.8116 | 31.29617 | 127.8111 | 8.248695 | 248.2817 | +| pytorch-eager | 4 | 1024 | 392.79 | 10427.96 | 31.26839 | 127.9247 | 31.22812 | 128.0897 | 8.707226 | 588.0174 | +| pytorch-eager | 4 | 2048 | 955.0025 | 8577.988 | 31.27921 | 127.8805 | 31.28768 | 127.8458 | 9.992102 | 922.3284 | +| pytorch-eager | 4 | 3840 | 2467.054 | 6226.05 | 31.35273 | 127.5806 | 31.33206 | 127.6647 | 15.97773 | 1025.427 | +| pytorch-eager | 16 | 16 | 33.24208 | 7701.083 | 31.49257 | 508.0563 | 32.28204 | 495.6316 | 8.396241 | 518.3272 | +| pytorch-eager | 16 | 64 | 86.10473 | 11892.49 | 31.38509 | 509.7962 | 32.1652 | 497.432 | 8.467185 | 604.6874 | +| pytorch-eager | 16 | 256 | 332.7774 | 12308.53 | 32.50902 | 492.171 | 32.52583 | 491.9167 | 8.955728 | 914.7219 | +| pytorch-eager | 16 | 1024 | 1543.535 | 10614.59 | 33.13551 | 482.8656 | 33.08991 | 483.5311 | 16.21622 | 1262.933 | +| pytorch-eager | 16 | 2048 | 3856.058 | 8497.797 | 33.05266 | 484.0761 | 33.0516 | 484.0915 | 28.2955 | 1302.822 | +| pytorch-eager | 16 | 3840 | OOM | OOM | OOM | OOM | OOM | OOM | OOM | OOM | +| pytorch-compile | 1 | 16 | 12.97314 | 1233.317 | 15.06754 | 66.36782 | 14.94154 | 66.9275 | 
3.949777 | 68.86465 | +| pytorch-compile | 1 | 64 | 13.30206 | 4811.285 | 14.92108 | 67.01927 | 14.79991 | 67.56797 | 3.913011 | 81.77846 | +| pytorch-compile | 1 | 256 | 21.05542 | 12158.39 | 14.74479 | 67.82056 | 14.73028 | 67.88739 | 3.938656 | 129.9936 | +| pytorch-compile | 1 | 1024 | 75.77764 | 13513.22 | 14.73202 | 67.87934 | 14.64023 | 68.30496 | 4.274032 | 299.483 | +| pytorch-compile | 1 | 2048 | 159.0262 | 12878.38 | 14.77194 | 67.69592 | 14.69836 | 68.0348 | 5.412601 | 425.6734 | +| pytorch-compile | 1 | 3840 | 339.8554 | 11298.92 | 14.06384 | 71.10431 | 14.04383 | 71.20565 | 7.099414 | 576.949 | +| pytorch-compile | 4 | 16 | 14.82386 | 4317.365 | 14.85815 | 269.2126 | 14.85617 | 269.2483 | 3.927973 | 276.9877 | +| pytorch-compile | 4 | 64 | 20.7674 | 12327.01 | 14.78437 | 270.556 | 14.80803 | 270.1237 | 3.955819 | 323.574 | +| pytorch-compile | 4 | 256 | 70.34887 | 14556.03 | 14.9404 | 267.7305 | 14.9841 | 266.9496 | 4.313815 | 474.7537 | +| pytorch-compile | 4 | 1024 | 290.767 | 14086.88 | 15.47449 | 258.49 | 15.64255 | 255.7128 | 6.704303 | 763.6886 | +| pytorch-compile | 4 | 2048 | 644.6995 | 12706.69 | 17.22421 | 232.2313 | 17.09852 | 233.9384 | 10.17314 | 905.9152 | +| pytorch-compile | 4 | 3840 | 1488.37 | 10320.01 | 17.54926 | 227.9298 | 16.19208 | 247.0344 | 16.26342 | 1007.414 | +| pytorch-compile | 16 | 16 | 20.62188 | 12414 | 16.14968 | 990.7319 | 17.28572 | 925.6194 | 5.190115 | 838.5171 | +| pytorch-compile | 16 | 64 | 68.86672 | 14869.3 | 15.93814 | 1003.881 | 17.00272 | 941.0257 | 5.524729 | 926.7423 | +| pytorch-compile | 16 | 256 | 262.5498 | 15600.85 | 16.28529 | 982.4817 | 18.87143 | 847.8423 | 7.905223 | 1036.277 | +| pytorch-compile | 16 | 1024 | 1134.517 | 14441.39 | 19.28937 | 829.4722 | 20.54817 | 778.6581 | 16.55617 | 1237.001 | +| pytorch-compile | 16 | 2048 | 3682.501 | 8898.3 | 32.4632 | 492.8657 | 32.31265 | 495.1621 | 28.07167 | 1313.21 | +| pytorch-compile | 16 | 3840 | OOM | OOM | OOM | OOM | OOM | OOM | OOM | OOM | + 
+## LLaMA-2 13B FP16 CUDA (1 A100 80GB) + +| Engine | Batch Size | Prompt Length | Prompt Processing Latency (ms) | Prompt Processing Throughput (tps) | Average Latency of First 128 Tokens Generated (ms) | Average Throughput of First 128 Tokens Generated (tps) | Average Latency of First 256 Tokens Generated (ms) | Average Throughput of First 256 Tokens Generated (tps) | Wall-Clock Latency (s) | Wall-Clock Throughput (tps) | +|-----------------|----|------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------| +| onnxruntime | 1 | 16 | 11.13351822 | 1437.101883 | 10.54286957 | 94.85083673 | 10.57387795 | 94.57268227 | 3.068660975 | 88.6380093 | +| onnxruntime | 1 | 64 | 11.76165581 | 5441.41072 | 10.52967831 | 94.9696629 | 10.58431901 | 94.47938965 | 3.021231651 | 105.9170686 | +| onnxruntime | 1 | 256 | 21.47537231 | 11920.63151 | 10.69651358 | 93.48840561 | 10.74470952 | 93.06905859 | 3.068811178 | 166.8398511 | +| onnxruntime | 1 | 1024 | 75.85664749 | 13499.14653 | 11.28780469 | 88.59118555 | 11.37996651 | 87.8737208 | 3.290901899 | 388.9511262 | +| onnxruntime | 1 | 2048 | 136.8558168 | 14964.65439 | 12.11677119 | 82.53023716 | 12.13857438 | 82.38199712 | 3.555114031 | 648.0804779 | +| onnxruntime | 1 | 3840 | 255.1124668 | 15052.18482 | 13.43484595 | 74.433306 | 13.47557642 | 74.2083284 | 4.00520277 | 1022.669821 | +| onnxruntime | 4 | 16 | 11.84427261 | 5403.4555 | 10.9059643 | 366.7717856 | 10.99776383 | 363.7103015 | 3.129503489 | 347.658983 | +| onnxruntime | 4 | 64 | 21.20970249 | 12069.94771 | 10.98915562 | 363.9952093 | 11.09936275 | 360.3810499 | 3.171034098 | 403.6538115 | +| onnxruntime | 4 | 256 | 74.3429184 | 13774.00864 | 11.28847338 | 354.3437509 | 11.38650626 | 351.293005 | 3.290605307 | 622.3778938 | +| onnxruntime | 4 | 1024 | 254.2414045 | 16110.67248 | 12.47876324 | 320.5445863 | 12.57592719 | 318.0679992 | 3.774018049 | 1356.64428 | +| onnxruntime | 4 | 2048 | 513.8937044 | 
15941.03981 | 14.03383911 | 285.0253569 | 14.12440091 | 283.197852 | 4.431999207 | 2079.422755 | +| onnxruntime | 4 | 3840 | 981.845336 | 15644.01178 | 16.76735282 | 238.5588258 | 16.86015446 | 237.2457506 | 5.601641655 | 2924.856856 | +| onnxruntime | 16 | 16 | 35.34348 | 7243.203 | 21.84567 | 732.4107 | 21.85107 | 732.2296 | 6.171947 | 705.126 | +| onnxruntime | 16 | 64 | 125.9579 | 8129.699 | 21.25177 | 752.8786 | 21.75558 | 735.4436 | 6.062363 | 844.5551 | +| onnxruntime | 16 | 256 | 463.7862 | 8831.656 | 22.7791 | 702.3983 | 23.27515 | 687.4284 | 6.786547 | 1207.094 | +| onnxruntime | 16 | 1024 | 1873.496 | 8745.146 | 28.65446 | 558.3773 | 29.13779 | 549.115 | 9.696249 | 2112.157 | +| onnxruntime | 16 | 2048 | OOM | OOM | OOM | OOM | OOM | OOM | OOM | OOM | +| onnxruntime | 16 | 3840 | OOM | OOM | OOM | OOM | OOM | OOM | OOM | OOM | +| pytorch-eager | 1 | 16 | 42.12493 | 379.8226 | 41.37673 | 24.16818 | 41.3158 | 24.20382 | 10.72739 | 25.35564 | +| pytorch-eager | 1 | 64 | 42.26999 | 1514.076 | 41.31047 | 24.20694 | 41.32404 | 24.19899 | 10.72874 | 29.82644 | +| pytorch-eager | 1 | 256 | 45.08846 | 5677.728 | 41.18236 | 24.28224 | 41.16458 | 24.29273 | 10.70244 | 47.83956 | +| pytorch-eager | 1 | 1024 | 181.966 | 5627.425 | 41.06609 | 24.35099 | 41.0729 | 24.34695 | 10.88831 | 117.5572 | +| pytorch-eager | 1 | 2048 | 403.207 | 5079.277 | 42.42671 | 23.57006 | 42.3149 | 23.63234 | 11.11297 | 207.3253 | +| pytorch-eager | 1 | 3840 | 1018.319 | 3770.922 | 41.1257 | 24.3157 | 41.12887 | 24.31382 | 12.18967 | 336.0221 | +| pytorch-eager | 4 | 16 | 43.02286 | 1487.581 | 41.28961 | 96.87667 | 41.34238 | 96.75302 | 10.7351 | 101.3498 | +| pytorch-eager | 4 | 64 | 44.4156 | 5763.741 | 40.88859 | 97.82681 | 41.0022 | 97.55573 | 10.65282 | 120.156 | +| pytorch-eager | 4 | 256 | 160.3949 | 6384.245 | 41.0125 | 97.53125 | 41.05703 | 97.42545 | 10.85462 | 188.6755 | +| pytorch-eager | 4 | 1024 | 659.7745 | 6208.182 | 40.50975 | 98.74166 | 40.46299 | 98.85578 | 11.51654 | 
444.5778 | +| pytorch-eager | 4 | 2048 | 1623.698 | 5045.274 | 40.59845 | 98.52594 | 40.50154 | 98.76167 | 15.95867 | 577.4918 | +| pytorch-eager | 4 | 3840 | 4142.559 | 3707.853 | 41.05242 | 97.43639 | 40.99807 | 97.56556 | 25.11443 | 652.374 | +| pytorch-eager | 16 | 16 | 44.22766 | 5788.233 | 43.33905 | 369.1821 | 46.63927 | 343.0585 | 12.10059 | 359.6518 | +| pytorch-eager | 16 | 64 | 154.077 | 6646.029 | 41.8791 | 382.0522 | 46.081 | 347.2147 | 12.17601 | 420.499 | +| pytorch-eager | 16 | 256 | 567.7781 | 7214.086 | 41.41301 | 386.352 | 41.57611 | 384.8364 | 12.75126 | 642.4463 | +| pytorch-eager | 16 | 1024 | 2726.571 | 6009.013 | 41.55068 | 385.0719 | 42.79878 | 373.8425 | 25.90561 | 790.5624 | +| pytorch-eager | 16 | 2048 | OOM | OOM | OOM | OOM | OOM | OOM | OOM | OOM | +| pytorch-eager | 16 | 3840 | OOM | OOM | OOM | OOM | OOM | OOM | OOM | OOM | +| pytorch-compile | 1 | 16 | 19.38103 | 825.5494 | 19.05318 | 52.48469 | 18.88779 | 52.94427 | 5.558527 | 48.93383 | +| pytorch-compile | 1 | 64 | 19.37849 | 3302.632 | 18.47518 | 54.12666 | 18.42021 | 54.28819 | 5.583981 | 57.30679 | +| pytorch-compile | 1 | 256 | 34.72291 | 7372.653 | 18.41529 | 54.3027 | 18.41931 | 54.29085 | 5.862381 | 87.33652 | +| pytorch-compile | 1 | 1024 | 138.4733 | 7394.93 | 19.16589 | 52.17604 | 19.16391 | 52.1814 | 6.957563 | 183.9725 | +| pytorch-compile | 1 | 2048 | 271.4814 | 7543.794 | 17.87733 | 55.93676 | 17.86563 | 55.9734 | 8.67444 | 265.6079 | +| pytorch-compile | 1 | 3840 | 597.6365 | 6425.311 | 18.66753 | 53.56894 | 18.60976 | 53.73523 | 11.2927 | 362.7123 | +| pytorch-compile | 4 | 16 | 19.46838 | 3287.381 | 19.08949 | 209.5394 | 19.07973 | 209.6465 | 6.095531 | 178.4914 | +| pytorch-compile | 4 | 64 | 34.53485 | 7412.803 | 19.00581 | 210.462 | 19.21915 | 208.1257 | 6.400232 | 199.9928 | +| pytorch-compile | 4 | 256 | 130.9406 | 7820.343 | 24.85916 | 160.9065 | 22.26585 | 179.6473 | 7.743704 | 264.4729 | +| pytorch-compile | 4 | 1024 | 505.2351 | 8107.116 | 20.01902 | 
199.81 | 20.86037 | 191.7511 | 11.46451 | 446.5955 | +| pytorch-compile | 4 | 2048 | 1153.294 | 7103.134 | 19.50131 | 205.1144 | 20.31255 | 196.9226 | 16.80264 | 548.4854 | +| pytorch-compile | 4 | 3840 | 2578.104 | 5957.867 | 19.23593 | 207.9442 | 19.10223 | 209.3997 | 24.32255 | 673.6136 | +| pytorch-compile | 16 | 16 | 34.4058 | 7440.605 | 20.89283 | 765.8128 | 24.65018 | 649.0825 | 8.886711 | 489.72 | +| pytorch-compile | 16 | 64 | 128.428 | 7973.342 | 24.17005 | 661.9762 | 27.00489 | 592.4853 | 10.14654 | 504.6057 | +| pytorch-compile | 16 | 256 | 461.1762 | 8881.638 | 24.57411 | 651.0918 | 26.20077 | 610.6691 | 13.1846 | 621.331 | +| pytorch-compile | 16 | 1024 | 2606.956 | 6284.725 | 39.67544 | 403.2722 | 39.51063 | 404.9543 | 25.67172 | 797.7651 | +| pytorch-compile | 16 | 2048 | OOM | OOM | OOM | OOM | OOM | OOM | OOM | OOM | +| pytorch-compile | 16 | 3840 | OOM | OOM | OOM | OOM | OOM | OOM | OOM | OOM | + +## LLaMA-2 70B FP16 CUDA (4 A100 80GB) + +| Engine | Batch Size | Prompt Length | Prompt Processing Latency (ms) | Prompt Processing Throughput (tps) | Average Latency of First 128 Tokens Generated (ms) | Average Throughput of First 128 Tokens Generated (tps) | Average Latency of First 256 Tokens Generated (ms) | Average Throughput of First 256 Tokens Generated (tps) | Wall-Clock Latency (s) | Wall-Clock Throughput (tps) | +|-------------|----|------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------| +| onnxruntime | 1 | 16 | 33.90884399 | 471.8533018 | 32.09379315 | 31.15867281 | 32.19444305 | 31.06126105 | 9.030313015 | 30.12077207 | +| onnxruntime | 1 | 64 | 41.69011116 | 1535.136228 | 33.19698013 | 30.12322193 | 33.02684613 | 30.27839825 | 9.254086733 | 34.57931714 | +| onnxruntime | 1 | 256 | 68.52030754 | 3736.118666 | 32.60990791 | 30.66552665 | 32.65001439 | 30.62785787 | 9.191497326 | 55.70365544 | +| onnxruntime | 1 | 1024 | 207.4344158 | 4936.50003 | 34.06454809 | 29.35603306 | 
34.11752358 | 29.31045091 | 9.773607016 | 130.9649547 | +| onnxruntime | 1 | 2048 | 384.3646049 | 5328.274179 | 36.26316041 | 27.57619548 | 36.19885352 | 27.62518431 | 10.52682996 | 218.8693091 | +| onnxruntime | 1 | 3840 | 688.8821125 | 5574.248381 | 39.21672702 | 25.49932327 | 39.3556226 | 25.40932995 | 11.68651938 | 350.4893001 | +| onnxruntime | 4 | 16 | 41.2812233 | 1550.341654 | 33.15291367 | 120.6530454 | 33.26787427 | 120.2361163 | 9.408552885 | 115.6394627 | +| onnxruntime | 4 | 64 | 67.01588631 | 3819.989768 | 33.51482376 | 119.3501726 | 33.34233537 | 119.9676014 | 9.426578999 | 135.7862699 | +| onnxruntime | 4 | 256 | 207.9341412 | 4924.636206 | 33.40258636 | 119.751206 | 33.4388027 | 119.6215079 | 9.585339069 | 213.6596301 | +| onnxruntime | 4 | 1024 | 723.0362892 | 5664.9992 | 34.91823189 | 114.5533374 | 35.27628351 | 113.3906297 | 10.53820944 | 485.8510385 | +| onnxruntime | 4 | 2048 | 1404.65641 | 5832.031193 | 37.19914332 | 107.529358 | 37.12219838 | 107.7522392 | 11.71326399 | 786.8003324 | +| onnxruntime | 4 | 3840 | 2701.535702 | 5685.655011 | 41.06120393 | 97.41555574 | 41.02838039 | 97.49349015 | 14.05006671 | 1166.11546 | +| onnxruntime | 16 | 16 | 67.89302826 | 3770.637524 | 33.8511169 | 472.6579643 | 34.15830154 | 468.4073645 | 9.626672506 | 452.0772881 | +| onnxruntime | 16 | 64 | 204.0295601 | 5018.880595 | 34.05243531 | 469.8636046 | 34.30210985 | 466.4436115 | 9.769803524 | 524.0637631 | +| onnxruntime | 16 | 256 | 715.4211998 | 5725.298609 | 35.19898839 | 454.558518 | 35.41505337 | 451.7852856 | 10.63756585 | 770.1009906 | +| onnxruntime | 16 | 1024 | 2801.265717 | 5848.784677 | 38.95713016 | 410.7078713 | 39.20867946 | 408.0729119 | 13.68341517 | 1496.702376 | +| onnxruntime | 16 | 2048 | 5650.010824 | 5799.634907 | 46.37654498 | 345.0019834 | 46.1302707 | 346.843835 | 18.42047048 | 2001.251816 | +| onnxruntime | 16 | 3840 | 10810.8182 | 5683.196118 | 54.980563 | 291.0119345 | 54.17970195 | 295.3135477 | 25.82262492 | 2537.92944 | diff 
--git a/python/models/llama2/README.md b/python/models/llama2/README.md deleted file mode 100644 index b89dd91dd..000000000 --- a/python/models/llama2/README.md +++ /dev/null @@ -1,126 +0,0 @@ -# LLaMA-2 - -This folder contains a Jupyter notebook that demonstrates how to export, optimize, and run the LLaMA-2 model with ONNX Runtime. For more details, please see the notebook and the [ORT README](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/transformers/models/llama/README.md). - -## LLaMA-2 7B FP16 CUDA (1 A100 80GB) - -| Engine | Batch Size | Prompt Length | Prompt Latency (ms) | Prompt Throughput (tps) | First 128 Tokens Generated Avg Latency (ms) | First 128 Tokens Generated Avg Throughput (tps) | First 256 Tokens Generated Avg Latency (ms) | First 256 Tokens Generated Avg Throughput (tps) | Wall-Clock Latency (s) | Wall-Clock Throughput (tps) | -|-----------------|------------|---------------|---------------------|-------------------------|---------------------------------------------|-------------------------------------------------|---------------------------------------------|-------------------------------------------------|------------------------|-----------------------------| -| onnxruntime | 1 | 16 | 11.967659 | 1336.936489 | 10.52479073 | 95.01376562 | 10.54278947 | 94.85155731 | 3.08197999 | 88.25495327 | -| onnxruntime | 1 | 64 | 12.41350174| 5155.676564 | 10.51662862 | 95.08750721 | 10.55776421 | 94.71702343 | 3.122560978 | 104.3349877 | -| onnxruntime | 1 | 256 | 22.4044323 | 11426.3105 | 10.7493531 | 93.02885402 | 10.78576129 | 92.71482774 | 3.139767647 | 163.0693916 | -| onnxruntime | 1 | 1024 | 75.05702972| 13642.95928 | 11.31167263 | 88.40425575 | 11.34056505 | 88.17902774 | 3.332163334 | 384.1348313 | -| onnxruntime | 1 | 2048 | 135.2889538| 15137.96909 | 12.08372787 | 82.75591863 | 12.11640146 | 82.53275559 | 3.582954168 | 643.044787 | -| onnxruntime | 1 | 3840 | 251.5854836| 15263.20178 | 13.44519481 | 
74.37601419 | 13.48242071 | 74.17065688 | 4.047522068 | 1011.977188 | -| onnxruntime | 4 | 16 | 12.75753975| 5016.641238 | 10.92023589 | 366.2924539 | 10.99625602 | 363.7601736 | 3.188626289 | 341.2127673 | -| onnxruntime | 4 | 64 | 22.7124691 | 11271.3417 | 11.15260646 | 358.6605531 | 11.19375136 | 357.3422236 | 3.256895304 | 393.0123264 | -| onnxruntime | 4 | 256 | 73.77910614| 13879.26818 | 11.26689278 | 355.0224609 | 11.35130133 | 352.3825051 | 3.345386028 | 612.186451 | -| onnxruntime | 4 | 1024 | 250.616312 | 16343.7087 | 12.52830587 | 319.2770068 | 12.6034962 | 317.3722542 | 3.847688437 | 1330.669072 | -| onnxruntime | 4 | 2048 | 506.0505867| 16188.10494 | 14.06471804 | 284.3995869 | 14.14138451 | 282.8577355 | 4.497682095 | 2049.055448 | -| onnxruntime | 4 | 3840 | 978.5776138| 15696.25115 | 16.76318049 | 238.6182026 | 16.83990005 | 237.531101 | 5.664571524 | 2892.363514 | -| onnxruntime | 16| 16 | 21.32916451| 12002.34542 | 11.54885069 | 1385.419245 | 11.97430678 | 1336.194261 | 3.479871035 | 1250.621059 | -| onnxruntime | 16| 64 | 73.28677177| 13972.50793 | 11.71741821 | 1365.488516 | 12.04443816 | 1328.413977 | 3.52155304 | 1453.903986 | -| onnxruntime | 16| 256 | 248.3313084| 16494.09423 | 12.81819306 | 1248.225855 | 13.13442457 | 1218.172894 | 3.978744745 | 2058.940828 | -| onnxruntime | 16| 1024 | 975.6298065| 16793.25487 | 16.74189232 | 955.6864715 | 17.06122886 | 937.7988026 | 5.703416586 | 3590.830109 | -| onnxruntime | 16| 2048 | 1993.696928| 16435.79801 | 22.16357179 | 721.9053026 | 22.49017637 | 711.4217219 | 8.114635229 | 4542.902911 | -| onnxruntime | 16| 3840 | 3924.712181| 15654.65113 | 31.63040616 | 505.8423822 | 31.95275087 | 500.7393594 | 12.46947217 | 5255.715648 | -| pytorch-eager | 1 | 16 | 28.39229584| 563.5331532 | 28.91619876 | 34.58269215 | 28.64711825 | 34.90752512 | 7.47516489 | 36.38715721 | -| pytorch-eager | 1 | 64 | 28.33832264| 2258.425836 | 28.10826525 | 35.57672418 | 28.13924383 | 35.53755765 | 7.326321363 | 43.67812769 | 
-| pytorch-eager | 1 | 256 | 29.52625751| 8670.248841 | 28.09674852 | 35.59130692 | 28.07580587 | 35.61785563 | 7.309934616 | 70.04166616 | -| pytorch-eager | 1 | 1024 | 84.25255299| 12153.93438 | 28.16583589 | 35.50400578 | 28.17474399 | 35.49278036 | 7.446149111 | 171.9009358 | -| pytorch-eager | 1 | 2048 | 163.7624216| 12505.92157 | 28.19093503 | 35.47239561 | 28.41567807 | 35.19184014 | 7.653230667 | 301.0493346 | -| pytorch-eager | 1 | 3840 | 305.544219 | 12567.73901 | 28.34107354 | 35.2844785 | 28.31646986 | 35.31513656 | 7.866804838 | 520.668821 | -| pytorch-eager | 4 | 16 | 28.24880123| 2265.582864 | 28.22476067 | 141.7195365 | 28.22929993 | 141.696748 | 7.349802017 | 148.0311983 | -| pytorch-eager | 4 | 64 | 29.51703072| 8672.959095 | 28.09201181 | 142.3892325 | 28.07555441 | 142.4726986 | 7.310798168 | 175.0834821 | -| pytorch-eager | 4 | 256 | 82.96530724| 12342.50838 | 28.09202299 | 142.3891758 | 28.1163929 | 142.2657598 | 7.429426908 | 275.6605624 | -| pytorch-eager | 4 | 1024 | 312.88486 | 13091.07766 | 28.82966399 | 138.7459806 | 28.59257255 | 139.8964711 | 7.932596445 | 645.4381028 | -| pytorch-eager | 4 | 2048 | 625.9682178| 13086.92641 | 27.89676189 | 143.3858171 | 27.89451275 | 143.3973784 | 9.468767405 | 973.3051417 | -| pytorch-eager | 4 | 3840 | 1205.71981 | 12739.27813 | 28.0031655 | 142.8409942 | 27.98954025 | 142.9105289 | 14.3259058 | 1143.662413 | -| pytorch-eager | 16| 16 | 29.48878765| 8681.265674 | 28.67878228 | 557.9037437 | 29.3890154 | 544.4210969 | 7.648054838 | 569.0335768 | -| pytorch-eager | 16| 64 | 82.87042141| 12356.64043 | 28.48163992 | 561.7654055 | 29.31071911 | 545.8753823 | 7.73479414 | 661.943926 | -| pytorch-eager | 16| 256 | 309.0610218| 13253.0462 | 28.26745622 | 566.0219256 | 28.41518167 | 563.0792787 | 8.03791666 | 1019.169562 | -| pytorch-eager | 16| 1024 | 1220.398097| 13425.12746 | 28.05251256 | 570.3588927 | 28.0295182 | 570.8267936 | 15.23662639 | 1344.129565 | -| pytorch-eager | 16| 2048 | OOM | OOM | OOM | 
OOM | OOM | OOM | OOM | OOM | -| pytorch-eager | 16| 3840 | OOM | OOM | OOM | OOM | OOM | OOM | OOM | OOM | -| pytorch-compile | 1 | 16 | 29.29198742| 546.2244596 | 28.4164343 | 35.19090359 | 28.44167873 | 35.15966865 | 7.420462847 | 36.65539544 | -| pytorch-compile | 1 | 64 | 30.74082851| 2081.921766 | 28.99985015 | 34.4829368 | 28.99154648 | 34.4928133 | 7.562855721 | 42.31205934 | -| pytorch-compile | 1 | 256 | 30.93654156| 8275.003834 | 29.35606614 | 34.06450971 | 29.32720538 | 34.09803242 | 7.642072678 | 66.99753085 | -| pytorch-compile | 1 | 1024 | 83.73229504| 12229.45101 | 29.33040075 | 34.09431765 | 29.32655532 | 34.09878825 | 7.727388144 | 165.6445847 | -| pytorch-compile | 1 | 2048 | 163.1656265| 12551.66326 | 28.71328779 | 34.82708102 | 28.88092864 | 34.62492541 | 7.760000467 | 296.9071986 | -| pytorch-compile | 1 | 3840 | 305.436058 | 12572.18949 | 28.93828973 | 34.55629235 | 28.79324462 | 34.73036864 | 8.011547804 | 511.2620058 | -| pytorch-compile | 4 | 16 | 28.16486835| 2272.334427 | 28.76319736 | 139.0665978 | 28.68680656 | 139.4369217 | 7.47304821 | 145.5898543 | -| pytorch-compile | 4 | 64 | 30.34268856| 8436.958363 | 28.90248969 | 138.3963819 | 28.87104731 | 138.5471042 | 7.521306515 | 170.1831986 | -| pytorch-compile | 4 | 256 | 82.32629299| 12438.31057 | 28.95763144 | 138.1328445 | 28.93526014 | 138.2396419 | 7.625442266 | 268.574586 | -| pytorch-compile | 4 | 1024 | 313.4985256| 13065.4522 | 29.65001948 | 134.9071626 | 29.51486036 | 135.5249509 | 8.291597605 | 617.4925803 | -| pytorch-compile | 4 | 2048 | 627.4584389| 13055.84481 | 28.3304248 | 141.1909644 | 28.2943137 | 141.3711618 | 9.530734301 | 966.9769096 | -| pytorch-compile | 4 | 3840 | 1208.906603| 12705.69618 | 29.69196066 | 134.7166004 | 28.80228218 | 138.8778839 | 14.60439372 | 1121.85417 | -| pytorch-compile | 16| 16 | 29.28521633| 8741.612051 | 28.65620144 | 558.3433671 | 29.6270512 | 540.0469959 | 7.712681055 | 564.2655218 | -| pytorch-compile | 16| 64 | 82.25325108| 12449.35594 
| 29.05234694 | 550.7300333 | 29.91442662 | 534.8589898 | 7.878657341 | 649.8569208 | -| pytorch-compile | 16| 256 | 309.0423536| 13253.84677 | 29.45811115 | 543.1441248 | 29.59321346 | 540.6645014 | 8.359596491 | 979.9516052 | -| pytorch-compile | 16| 1024 | 1220.913434| 13419.46083 | 28.96543033 | 552.3826097 | 28.92627008 | 553.1304228 | 15.39626837 | 1330.192454 | -| pytorch-compile | 16| 2048 | OOM | OOM | OOM | OOM | OOM | OOM | OOM | OOM | -| pytorch-compile | 16| 3840 | OOM | OOM | OOM | OOM | OOM | OOM | OOM | OOM | - -## LLaMA-2 13B FP16 CUDA (1 A100 80GB) - -| Engine | Batch Size | Prompt Length | Prompt Latency (ms) | Prompt Throughput (tps) | First 128 Tokens Generated Avg Latency (ms) | First 128 Tokens Generated Avg Throughput (tps) | First 256 Tokens Generated Avg Latency (ms) | First 256 Tokens Generated Avg Throughput (tps) | Wall-Clock Latency (s) | Wall-Clock Throughput (tps) | -|-----------------|------------|---------------|---------------------|-------------------------|---------------------------------------------|-------------------------------------------------|---------------------------------------------|-------------------------------------------------|------------------------|-----------------------------| -| onnxruntime | 1 | 16 | 11.13351822| 1437.101883 | 10.54286957 | 94.85083673 | 10.57387795 | 94.57268227 | 3.068660975 | 88.6380093 | -| onnxruntime | 1 | 64 | 11.76165581| 5441.41072 | 10.52967831 | 94.9696629 | 10.58431901 | 94.47938965 | 3.021231651 | 105.9170686 | -| onnxruntime | 1 | 256 | 21.47537231| 11920.63151 | 10.69651358 | 93.48840561 | 10.74470952 | 93.06905859 | 3.068811178 | 166.8398511 | -| onnxruntime | 1 | 1024 | 75.85664749| 13499.14653 | 11.28780469 | 88.59118555 | 11.37996651 | 87.8737208 | 3.290901899 | 388.9511262 | -| onnxruntime | 1 | 2048 | 136.8558168| 14964.65439 | 12.11677119 | 82.53023716 | 12.13857438 | 82.38199712 | 3.555114031 | 648.0804779 | -| onnxruntime | 1 | 3840 | 255.1124668| 15052.18482 | 
13.43484595 | 74.433306 | 13.47557642 | 74.2083284 | 4.00520277 | 1022.669821 | -| onnxruntime | 4 | 16 | 11.84427261| 5403.4555 | 10.9059643 | 366.7717856 | 10.99776383 | 363.7103015 | 3.129503489 | 347.658983 | -| onnxruntime | 4 | 64 | 21.20970249| 12069.94771 | 10.98915562 | 363.9952093 | 11.09936275 | 360.3810499 | 3.171034098 | 403.6538115 | -| onnxruntime | 4 | 256 | 74.3429184 | 13774.00864 | 11.28847338 | 354.3437509 | 11.38650626 | 351.293005 | 3.290605307 | 622.3778938 | -| onnxruntime | 4 | 1024 | 254.2414045| 16110.67248 | 12.47876324 | 320.5445863 | 12.57592719 | 318.0679992 | 3.774018049 | 1356.64428 | -| onnxruntime | 4 | 2048 | 513.8937044| 15941.03981 | 14.03383911 | 285.0253569 | 14.12440091 | 283.197852 | 4.431999207 | 2079.422755 | -| onnxruntime | 4 | 3840 | 981.845336 | 15644.01178 | 16.76735282 | 238.5588258 | 16.86015446 | 237.2457506 | 5.601641655 | 2924.856856 | -| pytorch-eager | 1 | 16 | 36.80826187| 434.6850187 | 36.94714047 | 27.06569405 | 37.05261089 | 26.98865143 | 9.640327454 | 28.21480923 | -| pytorch-eager | 1 | 64 | 36.83074474| 1737.678682 | 37.16384247 | 26.90787425 | 37.19904274 | 26.88241219 | 9.661841154 | 33.11998147 | -| pytorch-eager | 1 | 256 | 43.54115963| 5879.494303 | 36.97926924 | 27.04217851 | 36.98103409 | 27.04088797 | 9.618678331 | 53.22976633 | -| pytorch-eager | 1 | 1024 | 153.0961895| 6688.605401 | 37.06065193 | 26.98279571 | 37.03337349 | 27.00267099 | 9.816339016 | 130.3948445 | -| pytorch-eager | 1 | 2048 | 279.2695189| 7333.417583 | 37.15946153 | 26.91104658 | 37.15209011 | 26.91638605 | 10.0624733 | 228.9695517 | -| pytorch-eager | 1 | 3840 | 543.3611107| 7067.123363 | 37.07464412 | 26.97261225 | 37.09909786 | 26.95483335 | 10.48582697 | 390.6225052 | -| pytorch-eager | 4 | 16 | 37.02672482| 1728.481261 | 36.97393276 | 108.1843261 | 37.04812285 | 107.9676835 | 9.625072956 | 113.0381042 | -| pytorch-eager | 4 | 64 | 43.27553749| 5915.582217 | 36.61502711 | 109.2447641 | 36.72512993 | 108.9172457 | 
9.555830717 | 133.9496312 | -| pytorch-eager | 4 | 256 | 150.5873823| 6800.038517 | 36.66387312 | 109.0992211 | 36.71003506 | 108.9620316 | 9.732349157 | 210.4322365 | -| pytorch-eager | 4 | 1024 | 539.2320776| 7595.987276 | 36.48488037 | 109.6344557 | 36.48771811 | 109.6259291 | 10.81396365 | 473.4619206 | -| pytorch-eager | 4 | 2048 | 1109.398093| 7384.184316 | 35.74676253 | 111.8982452 | 35.64815875 | 112.2077588 | 15.03228784 | 613.0803309 | -| pytorch-eager | 4 | 3840 | OOM | OOM | OOM | OOM | OOM | OOM | OOM | OOM | -| pytorch-compile | 1 | 16 | 37.68274307| 424.5975398 | 36.89514473 | 27.1038373 | 36.82048898 | 27.15879196 | 9.5859828 | 28.37476404 | -| pytorch-compile | 1 | 64 | 36.81082249| 1738.619125 | 37.46636584 | 26.69060576 | 37.53986489 | 26.6383484 | 9.759324074 | 32.78915605 | -| pytorch-compile | 1 | 256 | 43.24804783| 5919.342325 | 37.30925731 | 26.80299936 | 37.14028466 | 26.92494172 | 9.664298773 | 52.97849456 | -| pytorch-compile | 1 | 1024 | 153.4820652| 6671.789298 | 37.37829812 | 26.75349201 | 37.32166439 | 26.79408907 | 9.916397095 | 129.0791391 | -| pytorch-compile | 1 | 2048 | 280.7492685| 7294.765221 | 37.29733266 | 26.81156878 | 37.18023375 | 26.89601165 | 10.1268518 | 227.5139447 | -| pytorch-compile | 1 | 3840 | 546.2423229| 7029.847082 | 37.31329553 | 26.80009862 | 37.28003707 | 26.82400766 | 10.63764501 | 385.0476301 | -| pytorch-compile | 4 | 16 | 37.21810818| 1719.593046 | 37.26841882 | 107.3294797 | 37.26031259 | 107.3528299 | 9.688776016 | 112.2948862 | -| pytorch-compile | 4 | 64 | 43.0194521 | 5950.796385 | 37.05189936 | 107.9566789 | 37.00789157 | 108.0850551 | 9.627130032 | 132.9575892 | -| pytorch-compile | 4 | 256 | 150.8078575| 6790.097127 | 36.91511415 | 108.3567014 | 36.73042823 | 108.9015346 | 9.759964943 | 209.8368193 | -| pytorch-compile | 4 | 1024 | 541.2075663| 7568.260784 | 36.83323227 | 108.597583 | 36.7437331 | 108.8621014 | 10.87742996 | 470.6994224 | -| pytorch-compile | 4 | 2048 | 1113.946185| 7354.035688 | 
36.22605093 | 110.4177766 | 36.3899637 | 109.9204174 | 15.18939614 | 606.7390641 | -| pytorch-compile | 4 | 3840 | OOM | OOM | OOM | OOM | OOM | OOM | OOM | OOM | - -## LLaMA-2 70B FP16 CUDA (4 A100 80GB) - -| Engine | Batch Size | Prompt Length | Prompt Latency (ms) | Prompt Throughput (tps) | First 128 Tokens Generated Avg Latency (ms) | First 128 Tokens Generated Avg Throughput (tps) | First 256 Tokens Generated Avg Latency (ms) | First 256 Tokens Generated Avg Throughput (tps) | Wall-Clock Latency (s) | Wall-Clock Throughput (tps) | -|-------------|------------|---------------|---------------------|-------------------------|---------------------------------------------|-------------------------------------------------|---------------------------------------------|-------------------------------------------------|------------------------|-----------------------------| -| onnxruntime | 1 | 16 | 33.90884399| 471.8533018 | 32.09379315 | 31.15867281 | 32.19444305 | 31.06126105 | 9.030313015 | 30.12077207 | -| onnxruntime | 1 | 64 | 41.69011116| 1535.136228 | 33.19698013 | 30.12322193 | 33.02684613 | 30.27839825 | 9.254086733 | 34.57931714 | -| onnxruntime | 1 | 256 | 68.52030754| 3736.118666 | 32.60990791 | 30.66552665 | 32.65001439 | 30.62785787 | 9.191497326 | 55.70365544 | -| onnxruntime | 1 | 1024 | 207.4344158| 4936.50003 | 34.06454809 | 29.35603306 | 34.11752358 | 29.31045091 | 9.773607016 | 130.9649547 | -| onnxruntime | 1 | 2048 | 384.3646049| 5328.274179 | 36.26316041 | 27.57619548 | 36.19885352 | 27.62518431 | 10.52682996 | 218.8693091 | -| onnxruntime | 1 | 3840 | 688.8821125| 5574.248381 | 39.21672702 | 25.49932327 | 39.3556226 | 25.40932995 | 11.68651938 | 350.4893001 | -| onnxruntime | 4 | 16 | 41.2812233 | 1550.341654 | 33.15291367 | 120.6530454 | 33.26787427 | 120.2361163 | 9.408552885 | 115.6394627 | -| onnxruntime | 4 | 64 | 67.01588631| 3819.989768 | 33.51482376 | 119.3501726 | 33.34233537 | 119.9676014 | 9.426578999 | 135.7862699 | -| onnxruntime 
| 4 | 256 | 207.9341412| 4924.636206 | 33.40258636 | 119.751206 | 33.4388027 | 119.6215079 | 9.585339069 | 213.6596301 | -| onnxruntime | 4 | 1024 | 723.0362892| 5664.9992 | 34.91823189 | 114.5533374 | 35.27628351 | 113.3906297 | 10.53820944 | 485.8510385 | -| onnxruntime | 4 | 2048 | 1404.65641 | 5832.031193 | 37.19914332 | 107.529358 | 37.12219838 | 107.7522392 | 11.71326399 | 786.8003324 | -| onnxruntime | 4 | 3840 | 2701.535702| 5685.655011 | 41.06120393 | 97.41555574 | 41.02838039 | 97.49349015 | 14.05006671 | 1166.11546 | -| onnxruntime | 16| 16 | 67.89302826| 3770.637524 | 33.8511169 | 472.6579643 | 34.15830154 | 468.4073645 | 9.626672506 | 452.0772881 | -| onnxruntime | 16| 64 | 204.0295601| 5018.880595 | 34.05243531 | 469.8636046 | 34.30210985 | 466.4436115 | 9.769803524 | 524.0637631 | -| onnxruntime | 16| 256 | 715.4211998| 5725.298609 | 35.19898839 | 454.558518 | 35.41505337 | 451.7852856 | 10.63756585 | 770.1009906 | -| onnxruntime | 16| 1024 | 2801.265717| 5848.784677 | 38.95713016 | 410.7078713 | 39.20867946 | 408.0729119 | 13.68341517 | 1496.702376 | -| onnxruntime | 16| 2048 | 5650.010824| 5799.634907 | 46.37654498 | 345.0019834 | 46.1302707 | 346.843835 | 18.42047048 | 2001.251816 | -| onnxruntime | 16| 3840 | 10810.8182 | 5683.196118 | 54.980563 | 291.0119345 | 54.17970195 | 295.3135477 | 25.82262492 | 2537.92944 |