NVIDIA
diff --git a/‎.gitignore
-1 b/‎.gitignore
-1
diff --git a/‎CHANGELOG.md
+320-187 b/‎CHANGELOG.md
+320-187
diff --git a/‎CMakeLists.txt
+11-25 b/‎CMakeLists.txt
+11-25
diff --git a/‎README.md
+168-147 b/‎README.md
+168-147
diff --git a/‎VERSION
+1-1 b/‎VERSION
+1-1
diff --git a/‎cmake/toolchains/cmake_aarch64_cross.toolchain
+2 b/‎cmake/toolchains/cmake_aarch64_cross.toolchain
+2
diff --git a/‎demo/BERT/README.md
+345-313 b/‎demo/BERT/README.md
+345-313
diff --git a/‎demo/DeBERTa/README.md
+3-3 b/‎demo/DeBERTa/README.md
+3-3
diff --git a/‎demo/Diffusion/.gitignore
+5-3 b/‎demo/Diffusion/.gitignore
+5-3
diff --git a/‎demo/Diffusion/README.md
+40-53 b/‎demo/Diffusion/README.md
+40-53
diff --git a/‎demo/Diffusion/calibration-images/rocket.png ‎demo/Diffusion/calibration_data/calibration-images/rocket.png b/‎demo/Diffusion/calibration-images/rocket.png ‎demo/Diffusion/calibration_data/calibration-images/rocket.png
diff --git a/‎demo/Diffusion/calibration-prompts.txt ‎demo/Diffusion/calibration_data/calibration-prompts.txt b/‎demo/Diffusion/calibration-prompts.txt ‎demo/Diffusion/calibration_data/calibration-prompts.txt
diff --git a/‎demo/Diffusion/demo_controlnet.py
+53-19 b/‎demo/Diffusion/demo_controlnet.py
+53-19
diff --git a/‎demo/Diffusion/utils_sd3/__init__.py ‎demo/Diffusion/demo_diffusion/__init__.py b/‎demo/Diffusion/utils_sd3/__init__.py ‎demo/Diffusion/demo_diffusion/__init__.py
@@ -3,7 +3,6 @@ build/
 /demo/BERT/engines
 /demo/BERT/squad/*.json
 /docker/jetpack_files/*
-*.nvmk
 *.sln
 *.vcxproj
 externals/
 
@@ -1,5 +1,5 @@
 #
-# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -176,43 +176,29 @@ set(CUDA_LIBRARIES ${CUDART_LIB})
 if (DEFINED GPU_ARCHS)
   message(STATUS "GPU_ARCHS defined as ${GPU_ARCHS}. Generating CUDA code for SM ${GPU_ARCHS}")
   separate_arguments(GPU_ARCHS)
+  # Mirror every user-requested SM into CMAKE_CUDA_ARCHITECTURES, which the
+  # gencode loop below iterates over. The loop variable must be dereferenced:
+  # a bare `SM` would append the literal string "SM" instead of its value.
+  foreach(SM IN LISTS GPU_ARCHS)
+    list(APPEND CMAKE_CUDA_ARCHITECTURES ${SM})
+  endforeach()
 else()
-  list(APPEND GPU_ARCHS
-      75
-    )
-
-  find_file(IS_L4T_NATIVE nv_tegra_release PATHS /env/)
-  set (IS_L4T_CROSS "False")
-  if (DEFINED ENV{IS_L4T_CROSS})
-    set(IS_L4T_CROSS $ENV{IS_L4T_CROSS})
+  list(APPEND CMAKE_CUDA_ARCHITECTURES 72 75 80 86 87 89 90)
+  
+  if(CUDA_VERSION VERSION_GREATER_EQUAL 12.8)
+      list(APPEND CMAKE_CUDA_ARCHITECTURES 100 120)
   endif()
 
-  if (IS_L4T_NATIVE OR ${IS_L4T_CROSS} STREQUAL "True")
-    # Only Orin (SM87) supported
-    list(APPEND GPU_ARCHS 87)
-  endif()
-
-  if (CUDA_VERSION VERSION_GREATER_EQUAL 11.0)
-    # Ampere GPU (SM80) support is only available in CUDA versions > 11.0
-    list(APPEND GPU_ARCHS 80)
-  endif()
-  if (CUDA_VERSION VERSION_GREATER_EQUAL 11.1)
-    list(APPEND GPU_ARCHS 86)
-  endif()
-
-  message(STATUS "GPU_ARCHS is not defined. Generating CUDA code for default SMs: ${GPU_ARCHS}")
+  message(STATUS "GPU_ARCHS is not defined. Generating CUDA code for default SMs: ${CMAKE_CUDA_ARCHITECTURES}")
 endif()
 set(BERT_GENCODES)
 # Generate SASS for each architecture
-foreach(arch ${GPU_ARCHS})
+foreach(arch ${CMAKE_CUDA_ARCHITECTURES})
     if (${arch} GREATER_EQUAL 75)
         set(BERT_GENCODES "${BERT_GENCODES} -gencode arch=compute_${arch},code=sm_${arch}")
     endif()
     set(GENCODES "${GENCODES} -gencode arch=compute_${arch},code=sm_${arch}")
 endforeach()
 
 # Generate PTX for the last architecture in the list.
-list(GET GPU_ARCHS -1 LATEST_SM)
+list(GET CMAKE_CUDA_ARCHITECTURES -1 LATEST_SM)
 set(GENCODES "${GENCODES} -gencode arch=compute_${LATEST_SM},code=compute_${LATEST_SM}")
 if (${LATEST_SM} GREATER_EQUAL 75)
     set(BERT_GENCODES "${BERT_GENCODES} -gencode arch=compute_${LATEST_SM},code=compute_${LATEST_SM}")
 
@@ -1 +1 @@
-10.8.0.43
+10.9.0.34
@@ -53,3 +53,5 @@ set(CMAKE_CUDA_COMPILER_FORCED TRUE)
 set(CUDA_LIBS -L${CUDA_ROOT}/lib)
 
 set(ADDITIONAL_PLATFORM_LIB_FLAGS ${CUDA_LIBS} -lcublas -lcudart -lstdc++ -lm)
+
+link_directories(${CUDA_ROOT}/lib)
@@ -75,7 +75,7 @@ Note that the performance gap between BERT's self-attention and DeBERTa's disent
 ## Environment Setup
 It is recommended to use docker for reproducing the following steps. Follow the setup steps in TensorRT OSS [README](https://github.com/NVIDIA/TensorRT#setting-up-the-build-environment) to build and launch the container and build OSS:
 
-**Example: Ubuntu 20.04 on x86-64 with cuda-12.5 (default)**
+**Example: Ubuntu 20.04 on x86-64 with cuda-12.8 (default)**
 ```bash
 # Download this TensorRT OSS repo
 git clone -b main https://github.com/nvidia/TensorRT TensorRT
@@ -84,10 +84,10 @@ git submodule update --init --recursive
 
 ## at root of TensorRT OSS
 # build container
-./docker/build.sh --file docker/ubuntu-20.04.Dockerfile --tag tensorrt-ubuntu20.04-cuda12.5
+./docker/build.sh --file docker/ubuntu-20.04.Dockerfile --tag tensorrt-ubuntu20.04-cuda12.8
 
 # launch container
-./docker/launch.sh --tag tensorrt-ubuntu20.04-cuda12.5 --gpus all
+./docker/launch.sh --tag tensorrt-ubuntu20.04-cuda12.8 --gpus all
 
 ## now inside container
 # build OSS (only required for pre-8.4.3 TensorRT versions)
 
@@ -1,4 +1,6 @@
 __pycache__/
-onnx/*.onnx
-engine/*.plan
-output/*.png
+onnx/
+engine/
+output/
+pytorch_model/
+artifacts_cache/
@@ -22,12 +22,14 @@
 from cuda import cudart
 from PIL import Image
 
-from stable_diffusion_pipeline import StableDiffusionPipeline
-from utilities import PIPELINE_TYPE, TRT_LOGGER, add_arguments, download_image, process_pipeline_args
+from demo_diffusion import dd_argparse
+from demo_diffusion import image as image_module
+from demo_diffusion import pipeline as pipeline_module
+
 
 def parseArgs():
     parser = argparse.ArgumentParser(description="Options for Stable Diffusion ControlNet Demo", conflict_handler='resolve')
-    parser = add_arguments(parser)
+    parser = dd_argparse.add_arguments(parser)
     parser.add_argument('--scheduler', type=str, default="UniPC", choices=["DDIM", "DPM", "EulerA", "LMSD", "PNDM", "UniPC"], help="Scheduler for diffusion process")
     parser.add_argument('--input-image', nargs = '+', type=str, default=[], help="Path to the input image/images already prepared for ControlNet modality. For example: canny edged image for canny ControlNet, not just regular rgb image")
     parser.add_argument('--controlnet-type', nargs='+', type=str, default=["canny"], help="Controlnet type, can be `None`, `str` or `str` list from ['canny', 'depth', 'hed', 'mlsd', 'normal', 'openpose', 'scribble', 'seg']")
@@ -41,15 +43,15 @@ def parseArgs():
     # Controlnet configuration
     if not isinstance(args.controlnet_type, list):
         raise ValueError(f"`--controlnet-type` must be of type `str` or `str` list, but is {type(args.controlnet_type)}")
-    
+
     # Controlnet configuration
     if not isinstance(args.controlnet_scale, list):
         raise ValueError(f"`--controlnet-scale`` must be of type `float` or `float` list, but is {type(args.controlnet_scale)}")
-    
+
     # Check number of ControlNets to ControlNet scales
     if len(args.controlnet_type) != len(args.controlnet_scale):
         raise ValueError(f"Numbers of ControlNets {len(args.controlnet_type)} should be equal to number of ControlNet scales {len(args.controlnet_scale)}.")
-    
+
     # Convert controlnet scales to tensor
     controlnet_scale = torch.FloatTensor(args.controlnet_scale)
 
@@ -61,48 +63,80 @@ def parseArgs():
     else:
         for controlnet in args.controlnet_type:
             if controlnet == "canny":
-                canny_image = download_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png")
-                canny_image = controlnet_aux.CannyDetector()(canny_image)
+                if args.version == "xl-1.0":
+                    canny_image = image_module.download_image(
+                        "https://huggingface.co/diffusers/controlnet-canny-sdxl-1.0/resolve/main/out_bird.png"
+                    )
+                    # "out_bird.png" has 5 images combined in a row. We pick the first image which is the input image.
+                    canny_image = canny_image.crop((0, 0, canny_image.width / 5, canny_image.height))
+                elif args.version == "1.5":
+                    canny_image = image_module.download_image(
+                        "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
+                    )
+                    canny_image = controlnet_aux.CannyDetector()(canny_image)
+                else:
+                    raise ValueError(
+                        f"This demo supports ControlNets for v1.5 and SDXL base pipelines only. Version provided: {args.version}"
+                    )
                 input_images.append(canny_image.resize((args.height, args.width)))
             elif controlnet == "normal":
-                normal_image = download_image("https://huggingface.co/lllyasviel/sd-controlnet-normal/resolve/main/images/toy.png")
+                normal_image = image_module.download_image(
+                    "https://huggingface.co/lllyasviel/sd-controlnet-normal/resolve/main/images/toy.png"
+                )
                 normal_image = controlnet_aux.NormalBaeDetector.from_pretrained("lllyasviel/Annotators")(normal_image)
                 input_images.append(normal_image.resize((args.height, args.width)))
             elif controlnet == "depth":
-                depth_image = download_image("https://huggingface.co/lllyasviel/sd-controlnet-depth/resolve/main/images/stormtrooper.png")
+                depth_image = image_module.download_image(
+                    "https://huggingface.co/lllyasviel/sd-controlnet-depth/resolve/main/images/stormtrooper.png"
+                )
                 depth_image = controlnet_aux.LeresDetector.from_pretrained("lllyasviel/Annotators")(depth_image)
                 input_images.append(depth_image.resize((args.height, args.width)))
             elif controlnet == "hed":
-                hed_image = download_image("https://huggingface.co/lllyasviel/sd-controlnet-hed/resolve/main/images/man.png")
+                hed_image = image_module.download_image(
+                    "https://huggingface.co/lllyasviel/sd-controlnet-hed/resolve/main/images/man.png"
+                )
                 hed_image = controlnet_aux.HEDdetector.from_pretrained("lllyasviel/Annotators")(hed_image)
                 input_images.append(hed_image.resize((args.height, args.width)))
             elif controlnet == "mlsd":
-                mlsd_image = download_image("https://huggingface.co/lllyasviel/sd-controlnet-mlsd/resolve/main/images/room.png")
+                mlsd_image = image_module.download_image(
+                    "https://huggingface.co/lllyasviel/sd-controlnet-mlsd/resolve/main/images/room.png"
+                )
                 mlsd_image = controlnet_aux.MLSDdetector.from_pretrained("lllyasviel/Annotators")(mlsd_image)
                 input_images.append(mlsd_image.resize((args.height, args.width)))
             elif controlnet == "openpose":
-                openpose_image = download_image("https://huggingface.co/lllyasviel/sd-controlnet-openpose/resolve/main/images/pose.png")
+                openpose_image = image_module.download_image(
+                    "https://huggingface.co/lllyasviel/sd-controlnet-openpose/resolve/main/images/pose.png"
+                )
                 openpose_image = controlnet_aux.OpenposeDetector.from_pretrained("lllyasviel/Annotators")(openpose_image)
                 input_images.append(openpose_image.resize((args.height, args.width)))
             elif controlnet == "scribble":
-                scribble_image = download_image("https://huggingface.co/lllyasviel/sd-controlnet-scribble/resolve/main/images/bag.png")
+                scribble_image = image_module.download_image(
+                    "https://huggingface.co/lllyasviel/sd-controlnet-scribble/resolve/main/images/bag.png"
+                )
                 scribble_image = controlnet_aux.HEDdetector.from_pretrained("lllyasviel/Annotators")(scribble_image, scribble=True)
                 input_images.append(scribble_image.resize((args.height, args.width)))
             elif controlnet == "seg":
-                seg_image = download_image("https://huggingface.co/lllyasviel/sd-controlnet-seg/resolve/main/images/house.png")
+                seg_image = image_module.download_image(
+                    "https://huggingface.co/lllyasviel/sd-controlnet-seg/resolve/main/images/house.png"
+                )
                 seg_image = controlnet_aux.SamDetector.from_pretrained("ybelkada/segment-anything", subfolder="checkpoints")(seg_image)
                 input_images.append(seg_image.resize((args.height, args.width)))
             else:
                 raise ValueError(f"You should implement the conditonal image of this controlnet: {controlnet}")
     assert len(input_images) > 0
 
-    kwargs_init_pipeline, kwargs_load_engine, args_run_demo = process_pipeline_args(args)
+    kwargs_init_pipeline, kwargs_load_engine, args_run_demo = dd_argparse.process_pipeline_args(args)
 
     # Initialize demo
-    demo = StableDiffusionPipeline(
-        pipeline_type=PIPELINE_TYPE.CONTROLNET,
+    demo = pipeline_module.StableDiffusionPipeline(
+        pipeline_type=(
+            pipeline_module.PIPELINE_TYPE.CONTROLNET
+            if args.version != "xl-1.0"
+            else pipeline_module.PIPELINE_TYPE.XL_CONTROLNET
+        ),
         controlnets=args.controlnet_type,
-        **kwargs_init_pipeline)
+        **kwargs_init_pipeline,
+    )
 
     # Load TensorRT engines and pytorch modules
     demo.loadEngines(