
Commit 2ac381c

Cherry pick SDXL demo update to 1.16.3 (#18496)
Co-authored-by: kunal-vaishnavi <[email protected]>
1 parent de0e87e commit 2ac381c

14 files changed: +448 −166 lines

Diff for: onnxruntime/python/tools/transformers/models/stable_diffusion/README.md

+126 −57 (large diff not rendered by default)

Diff for: onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img.py

+25 −2

@@ -53,8 +53,31 @@
             f"Batch size {len(prompt)} is larger than allowed {max_batch_size}. If dynamic shape is used, then maximum batch size is 4"
         )
 
-    pipeline_info = PipelineInfo(args.version)
-    pipeline = init_pipeline(Txt2ImgPipeline, pipeline_info, engine_type, args, max_batch_size, batch_size)
+    # For TensorRT, performance of engine built with dynamic shape is very sensitive to the range of image size.
+    # Here, we reduce the range of image size for TensorRT to trade-off flexibility and performance.
+    # This range can cover common used shape of landscape 512x768, portrait 768x512, or square 512x512 and 768x768.
+    min_image_size = 512 if args.engine != "ORT_CUDA" else 256
+    max_image_size = 768 if args.engine != "ORT_CUDA" else 1024
+    pipeline_info = PipelineInfo(args.version, min_image_size=min_image_size, max_image_size=max_image_size)
+
+    # Ideally, the optimized batch size and image size for TRT engine shall align with user's preference. That is to
+    # optimize the shape used most frequently. We can let user config it when we develop a UI plugin.
+    # In this demo, we optimize batch size 1 and image size 512x512 (or 768x768 for SD 2.0/2.1) for dynamic engine.
+    # This is mainly for benchmark purpose to simulate the case that we have no knowledge of user's preference.
+    opt_batch_size = 1 if args.build_dynamic_batch else batch_size
+    opt_image_height = pipeline_info.default_image_size() if args.build_dynamic_shape else args.height
+    opt_image_width = pipeline_info.default_image_size() if args.build_dynamic_shape else args.width
+
+    pipeline = init_pipeline(
+        Txt2ImgPipeline,
+        pipeline_info,
+        engine_type,
+        args,
+        max_batch_size,
+        opt_batch_size,
+        opt_image_height,
+        opt_image_width,
+    )
 
     if engine_type == EngineType.TRT:
         max_device_memory = max(pipeline.backend.max_device_memory(), pipeline.backend.max_device_memory())
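
The net effect of this hunk is easier to see in isolation. Below is a hedged, standalone sketch of the shape-selection logic; the function name `select_shapes` and the `default_size` parameter are illustrative stand-ins (in the real code, `PipelineInfo.default_image_size()` supplies the default resolution):

```python
# Illustrative sketch, not the demo's API: how engine type and dynamic-shape
# flags determine the engine's supported shape range and its optimized shape.
def select_shapes(engine, batch_size, height, width, default_size,
                  dynamic_batch=False, dynamic_shape=False):
    # TensorRT-based engines are built for a shape range, and a wide range
    # costs performance, so the range is narrowed; ORT_CUDA runs eagerly and
    # can afford a wider one.
    min_image_size = 512 if engine != "ORT_CUDA" else 256
    max_image_size = 768 if engine != "ORT_CUDA" else 1024
    # Dynamic engines are tuned for one representative shape (batch 1 at the
    # model's default resolution); static engines use the requested shape.
    opt_batch = 1 if dynamic_batch else batch_size
    opt_height = default_size if dynamic_shape else height
    opt_width = default_size if dynamic_shape else width
    return (min_image_size, max_image_size), (opt_batch, opt_height, opt_width)

# e.g. a dynamic-shape ORT_TRT build for SD 1.5 (default size 512):
print(select_shapes("ORT_TRT", 1, 768, 512, 512, dynamic_shape=True))
# ((512, 768), (1, 512, 512))
```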

Diff for: onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py

+148 −28
@@ -29,17 +29,7 @@
 from pipeline_txt2img_xl import Txt2ImgXLPipeline
 
 
-def run_demo():
-    """Run Stable Diffusion XL Base + Refiner together (known as ensemble of expert denoisers) to generate an image."""
-
-    args = parse_arguments(is_xl=True, description="Options for Stable Diffusion XL Demo")
-
-    prompt, negative_prompt = repeat_prompt(args)
-
-    # Recommend image size as one of those used in training (see Appendix I in https://arxiv.org/pdf/2307.01952.pdf).
-    image_height = args.height
-    image_width = args.width
-
+def load_pipelines(args, batch_size):
     # Register TensorRT plugins
     engine_type = get_engine_type(args.engine)
     if engine_type == EngineType.TRT:
@@ -49,37 +39,83 @@ def run_demo():
 
     max_batch_size = 16
     if (engine_type in [EngineType.ORT_TRT, EngineType.TRT]) and (
-        args.build_dynamic_shape or image_height > 512 or image_width > 512
+        args.build_dynamic_shape or args.height > 512 or args.width > 512
     ):
         max_batch_size = 4
 
-    batch_size = len(prompt)
     if batch_size > max_batch_size:
         raise ValueError(f"Batch size {batch_size} is larger than allowed {max_batch_size}.")
 
+    # For TensorRT, performance of engine built with dynamic shape is very sensitive to the range of image size.
+    # Here, we reduce the range of image size for TensorRT to trade-off flexibility and performance.
+    # This range can cover most frequent shape of landscape (832x1216), portrait (1216x832) or square (1024x1024).
+    min_image_size = 832 if args.engine != "ORT_CUDA" else 512
+    max_image_size = 1216 if args.engine != "ORT_CUDA" else 2048
+
     # No VAE decoder in base when it outputs latent instead of image.
-    base_info = PipelineInfo(args.version, use_vae=False)
-    base = init_pipeline(Txt2ImgXLPipeline, base_info, engine_type, args, max_batch_size, batch_size)
+    base_info = PipelineInfo(
+        args.version, use_vae=args.disable_refiner, min_image_size=min_image_size, max_image_size=max_image_size
+    )
 
-    refiner_info = PipelineInfo(args.version, is_refiner=True)
-    refiner = init_pipeline(Img2ImgXLPipeline, refiner_info, engine_type, args, max_batch_size, batch_size)
+    # Ideally, the optimized batch size and image size for TRT engine shall align with user's preference. That is to
+    # optimize the shape used most frequently. We can let user config it when we develop a UI plugin.
+    # In this demo, we optimize batch size 1 and image size 1024x1024 for SD XL dynamic engine.
+    # This is mainly for benchmark purpose to simulate the case that we have no knowledge of user's preference.
+    opt_batch_size = 1 if args.build_dynamic_batch else batch_size
+    opt_image_height = base_info.default_image_size() if args.build_dynamic_shape else args.height
+    opt_image_width = base_info.default_image_size() if args.build_dynamic_shape else args.width
+
+    base = init_pipeline(
+        Txt2ImgXLPipeline,
+        base_info,
+        engine_type,
+        args,
+        max_batch_size,
+        opt_batch_size,
+        opt_image_height,
+        opt_image_width,
+    )
+
+    refiner = None
+    if not args.disable_refiner:
+        refiner_info = PipelineInfo(
+            args.version, is_refiner=True, min_image_size=min_image_size, max_image_size=max_image_size
+        )
+        refiner = init_pipeline(
+            Img2ImgXLPipeline,
+            refiner_info,
+            engine_type,
+            args,
+            max_batch_size,
+            opt_batch_size,
+            opt_image_height,
+            opt_image_width,
+        )
 
     if engine_type == EngineType.TRT:
-        max_device_memory = max(base.backend.max_device_memory(), refiner.backend.max_device_memory())
+        max_device_memory = max(base.backend.max_device_memory(), (refiner or base).backend.max_device_memory())
         _, shared_device_memory = cudart.cudaMalloc(max_device_memory)
         base.backend.activate_engines(shared_device_memory)
-        refiner.backend.activate_engines(shared_device_memory)
+        if refiner:
+            refiner.backend.activate_engines(shared_device_memory)
 
     if engine_type == EngineType.ORT_CUDA:
         enable_vae_slicing = args.enable_vae_slicing
         if batch_size > 4 and not enable_vae_slicing:
             print("Updating enable_vae_slicing to be True to avoid cuDNN error for batch size > 4.")
             enable_vae_slicing = True
         if enable_vae_slicing:
-            refiner.backend.enable_vae_slicing()
+            (refiner or base).backend.enable_vae_slicing()
+    return base, refiner
+
 
+def run_pipelines(args, base, refiner, prompt, negative_prompt, is_warm_up=False):
+    image_height = args.height
+    image_width = args.width
+    batch_size = len(prompt)
     base.load_resources(image_height, image_width, batch_size)
-    refiner.load_resources(image_height, image_width, batch_size)
+    if refiner:
+        refiner.load_resources(image_height, image_width, batch_size)
 
     def run_base_and_refiner(warmup=False):
         images, time_base = base.run(
@@ -91,8 +127,13 @@ def run_base_and_refiner(warmup=False):
             denoising_steps=args.denoising_steps,
             guidance=args.guidance,
             seed=args.seed,
-            return_type="latent",
+            return_type="latent" if refiner else "image",
         )
+        if refiner is None:
+            return images, time_base
+
+        # Use same seed in base and refiner.
+        seed = base.get_current_seed()
 
         images, time_refiner = refiner.run(
             prompt,
@@ -103,7 +144,7 @@
             warmup=warmup,
             denoising_steps=args.denoising_steps,
             guidance=args.guidance,
-            seed=args.seed,
+            seed=seed,
         )
 
         return images, time_base + time_refiner
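
The seed change in the two hunks above fixes a reproducibility gap: when `--seed` is not given, the base pipeline draws a random seed, and the refiner previously drew its own, so the two stages did not share noise state. A minimal model of the fix (the `Pipe` class is a stand-in for illustration; only the `get_current_seed` name mirrors the diff):

```python
import random

class Pipe:
    """Stand-in pipeline that draws a seed when none is supplied."""

    def run(self, seed=None):
        self.seed = seed if seed is not None else random.randrange(2**31)
        return self.seed  # stands in for generated latents/images

    def get_current_seed(self):
        return self.seed

base, refiner = Pipe(), Pipe()
base.run(seed=None)  # user passed no --seed, so base picks one at random
# Old behavior: refiner.run(seed=None) would pick a different random seed.
refiner.run(seed=base.get_current_seed())  # new behavior: stages share a seed
assert base.seed == refiner.seed
```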
@@ -112,25 +153,104 @@ def run_base_and_refiner(warmup=False):
         # inference once to get cuda graph
         _, _ = run_base_and_refiner(warmup=True)
 
-    print("[I] Warming up ..")
+    if args.num_warmup_runs > 0:
+        print("[I] Warming up ..")
     for _ in range(args.num_warmup_runs):
         _, _ = run_base_and_refiner(warmup=True)
 
+    if is_warm_up:
+        return
+
     print("[I] Running StableDiffusion XL pipeline")
     if args.nvtx_profile:
         cudart.cudaProfilerStart()
     _, latency = run_base_and_refiner(warmup=False)
     if args.nvtx_profile:
         cudart.cudaProfilerStop()
 
-    base.teardown()
-
     print("|------------|--------------|")
     print("| {:^10} | {:>9.2f} ms |".format("e2e", latency))
     print("|------------|--------------|")
-    refiner.teardown()
+
+
+def run_demo(args):
+    """Run Stable Diffusion XL Base + Refiner together (known as ensemble of expert denoisers) to generate an image."""
+
+    prompt, negative_prompt = repeat_prompt(args)
+    batch_size = len(prompt)
+    base, refiner = load_pipelines(args, batch_size)
+    run_pipelines(args, base, refiner, prompt, negative_prompt)
+    base.teardown()
+    if refiner:
+        refiner.teardown()
+
+
+def run_dynamic_shape_demo(args):
+    """Run demo of generating images with different settings with ORT CUDA provider."""
+    args.engine = "ORT_CUDA"
+    args.disable_cuda_graph = True
+    base, refiner = load_pipelines(args, 1)
+
+    prompts = [
+        "starry night over Golden Gate Bridge by van gogh",
+        "beautiful photograph of Mt. Fuji during cherry blossom",
+        "little cute gremlin sitting on a bed, cinematic",
+        "cute grey cat with blue eyes, wearing a bowtie, acrylic painting",
+        "beautiful Renaissance Revival Estate, Hobbit-House, detailed painting, warm colors, 8k, trending on Artstation",
+        "blue owl, big green eyes, portrait, intricate metal design, unreal engine, octane render, realistic",
+    ]
+
+    # batch size, height, width, scheduler, steps, prompt, seed
+    configs = [
+        (1, 832, 1216, "UniPC", 8, prompts[0], None),
+        (1, 1024, 1024, "DDIM", 24, prompts[1], None),
+        (1, 1216, 832, "UniPC", 16, prompts[2], None),
+        (1, 1344, 768, "DDIM", 24, prompts[3], None),
+        (2, 640, 1536, "UniPC", 16, prompts[4], 4312973633252712),
+        (2, 1152, 896, "DDIM", 24, prompts[5], 1964684802882906),
+    ]
+
+    # Warm up each combination of (batch size, height, width) once before serving.
+    args.prompt = ["warm up"]
+    args.num_warmup_runs = 1
+    for batch_size, height, width, _, _, _, _ in configs:
+        args.batch_size = batch_size
+        args.height = height
+        args.width = width
+        print(f"\nWarm up batch_size={batch_size}, height={height}, width={width}")
+        prompt, negative_prompt = repeat_prompt(args)
+        run_pipelines(args, base, refiner, prompt, negative_prompt, is_warm_up=True)
+
+    # Run pipeline on a list of prompts.
+    args.num_warmup_runs = 0
+    for batch_size, height, width, scheduler, steps, example_prompt, seed in configs:
+        args.prompt = [example_prompt]
+        args.batch_size = batch_size
+        args.height = height
+        args.width = width
+        args.scheduler = scheduler
+        args.denoising_steps = steps
+        args.seed = seed
+        base.set_scheduler(scheduler)
+        if refiner:
+            refiner.set_scheduler(scheduler)
+        print(
+            f"\nbatch_size={batch_size}, height={height}, width={width}, scheduler={scheduler}, steps={steps}, prompt={example_prompt}, seed={seed}"
+        )
+        prompt, negative_prompt = repeat_prompt(args)
+        run_pipelines(args, base, refiner, prompt, negative_prompt, is_warm_up=False)
+
+    base.teardown()
+    if refiner:
+        refiner.teardown()
 
 
 if __name__ == "__main__":
     coloredlogs.install(fmt="%(funcName)20s: %(message)s")
-    run_demo()
+
+    args = parse_arguments(is_xl=True, description="Options for Stable Diffusion XL Demo")
+    no_prompt = isinstance(args.prompt, list) and len(args.prompt) == 1 and not args.prompt[0]
+    if no_prompt:
+        run_dynamic_shape_demo(args)
+    else:
+        run_demo(args)
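
Two behavioral notes on the new entry point: running the script with no positional prompt now routes to `run_dynamic_shape_demo` (possible because `prompt` defaults to `[""]`, per the demo_utils.py diff below), and that demo warms up every `(batch_size, height, width)` combination before the timed runs. A standalone sketch of the warm-up-then-run pattern, where `render` is a stand-in for `run_pipelines` and the tuples are abbreviated from the configs above:

```python
# Touch each shape once so first-use costs (buffer allocation, kernel
# selection) are paid outside the timed loop; an assumption-level sketch,
# not the demo's API.
def warm_up_then_run(render, configs):
    seen = set()
    for batch, height, width, *_ in configs:
        if (batch, height, width) not in seen:
            seen.add((batch, height, width))
            render(["warm up"] * batch, height, width, warmup=True)
    for batch, height, width, scheduler, steps, prompt, seed in configs:
        render([prompt] * batch, height, width,
               scheduler=scheduler, steps=steps, seed=seed)

configs = [
    (1, 832, 1216, "UniPC", 8, "starry night", None),
    (1, 832, 1216, "UniPC", 16, "cherry blossom", 42),
]
warm_up_then_run(lambda p, h, w, **kw: print(len(p), h, w, kw), configs)
```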

Diff for: onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py

+19 −15
@@ -78,13 +78,13 @@ def parse_arguments(is_xl: bool, description: str):
         help="Root Directory to store torch or ONNX models, built engines and output images etc.",
     )
 
-    parser.add_argument("prompt", nargs="+", help="Text prompt(s) to guide image generation.")
+    parser.add_argument("prompt", nargs="*", default=[""], help="Text prompt(s) to guide image generation.")
 
     parser.add_argument(
         "--negative-prompt", nargs="*", default=[""], help="Optional negative prompt(s) to guide the image generation."
     )
     parser.add_argument(
-        "--repeat-prompt",
+        "--batch-size",
         type=int,
         default=1,
         choices=[1, 2, 4, 8, 16],
@@ -145,6 +145,10 @@
     parser.add_argument("--seed", type=int, default=None, help="Seed for random generator to get consistent results.")
     parser.add_argument("--disable-cuda-graph", action="store_true", help="Disable cuda graph.")
 
+    parser.add_argument(
+        "--disable-refiner", action="store_true", help="Disable refiner and only run base for XL pipeline."
+    )
+
     group = parser.add_argument_group("Options for ORT_CUDA engine only")
     group.add_argument("--enable-vae-slicing", action="store_true", help="True will feed only one image to VAE once.")
 
@@ -174,9 +178,9 @@
     )
 
     # Validate image dimensions
-    if args.height % 8 != 0 or args.width % 8 != 0:
+    if args.height % 64 != 0 or args.width % 64 != 0:
         raise ValueError(
-            f"Image height and width have to be divisible by 8 but specified as: {args.height} and {args.width}."
+            f"Image height and width have to be divisible by 64 but specified as: {args.height} and {args.width}."
         )
 
     if (args.build_dynamic_batch or args.build_dynamic_shape) and not args.disable_cuda_graph:
@@ -194,7 +198,7 @@
 def repeat_prompt(args):
     if not isinstance(args.prompt, list):
         raise ValueError(f"`prompt` must be of type `str` or `str` list, but is {type(args.prompt)}")
-    prompt = args.prompt * args.repeat_prompt
+    prompt = args.prompt * args.batch_size
 
     if not isinstance(args.negative_prompt, list):
         raise ValueError(
@@ -209,7 +213,9 @@
     return prompt, negative_prompt
 
 
-def init_pipeline(pipeline_class, pipeline_info, engine_type, args, max_batch_size, batch_size):
+def init_pipeline(
+    pipeline_class, pipeline_info, engine_type, args, max_batch_size, opt_batch_size, opt_image_height, opt_image_width
+):
     onnx_dir, engine_dir, output_dir, framework_model_dir, timing_cache = get_engine_paths(
         work_dir=args.work_dir, pipeline_info=pipeline_info, engine_type=engine_type
     )
@@ -234,9 +240,6 @@ def init_pipeline(pipeline_class, pipeline_info, engine_type, args, max_batch_size, batch_size):
             engine_dir=engine_dir,
             framework_model_dir=framework_model_dir,
             onnx_dir=onnx_dir,
-            opt_image_height=args.height,
-            opt_image_width=args.height,
-            opt_batch_size=batch_size,
             force_engine_rebuild=args.force_engine_build,
             device_id=torch.cuda.current_device(),
         )
@@ -247,14 +250,15 @@ def init_pipeline(pipeline_class, pipeline_info, engine_type, args, max_batch_size, batch_size):
             framework_model_dir,
             onnx_dir,
             args.onnx_opset,
-            opt_image_height=args.height,
-            opt_image_width=args.height,
-            opt_batch_size=batch_size,
+            opt_image_height=opt_image_height,
+            opt_image_width=opt_image_width,
+            opt_batch_size=opt_batch_size,
             force_engine_rebuild=args.force_engine_build,
             static_batch=not args.build_dynamic_batch,
             static_image_shape=not args.build_dynamic_shape,
             max_workspace_size=0,
             device_id=torch.cuda.current_device(),
+            timing_cache=timing_cache,
         )
     elif engine_type == EngineType.TRT:
         # Load TensorRT engines and pytorch modules
@@ -263,9 +267,9 @@ def init_pipeline(pipeline_class, pipeline_info, engine_type, args, max_batch_size, batch_size):
             framework_model_dir,
             onnx_dir,
             args.onnx_opset,
-            opt_batch_size=batch_size,
-            opt_image_height=args.height,
-            opt_image_width=args.height,
+            opt_batch_size=opt_batch_size,
+            opt_image_height=opt_image_height,
+            opt_image_width=opt_image_width,
             force_export=args.force_onnx_export,
             force_optimize=args.force_onnx_optimize,
             force_build=args.force_engine_build,
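
Note that the dimension check tightened from 8 to 64. A plausible rationale (an assumption based on the standard SD/SDXL architecture, not stated in the diff): the VAE maps pixels to latents at 1/8 scale and the UNet downsamples the latent several more times, so 64-pixel alignment keeps every intermediate feature map at an integer size for the prebuilt engines. A self-contained sketch of the new check:

```python
def validate_dimensions(height: int, width: int) -> None:
    # Mirrors the updated validation in parse_arguments.
    if height % 64 != 0 or width % 64 != 0:
        raise ValueError(
            f"Image height and width have to be divisible by 64 but specified as: {height} and {width}."
        )

for h, w in [(512, 512), (832, 1216), (1024, 1024)]:
    validate_dimensions(h, w)  # multiples of 64, all pass

try:
    validate_dimensions(1000, 1000)  # 1000 % 64 == 40
except ValueError as e:
    print(e)
```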
