
Commit c98de3c: vae encoder
1 parent 32c3b9c

3 files changed, +108 -7 lines

.gitignore (+1)

```diff
@@ -2,5 +2,6 @@ venv/
 models/
 __pycache__/
 *.png
+*.jpg
 *.latent
 tests.py
```

sd3_impls.py (+76 -1)
```diff
@@ -221,6 +221,18 @@ def forward(self, x):
         return x + hidden


+class Downsample(torch.nn.Module):
+    def __init__(self, in_channels, dtype=torch.float32, device=None):
+        super().__init__()
+        self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0, dtype=dtype, device=device)
+
+    def forward(self, x):
+        pad = (0,1,0,1)
+        x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
+        x = self.conv(x)
+        return x
+
+
 class Upsample(torch.nn.Module):
     def __init__(self, in_channels, dtype=torch.float32, device=None):
         super().__init__()
```
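A quick shape check of the new Downsample (a hypothetical snippet, not part of the commit): the asymmetric (0,1,0,1) pad plus the stride-2, padding-0 conv halves the spatial dimensions.

```python
import torch

down = Downsample(in_channels=8)
x = torch.randn(1, 8, 64, 64)
# pad makes it 65x65, then the 3x3 stride-2 conv yields (65 - 3) // 2 + 1 = 32
print(down(x).shape)  # torch.Size([1, 8, 32, 32])
```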
```diff
@@ -232,6 +244,61 @@ def forward(self, x):
         return x


+class VAEEncoder(torch.nn.Module):
+    def __init__(self, ch=128, ch_mult=(1,2,4,4), num_res_blocks=2, in_channels=3, z_channels=16, dtype=torch.float32, device=None):
+        super().__init__()
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        # downsampling
+        self.conv_in = torch.nn.Conv2d(in_channels, ch, kernel_size=3, stride=1, padding=1, dtype=dtype, device=device)
+        in_ch_mult = (1,) + tuple(ch_mult)
+        self.in_ch_mult = in_ch_mult
+        self.down = torch.nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            block = torch.nn.ModuleList()
+            attn = torch.nn.ModuleList()
+            block_in = ch*in_ch_mult[i_level]
+            block_out = ch*ch_mult[i_level]
+            for i_block in range(num_res_blocks):
+                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out, dtype=dtype, device=device))
+                block_in = block_out
+            down = torch.nn.Module()
+            down.block = block
+            down.attn = attn
+            if i_level != self.num_resolutions - 1:
+                down.downsample = Downsample(block_in, dtype=dtype, device=device)
+            self.down.append(down)
+        # middle
+        self.mid = torch.nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in, dtype=dtype, device=device)
+        self.mid.attn_1 = AttnBlock(block_in, dtype=dtype, device=device)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in, dtype=dtype, device=device)
+        # end
+        self.norm_out = Normalize(block_in, dtype=dtype, device=device)
+        self.conv_out = torch.nn.Conv2d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1, dtype=dtype, device=device)
+        self.swish = torch.nn.SiLU(inplace=True)
+
+    def forward(self, x):
+        # downsampling
+        hs = [self.conv_in(x)]
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks):
+                h = self.down[i_level].block[i_block](hs[-1])
+                hs.append(h)
+            if i_level != self.num_resolutions-1:
+                hs.append(self.down[i_level].downsample(hs[-1]))
+        # middle
+        h = hs[-1]
+        h = self.mid.block_1(h)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h)
+        # end
+        h = self.norm_out(h)
+        h = self.swish(h)
+        h = self.conv_out(h)
+        return h
+
+
 class VAEDecoder(torch.nn.Module):
     def __init__(self, ch=128, out_ch=3, ch_mult=(1, 2, 4, 4), num_res_blocks=2, resolution=256, z_channels=16, dtype=torch.float32, device=None):
         super().__init__()
```
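For orientation, a hypothetical shape walk-through (assumes the ResnetBlock, AttnBlock, and Normalize classes defined earlier in sd3_impls.py):

```python
import torch

# The defaults give three Downsample stages (ch_mult has 4 levels, with no
# downsample on the last), so H and W shrink by 8x; conv_out emits
# 2 * z_channels = 32 channels: 16 for the mean and 16 for the logvar.
enc = VAEEncoder()
x = torch.randn(1, 3, 512, 512)
print(enc(x).shape)  # torch.Size([1, 32, 64, 64])
```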
```diff
@@ -286,11 +353,19 @@ def forward(self, z):


 class SDVAE(torch.nn.Module):
-    """Note that the VAE Encoder is not included in our current reference SD3 models. Might be added on release. Not needed for most gens anyway, only for img2img (Init Image), so for this codebase we'll just ignore it, and implement only the decoder."""
     def __init__(self, dtype=torch.float32, device=None):
         super().__init__()
+        self.encoder = VAEEncoder(dtype=dtype, device=device)
         self.decoder = VAEDecoder(dtype=dtype, device=device)

     @torch.autocast("cuda", dtype=torch.float16)
     def decode(self, latent):
         return self.decoder(latent)
+
+    @torch.autocast("cuda", dtype=torch.float16)
+    def encode(self, image):
+        hidden = self.encoder(image)
+        mean, logvar = torch.chunk(hidden, 2, dim=1)
+        logvar = torch.clamp(logvar, -30.0, 20.0)
+        std = torch.exp(0.5 * logvar)
+        return mean + std * torch.randn_like(mean)
```
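SDVAE.encode samples the latent via the reparameterization trick: the 32 encoder channels split into mean and logvar, logvar is clamped for numerical stability, and a sample mean + exp(0.5 * logvar) * eps is drawn. A minimal round-trip sketch (hypothetical; assumes loaded weights and a CUDA device, since both methods autocast to float16 on cuda):

```python
import torch

vae = SDVAE().cuda().eval()
img = torch.rand(1, 3, 512, 512).cuda() * 2.0 - 1.0  # encoder expects inputs in [-1, 1]
with torch.no_grad():
    z = vae.encode(img)    # [1, 16, 64, 64], one sample from the predicted Gaussian
    out = vae.decode(z)    # [1, 3, 512, 512]
```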

sd3_infer.py (+31 -6)
```diff
@@ -22,7 +22,7 @@
 def load_into(f, model, prefix, device, dtype=None):
     """Just a debugging-friendly hack to apply the weights in a safetensors file to the pytorch module."""
     for key in f.keys():
-        if key.startswith(prefix):
+        if key.startswith(prefix) and not key.startswith("loss."):
             path = key[len(prefix):].split(".")
             obj = model
             for p in path:
```
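The new `loss.` filter presumably skips training-only keys: a VAE checkpoint can carry loss/discriminator weights that this inference-only module never defines, so loading no longer trips over them.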
```diff
@@ -133,6 +133,10 @@ def __init__(self, model):
 MODEL = "models/sd3_beta.safetensors"
 # VAE model file path, or set "None" to use the same model file
 VAEFile = "models/sd3_vae.safetensors"
+# Optional init image file path
+INIT_IMAGE = None
+# If init_image is given, this is the percentage of denoising steps to run (1.0 = full denoise, 0.0 = no denoise at all)
+DENOISE = 0.6
 # Output file path
 OUTPUT = "output.png"
```
```diff
@@ -194,12 +198,13 @@ def fix_cond(self, cond):
         cond, pooled = (cond[0].half().cuda(), cond[1].half().cuda())
         return { "c_crossattn": cond, "y": pooled }

-    def do_sampling(self, latent, seed, conditioning, neg_cond, steps, cfg_scale) -> torch.Tensor:
+    def do_sampling(self, latent, seed, conditioning, neg_cond, steps, cfg_scale, denoise=1.0) -> torch.Tensor:
         print("Sampling...")
         latent = latent.half().cuda()
         self.sd3.model = self.sd3.model.cuda()
         noise = self.get_noise(seed, latent).cuda()
         sigmas = self.get_sigmas(self.sd3.model.model_sampling, steps).cuda()
+        sigmas = sigmas[int(steps * (1 - denoise)):]
         conditioning = self.fix_cond(conditioning)
         neg_cond = self.fix_cond(neg_cond)
         extra_args = { "cond": conditioning, "uncond": neg_cond, "cond_scale": cfg_scale }
```
```diff
@@ -210,6 +215,21 @@ def do_sampling(self, latent, seed, conditioning, neg_cond, steps, cfg_scale) ->
         print("Sampling done")
         return latent

+    def vae_encode(self, image) -> torch.Tensor:
+        print("Encoding image to latent...")
+        image = image.convert("RGB")
+        image_np = np.array(image).astype(np.float32) / 255.0
+        image_np = np.moveaxis(image_np, 2, 0)
+        batch_images = np.expand_dims(image_np, axis=0).repeat(1, axis=0)
+        image_torch = torch.from_numpy(batch_images)
+        image_torch = 2.0 * image_torch - 1.0
+        image_torch = image_torch.cuda()
+        self.vae.model = self.vae.model.cuda()
+        latent = self.vae.model.encode(image_torch).cpu()
+        self.vae.model = self.vae.model.cpu()
+        print("Encoded")
+        return latent
+
     def vae_decode(self, latent) -> Image.Image:
         print("Decoding latent to image...")
         latent = latent.cuda()
```
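Note that `repeat(1, axis=0)` is a no-op at batch size 1, presumably left as a hook for batching. A hypothetical check of the preprocessing, showing a PIL image becoming a [1, 3, H, W] tensor in [-1, 1]:

```python
import numpy as np
import torch
from PIL import Image

img = Image.new("RGB", (64, 64), color=(255, 128, 0))
arr = np.moveaxis(np.array(img).astype(np.float32) / 255.0, 2, 0)
t = torch.from_numpy(arr)[None] * 2.0 - 1.0
print(t.shape, t.min().item(), t.max().item())  # torch.Size([1, 3, 64, 64]) -1.0 1.0
```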
```diff
@@ -224,20 +244,25 @@ def vae_decode(self, latent) -> Image.Image:
         print("Decoded")
         return out_image

-    def gen_image(self, prompt=PROMPT, width=WIDTH, height=HEIGHT, steps=STEPS, cfg_scale=CFG_SCALE, seed=SEED, output=OUTPUT):
+    def gen_image(self, prompt=PROMPT, width=WIDTH, height=HEIGHT, steps=STEPS, cfg_scale=CFG_SCALE, seed=SEED, output=OUTPUT, init_image=INIT_IMAGE, denoise=DENOISE):
         latent = self.get_empty_latent(width, height)
+        if init_image:
+            image_data = Image.open(init_image)
+            image_data = image_data.resize((width, height), Image.LANCZOS)
+            latent = self.vae_encode(image_data)
+            latent = SD3LatentFormat().process_in(latent)
         conditioning = self.get_cond(prompt)
         neg_cond = self.get_cond("")
-        sampled_latent = self.do_sampling(latent, seed, conditioning, neg_cond, steps, cfg_scale)
+        sampled_latent = self.do_sampling(latent, seed, conditioning, neg_cond, steps, cfg_scale, denoise if init_image else 1.0)
         image = self.vae_decode(sampled_latent)
         print(f"Will save to {output}")
         image.save(output)
         print("Done")

 @torch.no_grad()
-def main(prompt=PROMPT, width=WIDTH, height=HEIGHT, steps=STEPS, cfg_scale=CFG_SCALE, shift=SHIFT, model=MODEL, vae=VAEFile, seed=SEED, output=OUTPUT):
+def main(prompt=PROMPT, width=WIDTH, height=HEIGHT, steps=STEPS, cfg_scale=CFG_SCALE, shift=SHIFT, model=MODEL, vae=VAEFile, seed=SEED, output=OUTPUT, init_image=INIT_IMAGE, denoise=DENOISE):
     inferencer = SD3Inferencer()
     inferencer.load(model, vae, shift)
-    inferencer.gen_image(prompt, width, height, steps, cfg_scale, seed, output)
+    inferencer.gen_image(prompt, width, height, steps, cfg_scale, seed, output, init_image, denoise)

 fire.Fire(main)
```
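Since fire.Fire(main) exposes main()'s keyword arguments as flags, an img2img run presumably looks like `python sd3_infer.py --init_image input.png --denoise 0.6` (hypothetical file name); denoise below 1.0 skips the noisiest steps, so the output keeps part of the init image's structure.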
