Update to 0.0.18: drop pypesq dependency, add null-source support to SourceMixConditioner

zqevans · zqevans · commit 330a7a89252a · 2025-02-18T20:33:50.000Z
diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name='stable-audio-tools',
-    version='0.0.17',
+    version='0.0.18',
     url='https://github.com/Stability-AI/stable-audio-tools.git',
     author='Stability AI',
     description='Training and inference tools for generative audio models from Stability AI',
@@ -25,7 +25,6 @@
         'prefigure==0.0.9',
         'pytorch_lightning==2.1.0',
         'PyWavelets==1.4.1',
-        'pypesq==1.2.4',
         'safetensors',
         'sentencepiece==0.1.99',
         'torch>=2.0.1',
diff --git a/stable_audio_tools/models/conditioners.py b/stable_audio_tools/models/conditioners.py
@@ -545,7 +545,16 @@ class SourceMixConditioner(Conditioner):
         source_keys: a list of keys for the potential sources in the metadata
 
     """
-    def __init__(self, pretransform: Pretransform, output_dim: int, save_pretransform: bool = False, source_keys: tp.List[str] = [], pre_encoded: bool = False):
+    def __init__(
+        self, 
+        pretransform: Pretransform, 
+        output_dim: int, 
+        save_pretransform: bool = False, 
+        source_keys: tp.List[str] = [], 
+        pre_encoded: bool = False, 
+        allow_null_source=False,
+        source_length=None
+    ):
         super().__init__(pretransform.encoded_channels, output_dim)
 
         if not save_pretransform:
@@ -559,16 +568,28 @@ def __init__(self, pretransform: Pretransform, output_dim: int, save_pretransfor
 
         self.pre_encoded = pre_encoded
 
+        self.allow_null_source = allow_null_source
+
+        if self.allow_null_source:
+            self.null_source = nn.Parameter(torch.randn(output_dim, 1))
+
+            assert source_length is not None, "Source length must be specified if allowing null sources"
+
+            self.source_length = source_length
+
     def forward(self, sources: tp.List[tp.Dict[str, torch.Tensor]], device: tp.Union[torch.device, str]) -> tp.Tuple[torch.Tensor, torch.Tensor]:
 
         self.pretransform.to(device)
         self.proj_out.to(device)
 
+        dtype = next(self.proj_out.parameters()).dtype
+
         # Output has to be the batch of summed projections
         # Input is per-batch-item list of source audio
 
         mixes = []
 
+        num_null_sources = 0
         for source_dict in sources: # Iterate over batch items
 
             mix = None
@@ -579,14 +600,16 @@ def forward(self, sources: tp.List[tp.Dict[str, torch.Tensor]], device: tp.Union
                     source = source_dict[key]
 
                     if not self.pre_encoded:
-                        audio = set_audio_channels(source, self.pretransform.io_channels)
+                        assert source.dim() == 2, f"Source audio must be shape [channels, samples], got shape: {source.shape}"
+                        audio = set_audio_channels(source.unsqueeze(0), self.pretransform.io_channels)
 
                         audio = audio.to(device)
-
-                        latents = self.pretransform.encode(audio)
+                        latents = self.pretransform.encode(audio).squeeze(0)
                     else:
                         latents = source.to(device)           
 
+                    latents = latents.to(dtype)
+
                     if mix is None:
                         mix = self.source_heads[key_ix](latents)
                     else:
@@ -595,7 +618,10 @@ def forward(self, sources: tp.List[tp.Dict[str, torch.Tensor]], device: tp.Union
             if mix is not None:
                 mixes.append(mix)
             else:
-                raise ValueError("No sources found for mix")
+                if self.allow_null_source:
+                    mixes.append(self.null_source.repeat(1, self.source_length))
+                else:
+                    raise ValueError("No sources found for mix")
 
         mixes = torch.stack(mixes, dim=0)