
Commit a2dd95f

Update LICENSE, update MusicGen code, update dependencies
1 parent 371f143

File tree

7 files changed: +109 −14

LICENSE (+1 −1)

@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2023 Harmonai-org
+Copyright (c) 2023 Stability AI
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

harmonai_tools/models/conditioners.py (+39 −5)

@@ -5,6 +5,8 @@
 import typing as tp
 import gc
 import os
+from ..training.utils import copy_state_dict
+from laion_clap.clap_module.factory import load_state_dict as clap_load_state_dict
 
 from audio_diffusion_pytorch_fork import NumberEmbedder
 
@@ -91,11 +93,13 @@ def __init__(self,
                  feature_layer_ix: int = -1,
                  audio_model_type="HTSAT-base",
                  enable_fusion=True,
-                 project_out: bool = False):
+                 project_out: bool = False,
+                 finetune: bool = False):
         super().__init__(768 if use_text_features else 512, output_dim, 1, project_out=project_out)
 
         self.use_text_features = use_text_features
         self.feature_layer_ix = feature_layer_ix
+        self.finetune = finetune
 
         # Suppress logging from transformers
         previous_level = logging.root.manager.disable
@@ -105,8 +109,23 @@ def __init__(self,
         try:
             import laion_clap
 
-            self.__dict__["model"] = laion_clap.CLAP_Module(enable_fusion=enable_fusion, amodel=audio_model_type, device='cpu').requires_grad_(False).eval()
-            self.model.load_ckpt(clap_ckpt_path)
+            model = laion_clap.CLAP_Module(enable_fusion=enable_fusion, amodel=audio_model_type, device='cpu')
+
+            if self.finetune:
+                self.model = model
+            else:
+                self.__dict__["model"] = model
+
+            state_dict = clap_load_state_dict(clap_ckpt_path)
+            self.model.model.load_state_dict(state_dict, strict=False)
+
+            if self.finetune:
+                self.model.model.text_branch.requires_grad_(True)
+                self.model.model.text_branch.train()
+            else:
+                self.model.model.text_branch.requires_grad_(False)
+                self.model.model.text_branch.eval()
+
         finally:
             logging.disable(previous_level)
 
@@ -167,8 +186,23 @@ def __init__(self,
         try:
             import laion_clap
 
-            self.__dict__["model"] = laion_clap.CLAP_Module(enable_fusion=enable_fusion, amodel=audio_model_type, device=device).requires_grad_(False).eval()
-            self.model.load_ckpt(clap_ckpt_path)
+            model = laion_clap.CLAP_Module(enable_fusion=enable_fusion, amodel=audio_model_type, device='cpu')
+
+            if self.finetune:
+                self.model = model
+            else:
+                self.__dict__["model"] = model
+
+            state_dict = clap_load_state_dict(clap_ckpt_path)
+            self.model.model.load_state_dict(state_dict, strict=False)
+
+            if self.finetune:
+                self.model.model.audio_branch.requires_grad_(True)
+                self.model.model.audio_branch.train()
+            else:
+                self.model.model.audio_branch.requires_grad_(False)
+                self.model.model.audio_branch.eval()
+
         finally:
             logging.disable(previous_level)
 
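
Note: the `self.__dict__["model"] = model` branch above is the standard trick for holding a module on an `nn.Module` without registering it. Writing to `__dict__` bypasses `nn.Module.__setattr__`, so the frozen CLAP weights stay out of `parameters()` and `state_dict()`; the `finetune=True` branch assigns normally, so the unfrozen branch is trained and checkpointed. A minimal sketch of the pattern (the `Wrapper`/`inner` names are hypothetical, with `nn.Linear` standing in for CLAP):

import torch.nn as nn

class Wrapper(nn.Module):
    def __init__(self, finetune: bool = False):
        super().__init__()
        inner = nn.Linear(4, 4)  # stand-in for the CLAP module
        if finetune:
            self.inner = inner              # registered: trained and checkpointed
        else:
            self.__dict__["inner"] = inner  # hidden from parameters()/state_dict()

print(len(list(Wrapper(finetune=True).parameters())))   # 2 (weight and bias)
print(len(list(Wrapper(finetune=False).parameters())))  # 0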

harmonai_tools/training/factory.py (+11 −2)

@@ -75,9 +75,8 @@ def create_training_wrapper_from_config(model_config, model):
     elif model_type == 'diffusion_autoencoder':
         from .diffusion import DiffusionAutoencoderTrainingWrapper
 
-
         ema_copy = create_model_from_config(model_config)
-        #ema_copy = create_model_from_config(model_config) # I don't know why this needs to be called twice but it broke when I called it once
+
         # Copy each weight to the ema copy
         for name, param in model.state_dict().items():
             if isinstance(param, Parameter):
@@ -92,8 +91,18 @@ def create_training_wrapper_from_config(model_config, model):
         )
     elif model_type == 'musicgen':
         from .musicgen import MusicGenTrainingWrapper
+
+        ema_copy = create_model_from_config(model_config).lm
+
+        for name, param in model.lm.state_dict().items():
+            if isinstance(param, Parameter):
+                # backwards compatibility for serialized parameters
+                param = param.data
+            ema_copy.state_dict()[name].copy_(param)
+
         return MusicGenTrainingWrapper(
             model,
+            ema_copy=ema_copy,
             lr=training_config["learning_rate"]
         )
     else:
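
Note: the weight copy into `ema_copy` works because `state_dict()` returns references to the live tensors, so `copy_(param)` writes into the freshly constructed model in place. A self-contained sketch of the same initialization pattern (the `nn.Linear` models are stand-ins):

import torch
import torch.nn as nn

model = nn.Linear(2, 2)
ema_copy = nn.Linear(2, 2)  # fresh construction, different random init

# state_dict() values reference the live tensors, so copy_ mutates ema_copy
for name, param in model.state_dict().items():
    ema_copy.state_dict()[name].copy_(param)

assert torch.equal(model.weight, ema_copy.weight)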

harmonai_tools/training/musicgen.py (+11 −3)

@@ -37,7 +37,7 @@ def __repr__(self):
 
 
 class MusicGenTrainingWrapper(pl.LightningModule):
-    def __init__(self, musicgen_model, lr = 1e-4):
+    def __init__(self, musicgen_model, lr = 1e-4, ema_copy=None):
         super().__init__()
 
         self.musicgen_model: MusicGen = musicgen_model
@@ -48,6 +48,8 @@ def __init__(self, musicgen_model, lr = 1e-4):
 
         self.lm.to(torch.float32).train().requires_grad_(True)
 
+        self.lm_ema = EMA(self.lm, ema_model=ema_copy, beta=0.99, update_every=10)
+
         self.cfg_dropout = ClassifierFreeGuidanceDropout(0.1)
 
         self.lr = lr
@@ -96,7 +98,9 @@ def _compute_cross_entropy(
 
     def training_step(self, batch, batch_idx):
        reals, metadata = batch
-        reals = reals[0]
+
+        if reals.ndim == 4 and reals.shape[0] == 1:
+            reals = reals[0]
 
         # Convert reals to mono if necessary
         if self.musicgen_model.audio_channels == 1:
@@ -113,7 +117,7 @@ def training_step(self, batch, batch_idx):
 
         codes, _ = self.musicgen_model.compression_model.encode(reals) # [b, k, t]
 
-        attributes = [ConditioningAttributes(text={'description': md["prompt"][0]}) for md in metadata]
+        attributes = [ConditioningAttributes(text={'description': md["prompt"][0][:512]}) for md in metadata]
         attributes = self.lm.cfg_dropout(attributes)
         attributes = self.lm.att_dropout(attributes)
         tokenized = self.lm.condition_provider.tokenize(attributes)
@@ -147,7 +151,11 @@ def training_step(self, batch, batch_idx):
         self.log_dict(log_dict, prog_bar=True, on_step=True)
         return loss
 
+    def on_before_zero_grad(self, *args, **kwargs):
+        self.lm_ema.update()
+
     def export_model(self, path):
+        self.musicgen_model.lm = self.lm_ema.ema_model
         export_state_dict = {"state_dict": self.musicgen_model.state_dict()}
 
         torch.save(export_state_dict, path)
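
Note: `EMA` here appears to be the wrapper from the ema-pytorch package; passing a pre-built `ema_model` skips its internal `deepcopy`, which can fail on models holding non-picklable state. Assuming that package, the lifecycle wired up above looks roughly like this manual loop (`lm`, `ema_copy`, `dataloader`, and the optimizer are illustrative):

from ema_pytorch import EMA

lm_ema = EMA(lm, ema_model=ema_copy, beta=0.99, update_every=10)

for batch in dataloader:
    loss = training_step(batch)   # Lightning instead calls on_before_zero_grad
    loss.backward()
    optimizer.step()
    lm_ema.update()               # folds current weights into the average every 10 calls
    optimizer.zero_grad()

smoothed_lm = lm_ema.ema_model    # the averaged model that export_model swaps in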

harmonai_tools/training/utils.py (+45 −1)

@@ -14,6 +14,47 @@ def get_rank():
 
     return torch.distributed.get_rank()
 
+class InverseLR(torch.optim.lr_scheduler._LRScheduler):
+    """Implements an inverse decay learning rate schedule with an optional exponential
+    warmup. When last_epoch=-1, sets initial lr as lr.
+    inv_gamma is the number of steps/epochs required for the learning rate to decay to
+    (1 / 2)**power of its original value.
+    Args:
+        optimizer (Optimizer): Wrapped optimizer.
+        inv_gamma (float): Inverse multiplicative factor of learning rate decay. Default: 1.
+        power (float): Exponential factor of learning rate decay. Default: 1.
+        warmup (float): Exponential warmup factor (0 <= warmup < 1, 0 to disable)
+            Default: 0.
+        final_lr (float): The final learning rate. Default: 0.
+        last_epoch (int): The index of last epoch. Default: -1.
+        verbose (bool): If ``True``, prints a message to stdout for
+            each update. Default: ``False``.
+    """
+
+    def __init__(self, optimizer, inv_gamma=1., power=1., warmup=0., final_lr=0.,
+                 last_epoch=-1, verbose=False):
+        self.inv_gamma = inv_gamma
+        self.power = power
+        if not 0. <= warmup < 1:
+            raise ValueError('Invalid value for warmup')
+        self.warmup = warmup
+        self.final_lr = final_lr
+        super().__init__(optimizer, last_epoch, verbose)
+
+    def get_lr(self):
+        if not self._get_lr_called_within_step:
+            import warnings
+            warnings.warn("To get the last learning rate computed by the scheduler, "
+                          "please use `get_last_lr()`.")
+
+        return self._get_closed_form_lr()
+
+    def _get_closed_form_lr(self):
+        warmup = 1 - self.warmup ** (self.last_epoch + 1)
+        lr_mult = (1 + self.last_epoch / self.inv_gamma) ** -self.power
+        return [warmup * max(self.final_lr, base_lr * lr_mult)
+                for base_lr in self.base_lrs]
+
 def copy_state_dict(model, state_dict):
     """Load state_dict to model, but only for keys that match exactly.
 
@@ -55,6 +96,9 @@ def create_scheduler_from_config(scheduler_config, optimizer):
     Returns:
         torch.optim.lr_scheduler._LRScheduler: scheduler.
     """
-    scheduler_fn = getattr(torch.optim.lr_scheduler, scheduler_config["type"])
+    if scheduler_config["type"] == "InverseLR":
+        scheduler_fn = InverseLR
+    else:
+        scheduler_fn = getattr(torch.optim.lr_scheduler, scheduler_config["type"])
     scheduler = scheduler_fn(optimizer, **scheduler_config["config"])
     return scheduler
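
Note: in closed form the schedule is lr(t) = (1 - warmup**(t + 1)) * max(final_lr, base_lr * (1 + t / inv_gamma)**(-power)). A usage sketch through the factory, with illustrative hyperparameters (not values taken from this commit):

import torch
from harmonai_tools.training.utils import create_scheduler_from_config

optimizer = torch.optim.AdamW([torch.nn.Parameter(torch.zeros(1))], lr=1e-4)

scheduler_config = {
    "type": "InverseLR",  # now resolved locally rather than via torch.optim.lr_scheduler
    "config": {"inv_gamma": 1_000_000, "power": 0.5, "warmup": 0.99},
}
scheduler = create_scheduler_from_config(scheduler_config, optimizer)

for step in range(3):
    optimizer.step()
    scheduler.step()
    print(scheduler.get_last_lr())  # warms up toward 1e-4, then decays slowly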

setup.py (+1 −1)

@@ -34,7 +34,7 @@
         'torchaudio>=2.0.2',
         'torchmetrics==0.11.4',
         'tqdm',
-        'transformers==4.30.2',
+        'transformers==4.33.3',
         'v-diffusion-pytorch==0.0.2',
         'vector-quantize-pytorch==1.6.21',
         'wandb==0.15.4',

train.py (+1 −1)

@@ -74,7 +74,7 @@ def main():
         else:
             strategy = args.strategy
     else:
-        strategy = 'ddp' if args.num_gpus > 1 else None
+        strategy = 'ddp_find_unused_parameters_true' if args.num_gpus > 1 else None
 
     trainer = pl.Trainer(
         devices=args.num_gpus,
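
Note: in PyTorch Lightning 2.x, 'ddp_find_unused_parameters_true' is the registered alias for DDP with unused-parameter detection, which DDP needs once some parameters receive no gradient in a step (for example, the frozen conditioner branches above). A sketch of the explicit equivalent:

import pytorch_lightning as pl
from pytorch_lightning.strategies import DDPStrategy

# equivalent to strategy='ddp_find_unused_parameters_true'
trainer = pl.Trainer(
    devices=2,
    strategy=DDPStrategy(find_unused_parameters=True),
)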
