
Commit c6186fc: Polish refactoring

1 parent: 802fe48

12 files changed: +49 -45 lines

test/test_cost.py (+3 -2)

@@ -190,18 +190,19 @@ class TestLossModuleBase:
             "action_key": "action",
             "reward_key": "reward",
             "done_key": "done",
+            "steps_to_next_obs_key": "steps_to_next_obs",
         },
         SACLoss: {
             "priority_key": "td_error",
-            "state_value_key": "state_value",
+            "value_key": "state_value",
             "state_action_value_key": "state_action_value",
             "action_key": "action",
             "sample_log_prob_key": "sample_log_prob",
             "log_prob_key": "_log_prob",
         },
         DiscreteSACLoss: {
             "priority_key": "td_error",
-            "state_value_key": "state_value",
+            "value_key": "state_value",
             "action_key": "action",
         },
         TD3Loss: {

torchrl/objectives/a2c.py (+12 -11)

@@ -85,16 +85,6 @@ def __init__(
         value_target_key: str = None,
     ):
         super().__init__()
-        self.convert_to_functional(
-            actor, "actor", funs_to_decorate=["forward", "get_dist"]
-        )
-        if separate_losses:
-            # we want to make sure there are no duplicates in the params: the
-            # params of critic must be refs to actor if they're shared
-            policy_params = list(actor.parameters())
-        else:
-            policy_params = None
-        self.convert_to_functional(critic, "critic", compare_against=policy_params)

         tensordict_keys = {
             "advantage_key": "advantage",
@@ -107,6 +97,17 @@ def __init__(
             advantage_key=advantage_key, value_target_key=value_target_key
         )

+        self.convert_to_functional(
+            actor, "actor", funs_to_decorate=["forward", "get_dist"]
+        )
+        if separate_losses:
+            # we want to make sure there are no duplicates in the params: the
+            # params of critic must be refs to actor if they're shared
+            policy_params = list(actor.parameters())
+        else:
+            policy_params = None
+        self.convert_to_functional(critic, "critic", compare_against=policy_params)
+
         self.samples_mc_entropy = samples_mc_entropy
         self.entropy_bonus = entropy_bonus and entropy_coef
         self.register_buffer(
@@ -200,7 +201,7 @@ def make_value_estimator(self, value_type: ValueEstimators = None, **hyperparams
         hp.update(hyperparams)
         if hasattr(self, "gamma"):
             hp["gamma"] = self.gamma
-        value_key = "state_value"
+        value_key = self.value_key
         if value_type == ValueEstimators.TD1:
             self._value_estimator = TD1Estimator(
                 value_network=self.critic, value_key=value_key, **hp
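
Two things change here: convert_to_functional now runs only after the default tensordict keys have been registered and the deprecated ctor keys resolved, and make_value_estimator reads self.value_key rather than a hard-coded "state_value". A toy sketch of the second point (a hypothetical _A2CLike stand-in, not torchrl code), showing that a key overridden through set_keys() now reaches the value estimator:

    # Hypothetical stand-in class; illustrates the pattern only.
    class _A2CLike:
        def __init__(self):
            self.value_key = "state_value"  # default registered via tensordict_keys

        def set_keys(self, *, value_key=None):
            if value_key is not None:
                self.value_key = value_key

        def make_value_estimator(self):
            # previously: value_key = "state_value" (hard-coded)
            value_key = self.value_key
            return {"value_key": value_key}

    loss = _A2CLike()
    loss.set_keys(value_key="custom_value")
    assert loss.make_value_estimator() == {"value_key": "custom_value"}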

torchrl/objectives/common.py (+2 -5)

@@ -85,7 +85,7 @@ def _set_deprecated_ctor_keys(self, **kwargs):
         for key, value in kwargs.items():
             if value is not None:
                 warnings.warn(
-                    f"Setting '{key}' via ctor is deprecated, use .set_keys(advantage_key='some_key') instead.",
+                    f"Setting '{key}' via ctor is deprecated, use .set_keys({key}='some_key') instead.",
                     category=DeprecationWarning,
                 )
                 self.tensordict_keys[key] = value
@@ -104,10 +104,7 @@ def set_keys(self, **kwargs):
         for key, value in kwargs.items():
             if key not in self.tensordict_keys.keys():
                 raise ValueError(f"{key} not a valid tensordict key")
-            if value is None:
-                set_value = self.tensordict_keys[key]
-            else:
-                set_value = value
+            set_value = value if value is not None else self.tensordict_keys[key]
             setattr(self, key, set_value)

     def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
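
For reference, a minimal standalone sketch of the set_keys behaviour shown above (a hypothetical _KeyedLoss class, not the actual LossModule): unknown keys are rejected, and passing None falls back to the default registered in tensordict_keys:

    class _KeyedLoss:
        def __init__(self):
            # defaults registered at construction time
            self.tensordict_keys = {"value_key": "state_value", "priority_key": "td_error"}
            for key, default in self.tensordict_keys.items():
                setattr(self, key, default)

        def set_keys(self, **kwargs):
            for key, value in kwargs.items():
                if key not in self.tensordict_keys.keys():
                    raise ValueError(f"{key} not a valid tensordict key")
                # None falls back to the registered default, as in the one-liner above
                set_value = value if value is not None else self.tensordict_keys[key]
                setattr(self, key, set_value)

    loss = _KeyedLoss()
    loss.set_keys(value_key="my_state_value")  # override
    loss.set_keys(priority_key=None)           # keeps the default "td_error"
    assert (loss.value_key, loss.priority_key) == ("my_state_value", "td_error")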

torchrl/objectives/ddpg.py (+5 -5)

@@ -36,7 +36,7 @@ class DDPGLoss(LossModule):
         delay_actor (bool, optional): whether to separate the target actor networks from the actor networks used for
             data collection. Default is ``False``.
         delay_value (bool, optional): whether to separate the target value networks from the value networks used for
-            data collection. Default is ``True``.
+            data collection. Default is ``False``.
     """

     default_value_estimator: ValueEstimators = ValueEstimators.TD0
@@ -48,7 +48,7 @@ def __init__(
         *,
         loss_function: str = "l2",
         delay_actor: bool = False,
-        delay_value: bool = True,
+        delay_value: bool = False,
         gamma: float = None,
     ) -> None:
         super().__init__()
@@ -84,7 +84,7 @@ def __init__(

         self.actor_in_keys = actor_network.in_keys

-        self.loss_function = loss_function
+        self.loss_funtion = loss_function

         if gamma is not None:
             warnings.warn(_GAMMA_LMBDA_DEPREC_WARNING, category=DeprecationWarning)
@@ -173,7 +173,7 @@ def _loss_value(

         # td_error = pred_val - target_value
         loss_value = distance_loss(
-            pred_val, target_value, loss_function=self.loss_function
+            pred_val, target_value, loss_function=self.loss_funtion
         )

         return loss_value, (pred_val - target_value).pow(2), pred_val, target_value
@@ -186,7 +186,7 @@ def make_value_estimator(self, value_type: ValueEstimators = None, **hyperparams
         if hasattr(self, "gamma"):
             hp["gamma"] = self.gamma
         hp.update(hyperparams)
-        value_key = "state_action_value"
+        value_key = self.state_action_value_key
         if value_type == ValueEstimators.TD1:
             self._value_estimator = TD1Estimator(
                 value_network=self.actor_critic, value_key=value_key, **hp
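
The value loss above goes through torchrl's distance_loss helper with the configured loss function (one of "l1", "l2" or "smooth_l1", per the docstrings elsewhere in this commit). A rough standalone equivalent for orientation only, not the library helper itself (_distance_loss is a hypothetical name):

    import torch
    import torch.nn.functional as F

    def _distance_loss(pred, target, loss_function="l2"):
        # Elementwise distance between predicted and target values, no reduction.
        if loss_function == "l2":
            return F.mse_loss(pred, target, reduction="none")
        if loss_function == "l1":
            return F.l1_loss(pred, target, reduction="none")
        if loss_function == "smooth_l1":
            return F.smooth_l1_loss(pred, target, reduction="none")
        raise NotImplementedError(f"Unknown loss function: {loss_function}")

    pred_val = torch.randn(8)
    target_value = torch.randn(8)
    loss_value = _distance_loss(pred_val, target_value, loss_function="l2")
    td_error = (pred_val - target_value).pow(2)  # as in _loss_value above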

torchrl/objectives/dqn.py (+6 -5)

@@ -48,8 +48,8 @@ class DQNLoss(LossModule):
             :class:`torchrl.data.BinaryDiscreteTensorSpec` or :class:`torchrl.data.DiscreteTensorSpec`).
             If not provided, an attempt to retrieve it from the value network
             will be made.
-        priority_key (str, optional): [Deprecated, use .set_keys() instead] the
-            key at which priority is assumed to be stored within TensorDicts added
+        priority_key (str, optional): [Deprecated, use .set_keys(priority_key=priority_key) instead]
+            The key at which priority is assumed to be stored within TensorDicts added
             to this ReplayBuffer. This is to be used when the sampler is of type
             :class:`~torchrl.data.PrioritizedSampler`. Defaults to ``"td_error"``.

@@ -243,8 +243,8 @@ class DistributionalDQNLoss(LossModule):
             Unlike :class:`DQNLoss`, this class does not currently support
             custom value functions. The next value estimation is always
             bootstrapped.
-        priority_key (str, optional): [Deprecated, use .set_keys() instead] the
-            key at which priority is assumed to be stored within TensorDicts added
+        priority_key (str, optional): [Deprecated, use .set_keys(priority_key=priority_key) instead]
+            The key at which priority is assumed to be stored within TensorDicts added
             to this ReplayBuffer. This is to be used when the sampler is of type
             :class:`~torchrl.data.PrioritizedSampler`. Defaults to ``"td_error"``.

@@ -266,6 +266,7 @@ def __init__(
             "action_key": "action",
             "reward_key": "reward",
             "done_key": "done",
+            "steps_to_next_obs_key": "steps_to_next_obs",
         }
         self._set_default_tensordict_keys(tensordict_keys)
         self._set_deprecated_ctor_keys(priority_key=priority_key)
@@ -325,7 +326,7 @@ def forward(self, input_tensordict: TensorDictBase) -> TensorDict:
         reward = tensordict.get(("next", self.reward_key))
         done = tensordict.get(("next", self.done_key))

-        steps_to_next_obs = tensordict.get("steps_to_next_obs", 1)
+        steps_to_next_obs = tensordict.get(self.steps_to_next_obs_key, 1)
         discount = self.gamma**steps_to_next_obs

         # Calculate current state probabilities (online network noise already
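
The last hunk makes the n-step horizon key configurable: with the new default, a missing "steps_to_next_obs" entry falls back to 1 and the discount is plain gamma, while n-step transitions scale it to gamma**n. A small sketch of that computation (assumes the tensordict package; shapes and values are illustrative only):

    import torch
    from tensordict import TensorDict

    gamma = 0.99
    steps_to_next_obs_key = "steps_to_next_obs"  # default registered in tensordict_keys

    # single-step transition: key absent, .get() falls back to 1 -> discount == gamma
    td = TensorDict({"reward": torch.ones(4, 1)}, batch_size=[4])
    steps_to_next_obs = td.get(steps_to_next_obs_key, 1)
    discount = gamma**steps_to_next_obs  # 0.99

    # n-step transitions: per-sample discount gamma ** n
    td.set(steps_to_next_obs_key, torch.tensor([[1], [2], [3], [4]]))
    steps_to_next_obs = td.get(steps_to_next_obs_key, 1)
    discount = gamma**steps_to_next_obs  # tensor([[0.99], [0.9801], ...])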

torchrl/objectives/dreamer.py (+1 -1)

@@ -259,7 +259,7 @@ def make_value_estimator(self, value_type: ValueEstimators = None, **hyperparams
             value_type = self.default_value_estimator
         self.value_type = value_type
         value_net = None
-        value_key = "state_value"
+        value_key = self.value_key
         hp = dict(default_value_kwargs(value_type))
         if hasattr(self, "gamma"):
             hp["gamma"] = self.gamma

torchrl/objectives/iql.py (+2 -2)

@@ -52,7 +52,7 @@ class IQLLoss(LossModule):
             maximum of the Q-function.
         expectile (float, optional): expectile :math:`\tau`. A larger value of :math:`\tau` is crucial
             for antmaze tasks that require dynamical programming ("stichting").
-        priority_key (str, optional): [Deprecated, use .set_keys() instead]
+        priority_key (str, optional): [Deprecated, use .set_keys(priority_key=priority_key) instead]
             tensordict key where to write the priority (for prioritized replay
             buffer usage). Default is `"td_error"`.
     """
@@ -257,7 +257,7 @@ def make_value_estimator(self, value_type: ValueEstimators = None, **hyperparams
         self.value_type = value_type
         value_net = self.value_network

-        value_key = "state_value"
+        value_key = self.value_key
         hp = dict(default_value_kwargs(value_type))
         if hasattr(self, "gamma"):
             hp["gamma"] = self.gamma

torchrl/objectives/ppo.py (+6 -3)

@@ -64,11 +64,14 @@ class PPOLoss(LossModule):
             policy and critic will only be trained on the policy loss.
             Defaults to ``False``, ie. gradients are propagated to shared
             parameters for both policy and critic losses.
-        advantage_key (str, optional): [Deprecated, use set_keys() instead] the input tensordict key where the advantage is
+        advantage_key (str, optional): [Deprecated, use set_keys(advantage_key=advantage_key) instead]
+            The input tensordict key where the advantage is
             expected to be written. Defaults to ``"advantage"``.
-        value_target_key (str, optional): [Deprecated, use set_keys() instead] the input tensordict key where the target state
+        value_target_key (str, optional): [Deprecated, use set_keys(value_target_key=value_target_key) instead]
+            The input tensordict key where the target state
             value is expected to be written. Defaults to ``"value_target"``.
-        value_key (str, optional): [Deprecated, use set_keys() instead] the input tensordict key where the state
+        value_key (str, optional): [Deprecated, use set_keys(value_key) instead]
+            The input tensordict key where the state
             value is expected to be written. Defaults to ``"state_value"``.

     .. note::

torchrl/objectives/redq.py (+1 -1)

@@ -350,7 +350,7 @@ def make_value_estimator(self, value_type: ValueEstimators = None, **hyperparams
         if hasattr(self, "gamma"):
             hp["gamma"] = self.gamma
         hp.update(hyperparams)
-        value_key = "state_value"
+        value_key = self.value_key
         # we do not need a value network bc the next state value is already passed
         if value_type == ValueEstimators.TD1:
             self._value_estimator = TD1Estimator(

torchrl/objectives/reinforce.py (+5 -4)

@@ -33,10 +33,11 @@ class ReinforceLoss(LossModule):
             for the critic. Defaults to ``False``.
         loss_critic_type (str): loss function for the value discrepancy.
             Can be one of "l1", "l2" or "smooth_l1". Defaults to ``"smooth_l1"``.
-        advantage_key (str): [Deprecated, use .set_keys() instead] the input tensordict key where the advantage is
-            expected to be written.
+        advantage_key (str): [Deprecated, use .set_keys(advantage_key=advantage_key) instead]
+            The input tensordict key where the advantage is expected to be written.
             Defaults to ``"advantage"``.
-        value_target_key (str): [Deprecated, use .set_keys() instead] the input tensordict key where the target state
+        value_target_key (str): [Deprecated, use .set_keys(value_target_key=value_target_key) instead]
+            The input tensordict key where the target state
             value is expected to be written. Defaults to ``"value_target"``.

     .. note:
@@ -170,7 +171,7 @@ def make_value_estimator(self, value_type: ValueEstimators = None, **hyperparams
         if hasattr(self, "gamma"):
             hp["gamma"] = self.gamma
         hp.update(hyperparams)
-        value_key = "state_value"
+        value_key = self.value_key
         if value_type == ValueEstimators.TD1:
             self._value_estimator = TD1Estimator(
                 value_network=self.critic, value_key=value_key, **hp

torchrl/objectives/sac.py (+5 -5)

@@ -83,9 +83,9 @@ class SACLoss(LossModule):
         delay_value (bool, optional): Whether to separate the target value
             networks from the value networks used for data collection.
             Default is ``False``.
-        priority_key (str, optional): [Deprecated, use .set_keys() instead] tensordict key where to write the
-            priority (for prioritized replay buffer usage). Defaults to
-            ``"td_error"``.
+        priority_key (str, optional): [Deprecated, use .set_keys(priority_key=priority_key) instead]
+            Tensordict key where to write the
+            priority (for prioritized replay buffer usage). Defaults to ``"td_error"``.
     """

     default_value_estimator = ValueEstimators.TD0
@@ -507,8 +507,8 @@ class DiscreteSACLoss(LossModule):
         target_entropy (Union[str, Number], optional): Target entropy for the stochastic policy. Default is "auto".
         delay_qvalue (bool, optional): Whether to separate the target Q value networks from the Q value networks used
             for data collection. Default is ``False``.
-        priority_key (str, optional): [Deprecated, use .set_keys() instead] Key
-            where to write the priority value for prioritized replay buffers.
+        priority_key (str, optional): [Deprecated, use .set_keys(priority_key=priority_key) instead]
+            Key where to write the priority value for prioritized replay buffers.
             Default is `"td_error"`.

     """

torchrl/objectives/td3.py (+1 -1)

@@ -150,7 +150,7 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
             -self.max_action, self.max_action
         )
         actor_output_td[1].set(self.action_key, next_action, inplace=True)
-        tensordict_actor[self.action_key] = actor_output_td[self.action_key]
+        tensordict_actor.set(self.action_key, actor_output_td.get(self.action_key))

         # repeat tensordict_actor to match the qvalue size
         _actor_loss_td = (
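
The change above swaps item assignment for the explicit set/get pair on the tensordict; both write the same entry. A minimal illustration with the tensordict package (toy shapes, not the actual TD3 tensors):

    import torch
    from tensordict import TensorDict

    action_key = "action"
    actor_output_td = TensorDict({action_key: torch.randn(2, 4)}, batch_size=[2])
    tensordict_actor = TensorDict({}, batch_size=[2])

    # old form: tensordict_actor[action_key] = actor_output_td[action_key]
    # new form, as in the diff:
    tensordict_actor.set(action_key, actor_output_td.get(action_key))

    assert (tensordict_actor.get(action_key) == actor_output_td.get(action_key)).all()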
