
Commit d697fcc

andrewcoh authored and GitHub Enterprise committed
Add shared critic configurability for PPO (#45)
1 parent d6666d5 commit d697fcc

6 files changed: +11 -4 lines changed


docs/Training-Configuration-File.md (+1)

@@ -63,6 +63,7 @@ the `trainer` setting above).
 | `hyperparameters -> epsilon_schedule` | (default = `learning_rate_schedule`) Determines how epsilon changes over time (PPO only). <br><br>`linear` decays epsilon linearly, reaching 0 at max_steps, while `constant` keeps the epsilon constant for the entire training run. If not explicitly set, the default epsilon schedule will be set to `hyperparameters -> learning_rate_schedule`. |
 | `hyperparameters -> lambd` | (default = `0.95`) Regularization parameter (lambda) used when calculating the Generalized Advantage Estimate ([GAE](https://arxiv.org/abs/1506.02438)). This can be thought of as how much the agent relies on its current value estimate when calculating an updated value estimate. Low values correspond to relying more on the current value estimate (which can be high bias), and high values correspond to relying more on the actual rewards received in the environment (which can be high variance). The parameter provides a trade-off between the two, and the right value can lead to a more stable training process. <br><br>Typical range: `0.9` - `0.95` |
 | `hyperparameters -> num_epoch` | (default = `3`) Number of passes to make through the experience buffer when performing gradient descent optimization. The larger the batch_size, the larger it is acceptable to make this. Decreasing this will ensure more stable updates, at the cost of slower learning. <br><br>Typical range: `3` - `10` |
+| `hyperparameters -> shared_critic` | (default = `False`) Whether or not the policy and value function networks share a backbone. It may be useful to use a shared backbone when learning from image observations. |
 
 ### SAC-specific Configurations
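
For intuition on what "share a backbone" means here, the sketch below is a minimal PyTorch module in which one encoder feeds both a policy head and a value head. It is an illustrative stand-in with hypothetical names, not the actual ml-agents network classes.

import torch
from torch import nn

class SharedBackboneActorCritic(nn.Module):
    # Hypothetical illustration: a single encoder ("backbone") is reused by
    # both the policy head and the value head, so the two heads train on the
    # same learned features (the shared_critic=True case).
    def __init__(self, obs_size: int, num_actions: int, hidden: int = 128):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(obs_size, hidden), nn.ReLU())
        self.policy_head = nn.Linear(hidden, num_actions)  # action logits
        self.value_head = nn.Linear(hidden, 1)  # state-value estimate

    def forward(self, obs: torch.Tensor):
        features = self.encoder(obs)
        return self.policy_head(features), self.value_head(features)

# With shared_critic=False (the default), the value estimate would instead come
# from a separate network with its own encoder, sharing no parameters.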

docs/Training-ML-Agents.md (+1)

@@ -284,6 +284,7 @@ behaviors:
       epsilon_schedule: linear
       lambd: 0.95
       num_epoch: 3
+      shared_critic: False
 
     # Configuration of the neural network (common to PPO/SAC)
     network_settings:
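
As a quick sanity check of where the new key lives in a run configuration, the hedged sketch below parses a YAML fragment like the one above and reads the flag back out. The behavior name and the use of PyYAML are assumptions for illustration only; mlagents-learn performs its own configuration parsing.

import yaml  # assumes PyYAML is available

CONFIG_TEXT = """
behaviors:
  BehaviorPPO:                  # hypothetical behavior name
    trainer_type: ppo
    hyperparameters:
      num_epoch: 3
      shared_critic: False      # new PPO option; False is also the default
"""

hyperparameters = yaml.safe_load(CONFIG_TEXT)["behaviors"]["BehaviorPPO"]["hyperparameters"]
print(hyperparameters["shared_critic"])  # -> False; set to True to share a backbone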

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (+2 -1)

@@ -29,6 +29,7 @@ def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings):
         reward_signal_configs = trainer_settings.reward_signals
         reward_signal_names = [key.value for key, _ in reward_signal_configs.items()]
 
+        params = list(self.policy.actor.parameters())
         if policy.shared_critic:
             self._critic = policy.actor
         else:
@@ -38,8 +39,8 @@ def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings):
                 network_settings=trainer_settings.network_settings,
             )
             self._critic.to(default_device())
+            params += list(self._critic.parameters())
 
-        params = list(self.policy.actor.parameters()) + list(self._critic.parameters())
         self.hyperparameters: PPOSettings = cast(
             PPOSettings, trainer_settings.hyperparameters
         )
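
The substance of this change is that when the critic is the actor itself, its parameters must not be handed to the optimizer a second time. Below is a simplified, hypothetical sketch of that collection logic using plain torch modules in place of the real TorchPolicy and ValueNetwork classes.

import torch
from torch import nn

def collect_trainable_params(actor: nn.Module, shared_critic: bool):
    # Mirrors the diff above: always start from the actor's parameters, and
    # only append critic parameters when the critic is a separate network.
    params = list(actor.parameters())
    if shared_critic:
        critic = actor  # same module; adds no new parameters
    else:
        critic = nn.Linear(4, 1)  # stand-in for a separate ValueNetwork(...)
        params += list(critic.parameters())
    return critic, params

actor = nn.Linear(4, 2)
_, shared_params = collect_trainable_params(actor, shared_critic=True)
_, separate_params = collect_trainable_params(actor, shared_critic=False)
assert len(shared_params) < len(separate_params)  # shared critic adds nothing
optimizer = torch.optim.Adam(separate_params, lr=3.0e-4)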

ml-agents/mlagents/trainers/ppo/trainer.py (+1 -1)

@@ -229,7 +229,7 @@ def create_torch_policy(
             behavior_spec,
             self.trainer_settings,
             condition_sigma_on_obs=False,  # Faster training for PPO
-            separate_critic=True,  # Match network architecture with TF
+            separate_critic=not self.hyperparameters.shared_critic,  # Only PPO currently allows shared critic
         )
         return policy
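
On the trainer side the change is a single boolean inversion: the policy is built with a separate critic unless shared_critic was requested. A minimal sketch with a hypothetical helper name:

def policy_architecture_kwargs(shared_critic: bool) -> dict:
    # Hypothetical helper: shared_critic=True means the policy should NOT
    # build its own separate value network.
    return {
        "condition_sigma_on_obs": False,  # kept False for faster PPO training
        "separate_critic": not shared_critic,
    }

assert policy_architecture_kwargs(shared_critic=True)["separate_critic"] is False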

ml-agents/mlagents/trainers/settings.py (+1)

@@ -180,6 +180,7 @@ class PPOSettings(HyperparamSettings):
     epsilon: float = 0.2
     lambd: float = 0.95
     num_epoch: int = 3
+    shared_critic: bool = False
     learning_rate_schedule: ScheduleType = ScheduleType.LINEAR
     beta_schedule: ScheduleType = ScheduleType.LINEAR
     epsilon_schedule: ScheduleType = ScheduleType.LINEAR
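
Since PPOSettings is an attrs class, the new field behaves like the other hyperparameters: it defaults to False and can be overridden per run. The standalone sketch below uses a simplified stand-in class, not the real PPOSettings.

import attr

@attr.s(auto_attribs=True)
class PPOSettingsSketch:
    # Simplified stand-in for mlagents.trainers.settings.PPOSettings
    epsilon: float = 0.2
    lambd: float = 0.95
    num_epoch: int = 3
    shared_critic: bool = False  # the field added by this commit

defaults = PPOSettingsSketch()
tuned = attr.evolve(defaults, shared_critic=True)
print(defaults.shared_critic, tuned.shared_critic)  # -> False True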

ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py (+5 -2)

@@ -151,7 +151,8 @@ def test_2d_ppo(action_sizes):
 
 @pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
 @pytest.mark.parametrize("num_visual", [1, 2])
-def test_visual_ppo(num_visual, action_sizes):
+@pytest.mark.parametrize("shared_critic", [True, False])
+def test_visual_ppo(shared_critic, num_visual, action_sizes):
     env = SimpleEnvironment(
         [BRAIN_NAME],
         action_sizes=action_sizes,
@@ -160,7 +161,9 @@ def test_visual_ppo(num_visual, action_sizes):
         step_size=0.2,
     )
     new_hyperparams = attr.evolve(
-        PPO_TORCH_CONFIG.hyperparameters, learning_rate=3.0e-4
+        PPO_TORCH_CONFIG.hyperparameters,
+        learning_rate=3.0e-4,
+        shared_critic=shared_critic,
     )
     config = attr.evolve(PPO_TORCH_CONFIG, hyperparameters=new_hyperparams)
     check_environment_trains(env, {BRAIN_NAME: config})
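
Stacking a third pytest.mark.parametrize decorator, as the test above now does, multiplies the test matrix: every combination of action_sizes, num_visual, and shared_critic gets its own test case. Here is a minimal, self-contained example of the same stacking pattern (a hypothetical test, not part of the ml-agents suite):

import pytest

@pytest.mark.parametrize("num_visual", [1, 2])
@pytest.mark.parametrize("shared_critic", [True, False])
def test_parametrize_stacking(shared_critic, num_visual):
    # The stacked decorators generate 2 x 2 = 4 independent test cases.
    assert isinstance(shared_critic, bool)
    assert num_visual in (1, 2)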
