
Commit a867e13

Henry Peteet committed:
Basic example running with multiple envs in sb3
1 parent 71b121c commit a867e13

File tree

2 files changed (+103 -2)


gym-unity/gym_unity/envs/__init__.py (+52 -2)
@@ -1,13 +1,21 @@
 import itertools
+from dataclasses import dataclass
+
 import numpy as np
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union, Callable
 
 import gym
-from gym import error, spaces
+from gym import error, spaces, Env
+from stable_baselines3.common.vec_env import VecEnv, SubprocVecEnv
 
 from mlagents_envs.base_env import ActionTuple, BaseEnv
 from mlagents_envs.base_env import DecisionSteps, TerminalSteps
 from mlagents_envs import logging_util
+from mlagents_envs.environment import UnityEnvironment
+from mlagents_envs.side_channel.engine_configuration_channel import (
+    EngineConfig,
+    EngineConfigurationChannel,
+)
 
 
 class UnityGymException(error.Error):
@@ -23,6 +31,48 @@ class UnityGymException(error.Error):
 
 GymStepResult = Tuple[np.ndarray, float, bool, Dict]
 
+# Default values from the CLI (see cli_utils.py)
+DEFAULT_ENGINE_CONFIG = EngineConfig(
+    width=84,
+    height=84,
+    quality_level=4,
+    time_scale=20,
+    target_frame_rate=-1,
+    capture_frame_rate=60,
+)
+
+
+# A limited subset of the options in a full ML-Agents config.yaml file.
+@dataclass
+class LimitedConfig:
+    env_path: str
+    num_env: int = 1
+    engine_config: EngineConfig = DEFAULT_ENGINE_CONFIG
+
+
+def make_mla_sb3_env(config: LimitedConfig) -> VecEnv:
+    def create_env(path: str, worker_id: int, seed: int) -> Callable[[], Env]:
+        def _f() -> Env:
+            engine_configuration_channel = EngineConfigurationChannel()
+            engine_configuration_channel.set_configuration(config.engine_config)
+            side_channels = [engine_configuration_channel]
+            return UnityToGymWrapper(
+                UnityEnvironment(
+                    file_name=path,
+                    worker_id=worker_id,
+                    seed=seed,
+                    side_channels=side_channels,
+                ),
+                uint8_visual=True,
+            )
+
+        return _f
+
+    env_facts = [
+        create_env(config.env_path, worker_id=x, seed=x) for x in range(config.num_env)
+    ]
+    return SubprocVecEnv(env_facts)
+
 
 class UnityToGymWrapper(gym.Env):
     """

sb3_examples/3dball_num_envs.py (+51)
@@ -0,0 +1,51 @@
+from math import ceil
+
+from baselines.common.schedules import LinearSchedule
+from stable_baselines3 import PPO
+from stable_baselines3.common.vec_env import VecMonitor
+
+from gym_unity.envs import make_mla_sb3_env, LimitedConfig
+
+TOTAL_TRAINING_STEPS_GOAL = (
+    500000
+)  # Same as the config for CI 3DBall... Not sure if MLA steps == SB3 steps.
+NUM_ENVS = 12
+STEPS_PER_UPDATE = 2048
+
+
+# NOTE: This only achieves ~90/100 reward and is just a POC. Needs tuning to be useful.
+def main():
+    env = make_mla_sb3_env(
+        LimitedConfig(
+            env_path="/Users/henry.peteet/Documents/RandomBuilds/3DBallSingleNoVis",
+            num_env=NUM_ENVS,
+        )
+    )
+    # Log results in the "results" folder
+    env = VecMonitor(env, "results")
+    # Attempt to approximate settings from 3DBall.yaml
+    schedule = LinearSchedule(
+        schedule_timesteps=TOTAL_TRAINING_STEPS_GOAL, final_p=0.0, initial_p=0.0003
+    )
+    model = PPO(
+        "MlpPolicy",
+        env,
+        verbose=1,
+        # TODO: Check if I am using the schedule correctly.
+        learning_rate=lambda progress: schedule.value(
+            TOTAL_TRAINING_STEPS_GOAL * progress
+        ),
+        tensorboard_log="results",
+        n_steps=int(STEPS_PER_UPDATE),
+    )
+    training_rounds = ceil(TOTAL_TRAINING_STEPS_GOAL / int(STEPS_PER_UPDATE * NUM_ENVS))
+    for i in range(training_rounds):
+        print(f"Training round {i + 1}/{training_rounds}")
+        # NOTE: reset_num_timesteps should only be True the first time so that tensorboard logs stay consistent.
+        model.learn(total_timesteps=6000, reset_num_timesteps=(i == 0))
+    model.policy.eval()
+    env.close()
+
+
+if __name__ == "__main__":
+    main()
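
On the TODO above: Stable-Baselines3 calls a learning-rate schedule with progress_remaining, which starts at 1.0 and decreases linearly to 0.0 over training. Because the baselines LinearSchedule interpolates from initial_p at t=0 toward final_p at t=schedule_timesteps, passing TOTAL_TRAINING_STEPS_GOAL * progress evaluates near final_p (0.0) at the start of training. Assuming the intent was a decay from 3e-4 to 0, a dependency-free sketch could look like:

    # Assumed replacement for the baselines LinearSchedule; not part of the commit.
    def linear_schedule(initial_lr: float):
        def schedule(progress_remaining: float) -> float:
            # progress_remaining is 1.0 at the start of training and 0.0 at the end.
            return initial_lr * progress_remaining
        return schedule

    # Usage: PPO("MlpPolicy", env, learning_rate=linear_schedule(0.0003), ...)

This would also drop the old OpenAI baselines dependency, which is otherwise imported only for the schedule.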
