Commit d3f9fd6

Merge pull request #1588 from Unity-Technologies/hotfix-0.6.0a
Hotfix 0.6.0a to master
2 parents cb0bfa0 + 18528e5 commit d3f9fd6

11 files changed: 127 additions, 64 deletions

UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/ModelParamLoader.cs

Lines changed: 2 additions & 2 deletions
@@ -411,8 +411,8 @@ private string CheckVisualObsShape(Tensor tensor, int visObsIndex)
             var widthBp = resolutionBp.width;
             var heightBp = resolutionBp.height;
             var pixelBp = resolutionBp.blackAndWhite ? 1 : 3;
-            var widthT = tensor.Shape[1];
-            var heightT = tensor.Shape[2];
+            var heightT = tensor.Shape[1];
+            var widthT = tensor.Shape[2];
             var pixelT = tensor.Shape[3];
             if ((widthBp != widthT) || (heightBp != heightT) || (pixelBp != pixelT))
             {
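The swapped indices are the actual fix here: the visual observation tensor is laid out as [batch, height, width, channels], so `Shape[1]` is the height and `Shape[2]` is the width. The old code compared the Brain's width against the tensor's height dimension, which flagged models with non-square camera resolutions as mismatched. A minimal Python sketch of the corrected comparison (function name, arguments, and message are illustrative only, not the C# API):

    def check_visual_obs_shape(tensor_shape, res_width, res_height, black_and_white):
        # tensor_shape is assumed NHWC: [batch, height, width, channels]
        pixels_bp = 1 if black_and_white else 3
        height_t, width_t, pixels_t = tensor_shape[1], tensor_shape[2], tensor_shape[3]
        if (res_width, res_height, pixels_bp) != (width_t, height_t, pixels_t):
            return ("The Brain expects %dx%dx%d visual observations but the model "
                    "contains %dx%dx%d."
                    % (res_width, res_height, pixels_bp, width_t, height_t, pixels_t))
        return None

    # A 100x60 (width x height) color observation: with the swapped indices this
    # was reported as a mismatch even when the model matched exactly.
    print(check_visual_obs_shape((1, 60, 100, 3), 100, 60, False))  # -> None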

config/curricula/push-block/PushBlockBrain.json

Lines changed: 0 additions & 12 deletions
This file was deleted.

docs/Training-Imitation-Learning.md

Lines changed: 50 additions & 23 deletions
@@ -12,17 +12,31 @@ from a demonstration to learn a policy. [Video Link](https://youtu.be/kpb8ZkMBFY

 ## Recording Demonstrations

-It is possible to record demonstrations of agent behavior from the Unity Editor, and save them as assets. These demonstrations contain information on the observations, actions, and rewards for a given agent during the recording session. They can be managed from the Editor, as well as used for training with Offline Behavioral Cloning (see below).
+It is possible to record demonstrations of agent behavior from the Unity Editor,
+and save them as assets. These demonstrations contain information on the
+observations, actions, and rewards for a given agent during the recording session.
+They can be managed from the Editor, as well as used for training with Offline
+Behavioral Cloning (see below).

-In order to record demonstrations from an agent, add the `Demonstration Recorder` component to a GameObject in the scene which contains an `Agent` component. Once added, it is possible to name the demonstration that will be recorded from the agent.
+In order to record demonstrations from an agent, add the `Demonstration Recorder`
+component to a GameObject in the scene which contains an `Agent` component.
+Once added, it is possible to name the demonstration that will be recorded
+from the agent.

 <p align="center">
   <img src="images/demo_component.png"
        alt="BC Teacher Helper"
        width="375" border="10" />
 </p>

-When `Record` is checked, a demonstration will be created whenever the scene is played from the Editor. Depending on the complexity of the task, anywhere from a few minutes or a few hours of demonstration data may be necessary to be useful for imitation learning. When you have recorded enough data, end the Editor play session, and a `.demo` file will be created in the `Assets/Demonstrations` folder. This file contains the demonstrations. Clicking on the file will provide metadata about the demonstration in the inspector.
+When `Record` is checked, a demonstration will be created whenever the scene
+is played from the Editor. Depending on the complexity of the task, anywhere
+from a few minutes or a few hours of demonstration data may be necessary to
+be useful for imitation learning. When you have recorded enough data, end
+the Editor play session, and a `.demo` file will be created in the
+`Assets/Demonstrations` folder. This file contains the demonstrations.
+Clicking on the file will provide metadata about the demonstration in the
+inspector.

 <p align="center">
   <img src="images/demo_inspector.png"
@@ -33,29 +47,42 @@ When `Record` is checked, a demonstration will be created whenever the scene is

 ## Training with Behavioral Cloning

-There are a variety of possible imitation learning algorithms which can be used,
-the simplest one of them is Behavioral Cloning. It works by collecting demonstrations from a teacher, and then simply uses them to directly learn a policy, in the
-same way the supervised learning for image classification or other traditional
-Machine Learning tasks work.
+There are a variety of possible imitation learning algorithms which can
+be used, the simplest one of them is Behavioral Cloning. It works by collecting
+demonstrations from a teacher, and then simply uses them to directly learn a
+policy, in the same way the supervised learning for image classification
+or other traditional Machine Learning tasks work.


 ### Offline Training

-With offline behavioral cloning, we can use demonstrations (`.demo` files) generated using the `Demonstration Recorder` as the dataset used to train a behavior.
+With offline behavioral cloning, we can use demonstrations (`.demo` files)
+generated using the `Demonstration Recorder` as the dataset used to train a behavior.

 1. Choose an agent you would like to learn to imitate some set of demonstrations.
-2. Record a set of demonstration using the `Demonstration Recorder` (see above). For illustrative purposes we will refer to this file as `AgentRecording.demo`.
-3. Build the scene, assigning the agent a Learning Brain, and set the Brain to Control in the Broadcast Hub. For more information on Brains, see [here](Learning-Environment-Design-Brains.md).
+2. Record a set of demonstration using the `Demonstration Recorder` (see above).
+   For illustrative purposes we will refer to this file as `AgentRecording.demo`.
+3. Build the scene, assigning the agent a Learning Brain, and set the Brain to
+   Control in the Broadcast Hub. For more information on Brains, see
+   [here](Learning-Environment-Design-Brains.md).
 4. Open the `config/offline_bc_config.yaml` file.
-5. Modify the `demo_path` parameter in the file to reference the path to the demonstration file recorded in step 2. In our case this is: `./UnitySDK/Assets/Demonstrations/AgentRecording.demo`
-6. Launch `mlagent-learn`, providing `./config/offline_bc_config.yaml` as the config parameter, and include the `--run-id` and `--train` as usual. Provide your environment as the `--env` parameter if it has been compiled as standalone, or omit to train in the editor.
+5. Modify the `demo_path` parameter in the file to reference the path to the
+   demonstration file recorded in step 2. In our case this is:
+   `./UnitySDK/Assets/Demonstrations/AgentRecording.demo`
+6. Launch `mlagent-learn`, providing `./config/offline_bc_config.yaml`
+   as the config parameter, and include the `--run-id` and `--train` as usual.
+   Provide your environment as the `--env` parameter if it has been compiled
+   as standalone, or omit to train in the editor.
 7. (Optional) Observe training performance using Tensorboard.

-This will use the demonstration file to train a neural network driven agent to directly imitate the actions provided in the demonstration. The environment will launch and be used for evaluating the agent's performance during training.
+This will use the demonstration file to train a neural network driven agent
+to directly imitate the actions provided in the demonstration. The environment
+will launch and be used for evaluating the agent's performance during training.

 ### Online Training

-It is also possible to provide demonstrations in realtime during training, without pre-recording a demonstration file. The steps to do this are as follows:
+It is also possible to provide demonstrations in realtime during training,
+without pre-recording a demonstration file. The steps to do this are as follows:

 1. First create two Brains, one which will be the "Teacher," and the other which
    will be the "Student." We will assume that the names of the Brain
@@ -65,27 +92,27 @@ It is also possible to provide demonstrations in realtime during training, witho
 3. The "Student" Brain must be a **Learning Brain**.
 4. The Brain Parameters of both the "Teacher" and "Student" Brains must be
    compatible with the agent.
-5. Drag both the "Teacher" and "Student" Brain into the Academy's `Broadcast Hub`
+5. Drag both the "Teacher" and "Student" Brain into the Academy's `Broadcast Hub`
   and check the `Control` checkbox on the "Student" Brain.
-4. Link the Brains to the desired Agents (one Agent as the teacher and at least
+6. Link the Brains to the desired Agents (one Agent as the teacher and at least
   one Agent as a student).
-5. In `config/online_bc_config.yaml`, add an entry for the "Student" Brain. Set
-   the `trainer` parameter of this entry to `imitation`, and the
+7. In `config/online_bc_config.yaml`, add an entry for the "Student" Brain. Set
+   the `trainer` parameter of this entry to `online_bc`, and the
   `brain_to_imitate` parameter to the name of the teacher Brain: "Teacher".
   Additionally, set `batches_per_epoch`, which controls how much training to do
   each moment. Increase the `max_steps` option if you'd like to keep training
  the Agents for a longer period of time.
-6. Launch the training process with `mlagents-learn config/online_bc_config.yaml
+8. Launch the training process with `mlagents-learn config/online_bc_config.yaml
   --train --slow`, and press the :arrow_forward: button in Unity when the
   message _"Start training by pressing the Play button in the Unity Editor"_ is
   displayed on the screen
-7. From the Unity window, control the Agent with the Teacher Brain by providing
+9. From the Unity window, control the Agent with the Teacher Brain by providing
   "teacher demonstrations" of the behavior you would like to see.
-8. Watch as the Agent(s) with the student Brain attached begin to behave
+10. Watch as the Agent(s) with the student Brain attached begin to behave
   similarly to the demonstrations.
-9. Once the Student Agents are exhibiting the desired behavior, end the training
+11. Once the Student Agents are exhibiting the desired behavior, end the training
   process with `CTL+C` from the command line.
-10. Move the resulting `*.bytes` file into the `TFModels` subdirectory of the
+12. Move the resulting `*.bytes` file into the `TFModels` subdirectory of the
    Assets folder (or a subdirectory within Assets of your choosing) , and use
    with `Learning` Brain.
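Apart from re-wrapping long lines and renumbering the online-training steps (they now run 1 through 12 instead of repeating 4 and 5), the substantive change in this document is step 7: the `trainer` value for the "Student" Brain's entry in `config/online_bc_config.yaml` is now `online_bc` rather than `imitation`. A small sketch of updating that entry programmatically follows; the brain names, the numeric values, and the exact YAML layout are assumptions for illustration, and only the keys named in the doc (`trainer`, `brain_to_imitate`, `batches_per_epoch`, `max_steps`) come from the text:

    import yaml  # PyYAML, already a dependency of the trainers

    path = "config/online_bc_config.yaml"
    with open(path) as f:
        config = yaml.safe_load(f) or {}

    # Hypothetical entry for a student Brain named "Student".
    config["Student"] = {
        "trainer": "online_bc",         # was "imitation" before this hotfix
        "brain_to_imitate": "Teacher",  # name of the teacher Brain
        "batches_per_epoch": 5,         # how much training to do each moment (example value)
        "max_steps": 10000,             # raise this to keep training longer (example value)
    }

    with open(path, "w") as f:
        yaml.safe_dump(config, f, default_flow_style=False)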

ml-agents/mlagents/envs/rpc_communicator.py

Lines changed: 3 additions & 1 deletion
@@ -53,7 +53,9 @@ def create_server(self):
             self.server = grpc.server(ThreadPoolExecutor(max_workers=10))
             self.unity_to_external = UnityToExternalServicerImplementation()
             add_UnityToExternalServicer_to_server(self.unity_to_external, self.server)
-            self.server.add_insecure_port('localhost:' + str(self.port))
+            # Using unspecified address, which means that grpc is communicating on all IPs
+            # This is so that the docker container can connect.
+            self.server.add_insecure_port('[::]:' + str(self.port))
             self.server.start()
             self.is_open = True
         except:
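The functional change is the bind address: 'localhost' only accepts connections from the same host (and, under Docker, from the same container and network namespace), while the unspecified address '[::]' listens on every interface so a Unity process outside the trainer's container can still connect. A standalone sketch of the two bindings (the port is an arbitrary example; this is not the ml-agents communicator itself):

    from concurrent.futures import ThreadPoolExecutor
    import grpc  # grpcio

    port = 5005  # arbitrary example port

    server = grpc.server(ThreadPoolExecutor(max_workers=10))
    # Old behaviour: reachable only from the local host / network namespace.
    # server.add_insecure_port('localhost:' + str(port))
    # New behaviour: listen on all interfaces so clients outside a container can connect.
    server.add_insecure_port('[::]:' + str(port))
    server.start()
    server.stop(0)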

ml-agents/mlagents/trainers/buffer.py

Lines changed: 17 additions & 2 deletions
@@ -28,12 +28,27 @@ class AgentBufferField(list):
             AgentBufferField with the append method.
             """

+            def __init__(self):
+                self.padding_value = 0
+                super(Buffer.AgentBuffer.AgentBufferField, self).__init__()
+
             def __str__(self):
                 return str(np.array(self).shape)

+            def append(self, element, padding_value=0):
+                """
+                Adds an element to this list. Also lets you change the padding
+                type, so that it can be set on append (e.g. action_masks should
+                be padded with 1.)
+                :param element: The element to append to the list.
+                :param padding_value: The value used to pad when get_batch is called.
+                """
+                super(Buffer.AgentBuffer.AgentBufferField, self).append(element)
+                self.padding_value = padding_value
+
             def extend(self, data):
                 """
-                Ads a list of np.arrays to the end of the list of np.arrays.
+                Adds a list of np.arrays to the end of the list of np.arrays.
                 :param data: The np.array list to append.
                 """
                 self += list(np.array(data))
@@ -99,7 +114,7 @@ def get_batch(self, batch_size=None, training_length=1, sequential=True):
                         raise BufferException("The batch size and training length requested for get_batch where"
                                               " too large given the current number of data points.")
                     tmp_list = []
-                    padding = np.array(self[-1]) * 0
+                    padding = np.array(self[-1]) * self.padding_value
                     # The padding is made with zeros and its shape is given by the shape of the last element
                     for end in range(len(self), len(self) % training_length, -training_length)[:batch_size]:
                         tmp_list += [np.array(self[end - training_length:end])]
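Previously `get_batch` always padded short sequences with zeros shaped like the last element. For a field such as `action_mask`, zeros are the wrong neutral value (the new docstring notes action masks should be padded with 1), so `append` now records a per-field `padding_value` and `get_batch` uses it; the `ppo/trainer.py` change further down passes `padding_value=1` for action masks. Note that the in-code comment about the padding being "made with zeros" is now slightly stale. A simplified, self-contained sketch of the idea (this is not the ml-agents `Buffer` class):

    import numpy as np

    class PaddedField(list):
        """List-like field that remembers which value to pad with."""

        def __init__(self):
            super().__init__()
            self.padding_value = 0

        def append(self, element, padding_value=0):
            super().append(element)
            self.padding_value = padding_value

        def padding(self):
            # Shape taken from the last element, value from padding_value.
            return np.full_like(np.array(self[-1]), self.padding_value)

    masks = PaddedField()
    masks.append(np.array([1, 1, 0]), padding_value=1)
    print(masks.padding())  # [1 1 1]: padded with ones rather than zeros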

ml-agents/mlagents/trainers/learn.py

Lines changed: 20 additions & 10 deletions
@@ -6,8 +6,8 @@
 import numpy as np
 from docopt import docopt

-from .trainer_controller import TrainerController
-from .exception import TrainerError
+from mlagents.trainers.trainer_controller import TrainerController
+from mlagents.trainers.exception import TrainerError


 def run_training(sub_id, run_seed, run_options, process_queue):
@@ -107,13 +107,23 @@ def main():

     jobs = []
     run_seed = seed
-    for i in range(num_runs):
+
+    if num_runs == 1:
         if seed == -1:
             run_seed = np.random.randint(0, 10000)
-        process_queue = Queue()
-        p = Process(target=run_training, args=(i, run_seed, options, process_queue))
-        jobs.append(p)
-        p.start()
-        # Wait for signal that environment has successfully launched
-        while process_queue.get() is not True:
-            continue
+        run_training(0, run_seed, options, Queue())
+    else:
+        for i in range(num_runs):
+            if seed == -1:
+                run_seed = np.random.randint(0, 10000)
+            process_queue = Queue()
+            p = Process(target=run_training, args=(i, run_seed, options, process_queue))
+            jobs.append(p)
+            p.start()
+            # Wait for signal that environment has successfully launched
+            while process_queue.get() is not True:
+                continue
+
+# For python debugger to directly run this script
+if __name__ == "__main__":
+    main()
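Two things change here: the relative imports become absolute `mlagents.trainers` imports and a `__main__` guard is added so the module can be launched directly (for example under a debugger), and a single run (`num_runs == 1`) now executes in the main process instead of a spawned `Process`, which keeps breakpoints and `pdb` usable. A minimal, self-contained sketch of that dispatch pattern; the function bodies are stand-ins, not the ml-agents API:

    from multiprocessing import Process, Queue

    def run_training(sub_id, seed, queue):
        # Stand-in for the real trainer: signal that the "environment" launched.
        queue.put(True)
        print("run", sub_id, "with seed", seed)

    def launch(num_runs, seed):
        if num_runs == 1:
            # Stay in the main process so a debugger can step into run_training.
            run_training(0, seed, Queue())
            return
        # Otherwise start one subprocess per run and wait for each launch signal.
        jobs = []
        for i in range(num_runs):
            queue = Queue()
            proc = Process(target=run_training, args=(i, seed + i, queue))
            jobs.append(proc)
            proc.start()
            while queue.get() is not True:
                continue

    if __name__ == "__main__":
        launch(1, 1234)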

ml-agents/mlagents/trainers/policy.py

Lines changed: 1 addition & 0 deletions
@@ -179,6 +179,7 @@ def export_model(self):
                                   clear_devices=True, initializer_nodes='', input_saver='',
                                   restore_op_name='save/restore_all',
                                   filename_tensor_name='save/Const:0')
+        logger.info('Exported ' + self.model_path + '.bytes file')

     def _process_graph(self):
         """

ml-agents/mlagents/trainers/ppo/trainer.py

Lines changed: 1 addition & 1 deletion
@@ -224,7 +224,7 @@ def add_experiences(self, curr_all_info: AllBrainInfo, next_all_info: AllBrainIn
                                 epsilons[idx])
                         else:
                             self.training_buffer[agent_id]['action_mask'].append(
-                                stored_info.action_masks[idx])
+                                stored_info.action_masks[idx], padding_value=1)
                         a_dist = stored_take_action_outputs['log_probs']
                         value = stored_take_action_outputs['value']
                         self.training_buffer[agent_id]['actions'].append(actions[idx])

ml-agents/mlagents/trainers/trainer_controller.py

Lines changed: 33 additions & 13 deletions
@@ -6,6 +6,10 @@
 import glob
 import logging
 import shutil
+import sys
+if sys.platform.startswith('win'):
+    import win32api
+    import win32con

 import yaml
 import re
@@ -103,6 +107,7 @@ def __init__(self, env_path, run_id, save_freq, curriculum_folder,
         self.keep_checkpoints = keep_checkpoints
         self.trainers = {}
         self.seed = seed
+        self.global_step = 0
         np.random.seed(self.seed)
         tf.set_random_seed(self.seed)
         self.env = UnityEnvironment(file_name=env_path,
@@ -181,6 +186,23 @@ def _save_model(self,steps=0):
             self.trainers[brain_name].save_model()
         self.logger.info('Saved Model')

+    def _save_model_when_interrupted(self, steps=0):
+        self.logger.info('Learning was interrupted. Please wait '
+                         'while the graph is generated.')
+        self._save_model(steps)
+
+    def _win_handler(self, event):
+        """
+        This function gets triggered after ctrl-c or ctrl-break is pressed
+        under Windows platform.
+        """
+        if event in (win32con.CTRL_C_EVENT, win32con.CTRL_BREAK_EVENT):
+            self._save_model_when_interrupted(self.global_step)
+            self._export_graph()
+            sys.exit()
+            return True
+        return False
+
     def _export_graph(self):
         """
         Exports latest saved models to .bytes format for Unity embedding.
@@ -288,12 +310,14 @@ def start_learning(self):
         self._initialize_trainers(trainer_config)
         for _, t in self.trainers.items():
             self.logger.info(t)
-        global_step = 0 # This is only for saving the model
         curr_info = self._reset_env()
         if self.train_model:
             for brain_name, trainer in self.trainers.items():
                 trainer.write_tensorboard_text('Hyperparameters',
                                                trainer.parameters)
+        if sys.platform.startswith('win'):
+            # Add the _win_handler function to the windows console's handler function list
+            win32api.SetConsoleCtrlHandler(self._win_handler, True)
         try:
             while any([t.get_step <= t.get_max_steps \
                        for k, t in self.trainers.items()]) \
@@ -353,31 +377,27 @@ def start_learning(self):
                     # Write training statistics to Tensorboard.
                     if self.meta_curriculum is not None:
                         trainer.write_summary(
-                            global_step,
+                            self.global_step,
                             lesson_num=self.meta_curriculum
                                 .brains_to_curriculums[brain_name]
                                 .lesson_num)
                     else:
-                        trainer.write_summary(global_step)
+                        trainer.write_summary(self.global_step)
                     if self.train_model \
                             and trainer.get_step <= trainer.get_max_steps:
                         trainer.increment_step_and_update_last_reward()
-                global_step += 1
-                if global_step % self.save_freq == 0 and global_step != 0 \
+                self.global_step += 1
+                if self.global_step % self.save_freq == 0 and self.global_step != 0 \
                         and self.train_model:
                     # Save Tensorflow model
-                    self._save_model(steps=global_step)
+                    self._save_model(steps=self.global_step)
                 curr_info = new_info
             # Final save Tensorflow model
-            if global_step != 0 and self.train_model:
-                self._save_model(steps=global_step)
+            if self.global_step != 0 and self.train_model:
+                self._save_model(steps=self.global_step)
         except KeyboardInterrupt:
-            print('--------------------------Now saving model--------------'
-                  '-----------')
             if self.train_model:
-                self.logger.info('Learning was interrupted. Please wait '
-                                 'while the graph is generated.')
-                self._save_model(steps=global_step)
+                self._save_model_when_interrupted(steps=self.global_step)
             pass
         self.env.close()
         if self.train_model:
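The controller now keeps the step counter on `self` (`self.global_step`) so an interrupt handler can reach it, and on Windows it registers a console control handler through pywin32, presumably because Ctrl-C may not surface as a Python `KeyboardInterrupt` while a blocking native call (such as a TensorFlow session step) is in progress. Both the Windows handler and the `KeyboardInterrupt` path now save via the shared `_save_model_when_interrupted` helper. A stripped-down sketch of the registration pattern (requires the `pywin32` package on Windows; the prints are stand-ins for saving and exporting the model):

    import sys
    import time

    if sys.platform.startswith('win'):
        import win32api
        import win32con

        def _win_handler(event):
            # Windows calls this on Ctrl-C / Ctrl-Break.
            if event in (win32con.CTRL_C_EVENT, win32con.CTRL_BREAK_EVENT):
                print('Interrupted: saving the model before exiting...')
                sys.exit()
            return False

        # True prepends the handler to the console's list of control handlers.
        win32api.SetConsoleCtrlHandler(_win_handler, True)

    try:
        for _ in range(60):  # stand-in for the training loop
            time.sleep(1)
    except KeyboardInterrupt:
        print('Interrupted: saving the model before exiting...')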
