
Commit 153684e: "update before add batch"
Parent: 28e956f

48 files changed (+4578, -446 lines)

Agents/MAD5PG/actors_test.py (+4 -4)

@@ -2,9 +2,9 @@
 sys.path.append(r"/home/neardws/Documents/Game-Theoretic-Deep-Reinforcement-Learning/")
 from environment_loop import EnvironmentLoop
 from absl.testing import absltest
-from Agents.MAD4PG import actors
+from Agents.MAD5PG import actors
 from Environment.environment import make_environment_spec
-from Agents.MAD4PG.networks import make_policy_network
+from Agents.MAD5PG.networks import make_policy_network
 from Experiment.make_environment import get_default_environment
 
 class ActorTest(absltest.TestCase):
@@ -13,11 +13,11 @@ class ActorTest(absltest.TestCase):
     def test_feedforward(self):
 
         time_slots, task_list, vehicle_list, edge_list, distance_matrix, channel_condition_matrix, \
-            vehicle_index_within_edges, environment_config, environment = get_default_environment()
+            vehicle_index_within_edges, environment_config, environment = get_default_environment(for_mad5pg=True)
 
         env_spec = make_environment_spec(environment)
 
-        policy_networks = [make_policy_network(env_spec.edge_actions) for _ in range(environment_config.edge_number)]
+        policy_networks = make_policy_network(env_spec.edge_actions)
 
         actor = actors.FeedForwardActor(
             policy_networks=policy_networks,

Agents/MAD5PG/agent.py (+20 -16)

@@ -4,7 +4,7 @@
 
 import copy
 import dataclasses
-from typing import Callable, Iterator, List, Optional, Union, Sequence
+from typing import Iterator, List, Optional, Union, Sequence
 import acme
 from acme import adders
 from acme import core
@@ -23,7 +23,7 @@
 import sonnet as snt
 import launchpad as lp
 import functools
-import dm_env
+from Utilities.FileOperator import load_obj
 from Agents.MAD5PG.networks import make_default_MAD3PGNetworks, MAD3PGNetwork
 from environment_loop import EnvironmentLoop
 
@@ -57,17 +57,17 @@ class MAD3PGConfig:
         accelerator: 'TPU', 'GPU', or 'CPU'. If omitted, the first available accelerator type from ['TPU', 'GPU', 'CPU'] will be selected.
     """
     discount: float = 0.996
-    batch_size: int = 512
+    batch_size: int = 256
     prefetch_size: int = 4
     target_update_period: int = 100
-    variable_update_period: int = 1000
+    variable_update_period: int = 500
     policy_optimizers: Optional[snt.Optimizer] = None
     critic_optimizers: Optional[snt.Optimizer] = None
     min_replay_size: int = 1000
    max_replay_size: int = 1000000
-    samples_per_insert: Optional[float] = 1.0
-    n_step: int = 1
-    sigma: float = 0.3
+    samples_per_insert: Optional[float] = 32.0
+    n_step: int = 5
+    sigma: float = 0.5
     clipping: bool = True
     replay_table_name: str = reverb_adders.DEFAULT_PRIORITY_TABLE
     counter: Optional[counting.Counter] = None
@@ -104,6 +104,7 @@ def __init__(
         if networks is None:
             online_networks = make_default_MAD3PGNetworks(
                 action_spec=environment_spec.edge_actions,
+                sigma=self._config.sigma,
             )
         else:
             online_networks = networks
@@ -118,7 +119,7 @@ def __init__(
         target_networks.init(self._environment_spec)
 
         # Create the behavior policy.
-        policy_networks = online_networks.make_policy(self._environment_spec, self._config.sigma)
+        policy_networks = online_networks.make_policy()
 
         # Create the replay server and grab its address.
         replay_tables = self.make_replay_tables(self._environment_spec)
@@ -289,7 +290,7 @@ class MultiAgentDistributedDDPG:
     def __init__(
         self,
         config: MAD3PGConfig,
-        environment_factory: Callable[[bool], dm_env.Environment],
+        environment_file_name: str,
         environment_spec,
         networks: Optional[MAD3PGNetwork] = None,
         num_actors: int = 1,
@@ -311,11 +312,14 @@ def __init__(
         self._log_every = log_every
         self._networks = networks
         self._environment_spec = environment_spec
-        self._environment_factory = environment_factory
+        self._environment_file_name = environment_file_name
         # Create the agent.
+
+        environment = load_obj(environment_file_name)
+
         self._agent = MAD3PGAgent(
             config=self._config,
-            environment=self._environment_factory(False),
+            environment=environment,
             environment_spec=self._environment_spec,
             networks=self._networks,
         )
@@ -379,10 +383,10 @@ def actor(
 
         networks.init(self._environment_spec)
 
-        policy_networks = networks.make_policy(environment_spec=self._environment_spec, sigma=self._config.sigma)
+        policy_networks = networks.make_policy()
 
         # Create the environment
-        environment = self._environment_factory(False)
+        environment = load_obj(self._environment_file_name)
 
         # Create the agent.
         actor = self._agent.make_actor(
@@ -395,7 +399,7 @@ def actor(
         counter = counting.Counter(counter, 'actor')
         logger = loggers.make_default_logger(
             'actor',
-            save_data=False,
+            save_data=True,
             time_delta=self._log_every,
             steps_key='actor_steps')
 
@@ -420,10 +424,10 @@ def evaluator(
         networks = self._networks
         networks.init(self._environment_spec)
 
-        policy_networks = networks.make_policy(self._environment_spec)
+        policy_networks = networks.make_policy()
 
         # Make the environment
-        environment = self._environment_factory(True)
+        environment = load_obj(self._environment_file_name)
 
         # Create the agent.
         actor = self._agent.make_actor(
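
Taken together, the agent.py changes mean a caller now builds the distributed agent from a serialized environment file (loaded with load_obj) instead of an environment factory, and the exploration sigma is baked into the networks at construction time rather than passed to make_policy(). Below is a minimal usage sketch under those assumptions; the pickle path is hypothetical and load_obj is only assumed to unpickle a previously saved environment object.

# Sketch only: illustrative path and values, not part of the commit.
from Utilities.FileOperator import load_obj
from Environment.environment import make_environment_spec
from Agents.MAD5PG.agent import MultiAgentDistributedDDPG, MAD3PGConfig

environment_file_name = "/tmp/environment.pkl"   # hypothetical path
environment = load_obj(environment_file_name)    # assumed to unpickle a saved environment
environment_spec = make_environment_spec(environment)

config = MAD3PGConfig(
    batch_size=256,              # new default (was 512)
    variable_update_period=500,  # new default (was 1000)
    samples_per_insert=32.0,     # new default (was 1.0)
    n_step=5,                    # new default (was 1)
    sigma=0.5,                   # now forwarded to make_default_MAD3PGNetworks
)

agent = MultiAgentDistributedDDPG(
    config=config,
    environment_file_name=environment_file_name,  # replaces environment_factory
    environment_spec=environment_spec,
    num_actors=1,
)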

Agents/MAD5PG/agent_test.py (+3 -3)

@@ -6,8 +6,8 @@
 import launchpad as lp
 from absl.testing import absltest
 from Environment.environment import vehicularNetworkEnv, make_environment_spec
-from Agents.MAD4PG.networks import make_default_MAD3PGNetworks
-from Agents.MAD4PG.agent import MultiAgentDistributedDDPG, MAD3PGConfig
+from Agents.MAD5PG.networks import make_default_MAD3PGNetworks
+from Agents.MAD5PG.agent import MultiAgentDistributedDDPG, MAD3PGConfig
 from Experiment.make_environment import get_default_environment
 
@@ -18,7 +18,7 @@ def test_control_suite(self):
         """Tests that the agent can run on the control suite without crashing."""
 
         time_slots, task_list, vehicle_list, edge_list, distance_matrix, channel_condition_matrix, \
-            vehicle_index_within_edges, environment_config, environment = get_default_environment()
+            vehicle_index_within_edges, environment_config, environment = get_default_environment(for_mad5pg=True)
 
         spec = make_environment_spec(environment)
 

Agents/MAD5PG/learning.py (+38 -20)

@@ -154,11 +154,11 @@ def __init__(
             'critic_optimizer': self._critic_optimizers,
             'num_steps': self._num_steps,
         })
-        object_to_save = dict()
-        object_to_save['policy'] = self._policy_networks
-        object_to_save['critic_mean'] = snt.Sequential([self._critic_networks, acme_nets.StochasticMeanHead()])
-        self._snapshotter = tf2_savers.Snapshotter(
-            objects_to_save=object_to_save)
+        # object_to_save = dict()
+        # object_to_save['policy'] = self._policy_networks
+        # object_to_save['critic_mean'] = snt.Sequential([self._critic_networks, acme_nets.StochasticMeanHead()])
+        # self._snapshotter = tf2_savers.Snapshotter(
+        #     objects_to_save=object_to_save)
 
         # Do not record timestamps until after the first learning step is done.
         # This is to avoid including the time it takes for actors to come online and
@@ -195,7 +195,6 @@ def _step(self, sample) -> Dict[str, tf.Tensor]:
             # a_t_list.append(a_t)
 
             # edge_a_t = tf.concat([a_t_list[i] for i in range(len(self._target_observation_networks))], axis=1)
-
         a_t_list = []
         for i in range(self._edge_number):
             observation = transitions.next_observation[:, i, :]
@@ -205,7 +204,7 @@ def _step(self, sample) -> Dict[str, tf.Tensor]:
             a_t_list.append(a_t)
 
         edge_next_a_t = tf.concat([a_t_list[i] for i in range(self._edge_number)], axis=1)
-
+        edge_next_a_t = tf.reshape(edge_next_a_t, [batch_size, self._edge_number, self._edge_action_size])
 
         for edge_index in range(self._edge_number):
 
@@ -218,8 +217,20 @@ def _step(self, sample) -> Dict[str, tf.Tensor]:
             o_t = tree.map_structure(tf.stop_gradient, o_t)
 
             # Critic learning.
-            q_tm1 = self._critic_networks(o_tm1, tf.reshape(transitions.action, shape=[batch_size, -1]))
-            q_t = self._target_critic_networks(o_t, tf.reshape(edge_next_a_t, shape=[batch_size, -1]))
+            critic_actions = tf2_utils.batch_concat([
+                transitions.action[:, : edge_index, :],
+                transitions.action[:, edge_index + 1 :, :],
+                transitions.action[:, edge_index, :],
+            ])
+            q_tm1 = self._critic_networks(o_tm1, tf.reshape(critic_actions, shape=[batch_size, -1]))
+
+
+            critic_actions = tf2_utils.batch_concat([
+                edge_next_a_t[:, : edge_index, :],
+                edge_next_a_t[:, edge_index + 1 :, :],
+                edge_next_a_t[:, edge_index, :],
+            ])
+            q_t = self._target_critic_networks(o_t, tf.reshape(critic_actions, shape=[batch_size, -1]))
 
             # Critic loss.
             critic_loss = losses.categorical(q_tm1, transitions.reward[:, edge_index],
@@ -231,26 +242,33 @@ def _step(self, sample) -> Dict[str, tf.Tensor]:
             critic_losses.append(critic_loss)
 
             # Actor learning
-            if edge_index == 0:
-                dpg_a_t = self._policy_networks(o_t)
-            else:
-                dpg_a_t = tf.reshape(edge_next_a_t, shape=[batch_size, self._edge_number, self._edge_action_size])[:, 0, :]
-                for i in range(self._edge_number):
-                    if i != 0 and i != edge_index:
-                        dpg_a_t = tf.concat([dpg_a_t, tf.reshape(edge_next_a_t, shape=[batch_size, self._edge_number, self._edge_action_size])[:, i, :]], axis=1)
-                    elif i != 0 and i == edge_index:
-                        dpg_a_t = tf.concat([dpg_a_t, self._policy_networks(o_t)], axis=1)
+            policy_a_t = self._policy_networks(o_t)
+
+            dpg_a_t = tf2_utils.batch_concat([
+                edge_next_a_t[:, : edge_index, :],
+                edge_next_a_t[:, edge_index + 1 :, :],
+                policy_a_t,
+            ])
+
+            # if edge_index == 0:
+            #     dpg_a_t = policy_a_t
+            # else:
+            #     dpg_a_t = edge_next_a_t[:, 0, :]
+            #     for i in range(self._edge_number):
+            #         if i != 0 and i != edge_index:
+            #             dpg_a_t = tf.concat([dpg_a_t, edge_next_a_t[:, i, :]], axis=1)
+            #         elif i != 0 and i == edge_index:
+            #             dpg_a_t = tf.concat([dpg_a_t, policy_a_t], axis=1)
 
             dpg_z_t = self._critic_networks(o_t, dpg_a_t)
             dpg_q_t = dpg_z_t.mean()
-
             # Actor loss. If clipping is true use dqda clipping and clip the norm.
             dqda_clipping = 1.0 if self._clipping else None
             # myapp.debug(f"dpg_q_t: {np.array(dpg_q_t)}")
             # myapp.debug(f"dpg_a_t: {np.array(dpg_a_t)}")
             policy_loss = losses.dpg(
                 dpg_q_t,
-                dpg_a_t,
+                policy_a_t,
                 tape=tape,
                 dqda_clipping=dqda_clipping,
                 clip_norm=self._clipping)
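
The key change in the learning step is the per-edge action layout: when updating edge edge_index, the actions of all edges are concatenated with that edge's own action placed last, and losses.dpg now receives policy_a_t so gradients flow only through the updated edge's policy output. A standalone sketch of that reordering follows, with toy shapes chosen purely for illustration (batch_size, edge_number, and edge_action_size here are made up).

# Illustrative shapes only; not taken from the repository's configuration.
import tensorflow as tf
from acme.tf import utils as tf2_utils

batch_size, edge_number, edge_action_size = 4, 3, 2
actions = tf.random.uniform([batch_size, edge_number, edge_action_size])

edge_index = 1  # the edge whose critic/policy is being updated
reordered = tf2_utils.batch_concat([
    actions[:, :edge_index, :],       # actions of edges before edge_index
    actions[:, edge_index + 1:, :],   # actions of edges after edge_index
    actions[:, edge_index, :],        # this edge's own action, placed last
])
# batch_concat flattens each piece per batch element and concatenates them,
# so the result has shape [batch_size, edge_number * edge_action_size].
assert reordered.shape == (batch_size, edge_number * edge_action_size)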

Agents/MAD5PG/multiplexers.py (new file, +80)

@@ -0,0 +1,80 @@
+# python3
+# Copyright 2018 DeepMind Technologies Limited. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Multiplexers are networks that take multiple inputs."""
+
+from typing import Callable, Optional, Union
+
+from acme import types
+from acme.tf import utils as tf2_utils
+
+import sonnet as snt
+import tensorflow as tf
+import tensorflow_probability as tfp
+
+tfd = tfp.distributions
+TensorTransformation = Union[snt.Module, Callable[[types.NestedTensor],
+                                                  tf.Tensor]]
+
+
+class CriticMultiplexer(snt.Module):
+  """Module connecting a critic torso to (transformed) observations/actions.
+
+  This takes as input a `critic_network`, an `observation_network`, and an
+  `action_network` and returns another network whose outputs are given by
+  `critic_network(observation_network(o), action_network(a))`.
+
+  The observations and actions passed to this module are assumed to have a
+  batch dimension that match.
+
+  Notes:
+  - Either the `observation_` or `action_network` can be `None`, in which case
+    the observation or action, resp., are passed to the critic network as is.
+  - If all `critic_`, `observation_` and `action_network` are `None`, this
+    module reduces to a simple `tf2_utils.batch_concat()`.
+  """
+
+  def __init__(self,
+               critic_network: Optional[TensorTransformation] = None,
+               observation_network: Optional[TensorTransformation] = None,
+               action_network: Optional[TensorTransformation] = None):
+    self._critic_network = critic_network
+    self._observation_network = observation_network
+    self._action_network = action_network
+    super().__init__(name='critic_multiplexer')
+
+  def __call__(self,
+               observation: types.NestedTensor,
+               action: types.NestedTensor) -> tf.Tensor:
+
+    # Maybe transform observations and actions before feeding them on.
+    if self._observation_network:
+      observation = self._observation_network(observation)
+    if self._action_network:
+      action = self._action_network(action)
+
+    if hasattr(observation, 'dtype') and hasattr(action, 'dtype'):
+      if observation.dtype != action.dtype:
+        # Observation and action must be the same type for concat to work
+        action = tf.cast(action, observation.dtype)
+
+    # Concat observations and actions, with one batch dimension.
+    outputs = tf2_utils.batch_concat([observation, action])
+
+    # Maybe transform output before returning.
+    if self._critic_network:
+      outputs = self._critic_network(outputs)
+
+    return outputs
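
The new module mirrors Acme's CriticMultiplexer (note the DeepMind license header) vendored into Agents/MAD5PG. A small usage sketch with arbitrary sizes, showing the two paths described in the docstring: pure concatenation, and feeding the concatenation through a critic torso.

# Illustrative only; tensor sizes and the MLP layout are arbitrary choices.
import sonnet as snt
import tensorflow as tf
from Agents.MAD5PG.multiplexers import CriticMultiplexer

observation = tf.ones([8, 10])  # [batch, obs_dim]
action = tf.ones([8, 3])        # [batch, act_dim]

plain = CriticMultiplexer()
print(plain(observation, action).shape)  # (8, 13): batch_concat of obs and action

with_torso = CriticMultiplexer(critic_network=snt.nets.MLP([64, 1]))
print(with_torso(observation, action).shape)  # (8, 1)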
