EPS = 1e-8  # numerical residual
MODEL_PATH = 'model/ppo_multi'
NUM_WORKERS = 2  # or: mp.cpu_count()
- ACTION_RANGE = 2.  # if unnormalized, normalized action range should be 1.
+ ACTION_RANGE = 1.  # if unnormalized, normalized action range should be 1.
METHOD = [
    dict(name='kl_pen', kl_target=0.01, lam=0.5),  # KL penalty
    dict(name='clip', epsilon=0.2),  # Clipped surrogate objective, find this is better
][0]  # choose the method for optimization

############################### PPO ####################################
+
+ class AddBias(nn.Module):
+     def __init__(self, bias):
+         super(AddBias, self).__init__()
+         self._bias = nn.Parameter(bias.unsqueeze(1))
+
+     def forward(self, x):
+         if x.dim() == 2:
+             bias = self._bias.t().view(1, -1)
+         else:
+             bias = self._bias.t().view(1, -1, 1, 1)
+
+         return x + bias
+
+
class ValueNetwork(nn.Module):
    def __init__(self, state_dim, hidden_dim, init_w=3e-3):
        super(ValueNetwork, self).__init__()
@@ -104,12 +119,10 @@ def __init__(self, num_inputs, num_actions, hidden_dim, action_range=1., init_w=
        # self.linear4 = nn.Linear(hidden_dim, hidden_dim)

        self.mean_linear = nn.Linear(hidden_dim, num_actions)
-         self.mean_linear.weight.data.uniform_(-init_w, init_w)
-         self.mean_linear.bias.data.uniform_(-init_w, init_w)
-
-         self.log_std_linear = nn.Linear(hidden_dim, num_actions)
-         self.log_std_linear.weight.data.uniform_(-init_w, init_w)
-         self.log_std_linear.bias.data.uniform_(-init_w, init_w)
+         # implementation 1
+         # self.log_std_linear = nn.Linear(hidden_dim, num_actions)
+         # implementation 2: not dependent on latent features, reference: https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/distributions.py
+         self.log_std = AddBias(torch.zeros(num_actions))

        self.num_actions = num_actions
        self.action_range = action_range
@@ -122,8 +135,15 @@ def forward(self, state):
        # x = F.relu(self.linear4(x))

        mean = self.action_range * F.tanh(self.mean_linear(x))
-         log_std = self.log_std_linear(x)
-         log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max)
+         # implementation 1
+         # log_std = self.log_std_linear(x)
+         # log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max)
+
+         # implementation 2
+         zeros = torch.zeros(mean.size())
+         if state.is_cuda:
+             zeros = zeros.cuda()
+         log_std = self.log_std(zeros)

        return mean, log_std
@@ -396,7 +416,7 @@ def main():
    np.random.seed(RANDOMSEED)
    torch.manual_seed(RANDOMSEED)

-     env = gym.make(ENV_NAME).unwrapped
+     env = NormalizedActions(gym.make(ENV_NAME).unwrapped)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
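Note on the change above: with the AddBias module, the log-std becomes a learnable parameter that does not depend on the network's latent features (following the ikostrikov reference linked in the diff). The short sketch below is not part of this commit; the batch size, action dimension, and variable names are illustrative assumptions. It only shows how such a state-independent log-std is typically combined with the policy mean to form a Gaussian action distribution.

# Illustrative sketch only -- not part of this commit.
import torch
import torch.nn as nn
from torch.distributions import Normal


class AddBias(nn.Module):
    """Learnable bias; here it stores a state-independent log-std."""
    def __init__(self, bias):
        super(AddBias, self).__init__()
        self._bias = nn.Parameter(bias.unsqueeze(1))

    def forward(self, x):
        # For a 2-D input (batch, num_actions) the bias is broadcast over the batch.
        return x + self._bias.t().view(1, -1)


num_actions = 1                                    # assumption: 1-D action, as in Pendulum
log_std_layer = AddBias(torch.zeros(num_actions))

mean = torch.tanh(torch.randn(4, num_actions))     # stand-in for the mean head output, batch of 4
log_std = log_std_layer(torch.zeros_like(mean))    # same log-std for every state in the batch
dist = Normal(mean, log_std.exp())

action = dist.sample()
log_prob = dist.log_prob(action)
print(action.shape, log_prob.shape)                # torch.Size([4, 1]) torch.Size([4, 1])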