import copy
import pylab
import random
import numpy as np
from environment import Env
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential

EPISODES = 1000
# DeepSARSA agent for the grid world example.
# A neural network is used as the Q-function approximator.
class DeepSARSAgent:
    def __init__(self):
        self.load_model = False
        # actions the agent can take
        self.action_space = [0, 1, 2, 3, 4]
        # sizes of the state and action spaces
        self.action_size = len(self.action_space)
        self.state_size = 15
        self.discount_factor = 0.99
        self.learning_rate = 0.001

        self.epsilon = 1.  # exploration rate
        self.epsilon_decay = .9999
        self.epsilon_min = 0.01
        self.model = self.build_model()

        if self.load_model:
            self.epsilon = 0.05
            self.model.load_weights('./save_model/deep_sarsa_trained.h5')
    # approximate the Q function with a neural network:
    # the state is the input, and the Q value of each action is the output
    def build_model(self):
        model = Sequential()
        model.add(Dense(30, input_dim=self.state_size, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.summary()
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model
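    # Shape note: with input_dim=15 and a final Dense(5) layer, model.predict
    # expects a (1, 15) state array and returns a (1, 5) array of Q values,
    # one per action in self.action_space.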
    # choose an action with an epsilon-greedy policy
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            # explore: act randomly
            return random.randrange(self.action_size)
        else:
            # exploit: predict the Q values of the given state
            # and take the action with the highest value
            state = np.float32(state)
            q_values = self.model.predict(state)
            return np.argmax(q_values[0])
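    # Since epsilon starts at 1.0 and is multiplied by epsilon_decay on every
    # training step (down to epsilon_min = 0.01), behaviour shifts gradually
    # from mostly random exploration to mostly greedy action selection.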
    def train_model(self, state, action, reward, next_state, next_action, done):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

        state = np.float32(state)
        next_state = np.float32(next_state)
        target = self.model.predict(state)[0]
        # SARSA is on-policy: the target uses the Q value of the action a'
        # actually chosen at s', not the maximum Q value over all actions
        if done:
            target[action] = reward
        else:
            target[action] = (reward + self.discount_factor *
                              self.model.predict(next_state)[0][next_action])

        # fit the network so its prediction for the taken action
        # moves toward the SARSA target
        target = np.reshape(target, [1, 5])
        self.model.fit(state, target, epochs=1, verbose=0)
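    # For reference, the update target computed above is the SARSA target
    #     r + gamma * Q(s', a')
    # whereas Q-learning would instead use
    #     r + gamma * max_a Q(s', a).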
if __name__ == "__main__":
    env = Env()
    agent = DeepSARSAgent()

    global_step = 0
    scores, episodes = [], []

    for e in range(EPISODES):
        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1, 15])

        while not done:
            global_step += 1

            # choose an action, take one step in the environment,
            # then choose the next action needed for the SARSA update
            action = agent.get_action(state)
            next_state, reward, done = env.step(action)
            next_state = np.reshape(next_state, [1, 15])
            next_action = agent.get_action(next_state)

            # train on every time step (online update)
            agent.train_model(state, action, reward, next_state, next_action,
                              done)
            score += reward
            state = copy.deepcopy(next_state)

            if done:
                scores.append(score)
                episodes.append(e)
                pylab.plot(episodes, scores, 'b')
                pylab.savefig("./save_graph/deep_sarsa_.png")
                print("episode:", e, "  score:", score, "  global_step:",
                      global_step, "  epsilon:", agent.epsilon)

        if e % 100 == 0:
            agent.model.save_weights("./save_model/deep_sarsa.h5")
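# Note: the script assumes the ./save_graph and ./save_model directories
# already exist (pylab.savefig and model.save_weights will raise an error
# otherwise), and that Env is provided by environment.py alongside this file.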