@@ -79,7 +79,7 @@ def to_one_hot(i, n_classes=None):
## Choose an action by greedily (with e chance of random action) from the Q-network
a, allQ = sess.run([predict, y], feed_dict={inputs: [to_one_hot(s, 16)]})
## e-Greedy Exploration !!! sample random action
- if np.random.rand(1) < e:
+ if np.random.rand(1) < e:
    a[0] = env.action_space.sample()
## Get new state and reward from environment
s1, r, d, _ = env.step(a[0])
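For readers skimming the hunk, the ε-greedy rule above can be sketched on its own, outside the TensorFlow session. This is only an illustrative sketch: `epsilon_greedy`, `q_values`, and `n_actions` are hypothetical names, not identifiers from the tutorial, and the Q row is made up.

```python
import numpy as np

def epsilon_greedy(q_values, n_actions, e):
    """With probability e take a random action, otherwise act greedily on Q."""
    if np.random.rand() < e:               # explore
        return np.random.randint(n_actions)
    return int(np.argmax(q_values))        # exploit: argmax over current Q estimates

# 4 actions as in FrozenLake, made-up Q row, 10% exploration.
print(epsilon_greedy(np.array([0.1, 0.5, 0.2, 0.0]), n_actions=4, e=0.1))
```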
@@ -88,7 +88,7 @@ def to_one_hot(i, n_classes=None):
## Obtain maxQ' and set our target value for chosen action.
maxQ1 = np.max(Q1)  # in Q-Learning, policy is greedy, so we use "max" to select the next action.
targetQ = allQ
- targetQ[0, a[0]] = r + lambd * maxQ1
+ targetQ[0, a[0]] = r + lambd * maxQ1
## Train network using target and predicted Q values
# it is not real target Q value, it is just an estimation,
# but check the Q-Learning update formula:
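The hunk cuts off just before the formula that the last comment points to. As a hedged aside, `targetQ[0, a[0]] = r + lambd * maxQ1` builds the standard Q-learning (TD) target r + γ·max_a' Q(s', a') for the taken action only, and the squared difference between the network's Q(s, a) and that target is what the training step shrinks. A minimal sketch with made-up numbers, reusing the tutorial's variable names (`allQ`, `a`, `r`, `lambd`, `maxQ1`):

```python
import numpy as np

allQ = np.array([[0.10, 0.25, 0.05, 0.00]])  # network output Q(s, ·) for the current state
a = [1]                                      # chosen action
r = 0.0                                      # reward returned by env.step
lambd = 0.99                                 # discount factor gamma
maxQ1 = 0.30                                 # max_a' Q(s', a') from the next state

targetQ = allQ.copy()
targetQ[0, a[0]] = r + lambd * maxQ1         # TD target for the taken action only
td_error = targetQ[0, a[0]] - allQ[0, a[0]]  # the quantity the squared loss drives to zero
print(targetQ, td_error)
```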
@@ -99,7 +99,7 @@ def to_one_hot(i, n_classes=None):
rAll += r
s = s1
## Reduce chance of random action if an episode is done.
- if d == True:
+ if d == True:
    e = 1. / ((i / 50) + 10)  # reduce e, GLIE: Greedy in the limit with infinite Exploration
    break
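The schedule `e = 1. / ((i / 50) + 10)` only decays when an episode terminates, giving GLIE-style behaviour: exploration starts around 0.1 and shrinks slowly with the episode index `i`. A quick, purely illustrative sketch of how it evolves:

```python
# Illustrative only: exploration rate at a few episode indices.
for i in [0, 50, 100, 500, 1000, 5000]:
    e = 1. / ((i / 50) + 10)
    print(f"episode {i:5d}: e = {e:.4f}")
```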