 -- 'x'      : the initial point
 -- 'config` : a table with configuration parameters for the optimizer
 -- 'config.learningRate'      : learning rate
+-- `config.learningRateDecay` : learning rate decay
 -- 'config.beta1'             : first moment coefficient
 -- 'config.beta2'             : second moment coefficient
 -- 'config.epsilon'           : for numerical stability
@@ -25,6 +26,7 @@ function optim.adam(opfunc, x, config, state)
    local config = config or {}
    local state = state or config
    local lr = config.learningRate or 0.001
+   local lrd = config.learningRateDecay or 0

    local beta1 = config.beta1 or 0.9
    local beta2 = config.beta2 or 0.999
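The two hunks above document the new `config.learningRateDecay` field and give it a default of 0 (no decay). A minimal usage sketch, not part of the commit, of a config table that sets it; the numeric values are illustrative only:

local adamConfig = {
   learningRate      = 1e-3,   -- matches the default above
   learningRateDecay = 1e-2,   -- illustrative decay value (assumption)
   beta1             = 0.9,
   beta2             = 0.999,
   epsilon           = 1e-8,
}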
@@ -48,6 +50,9 @@ function optim.adam(opfunc, x, config, state)
    -- A tmp tensor to hold the sqrt(v) + epsilon
    state.denom = state.denom or x.new(dfdx:size()):zero()

+   -- (3) learning rate decay (annealing)
+   local clr = lr / (1 + state.t*lrd)
+
    state.t = state.t + 1

    -- Decay the first and second moment running average coefficient
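The decayed rate is computed from `state.t` before the counter is incremented, so the first call still uses the base learning rate and the rate then falls off as 1/(1 + t*lrd). A standalone sketch of that schedule (the helper name and values are illustrative, not from the commit):

local function annealedRate(lr, lrd, t)
   return lr / (1 + t*lrd)
end

print(annealedRate(1e-3, 1e-2, 0))     -- 0.001   (first step, undecayed)
print(annealedRate(1e-3, 1e-2, 100))   -- 0.0005  (halved after 100 steps)
print(annealedRate(1e-3, 1e-2, 1000))  -- ~9.1e-05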
@@ -58,8 +63,8 @@ function optim.adam(opfunc, x, config, state)

    local biasCorrection1 = 1 - beta1^state.t
    local biasCorrection2 = 1 - beta2^state.t
-   local stepSize = lr * math.sqrt(biasCorrection2)/biasCorrection1
-   -- (3) update x
+   local stepSize = clr * math.sqrt(biasCorrection2)/biasCorrection1
+   -- (4) update x
    x:addcdiv(-stepSize, state.m, state.denom)

    -- return x*, f(x) before optimization
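With the bias-corrected step size now built from the annealed rate, a call from a training loop could look like the following sketch. It assumes the usual torch/optim convention that the closure returns f(x) and df/dx; `feval`, `params`, and `nSteps` are placeholders, not names from the commit:

require 'optim'

local adamConfig = {learningRate = 1e-3, learningRateDecay = 1e-2}
local adamState  = {}

for step = 1, nSteps do
   -- params is updated in place; the loss before the update is returned in a table
   local _, fs = optim.adam(feval, params, adamConfig, adamState)
   -- adamState.t grows by one per call, so the effective rate is lr/(1 + t*lrd)
end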