
Commit f2f6665

Provide AD gradient for MLE/MAP (#1369)
* Use in-place gradients
* Fixing tests, tidying things up
* Increment patch version
* Change version to 0.14.0, address comments
* Remove hack to fix 2nd order optimizers
* Remove redundant FG function
* Add contexts to gradient_logp for Zygote and ReverseDiff
* One day I'll fix all the files at once
1 parent 43e2f20 commit f2f6665
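
In practice, this change lets gradient-based optimizers use the AD gradient of the log density for MLE/MAP mode estimation. A minimal usage sketch, assuming Turing 0.14 with Optim.jl loaded; the model and data below are illustrative only and are not part of this commit:

using Turing, Optim

# Illustrative model (hypothetical, not from this commit).
@model demo(x) = begin
    σ ~ InverseGamma(2, 3)
    μ ~ Normal(0, sqrt(σ))
    for i in eachindex(x)
        x[i] ~ Normal(μ, sqrt(σ))
    end
end

model = demo([1.5, 2.0, 0.5])

# With the new fgh! interface, LBFGS receives the AD gradient of the
# negative log density instead of a finite-difference approximation.
mle_estimate = optimize(model, MLE(), LBFGS())
map_estimate = optimize(model, MAP(), LBFGS())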

File tree: 6 files changed, +56 -16 lines changed


Project.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 name = "Turing"
 uuid = "fce5fe82-541a-59a6-adf8-730c64b5f9a0"
-version = "0.13.0"
+version = "0.14.0"
 
 [deps]
 AbstractMCMC = "80f14c24-f653-4e6a-9b94-39d6b0f70001"

src/core/ad.jl

Lines changed: 8 additions & 4 deletions
@@ -60,6 +60,7 @@ ADBackend(::Val) = error("The requested AD backend is not available. Make sure t
 Find the autodifferentiation backend of the algorithm `alg`.
 """
 getADbackend(spl::Sampler) = getADbackend(spl.alg)
+getADbackend(spl::SampleFromPrior) = ADBackend()()
 
 """
     gradient_logp(
@@ -77,9 +78,10 @@ function gradient_logp(
     θ::AbstractVector{<:Real},
     vi::VarInfo,
     model::Model,
-    sampler::Sampler
+    sampler::AbstractSampler,
+    ctx::DynamicPPL.AbstractContext = DynamicPPL.DefaultContext()
 )
-    return gradient_logp(getADbackend(sampler), θ, vi, model, sampler)
+    return gradient_logp(getADbackend(sampler), θ, vi, model, sampler, ctx)
 end
 
 """
@@ -100,12 +102,13 @@ function gradient_logp(
     vi::VarInfo,
     model::Model,
     sampler::AbstractSampler=SampleFromPrior(),
+    ctx::DynamicPPL.AbstractContext = DynamicPPL.DefaultContext()
 )
     # Define function to compute log joint.
     logp_old = getlogp(vi)
     function f(θ)
         new_vi = VarInfo(vi, sampler, θ)
-        model(new_vi, sampler)
+        model(new_vi, sampler, ctx)
         logp = getlogp(new_vi)
         setlogp!(vi, ForwardDiff.value(logp))
         return logp
@@ -127,13 +130,14 @@ function gradient_logp(
     vi::VarInfo,
     model::Model,
     sampler::AbstractSampler = SampleFromPrior(),
+    ctx::DynamicPPL.AbstractContext = DynamicPPL.DefaultContext()
 )
     T = typeof(getlogp(vi))
 
     # Specify objective function.
     function f(θ)
         new_vi = VarInfo(vi, sampler, θ)
-        model(new_vi, sampler)
+        model(new_vi, sampler, ctx)
         return getlogp(new_vi)
     end
 
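
The new `ctx` argument threads a DynamicPPL evaluation context through to the model call, so the same AD machinery can differentiate the log joint, the likelihood only, or the prior only. A hedged sketch of a direct call, assuming the function is reachable as `Turing.Core.gradient_logp` (it is internal, not exported) and reusing the illustrative `demo` model from the sketch near the top:

using Turing, DynamicPPL

spl = DynamicPPL.SampleFromPrior()
vi  = DynamicPPL.VarInfo(model)   # `model` is the illustrative demo model above
θ   = vi[spl]

# Default context: gradient of the log joint (what HMC/NUTS need).
logp_joint, grad_joint = Turing.Core.gradient_logp(θ, vi, model, spl)

# Likelihood context: gradient of the log likelihood alone (what MLE needs).
logp_lik, grad_lik = Turing.Core.gradient_logp(
    θ, vi, model, spl, DynamicPPL.LikelihoodContext()
)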

src/core/compat/reversediff.jl

Lines changed: 2 additions & 0 deletions
@@ -20,6 +20,7 @@ function gradient_logp(
     vi::VarInfo,
     model::Model,
     sampler::AbstractSampler = SampleFromPrior(),
+    context::DynamicPPL.AbstractContext = DynamicPPL.DefaultContext()
 )
     T = typeof(getlogp(vi))
 
@@ -57,6 +58,7 @@ end
     vi::VarInfo,
     model::Model,
     sampler::AbstractSampler = SampleFromPrior(),
+    context::DynamicPPL.AbstractContext = DynamicPPL.DefaultContext()
 )
     T = typeof(getlogp(vi))
 

src/core/compat/zygote.jl

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@ function gradient_logp(
     vi::VarInfo,
     model::Model,
     sampler::AbstractSampler = SampleFromPrior(),
+    context::DynamicPPL.AbstractContext = DynamicPPL.DefaultContext()
 )
     T = typeof(getlogp(vi))
 
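
For reference, which of these `gradient_logp` overloads runs is decided by the AD backend switch. A brief sketch, assuming the corresponding AD packages are loaded so the compat files above are included via Requires:

using Turing
using ReverseDiff, Zygote           # needed for the compat overloads to load

Turing.setadbackend(:forwarddiff)   # default path in src/core/ad.jl
Turing.setadbackend(:reversediff)   # dispatches to the ReverseDiff overload
Turing.setadbackend(:zygote)        # dispatches to the Zygote overload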

src/modes/ModeEstimation.jl

Lines changed: 39 additions & 2 deletions
@@ -147,6 +147,44 @@ function (f::OptimLogDensity)(z)
     return -DynamicPPL.getlogp(varinfo)
 end
 
+function (f::OptimLogDensity)(F, G, H, z)
+    # Throw an error if a second order method was used.
+    if H !== nothing
+        error("Second order optimization is not yet supported.")
+    end
+
+    spl = DynamicPPL.SampleFromPrior()
+
+    if G !== nothing
+        # Calculate log joint and the gradient
+        l, g = gradient_logp(
+            z,
+            DynamicPPL.VarInfo(f.vi, spl, z),
+            f.model,
+            spl,
+            f.context
+        )
+
+        # Use the negative gradient because we are minimizing.
+        G[:] = -g
+
+        # If F is something, return that since we already have the
+        # log joint.
+        if F !== nothing
+            F = -l
+            return F
+        end
+    end
+
+    # No gradient necessary, just return the log joint.
+    if F !== nothing
+        F = f(z)
+        return F
+    end
+
+    return nothing
+end
+
 """
     ModeResult{
         V<:NamedArrays.NamedArray,
@@ -378,9 +416,8 @@ function _optimize(
     link!(f.vi, spl)
     init_vals = f.vi[spl]
 
-
     # Optimize!
-    M = Optim.optimize(f, init_vals, optimizer, options, args...; kwargs...)
+    M = Optim.optimize(Optim.only_fgh!(f), init_vals, optimizer, options, args...; kwargs...)
 
     # Warn the user if the optimization did not converge.
     if !Optim.converged(M)
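
The `(f::OptimLogDensity)(F, G, H, z)` method above follows Optim.jl's fgh! calling convention: Optim passes preallocated `G` (and possibly `H`) buffers plus a flag-like `F`, and the function fills whatever is non-`nothing`. A self-contained toy sketch of that convention on a one-dimensional quadratic, not tied to Turing:

using Optim

# Minimize f(x) = (x - 3)^2 using the same F/G/H convention.
function fgh!(F, G, H, x)
    if H !== nothing
        H[1, 1] = 2.0                # second derivative
    end
    if G !== nothing
        G[1] = 2.0 * (x[1] - 3.0)    # gradient, written in place
    end
    if F !== nothing
        return (x[1] - 3.0)^2        # objective value
    end
    return nothing
end

result = Optim.optimize(Optim.only_fgh!(fgh!), [0.0], LBFGS())
Optim.minimizer(result)              # ≈ [3.0]

Because the Turing method errors when `H` is requested, only first-order optimizers are supported here, which is why the test file below swaps `Newton()` for `LBFGS()`.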

test/modes/ModeEstimation.jl

Lines changed: 5 additions & 9 deletions
@@ -13,19 +13,17 @@ include(dir*"/test/test_utils/AllUtils.jl")
 @testset "ModeEstimation.jl" begin
     @testset "MLE" begin
         Random.seed!(222)
-        true_value = [0.0625031, 1.75]
+        true_value = [0.0625, 1.75]
 
         m1 = optimize(gdemo_default, MLE())
         m2 = optimize(gdemo_default, MLE(), NelderMead())
-        m3 = optimize(gdemo_default, MLE(), Newton())
-        m4 = optimize(gdemo_default, MLE(), true_value, Newton())
-        m5 = optimize(gdemo_default, MLE(), true_value)
+        m3 = optimize(gdemo_default, MLE(), true_value, LBFGS())
+        m4 = optimize(gdemo_default, MLE(), true_value)
 
         @test all(isapprox.(m1.values.array - true_value, 0.0, atol=0.01))
         @test all(isapprox.(m2.values.array - true_value, 0.0, atol=0.01))
         @test all(isapprox.(m3.values.array - true_value, 0.0, atol=0.01))
         @test all(isapprox.(m4.values.array - true_value, 0.0, atol=0.01))
-        @test all(isapprox.(m5.values.array - true_value, 0.0, atol=0.01))
     end
 
     @testset "MAP" begin
@@ -34,15 +32,13 @@ include(dir*"/test/test_utils/AllUtils.jl")
 
         m1 = optimize(gdemo_default, MAP())
         m2 = optimize(gdemo_default, MAP(), NelderMead())
-        m3 = optimize(gdemo_default, MAP(), Newton())
-        m4 = optimize(gdemo_default, MAP(), true_value, Newton())
-        m5 = optimize(gdemo_default, MAP(), true_value)
+        m3 = optimize(gdemo_default, MAP(), true_value, LBFGS())
+        m4 = optimize(gdemo_default, MAP(), true_value)
 
         @test all(isapprox.(m1.values.array - true_value, 0.0, atol=0.01))
         @test all(isapprox.(m2.values.array - true_value, 0.0, atol=0.01))
         @test all(isapprox.(m3.values.array - true_value, 0.0, atol=0.01))
         @test all(isapprox.(m4.values.array - true_value, 0.0, atol=0.01))
-        @test all(isapprox.(m5.values.array - true_value, 0.0, atol=0.01))
     end
 
     @testset "StatsBase integration" begin
