import numpy as np
import theano
from theano import tensor as tt
import pymc3 as pm
from pymc3.distributions.dist_math import rho2sd, log_normal, log_normal_mv
from pymc3.variational.opvi import Approximation
from pymc3.theanof import memoize
__all__ = [
'MeanField',
'FullRank',
'Empirical',
'sample_approx'
]


class MeanField(Approximation):
"""Mean Field approximation to the posterior where spherical Gaussian family
is fitted to minimize KL divergence from True posterior. It is assumed
that latent space variables are uncorrelated that is the main drawback
of the method
Parameters
----------
local_rv : dict[var->tuple]
mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)}
Local Vars are used for Autoencoding Variational Bayes
See (AEVB; Kingma and Welling, 2014) for details
model : :class:`pymc3.Model`
PyMC3 model for inference
start : `Point`
initial mean
cost_part_grad_scale : `scalar`
Scaling score part of gradient can be useful near optimum for
archiving better convergence properties. Common schedule is
1 at the start and 0 in the end. So slow decay will be ok.
See (Sticking the Landing; Geoffrey Roeder,
Yuhuai Wu, David Duvenaud, 2016) for details
scale_cost_to_minibatch : `bool`
Scale cost to minibatch instead of full dataset, default False
random_seed : None or int
leave None to use package global RandomStream or other
valid value to create instance specific one
References
----------
- Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016
Sticking the Landing: A Simple Reduced-Variance Gradient for ADVI
approximateinference.org/accepted/RoederEtAl2016.pdf
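
    Examples
    --------
    A sketch of typical usage, assuming an existing ``model``; the
    high-level ``pm.ADVI`` driver (defined outside this module) constructs
    and fits a :class:`MeanField` under the hood:

    >>> with model:
    ...     approx = pm.ADVI().fit(n=10000)
    ...     trace = approx.sample(500)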
"""
@property
def mean(self):
return self.shared_params['mu']
@property
def rho(self):
return self.shared_params['rho']
@property
def cov(self):
return tt.diag(rho2sd(self.rho)**2)
@property
def std(self):
return rho2sd(self.rho)
def create_shared_params(self, **kwargs):
start = kwargs.get('start')
if start is None:
start = self.model.test_point
else:
start_ = self.model.test_point.copy()
pm.sampling._update_start_vals(start_, start, self.model)
start = start_
start = self.gbij.map(start)
return {'mu': theano.shared(
pm.floatX(start), 'mu'),
'rho': theano.shared(
pm.floatX(np.zeros((self.global_size,))), 'rho')}
def log_q_W_global(self, z):
"""
log_q_W samples over q for global vars
"""
mu = self.scale_grad(self.mean)
rho = self.scale_grad(self.rho)
z = z[self.global_slc]
logq = tt.sum(log_normal(z, mu, rho=rho))
return logq
def random_global(self, size=None, no_rand=False):
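        # Reparameterization: `initial` is eps ~ N(0, I) (zeros if no_rand);
        # the returned sample is mu + sd * eps.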
initial = self.initial(size, no_rand, l=self.global_size)
sd = rho2sd(self.rho)
mu = self.mean
return sd * initial + mu


class FullRank(Approximation):
"""Full Rank approximation to the posterior where Multivariate Gaussian family
is fitted to minimize KL divergence from True posterior. In contrast to
MeanField approach correlations between variables are taken in account. The
main drawback of the method is computational cost.
Parameters
----------
local_rv : dict[var->tuple]
mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)}
Local Vars are used for Autoencoding Variational Bayes
See (AEVB; Kingma and Welling, 2014) for details
model : PyMC3 model for inference
start : Point
initial mean
cost_part_grad_scale : float or scalar tensor
Scaling score part of gradient can be useful near optimum for
archiving better convergence properties. Common schedule is
1 at the start and 0 in the end. So slow decay will be ok.
See (Sticking the Landing; Geoffrey Roeder,
Yuhuai Wu, David Duvenaud, 2016) for details
scale_cost_to_minibatch : bool, default False
Scale cost to minibatch instead of full dataset
random_seed : None or int
leave None to use package global RandomStream or other
valid value to create instance specific one
Other Parameters
----------------
gpu_compat : bool
use GPU compatible version or not
References
----------
- Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016
Sticking the Landing: A Simple Reduced-Variance Gradient for ADVI
approximateinference.org/accepted/RoederEtAl2016.pdf
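
    Examples
    --------
    A sketch of typical usage, assuming an existing ``model``; the
    high-level ``pm.FullRankADVI`` driver (defined outside this module) fits
    a :class:`FullRank` approximation:

    >>> with model:
    ...     approx = pm.FullRankADVI().fit(n=20000)
    ...     trace = approx.sample(500)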
"""
def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1,
scale_cost_to_minibatch=False,
gpu_compat=False, random_seed=None, **kwargs):
super(FullRank, self).__init__(
local_rv=local_rv, model=model,
cost_part_grad_scale=cost_part_grad_scale,
scale_cost_to_minibatch=scale_cost_to_minibatch,
random_seed=random_seed, **kwargs
)
self.gpu_compat = gpu_compat
@property
def L(self):
return self.shared_params['L_tril'][self.tril_index_matrix]
@property
def mean(self):
return self.shared_params['mu']
@property
def cov(self):
L = self.L
return L.dot(L.T)
@property
def num_tril_entries(self):
n = self.global_size
return int(n * (n + 1) / 2)
@property
def tril_index_matrix(self):
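        # Build a symmetric matrix of indices into the packed lower-triangular
        # vector: entries (i, j) and (j, i) both point at the position of the
        # corresponding element of ``L_tril``.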
n = self.global_size
num_tril_entries = self.num_tril_entries
tril_index_matrix = np.zeros([n, n], dtype=int)
tril_index_matrix[np.tril_indices(n)] = np.arange(num_tril_entries)
tril_index_matrix[
np.tril_indices(n)[::-1]
] = np.arange(num_tril_entries)
return tril_index_matrix
def create_shared_params(self, **kwargs):
start = kwargs.get('start')
if start is None:
start = self.model.test_point
else:
start_ = self.model.test_point.copy()
pm.sampling._update_start_vals(start_, start, self.model)
start = start_
start = pm.floatX(self.gbij.map(start))
n = self.global_size
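        # Initialize L at the identity: pack eye(n)'s lower triangle
        # (row-major, ones at the diagonal positions) into a flat vector.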
L_tril = (
np.eye(n)
[np.tril_indices(n)]
.astype(theano.config.floatX)
)
return {'mu': theano.shared(start, 'mu'),
'L_tril': theano.shared(L_tril, 'L_tril')
}
def log_q_W_global(self, z):
"""log_q_W samples over q for global vars
"""
mu = self.scale_grad(self.mean)
L = self.scale_grad(self.L)
z = z[self.global_slc]
return log_normal_mv(z, mu, chol=L, gpu_compat=self.gpu_compat)
def random_global(self, size=None, no_rand=False):
# (samples, dim) or (dim, )
initial = self.initial(size, no_rand, l=self.global_size).T
# (dim, dim)
L = self.L
# (dim, )
mu = self.mean
# x = Az + m, but it assumes z is vector
# When we get z with shape (samples, dim)
# we need to transpose Az
return L.dot(initial).T + mu
@classmethod
def from_mean_field(cls, mean_field, gpu_compat=False):
"""Construct FullRank from MeanField approximation
Parameters
----------
mean_field : :class:`MeanField`
approximation to start with
Other Parameters
----------------
gpu_compat : `bool`
use GPU compatible version or not
Returns
-------
:class:`FullRank`
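
        Examples
        --------
        A sketch: warm-start a full-rank fit from a previously fitted
        mean-field approximation (``mean_field`` is assumed to come from an
        earlier ADVI run):

        >>> full_rank = FullRank.from_mean_field(mean_field)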
"""
full_rank = object.__new__(cls) # type: FullRank
full_rank.gpu_compat = gpu_compat
full_rank.__dict__.update(mean_field.__dict__)
full_rank.shared_params = full_rank.create_shared_params()
full_rank.shared_params['mu'].set_value(
mean_field.shared_params['mu'].get_value()
)
rho = mean_field.shared_params['rho'].get_value()
n = full_rank.global_size
L_tril = (
np.diag(np.log1p(np.exp(rho))) # rho2sd
[np.tril_indices(n)]
.astype(theano.config.floatX)
)
full_rank.shared_params['L_tril'].set_value(L_tril)
return full_rank


class Empirical(Approximation):
"""Builds Approximation instance from a given trace,
it has the same interface as variational approximation
Parameters
----------
trace : :class:`MultiTrace`
Trace storing samples (e.g. from step methods)
local_rv : dict[var->tuple]
Experimental for Empirical Approximation
mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)}
Local Vars are used for Autoencoding Variational Bayes
See (AEVB; Kingma and Welling, 2014) for details
scale_cost_to_minibatch : `bool`
Scale cost to minibatch instead of full dataset, default False
model : :class:`pymc3.Model`
PyMC3 model for inference
random_seed : None or int
leave None to use package global RandomStream or other
valid value to create instance specific one
Examples
--------
>>> with model:
... step = NUTS()
... trace = sample(1000, step=step)
... histogram = Empirical(trace[100:])
"""
def __init__(self, trace, local_rv=None,
scale_cost_to_minibatch=False,
model=None, random_seed=None, **kwargs):
super(Empirical, self).__init__(
local_rv=local_rv, scale_cost_to_minibatch=scale_cost_to_minibatch,
model=model, trace=trace, random_seed=random_seed, **kwargs)
def check_model(self, model, **kwargs):
trace = kwargs.get('trace')
        if (trace is not None
                and not all(var.name in trace.varnames
                            for var in model.free_RVs)):
            raise ValueError('trace does not contain all free RVs of the model')
def create_shared_params(self, **kwargs):
trace = kwargs.get('trace')
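        # With no trace given, fall back to a single particle at the model
        # test point; otherwise flatten each trace point into one row.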
if trace is None:
histogram = np.atleast_2d(self.gbij.map(self.model.test_point))
else:
histogram = np.empty((len(trace), self.global_size))
for i in range(len(trace)):
histogram[i] = self.gbij.map(trace[i])
return theano.shared(pm.floatX(histogram), 'histogram')
def randidx(self, size=None):
if size is None:
size = ()
elif isinstance(size, tt.TensorVariable):
if size.ndim < 1:
size = size[None]
            elif size.ndim > 1:
                raise ValueError('size should be no more than 1-dimensional')
else:
size = tuple(np.atleast_1d(size))
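        # Uniform draws in [0, n); the 1e-16 offset keeps them strictly below
        # n so that truncation to int64 yields valid row indices.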
return (self._rng
.uniform(size=size, low=0.0, high=self.histogram.shape[0] - 1e-16)
.astype('int64'))
def random_global(self, size=None, no_rand=False):
theano_condition_is_here = isinstance(no_rand, tt.Variable)
if theano_condition_is_here:
return tt.switch(no_rand,
self.mean,
self.histogram[self.randidx(size)])
else:
if no_rand:
return self.mean
else:
return self.histogram[self.randidx(size)]
@property
def histogram(self):
"""Shortcut to flattened Trace
"""
return self.shared_params
@property
@memoize
def histogram_logp(self):
"""Symbolic logp for every point in trace
"""
node = self.to_flat_input(self.model.logpt)
def mapping(z):
return theano.clone(node, {self.input: z})
x = self.histogram
_histogram_logp, _ = theano.scan(
mapping, x, n_steps=x.shape[0]
)
return _histogram_logp
@property
def mean(self):
return self.histogram.mean(0)
@property
def cov(self):
x = (self.histogram - self.mean)
return x.T.dot(x) / self.histogram.shape[0]
@classmethod
def from_noise(cls, size, jitter=.01, local_rv=None,
start=None, model=None, random_seed=None, **kwargs):
"""Initialize Histogram with random noise
Parameters
----------
size : `int`
number of initial particles
jitter : `float`
initial sd
local_rv : `dict`
mapping {model_variable -> local_variable}
Local Vars are used for Autoencoding Variational Bayes
See (AEVB; Kingma and Welling, 2014) for details
start : `Point`
initial point
model : :class:`pymc3.Model`
PyMC3 model for inference
random_seed : None or `int`
leave None to use package global RandomStream or other
valid value to create instance specific one
kwargs : other kwargs passed to init
Returns
-------
:class:`Empirical`
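
        Examples
        --------
        A sketch, assuming an existing ``model``; such a particle cloud can
        serve, for example, as the starting set of particles for SVGD:

        >>> with model:
        ...     hist = Empirical.from_noise(100, jitter=0.1)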
"""
hist = cls(
None,
local_rv=local_rv,
model=model,
random_seed=random_seed,
**kwargs)
if start is None:
start = hist.model.test_point
else:
start_ = hist.model.test_point.copy()
pm.sampling._update_start_vals(start_, start, hist.model)
start = start_
start = pm.floatX(hist.gbij.map(start))
# Initialize particles
x0 = np.tile(start, (size, 1))
x0 += pm.floatX(np.random.normal(0, jitter, x0.shape))
hist.histogram.set_value(x0)
return hist


def sample_approx(approx, draws=100, include_transformed=True):
"""Draw samples from variational posterior.
Parameters
----------
approx : :class:`Approximation`
Approximation to sample from
draws : `int`
Number of random samples.
include_transformed : `bool`
If True, transformed variables are also sampled. Default is True.
Returns
-------
trace : class:`pymc3.backends.base.MultiTrace`
Samples drawn from variational posterior.
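
    Examples
    --------
    A usage sketch, assuming ``approx`` is any fitted approximation from
    this module:

    >>> trace = sample_approx(approx, draws=500)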
"""
if not isinstance(approx, Approximation):
raise TypeError('Need Approximation instance, got %r' % approx)
return approx.sample(draws=draws, include_transformed=include_transformed)