diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md
index fa0c0d1f16..b3ebc22f46 100644
--- a/RELEASE-NOTES.md
+++ b/RELEASE-NOTES.md
@@ -4,6 +4,8 @@

 ### New features

+- `Mixture` now supports mixtures of multidimensional probability distributions, not just lists of 1D distributions.
+
 ### Maintenance

 - All occurrences of `sd` as a parameter name have been renamed to `sigma`. `sd` will continue to function for backwards compatibility.
@@ -13,6 +15,13 @@
 - Added a fix to allow the imputation of single missing values of observed data, which previously would fail (Fix issue #3122).
 - Fix for #3346. The `draw_values` function was too permissive with what could be grabbed from inside `point`, which led to an error when sampling posterior predictives of variables that depended on shared variables that had changed their shape after `pm.sample()` had been called.
 - Fix for #3354. `draw_values` now adds the theano graph descendants of `TensorConstant` or `SharedVariables` to the named relationship nodes stack, only if these descendants are `ObservedRV` or `MultiObservedRV` instances.
+- Fixed a bug in `broadcast_distribution_samples`, which did not correctly handle cases in which some samples did not have the size tuple prepended.
+- Changed `MvNormal.random`'s usage of `tensordot` for Cholesky-encoded covariances. The `tensordot` call led to wrong axis broadcasting and seemed to be the cause of issue #3343.
+- Fixed a defect in `Mixture.random` for multidimensional mixtures: the chosen mixture component was not preserved across all elements of a given draw, which partly broke the correlations between elements within a single draw.
+- Restructured `Mixture.random` to allow better use of vectorized calls to `comp_dists.random`.
+- Added tests for mixtures of multidimensional distributions to the test suite.
+- Fixed incorrect usage of `broadcast_distribution_samples` in `DiscreteWeibull`.
+- `Mixture`'s default dtype is now determined by `theano.config.floatX`.

 ### Deprecations

diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py
index cd963110cf..e74293b068 100644
--- a/pymc3/distributions/discrete.py
+++ b/pymc3/distributions/discrete.py
@@ -347,12 +347,12 @@ def _ppf(self, p):

     def _random(self, q, beta, size=None):
         p = np.random.uniform(size=size)
-        p, q, beta = broadcast_distribution_samples([p, q, beta], size=size)
         return np.ceil(np.power(np.log(1 - p) / np.log(q), 1. / beta)) - 1

     def random(self, point=None, size=None):
         q, beta = draw_values([self.q, self.beta], point=point, size=size)
+        q, beta = broadcast_distribution_samples([q, beta], size=size)

         return generate_samples(self._random, q, beta,
                                 dist_shape=self.shape,
diff --git a/pymc3/distributions/distribution.py b/pymc3/distributions/distribution.py
index 16b9cf14d0..d9a74fa43c 100644
--- a/pymc3/distributions/distribution.py
+++ b/pymc3/distributions/distribution.py
@@ -254,7 +254,7 @@ class _DrawValuesContextBlocker(_DrawValuesContext, metaclass=InitContextMeta):
     """
     def __new__(cls, *args, **kwargs):
         # resolves the parent instance
-        instance = super(_DrawValuesContextBlocker, cls).__new__(cls)
+        instance = super().__new__(cls)
         instance._parent = None
         return instance

@@ -599,6 +599,9 @@ def generate_samples(generator, *args, **kwargs):
        parameters. This may be required when the parameter shape
        does not determine the shape of a single sample, for example,
        the shape of the probabilities in the Categorical distribution.
+ not_broadcast_kwargs: dict or None + Key word argument dictionary to provide to the random generator, which + must not be broadcasted with the rest of the *args and **kwargs. Any remaining *args and **kwargs are passed on to the generator function. """ @@ -606,6 +609,9 @@ def generate_samples(generator, *args, **kwargs): one_d = _is_one_d(dist_shape) size = kwargs.pop('size', None) broadcast_shape = kwargs.pop('broadcast_shape', None) + not_broadcast_kwargs = kwargs.pop('not_broadcast_kwargs', None) + if not_broadcast_kwargs is None: + not_broadcast_kwargs = dict() if size is None: size = 1 @@ -625,6 +631,8 @@ def generate_samples(generator, *args, **kwargs): kwargs = {k: v.reshape(v.shape + (1,) * (max_dims - v.ndim)) for k, v in kwargs.items()} inputs = args + tuple(kwargs.values()) broadcast_shape = np.broadcast(*inputs).shape # size of generator(size=1) + # Update kwargs with the keyword arguments that were not broadcasted + kwargs.update(not_broadcast_kwargs) dist_shape = to_tuple(dist_shape) broadcast_shape = to_tuple(broadcast_shape) @@ -639,20 +647,30 @@ def generate_samples(generator, *args, **kwargs): samples = generator(size=broadcast_shape, *args, **kwargs) elif dist_shape == broadcast_shape: samples = generator(size=size_tup + dist_shape, *args, **kwargs) - elif len(dist_shape) == 0 and size_tup and broadcast_shape[:len(size_tup)] == size_tup: - # Input's dist_shape is scalar, but it has size repetitions. - # So now the size matches but we have to manually broadcast to - # the right dist_shape - samples = [generator(*args, **kwargs)] - if samples[0].shape == broadcast_shape: - samples = samples[0] + elif len(dist_shape) == 0 and size_tup and broadcast_shape: + # There is no dist_shape (scalar distribution) but the parameters + # broadcast shape and size_tup determine the size to provide to + # the generator + if broadcast_shape[:len(size_tup)] == size_tup: + # Input's dist_shape is scalar, but it has size repetitions. + # So now the size matches but we have to manually broadcast to + # the right dist_shape + samples = [generator(*args, **kwargs)] + if samples[0].shape == broadcast_shape: + samples = samples[0] + else: + suffix = broadcast_shape[len(size_tup):] + dist_shape + samples.extend([generator(*args, **kwargs). + reshape(broadcast_shape)[..., np.newaxis] + for _ in range(np.prod(suffix, + dtype=int) - 1)]) + samples = np.hstack(samples).reshape(size_tup + suffix) else: - suffix = broadcast_shape[len(size_tup):] + dist_shape - samples.extend([generator(*args, **kwargs). - reshape(broadcast_shape)[..., np.newaxis] - for _ in range(np.prod(suffix, - dtype=int) - 1)]) - samples = np.hstack(samples).reshape(size_tup + suffix) + # The parameter shape is given, but we have to concatenate it + # with the size tuple + samples = generator(size=size_tup + broadcast_shape, + *args, + **kwargs) else: samples = None # Args have been broadcast correctly, can just ask for the right shape out @@ -686,27 +704,68 @@ def generate_samples(generator, *args, **kwargs): def broadcast_distribution_samples(samples, size=None): + """Broadcast samples drawn from distributions taking into account the + size (i.e. the number of samples) of the draw, which is prepended to + the sample's shape. + + Parameters + ---------- + samples: Iterable of ndarrays holding the sampled values + size: None, int or tuple (optional) + size of the sample set requested. + + Returns + ------- + List of broadcasted sample arrays + + Examples + -------- + .. 
code-block:: python

+            size = 100
+            sample0 = np.random.randn(size)
+            sample1 = np.random.randn(size, 5)
+            sample2 = np.random.randn(size, 4, 5)
+            out = broadcast_distribution_samples([sample0, sample1, sample2],
+                                                 size=size)
+            assert all((o.shape == (size, 4, 5) for o in out))
+            assert np.all(sample0[:, None, None] == out[0])
+            assert np.all(sample1[:, None, :] == out[1])
+            assert np.all(sample2 == out[2])
+
+    .. code-block:: python
+
+            size = 100
+            sample0 = np.random.randn(size)
+            sample1 = np.random.randn(5)
+            sample2 = np.random.randn(4, 5)
+            out = broadcast_distribution_samples([sample0, sample1, sample2],
+                                                 size=size)
+            assert all((o.shape == (size, 4, 5) for o in out))
+            assert np.all(sample0[:, None, None] == out[0])
+            assert np.all(sample1 == out[1])
+            assert np.all(sample2 == out[2])
+    """
     if size is None:
         return np.broadcast_arrays(*samples)
     _size = to_tuple(size)
-    try:
-        broadcasted_samples = np.broadcast_arrays(*samples)
-    except ValueError:
-        # Raw samples shapes
-        p_shapes = [p.shape for p in samples]
-        # samples shapes without the size prepend
-        sp_shapes = [s[len(_size):] if _size == s[:len(_size)] else s
-                     for s in p_shapes]
-        broadcast_shape = np.broadcast(*[np.empty(s) for s in sp_shapes]).shape
-        broadcasted_samples = []
-        for param, p_shape, sp_shape in zip(samples, p_shapes, sp_shapes):
-            if _size == p_shape[:len(_size)]:
-                slicer_head = [slice(None)] * len(_size)
-            else:
-                slicer_head = [np.newaxis] * len(_size)
+    # Raw samples shapes
+    p_shapes = [p.shape for p in samples]
+    # samples shapes without the size prepend
+    sp_shapes = [s[len(_size):] if _size == s[:len(_size)] else s
+                 for s in p_shapes]
+    broadcast_shape = np.broadcast(*[np.empty(s) for s in sp_shapes]).shape
+    broadcasted_samples = []
+    for param, p_shape, sp_shape in zip(samples, p_shapes, sp_shapes):
+        if _size == p_shape[:len(_size)]:
+            # If size prepends the shape, then we have to add a broadcasting
+            # axis in the middle
+            slicer_head = [slice(None)] * len(_size)
             slicer_tail = ([np.newaxis] * (len(broadcast_shape) -
                                            len(sp_shape)) +
                            [slice(None)] * len(sp_shape))
-            broadcasted_samples.append(param[tuple(slicer_head + slicer_tail)])
-        broadcasted_samples = np.broadcast_arrays(*broadcasted_samples)
-    return broadcasted_samples
+        else:
+            # If size does not prepend the shape, then we have to leave the
+            # parameter as is
+            slicer_head = []
+            slicer_tail = [slice(None)] * len(sp_shape)
+        broadcasted_samples.append(param[tuple(slicer_head + slicer_tail)])
+    return np.broadcast_arrays(*broadcasted_samples)
diff --git a/pymc3/distributions/mixture.py b/pymc3/distributions/mixture.py
index 2333a83505..93e2e2326c 100644
--- a/pymc3/distributions/mixture.py
+++ b/pymc3/distributions/mixture.py
@@ -1,4 +1,6 @@
+from collections.abc import Iterable
 import numpy as np
+import theano
 import theano.tensor as tt

 from pymc3.util import get_variable_name
@@ -6,8 +8,10 @@
 from .dist_math import bound, random_choice
 from .distribution import (Discrete, Distribution, draw_values,
                            generate_samples, _DrawValuesContext,
-                           _DrawValuesContextBlocker, to_tuple)
+                           _DrawValuesContextBlocker, to_tuple,
+                           broadcast_distribution_samples)
 from .continuous import get_tau_sigma, Normal
+from ..theanof import _conversion_map


 def all_discrete(comp_dists):
@@ -39,16 +43,16 @@ class Mixture(Distribution):
         w >= 0 and w <= 1
         the mixture weights
     comp_dists : multidimensional PyMC3 distribution (e.g. `pm.Poisson.dist(...)`)
-        or iterable of one-dimensional PyMC3 distributions the
-        component distributions :math:`f_1, \ldots, f_n`
+        or iterable of PyMC3 distributions; the component distributions
+        :math:`f_1, \ldots, f_n`

-    Example
-    -------
+    Examples
+    --------
     .. code-block:: python

         # 2-Mixture Poisson distribution
         with pm.Model() as model:
-            lam = pm.Exponential('lam', lam=1, shape=(2,))  # `shape=(2,)` indicates two mixtures.
+            lam = pm.Exponential('lam', lam=1, shape=(2,))  # `shape=(2,)` indicates two mixture components.

             # As we just need the logp, rather than add a RV to the model, we need to call .dist()
             components = pm.Poisson.dist(mu=lam, shape=(2,))
@@ -68,9 +72,50 @@ class Mixture(Distribution):
             w = pm.Dirichlet('w', a=np.array([1, 1]))

             like = pm.Mixture('like', w=w, comp_dists = [pois1, pois2], observed=data)
+
+        # npop-component Mixture of multidimensional Gaussians
+        npop = 5
+        nd = (3, 4)
+        with pm.Model() as model:
+            mu = pm.Normal('mu', mu=np.arange(npop), sigma=1, shape=npop)  # Each component has an independent mean
+
+            w = pm.Dirichlet('w', a=np.ones(npop))
+
+            components = pm.Normal.dist(mu=mu, sigma=1, shape=nd + (npop,))  # nd + (npop,) shaped array of independent Normals
+
+            like = pm.Mixture('like', w=w, comp_dists = components, observed=data, shape=nd)  # The resulting mixture is nd-shaped
+
+        # Multidimensional Mixture as stacked independent mixtures
+        with pm.Model() as model:
+            mu = pm.Normal('mu', mu=np.arange(5), sigma=1, shape=5)  # Each component has an independent mean
+
+            w = pm.Dirichlet('w', a=np.ones((3, 5)))  # w is a stack of 3 independent 5-component weight arrays
+
+            components = pm.Normal.dist(mu=mu, sigma=1, shape=(3, 5))
+
+            # The mixture is an array of 3 elements.
+            # Each can be thought of as an independent scalar mixture of 5 components
+            like = pm.Mixture('like', w=w, comp_dists = components, observed=data, shape=3)
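+
+        # A mixture whose components are given as a list of multidimensional
+        # distributions (a sketch: reuses `nd` from above and assumes `data`
+        # is an nd-shaped observed array)
+        with pm.Model() as model:
+            mu1 = pm.Normal('mu1', mu=0, sigma=1, shape=nd)
+            mu2 = pm.Normal('mu2', mu=1, sigma=1, shape=nd)
+
+            w = pm.Dirichlet('w', a=np.ones(2))
+
+            # Each list element is an nd-shaped component distribution
+            components = [pm.Normal.dist(mu=mu1, sigma=1, shape=nd),
+                          pm.Normal.dist(mu=mu2, sigma=1, shape=nd)]
+
+            like = pm.Mixture('like', w=w, comp_dists=components, observed=data, shape=nd)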
     """

     def __init__(self, w, comp_dists, *args, **kwargs):
+        # comp_dists type checking
+        if not (
+            isinstance(comp_dists, Distribution)
+            or (
+                isinstance(comp_dists, Iterable)
+                and all((isinstance(c, Distribution) for c in comp_dists))
+            )
+        ):
+            raise TypeError(
+                "Supplied Mixture comp_dists must be a "
+                "Distribution or an iterable of "
+                "Distributions. Got {} instead.".format(
+                    type(comp_dists)
+                    if not isinstance(comp_dists, Iterable)
+                    else [type(c) for c in comp_dists]
+                )
+            )
         shape = kwargs.pop('shape', ())

         self.w = w = tt.as_tensor_variable(w)
@@ -79,9 +124,9 @@ def __init__(self, w, comp_dists, *args, **kwargs):
         defaults = kwargs.pop('defaults', [])

         if all_discrete(comp_dists):
-            dtype = kwargs.pop('dtype', 'int64')
+            default_dtype = _conversion_map[theano.config.floatX]
         else:
-            dtype = kwargs.pop('dtype', 'float64')
+            default_dtype = theano.config.floatX

             try:
                 self.mean = (w * self._comp_means()).sum(axis=-1)
@@ -90,6 +135,7 @@ def __init__(self, w, comp_dists, *args, **kwargs):
                     defaults.append('mean')
             except AttributeError:
                 pass
+        dtype = kwargs.pop('dtype', default_dtype)

         try:
             comp_modes = self._comp_modes()
@@ -108,41 +154,128 @@ def comp_dists(self):
         return self._comp_dists

     @comp_dists.setter
-    def comp_dists(self, _comp_dists):
-        self._comp_dists = _comp_dists
-        # Tests if the comp_dists can call random with non None size
-        with _DrawValuesContextBlocker():
-            if isinstance(self.comp_dists, (list, tuple)):
-                try:
-                    [comp_dist.random(size=23)
-                     for comp_dist in self.comp_dists]
-                    self._comp_dists_vect = True
-                except Exception:
-                    # The comp_dists cannot call random with non None size or
-                    # without knowledge of the point so we assume that we will
-                    # have to iterate calls to random to get the correct size
-                    self._comp_dists_vect = False
+    def comp_dists(self, comp_dists):
+        self._comp_dists = comp_dists
+        if isinstance(comp_dists, Distribution):
+            self._comp_dist_shapes = to_tuple(comp_dists.shape)
+            self._broadcast_shape = self._comp_dist_shapes
+            self.comp_is_distribution = True
+        else:
+            # Now we check the comp_dists distribution shapes to see what
+            # the broadcast shape would be. This shape will be the dist_shape
+            # used by generate_samples (the shape of a single random sample)
+            # from the mixture
+            self._comp_dist_shapes = [to_tuple(d.shape) for d in comp_dists]
+            # All component distributions must broadcast with each other
+            try:
+                self._broadcast_shape = np.broadcast(
+                    *[np.empty(shape) for shape in self._comp_dist_shapes]
+                ).shape
+            except Exception:
+                raise TypeError('Supplied comp_dists shapes do not broadcast '
+                                'with each other. comp_dists shapes are: '
+                                '{}'.format(self._comp_dist_shapes))
+
+            # We wrap each comp_dist.random by adding the kwarg raw_size_,
+            # which will be the size attribute passed to _comp_samples.
+            # _comp_samples then calls generate_samples, which may change the
+            # size value to make it compatible with scipy.stats.*.rvs
+            self._generators = []
+            for comp_dist in comp_dists:
+                generator = Mixture._comp_dist_random_wrapper(comp_dist.random)
+                self._generators.append(generator)
+            self.comp_is_distribution = False
+
+    @staticmethod
+    def _comp_dist_random_wrapper(random):
+        """Wrap the comp_dists.random method to take the kwarg raw_size_ and
+        use its value to replace the size parameter. This is needed because
+        generate_samples makes the size value compatible with the
+        scipy.stats.*.rvs, where size has a different meaning than in the
+        distributions' random methods.
+ """ + def wrapped_random(*args, **kwargs): + raw_size_ = kwargs.pop('raw_size_', None) + # Distribution.random's signature is always (point=None, size=None) + # so size could be the second arg or be given as a kwarg + if len(args) > 1: + args[1] = raw_size_ else: - try: - self.comp_dists.random(size=23) - self._comp_dists_vect = True - except Exception: - # The comp_dists cannot call random with non None size or - # without knowledge of the point so we assume that we will - # have to iterate calls to random to get the correct size - self._comp_dists_vect = False + kwargs['size'] = raw_size_ + return random(*args, **kwargs) + return wrapped_random def _comp_logp(self, value): comp_dists = self.comp_dists - try: - value_ = value if value.ndim > 1 else tt.shape_padright(value) - + if self.comp_is_distribution: + # Value can be many things. It can be the self tensor, the mode + # test point or it can be observed data. The latter case requires + # careful handling of shape, as the observed's shape could look + # like (repetitions,) + dist_shape, which does not include the last + # mixture axis. For this reason, we try to eval the value.shape, + # compare it with self.shape and shape_padright if we infer that + # the value holds observed data + try: + val_shape = tuple(value.shape.eval()) + except AttributeError: + val_shape = value.shape + except theano.gof.MissingInputError: + val_shape = None + try: + self_shape = tuple(self.shape) + except AttributeError: + # Happens in __init__ when computing self.logp(comp_modes) + self_shape = None + comp_shape = tuple(comp_dists.shape) + ndim = value.ndim + if ( + val_shape is not None and + not((self_shape is not None and val_shape == self_shape) or + val_shape == comp_shape) + ): + # value is neither the test point nor the self tensor, it + # is likely to hold observed values, so we must compute the + # ndim discarding the dimensions that don't match + # self_shape + if ( + self_shape and + val_shape[-len(self_shape):] == self_shape + ): + # value has observed values for the Mixture + ndim = len(self_shape) + elif ( + comp_shape and + val_shape[-len(comp_shape):] == comp_shape + ): + # value has observed for the Mixture components + ndim = len(comp_shape) + else: + # We cannot infer what was passed, we handle this + # as was done in earlier versions of Mixture. We pad + # always if ndim is lower or equal to 1 (default + # legacy implementation) + if ndim <= 1: + ndim = len(comp_dists.shape) - 1 + else: + # We reach this point if value does not hold observed data, so + # we can use its ndim safely to determine shape padding, or it + # holds something that we cannot infer, so we revert to using + # the value's ndim for shape padding. 
+ # We will always pad a single dimension if ndim is lower or + # equal to 1 (default legacy implementation) + if ndim <= 1: + ndim = len(comp_dists.shape) - 1 + if ndim < len(comp_dists.shape): + value_ = tt.shape_padright(value, + len(comp_dists.shape) - ndim) + else: + value_ = value return comp_dists.logp(value_) - except AttributeError: + else: return tt.squeeze(tt.stack([comp_dist.logp(value) for comp_dist in comp_dists], - axis=1)) + axis=-1)) def _comp_means(self): try: @@ -150,7 +283,7 @@ def _comp_means(self): except AttributeError: return tt.squeeze(tt.stack([comp_dist.mean for comp_dist in self.comp_dists], - axis=1)) + axis=-1)) def _comp_modes(self): try: @@ -158,36 +291,110 @@ def _comp_modes(self): except AttributeError: return tt.squeeze(tt.stack([comp_dist.mode for comp_dist in self.comp_dists], - axis=1)) + axis=-1)) - def _comp_samples(self, point=None, size=None): - if self._comp_dists_vect or size is None: - try: - return self.comp_dists.random(point=point, size=size) - except AttributeError: - samples = np.array([comp_dist.random(point=point, size=size) - for comp_dist in self.comp_dists]) - samples = np.moveaxis(samples, 0, samples.ndim - 1) + def _comp_samples(self, point=None, size=None, + comp_dist_shapes=None, + broadcast_shape=None): + if self.comp_is_distribution: + samples = self._comp_dists.random(point=point, size=size) else: - # We must iterate the calls to random manually - size = to_tuple(size) - _size = int(np.prod(size)) - try: - samples = np.array([self.comp_dists.random(point=point, - size=None) - for _ in range(_size)]) - samples = np.reshape(samples, size + samples.shape[1:]) - except AttributeError: - samples = np.array([[comp_dist.random(point=point, size=None) - for _ in range(_size)] - for comp_dist in self.comp_dists]) - samples = np.moveaxis(samples, 0, samples.ndim - 1) - samples = np.reshape(samples, size + samples[1:]) - - if samples.shape[-1] == 1: - return samples[..., 0] + if comp_dist_shapes is None: + comp_dist_shapes = self._comp_dist_shapes + if broadcast_shape is None: + broadcast_shape = self._sample_shape + samples = [] + for dist_shape, generator in zip(comp_dist_shapes, + self._generators): + sample = generate_samples( + generator=generator, + dist_shape=dist_shape, + broadcast_shape=broadcast_shape, + point=point, + size=size, + not_broadcast_kwargs={'raw_size_': size}, + ) + samples.append(sample) + samples = np.array( + broadcast_distribution_samples(samples, size=size) + ) + # In the logp we assume the last axis holds the mixture components + # so we move the axis to the last dimension + samples = np.moveaxis(samples, 0, -1) + return samples.astype(self.dtype) + + def infer_comp_dist_shapes(self, point=None): + """Try to infer the shapes of the component distributions, + `comp_dists`, and how they should broadcast together. + The behavior is slightly different if `comp_dists` is a `Distribution` + as compared to when it is a list of `Distribution`s. When it is a list + the following procedure is repeated for each element in the list: + 1. Look up the `comp_dists.shape` + 2. If it is not empty, use it as `comp_dist_shape` + 3. 
If it is an empty tuple, a single random sample is drawn by calling + `comp_dists.random(point=point, size=None)`, and the returned + test_sample's shape is used as the inferred `comp_dists.shape` + + Parameters + ---------- + point: None or dict (optional) + Dictionary that maps rv names to values, to supply to + `self.comp_dists.random` + + Returns + ------- + comp_dist_shapes: shape tuple or list of shape tuples. + If `comp_dists` is a `Distribution`, it is a shape tuple of the + inferred distribution shape. + If `comp_dists` is a list of `Distribution`s, it is a list of + shape tuples inferred for each element in `comp_dists` + broadcast_shape: shape tuple + The shape that results from broadcasting all component's shapes + together. + """ + if self.comp_is_distribution: + if len(self._comp_dist_shapes) > 0: + comp_dist_shapes = self._comp_dist_shapes + else: + # Happens when the distribution is a scalar or when it was not + # given a shape. In these cases we try to draw a single value + # to check its shape, we use the provided point dictionary + # hoping that it can circumvent the Flat and HalfFlat + # undrawable distributions. + with _DrawValuesContextBlocker(): + test_sample = self._comp_dists.random(point=point, + size=None) + comp_dist_shapes = test_sample.shape + broadcast_shape = comp_dist_shapes else: - return samples + # Now we check the comp_dists distribution shape, see what + # the broadcast shape would be. This shape will be the dist_shape + # used by generate samples (the shape of a single random sample) + # from the mixture + comp_dist_shapes = [] + for dist_shape, comp_dist in zip(self._comp_dist_shapes, + self._comp_dists): + if dist_shape == tuple(): + # Happens when the distribution is a scalar or when it was + # not given a shape. In these cases we try to draw a single + # value to check its shape, we use the provided point + # dictionary hoping that it can circumvent the Flat and + # HalfFlat undrawable distributions. + with _DrawValuesContextBlocker(): + test_sample = comp_dist.random(point=point, + size=None) + dist_shape = test_sample.shape + comp_dist_shapes.append(dist_shape) + # All component distributions must broadcast with each other + try: + broadcast_shape = np.broadcast( + *[np.empty(shape) for shape in comp_dist_shapes] + ).shape + except Exception: + raise TypeError('Inferred comp_dist shapes do not broadcast ' + 'with each other. 
comp_dists inferred shapes ' + 'are: {}'.format(comp_dist_shapes)) + return comp_dist_shapes, broadcast_shape def logp(self, value): w = self.w @@ -199,14 +406,13 @@ def logp(self, value): def random(self, point=None, size=None): # Convert size to tuple size = to_tuple(size) - # Draw mixture weights and a sample from each mixture to infer shape + # Draw mixture weights and infer the comp_dists shapes with _DrawValuesContext() as draw_context: # We first need to check w and comp_tmp shapes and re compute size w = draw_values([self.w], point=point, size=size)[0] - with _DrawValuesContextBlocker(): - # We don't want to store the values drawn here in the context - # because they wont have the correct size - comp_tmp = self._comp_samples(point=point, size=None) + comp_dist_shapes, broadcast_shape = ( + self.infer_comp_dist_shapes(point=point) + ) # When size is not None, it's hard to tell the w parameter shape if size is not None and w.shape[:len(size)] == size: @@ -215,14 +421,61 @@ def random(self, point=None, size=None): w_shape = w.shape # Try to determine parameter shape and dist_shape - param_shape = np.broadcast(np.empty(w_shape), - comp_tmp).shape + if self.comp_is_distribution: + param_shape = np.broadcast(np.empty(w_shape), + np.empty(broadcast_shape)).shape + else: + param_shape = np.broadcast(np.empty(w_shape), + np.empty(broadcast_shape + (1,))).shape if np.asarray(self.shape).size != 0: dist_shape = np.broadcast(np.empty(self.shape), np.empty(param_shape[:-1])).shape else: dist_shape = param_shape[:-1] + # Try to determine the size that must be used to get the mixture + # components (i.e. get random choices using w). + # 1. There must be size independent choices based on w. + # 2. There must also be independent draws for each non singleton axis + # of w. + # 3. There must also be independent draws for each dimension added by + # self.shape with respect to the w.ndim. These usually correspond to + # observed variables with batch shapes + wsh = (1,) * (len(dist_shape) - len(w_shape) + 1) + w_shape[:-1] + psh = (1,) * (len(dist_shape) - len(param_shape) + 1) + param_shape[:-1] + w_sample_size = [] + # Loop through the dist_shape to get the conditions 2 and 3 first + for i in range(len(dist_shape)): + if dist_shape[i] != psh[i] and wsh[i] == 1: + # self.shape[i] is a non singleton dimension (usually caused by + # observed data) + sh = dist_shape[i] + else: + sh = wsh[i] + w_sample_size.append(sh) + if size is not None and w_sample_size[:len(size)] != size: + w_sample_size = size + tuple(w_sample_size) + # Broadcast w to the w_sample_size (add a singleton last axis for the + # mixture components) + w = broadcast_distribution_samples([w, np.empty(w_sample_size + (1,))], + size=size)[0] + + # Semiflatten the mixture weights. 
The last axis is the number of + # mixture mixture components, and the rest is all about size, + # dist_shape and broadcasting + w_ = np.reshape(w, (-1, w.shape[-1])) + w_samples = generate_samples(random_choice, + p=w_, + broadcast_shape=w.shape[:-1] or (1,), + dist_shape=w.shape[:-1] or (1,), + size=None) # w's shape already includes size + # Now we broadcast the chosen components to the dist_shape + w_samples = np.reshape(w_samples, w.shape[:-1]) + if size is not None and dist_shape[:len(size)] != size: + w_samples = np.broadcast_to(w_samples, size + dist_shape) + else: + w_samples = np.broadcast_to(w_samples, dist_shape) + # When size is not None, maybe dist_shape partially overlaps with size if size is not None: if size == dist_shape: @@ -237,51 +490,39 @@ def random(self, point=None, size=None): else: _size = int(np.prod(size)) - # Now we must broadcast w to the shape that considers size, dist_shape - # and param_shape. However, we must take care with the cases in which - # dist_shape and param_shape overlap - if size is not None and w.shape[:len(size)] == size: - if w.shape[:len(size + dist_shape)] != (size + dist_shape): - # To allow w to broadcast, we insert new axis in between the - # "size" axis and the "mixture" axis - _w = w[(slice(None),) * len(size) + # Index the size axis - (np.newaxis,) * len(dist_shape) + # Add new axis for the dist_shape - (slice(None),)] # Close with the slice of mixture components - w = np.broadcast_to(_w, size + dist_shape + (param_shape[-1],)) - elif size is not None: - w = np.broadcast_to(w, size + dist_shape + (param_shape[-1],)) - else: - w = np.broadcast_to(w, dist_shape + (param_shape[-1],)) - # Compute the total size of the mixture's random call with size if _size is not None: output_size = int(_size * np.prod(dist_shape) * param_shape[-1]) else: output_size = int(np.prod(dist_shape) * param_shape[-1]) # Get the size we need for the mixture's random call - mixture_size = int(output_size // np.prod(comp_tmp.shape)) + if self.comp_is_distribution: + mixture_size = int(output_size // np.prod(broadcast_shape)) + else: + mixture_size = int(output_size // + (np.prod(broadcast_shape) * param_shape[-1])) if mixture_size == 1 and _size is None: mixture_size = None - # Semiflatten the mixture weights. The last axis is the number of - # mixture mixture components, and the rest is all about size, - # dist_shape and broadcasting - w = np.reshape(w, (-1, w.shape[-1])) - # Normalize mixture weights - w = w / w.sum(axis=-1, keepdims=True) - - w_samples = generate_samples(random_choice, - p=w, - broadcast_shape=w.shape[:-1] or (1,), - dist_shape=w.shape[:-1] or (1,), - size=size) # Sample from the mixture with draw_context: - mixed_samples = self._comp_samples(point=point, - size=mixture_size) - w_samples = w_samples.flatten() + mixed_samples = self._comp_samples( + point=point, + size=mixture_size, + broadcast_shape=broadcast_shape, + comp_dist_shapes=comp_dist_shapes, + ) + # Test that the mixture has the same number of "samples" as w + if w_samples.size != (mixed_samples.size // w.shape[-1]): + raise ValueError('Inconsistent number of samples from the ' + 'mixture and mixture weights. Drew {} mixture ' + 'weights elements, and {} samples from the ' + 'mixture components.'. 
+ format(w_samples.size, + mixed_samples.size // w.shape[-1])) # Semiflatten the mixture to be able to zip it with w_samples - mixed_samples = np.reshape(mixed_samples, (-1, comp_tmp.shape[-1])) + w_samples = w_samples.flatten() + mixed_samples = np.reshape(mixed_samples, (-1, w.shape[-1])) # Select the samples from the mixture samples = np.array([mixed[choice] for choice, mixed in zip(w_samples, mixed_samples)]) diff --git a/pymc3/distributions/multivariate.py b/pymc3/distributions/multivariate.py index 7d6d7d4a84..d773036d77 100755 --- a/pymc3/distributions/multivariate.py +++ b/pymc3/distributions/multivariate.py @@ -267,7 +267,7 @@ def random(self, point=None, size=None): else: std_norm_shape = mu.shape standard_normal = np.random.standard_normal(std_norm_shape) - return mu + np.tensordot(standard_normal, chol, axes=[[-1], [-1]]) + return mu + np.einsum('...ij,...j->...i', chol, standard_normal) else: mu, tau = draw_values([self.mu, self.tau], point=point, size=size) if mu.shape[-1] != tau[0].shape[-1]: diff --git a/pymc3/tests/test_mixture.py b/pymc3/tests/test_mixture.py index de6a95fc44..45692da761 100644 --- a/pymc3/tests/test_mixture.py +++ b/pymc3/tests/test_mixture.py @@ -1,3 +1,4 @@ +import pytest import numpy as np from numpy.testing import assert_allclose @@ -9,18 +10,30 @@ from scipy.special import logsumexp from pymc3.theanof import floatX import theano +from theano import tensor as tt +from pymc3.distributions.distribution import to_tuple # Generate data def generate_normal_mixture_data(w, mu, sd, size=1000): component = np.random.choice(w.size, size=size, p=w) - x = np.random.normal(mu[component], sd[component], size=size) + mu, sd = np.broadcast_arrays(mu, sd) + out_size = to_tuple(size) + mu.shape[:-1] + mu_ = np.array([mu[..., comp] for comp in component.ravel()]) + sd_ = np.array([sd[..., comp] for comp in component.ravel()]) + mu_ = np.reshape(mu_, out_size) + sd_ = np.reshape(sd_, out_size) + x = np.random.normal(mu_, sd_, size=out_size) return x def generate_poisson_mixture_data(w, mu, size=1000): component = np.random.choice(w.size, size=size, p=w) - x = np.random.poisson(mu[component], size=size) + mu = np.atleast_1d(mu) + out_size = to_tuple(size) + mu.shape[:-1] + mu_ = np.array([mu[..., comp] for comp in component.ravel()]) + mu_ = np.reshape(mu_, out_size) + x = np.random.poisson(mu_, size=out_size) return x @@ -75,27 +88,98 @@ def test_normal_mixture(self): np.sort(self.norm_mu), rtol=0.1, atol=0.1) - def test_normal_mixture_nd(self): - nd, ncomp = 3, 5 + @pytest.mark.parametrize('nd,ncomp', + [(tuple(), 5), + (1, 5), + (3, 5), + ((3, 3), 5), + (3, 3), + ((3, 3), 3)], + ids=str) + def test_normal_mixture_nd(self, nd, ncomp): + nd = to_tuple(nd) + ncomp = int(ncomp) + comp_shape = nd + (ncomp,) + test_mus = np.random.randn(*comp_shape) + test_taus = np.random.gamma(1, 1, size=comp_shape) + observed = generate_normal_mixture_data(w=np.ones(ncomp)/ncomp, + mu=test_mus, + sd=1/np.sqrt(test_taus), + size=10) with Model() as model0: - mus = Normal('mus', shape=(nd, ncomp)) - taus = Gamma('taus', alpha=1, beta=1, shape=(nd, ncomp)) + mus = Normal('mus', shape=comp_shape) + taus = Gamma('taus', alpha=1, beta=1, shape=comp_shape) ws = Dirichlet('ws', np.ones(ncomp)) - mixture0 = NormalMixture('m', w=ws, mu=mus, tau=taus, shape=nd) + mixture0 = NormalMixture('m', w=ws, mu=mus, tau=taus, shape=nd, + comp_shape=comp_shape) + obs0 = NormalMixture('obs', w=ws, mu=mus, tau=taus, shape=nd, + comp_shape=comp_shape, + observed=observed) with Model() as model1: - mus = 
Normal('mus', shape=(nd, ncomp)) - taus = Gamma('taus', alpha=1, beta=1, shape=(nd, ncomp)) + mus = Normal('mus', shape=comp_shape) + taus = Gamma('taus', alpha=1, beta=1, shape=comp_shape) ws = Dirichlet('ws', np.ones(ncomp)) - comp_dist = [Normal.dist(mu=mus[:, i], tau=taus[:, i]) + comp_dist = [Normal.dist(mu=mus[..., i], tau=taus[..., i], + shape=nd) for i in range(ncomp)] mixture1 = Mixture('m', w=ws, comp_dists=comp_dist, shape=nd) + obs1 = Mixture('obs', w=ws, comp_dists=comp_dist, shape=nd, + observed=observed) + + with Model() as model2: + # Expected to fail if comp_shape is not provided, + # nd is multidim and it does not broadcast with ncomp. If by chance + # it does broadcast, an error is raised if the mixture is given + # observed data. + # Furthermore, the Mixture will also raise errors when the observed + # data is multidimensional but it does not broadcast well with + # comp_dists. + mus = Normal('mus', shape=comp_shape) + taus = Gamma('taus', alpha=1, beta=1, shape=comp_shape) + ws = Dirichlet('ws', np.ones(ncomp)) + if len(nd) > 1: + if nd[-1] != ncomp: + with pytest.raises(ValueError): + NormalMixture('m', w=ws, mu=mus, tau=taus, + shape=nd) + mixture2 = None + else: + mixture2 = NormalMixture('m', w=ws, mu=mus, tau=taus, + shape=nd) + else: + mixture2 = NormalMixture('m', w=ws, mu=mus, tau=taus, + shape=nd) + observed_fails = False + if len(nd) >= 1 and nd != (1,): + try: + np.broadcast(np.empty(comp_shape), observed) + except Exception: + observed_fails = True + if observed_fails: + with pytest.raises(ValueError): + NormalMixture('obs', w=ws, mu=mus, tau=taus, + shape=nd, + observed=observed) + obs2 = None + else: + obs2 = NormalMixture('obs', w=ws, mu=mus, tau=taus, + shape=nd, + observed=observed) testpoint = model0.test_point - testpoint['mus'] = np.random.randn(nd, ncomp) + testpoint['mus'] = test_mus + testpoint['taus'] = test_taus assert_allclose(model0.logp(testpoint), model1.logp(testpoint)) assert_allclose(mixture0.logp(testpoint), mixture1.logp(testpoint)) + assert_allclose(obs0.logp(testpoint), obs1.logp(testpoint)) + if mixture2 is not None and obs2 is not None: + assert_allclose(model0.logp(testpoint), model2.logp(testpoint)) + if mixture2 is not None: + assert_allclose(mixture0.logp(testpoint), mixture2.logp(testpoint)) + if obs2 is not None: + assert_allclose(obs0.logp(testpoint), obs2.logp(testpoint)) def test_poisson_mixture(self): with Model() as model: @@ -166,6 +250,10 @@ def test_mixture_of_mvn(self): mixlogp_st.sum() + priorlogp) def test_mixture_of_mixture(self): + if theano.config.floatX == 'float32': + rtol = 1e-4 + else: + rtol = 1e-7 nbr = 4 with Model() as model: # mixtures components @@ -192,26 +280,33 @@ def test_mixture_of_mixture(self): test_point = model.test_point def mixmixlogp(value, point): + floatX = theano.config.floatX priorlogp = st.dirichlet.logpdf(x=point['g_w'], alpha=np.ones(nbr)*0.0000001, - ) + \ - st.expon.logpdf(x=point['mu_g']).sum() + \ + ).astype(floatX) + \ + st.expon.logpdf(x=point['mu_g']).sum(dtype=floatX) + \ st.dirichlet.logpdf(x=point['l_w'], alpha=np.ones(nbr)*0.0000001, - ) + \ - st.expon.logpdf(x=point['mu_l']).sum() + \ + ).astype(floatX) + \ + st.expon.logpdf(x=point['mu_l']).sum(dtype=floatX) + \ st.dirichlet.logpdf(x=point['mix_w'], alpha=np.ones(2), - ) + ).astype(floatX) complogp1 = st.norm.logpdf(x=value, - loc=point['mu_g']) - mixlogp1 = logsumexp(np.log(point['g_w']) + complogp1, + loc=point['mu_g']).astype(floatX) + mixlogp1 = logsumexp(np.log(point['g_w']).astype(floatX) + + complogp1, axis=-1, 
keepdims=True) - complogp2 = st.lognorm.logpdf(value, 1., 0., np.exp(point['mu_l'])) - mixlogp2 = logsumexp(np.log(point['l_w']) + complogp2, + complogp2 = st.lognorm.logpdf(value, + 1., + 0., + np.exp(point['mu_l'])).astype(floatX) + mixlogp2 = logsumexp(np.log(point['l_w']).astype(floatX) + + complogp2, axis=-1, keepdims=True) complogp_mix = np.concatenate((mixlogp1, mixlogp2), axis=1) - mixmixlogpg = logsumexp(np.log(point['mix_w']) + complogp_mix, + mixmixlogpg = logsumexp(np.log(point['mix_w']).astype(floatX) + + complogp_mix, axis=-1, keepdims=True) return priorlogp, mixmixlogpg @@ -219,19 +314,23 @@ def mixmixlogp(value, point): priorlogp, mixmixlogpg = mixmixlogp(value, test_point) # check logp of mixture - assert_allclose(mixmixlogpg, mix.logp_elemwise(test_point)) + assert_allclose(mixmixlogpg, mix.logp_elemwise(test_point), + rtol=rtol) # check model logp assert_allclose(priorlogp + mixmixlogpg.sum(), - model.logp(test_point)) + model.logp(test_point), + rtol=rtol) # check input and check logp again test_point['g_w'] = np.asarray([.1, .1, .2, .6]) test_point['mu_g'] = np.exp(np.random.randn(nbr)) priorlogp, mixmixlogpg = mixmixlogp(value, test_point) - assert_allclose(mixmixlogpg, mix.logp_elemwise(test_point)) + assert_allclose(mixmixlogpg, mix.logp_elemwise(test_point), + rtol=rtol) assert_allclose(priorlogp + mixmixlogpg.sum(), - model.logp(test_point)) + model.logp(test_point), + rtol=rtol) def test_sample_prior_and_posterior(self): def build_toy_dataset(N, K): @@ -284,3 +383,114 @@ def build_toy_dataset(N, K): assert prior['x_obs'].shape == (n_samples,) + X.shape assert prior['mu0'].shape == (n_samples, D) assert prior['chol_cov_0'].shape == (n_samples, D * (D + 1) // 2) + + +class TestMixtureVsLatent(SeededTest): + def setup_method(self, *args, **kwargs): + super().setup_method(*args, **kwargs) + self.nd = 3 + self.npop = 3 + self.mus = tt.as_tensor_variable( + np.tile( + np.reshape( + np.arange(self.npop), + (1, -1,) + ), + (self.nd, 1,) + ) + ) + + def test_1d_w(self): + nd = self.nd + npop = self.npop + mus = self.mus + size = 100 + with pm.Model() as model: + m = pm.NormalMixture('m', + w=np.ones(npop) / npop, + mu=mus, + sigma=1e-5, + comp_shape=(nd, npop), + shape=nd) + z = pm.Categorical('z', p=np.ones(npop) / npop) + latent_m = pm.Normal('latent_m', + mu=mus[..., z], + sigma=1e-5, + shape=nd) + + m_val = m.random(size=size) + latent_m_val = latent_m.random(size=size) + assert m_val.shape == latent_m_val.shape + # Test that each element in axis = -1 comes from the same mixture + # component + assert all(np.all(np.diff(m_val) < 1e-3, axis=-1)) + assert all(np.all(np.diff(latent_m_val) < 1e-3, axis=-1)) + + self.samples_from_same_distribution(m_val, latent_m_val) + self.logp_matches(m, latent_m, z, npop, model=model) + + def test_2d_w(self): + nd = self.nd + npop = self.npop + mus = self.mus + size = 100 + with pm.Model() as model: + m = pm.NormalMixture('m', + w=np.ones((nd, npop)) / npop, + mu=mus, + sigma=1e-5, + comp_shape=(nd, npop), + shape=nd) + z = pm.Categorical('z', p=np.ones(npop) / npop, shape=nd) + mu = tt.as_tensor_variable([mus[i, z[i]] for i in range(nd)]) + latent_m = pm.Normal('latent_m', + mu=mu, + sigma=1e-5, + shape=nd) + + m_val = m.random(size=size) + latent_m_val = latent_m.random(size=size) + assert m_val.shape == latent_m_val.shape + # Test that each element in axis = -1 can come from independent + # components + assert not all(np.all(np.diff(m_val) < 1e-3, axis=-1)) + assert not all(np.all(np.diff(latent_m_val) < 1e-3, axis=-1)) + + 
self.samples_from_same_distribution(m_val, latent_m_val) + self.logp_matches(m, latent_m, z, npop, model=model) + + def samples_from_same_distribution(self, *args): + # Test if flattened samples distributions match (marginals match) + _, p_marginal = st.ks_2samp(*[s.flatten() for s in args]) + # Test if correlations within non independent draws match + _, p_correlation = st.ks_2samp( + *[np.array([np.corrcoef(ss) for ss in s]).flatten() + for s in args] + ) + assert p_marginal >= 0.05 and p_correlation >= 0.05 + + def logp_matches(self, mixture, latent_mix, z, npop, model): + if theano.config.floatX == 'float32': + rtol = 1e-4 + else: + rtol = 1e-7 + test_point = model.test_point + test_point['latent_m'] = test_point['m'] + mix_logp = mixture.logp(test_point) + logps = [] + for component in range(npop): + test_point['z'] = component * np.ones(z.distribution.shape) + # Count the number of axes that should be broadcasted from z to + # modify the logp + sh1 = test_point['z'].shape + sh2 = test_point['latent_m'].shape + if len(sh1) > len(sh2): + sh2 = (1,) * (len(sh1) - len(sh2)) + sh2 + elif len(sh2) > len(sh1): + sh1 = (1,) * (len(sh2) - len(sh1)) + sh1 + reps = np.prod([s2 if s1 != s2 else 1 for s1, s2 in + zip(sh1, sh2)]) + z_logp = z.logp(test_point) * reps + logps.append(z_logp + latent_mix.logp(test_point)) + latent_mix_logp = logsumexp(np.array(logps), axis=0) + assert_allclose(mix_logp, latent_mix_logp, rtol=rtol) diff --git a/pymc3/tests/test_sampling.py b/pymc3/tests/test_sampling.py index 1d73e55edf..af1081fba5 100644 --- a/pymc3/tests/test_sampling.py +++ b/pymc3/tests/test_sampling.py @@ -223,8 +223,9 @@ def test_normal_scalar(self): ppc = pm.sample_posterior_predictive(trace, samples=1000, vars=[a]) assert 'a' in ppc assert ppc['a'].shape == (1000,) - _, pval = stats.kstest(ppc['a'], - stats.norm(loc=0, scale=np.sqrt(2)).cdf) + # mu's standard deviation may have changed thanks to a's observed + _, pval = stats.kstest(ppc['a'] - trace['mu'], + stats.norm(loc=0, scale=1).cdf) assert pval > 0.001 with model: