diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md
index fa0c0d1f16..b3ebc22f46 100644
--- a/RELEASE-NOTES.md
+++ b/RELEASE-NOTES.md
@@ -4,6 +4,8 @@

 ### New features

+- `Mixture` now supports mixtures of multidimensional probability distributions, not just lists of 1D distributions.
+
 ### Maintenance

 - All occurrences of `sd` as a parameter name have been renamed to `sigma`. `sd` will continue to function for backwards compatibility.
@@ -13,6 +15,13 @@
 - Added a fix to allow the imputation of single missing values of observed data, which previously would fail (Fix issue #3122).
 - Fix for #3346. The `draw_values` function was too permissive with what could be grabbed from inside `point`, which led to an error when sampling posterior predictives of variables that depended on shared variables that had changed their shape after `pm.sample()` had been called.
 - Fix for #3354. `draw_values` now adds the theano graph descendants of `TensorConstant` or `SharedVariables` to the named relationship nodes stack, only if these descendants are `ObservedRV` or `MultiObservedRV` instances.
+- Fixed a bug in `broadcast_distribution_samples`, which did not correctly handle cases in which some samples did not have the size tuple prepended.
+- Changed `MvNormal.random`'s usage of `tensordot` for Cholesky-encoded covariances. The `tensordot` call led to wrong axis broadcasting and seemed to be the cause of issue #3343.
+- Fixed a defect in `Mixture.random` for multidimensional mixtures: the chosen mixture component was not preserved across all elements of a given draw, which partly broke the correlations between elements within a single draw.
+- Restructured `Mixture.random` to allow better use of vectorized calls to `comp_dists.random`.
+- Added tests for mixtures of multidimensional distributions to the test suite.
+- Fixed incorrect usage of `broadcast_distribution_samples` in `DiscreteWeibull`.
+- `Mixture`'s default dtype is now determined by `theano.config.floatX`.

 ### Deprecations

diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py
index cd963110cf..e74293b068 100644
--- a/pymc3/distributions/discrete.py
+++ b/pymc3/distributions/discrete.py
@@ -347,12 +347,12 @@ def _ppf(self, p):

     def _random(self, q, beta, size=None):
         p = np.random.uniform(size=size)
-        p, q, beta = broadcast_distribution_samples([p, q, beta], size=size)
         return np.ceil(np.power(np.log(1 - p) / np.log(q), 1. / beta)) - 1

     def random(self, point=None, size=None):
         q, beta = draw_values([self.q, self.beta], point=point, size=size)
+        q, beta = broadcast_distribution_samples([q, beta], size=size)

         return generate_samples(self._random, q, beta,
                                 dist_shape=self.shape,
diff --git a/pymc3/distributions/distribution.py b/pymc3/distributions/distribution.py
index 16b9cf14d0..d9a74fa43c 100644
--- a/pymc3/distributions/distribution.py
+++ b/pymc3/distributions/distribution.py
@@ -254,7 +254,7 @@ class _DrawValuesContextBlocker(_DrawValuesContext, metaclass=InitContextMeta):
     """
     def __new__(cls, *args, **kwargs):
         # resolves the parent instance
-        instance = super(_DrawValuesContextBlocker, cls).__new__(cls)
+        instance = super().__new__(cls)
         instance._parent = None
         return instance

@@ -599,6 +599,9 @@ def generate_samples(generator, *args, **kwargs):
        parameters. This may be required when the parameter shape
        does not determine the shape of a single sample, for example,
        the shape of the probabilities in the Categorical distribution.
+ not_broadcast_kwargs: dict or None + Key word argument dictionary to provide to the random generator, which + must not be broadcasted with the rest of the *args and **kwargs. Any remaining *args and **kwargs are passed on to the generator function. """ @@ -606,6 +609,9 @@ def generate_samples(generator, *args, **kwargs): one_d = _is_one_d(dist_shape) size = kwargs.pop('size', None) broadcast_shape = kwargs.pop('broadcast_shape', None) + not_broadcast_kwargs = kwargs.pop('not_broadcast_kwargs', None) + if not_broadcast_kwargs is None: + not_broadcast_kwargs = dict() if size is None: size = 1 @@ -625,6 +631,8 @@ def generate_samples(generator, *args, **kwargs): kwargs = {k: v.reshape(v.shape + (1,) * (max_dims - v.ndim)) for k, v in kwargs.items()} inputs = args + tuple(kwargs.values()) broadcast_shape = np.broadcast(*inputs).shape # size of generator(size=1) + # Update kwargs with the keyword arguments that were not broadcasted + kwargs.update(not_broadcast_kwargs) dist_shape = to_tuple(dist_shape) broadcast_shape = to_tuple(broadcast_shape) @@ -639,20 +647,30 @@ def generate_samples(generator, *args, **kwargs): samples = generator(size=broadcast_shape, *args, **kwargs) elif dist_shape == broadcast_shape: samples = generator(size=size_tup + dist_shape, *args, **kwargs) - elif len(dist_shape) == 0 and size_tup and broadcast_shape[:len(size_tup)] == size_tup: - # Input's dist_shape is scalar, but it has size repetitions. - # So now the size matches but we have to manually broadcast to - # the right dist_shape - samples = [generator(*args, **kwargs)] - if samples[0].shape == broadcast_shape: - samples = samples[0] + elif len(dist_shape) == 0 and size_tup and broadcast_shape: + # There is no dist_shape (scalar distribution) but the parameters + # broadcast shape and size_tup determine the size to provide to + # the generator + if broadcast_shape[:len(size_tup)] == size_tup: + # Input's dist_shape is scalar, but it has size repetitions. + # So now the size matches but we have to manually broadcast to + # the right dist_shape + samples = [generator(*args, **kwargs)] + if samples[0].shape == broadcast_shape: + samples = samples[0] + else: + suffix = broadcast_shape[len(size_tup):] + dist_shape + samples.extend([generator(*args, **kwargs). + reshape(broadcast_shape)[..., np.newaxis] + for _ in range(np.prod(suffix, + dtype=int) - 1)]) + samples = np.hstack(samples).reshape(size_tup + suffix) else: - suffix = broadcast_shape[len(size_tup):] + dist_shape - samples.extend([generator(*args, **kwargs). - reshape(broadcast_shape)[..., np.newaxis] - for _ in range(np.prod(suffix, - dtype=int) - 1)]) - samples = np.hstack(samples).reshape(size_tup + suffix) + # The parameter shape is given, but we have to concatenate it + # with the size tuple + samples = generator(size=size_tup + broadcast_shape, + *args, + **kwargs) else: samples = None # Args have been broadcast correctly, can just ask for the right shape out @@ -686,27 +704,68 @@ def generate_samples(generator, *args, **kwargs): def broadcast_distribution_samples(samples, size=None): + """Broadcast samples drawn from distributions taking into account the + size (i.e. the number of samples) of the draw, which is prepended to + the sample's shape. + + Parameters + ---------- + samples: Iterable of ndarrays holding the sampled values + size: None, int or tuple (optional) + size of the sample set requested. + + Returns + ------- + List of broadcasted sample arrays + + Examples + -------- + .. 
code-block:: python

+            size = 100
+            sample0 = np.random.randn(size)
+            sample1 = np.random.randn(size, 5)
+            sample2 = np.random.randn(size, 4, 5)
+            out = broadcast_distribution_samples([sample0, sample1, sample2],
+                                                 size=size)
+            assert all((o.shape == (size, 4, 5) for o in out))
+            assert np.all(sample0[:, None, None] == out[0])
+            assert np.all(sample1[:, None, :] == out[1])
+            assert np.all(sample2 == out[2])
+
+    .. code-block:: python
+
+            size = 100
+            sample0 = np.random.randn(size)
+            sample1 = np.random.randn(5)
+            sample2 = np.random.randn(4, 5)
+            out = broadcast_distribution_samples([sample0, sample1, sample2],
+                                                 size=size)
+            assert all((o.shape == (size, 4, 5) for o in out))
+            assert np.all(sample0[:, None, None] == out[0])
+            assert np.all(sample1 == out[1])
+            assert np.all(sample2 == out[2])
+    """
     if size is None:
         return np.broadcast_arrays(*samples)
     _size = to_tuple(size)
-    try:
-        broadcasted_samples = np.broadcast_arrays(*samples)
-    except ValueError:
-        # Raw samples shapes
-        p_shapes = [p.shape for p in samples]
-        # samples shapes without the size prepend
-        sp_shapes = [s[len(_size):] if _size == s[:len(_size)] else s
-                     for s in p_shapes]
-        broadcast_shape = np.broadcast(*[np.empty(s) for s in sp_shapes]).shape
-        broadcasted_samples = []
-        for param, p_shape, sp_shape in zip(samples, p_shapes, sp_shapes):
-            if _size == p_shape[:len(_size)]:
-                slicer_head = [slice(None)] * len(_size)
-            else:
-                slicer_head = [np.newaxis] * len(_size)
+    # Raw samples shapes
+    p_shapes = [p.shape for p in samples]
+    # samples shapes without the size prepend
+    sp_shapes = [s[len(_size):] if _size == s[:len(_size)] else s
+                 for s in p_shapes]
+    broadcast_shape = np.broadcast(*[np.empty(s) for s in sp_shapes]).shape
+    broadcasted_samples = []
+    for param, p_shape, sp_shape in zip(samples, p_shapes, sp_shapes):
+        if _size == p_shape[:len(_size)]:
+            # If size prepends the shape, then we have to add a broadcasting
+            # axis in the middle
+            slicer_head = [slice(None)] * len(_size)
             slicer_tail = ([np.newaxis] * (len(broadcast_shape) -
                                            len(sp_shape)) +
                            [slice(None)] * len(sp_shape))
-            broadcasted_samples.append(param[tuple(slicer_head + slicer_tail)])
-        broadcasted_samples = np.broadcast_arrays(*broadcasted_samples)
-    return broadcasted_samples
+        else:
+            # If size does not prepend the shape, then we have to leave the
+            # parameter as is
+            slicer_head = []
+            slicer_tail = [slice(None)] * len(sp_shape)
+        broadcasted_samples.append(param[tuple(slicer_head + slicer_tail)])
+    return np.broadcast_arrays(*broadcasted_samples)
diff --git a/pymc3/distributions/mixture.py b/pymc3/distributions/mixture.py
index 2333a83505..93e2e2326c 100644
--- a/pymc3/distributions/mixture.py
+++ b/pymc3/distributions/mixture.py
@@ -1,4 +1,6 @@
+from collections.abc import Iterable
 import numpy as np
+import theano
 import theano.tensor as tt

 from pymc3.util import get_variable_name
@@ -6,8 +8,10 @@
 from .dist_math import bound, random_choice
 from .distribution import (Discrete, Distribution, draw_values,
                            generate_samples, _DrawValuesContext,
-                           _DrawValuesContextBlocker, to_tuple)
+                           _DrawValuesContextBlocker, to_tuple,
+                           broadcast_distribution_samples)
 from .continuous import get_tau_sigma, Normal
+from ..theanof import _conversion_map


 def all_discrete(comp_dists):
@@ -39,16 +43,16 @@ class Mixture(Distribution):
         w >= 0 and w <= 1
         the mixture weights
     comp_dists : multidimensional PyMC3 distribution (e.g. `pm.Poisson.dist(...)`)
-        or iterable of one-dimensional PyMC3 distributions the
-        component distributions :math:`f_1, \ldots, f_n`
+        or iterable of PyMC3 distributions; the component distributions
+        :math:`f_1, \ldots, f_n`

-    Example
-    -------
+    Examples
+    --------
     .. code-block:: python

         # 2-Mixture Poisson distribution
         with pm.Model() as model:
-            lam = pm.Exponential('lam', lam=1, shape=(2,))  # `shape=(2,)` indicates two mixtures.
+            lam = pm.Exponential('lam', lam=1, shape=(2,))  # `shape=(2,)` indicates two mixture components.

             # As we just need the logp, rather than add a RV to the model, we need to call .dist()
             components = pm.Poisson.dist(mu=lam, shape=(2,))
@@ -68,9 +72,50 @@ class Mixture(Distribution):
             w = pm.Dirichlet('w', a=np.array([1, 1]))

             like = pm.Mixture('like', w=w, comp_dists = [pois1, pois2], observed=data)
+
+        # npop-component Mixture of multidimensional Gaussians
+        npop = 5
+        nd = (3, 4)
+        with pm.Model() as model:
+            mu = pm.Normal('mu', mu=np.arange(npop), sigma=1, shape=npop)  # Each component has an independent mean
+
+            w = pm.Dirichlet('w', a=np.ones(npop))
+
+            components = pm.Normal.dist(mu=mu, sigma=1, shape=nd + (npop,))  # nd + (npop,) shaped array of independent Normals
+
+            like = pm.Mixture('like', w=w, comp_dists = components, observed=data, shape=nd)  # The resulting mixture is nd-shaped
+
+        # Multidimensional Mixture as stacked independent mixtures
+        with pm.Model() as model:
+            mu = pm.Normal('mu', mu=np.arange(5), sigma=1, shape=5)  # Each component has an independent mean
+
+            w = pm.Dirichlet('w', a=np.ones((3, 5)))  # w is a stack of 3 independent 5-component weight arrays
+
+            components = pm.Normal.dist(mu=mu, sigma=1, shape=(3, 5))
+
+            # The mixture is an array of 3 elements.
+            # Each can be thought of as an independent scalar mixture of 5 components
+            like = pm.Mixture('like', w=w, comp_dists = components, observed=data, shape=3)
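+
+        # A mixture whose components are given as a list of multidimensional
+        # distributions (a sketch: reuses `nd` from above and assumes `data`
+        # is an nd-shaped observed array)
+        with pm.Model() as model:
+            mu1 = pm.Normal('mu1', mu=0, sigma=1, shape=nd)
+            mu2 = pm.Normal('mu2', mu=1, sigma=1, shape=nd)
+
+            w = pm.Dirichlet('w', a=np.ones(2))
+
+            # Each list element is an nd-shaped component distribution
+            components = [pm.Normal.dist(mu=mu1, sigma=1, shape=nd),
+                          pm.Normal.dist(mu=mu2, sigma=1, shape=nd)]
+
+            like = pm.Mixture('like', w=w, comp_dists=components, observed=data, shape=nd)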
     """

     def __init__(self, w, comp_dists, *args, **kwargs):
+        # comp_dists type checking
+        if not (
+            isinstance(comp_dists, Distribution)
+            or (
+                isinstance(comp_dists, Iterable)
+                and all((isinstance(c, Distribution) for c in comp_dists))
+            )
+        ):
+            raise TypeError(
+                "Supplied Mixture comp_dists must be a "
+                "Distribution or an iterable of "
+                "Distributions. Got {} instead.".format(
+                    type(comp_dists)
+                    if not isinstance(comp_dists, Iterable)
+                    else [type(c) for c in comp_dists]
+                )
+            )
         shape = kwargs.pop('shape', ())

         self.w = w = tt.as_tensor_variable(w)
@@ -79,9 +124,9 @@ def __init__(self, w, comp_dists, *args, **kwargs):
         defaults = kwargs.pop('defaults', [])

         if all_discrete(comp_dists):
-            dtype = kwargs.pop('dtype', 'int64')
+            default_dtype = _conversion_map[theano.config.floatX]
         else:
-            dtype = kwargs.pop('dtype', 'float64')
+            default_dtype = theano.config.floatX

             try:
                 self.mean = (w * self._comp_means()).sum(axis=-1)
@@ -90,6 +135,7 @@ def __init__(self, w, comp_dists, *args, **kwargs):
                     defaults.append('mean')
             except AttributeError:
                 pass
+        dtype = kwargs.pop('dtype', default_dtype)

         try:
             comp_modes = self._comp_modes()
@@ -108,41 +154,128 @@ def comp_dists(self):
         return self._comp_dists

     @comp_dists.setter
-    def comp_dists(self, _comp_dists):
-        self._comp_dists = _comp_dists
-        # Tests if the comp_dists can call random with non None size
-        with _DrawValuesContextBlocker():
-            if isinstance(self.comp_dists, (list, tuple)):
-                try:
-                    [comp_dist.random(size=23)
-                     for comp_dist in self.comp_dists]
-                    self._comp_dists_vect = True
-                except Exception:
-                    # The comp_dists cannot call random with non None size or
-                    # without knowledge of the point so we assume that we will
-                    # have to iterate calls to random to get the correct size
-                    self._comp_dists_vect = False
+    def comp_dists(self, comp_dists):
+        self._comp_dists = comp_dists
+        if isinstance(comp_dists, Distribution):
+            self._comp_dist_shapes = to_tuple(comp_dists.shape)
+            self._broadcast_shape = self._comp_dist_shapes
+            self.comp_is_distribution = True
+        else:
+            # Now we check the comp_dists distribution shapes to see what
+            # the broadcast shape would be. This shape will be the dist_shape
+            # used by generate_samples (the shape of a single random sample)
+            # from the mixture
+            self._comp_dist_shapes = [to_tuple(d.shape) for d in comp_dists]
+            # All component distributions must broadcast with each other
+            try:
+                self._broadcast_shape = np.broadcast(
+                    *[np.empty(shape) for shape in self._comp_dist_shapes]
+                ).shape
+            except Exception:
+                raise TypeError('Supplied comp_dists shapes do not broadcast '
+                                'with each other. comp_dists shapes are: '
+                                '{}'.format(self._comp_dist_shapes))
+
+            # We wrap each comp_dist.random by adding the kwarg raw_size_,
+            # which will be the size attribute passed to _comp_samples.
+            # _comp_samples then calls generate_samples, which may change the
+            # size value to make it compatible with scipy.stats.*.rvs
+            self._generators = []
+            for comp_dist in comp_dists:
+                generator = Mixture._comp_dist_random_wrapper(comp_dist.random)
+                self._generators.append(generator)
+            self.comp_is_distribution = False
+
+    @staticmethod
+    def _comp_dist_random_wrapper(random):
+        """Wrap the comp_dists.random method to take the kwarg raw_size_ and
+        use its value to replace the size parameter. This is needed because
+        generate_samples makes the size value compatible with the
+        scipy.stats.*.rvs, where size has a different meaning than in the
+        distributions' random methods.
+ """ + def wrapped_random(*args, **kwargs): + raw_size_ = kwargs.pop('raw_size_', None) + # Distribution.random's signature is always (point=None, size=None) + # so size could be the second arg or be given as a kwarg + if len(args) > 1: + args[1] = raw_size_ else: - try: - self.comp_dists.random(size=23) - self._comp_dists_vect = True - except Exception: - # The comp_dists cannot call random with non None size or - # without knowledge of the point so we assume that we will - # have to iterate calls to random to get the correct size - self._comp_dists_vect = False + kwargs['size'] = raw_size_ + return random(*args, **kwargs) + return wrapped_random def _comp_logp(self, value): comp_dists = self.comp_dists - try: - value_ = value if value.ndim > 1 else tt.shape_padright(value) - + if self.comp_is_distribution: + # Value can be many things. It can be the self tensor, the mode + # test point or it can be observed data. The latter case requires + # careful handling of shape, as the observed's shape could look + # like (repetitions,) + dist_shape, which does not include the last + # mixture axis. For this reason, we try to eval the value.shape, + # compare it with self.shape and shape_padright if we infer that + # the value holds observed data + try: + val_shape = tuple(value.shape.eval()) + except AttributeError: + val_shape = value.shape + except theano.gof.MissingInputError: + val_shape = None + try: + self_shape = tuple(self.shape) + except AttributeError: + # Happens in __init__ when computing self.logp(comp_modes) + self_shape = None + comp_shape = tuple(comp_dists.shape) + ndim = value.ndim + if ( + val_shape is not None and + not((self_shape is not None and val_shape == self_shape) or + val_shape == comp_shape) + ): + # value is neither the test point nor the self tensor, it + # is likely to hold observed values, so we must compute the + # ndim discarding the dimensions that don't match + # self_shape + if ( + self_shape and + val_shape[-len(self_shape):] == self_shape + ): + # value has observed values for the Mixture + ndim = len(self_shape) + elif ( + comp_shape and + val_shape[-len(comp_shape):] == comp_shape + ): + # value has observed for the Mixture components + ndim = len(comp_shape) + else: + # We cannot infer what was passed, we handle this + # as was done in earlier versions of Mixture. We pad + # always if ndim is lower or equal to 1 (default + # legacy implementation) + if ndim <= 1: + ndim = len(comp_dists.shape) - 1 + else: + # We reach this point if value does not hold observed data, so + # we can use its ndim safely to determine shape padding, or it + # holds something that we cannot infer, so we revert to using + # the value's ndim for shape padding. 
+ # We will always pad a single dimension if ndim is lower or + # equal to 1 (default legacy implementation) + if ndim <= 1: + ndim = len(comp_dists.shape) - 1 + if ndim < len(comp_dists.shape): + value_ = tt.shape_padright(value, + len(comp_dists.shape) - ndim) + else: + value_ = value return comp_dists.logp(value_) - except AttributeError: + else: return tt.squeeze(tt.stack([comp_dist.logp(value) for comp_dist in comp_dists], - axis=1)) + axis=-1)) def _comp_means(self): try: @@ -150,7 +283,7 @@ def _comp_means(self): except AttributeError: return tt.squeeze(tt.stack([comp_dist.mean for comp_dist in self.comp_dists], - axis=1)) + axis=-1)) def _comp_modes(self): try: @@ -158,36 +291,110 @@ def _comp_modes(self): except AttributeError: return tt.squeeze(tt.stack([comp_dist.mode for comp_dist in self.comp_dists], - axis=1)) + axis=-1)) - def _comp_samples(self, point=None, size=None): - if self._comp_dists_vect or size is None: - try: - return self.comp_dists.random(point=point, size=size) - except AttributeError: - samples = np.array([comp_dist.random(point=point, size=size) - for comp_dist in self.comp_dists]) - samples = np.moveaxis(samples, 0, samples.ndim - 1) + def _comp_samples(self, point=None, size=None, + comp_dist_shapes=None, + broadcast_shape=None): + if self.comp_is_distribution: + samples = self._comp_dists.random(point=point, size=size) else: - # We must iterate the calls to random manually - size = to_tuple(size) - _size = int(np.prod(size)) - try: - samples = np.array([self.comp_dists.random(point=point, - size=None) - for _ in range(_size)]) - samples = np.reshape(samples, size + samples.shape[1:]) - except AttributeError: - samples = np.array([[comp_dist.random(point=point, size=None) - for _ in range(_size)] - for comp_dist in self.comp_dists]) - samples = np.moveaxis(samples, 0, samples.ndim - 1) - samples = np.reshape(samples, size + samples[1:]) - - if samples.shape[-1] == 1: - return samples[..., 0] + if comp_dist_shapes is None: + comp_dist_shapes = self._comp_dist_shapes + if broadcast_shape is None: + broadcast_shape = self._sample_shape + samples = [] + for dist_shape, generator in zip(comp_dist_shapes, + self._generators): + sample = generate_samples( + generator=generator, + dist_shape=dist_shape, + broadcast_shape=broadcast_shape, + point=point, + size=size, + not_broadcast_kwargs={'raw_size_': size}, + ) + samples.append(sample) + samples = np.array( + broadcast_distribution_samples(samples, size=size) + ) + # In the logp we assume the last axis holds the mixture components + # so we move the axis to the last dimension + samples = np.moveaxis(samples, 0, -1) + return samples.astype(self.dtype) + + def infer_comp_dist_shapes(self, point=None): + """Try to infer the shapes of the component distributions, + `comp_dists`, and how they should broadcast together. + The behavior is slightly different if `comp_dists` is a `Distribution` + as compared to when it is a list of `Distribution`s. When it is a list + the following procedure is repeated for each element in the list: + 1. Look up the `comp_dists.shape` + 2. If it is not empty, use it as `comp_dist_shape` + 3. 
If it is an empty tuple, a single random sample is drawn by calling + `comp_dists.random(point=point, size=None)`, and the returned + test_sample's shape is used as the inferred `comp_dists.shape` + + Parameters + ---------- + point: None or dict (optional) + Dictionary that maps rv names to values, to supply to + `self.comp_dists.random` + + Returns + ------- + comp_dist_shapes: shape tuple or list of shape tuples. + If `comp_dists` is a `Distribution`, it is a shape tuple of the + inferred distribution shape. + If `comp_dists` is a list of `Distribution`s, it is a list of + shape tuples inferred for each element in `comp_dists` + broadcast_shape: shape tuple + The shape that results from broadcasting all component's shapes + together. + """ + if self.comp_is_distribution: + if len(self._comp_dist_shapes) > 0: + comp_dist_shapes = self._comp_dist_shapes + else: + # Happens when the distribution is a scalar or when it was not + # given a shape. In these cases we try to draw a single value + # to check its shape, we use the provided point dictionary + # hoping that it can circumvent the Flat and HalfFlat + # undrawable distributions. + with _DrawValuesContextBlocker(): + test_sample = self._comp_dists.random(point=point, + size=None) + comp_dist_shapes = test_sample.shape + broadcast_shape = comp_dist_shapes else: - return samples + # Now we check the comp_dists distribution shape, see what + # the broadcast shape would be. This shape will be the dist_shape + # used by generate samples (the shape of a single random sample) + # from the mixture + comp_dist_shapes = [] + for dist_shape, comp_dist in zip(self._comp_dist_shapes, + self._comp_dists): + if dist_shape == tuple(): + # Happens when the distribution is a scalar or when it was + # not given a shape. In these cases we try to draw a single + # value to check its shape, we use the provided point + # dictionary hoping that it can circumvent the Flat and + # HalfFlat undrawable distributions. + with _DrawValuesContextBlocker(): + test_sample = comp_dist.random(point=point, + size=None) + dist_shape = test_sample.shape + comp_dist_shapes.append(dist_shape) + # All component distributions must broadcast with each other + try: + broadcast_shape = np.broadcast( + *[np.empty(shape) for shape in comp_dist_shapes] + ).shape + except Exception: + raise TypeError('Inferred comp_dist shapes do not broadcast ' + 'with each other. 
comp_dists inferred shapes ' + 'are: {}'.format(comp_dist_shapes)) + return comp_dist_shapes, broadcast_shape def logp(self, value): w = self.w @@ -199,14 +406,13 @@ def logp(self, value): def random(self, point=None, size=None): # Convert size to tuple size = to_tuple(size) - # Draw mixture weights and a sample from each mixture to infer shape + # Draw mixture weights and infer the comp_dists shapes with _DrawValuesContext() as draw_context: # We first need to check w and comp_tmp shapes and re compute size w = draw_values([self.w], point=point, size=size)[0] - with _DrawValuesContextBlocker(): - # We don't want to store the values drawn here in the context - # because they wont have the correct size - comp_tmp = self._comp_samples(point=point, size=None) + comp_dist_shapes, broadcast_shape = ( + self.infer_comp_dist_shapes(point=point) + ) # When size is not None, it's hard to tell the w parameter shape if size is not None and w.shape[:len(size)] == size: @@ -215,14 +421,61 @@ def random(self, point=None, size=None): w_shape = w.shape # Try to determine parameter shape and dist_shape - param_shape = np.broadcast(np.empty(w_shape), - comp_tmp).shape + if self.comp_is_distribution: + param_shape = np.broadcast(np.empty(w_shape), + np.empty(broadcast_shape)).shape + else: + param_shape = np.broadcast(np.empty(w_shape), + np.empty(broadcast_shape + (1,))).shape if np.asarray(self.shape).size != 0: dist_shape = np.broadcast(np.empty(self.shape), np.empty(param_shape[:-1])).shape else: dist_shape = param_shape[:-1] + # Try to determine the size that must be used to get the mixture + # components (i.e. get random choices using w). + # 1. There must be size independent choices based on w. + # 2. There must also be independent draws for each non singleton axis + # of w. + # 3. There must also be independent draws for each dimension added by + # self.shape with respect to the w.ndim. These usually correspond to + # observed variables with batch shapes + wsh = (1,) * (len(dist_shape) - len(w_shape) + 1) + w_shape[:-1] + psh = (1,) * (len(dist_shape) - len(param_shape) + 1) + param_shape[:-1] + w_sample_size = [] + # Loop through the dist_shape to get the conditions 2 and 3 first + for i in range(len(dist_shape)): + if dist_shape[i] != psh[i] and wsh[i] == 1: + # self.shape[i] is a non singleton dimension (usually caused by + # observed data) + sh = dist_shape[i] + else: + sh = wsh[i] + w_sample_size.append(sh) + if size is not None and w_sample_size[:len(size)] != size: + w_sample_size = size + tuple(w_sample_size) + # Broadcast w to the w_sample_size (add a singleton last axis for the + # mixture components) + w = broadcast_distribution_samples([w, np.empty(w_sample_size + (1,))], + size=size)[0] + + # Semiflatten the mixture weights. 
The last axis is the number of + # mixture mixture components, and the rest is all about size, + # dist_shape and broadcasting + w_ = np.reshape(w, (-1, w.shape[-1])) + w_samples = generate_samples(random_choice, + p=w_, + broadcast_shape=w.shape[:-1] or (1,), + dist_shape=w.shape[:-1] or (1,), + size=None) # w's shape already includes size + # Now we broadcast the chosen components to the dist_shape + w_samples = np.reshape(w_samples, w.shape[:-1]) + if size is not None and dist_shape[:len(size)] != size: + w_samples = np.broadcast_to(w_samples, size + dist_shape) + else: + w_samples = np.broadcast_to(w_samples, dist_shape) + # When size is not None, maybe dist_shape partially overlaps with size if size is not None: if size == dist_shape: @@ -237,51 +490,39 @@ def random(self, point=None, size=None): else: _size = int(np.prod(size)) - # Now we must broadcast w to the shape that considers size, dist_shape - # and param_shape. However, we must take care with the cases in which - # dist_shape and param_shape overlap - if size is not None and w.shape[:len(size)] == size: - if w.shape[:len(size + dist_shape)] != (size + dist_shape): - # To allow w to broadcast, we insert new axis in between the - # "size" axis and the "mixture" axis - _w = w[(slice(None),) * len(size) + # Index the size axis - (np.newaxis,) * len(dist_shape) + # Add new axis for the dist_shape - (slice(None),)] # Close with the slice of mixture components - w = np.broadcast_to(_w, size + dist_shape + (param_shape[-1],)) - elif size is not None: - w = np.broadcast_to(w, size + dist_shape + (param_shape[-1],)) - else: - w = np.broadcast_to(w, dist_shape + (param_shape[-1],)) - # Compute the total size of the mixture's random call with size if _size is not None: output_size = int(_size * np.prod(dist_shape) * param_shape[-1]) else: output_size = int(np.prod(dist_shape) * param_shape[-1]) # Get the size we need for the mixture's random call - mixture_size = int(output_size // np.prod(comp_tmp.shape)) + if self.comp_is_distribution: + mixture_size = int(output_size // np.prod(broadcast_shape)) + else: + mixture_size = int(output_size // + (np.prod(broadcast_shape) * param_shape[-1])) if mixture_size == 1 and _size is None: mixture_size = None - # Semiflatten the mixture weights. The last axis is the number of - # mixture mixture components, and the rest is all about size, - # dist_shape and broadcasting - w = np.reshape(w, (-1, w.shape[-1])) - # Normalize mixture weights - w = w / w.sum(axis=-1, keepdims=True) - - w_samples = generate_samples(random_choice, - p=w, - broadcast_shape=w.shape[:-1] or (1,), - dist_shape=w.shape[:-1] or (1,), - size=size) # Sample from the mixture with draw_context: - mixed_samples = self._comp_samples(point=point, - size=mixture_size) - w_samples = w_samples.flatten() + mixed_samples = self._comp_samples( + point=point, + size=mixture_size, + broadcast_shape=broadcast_shape, + comp_dist_shapes=comp_dist_shapes, + ) + # Test that the mixture has the same number of "samples" as w + if w_samples.size != (mixed_samples.size // w.shape[-1]): + raise ValueError('Inconsistent number of samples from the ' + 'mixture and mixture weights. Drew {} mixture ' + 'weights elements, and {} samples from the ' + 'mixture components.'. 
+ format(w_samples.size, + mixed_samples.size // w.shape[-1])) # Semiflatten the mixture to be able to zip it with w_samples - mixed_samples = np.reshape(mixed_samples, (-1, comp_tmp.shape[-1])) + w_samples = w_samples.flatten() + mixed_samples = np.reshape(mixed_samples, (-1, w.shape[-1])) # Select the samples from the mixture samples = np.array([mixed[choice] for choice, mixed in zip(w_samples, mixed_samples)]) diff --git a/pymc3/distributions/multivariate.py b/pymc3/distributions/multivariate.py index 7d6d7d4a84..d773036d77 100755 --- a/pymc3/distributions/multivariate.py +++ b/pymc3/distributions/multivariate.py @@ -267,7 +267,7 @@ def random(self, point=None, size=None): else: std_norm_shape = mu.shape standard_normal = np.random.standard_normal(std_norm_shape) - return mu + np.tensordot(standard_normal, chol, axes=[[-1], [-1]]) + return mu + np.einsum('...ij,...j->...i', chol, standard_normal) else: mu, tau = draw_values([self.mu, self.tau], point=point, size=size) if mu.shape[-1] != tau[0].shape[-1]: diff --git a/pymc3/tests/test_mixture.py b/pymc3/tests/test_mixture.py index de6a95fc44..45692da761 100644 --- a/pymc3/tests/test_mixture.py +++ b/pymc3/tests/test_mixture.py @@ -1,3 +1,4 @@ +import pytest import numpy as np from numpy.testing import assert_allclose @@ -9,18 +10,30 @@ from scipy.special import logsumexp from pymc3.theanof import floatX import theano +from theano import tensor as tt +from pymc3.distributions.distribution import to_tuple # Generate data def generate_normal_mixture_data(w, mu, sd, size=1000): component = np.random.choice(w.size, size=size, p=w) - x = np.random.normal(mu[component], sd[component], size=size) + mu, sd = np.broadcast_arrays(mu, sd) + out_size = to_tuple(size) + mu.shape[:-1] + mu_ = np.array([mu[..., comp] for comp in component.ravel()]) + sd_ = np.array([sd[..., comp] for comp in component.ravel()]) + mu_ = np.reshape(mu_, out_size) + sd_ = np.reshape(sd_, out_size) + x = np.random.normal(mu_, sd_, size=out_size) return x def generate_poisson_mixture_data(w, mu, size=1000): component = np.random.choice(w.size, size=size, p=w) - x = np.random.poisson(mu[component], size=size) + mu = np.atleast_1d(mu) + out_size = to_tuple(size) + mu.shape[:-1] + mu_ = np.array([mu[..., comp] for comp in component.ravel()]) + mu_ = np.reshape(mu_, out_size) + x = np.random.poisson(mu_, size=out_size) return x @@ -75,27 +88,98 @@ def test_normal_mixture(self): np.sort(self.norm_mu), rtol=0.1, atol=0.1) - def test_normal_mixture_nd(self): - nd, ncomp = 3, 5 + @pytest.mark.parametrize('nd,ncomp', + [(tuple(), 5), + (1, 5), + (3, 5), + ((3, 3), 5), + (3, 3), + ((3, 3), 3)], + ids=str) + def test_normal_mixture_nd(self, nd, ncomp): + nd = to_tuple(nd) + ncomp = int(ncomp) + comp_shape = nd + (ncomp,) + test_mus = np.random.randn(*comp_shape) + test_taus = np.random.gamma(1, 1, size=comp_shape) + observed = generate_normal_mixture_data(w=np.ones(ncomp)/ncomp, + mu=test_mus, + sd=1/np.sqrt(test_taus), + size=10) with Model() as model0: - mus = Normal('mus', shape=(nd, ncomp)) - taus = Gamma('taus', alpha=1, beta=1, shape=(nd, ncomp)) + mus = Normal('mus', shape=comp_shape) + taus = Gamma('taus', alpha=1, beta=1, shape=comp_shape) ws = Dirichlet('ws', np.ones(ncomp)) - mixture0 = NormalMixture('m', w=ws, mu=mus, tau=taus, shape=nd) + mixture0 = NormalMixture('m', w=ws, mu=mus, tau=taus, shape=nd, + comp_shape=comp_shape) + obs0 = NormalMixture('obs', w=ws, mu=mus, tau=taus, shape=nd, + comp_shape=comp_shape, + observed=observed) with Model() as model1: - mus = 
Normal('mus', shape=(nd, ncomp)) - taus = Gamma('taus', alpha=1, beta=1, shape=(nd, ncomp)) + mus = Normal('mus', shape=comp_shape) + taus = Gamma('taus', alpha=1, beta=1, shape=comp_shape) ws = Dirichlet('ws', np.ones(ncomp)) - comp_dist = [Normal.dist(mu=mus[:, i], tau=taus[:, i]) + comp_dist = [Normal.dist(mu=mus[..., i], tau=taus[..., i], + shape=nd) for i in range(ncomp)] mixture1 = Mixture('m', w=ws, comp_dists=comp_dist, shape=nd) + obs1 = Mixture('obs', w=ws, comp_dists=comp_dist, shape=nd, + observed=observed) + + with Model() as model2: + # Expected to fail if comp_shape is not provided, + # nd is multidim and it does not broadcast with ncomp. If by chance + # it does broadcast, an error is raised if the mixture is given + # observed data. + # Furthermore, the Mixture will also raise errors when the observed + # data is multidimensional but it does not broadcast well with + # comp_dists. + mus = Normal('mus', shape=comp_shape) + taus = Gamma('taus', alpha=1, beta=1, shape=comp_shape) + ws = Dirichlet('ws', np.ones(ncomp)) + if len(nd) > 1: + if nd[-1] != ncomp: + with pytest.raises(ValueError): + NormalMixture('m', w=ws, mu=mus, tau=taus, + shape=nd) + mixture2 = None + else: + mixture2 = NormalMixture('m', w=ws, mu=mus, tau=taus, + shape=nd) + else: + mixture2 = NormalMixture('m', w=ws, mu=mus, tau=taus, + shape=nd) + observed_fails = False + if len(nd) >= 1 and nd != (1,): + try: + np.broadcast(np.empty(comp_shape), observed) + except Exception: + observed_fails = True + if observed_fails: + with pytest.raises(ValueError): + NormalMixture('obs', w=ws, mu=mus, tau=taus, + shape=nd, + observed=observed) + obs2 = None + else: + obs2 = NormalMixture('obs', w=ws, mu=mus, tau=taus, + shape=nd, + observed=observed) testpoint = model0.test_point - testpoint['mus'] = np.random.randn(nd, ncomp) + testpoint['mus'] = test_mus + testpoint['taus'] = test_taus assert_allclose(model0.logp(testpoint), model1.logp(testpoint)) assert_allclose(mixture0.logp(testpoint), mixture1.logp(testpoint)) + assert_allclose(obs0.logp(testpoint), obs1.logp(testpoint)) + if mixture2 is not None and obs2 is not None: + assert_allclose(model0.logp(testpoint), model2.logp(testpoint)) + if mixture2 is not None: + assert_allclose(mixture0.logp(testpoint), mixture2.logp(testpoint)) + if obs2 is not None: + assert_allclose(obs0.logp(testpoint), obs2.logp(testpoint)) def test_poisson_mixture(self): with Model() as model: @@ -166,6 +250,10 @@ def test_mixture_of_mvn(self): mixlogp_st.sum() + priorlogp) def test_mixture_of_mixture(self): + if theano.config.floatX == 'float32': + rtol = 1e-4 + else: + rtol = 1e-7 nbr = 4 with Model() as model: # mixtures components @@ -192,26 +280,33 @@ def test_mixture_of_mixture(self): test_point = model.test_point def mixmixlogp(value, point): + floatX = theano.config.floatX priorlogp = st.dirichlet.logpdf(x=point['g_w'], alpha=np.ones(nbr)*0.0000001, - ) + \ - st.expon.logpdf(x=point['mu_g']).sum() + \ + ).astype(floatX) + \ + st.expon.logpdf(x=point['mu_g']).sum(dtype=floatX) + \ st.dirichlet.logpdf(x=point['l_w'], alpha=np.ones(nbr)*0.0000001, - ) + \ - st.expon.logpdf(x=point['mu_l']).sum() + \ + ).astype(floatX) + \ + st.expon.logpdf(x=point['mu_l']).sum(dtype=floatX) + \ st.dirichlet.logpdf(x=point['mix_w'], alpha=np.ones(2), - ) + ).astype(floatX) complogp1 = st.norm.logpdf(x=value, - loc=point['mu_g']) - mixlogp1 = logsumexp(np.log(point['g_w']) + complogp1, + loc=point['mu_g']).astype(floatX) + mixlogp1 = logsumexp(np.log(point['g_w']).astype(floatX) + + complogp1, axis=-1, 
keepdims=True) - complogp2 = st.lognorm.logpdf(value, 1., 0., np.exp(point['mu_l'])) - mixlogp2 = logsumexp(np.log(point['l_w']) + complogp2, + complogp2 = st.lognorm.logpdf(value, + 1., + 0., + np.exp(point['mu_l'])).astype(floatX) + mixlogp2 = logsumexp(np.log(point['l_w']).astype(floatX) + + complogp2, axis=-1, keepdims=True) complogp_mix = np.concatenate((mixlogp1, mixlogp2), axis=1) - mixmixlogpg = logsumexp(np.log(point['mix_w']) + complogp_mix, + mixmixlogpg = logsumexp(np.log(point['mix_w']).astype(floatX) + + complogp_mix, axis=-1, keepdims=True) return priorlogp, mixmixlogpg @@ -219,19 +314,23 @@ def mixmixlogp(value, point): priorlogp, mixmixlogpg = mixmixlogp(value, test_point) # check logp of mixture - assert_allclose(mixmixlogpg, mix.logp_elemwise(test_point)) + assert_allclose(mixmixlogpg, mix.logp_elemwise(test_point), + rtol=rtol) # check model logp assert_allclose(priorlogp + mixmixlogpg.sum(), - model.logp(test_point)) + model.logp(test_point), + rtol=rtol) # check input and check logp again test_point['g_w'] = np.asarray([.1, .1, .2, .6]) test_point['mu_g'] = np.exp(np.random.randn(nbr)) priorlogp, mixmixlogpg = mixmixlogp(value, test_point) - assert_allclose(mixmixlogpg, mix.logp_elemwise(test_point)) + assert_allclose(mixmixlogpg, mix.logp_elemwise(test_point), + rtol=rtol) assert_allclose(priorlogp + mixmixlogpg.sum(), - model.logp(test_point)) + model.logp(test_point), + rtol=rtol) def test_sample_prior_and_posterior(self): def build_toy_dataset(N, K): @@ -284,3 +383,114 @@ def build_toy_dataset(N, K): assert prior['x_obs'].shape == (n_samples,) + X.shape assert prior['mu0'].shape == (n_samples, D) assert prior['chol_cov_0'].shape == (n_samples, D * (D + 1) // 2) + + +class TestMixtureVsLatent(SeededTest): + def setup_method(self, *args, **kwargs): + super().setup_method(*args, **kwargs) + self.nd = 3 + self.npop = 3 + self.mus = tt.as_tensor_variable( + np.tile( + np.reshape( + np.arange(self.npop), + (1, -1,) + ), + (self.nd, 1,) + ) + ) + + def test_1d_w(self): + nd = self.nd + npop = self.npop + mus = self.mus + size = 100 + with pm.Model() as model: + m = pm.NormalMixture('m', + w=np.ones(npop) / npop, + mu=mus, + sigma=1e-5, + comp_shape=(nd, npop), + shape=nd) + z = pm.Categorical('z', p=np.ones(npop) / npop) + latent_m = pm.Normal('latent_m', + mu=mus[..., z], + sigma=1e-5, + shape=nd) + + m_val = m.random(size=size) + latent_m_val = latent_m.random(size=size) + assert m_val.shape == latent_m_val.shape + # Test that each element in axis = -1 comes from the same mixture + # component + assert all(np.all(np.diff(m_val) < 1e-3, axis=-1)) + assert all(np.all(np.diff(latent_m_val) < 1e-3, axis=-1)) + + self.samples_from_same_distribution(m_val, latent_m_val) + self.logp_matches(m, latent_m, z, npop, model=model) + + def test_2d_w(self): + nd = self.nd + npop = self.npop + mus = self.mus + size = 100 + with pm.Model() as model: + m = pm.NormalMixture('m', + w=np.ones((nd, npop)) / npop, + mu=mus, + sigma=1e-5, + comp_shape=(nd, npop), + shape=nd) + z = pm.Categorical('z', p=np.ones(npop) / npop, shape=nd) + mu = tt.as_tensor_variable([mus[i, z[i]] for i in range(nd)]) + latent_m = pm.Normal('latent_m', + mu=mu, + sigma=1e-5, + shape=nd) + + m_val = m.random(size=size) + latent_m_val = latent_m.random(size=size) + assert m_val.shape == latent_m_val.shape + # Test that each element in axis = -1 can come from independent + # components + assert not all(np.all(np.diff(m_val) < 1e-3, axis=-1)) + assert not all(np.all(np.diff(latent_m_val) < 1e-3, axis=-1)) + + 
self.samples_from_same_distribution(m_val, latent_m_val) + self.logp_matches(m, latent_m, z, npop, model=model) + + def samples_from_same_distribution(self, *args): + # Test if flattened samples distributions match (marginals match) + _, p_marginal = st.ks_2samp(*[s.flatten() for s in args]) + # Test if correlations within non independent draws match + _, p_correlation = st.ks_2samp( + *[np.array([np.corrcoef(ss) for ss in s]).flatten() + for s in args] + ) + assert p_marginal >= 0.05 and p_correlation >= 0.05 + + def logp_matches(self, mixture, latent_mix, z, npop, model): + if theano.config.floatX == 'float32': + rtol = 1e-4 + else: + rtol = 1e-7 + test_point = model.test_point + test_point['latent_m'] = test_point['m'] + mix_logp = mixture.logp(test_point) + logps = [] + for component in range(npop): + test_point['z'] = component * np.ones(z.distribution.shape) + # Count the number of axes that should be broadcasted from z to + # modify the logp + sh1 = test_point['z'].shape + sh2 = test_point['latent_m'].shape + if len(sh1) > len(sh2): + sh2 = (1,) * (len(sh1) - len(sh2)) + sh2 + elif len(sh2) > len(sh1): + sh1 = (1,) * (len(sh2) - len(sh1)) + sh1 + reps = np.prod([s2 if s1 != s2 else 1 for s1, s2 in + zip(sh1, sh2)]) + z_logp = z.logp(test_point) * reps + logps.append(z_logp + latent_mix.logp(test_point)) + latent_mix_logp = logsumexp(np.array(logps), axis=0) + assert_allclose(mix_logp, latent_mix_logp, rtol=rtol) diff --git a/pymc3/tests/test_sampling.py b/pymc3/tests/test_sampling.py index 1d73e55edf..af1081fba5 100644 --- a/pymc3/tests/test_sampling.py +++ b/pymc3/tests/test_sampling.py @@ -223,8 +223,9 @@ def test_normal_scalar(self): ppc = pm.sample_posterior_predictive(trace, samples=1000, vars=[a]) assert 'a' in ppc assert ppc['a'].shape == (1000,) - _, pval = stats.kstest(ppc['a'], - stats.norm(loc=0, scale=np.sqrt(2)).cdf) + # mu's standard deviation may have changed thanks to a's observed + _, pval = stats.kstest(ppc['a'] - trace['mu'], + stats.norm(loc=0, scale=1).cdf) assert pval > 0.001 with model: