Skip to content

Allow specification of dims instead of shape #3551

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 29 commits into from
Jun 10, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
e8199eb
Allow specification of dims instead of shape
aseyboldt Jul 18, 2019
169feef
Add pm.TidyData
aseyboldt Jul 19, 2019
57909b0
Create coords for pm.Data(ndarray)
aseyboldt Jul 20, 2019
6cbcdb7
Merge branch 'master' into dims
michaelosthege Jun 5, 2020
e603855
empty commit to trigger CI
michaelosthege Jun 7, 2020
a0753ac
Apply suggestions from code review
michaelosthege Jun 8, 2020
e7cb979
apply black formatting
michaelosthege Jun 8, 2020
b9acb96
address review comments & formatting
michaelosthege Jun 8, 2020
fcb709c
Add demonstration of named coordinates/dims
michaelosthege Jun 8, 2020
26c3fc8
don't require dim names to be identifiers
michaelosthege Jun 9, 2020
37da459
sort imports
michaelosthege Jun 9, 2020
8864bbe
raise ShapeError instead of ValueError
michaelosthege Jun 9, 2020
1bdcd38
formatting
michaelosthege Jun 9, 2020
b178207
robustify Dtype and ShapeError
michaelosthege Jun 9, 2020
9a943ca
Removed TidyData and refined dims and coords implementation
AlexAndorra Jun 9, 2020
10c617b
Changed name of kwarg export_dims and improved docstrings
AlexAndorra Jun 9, 2020
39b6d92
Add link to ArviZ in docstrings
AlexAndorra Jun 9, 2020
903ee61
Removed TidyData from __all__
AlexAndorra Jun 9, 2020
68a863f
Polished Data container NB
AlexAndorra Jun 10, 2020
2e73535
Fixed line break in data.py
AlexAndorra Jun 10, 2020
0236ccd
Fix inference of coords for dataframes
AlexAndorra Jun 10, 2020
a4c832b
Refined Data container NB
AlexAndorra Jun 10, 2020
80aaa35
Updated getting started NB with new dims and coords features
AlexAndorra Jun 10, 2020
aec6d9c
Reran getting started NB
AlexAndorra Jun 10, 2020
add54c4
Blackified NBs
AlexAndorra Jun 10, 2020
072c6a4
rerun with ArviZ branch
michaelosthege Jun 10, 2020
abfeba9
use np.shape to be compatible with tuples/lists
michaelosthege Jun 10, 2020
7a5c327
add tests for named coordinate handling
michaelosthege Jun 10, 2020
cc25d47
Extended tests for data container
AlexAndorra Jun 10, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions RELEASE-NOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

### New features

- Allow users to specify coordinates and dimension names instead of numerical shapes. This makes interoperability with ArviZ easier.
- Distinguish between `Data` and `Deterministic` variables when graphing models with graphviz. PR [#3491](https://github.com/pymc-devs/pymc3/pull/3491).
- Sequential Monte Carlo - Approximate Bayesian Computation step method is now available. The implementation is in an experimental stage and will be further improved.
- Added `Matern12` covariance function for Gaussian processes. This is the Matern kernel with nu=1/2.
Expand Down
57 changes: 47 additions & 10 deletions pymc3/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import pymc3 as pm
import theano.tensor as tt
import theano
import pandas as pd

__all__ = [
'get_data',
Expand Down Expand Up @@ -431,22 +432,58 @@ class Data:
For more information, take a look at this example notebook
https://docs.pymc.io/notebooks/data_container.html
"""
def __new__(self, name, value):
def __new__(self, name, value, dims=None):
try:
model = pm.Model.get_context()
except TypeError:
raise TypeError("No model on context stack, which is needed to "
"instantiate a data container. Add variable "
"inside a 'with model:' block.")

# `pm.model.pandas_to_array` takes care of parameter `value` and
# transforms it to something digestible for pymc3
shared_object = theano.shared(pm.model.pandas_to_array(value), name)

if isinstance(dims, str):
dims = (dims,)
if dims is not None and len(dims) != shared_object.ndim:
raise ValueError('Length of `dims` must match the dimensionality '
'of the dataset.')

coords = {}
if isinstance(value, (pd.Series, pd.DataFrame)):
name = None
if dims is not None:
name = dims[0]
if (name is None
and value.index.name is not None
and value.index.name.isidentifier()):
name = value.index.name
if name is not None:
coords[name] = value.index
if isinstance(value, pd.DataFrame):
name = None
if dims is not None:
name = dims[1]
if (name is None
and value.columns.name is not None
and value.columns.name.isidentifier()):
name = value.columns.name
if name is not None:
coords[name] = value.columns

model.add_coords(coords)

# To draw the node for this variable in the graphviz Digraph we need
# its shape.
shared_object.dshape = tuple(shared_object.shape.eval())

# Add data container to the named variables of the model.
try:
model = pm.Model.get_context()
except TypeError:
raise TypeError("No model on context stack, which is needed to "
"instantiate a data container. Add variable "
"inside a 'with model:' block.")
model.add_random_variable(shared_object)
if dims is not None:
shape_dims = model.shape_from_dims(dims)
if shared_object.dshape != shape_dims:
raise ValueError('Invalid shape. It is %s but the dimensions '
'suggest %s.'
% (shared_object.dshape, shape_dims))

model.add_random_variable(shared_object, dims=dims)

return shared_object
35 changes: 25 additions & 10 deletions pymc3/distributions/distribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,17 +37,32 @@ def __new__(cls, name, *args, **kwargs):
"a 'with model:' block, or use the '.dist' syntax "
"for a standalone distribution.")

if isinstance(name, string_types):
data = kwargs.pop('observed', None)
cls.data = data
if isinstance(data, ObservedRV) or isinstance(data, FreeRV):
raise TypeError("observed needs to be data but got: {}".format(type(data)))
total_size = kwargs.pop('total_size', None)
dist = cls.dist(*args, **kwargs)
return model.Var(name, dist, data, total_size)
else:
if not isinstance(name, string_types):
raise TypeError("Name needs to be a string but got: {}".format(name))

data = kwargs.pop('observed', None)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible to add some comments explaining what the new code below does? I think it would be helpful for future us and for newcomers.

cls.data = data
if isinstance(data, ObservedRV) or isinstance(data, FreeRV):
raise TypeError("observed needs to be data but got: {}".format(type(data)))
total_size = kwargs.pop('total_size', None)

dims = kwargs.pop('dims', None)
has_shape = 'shape' in kwargs
shape = kwargs.pop('shape', None)
if dims is not None:
if shape is not None:
raise ValueError("Specify only one of 'dims' and 'shape'")
if isinstance(dims, string_types):
dims = (dims,)
shape = model.shape_from_dims(dims)

# Some distributions do not accept shape=None
if has_shape or shape is not None:
dist = cls.dist(*args, **kwargs, shape=shape)
else:
dist = cls.dist(*args, **kwargs)
return model.Var(name, dist, data, total_size, dims=dims)

def __getnewargs__(self):
return _Unpickling,

Expand All @@ -58,7 +73,7 @@ def dist(cls, *args, **kwargs):
return dist

def __init__(self, shape, dtype, testval=None, defaults=(),
transform=None, broadcastable=None):
transform=None, broadcastable=None, dims=None):
self.shape = np.atleast_1d(shape)
if False in (np.floor(self.shape) == self.shape):
raise TypeError("Expected int elements in shape")
Expand Down
57 changes: 50 additions & 7 deletions pymc3/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -647,8 +647,12 @@ def __new__(cls, *args, **kwargs):
instance._theano_config = theano_config
return instance

def __init__(self, name='', model=None, theano_config=None):
def __init__(self, name='', model=None, theano_config=None, coords=None):
self.name = name
self.coords = {}
self.RV_dims = {}
self.add_coords(coords)

if self.parent is not None:
self.named_vars = treedict(parent=self.parent.named_vars)
self.free_RVs = treelist(parent=self.parent.free_RVs)
Expand Down Expand Up @@ -800,7 +804,36 @@ def cont_vars(self):
"""All the continuous variables in the model"""
return list(typefilter(self.vars, continuous_types))

def Var(self, name, dist, data=None, total_size=None):
def shape_from_dims(self, dims):
    """Translate dimension names into the shape implied by the model coords.

    Parameters
    ----------
    dims : str or sequence of str
        Names of the dimensions, in order. A single string is treated as
        one dimension name (not as a sequence of characters), matching how
        ``add_random_variable`` handles ``dims``.

    Returns
    -------
    tuple
        Concatenation of ``self.coords[dim].shape`` for every ``dim``.

    Raises
    ------
    ValueError
        If a dimension name is repeated or is not present in
        ``self.coords``.
    """
    # Without this, a name such as 'city' would be iterated letter by
    # letter and rejected as several unknown one-character dimensions.
    if isinstance(dims, str):
        dims = (dims,)

    if len(set(dims)) != len(dims):
        raise ValueError('Can not contain the same dimension name twice.')

    shape = []
    for dim in dims:
        if dim not in self.coords:
            # `(dim,)` keeps the formatting safe even if `dim` is a tuple.
            raise ValueError('Unknown dimension name %s. All dimension '
                             'names must be specified in the `coords` '
                             'argument of the model or through a pm.Data '
                             'variable.' % (dim,))
        # NOTE(review): relies on coordinate values exposing `.shape`
        # (e.g. pandas Index or ndarray) -- plain lists would fail here.
        shape.extend(self.coords[dim].shape)
    return tuple(shape)

def add_coords(self, coords):
    """Register named coordinates on the model.

    Parameters
    ----------
    coords : mapping or None
        Maps dimension names to their coordinate values. ``None`` is a
        no-op.

    Raises
    ------
    ValueError
        If a name is not a string that is a valid identifier, if a name is
        ``draw`` or ``chain``, or if a name is already registered with
        coordinate values that are not equal to the new ones.

    Notes
    -----
    All entries are validated before any of them is stored, so a failing
    call leaves ``self.coords`` unchanged.
    """
    if coords is None:
        return

    accepted = {}
    for name in coords:
        # Non-string names get the same ValueError as other invalid names
        # instead of an AttributeError from `.isidentifier()`.
        if not isinstance(name, str) or not name.isidentifier():
            raise ValueError('Invalid dimension name %s' % (name,))
        if name in ('draw', 'chain'):
            raise ValueError('Dimensions can not be named `draw` or `chain`.')

        values = coords[name]
        if name in self.coords:
            # NOTE(review): assumes coordinate values provide `.equals`
            # (as pandas Index objects do) -- confirm for other inputs.
            if not values.equals(self.coords[name]):
                raise ValueError(
                    'Duplicate and incompatible coordinate: %s.' % name)
        else:
            accepted[name] = values

    # Commit only after every entry passed validation.
    self.coords.update(accepted)

def Var(self, name, dist, data=None, total_size=None, dims=None):
"""Create and add (un)observed random variable to the model with an
appropriate prior distribution.

Expand All @@ -813,12 +846,15 @@ def Var(self, name, dist, data=None, total_size=None):
the variable is unobserved.
total_size : scalar
upscales logp of variable with ``coef = total_size/var.shape[0]``
dims : tuple
Dimension names for the variable.

Returns
-------
FreeRV or ObservedRV
"""
name = self.name_for(name)

if data is None:
if getattr(dist, "transform", None) is None:
with self:
Expand All @@ -837,7 +873,7 @@ def Var(self, name, dist, data=None, total_size=None):
name=name,
orig_name=get_transformed_name(name, dist.transform)))
self.deterministics.append(var)
self.add_random_variable(var)
self.add_random_variable(var, dims)
return var
elif isinstance(data, dict):
with self:
Expand All @@ -860,14 +896,21 @@ def Var(self, name, dist, data=None, total_size=None):
self.missing_values.append(var.missing_values)
self.named_vars[var.missing_values.name] = var.missing_values

self.add_random_variable(var)
self.add_random_variable(var, dims)
return var

def add_random_variable(self, var):
def add_random_variable(self, var, dims=None):
"""Add a random variable to the named variables of the model."""
if self.named_vars.tree_contains(var.name):
raise ValueError(
"Variable name {} already exists.".format(var.name))

if dims is not None:
if isinstance(dims, str):
dims = (dims,)
assert all(dim in self.coords for dim in dims)
self.RV_dims[var.name] = dims

self.named_vars[var.name] = var
if not hasattr(self, self.name_of(var.name)):
setattr(self, self.name_of(var.name), var)
Expand Down Expand Up @@ -1482,7 +1525,7 @@ def _latex_repr_rv(rv):
return r'$\text{%s} \sim \text{Deterministic}(%s)$' % (rv.name, r',~'.join(_walk_up_rv(rv)))


def Deterministic(name, var, model=None):
def Deterministic(name, var, model=None, dims=None):
"""Create a named deterministic variable

Parameters
Expand All @@ -1497,7 +1540,7 @@ def Deterministic(name, var, model=None):
model = modelcontext(model)
var = var.copy(model.name_for(name))
model.deterministics.append(var)
model.add_random_variable(var)
model.add_random_variable(var, dims)
var._repr_latex_ = functools.partial(_latex_repr_rv, var)
var.__latex__ = var._repr_latex_
return var
Expand Down