Skip to content

Commit 600fe90

Browse files
michaelosthege authored and twiecki committed
Emit FutureWarning when pm.Data(mutable=...) is not specified
1 parent 696d237 commit 600fe90

11 files changed

+91
-54
lines changed

Diff for: pymc/backends/arviz.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -454,7 +454,7 @@ def constant_data_to_xarray(self):
454454
"""Convert constant data to xarray."""
455455
# For constant data, we are concerned only with deterministics and
456456
# data. The constant data vars must be either pm.Data
457-
# (TensorSharedVariable) or pm.Deterministic
457+
# (TensorConstant/SharedVariable) or pm.Deterministic
458458
constant_data_vars = {} # type: Dict[str, Var]
459459

460460
def is_data(name, var) -> bool:

Diff for: pymc/data.py

+21-2
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import os
1818
import pkgutil
1919
import urllib.request
20+
import warnings
2021

2122
from copy import copy
2223
from typing import Any, Dict, List, Optional, Sequence, Union
@@ -30,6 +31,7 @@
3031
from aesara.graph.basic import Apply
3132
from aesara.tensor.type import TensorType
3233
from aesara.tensor.var import TensorConstant, TensorVariable
34+
from packaging import version
3335

3436
import pymc as pm
3537

@@ -555,7 +557,7 @@ def Data(
555557
*,
556558
dims: Optional[Sequence[str]] = None,
557559
export_index_as_coords=False,
558-
mutable: bool = True,
560+
mutable: Optional[bool] = None,
559561
**kwargs,
560562
) -> Union[SharedVariable, TensorConstant]:
561563
"""Data container that registers a data variable with the model.
@@ -570,6 +572,11 @@ def Data(
570572
The name for this variable
571573
value: {List, np.ndarray, pd.Series, pd.Dataframe}
572574
A value to associate with this variable
575+
mutable : bool, optional
576+
Switches between creating a ``SharedVariable`` (``mutable=True``, the default until v4.1.0)
577+
vs. creating a ``TensorConstant`` (``mutable=False``).
578+
Consider using ``pm.ConstantData`` or ``pm.MutableData`` as less verbose
579+
alternatives to ``pm.Data(..., mutable=...)``.
573580
dims: {str, tuple of str}, optional, default=None
574581
Dimension names of the random variables (as opposed to the shapes of these
575582
random variables). Use this when `value` is a pandas Series or DataFrame. The
@@ -592,7 +599,7 @@ def Data(
592599
>>> observed_data = [mu + np.random.randn(20) for mu in true_mu]
593600
594601
>>> with pm.Model() as model:
595-
... data = pm.Data('data', observed_data[0])
602+
... data = pm.MutableData('data', observed_data[0])
596603
... mu = pm.Normal('mu', 0, 10)
597604
... pm.Normal('y', mu=mu, sigma=1, observed=data)
598605
@@ -626,6 +633,18 @@ def Data(
626633
# `pandas_to_array` takes care of parameter `value` and
627634
# transforms it to something digestible for Aesara.
628635
arr = pandas_to_array(value)
636+
637+
if mutable is None:
638+
current = version.Version(pm.__version__)
639+
mutable = current.major == 4 and current.minor < 1
640+
if mutable:
641+
warnings.warn(
642+
"The `mutable` kwarg was not specified. Currently it defaults to `pm.Data(mutable=True)`,"
643+
" which is equivalent to using `pm.MutableData()`."
644+
" In v4.1.0 the default will change to `pm.Data(mutable=False)`, equivalent to `pm.ConstantData`."
645+
" Set `pm.Data(..., mutable=False/True)`, or use `pm.ConstantData`/`pm.MutableData`.",
646+
FutureWarning,
647+
)
629648
if mutable:
630649
x = aesara.shared(arr, name, **kwargs)
631650
else:

Diff for: pymc/model.py

+6-5
Original file line numberDiff line numberDiff line change
@@ -1114,7 +1114,7 @@ def set_data(
11141114
):
11151115
"""Changes the values of a data variable in the model.
11161116
1117-
In contrast to pm.Data().set_value, this method can also
1117+
In contrast to pm.MutableData().set_value, this method can also
11181118
update the corresponding coordinates.
11191119
11201120
Parameters
@@ -1131,7 +1131,8 @@ def set_data(
11311131
shared_object = self[name]
11321132
if not isinstance(shared_object, SharedVariable):
11331133
raise TypeError(
1134-
f"The variable `{name}` must be a `SharedVariable` (e.g. `pymc.Data`) to allow updating. "
1134+
f"The variable `{name}` must be a `SharedVariable`"
1135+
" (created through `pm.MutableData()` or `pm.Data(mutable=True)`) to allow updating. "
11351136
f"The current type is: {type(shared_object)}"
11361137
)
11371138

@@ -1156,7 +1157,7 @@ def set_data(
11561157
length_changed = new_length != old_length
11571158

11581159
# Reject resizing if we already know that it would create shape problems.
1159-
# NOTE: If there are multiple pm.Data containers sharing this dim, but the user only
1160+
# NOTE: If there are multiple pm.MutableData containers sharing this dim, but the user only
11601161
# changes the values for one of them, they will run into shape problems nonetheless.
11611162
length_belongs_to = length_tensor.owner.inputs[0].owner.inputs[0]
11621163
if not isinstance(length_belongs_to, SharedVariable) and length_changed:
@@ -1735,8 +1736,8 @@ def set_data(new_data, model=None):
17351736
17361737
>>> import pymc as pm
17371738
>>> with pm.Model() as model:
1738-
... x = pm.Data('x', [1., 2., 3.])
1739-
... y = pm.Data('y', [1., 2., 3.])
1739+
... x = pm.MutableData('x', [1., 2., 3.])
1740+
... y = pm.MutableData('y', [1., 2., 3.])
17401741
... beta = pm.Normal('beta', 0, 1)
17411742
... obs = pm.Normal('obs', x * beta, 1, observed=y)
17421743
... idata = pm.sample(1000, tune=1000)

Diff for: pymc/model_graph.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -133,10 +133,14 @@ def _make_node(self, var_name, graph, *, formatting: str = "plain"):
133133
shape = "octagon"
134134
style = "filled"
135135
label = f"{var_name}\n~\nPotential"
136-
elif isinstance(v, (SharedVariable, TensorConstant)):
136+
elif isinstance(v, TensorConstant):
137137
shape = "box"
138138
style = "rounded, filled"
139-
label = f"{var_name}\n~\nData"
139+
label = f"{var_name}\n~\nConstantData"
140+
elif isinstance(v, SharedVariable):
141+
shape = "box"
142+
style = "rounded, filled"
143+
label = f"{var_name}\n~\nMutableData"
140144
elif v.owner and isinstance(v.owner.op, RandomVariable):
141145
shape = "ellipse"
142146
if hasattr(v.tag, "observations"):

Diff for: pymc/sampling.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
import xarray
4343

4444
from aesara.compile.mode import Mode
45+
from aesara.graph.basic import Constant
4546
from aesara.tensor.sharedvar import SharedVariable
4647
from arviz import InferenceData
4748
from fastprogress.fastprogress import progress_bar
@@ -1728,7 +1729,7 @@ def sample_posterior_predictive(
17281729
for rv in walk_model(vars_to_sample, walk_past_rvs=True)
17291730
if rv not in vars_to_sample
17301731
and rv in model.named_vars.values()
1731-
and not isinstance(rv, SharedVariable)
1732+
and not isinstance(rv, (Constant, SharedVariable))
17321733
]
17331734
if inputs_and_names:
17341735
inputs, input_names = zip(*inputs_and_names)
@@ -1739,7 +1740,7 @@ def sample_posterior_predictive(
17391740
input_names = [
17401741
n
17411742
for n in _trace.varnames
1742-
if n not in output_names and not isinstance(model[n], SharedVariable)
1743+
if n not in output_names and not isinstance(model[n], (Constant, SharedVariable))
17431744
]
17441745
inputs = [model[n] for n in input_names]
17451746

@@ -2067,7 +2068,7 @@ def sample_prior_predictive(
20672068
names.append(rv_var.name)
20682069
vars_to_sample.append(rv_var)
20692070

2070-
inputs = [i for i in inputvars(vars_to_sample) if not isinstance(i, SharedVariable)]
2071+
inputs = [i for i in inputvars(vars_to_sample) if not isinstance(i, (Constant, SharedVariable))]
20712072

20722073
sampler_fn = compile_pymc(
20732074
inputs, vars_to_sample, allow_input_downcast=True, accept_inplace=True, mode=mode

Diff for: pymc/tests/test_data_container.py

+32-20
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import pytest
1818

1919
from aesara import shared
20+
from aesara.compile.sharedvalue import SharedVariable
2021
from aesara.tensor.sharedvar import ScalarSharedVariable
2122
from aesara.tensor.var import TensorVariable
2223

@@ -32,7 +33,7 @@ class TestData(SeededTest):
3233
def test_deterministic(self):
3334
data_values = np.array([0.5, 0.4, 5, 2])
3435
with pm.Model() as model:
35-
X = pm.Data("X", data_values)
36+
X = pm.MutableData("X", data_values)
3637
pm.Normal("y", 0, 1, observed=X)
3738
model.logp(model.recompute_initial_point())
3839

@@ -43,7 +44,7 @@ def test_sample(self):
4344
x_pred = np.linspace(-3, 3, 200, dtype="float32")
4445

4546
with pm.Model():
46-
x_shared = pm.Data("x_shared", x)
47+
x_shared = pm.MutableData("x_shared", x)
4748
b = pm.Normal("b", 0.0, 10.0)
4849
pm.Normal("obs", b * x_shared, np.sqrt(1e-2), observed=y)
4950

@@ -95,8 +96,8 @@ def test_sample_posterior_predictive_after_set_data(self):
9596

9697
def test_sample_after_set_data(self):
9798
with pm.Model() as model:
98-
x = pm.Data("x", [1.0, 2.0, 3.0])
99-
y = pm.Data("y", [1.0, 2.0, 3.0])
99+
x = pm.MutableData("x", [1.0, 2.0, 3.0])
100+
y = pm.MutableData("y", [1.0, 2.0, 3.0])
100101
beta = pm.Normal("beta", 0, 10.0)
101102
pm.Normal("obs", beta * x, np.sqrt(1e-2), observed=y)
102103
pm.sample(
@@ -131,8 +132,8 @@ def test_shared_data_as_index(self):
131132
See https://github.com/pymc-devs/pymc/issues/3813
132133
"""
133134
with pm.Model() as model:
134-
index = pm.Data("index", [2, 0, 1, 0, 2])
135-
y = pm.Data("y", [1.0, 2.0, 3.0, 2.0, 1.0])
135+
index = pm.MutableData("index", [2, 0, 1, 0, 2])
136+
y = pm.MutableData("y", [1.0, 2.0, 3.0, 2.0, 1.0])
136137
alpha = pm.Normal("alpha", 0, 1.5, size=3)
137138
pm.Normal("obs", alpha[index], np.sqrt(1e-2), observed=y)
138139

@@ -163,7 +164,7 @@ def test_shared_data_as_rv_input(self):
163164
See https://github.com/pymc-devs/pymc/issues/3842
164165
"""
165166
with pm.Model() as m:
166-
x = pm.Data("x", [1.0, 2.0, 3.0])
167+
x = pm.MutableData("x", [1.0, 2.0, 3.0])
167168
y = pm.Normal("y", mu=x, size=(2, 3))
168169
assert y.eval().shape == (2, 3)
169170
idata = pm.sample(
@@ -221,7 +222,7 @@ def test_shared_scalar_as_rv_input(self):
221222

222223
def test_creation_of_data_outside_model_context(self):
223224
with pytest.raises((IndexError, TypeError)) as error:
224-
pm.Data("data", [1.1, 2.2, 3.3])
225+
pm.ConstantData("data", [1.1, 2.2, 3.3])
225226
error.match("No model on context stack")
226227

227228
def test_set_data_to_non_data_container_variables(self):
@@ -244,8 +245,8 @@ def test_set_data_to_non_data_container_variables(self):
244245
@pytest.mark.xfail(reason="Depends on ModelGraph")
245246
def test_model_to_graphviz_for_model_with_data_container(self):
246247
with pm.Model() as model:
247-
x = pm.Data("x", [1.0, 2.0, 3.0])
248-
y = pm.Data("y", [1.0, 2.0, 3.0])
248+
x = pm.ConstantData("x", [1.0, 2.0, 3.0])
249+
y = pm.MutableData("y", [1.0, 2.0, 3.0])
249250
beta = pm.Normal("beta", 0, 10.0)
250251
obs_sigma = floatX(np.sqrt(1e-2))
251252
pm.Normal("obs", beta * x, obs_sigma, observed=y)
@@ -262,12 +263,14 @@ def test_model_to_graphviz_for_model_with_data_container(self):
262263
pm.model_to_graphviz(model, formatting=formatting)
263264

264265
exp_without = [
265-
'x [label="x\n~\nData" shape=box style="rounded, filled"]',
266+
'x [label="x\n~\nConstantData" shape=box style="rounded, filled"]',
267+
'y [label="y\n~\nMutableData" shape=box style="rounded, filled"]',
266268
'beta [label="beta\n~\nNormal"]',
267269
'obs [label="obs\n~\nNormal" style=filled]',
268270
]
269271
exp_with = [
270-
'x [label="x\n~\nData" shape=box style="rounded, filled"]',
272+
'x [label="x\n~\nConstantData" shape=box style="rounded, filled"]',
273+
'y [label="y\n~\nMutableData" shape=box style="rounded, filled"]',
271274
'beta [label="beta\n~\nNormal(mu=0.0, sigma=10.0)"]',
272275
f'obs [label="obs\n~\nNormal(mu=f(f(beta), x), sigma={obs_sigma})" style=filled]',
273276
]
@@ -290,7 +293,7 @@ def test_explicit_coords(self):
290293
}
291294
# pass coordinates explicitly, use numpy array in Data container
292295
with pm.Model(coords=coords) as pmodel:
293-
pm.Data("observations", data, dims=("rows", "columns"))
296+
pm.MutableData("observations", data, dims=("rows", "columns"))
294297

295298
assert "rows" in pmodel.coords
296299
assert pmodel.coords["rows"] == ("R1", "R2", "R3", "R4", "R5")
@@ -310,7 +313,7 @@ def test_symbolic_coords(self):
310313
Their lengths are then automatically linked to the corresponding Tensor dimension.
311314
"""
312315
with pm.Model() as pmodel:
313-
intensity = pm.Data("intensity", np.ones((2, 3)), dims=("row", "column"))
316+
intensity = pm.MutableData("intensity", np.ones((2, 3)), dims=("row", "column"))
314317
assert "row" in pmodel.dim_lengths
315318
assert "column" in pmodel.dim_lengths
316319
assert isinstance(pmodel.dim_lengths["row"], TensorVariable)
@@ -327,7 +330,7 @@ def test_no_resize_of_implied_dimensions(self):
327330
# Imply a dimension through RV params
328331
pm.Normal("n", mu=[1, 2, 3], dims="city")
329332
# _Use_ the dimension for a data variable
330-
inhabitants = pm.Data("inhabitants", [100, 200, 300], dims="city")
333+
inhabitants = pm.MutableData("inhabitants", [100, 200, 300], dims="city")
331334

332335
# Attempting to re-size the dimension through the data variable would
333336
# cause shape problems in InferenceData conversion, because the RV remains (3,).
@@ -343,7 +346,7 @@ def test_implicit_coords_series(self):
343346
name="sales",
344347
)
345348
with pm.Model() as pmodel:
346-
pm.Data("sales", ser_sales, dims="date", export_index_as_coords=True)
349+
pm.ConstantData("sales", ser_sales, dims="date", export_index_as_coords=True)
347350

348351
assert "date" in pmodel.coords
349352
assert len(pmodel.coords["date"]) == 22
@@ -360,7 +363,9 @@ def test_implicit_coords_dataframe(self):
360363

361364
# infer coordinates from index and columns of the DataFrame
362365
with pm.Model() as pmodel:
363-
pm.Data("observations", df_data, dims=("rows", "columns"), export_index_as_coords=True)
366+
pm.ConstantData(
367+
"observations", df_data, dims=("rows", "columns"), export_index_as_coords=True
368+
)
364369

365370
assert "rows" in pmodel.coords
366371
assert "columns" in pmodel.coords
@@ -370,23 +375,30 @@ def test_data_kwargs(self):
370375
strict_value = True
371376
allow_downcast_value = False
372377
with pm.Model():
373-
data = pm.Data(
374-
"data",
378+
data = pm.MutableData(
379+
"mdata",
375380
value=[[1.0], [2.0], [3.0]],
376381
strict=strict_value,
377382
allow_downcast=allow_downcast_value,
378383
)
379384
assert data.container.strict is strict_value
380385
assert data.container.allow_downcast is allow_downcast_value
381386

387+
def test_data_mutable_default_warning(self):
388+
with pm.Model():
389+
with pytest.warns(FutureWarning, match="`mutable` kwarg was not specified"):
390+
data = pm.Data("x", [1, 2, 3])
391+
assert isinstance(data, SharedVariable)
392+
pass
393+
382394

383395
def test_data_naming():
384396
"""
385397
This is a test for issue #3793 -- `Data` objects in named models are
386398
not given model-relative names.
387399
"""
388400
with pm.Model("named_model") as model:
389-
x = pm.Data("x", [1.0, 2.0, 3.0])
401+
x = pm.ConstantData("x", [1.0, 2.0, 3.0])
390402
y = pm.Normal("y")
391403
assert y.name == "named_model_y"
392404
assert x.name == "named_model_x"

0 commit comments

Comments
 (0)