
Commit 1ed0478: Expand pm.Data capacities (#3925)

Authored by AlexAndorra and hottwaj
Commit message:

* Initial changes to allow pymc3.Data() to support both int and float input data (previously all input data was coerced to float); WIP for #3813
* Added exception for invalid dtype input to pandas_to_array
* Refined implementation
* Finished dtype conversion handling
* Added SharedVariable option to getattr_value
* Added dtype handling to set_data function
* Added tests for pm.Data used for index variables
* Added tests for using pm.Data as RV input
* Ran Black on data tests files
* Added release note
* Updated release notes
* Updated code in light of Luciano's comments
* Fixed implementation of integer checking
* Simplified implementation of type checking
* Corrected implementation for other uses of pandas_to_array

Co-authored-by: hottwaj <[email protected]>
Parent: 7f307b9

5 files changed: +142 additions, -72 deletions


RELEASE-NOTES.md

Lines changed: 3 additions & 1 deletion

@@ -3,7 +3,7 @@
 ## PyMC3 3.9 (On deck)

 ### New features
-- use [fastprogress](https://github.com/fastai/fastprogress) instead of tqdm [#3693](https://github.com/pymc-devs/pymc3/pull/3693)
+- Use [fastprogress](https://github.com/fastai/fastprogress) instead of tqdm [#3693](https://github.com/pymc-devs/pymc3/pull/3693).
 - `DEMetropolis` can now tune both `lambda` and `scaling` parameters, but by default neither of them are tuned. See [#3743](https://github.com/pymc-devs/pymc3/pull/3743) for more info.
 - `DEMetropolisZ`, an improved variant of `DEMetropolis`, brings better parallelization and higher efficiency with fewer chains, with a slower initial convergence. This implementation is experimental. See [#3784](https://github.com/pymc-devs/pymc3/pull/3784) for more info.
 - Notebooks that give insight into `DEMetropolis`, `DEMetropolisZ` and the `DifferentialEquation` interface are now located in the [Tutorials/Deep Dive](https://docs.pymc.io/nb_tutorials/index.html) section.
@@ -14,6 +14,8 @@
 - `pm.sample` now has support for adapting dense mass matrix using `QuadPotentialFullAdapt` (see [#3596](https://github.com/pymc-devs/pymc3/pull/3596), [#3705](https://github.com/pymc-devs/pymc3/pull/3705), [#3858](https://github.com/pymc-devs/pymc3/pull/3858), and [#3893](https://github.com/pymc-devs/pymc3/pull/3893)). Use `init="adapt_full"` or `init="jitter+adapt_full"` to use.
 - `Moyal` distribution added (see [#3870](https://github.com/pymc-devs/pymc3/pull/3870)).
 - `pm.LKJCholeskyCov` now automatically computes and returns the unpacked Cholesky decomposition, the correlations and the standard deviations of the covariance matrix (see [#3881](https://github.com/pymc-devs/pymc3/pull/3881)).
+- `pm.Data` container can now be used for index variables, i.e. with integer data and not only floats (issue [#3813](https://github.com/pymc-devs/pymc3/issues/3813), fixed by [#3925](https://github.com/pymc-devs/pymc3/pull/3925)).
+- `pm.Data` container can now be used as input for other random variables (issue [#3842](https://github.com/pymc-devs/pymc3/issues/3842), fixed by [#3925](https://github.com/pymc-devs/pymc3/pull/3925)).

 ### Maintenance
 - Tuning results no longer leak into sequentially sampled `Metropolis` chains (see #3733 and #3796).
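The two `pm.Data` additions above hinge on keeping an integer dtype intact for index data. A minimal NumPy-only sketch (no PyMC3 required; variable names are ours) of why a float-coerced index breaks the group-indexing pattern these release notes enable:

```python
import numpy as np

# alpha holds one value per group; index picks the group for each row,
# mimicking the `alpha[index]` pattern used in hierarchical models.
alpha = np.array([1.0, 2.0, 3.0])
index = np.asarray([2, 0, 1, 0, 2])   # integer dtype is preserved

selected = alpha[index]               # integer fancy indexing works
print(selected)

# Had the data container coerced the index to float (the old behavior),
# NumPy would reject it as an index:
try:
    alpha[index.astype("float64")]
    float_index_ok = True
except IndexError:
    float_index_ok = False
print("float index accepted:", float_index_ok)
```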

pymc3/data.py

Lines changed: 11 additions & 9 deletions

@@ -153,9 +153,9 @@ class Minibatch(tt.TensorVariable):
     Examples
     --------
     Consider we have `data` as follows:
-
+
     >>> data = np.random.rand(100, 100)
-
+
     if we want a 1d slice of size 10 we do

     >>> x = Minibatch(data, batch_size=10)
@@ -182,7 +182,7 @@ class Minibatch(tt.TensorVariable):

     >>> assert x.eval().shape == (10, 10)

-
+
     You can pass the Minibatch `x` to your desired model:

     >>> with pm.Model() as model:
@@ -192,7 +192,7 @@ class Minibatch(tt.TensorVariable):

     Then you can perform regular Variational Inference out of the box
-
+

     >>> with model:
     ...     approx = pm.fit()
@@ -478,16 +478,19 @@ class Data:
     For more information, take a look at this example notebook
     https://docs.pymc.io/notebooks/data_container.html
     """
+
     def __new__(self, name, value):
+        if isinstance(value, list):
+            value = np.array(value)

         # Add data container to the named variables of the model.
         try:
             model = pm.Model.get_context()
         except TypeError:
-            raise TypeError("No model on context stack, which is needed to "
-                            "instantiate a data container. Add variable "
-                            "inside a 'with model:' block.")
+            raise TypeError(
+                "No model on context stack, which is needed to instantiate a data container. "
+                "Add variable inside a 'with model:' block."
+            )
         name = model.name_for(name)

         # `pm.model.pandas_to_array` takes care of parameter `value` and
@@ -498,7 +501,6 @@ def __new__(self, name, value):
         # its shape.
         shared_object.dshape = tuple(shared_object.shape.eval())

-
         model.add_random_variable(shared_object)

         return shared_object
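The `Data.__new__` patch above starts by normalizing plain Python lists to NumPy arrays so that dtype inspection works downstream. A standalone sketch of just that step (the helper name is ours, not PyMC3's):

```python
import numpy as np

def normalize_value(value):
    # Mirrors the new first step of Data.__new__: plain Python lists are
    # converted to NumPy arrays; arrays and other inputs pass through.
    if isinstance(value, list):
        value = np.array(value)
    return value

print(normalize_value([1, 2, 3]))    # array with an integer dtype
print(normalize_value([1.0, 2.0]))   # array with a float dtype
```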

pymc3/distributions/distribution.py

Lines changed: 3 additions & 0 deletions

@@ -111,6 +111,9 @@ def getattr_value(self, val):
         if isinstance(val, tt.TensorVariable):
             return val.tag.test_value

+        if isinstance(val, tt.sharedvar.TensorSharedVariable):
+            return val.get_value()
+
         if isinstance(val, theano_constant):
             return val.value
pymc3/model.py

Lines changed: 15 additions & 3 deletions

@@ -1244,7 +1244,7 @@ def set_data(new_data, model=None):
     ----------
     new_data: dict
         New values for the data containers. The keys of the dictionary are
-        the variables names in the model and the values are the objects
+        the variables' names in the model and the values are the objects
         with which to update.
     model: Model (optional if in `with` context)
@@ -1266,7 +1266,7 @@ def set_data(new_data, model=None):
     .. code:: ipython

         >>> with model:
-        ...     pm.set_data({'x': [5,6,9]})
+        ...     pm.set_data({'x': [5., 6., 9.]})
         ...     y_test = pm.sample_posterior_predictive(trace)
         >>> y_test['obs'].mean(axis=0)
         array([4.6088569 , 5.54128318, 8.32953844])
@@ -1275,6 +1275,8 @@ def set_data(new_data, model=None):

     for variable_name, new_value in new_data.items():
         if isinstance(model[variable_name], SharedVariable):
+            if isinstance(new_value, list):
+                new_value = np.array(new_value)
             model[variable_name].set_value(pandas_to_array(new_value))
         else:
             message = 'The variable `{}` must be defined as `pymc3.' \
@@ -1501,7 +1503,17 @@ def pandas_to_array(data):
         ret = generator(data)
     else:
         ret = np.asarray(data)
-    return pm.floatX(ret)
+
+    # type handling to enable index variables when data is int:
+    if hasattr(data, "dtype"):
+        if "int" in str(data.dtype):
+            return pm.intX(ret)
+        # otherwise, assume float:
+        else:
+            return pm.floatX(ret)
+    # needed for uses of this function other than with pm.Data:
+    else:
+        return pm.floatX(ret)


 def as_tensor(data, name, model, distribution):
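The dtype branch added to `pandas_to_array` can be exercised without Theano. A sketch with NumPy stand-ins for `pm.intX` and `pm.floatX` (the real functions cast to the dtypes Theano is configured with; the concrete `int32`/`float64` choices here are ours):

```python
import numpy as np

def intX(x):
    return np.asarray(x, dtype="int32")     # stand-in for pm.intX

def floatX(x):
    return np.asarray(x, dtype="float64")   # stand-in for pm.floatX

def pandas_to_array_sketch(data):
    # Minimal sketch of the new branch: integer input keeps an integer
    # dtype (so it can serve as an index variable); everything else,
    # including inputs without a .dtype such as plain lists, is coerced
    # to float as before.
    ret = np.asarray(data)
    if hasattr(data, "dtype") and "int" in str(data.dtype):
        return intX(ret)
    return floatX(ret)

print(pandas_to_array_sketch(np.array([2, 0, 1])).dtype)
print(pandas_to_array_sketch(np.array([1.5, 2.5])).dtype)
print(pandas_to_array_sketch([1, 2]).dtype)  # list has no .dtype -> float
```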

pymc3/tests/test_data_container.py

Lines changed: 110 additions & 59 deletions

@@ -20,117 +20,168 @@

 class TestData(SeededTest):
     def test_deterministic(self):
-        data_values = np.array([.5, .4, 5, 2])
+        data_values = np.array([0.5, 0.4, 5, 2])
         with pm.Model() as model:
-            X = pm.Data('X', data_values)
-            pm.Normal('y', 0, 1, observed=X)
+            X = pm.Data("X", data_values)
+            pm.Normal("y", 0, 1, observed=X)
         model.logp(model.test_point)

     def test_sample(self):
         x = np.random.normal(size=100)
         y = x + np.random.normal(scale=1e-2, size=100)

-        x_pred = np.linspace(-3, 3, 200, dtype='float32')
+        x_pred = np.linspace(-3, 3, 200, dtype="float32")

         with pm.Model():
-            x_shared = pm.Data('x_shared', x)
-            b = pm.Normal('b', 0., 10.)
-            pm.Normal('obs', b * x_shared, np.sqrt(1e-2), observed=y)
-            prior_trace0 = pm.sample_prior_predictive(1000)
+            x_shared = pm.Data("x_shared", x)
+            b = pm.Normal("b", 0.0, 10.0)
+            pm.Normal("obs", b * x_shared, np.sqrt(1e-2), observed=y)

+            prior_trace0 = pm.sample_prior_predictive(1000)
             trace = pm.sample(1000, init=None, tune=1000, chains=1)
             pp_trace0 = pm.sample_posterior_predictive(trace, 1000)
             pp_trace01 = pm.fast_sample_posterior_predictive(trace, 1000)

             x_shared.set_value(x_pred)
+            prior_trace1 = pm.sample_prior_predictive(1000)
             pp_trace1 = pm.sample_posterior_predictive(trace, samples=1000)
             pp_trace11 = pm.fast_sample_posterior_predictive(trace, samples=1000)
-            prior_trace1 = pm.sample_prior_predictive(1000)

-        assert prior_trace0['b'].shape == (1000,)
-        assert prior_trace0['obs'].shape == (1000, 100)
-        assert prior_trace1['obs'].shape == (1000, 200)
+        assert prior_trace0["b"].shape == (1000,)
+        assert prior_trace0["obs"].shape == (1000, 100)
+        assert prior_trace1["obs"].shape == (1000, 200)

-        assert pp_trace0['obs'].shape == (1000, 100)
-        assert pp_trace01['obs'].shape == (1000, 100)
+        assert pp_trace0["obs"].shape == (1000, 100)
+        assert pp_trace01["obs"].shape == (1000, 100)

-        np.testing.assert_allclose(x, pp_trace0['obs'].mean(axis=0), atol=1e-1)
-        np.testing.assert_allclose(x, pp_trace01['obs'].mean(axis=0), atol=1e-1)
+        np.testing.assert_allclose(x, pp_trace0["obs"].mean(axis=0), atol=1e-1)
+        np.testing.assert_allclose(x, pp_trace01["obs"].mean(axis=0), atol=1e-1)

-        assert pp_trace1['obs'].shape == (1000, 200)
-        assert pp_trace11['obs'].shape == (1000, 200)
+        assert pp_trace1["obs"].shape == (1000, 200)
+        assert pp_trace11["obs"].shape == (1000, 200)

-        np.testing.assert_allclose(x_pred, pp_trace1['obs'].mean(axis=0),
-                                   atol=1e-1)
-        np.testing.assert_allclose(x_pred, pp_trace11['obs'].mean(axis=0),
-                                   atol=1e-1)
+        np.testing.assert_allclose(x_pred, pp_trace1["obs"].mean(axis=0), atol=1e-1)
+        np.testing.assert_allclose(x_pred, pp_trace11["obs"].mean(axis=0), atol=1e-1)

     def test_sample_posterior_predictive_after_set_data(self):
         with pm.Model() as model:
-            x = pm.Data('x', [1., 2., 3.])
-            y = pm.Data('y', [1., 2., 3.])
-            beta = pm.Normal('beta', 0, 10.)
-            pm.Normal('obs', beta * x, np.sqrt(1e-2), observed=y)
+            x = pm.Data("x", [1.0, 2.0, 3.0])
+            y = pm.Data("y", [1.0, 2.0, 3.0])
+            beta = pm.Normal("beta", 0, 10.0)
+            pm.Normal("obs", beta * x, np.sqrt(1e-2), observed=y)
             trace = pm.sample(1000, tune=1000, chains=1)
         # Predict on new data.
         with model:
             x_test = [5, 6, 9]
-            pm.set_data(new_data={'x': x_test})
+            pm.set_data(new_data={"x": x_test})
             y_test = pm.sample_posterior_predictive(trace)
             y_test1 = pm.fast_sample_posterior_predictive(trace)

-        assert y_test['obs'].shape == (1000, 3)
-        assert y_test1['obs'].shape == (1000, 3)
-        np.testing.assert_allclose(x_test, y_test['obs'].mean(axis=0),
-                                   atol=1e-1)
-        np.testing.assert_allclose(x_test, y_test1['obs'].mean(axis=0),
-                                   atol=1e-1)
+        assert y_test["obs"].shape == (1000, 3)
+        assert y_test1["obs"].shape == (1000, 3)
+        np.testing.assert_allclose(x_test, y_test["obs"].mean(axis=0), atol=1e-1)
+        np.testing.assert_allclose(x_test, y_test1["obs"].mean(axis=0), atol=1e-1)

     def test_sample_after_set_data(self):
         with pm.Model() as model:
-            x = pm.Data('x', [1., 2., 3.])
-            y = pm.Data('y', [1., 2., 3.])
-            beta = pm.Normal('beta', 0, 10.)
-            pm.Normal('obs', beta * x, np.sqrt(1e-2), observed=y)
+            x = pm.Data("x", [1.0, 2.0, 3.0])
+            y = pm.Data("y", [1.0, 2.0, 3.0])
+            beta = pm.Normal("beta", 0, 10.0)
+            pm.Normal("obs", beta * x, np.sqrt(1e-2), observed=y)
             pm.sample(1000, init=None, tune=1000, chains=1)
         # Predict on new data.
-        new_x = [5., 6., 9.]
-        new_y = [5., 6., 9.]
+        new_x = [5.0, 6.0, 9.0]
+        new_y = [5.0, 6.0, 9.0]
         with model:
-            pm.set_data(new_data={'x': new_x, 'y': new_y})
+            pm.set_data(new_data={"x": new_x, "y": new_y})
             new_trace = pm.sample(1000, init=None, tune=1000, chains=1)
             pp_trace = pm.sample_posterior_predictive(new_trace, 1000)
             pp_tracef = pm.fast_sample_posterior_predictive(new_trace, 1000)

-        assert pp_trace['obs'].shape == (1000, 3)
-        assert pp_tracef['obs'].shape == (1000, 3)
-        np.testing.assert_allclose(new_y, pp_trace['obs'].mean(axis=0),
-                                   atol=1e-1)
-        np.testing.assert_allclose(new_y, pp_tracef['obs'].mean(axis=0),
-                                   atol=1e-1)
+        assert pp_trace["obs"].shape == (1000, 3)
+        assert pp_tracef["obs"].shape == (1000, 3)
+        np.testing.assert_allclose(new_y, pp_trace["obs"].mean(axis=0), atol=1e-1)
+        np.testing.assert_allclose(new_y, pp_tracef["obs"].mean(axis=0), atol=1e-1)
+
+    def test_shared_data_as_index(self):
+        """
+        Allow pm.Data to be used for index variables, i.e. with integers as well as floats.
+        See https://github.com/pymc-devs/pymc3/issues/3813
+        """
+        with pm.Model() as model:
+            index = pm.Data("index", [2, 0, 1, 0, 2])
+            y = pm.Data("y", [1.0, 2.0, 3.0, 2.0, 1.0])
+            alpha = pm.Normal("alpha", 0, 1.5, shape=3)
+            pm.Normal("obs", alpha[index], np.sqrt(1e-2), observed=y)
+
+            prior_trace = pm.sample_prior_predictive(1000, var_names=["alpha"])
+            trace = pm.sample(1000, init=None, tune=1000, chains=1)
+
+        # Predict on new data
+        new_index = np.array([0, 1, 2])
+        new_y = [5.0, 6.0, 9.0]
+        with model:
+            pm.set_data(new_data={"index": new_index, "y": new_y})
+            pp_trace = pm.sample_posterior_predictive(
+                trace, 1000, var_names=["alpha", "obs"]
+            )
+            pp_tracef = pm.fast_sample_posterior_predictive(
+                trace, 1000, var_names=["alpha", "obs"]
+            )
+
+        assert prior_trace["alpha"].shape == (1000, 3)
+        assert trace["alpha"].shape == (1000, 3)
+        assert pp_trace["alpha"].shape == (1000, 3)
+        assert pp_trace["obs"].shape == (1000, 3)
+        assert pp_tracef["alpha"].shape == (1000, 3)
+        assert pp_tracef["obs"].shape == (1000, 3)
+
+    def test_shared_data_as_rv_input(self):
+        """
+        Allow pm.Data to be used as input for other RVs.
+        See https://github.com/pymc-devs/pymc3/issues/3842
+        """
+        with pm.Model() as m:
+            x = pm.Data("x", [1.0, 2.0, 3.0])
+            _ = pm.Normal("y", mu=x, shape=3)
+            trace = pm.sample(chains=1)
+
+        np.testing.assert_allclose(np.array([1.0, 2.0, 3.0]), x.get_value(), atol=1e-1)
+        np.testing.assert_allclose(
+            np.array([1.0, 2.0, 3.0]), trace["y"].mean(0), atol=1e-1
+        )
+
+        with m:
+            pm.set_data({"x": np.array([2.0, 4.0, 6.0])})
+            trace = pm.sample(chains=1)
+
+        np.testing.assert_allclose(np.array([2.0, 4.0, 6.0]), x.get_value(), atol=1e-1)
+        np.testing.assert_allclose(
+            np.array([2.0, 4.0, 6.0]), trace["y"].mean(0), atol=1e-1
+        )

     def test_creation_of_data_outside_model_context(self):
         with pytest.raises((IndexError, TypeError)) as error:
-            pm.Data('data', [1.1, 2.2, 3.3])
-        error.match('No model on context stack')
+            pm.Data("data", [1.1, 2.2, 3.3])
+        error.match("No model on context stack")

     def test_set_data_to_non_data_container_variables(self):
         with pm.Model() as model:
-            x = np.array([1., 2., 3.])
-            y = np.array([1., 2., 3.])
-            beta = pm.Normal('beta', 0, 10.)
-            pm.Normal('obs', beta * x, np.sqrt(1e-2), observed=y)
+            x = np.array([1.0, 2.0, 3.0])
+            y = np.array([1.0, 2.0, 3.0])
+            beta = pm.Normal("beta", 0, 10.0)
+            pm.Normal("obs", beta * x, np.sqrt(1e-2), observed=y)
             pm.sample(1000, init=None, tune=1000, chains=1)
         with pytest.raises(TypeError) as error:
-            pm.set_data({'beta': [1.1, 2.2, 3.3]}, model=model)
-        error.match('defined as `pymc3.Data` inside the model')
+            pm.set_data({"beta": [1.1, 2.2, 3.3]}, model=model)
+        error.match("defined as `pymc3.Data` inside the model")

     def test_model_to_graphviz_for_model_with_data_container(self):
         with pm.Model() as model:
-            x = pm.Data('x', [1., 2., 3.])
-            y = pm.Data('y', [1., 2., 3.])
-            beta = pm.Normal('beta', 0, 10.)
-            pm.Normal('obs', beta * x, np.sqrt(1e-2), observed=y)
+            x = pm.Data("x", [1.0, 2.0, 3.0])
+            y = pm.Data("y", [1.0, 2.0, 3.0])
+            beta = pm.Normal("beta", 0, 10.0)
+            pm.Normal("obs", beta * x, np.sqrt(1e-2), observed=y)
             pm.sample(1000, init=None, tune=1000, chains=1)

         g = pm.model_to_graphviz(model)
@@ -147,7 +198,7 @@ def test_model_to_graphviz_for_model_with_data_container(self):

 def test_data_naming():
     """
-    This is a test for issue #3793 -- `Data` objects in named models are
+    This is a test for issue #3793 -- `Data` objects in named models are
     not given model-relative names.
     """
     with pm.Model("named_model") as model: