Description
Describe the issue:
There are a couple of issues with the current design:
i) The keys of `start` and `start_sigma` are inconsistent, as can be seen in the example below: `start` uses the untransformed name "sigma", while `start_sigma` needs the transformed name "sigma_log__".
ii) I can specify arbitrary (str-valued) keys for `start_sigma`; if they are not in the model, the default zero initialisation is used for the `rho` variable in the variational approximation.
iii) The `np.log(np.expm1(np.abs(sigma)))` transformation from sigma to rho in the method `create_shared_params` of class `MeanFieldGroup` is also somewhat surprising (see the sketch after the quoted snippet below).
iv) It is also unclear how the values generated by `advi.approx.mean.eval` and `advi.approx.std.eval` relate to the variational parameters `mu` and `rho` returned by the method `create_shared_params`, and/or to the free random variables `beta` and `sigma` of the model. For example, I think that `tracker["mean"][-1][-1]` (cf. example below) is the variational parameter corresponding to the variable `sigma_log__`; the sketch after this list makes this concrete.
v) In `pm.fit`, the arguments `start` and `start_sigma` are ignored if an instance of ADVI is passed as `method`.
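To make point iv) concrete, here is a sketch of how I would inspect the layout of the flat parameter arrays. It assumes a single `MeanFieldGroup` covering all free RVs, and that `advi.approx.groups[0].ordering` is the same `self.ordering` that `create_shared_params` (quoted further below) iterates over; both are assumptions on my part:

```python
# Print which slice of the flat mean/std arrays belongs to which (transformed)
# free variable of the model.
for name, slc, *_ in advi.approx.groups[0].ordering.values():
    print(name, slc)
# For the model below I would expect something like
#   beta        slice(0, 1, None)
#   sigma_log__ slice(1, 2, None)
# i.e. tracker["mean"][-1][-1] is the variational mean of sigma_log__.
```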
Reproducible code example:
A minimal example is below.
```python
import numpy as np
import pandas as pd
import pymc as pm
from pymc.variational.callbacks import CheckParametersConvergence, Tracker


def generate_data(num_samples: int) -> pd.DataFrame:
    rng = np.random.default_rng(seed=42)
    beta = 1.0
    sigma = 10.0
    x = rng.normal(loc=0.0, scale=1.0, size=num_samples)
    y = beta * x + sigma * rng.normal(size=num_samples)
    return pd.DataFrame({"x": x, "y": y})


def make_model(frame: pd.DataFrame) -> pm.Model:
    with pm.Model() as model:
        # Data
        x = pm.Data("x", frame["x"])
        y = pm.Data("y", frame["y"])
        # Prior
        beta = pm.Normal("beta", sigma=10.0)
        sigma = pm.HalfNormal("sigma", sigma=20.0)
        # Linear model
        mu = beta * x
        # Likelihood
        pm.Normal("y_obs", mu=mu, sigma=sigma, observed=y)
    return model


if __name__ == "__main__":
    frame = generate_data(10000)
    model = make_model(frame)
    with model:
        advi = pm.ADVI(
            start={"beta": 1.0, "sigma": 10.0},
            start_sigma={"beta": 0.5, "sigma_log__": 0.5},
        )
        tracker = Tracker(
            mean=advi.approx.mean.eval,
            std=advi.approx.std.eval,
        )
        approx = pm.fit(
            n=1_000_000,
            method=advi,
            callbacks=[
                CheckParametersConvergence(diff="relative", tolerance=1e-3),
                CheckParametersConvergence(diff="absolute", tolerance=1e-3),
                tracker,
            ],
        )
```
This is from class `MeanFieldGroup`:

```python
def create_shared_params(self, start=None, start_sigma=None):
    # NOTE: `Group._prepare_start` uses `self.model.free_RVs` to identify free variables and
    # `DictToArrayBijection` to turn them into a flat array, while `Approximation.rslice` assumes that the free
    # variables are given by `self.group` and that the mapping between original variables and flat array is given
    # by `self.ordering`. In the cases I looked into these turn out to be the same, but there may be edge cases or
    # future code changes that break this assumption.
    start = self._prepare_start(start)
    rho1 = np.zeros((self.ddim,))
    if start_sigma is not None:
        for name, slice_, *_ in self.ordering.values():
            sigma = start_sigma.get(name)
            if sigma is not None:
                rho1[slice_] = np.log(np.expm1(np.abs(sigma)))
    rho = rho1
    return {
        "mu": pytensor.shared(pm.floatX(start), "mu"),
        "rho": pytensor.shared(pm.floatX(rho), "rho"),
    }
```
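Regarding point iii): the transformation is less surprising once one notices that it is (up to the `np.abs`) the inverse of the softplus that maps the unconstrained `rho` to a positive standard deviation. As far as I can tell this is what `MeanFieldGroup.std` computes via `rho2sigma`, but that is my reading of the code, not documented behaviour. A quick numerical check:

```python
import numpy as np

sigma = 0.5
rho = np.log(np.expm1(np.abs(sigma)))  # the transform used in create_shared_params
# softplus(rho) = log(1 + exp(rho)) recovers the requested standard deviation:
assert np.isclose(np.log1p(np.exp(rho)), sigma)
```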
I think this simple example already hits the problematic edge case mentioned in the NOTE. The line `sigma = start_sigma.get(name)` is also problematic, because passing `start_sigma` with wrong keys will never raise an error. This is in contrast to the parameter `start`, where a wrong key does raise, e.g. using "beta_foo" instead of "beta":
```
Traceback (most recent call last):
  File ".../univariate.py", line 167, in <module>
    advi = pm.ADVI(
           ^^^^^^^^
  File ".../pymc/variational/inference.py", line 471, in __init__
    super().__init__(MeanField(*args, **kwargs))
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File ".../pymc/variational/approximations.py", line 339, in __init__
    super().__init__(groups, model=kwargs.get("model"))
  File ".../pymc/variational/opvi.py", line 1229, in __init__
    rest.__init_group__(unseen_free_RVs)
  File ".../pytensor/configparser.py", line 44, in res
    return f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^
  File ".../pymc/variational/approximations.py", line 74, in __init_group__
    self.shared_params = self.create_shared_params(
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File ".../pymc/variational/approximations.py", line 85, in create_shared_params
    start = self._prepare_start(start)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File ".../pymc/variational/opvi.py", line 760, in _prepare_start
    ipfn = make_initial_point_fn(
           ^^^^^^^^^^^^^^^^^^^^^^
  File ".../pymc/initial_point.py", line 134, in make_initial_point_fn
    sdict_overrides = convert_str_to_rv_dict(model, overrides or {})
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File ".../pymc/initial_point.py", line 49, in convert_str_to_rv_dict
    initvals[model[key]] = initval
             ~~~~~^^^^^
  File ".../pymc/model/core.py", line 1575, in __getitem__
    raise e
  File ".../pymc/model/core.py", line 1570, in __getitem__
    return self.named_vars[key]
           ~~~~~~~~~~~~~~~^^^^^
KeyError: 'beta_foo'
```
This block in `pm.fit` is also problematic, since it silently ignores `start` and `start_sigma` if an instance of e.g. ADVI is passed as the `method` argument instead of the string 'advi':
```python
_select = dict(advi=ADVI, fullrank_advi=FullRankADVI, svgd=SVGD, asvgd=ASVGD)
if isinstance(method, str):
    method = method.lower()
    if method in _select:
        inference = _select[method](model=model, **inf_kwargs)
    else:
        raise KeyError(f"method should be one of {set(_select.keys())} or Inference instance")
elif isinstance(method, Inference):
    inference = method
...
```
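A defensive wrapper along the following lines (a hypothetical helper of my own, not PyMC API) shows what I would have expected instead of the silent drop:

```python
import pymc as pm
from pymc.variational.inference import Inference


def fit_checked(n, method="advi", **kwargs):
    """Hypothetical wrapper: fail loudly when `start`/`start_sigma` would be
    silently ignored because `method` is already an Inference instance."""
    if isinstance(method, Inference):
        ignored = {k for k in ("start", "start_sigma") if k in kwargs}
        if ignored:
            raise ValueError(
                f"{sorted(ignored)} are ignored when `method` is an Inference "
                "instance; pass them to the ADVI constructor instead."
            )
    return pm.fit(n=n, method=method, **kwargs)
```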
Initially, this made debugging the actual issue even more contrived. It is also not clear whether I can use any instance of ADVI to get tracking for `mean` and `std`, or whether it has to be the same instance that is passed to `pm.fit`.
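My reading (an assumption, not something I found documented) is that it has to be the same instance: `advi.approx.mean` and `advi.approx.std` are symbolic expressions of that approximation's shared parameters, and fitting updates exactly those shared variables. A sketch of what I would expect:

```python
with model:
    advi = pm.ADVI()
    other = pm.ADVI()  # a second, independent approximation of the same model
    tracker = Tracker(mean=other.approx.mean.eval)  # tracks `other`, not `advi`
    pm.fit(n=1_000, method=advi, callbacks=[tracker])
# Expected under this reading: all entries of tracker["mean"] are identical,
# because fitting `advi` only updates the shared parameters of `advi.approx`.
```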
I'd be happy to make a PR with a fix, but I am lacking familiarity with the APIs/concepts mentioned in the NOTE, i.e. that `Group._prepare_start` uses `self.model.free_RVs` to identify free variables and `DictToArrayBijection` to turn them into a flat array, while `Approximation.rslice` assumes that the free variables are given by `self.group` and that the mapping between original variables and the flat array is given by `self.ordering`. I'd be super glad to receive guidance so that I can work on this. I think a fix could significantly improve the API and usability of the initialisation for ADVI.
Context for the issue:
Cf. https://discourse.pymc.io/t/variational-fit-advi-initialisation/15940/4 for more context. Also, thanks a lot to @jessegrabowski for the already very helpful suggestions.
A working example that demonstrates the quirks of the current state is below. It shows how to use the "final state" (the variational parameters `mu` and `rho`?) to initialise a fit for a model with a slightly enlarged data set. However, I wouldn't know how to implement this for a complicated (hierarchical) model with many (transformed) random variables, since the transformations need to be applied in order to use the "final state" from the first fit as the initialisation of the second fit; a sketch of how this might be generalised follows the example.
```python
import time

import numpy as np
import pandas as pd
import pymc as pm
from pymc.variational.callbacks import CheckParametersConvergence, Tracker


def generate_data(num_samples: int) -> pd.DataFrame:
    rng = np.random.default_rng(seed=42)
    beta = 1.0
    sigma = 10.0
    x = rng.normal(loc=0.0, scale=1.0, size=num_samples)
    y = beta * x + sigma * rng.normal(size=num_samples)
    return pd.DataFrame({"x": x, "y": y})


def make_model(frame: pd.DataFrame) -> pm.Model:
    with pm.Model() as model:
        # Data
        x = pm.Data("x", frame["x"])
        y = pm.Data("y", frame["y"])
        # Prior
        beta = pm.Normal("beta", sigma=10.0)
        sigma = pm.HalfNormal("sigma", sigma=20.0)
        # Linear model
        mu = beta * x
        # Likelihood
        pm.Normal("y_obs", mu=mu, sigma=sigma, observed=y)
    return model


if __name__ == "__main__":
    num_samples = 10_000
    frame = generate_data(num_samples=num_samples)
    model = make_model(frame)
    with model:
        advi = pm.ADVI()
        tracker = Tracker(
            mean=advi.approx.mean.eval,
            std=advi.approx.std.eval,
        )
        t0 = time.time()
        approx = pm.fit(
            n=1_000_000,
            method=advi,
            callbacks=[
                CheckParametersConvergence(diff="relative", tolerance=1e-3),
                CheckParametersConvergence(diff="absolute", tolerance=1e-3),
                tracker,
            ],
        )
        t = time.time() - t0
        print(f"Time for fit is {t:.3f}s.")

    frame_new = pd.concat([frame, generate_data(100)], axis=0)
    model = make_model(frame_new)
    with model:
        advi = pm.ADVI(
            # NOTE the quirks: `start` needs untransformed names/values (hence
            # np.exp on the tracked mean of sigma_log__), while `start_sigma`
            # needs the *transformed* name "sigma_log__".
            start={"beta": tracker["mean"][-1][0], "sigma": np.exp(tracker["mean"][-1][1])},
            start_sigma={"beta": tracker["std"][-1][0], "sigma_log__": tracker["std"][-1][1]},
        )
        tracker = Tracker(
            mean=advi.approx.mean.eval,
            std=advi.approx.std.eval,
        )
        t0 = time.time()
        approx = pm.fit(
            n=1_000_000,
            method=advi,
            callbacks=[
                CheckParametersConvergence(diff="relative", tolerance=1e-3),
                CheckParametersConvergence(diff="absolute", tolerance=1e-3),
                tracker,
            ],
        )
        t = time.time() - t0
        print(f"Time for fit is {t:.3f}s.")
```
This is the trace for the initial ADVI fit, and here is the trace for the model with the updated data set and the initialised parameters. As we can see, quite a lot of computation can be saved by using the final parameters from the old fit as the initial guess for the new fit.
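For reference, here is a sketch of how the mapping from the tracked flat arrays back to `start`/`start_sigma` might be generalised, based on the `create_shared_params` snippet above. The use of `advi.approx.groups[0].ordering` and the hand-written inversion of the transforms are my assumptions, not verified API:

```python
import numpy as np

# Assumes a single MeanFieldGroup and that `ordering` maps transformed variable
# names to slices of the flat arrays, as in create_shared_params above.
group = advi.approx.groups[0]
mean, std = tracker["mean"][-1], tracker["std"][-1]
flat_mean = {name: mean[slc] for name, slc, *_ in group.ordering.values()}
flat_std = {name: std[slc] for name, slc, *_ in group.ordering.values()}

# `start` wants untransformed names/values, so each transform has to be undone
# by hand (here only sigma's log transform); `start_sigma` instead wants the
# transformed names. A general solution would need to look up every variable's
# transform in the model.
start = {
    "beta": flat_mean["beta"].item(),
    "sigma": np.exp(flat_mean["sigma_log__"]).item(),
}
start_sigma = dict(flat_std)
```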
Error message:
No response