[1]:
import warnings
import arviz as az
import matplotlib
import numpy as np
import pymc3 as pm
import scipy as sp
import scipy.sparse
import scipy.sparse.linalg
import seaborn as sbn
import theano as T
import theano.tensor as tt
from matplotlib import pyplot as plt
print(f"Running on PyMC3 v{pm.__version__}")
Running on PyMC3 v3.9.0
[2]:
%config InlineBackend.figure_format = 'retina'
az.style.use("arviz-darkgrid")
warnings.filterwarnings("ignore")
np.set_printoptions(precision=3, suppress=True)
np.random.seed(314159)
[3]:
# Preliminary data generation
n = 250
k_true = 5
d = 9
err_sd = 2
N_SAMPLE = 350
M = np.random.binomial(1, 0.25, size=(k_true, n))
Q = np.hstack(
    [np.random.exponential(2 * k_true - k, size=(d, 1)) for k in range(k_true)]
) * np.random.binomial(1, 0.75, size=(d, k_true))
Y = np.round(1000 * np.dot(Q, M) + np.random.normal(size=(d, n)) * err_sd) / 1000
print(np.sum(Q, axis=0))
Y[:3, :7]
[81.791 43.265 45.045 35.098 26.493]
[3]:
array([[ 0.406, 0. , 5.074, 9.827, 0.41 , 15.139, -0.001],
[ 0.003, 0.003, 14.975, 0.001, 0. , 8.635, -0.004],
[-0.002, 0.001, 5.012, 19.101, 3.31 , 20.363, -0.001]])
Identifiability and scalability in Bayesian Factor Analysis
Probabilistic PCA and Bayesian Factor Analysis are a common source of questions on the PyMC3 Discourse:
https://discourse.pymc.io/t/large-scale-factor-analysis-with-minibatch-advi/246
https://discourse.pymc.io/t/dealing-with-missing-data/252
https://discourse.pymc.io/t/unique-solution-for-probabilistic-pca/1324/14
The model for factor analysis is the probabilistic matrix factorization
\(X_{(d,n)}|W_{(d,k)}, F_{(k,n)} \sim N(WF, \Psi)\)
with \(\Psi\) a diagonal matrix. Subscripts denote the dimensionality of the matrices. Probabilistic PCA is a variant that sets \(\Psi = \sigma^2 I\). The basic implementation (taken from https://gist.github.com/twiecki/c95578a6539d2098be2d83575e3d15fe) is:
[4]:
k = 2

with pm.Model() as PPCA:
    W = pm.Normal("W", shape=(d, k))
    F = pm.Normal("F", shape=(k, n))
    psi = pm.HalfNormal("psi", 1.0)
    X = pm.Normal("X", mu=tt.dot(W, F), sd=psi, observed=Y)

    # select a subset of weights and factors to plot
    W_plot = pm.Deterministic("W_plot", W[1:3, 0])
    F_plot = pm.Deterministic("F_plot", F[0, 1:3])

    trace = pm.sample(N_SAMPLE, chains=4, cores=1, init="advi+adapt_diag")

pm.traceplot(trace, ("W_plot", "F_plot", "psi"));
Only 350 samples in chain.
Auto-assigning NUTS sampler...
Initializing NUTS using advi+adapt_diag...
Convergence achieved at 53300
Interrupted at 53,299 [26%]: Average Loss = 30,068
Sequential sampling (4 chains in 1 job)
NUTS: [psi, F, W]
Sampling 4 chains for 1_000 tune and 350 draw iterations (4_000 + 1_400 draws total) took 46 seconds.
The chain contains only diverging samples. The model is probably misspecified.
The acceptance probability does not match the target. It is 0.4415202957992365, but should be close to 0.8. Try to increase the number of tuning steps.
There were 350 divergences after tuning. Increase `target_accept` or reparameterize.
There were 350 divergences after tuning. Increase `target_accept` or reparameterize.
There were 664 divergences after tuning. Increase `target_accept` or reparameterize.
The acceptance probability does not match the target. It is 0.4973374193608769, but should be close to 0.8. Try to increase the number of tuning steps.
The rhat statistic is larger than 1.4 for some parameters. The sampler did not converge.
The estimated number of effective samples is smaller than 200 for some parameters.

This approach suffers from two drawbacks: identifiability and scalability.
First, only the product \(WF\) matters for the likelihood of \(X\), so \(P(X|W, F) = P(X|W\Omega, \Omega^{-1}F)\) for any invertible matrix \(\Omega\). While the priors on \(W\) and \(F\) constrain \(|\Omega|\) to be neither too large nor too small, the factors and loadings can still be rotated, reflected, and/or permuted without changing the model likelihood. Expect this to happen between runs of the sampler, or even for the parametrization to "drift" within a single run, producing the gobbledygook W traceplot above.
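To see this concretely, here is a small, purely illustrative NumPy check (the dimensions and the orthogonal matrix Omega below are our own example, not part of the model): rotating the loadings and counter-rotating the factors leaves the product \(WF\), and hence the likelihood, unchanged.

import numpy as np
from scipy.stats import ortho_group

d_, k_, n_ = 9, 2, 250  # illustrative dimensions
W0 = np.random.normal(size=(d_, k_))
F0 = np.random.normal(size=(k_, n_))
Omega = ortho_group.rvs(k_)  # a random orthogonal matrix (rotation/reflection)

# W F is identical to (W Omega)(Omega^{-1} F)
print(np.allclose(np.dot(W0, F0), np.dot(np.dot(W0, Omega), np.dot(np.linalg.inv(Omega), F0))))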
This can be fixed by constraining the form of W to be:

+ Lower triangular
+ Positive, increasing diagonal
We can adapt expand_packed_triangular to fill out a non-square matrix and then update our model:
[5]:
def expand_packed_block_triangular(n, k, packed, diag=None, mtype="theano"):
    # like expand_packed_triangular, but with n > k.
    assert mtype in {"theano", "numpy"}
    assert n >= k

    def set_(M, i_, v_):
        if mtype == "theano":
            return tt.set_subtensor(M[i_], v_)
        M[i_] = v_
        return M

    out = tt.zeros((n, k), dtype=float) if mtype == "theano" else np.zeros((n, k), dtype=float)
    if diag is None:
        idxs = np.tril_indices(n, m=k)
        out = set_(out, idxs, packed)
    else:
        idxs = np.tril_indices(n, k=-1, m=k)
        out = set_(out, idxs, packed)
        idxs = (np.arange(k), np.arange(k))
        out = set_(out, idxs, diag)
    return out
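# Illustrative usage note (ours, not from the original notebook): in "numpy" mode with diag=None,
#     expand_packed_block_triangular(4, 2, np.arange(1.0, 8.0), mtype="numpy")
# fills the 4*2 - 2*1/2 = 7 packed values into the lower triangle (including the
# diagonal) of a 4x2 matrix, giving [[1, 0], [2, 3], [4, 5], [6, 7]].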
def makeW(dat, k):
    # make a W matrix adapted to the data shape
    n_od = int(k * d - k * (k - 1) / 2 - k)

    # trick: the cumulative sum of z will be positive increasing
    z = pm.HalfNormal("W_z", 1.0, shape=(k,))
    b = pm.Normal("W_b", 0.0, 1.0, shape=(n_od,))
    ones = tt.ones(k)

    L = pm.Deterministic("W_L", expand_packed_block_triangular(d, k, b, ones))
    W = pm.Deterministic("W", tt.dot(L, tt.diag(tt.extra_ops.cumsum(z))))
    return W
k = 2

with pm.Model() as PPCA_identified:
    W = makeW(Y, k)
    F = pm.Normal("F", shape=(k, n))
    psi = pm.HalfNormal("psi", 1.0)
    X = pm.Normal("X", mu=tt.dot(W, F), sd=psi, observed=Y)

    # select a subset of weights and factors to plot
    W_plot = pm.Deterministic("W_plot", W[1:3, 0])
    F_plot = pm.Deterministic("F_plot", F[0, 1:3])

    trace = pm.sample(N_SAMPLE, chains=4, cores=1, init="advi+adapt_diag")

pm.traceplot(trace, ("W_plot", "F_plot", "psi"));
Only 350 samples in chain.
Auto-assigning NUTS sampler...
Initializing NUTS using advi+adapt_diag...
Convergence achieved at 38000
Interrupted at 37,999 [18%]: Average Loss = 38,506
Sequential sampling (4 chains in 1 job)
NUTS: [psi, F, W_b, W_z]
Sampling 4 chains for 1_000 tune and 350 draw iterations (4_000 + 1_400 draws total) took 148 seconds.
There were 3 divergences after tuning. Increase `target_accept` or reparameterize.
There were 4 divergences after tuning. Increase `target_accept` or reparameterize.
There were 4 divergences after tuning. Increase `target_accept` or reparameterize.
There were 4 divergences after tuning. Increase `target_accept` or reparameterize.
The rhat statistic is larger than 1.05 for some parameters. This indicates slight problems during sampling.
The estimated number of effective samples is smaller than 200 for some parameters.

W (and F!) now have consistent posterior distributions across runs of the sampler.
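As a quick, purely illustrative check (assuming the trace object produced by the cell above), the per-chain posterior means of W should now agree up to Monte Carlo error:

W_chain_means = [trace.get_values("W", chains=[c]).mean(axis=0) for c in trace.chains]
for c, Wm in zip(trace.chains, W_chain_means):
    print(f"chain {c} posterior mean of W:\n{np.round(Wm, 2)}")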
Because all \(k \times n\) parameters in \(F\) need to be sampled, sampling can become quite expensive for large \(n\). In addition, the link between an observed data point \(X_i\) and its associated latent value \(F_i\) means that the data cannot be mini-batched.
This scalability problem can be addressed analytically by integrating \(F\) out of the model. However, doing so fixes the prior on \(F\), leaving no flexibility to model it differently. Keeping \(F_{ij} \sim N(0, 1)\), we have
\(X|W, F \sim \mathrm{MN}(WF, \Psi, I), \quad F_{ij} \sim N(0, 1)\)
\(X|W \sim \mathrm{MN}(0, \Psi + WW^T, I)\)
The explicit integration of \(F\) also enables batching the observations for faster computation of ADVI and FullRankADVI approximations.
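The marginal covariance can be sanity-checked with a small, purely illustrative NumPy simulation (all of the names below are our own, not part of the model): drawing \(F_{ij} \sim N(0, 1)\) and adding \(N(0, \Psi)\) noise, the empirical covariance of the columns of \(X\) approaches \(WW^T + \Psi\).

W_demo = np.random.normal(size=(4, 2))  # an arbitrary 4x2 loading matrix
psi_demo = 0.5                          # isotropic noise variance
F_demo = np.random.normal(size=(2, 200_000))
X_demo = np.dot(W_demo, F_demo) + np.sqrt(psi_demo) * np.random.normal(size=(4, 200_000))

print(np.cov(X_demo))                                   # empirical covariance of the columns of X
print(np.dot(W_demo, W_demo.T) + psi_demo * np.eye(4))  # analytic W W^T + psi I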
[6]:
k = 2

with pm.Model() as PPCA_scaling:
    W = makeW(Y, k)
    Y_mb = pm.Minibatch(Y.T, 50)  # MvNormal expects one observation per row, so transpose Y
    psi = pm.HalfNormal("psi", 1.0)
    E = pm.Deterministic("cov", tt.dot(W, tt.transpose(W)) + psi * tt.diag(tt.ones(d)))
    X = pm.MvNormal("X", 0.0, cov=E, observed=Y_mb)

    W_plot = pm.Deterministic("W_plot", W[:3, 0])

    trace = pm.FullRankADVI().fit(35000, obj_n_mc=2).sample(N_SAMPLE * 4)

pm.traceplot(trace, var_names=["W_plot", "psi"]);
Finished [100%]: Average Loss = 1,548.2

Post-hoc identification of F
The matrix \(F\) is typically of interest for factor analysis, and is often used as a feature matrix for dimensionality reduction. However, \(F\) was marginalized away to make fitting the model easier, and now we need it back. This is, in effect, an exercise in least squares. Since
\(X|W, F \sim N(WF, \psi I)\)
(with \(\psi\) the noise variance estimated in the marginalized model), applying the pseudo-inverse \((W^TW)^{-1}W^T\) recovers \(F\) up to noise:
\((W^TW)^{-1}W^TX \,|\, W, F \sim N(F, \psi(W^TW)^{-1})\)
[7]:
F_sampled = np.zeros((500, k, n))
for q in range(500):
    Wq = trace["W"][q, :, :]
    Pq = trace["psi"][q]  # noise variance in the marginalized model
    WWq = np.linalg.inv(np.dot(Wq.T, Wq))

    # least-squares estimate of F, plus noise with covariance psi * (W^T W)^{-1}
    Fq_mu = np.dot(np.dot(WWq, Wq.T), Y)
    WWq_chol = np.linalg.cholesky(Pq * WWq)
    F_sampled[q, :, :] = Fq_mu + np.dot(WWq_chol, np.random.normal(size=(k, n)))
cols = ["black", "blue", "red", "orange", "purple", "magenta", "green", "yellow"]
for i in range(2):
    for j in range(5):
        sbn.kdeplot(F_sampled[:, i, j], color=cols[(i + j) % len(cols)]);

[8]:
%load_ext watermark
%watermark -n -u -v -iv -w
seaborn 0.10.1
matplotlib 3.2.1
scipy 1.4.1
numpy 1.18.5
theano 1.0.4
arviz 0.8.3
pymc3 3.9.0
last updated: Mon Jun 15 2020
CPython 3.7.7
IPython 7.15.0
watermark 2.0.2