# Source code for plotnine.stats.stat_ellipse

from __future__ import annotations

import typing
from warnings import warn

import numpy as np
import pandas as pd

from ..doctools import document
from ..exceptions import PlotnineWarning
from .stat import stat

if typing.TYPE_CHECKING:
    from typing import Any, Optional

    from plotnine.typing import FloatArray, FloatArrayLike


@document
class stat_ellipse(stat):
    """
    Calculate normal confidence interval ellipse

    {usage}

    Parameters
    ----------
    {common_parameters}
    type : str in ['t', 'norm', 'euclid'] (default: 't')
        The type of ellipse.

        - ``'t'`` - assumes a multivariate t-distribution, and
        - ``'norm'`` - assumes a multivariate normal distribution.
        - ``'euclid'`` - draws a circle with the radius equal to
          `level`, representing the euclidean distance from the
          center.
    level : float, optional (default: 0.95)
        The confidence level at which to draw the ellipse.
    segments : int, optional (default: 51)
        Number of segments to be used in drawing the ellipse.
    """

    REQUIRED_AES = {"x", "y"}
    DEFAULT_PARAMS = {
        "geom": "path",
        "position": "identity",
        "na_rm": False,
        "type": "t",
        "level": 0.95,
        "segments": 51,
    }

    @classmethod
    def compute_group(cls, data, scales, **params):
        import scipy.stats as stats
        from scipy import linalg

        level = params["level"]
        segments = params["segments"]
        type_ = params["type"]

        # An ellipse has 2 degrees of freedom in the numerator; the
        # denominator df comes from the number of observations.
        dfn = 2
        dfd = len(data) - 1

        if dfd < 3:
            warn("Too few points to calculate an ellipse", PlotnineWarning)
            return pd.DataFrame({"x": [], "y": []})

        points: FloatArray = np.asarray(data[["x", "y"]])

        # Location & scatter estimates that define the ellipse
        if type_ == "t":
            # Robust estimate via a multivariate t fit
            fit = cov_trob(points)
            cov, center = fit["cov"], fit["center"]
        elif type_ == "norm":
            cov = np.cov(points, rowvar=False)
            center = np.mean(points, axis=0)
        elif type_ == "euclid":
            # A circle: use the smallest variance along both axes
            cov = np.cov(points, rowvar=False)
            cov = np.diag(np.repeat(np.diag(cov).min(), 2))
            center = np.mean(points, axis=0)
        else:
            raise ValueError(f"Unknown value for type={type_}")

        # numpy's cholesky function does not guarantee upper/lower
        # triangular factorization, so use scipy's and ask for the
        # upper-triangular factor explicitly.
        chol_decomp = linalg.cholesky(cov, lower=False)

        # Radius of the ellipse in the whitened coordinate system
        if type_ == "euclid":
            radius = level / chol_decomp.max()
        else:
            radius = np.sqrt(dfn * stats.f.ppf(level, dfn, dfd))

        # Map points on the unit circle through the Cholesky factor to
        # get Cartesian coordinates along the ellipse boundary
        theta = np.linspace(0, 2 * np.pi, segments)
        unit_circle = np.column_stack([np.cos(theta), np.sin(theta)])
        boundary = center + radius * np.dot(unit_circle, chol_decomp)

        return pd.DataFrame({"x": boundary[:, 0], "y": boundary[:, 1]})
def cov_trob(
    x,
    wt: Optional[FloatArrayLike] = None,
    cor=False,
    center: FloatArrayLike | bool = True,
    nu=5,
    maxit=25,
    tol=0.01,
):
    """
    Covariance Estimation for Multivariate t Distribution

    Estimates a covariance or correlation matrix assuming the data
    came from a multivariate t distribution: this provides some degree
    of robustness to outlier without giving a high breakdown point.

    **credit**: This function a port of the R function
    ``MASS::cov.trob``.

    Parameters
    ----------
    x : array
        data matrix. Missing values (NaNs) are not allowed.
    wt : array
        A vector of weights for each case: these are treated as if
        the case i actually occurred ``wt[i]`` times.
    cor : bool
        Flag to choose between returning the correlation
        (``cor=True``) or covariance (``cor=False``) matrix.
    center : array or bool
        A logical value or a numeric vector providing the location
        about which the covariance is to be taken. If ``center=False``,
        no centering is done; if ``center=True`` the MLE of the
        location vector is used.
    nu : int
        'degrees of freedom' for the multivariate t distribution.
        Must exceed 2 (so that the covariance matrix is finite).
    maxit : int
        Maximum number of iterations in fitting.
    tol : float
        Convergence tolerance for fitting.

    Returns
    -------
    out : dict
        A dictionary with with the following key-value

        - ``cov`` : the fitted covariance matrix.
        - ``center`` : the estimated or specified location vector.
        - ``wt0`` : the specified weights: only returned if the
          wt argument was given.
        - ``n_obs`` : the number of cases used in the fitting.
        - ``cor`` : the fitted correlation matrix: only returned if
          ``cor=True``.
        - ``iter`` : The number of iterations used.

    References
    ----------
    - J. T. Kent, D. E. Tyler and Y. Vardi (1994) A curious likelihood
      identity for the multivariate t-distribution. *Communications in
      Statistics-Simulation and Computation* **23**, 441-453.
    - Venables, W. N. and Ripley, B. D. (1999) *Modern Applied
      Statistics with S-PLUS*. Third Edition. Springer.
    """
    from scipy import linalg

    def test_values(x):
        # The fit cannot proceed with missing/non-finite data
        if pd.isna(x).any() or np.isinf(x).any():
            raise ValueError("Missing or infinite values in 'x'")

    def scale_simp(x, center, n, p):
        # Subtract the location vector from every row of x
        return x - np.repeat([center], n, axis=0)

    x = np.asarray(x)
    n, p = x.shape
    test_values(x)
    ans: dict[str, Any] = {}

    # wt: default to unit weights; otherwise validate and drop
    # zero-weight cases
    if wt is None:
        wt = np.ones(n)
    else:
        wt = np.asarray(wt)
        ans["wt0"] = wt
        if len(wt) != n:
            raise ValueError(
                "length of 'wt' must equal number of observations."
            )
        if any(wt < 0):
            raise ValueError("Negative weights not allowed.")
        if not np.sum(wt):
            raise ValueError("No positive weights.")
        x = x[wt > 0, :]
        wt = wt[wt > 0]
        n, _ = x.shape

    wt = wt[:, np.newaxis]

    # loc: the location vector. Only re-estimated each iteration when
    # center=True (the MLE case).
    use_loc = False
    if isinstance(center, bool):
        if center:
            loc = np.sum(wt * x, axis=0) / wt.sum()
            use_loc = True
        else:
            loc = np.zeros(p)
    else:
        if len(center) != p:
            raise ValueError("'center' is not the right length")
        # BUG FIX: this previously read `loc = p`, which used the
        # number of columns (an int) as the location vector instead of
        # the user-supplied center.
        loc = np.asarray(center)

    # Iteratively re-weighted fit (EM for the t distribution scatter)
    w = wt * (1 + p / nu)
    for iteration in range(maxit):
        w0 = w
        X = scale_simp(x, loc, n, p)
        _, s, v = linalg.svd(np.sqrt(w / np.sum(w)) * X)
        # wX = X @ v.T @ np.diag(np.full(p, 1/s))
        wX = np.dot(np.dot(X, v.T), np.diag(np.full(p, 1 / s)))
        # Q = np.squeeze((wX**2) @ np.ones(p))
        Q = np.squeeze(np.dot(wX**2, np.ones(p)))
        # Update the case weights from the squared Mahalanobis
        # distances
        w = (wt * (nu + p)) / (nu + Q)[:, np.newaxis]

        if use_loc:
            loc = np.sum(w * x, axis=0) / w.sum()

        if all(np.abs(w - w0) < tol):
            break
    else:  # nobreak
        _c1 = np.mean(w) - np.mean(wt) > tol
        _c2 = np.abs(np.mean(w * Q) / p - 1) > tol  # pyright: ignore
        if _c1 and _c2:
            warn("Convergence probably failed.", PlotnineWarning)

    _a = np.sqrt(w) * X  # pyright: ignore[reportUnboundVariable]
    # cov = (_a.T @ _a) / np.sum(wt)
    cov = np.dot(_a.T, _a) / np.sum(wt)

    if cor:
        sd = np.sqrt(np.diag(cov))
        ans["cor"] = (cov / sd) / np.repeat([sd], p, axis=0).T

    ans.update(
        cov=cov,
        center=loc,
        n_obs=n,
        iter=iteration,  # pyright: ignore[reportUnboundVariable]
    )
    return ans