import numpy as np
import pandas as pd
from ..doctools import document
from .density import get_var_type, kde
from .stat import stat
[docs]@document
class stat_density_2d(stat):
"""
Compute 2D kernel density estimation
{usage}
Parameters
----------
{common_parameters}
contour : bool
Whether to create contours of the 2d density estimate.
Default is True.
n : int, optional(default: 64)
Number of equally spaced points at which the density is to
be estimated. For efficient computation, it should be a power
of two.
levels : int or array_like
Contour levels. If an integer, it specifies the maximum number
of levels, if array_like it is the levels themselves. Default
is 5.
package : str in ``['statsmodels', 'scipy', 'sklearn']``
Package whose kernel density estimation to use. Default is
statsmodels.
kde_params : dict
Keyword arguments to pass on to the kde class.
See Also
--------
statsmodels.nonparametric.kde.KDEMultivariate
scipy.stats.gaussian_kde
sklearn.neighbors.KernelDensity
"""
_aesthetics_doc = """
{aesthetics_table}
.. rubric:: Options for computed aesthetics
::
'level' # density level of a contour
'density' # Computed density at a point
'piece' # Numeric id of a contour in a given group
`level` is only relevant when contours are computed. `density`
is available only when no contours are computed. `piece` is
largely irrelevant.
"""
REQUIRED_AES = {"x"}
DEFAULT_PARAMS = {
"geom": "density_2d",
"position": "identity",
"na_rm": False,
"contour": True,
"package": "statsmodels",
"kde_params": None,
"n": 64,
"levels": 5,
}
CREATES = {"y"}
def setup_params(self, data):
params = self.params.copy()
if params["kde_params"] is None:
params["kde_params"] = {}
kde_params = params["kde_params"]
if params["package"] == "statsmodels":
params["package"] = "statsmodels-m"
if "var_type" not in kde_params:
kde_params["var_type"] = "{}{}".format(
get_var_type(data["x"]), get_var_type(data["y"])
)
return params
@classmethod
def compute_group(cls, data, scales, **params):
package = params["package"]
kde_params = params["kde_params"]
group = data["group"].iloc[0]
range_x = scales.x.dimension()
range_y = scales.y.dimension()
x = np.linspace(range_x[0], range_x[1], params["n"])
y = np.linspace(range_y[0], range_y[1], params["n"])
# The grid must have a "similar" shape (n, p) to the var_data
X, Y = np.meshgrid(x, y)
var_data = np.array([data["x"].to_numpy(), data["y"].to_numpy()]).T
grid = np.array([X.flatten(), Y.flatten()]).T
density = kde(var_data, grid, package, **kde_params)
if params["contour"]:
Z = density.reshape(len(x), len(y))
data = contour_lines(X, Y, Z, params["levels"])
# Each piece should have a distinct group
groups = str(group) + "-00" + data["piece"].astype(str)
data["group"] = groups
else:
data = pd.DataFrame(
{
"x": X.flatten(),
"y": Y.flatten(),
"density": density.flatten(),
"group": group,
"level": 1,
"piece": 1,
}
)
return data
def contour_lines(X, Y, Z, levels):
"""
Calculate contour lines
"""
from contourpy import contour_generator
# Preparation of values and the creating of contours is
# adapted from MPL with some adjustments.
X = np.asarray(X, dtype=np.float64)
Y = np.asarray(Y, dtype=np.float64)
Z = np.asarray(Z, dtype=np.float64)
zmin, zmax = Z.min(), Z.max()
cgen = contour_generator(
X, Y, Z, name="mpl2014", corner_mask=False, chunk_size=0
)
if isinstance(levels, int):
from mizani.breaks import breaks_extended
levels = breaks_extended(n=levels)((zmin, zmax))
# The counter_generator gives us a list of vertices that
# represent all the contour lines at that level. There
# may be 0, 1 or more vertices at a level. Each one of
# these we call a piece, and it represented as an nx2 array.
#
# We want x-y values that describe *all* the contour lines
# in tidy format. Therefore each x-y vertex has a
# corresponding level and piece id.
segments = []
piece_ids = []
level_values = []
start_pid = 1
for level in levels:
vertices, _ = cgen.create_contour(level)
for pid, piece in enumerate(vertices, start=start_pid):
n = len(piece) # pyright: ignore
segments.append(piece)
piece_ids.append(np.repeat(pid, n))
level_values.append(np.repeat(level, n))
start_pid = pid + 1
# Collapse the info and make it fit for dataframe columns
if segments:
x, y = np.vstack(segments).T
piece = np.hstack(piece_ids)
level = np.hstack(level_values)
else:
x, y = [], []
piece = []
level = []
data = pd.DataFrame(
{
"x": x,
"y": y,
"level": level,
"piece": piece,
}
)
return data