# Source code for plotnine.stats.stat_sina

import numpy as np
import pandas as pd

from ..doctools import document
from ..exceptions import PlotnineError
from ..mapping.aes import has_groups
from ..mapping.evaluation import after_stat
from ..utils import array_kind, jitter, resolution
from .binning import breaks_from_bins, breaks_from_binwidth
from .stat import stat
from .stat_density import compute_density


@document
class stat_sina(stat):
    """
    Compute Sina plot values

    {usage}

    Parameters
    ----------
    {common_parameters}
    binwidth : float
        The width of the bins. The default is to use bins that
        cover the range of the data. You should always override this
        value, exploring multiple widths to find the best to
        illustrate the stories in your data.
    bins : int (default: 50)
        Number of bins. Overridden by binwidth.
    method : 'density' or 'counts'
        Choose the method to spread the samples within the same bin
        along the x-axis. ``'density'`` uses a kernel density
        estimate of ``y``; any other value uses the number of samples
        in each y-axis bin. The value is compared exactly, it cannot
        be abbreviated.
    maxwidth : float
        Control the maximum width the points can spread into.
        Values should be in the range (0, 1). If ``None``, 90% of
        the resolution of ``x`` is used.
    adjust : float, optional (default: 1)
        Adjusts the bandwidth of the density kernel when
        ``method='density'`` (see density).
    bw : str or float, optional (default: 'nrd0')
        The bandwidth to use, If a float is given, it is the bandwidth.
        The :py:`str` choices are::

            'nrd0'
            'normal_reference'
            'scott'
            'silverman'

        ``nrd0`` is a port of ``stats::bw.nrd0`` in R; it is equivalent
        to ``silverman`` when there is more than 1 value in a group.
    bin_limit : int (default: 1)
        If the samples within the same y-axis bin are more
        than `bin_limit`, the samples' X coordinates will be adjusted.
        This parameter is effective only when :py:`method='counts'`
    random_state : int or ~numpy.random.RandomState, optional
        Seed or Random number generator to use. If ``None``, then
        numpy global generator :class:`numpy.random` is used.
    scale : str (default: area)
        How to scale the sina groups. The options are::

            'area'   # Scale by the largest density/bin among the different
                     # sinas
            'count'  # areas are scaled proportionally to the number of points
            'width'  # Only scale according to the maxwidth parameter.

    See Also
    --------
    plotnine.geoms.geom_sina
    """

    # NOTE(review): this template advertises 'quantile', but the columns
    # created in this class are density/scaled/sinawidth/width/n (and
    # CREATES lists only "scaled") -- confirm and update the text.
    _aesthetics_doc = """
    {aesthetics_table}

    .. rubric:: Options for computed aesthetics

    ::

        'quantile'  # quantile
        'group'     # group identifier

    Calculated aesthetics are accessed using the `after_stat` function.
    e.g. :py:`after_stat('quantile')`.
    """

    REQUIRED_AES = {"x", "y"}
    DEFAULT_AES = {"xend": after_stat("scaled")}
    DEFAULT_PARAMS = {
        "geom": "sina",
        "position": "dodge",
        "na_rm": False,
        "binwidth": None,
        "bins": None,
        "method": "density",
        "bw": "nrd0",
        "maxwidth": None,
        "adjust": 1,
        "bin_limit": 1,
        "random_state": None,
        "scale": "area",
    }
    CREATES = {"scaled"}

    def setup_data(self, data):
        """
        Reject ungrouped data whose continuous x takes several values

        Parameters
        ----------
        data : dataframe
            Layer data; must contain an ``x`` column.

        Returns
        -------
        dataframe
            The data, unchanged.

        Raises
        ------
        TypeError
            If ``x`` is continuous, there are no groups and ``x`` is
            not constant.
        """
        if (
            array_kind.continuous(data["x"])
            and not has_groups(data)
            # An empty frame has no first value to compare against
            and not data.empty
            # Compare every x with the first one (by position)
            and (data["x"] != data["x"].iloc[0]).any()
        ):
            raise TypeError(
                "Continuous x aesthetic -- did you forget "
                "aes(group=...)?"
            )
        return data

    def setup_params(self, data):
        """
        Fill in the data dependent defaults of the parameters

        Parameters
        ----------
        data : dataframe
            Layer data; ``x`` is used to derive the default
            ``maxwidth``.

        Returns
        -------
        dict
            A copy of ``self.params`` with ``maxwidth``, ``bins`` and
            ``random_state`` resolved, plus the fixed options that
            ``compute_density`` requires.
        """
        params = self.params.copy()
        random_state = params["random_state"]

        if params["maxwidth"] is None:
            params["maxwidth"] = resolution(data["x"], False) * 0.9

        if params["binwidth"] is None and params["bins"] is None:
            params["bins"] = 50

        if random_state is None:
            params["random_state"] = np.random
        elif isinstance(random_state, (int, np.integer)):
            # Seeds may be builtin or numpy integers
            params["random_state"] = np.random.RandomState(random_state)

        # Required by compute_density
        params["kernel"] = "gau"  # It has to be a gaussian kernel
        params["cut"] = 0
        params["gridsize"] = None
        params["clip"] = (-np.inf, np.inf)
        params["n"] = 512
        return params

    @classmethod
    def compute_panel(cls, data, scales, **params):
        """
        Compute the sina widths and random x offsets for a panel

        The bin breaks are derived from the dimension of the y scale,
        the groups are computed by the parent ``compute_panel`` and
        the result is scaled according to ``params["scale"]``.

        Parameters
        ----------
        data : dataframe
            Panel data.
        scales : object
            Scales of the panel; ``scales.y.dimension()`` is used.
        **params : dict
            Parameters as returned by ``setup_params``.

        Returns
        -------
        dataframe
            Group results with ``sinawidth``, ``xmin``, ``xmax``,
            ``x_diff`` and ``width`` columns added.

        Raises
        ------
        PlotnineError
            If ``params["scale"]`` is not one of ``'area'``,
            ``'count'`` or ``'width'``.
        """
        maxwidth = params["maxwidth"]
        random_state = params["random_state"]

        # Widen the y range a little; presumably so that values on the
        # scale limits fall inside the outermost bins
        fuzz = 1e-8
        y_dim = scales.y.dimension()
        y_dim_fuzzed = (y_dim[0] - fuzz, y_dim[1] + fuzz)

        if params["binwidth"] is not None:
            params["bins"] = breaks_from_binwidth(
                y_dim_fuzzed, params["binwidth"]
            )
        else:
            params["bins"] = breaks_from_bins(y_dim_fuzzed, params["bins"])

        # super(stat_sina, cls) also works when cls is a subclass,
        # the reversed argument order raises a TypeError in that case
        data = super(stat_sina, cls).compute_panel(data, scales, **params)

        if not len(data):
            return data

        if params["scale"] == "area":
            data["sinawidth"] = data["density"] / data["density"].max()
        elif params["scale"] == "count":
            data["sinawidth"] = (
                data["density"]
                / data["density"].max()
                * data["n"]
                / data["n"].max()
            )
        elif params["scale"] == "width":
            data["sinawidth"] = data["scaled"]
        else:
            msg = "Unknown scale value '{}'"
            raise PlotnineError(msg.format(params["scale"]))

        data["xmin"] = data["x"] - maxwidth / 2
        data["xmax"] = data["x"] + maxwidth / 2
        # Random horizontal offset, at most half of the scaled width
        data["x_diff"] = (
            random_state.uniform(-1, 1, len(data))
            * maxwidth
            * data["sinawidth"]
            / 2
        )
        data["width"] = maxwidth

        # jitter y values if the input is integer
        if (data["y"] == np.floor(data["y"])).all():
            data["y"] = jitter(data["y"], random_state=random_state)

        return data

    @classmethod
    def compute_group(cls, data, scales, **params):
        """
        Estimate the density of the y values of a single group

        Parameters
        ----------
        data : dataframe
            Group data with ``x`` and ``y`` columns. Columns are added
            to it in place.
        scales : object
            Not used by this method.
        **params : dict
            Parameters; ``maxwidth``, ``bins``, ``bin_limit`` and
            ``method`` are read here and all of them are forwarded to
            ``compute_density`` when ``method`` is ``'density'``.

        Returns
        -------
        dataframe
            The data with ``density``, ``scaled``, ``width`` and ``n``
            columns, and ``x`` replaced by the midpoint of its range.
            An empty dataframe if the group is empty.
        """
        maxwidth = params["maxwidth"]
        bins = params["bins"]
        bin_limit = params["bin_limit"]
        weight = None

        if len(data) == 0:
            return pd.DataFrame()
        elif len(data) < 3:
            # Groups of one or two points get fixed values
            data["density"] = 0
            data["scaled"] = 1
        elif params["method"] == "density":
            from scipy.interpolate import interp1d

            # density kernel estimation
            range_y = data["y"].min(), data["y"].max()
            dens = compute_density(data["y"], weight, range_y, **params)
            # Evaluate the estimated density at the observed y values
            densf = interp1d(
                dens["x"],
                dens["density"],
                bounds_error=False,
                fill_value="extrapolate",  # pyright: ignore
            )
            data["density"] = densf(data["y"])
            data["scaled"] = data["density"] / dens["density"].max()
        else:
            # bin based estimation
            bin_index = pd.cut(
                data["y"], bins, include_lowest=True, labels=False
            )
            # For every sample, the number of samples sharing its bin
            data["density"] = (
                pd.Series(bin_index)
                .groupby(bin_index)
                .apply(len)[bin_index]
                .to_numpy()
            )
            # Bins with no more than bin_limit samples are not spread
            data.loc[data["density"] <= bin_limit, "density"] = 0
            data["scaled"] = data["density"] / data["density"].max()

        # Compute width if x has multiple values
        if len(data["x"].unique()) > 1:
            width = np.ptp(data["x"]) * maxwidth
        else:
            width = maxwidth

        data["width"] = width
        data["n"] = len(data)
        data["x"] = np.mean([data["x"].max(), data["x"].min()])

        return data

    def finish_layer(self, data, params):
        """
        Apply the random x offsets to the final positions

        Parameters
        ----------
        data : dataframe
            Layer data with ``x``, ``xmin``, ``xmax``, ``width`` and
            ``x_diff`` columns.
        params : dict
            Not used by this method.

        Returns
        -------
        dataframe
            The data with ``x`` shifted by the rescaled ``x_diff``.
        """
        # Rescale x in case positions have been adjusted
        x_mod = (data["xmax"] - data["xmin"]) / data["width"]
        data["x"] = data["x"] + data["x_diff"] * x_mod
        return data