Source code for plotnine.stats.stat_bindot

from __future__ import annotations

from warnings import warn

import numpy as np
import pandas as pd

from ..doctools import document
from ..exceptions import PlotnineError, PlotnineWarning
from ..mapping.evaluation import after_stat
from ..utils import groupby_apply
from .binning import (
    assign_bins,
    breaks_from_bins,
    breaks_from_binwidth,
    freedman_diaconis_bins,
)
from .stat import stat


@document
class stat_bindot(stat):
    """
    Binning for a dot plot

    {usage}

    Parameters
    ----------
    {common_parameters}
    bins : int, optional (default: None)
        Number of bins. Overridden by binwidth. If :py:`None`,
        a number is computed using the freedman-diaconis method.
    binwidth : float, optional (default: None)
        When :py:`method='dotdensity'`, this specifies the maximum
        binwidth. When :py:`method='histodot'`, this specifies the
        binwidth. This supersedes the ``bins``.
    origin : float, optional (default: None)
        When :py:`method='histodot'`, origin of the first bin.
    width : float, optional (default: 0.9)
        When :py:`binaxis='y'`, the spacing of the dotstacks for
        dodging.
    binaxis : str, optional (default: x)
        Axis to bin along. Either :py:`'x'` or :py:`'y'`
    method : str, optional (default: dotdensity)
        One of *dotdensity* or *histodot*. These provide either of
        dot-density binning or fixed bin widths.
    binpositions : str, optional (default: bygroup)
        Position of the bins when :py:`method='dotdensity'`. The value
        is one of::

            'bygroup'  # positions of the bins for each group are
                       # determined separately.
            'all'      # positions of the bins are determined with all
                       # data taken together. This aligns the dots
                       # stacks across multiple groups.

    drop : bool, optional (default: False)
        If :py:`True`, remove all bins with zero counts.
    right : bool, optional (default: True)
        When :py:`method='histodot'`, :py:`True` means include right
        edge of the bins and if :py:`False` the left edge is included.
    breaks : array-like, optional (default: None)
        Bin boundaries for :py:`method='histodot'`. This supersedes the
        ``binwidth`` and ``bins``.

    See Also
    --------
    plotnine.stats.stat_bin
    """

    _aesthetics_doc = """
    {aesthetics_table}

    .. rubric:: Options for computed aesthetics

    ::

         'count'    # number of points in bin
         'density'  # density of points in bin, scaled to integrate to 1
         'ncount'   # count, scaled to maximum of 1
         'ndensity' # density, scaled to maximum of 1

    """

    REQUIRED_AES = {"x"}
    NON_MISSING_AES = {"weight"}
    DEFAULT_PARAMS = {
        "geom": "dotplot",
        "position": "identity",
        "na_rm": False,
        "bins": None,
        "binwidth": None,
        "origin": None,
        "width": 0.9,
        "binaxis": "x",
        "method": "dotdensity",
        "binpositions": "bygroup",
        "drop": False,
        "right": True,
        "breaks": None,
    }
    DEFAULT_AES = {"y": after_stat("count")}
    CREATES = {"width", "count", "density", "ncount", "ndensity"}

    def setup_params(self, data):
        """
        Fill in a default number of bins when none was requested

        When ``breaks``, ``binwidth`` and ``bins`` are all :py:`None`,
        a copy of the parameters is returned with ``bins`` computed by
        :func:`freedman_diaconis_bins` and a :class:`PlotnineWarning`
        is issued. Otherwise the parameters are returned unchanged.

        Parameters
        ----------
        data : DataFrame
            Layer data.

        Returns
        -------
        params : dict
            Parameters to use for the computation.
        """
        params = self.params
        nothing_specified = (
            params["breaks"] is None
            and params["binwidth"] is None
            and params["bins"] is None
        )

        if nothing_specified:
            # Copy so that the stored parameters are left untouched
            params = params.copy()

            # Estimate from the variable that is actually binned;
            # fall back to x if that column is not available.
            binaxis = params["binaxis"]
            column = binaxis if binaxis in data else "x"
            params["bins"] = freedman_diaconis_bins(data[column])

            msg = (
                "'stat_bindot()' using 'bins = {}'. "
                "Pick better value with 'binwidth'."
            )
            warn(msg.format(params["bins"]), PlotnineWarning)

        return params

    @classmethod
    def compute_panel(cls, data, scales, **params):
        """
        Compute the statistic for a single panel

        For :py:`method='dotdensity'` with :py:`binpositions='all'`,
        the bins are determined here from all the data in the panel
        and stored in the ``bin``, ``binwidth``, ``weight`` and
        ``bincenter`` columns before the per-group computation runs.

        Raises
        ------
        ValueError
            If ``binaxis`` is neither :py:`'x'` nor :py:`'y'` when
            the bins are determined for the whole panel.
        """
        bin_all_together = (
            params["method"] == "dotdensity"
            and params["binpositions"] == "all"
        )

        if bin_all_together:
            binaxis = params["binaxis"]
            if binaxis not in ("x", "y"):
                raise ValueError(f"Unknown value {binaxis=}")

            # A plain array keeps the binning positional, whatever
            # index the panel data carries.
            newdata = densitybin(
                x=data[binaxis].to_numpy(),
                weight=data.get("weight"),
                binwidth=params["binwidth"],
                bins=params["bins"],
            )

            # Put both frames in the same (sorted) row order so that
            # the columns below line up when they are copied across.
            data = data.sort_values(binaxis).reset_index(drop=True)
            newdata = newdata.sort_values("x").reset_index(drop=True)

            data["bin"] = newdata["bin"]
            data["binwidth"] = newdata["binwidth"]
            data["weight"] = newdata["weight"]
            data["bincenter"] = newdata["bincenter"]

        # Zero-argument super() resolves correctly for subclasses too
        return super().compute_panel(data, scales, **params)

    @classmethod
    def compute_group(cls, data, scales, **params):
        """
        Compute the dot bins for a single group

        Returns
        -------
        data : DataFrame
            One row per bin. It has the bin position in ``x`` (or
            ``y`` when binning along y), ``binwidth`` and ``count``.
            ``ncount`` is added when the total count is not zero and
            ``width`` is added when binning along x.

        Raises
        ------
        PlotnineError
            If any weight is negative or is not a whole number.
        """
        # Check that weights are whole numbers
        # (for dots, weights must be whole)
        weight: pd.Series | None = data.get("weight")  # pyright: ignore
        if weight is not None:
            valid = all((w * 1.0).is_integer() and w >= 0 for w in weight)
            if not valid:
                raise PlotnineError(
                    "Weights for stat_bindot must be nonnegative integers."
                )

        if params["binaxis"] == "x":
            rangee = scales.x.dimension((0, 0))
            values = data["x"].to_numpy()
            midline = 0  # Make pyright happy
        else:
            rangee = scales.y.dimension((0, 0))
            values = data["y"].to_numpy()
            # The middle of each group, on the stack axis
            midline = np.mean([data["x"].min(), data["x"].max()])

        if params["method"] == "histodot":
            if params["binwidth"] is not None:
                breaks = breaks_from_binwidth(
                    rangee, params["binwidth"], boundary=params["origin"]
                )
            else:
                breaks = breaks_from_bins(
                    rangee, params["bins"], boundary=params["origin"]
                )

            closed = "right" if params["right"] else "left"
            data = assign_bins(
                values, breaks, data.get("weight"), pad=False, closed=closed
            )
            # for consistency
            data.rename(
                columns={"width": "binwidth", "x": "bincenter"}, inplace=True
            )
        elif params["method"] == "dotdensity":
            # If bin centers are found by group instead of by all,
            # find the bin centers (If binpositions=="all", then
            # we'll already have bin centers.)
            if params["binpositions"] == "bygroup":
                data = densitybin(
                    x=values,
                    weight=weight,
                    binwidth=params["binwidth"],
                    bins=params["bins"],
                    rangee=rangee,
                )

            # Collapse each bin and get a count
            def func(df):
                return pd.DataFrame(
                    {
                        "binwidth": [df["binwidth"].iloc[0]],
                        "bincenter": [df["bincenter"].iloc[0]],
                        "count": [int(df["weight"].sum())],
                    }
                )

            # plyr::ddply + plyr::summarize
            data = groupby_apply(data, "bincenter", func)

        if data["count"].sum() != 0:
            data.loc[np.isnan(data["count"]), "count"] = 0
            data["ncount"] = data["count"] / data["count"].abs().max()
            if params["drop"]:
                data = data[data["count"] > 0]
                data.reset_index(inplace=True, drop=True)

        if params["binaxis"] == "x":
            data["x"] = data.pop("bincenter")
            # For x binning, the width of the geoms
            # is same as the width of the bin
            data["width"] = data["binwidth"]
        else:
            data["y"] = data.pop("bincenter")
            # For y binning, set the x midline.
            # This is needed for continuous x axis
            data["x"] = midline

        return data
def densitybin(x, weight=None, binwidth=None, bins=None, rangee=None):
    """
    Do density binning

    The values are scanned in increasing order. A new bin is started
    at the first value that lies at or beyond the end of the current
    bin, and that bin extends ``binwidth`` from the value.

    It does not collapse each bin with a count.

    Parameters
    ----------
    x : array-like
        Numbers to bin
    weight : array-like
        Weights. Missing weights are treated as zero. The input
        is not modified. If :py:`None`, every value has a weight
        of 1.
    binwidth : numeric
        Size of the bins. If :py:`None`, it is the extent of
        ``rangee`` divided by ``bins``.
    bins : int
        Number of bins. Only used when ``binwidth`` is :py:`None`.
        If :py:`None`, 30 is used.
    rangee : tuple
        Range of x. If :py:`None`, the minimum and maximum of
        ``x`` are used.

    Returns
    -------
    data : DataFrame
        One row per value, sorted by ``x``, with columns ``x``,
        ``bin``, ``binwidth``, ``weight`` and ``bincenter``. It is
        an empty dataframe if there are no non-missing values.
    """
    # Work on a plain array so that the reordering below is positional
    # for every kind of array-like (lists, series with any index, ...)
    x = np.asarray(x)

    if np.all(pd.isna(x)):
        return pd.DataFrame()

    if weight is None:
        weight = np.ones(len(x))
    else:
        # Copy, the missing weights are overwritten below and that
        # must not leak into the caller's data
        weight = np.array(weight)
    weight[np.isnan(weight)] = 0

    if rangee is None:
        rangee = np.min(x), np.max(x)
    if bins is None:
        bins = 30
    if binwidth is None:
        binwidth = np.ptp(rangee) / bins  # type: ignore

    # Sort weight and x, by x. A stable sort keeps tied values in
    # their input order, making the output deterministic.
    order = np.argsort(x, kind="stable")
    weight = weight[order]
    x = x[order]

    cbin = 0  # Current bin ID
    bin_ids = []  # The bin ID for each observation
    # End position of current bin (scan left to right)
    binend = -np.inf

    # Scan list and put dots in bins
    for value in x:
        # If past end of bin, start a new bin at this point
        if value >= binend:
            binend = value + binwidth
            cbin = cbin + 1
        bin_ids.append(cbin)

    results = pd.DataFrame(
        {
            "x": x,
            "bin": bin_ids,
            "binwidth": binwidth,
            "weight": weight,
        }
    )

    # This is a plyr::ddply
    # The center of a bin is midway between its extreme values
    by_bin = results.groupby("bin")["x"]
    results["bincenter"] = (
        by_bin.transform("min") + by_bin.transform("max")
    ) / 2
    return results