# Source code for plotnine.stats.stat_boxplot

import numpy as np
import pandas as pd
import pandas.api.types as pdtypes

from ..doctools import document
from ..utils import resolution
from .stat import stat


@document
class stat_boxplot(stat):
    """
    Compute boxplot statistics

    {usage}

    Parameters
    ----------
    {common_parameters}
    coef : float, optional (default: 1.5)
        Length of the whiskers as a multiple of the Interquartile
        Range.

    See Also
    --------
    plotnine.geoms.geom_boxplot
    """

    _aesthetics_doc = """
    {aesthetics_table}

    .. rubric:: Options for computed aesthetics

    ::

        'width'  # width of boxplot
        'lower'  # lower hinge, 25% quantile
        'middle' # median, 50% quantile
        'upper'  # upper hinge, 75% quantile
        'notchlower' # lower edge of notch, computed as;
                     # median - 1.58 * IQR / sqrt(n)
        'notchupper' # upper edge of notch, computed as;
                     # median + 1.58 * IQR / sqrt(n)
        'ymin'  # lower whisker, computed as; smallest observation
                # greater than or equal to lower hinge - 1.5 * IQR
        'ymax'  # upper whisker, computed as; largest observation
                # less than or equal to upper hinge + 1.5 * IQR

    Calculated aesthetics are accessed using the `after_stat` function.
    e.g. :py:`after_stat('width')`.
    """

    REQUIRED_AES = {"x", "y"}
    NON_MISSING_AES = {"weight"}
    DEFAULT_PARAMS = {
        "geom": "boxplot",
        "position": "dodge",
        "na_rm": False,
        "coef": 1.5,
        "width": None,
    }
    CREATES = {
        "lower",
        "upper",
        "middle",
        "ymin",
        "ymax",
        "outliers",
        "notchupper",
        "notchlower",
        "width",
        "relvarwidth",
    }

    def setup_params(self, data):
        """
        Fill in the ``width`` parameter when the user did not set it

        When ``width`` is ``None`` it is replaced (in ``self.params``)
        with 0.75 times ``resolution(data["x"], False)``.

        Parameters
        ----------
        data : pandas.DataFrame
            Data for the layer; only the ``x`` column is read.

        Returns
        -------
        out : dict
            The (possibly updated) ``self.params``.
        """
        if self.params["width"] is None:
            self.params["width"] = resolution(data["x"], False) * 0.75
        return self.params

    @classmethod
    def compute_group(cls, data, scales, **params):
        """
        Compute the boxplot statistics for a single group

        Parameters
        ----------
        data : pandas.DataFrame
            Group data with ``x`` and ``y`` columns and, optionally,
            a ``weight`` column.
        scales
            Not used by this method.
        **params
            Stat parameters; ``coef`` and ``width`` are read.

        Returns
        -------
        out : pandas.DataFrame
            A single-row dataframe with the columns ``ymin``,
            ``lower``, ``middle``, ``upper``, ``ymax``, ``outliers``,
            ``notchupper``, ``notchlower``, ``x``, ``width`` and
            ``relvarwidth``.
        """
        y = data["y"].to_numpy()

        if "weight" in data:
            weights = data["weight"]
            total_weight = np.sum(weights)
        else:
            weights = None
            total_weight = len(y)

        res = weighted_boxplot_stats(y, weights=weights, whis=params["coef"])

        # A group that spans several x values gets a box that covers
        # 90% of that span; otherwise the configured width is used.
        if len(np.unique(data["x"])) > 1:
            width = np.ptp(data["x"]) * 0.9  # type: ignore
        else:
            width = params["width"]

        # isinstance check on the dtype replaces the deprecated
        # pandas.api.types.is_categorical_dtype
        if isinstance(data["x"].dtype, pd.CategoricalDtype):
            x = data["x"].iloc[0]
        else:
            x = np.mean([data["x"].min(), data["x"].max()])

        # "middle" and "outliers" are wrapped in lists so that the
        # dataframe has exactly one row, with the array of outliers
        # stored as a single cell.
        d = {
            "ymin": res["whislo"],
            "lower": res["q1"],
            "middle": [res["med"]],
            "upper": res["q3"],
            "ymax": res["whishi"],
            "outliers": [res["fliers"]],
            "notchupper": res["cihi"],
            "notchlower": res["cilo"],
            "x": x,
            "width": width,
            "relvarwidth": np.sqrt(total_weight),
        }
        return pd.DataFrame(d)


def weighted_percentile(a, q, weights=None):
    """
    Compute the weighted q-th percentile of data

    Parameters
    ----------
    a : array_like
        Input that can be converted into an array.
    q : array_like[float]
        Percentile or sequence of percentiles to compute. Must be in
        the range [0, 100]
    weights : array_like, optional
        Weights associated with the input values. If ``None``, every
        value is given a weight of 1.

    Returns
    -------
    out : float | numpy.ndarray
        The interpolated percentile(s); one value for each entry
        in ``q``.
    """
    # Calculate and interpolate weighted percentiles
    # method derived from https://en.wikipedia.org/wiki/Percentile
    # using numpy's standard C = 1

    # Lists and tuples are accepted, so convert before the
    # array-based indexing below.
    a = np.asarray(a)
    if weights is None:
        weights = np.ones(len(a))

    weights = np.asarray(weights)
    q = np.asarray(q)

    C = 1
    idx_s = np.argsort(a)
    a_s = a[idx_s]
    w_n = weights[idx_s]
    S_N = np.sum(weights)
    S_n = np.cumsum(w_n)
    # Percentile rank (as a fraction) assigned to each sorted value
    p_n = (S_n - C * w_n) / (S_N + (1 - 2 * C) * w_n)
    pcts = np.interp(q / 100.0, p_n, a_s)
    return pcts


def weighted_boxplot_stats(x, weights=None, whis=1.5):
    """
    Calculate weighted boxplot plot statistics

    Parameters
    ----------
    x : array_like
        Data
    weights : array_like, optional
        Weights associated with the data.
    whis : float, optional (default: 1.5)
        Position of the whiskers beyond the interquartile range.
        The data beyond the whisker are considered outliers.

        If a float, the lower whisker is at the lowest datum above
        ``Q1 - whis*(Q3-Q1)``, and the upper whisker at the highest
        datum below ``Q3 + whis*(Q3-Q1)``, where Q1 and Q3 are the
        first and third quartiles.  The default value of
        ``whis = 1.5`` corresponds to Tukey's original definition of
        boxplots.

    Returns
    -------
    out : dict
        Dictionary with the keys ``fliers``, ``mean``, ``med``,
        ``q1``, ``q3``, ``iqr``, ``whislo``, ``whishi``, ``cilo``
        and ``cihi``.

    Notes
    -----
    This method adapted from Matplotlibs boxplot_stats. The key
    difference is the use of a weighted percentile calculation and
    then using linear interpolation to map weight percentiles back
    to data.
    """
    # The boolean-mask indexing below needs an ndarray, so accept
    # any array_like (list, tuple, ...) by converting up front.
    x = np.asarray(x)

    if weights is None:
        q1, med, q3 = np.percentile(x, (25, 50, 75))
        n = len(x)
    else:
        weights = np.asarray(weights)
        q1, med, q3 = weighted_percentile(x, (25, 50, 75), weights)
        n = np.sum(weights)

    iqr = q3 - q1
    mean = np.average(x, weights=weights)

    # Notch limits around the median
    cilo = med - 1.58 * iqr / np.sqrt(n)
    cihi = med + 1.58 * iqr / np.sqrt(n)

    # low extreme
    loval = q1 - whis * iqr
    lox = x[x >= loval]
    # The whisker never ends inside the box; fall back to the hinge
    if len(lox) == 0 or np.min(lox) > q1:
        whislo = q1
    else:
        whislo = np.min(lox)

    # high extreme
    hival = q3 + whis * iqr
    hix = x[x <= hival]
    if len(hix) == 0 or np.max(hix) < q3:
        whishi = q3
    else:
        whishi = np.max(hix)

    bpstats = {
        "fliers": x[(x < whislo) | (x > whishi)],
        "mean": mean,
        "med": med,
        "q1": q1,
        "q3": q3,
        "iqr": iqr,
        "whislo": whislo,
        "whishi": whishi,
        "cilo": cilo,
        "cihi": cihi,
    }
    return bpstats