import warnings
import numpy as np
import pandas as pd
from ..doctools import document
from ..exceptions import PlotnineWarning
from .smoothers import predictdf
from .stat import stat
@document
class stat_smooth(stat):
    """
    Calculate a smoothed conditional mean

    {usage}

    Parameters
    ----------
    {common_parameters}

    method : str or callable, optional (default: 'auto')
        The available methods are::

            'auto'       # Use loess if (n<1000), glm otherwise
            'lm', 'ols'  # Linear Model
            'wls'        # Weighted Linear Model
            'rlm'        # Robust Linear Model
            'glm'        # Generalized linear Model
            'gls'        # Generalized Least Squares
            'lowess'     # Locally Weighted Regression (simple)
            'loess'      # Locally Weighted Regression
            'mavg'       # Moving Average
            'gpr'        # Gaussian Process Regressor

        If a `callable` is passed, it must have the signature::

            def my_smoother(data, xseq, **params):
                # * data - has the x and y values for the model
                # * xseq - x values to be predicted
                # * params - stat parameters
                #
                # It must return a new dataframe. Below is the
                # template used internally by Plotnine

                # Input data into the model
                x, y = data['x'], data['y']

                # Create and fit a model
                model = Model(x, y)
                results = model.fit()

                # Create output data by getting predictions on
                # the xseq values
                data = pd.DataFrame({
                    'x': xseq,
                    'y': results.predict(xseq)})

                # Compute confidence intervals, this depends on
                # the model. However, given standard errors and the
                # degrees of freedom we can compute the confidence
                # intervals using the t-distribution.
                #
                # For an alternative, implement confidence intervals by
                # the bootstrap method
                if params['se']:
                    from plotnine.utils.smoothers import tdist_ci
                    y = data['y']            # The predicted value
                    df = 123                 # Degrees of freedom
                    stderr = results.stderr  # Standard error
                    level = params['level']  # The parameter value
                    low, high = tdist_ci(y, df, stderr, level)
                    data['se'] = stderr
                    data['ymin'] = low
                    data['ymax'] = high

                return data

        For *loess* smoothing you must install the `scikit-misc` package.
        You can install it using ``pip install scikit-misc`` or
        ``pip install plotnine[all]``.
    formula : formula_like
        An object that can be used to construct a patsy design matrix.
        This is usually a string. You can only use a formula if ``method``
        is one of *lm*, *ols*, *wls*, *glm*, *rlm* or *gls*, and in the
        :ref:`formula <patsy:formulas>` you may refer to the ``x`` and
        ``y`` aesthetic variables.
    se : bool (default: True)
        If :py:`True` draw confidence interval around the smooth line.
    n : int (default: 80)
        Number of points to evaluate the smoother at. Some smoothers
        like *mavg* do not support this.
    fullrange : bool (default: False)
        If :py:`True` the fit will span the full range of the plot.
    level : float (default: 0.95)
        Level of confidence to use if :py:`se=True`.
    span : float (default: 0.75)
        Controls the amount of smoothing for the *loess* smoother.
        Larger number means more smoothing. It should be in the
        ``(0, 1)`` range.
    method_args : dict (default: {})
        Additional arguments passed on to the modelling method.

    See Also
    --------
    statsmodels.regression.linear_model.OLS
    statsmodels.regression.linear_model.WLS
    statsmodels.robust.robust_linear_model.RLM
    statsmodels.genmod.generalized_linear_model.GLM
    statsmodels.regression.linear_model.GLS
    statsmodels.nonparametric.smoothers_lowess.lowess
    skmisc.loess.loess
    pandas.DataFrame.rolling
    sklearn.gaussian_process.GaussianProcessRegressor

    Notes
    -----
    :class:`~plotnine.geoms.geom_smooth` and :class:`.stat_smooth` are
    effectively aliases, they both use the same arguments.
    Use :class:`~plotnine.geoms.geom_smooth` unless
    you want to display the results with a non-standard geom.
    """

    _aesthetics_doc = """
    {aesthetics_table}

    .. rubric:: Options for computed aesthetics

    ::

        'se'    # Standard error of points in bin
        'ymin'  # Lower confidence limit
        'ymax'  # Upper confidence limit

    Calculated aesthetics are accessed using the `after_stat` function.
    e.g. :py:`after_stat('se')`.
    """

    REQUIRED_AES = {"x", "y"}
    DEFAULT_PARAMS = {
        "geom": "smooth",
        "position": "identity",
        "na_rm": False,
        "method": "auto",
        "se": True,
        "n": 80,
        "formula": None,
        "fullrange": False,
        "level": 0.95,
        "span": 0.75,
        "method_args": {},
    }
    CREATES = {"se", "ymin", "ymax"}

    def setup_data(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Override to modify data before compute_layer is called

        Keeps only the rows where both ``x`` and ``y`` are finite
        (drops NaN and +/-inf), since those cannot be used in a fit.
        """
        data = data[np.isfinite(data["x"]) & np.isfinite(data["y"])]
        return data

    def setup_params(self, data: pd.DataFrame) -> dict:
        """
        Resolve the parameters that depend on the data

        * ``method='auto'`` becomes *loess* (or *lowess* when
          ``scikit-misc`` is not importable) if the largest group has
          fewer than 1000 rows, and *glm* otherwise.
        * ``method='mavg'`` without a ``window`` in ``method_args`` gets
          a window of ``len(data) // 10`` and a warning is issued.
        * A ``formula`` is only accepted for the linear-model family of
          methods.

        Returns
        -------
        dict
            A copy of the stat parameters with the above resolved.

        Raises
        ------
        ValueError
            If a formula is given together with a method that does not
            support formulas.
        """
        params = self.params.copy()

        # Use loess/lowess for small datasets
        # and glm for large
        if params["method"] == "auto":
            max_group = data["group"].value_counts().max()
            if max_group < 1000:
                try:
                    from skmisc.loess import loess  # noqa: F401

                    params["method"] = "loess"
                except ImportError:
                    params["method"] = "lowess"
            else:
                params["method"] = "glm"

        if (
            params["method"] == "mavg"
            and "window" not in params["method_args"]
        ):
            window = len(data) // 10
            warnings.warn(
                "No 'window' specified in the method_args. "
                f"Using window = {window}. "
                "The same window is used for all groups or "
                "facets",
                PlotnineWarning,
            )
            # ``params`` is only a shallow copy, so its ``method_args``
            # is the very dict held by ``self.params``. Bind a new dict
            # instead of writing into it; otherwise the window computed
            # for this data would stick to the stat and be silently
            # reused the next time it is set up with different data.
            params["method_args"] = {
                **params["method_args"],
                "window": window,
            }

        if params["formula"]:
            allowed = {"lm", "ols", "wls", "glm", "rlm", "gls"}
            if params["method"] not in allowed:
                raise ValueError(
                    "You can only use a formula with `method` is "
                    f"one of {allowed}"
                )
            # NOTE(review): the key is spelled "enviroment" (sic).
            # Presumably the consumer of these params looks it up under
            # the same spelling, so it is kept as is -- confirm before
            # renaming.
            params["enviroment"] = self.environment

        return params

    @classmethod
    def compute_group(cls, data, scales, **params):
        """
        Fit the smoother to one group and predict along x

        Parameters
        ----------
        data : pandas.DataFrame
            Data for a single group, with ``x`` and ``y`` columns.
        scales : object
            Object whose ``x.dimension()`` gives the range to predict
            over when ``fullrange`` is set.
        **params : dict
            The stat parameters as returned by ``setup_params``.

        Returns
        -------
        pandas.DataFrame
            Result of ``predictdf`` evaluated at the chosen x values,
            or an empty dataframe (with a warning) when the group has
            fewer than 2 distinct x values.
        """
        data = data.sort_values("x")
        n = params["n"]

        x_unique = data["x"].unique()

        if len(x_unique) < 2:
            warnings.warn(
                "Smoothing requires 2 or more points. Got "
                f"{len(x_unique)}. Not enough points for smoothing. "
                "If this message a surprise, make sure the column "
                "mapped to the x aesthetic has the right dtype.",
                PlotnineWarning,
            )
            # Not enough data to fit
            return pd.DataFrame()

        if data["x"].dtype.kind == "i":
            # Integer x: predict at the observed values (or the scale
            # limits) rather than at n evenly spaced points
            if params["fullrange"]:
                xseq = scales.x.dimension()
            else:
                xseq = np.sort(x_unique)
        else:
            if params["fullrange"]:
                rangee = scales.x.dimension()
            else:
                rangee = [data["x"].min(), data["x"].max()]
            xseq = np.linspace(rangee[0], rangee[1], n)

        return predictdf(data, xseq, **params)