Source code for plotnine_extra.stats.stat_regline_equation
from __future__ import annotations
import numpy as np
import pandas as pd
from plotnine.doctools import document
from plotnine.mapping.evaluation import after_stat
from plotnine.stats.stat import stat
from ._label_utils import compute_label_position
[docs]
@document
class stat_regline_equation(stat):
    """
    Add regression line equation and R-squared to a plot

    Fits a polynomial regression and formats the equation
    and goodness-of-fit statistics as a text label.

    {usage}

    Parameters
    ----------
    {common_parameters}
    formula : str, default="y ~ x"
        Regression formula. Supported forms:

        - ``"y ~ x"`` — simple linear regression
        - ``"y ~ poly(x, n)"`` — polynomial of degree n
    label_x_npc : float or str, default="left"
        Normalized x position for the label.
    label_y_npc : float or str, default="top"
        Normalized y position for the label.

    See Also
    --------
    plotnine.geom_text : The default `geom` for this `stat`.
    plotnine.stat_smooth : For the regression line itself.
    """

    _aesthetics_doc = """
    {aesthetics_table}

    **Options for computed aesthetics**

    ```python
    "label" # Formatted equation label
    "eq" # Equation string
    "rr" # R-squared
    "adj_rr" # Adjusted R-squared
    "aic" # Akaike information criterion
    "bic" # Bayesian information criterion
    ```
    """

    REQUIRED_AES = {"x", "y"}
    DEFAULT_AES = {"label": after_stat("label")}
    DEFAULT_PARAMS = {
        "geom": "text",
        "position": "identity",
        "na_rm": False,
        "formula": "y ~ x",
        "label_x_npc": "left",
        "label_y_npc": "top",
    }
    CREATES = {"label", "eq", "rr", "adj_rr", "aic", "bic"}

    def compute_group(self, data: pd.DataFrame, scales) -> pd.DataFrame:
        """
        Fit the regression for one group and build its label row.

        Parameters
        ----------
        data : pd.DataFrame
            Group data; the ``x`` and ``y`` columns are read as floats.
        scales
            Not used by this computation.

        Returns
        -------
        pd.DataFrame
            One row holding the label position (``x``, ``y``) and the
            computed ``label``, ``eq``, ``rr``, ``adj_rr``, ``aic`` and
            ``bic`` values. Empty when the group has too few points to
            determine the requested polynomial.
        """
        x = data["x"].to_numpy(dtype=float)
        y = data["y"].to_numpy(dtype=float)
        n = len(x)

        # Parse formula to get degree
        degree = _parse_formula_degree(self.params["formula"])
        k = degree + 1  # number of parameters including intercept

        # A degree-d polynomial has d + 1 coefficients, so fewer points
        # than that leaves the fit underdetermined and the equation
        # meaningless. The floor of 2 keeps the historical behaviour of
        # never labelling a single-point group.
        if n < max(2, k):
            return pd.DataFrame()

        # Fit polynomial
        coeffs = np.polyfit(x, y, degree)
        y_pred = np.poly1d(coeffs)(x)

        # Goodness of fit
        ss_res = np.sum((y - y_pred) ** 2)
        ss_tot = np.sum((y - np.mean(y)) ** 2)
        # A constant y has no variance to explain; report 0.0 (float, so
        # the column dtype does not depend on the data).
        rr = 1 - ss_res / ss_tot if ss_tot != 0 else 0.0
        # The adjustment divides by the residual degrees of freedom,
        # which is zero when the fit is exactly determined (n == k).
        adj_rr = 1 - (1 - rr) * (n - 1) / (n - k) if n > k else rr

        # AIC and BIC (based on residual sum of squares). A perfect fit
        # has ss_res == 0 and an undefined log-likelihood.
        if ss_res > 0:
            log_likelihood = (
                -n / 2 * (np.log(2 * np.pi * ss_res / n) + 1)
            )
            aic = 2 * k - 2 * log_likelihood
            bic = k * np.log(n) - 2 * log_likelihood
        else:
            aic = np.nan
            bic = np.nan

        # Format equation
        eq = _format_equation(coeffs, degree)
        label = f"{eq}, R² = {rr:.2f}"

        # Position the label within the range of this group's data
        x_pos = compute_label_position(
            x.min(),
            x.max(),
            self.params["label_x_npc"],
        )
        y_pos = compute_label_position(
            y.min(),
            y.max(),
            self.params["label_y_npc"],
        )

        return pd.DataFrame(
            {
                "x": [x_pos],
                "y": [y_pos],
                "label": [label],
                "eq": [eq],
                "rr": [rr],
                "adj_rr": [adj_rr],
                "aic": [aic],
                "bic": [bic],
            }
        )
def _parse_formula_degree(formula):
"""Parse a formula string to extract polynomial degree."""
import re
formula = formula.strip()
# Match "y ~ poly(x, n)"
poly_match = re.match(
r"y\s*~\s*poly\s*\(\s*x\s*,\s*(\d+)\s*\)", formula
)
if poly_match:
return int(poly_match.group(1))
# Default: "y ~ x" means degree 1
if re.match(r"y\s*~\s*x", formula):
return 1
return 1
def _format_equation(coeffs, degree):
"""Format polynomial coefficients as an equation string."""
parts = []
for i, coef in enumerate(coeffs):
power = degree - i
coef_str = f"{coef:.2g}"
if power == 0:
parts.append(coef_str)
elif power == 1:
if coef_str == "1":
parts.append("x")
elif coef_str == "-1":
parts.append("-x")
else:
parts.append(f"{coef_str}x")
else:
if coef_str == "1":
parts.append(f"x^{power}")
elif coef_str == "-1":
parts.append(f"-x^{power}")
else:
parts.append(f"{coef_str}x^{power}")
eq = "y = "
for i, part in enumerate(parts):
if i == 0:
eq += part
elif part.startswith("-"):
eq += f" - {part[1:]}"
else:
eq += f" + {part}"
return eq