# Source code for plotnine_extra.stats.stat_regline_equation

from __future__ import annotations

import numpy as np
import pandas as pd
from plotnine.doctools import document
from plotnine.mapping.evaluation import after_stat
from plotnine.stats.stat import stat

from ._label_utils import compute_label_position



# [docs] (link marker left over from the documentation export)
@document
class stat_regline_equation(stat):
    """
    Add regression line equation and R-squared to a plot

    Fits a polynomial regression and formats the equation
    and goodness-of-fit statistics as a text label.

    {usage}

    Parameters
    ----------
    {common_parameters}
    formula : str, default="y ~ x"
        Regression formula. Supported forms:

        - ``"y ~ x"`` — simple linear regression
        - ``"y ~ poly(x, n)"`` — polynomial of degree n

        A group with fewer than ``n + 1`` points (and never fewer
        than two) cannot determine the fit and gets no label.
    label_x_npc : float or str, default="left"
        Normalized x position for the label.
    label_y_npc : float or str, default="top"
        Normalized y position for the label.

    See Also
    --------
    plotnine.geom_text : The default `geom` for this `stat`.
    plotnine.stat_smooth : For the regression line itself.
    """

    _aesthetics_doc = """
    {aesthetics_table}

    **Options for computed aesthetics**

    ```python
    "label"    # Formatted equation label
    "eq"       # Equation string
    "rr"       # R-squared
    "adj_rr"   # Adjusted R-squared
    "aic"      # Akaike information criterion
    "bic"      # Bayesian information criterion
    ```

    """
    REQUIRED_AES = {"x", "y"}
    DEFAULT_AES = {"label": after_stat("label")}
    DEFAULT_PARAMS = {
        "geom": "text",
        "position": "identity",
        "na_rm": False,
        "formula": "y ~ x",
        "label_x_npc": "left",
        "label_y_npc": "top",
    }
    CREATES = {"label", "eq", "rr", "adj_rr", "aic", "bic"}

    def compute_group(self, data, scales) -> pd.DataFrame:
        """
        Fit the regression for one group and build its label row.

        Parameters
        ----------
        data : pd.DataFrame
            Group data; the ``x`` and ``y`` columns are read and
            converted to float arrays.
        scales :
            Not used by this method.

        Returns
        -------
        pd.DataFrame
            A single row with the label position (``x``, ``y``) and
            the computed ``label``, ``eq``, ``rr``, ``adj_rr``,
            ``aic`` and ``bic`` values, or an empty frame when the
            group has too few points for the requested degree.
        """
        x = data["x"].to_numpy(dtype=float)
        y = data["y"].to_numpy(dtype=float)

        # Parse formula to get degree
        degree = _parse_formula_degree(self.params["formula"])

        n = len(x)
        k = degree + 1  # number of parameters including intercept

        # A polynomial with k coefficients needs at least k points.
        # With fewer, the least-squares system is underdetermined and
        # the reported equation would be arbitrary, so emit no label.
        if n < max(2, k):
            return pd.DataFrame()

        # Fit polynomial
        coeffs = np.polyfit(x, y, degree)
        p = np.poly1d(coeffs)
        y_pred = p(x)

        # Compute statistics
        ss_res = np.sum((y - y_pred) ** 2)
        ss_tot = np.sum((y - np.mean(y)) ** 2)

        # A constant response has no variance to explain; report 0.0
        # instead of dividing by zero.
        rr = 1 - ss_res / ss_tot if ss_tot != 0 else 0.0
        # The adjustment divides by the residual degrees of freedom
        # (n - k), so it is only defined when n > k.
        adj_rr = (
            1 - (1 - rr) * (n - 1) / (n - k)
            if n > k
            else rr
        )

        # AIC and BIC (based on residual sum of squares). An exact
        # fit (ss_res == 0) would need log(0), so report NaN instead.
        if ss_res > 0:
            log_likelihood = (
                -n / 2 * (np.log(2 * np.pi * ss_res / n) + 1)
            )
            aic = 2 * k - 2 * log_likelihood
            bic = k * np.log(n) - 2 * log_likelihood
        else:
            aic = np.nan
            bic = np.nan

        # Format equation
        eq = _format_equation(coeffs, degree)
        label = f"{eq}, R² = {rr:.2f}"

        # Position the label from this group's own data range.
        # NOTE(review): presumably compute_label_position maps the
        # npc value onto [min, max] — confirm in _label_utils.
        x_pos = compute_label_position(
            x.min(), x.max(),
            self.params["label_x_npc"],
        )
        y_pos = compute_label_position(
            y.min(), y.max(),
            self.params["label_y_npc"],
        )

        return pd.DataFrame(
            {
                "x": [x_pos],
                "y": [y_pos],
                "label": [label],
                "eq": [eq],
                "rr": [rr],
                "adj_rr": [adj_rr],
                "aic": [aic],
                "bic": [bic],
            }
        )




def _parse_formula_degree(formula):
    """
    Parse a formula string to extract the polynomial degree.

    Parameters
    ----------
    formula : str
        Regression formula. ``"y ~ poly(x, n)"`` yields ``n``;
        ``"y ~ x"`` yields 1. Whitespace around tokens is ignored.

    Returns
    -------
    int
        The polynomial degree. Any formula that is not one of the
        supported forms falls back to 1.

    Warns
    -----
    UserWarning
        When the formula is neither ``"y ~ poly(x, n)"`` nor exactly
        ``"y ~ x"``, so the linear fallback is not applied silently.
    """
    import re
    import warnings

    formula = formula.strip()
    # Match "y ~ poly(x, n)"
    poly_match = re.match(
        r"y\s*~\s*poly\s*\(\s*x\s*,\s*(\d+)\s*\)", formula
    )
    if poly_match:
        return int(poly_match.group(1))

    # Anything other than a plain "y ~ x" is unsupported. The result is
    # still a linear fit, but the caller is told about it.
    if not re.fullmatch(r"y\s*~\s*x", formula):
        warnings.warn(
            f"Unsupported formula {formula!r}; "
            "falling back to a linear fit ('y ~ x').",
            stacklevel=2,
        )
    return 1


def _format_equation(coeffs, degree):
    """
    Render polynomial coefficients as a ``"y = ..."`` string.

    Parameters
    ----------
    coeffs : sequence of float
        Coefficients ordered from the highest power down; the
        coefficient at index ``i`` belongs to power ``degree - i``.
    degree : int
        Power of the first coefficient.

    Returns
    -------
    str
        The equation, with every coefficient shown to two significant
        digits and terms joined by ``" + "`` or ``" - "``.
    """

    def _term(value, exponent):
        # Render one "coefficient * x^exponent" term.
        text = f"{value:.2g}"
        if exponent == 0:
            return text
        variable = "x" if exponent == 1 else f"x^{exponent}"
        # A coefficient that rounds to +/-1 is left implicit.
        if text == "1":
            return variable
        if text == "-1":
            return f"-{variable}"
        return f"{text}{variable}"

    terms = [
        _term(value, degree - index)
        for index, value in enumerate(coeffs)
    ]

    # The leading term keeps its own sign; each later term turns its
    # sign into the joining operator.
    pieces = terms[:1]
    for term in terms[1:]:
        if term.startswith("-"):
            pieces.append(f" - {term[1:]}")
        else:
            pieces.append(f" + {term}")
    return "y = " + "".join(pieces)