# Source code for plotnine_extra.stats.stat_cor

from __future__ import annotations

import numpy as np
import pandas as pd
from plotnine.doctools import document
from plotnine.mapping.evaluation import after_stat
from plotnine.stats.stat import stat

from ._label_utils import compute_label_position
from ._stat_test import run_stat_test

# Symbol shown in the label for each supported correlation method.
# ``stat_cor.compute_group`` looks the method up here and falls back
# to "R" when the method is not one of these keys.
_COEF_NAMES = {
    "pearson": "R",
    "spearman": "ρ",
    "kendall": "τ",
}



@document
class stat_cor(stat):
    """
    Add correlation coefficients with p-values to a scatter plot

    Computes correlation coefficients (Pearson, Spearman, or
    Kendall) and formats them as text labels including p-values.

    {usage}

    Parameters
    ----------
    {common_parameters}
    method : str, default="pearson"
        Correlation method. One of ``"pearson"``,
        ``"spearman"``, or ``"kendall"``.
    alternative : str, default="two-sided"
        Alternative hypothesis. One of ``"two-sided"``,
        ``"greater"``, or ``"less"``.
    label_x_npc : float or str, default="left"
        Normalized x position for the label. Float in
        [0, 1] or one of ``"left"``, ``"center"``,
        ``"right"``.
    label_y_npc : float or str, default="top"
        Normalized y position for the label. Float in
        [0, 1] or one of ``"top"``, ``"center"``,
        ``"bottom"``.
    r_accuracy : float, default=0.01
        Decimal accuracy for the correlation coefficient.
    p_accuracy : float, default=0.001
        Decimal accuracy for the p-value.
    label_sep : str, default=", "
        Separator between the correlation and p-value
        labels.

    See Also
    --------
    plotnine.geom_text : The default `geom` for this `stat`.
    """

    _aesthetics_doc = """
    {aesthetics_table}

    **Options for computed aesthetics**

    ```python
    "label"    # Formatted correlation label
    "r"        # Correlation coefficient
    "rr"       # R-squared
    "p"        # P-value
    ```

    """
    REQUIRED_AES = {"x", "y"}
    DEFAULT_AES = {"label": after_stat("label")}
    DEFAULT_PARAMS = {
        "geom": "text",
        "position": "identity",
        "na_rm": False,
        "method": "pearson",
        "alternative": "two-sided",
        "label_x_npc": "left",
        "label_y_npc": "top",
        "r_accuracy": 0.01,
        "p_accuracy": 0.001,
        "label_sep": ", ",
    }
    CREATES = {"label", "r", "rr", "p"}

    def compute_group(self, data: pd.DataFrame, scales) -> pd.DataFrame:
        """
        Build the one-row correlation label for a single group.

        Parameters
        ----------
        data :
            Group data; its ``x`` and ``y`` columns are converted to
            float arrays.
        scales :
            Not used by this method.

        Returns
        -------
        pd.DataFrame
            An empty frame when the group has fewer than three rows;
            otherwise a single row with the label position (``x``,
            ``y``), the formatted ``label``, and the computed ``r``,
            ``rr`` (``r`` squared) and ``p`` values.
        """
        x = data["x"].to_numpy(dtype=float)
        y = data["y"].to_numpy(dtype=float)
        params = self.params
        method = params["method"]
        alternative = params["alternative"]

        # Fewer than three observations: emit no label for this group.
        if len(x) < 3:
            return pd.DataFrame()

        result = run_stat_test(
            [x, y], method=method, alternative=alternative
        )

        # Unknown methods fall back to the generic "R" symbol.
        coef_name = _COEF_NAMES.get(method, "R")
        r = result.statistic
        p = result.p_value
        rr = r**2

        # Format label: "<symbol> = <r><label_sep><p-value text>"
        r_digits = _accuracy_to_digits(params["r_accuracy"])
        r_str = f"{coef_name} = {r:.{r_digits}f}"
        p_str = _format_p(p, params["p_accuracy"])
        label = f"{r_str}{params['label_sep']}{p_str}"

        # Position the label from the group's own data range.
        # NOTE(review): presumably compute_label_position maps the npc
        # value onto [min, max] -- confirm against _label_utils.
        x_pos = compute_label_position(
            x.min(), x.max(),
            params["label_x_npc"],
        )
        y_pos = compute_label_position(
            y.min(), y.max(),
            params["label_y_npc"],
        )

        return pd.DataFrame(
            {
                "x": [x_pos],
                "y": [y_pos],
                "label": [label],
                "r": [r],
                "rr": [rr],
                "p": [p],
            }
        )




def _accuracy_to_digits(accuracy: float) -> int:
    """Convert accuracy (e.g. 0.01) to number of digits (e.g. 2)."""
    if accuracy >= 1:
        return 0
    return max(0, int(np.ceil(-np.log10(accuracy))))


def _format_p(p: float, accuracy: float) -> str:
    """
    Render a p-value as label text at the requested accuracy.

    Values below ``accuracy`` are reported as ``"p < <accuracy>"``;
    every other value is reported as ``"p = <p>"``. Both use the number
    of decimals that ``_accuracy_to_digits`` derives from ``accuracy``.
    """
    # Compare first, then derive the decimals, as the original flow did.
    below_threshold = p < accuracy
    n_decimals = _accuracy_to_digits(accuracy)
    if below_threshold:
        return f"p < {accuracy:.{n_decimals}f}"
    return f"p = {p:.{n_decimals}f}"