Source code for plotnine_extra.stats.stat_cor

from __future__ import annotations

import numpy as np
import pandas as pd
from plotnine.doctools import document
from plotnine.mapping.evaluation import after_stat
from plotnine.stats.stat import stat

from ._label_utils import compute_label_position
from ._stat_test import run_stat_test

_COEF_NAMES = {
    "pearson": "R",
    "spearman": "ρ",
    "kendall": "τ",
}


@document
class stat_cor(stat):
    """
    Add correlation coefficients with p-values to a scatter plot

    Computes correlation coefficients (Pearson, Spearman, or
    Kendall) and formats them as text labels including p-values.

    {usage}

    Parameters
    ----------
    {common_parameters}
    method : str, default="pearson"
        Correlation method. One of ``"pearson"``,
        ``"spearman"``, or ``"kendall"``.
    alternative : str, default="two-sided"
        Alternative hypothesis. One of ``"two-sided"``,
        ``"greater"``, or ``"less"``.
    label_x_npc : float or str, default="left"
        Normalized x position for the label. Float in [0, 1]
        or one of ``"left"``, ``"center"``, ``"right"``.
    label_y_npc : float or str, default="top"
        Normalized y position for the label. Float in [0, 1]
        or one of ``"top"``, ``"center"``, ``"bottom"``.
    r_accuracy : float, default=0.01
        Decimal accuracy for the correlation coefficient.
    p_accuracy : float, default=0.001
        Decimal accuracy for the p-value.
    label_sep : str, default=", "
        Separator between the correlation and p-value labels.

    See Also
    --------
    plotnine.geom_text : The default `geom` for this `stat`.
    """

    _aesthetics_doc = """
    {aesthetics_table}

    **Options for computed aesthetics**

    ```python
    "label"  # Formatted correlation label
    "r"      # Correlation coefficient
    "rr"     # R-squared
    "p"      # P-value
    ```

    """

    REQUIRED_AES = {"x", "y"}
    DEFAULT_AES = {"label": after_stat("label")}
    DEFAULT_PARAMS = {
        "geom": "text",
        "position": "identity",
        "na_rm": False,
        "method": "pearson",
        "alternative": "two-sided",
        "label_x_npc": "left",
        "label_y_npc": "top",
        "r_accuracy": 0.01,
        "p_accuracy": 0.001,
        "label_sep": ", ",
    }
    CREATES = {"label", "r", "rr", "p"}

    def compute_group(self, data, scales) -> pd.DataFrame:
        """
        Build the one-row label frame for a single group.

        The ``x`` and ``y`` columns of ``data`` are converted to float
        arrays and handed to ``run_stat_test`` together with the
        ``method`` and ``alternative`` parameters. The ``statistic``
        and ``p_value`` attributes of its result are formatted into
        the label text.

        Parameters
        ----------
        data : pd.DataFrame
            Group data; the ``x`` and ``y`` columns are read.
        scales
            Not used by this method.

        Returns
        -------
        pd.DataFrame
            An empty frame when the group has fewer than three rows,
            otherwise a single row with the columns ``x``, ``y``,
            ``label``, ``r``, ``rr`` and ``p``.
        """
        params = self.params
        xs = data["x"].to_numpy(dtype=float)
        ys = data["y"].to_numpy(dtype=float)
        method = params["method"]
        alternative = params["alternative"]

        # Groups with fewer than three observations produce no label.
        if len(xs) < 3:
            return pd.DataFrame()

        outcome = run_stat_test(
            [xs, ys],
            method=method,
            alternative=alternative,
        )
        r = outcome.statistic
        p = outcome.p_value

        # Methods missing from the lookup table fall back to "R".
        symbol = _COEF_NAMES.get(method, "R")
        n_decimals = _accuracy_to_digits(params["r_accuracy"])
        p_accuracy = params["p_accuracy"]
        label_sep = params["label_sep"]

        r_text = f"{symbol} = {r:.{n_decimals}f}"
        p_text = _format_p(p, p_accuracy)

        # The anchor is derived from this group's own x/y extent and the
        # corresponding npc parameter.
        anchor_x = compute_label_position(
            xs.min(), xs.max(), params["label_x_npc"]
        )
        anchor_y = compute_label_position(
            ys.min(), ys.max(), params["label_y_npc"]
        )

        columns = {
            "x": [anchor_x],
            "y": [anchor_y],
            "label": [f"{r_text}{label_sep}{p_text}"],
            "r": [r],
            "rr": [r**2],
            "p": [p],
        }
        return pd.DataFrame(columns)
def _accuracy_to_digits(accuracy: float) -> int: """Convert accuracy (e.g. 0.01) to number of digits (e.g. 2).""" if accuracy >= 1: return 0 return max(0, int(np.ceil(-np.log10(accuracy)))) def _format_p(p: float, accuracy: float) -> str: """Format a p-value with the given accuracy.""" if p < accuracy: digits = _accuracy_to_digits(accuracy) return f"p < {accuracy:.{digits}f}" digits = _accuracy_to_digits(accuracy) return f"p = {p:.{digits}f}"