Source code for tidypolars_extra.stats

import polars as pl
from .utils import (
    _col_expr,
)

__all__ = [
    # Agg stats
    "abs", "cor", "cov", "count", "first", "last", "length",
    "log", "log10",
    "max", "mean", "median", "min", "n",
     "quantile", "sd", "sqrt", "sum", "var", "rank",
    "floor", 'scale',
    # Cumulative
    "cumsum", "cumprod", "cummax", "cummin",
    # Ranking
    "percent_rank", "cume_dist", "ntile",
    # Extra stats
    "weighted_mean", "mode", "iqr", "mad", "zscore",
]


[docs] def abs(x): """ Absolute value Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.mutate(abs_x = tp.abs('x')) >>> df.mutate(abs_x = tp.abs(col('x'))) """ x = _col_expr(x) return x.abs()
[docs] def cor(x, y, method = 'pearson'): """ Find the correlation of two columns Parameters ---------- x : Expr A column y : Expr A column method : str Type of correlation to find. Either 'pearson' or 'spearman'. Examples -------- >>> df.summarize(cor = tp.cor(col('x'), col('y'))) """ if pl.Series([method]).is_in(['pearson', 'spearman']).not_().item(): raise ValueError("`method` must be either 'pearson' or 'spearman'") return pl.corr(x, y, method = method)
[docs] def cov(x, y): """ Find the covariance of two columns Parameters ---------- x : Expr A column y : Expr A column Examples -------- >>> df.summarize(cor = tp.cov(col('x'), col('y'))) """ return pl.cov(x, y)
[docs] def count(x): """ Number of observations in each group Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.summarize(count = tp.count(col('x'))) """ x = _col_expr(x) return x.count()
[docs] def first(x): """ Get first value Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.summarize(first_x = tp.first('x')) >>> df.summarize(first_x = tp.first(col('x'))) """ x = _col_expr(x) return x.first()
[docs] def last(x): """ Get last value Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.summarize(last_x = tp.last('x')) >>> df.summarize(last_x = tp.last(col('x'))) """ x = _col_expr(x) return x.last()
[docs] def length(x): """ Number of observations in each group. Alias for :func:`count`. Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.summarize(length = tp.length(col('x'))) """ return count(x)
[docs] def floor(x): """ Round numbers down to the lower integer Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.mutate(floor_x = tp.floor(col('x'))) """ x = _col_expr(x) return x.floor()
[docs] def log(x): """ Compute the natural logarithm of a column Parameters ---------- x : Expr Column to operate on Examples -------- >>> df.mutate(log = tp.log('x')) """ x = _col_expr(x) return x.log()
[docs] def log10(x): """ Compute the base 10 logarithm of a column Parameters ---------- x : Expr Column to operate on Examples -------- >>> df.mutate(log = tp.log10('x')) """ x = _col_expr(x) return x.log10()
[docs] def max(x): """ Get column max Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.summarize(max_x = tp.max('x')) >>> df.summarize(max_x = tp.max(col('x'))) """ x = _col_expr(x) return x.max()
[docs] def mean(x): """ Get column mean Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.summarize(mean_x = tp.mean('x')) >>> df.summarize(mean_x = tp.mean(col('x'))) """ x = _col_expr(x) return x.mean()
[docs] def median(x): """ Get column median Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.summarize(median_x = tp.median('x')) >>> df.summarize(median_x = tp.median(col('x'))) """ x = _col_expr(x) return x.median()
[docs] def min(x): """ Get column minimum Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.summarize(min_x = tp.min('x')) >>> df.summarize(min_x = tp.min(col('x'))) """ x = _col_expr(x) return x.min()
[docs] def n(): """ Number of observations in each group Examples -------- >>> df.summarize(count = tp.n()) """ return pl.len()
[docs] def quantile(x, quantile = .5): """ Get number of distinct values in a column Parameters ---------- x : Expr, Series Column to operate on quantile : float Quantile to return Examples -------- >>> df.summarize(quantile_x = tp.quantile('x', .25)) """ x = _col_expr(x) return x.quantile(quantile)
[docs] def sd(x): """ Get column standard deviation Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.summarize(sd_x = tp.sd('x')) >>> df.summarize(sd_x = tp.sd(col('x'))) """ x = _col_expr(x) return x.std()
[docs] def sqrt(x): """ Get column square root Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.mutate(sqrt_x = tp.sqrt('x')) """ x = _col_expr(x) return x.sqrt()
[docs] def sum(x): """ Get column sum Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.summarize(sum_x = tp.sum('x')) >>> df.summarize(sum_x = tp.sum(col('x'))) """ x = _col_expr(x) return x.sum()
[docs] def var(x): """ Get column variance Parameters ---------- x : Expr Column to operate on Examples -------- >>> df.summarize(sum_x = tp.var('x')) >>> df.summarize(sum_x = tp.var(col('x'))) """ x = _col_expr(x) return x.var()
[docs] def rank(x, method='dense'): """ Assigns a minimum rank to each element in the input list, handling ties by assigning the same (lowest) rank to tied values. The next distinct value's rank is increased by the number of tied values before it. Parameters ---------- x : str Column to operate on method : str dense (default): Assigns ranks in a consecutive manner, without gaps, even for ties. average : Assigns the average rank to tied values. min: Assigns the minimum rank to tied values. max: Assigns the maximum rank to tied values. ordinal: Assigns a distinct rank to each value based on its order of appearance. Returns ------- list of int A list of ranks corresponding to the elements of `x`. Examples -------- >>> rank([10, 20, 20, 30]) [1, 2, 2, 3] >>> rank([3, 1, 2]) [3, 1, 2] # since sorted order is 1,2,3 => ranks are assigned as per their order >>> rank(["b", "a", "a", "c"]) [2, 1, 1, 3] """ x = _col_expr(x) return x.rank(method=method)
[docs] def scale(x): """ Standardize the input by scaling it to a mean of 0 and a standard deviation of 1. Parameters ---------- x : Expr Column to operate on Returns ------- array-like The standardized version of the input data. """ x = _col_expr(x) return (x - x.mean()) / x.std()
[docs] def zscore(x): """ Standardize to z-scores (alias for scale) Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.mutate(z = tp.zscore('x')) """ return scale(x)
[docs] def cumsum(x): """ Cumulative sum Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.mutate(csum = tp.cumsum('x')) """ x = _col_expr(x) return x.cum_sum()
[docs] def cumprod(x): """ Cumulative product Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.mutate(cprod = tp.cumprod('x')) """ x = _col_expr(x) return x.cum_prod()
[docs] def cummax(x): """ Cumulative maximum Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.mutate(cmax = tp.cummax('x')) """ x = _col_expr(x) return x.cum_max()
[docs] def cummin(x): """ Cumulative minimum Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.mutate(cmin = tp.cummin('x')) """ x = _col_expr(x) return x.cum_min()
[docs] def percent_rank(x): """ Compute percent rank (values between 0 and 1) Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.mutate(prank = tp.percent_rank('x')) """ x = _col_expr(x) r = x.rank(method='min') denom = pl.len() - 1 # When n=1, percent_rank is defined as 0 (matching R behavior) return pl.when(denom == 0).then(0.0).otherwise((r - 1) / denom)
[docs] def cume_dist(x): """ Compute cumulative distribution (proportion of values <= current value) Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.mutate(cd = tp.cume_dist('x')) """ x = _col_expr(x) r = x.rank(method='max') return r / pl.len()
[docs] def ntile(x, n): """ Divide values into n roughly equal groups Parameters ---------- x : Expr, Series Column to operate on n : int Number of groups Examples -------- >>> df.mutate(quartile = tp.ntile('x', 4)) """ x = _col_expr(x) r = x.rank(method='ordinal') total = pl.len() # floor((rank - 1) * n / total) + 1, matching R's ntile behavior return ((r - 1) * n / total).floor().cast(pl.Int64) + 1
[docs] def weighted_mean(x, w): """ Compute weighted mean Parameters ---------- x : Expr, Series Column of values w : Expr, Series Column of weights Examples -------- >>> df.summarize(wm = tp.weighted_mean('x', 'w')) """ x = _col_expr(x) w = _col_expr(w) return (x * w).sum() / w.sum()
[docs] def mode(x): """ Compute the statistical mode (most frequent value) Returns the first mode if there are ties (non-deterministic for ties). Use in summarize() context. Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.summarize(m = tp.mode('x')) """ x = _col_expr(x) return x.mode().first()
[docs] def iqr(x): """ Compute the interquartile range (Q3 - Q1) Use in summarize() context only. Not suitable for mutate(). Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.summarize(iqr_val = tp.iqr('x')) """ x = _col_expr(x) return x.quantile(0.75) - x.quantile(0.25)
[docs] def mad(x): """ Compute the median absolute deviation Use in summarize() context only. Not suitable for mutate(). Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.summarize(mad_val = tp.mad('x')) """ x = _col_expr(x) return (x - x.median()).abs().median()