Source code for tidypolars_extra.funs

import polars as pl
from .utils import (
    _as_list,
    _col_expr,
    _col_exprs,
    _is_constant,
    _is_list,
    _is_iterable,
    _is_series,
    _is_string,
    _str_to_lit
    )

# Public names exported by a star-import of this module
# (`from tidypolars_extra.funs import *`).
__all__ = [
    "between", "case_when", "coalesce", "if_else",
    "is_finite", "is_in", "is_infinite", "is_not", "is_not_in", "is_not_null", "is_null",
    "lead", "map", "n_distinct", "n_missing", "pct_missing",
    "rep", "replace_null", "round", "row_number",
]

def between(x, left, right):
    """
    Check whether each value of a column lies between two bounds

    Parameters
    ----------
    x : Expr, Series
        Column to operate on
    left : int
        Lower bound; the column is tested for being greater than or equal to it
    right : int
        Upper bound; the column is tested for being less than or equal to it

    Examples
    --------
    >>> df = tp.tibble(x = range(4))
    >>> df.filter(tp.between(col('x'), 1, 3))
    """
    column = _col_expr(x)
    in_range = column.is_between(left, right)
    return in_range
def is_finite(x):
    """
    Flag the finite values of a column

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    Examples
    --------
    >>> df.mutate(finite = tp.is_finite('x'))
    """
    column = _col_expr(x)
    finite_mask = column.is_finite()
    return finite_mask
def is_in(x, values):
    """
    Flag the values of a column that appear in a list

    Parameters
    ----------
    x : Expr, Series
        Column to operate on
    values : list
        List of values to check membership against

    Examples
    --------
    >>> df.mutate(in_list = tp.is_in('x', [1, 2]))
    """
    column = _col_expr(x)
    membership = column.is_in(values)
    return membership
def is_infinite(x):
    """
    Flag the infinite values of a column

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    Examples
    --------
    >>> df.mutate(infinite = tp.is_infinite('x'))
    """
    column = _col_expr(x)
    infinite_mask = column.is_infinite()
    return infinite_mask
def is_not(x):
    """
    Negate a boolean expression

    The negation is performed with the ``~`` operator of `x`.

    Parameters
    ----------
    x : Expr
        Boolean expression to negate

    Examples
    --------
    >>> df.mutate(not_finite = tp.is_not(tp.is_finite(col('x'))))
    """
    negated = ~x
    return negated
def is_not_in(x, values):
    """
    Flag the values of a column that do not appear in a list

    Parameters
    ----------
    x : Expr, Series
        Column to operate on
    values : list
        List of values to check membership against

    Examples
    --------
    >>> df.mutate(not_in = tp.is_not_in('x', [1, 2]))
    """
    column = _col_expr(x)
    membership = column.is_in(values)
    # Invert the membership test to get the "not in" mask
    return ~membership
def is_not_null(x):
    """
    Flag the non-null values of a column

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    Examples
    --------
    >>> df.mutate(not_null = tp.is_not_null('x'))
    """
    column = _col_expr(x)
    present_mask = column.is_not_null()
    return present_mask
def is_null(x):
    """
    Flag the null values of a column

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    Examples
    --------
    >>> df.mutate(null = tp.is_null('x'))
    """
    column = _col_expr(x)
    null_mask = column.is_null()
    return null_mask
def coalesce(*args):
    """
    Coalesce missing values

    For each row, keeps the value of the first column that is not null,
    scanning the given columns from left to right.

    Parameters
    ----------
    args : Expr
        Columns to coalesce, in priority order. When a single column is
        given it is returned unchanged.

    Examples
    --------
    >>> df = tp.tibble(x = [None, 2], y = [1, None])
    >>> df.mutate(z = tp.coalesce(col('x'), col('y')))
    """
    args = _as_list(args)
    # Start from the highest-priority column and fold every fallback into it;
    # with a single column the loop body never runs and it is returned as is.
    expr = args[0]
    for fallback in args[1:]:
        expr = if_else(expr.is_null(), fallback, expr)
    return expr
def if_else(condition, true, false):
    """
    Vectorized if / else

    Parameters
    ----------
    condition : Expr
        A logical expression
    true :
        Value to use where the condition is true
    false :
        Value to use where the condition is false

    Examples
    --------
    >>> df = tp.tibble(x = range(1, 4))
    >>> df.mutate(if_x = tp.if_else(col('x') < 2, 1, 2))
    """
    when_true = pl.when(condition).then(true)
    return when_true.otherwise(false)
def lead(x, n: int = 1, default = None):
    """
    Get leading values

    Parameters
    ----------
    x : Expr, Series
        Column to operate on
    n : int
        Number of positions to lead by
    default : optional
        Value used to fill the positions left empty by the shift

    Examples
    --------
    >>> df.mutate(lead_x = tp.lead(col('x')))
    >>> df.mutate(lead_x = col('x').lead())
    """
    column = _col_expr(x)
    # Leading by n is expressed as a shift by -n
    offset = -n
    return column.shift(offset, fill_value = default)
def n_distinct(x):
    """
    Count the distinct values in a column

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    Examples
    --------
    >>> df.summarize(n_x = tp.n_distinct('x'))
    >>> df.summarize(n_x = tp.n_distinct(col('x')))
    """
    column = _col_expr(x)
    distinct_count = column.n_unique()
    return distinct_count
def rep(x, times = 1):
    """
    Replicate the values in x

    Parameters
    ----------
    x : const, Series
        Value or Series to repeat. Lists, polars DataFrames and other
        iterables are handled as well.
    times : int
        Number of times to repeat

    Raises
    ------
    ValueError
        If `x` matches none of the supported kinds of input.

    Examples
    --------
    >>> tp.rep(1, 3)
    >>> tp.rep(pl.Series(range(3)), 3)
    """
    # The order of these checks is significant and mirrors the helpers'
    # precedence: constant, Series, list, DataFrame, generic iterable.
    if _is_constant(x):
        values = [x]
    elif _is_series(x):
        values = x.to_list()
    elif _is_list(x):
        values = x
    elif isinstance(x, pl.DataFrame):
        # Imported here rather than at module level
        from .tibble_df import from_polars
        stacked = pl.concat([x for _ in range(times)])
        values = stacked.pipe(from_polars)
    elif _is_iterable(x):
        values = list(x)
    else:
        raise ValueError("Incompatible type")

    # List-like results are repeated and wrapped in a Series; anything else
    # (the DataFrame branch) has already been repeated above.
    return pl.Series(values * times) if _is_list(values) else values
def replace_null(x, replace = None):
    """
    Replace null values

    Parameters
    ----------
    x : Expr, Series
        Column to operate on
    replace : optional
        Value used to fill the nulls of `x`. When omitted (``None``), `x`
        is returned unchanged.

    Examples
    --------
    >>> df = tp.tibble(x = [0, None], y = [None, None])
    >>> df.mutate(x = tp.replace_null(col('x'), 1))
    """
    # Identity check rather than ``==``: a replacement that overloads ``==``
    # (e.g. an expression) may return a non-boolean whose truth value cannot
    # be evaluated, which would make the test itself raise.
    if replace is None:
        return x
    return x.fill_null(replace)
def round(x, digits = 0):
    """
    Round a column to a given number of decimal places

    Parameters
    ----------
    x : Expr, Series
        Column to operate on
    digits : int
        Decimals to round to

    Examples
    --------
    >>> df.mutate(x = tp.round(col('x')))
    """
    column = _col_expr(x)
    rounded = column.round(digits)
    return rounded
def row_number():
    """
    Return the row number, counting from 1

    Examples
    --------
    >>> df.mutate(row_num = tp.row_number())
    """
    # int_range starts at 0, so add 1 to make the numbering one-based
    zero_based = pl.int_range(0, pl.len())
    return zero_based + 1
def case_when(*args, _default = None):
    """
    Case when

    Parameters
    ----------
    *args : Expr
        When called with a single expression, returns pl.when() for chaining
        (e.g., tp.case_when(cond).then(val).otherwise(val)).
        When called with paired args (condition, value, condition, value, ...),
        builds the full case expression.
    _default : optional
        Default value when no condition is met (used with paired args)

    Raises
    ------
    ValueError
        If no positional argument is given, or if more than one is given
        and they do not form complete (condition, value) pairs.

    Examples
    --------
    >>> df = tp.tibble(x = range(1, 4))
    >>> # Chaining style
    >>> df.mutate(case_x = tp.case_when(col('x') < 2).then(0)
    ...                      .when(col('x') < 3).then(1)
    ...                      .otherwise(0))
    >>> # Paired args style
    >>> df.mutate(
    >>>    case_x = tp.case_when(col('x') < 2, 1,
    >>>                          col('x') < 3, 2,
    >>>                          _default = 0)
    >>> )
    """
    # Chaining style: hand back the open `when` so the caller continues it
    if len(args) == 1:
        return pl.when(args[0])

    # Paired style needs at least one complete (condition, value) pair;
    # fail early with a clear message instead of an obscure error later on.
    if not args or len(args) % 2 != 0:
        raise ValueError(
            "case_when() expects either a single condition or "
            f"(condition, value) pairs; got {len(args)} positional argument(s)"
        )

    conditions = args[0::2]
    values = [_str_to_lit(value) for value in args[1::2]]

    # The first pair opens the chain; the remaining pairs extend it in order
    expr = pl.when(conditions[0]).then(values[0])
    for condition, value in zip(conditions[1:], values[1:]):
        expr = expr.when(condition).then(value)

    return expr.otherwise(_str_to_lit(_default))
def map(cols, _fun):
    """
    Apply function by row

    Parameters
    ----------
    cols : list of str
        A list with the name of the columns in the data to apply function
    _fun : a function
        The function to apply to the columns. The function is applied to
        each row separately and receives one flat list of values, so it can
        refer to the entries of `cols` simply by index.
    """
    def _call_on_row(series_list):
        # map_groups hands over a list of lists; flatten it into a single
        # list before passing it to the user function
        row_values = [item for series in series_list for item in list(series)]
        return _fun(row_values)

    # One group per row: the grouping key is the row index itself
    row_index = pl.int_range(pl.len())
    return pl.map_groups(cols, _call_on_row).over(row_index)
def n_missing(x):
    """
    Count how many values of a column are null/missing

    Parameters
    ----------
    x : Expr, str
        Column to operate on

    Returns
    -------
    Expr
        Count of null values.

    Examples
    --------
    >>> df.summarize(missing = tp.n_missing('x'))
    """
    column = _col_expr(x)
    null_total = column.null_count()
    return null_total
def pct_missing(x):
    """
    Compute what percentage of a column's values are null/missing

    Parameters
    ----------
    x : Expr, str
        Column to operate on

    Returns
    -------
    Expr
        Percentage of null values (0 to 100).

    Examples
    --------
    >>> df.summarize(pct = tp.pct_missing('x'))
    """
    column = _col_expr(x)
    null_total = column.null_count()
    # Scale to a percentage before dividing by the row count
    scaled = null_total * 100.0
    return scaled / pl.len()