Source code for tidypolars_extra.helpers

import polars as pl
import polars.selectors as cs
import copy
from .utils import (
    _as_list,
    _col_expr,
    _col_exprs,
    )



# Names exported by ``from <module> import *``.
__all__ = [
    "contains",
    "ends_with",
    "everything",
    "starts_with",
    "matches",
    "desc",
    "across",
    "lag",
    "DescCol",
    "where",
]

def contains(match, ignore_case = True):
    """
    Build a regex that selects columns whose name contains a string

    The returned pattern is wrapped in ``^`` ... ``$`` and ``match`` is
    surrounded by ``.*`` so it may appear anywhere in the column name.
    ``match`` is inserted into the pattern as-is; regex metacharacters in
    it are not escaped.

    Parameters
    ----------
    match : str
        String to match columns

    ignore_case : bool
        If True, the default, ignores case when matching names.

    Returns
    -------
    str
        Regular expression pattern string.

    Examples
    --------
    >>> df = tp.tibble({'a': range(3), 'b': range(3), 'c': ['a', 'a', 'b']})
    >>> df.select(contains('c'))
    """
    # The leading wildcard must be '.*' (as in ends_with); '^*' would
    # quantify the anchor itself, which many regex engines reject.
    case_flag = "(?i)" if ignore_case else ""
    return f"^.*{case_flag}{match}.*$"
def ends_with(match, ignore_case = True):
    """
    Build a regex that selects columns whose name ends with a suffix

    ``match`` is inserted into the pattern as-is (it is not escaped).

    Parameters
    ----------
    match : str
        String to match columns

    ignore_case : bool
        If True, the default, ignores case when matching names.

    Returns
    -------
    str
        Pattern of the form ``^.*<match>$``, with an inline ``(?i)`` flag
        before ``match`` when case is ignored.

    Examples
    --------
    >>> df = tp.tibble({'a': range(3), 'b_code': range(3), 'c_code': ['a', 'a', 'b']})
    >>> df.select(ends_with('code'))
    """
    # Equality comparison (not truthiness) is kept on purpose so that only
    # values equal to True switch on case-insensitive matching.
    case_flag = "(?i)" if ignore_case == True else ""  # noqa: E712
    return f"^.*{case_flag}{match}$"
def everything():
    """
    Selects all columns

    Delegates to ``matches('.')``, i.e. a pattern that requires at least
    one character in the column name.

    Returns
    -------
    str
        Regular expression pattern string produced by ``matches``.

    Examples
    --------
    >>> df = tp.tibble({'a': range(3), 'b': range(3), 'c': ['a', 'a', 'b']})
    >>> df.select(everything())
    """
    any_name = matches('.')
    return any_name
def starts_with(match, ignore_case = True):
    """
    Build a regex that selects columns whose name starts with a prefix

    ``match`` is inserted into the pattern as-is (it is not escaped).

    Parameters
    ----------
    match : str
        String to match columns

    ignore_case : bool
        If True, the default, ignores case when matching names.

    Returns
    -------
    str
        Pattern of the form ``^<match>.*$``, with an inline ``(?i)`` flag
        before ``match`` when case is ignored.

    Examples
    --------
    >>> df = tp.tibble({'a': range(3), 'add': range(3), 'sub': ['a', 'a', 'b']})
    >>> df.select(starts_with('a'))
    """
    # Equality comparison (not truthiness) is kept on purpose so that only
    # values equal to True switch on case-insensitive matching.
    case_flag = "(?i)" if ignore_case == True else ""  # noqa: E712
    return f"^{case_flag}{match}.*$"
def matches(match, ignore_case = False):
    """
    Build a regex that selects columns matching a pattern

    The pattern is anchored at the start of the column name and followed
    by ``.*``, so ``match`` has to match from the first character of the
    name (the resulting pattern has the same shape as ``starts_with``).

    Parameters
    ----------
    match : str
        Regular expression to match columns

    ignore_case : bool
        If True, ignores case when matching names. Defaults to False.

    Returns
    -------
    str
        Pattern of the form ``^<match>.*$``, with an inline ``(?i)`` flag
        before ``match`` when case is ignored.

    Examples
    --------
    >>> df = tp.tibble({'a': range(3), 'add': range(3), 'sub': ['a', 'a', 'b']})
    >>> df.select(tp.matches('a'))
    """
    # Equality comparison (not truthiness) is kept on purpose so that only
    # values equal to True switch on case-insensitive matching.
    case_flag = "(?i)" if ignore_case == True else ""  # noqa: E712
    return f"^{case_flag}{match}.*$"
def desc(x):
    """
    Mark a column to order in descending

    Parameters
    ----------
    x : Expr, str
        Column to mark. It is shallow-copied first, so the object passed
        in is not modified.

    Returns
    -------
    DescCol
        The column expression produced by ``_col_expr``, re-tagged as a
        ``DescCol`` instance.
    """
    # Copy before converting so the class swap below never touches the
    # caller's own object.
    marked = _col_expr(copy.copy(x))
    marked.__class__ = DescCol
    return marked
class DescCol(pl.Expr):
    """
    Marker subclass of ``pl.Expr`` with no behavior of its own.

    ``desc`` re-tags an expression with this class to flag it for
    descending order.
    """
def across(cols, fn = lambda x: x, names_prefix = None, names_suffix = None):
    """
    Apply a function across a selection of columns

    Parameters
    ----------
    cols : list
        Columns to operate on

    fn : lambda
        A function or lambda to apply to each column. Defaults to the
        identity function.

    names_prefix : Optional - str
        Prefix applied to the names of the resulting columns

    names_suffix : Optional - str
        Suffix applied to the names of the resulting columns

    Returns
    -------
    list
        One expression per selected column.

    Examples
    --------
    >>> df = tp.tibble(x = ['a', 'a', 'b'], y = range(3), z = range(3))
    >>> df.mutate(across(['y', 'z'], lambda x: x * 2))
    >>> df.mutate(across(tp.Int64, lambda x: x * 2, names_prefix = "double_"))
    >>> df.summarize(across(['y', 'z'], tp.mean), by = 'x')
    """
    selected = _col_exprs(_as_list(cols))

    # Stage 1: apply fn to every column before any renaming happens.
    transformed = list(map(fn, selected))

    # Stage 2: optional renaming, prefix first and then suffix.
    if names_prefix is not None:
        prefixed = []
        for expr in transformed:
            prefixed.append(expr.name.prefix(names_prefix))
        transformed = prefixed

    if names_suffix is not None:
        suffixed = []
        for expr in transformed:
            suffixed.append(expr.name.suffix(names_suffix))
        transformed = suffixed

    return transformed
def lag(x, n: int = 1, default = None):
    """
    Get lagging values

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    n : int
        Number of positions to lag by

    default : optional
        Value to fill in missing values

    Returns
    -------
    Expr
        Result of calling ``shift(n, fill_value=default)`` on the column
        expression.

    Examples
    --------
    >>> df.mutate(lag_x = tp.lag(col('x')))
    >>> df.mutate(lag_x = tp.lag('x'))
    """
    column = _col_expr(x)
    shifted = column.shift(n, fill_value = default)
    return shifted
def where(col_type):
    """
    Select columns by type using a string

    Options:
       character : factor (ordered or unordered) and string
       string    : only strings, exclude factors
       factor    : ordered or unordered factors
       ordered   : only ordered factors
       unordered : only unordered factors
       numeric   : float or integer
       float     : only float
       integer   : only integer
       date      : date
       datetime  : date and time

    Parameters
    ----------
    col_type : str
        One of the option names listed above.

    Returns
    -------
    Selector
        The polars selector registered for ``col_type``.

    Raises
    ------
    KeyError
        If ``col_type`` is not one of the listed options.

    Examples
    --------
    >>> from tidypolars_extra.data import mtcars
    >>> df = mtcars
    >>> df.select(tp.where("integer"))
    >>> df.select(tp.where("numeric"))
    >>> df.select(tp.where("string") | tp.where("integer"))
    """
    # NOTE(review): "character", "factor" and "ordered" are defined by
    # exclusion, so they presumably also pick up any other dtype that is
    # not excluded (e.g. boolean or temporal columns) - confirm intended.
    selectors_by_name = {
        "character": cs.exclude(cs.numeric()),
        "string": cs.string(),
        'factor': cs.exclude(cs.string(), cs.numeric()),
        'ordered': cs.exclude(cs.string(), cs.categorical(), cs.numeric()),
        'unordered': cs.categorical(),
        "numeric": cs.numeric(),
        "float": cs.float(),
        "integer": cs.integer(),
        "date": cs.date(),
        "datetime": cs.datetime(),
    }
    return selectors_by_name[col_type]