import polars as pl
from .utils import (
_as_list,
_col_expr,
_col_exprs,
_is_constant,
_is_list,
_is_iterable,
_is_series,
_is_string,
_str_to_lit
)
# Public API of this module: the names exported by a star-import.
__all__ = [
"between", "case_when", "coalesce", "if_else",
"is_finite", "is_in", "is_infinite", "is_not", "is_not_in", "is_not_null", "is_null",
"lead", "map", "n_distinct", "n_missing", "pct_missing",
"rep", "replace_null", "round", "row_number",
]
[docs]
def between(x, left, right):
    """
    Check whether each value of a column lies within a range

    Parameters
    ----------
    x : Expr, Series
        Column to test
    left : int
        Lower bound; values greater than or equal to it pass
    right : int
        Upper bound; values less than or equal to it pass

    Examples
    --------
    >>> df = tp.tibble(x = range(4))
    >>> df.filter(tp.between(col('x'), 1, 3))
    """
    return _col_expr(x).is_between(left, right)
[docs]
def is_finite(x):
    """
    Flag the finite values of a column

    Parameters
    ----------
    x : Expr, Series
        Column to test

    Examples
    --------
    >>> df.mutate(finite = tp.is_finite('x'))
    """
    return _col_expr(x).is_finite()
[docs]
def is_in(x, values):
    """
    Flag the values of a column that appear in a list

    Parameters
    ----------
    x : Expr, Series
        Column to test
    values : list
        Values to look for

    Examples
    --------
    >>> df.mutate(in_list = tp.is_in('x', [1, 2]))
    """
    return _col_expr(x).is_in(values)
[docs]
def is_infinite(x):
    """
    Flag the infinite values of a column

    Parameters
    ----------
    x : Expr, Series
        Column to test

    Examples
    --------
    >>> df.mutate(infinite = tp.is_infinite('x'))
    """
    return _col_expr(x).is_infinite()
[docs]
def is_not(x):
    """
    Invert a boolean expression

    Parameters
    ----------
    x : Expr
        Boolean expression to invert. It is used as given; the `~`
        operator is applied to it directly.

    Examples
    --------
    >>> df.mutate(not_finite = tp.is_not(tp.is_finite(col('x'))))
    """
    negated = ~x
    return negated
[docs]
def is_not_in(x, values):
    """
    Flag the values of a column that do not appear in a list

    Parameters
    ----------
    x : Expr, Series
        Column to test
    values : list
        Values to look for

    Examples
    --------
    >>> df.mutate(not_in = tp.is_not_in('x', [1, 2]))
    """
    membership = _col_expr(x).is_in(values)
    return ~membership
[docs]
def is_not_null(x):
    """
    Flag the non-null values of a column

    Parameters
    ----------
    x : Expr, Series
        Column to test

    Examples
    --------
    >>> df.mutate(not_null = tp.is_not_null('x'))
    """
    return _col_expr(x).is_not_null()
[docs]
def is_null(x):
    """
    Flag the null values of a column

    Parameters
    ----------
    x : Expr, Series
        Column to test

    Examples
    --------
    >>> df.mutate(null = tp.is_null('x'))
    """
    return _col_expr(x).is_null()
[docs]
def coalesce(*args):
    """
    Coalesce missing values

    Row by row, takes the value from the first column and, wherever the
    running result is null, falls back to the next column in the order given.

    Parameters
    ----------
    args : Expr
        Columns to coalesce, in priority order. At least one is required;
        a single column is returned unchanged.

    Examples
    --------
    >>> df = tp.tibble(x = [None, 1], y = [2, 3])
    >>> df.mutate(z = tp.coalesce(col('x'), col('y')))
    """
    args = _as_list(args)
    # Seed with the highest-priority column, then fold each later column in
    # as the fallback for rows where the result so far is still null.
    expr = args[0]
    for fallback in args[1:]:
        expr = if_else(expr.is_null(), fallback, expr)
    return expr
[docs]
def if_else(condition, true, false):
    """
    Choose between two values based on a condition

    Parameters
    ----------
    condition : Expr
        A logical expression
    true :
        Result where the condition holds
    false :
        Result where the condition does not hold

    Examples
    --------
    >>> df = tp.tibble(x = range(1, 4))
    >>> df.mutate(if_x = tp.if_else(col('x') < 2, 1, 2))
    """
    branch = pl.when(condition).then(true)
    return branch.otherwise(false)
[docs]
def lead(x, n: int = 1, default = None):
    """
    Get leading values

    Parameters
    ----------
    x : Expr, Series
        Column to operate on
    n : int
        Number of positions to lead by
    default : optional
        Value used to fill the positions left empty by the shift

    Examples
    --------
    >>> df.mutate(lead_x = tp.lead(col('x')))
    """
    column = _col_expr(x)
    # Leading by n is a shift by -n.
    return column.shift(-n, fill_value = default)
[docs]
def n_distinct(x):
    """
    Count the distinct values in a column

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    Examples
    --------
    >>> df.summarize(n_x = tp.n_distinct('x'))
    >>> df.summarize(n_x = tp.n_distinct(col('x')))
    """
    return _col_expr(x).n_unique()
[docs]
def rep(x, times = 1):
    """
    Repeat the values in x

    Parameters
    ----------
    x : const, Series
        Value or Series to repeat. Lists, polars DataFrames and other
        iterables are handled as well; anything else raises ValueError.
    times : int
        Number of times to repeat

    Examples
    --------
    >>> tp.rep(1, 3)
    >>> tp.rep(pl.Series(range(3)), 3)
    """
    # The checks run in a fixed order; the first matching one decides how
    # x is turned into something repeatable.
    if _is_constant(x):
        values = [x]
    elif _is_series(x):
        values = x.to_list()
    elif _is_list(x):
        values = x
    elif isinstance(x, pl.DataFrame):
        from .tibble_df import from_polars
        # Data frames are repeated by stacking copies, then converted
        # with from_polars.
        stacked = pl.concat([x] * times)
        values = from_polars(stacked)
    elif _is_iterable(x):
        values = list(x)
    else:
        raise ValueError("Incompatible type")

    # Only list results still need repeating; the data-frame branch is
    # already complete.
    return pl.Series(values * times) if _is_list(values) else values
[docs]
def replace_null(x, replace = None):
    """
    Replace null values

    Parameters
    ----------
    x : Expr, Series
        Column to operate on
    replace : optional
        Value to put in place of nulls. When left as None, `x` is
        returned unchanged.

    Examples
    --------
    >>> df = tp.tibble(x = [0, None], y = [None, None])
    >>> df.mutate(x = tp.replace_null(col('x'), 1))
    """
    # Identity check rather than `==`: when `replace` is an expression-like
    # object, `replace == None` builds another such object instead of a bool,
    # and testing it in `if` fails.
    if replace is None:
        return x
    return x.fill_null(replace)
[docs]
def round(x, digits = 0):
    """
    Round a column to a given number of decimal places

    Parameters
    ----------
    x : Expr, Series
        Column to operate on
    digits : int
        Number of decimals to keep

    Examples
    --------
    >>> df.mutate(x = tp.round(col('x')))
    """
    return _col_expr(x).round(digits)
[docs]
def row_number():
    """
    Return the row number, counting from 1

    Examples
    --------
    >>> df.mutate(row_num = tp.row_number())
    """
    # int_range starts at 0; add one to make the numbering 1-based.
    zero_based = pl.int_range(0, pl.len())
    return zero_based + 1
[docs]
def case_when(*args, _default = None):
    """
    Case when

    Parameters
    ----------
    *args : Expr
        When called with a single expression, returns pl.when() for chaining
        (e.g., tp.case_when(cond).then(val).otherwise(val)).
        When called with paired args (condition, value, condition, value, ...),
        builds the full case expression.
    _default : optional
        Default value when no condition is met (used with paired args)

    Raises
    ------
    ValueError
        If no arguments are given, or if more than one argument is given
        and they do not form complete (condition, value) pairs.

    Examples
    --------
    >>> df = tp.tibble(x = range(1, 4))
    >>> # Chaining style
    >>> df.mutate(case_x = tp.case_when(col('x') < 2).then(0)
    ...                      .when(col('x') < 3).then(1)
    ...                      .otherwise(0))
    >>> # Paired args style
    >>> df.mutate(
    ...     case_x = tp.case_when(col('x') < 2, 1,
    ...                           col('x') < 3, 2,
    ...                           _default = 0)
    ... )
    """
    if len(args) == 1:
        # Chaining style: return the open `when` for the caller to finish.
        return pl.when(args[0])

    # Reject malformed calls up front instead of failing later with an
    # unrelated IndexError / UnboundLocalError.
    if len(args) == 0 or len(args) % 2 != 0:
        raise ValueError(
            "case_when() expects a single condition or (condition, value) "
            f"pairs; got {len(args)} positional argument(s)"
        )

    conditions = args[0::2]
    # Values (and the default below) go through _str_to_lit before use.
    # NOTE(review): presumably this wraps plain strings as literals so they
    # are not read as column names; confirm in utils.
    values = [_str_to_lit(value) for value in args[1::2]]

    pairs = zip(conditions, values)
    first_condition, first_value = next(pairs)
    expr = pl.when(first_condition).then(first_value)
    for condition, value in pairs:
        expr = expr.when(condition).then(value)

    return expr.otherwise(_str_to_lit(_default))
[docs]
def map(cols, _fun):
    """
    Apply function by row

    Parameters
    ----------
    cols : list of str
        A list with the name of the columns in the data to apply function
    _fun : a function
        The function to apply to the columns. The function is applied
        to each row separately
    """
    def _flatten(groups):
        # map_groups hands over a list of lists; merge them into one flat
        # list so _fun can refer to the columns in cols simply by index.
        flat = []
        for series in groups:
            flat.extend(series)
        return flat

    def _apply(groups):
        return _fun(_flatten(groups))

    mapped = pl.map_groups(cols, _apply)
    # Windowing over a per-row integer puts every row in its own group.
    return mapped.over(pl.int_range(pl.len()))
[docs]
def n_missing(x):
    """
    Count the null/missing values in a column

    Parameters
    ----------
    x : Expr, str
        Column to operate on

    Returns
    -------
    Expr
        Count of null values.

    Examples
    --------
    >>> df.summarize(missing = tp.n_missing('x'))
    """
    return _col_expr(x).null_count()
[docs]
def pct_missing(x):
    """
    Compute the percentage of null/missing values in a column

    Parameters
    ----------
    x : Expr, str
        Column to operate on

    Returns
    -------
    Expr
        Percentage of null values (0 to 100).

    Examples
    --------
    >>> df.summarize(pct = tp.pct_missing('x'))
    """
    null_total = _col_expr(x).null_count()
    # Scale to a percentage before dividing by the row count.
    return null_total * 100.0 / pl.len()