import polars as pl
import polars.selectors as cs
import copy
from .utils import (
_as_list,
_col_expr,
_col_exprs,
)
# Names exported by `from <module> import *`.
__all__ = [
    "contains",
    "ends_with",
    "everything",
    "starts_with",
    "matches",
    "desc",
    "across",
    "lag",
    "DescCol",
    "where",
]
[docs]
def contains(match, ignore_case=True):
    """
    Build a column-name regex for names that contain a string

    Parameters
    ----------
    match : str
        String to match columns. It is interpolated into the pattern
        unescaped, so regex metacharacters keep their regex meaning.
    ignore_case : bool
        If True, the default, ignores case when matching names.

    Returns
    -------
    str
        A regex string of the form ``^.*<match>.*$``, with an inline
        ``(?i)`` flag in front of ``match`` when ``ignore_case`` is truthy.

    Examples
    --------
    >>> df = tp.tibble({'a': range(3), 'b': range(3), 'c': ['a', 'a', 'b']})
    >>> df.select(contains('c'))
    """
    # The prefix must be ".*" (any characters), not a bare "*": a quantifier
    # applied directly to the "^" anchor is not a wildcard.
    case_flag = "(?i)" if ignore_case else ""
    return f"^.*{case_flag}{match}.*$"
[docs]
def ends_with(match, ignore_case=True):
    """
    Build a column-name regex for names that end with a suffix

    Parameters
    ----------
    match : str
        String to match columns. It is interpolated into the pattern
        unescaped, so regex metacharacters keep their regex meaning.
    ignore_case : bool
        If True, the default, ignores case when matching names.

    Returns
    -------
    str
        A regex string of the form ``^.*<match>$``, with an inline
        ``(?i)`` flag in front of ``match`` when ``ignore_case`` is truthy.

    Examples
    --------
    >>> df = tp.tibble({'a': range(3), 'b_code': range(3), 'c_code': ['a', 'a', 'b']})
    >>> df.select(ends_with('code'))
    """
    # Only the optional inline flag differs between the two cases, so build
    # the pattern once instead of duplicating it per branch.
    case_flag = "(?i)" if ignore_case else ""
    return f"^.*{case_flag}{match}$"
[docs]
def everything():
    """
    Selects all columns

    Returns whatever ``matches`` produces for the single-character
    pattern ``.``.

    Examples
    --------
    >>> df = tp.tibble({'a': range(3), 'b': range(3), 'c': ['a', 'a', 'b']})
    >>> df.select(everything())
    """
    any_character = '.'
    return matches(any_character)
[docs]
def starts_with(match, ignore_case=True):
    """
    Build a column-name regex for names that start with a prefix

    Parameters
    ----------
    match : str
        String to match columns. It is interpolated into the pattern
        unescaped, so regex metacharacters keep their regex meaning.
    ignore_case : bool
        If True, the default, ignores case when matching names.

    Returns
    -------
    str
        A regex string of the form ``^<match>.*$``, with an inline
        ``(?i)`` flag in front of ``match`` when ``ignore_case`` is truthy.

    Examples
    --------
    >>> df = tp.tibble({'a': range(3), 'add': range(3), 'sub': ['a', 'a', 'b']})
    >>> df.select(starts_with('a'))
    """
    # Only the optional inline flag differs between the two cases, so build
    # the pattern once instead of duplicating it per branch.
    case_flag = "(?i)" if ignore_case else ""
    return f"^{case_flag}{match}.*$"
[docs]
def matches(match, ignore_case=False):
    """
    Build a column-name regex from a pattern

    Parameters
    ----------
    match : str
        Regex pattern to match columns. It is placed directly after the
        ``^`` anchor, so it is matched from the start of the name.
    ignore_case : bool
        If True, ignores case when matching names. Defaults to False.

    Returns
    -------
    str
        A regex string of the form ``^<match>.*$``, with an inline
        ``(?i)`` flag in front of ``match`` when ``ignore_case`` is truthy.

    Examples
    --------
    >>> df = tp.tibble({'a': range(3), 'add': range(3), 'sub': ['a', 'a', 'b']})
    >>> df.select(tp.matches('a'))
    """
    # Only the optional inline flag differs between the two cases, so build
    # the pattern once instead of duplicating it per branch.
    case_flag = "(?i)" if ignore_case else ""
    return f"^{case_flag}{match}.*$"
[docs]
def desc(x):
    """
    Mark a column to order in descending

    Parameters
    ----------
    x : object
        Column to mark; it is shallow-copied and then passed through
        ``_col_expr`` (presumably a column name or expression -- confirm
        against ``_col_expr`` in ``.utils``).

    Returns
    -------
    DescCol
        The converted expression with its class reassigned to ``DescCol``.
    """
    # Work on a shallow copy so the class reassignment below is not applied
    # to the caller's object (assuming `_col_expr` returns its argument
    # unchanged when it is already an expression -- TODO confirm).
    marked = _col_expr(copy.copy(x))
    marked.__class__ = DescCol
    return marked
[docs]
class DescCol(pl.Expr):
    """Expression subclass with no added behavior; `desc` assigns it as a marker."""
[docs]
def across(cols, fn = lambda x: x, names_prefix = None, names_suffix = None):
    """
    Apply a function across a selection of columns

    Parameters
    ----------
    cols : list
        Columns to operate on
    fn : lambda
        A function or lambda to apply to each column. Defaults to the
        identity function.
    names_prefix : Optional - str
        Prefix applied to each result via ``.name.prefix``
    names_suffix : Optional - str
        Suffix applied to each result via ``.name.suffix``

    Returns
    -------
    list
        One expression per selected column.

    Examples
    --------
    >>> df = tp.tibble(x = ['a', 'a', 'b'], y = range(3), z = range(3))
    >>> df.mutate(across(['y', 'z'], lambda x: x * 2))
    >>> df.mutate(across(tp.Int64, lambda x: x * 2, names_prefix = "double_"))
    >>> df.summarize(across(['y', 'z'], tp.mean), by = 'x')
    """
    selected = _col_exprs(_as_list(cols))
    results = list(map(fn, selected))

    # Renaming is applied after `fn`, prefix first and then suffix.
    if names_prefix is not None:
        results = [item.name.prefix(names_prefix) for item in results]
    if names_suffix is not None:
        results = [item.name.suffix(names_suffix) for item in results]

    return results
[docs]
def lag(x, n: int = 1, default = None):
    """
    Get lagging values

    Parameters
    ----------
    x : Expr, Series
        Column to operate on; converted with ``_col_expr`` first
    n : int
        Number of positions to lag by (passed to ``shift``)
    default : optional
        Value to fill in missing values (passed as ``fill_value``)

    Examples
    --------
    >>> df.mutate(lag_x = tp.lag(col('x')))
    >>> df.mutate(lag_x = tp.lag('x'))
    """
    column = _col_expr(x)
    shifted = column.shift(n, fill_value = default)
    return shifted
[docs]
def where(col_type):
    """
    Select columns by type using a string

    Options (described by the selector each one maps to):

    character : every column that is not numeric
    string : string columns
    factor : every column that is neither string nor numeric
    ordered : every column that is neither string, categorical nor numeric
    unordered : categorical columns
    numeric : float or integer
    float : only float
    integer : only integer
    date : date
    datetime : date and time

    Parameters
    ----------
    col_type : str
        One of the option names above.

    Raises
    ------
    KeyError
        If ``col_type`` is not one of the option names.

    Examples
    --------
    >>> from tidypolars_extra.data import mtcars
    >>> df = mtcars
    >>> df.select(tp.where("integer"))
    >>> df.select(tp.where("numeric"))
    >>> df.select(tp.where("string") | tp.where("integer"))
    """
    # NOTE(review): "character", "factor" and "ordered" are defined by
    # exclusion, so they also pick up any other non-excluded dtypes --
    # confirm this is intended.
    selector_for = {
        "character": cs.exclude(cs.numeric()),
        "string": cs.string(),
        'factor': cs.exclude(cs.string(), cs.numeric()),
        'ordered': cs.exclude(cs.string(), cs.categorical(), cs.numeric()),
        'unordered': cs.categorical(),
        "numeric": cs.numeric(),
        "float": cs.float(),
        "integer": cs.integer(),
        "date": cs.date(),
        "datetime": cs.datetime(),
    }
    return selector_for[col_type]