# Source code for tidypolars_extra.forcats
import polars as pl
from .utils import _col_expr
# Public API of this module: the names exported by a star-import.
__all__ = [
"fct_collapse",
"fct_infreq",
"fct_lump",
"fct_recode",
"fct_rev",
]
[docs]
def fct_infreq(df, col_name):
    """
    Reorder factor levels by frequency (most common first)

    Parameters
    ----------
    df : tibble
        The DataFrame containing the column
    col_name : str
        Name of the column to reorder

    Returns
    -------
    tibble
        DataFrame with column cast to Enum with levels ordered by frequency.
        Missing values are not counted as a level.

    Examples
    --------
    >>> df = tp.tibble(x=['a', 'b', 'a', 'a', 'b', 'c'])
    >>> df = tp.fct_infreq(df, 'x')
    """
    values = df.to_polars().get_column(col_name).cast(pl.Utf8)
    # Drop nulls before counting: value_counts would otherwise report null as
    # a value, and pl.Enum refuses null categories.
    counts = values.drop_nulls().value_counts(sort=True)
    levels = counts.get_column(col_name).to_list()
    dtype = pl.Enum(levels)
    # Go through Utf8 first so any existing categorical/enum encoding is
    # discarded before applying the new level order.
    return df.mutate(pl.col(col_name).cast(pl.Utf8).cast(dtype).alias(col_name))
[docs]
def fct_rev(df, col_name):
    """
    Reverse factor level order

    Parameters
    ----------
    df : tibble
        The DataFrame containing the column
    col_name : str
        Name of the column to reverse

    Returns
    -------
    tibble
        DataFrame with column cast to Enum with reversed level order.
        For an Enum column the declared categories are reversed; for any
        other dtype the levels are the distinct non-missing values, sorted
        as strings and then reversed.

    Examples
    --------
    >>> df = tp.tibble(x=['a', 'b', 'c'])
    >>> df = tp.fct_rev(df, 'x')
    """
    col_series = df.to_polars().get_column(col_name)
    dtype = col_series.dtype
    if isinstance(dtype, pl.Enum):
        # An Enum already carries an explicit level order; reverse that.
        levels = dtype.categories.to_list()
    else:
        # Categorical and plain columns alike: derive levels from the data.
        # Nulls are dropped because pl.Enum refuses null categories.
        levels = (col_series.cast(pl.Utf8)
                  .drop_nulls()
                  .unique()
                  .sort()
                  .to_list())
    levels.reverse()
    new_dtype = pl.Enum(levels)
    return df.mutate(pl.col(col_name).cast(pl.Utf8).cast(new_dtype).alias(col_name))
[docs]
def fct_lump(x, n=None, prop=None, other_level='Other'):
    """
    Collapse least frequent factor levels into 'Other'

    With ``n``, the cutoff is the n-th largest per-level count; every level
    whose count reaches the cutoff is kept, so levels tied with the n-th
    level are kept as well. With ``prop``, a level is kept when it accounts
    for at least that fraction of the rows.

    Parameters
    ----------
    x : Expr, str
        Factor/categorical column
    n : int, optional
        Number of most frequent levels to keep. Takes precedence over
        ``prop`` when both are given.
    prop : float, optional
        Minimum proportion to keep a level (0 to 1)
    other_level : str
        Label for collapsed levels (default: 'Other')

    Returns
    -------
    Expr
        Expression (string-typed) with infrequent levels replaced. When
        neither ``n`` nor ``prop`` is given the column is returned as
        strings, unchanged.

    Examples
    --------
    >>> df.mutate(x_lumped = tp.fct_lump('x', n=3))
    """
    x = _col_expr(x)
    x_str = x.cast(pl.Utf8)
    # For every row, the number of rows sharing that row's level.
    freq = x_str.len().over(x_str)
    other = pl.lit(other_level)

    if n is not None:
        # Rank levels, not rows: take one count per distinct level and use
        # the n-th largest as the cutoff. Ranking the per-row frequencies
        # densely would merge levels with equal counts into one rank and
        # keep more than n levels (e.g. counts 3, 3, 2 with n=2).
        level_counts = x_str.unique_counts().sort(descending=True)
        # n <= 0 yields an empty head -> null cutoff -> nothing is kept.
        cutoff = level_counts.head(max(n, 0)).min()
        return pl.when(freq >= cutoff).then(x_str).otherwise(other)

    if prop is not None:
        # Keep levels that appear in at least prop fraction of rows
        return pl.when(freq >= pl.len() * prop).then(x_str).otherwise(other)

    return x_str
[docs]
def fct_recode(x, **kwargs):
    """
    Manually recode factor levels

    All mappings are matched against the original values, so they apply
    simultaneously: ``fct_recode('x', b='a', c='b')`` turns 'a' into 'b' and
    'b' into 'c' (an 'a' does not end up as 'c').

    Parameters
    ----------
    x : Expr, str
        Factor/categorical column
    **kwargs
        Mapping of new_level = 'old_level' or new_level = ['old1', 'old2'].
        If the same old level is listed under several new levels, the first
        one given wins.

    Returns
    -------
    Expr
        Expression (string-typed) with recoded levels. Values not mentioned
        in any mapping are left as they are.

    Examples
    --------
    >>> df.mutate(x_recoded = tp.fct_recode('x', good='a', bad='b'))
    """
    original = _col_expr(x).cast(pl.Utf8)
    recoded = original
    # Build the chain from the last mapping to the first so that the first
    # mapping ends up outermost and takes precedence on duplicates.
    for new_level, old_levels in reversed(list(kwargs.items())):
        if isinstance(old_levels, str):
            old_levels = [old_levels]
        for old_level in old_levels:
            # Test the untouched input, not the partially recoded result;
            # otherwise an earlier replacement could be recoded again.
            recoded = (pl.when(original == old_level)
                       .then(pl.lit(new_level))
                       .otherwise(recoded))
    return recoded
[docs]
def fct_collapse(x, **kwargs):
    """
    Collapse multiple factor levels into one

    Every group is matched against the original values, so a level produced
    by one group is never collapsed again by a later group.

    Parameters
    ----------
    x : Expr, str
        Factor/categorical column
    **kwargs
        Mapping of new_level = ['old1', 'old2', ...]. A single string is
        accepted as a one-element group. If an old level appears in several
        groups, the first group given wins.

    Returns
    -------
    Expr
        Expression (string-typed) with collapsed levels. Values not listed
        in any group are left as they are.

    Examples
    --------
    >>> df.mutate(x_collapsed = tp.fct_collapse('x', ab=['a', 'b'], cd=['c', 'd']))
    """
    original = _col_expr(x).cast(pl.Utf8)
    collapsed = original
    # Build the chain from the last group to the first so that the first
    # group ends up outermost and takes precedence on overlaps.
    for new_level, old_levels in reversed(list(kwargs.items())):
        if isinstance(old_levels, str):
            old_levels = [old_levels]
        # Membership is tested on the untouched input, not on the partially
        # collapsed result, to prevent cascading replacements.
        collapsed = (pl.when(original.is_in(old_levels))
                     .then(pl.lit(new_level))
                     .otherwise(collapsed))
    return collapsed