import polars as pl
import functools as ft
from .utils import _as_list, _col_expr
from .funs import map
# Public API of this module: the names exported by a star-import.
# NOTE(review): "str_extract" and "str_extract_all" are listed below, but no
# definition for them is visible in this part of the file — confirm they are
# defined elsewhere, otherwise a star-import of this module will fail.
__all__ = [
"paste",
"paste0",
"str_c",
"str_count",
"str_detect",
"str_dup",
"str_extract",
"str_extract_all",
"str_length",
"str_pad",
"str_remove_all",
"str_remove",
"str_replace_all",
"str_replace",
"str_ends",
"str_split",
"str_squish",
"str_starts",
"str_sub",
"str_to_lower",
"str_to_title",
"str_to_upper",
"str_trim",
"str_wrap"
]
[docs]
def paste(*args, sep = ' '):
    """
    Concatenate strings together

    Parameters
    ----------
    args : Expr, str
        Columns and or strings to concatenate. Any argument that is not
        already a polars expression is wrapped in ``pl.lit``.
    sep : str
        Separator inserted between consecutive arguments
        (default: a single space).

    Returns
    -------
    Expr
        Expression producing the concatenated string.

    Examples
    --------
    >>> df = tp.tibble(x = ['a', 'b', 'c'])
    >>> df.mutate(x_end = tp.paste(col('x'), 'end', sep = '_'))
    """
    args = _as_list(args)
    exprs = [arg if isinstance(arg, pl.Expr) else pl.lit(arg) for arg in args]
    # Concatenate directly instead of building a "{}" template for
    # `pl.format`: a separator that itself contains "{}" would add extra
    # placeholders to the template and make the call fail.
    return pl.concat_str(exprs, separator=sep)
[docs]
def paste0(*args):
    """
    Concatenate strings together with no separator

    Thin wrapper that forwards to :func:`paste` with an empty separator.

    Parameters
    ----------
    args : Expr, str
        Columns and or strings to concatenate

    Examples
    --------
    >>> df = tp.tibble(x = ['a', 'b', 'c'])
    >>> df.mutate(xend = tp.paste0(col('x'), 'end'))
    """
    no_separator = ''
    return paste(*args, sep = no_separator)
[docs]
def str_c(*args, sep = ''):
    """
    Concatenate strings together.

    Forwards to :func:`paste`; the only difference is that the default
    separator here is the empty string.

    Parameters
    ----------
    args : Expr, str
        Columns and/or strings to concatenate
    sep : str
        Separator passed on to :func:`paste`

    Examples
    --------
    >>> df = tp.tibble(x = ['a', 'b', 'c'])
    >>> df.mutate(x_end = str_c(col('x'), 'end', sep = '_'))
    """
    joined = paste(*args, sep = sep)
    return joined
[docs]
def str_detect(string, pattern, negate = False):
    """
    Detect the presence or absence of a pattern in a string

    Parameters
    ----------
    string : Expr, str
        Column to operate on (passed through ``_col_expr``)
    pattern : str or list of str
        Pattern(s) to look for. When several patterns are given, the
        individual results are combined with ``&``, so all of them must match.
    negate : bool
        If True, return non-matching elements

    Returns
    -------
    Expr
        Boolean expression

    Examples
    --------
    >>> df = tp.tibble(name = ['apple', 'banana', 'pear', 'grape'])
    >>> df.mutate(x = str_detect('name', 'a'))
    >>> df.mutate(x = str_detect('name', ['a', 'e']))
    """
    patterns = [pattern] if isinstance(pattern, str) else pattern
    column = _col_expr(string)

    def _both(left, right):
        return left & right

    matched = ft.reduce(_both, [column.str.contains(p) for p in patterns])
    return matched.not_() if negate else matched
[docs]
def str_ends(string, pattern, negate = False):
    """
    Detect the presence or absence of a pattern at the end of a string.

    Parameters
    ----------
    string : Expr
        Column to operate on
    pattern : str or list of str
        Pattern(s) to look for. Each pattern is anchored to the end of the
        string; a list is forwarded to :func:`str_detect` as a list.
    negate : bool
        If True, return non-matching elements

    Returns
    -------
    Expr
        Boolean expression

    Examples
    --------
    >>> df = tp.tibble(words = ['apple', 'bear', 'amazing'])
    >>> df.filter(tp.str_ends(col('words'), 'ing'))
    """
    patterns = [pattern] if isinstance(pattern, str) else pattern
    # Wrap every pattern in a non-capturing group so that "$" anchors the
    # whole pattern: a bare "a|b" + "$" would parse as "a" OR "b$".
    anchored = [f"(?:{p})$" for p in patterns]
    return str_detect(string, anchored, negate)
[docs]
def str_length(string):
    """
    Length of a string

    Parameters
    ----------
    string : Expr, str
        Column to operate on (passed through ``_col_expr``)

    Returns
    -------
    Expr
        Result of ``Expr.str.len_chars()`` on the column

    Examples
    --------
    >>> df = tp.tibble(name = ['apple', 'banana', 'pear', 'grape'])
    >>> df.mutate(x = str_length(col('name')))
    """
    column = _col_expr(string)
    return column.str.len_chars()
[docs]
def str_starts(string, pattern, negate = False):
    """
    Detect the presence or absence of a pattern at the beginning of a string.

    Parameters
    ----------
    string : Expr
        Column to operate on
    pattern : str or list of str
        Pattern(s) to look for. Each pattern is anchored to the start of the
        string; a list is forwarded to :func:`str_detect` as a list.
    negate : bool
        If True, return non-matching elements

    Returns
    -------
    Expr
        Boolean expression

    Examples
    --------
    >>> df = tp.tibble(words = ['apple', 'bear', 'amazing'])
    >>> df.filter(tp.str_starts(col('words'), 'a'))
    """
    patterns = [pattern] if isinstance(pattern, str) else pattern
    # Wrap every pattern in a non-capturing group so that "^" anchors the
    # whole pattern: a bare "^" + "a|b" would parse as "^a" OR "b".
    anchored = [f"^(?:{p})" for p in patterns]
    return str_detect(string, anchored, negate)
[docs]
def str_sub(string, start = 0, end = None):
    """
    Extract portion of string based on start and end inputs

    Parameters
    ----------
    string : Expr, str
        Column to operate on (passed through ``_col_expr``)
    start : int
        Zero-based position of the first character to return
    end : int
        Position at which to stop (exclusive), so ``str_sub(x, 0, 3)``
        returns the first three characters. ``None`` (the default) returns
        everything from ``start`` to the end of the string.

    Returns
    -------
    Expr
        Expression producing the substring.

    Notes
    -----
    The substring length is computed as ``end - start``. Mixing a negative
    ``start`` with a non-negative ``end`` (or the reverse) cannot be turned
    into a length without knowing each string's length and is not handled
    specially.

    Examples
    --------
    >>> df = tp.tibble(name = ['apple', 'banana', 'pear', 'grape'])
    >>> df.mutate(x = str_sub(col('name'), 0, 3))
    """
    string = _col_expr(string)
    if end is None:
        return string.str.slice(start)
    # `Expr.str.slice` takes (offset, length), not (start, end), so the end
    # position has to be converted into a length first.
    length = end - start
    if isinstance(length, int):
        # An end position before the start yields an empty string.
        length = max(length, 0)
    return string.str.slice(start, length)
[docs]
def str_remove_all(string, pattern):
    """
    Removes all matched patterns in a string

    Implemented as :func:`str_replace_all` with an empty replacement.

    Parameters
    ----------
    string : Expr, str
        Column to operate on
    pattern : str
        Pattern to look for

    Examples
    --------
    >>> df = tp.tibble(name = ['apple', 'banana', 'pear', 'grape'])
    >>> df.mutate(x = str_remove_all(col('name'), 'a'))
    """
    return str_replace_all(string, pattern, replacement = "")
[docs]
def str_remove(string, pattern):
    """
    Removes the first matched patterns in a string

    Implemented as :func:`str_replace` with an empty replacement.

    Parameters
    ----------
    string : Expr, str
        Column to operate on
    pattern : str
        Pattern to look for

    Examples
    --------
    >>> df = tp.tibble(name = ['apple', 'banana', 'pear', 'grape'])
    >>> df.mutate(x = str_remove(col('name'), 'a'))
    """
    return str_replace(string, pattern, replacement = "")
[docs]
def str_replace_all(string, pattern, replacement):
    """
    Replaces all matched patterns in a string

    Parameters
    ----------
    string : Expr, str
        Column to operate on (passed through ``_col_expr``)
    pattern : str
        Pattern to look for
    replacement : str
        String that replaces anything that matches the pattern

    Returns
    -------
    Expr
        Result of ``Expr.str.replace_all(pattern, replacement)``

    Examples
    --------
    >>> df = tp.tibble(name = ['apple', 'banana', 'pear', 'grape'])
    >>> df.mutate(x = str_replace_all(col('name'), 'a', 'A'))
    """
    column = _col_expr(string)
    replaced = column.str.replace_all(pattern, replacement)
    return replaced
[docs]
def str_replace(string, pattern, replacement):
    """
    Replaces the first matched patterns in a string

    Parameters
    ----------
    string : Expr, str
        Column to operate on (passed through ``_col_expr``)
    pattern : str
        Pattern to look for
    replacement : str
        String that replaces anything that matches the pattern

    Returns
    -------
    Expr
        Result of ``Expr.str.replace(pattern, replacement)``

    Examples
    --------
    >>> df = tp.tibble(name = ['apple', 'banana', 'pear', 'grape'])
    >>> df.mutate(x = str_replace(col('name'), 'a', 'A'))
    """
    column = _col_expr(string)
    replaced = column.str.replace(pattern, replacement)
    return replaced
[docs]
def str_to_lower(string):
    """
    Convert a string to lower case

    Parameters
    ----------
    string : Expr, str
        Column whose case is converted (passed through ``_col_expr``)

    Returns
    -------
    Expr
        Result of ``Expr.str.to_lowercase()``

    Examples
    --------
    >>> df = tp.tibble(name = ['apple', 'banana', 'pear', 'grape'])
    >>> df.mutate(x = str_to_lower(col('name')))
    """
    column = _col_expr(string)
    return column.str.to_lowercase()
[docs]
def str_to_upper(string):
    """
    Convert a string to upper case

    Parameters
    ----------
    string : Expr, str
        Column whose case is converted (passed through ``_col_expr``)

    Returns
    -------
    Expr
        Result of ``Expr.str.to_uppercase()``

    Examples
    --------
    >>> df = tp.tibble(name = ['apple', 'banana', 'pear', 'grape'])
    >>> df.mutate(x = str_to_upper(col('name')))
    """
    column = _col_expr(string)
    return column.str.to_uppercase()
[docs]
def str_trim(string, side = "both"):
    """
    Trim whitespace

    Parameters
    ----------
    string : Expr, Series
        Column or series to operate on
    side : str
        One of:
            * "both"
            * "left"
            * "right"

    Raises
    ------
    ValueError
        If ``side`` is not one of the values listed above.

    Examples
    --------
    >>> df = tp.tibble(x = [' a ', ' b ', ' c '])
    >>> df.mutate(x = tp.str_trim(col('x')))
    """
    trimmed = _col_expr(string)
    if side not in ("both", "left", "right"):
        raise ValueError("side must be one of 'both', 'left', or 'right'")
    # "both" runs the left trim first and then the right trim.
    if side != "right":
        trimmed = _str_trim_left(trimmed)
    if side != "left":
        trimmed = _str_trim_right(trimmed)
    return trimmed
def _str_trim_left(x):
    """
    Strip whitespace from the start of each string in ``x``.
    """
    leading_whitespace = r"^\s*"
    return x.str.replace(leading_whitespace, "")
def _str_trim_right(x):
    """
    Remove trailing whitespace.

    Uses ``\\s`` so that every kind of whitespace is removed (newlines
    included), mirroring the leading-whitespace helper, which also
    matches on ``\\s``.
    """
    # The previous pattern "[ \t]+$" only stripped spaces and tabs.
    return x.str.replace(r"\s+$", "")
[docs]
def str_wrap(string, width, sep="list"):
    """
    Split a string into chunks of at most ``width`` characters

    Parameters
    ----------
    string : str
        Column name to operate on (passed through ``_col_expr``)
    width : int
        Maximum number of characters per chunk
    sep : string
        One of
            "list": return the list of chunks
            anything else (e.g. "\\n"): the chunks are handed to the
            project's ``map`` helper together with a function that joins
            them with ``sep`` — presumably yielding a single string per
            row; confirm against ``map``.
    """
    column = _col_expr(string)
    # Greedy runs of 1..width characters matched by ".".
    chunk_pattern = f"(.{{1,{width}}})"
    chunks = column.str.extract_all(chunk_pattern)
    if sep == 'list':
        return chunks
    return map(chunks, lambda row: f"{sep}".join(row[0]))
[docs]
def str_count(string, pattern):
    """
    Count occurrences of a pattern in a string

    Parameters
    ----------
    string : Expr, str
        Column to operate on (passed through ``_col_expr``)
    pattern : str
        Regular expression pattern to count

    Returns
    -------
    Expr
        Result of ``Expr.str.count_matches(pattern)``

    Examples
    --------
    >>> df.mutate(n = tp.str_count('x', 'a'))
    """
    column = _col_expr(string)
    n_matches = column.str.count_matches(pattern)
    return n_matches
[docs]
def str_pad(string, width, side = 'left', pad = ' '):
    """
    Pad a string to a specified width

    Parameters
    ----------
    string : Expr, str
        Column to operate on
    width : int
        Minimum width of resulting string
    side : str
        Side to pad on: 'left', 'right', or 'both'
    pad : str
        Character to pad with (single character)

    Returns
    -------
    Expr
        Expression producing the padded string. With ``side='both'`` the
        padding is split between the two sides; when it cannot be split
        evenly the extra character goes on the right.

    Raises
    ------
    ValueError
        If ``side`` is not 'left', 'right', or 'both'.

    Examples
    --------
    >>> df.mutate(padded = tp.str_pad('x', 10))
    """
    if side not in ('left', 'right', 'both'):
        raise ValueError("side must be one of 'left', 'right', or 'both'")
    string = _col_expr(string)
    if side == 'left':
        return string.str.pad_start(width, pad)
    if side == 'right':
        return string.str.pad_end(width, pad)
    # side == 'both': how much goes on the left depends on each string's own
    # length, so the left target has to be computed per row. A fixed target
    # such as (width + 1) // 2 does not centre the string.
    # Cast to a signed type so the subtraction below cannot underflow.
    n_chars = string.str.len_chars().cast(pl.Int64)
    missing = pl.when(n_chars < width).then(pl.lit(width) - n_chars).otherwise(0)
    left_target = n_chars + missing // 2
    return string.str.pad_start(left_target, pad).str.pad_end(width, pad)
[docs]
def str_split(string, pattern):
    """
    Split a string by a pattern

    Parameters
    ----------
    string : Expr, str
        Column to operate on (passed through ``_col_expr``)
    pattern : str
        Pattern to split on

    Returns
    -------
    Expr
        A list column with split parts.

    Examples
    --------
    >>> df.mutate(parts = tp.str_split('x', '_'))
    """
    column = _col_expr(string)
    # split() is used on purpose: it yields a list column, whereas splitn()
    # yields a struct, which is less ergonomic to work with.
    parts = column.str.split(pattern)
    return parts
[docs]
def str_squish(string):
    """
    Remove leading/trailing whitespace and collapse internal whitespace

    Every run of whitespace left after trimming is replaced by a single
    space.

    Parameters
    ----------
    string : Expr, str
        Column to operate on (passed through ``_col_expr``)

    Examples
    --------
    >>> df.mutate(clean = tp.str_squish('x'))
    """
    trimmed = _col_expr(string).str.strip_chars()
    return trimmed.str.replace_all(r'\s+', ' ')
[docs]
def str_to_title(string):
    """
    Convert string to Title Case

    Parameters
    ----------
    string : Expr, str
        Column to operate on (passed through ``_col_expr``)

    Returns
    -------
    Expr
        Result of ``Expr.str.to_titlecase()``

    Examples
    --------
    >>> df.mutate(titled = tp.str_to_title('x'))
    """
    column = _col_expr(string)
    return column.str.to_titlecase()
[docs]
def str_dup(string, times):
    """
    Duplicate/repeat a string

    Parameters
    ----------
    string : Expr, str
        Column to operate on
    times : int
        Number of times to repeat. ``0`` gives an empty string.

    Returns
    -------
    Expr
        Expression producing the repeated string.

    Raises
    ------
    ValueError
        If ``times`` is negative.

    Examples
    --------
    >>> df.mutate(repeated = tp.str_dup('x', 3))
    """
    if times < 0:
        raise ValueError("times must be a non-negative integer")
    string = _col_expr(string)
    if times == 0:
        # Zero repetitions: take a zero-length slice instead of handing
        # `concat_str` an empty list of expressions.
        return string.str.slice(0, 0)
    return pl.concat_str([string] * times, separator='')