datafusion.functions.spark#
Spark-compatible function bindings.
These functions mirror the semantics of their Apache Spark counterparts
exactly. Some override DataFusion built-ins (substring is 1-indexed,
concat propagates NULL, round uses HALF_UP rounding, etc.), which is
why they live in a separate namespace rather than replacing the defaults.
For DataFrame use, import this module and call functions directly. For SQL
use, call datafusion.SessionContext.enable_spark_functions() to
register the Spark UDFs by name (overriding any built-ins with matching
names) before issuing SQL queries.
Functions#
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
|
Spark |
Module Contents#
- datafusion.functions.spark.abs(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
abs: absolute value.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select(dfn.functions.spark.abs(dfn.lit(-5)).alias("v")) >>> r.collect_column("v")[0].as_py() 5
- datafusion.functions.spark.add_months(start: datafusion.expr.Expr, months: datafusion.expr.Expr | int) datafusion.expr.Expr#
Spark
add_months: date + N months. months accepts a native int or an Expr. Examples
>>> import pyarrow as pa >>> from datetime import date >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> d = dfn.lit(pa.scalar(date(2020, 1, 15), type=pa.date32())) >>> r = df.select(dfn.functions.spark.add_months(d, 2).alias("v")) >>> r.collect_column("v")[0].as_py() datetime.date(2020, 3, 15)
- datafusion.functions.spark.array(*cols: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
array: builds an array from the given elements.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.array( ... dfn.lit(1), dfn.lit(2), dfn.lit(3) ... ).alias("v") ... ) >>> r.collect_column("v")[0].as_py() [1, 2, 3]
- datafusion.functions.spark.array_contains(col: datafusion.expr.Expr, value: datafusion.expr.Expr | Any) datafusion.expr.Expr#
Spark
array_contains: true if the array contains the element. value accepts a native Python literal or an Expr. Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.array_contains( ... dfn.functions.spark.array(dfn.lit(1), dfn.lit(2)), 1 ... ).alias("v") ... ) >>> r.collect_column("v")[0].as_py() True
- datafusion.functions.spark.array_repeat(col: datafusion.expr.Expr, count: datafusion.expr.Expr | int) datafusion.expr.Expr#
Spark
array_repeat: array of element repeated count times. count accepts a native int or an Expr. Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.array_repeat(dfn.lit("a"), 3).alias("v")) >>> r.collect_column("v")[0].as_py() ['a', 'a', 'a']
- datafusion.functions.spark.ascii(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
ascii: code point of the first character.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select(dfn.functions.spark.ascii(dfn.lit("A")).alias("v")) >>> r.collect_column("v")[0].as_py() 65
- datafusion.functions.spark.avg(col: datafusion.expr.Expr, distinct: bool | None = None, filter: datafusion.expr.Expr | None = None, order_by: list[datafusion.expr.SortKey] | datafusion.expr.SortKey | None = None, null_treatment: datafusion.common.NullTreatment | None = None) datafusion.expr.Expr#
Spark
avg: returns the mean of a numeric column.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]}) >>> r = df.aggregate( ... [], [dfn.functions.spark.avg(dfn.col("a")).alias("v")]) >>> r.collect_column("v")[0].as_py() 2.0
- datafusion.functions.spark.base64(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
base64: encode binary as a base64 string.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select(dfn.functions.spark.base64(dfn.lit(b"hi")).alias("v")) >>> r.collect_column("v")[0].as_py() 'aGk='
- datafusion.functions.spark.bin(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
bin: binary string representation of a long.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select(dfn.functions.spark.bin(dfn.lit(7)).alias("v")) >>> r.collect_column("v")[0].as_py() '111'
- datafusion.functions.spark.bit_count(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
bit_count: number of bits set in the integer’s binary form.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select(dfn.functions.spark.bit_count(dfn.lit(7)).alias("v")) >>> r.collect_column("v")[0].as_py() 3
- datafusion.functions.spark.bit_get(col: datafusion.expr.Expr, pos: datafusion.expr.Expr | str) datafusion.expr.Expr#
Spark
bit_get: returns the bit (0 or 1) at pos. A bare
str pos is treated as a column name (matching pyspark), not a literal; pass lit() for a literal position. Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.bit_get(dfn.lit(5), dfn.lit(0)).alias("v")) >>> r.collect_column("v")[0].as_py() 1
- datafusion.functions.spark.bitmap_bit_position(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
bitmap_bit_position: bit position for a child expression.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.bitmap_bit_position(dfn.lit(15)).alias("v")) >>> r.collect_column("v")[0].as_py() 14
- datafusion.functions.spark.bitmap_bucket_number(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
bitmap_bucket_number: bucket number for a child expression.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.bitmap_bucket_number(dfn.lit(15)).alias("v")) >>> r.collect_column("v")[0].as_py() 1
- datafusion.functions.spark.bitmap_count(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
bitmap_count: number of set bits in a bitmap.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.bitmap_count(dfn.lit(b"\xff")).alias("v")) >>> r.collect_column("v")[0].as_py() 8
- datafusion.functions.spark.bitwise_not(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
~: bitwise NOT.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select(dfn.functions.spark.bitwise_not(dfn.lit(0)).alias("v")) >>> r.collect_column("v")[0].as_py() -1
- datafusion.functions.spark.ceil(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
ceil: smallest integer ≥ arg.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select(dfn.functions.spark.ceil(dfn.lit(1.2)).alias("v")) >>> r.collect_column("v")[0].as_py() 2
- datafusion.functions.spark.char(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
char: ASCII character for a code point (mod 256).Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select(dfn.functions.spark.char(dfn.lit(65)).alias("v")) >>> r.collect_column("v")[0].as_py() 'A'
- datafusion.functions.spark.collect_list(col: datafusion.expr.Expr, distinct: bool | None = None, filter: datafusion.expr.Expr | None = None, order_by: list[datafusion.expr.SortKey] | datafusion.expr.SortKey | None = None, null_treatment: datafusion.common.NullTreatment | None = None) datafusion.expr.Expr#
Spark
collect_list: collect values into an array (preserves dups).Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"a": [1, 2, 2]}) >>> r = df.aggregate( ... [], [dfn.functions.spark.collect_list(dfn.col("a")).alias("v")]) >>> sorted(r.collect_column("v")[0].as_py()) [1, 2, 2]
- datafusion.functions.spark.collect_set(col: datafusion.expr.Expr, distinct: bool | None = None, filter: datafusion.expr.Expr | None = None, order_by: list[datafusion.expr.SortKey] | datafusion.expr.SortKey | None = None, null_treatment: datafusion.common.NullTreatment | None = None) datafusion.expr.Expr#
Spark
collect_set: collect distinct values into an array.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"a": [1, 2, 2, 3]}) >>> r = df.aggregate( ... [], [dfn.functions.spark.collect_set(dfn.col("a")).alias("v")]) >>> sorted(r.collect_column("v")[0].as_py()) [1, 2, 3]
- datafusion.functions.spark.concat(*cols: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
concat: concatenates strings; NULL if any input is NULL.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.concat(dfn.lit("a"), dfn.lit("b")).alias("v")) >>> r.collect_column("v")[0].as_py() 'ab'
- datafusion.functions.spark.crc32(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
crc32: cyclic redundancy check value as a bigint.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"s": ["ABC"]}) >>> r = df.select(dfn.functions.spark.crc32(dfn.col("s")).alias("v")) >>> r.collect_column("v")[0].as_py() 2743272264
- datafusion.functions.spark.csc(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
csc: cosecant.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select(dfn.functions.spark.csc(dfn.lit(1.5708)).alias("v")) >>> f"{r.collect_column('v')[0].as_py():.4f}" '1.0000'
- datafusion.functions.spark.date_add(start: datafusion.expr.Expr, days: datafusion.expr.Expr | int) datafusion.expr.Expr#
Spark
date_add: date + N days. days accepts a native int or an Expr. Examples
>>> import pyarrow as pa >>> from datetime import date >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> d = dfn.lit(pa.scalar(date(2020, 1, 15), type=pa.date32())) >>> r = df.select(dfn.functions.spark.date_add(d, 5).alias("v")) >>> r.collect_column("v")[0].as_py() datetime.date(2020, 1, 20)
- datafusion.functions.spark.date_diff(end: datafusion.expr.Expr, start: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
date_diff: number of days from start to end. Examples
>>> import pyarrow as pa >>> from datetime import date >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> d = dfn.lit(pa.scalar(date(2020, 1, 15), type=pa.date32())) >>> end = dfn.lit(pa.scalar(date(2020, 1, 20), type=pa.date32())) >>> r = df.select(dfn.functions.spark.date_diff(end, d).alias("v")) >>> r.collect_column("v")[0].as_py() 5
- datafusion.functions.spark.date_part(field: datafusion.expr.Expr | str, source: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
date_part: extract field from a date/time/timestamp. field accepts a native str or an Expr. Examples
>>> import pyarrow as pa >>> from datetime import date >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> d = dfn.lit(pa.scalar(date(2020, 1, 15), type=pa.date32())) >>> r = df.select( ... dfn.functions.spark.date_part("year", d).alias("v")) >>> r.collect_column("v")[0].as_py() 2020
- datafusion.functions.spark.date_sub(start: datafusion.expr.Expr, days: datafusion.expr.Expr | int) datafusion.expr.Expr#
Spark
date_sub: date - N days. days accepts a native int or an Expr. Examples
>>> import pyarrow as pa >>> from datetime import date >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> d = dfn.lit(pa.scalar(date(2020, 1, 15), type=pa.date32())) >>> r = df.select(dfn.functions.spark.date_sub(d, 5).alias("v")) >>> r.collect_column("v")[0].as_py() datetime.date(2020, 1, 10)
- datafusion.functions.spark.date_trunc(format: datafusion.expr.Expr | str, timestamp: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
date_trunc: truncate timestamp to the unit given by format. format accepts a native str or an Expr. Examples
>>> import pyarrow as pa >>> from datetime import datetime >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> ts = dfn.lit( ... pa.scalar(datetime(2020, 1, 15, 14, 30, 45), ... type=pa.timestamp('us'))) >>> r = df.select( ... dfn.functions.spark.date_trunc("month", ts).alias("v")) >>> r.collect_column("v")[0].as_py() datetime.datetime(2020, 1, 1, 0, 0)
- datafusion.functions.spark.elt(*inputs: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
elt: returns the n-th input (1-indexed).Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.elt( ... dfn.lit(2), dfn.lit("a"), dfn.lit("b") ... ).alias("v") ... ) >>> r.collect_column("v")[0].as_py() 'b'
- datafusion.functions.spark.expm1(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
expm1: exp(arg) - 1.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select(dfn.functions.spark.expm1(dfn.lit(0.0)).alias("v")) >>> r.collect_column("v")[0].as_py() 0.0
- datafusion.functions.spark.factorial(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
factorial: n! for n in [0..20], else NULL.Examples
>>> import pyarrow as pa >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.factorial( ... dfn.lit(pa.scalar(5, type=pa.int32())) ... ).alias("v") ... ) >>> r.collect_column("v")[0].as_py() 120
- datafusion.functions.spark.floor(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
floor: largest integer ≤ arg.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select(dfn.functions.spark.floor(dfn.lit(1.8)).alias("v")) >>> r.collect_column("v")[0].as_py() 1
- datafusion.functions.spark.format_string(format: str | datafusion.expr.Expr, *cols: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
format_string: printf-style format string. format is the printf-style template (a plain str is auto-promoted to a literal expression); remaining args are values to substitute. Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.format_string( ... "%d-%s", dfn.lit(42), dfn.lit("hi") ... ).alias("v") ... ) >>> r.collect_column("v")[0].as_py() '42-hi'
- datafusion.functions.spark.from_utc_timestamp(timestamp: datafusion.expr.Expr, tz: datafusion.expr.Expr | str) datafusion.expr.Expr#
Spark
from_utc_timestamp: interpret timestamp as UTC, convert to tz. tz accepts a native str or an Expr. Examples
>>> import pyarrow as pa >>> from datetime import datetime >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> ts = dfn.lit( ... pa.scalar(datetime(2020, 1, 15, 14, 30, 45), ... type=pa.timestamp('us'))) >>> r = df.select( ... dfn.functions.spark.from_utc_timestamp(ts, "UTC").alias("v")) >>> r.collect_column("v")[0].as_py() datetime.datetime(2020, 1, 15, 14, 30, 45)
- datafusion.functions.spark.hex(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
hex: hexadecimal representation.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select(dfn.functions.spark.hex(dfn.lit(255)).alias("v")) >>> r.collect_column("v")[0].as_py() 'FF'
- datafusion.functions.spark.hour(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
hour: extract hour component of a timestamp.Examples
>>> import pyarrow as pa >>> from datetime import datetime >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> ts = dfn.lit( ... pa.scalar(datetime(2020, 1, 15, 14, 30, 45), ... type=pa.timestamp('us'))) >>> r = df.select(dfn.functions.spark.hour(ts).alias("v")) >>> r.collect_column("v")[0].as_py() 14
- datafusion.functions.spark.if_(condition: datafusion.expr.Expr, if_true: datafusion.expr.Expr | Any, if_false: datafusion.expr.Expr | Any) datafusion.expr.Expr#
Spark
if: returns if_true when condition is true, else if_false. Exposed as
if_ because if is a Python keyword. if_true and if_false accept native Python literals or Expr. Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.if_( ... dfn.lit(2) > dfn.lit(1), "big", "small" ... ).alias("v") ... ) >>> r.collect_column("v")[0].as_py() 'big'
- datafusion.functions.spark.ilike(str: datafusion.expr.Expr, pattern: datafusion.expr.Expr | str, escapeChar: str | None = None) datafusion.expr.Expr#
Spark
ilike: case-insensitive pattern match. A bare
str pattern is treated as a column name (matching pyspark), not a literal; pass lit() for a literal pattern. escapeChar is accepted for pyspark parity but is not yet wired through the Rust binding; passing a non-None value raises NotImplementedError. Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.ilike(dfn.lit("HELLO"), dfn.lit("h%")).alias("v")) >>> r.collect_column("v")[0].as_py() True
- datafusion.functions.spark.is_valid_utf8(str: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
is_valid_utf8: true if the string is valid UTF-8.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.is_valid_utf8(dfn.lit("hello")).alias("v")) >>> r.collect_column("v")[0].as_py() True
- datafusion.functions.spark.json_tuple(col: datafusion.expr.Expr, *fields: datafusion.expr.Expr | str) datafusion.expr.Expr#
Spark
json_tuple: extract top-level fields from a JSON string. Each field name accepts a native
str or an Expr. Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.json_tuple( ... dfn.lit('{"a":1,"b":"x"}'), "a", "b" ... ).alias("v") ... ) >>> r.collect_column("v")[0].as_py() {'c0': '1', 'c1': 'x'}
- datafusion.functions.spark.last_day(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
last_day: last day of the month containing the date.Examples
>>> import pyarrow as pa >>> from datetime import date >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> d = dfn.lit(pa.scalar(date(2020, 1, 15), type=pa.date32())) >>> r = df.select(dfn.functions.spark.last_day(d).alias("v")) >>> r.collect_column("v")[0].as_py() datetime.date(2020, 1, 31)
- datafusion.functions.spark.length(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
length: character length of a string, or byte length of binary.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select(dfn.functions.spark.length(dfn.lit("hello")).alias("v")) >>> r.collect_column("v")[0].as_py() 5
- datafusion.functions.spark.like(str: datafusion.expr.Expr, pattern: datafusion.expr.Expr | str, escapeChar: str | None = None) datafusion.expr.Expr#
Spark
like: case-sensitive pattern match. A bare
str pattern is treated as a column name (matching pyspark), not a literal; pass lit() for a literal pattern. escapeChar is accepted for pyspark parity but is not yet wired through the Rust binding; passing a non-None value raises NotImplementedError. Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.like(dfn.lit("hello"), dfn.lit("h%")).alias("v")) >>> r.collect_column("v")[0].as_py() True
- datafusion.functions.spark.luhn_check(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
luhn_check: true if the digit string passes the Luhn check.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.luhn_check( ... dfn.lit("4111111111111111") ... ).alias("v") ... ) >>> r.collect_column("v")[0].as_py() True
- datafusion.functions.spark.make_dt_interval(days: datafusion.expr.Expr | int | None = None, hours: datafusion.expr.Expr | int | None = None, mins: datafusion.expr.Expr | int | None = None, secs: datafusion.expr.Expr | float | None = None) datafusion.expr.Expr#
Spark
make_dt_interval: day-time interval from components. All parts are optional; omitted parts default to zero, matching pyspark. Integer parts accept a native
int and secs accepts a float, or any part may be an Expr. Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select(dfn.functions.spark.make_dt_interval().alias("v")) >>> r.collect_column("v")[0].as_py() datetime.timedelta(0)
>>> r = df.select( ... dfn.functions.spark.make_dt_interval( ... days=1, hours=2, mins=3, secs=4.5 ... ).alias("v") ... ) >>> r.collect_column("v")[0].as_py() datetime.timedelta(days=1, seconds=7384, microseconds=500000)
- datafusion.functions.spark.make_interval(years: datafusion.expr.Expr | int | None = None, months: datafusion.expr.Expr | int | None = None, weeks: datafusion.expr.Expr | int | None = None, days: datafusion.expr.Expr | int | None = None, hours: datafusion.expr.Expr | int | None = None, mins: datafusion.expr.Expr | int | None = None, secs: datafusion.expr.Expr | float | None = None) datafusion.expr.Expr#
Spark
make_interval: interval from year/month/week/day/hour/min/sec parts. All parts are optional; omitted parts default to zero, matching pyspark. Integer parts accept a native
int and secs accepts a float, or any part may be an Expr. Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select(dfn.functions.spark.make_interval().alias("v")) >>> r.collect_column("v")[0].as_py().months 0
>>> r = df.select(dfn.functions.spark.make_interval(years=1).alias("v")) >>> r.collect_column("v")[0].as_py().months 12
- datafusion.functions.spark.make_valid_utf8(str: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
make_valid_utf8: replace invalid UTF-8 bytes with U+FFFD.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.make_valid_utf8(dfn.lit("hello")).alias("v")) >>> r.collect_column("v")[0].as_py() 'hello'
- datafusion.functions.spark.map_from_arrays(col1: datafusion.expr.Expr, col2: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
map_from_arrays: build a map from parallel key/value arrays.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> keys = dfn.functions.spark.array(dfn.lit("a"), dfn.lit("b")) >>> vals = dfn.functions.spark.array(dfn.lit(1), dfn.lit(2)) >>> r = df.select( ... dfn.functions.spark.map_from_arrays(keys, vals).alias("v")) >>> r.collect_column("v")[0].as_py() [('a', 1), ('b', 2)]
- datafusion.functions.spark.map_from_entries(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
map_from_entries: build a map from an array of key/value structs. col must be an array whose elements are two-field structs; the first field becomes the map key and the second the value. Examples
>>> import pyarrow as pa >>> ctx = dfn.SessionContext() >>> entry_type = pa.list_( ... pa.struct([("key", pa.string()), ("value", pa.int64())])) >>> entries = pa.array( ... [[{"key": "a", "value": 1}, {"key": "b", "value": 2}]], ... type=entry_type) >>> df = ctx.from_arrow(pa.record_batch([entries], names=["e"])) >>> r = df.select( ... dfn.functions.spark.map_from_entries(dfn.col("e")).alias("v")) >>> r.collect_column("v")[0].as_py() [('a', 1), ('b', 2)]
- datafusion.functions.spark.minute(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
minute: extract minute component of a timestamp.Examples
>>> import pyarrow as pa >>> from datetime import datetime >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> ts = dfn.lit( ... pa.scalar(datetime(2020, 1, 15, 14, 30, 45), ... type=pa.timestamp('us'))) >>> r = df.select(dfn.functions.spark.minute(ts).alias("v")) >>> r.collect_column("v")[0].as_py() 30
- datafusion.functions.spark.modulus(dividend: datafusion.expr.Expr | float, divisor: datafusion.expr.Expr | float) datafusion.expr.Expr#
Spark
mod: remainder of dividend / divisor (sign follows dividend). dividend and divisor accept native numbers or Expr. Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.modulus(dfn.lit(10), dfn.lit(3)).alias("v")) >>> r.collect_column("v")[0].as_py() 1
- datafusion.functions.spark.negative(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
negative: unary minus.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select(dfn.functions.spark.negative(dfn.lit(3)).alias("v")) >>> r.collect_column("v")[0].as_py() -3
- datafusion.functions.spark.next_day(date: datafusion.expr.Expr, dayOfWeek: datafusion.expr.Expr | str) datafusion.expr.Expr#
Spark
next_day: first date later than date that falls on the weekday dayOfWeek. dayOfWeek accepts a native str or an Expr. Examples
>>> import pyarrow as pa >>> from datetime import date >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> d = dfn.lit(pa.scalar(date(2020, 1, 15), type=pa.date32())) >>> r = df.select(dfn.functions.spark.next_day(d, "Mon").alias("v")) >>> r.collect_column("v")[0].as_py() datetime.date(2020, 1, 20)
- datafusion.functions.spark.parse_url(url: datafusion.expr.Expr, partToExtract: datafusion.expr.Expr | str, key: datafusion.expr.Expr | str | None = None) datafusion.expr.Expr#
Spark
parse_url: extract a part from a URL; errors on invalid URLs. partToExtract is one of "HOST", "PATH", "QUERY", "REF", "PROTOCOL", "FILE", "AUTHORITY", "USERINFO". Pass key only with "QUERY" to extract a single parameter. Bare str values for partToExtract/key are treated as column names (matching pyspark); pass lit() for a literal. Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.parse_url( ... dfn.lit("http://example.com/path?q=1"), dfn.lit("HOST") ... ).alias("v") ... ) >>> r.collect_column("v")[0].as_py() 'example.com'
>>> r = df.select( ... dfn.functions.spark.parse_url( ... dfn.lit("http://example.com/path?q=1"), ... dfn.lit("QUERY"), ... key=dfn.lit("q"), ... ).alias("v") ... ) >>> r.collect_column("v")[0].as_py() '1'
- datafusion.functions.spark.pmod(dividend: datafusion.expr.Expr | float, divisor: datafusion.expr.Expr | float) datafusion.expr.Expr#
Spark
pmod: positive remainder of division. dividend and divisor accept native numbers or Expr. Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.pmod(dfn.lit(-1), dfn.lit(3)).alias("v")) >>> r.collect_column("v")[0].as_py() 2
- datafusion.functions.spark.rint(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
rint: round to nearest mathematical integer (as double).Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select(dfn.functions.spark.rint(dfn.lit(2.5)).alias("v")) >>> r.collect_column("v")[0].as_py() 2.0
- datafusion.functions.spark.round(col: datafusion.expr.Expr, scale: datafusion.expr.Expr | int | None = None) datafusion.expr.Expr#
Spark
round: round to scale decimal places, HALF_UP rounding. scale defaults to zero when omitted, matching pyspark, and accepts a native int or an Expr. Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select(dfn.functions.spark.round(dfn.lit(2.5)).alias("v")) >>> r.collect_column("v")[0].as_py() 3.0
>>> r = df.select( ... dfn.functions.spark.round(dfn.lit(2.345), scale=2).alias("v")) >>> r.collect_column("v")[0].as_py() 2.35
- datafusion.functions.spark.sec(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
sec: secant.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select(dfn.functions.spark.sec(dfn.lit(0.0)).alias("v")) >>> r.collect_column("v")[0].as_py() 1.0
- datafusion.functions.spark.second(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
second: extract second component of a timestamp.Examples
>>> import pyarrow as pa >>> from datetime import datetime >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> ts = dfn.lit( ... pa.scalar(datetime(2020, 1, 15, 14, 30, 45), ... type=pa.timestamp('us'))) >>> r = df.select(dfn.functions.spark.second(ts).alias("v")) >>> r.collect_column("v")[0].as_py() 45
- datafusion.functions.spark.sha1(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
sha1: SHA-1 hash as a hex string.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"s": ["hello"]}) >>> r = df.select(dfn.functions.spark.sha1(dfn.col("s")).alias("v")) >>> r.collect_column("v")[0].as_py() 'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d'
- datafusion.functions.spark.sha2(col: datafusion.expr.Expr, numBits: datafusion.expr.Expr | int) datafusion.expr.Expr#
Spark
sha2: SHA-2 family hash (224, 256, 384, 512). Bit length 0 = 256. numBits accepts a native int or an Expr. Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"s": ["hello"]}) >>> r = df.select( ... dfn.functions.spark.sha2(dfn.col("s"), 256).alias("v")) >>> r.collect_column("v")[0].as_py() '2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824'
- datafusion.functions.spark.shiftleft(col: datafusion.expr.Expr, numBits: datafusion.expr.Expr | int) datafusion.expr.Expr#
Spark
shiftleft: col shifted left by numBits bits. numBits accepts a native int or an Expr. Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.shiftleft(dfn.lit(1), 3).alias("v")) >>> r.collect_column("v")[0].as_py() 8
- datafusion.functions.spark.shiftright(col: datafusion.expr.Expr, numBits: datafusion.expr.Expr | int) datafusion.expr.Expr#
Spark
shiftright: arithmetic right shift. numBits accepts a native int or an Expr. Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.shiftright(dfn.lit(8), 2).alias("v")) >>> r.collect_column("v")[0].as_py() 2
- datafusion.functions.spark.shiftrightunsigned(col: datafusion.expr.Expr, numBits: datafusion.expr.Expr | int) datafusion.expr.Expr#
Spark
shiftrightunsigned: logical (unsigned) right shift. numBits accepts a native int or an Expr. Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.shiftrightunsigned(dfn.lit(8), 2).alias("v")) >>> r.collect_column("v")[0].as_py() 2
- datafusion.functions.spark.shuffle(col: datafusion.expr.Expr, seed: int | None = None) datafusion.expr.Expr#
Spark
shuffle: returns a random permutation of the input array. seed is accepted for pyspark parity but is not yet wired through the Rust binding; passing a non-None value raises NotImplementedError. Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.shuffle( ... dfn.functions.spark.array(dfn.lit(1), dfn.lit(2), dfn.lit(3)) ... ).alias("v") ... ) >>> sorted(r.collect_column("v")[0].as_py()) [1, 2, 3]
- datafusion.functions.spark.size(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark
size: length of an array or map.Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.size( ... dfn.functions.spark.array(dfn.lit(1), dfn.lit(2), dfn.lit(3)) ... ).alias("v") ... ) >>> r.collect_column("v")[0].as_py() 3
- datafusion.functions.spark.slice(x: datafusion.expr.Expr, start: datafusion.expr.Expr | int, length: datafusion.expr.Expr | int) datafusion.expr.Expr#
Spark
slice: subset of the array starting at 1-indexed start, length elements long. Negative
start counts from the end. start and length accept native int values or Expr. Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.slice( ... dfn.functions.spark.array( ... dfn.lit(1), dfn.lit(2), dfn.lit(3), dfn.lit(4)), ... 2, 2, ... ).alias("v") ... ) >>> r.collect_column("v")[0].as_py() [2, 3]
- datafusion.functions.spark.soundex(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark `soundex`: Soundex phonetic code.
Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select(dfn.functions.spark.soundex(dfn.lit("Robert")).alias("v")) >>> r.collect_column("v")[0].as_py() 'R163'
- datafusion.functions.spark.space(col: datafusion.expr.Expr | int) datafusion.expr.Expr#
Spark `space`: string of n spaces.
`col` accepts a native `int` or an `Expr`.
Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select(dfn.functions.spark.space(3).alias("v")) >>> r.collect_column("v")[0].as_py() '   '
- datafusion.functions.spark.spark_cast(arg: datafusion.expr.Expr, type_str: datafusion.expr.Expr | str) datafusion.expr.Expr#
Spark `cast`: cast `arg` to the type named by `type_str`.
Uses Spark cast semantics (e.g. overflow returns NULL, not error). `type_str` accepts a native `str` or an `Expr`.
Currently only supports casting numeric values to `"timestamp"`.
Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.spark_cast( ... dfn.lit(1579098645), "timestamp" ... ).alias("v") ... ) >>> r.collect_column("v")[0].as_py().isoformat() '2020-01-15T14:30:45+00:00'
- datafusion.functions.spark.str_to_map(text: datafusion.expr.Expr, pairDelim: datafusion.expr.Expr | str | None = None, keyValueDelim: datafusion.expr.Expr | str | None = None) datafusion.expr.Expr#
Spark `str_to_map`: split text into key/value pairs using delimiters.
Delimiters default to `","` and `":"` when omitted, matching pyspark. Parameter names match `pyspark.sql.functions.str_to_map`; pyspark types the delimiters as column-or-name, so a bare `str` is treated as a column name. Pass `lit()` for a literal delimiter.
Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.str_to_map(dfn.lit("a:1,b:2")).alias("v")) >>> r.collect_column("v")[0].as_py() [('a', '1'), ('b', '2')]
>>> r = df.select( ... dfn.functions.spark.str_to_map( ... dfn.lit("a=1;b=2"), ... pairDelim=dfn.lit(";"), ... keyValueDelim=dfn.lit("="), ... ).alias("v") ... ) >>> r.collect_column("v")[0].as_py() [('a', '1'), ('b', '2')]
- datafusion.functions.spark.substring(str: datafusion.expr.Expr, pos: datafusion.expr.Expr | int, len: datafusion.expr.Expr | int) datafusion.expr.Expr#
Spark `substring`: 1-indexed substring starting at `pos` with length `len`.
Negative `pos` counts from the end. `pos` and `len` accept native `int` values or `Expr`.
Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.substring(dfn.lit("hello"), 1, 3).alias("v")) >>> r.collect_column("v")[0].as_py() 'hel'
- datafusion.functions.spark.time_trunc(unit: datafusion.expr.Expr | str, time: datafusion.expr.Expr) datafusion.expr.Expr#
Spark `time_trunc`: truncate the `time` value to the given `unit`.
A bare `str` `unit` is treated as a column name (matching pyspark), not a literal; pass `lit()` for a literal unit.
Examples
>>> import pyarrow as pa >>> from datetime import time >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> t = dfn.lit(pa.scalar(time(14, 30, 45), type=pa.time64('us'))) >>> r = df.select( ... dfn.functions.spark.time_trunc(dfn.lit("hour"), t).alias("v")) >>> r.collect_column("v")[0].as_py() datetime.time(14, 0)
- datafusion.functions.spark.to_utc_timestamp(timestamp: datafusion.expr.Expr, tz: datafusion.expr.Expr | str) datafusion.expr.Expr#
Spark `to_utc_timestamp`: interpret `timestamp` as local time in `tz` and convert it to UTC.
`tz` accepts a native `str` or an `Expr`.
Examples
>>> import pyarrow as pa >>> from datetime import datetime >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> ts = dfn.lit( ... pa.scalar(datetime(2020, 1, 15, 14, 30, 45), ... type=pa.timestamp('us'))) >>> r = df.select( ... dfn.functions.spark.to_utc_timestamp(ts, "UTC").alias("v")) >>> r.collect_column("v")[0].as_py() datetime.datetime(2020, 1, 15, 14, 30, 45)
- datafusion.functions.spark.trunc(date: datafusion.expr.Expr, format: datafusion.expr.Expr | str) datafusion.expr.Expr#
Spark `trunc`: truncate `date` to the unit given by `format`.
`format` accepts a native `str` or an `Expr`.
Examples
>>> import pyarrow as pa >>> from datetime import date >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> d = dfn.lit(pa.scalar(date(2020, 1, 15), type=pa.date32())) >>> r = df.select(dfn.functions.spark.trunc(d, "YEAR").alias("v")) >>> r.collect_column("v")[0].as_py() datetime.date(2020, 1, 1)
- datafusion.functions.spark.try_parse_url(url: datafusion.expr.Expr, partToExtract: datafusion.expr.Expr | str, key: datafusion.expr.Expr | str | None = None) datafusion.expr.Expr#
Spark `try_parse_url`: like `parse_url` but returns NULL on invalid URLs.
Bare `str` values for `partToExtract`/`key` are treated as column names (matching pyspark); pass `lit()` for a literal.
Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.try_parse_url( ... dfn.lit("http://example.com/"), dfn.lit("HOST") ... ).alias("v") ... ) >>> r.collect_column("v")[0].as_py() 'example.com'
- datafusion.functions.spark.try_sum(col: datafusion.expr.Expr, distinct: bool | None = None, filter: datafusion.expr.Expr | None = None, order_by: list[datafusion.expr.SortKey] | datafusion.expr.SortKey | None = None, null_treatment: datafusion.common.NullTreatment | None = None) datafusion.expr.Expr#
Spark `try_sum`: sum that returns NULL on overflow.
Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"a": [1, 2, 3]}) >>> r = df.aggregate( ... [], [dfn.functions.spark.try_sum(dfn.col("a")).alias("v")]) >>> r.collect_column("v")[0].as_py() 6
- datafusion.functions.spark.try_url_decode(str: datafusion.expr.Expr) datafusion.expr.Expr#
Spark `try_url_decode`: like `url_decode`; returns NULL on invalid input.
Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.try_url_decode(dfn.lit("a%20b")).alias("v")) >>> r.collect_column("v")[0].as_py() 'a b'
- datafusion.functions.spark.unbase64(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark `unbase64`: decode a base64 string to binary.
Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.unbase64(dfn.lit("aGk=")).alias("v")) >>> r.collect_column("v")[0].as_py() b'hi'
- datafusion.functions.spark.unhex(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark `unhex`: convert hexadecimal string to binary.
Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select(dfn.functions.spark.unhex(dfn.lit("FF")).alias("v")) >>> r.collect_column("v")[0].as_py() b'\xff'
- datafusion.functions.spark.unix_date(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark `unix_date`: days since 1970-01-01 for the date in `col`.
Examples
>>> import pyarrow as pa >>> from datetime import date >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> d = dfn.lit(pa.scalar(date(2020, 1, 15), type=pa.date32())) >>> r = df.select(dfn.functions.spark.unix_date(d).alias("v")) >>> r.collect_column("v")[0].as_py() 18276
- datafusion.functions.spark.unix_micros(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark `unix_micros`: microseconds since epoch for the timestamp in `col`.
Examples
>>> import pyarrow as pa >>> from datetime import datetime >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> ts = dfn.lit( ... pa.scalar(datetime(2020, 1, 15, 14, 30, 45), ... type=pa.timestamp('us'))) >>> r = df.select(dfn.functions.spark.unix_micros(ts).alias("v")) >>> r.collect_column("v")[0].as_py() 1579098645000000
- datafusion.functions.spark.unix_millis(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark `unix_millis`: milliseconds since epoch for the timestamp in `col`.
Examples
>>> import pyarrow as pa >>> from datetime import datetime >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> ts = dfn.lit( ... pa.scalar(datetime(2020, 1, 15, 14, 30, 45), ... type=pa.timestamp('us'))) >>> r = df.select(dfn.functions.spark.unix_millis(ts).alias("v")) >>> r.collect_column("v")[0].as_py() 1579098645000
- datafusion.functions.spark.unix_seconds(col: datafusion.expr.Expr) datafusion.expr.Expr#
Spark `unix_seconds`: seconds since epoch for the timestamp in `col`.
Examples
>>> import pyarrow as pa >>> from datetime import datetime >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> ts = dfn.lit( ... pa.scalar(datetime(2020, 1, 15, 14, 30, 45), ... type=pa.timestamp('us'))) >>> r = df.select(dfn.functions.spark.unix_seconds(ts).alias("v")) >>> r.collect_column("v")[0].as_py() 1579098645
- datafusion.functions.spark.url_decode(str: datafusion.expr.Expr) datafusion.expr.Expr#
Spark `url_decode`: decode an application/x-www-form-urlencoded string.
Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.url_decode(dfn.lit("a%20b")).alias("v")) >>> r.collect_column("v")[0].as_py() 'a b'
- datafusion.functions.spark.url_encode(str: datafusion.expr.Expr) datafusion.expr.Expr#
Spark `url_encode`: encode a string in application/x-www-form-urlencoded format.
Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.url_encode(dfn.lit("a b")).alias("v")) >>> r.collect_column("v")[0].as_py() 'a+b'
- datafusion.functions.spark.width_bucket(v: datafusion.expr.Expr, min: datafusion.expr.Expr, max: datafusion.expr.Expr, numBucket: datafusion.expr.Expr | int) datafusion.expr.Expr#
Spark `width_bucket`: bucket number for `v` in an equi-width histogram.
`numBucket` accepts a native `int` or an `Expr`.
Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.width_bucket( ... dfn.lit(5.0), dfn.lit(0.0), dfn.lit(10.0), 5 ... ).alias("v") ... ) >>> r.collect_column("v")[0].as_py() 3
- datafusion.functions.spark.xxhash64(*cols: datafusion.expr.Expr) datafusion.expr.Expr#
Spark `xxhash64`: 64-bit xxHash of the arguments.
Examples
>>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"x": [1]}) >>> r = df.select( ... dfn.functions.spark.xxhash64(dfn.lit("hello")).alias("v")) >>> r.collect_column("v")[0].as_py() -4367754540140381902