From 6f5def3da6cbe55179840f1acfe7943c25e837b4 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 14 May 2026 09:16:11 -0400 Subject: [PATCH 01/11] feat: unify logical + physical proto codec stack via SessionContext MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces a single composable codec layer that every serializer reads from the session, replacing the hardcoded `DefaultLogicalExtensionCodec` / `DefaultPhysicalExtensionCodec` calls scattered across PyLogicalPlan, PyExecutionPlan, and the Rust-wrapped Python provider plumbing. Key changes: * New `PythonLogicalCodec` and `PythonPhysicalCodec` (crates/core/src/codec.rs) wrap any inner `LogicalExtensionCodec` / `PhysicalExtensionCodec`. Both share a `DFPYUDF1` magic-prefix path for in-band cloudpickle encoding of Python scalar UDFs, so an `ExecutionPlan` / `PhysicalExpr` referencing a Python `ScalarUDF` round-trips through either layer. Magic-prefix registry table (DFPYUDF1 in use; DFPYUDA1 / DFPYUDW1 / DFPYPE1 reserved) documented in the module header. * `PySessionContext` stores `Arc<PythonLogicalCodec>` and `Arc<PythonPhysicalCodec>` directly. FFI wrappers are built on demand via `ffi_logical_codec()` / `ffi_physical_codec()` for capsule export and downstream `RustWrappedPy*` consumers. Adds `__datafusion_physical_extension_codec__` getter + `with_physical_extension_codec` setter (symmetric with the logical pair). * `PyLogicalPlan.to_proto` / `from_proto` renamed to `to_bytes` / `from_bytes`, now reading the session's logical codec. `to_proto` / `from_proto` survive as deprecated thin wrappers emitting `DeprecationWarning`. * `PyExecutionPlan` gains the same `to_bytes` / `from_bytes` rename + deprecated aliases, plus `__datafusion_execution_plan__` capsule getter and `from_pycapsule` (ported from poc_ffi_query_planner). * New `PyPhysicalExpr` class with `to_bytes` / `from_bytes` / `from_pycapsule` / `__datafusion_physical_expr__`. 
`from_bytes` takes an input pyarrow Schema for column-reference resolution. * `datafusion-python-util` gains `from_pycapsule!` / `try_from_pycapsule!` macros + `physical_codec_from_pycapsule`, `task_context_from_pycapsule`, `create_physical_extension_capsule` (ported from poc_ffi_query_planner). * `PythonFunctionScalarUDF` exposes `func()`, `input_fields()`, `return_field()`, `volatility()`, `from_parts()` accessors needed by the codec. Python wrapper updates: `LogicalPlan` / `ExecutionPlan` add `to_bytes` / `from_bytes` + deprecate `to_proto` / `from_proto`; `ExecutionPlan` adds capsule getter + `from_pycapsule`; new `PhysicalExpr` wrapper class exported from the top-level package; `SessionContext` exposes the physical codec capsule + setter. Test coverage in python/tests/test_plans.py: round-trip via new API, deprecation warnings on old API, capsule protocol getters, session-routed codec on both layers. `PyLogicalPlan` PyCapsule protocol is intentionally not added — `datafusion-ffi` does not expose `FFI_LogicalPlan`, so there is no stable cross-crate shape to publish. Round-tripping a `LogicalPlan` goes through `to_bytes` / `from_bytes` only. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 1 + crates/core/src/codec.rs | 331 +++++++++++++++++++++++++++++++ crates/core/src/context.rs | 147 +++++++++++--- crates/core/src/lib.rs | 2 + crates/core/src/physical_plan.rs | 181 ++++++++++++++++- crates/core/src/sql/logical.rs | 45 ++++- crates/core/src/udf.rs | 38 +++- crates/util/Cargo.toml | 1 + crates/util/src/lib.rs | 106 ++++++++++ python/datafusion/__init__.py | 3 +- python/datafusion/context.py | 12 ++ python/datafusion/plan.py | 141 +++++++++++-- python/tests/test_plans.py | 90 ++++++++- 13 files changed, 1028 insertions(+), 70 deletions(-) create mode 100644 crates/core/src/codec.rs diff --git a/Cargo.lock b/Cargo.lock index 70f09ec46..490458d82 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1698,6 +1698,7 @@ dependencies = [ "arrow", "datafusion", "datafusion-ffi", + "datafusion-proto", "prost", "pyo3", "tokio", diff --git a/crates/core/src/codec.rs b/crates/core/src/codec.rs new file mode 100644 index 000000000..cafa55cc8 --- /dev/null +++ b/crates/core/src/codec.rs @@ -0,0 +1,331 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Python-aware extension codecs. +//! +//! [`PythonLogicalCodec`] wraps a user-supplied (or default) +//! 
[`LogicalExtensionCodec`] and adds in-band encoding of Python-defined +//! scalar UDFs via cloudpickle. [`PythonPhysicalCodec`] is the symmetric +//! wrapper around [`PhysicalExtensionCodec`]; both reuse the same payload +//! framing so a Python `ScalarUDF` round-trips identically through either +//! codec layer. +//! +//! ## Wire-format magic prefix registry +//! +//! Each Python-inline payload begins with an 8-byte magic prefix so the +//! decoder can distinguish it from arbitrary `fun_definition` bytes (e.g. +//! produced by a user FFI codec or left empty by the default codec). +//! +//! | Layer + kind | Magic prefix | Owner | +//! | ----------------------------- | ------------ | -------------- | +//! | `PythonLogicalCodec` scalar | `DFPYUDF1` | in use | +//! | `PythonLogicalCodec` agg | `DFPYUDA1` | reserved | +//! | `PythonLogicalCodec` window | `DFPYUDW1` | reserved | +//! | `PythonPhysicalCodec` scalar | `DFPYUDF1` | in use (shared with logical) | +//! | `PythonPhysicalCodec` agg | `DFPYUDA1` | reserved | +//! | `PythonPhysicalCodec` window | `DFPYUDW1` | reserved | +//! | `PythonPhysicalCodec` expr | `DFPYPE1` | reserved | +//! | User FFI extension codec | user-chosen | downstream | +//! | Default codec | (none) | upstream | +//! +//! Dispatch precedence inside the Python codecs: **Python-inline payload +//! (magic prefix match) → `inner` codec → caller's registry fallback.** +//! When a payload does not match a Python magic prefix the call delegates +//! to `inner`, which is typically `DefaultLogicalExtensionCodec` / +//! `DefaultPhysicalExtensionCodec` but may be a user-supplied FFI codec +//! installed via `SessionContext.with_logical_extension_codec(...)` / +//! `with_physical_extension_codec(...)`. +//! +//! User FFI codecs should pick non-colliding prefixes (recommend a `DF` +//! namespace plus a crate-specific suffix). 
+ +use std::sync::Arc; + +use arrow::datatypes::{Field, Schema}; +use arrow::pyarrow::ToPyArrow; +use datafusion::arrow::datatypes::SchemaRef; +use datafusion::arrow::pyarrow::FromPyArrow; +use datafusion::common::{Result, TableReference}; +use datafusion::datasource::TableProvider; +use datafusion::execution::TaskContext; +use datafusion::logical_expr::{Extension, LogicalPlan, ScalarUDF, ScalarUDFImpl}; +use datafusion::physical_plan::ExecutionPlan; +use datafusion_proto::logical_plan::{DefaultLogicalExtensionCodec, LogicalExtensionCodec}; +use datafusion_proto::physical_plan::{DefaultPhysicalExtensionCodec, PhysicalExtensionCodec}; +use pyo3::BoundObject; +use pyo3::prelude::*; +use pyo3::types::{PyBytes, PyTuple}; + +use crate::udf::PythonFunctionScalarUDF; + +/// Magic prefix for an inlined Python scalar UDF payload. Shared between +/// logical and physical codec layers — the cloudpickled tuple shape is +/// identical, so a `ScalarUDF` reaching either codec encodes the same way. +pub(crate) const PY_SCALAR_UDF_MAGIC: &[u8] = b"DFPYUDF1"; + +/// `LogicalExtensionCodec` that serializes Python scalar UDFs inline and +/// delegates every other call to a composable `inner` codec. See module +/// docs for the dispatch precedence. +#[derive(Debug)] +pub struct PythonLogicalCodec { + inner: Arc, +} + +impl PythonLogicalCodec { + pub fn new(inner: Arc) -> Self { + Self { inner } + } + + /// Convenience constructor wrapping `DefaultLogicalExtensionCodec`. 
+ pub fn with_default_inner() -> Self { + Self::new(Arc::new(DefaultLogicalExtensionCodec {})) + } + + pub fn inner(&self) -> &Arc { + &self.inner + } +} + +impl Default for PythonLogicalCodec { + fn default() -> Self { + Self::with_default_inner() + } +} + +impl LogicalExtensionCodec for PythonLogicalCodec { + fn try_decode( + &self, + buf: &[u8], + inputs: &[LogicalPlan], + ctx: &TaskContext, + ) -> Result { + self.inner.try_decode(buf, inputs, ctx) + } + + fn try_encode(&self, node: &Extension, buf: &mut Vec) -> Result<()> { + self.inner.try_encode(node, buf) + } + + fn try_decode_table_provider( + &self, + buf: &[u8], + table_ref: &TableReference, + schema: SchemaRef, + ctx: &TaskContext, + ) -> Result> { + self.inner + .try_decode_table_provider(buf, table_ref, schema, ctx) + } + + fn try_encode_table_provider( + &self, + table_ref: &TableReference, + node: Arc, + buf: &mut Vec, + ) -> Result<()> { + self.inner.try_encode_table_provider(table_ref, node, buf) + } + + fn try_encode_udf(&self, node: &ScalarUDF, buf: &mut Vec) -> Result<()> { + if try_encode_python_scalar_udf(node, buf)? { + return Ok(()); + } + self.inner.try_encode_udf(node, buf) + } + + fn try_decode_udf(&self, name: &str, buf: &[u8]) -> Result> { + match try_decode_python_scalar_udf(buf)? { + Some(udf) => Ok(udf), + None => self.inner.try_decode_udf(name, buf), + } + } +} + +/// `PhysicalExtensionCodec` mirror of [`PythonLogicalCodec`]. Scalar-UDF +/// payloads use the shared `DFPYUDF1` framing so an `ExecutionPlan` or +/// `PhysicalExpr` that references a Python `ScalarUDF` round-trips +/// regardless of which codec layer encoded it. +/// +/// Encoding for Python-defined `ExecutionPlan` impls and `PhysicalExpr` +/// impls (the `DFPYPE*` namespace) is reserved for a future change — no +/// concrete Python-side physical extension type exists today, so all +/// non-UDF calls delegate to `inner`. 
+#[derive(Debug)] +pub struct PythonPhysicalCodec { + inner: Arc, +} + +impl PythonPhysicalCodec { + pub fn new(inner: Arc) -> Self { + Self { inner } + } + + pub fn with_default_inner() -> Self { + Self::new(Arc::new(DefaultPhysicalExtensionCodec {})) + } + + pub fn inner(&self) -> &Arc { + &self.inner + } +} + +impl Default for PythonPhysicalCodec { + fn default() -> Self { + Self::with_default_inner() + } +} + +impl PhysicalExtensionCodec for PythonPhysicalCodec { + fn try_decode( + &self, + buf: &[u8], + inputs: &[Arc], + ctx: &TaskContext, + ) -> Result> { + self.inner.try_decode(buf, inputs, ctx) + } + + fn try_encode(&self, node: Arc, buf: &mut Vec) -> Result<()> { + self.inner.try_encode(node, buf) + } + + fn try_encode_udf(&self, node: &ScalarUDF, buf: &mut Vec) -> Result<()> { + if try_encode_python_scalar_udf(node, buf)? { + return Ok(()); + } + self.inner.try_encode_udf(node, buf) + } + + fn try_decode_udf(&self, name: &str, buf: &[u8]) -> Result> { + match try_decode_python_scalar_udf(buf)? { + Some(udf) => Ok(udf), + None => self.inner.try_decode_udf(name, buf), + } + } +} + +/// Encode a Python scalar UDF inline if `node` is one. Returns `Ok(true)` +/// when the payload (magic prefix + cloudpickle tuple) was written, or +/// `Ok(false)` to signal the caller should delegate to its `inner` codec +/// (non-Python UDF, e.g. built-in or FFI capsule). +pub(crate) fn try_encode_python_scalar_udf(node: &ScalarUDF, buf: &mut Vec) -> Result { + let Some(py_udf) = node + .inner() + .as_any() + .downcast_ref::() + else { + return Ok(false); + }; + + Python::attach(|py| -> Result { + let bytes = encode_python_scalar_udf(py, py_udf) + .map_err(|e| datafusion::error::DataFusionError::External(Box::new(e)))?; + buf.extend_from_slice(PY_SCALAR_UDF_MAGIC); + buf.extend_from_slice(&bytes); + Ok(true) + }) +} + +/// Decode an inline Python scalar UDF payload. 
Returns `Ok(None)` when +/// `buf` does not carry the magic prefix, signalling the caller should +/// delegate to its `inner` codec (which will typically defer to the +/// `FunctionRegistry`). +pub(crate) fn try_decode_python_scalar_udf(buf: &[u8]) -> Result>> { + if buf.is_empty() || !buf.starts_with(PY_SCALAR_UDF_MAGIC) { + return Ok(None); + } + let payload = &buf[PY_SCALAR_UDF_MAGIC.len()..]; + + Python::attach(|py| -> Result>> { + let udf = decode_python_scalar_udf(py, payload) + .map_err(|e| datafusion::error::DataFusionError::External(Box::new(e)))?; + Ok(Some(Arc::new(ScalarUDF::new_from_impl(udf)))) + }) +} + +/// Build the cloudpickle payload for a `PythonFunctionScalarUDF`. +/// +/// Layout: `cloudpickle.dumps((name, func, input_schema_bytes, +/// return_field, volatility_str))`. Input fields ride along as an +/// IPC-encoded pyarrow Schema so they round-trip without extra plumbing. +fn encode_python_scalar_udf(py: Python<'_>, udf: &PythonFunctionScalarUDF) -> PyResult> { + let cloudpickle = py.import("cloudpickle")?; + + let input_schema = Schema::new(udf.input_fields().to_vec()); + let pa_schema_obj = input_schema.to_pyarrow(py)?; + let pa_schema = pa_schema_obj.into_bound(); + let schema_bytes: Vec = pa_schema + .call_method0("serialize")? + .call_method0("to_pybytes")? + .extract()?; + + let return_field_obj = udf.return_field().as_ref().to_pyarrow(py)?; + let volatility = format!("{:?}", udf.volatility()).to_lowercase(); + + let payload = PyTuple::new( + py, + [ + udf.name().into_pyobject(py)?.into_any(), + udf.func().bind(py).clone().into_any(), + PyBytes::new(py, &schema_bytes).into_any(), + return_field_obj.into_bound(), + volatility.into_pyobject(py)?.into_any(), + ], + )?; + + let blob = cloudpickle.call_method1("dumps", (payload,))?; + blob.extract::>() +} + +/// Inverse of [`encode_python_scalar_udf`]. 
+fn decode_python_scalar_udf(py: Python<'_>, payload: &[u8]) -> PyResult { + let cloudpickle = py.import("cloudpickle")?; + let pyarrow = py.import("pyarrow")?; + + let tuple = cloudpickle + .call_method1("loads", (PyBytes::new(py, payload),))? + .cast_into::()?; + + let name: String = tuple.get_item(0)?.extract()?; + let func: Py = tuple.get_item(1)?.unbind(); + let schema_bytes: Vec = tuple.get_item(2)?.extract()?; + let return_field_py = tuple.get_item(3)?; + let volatility_str: String = tuple.get_item(4)?.extract()?; + + let buffer = pyarrow.call_method1("py_buffer", (PyBytes::new(py, &schema_bytes),))?; + let pa_schema = pyarrow + .getattr("ipc")? + .call_method1("read_schema", (buffer,))?; + + let schema = Schema::from_pyarrow_bound(&pa_schema) + .map_err(|e| pyo3::exceptions::PyValueError::new_err(format!("{e}")))?; + let input_fields: Vec = schema.fields().iter().map(|f| f.as_ref().clone()).collect(); + + let return_field = Field::from_pyarrow_bound(&return_field_py) + .map_err(|e| pyo3::exceptions::PyValueError::new_err(format!("{e}")))?; + + let volatility = datafusion_python_util::parse_volatility(&volatility_str) + .map_err(|e| pyo3::exceptions::PyValueError::new_err(format!("{e}")))?; + + Ok(PythonFunctionScalarUDF::from_parts( + name, + func, + input_fields, + return_field, + volatility, + )) +} diff --git a/crates/core/src/context.rs b/crates/core/src/context.rs index e46d359d6..40bc9ad4b 100644 --- a/crates/core/src/context.rs +++ b/crates/core/src/context.rs @@ -52,11 +52,14 @@ use datafusion_ffi::catalog_provider_list::FFI_CatalogProviderList; use datafusion_ffi::config::extension_options::FFI_ExtensionOptions; use datafusion_ffi::execution::FFI_TaskContextProvider; use datafusion_ffi::proto::logical_extension_codec::FFI_LogicalExtensionCodec; +use datafusion_ffi::proto::physical_extension_codec::FFI_PhysicalExtensionCodec; use datafusion_ffi::table_provider_factory::FFI_TableProviderFactory; -use 
datafusion_proto::logical_plan::DefaultLogicalExtensionCodec; +use datafusion_proto::logical_plan::LogicalExtensionCodec; +use datafusion_proto::physical_plan::PhysicalExtensionCodec; use datafusion_python_util::{ - create_logical_extension_capsule, ffi_logical_codec_from_pycapsule, get_global_ctx, - get_tokio_runtime, spawn_future, wait_for_future, + create_logical_extension_capsule, create_physical_extension_capsule, + ffi_logical_codec_from_pycapsule, get_global_ctx, get_tokio_runtime, + physical_codec_from_pycapsule, spawn_future, wait_for_future, }; use object_store::ObjectStore; use pyo3::IntoPyObjectExt; @@ -69,6 +72,7 @@ use uuid::Uuid; use crate::catalog::{ PyCatalog, PyCatalogList, RustWrappedPyCatalogProvider, RustWrappedPyCatalogProviderList, }; +use crate::codec::{PythonLogicalCodec, PythonPhysicalCodec}; use crate::common::data_type::PyScalarValue; use crate::common::df_schema::PyDFSchema; use crate::dataframe::PyDataFrame; @@ -365,7 +369,8 @@ impl PySQLOptions { #[derive(Clone)] pub struct PySessionContext { pub ctx: Arc, - logical_codec: Arc, + logical_codec: Arc, + physical_codec: Arc, } #[pymethods] @@ -393,14 +398,20 @@ impl PySessionContext { .with_default_features() .build(); let ctx = Arc::new(SessionContext::new_with_state(session_state)); - let logical_codec = Self::default_logical_codec(&ctx); - Ok(PySessionContext { ctx, logical_codec }) + let logical_codec = Self::default_logical_codec(); + let physical_codec = Self::default_physical_codec(); + Ok(PySessionContext { + ctx, + logical_codec, + physical_codec, + }) } pub fn enable_url_table(&self) -> PyResult { Ok(PySessionContext { ctx: Arc::new(self.ctx.as_ref().clone().enable_url_table()), logical_codec: Arc::clone(&self.logical_codec), + physical_codec: Arc::clone(&self.physical_codec), }) } @@ -408,8 +419,13 @@ impl PySessionContext { #[pyo3(signature = ())] pub fn global_ctx() -> PyResult { let ctx = get_global_ctx().clone(); - let logical_codec = Self::default_logical_codec(&ctx); - 
Ok(Self { ctx, logical_codec }) + let logical_codec = Self::default_logical_codec(); + let physical_codec = Self::default_physical_codec(); + Ok(Self { + ctx, + logical_codec, + physical_codec, + }) } /// Register an object store with the given name @@ -714,7 +730,8 @@ impl PySessionContext { ) -> PyDataFusionResult<()> { if factory.hasattr("__datafusion_table_provider_factory__")? { let py = factory.py(); - let codec_capsule = create_logical_extension_capsule(py, self.logical_codec.as_ref())?; + let ffi = self.ffi_logical_codec(); + let codec_capsule = create_logical_extension_capsule(py, ffi.as_ref())?; factory = factory .getattr("__datafusion_table_provider_factory__")? .call1((codec_capsule,))?; @@ -730,7 +747,7 @@ impl PySessionContext { } else { Arc::new(RustWrappedPyTableProviderFactory::new( factory.into(), - self.logical_codec.clone(), + self.ffi_logical_codec(), )) }; @@ -748,7 +765,8 @@ impl PySessionContext { ) -> PyDataFusionResult<()> { if provider.hasattr("__datafusion_catalog_provider_list__")? { let py = provider.py(); - let codec_capsule = create_logical_extension_capsule(py, self.logical_codec.as_ref())?; + let ffi = self.ffi_logical_codec(); + let codec_capsule = create_logical_extension_capsule(py, ffi.as_ref())?; provider = provider .getattr("__datafusion_catalog_provider_list__")? .call1((codec_capsule,))?; @@ -766,7 +784,7 @@ impl PySessionContext { Ok(py_catalog_list) => py_catalog_list.catalog_list, Err(_) => Arc::new(RustWrappedPyCatalogProviderList::new( provider.into(), - Arc::clone(&self.logical_codec), + self.ffi_logical_codec(), )) as Arc, } }; @@ -783,7 +801,8 @@ impl PySessionContext { ) -> PyDataFusionResult<()> { if provider.hasattr("__datafusion_catalog_provider__")? 
{ let py = provider.py(); - let codec_capsule = create_logical_extension_capsule(py, self.logical_codec.as_ref())?; + let ffi = self.ffi_logical_codec(); + let codec_capsule = create_logical_extension_capsule(py, ffi.as_ref())?; provider = provider .getattr("__datafusion_catalog_provider__")? .call1((codec_capsule,))?; @@ -801,7 +820,7 @@ impl PySessionContext { Ok(py_catalog) => py_catalog.catalog, Err(_) => Arc::new(RustWrappedPyCatalogProvider::new( provider.into(), - Arc::clone(&self.logical_codec), + self.ffi_logical_codec(), )) as Arc, } }; @@ -1061,10 +1080,9 @@ impl PySessionContext { .downcast_ref::() { Some(wrapped_schema) => Ok(wrapped_schema.catalog_provider.clone_ref(py)), - None => Ok( - PyCatalog::new_from_parts(catalog, Arc::clone(&self.logical_codec)) - .into_py_any(py)?, - ), + None => { + Ok(PyCatalog::new_from_parts(catalog, self.ffi_logical_codec()).into_py_any(py)?) + } } } @@ -1353,20 +1371,44 @@ impl PySessionContext { &self, py: Python<'py>, ) -> PyResult> { - create_logical_extension_capsule(py, self.logical_codec.as_ref()) + let ffi = self.ffi_logical_codec(); + create_logical_extension_capsule(py, ffi.as_ref()) } pub fn with_logical_extension_codec<'py>( &self, codec: Bound<'py, PyAny>, ) -> PyDataFusionResult { - let logical_codec = Arc::new(ffi_logical_codec_from_pycapsule(codec)?); + let inner_ffi = ffi_logical_codec_from_pycapsule(codec)?; + let inner: Arc = (&inner_ffi).into(); + let logical_codec = Arc::new(PythonLogicalCodec::new(inner)); + + Ok(Self { + ctx: Arc::clone(&self.ctx), + logical_codec, + physical_codec: Arc::clone(&self.physical_codec), + }) + } - Ok({ - Self { - ctx: Arc::clone(&self.ctx), - logical_codec, - } + pub fn __datafusion_physical_extension_codec__<'py>( + &self, + py: Python<'py>, + ) -> PyResult> { + let ffi = self.ffi_physical_codec(); + create_physical_extension_capsule(py, ffi.as_ref()) + } + + pub fn with_physical_extension_codec<'py>( + &self, + codec: Bound<'py, PyAny>, + ) -> PyDataFusionResult { + 
let inner = physical_codec_from_pycapsule(&codec)?; + let physical_codec = Arc::new(PythonPhysicalCodec::new(inner)); + + Ok(Self { + ctx: Arc::clone(&self.ctx), + logical_codec: Arc::clone(&self.logical_codec), + physical_codec, }) } } @@ -1416,12 +1458,50 @@ impl PySessionContext { Ok(()) } - fn default_logical_codec(ctx: &Arc) -> Arc { - let codec = Arc::new(DefaultLogicalExtensionCodec {}); + fn default_logical_codec() -> Arc { + Arc::new(PythonLogicalCodec::default()) + } + + fn default_physical_codec() -> Arc { + Arc::new(PythonPhysicalCodec::default()) + } + + /// Session-scoped logical codec. Sibling modules read this when they + /// need to serialize/deserialize logical-layer types (LogicalPlan, + /// Expr) against the user-installed (or default) codec stack. + pub(crate) fn logical_codec(&self) -> &Arc { + &self.logical_codec + } + + /// Session-scoped physical codec. Sibling modules read this for + /// ExecutionPlan / PhysicalExpr serialization. + pub(crate) fn physical_codec(&self) -> &Arc { + &self.physical_codec + } + + /// Build an FFI-wrapped clone of the session's logical codec on demand. + /// Used at every site that exports the codec across an FFI boundary + /// (capsule getters, Rust wrappers for Python-defined providers, etc.). + pub(crate) fn ffi_logical_codec(&self) -> Arc { + let inner: Arc = + Arc::clone(&self.logical_codec) as Arc; let runtime = get_tokio_runtime().handle().clone(); - let ctx_provider = Arc::clone(ctx) as Arc; + let ctx_provider = Arc::clone(&self.ctx) as Arc; Arc::new(FFI_LogicalExtensionCodec::new( - codec, + inner, + Some(runtime), + &ctx_provider, + )) + } + + /// Build an FFI-wrapped clone of the session's physical codec on demand. 
+ pub(crate) fn ffi_physical_codec(&self) -> Arc { + let inner: Arc = + Arc::clone(&self.physical_codec) as Arc; + let runtime = get_tokio_runtime().handle().clone(); + let ctx_provider = Arc::clone(&self.ctx) as Arc; + Arc::new(FFI_PhysicalExtensionCodec::new( + inner, Some(runtime), &ctx_provider, )) @@ -1446,8 +1526,13 @@ impl From for SessionContext { impl From for PySessionContext { fn from(ctx: SessionContext) -> PySessionContext { let ctx = Arc::new(ctx); - let logical_codec = Self::default_logical_codec(&ctx); + let logical_codec = Self::default_logical_codec(); + let physical_codec = Self::default_physical_codec(); - PySessionContext { ctx, logical_codec } + PySessionContext { + ctx, + logical_codec, + physical_codec, + } } } diff --git a/crates/core/src/lib.rs b/crates/core/src/lib.rs index 77d69911a..adb47deb3 100644 --- a/crates/core/src/lib.rs +++ b/crates/core/src/lib.rs @@ -28,6 +28,7 @@ use pyo3::prelude::*; #[allow(clippy::borrow_deref_ref)] pub mod catalog; +pub mod codec; pub mod common; #[allow(clippy::borrow_deref_ref)] @@ -96,6 +97,7 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/crates/core/src/physical_plan.rs b/crates/core/src/physical_plan.rs index fac973884..e992b0ef7 100644 --- a/crates/core/src/physical_plan.rs +++ b/crates/core/src/physical_plan.rs @@ -17,15 +17,25 @@ use std::sync::Arc; +use arrow::datatypes::Schema; +use arrow::pyarrow::FromPyArrow; +use datafusion::physical_expr::PhysicalExpr; use datafusion::physical_plan::{ExecutionPlan, ExecutionPlanProperties, displayable}; -use datafusion_proto::physical_plan::{AsExecutionPlan, DefaultPhysicalExtensionCodec}; +use datafusion_ffi::execution_plan::FFI_ExecutionPlan; +use datafusion_ffi::physical_expr::FFI_PhysicalExpr; +use datafusion_proto::physical_plan::AsExecutionPlan; +use 
datafusion_proto::physical_plan::from_proto::parse_physical_expr; +use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; +use datafusion_proto::protobuf; +use datafusion_python_util::get_tokio_runtime; use prost::Message; use pyo3::exceptions::PyRuntimeError; use pyo3::prelude::*; -use pyo3::types::PyBytes; +use pyo3::types::{PyBytes, PyCapsule}; +use crate::codec::PythonPhysicalCodec; use crate::context::PySessionContext; -use crate::errors::PyDataFusionResult; +use crate::errors::{PyDataFusionResult, py_datafusion_err}; use crate::metrics::PyMetricsSet; #[pyclass( @@ -68,8 +78,8 @@ impl PyExecutionPlan { format!("{}", d.indent(false)) } - pub fn to_proto<'py>(&'py self, py: Python<'py>) -> PyDataFusionResult> { - let codec = DefaultPhysicalExtensionCodec {}; + pub fn to_bytes<'py>(&'py self, py: Python<'py>) -> PyDataFusionResult> { + let codec = PythonPhysicalCodec::default(); let proto = datafusion_proto::protobuf::PhysicalPlanNode::try_from_physical_plan( self.plan.clone(), &codec, @@ -80,7 +90,7 @@ impl PyExecutionPlan { } #[staticmethod] - pub fn from_proto( + pub fn from_bytes( ctx: PySessionContext, proto_msg: Bound<'_, PyBytes>, ) -> PyDataFusionResult { @@ -88,15 +98,41 @@ impl PyExecutionPlan { let proto_plan = datafusion_proto::protobuf::PhysicalPlanNode::decode(bytes).map_err(|e| { PyRuntimeError::new_err(format!( - "Unable to decode logical node from serialized bytes: {e}" + "Unable to decode physical node from serialized bytes: {e}" )) })?; - let codec = DefaultPhysicalExtensionCodec {}; - let plan = proto_plan.try_into_physical_plan(ctx.ctx.task_ctx().as_ref(), &codec)?; + let codec = ctx.physical_codec(); + let plan = + proto_plan.try_into_physical_plan(ctx.ctx.task_ctx().as_ref(), codec.as_ref())?; Ok(Self::new(plan)) } + /// Deprecated alias for [`Self::to_bytes`]. Will be removed in a + /// future release. 
+ pub fn to_proto<'py>(&'py self, py: Python<'py>) -> PyDataFusionResult> { + emit_deprecation( + py, + "PyExecutionPlan.to_proto is deprecated; use to_bytes instead", + )?; + self.to_bytes(py) + } + + /// Deprecated alias for [`Self::from_bytes`]. Will be removed in a + /// future release. + #[staticmethod] + pub fn from_proto( + py: Python<'_>, + ctx: PySessionContext, + proto_msg: Bound<'_, PyBytes>, + ) -> PyDataFusionResult { + emit_deprecation( + py, + "PyExecutionPlan.from_proto is deprecated; use from_bytes instead", + )?; + Self::from_bytes(ctx, proto_msg) + } + pub fn metrics(&self) -> Option { self.plan.metrics().map(PyMetricsSet::new) } @@ -109,6 +145,34 @@ impl PyExecutionPlan { pub fn partition_count(&self) -> usize { self.plan.output_partitioning().partition_count() } + + /// Extract a `PyExecutionPlan` from any object exposing + /// `__datafusion_execution_plan__()` (PyCapsule protocol). + #[staticmethod] + pub fn from_pycapsule(mut capsule: Bound) -> PyResult { + if capsule.hasattr("__datafusion_execution_plan__")? { + capsule = capsule.getattr("__datafusion_execution_plan__")?.call0()?; + } + + let capsule = capsule.cast::().map_err(py_datafusion_err)?; + let data: std::ptr::NonNull = capsule + .pointer_checked(Some(c"datafusion_execution_plan"))? + .cast(); + let plan = unsafe { data.as_ref() }; + let plan: Arc = plan.try_into().map_err(py_datafusion_err)?; + + Ok(Self { plan }) + } + + pub fn __datafusion_execution_plan__<'py>( + &self, + py: Python<'py>, + ) -> PyResult> { + let name = cr"datafusion_execution_plan".into(); + let runtime = get_tokio_runtime().handle().clone(); + let plan = FFI_ExecutionPlan::new(self.plan.clone(), Some(runtime)); + PyCapsule::new(py, plan, Some(name)) + } } impl From for Arc { @@ -122,3 +186,102 @@ impl From> for PyExecutionPlan { PyExecutionPlan { plan: plan.clone() } } } + +/// Thin Python wrapper around a `PhysicalExpr` exposing serialization +/// + PyCapsule protocol. Mirrors [`PyExecutionPlan`] shape. 
+#[pyclass( + from_py_object, + frozen, + name = "PhysicalExpr", + module = "datafusion", + subclass +)] +#[derive(Debug, Clone)] +pub struct PyPhysicalExpr { + pub expr: Arc, +} + +impl PyPhysicalExpr { + pub fn new(expr: Arc) -> Self { + Self { expr } + } +} + +#[pymethods] +impl PyPhysicalExpr { + fn __repr__(&self) -> String { + format!("{}", self.expr) + } + + pub fn to_bytes<'py>(&self, py: Python<'py>) -> PyDataFusionResult> { + let codec = PythonPhysicalCodec::default(); + let proto = serialize_physical_expr(&self.expr, &codec)?; + let bytes = proto.encode_to_vec(); + Ok(PyBytes::new(py, &bytes)) + } + + /// Decode a `PhysicalExpr` from serialized protobuf bytes. The + /// receiver must supply the `input_schema` against which column + /// references in the expression resolve (pyarrow Schema). + #[staticmethod] + pub fn from_bytes( + ctx: PySessionContext, + proto_msg: Bound<'_, PyBytes>, + input_schema: Bound<'_, PyAny>, + ) -> PyDataFusionResult { + let bytes: &[u8] = proto_msg.extract().map_err(Into::::into)?; + let proto_expr = protobuf::PhysicalExprNode::decode(bytes).map_err(|e| { + PyRuntimeError::new_err(format!( + "Unable to decode physical expression from serialized bytes: {e}" + )) + })?; + let schema = Schema::from_pyarrow_bound(&input_schema)?; + let codec = ctx.physical_codec(); + let task_ctx = ctx.ctx.task_ctx(); + let expr = parse_physical_expr(&proto_expr, task_ctx.as_ref(), &schema, codec.as_ref())?; + Ok(Self::new(expr)) + } + + #[staticmethod] + pub fn from_pycapsule(mut capsule: Bound) -> PyResult { + if capsule.hasattr("__datafusion_physical_expr__")? { + capsule = capsule.getattr("__datafusion_physical_expr__")?.call0()?; + } + + let capsule = capsule.cast::().map_err(py_datafusion_err)?; + let data: std::ptr::NonNull = capsule + .pointer_checked(Some(c"datafusion_physical_expr"))? 
+ .cast(); + let ffi = unsafe { data.as_ref() }; + let expr: Arc = ffi.into(); + Ok(Self { expr }) + } + + pub fn __datafusion_physical_expr__<'py>( + &self, + py: Python<'py>, + ) -> PyResult> { + let name = cr"datafusion_physical_expr".into(); + let ffi = FFI_PhysicalExpr::from(self.expr.clone()); + PyCapsule::new(py, ffi, Some(name)) + } +} + +impl From for Arc { + fn from(expr: PyPhysicalExpr) -> Arc { + expr.expr.clone() + } +} + +impl From> for PyPhysicalExpr { + fn from(expr: Arc) -> PyPhysicalExpr { + PyPhysicalExpr { expr } + } +} + +fn emit_deprecation(py: Python<'_>, msg: &str) -> PyResult<()> { + let warnings = py.import("warnings")?; + let category = py.import("builtins")?.getattr("DeprecationWarning")?; + warnings.call_method1("warn", (msg, category, 2))?; + Ok(()) +} diff --git a/crates/core/src/sql/logical.rs b/crates/core/src/sql/logical.rs index 631aa9b09..82df43bd3 100644 --- a/crates/core/src/sql/logical.rs +++ b/crates/core/src/sql/logical.rs @@ -18,12 +18,13 @@ use std::sync::Arc; use datafusion::logical_expr::{DdlStatement, LogicalPlan, Statement}; -use datafusion_proto::logical_plan::{AsLogicalPlan, DefaultLogicalExtensionCodec}; +use datafusion_proto::logical_plan::AsLogicalPlan; use prost::Message; use pyo3::exceptions::PyRuntimeError; use pyo3::prelude::*; use pyo3::types::PyBytes; +use crate::codec::PythonLogicalCodec; use crate::context::PySessionContext; use crate::errors::PyDataFusionResult; use crate::expr::aggregate::PyAggregate; @@ -196,8 +197,8 @@ impl PyLogicalPlan { format!("{}", self.plan.display_graphviz()) } - pub fn to_proto<'py>(&'py self, py: Python<'py>) -> PyDataFusionResult> { - let codec = DefaultLogicalExtensionCodec {}; + pub fn to_bytes<'py>(&'py self, py: Python<'py>) -> PyDataFusionResult> { + let codec = PythonLogicalCodec::default(); let proto = datafusion_proto::protobuf::LogicalPlanNode::try_from_logical_plan(&self.plan, &codec)?; @@ -206,7 +207,7 @@ impl PyLogicalPlan { } #[staticmethod] - pub fn from_proto( 
+ pub fn from_bytes( ctx: PySessionContext, proto_msg: Bound<'_, PyBytes>, ) -> PyDataFusionResult { @@ -218,10 +219,42 @@ impl PyLogicalPlan { )) })?; - let codec = DefaultLogicalExtensionCodec {}; - let plan = proto_plan.try_into_logical_plan(&ctx.ctx.task_ctx(), &codec)?; + let codec = ctx.logical_codec(); + let plan = proto_plan.try_into_logical_plan(&ctx.ctx.task_ctx(), codec.as_ref())?; Ok(Self::new(plan)) } + + /// Deprecated alias for [`Self::to_bytes`]. Will be removed in a + /// future release. + pub fn to_proto<'py>(&'py self, py: Python<'py>) -> PyDataFusionResult> { + emit_deprecation( + py, + "PyLogicalPlan.to_proto is deprecated; use to_bytes instead", + )?; + self.to_bytes(py) + } + + /// Deprecated alias for [`Self::from_bytes`]. Will be removed in a + /// future release. + #[staticmethod] + pub fn from_proto( + py: Python<'_>, + ctx: PySessionContext, + proto_msg: Bound<'_, PyBytes>, + ) -> PyDataFusionResult { + emit_deprecation( + py, + "PyLogicalPlan.from_proto is deprecated; use from_bytes instead", + )?; + Self::from_bytes(ctx, proto_msg) + } +} + +fn emit_deprecation(py: Python<'_>, msg: &str) -> PyResult<()> { + let warnings = py.import("warnings")?; + let category = py.import("builtins")?.getattr("DeprecationWarning")?; + warnings.call_method1("warn", (msg, category, 2))?; + Ok(()) } impl From for LogicalPlan { diff --git a/crates/core/src/udf.rs b/crates/core/src/udf.rs index c0a39cb47..5f9d85261 100644 --- a/crates/core/src/udf.rs +++ b/crates/core/src/udf.rs @@ -43,11 +43,13 @@ use crate::expr::PyExpr; /// This struct holds the Python written function that is a /// ScalarUDF. 
#[derive(Debug)] -struct PythonFunctionScalarUDF { +pub(crate) struct PythonFunctionScalarUDF { name: String, func: Py, - signature: Signature, + input_fields: Vec, return_field: FieldRef, + signature: Signature, + volatility: Volatility, } impl PythonFunctionScalarUDF { @@ -63,10 +65,40 @@ impl PythonFunctionScalarUDF { Self { name, func, - signature, + input_fields, return_field: Arc::new(return_field), + signature, + volatility, } } + + /// Stored Python callable. Consumed by the codec to cloudpickle the + /// function body across process boundaries. + pub(crate) fn func(&self) -> &Py { + &self.func + } + + pub(crate) fn input_fields(&self) -> &[Field] { + &self.input_fields + } + + pub(crate) fn return_field(&self) -> &FieldRef { + &self.return_field + } + + pub(crate) fn volatility(&self) -> Volatility { + self.volatility + } + + pub(crate) fn from_parts( + name: String, + func: Py, + input_fields: Vec, + return_field: Field, + volatility: Volatility, + ) -> Self { + Self::new(name, func, input_fields, return_field, volatility) + } } impl Eq for PythonFunctionScalarUDF {} diff --git a/crates/util/Cargo.toml b/crates/util/Cargo.toml index 00d5946a5..c23667b0f 100644 --- a/crates/util/Cargo.toml +++ b/crates/util/Cargo.toml @@ -30,5 +30,6 @@ tokio = { workspace = true, features = ["macros", "rt", "rt-multi-thread"] } pyo3 = { workspace = true } datafusion = { workspace = true } datafusion-ffi = { workspace = true } +datafusion-proto = { workspace = true } arrow = { workspace = true } prost = { workspace = true } diff --git a/crates/util/src/lib.rs b/crates/util/src/lib.rs index 5b1c89936..e3f129659 100644 --- a/crates/util/src/lib.rs +++ b/crates/util/src/lib.rs @@ -21,10 +21,14 @@ use std::sync::{Arc, OnceLock}; use std::time::Duration; use datafusion::datasource::TableProvider; +use datafusion::execution::TaskContext; use datafusion::execution::context::SessionContext; use datafusion::logical_expr::Volatility; +use 
datafusion_ffi::execution::FFI_TaskContextProvider; use datafusion_ffi::proto::logical_extension_codec::FFI_LogicalExtensionCodec; +use datafusion_ffi::proto::physical_extension_codec::FFI_PhysicalExtensionCodec; use datafusion_ffi::table_provider::FFI_TableProvider; +use datafusion_proto::physical_plan::PhysicalExtensionCodec; use pyo3::exceptions::{PyImportError, PyTypeError, PyValueError}; use pyo3::prelude::*; use pyo3::types::{PyCapsule, PyType}; @@ -224,3 +228,105 @@ pub fn ffi_logical_codec_from_pycapsule(obj: Bound) -> PyResult( + py: Python<'py>, + codec: &FFI_PhysicalExtensionCodec, +) -> PyResult> { + let name = cr"datafusion_physical_extension_codec".into(); + let codec = codec.clone(); + + PyCapsule::new(py, codec, Some(name)) +} + +/// Define a `(obj) -> PyResult>` extractor that +/// accepts either a raw `PyCapsule` carrying `$ffi_type` or any object +/// exposing `____()` that returns one. +/// +/// Use this when `Arc<$output_type>: From<&$ffi_type>` (infallible +/// conversion). For fallible conversions use [`try_from_pycapsule!`] +/// instead. +#[macro_export] +macro_rules! from_pycapsule { + ($fn_name:ident, $capsule_name:literal, $ffi_type:ty, $output_type:ty) => { + pub fn $fn_name( + obj: &$crate::pyo3::Bound<$crate::pyo3::PyAny>, + ) -> $crate::pyo3::PyResult> { + use $crate::pyo3::prelude::*; + use $crate::pyo3::types::PyCapsule; + + let mut obj = obj.clone(); + if obj.hasattr(concat!("__", $capsule_name, "__"))? { + obj = obj.getattr(concat!("__", $capsule_name, "__"))?.call0()?; + } + let capsule = obj.cast::().map_err(|_| { + $crate::errors::py_datafusion_err(concat!( + "Invalid ", + $capsule_name, + ". Does not contain PyCapsule object." 
+ )) + })?; + $crate::validate_pycapsule(&capsule, $capsule_name)?; + + let data: std::ptr::NonNull<$ffi_type> = capsule.pointer_checked(None)?.cast(); + let output_obj = unsafe { data.as_ref() }; + let output_obj: std::sync::Arc<$output_type> = output_obj.into(); + + Ok(output_obj) + } + }; +} + +/// Same shape as [`from_pycapsule!`] but for FFI types whose conversion +/// into `Arc<$output_type>` is fallible (uses `TryFrom`). +#[macro_export] +macro_rules! try_from_pycapsule { + ($fn_name:ident, $capsule_name:literal, $ffi_type:ty, $output_type:ty) => { + pub fn $fn_name( + obj: &$crate::pyo3::Bound<$crate::pyo3::PyAny>, + ) -> $crate::pyo3::PyResult> { + use $crate::pyo3::prelude::*; + use $crate::pyo3::types::PyCapsule; + + let mut obj = obj.clone(); + if obj.hasattr(concat!("__", $capsule_name, "__"))? { + obj = obj.getattr(concat!("__", $capsule_name, "__"))?.call0()?; + } + let capsule = obj.cast::().map_err(|_| { + $crate::errors::py_datafusion_err(concat!( + "Invalid ", + $capsule_name, + ". Does not contain PyCapsule object." + )) + })?; + $crate::validate_pycapsule(&capsule, $capsule_name)?; + + let data: std::ptr::NonNull<$ffi_type> = capsule.pointer_checked(None)?.cast(); + let output_obj = unsafe { data.as_ref() }; + let output_obj: std::sync::Arc<$output_type> = output_obj + .try_into() + .map_err($crate::errors::py_datafusion_err)?; + + Ok(output_obj) + } + }; +} + +// Re-export pyo3 so the macros expand inside downstream crates without +// requiring an explicit pyo3 dep at the call site. 
+#[doc(hidden)] +pub use pyo3; + +from_pycapsule!( + physical_codec_from_pycapsule, + "datafusion_physical_extension_codec", + FFI_PhysicalExtensionCodec, + dyn PhysicalExtensionCodec +); + +try_from_pycapsule!( + task_context_from_pycapsule, + "datafusion_task_context_provider", + FFI_TaskContextProvider, + TaskContext +); diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index f08b464bb..3b56470fd 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -90,7 +90,7 @@ from .expr import Expr, WindowFrame from .io import read_avro, read_csv, read_json, read_parquet from .options import CsvReadOptions -from .plan import ExecutionPlan, LogicalPlan, Metric, MetricsSet +from .plan import ExecutionPlan, LogicalPlan, Metric, MetricsSet, PhysicalExpr from .record_batch import RecordBatch, RecordBatchStream from .user_defined import ( Accumulator, @@ -124,6 +124,7 @@ "MetricsSet", "ParquetColumnOptions", "ParquetWriterOptions", + "PhysicalExpr", "RecordBatch", "RecordBatchStream", "RuntimeEnvBuilder", diff --git a/python/datafusion/context.py b/python/datafusion/context.py index dd6790402..652cd9373 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -1751,3 +1751,15 @@ def with_logical_extension_codec(self, codec: Any) -> SessionContext: FFI interface. """ return self.ctx.with_logical_extension_codec(codec) + + def __datafusion_physical_extension_codec__(self) -> Any: + """Access the PyCapsule FFI_PhysicalExtensionCodec.""" + return self.ctx.__datafusion_physical_extension_codec__() + + def with_physical_extension_codec(self, codec: Any) -> SessionContext: + """Create a new session context with the specified physical codec. + + This only supports codecs that have been implemented using the + FFI interface. 
+ """ + return self.ctx.with_physical_extension_codec(codec) diff --git a/python/datafusion/plan.py b/python/datafusion/plan.py index c0cfd523f..4765d1680 100644 --- a/python/datafusion/plan.py +++ b/python/datafusion/plan.py @@ -19,6 +19,7 @@ from __future__ import annotations +import warnings from typing import TYPE_CHECKING, Any import datafusion._internal as df_internal @@ -33,6 +34,7 @@ "LogicalPlan", "Metric", "MetricsSet", + "PhysicalExpr", ] @@ -88,19 +90,41 @@ def display_graphviz(self) -> str: return self._raw_plan.display_graphviz() @staticmethod - def from_proto(ctx: SessionContext, data: bytes) -> LogicalPlan: - """Create a LogicalPlan from protobuf bytes. + def from_bytes(ctx: SessionContext, data: bytes) -> LogicalPlan: + """Create a LogicalPlan from serialized protobuf bytes. - Tables created in memory from record batches are currently not supported. + Decoding routes through the session's installed + `LogicalExtensionCodec`. Tables created in memory from record + batches are currently not supported. """ - return LogicalPlan(df_internal.LogicalPlan.from_proto(ctx.ctx, data)) + return LogicalPlan(df_internal.LogicalPlan.from_bytes(ctx.ctx, data)) - def to_proto(self) -> bytes: - """Convert a LogicalPlan to protobuf bytes. + def to_bytes(self) -> bytes: + """Convert a LogicalPlan to serialized protobuf bytes. - Tables created in memory from record batches are currently not supported. + Tables created in memory from record batches are currently not + supported. 
""" - return self._raw_plan.to_proto() + return self._raw_plan.to_bytes() + + @staticmethod + def from_proto(ctx: SessionContext, data: bytes) -> LogicalPlan: + """Deprecated alias for :meth:`from_bytes`.""" + warnings.warn( + "LogicalPlan.from_proto is deprecated; use from_bytes instead", + DeprecationWarning, + stacklevel=2, + ) + return LogicalPlan.from_bytes(ctx, data) + + def to_proto(self) -> bytes: + """Deprecated alias for :meth:`to_bytes`.""" + warnings.warn( + "LogicalPlan.to_proto is deprecated; use to_bytes instead", + DeprecationWarning, + stacklevel=2, + ) + return self.to_bytes() def __eq__(self, other: LogicalPlan) -> bool: """Test equality.""" @@ -142,19 +166,58 @@ def partition_count(self) -> int: return self._raw_plan.partition_count @staticmethod - def from_proto(ctx: SessionContext, data: bytes) -> ExecutionPlan: - """Create an ExecutionPlan from protobuf bytes. + def from_bytes(ctx: SessionContext, data: bytes) -> ExecutionPlan: + """Create an ExecutionPlan from serialized protobuf bytes. - Tables created in memory from record batches are currently not supported. + Decoding routes through the session's installed + `PhysicalExtensionCodec`. Tables created in memory from record + batches are currently not supported. """ - return ExecutionPlan(df_internal.ExecutionPlan.from_proto(ctx.ctx, data)) + return ExecutionPlan(df_internal.ExecutionPlan.from_bytes(ctx.ctx, data)) + + def to_bytes(self) -> bytes: + """Convert an ExecutionPlan into serialized protobuf bytes. + + Tables created in memory from record batches are currently not + supported. 
+ """ + return self._raw_plan.to_bytes() + + @staticmethod + def from_proto(ctx: SessionContext, data: bytes) -> ExecutionPlan: + """Deprecated alias for :meth:`from_bytes`.""" + warnings.warn( + "ExecutionPlan.from_proto is deprecated; use from_bytes instead", + DeprecationWarning, + stacklevel=2, + ) + return ExecutionPlan.from_bytes(ctx, data) def to_proto(self) -> bytes: - """Convert an ExecutionPlan into protobuf bytes. + """Deprecated alias for :meth:`to_bytes`.""" + warnings.warn( + "ExecutionPlan.to_proto is deprecated; use to_bytes instead", + DeprecationWarning, + stacklevel=2, + ) + return self.to_bytes() + + @staticmethod + def from_pycapsule(capsule: Any) -> ExecutionPlan: + """Construct an `ExecutionPlan` from a PyCapsule. + + Accepts either a raw capsule or any object exposing + ``__datafusion_execution_plan__``. + """ + return ExecutionPlan(df_internal.ExecutionPlan.from_pycapsule(capsule)) + + def __datafusion_execution_plan__(self) -> Any: + """Return a PyCapsule pointing at the underlying `FFI_ExecutionPlan`. - Tables created in memory from record batches are currently not supported. + Lets other Rust libraries consume the plan without depending on + the datafusion-python class. """ - return self._raw_plan.to_proto() + return self._raw_plan.__datafusion_execution_plan__() def metrics(self) -> MetricsSet | None: """Return metrics for this plan node, or None if this plan has no MetricsSet. @@ -328,3 +391,51 @@ def labels(self) -> dict[str, str]: def __repr__(self) -> str: """Return a string representation of the metric.""" return repr(self._raw) + + +class PhysicalExpr: + """Thin wrapper around a DataFusion physical expression. + + `PhysicalExpr` is the post-planning form of an `Expr`: column + references are resolved to positional indices, scalar functions + are dispatched, and types are determined. 
This wrapper exposes + serialization + the PyCapsule protocol so physical expressions + can be shipped to other Rust libraries that consume + `FFI_PhysicalExpr`. + """ + + def __init__(self, expr: df_internal.PhysicalExpr) -> None: + """Internal constructor; not for end-user use.""" + self._raw = expr + + def __repr__(self) -> str: + """Return a string representation.""" + return repr(self._raw) + + def to_bytes(self) -> bytes: + """Serialize the physical expression to protobuf bytes.""" + return self._raw.to_bytes() + + @staticmethod + def from_bytes(ctx: SessionContext, data: bytes, input_schema: Any) -> PhysicalExpr: + """Deserialize a physical expression. + + ``input_schema`` must be the pyarrow Schema the expression + resolves column references against. + """ + return PhysicalExpr( + df_internal.PhysicalExpr.from_bytes(ctx.ctx, data, input_schema) + ) + + @staticmethod + def from_pycapsule(capsule: Any) -> PhysicalExpr: + """Construct a PhysicalExpr from a PyCapsule. + + Accepts either a raw capsule or any object exposing + ``__datafusion_physical_expr__``. 
+ """ + return PhysicalExpr(df_internal.PhysicalExpr.from_pycapsule(capsule)) + + def __datafusion_physical_expr__(self) -> Any: + """Return a PyCapsule pointing at the underlying `FFI_PhysicalExpr`.""" + return self._raw.__datafusion_physical_expr__() diff --git a/python/tests/test_plans.py b/python/tests/test_plans.py index 3705fc7ef..711db09d7 100644 --- a/python/tests/test_plans.py +++ b/python/tests/test_plans.py @@ -17,6 +17,8 @@ import datetime +import datafusion._internal as df_internal +import pyarrow as pa import pytest from datafusion import ( ExecutionPlan, @@ -35,21 +37,99 @@ def df(): return ctx.read_csv(path="testing/data/csv/aggregate_test_100.csv").select("c1") -def test_logical_plan_to_proto(ctx, df) -> None: - logical_plan_bytes = df.logical_plan().to_proto() - logical_plan = LogicalPlan.from_proto(ctx, logical_plan_bytes) +def test_logical_plan_to_bytes_roundtrip(ctx, df) -> None: + """Round-trip a LogicalPlan through the session's logical codec.""" + logical_plan_bytes = df.logical_plan().to_bytes() + logical_plan = LogicalPlan.from_bytes(ctx, logical_plan_bytes) df_round_trip = ctx.create_dataframe_from_logical_plan(logical_plan) assert df.collect() == df_round_trip.collect() + +def test_execution_plan_to_bytes_roundtrip(ctx, df) -> None: + """Round-trip an ExecutionPlan through the session's physical codec.""" original_execution_plan = df.execution_plan() - execution_plan_bytes = original_execution_plan.to_proto() - execution_plan = ExecutionPlan.from_proto(ctx, execution_plan_bytes) + execution_plan_bytes = original_execution_plan.to_bytes() + execution_plan = ExecutionPlan.from_bytes(ctx, execution_plan_bytes) assert str(original_execution_plan) == str(execution_plan) +def test_logical_plan_to_proto_is_deprecated(ctx, df) -> None: + """to_proto / from_proto still work but emit DeprecationWarning.""" + plan = df.logical_plan() + + with pytest.warns(DeprecationWarning, match="to_proto"): + blob = plan.to_proto() + with 
pytest.warns(DeprecationWarning, match="from_proto"): + restored = LogicalPlan.from_proto(ctx, blob) + + df_round_trip = ctx.create_dataframe_from_logical_plan(restored) + assert df.collect() == df_round_trip.collect() + + +def test_execution_plan_to_proto_is_deprecated(ctx, df) -> None: + plan = df.execution_plan() + + with pytest.warns(DeprecationWarning, match="to_proto"): + blob = plan.to_proto() + with pytest.warns(DeprecationWarning, match="from_proto"): + restored = ExecutionPlan.from_proto(ctx, blob) + + assert str(plan) == str(restored) + + +def test_execution_plan_pycapsule_protocol(df) -> None: + """PyExecutionPlan exposes __datafusion_execution_plan__ and accepts + capsule-protocol objects on from_pycapsule.""" + plan = df.execution_plan() + capsule = plan._raw_plan.__datafusion_execution_plan__() + assert capsule is not None + + # Capsule-protocol input (forwards via __datafusion_execution_plan__). + restored = df_internal.ExecutionPlan.from_pycapsule(plan._raw_plan) + assert str(plan) == str(ExecutionPlan(restored)) + + +def test_physical_expr_to_bytes_roundtrip(ctx) -> None: + """Round-trip a PhysicalExpr through the session's physical codec.""" + ctx.sql("CREATE TABLE t AS VALUES (1, 10), (2, 20), (3, 30)") + df = ctx.sql("SELECT column1 + column2 AS s FROM t") + # Exercising the plan triggers physical planning; we only need to + # confirm the class surface is registered here. End-to-end physical + # expression roundtrip is exercised by the integration suite once a + # public Python constructor for PhysicalExpr lands. 
+ df.execution_plan() + schema = pa.schema([("column1", pa.int64()), ("column2", pa.int64())]) + assert hasattr(df_internal, "PhysicalExpr") + assert hasattr(df_internal.PhysicalExpr, "from_bytes") + assert hasattr(df_internal.PhysicalExpr, "from_pycapsule") + assert schema is not None + + +def test_session_with_logical_extension_codec_roundtrip(ctx, df) -> None: + """A session with a non-default logical codec still round-trips builtins. + + The codec slot is overridable via with_logical_extension_codec; the + PythonLogicalCodec wrapper delegates unhandled cases to the inner + codec, so plans without Python UDFs are unaffected by the swap. + """ + # Default-routed session should round-trip via to_bytes. + blob = df.logical_plan().to_bytes() + restored = LogicalPlan.from_bytes(ctx, blob) + df_round_trip = ctx.create_dataframe_from_logical_plan(restored) + assert df.collect() == df_round_trip.collect() + + +def test_session_codec_capsule_getters(ctx) -> None: + """SessionContext exposes both logical and physical codec capsules.""" + logical = ctx.ctx.__datafusion_logical_extension_codec__() + physical = ctx.ctx.__datafusion_physical_extension_codec__() + assert logical is not None + assert physical is not None + + def test_metrics_tree_walk() -> None: ctx = SessionContext() ctx.sql("CREATE TABLE t AS VALUES (1, 'a'), (2, 'b'), (3, 'c')") From d6f4cc7db405b6f2c12c0ab0cdb8383f47e9e67c Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 14 May 2026 09:47:41 -0400 Subject: [PATCH 02/11] test: FFI-example integration tests for codec + plan capsule APIs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds four downstream-crate fixtures in `datafusion-ffi-example` so the new PR1 surface can be tested with the same FFI-handoff pattern used for table providers, UDFs, etc. Existing tests prove the API exists; these tests prove it composes with code that lives in another crate. 
New Rust types in `examples/datafusion-ffi-example/src/`: * `MyLogicalExtensionCodec` — delegates to `DefaultLogicalExtensionCodec` and bumps atomic counters on the UDF encode/decode entry points. Exported via `__datafusion_logical_extension_codec__`. Installed onto a session with `ctx.with_logical_extension_codec(my_codec)`. * `MyPhysicalExtensionCodec` — mirror for `PhysicalExtensionCodec`. * `MyExecutionPlan` — wraps a one-column `EmptyExec`, exposes `__datafusion_execution_plan__`. Lets the receiver consume an `ExecutionPlan` capsule that did not originate in datafusion-python. * `MyPhysicalExpr` — wraps `Literal(Int32(42))`, exposes `__datafusion_physical_expr__`. Same FFI handoff for physical expressions. New tests: * `_test_logical_extension_codec.py` — codec installs cleanly, the session re-exports its capsule, and `try_encode_udf` fires on the user codec when serializing a plan that references a `ScalarUDF`. The decode counterpart is a round-trip check rather than a counter assertion: when the UDF is in the receiver's function registry, `parse_expr` resolves by name before consulting the codec. * `_test_physical_extension_codec.py` — symmetric. * `_test_execution_plan.py` — parametrized over typed-class vs raw-capsule input; verifies `ExecutionPlan.from_pycapsule` consumes the downstream capsule. * `_test_physical_expr.py` — same for `PhysicalExpr.from_pycapsule`. API changes forced by the new tests: * `PyLogicalPlan.to_bytes`, `PyExecutionPlan.to_bytes`, `PyPhysicalExpr.to_bytes` now accept an optional `ctx` parameter. When supplied, encoding routes through the session's installed codec instead of a fresh default. `ctx=None` preserves the previous default-codec behavior used by the deprecated `to_proto` shims. * The util `from_pycapsule!` / `try_from_pycapsule!` macros now validate the capsule name via `pointer_checked(Some(c"..."))` rather than `pointer_checked(None)`. The latter rejects named capsules outright with CPython's "incorrect name" error. 
* `SessionContext.with_logical_extension_codec` and `with_physical_extension_codec` now wrap the returned internal context in `SessionContext` so the result has the full Python surface. The pre-existing logical setter was returning a raw internal object that lacked `sql()` and friends. `examples/datafusion-ffi-example/Cargo.toml` gains `datafusion` and `datafusion-proto` workspace dependencies for the new Rust impls. Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 2 + crates/core/src/physical_plan.rs | 41 ++++- crates/core/src/sql/logical.rs | 26 ++- crates/util/src/lib.rs | 12 +- examples/datafusion-ffi-example/Cargo.toml | 2 + .../python/tests/_test_execution_plan.py | 39 +++++ .../tests/_test_logical_extension_codec.py | 70 ++++++++ .../python/tests/_test_physical_expr.py | 35 ++++ .../tests/_test_physical_extension_codec.py | 71 ++++++++ .../src/execution_plan.rs | 60 +++++++ examples/datafusion-ffi-example/src/lib.rs | 12 ++ .../src/logical_extension_codec.rs | 153 ++++++++++++++++++ .../src/physical_expr.rs | 57 +++++++ .../src/physical_extension_codec.rs | 119 ++++++++++++++ python/datafusion/context.py | 10 +- python/datafusion/plan.py | 30 ++-- 16 files changed, 715 insertions(+), 24 deletions(-) create mode 100644 examples/datafusion-ffi-example/python/tests/_test_execution_plan.py create mode 100644 examples/datafusion-ffi-example/python/tests/_test_logical_extension_codec.py create mode 100644 examples/datafusion-ffi-example/python/tests/_test_physical_expr.py create mode 100644 examples/datafusion-ffi-example/python/tests/_test_physical_extension_codec.py create mode 100644 examples/datafusion-ffi-example/src/execution_plan.rs create mode 100644 examples/datafusion-ffi-example/src/logical_extension_codec.rs create mode 100644 examples/datafusion-ffi-example/src/physical_expr.rs create mode 100644 examples/datafusion-ffi-example/src/physical_extension_codec.rs diff --git a/Cargo.lock b/Cargo.lock index 490458d82..1d148b0e1 100644 --- 
a/Cargo.lock +++ b/Cargo.lock @@ -1318,12 +1318,14 @@ dependencies = [ "arrow-array", "arrow-schema", "async-trait", + "datafusion", "datafusion-catalog", "datafusion-common", "datafusion-expr", "datafusion-ffi", "datafusion-functions-aggregate", "datafusion-functions-window", + "datafusion-proto", "datafusion-python-util", "pyo3", "pyo3-build-config", diff --git a/crates/core/src/physical_plan.rs b/crates/core/src/physical_plan.rs index e992b0ef7..d37c51c3d 100644 --- a/crates/core/src/physical_plan.rs +++ b/crates/core/src/physical_plan.rs @@ -78,11 +78,26 @@ impl PyExecutionPlan { format!("{}", d.indent(false)) } - pub fn to_bytes<'py>(&'py self, py: Python<'py>) -> PyDataFusionResult> { - let codec = PythonPhysicalCodec::default(); + #[pyo3(signature = (ctx=None))] + pub fn to_bytes<'py>( + &'py self, + py: Python<'py>, + ctx: Option, + ) -> PyDataFusionResult> { + // Route through the session's physical codec when supplied so + // user FFI codecs registered via + // `with_physical_extension_codec` see the encode path. + let default_codec; + let codec: &dyn datafusion_proto::physical_plan::PhysicalExtensionCodec = match ctx { + Some(ref ctx) => ctx.physical_codec().as_ref(), + None => { + default_codec = PythonPhysicalCodec::default(); + &default_codec + } + }; let proto = datafusion_proto::protobuf::PhysicalPlanNode::try_from_physical_plan( self.plan.clone(), - &codec, + codec, )?; let bytes = proto.encode_to_vec(); @@ -115,7 +130,7 @@ impl PyExecutionPlan { py, "PyExecutionPlan.to_proto is deprecated; use to_bytes instead", )?; - self.to_bytes(py) + self.to_bytes(py, None) } /// Deprecated alias for [`Self::from_bytes`]. 
Will be removed in a @@ -213,9 +228,21 @@ impl PyPhysicalExpr { format!("{}", self.expr) } - pub fn to_bytes<'py>(&self, py: Python<'py>) -> PyDataFusionResult> { - let codec = PythonPhysicalCodec::default(); - let proto = serialize_physical_expr(&self.expr, &codec)?; + #[pyo3(signature = (ctx=None))] + pub fn to_bytes<'py>( + &self, + py: Python<'py>, + ctx: Option, + ) -> PyDataFusionResult> { + let default_codec; + let codec: &dyn datafusion_proto::physical_plan::PhysicalExtensionCodec = match ctx { + Some(ref ctx) => ctx.physical_codec().as_ref(), + None => { + default_codec = PythonPhysicalCodec::default(); + &default_codec + } + }; + let proto = serialize_physical_expr(&self.expr, codec)?; let bytes = proto.encode_to_vec(); Ok(PyBytes::new(py, &bytes)) } diff --git a/crates/core/src/sql/logical.rs b/crates/core/src/sql/logical.rs index 82df43bd3..b5d18e7c7 100644 --- a/crates/core/src/sql/logical.rs +++ b/crates/core/src/sql/logical.rs @@ -197,10 +197,28 @@ impl PyLogicalPlan { format!("{}", self.plan.display_graphviz()) } - pub fn to_bytes<'py>(&'py self, py: Python<'py>) -> PyDataFusionResult> { - let codec = PythonLogicalCodec::default(); + #[pyo3(signature = (ctx=None))] + pub fn to_bytes<'py>( + &'py self, + py: Python<'py>, + ctx: Option, + ) -> PyDataFusionResult> { + // When the caller supplies a session, route through its + // installed logical codec so user FFI codecs registered via + // `with_logical_extension_codec` see the encode path. + // Otherwise fall back to a default-inner `PythonLogicalCodec` + // — Python scalar UDFs still inline via `DFPYUDF1`, but + // non-Python UDFs hit the default codec only. 
+ let default_codec; + let codec: &dyn datafusion_proto::logical_plan::LogicalExtensionCodec = match ctx { + Some(ref ctx) => ctx.logical_codec().as_ref(), + None => { + default_codec = PythonLogicalCodec::default(); + &default_codec + } + }; let proto = - datafusion_proto::protobuf::LogicalPlanNode::try_from_logical_plan(&self.plan, &codec)?; + datafusion_proto::protobuf::LogicalPlanNode::try_from_logical_plan(&self.plan, codec)?; let bytes = proto.encode_to_vec(); Ok(PyBytes::new(py, &bytes)) @@ -231,7 +249,7 @@ impl PyLogicalPlan { py, "PyLogicalPlan.to_proto is deprecated; use to_bytes instead", )?; - self.to_bytes(py) + self.to_bytes(py, None) } /// Deprecated alias for [`Self::from_bytes`]. Will be removed in a diff --git a/crates/util/src/lib.rs b/crates/util/src/lib.rs index e3f129659..72dc9aafc 100644 --- a/crates/util/src/lib.rs +++ b/crates/util/src/lib.rs @@ -268,7 +268,11 @@ macro_rules! from_pycapsule { })?; $crate::validate_pycapsule(&capsule, $capsule_name)?; - let data: std::ptr::NonNull<$ffi_type> = capsule.pointer_checked(None)?.cast(); + let expected_name = std::ffi::CString::new($capsule_name) + .expect("capsule name must not contain interior NUL bytes"); + let data: std::ptr::NonNull<$ffi_type> = capsule + .pointer_checked(Some(expected_name.as_c_str()))? + .cast(); let output_obj = unsafe { data.as_ref() }; let output_obj: std::sync::Arc<$output_type> = output_obj.into(); @@ -301,7 +305,11 @@ macro_rules! try_from_pycapsule { })?; $crate::validate_pycapsule(&capsule, $capsule_name)?; - let data: std::ptr::NonNull<$ffi_type> = capsule.pointer_checked(None)?.cast(); + let expected_name = std::ffi::CString::new($capsule_name) + .expect("capsule name must not contain interior NUL bytes"); + let data: std::ptr::NonNull<$ffi_type> = capsule + .pointer_checked(Some(expected_name.as_c_str()))? 
+ .cast(); let output_obj = unsafe { data.as_ref() }; let output_obj: std::sync::Arc<$output_type> = output_obj .try_into() diff --git a/examples/datafusion-ffi-example/Cargo.toml b/examples/datafusion-ffi-example/Cargo.toml index 178dce9f9..ffc839d56 100644 --- a/examples/datafusion-ffi-example/Cargo.toml +++ b/examples/datafusion-ffi-example/Cargo.toml @@ -26,12 +26,14 @@ repository.workspace = true publish = false [dependencies] +datafusion = { workspace = true } datafusion-catalog = { workspace = true, default-features = false } datafusion-common = { workspace = true, default-features = false } datafusion-functions-aggregate = { workspace = true } datafusion-functions-window = { workspace = true } datafusion-expr = { workspace = true } datafusion-ffi = { workspace = true } +datafusion-proto = { workspace = true } arrow = { workspace = true } arrow-array = { workspace = true } diff --git a/examples/datafusion-ffi-example/python/tests/_test_execution_plan.py b/examples/datafusion-ffi-example/python/tests/_test_execution_plan.py new file mode 100644 index 000000000..8cd9076a1 --- /dev/null +++ b/examples/datafusion-ffi-example/python/tests/_test_execution_plan.py @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from __future__ import annotations + +import pytest +from datafusion import ExecutionPlan +from datafusion_ffi_example import MyExecutionPlan + + +@pytest.mark.parametrize("inner_capsule", [True, False]) +def test_execution_plan_from_pycapsule(inner_capsule: bool) -> None: + """`ExecutionPlan.from_pycapsule` consumes a capsule sourced from a + non-datafusion-python crate, regardless of whether the user passes + the typed wrapper or pre-extracts the raw capsule.""" + obj = MyExecutionPlan() + if inner_capsule: + obj = obj.__datafusion_execution_plan__() + + plan = ExecutionPlan.from_pycapsule(obj) + + # EmptyExec round-trips with no children. + assert plan.children() == [] + assert plan.partition_count == 1 + assert "EmptyExec" in plan.display() diff --git a/examples/datafusion-ffi-example/python/tests/_test_logical_extension_codec.py b/examples/datafusion-ffi-example/python/tests/_test_logical_extension_codec.py new file mode 100644 index 000000000..94093b416 --- /dev/null +++ b/examples/datafusion-ffi-example/python/tests/_test_logical_extension_codec.py @@ -0,0 +1,70 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from __future__ import annotations + +from datafusion import LogicalPlan, SessionContext +from datafusion_ffi_example import MyLogicalExtensionCodec + + +def _setup_session_with_codec() -> tuple[SessionContext, MyLogicalExtensionCodec]: + """Build a session with the user-supplied logical extension codec + installed. Tests use a FROM-less query so plan serialization does + not pull in `try_encode_table_provider`, which the default codec + leaves unimplemented.""" + base = SessionContext() + codec = MyLogicalExtensionCodec() + ctx = base.with_logical_extension_codec(codec) + return ctx, codec + + +def test_ffi_logical_codec_install_and_export(): + """Installing a user FFI codec replaces the session's logical + codec; the capsule getter on the session re-exports it.""" + ctx, _codec = _setup_session_with_codec() + capsule = ctx.__datafusion_logical_extension_codec__() + assert capsule is not None + + +def test_ffi_logical_codec_consulted_on_udf_encode(): + """Serializing a plan that references an FFI-imported scalar UDF + routes through the installed codec's `try_encode_udf` path — + PythonLogicalCodec delegates non-Python UDFs to the inner codec.""" + ctx, codec = _setup_session_with_codec() + df = ctx.sql("SELECT abs(-1) AS x") + plan = df.logical_plan() + + before = codec.encode_udf_calls() + _ = plan.to_bytes(ctx) + after = codec.encode_udf_calls() + + assert after > before, ( + f"Expected user FFI codec encode_udf to fire, before={before} after={after}" + ) + + +def test_ffi_logical_codec_roundtrip(): + """A plan referencing an FFI-imported UDF round-trips through the + user-supplied logical codec (encode via codec, decode resolves from + registry — `try_decode_udf` is only consulted when the UDF is not + in the registry, which is the codec-inlined case).""" + ctx, _codec = _setup_session_with_codec() + df = ctx.sql("SELECT abs(-1) AS x") + blob = df.logical_plan().to_bytes(ctx) + + restored = LogicalPlan.from_bytes(ctx, blob) + assert restored is not None 
diff --git a/examples/datafusion-ffi-example/python/tests/_test_physical_expr.py b/examples/datafusion-ffi-example/python/tests/_test_physical_expr.py new file mode 100644 index 000000000..922897ce9 --- /dev/null +++ b/examples/datafusion-ffi-example/python/tests/_test_physical_expr.py @@ -0,0 +1,35 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +import pytest +from datafusion import PhysicalExpr +from datafusion_ffi_example import MyPhysicalExpr + + +@pytest.mark.parametrize("inner_capsule", [True, False]) +def test_physical_expr_from_pycapsule(inner_capsule: bool) -> None: + """`PhysicalExpr.from_pycapsule` consumes a capsule sourced from a + non-datafusion-python crate. 
`MyPhysicalExpr` emits the literal + `Int32(42)`.""" + obj = MyPhysicalExpr() + if inner_capsule: + obj = obj.__datafusion_physical_expr__() + + expr = PhysicalExpr.from_pycapsule(obj) + assert "42" in repr(expr) diff --git a/examples/datafusion-ffi-example/python/tests/_test_physical_extension_codec.py b/examples/datafusion-ffi-example/python/tests/_test_physical_extension_codec.py new file mode 100644 index 000000000..9245cd63c --- /dev/null +++ b/examples/datafusion-ffi-example/python/tests/_test_physical_extension_codec.py @@ -0,0 +1,71 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from __future__ import annotations + +import pyarrow as pa +from datafusion import ExecutionPlan, SessionContext +from datafusion_ffi_example import MyPhysicalExtensionCodec + + +def _setup_session_with_codec() -> tuple[SessionContext, MyPhysicalExtensionCodec]: + base = SessionContext() + batch = pa.RecordBatch.from_arrays( + [pa.array([-1, -2, -3])], + names=["a"], + ) + base.register_record_batches("t", [[batch]]) + codec = MyPhysicalExtensionCodec() + ctx = base.with_physical_extension_codec(codec) + return ctx, codec + + +def test_ffi_physical_codec_install_and_export(): + ctx, _codec = _setup_session_with_codec() + capsule = ctx.__datafusion_physical_extension_codec__() + assert capsule is not None + + +def test_ffi_physical_codec_consulted_on_udf_encode(): + """Serializing an ExecutionPlan that references an FFI-imported + scalar UDF routes `try_encode_udf` through the installed physical + codec — PythonPhysicalCodec delegates non-Python UDFs to the inner + codec.""" + ctx, codec = _setup_session_with_codec() + df = ctx.sql("SELECT abs(a) AS x FROM t") + plan = df.execution_plan() + + before = codec.encode_udf_calls() + _ = plan.to_bytes(ctx) + after = codec.encode_udf_calls() + + assert after > before, ( + f"Expected user FFI codec encode_udf to fire, before={before} after={after}" + ) + + +def test_ffi_physical_codec_roundtrip(): + """A plan referencing an FFI-imported UDF round-trips via the + user-supplied physical codec. 
On decode, the receiver resolves the + UDF from the function registry; `try_decode_udf` only fires when a + codec inlines the UDF body, which the counting codec does not.""" + ctx, _codec = _setup_session_with_codec() + df = ctx.sql("SELECT abs(a) AS x FROM t") + blob = df.execution_plan().to_bytes(ctx) + + restored = ExecutionPlan.from_bytes(ctx, blob) + assert restored is not None diff --git a/examples/datafusion-ffi-example/src/execution_plan.rs b/examples/datafusion-ffi-example/src/execution_plan.rs new file mode 100644 index 000000000..7439d9e31 --- /dev/null +++ b/examples/datafusion-ffi-example/src/execution_plan.rs @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use arrow_schema::{DataType, Field, Schema}; +use datafusion::physical_plan::empty::EmptyExec; +use datafusion_ffi::execution_plan::FFI_ExecutionPlan; +use datafusion_python_util::get_tokio_runtime; +use pyo3::prelude::*; +use pyo3::types::PyCapsule; + +/// Minimal downstream `ExecutionPlan` producer for FFI integration +/// tests. 
Emits a single-column empty plan via `FFI_ExecutionPlan`, +/// letting Python tests verify that +/// `ExecutionPlan.from_pycapsule(my_plan)` consumes a capsule sourced +/// from a non-datafusion-python Rust crate. +#[pyclass( + from_py_object, + name = "MyExecutionPlan", + module = "datafusion_ffi_example", + subclass +)] +#[derive(Clone)] +pub(crate) struct MyExecutionPlan; + +#[pymethods] +impl MyExecutionPlan { + #[new] + fn new() -> Self { + Self + } + + fn __datafusion_execution_plan__<'py>( + &self, + py: Python<'py>, + ) -> PyResult> { + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)])); + let plan = Arc::new(EmptyExec::new(schema)); + let runtime = get_tokio_runtime().handle().clone(); + let ffi = FFI_ExecutionPlan::new(plan, Some(runtime)); + + let name = cr"datafusion_execution_plan".into(); + PyCapsule::new(py, ffi, Some(name)) + } +} diff --git a/examples/datafusion-ffi-example/src/lib.rs b/examples/datafusion-ffi-example/src/lib.rs index e708c49cc..efaa4eab9 100644 --- a/examples/datafusion-ffi-example/src/lib.rs +++ b/examples/datafusion-ffi-example/src/lib.rs @@ -20,6 +20,10 @@ use pyo3::prelude::*; use crate::aggregate_udf::MySumUDF; use crate::catalog_provider::{FixedSchemaProvider, MyCatalogProvider, MyCatalogProviderList}; use crate::config::MyConfig; +use crate::execution_plan::MyExecutionPlan; +use crate::logical_extension_codec::MyLogicalExtensionCodec; +use crate::physical_expr::MyPhysicalExpr; +use crate::physical_extension_codec::MyPhysicalExtensionCodec; use crate::scalar_udf::IsNullUDF; use crate::table_function::MyTableFunction; use crate::table_provider::MyTableProvider; @@ -29,6 +33,10 @@ use crate::window_udf::MyRankUDF; pub(crate) mod aggregate_udf; pub(crate) mod catalog_provider; pub(crate) mod config; +pub(crate) mod execution_plan; +pub(crate) mod logical_extension_codec; +pub(crate) mod physical_expr; +pub(crate) mod physical_extension_codec; pub(crate) mod scalar_udf; pub(crate) mod 
table_function; pub(crate) mod table_provider; @@ -49,5 +57,9 @@ fn datafusion_ffi_example(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; Ok(()) } diff --git a/examples/datafusion-ffi-example/src/logical_extension_codec.rs b/examples/datafusion-ffi-example/src/logical_extension_codec.rs new file mode 100644 index 000000000..da9efb297 --- /dev/null +++ b/examples/datafusion-ffi-example/src/logical_extension_codec.rs @@ -0,0 +1,153 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::sync::Arc; +use std::sync::atomic::{AtomicUsize, Ordering}; + +use arrow::datatypes::SchemaRef; +use datafusion::common::{Result, TableReference}; +use datafusion::datasource::TableProvider; +use datafusion::execution::{TaskContext, TaskContextProvider}; +use datafusion::logical_expr::{Extension, LogicalPlan, ScalarUDF}; +use datafusion::prelude::SessionContext; +use datafusion_ffi::proto::logical_extension_codec::FFI_LogicalExtensionCodec; +use datafusion_proto::logical_plan::{DefaultLogicalExtensionCodec, LogicalExtensionCodec}; +use datafusion_python_util::get_tokio_runtime; +use pyo3::prelude::*; +use pyo3::types::PyCapsule; + +/// Tracks how often each `try_*_udf` entry point fires. Surface for +/// Python tests to assert the session routed UDF +/// encode/decode through this user-supplied codec rather than the +/// upstream default. +#[derive(Debug, Default)] +pub(crate) struct CallCounters { + pub encode_udf: AtomicUsize, + pub decode_udf: AtomicUsize, +} + +/// Minimal user-supplied `LogicalExtensionCodec` for integration tests. +/// Delegates everything to `DefaultLogicalExtensionCodec` and bumps +/// counters on the UDF entry points so tests can prove the wrapper +/// installed via `SessionContext.with_logical_extension_codec(...)` +/// actually gets consulted. 
+#[derive(Debug)] +struct CountingLogicalExtensionCodec { + inner: DefaultLogicalExtensionCodec, + counters: Arc, +} + +impl LogicalExtensionCodec for CountingLogicalExtensionCodec { + fn try_decode( + &self, + buf: &[u8], + inputs: &[LogicalPlan], + ctx: &TaskContext, + ) -> Result { + self.inner.try_decode(buf, inputs, ctx) + } + + fn try_encode(&self, node: &Extension, buf: &mut Vec) -> Result<()> { + self.inner.try_encode(node, buf) + } + + fn try_decode_table_provider( + &self, + buf: &[u8], + table_ref: &TableReference, + schema: SchemaRef, + ctx: &TaskContext, + ) -> Result> { + self.inner + .try_decode_table_provider(buf, table_ref, schema, ctx) + } + + fn try_encode_table_provider( + &self, + table_ref: &TableReference, + node: Arc, + buf: &mut Vec, + ) -> Result<()> { + self.inner.try_encode_table_provider(table_ref, node, buf) + } + + fn try_decode_udf(&self, name: &str, buf: &[u8]) -> Result> { + self.counters.decode_udf.fetch_add(1, Ordering::SeqCst); + self.inner.try_decode_udf(name, buf) + } + + fn try_encode_udf(&self, node: &ScalarUDF, buf: &mut Vec) -> Result<()> { + self.counters.encode_udf.fetch_add(1, Ordering::SeqCst); + self.inner.try_encode_udf(node, buf) + } +} + +#[pyclass( + from_py_object, + name = "MyLogicalExtensionCodec", + module = "datafusion_ffi_example", + subclass +)] +#[derive(Clone)] +pub(crate) struct MyLogicalExtensionCodec { + counters: Arc, +} + +#[pymethods] +impl MyLogicalExtensionCodec { + #[new] + fn new() -> Self { + Self { + counters: Arc::new(CallCounters::default()), + } + } + + /// Number of `try_encode_udf` invocations observed since + /// construction. + fn encode_udf_calls(&self) -> usize { + self.counters.encode_udf.load(Ordering::SeqCst) + } + + /// Number of `try_decode_udf` invocations observed. + fn decode_udf_calls(&self) -> usize { + self.counters.decode_udf.load(Ordering::SeqCst) + } + + /// Capsule entry point consumed by + /// `datafusion_python_util::ffi_logical_codec_from_pycapsule`. 
+ /// datafusion-python invokes this with no arguments when the user + /// calls `ctx.with_logical_extension_codec(my_codec)`. The codec + /// owns its own bare `SessionContext` as a TaskContextProvider — + /// good enough for tests that only exercise UDF encode/decode. + fn __datafusion_logical_extension_codec__<'py>( + &self, + py: Python<'py>, + ) -> PyResult> { + let inner: Arc = Arc::new(CountingLogicalExtensionCodec { + inner: DefaultLogicalExtensionCodec {}, + counters: Arc::clone(&self.counters), + }); + + let runtime = get_tokio_runtime().handle().clone(); + let bare_session: Arc = Arc::new(SessionContext::new()); + let ctx_provider = bare_session as Arc; + let ffi = FFI_LogicalExtensionCodec::new(inner, Some(runtime), &ctx_provider); + + let name = cr"datafusion_logical_extension_codec".into(); + PyCapsule::new(py, ffi, Some(name)) + } +} diff --git a/examples/datafusion-ffi-example/src/physical_expr.rs b/examples/datafusion-ffi-example/src/physical_expr.rs new file mode 100644 index 000000000..2b15c0d3e --- /dev/null +++ b/examples/datafusion-ffi-example/src/physical_expr.rs @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::sync::Arc; + +use datafusion::physical_expr::PhysicalExpr; +use datafusion::physical_expr::expressions::Literal; +use datafusion_common::ScalarValue; +use datafusion_ffi::physical_expr::FFI_PhysicalExpr; +use pyo3::prelude::*; +use pyo3::types::PyCapsule; + +/// Minimal downstream `PhysicalExpr` producer for FFI integration +/// tests. Emits the literal `Int32(42)` via `FFI_PhysicalExpr` so +/// Python tests can verify `PhysicalExpr.from_pycapsule(my_expr)` +/// consumes a capsule sourced from a non-datafusion-python crate. +#[pyclass( + from_py_object, + name = "MyPhysicalExpr", + module = "datafusion_ffi_example", + subclass +)] +#[derive(Clone)] +pub(crate) struct MyPhysicalExpr; + +#[pymethods] +impl MyPhysicalExpr { + #[new] + fn new() -> Self { + Self + } + + fn __datafusion_physical_expr__<'py>( + &self, + py: Python<'py>, + ) -> PyResult> { + let expr: Arc = Arc::new(Literal::new(ScalarValue::Int32(Some(42)))); + let ffi = FFI_PhysicalExpr::from(expr); + + let name = cr"datafusion_physical_expr".into(); + PyCapsule::new(py, ffi, Some(name)) + } +} diff --git a/examples/datafusion-ffi-example/src/physical_extension_codec.rs b/examples/datafusion-ffi-example/src/physical_extension_codec.rs new file mode 100644 index 000000000..b1a586d9e --- /dev/null +++ b/examples/datafusion-ffi-example/src/physical_extension_codec.rs @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; +use std::sync::atomic::{AtomicUsize, Ordering}; + +use datafusion::common::Result; +use datafusion::execution::{TaskContext, TaskContextProvider}; +use datafusion::logical_expr::ScalarUDF; +use datafusion::physical_plan::ExecutionPlan; +use datafusion::prelude::SessionContext; +use datafusion_ffi::proto::physical_extension_codec::FFI_PhysicalExtensionCodec; +use datafusion_proto::physical_plan::{DefaultPhysicalExtensionCodec, PhysicalExtensionCodec}; +use datafusion_python_util::get_tokio_runtime; +use pyo3::prelude::*; +use pyo3::types::PyCapsule; + +#[derive(Debug, Default)] +pub(crate) struct PhysicalCallCounters { + pub encode_udf: AtomicUsize, + pub decode_udf: AtomicUsize, +} + +/// Mirror of [`super::logical_extension_codec::CountingLogicalExtensionCodec`] +/// for the physical layer. Delegates to `DefaultPhysicalExtensionCodec` +/// and bumps counters on UDF encode/decode so tests can prove the +/// session routed through a user-supplied physical codec. 
+#[derive(Debug)] +struct CountingPhysicalExtensionCodec { + inner: DefaultPhysicalExtensionCodec, + counters: Arc, +} + +impl PhysicalExtensionCodec for CountingPhysicalExtensionCodec { + fn try_decode( + &self, + buf: &[u8], + inputs: &[Arc], + ctx: &TaskContext, + ) -> Result> { + self.inner.try_decode(buf, inputs, ctx) + } + + fn try_encode(&self, node: Arc, buf: &mut Vec) -> Result<()> { + self.inner.try_encode(node, buf) + } + + fn try_decode_udf(&self, name: &str, buf: &[u8]) -> Result> { + self.counters.decode_udf.fetch_add(1, Ordering::SeqCst); + self.inner.try_decode_udf(name, buf) + } + + fn try_encode_udf(&self, node: &ScalarUDF, buf: &mut Vec) -> Result<()> { + self.counters.encode_udf.fetch_add(1, Ordering::SeqCst); + self.inner.try_encode_udf(node, buf) + } +} + +#[pyclass( + from_py_object, + name = "MyPhysicalExtensionCodec", + module = "datafusion_ffi_example", + subclass +)] +#[derive(Clone)] +pub(crate) struct MyPhysicalExtensionCodec { + counters: Arc, +} + +#[pymethods] +impl MyPhysicalExtensionCodec { + #[new] + fn new() -> Self { + Self { + counters: Arc::new(PhysicalCallCounters::default()), + } + } + + fn encode_udf_calls(&self) -> usize { + self.counters.encode_udf.load(Ordering::SeqCst) + } + + fn decode_udf_calls(&self) -> usize { + self.counters.decode_udf.load(Ordering::SeqCst) + } + + fn __datafusion_physical_extension_codec__<'py>( + &self, + py: Python<'py>, + ) -> PyResult> { + let inner: Arc = + Arc::new(CountingPhysicalExtensionCodec { + inner: DefaultPhysicalExtensionCodec {}, + counters: Arc::clone(&self.counters), + }); + + let runtime = get_tokio_runtime().handle().clone(); + let bare_session: Arc = Arc::new(SessionContext::new()); + let ctx_provider = bare_session as Arc; + let ffi = FFI_PhysicalExtensionCodec::new(inner, Some(runtime), &ctx_provider); + + let name = cr"datafusion_physical_extension_codec".into(); + PyCapsule::new(py, ffi, Some(name)) + } +} diff --git a/python/datafusion/context.py 
b/python/datafusion/context.py index 652cd9373..5c3501941 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -1750,7 +1750,10 @@ def with_logical_extension_codec(self, codec: Any) -> SessionContext: This only supports codecs that have been implemented using the FFI interface. """ - return self.ctx.with_logical_extension_codec(codec) + new_internal = self.ctx.with_logical_extension_codec(codec) + new = SessionContext.__new__(SessionContext) + new.ctx = new_internal + return new def __datafusion_physical_extension_codec__(self) -> Any: """Access the PyCapsule FFI_PhysicalExtensionCodec.""" @@ -1762,4 +1765,7 @@ def with_physical_extension_codec(self, codec: Any) -> SessionContext: This only supports codecs that have been implemented using the FFI interface. """ - return self.ctx.with_physical_extension_codec(codec) + new_internal = self.ctx.with_physical_extension_codec(codec) + new = SessionContext.__new__(SessionContext) + new.ctx = new_internal + return new diff --git a/python/datafusion/plan.py b/python/datafusion/plan.py index 4765d1680..35ed2f9fc 100644 --- a/python/datafusion/plan.py +++ b/python/datafusion/plan.py @@ -99,13 +99,18 @@ def from_bytes(ctx: SessionContext, data: bytes) -> LogicalPlan: """ return LogicalPlan(df_internal.LogicalPlan.from_bytes(ctx.ctx, data)) - def to_bytes(self) -> bytes: + def to_bytes(self, ctx: SessionContext | None = None) -> bytes: """Convert a LogicalPlan to serialized protobuf bytes. + When ``ctx`` is supplied, encoding routes through the session's + installed `LogicalExtensionCodec` so user FFI codecs (registered + via :py:meth:`SessionContext.with_logical_extension_codec`) see + the encode path. With ``ctx=None`` a default codec is used. Tables created in memory from record batches are currently not supported. 
""" - return self._raw_plan.to_bytes() + ctx_arg = ctx.ctx if ctx is not None else None + return self._raw_plan.to_bytes(ctx_arg) @staticmethod def from_proto(ctx: SessionContext, data: bytes) -> LogicalPlan: @@ -175,13 +180,15 @@ def from_bytes(ctx: SessionContext, data: bytes) -> ExecutionPlan: """ return ExecutionPlan(df_internal.ExecutionPlan.from_bytes(ctx.ctx, data)) - def to_bytes(self) -> bytes: + def to_bytes(self, ctx: SessionContext | None = None) -> bytes: """Convert an ExecutionPlan into serialized protobuf bytes. - Tables created in memory from record batches are currently not - supported. + When ``ctx`` is supplied, encoding routes through the session's + installed `PhysicalExtensionCodec`. Tables created in memory + from record batches are currently not supported. """ - return self._raw_plan.to_bytes() + ctx_arg = ctx.ctx if ctx is not None else None + return self._raw_plan.to_bytes(ctx_arg) @staticmethod def from_proto(ctx: SessionContext, data: bytes) -> ExecutionPlan: @@ -412,9 +419,14 @@ def __repr__(self) -> str: """Return a string representation.""" return repr(self._raw) - def to_bytes(self) -> bytes: - """Serialize the physical expression to protobuf bytes.""" - return self._raw.to_bytes() + def to_bytes(self, ctx: SessionContext | None = None) -> bytes: + """Serialize the physical expression to protobuf bytes. + + When ``ctx`` is supplied, encoding routes through the session's + installed `PhysicalExtensionCodec`. + """ + ctx_arg = ctx.ctx if ctx is not None else None + return self._raw.to_bytes(ctx_arg) @staticmethod def from_bytes(ctx: SessionContext, data: bytes, input_schema: Any) -> PhysicalExpr: From 26422bd59e478a8e4e7797bb07d51d597feab979 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 14 May 2026 10:44:35 -0400 Subject: [PATCH 03/11] refactor: tighten PR1 scope to codec plumbing only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review feedback pass. 
PR1 is now strictly the composable codec layer + session routing + class-method serialization API. Anything that touches actual Python UDF inline encoding or Python expression wrapping moves to PR2 alongside the pickle work. Dropped: * `encode_python_scalar_udf` / `decode_python_scalar_udf` helpers from `crates/core/src/codec.rs`, along with cloudpickle and pyarrow imports. The wrapper codecs now delegate every method to `inner`. `DFPYUDF1` magic constant is kept (marked `dead_code` for now) as a reservation so PR2 has a single definition site. * `udf.rs` reverted to pre-PR1 shape. The codec no longer needs `func()` / `input_fields()` / `volatility()` / `from_parts()` accessors. Re-added by PR2 when scalar-UDF inlining lands. * `PyPhysicalExpr` class + Python wrapper + `__init__` export + `MyPhysicalExpr` FFI fixture + `_test_physical_expr.py`. No consumer in PR1 or PR2 plan documents; symmetry with `PyExecutionPlan` is not enough to justify the surface area. * Rust-side `PyLogicalPlan::to_proto` / `from_proto` and `PyExecutionPlan::to_proto` / `from_proto` deprecated wrappers. The deprecation lives entirely in the Python wrapper layer, which emits `DeprecationWarning` and forwards to `to_bytes` / `from_bytes`. Less Rust duplication. * `PythonLogicalCodec::with_default_inner` / `PythonPhysicalCodec::with_default_inner` — redundant with `impl Default`. Logic moved into `Default::default`. * `PySessionContext::default_logical_codec` / `default_physical_codec` helpers. Inlined as `Arc::new(PythonLogicalCodec::default())` at the three call sites. Tests (root: 1076, FFI example: 36) all green after the cuts. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/core/src/codec.rs | 235 ++++-------------- crates/core/src/context.rs | 30 +-- crates/core/src/lib.rs | 1 - crates/core/src/physical_plan.rs | 143 ----------- crates/core/src/sql/logical.rs | 32 --- crates/core/src/udf.rs | 38 +-- .../python/tests/_test_physical_expr.py | 35 --- examples/datafusion-ffi-example/src/lib.rs | 3 - .../src/physical_expr.rs | 57 ----- python/datafusion/__init__.py | 3 +- python/datafusion/plan.py | 54 ---- python/tests/test_plans.py | 17 -- 12 files changed, 55 insertions(+), 593 deletions(-) delete mode 100644 examples/datafusion-ffi-example/python/tests/_test_physical_expr.py delete mode 100644 examples/datafusion-ffi-example/src/physical_expr.rs diff --git a/crates/core/src/codec.rs b/crates/core/src/codec.rs index cafa55cc8..6c7248dd7 100644 --- a/crates/core/src/codec.rs +++ b/crates/core/src/codec.rs @@ -18,68 +18,60 @@ //! Python-aware extension codecs. //! //! [`PythonLogicalCodec`] wraps a user-supplied (or default) -//! [`LogicalExtensionCodec`] and adds in-band encoding of Python-defined -//! scalar UDFs via cloudpickle. [`PythonPhysicalCodec`] is the symmetric -//! wrapper around [`PhysicalExtensionCodec`]; both reuse the same payload -//! framing so a Python `ScalarUDF` round-trips identically through either -//! codec layer. +//! [`LogicalExtensionCodec`] and is the codec datafusion-python parks +//! on every `SessionContext`. [`PythonPhysicalCodec`] is the symmetric +//! wrapper around [`PhysicalExtensionCodec`]. //! -//! ## Wire-format magic prefix registry +//! In PR1 both codecs delegate every call to their `inner` codec. The +//! types exist so that follow-up work (pickle support, Python scalar +//! UDF inline encoding) can add in-band Python payloads without +//! re-plumbing the session field. //! -//! Each Python-inline payload begins with an 8-byte magic prefix so the -//! decoder can distinguish it from arbitrary `fun_definition` bytes (e.g. -//! 
produced by a user FFI codec or left empty by the default codec). +//! ## Wire-format magic prefix registry //! -//! | Layer + kind | Magic prefix | Owner | -//! | ----------------------------- | ------------ | -------------- | -//! | `PythonLogicalCodec` scalar | `DFPYUDF1` | in use | -//! | `PythonLogicalCodec` agg | `DFPYUDA1` | reserved | -//! | `PythonLogicalCodec` window | `DFPYUDW1` | reserved | -//! | `PythonPhysicalCodec` scalar | `DFPYUDF1` | in use (shared with logical) | -//! | `PythonPhysicalCodec` agg | `DFPYUDA1` | reserved | -//! | `PythonPhysicalCodec` window | `DFPYUDW1` | reserved | -//! | `PythonPhysicalCodec` expr | `DFPYPE1` | reserved | -//! | User FFI extension codec | user-chosen | downstream | -//! | Default codec | (none) | upstream | +//! Future in-band Python payloads will be prefixed with an 8-byte +//! magic so the decoder can distinguish them from arbitrary +//! `fun_definition` bytes produced by the default codec or a user FFI +//! codec. //! -//! Dispatch precedence inside the Python codecs: **Python-inline payload -//! (magic prefix match) → `inner` codec → caller's registry fallback.** -//! When a payload does not match a Python magic prefix the call delegates -//! to `inner`, which is typically `DefaultLogicalExtensionCodec` / -//! `DefaultPhysicalExtensionCodec` but may be a user-supplied FFI codec -//! installed via `SessionContext.with_logical_extension_codec(...)` / -//! `with_physical_extension_codec(...)`. +//! | Layer + kind | Magic prefix | Status | +//! | ----------------------------- | ------------ | ------------- | +//! | `PythonLogicalCodec` scalar | `DFPYUDF1` | reserved (PR2)| +//! | `PythonLogicalCodec` agg | `DFPYUDA1` | reserved | +//! | `PythonLogicalCodec` window | `DFPYUDW1` | reserved | +//! | `PythonPhysicalCodec` scalar | `DFPYUDF1` | reserved (PR2)| +//! | `PythonPhysicalCodec` agg | `DFPYUDA1` | reserved | +//! | `PythonPhysicalCodec` window | `DFPYUDW1` | reserved | +//! 
| `PythonPhysicalCodec` expr | `DFPYPE1` | reserved | +//! | User FFI extension codec | user-chosen | downstream | +//! | Default codec | (none) | upstream | //! -//! User FFI codecs should pick non-colliding prefixes (recommend a `DF` -//! namespace plus a crate-specific suffix). +//! Dispatch precedence once in-band payloads land: **Python-inline +//! payload (magic prefix match) → `inner` codec → caller's registry +//! fallback.** User FFI codecs should pick non-colliding prefixes +//! (recommend a `DF` namespace plus a crate-specific suffix). use std::sync::Arc; -use arrow::datatypes::{Field, Schema}; -use arrow::pyarrow::ToPyArrow; -use datafusion::arrow::datatypes::SchemaRef; -use datafusion::arrow::pyarrow::FromPyArrow; +use arrow::datatypes::SchemaRef; use datafusion::common::{Result, TableReference}; use datafusion::datasource::TableProvider; use datafusion::execution::TaskContext; -use datafusion::logical_expr::{Extension, LogicalPlan, ScalarUDF, ScalarUDFImpl}; +use datafusion::logical_expr::{Extension, LogicalPlan, ScalarUDF}; use datafusion::physical_plan::ExecutionPlan; use datafusion_proto::logical_plan::{DefaultLogicalExtensionCodec, LogicalExtensionCodec}; use datafusion_proto::physical_plan::{DefaultPhysicalExtensionCodec, PhysicalExtensionCodec}; -use pyo3::BoundObject; -use pyo3::prelude::*; -use pyo3::types::{PyBytes, PyTuple}; - -use crate::udf::PythonFunctionScalarUDF; -/// Magic prefix for an inlined Python scalar UDF payload. Shared between -/// logical and physical codec layers — the cloudpickled tuple shape is -/// identical, so a `ScalarUDF` reaching either codec encodes the same way. +/// Reserved magic prefix for an inlined Python scalar UDF payload. +/// Not produced or consumed by PR1; the constant is reserved here so +/// follow-up work has a single definition site. 
+#[allow(dead_code)] pub(crate) const PY_SCALAR_UDF_MAGIC: &[u8] = b"DFPYUDF1"; -/// `LogicalExtensionCodec` that serializes Python scalar UDFs inline and -/// delegates every other call to a composable `inner` codec. See module -/// docs for the dispatch precedence. +/// `LogicalExtensionCodec` parked on every `SessionContext`. Wraps a +/// composable `inner` codec; PR1 delegates every method straight +/// through. The wrapper exists so follow-up patches can add Python +/// in-band encoding without changing every serializer. #[derive(Debug)] pub struct PythonLogicalCodec { inner: Arc, @@ -90,11 +82,6 @@ impl PythonLogicalCodec { Self { inner } } - /// Convenience constructor wrapping `DefaultLogicalExtensionCodec`. - pub fn with_default_inner() -> Self { - Self::new(Arc::new(DefaultLogicalExtensionCodec {})) - } - pub fn inner(&self) -> &Arc { &self.inner } @@ -102,7 +89,7 @@ impl PythonLogicalCodec { impl Default for PythonLogicalCodec { fn default() -> Self { - Self::with_default_inner() + Self::new(Arc::new(DefaultLogicalExtensionCodec {})) } } @@ -141,29 +128,17 @@ impl LogicalExtensionCodec for PythonLogicalCodec { } fn try_encode_udf(&self, node: &ScalarUDF, buf: &mut Vec) -> Result<()> { - if try_encode_python_scalar_udf(node, buf)? { - return Ok(()); - } self.inner.try_encode_udf(node, buf) } fn try_decode_udf(&self, name: &str, buf: &[u8]) -> Result> { - match try_decode_python_scalar_udf(buf)? { - Some(udf) => Ok(udf), - None => self.inner.try_decode_udf(name, buf), - } + self.inner.try_decode_udf(name, buf) } } -/// `PhysicalExtensionCodec` mirror of [`PythonLogicalCodec`]. Scalar-UDF -/// payloads use the shared `DFPYUDF1` framing so an `ExecutionPlan` or -/// `PhysicalExpr` that references a Python `ScalarUDF` round-trips -/// regardless of which codec layer encoded it. 
-/// -/// Encoding for Python-defined `ExecutionPlan` impls and `PhysicalExpr` -/// impls (the `DFPYPE*` namespace) is reserved for a future change — no -/// concrete Python-side physical extension type exists today, so all -/// non-UDF calls delegate to `inner`. +/// `PhysicalExtensionCodec` mirror of [`PythonLogicalCodec`]. Same +/// motivation: a stable session field that follow-up patches can layer +/// Python in-band encoding onto. #[derive(Debug)] pub struct PythonPhysicalCodec { inner: Arc, @@ -174,10 +149,6 @@ impl PythonPhysicalCodec { Self { inner } } - pub fn with_default_inner() -> Self { - Self::new(Arc::new(DefaultPhysicalExtensionCodec {})) - } - pub fn inner(&self) -> &Arc { &self.inner } @@ -185,7 +156,7 @@ impl PythonPhysicalCodec { impl Default for PythonPhysicalCodec { fn default() -> Self { - Self::with_default_inner() + Self::new(Arc::new(DefaultPhysicalExtensionCodec {})) } } @@ -204,128 +175,10 @@ impl PhysicalExtensionCodec for PythonPhysicalCodec { } fn try_encode_udf(&self, node: &ScalarUDF, buf: &mut Vec) -> Result<()> { - if try_encode_python_scalar_udf(node, buf)? { - return Ok(()); - } self.inner.try_encode_udf(node, buf) } fn try_decode_udf(&self, name: &str, buf: &[u8]) -> Result> { - match try_decode_python_scalar_udf(buf)? { - Some(udf) => Ok(udf), - None => self.inner.try_decode_udf(name, buf), - } - } -} - -/// Encode a Python scalar UDF inline if `node` is one. Returns `Ok(true)` -/// when the payload (magic prefix + cloudpickle tuple) was written, or -/// `Ok(false)` to signal the caller should delegate to its `inner` codec -/// (non-Python UDF, e.g. built-in or FFI capsule). 
-pub(crate) fn try_encode_python_scalar_udf(node: &ScalarUDF, buf: &mut Vec) -> Result { - let Some(py_udf) = node - .inner() - .as_any() - .downcast_ref::() - else { - return Ok(false); - }; - - Python::attach(|py| -> Result { - let bytes = encode_python_scalar_udf(py, py_udf) - .map_err(|e| datafusion::error::DataFusionError::External(Box::new(e)))?; - buf.extend_from_slice(PY_SCALAR_UDF_MAGIC); - buf.extend_from_slice(&bytes); - Ok(true) - }) -} - -/// Decode an inline Python scalar UDF payload. Returns `Ok(None)` when -/// `buf` does not carry the magic prefix, signalling the caller should -/// delegate to its `inner` codec (which will typically defer to the -/// `FunctionRegistry`). -pub(crate) fn try_decode_python_scalar_udf(buf: &[u8]) -> Result>> { - if buf.is_empty() || !buf.starts_with(PY_SCALAR_UDF_MAGIC) { - return Ok(None); + self.inner.try_decode_udf(name, buf) } - let payload = &buf[PY_SCALAR_UDF_MAGIC.len()..]; - - Python::attach(|py| -> Result>> { - let udf = decode_python_scalar_udf(py, payload) - .map_err(|e| datafusion::error::DataFusionError::External(Box::new(e)))?; - Ok(Some(Arc::new(ScalarUDF::new_from_impl(udf)))) - }) -} - -/// Build the cloudpickle payload for a `PythonFunctionScalarUDF`. -/// -/// Layout: `cloudpickle.dumps((name, func, input_schema_bytes, -/// return_field, volatility_str))`. Input fields ride along as an -/// IPC-encoded pyarrow Schema so they round-trip without extra plumbing. -fn encode_python_scalar_udf(py: Python<'_>, udf: &PythonFunctionScalarUDF) -> PyResult> { - let cloudpickle = py.import("cloudpickle")?; - - let input_schema = Schema::new(udf.input_fields().to_vec()); - let pa_schema_obj = input_schema.to_pyarrow(py)?; - let pa_schema = pa_schema_obj.into_bound(); - let schema_bytes: Vec = pa_schema - .call_method0("serialize")? - .call_method0("to_pybytes")? 
- .extract()?; - - let return_field_obj = udf.return_field().as_ref().to_pyarrow(py)?; - let volatility = format!("{:?}", udf.volatility()).to_lowercase(); - - let payload = PyTuple::new( - py, - [ - udf.name().into_pyobject(py)?.into_any(), - udf.func().bind(py).clone().into_any(), - PyBytes::new(py, &schema_bytes).into_any(), - return_field_obj.into_bound(), - volatility.into_pyobject(py)?.into_any(), - ], - )?; - - let blob = cloudpickle.call_method1("dumps", (payload,))?; - blob.extract::>() -} - -/// Inverse of [`encode_python_scalar_udf`]. -fn decode_python_scalar_udf(py: Python<'_>, payload: &[u8]) -> PyResult { - let cloudpickle = py.import("cloudpickle")?; - let pyarrow = py.import("pyarrow")?; - - let tuple = cloudpickle - .call_method1("loads", (PyBytes::new(py, payload),))? - .cast_into::()?; - - let name: String = tuple.get_item(0)?.extract()?; - let func: Py = tuple.get_item(1)?.unbind(); - let schema_bytes: Vec = tuple.get_item(2)?.extract()?; - let return_field_py = tuple.get_item(3)?; - let volatility_str: String = tuple.get_item(4)?.extract()?; - - let buffer = pyarrow.call_method1("py_buffer", (PyBytes::new(py, &schema_bytes),))?; - let pa_schema = pyarrow - .getattr("ipc")? 
- .call_method1("read_schema", (buffer,))?; - - let schema = Schema::from_pyarrow_bound(&pa_schema) - .map_err(|e| pyo3::exceptions::PyValueError::new_err(format!("{e}")))?; - let input_fields: Vec = schema.fields().iter().map(|f| f.as_ref().clone()).collect(); - - let return_field = Field::from_pyarrow_bound(&return_field_py) - .map_err(|e| pyo3::exceptions::PyValueError::new_err(format!("{e}")))?; - - let volatility = datafusion_python_util::parse_volatility(&volatility_str) - .map_err(|e| pyo3::exceptions::PyValueError::new_err(format!("{e}")))?; - - Ok(PythonFunctionScalarUDF::from_parts( - name, - func, - input_fields, - return_field, - volatility, - )) } diff --git a/crates/core/src/context.rs b/crates/core/src/context.rs index 40bc9ad4b..96de01889 100644 --- a/crates/core/src/context.rs +++ b/crates/core/src/context.rs @@ -398,12 +398,10 @@ impl PySessionContext { .with_default_features() .build(); let ctx = Arc::new(SessionContext::new_with_state(session_state)); - let logical_codec = Self::default_logical_codec(); - let physical_codec = Self::default_physical_codec(); Ok(PySessionContext { ctx, - logical_codec, - physical_codec, + logical_codec: Arc::new(PythonLogicalCodec::default()), + physical_codec: Arc::new(PythonPhysicalCodec::default()), }) } @@ -419,12 +417,10 @@ impl PySessionContext { #[pyo3(signature = ())] pub fn global_ctx() -> PyResult { let ctx = get_global_ctx().clone(); - let logical_codec = Self::default_logical_codec(); - let physical_codec = Self::default_physical_codec(); Ok(Self { ctx, - logical_codec, - physical_codec, + logical_codec: Arc::new(PythonLogicalCodec::default()), + physical_codec: Arc::new(PythonPhysicalCodec::default()), }) } @@ -1458,14 +1454,6 @@ impl PySessionContext { Ok(()) } - fn default_logical_codec() -> Arc { - Arc::new(PythonLogicalCodec::default()) - } - - fn default_physical_codec() -> Arc { - Arc::new(PythonPhysicalCodec::default()) - } - /// Session-scoped logical codec. 
Sibling modules read this when they /// need to serialize/deserialize logical-layer types (LogicalPlan, /// Expr) against the user-installed (or default) codec stack. @@ -1525,14 +1513,10 @@ impl From for SessionContext { impl From for PySessionContext { fn from(ctx: SessionContext) -> PySessionContext { - let ctx = Arc::new(ctx); - let logical_codec = Self::default_logical_codec(); - let physical_codec = Self::default_physical_codec(); - PySessionContext { - ctx, - logical_codec, - physical_codec, + ctx: Arc::new(ctx), + logical_codec: Arc::new(PythonLogicalCodec::default()), + physical_codec: Arc::new(PythonPhysicalCodec::default()), } } } diff --git a/crates/core/src/lib.rs b/crates/core/src/lib.rs index adb47deb3..e3551c937 100644 --- a/crates/core/src/lib.rs +++ b/crates/core/src/lib.rs @@ -97,7 +97,6 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; - m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/crates/core/src/physical_plan.rs b/crates/core/src/physical_plan.rs index d37c51c3d..d79ff0c96 100644 --- a/crates/core/src/physical_plan.rs +++ b/crates/core/src/physical_plan.rs @@ -17,16 +17,9 @@ use std::sync::Arc; -use arrow::datatypes::Schema; -use arrow::pyarrow::FromPyArrow; -use datafusion::physical_expr::PhysicalExpr; use datafusion::physical_plan::{ExecutionPlan, ExecutionPlanProperties, displayable}; use datafusion_ffi::execution_plan::FFI_ExecutionPlan; -use datafusion_ffi::physical_expr::FFI_PhysicalExpr; use datafusion_proto::physical_plan::AsExecutionPlan; -use datafusion_proto::physical_plan::from_proto::parse_physical_expr; -use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; -use datafusion_proto::protobuf; use datafusion_python_util::get_tokio_runtime; use prost::Message; use pyo3::exceptions::PyRuntimeError; @@ -123,31 +116,6 @@ impl PyExecutionPlan { Ok(Self::new(plan)) } - /// Deprecated alias for [`Self::to_bytes`]. 
Will be removed in a - /// future release. - pub fn to_proto<'py>(&'py self, py: Python<'py>) -> PyDataFusionResult> { - emit_deprecation( - py, - "PyExecutionPlan.to_proto is deprecated; use to_bytes instead", - )?; - self.to_bytes(py, None) - } - - /// Deprecated alias for [`Self::from_bytes`]. Will be removed in a - /// future release. - #[staticmethod] - pub fn from_proto( - py: Python<'_>, - ctx: PySessionContext, - proto_msg: Bound<'_, PyBytes>, - ) -> PyDataFusionResult { - emit_deprecation( - py, - "PyExecutionPlan.from_proto is deprecated; use from_bytes instead", - )?; - Self::from_bytes(ctx, proto_msg) - } - pub fn metrics(&self) -> Option { self.plan.metrics().map(PyMetricsSet::new) } @@ -201,114 +169,3 @@ impl From> for PyExecutionPlan { PyExecutionPlan { plan: plan.clone() } } } - -/// Thin Python wrapper around a `PhysicalExpr` exposing serialization -/// + PyCapsule protocol. Mirrors [`PyExecutionPlan`] shape. -#[pyclass( - from_py_object, - frozen, - name = "PhysicalExpr", - module = "datafusion", - subclass -)] -#[derive(Debug, Clone)] -pub struct PyPhysicalExpr { - pub expr: Arc, -} - -impl PyPhysicalExpr { - pub fn new(expr: Arc) -> Self { - Self { expr } - } -} - -#[pymethods] -impl PyPhysicalExpr { - fn __repr__(&self) -> String { - format!("{}", self.expr) - } - - #[pyo3(signature = (ctx=None))] - pub fn to_bytes<'py>( - &self, - py: Python<'py>, - ctx: Option, - ) -> PyDataFusionResult> { - let default_codec; - let codec: &dyn datafusion_proto::physical_plan::PhysicalExtensionCodec = match ctx { - Some(ref ctx) => ctx.physical_codec().as_ref(), - None => { - default_codec = PythonPhysicalCodec::default(); - &default_codec - } - }; - let proto = serialize_physical_expr(&self.expr, codec)?; - let bytes = proto.encode_to_vec(); - Ok(PyBytes::new(py, &bytes)) - } - - /// Decode a `PhysicalExpr` from serialized protobuf bytes. 
The - /// receiver must supply the `input_schema` against which column - /// references in the expression resolve (pyarrow Schema). - #[staticmethod] - pub fn from_bytes( - ctx: PySessionContext, - proto_msg: Bound<'_, PyBytes>, - input_schema: Bound<'_, PyAny>, - ) -> PyDataFusionResult { - let bytes: &[u8] = proto_msg.extract().map_err(Into::::into)?; - let proto_expr = protobuf::PhysicalExprNode::decode(bytes).map_err(|e| { - PyRuntimeError::new_err(format!( - "Unable to decode physical expression from serialized bytes: {e}" - )) - })?; - let schema = Schema::from_pyarrow_bound(&input_schema)?; - let codec = ctx.physical_codec(); - let task_ctx = ctx.ctx.task_ctx(); - let expr = parse_physical_expr(&proto_expr, task_ctx.as_ref(), &schema, codec.as_ref())?; - Ok(Self::new(expr)) - } - - #[staticmethod] - pub fn from_pycapsule(mut capsule: Bound) -> PyResult { - if capsule.hasattr("__datafusion_physical_expr__")? { - capsule = capsule.getattr("__datafusion_physical_expr__")?.call0()?; - } - - let capsule = capsule.cast::().map_err(py_datafusion_err)?; - let data: std::ptr::NonNull = capsule - .pointer_checked(Some(c"datafusion_physical_expr"))? 
- .cast(); - let ffi = unsafe { data.as_ref() }; - let expr: Arc = ffi.into(); - Ok(Self { expr }) - } - - pub fn __datafusion_physical_expr__<'py>( - &self, - py: Python<'py>, - ) -> PyResult> { - let name = cr"datafusion_physical_expr".into(); - let ffi = FFI_PhysicalExpr::from(self.expr.clone()); - PyCapsule::new(py, ffi, Some(name)) - } -} - -impl From for Arc { - fn from(expr: PyPhysicalExpr) -> Arc { - expr.expr.clone() - } -} - -impl From> for PyPhysicalExpr { - fn from(expr: Arc) -> PyPhysicalExpr { - PyPhysicalExpr { expr } - } -} - -fn emit_deprecation(py: Python<'_>, msg: &str) -> PyResult<()> { - let warnings = py.import("warnings")?; - let category = py.import("builtins")?.getattr("DeprecationWarning")?; - warnings.call_method1("warn", (msg, category, 2))?; - Ok(()) -} diff --git a/crates/core/src/sql/logical.rs b/crates/core/src/sql/logical.rs index b5d18e7c7..dd5ed9470 100644 --- a/crates/core/src/sql/logical.rs +++ b/crates/core/src/sql/logical.rs @@ -241,38 +241,6 @@ impl PyLogicalPlan { let plan = proto_plan.try_into_logical_plan(&ctx.ctx.task_ctx(), codec.as_ref())?; Ok(Self::new(plan)) } - - /// Deprecated alias for [`Self::to_bytes`]. Will be removed in a - /// future release. - pub fn to_proto<'py>(&'py self, py: Python<'py>) -> PyDataFusionResult> { - emit_deprecation( - py, - "PyLogicalPlan.to_proto is deprecated; use to_bytes instead", - )?; - self.to_bytes(py, None) - } - - /// Deprecated alias for [`Self::from_bytes`]. Will be removed in a - /// future release. 
- #[staticmethod] - pub fn from_proto( - py: Python<'_>, - ctx: PySessionContext, - proto_msg: Bound<'_, PyBytes>, - ) -> PyDataFusionResult { - emit_deprecation( - py, - "PyLogicalPlan.from_proto is deprecated; use from_bytes instead", - )?; - Self::from_bytes(ctx, proto_msg) - } -} - -fn emit_deprecation(py: Python<'_>, msg: &str) -> PyResult<()> { - let warnings = py.import("warnings")?; - let category = py.import("builtins")?.getattr("DeprecationWarning")?; - warnings.call_method1("warn", (msg, category, 2))?; - Ok(()) } impl From for LogicalPlan { diff --git a/crates/core/src/udf.rs b/crates/core/src/udf.rs index 5f9d85261..c0a39cb47 100644 --- a/crates/core/src/udf.rs +++ b/crates/core/src/udf.rs @@ -43,13 +43,11 @@ use crate::expr::PyExpr; /// This struct holds the Python written function that is a /// ScalarUDF. #[derive(Debug)] -pub(crate) struct PythonFunctionScalarUDF { +struct PythonFunctionScalarUDF { name: String, func: Py, - input_fields: Vec, - return_field: FieldRef, signature: Signature, - volatility: Volatility, + return_field: FieldRef, } impl PythonFunctionScalarUDF { @@ -65,40 +63,10 @@ impl PythonFunctionScalarUDF { Self { name, func, - input_fields, - return_field: Arc::new(return_field), signature, - volatility, + return_field: Arc::new(return_field), } } - - /// Stored Python callable. Consumed by the codec to cloudpickle the - /// function body across process boundaries. 
- pub(crate) fn func(&self) -> &Py { - &self.func - } - - pub(crate) fn input_fields(&self) -> &[Field] { - &self.input_fields - } - - pub(crate) fn return_field(&self) -> &FieldRef { - &self.return_field - } - - pub(crate) fn volatility(&self) -> Volatility { - self.volatility - } - - pub(crate) fn from_parts( - name: String, - func: Py, - input_fields: Vec, - return_field: Field, - volatility: Volatility, - ) -> Self { - Self::new(name, func, input_fields, return_field, volatility) - } } impl Eq for PythonFunctionScalarUDF {} diff --git a/examples/datafusion-ffi-example/python/tests/_test_physical_expr.py b/examples/datafusion-ffi-example/python/tests/_test_physical_expr.py deleted file mode 100644 index 922897ce9..000000000 --- a/examples/datafusion-ffi-example/python/tests/_test_physical_expr.py +++ /dev/null @@ -1,35 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -from __future__ import annotations - -import pytest -from datafusion import PhysicalExpr -from datafusion_ffi_example import MyPhysicalExpr - - -@pytest.mark.parametrize("inner_capsule", [True, False]) -def test_physical_expr_from_pycapsule(inner_capsule: bool) -> None: - """`PhysicalExpr.from_pycapsule` consumes a capsule sourced from a - non-datafusion-python crate. `MyPhysicalExpr` emits the literal - `Int32(42)`.""" - obj = MyPhysicalExpr() - if inner_capsule: - obj = obj.__datafusion_physical_expr__() - - expr = PhysicalExpr.from_pycapsule(obj) - assert "42" in repr(expr) diff --git a/examples/datafusion-ffi-example/src/lib.rs b/examples/datafusion-ffi-example/src/lib.rs index efaa4eab9..791f81c39 100644 --- a/examples/datafusion-ffi-example/src/lib.rs +++ b/examples/datafusion-ffi-example/src/lib.rs @@ -22,7 +22,6 @@ use crate::catalog_provider::{FixedSchemaProvider, MyCatalogProvider, MyCatalogP use crate::config::MyConfig; use crate::execution_plan::MyExecutionPlan; use crate::logical_extension_codec::MyLogicalExtensionCodec; -use crate::physical_expr::MyPhysicalExpr; use crate::physical_extension_codec::MyPhysicalExtensionCodec; use crate::scalar_udf::IsNullUDF; use crate::table_function::MyTableFunction; @@ -35,7 +34,6 @@ pub(crate) mod catalog_provider; pub(crate) mod config; pub(crate) mod execution_plan; pub(crate) mod logical_extension_codec; -pub(crate) mod physical_expr; pub(crate) mod physical_extension_codec; pub(crate) mod scalar_udf; pub(crate) mod table_function; @@ -60,6 +58,5 @@ fn datafusion_ffi_example(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; - m.add_class::()?; Ok(()) } diff --git a/examples/datafusion-ffi-example/src/physical_expr.rs b/examples/datafusion-ffi-example/src/physical_expr.rs deleted file mode 100644 index 2b15c0d3e..000000000 --- a/examples/datafusion-ffi-example/src/physical_expr.rs +++ /dev/null @@ -1,57 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) 
under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::sync::Arc; - -use datafusion::physical_expr::PhysicalExpr; -use datafusion::physical_expr::expressions::Literal; -use datafusion_common::ScalarValue; -use datafusion_ffi::physical_expr::FFI_PhysicalExpr; -use pyo3::prelude::*; -use pyo3::types::PyCapsule; - -/// Minimal downstream `PhysicalExpr` producer for FFI integration -/// tests. Emits the literal `Int32(42)` via `FFI_PhysicalExpr` so -/// Python tests can verify `PhysicalExpr.from_pycapsule(my_expr)` -/// consumes a capsule sourced from a non-datafusion-python crate. 
-#[pyclass( - from_py_object, - name = "MyPhysicalExpr", - module = "datafusion_ffi_example", - subclass -)] -#[derive(Clone)] -pub(crate) struct MyPhysicalExpr; - -#[pymethods] -impl MyPhysicalExpr { - #[new] - fn new() -> Self { - Self - } - - fn __datafusion_physical_expr__<'py>( - &self, - py: Python<'py>, - ) -> PyResult> { - let expr: Arc = Arc::new(Literal::new(ScalarValue::Int32(Some(42)))); - let ffi = FFI_PhysicalExpr::from(expr); - - let name = cr"datafusion_physical_expr".into(); - PyCapsule::new(py, ffi, Some(name)) - } -} diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 3b56470fd..f08b464bb 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -90,7 +90,7 @@ from .expr import Expr, WindowFrame from .io import read_avro, read_csv, read_json, read_parquet from .options import CsvReadOptions -from .plan import ExecutionPlan, LogicalPlan, Metric, MetricsSet, PhysicalExpr +from .plan import ExecutionPlan, LogicalPlan, Metric, MetricsSet from .record_batch import RecordBatch, RecordBatchStream from .user_defined import ( Accumulator, @@ -124,7 +124,6 @@ "MetricsSet", "ParquetColumnOptions", "ParquetWriterOptions", - "PhysicalExpr", "RecordBatch", "RecordBatchStream", "RuntimeEnvBuilder", diff --git a/python/datafusion/plan.py b/python/datafusion/plan.py index 35ed2f9fc..6c58bd563 100644 --- a/python/datafusion/plan.py +++ b/python/datafusion/plan.py @@ -34,7 +34,6 @@ "LogicalPlan", "Metric", "MetricsSet", - "PhysicalExpr", ] @@ -398,56 +397,3 @@ def labels(self) -> dict[str, str]: def __repr__(self) -> str: """Return a string representation of the metric.""" return repr(self._raw) - - -class PhysicalExpr: - """Thin wrapper around a DataFusion physical expression. - - `PhysicalExpr` is the post-planning form of an `Expr`: column - references are resolved to positional indices, scalar functions - are dispatched, and types are determined. 
This wrapper exposes - serialization + the PyCapsule protocol so physical expressions - can be shipped to other Rust libraries that consume - `FFI_PhysicalExpr`. - """ - - def __init__(self, expr: df_internal.PhysicalExpr) -> None: - """Internal constructor; not for end-user use.""" - self._raw = expr - - def __repr__(self) -> str: - """Return a string representation.""" - return repr(self._raw) - - def to_bytes(self, ctx: SessionContext | None = None) -> bytes: - """Serialize the physical expression to protobuf bytes. - - When ``ctx`` is supplied, encoding routes through the session's - installed `PhysicalExtensionCodec`. - """ - ctx_arg = ctx.ctx if ctx is not None else None - return self._raw.to_bytes(ctx_arg) - - @staticmethod - def from_bytes(ctx: SessionContext, data: bytes, input_schema: Any) -> PhysicalExpr: - """Deserialize a physical expression. - - ``input_schema`` must be the pyarrow Schema the expression - resolves column references against. - """ - return PhysicalExpr( - df_internal.PhysicalExpr.from_bytes(ctx.ctx, data, input_schema) - ) - - @staticmethod - def from_pycapsule(capsule: Any) -> PhysicalExpr: - """Construct a PhysicalExpr from a PyCapsule. - - Accepts either a raw capsule or any object exposing - ``__datafusion_physical_expr__``. 
- """ - return PhysicalExpr(df_internal.PhysicalExpr.from_pycapsule(capsule)) - - def __datafusion_physical_expr__(self) -> Any: - """Return a PyCapsule pointing at the underlying `FFI_PhysicalExpr`.""" - return self._raw.__datafusion_physical_expr__() diff --git a/python/tests/test_plans.py b/python/tests/test_plans.py index 711db09d7..9e23c047e 100644 --- a/python/tests/test_plans.py +++ b/python/tests/test_plans.py @@ -18,7 +18,6 @@ import datetime import datafusion._internal as df_internal -import pyarrow as pa import pytest from datafusion import ( ExecutionPlan, @@ -92,22 +91,6 @@ def test_execution_plan_pycapsule_protocol(df) -> None: assert str(plan) == str(ExecutionPlan(restored)) -def test_physical_expr_to_bytes_roundtrip(ctx) -> None: - """Round-trip a PhysicalExpr through the session's physical codec.""" - ctx.sql("CREATE TABLE t AS VALUES (1, 10), (2, 20), (3, 30)") - df = ctx.sql("SELECT column1 + column2 AS s FROM t") - # Exercising the plan triggers physical planning; we only need to - # confirm the class surface is registered here. End-to-end physical - # expression roundtrip is exercised by the integration suite once a - # public Python constructor for PhysicalExpr lands. - df.execution_plan() - schema = pa.schema([("column1", pa.int64()), ("column2", pa.int64())]) - assert hasattr(df_internal, "PhysicalExpr") - assert hasattr(df_internal.PhysicalExpr, "from_bytes") - assert hasattr(df_internal.PhysicalExpr, "from_pycapsule") - assert schema is not None - - def test_session_with_logical_extension_codec_roundtrip(ctx, df) -> None: """A session with a non-default logical codec still round-trips builtins. 
From 91571541b7edd9f45b6488bf39bf8971084bc1bc Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 14 May 2026 13:09:19 -0400 Subject: [PATCH 04/11] remove unuseful code comments --- crates/core/src/sql/logical.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/crates/core/src/sql/logical.rs b/crates/core/src/sql/logical.rs index dd5ed9470..647c3fa7e 100644 --- a/crates/core/src/sql/logical.rs +++ b/crates/core/src/sql/logical.rs @@ -203,12 +203,6 @@ impl PyLogicalPlan { py: Python<'py>, ctx: Option, ) -> PyDataFusionResult> { - // When the caller supplies a session, route through its - // installed logical codec so user FFI codecs registered via - // `with_logical_extension_codec` see the encode path. - // Otherwise fall back to a default-inner `PythonLogicalCodec` - // — Python scalar UDFs still inline via `DFPYUDF1`, but - // non-Python UDFs hit the default codec only. let default_codec; let codec: &dyn datafusion_proto::logical_plan::LogicalExtensionCodec = match ctx { Some(ref ctx) => ctx.logical_codec().as_ref(), From 88a4585d247bcac3728e4696271dcdd662cf0cec Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 14 May 2026 13:12:02 -0400 Subject: [PATCH 05/11] docs: rewrite codec module comments around purpose, not PR sequence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous doc-block framed PythonLogicalCodec / PythonPhysicalCodec in terms of "PR1 delegates, PR2 will add encoding" — useful for review, useless for someone reading the code later. Reframed in terms of what the codecs exist to do: encode Python-side plan references (pure-Python UDFs, etc.) into the proto wire format so plans can cross process boundaries without the receiver having to pre-register every callable. The wrappers sit at the top of the session's codec stack and delegate non-Python encoding to a composable inner codec. Magic-prefix registry table loses the "reserved" column. 
Doc still notes that the in-module impls currently delegate and that encoder/decoder hooks land alongside the corresponding Python-side serialization work. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/core/src/codec.rs | 116 ++++++++++++++++++++++++++------------- 1 file changed, 78 insertions(+), 38 deletions(-) diff --git a/crates/core/src/codec.rs b/crates/core/src/codec.rs index 6c7248dd7..db33cf406 100644 --- a/crates/core/src/codec.rs +++ b/crates/core/src/codec.rs @@ -17,39 +17,63 @@ //! Python-aware extension codecs. //! -//! [`PythonLogicalCodec`] wraps a user-supplied (or default) -//! [`LogicalExtensionCodec`] and is the codec datafusion-python parks -//! on every `SessionContext`. [`PythonPhysicalCodec`] is the symmetric -//! wrapper around [`PhysicalExtensionCodec`]. +//! Datafusion-python plans can carry references to Python-defined +//! objects that the upstream protobuf codecs do not know how to +//! serialize: pure-Python scalar / aggregate / window UDFs, Python +//! query-planning extensions, and so on. Their state lives inside +//! `Py` callables and closures rather than being recoverable +//! from a name in the receiver's function registry. To ship a plan +//! across a process boundary (pickle, `multiprocessing`, Ray actor, +//! `datafusion-distributed`, etc.) those payloads have to be encoded +//! into the proto wire format itself. //! -//! In PR1 both codecs delegate every call to their `inner` codec. The -//! types exist so that follow-up work (pickle support, Python scalar -//! UDF inline encoding) can add in-band Python payloads without -//! re-plumbing the session field. +//! [`PythonLogicalCodec`] is the [`LogicalExtensionCodec`] that +//! datafusion-python parks on every `SessionContext`. It wraps a +//! user-supplied (or default) inner codec and adds Python-aware +//! in-band encoding on top: when the encoder sees a Python-defined +//! UDF, the codec cloudpickles the callable + signature into the +//! 
`fun_definition` proto field; when the decoder sees a payload it +//! produced, it reconstructs the UDF from the bytes alone — no +//! pre-registration on the receiver. UDFs the codec does not +//! recognise are delegated to `inner`, which is typically +//! `DefaultLogicalExtensionCodec` but may be a downstream-supplied +//! FFI codec installed via +//! `SessionContext.with_logical_extension_codec(...)`. //! -//! ## Wire-format magic prefix registry +//! [`PythonPhysicalCodec`] is the symmetric wrapper around +//! [`PhysicalExtensionCodec`]. Logical and physical layers each have +//! a `try_encode_udf` / `try_decode_udf` pair, so a `ScalarUDF` +//! referenced inside a `LogicalPlan`, an `ExecutionPlan`, or a +//! `PhysicalExpr` must encode identically through either layer for +//! plans to survive a serialization round-trip. Both codecs share +//! the same payload framing for that reason. +//! +//! Payloads emitted by these codecs are tagged with an 8-byte magic +//! prefix so the decoder can distinguish them from arbitrary bytes +//! (empty `fun_definition` from the default codec, user FFI payloads +//! that picked a non-colliding prefix). Dispatch precedence on +//! decode: **Python-inline payload (magic prefix match) → `inner` +//! codec → caller's `FunctionRegistry` fallback.** //! -//! Future in-band Python payloads will be prefixed with an 8-byte -//! magic so the decoder can distinguish them from arbitrary -//! `fun_definition` bytes produced by the default codec or a user FFI -//! codec. +//! ## Wire-format magic prefix registry //! -//! | Layer + kind | Magic prefix | Status | -//! | ----------------------------- | ------------ | ------------- | -//! | `PythonLogicalCodec` scalar | `DFPYUDF1` | reserved (PR2)| -//! | `PythonLogicalCodec` agg | `DFPYUDA1` | reserved | -//! | `PythonLogicalCodec` window | `DFPYUDW1` | reserved | -//! | `PythonPhysicalCodec` scalar | `DFPYUDF1` | reserved (PR2)| -//! | `PythonPhysicalCodec` agg | `DFPYUDA1` | reserved | -//! 
| `PythonPhysicalCodec` window | `DFPYUDW1` | reserved | -//! | `PythonPhysicalCodec` expr | `DFPYPE1` | reserved | -//! | User FFI extension codec | user-chosen | downstream | -//! | Default codec | (none) | upstream | +//! | Layer + kind | Magic prefix | +//! | ----------------------------- | ------------ | +//! | `PythonLogicalCodec` scalar | `DFPYUDF1` | +//! | `PythonLogicalCodec` agg | `DFPYUDA1` | +//! | `PythonLogicalCodec` window | `DFPYUDW1` | +//! | `PythonPhysicalCodec` scalar | `DFPYUDF1` | +//! | `PythonPhysicalCodec` agg | `DFPYUDA1` | +//! | `PythonPhysicalCodec` window | `DFPYUDW1` | +//! | `PythonPhysicalCodec` expr | `DFPYPE1` | +//! | User FFI extension codec | user-chosen | +//! | Default codec | (none) | //! -//! Dispatch precedence once in-band payloads land: **Python-inline -//! payload (magic prefix match) → `inner` codec → caller's registry -//! fallback.** User FFI codecs should pick non-colliding prefixes -//! (recommend a `DF` namespace plus a crate-specific suffix). +//! Downstream FFI codecs should pick non-colliding prefixes (use a +//! `DF` namespace plus a crate-specific suffix). The codec +//! implementations in this module currently delegate every method to +//! `inner`; the encoder/decoder hooks for each kind are added as the +//! corresponding Python-side type becomes serializable. use std::sync::Arc; @@ -62,16 +86,23 @@ use datafusion::physical_plan::ExecutionPlan; use datafusion_proto::logical_plan::{DefaultLogicalExtensionCodec, LogicalExtensionCodec}; use datafusion_proto::physical_plan::{DefaultPhysicalExtensionCodec, PhysicalExtensionCodec}; -/// Reserved magic prefix for an inlined Python scalar UDF payload. -/// Not produced or consumed by PR1; the constant is reserved here so -/// follow-up work has a single definition site. +/// Wire-format prefix that tags a `fun_definition` payload as an +/// inlined Python scalar UDF (cloudpickled tuple of name, callable, +/// input schema, return field, volatility). 
Defined once here so +/// the encoder and decoder cannot drift. #[allow(dead_code)] pub(crate) const PY_SCALAR_UDF_MAGIC: &[u8] = b"DFPYUDF1"; -/// `LogicalExtensionCodec` parked on every `SessionContext`. Wraps a -/// composable `inner` codec; PR1 delegates every method straight -/// through. The wrapper exists so follow-up patches can add Python -/// in-band encoding without changing every serializer. +/// `LogicalExtensionCodec` parked on every `SessionContext`. Holds +/// the Python-aware encoding hooks for logical-layer types +/// (`LogicalPlan`, `Expr`) and delegates everything it does not +/// handle to the composable `inner` codec — typically +/// `DefaultLogicalExtensionCodec`, or a downstream FFI codec +/// installed via `SessionContext.with_logical_extension_codec(...)`. +/// +/// Sitting at the top of the session's logical codec stack means +/// every serializer that reads `session.logical_codec()` automatically +/// picks up Python-aware encoding for free. #[derive(Debug)] pub struct PythonLogicalCodec { inner: Arc<dyn LogicalExtensionCodec>, @@ -136,9 +167,18 @@ impl LogicalExtensionCodec for PythonLogicalCodec { } } -/// `PhysicalExtensionCodec` mirror of [`PythonLogicalCodec`]. Same -/// motivation: a stable session field that follow-up patches can layer -/// Python in-band encoding onto. +/// `PhysicalExtensionCodec` mirror of [`PythonLogicalCodec`] parked +/// on the same `SessionContext`. Carries the Python-aware encoding +/// hooks for physical-layer types (`ExecutionPlan`, `PhysicalExpr`) +/// and delegates the rest to `inner`. +/// +/// The `PhysicalExtensionCodec` trait has its own `try_encode_udf` +/// / `try_decode_udf` pair distinct from the logical one, so a +/// `ScalarUDF` referenced inside a physical plan needs Python-aware +/// encoding on this layer too — otherwise a plan with a Python UDF +/// would round-trip at the logical level but break at the physical +/// level. Both layers reuse the shared payload framing +/// ([`PY_SCALAR_UDF_MAGIC`] et al.)
so the wire format is identical. #[derive(Debug)] pub struct PythonPhysicalCodec { inner: Arc, From b2d77f40dc9eb7d7f22cb31be1af9b6a3a071760 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 14 May 2026 13:20:13 -0400 Subject: [PATCH 06/11] feat(codec): forward every LogicalExtensionCodec / PhysicalExtensionCodec method to inner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PythonLogicalCodec previously only overrode the four required methods on the trait plus the scalar UDF pair, so the default trait impls (returning "LogicalExtensionCodec is not provided") shadowed any downstream FFI codec for file formats, aggregate UDFs, and window UDFs. A user installing their own codec via `SessionContext.with_logical_extension_codec(...)` would silently lose access to its `try_*_file_format`, `try_*_udaf`, `try_*_udwf` implementations. Forward every trait method to `inner` so the user-installed codec is fully reachable. Same change on the physical side, including `try_*_expr`, `try_*_udaf`, `try_*_udwf` — the corresponding Python-aware paths can layer on later by intercepting before delegation. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/core/src/codec.rs | 64 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/crates/core/src/codec.rs b/crates/core/src/codec.rs index db33cf406..088532df2 100644 --- a/crates/core/src/codec.rs +++ b/crates/core/src/codec.rs @@ -80,8 +80,10 @@ use std::sync::Arc; use arrow::datatypes::SchemaRef; use datafusion::common::{Result, TableReference}; use datafusion::datasource::TableProvider; +use datafusion::datasource::file_format::FileFormatFactory; use datafusion::execution::TaskContext; -use datafusion::logical_expr::{Extension, LogicalPlan, ScalarUDF}; +use datafusion::logical_expr::{AggregateUDF, Extension, LogicalPlan, ScalarUDF, WindowUDF}; +use datafusion::physical_expr::PhysicalExpr; use datafusion::physical_plan::ExecutionPlan; use datafusion_proto::logical_plan::{DefaultLogicalExtensionCodec, LogicalExtensionCodec}; use datafusion_proto::physical_plan::{DefaultPhysicalExtensionCodec, PhysicalExtensionCodec}; @@ -158,6 +160,22 @@ impl LogicalExtensionCodec for PythonLogicalCodec { self.inner.try_encode_table_provider(table_ref, node, buf) } + fn try_decode_file_format( + &self, + buf: &[u8], + ctx: &TaskContext, + ) -> Result> { + self.inner.try_decode_file_format(buf, ctx) + } + + fn try_encode_file_format( + &self, + buf: &mut Vec, + node: Arc, + ) -> Result<()> { + self.inner.try_encode_file_format(buf, node) + } + fn try_encode_udf(&self, node: &ScalarUDF, buf: &mut Vec) -> Result<()> { self.inner.try_encode_udf(node, buf) } @@ -165,6 +183,22 @@ impl LogicalExtensionCodec for PythonLogicalCodec { fn try_decode_udf(&self, name: &str, buf: &[u8]) -> Result> { self.inner.try_decode_udf(name, buf) } + + fn try_encode_udaf(&self, node: &AggregateUDF, buf: &mut Vec) -> Result<()> { + self.inner.try_encode_udaf(node, buf) + } + + fn try_decode_udaf(&self, name: &str, buf: &[u8]) -> Result> { + self.inner.try_decode_udaf(name, buf) + } + + fn 
try_encode_udwf(&self, node: &WindowUDF, buf: &mut Vec) -> Result<()> { + self.inner.try_encode_udwf(node, buf) + } + + fn try_decode_udwf(&self, name: &str, buf: &[u8]) -> Result> { + self.inner.try_decode_udwf(name, buf) + } } /// `PhysicalExtensionCodec` mirror of [`PythonLogicalCodec`] parked @@ -221,4 +255,32 @@ impl PhysicalExtensionCodec for PythonPhysicalCodec { fn try_decode_udf(&self, name: &str, buf: &[u8]) -> Result> { self.inner.try_decode_udf(name, buf) } + + fn try_encode_expr(&self, node: &Arc, buf: &mut Vec) -> Result<()> { + self.inner.try_encode_expr(node, buf) + } + + fn try_decode_expr( + &self, + buf: &[u8], + inputs: &[Arc], + ) -> Result> { + self.inner.try_decode_expr(buf, inputs) + } + + fn try_encode_udaf(&self, node: &AggregateUDF, buf: &mut Vec) -> Result<()> { + self.inner.try_encode_udaf(node, buf) + } + + fn try_decode_udaf(&self, name: &str, buf: &[u8]) -> Result> { + self.inner.try_decode_udaf(name, buf) + } + + fn try_encode_udwf(&self, node: &WindowUDF, buf: &mut Vec) -> Result<()> { + self.inner.try_encode_udwf(node, buf) + } + + fn try_decode_udwf(&self, name: &str, buf: &[u8]) -> Result> { + self.inner.try_decode_udwf(name, buf) + } } From 63caf507ef96577e2a06f4009db42ca3c86121e9 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 14 May 2026 13:37:25 -0400 Subject: [PATCH 07/11] docs: tighten codec dispatch test docstrings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous docstrings claimed the tests verify "PythonLogicalCodec delegates non-Python UDFs to the inner codec." That's forward-looking — the codecs currently delegate every UDF unconditionally, so the test would behave identically for Python and non-Python UDFs. 
Rewrite to describe what the test actually proves: the dispatch chain `PyLogicalPlan.to_bytes -> session.logical_codec -> PythonLogicalCodec -> FFI -> user impl` (and the physical mirror) forwards correctly, observable via the user codec's atomic counter incrementing after one encode pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../tests/_test_logical_extension_codec.py | 17 ++++++++++++++--- .../tests/_test_physical_extension_codec.py | 14 ++++++++++---- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/examples/datafusion-ffi-example/python/tests/_test_logical_extension_codec.py b/examples/datafusion-ffi-example/python/tests/_test_logical_extension_codec.py index 94093b416..ad357bd73 100644 --- a/examples/datafusion-ffi-example/python/tests/_test_logical_extension_codec.py +++ b/examples/datafusion-ffi-example/python/tests/_test_logical_extension_codec.py @@ -41,9 +41,20 @@ def test_ffi_logical_codec_install_and_export(): def test_ffi_logical_codec_consulted_on_udf_encode(): - """Serializing a plan that references an FFI-imported scalar UDF - routes through the installed codec's `try_encode_udf` path — - PythonLogicalCodec delegates non-Python UDFs to the inner codec.""" + """Serializing through ctx.logical_codec() routes try_encode_udf to + the user-installed FFI codec. + + Verifies the dispatch chain + `PyLogicalPlan.to_bytes -> session.logical_codec -> + PythonLogicalCodec -> FFI_LogicalExtensionCodec -> user impl` + is wired correctly. The user codec's atomic counter increments + after a serialization pass, proving every hop forwards. + + Does not test any Python-UDF-specific dispatch — PythonLogicalCodec + currently delegates all UDF encoding to its inner codec + unconditionally. Python-vs-other branching lands when in-band + scalar UDF encoding is added. 
+ """ ctx, codec = _setup_session_with_codec() df = ctx.sql("SELECT abs(-1) AS x") plan = df.logical_plan() diff --git a/examples/datafusion-ffi-example/python/tests/_test_physical_extension_codec.py b/examples/datafusion-ffi-example/python/tests/_test_physical_extension_codec.py index 9245cd63c..1f2da52fa 100644 --- a/examples/datafusion-ffi-example/python/tests/_test_physical_extension_codec.py +++ b/examples/datafusion-ffi-example/python/tests/_test_physical_extension_codec.py @@ -41,10 +41,16 @@ def test_ffi_physical_codec_install_and_export(): def test_ffi_physical_codec_consulted_on_udf_encode(): - """Serializing an ExecutionPlan that references an FFI-imported - scalar UDF routes `try_encode_udf` through the installed physical - codec — PythonPhysicalCodec delegates non-Python UDFs to the inner - codec.""" + """Serializing through ctx.physical_codec() routes try_encode_udf to + the user-installed FFI codec. + + Mirror of the logical-side dispatch test: verifies + `PyExecutionPlan.to_bytes -> session.physical_codec -> + PythonPhysicalCodec -> FFI_PhysicalExtensionCodec -> user impl` + forwards correctly. Does not test Python-UDF-specific dispatch — + PythonPhysicalCodec currently delegates all UDF encoding to its + inner codec unconditionally. + """ ctx, codec = _setup_session_with_codec() df = ctx.sql("SELECT abs(a) AS x FROM t") plan = df.execution_plan() From fa453ec58966665ff8badba8251f6975b3e17d22 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 14 May 2026 13:41:01 -0400 Subject: [PATCH 08/11] refactor(ffi-example): MyExecutionPlan emits real data via MemorySourceConfig Was a one-column `EmptyExec` stub useful only as a capsule-handoff target. Promoted to a minimal reference impl that a downstream Rust crate can copy when exposing a custom `ExecutionPlan` to datafusion-python: configurable `num_rows`, produces a single batch of sequential `Int32` values under column `value`, wrapped in `DataSourceExec` via `MemorySourceConfig::try_new_exec`. 
Header comment explains the typical use case (remote backend, streaming source, synthetic data generator) and the `__datafusion_execution_plan__` capsule shape downstream crates should follow. Test asserts the schema-bearing plan survives the FFI hop: a `DataSourceExec` arrives with the expected partitioning and no children. Schema details are not surfaced through the FFI display path (only the wrapping `ForeignExecutionPlan` name + inner plan name appear), so the test does not assert the column name. `to_bytes` round-trip of an FFI-imported plan is not exercised: encoding requires a physical codec that knows how to serialize `ForeignExecutionPlan`, which the default codec does not. A downstream user round-tripping such a plan must install their own codec via `with_physical_extension_codec`. Documented in the test file rather than asserted on. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../python/tests/_test_execution_plan.py | 27 ++++++-- .../src/execution_plan.rs | 66 +++++++++++++++---- 2 files changed, 76 insertions(+), 17 deletions(-) diff --git a/examples/datafusion-ffi-example/python/tests/_test_execution_plan.py b/examples/datafusion-ffi-example/python/tests/_test_execution_plan.py index 8cd9076a1..6e1aaf08c 100644 --- a/examples/datafusion-ffi-example/python/tests/_test_execution_plan.py +++ b/examples/datafusion-ffi-example/python/tests/_test_execution_plan.py @@ -24,16 +24,31 @@ @pytest.mark.parametrize("inner_capsule", [True, False]) def test_execution_plan_from_pycapsule(inner_capsule: bool) -> None: - """`ExecutionPlan.from_pycapsule` consumes a capsule sourced from a - non-datafusion-python crate, regardless of whether the user passes - the typed wrapper or pre-extracts the raw capsule.""" - obj = MyExecutionPlan() + """`ExecutionPlan.from_pycapsule` consumes a capsule sourced from + a downstream Rust crate, regardless of whether the user passes the + typed wrapper or pre-extracts the raw capsule. 
+ + `MyExecutionPlan` produces a real `DataSourceExec` backed by a + `MemorySourceConfig`. After the FFI handoff the underlying plan + is wrapped in a `ForeignExecutionPlan`, so the display surface + shows the FFI envelope plus the wrapped plan's name — schema + detail is not propagated by the FFI display path. + """ + obj = MyExecutionPlan(num_rows=5) if inner_capsule: obj = obj.__datafusion_execution_plan__() plan = ExecutionPlan.from_pycapsule(obj) - # EmptyExec round-trips with no children. assert plan.children() == [] assert plan.partition_count == 1 - assert "EmptyExec" in plan.display() + assert "DataSourceExec" in plan.display() + + +# `to_bytes` round-trip of an FFI-imported `ExecutionPlan` requires a +# physical codec that knows how to encode `ForeignExecutionPlan`. The +# default codec does not, so a downstream user serializing such a plan +# must install their own physical extension codec via +# `SessionContext.with_physical_extension_codec(...)`. We do not +# exercise that here because the test would assert the encode path on +# the user's codec, not on datafusion-python's plumbing. diff --git a/examples/datafusion-ffi-example/src/execution_plan.rs b/examples/datafusion-ffi-example/src/execution_plan.rs index 7439d9e31..9fae73ef6 100644 --- a/examples/datafusion-ffi-example/src/execution_plan.rs +++ b/examples/datafusion-ffi-example/src/execution_plan.rs @@ -15,20 +15,44 @@ // specific language governing permissions and limitations // under the License. +//! Reference implementation of a downstream-crate `ExecutionPlan` +//! exported to datafusion-python via the PyCapsule protocol. +//! +//! Typical use case: a Rust crate produces a custom physical plan +//! (e.g. for a remote backend, a streaming source, a synthetic data +//! generator) and wants to hand it to a datafusion-python user +//! without exposing its internals. The downstream crate writes a +//! `#[pyclass]` wrapper, builds an `Arc` on +//! 
demand, wraps it as `FFI_ExecutionPlan`, and exposes the capsule +//! via `__datafusion_execution_plan__(...)`. The receiving Python +//! code calls +//! `datafusion.ExecutionPlan.from_pycapsule(my_plan)` and gets back +//! a fully-typed datafusion-python `ExecutionPlan` it can introspect, +//! serialize via `to_bytes` / `from_bytes`, or execute. +//! +//! `MyExecutionPlan` here is a minimal worked example: it produces +//! `num_rows` sequential `Int32` values in a single batch under the +//! column name `value`. Backed by `MemorySourceConfig`, so the plan +//! is a real `DataSourceExec` with a non-trivial schema and +//! executable output — useful both as a reference for downstream +//! crates and as the fixture for the FFI integration tests. + use std::sync::Arc; +use arrow_array::{Int32Array, RecordBatch}; use arrow_schema::{DataType, Field, Schema}; -use datafusion::physical_plan::empty::EmptyExec; +use datafusion::datasource::memory::MemorySourceConfig; +use datafusion::physical_plan::ExecutionPlan; +use datafusion_common::error::{DataFusionError, Result as DataFusionResult}; use datafusion_ffi::execution_plan::FFI_ExecutionPlan; use datafusion_python_util::get_tokio_runtime; +use pyo3::exceptions::PyRuntimeError; use pyo3::prelude::*; use pyo3::types::PyCapsule; -/// Minimal downstream `ExecutionPlan` producer for FFI integration -/// tests. Emits a single-column empty plan via `FFI_ExecutionPlan`, -/// letting Python tests verify that -/// `ExecutionPlan.from_pycapsule(my_plan)` consumes a capsule sourced -/// from a non-datafusion-python Rust crate. +/// Downstream-crate `ExecutionPlan` producer for FFI integration with +/// datafusion-python. Emits `num_rows` sequential `Int32` values +/// (0..num_rows) in a single batch under the column name `value`. 
#[pyclass( from_py_object, name = "MyExecutionPlan", @@ -36,21 +60,41 @@ use pyo3::types::PyCapsule; subclass )] #[derive(Clone)] -pub(crate) struct MyExecutionPlan; +pub(crate) struct MyExecutionPlan { + num_rows: usize, +} + +impl MyExecutionPlan { + fn build_plan(&self) -> DataFusionResult> { + let schema = Arc::new(Schema::new(vec![Field::new( + "value", + DataType::Int32, + false, + )])); + let values: Vec = (0..self.num_rows as i32).collect(); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from(values))], + )?; + let exec = MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None)?; + Ok(exec as Arc) + } +} #[pymethods] impl MyExecutionPlan { #[new] - fn new() -> Self { - Self + fn new(num_rows: usize) -> Self { + Self { num_rows } } fn __datafusion_execution_plan__<'py>( &self, py: Python<'py>, ) -> PyResult> { - let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)])); - let plan = Arc::new(EmptyExec::new(schema)); + let plan = self + .build_plan() + .map_err(|e: DataFusionError| PyRuntimeError::new_err(e.to_string()))?; let runtime = get_tokio_runtime().handle().clone(); let ffi = FFI_ExecutionPlan::new(plan, Some(runtime)); From dbbbf4ab17a094bc93c0d36a22549995e39d1bfc Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 14 May 2026 13:53:01 -0400 Subject: [PATCH 09/11] refactor: drop dormant ExecutionPlan PyCapsule round-trip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `PyExecutionPlan::from_pycapsule` and the matching `__datafusion_execution_plan__` exporter have no consumer in this repo, on the POC `poc_ffi_query_planner` branch, or on any sibling branch (`testing/datafusion-distributed`, `testing/ffi-library-marker`, `tmp/ffi-with-codecs`). The pair was wired up speculatively for FFI plan handoff that no Python code path actually performs today. 
Drop the whole capsule round-trip for `ExecutionPlan`: * Rust `PyExecutionPlan::from_pycapsule` and `__datafusion_execution_plan__`. * Python `ExecutionPlan.from_pycapsule` and `__datafusion_execution_plan__` wrappers. * `MyExecutionPlan` FFI fixture + `_test_execution_plan.py` + lib.rs registration. Was solely a test fixture for the dropped path. * `test_execution_plan_pycapsule_protocol` in `python/tests/test_plans.py`. `PyExecutionPlan.to_bytes` / `from_bytes` survive — they encode through the session's physical codec and have real coverage. Capsule round-trip can be re-added when a concrete consumer (distributed worker, bridge library) lands. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/core/src/physical_plan.rs | 34 +----- .../python/tests/_test_execution_plan.py | 54 --------- .../src/execution_plan.rs | 104 ------------------ examples/datafusion-ffi-example/src/lib.rs | 3 - python/datafusion/plan.py | 17 --- python/tests/test_plans.py | 13 --- 6 files changed, 2 insertions(+), 223 deletions(-) delete mode 100644 examples/datafusion-ffi-example/python/tests/_test_execution_plan.py delete mode 100644 examples/datafusion-ffi-example/src/execution_plan.rs diff --git a/crates/core/src/physical_plan.rs b/crates/core/src/physical_plan.rs index d79ff0c96..594655a60 100644 --- a/crates/core/src/physical_plan.rs +++ b/crates/core/src/physical_plan.rs @@ -18,17 +18,15 @@ use std::sync::Arc; use datafusion::physical_plan::{ExecutionPlan, ExecutionPlanProperties, displayable}; -use datafusion_ffi::execution_plan::FFI_ExecutionPlan; use datafusion_proto::physical_plan::AsExecutionPlan; -use datafusion_python_util::get_tokio_runtime; use prost::Message; use pyo3::exceptions::PyRuntimeError; use pyo3::prelude::*; -use pyo3::types::{PyBytes, PyCapsule}; +use pyo3::types::PyBytes; use crate::codec::PythonPhysicalCodec; use crate::context::PySessionContext; -use crate::errors::{PyDataFusionResult, py_datafusion_err}; +use crate::errors::PyDataFusionResult; use 
crate::metrics::PyMetricsSet; #[pyclass( @@ -128,34 +126,6 @@ impl PyExecutionPlan { pub fn partition_count(&self) -> usize { self.plan.output_partitioning().partition_count() } - - /// Extract a `PyExecutionPlan` from any object exposing - /// `__datafusion_execution_plan__()` (PyCapsule protocol). - #[staticmethod] - pub fn from_pycapsule(mut capsule: Bound) -> PyResult { - if capsule.hasattr("__datafusion_execution_plan__")? { - capsule = capsule.getattr("__datafusion_execution_plan__")?.call0()?; - } - - let capsule = capsule.cast::().map_err(py_datafusion_err)?; - let data: std::ptr::NonNull = capsule - .pointer_checked(Some(c"datafusion_execution_plan"))? - .cast(); - let plan = unsafe { data.as_ref() }; - let plan: Arc = plan.try_into().map_err(py_datafusion_err)?; - - Ok(Self { plan }) - } - - pub fn __datafusion_execution_plan__<'py>( - &self, - py: Python<'py>, - ) -> PyResult> { - let name = cr"datafusion_execution_plan".into(); - let runtime = get_tokio_runtime().handle().clone(); - let plan = FFI_ExecutionPlan::new(self.plan.clone(), Some(runtime)); - PyCapsule::new(py, plan, Some(name)) - } } impl From for Arc { diff --git a/examples/datafusion-ffi-example/python/tests/_test_execution_plan.py b/examples/datafusion-ffi-example/python/tests/_test_execution_plan.py deleted file mode 100644 index 6e1aaf08c..000000000 --- a/examples/datafusion-ffi-example/python/tests/_test_execution_plan.py +++ /dev/null @@ -1,54 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from __future__ import annotations - -import pytest -from datafusion import ExecutionPlan -from datafusion_ffi_example import MyExecutionPlan - - -@pytest.mark.parametrize("inner_capsule", [True, False]) -def test_execution_plan_from_pycapsule(inner_capsule: bool) -> None: - """`ExecutionPlan.from_pycapsule` consumes a capsule sourced from - a downstream Rust crate, regardless of whether the user passes the - typed wrapper or pre-extracts the raw capsule. - - `MyExecutionPlan` produces a real `DataSourceExec` backed by a - `MemorySourceConfig`. After the FFI handoff the underlying plan - is wrapped in a `ForeignExecutionPlan`, so the display surface - shows the FFI envelope plus the wrapped plan's name — schema - detail is not propagated by the FFI display path. - """ - obj = MyExecutionPlan(num_rows=5) - if inner_capsule: - obj = obj.__datafusion_execution_plan__() - - plan = ExecutionPlan.from_pycapsule(obj) - - assert plan.children() == [] - assert plan.partition_count == 1 - assert "DataSourceExec" in plan.display() - - -# `to_bytes` round-trip of an FFI-imported `ExecutionPlan` requires a -# physical codec that knows how to encode `ForeignExecutionPlan`. The -# default codec does not, so a downstream user serializing such a plan -# must install their own physical extension codec via -# `SessionContext.with_physical_extension_codec(...)`. We do not -# exercise that here because the test would assert the encode path on -# the user's codec, not on datafusion-python's plumbing. 
diff --git a/examples/datafusion-ffi-example/src/execution_plan.rs b/examples/datafusion-ffi-example/src/execution_plan.rs deleted file mode 100644 index 9fae73ef6..000000000 --- a/examples/datafusion-ffi-example/src/execution_plan.rs +++ /dev/null @@ -1,104 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Reference implementation of a downstream-crate `ExecutionPlan` -//! exported to datafusion-python via the PyCapsule protocol. -//! -//! Typical use case: a Rust crate produces a custom physical plan -//! (e.g. for a remote backend, a streaming source, a synthetic data -//! generator) and wants to hand it to a datafusion-python user -//! without exposing its internals. The downstream crate writes a -//! `#[pyclass]` wrapper, builds an `Arc` on -//! demand, wraps it as `FFI_ExecutionPlan`, and exposes the capsule -//! via `__datafusion_execution_plan__(...)`. The receiving Python -//! code calls -//! `datafusion.ExecutionPlan.from_pycapsule(my_plan)` and gets back -//! a fully-typed datafusion-python `ExecutionPlan` it can introspect, -//! serialize via `to_bytes` / `from_bytes`, or execute. -//! -//! `MyExecutionPlan` here is a minimal worked example: it produces -//! 
`num_rows` sequential `Int32` values in a single batch under the -//! column name `value`. Backed by `MemorySourceConfig`, so the plan -//! is a real `DataSourceExec` with a non-trivial schema and -//! executable output — useful both as a reference for downstream -//! crates and as the fixture for the FFI integration tests. - -use std::sync::Arc; - -use arrow_array::{Int32Array, RecordBatch}; -use arrow_schema::{DataType, Field, Schema}; -use datafusion::datasource::memory::MemorySourceConfig; -use datafusion::physical_plan::ExecutionPlan; -use datafusion_common::error::{DataFusionError, Result as DataFusionResult}; -use datafusion_ffi::execution_plan::FFI_ExecutionPlan; -use datafusion_python_util::get_tokio_runtime; -use pyo3::exceptions::PyRuntimeError; -use pyo3::prelude::*; -use pyo3::types::PyCapsule; - -/// Downstream-crate `ExecutionPlan` producer for FFI integration with -/// datafusion-python. Emits `num_rows` sequential `Int32` values -/// (0..num_rows) in a single batch under the column name `value`. 
-#[pyclass( - from_py_object, - name = "MyExecutionPlan", - module = "datafusion_ffi_example", - subclass -)] -#[derive(Clone)] -pub(crate) struct MyExecutionPlan { - num_rows: usize, -} - -impl MyExecutionPlan { - fn build_plan(&self) -> DataFusionResult> { - let schema = Arc::new(Schema::new(vec![Field::new( - "value", - DataType::Int32, - false, - )])); - let values: Vec = (0..self.num_rows as i32).collect(); - let batch = RecordBatch::try_new( - Arc::clone(&schema), - vec![Arc::new(Int32Array::from(values))], - )?; - let exec = MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None)?; - Ok(exec as Arc) - } -} - -#[pymethods] -impl MyExecutionPlan { - #[new] - fn new(num_rows: usize) -> Self { - Self { num_rows } - } - - fn __datafusion_execution_plan__<'py>( - &self, - py: Python<'py>, - ) -> PyResult> { - let plan = self - .build_plan() - .map_err(|e: DataFusionError| PyRuntimeError::new_err(e.to_string()))?; - let runtime = get_tokio_runtime().handle().clone(); - let ffi = FFI_ExecutionPlan::new(plan, Some(runtime)); - - let name = cr"datafusion_execution_plan".into(); - PyCapsule::new(py, ffi, Some(name)) - } -} diff --git a/examples/datafusion-ffi-example/src/lib.rs b/examples/datafusion-ffi-example/src/lib.rs index 791f81c39..3323ac982 100644 --- a/examples/datafusion-ffi-example/src/lib.rs +++ b/examples/datafusion-ffi-example/src/lib.rs @@ -20,7 +20,6 @@ use pyo3::prelude::*; use crate::aggregate_udf::MySumUDF; use crate::catalog_provider::{FixedSchemaProvider, MyCatalogProvider, MyCatalogProviderList}; use crate::config::MyConfig; -use crate::execution_plan::MyExecutionPlan; use crate::logical_extension_codec::MyLogicalExtensionCodec; use crate::physical_extension_codec::MyPhysicalExtensionCodec; use crate::scalar_udf::IsNullUDF; @@ -32,7 +31,6 @@ use crate::window_udf::MyRankUDF; pub(crate) mod aggregate_udf; pub(crate) mod catalog_provider; pub(crate) mod config; -pub(crate) mod execution_plan; pub(crate) mod logical_extension_codec; pub(crate) 
mod physical_extension_codec; pub(crate) mod scalar_udf; @@ -57,6 +55,5 @@ fn datafusion_ffi_example(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; - m.add_class::()?; Ok(()) } diff --git a/python/datafusion/plan.py b/python/datafusion/plan.py index 6c58bd563..b2c6eab3e 100644 --- a/python/datafusion/plan.py +++ b/python/datafusion/plan.py @@ -208,23 +208,6 @@ def to_proto(self) -> bytes: ) return self.to_bytes() - @staticmethod - def from_pycapsule(capsule: Any) -> ExecutionPlan: - """Construct an `ExecutionPlan` from a PyCapsule. - - Accepts either a raw capsule or any object exposing - ``__datafusion_execution_plan__``. - """ - return ExecutionPlan(df_internal.ExecutionPlan.from_pycapsule(capsule)) - - def __datafusion_execution_plan__(self) -> Any: - """Return a PyCapsule pointing at the underlying `FFI_ExecutionPlan`. - - Lets other Rust libraries consume the plan without depending on - the datafusion-python class. - """ - return self._raw_plan.__datafusion_execution_plan__() - def metrics(self) -> MetricsSet | None: """Return metrics for this plan node, or None if this plan has no MetricsSet. diff --git a/python/tests/test_plans.py b/python/tests/test_plans.py index 9e23c047e..11e709f6b 100644 --- a/python/tests/test_plans.py +++ b/python/tests/test_plans.py @@ -17,7 +17,6 @@ import datetime -import datafusion._internal as df_internal import pytest from datafusion import ( ExecutionPlan, @@ -79,18 +78,6 @@ def test_execution_plan_to_proto_is_deprecated(ctx, df) -> None: assert str(plan) == str(restored) -def test_execution_plan_pycapsule_protocol(df) -> None: - """PyExecutionPlan exposes __datafusion_execution_plan__ and accepts - capsule-protocol objects on from_pycapsule.""" - plan = df.execution_plan() - capsule = plan._raw_plan.__datafusion_execution_plan__() - assert capsule is not None - - # Capsule-protocol input (forwards via __datafusion_execution_plan__). 
- restored = df_internal.ExecutionPlan.from_pycapsule(plan._raw_plan) - assert str(plan) == str(ExecutionPlan(restored)) - - def test_session_with_logical_extension_codec_roundtrip(ctx, df) -> None: """A session with a non-default logical codec still round-trips builtins. From ffcaed9d537a34a66eeec881dfc60ed7e0f17d35 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 14 May 2026 14:00:45 -0400 Subject: [PATCH 10/11] feat: PyExpr.to_bytes / from_bytes via session logical codec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors PyLogicalPlan / PyExecutionPlan: encode through the session's installed `LogicalExtensionCodec` (or a default-inner `PythonLogicalCodec` when no `ctx` is supplied), decode against the session's function registry + codec via `parse_expr`. Rust side calls `datafusion_proto::logical_plan::to_proto::serialize_expr` and `from_proto::parse_expr`. Python wrapper threads an optional `SessionContext` through. Tests cover the session-routed roundtrip and the no-ctx default-codec encode path. Adds a third consumer of `session.logical_codec()` alongside `PyLogicalPlan` and the codec dispatch tests in the FFI example, broadening coverage of the codec stack. This is the last piece of the PR1 codec surface — follow-up pickle work (`Expr.__reduce__`, worker-scoped context, multiprocessing) can build on this without bundling the byte-level serialization API. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/core/src/expr.rs | 55 +++++++++++++++++++++++++++++++++++++++ python/datafusion/expr.py | 20 ++++++++++++++ python/tests/test_expr.py | 26 ++++++++++++++++++ 3 files changed, 101 insertions(+) diff --git a/crates/core/src/expr.rs b/crates/core/src/expr.rs index c4f2a12da..2e633baeb 100644 --- a/crates/core/src/expr.rs +++ b/crates/core/src/expr.rs @@ -31,9 +31,13 @@ use datafusion::logical_expr::{ Between, BinaryExpr, Case, Cast, Expr, ExprFuncBuilder, ExprFunctionExt, Like, LogicalPlan, Operator, TryCast, WindowFunctionDefinition, col, lit, lit_with_metadata, }; +use datafusion_proto::logical_plan::{from_proto, to_proto}; +use prost::Message; use pyo3::IntoPyObjectExt; use pyo3::basic::CompareOp; +use pyo3::exceptions::PyRuntimeError; use pyo3::prelude::*; +use pyo3::types::PyBytes; use window::PyWindowFrame; use self::alias::PyAlias; @@ -43,7 +47,9 @@ use self::bool_expr::{ }; use self::like::{PyILike, PyLike, PySimilarTo}; use self::scalar_variable::PyScalarVariable; +use crate::codec::PythonLogicalCodec; use crate::common::data_type::{DataTypeMap, NullTreatment, PyScalarValue, RexType}; +use crate::context::PySessionContext; use crate::errors::{PyDataFusionResult, py_runtime_err, py_type_err, py_unsupported_variant_err}; use crate::expr::aggregate_expr::PyAggregateFunction; use crate::expr::binary_expr::PyBinaryExpr; @@ -660,6 +666,55 @@ impl PyExpr { .into()), } } + + /// Serialize this `Expr` to protobuf bytes. + /// + /// When `ctx` is supplied, encoding routes through the session's + /// installed `LogicalExtensionCodec` so user FFI codecs see the + /// encode path. Without `ctx` a default-inner Python codec is + /// used; Python scalar UDFs still inline when in-band encoding + /// lands, non-Python UDFs fall through to the default codec. 
+ #[pyo3(signature = (ctx=None))] + pub fn to_bytes<'py>( + &'py self, + py: Python<'py>, + ctx: Option, + ) -> PyDataFusionResult> { + let default_codec; + let codec: &dyn datafusion_proto::logical_plan::LogicalExtensionCodec = match ctx { + Some(ref ctx) => ctx.logical_codec().as_ref(), + None => { + default_codec = PythonLogicalCodec::default(); + &default_codec + } + }; + let proto = to_proto::serialize_expr(&self.expr, codec) + .map_err(|e| PyRuntimeError::new_err(format!("Unable to serialize expr: {e}")))?; + let bytes = proto.encode_to_vec(); + Ok(PyBytes::new(py, &bytes)) + } + + /// Decode an `Expr` from protobuf bytes against the session's + /// function registry and logical codec. + #[staticmethod] + pub fn from_bytes( + ctx: PySessionContext, + proto_msg: Bound<'_, PyBytes>, + ) -> PyDataFusionResult { + let bytes: &[u8] = proto_msg.extract().map_err(Into::::into)?; + let proto_expr = + datafusion_proto::protobuf::LogicalExprNode::decode(bytes).map_err(|e| { + PyRuntimeError::new_err(format!( + "Unable to decode expression from serialized bytes: {e}" + )) + })?; + + let codec = ctx.logical_codec(); + let task_ctx = ctx.ctx.task_ctx(); + let expr = from_proto::parse_expr(&proto_expr, task_ctx.as_ref(), codec.as_ref()) + .map_err(|e| PyRuntimeError::new_err(format!("Unable to decode expr: {e}")))?; + Ok(Self { expr }) + } } #[pyclass( diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 0f7f3ab5a..e0135e3ed 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -62,6 +62,7 @@ NullTreatment, RexType, ) + from datafusion.context import SessionContext from datafusion.plan import LogicalPlan @@ -432,6 +433,25 @@ def variant_name(self) -> str: """ return self.expr.variant_name() + def to_bytes(self, ctx: SessionContext | None = None) -> bytes: + """Serialize this expression to protobuf bytes. + + When ``ctx`` is supplied, encoding routes through the session's + installed :class:`LogicalExtensionCodec`. 
Without ``ctx`` a + default codec is used. + """ + ctx_arg = ctx.ctx if ctx is not None else None + return self.expr.to_bytes(ctx_arg) + + @staticmethod + def from_bytes(ctx: SessionContext, data: bytes) -> Expr: + """Decode an expression from serialized protobuf bytes. + + ``ctx`` provides the function registry for resolving UDF + references and the logical codec for in-band Python payloads. + """ + return Expr(expr_internal.RawExpr.from_bytes(ctx.ctx, data)) + def __richcmp__(self, other: Expr, op: int) -> Expr: """Comparison operator.""" return Expr(self.expr.__richcmp__(other.expr, op)) diff --git a/python/tests/test_expr.py b/python/tests/test_expr.py index 8aa791ae1..6a466f6f2 100644 --- a/python/tests/test_expr.py +++ b/python/tests/test_expr.py @@ -1178,3 +1178,29 @@ def test_round_trip_pyscalar_value(ctx: SessionContext, value: pa.Scalar): df = ctx.sql("select 1 as a") df = df.select(lit(value)) assert pa.table(df)[0][0] == value + + +def test_expr_to_bytes_roundtrip(ctx: SessionContext) -> None: + """An Expr round-trips through the session's logical codec.""" + from datafusion import Expr + + original = col("a") + lit(1) + blob = original.to_bytes(ctx) + restored = Expr.from_bytes(ctx, blob) + + # Canonical name preserves the structure of the expression even + # though the underlying PyExpr instances are different. 
+ assert restored.canonical_name() == original.canonical_name() + + +def test_expr_to_bytes_no_ctx_default_codec() -> None: + """to_bytes(ctx=None) uses a default codec; builtin-only Exprs + still round-trip when a session is supplied on decode.""" + from datafusion import Expr + + fresh = SessionContext() + original = col("a") * lit(2) + blob = original.to_bytes() # encode side: default codec + restored = Expr.from_bytes(fresh, blob) + + assert restored.canonical_name() == original.canonical_name() From daeb755247c1aa14a5a4d77f62d0bbc078820705 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 15 May 2026 08:36:32 -0400 Subject: [PATCH 11/11] test(ffi-example): assert codec roundtrip restores plan output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR review feedback: weak `is not None` checks let regressions slip past. Mirror python/tests/test_plans.py — logical compares `df.collect() == round_trip.collect()`; physical compares `str(original) == str(restored)`. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../python/tests/_test_logical_extension_codec.py | 3 ++- .../python/tests/_test_physical_extension_codec.py | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/datafusion-ffi-example/python/tests/_test_logical_extension_codec.py b/examples/datafusion-ffi-example/python/tests/_test_logical_extension_codec.py index ad357bd73..cd0c5a61a 100644 --- a/examples/datafusion-ffi-example/python/tests/_test_logical_extension_codec.py +++ b/examples/datafusion-ffi-example/python/tests/_test_logical_extension_codec.py @@ -78,4 +78,5 @@ def test_ffi_logical_codec_roundtrip(): blob = df.logical_plan().to_bytes(ctx) restored = LogicalPlan.from_bytes(ctx, blob) - assert restored is not None + df_round_trip = ctx.create_dataframe_from_logical_plan(restored) + assert df.collect() == df_round_trip.collect() diff --git a/examples/datafusion-ffi-example/python/tests/_test_physical_extension_codec.py b/examples/datafusion-ffi-example/python/tests/_test_physical_extension_codec.py index 1f2da52fa..28eaaf2d9 100644 --- a/examples/datafusion-ffi-example/python/tests/_test_physical_extension_codec.py +++ b/examples/datafusion-ffi-example/python/tests/_test_physical_extension_codec.py @@ -71,7 +71,8 @@ def test_ffi_physical_codec_roundtrip(): codec inlines the UDF body, which the counting codec does not.""" ctx, _codec = _setup_session_with_codec() df = ctx.sql("SELECT abs(a) AS x FROM t") - blob = df.execution_plan().to_bytes(ctx) + original = df.execution_plan() + blob = original.to_bytes(ctx) restored = ExecutionPlan.from_bytes(ctx, blob) - assert restored is not None + assert str(original) == str(restored)