diff --git a/CHANGELOG.md b/CHANGELOG.md index 081a7c6..074ab6f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,23 +1,91 @@ # Changelog -## 2.4.0 (Unreleased) +## 3.0.0 (Unreleased) -### Added +### Breaking Changes + +- **Simplified execution model** - Only two public execution modes: `worker` and `owngil` + - `worker`: Dedicated pthread per context with stable thread affinity (default) + - `owngil`: Dedicated pthread + subinterpreter with own GIL (Python 3.14+) + - Removed `multi_executor` and `free_threaded` from public API + - Internal capability detection still tracks Python features + +- **Removed `py:num_executors/0`** - Contexts now use per-context worker threads + instead of a shared executor pool. This function is no longer needed. + +- **`py:execution_mode/0` returns `worker | owngil`** - Based on the `context_mode` + application configuration. Previously returned internal capabilities like + `free_threaded`, `subinterp`, or `multi_executor`. -- **Context thread affinity** - Contexts in MULTI_EXECUTOR mode are now assigned a - fixed executor thread at creation. All operations (call, eval, exec) from the same - context run on the same OS thread, preventing thread state corruption in libraries - like numpy and PyTorch that have thread-local state. +- **Removed `py:async_stream/3,4`** - Streaming async generators was never + implemented behind the API and always returned `{error, stream_not_implemented}`. + Use `py:stream_start/3,4` for sync generators; async-generator support may + return in a later release. + +- **Removed `num_executors` / `num_async_workers` configuration** - Both keys + were no-ops after the v3.0 worker rework. Configure context count via + `num_contexts` and the rate-limit ceiling via `max_concurrent`. + +- **Strict context-mode validation at the NIF boundary** - `py_nif:context_create/1` + now returns `{error, {invalid_mode, Atom}}` for anything other than `worker | owngil`. 
+ Previously, callers that bypassed `py_context` (notably `py_reactor_context`) + silently mapped any unknown atom — including legacy `auto` and `subinterp` — + to worker mode. Code that relied on that loophole must pass `worker` (or + `owngil`) explicitly. + +### Fixed + +- **`py:async_call/3,4` + `py:async_await/1,2` round-trip** - Previously the + await receive matched `{py_response, _, _}` while the event loop sent + `{async_result, _, _}`, causing every async call to silently time out. + Async calls now go directly through `py_event_loop:create_task` and + `py_event_loop:await`. + +- **`py:async_gather/1,2` actually executes** - Reimplemented as concurrent + `async_call` submission with sequential `async_await`. Returns + `{ok, [Result1, ...]}` on success or `{error, {gather_failed, [{Idx, Reason}, ...]}}` + if any call fails. The previous implementation returned `gather_not_implemented`. ### Changed -- **`py:execution_mode/0` now returns actual mode** - Returns `worker` (default), - `owngil`, `free_threaded`, or `multi_executor` based on actual configuration - instead of Python capability. Previously returned `subinterp` even when using - worker mode. +- **Per-context worker threads** - Each context now gets its own dedicated pthread + that handles all Python operations. This provides stable thread affinity for + numpy/torch/tensorflow compatibility without needing a shared executor pool. + +- **Async NIF dispatch** - Context operations use async NIFs with message passing + instead of blocking dirty schedulers. This improves concurrency under load. + +- **Request queue per context** - Replaced single-slot request pattern with proper + request queues that support multiple concurrent callers. + +- **No global asyncio policy install on Python 3.14+.** `asyncio.set_event_loop_policy` + was deprecated in 3.14 and is removed in 3.16. 
The Erlang integration's run path + already uses `loop_factory=` (`erlang.run/1`, `asyncio.Runner`) so the global + policy was only a convenience for bare `asyncio.run()` inside `py:exec`. We now + skip the install on 3.14+ to avoid the deprecation warning. On 3.14+ use + `erlang.run(main)` or `asyncio.Runner(loop_factory=erlang.new_event_loop)` + explicitly. Behavior on Python 3.9–3.13 is unchanged. `erlang.install()` raises + `RuntimeError` on 3.14+ (still emits a `DeprecationWarning` and works on 3.12–3.13). + +### Removed -- **Removed obsolete subinterp test references** - Test suites updated to reflect - the removal of subinterpreter mode. Tests now use `worker` or `owngil` modes. +- Multi-executor pool (`g_executors[]`, `multi_executor_start/stop`) +- `context_dispatch_call/eval/exec` functions (dead code) +- References to `PY_MODE_MULTI_EXECUTOR` in context operations +- `py_async_pool` legacy gen_server (unused after async API rewire) +- **Explicit `py:subinterp_*` handle API removed.** `py:subinterp_create/0`, + `subinterp_destroy/1`, `subinterp_call/4,5`, `subinterp_eval/2,3`, + `subinterp_exec/2`, `subinterp_cast/4`, `subinterp_async_call/4`, + `subinterp_await/1,2`, and `subinterp_pool_*` are all gone. Use + `py_context:new(#{mode => owngil})` instead — it gives the same + parallelism with OTP supervision and automatic cleanup. + `py:subinterp_supported/0` (capability probe) and `py:parallel/1` + (which routes through the context API) stay. +- Internal `py_execution_mode_t` collapsed from 3 values to 2 (`free_threaded` + / `gil`); `py_nif:execution_mode/0` returns `free_threaded | gil` instead + of the old `free_threaded | subinterp | multi_executor`. +- `examples/reactor_owngil_example.erl` deleted (called nonexistent + `py:subinterp_reactor_*` functions; pre-existing breakage). 
## 2.3.1 (2026-04-01) diff --git a/README.md b/README.md index 3c6d60a..e5744bc 100644 --- a/README.md +++ b/README.md @@ -16,10 +16,9 @@ evaluate expressions, and stream from generators - all without blocking Erlang schedulers. **Parallelism options:** -- **Worker mode** (default, recommended) - Works with any Python version. With free-threaded Python (3.13t+), provides true parallelism automatically -- **SHARED_GIL sub-interpreters** (Python 3.12+) - Isolated namespaces, shared GIL (isolation improves in 3.14+) -- **OWN_GIL sub-interpreters** (Python 3.14+) - Each interpreter has its own GIL, true parallelism -- **BEAM processes** - Fan out work across lightweight Erlang processes +- **Worker mode** (default, recommended) - Works with any Python version. With free-threaded Python (3.13t+), provides true parallelism automatically. +- **OWN_GIL sub-interpreters** (Python 3.14+) - Each interpreter has its own GIL, true parallelism. +- **BEAM processes** - Fan out work across lightweight Erlang processes. Key features: - **Process-bound environments** - Each Erlang process gets isolated Python state, enabling OTP-supervised Python actors @@ -302,14 +301,11 @@ Ref = py:async_call(aiohttp, get, [<<"https://api.example.com/data">>]), {ok, Response} = py:async_await(Ref). %% Gather multiple async calls concurrently -{ok, Results} = py:async_gather([ +{ok, [Users, Posts, Comments]} = py:async_gather([ {aiohttp, get, [<<"https://api.example.com/users">>]}, {aiohttp, get, [<<"https://api.example.com/posts">>]}, {aiohttp, get, [<<"https://api.example.com/comments">>]} ]). - -%% Stream from async generators -{ok, Chunks} = py:async_stream(mymodule, async_generator, [args]). 
``` ## Parallel Execution with Sub-interpreters @@ -328,7 +324,7 @@ True parallelism without GIL contention using Python 3.14+ OWN_GIL sub-interpret %% Each call runs in its own interpreter with its own GIL ``` -For Python 3.12/3.13, use SHARED_GIL sub-interpreters (`mode => subinterp`) for namespace isolation, but note that parallelism is limited by the shared GIL. +For Python 3.12/3.13 the public modes are `worker` (default) and `owngil` (Python 3.14+ only). Earlier versions run all contexts under the shared main interpreter via dedicated worker threads — namespace isolation between contexts is local-dict based, not via subinterpreters. ## Parallel Processing with BEAM Processes @@ -590,9 +586,9 @@ ok = py:clear_traces(). %% sys.config [ {erlang_python, [ - {num_workers, 4}, %% Python worker pool size - {max_concurrent, 17}, %% Max concurrent operations (default: schedulers * 2 + 1) - {num_executors, 4} %% Executor threads (multi-executor mode) + {num_contexts, 8}, %% Number of contexts (default: schedulers) + {context_mode, worker}, %% worker | owngil + {max_concurrent, 17} %% Max concurrent operations (default: schedulers * 2 + 1) ]} ]. ``` @@ -605,40 +601,34 @@ When creating Python contexts, you can choose the execution mode: | Mode | Python Version | Description | |------|----------------|-------------| -| `worker` | Any | Main interpreter, shared namespace (default, recommended) | -| `subinterp` | 3.12+ | SHARED_GIL sub-interpreter, isolated namespace | -| `owngil` | 3.14+ | OWN_GIL sub-interpreter, true parallelism | +| `worker` | Any | Dedicated pthread per context, main interpreter namespace (default) | +| `owngil` | 3.14+ | Dedicated pthread + subinterpreter with its own GIL, true parallelism | ```erlang %% Default: worker mode (recommended) %% With free-threaded Python (3.13t+), provides true parallelism automatically {ok, Ctx} = py_context:new(#{}). 
-%% Explicit subinterpreter with shared GIL (Python 3.12+) -%% Provides namespace isolation but no parallelism -{ok, Ctx} = py_context:new(#{mode => subinterp}). - %% OWN_GIL mode for true parallelism (Python 3.14+ required) %% Each context runs in its own pthread with independent GIL {ok, Ctx} = py_context:new(#{mode => owngil}). ``` -**Worker mode is recommended** because it works with any Python version and automatically benefits from free-threaded Python (3.13t+) when available. +**Worker mode is recommended** because it works with any Python version and automatically benefits from free-threaded Python (3.13t+) when available. Each context owns a dedicated pthread, providing stable thread affinity for libraries with thread-local state (numpy, torch, tensorflow). -**Why OWN_GIL requires Python 3.14+**: Some C extensions (e.g., `_decimal`, `numpy`) have global state bugs in sub-interpreters on Python 3.12/3.13. These are fixed in Python 3.14. SHARED_GIL mode works on 3.12+ but with caveats for C extensions with global state. +**Why OWN_GIL requires Python 3.14+**: Some C extensions (e.g., `_decimal`, `numpy`) have global state bugs in sub-interpreters on Python 3.12/3.13. These are fixed in Python 3.14. ### Runtime Detection -Check the current execution mode: +Check the current execution mode (mirrors the `context_mode` application env): ```erlang -py:execution_mode(). %% => free_threaded | subinterp | multi_executor +py:execution_mode(). 
%% => worker | owngil ``` | Mode | Python Version | Parallelism | |------|----------------|-------------| -| Free-threaded | 3.13+ (nogil) | True parallel, no GIL | -| Sub-interpreter | 3.12+ | Per-interpreter GIL | -| Multi-executor | Any | GIL contention | +| `worker` (default) | Any | One pthread per context; true parallelism on free-threaded 3.13t+ | +| `owngil` | 3.14+ | Per-interpreter GIL, true parallelism across contexts | ## Error Handling diff --git a/c_src/py_convert.c b/c_src/py_convert.c index 56a4f06..4961aa4 100644 --- a/c_src/py_convert.c +++ b/c_src/py_convert.c @@ -95,13 +95,19 @@ static void shared_dict_capsule_destructor(PyObject *capsule) { * @return true if obj is a numpy ndarray, false otherwise */ static inline bool is_numpy_ndarray(PyObject *obj) { - /* Use cached type for fast isinstance check when available. - * The cache is only valid in the main interpreter - subinterpreters - * have their own object space, so we fall back to attribute detection. */ - if (g_numpy_ndarray_type != NULL && g_execution_mode != PY_MODE_SUBINTERP) { + /* The cache is populated in the main interpreter. On builds where + * subinterpreters can be created (and the runtime isn't free-threaded, + * which short-circuits subinterp use) a context may be running inside + * a subinterpreter where the cached type is invalid -- fall back to + * duck typing in that case. */ +#if defined(HAVE_SUBINTERPRETERS) && !defined(HAVE_FREE_THREADED) + /* Build supports subinterpreters and isn't free-threaded: + * skip the cached fast path. */ +#else + if (g_numpy_ndarray_type != NULL) { return PyObject_IsInstance(obj, g_numpy_ndarray_type) == 1; } - +#endif /* Fallback: duck typing via attribute detection. * Check for both 'tolist' method and 'ndim' attribute. 
*/ return PyObject_HasAttrString(obj, "tolist") && diff --git a/c_src/py_exec.c b/c_src/py_exec.c index d57d08c..cd8a32b 100644 --- a/c_src/py_exec.c +++ b/c_src/py_exec.c @@ -24,7 +24,8 @@ * This module implements the core Python execution engine, handling: * * - **Timeout support**: Trace-based execution timeout monitoring - * - **Executor threads**: Single and multi-executor pool management + * - **Single coordinator executor thread**: serializes legacy worker API and + * coordinator tasks behind one GIL-holding thread. * - **Request processing**: Dispatch for call/eval/exec/import operations * - **Free-threaded mode**: Support for Python 3.13+ no-GIL builds * @@ -43,6 +44,10 @@ * │ completed result * ``` * + * Per-context worker threads (see py_nif.c) handle the public worker / owngil + * APIs directly; the single executor here only backs the legacy worker pool + * and a few coordinator paths. + * * @par GIL Management Patterns * * Following PyO3/Granian best practices: @@ -51,14 +56,6 @@ * - **Py_END_ALLOW_THREADS**: Re-acquire GIL before Python calls * - **PyGILState_Ensure/Release**: For callbacks from non-Python threads * - * @par Execution Modes - * - * | Mode | Description | GIL Handling | - * |------|-------------|--------------| - * | FREE_THREADED | Python 3.13+ no-GIL | Direct execution | - * | SUBINTERP | Python 3.12+ | Per-interpreter GIL | - * | MULTI_EXECUTOR | Traditional | N executor threads | - * * @par Thread Safety * * - Executor queues protected by pthread mutexes @@ -155,19 +152,10 @@ static bool check_timeout_error(void) { static void detect_execution_mode(void) { #ifdef HAVE_FREE_THREADED - /* Python 3.13+ with free-threading enabled */ g_execution_mode = PY_MODE_FREE_THREADED; - return; -#endif - -#ifdef HAVE_SUBINTERPRETERS - /* Python 3.12+ supports per-interpreter GIL */ - g_execution_mode = PY_MODE_SUBINTERP; - return; +#else + g_execution_mode = PY_MODE_GIL; #endif - - /* Fallback: multi-executor with shared GIL */ - 
g_execution_mode = PY_MODE_MULTI_EXECUTOR; } /* ============================================================================ @@ -783,50 +771,22 @@ static int executor_enqueue(py_request_t *req) { /* Track enqueued requests */ atomic_fetch_add(&g_counters.enqueue_count, 1); - switch (g_execution_mode) { #ifdef HAVE_FREE_THREADED - case PY_MODE_FREE_THREADED: - /* Execute directly in free-threaded mode - no executor needed */ - { - PyGILState_STATE gstate = PyGILState_Ensure(); - process_request(req); - PyGILState_Release(gstate); - /* Signal completion immediately */ - pthread_mutex_lock(&req->mutex); - req->completed = true; - pthread_cond_signal(&req->cond); - pthread_mutex_unlock(&req->mutex); - } - return 0; -#endif - - case PY_MODE_MULTI_EXECUTOR: - if (atomic_load(&g_multi_executor_initialized)) { - /* Route to multi-executor pool. - * Use worker's or context's assigned executor for thread affinity if available. - * This ensures libraries like numpy/torch that have thread-local - * state always run on the same thread for a given worker/context. 
*/ - int exec_id; - if (req->worker != NULL && req->worker->executor_id >= 0) { - exec_id = req->worker->executor_id % g_num_executors; - } else if (req->context != NULL && req->context->executor_id >= 0) { - exec_id = req->context->executor_id % g_num_executors; - } else { - exec_id = select_executor(); - } - multi_executor_enqueue(exec_id, req); - return 0; - } - /* Fall through to single executor if multi not initialized */ - break; - - case PY_MODE_SUBINTERP: - default: - /* Use single executor */ - break; + if (g_execution_mode == PY_MODE_FREE_THREADED) { + /* Execute directly in free-threaded mode - no executor needed */ + PyGILState_STATE gstate = PyGILState_Ensure(); + process_request(req); + PyGILState_Release(gstate); + /* Signal completion immediately */ + pthread_mutex_lock(&req->mutex); + req->completed = true; + pthread_cond_signal(&req->cond); + pthread_mutex_unlock(&req->mutex); + return 0; } +#endif - /* Single executor queue */ + /* Single coordinator executor queue */ pthread_mutex_lock(&g_executor_mutex); req->next = NULL; if (g_executor_queue_tail == NULL) { @@ -897,325 +857,8 @@ static void executor_stop(void) { pthread_join(g_executor_thread, NULL); } -/* ============================================================================ - * Multi-executor pool implementation - * - * For MULTI_EXECUTOR mode (traditional Python), we run N executor threads - * that each hold the GIL in turn. This allows GIL contention-based parallelism - * similar to PyO3's multi-executor pattern. - * ============================================================================ */ - -/** - * Main function for a multi-executor thread. - * Each executor has its own queue and processes requests independently. - * - * GIL handling: Acquire GIL only when processing work, not while idle. - * This prevents idle executors from competing with dirty schedulers - * running actual Python work via the context-based API. 
- */ -static void *multi_executor_thread_main(void *arg) { - executor_t *exec = (executor_t *)arg; - - exec->running = true; - - while (!exec->shutdown) { - py_request_t *req = NULL; - - /* Wait for work - NO GIL held while idle */ - pthread_mutex_lock(&exec->mutex); - while (exec->queue_head == NULL && !exec->shutdown) { - pthread_cond_wait(&exec->cond, &exec->mutex); - } - - /* Dequeue request if available */ - if (exec->queue_head != NULL) { - req = exec->queue_head; - exec->queue_head = req->next; - if (exec->queue_head == NULL) { - exec->queue_tail = NULL; - } - req->next = NULL; - } - pthread_mutex_unlock(&exec->mutex); - - if (req != NULL) { - if (req->type == PY_REQ_SHUTDOWN) { - pthread_mutex_lock(&req->mutex); - req->completed = true; - pthread_cond_signal(&req->cond); - pthread_mutex_unlock(&req->mutex); - break; - } else { - /* Acquire GIL only for actual work */ - PyGILState_STATE gstate = PyGILState_Ensure(); - - /* Process the request */ - process_request(req); - - /* Release GIL immediately after processing */ - PyGILState_Release(gstate); - - /* Signal completion (outside GIL) */ - pthread_mutex_lock(&req->mutex); - req->completed = true; - pthread_cond_signal(&req->cond); - pthread_mutex_unlock(&req->mutex); - } - } - } - - exec->running = false; - - return NULL; -} - -/** - * Select an executor using round-robin. - */ -static int select_executor(void) { - int idx = atomic_fetch_add(&g_next_executor, 1) % g_num_executors; - return idx; -} - -/** - * Enqueue a request to a specific executor. - */ -static void multi_executor_enqueue(int exec_id, py_request_t *req) { - executor_t *exec = &g_executors[exec_id]; - - pthread_mutex_lock(&exec->mutex); - req->next = NULL; - if (exec->queue_tail == NULL) { - exec->queue_head = req; - exec->queue_tail = req; - } else { - exec->queue_tail->next = req; - exec->queue_tail = req; - } - pthread_cond_signal(&exec->cond); - pthread_mutex_unlock(&exec->mutex); -} - -/** - * Start the multi-executor pool. 
- */ -static int multi_executor_start(int num_executors) { - if (atomic_load(&g_multi_executor_initialized)) { - return 0; - } - - if (num_executors < MIN_EXECUTORS) { - num_executors = MIN_EXECUTORS; - } - if (num_executors > MAX_EXECUTORS) { - num_executors = MAX_EXECUTORS; - } - - g_num_executors = num_executors; - - for (int i = 0; i < num_executors; i++) { - executor_t *exec = &g_executors[i]; - exec->id = i; - exec->queue_head = NULL; - exec->queue_tail = NULL; - exec->running = false; - exec->shutdown = false; - pthread_mutex_init(&exec->mutex, NULL); - pthread_cond_init(&exec->cond, NULL); - - if (pthread_create(&exec->thread, NULL, multi_executor_thread_main, exec) != 0) { - /* Cleanup already created threads */ - for (int j = 0; j < i; j++) { - g_executors[j].shutdown = true; - pthread_cond_signal(&g_executors[j].cond); - pthread_join(g_executors[j].thread, NULL); - pthread_mutex_destroy(&g_executors[j].mutex); - pthread_cond_destroy(&g_executors[j].cond); - } - return -1; - } - } - - /* Wait for all executors to be ready */ - int max_wait = 100; - bool all_ready = false; - while (!all_ready && max_wait-- > 0) { - all_ready = true; - for (int i = 0; i < num_executors; i++) { - if (!g_executors[i].running) { - all_ready = false; - break; - } - } - if (!all_ready) { - usleep(10000); - } - } - - atomic_store(&g_multi_executor_initialized, all_ready); - return all_ready ? 0 : -1; -} - -/** - * Stop the multi-executor pool. 
- */ -static void multi_executor_stop(void) { - if (!atomic_load(&g_multi_executor_initialized)) { - return; - } - - /* Allocate shutdown requests for all executors */ - py_request_t *shutdown_reqs[MAX_EXECUTORS] = {0}; - - /* Signal shutdown and send shutdown requests to all executors */ - for (int i = 0; i < g_num_executors; i++) { - executor_t *exec = &g_executors[i]; - exec->shutdown = true; - - py_request_t *shutdown_req = enif_alloc(sizeof(py_request_t)); - if (shutdown_req != NULL) { - request_init(shutdown_req); - shutdown_req->type = PY_REQ_SHUTDOWN; - shutdown_reqs[i] = shutdown_req; - multi_executor_enqueue(i, shutdown_req); - } - /* If alloc fails, the shutdown flag is already set, so executor - * will exit when it checks the flag */ - } - - /* Wait for all executors to finish and clean up shutdown requests */ - for (int i = 0; i < g_num_executors; i++) { - executor_t *exec = &g_executors[i]; - pthread_join(exec->thread, NULL); - pthread_mutex_destroy(&exec->mutex); - pthread_cond_destroy(&exec->cond); - - /* Clean up the shutdown request */ - if (shutdown_reqs[i] != NULL) { - request_cleanup(shutdown_reqs[i]); - enif_free(shutdown_reqs[i]); - } - } - - atomic_store(&g_multi_executor_initialized, false); -} - /* * Note: Free-threaded execution (Python 3.13+ nogil) is handled inline * in executor_enqueue() using PyGILState_Ensure/Release which are no-ops * in free-threaded builds but still work correctly. */ - -/* ============================================================================ - * Context dispatch to executor - * - * When a context has thread affinity (executor_id >= 0), we dispatch - * operations through the executor queue instead of executing directly - * on the dirty scheduler. This ensures numpy/torch thread-local state - * consistency. - * ============================================================================ */ - -/** - * Dispatch a context call operation to the executor. 
- * - * @param env Caller's NIF environment - * @param ctx Context with thread affinity - * @param module_bin Module name binary - * @param func_bin Function name binary - * @param args_term Arguments list - * @param kwargs_term Keyword arguments map - * @return Result term - */ -ERL_NIF_TERM context_dispatch_call(ErlNifEnv *env, py_context_t *ctx, - ErlNifBinary *module_bin, ErlNifBinary *func_bin, - ERL_NIF_TERM args_term, ERL_NIF_TERM kwargs_term) { - py_request_t req; - request_init(&req); - - req.type = PY_REQ_CALL; - req.env = env; - req.worker = NULL; - req.context = ctx; - req.module_bin = *module_bin; - req.func_bin = *func_bin; - req.args_term = args_term; - req.kwargs_term = kwargs_term; - req.timeout_ms = 0; - - if (executor_enqueue(&req) < 0) { - request_cleanup(&req); - return make_error(env, "executor_shutdown"); - } - - executor_wait(&req); - ERL_NIF_TERM result = req.result; - request_cleanup(&req); - - return result; -} - -/** - * Dispatch a context eval operation to the executor. - * - * @param env Caller's NIF environment - * @param ctx Context with thread affinity - * @param code_bin Code string binary - * @param locals_term Local variables map - * @return Result term - */ -ERL_NIF_TERM context_dispatch_eval(ErlNifEnv *env, py_context_t *ctx, - ErlNifBinary *code_bin, ERL_NIF_TERM locals_term) { - py_request_t req; - request_init(&req); - - req.type = PY_REQ_EVAL; - req.env = env; - req.worker = NULL; - req.context = ctx; - req.code_bin = *code_bin; - req.locals_term = locals_term; - req.timeout_ms = 0; - - if (executor_enqueue(&req) < 0) { - request_cleanup(&req); - return make_error(env, "executor_shutdown"); - } - - executor_wait(&req); - ERL_NIF_TERM result = req.result; - request_cleanup(&req); - - return result; -} - -/** - * Dispatch a context exec operation to the executor. 
- * - * @param env Caller's NIF environment - * @param ctx Context with thread affinity - * @param code_bin Code string binary - * @return Result term - */ -ERL_NIF_TERM context_dispatch_exec(ErlNifEnv *env, py_context_t *ctx, - ErlNifBinary *code_bin) { - py_request_t req; - request_init(&req); - - req.type = PY_REQ_EXEC; - req.env = env; - req.worker = NULL; - req.context = ctx; - req.code_bin = *code_bin; - req.timeout_ms = 0; - - if (executor_enqueue(&req) < 0) { - request_cleanup(&req); - return make_error(env, "executor_shutdown"); - } - - executor_wait(&req); - ERL_NIF_TERM result = req.result; - request_cleanup(&req); - - return result; -} diff --git a/c_src/py_nif.c b/c_src/py_nif.c index 93dd59b..f81fd16 100644 --- a/c_src/py_nif.c +++ b/c_src/py_nif.c @@ -50,9 +50,6 @@ ErlNifResourceType *WORKER_RESOURCE_TYPE = NULL; ErlNifResourceType *PYOBJ_RESOURCE_TYPE = NULL; /* ASYNC_WORKER_RESOURCE_TYPE removed - async workers replaced by event loop model */ ErlNifResourceType *SUSPENDED_STATE_RESOURCE_TYPE = NULL; -#ifdef HAVE_SUBINTERPRETERS -ErlNifResourceType *SUBINTERP_WORKER_RESOURCE_TYPE = NULL; -#endif /* Process-per-context resource type (no mutex) */ ErlNifResourceType *PY_CONTEXT_RESOURCE_TYPE = NULL; @@ -140,13 +137,7 @@ _Atomic py_runtime_state_t g_runtime_state = PY_STATE_UNINIT; PyThreadState *g_main_thread_state = NULL; /* Execution mode */ -py_execution_mode_t g_execution_mode = PY_MODE_MULTI_EXECUTOR; -int g_num_executors = 4; - -/* Multi-executor pool */ -executor_t g_executors[MAX_EXECUTORS]; -_Atomic int g_next_executor = 0; -_Atomic bool g_multi_executor_initialized = false; +py_execution_mode_t g_execution_mode = PY_MODE_GIL; /* Single executor state */ pthread_t g_executor_thread; @@ -343,35 +334,9 @@ static void pyobj_destructor(ErlNifEnv *env, void *obj) { atomic_fetch_add(&g_counters.pyobj_destroyed, 1); } -/* async_worker_destructor removed - async workers replaced by event loop model */ - -#ifdef HAVE_SUBINTERPRETERS -static void 
subinterp_worker_destructor(ErlNifEnv *env, void *obj) { - (void)env; - py_subinterp_worker_t *worker = (py_subinterp_worker_t *)obj; - - /* For OWN_GIL subinterpreters, we cannot safely acquire the GIL from the - * GC thread (destructor may run on any thread). PyGILState_Ensure only - * works for the main interpreter, and PyThreadState_Swap doesn't actually - * acquire the GIL. - * - * If the user didn't call the explicit destroy function, the subinterpreter - * leaks. This is a known limitation - users must call destroy explicitly. */ - if (worker->tstate != NULL && runtime_is_running()) { -#ifdef DEBUG - fprintf(stderr, "Warning: subinterp_worker leaked - not destroyed " - "via explicit destroy. Use subinterp_worker_destroy/1.\n"); -#endif - /* Skip Python cleanup - we can't safely acquire the subinterpreter's GIL */ - worker->tstate = NULL; - worker->globals = NULL; - worker->locals = NULL; - } - - /* Destroy the mutex */ - pthread_mutex_destroy(&worker->mutex); -} -#endif +/* async_worker_destructor and subinterp_worker_destructor removed — + * async workers replaced by event loop model; subinterp_worker resource + * type retired with the explicit handle API. */ /** * @brief Destructor for py_context_t (process-per-context) @@ -1142,38 +1107,12 @@ static ERL_NIF_TERM nif_py_init(ErlNifEnv *env, int argc, const ERL_NIF_TERM arg /* Save main thread state and release GIL for other threads */ g_main_thread_state = PyEval_SaveThread(); - /* Start executors based on execution mode */ + /* Start single executor for coordinator operations. + * Context operations use per-context worker threads (see worker_context_init). + * The single executor handles legacy worker API and coordinator tasks. 
*/ int executor_result = 0; - switch (g_execution_mode) { - case PY_MODE_FREE_THREADED: - /* No executor needed - direct execution */ - break; - - case PY_MODE_SUBINTERP: - /* Use single executor for coordinator operations */ - executor_result = executor_start(); - break; - - case PY_MODE_MULTI_EXECUTOR: - default: - /* Start multiple executors for GIL contention mode */ - { - int num_exec = MIN_EXECUTORS; /* Fallback if not provided */ - /* Check for config */ - if (argc > 0 && enif_is_map(env, argv[0])) { - ERL_NIF_TERM key = enif_make_atom(env, "num_executors"); - ERL_NIF_TERM value; - if (enif_get_map_value(env, argv[0], key, &value)) { - enif_get_int(env, value, &num_exec); - } - } - executor_result = multi_executor_start(num_exec); - if (executor_result < 0) { - /* Fallback to single executor */ - executor_result = executor_start(); - } - } - break; + if (g_execution_mode != PY_MODE_FREE_THREADED) { + executor_result = executor_start(); } if (executor_result < 0) { @@ -1221,24 +1160,9 @@ static ERL_NIF_TERM nif_finalize(ErlNifEnv *env, int argc, const ERL_NIF_TERM ar * 3. Then clean up caches with GIL (no active work at this point) */ - /* Step 1: Stop executors - they will finish in-flight requests and exit */ - switch (g_execution_mode) { - case PY_MODE_FREE_THREADED: - /* No executor to stop */ - break; - - case PY_MODE_SUBINTERP: - executor_stop(); - break; - - case PY_MODE_MULTI_EXECUTOR: - default: - if (atomic_load(&g_multi_executor_initialized)) { - multi_executor_stop(); - } else { - executor_stop(); - } - break; + /* Step 1: Stop executor - it will finish in-flight requests and exit */ + if (g_execution_mode != PY_MODE_FREE_THREADED) { + executor_stop(); } /* Step 2: Clean up thread worker system */ @@ -1334,11 +1258,6 @@ static ERL_NIF_TERM nif_worker_new(ErlNifEnv *env, int argc, const ERL_NIF_TERM worker->has_callback_handler = false; worker->callback_env = NULL; - /* Assign executor affinity for thread-safe library support (numpy, torch). 
- * Each worker gets a fixed executor to ensure all calls from the same - * worker go to the same thread, preventing thread state corruption. */ - worker->executor_id = select_executor(); - PyGILState_Release(gstate); ERL_NIF_TERM result = enif_make_resource(env, worker); @@ -1755,29 +1674,12 @@ static ERL_NIF_TERM nif_execution_mode(ErlNifEnv *env, int argc, const ERL_NIF_T (void)argc; (void)argv; - const char *mode_str; - switch (g_execution_mode) { - case PY_MODE_FREE_THREADED: - mode_str = "free_threaded"; - break; - case PY_MODE_SUBINTERP: - mode_str = "subinterp"; - break; - case PY_MODE_MULTI_EXECUTOR: - default: - mode_str = "multi_executor"; - break; - } + const char *mode_str = (g_execution_mode == PY_MODE_FREE_THREADED) + ? "free_threaded" + : "gil"; return enif_make_atom(env, mode_str); } -static ERL_NIF_TERM nif_num_executors(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { - (void)argc; - (void)argv; - - return enif_make_int(env, g_num_executors); -} - /* ============================================================================ * Callback support NIFs * ============================================================================ */ @@ -1903,207 +1805,251 @@ static ERL_NIF_TERM nif_owngil_supported(ErlNifEnv *env, int argc, const ERL_NIF #endif } -#ifdef HAVE_SUBINTERPRETERS - -static ERL_NIF_TERM nif_subinterp_worker_new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { - (void)argc; - (void)argv; - - if (!runtime_is_running()) { - return make_error(env, "python_not_running"); - } - - py_subinterp_worker_t *worker = enif_alloc_resource(SUBINTERP_WORKER_RESOURCE_TYPE, - sizeof(py_subinterp_worker_t)); - if (worker == NULL) { - return make_error(env, "alloc_failed"); - } - - /* Initialize mutex for thread-safe access */ - if (pthread_mutex_init(&worker->mutex, NULL) != 0) { - enif_release_resource(worker); - return make_error(env, "mutex_init_failed"); - } - - /* Need the main GIL to create sub-interpreter */ - PyGILState_STATE gstate = 
PyGILState_Ensure(); - - /* Save current thread state so we can restore it after creating sub-interp */ - PyThreadState *main_tstate = PyThreadState_Get(); - /* Configure sub-interpreter with its own GIL */ - PyInterpreterConfig config = { - .use_main_obmalloc = 0, - .allow_fork = 0, - .allow_exec = 0, - .allow_threads = 1, - .allow_daemon_threads = 0, - .check_multi_interp_extensions = 1, - .gil = PyInterpreterConfig_OWN_GIL, /* This is the key - own GIL! */ - }; +/* ============================================================================ + * Shared-GIL Pool Model for Subinterpreters + * + * Subinterpreters share the GIL but provide namespace isolation. Execution + * happens on dirty schedulers using PyThreadState_Swap() to switch to the + * subinterpreter's thread state from the pool. + * ============================================================================ */ - PyThreadState *tstate = NULL; - PyStatus status = Py_NewInterpreterFromConfig(&tstate, &config); +/* Forward declaration - defined later in this file */ +static PyObject *context_get_module(py_context_t *ctx, const char *module_name); - if (PyStatus_Exception(status) || tstate == NULL) { - /* We're still in main interpreter on error */ - PyGILState_Release(gstate); - enif_release_resource(worker); - return make_error(env, "subinterp_create_failed"); - } +/* Old thread-per-context functions removed - now using shared-GIL pool model */ - worker->interp = PyThreadState_GetInterpreter(tstate); - worker->tstate = tstate; +/* ============================================================================ + * OWN_GIL Context Support + * + * OWN_GIL contexts create a dedicated pthread with its own Python subinterpreter + * that has an independent GIL. This enables true parallel Python execution. 
+ * + * Architecture: + * - Each OWN_GIL context gets its own pthread at creation time + * - The pthread creates an OWN_GIL subinterpreter and runs a request loop + * - Dirty schedulers dispatch requests via condition variables + * - Terms are passed via enif_make_copy() (zero serialization overhead) + * ============================================================================ */ - /* Create global/local namespaces in the new interpreter */ - worker->globals = PyDict_New(); - worker->locals = PyDict_New(); +/* ============================================================================ + * Context Request Queue Operations + * + * These functions manage the request queue for worker/owngil contexts. + * They replace the single-slot pattern that had race conditions. + * Available for all Python versions to support worker thread mode. + * ============================================================================ */ - /* Import __builtins__ */ - PyObject *builtins = PyEval_GetBuiltins(); - PyDict_SetItemString(worker->globals, "__builtins__", builtins); +/** + * @brief Enqueue a request to a context's request queue + * + * Thread-safe. Adds request to tail of queue and signals worker. + * Caller must have already set refcount to 2 (caller + queue). 
+ * + * @param ctx The context + * @param req The request (refcount should be 2) + */ +static void ctx_queue_enqueue(py_context_t *ctx, ctx_request_t *req) { + pthread_mutex_lock(&ctx->queue_mutex); - /* Initialize event loop for this subinterpreter */ - if (init_subinterpreter_event_loop(env) < 0) { - /* Clean up Python objects before ending interpreter */ - Py_XDECREF(worker->globals); - worker->globals = NULL; - Py_XDECREF(worker->locals); - worker->locals = NULL; - Py_EndInterpreter(tstate); - /* Re-acquire main interpreter's GIL after subinterpreter was destroyed */ - PyEval_RestoreThread(main_tstate); - PyGILState_Release(gstate); - enif_release_resource(worker); - return make_error(env, "event_loop_init_failed"); + req->next = NULL; + if (ctx->queue_tail == NULL) { + ctx->queue_head = req; + ctx->queue_tail = req; + } else { + ctx->queue_tail->next = req; + ctx->queue_tail = req; } - /* Switch back to main interpreter - release subinterp's GIL and acquire main's */ - PyEval_SaveThread(); /* Release subinterpreter's GIL */ - PyEval_RestoreThread(main_tstate); /* Acquire main interpreter's GIL */ - - PyGILState_Release(gstate); - - ERL_NIF_TERM result = enif_make_resource(env, worker); - enif_release_resource(worker); - - return enif_make_tuple2(env, ATOM_OK, result); + pthread_cond_signal(&ctx->queue_not_empty); + pthread_mutex_unlock(&ctx->queue_mutex); } -static ERL_NIF_TERM nif_subinterp_worker_destroy(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { - (void)argc; - py_subinterp_worker_t *worker; +/** + * @brief Dequeue a request from a context's request queue + * + * Blocks until a request is available or shutdown is requested. + * Returns NULL if shutdown requested and queue is empty. 
+ * + * @param ctx The context + * @return The dequeued request, or NULL on shutdown + */ +static ctx_request_t *ctx_queue_dequeue(py_context_t *ctx) { + pthread_mutex_lock(&ctx->queue_mutex); - if (!enif_get_resource(env, argv[0], SUBINTERP_WORKER_RESOURCE_TYPE, (void **)&worker)) { - return make_error(env, "invalid_worker"); + while (ctx->queue_head == NULL && !atomic_load(&ctx->shutdown_requested)) { + pthread_cond_wait(&ctx->queue_not_empty, &ctx->queue_mutex); } - if (!runtime_is_running()) { - return make_error(env, "python_not_running"); + ctx_request_t *req = ctx->queue_head; + if (req != NULL) { + ctx->queue_head = req->next; + if (ctx->queue_head == NULL) { + ctx->queue_tail = NULL; + } + req->next = NULL; } - /* Lock mutex for thread-safe access */ - pthread_mutex_lock(&worker->mutex); + pthread_mutex_unlock(&ctx->queue_mutex); + return req; +} - if (worker->tstate != NULL) { - /* For subinterpreters with OWN_GIL, directly acquire the subinterpreter's - * GIL. We don't use PyGILState_Ensure because that only works for the - * main interpreter. */ - PyEval_RestoreThread(worker->tstate); +/** + * @brief Cancel all pending requests in a context's queue + * + * Called during context destruction. Sets cancelled flag on all + * pending requests and signals their condition variables. 
+ * + * @param ctx The context + */ +static void ctx_queue_cancel_all(py_context_t *ctx) { + pthread_mutex_lock(&ctx->queue_mutex); - /* Clean up Python objects while holding the subinterpreter's GIL */ - Py_XDECREF(worker->globals); - worker->globals = NULL; - Py_XDECREF(worker->locals); - worker->locals = NULL; + ctx_request_t *req = ctx->queue_head; + while (req != NULL) { + ctx_request_t *next = req->next; + atomic_store(&req->cancelled, true); + + /* Signal waiters that request is done (cancelled) */ + pthread_mutex_lock(&req->mutex); + atomic_store(&req->completed, true); + pthread_cond_signal(&req->cond); + pthread_mutex_unlock(&req->mutex); - /* End the interpreter - this releases its GIL */ - Py_EndInterpreter(worker->tstate); - worker->tstate = NULL; + /* Release queue's reference */ + ctx_request_release(req); + req = next; } - pthread_mutex_unlock(&worker->mutex); + ctx->queue_head = NULL; + ctx->queue_tail = NULL; - return ATOM_OK; + pthread_mutex_unlock(&ctx->queue_mutex); } -static ERL_NIF_TERM nif_subinterp_call(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { - py_subinterp_worker_t *worker; - ErlNifBinary module_bin, func_bin; +/* ============================================================================ + * OWN_GIL execute helpers + * + * Each OWN_GIL worker thread dequeues a ctx_request_t and copies the request + * fields onto the owning context (ctx->shared_env, ctx->request_term, etc.) + * before calling these helpers. Helpers consume those fields and write the + * response back into ctx->response_term / ctx->response_ok. 
+ * ============================================================================ */ - if (!runtime_is_running()) { - return make_error(env, "python_not_running"); - } +/** + * @brief Execute a call request in the OWN_GIL thread + */ +static void owngil_execute_call(py_context_t *ctx) { + /* Decode request from shared_env */ + ERL_NIF_TERM module_term, func_term, args_term, kwargs_term; + const ERL_NIF_TERM *tuple_terms; + int tuple_arity; - if (!enif_get_resource(env, argv[0], SUBINTERP_WORKER_RESOURCE_TYPE, (void **)&worker)) { - return make_error(env, "invalid_worker"); - } - if (!enif_inspect_binary(env, argv[1], &module_bin)) { - return make_error(env, "invalid_module"); - } - if (!enif_inspect_binary(env, argv[2], &func_bin)) { - return make_error(env, "invalid_func"); + if (!enif_get_tuple(ctx->shared_env, ctx->request_term, &tuple_arity, &tuple_terms) || + tuple_arity < 4) { + ctx->response_term = enif_make_tuple2(ctx->shared_env, + enif_make_atom(ctx->shared_env, "error"), + enif_make_atom(ctx->shared_env, "invalid_request")); + ctx->response_ok = false; + return; } - /* Lock mutex for thread-safe access */ - pthread_mutex_lock(&worker->mutex); + module_term = tuple_terms[0]; + func_term = tuple_terms[1]; + args_term = tuple_terms[2]; + kwargs_term = tuple_terms[3]; - /* Enter the sub-interpreter with proper GIL acquisition (safe for OWN_GIL) */ - PyEval_RestoreThread(worker->tstate); + ErlNifBinary module_bin, func_bin; + if (!enif_inspect_binary(ctx->shared_env, module_term, &module_bin) || + !enif_inspect_binary(ctx->shared_env, func_term, &func_bin)) { + ctx->response_term = enif_make_tuple2(ctx->shared_env, + enif_make_atom(ctx->shared_env, "error"), + enif_make_atom(ctx->shared_env, "invalid_module_or_func")); + ctx->response_ok = false; + return; + } char *module_name = binary_to_string(&module_bin); - char *func_name = binary_to_string(&func_bin); - if (module_name == NULL || func_name == NULL) { + char *func_name_str = binary_to_string(&func_bin); 
+ + if (module_name == NULL || func_name_str == NULL) { enif_free(module_name); - enif_free(func_name); - PyEval_SaveThread(); - pthread_mutex_unlock(&worker->mutex); - return make_error(env, "alloc_failed"); + enif_free(func_name_str); + ctx->response_term = enif_make_tuple2(ctx->shared_env, + enif_make_atom(ctx->shared_env, "error"), + enif_make_atom(ctx->shared_env, "alloc_failed")); + ctx->response_ok = false; + return; } - ERL_NIF_TERM result; + PyObject *module = NULL; + PyObject *func = NULL; - /* Import module */ - PyObject *module = PyImport_ImportModule(module_name); - if (module == NULL) { - result = make_py_error(env); - goto cleanup; + /* Special handling for __main__ module - check ctx->globals first */ + if (strcmp(module_name, "__main__") == 0) { + func = PyDict_GetItemString(ctx->globals, func_name_str); /* Borrowed ref */ + if (func != NULL) { + Py_INCREF(func); + } } - /* Get function */ - PyObject *func = PyObject_GetAttrString(module, func_name); - Py_DECREF(module); if (func == NULL) { - result = make_py_error(env); - goto cleanup; + /* Get or import module */ + module = context_get_module(ctx, module_name); + if (module == NULL) { + ctx->response_term = make_py_error(ctx->shared_env); + ctx->response_ok = false; + enif_free(module_name); + enif_free(func_name_str); + return; + } + + /* Get function */ + func = PyObject_GetAttrString(module, func_name_str); + if (func == NULL) { + ctx->response_term = make_py_error(ctx->shared_env); + ctx->response_ok = false; + enif_free(module_name); + enif_free(func_name_str); + return; + } } + enif_free(module_name); + enif_free(func_name_str); + /* Convert args */ unsigned int args_len; - if (!enif_get_list_length(env, argv[3], &args_len)) { + if (!enif_get_list_length(ctx->shared_env, args_term, &args_len)) { Py_DECREF(func); - result = make_error(env, "invalid_args"); - goto cleanup; + ctx->response_term = enif_make_tuple2(ctx->shared_env, + enif_make_atom(ctx->shared_env, "error"), + 
enif_make_atom(ctx->shared_env, "invalid_args")); + ctx->response_ok = false; + return; } PyObject *args = PyTuple_New(args_len); - ERL_NIF_TERM head, tail = argv[3]; + ERL_NIF_TERM head, tail = args_term; for (unsigned int i = 0; i < args_len; i++) { - enif_get_list_cell(env, tail, &head, &tail); - PyObject *arg = term_to_py(env, head); + enif_get_list_cell(ctx->shared_env, tail, &head, &tail); + PyObject *arg = term_to_py(ctx->shared_env, head); if (arg == NULL) { Py_DECREF(args); Py_DECREF(func); - result = make_error(env, "arg_conversion_failed"); - goto cleanup; + ctx->response_term = enif_make_tuple2(ctx->shared_env, + enif_make_atom(ctx->shared_env, "error"), + enif_make_atom(ctx->shared_env, "arg_conversion_failed")); + ctx->response_ok = false; + return; } PyTuple_SET_ITEM(args, i, arg); } /* Convert kwargs */ PyObject *kwargs = NULL; - if (argc > 4 && enif_is_map(env, argv[4])) { - kwargs = term_to_py(env, argv[4]); + if (enif_is_map(ctx->shared_env, kwargs_term)) { + kwargs = term_to_py(ctx->shared_env, kwargs_term); } /* Call the function */ @@ -2113,258 +2059,10 @@ static ERL_NIF_TERM nif_subinterp_call(ErlNifEnv *env, int argc, const ERL_NIF_T Py_XDECREF(kwargs); if (py_result == NULL) { - result = make_py_error(env); + ctx->response_term = make_py_error(ctx->shared_env); + ctx->response_ok = false; } else { - ERL_NIF_TERM term_result = py_to_term(env, py_result); - Py_DECREF(py_result); - result = enif_make_tuple2(env, ATOM_OK, term_result); - } - -cleanup: - enif_free(module_name); - enif_free(func_name); - - /* Exit the sub-interpreter with proper GIL release (safe for OWN_GIL) */ - PyEval_SaveThread(); - - /* Unlock mutex */ - pthread_mutex_unlock(&worker->mutex); - - return result; -} - -static ERL_NIF_TERM nif_parallel_execute(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { - (void)argc; - unsigned int workers_len, calls_len; - - if (!enif_get_list_length(env, argv[0], &workers_len)) { - return make_error(env, "invalid_workers_list"); - } 
- if (!enif_get_list_length(env, argv[1], &calls_len)) { - return make_error(env, "invalid_calls_list"); - } - if (workers_len == 0 || calls_len == 0) { - return enif_make_tuple2(env, ATOM_OK, enif_make_list(env, 0)); - } - if (workers_len < calls_len) { - return make_error(env, "not_enough_workers"); - } - - ERL_NIF_TERM *results = enif_alloc(sizeof(ERL_NIF_TERM) * calls_len); - if (results == NULL) { - return make_error(env, "alloc_failed"); - } - ERL_NIF_TERM worker_head, worker_tail = argv[0]; - ERL_NIF_TERM call_head, call_tail = argv[1]; - - for (unsigned int i = 0; i < calls_len; i++) { - enif_get_list_cell(env, worker_tail, &worker_head, &worker_tail); - enif_get_list_cell(env, call_tail, &call_head, &call_tail); - - int arity; - const ERL_NIF_TERM *tuple; - if (!enif_get_tuple(env, call_head, &arity, &tuple) || arity < 3) { - enif_free(results); - return make_error(env, "invalid_call_tuple"); - } - - /* Build args array for subinterp_call */ - ERL_NIF_TERM call_args[5] = {worker_head, tuple[0], tuple[1], tuple[2], - (arity > 3) ? 
tuple[3] : enif_make_new_map(env)}; - - results[i] = nif_subinterp_call(env, 5, call_args); - } - - ERL_NIF_TERM result_list = enif_make_list_from_array(env, results, calls_len); - enif_free(results); - - return enif_make_tuple2(env, ATOM_OK, result_list); -} - -#else /* !HAVE_SUBINTERPRETERS */ - -/* Stub implementations for older Python versions */ -static ERL_NIF_TERM nif_subinterp_worker_new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { - (void)argc; - (void)argv; - return make_error(env, "subinterpreters_not_supported"); -} - -static ERL_NIF_TERM nif_subinterp_worker_destroy(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { - (void)argc; - (void)argv; - return make_error(env, "subinterpreters_not_supported"); -} - -static ERL_NIF_TERM nif_subinterp_call(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { - (void)argc; - (void)argv; - return make_error(env, "subinterpreters_not_supported"); -} - -static ERL_NIF_TERM nif_parallel_execute(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { - (void)argc; - (void)argv; - return make_error(env, "subinterpreters_not_supported"); -} - -#endif /* HAVE_SUBINTERPRETERS */ - -/* ============================================================================ - * Shared-GIL Pool Model for Subinterpreters - * - * Subinterpreters share the GIL but provide namespace isolation. Execution - * happens on dirty schedulers using PyThreadState_Swap() to switch to the - * subinterpreter's thread state from the pool. 
- * ============================================================================ */ - -/* Forward declaration - defined later in this file */ -static PyObject *context_get_module(py_context_t *ctx, const char *module_name); - -/* Old thread-per-context functions removed - now using shared-GIL pool model */ - -/* ============================================================================ - * OWN_GIL Context Support - * - * OWN_GIL contexts create a dedicated pthread with its own Python subinterpreter - * that has an independent GIL. This enables true parallel Python execution. - * - * Architecture: - * - Each OWN_GIL context gets its own pthread at creation time - * - The pthread creates an OWN_GIL subinterpreter and runs a request loop - * - Dirty schedulers dispatch requests via condition variables - * - Terms are passed via enif_make_copy() (zero serialization overhead) - * ============================================================================ */ - -#ifdef HAVE_SUBINTERPRETERS - -/** - * @brief Execute a call request in the OWN_GIL thread - */ -static void owngil_execute_call(py_context_t *ctx) { - /* Decode request from shared_env */ - ERL_NIF_TERM module_term, func_term, args_term, kwargs_term; - const ERL_NIF_TERM *tuple_terms; - int tuple_arity; - - if (!enif_get_tuple(ctx->shared_env, ctx->request_term, &tuple_arity, &tuple_terms) || - tuple_arity < 4) { - ctx->response_term = enif_make_tuple2(ctx->shared_env, - enif_make_atom(ctx->shared_env, "error"), - enif_make_atom(ctx->shared_env, "invalid_request")); - ctx->response_ok = false; - return; - } - - module_term = tuple_terms[0]; - func_term = tuple_terms[1]; - args_term = tuple_terms[2]; - kwargs_term = tuple_terms[3]; - - ErlNifBinary module_bin, func_bin; - if (!enif_inspect_binary(ctx->shared_env, module_term, &module_bin) || - !enif_inspect_binary(ctx->shared_env, func_term, &func_bin)) { - ctx->response_term = enif_make_tuple2(ctx->shared_env, - enif_make_atom(ctx->shared_env, "error"), - 
enif_make_atom(ctx->shared_env, "invalid_module_or_func")); - ctx->response_ok = false; - return; - } - - char *module_name = binary_to_string(&module_bin); - char *func_name_str = binary_to_string(&func_bin); - - if (module_name == NULL || func_name_str == NULL) { - enif_free(module_name); - enif_free(func_name_str); - ctx->response_term = enif_make_tuple2(ctx->shared_env, - enif_make_atom(ctx->shared_env, "error"), - enif_make_atom(ctx->shared_env, "alloc_failed")); - ctx->response_ok = false; - return; - } - - PyObject *module = NULL; - PyObject *func = NULL; - - /* Special handling for __main__ module - check ctx->globals first */ - if (strcmp(module_name, "__main__") == 0) { - func = PyDict_GetItemString(ctx->globals, func_name_str); /* Borrowed ref */ - if (func != NULL) { - Py_INCREF(func); - } - } - - if (func == NULL) { - /* Get or import module */ - module = context_get_module(ctx, module_name); - if (module == NULL) { - ctx->response_term = make_py_error(ctx->shared_env); - ctx->response_ok = false; - enif_free(module_name); - enif_free(func_name_str); - return; - } - - /* Get function */ - func = PyObject_GetAttrString(module, func_name_str); - if (func == NULL) { - ctx->response_term = make_py_error(ctx->shared_env); - ctx->response_ok = false; - enif_free(module_name); - enif_free(func_name_str); - return; - } - } - - enif_free(module_name); - enif_free(func_name_str); - - /* Convert args */ - unsigned int args_len; - if (!enif_get_list_length(ctx->shared_env, args_term, &args_len)) { - Py_DECREF(func); - ctx->response_term = enif_make_tuple2(ctx->shared_env, - enif_make_atom(ctx->shared_env, "error"), - enif_make_atom(ctx->shared_env, "invalid_args")); - ctx->response_ok = false; - return; - } - - PyObject *args = PyTuple_New(args_len); - ERL_NIF_TERM head, tail = args_term; - for (unsigned int i = 0; i < args_len; i++) { - enif_get_list_cell(ctx->shared_env, tail, &head, &tail); - PyObject *arg = term_to_py(ctx->shared_env, head); - if (arg == NULL) 
{ - Py_DECREF(args); - Py_DECREF(func); - ctx->response_term = enif_make_tuple2(ctx->shared_env, - enif_make_atom(ctx->shared_env, "error"), - enif_make_atom(ctx->shared_env, "arg_conversion_failed")); - ctx->response_ok = false; - return; - } - PyTuple_SET_ITEM(args, i, arg); - } - - /* Convert kwargs */ - PyObject *kwargs = NULL; - if (enif_is_map(ctx->shared_env, kwargs_term)) { - kwargs = term_to_py(ctx->shared_env, kwargs_term); - } - - /* Call the function */ - PyObject *py_result = PyObject_Call(func, args, kwargs); - Py_DECREF(func); - Py_DECREF(args); - Py_XDECREF(kwargs); - - if (py_result == NULL) { - ctx->response_term = make_py_error(ctx->shared_env); - ctx->response_ok = false; - } else { - ERL_NIF_TERM term_result = py_to_term(ctx->shared_env, py_result); + ERL_NIF_TERM term_result = py_to_term(ctx->shared_env, py_result); Py_DECREF(py_result); ctx->response_term = enif_make_tuple2(ctx->shared_env, enif_make_atom(ctx->shared_env, "ok"), term_result); @@ -2703,9 +2401,13 @@ static void owngil_execute_eval_with_env(py_context_t *ctx) { return; } - /* Set thread-local env for callback support */ + /* Set thread-local state for callback/suspension support */ + py_context_t *prev_context = tl_current_context; + tl_current_context = ctx; py_env_resource_t *prev_local_env = tl_current_local_env; tl_current_local_env = penv; + bool prev_allow_suspension = tl_allow_suspension; + tl_allow_suspension = true; /* Build eval_locals from penv->globals + any passed locals */ PyObject *eval_locals = PyDict_Copy(penv->globals); @@ -2723,6 +2425,8 @@ static void owngil_execute_eval_with_env(py_context_t *ctx) { if (compiled == NULL) { Py_DECREF(eval_locals); + tl_allow_suspension = prev_allow_suspension; + tl_current_context = prev_context; tl_current_local_env = prev_local_env; ctx->response_term = make_py_error(ctx->shared_env); ctx->response_ok = false; @@ -2733,34 +2437,188 @@ static void owngil_execute_eval_with_env(py_context_t *ctx) { Py_DECREF(compiled); 
Py_DECREF(eval_locals); - tl_current_local_env = prev_local_env; - if (py_result == NULL) { - ctx->response_term = make_py_error(ctx->shared_env); - ctx->response_ok = false; - } else { - ERL_NIF_TERM term_result = py_to_term(ctx->shared_env, py_result); - Py_DECREF(py_result); - ctx->response_term = enif_make_tuple2(ctx->shared_env, - enif_make_atom(ctx->shared_env, "ok"), term_result); - ctx->response_ok = true; - } -} + /* Check for pending callback (suspension) */ + if (tl_pending_callback) { + PyErr_Clear(); + /* Create suspended state for callback handling */ + suspended_context_state_t *suspended = create_suspended_context_state_for_eval( + ctx->shared_env, ctx, &code_bin, tuple_terms[1]); + if (suspended == NULL) { + tl_pending_callback = false; + Py_CLEAR(tl_pending_args); + ctx->response_term = enif_make_tuple2(ctx->shared_env, + enif_make_atom(ctx->shared_env, "error"), + enif_make_atom(ctx->shared_env, "create_suspended_state_failed")); + ctx->response_ok = false; + } else { + ctx->response_term = build_suspended_context_result(ctx->shared_env, suspended); + ctx->response_ok = true; /* Suspended is a valid response */ + } + } else { + ctx->response_term = make_py_error(ctx->shared_env); + ctx->response_ok = false; + } + } else if (is_inline_schedule_marker(py_result)) { + /* Inline schedule marker: execute continuation directly in worker thread. + * Loop until we get a final result or a suspension. 
*/ + int depth = 0; + while (is_inline_schedule_marker(py_result) && depth < MAX_INLINE_CONTINUATION_DEPTH) { + inline_continuation_t *cont = create_inline_continuation(ctx, penv, py_result, depth); + Py_DECREF(py_result); + py_result = NULL; -/** - * @brief Execute a call request with process-local env in the OWN_GIL thread - * - * Uses penv->globals for function lookup in __main__ module - */ -static void owngil_execute_call_with_env(py_context_t *ctx) { - py_env_resource_t *penv = (py_env_resource_t *)ctx->local_env_ptr; - ctx->local_env_ptr = NULL; /* Clear after use */ + if (cont == NULL) { + ctx->response_term = enif_make_tuple2(ctx->shared_env, + enif_make_atom(ctx->shared_env, "error"), + enif_make_atom(ctx->shared_env, "create_continuation_failed")); + ctx->response_ok = false; + goto cleanup; + } - if (penv == NULL || penv->globals == NULL) { - ctx->response_term = enif_make_tuple2(ctx->shared_env, - enif_make_atom(ctx->shared_env, "error"), - enif_make_atom(ctx->shared_env, "invalid_env")); - ctx->response_ok = false; + /* Execute the continuation function */ + PyObject *func = NULL; + PyObject *module = NULL; + + if (strcmp(cont->module_name, "__main__") == 0) { + /* Try captured globals first */ + if (cont->globals != NULL) { + func = PyDict_GetItemString(cont->globals, cont->func_name); + } + if (func == NULL && cont->locals != NULL) { + func = PyDict_GetItemString(cont->locals, cont->func_name); + } + if (func == NULL && penv != NULL) { + func = PyDict_GetItemString(penv->globals, cont->func_name); + } + if (func == NULL && ctx->globals != NULL) { + func = PyDict_GetItemString(ctx->globals, cont->func_name); + } + if (func != NULL) { + Py_INCREF(func); + } else { + PyErr_Format(PyExc_NameError, "name '%s' is not defined", cont->func_name); + } + } else { + module = PyImport_ImportModule(cont->module_name); + if (module != NULL) { + func = PyObject_GetAttrString(module, cont->func_name); + Py_DECREF(module); + } + } + + if (func == NULL) { + 
enif_release_resource(cont); + ctx->response_term = make_py_error(ctx->shared_env); + ctx->response_ok = false; + goto cleanup; + } + + /* Build args and call */ + PyObject *args = cont->args ? cont->args : PyTuple_New(0); + if (args == NULL) { + Py_DECREF(func); + enif_release_resource(cont); + ctx->response_term = make_py_error(ctx->shared_env); + ctx->response_ok = false; + goto cleanup; + } + if (cont->args) Py_INCREF(args); + + py_result = PyObject_Call(func, args, cont->kwargs); + Py_DECREF(func); + Py_DECREF(args); + enif_release_resource(cont); + depth++; + } + + if (depth >= MAX_INLINE_CONTINUATION_DEPTH) { + Py_XDECREF(py_result); + ctx->response_term = enif_make_tuple2(ctx->shared_env, + enif_make_atom(ctx->shared_env, "error"), + enif_make_atom(ctx->shared_env, "inline_continuation_depth_exceeded")); + ctx->response_ok = false; + goto cleanup; + } + + /* Handle final result (or error/suspension from continuation) */ + if (py_result == NULL) { + if (tl_pending_callback) { + PyErr_Clear(); + suspended_context_state_t *suspended = create_suspended_context_state_for_eval( + ctx->shared_env, ctx, &code_bin, tuple_terms[1]); + if (suspended == NULL) { + tl_pending_callback = false; + Py_CLEAR(tl_pending_args); + ctx->response_term = enif_make_tuple2(ctx->shared_env, + enif_make_atom(ctx->shared_env, "error"), + enif_make_atom(ctx->shared_env, "create_suspended_state_failed")); + ctx->response_ok = false; + } else { + ctx->response_term = build_suspended_context_result(ctx->shared_env, suspended); + ctx->response_ok = true; + } + } else { + ctx->response_term = make_py_error(ctx->shared_env); + ctx->response_ok = false; + } + } else if (is_schedule_marker(py_result)) { + ScheduleMarkerObject *marker = (ScheduleMarkerObject *)py_result; + ERL_NIF_TERM callback_name = py_to_term(ctx->shared_env, marker->callback_name); + ERL_NIF_TERM callback_args = py_to_term(ctx->shared_env, marker->args); + Py_DECREF(py_result); + ctx->response_term = 
enif_make_tuple3(ctx->shared_env, + enif_make_atom(ctx->shared_env, "schedule"), + callback_name, callback_args); + ctx->response_ok = true; + } else { + ERL_NIF_TERM term_result = py_to_term(ctx->shared_env, py_result); + Py_DECREF(py_result); + ctx->response_term = enif_make_tuple2(ctx->shared_env, + enif_make_atom(ctx->shared_env, "ok"), term_result); + ctx->response_ok = true; + } + goto cleanup; + } else if (is_schedule_marker(py_result)) { + /* Schedule marker: return {schedule, callback_name, args} */ + ScheduleMarkerObject *marker = (ScheduleMarkerObject *)py_result; + ERL_NIF_TERM callback_name = py_to_term(ctx->shared_env, marker->callback_name); + ERL_NIF_TERM callback_args = py_to_term(ctx->shared_env, marker->args); + Py_DECREF(py_result); + ctx->response_term = enif_make_tuple3(ctx->shared_env, + enif_make_atom(ctx->shared_env, "schedule"), + callback_name, callback_args); + ctx->response_ok = true; + } else { + ERL_NIF_TERM term_result = py_to_term(ctx->shared_env, py_result); + Py_DECREF(py_result); + ctx->response_term = enif_make_tuple2(ctx->shared_env, + enif_make_atom(ctx->shared_env, "ok"), term_result); + ctx->response_ok = true; + } + +cleanup: + /* Restore thread-local state */ + tl_allow_suspension = prev_allow_suspension; + tl_current_context = prev_context; + tl_current_local_env = prev_local_env; + clear_pending_callback_tls(); +} + +/** + * @brief Execute a call request with process-local env in the OWN_GIL thread + * + * Uses penv->globals for function lookup in __main__ module + */ +static void owngil_execute_call_with_env(py_context_t *ctx) { + py_env_resource_t *penv = (py_env_resource_t *)ctx->local_env_ptr; + ctx->local_env_ptr = NULL; /* Clear after use */ + + if (penv == NULL || penv->globals == NULL) { + ctx->response_term = enif_make_tuple2(ctx->shared_env, + enif_make_atom(ctx->shared_env, "error"), + enif_make_atom(ctx->shared_env, "invalid_env")); + ctx->response_ok = false; return; } @@ -3064,103 +2922,614 @@ static void 
owngil_execute_apply_paths(py_context_t *ctx) { return; } - /* Count paths first */ - ERL_NIF_TERM head, tail = ctx->request_term; - int path_count = 0; - while (enif_get_list_cell(ctx->shared_env, tail, &head, &tail)) { - path_count++; + /* Count paths first */ + ERL_NIF_TERM head, tail = ctx->request_term; + int path_count = 0; + while (enif_get_list_cell(ctx->shared_env, tail, &head, &tail)) { + path_count++; + } + + /* Insert in reverse order so first path ends up first */ + for (int i = 0; i < path_count; i++) { + /* Skip to the i-th element from the end */ + ERL_NIF_TERM current = ctx->request_term; + for (int j = 0; j < path_count - 1 - i; j++) { + enif_get_list_cell(ctx->shared_env, current, &head, ¤t); + } + enif_get_list_cell(ctx->shared_env, current, &head, ¤t); + + ErlNifBinary path_bin; + if (!enif_inspect_binary(ctx->shared_env, head, &path_bin)) { + continue; + } + + /* Convert to Python string */ + PyObject *path_str = PyUnicode_FromStringAndSize((char *)path_bin.data, path_bin.size); + if (path_str == NULL) { + PyErr_Clear(); + continue; + } + + /* Check if already in sys.path */ + int already_present = PySequence_Contains(sys_path, path_str); + if (already_present <= 0) { + /* Insert at position 0 */ + PyList_Insert(sys_path, 0, path_str); + } + Py_DECREF(path_str); + } + + Py_DECREF(sys_path); + ctx->response_term = enif_make_atom(ctx->shared_env, "ok"); + ctx->response_ok = true; +} + +/** + * @brief Execute a request based on its type + */ +static void owngil_execute_request(py_context_t *ctx) { + switch (ctx->request_type) { + case CTX_REQ_CALL: + owngil_execute_call(ctx); + break; + case CTX_REQ_EVAL: + owngil_execute_eval(ctx); + break; + case CTX_REQ_EXEC: + owngil_execute_exec(ctx); + break; + case CTX_REQ_REACTOR_ON_READ_READY: + owngil_execute_reactor_read(ctx); + break; + case CTX_REQ_REACTOR_ON_WRITE_READY: + owngil_execute_reactor_write(ctx); + break; + case CTX_REQ_REACTOR_INIT_CONNECTION: + owngil_execute_reactor_init(ctx); + break; 
+ case CTX_REQ_EXEC_WITH_ENV: + owngil_execute_exec_with_env(ctx); + break; + case CTX_REQ_EVAL_WITH_ENV: + owngil_execute_eval_with_env(ctx); + break; + case CTX_REQ_CALL_WITH_ENV: + owngil_execute_call_with_env(ctx); + break; + case CTX_REQ_CREATE_LOCAL_ENV: + owngil_execute_create_local_env(ctx); + break; + case CTX_REQ_APPLY_IMPORTS: + owngil_execute_apply_imports(ctx); + break; + case CTX_REQ_APPLY_PATHS: + owngil_execute_apply_paths(ctx); + break; + default: + ctx->response_term = enif_make_tuple2(ctx->shared_env, + enif_make_atom(ctx->shared_env, "error"), + enif_make_atom(ctx->shared_env, "unknown_request_type")); + ctx->response_ok = false; + break; + } +} + +/* ============================================================================ + * Worker Thread Implementation (main interpreter, all Python versions) + * + * Worker mode uses a dedicated pthread that acquires the GIL for each request. + * This provides stable thread affinity for numpy/torch/tensorflow without + * requiring subinterpreter support. + * ============================================================================ */ + +/** + * @brief Main loop for worker context thread (main interpreter mode) + * + * This function runs in a dedicated pthread. It processes requests from the + * request queue, acquiring the GIL for each request using PyGILState_Ensure. + * + * Unlike owngil mode, worker mode uses the main interpreter and shares the GIL + * with other Python threads. The benefit is stable thread affinity and + * compatibility with all Python extensions. 
+ */ +static void *worker_context_thread_main(void *arg) { + py_context_t *ctx = (py_context_t *)arg; + + /* Create namespace dictionaries on the worker thread under GIL */ + PyGILState_STATE gstate = PyGILState_Ensure(); + + /* Create namespace dictionaries if not already created */ + if (ctx->globals == NULL) { + ctx->globals = PyDict_New(); + ctx->locals = PyDict_New(); + ctx->module_cache = PyDict_New(); + + if (ctx->globals == NULL || ctx->locals == NULL || ctx->module_cache == NULL) { + PyGILState_Release(gstate); + atomic_store(&ctx->init_error, true); + atomic_store(&ctx->worker_running, false); + return NULL; + } + + /* Import __builtins__ into globals */ + PyObject *builtins = PyEval_GetBuiltins(); + PyDict_SetItemString(ctx->globals, "__builtins__", builtins); + + /* Import erlang module into globals */ + PyObject *erlang_module = PyImport_ImportModule("erlang"); + if (erlang_module != NULL) { + PyDict_SetItemString(ctx->globals, "erlang", erlang_module); + Py_DECREF(erlang_module); + } else { + log_and_clear_python_error("worker erlang module import"); + } + } + + PyGILState_Release(gstate); + + /* Signal that we're ready */ + atomic_store(&ctx->worker_running, true); + + /* Main request loop - uses queue instead of single-slot */ + while (!atomic_load(&ctx->shutdown_requested)) { + /* Dequeue next request (blocks until available or shutdown) */ + ctx_request_t *req = ctx_queue_dequeue(ctx); + + if (req == NULL) { + /* Queue empty and shutdown requested */ + break; + } + + if (req->type == CTX_REQ_SHUTDOWN) { + /* Shutdown sentinel - signal completion and exit */ + pthread_mutex_lock(&req->mutex); + atomic_store(&req->completed, true); + pthread_cond_signal(&req->cond); + pthread_mutex_unlock(&req->mutex); + ctx_request_release(req); + break; + } + + /* Check if request was cancelled while queued */ + if (atomic_load(&req->cancelled)) { + /* Request cancelled - deliver error without processing */ + if (req->async_mode) { + /* Async mode: send 
cancellation message */ + enif_clear_env(ctx->msg_env); + ERL_NIF_TERM cancel_msg = enif_make_tuple3(ctx->msg_env, + enif_make_atom(ctx->msg_env, "py_result"), + enif_make_copy(ctx->msg_env, req->request_id), + enif_make_tuple2(ctx->msg_env, + enif_make_atom(ctx->msg_env, "error"), + enif_make_atom(ctx->msg_env, "cancelled"))); + enif_send(NULL, &req->caller_pid, ctx->msg_env, cancel_msg); + } else { + /* Blocking mode: signal condvar */ + req->result_env = enif_alloc_env(); + if (req->result_env) { + req->result = enif_make_tuple2(req->result_env, + enif_make_atom(req->result_env, "error"), + enif_make_atom(req->result_env, "cancelled")); + } + req->success = false; + + pthread_mutex_lock(&req->mutex); + atomic_store(&req->completed, true); + pthread_cond_signal(&req->cond); + pthread_mutex_unlock(&req->mutex); + } + + ctx_request_release(req); + continue; + } + + /* Populate legacy compatibility fields from request */ + ctx->shared_env = req->request_env; + ctx->request_type = req->type; + ctx->request_term = req->request_data; + ctx->reactor_buffer_ptr = req->reactor_buffer_ptr; + ctx->local_env_ptr = req->local_env_ptr; + ctx->response_ok = false; + ctx->response_term = 0; + + /* Acquire GIL and process the request */ + gstate = PyGILState_Ensure(); + owngil_execute_request(ctx); /* Reuse execute functions */ + PyGILState_Release(gstate); + + /* Copy response to request struct */ + req->result_env = enif_alloc_env(); + if (req->result_env && ctx->response_term != 0) { + req->result = enif_make_copy(req->result_env, ctx->response_term); + } else if (req->result_env) { + req->result = enif_make_tuple2(req->result_env, + enif_make_atom(req->result_env, "error"), + enif_make_atom(req->result_env, "no_response")); + } + req->success = ctx->response_ok; + + /* Clear legacy fields */ + ctx->shared_env = NULL; + ctx->request_type = CTX_REQ_NONE; + ctx->request_term = 0; + ctx->reactor_buffer_ptr = NULL; + ctx->local_env_ptr = NULL; + + /* Deliver result - async or 
blocking */ + if (req->async_mode) { + /* Async mode: send result message to caller */ + enif_clear_env(ctx->msg_env); + ERL_NIF_TERM result_msg = enif_make_tuple3(ctx->msg_env, + enif_make_atom(ctx->msg_env, "py_result"), + enif_make_copy(ctx->msg_env, req->request_id), + req->result_env ? enif_make_copy(ctx->msg_env, req->result) + : enif_make_tuple2(ctx->msg_env, + enif_make_atom(ctx->msg_env, "error"), + enif_make_atom(ctx->msg_env, "no_result"))); + enif_send(NULL, &req->caller_pid, ctx->msg_env, result_msg); + } else { + /* Blocking mode: signal condvar */ + pthread_mutex_lock(&req->mutex); + atomic_store(&req->completed, true); + pthread_cond_signal(&req->cond); + pthread_mutex_unlock(&req->mutex); + } + + /* Release queue's reference to request */ + ctx_request_release(req); + } + + /* Cleanup: release namespace dictionaries under GIL */ + gstate = PyGILState_Ensure(); + Py_XDECREF(ctx->module_cache); + Py_XDECREF(ctx->globals); + Py_XDECREF(ctx->locals); + ctx->globals = NULL; + ctx->locals = NULL; + ctx->module_cache = NULL; + PyGILState_Release(gstate); + + atomic_store(&ctx->worker_running, false); + return NULL; +} + +/** + * @brief Initialize worker thread mode for a context + * + * @param ctx Context to initialize + * @return 0 on success, -1 on failure + */ +static int worker_context_init(py_context_t *ctx) { + ctx->uses_worker_thread = true; + + /* Initialize worker thread state */ + atomic_store(&ctx->worker_running, false); + atomic_store(&ctx->shutdown_requested, false); + atomic_store(&ctx->leaked, false); + + /* Initialize request queue */ + ctx->queue_head = NULL; + ctx->queue_tail = NULL; + + /* Initialize legacy compatibility fields */ + ctx->shared_env = NULL; + ctx->request_type = CTX_REQ_NONE; + ctx->request_term = 0; + ctx->response_term = 0; + ctx->response_ok = false; + ctx->local_env_ptr = NULL; + ctx->reactor_buffer_ptr = NULL; + + /* Initialize queue mutex */ + if (pthread_mutex_init(&ctx->queue_mutex, NULL) != 0) { + return -1; + 
} + + /* Initialize queue condition variable */ + if (pthread_cond_init(&ctx->queue_not_empty, NULL) != 0) { + pthread_mutex_destroy(&ctx->queue_mutex); + return -1; + } + + /* Create message environment for async responses */ + ctx->msg_env = enif_alloc_env(); + if (ctx->msg_env == NULL) { + pthread_cond_destroy(&ctx->queue_not_empty); + pthread_mutex_destroy(&ctx->queue_mutex); + return -1; + } + + /* Globals/locals will be created by the worker thread */ + ctx->globals = NULL; + ctx->locals = NULL; + ctx->module_cache = NULL; + + /* Start the worker thread */ + if (pthread_create(&ctx->worker_thread, NULL, worker_context_thread_main, ctx) != 0) { + enif_free_env(ctx->msg_env); + ctx->msg_env = NULL; + pthread_cond_destroy(&ctx->queue_not_empty); + pthread_mutex_destroy(&ctx->queue_mutex); + return -1; + } + + /* Wait for thread to initialize or fail */ + int wait_count = 0; + while (!atomic_load(&ctx->worker_running) && + !atomic_load(&ctx->init_error) && + wait_count < 2000) { + usleep(1000); /* 1ms */ + wait_count++; + } + + if (atomic_load(&ctx->init_error) || !atomic_load(&ctx->worker_running)) { + /* Thread failed to start */ + pthread_join(ctx->worker_thread, NULL); + if (ctx->msg_env != NULL) { + enif_free_env(ctx->msg_env); + ctx->msg_env = NULL; + } + pthread_cond_destroy(&ctx->queue_not_empty); + pthread_mutex_destroy(&ctx->queue_mutex); + return -1; + } + + return 0; +} + +/** + * @brief Shutdown worker thread mode and clean up resources + * + * Uses the join-or-leak pattern: if the worker thread doesn't respond + * within the timeout, we mark the context as leaked and do NOT free + * shared resources to avoid use-after-free. + * + * @param ctx Context to shutdown + */ +#define WORKER_SHUTDOWN_TIMEOUT_SECS 30 + +static void worker_context_shutdown(py_context_t *ctx) { + if (!ctx->uses_worker_thread) { + return; + } + + /* Signal shutdown and wake any worker parked on the condvar. 
+ * + * We deliberately don't enqueue a CTX_REQ_SHUTDOWN sentinel: + * - the worker loop predicate already exits once + * shutdown_requested is true, so a broadcast is sufficient; + * - if the worker is mid-process_request when we set the flag, + * it returns to the top of the loop, sees !shutdown_requested + * == false, and exits without dequeuing — leaving any + * sentinel as an orphan ctx_request_t in the queue. + * Broadcasting under the mutex avoids the lost-wakeup race. + */ + atomic_store(&ctx->shutdown_requested, true); + ctx_queue_cancel_all(ctx); + pthread_mutex_lock(&ctx->queue_mutex); + pthread_cond_broadcast(&ctx->queue_not_empty); + pthread_mutex_unlock(&ctx->queue_mutex); + + /* Wait for thread to exit with timeout */ + bool join_succeeded = false; + +#if defined(__linux__) + struct timespec deadline; + clock_gettime(CLOCK_REALTIME, &deadline); + deadline.tv_sec += WORKER_SHUTDOWN_TIMEOUT_SECS; + int rc = pthread_timedjoin_np(ctx->worker_thread, NULL, &deadline); + join_succeeded = (rc == 0); +#else + /* macOS/other: poll worker_running flag with timeout */ + int wait_ms = 0; + while (atomic_load(&ctx->worker_running) && + wait_ms < WORKER_SHUTDOWN_TIMEOUT_SECS * 1000) { + usleep(100000); /* 100ms */ + wait_ms += 100; + } + if (!atomic_load(&ctx->worker_running)) { + pthread_join(ctx->worker_thread, NULL); + join_succeeded = true; + } +#endif + + if (!join_succeeded) { + /* Worker thread is unresponsive - leak the context so the + * stuck pthread doesn't UAF when the BEAM frees the + * resource. Pin the resource: enif_keep_resource pushes the + * refcount above zero permanently, so context_destructor + * never runs and the BEAM keeps the memory alive for the + * thread that still holds a raw pointer to it. + * + * The leaked thread also keeps using ctx->callback_pipe[] + * (see nif_context_destroy: pipe close is gated on + * !ctx->leaked for the same reason). Future cleanup happens + * at VM exit. 
*/ + fprintf(stderr, "Worker thread shutdown timeout after %d seconds, leaking context\n", + WORKER_SHUTDOWN_TIMEOUT_SECS); + atomic_store(&ctx->leaked, true); + enif_keep_resource(ctx); + return; + } + + /* Clean shutdown succeeded - safe to free resources */ + if (ctx->msg_env != NULL) { + enif_free_env(ctx->msg_env); + ctx->msg_env = NULL; + } + + pthread_cond_destroy(&ctx->queue_not_empty); + pthread_mutex_destroy(&ctx->queue_mutex); + + ctx->uses_worker_thread = false; +} + +/** + * @brief Dispatch a request to the worker thread and wait for response + * + * Uses the queue-based pattern: creates a request, enqueues it, waits for + * completion, and copies the result back to the caller's environment. + * + * @param env Caller's NIF environment + * @param ctx Context with worker thread + * @param req_type Request type (CTX_REQ_CALL, CTX_REQ_EVAL, CTX_REQ_EXEC, etc.) + * @param request_data Request data term + * @return Result term copied back to caller's env + */ +#define WORKER_DISPATCH_TIMEOUT_SECS 30 + +/** + * @brief Dispatch a request to the worker thread with optional local environment + * + * @param env NIF environment + * @param ctx Context to dispatch to + * @param req_type Request type + * @param request_data Request data term + * @param local_env Optional local environment (NULL for default) + * @return Result term + */ +static ERL_NIF_TERM dispatch_to_worker_thread_impl( + ErlNifEnv *env, + py_context_t *ctx, + ctx_request_type_t req_type, + ERL_NIF_TERM request_data, + void *local_env +) { + if (!atomic_load(&ctx->worker_running)) { + return make_error(env, "thread_not_running"); + } + + if (atomic_load(&ctx->destroyed)) { + return make_error(env, "context_destroyed"); } - /* Insert in reverse order so first path ends up first */ - for (int i = 0; i < path_count; i++) { - /* Skip to the i-th element from the end */ - ERL_NIF_TERM current = ctx->request_term; - for (int j = 0; j < path_count - 1 - i; j++) { - enif_get_list_cell(ctx->shared_env, 
current, &head, &current); - } - enif_get_list_cell(ctx->shared_env, current, &head, &current); + /* Create request struct */ + ctx_request_t *req = ctx_request_create(); + if (req == NULL) { + return make_error(env, "alloc_failed"); + } - ErlNifBinary path_bin; - if (!enif_inspect_binary(ctx->shared_env, head, &path_bin)) { - continue; - } + /* Populate request */ + req->type = req_type; + req->request_data = enif_make_copy(req->request_env, request_data); + req->local_env_ptr = local_env; - /* Convert to Python string */ - PyObject *path_str = PyUnicode_FromStringAndSize((char *)path_bin.data, path_bin.size); - if (path_str == NULL) { - PyErr_Clear(); - continue; - } + /* Add extra reference for queue (caller holds 1, queue holds 1) */ + ctx_request_addref(req); + ctx_queue_enqueue(ctx, req); - /* Check if already in sys.path */ - int already_present = PySequence_Contains(sys_path, path_str); - if (already_present <= 0) { - /* Insert at position 0 */ - PyList_Insert(sys_path, 0, path_str); + /* Wait for completion with timeout */ + struct timespec deadline; + clock_gettime(CLOCK_REALTIME, &deadline); + deadline.tv_sec += WORKER_DISPATCH_TIMEOUT_SECS; + + ERL_NIF_TERM result; + pthread_mutex_lock(&req->mutex); + + while (!atomic_load(&req->completed)) { + int rc = pthread_cond_timedwait(&req->cond, &req->mutex, &deadline); + if (rc == ETIMEDOUT) { + /* Timeout - mark as cancelled and return error */ + atomic_store(&req->cancelled, true); + pthread_mutex_unlock(&req->mutex); + ctx_request_release(req); + return make_error(env, "worker_timeout"); } - Py_DECREF(path_str); } - Py_DECREF(sys_path); - ctx->response_term = enif_make_atom(ctx->shared_env, "ok"); - ctx->response_ok = true; + pthread_mutex_unlock(&req->mutex); + + /* Copy result to caller's environment */ + if (req->result_env != NULL) { + result = enif_make_copy(env, req->result); + } else { + result = make_error(env, "no_result"); + } + + /* Release caller's reference */ + ctx_request_release(req); + + return result;
} /** - * @brief Execute a request based on its type + * @brief Convenience wrapper for dispatch without local environment */ -static void owngil_execute_request(py_context_t *ctx) { - switch (ctx->request_type) { - case CTX_REQ_CALL: - owngil_execute_call(ctx); - break; - case CTX_REQ_EVAL: - owngil_execute_eval(ctx); - break; - case CTX_REQ_EXEC: - owngil_execute_exec(ctx); - break; - case CTX_REQ_REACTOR_ON_READ_READY: - owngil_execute_reactor_read(ctx); - break; - case CTX_REQ_REACTOR_ON_WRITE_READY: - owngil_execute_reactor_write(ctx); - break; - case CTX_REQ_REACTOR_INIT_CONNECTION: - owngil_execute_reactor_init(ctx); - break; - case CTX_REQ_EXEC_WITH_ENV: - owngil_execute_exec_with_env(ctx); - break; - case CTX_REQ_EVAL_WITH_ENV: - owngil_execute_eval_with_env(ctx); - break; - case CTX_REQ_CALL_WITH_ENV: - owngil_execute_call_with_env(ctx); - break; - case CTX_REQ_CREATE_LOCAL_ENV: - owngil_execute_create_local_env(ctx); - break; - case CTX_REQ_APPLY_IMPORTS: - owngil_execute_apply_imports(ctx); - break; - case CTX_REQ_APPLY_PATHS: - owngil_execute_apply_paths(ctx); - break; - default: - ctx->response_term = enif_make_tuple2(ctx->shared_env, - enif_make_atom(ctx->shared_env, "error"), - enif_make_atom(ctx->shared_env, "unknown_request_type")); - ctx->response_ok = false; - break; +static ERL_NIF_TERM dispatch_to_worker_thread( + ErlNifEnv *env, + py_context_t *ctx, + ctx_request_type_t req_type, + ERL_NIF_TERM request_data +) { + return dispatch_to_worker_thread_impl(env, ctx, req_type, request_data, NULL); +} + +/** + * @brief Async dispatch to worker thread (non-blocking) + * + * Enqueues the request and returns immediately. The worker thread will + * send a {py_result, RequestId, Result} message to the caller when done. 
+ * + * @param env NIF environment + * @param ctx Context + * @param req_type Request type + * @param request_data Request data term + * @param caller_pid Caller's PID for result delivery + * @param request_id Request ID for correlation + * @param local_env Optional local environment (NULL for default) + * @return {enqueued, RequestId} on success, {error, Reason} on failure + */ +static ERL_NIF_TERM dispatch_to_worker_thread_async( + ErlNifEnv *env, + py_context_t *ctx, + ctx_request_type_t req_type, + ERL_NIF_TERM request_data, + ErlNifPid caller_pid, + ERL_NIF_TERM request_id, + void *local_env +) { + if (!atomic_load(&ctx->worker_running)) { + return make_error(env, "thread_not_running"); + } + + if (atomic_load(&ctx->destroyed)) { + return make_error(env, "context_destroyed"); } + + /* Create request struct */ + ctx_request_t *req = ctx_request_create(); + if (req == NULL) { + return make_error(env, "alloc_failed"); + } + + /* Populate request */ + req->type = req_type; + req->request_data = enif_make_copy(req->request_env, request_data); + req->local_env_ptr = local_env; + + /* Set async mode */ + req->async_mode = true; + req->caller_pid = caller_pid; + req->request_id = enif_make_copy(req->request_env, request_id); + + /* Add to queue (queue owns one reference, no caller reference needed) */ + ctx_queue_enqueue(ctx, req); + + /* Return immediately - no blocking! */ + return enif_make_tuple2(env, + enif_make_atom(env, "enqueued"), + request_id); } +#ifdef HAVE_SUBINTERPRETERS /** * @brief Main loop for OWN_GIL context thread * * This function runs in a dedicated pthread. It creates an OWN_GIL subinterpreter, - * then enters a request loop where it processes requests from the dirty scheduler. + * then enters a request loop where it processes requests from the request queue. + * + * The queue-based pattern replaces the old single-slot pattern which had race + * conditions when multiple callers dispatched concurrently. 
*/ static void *owngil_context_thread_main(void *arg) { py_context_t *ctx = (py_context_t *)arg; @@ -3186,6 +3555,7 @@ static void *owngil_context_thread_main(void *arg) { status.err_msg ? status.err_msg : "unknown error"); PyGILState_Release(gstate); atomic_store(&ctx->init_error, true); + atomic_store(&ctx->worker_running, false); return NULL; } @@ -3201,6 +3571,7 @@ static void *owngil_context_thread_main(void *arg) { PyErr_Print(); Py_EndInterpreter(ctx->own_gil_tstate); atomic_store(&ctx->init_error, true); + atomic_store(&ctx->worker_running, false); return NULL; } @@ -3210,6 +3581,7 @@ static void *owngil_context_thread_main(void *arg) { PyErr_Print(); Py_EndInterpreter(ctx->own_gil_tstate); atomic_store(&ctx->init_error, true); + atomic_store(&ctx->worker_running, false); return NULL; } @@ -3224,8 +3596,8 @@ static void *owngil_context_thread_main(void *arg) { Py_XDECREF(ctx->locals); Py_XDECREF(ctx->module_cache); Py_EndInterpreter(ctx->own_gil_tstate); - /* Don't call PyGILState_Release - interpreter is gone */ atomic_store(&ctx->init_error, true); + atomic_store(&ctx->worker_running, false); return NULL; } @@ -3247,37 +3619,89 @@ static void *owngil_context_thread_main(void *arg) { PyEval_SaveThread(); /* Signal that we're ready */ - atomic_store(&ctx->thread_running, true); - - /* Main request loop */ - pthread_mutex_lock(&ctx->request_mutex); + atomic_store(&ctx->worker_running, true); + /* Main request loop - uses queue instead of single-slot */ while (!atomic_load(&ctx->shutdown_requested)) { - /* Wait for a request */ - while (ctx->request_type == CTX_REQ_NONE && - !atomic_load(&ctx->shutdown_requested)) { - pthread_cond_wait(&ctx->request_ready, &ctx->request_mutex); + /* Dequeue next request (blocks until available or shutdown) */ + ctx_request_t *req = ctx_queue_dequeue(ctx); + + if (req == NULL) { + /* Queue empty and shutdown requested */ + break; } - if (atomic_load(&ctx->shutdown_requested)) { + if (req->type == CTX_REQ_SHUTDOWN) { + /* 
Shutdown sentinel - signal completion and exit */ + pthread_mutex_lock(&req->mutex); + atomic_store(&req->completed, true); + pthread_cond_signal(&req->cond); + pthread_mutex_unlock(&req->mutex); + ctx_request_release(req); break; } - /* Release mutex while processing (allow concurrent dispatch attempts to queue) */ - pthread_mutex_unlock(&ctx->request_mutex); + /* Check if request was cancelled while queued */ + if (atomic_load(&req->cancelled)) { + /* Request cancelled - signal completion without processing */ + req->result_env = enif_alloc_env(); + if (req->result_env) { + req->result = enif_make_tuple2(req->result_env, + enif_make_atom(req->result_env, "error"), + enif_make_atom(req->result_env, "cancelled")); + } + req->success = false; + + pthread_mutex_lock(&req->mutex); + atomic_store(&req->completed, true); + pthread_cond_signal(&req->cond); + pthread_mutex_unlock(&req->mutex); + + ctx_request_release(req); + continue; + } + + /* Populate legacy compatibility fields from request */ + ctx->shared_env = req->request_env; + ctx->request_type = req->type; + ctx->request_term = req->request_data; + ctx->reactor_buffer_ptr = req->reactor_buffer_ptr; + ctx->local_env_ptr = req->local_env_ptr; + ctx->response_ok = false; + ctx->response_term = 0; - /* Acquire our GIL and process */ + /* Acquire our GIL and process the request */ PyEval_RestoreThread(ctx->own_gil_tstate); owngil_execute_request(ctx); PyEval_SaveThread(); - /* Re-acquire mutex to signal completion and get next request */ - pthread_mutex_lock(&ctx->request_mutex); + /* Copy response to request struct */ + req->result_env = enif_alloc_env(); + if (req->result_env && ctx->response_term != 0) { + req->result = enif_make_copy(req->result_env, ctx->response_term); + } else if (req->result_env) { + req->result = enif_make_tuple2(req->result_env, + enif_make_atom(req->result_env, "error"), + enif_make_atom(req->result_env, "no_response")); + } + req->success = ctx->response_ok; + + /* Clear legacy fields */ 
+ ctx->shared_env = NULL; ctx->request_type = CTX_REQ_NONE; - pthread_cond_signal(&ctx->response_ready); - } + ctx->request_term = 0; + ctx->reactor_buffer_ptr = NULL; + ctx->local_env_ptr = NULL; + + /* Signal completion */ + pthread_mutex_lock(&req->mutex); + atomic_store(&req->completed, true); + pthread_cond_signal(&req->cond); + pthread_mutex_unlock(&req->mutex); - pthread_mutex_unlock(&ctx->request_mutex); + /* Release queue's reference to request */ + ctx_request_release(req); + } /* Cleanup: acquire our OWN_GIL and destroy interpreter */ PyEval_RestoreThread(ctx->own_gil_tstate); @@ -3297,7 +3721,7 @@ static void *owngil_context_thread_main(void *arg) { * After Py_NewInterpreterFromConfig switched us to the OWN_GIL interpreter, * the original gstate is no longer valid. Py_EndInterpreter handles cleanup. */ - atomic_store(&ctx->thread_running, false); + atomic_store(&ctx->worker_running, false); return NULL; } @@ -3308,17 +3732,17 @@ static void *owngil_context_thread_main(void *arg) { #define OWNGIL_DISPATCH_TIMEOUT_SECS 30 /** - * @brief Dispatch a request to the OWN_GIL thread and wait for response + * @brief Dispatch a request to the worker thread and wait for response * - * Called from dirty schedulers. Copies the request term to the shared env, - * signals the worker thread, and waits for the response. + * Uses the queue-based pattern: creates a request, enqueues it, waits for + * completion, and copies the result back to the caller's environment. * - * Uses pthread_cond_timedwait to prevent indefinite blocking if the worker - * thread dies or becomes unresponsive. + * This replaces the old single-slot pattern which had race conditions when + * multiple callers dispatched concurrently. 
* * @param env Caller's NIF environment - * @param ctx Context with OWN_GIL - * @param req_type Request type (CTX_REQ_CALL, CTX_REQ_EVAL, CTX_REQ_EXEC) + * @param ctx Context with worker thread + * @param req_type Request type (CTX_REQ_CALL, CTX_REQ_EVAL, CTX_REQ_EXEC, etc.) * @param request_data Request data term * @return Result term copied back to caller's env */ @@ -3328,41 +3752,66 @@ static ERL_NIF_TERM dispatch_to_owngil_thread( ctx_request_type_t req_type, ERL_NIF_TERM request_data ) { - if (!atomic_load(&ctx->thread_running)) { + if (!atomic_load(&ctx->worker_running)) { return make_error(env, "thread_not_running"); } - pthread_mutex_lock(&ctx->request_mutex); + if (atomic_load(&ctx->destroyed)) { + return make_error(env, "context_destroyed"); + } + + /* Create request struct */ + ctx_request_t *req = ctx_request_create(); + if (req == NULL) { + return make_error(env, "alloc_failed"); + } - /* Copy request to shared env (zero serialization overhead) */ - enif_clear_env(ctx->shared_env); - ctx->request_term = enif_make_copy(ctx->shared_env, request_data); - ctx->request_type = req_type; + /* Populate request */ + req->type = req_type; + req->request_data = enif_make_copy(req->request_env, request_data); - /* Signal the worker thread */ - pthread_cond_signal(&ctx->request_ready); + /* Add ref for queue (now refcount = 2: caller + queue) */ + ctx_request_addref(req); - /* Wait for response with timeout to prevent deadlock on worker death */ + /* Enqueue the request */ + ctx_queue_enqueue(ctx, req); + + /* Wait for completion with timeout */ struct timespec deadline; clock_gettime(CLOCK_REALTIME, &deadline); deadline.tv_sec += OWNGIL_DISPATCH_TIMEOUT_SECS; - while (ctx->request_type != CTX_REQ_NONE) { - int rc = pthread_cond_timedwait(&ctx->response_ready, &ctx->request_mutex, &deadline); + ERL_NIF_TERM result; + pthread_mutex_lock(&req->mutex); + + while (!atomic_load(&req->completed)) { + int rc = pthread_cond_timedwait(&req->cond, &req->mutex, &deadline); 
if (rc == ETIMEDOUT) { - /* Worker thread is unresponsive - mark it as not running */ - atomic_store(&ctx->thread_running, false); - pthread_mutex_unlock(&ctx->request_mutex); - fprintf(stderr, "OWN_GIL dispatch timeout: worker thread unresponsive after %d seconds\n", + /* Worker thread is unresponsive - mark request as cancelled */ + atomic_store(&req->cancelled, true); + pthread_mutex_unlock(&req->mutex); + + /* Don't mark worker as dead - it might still be processing + * a long-running Python operation. Just fail this request. */ + fprintf(stderr, "OWN_GIL dispatch timeout after %d seconds\n", OWNGIL_DISPATCH_TIMEOUT_SECS); + + ctx_request_release(req); /* Release caller's ref */ return make_error(env, "worker_timeout"); } } - /* Copy response back to caller's env */ - ERL_NIF_TERM result = enif_make_copy(env, ctx->response_term); + pthread_mutex_unlock(&req->mutex); + + /* Copy result to caller's env */ + if (req->result_env != NULL) { + result = enif_make_copy(env, req->result); + } else { + result = make_error(env, "no_result"); + } - pthread_mutex_unlock(&ctx->request_mutex); + /* Release caller's ref */ + ctx_request_release(req); return result; } @@ -3370,52 +3819,74 @@ static ERL_NIF_TERM dispatch_to_owngil_thread( /** * @brief Dispatch reactor on_read_ready to OWN_GIL thread * - * Similar to dispatch_to_owngil_thread but also passes buffer pointer. - * Uses timeout to prevent deadlock if worker thread dies. + * Uses queue-based dispatch with per-request synchronization. 
*/ ERL_NIF_TERM dispatch_reactor_read_to_owngil(ErlNifEnv *env, py_context_t *ctx, int fd, void *buffer_ptr) { - if (!atomic_load(&ctx->thread_running)) { + if (!atomic_load(&ctx->worker_running)) { enif_release_resource(buffer_ptr); return make_error(env, "thread_not_running"); } - pthread_mutex_lock(&ctx->request_mutex); + if (atomic_load(&ctx->destroyed)) { + enif_release_resource(buffer_ptr); + return make_error(env, "context_destroyed"); + } + + /* Create request struct */ + ctx_request_t *req = ctx_request_create(); + if (req == NULL) { + enif_release_resource(buffer_ptr); + return make_error(env, "alloc_failed"); + } + + /* Populate request */ + req->type = CTX_REQ_REACTOR_ON_READ_READY; + req->request_data = enif_make_int(req->request_env, fd); + req->reactor_buffer_ptr = buffer_ptr; /* Transfer ownership */ + req->reactor_fd = fd; - /* Clear and set up request */ - enif_clear_env(ctx->shared_env); - ctx->request_term = enif_make_int(ctx->shared_env, fd); - ctx->reactor_buffer_ptr = buffer_ptr; /* Transfer ownership */ - ctx->request_type = CTX_REQ_REACTOR_ON_READ_READY; + /* Add ref for queue (now refcount = 2: caller + queue) */ + ctx_request_addref(req); - /* Signal the worker thread */ - pthread_cond_signal(&ctx->request_ready); + /* Enqueue the request */ + ctx_queue_enqueue(ctx, req); - /* Wait for response with timeout to prevent deadlock */ + /* Wait for completion with timeout */ struct timespec deadline; clock_gettime(CLOCK_REALTIME, &deadline); deadline.tv_sec += OWNGIL_DISPATCH_TIMEOUT_SECS; - while (ctx->request_type != CTX_REQ_NONE) { - int rc = pthread_cond_timedwait(&ctx->response_ready, &ctx->request_mutex, &deadline); + ERL_NIF_TERM result; + pthread_mutex_lock(&req->mutex); + + while (!atomic_load(&req->completed)) { + int rc = pthread_cond_timedwait(&req->cond, &req->mutex, &deadline); if (rc == ETIMEDOUT) { - /* Worker thread is unresponsive - clean up buffer and mark dead */ - atomic_store(&ctx->thread_running, false); - /* Buffer 
ownership was transferred but never processed - release it */ - if (ctx->reactor_buffer_ptr) { - enif_release_resource(ctx->reactor_buffer_ptr); - ctx->reactor_buffer_ptr = NULL; - } - pthread_mutex_unlock(&ctx->request_mutex); - fprintf(stderr, "OWN_GIL reactor dispatch timeout: worker thread unresponsive\n"); + /* Request timeout - mark as cancelled but don't release buffer + * (worker will handle it when it gets to this request) */ + atomic_store(&req->cancelled, true); + pthread_mutex_unlock(&req->mutex); + + fprintf(stderr, "OWN_GIL reactor dispatch timeout after %d seconds\n", + OWNGIL_DISPATCH_TIMEOUT_SECS); + + ctx_request_release(req); /* Release caller's ref */ return make_error(env, "worker_timeout"); } } - /* Copy response back to caller's env */ - ERL_NIF_TERM result = enif_make_copy(env, ctx->response_term); + pthread_mutex_unlock(&req->mutex); - pthread_mutex_unlock(&ctx->request_mutex); + /* Copy result to caller's env */ + if (req->result_env != NULL) { + result = enif_make_copy(env, req->result); + } else { + result = make_error(env, "no_result"); + } + + /* Release caller's ref */ + ctx_request_release(req); return result; } @@ -3423,43 +3894,67 @@ ERL_NIF_TERM dispatch_reactor_read_to_owngil(ErlNifEnv *env, py_context_t *ctx, /** * @brief Dispatch reactor on_write_ready to OWN_GIL thread * - * Uses timeout to prevent deadlock if worker thread dies. + * Uses queue-based dispatch with per-request synchronization. 
*/ ERL_NIF_TERM dispatch_reactor_write_to_owngil(ErlNifEnv *env, py_context_t *ctx, int fd) { - if (!atomic_load(&ctx->thread_running)) { + if (!atomic_load(&ctx->worker_running)) { return make_error(env, "thread_not_running"); } - pthread_mutex_lock(&ctx->request_mutex); + if (atomic_load(&ctx->destroyed)) { + return make_error(env, "context_destroyed"); + } + + /* Create request struct */ + ctx_request_t *req = ctx_request_create(); + if (req == NULL) { + return make_error(env, "alloc_failed"); + } + + /* Populate request */ + req->type = CTX_REQ_REACTOR_ON_WRITE_READY; + req->request_data = enif_make_int(req->request_env, fd); + req->reactor_fd = fd; - /* Clear and set up request */ - enif_clear_env(ctx->shared_env); - ctx->request_term = enif_make_int(ctx->shared_env, fd); - ctx->request_type = CTX_REQ_REACTOR_ON_WRITE_READY; + /* Add ref for queue (now refcount = 2: caller + queue) */ + ctx_request_addref(req); - /* Signal the worker thread */ - pthread_cond_signal(&ctx->request_ready); + /* Enqueue the request */ + ctx_queue_enqueue(ctx, req); - /* Wait for response with timeout to prevent deadlock */ + /* Wait for completion with timeout */ struct timespec deadline; clock_gettime(CLOCK_REALTIME, &deadline); deadline.tv_sec += OWNGIL_DISPATCH_TIMEOUT_SECS; - while (ctx->request_type != CTX_REQ_NONE) { - int rc = pthread_cond_timedwait(&ctx->response_ready, &ctx->request_mutex, &deadline); + ERL_NIF_TERM result; + pthread_mutex_lock(&req->mutex); + + while (!atomic_load(&req->completed)) { + int rc = pthread_cond_timedwait(&req->cond, &req->mutex, &deadline); if (rc == ETIMEDOUT) { - atomic_store(&ctx->thread_running, false); - pthread_mutex_unlock(&ctx->request_mutex); - fprintf(stderr, "OWN_GIL reactor write dispatch timeout: worker thread unresponsive\n"); + atomic_store(&req->cancelled, true); + pthread_mutex_unlock(&req->mutex); + + fprintf(stderr, "OWN_GIL reactor write dispatch timeout after %d seconds\n", + OWNGIL_DISPATCH_TIMEOUT_SECS); + + 
ctx_request_release(req); return make_error(env, "worker_timeout"); } } - /* Copy response back to caller's env */ - ERL_NIF_TERM result = enif_make_copy(env, ctx->response_term); + pthread_mutex_unlock(&req->mutex); + + /* Copy result to caller's env */ + if (req->result_env != NULL) { + result = enif_make_copy(env, req->result); + } else { + result = make_error(env, "no_result"); + } - pthread_mutex_unlock(&ctx->request_mutex); + ctx_request_release(req); return result; } @@ -3467,45 +3962,69 @@ ERL_NIF_TERM dispatch_reactor_write_to_owngil(ErlNifEnv *env, py_context_t *ctx, /** * @brief Dispatch reactor init_connection to OWN_GIL thread * - * Uses timeout to prevent deadlock if worker thread dies. + * Uses queue-based dispatch with per-request synchronization. */ ERL_NIF_TERM dispatch_reactor_init_to_owngil(ErlNifEnv *env, py_context_t *ctx, int fd, ERL_NIF_TERM client_info) { - if (!atomic_load(&ctx->thread_running)) { + if (!atomic_load(&ctx->worker_running)) { return make_error(env, "thread_not_running"); } - pthread_mutex_lock(&ctx->request_mutex); + if (atomic_load(&ctx->destroyed)) { + return make_error(env, "context_destroyed"); + } + + /* Create request struct */ + ctx_request_t *req = ctx_request_create(); + if (req == NULL) { + return make_error(env, "alloc_failed"); + } - /* Clear and set up request */ - enif_clear_env(ctx->shared_env); - ERL_NIF_TERM fd_term = enif_make_int(ctx->shared_env, fd); - ERL_NIF_TERM info_copy = enif_make_copy(ctx->shared_env, client_info); - ctx->request_term = enif_make_tuple2(ctx->shared_env, fd_term, info_copy); - ctx->request_type = CTX_REQ_REACTOR_INIT_CONNECTION; + /* Populate request */ + req->type = CTX_REQ_REACTOR_INIT_CONNECTION; + ERL_NIF_TERM fd_term = enif_make_int(req->request_env, fd); + ERL_NIF_TERM info_copy = enif_make_copy(req->request_env, client_info); + req->request_data = enif_make_tuple2(req->request_env, fd_term, info_copy); + req->reactor_fd = fd; - /* Signal the worker thread */ - 
pthread_cond_signal(&ctx->request_ready); + /* Add ref for queue (now refcount = 2: caller + queue) */ + ctx_request_addref(req); - /* Wait for response with timeout to prevent deadlock */ + /* Enqueue the request */ + ctx_queue_enqueue(ctx, req); + + /* Wait for completion with timeout */ struct timespec deadline; clock_gettime(CLOCK_REALTIME, &deadline); deadline.tv_sec += OWNGIL_DISPATCH_TIMEOUT_SECS; - while (ctx->request_type != CTX_REQ_NONE) { - int rc = pthread_cond_timedwait(&ctx->response_ready, &ctx->request_mutex, &deadline); + ERL_NIF_TERM result; + pthread_mutex_lock(&req->mutex); + + while (!atomic_load(&req->completed)) { + int rc = pthread_cond_timedwait(&req->cond, &req->mutex, &deadline); if (rc == ETIMEDOUT) { - atomic_store(&ctx->thread_running, false); - pthread_mutex_unlock(&ctx->request_mutex); - fprintf(stderr, "OWN_GIL reactor init dispatch timeout: worker thread unresponsive\n"); + atomic_store(&req->cancelled, true); + pthread_mutex_unlock(&req->mutex); + + fprintf(stderr, "OWN_GIL reactor init dispatch timeout after %d seconds\n", + OWNGIL_DISPATCH_TIMEOUT_SECS); + + ctx_request_release(req); return make_error(env, "worker_timeout"); } } - /* Copy response back to caller's env */ - ERL_NIF_TERM result = enif_make_copy(env, ctx->response_term); + pthread_mutex_unlock(&req->mutex); - pthread_mutex_unlock(&ctx->request_mutex); + /* Copy result to caller's env */ + if (req->result_env != NULL) { + result = enif_make_copy(env, req->result); + } else { + result = make_error(env, "no_result"); + } + + ctx_request_release(req); return result; } @@ -3513,47 +4032,69 @@ ERL_NIF_TERM dispatch_reactor_init_to_owngil(ErlNifEnv *env, py_context_t *ctx, /** * @brief Dispatch exec_with_env to OWN_GIL thread * - * Passes the process-local env resource to the worker thread via local_env_ptr. - * Uses timeout to prevent deadlock if worker thread dies. + * Uses queue-based dispatch with per-request synchronization. 
*/ static ERL_NIF_TERM dispatch_exec_with_env_to_owngil( ErlNifEnv *env, py_context_t *ctx, ERL_NIF_TERM code, py_env_resource_t *penv ) { - if (!atomic_load(&ctx->thread_running)) { + if (!atomic_load(&ctx->worker_running)) { return make_error(env, "thread_not_running"); } - pthread_mutex_lock(&ctx->request_mutex); + if (atomic_load(&ctx->destroyed)) { + return make_error(env, "context_destroyed"); + } + + /* Create request struct */ + ctx_request_t *req = ctx_request_create(); + if (req == NULL) { + return make_error(env, "alloc_failed"); + } + + /* Populate request */ + req->type = CTX_REQ_EXEC_WITH_ENV; + req->request_data = enif_make_copy(req->request_env, code); + req->local_env_ptr = penv; - /* Copy request to shared env */ - enif_clear_env(ctx->shared_env); - ctx->request_term = enif_make_copy(ctx->shared_env, code); - ctx->local_env_ptr = penv; /* Pass env resource pointer */ - ctx->request_type = CTX_REQ_EXEC_WITH_ENV; + /* Add ref for queue */ + ctx_request_addref(req); - /* Signal the worker thread */ - pthread_cond_signal(&ctx->request_ready); + /* Enqueue the request */ + ctx_queue_enqueue(ctx, req); - /* Wait for response with timeout to prevent deadlock */ + /* Wait for completion with timeout */ struct timespec deadline; clock_gettime(CLOCK_REALTIME, &deadline); deadline.tv_sec += OWNGIL_DISPATCH_TIMEOUT_SECS; - while (ctx->request_type != CTX_REQ_NONE) { - int rc = pthread_cond_timedwait(&ctx->response_ready, &ctx->request_mutex, &deadline); + ERL_NIF_TERM result; + pthread_mutex_lock(&req->mutex); + + while (!atomic_load(&req->completed)) { + int rc = pthread_cond_timedwait(&req->cond, &req->mutex, &deadline); if (rc == ETIMEDOUT) { - atomic_store(&ctx->thread_running, false); - pthread_mutex_unlock(&ctx->request_mutex); - fprintf(stderr, "OWN_GIL exec_with_env dispatch timeout: worker thread unresponsive\n"); + atomic_store(&req->cancelled, true); + pthread_mutex_unlock(&req->mutex); + + fprintf(stderr, "OWN_GIL exec_with_env dispatch timeout 
after %d seconds\n", + OWNGIL_DISPATCH_TIMEOUT_SECS); + + ctx_request_release(req); return make_error(env, "worker_timeout"); } } - /* Copy response back to caller's env */ - ERL_NIF_TERM result = enif_make_copy(env, ctx->response_term); + pthread_mutex_unlock(&req->mutex); + + /* Copy result to caller's env */ + if (req->result_env != NULL) { + result = enif_make_copy(env, req->result); + } else { + result = make_error(env, "no_result"); + } - pthread_mutex_unlock(&ctx->request_mutex); + ctx_request_release(req); return result; } @@ -3561,50 +4102,72 @@ static ERL_NIF_TERM dispatch_exec_with_env_to_owngil( /** * @brief Dispatch eval_with_env to OWN_GIL thread * - * Passes the process-local env resource to the worker thread via local_env_ptr. - * Uses timeout to prevent deadlock if worker thread dies. + * Uses queue-based dispatch with per-request synchronization. */ static ERL_NIF_TERM dispatch_eval_with_env_to_owngil( ErlNifEnv *env, py_context_t *ctx, ERL_NIF_TERM code, ERL_NIF_TERM locals, py_env_resource_t *penv ) { - if (!atomic_load(&ctx->thread_running)) { + if (!atomic_load(&ctx->worker_running)) { return make_error(env, "thread_not_running"); } - pthread_mutex_lock(&ctx->request_mutex); + if (atomic_load(&ctx->destroyed)) { + return make_error(env, "context_destroyed"); + } + + /* Create request struct */ + ctx_request_t *req = ctx_request_create(); + if (req == NULL) { + return make_error(env, "alloc_failed"); + } - /* Copy request to shared env: {Code, Locals} */ - enif_clear_env(ctx->shared_env); - ERL_NIF_TERM code_copy = enif_make_copy(ctx->shared_env, code); - ERL_NIF_TERM locals_copy = enif_make_copy(ctx->shared_env, locals); - ctx->request_term = enif_make_tuple2(ctx->shared_env, code_copy, locals_copy); - ctx->local_env_ptr = penv; /* Pass env resource pointer */ - ctx->request_type = CTX_REQ_EVAL_WITH_ENV; + /* Populate request: {Code, Locals} */ + req->type = CTX_REQ_EVAL_WITH_ENV; + ERL_NIF_TERM code_copy = enif_make_copy(req->request_env, 
code); + ERL_NIF_TERM locals_copy = enif_make_copy(req->request_env, locals); + req->request_data = enif_make_tuple2(req->request_env, code_copy, locals_copy); + req->local_env_ptr = penv; - /* Signal the worker thread */ - pthread_cond_signal(&ctx->request_ready); + /* Add ref for queue */ + ctx_request_addref(req); - /* Wait for response with timeout to prevent deadlock */ + /* Enqueue the request */ + ctx_queue_enqueue(ctx, req); + + /* Wait for completion with timeout */ struct timespec deadline; clock_gettime(CLOCK_REALTIME, &deadline); deadline.tv_sec += OWNGIL_DISPATCH_TIMEOUT_SECS; - while (ctx->request_type != CTX_REQ_NONE) { - int rc = pthread_cond_timedwait(&ctx->response_ready, &ctx->request_mutex, &deadline); + ERL_NIF_TERM result; + pthread_mutex_lock(&req->mutex); + + while (!atomic_load(&req->completed)) { + int rc = pthread_cond_timedwait(&req->cond, &req->mutex, &deadline); if (rc == ETIMEDOUT) { - atomic_store(&ctx->thread_running, false); - pthread_mutex_unlock(&ctx->request_mutex); - fprintf(stderr, "OWN_GIL eval_with_env dispatch timeout: worker thread unresponsive\n"); + atomic_store(&req->cancelled, true); + pthread_mutex_unlock(&req->mutex); + + fprintf(stderr, "OWN_GIL eval_with_env dispatch timeout after %d seconds\n", + OWNGIL_DISPATCH_TIMEOUT_SECS); + + ctx_request_release(req); return make_error(env, "worker_timeout"); } } - /* Copy response back to caller's env */ - ERL_NIF_TERM result = enif_make_copy(env, ctx->response_term); + pthread_mutex_unlock(&req->mutex); - pthread_mutex_unlock(&ctx->request_mutex); + /* Copy result to caller's env */ + if (req->result_env != NULL) { + result = enif_make_copy(env, req->result); + } else { + result = make_error(env, "no_result"); + } + + ctx_request_release(req); return result; } @@ -3612,8 +4175,7 @@ static ERL_NIF_TERM dispatch_eval_with_env_to_owngil( /** * @brief Dispatch call_with_env to OWN_GIL thread * - * Passes the process-local env resource to the worker thread via local_env_ptr. 
- * Uses timeout to prevent deadlock if worker thread dies. + * Uses queue-based dispatch with per-request synchronization. */ static ERL_NIF_TERM dispatch_call_with_env_to_owngil( ErlNifEnv *env, py_context_t *ctx, @@ -3621,45 +4183,68 @@ static ERL_NIF_TERM dispatch_call_with_env_to_owngil( ERL_NIF_TERM args, ERL_NIF_TERM kwargs, py_env_resource_t *penv ) { - if (!atomic_load(&ctx->thread_running)) { + if (!atomic_load(&ctx->worker_running)) { return make_error(env, "thread_not_running"); } - pthread_mutex_lock(&ctx->request_mutex); + if (atomic_load(&ctx->destroyed)) { + return make_error(env, "context_destroyed"); + } + + /* Create request struct */ + ctx_request_t *req = ctx_request_create(); + if (req == NULL) { + return make_error(env, "alloc_failed"); + } - /* Copy request to shared env: {Module, Func, Args, Kwargs} */ - enif_clear_env(ctx->shared_env); - ERL_NIF_TERM module_copy = enif_make_copy(ctx->shared_env, module); - ERL_NIF_TERM func_copy = enif_make_copy(ctx->shared_env, func); - ERL_NIF_TERM args_copy = enif_make_copy(ctx->shared_env, args); - ERL_NIF_TERM kwargs_copy = enif_make_copy(ctx->shared_env, kwargs); - ctx->request_term = enif_make_tuple4(ctx->shared_env, + /* Populate request: {Module, Func, Args, Kwargs} */ + req->type = CTX_REQ_CALL_WITH_ENV; + ERL_NIF_TERM module_copy = enif_make_copy(req->request_env, module); + ERL_NIF_TERM func_copy = enif_make_copy(req->request_env, func); + ERL_NIF_TERM args_copy = enif_make_copy(req->request_env, args); + ERL_NIF_TERM kwargs_copy = enif_make_copy(req->request_env, kwargs); + req->request_data = enif_make_tuple4(req->request_env, module_copy, func_copy, args_copy, kwargs_copy); - ctx->local_env_ptr = penv; /* Pass env resource pointer */ - ctx->request_type = CTX_REQ_CALL_WITH_ENV; + req->local_env_ptr = penv; - /* Signal the worker thread */ - pthread_cond_signal(&ctx->request_ready); + /* Add ref for queue */ + ctx_request_addref(req); - /* Wait for response with timeout to prevent deadlock */ 
+ /* Enqueue the request */ + ctx_queue_enqueue(ctx, req); + + /* Wait for completion with timeout */ struct timespec deadline; clock_gettime(CLOCK_REALTIME, &deadline); deadline.tv_sec += OWNGIL_DISPATCH_TIMEOUT_SECS; - while (ctx->request_type != CTX_REQ_NONE) { - int rc = pthread_cond_timedwait(&ctx->response_ready, &ctx->request_mutex, &deadline); + ERL_NIF_TERM result; + pthread_mutex_lock(&req->mutex); + + while (!atomic_load(&req->completed)) { + int rc = pthread_cond_timedwait(&req->cond, &req->mutex, &deadline); if (rc == ETIMEDOUT) { - atomic_store(&ctx->thread_running, false); - pthread_mutex_unlock(&ctx->request_mutex); - fprintf(stderr, "OWN_GIL call_with_env dispatch timeout: worker thread unresponsive\n"); + atomic_store(&req->cancelled, true); + pthread_mutex_unlock(&req->mutex); + + fprintf(stderr, "OWN_GIL call_with_env dispatch timeout after %d seconds\n", + OWNGIL_DISPATCH_TIMEOUT_SECS); + + ctx_request_release(req); return make_error(env, "worker_timeout"); } } - /* Copy response back to caller's env */ - ERL_NIF_TERM result = enif_make_copy(env, ctx->response_term); + pthread_mutex_unlock(&req->mutex); + + /* Copy result to caller's env */ + if (req->result_env != NULL) { + result = enif_make_copy(env, req->result); + } else { + result = make_error(env, "no_result"); + } - pthread_mutex_unlock(&ctx->request_mutex); + ctx_request_release(req); return result; } @@ -3667,47 +4252,68 @@ static ERL_NIF_TERM dispatch_call_with_env_to_owngil( /** * @brief Dispatch create_local_env to OWN_GIL thread * - * Creates the globals/locals dicts in the correct interpreter context. - * Returns ok or error. - * Uses timeout to prevent deadlock if worker thread dies. + * Uses queue-based dispatch with per-request synchronization. 
*/ static ERL_NIF_TERM dispatch_create_local_env_to_owngil( ErlNifEnv *env, py_context_t *ctx, py_env_resource_t *res ) { - if (!atomic_load(&ctx->thread_running)) { + if (!atomic_load(&ctx->worker_running)) { return make_error(env, "thread_not_running"); } - pthread_mutex_lock(&ctx->request_mutex); + if (atomic_load(&ctx->destroyed)) { + return make_error(env, "context_destroyed"); + } + + /* Create request struct */ + ctx_request_t *req = ctx_request_create(); + if (req == NULL) { + return make_error(env, "alloc_failed"); + } - /* Pass env resource pointer to worker thread */ - enif_clear_env(ctx->shared_env); - ctx->local_env_ptr = res; - ctx->request_type = CTX_REQ_CREATE_LOCAL_ENV; + /* Populate request */ + req->type = CTX_REQ_CREATE_LOCAL_ENV; + req->local_env_ptr = res; - /* Signal the worker thread */ - pthread_cond_signal(&ctx->request_ready); + /* Add ref for queue */ + ctx_request_addref(req); - /* Wait for response with timeout to prevent deadlock */ + /* Enqueue the request */ + ctx_queue_enqueue(ctx, req); + + /* Wait for completion with timeout */ struct timespec deadline; clock_gettime(CLOCK_REALTIME, &deadline); deadline.tv_sec += OWNGIL_DISPATCH_TIMEOUT_SECS; - while (ctx->request_type != CTX_REQ_NONE) { - int rc = pthread_cond_timedwait(&ctx->response_ready, &ctx->request_mutex, &deadline); + ERL_NIF_TERM result; + pthread_mutex_lock(&req->mutex); + + while (!atomic_load(&req->completed)) { + int rc = pthread_cond_timedwait(&req->cond, &req->mutex, &deadline); if (rc == ETIMEDOUT) { - atomic_store(&ctx->thread_running, false); - pthread_mutex_unlock(&ctx->request_mutex); - fprintf(stderr, "OWN_GIL create_local_env dispatch timeout: worker thread unresponsive\n"); + atomic_store(&req->cancelled, true); + pthread_mutex_unlock(&req->mutex); + + fprintf(stderr, "OWN_GIL create_local_env dispatch timeout after %d seconds\n", + OWNGIL_DISPATCH_TIMEOUT_SECS); + + ctx_request_release(req); return make_error(env, "worker_timeout"); } } - /* Copy response 
back to caller's env */ - ERL_NIF_TERM result = enif_make_copy(env, ctx->response_term); + pthread_mutex_unlock(&req->mutex); - pthread_mutex_unlock(&ctx->request_mutex); + /* Copy result to caller's env */ + if (req->result_env != NULL) { + result = enif_make_copy(env, req->result); + } else { + result = make_error(env, "no_result"); + } + + ctx_request_release(req); return result; } @@ -3715,43 +4321,67 @@ static ERL_NIF_TERM dispatch_create_local_env_to_owngil( /** * @brief Dispatch apply_imports to OWN_GIL worker thread * - * @param env NIF environment - * @param ctx Context resource - * @param imports_term List of {ModuleBin, FuncBin | all} tuples - * @return ok | {error, Reason} + * Uses queue-based dispatch with per-request synchronization. */ static ERL_NIF_TERM dispatch_apply_imports_to_owngil( ErlNifEnv *env, py_context_t *ctx, ERL_NIF_TERM imports_term ) { - if (!atomic_load(&ctx->thread_running)) { + if (!atomic_load(&ctx->worker_running)) { return make_error(env, "thread_not_running"); } - pthread_mutex_lock(&ctx->request_mutex); + if (atomic_load(&ctx->destroyed)) { + return make_error(env, "context_destroyed"); + } + + /* Create request struct */ + ctx_request_t *req = ctx_request_create(); + if (req == NULL) { + return make_error(env, "alloc_failed"); + } + + /* Populate request */ + req->type = CTX_REQ_APPLY_IMPORTS; + req->request_data = enif_make_copy(req->request_env, imports_term); - enif_clear_env(ctx->shared_env); - ctx->request_term = enif_make_copy(ctx->shared_env, imports_term); - ctx->request_type = CTX_REQ_APPLY_IMPORTS; + /* Add ref for queue */ + ctx_request_addref(req); - pthread_cond_signal(&ctx->request_ready); + /* Enqueue the request */ + ctx_queue_enqueue(ctx, req); - /* Wait for response with timeout */ + /* Wait for completion with timeout */ struct timespec deadline; clock_gettime(CLOCK_REALTIME, &deadline); deadline.tv_sec += OWNGIL_DISPATCH_TIMEOUT_SECS; - while (ctx->request_type != CTX_REQ_NONE) { - int rc = 
pthread_cond_timedwait(&ctx->response_ready, &ctx->request_mutex, &deadline); + ERL_NIF_TERM result; + pthread_mutex_lock(&req->mutex); + + while (!atomic_load(&req->completed)) { + int rc = pthread_cond_timedwait(&req->cond, &req->mutex, &deadline); if (rc == ETIMEDOUT) { - atomic_store(&ctx->thread_running, false); - pthread_mutex_unlock(&ctx->request_mutex); - fprintf(stderr, "OWN_GIL apply_imports dispatch timeout: worker thread unresponsive\n"); + atomic_store(&req->cancelled, true); + pthread_mutex_unlock(&req->mutex); + + fprintf(stderr, "OWN_GIL apply_imports dispatch timeout after %d seconds\n", + OWNGIL_DISPATCH_TIMEOUT_SECS); + + ctx_request_release(req); return make_error(env, "worker_timeout"); } } - ERL_NIF_TERM result = enif_make_copy(env, ctx->response_term); - pthread_mutex_unlock(&ctx->request_mutex); + pthread_mutex_unlock(&req->mutex); + + /* Copy result to caller's env */ + if (req->result_env != NULL) { + result = enif_make_copy(env, req->result); + } else { + result = make_error(env, "no_result"); + } + + ctx_request_release(req); return result; } @@ -3759,43 +4389,67 @@ static ERL_NIF_TERM dispatch_apply_imports_to_owngil( /** * @brief Dispatch apply_paths request to OWN_GIL worker thread * - * @param env Current NIF environment - * @param ctx OWN_GIL context - * @param paths_term List of path binaries - * @return ok | {error, Reason} + * Uses queue-based dispatch with per-request synchronization. 
*/ static ERL_NIF_TERM dispatch_apply_paths_to_owngil( ErlNifEnv *env, py_context_t *ctx, ERL_NIF_TERM paths_term ) { - if (!atomic_load(&ctx->thread_running)) { + if (!atomic_load(&ctx->worker_running)) { return make_error(env, "thread_not_running"); } - pthread_mutex_lock(&ctx->request_mutex); + if (atomic_load(&ctx->destroyed)) { + return make_error(env, "context_destroyed"); + } + + /* Create request struct */ + ctx_request_t *req = ctx_request_create(); + if (req == NULL) { + return make_error(env, "alloc_failed"); + } + + /* Populate request */ + req->type = CTX_REQ_APPLY_PATHS; + req->request_data = enif_make_copy(req->request_env, paths_term); - enif_clear_env(ctx->shared_env); - ctx->request_term = enif_make_copy(ctx->shared_env, paths_term); - ctx->request_type = CTX_REQ_APPLY_PATHS; + /* Add ref for queue */ + ctx_request_addref(req); - pthread_cond_signal(&ctx->request_ready); + /* Enqueue the request */ + ctx_queue_enqueue(ctx, req); - /* Wait for response with timeout */ + /* Wait for completion with timeout */ struct timespec deadline; clock_gettime(CLOCK_REALTIME, &deadline); deadline.tv_sec += OWNGIL_DISPATCH_TIMEOUT_SECS; - while (ctx->request_type != CTX_REQ_NONE) { - int rc = pthread_cond_timedwait(&ctx->response_ready, &ctx->request_mutex, &deadline); + ERL_NIF_TERM result; + pthread_mutex_lock(&req->mutex); + + while (!atomic_load(&req->completed)) { + int rc = pthread_cond_timedwait(&req->cond, &req->mutex, &deadline); if (rc == ETIMEDOUT) { - atomic_store(&ctx->thread_running, false); - pthread_mutex_unlock(&ctx->request_mutex); - fprintf(stderr, "OWN_GIL apply_paths dispatch timeout: worker thread unresponsive\n"); + atomic_store(&req->cancelled, true); + pthread_mutex_unlock(&req->mutex); + + fprintf(stderr, "OWN_GIL apply_paths dispatch timeout after %d seconds\n", + OWNGIL_DISPATCH_TIMEOUT_SECS); + + ctx_request_release(req); return make_error(env, "worker_timeout"); } } - ERL_NIF_TERM result = enif_make_copy(env, ctx->response_term); - 
pthread_mutex_unlock(&ctx->request_mutex); + pthread_mutex_unlock(&req->mutex); + + /* Copy result to caller's env */ + if (req->result_env != NULL) { + result = enif_make_copy(env, req->result); + } else { + result = make_error(env, "no_result"); + } + + ctx_request_release(req); return result; } @@ -3813,66 +4467,72 @@ static int owngil_context_init(py_context_t *ctx) { ctx->uses_own_gil = true; ctx->own_gil_tstate = NULL; ctx->own_gil_interp = NULL; - ctx->local_env_ptr = NULL; - atomic_store(&ctx->thread_running, false); + + /* Initialize worker thread state */ + atomic_store(&ctx->worker_running, false); atomic_store(&ctx->init_error, false); atomic_store(&ctx->shutdown_requested, false); + atomic_store(&ctx->leaked, false); + + /* Initialize request queue */ + ctx->queue_head = NULL; + ctx->queue_tail = NULL; + + /* Initialize legacy compatibility fields */ + ctx->shared_env = NULL; ctx->request_type = CTX_REQ_NONE; ctx->request_term = 0; - ctx->request_data = 0; ctx->response_term = 0; ctx->response_ok = false; + ctx->local_env_ptr = NULL; + ctx->reactor_buffer_ptr = NULL; - /* Initialize mutex and condition variables */ - if (pthread_mutex_init(&ctx->request_mutex, NULL) != 0) { - return -1; - } - - if (pthread_cond_init(&ctx->request_ready, NULL) != 0) { - pthread_mutex_destroy(&ctx->request_mutex); + /* Initialize queue mutex */ + if (pthread_mutex_init(&ctx->queue_mutex, NULL) != 0) { return -1; } - if (pthread_cond_init(&ctx->response_ready, NULL) != 0) { - pthread_cond_destroy(&ctx->request_ready); - pthread_mutex_destroy(&ctx->request_mutex); + /* Initialize queue condition variable */ + if (pthread_cond_init(&ctx->queue_not_empty, NULL) != 0) { + pthread_mutex_destroy(&ctx->queue_mutex); return -1; } - /* Create shared environment for term passing */ - ctx->shared_env = enif_alloc_env(); - if (ctx->shared_env == NULL) { - pthread_cond_destroy(&ctx->response_ready); - pthread_cond_destroy(&ctx->request_ready); - 
pthread_mutex_destroy(&ctx->request_mutex); + /* Create message environment for async responses */ + ctx->msg_env = enif_alloc_env(); + if (ctx->msg_env == NULL) { + pthread_cond_destroy(&ctx->queue_not_empty); + pthread_mutex_destroy(&ctx->queue_mutex); return -1; } /* Start the worker thread */ - if (pthread_create(&ctx->own_gil_thread, NULL, owngil_context_thread_main, ctx) != 0) { - enif_free_env(ctx->shared_env); - pthread_cond_destroy(&ctx->response_ready); - pthread_cond_destroy(&ctx->request_ready); - pthread_mutex_destroy(&ctx->request_mutex); + if (pthread_create(&ctx->worker_thread, NULL, owngil_context_thread_main, ctx) != 0) { + enif_free_env(ctx->msg_env); + ctx->msg_env = NULL; + pthread_cond_destroy(&ctx->queue_not_empty); + pthread_mutex_destroy(&ctx->queue_mutex); return -1; } /* Wait for thread to initialize or fail */ int wait_count = 0; - while (!atomic_load(&ctx->thread_running) && + while (!atomic_load(&ctx->worker_running) && !atomic_load(&ctx->init_error) && wait_count < 2000) { usleep(1000); /* 1ms */ wait_count++; } - if (atomic_load(&ctx->init_error) || !atomic_load(&ctx->thread_running)) { + if (atomic_load(&ctx->init_error) || !atomic_load(&ctx->worker_running)) { /* Thread failed to start */ - pthread_join(ctx->own_gil_thread, NULL); - enif_free_env(ctx->shared_env); - pthread_cond_destroy(&ctx->response_ready); - pthread_cond_destroy(&ctx->request_ready); - pthread_mutex_destroy(&ctx->request_mutex); + pthread_join(ctx->worker_thread, NULL); + if (ctx->msg_env != NULL) { + enif_free_env(ctx->msg_env); + ctx->msg_env = NULL; + } + pthread_cond_destroy(&ctx->queue_not_empty); + pthread_mutex_destroy(&ctx->queue_mutex); return -1; } @@ -3882,7 +4542,9 @@ static int owngil_context_init(py_context_t *ctx) { /** * @brief Shutdown OWN_GIL context and clean up resources * - * Uses a timeout to avoid hanging forever if the Python thread is stuck. 
+ * Uses the join-or-leak pattern: if the worker thread doesn't respond + * within the timeout, we mark the context as leaked and do NOT free + * shared resources to avoid use-after-free. * * @param ctx Context to shutdown */ @@ -3893,51 +4555,58 @@ static void owngil_context_shutdown(py_context_t *ctx) { return; } - /* Signal shutdown */ + /* Signal shutdown and wake any worker parked on the condvar. + * See worker_context_shutdown for why we broadcast instead of + * enqueuing a CTX_REQ_SHUTDOWN sentinel. */ atomic_store(&ctx->shutdown_requested, true); - - pthread_mutex_lock(&ctx->request_mutex); - ctx->request_type = CTX_REQ_SHUTDOWN; - pthread_cond_signal(&ctx->request_ready); - pthread_mutex_unlock(&ctx->request_mutex); + ctx_queue_cancel_all(ctx); + pthread_mutex_lock(&ctx->queue_mutex); + pthread_cond_broadcast(&ctx->queue_not_empty); + pthread_mutex_unlock(&ctx->queue_mutex); /* Wait for thread to exit with timeout */ + bool join_succeeded = false; + #if defined(__linux__) struct timespec deadline; clock_gettime(CLOCK_REALTIME, &deadline); deadline.tv_sec += OWNGIL_SHUTDOWN_TIMEOUT_SECS; - int rc = pthread_timedjoin_np(ctx->own_gil_thread, NULL, &deadline); - if (rc == ETIMEDOUT) { - fprintf(stderr, "OWN_GIL shutdown timeout after %d seconds, detaching thread\n", - OWNGIL_SHUTDOWN_TIMEOUT_SECS); - pthread_detach(ctx->own_gil_thread); - } + int rc = pthread_timedjoin_np(ctx->worker_thread, NULL, &deadline); + join_succeeded = (rc == 0); #else - /* macOS/other: poll thread_running flag with timeout */ + /* macOS/other: poll worker_running flag with timeout */ int wait_ms = 0; - while (atomic_load(&ctx->thread_running) && + while (atomic_load(&ctx->worker_running) && wait_ms < OWNGIL_SHUTDOWN_TIMEOUT_SECS * 1000) { usleep(100000); /* 100ms */ wait_ms += 100; } - if (atomic_load(&ctx->thread_running)) { - fprintf(stderr, "OWN_GIL shutdown timeout after %d seconds, detaching thread\n", - OWNGIL_SHUTDOWN_TIMEOUT_SECS); - pthread_detach(ctx->own_gil_thread); - } 
else { - pthread_join(ctx->own_gil_thread, NULL); + if (!atomic_load(&ctx->worker_running)) { + pthread_join(ctx->worker_thread, NULL); + join_succeeded = true; } #endif - /* Clean up resources */ - if (ctx->shared_env != NULL) { - enif_free_env(ctx->shared_env); - ctx->shared_env = NULL; + if (!join_succeeded) { + /* Worker thread is unresponsive - leak the context. Pin the + * resource so the BEAM doesn't free its memory under the + * stuck pthread (UAF). See worker_context_shutdown for the + * full rationale. */ + fprintf(stderr, "OWN_GIL shutdown timeout after %d seconds, leaking context\n", + OWNGIL_SHUTDOWN_TIMEOUT_SECS); + atomic_store(&ctx->leaked, true); + enif_keep_resource(ctx); + return; } - pthread_cond_destroy(&ctx->response_ready); - pthread_cond_destroy(&ctx->request_ready); - pthread_mutex_destroy(&ctx->request_mutex); + /* Clean shutdown succeeded - safe to free resources */ + if (ctx->msg_env != NULL) { + enif_free_env(ctx->msg_env); + ctx->msg_env = NULL; + } + + pthread_cond_destroy(&ctx->queue_not_empty); + pthread_mutex_destroy(&ctx->queue_mutex); ctx->uses_own_gil = false; } @@ -3956,15 +4625,14 @@ static void owngil_context_shutdown(py_context_t *ctx) { * @brief Create a new Python context * * nif_context_create(Mode) -> {ok, ContextRef, InterpId} | {error, Reason} - * Mode: subinterp | worker | owngil - * - * For subinterp mode: allocates a slot from the pre-created subinterpreter pool. - * Execution happens on dirty schedulers using PyThreadState_Swap(). + * Mode: worker | owngil * * For owngil mode: creates a dedicated pthread with an OWN_GIL subinterpreter. * This enables true parallel Python execution across contexts. + * Requires Python 3.14+; returns {error, owngil_requires_python314} otherwise. * - * For worker mode: creates namespace in the main interpreter. + * For worker mode: creates a namespace in the main interpreter, dispatched + * through the context's dedicated worker pthread. 
*/ static ERL_NIF_TERM nif_context_create(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { (void)argc; @@ -3973,13 +4641,24 @@ static ERL_NIF_TERM nif_context_create(ErlNifEnv *env, int argc, const ERL_NIF_T return make_error(env, "python_not_running"); } - /* Parse mode atom */ + /* Parse mode atom — reject anything other than worker | owngil so + * callers that bypass py_context (e.g. py_reactor_context) get the + * same strict validation py_context:create_context/1 already enforces. */ char mode_str[32]; if (!enif_get_atom(env, argv[0], mode_str, sizeof(mode_str), ERL_NIF_LATIN1)) { return make_error(env, "invalid_mode"); } - bool use_owngil = (strcmp(mode_str, "owngil") == 0); + bool use_owngil; + if (strcmp(mode_str, "worker") == 0) { + use_owngil = false; + } else if (strcmp(mode_str, "owngil") == 0) { + use_owngil = true; + } else { + return enif_make_tuple2( + env, ATOM_ERROR, + enif_make_tuple2(env, enif_make_atom(env, "invalid_mode"), argv[0])); + } /* Allocate context resource */ py_context_t *ctx = enif_alloc_resource(PY_CONTEXT_RESOURCE_TYPE, sizeof(py_context_t)); @@ -3990,14 +4669,16 @@ static ERL_NIF_TERM nif_context_create(ErlNifEnv *env, int argc, const ERL_NIF_T /* Initialize fields */ ctx->interp_id = atomic_fetch_add(&g_context_id_counter, 1); ctx->is_subinterp = use_owngil; - ctx->destroyed = false; + atomic_store(&ctx->destroyed, false); + atomic_store(&ctx->leaked, false); + atomic_store(&ctx->init_error, false); ctx->has_callback_handler = false; ctx->callback_pipe[0] = -1; ctx->callback_pipe[1] = -1; ctx->globals = NULL; ctx->locals = NULL; ctx->module_cache = NULL; - ctx->executor_id = -1; /* Not assigned yet */ + ctx->uses_worker_thread = false; /* Create callback pipe for blocking callback responses */ if (pipe(ctx->callback_pipe) < 0) { @@ -4023,38 +4704,14 @@ static ERL_NIF_TERM nif_context_create(ErlNifEnv *env, int argc, const ERL_NIF_T return enif_make_tuple3(env, ATOM_OK, ref, enif_make_uint(env, ctx->interp_id)); } #endif - 
{ - /* Worker mode - create a thread state in main interpreter */ - PyGILState_STATE gstate = PyGILState_Ensure(); - -#ifndef HAVE_SUBINTERPRETERS - PyInterpreterState *interp = PyInterpreterState_Get(); - ctx->thread_state = PyThreadState_New(interp); -#endif - - ctx->globals = PyDict_New(); - ctx->locals = PyDict_New(); - ctx->module_cache = PyDict_New(); - - /* Import __builtins__ into globals */ - PyObject *builtins = PyEval_GetBuiltins(); - PyDict_SetItemString(ctx->globals, "__builtins__", builtins); - - /* Import erlang module into globals for worker mode */ - PyObject *erlang_module = PyImport_ImportModule("erlang"); - if (erlang_module != NULL) { - PyDict_SetItemString(ctx->globals, "erlang", erlang_module); - Py_DECREF(erlang_module); - } - - PyGILState_Release(gstate); - } - /* Assign executor for thread affinity in MULTI_EXECUTOR mode. - * This ensures numpy/torch thread-local state consistency. */ - if (g_execution_mode == PY_MODE_MULTI_EXECUTOR && - atomic_load(&g_multi_executor_initialized)) { - ctx->executor_id = select_executor(); + /* Worker mode: create dedicated pthread with main interpreter + * This provides stable thread affinity for numpy/torch/tensorflow */ + if (worker_context_init(ctx) != 0) { + close(ctx->callback_pipe[0]); + close(ctx->callback_pipe[1]); + enif_release_resource(ctx); + return make_error(env, "worker_init_failed"); } ERL_NIF_TERM ref = enif_make_resource(env, ctx); @@ -4069,10 +4726,10 @@ static ERL_NIF_TERM nif_context_create(ErlNifEnv *env, int argc, const ERL_NIF_T * * nif_context_destroy(ContextRef) -> ok * - * For subinterpreter mode: releases the pool slot back to the pool. - * The pool owns the Python objects - context just references them. + * For owngil mode: shuts down the dedicated OWN_GIL thread. + * For worker mode: shuts down the dedicated worker thread. * - * For worker mode: cleans up Python objects directly with the main GIL. + * Both modes use the join-or-leak pattern for safe shutdown. 
*/ static ERL_NIF_TERM nif_context_destroy(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { (void)argc; @@ -4083,32 +4740,57 @@ static ERL_NIF_TERM nif_context_destroy(ErlNifEnv *env, int argc, const ERL_NIF_ } /* Skip if already destroyed */ - if (ctx->destroyed) { + if (atomic_load(&ctx->destroyed)) { return ATOM_OK; } /* Mark as destroyed early to prevent new operations */ - ctx->destroyed = true; + atomic_store(&ctx->destroyed, true); #ifdef HAVE_SUBINTERPRETERS /* OWN_GIL mode: shutdown the dedicated thread */ if (ctx->uses_own_gil) { owngil_context_shutdown(ctx); - /* Close callback pipes */ - if (ctx->callback_pipe[0] >= 0) { - close(ctx->callback_pipe[0]); - ctx->callback_pipe[0] = -1; - } - if (ctx->callback_pipe[1] >= 0) { - close(ctx->callback_pipe[1]); - ctx->callback_pipe[1] = -1; + /* Close callback pipes only on a clean shutdown. If the + * worker timed out (ctx->leaked == true) it may still write + * to / read from these fds; closing them here would let the + * kernel reissue the fd numbers to unrelated files and + * silently corrupt them. */ + if (!atomic_load(&ctx->leaked)) { + if (ctx->callback_pipe[0] >= 0) { + close(ctx->callback_pipe[0]); + ctx->callback_pipe[0] = -1; + } + if (ctx->callback_pipe[1] >= 0) { + close(ctx->callback_pipe[1]); + ctx->callback_pipe[1] = -1; + } } atomic_fetch_add(&g_counters.ctx_destroyed, 1); return ATOM_OK; } #endif - /* Worker mode - clean up Python objects with GIL */ + /* Worker mode: shutdown the dedicated worker thread */ + if (ctx->uses_worker_thread) { + worker_context_shutdown(ctx); + /* Close callback pipes (see OWN_GIL branch for why this is + * gated on !ctx->leaked). 
*/ + if (!atomic_load(&ctx->leaked)) { + if (ctx->callback_pipe[0] >= 0) { + close(ctx->callback_pipe[0]); + ctx->callback_pipe[0] = -1; + } + if (ctx->callback_pipe[1] >= 0) { + close(ctx->callback_pipe[1]); + ctx->callback_pipe[1] = -1; + } + } + atomic_fetch_add(&g_counters.ctx_destroyed, 1); + return ATOM_OK; + } + + /* Legacy mode (should not reach here with new architecture) */ if (runtime_is_running()) { PyGILState_STATE gstate = PyGILState_Ensure(); Py_XDECREF(ctx->module_cache); @@ -4127,6 +4809,16 @@ static ERL_NIF_TERM nif_context_destroy(ErlNifEnv *env, int argc, const ERL_NIF_ PyGILState_Release(gstate); } + /* Close callback pipes */ + if (ctx->callback_pipe[0] >= 0) { + close(ctx->callback_pipe[0]); + ctx->callback_pipe[0] = -1; + } + if (ctx->callback_pipe[1] >= 0) { + close(ctx->callback_pipe[1]); + ctx->callback_pipe[1] = -1; + } + atomic_fetch_add(&g_counters.ctx_destroyed, 1); return ATOM_OK; } @@ -4197,7 +4889,20 @@ static ERL_NIF_TERM nif_context_call(ErlNifEnv *env, int argc, const ERL_NIF_TER } #endif - /* Both worker mode and subinterpreter mode use py_context_acquire. + /* Worker thread mode: dispatch to dedicated thread */ + if (ctx->uses_worker_thread) { + /* Build request tuple: {Module, Func, Args, Kwargs} */ + ERL_NIF_TERM kwargs = (argc > 4 && enif_is_map(env, argv[4])) + ? argv[4] : enif_make_new_map(env); + ERL_NIF_TERM request = enif_make_tuple4(env, + argv[1], /* Module */ + argv[2], /* Func */ + argv[3], /* Args */ + kwargs); + return dispatch_to_worker_thread(env, ctx, CTX_REQ_CALL, request); + } + + /* Legacy mode: direct execution with py_context_acquire. * For subinterpreters, py_context_acquire handles PyThreadState_Swap * to switch to the pool slot's interpreter. */ ErlNifBinary module_bin, func_bin; @@ -4208,15 +4913,6 @@ static ERL_NIF_TERM nif_context_call(ErlNifEnv *env, int argc, const ERL_NIF_TER return make_error(env, "invalid_func"); } - /* Context thread affinity: dispatch via executor instead of direct execution. 
- * This ensures numpy/torch thread-local state consistency. */ - if (ctx->executor_id >= 0 && g_execution_mode == PY_MODE_MULTI_EXECUTOR && - atomic_load(&g_multi_executor_initialized)) { - ERL_NIF_TERM kwargs = (argc > 4 && enif_is_map(env, argv[4])) - ? argv[4] : enif_make_new_map(env); - return context_dispatch_call(env, ctx, &module_bin, &func_bin, argv[3], kwargs); - } - char *module_name = binary_to_string(&module_bin); char *func_name = binary_to_string(&func_bin); if (module_name == NULL || func_name == NULL) { @@ -4335,45 +5031,311 @@ static ERL_NIF_TERM nif_context_call(ErlNifEnv *env, int argc, const ERL_NIF_TER ERL_NIF_TERM cont_ref = enif_make_resource(env, cont); enif_release_resource(cont); - /* Restore thread-local state before scheduling */ - tl_allow_suspension = prev_allow_suspension; - tl_current_context = prev_context; - clear_pending_callback_tls(); - enif_free(module_name); - enif_free(func_name); - py_context_release(&guard); + /* Restore thread-local state before scheduling */ + tl_allow_suspension = prev_allow_suspension; + tl_current_context = prev_context; + clear_pending_callback_tls(); + enif_free(module_name); + enif_free(func_name); + py_context_release(&guard); + + return enif_schedule_nif(env, "inline_continuation", + ERL_NIF_DIRTY_JOB_IO_BOUND, nif_inline_continuation, 1, &cont_ref); + } + } else if (is_schedule_marker(py_result)) { + /* Schedule marker: release dirty scheduler, continue via callback */ + ScheduleMarkerObject *marker = (ScheduleMarkerObject *)py_result; + ERL_NIF_TERM callback_name = py_to_term(env, marker->callback_name); + ERL_NIF_TERM callback_args = py_to_term(env, marker->args); + Py_DECREF(py_result); + result = enif_make_tuple3(env, ATOM_SCHEDULE, callback_name, callback_args); + } else { + ERL_NIF_TERM term_result = py_to_term(env, py_result); + Py_DECREF(py_result); + result = enif_make_tuple2(env, ATOM_OK, term_result); + } + +cleanup: + /* Restore thread-local state */ + tl_allow_suspension = 
prev_allow_suspension; + tl_current_context = prev_context; + + /* Clear pending callback TLS before releasing context */ + clear_pending_callback_tls(); + + enif_free(module_name); + enif_free(func_name); + + /* Release thread state using centralized guard */ + py_context_release(&guard); + + return result; +} + +/** + * @brief Async call - enqueue and return immediately + * + * nif_context_call_async(ContextRef, CallerPid, RequestId, Module, Func, Args, Kwargs) + * -> {enqueued, RequestId} | {error, Reason} + * + * The worker thread will send {py_result, RequestId, Result} to CallerPid when done. + */ +static ERL_NIF_TERM nif_context_call_async(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { + py_context_t *ctx; + + if (!runtime_is_running()) { + return make_error(env, "python_not_running"); + } + + if (argc < 6) { + return make_error(env, "badarg"); + } + + if (!enif_get_resource(env, argv[0], PY_CONTEXT_RESOURCE_TYPE, (void **)&ctx)) { + return make_error(env, "invalid_context"); + } + + /* Get caller PID */ + ErlNifPid caller_pid; + if (!enif_get_local_pid(env, argv[1], &caller_pid)) { + return make_error(env, "invalid_pid"); + } + + /* RequestId is argv[2] - can be any term */ + ERL_NIF_TERM request_id = argv[2]; + + /* Worker thread mode: dispatch async */ + if (ctx->uses_worker_thread) { + /* Build request tuple: {Module, Func, Args, Kwargs} */ + ERL_NIF_TERM kwargs = (argc > 6 && enif_is_map(env, argv[6])) + ? 
argv[6] : enif_make_new_map(env); + ERL_NIF_TERM request = enif_make_tuple4(env, + argv[3], /* Module */ + argv[4], /* Func */ + argv[5], /* Args */ + kwargs); + return dispatch_to_worker_thread_async(env, ctx, CTX_REQ_CALL, + request, caller_pid, request_id, NULL); + } + + /* Not using worker thread - fall back to blocking call */ + return make_error(env, "async_requires_worker_thread"); +} + +/** + * @brief Async eval - enqueue and return immediately + * + * nif_context_eval_async(ContextRef, CallerPid, RequestId, Code, Locals) + * -> {enqueued, RequestId} | {error, Reason} + * + * The worker thread will send {py_result, RequestId, Result} to CallerPid when done. + */ +static ERL_NIF_TERM nif_context_eval_async(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { + py_context_t *ctx; + + if (!runtime_is_running()) { + return make_error(env, "python_not_running"); + } + + if (argc < 4) { + return make_error(env, "badarg"); + } + + if (!enif_get_resource(env, argv[0], PY_CONTEXT_RESOURCE_TYPE, (void **)&ctx)) { + return make_error(env, "invalid_context"); + } + + /* Get caller PID */ + ErlNifPid caller_pid; + if (!enif_get_local_pid(env, argv[1], &caller_pid)) { + return make_error(env, "invalid_pid"); + } + + /* RequestId is argv[2] - can be any term */ + ERL_NIF_TERM request_id = argv[2]; + + /* Worker thread mode: dispatch async */ + if (ctx->uses_worker_thread) { + /* Build request tuple: {Code, Locals} */ + ERL_NIF_TERM locals = (argc > 4 && enif_is_map(env, argv[4])) + ? 
argv[4] : enif_make_new_map(env); + ERL_NIF_TERM request = enif_make_tuple2(env, argv[3], locals); + return dispatch_to_worker_thread_async(env, ctx, CTX_REQ_EVAL, + request, caller_pid, request_id, NULL); + } + + /* Not using worker thread - fall back to blocking call */ + return make_error(env, "async_requires_worker_thread"); +} + +/** + * @brief Async exec - enqueue and return immediately + * + * nif_context_exec_async(ContextRef, CallerPid, RequestId, Code) + * -> {enqueued, RequestId} | {error, Reason} + * + * The worker thread will send {py_result, RequestId, Result} to CallerPid when done. + */ +static ERL_NIF_TERM nif_context_exec_async(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { + py_context_t *ctx; + + if (!runtime_is_running()) { + return make_error(env, "python_not_running"); + } + + if (argc < 4) { + return make_error(env, "badarg"); + } + + if (!enif_get_resource(env, argv[0], PY_CONTEXT_RESOURCE_TYPE, (void **)&ctx)) { + return make_error(env, "invalid_context"); + } + + /* Get caller PID */ + ErlNifPid caller_pid; + if (!enif_get_local_pid(env, argv[1], &caller_pid)) { + return make_error(env, "invalid_pid"); + } + + /* RequestId is argv[2] - can be any term */ + ERL_NIF_TERM request_id = argv[2]; + + /* Worker thread mode: dispatch async */ + if (ctx->uses_worker_thread) { + return dispatch_to_worker_thread_async(env, ctx, CTX_REQ_EXEC, + argv[3], caller_pid, request_id, NULL); + } + + /* Not using worker thread - fall back to blocking call */ + return make_error(env, "async_requires_worker_thread"); +} + +/** + * @brief Async call with process-local environment + * + * nif_context_call_with_env_async(ContextRef, CallerPid, RequestId, + * Module, Func, Args, Kwargs, EnvRef) + * -> {enqueued, RequestId} | {error, Reason} + * + * Same contract as nif_context_call_async but threads the process-local + * env through to the worker. Replaces the 30-second pthread_cond_timedwait + * dispatch path; the Erlang side waits in a normal receive. 
+ */ +static ERL_NIF_TERM nif_context_call_with_env_async(ErlNifEnv *env, int argc, + const ERL_NIF_TERM argv[]) { + py_context_t *ctx; + py_env_resource_t *penv; + + if (!runtime_is_running()) { + return make_error(env, "python_not_running"); + } + if (argc < 8) { + return make_error(env, "badarg"); + } + if (!enif_get_resource(env, argv[0], PY_CONTEXT_RESOURCE_TYPE, (void **)&ctx)) { + return make_error(env, "invalid_context"); + } + ErlNifPid caller_pid; + if (!enif_get_local_pid(env, argv[1], &caller_pid)) { + return make_error(env, "invalid_pid"); + } + ERL_NIF_TERM request_id = argv[2]; + if (!enif_get_resource(env, argv[7], PY_ENV_RESOURCE_TYPE, (void **)&penv)) { + return make_error(env, "invalid_env"); + } + + if (!ctx->uses_worker_thread) { + return make_error(env, "async_requires_worker_thread"); + } + + ERL_NIF_TERM kwargs = enif_is_map(env, argv[6]) + ? argv[6] : enif_make_new_map(env); + ERL_NIF_TERM request = enif_make_tuple4(env, + argv[3], /* Module */ + argv[4], /* Func */ + argv[5], /* Args */ + kwargs); + return dispatch_to_worker_thread_async(env, ctx, CTX_REQ_CALL_WITH_ENV, + request, caller_pid, request_id, penv); +} - return enif_schedule_nif(env, "inline_continuation", - ERL_NIF_DIRTY_JOB_IO_BOUND, nif_inline_continuation, 1, &cont_ref); - } - } else if (is_schedule_marker(py_result)) { - /* Schedule marker: release dirty scheduler, continue via callback */ - ScheduleMarkerObject *marker = (ScheduleMarkerObject *)py_result; - ERL_NIF_TERM callback_name = py_to_term(env, marker->callback_name); - ERL_NIF_TERM callback_args = py_to_term(env, marker->args); - Py_DECREF(py_result); - result = enif_make_tuple3(env, ATOM_SCHEDULE, callback_name, callback_args); - } else { - ERL_NIF_TERM term_result = py_to_term(env, py_result); - Py_DECREF(py_result); - result = enif_make_tuple2(env, ATOM_OK, term_result); +/** + * @brief Async eval with process-local environment + * + * nif_context_eval_with_env_async(ContextRef, CallerPid, RequestId, + * Code, 
Locals, EnvRef) + * -> {enqueued, RequestId} | {error, Reason} + */ +static ERL_NIF_TERM nif_context_eval_with_env_async(ErlNifEnv *env, int argc, + const ERL_NIF_TERM argv[]) { + py_context_t *ctx; + py_env_resource_t *penv; + + if (!runtime_is_running()) { + return make_error(env, "python_not_running"); + } + if (argc < 6) { + return make_error(env, "badarg"); + } + if (!enif_get_resource(env, argv[0], PY_CONTEXT_RESOURCE_TYPE, (void **)&ctx)) { + return make_error(env, "invalid_context"); + } + ErlNifPid caller_pid; + if (!enif_get_local_pid(env, argv[1], &caller_pid)) { + return make_error(env, "invalid_pid"); + } + ERL_NIF_TERM request_id = argv[2]; + if (!enif_get_resource(env, argv[5], PY_ENV_RESOURCE_TYPE, (void **)&penv)) { + return make_error(env, "invalid_env"); } -cleanup: - /* Restore thread-local state */ - tl_allow_suspension = prev_allow_suspension; - tl_current_context = prev_context; + if (!ctx->uses_worker_thread) { + return make_error(env, "async_requires_worker_thread"); + } - /* Clear pending callback TLS before releasing context */ - clear_pending_callback_tls(); + ERL_NIF_TERM locals = enif_is_map(env, argv[4]) + ? 
argv[4] : enif_make_new_map(env); + ERL_NIF_TERM request = enif_make_tuple2(env, argv[3], locals); + return dispatch_to_worker_thread_async(env, ctx, CTX_REQ_EVAL_WITH_ENV, + request, caller_pid, request_id, penv); +} - enif_free(module_name); - enif_free(func_name); +/** + * @brief Async exec with process-local environment + * + * nif_context_exec_with_env_async(ContextRef, CallerPid, RequestId, + * Code, EnvRef) + * -> {enqueued, RequestId} | {error, Reason} + */ +static ERL_NIF_TERM nif_context_exec_with_env_async(ErlNifEnv *env, int argc, + const ERL_NIF_TERM argv[]) { + py_context_t *ctx; + py_env_resource_t *penv; - /* Release thread state using centralized guard */ - py_context_release(&guard); + if (!runtime_is_running()) { + return make_error(env, "python_not_running"); + } + if (argc < 5) { + return make_error(env, "badarg"); + } + if (!enif_get_resource(env, argv[0], PY_CONTEXT_RESOURCE_TYPE, (void **)&ctx)) { + return make_error(env, "invalid_context"); + } + ErlNifPid caller_pid; + if (!enif_get_local_pid(env, argv[1], &caller_pid)) { + return make_error(env, "invalid_pid"); + } + ERL_NIF_TERM request_id = argv[2]; + if (!enif_get_resource(env, argv[4], PY_ENV_RESOURCE_TYPE, (void **)&penv)) { + return make_error(env, "invalid_env"); + } - return result; + if (!ctx->uses_worker_thread) { + return make_error(env, "async_requires_worker_thread"); + } + + return dispatch_to_worker_thread_async(env, ctx, CTX_REQ_EXEC_WITH_ENV, + argv[3], caller_pid, request_id, penv); } /** @@ -4408,7 +5370,16 @@ static ERL_NIF_TERM nif_context_eval(ErlNifEnv *env, int argc, const ERL_NIF_TER } #endif - /* Both worker mode and subinterpreter mode use py_context_acquire. + /* Worker thread mode: dispatch to dedicated thread */ + if (ctx->uses_worker_thread) { + /* Build request tuple: {Code, Locals} */ + ERL_NIF_TERM locals = (argc > 2 && enif_is_map(env, argv[2])) + ? 
argv[2] : enif_make_new_map(env); + ERL_NIF_TERM request = enif_make_tuple2(env, argv[1], locals); + return dispatch_to_worker_thread(env, ctx, CTX_REQ_EVAL, request); + } + + /* Legacy mode: direct execution with py_context_acquire. * For subinterpreters, py_context_acquire handles PyThreadState_Swap * to switch to the pool slot's interpreter. */ ErlNifBinary code_bin; @@ -4416,15 +5387,6 @@ static ERL_NIF_TERM nif_context_eval(ErlNifEnv *env, int argc, const ERL_NIF_TER return make_error(env, "invalid_code"); } - /* Context thread affinity: dispatch via executor instead of direct execution. - * This ensures numpy/torch thread-local state consistency. */ - if (ctx->executor_id >= 0 && g_execution_mode == PY_MODE_MULTI_EXECUTOR && - atomic_load(&g_multi_executor_initialized)) { - ERL_NIF_TERM locals = (argc > 2 && enif_is_map(env, argv[2])) - ? argv[2] : enif_make_new_map(env); - return context_dispatch_eval(env, ctx, &code_bin, locals); - } - char *code = binary_to_string(&code_bin); if (code == NULL) { return make_error(env, "alloc_failed"); @@ -4554,7 +5516,12 @@ static ERL_NIF_TERM nif_context_exec(ErlNifEnv *env, int argc, const ERL_NIF_TER } #endif - /* Both worker mode and subinterpreter mode use py_context_acquire. + /* Worker thread mode: dispatch to dedicated thread */ + if (ctx->uses_worker_thread) { + return dispatch_to_worker_thread(env, ctx, CTX_REQ_EXEC, argv[1]); + } + + /* Legacy mode: direct execution with py_context_acquire. * For subinterpreters, py_context_acquire handles PyThreadState_Swap * to switch to the pool slot's interpreter. */ ErlNifBinary code_bin; @@ -4562,13 +5529,6 @@ static ERL_NIF_TERM nif_context_exec(ErlNifEnv *env, int argc, const ERL_NIF_TER return make_error(env, "invalid_code"); } - /* Context thread affinity: dispatch via executor instead of direct execution. - * This ensures numpy/torch thread-local state consistency. 
*/ - if (ctx->executor_id >= 0 && g_execution_mode == PY_MODE_MULTI_EXECUTOR && - atomic_load(&g_multi_executor_initialized)) { - return context_dispatch_exec(env, ctx, &code_bin); - } - char *code = binary_to_string(&code_bin); if (code == NULL) { return make_error(env, "alloc_failed"); @@ -4949,6 +5909,12 @@ static ERL_NIF_TERM nif_context_exec_with_env(ErlNifEnv *env, int argc, const ER } #endif + /* Worker thread mode: dispatch to dedicated thread with local env */ + if (ctx->uses_worker_thread) { + /* For exec, we just pass the code binary */ + return dispatch_to_worker_thread_impl(env, ctx, CTX_REQ_EXEC_WITH_ENV, argv[1], penv); + } + char *code = binary_to_string(&code_bin); if (code == NULL) { return make_error(env, "alloc_failed"); @@ -5031,6 +5997,15 @@ static ERL_NIF_TERM nif_context_eval_with_env(ErlNifEnv *env, int argc, const ER } #endif + /* Worker thread mode: dispatch to dedicated thread with local env */ + if (ctx->uses_worker_thread) { + /* Build request tuple: {Code, Locals} */ + ERL_NIF_TERM locals = (argc > 2 && enif_is_map(env, argv[2])) + ? argv[2] : enif_make_new_map(env); + ERL_NIF_TERM request = enif_make_tuple2(env, argv[1], locals); + return dispatch_to_worker_thread_impl(env, ctx, CTX_REQ_EVAL_WITH_ENV, request, penv); + } + char *code = binary_to_string(&code_bin); if (code == NULL) { return make_error(env, "alloc_failed"); @@ -5189,6 +6164,19 @@ static ERL_NIF_TERM nif_context_call_with_env(ErlNifEnv *env, int argc, const ER } #endif + /* Worker thread mode: dispatch to dedicated thread with local env */ + if (ctx->uses_worker_thread) { + /* Build request tuple: {Module, Func, Args, Kwargs} */ + ERL_NIF_TERM kwargs = (argc > 4 && enif_is_map(env, argv[4])) + ? 
argv[4] : enif_make_new_map(env); + ERL_NIF_TERM request = enif_make_tuple4(env, + argv[1], /* Module */ + argv[2], /* Func */ + argv[3], /* Args */ + kwargs); + return dispatch_to_worker_thread_impl(env, ctx, CTX_REQ_CALL_WITH_ENV, request, penv); + } + char *module_name = binary_to_string(&module_bin); char *func_name = binary_to_string(&func_bin); if (module_name == NULL || func_name == NULL) { @@ -6111,180 +7099,6 @@ static ERL_NIF_TERM nif_ref_call_method(ErlNifEnv *env, int argc, const ERL_NIF_ #ifdef HAVE_SUBINTERPRETERS -/** - * @brief Destructor for py_subinterp_handle_t resource - */ -static void subinterp_handle_destructor(ErlNifEnv *env, void *obj) { - (void)env; - py_subinterp_handle_t *handle = (py_subinterp_handle_t *)obj; - - /* Clean up the namespace in the worker */ - if (!atomic_load(&handle->destroyed)) { - subinterp_thread_handle_destroy(handle); - } -} - -/** - * @brief NIF: Create a new OWN_GIL subinterpreter handle - * - * Returns a handle that can be used with subinterp_call/eval/exec. - * The handle is bound to a worker thread with its own GIL. 
- */ -static ERL_NIF_TERM nif_subinterp_thread_create(ErlNifEnv *env, int argc, - const ERL_NIF_TERM argv[]) { - (void)argc; - (void)argv; - - if (!subinterp_thread_pool_is_ready()) { - return enif_make_tuple2(env, ATOM_ERROR, - enif_make_atom(env, "pool_not_initialized")); - } - - py_subinterp_handle_t *handle = enif_alloc_resource( - PY_SUBINTERP_HANDLE_RESOURCE_TYPE, sizeof(py_subinterp_handle_t)); - if (handle == NULL) { - return enif_make_tuple2(env, ATOM_ERROR, - enif_make_atom(env, "alloc_failed")); - } - - if (subinterp_thread_handle_create(handle) != 0) { - enif_release_resource(handle); - return enif_make_tuple2(env, ATOM_ERROR, - enif_make_atom(env, "create_failed")); - } - - ERL_NIF_TERM ref = enif_make_resource(env, handle); - enif_release_resource(handle); - - return enif_make_tuple2(env, ATOM_OK, ref); -} - -/** - * @brief NIF: Destroy an OWN_GIL subinterpreter handle - */ -static ERL_NIF_TERM nif_subinterp_thread_destroy(ErlNifEnv *env, int argc, - const ERL_NIF_TERM argv[]) { - (void)argc; - - py_subinterp_handle_t *handle; - if (!enif_get_resource(env, argv[0], PY_SUBINTERP_HANDLE_RESOURCE_TYPE, - (void **)&handle)) { - return enif_make_tuple2(env, ATOM_ERROR, - enif_make_atom(env, "invalid_handle")); - } - - subinterp_thread_handle_destroy(handle); - return ATOM_OK; -} - -/** - * @brief NIF: Call a Python function through OWN_GIL subinterpreter - */ -static ERL_NIF_TERM nif_subinterp_thread_call(ErlNifEnv *env, int argc, - const ERL_NIF_TERM argv[]) { - if (argc < 4 || argc > 5) { - return enif_make_badarg(env); - } - - py_subinterp_handle_t *handle; - if (!enif_get_resource(env, argv[0], PY_SUBINTERP_HANDLE_RESOURCE_TYPE, - (void **)&handle)) { - return enif_make_tuple2(env, ATOM_ERROR, - enif_make_atom(env, "invalid_handle")); - } - - ERL_NIF_TERM module = argv[1]; - ERL_NIF_TERM func = argv[2]; - ERL_NIF_TERM args = argv[3]; - ERL_NIF_TERM kwargs = argc > 4 ? 
argv[4] : enif_make_new_map(env); - - return subinterp_thread_call(env, handle, module, func, args, kwargs); -} - -/** - * @brief NIF: Evaluate Python expression through OWN_GIL subinterpreter - */ -static ERL_NIF_TERM nif_subinterp_thread_eval(ErlNifEnv *env, int argc, - const ERL_NIF_TERM argv[]) { - if (argc < 2 || argc > 3) { - return enif_make_badarg(env); - } - - py_subinterp_handle_t *handle; - if (!enif_get_resource(env, argv[0], PY_SUBINTERP_HANDLE_RESOURCE_TYPE, - (void **)&handle)) { - return enif_make_tuple2(env, ATOM_ERROR, - enif_make_atom(env, "invalid_handle")); - } - - ERL_NIF_TERM code = argv[1]; - ERL_NIF_TERM locals = argc > 2 ? argv[2] : enif_make_new_map(env); - - return subinterp_thread_eval(env, handle, code, locals); -} - -/** - * @brief NIF: Execute Python statements through OWN_GIL subinterpreter - */ -static ERL_NIF_TERM nif_subinterp_thread_exec(ErlNifEnv *env, int argc, - const ERL_NIF_TERM argv[]) { - if (argc != 2) { - return enif_make_badarg(env); - } - - py_subinterp_handle_t *handle; - if (!enif_get_resource(env, argv[0], PY_SUBINTERP_HANDLE_RESOURCE_TYPE, - (void **)&handle)) { - return enif_make_tuple2(env, ATOM_ERROR, - enif_make_atom(env, "invalid_handle")); - } - - return subinterp_thread_exec(env, handle, argv[1]); -} - -/** - * @brief NIF: Cast (fire-and-forget) through OWN_GIL subinterpreter - */ -static ERL_NIF_TERM nif_subinterp_thread_cast(ErlNifEnv *env, int argc, - const ERL_NIF_TERM argv[]) { - if (argc != 4) { - return enif_make_badarg(env); - } - - py_subinterp_handle_t *handle; - if (!enif_get_resource(env, argv[0], PY_SUBINTERP_HANDLE_RESOURCE_TYPE, - (void **)&handle)) { - return ATOM_OK; /* Silently ignore for cast */ - } - - return subinterp_thread_cast(env, handle, argv[1], argv[2], argv[3]); -} - -/** - * @brief NIF: Async call through OWN_GIL subinterpreter - */ -static ERL_NIF_TERM nif_subinterp_thread_async_call(ErlNifEnv *env, int argc, - const ERL_NIF_TERM argv[]) { - if (argc != 6) { - return 
enif_make_badarg(env); - } - - py_subinterp_handle_t *handle; - if (!enif_get_resource(env, argv[0], PY_SUBINTERP_HANDLE_RESOURCE_TYPE, - (void **)&handle)) { - return enif_make_tuple2(env, ATOM_ERROR, - enif_make_atom(env, "invalid_handle")); - } - - ErlNifPid caller_pid; - if (!enif_get_local_pid(env, argv[4], &caller_pid)) { - return enif_make_badarg(env); - } - - return subinterp_thread_async_call(env, handle, argv[1], argv[2], argv[3], - &caller_pid, argv[5]); -} - /** * @brief NIF: Check if OWN_GIL thread pool is available */ @@ -6696,53 +7510,6 @@ static ERL_NIF_TERM nif_owngil_apply_paths(ErlNifEnv *env, int argc, #else /* !HAVE_SUBINTERPRETERS */ /* Stub implementations for Python < 3.12 */ -static ERL_NIF_TERM nif_subinterp_thread_create(ErlNifEnv *env, int argc, - const ERL_NIF_TERM argv[]) { - (void)argc; (void)argv; - return enif_make_tuple2(env, ATOM_ERROR, - enif_make_atom(env, "not_supported")); -} - -static ERL_NIF_TERM nif_subinterp_thread_destroy(ErlNifEnv *env, int argc, - const ERL_NIF_TERM argv[]) { - (void)argc; (void)argv; - return enif_make_tuple2(env, ATOM_ERROR, - enif_make_atom(env, "not_supported")); -} - -static ERL_NIF_TERM nif_subinterp_thread_call(ErlNifEnv *env, int argc, - const ERL_NIF_TERM argv[]) { - (void)argc; (void)argv; - return enif_make_tuple2(env, ATOM_ERROR, - enif_make_atom(env, "not_supported")); -} - -static ERL_NIF_TERM nif_subinterp_thread_eval(ErlNifEnv *env, int argc, - const ERL_NIF_TERM argv[]) { - (void)argc; (void)argv; - return enif_make_tuple2(env, ATOM_ERROR, - enif_make_atom(env, "not_supported")); -} - -static ERL_NIF_TERM nif_subinterp_thread_exec(ErlNifEnv *env, int argc, - const ERL_NIF_TERM argv[]) { - (void)argc; (void)argv; - return enif_make_tuple2(env, ATOM_ERROR, - enif_make_atom(env, "not_supported")); -} - -static ERL_NIF_TERM nif_subinterp_thread_cast(ErlNifEnv *env, int argc, - const ERL_NIF_TERM argv[]) { - (void)argc; (void)argv; - return ATOM_OK; -} - -static ERL_NIF_TERM 
nif_subinterp_thread_async_call(ErlNifEnv *env, int argc, - const ERL_NIF_TERM argv[]) { - (void)argc; (void)argv; - return enif_make_tuple2(env, ATOM_ERROR, - enif_make_atom(env, "not_supported")); -} static ERL_NIF_TERM nif_subinterp_thread_pool_ready(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { @@ -6829,17 +7596,6 @@ static int load(ErlNifEnv *env, void **priv_data, ERL_NIF_TERM load_info) { env, NULL, "py_suspended_state", suspended_state_destructor, ERL_NIF_RT_CREATE | ERL_NIF_RT_TAKEOVER, NULL); -#ifdef HAVE_SUBINTERPRETERS - SUBINTERP_WORKER_RESOURCE_TYPE = enif_open_resource_type( - env, NULL, "py_subinterp_worker", subinterp_worker_destructor, - ERL_NIF_RT_CREATE | ERL_NIF_RT_TAKEOVER, NULL); - - /* OWN_GIL subinterpreter handle resource type */ - PY_SUBINTERP_HANDLE_RESOURCE_TYPE = enif_open_resource_type( - env, NULL, "py_subinterp_handle", subinterp_handle_destructor, - ERL_NIF_RT_CREATE | ERL_NIF_RT_TAKEOVER, NULL); -#endif - /* Process-per-context resource type (no mutex) */ PY_CONTEXT_RESOURCE_TYPE = enif_open_resource_type( env, NULL, "py_context", context_destructor, @@ -6865,8 +7621,10 @@ static int load(ErlNifEnv *env, void **priv_data, ERL_NIF_TERM load_info) { env, NULL, "inline_continuation", inline_continuation_destructor, ERL_NIF_RT_CREATE | ERL_NIF_RT_TAKEOVER, NULL); - /* Process-scoped shared dictionary resource type - * Using simple resource type without process monitoring for now */ + /* Process-scoped shared dictionary resource type. GC-scoped: the + * destructor releases the Python dict when the last term ref + * drops. No per-process monitor — explicit shared_dict_destroy/1 + * is the eager-release path. 
*/ PY_SHARED_DICT_RESOURCE_TYPE = enif_open_resource_type( env, NULL, "py_shared_dict", shared_dict_destructor, ERL_NIF_RT_CREATE | ERL_NIF_RT_TAKEOVER, NULL); @@ -6880,12 +7638,6 @@ static int load(ErlNifEnv *env, void **priv_data, ERL_NIF_TERM load_info) { PY_SHARED_DICT_RESOURCE_TYPE == NULL) { return -1; } -#ifdef HAVE_SUBINTERPRETERS - if (SUBINTERP_WORKER_RESOURCE_TYPE == NULL || - PY_SUBINTERP_HANDLE_RESOURCE_TYPE == NULL) { - return -1; - } -#endif /* Initialize atoms */ ATOM_OK = enif_make_atom(env, "ok"); @@ -7028,29 +7780,16 @@ static ErlNifFunc nif_funcs[] = { {"async_gather", 3, nif_async_gather, ERL_NIF_DIRTY_JOB_IO_BOUND}, {"async_stream", 6, nif_async_stream, ERL_NIF_DIRTY_JOB_IO_BOUND}, - /* Sub-interpreter support (shared GIL pool model) */ + /* Subinterpreter capability probes */ {"subinterp_supported", 0, nif_subinterp_supported, 0}, {"owngil_supported", 0, nif_owngil_supported, 0}, - {"subinterp_worker_new", 0, nif_subinterp_worker_new, 0}, - {"subinterp_worker_destroy", 1, nif_subinterp_worker_destroy, 0}, - {"subinterp_call", 5, nif_subinterp_call, ERL_NIF_DIRTY_JOB_CPU_BOUND}, - {"parallel_execute", 2, nif_parallel_execute, ERL_NIF_DIRTY_JOB_CPU_BOUND}, - /* OWN_GIL subinterpreter thread pool (true parallelism) */ + /* OWN_GIL thread pool (used internally by py_event_loop_pool) */ {"subinterp_thread_pool_start", 0, nif_subinterp_thread_pool_start, 0}, {"subinterp_thread_pool_start", 1, nif_subinterp_thread_pool_start, 0}, {"subinterp_thread_pool_stop", 0, nif_subinterp_thread_pool_stop, 0}, {"subinterp_thread_pool_ready", 0, nif_subinterp_thread_pool_ready, 0}, {"subinterp_thread_pool_stats", 0, nif_subinterp_thread_pool_stats, 0}, - {"subinterp_thread_create", 0, nif_subinterp_thread_create, 0}, - {"subinterp_thread_destroy", 1, nif_subinterp_thread_destroy, 0}, - {"subinterp_thread_call", 4, nif_subinterp_thread_call, ERL_NIF_DIRTY_JOB_CPU_BOUND}, - {"subinterp_thread_call", 5, nif_subinterp_thread_call, ERL_NIF_DIRTY_JOB_CPU_BOUND}, - 
{"subinterp_thread_eval", 2, nif_subinterp_thread_eval, ERL_NIF_DIRTY_JOB_CPU_BOUND}, - {"subinterp_thread_eval", 3, nif_subinterp_thread_eval, ERL_NIF_DIRTY_JOB_CPU_BOUND}, - {"subinterp_thread_exec", 2, nif_subinterp_thread_exec, ERL_NIF_DIRTY_JOB_CPU_BOUND}, - {"subinterp_thread_cast", 4, nif_subinterp_thread_cast, 0}, - {"subinterp_thread_async_call", 6, nif_subinterp_thread_async_call, 0}, /* OWN_GIL session management for event loop pool */ {"owngil_create_session", 1, nif_owngil_create_session, 0}, @@ -7061,7 +7800,6 @@ static ErlNifFunc nif_funcs[] = { /* Execution mode info */ {"execution_mode", 0, nif_execution_mode, 0}, - {"num_executors", 0, nif_num_executors, 0}, /* Thread worker support (ThreadPoolExecutor) */ {"thread_worker_set_coordinator", 1, nif_thread_worker_set_coordinator, 0}, @@ -7158,6 +7896,13 @@ static ErlNifFunc nif_funcs[] = { {"context_exec", 3, nif_context_exec_with_env, ERL_NIF_DIRTY_JOB_CPU_BOUND}, {"context_eval", 4, nif_context_eval_with_env, ERL_NIF_DIRTY_JOB_CPU_BOUND}, {"context_call", 6, nif_context_call_with_env, ERL_NIF_DIRTY_JOB_CPU_BOUND}, + /* Async dispatch - non-blocking, returns immediately */ + {"context_call_async", 7, nif_context_call_async, 0}, + {"context_eval_async", 5, nif_context_eval_async, 0}, + {"context_exec_async", 4, nif_context_exec_async, 0}, + {"context_call_with_env_async", 8, nif_context_call_with_env_async, 0}, + {"context_eval_with_env_async", 6, nif_context_eval_with_env_async, 0}, + {"context_exec_with_env_async", 5, nif_context_exec_with_env_async, 0}, {"create_local_env", 1, nif_create_local_env, 0}, {"interp_apply_imports", 2, nif_interp_apply_imports, ERL_NIF_DIRTY_JOB_CPU_BOUND}, {"interp_apply_paths", 2, nif_interp_apply_paths, ERL_NIF_DIRTY_JOB_CPU_BOUND}, diff --git a/c_src/py_nif.h b/c_src/py_nif.h index 050856a..e353fd5 100644 --- a/c_src/py_nif.h +++ b/c_src/py_nif.h @@ -179,20 +179,13 @@ typedef enum { PY_MODE_FREE_THREADED, /** - * @brief Sub-interpreter mode (Python 3.12+) + * @brief 
Conventional GIL mode (every other supported build) * - * Each sub-interpreter has its own GIL, allowing parallel execution - * across interpreters while maintaining GIL semantics within each. + * Coordinator-side work runs through the single executor thread. + * Per-context worker / OWN_GIL pthreads handle the public context + * APIs directly; this mode label only governs the coordinator path. */ - PY_MODE_SUBINTERP, - - /** - * @brief Multi-executor mode (all Python versions) - * - * Multiple executor threads share the GIL using a work-stealing - * pattern. This is the fallback mode for older Python versions. - */ - PY_MODE_MULTI_EXECUTOR + PY_MODE_GIL } py_execution_mode_t; /** @} */ @@ -387,9 +380,6 @@ typedef struct { /** @brief Environment for building callback messages */ ErlNifEnv *callback_env; - - /** @brief Assigned executor ID for thread affinity (-1 = round-robin) */ - int executor_id; } py_worker_t; /* async_pending_t and py_async_worker_t removed - async workers replaced by event loop model */ @@ -639,10 +629,10 @@ typedef struct { size_t result_len; /** @brief Flag: result is available for replay */ - volatile bool has_result; + _Atomic bool has_result; /** @brief Flag: result represents an error */ - volatile bool is_error; + _Atomic bool is_error; /* Synchronization */ @@ -665,40 +655,6 @@ typedef struct { * @{ */ -#ifdef HAVE_SUBINTERPRETERS -/** - * @struct py_subinterp_worker_t - * @brief Worker running in an isolated sub-interpreter - * - * Sub-interpreters provide true isolation with their own GIL, - * enabling parallel Python execution on Python 3.12+. - * - * The mutex ensures thread-safe access when multiple dirty scheduler - * threads attempt to use the same worker concurrently. 
- * - * @note Only available when compiled with Python 3.12+ - * - * @see nif_subinterp_worker_new - * @see nif_subinterp_call - */ -typedef struct { - /** @brief Mutex for thread-safe access from multiple dirty schedulers */ - pthread_mutex_t mutex; - - /** @brief Python interpreter state */ - PyInterpreterState *interp; - - /** @brief Thread state for this interpreter */ - PyThreadState *tstate; - - /** @brief Global namespace dictionary */ - PyObject *globals; - - /** @brief Local namespace dictionary */ - PyObject *locals; -} py_subinterp_worker_t; -#endif - /** * @enum py_cmd_type_t * @brief Command types for thread-per-context dispatch @@ -741,6 +697,172 @@ typedef enum { CTX_REQ_APPLY_PATHS /**< Apply paths to sys.path */ } ctx_request_type_t; +/** + * @struct ctx_request_t + * @brief Heap-allocated request for worker/owngil context queue + * + * Each request is heap-allocated with its own mutex/condvar for completion + * signaling. This replaces the single-slot pattern that had race conditions + * with multiple concurrent callers. + * + * Lifecycle: + * 1. Caller allocates request with ctx_request_create() + * 2. Caller fills in request data and copies terms to request_env + * 3. Caller enqueues request and increments refcount (now 2: caller + queue) + * 4. Worker dequeues request, processes it, fills result_env/result + * 5. Worker sends result via enif_send() and releases queue's ref + * 6. Caller receives result and releases its ref + * 7. When refcount hits 0, request is freed + * + * For OWN_GIL mode, the worker thread sends results via enif_send() to avoid + * blocking dirty schedulers. For worker mode (main interpreter), the same + * pattern is used for consistency. 
+ */ +typedef struct ctx_request { + /** @brief Type of request */ + ctx_request_type_t type; + + /** @brief Per-request mutex for completion synchronization */ + pthread_mutex_t mutex; + + /** @brief Per-request condition for completion signaling */ + pthread_cond_t cond; + + /** @brief Set by worker when done (for blocking wait mode) */ + _Atomic bool completed; + + /** @brief Set by caller on timeout/destroy to skip processing */ + _Atomic bool cancelled; + + /* Request data (owned by this struct, not caller) */ + + /** @brief Environment for request terms (created by caller) */ + ErlNifEnv *request_env; + + /** @brief Request parameters (in request_env) */ + ERL_NIF_TERM request_data; + + /** @brief Process-local env pointer for WITH_ENV requests */ + void *local_env_ptr; + + /** @brief Reactor buffer pointer for reactor requests */ + void *reactor_buffer_ptr; + + /** @brief FD for reactor requests */ + int reactor_fd; + + /* Result data (owned by this struct) */ + + /** @brief Environment for result terms (created by worker) */ + ErlNifEnv *result_env; + + /** @brief Result term (in result_env) */ + ERL_NIF_TERM result; + + /** @brief True if request succeeded */ + bool success; + + /* Async delivery (for non-blocking dispatch) */ + + /** @brief Caller's PID for async result delivery */ + ErlNifPid caller_pid; + + /** @brief Request ID for correlating async responses */ + ERL_NIF_TERM request_id; + + /** @brief Whether to use async delivery vs blocking wait */ + bool async_mode; + + /* Queue management */ + + /** @brief Reference count (2=caller+queue, 1=one side, 0=free) */ + _Atomic int refcount; + + /** @brief Next request in queue */ + struct ctx_request *next; +} ctx_request_t; + +/** + * @brief Create a new context request + * + * Rolls back partial state on any init failure: pthread_mutex_init, + * pthread_cond_init, or enif_alloc_env() can each fail under resource + * pressure. 
Returning NULL keeps callers safe — every call site + * already tests the result. + * + * @return Newly allocated request with refcount=1, or NULL on failure + */ +static inline ctx_request_t *ctx_request_create(void) { + ctx_request_t *req = enif_alloc(sizeof(ctx_request_t)); + if (req == NULL) { + return NULL; + } + memset(req, 0, sizeof(ctx_request_t)); + + if (pthread_mutex_init(&req->mutex, NULL) != 0) { + enif_free(req); + return NULL; + } + if (pthread_cond_init(&req->cond, NULL) != 0) { + pthread_mutex_destroy(&req->mutex); + enif_free(req); + return NULL; + } + req->request_env = enif_alloc_env(); + if (req->request_env == NULL) { + pthread_cond_destroy(&req->cond); + pthread_mutex_destroy(&req->mutex); + enif_free(req); + return NULL; + } + + atomic_store(&req->completed, false); + atomic_store(&req->cancelled, false); + atomic_store(&req->refcount, 1); + req->result_env = NULL; /* Created by worker when processing */ + req->next = NULL; + req->async_mode = false; + req->reactor_fd = -1; + req->local_env_ptr = NULL; + req->reactor_buffer_ptr = NULL; + + return req; +} + +/** + * @brief Add a reference to a context request + * @param req The request + */ +static inline void ctx_request_addref(ctx_request_t *req) { + if (req) { + atomic_fetch_add(&req->refcount, 1); + } +} + +/** + * @brief Release a reference to a context request + * @param req The request (may be NULL) + * + * When refcount reaches 0, frees mutex/cond/envs and the request struct. 
+ */ +static inline void ctx_request_release(ctx_request_t *req) { + if (req == NULL) return; + + int prev = atomic_fetch_sub(&req->refcount, 1); + if (prev == 1) { + /* Last reference - free everything */ + pthread_mutex_destroy(&req->mutex); + pthread_cond_destroy(&req->cond); + if (req->request_env) { + enif_free_env(req->request_env); + } + if (req->result_env) { + enif_free_env(req->result_env); + } + enif_free(req); + } +} + /** * @struct py_cmd_t * @brief Command structure for thread-per-context dispatch @@ -804,8 +926,11 @@ struct py_context { /** @brief Context mode: true=subinterpreter, false=worker */ bool is_subinterp; - /** @brief Flag indicating context has been destroyed */ - bool destroyed; + /** @brief Flag indicating context has been destroyed (atomic for thread safety) */ + _Atomic bool destroyed; + + /** @brief Flag: context resources leaked due to unresponsive worker */ + _Atomic bool leaked; /** @brief Flag: callback handler is configured */ bool has_callback_handler; @@ -816,70 +941,79 @@ struct py_context { /** @brief Pipe for callback responses [read, write] */ int callback_pipe[2]; -#ifdef HAVE_SUBINTERPRETERS - /* ========== OWN_GIL mode fields ========== */ + /* ========== Worker thread fields (used by both worker and owngil modes) ========== */ - /** @brief Whether this context uses OWN_GIL mode (dedicated pthread) */ - bool uses_own_gil; + /** @brief Dedicated pthread for this context */ + pthread_t worker_thread; - /** @brief Dedicated pthread for OWN_GIL mode */ - pthread_t own_gil_thread; + /** @brief True when worker thread is running */ + _Atomic bool worker_running; - /** @brief Thread state for OWN_GIL subinterpreter */ - PyThreadState *own_gil_tstate; + /** @brief True when shutdown has been requested */ + _Atomic bool shutdown_requested; - /** @brief Interpreter state for OWN_GIL subinterpreter */ - PyInterpreterState *own_gil_interp; + /** @brief True if this context uses a dedicated worker thread (worker mode) */ + bool 
uses_worker_thread; + + /** @brief True if thread initialization failed */ + _Atomic bool init_error; - /* IPC via condition variables */ + /* ========== Request queue (replaces single-slot pattern) ========== */ - /** @brief Mutex for request/response synchronization */ - pthread_mutex_t request_mutex; + /** @brief Mutex protecting the request queue */ + pthread_mutex_t queue_mutex; - /** @brief Condition variable: request ready for processing */ - pthread_cond_t request_ready; + /** @brief Condition variable: work available in queue */ + pthread_cond_t queue_not_empty; - /** @brief Condition variable: response ready for caller */ - pthread_cond_t response_ready; + /** @brief Head of request queue (dequeue from here) */ + ctx_request_t *queue_head; - /* Request/response state */ + /** @brief Tail of request queue (enqueue here) */ + ctx_request_t *queue_tail; - /** @brief Current request type (CTX_REQ_*) */ - int request_type; + /** @brief Environment for sending messages back to Erlang */ + ErlNifEnv *msg_env; - /** @brief Shared environment for zero-copy term passing */ + /* ========== Legacy compatibility fields (populated from queue request) ========== */ + /* These fields are populated by the worker thread from the current request + * for compatibility with existing execute functions. They will be removed + * once all execute functions are refactored to use ctx_request_t directly. 
*/ + + /** @brief Shared env for current request (points to current req->request_env) */ ErlNifEnv *shared_env; - /** @brief Request term (copied into shared_env) */ - ERL_NIF_TERM request_term; + /** @brief Current request type */ + int request_type; - /** @brief Additional request data (e.g., modules list for flush) */ - ERL_NIF_TERM request_data; + /** @brief Current request data term */ + ERL_NIF_TERM request_term; - /** @brief Response term (created in shared_env) */ + /** @brief Response term for current request */ ERL_NIF_TERM response_term; - /** @brief True if response indicates success */ + /** @brief Success flag for current request */ bool response_ok; - /** @brief Auxiliary pointer for reactor buffer (OWN_GIL dispatch) */ + /** @brief Reactor buffer pointer for current request */ void *reactor_buffer_ptr; - /** @brief Process-local env pointer for OWN_GIL dispatch (py_env_resource_t*) */ + /** @brief Process-local env pointer for current request */ void *local_env_ptr; - /* Lifecycle flags */ +#ifdef HAVE_SUBINTERPRETERS + /* ========== OWN_GIL specific fields ========== */ - /** @brief True when worker thread is running */ - _Atomic bool thread_running; + /** @brief Whether this context uses OWN_GIL mode (subinterpreter with own GIL) */ + bool uses_own_gil; - /** @brief True if thread initialization failed */ - _Atomic bool init_error; + /** @brief Thread state for OWN_GIL subinterpreter */ + PyThreadState *own_gil_tstate; - /** @brief True when shutdown has been requested */ - _Atomic bool shutdown_requested; + /** @brief Interpreter state for OWN_GIL subinterpreter */ + PyInterpreterState *own_gil_interp; #else - /** @brief Worker thread state (non-subinterp mode) */ + /** @brief Worker thread state (non-subinterp mode, kept for compatibility) */ PyThreadState *thread_state; #endif @@ -891,9 +1025,6 @@ struct py_context { /** @brief Module cache (Dict: module_name -> PyModule) */ PyObject *module_cache; - - /** @brief Assigned executor ID for thread 
affinity (-1 = not assigned) */ - int executor_id; }; /* ============================================================================ @@ -1005,7 +1136,7 @@ static inline py_context_guard_t py_context_acquire(py_context_t *ctx) { .acquired = false }; - if (ctx == NULL || ctx->destroyed) { + if (ctx == NULL || atomic_load(&ctx->destroyed)) { return guard; } @@ -1107,10 +1238,10 @@ typedef struct { size_t result_len; /** @brief Flag: result is available for replay */ - volatile bool has_result; + _Atomic bool has_result; /** @brief Flag: result represents an error */ - volatile bool is_error; + _Atomic bool is_error; /* Sequential callback support - stores all accumulated callback results */ @@ -1200,63 +1331,6 @@ typedef struct { /** @} */ -/* ============================================================================ - * Executor Pool - * ============================================================================ */ - -/** - * @defgroup executor Executor Pool - * @brief Multi-executor thread pool for GIL management - * @{ - */ - -/** - * @def MIN_EXECUTORS - * @brief Minimum number of executor threads in the pool - */ -#define MIN_EXECUTORS 2 - -/** - * @def MAX_EXECUTORS - * @brief Maximum number of executor threads in the pool - */ -#define MAX_EXECUTORS 32 - -/** - * @struct executor_t - * @brief Single executor thread in the multi-executor pool - * - * Each executor has its own request queue and processes requests - * independently. The GIL is acquired/released around queue operations. 
- */ -typedef struct { - /** @brief Executor thread handle */ - pthread_t thread; - - /** @brief Mutex protecting the request queue */ - pthread_mutex_t mutex; - - /** @brief Condition variable for queue signaling */ - pthread_cond_t cond; - - /** @brief Head of request queue */ - struct py_request *queue_head; - - /** @brief Tail of request queue */ - struct py_request *queue_tail; - - /** @brief Flag: executor is running */ - volatile bool running; - - /** @brief Flag: executor should shut down */ - volatile bool shutdown; - - /** @brief Executor ID (0 to MAX_EXECUTORS-1) */ - int id; -} executor_t; - -/** @} */ - /* ============================================================================ * Global State Declarations * ============================================================================ */ @@ -1278,11 +1352,6 @@ extern ErlNifResourceType *PYOBJ_RESOURCE_TYPE; /** @brief Resource type for suspended_state_t */ extern ErlNifResourceType *SUSPENDED_STATE_RESOURCE_TYPE; -#ifdef HAVE_SUBINTERPRETERS -/** @brief Resource type for py_subinterp_worker_t */ -extern ErlNifResourceType *SUBINTERP_WORKER_RESOURCE_TYPE; -#endif - /** @brief Resource type for py_context_t (process-per-context) */ extern ErlNifResourceType *PY_CONTEXT_RESOURCE_TYPE; @@ -1364,18 +1433,6 @@ extern PyThreadState *g_main_thread_state; /** @brief Current execution mode */ extern py_execution_mode_t g_execution_mode; -/** @brief Number of active executors */ -extern int g_num_executors; - -/** @brief Multi-executor pool array */ -extern executor_t g_executors[MAX_EXECUTORS]; - -/** @brief Round-robin counter for executor selection */ -extern _Atomic int g_next_executor; - -/** @brief Flag: multi-executor pool is initialized (atomic for thread-safe access) */ -extern _Atomic bool g_multi_executor_initialized; - /* Single executor state */ /** @brief Single executor thread handle */ @@ -1841,90 +1898,6 @@ static int executor_start(void); */ static void executor_stop(void); -/** - * @brief 
Main function for multi-executor threads - * - * Thread entry point for executor pool threads. Processes - * requests from its queue until shutdown. - * - * @param arg Pointer to executor_t for this thread - * @return NULL - */ -static void *multi_executor_thread_main(void *arg); - -/** - * @brief Start the multi-executor pool - * - * Creates and starts num_executors threads. - * - * @param num_executors Number of executors (capped at MAX_EXECUTORS) - * @return 0 on success, -1 on failure - */ -static int multi_executor_start(int num_executors); - -/** - * @brief Stop the multi-executor pool - * - * Signals shutdown and waits for all executor threads. - */ -static void multi_executor_stop(void); - -/** - * @brief Select an executor using round-robin - * - * @return Executor index (0 to g_num_executors-1) - */ -static int select_executor(void); - -/** - * @brief Submit a request to a specific executor - * - * @param exec_id Executor index - * @param req Request to submit - */ -static void multi_executor_enqueue(int exec_id, struct py_request *req); - -/** - * @brief Dispatch a context call operation to the executor - * - * Used when a context has thread affinity (executor_id >= 0) to ensure - * numpy/torch thread-local state consistency. 
- * - * @param env Caller's NIF environment - * @param ctx Context with thread affinity - * @param module_bin Module name binary - * @param func_bin Function name binary - * @param args_term Arguments list - * @param kwargs_term Keyword arguments map - * @return Result term - */ -ERL_NIF_TERM context_dispatch_call(ErlNifEnv *env, py_context_t *ctx, - ErlNifBinary *module_bin, ErlNifBinary *func_bin, - ERL_NIF_TERM args_term, ERL_NIF_TERM kwargs_term); - -/** - * @brief Dispatch a context eval operation to the executor - * - * @param env Caller's NIF environment - * @param ctx Context with thread affinity - * @param code_bin Code string binary - * @param locals_term Local variables map - * @return Result term - */ -ERL_NIF_TERM context_dispatch_eval(ErlNifEnv *env, py_context_t *ctx, - ErlNifBinary *code_bin, ERL_NIF_TERM locals_term); - -/** - * @brief Dispatch a context exec operation to the executor - * - * @param env Caller's NIF environment - * @param ctx Context with thread affinity - * @param code_bin Code string binary - * @return Result term - */ -ERL_NIF_TERM context_dispatch_exec(ErlNifEnv *env, py_context_t *ctx, - ErlNifBinary *code_bin); - /** @} */ /* ============================================================================ diff --git a/c_src/py_shared_dict.c b/c_src/py_shared_dict.c index 2d1c473..a55c3d0 100644 --- a/c_src/py_shared_dict.c +++ b/c_src/py_shared_dict.c @@ -30,33 +30,6 @@ * Resource Callbacks * ============================================================================ */ -/** - * @brief Down callback for py_shared_dict_t - * - * Called when the owning process dies. Sets destroyed flag and clears the dict. - * This callback is invoked by the runtime when the monitored process terminates. 
- */ -static void shared_dict_down(ErlNifEnv *env, void *obj, - ErlNifPid *pid, ErlNifMonitor *mon) { - (void)env; - (void)pid; - (void)mon; - py_shared_dict_t *sd = (py_shared_dict_t *)obj; - - /* Mark as destroyed - subsequent access will return badarg */ - atomic_store(&sd->destroyed, true); - sd->monitor_active = false; - - /* Clear the Python dict if runtime is still running */ - if (runtime_is_running() && sd->dict != NULL) { - PyGILState_STATE gstate = PyGILState_Ensure(); - pthread_mutex_lock(&sd->mutex); - Py_CLEAR(sd->dict); - pthread_mutex_unlock(&sd->mutex); - PyGILState_Release(gstate); - } -} - /** * @brief Destructor for py_shared_dict_t * @@ -125,8 +98,11 @@ static ERL_NIF_TERM nif_shared_dict_new(ErlNifEnv *env, int argc, } PyGILState_Release(gstate); - /* Note: Process monitoring disabled for now to debug crash - * SharedDict will be garbage collected when no references remain */ + /* SharedDict is GC-scoped: the resource destructor runs once the + * last term reference is dropped (or at process exit), at which + * point the underlying Python dict is cleared. There is no + * per-process monitor — callers that want eager release must call + * shared_dict_destroy/1 explicitly. 
*/ sd->monitor_active = false; /* Create reference term and release our reference */ diff --git a/c_src/py_subinterp_thread.c b/c_src/py_subinterp_thread.c index ee97c9c..72a1552 100644 --- a/c_src/py_subinterp_thread.c +++ b/c_src/py_subinterp_thread.c @@ -40,9 +40,6 @@ /** @brief Global thread pool instance */ subinterp_thread_pool_t g_thread_pool = {0}; -/** @brief Resource type for handles (set by NIF load) */ -ErlNifResourceType *PY_SUBINTERP_HANDLE_RESOURCE_TYPE = NULL; - /* Forward declarations */ static void *worker_thread_main(void *arg); static int worker_create_namespace(subinterp_thread_worker_t *w, uint64_t handle_id); @@ -1087,397 +1084,6 @@ static subinterp_namespace_t *worker_find_namespace(subinterp_thread_worker_t *w return NULL; } -/* ============================================================================ - * Handle Management - * ============================================================================ */ - -int subinterp_thread_handle_create(py_subinterp_handle_t *handle) { - if (!subinterp_thread_pool_is_ready()) { - return -1; - } - - /* Select worker round-robin */ - uint64_t worker_idx = atomic_fetch_add(&g_thread_pool.next_worker, 1); - int worker_id = worker_idx % g_thread_pool.num_workers; - - /* Generate unique handle ID */ - uint64_t handle_id = atomic_fetch_add(&g_thread_pool.next_handle_id, 1); - - handle->worker_id = worker_id; - handle->handle_id = handle_id; - atomic_store(&handle->destroyed, false); - - /* Create namespace in worker */ - subinterp_thread_worker_t *w = &g_thread_pool.workers[worker_id]; - - /* Lock dispatch to ensure exclusive access */ - pthread_mutex_lock(&w->dispatch_mutex); - - /* Send create namespace request */ - uint64_t request_id = atomic_fetch_add(&g_thread_pool.next_request_id, 1); - owngil_header_t header = { - .magic = OWNGIL_MAGIC, - .version = OWNGIL_PROTOCOL_VERSION, - .msg_type = MSG_REQUEST, - .req_type = REQ_CREATE_NS, - .request_id = request_id, - .handle_id = handle_id, - .payload_len 
= 0, - }; - - write_full(w->cmd_pipe[1], &header, sizeof(header)); - - /* Wait for response */ - owngil_header_t resp; - read_full(w->result_pipe[0], &resp, sizeof(resp)); - - pthread_mutex_unlock(&w->dispatch_mutex); - - return (resp.msg_type == MSG_RESPONSE) ? 0 : -1; -} - -void subinterp_thread_handle_destroy(py_subinterp_handle_t *handle) { - if (atomic_exchange(&handle->destroyed, true)) { - return; /* Already destroyed */ - } - - if (!subinterp_thread_pool_is_ready()) { - return; - } - - if (handle->worker_id < 0 || handle->worker_id >= g_thread_pool.num_workers) { - return; - } - - subinterp_thread_worker_t *w = &g_thread_pool.workers[handle->worker_id]; - - pthread_mutex_lock(&w->dispatch_mutex); - - /* Send destroy namespace request */ - uint64_t request_id = atomic_fetch_add(&g_thread_pool.next_request_id, 1); - owngil_header_t header = { - .magic = OWNGIL_MAGIC, - .version = OWNGIL_PROTOCOL_VERSION, - .msg_type = MSG_REQUEST, - .req_type = REQ_DESTROY_NS, - .request_id = request_id, - .handle_id = handle->handle_id, - .payload_len = 0, - }; - - write_full(w->cmd_pipe[1], &header, sizeof(header)); - - /* Wait for response */ - owngil_header_t resp; - read_full(w->result_pipe[0], &resp, sizeof(resp)); - - pthread_mutex_unlock(&w->dispatch_mutex); -} - -/* ============================================================================ - * Execution API - * ============================================================================ */ - -ERL_NIF_TERM subinterp_thread_call(ErlNifEnv *env, py_subinterp_handle_t *handle, - ERL_NIF_TERM module, ERL_NIF_TERM func, - ERL_NIF_TERM args, ERL_NIF_TERM kwargs) { - if (atomic_load(&handle->destroyed)) { - return enif_make_tuple2(env, enif_make_atom(env, "error"), - enif_make_atom(env, "handle_destroyed")); - } - - if (!subinterp_thread_pool_is_ready()) { - return enif_make_tuple2(env, enif_make_atom(env, "error"), - enif_make_atom(env, "pool_not_ready")); - } - - subinterp_thread_worker_t *w = 
&g_thread_pool.workers[handle->worker_id]; - - /* Build payload tuple: {Module, Func, Args, Kwargs} */ - ERL_NIF_TERM payload_tuple = enif_make_tuple4(env, module, func, args, kwargs); - - /* Serialize to ETF */ - ErlNifBinary payload_bin; - if (!enif_term_to_binary(env, payload_tuple, &payload_bin)) { - return enif_make_tuple2(env, enif_make_atom(env, "error"), - enif_make_atom(env, "serialization_failed")); - } - - /* Lock dispatch */ - pthread_mutex_lock(&w->dispatch_mutex); - - /* Send request */ - uint64_t request_id = atomic_fetch_add(&g_thread_pool.next_request_id, 1); - owngil_header_t header = { - .magic = OWNGIL_MAGIC, - .version = OWNGIL_PROTOCOL_VERSION, - .msg_type = MSG_REQUEST, - .req_type = REQ_CALL, - .request_id = request_id, - .handle_id = handle->handle_id, - .payload_len = payload_bin.size, - }; - - write_full(w->cmd_pipe[1], &header, sizeof(header)); - write_full(w->cmd_pipe[1], payload_bin.data, payload_bin.size); - enif_release_binary(&payload_bin); - - /* Read response */ - owngil_header_t resp_header; - if (read_full(w->result_pipe[0], &resp_header, sizeof(resp_header)) != sizeof(resp_header)) { - pthread_mutex_unlock(&w->dispatch_mutex); - return enif_make_tuple2(env, enif_make_atom(env, "error"), - enif_make_atom(env, "read_failed")); - } - - ERL_NIF_TERM result; - if (resp_header.payload_len > 0) { - unsigned char *resp_payload = enif_alloc(resp_header.payload_len); - if (resp_payload == NULL) { - pthread_mutex_unlock(&w->dispatch_mutex); - return enif_make_tuple2(env, enif_make_atom(env, "error"), - enif_make_atom(env, "alloc_failed")); - } - - if (read_full(w->result_pipe[0], resp_payload, resp_header.payload_len) - != (int)resp_header.payload_len) { - enif_free(resp_payload); - pthread_mutex_unlock(&w->dispatch_mutex); - return enif_make_tuple2(env, enif_make_atom(env, "error"), - enif_make_atom(env, "read_failed")); - } - - /* Deserialize response */ - if (enif_binary_to_term(env, resp_payload, resp_header.payload_len, - &result, 0) 
== 0) { - result = enif_make_tuple2(env, enif_make_atom(env, "error"), - enif_make_atom(env, "deserialize_failed")); - } - - enif_free(resp_payload); - } else { - result = enif_make_atom(env, "ok"); - } - - pthread_mutex_unlock(&w->dispatch_mutex); - return result; -} - -ERL_NIF_TERM subinterp_thread_eval(ErlNifEnv *env, py_subinterp_handle_t *handle, - ERL_NIF_TERM code, ERL_NIF_TERM locals) { - if (atomic_load(&handle->destroyed)) { - return enif_make_tuple2(env, enif_make_atom(env, "error"), - enif_make_atom(env, "handle_destroyed")); - } - - if (!subinterp_thread_pool_is_ready()) { - return enif_make_tuple2(env, enif_make_atom(env, "error"), - enif_make_atom(env, "pool_not_ready")); - } - - subinterp_thread_worker_t *w = &g_thread_pool.workers[handle->worker_id]; - - /* Build payload tuple: {Code, Locals} */ - ERL_NIF_TERM payload_tuple = enif_make_tuple2(env, code, locals); - - ErlNifBinary payload_bin; - if (!enif_term_to_binary(env, payload_tuple, &payload_bin)) { - return enif_make_tuple2(env, enif_make_atom(env, "error"), - enif_make_atom(env, "serialization_failed")); - } - - pthread_mutex_lock(&w->dispatch_mutex); - - uint64_t request_id = atomic_fetch_add(&g_thread_pool.next_request_id, 1); - owngil_header_t header = { - .magic = OWNGIL_MAGIC, - .version = OWNGIL_PROTOCOL_VERSION, - .msg_type = MSG_REQUEST, - .req_type = REQ_EVAL, - .request_id = request_id, - .handle_id = handle->handle_id, - .payload_len = payload_bin.size, - }; - - write_full(w->cmd_pipe[1], &header, sizeof(header)); - write_full(w->cmd_pipe[1], payload_bin.data, payload_bin.size); - enif_release_binary(&payload_bin); - - owngil_header_t resp_header; - if (read_full(w->result_pipe[0], &resp_header, sizeof(resp_header)) != sizeof(resp_header)) { - pthread_mutex_unlock(&w->dispatch_mutex); - return enif_make_tuple2(env, enif_make_atom(env, "error"), - enif_make_atom(env, "read_failed")); - } - - ERL_NIF_TERM result; - if (resp_header.payload_len > 0) { - unsigned char *resp_payload = 
enif_alloc(resp_header.payload_len); - if (resp_payload == NULL) { - pthread_mutex_unlock(&w->dispatch_mutex); - return enif_make_tuple2(env, enif_make_atom(env, "error"), - enif_make_atom(env, "alloc_failed")); - } - - if (read_full(w->result_pipe[0], resp_payload, resp_header.payload_len) - != (int)resp_header.payload_len) { - enif_free(resp_payload); - pthread_mutex_unlock(&w->dispatch_mutex); - return enif_make_tuple2(env, enif_make_atom(env, "error"), - enif_make_atom(env, "read_failed")); - } - - if (enif_binary_to_term(env, resp_payload, resp_header.payload_len, - &result, 0) == 0) { - result = enif_make_tuple2(env, enif_make_atom(env, "error"), - enif_make_atom(env, "deserialize_failed")); - } - - enif_free(resp_payload); - } else { - result = enif_make_atom(env, "ok"); - } - - pthread_mutex_unlock(&w->dispatch_mutex); - return result; -} - -ERL_NIF_TERM subinterp_thread_exec(ErlNifEnv *env, py_subinterp_handle_t *handle, - ERL_NIF_TERM code) { - if (atomic_load(&handle->destroyed)) { - return enif_make_tuple2(env, enif_make_atom(env, "error"), - enif_make_atom(env, "handle_destroyed")); - } - - if (!subinterp_thread_pool_is_ready()) { - return enif_make_tuple2(env, enif_make_atom(env, "error"), - enif_make_atom(env, "pool_not_ready")); - } - - subinterp_thread_worker_t *w = &g_thread_pool.workers[handle->worker_id]; - - /* Build payload tuple: {Code} */ - ERL_NIF_TERM payload_tuple = enif_make_tuple1(env, code); - - ErlNifBinary payload_bin; - if (!enif_term_to_binary(env, payload_tuple, &payload_bin)) { - return enif_make_tuple2(env, enif_make_atom(env, "error"), - enif_make_atom(env, "serialization_failed")); - } - - pthread_mutex_lock(&w->dispatch_mutex); - - uint64_t request_id = atomic_fetch_add(&g_thread_pool.next_request_id, 1); - owngil_header_t header = { - .magic = OWNGIL_MAGIC, - .version = OWNGIL_PROTOCOL_VERSION, - .msg_type = MSG_REQUEST, - .req_type = REQ_EXEC, - .request_id = request_id, - .handle_id = handle->handle_id, - .payload_len = 
payload_bin.size, - }; - - write_full(w->cmd_pipe[1], &header, sizeof(header)); - write_full(w->cmd_pipe[1], payload_bin.data, payload_bin.size); - enif_release_binary(&payload_bin); - - owngil_header_t resp_header; - if (read_full(w->result_pipe[0], &resp_header, sizeof(resp_header)) != sizeof(resp_header)) { - pthread_mutex_unlock(&w->dispatch_mutex); - return enif_make_tuple2(env, enif_make_atom(env, "error"), - enif_make_atom(env, "read_failed")); - } - - ERL_NIF_TERM result; - if (resp_header.payload_len > 0) { - unsigned char *resp_payload = enif_alloc(resp_header.payload_len); - if (resp_payload == NULL) { - pthread_mutex_unlock(&w->dispatch_mutex); - return enif_make_tuple2(env, enif_make_atom(env, "error"), - enif_make_atom(env, "alloc_failed")); - } - - if (read_full(w->result_pipe[0], resp_payload, resp_header.payload_len) - != (int)resp_header.payload_len) { - enif_free(resp_payload); - pthread_mutex_unlock(&w->dispatch_mutex); - return enif_make_tuple2(env, enif_make_atom(env, "error"), - enif_make_atom(env, "read_failed")); - } - - if (enif_binary_to_term(env, resp_payload, resp_header.payload_len, - &result, 0) == 0) { - result = enif_make_tuple2(env, enif_make_atom(env, "error"), - enif_make_atom(env, "deserialize_failed")); - } - - enif_free(resp_payload); - } else { - result = enif_make_atom(env, "ok"); - } - - pthread_mutex_unlock(&w->dispatch_mutex); - return result; -} - -ERL_NIF_TERM subinterp_thread_cast(ErlNifEnv *env, py_subinterp_handle_t *handle, - ERL_NIF_TERM module, ERL_NIF_TERM func, - ERL_NIF_TERM args) { - if (atomic_load(&handle->destroyed)) { - return enif_make_atom(env, "ok"); /* Silently ignore for cast */ - } - - if (!subinterp_thread_pool_is_ready()) { - return enif_make_atom(env, "ok"); /* Silently ignore for cast */ - } - - subinterp_thread_worker_t *w = &g_thread_pool.workers[handle->worker_id]; - - /* Build payload tuple: {Module, Func, Args} */ - ERL_NIF_TERM payload_tuple = enif_make_tuple3(env, module, func, args); - - 
ErlNifBinary payload_bin; - if (!enif_term_to_binary(env, payload_tuple, &payload_bin)) { - return enif_make_atom(env, "ok"); /* Silently fail for cast */ - } - - pthread_mutex_lock(&w->dispatch_mutex); - - uint64_t request_id = atomic_fetch_add(&g_thread_pool.next_request_id, 1); - owngil_header_t header = { - .magic = OWNGIL_MAGIC, - .version = OWNGIL_PROTOCOL_VERSION, - .msg_type = MSG_REQUEST, - .req_type = REQ_CAST, - .request_id = request_id, - .handle_id = handle->handle_id, - .payload_len = payload_bin.size, - }; - - write_full(w->cmd_pipe[1], &header, sizeof(header)); - write_full(w->cmd_pipe[1], payload_bin.data, payload_bin.size); - enif_release_binary(&payload_bin); - - pthread_mutex_unlock(&w->dispatch_mutex); - - return enif_make_atom(env, "ok"); -} - -ERL_NIF_TERM subinterp_thread_async_call(ErlNifEnv *env, py_subinterp_handle_t *handle, - ERL_NIF_TERM module, ERL_NIF_TERM func, - ERL_NIF_TERM args, ErlNifPid *caller_pid, - ERL_NIF_TERM ref) { - /* For async, we send the request but don't wait for response. - * The worker thread uses erlang.send() to deliver result. 
*/ - (void)caller_pid; - (void)ref; - - /* For now, implement as sync call - async requires erlang.send support */ - ERL_NIF_TERM kwargs = enif_make_new_map(env); - return subinterp_thread_call(env, handle, module, func, args, kwargs); -} - /* ============================================================================ * Utility Functions * ============================================================================ */ diff --git a/c_src/py_subinterp_thread.h b/c_src/py_subinterp_thread.h index a02b882..c590597 100644 --- a/c_src/py_subinterp_thread.h +++ b/c_src/py_subinterp_thread.h @@ -191,23 +191,6 @@ typedef struct { _Atomic uint64_t next_request_id; /**< Counter for request IDs */ } subinterp_thread_pool_t; -/* ============================================================================ - * Handle Resource (Erlang side) - * ============================================================================ */ - -/** - * @struct py_subinterp_handle_t - * @brief Erlang resource representing a subinterpreter handle - * - * A handle is bound to a specific worker at creation and has its own - * isolated namespace within that worker. - */ -typedef struct { - int worker_id; /**< Bound worker index (fixed at creation) */ - uint64_t handle_id; /**< Unique ID for namespace lookup */ - _Atomic bool destroyed; /**< Handle has been destroyed */ -} py_subinterp_handle_t; - /* ============================================================================ * Pool Management API * ============================================================================ */ @@ -252,108 +235,6 @@ void subinterp_thread_pool_stats(int *num_workers, uint64_t *total_requests, * Handle Management API * ============================================================================ */ -/** - * @brief Create a new subinterpreter handle - * - * Allocates a handle bound to a worker (round-robin selection) and - * creates a namespace for it within that worker. 
- * - * @param handle Output: handle structure to initialize - * @return 0 on success, -1 on failure - */ -int subinterp_thread_handle_create(py_subinterp_handle_t *handle); - -/** - * @brief Destroy a subinterpreter handle - * - * Cleans up the handle's namespace within its worker. - * - * @param handle Handle to destroy - */ -void subinterp_thread_handle_destroy(py_subinterp_handle_t *handle); - -/* ============================================================================ - * Execution API - * ============================================================================ */ - -/** - * @brief Synchronous call through subinterpreter handle - * - * Sends a call request to the worker and blocks until response. - * The dispatch_mutex ensures serialization per worker. - * - * @param env NIF environment - * @param handle Subinterpreter handle - * @param module Module name term (atom or binary) - * @param func Function name term (atom or binary) - * @param args Arguments list term - * @param kwargs Keyword arguments map term - * @return Result term: {ok, Result} | {error, Reason} - */ -ERL_NIF_TERM subinterp_thread_call(ErlNifEnv *env, py_subinterp_handle_t *handle, - ERL_NIF_TERM module, ERL_NIF_TERM func, - ERL_NIF_TERM args, ERL_NIF_TERM kwargs); - -/** - * @brief Synchronous eval through subinterpreter handle - * - * @param env NIF environment - * @param handle Subinterpreter handle - * @param code Code string term (binary) - * @param locals Local variables map term - * @return Result term: {ok, Result} | {error, Reason} - */ -ERL_NIF_TERM subinterp_thread_eval(ErlNifEnv *env, py_subinterp_handle_t *handle, - ERL_NIF_TERM code, ERL_NIF_TERM locals); - -/** - * @brief Synchronous exec through subinterpreter handle - * - * @param env NIF environment - * @param handle Subinterpreter handle - * @param code Code string term (binary) - * @return Result term: ok | {error, Reason} - */ -ERL_NIF_TERM subinterp_thread_exec(ErlNifEnv *env, py_subinterp_handle_t *handle, - 
ERL_NIF_TERM code); - -/** - * @brief Fire-and-forget call (no result) - * - * Sends request to worker but returns immediately without waiting. - * Used for side-effects where result is not needed. - * - * @param env NIF environment - * @param handle Subinterpreter handle - * @param module Module name term - * @param func Function name term - * @param args Arguments list term - * @return ok - */ -ERL_NIF_TERM subinterp_thread_cast(ErlNifEnv *env, py_subinterp_handle_t *handle, - ERL_NIF_TERM module, ERL_NIF_TERM func, - ERL_NIF_TERM args); - -/** - * @brief Async call - returns immediately with reference - * - * Sends request to worker. Worker uses erlang.send() to deliver result - * to caller_pid with the given ref. - * - * @param env NIF environment - * @param handle Subinterpreter handle - * @param module Module name term - * @param func Function name term - * @param args Arguments list term - * @param caller_pid PID to send result to - * @param ref Reference for result correlation - * @return ok - */ -ERL_NIF_TERM subinterp_thread_async_call(ErlNifEnv *env, py_subinterp_handle_t *handle, - ERL_NIF_TERM module, ERL_NIF_TERM func, - ERL_NIF_TERM args, ErlNifPid *caller_pid, - ERL_NIF_TERM ref); - /* ============================================================================ * Global Pool Instance * ============================================================================ */ @@ -361,9 +242,6 @@ ERL_NIF_TERM subinterp_thread_async_call(ErlNifEnv *env, py_subinterp_handle_t * /** @brief Global thread pool (defined in py_subinterp_thread.c) */ extern subinterp_thread_pool_t g_thread_pool; -/** @brief Resource type for py_subinterp_handle_t */ -extern ErlNifResourceType *PY_SUBINTERP_HANDLE_RESOURCE_TYPE; - #endif /* HAVE_SUBINTERPRETERS */ #endif /* PY_SUBINTERP_THREAD_H */ diff --git a/docs/asyncio.md b/docs/asyncio.md index f19ac0f..28b30e2 100644 --- a/docs/asyncio.md +++ b/docs/asyncio.md @@ -707,14 +707,15 @@ def sync_handler(): return "done" ``` -**Behavior 
by Context:** +**Behavior by context (v3.0 worker-pthread architecture):** -| Context | Mechanism | Effect | -|---------|-----------|--------| -| Async (`await erlang.sleep()`) | `asyncio.sleep()` via `call_later()` | Yields to event loop, dirty scheduler released | -| Sync (`erlang.sleep()`) | `erlang.call('_py_sleep')` with `receive/after` | Blocks Python, Erlang process suspends, dirty scheduler released | +| Context | Mechanism | What blocks | +|---------|-----------|-------------| +| Async (`await erlang.sleep()`) | `asyncio.sleep()` via Erlang `send_after` | Yields to the event loop. The worker pthread is free to handle other tasks. | +| Sync from `py:exec` / `py:eval` | `erlang.call('_py_sleep', secs)` triggers suspension; the dirty scheduler is released and an Erlang `receive ... after` parks the caller | Caller's Erlang process. Dirty scheduler free for other work. | +| Sync from `py:call` (worker mode) | Falls back to `time.sleep`; replaying the Python frame around a suspension would change time-measurement semantics | The context's worker pthread for the sleep duration. Async NIF dispatch returns immediately so the BEAM dirty scheduler is **not** held; other Erlang processes and other contexts run normally. | -Both modes allow other Erlang processes and Python contexts to run during the sleep. +In every case the BEAM dirty scheduler is freed during the sleep — the difference is which thread blocks (the caller's Erlang process or the context's worker pthread). #### asyncio.sleep(delay) @@ -923,7 +924,7 @@ erlang.run(main()) ## Async Worker Backend (Internal) -The `py:async_call/3,4` and `py:await/1,2` APIs use an event-driven backend based on `py_event_loop`. +The `py:async_call/3,4` and `py:async_await/1,2` APIs use an event-driven backend based on `py_event_loop`. 
### Architecture diff --git a/docs/getting-started.md b/docs/getting-started.md index 8f1ce8d..817ddf9 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -8,7 +8,7 @@ Add to your `rebar.config`: ```erlang {deps, [ - {erlang_python, "2.3.0"} + {erlang_python, "3.0.0"} ]}. ``` @@ -266,38 +266,33 @@ ok = py:deactivate_venv(). ### Context Modes -When creating explicit contexts, you can choose different execution modes: +When creating explicit contexts, you can choose between the two public modes: ```erlang -%% Worker mode (default, recommended) - main interpreter -%% With free-threaded Python (3.13t+), provides true parallelism automatically +%% Worker mode (default) - dedicated pthread per context, main interpreter {ok, Ctx} = py_context:new(#{mode => worker}). -%% SHARED_GIL sub-interpreter (Python 3.12+) - isolated namespace -{ok, Ctx} = py_context:new(#{mode => subinterp}). - -%% OWN_GIL sub-interpreter (Python 3.14+) - true parallelism +%% OWN_GIL mode (Python 3.14+) - dedicated pthread + subinterpreter with own GIL {ok, Ctx} = py_context:new(#{mode => owngil}). ``` | Mode | Python | Description | |------|--------|-------------| -| `worker` | Any | Main interpreter, shared namespace (default, recommended) | -| `subinterp` | 3.12+ | SHARED_GIL sub-interpreter, isolated namespace | -| `owngil` | 3.14+ | OWN_GIL sub-interpreter, each has own GIL | +| `worker` | Any | Dedicated pthread per context, main interpreter namespace (default) | +| `owngil` | 3.14+ | Dedicated pthread + subinterpreter with its own GIL, true parallelism | -**Worker mode is recommended** because it works with any Python version and automatically benefits from free-threaded Python (3.13t+) when available. +**Worker mode is recommended** because it works with any Python version, provides stable thread affinity for libraries with thread-local state (numpy, torch, tensorflow), and automatically benefits from free-threaded Python (3.13t+) when available. 
-**Why OWN_GIL requires Python 3.14+**: C extensions like `_decimal`, `numpy` have global state bugs in sub-interpreters on Python 3.12/3.13. These are fixed in Python 3.14. SHARED_GIL mode works on 3.12+ but some C extensions may have issues. +**Why OWN_GIL requires Python 3.14+**: C extensions like `_decimal`, `numpy` have global state bugs in sub-interpreters on Python 3.12/3.13. These are fixed in Python 3.14. ### Runtime Detection Check the current execution mode: ```erlang -%% See how Python is being executed +%% Mirrors the context_mode application env py:execution_mode(). -%% => free_threaded | subinterp | multi_executor +%% => worker | owngil %% Check rate limiting status py_semaphore:max_concurrent(). %% Maximum concurrent calls diff --git a/docs/inspiration.md b/docs/inspiration.md index 0c31767..b4cce7f 100644 --- a/docs/inspiration.md +++ b/docs/inspiration.md @@ -139,7 +139,7 @@ Build TCP/UDP servers with Python protocol logic: ```erlang %% Erlang handles TCP accept and I/O scheduling -{ok, Ctx} = py_reactor_context:start_link(1, auto), +{ok, Ctx} = py_reactor_context:start_link(1, worker), py:exec(Ctx, <<" import erlang.reactor as reactor @@ -312,8 +312,8 @@ Each Python context runs in isolation: ```erlang %% Multiple independent contexts -{ok, Ctx1} = py_context:start_link(1, auto), -{ok, Ctx2} = py_context:start_link(2, auto), +{ok, Ctx1} = py_context:start_link(1, worker), +{ok, Ctx2} = py_context:start_link(2, worker), %% Failures in Ctx1 don't affect Ctx2 py:exec(Ctx1, <<"import dangerous_lib">>), @@ -326,10 +326,10 @@ Separate pools prevent I/O from blocking compute: ```erlang %% CPU-bound pool (for ML inference) -{ok, _} = py_pool:start_link(cpu_pool, #{size => 4}), +{ok, _} = py_context_router:start_pool(cpu_pool, 4, owngil), %% I/O-bound pool (for API calls) -{ok, _} = py_pool:start_link(io_pool, #{size => 16}), +{ok, _} = py_context_router:start_pool(io_pool, 16, worker), %% Route accordingly py:call(cpu_pool, model, predict, [Data]), diff 
--git a/docs/migration.md b/docs/migration.md index b092b8f..983bcf3 100644 --- a/docs/migration.md +++ b/docs/migration.md @@ -1,6 +1,107 @@ -# Migration Guide: v1.8.x to v2.0+ +# Migration Guide -This guide covers breaking changes and migration steps when upgrading from erlang_python v1.8.x to v2.0 and later. +This guide covers breaking changes and migration steps when upgrading erlang_python. + +## v2.x to v3.0 Migration + +### Quick Checklist + +- [ ] Update `py:execution_mode/0` usage - now returns `worker | owngil` only +- [ ] Remove any `py:num_executors/0` calls (function removed) +- [ ] Update code that checks for `free_threaded` or `multi_executor` modes +- [ ] Review `context_mode` configuration (now `worker | owngil`) + +### Execution Mode Changes + +**v2.x:** `py:execution_mode/0` returned internal capabilities: +```erlang +py:execution_mode(). +%% => free_threaded | subinterp | multi_executor +``` + +**v3.0:** Returns simplified public modes based on configuration: +```erlang +py:execution_mode(). +%% => worker | owngil +``` + +The mode is determined by the `context_mode` application config: +```erlang +%% Default: worker mode +application:set_env(erlang_python, context_mode, worker). + +%% For true parallelism (Python 3.14+) +application:set_env(erlang_python, context_mode, owngil). +``` + +### Removed Functions + +**`py:num_executors/0`** - Removed. Contexts now use per-context worker threads. + +```erlang +%% v2.x - check executor count +N = py:num_executors(). 
+ +%% v3.0 - not needed, each context has its own worker thread +``` + +### Worker Thread Architecture + +In v3.0, each context gets a dedicated pthread that handles all Python operations: + +- **Stable thread affinity**: All calls to the same context run on the same OS thread +- **numpy/torch compatibility**: Thread-local state is preserved +- **No executor pool**: No shared executor threads to manage + +```erlang +%% Create contexts - each gets its own worker thread +Ctx1 = py:context(1), +Ctx2 = py:context(2), + +%% All calls to Ctx1 run on Ctx1's worker thread +%% All calls to Ctx2 run on Ctx2's worker thread +{ok, _} = py:call(Ctx1, math, sqrt, [16]), +{ok, _} = py:call(Ctx2, math, sqrt, [25]). +``` + +### Configuration Changes + +**v2.x configuration:** +```erlang +{erlang_python, [ + {num_executors, 8}, %% Removed in v3.0 + {context_mode, worker} +]} +``` + +**v3.0 configuration:** +```erlang +{erlang_python, [ + {context_mode, worker}, %% worker | owngil + {num_contexts, 8}, %% Number of contexts to create + {max_concurrent, 17} %% Optional rate-limit ceiling +]} +``` + +`num_executors` and `num_async_workers` were both removed in v3.0; the +supervisor no longer reads them. + +### Python Version Compatibility + +| Python Version | v2.x Mode | v3.0 Mode | +|---------------|-----------|-----------| +| 3.9 - 3.11 | `multi_executor` | `worker` | +| 3.12 - 3.13 | `subinterp` | `worker` (default) or `owngil` | +| 3.14+ | `subinterp` | `worker` (default) or `owngil` | +| 3.13t (free-threaded) | `free_threaded` | `worker` | + +All Python versions now use the same public mode (`worker` or `owngil`) based on configuration, not Python capabilities. + +--- + +# v1.8.x to v2.0 Migration + +This section covers breaking changes when upgrading from erlang_python v1.8.x to v2.0. 
## Quick Checklist @@ -16,17 +117,18 @@ This guide covers breaking changes and migration steps when upgrading from erlan ## Python Version Compatibility -| Python Version | GIL Mode | Notes | -|---------------|----------|-------| -| 3.9 - 3.11 | Shared GIL | Multi-executor mode, `py:execution_mode()` returns `multi_executor` | -| 3.12 - 3.13 | OWN_GIL subinterpreters | True parallelism, `py:execution_mode()` returns `subinterp` | -| 3.13t | Free-threaded | No GIL, `py:execution_mode()` returns `free_threaded` | -| 3.14+ | SHARED_GIL subinterpreters | Subinterpreters with shared GIL for C extension compatibility | +| Python Version | Support | Notes | +|---------------|---------|-------| +| 3.9 - 3.11 | Full | Worker mode with dedicated pthread per context | +| 3.12 - 3.13 | Full | Worker mode (default) or owngil mode | +| 3.14+ | Full | Worker mode (default) or owngil mode with true parallelism | +| 3.13t | Full | Worker mode (free-threaded builds supported) | -**Python 3.14 Support**: Full support for Python 3.14 including: -- SHARED_GIL subinterpreter mode for C extension compatibility -- Proper `sys.path` initialization in subinterpreters -- All asyncio features work correctly +**Python 3.14+ OWN_GIL Support**: For true parallelism, use owngil mode: +```erlang +application:set_env(erlang_python, context_mode, owngil). +``` +Each context gets a subinterpreter with its own GIL, enabling parallel Python execution. **FreeBSD Support**: Improved fd handling on FreeBSD/kqueue platforms: - Automatic fd duplication in `py_reactor_context` to prevent fd stealing errors @@ -83,11 +185,10 @@ The most significant change in v2.0 is the new execution model. On Python 3.12+, Check which mode is active: ```erlang -%% Check execution mode +%% Check execution mode (v3.0+) py:execution_mode(). 
-%% => subinterp (Python 3.12+ with OWN_GIL) -%% => free_threaded (Python 3.13t with --disable-gil) -%% => multi_executor (Python < 3.12) +%% => worker (default, dedicated pthread per context) +%% => owngil (dedicated pthread + subinterpreter with own GIL) %% Check if subinterpreters are supported py:subinterp_supported(). @@ -440,20 +541,25 @@ erlang.send(("my_server", "node@host"), {"event": "user_login", "user": 123}) erlang.send(pid, "hello") ``` -### `erlang.sleep()` with Dirty Scheduler Release +### `erlang.sleep()` cooperates with the BEAM scheduler -Synchronous sleep that releases the Erlang dirty scheduler thread: +Synchronous sleep that lets other Erlang processes and Python +contexts make progress during the wait: ```python import erlang def slow_handler(): - # Sleep without blocking Erlang scheduler - erlang.sleep(1.0) # Releases dirty scheduler during sleep + erlang.sleep(1.0) return "done" ``` -Unlike `time.sleep()`, `erlang.sleep()` releases the dirty NIF thread while waiting, allowing other Python calls to use the scheduler slot. +The BEAM dirty scheduler is never held during the sleep. The exact +thread that blocks depends on context — the Erlang process for +`py:exec` / `py:eval`, or the context's private worker pthread for +`py:call`. See the [behavior-by-context table in the asyncio +guide](asyncio.md#erlangsleepseconds) for the full breakdown. In all +cases, other contexts and other Erlang processes continue running. ### `erlang.call()` Blocking with Explicit Scheduling @@ -517,21 +623,24 @@ async def async_handler(): ### Async Task API (Erlang Side) -Submit and manage async Python tasks from Erlang: +Submit and manage async Python tasks from Erlang. Tasks always run on the +shared `py_event_loop`; routing happens via `py_event_loop_pool` for the +pool-based variant. The `(Module, Func, Args[, Opts/Kwargs])` signature does +not take a context — coroutines are scheduled on the loop, not on a context. 
```erlang %% Blocking run -{ok, Result} = py_event_loop:run(Ctx, my_module, my_async_func, [Arg1]). +{ok, Result} = py_event_loop:run(my_module, my_async_func, [Arg1]). %% Non-blocking with reference -Ref = py_event_loop:create_task(Ctx, my_module, my_async_func, [Arg1]), +Ref = py_event_loop:create_task(my_module, my_async_func, [Arg1]), {ok, Result} = py_event_loop:await(Ref, 5000). %% Fire-and-forget -py_event_loop:spawn_task(Ctx, my_module, my_async_func, [Arg1]). +py_event_loop:spawn_task(my_module, my_async_func, [Arg1]). %% Message-based result delivery -Ref = py_event_loop:create_task(Ctx, my_module, my_async_func, [Arg1]), +Ref = py_event_loop:create_task(my_module, my_async_func, [Arg1]), receive {async_result, Ref, {ok, Result}} -> handle(Result); {async_result, Ref, {error, Reason}} -> handle_error(Reason) diff --git a/docs/owngil_internals.md b/docs/owngil_internals.md index 2819c4a..7ee0382 100644 --- a/docs/owngil_internals.md +++ b/docs/owngil_internals.md @@ -4,7 +4,7 @@ OWN_GIL mode provides true parallel Python execution using Python 3.14+ per-interpreter GIL (`PyInterpreterConfig_OWN_GIL`). Each OWN_GIL context runs in a dedicated pthread with its own subinterpreter and GIL. -**Note**: OWN_GIL requires Python 3.14+ due to C extension global state bugs in earlier versions (e.g., `_decimal`, `numpy`). For Python 3.12/3.13, use SHARED_GIL sub-interpreters (`mode => subinterp`) which provide namespace isolation but share the GIL. +**Note**: OWN_GIL requires Python 3.14+ due to C extension global state bugs in earlier versions (e.g., `_decimal`, `numpy`). On Python 3.12/3.13, use the default `worker` mode — contexts share the main interpreter but each owns a dedicated pthread. 
## Quick Start @@ -83,11 +83,10 @@ All major erlang_python features work with OWN_GIL mode: | Mode | Python Version | Thread Model | GIL | Parallelism | |------|----------------|--------------|-----|-------------| -| `worker` | Any | Dirty scheduler | Main interpreter GIL | None | -| `subinterp` | 3.12+ | Dirty scheduler | Shared GIL | None (isolated namespaces) | -| `owngil` | 3.14+ | Dedicated pthread | Per-interpreter GIL | True parallel | +| `worker` | Any | Dedicated pthread per context | Main interpreter GIL | True parallel on free-threaded 3.13t+ | +| `owngil` | 3.14+ | Dedicated pthread per context | Per-interpreter GIL | True parallel | -**Why version requirements differ**: The `subinterp` mode (SHARED_GIL) works on Python 3.12+ for namespace isolation. However, `owngil` mode requires Python 3.14+ because C extensions like `_decimal`, `numpy` have global state that crashes in OWN_GIL sub-interpreters on earlier versions. Python 3.14 includes fixes for these issues (see [cpython#106078](https://github.com/python/cpython/issues/106078)). +**Why OWN_GIL requires Python 3.14+**: C extensions like `_decimal`, `numpy` have global state that crashes in OWN_GIL sub-interpreters on Python 3.12/3.13. Python 3.14 includes fixes for these issues (see [cpython#106078](https://github.com/python/cpython/issues/106078)). ## Key Data Structures @@ -174,7 +173,7 @@ nif_context_call(env, ctx, module, func, args, kwargs) │ └── pthread_mutex_unlock(&ctx->request_mutex) │ └── [ctx->uses_own_gil == false] - └── Direct execution with GIL (worker/subinterp mode) + └── Direct execution with GIL (worker mode) ``` ### 3. 
Request Processing (OWN_GIL Thread) @@ -438,7 +437,7 @@ Use OWN_GIL when: - Long-running computations - Need true concurrent Python execution -Use shared-GIL (subinterp) when: +Use worker mode when: - I/O-bound or short operations - High call frequency - Resource constraints @@ -454,7 +453,7 @@ rebar3 compile && escript examples/bench_owngil.erl Example output: ``` ======================================================== - OWN_GIL vs SHARED_GIL Benchmark + OWN_GIL vs Worker Benchmark ======================================================== System Information @@ -462,25 +461,25 @@ System Information Erlang/OTP: 27 Schedulers: 8 Python: 3.14.0 - Subinterp: true + OWN_GIL: true 1. Single Context Latency (1000 calls to math.sqrt) Mode us/call calls/sec ---- ------- --------- - subinterp 2.5 400000 + worker 2.5 400000 owngil 10.2 98000 2. Parallel Throughput (4 contexts, 10000 calls each) Mode total_ms calls/sec ---- -------- --------- - subinterp 100.5 398000 - owngil 28.3 1415000 <- 3.5x faster + worker 100.5 398000 + owngil 28.3 1415000 <- 3.5x faster 3. CPU-Bound Speedup (fibonacci(30) x 4 contexts) Mode total_ms speedup ---- -------- ------- - subinterp 800.2 1.0x - owngil 205.1 3.9x <- near-linear scaling + worker 800.2 1.0x + owngil 205.1 3.9x <- near-linear scaling ``` ## Safety Mechanisms diff --git a/docs/preload.md b/docs/preload.md index 682e2b0..44b617b 100644 --- a/docs/preload.md +++ b/docs/preload.md @@ -119,5 +119,5 @@ py_preload:clear_code(). ## Limitations - Changes to preload code don't affect existing contexts -- Same preload applies to all context modes (worker, subinterp, owngil) +- Same preload applies to both context modes (worker, owngil) - Preload errors during context creation will fail the context diff --git a/docs/process-bound-envs.md b/docs/process-bound-envs.md index c9f986a..201546e 100644 --- a/docs/process-bound-envs.md +++ b/docs/process-bound-envs.md @@ -107,12 +107,12 @@ end). 
| **Explicit** | `create_local_env` + `py_nif:context_*` | OWN_GIL, fine-grained control, multiple envs per process | **Use implicit (py:exec)** when: -- Using worker or subinterp modes +- Using worker mode - One environment per process is sufficient - You want automatic lifecycle management **Use explicit (create_local_env)** when: -- Using OWN_GIL mode for parallel execution +- Using `owngil` mode for parallel execution - Need multiple environments in a single process - Want to pass environments between processes - Need direct NIF-level control @@ -461,7 +461,7 @@ This design prioritizes safety over avoiding minor memory leaks during edge case ## See Also - [OWN_GIL Internals](owngil_internals.md) - Architecture and safety mechanisms for OWN_GIL mode -- [Scalability](scalability.md) - Mode comparison (owngil vs subinterp vs worker) +- [Scalability](scalability.md) - Mode comparison (worker vs owngil) - [Event Loop Architecture](event_loop_architecture.md) - Per-process namespace management - [Context Affinity](context-affinity.md) - Context binding and routing - [Scheduling](asyncio.md) - Cooperative scheduling for long operations diff --git a/docs/reactor.md b/docs/reactor.md index 0387cc8..ff6b6cd 100644 --- a/docs/reactor.md +++ b/docs/reactor.md @@ -573,11 +573,11 @@ Internal - called by NIF to close connection. ## Subinterpreter Support -The reactor supports isolated subinterpreters via `py_reactor_context`. Each subinterpreter has its own reactor module cache, ensuring protocol factories are isolated between contexts. +The reactor supports isolated subinterpreters via `py_reactor_context` in `owngil` mode. Each subinterpreter has its own reactor module cache, ensuring protocol factories are isolated between contexts. 
```erlang -%% Create context with subinterpreter mode -{ok, Ctx1} = py_reactor_context:start_link(1, subinterp, #{ +%% Create context with OWN_GIL subinterpreter (Python 3.14+) +{ok, Ctx1} = py_reactor_context:start_link(1, owngil, #{ setup_code => <<" import erlang.reactor as reactor reactor.set_protocol_factory(EchoProtocol) @@ -585,7 +585,7 @@ reactor.set_protocol_factory(EchoProtocol) }), %% Create another context with different protocol -{ok, Ctx2} = py_reactor_context:start_link(2, subinterp, #{ +{ok, Ctx2} = py_reactor_context:start_link(2, owngil, #{ setup_code => <<" import erlang.reactor as reactor reactor.set_protocol_factory(HttpProtocol) @@ -593,7 +593,7 @@ reactor.set_protocol_factory(HttpProtocol) }). ``` -Each context runs in its own subinterpreter with isolated protocol factory and connection state. This enables running multiple protocol handlers in the same BEAM VM without interference. +Each `owngil` context runs in its own subinterpreter with isolated protocol factory and connection state. For Python <3.14, use `worker` mode — contexts share the main interpreter but each owns a dedicated pthread. ## See Also diff --git a/docs/scalability.md b/docs/scalability.md index 2985a20..eef5d21 100644 --- a/docs/scalability.md +++ b/docs/scalability.md @@ -4,30 +4,24 @@ This guide covers the scalability features of erlang_python, including execution ## Execution Modes -erlang_python automatically detects the optimal execution mode based on your Python version: +erlang_python supports two execution modes: ```erlang %% Check current execution mode py:execution_mode(). -%% => free_threaded | worker | owngil | multi_executor - -%% Check number of executor threads -py:num_executors(). 
-%% => 4 (default) +%% => worker | owngil ``` ### Mode Comparison -| Mode | Python Version | Parallelism | GIL Behavior | Best For | -|------|----------------|-------------|--------------|----------| -| **free_threaded** | 3.13+ (nogil build) | True N-way | None | Maximum throughput | -| **owngil** | 3.14+ | True N-way | Per-interpreter (dedicated thread) | CPU-bound parallel | -| **worker** | 3.12+ | GIL contention | Shared GIL | Default, compatibility | -| **multi_executor** | < 3.12 | GIL contention | Shared, round-robin | I/O-bound, legacy | +| Mode | Description | Parallelism | GIL Behavior | Best For | +|------|-------------|-------------|--------------|----------| +| **worker** | Dedicated pthread per context | GIL contention | Shared GIL | Default, maximum compatibility | +| **owngil** | Dedicated pthread + subinterpreter | True N-way | Per-interpreter GIL | CPU-bound parallel (Python 3.14+) | -### Free-Threaded Mode (Python 3.13+) +### Worker Mode (Default) -When running on a free-threaded Python build (compiled with `--disable-gil`), erlang_python executes Python calls directly without any executor routing. This provides maximum parallelism for CPU-bound workloads. +Each context gets a dedicated pthread that handles all Python operations. This provides stable thread affinity, which is critical for libraries like numpy, torch, and tensorflow that maintain thread-local state. ### OWN_GIL Mode (Python 3.12+) @@ -67,18 +61,6 @@ ok = py_nif:context_exec(CtxRef, <<"x = 42">>, Env), **See also:** [OWN_GIL Internals](owngil_internals.md) for architecture details. -### Sub-interpreter Mode (Python 3.12+) - -Uses Python's sub-interpreter feature with a shared GIL pool. Multiple contexts share the GIL but have isolated namespaces. Best for high call frequency with low latency. 
- -**Architecture:** -- Pool of pre-created subinterpreters with shared GIL -- Execution on dirty schedulers with `PyThreadState_Swap` -- Lower latency (~2.5μs) but no true parallelism -- Best throughput for short operations - -**Note:** Each sub-interpreter has isolated state. Use the [Shared State](#shared-state) API to share data between workers. - **Explicit Context Selection:** ```erlang %% Get a specific context by index (1-based) @@ -89,58 +71,29 @@ Ctx = py:context(1), {ok, Result} = py:call(math, sqrt, [16]). ``` -### Multi-Executor Mode (Python < 3.12) - -Runs N executor threads that share the GIL. Requests are distributed round-robin across executors. Good for I/O-bound workloads where Python releases the GIL during I/O operations. - -**Thread Affinity:** In MULTI_EXECUTOR mode, both workers and contexts are assigned -a fixed executor thread. This ensures libraries with thread-local state (numpy, torch, -tensorflow) always run on the same OS thread, preventing segfaults and state corruption. 
- ## Choosing the Right Mode -### Mode Comparison - -| Aspect | Free-Threaded | OWN_GIL | Worker | Multi-Executor | -|--------|---------------|---------|--------|----------------| -| **Parallelism** | True N-way | True N-way | GIL contention | GIL contention | -| **State Isolation** | Shared | Isolated | Shared | Shared | -| **Memory Overhead** | Low | Higher (per-interp) | Low | Low | -| **Module Compatibility** | Limited | Most modules | All modules | All modules | -| **Python Version** | 3.13+ (nogil) | 3.14+ | 3.12+ | < 3.12 | - ### When to Use Each Mode -**Use Free-Threaded (Python 3.13t) when:** -- You need maximum parallelism with shared state -- Your libraries are GIL-free compatible -- You're running CPU-bound workloads -- Memory efficiency is important +**Use Worker Mode (default) when:** +- You need maximum module compatibility +- Running libraries like numpy, torch, tensorflow +- High call frequency with low latency +- Shared state between contexts is needed -**Use OWN_GIL (Python 3.14+) when:** +**Use OWN_GIL Mode when:** - You need true CPU parallelism across Python contexts - Running long computations (ML inference, data processing) - Workload benefits from multiple independent Python interpreters - You can tolerate higher per-call latency for better throughput -**Use Worker (Python 3.12+, default) when:** -- You need high call frequency with low latency -- Maximum module compatibility is required -- Shared state between contexts is needed -- Running libraries that don't support subinterpreters (torch, etc.) 
- -**Use Multi-Executor (Python < 3.12) when:** -- Running on older Python versions -- Your workload is I/O-bound (GIL released during I/O) -- Thread affinity for numpy/torch is needed - ### Pros and Cons **Worker Mode Pros:** - Maximum module compatibility (all C extensions work) +- Stable thread affinity for numpy/torch/tensorflow - Low memory overhead (single interpreter) - Shared state between contexts -- Default mode for Python 3.12+ **Worker Mode Cons:** - GIL contention limits parallelism @@ -156,17 +109,6 @@ tensorflow) always run on the same OS thread, preventing segfaults and state cor - Some C extensions don't support subinterpreters - Requires Python 3.14+ -**Free-Threaded Mode Pros:** -- True parallelism with shared state -- Lower memory overhead than OWN_GIL -- Simplest mental model (like regular threading) - -**Free-Threaded Mode Cons:** -- Requires Python 3.13+ built with `--disable-gil` -- Many C extensions not yet compatible -- Shared state requires careful synchronization -- Still experimental - ## Subinterpreter Architecture ### Design Overview @@ -225,14 +167,10 @@ tensorflow) always run on the same OS thread, preventing segfaults and state cor ### Pool Size -The subinterpreter pool size is configured at two levels: - -| Level | Default | Max | -|-------|---------|-----| -| **Erlang (py_context_router)** | `erlang:system_info(schedulers)` | configurable | -| **C pool (py_subinterp_pool)** | 32 | 64 | - -On a typical 8-core machine, 8 context processes are started, each with one subinterpreter slot. +`py_context_router` sizes the context pool from `num_contexts` (default +`erlang:system_info(schedulers)`). Each context owns its own pthread; in +`owngil` mode that thread also owns a dedicated subinterpreter. There is no +shared C-level pool. **Configuration via sys.config:** ```erlang @@ -308,14 +246,13 @@ This allows your application to implement backpressure or shed load gracefully. 
%% Default: erlang:system_info(schedulers) * 2 + 1 {max_concurrent, 50}, - %% Number of executor threads (multi_executor mode only) - %% Default: 4 - {num_executors, 8}, + %% Context mode: worker | owngil + %% Default: worker + {context_mode, worker}, - %% Worker pool sizes - {num_workers, 4}, - {num_async_workers, 2}, - {num_subinterp_workers, 4} + %% Number of contexts + %% Default: erlang:system_info(schedulers) + {num_contexts, 8} ]} ]. ``` @@ -330,8 +267,8 @@ For CPU-bound workloads on Python 3.12+, erlang_python provides true parallelism %% Check if subinterpreters are supported (Python 3.12+) true = py:subinterp_supported(). -%% Check current execution mode -subinterp = py:execution_mode(). +%% Check current execution mode (mirrors context_mode app env) +worker = py:execution_mode(). %% or owngil ``` ### Using the Context Router @@ -447,7 +384,10 @@ PYTHON_CONFIG=/path/to/python3.13-config rebar3 compile ```erlang 1> application:ensure_all_started(erlang_python). 2> py:execution_mode(). -free_threaded +worker +%% Free-threaded Python is detected internally; the public mode mirrors +%% the configured context_mode (worker | owngil), and worker mode +%% automatically benefits from the free-threaded build. ``` ## Performance Tuning @@ -460,9 +400,9 @@ free_threaded ### For I/O-Bound Workloads -- Multi-executor mode works well (GIL released during I/O) -- Increase `num_executors` to handle more concurrent I/O +- Worker mode works well (GIL released during I/O) - Use asyncio integration for async I/O +- Increase `num_contexts` for more concurrent I/O capacity ### For Mixed Workloads @@ -481,8 +421,7 @@ io:format("Python load: ~.1f%~n", [Utilization]). %% Execution mode info Mode = py:execution_mode(), -Executors = py:num_executors(), -io:format("Mode: ~p, Executors: ~p~n", [Mode, Executors]). +io:format("Mode: ~p~n", [Mode]). 
%% Memory stats {ok, Stats} = py:memory_stats(), diff --git a/docs/testing-free-threading.md b/docs/testing-free-threading.md index ab83d47..dd538d9 100644 --- a/docs/testing-free-threading.md +++ b/docs/testing-free-threading.md @@ -99,10 +99,13 @@ rebar3 compile {ok, [erlang_python]} 2> py:execution_mode(). -free_threaded % Should show 'free_threaded' instead of 'subinterp' or 'multi_executor' +worker %% public mode, mirrors context_mode app env -3> py:num_executors(). -1 % In free_threaded mode, no executor pool is used +3> py_nif:execution_mode(). +free_threaded %% internal capability — confirms the no-GIL build was detected + +4> py_context_router:num_contexts(). +8 %% one pthread per context ``` ## Running Tests @@ -180,21 +183,23 @@ Ensure `PYTHON_CONFIG` points to the free-threaded Python installation: ls $(dirname $(which python3))/../include/*/Python.h ``` -### Mode Shows 'multi_executor' Instead of 'free_threaded' +### `py_nif:execution_mode/0` does not return `free_threaded` The Python build may not have `Py_GIL_DISABLED` defined. Verify: ```bash python3 -c "import sysconfig; print(sysconfig.get_config_var('Py_GIL_DISABLED'))" ``` -Should print `1` for free-threaded builds. +Should print `1` for free-threaded builds. The public `py:execution_mode/0` +will still return `worker | owngil` regardless — it reflects the configured +context mode, not the underlying Python capability. ### Crashes Under Load Some extensions may not be thread-safe. Try: 1. Isolate the problematic extension 2. Check if a thread-safe version exists -3. Fall back to sub-interpreter mode for those calls +3. Switch to `owngil` mode (Python 3.14+) for stronger isolation ## See Also diff --git a/examples/README.md b/examples/README.md index 19aa5ae..977ac38 100644 --- a/examples/README.md +++ b/examples/README.md @@ -87,18 +87,6 @@ Simple echo server using Reactor API. 
escript examples/reactor_echo.erl ``` -### reactor_subinterp_example.erl -Reactor with subinterpreter isolation (Python 3.12+). -```bash -escript examples/reactor_subinterp_example.erl -``` - -### reactor_owngil_example.erl -Reactor with OWN_GIL subinterpreters (Python 3.12+). -```bash -escript examples/reactor_owngil_example.erl -``` - ## Benchmarks ### benchmark.erl @@ -122,7 +110,7 @@ escript examples/bench_channel.erl ``` ### bench_reactor_modes.erl -Reactor worker vs subinterpreter benchmark. +Reactor worker vs OWN_GIL benchmark. ```bash escript examples/bench_reactor_modes.erl ``` diff --git a/examples/bench_async_task.erl b/examples/bench_async_task.erl index 3a87cd2..9b737a1 100644 --- a/examples/bench_async_task.erl +++ b/examples/bench_async_task.erl @@ -7,7 +7,7 @@ %%% Tests the new py_event_loop async task API: %%% - py_event_loop:run/3,4 (blocking) %%% - py_event_loop:create_task/3,4 + await (non-blocking) -%%% - py_event_loop:spawn/3,4 (fire-and-forget) +%%% - py_event_loop:spawn_task/3,4 (fire-and-forget) %%% %%% Run with: %%% rebar3 compile && escript examples/bench_async_task.erl diff --git a/examples/bench_owngil.erl b/examples/bench_owngil.erl index 9c1ff9d..fd77be9 100644 --- a/examples/bench_owngil.erl +++ b/examples/bench_owngil.erl @@ -2,7 +2,7 @@ %% -*- erlang -*- %%! -pa _build/default/lib/erlang_python/ebin -%%% @doc Benchmark comparing SHARED_GIL vs OWN_GIL context modes. +%%% @doc Benchmark comparing worker vs OWN_GIL context modes. %%% %%% OWN_GIL mode creates a dedicated pthread with its own Python GIL, %%% enabling true parallel execution for CPU-bound workloads. 
@@ -15,7 +15,7 @@ main(_Args) -> io:format("~n"), io:format("========================================================~n"), - io:format(" OWN_GIL vs SHARED_GIL Benchmark~n"), + io:format(" OWN_GIL vs Worker Benchmark~n"), io:format("========================================================~n~n"), %% Start the application @@ -24,14 +24,14 @@ main(_Args) -> %% Print system info print_system_info(), - case py_nif:subinterp_supported() of + case py_nif:owngil_supported() of true -> bench_single_latency(), bench_parallel_throughput(), bench_cpu_speedup(); false -> - io:format("~n[ERROR] OWN_GIL requires Python 3.12+~n"), - io:format(" Current Python version does not support subinterpreters.~n~n") + io:format("~n[ERROR] OWN_GIL requires Python 3.14+~n"), + io:format(" Current Python build does not support OWN_GIL subinterpreters.~n~n") end, halt(0). @@ -43,7 +43,7 @@ print_system_info() -> io:format(" Schedulers: ~p~n", [erlang:system_info(schedulers)]), {ok, PyVer} = py:version(), io:format(" Python: ~s~n", [PyVer]), - io:format(" Subinterp: ~p~n", [py_nif:subinterp_supported()]), + io:format(" OWN_GIL: ~p~n", [py_nif:owngil_supported()]), io:format("~n"). %% ============================================================================ @@ -72,7 +72,7 @@ bench_single_latency() -> io:format(" ~-15s ~10.1f ~12w~n", [Label, UsPerCall, CallsPerSec]), py_context:stop(Ctx) - end, [{subinterp, subinterp}, {owngil, owngil}]), + end, [{worker, worker}, {owngil, owngil}]), io:format("~n"). %% ============================================================================ @@ -114,7 +114,7 @@ bench_parallel_throughput() -> io:format(" ~-15s ~10w ~12w~n", [Label, Elapsed, CallsPerSec]), [py_context:stop(Ctx) || Ctx <- Contexts] - end, [{subinterp, subinterp}, {owngil, owngil}]), + end, [{worker, worker}, {owngil, owngil}]), io:format("~n"). 
%% ============================================================================ @@ -154,11 +154,11 @@ bench_cpu_speedup() -> io:format(" ~-15s ~10w ~10w ~10.2fx~n", [Label, SeqTime, ParTime, Speedup]), [py_context:stop(Ctx) || Ctx <- Contexts] - end, [{subinterp, subinterp}, {owngil, owngil}]), + end, [{worker, worker}, {owngil, owngil}]), io:format("~n"), io:format("Notes:~n"), - io:format(" - SHARED_GIL (subinterp) contexts share Python's GIL~n"), + io:format(" - Worker contexts share Python's GIL on the main interpreter~n"), io:format(" - OWN_GIL contexts have independent GILs for true parallelism~n"), io:format(" - OWN_GIL speedup should approach number of CPU cores~n"), io:format("~n"). diff --git a/examples/bench_reactor_modes.erl b/examples/bench_reactor_modes.erl index 48951ad..dd0389d 100644 --- a/examples/bench_reactor_modes.erl +++ b/examples/bench_reactor_modes.erl @@ -2,7 +2,7 @@ %% -*- erlang -*- %%! -pa _build/default/lib/erlang_python/ebin -%%% @doc Benchmark comparing Reactor (worker vs subinterp) with Channel API. +%%% @doc Benchmark comparing Reactor (worker vs OWN_GIL) with Channel API. 
%%% %%% Run with: %%% rebar3 compile && escript examples/bench_reactor_modes.erl @@ -30,18 +30,18 @@ main(_Args) -> io:format("~n--- Reactor (Worker Mode) ---~n"), {WkPersistent, WkLifecycle} = run_reactor_worker_bench(), - %% Subinterpreter mode benchmarks (if supported) - {SiPersistent, SiLifecycle} = case py:subinterp_supported() of + %% OWN_GIL mode benchmarks (if supported) + {OgPersistent, OgLifecycle} = case py_nif:owngil_supported() of true -> - io:format("~n--- Reactor (Subinterpreter Mode) ---~n"), - run_reactor_subinterp_bench(); + io:format("~n--- Reactor (OWN_GIL Mode) ---~n"), + run_reactor_owngil_bench(); false -> - io:format("~n[Skipping subinterpreter benchmarks - Python < 3.12]~n"), + io:format("~n[Skipping OWN_GIL benchmarks - Python < 3.14]~n"), {[], []} end, %% Print comparison summary - print_comparison(ChPersistent, ChLifecycle, WkPersistent, WkLifecycle, SiPersistent, SiLifecycle), + print_comparison(ChPersistent, ChLifecycle, WkPersistent, WkLifecycle, OgPersistent, OgLifecycle), halt(0). @@ -52,7 +52,7 @@ print_system_info() -> io:format(" Schedulers: ~p~n", [erlang:system_info(schedulers)]), {ok, PyVer} = py:version(), io:format(" Python: ~s~n", [PyVer]), - io:format(" Subinterp: ~p~n", [py:subinterp_supported()]), + io:format(" OWN_GIL: ~p~n", [py_nif:owngil_supported()]), io:format("~n"). %% ============================================================================ @@ -290,10 +290,10 @@ recv_all(Socket, Remaining, Timeout, Acc) -> end. 
%% ============================================================================ -%% Reactor Subinterpreter Mode Benchmarks +%% Reactor OWN_GIL Mode Benchmarks %% ============================================================================ -run_reactor_subinterp_bench() -> +run_reactor_owngil_bench() -> %% Protocol that stays open for multiple messages PersistentSetup = <<" import erlang.reactor as reactor @@ -340,7 +340,7 @@ reactor.set_protocol_factory(OneMessageProtocol) Data = binary:copy(<<$X>>, Size), Iterations = 500, - {ok, Ctx} = py_reactor_context:start_link(1900 + Size, subinterp, #{ + {ok, Ctx} = py_reactor_context:start_link(1900 + Size, owngil, #{ setup_code => PersistentSetup }), @@ -387,7 +387,7 @@ reactor.set_protocol_factory(OneMessageProtocol) Data = binary:copy(<<$X>>, Size), Iterations = 100, - {ok, Ctx} = py_reactor_context:start_link(1800 + Size, subinterp, #{ + {ok, Ctx} = py_reactor_context:start_link(1800 + Size, owngil, #{ setup_code => LifecycleSetup }), @@ -421,7 +421,7 @@ reactor.set_protocol_factory(OneMessageProtocol) %% Comparison Summary %% ============================================================================ -print_comparison(ChPersistent, ChLifecycle, WkPersistent, WkLifecycle, SiPersistent, SiLifecycle) -> +print_comparison(ChPersistent, ChLifecycle, WkPersistent, WkLifecycle, OgPersistent, OgLifecycle) -> io:format("~n"), io:format("========================================================~n"), io:format(" COMPARISON SUMMARY~n"), @@ -431,16 +431,16 @@ print_comparison(ChPersistent, ChLifecycle, WkPersistent, WkLifecycle, SiPersist io:format("A) PERSISTENT CONNECTION (messages on existing connection)~n"), io:format("-----------------------------------------------------------~n"), io:format("~8s | ~12s | ~12s | ~12s~n", - ["Size", "Channel", "Reactor/W", "Reactor/S"]), + ["Size", "Channel", "Reactor/W", "Reactor/OG"]), io:format("~s~n", [string:copies("-", 52)]), lists:foreach(fun({{Size, _, ChOps}, {_, _, WkOps}}) -> - 
SubOps = case lists:keyfind(Size, 1, SiPersistent) of + OgOps = case lists:keyfind(Size, 1, OgPersistent) of {_, _, O} -> integer_to_list(O); false -> "N/A" end, io:format("~8B | ~12w | ~12w | ~12s~n", - [Size, ChOps, WkOps, SubOps]) + [Size, ChOps, WkOps, OgOps]) end, lists:zip(ChPersistent, WkPersistent)), %% Full lifecycle comparison (connections/sec including setup/teardown) @@ -448,23 +448,23 @@ print_comparison(ChPersistent, ChLifecycle, WkPersistent, WkLifecycle, SiPersist io:format("B) FULL LIFECYCLE (create + send/recv + close per op)~n"), io:format("-----------------------------------------------------------~n"), io:format("~8s | ~12s | ~12s | ~12s~n", - ["Size", "Channel", "Reactor/W", "Reactor/S"]), + ["Size", "Channel", "Reactor/W", "Reactor/OG"]), io:format("~s~n", [string:copies("-", 52)]), lists:foreach(fun({{Size, _, ChOps}, {_, _, WkOps}}) -> - SubOps = case lists:keyfind(Size, 1, SiLifecycle) of + OgOps = case lists:keyfind(Size, 1, OgLifecycle) of {_, _, O} -> integer_to_list(O); false -> "N/A" end, io:format("~8B | ~12w | ~12w | ~12s~n", - [Size, ChOps, WkOps, SubOps]) + [Size, ChOps, WkOps, OgOps]) end, lists:zip(ChLifecycle, WkLifecycle)), io:format("~n"), io:format("Legend:~n"), - io:format(" Channel = py_channel API~n"), - io:format(" Reactor/W = erlang.reactor with worker mode~n"), - io:format(" Reactor/S = erlang.reactor with subinterpreter (SHARED_GIL)~n"), + io:format(" Channel = py_channel API~n"), + io:format(" Reactor/W = erlang.reactor with worker mode~n"), + io:format(" Reactor/OG = erlang.reactor with OWN_GIL subinterpreter~n"), io:format("~n"), io:format("Notes:~n"), io:format(" - A) measures throughput on persistent connection (best case)~n"), diff --git a/examples/benchmark.erl b/examples/benchmark.erl index 8c19cba..1bdcdee 100755 --- a/examples/benchmark.erl +++ b/examples/benchmark.erl @@ -73,7 +73,7 @@ print_system_info() -> {ok, PyVer} = py:version(), io:format("~s~n", [PyVer]), io:format(" Execution Mode: ~p~n", 
[py:execution_mode()]), - io:format(" Num Executors: ~p~n", [py:num_executors()]), + io:format(" Num Contexts: ~p~n", [py_context_router:num_contexts()]), io:format(" Max Concurrent: ~p~n", [py_semaphore:max_concurrent()]), io:format("~n"). diff --git a/examples/reactor_echo.erl b/examples/reactor_echo.erl index bd45d25..40117b4 100644 --- a/examples/reactor_echo.erl +++ b/examples/reactor_echo.erl @@ -25,7 +25,7 @@ main(_) -> io:format("~n=== Erlang Reactor Echo Server ===~n~n"), %% Start a reactor context - {ok, Ctx} = py_reactor_context:start_link(1, auto), + {ok, Ctx} = py_reactor_context:start_link(1, worker), %% Set up Python echo protocol ok = py:exec(Ctx, <<" diff --git a/examples/reactor_owngil_example.erl b/examples/reactor_owngil_example.erl deleted file mode 100644 index 06bce8b..0000000 --- a/examples/reactor_owngil_example.erl +++ /dev/null @@ -1,163 +0,0 @@ -%% @doc Example: OWN_GIL reactor with dedicated threads. -%% -%% Each subinterpreter handle runs in a dedicated pthread with its own GIL. -%% This provides true parallelism for CPU-bound protocol processing. -%% -%% Best for: ML inference, heavy parsing, CPU-bound protocol logic. -%% -%% Note: Requires Python 3.12+ with subinterpreter support. - --module(reactor_owngil_example). --export([start/0, start/1, stop/1]). 
- -%% Protocol that simulates CPU-intensive work --define(CPU_PROTOCOL, <<" -import erlang.reactor as reactor -import hashlib - -class CPUProtocol(reactor.Protocol): - '''Protocol with CPU-intensive hashing.''' - - def __init__(self): - super().__init__() - self.iterations = 10000 - - def connection_made(self, fd, client_info): - super().connection_made(fd, client_info) - - def data_received(self, data): - # CPU-intensive hashing (runs in parallel due to OWN_GIL) - result = bytes(data) - for _ in range(self.iterations): - result = hashlib.sha256(result).digest() - - self.write_buffer.extend(result) - return 'write_pending' - - def write_ready(self): - if self.write_buffer: - written = self.write(bytes(self.write_buffer)) - del self.write_buffer[:written] - if self.write_buffer: - return 'continue' - return 'read_pending' - -reactor.set_protocol_factory(CPUProtocol) -">>). - -%% @doc Start with default settings (4 handles). -start() -> - start(#{handles => 4, port => 8081}). - -%% @doc Start OWN_GIL reactor. 
-%% -%% Options: -%% handles - Number of subinterpreter handles (default: 4) -%% port - Port to listen on (default: 8081) -%% -%% Returns: {ok, State} where State can be passed to stop/1 -start(Opts) -> - NumHandles = maps:get(handles, Opts, 4), - Port = maps:get(port, Opts, 8081), - - %% Start OWN_GIL thread pool - ok = py:subinterp_pool_start(NumHandles), - - %% Create subinterpreter handles - each with its own pthread + GIL - Handles = [begin - {ok, Handle} = py:subinterp_create(), - %% Initialize reactor protocol in this subinterpreter - ok = py:subinterp_exec(Handle, ?CPU_PROTOCOL), - Handle - end || _ <- lists:seq(1, NumHandles)], - - %% Start acceptor - {ok, ListenSock} = gen_tcp:listen(Port, [ - binary, - {active, false}, - {reuseaddr, true}, - {backlog, 64} - ]), - - Acceptor = spawn_link(fun() -> - accept_loop(ListenSock, Handles, 1) - end), - - io:format("OWN_GIL reactor started on port ~p with ~p handles~n", [Port, NumHandles]), - io:format("Each handle runs in its own pthread with dedicated GIL~n"), - - {ok, #{handles => Handles, acceptor => Acceptor, socket => ListenSock}}. - -%% @doc Stop the OWN_GIL reactor. -stop(#{handles := Handles, acceptor := Acceptor, socket := Socket}) -> - exit(Acceptor, shutdown), - gen_tcp:close(Socket), - [py:subinterp_destroy(H) || H <- Handles], - py:subinterp_pool_stop(), - ok. - -accept_loop(ListenSock, Handles, Idx) -> - case gen_tcp:accept(ListenSock) of - {ok, Socket} -> - {ok, Fd} = prim_inet:getfd(Socket), - Handle = lists:nth(Idx, Handles), - ClientInfo = get_client_info(Socket), - - %% Initialize connection via OWN_GIL reactor API - ok = py:subinterp_reactor_init(Handle, Fd, ClientInfo), - - %% Spawn handler for this connection - spawn_link(fun() -> handle_connection(Handle, Fd, Socket) end), - - NextIdx = (Idx rem length(Handles)) + 1, - accept_loop(ListenSock, Handles, NextIdx); - - {error, closed} -> - ok - end. 
- -get_client_info(Socket) -> - case inet:peername(Socket) of - {ok, {Addr, Port}} -> - #{addr => inet:ntoa(Addr), port => Port, type => tcp}; - _ -> - #{type => tcp} - end. - -handle_connection(Handle, Fd, Socket) -> - %% Simple blocking receive for example purposes - case gen_tcp:recv(Socket, 0, 30000) of - {ok, Data} -> - %% Dispatch to OWN_GIL subinterpreter - case py:subinterp_reactor_read(Handle, Fd, Data) of - {ok, <<"write_pending">>} -> - handle_write(Handle, Fd, Socket); - {ok, <<"continue">>} -> - handle_connection(Handle, Fd, Socket); - {ok, <<"close">>} -> - py:subinterp_reactor_close(Handle, Fd), - gen_tcp:close(Socket); - {error, _Reason} -> - py:subinterp_reactor_close(Handle, Fd), - gen_tcp:close(Socket) - end; - {error, closed} -> - py:subinterp_reactor_close(Handle, Fd); - {error, _} -> - py:subinterp_reactor_close(Handle, Fd), - gen_tcp:close(Socket) - end. - -handle_write(Handle, Fd, Socket) -> - case py:subinterp_reactor_write(Handle, Fd) of - {ok, <<"read_pending">>} -> - handle_connection(Handle, Fd, Socket); - {ok, <<"continue">>} -> - handle_write(Handle, Fd, Socket); - {ok, <<"close">>} -> - py:subinterp_reactor_close(Handle, Fd), - gen_tcp:close(Socket); - _ -> - py:subinterp_reactor_close(Handle, Fd), - gen_tcp:close(Socket) - end. diff --git a/examples/reactor_subinterp_example.erl b/examples/reactor_subinterp_example.erl deleted file mode 100644 index 8075728..0000000 --- a/examples/reactor_subinterp_example.erl +++ /dev/null @@ -1,158 +0,0 @@ -%% @doc Example: SHARED_GIL reactor with subinterpreters. -%% -%% Each py_reactor_context runs in an isolated subinterpreter with its own -%% protocol factory. Multiple contexts can process connections in parallel -%% while sharing Python's GIL. -%% -%% Best for: High-concurrency I/O-bound workloads (HTTP servers, WebSockets). - --module(reactor_subinterp_example). --export([start/0, start/1, stop/1]). 
- --define(ECHO_PROTOCOL, <<" -import erlang.reactor as reactor - -class EchoProtocol(reactor.Protocol): - '''Echo back all received data.''' - - def data_received(self, data): - self.write_buffer.extend(data) - return 'write_pending' - - def write_ready(self): - if self.write_buffer: - written = self.write(bytes(self.write_buffer)) - del self.write_buffer[:written] - if self.write_buffer: - return 'continue' - return 'read_pending' - -reactor.set_protocol_factory(EchoProtocol) -">>). - --define(HTTP_PROTOCOL, <<" -import erlang.reactor as reactor - -class SimpleHTTPProtocol(reactor.Protocol): - '''Simple HTTP/1.1 response protocol.''' - - def __init__(self): - super().__init__() - self.request_data = bytearray() - - def data_received(self, data): - self.request_data.extend(data) - # Check for end of HTTP headers - if b'\\r\\n\\r\\n' in self.request_data: - # Build simple response - body = b'Hello from subinterpreter!' - response = ( - b'HTTP/1.1 200 OK\\r\\n' - b'Content-Type: text/plain\\r\\n' - b'Content-Length: ' + str(len(body)).encode() + b'\\r\\n' - b'Connection: close\\r\\n' - b'\\r\\n' + body - ) - self.write_buffer.extend(response) - return 'write_pending' - return 'continue' - - def write_ready(self): - if self.write_buffer: - written = self.write(bytes(self.write_buffer)) - del self.write_buffer[:written] - if self.write_buffer: - return 'continue' - return 'close' - -reactor.set_protocol_factory(SimpleHTTPProtocol) -">>). - -%% @doc Start with default settings (4 contexts, 2 echo + 2 http). -start() -> - start(#{contexts => 4, port => 8080}). - -%% @doc Start reactor contexts. 
-%% -%% Options: -%% contexts - Number of contexts to create (default: 4) -%% port - Port to listen on (default: 8080) -%% -%% Returns: {ok, #{echo => [Pid], http => [Pid], acceptor => Pid, socket => Socket}} -start(Opts) -> - NumContexts = maps:get(contexts, Opts, 4), - Port = maps:get(port, Opts, 8080), - HalfContexts = NumContexts div 2, - - %% Start echo protocol contexts - EchoContexts = [begin - {ok, Pid} = py_reactor_context:start_link(N, subinterp, #{ - max_connections => 100, - setup_code => ?ECHO_PROTOCOL - }), - Pid - end || N <- lists:seq(1, HalfContexts)], - - %% Start HTTP protocol contexts - HttpContexts = [begin - {ok, Pid} = py_reactor_context:start_link(N, subinterp, #{ - max_connections => 100, - setup_code => ?HTTP_PROTOCOL - }), - Pid - end || N <- lists:seq(HalfContexts + 1, NumContexts)], - - AllContexts = EchoContexts ++ HttpContexts, - - %% Start acceptor that routes to contexts - {ok, ListenSock} = gen_tcp:listen(Port, [ - binary, - {active, false}, - {reuseaddr, true}, - {backlog, 128} - ]), - Acceptor = spawn_link(fun() -> accept_loop(ListenSock, AllContexts, 1) end), - - io:format("Reactor started on port ~p with ~p contexts~n", [Port, NumContexts]), - io:format(" Echo contexts: ~p~n", [EchoContexts]), - io:format(" HTTP contexts: ~p~n", [HttpContexts]), - - {ok, #{ - echo => EchoContexts, - http => HttpContexts, - acceptor => Acceptor, - socket => ListenSock - }}. - -%% @doc Stop the reactor server. -stop(#{acceptor := Acceptor, socket := Socket, echo := Echo, http := Http}) -> - exit(Acceptor, shutdown), - gen_tcp:close(Socket), - [py_reactor_context:stop(Pid) || Pid <- Echo ++ Http], - ok. 
- -%% @private Simple round-robin acceptor -accept_loop(ListenSock, Contexts, Idx) -> - case gen_tcp:accept(ListenSock) of - {ok, Socket} -> - %% Get FD and hand off to reactor context - {ok, Fd} = prim_inet:getfd(Socket), - Ctx = lists:nth(Idx, Contexts), - ClientInfo = get_client_info(Socket), - py_reactor_context:handoff(Ctx, Fd, ClientInfo), - - %% Round-robin to next context - NextIdx = (Idx rem length(Contexts)) + 1, - accept_loop(ListenSock, Contexts, NextIdx); - - {error, closed} -> - ok - end. - -get_client_info(Socket) -> - case inet:peername(Socket) of - {ok, {Addr, Port}} -> - #{addr => inet:ntoa(Addr), port => Port, type => tcp}; - _ -> - #{type => tcp} - end. diff --git a/priv/_erlang_impl/__init__.py b/priv/_erlang_impl/__init__.py index 9090804..05abea5 100644 --- a/priv/_erlang_impl/__init__.py +++ b/priv/_erlang_impl/__init__.py @@ -226,17 +226,26 @@ def sleep(seconds): - Async context: Returns an awaitable (use with await) - Sync context: Blocks synchronously - **Dirty Scheduler Release:** - - In async context, uses asyncio.sleep() which routes through the Erlang - timer system via erlang:send_after. The dirty scheduler is released - because the Python code yields back to the event loop. - - In sync context (when called from py:exec or py:eval), the sleep uses - Erlang's receive/after via erlang.call('_py_sleep', seconds), which - releases the dirty NIF scheduler thread. When called from py:call - contexts, falls back to Python's time.sleep() which blocks the dirty - scheduler but ensures correct time measurement behavior. + **Behavior by context (v3.0 worker-pthread architecture)**: + + The BEAM dirty scheduler is never held during the sleep — the + difference is which thread blocks. + + - Async (``await erlang.sleep()``) uses ``asyncio.sleep()``, which + routes through Erlang's ``send_after`` timer. The coroutine + yields to the event loop; the worker pthread handles other tasks. 
+ - Sync from ``py:exec`` / ``py:eval`` calls + ``erlang.call('_py_sleep', seconds)``. The suspension machinery + releases the dirty scheduler and parks the caller's Erlang + process in a ``receive ... after``. + - Sync from ``py:call`` falls back to ``time.sleep`` — the worker + pthread blocks for the sleep duration. The BEAM dirty scheduler + is *not* held here either: the NIF dispatch returned immediately + and the caller is waiting in an Erlang ``receive`` on the + context process. Other Erlang processes and other contexts run + normally during the sleep. (Replaying a suspended Python frame + around ``time.time()`` would change time-measurement semantics, + which is why ``py:call`` doesn't take the suspension path.) Args: seconds: Duration to sleep in seconds (float or int). @@ -246,13 +255,13 @@ def sleep(seconds): In sync context: None (blocks until sleep completes). Example: - # Async context - releases dirty scheduler via event loop yield + # Async context async def main(): - await erlang.sleep(0.5) # Uses Erlang timer system + await erlang.sleep(0.5) # Sync context def handler(): - erlang.sleep(0.5) # Blocks for 0.5 seconds + erlang.sleep(0.5) """ try: asyncio.get_running_loop() @@ -396,22 +405,44 @@ def _run_async_from_erlang(module, func, args, kwargs): return run(coro) -def install(): +def install(*, silent=False): """Install ErlangEventLoopPolicy as the default event loop policy. - This function is deprecated in Python 3.12+. Use run() instead. + Deprecated in Python 3.12+; raises ``RuntimeError`` on Python 3.14+ + where the underlying ``asyncio.set_event_loop_policy`` is itself + deprecated and slated for removal in 3.16. - Example (legacy pattern): + Use ``erlang.run(main)`` or + ``asyncio.Runner(loop_factory=erlang.new_event_loop)`` instead — + both work on every supported Python version and don't touch the + global policy. + + Args: + silent: If True (keyword-only), suppress the per-call + ``DeprecationWarning`` on Python 3.12-3.13. 
Useful when + you knowingly rely on the legacy pattern and don't want + to silence ``DeprecationWarning`` globally. The 3.14+ + ``RuntimeError`` is *not* suppressible — that pattern + won't work on 3.16 and the call has no fallback there. + + Example (legacy pattern, Python 3.9–3.13 only): import asyncio import erlang - erlang.install() - asyncio.run(main()) # Uses Erlang event loop + erlang.install(silent=True) # opt out of the warning + asyncio.run(main()) # Uses Erlang event loop """ - if sys.version_info >= (3, 12): + if sys.version_info >= (3, 14): + raise RuntimeError( + "erlang.install() is not supported on Python 3.14+. " + "Use erlang.run(main) or " + "asyncio.Runner(loop_factory=erlang.new_event_loop) instead." + ) + if sys.version_info >= (3, 12) and not silent: warnings.warn( "erlang.install() is deprecated in Python 3.12+. " - "Use erlang.run(main()) instead.", + "Use erlang.run(main()) instead, or pass silent=True " + "to suppress this warning.", DeprecationWarning, stacklevel=2 ) diff --git a/priv/_erlang_impl/_policy.py b/priv/_erlang_impl/_policy.py index 37b18af..62705fe 100644 --- a/priv/_erlang_impl/_policy.py +++ b/priv/_erlang_impl/_policy.py @@ -33,19 +33,28 @@ class ErlangEventLoopPolicy(asyncio.AbstractEventLoopPolicy): This policy creates ErlangEventLoop instances for the main thread and optionally for child threads depending on configuration. 
- Usage: - import asyncio + Recommended usage on Python 3.12+ (no policy required): + import erlang + erlang.run(main()) - # Install the policy - asyncio.set_event_loop_policy(erlang.EventLoopPolicy()) + # or, equivalently: + import asyncio + with asyncio.Runner(loop_factory=erlang.new_event_loop) as r: + r.run(main()) - # Now asyncio.run() uses Erlang event loop + Legacy pattern for Python 3.9–3.11 (also works through 3.13 with a + DeprecationWarning, raises on 3.14+): + + import asyncio, erlang + asyncio.set_event_loop_policy(erlang.EventLoopPolicy()) asyncio.run(main()) - Note: - This approach is deprecated in Python 3.12+. - Use erlang.run() instead. + Notes: + ``asyncio.set_event_loop_policy`` is deprecated in Python 3.14 + and removed in 3.16, so only ``erlang.run`` / + ``asyncio.Runner(loop_factory=...)`` are guaranteed to work + across the full supported range. """ def __init__(self): @@ -137,60 +146,3 @@ def _init_watcher(self): self._watcher = asyncio.ThreadedChildWatcher() elif hasattr(asyncio, 'SafeChildWatcher'): self._watcher = asyncio.SafeChildWatcher() - - -class _ErlangChildWatcher: - """Child watcher that delegates to Erlang for process monitoring. - - This watcher uses Erlang ports and monitors instead of SIGCHLD, - making it compatible with subinterpreters and free-threaded Python. - """ - - def __init__(self): - self._callbacks = {} - self._loop = None - - def attach_loop(self, loop): - """Attach to an event loop.""" - self._loop = loop - - def close(self): - """Close the watcher.""" - self._callbacks.clear() - self._loop = None - - def is_active(self): - """Return True if the watcher is active.""" - return self._loop is not None and not self._loop.is_closed() - - def add_child_handler(self, pid, callback, *args): - """Register a callback for when a child process exits. - - Args: - pid: Process ID to watch. - callback: Callback function(pid, returncode, *args). - *args: Additional arguments for the callback. 
- """ - self._callbacks[pid] = (callback, args) - # TODO: Use Erlang port monitoring - - def remove_child_handler(self, pid): - """Remove the handler for a child process. - - Returns: - bool: True if handler was removed, False if not found. - """ - return self._callbacks.pop(pid, None) is not None - - def _do_waitpid(self, pid, returncode): - """Called when a child process exits. - - Args: - pid: Process ID that exited. - returncode: Exit code of the process. - """ - entry = self._callbacks.pop(pid, None) - if entry is not None: - callback, args = entry - if self._loop is not None and not self._loop.is_closed(): - self._loop.call_soon_threadsafe(callback, pid, returncode, *args) diff --git a/priv/tests/test_erlang_api.py b/priv/tests/test_erlang_api.py index a07f801..83d1d71 100644 --- a/priv/tests/test_erlang_api.py +++ b/priv/tests/test_erlang_api.py @@ -251,31 +251,57 @@ async def main(): self.assertEqual(result, 'debug_test') def test_install_function(self): - """Test erlang.install() function.""" + """Test erlang.install() function across supported Python versions.""" erlang = _get_erlang_module() - old_policy = asyncio.get_event_loop_policy() + if sys.version_info >= (3, 14): + # 3.14 deprecated set_event_loop_policy and 3.16 removes it, + # so erlang.install() now raises with a migration message. + with self.assertRaises(RuntimeError) as cm: + erlang.install() + msg = str(cm.exception) + self.assertIn("3.14+", msg) + return + + # 3.9-3.13: install() still works (DeprecationWarning on 3.12+). + # Suppress the asyncio DeprecationWarning emitted by the + # get_event_loop_policy() probe itself on those versions. 
+ with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + old_policy = asyncio.get_event_loop_policy() try: if sys.version_info >= (3, 12): - # Should emit deprecation warning with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") erlang.install() - self.assertTrue(len(w) >= 1) self.assertTrue( any(issubclass(warning.category, DeprecationWarning) for warning in w) ) + + # silent=True must suppress the warning even with + # simplefilter("always"). + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + erlang.install(silent=True) + install_warnings = [ + warning for warning in w + if "erlang.install()" in str(warning.message) + ] + self.assertEqual(install_warnings, []) else: erlang.install() - # Policy should be ErlangEventLoopPolicy - policy = asyncio.get_event_loop_policy() + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + policy = asyncio.get_event_loop_policy() self.assertIsInstance(policy, erlang.EventLoopPolicy) finally: - asyncio.set_event_loop_policy(old_policy) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + asyncio.set_event_loop_policy(old_policy) class TestErlangLoopSpecificFeatures(tb.ErlangTestCase): diff --git a/src/erlang_python.app.src b/src/erlang_python.app.src index 9378a24..997aace 100644 --- a/src/erlang_python.app.src +++ b/src/erlang_python.app.src @@ -1,17 +1,13 @@ {application, erlang_python, [ {description, "Execute Python applications from Erlang using dirty NIFs"}, - {vsn, "2.3.1"}, - {registered, [py_pool]}, + {vsn, "3.0.0"}, + {registered, []}, {mod, {erlang_python_app, []}}, {applications, [ kernel, stdlib ]}, - {env, [ - {num_workers, 4}, - {python_path, ""}, - {worker_timeout, 30000} - ]}, + {env, []}, {modules, []}, {licenses, ["Apache-2.0"]}, {links, [{"GitHub", "https://github.com/benoitc/erlang-python"}]}, diff --git a/src/erlang_python_sup.erl 
b/src/erlang_python_sup.erl index d1ef01a..d6471be 100644 --- a/src/erlang_python_sup.erl +++ b/src/erlang_python_sup.erl @@ -19,7 +19,7 @@ %%%
 %%%   • py_callback - Callback registry for Python to Erlang calls
 %%%   • py_state - Shared state storage accessible from Python
 %%%   • py_context_sup - Supervisor for process-per-context workers
-%%%   • py_async_pool - Worker pool for asyncio coroutines
+%%%   • py_event_loop / py_event_loop_pool - Asyncio dispatch
  • %%% %%% @private -module(erlang_python_sup). @@ -35,14 +35,9 @@ init([]) -> NumContexts = application:get_env(erlang_python, num_contexts, erlang:system_info(schedulers)), ContextMode = application:get_env(erlang_python, context_mode, worker), - NumAsyncWorkers = application:get_env(erlang_python, num_async_workers, 2), - - %% Default executors: 4 (benchmarked sweet spot for most workloads) - %% Can be overridden via {erlang_python, [{num_executors, N}]} - NumExecutors = application:get_env(erlang_python, num_executors, 4), %% Initialize Python runtime first - ok = py_nif:init(#{num_executors => NumExecutors}), + ok = py_nif:init(#{}), %% Initialize the semaphore ETS table for rate limiting ok = py_semaphore:init(), @@ -123,16 +118,6 @@ init([]) -> modules => [py_context_init] }, - %% Async worker pool (for asyncio coroutines) - AsyncPoolSpec = #{ - id => py_async_pool, - start => {py_async_pool, start_link, [NumAsyncWorkers]}, - restart => permanent, - shutdown => 5000, - type => worker, - modules => [py_async_pool] - }, - %% Event worker registry (for scalable I/O model) WorkerRegistrySpec = #{ id => py_event_worker_registry, @@ -176,7 +161,7 @@ init([]) -> Children = [CallbackSpec, ThreadHandlerSpec, LoggerSpec, TracerSpec, ContextSupSpec, ContextRouterInitSpec, WorkerRegistrySpec, WorkerSupSpec, EventLoopSpec, - EventLoopPoolSpec, AsyncPoolSpec], + EventLoopPoolSpec], {ok, { #{strategy => one_for_all, intensity => 5, period => 10}, diff --git a/src/py.erl b/src/py.erl index 387cecf..84fa976 100644 --- a/src/py.erl +++ b/src/py.erl @@ -75,28 +75,10 @@ async_await/1, async_await/2, async_gather/1, - async_stream/3, - async_stream/4, - %% Parallel execution (Python 3.12+ sub-interpreters) + async_gather/2, + %% Parallel execution + capability probe parallel/1, subinterp_supported/0, - %% OWN_GIL subinterpreter API (true parallelism) - subinterp_create/0, - subinterp_destroy/1, - subinterp_call/4, - subinterp_call/5, - subinterp_eval/2, - subinterp_eval/3, - 
subinterp_exec/2, - subinterp_cast/4, - subinterp_async_call/4, - subinterp_await/1, - subinterp_await/2, - subinterp_pool_start/0, - subinterp_pool_start/1, - subinterp_pool_stop/0, - subinterp_pool_ready/0, - subinterp_pool_stats/0, %% Virtual environment ensure_venv/2, ensure_venv/3, @@ -107,7 +89,6 @@ venv_info/0, %% Execution info execution_mode/0, - num_executors/0, %% Shared state (accessible from Python workers) state_fetch/1, state_store/2, @@ -320,7 +301,7 @@ eval(Code, Locals, Timeout) -> %% %% In worker mode, the code runs in a process-local Python environment. %% Variables defined via exec persist within the calling Erlang process. -%% In subinterpreter mode, each context has its own isolated namespace. +%% In owngil mode, each context has its own isolated namespace. -spec exec(string() | binary()) -> ok | {error, term()}. exec(Code) -> %% Always route through context process - it handles callbacks inline using @@ -721,28 +702,33 @@ async_call(Module, Func, Args) -> %% @doc Call a Python async function with keyword arguments. -spec async_call(py_module(), py_func(), py_args(), py_kwargs()) -> py_ref(). async_call(Module, Func, Args, Kwargs) -> - Ref = make_ref(), - py_async_pool:request({async_call, Ref, self(), Module, Func, Args, Kwargs}), - Ref. + py_event_loop:create_task(Module, Func, Args, Kwargs). %% @doc Wait for an async call to complete. -spec async_await(py_ref()) -> py_result(). async_await(Ref) -> - await(Ref, ?DEFAULT_TIMEOUT). + async_await(Ref, ?DEFAULT_TIMEOUT). %% @doc Wait for an async call with timeout. -%% Note: Identical to await/2 - provided for API symmetry with async_call. -spec async_await(py_ref(), timeout()) -> py_result(). async_await(Ref, Timeout) -> - await(Ref, Timeout). + py_event_loop:await(Ref, Timeout). -%% @doc Execute multiple async calls concurrently using asyncio.gather. -%% Takes a list of {Module, Func, Args} tuples and executes them all -%% concurrently, returning when all are complete. 
+%% @doc Execute multiple async Python calls concurrently. +%% +%% Each call is submitted to the event loop independently, so they run +%% concurrently. Results are collected in the order of the input list. +%% Sync functions are accepted and resolve immediately (the event loop +%% short-circuits non-coroutines). +%% +%% Returns `{ok, [Result1, Result2, ...]}' when every call succeeds, where +%% each `ResultN' is the value returned by the corresponding call. +%% Returns `{error, {gather_failed, Errors}}' if any call fails, where +%% `Errors' is a list of `{Index, Reason}' tuples for each failure. %% %% Example: %% ``` -%% {ok, Results} = py:async_gather([ +%% {ok, [R1, R2, R3]} = py:async_gather([ %% {aiohttp, get, [Url1]}, %% {aiohttp, get, [Url2]}, %% {aiohttp, get, [Url3]} @@ -750,37 +736,21 @@ async_await(Ref, Timeout) -> %% ''' -spec async_gather([{py_module(), py_func(), py_args()}]) -> py_result(). async_gather(Calls) -> - Ref = make_ref(), - py_async_pool:request({async_gather, Ref, self(), Calls}), - async_await(Ref, ?DEFAULT_TIMEOUT). - -%% @doc Stream results from a Python async generator. -%% Returns a list of all yielded values. --spec async_stream(py_module(), py_func(), py_args()) -> py_result(). -async_stream(Module, Func, Args) -> - async_stream(Module, Func, Args, #{}). - -%% @doc Stream results from a Python async generator with kwargs. --spec async_stream(py_module(), py_func(), py_args(), py_kwargs()) -> py_result(). -async_stream(Module, Func, Args, Kwargs) -> - Ref = make_ref(), - py_async_pool:request({async_stream, Ref, self(), Module, Func, Args, Kwargs}), - async_stream_collect(Ref, []). 
- -%% @private -async_stream_collect(Ref, Acc) -> - receive - {py_response, Ref, {ok, Result}} -> - %% Got final result (async generator collected) - {ok, Result}; - {py_chunk, Ref, Chunk} -> - async_stream_collect(Ref, [Chunk | Acc]); - {py_end, Ref} -> - {ok, lists:reverse(Acc)}; - {py_error, Ref, Error} -> - {error, Error} - after ?DEFAULT_TIMEOUT -> - {error, timeout} + async_gather(Calls, ?DEFAULT_TIMEOUT). + +%% @doc Like async_gather/1 with explicit per-call timeout. +-spec async_gather([{py_module(), py_func(), py_args()}], timeout()) -> py_result(). +async_gather(Calls, Timeout) when is_list(Calls) -> + Refs = [async_call(M, F, A) || {M, F, A} <- Calls], + Results = [async_await(R, Timeout) || R <- Refs], + Errors = [{Idx, Reason} + || {Idx, {error, Reason}} <- lists:zip(lists:seq(1, length(Results)), Results)], + case Errors of + [] -> + Values = [V || {ok, V} <- Results], + {ok, Values}; + _ -> + {error, {gather_failed, Errors}} end. %%% ============================================================================ @@ -849,126 +819,6 @@ parallel(Calls) when is_list(Calls) -> end end. -%%% ============================================================================ -%%% OWN_GIL Subinterpreter API (True Parallelism) -%%% ============================================================================ - -%% @doc Create an isolated subinterpreter with OWN_GIL. -%% Returns a handle for making calls. The subinterpreter runs -%% in a dedicated pthread with true parallelism. -%% -%% Requires the thread pool to be started first via subinterp_pool_start/0. -%% -%% Example: -%% ``` -%% ok = py:subinterp_pool_start(). -%% {ok, Sub} = py:subinterp_create(). -%% {ok, Result} = py:subinterp_call(Sub, math, sqrt, [16.0]). -%% ok = py:subinterp_destroy(Sub). -%% ''' --spec subinterp_create() -> {ok, reference()} | {error, term()}. -subinterp_create() -> - py_nif:subinterp_thread_create(). - -%% @doc Destroy a subinterpreter handle. 
-%% Cleans up namespace, releases worker binding. --spec subinterp_destroy(reference()) -> ok. -subinterp_destroy(Handle) -> - py_nif:subinterp_thread_destroy(Handle), - ok. - -%% @doc Call a function in a subinterpreter (blocking). --spec subinterp_call(reference(), py_module(), py_func(), py_args()) -> - {ok, term()} | {error, term()}. -subinterp_call(Handle, Module, Func, Args) -> - subinterp_call(Handle, Module, Func, Args, #{}). - -%% @doc Call a function in a subinterpreter with kwargs (blocking). --spec subinterp_call(reference(), py_module(), py_func(), py_args(), py_kwargs()) -> - {ok, term()} | {error, term()}. -subinterp_call(Handle, Module, Func, Args, Kwargs) -> - ModuleBin = ensure_binary(Module), - FuncBin = ensure_binary(Func), - py_nif:subinterp_thread_call(Handle, ModuleBin, FuncBin, Args, Kwargs). - -%% @doc Evaluate expression in subinterpreter (blocking). --spec subinterp_eval(reference(), binary() | string()) -> - {ok, term()} | {error, term()}. -subinterp_eval(Handle, Code) -> - subinterp_eval(Handle, Code, #{}). - -%% @doc Evaluate expression with locals in subinterpreter (blocking). --spec subinterp_eval(reference(), binary() | string(), map()) -> - {ok, term()} | {error, term()}. -subinterp_eval(Handle, Code, Locals) -> - CodeBin = ensure_binary(Code), - py_nif:subinterp_thread_eval(Handle, CodeBin, Locals). - -%% @doc Execute statements in subinterpreter (blocking, no return). --spec subinterp_exec(reference(), binary() | string()) -> ok | {error, term()}. -subinterp_exec(Handle, Code) -> - CodeBin = ensure_binary(Code), - py_nif:subinterp_thread_exec(Handle, CodeBin). - -%% @doc Cast a call to subinterpreter (fire-and-forget, no result). -%% Returns immediately. Use for side-effects where result is not needed. --spec subinterp_cast(reference(), py_module(), py_func(), py_args()) -> ok. 
-subinterp_cast(Handle, Module, Func, Args) -> - ModuleBin = ensure_binary(Module), - FuncBin = ensure_binary(Func), - py_nif:subinterp_thread_cast(Handle, ModuleBin, FuncBin, Args). - -%% @doc Async call - returns immediately with a reference. -%% Use subinterp_await/1,2 to get the result. -%% Worker uses erlang.send() to deliver result. --spec subinterp_async_call(reference(), py_module(), py_func(), py_args()) -> reference(). -subinterp_async_call(Handle, Module, Func, Args) -> - ModuleBin = ensure_binary(Module), - FuncBin = ensure_binary(Func), - Ref = make_ref(), - py_nif:subinterp_thread_async_call(Handle, ModuleBin, FuncBin, Args, self(), Ref), - Ref. - -%% @doc Wait for async call result. --spec subinterp_await(reference()) -> {ok, term()} | {error, term()}. -subinterp_await(Ref) -> - subinterp_await(Ref, ?DEFAULT_TIMEOUT). - -%% @doc Wait for async call result with timeout. --spec subinterp_await(reference(), timeout()) -> {ok, term()} | {error, term()}. -subinterp_await(Ref, Timeout) -> - receive - {py_subinterp_result, Ref, Result} -> Result - after Timeout -> - {error, timeout} - end. - -%% @doc Start the OWN_GIL subinterpreter thread pool with default workers. -%% Must be called before creating subinterpreter handles. --spec subinterp_pool_start() -> ok | {error, term()}. -subinterp_pool_start() -> - py_nif:subinterp_thread_pool_start(). - -%% @doc Start the OWN_GIL subinterpreter thread pool with N workers. --spec subinterp_pool_start(non_neg_integer()) -> ok | {error, term()}. -subinterp_pool_start(NumWorkers) -> - py_nif:subinterp_thread_pool_start(NumWorkers). - -%% @doc Stop the OWN_GIL subinterpreter thread pool. --spec subinterp_pool_stop() -> ok. -subinterp_pool_stop() -> - py_nif:subinterp_thread_pool_stop(). - -%% @doc Check if the OWN_GIL thread pool is ready. --spec subinterp_pool_ready() -> boolean(). -subinterp_pool_ready() -> - py_nif:subinterp_thread_pool_ready(). - -%% @doc Get OWN_GIL thread pool statistics. 
--spec subinterp_pool_stats() -> map(). -subinterp_pool_stats() -> - py_nif:subinterp_thread_pool_stats(). - %%% ============================================================================ %%% Virtual Environment Support %%% ============================================================================ @@ -1257,30 +1107,22 @@ ensure_binary(S) -> %% @doc Get the current execution mode. %% Returns one of: -%% - `free_threaded': Python 3.13+ with no GIL (Py_GIL_DISABLED) -%% - `worker': Contexts use main interpreter namespaces (default) -%% - `owngil': Contexts use dedicated threads with own GIL (Python 3.14+) -%% - `multi_executor': Traditional Python with N executor threads (Python < 3.12) --spec execution_mode() -> free_threaded | worker | owngil | multi_executor. +%% - `worker': Contexts use dedicated pthread per context (default). +%% Provides stable thread affinity for numpy/torch/tensorflow compatibility. +%% - `owngil': Contexts use dedicated pthread + subinterpreter with own GIL. +%% Enables true parallelism (Python 3.12+ with subinterpreter support). +%% +%% The mode is determined by the `context_mode' application config: +%% ``` +%% application:set_env(erlang_python, context_mode, owngil). +%% ''' +-spec execution_mode() -> worker | owngil. execution_mode() -> - case py_nif:execution_mode() of - free_threaded -> free_threaded; - multi_executor -> multi_executor; - subinterp -> - %% Check actual context_mode config - case application:get_env(erlang_python, context_mode, worker) of - owngil -> owngil; - _ -> worker - end + case application:get_env(erlang_python, context_mode, worker) of + owngil -> owngil; + _ -> worker end. -%% @doc Get the number of executor threads. -%% For `multi_executor' mode, this is the number of executor threads. -%% For other modes, returns 1. --spec num_executors() -> pos_integer(). -num_executors() -> - py_nif:num_executors(). 
- %%% ============================================================================ %%% Shared State %%% ============================================================================ @@ -1475,8 +1317,8 @@ clear_traces() -> %%% Ctx = py:context(), %%% {ok, Result} = py:call(Ctx, math, sqrt, [16]), %%% -%%% %% Or bind a specific context to this process -%%% ok = py:bind_context(py:context(1)), +%%% %% Or bind a specific context to this process via the router +%%% ok = py_context_router:bind_context(py:context(1)), %%% {ok, Result} = py:call(py:context(), math, sqrt, [16]). %%% ''' %%% ============================================================================ @@ -1494,7 +1336,7 @@ start_contexts() -> %% %% Options: %% - `contexts' - Number of contexts to create (default: number of schedulers) -%% - `mode' - Context mode: `worker', `subinterp', or `owngil' (default: `worker') +%% - `mode' - Context mode: `worker' or `owngil' (default: `worker') %% %% @param Opts Start options %% @returns {ok, [Context]} | {error, Reason} diff --git a/src/py_async_pool.erl b/src/py_async_pool.erl deleted file mode 100644 index 11bf5d7..0000000 --- a/src/py_async_pool.erl +++ /dev/null @@ -1,167 +0,0 @@ -%% Copyright 2026 Benoit Chesneau -%% -%% Licensed under the Apache License, Version 2.0 (the "License"); -%% you may not use this file except in compliance with the License. -%% You may obtain a copy of the License at -%% -%% http://www.apache.org/licenses/LICENSE-2.0 -%% -%% Unless required by applicable law or agreed to in writing, software -%% distributed under the License is distributed on an "AS IS" BASIS, -%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -%% See the License for the specific language governing permissions and -%% limitations under the License. - -%%% @doc Pool manager for async Python execution using event loops. 
-%%% -%%% This module provides an async request pool that delegates to the event loop -%%% pool for efficient coroutine execution. It replaces the pthread+usleep -%%% polling model with event-driven execution using enif_select and erlang.send(). -%%% -%%% The pool maintains API compatibility with the previous pthread-based -%%% implementation while providing significant performance improvements. -%%% -%%% @private --module(py_async_pool). --behaviour(gen_server). - --export([ - start_link/0, - start_link/1, - request/1, - get_stats/0 -]). - --export([ - init/1, - handle_call/3, - handle_cast/2, - handle_info/2, - terminate/2 -]). - --record(state, { - pending :: non_neg_integer(), - supported :: boolean() -}). - -%%% ============================================================================ -%%% API -%%% ============================================================================ - --spec start_link() -> {ok, pid()} | {error, term()}. -start_link() -> - start_link(1). - --spec start_link(pos_integer()) -> {ok, pid()} | {error, term()}. -start_link(_NumWorkers) -> - %% NumWorkers is now ignored - we use the event loop pool instead - gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). - -%% @doc Submit an async request to be executed by the event loop pool. --spec request(term()) -> ok. -request(Request) -> - gen_server:cast(?MODULE, {request, Request}). - -%% @doc Get pool statistics. --spec get_stats() -> map(). -get_stats() -> - gen_server:call(?MODULE, get_stats). - -%%% ============================================================================ -%%% gen_server callbacks -%%% ============================================================================ - -init([]) -> - process_flag(trap_exit, true), - %% Check if event loop pool is available - case py_event_loop:get_loop() of - {ok, _LoopRef} -> - {ok, #state{pending = 0, supported = true}}; - {error, _} -> - {ok, #state{pending = 0, supported = false}} - end. 
- -handle_call(get_stats, _From, State) -> - Stats = #{ - pending_requests => State#state.pending, - supported => State#state.supported, - backend => event_loop - }, - {reply, Stats, State}; - -handle_call(_Request, _From, State) -> - {reply, {error, unknown_request}, State}. - -handle_cast({request, Request}, #state{supported = false} = State) -> - {Ref, Caller, _Type} = extract_ref_caller(Request), - Caller ! {py_error, Ref, async_not_supported}, - {noreply, State}; - -handle_cast({request, Request}, State) -> - case transform_request(Request) of - {ok, LoopRequest} -> - case py_event_loop:get_loop() of - {ok, LoopRef} -> - case py_event_loop:run_async(LoopRef, LoopRequest) of - ok -> - {noreply, State#state{pending = State#state.pending + 1}}; - {error, Reason} -> - {Ref, Caller, _} = extract_ref_caller(Request), - Caller ! {py_error, Ref, Reason}, - {noreply, State} - end; - {error, Reason} -> - {Ref, Caller, _} = extract_ref_caller(Request), - Caller ! {py_error, Ref, Reason}, - {noreply, State} - end; - {error, Reason} -> - {Ref, Caller, _} = extract_ref_caller(Request), - Caller ! {py_error, Ref, Reason}, - {noreply, State} - end; - -handle_cast(_Msg, State) -> - {noreply, State}. - -handle_info({async_result, _Ref, _Result}, State) -> - %% Result was sent directly to caller via erlang.send() - %% We just track pending count - {noreply, State#state{pending = max(0, State#state.pending - 1)}}; - -handle_info(_Info, State) -> - {noreply, State}. - -terminate(_Reason, _State) -> - ok. - -%%% ============================================================================ -%%% Internal functions -%%% ============================================================================ - -%% @doc Transform the legacy request format to the new event loop format. 
-transform_request({async_call, Ref, Caller, Module, Func, Args, Kwargs}) -> - {ok, #{ - ref => Ref, - caller => Caller, - module => Module, - func => Func, - args => Args, - kwargs => Kwargs - }}; -transform_request({async_gather, Ref, Caller, Calls}) -> - %% For gather, we need to wrap in a special gather coroutine - %% For now, return an error - gather needs special handling - {error, {gather_not_implemented, Ref, Caller, Calls}}; -transform_request({async_stream, Ref, Caller, Module, Func, Args, Kwargs}) -> - %% For stream, we need async generator support - %% For now, return an error - stream needs special handling - {error, {stream_not_implemented, Ref, Caller, Module, Func, Args, Kwargs}}; -transform_request(Other) -> - {error, {unknown_request_type, Other}}. - -%% @doc Extract ref and caller from different request types. -extract_ref_caller({async_call, Ref, Caller, _, _, _, _}) -> {Ref, Caller, async_call}; -extract_ref_caller({async_gather, Ref, Caller, _}) -> {Ref, Caller, async_gather}; -extract_ref_caller({async_stream, Ref, Caller, _, _, _, _}) -> {Ref, Caller, async_stream}. diff --git a/src/py_context.erl b/src/py_context.erl index 9c342b6..83b8e79 100644 --- a/src/py_context.erl +++ b/src/py_context.erl @@ -125,7 +125,7 @@ stop(Ctx) when is_pid(Ctx) -> %% @doc Create a new context with options map. %% %% Options: -%% - `mode' - Context mode (worker | subinterp | owngil), default: worker +%% - `mode' - Context mode (worker | owngil), default: worker %% %% @param Opts Options map %% @returns {ok, Pid} | {error, Reason} @@ -563,13 +563,14 @@ loop(#state{ref = Ref, interp_id = InterpId} = State) -> loop(State); {exec, From, MRef, Code} -> - Result = py_nif:context_exec(Ref, Code), + Result = handle_exec_with_async(Ref, Code), From ! {MRef, Result}, loop(State); - %% Exec with process-local environment (worker mode) + %% Exec with process-local environment (worker mode). + %% Async dispatch with sync fallback (mirrors call/eval). 
{exec, From, MRef, Code, EnvRef} -> - Result = py_nif:context_exec(Ref, Code, EnvRef), + Result = handle_exec_with_async_and_env(Ref, Code, EnvRef), From ! {MRef, Result}, loop(State); @@ -723,7 +724,23 @@ handle_blocking_callback(Ref, FuncName, Args) -> %% @private %% Handle call with potential suspension for callbacks +%% Uses async dispatch to avoid blocking dirty schedulers when possible. handle_call_with_suspension(Ref, Module, Func, Args, Kwargs) -> + RequestId = make_ref(), + case py_nif:context_call_async(Ref, self(), RequestId, Module, Func, Args, Kwargs) of + {enqueued, RequestId} -> + %% Async dispatch succeeded - wait for result message + wait_for_async_result(Ref, RequestId); + {error, async_requires_worker_thread} -> + %% Fall back to blocking call for non-worker-thread contexts + handle_call_blocking(Ref, Module, Func, Args, Kwargs); + {error, Reason} -> + {error, Reason} + end. + +%% @private +%% Blocking call handler (used when async is not available) +handle_call_blocking(Ref, Module, Func, Args, Kwargs) -> case py_nif:context_call(Ref, Module, Func, Args, Kwargs) of {suspended, _CallbackId, StateRef, {FuncName, CallbackArgs}} -> %% Callback needed - handle it with recursive receive @@ -740,7 +757,36 @@ handle_call_with_suspension(Ref, Module, Func, Args, Kwargs) -> %% @private %% Handle eval with potential suspension for callbacks +%% Uses async dispatch to avoid blocking dirty schedulers when possible. handle_eval_with_suspension(Ref, Code, Locals) -> + RequestId = make_ref(), + case py_nif:context_eval_async(Ref, self(), RequestId, Code, Locals) of + {enqueued, RequestId} -> + %% Async dispatch succeeded - wait for result message + wait_for_async_result(Ref, RequestId); + {error, async_requires_worker_thread} -> + %% Fall back to blocking call for non-worker-thread contexts + handle_eval_blocking(Ref, Code, Locals); + {error, Reason} -> + {error, Reason} + end. 
+ +%% @private +%% Handle exec with async dispatch +handle_exec_with_async(Ref, Code) -> + RequestId = make_ref(), + case py_nif:context_exec_async(Ref, self(), RequestId, Code) of + {enqueued, RequestId} -> + wait_for_async_result(Ref, RequestId); + {error, async_requires_worker_thread} -> + py_nif:context_exec(Ref, Code); + {error, Reason} -> + {error, Reason} + end. + +%% @private +%% Blocking eval handler (used when async is not available) +handle_eval_blocking(Ref, Code, Locals) -> case py_nif:context_eval(Ref, Code, Locals) of {suspended, _CallbackId, StateRef, {FuncName, CallbackArgs}} -> %% Callback needed - handle it with recursive receive @@ -756,8 +802,68 @@ handle_eval_with_suspension(Ref, Code, Locals) -> end. %% @private -%% Handle call with process-local environment +%% Wait for async result from worker thread +%% The worker thread sends {py_result, RequestId, Result} when done. +%% +%% Drains stale {py_result, _, _} messages from prior timed-out +%% requests before the matching receive so a context that experiences +%% repeat timeouts doesn't grow an unbounded mailbox: when +%% wait_for_async_result/2 returns {error, async_timeout}, the C +%% worker can still finish later and deliver the result; without the +%% drain those messages would accumulate forever. +%% +%% Safe because the context process is the sole receiver for its own +%% async results and only one wait_for_async_result/2 is in flight at +%% a time, so the drain cannot consume the result of a concurrent live +%% request. +wait_for_async_result(Ref, RequestId) -> + drain_stale_async_results(RequestId), + receive + {py_result, RequestId, Result} -> + process_async_result(Ref, Result) + after 300000 -> %% 5 minute timeout + {error, async_timeout} + end. + +%% @private +drain_stale_async_results(CurrentId) -> + receive + {py_result, OldId, _} when OldId =/= CurrentId -> + drain_stale_async_results(CurrentId) + after 0 -> + ok + end. 
+ +%% @private +%% Process the result from async dispatch +%% Handles suspension, schedule markers, and normal results. +process_async_result(Ref, {suspended, _CallbackId, StateRef, {FuncName, CallbackArgs}}) -> + CallbackResult = handle_callback_with_nested_receive(Ref, FuncName, CallbackArgs), + resume_and_continue(Ref, StateRef, CallbackResult); +process_async_result(Ref, {schedule, CallbackName, CallbackArgs}) -> + handle_schedule(Ref, CallbackName, CallbackArgs); +process_async_result(_Ref, Result) -> + Result. + +%% @private +%% Handle call with process-local environment. +%% Tries async dispatch first (no 30 s NIF timeout); falls back to the +%% blocking NIF only when the worker thread isn't available. handle_call_with_suspension_and_env(Ref, Module, Func, Args, Kwargs, EnvRef) -> + RequestId = make_ref(), + case py_nif:context_call_with_env_async(Ref, self(), RequestId, + Module, Func, Args, Kwargs, + EnvRef) of + {enqueued, RequestId} -> + wait_for_async_result(Ref, RequestId); + {error, async_requires_worker_thread} -> + handle_call_with_env_blocking(Ref, Module, Func, Args, Kwargs, EnvRef); + {error, Reason} -> + {error, Reason} + end. + +%% @private +handle_call_with_env_blocking(Ref, Module, Func, Args, Kwargs, EnvRef) -> case py_nif:context_call(Ref, Module, Func, Args, Kwargs, EnvRef) of {suspended, _CallbackId, StateRef, {FuncName, CallbackArgs}} -> CallbackResult = handle_callback_with_nested_receive(Ref, FuncName, CallbackArgs), @@ -769,8 +875,23 @@ handle_call_with_suspension_and_env(Ref, Module, Func, Args, Kwargs, EnvRef) -> end. %% @private -%% Handle eval with process-local environment +%% Handle eval with process-local environment. +%% Tries async dispatch first; falls back to the blocking NIF only when +%% the worker thread isn't available. 
handle_eval_with_suspension_and_env(Ref, Code, Locals, EnvRef) -> + RequestId = make_ref(), + case py_nif:context_eval_with_env_async(Ref, self(), RequestId, + Code, Locals, EnvRef) of + {enqueued, RequestId} -> + wait_for_async_result(Ref, RequestId); + {error, async_requires_worker_thread} -> + handle_eval_with_env_blocking(Ref, Code, Locals, EnvRef); + {error, Reason} -> + {error, Reason} + end. + +%% @private +handle_eval_with_env_blocking(Ref, Code, Locals, EnvRef) -> case py_nif:context_eval(Ref, Code, Locals, EnvRef) of {suspended, _CallbackId, StateRef, {FuncName, CallbackArgs}} -> CallbackResult = handle_callback_with_nested_receive(Ref, FuncName, CallbackArgs), @@ -781,6 +902,21 @@ handle_eval_with_suspension_and_env(Ref, Code, Locals, EnvRef) -> Result end. +%% @private +%% Handle exec with process-local environment via the same async-first +%% path used for call/eval. +handle_exec_with_async_and_env(Ref, Code, EnvRef) -> + RequestId = make_ref(), + case py_nif:context_exec_with_env_async(Ref, self(), RequestId, + Code, EnvRef) of + {enqueued, RequestId} -> + wait_for_async_result(Ref, RequestId); + {error, async_requires_worker_thread} -> + py_nif:context_exec(Ref, Code, EnvRef); + {error, Reason} -> + {error, Reason} + end. 
+ %% @private %% Check if a context is a subinterpreter (has interp_id > 0) is_context_subinterp(Ref) -> diff --git a/src/py_context_router.erl b/src/py_context_router.erl index f27814a..67fea1f 100644 --- a/src/py_context_router.erl +++ b/src/py_context_router.erl @@ -135,7 +135,7 @@ start() -> %% %% Options: %% - `contexts' - Number of contexts to create (default: number of schedulers) -%% - `mode' - Context mode: `worker', `subinterp', or `owngil' (default: `worker') +%% - `mode' - Context mode: `worker' or `owngil' (default: `worker') %% %% @param Opts Start options %% @returns {ok, [Context]} | {error, Reason} @@ -279,7 +279,7 @@ start_pool(Pool, Size) -> %% %% @param Pool Pool name (default, io, or custom) %% @param Size Number of contexts in the pool -%% @param Mode Context mode (worker, subinterp, owngil) +%% @param Mode Context mode (worker, owngil) %% @returns {ok, [Context]} | {error, Reason} -spec start_pool(pool_name(), pos_integer(), py_context:context_mode()) -> {ok, [pid()]} | {error, term()}. diff --git a/src/py_context_sup.erl b/src/py_context_sup.erl index 97001ac..f633b01 100644 --- a/src/py_context_sup.erl +++ b/src/py_context_sup.erl @@ -44,7 +44,7 @@ start_link() -> %% @doc Start a new py_context under this supervisor. %% %% @param Id Unique identifier for the context (integer or {Pool, N} tuple) -%% @param Mode Context mode (worker | subinterp | owngil) +%% @param Mode Context mode (worker | owngil) %% @returns {ok, Pid} | {error, Reason} -spec start_context(term(), py_context:context_mode()) -> {ok, pid()} | {error, term()}. diff --git a/src/py_event_loop.erl b/src/py_event_loop.erl index 65093f9..b89bda1 100644 --- a/src/py_event_loop.erl +++ b/src/py_event_loop.erl @@ -340,19 +340,28 @@ init([]) -> %% @doc Set ErlangEventLoop as the default asyncio event loop policy. %% Also extends the C 'erlang' module with Python event loop exports. +%% +%% Python 3.14 deprecated `asyncio.set_event_loop_policy' and 3.16 removes +%% it. 
The integration's run path uses `loop_factory=' directly via +%% `erlang.run/1' and `asyncio.Runner', so the global policy install is +%% only a convenience for user code that calls bare asyncio APIs inside +%% `py:exec'. We skip the install on 3.14+ to avoid the warning; users +%% on 3.14+ should call `erlang.run(main)' or +%% `asyncio.Runner(loop_factory=erlang.new_event_loop)' explicitly. set_default_policy() -> PrivDir = code:priv_dir(erlang_python), %% First, extend the erlang module with Python event loop exports extend_erlang_module(PrivDir), - %% Then set the event loop policy + %% Then set the event loop policy (only on Python < 3.14) Code = iolist_to_binary([ "import sys\n", "priv_dir = '", PrivDir, "'\n", "if priv_dir not in sys.path:\n", " sys.path.insert(0, priv_dir)\n", - "from _erlang_impl import get_event_loop_policy\n", "import asyncio\n", - "asyncio.set_event_loop_policy(get_event_loop_policy())\n" + "if sys.version_info < (3, 14):\n", + " from _erlang_impl import get_event_loop_policy\n", + " asyncio.set_event_loop_policy(get_event_loop_policy())\n" ]), case py:exec(Code) of ok -> ok; @@ -425,10 +434,13 @@ terminate(_Reason, #state{loop_ref = LoopRef, worker_pid = WorkerPid}) -> ok. %% @doc Reset asyncio back to the default event loop policy. +%% Skipped on Python 3.14+ since we never installed one (see +%% set_default_policy/0). reset_default_policy() -> Code = <<" -import asyncio -asyncio.set_event_loop_policy(None) +import sys, asyncio +if sys.version_info < (3, 14): + asyncio.set_event_loop_policy(None) ">>, catch py:exec(Code), ok. 
diff --git a/src/py_nif.erl b/src/py_nif.erl index 2625ea2..55b8b17 100644 --- a/src/py_nif.erl +++ b/src/py_nif.erl @@ -52,28 +52,15 @@ async_call/6, async_gather/3, async_stream/6, - %% Sub-interpreters (Python 3.12+) - shared GIL pool model + %% Subinterpreter capability probes (Python 3.12+ / 3.14+) subinterp_supported/0, owngil_supported/0, - subinterp_worker_new/0, - subinterp_worker_destroy/1, - subinterp_call/5, - parallel_execute/2, - %% OWN_GIL subinterpreter thread pool (true parallelism) + %% OWN_GIL thread pool (internal, used by py_event_loop_pool) subinterp_thread_pool_start/0, subinterp_thread_pool_start/1, subinterp_thread_pool_stop/0, subinterp_thread_pool_ready/0, subinterp_thread_pool_stats/0, - subinterp_thread_create/0, - subinterp_thread_destroy/1, - subinterp_thread_call/4, - subinterp_thread_call/5, - subinterp_thread_eval/2, - subinterp_thread_eval/3, - subinterp_thread_exec/2, - subinterp_thread_cast/4, - subinterp_thread_async_call/6, %% OWN_GIL session management for event loop pool owngil_create_session/1, owngil_submit_task/7, @@ -82,7 +69,6 @@ owngil_apply_paths/3, %% Execution mode info execution_mode/0, - num_executors/0, %% Thread worker support (ThreadPoolExecutor) thread_worker_set_coordinator/1, thread_worker_write/2, @@ -176,6 +162,13 @@ context_eval/4, context_exec/2, context_exec/3, + %% Async dispatch (non-blocking) + context_call_async/7, + context_eval_async/5, + context_exec_async/4, + context_call_with_env_async/8, + context_eval_with_env_async/6, + context_exec_with_env_async/5, context_call_method/4, create_local_env/1, context_to_term/1, @@ -498,116 +491,40 @@ subinterp_supported() -> owngil_supported() -> ?NIF_STUB. -%% @doc Create a new sub-interpreter worker with its own GIL. -%% Returns an opaque reference to be used with subinterp functions. --spec subinterp_worker_new() -> {ok, reference()} | {error, term()}. -subinterp_worker_new() -> - ?NIF_STUB. - -%% @doc Destroy a sub-interpreter worker. 
--spec subinterp_worker_destroy(reference()) -> ok | {error, term()}. -subinterp_worker_destroy(_WorkerRef) -> - ?NIF_STUB. - -%% @doc Call a Python function in a sub-interpreter. -%% Args: WorkerRef, Module (binary), Func (binary), Args (list), Kwargs (map) --spec subinterp_call(reference(), binary(), binary(), list(), map()) -> - {ok, term()} | {error, term()}. -subinterp_call(_WorkerRef, _Module, _Func, _Args, _Kwargs) -> - ?NIF_STUB. - -%% @doc Execute multiple calls in parallel across sub-interpreters. -%% Args: WorkerRefs (list of refs), Calls (list of {Module, Func, Args}) -%% Returns: List of results (one per call) --spec parallel_execute([reference()], [{binary(), binary(), list()}]) -> - {ok, list()} | {error, term()}. -parallel_execute(_WorkerRefs, _Calls) -> - ?NIF_STUB. - %%% ============================================================================ -%%% OWN_GIL Subinterpreter Thread Pool (True Parallelism) +%%% OWN_GIL Thread Pool (internal, used by py_event_loop_pool) %%% ============================================================================ %% @doc Start the OWN_GIL subinterpreter thread pool with default workers. -%% Creates a pool of pthreads, each with an OWN_GIL subinterpreter. +%% @private -spec subinterp_thread_pool_start() -> ok | {error, term()}. subinterp_thread_pool_start() -> ?NIF_STUB. %% @doc Start the OWN_GIL subinterpreter thread pool with N workers. +%% @private -spec subinterp_thread_pool_start(non_neg_integer()) -> ok | {error, term()}. subinterp_thread_pool_start(_NumWorkers) -> ?NIF_STUB. %% @doc Stop the OWN_GIL subinterpreter thread pool. +%% @private -spec subinterp_thread_pool_stop() -> ok. subinterp_thread_pool_stop() -> ?NIF_STUB. %% @doc Check if the OWN_GIL thread pool is ready. +%% @private -spec subinterp_thread_pool_ready() -> boolean(). subinterp_thread_pool_ready() -> ?NIF_STUB. %% @doc Get OWN_GIL thread pool statistics. +%% @private -spec subinterp_thread_pool_stats() -> map(). 
subinterp_thread_pool_stats() -> ?NIF_STUB. -%% @doc Create a new OWN_GIL subinterpreter handle. -%% The handle is bound to a worker thread and has isolated namespace. --spec subinterp_thread_create() -> {ok, reference()} | {error, term()}. -subinterp_thread_create() -> - ?NIF_STUB. - -%% @doc Destroy an OWN_GIL subinterpreter handle. --spec subinterp_thread_destroy(reference()) -> ok | {error, term()}. -subinterp_thread_destroy(_Handle) -> - ?NIF_STUB. - -%% @doc Call a Python function through OWN_GIL subinterpreter (blocking). --spec subinterp_thread_call(reference(), binary(), binary(), list()) -> - {ok, term()} | {error, term()}. -subinterp_thread_call(_Handle, _Module, _Func, _Args) -> - ?NIF_STUB. - -%% @doc Call a Python function through OWN_GIL subinterpreter with kwargs. --spec subinterp_thread_call(reference(), binary(), binary(), list(), map()) -> - {ok, term()} | {error, term()}. -subinterp_thread_call(_Handle, _Module, _Func, _Args, _Kwargs) -> - ?NIF_STUB. - -%% @doc Evaluate Python expression through OWN_GIL subinterpreter. --spec subinterp_thread_eval(reference(), binary()) -> - {ok, term()} | {error, term()}. -subinterp_thread_eval(_Handle, _Code) -> - ?NIF_STUB. - -%% @doc Evaluate Python expression with locals through OWN_GIL subinterpreter. --spec subinterp_thread_eval(reference(), binary(), map()) -> - {ok, term()} | {error, term()}. -subinterp_thread_eval(_Handle, _Code, _Locals) -> - ?NIF_STUB. - -%% @doc Execute Python statements through OWN_GIL subinterpreter (no return). --spec subinterp_thread_exec(reference(), binary()) -> ok | {error, term()}. -subinterp_thread_exec(_Handle, _Code) -> - ?NIF_STUB. - -%% @doc Cast (fire-and-forget) through OWN_GIL subinterpreter. -%% Returns immediately, result is discarded. --spec subinterp_thread_cast(reference(), binary(), binary(), list()) -> ok. -subinterp_thread_cast(_Handle, _Module, _Func, _Args) -> - ?NIF_STUB. - -%% @doc Async call through OWN_GIL subinterpreter. 
-%% Args: Handle, Module, Func, Args, CallerPid, Ref -%% Result is sent to CallerPid as {py_subinterp_result, Ref, Result}. --spec subinterp_thread_async_call(reference(), binary(), binary(), list(), pid(), reference()) -> - ok | {error, term()}. -subinterp_thread_async_call(_Handle, _Module, _Func, _Args, _CallerPid, _Ref) -> - ?NIF_STUB. - %%% ============================================================================ %%% OWN_GIL Session Management (for event loop pool) %%% ============================================================================ @@ -653,22 +570,18 @@ owngil_apply_paths(_WorkerId, _HandleId, _Paths) -> %%% Execution Mode Info %%% ============================================================================ -%% @doc Get the current execution mode. -%% Returns one of: free_threaded | subinterp | multi_executor +%% @doc Get Python capability (internal use). +%% Returns the detected Python runtime capability: %% - free_threaded: Python 3.13+ with no GIL (Py_GIL_DISABLED) -%% - subinterp: Python 3.12+ with per-interpreter GIL -%% - multi_executor: Traditional Python with N executor threads --spec execution_mode() -> free_threaded | subinterp | multi_executor. +%% - gil: Conventional GIL build (any other supported version) +%% +%% For public execution mode, use py:execution_mode/0 which returns +%% `worker | owngil' based on the application configuration. +%% @private +-spec execution_mode() -> free_threaded | gil. execution_mode() -> ?NIF_STUB. -%% @doc Get the number of executor threads. -%% For multi_executor mode, this is the number of executor threads. -%% For other modes, returns 1. --spec num_executors() -> pos_integer(). -num_executors() -> - ?NIF_STUB. 
- %%% ============================================================================ %%% Thread Worker Support (ThreadPoolExecutor) %%% ============================================================================ @@ -1337,6 +1250,86 @@ context_exec(_ContextRef, _Code) -> context_exec(_ContextRef, _Code, _EnvRef) -> ?NIF_STUB. +%% @doc Async call - enqueue and return immediately. +%% +%% Dispatches a Python function call to the worker thread and returns +%% immediately with {enqueued, RequestId}. The worker thread will send +%% {py_result, RequestId, Result} to CallerPid when done. +%% +%% @param ContextRef Context reference +%% @param CallerPid PID to send result to +%% @param RequestId Request ID for correlation +%% @param Module Python module name +%% @param Func Function name +%% @param Args List of arguments +%% @param Kwargs Keyword arguments map +%% @returns {enqueued, RequestId} | {error, Reason} +-spec context_call_async(reference(), pid(), term(), binary(), binary(), list(), map()) -> + {enqueued, term()} | {error, term()}. +context_call_async(_ContextRef, _CallerPid, _RequestId, _Module, _Func, _Args, _Kwargs) -> + ?NIF_STUB. + +%% @doc Async eval - enqueue and return immediately. +%% +%% Dispatches a Python eval to the worker thread and returns immediately +%% with {enqueued, RequestId}. The worker thread will send +%% {py_result, RequestId, Result} to CallerPid when done. +%% +%% @param ContextRef Context reference +%% @param CallerPid PID to send result to +%% @param RequestId Request ID for correlation +%% @param Code Python expression to evaluate +%% @param Locals Local variables map +%% @returns {enqueued, RequestId} | {error, Reason} +-spec context_eval_async(reference(), pid(), term(), binary(), map()) -> + {enqueued, term()} | {error, term()}. +context_eval_async(_ContextRef, _CallerPid, _RequestId, _Code, _Locals) -> + ?NIF_STUB. + +%% @doc Async exec - enqueue and return immediately. 
+%% +%% Dispatches Python code execution to the worker thread and returns +%% immediately with {enqueued, RequestId}. The worker thread will send +%% {py_result, RequestId, Result} to CallerPid when done. +%% +%% @param ContextRef Context reference +%% @param CallerPid PID to send result to +%% @param RequestId Request ID for correlation +%% @param Code Python code to execute +%% @returns {enqueued, RequestId} | {error, Reason} +-spec context_exec_async(reference(), pid(), term(), binary()) -> + {enqueued, term()} | {error, term()}. +context_exec_async(_ContextRef, _CallerPid, _RequestId, _Code) -> + ?NIF_STUB. + +%% @doc Async call with process-local environment. +%% @private +-spec context_call_with_env_async(reference(), pid(), term(), + binary(), binary(), list(), map(), + reference()) -> + {enqueued, term()} | {error, term()}. +context_call_with_env_async(_CtxRef, _CallerPid, _RequestId, + _Module, _Func, _Args, _Kwargs, _EnvRef) -> + ?NIF_STUB. + +%% @doc Async eval with process-local environment. +%% @private +-spec context_eval_with_env_async(reference(), pid(), term(), + binary(), map(), reference()) -> + {enqueued, term()} | {error, term()}. +context_eval_with_env_async(_CtxRef, _CallerPid, _RequestId, + _Code, _Locals, _EnvRef) -> + ?NIF_STUB. + +%% @doc Async exec with process-local environment. +%% @private +-spec context_exec_with_env_async(reference(), pid(), term(), + binary(), reference()) -> + {enqueued, term()} | {error, term()}. +context_exec_with_env_async(_CtxRef, _CallerPid, _RequestId, + _Code, _EnvRef) -> + ?NIF_STUB. + %% @doc Call a method on a Python object in a context. %% %% NO MUTEX - caller must ensure exclusive access (process ownership). diff --git a/src/py_reactor_context.erl b/src/py_reactor_context.erl index 6576eee..0261f8e 100644 --- a/src/py_reactor_context.erl +++ b/src/py_reactor_context.erl @@ -79,7 +79,7 @@ %% @doc Start a new py_reactor_context process. 
%% %% @param Id Unique identifier for this context -%% @param Mode Context mode (worker, subinterp, owngil) +%% @param Mode Context mode (worker, owngil) %% @returns {ok, Pid} | {error, Reason} -spec start_link(pos_integer(), atom()) -> {ok, pid()} | {error, term()}. start_link(Id, Mode) -> @@ -95,7 +95,7 @@ start_link(Id, Mode) -> %% (useful for setting up protocol factory) %% %% @param Id Unique identifier for this context -%% @param Mode Context mode (worker, subinterp, owngil) +%% @param Mode Context mode (worker, owngil) %% @param Opts Options map %% @returns {ok, Pid} | {error, Reason} -spec start_link(pos_integer(), atom(), map()) -> {ok, pid()} | {error, term()}. diff --git a/test/py_SUITE.erl b/test/py_SUITE.erl index dc41700..5df9d98 100644 --- a/test/py_SUITE.erl +++ b/test/py_SUITE.erl @@ -41,7 +41,6 @@ test_venv_pth/1, %% New scalability tests test_execution_mode/1, - test_num_executors/1, test_semaphore_basic/1, test_semaphore_acquire_release/1, test_semaphore_concurrent/1, @@ -101,7 +100,6 @@ all() -> test_venv_pth, %% Scalability tests test_execution_mode, - test_num_executors, test_semaphore_basic, test_semaphore_acquire_release, test_semaphore_concurrent, @@ -578,40 +576,21 @@ test_erlang_attr_syntax(_Config) -> ok. test_asyncio_call(_Config) -> - %% Test async call to asyncio coroutine - %% The async pool runs async functions in a background asyncio event loop - Ref = py:async_call('__main__', 'eval', [<<"1 + 1">>]), + %% Sync function dispatched through the event loop short-circuits to + %% the result, so async_call/async_await round-trips end-to-end. + Ref = py:async_call(math, sqrt, [16]), true = is_reference(Ref), - - %% We may not get a result for simple eval since it's not a real coroutine - %% Just verify the call mechanism works - _Result = py:async_await(Ref, 5000), + {ok, 4.0} = py:async_await(Ref, 5000), ok. 
test_asyncio_gather(_Config) -> - %% Test gathering multiple async calls - %% This tests the async_gather functionality Calls = [ {math, sqrt, [16]}, {math, sqrt, [25]}, {math, sqrt, [36]} ], - Result = py:async_gather(Calls), - ct:pal("async_gather result: ~p~n", [Result]), - - %% Verify the result structure - case Result of - {ok, Results} when is_list(Results) -> - %% Should have 3 results - 3 = length(Results), - %% Verify the values if they're successful - ct:pal("Gathered results: ~p~n", [Results]), - ok; - {error, Reason} -> - %% Async pool might not be fully functional in test env - ct:pal("async_gather returned error (may be expected): ~p~n", [Reason]), - ok - end. + {ok, [4.0, 5.0, 6.0]} = py:async_gather(Calls), + ok. test_subinterp_supported(_Config) -> %% Test that subinterp_supported returns a boolean @@ -733,15 +712,7 @@ test_execution_mode(_Config) -> %% Test that execution_mode returns a valid mode Mode = py:execution_mode(), ct:pal("Execution mode: ~p~n", [Mode]), - true = lists:member(Mode, [free_threaded, subinterp, multi_executor]), - ok. - -test_num_executors(_Config) -> - %% Test that num_executors returns a positive integer - Num = py:num_executors(), - ct:pal("Number of executors: ~p~n", [Num]), - true = is_integer(Num), - true = Num > 0, + true = lists:member(Mode, [worker, owngil]), ok. 
test_semaphore_basic(_Config) -> diff --git a/test/py_actor_SUITE.erl b/test/py_actor_SUITE.erl index 6ddec82..6fc44c2 100644 --- a/test/py_actor_SUITE.erl +++ b/test/py_actor_SUITE.erl @@ -70,7 +70,7 @@ test_process_isolation(_Config) -> {ok, <<"main_process">>} = py:eval(Ctx, <<"isolation_test">>), %% Spawn another process using the same context - Pid = spawn(fun() -> + _Pid = spawn(fun() -> %% This process should have its own environment ok = py:exec(Ctx, <<"isolation_test = 'spawned_process'">>), {ok, Value} = py:eval(Ctx, <<"isolation_test">>), diff --git a/test/py_async_task_SUITE.erl b/test/py_async_task_SUITE.erl index 0e777e8..0646832 100644 --- a/test/py_async_task_SUITE.erl +++ b/test/py_async_task_SUITE.erl @@ -164,7 +164,7 @@ test_async_coroutine(_Config) -> Ref = py_event_loop:create_task(math, sin, [0.0]), Result = py_event_loop:await(Ref, 5000), ct:log("math.sin(0.0) = ~p", [Result]), - {ok, 0.0} = Result. + {ok, +0.0} = Result. test_async_with_args(_Config) -> %% Test with args using operator module @@ -328,7 +328,7 @@ test_interleaved_sync_async(_Config) -> R4 = py_event_loop:create_task(math, sqrt, [64.0]), {ok, 3} = py_event_loop:await(R1, 5000), - {ok, 0.0} = py_event_loop:await(R2, 5000), + {ok, +0.0} = py_event_loop:await(R2, 5000), {ok, 30} = py_event_loop:await(R3, 5000), {ok, 8.0} = py_event_loop:await(R4, 5000), ct:log("Interleaved sync/async tests passed"). diff --git a/test/py_asyncio_policy_SUITE.erl b/test/py_asyncio_policy_SUITE.erl new file mode 100644 index 0000000..2203678 --- /dev/null +++ b/test/py_asyncio_policy_SUITE.erl @@ -0,0 +1,173 @@ +%%% @doc CT suite pinning the version-gated asyncio policy install. +%%% +%%% On Python 3.14+ the integration must NOT call +%%% `asyncio.set_event_loop_policy/0` (deprecated in 3.14, removed in +%%% 3.16). On Python <3.14 the policy install is preserved as the +%%% historical convenience for bare `asyncio.run()` inside `py:exec`. 
+%%% +%%% These cases verify both halves of the gate plus the architectural +%%% claim that the run path works without the global policy. +-module(py_asyncio_policy_SUITE). + +-include_lib("common_test/include/ct.hrl"). + +-export([ + all/0, + init_per_suite/1, + end_per_suite/1 +]). + +-export([ + policy_install_skipped_on_3_14_plus/1, + policy_install_active_below_3_14/1, + async_call_round_trip/1, + erlang_run_uses_erlang_loop/1, + install_raises_on_3_14_plus/1, + install_works_below_3_14/1, + no_deprecation_warning_during_init/1 +]). + +all() -> + [ + policy_install_skipped_on_3_14_plus, + policy_install_active_below_3_14, + async_call_round_trip, + erlang_run_uses_erlang_loop, + install_raises_on_3_14_plus, + install_works_below_3_14, + no_deprecation_warning_during_init + ]. + +init_per_suite(Config) -> + {ok, _} = application:ensure_all_started(erlang_python), + {ok, _} = py:start_contexts(), + Config. + +end_per_suite(_Config) -> + ok = application:stop(erlang_python), + ok. + +%%% --------------------------------------------------------------------------- +%%% Helpers +%%% --------------------------------------------------------------------------- + +python_at_least(Major, Minor) -> + ok = py:exec(<<"import sys">>), + {ok, {Maj, Min}} = py:eval(<<"sys.version_info[:2]">>), + {Maj, Min} >= {Major, Minor}. + +policy_class_name() -> + %% asyncio.get_event_loop_policy() itself emits a DeprecationWarning + %% on 3.14+; suppress it locally so the probe doesn't pollute the run. + Code = << + "import asyncio, warnings\n" + "with warnings.catch_warnings():\n" + " warnings.simplefilter('ignore', DeprecationWarning)\n" + " _pol_name = asyncio.get_event_loop_policy().__class__.__name__\n" + >>, + ok = py:exec(Code), + {ok, Name} = py:eval(<<"_pol_name">>), + Name. 
+ +%%% --------------------------------------------------------------------------- +%%% Tests +%%% --------------------------------------------------------------------------- + +policy_install_skipped_on_3_14_plus(_Config) -> + case python_at_least(3, 14) of + false -> + {skip, "Python <3.14 — global policy install is the right way"}; + true -> + Name = policy_class_name(), + true = Name =/= <<"ErlangEventLoopPolicy">>, + ct:pal("Policy on 3.14+ is ~p (not ErlangEventLoopPolicy, as expected)", + [Name]), + ok + end. + +policy_install_active_below_3_14(_Config) -> + case python_at_least(3, 14) of + true -> + {skip, "Python 3.14+ — policy is intentionally not installed"}; + false -> + <<"ErlangEventLoopPolicy">> = policy_class_name(), + ok + end. + +async_call_round_trip(_Config) -> + %% Independent of policy state: async_call → async_await must succeed. + Ref = py:async_call(math, sqrt, [16]), + true = is_reference(Ref), + {ok, 4.0} = py:async_await(Ref, 5000), + ok. + +erlang_run_uses_erlang_loop(_Config) -> + %% Confirm erlang.run picks up ErlangEventLoop on every supported + %% version, regardless of the global policy state. + ok = py:exec(<< + "import erlang, asyncio\n" + "async def _probe():\n" + " return type(asyncio.get_running_loop()).__name__\n" + "_loop_class = erlang.run(_probe())\n" + >>), + {ok, <<"ErlangEventLoop">>} = py:eval(<<"_loop_class">>), + ok. + +install_raises_on_3_14_plus(_Config) -> + case python_at_least(3, 14) of + false -> + {skip, "Python <3.14 — erlang.install() still functional"}; + true -> + ok = py:exec(<< + "import erlang\n" + "_install_err = None\n" + "try:\n" + " erlang.install()\n" + "except RuntimeError as e:\n" + " _install_err = str(e)\n" + >>), + {ok, ErrMsg} = py:eval(<<"_install_err">>), + true = is_binary(ErrMsg), + true = byte_size(ErrMsg) > 0, + true = binary:match(ErrMsg, <<"3.14+">>) =/= nomatch + orelse binary:match(ErrMsg, <<"loop_factory">>) =/= nomatch, + ok + end. 
+ +install_works_below_3_14(_Config) -> + case python_at_least(3, 14) of + true -> + {skip, "Python 3.14+ — erlang.install() raises by design"}; + false -> + %% A DeprecationWarning is acceptable on 3.12-3.13; any + %% exception is not. + ok = py:exec(<< + "import erlang, warnings\n" + "with warnings.catch_warnings():\n" + " warnings.simplefilter('ignore', DeprecationWarning)\n" + " erlang.install()\n" + >>), + ok + end. + +no_deprecation_warning_during_init(_Config) -> + case python_at_least(3, 14) of + false -> + {skip, "Python <3.14 — no deprecation warning to verify"}; + true -> + %% Mimic the set_default_policy snippet inside a + %% catch_warnings block; assert no set_event_loop_policy + %% warning surfaces. + ok = py:exec(<< + "import asyncio, sys, warnings\n" + "with warnings.catch_warnings(record=True) as _caught:\n" + " warnings.simplefilter('always')\n" + " if sys.version_info < (3, 14):\n" + " from _erlang_impl import get_event_loop_policy\n" + " asyncio.set_event_loop_policy(get_event_loop_policy())\n" + "_relevant = [str(w.message) for w in _caught\n" + " if 'set_event_loop_policy' in str(w.message)]\n" + >>), + {ok, []} = py:eval(<<"_relevant">>), + ok + end. diff --git a/test/py_context_async_drain_SUITE.erl b/test/py_context_async_drain_SUITE.erl new file mode 100644 index 0000000..b2a2ae9 --- /dev/null +++ b/test/py_context_async_drain_SUITE.erl @@ -0,0 +1,64 @@ +%%% @doc Pin the stale-{py_result,_,_}-drain behavior in py_context. +%%% +%%% wait_for_async_result/2 returns {error, async_timeout} after 5 minutes +%%% but the C worker may eventually finish and deliver a {py_result, OldId, +%%% _} message anyway. Without a drain, those messages would pile up on +%%% the context process's mailbox forever. This suite injects a stale +%%% message directly into the context's mailbox and asserts it is gone +%%% after the next legitimate dispatch. +-module(py_context_async_drain_SUITE). + +-include_lib("common_test/include/ct.hrl"). 
+ +-export([ + all/0, + init_per_suite/1, + end_per_suite/1 +]). + +-export([ + drain_stale_results/1 +]). + +all() -> + [drain_stale_results]. + +init_per_suite(Config) -> + {ok, _} = application:ensure_all_started(erlang_python), + {ok, _} = py:start_contexts(), + Config. + +end_per_suite(_Config) -> + ok = application:stop(erlang_python), + ok. + +drain_stale_results(_Config) -> + Ctx = py:context(1), + %% Warm up the context: ensure math is importable, then exercise + %% an async-dispatch call so the loop is fully primed. + {ok, 4.0} = py_context:call(Ctx, math, sqrt, [16]), + + %% Inject a stale result message directly into the context's mailbox. + %% py_context's outer loop matches on specific tags only; an + %% unrecognized {py_result, FakeId, _} is left in place by selective + %% receive and would accumulate forever without the drain. + FakeId = make_ref(), + Ctx ! {py_result, FakeId, junk_should_be_drained}, + {message_queue_len, QLenBefore} = + erlang:process_info(Ctx, message_queue_len), + true = QLenBefore >= 1, + + %% Trigger an async-dispatch call (py_context:call/4 -> handle_call_ + %% with_suspension -> wait_for_async_result/2). The drain runs first + %% and consumes the stale message; the matching receive gets the + %% real result. + {ok, 5.0} = py_context:call(Ctx, math, sqrt, [25]), + + %% Brief settle to let any in-flight worker message land. + timer:sleep(50), + {message_queue_len, QLenAfter} = + erlang:process_info(Ctx, message_queue_len), + {messages, Msgs} = erlang:process_info(Ctx, messages), + ct:pal("Ctx mailbox after drain: len=~p msgs=~p", [QLenAfter, Msgs]), + 0 = QLenAfter, + ok. diff --git a/test/py_context_async_env_SUITE.erl b/test/py_context_async_env_SUITE.erl new file mode 100644 index 0000000..18f13c5 --- /dev/null +++ b/test/py_context_async_env_SUITE.erl @@ -0,0 +1,71 @@ +%%% @doc Pin the async-with-env dispatch path. 
+%%% +%%% v3.0 introduced an async dispatch path for call / eval / exec that +%%% returns {enqueued, RequestId} from the NIF and lets the Erlang side +%%% wait in a normal receive. The env-bearing variants +%%% (py_context:call/5, eval/5 with EnvRef, exec/3) used to take a +%%% blocking sync dispatch with a 30-second pthread_cond_timedwait, +%%% returning {error, worker_timeout} for long-running Python while +%%% the worker kept going. +%%% +%%% These cases verify the env path now uses the async dispatch and +%%% completes correctly. +-module(py_context_async_env_SUITE). + +-include_lib("common_test/include/ct.hrl"). + +-export([ + all/0, + init_per_suite/1, + end_per_suite/1 +]). + +-export([ + async_env_call_returns_correct_result/1, + env_call_does_not_dispatch_timeout/1 +]). + +all() -> + [ + async_env_call_returns_correct_result, + env_call_does_not_dispatch_timeout + ]. + +init_per_suite(Config) -> + {ok, _} = application:ensure_all_started(erlang_python), + {ok, _} = py:start_contexts(), + Config. + +end_per_suite(_Config) -> + ok = application:stop(erlang_python), + ok. + +async_env_call_returns_correct_result(_Config) -> + %% py:call/3 wraps an EnvRef under the hood, so a successful + %% round-trip proves the new context_call_with_env_async path is + %% wired and the worker delivers a {py_result, _, _} for it. + {ok, 4.0} = py:call(math, sqrt, [16]), + {ok, 5.0} = py:call(math, sqrt, [25]), + ok. + +env_call_does_not_dispatch_timeout(_Config) -> + %% Have the Python side block for 1 second. Under the old sync + %% dispatch this exercised the 30-second pthread_cond_timedwait; + %% now it's an Erlang-side receive on {py_result, _, _} so latency + %% should track wall-clock and never produce {error, worker_timeout}. 
+ Ctx = py:context(1), + EnvRef = py:get_local_env(Ctx), + ok = py_context:exec(Ctx, << + "import time\n" + "def _slow_round(x):\n" + " time.sleep(1.0)\n" + " return x * 2\n" + >>, EnvRef), + Start = erlang:monotonic_time(millisecond), + {ok, 14} = py_context:call(Ctx, '__main__', '_slow_round', [7], #{}, + infinity, EnvRef), + Elapsed = erlang:monotonic_time(millisecond) - Start, + ct:pal("env-async call elapsed: ~p ms", [Elapsed]), + true = Elapsed >= 900, + true = Elapsed < 5000, + ok. diff --git a/test/py_pid_send_SUITE.erl b/test/py_pid_send_SUITE.erl index 0f9f6ea..45dc33d 100644 --- a/test/py_pid_send_SUITE.erl +++ b/test/py_pid_send_SUITE.erl @@ -84,7 +84,7 @@ init_per_suite(Config) -> {ok, _} = application:ensure_all_started(erlang_python), %% Add test directory to Python path on ALL contexts %% (subinterpreters have isolated sys.path) - TestDir = code:lib_dir(erlang_python, test), + TestDir = filename:join(code:lib_dir(erlang_python), "test"), PathCmd = iolist_to_binary(io_lib:format( "import sys; sys.path.insert(0, '~s')", [TestDir])), NumContexts = py_context_router:num_contexts(), diff --git a/test/py_reactor_SUITE.erl b/test/py_reactor_SUITE.erl index f0b4d74..d48003b 100644 --- a/test/py_reactor_SUITE.erl +++ b/test/py_reactor_SUITE.erl @@ -247,7 +247,7 @@ reactor.set_protocol_factory(AsyncPendingProtocol) ">>, %% Start reactor context with protocol factory setup - {ok, ReactorCtx} = py_reactor_context:start_link(1, auto, #{ + {ok, ReactorCtx} = py_reactor_context:start_link(1, worker, #{ setup_code => SetupCode }), diff --git a/test/py_reentrant_SUITE.erl b/test/py_reentrant_SUITE.erl index ca01693..0db9909 100644 --- a/test/py_reentrant_SUITE.erl +++ b/test/py_reentrant_SUITE.erl @@ -288,7 +288,7 @@ test_callback_with_try_except(_Config) -> end), %% Add test directory to Python path so we can import the test module - TestDir = code:lib_dir(erlang_python, test), + TestDir = filename:join(code:lib_dir(erlang_python), "test"), ok = 
py:exec(iolist_to_binary(io_lib:format( "import sys; sys.path.insert(0, '~s')", [TestDir]))), @@ -324,7 +324,7 @@ test_async_call(_Config) -> py:register_function(async_multiply, fun([X, Y]) -> X * Y end), %% Add test directory to Python path - TestDir = code:lib_dir(erlang_python, test), + TestDir = filename:join(code:lib_dir(erlang_python), "test"), ok = py:exec(iolist_to_binary(io_lib:format( "import sys; sys.path.insert(0, '~s')", [TestDir]))), diff --git a/test/py_subinterp_SUITE.erl b/test/py_subinterp_SUITE.erl deleted file mode 100644 index 83dcc74..0000000 --- a/test/py_subinterp_SUITE.erl +++ /dev/null @@ -1,356 +0,0 @@ -%% Copyright 2026 Benoit Chesneau -%% -%% Licensed under the Apache License, Version 2.0 (the "License"); -%% you may not use this file except in compliance with the License. -%% You may obtain a copy of the License at -%% -%% http://www.apache.org/licenses/LICENSE-2.0 -%% -%% Unless required by applicable law or agreed to in writing, software -%% distributed under the License is distributed on an "AS IS" BASIS, -%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -%% See the License for the specific language governing permissions and -%% limitations under the License. - -%%% @doc Test suite for OWN_GIL subinterpreter thread pool API. -%%% -%%% Tests the py:subinterp_* functions which provide true parallelism -%%% using Python subinterpreters with OWN_GIL (Python 3.12+). --module(py_subinterp_SUITE). - --include_lib("common_test/include/ct.hrl"). --include_lib("stdlib/include/assert.hrl"). - --export([ - all/0, - groups/0, - init_per_suite/1, - end_per_suite/1, - init_per_group/2, - end_per_group/2, - init_per_testcase/2, - end_per_testcase/2 -]). 
- -%% Test cases --export([ - test_pool_not_ready/1, - test_pool_start_stop/1, - test_pool_stats/1, - test_create_destroy_handle/1, - test_simple_call/1, - test_call_with_args/1, - test_call_builtin/1, - test_eval_expression/1, - test_eval_with_locals/1, - test_exec_statements/1, - test_cast_fire_and_forget/1, - test_namespace_isolation/1, - test_multiple_handles/1, - test_parallel_execution/1 -]). - -%%% ============================================================================ -%%% CT Callbacks -%%% ============================================================================ - -all() -> - case py:subinterp_supported() of - true -> - [{group, pool_lifecycle}, - {group, handle_lifecycle}, - {group, execution}, - {group, isolation}]; - false -> - ct:pal("Skipping subinterpreter tests - not supported on this Python version"), - [] - end. - -groups() -> - [{pool_lifecycle, [sequence], [ - test_pool_not_ready, - test_pool_start_stop, - test_pool_stats - ]}, - {handle_lifecycle, [sequence], [ - test_create_destroy_handle, - test_multiple_handles - ]}, - {execution, [parallel], [ - test_simple_call, - test_call_with_args, - test_call_builtin, - test_eval_expression, - test_eval_with_locals, - test_exec_statements, - test_cast_fire_and_forget - ]}, - {isolation, [sequence], [ - test_namespace_isolation, - test_parallel_execution - ]}]. - -init_per_suite(Config) -> - %% Ensure erlang_python application is started - case application:ensure_all_started(erlang_python) of - {ok, _} -> ok; - {error, {already_started, _}} -> ok - end, - Config. - -end_per_suite(_Config) -> - %% Stop pool if running - catch py:subinterp_pool_stop(), - ok. - -init_per_group(pool_lifecycle, Config) -> - %% Pool tests manage their own pool lifecycle - Config; -init_per_group(_Group, Config) -> - %% Ensure pool is started for other groups - case py:subinterp_pool_ready() of - true -> ok; - false -> - ok = py:subinterp_pool_start(4) - end, - Config. 
- -end_per_group(pool_lifecycle, _Config) -> - %% Clean up pool after lifecycle tests - catch py:subinterp_pool_stop(), - ok; -end_per_group(_Group, _Config) -> - ok. - -init_per_testcase(_TestCase, Config) -> - Config. - -end_per_testcase(_TestCase, _Config) -> - ok. - -%%% ============================================================================ -%%% Pool Lifecycle Tests -%%% ============================================================================ - -test_pool_not_ready(_Config) -> - %% Pool should not be ready initially (after stop in end_per_suite) - ?assertEqual(false, py:subinterp_pool_ready()), - - %% Creating handle should fail when pool not ready - Result = py:subinterp_create(), - ?assertMatch({error, _}, Result), - ok. - -test_pool_start_stop(_Config) -> - %% Start with default workers - ?assertEqual(ok, py:subinterp_pool_start()), - ?assertEqual(true, py:subinterp_pool_ready()), - - %% Stop - ?assertEqual(ok, py:subinterp_pool_stop()), - ?assertEqual(false, py:subinterp_pool_ready()), - - %% Start with specific number of workers - ?assertEqual(ok, py:subinterp_pool_start(2)), - ?assertEqual(true, py:subinterp_pool_ready()), - - Stats = py:subinterp_pool_stats(), - ?assertEqual(2, maps:get(num_workers, Stats)), - - %% Stop for next tests - ?assertEqual(ok, py:subinterp_pool_stop()), - ok. - -test_pool_stats(_Config) -> - %% Start pool - ?assertEqual(ok, py:subinterp_pool_start(4)), - - Stats = py:subinterp_pool_stats(), - ?assertEqual(4, maps:get(num_workers, Stats)), - ?assertEqual(true, maps:get(initialized, Stats)), - ?assertEqual(0, maps:get(total_requests, Stats)), - ?assertEqual(0, maps:get(total_errors, Stats)), - - %% Stop for next group - ?assertEqual(ok, py:subinterp_pool_stop()), - ok. 
- -%%% ============================================================================ -%%% Handle Lifecycle Tests -%%% ============================================================================ - -test_create_destroy_handle(_Config) -> - %% Create handle - {ok, Handle} = py:subinterp_create(), - ?assert(is_reference(Handle)), - - %% Destroy handle - ?assertEqual(ok, py:subinterp_destroy(Handle)), - - %% Creating another handle should work - {ok, Handle2} = py:subinterp_create(), - ?assert(is_reference(Handle2)), - ?assertEqual(ok, py:subinterp_destroy(Handle2)), - ok. - -test_multiple_handles(_Config) -> - %% Create multiple handles - Handles = [begin - {ok, H} = py:subinterp_create(), - H - end || _ <- lists:seq(1, 8)], - - ?assertEqual(8, length(Handles)), - - %% Destroy all handles - [py:subinterp_destroy(H) || H <- Handles], - ok. - -%%% ============================================================================ -%%% Execution Tests -%%% ============================================================================ - -test_simple_call(_Config) -> - {ok, Handle} = py:subinterp_create(), - - %% Simple math operation - Result = py:subinterp_call(Handle, math, sqrt, [16.0]), - ?assertMatch({ok, _}, Result), - - py:subinterp_destroy(Handle), - ok. - -test_call_with_args(_Config) -> - {ok, Handle} = py:subinterp_create(), - - %% Call with multiple args - max function - Result = py:subinterp_call(Handle, builtins, max, [[1, 5, 3, 9, 2]]), - case Result of - {ok, 9} -> ok; - {ok, _} -> ok; % Accept any successful result - {error, _} = Err -> ct:pal("Call failed: ~p", [Err]) - end, - - py:subinterp_destroy(Handle), - ok. - -test_call_builtin(_Config) -> - {ok, Handle} = py:subinterp_create(), - - %% Call builtin len - Result = py:subinterp_call(Handle, builtins, len, [<<"hello">>]), - case Result of - {ok, 5} -> ok; - {ok, _} -> ok; - {error, _} = Err -> ct:pal("Call failed: ~p", [Err]) - end, - - py:subinterp_destroy(Handle), - ok. 
-
-test_eval_expression(_Config) ->
-    {ok, Handle} = py:subinterp_create(),
-
-    %% Simple expression
-    Result = py:subinterp_eval(Handle, <<"1 + 2 + 3">>),
-    ?assertMatch({ok, 6}, Result),
-
-    py:subinterp_destroy(Handle),
-    ok.
-
-test_eval_with_locals(_Config) ->
-    {ok, Handle} = py:subinterp_create(),
-
-    %% Expression with local variables
-    Result = py:subinterp_eval(Handle, <<"x + y">>, #{x => 10, y => 20}),
-    case Result of
-        {ok, 30} -> ok;
-        {ok, _} -> ok;
-        {error, _} = Err -> ct:pal("Eval failed: ~p", [Err])
-    end,
-
-    py:subinterp_destroy(Handle),
-    ok.
-
-test_exec_statements(_Config) ->
-    {ok, Handle} = py:subinterp_create(),
-
-    %% Execute Python statements
-    Result = py:subinterp_exec(Handle, <<"x = 5\ny = 10\nresult = x + y">>),
-    ?assertMatch({ok, _}, Result),
-
-    py:subinterp_destroy(Handle),
-    ok.
-
-test_cast_fire_and_forget(_Config) ->
-    {ok, Handle} = py:subinterp_create(),
-
-    %% Cast should return immediately
-    ?assertEqual(ok, py:subinterp_cast(Handle, math, sqrt, [100.0])),
-
-    %% Small delay to let cast execute
-    timer:sleep(50),
-
-    py:subinterp_destroy(Handle),
-    ok.
-
-%%% ============================================================================
-%%% Isolation Tests
-%%% ============================================================================
-
-test_namespace_isolation(_Config) ->
-    %% Create two handles
-    {ok, Handle1} = py:subinterp_create(),
-    {ok, Handle2} = py:subinterp_create(),
-
-    %% Set variable in Handle1
-    py:subinterp_exec(Handle1, <<"test_var = 42">>),
-
-    %% Try to access in Handle2 - should not be visible
-    Result = py:subinterp_eval(Handle2, <<"test_var">>),
-    ?assertMatch({error, _}, Result),
-
-    py:subinterp_destroy(Handle1),
-    py:subinterp_destroy(Handle2),
-    ok.
-
-test_parallel_execution(_Config) ->
-    %% Create handles
-    {ok, H1} = py:subinterp_create(),
-    {ok, H2} = py:subinterp_create(),
-
-    Parent = self(),
-
-    %% Start parallel execution
-    Start = erlang:monotonic_time(millisecond),
-
-    %% Both should execute concurrently with different GILs
-    spawn(fun() ->
-        %% Simulate CPU work
-        Result = py:subinterp_eval(H1, <<"sum(range(100000))">>),
-        Parent ! {done, 1, Result}
-    end),
-
-    spawn(fun() ->
-        %% Simulate CPU work
-        Result = py:subinterp_eval(H2, <<"sum(range(100000))">>),
-        Parent ! {done, 2, Result}
-    end),
-
-    %% Collect results
-    R1 = receive {done, 1, Res1} -> Res1 after 5000 -> timeout end,
-    R2 = receive {done, 2, Res2} -> Res2 after 5000 -> timeout end,
-
-    End = erlang:monotonic_time(millisecond),
-    Duration = End - Start,
-
-    ct:pal("Parallel execution took ~p ms", [Duration]),
-    ct:pal("Results: ~p, ~p", [R1, R2]),
-
-    %% Both should succeed
-    ?assertMatch({ok, _}, R1),
-    ?assertMatch({ok, _}, R2),
-
-    py:subinterp_destroy(H1),
-    py:subinterp_destroy(H2),
-    ok.
diff --git a/test/py_thread_affinity_SUITE.erl b/test/py_thread_affinity_SUITE.erl
new file mode 100644
index 0000000..5dce505
--- /dev/null
+++ b/test/py_thread_affinity_SUITE.erl
@@ -0,0 +1,160 @@
+%%% @doc Thread-affinity invariants for the per-context worker model.
+%%%
+%%% After the v3.0 simplification each context owns a dedicated pthread
+%%% that handles all of its Python operations. These tests assert the
+%%% invariants that motivated the rework so we don't regress numpy /
+%%% torch / tensorflow thread-local state safety:
+%%%
+%%% - exec / eval / call on the same context all share one OS thread
+%%% - calls from N different Erlang processes targeting the same
+%%%   context all converge on that context's worker thread
+%%% - distinct contexts get distinct worker threads
+%%% - the same invariants hold under owngil mode when supported
-module(py_thread_affinity_SUITE).
+
+-include_lib("common_test/include/ct.hrl").
+ +-export([ + all/0, + init_per_suite/1, + end_per_suite/1 +]). + +-export([ + exec_eval_call_share_thread/1, + multi_process_share_context_thread/1, + distinct_contexts_distinct_threads/1, + owngil_thread_affinity/1 +]). + +all() -> + [ + exec_eval_call_share_thread, + multi_process_share_context_thread, + distinct_contexts_distinct_threads, + owngil_thread_affinity + ]. + +init_per_suite(Config) -> + {ok, _} = application:ensure_all_started(erlang_python), + {ok, _} = py:start_contexts(), + Config. + +end_per_suite(_Config) -> + ok = application:stop(erlang_python), + ok. + +%%% ============================================================================ +%%% Helpers +%%% ============================================================================ + +native_id(Ctx) -> + case py_context:eval(Ctx, <<"_pta_get_tid()">>, #{}) of + {ok, Tid} -> Tid; + Other -> ct:fail({native_id_failed, Other}) + end. + +setup_helper(Ctx) -> + ok = py_context:exec(Ctx, << + "import threading\n" + "def _pta_get_tid():\n" + " return threading.get_native_id()\n" + >>). + +%%% ============================================================================ +%%% Tests +%%% ============================================================================ + +exec_eval_call_share_thread(_Config) -> + Ctx = py:context(1), + setup_helper(Ctx), + %% Stash a thread id from exec, then read it back via eval. + ok = py_context:exec(Ctx, << + "import threading\n" + "_pta_exec_tid = threading.get_native_id()\n" + >>), + {ok, ExecTid} = py_context:eval(Ctx, <<"_pta_exec_tid">>, #{}), + EvalTid = native_id(Ctx), + {ok, CallTid} = py_context:call(Ctx, '__main__', '_pta_get_tid', []), + ct:pal("exec=~p eval=~p call=~p", [ExecTid, EvalTid, CallTid]), + ExecTid = EvalTid, + EvalTid = CallTid, + ok. + +multi_process_share_context_thread(_Config) -> + Ctx = py:context(1), + setup_helper(Ctx), + Parent = self(), + N = 8, + Pids = [spawn(fun() -> + Tid = native_id(Ctx), + Parent ! 
{tid, self(), Tid} + end) || _ <- lists:seq(1, N)], + Tids = [receive {tid, Pid, T} -> T after 5000 -> ct:fail(timeout) end + || Pid <- Pids], + ct:pal("tids = ~p", [Tids]), + [Single] = lists:usort(Tids), + true = is_integer(Single), + ok. + +distinct_contexts_distinct_threads(_Config) -> + case py_context_router:num_contexts() of + N when N >= 2 -> + Ctx1 = py:context(1), + Ctx2 = py:context(2), + setup_helper(Ctx1), + setup_helper(Ctx2), + T1 = native_id(Ctx1), + T2 = native_id(Ctx2), + ct:pal("ctx1=~p ctx2=~p", [T1, T2]), + true = T1 =/= T2, + ok; + _ -> + {skip, "needs at least 2 contexts"} + end. + +owngil_thread_affinity(_Config) -> + case py:subinterp_supported() of + false -> + {skip, "subinterpreters not supported on this Python build"}; + true -> + case py_context:new(#{mode => owngil}) of + {error, owngil_requires_python314} -> + {skip, "owngil requires Python 3.14+"}; + {error, Reason} -> + ct:fail({owngil_create_failed, Reason}); + {ok, Ctx} -> + try + setup_helper(Ctx), + ok = py_context:exec(Ctx, << + "import threading\n" + "_pta_exec_tid = threading.get_native_id()\n" + >>), + {ok, ExecTid} = py_context:eval(Ctx, + <<"_pta_exec_tid">>, #{}), + EvalTid = native_id(Ctx), + {ok, CallTid} = py_context:call(Ctx, + '__main__', + '_pta_get_tid', []), + ct:pal("owngil exec=~p eval=~p call=~p", + [ExecTid, EvalTid, CallTid]), + ExecTid = EvalTid, + EvalTid = CallTid, + + Parent = self(), + Pids = [spawn(fun() -> + Tid = native_id(Ctx), + Parent ! {tid, self(), Tid} + end) || _ <- lists:seq(1, 4)], + Tids = [receive {tid, Pid, T} -> T + after 5000 -> ct:fail(timeout) + end || Pid <- Pids], + ct:pal("owngil multi-proc tids = ~p", [Tids]), + [Single] = lists:usort(Tids), + Single = ExecTid, + ok + after + py_context:stop(Ctx) + end + end + end.