From dc0c7a18119969b93c17868de309729182dbb981 Mon Sep 17 00:00:00 2001
From: Benoit Chesneau <bchesneau@gmail.com>
Date: Wed, 8 Apr 2026 18:17:26 +0200
Subject: [PATCH 01/17] Simplify execution model to worker + owngil modes

Breaking changes for v3.0.0:
- py:execution_mode/0 now returns worker | owngil only
- Removed py:num_executors/0 (each context has own worker thread)
- Removed multi_executor pool and dead dispatch code

Architecture changes:
- Per-context worker threads with stable thread affinity
- Async NIF dispatch with message passing (no dirty scheduler blocking)
- Request queue per context (replaces single-slot pattern)

Fixes numpy/torch/tensorflow thread-local state issues.
---
 CHANGELOG.md              |   40 +-
 c_src/py_nif.c            | 2171 +++++++++++++++++++++++++++++--------
 c_src/py_nif.h            |  236 +++-
 docs/getting-started.md   |    2 +-
 docs/migration.md         |  129 ++-
 docs/scalability.md       |  108 +-
 src/erlang_python.app.src |    2 +-
 src/py.erl                |   35 +-
 src/py_context.erl        |   72 +-
 src/py_nif.erl            |   76 +-
 test/py_SUITE.erl         |   12 +-
 11 files changed, 2204 insertions(+), 679 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 081a7c6..02beb7c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,23 +1,39 @@
 # Changelog
 
-## 2.4.0 (Unreleased)
+## 3.0.0 (Unreleased)
 
-### Added
+### Breaking Changes
+
+- **Simplified execution model** - Only two public execution modes: `worker` and `owngil`
+  - `worker`: Dedicated pthread per context with stable thread affinity (default)
+  - `owngil`: Dedicated pthread + subinterpreter with own GIL (Python 3.14+)
+  - Removed `multi_executor` and `free_threaded` from public API
+  - Internal capability detection still tracks Python features
 
-- **Context thread affinity** - Contexts in MULTI_EXECUTOR mode are now assigned a
-  fixed executor thread at creation. All operations (call, eval, exec) from the same
-  context run on the same OS thread, preventing thread state corruption in libraries
-  like numpy and PyTorch that have thread-local state.
+- **Removed `py:num_executors/0`** - Contexts now use per-context worker threads
+  instead of a shared executor pool. This function is no longer needed.
+
+- **`py:execution_mode/0` returns `worker | owngil`** - Based on the `context_mode`
+  application configuration. Previously returned internal capabilities like
+  `free_threaded`, `subinterp`, or `multi_executor`.
 
 ### Changed
 
-- **`py:execution_mode/0` now returns actual mode** - Returns `worker` (default),
-  `owngil`, `free_threaded`, or `multi_executor` based on actual configuration
-  instead of Python capability. Previously returned `subinterp` even when using
-  worker mode.
+- **Per-context worker threads** - Each context now gets its own dedicated pthread
+  that handles all Python operations. This provides stable thread affinity for
+  numpy/torch/tensorflow compatibility without needing a shared executor pool.
+
+- **Async NIF dispatch** - Context operations use async NIFs with message passing
+  instead of blocking dirty schedulers. This improves concurrency under load.
+
+- **Request queue per context** - Replaced single-slot request pattern with proper
+  request queues that support multiple concurrent callers.
+
+### Removed
 
-- **Removed obsolete subinterp test references** - Test suites updated to reflect
-  the removal of subinterpreter mode. Tests now use `worker` or `owngil` modes.
+- Multi-executor pool (`g_executors[]`, `multi_executor_start/stop`)
+- `context_dispatch_call/eval/exec` functions (dead code)
+- References to `PY_MODE_MULTI_EXECUTOR` in context operations
 
 ## 2.3.1 (2026-04-01)
 
diff --git a/c_src/py_nif.c b/c_src/py_nif.c
index 93dd59b..a7e0612 100644
--- a/c_src/py_nif.c
+++ b/c_src/py_nif.c
@@ -1142,7 +1142,9 @@ static ERL_NIF_TERM nif_py_init(ErlNifEnv *env, int argc, const ERL_NIF_TERM arg
     /* Save main thread state and release GIL for other threads */
     g_main_thread_state = PyEval_SaveThread();
 
-    /* Start executors based on execution mode */
+    /* Start the single executor for coordinator operations and the legacy
+     * worker API; free-threaded mode starts none. Context operations run on
+     * per-context worker threads instead (see worker_context_init). */
     int executor_result = 0;
     switch (g_execution_mode) {
         case PY_MODE_FREE_THREADED:
@@ -1150,29 +1152,10 @@ static ERL_NIF_TERM nif_py_init(ErlNifEnv *env, int argc, const ERL_NIF_TERM arg
             break;
 
         case PY_MODE_SUBINTERP:
-            /* Use single executor for coordinator operations */
-            executor_result = executor_start();
-            break;
-
         case PY_MODE_MULTI_EXECUTOR:
         default:
-            /* Start multiple executors for GIL contention mode */
-            {
-                int num_exec = MIN_EXECUTORS;  /* Fallback if not provided */
-                /* Check for config */
-                if (argc > 0 && enif_is_map(env, argv[0])) {
-                    ERL_NIF_TERM key = enif_make_atom(env, "num_executors");
-                    ERL_NIF_TERM value;
-                    if (enif_get_map_value(env, argv[0], key, &value)) {
-                        enif_get_int(env, value, &num_exec);
-                    }
-                }
-                executor_result = multi_executor_start(num_exec);
-                if (executor_result < 0) {
-                    /* Fallback to single executor */
-                    executor_result = executor_start();
-                }
-            }
+            /* Use single executor for coordinator operations */
+            executor_result = executor_start();
             break;
     }
 
@@ -1221,23 +1204,16 @@ static ERL_NIF_TERM nif_finalize(ErlNifEnv *env, int argc, const ERL_NIF_TERM ar
      * 3. Then clean up caches with GIL (no active work at this point)
      */
 
-    /* Step 1: Stop executors - they will finish in-flight requests and exit */
+    /* Step 1: Stop executor - it will finish in-flight requests and exit */
     switch (g_execution_mode) {
         case PY_MODE_FREE_THREADED:
             /* No executor to stop */
             break;
 
         case PY_MODE_SUBINTERP:
-            executor_stop();
-            break;
-
         case PY_MODE_MULTI_EXECUTOR:
         default:
-            if (atomic_load(&g_multi_executor_initialized)) {
-                multi_executor_stop();
-            } else {
-                executor_stop();
-            }
+            executor_stop();
             break;
     }
 
@@ -1771,13 +1747,6 @@ static ERL_NIF_TERM nif_execution_mode(ErlNifEnv *env, int argc, const ERL_NIF_T
     return enif_make_atom(env, mode_str);
 }
 
-static ERL_NIF_TERM nif_num_executors(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
-    (void)argc;
-    (void)argv;
-
-    return enif_make_int(env, g_num_executors);
-}
-
 /* ============================================================================
  * Callback support NIFs
  * ============================================================================ */
@@ -2236,7 +2205,116 @@ static PyObject *context_get_module(py_context_t *ctx, const char *module_name);
  *   - Terms are passed via enif_make_copy() (zero serialization overhead)
  * ============================================================================ */
 
-#ifdef HAVE_SUBINTERPRETERS
+/* ============================================================================
+ * Context Request Queue Operations
+ *
+ * These functions manage the request queue for worker/owngil contexts.
+ * They replace the single-slot pattern that had race conditions.
+ * Available for all Python versions to support worker thread mode.
+ * ============================================================================ */
+
+/**
+ * @brief Append a request to the tail of a context's request queue
+ *
+ * Takes ctx->queue_mutex, links req at the tail, and signals queue_not_empty
+ * before unlocking. NOTE(review): shutdown_requested is not checked here.
+ *
+ * @param ctx The context whose queue receives the request
+ * @param req The request; caller must already hold refcount 2 (caller + queue)
+ */
+static void ctx_queue_enqueue(py_context_t *ctx, ctx_request_t *req) {
+    pthread_mutex_lock(&ctx->queue_mutex);
+
+    req->next = NULL;
+    if (ctx->queue_tail == NULL) {
+        ctx->queue_head = req;
+        ctx->queue_tail = req;
+    } else {
+        ctx->queue_tail->next = req;
+        ctx->queue_tail = req;
+    }
+
+    pthread_cond_signal(&ctx->queue_not_empty);
+    pthread_mutex_unlock(&ctx->queue_mutex);
+}
+
+/**
+ * @brief Pop the request at the head of a context's request queue
+ *
+ * Waits on queue_not_empty while the queue is empty and shutdown_requested is
+ * unset. Queued requests are still returned after shutdown is requested.
+ *
+ * @param ctx The context
+ * @return The dequeued request (next cleared), or NULL if empty at shutdown
+ */
+static ctx_request_t *ctx_queue_dequeue(py_context_t *ctx) {
+    pthread_mutex_lock(&ctx->queue_mutex);
+
+    while (ctx->queue_head == NULL && !atomic_load(&ctx->shutdown_requested)) {
+        pthread_cond_wait(&ctx->queue_not_empty, &ctx->queue_mutex);
+    }
+
+    ctx_request_t *req = ctx->queue_head;
+    if (req != NULL) {
+        ctx->queue_head = req->next;
+        if (ctx->queue_head == NULL) {
+            ctx->queue_tail = NULL;
+        }
+        req->next = NULL;
+    }
+
+    pthread_mutex_unlock(&ctx->queue_mutex);
+    return req;
+}
+
+/**
+ * @brief Cancel all pending requests in a context's queue
+ *
+ * Called during context destruction. Holding queue_mutex, marks each request
+ * cancelled and completed, drops the queue reference, and empties the queue.
+ *
+ * @param ctx The context
+ */
+static void ctx_queue_cancel_all(py_context_t *ctx) {
+    pthread_mutex_lock(&ctx->queue_mutex);
+
+    ctx_request_t *req = ctx->queue_head;
+    while (req != NULL) {
+        ctx_request_t *next = req->next;
+        atomic_store(&req->cancelled, true);
+
+        /* Wake blocking waiters; NOTE(review): async_mode callers get no message here */
+        pthread_mutex_lock(&req->mutex);
+        atomic_store(&req->completed, true);
+        pthread_cond_signal(&req->cond);
+        pthread_mutex_unlock(&req->mutex);
+
+        /* Release queue's reference */
+        ctx_request_release(req);
+        req = next;
+    }
+
+    ctx->queue_head = NULL;
+    ctx->queue_tail = NULL;
+
+    pthread_mutex_unlock(&ctx->queue_mutex);
+}
+
+/* ============================================================================
+ * Legacy execute functions (use context fields for compatibility)
+ *
+ * These functions read from ctx->shared_env/request_term and write to
+ * ctx->response_term/response_ok. The new queue-based approach populates
+ * these fields from the dequeued request for compatibility.
+ *
+ * TODO: Refactor these to take ctx_request_t* directly in a future phase.
+ * ============================================================================ */
+
+/* Per-thread state for the request being processed (compatibility layer); NOTE(review): no use is visible in this part of the patch */
+static __thread ErlNifEnv *tl_current_req_env = NULL;
+static __thread ERL_NIF_TERM tl_current_req_data = 0;
+static __thread ERL_NIF_TERM *tl_current_response = NULL;
+static __thread bool *tl_current_response_ok = NULL;
 
 /**
  * @brief Execute a call request in the OWN_GIL thread
@@ -2703,9 +2781,13 @@ static void owngil_execute_eval_with_env(py_context_t *ctx) {
         return;
     }
 
-    /* Set thread-local env for callback support */
+    /* Set thread-local state for callback/suspension support */
+    py_context_t *prev_context = tl_current_context;
+    tl_current_context = ctx;
     py_env_resource_t *prev_local_env = tl_current_local_env;
     tl_current_local_env = penv;
+    bool prev_allow_suspension = tl_allow_suspension;
+    tl_allow_suspension = true;
 
     /* Build eval_locals from penv->globals + any passed locals */
     PyObject *eval_locals = PyDict_Copy(penv->globals);
@@ -2723,6 +2805,8 @@ static void owngil_execute_eval_with_env(py_context_t *ctx) {
 
     if (compiled == NULL) {
         Py_DECREF(eval_locals);
+        tl_allow_suspension = prev_allow_suspension;
+        tl_current_context = prev_context;
         tl_current_local_env = prev_local_env;
         ctx->response_term = make_py_error(ctx->shared_env);
         ctx->response_ok = false;
@@ -2733,11 +2817,158 @@ static void owngil_execute_eval_with_env(py_context_t *ctx) {
     Py_DECREF(compiled);
     Py_DECREF(eval_locals);
 
-    tl_current_local_env = prev_local_env;
-
     if (py_result == NULL) {
-        ctx->response_term = make_py_error(ctx->shared_env);
-        ctx->response_ok = false;
+        /* Check for pending callback (suspension) */
+        if (tl_pending_callback) {
+            PyErr_Clear();
+            /* Create suspended state for callback handling */
+            suspended_context_state_t *suspended = create_suspended_context_state_for_eval(
+                ctx->shared_env, ctx, &code_bin, tuple_terms[1]);
+            if (suspended == NULL) {
+                tl_pending_callback = false;
+                Py_CLEAR(tl_pending_args);
+                ctx->response_term = enif_make_tuple2(ctx->shared_env,
+                    enif_make_atom(ctx->shared_env, "error"),
+                    enif_make_atom(ctx->shared_env, "create_suspended_state_failed"));
+                ctx->response_ok = false;
+            } else {
+                ctx->response_term = build_suspended_context_result(ctx->shared_env, suspended);
+                ctx->response_ok = true;  /* Suspended is a valid response */
+            }
+        } else {
+            ctx->response_term = make_py_error(ctx->shared_env);
+            ctx->response_ok = false;
+        }
+    } else if (is_inline_schedule_marker(py_result)) {
+        /* Inline schedule marker: execute continuation directly in worker thread.
+         * Loop until we get a final result or a suspension. */
+        int depth = 0;
+        while (is_inline_schedule_marker(py_result) && depth < MAX_INLINE_CONTINUATION_DEPTH) {
+            inline_continuation_t *cont = create_inline_continuation(ctx, penv, py_result, depth);
+            Py_DECREF(py_result);
+            py_result = NULL;
+
+            if (cont == NULL) {
+                ctx->response_term = enif_make_tuple2(ctx->shared_env,
+                    enif_make_atom(ctx->shared_env, "error"),
+                    enif_make_atom(ctx->shared_env, "create_continuation_failed"));
+                ctx->response_ok = false;
+                goto cleanup;
+            }
+
+            /* Execute the continuation function */
+            PyObject *func = NULL;
+            PyObject *module = NULL;
+
+            if (strcmp(cont->module_name, "__main__") == 0) {
+                /* Try captured globals first */
+                if (cont->globals != NULL) {
+                    func = PyDict_GetItemString(cont->globals, cont->func_name);
+                }
+                if (func == NULL && cont->locals != NULL) {
+                    func = PyDict_GetItemString(cont->locals, cont->func_name);
+                }
+                if (func == NULL && penv != NULL) {
+                    func = PyDict_GetItemString(penv->globals, cont->func_name);
+                }
+                if (func == NULL && ctx->globals != NULL) {
+                    func = PyDict_GetItemString(ctx->globals, cont->func_name);
+                }
+                if (func != NULL) {
+                    Py_INCREF(func);
+                } else {
+                    PyErr_Format(PyExc_NameError, "name '%s' is not defined", cont->func_name);
+                }
+            } else {
+                module = PyImport_ImportModule(cont->module_name);
+                if (module != NULL) {
+                    func = PyObject_GetAttrString(module, cont->func_name);
+                    Py_DECREF(module);
+                }
+            }
+
+            if (func == NULL) {
+                enif_release_resource(cont);
+                ctx->response_term = make_py_error(ctx->shared_env);
+                ctx->response_ok = false;
+                goto cleanup;
+            }
+
+            /* Build args and call */
+            PyObject *args = cont->args ? cont->args : PyTuple_New(0);
+            if (args == NULL) {
+                Py_DECREF(func);
+                enif_release_resource(cont);
+                ctx->response_term = make_py_error(ctx->shared_env);
+                ctx->response_ok = false;
+                goto cleanup;
+            }
+            if (cont->args) Py_INCREF(args);
+
+            py_result = PyObject_Call(func, args, cont->kwargs);
+            Py_DECREF(func);
+            Py_DECREF(args);
+            enif_release_resource(cont);
+            depth++;
+        }
+
+        if (depth >= MAX_INLINE_CONTINUATION_DEPTH) {
+            Py_XDECREF(py_result);
+            ctx->response_term = enif_make_tuple2(ctx->shared_env,
+                enif_make_atom(ctx->shared_env, "error"),
+                enif_make_atom(ctx->shared_env, "inline_continuation_depth_exceeded"));
+            ctx->response_ok = false;
+            goto cleanup;
+        }
+
+        /* Handle final result (or error/suspension from continuation) */
+        if (py_result == NULL) {
+            if (tl_pending_callback) {
+                PyErr_Clear();
+                suspended_context_state_t *suspended = create_suspended_context_state_for_eval(
+                    ctx->shared_env, ctx, &code_bin, tuple_terms[1]);
+                if (suspended == NULL) {
+                    tl_pending_callback = false;
+                    Py_CLEAR(tl_pending_args);
+                    ctx->response_term = enif_make_tuple2(ctx->shared_env,
+                        enif_make_atom(ctx->shared_env, "error"),
+                        enif_make_atom(ctx->shared_env, "create_suspended_state_failed"));
+                    ctx->response_ok = false;
+                } else {
+                    ctx->response_term = build_suspended_context_result(ctx->shared_env, suspended);
+                    ctx->response_ok = true;
+                }
+            } else {
+                ctx->response_term = make_py_error(ctx->shared_env);
+                ctx->response_ok = false;
+            }
+        } else if (is_schedule_marker(py_result)) {
+            ScheduleMarkerObject *marker = (ScheduleMarkerObject *)py_result;
+            ERL_NIF_TERM callback_name = py_to_term(ctx->shared_env, marker->callback_name);
+            ERL_NIF_TERM callback_args = py_to_term(ctx->shared_env, marker->args);
+            Py_DECREF(py_result);
+            ctx->response_term = enif_make_tuple3(ctx->shared_env,
+                enif_make_atom(ctx->shared_env, "schedule"),
+                callback_name, callback_args);
+            ctx->response_ok = true;
+        } else {
+            ERL_NIF_TERM term_result = py_to_term(ctx->shared_env, py_result);
+            Py_DECREF(py_result);
+            ctx->response_term = enif_make_tuple2(ctx->shared_env,
+                enif_make_atom(ctx->shared_env, "ok"), term_result);
+            ctx->response_ok = true;
+        }
+        goto cleanup;
+    } else if (is_schedule_marker(py_result)) {
+        /* Schedule marker: return {schedule, callback_name, args} */
+        ScheduleMarkerObject *marker = (ScheduleMarkerObject *)py_result;
+        ERL_NIF_TERM callback_name = py_to_term(ctx->shared_env, marker->callback_name);
+        ERL_NIF_TERM callback_args = py_to_term(ctx->shared_env, marker->args);
+        Py_DECREF(py_result);
+        ctx->response_term = enif_make_tuple3(ctx->shared_env,
+            enif_make_atom(ctx->shared_env, "schedule"),
+            callback_name, callback_args);
+        ctx->response_ok = true;
     } else {
         ERL_NIF_TERM term_result = py_to_term(ctx->shared_env, py_result);
         Py_DECREF(py_result);
@@ -2745,6 +2976,13 @@ static void owngil_execute_eval_with_env(py_context_t *ctx) {
             enif_make_atom(ctx->shared_env, "ok"), term_result);
         ctx->response_ok = true;
     }
+
+cleanup:
+    /* Restore thread-local state */
+    tl_allow_suspension = prev_allow_suspension;
+    tl_current_context = prev_context;
+    tl_current_local_env = prev_local_env;
+    clear_pending_callback_tls();
 }
 
 /**
@@ -3156,128 +3394,679 @@ static void owngil_execute_request(py_context_t *ctx) {
     }
 }
 
+/* ============================================================================
+ * Worker Thread Implementation (main interpreter, all Python versions)
+ *
+ * Worker mode uses a dedicated pthread that acquires the GIL for each request.
+ * This provides stable thread affinity for numpy/torch/tensorflow without
+ * requiring subinterpreter support.
+ * ============================================================================ */
+
 /**
- * @brief Main loop for OWN_GIL context thread
+ * @brief Main loop for worker context thread (main interpreter mode)
  *
- * This function runs in a dedicated pthread. It creates an OWN_GIL subinterpreter,
- * then enters a request loop where it processes requests from the dirty scheduler.
+ * Runs in a dedicated pthread; arg is the py_context_t to serve. Creates the
+ * namespace dicts under the GIL (setting init_error and returning on failure),
+ * sets worker_running, then loops on ctx_queue_dequeue, taking the GIL with
+ * PyGILState_Ensure per request. Exits on shutdown_requested, a NULL dequeue,
+ * or CTX_REQ_SHUTDOWN, releasing the dicts first. Always returns NULL. Uses
+ * the main interpreter, so the GIL is shared with other Python threads.
  */
-static void *owngil_context_thread_main(void *arg) {
+static void *worker_context_thread_main(void *arg) {
     py_context_t *ctx = (py_context_t *)arg;
 
-    /* Attach to Python runtime to create the subinterpreter.
-     * We need to hold the main GIL while creating the subinterpreter. */
+    /* Create namespace dictionaries on the worker thread under GIL */
     PyGILState_STATE gstate = PyGILState_Ensure();
 
-    /* Create OWN_GIL subinterpreter */
-    PyInterpreterConfig config = {
-        .use_main_obmalloc = 0,
-        .allow_fork = 0,
-        .allow_exec = 0,
-        .allow_threads = 1,
-        .allow_daemon_threads = 0,
-        .check_multi_interp_extensions = 1,
-        .gil = PyInterpreterConfig_OWN_GIL,
-    };
-
-    PyStatus status = Py_NewInterpreterFromConfig(&ctx->own_gil_tstate, &config);
-    if (PyStatus_IsError(status)) {
-        fprintf(stderr, "OWN_GIL: Py_NewInterpreterFromConfig failed: %s\n",
-                status.err_msg ? status.err_msg : "unknown error");
-        PyGILState_Release(gstate);
-        atomic_store(&ctx->init_error, true);
-        return NULL;
-    }
-
-    ctx->own_gil_interp = PyThreadState_GetInterpreter(ctx->own_gil_tstate);
-
-    /* After Py_NewInterpreterFromConfig, we are now in the new interpreter's
-     * thread state and hold its GIL. The main interpreter's gstate is no longer
-     * relevant for this thread. */
-
-    /* Register erlang module in this subinterpreter */
-    if (create_erlang_module() < 0) {
-        fprintf(stderr, "OWN_GIL: create_erlang_module failed\n");
-        PyErr_Print();
-        Py_EndInterpreter(ctx->own_gil_tstate);
-        atomic_store(&ctx->init_error, true);
-        return NULL;
-    }
-
-    /* Register py_event_loop module for reactor support */
-    if (create_py_event_loop_module() < 0) {
-        fprintf(stderr, "OWN_GIL: create_py_event_loop_module failed\n");
-        PyErr_Print();
-        Py_EndInterpreter(ctx->own_gil_tstate);
-        atomic_store(&ctx->init_error, true);
-        return NULL;
-    }
-
-    /* Create namespace dictionaries */
-    ctx->globals = PyDict_New();
-    ctx->locals = PyDict_New();
-    ctx->module_cache = PyDict_New();
+    /* Create namespace dictionaries if absent. NOTE(review): on partial failure the dicts already created are not released here - confirm cleanup elsewhere */
+    if (ctx->globals == NULL) {
+        ctx->globals = PyDict_New();
+        ctx->locals = PyDict_New();
+        ctx->module_cache = PyDict_New();
 
-    if (ctx->globals == NULL || ctx->locals == NULL || ctx->module_cache == NULL) {
-        fprintf(stderr, "OWN_GIL: PyDict_New failed for namespace dicts\n");
-        Py_XDECREF(ctx->globals);
-        Py_XDECREF(ctx->locals);
-        Py_XDECREF(ctx->module_cache);
-        Py_EndInterpreter(ctx->own_gil_tstate);
-        /* Don't call PyGILState_Release - interpreter is gone */
-        atomic_store(&ctx->init_error, true);
-        return NULL;
-    }
+        if (ctx->globals == NULL || ctx->locals == NULL || ctx->module_cache == NULL) {
+            PyGILState_Release(gstate);
+            atomic_store(&ctx->init_error, true);
+            atomic_store(&ctx->worker_running, false);
+            return NULL;
+        }
 
-    /* Import __builtins__ into globals */
-    PyObject *builtins = PyEval_GetBuiltins();
-    PyDict_SetItemString(ctx->globals, "__builtins__", builtins);
+        /* Import __builtins__ into globals */
+        PyObject *builtins = PyEval_GetBuiltins();
+        PyDict_SetItemString(ctx->globals, "__builtins__", builtins);
 
-    /* Import erlang module into globals */
-    PyObject *erlang_module = PyImport_ImportModule("erlang");
-    if (erlang_module != NULL) {
-        PyDict_SetItemString(ctx->globals, "erlang", erlang_module);
-        Py_DECREF(erlang_module);
-    } else {
-        /* Non-fatal - basic operations still work, but log for debugging */
-        log_and_clear_python_error("OWN_GIL erlang module import");
+        /* Import erlang module into globals */
+        PyObject *erlang_module = PyImport_ImportModule("erlang");
+        if (erlang_module != NULL) {
+            PyDict_SetItemString(ctx->globals, "erlang", erlang_module);
+            Py_DECREF(erlang_module);
+        } else {
+            log_and_clear_python_error("worker erlang module import");
+        }
     }
 
-    /* Release our OWN_GIL (we'll reacquire when processing requests) */
-    PyEval_SaveThread();
+    PyGILState_Release(gstate);
 
     /* Signal that we're ready */
-    atomic_store(&ctx->thread_running, true);
-
-    /* Main request loop */
-    pthread_mutex_lock(&ctx->request_mutex);
+    atomic_store(&ctx->worker_running, true);
 
+    /* Main request loop - uses queue instead of single-slot */
     while (!atomic_load(&ctx->shutdown_requested)) {
-        /* Wait for a request */
-        while (ctx->request_type == CTX_REQ_NONE &&
-               !atomic_load(&ctx->shutdown_requested)) {
-            pthread_cond_wait(&ctx->request_ready, &ctx->request_mutex);
+        /* Dequeue next request (blocks until available or shutdown) */
+        ctx_request_t *req = ctx_queue_dequeue(ctx);
+
+        if (req == NULL) {
+            /* Queue empty and shutdown requested */
+            break;
         }
 
-        if (atomic_load(&ctx->shutdown_requested)) {
+        if (req->type == CTX_REQ_SHUTDOWN) {
+            /* Shutdown sentinel - signal completion and exit */
+            pthread_mutex_lock(&req->mutex);
+            atomic_store(&req->completed, true);
+            pthread_cond_signal(&req->cond);
+            pthread_mutex_unlock(&req->mutex);
+            ctx_request_release(req);
             break;
         }
 
-        /* Release mutex while processing (allow concurrent dispatch attempts to queue) */
-        pthread_mutex_unlock(&ctx->request_mutex);
+        /* Check if request was cancelled while queued */
+        if (atomic_load(&req->cancelled)) {
+            /* Request cancelled - deliver error without processing */
+            if (req->async_mode) {
+                /* Async mode: send cancellation message */
+                enif_clear_env(ctx->msg_env);
+                ERL_NIF_TERM cancel_msg = enif_make_tuple3(ctx->msg_env,
+                    enif_make_atom(ctx->msg_env, "py_result"),
+                    enif_make_copy(ctx->msg_env, req->request_id),
+                    enif_make_tuple2(ctx->msg_env,
+                        enif_make_atom(ctx->msg_env, "error"),
+                        enif_make_atom(ctx->msg_env, "cancelled")));
+                enif_send(NULL, &req->caller_pid, ctx->msg_env, cancel_msg);
+            } else {
+                /* Blocking mode: signal condvar */
+                req->result_env = enif_alloc_env();
+                if (req->result_env) {
+                    req->result = enif_make_tuple2(req->result_env,
+                        enif_make_atom(req->result_env, "error"),
+                        enif_make_atom(req->result_env, "cancelled"));
+                }
+                req->success = false;
 
-        /* Acquire our GIL and process */
-        PyEval_RestoreThread(ctx->own_gil_tstate);
-        owngil_execute_request(ctx);
-        PyEval_SaveThread();
+                pthread_mutex_lock(&req->mutex);
+                atomic_store(&req->completed, true);
+                pthread_cond_signal(&req->cond);
+                pthread_mutex_unlock(&req->mutex);
+            }
 
-        /* Re-acquire mutex to signal completion and get next request */
-        pthread_mutex_lock(&ctx->request_mutex);
-        ctx->request_type = CTX_REQ_NONE;
-        pthread_cond_signal(&ctx->response_ready);
-    }
+            ctx_request_release(req);
+            continue;
+        }
 
-    pthread_mutex_unlock(&ctx->request_mutex);
+        /* Populate legacy compatibility fields from request */
+        ctx->shared_env = req->request_env;
+        ctx->request_type = req->type;
+        ctx->request_term = req->request_data;
+        ctx->reactor_buffer_ptr = req->reactor_buffer_ptr;
+        ctx->local_env_ptr = req->local_env_ptr;
+        ctx->response_ok = false;
+        ctx->response_term = 0;
+
+        /* Acquire GIL and process the request */
+        gstate = PyGILState_Ensure();
+        owngil_execute_request(ctx);  /* Reuse execute functions */
+        PyGILState_Release(gstate);
+
+        /* Copy response into a fresh env stored on the request; result stays unset if enif_alloc_env fails */
+        req->result_env = enif_alloc_env();
+        if (req->result_env && ctx->response_term != 0) {
+            req->result = enif_make_copy(req->result_env, ctx->response_term);
+        } else if (req->result_env) {
+            req->result = enif_make_tuple2(req->result_env,
+                enif_make_atom(req->result_env, "error"),
+                enif_make_atom(req->result_env, "no_response"));
+        }
+        req->success = ctx->response_ok;
+
+        /* Clear legacy fields */
+        ctx->shared_env = NULL;
+        ctx->request_type = CTX_REQ_NONE;
+        ctx->request_term = 0;
+        ctx->reactor_buffer_ptr = NULL;
+        ctx->local_env_ptr = NULL;
+
+        /* Deliver result - async or blocking */
+        if (req->async_mode) {
+            /* Async mode: send {py_result, RequestId, Result} to caller_pid; req->completed is not set here */
+            enif_clear_env(ctx->msg_env);
+            ERL_NIF_TERM result_msg = enif_make_tuple3(ctx->msg_env,
+                enif_make_atom(ctx->msg_env, "py_result"),
+                enif_make_copy(ctx->msg_env, req->request_id),
+                req->result_env ? enif_make_copy(ctx->msg_env, req->result)
+                    : enif_make_tuple2(ctx->msg_env,
+                        enif_make_atom(ctx->msg_env, "error"),
+                        enif_make_atom(ctx->msg_env, "no_result")));
+            enif_send(NULL, &req->caller_pid, ctx->msg_env, result_msg);
+        } else {
+            /* Blocking mode: signal condvar */
+            pthread_mutex_lock(&req->mutex);
+            atomic_store(&req->completed, true);
+            pthread_cond_signal(&req->cond);
+            pthread_mutex_unlock(&req->mutex);
+        }
+
+        /* Release queue's reference to request */
+        ctx_request_release(req);
+    }
+
+    /* Cleanup: release namespace dictionaries under GIL */
+    gstate = PyGILState_Ensure();
+    Py_XDECREF(ctx->module_cache);
+    Py_XDECREF(ctx->globals);
+    Py_XDECREF(ctx->locals);
+    ctx->globals = NULL;
+    ctx->locals = NULL;
+    ctx->module_cache = NULL;
+    PyGILState_Release(gstate);
+
+    atomic_store(&ctx->worker_running, false);
+    return NULL;
+}
+
+/**
+ * @brief Initialize worker thread mode for a context
+ *
+ * Sets up the request queue and its synchronization, starts the worker thread
+ * and polls (1ms x 2000) for it to report ready; failures undo all of it.
+ *
+ * @param ctx Context to initialize
+ * @return 0 on success, -1 on failure
+ */
+static int worker_context_init(py_context_t *ctx) {
+    ctx->uses_worker_thread = true;
+
+    /* Reset worker state; init_error must be clear before the wait loop */
+    atomic_store(&ctx->worker_running, false);
+    atomic_store(&ctx->shutdown_requested, false);
+    atomic_store(&ctx->leaked, false);
+    atomic_store(&ctx->init_error, false);
+
+    /* Empty request queue */
+    ctx->queue_head = NULL;
+    ctx->queue_tail = NULL;
+
+    /* Legacy compatibility fields */
+    ctx->shared_env = NULL;
+    ctx->request_type = CTX_REQ_NONE;
+    ctx->request_term = 0;
+    ctx->response_term = 0;
+    ctx->response_ok = false;
+    ctx->local_env_ptr = NULL;
+    ctx->reactor_buffer_ptr = NULL;
+
+    /* Globals/locals will be created by the worker thread */
+    ctx->globals = NULL;
+    ctx->locals = NULL;
+    ctx->module_cache = NULL;
+
+    if (pthread_mutex_init(&ctx->queue_mutex, NULL) != 0) {
+        return -1;
+    }
+    if (pthread_cond_init(&ctx->queue_not_empty, NULL) != 0) {
+        pthread_mutex_destroy(&ctx->queue_mutex);
+        return -1;
+    }
+
+    /* Message environment for async responses */
+    ctx->msg_env = enif_alloc_env();
+    if (ctx->msg_env == NULL) {
+        goto fail_sync;
+    }
+    if (pthread_create(&ctx->worker_thread, NULL, worker_context_thread_main, ctx) != 0) {
+        goto fail_env;
+    }
+
+    /* Wait for thread to initialize or fail */
+    for (int wait_count = 0; wait_count < 2000; wait_count++) {
+        if (atomic_load(&ctx->worker_running) || atomic_load(&ctx->init_error)) {
+            break;
+        }
+        usleep(1000);  /* 1ms */
+    }
+    if (!atomic_load(&ctx->init_error) && atomic_load(&ctx->worker_running)) {
+        return 0;
+    }
+
+    /* Thread failed or timed out. It may still be initializing or already be
+     * waiting on the queue, so request shutdown and wake it before joining;
+     * a bare pthread_join() here could otherwise block forever. */
+    atomic_store(&ctx->shutdown_requested, true);
+    pthread_mutex_lock(&ctx->queue_mutex);
+    pthread_cond_broadcast(&ctx->queue_not_empty);
+    pthread_mutex_unlock(&ctx->queue_mutex);
+    pthread_join(ctx->worker_thread, NULL);
+
+fail_env:
+    if (ctx->msg_env != NULL) {
+        enif_free_env(ctx->msg_env);
+        ctx->msg_env = NULL;
+    }
+fail_sync:
+    pthread_cond_destroy(&ctx->queue_not_empty);
+    pthread_mutex_destroy(&ctx->queue_mutex);
+    return -1;
+}
+
+/**
+ * @brief Shutdown worker thread mode and clean up resources
+ *
+ * Uses the join-or-leak pattern: if the worker thread doesn't respond
+ * within the timeout, we mark the context as leaked and do NOT free
+ * shared resources to avoid use-after-free.
+ *
+ * @param ctx Context to shutdown
+ */
+#define WORKER_SHUTDOWN_TIMEOUT_SECS 30
+
+static void worker_context_shutdown(py_context_t *ctx) {
+    if (!ctx->uses_worker_thread) {
+        return;
+    }
+
+    /* Signal shutdown, then cancel all pending (not-yet-started) requests */
+    atomic_store(&ctx->shutdown_requested, true);
+    ctx_queue_cancel_all(ctx);
+
+    /* Enqueue shutdown request to wake worker if idle */
+    ctx_request_t *shutdown_req = ctx_request_create();
+    if (shutdown_req != NULL) {
+        shutdown_req->type = CTX_REQ_SHUTDOWN;
+        ctx_queue_enqueue(ctx, shutdown_req);
+    } else {
+        /* No sentinel could be allocated: wake the worker through the queue
+         * condvar instead, so it does not sleep through the whole timeout. */
+        pthread_mutex_lock(&ctx->queue_mutex);
+        pthread_cond_broadcast(&ctx->queue_not_empty);
+        pthread_mutex_unlock(&ctx->queue_mutex);
+    }
+
+    /* Wait for thread to exit with timeout */
+    bool join_succeeded = false;
+#if defined(__linux__)
+    struct timespec deadline;
+    clock_gettime(CLOCK_REALTIME, &deadline);
+    deadline.tv_sec += WORKER_SHUTDOWN_TIMEOUT_SECS;
+    join_succeeded = (pthread_timedjoin_np(ctx->worker_thread, NULL, &deadline) == 0);
+#else
+    /* macOS/other: poll worker_running flag with timeout */
+    int wait_ms = 0;
+    while (atomic_load(&ctx->worker_running) &&
+           wait_ms < WORKER_SHUTDOWN_TIMEOUT_SECS * 1000) {
+        usleep(100000);  /* 100ms */
+        wait_ms += 100;
+    }
+    if (!atomic_load(&ctx->worker_running)) {
+        pthread_join(ctx->worker_thread, NULL);
+        join_succeeded = true;
+    }
+#endif
+
+    if (!join_succeeded) {
+        /* Worker thread is unresponsive - use leak pattern */
+        fprintf(stderr, "Worker thread shutdown timeout after %d seconds, leaking context\n",
+                WORKER_SHUTDOWN_TIMEOUT_SECS);
+        atomic_store(&ctx->leaked, true);
+        return;
+    }
+
+    /* Clean shutdown succeeded - safe to free resources */
+    if (ctx->msg_env != NULL) {
+        enif_free_env(ctx->msg_env);
+        ctx->msg_env = NULL;
+    }
+    pthread_cond_destroy(&ctx->queue_not_empty);
+    pthread_mutex_destroy(&ctx->queue_mutex);
+    ctx->uses_worker_thread = false;
+}
+
+/**
+ * @brief Timeout, in seconds, for a blocking dispatch to the worker thread
+ *
+ * Blocking dispatch uses the queue-based pattern: it creates a request,
+ * enqueues it, waits for completion, and copies the result back to the
+ * caller's environment. A caller that waits longer than this timeout marks
+ * its request cancelled and returns a worker_timeout error instead.
+ *
+ * Parameters and return value of the dispatch function itself are
+ * documented on dispatch_to_worker_thread_impl() below; this comment
+ * describes the constant it is attached to.
+ */
+#define WORKER_DISPATCH_TIMEOUT_SECS 30
+
+/**
+ * @brief Dispatch a request to the worker thread with optional local environment
+ *
+ * Enqueues the request and blocks until the worker marks it completed or the
+ * wait fails (WORKER_DISPATCH_TIMEOUT_SECS deadline or a condvar error).
+ *
+ * @param env NIF environment
+ * @param ctx Context to dispatch to
+ * @param req_type Request type
+ * @param request_data Request data term
+ * @param local_env Optional local environment (NULL for default)
+ * @return Result term copied into env, or an error term
+ */
+static ERL_NIF_TERM dispatch_to_worker_thread_impl(
+    ErlNifEnv *env,
+    py_context_t *ctx,
+    ctx_request_type_t req_type,
+    ERL_NIF_TERM request_data,
+    void *local_env
+) {
+    if (!atomic_load(&ctx->worker_running)) {
+        return make_error(env, "thread_not_running");
+    }
+    if (atomic_load(&ctx->destroyed)) {
+        return make_error(env, "context_destroyed");
+    }
+
+    ctx_request_t *req = ctx_request_create();
+    if (req == NULL) {
+        return make_error(env, "alloc_failed");
+    }
+
+    /* Populate request (the data term is copied into the request's env) */
+    req->type = req_type;
+    req->request_data = enif_make_copy(req->request_env, request_data);
+    req->local_env_ptr = local_env;
+
+    /* Add extra reference for queue (caller holds 1, queue holds 1) */
+    ctx_request_addref(req);
+    ctx_queue_enqueue(ctx, req);
+
+    /* Wait for completion with timeout */
+    struct timespec deadline;
+    clock_gettime(CLOCK_REALTIME, &deadline);
+    deadline.tv_sec += WORKER_DISPATCH_TIMEOUT_SECS;
+    /* Stop on ANY wait error, not just ETIMEDOUT: a persistent failure such
+     * as EINVAL would otherwise make this loop spin forever. */
+    int rc = 0;
+    pthread_mutex_lock(&req->mutex);
+    while (rc == 0 && !atomic_load(&req->completed)) {
+        rc = pthread_cond_timedwait(&req->cond, &req->mutex, &deadline);
+    }
+    if (!atomic_load(&req->completed)) {
+        /* Gave up waiting - mark as cancelled and return error */
+        atomic_store(&req->cancelled, true);
+        pthread_mutex_unlock(&req->mutex);
+        ctx_request_release(req);
+        return make_error(env, rc == ETIMEDOUT ? "worker_timeout" : "wait_failed");
+    }
+    pthread_mutex_unlock(&req->mutex);
+
+    /* Copy result to caller's environment */
+    ERL_NIF_TERM result;
+    if (req->result_env != NULL) {
+        result = enif_make_copy(env, req->result);
+    } else {
+        result = make_error(env, "no_result");
+    }
+
+    /* Release caller's reference */
+    ctx_request_release(req);
+    return result;
+}
+
+/**
+ * @brief Blocking dispatch without a local environment
+ *
+ * Forwards to dispatch_to_worker_thread_impl() with a NULL local_env.
+ */
+static ERL_NIF_TERM dispatch_to_worker_thread(ErlNifEnv *env, py_context_t *ctx,
+                                              ctx_request_type_t req_type,
+                                              ERL_NIF_TERM request_data) {
+    void *no_local_env = NULL;
+    return dispatch_to_worker_thread_impl(env, ctx, req_type, request_data, no_local_env);
+}
+
+/**
+ * @brief Async dispatch to worker thread (non-blocking)
+ *
+ * Queues the request and returns right away. When the worker is done it
+ * sends a {py_result, RequestId, Result} message to the caller.
+ *
+ * @param env NIF environment
+ * @param ctx Context
+ * @param req_type Request type
+ * @param request_data Request data term
+ * @param caller_pid Caller's PID for result delivery
+ * @param request_id Request ID for correlation
+ * @param local_env Optional local environment (NULL for default)
+ * @return {enqueued, RequestId} on success, {error, Reason} on failure
+ */
+static ERL_NIF_TERM dispatch_to_worker_thread_async(
+    ErlNifEnv *env,
+    py_context_t *ctx,
+    ctx_request_type_t req_type,
+    ERL_NIF_TERM request_data,
+    ErlNifPid caller_pid,
+    ERL_NIF_TERM request_id,
+    void *local_env
+) {
+    /* Reject dispatch when the worker is not running or the context is destroyed */
+    bool running = atomic_load(&ctx->worker_running);
+    if (!running) {
+        return make_error(env, "thread_not_running");
+    }
+    bool destroyed = atomic_load(&ctx->destroyed);
+    if (destroyed) {
+        return make_error(env, "context_destroyed");
+    }
+
+    ctx_request_t *async_req = ctx_request_create();
+    if (async_req == NULL) {
+        return make_error(env, "alloc_failed");
+    }
+
+    /* Copy both terms into the request's own environment */
+    ErlNifEnv *req_env = async_req->request_env;
+    async_req->request_data = enif_make_copy(req_env, request_data);
+    async_req->request_id = enif_make_copy(req_env, request_id);
+
+    /* Remaining fields; async_mode makes the worker reply by message */
+    async_req->type = req_type;
+    async_req->local_env_ptr = local_env;
+    async_req->caller_pid = caller_pid;
+    async_req->async_mode = true;
+
+    /* Hand over to the queue (queue owns one reference, none kept here) */
+    ctx_queue_enqueue(ctx, async_req);
+    /* Return immediately - no blocking! */
+    ERL_NIF_TERM enqueued_tag = enif_make_atom(env, "enqueued");
+    return enif_make_tuple2(env, enqueued_tag, request_id);
+}
+
+#ifdef HAVE_SUBINTERPRETERS
+/**
+ * @brief Main loop for OWN_GIL context thread
+ *
+ * This function runs in a dedicated pthread. It creates an OWN_GIL subinterpreter,
+ * then enters a request loop where it processes requests from the request queue.
+ *
+ * The queue-based pattern replaces the old single-slot pattern which had race
+ * conditions when multiple callers dispatched concurrently.
+ */
+static void *owngil_context_thread_main(void *arg) {
+    py_context_t *ctx = (py_context_t *)arg;
+
+    /* Attach to Python runtime to create the subinterpreter.
+     * We need to hold the main GIL while creating the subinterpreter. */
+    PyGILState_STATE gstate = PyGILState_Ensure();
+
+    /* Create OWN_GIL subinterpreter */
+    PyInterpreterConfig config = {
+        .use_main_obmalloc = 0,
+        .allow_fork = 0,
+        .allow_exec = 0,
+        .allow_threads = 1,
+        .allow_daemon_threads = 0,
+        .check_multi_interp_extensions = 1,
+        .gil = PyInterpreterConfig_OWN_GIL,
+    };
+
+    PyStatus status = Py_NewInterpreterFromConfig(&ctx->own_gil_tstate, &config);
+    if (PyStatus_IsError(status)) {
+        fprintf(stderr, "OWN_GIL: Py_NewInterpreterFromConfig failed: %s\n",
+                status.err_msg ? status.err_msg : "unknown error");
+        PyGILState_Release(gstate);
+        atomic_store(&ctx->init_error, true);
+        atomic_store(&ctx->worker_running, false);
+        return NULL;
+    }
+
+    ctx->own_gil_interp = PyThreadState_GetInterpreter(ctx->own_gil_tstate);
+
+    /* After Py_NewInterpreterFromConfig, we are now in the new interpreter's
+     * thread state and hold its GIL. The main interpreter's gstate is no longer
+     * relevant for this thread. */
+
+    /* Register erlang module in this subinterpreter */
+    if (create_erlang_module() < 0) {
+        fprintf(stderr, "OWN_GIL: create_erlang_module failed\n");
+        PyErr_Print();
+        Py_EndInterpreter(ctx->own_gil_tstate);
+        atomic_store(&ctx->init_error, true);
+        atomic_store(&ctx->worker_running, false);
+        return NULL;
+    }
+
+    /* Register py_event_loop module for reactor support */
+    if (create_py_event_loop_module() < 0) {
+        fprintf(stderr, "OWN_GIL: create_py_event_loop_module failed\n");
+        PyErr_Print();
+        Py_EndInterpreter(ctx->own_gil_tstate);
+        atomic_store(&ctx->init_error, true);
+        atomic_store(&ctx->worker_running, false);
+        return NULL;
+    }
+
+    /* Create namespace dictionaries */
+    ctx->globals = PyDict_New();
+    ctx->locals = PyDict_New();
+    ctx->module_cache = PyDict_New();
+
+    if (ctx->globals == NULL || ctx->locals == NULL || ctx->module_cache == NULL) {
+        fprintf(stderr, "OWN_GIL: PyDict_New failed for namespace dicts\n");
+        Py_XDECREF(ctx->globals);
+        Py_XDECREF(ctx->locals);
+        Py_XDECREF(ctx->module_cache);
+        Py_EndInterpreter(ctx->own_gil_tstate);
+        atomic_store(&ctx->init_error, true);
+        atomic_store(&ctx->worker_running, false);
+        return NULL;
+    }
+
+    /* Import __builtins__ into globals */
+    PyObject *builtins = PyEval_GetBuiltins();
+    PyDict_SetItemString(ctx->globals, "__builtins__", builtins);
+
+    /* Import erlang module into globals */
+    PyObject *erlang_module = PyImport_ImportModule("erlang");
+    if (erlang_module != NULL) {
+        PyDict_SetItemString(ctx->globals, "erlang", erlang_module);
+        Py_DECREF(erlang_module);
+    } else {
+        /* Non-fatal - basic operations still work, but log for debugging */
+        log_and_clear_python_error("OWN_GIL erlang module import");
+    }
+
+    /* Release our OWN_GIL (we'll reacquire when processing requests) */
+    PyEval_SaveThread();
+
+    /* Signal that we're ready */
+    atomic_store(&ctx->worker_running, true);
+
+    /* Main request loop - uses queue instead of single-slot */
+    while (!atomic_load(&ctx->shutdown_requested)) {
+        /* Dequeue next request (blocks until available or shutdown) */
+        ctx_request_t *req = ctx_queue_dequeue(ctx);
+
+        if (req == NULL) {
+            /* Queue empty and shutdown requested */
+            break;
+        }
+
+        if (req->type == CTX_REQ_SHUTDOWN) {
+            /* Shutdown sentinel - signal completion and exit */
+            pthread_mutex_lock(&req->mutex);
+            atomic_store(&req->completed, true);
+            pthread_cond_signal(&req->cond);
+            pthread_mutex_unlock(&req->mutex);
+            ctx_request_release(req);
+            break;
+        }
+
+        /* Check if request was cancelled while queued */
+        if (atomic_load(&req->cancelled)) {
+            /* Request cancelled - signal completion without processing */
+            req->result_env = enif_alloc_env();
+            if (req->result_env) {
+                req->result = enif_make_tuple2(req->result_env,
+                    enif_make_atom(req->result_env, "error"),
+                    enif_make_atom(req->result_env, "cancelled"));
+            }
+            req->success = false;
+
+            pthread_mutex_lock(&req->mutex);
+            atomic_store(&req->completed, true);
+            pthread_cond_signal(&req->cond);
+            pthread_mutex_unlock(&req->mutex);
+
+            ctx_request_release(req);
+            continue;
+        }
+
+        /* Populate legacy compatibility fields from request */
+        ctx->shared_env = req->request_env;
+        ctx->request_type = req->type;
+        ctx->request_term = req->request_data;
+        ctx->reactor_buffer_ptr = req->reactor_buffer_ptr;
+        ctx->local_env_ptr = req->local_env_ptr;
+        ctx->response_ok = false;
+        ctx->response_term = 0;
+
+        /* Acquire our GIL and process the request */
+        PyEval_RestoreThread(ctx->own_gil_tstate);
+        owngil_execute_request(ctx);
+        PyEval_SaveThread();
+
+        /* Copy response to request struct */
+        req->result_env = enif_alloc_env();
+        if (req->result_env && ctx->response_term != 0) {
+            req->result = enif_make_copy(req->result_env, ctx->response_term);
+        } else if (req->result_env) {
+            req->result = enif_make_tuple2(req->result_env,
+                enif_make_atom(req->result_env, "error"),
+                enif_make_atom(req->result_env, "no_response"));
+        }
+        req->success = ctx->response_ok;
+
+        /* Clear legacy fields */
+        ctx->shared_env = NULL;
+        ctx->request_type = CTX_REQ_NONE;
+        ctx->request_term = 0;
+        ctx->reactor_buffer_ptr = NULL;
+        ctx->local_env_ptr = NULL;
+
+        /* Signal completion */
+        pthread_mutex_lock(&req->mutex);
+        atomic_store(&req->completed, true);
+        pthread_cond_signal(&req->cond);
+        pthread_mutex_unlock(&req->mutex);
+
+        /* Release queue's reference to request */
+        ctx_request_release(req);
+    }
 
     /* Cleanup: acquire our OWN_GIL and destroy interpreter */
     PyEval_RestoreThread(ctx->own_gil_tstate);
@@ -3297,7 +4086,7 @@ static void *owngil_context_thread_main(void *arg) {
      * After Py_NewInterpreterFromConfig switched us to the OWN_GIL interpreter,
      * the original gstate is no longer valid. Py_EndInterpreter handles cleanup. */
 
-    atomic_store(&ctx->thread_running, false);
+    atomic_store(&ctx->worker_running, false);
     return NULL;
 }
 
@@ -3308,17 +4097,17 @@ static void *owngil_context_thread_main(void *arg) {
 #define OWNGIL_DISPATCH_TIMEOUT_SECS 30
 
 /**
- * @brief Dispatch a request to the OWN_GIL thread and wait for response
+ * @brief Dispatch a request to the worker thread and wait for response
  *
- * Called from dirty schedulers. Copies the request term to the shared env,
- * signals the worker thread, and waits for the response.
+ * Uses the queue-based pattern: creates a request, enqueues it, waits for
+ * completion, and copies the result back to the caller's environment.
  *
- * Uses pthread_cond_timedwait to prevent indefinite blocking if the worker
- * thread dies or becomes unresponsive.
+ * This replaces the old single-slot pattern which had race conditions when
+ * multiple callers dispatched concurrently.
  *
  * @param env Caller's NIF environment
- * @param ctx Context with OWN_GIL
- * @param req_type Request type (CTX_REQ_CALL, CTX_REQ_EVAL, CTX_REQ_EXEC)
+ * @param ctx Context with worker thread
+ * @param req_type Request type (CTX_REQ_CALL, CTX_REQ_EVAL, CTX_REQ_EXEC, etc.)
  * @param request_data Request data term
  * @return Result term copied back to caller's env
  */
@@ -3328,41 +4117,66 @@ static ERL_NIF_TERM dispatch_to_owngil_thread(
     ctx_request_type_t req_type,
     ERL_NIF_TERM request_data
 ) {
-    if (!atomic_load(&ctx->thread_running)) {
+    if (!atomic_load(&ctx->worker_running)) {
         return make_error(env, "thread_not_running");
     }
 
-    pthread_mutex_lock(&ctx->request_mutex);
+    if (atomic_load(&ctx->destroyed)) {
+        return make_error(env, "context_destroyed");
+    }
+
+    /* Create request struct */
+    ctx_request_t *req = ctx_request_create();
+    if (req == NULL) {
+        return make_error(env, "alloc_failed");
+    }
+
+    /* Populate request */
+    req->type = req_type;
+    req->request_data = enif_make_copy(req->request_env, request_data);
 
-    /* Copy request to shared env (zero serialization overhead) */
-    enif_clear_env(ctx->shared_env);
-    ctx->request_term = enif_make_copy(ctx->shared_env, request_data);
-    ctx->request_type = req_type;
+    /* Add ref for queue (now refcount = 2: caller + queue) */
+    ctx_request_addref(req);
 
-    /* Signal the worker thread */
-    pthread_cond_signal(&ctx->request_ready);
+    /* Enqueue the request */
+    ctx_queue_enqueue(ctx, req);
 
-    /* Wait for response with timeout to prevent deadlock on worker death */
+    /* Wait for completion with timeout */
     struct timespec deadline;
     clock_gettime(CLOCK_REALTIME, &deadline);
     deadline.tv_sec += OWNGIL_DISPATCH_TIMEOUT_SECS;
 
-    while (ctx->request_type != CTX_REQ_NONE) {
-        int rc = pthread_cond_timedwait(&ctx->response_ready, &ctx->request_mutex, &deadline);
+    ERL_NIF_TERM result;
+    pthread_mutex_lock(&req->mutex);
+
+    while (!atomic_load(&req->completed)) {
+        int rc = pthread_cond_timedwait(&req->cond, &req->mutex, &deadline);
         if (rc == ETIMEDOUT) {
-            /* Worker thread is unresponsive - mark it as not running */
-            atomic_store(&ctx->thread_running, false);
-            pthread_mutex_unlock(&ctx->request_mutex);
-            fprintf(stderr, "OWN_GIL dispatch timeout: worker thread unresponsive after %d seconds\n",
+            /* Worker thread is unresponsive - mark request as cancelled */
+            atomic_store(&req->cancelled, true);
+            pthread_mutex_unlock(&req->mutex);
+
+            /* Don't mark worker as dead - it might still be processing
+             * a long-running Python operation. Just fail this request. */
+            fprintf(stderr, "OWN_GIL dispatch timeout after %d seconds\n",
                     OWNGIL_DISPATCH_TIMEOUT_SECS);
+
+            ctx_request_release(req);  /* Release caller's ref */
             return make_error(env, "worker_timeout");
         }
     }
 
-    /* Copy response back to caller's env */
-    ERL_NIF_TERM result = enif_make_copy(env, ctx->response_term);
+    pthread_mutex_unlock(&req->mutex);
 
-    pthread_mutex_unlock(&ctx->request_mutex);
+    /* Copy result to caller's env */
+    if (req->result_env != NULL) {
+        result = enif_make_copy(env, req->result);
+    } else {
+        result = make_error(env, "no_result");
+    }
+
+    /* Release caller's ref */
+    ctx_request_release(req);
 
     return result;
 }
@@ -3370,52 +4184,74 @@ static ERL_NIF_TERM dispatch_to_owngil_thread(
 /**
  * @brief Dispatch reactor on_read_ready to OWN_GIL thread
  *
- * Similar to dispatch_to_owngil_thread but also passes buffer pointer.
- * Uses timeout to prevent deadlock if worker thread dies.
+ * Uses queue-based dispatch with per-request synchronization.
  */
 ERL_NIF_TERM dispatch_reactor_read_to_owngil(ErlNifEnv *env, py_context_t *ctx,
                                               int fd, void *buffer_ptr) {
-    if (!atomic_load(&ctx->thread_running)) {
+    if (!atomic_load(&ctx->worker_running)) {
         enif_release_resource(buffer_ptr);
         return make_error(env, "thread_not_running");
     }
 
-    pthread_mutex_lock(&ctx->request_mutex);
+    if (atomic_load(&ctx->destroyed)) {
+        enif_release_resource(buffer_ptr);
+        return make_error(env, "context_destroyed");
+    }
+
+    /* Create request struct */
+    ctx_request_t *req = ctx_request_create();
+    if (req == NULL) {
+        enif_release_resource(buffer_ptr);
+        return make_error(env, "alloc_failed");
+    }
 
-    /* Clear and set up request */
-    enif_clear_env(ctx->shared_env);
-    ctx->request_term = enif_make_int(ctx->shared_env, fd);
-    ctx->reactor_buffer_ptr = buffer_ptr;  /* Transfer ownership */
-    ctx->request_type = CTX_REQ_REACTOR_ON_READ_READY;
+    /* Populate request */
+    req->type = CTX_REQ_REACTOR_ON_READ_READY;
+    req->request_data = enif_make_int(req->request_env, fd);
+    req->reactor_buffer_ptr = buffer_ptr;  /* Transfer ownership */
+    req->reactor_fd = fd;
 
-    /* Signal the worker thread */
-    pthread_cond_signal(&ctx->request_ready);
+    /* Add ref for queue (now refcount = 2: caller + queue) */
+    ctx_request_addref(req);
 
-    /* Wait for response with timeout to prevent deadlock */
+    /* Enqueue the request */
+    ctx_queue_enqueue(ctx, req);
+
+    /* Wait for completion with timeout */
     struct timespec deadline;
     clock_gettime(CLOCK_REALTIME, &deadline);
     deadline.tv_sec += OWNGIL_DISPATCH_TIMEOUT_SECS;
 
-    while (ctx->request_type != CTX_REQ_NONE) {
-        int rc = pthread_cond_timedwait(&ctx->response_ready, &ctx->request_mutex, &deadline);
+    ERL_NIF_TERM result;
+    pthread_mutex_lock(&req->mutex);
+
+    while (!atomic_load(&req->completed)) {
+        int rc = pthread_cond_timedwait(&req->cond, &req->mutex, &deadline);
         if (rc == ETIMEDOUT) {
-            /* Worker thread is unresponsive - clean up buffer and mark dead */
-            atomic_store(&ctx->thread_running, false);
-            /* Buffer ownership was transferred but never processed - release it */
-            if (ctx->reactor_buffer_ptr) {
-                enif_release_resource(ctx->reactor_buffer_ptr);
-                ctx->reactor_buffer_ptr = NULL;
-            }
-            pthread_mutex_unlock(&ctx->request_mutex);
-            fprintf(stderr, "OWN_GIL reactor dispatch timeout: worker thread unresponsive\n");
+            /* Request timeout - mark as cancelled; the buffer is not released here.
+             * NOTE(review): confirm the worker's cancelled path releases it. */
+            atomic_store(&req->cancelled, true);
+            pthread_mutex_unlock(&req->mutex);
+
+            fprintf(stderr, "OWN_GIL reactor dispatch timeout after %d seconds\n",
+                    OWNGIL_DISPATCH_TIMEOUT_SECS);
+
+            ctx_request_release(req);  /* Release caller's ref */
             return make_error(env, "worker_timeout");
         }
     }
 
-    /* Copy response back to caller's env */
-    ERL_NIF_TERM result = enif_make_copy(env, ctx->response_term);
+    pthread_mutex_unlock(&req->mutex);
+
+    /* Copy result to caller's env */
+    if (req->result_env != NULL) {
+        result = enif_make_copy(env, req->result);
+    } else {
+        result = make_error(env, "no_result");
+    }
 
-    pthread_mutex_unlock(&ctx->request_mutex);
+    /* Release caller's ref */
+    ctx_request_release(req);
 
     return result;
 }
@@ -3423,43 +4259,67 @@ ERL_NIF_TERM dispatch_reactor_read_to_owngil(ErlNifEnv *env, py_context_t *ctx,
 /**
  * @brief Dispatch reactor on_write_ready to OWN_GIL thread
  *
- * Uses timeout to prevent deadlock if worker thread dies.
+ * Uses queue-based dispatch with per-request synchronization.
  */
 ERL_NIF_TERM dispatch_reactor_write_to_owngil(ErlNifEnv *env, py_context_t *ctx,
                                                int fd) {
-    if (!atomic_load(&ctx->thread_running)) {
+    if (!atomic_load(&ctx->worker_running)) {
         return make_error(env, "thread_not_running");
     }
 
-    pthread_mutex_lock(&ctx->request_mutex);
+    if (atomic_load(&ctx->destroyed)) {
+        return make_error(env, "context_destroyed");
+    }
+
+    /* Create request struct */
+    ctx_request_t *req = ctx_request_create();
+    if (req == NULL) {
+        return make_error(env, "alloc_failed");
+    }
 
-    /* Clear and set up request */
-    enif_clear_env(ctx->shared_env);
-    ctx->request_term = enif_make_int(ctx->shared_env, fd);
-    ctx->request_type = CTX_REQ_REACTOR_ON_WRITE_READY;
+    /* Populate request */
+    req->type = CTX_REQ_REACTOR_ON_WRITE_READY;
+    req->request_data = enif_make_int(req->request_env, fd);
+    req->reactor_fd = fd;
 
-    /* Signal the worker thread */
-    pthread_cond_signal(&ctx->request_ready);
+    /* Add ref for queue (now refcount = 2: caller + queue) */
+    ctx_request_addref(req);
 
-    /* Wait for response with timeout to prevent deadlock */
+    /* Enqueue the request */
+    ctx_queue_enqueue(ctx, req);
+
+    /* Wait for completion with timeout */
     struct timespec deadline;
     clock_gettime(CLOCK_REALTIME, &deadline);
     deadline.tv_sec += OWNGIL_DISPATCH_TIMEOUT_SECS;
 
-    while (ctx->request_type != CTX_REQ_NONE) {
-        int rc = pthread_cond_timedwait(&ctx->response_ready, &ctx->request_mutex, &deadline);
+    ERL_NIF_TERM result;
+    pthread_mutex_lock(&req->mutex);
+
+    while (!atomic_load(&req->completed)) {
+        int rc = pthread_cond_timedwait(&req->cond, &req->mutex, &deadline);
         if (rc == ETIMEDOUT) {
-            atomic_store(&ctx->thread_running, false);
-            pthread_mutex_unlock(&ctx->request_mutex);
-            fprintf(stderr, "OWN_GIL reactor write dispatch timeout: worker thread unresponsive\n");
+            atomic_store(&req->cancelled, true);
+            pthread_mutex_unlock(&req->mutex);
+
+            fprintf(stderr, "OWN_GIL reactor write dispatch timeout after %d seconds\n",
+                    OWNGIL_DISPATCH_TIMEOUT_SECS);
+
+            ctx_request_release(req);
             return make_error(env, "worker_timeout");
         }
     }
 
-    /* Copy response back to caller's env */
-    ERL_NIF_TERM result = enif_make_copy(env, ctx->response_term);
+    pthread_mutex_unlock(&req->mutex);
 
-    pthread_mutex_unlock(&ctx->request_mutex);
+    /* Copy result to caller's env */
+    if (req->result_env != NULL) {
+        result = enif_make_copy(env, req->result);
+    } else {
+        result = make_error(env, "no_result");
+    }
+
+    ctx_request_release(req);
 
     return result;
 }
@@ -3467,45 +4327,69 @@ ERL_NIF_TERM dispatch_reactor_write_to_owngil(ErlNifEnv *env, py_context_t *ctx,
 /**
  * @brief Dispatch reactor init_connection to OWN_GIL thread
  *
- * Uses timeout to prevent deadlock if worker thread dies.
+ * Uses queue-based dispatch with per-request synchronization.
  */
 ERL_NIF_TERM dispatch_reactor_init_to_owngil(ErlNifEnv *env, py_context_t *ctx,
                                               int fd, ERL_NIF_TERM client_info) {
-    if (!atomic_load(&ctx->thread_running)) {
+    if (!atomic_load(&ctx->worker_running)) {
         return make_error(env, "thread_not_running");
     }
 
-    pthread_mutex_lock(&ctx->request_mutex);
+    if (atomic_load(&ctx->destroyed)) {
+        return make_error(env, "context_destroyed");
+    }
+
+    /* Create request struct */
+    ctx_request_t *req = ctx_request_create();
+    if (req == NULL) {
+        return make_error(env, "alloc_failed");
+    }
+
+    /* Populate request */
+    req->type = CTX_REQ_REACTOR_INIT_CONNECTION;
+    ERL_NIF_TERM fd_term = enif_make_int(req->request_env, fd);
+    ERL_NIF_TERM info_copy = enif_make_copy(req->request_env, client_info);
+    req->request_data = enif_make_tuple2(req->request_env, fd_term, info_copy);
+    req->reactor_fd = fd;
 
-    /* Clear and set up request */
-    enif_clear_env(ctx->shared_env);
-    ERL_NIF_TERM fd_term = enif_make_int(ctx->shared_env, fd);
-    ERL_NIF_TERM info_copy = enif_make_copy(ctx->shared_env, client_info);
-    ctx->request_term = enif_make_tuple2(ctx->shared_env, fd_term, info_copy);
-    ctx->request_type = CTX_REQ_REACTOR_INIT_CONNECTION;
+    /* Add ref for queue (now refcount = 2: caller + queue) */
+    ctx_request_addref(req);
 
-    /* Signal the worker thread */
-    pthread_cond_signal(&ctx->request_ready);
+    /* Enqueue the request */
+    ctx_queue_enqueue(ctx, req);
 
-    /* Wait for response with timeout to prevent deadlock */
+    /* Wait for completion with timeout */
     struct timespec deadline;
     clock_gettime(CLOCK_REALTIME, &deadline);
     deadline.tv_sec += OWNGIL_DISPATCH_TIMEOUT_SECS;
 
-    while (ctx->request_type != CTX_REQ_NONE) {
-        int rc = pthread_cond_timedwait(&ctx->response_ready, &ctx->request_mutex, &deadline);
+    ERL_NIF_TERM result;
+    pthread_mutex_lock(&req->mutex);
+
+    while (!atomic_load(&req->completed)) {
+        int rc = pthread_cond_timedwait(&req->cond, &req->mutex, &deadline);
         if (rc == ETIMEDOUT) {
-            atomic_store(&ctx->thread_running, false);
-            pthread_mutex_unlock(&ctx->request_mutex);
-            fprintf(stderr, "OWN_GIL reactor init dispatch timeout: worker thread unresponsive\n");
+            atomic_store(&req->cancelled, true);
+            pthread_mutex_unlock(&req->mutex);
+
+            fprintf(stderr, "OWN_GIL reactor init dispatch timeout after %d seconds\n",
+                    OWNGIL_DISPATCH_TIMEOUT_SECS);
+
+            ctx_request_release(req);
             return make_error(env, "worker_timeout");
         }
     }
 
-    /* Copy response back to caller's env */
-    ERL_NIF_TERM result = enif_make_copy(env, ctx->response_term);
+    pthread_mutex_unlock(&req->mutex);
+
+    /* Copy result to caller's env */
+    if (req->result_env != NULL) {
+        result = enif_make_copy(env, req->result);
+    } else {
+        result = make_error(env, "no_result");
+    }
 
-    pthread_mutex_unlock(&ctx->request_mutex);
+    ctx_request_release(req);
 
     return result;
 }
@@ -3513,47 +4397,69 @@ ERL_NIF_TERM dispatch_reactor_init_to_owngil(ErlNifEnv *env, py_context_t *ctx,
 /**
  * @brief Dispatch exec_with_env to OWN_GIL thread
  *
- * Passes the process-local env resource to the worker thread via local_env_ptr.
- * Uses timeout to prevent deadlock if worker thread dies.
+ * Uses queue-based dispatch with per-request synchronization.
  */
 static ERL_NIF_TERM dispatch_exec_with_env_to_owngil(
     ErlNifEnv *env, py_context_t *ctx,
     ERL_NIF_TERM code, py_env_resource_t *penv
 ) {
-    if (!atomic_load(&ctx->thread_running)) {
+    if (!atomic_load(&ctx->worker_running)) {
         return make_error(env, "thread_not_running");
     }
 
-    pthread_mutex_lock(&ctx->request_mutex);
+    if (atomic_load(&ctx->destroyed)) {
+        return make_error(env, "context_destroyed");
+    }
+
+    /* Create request struct */
+    ctx_request_t *req = ctx_request_create();
+    if (req == NULL) {
+        return make_error(env, "alloc_failed");
+    }
+
+    /* Populate request */
+    req->type = CTX_REQ_EXEC_WITH_ENV;
+    req->request_data = enif_make_copy(req->request_env, code);
+    req->local_env_ptr = penv;
 
-    /* Copy request to shared env */
-    enif_clear_env(ctx->shared_env);
-    ctx->request_term = enif_make_copy(ctx->shared_env, code);
-    ctx->local_env_ptr = penv;  /* Pass env resource pointer */
-    ctx->request_type = CTX_REQ_EXEC_WITH_ENV;
+    /* Add ref for queue */
+    ctx_request_addref(req);
 
-    /* Signal the worker thread */
-    pthread_cond_signal(&ctx->request_ready);
+    /* Enqueue the request */
+    ctx_queue_enqueue(ctx, req);
 
-    /* Wait for response with timeout to prevent deadlock */
+    /* Wait for completion with timeout */
     struct timespec deadline;
     clock_gettime(CLOCK_REALTIME, &deadline);
     deadline.tv_sec += OWNGIL_DISPATCH_TIMEOUT_SECS;
 
-    while (ctx->request_type != CTX_REQ_NONE) {
-        int rc = pthread_cond_timedwait(&ctx->response_ready, &ctx->request_mutex, &deadline);
+    ERL_NIF_TERM result;
+    pthread_mutex_lock(&req->mutex);
+
+    while (!atomic_load(&req->completed)) {
+        int rc = pthread_cond_timedwait(&req->cond, &req->mutex, &deadline);
         if (rc == ETIMEDOUT) {
-            atomic_store(&ctx->thread_running, false);
-            pthread_mutex_unlock(&ctx->request_mutex);
-            fprintf(stderr, "OWN_GIL exec_with_env dispatch timeout: worker thread unresponsive\n");
+            atomic_store(&req->cancelled, true);
+            pthread_mutex_unlock(&req->mutex);
+
+            fprintf(stderr, "OWN_GIL exec_with_env dispatch timeout after %d seconds\n",
+                    OWNGIL_DISPATCH_TIMEOUT_SECS);
+
+            ctx_request_release(req);
             return make_error(env, "worker_timeout");
         }
     }
 
-    /* Copy response back to caller's env */
-    ERL_NIF_TERM result = enif_make_copy(env, ctx->response_term);
+    pthread_mutex_unlock(&req->mutex);
+
+    /* Copy result to caller's env */
+    if (req->result_env != NULL) {
+        result = enif_make_copy(env, req->result);
+    } else {
+        result = make_error(env, "no_result");
+    }
 
-    pthread_mutex_unlock(&ctx->request_mutex);
+    ctx_request_release(req);
 
     return result;
 }
@@ -3561,50 +4467,72 @@ static ERL_NIF_TERM dispatch_exec_with_env_to_owngil(
 /**
  * @brief Dispatch eval_with_env to OWN_GIL thread
  *
- * Passes the process-local env resource to the worker thread via local_env_ptr.
- * Uses timeout to prevent deadlock if worker thread dies.
+ * Uses queue-based dispatch with per-request synchronization.
  */
 static ERL_NIF_TERM dispatch_eval_with_env_to_owngil(
     ErlNifEnv *env, py_context_t *ctx,
     ERL_NIF_TERM code, ERL_NIF_TERM locals,
     py_env_resource_t *penv
 ) {
-    if (!atomic_load(&ctx->thread_running)) {
+    if (!atomic_load(&ctx->worker_running)) {
         return make_error(env, "thread_not_running");
     }
 
-    pthread_mutex_lock(&ctx->request_mutex);
+    if (atomic_load(&ctx->destroyed)) {
+        return make_error(env, "context_destroyed");
+    }
+
+    /* Create request struct */
+    ctx_request_t *req = ctx_request_create();
+    if (req == NULL) {
+        return make_error(env, "alloc_failed");
+    }
+
+    /* Populate request: {Code, Locals} */
+    req->type = CTX_REQ_EVAL_WITH_ENV;
+    ERL_NIF_TERM code_copy = enif_make_copy(req->request_env, code);
+    ERL_NIF_TERM locals_copy = enif_make_copy(req->request_env, locals);
+    req->request_data = enif_make_tuple2(req->request_env, code_copy, locals_copy);
+    req->local_env_ptr = penv;
 
-    /* Copy request to shared env: {Code, Locals} */
-    enif_clear_env(ctx->shared_env);
-    ERL_NIF_TERM code_copy = enif_make_copy(ctx->shared_env, code);
-    ERL_NIF_TERM locals_copy = enif_make_copy(ctx->shared_env, locals);
-    ctx->request_term = enif_make_tuple2(ctx->shared_env, code_copy, locals_copy);
-    ctx->local_env_ptr = penv;  /* Pass env resource pointer */
-    ctx->request_type = CTX_REQ_EVAL_WITH_ENV;
+    /* Add ref for queue */
+    ctx_request_addref(req);
 
-    /* Signal the worker thread */
-    pthread_cond_signal(&ctx->request_ready);
+    /* Enqueue the request */
+    ctx_queue_enqueue(ctx, req);
 
-    /* Wait for response with timeout to prevent deadlock */
+    /* Wait for completion with timeout */
     struct timespec deadline;
     clock_gettime(CLOCK_REALTIME, &deadline);
     deadline.tv_sec += OWNGIL_DISPATCH_TIMEOUT_SECS;
 
-    while (ctx->request_type != CTX_REQ_NONE) {
-        int rc = pthread_cond_timedwait(&ctx->response_ready, &ctx->request_mutex, &deadline);
-        if (rc == ETIMEDOUT) {
-            atomic_store(&ctx->thread_running, false);
-            pthread_mutex_unlock(&ctx->request_mutex);
-            fprintf(stderr, "OWN_GIL eval_with_env dispatch timeout: worker thread unresponsive\n");
+    ERL_NIF_TERM result;
+    pthread_mutex_lock(&req->mutex);
+
+    while (!atomic_load(&req->completed)) {
+        int rc = pthread_cond_timedwait(&req->cond, &req->mutex, &deadline);
+        if (rc == ETIMEDOUT) {
+            atomic_store(&req->cancelled, true);
+            pthread_mutex_unlock(&req->mutex);
+
+            fprintf(stderr, "OWN_GIL eval_with_env dispatch timeout after %d seconds\n",
+                    OWNGIL_DISPATCH_TIMEOUT_SECS);
+
+            ctx_request_release(req);
             return make_error(env, "worker_timeout");
         }
     }
 
-    /* Copy response back to caller's env */
-    ERL_NIF_TERM result = enif_make_copy(env, ctx->response_term);
+    pthread_mutex_unlock(&req->mutex);
+
+    /* Copy result to caller's env */
+    if (req->result_env != NULL) {
+        result = enif_make_copy(env, req->result);
+    } else {
+        result = make_error(env, "no_result");
+    }
 
-    pthread_mutex_unlock(&ctx->request_mutex);
+    ctx_request_release(req);
 
     return result;
 }
@@ -3612,8 +4540,7 @@ static ERL_NIF_TERM dispatch_eval_with_env_to_owngil(
 /**
  * @brief Dispatch call_with_env to OWN_GIL thread
  *
- * Passes the process-local env resource to the worker thread via local_env_ptr.
- * Uses timeout to prevent deadlock if worker thread dies.
+ * Uses queue-based dispatch with per-request synchronization.
  */
 static ERL_NIF_TERM dispatch_call_with_env_to_owngil(
     ErlNifEnv *env, py_context_t *ctx,
@@ -3621,45 +4548,68 @@ static ERL_NIF_TERM dispatch_call_with_env_to_owngil(
     ERL_NIF_TERM args, ERL_NIF_TERM kwargs,
     py_env_resource_t *penv
 ) {
-    if (!atomic_load(&ctx->thread_running)) {
+    if (!atomic_load(&ctx->worker_running)) {
         return make_error(env, "thread_not_running");
     }
 
-    pthread_mutex_lock(&ctx->request_mutex);
+    if (atomic_load(&ctx->destroyed)) {
+        return make_error(env, "context_destroyed");
+    }
+
+    /* Create request struct */
+    ctx_request_t *req = ctx_request_create();
+    if (req == NULL) {
+        return make_error(env, "alloc_failed");
+    }
 
-    /* Copy request to shared env: {Module, Func, Args, Kwargs} */
-    enif_clear_env(ctx->shared_env);
-    ERL_NIF_TERM module_copy = enif_make_copy(ctx->shared_env, module);
-    ERL_NIF_TERM func_copy = enif_make_copy(ctx->shared_env, func);
-    ERL_NIF_TERM args_copy = enif_make_copy(ctx->shared_env, args);
-    ERL_NIF_TERM kwargs_copy = enif_make_copy(ctx->shared_env, kwargs);
-    ctx->request_term = enif_make_tuple4(ctx->shared_env,
+    /* Populate request: {Module, Func, Args, Kwargs} */
+    req->type = CTX_REQ_CALL_WITH_ENV;
+    ERL_NIF_TERM module_copy = enif_make_copy(req->request_env, module);
+    ERL_NIF_TERM func_copy = enif_make_copy(req->request_env, func);
+    ERL_NIF_TERM args_copy = enif_make_copy(req->request_env, args);
+    ERL_NIF_TERM kwargs_copy = enif_make_copy(req->request_env, kwargs);
+    req->request_data = enif_make_tuple4(req->request_env,
         module_copy, func_copy, args_copy, kwargs_copy);
-    ctx->local_env_ptr = penv;  /* Pass env resource pointer */
-    ctx->request_type = CTX_REQ_CALL_WITH_ENV;
+    req->local_env_ptr = penv;
 
-    /* Signal the worker thread */
-    pthread_cond_signal(&ctx->request_ready);
+    /* Add ref for queue */
+    ctx_request_addref(req);
 
-    /* Wait for response with timeout to prevent deadlock */
+    /* Enqueue the request */
+    ctx_queue_enqueue(ctx, req);
+
+    /* Wait for completion with timeout */
     struct timespec deadline;
     clock_gettime(CLOCK_REALTIME, &deadline);
     deadline.tv_sec += OWNGIL_DISPATCH_TIMEOUT_SECS;
 
-    while (ctx->request_type != CTX_REQ_NONE) {
-        int rc = pthread_cond_timedwait(&ctx->response_ready, &ctx->request_mutex, &deadline);
+    ERL_NIF_TERM result;
+    pthread_mutex_lock(&req->mutex);
+
+    while (!atomic_load(&req->completed)) {
+        int rc = pthread_cond_timedwait(&req->cond, &req->mutex, &deadline);
         if (rc == ETIMEDOUT) {
-            atomic_store(&ctx->thread_running, false);
-            pthread_mutex_unlock(&ctx->request_mutex);
-            fprintf(stderr, "OWN_GIL call_with_env dispatch timeout: worker thread unresponsive\n");
+            atomic_store(&req->cancelled, true);
+            pthread_mutex_unlock(&req->mutex);
+
+            fprintf(stderr, "OWN_GIL call_with_env dispatch timeout after %d seconds\n",
+                    OWNGIL_DISPATCH_TIMEOUT_SECS);
+
+            ctx_request_release(req);
             return make_error(env, "worker_timeout");
         }
     }
 
-    /* Copy response back to caller's env */
-    ERL_NIF_TERM result = enif_make_copy(env, ctx->response_term);
+    pthread_mutex_unlock(&req->mutex);
 
-    pthread_mutex_unlock(&ctx->request_mutex);
+    /* Copy result to caller's env */
+    if (req->result_env != NULL) {
+        result = enif_make_copy(env, req->result);
+    } else {
+        result = make_error(env, "no_result");
+    }
+
+    ctx_request_release(req);
 
     return result;
 }
@@ -3667,47 +4617,68 @@ static ERL_NIF_TERM dispatch_call_with_env_to_owngil(
 /**
  * @brief Dispatch create_local_env to OWN_GIL thread
  *
- * Creates the globals/locals dicts in the correct interpreter context.
- * Returns ok or error.
- * Uses timeout to prevent deadlock if worker thread dies.
+ * Uses queue-based dispatch with per-request synchronization.
  */
 static ERL_NIF_TERM dispatch_create_local_env_to_owngil(
     ErlNifEnv *env, py_context_t *ctx,
     py_env_resource_t *res
 ) {
-    if (!atomic_load(&ctx->thread_running)) {
+    if (!atomic_load(&ctx->worker_running)) {
         return make_error(env, "thread_not_running");
     }
 
-    pthread_mutex_lock(&ctx->request_mutex);
+    if (atomic_load(&ctx->destroyed)) {
+        return make_error(env, "context_destroyed");
+    }
+
+    /* Create request struct */
+    ctx_request_t *req = ctx_request_create();
+    if (req == NULL) {
+        return make_error(env, "alloc_failed");
+    }
+
+    /* Populate request */
+    req->type = CTX_REQ_CREATE_LOCAL_ENV;
+    req->local_env_ptr = res;
 
-    /* Pass env resource pointer to worker thread */
-    enif_clear_env(ctx->shared_env);
-    ctx->local_env_ptr = res;
-    ctx->request_type = CTX_REQ_CREATE_LOCAL_ENV;
+    /* Add ref for queue */
+    ctx_request_addref(req);
 
-    /* Signal the worker thread */
-    pthread_cond_signal(&ctx->request_ready);
+    /* Enqueue the request */
+    ctx_queue_enqueue(ctx, req);
 
-    /* Wait for response with timeout to prevent deadlock */
+    /* Wait for completion with timeout */
     struct timespec deadline;
     clock_gettime(CLOCK_REALTIME, &deadline);
     deadline.tv_sec += OWNGIL_DISPATCH_TIMEOUT_SECS;
 
-    while (ctx->request_type != CTX_REQ_NONE) {
-        int rc = pthread_cond_timedwait(&ctx->response_ready, &ctx->request_mutex, &deadline);
+    ERL_NIF_TERM result;
+    pthread_mutex_lock(&req->mutex);
+
+    while (!atomic_load(&req->completed)) {
+        int rc = pthread_cond_timedwait(&req->cond, &req->mutex, &deadline);
         if (rc == ETIMEDOUT) {
-            atomic_store(&ctx->thread_running, false);
-            pthread_mutex_unlock(&ctx->request_mutex);
-            fprintf(stderr, "OWN_GIL create_local_env dispatch timeout: worker thread unresponsive\n");
+            atomic_store(&req->cancelled, true);
+            pthread_mutex_unlock(&req->mutex);
+
+            fprintf(stderr, "OWN_GIL create_local_env dispatch timeout after %d seconds\n",
+                    OWNGIL_DISPATCH_TIMEOUT_SECS);
+
+            ctx_request_release(req);
             return make_error(env, "worker_timeout");
         }
     }
 
-    /* Copy response back to caller's env */
-    ERL_NIF_TERM result = enif_make_copy(env, ctx->response_term);
+    pthread_mutex_unlock(&req->mutex);
+
+    /* Copy result to caller's env */
+    if (req->result_env != NULL) {
+        result = enif_make_copy(env, req->result);
+    } else {
+        result = make_error(env, "no_result");
+    }
 
-    pthread_mutex_unlock(&ctx->request_mutex);
+    ctx_request_release(req);
 
     return result;
 }
@@ -3715,43 +4686,67 @@ static ERL_NIF_TERM dispatch_create_local_env_to_owngil(
 /**
  * @brief Dispatch apply_imports to OWN_GIL worker thread
  *
- * @param env NIF environment
- * @param ctx Context resource
- * @param imports_term List of {ModuleBin, FuncBin | all} tuples
- * @return ok | {error, Reason}
+ * Uses queue-based dispatch with per-request synchronization.
  */
 static ERL_NIF_TERM dispatch_apply_imports_to_owngil(
     ErlNifEnv *env, py_context_t *ctx, ERL_NIF_TERM imports_term
 ) {
-    if (!atomic_load(&ctx->thread_running)) {
+    if (!atomic_load(&ctx->worker_running)) {
         return make_error(env, "thread_not_running");
     }
 
-    pthread_mutex_lock(&ctx->request_mutex);
+    if (atomic_load(&ctx->destroyed)) {
+        return make_error(env, "context_destroyed");
+    }
+
+    /* Create request struct */
+    ctx_request_t *req = ctx_request_create();
+    if (req == NULL) {
+        return make_error(env, "alloc_failed");
+    }
+
+    /* Populate request */
+    req->type = CTX_REQ_APPLY_IMPORTS;
+    req->request_data = enif_make_copy(req->request_env, imports_term);
 
-    enif_clear_env(ctx->shared_env);
-    ctx->request_term = enif_make_copy(ctx->shared_env, imports_term);
-    ctx->request_type = CTX_REQ_APPLY_IMPORTS;
+    /* Add ref for queue */
+    ctx_request_addref(req);
 
-    pthread_cond_signal(&ctx->request_ready);
+    /* Enqueue the request */
+    ctx_queue_enqueue(ctx, req);
 
-    /* Wait for response with timeout */
+    /* Wait for completion with timeout */
     struct timespec deadline;
     clock_gettime(CLOCK_REALTIME, &deadline);
     deadline.tv_sec += OWNGIL_DISPATCH_TIMEOUT_SECS;
 
-    while (ctx->request_type != CTX_REQ_NONE) {
-        int rc = pthread_cond_timedwait(&ctx->response_ready, &ctx->request_mutex, &deadline);
+    ERL_NIF_TERM result;
+    pthread_mutex_lock(&req->mutex);
+
+    while (!atomic_load(&req->completed)) {
+        int rc = pthread_cond_timedwait(&req->cond, &req->mutex, &deadline);
         if (rc == ETIMEDOUT) {
-            atomic_store(&ctx->thread_running, false);
-            pthread_mutex_unlock(&ctx->request_mutex);
-            fprintf(stderr, "OWN_GIL apply_imports dispatch timeout: worker thread unresponsive\n");
+            atomic_store(&req->cancelled, true);
+            pthread_mutex_unlock(&req->mutex);
+
+            fprintf(stderr, "OWN_GIL apply_imports dispatch timeout after %d seconds\n",
+                    OWNGIL_DISPATCH_TIMEOUT_SECS);
+
+            ctx_request_release(req);
             return make_error(env, "worker_timeout");
         }
     }
 
-    ERL_NIF_TERM result = enif_make_copy(env, ctx->response_term);
-    pthread_mutex_unlock(&ctx->request_mutex);
+    pthread_mutex_unlock(&req->mutex);
+
+    /* Copy result to caller's env */
+    if (req->result_env != NULL) {
+        result = enif_make_copy(env, req->result);
+    } else {
+        result = make_error(env, "no_result");
+    }
+
+    ctx_request_release(req);
 
     return result;
 }
@@ -3759,43 +4754,67 @@ static ERL_NIF_TERM dispatch_apply_imports_to_owngil(
 /**
  * @brief Dispatch apply_paths request to OWN_GIL worker thread
  *
- * @param env Current NIF environment
- * @param ctx OWN_GIL context
- * @param paths_term List of path binaries
- * @return ok | {error, Reason}
+ * Uses queue-based dispatch with per-request synchronization.
  */
 static ERL_NIF_TERM dispatch_apply_paths_to_owngil(
     ErlNifEnv *env, py_context_t *ctx, ERL_NIF_TERM paths_term
 ) {
-    if (!atomic_load(&ctx->thread_running)) {
+    if (!atomic_load(&ctx->worker_running)) {
         return make_error(env, "thread_not_running");
     }
 
-    pthread_mutex_lock(&ctx->request_mutex);
+    if (atomic_load(&ctx->destroyed)) {
+        return make_error(env, "context_destroyed");
+    }
+
+    /* Create request struct */
+    ctx_request_t *req = ctx_request_create();
+    if (req == NULL) {
+        return make_error(env, "alloc_failed");
+    }
 
-    enif_clear_env(ctx->shared_env);
-    ctx->request_term = enif_make_copy(ctx->shared_env, paths_term);
-    ctx->request_type = CTX_REQ_APPLY_PATHS;
+    /* Populate request */
+    req->type = CTX_REQ_APPLY_PATHS;
+    req->request_data = enif_make_copy(req->request_env, paths_term);
 
-    pthread_cond_signal(&ctx->request_ready);
+    /* Add ref for queue */
+    ctx_request_addref(req);
 
-    /* Wait for response with timeout */
+    /* Enqueue the request */
+    ctx_queue_enqueue(ctx, req);
+
+    /* Wait for completion with timeout */
     struct timespec deadline;
     clock_gettime(CLOCK_REALTIME, &deadline);
     deadline.tv_sec += OWNGIL_DISPATCH_TIMEOUT_SECS;
 
-    while (ctx->request_type != CTX_REQ_NONE) {
-        int rc = pthread_cond_timedwait(&ctx->response_ready, &ctx->request_mutex, &deadline);
+    ERL_NIF_TERM result;
+    pthread_mutex_lock(&req->mutex);
+
+    while (!atomic_load(&req->completed)) {
+        int rc = pthread_cond_timedwait(&req->cond, &req->mutex, &deadline);
         if (rc == ETIMEDOUT) {
-            atomic_store(&ctx->thread_running, false);
-            pthread_mutex_unlock(&ctx->request_mutex);
-            fprintf(stderr, "OWN_GIL apply_paths dispatch timeout: worker thread unresponsive\n");
+            atomic_store(&req->cancelled, true);
+            pthread_mutex_unlock(&req->mutex);
+
+            fprintf(stderr, "OWN_GIL apply_paths dispatch timeout after %d seconds\n",
+                    OWNGIL_DISPATCH_TIMEOUT_SECS);
+
+            ctx_request_release(req);
             return make_error(env, "worker_timeout");
         }
     }
 
-    ERL_NIF_TERM result = enif_make_copy(env, ctx->response_term);
-    pthread_mutex_unlock(&ctx->request_mutex);
+    pthread_mutex_unlock(&req->mutex);
+
+    /* Copy result to caller's env */
+    if (req->result_env != NULL) {
+        result = enif_make_copy(env, req->result);
+    } else {
+        result = make_error(env, "no_result");
+    }
+
+    ctx_request_release(req);
 
     return result;
 }
@@ -3813,66 +4832,72 @@ static int owngil_context_init(py_context_t *ctx) {
     ctx->uses_own_gil = true;
     ctx->own_gil_tstate = NULL;
     ctx->own_gil_interp = NULL;
-    ctx->local_env_ptr = NULL;
-    atomic_store(&ctx->thread_running, false);
+
+    /* Initialize worker thread state */
+    atomic_store(&ctx->worker_running, false);
     atomic_store(&ctx->init_error, false);
     atomic_store(&ctx->shutdown_requested, false);
+    atomic_store(&ctx->leaked, false);
+
+    /* Initialize request queue */
+    ctx->queue_head = NULL;
+    ctx->queue_tail = NULL;
+
+    /* Initialize legacy compatibility fields */
+    ctx->shared_env = NULL;
     ctx->request_type = CTX_REQ_NONE;
     ctx->request_term = 0;
-    ctx->request_data = 0;
     ctx->response_term = 0;
     ctx->response_ok = false;
+    ctx->local_env_ptr = NULL;
+    ctx->reactor_buffer_ptr = NULL;
 
-    /* Initialize mutex and condition variables */
-    if (pthread_mutex_init(&ctx->request_mutex, NULL) != 0) {
-        return -1;
-    }
-
-    if (pthread_cond_init(&ctx->request_ready, NULL) != 0) {
-        pthread_mutex_destroy(&ctx->request_mutex);
+    /* Initialize queue mutex */
+    if (pthread_mutex_init(&ctx->queue_mutex, NULL) != 0) {
         return -1;
     }
 
-    if (pthread_cond_init(&ctx->response_ready, NULL) != 0) {
-        pthread_cond_destroy(&ctx->request_ready);
-        pthread_mutex_destroy(&ctx->request_mutex);
+    /* Initialize queue condition variable */
+    if (pthread_cond_init(&ctx->queue_not_empty, NULL) != 0) {
+        pthread_mutex_destroy(&ctx->queue_mutex);
         return -1;
     }
 
-    /* Create shared environment for term passing */
-    ctx->shared_env = enif_alloc_env();
-    if (ctx->shared_env == NULL) {
-        pthread_cond_destroy(&ctx->response_ready);
-        pthread_cond_destroy(&ctx->request_ready);
-        pthread_mutex_destroy(&ctx->request_mutex);
+    /* Create message environment for async responses */
+    ctx->msg_env = enif_alloc_env();
+    if (ctx->msg_env == NULL) {
+        pthread_cond_destroy(&ctx->queue_not_empty);
+        pthread_mutex_destroy(&ctx->queue_mutex);
         return -1;
     }
 
     /* Start the worker thread */
-    if (pthread_create(&ctx->own_gil_thread, NULL, owngil_context_thread_main, ctx) != 0) {
-        enif_free_env(ctx->shared_env);
-        pthread_cond_destroy(&ctx->response_ready);
-        pthread_cond_destroy(&ctx->request_ready);
-        pthread_mutex_destroy(&ctx->request_mutex);
+    if (pthread_create(&ctx->worker_thread, NULL, owngil_context_thread_main, ctx) != 0) {
+        enif_free_env(ctx->msg_env);
+        ctx->msg_env = NULL;
+        pthread_cond_destroy(&ctx->queue_not_empty);
+        pthread_mutex_destroy(&ctx->queue_mutex);
         return -1;
     }
 
     /* Wait for thread to initialize or fail */
     int wait_count = 0;
-    while (!atomic_load(&ctx->thread_running) &&
+    while (!atomic_load(&ctx->worker_running) &&
            !atomic_load(&ctx->init_error) &&
            wait_count < 2000) {
         usleep(1000);  /* 1ms */
         wait_count++;
     }
 
-    if (atomic_load(&ctx->init_error) || !atomic_load(&ctx->thread_running)) {
+    if (atomic_load(&ctx->init_error) || !atomic_load(&ctx->worker_running)) {
         /* Thread failed to start */
-        pthread_join(ctx->own_gil_thread, NULL);
-        enif_free_env(ctx->shared_env);
-        pthread_cond_destroy(&ctx->response_ready);
-        pthread_cond_destroy(&ctx->request_ready);
-        pthread_mutex_destroy(&ctx->request_mutex);
+        pthread_join(ctx->worker_thread, NULL);
+        if (ctx->msg_env != NULL) {
+            enif_free_env(ctx->msg_env);
+            ctx->msg_env = NULL;
+        }
+        pthread_cond_destroy(&ctx->queue_not_empty);
+        pthread_mutex_destroy(&ctx->queue_mutex);
         return -1;
     }
 
@@ -3882,7 +4907,9 @@ static int owngil_context_init(py_context_t *ctx) {
 /**
  * @brief Shutdown OWN_GIL context and clean up resources
  *
- * Uses a timeout to avoid hanging forever if the Python thread is stuck.
+ * Uses the join-or-leak pattern: if the worker thread does not exit
+ * within the timeout, the context is marked as leaked and its shared
+ * resources are NOT freed, which avoids a use-after-free.
  *
  * @param ctx Context to shutdown
  */
@@ -3896,48 +4923,58 @@ static void owngil_context_shutdown(py_context_t *ctx) {
     /* Signal shutdown */
     atomic_store(&ctx->shutdown_requested, true);
 
-    pthread_mutex_lock(&ctx->request_mutex);
-    ctx->request_type = CTX_REQ_SHUTDOWN;
-    pthread_cond_signal(&ctx->request_ready);
-    pthread_mutex_unlock(&ctx->request_mutex);
+    /* Cancel all pending (not-yet-started) requests */
+    ctx_queue_cancel_all(ctx);
+
+    /* Enqueue shutdown request to wake worker if idle */
+    ctx_request_t *shutdown_req = ctx_request_create();
+    if (shutdown_req != NULL) {
+        shutdown_req->type = CTX_REQ_SHUTDOWN;
+        ctx_queue_enqueue(ctx, shutdown_req);
+    }
 
     /* Wait for thread to exit with timeout */
+    bool join_succeeded = false;
+
 #if defined(__linux__)
     struct timespec deadline;
     clock_gettime(CLOCK_REALTIME, &deadline);
     deadline.tv_sec += OWNGIL_SHUTDOWN_TIMEOUT_SECS;
-    int rc = pthread_timedjoin_np(ctx->own_gil_thread, NULL, &deadline);
-    if (rc == ETIMEDOUT) {
-        fprintf(stderr, "OWN_GIL shutdown timeout after %d seconds, detaching thread\n",
-                OWNGIL_SHUTDOWN_TIMEOUT_SECS);
-        pthread_detach(ctx->own_gil_thread);
-    }
+    int rc = pthread_timedjoin_np(ctx->worker_thread, NULL, &deadline);
+    join_succeeded = (rc == 0);
 #else
-    /* macOS/other: poll thread_running flag with timeout */
+    /* macOS/other: poll worker_running flag with timeout */
     int wait_ms = 0;
-    while (atomic_load(&ctx->thread_running) &&
+    while (atomic_load(&ctx->worker_running) &&
            wait_ms < OWNGIL_SHUTDOWN_TIMEOUT_SECS * 1000) {
         usleep(100000);  /* 100ms */
         wait_ms += 100;
     }
-    if (atomic_load(&ctx->thread_running)) {
-        fprintf(stderr, "OWN_GIL shutdown timeout after %d seconds, detaching thread\n",
-                OWNGIL_SHUTDOWN_TIMEOUT_SECS);
-        pthread_detach(ctx->own_gil_thread);
-    } else {
-        pthread_join(ctx->own_gil_thread, NULL);
+    if (!atomic_load(&ctx->worker_running)) {
+        pthread_join(ctx->worker_thread, NULL);
+        join_succeeded = true;
     }
 #endif
 
-    /* Clean up resources */
-    if (ctx->shared_env != NULL) {
-        enif_free_env(ctx->shared_env);
-        ctx->shared_env = NULL;
+    if (!join_succeeded) {
+        /* Worker thread is unresponsive - use leak pattern */
+        fprintf(stderr, "OWN_GIL shutdown timeout after %d seconds, leaking context\n",
+                OWNGIL_SHUTDOWN_TIMEOUT_SECS);
+        atomic_store(&ctx->leaked, true);
+        /* Do NOT free shared resources - the worker thread may still be using
+         * them. The leaked thread is isolated: it either cleans itself up when
+         * Python exits or persists until the VM exits. */
+        return;
+    }
+
+    /* Clean shutdown succeeded - safe to free resources */
+    if (ctx->msg_env != NULL) {
+        enif_free_env(ctx->msg_env);
+        ctx->msg_env = NULL;
     }
 
-    pthread_cond_destroy(&ctx->response_ready);
-    pthread_cond_destroy(&ctx->request_ready);
-    pthread_mutex_destroy(&ctx->request_mutex);
+    pthread_cond_destroy(&ctx->queue_not_empty);
+    pthread_mutex_destroy(&ctx->queue_mutex);
 
     ctx->uses_own_gil = false;
 }
@@ -3990,7 +5027,9 @@ static ERL_NIF_TERM nif_context_create(ErlNifEnv *env, int argc, const ERL_NIF_T
     /* Initialize fields */
     ctx->interp_id = atomic_fetch_add(&g_context_id_counter, 1);
     ctx->is_subinterp = use_owngil;
-    ctx->destroyed = false;
+    atomic_store(&ctx->destroyed, false);
+    atomic_store(&ctx->leaked, false);
+    atomic_store(&ctx->init_error, false);
     ctx->has_callback_handler = false;
     ctx->callback_pipe[0] = -1;
     ctx->callback_pipe[1] = -1;
@@ -3998,6 +5037,7 @@ static ERL_NIF_TERM nif_context_create(ErlNifEnv *env, int argc, const ERL_NIF_T
     ctx->locals = NULL;
     ctx->module_cache = NULL;
     ctx->executor_id = -1;  /* Not assigned yet */
+    ctx->uses_worker_thread = false;
 
     /* Create callback pipe for blocking callback responses */
     if (pipe(ctx->callback_pipe) < 0) {
@@ -4023,38 +5063,14 @@ static ERL_NIF_TERM nif_context_create(ErlNifEnv *env, int argc, const ERL_NIF_T
         return enif_make_tuple3(env, ATOM_OK, ref, enif_make_uint(env, ctx->interp_id));
     }
 #endif
-    {
-        /* Worker mode - create a thread state in main interpreter */
-        PyGILState_STATE gstate = PyGILState_Ensure();
-
-#ifndef HAVE_SUBINTERPRETERS
-        PyInterpreterState *interp = PyInterpreterState_Get();
-        ctx->thread_state = PyThreadState_New(interp);
-#endif
-
-        ctx->globals = PyDict_New();
-        ctx->locals = PyDict_New();
-        ctx->module_cache = PyDict_New();
-
-        /* Import __builtins__ into globals */
-        PyObject *builtins = PyEval_GetBuiltins();
-        PyDict_SetItemString(ctx->globals, "__builtins__", builtins);
-
-        /* Import erlang module into globals for worker mode */
-        PyObject *erlang_module = PyImport_ImportModule("erlang");
-        if (erlang_module != NULL) {
-            PyDict_SetItemString(ctx->globals, "erlang", erlang_module);
-            Py_DECREF(erlang_module);
-        }
-
-        PyGILState_Release(gstate);
-    }
 
-    /* Assign executor for thread affinity in MULTI_EXECUTOR mode.
-     * This ensures numpy/torch thread-local state consistency. */
-    if (g_execution_mode == PY_MODE_MULTI_EXECUTOR &&
-        atomic_load(&g_multi_executor_initialized)) {
-        ctx->executor_id = select_executor();
+    /* Worker mode: create a dedicated pthread bound to the main interpreter.
+     * This provides stable thread affinity for numpy/torch/tensorflow. */
+    if (worker_context_init(ctx) != 0) {
+        close(ctx->callback_pipe[0]);
+        close(ctx->callback_pipe[1]);
+        enif_release_resource(ctx);
+        return make_error(env, "worker_init_failed");
     }
 
     ERL_NIF_TERM ref = enif_make_resource(env, ctx);
@@ -4069,10 +5085,10 @@ static ERL_NIF_TERM nif_context_create(ErlNifEnv *env, int argc, const ERL_NIF_T
  *
  * nif_context_destroy(ContextRef) -> ok
  *
- * For subinterpreter mode: releases the pool slot back to the pool.
- * The pool owns the Python objects - context just references them.
+ * For owngil mode: shuts down the dedicated OWN_GIL thread.
+ * For worker mode: shuts down the dedicated worker thread.
  *
- * For worker mode: cleans up Python objects directly with the main GIL.
+ * Both modes use the join-or-leak pattern for safe shutdown.
  */
 static ERL_NIF_TERM nif_context_destroy(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
     (void)argc;
@@ -4083,12 +5099,12 @@ static ERL_NIF_TERM nif_context_destroy(ErlNifEnv *env, int argc, const ERL_NIF_
     }
 
     /* Skip if already destroyed */
-    if (ctx->destroyed) {
+    if (atomic_load(&ctx->destroyed)) {
         return ATOM_OK;
     }
 
     /* Mark as destroyed early to prevent new operations */
-    ctx->destroyed = true;
+    atomic_store(&ctx->destroyed, true);
 
 #ifdef HAVE_SUBINTERPRETERS
     /* OWN_GIL mode: shutdown the dedicated thread */
@@ -4108,7 +5124,23 @@ static ERL_NIF_TERM nif_context_destroy(ErlNifEnv *env, int argc, const ERL_NIF_
     }
 #endif
 
-    /* Worker mode - clean up Python objects with GIL */
+    /* Worker mode: shutdown the dedicated worker thread */
+    if (ctx->uses_worker_thread) {
+        worker_context_shutdown(ctx);
+        /* Close callback pipes */
+        if (ctx->callback_pipe[0] >= 0) {
+            close(ctx->callback_pipe[0]);
+            ctx->callback_pipe[0] = -1;
+        }
+        if (ctx->callback_pipe[1] >= 0) {
+            close(ctx->callback_pipe[1]);
+            ctx->callback_pipe[1] = -1;
+        }
+        atomic_fetch_add(&g_counters.ctx_destroyed, 1);
+        return ATOM_OK;
+    }
+
+    /* Legacy mode (should not reach here with new architecture) */
     if (runtime_is_running()) {
         PyGILState_STATE gstate = PyGILState_Ensure();
         Py_XDECREF(ctx->module_cache);
@@ -4127,6 +5159,16 @@ static ERL_NIF_TERM nif_context_destroy(ErlNifEnv *env, int argc, const ERL_NIF_
         PyGILState_Release(gstate);
     }
 
+    /* Close callback pipes */
+    if (ctx->callback_pipe[0] >= 0) {
+        close(ctx->callback_pipe[0]);
+        ctx->callback_pipe[0] = -1;
+    }
+    if (ctx->callback_pipe[1] >= 0) {
+        close(ctx->callback_pipe[1]);
+        ctx->callback_pipe[1] = -1;
+    }
+
     atomic_fetch_add(&g_counters.ctx_destroyed, 1);
     return ATOM_OK;
 }
@@ -4197,7 +5239,20 @@ static ERL_NIF_TERM nif_context_call(ErlNifEnv *env, int argc, const ERL_NIF_TER
     }
 #endif
 
-    /* Both worker mode and subinterpreter mode use py_context_acquire.
+    /* Worker thread mode: dispatch to dedicated thread */
+    if (ctx->uses_worker_thread) {
+        /* Build request tuple: {Module, Func, Args, Kwargs} */
+        ERL_NIF_TERM kwargs = (argc > 4 && enif_is_map(env, argv[4]))
+            ? argv[4] : enif_make_new_map(env);
+        ERL_NIF_TERM request = enif_make_tuple4(env,
+            argv[1],  /* Module */
+            argv[2],  /* Func */
+            argv[3],  /* Args */
+            kwargs);
+        return dispatch_to_worker_thread(env, ctx, CTX_REQ_CALL, request);
+    }
+
+    /* Legacy mode: direct execution with py_context_acquire.
      * For subinterpreters, py_context_acquire handles PyThreadState_Swap
      * to switch to the pool slot's interpreter. */
     ErlNifBinary module_bin, func_bin;
@@ -4208,15 +5263,6 @@ static ERL_NIF_TERM nif_context_call(ErlNifEnv *env, int argc, const ERL_NIF_TER
         return make_error(env, "invalid_func");
     }
 
-    /* Context thread affinity: dispatch via executor instead of direct execution.
-     * This ensures numpy/torch thread-local state consistency. */
-    if (ctx->executor_id >= 0 && g_execution_mode == PY_MODE_MULTI_EXECUTOR &&
-        atomic_load(&g_multi_executor_initialized)) {
-        ERL_NIF_TERM kwargs = (argc > 4 && enif_is_map(env, argv[4]))
-            ? argv[4] : enif_make_new_map(env);
-        return context_dispatch_call(env, ctx, &module_bin, &func_bin, argv[3], kwargs);
-    }
-
     char *module_name = binary_to_string(&module_bin);
     char *func_name = binary_to_string(&func_bin);
     if (module_name == NULL || func_name == NULL) {
@@ -4376,6 +5422,144 @@ static ERL_NIF_TERM nif_context_call(ErlNifEnv *env, int argc, const ERL_NIF_TER
     return result;
 }
 
+/**
+ * @brief Async call - enqueue and return immediately
+ *
+ * nif_context_call_async(ContextRef, CallerPid, RequestId, Module, Func, Args, Kwargs)
+ *     -> {enqueued, RequestId} | {error, Reason}
+ *
+ * The worker thread will send {py_result, RequestId, Result} to CallerPid when done.
+ */
+static ERL_NIF_TERM nif_context_call_async(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
+    py_context_t *ctx;
+
+    if (!runtime_is_running()) {
+        return make_error(env, "python_not_running");
+    }
+
+    if (argc < 6) {
+        return make_error(env, "badarg");
+    }
+
+    if (!enif_get_resource(env, argv[0], PY_CONTEXT_RESOURCE_TYPE, (void **)&ctx)) {
+        return make_error(env, "invalid_context");
+    }
+
+    /* Get caller PID (must be a local process) */
+    ErlNifPid caller_pid;
+    if (!enif_get_local_pid(env, argv[1], &caller_pid)) {
+        return make_error(env, "invalid_pid");
+    }
+
+    /* RequestId is argv[2] - can be any term */
+    ERL_NIF_TERM request_id = argv[2];
+
+    /* Worker thread mode: dispatch async */
+    if (ctx->uses_worker_thread) {
+        /* Build request tuple: {Module, Func, Args, Kwargs}; Kwargs is an empty map unless argv[6] is a map */
+        ERL_NIF_TERM kwargs = (argc > 6 && enif_is_map(env, argv[6]))
+            ? argv[6] : enif_make_new_map(env);
+        ERL_NIF_TERM request = enif_make_tuple4(env,
+            argv[3],  /* Module */
+            argv[4],  /* Func */
+            argv[5],  /* Args */
+            kwargs);
+        return dispatch_to_worker_thread_async(env, ctx, CTX_REQ_CALL,
+            request, caller_pid, request_id, NULL);
+    }
+
+    /* No worker thread: async dispatch is unavailable, so report an error (no blocking fallback) */
+    return make_error(env, "async_requires_worker_thread");
+}
+
+/**
+ * @brief Async eval - enqueue and return immediately
+ *
+ * nif_context_eval_async(ContextRef, CallerPid, RequestId, Code, Locals)
+ *     -> {enqueued, RequestId} | {error, Reason}
+ *
+ * The worker thread will send {py_result, RequestId, Result} to CallerPid when done.
+ */
+static ERL_NIF_TERM nif_context_eval_async(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
+    py_context_t *ctx;
+
+    if (!runtime_is_running()) {
+        return make_error(env, "python_not_running");
+    }
+
+    if (argc < 4) {
+        return make_error(env, "badarg");
+    }
+
+    if (!enif_get_resource(env, argv[0], PY_CONTEXT_RESOURCE_TYPE, (void **)&ctx)) {
+        return make_error(env, "invalid_context");
+    }
+
+    /* Get caller PID (must be a local process) */
+    ErlNifPid caller_pid;
+    if (!enif_get_local_pid(env, argv[1], &caller_pid)) {
+        return make_error(env, "invalid_pid");
+    }
+
+    /* RequestId is argv[2] - can be any term */
+    ERL_NIF_TERM request_id = argv[2];
+
+    /* Worker thread mode: dispatch async */
+    if (ctx->uses_worker_thread) {
+        /* Build request tuple: {Code, Locals}; Locals is an empty map unless argv[4] is a map */
+        ERL_NIF_TERM locals = (argc > 4 && enif_is_map(env, argv[4]))
+            ? argv[4] : enif_make_new_map(env);
+        ERL_NIF_TERM request = enif_make_tuple2(env, argv[3], locals);
+        return dispatch_to_worker_thread_async(env, ctx, CTX_REQ_EVAL,
+            request, caller_pid, request_id, NULL);
+    }
+
+    /* No worker thread: async dispatch is unavailable, so report an error (no blocking fallback) */
+    return make_error(env, "async_requires_worker_thread");
+}
+
+/**
+ * @brief Async exec - enqueue and return immediately
+ *
+ * nif_context_exec_async(ContextRef, CallerPid, RequestId, Code)
+ *     -> {enqueued, RequestId} | {error, Reason}
+ *
+ * The worker thread will send {py_result, RequestId, Result} to CallerPid when done.
+ */
+static ERL_NIF_TERM nif_context_exec_async(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
+    py_context_t *ctx;
+
+    if (!runtime_is_running()) {
+        return make_error(env, "python_not_running");
+    }
+
+    if (argc < 4) {
+        return make_error(env, "badarg");
+    }
+
+    if (!enif_get_resource(env, argv[0], PY_CONTEXT_RESOURCE_TYPE, (void **)&ctx)) {
+        return make_error(env, "invalid_context");
+    }
+
+    /* Get caller PID (must be a local process) */
+    ErlNifPid caller_pid;
+    if (!enif_get_local_pid(env, argv[1], &caller_pid)) {
+        return make_error(env, "invalid_pid");
+    }
+
+    /* RequestId is argv[2] - can be any term */
+    ERL_NIF_TERM request_id = argv[2];
+
+    /* Worker thread mode: dispatch async; the Code term (argv[3]) is passed as the request as-is */
+    if (ctx->uses_worker_thread) {
+        return dispatch_to_worker_thread_async(env, ctx, CTX_REQ_EXEC,
+            argv[3], caller_pid, request_id, NULL);
+    }
+
+    /* No worker thread: async dispatch is unavailable, so report an error (no blocking fallback) */
+    return make_error(env, "async_requires_worker_thread");
+}
+
 /**
  * @brief Evaluate a Python expression in a context
  *
@@ -4408,7 +5592,16 @@ static ERL_NIF_TERM nif_context_eval(ErlNifEnv *env, int argc, const ERL_NIF_TER
     }
 #endif
 
-    /* Both worker mode and subinterpreter mode use py_context_acquire.
+    /* Worker thread mode: dispatch to dedicated thread */
+    if (ctx->uses_worker_thread) {
+        /* Build request tuple: {Code, Locals} */
+        ERL_NIF_TERM locals = (argc > 2 && enif_is_map(env, argv[2]))
+            ? argv[2] : enif_make_new_map(env);
+        ERL_NIF_TERM request = enif_make_tuple2(env, argv[1], locals);
+        return dispatch_to_worker_thread(env, ctx, CTX_REQ_EVAL, request);
+    }
+
+    /* Legacy mode: direct execution with py_context_acquire.
      * For subinterpreters, py_context_acquire handles PyThreadState_Swap
      * to switch to the pool slot's interpreter. */
     ErlNifBinary code_bin;
@@ -4416,15 +5609,6 @@ static ERL_NIF_TERM nif_context_eval(ErlNifEnv *env, int argc, const ERL_NIF_TER
         return make_error(env, "invalid_code");
     }
 
-    /* Context thread affinity: dispatch via executor instead of direct execution.
-     * This ensures numpy/torch thread-local state consistency. */
-    if (ctx->executor_id >= 0 && g_execution_mode == PY_MODE_MULTI_EXECUTOR &&
-        atomic_load(&g_multi_executor_initialized)) {
-        ERL_NIF_TERM locals = (argc > 2 && enif_is_map(env, argv[2]))
-            ? argv[2] : enif_make_new_map(env);
-        return context_dispatch_eval(env, ctx, &code_bin, locals);
-    }
-
     char *code = binary_to_string(&code_bin);
     if (code == NULL) {
         return make_error(env, "alloc_failed");
@@ -4554,7 +5738,12 @@ static ERL_NIF_TERM nif_context_exec(ErlNifEnv *env, int argc, const ERL_NIF_TER
     }
 #endif
 
-    /* Both worker mode and subinterpreter mode use py_context_acquire.
+    /* Worker thread mode: dispatch to dedicated thread */
+    if (ctx->uses_worker_thread) {
+        return dispatch_to_worker_thread(env, ctx, CTX_REQ_EXEC, argv[1]);
+    }
+
+    /* Legacy mode: direct execution with py_context_acquire.
      * For subinterpreters, py_context_acquire handles PyThreadState_Swap
      * to switch to the pool slot's interpreter. */
     ErlNifBinary code_bin;
@@ -4562,13 +5751,6 @@ static ERL_NIF_TERM nif_context_exec(ErlNifEnv *env, int argc, const ERL_NIF_TER
         return make_error(env, "invalid_code");
     }
 
-    /* Context thread affinity: dispatch via executor instead of direct execution.
-     * This ensures numpy/torch thread-local state consistency. */
-    if (ctx->executor_id >= 0 && g_execution_mode == PY_MODE_MULTI_EXECUTOR &&
-        atomic_load(&g_multi_executor_initialized)) {
-        return context_dispatch_exec(env, ctx, &code_bin);
-    }
-
     char *code = binary_to_string(&code_bin);
     if (code == NULL) {
         return make_error(env, "alloc_failed");
@@ -4949,6 +6131,12 @@ static ERL_NIF_TERM nif_context_exec_with_env(ErlNifEnv *env, int argc, const ER
     }
 #endif
 
+    /* Worker thread mode: dispatch to dedicated thread with local env */
+    if (ctx->uses_worker_thread) {
+        /* For exec, we just pass the code binary */
+        return dispatch_to_worker_thread_impl(env, ctx, CTX_REQ_EXEC_WITH_ENV, argv[1], penv);
+    }
+
     char *code = binary_to_string(&code_bin);
     if (code == NULL) {
         return make_error(env, "alloc_failed");
@@ -5031,6 +6219,15 @@ static ERL_NIF_TERM nif_context_eval_with_env(ErlNifEnv *env, int argc, const ER
     }
 #endif
 
+    /* Worker thread mode: dispatch to dedicated thread with local env */
+    if (ctx->uses_worker_thread) {
+        /* Build request tuple: {Code, Locals} */
+        ERL_NIF_TERM locals = (argc > 2 && enif_is_map(env, argv[2]))
+            ? argv[2] : enif_make_new_map(env);
+        ERL_NIF_TERM request = enif_make_tuple2(env, argv[1], locals);
+        return dispatch_to_worker_thread_impl(env, ctx, CTX_REQ_EVAL_WITH_ENV, request, penv);
+    }
+
     char *code = binary_to_string(&code_bin);
     if (code == NULL) {
         return make_error(env, "alloc_failed");
@@ -5189,6 +6386,19 @@ static ERL_NIF_TERM nif_context_call_with_env(ErlNifEnv *env, int argc, const ER
     }
 #endif
 
+    /* Worker thread mode: dispatch to dedicated thread with local env */
+    if (ctx->uses_worker_thread) {
+        /* Build request tuple: {Module, Func, Args, Kwargs} */
+        ERL_NIF_TERM kwargs = (argc > 4 && enif_is_map(env, argv[4]))
+            ? argv[4] : enif_make_new_map(env);
+        ERL_NIF_TERM request = enif_make_tuple4(env,
+            argv[1],  /* Module */
+            argv[2],  /* Func */
+            argv[3],  /* Args */
+            kwargs);
+        return dispatch_to_worker_thread_impl(env, ctx, CTX_REQ_CALL_WITH_ENV, request, penv);
+    }
+
     char *module_name = binary_to_string(&module_bin);
     char *func_name = binary_to_string(&func_bin);
     if (module_name == NULL || func_name == NULL) {
@@ -7061,7 +8271,6 @@ static ErlNifFunc nif_funcs[] = {
 
     /* Execution mode info */
     {"execution_mode", 0, nif_execution_mode, 0},
-    {"num_executors", 0, nif_num_executors, 0},
 
     /* Thread worker support (ThreadPoolExecutor) */
     {"thread_worker_set_coordinator", 1, nif_thread_worker_set_coordinator, 0},
@@ -7158,6 +8367,10 @@ static ErlNifFunc nif_funcs[] = {
     {"context_exec", 3, nif_context_exec_with_env, ERL_NIF_DIRTY_JOB_CPU_BOUND},
     {"context_eval", 4, nif_context_eval_with_env, ERL_NIF_DIRTY_JOB_CPU_BOUND},
     {"context_call", 6, nif_context_call_with_env, ERL_NIF_DIRTY_JOB_CPU_BOUND},
+    /* Async dispatch - non-blocking, returns immediately */
+    {"context_call_async", 7, nif_context_call_async, 0},
+    {"context_eval_async", 5, nif_context_eval_async, 0},
+    {"context_exec_async", 4, nif_context_exec_async, 0},
     {"create_local_env", 1, nif_create_local_env, 0},
     {"interp_apply_imports", 2, nif_interp_apply_imports, ERL_NIF_DIRTY_JOB_CPU_BOUND},
     {"interp_apply_paths", 2, nif_interp_apply_paths, ERL_NIF_DIRTY_JOB_CPU_BOUND},
diff --git a/c_src/py_nif.h b/c_src/py_nif.h
index 050856a..d6ca87b 100644
--- a/c_src/py_nif.h
+++ b/c_src/py_nif.h
@@ -741,6 +741,150 @@ typedef enum {
     CTX_REQ_APPLY_PATHS         /**< Apply paths to sys.path */
 } ctx_request_type_t;
 
+/**
+ * @struct ctx_request_t
+ * @brief Heap-allocated request for worker/owngil context queue
+ *
+ * Each request is heap-allocated with its own mutex/condvar for completion
+ * signaling. This replaces the single-slot pattern that had race conditions
+ * with multiple concurrent callers.
+ *
+ * Lifecycle:
+ * 1. Caller allocates request with ctx_request_create() (refcount starts at 1)
+ * 2. Caller fills in request data and copies terms to request_env
+ * 3. Caller enqueues request and increments refcount (now 2: caller + queue)
+ * 4. Worker dequeues request, processes it, fills result_env/result
+ * 5. Worker sends result via enif_send() and releases queue's ref
+ * 6. Caller receives result and releases its ref
+ * 7. When refcount hits 0, request is freed
+ *
+ * For OWN_GIL mode, the worker thread sends results via enif_send() to avoid
+ * blocking dirty schedulers. For worker mode (main interpreter), the same
+ * pattern is used for consistency.
+ */
+typedef struct ctx_request {
+    /** @brief Type of request */
+    ctx_request_type_t type;
+
+    /** @brief Per-request mutex for completion synchronization */
+    pthread_mutex_t mutex;
+
+    /** @brief Per-request condition for completion signaling */
+    pthread_cond_t cond;
+
+    /** @brief Set by worker when done (for blocking wait mode) */
+    _Atomic bool completed;
+
+    /** @brief Set by caller on timeout/destroy to skip processing */
+    _Atomic bool cancelled;
+
+    /* Request data (owned by this struct, not caller) */
+
+    /** @brief Environment for request terms (allocated in ctx_request_create()) */
+    ErlNifEnv *request_env;
+
+    /** @brief Request parameters (in request_env) */
+    ERL_NIF_TERM request_data;
+
+    /** @brief Process-local env pointer for WITH_ENV requests */
+    void *local_env_ptr;
+
+    /** @brief Reactor buffer pointer for reactor requests */
+    void *reactor_buffer_ptr;
+
+    /** @brief FD for reactor requests (initialized to -1) */
+    int reactor_fd;
+
+    /* Result data (owned by this struct) */
+
+    /** @brief Environment for result terms (NULL until created by worker) */
+    ErlNifEnv *result_env;
+
+    /** @brief Result term (in result_env) */
+    ERL_NIF_TERM result;
+
+    /** @brief True if request succeeded */
+    bool success;
+
+    /* Async delivery (for non-blocking dispatch) */
+
+    /** @brief Caller's PID for async result delivery */
+    ErlNifPid caller_pid;
+
+    /** @brief Request ID for correlating async responses */
+    ERL_NIF_TERM request_id;
+
+    /** @brief Whether to use async delivery vs blocking wait */
+    bool async_mode;
+
+    /* Queue management */
+
+    /** @brief Reference count (1 after create, 2=caller+queue, freed when it drops to 0) */
+    _Atomic int refcount;
+
+    /** @brief Next request in queue */
+    struct ctx_request *next;
+} ctx_request_t;
+
+/**
+ * @brief Create a new context request (zeroed, with mutex/cond and request_env set up)
+ * @return Newly allocated request with refcount=1, or NULL if enif_alloc() fails
+ */
+static inline ctx_request_t *ctx_request_create(void) {
+    ctx_request_t *req = enif_alloc(sizeof(ctx_request_t));
+    if (req == NULL) return NULL;
+
+    memset(req, 0, sizeof(ctx_request_t));
+    pthread_mutex_init(&req->mutex, NULL);
+    pthread_cond_init(&req->cond, NULL);
+    atomic_store(&req->completed, false);
+    atomic_store(&req->cancelled, false);
+    atomic_store(&req->refcount, 1);
+    req->request_env = enif_alloc_env();  /* NOTE(review): result is not NULL-checked */
+    req->result_env = NULL;  /* Created by worker when processing */
+    req->next = NULL;
+    req->async_mode = false;
+    req->reactor_fd = -1;  /* -1 marks "no fd" */
+    req->local_env_ptr = NULL;
+    req->reactor_buffer_ptr = NULL;
+
+    return req;
+}
+
+/**
+ * @brief Add a reference to a context request (atomic increment of refcount)
+ * @param req The request (may be NULL, in which case nothing happens)
+ */
+static inline void ctx_request_addref(ctx_request_t *req) {
+    if (req) {
+        atomic_fetch_add(&req->refcount, 1);
+    }
+}
+
+/**
+ * @brief Release a reference to a context request
+ * @param req The request (may be NULL)
+ *
+ * When refcount reaches 0, destroys mutex/cond, frees the envs that are set, and frees the struct.
+ */
+static inline void ctx_request_release(ctx_request_t *req) {
+    if (req == NULL) return;
+
+    int prev = atomic_fetch_sub(&req->refcount, 1);  /* value before the decrement */
+    if (prev == 1) {
+        /* Last reference - free everything */
+        pthread_mutex_destroy(&req->mutex);
+        pthread_cond_destroy(&req->cond);
+        if (req->request_env) {
+            enif_free_env(req->request_env);
+        }
+        if (req->result_env) {
+            enif_free_env(req->result_env);
+        }
+        enif_free(req);
+    }
+}
+
 /**
  * @struct py_cmd_t
  * @brief Command structure for thread-per-context dispatch
@@ -804,8 +948,11 @@ struct py_context {
     /** @brief Context mode: true=subinterpreter, false=worker */
     bool is_subinterp;
 
-    /** @brief Flag indicating context has been destroyed */
-    bool destroyed;
+    /** @brief Flag indicating context has been destroyed (atomic for thread safety) */
+    _Atomic bool destroyed;
+
+    /** @brief Flag: context resources leaked due to unresponsive worker */
+    _Atomic bool leaked;
 
     /** @brief Flag: callback handler is configured */
     bool has_callback_handler;
@@ -816,70 +963,79 @@ struct py_context {
     /** @brief Pipe for callback responses [read, write] */
     int callback_pipe[2];
 
-#ifdef HAVE_SUBINTERPRETERS
-    /* ========== OWN_GIL mode fields ========== */
+    /* ========== Worker thread fields (used by both worker and owngil modes) ========== */
 
-    /** @brief Whether this context uses OWN_GIL mode (dedicated pthread) */
-    bool uses_own_gil;
+    /** @brief Dedicated pthread for this context */
+    pthread_t worker_thread;
 
-    /** @brief Dedicated pthread for OWN_GIL mode */
-    pthread_t own_gil_thread;
+    /** @brief True when worker thread is running */
+    _Atomic bool worker_running;
 
-    /** @brief Thread state for OWN_GIL subinterpreter */
-    PyThreadState *own_gil_tstate;
+    /** @brief True when shutdown has been requested */
+    _Atomic bool shutdown_requested;
 
-    /** @brief Interpreter state for OWN_GIL subinterpreter */
-    PyInterpreterState *own_gil_interp;
+    /** @brief True if this context uses a dedicated worker thread (worker mode) */
+    bool uses_worker_thread;
 
-    /* IPC via condition variables */
+    /** @brief True if thread initialization failed */
+    _Atomic bool init_error;
 
-    /** @brief Mutex for request/response synchronization */
-    pthread_mutex_t request_mutex;
+    /* ========== Request queue (replaces single-slot pattern) ========== */
 
-    /** @brief Condition variable: request ready for processing */
-    pthread_cond_t request_ready;
+    /** @brief Mutex protecting the request queue */
+    pthread_mutex_t queue_mutex;
 
-    /** @brief Condition variable: response ready for caller */
-    pthread_cond_t response_ready;
+    /** @brief Condition variable: work available in queue */
+    pthread_cond_t queue_not_empty;
 
-    /* Request/response state */
+    /** @brief Head of request queue (dequeue from here) */
+    ctx_request_t *queue_head;
 
-    /** @brief Current request type (CTX_REQ_*) */
-    int request_type;
+    /** @brief Tail of request queue (enqueue here) */
+    ctx_request_t *queue_tail;
+
+    /** @brief Environment for sending messages back to Erlang */
+    ErlNifEnv *msg_env;
+
+    /* ========== Legacy compatibility fields (populated from queue request) ========== */
+    /* These fields are populated by the worker thread from the current request
+     * for compatibility with existing execute functions. They will be removed
+     * once all execute functions are refactored to use ctx_request_t directly. */
 
-    /** @brief Shared environment for zero-copy term passing */
+    /** @brief Shared env for current request (points to current req->request_env) */
     ErlNifEnv *shared_env;
 
-    /** @brief Request term (copied into shared_env) */
-    ERL_NIF_TERM request_term;
+    /** @brief Current request type */
+    int request_type;
 
-    /** @brief Additional request data (e.g., modules list for flush) */
-    ERL_NIF_TERM request_data;
+    /** @brief Current request data term */
+    ERL_NIF_TERM request_term;
 
-    /** @brief Response term (created in shared_env) */
+    /** @brief Response term for current request */
     ERL_NIF_TERM response_term;
 
-    /** @brief True if response indicates success */
+    /** @brief Success flag for current request */
     bool response_ok;
 
-    /** @brief Auxiliary pointer for reactor buffer (OWN_GIL dispatch) */
+    /** @brief Reactor buffer pointer for current request */
     void *reactor_buffer_ptr;
 
-    /** @brief Process-local env pointer for OWN_GIL dispatch (py_env_resource_t*) */
+    /** @brief Process-local env pointer for current request */
     void *local_env_ptr;
 
-    /* Lifecycle flags */
+#ifdef HAVE_SUBINTERPRETERS
+    /* ========== OWN_GIL specific fields ========== */
 
-    /** @brief True when worker thread is running */
-    _Atomic bool thread_running;
+    /** @brief Whether this context uses OWN_GIL mode (subinterpreter with own GIL) */
+    bool uses_own_gil;
 
-    /** @brief True if thread initialization failed */
-    _Atomic bool init_error;
+    /** @brief Thread state for OWN_GIL subinterpreter */
+    PyThreadState *own_gil_tstate;
 
-    /** @brief True when shutdown has been requested */
-    _Atomic bool shutdown_requested;
+    /** @brief Interpreter state for OWN_GIL subinterpreter */
+    PyInterpreterState *own_gil_interp;
 #else
-    /** @brief Worker thread state (non-subinterp mode) */
+    /** @brief Worker thread state (non-subinterp mode, kept for compatibility) */
     PyThreadState *thread_state;
 #endif
 
@@ -1005,7 +1161,7 @@ static inline py_context_guard_t py_context_acquire(py_context_t *ctx) {
         .acquired = false
     };
 
-    if (ctx == NULL || ctx->destroyed) {
+    if (ctx == NULL || atomic_load(&ctx->destroyed)) {
         return guard;
     }
 
diff --git a/docs/getting-started.md b/docs/getting-started.md
index 8f1ce8d..3f40fc6 100644
--- a/docs/getting-started.md
+++ b/docs/getting-started.md
@@ -8,7 +8,7 @@ Add to your `rebar.config`:
 
 ```erlang
 {deps, [
-    {erlang_python, "2.3.0"}
+    {erlang_python, "3.0.0"}
 ]}.
 ```
 
diff --git a/docs/migration.md b/docs/migration.md
index b092b8f..5d0b2fb 100644
--- a/docs/migration.md
+++ b/docs/migration.md
@@ -1,6 +1,103 @@
-# Migration Guide: v1.8.x to v2.0+
+# Migration Guide
 
-This guide covers breaking changes and migration steps when upgrading from erlang_python v1.8.x to v2.0 and later.
+This guide covers breaking changes and migration steps when upgrading erlang_python.
+
+## v2.x to v3.0 Migration
+
+### Quick Checklist
+
+- [ ] Update `py:execution_mode/0` usage - now returns `worker | owngil` only
+- [ ] Remove any `py:num_executors/0` calls (function removed)
+- [ ] Update code that checks for `free_threaded` or `multi_executor` modes
+- [ ] Review `context_mode` configuration (now `worker | owngil`)
+
+### Execution Mode Changes
+
+**v2.x:** `py:execution_mode/0` returned internal capabilities:
+```erlang
+py:execution_mode().
+%% => free_threaded | subinterp | multi_executor
+```
+
+**v3.0:** Returns simplified public modes based on configuration:
+```erlang
+py:execution_mode().
+%% => worker | owngil
+```
+
+The mode is determined by the `context_mode` application config:
+```erlang
+%% Default: worker mode
+application:set_env(erlang_python, context_mode, worker).
+
+%% For true parallelism (Python 3.14+)
+application:set_env(erlang_python, context_mode, owngil).
+```
+
+### Removed Functions
+
+**`py:num_executors/0`** - Removed. Contexts now use per-context worker threads.
+
+```erlang
+%% v2.x - check executor count
+N = py:num_executors().
+
+%% v3.0 - not needed, each context has its own worker thread
+```
+
+### Worker Thread Architecture
+
+In v3.0, each context gets a dedicated pthread that handles all Python operations:
+
+- **Stable thread affinity**: All calls to the same context run on the same OS thread
+- **numpy/torch compatibility**: Thread-local state is preserved
+- **No executor pool**: No shared executor threads to manage
+
+```erlang
+%% Create contexts - each gets its own worker thread
+Ctx1 = py:context(1),
+Ctx2 = py:context(2),
+
+%% All calls to Ctx1 run on Ctx1's worker thread
+%% All calls to Ctx2 run on Ctx2's worker thread
+{ok, _} = py:call(Ctx1, math, sqrt, [16]),
+{ok, _} = py:call(Ctx2, math, sqrt, [25]).
+```
+
+### Configuration Changes
+
+**v2.x configuration:**
+```erlang
+{erlang_python, [
+    {num_executors, 8},  %% Removed in v3.0
+    {context_mode, worker}
+]}
+```
+
+**v3.0 configuration:**
+```erlang
+{erlang_python, [
+    {context_mode, worker},  %% worker | owngil
+    {num_contexts, 8}        %% Number of contexts to create
+]}
+```
+
+### Python Version Compatibility
+
+| Python Version | v2.x Mode | v3.0 Mode |
+|---------------|-----------|-----------|
+| 3.9 - 3.11 | `multi_executor` | `worker` |
+| 3.12 - 3.13 | `subinterp` | `worker` (default) or `owngil` |
+| 3.14+ | `subinterp` | `worker` (default) or `owngil` |
+| 3.13t (free-threaded) | `free_threaded` | `worker` |
+
+All Python versions now use the same public mode (`worker` or `owngil`) based on configuration, not Python capabilities.
+
+---
+
+# v1.8.x to v2.0 Migration
+
+This section covers breaking changes when upgrading from erlang_python v1.8.x to v2.0.
 
 ## Quick Checklist
 
@@ -16,17 +113,18 @@ This guide covers breaking changes and migration steps when upgrading from erlan
 
 ## Python Version Compatibility
 
-| Python Version | GIL Mode | Notes |
-|---------------|----------|-------|
-| 3.9 - 3.11 | Shared GIL | Multi-executor mode, `py:execution_mode()` returns `multi_executor` |
-| 3.12 - 3.13 | OWN_GIL subinterpreters | True parallelism, `py:execution_mode()` returns `subinterp` |
-| 3.13t | Free-threaded | No GIL, `py:execution_mode()` returns `free_threaded` |
-| 3.14+ | SHARED_GIL subinterpreters | Subinterpreters with shared GIL for C extension compatibility |
+| Python Version | Support | Notes |
+|---------------|---------|-------|
+| 3.9 - 3.11 | Full | Worker mode with dedicated pthread per context |
+| 3.12 - 3.13 | Full | Worker mode (default) or owngil mode |
+| 3.14+ | Full | Worker mode (default) or owngil mode with true parallelism |
+| 3.13t | Full | Worker mode (free-threaded builds supported) |
 
-**Python 3.14 Support**: Full support for Python 3.14 including:
-- SHARED_GIL subinterpreter mode for C extension compatibility
-- Proper `sys.path` initialization in subinterpreters
-- All asyncio features work correctly
+**Python 3.14+ OWN_GIL Support**: For true parallelism, use owngil mode:
+```erlang
+application:set_env(erlang_python, context_mode, owngil).
+```
+Each context gets a subinterpreter with its own GIL, enabling parallel Python execution.
 
 **FreeBSD Support**: Improved fd handling on FreeBSD/kqueue platforms:
 - Automatic fd duplication in `py_reactor_context` to prevent fd stealing errors
@@ -83,11 +181,10 @@ The most significant change in v2.0 is the new execution model. On Python 3.12+,
 Check which mode is active:
 
 ```erlang
-%% Check execution mode
+%% Check execution mode (v3.0+)
 py:execution_mode().
-%% => subinterp     (Python 3.12+ with OWN_GIL)
-%% => free_threaded (Python 3.13t with --disable-gil)
-%% => multi_executor (Python < 3.12)
+%% => worker  (default, dedicated pthread per context)
+%% => owngil  (dedicated pthread + subinterpreter with own GIL)
 
 %% Check if subinterpreters are supported
 py:subinterp_supported().
diff --git a/docs/scalability.md b/docs/scalability.md
index 2985a20..cefe59f 100644
--- a/docs/scalability.md
+++ b/docs/scalability.md
@@ -4,30 +4,24 @@ This guide covers the scalability features of erlang_python, including execution
 
 ## Execution Modes
 
-erlang_python automatically detects the optimal execution mode based on your Python version:
+erlang_python supports two execution modes:
 
 ```erlang
 %% Check current execution mode
 py:execution_mode().
-%% => free_threaded | worker | owngil | multi_executor
-
-%% Check number of executor threads
-py:num_executors().
-%% => 4 (default)
+%% => worker | owngil
 ```
 
 ### Mode Comparison
 
-| Mode | Python Version | Parallelism | GIL Behavior | Best For |
-|------|----------------|-------------|--------------|----------|
-| **free_threaded** | 3.13+ (nogil build) | True N-way | None | Maximum throughput |
-| **owngil** | 3.14+ | True N-way | Per-interpreter (dedicated thread) | CPU-bound parallel |
-| **worker** | 3.12+ | GIL contention | Shared GIL | Default, compatibility |
-| **multi_executor** | < 3.12 | GIL contention | Shared, round-robin | I/O-bound, legacy |
+| Mode | Description | Parallelism | GIL Behavior | Best For |
+|------|-------------|-------------|--------------|----------|
+| **worker** | Dedicated pthread per context | GIL contention | Shared GIL | Default, maximum compatibility |
+| **owngil** | Dedicated pthread + subinterpreter | True N-way | Per-interpreter GIL | CPU-bound parallel (Python 3.14+) |
 
-### Free-Threaded Mode (Python 3.13+)
+### Worker Mode (Default)
 
-When running on a free-threaded Python build (compiled with `--disable-gil`), erlang_python executes Python calls directly without any executor routing. This provides maximum parallelism for CPU-bound workloads.
+Each context gets a dedicated pthread that handles all Python operations. This provides stable thread affinity, which is critical for libraries like numpy, torch, and tensorflow that maintain thread-local state.
 
 ### OWN_GIL Mode (Python 3.12+)
 
@@ -67,18 +61,6 @@ ok = py_nif:context_exec(CtxRef, <<"x = 42">>, Env),
 
 **See also:** [OWN_GIL Internals](owngil_internals.md) for architecture details.
 
-### Sub-interpreter Mode (Python 3.12+)
-
-Uses Python's sub-interpreter feature with a shared GIL pool. Multiple contexts share the GIL but have isolated namespaces. Best for high call frequency with low latency.
-
-**Architecture:**
-- Pool of pre-created subinterpreters with shared GIL
-- Execution on dirty schedulers with `PyThreadState_Swap`
-- Lower latency (~2.5μs) but no true parallelism
-- Best throughput for short operations
-
-**Note:** Each sub-interpreter has isolated state. Use the [Shared State](#shared-state) API to share data between workers.
-
 **Explicit Context Selection:**
 ```erlang
 %% Get a specific context by index (1-based)
@@ -89,58 +71,29 @@ Ctx = py:context(1),
 {ok, Result} = py:call(math, sqrt, [16]).
 ```
 
-### Multi-Executor Mode (Python < 3.12)
-
-Runs N executor threads that share the GIL. Requests are distributed round-robin across executors. Good for I/O-bound workloads where Python releases the GIL during I/O operations.
-
-**Thread Affinity:** In MULTI_EXECUTOR mode, both workers and contexts are assigned
-a fixed executor thread. This ensures libraries with thread-local state (numpy, torch,
-tensorflow) always run on the same OS thread, preventing segfaults and state corruption.
-
 ## Choosing the Right Mode
 
-### Mode Comparison
-
-| Aspect | Free-Threaded | OWN_GIL | Worker | Multi-Executor |
-|--------|---------------|---------|--------|----------------|
-| **Parallelism** | True N-way | True N-way | GIL contention | GIL contention |
-| **State Isolation** | Shared | Isolated | Shared | Shared |
-| **Memory Overhead** | Low | Higher (per-interp) | Low | Low |
-| **Module Compatibility** | Limited | Most modules | All modules | All modules |
-| **Python Version** | 3.13+ (nogil) | 3.14+ | 3.12+ | < 3.12 |
-
 ### When to Use Each Mode
 
-**Use Free-Threaded (Python 3.13t) when:**
-- You need maximum parallelism with shared state
-- Your libraries are GIL-free compatible
-- You're running CPU-bound workloads
-- Memory efficiency is important
+**Use Worker Mode (default) when:**
+- You need maximum module compatibility
+- Running libraries like numpy, torch, tensorflow
+- High call frequency with low latency
+- Shared state between contexts is needed
 
-**Use OWN_GIL (Python 3.14+) when:**
+**Use OWN_GIL Mode when:**
 - You need true CPU parallelism across Python contexts
 - Running long computations (ML inference, data processing)
 - Workload benefits from multiple independent Python interpreters
 - You can tolerate higher per-call latency for better throughput
 
-**Use Worker (Python 3.12+, default) when:**
-- You need high call frequency with low latency
-- Maximum module compatibility is required
-- Shared state between contexts is needed
-- Running libraries that don't support subinterpreters (torch, etc.)
-
-**Use Multi-Executor (Python < 3.12) when:**
-- Running on older Python versions
-- Your workload is I/O-bound (GIL released during I/O)
-- Thread affinity for numpy/torch is needed
-
 ### Pros and Cons
 
 **Worker Mode Pros:**
 - Maximum module compatibility (all C extensions work)
+- Stable thread affinity for numpy/torch/tensorflow
 - Low memory overhead (single interpreter)
 - Shared state between contexts
-- Default mode for Python 3.12+
 
 **Worker Mode Cons:**
 - GIL contention limits parallelism
@@ -156,17 +109,6 @@ tensorflow) always run on the same OS thread, preventing segfaults and state cor
 - Some C extensions don't support subinterpreters
 - Requires Python 3.14+
 
-**Free-Threaded Mode Pros:**
-- True parallelism with shared state
-- Lower memory overhead than OWN_GIL
-- Simplest mental model (like regular threading)
-
-**Free-Threaded Mode Cons:**
-- Requires Python 3.13+ built with `--disable-gil`
-- Many C extensions not yet compatible
-- Shared state requires careful synchronization
-- Still experimental
-
 ## Subinterpreter Architecture
 
 ### Design Overview
@@ -308,14 +250,13 @@ This allows your application to implement backpressure or shed load gracefully.
         %% Default: erlang:system_info(schedulers) * 2 + 1
         {max_concurrent, 50},
 
-        %% Number of executor threads (multi_executor mode only)
-        %% Default: 4
-        {num_executors, 8},
+        %% Context mode: worker | owngil
+        %% Default: worker
+        {context_mode, worker},
 
-        %% Worker pool sizes
-        {num_workers, 4},
-        {num_async_workers, 2},
-        {num_subinterp_workers, 4}
+        %% Number of contexts
+        %% Default: erlang:system_info(schedulers)
+        {num_contexts, 8}
     ]}
 ].
 ```
@@ -460,9 +401,9 @@ free_threaded
 
 ### For I/O-Bound Workloads
 
-- Multi-executor mode works well (GIL released during I/O)
-- Increase `num_executors` to handle more concurrent I/O
+- Worker mode works well (GIL released during I/O)
 - Use asyncio integration for async I/O
+- Increase `num_contexts` for more concurrent I/O capacity
 
 ### For Mixed Workloads
 
@@ -481,8 +422,7 @@ io:format("Python load: ~.1f%~n", [Utilization]).
 
 %% Execution mode info
 Mode = py:execution_mode(),
-Executors = py:num_executors(),
-io:format("Mode: ~p, Executors: ~p~n", [Mode, Executors]).
+io:format("Mode: ~p~n", [Mode]).
 
 %% Memory stats
 {ok, Stats} = py:memory_stats(),
diff --git a/src/erlang_python.app.src b/src/erlang_python.app.src
index 9378a24..ae6e135 100644
--- a/src/erlang_python.app.src
+++ b/src/erlang_python.app.src
@@ -1,6 +1,6 @@
 {application, erlang_python, [
     {description, "Execute Python applications from Erlang using dirty NIFs"},
-    {vsn, "2.3.1"},
+    {vsn, "3.0.0"},
     {registered, [py_pool]},
     {mod, {erlang_python_app, []}},
     {applications, [
diff --git a/src/py.erl b/src/py.erl
index 387cecf..8f719a8 100644
--- a/src/py.erl
+++ b/src/py.erl
@@ -107,7 +107,6 @@
     venv_info/0,
     %% Execution info
     execution_mode/0,
-    num_executors/0,
     %% Shared state (accessible from Python workers)
     state_fetch/1,
     state_store/2,
@@ -1257,30 +1256,22 @@ ensure_binary(S) ->
 
 %% @doc Get the current execution mode.
 %% Returns one of:
-%% - `free_threaded': Python 3.13+ with no GIL (Py_GIL_DISABLED)
-%% - `worker': Contexts use main interpreter namespaces (default)
-%% - `owngil': Contexts use dedicated threads with own GIL (Python 3.14+)
-%% - `multi_executor': Traditional Python with N executor threads (Python < 3.12)
--spec execution_mode() -> free_threaded | worker | owngil | multi_executor.
+%% - `worker': Contexts use dedicated pthread per context (default).
+%%   Provides stable thread affinity for numpy/torch/tensorflow compatibility.
+%% - `owngil': Contexts use dedicated pthread + subinterpreter with own GIL.
+%%   Enables true parallelism (requires Python 3.14+).
+%%
+%% The mode is determined by the `context_mode' application config:
+%% ```
+%% application:set_env(erlang_python, context_mode, owngil).
+%% '''
+-spec execution_mode() -> worker | owngil.
 execution_mode() ->
-    case py_nif:execution_mode() of
-        free_threaded -> free_threaded;
-        multi_executor -> multi_executor;
-        subinterp ->
-            %% Check actual context_mode config
-            case application:get_env(erlang_python, context_mode, worker) of
-                owngil -> owngil;
-                _ -> worker
-            end
+    case application:get_env(erlang_python, context_mode, worker) of
+        owngil -> owngil;
+        _ -> worker
     end.
 
-%% @doc Get the number of executor threads.
-%% For `multi_executor' mode, this is the number of executor threads.
-%% For other modes, returns 1.
--spec num_executors() -> pos_integer().
-num_executors() ->
-    py_nif:num_executors().
-
 %%% ============================================================================
 %%% Shared State
 %%% ============================================================================
diff --git a/src/py_context.erl b/src/py_context.erl
index 9c342b6..5c5dbad 100644
--- a/src/py_context.erl
+++ b/src/py_context.erl
@@ -563,11 +563,12 @@ loop(#state{ref = Ref, interp_id = InterpId} = State) ->
             loop(State);
 
         {exec, From, MRef, Code} ->
-            Result = py_nif:context_exec(Ref, Code),
+            Result = handle_exec_with_async(Ref, Code),
             From ! {MRef, Result},
             loop(State);
 
         %% Exec with process-local environment (worker mode)
+        %% Note: Uses blocking dispatch since async+env isn't implemented yet.
         {exec, From, MRef, Code, EnvRef} ->
             Result = py_nif:context_exec(Ref, Code, EnvRef),
             From ! {MRef, Result},
@@ -723,7 +724,23 @@ handle_blocking_callback(Ref, FuncName, Args) ->
 
 %% @private
 %% Handle call with potential suspension for callbacks
+%% Uses async dispatch to avoid blocking dirty schedulers when possible.
 handle_call_with_suspension(Ref, Module, Func, Args, Kwargs) ->
+    RequestId = make_ref(),
+    case py_nif:context_call_async(Ref, self(), RequestId, Module, Func, Args, Kwargs) of
+        {enqueued, RequestId} ->
+            %% Async dispatch succeeded - wait for result message
+            wait_for_async_result(Ref, RequestId);
+        {error, async_requires_worker_thread} ->
+            %% Fall back to blocking call for non-worker-thread contexts
+            handle_call_blocking(Ref, Module, Func, Args, Kwargs);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+%% @private
+%% Blocking call handler (used when async is not available)
+handle_call_blocking(Ref, Module, Func, Args, Kwargs) ->
     case py_nif:context_call(Ref, Module, Func, Args, Kwargs) of
         {suspended, _CallbackId, StateRef, {FuncName, CallbackArgs}} ->
             %% Callback needed - handle it with recursive receive
@@ -740,7 +757,36 @@ handle_call_with_suspension(Ref, Module, Func, Args, Kwargs) ->
 
 %% @private
 %% Handle eval with potential suspension for callbacks
+%% Uses async dispatch to avoid blocking dirty schedulers when possible.
 handle_eval_with_suspension(Ref, Code, Locals) ->
+    RequestId = make_ref(),
+    case py_nif:context_eval_async(Ref, self(), RequestId, Code, Locals) of
+        {enqueued, RequestId} ->
+            %% Async dispatch succeeded - wait for result message
+            wait_for_async_result(Ref, RequestId);
+        {error, async_requires_worker_thread} ->
+            %% Fall back to blocking call for non-worker-thread contexts
+            handle_eval_blocking(Ref, Code, Locals);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+%% @private
+%% Handle exec with async dispatch
+handle_exec_with_async(Ref, Code) ->
+    RequestId = make_ref(),
+    case py_nif:context_exec_async(Ref, self(), RequestId, Code) of
+        {enqueued, RequestId} ->
+            wait_for_async_result(Ref, RequestId);
+        {error, async_requires_worker_thread} ->
+            py_nif:context_exec(Ref, Code);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+%% @private
+%% Blocking eval handler (used when async is not available)
+handle_eval_blocking(Ref, Code, Locals) ->
     case py_nif:context_eval(Ref, Code, Locals) of
         {suspended, _CallbackId, StateRef, {FuncName, CallbackArgs}} ->
             %% Callback needed - handle it with recursive receive
@@ -755,8 +801,31 @@ handle_eval_with_suspension(Ref, Code, Locals) ->
             Result
     end.
 
+%% @private
+%% Wait for async result from worker thread
+%% The worker thread sends {py_result, RequestId, Result} when done.
+wait_for_async_result(Ref, RequestId) ->
+    receive
+        {py_result, RequestId, Result} ->
+            process_async_result(Ref, Result)
+    after 300000 ->  %% 5 minute timeout
+        {error, async_timeout}
+    end.
+
+%% @private
+%% Process the result from async dispatch
+%% Handles suspension, schedule markers, and normal results.
+process_async_result(Ref, {suspended, _CallbackId, StateRef, {FuncName, CallbackArgs}}) ->
+    CallbackResult = handle_callback_with_nested_receive(Ref, FuncName, CallbackArgs),
+    resume_and_continue(Ref, StateRef, CallbackResult);
+process_async_result(Ref, {schedule, CallbackName, CallbackArgs}) ->
+    handle_schedule(Ref, CallbackName, CallbackArgs);
+process_async_result(_Ref, Result) ->
+    Result.
+
 %% @private
 %% Handle call with process-local environment
+%% Note: Uses blocking dispatch since async+env isn't implemented yet.
 handle_call_with_suspension_and_env(Ref, Module, Func, Args, Kwargs, EnvRef) ->
     case py_nif:context_call(Ref, Module, Func, Args, Kwargs, EnvRef) of
         {suspended, _CallbackId, StateRef, {FuncName, CallbackArgs}} ->
@@ -770,6 +839,7 @@ handle_call_with_suspension_and_env(Ref, Module, Func, Args, Kwargs, EnvRef) ->
 
 %% @private
 %% Handle eval with process-local environment
+%% Note: Uses blocking dispatch since async+env isn't implemented yet.
 handle_eval_with_suspension_and_env(Ref, Code, Locals, EnvRef) ->
     case py_nif:context_eval(Ref, Code, Locals, EnvRef) of
         {suspended, _CallbackId, StateRef, {FuncName, CallbackArgs}} ->
diff --git a/src/py_nif.erl b/src/py_nif.erl
index 2625ea2..42084b0 100644
--- a/src/py_nif.erl
+++ b/src/py_nif.erl
@@ -82,7 +82,6 @@
     owngil_apply_paths/3,
     %% Execution mode info
     execution_mode/0,
-    num_executors/0,
     %% Thread worker support (ThreadPoolExecutor)
     thread_worker_set_coordinator/1,
     thread_worker_write/2,
@@ -176,6 +175,10 @@
     context_eval/4,
     context_exec/2,
     context_exec/3,
+    %% Async dispatch (non-blocking)
+    context_call_async/7,
+    context_eval_async/5,
+    context_exec_async/4,
     context_call_method/4,
     create_local_env/1,
     context_to_term/1,
@@ -653,22 +656,19 @@ owngil_apply_paths(_WorkerId, _HandleId, _Paths) ->
 %%% Execution Mode Info
 %%% ============================================================================
 
-%% @doc Get the current execution mode.
-%% Returns one of: free_threaded | subinterp | multi_executor
+%% @doc Get Python capability (internal use).
+%% Returns the detected Python runtime capability:
 %% - free_threaded: Python 3.13+ with no GIL (Py_GIL_DISABLED)
-%% - subinterp: Python 3.12+ with per-interpreter GIL
-%% - multi_executor: Traditional Python with N executor threads
+%% - subinterp: Python 3.12+ with per-interpreter GIL support
+%% - multi_executor: Traditional Python with executor threads
+%%
+%% For public execution mode, use py:execution_mode/0 which returns
+%% `worker | owngil' based on the application configuration.
+%% @private
 -spec execution_mode() -> free_threaded | subinterp | multi_executor.
 execution_mode() ->
     ?NIF_STUB.
 
-%% @doc Get the number of executor threads.
-%% For multi_executor mode, this is the number of executor threads.
-%% For other modes, returns 1.
--spec num_executors() -> pos_integer().
-num_executors() ->
-    ?NIF_STUB.
-
 %%% ============================================================================
 %%% Thread Worker Support (ThreadPoolExecutor)
 %%% ============================================================================
@@ -1337,6 +1337,58 @@ context_exec(_ContextRef, _Code) ->
 context_exec(_ContextRef, _Code, _EnvRef) ->
     ?NIF_STUB.
 
+%% @doc Async call - enqueue and return immediately.
+%%
+%% Dispatches a Python function call to the worker thread and returns
+%% immediately with {enqueued, RequestId}. The worker thread will send
+%% {py_result, RequestId, Result} to CallerPid when done.
+%%
+%% @param ContextRef Context reference
+%% @param CallerPid PID to send result to
+%% @param RequestId Request ID for correlation
+%% @param Module Python module name
+%% @param Func Function name
+%% @param Args List of arguments
+%% @param Kwargs Keyword arguments map
+%% @returns {enqueued, RequestId} | {error, Reason}
+-spec context_call_async(reference(), pid(), term(), binary(), binary(), list(), map()) ->
+    {enqueued, term()} | {error, term()}.
+context_call_async(_ContextRef, _CallerPid, _RequestId, _Module, _Func, _Args, _Kwargs) ->
+    ?NIF_STUB.
+
+%% @doc Async eval - enqueue and return immediately.
+%%
+%% Dispatches a Python eval to the worker thread and returns immediately
+%% with {enqueued, RequestId}. The worker thread will send
+%% {py_result, RequestId, Result} to CallerPid when done.
+%%
+%% @param ContextRef Context reference
+%% @param CallerPid PID to send result to
+%% @param RequestId Request ID for correlation
+%% @param Code Python expression to evaluate
+%% @param Locals Local variables map
+%% @returns {enqueued, RequestId} | {error, Reason}
+-spec context_eval_async(reference(), pid(), term(), binary(), map()) ->
+    {enqueued, term()} | {error, term()}.
+context_eval_async(_ContextRef, _CallerPid, _RequestId, _Code, _Locals) ->
+    ?NIF_STUB.
+
+%% @doc Async exec - enqueue and return immediately.
+%%
+%% Dispatches Python code execution to the worker thread and returns
+%% immediately with {enqueued, RequestId}. The worker thread will send
+%% {py_result, RequestId, Result} to CallerPid when done.
+%%
+%% @param ContextRef Context reference
+%% @param CallerPid PID to send result to
+%% @param RequestId Request ID for correlation
+%% @param Code Python code to execute
+%% @returns {enqueued, RequestId} | {error, Reason}
+-spec context_exec_async(reference(), pid(), term(), binary()) ->
+    {enqueued, term()} | {error, term()}.
+context_exec_async(_ContextRef, _CallerPid, _RequestId, _Code) ->
+    ?NIF_STUB.
+
 %% @doc Call a method on a Python object in a context.
 %%
 %% NO MUTEX - caller must ensure exclusive access (process ownership).
diff --git a/test/py_SUITE.erl b/test/py_SUITE.erl
index dc41700..d3e4f5d 100644
--- a/test/py_SUITE.erl
+++ b/test/py_SUITE.erl
@@ -41,7 +41,6 @@
     test_venv_pth/1,
     %% New scalability tests
     test_execution_mode/1,
-    test_num_executors/1,
     test_semaphore_basic/1,
     test_semaphore_acquire_release/1,
     test_semaphore_concurrent/1,
@@ -101,7 +100,6 @@ all() ->
         test_venv_pth,
         %% Scalability tests
         test_execution_mode,
-        test_num_executors,
         test_semaphore_basic,
         test_semaphore_acquire_release,
         test_semaphore_concurrent,
@@ -733,15 +731,7 @@ test_execution_mode(_Config) ->
     %% Test that execution_mode returns a valid mode
     Mode = py:execution_mode(),
     ct:pal("Execution mode: ~p~n", [Mode]),
-    true = lists:member(Mode, [free_threaded, subinterp, multi_executor]),
-    ok.
-
-test_num_executors(_Config) ->
-    %% Test that num_executors returns a positive integer
-    Num = py:num_executors(),
-    ct:pal("Number of executors: ~p~n", [Num]),
-    true = is_integer(Num),
-    true = Num > 0,
+    true = lists:member(Mode, [worker, owngil]),
     ok.
 
 test_semaphore_basic(_Config) ->

From 39247b59c4feec3cc94fdf347124a00e88830e1c Mon Sep 17 00:00:00 2001
From: Benoit Chesneau <bchesneau@gmail.com>
Date: Fri, 1 May 2026 15:21:12 +0200
Subject: [PATCH 02/17] Finish v3.0 simplification: async API fix, dead code,
 doc/example sync

Async API:
- Reimplement py:async_gather/1 over py_event_loop (concurrent submit,
  sequential await); add async_gather/2 with explicit timeout.
- Remove py:async_stream/3,4 from public API (was always returning
  {error, stream_not_implemented}).
- Fix py:async_call/3,4 + async_await/1,2 round-trip: use
  py_event_loop:create_task and py_event_loop:await directly so the
  receive matches the {async_result, _, _} message the NIF sends.
- Delete the legacy py_async_pool gen_server (the async_call call site
  now goes through py_event_loop directly).

C-side dead code:
- Remove multi_executor_* functions, g_executors[], multi_executor_thread_main,
  multi_executor_enqueue, MAX/MIN_EXECUTORS macros and the executor_t struct.
- Remove the PY_MODE_MULTI_EXECUTOR case in executor_enqueue and the unused
  tl_current_req_* thread-locals from the legacy compatibility layer.
- Remove unused shared_dict_down (resource type was registered without
  process monitoring).
- Remove context_dispatch_call/eval/exec (declared but never called).
- Drop executor_id field from py_worker_t and py_context_t.

Internal mode enum collapse:
- py_execution_mode_t reduced to {PY_MODE_FREE_THREADED, PY_MODE_GIL}.
  Both retired values (PY_MODE_SUBINTERP, PY_MODE_MULTI_EXECUTOR) had
  identical behavior in every switch.
- Replace runtime numpy-cache check with precise compile-time gate
  (#if defined(HAVE_SUBINTERPRETERS) && !defined(HAVE_FREE_THREADED));
  preserves today's truth table on every supported build.
- nif_execution_mode now returns free_threaded | gil; update spec on
  py_nif:execution_mode/0.

Configuration cleanup:
- Remove num_executors / num_async_workers env keys (no-ops post-rework).
- Reset .app.src env to [] (the prior keys were never read).

Doc + example sync:
- README.md: drop SHARED_GIL bullet; py:execution_mode/0 returns
  worker | owngil; configuration block uses num_contexts / context_mode
  / max_concurrent.
- docs/getting-started.md, scalability.md, migration.md, owngil_internals.md,
  preload.md, process-bound-envs.md, reactor.md, testing-free-threading.md:
  drop subinterp as a context mode; correct py_event_loop:run signature.
- src/py.erl, src/py_context.erl, src/py_context_router.erl,
  src/py_context_sup.erl, src/py_reactor_context.erl: drop subinterp
  from @doc mode lists.
- c_src/py_nif.c nif_context_create header doc: same scrub.
- examples/bench_owngil.erl: rename labels (worker baseline vs OWN_GIL),
  gate on py_nif:owngil_supported (3.14+).
- examples/bench_reactor_modes.erl: convert subinterp arm to OWN_GIL
  (run_reactor_owngil_bench, Reactor/OG legend).
- examples/reactor_subinterp_example.erl: deleted (reactor_owngil_example
  covers OWN_GIL; reactor_echo covers worker).
- examples/benchmark.erl: replace py:num_executors() with
  py_context_router:num_contexts().

Tests:
- New test/py_thread_affinity_SUITE.erl asserts:
  exec/eval/call on one context share threading.get_native_id;
  N processes targeting one context converge on its worker thread;
  distinct contexts use distinct threads;
  same invariants hold under owngil mode.
  Replaces 8 untracked diagnostic escripts (deleted).
- test/py_SUITE.erl: tighten test_asyncio_call and test_asyncio_gather
  to assert real success values instead of swallowing failures.

CHANGELOG entries describe each breaking change and added behavior.

30 files changed, +244/-1199 LOC, plus the new affinity suite.
---
 CHANGELOG.md                           |  23 ++
 README.md                              |  42 +--
 c_src/py_convert.c                     |  16 +-
 c_src/py_exec.c                        | 399 ++-----------------------
 c_src/py_nif.c                         |  81 ++---
 c_src/py_nif.h                         | 176 +----------
 c_src/py_shared_dict.c                 |  27 --
 docs/getting-started.md                |  23 +-
 docs/migration.md                      |  19 +-
 docs/owngil_internals.md               |  27 +-
 docs/preload.md                        |   2 +-
 docs/process-bound-envs.md             |   6 +-
 docs/reactor.md                        |  10 +-
 docs/scalability.md                    |  21 +-
 docs/testing-free-threading.md         |  17 +-
 examples/README.md                     |  10 +-
 examples/bench_owngil.erl              |  20 +-
 examples/bench_reactor_modes.erl       |  44 +--
 examples/benchmark.erl                 |   2 +-
 examples/reactor_subinterp_example.erl | 158 ----------
 src/erlang_python.app.src              |   6 +-
 src/erlang_python_sup.erl              |  21 +-
 src/py.erl                             |  78 ++---
 src/py_async_pool.erl                  | 167 -----------
 src/py_context.erl                     |   2 +-
 src/py_context_router.erl              |   4 +-
 src/py_context_sup.erl                 |   2 +-
 src/py_nif.erl                         |   5 +-
 src/py_reactor_context.erl             |   4 +-
 test/py_SUITE.erl                      |  31 +-
 test/py_thread_affinity_SUITE.erl      | 160 ++++++++++
 31 files changed, 404 insertions(+), 1199 deletions(-)
 delete mode 100644 examples/reactor_subinterp_example.erl
 delete mode 100644 src/py_async_pool.erl
 create mode 100644 test/py_thread_affinity_SUITE.erl

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 02beb7c..5092c9d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,28 @@
   application configuration. Previously returned internal capabilities like
   `free_threaded`, `subinterp`, or `multi_executor`.
 
+- **Removed `py:async_stream/3,4`** - Streaming from async generators was never
+  implemented behind the API; the functions always returned `{error, stream_not_implemented}`.
+  Use `py:stream_start/3,4` for sync generators; async-generator support may
+  return in a later release.
+
+- **Removed `num_executors` / `num_async_workers` configuration** - Both keys
+  were no-ops after the v3.0 worker rework. Configure context count via
+  `num_contexts` and the rate-limit ceiling via `max_concurrent`.
+
+### Fixed
+
+- **`py:async_call/3,4` + `py:async_await/1,2` round-trip** - Previously the
+  await receive matched `{py_response, _, _}` while the event loop sent
+  `{async_result, _, _}`, causing every async call to silently time out.
+  Async calls now go directly through `py_event_loop:create_task` and
+  `py_event_loop:await`.
+
+- **`py:async_gather/1,2` actually executes** - Reimplemented as concurrent
+  `async_call` submission with sequential `async_await`. Returns
+  `{ok, [Result1, ...]}` on success or `{error, {gather_failed, [{Idx, Reason}, ...]}}`
+  if any call fails. The previous implementation returned `gather_not_implemented`.
+
 ### Changed
 
 - **Per-context worker threads** - Each context now gets its own dedicated pthread
@@ -34,6 +56,7 @@
 - Multi-executor pool (`g_executors[]`, `multi_executor_start/stop`)
 - `context_dispatch_call/eval/exec` functions (dead code)
 - References to `PY_MODE_MULTI_EXECUTOR` in context operations
+- `py_async_pool` legacy gen_server (unused after async API rewire)
 
 ## 2.3.1 (2026-04-01)
 
diff --git a/README.md b/README.md
index 3c6d60a..e5744bc 100644
--- a/README.md
+++ b/README.md
@@ -16,10 +16,9 @@ evaluate expressions, and stream from generators - all without blocking Erlang
 schedulers.
 
 **Parallelism options:**
-- **Worker mode** (default, recommended) - Works with any Python version. With free-threaded Python (3.13t+), provides true parallelism automatically
-- **SHARED_GIL sub-interpreters** (Python 3.12+) - Isolated namespaces, shared GIL (isolation improves in 3.14+)
-- **OWN_GIL sub-interpreters** (Python 3.14+) - Each interpreter has its own GIL, true parallelism
-- **BEAM processes** - Fan out work across lightweight Erlang processes
+- **Worker mode** (default, recommended) - Works with any Python version. With free-threaded Python (3.13t+), provides true parallelism automatically.
+- **OWN_GIL sub-interpreters** (Python 3.14+) - Each interpreter has its own GIL, true parallelism.
+- **BEAM processes** - Fan out work across lightweight Erlang processes.
 
 Key features:
 - **Process-bound environments** - Each Erlang process gets isolated Python state, enabling OTP-supervised Python actors
@@ -302,14 +301,11 @@ Ref = py:async_call(aiohttp, get, [<<"https://api.example.com/data">>]),
 {ok, Response} = py:async_await(Ref).
 
 %% Gather multiple async calls concurrently
-{ok, Results} = py:async_gather([
+{ok, [Users, Posts, Comments]} = py:async_gather([
     {aiohttp, get, [<<"https://api.example.com/users">>]},
     {aiohttp, get, [<<"https://api.example.com/posts">>]},
     {aiohttp, get, [<<"https://api.example.com/comments">>]}
 ]).
-
-%% Stream from async generators
-{ok, Chunks} = py:async_stream(mymodule, async_generator, [args]).
 ```
 
 ## Parallel Execution with Sub-interpreters
@@ -328,7 +324,7 @@ True parallelism without GIL contention using Python 3.14+ OWN_GIL sub-interpret
 %% Each call runs in its own interpreter with its own GIL
 ```
 
-For Python 3.12/3.13, use SHARED_GIL sub-interpreters (`mode => subinterp`) for namespace isolation, but note that parallelism is limited by the shared GIL.
+On Python 3.12/3.13 only `worker` mode (the default) is available, because `owngil` requires Python 3.14+. On those versions all contexts run under the shared main interpreter via dedicated worker threads — namespace isolation between contexts is local-dict based, not via subinterpreters.
 
 ## Parallel Processing with BEAM Processes
 
@@ -590,9 +586,9 @@ ok = py:clear_traces().
 %% sys.config
 [
   {erlang_python, [
-    {num_workers, 4},           %% Python worker pool size
-    {max_concurrent, 17},       %% Max concurrent operations (default: schedulers * 2 + 1)
-    {num_executors, 4}          %% Executor threads (multi-executor mode)
+    {num_contexts, 8},          %% Number of contexts (default: schedulers)
+    {context_mode, worker},     %% worker | owngil
+    {max_concurrent, 17}        %% Max concurrent operations (default: schedulers * 2 + 1)
   ]}
 ].
 ```
@@ -605,40 +601,34 @@ When creating Python contexts, you can choose the execution mode:
 
 | Mode | Python Version | Description |
 |------|----------------|-------------|
-| `worker` | Any | Main interpreter, shared namespace (default, recommended) |
-| `subinterp` | 3.12+ | SHARED_GIL sub-interpreter, isolated namespace |
-| `owngil` | 3.14+ | OWN_GIL sub-interpreter, true parallelism |
+| `worker` | Any | Dedicated pthread per context, main interpreter namespace (default) |
+| `owngil` | 3.14+ | Dedicated pthread + subinterpreter with its own GIL, true parallelism |
 
 ```erlang
 %% Default: worker mode (recommended)
 %% With free-threaded Python (3.13t+), provides true parallelism automatically
 {ok, Ctx} = py_context:new(#{}).
 
-%% Explicit subinterpreter with shared GIL (Python 3.12+)
-%% Provides namespace isolation but no parallelism
-{ok, Ctx} = py_context:new(#{mode => subinterp}).
-
 %% OWN_GIL mode for true parallelism (Python 3.14+ required)
 %% Each context runs in its own pthread with independent GIL
 {ok, Ctx} = py_context:new(#{mode => owngil}).
 ```
 
-**Worker mode is recommended** because it works with any Python version and automatically benefits from free-threaded Python (3.13t+) when available.
+**Worker mode is recommended** because it works with any Python version and automatically benefits from free-threaded Python (3.13t+) when available. Each context owns a dedicated pthread, providing stable thread affinity for libraries with thread-local state (numpy, torch, tensorflow).
 
-**Why OWN_GIL requires Python 3.14+**: Some C extensions (e.g., `_decimal`, `numpy`) have global state bugs in sub-interpreters on Python 3.12/3.13. These are fixed in Python 3.14. SHARED_GIL mode works on 3.12+ but with caveats for C extensions with global state.
+**Why OWN_GIL requires Python 3.14+**: Some C extensions (e.g., `_decimal`, `numpy`) have global state bugs in sub-interpreters on Python 3.12/3.13. These are fixed in Python 3.14.
 
 ### Runtime Detection
 
-Check the current execution mode:
+Check the current execution mode (mirrors the `context_mode` application env):
 ```erlang
-py:execution_mode().  %% => free_threaded | subinterp | multi_executor
+py:execution_mode().  %% => worker | owngil
 ```
 
 | Mode | Python Version | Parallelism |
 |------|----------------|-------------|
-| Free-threaded | 3.13+ (nogil) | True parallel, no GIL |
-| Sub-interpreter | 3.12+ | Per-interpreter GIL |
-| Multi-executor | Any | GIL contention |
+| `worker` (default) | Any | One pthread per context; true parallelism on free-threaded 3.13t+ |
+| `owngil` | 3.14+ | Per-interpreter GIL, true parallelism across contexts |
 
 ## Error Handling
 
diff --git a/c_src/py_convert.c b/c_src/py_convert.c
index 56a4f06..4961aa4 100644
--- a/c_src/py_convert.c
+++ b/c_src/py_convert.c
@@ -95,13 +95,19 @@ static void shared_dict_capsule_destructor(PyObject *capsule) {
  * @return true if obj is a numpy ndarray, false otherwise
  */
 static inline bool is_numpy_ndarray(PyObject *obj) {
-    /* Use cached type for fast isinstance check when available.
-     * The cache is only valid in the main interpreter - subinterpreters
-     * have their own object space, so we fall back to attribute detection. */
-    if (g_numpy_ndarray_type != NULL && g_execution_mode != PY_MODE_SUBINTERP) {
+    /* The cache is populated in the main interpreter. On builds where
+     * subinterpreters can be created (and the runtime isn't free-threaded,
+     * which short-circuits subinterp use) a context may be running inside
+     * a subinterpreter where the cached type is invalid, so the cached
+     * fast path is compiled out and duck typing is always used there. */
+#if defined(HAVE_SUBINTERPRETERS) && !defined(HAVE_FREE_THREADED)
+    /* Build supports subinterpreters and isn't free-threaded:
+     * skip the cached fast path. */
+#else
+    if (g_numpy_ndarray_type != NULL) {
         return PyObject_IsInstance(obj, g_numpy_ndarray_type) == 1;
     }
-
+#endif
     /* Fallback: duck typing via attribute detection.
      * Check for both 'tolist' method and 'ndim' attribute. */
     return PyObject_HasAttrString(obj, "tolist") &&
diff --git a/c_src/py_exec.c b/c_src/py_exec.c
index d57d08c..cd8a32b 100644
--- a/c_src/py_exec.c
+++ b/c_src/py_exec.c
@@ -24,7 +24,8 @@
  * This module implements the core Python execution engine, handling:
  *
  * - **Timeout support**: Trace-based execution timeout monitoring
- * - **Executor threads**: Single and multi-executor pool management
+ * - **Single coordinator executor thread**: serializes legacy worker API
+ *   calls and coordinator tasks behind one GIL-holding thread.
  * - **Request processing**: Dispatch for call/eval/exec/import operations
  * - **Free-threaded mode**: Support for Python 3.13+ no-GIL builds
  *
@@ -43,6 +44,10 @@
  *       │                          completed                       result
  * ```
  *
+ * Per-context worker threads (see py_nif.c) handle the public worker / owngil
+ * APIs directly; the single executor here only backs the legacy worker pool
+ * and a few coordinator paths.
+ *
  * @par GIL Management Patterns
  *
  * Following PyO3/Granian best practices:
@@ -51,14 +56,6 @@
  * - **Py_END_ALLOW_THREADS**: Re-acquire GIL before Python calls
  * - **PyGILState_Ensure/Release**: For callbacks from non-Python threads
  *
- * @par Execution Modes
- *
- * | Mode | Description | GIL Handling |
- * |------|-------------|--------------|
- * | FREE_THREADED | Python 3.13+ no-GIL | Direct execution |
- * | SUBINTERP | Python 3.12+ | Per-interpreter GIL |
- * | MULTI_EXECUTOR | Traditional | N executor threads |
- *
  * @par Thread Safety
  *
  * - Executor queues protected by pthread mutexes
@@ -155,19 +152,10 @@ static bool check_timeout_error(void) {
 
 static void detect_execution_mode(void) {
 #ifdef HAVE_FREE_THREADED
-    /* Python 3.13+ with free-threading enabled */
     g_execution_mode = PY_MODE_FREE_THREADED;
-    return;
-#endif
-
-#ifdef HAVE_SUBINTERPRETERS
-    /* Python 3.12+ supports per-interpreter GIL */
-    g_execution_mode = PY_MODE_SUBINTERP;
-    return;
+#else
+    g_execution_mode = PY_MODE_GIL;
 #endif
-
-    /* Fallback: multi-executor with shared GIL */
-    g_execution_mode = PY_MODE_MULTI_EXECUTOR;
 }
 
 /* ============================================================================
@@ -783,50 +771,22 @@ static int executor_enqueue(py_request_t *req) {
     /* Track enqueued requests */
     atomic_fetch_add(&g_counters.enqueue_count, 1);
 
-    switch (g_execution_mode) {
 #ifdef HAVE_FREE_THREADED
-        case PY_MODE_FREE_THREADED:
-            /* Execute directly in free-threaded mode - no executor needed */
-            {
-                PyGILState_STATE gstate = PyGILState_Ensure();
-                process_request(req);
-                PyGILState_Release(gstate);
-                /* Signal completion immediately */
-                pthread_mutex_lock(&req->mutex);
-                req->completed = true;
-                pthread_cond_signal(&req->cond);
-                pthread_mutex_unlock(&req->mutex);
-            }
-            return 0;
-#endif
-
-        case PY_MODE_MULTI_EXECUTOR:
-            if (atomic_load(&g_multi_executor_initialized)) {
-                /* Route to multi-executor pool.
-                 * Use worker's or context's assigned executor for thread affinity if available.
-                 * This ensures libraries like numpy/torch that have thread-local
-                 * state always run on the same thread for a given worker/context. */
-                int exec_id;
-                if (req->worker != NULL && req->worker->executor_id >= 0) {
-                    exec_id = req->worker->executor_id % g_num_executors;
-                } else if (req->context != NULL && req->context->executor_id >= 0) {
-                    exec_id = req->context->executor_id % g_num_executors;
-                } else {
-                    exec_id = select_executor();
-                }
-                multi_executor_enqueue(exec_id, req);
-                return 0;
-            }
-            /* Fall through to single executor if multi not initialized */
-            break;
-
-        case PY_MODE_SUBINTERP:
-        default:
-            /* Use single executor */
-            break;
+    if (g_execution_mode == PY_MODE_FREE_THREADED) {
+        /* Execute directly in free-threaded mode - no executor needed */
+        PyGILState_STATE gstate = PyGILState_Ensure();
+        process_request(req);
+        PyGILState_Release(gstate);
+        /* Signal completion immediately */
+        pthread_mutex_lock(&req->mutex);
+        req->completed = true;
+        pthread_cond_signal(&req->cond);
+        pthread_mutex_unlock(&req->mutex);
+        return 0;
     }
+#endif
 
-    /* Single executor queue */
+    /* Single coordinator executor queue */
     pthread_mutex_lock(&g_executor_mutex);
     req->next = NULL;
     if (g_executor_queue_tail == NULL) {
@@ -897,325 +857,8 @@ static void executor_stop(void) {
     pthread_join(g_executor_thread, NULL);
 }
 
-/* ============================================================================
- * Multi-executor pool implementation
- *
- * For MULTI_EXECUTOR mode (traditional Python), we run N executor threads
- * that each hold the GIL in turn. This allows GIL contention-based parallelism
- * similar to PyO3's multi-executor pattern.
- * ============================================================================ */
-
-/**
- * Main function for a multi-executor thread.
- * Each executor has its own queue and processes requests independently.
- *
- * GIL handling: Acquire GIL only when processing work, not while idle.
- * This prevents idle executors from competing with dirty schedulers
- * running actual Python work via the context-based API.
- */
-static void *multi_executor_thread_main(void *arg) {
-    executor_t *exec = (executor_t *)arg;
-
-    exec->running = true;
-
-    while (!exec->shutdown) {
-        py_request_t *req = NULL;
-
-        /* Wait for work - NO GIL held while idle */
-        pthread_mutex_lock(&exec->mutex);
-        while (exec->queue_head == NULL && !exec->shutdown) {
-            pthread_cond_wait(&exec->cond, &exec->mutex);
-        }
-
-        /* Dequeue request if available */
-        if (exec->queue_head != NULL) {
-            req = exec->queue_head;
-            exec->queue_head = req->next;
-            if (exec->queue_head == NULL) {
-                exec->queue_tail = NULL;
-            }
-            req->next = NULL;
-        }
-        pthread_mutex_unlock(&exec->mutex);
-
-        if (req != NULL) {
-            if (req->type == PY_REQ_SHUTDOWN) {
-                pthread_mutex_lock(&req->mutex);
-                req->completed = true;
-                pthread_cond_signal(&req->cond);
-                pthread_mutex_unlock(&req->mutex);
-                break;
-            } else {
-                /* Acquire GIL only for actual work */
-                PyGILState_STATE gstate = PyGILState_Ensure();
-
-                /* Process the request */
-                process_request(req);
-
-                /* Release GIL immediately after processing */
-                PyGILState_Release(gstate);
-
-                /* Signal completion (outside GIL) */
-                pthread_mutex_lock(&req->mutex);
-                req->completed = true;
-                pthread_cond_signal(&req->cond);
-                pthread_mutex_unlock(&req->mutex);
-            }
-        }
-    }
-
-    exec->running = false;
-
-    return NULL;
-}
-
-/**
- * Select an executor using round-robin.
- */
-static int select_executor(void) {
-    int idx = atomic_fetch_add(&g_next_executor, 1) % g_num_executors;
-    return idx;
-}
-
-/**
- * Enqueue a request to a specific executor.
- */
-static void multi_executor_enqueue(int exec_id, py_request_t *req) {
-    executor_t *exec = &g_executors[exec_id];
-
-    pthread_mutex_lock(&exec->mutex);
-    req->next = NULL;
-    if (exec->queue_tail == NULL) {
-        exec->queue_head = req;
-        exec->queue_tail = req;
-    } else {
-        exec->queue_tail->next = req;
-        exec->queue_tail = req;
-    }
-    pthread_cond_signal(&exec->cond);
-    pthread_mutex_unlock(&exec->mutex);
-}
-
-/**
- * Start the multi-executor pool.
- */
-static int multi_executor_start(int num_executors) {
-    if (atomic_load(&g_multi_executor_initialized)) {
-        return 0;
-    }
-
-    if (num_executors < MIN_EXECUTORS) {
-        num_executors = MIN_EXECUTORS;
-    }
-    if (num_executors > MAX_EXECUTORS) {
-        num_executors = MAX_EXECUTORS;
-    }
-
-    g_num_executors = num_executors;
-
-    for (int i = 0; i < num_executors; i++) {
-        executor_t *exec = &g_executors[i];
-        exec->id = i;
-        exec->queue_head = NULL;
-        exec->queue_tail = NULL;
-        exec->running = false;
-        exec->shutdown = false;
-        pthread_mutex_init(&exec->mutex, NULL);
-        pthread_cond_init(&exec->cond, NULL);
-
-        if (pthread_create(&exec->thread, NULL, multi_executor_thread_main, exec) != 0) {
-            /* Cleanup already created threads */
-            for (int j = 0; j < i; j++) {
-                g_executors[j].shutdown = true;
-                pthread_cond_signal(&g_executors[j].cond);
-                pthread_join(g_executors[j].thread, NULL);
-                pthread_mutex_destroy(&g_executors[j].mutex);
-                pthread_cond_destroy(&g_executors[j].cond);
-            }
-            return -1;
-        }
-    }
-
-    /* Wait for all executors to be ready */
-    int max_wait = 100;
-    bool all_ready = false;
-    while (!all_ready && max_wait-- > 0) {
-        all_ready = true;
-        for (int i = 0; i < num_executors; i++) {
-            if (!g_executors[i].running) {
-                all_ready = false;
-                break;
-            }
-        }
-        if (!all_ready) {
-            usleep(10000);
-        }
-    }
-
-    atomic_store(&g_multi_executor_initialized, all_ready);
-    return all_ready ? 0 : -1;
-}
-
-/**
- * Stop the multi-executor pool.
- */
-static void multi_executor_stop(void) {
-    if (!atomic_load(&g_multi_executor_initialized)) {
-        return;
-    }
-
-    /* Allocate shutdown requests for all executors */
-    py_request_t *shutdown_reqs[MAX_EXECUTORS] = {0};
-
-    /* Signal shutdown and send shutdown requests to all executors */
-    for (int i = 0; i < g_num_executors; i++) {
-        executor_t *exec = &g_executors[i];
-        exec->shutdown = true;
-
-        py_request_t *shutdown_req = enif_alloc(sizeof(py_request_t));
-        if (shutdown_req != NULL) {
-            request_init(shutdown_req);
-            shutdown_req->type = PY_REQ_SHUTDOWN;
-            shutdown_reqs[i] = shutdown_req;
-            multi_executor_enqueue(i, shutdown_req);
-        }
-        /* If alloc fails, the shutdown flag is already set, so executor
-         * will exit when it checks the flag */
-    }
-
-    /* Wait for all executors to finish and clean up shutdown requests */
-    for (int i = 0; i < g_num_executors; i++) {
-        executor_t *exec = &g_executors[i];
-        pthread_join(exec->thread, NULL);
-        pthread_mutex_destroy(&exec->mutex);
-        pthread_cond_destroy(&exec->cond);
-
-        /* Clean up the shutdown request */
-        if (shutdown_reqs[i] != NULL) {
-            request_cleanup(shutdown_reqs[i]);
-            enif_free(shutdown_reqs[i]);
-        }
-    }
-
-    atomic_store(&g_multi_executor_initialized, false);
-}
-
 /*
  * Note: Free-threaded execution (Python 3.13+ nogil) is handled inline
  * in executor_enqueue() using PyGILState_Ensure/Release which are no-ops
  * in free-threaded builds but still work correctly.
  */
-
-/* ============================================================================
- * Context dispatch to executor
- *
- * When a context has thread affinity (executor_id >= 0), we dispatch
- * operations through the executor queue instead of executing directly
- * on the dirty scheduler. This ensures numpy/torch thread-local state
- * consistency.
- * ============================================================================ */
-
-/**
- * Dispatch a context call operation to the executor.
- *
- * @param env Caller's NIF environment
- * @param ctx Context with thread affinity
- * @param module_bin Module name binary
- * @param func_bin Function name binary
- * @param args_term Arguments list
- * @param kwargs_term Keyword arguments map
- * @return Result term
- */
-ERL_NIF_TERM context_dispatch_call(ErlNifEnv *env, py_context_t *ctx,
-                                    ErlNifBinary *module_bin, ErlNifBinary *func_bin,
-                                    ERL_NIF_TERM args_term, ERL_NIF_TERM kwargs_term) {
-    py_request_t req;
-    request_init(&req);
-
-    req.type = PY_REQ_CALL;
-    req.env = env;
-    req.worker = NULL;
-    req.context = ctx;
-    req.module_bin = *module_bin;
-    req.func_bin = *func_bin;
-    req.args_term = args_term;
-    req.kwargs_term = kwargs_term;
-    req.timeout_ms = 0;
-
-    if (executor_enqueue(&req) < 0) {
-        request_cleanup(&req);
-        return make_error(env, "executor_shutdown");
-    }
-
-    executor_wait(&req);
-    ERL_NIF_TERM result = req.result;
-    request_cleanup(&req);
-
-    return result;
-}
-
-/**
- * Dispatch a context eval operation to the executor.
- *
- * @param env Caller's NIF environment
- * @param ctx Context with thread affinity
- * @param code_bin Code string binary
- * @param locals_term Local variables map
- * @return Result term
- */
-ERL_NIF_TERM context_dispatch_eval(ErlNifEnv *env, py_context_t *ctx,
-                                    ErlNifBinary *code_bin, ERL_NIF_TERM locals_term) {
-    py_request_t req;
-    request_init(&req);
-
-    req.type = PY_REQ_EVAL;
-    req.env = env;
-    req.worker = NULL;
-    req.context = ctx;
-    req.code_bin = *code_bin;
-    req.locals_term = locals_term;
-    req.timeout_ms = 0;
-
-    if (executor_enqueue(&req) < 0) {
-        request_cleanup(&req);
-        return make_error(env, "executor_shutdown");
-    }
-
-    executor_wait(&req);
-    ERL_NIF_TERM result = req.result;
-    request_cleanup(&req);
-
-    return result;
-}
-
-/**
- * Dispatch a context exec operation to the executor.
- *
- * @param env Caller's NIF environment
- * @param ctx Context with thread affinity
- * @param code_bin Code string binary
- * @return Result term
- */
-ERL_NIF_TERM context_dispatch_exec(ErlNifEnv *env, py_context_t *ctx,
-                                    ErlNifBinary *code_bin) {
-    py_request_t req;
-    request_init(&req);
-
-    req.type = PY_REQ_EXEC;
-    req.env = env;
-    req.worker = NULL;
-    req.context = ctx;
-    req.code_bin = *code_bin;
-    req.timeout_ms = 0;
-
-    if (executor_enqueue(&req) < 0) {
-        request_cleanup(&req);
-        return make_error(env, "executor_shutdown");
-    }
-
-    executor_wait(&req);
-    ERL_NIF_TERM result = req.result;
-    request_cleanup(&req);
-
-    return result;
-}
diff --git a/c_src/py_nif.c b/c_src/py_nif.c
index a7e0612..239c15c 100644
--- a/c_src/py_nif.c
+++ b/c_src/py_nif.c
@@ -140,13 +140,7 @@ _Atomic py_runtime_state_t g_runtime_state = PY_STATE_UNINIT;
 PyThreadState *g_main_thread_state = NULL;
 
 /* Execution mode */
-py_execution_mode_t g_execution_mode = PY_MODE_MULTI_EXECUTOR;
-int g_num_executors = 4;
-
-/* Multi-executor pool */
-executor_t g_executors[MAX_EXECUTORS];
-_Atomic int g_next_executor = 0;
-_Atomic bool g_multi_executor_initialized = false;
+py_execution_mode_t g_execution_mode = PY_MODE_GIL;
 
 /* Single executor state */
 pthread_t g_executor_thread;
@@ -1146,17 +1140,8 @@ static ERL_NIF_TERM nif_py_init(ErlNifEnv *env, int argc, const ERL_NIF_TERM arg
      * Context operations use per-context worker threads (see worker_context_init).
      * The single executor handles legacy worker API and coordinator tasks. */
     int executor_result = 0;
-    switch (g_execution_mode) {
-        case PY_MODE_FREE_THREADED:
-            /* No executor needed - direct execution */
-            break;
-
-        case PY_MODE_SUBINTERP:
-        case PY_MODE_MULTI_EXECUTOR:
-        default:
-            /* Use single executor for coordinator operations */
-            executor_result = executor_start();
-            break;
+    if (g_execution_mode != PY_MODE_FREE_THREADED) {
+        executor_result = executor_start();
     }
 
     if (executor_result < 0) {
@@ -1205,16 +1190,8 @@ static ERL_NIF_TERM nif_finalize(ErlNifEnv *env, int argc, const ERL_NIF_TERM ar
      */
 
     /* Step 1: Stop executor - it will finish in-flight requests and exit */
-    switch (g_execution_mode) {
-        case PY_MODE_FREE_THREADED:
-            /* No executor to stop */
-            break;
-
-        case PY_MODE_SUBINTERP:
-        case PY_MODE_MULTI_EXECUTOR:
-        default:
-            executor_stop();
-            break;
+    if (g_execution_mode != PY_MODE_FREE_THREADED) {
+        executor_stop();
     }
 
     /* Step 2: Clean up thread worker system */
@@ -1310,11 +1287,6 @@ static ERL_NIF_TERM nif_worker_new(ErlNifEnv *env, int argc, const ERL_NIF_TERM
     worker->has_callback_handler = false;
     worker->callback_env = NULL;
 
-    /* Assign executor affinity for thread-safe library support (numpy, torch).
-     * Each worker gets a fixed executor to ensure all calls from the same
-     * worker go to the same thread, preventing thread state corruption. */
-    worker->executor_id = select_executor();
-
     PyGILState_Release(gstate);
 
     ERL_NIF_TERM result = enif_make_resource(env, worker);
@@ -1731,19 +1703,9 @@ static ERL_NIF_TERM nif_execution_mode(ErlNifEnv *env, int argc, const ERL_NIF_T
     (void)argc;
     (void)argv;
 
-    const char *mode_str;
-    switch (g_execution_mode) {
-        case PY_MODE_FREE_THREADED:
-            mode_str = "free_threaded";
-            break;
-        case PY_MODE_SUBINTERP:
-            mode_str = "subinterp";
-            break;
-        case PY_MODE_MULTI_EXECUTOR:
-        default:
-            mode_str = "multi_executor";
-            break;
-    }
+    const char *mode_str = (g_execution_mode == PY_MODE_FREE_THREADED)
+                           ? "free_threaded"
+                           : "gil";
     return enif_make_atom(env, mode_str);
 }
 
@@ -2301,21 +2263,14 @@ static void ctx_queue_cancel_all(py_context_t *ctx) {
 }
 
 /* ============================================================================
- * Legacy execute functions (use context fields for compatibility)
- *
- * These functions read from ctx->shared_env/request_term and write to
- * ctx->response_term/response_ok. The new queue-based approach populates
- * these fields from the dequeued request for compatibility.
+ * OWN_GIL execute helpers
  *
- * TODO: Refactor these to take ctx_request_t* directly in a future phase.
+ * Each OWN_GIL worker thread dequeues a ctx_request_t and copies the request
+ * fields onto the owning context (ctx->shared_env, ctx->request_term, etc.)
+ * before calling these helpers. Helpers consume those fields and write the
+ * response back into ctx->response_term / ctx->response_ok.
  * ============================================================================ */
 
-/* Thread-local for current request being processed (for compatibility layer) */
-static __thread ErlNifEnv *tl_current_req_env = NULL;
-static __thread ERL_NIF_TERM tl_current_req_data = 0;
-static __thread ERL_NIF_TERM *tl_current_response = NULL;
-static __thread bool *tl_current_response_ok = NULL;
-
 /**
  * @brief Execute a call request in the OWN_GIL thread
  */
@@ -4993,15 +4948,14 @@ static void owngil_context_shutdown(py_context_t *ctx) {
  * @brief Create a new Python context
  *
  * nif_context_create(Mode) -> {ok, ContextRef, InterpId} | {error, Reason}
- * Mode: subinterp | worker | owngil
- *
- * For subinterp mode: allocates a slot from the pre-created subinterpreter pool.
- * Execution happens on dirty schedulers using PyThreadState_Swap().
+ * Mode: worker | owngil
  *
  * For owngil mode: creates a dedicated pthread with an OWN_GIL subinterpreter.
  * This enables true parallel Python execution across contexts.
+ * Requires Python 3.14+; returns {error, owngil_requires_python314} otherwise.
  *
- * For worker mode: creates namespace in the main interpreter.
+ * For worker mode: creates a namespace in the main interpreter, dispatched
+ * through the context's dedicated worker pthread.
  */
 static ERL_NIF_TERM nif_context_create(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
     (void)argc;
@@ -5036,7 +4990,6 @@ static ERL_NIF_TERM nif_context_create(ErlNifEnv *env, int argc, const ERL_NIF_T
     ctx->globals = NULL;
     ctx->locals = NULL;
     ctx->module_cache = NULL;
-    ctx->executor_id = -1;  /* Not assigned yet */
     ctx->uses_worker_thread = false;
 
     /* Create callback pipe for blocking callback responses */
diff --git a/c_src/py_nif.h b/c_src/py_nif.h
index d6ca87b..45296ac 100644
--- a/c_src/py_nif.h
+++ b/c_src/py_nif.h
@@ -179,20 +179,13 @@ typedef enum {
     PY_MODE_FREE_THREADED,
 
     /**
-     * @brief Sub-interpreter mode (Python 3.12+)
+     * @brief Conventional GIL mode (every other supported build)
      *
-     * Each sub-interpreter has its own GIL, allowing parallel execution
-     * across interpreters while maintaining GIL semantics within each.
+     * Coordinator-side work runs through the single executor thread.
+     * Per-context worker / OWN_GIL pthreads handle the public context
+     * APIs directly; this mode label only governs the coordinator path.
      */
-    PY_MODE_SUBINTERP,
-
-    /**
-     * @brief Multi-executor mode (all Python versions)
-     *
-     * Multiple executor threads share the GIL using a work-stealing
-     * pattern. This is the fallback mode for older Python versions.
-     */
-    PY_MODE_MULTI_EXECUTOR
+    PY_MODE_GIL
 } py_execution_mode_t;
 
 /** @} */
@@ -387,9 +380,6 @@ typedef struct {
 
     /** @brief Environment for building callback messages */
     ErlNifEnv *callback_env;
-
-    /** @brief Assigned executor ID for thread affinity (-1 = round-robin) */
-    int executor_id;
 } py_worker_t;
 
 /* async_pending_t and py_async_worker_t removed - async workers replaced by event loop model */
@@ -1047,9 +1037,6 @@ struct py_context {
 
     /** @brief Module cache (Dict: module_name -> PyModule) */
     PyObject *module_cache;
-
-    /** @brief Assigned executor ID for thread affinity (-1 = not assigned) */
-    int executor_id;
 };
 
 /* ============================================================================
@@ -1356,63 +1343,6 @@ typedef struct {
 
 /** @} */
 
-/* ============================================================================
- * Executor Pool
- * ============================================================================ */
-
-/**
- * @defgroup executor Executor Pool
- * @brief Multi-executor thread pool for GIL management
- * @{
- */
-
-/**
- * @def MIN_EXECUTORS
- * @brief Minimum number of executor threads in the pool
- */
-#define MIN_EXECUTORS 2
-
-/**
- * @def MAX_EXECUTORS
- * @brief Maximum number of executor threads in the pool
- */
-#define MAX_EXECUTORS 32
-
-/**
- * @struct executor_t
- * @brief Single executor thread in the multi-executor pool
- *
- * Each executor has its own request queue and processes requests
- * independently. The GIL is acquired/released around queue operations.
- */
-typedef struct {
-    /** @brief Executor thread handle */
-    pthread_t thread;
-
-    /** @brief Mutex protecting the request queue */
-    pthread_mutex_t mutex;
-
-    /** @brief Condition variable for queue signaling */
-    pthread_cond_t cond;
-
-    /** @brief Head of request queue */
-    struct py_request *queue_head;
-
-    /** @brief Tail of request queue */
-    struct py_request *queue_tail;
-
-    /** @brief Flag: executor is running */
-    volatile bool running;
-
-    /** @brief Flag: executor should shut down */
-    volatile bool shutdown;
-
-    /** @brief Executor ID (0 to MAX_EXECUTORS-1) */
-    int id;
-} executor_t;
-
-/** @} */
-
 /* ============================================================================
  * Global State Declarations
  * ============================================================================ */
@@ -1520,18 +1450,6 @@ extern PyThreadState *g_main_thread_state;
 /** @brief Current execution mode */
 extern py_execution_mode_t g_execution_mode;
 
-/** @brief Number of active executors */
-extern int g_num_executors;
-
-/** @brief Multi-executor pool array */
-extern executor_t g_executors[MAX_EXECUTORS];
-
-/** @brief Round-robin counter for executor selection */
-extern _Atomic int g_next_executor;
-
-/** @brief Flag: multi-executor pool is initialized (atomic for thread-safe access) */
-extern _Atomic bool g_multi_executor_initialized;
-
 /* Single executor state */
 
 /** @brief Single executor thread handle */
@@ -1997,90 +1915,6 @@ static int executor_start(void);
  */
 static void executor_stop(void);
 
-/**
- * @brief Main function for multi-executor threads
- *
- * Thread entry point for executor pool threads. Processes
- * requests from its queue until shutdown.
- *
- * @param arg Pointer to executor_t for this thread
- * @return NULL
- */
-static void *multi_executor_thread_main(void *arg);
-
-/**
- * @brief Start the multi-executor pool
- *
- * Creates and starts num_executors threads.
- *
- * @param num_executors Number of executors (capped at MAX_EXECUTORS)
- * @return 0 on success, -1 on failure
- */
-static int multi_executor_start(int num_executors);
-
-/**
- * @brief Stop the multi-executor pool
- *
- * Signals shutdown and waits for all executor threads.
- */
-static void multi_executor_stop(void);
-
-/**
- * @brief Select an executor using round-robin
- *
- * @return Executor index (0 to g_num_executors-1)
- */
-static int select_executor(void);
-
-/**
- * @brief Submit a request to a specific executor
- *
- * @param exec_id Executor index
- * @param req Request to submit
- */
-static void multi_executor_enqueue(int exec_id, struct py_request *req);
-
-/**
- * @brief Dispatch a context call operation to the executor
- *
- * Used when a context has thread affinity (executor_id >= 0) to ensure
- * numpy/torch thread-local state consistency.
- *
- * @param env Caller's NIF environment
- * @param ctx Context with thread affinity
- * @param module_bin Module name binary
- * @param func_bin Function name binary
- * @param args_term Arguments list
- * @param kwargs_term Keyword arguments map
- * @return Result term
- */
-ERL_NIF_TERM context_dispatch_call(ErlNifEnv *env, py_context_t *ctx,
-                                    ErlNifBinary *module_bin, ErlNifBinary *func_bin,
-                                    ERL_NIF_TERM args_term, ERL_NIF_TERM kwargs_term);
-
-/**
- * @brief Dispatch a context eval operation to the executor
- *
- * @param env Caller's NIF environment
- * @param ctx Context with thread affinity
- * @param code_bin Code string binary
- * @param locals_term Local variables map
- * @return Result term
- */
-ERL_NIF_TERM context_dispatch_eval(ErlNifEnv *env, py_context_t *ctx,
-                                    ErlNifBinary *code_bin, ERL_NIF_TERM locals_term);
-
-/**
- * @brief Dispatch a context exec operation to the executor
- *
- * @param env Caller's NIF environment
- * @param ctx Context with thread affinity
- * @param code_bin Code string binary
- * @return Result term
- */
-ERL_NIF_TERM context_dispatch_exec(ErlNifEnv *env, py_context_t *ctx,
-                                    ErlNifBinary *code_bin);
-
 /** @} */
 
 /* ============================================================================
diff --git a/c_src/py_shared_dict.c b/c_src/py_shared_dict.c
index 2d1c473..381cb9d 100644
--- a/c_src/py_shared_dict.c
+++ b/c_src/py_shared_dict.c
@@ -30,33 +30,6 @@
  * Resource Callbacks
  * ============================================================================ */
 
-/**
- * @brief Down callback for py_shared_dict_t
- *
- * Called when the owning process dies. Sets destroyed flag and clears the dict.
- * This callback is invoked by the runtime when the monitored process terminates.
- */
-static void shared_dict_down(ErlNifEnv *env, void *obj,
-                              ErlNifPid *pid, ErlNifMonitor *mon) {
-    (void)env;
-    (void)pid;
-    (void)mon;
-    py_shared_dict_t *sd = (py_shared_dict_t *)obj;
-
-    /* Mark as destroyed - subsequent access will return badarg */
-    atomic_store(&sd->destroyed, true);
-    sd->monitor_active = false;
-
-    /* Clear the Python dict if runtime is still running */
-    if (runtime_is_running() && sd->dict != NULL) {
-        PyGILState_STATE gstate = PyGILState_Ensure();
-        pthread_mutex_lock(&sd->mutex);
-        Py_CLEAR(sd->dict);
-        pthread_mutex_unlock(&sd->mutex);
-        PyGILState_Release(gstate);
-    }
-}
-
 /**
  * @brief Destructor for py_shared_dict_t
  *
diff --git a/docs/getting-started.md b/docs/getting-started.md
index 3f40fc6..817ddf9 100644
--- a/docs/getting-started.md
+++ b/docs/getting-started.md
@@ -266,38 +266,33 @@ ok = py:deactivate_venv().
 
 ### Context Modes
 
-When creating explicit contexts, you can choose different execution modes:
+When creating explicit contexts, you can choose between the two public modes:
 
 ```erlang
-%% Worker mode (default, recommended) - main interpreter
-%% With free-threaded Python (3.13t+), provides true parallelism automatically
+%% Worker mode (default) - dedicated pthread per context, main interpreter
 {ok, Ctx} = py_context:new(#{mode => worker}).
 
-%% SHARED_GIL sub-interpreter (Python 3.12+) - isolated namespace
-{ok, Ctx} = py_context:new(#{mode => subinterp}).
-
-%% OWN_GIL sub-interpreter (Python 3.14+) - true parallelism
+%% OWN_GIL mode (Python 3.14+) - dedicated pthread + subinterpreter with own GIL
 {ok, Ctx} = py_context:new(#{mode => owngil}).
 ```
 
 | Mode | Python | Description |
 |------|--------|-------------|
-| `worker` | Any | Main interpreter, shared namespace (default, recommended) |
-| `subinterp` | 3.12+ | SHARED_GIL sub-interpreter, isolated namespace |
-| `owngil` | 3.14+ | OWN_GIL sub-interpreter, each has own GIL |
+| `worker` | Any | Dedicated pthread per context, main interpreter namespace (default) |
+| `owngil` | 3.14+ | Dedicated pthread + subinterpreter with its own GIL, true parallelism |
 
-**Worker mode is recommended** because it works with any Python version and automatically benefits from free-threaded Python (3.13t+) when available.
+**Worker mode is recommended** because it works with any Python version, provides stable thread affinity for libraries with thread-local state (numpy, torch, tensorflow), and automatically benefits from free-threaded Python (3.13t+) when available.
 
-**Why OWN_GIL requires Python 3.14+**: C extensions like `_decimal`, `numpy` have global state bugs in sub-interpreters on Python 3.12/3.13. These are fixed in Python 3.14. SHARED_GIL mode works on 3.12+ but some C extensions may have issues.
+**Why OWN_GIL requires Python 3.14+**: C extensions like `_decimal`, `numpy` have global state bugs in sub-interpreters on Python 3.12/3.13. These are fixed in Python 3.14.
 
 ### Runtime Detection
 
 Check the current execution mode:
 
 ```erlang
-%% See how Python is being executed
+%% Mirrors the context_mode application env
 py:execution_mode().
-%% => free_threaded | subinterp | multi_executor
+%% => worker | owngil
 
 %% Check rate limiting status
 py_semaphore:max_concurrent().  %% Maximum concurrent calls
diff --git a/docs/migration.md b/docs/migration.md
index 5d0b2fb..da2e21c 100644
--- a/docs/migration.md
+++ b/docs/migration.md
@@ -78,10 +78,14 @@ Ctx2 = py:context(2),
 ```erlang
 {erlang_python, [
     {context_mode, worker},  %% worker | owngil
-    {num_contexts, 8}        %% Number of contexts to create
+    {num_contexts, 8},       %% Number of contexts to create
+    {max_concurrent, 17}     %% Optional rate-limit ceiling
 ]}
 ```
 
+`num_executors` and `num_async_workers` were both removed in v3.0; the
+supervisor no longer reads them.
+
 ### Python Version Compatibility
 
 | Python Version | v2.x Mode | v3.0 Mode |
@@ -614,21 +618,24 @@ async def async_handler():
 
 ### Async Task API (Erlang Side)
 
-Submit and manage async Python tasks from Erlang:
+Submit and manage async Python tasks from Erlang. Tasks always run on the
+shared `py_event_loop`; routing happens via `py_event_loop_pool` for the
+pool-based variant. The `(Module, Func, Args[, Opts/Kwargs])` signature does
+not take a context — coroutines are scheduled on the loop, not on a context.
 
 ```erlang
 %% Blocking run
-{ok, Result} = py_event_loop:run(Ctx, my_module, my_async_func, [Arg1]).
+{ok, Result} = py_event_loop:run(my_module, my_async_func, [Arg1]).
 
 %% Non-blocking with reference
-Ref = py_event_loop:create_task(Ctx, my_module, my_async_func, [Arg1]),
+Ref = py_event_loop:create_task(my_module, my_async_func, [Arg1]),
 {ok, Result} = py_event_loop:await(Ref, 5000).
 
 %% Fire-and-forget
-py_event_loop:spawn_task(Ctx, my_module, my_async_func, [Arg1]).
+py_event_loop:spawn_task(my_module, my_async_func, [Arg1]).
 
 %% Message-based result delivery
-Ref = py_event_loop:create_task(Ctx, my_module, my_async_func, [Arg1]),
+Ref = py_event_loop:create_task(my_module, my_async_func, [Arg1]),
 receive
     {async_result, Ref, {ok, Result}} -> handle(Result);
     {async_result, Ref, {error, Reason}} -> handle_error(Reason)
diff --git a/docs/owngil_internals.md b/docs/owngil_internals.md
index 2819c4a..7ee0382 100644
--- a/docs/owngil_internals.md
+++ b/docs/owngil_internals.md
@@ -4,7 +4,7 @@
 
 OWN_GIL mode provides true parallel Python execution using Python 3.14+ per-interpreter GIL (`PyInterpreterConfig_OWN_GIL`). Each OWN_GIL context runs in a dedicated pthread with its own subinterpreter and GIL.
 
-**Note**: OWN_GIL requires Python 3.14+ due to C extension global state bugs in earlier versions (e.g., `_decimal`, `numpy`). For Python 3.12/3.13, use SHARED_GIL sub-interpreters (`mode => subinterp`) which provide namespace isolation but share the GIL.
+**Note**: OWN_GIL requires Python 3.14+ due to C extension global state bugs in earlier versions (e.g., `_decimal`, `numpy`). On Python 3.12/3.13, use the default `worker` mode — contexts share the main interpreter but each owns a dedicated pthread.
 
 ## Quick Start
 
@@ -83,11 +83,10 @@ All major erlang_python features work with OWN_GIL mode:
 
 | Mode | Python Version | Thread Model | GIL | Parallelism |
 |------|----------------|--------------|-----|-------------|
-| `worker` | Any | Dirty scheduler | Main interpreter GIL | None |
-| `subinterp` | 3.12+ | Dirty scheduler | Shared GIL | None (isolated namespaces) |
-| `owngil` | 3.14+ | Dedicated pthread | Per-interpreter GIL | True parallel |
+| `worker` | Any | Dedicated pthread per context | Main interpreter GIL | None with the GIL; true parallel on free-threaded 3.13t+ |
+| `owngil` | 3.14+ | Dedicated pthread per context | Per-interpreter GIL | True parallel |
 
-**Why version requirements differ**: The `subinterp` mode (SHARED_GIL) works on Python 3.12+ for namespace isolation. However, `owngil` mode requires Python 3.14+ because C extensions like `_decimal`, `numpy` have global state that crashes in OWN_GIL sub-interpreters on earlier versions. Python 3.14 includes fixes for these issues (see [cpython#106078](https://github.com/python/cpython/issues/106078)).
+**Why OWN_GIL requires Python 3.14+**: C extensions like `_decimal`, `numpy` have global state that crashes in OWN_GIL sub-interpreters on Python 3.12/3.13. Python 3.14 includes fixes for these issues (see [cpython#106078](https://github.com/python/cpython/issues/106078)).
 
 ## Key Data Structures
 
@@ -174,7 +173,7 @@ nif_context_call(env, ctx, module, func, args, kwargs)
     │       └── pthread_mutex_unlock(&ctx->request_mutex)
     │
     └── [ctx->uses_own_gil == false]
-        └── Direct execution with GIL (worker/subinterp mode)
+        └── Direct execution with GIL (worker mode)
 ```
 
 ### 3. Request Processing (OWN_GIL Thread)
@@ -438,7 +437,7 @@ Use OWN_GIL when:
 - Long-running computations
 - Need true concurrent Python execution
 
-Use shared-GIL (subinterp) when:
+Use worker mode when:
 - I/O-bound or short operations
 - High call frequency
 - Resource constraints
@@ -454,7 +453,7 @@ rebar3 compile && escript examples/bench_owngil.erl
 Example output:
 ```
 ========================================================
-  OWN_GIL vs SHARED_GIL Benchmark
+  OWN_GIL vs Worker Benchmark
 ========================================================
 
 System Information
@@ -462,25 +461,25 @@ System Information
   Erlang/OTP:       27
   Schedulers:       8
   Python:           3.14.0
-  Subinterp:        true
+  OWN_GIL:          true
 
 1. Single Context Latency (1000 calls to math.sqrt)
    Mode            us/call    calls/sec
    ----            -------    ---------
-   subinterp           2.5       400000
+   worker              2.5       400000
    owngil             10.2        98000
 
 2. Parallel Throughput (4 contexts, 10000 calls each)
    Mode            total_ms   calls/sec
    ----            --------   ---------
-   subinterp          100.5       398000
-   owngil              28.3      1415000   <- 3.5x faster
+   worker             100.5       398000
+   owngil              28.3      1415000   <- 3.5x faster
 
 3. CPU-Bound Speedup (fibonacci(30) x 4 contexts)
    Mode            total_ms   speedup
    ----            --------   -------
-   subinterp          800.2      1.0x
-   owngil             205.1      3.9x     <- near-linear scaling
+   worker             800.2      1.0x
+   owngil             205.1      3.9x     <- near-linear scaling
 ```
 
 ## Safety Mechanisms
diff --git a/docs/preload.md b/docs/preload.md
index 682e2b0..44b617b 100644
--- a/docs/preload.md
+++ b/docs/preload.md
@@ -119,5 +119,5 @@ py_preload:clear_code().
 ## Limitations
 
 - Changes to preload code don't affect existing contexts
-- Same preload applies to all context modes (worker, subinterp, owngil)
+- Same preload applies to both context modes (worker, owngil)
 - Preload errors during context creation will fail the context
diff --git a/docs/process-bound-envs.md b/docs/process-bound-envs.md
index c9f986a..201546e 100644
--- a/docs/process-bound-envs.md
+++ b/docs/process-bound-envs.md
@@ -107,12 +107,12 @@ end).
 | **Explicit** | `create_local_env` + `py_nif:context_*` | OWN_GIL, fine-grained control, multiple envs per process |
 
 **Use implicit (py:exec)** when:
-- Using worker or subinterp modes
+- Using worker mode
 - One environment per process is sufficient
 - You want automatic lifecycle management
 
 **Use explicit (create_local_env)** when:
-- Using OWN_GIL mode for parallel execution
+- Using `owngil` mode for parallel execution
 - Need multiple environments in a single process
 - Want to pass environments between processes
 - Need direct NIF-level control
@@ -461,7 +461,7 @@ This design prioritizes safety over avoiding minor memory leaks during edge case
 ## See Also
 
 - [OWN_GIL Internals](owngil_internals.md) - Architecture and safety mechanisms for OWN_GIL mode
-- [Scalability](scalability.md) - Mode comparison (owngil vs subinterp vs worker)
+- [Scalability](scalability.md) - Mode comparison (worker vs owngil)
 - [Event Loop Architecture](event_loop_architecture.md) - Per-process namespace management
 - [Context Affinity](context-affinity.md) - Context binding and routing
 - [Scheduling](asyncio.md) - Cooperative scheduling for long operations
diff --git a/docs/reactor.md b/docs/reactor.md
index 0387cc8..ff6b6cd 100644
--- a/docs/reactor.md
+++ b/docs/reactor.md
@@ -573,11 +573,11 @@ Internal - called by NIF to close connection.
 
 ## Subinterpreter Support
 
-The reactor supports isolated subinterpreters via `py_reactor_context`. Each subinterpreter has its own reactor module cache, ensuring protocol factories are isolated between contexts.
+The reactor supports isolated subinterpreters via `py_reactor_context` in `owngil` mode. Each subinterpreter has its own reactor module cache, ensuring protocol factories are isolated between contexts.
 
 ```erlang
-%% Create context with subinterpreter mode
-{ok, Ctx1} = py_reactor_context:start_link(1, subinterp, #{
+%% Create context with OWN_GIL subinterpreter (Python 3.14+)
+{ok, Ctx1} = py_reactor_context:start_link(1, owngil, #{
     setup_code => <<"
 import erlang.reactor as reactor
 reactor.set_protocol_factory(EchoProtocol)
@@ -585,7 +585,7 @@ reactor.set_protocol_factory(EchoProtocol)
 }),
 
 %% Create another context with different protocol
-{ok, Ctx2} = py_reactor_context:start_link(2, subinterp, #{
+{ok, Ctx2} = py_reactor_context:start_link(2, owngil, #{
     setup_code => <<"
 import erlang.reactor as reactor
 reactor.set_protocol_factory(HttpProtocol)
@@ -593,7 +593,7 @@ reactor.set_protocol_factory(HttpProtocol)
 }).
 ```
 
-Each context runs in its own subinterpreter with isolated protocol factory and connection state. This enables running multiple protocol handlers in the same BEAM VM without interference.
+Each `owngil` context runs in its own subinterpreter with isolated protocol factory and connection state. For Python <3.14, use `worker` mode — contexts share the main interpreter but each owns a dedicated pthread.
 
 ## See Also
 
diff --git a/docs/scalability.md b/docs/scalability.md
index cefe59f..eef5d21 100644
--- a/docs/scalability.md
+++ b/docs/scalability.md
@@ -167,14 +167,10 @@ Ctx = py:context(1),
 
 ### Pool Size
 
-The subinterpreter pool size is configured at two levels:
-
-| Level | Default | Max |
-|-------|---------|-----|
-| **Erlang (py_context_router)** | `erlang:system_info(schedulers)` | configurable |
-| **C pool (py_subinterp_pool)** | 32 | 64 |
-
-On a typical 8-core machine, 8 context processes are started, each with one subinterpreter slot.
+`py_context_router` sizes the context pool from `num_contexts` (default
+`erlang:system_info(schedulers)`). Each context owns its own pthread; in
+`owngil` mode that thread also owns a dedicated subinterpreter. There is no
+shared C-level pool.
 
 **Configuration via sys.config:**
 ```erlang
@@ -271,8 +267,8 @@ For CPU-bound workloads on Python 3.12+, erlang_python provides true parallelism
 %% Check if subinterpreters are supported (Python 3.12+)
 true = py:subinterp_supported().
 
-%% Check current execution mode
-subinterp = py:execution_mode().
+%% Check current execution mode (mirrors context_mode app env)
+worker = py:execution_mode().  %% or owngil
 ```
 
 ### Using the Context Router
@@ -388,7 +384,10 @@ PYTHON_CONFIG=/path/to/python3.13-config rebar3 compile
 ```erlang
 1> application:ensure_all_started(erlang_python).
 2> py:execution_mode().
-free_threaded
+worker
+%% Free-threaded Python is detected internally (py_nif:execution_mode/0
+%% reports it); the public mode mirrors the configured context_mode
+%% (worker | owngil), and worker mode automatically benefits from it.
 ```
 
 ## Performance Tuning
diff --git a/docs/testing-free-threading.md b/docs/testing-free-threading.md
index ab83d47..dd538d9 100644
--- a/docs/testing-free-threading.md
+++ b/docs/testing-free-threading.md
@@ -99,10 +99,13 @@ rebar3 compile
 {ok, [erlang_python]}
 
 2> py:execution_mode().
-free_threaded  % Should show 'free_threaded' instead of 'subinterp' or 'multi_executor'
+worker  %% public mode, mirrors context_mode app env
 
-3> py:num_executors().
-1  % In free_threaded mode, no executor pool is used
+3> py_nif:execution_mode().
+free_threaded  %% internal capability — confirms the no-GIL build was detected
+
+4> py_context_router:num_contexts().
+8  %% one pthread per context
 ```
 
 ## Running Tests
@@ -180,21 +183,23 @@ Ensure `PYTHON_CONFIG` points to the free-threaded Python installation:
 ls $(dirname $(which python3))/../include/*/Python.h
 ```
 
-### Mode Shows 'multi_executor' Instead of 'free_threaded'
+### `py_nif:execution_mode/0` does not return `free_threaded`
 
 The Python build may not have `Py_GIL_DISABLED` defined. Verify:
 ```bash
 python3 -c "import sysconfig; print(sysconfig.get_config_var('Py_GIL_DISABLED'))"
 ```
 
-Should print `1` for free-threaded builds.
+Should print `1` for free-threaded builds. The public `py:execution_mode/0`
+will still return `worker | owngil` regardless — it reflects the configured
+context mode, not the underlying Python capability.
 
 ### Crashes Under Load
 
 Some extensions may not be thread-safe. Try:
 1. Isolate the problematic extension
 2. Check if a thread-safe version exists
-3. Fall back to sub-interpreter mode for those calls
+3. Switch to `owngil` mode (Python 3.14+) for stronger isolation
 
 ## See Also
 
diff --git a/examples/README.md b/examples/README.md
index 19aa5ae..2538292 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -87,14 +87,8 @@ Simple echo server using Reactor API.
 escript examples/reactor_echo.erl
 ```
 
-### reactor_subinterp_example.erl
-Reactor with subinterpreter isolation (Python 3.12+).
-```bash
-escript examples/reactor_subinterp_example.erl
-```
-
 ### reactor_owngil_example.erl
-Reactor with OWN_GIL subinterpreters (Python 3.12+).
+Reactor with OWN_GIL subinterpreters (Python 3.14+).
 ```bash
 escript examples/reactor_owngil_example.erl
 ```
@@ -122,7 +116,7 @@ escript examples/bench_channel.erl
 ```
 
 ### bench_reactor_modes.erl
-Reactor worker vs subinterpreter benchmark.
+Reactor worker vs OWN_GIL benchmark.
 ```bash
 escript examples/bench_reactor_modes.erl
 ```
diff --git a/examples/bench_owngil.erl b/examples/bench_owngil.erl
index 9c1ff9d..fd77be9 100644
--- a/examples/bench_owngil.erl
+++ b/examples/bench_owngil.erl
@@ -2,7 +2,7 @@
 %% -*- erlang -*-
 %%! -pa _build/default/lib/erlang_python/ebin
 
-%%% @doc Benchmark comparing SHARED_GIL vs OWN_GIL context modes.
+%%% @doc Benchmark comparing worker vs OWN_GIL context modes.
 %%%
 %%% OWN_GIL mode creates a dedicated pthread with its own Python GIL,
 %%% enabling true parallel execution for CPU-bound workloads.
@@ -15,7 +15,7 @@
 main(_Args) ->
     io:format("~n"),
     io:format("========================================================~n"),
-    io:format("  OWN_GIL vs SHARED_GIL Benchmark~n"),
+    io:format("  OWN_GIL vs Worker Benchmark~n"),
     io:format("========================================================~n~n"),
 
     %% Start the application
@@ -24,14 +24,14 @@ main(_Args) ->
     %% Print system info
     print_system_info(),
 
-    case py_nif:subinterp_supported() of
+    case py_nif:owngil_supported() of
         true ->
             bench_single_latency(),
             bench_parallel_throughput(),
             bench_cpu_speedup();
         false ->
-            io:format("~n[ERROR] OWN_GIL requires Python 3.12+~n"),
-            io:format("        Current Python version does not support subinterpreters.~n~n")
+            io:format("~n[ERROR] OWN_GIL requires Python 3.14+~n"),
+            io:format("        Current Python build does not support OWN_GIL subinterpreters.~n~n")
     end,
 
     halt(0).
@@ -43,7 +43,7 @@ print_system_info() ->
     io:format("  Schedulers:       ~p~n", [erlang:system_info(schedulers)]),
     {ok, PyVer} = py:version(),
     io:format("  Python:           ~s~n", [PyVer]),
-    io:format("  Subinterp:        ~p~n", [py_nif:subinterp_supported()]),
+    io:format("  OWN_GIL:          ~p~n", [py_nif:owngil_supported()]),
     io:format("~n").
 
 %% ============================================================================
@@ -72,7 +72,7 @@ bench_single_latency() ->
         io:format("   ~-15s ~10.1f ~12w~n", [Label, UsPerCall, CallsPerSec]),
 
         py_context:stop(Ctx)
-    end, [{subinterp, subinterp}, {owngil, owngil}]),
+    end, [{worker, worker}, {owngil, owngil}]),
     io:format("~n").
 
 %% ============================================================================
@@ -114,7 +114,7 @@ bench_parallel_throughput() ->
         io:format("   ~-15s ~10w ~12w~n", [Label, Elapsed, CallsPerSec]),
 
         [py_context:stop(Ctx) || Ctx <- Contexts]
-    end, [{subinterp, subinterp}, {owngil, owngil}]),
+    end, [{worker, worker}, {owngil, owngil}]),
     io:format("~n").
 
 %% ============================================================================
@@ -154,11 +154,11 @@ bench_cpu_speedup() ->
         io:format("   ~-15s ~10w ~10w ~10.2fx~n", [Label, SeqTime, ParTime, Speedup]),
 
         [py_context:stop(Ctx) || Ctx <- Contexts]
-    end, [{subinterp, subinterp}, {owngil, owngil}]),
+    end, [{worker, worker}, {owngil, owngil}]),
 
     io:format("~n"),
     io:format("Notes:~n"),
-    io:format("  - SHARED_GIL (subinterp) contexts share Python's GIL~n"),
+    io:format("  - Worker contexts share Python's GIL on the main interpreter~n"),
     io:format("  - OWN_GIL contexts have independent GILs for true parallelism~n"),
     io:format("  - OWN_GIL speedup should approach number of CPU cores~n"),
     io:format("~n").
diff --git a/examples/bench_reactor_modes.erl b/examples/bench_reactor_modes.erl
index 48951ad..dd0389d 100644
--- a/examples/bench_reactor_modes.erl
+++ b/examples/bench_reactor_modes.erl
@@ -2,7 +2,7 @@
 %% -*- erlang -*-
 %%! -pa _build/default/lib/erlang_python/ebin
 
-%%% @doc Benchmark comparing Reactor (worker vs subinterp) with Channel API.
+%%% @doc Benchmark comparing Reactor (worker vs OWN_GIL) with Channel API.
 %%%
 %%% Run with:
 %%%   rebar3 compile && escript examples/bench_reactor_modes.erl
@@ -30,18 +30,18 @@ main(_Args) ->
     io:format("~n--- Reactor (Worker Mode) ---~n"),
     {WkPersistent, WkLifecycle} = run_reactor_worker_bench(),
 
-    %% Subinterpreter mode benchmarks (if supported)
-    {SiPersistent, SiLifecycle} = case py:subinterp_supported() of
+    %% OWN_GIL mode benchmarks (if supported)
+    {OgPersistent, OgLifecycle} = case py_nif:owngil_supported() of
         true ->
-            io:format("~n--- Reactor (Subinterpreter Mode) ---~n"),
-            run_reactor_subinterp_bench();
+            io:format("~n--- Reactor (OWN_GIL Mode) ---~n"),
+            run_reactor_owngil_bench();
         false ->
-            io:format("~n[Skipping subinterpreter benchmarks - Python < 3.12]~n"),
+            io:format("~n[Skipping OWN_GIL benchmarks - Python < 3.14]~n"),
             {[], []}
     end,
 
     %% Print comparison summary
-    print_comparison(ChPersistent, ChLifecycle, WkPersistent, WkLifecycle, SiPersistent, SiLifecycle),
+    print_comparison(ChPersistent, ChLifecycle, WkPersistent, WkLifecycle, OgPersistent, OgLifecycle),
 
     halt(0).
 
@@ -52,7 +52,7 @@ print_system_info() ->
     io:format("  Schedulers:       ~p~n", [erlang:system_info(schedulers)]),
     {ok, PyVer} = py:version(),
     io:format("  Python:           ~s~n", [PyVer]),
-    io:format("  Subinterp:        ~p~n", [py:subinterp_supported()]),
+    io:format("  OWN_GIL:          ~p~n", [py_nif:owngil_supported()]),
     io:format("~n").
 
 %% ============================================================================
@@ -290,10 +290,10 @@ recv_all(Socket, Remaining, Timeout, Acc) ->
     end.
 
 %% ============================================================================
-%% Reactor Subinterpreter Mode Benchmarks
+%% Reactor OWN_GIL Mode Benchmarks
 %% ============================================================================
 
-run_reactor_subinterp_bench() ->
+run_reactor_owngil_bench() ->
     %% Protocol that stays open for multiple messages
     PersistentSetup = <<"
 import erlang.reactor as reactor
@@ -340,7 +340,7 @@ reactor.set_protocol_factory(OneMessageProtocol)
         Data = binary:copy(<<$X>>, Size),
         Iterations = 500,
 
-        {ok, Ctx} = py_reactor_context:start_link(1900 + Size, subinterp, #{
+        {ok, Ctx} = py_reactor_context:start_link(1900 + Size, owngil, #{
             setup_code => PersistentSetup
         }),
 
@@ -387,7 +387,7 @@ reactor.set_protocol_factory(OneMessageProtocol)
         Data = binary:copy(<<$X>>, Size),
         Iterations = 100,
 
-        {ok, Ctx} = py_reactor_context:start_link(1800 + Size, subinterp, #{
+        {ok, Ctx} = py_reactor_context:start_link(1800 + Size, owngil, #{
             setup_code => LifecycleSetup
         }),
 
@@ -421,7 +421,7 @@ reactor.set_protocol_factory(OneMessageProtocol)
 %% Comparison Summary
 %% ============================================================================
 
-print_comparison(ChPersistent, ChLifecycle, WkPersistent, WkLifecycle, SiPersistent, SiLifecycle) ->
+print_comparison(ChPersistent, ChLifecycle, WkPersistent, WkLifecycle, OgPersistent, OgLifecycle) ->
     io:format("~n"),
     io:format("========================================================~n"),
     io:format("  COMPARISON SUMMARY~n"),
@@ -431,16 +431,16 @@ print_comparison(ChPersistent, ChLifecycle, WkPersistent, WkLifecycle, SiPersist
     io:format("A) PERSISTENT CONNECTION (messages on existing connection)~n"),
     io:format("-----------------------------------------------------------~n"),
     io:format("~8s | ~12s | ~12s | ~12s~n",
-              ["Size", "Channel", "Reactor/W", "Reactor/S"]),
+              ["Size", "Channel", "Reactor/W", "Reactor/OG"]),
     io:format("~s~n", [string:copies("-", 52)]),
 
     lists:foreach(fun({{Size, _, ChOps}, {_, _, WkOps}}) ->
-        SubOps = case lists:keyfind(Size, 1, SiPersistent) of
+        OgOps = case lists:keyfind(Size, 1, OgPersistent) of
             {_, _, O} -> integer_to_list(O);
             false -> "N/A"
         end,
         io:format("~8B | ~12w | ~12w | ~12s~n",
-                  [Size, ChOps, WkOps, SubOps])
+                  [Size, ChOps, WkOps, OgOps])
     end, lists:zip(ChPersistent, WkPersistent)),
 
     %% Full lifecycle comparison (connections/sec including setup/teardown)
@@ -448,23 +448,23 @@ print_comparison(ChPersistent, ChLifecycle, WkPersistent, WkLifecycle, SiPersist
     io:format("B) FULL LIFECYCLE (create + send/recv + close per op)~n"),
     io:format("-----------------------------------------------------------~n"),
     io:format("~8s | ~12s | ~12s | ~12s~n",
-              ["Size", "Channel", "Reactor/W", "Reactor/S"]),
+              ["Size", "Channel", "Reactor/W", "Reactor/OG"]),
     io:format("~s~n", [string:copies("-", 52)]),
 
     lists:foreach(fun({{Size, _, ChOps}, {_, _, WkOps}}) ->
-        SubOps = case lists:keyfind(Size, 1, SiLifecycle) of
+        OgOps = case lists:keyfind(Size, 1, OgLifecycle) of
             {_, _, O} -> integer_to_list(O);
             false -> "N/A"
         end,
         io:format("~8B | ~12w | ~12w | ~12s~n",
-                  [Size, ChOps, WkOps, SubOps])
+                  [Size, ChOps, WkOps, OgOps])
     end, lists:zip(ChLifecycle, WkLifecycle)),
 
     io:format("~n"),
     io:format("Legend:~n"),
-    io:format("  Channel   = py_channel API~n"),
-    io:format("  Reactor/W = erlang.reactor with worker mode~n"),
-    io:format("  Reactor/S = erlang.reactor with subinterpreter (SHARED_GIL)~n"),
+    io:format("  Channel    = py_channel API~n"),
+    io:format("  Reactor/W  = erlang.reactor with worker mode~n"),
+    io:format("  Reactor/OG = erlang.reactor with OWN_GIL subinterpreter~n"),
     io:format("~n"),
     io:format("Notes:~n"),
     io:format("  - A) measures throughput on persistent connection (best case)~n"),
diff --git a/examples/benchmark.erl b/examples/benchmark.erl
index 8c19cba..1bdcdee 100755
--- a/examples/benchmark.erl
+++ b/examples/benchmark.erl
@@ -73,7 +73,7 @@ print_system_info() ->
     {ok, PyVer} = py:version(),
     io:format("~s~n", [PyVer]),
     io:format("  Execution Mode: ~p~n", [py:execution_mode()]),
-    io:format("  Num Executors: ~p~n", [py:num_executors()]),
+    io:format("  Num Contexts: ~p~n", [py_context_router:num_contexts()]),
     io:format("  Max Concurrent: ~p~n", [py_semaphore:max_concurrent()]),
     io:format("~n").
 
diff --git a/examples/reactor_subinterp_example.erl b/examples/reactor_subinterp_example.erl
deleted file mode 100644
index 8075728..0000000
--- a/examples/reactor_subinterp_example.erl
+++ /dev/null
@@ -1,158 +0,0 @@
-%% @doc Example: SHARED_GIL reactor with subinterpreters.
-%%
-%% Each py_reactor_context runs in an isolated subinterpreter with its own
-%% protocol factory. Multiple contexts can process connections in parallel
-%% while sharing Python's GIL.
-%%
-%% Best for: High-concurrency I/O-bound workloads (HTTP servers, WebSockets).
-
--module(reactor_subinterp_example).
--export([start/0, start/1, stop/1]).
-
--define(ECHO_PROTOCOL, <<"
-import erlang.reactor as reactor
-
-class EchoProtocol(reactor.Protocol):
-    '''Echo back all received data.'''
-
-    def data_received(self, data):
-        self.write_buffer.extend(data)
-        return 'write_pending'
-
-    def write_ready(self):
-        if self.write_buffer:
-            written = self.write(bytes(self.write_buffer))
-            del self.write_buffer[:written]
-            if self.write_buffer:
-                return 'continue'
-        return 'read_pending'
-
-reactor.set_protocol_factory(EchoProtocol)
-">>).
-
--define(HTTP_PROTOCOL, <<"
-import erlang.reactor as reactor
-
-class SimpleHTTPProtocol(reactor.Protocol):
-    '''Simple HTTP/1.1 response protocol.'''
-
-    def __init__(self):
-        super().__init__()
-        self.request_data = bytearray()
-
-    def data_received(self, data):
-        self.request_data.extend(data)
-        # Check for end of HTTP headers
-        if b'\\r\\n\\r\\n' in self.request_data:
-            # Build simple response
-            body = b'Hello from subinterpreter!'
-            response = (
-                b'HTTP/1.1 200 OK\\r\\n'
-                b'Content-Type: text/plain\\r\\n'
-                b'Content-Length: ' + str(len(body)).encode() + b'\\r\\n'
-                b'Connection: close\\r\\n'
-                b'\\r\\n' + body
-            )
-            self.write_buffer.extend(response)
-            return 'write_pending'
-        return 'continue'
-
-    def write_ready(self):
-        if self.write_buffer:
-            written = self.write(bytes(self.write_buffer))
-            del self.write_buffer[:written]
-            if self.write_buffer:
-                return 'continue'
-        return 'close'
-
-reactor.set_protocol_factory(SimpleHTTPProtocol)
-">>).
-
-%% @doc Start with default settings (4 contexts, 2 echo + 2 http).
-start() ->
-    start(#{contexts => 4, port => 8080}).
-
-%% @doc Start reactor contexts.
-%%
-%% Options:
-%%   contexts - Number of contexts to create (default: 4)
-%%   port - Port to listen on (default: 8080)
-%%
-%% Returns: {ok, #{echo => [Pid], http => [Pid], acceptor => Pid, socket => Socket}}
-start(Opts) ->
-    NumContexts = maps:get(contexts, Opts, 4),
-    Port = maps:get(port, Opts, 8080),
-    HalfContexts = NumContexts div 2,
-
-    %% Start echo protocol contexts
-    EchoContexts = [begin
-        {ok, Pid} = py_reactor_context:start_link(N, subinterp, #{
-            max_connections => 100,
-            setup_code => ?ECHO_PROTOCOL
-        }),
-        Pid
-    end || N <- lists:seq(1, HalfContexts)],
-
-    %% Start HTTP protocol contexts
-    HttpContexts = [begin
-        {ok, Pid} = py_reactor_context:start_link(N, subinterp, #{
-            max_connections => 100,
-            setup_code => ?HTTP_PROTOCOL
-        }),
-        Pid
-    end || N <- lists:seq(HalfContexts + 1, NumContexts)],
-
-    AllContexts = EchoContexts ++ HttpContexts,
-
-    %% Start acceptor that routes to contexts
-    {ok, ListenSock} = gen_tcp:listen(Port, [
-        binary,
-        {active, false},
-        {reuseaddr, true},
-        {backlog, 128}
-    ]),
-    Acceptor = spawn_link(fun() -> accept_loop(ListenSock, AllContexts, 1) end),
-
-    io:format("Reactor started on port ~p with ~p contexts~n", [Port, NumContexts]),
-    io:format("  Echo contexts: ~p~n", [EchoContexts]),
-    io:format("  HTTP contexts: ~p~n", [HttpContexts]),
-
-    {ok, #{
-        echo => EchoContexts,
-        http => HttpContexts,
-        acceptor => Acceptor,
-        socket => ListenSock
-    }}.
-
-%% @doc Stop the reactor server.
-stop(#{acceptor := Acceptor, socket := Socket, echo := Echo, http := Http}) ->
-    exit(Acceptor, shutdown),
-    gen_tcp:close(Socket),
-    [py_reactor_context:stop(Pid) || Pid <- Echo ++ Http],
-    ok.
-
-%% @private Simple round-robin acceptor
-accept_loop(ListenSock, Contexts, Idx) ->
-    case gen_tcp:accept(ListenSock) of
-        {ok, Socket} ->
-            %% Get FD and hand off to reactor context
-            {ok, Fd} = prim_inet:getfd(Socket),
-            Ctx = lists:nth(Idx, Contexts),
-            ClientInfo = get_client_info(Socket),
-            py_reactor_context:handoff(Ctx, Fd, ClientInfo),
-
-            %% Round-robin to next context
-            NextIdx = (Idx rem length(Contexts)) + 1,
-            accept_loop(ListenSock, Contexts, NextIdx);
-
-        {error, closed} ->
-            ok
-    end.
-
-get_client_info(Socket) ->
-    case inet:peername(Socket) of
-        {ok, {Addr, Port}} ->
-            #{addr => inet:ntoa(Addr), port => Port, type => tcp};
-        _ ->
-            #{type => tcp}
-    end.
diff --git a/src/erlang_python.app.src b/src/erlang_python.app.src
index ae6e135..6534c1d 100644
--- a/src/erlang_python.app.src
+++ b/src/erlang_python.app.src
@@ -7,11 +7,7 @@
         kernel,
         stdlib
     ]},
-    {env, [
-        {num_workers, 4},
-        {python_path, ""},
-        {worker_timeout, 30000}
-    ]},
+    {env, []},
     {modules, []},
     {licenses, ["Apache-2.0"]},
     {links, [{"GitHub", "https://github.com/benoitc/erlang-python"}]},
diff --git a/src/erlang_python_sup.erl b/src/erlang_python_sup.erl
index d1ef01a..d6471be 100644
--- a/src/erlang_python_sup.erl
+++ b/src/erlang_python_sup.erl
@@ -19,7 +19,7 @@
 %%%   <li>py_callback - Callback registry for Python to Erlang calls</li>
 %%%   <li>py_state - Shared state storage accessible from Python</li>
 %%%   <li>py_context_sup - Supervisor for process-per-context workers</li>
-%%%   <li>py_async_pool - Worker pool for asyncio coroutines</li>
+%%%   <li>py_event_loop / py_event_loop_pool - Asyncio dispatch</li>
 %%% </ul>
 %%% @private
 -module(erlang_python_sup).
@@ -35,14 +35,9 @@ init([]) ->
     NumContexts = application:get_env(erlang_python, num_contexts,
                                        erlang:system_info(schedulers)),
     ContextMode = application:get_env(erlang_python, context_mode, worker),
-    NumAsyncWorkers = application:get_env(erlang_python, num_async_workers, 2),
-
-    %% Default executors: 4 (benchmarked sweet spot for most workloads)
-    %% Can be overridden via {erlang_python, [{num_executors, N}]}
-    NumExecutors = application:get_env(erlang_python, num_executors, 4),
 
     %% Initialize Python runtime first
-    ok = py_nif:init(#{num_executors => NumExecutors}),
+    ok = py_nif:init(#{}),
 
     %% Initialize the semaphore ETS table for rate limiting
     ok = py_semaphore:init(),
@@ -123,16 +118,6 @@ init([]) ->
         modules => [py_context_init]
     },
 
-    %% Async worker pool (for asyncio coroutines)
-    AsyncPoolSpec = #{
-        id => py_async_pool,
-        start => {py_async_pool, start_link, [NumAsyncWorkers]},
-        restart => permanent,
-        shutdown => 5000,
-        type => worker,
-        modules => [py_async_pool]
-    },
-
     %% Event worker registry (for scalable I/O model)
     WorkerRegistrySpec = #{
         id => py_event_worker_registry,
@@ -176,7 +161,7 @@ init([]) ->
     Children = [CallbackSpec, ThreadHandlerSpec, LoggerSpec, TracerSpec,
                 ContextSupSpec, ContextRouterInitSpec,
                 WorkerRegistrySpec, WorkerSupSpec, EventLoopSpec,
-                EventLoopPoolSpec, AsyncPoolSpec],
+                EventLoopPoolSpec],
 
     {ok, {
         #{strategy => one_for_all, intensity => 5, period => 10},
diff --git a/src/py.erl b/src/py.erl
index 8f719a8..4c4b5d9 100644
--- a/src/py.erl
+++ b/src/py.erl
@@ -75,8 +75,7 @@
     async_await/1,
     async_await/2,
     async_gather/1,
-    async_stream/3,
-    async_stream/4,
+    async_gather/2,
     %% Parallel execution (Python 3.12+ sub-interpreters)
     parallel/1,
     subinterp_supported/0,
@@ -319,7 +318,7 @@ eval(Code, Locals, Timeout) ->
 %%
 %% In worker mode, the code runs in a process-local Python environment.
 %% Variables defined via exec persist within the calling Erlang process.
-%% In subinterpreter mode, each context has its own isolated namespace.
+%% In owngil mode, each context has its own isolated namespace.
 -spec exec(string() | binary()) -> ok | {error, term()}.
 exec(Code) ->
     %% Always route through context process - it handles callbacks inline using
@@ -720,28 +719,33 @@ async_call(Module, Func, Args) ->
 %% @doc Call a Python async function with keyword arguments.
 -spec async_call(py_module(), py_func(), py_args(), py_kwargs()) -> py_ref().
 async_call(Module, Func, Args, Kwargs) ->
-    Ref = make_ref(),
-    py_async_pool:request({async_call, Ref, self(), Module, Func, Args, Kwargs}),
-    Ref.
+    py_event_loop:create_task(Module, Func, Args, Kwargs).
 
 %% @doc Wait for an async call to complete.
 -spec async_await(py_ref()) -> py_result().
 async_await(Ref) ->
-    await(Ref, ?DEFAULT_TIMEOUT).
+    async_await(Ref, ?DEFAULT_TIMEOUT).
 
 %% @doc Wait for an async call with timeout.
-%% Note: Identical to await/2 - provided for API symmetry with async_call.
 -spec async_await(py_ref(), timeout()) -> py_result().
 async_await(Ref, Timeout) ->
-    await(Ref, Timeout).
+    py_event_loop:await(Ref, Timeout).
 
-%% @doc Execute multiple async calls concurrently using asyncio.gather.
-%% Takes a list of {Module, Func, Args} tuples and executes them all
-%% concurrently, returning when all are complete.
+%% @doc Execute multiple async Python calls concurrently.
+%%
+%% Each call is submitted to the event loop independently, so they run
+%% concurrently. Results are collected in the order of the input list.
+%% Sync functions are accepted and resolve immediately (the event loop
+%% short-circuits non-coroutines).
+%%
+%% Returns `{ok, [Result1, Result2, ...]}' when every call succeeds, where
+%% each `ResultN' is the value returned by the corresponding call.
+%% Returns `{error, {gather_failed, Errors}}' if any call fails, where
+%% `Errors' is a list of `{Index, Reason}' tuples for each failure.
 %%
 %% Example:
 %% ```
-%% {ok, Results} = py:async_gather([
+%% {ok, [R1, R2, R3]} = py:async_gather([
 %%     {aiohttp, get, [Url1]},
 %%     {aiohttp, get, [Url2]},
 %%     {aiohttp, get, [Url3]}
@@ -749,37 +753,21 @@ async_await(Ref, Timeout) ->
 %% '''
 -spec async_gather([{py_module(), py_func(), py_args()}]) -> py_result().
 async_gather(Calls) ->
-    Ref = make_ref(),
-    py_async_pool:request({async_gather, Ref, self(), Calls}),
-    async_await(Ref, ?DEFAULT_TIMEOUT).
-
-%% @doc Stream results from a Python async generator.
-%% Returns a list of all yielded values.
--spec async_stream(py_module(), py_func(), py_args()) -> py_result().
-async_stream(Module, Func, Args) ->
-    async_stream(Module, Func, Args, #{}).
-
-%% @doc Stream results from a Python async generator with kwargs.
--spec async_stream(py_module(), py_func(), py_args(), py_kwargs()) -> py_result().
-async_stream(Module, Func, Args, Kwargs) ->
-    Ref = make_ref(),
-    py_async_pool:request({async_stream, Ref, self(), Module, Func, Args, Kwargs}),
-    async_stream_collect(Ref, []).
-
-%% @private
-async_stream_collect(Ref, Acc) ->
-    receive
-        {py_response, Ref, {ok, Result}} ->
-            %% Got final result (async generator collected)
-            {ok, Result};
-        {py_chunk, Ref, Chunk} ->
-            async_stream_collect(Ref, [Chunk | Acc]);
-        {py_end, Ref} ->
-            {ok, lists:reverse(Acc)};
-        {py_error, Ref, Error} ->
-            {error, Error}
-    after ?DEFAULT_TIMEOUT ->
-        {error, timeout}
+    async_gather(Calls, ?DEFAULT_TIMEOUT).
+
+%% @doc Like async_gather/1, but with an explicit timeout applied to each
+%% await in turn (the total wait can therefore exceed `Timeout').
+-spec async_gather([{py_module(), py_func(), py_args()}], timeout()) -> py_result().
+async_gather(Calls, Timeout) when is_list(Calls) ->
+    %% A filtering generator would silently drop malformed specs; map crashes.
+    Refs = lists:map(fun({M, F, A}) -> async_call(M, F, A) end, Calls),
+    Results = [async_await(R, Timeout) || R <- Refs],
+    Indexed = lists:zip(lists:seq(1, length(Results)), Results),
+    case [{Idx, Reason} || {Idx, {error, Reason}} <- Indexed] of
+        [] ->
+            {ok, [V || {ok, V} <- Results]};
+        Errors ->
+            {error, {gather_failed, Errors}}
     end.
 
 %%% ============================================================================
@@ -1485,7 +1473,7 @@ start_contexts() ->
 %%
 %% Options:
 %% - `contexts' - Number of contexts to create (default: number of schedulers)
-%% - `mode' - Context mode: `worker', `subinterp', or `owngil' (default: `worker')
+%% - `mode' - Context mode: `worker' or `owngil' (default: `worker')
 %%
 %% @param Opts Start options
 %% @returns {ok, [Context]} | {error, Reason}
diff --git a/src/py_async_pool.erl b/src/py_async_pool.erl
deleted file mode 100644
index 11bf5d7..0000000
--- a/src/py_async_pool.erl
+++ /dev/null
@@ -1,167 +0,0 @@
-%% Copyright 2026 Benoit Chesneau
-%%
-%% Licensed under the Apache License, Version 2.0 (the "License");
-%% you may not use this file except in compliance with the License.
-%% You may obtain a copy of the License at
-%%
-%%     http://www.apache.org/licenses/LICENSE-2.0
-%%
-%% Unless required by applicable law or agreed to in writing, software
-%% distributed under the License is distributed on an "AS IS" BASIS,
-%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-%% See the License for the specific language governing permissions and
-%% limitations under the License.
-
-%%% @doc Pool manager for async Python execution using event loops.
-%%%
-%%% This module provides an async request pool that delegates to the event loop
-%%% pool for efficient coroutine execution. It replaces the pthread+usleep
-%%% polling model with event-driven execution using enif_select and erlang.send().
-%%%
-%%% The pool maintains API compatibility with the previous pthread-based
-%%% implementation while providing significant performance improvements.
-%%%
-%%% @private
--module(py_async_pool).
--behaviour(gen_server).
-
--export([
-    start_link/0,
-    start_link/1,
-    request/1,
-    get_stats/0
-]).
-
--export([
-    init/1,
-    handle_call/3,
-    handle_cast/2,
-    handle_info/2,
-    terminate/2
-]).
-
--record(state, {
-    pending :: non_neg_integer(),
-    supported :: boolean()
-}).
-
-%%% ============================================================================
-%%% API
-%%% ============================================================================
-
--spec start_link() -> {ok, pid()} | {error, term()}.
-start_link() ->
-    start_link(1).
-
--spec start_link(pos_integer()) -> {ok, pid()} | {error, term()}.
-start_link(_NumWorkers) ->
-    %% NumWorkers is now ignored - we use the event loop pool instead
-    gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).
-
-%% @doc Submit an async request to be executed by the event loop pool.
--spec request(term()) -> ok.
-request(Request) ->
-    gen_server:cast(?MODULE, {request, Request}).
-
-%% @doc Get pool statistics.
--spec get_stats() -> map().
-get_stats() ->
-    gen_server:call(?MODULE, get_stats).
-
-%%% ============================================================================
-%%% gen_server callbacks
-%%% ============================================================================
-
-init([]) ->
-    process_flag(trap_exit, true),
-    %% Check if event loop pool is available
-    case py_event_loop:get_loop() of
-        {ok, _LoopRef} ->
-            {ok, #state{pending = 0, supported = true}};
-        {error, _} ->
-            {ok, #state{pending = 0, supported = false}}
-    end.
-
-handle_call(get_stats, _From, State) ->
-    Stats = #{
-        pending_requests => State#state.pending,
-        supported => State#state.supported,
-        backend => event_loop
-    },
-    {reply, Stats, State};
-
-handle_call(_Request, _From, State) ->
-    {reply, {error, unknown_request}, State}.
-
-handle_cast({request, Request}, #state{supported = false} = State) ->
-    {Ref, Caller, _Type} = extract_ref_caller(Request),
-    Caller ! {py_error, Ref, async_not_supported},
-    {noreply, State};
-
-handle_cast({request, Request}, State) ->
-    case transform_request(Request) of
-        {ok, LoopRequest} ->
-            case py_event_loop:get_loop() of
-                {ok, LoopRef} ->
-                    case py_event_loop:run_async(LoopRef, LoopRequest) of
-                        ok ->
-                            {noreply, State#state{pending = State#state.pending + 1}};
-                        {error, Reason} ->
-                            {Ref, Caller, _} = extract_ref_caller(Request),
-                            Caller ! {py_error, Ref, Reason},
-                            {noreply, State}
-                    end;
-                {error, Reason} ->
-                    {Ref, Caller, _} = extract_ref_caller(Request),
-                    Caller ! {py_error, Ref, Reason},
-                    {noreply, State}
-            end;
-        {error, Reason} ->
-            {Ref, Caller, _} = extract_ref_caller(Request),
-            Caller ! {py_error, Ref, Reason},
-            {noreply, State}
-    end;
-
-handle_cast(_Msg, State) ->
-    {noreply, State}.
-
-handle_info({async_result, _Ref, _Result}, State) ->
-    %% Result was sent directly to caller via erlang.send()
-    %% We just track pending count
-    {noreply, State#state{pending = max(0, State#state.pending - 1)}};
-
-handle_info(_Info, State) ->
-    {noreply, State}.
-
-terminate(_Reason, _State) ->
-    ok.
-
-%%% ============================================================================
-%%% Internal functions
-%%% ============================================================================
-
-%% @doc Transform the legacy request format to the new event loop format.
-transform_request({async_call, Ref, Caller, Module, Func, Args, Kwargs}) ->
-    {ok, #{
-        ref => Ref,
-        caller => Caller,
-        module => Module,
-        func => Func,
-        args => Args,
-        kwargs => Kwargs
-    }};
-transform_request({async_gather, Ref, Caller, Calls}) ->
-    %% For gather, we need to wrap in a special gather coroutine
-    %% For now, return an error - gather needs special handling
-    {error, {gather_not_implemented, Ref, Caller, Calls}};
-transform_request({async_stream, Ref, Caller, Module, Func, Args, Kwargs}) ->
-    %% For stream, we need async generator support
-    %% For now, return an error - stream needs special handling
-    {error, {stream_not_implemented, Ref, Caller, Module, Func, Args, Kwargs}};
-transform_request(Other) ->
-    {error, {unknown_request_type, Other}}.
-
-%% @doc Extract ref and caller from different request types.
-extract_ref_caller({async_call, Ref, Caller, _, _, _, _}) -> {Ref, Caller, async_call};
-extract_ref_caller({async_gather, Ref, Caller, _}) -> {Ref, Caller, async_gather};
-extract_ref_caller({async_stream, Ref, Caller, _, _, _, _}) -> {Ref, Caller, async_stream}.
diff --git a/src/py_context.erl b/src/py_context.erl
index 5c5dbad..a879f30 100644
--- a/src/py_context.erl
+++ b/src/py_context.erl
@@ -125,7 +125,7 @@ stop(Ctx) when is_pid(Ctx) ->
 %% @doc Create a new context with options map.
 %%
 %% Options:
-%% - `mode' - Context mode (worker | subinterp | owngil), default: worker
+%% - `mode' - Context mode (worker | owngil), default: worker
 %%
 %% @param Opts Options map
 %% @returns {ok, Pid} | {error, Reason}
diff --git a/src/py_context_router.erl b/src/py_context_router.erl
index f27814a..67fea1f 100644
--- a/src/py_context_router.erl
+++ b/src/py_context_router.erl
@@ -135,7 +135,7 @@ start() ->
 %%
 %% Options:
 %% - `contexts' - Number of contexts to create (default: number of schedulers)
-%% - `mode' - Context mode: `worker', `subinterp', or `owngil' (default: `worker')
+%% - `mode' - Context mode: `worker' or `owngil' (default: `worker')
 %%
 %% @param Opts Start options
 %% @returns {ok, [Context]} | {error, Reason}
@@ -279,7 +279,7 @@ start_pool(Pool, Size) ->
 %%
 %% @param Pool Pool name (default, io, or custom)
 %% @param Size Number of contexts in the pool
-%% @param Mode Context mode (worker, subinterp, owngil)
+%% @param Mode Context mode (worker, owngil)
 %% @returns {ok, [Context]} | {error, Reason}
 -spec start_pool(pool_name(), pos_integer(), py_context:context_mode()) ->
     {ok, [pid()]} | {error, term()}.
diff --git a/src/py_context_sup.erl b/src/py_context_sup.erl
index 97001ac..f633b01 100644
--- a/src/py_context_sup.erl
+++ b/src/py_context_sup.erl
@@ -44,7 +44,7 @@ start_link() ->
 %% @doc Start a new py_context under this supervisor.
 %%
 %% @param Id Unique identifier for the context (integer or {Pool, N} tuple)
-%% @param Mode Context mode (worker | subinterp | owngil)
+%% @param Mode Context mode (worker | owngil)
 %% @returns {ok, Pid} | {error, Reason}
 -spec start_context(term(), py_context:context_mode()) ->
     {ok, pid()} | {error, term()}.
diff --git a/src/py_nif.erl b/src/py_nif.erl
index 42084b0..ee0e04b 100644
--- a/src/py_nif.erl
+++ b/src/py_nif.erl
@@ -659,13 +659,12 @@ owngil_apply_paths(_WorkerId, _HandleId, _Paths) ->
 %% @doc Get Python capability (internal use).
 %% Returns the detected Python runtime capability:
 %% - free_threaded: Python 3.13+ with no GIL (Py_GIL_DISABLED)
-%% - subinterp: Python 3.12+ with per-interpreter GIL support
-%% - multi_executor: Traditional Python with executor threads
+%% - gil: Conventional GIL build (any other supported version)
 %%
 %% For public execution mode, use py:execution_mode/0 which returns
 %% `worker | owngil' based on the application configuration.
 %% @private
--spec execution_mode() -> free_threaded | subinterp | multi_executor.
+-spec execution_mode() -> free_threaded | gil.
 execution_mode() ->
     ?NIF_STUB.
 
diff --git a/src/py_reactor_context.erl b/src/py_reactor_context.erl
index 6576eee..0261f8e 100644
--- a/src/py_reactor_context.erl
+++ b/src/py_reactor_context.erl
@@ -79,7 +79,7 @@
 %% @doc Start a new py_reactor_context process.
 %%
 %% @param Id Unique identifier for this context
-%% @param Mode Context mode (worker, subinterp, owngil)
+%% @param Mode Context mode (worker, owngil)
 %% @returns {ok, Pid} | {error, Reason}
 -spec start_link(pos_integer(), atom()) -> {ok, pid()} | {error, term()}.
 start_link(Id, Mode) ->
@@ -95,7 +95,7 @@ start_link(Id, Mode) ->
 %%               (useful for setting up protocol factory)
 %%
 %% @param Id Unique identifier for this context
-%% @param Mode Context mode (worker, subinterp, owngil)
+%% @param Mode Context mode (worker, owngil)
 %% @param Opts Options map
 %% @returns {ok, Pid} | {error, Reason}
 -spec start_link(pos_integer(), atom(), map()) -> {ok, pid()} | {error, term()}.
diff --git a/test/py_SUITE.erl b/test/py_SUITE.erl
index d3e4f5d..5df9d98 100644
--- a/test/py_SUITE.erl
+++ b/test/py_SUITE.erl
@@ -576,40 +576,21 @@ test_erlang_attr_syntax(_Config) ->
     ok.
 
 test_asyncio_call(_Config) ->
-    %% Test async call to asyncio coroutine
-    %% The async pool runs async functions in a background asyncio event loop
-    Ref = py:async_call('__main__', 'eval', [<<"1 + 1">>]),
+    %% Sync function dispatched through the event loop short-circuits to
+    %% the result, so async_call/async_await round-trips end-to-end.
+    Ref = py:async_call(math, sqrt, [16]),
     true = is_reference(Ref),
-
-    %% We may not get a result for simple eval since it's not a real coroutine
-    %% Just verify the call mechanism works
-    _Result = py:async_await(Ref, 5000),
+    {ok, 4.0} = py:async_await(Ref, 5000),
     ok.
 
 test_asyncio_gather(_Config) ->
-    %% Test gathering multiple async calls
-    %% This tests the async_gather functionality
     Calls = [
         {math, sqrt, [16]},
         {math, sqrt, [25]},
         {math, sqrt, [36]}
     ],
-    Result = py:async_gather(Calls),
-    ct:pal("async_gather result: ~p~n", [Result]),
-
-    %% Verify the result structure
-    case Result of
-        {ok, Results} when is_list(Results) ->
-            %% Should have 3 results
-            3 = length(Results),
-            %% Verify the values if they're successful
-            ct:pal("Gathered results: ~p~n", [Results]),
-            ok;
-        {error, Reason} ->
-            %% Async pool might not be fully functional in test env
-            ct:pal("async_gather returned error (may be expected): ~p~n", [Reason]),
-            ok
-    end.
+    {ok, [4.0, 5.0, 6.0]} = py:async_gather(Calls),
+    ok.
 
 test_subinterp_supported(_Config) ->
     %% Test that subinterp_supported returns a boolean
diff --git a/test/py_thread_affinity_SUITE.erl b/test/py_thread_affinity_SUITE.erl
new file mode 100644
index 0000000..5dce505
--- /dev/null
+++ b/test/py_thread_affinity_SUITE.erl
@@ -0,0 +1,160 @@
+%%% @doc Thread-affinity invariants for the per-context worker model.
+%%%
+%%% After the v3.0 simplification each context owns a dedicated pthread
+%%% that handles all of its Python operations. These tests assert the
+%%% invariants that motivated the rework so we don't regress numpy /
+%%% torch / tensorflow thread-local state safety:
+%%%
+%%%   - exec / eval / call on the same context all share one OS thread
+%%%   - calls from N different Erlang processes targeting the same
+%%%     context all converge on that context's worker thread
+%%%   - distinct contexts get distinct worker threads
+%%%   - the same invariants hold under owngil mode when supported
+-module(py_thread_affinity_SUITE).
+
+-include_lib("common_test/include/ct.hrl").
+
+-export([
+    all/0,
+    init_per_suite/1,
+    end_per_suite/1
+]).
+
+-export([
+    exec_eval_call_share_thread/1,
+    multi_process_share_context_thread/1,
+    distinct_contexts_distinct_threads/1,
+    owngil_thread_affinity/1
+]).
+
+all() ->
+    [
+        exec_eval_call_share_thread,
+        multi_process_share_context_thread,
+        distinct_contexts_distinct_threads,
+        owngil_thread_affinity
+    ].
+
+init_per_suite(Config) ->
+    {ok, _} = application:ensure_all_started(erlang_python),
+    {ok, _} = py:start_contexts(),
+    Config.
+
+end_per_suite(_Config) ->
+    ok = application:stop(erlang_python),
+    ok.
+
+%%% ============================================================================
+%%% Helpers
+%%% ============================================================================
+
+native_id(Ctx) ->
+    case py_context:eval(Ctx, <<"_pta_get_tid()">>, #{}) of
+        {ok, Tid} -> Tid;
+        Other -> ct:fail({native_id_failed, Other})
+    end.
+
+%% Define `_pta_get_tid()' in Ctx; it returns threading.get_native_id().
+setup_helper(Ctx) ->
+    Source = <<"import threading\n"
+               "def _pta_get_tid():\n"
+               "    return threading.get_native_id()\n">>,
+    ok = py_context:exec(Ctx, Source).
+
+%%% ============================================================================
+%%% Tests
+%%% ============================================================================
+
+%% exec, eval and call on context 1 must all report the same thread id.
+exec_eval_call_share_thread(_Config) ->
+    Ctx = py:context(1),
+    setup_helper(Ctx),
+    %% exec only returns `ok', so stash its thread id in a global for eval.
+    Stash = <<"import threading\n"
+              "_pta_exec_tid = threading.get_native_id()\n">>,
+    ok = py_context:exec(Ctx, Stash),
+    {ok, FromExec} = py_context:eval(Ctx, <<"_pta_exec_tid">>, #{}),
+    FromEval = native_id(Ctx),
+    {ok, FromCall} = py_context:call(Ctx, '__main__', '_pta_get_tid', []),
+    ct:pal("exec=~p eval=~p call=~p", [FromExec, FromEval, FromCall]),
+    FromExec = FromEval,
+    FromEval = FromCall,
+    ok.
+
+%% Eight concurrent callers of context 1 must all observe one integer tid.
+multi_process_share_context_thread(_Config) ->
+    Ctx = py:context(1),
+    setup_helper(Ctx),
+    Parent = self(),
+    %% Linked, so a worker's ct:fail/1 takes the test case down with the real
+    %% reason instead of surfacing 5 s later as a bare `timeout'.
+    Report = fun() -> Parent ! {tid, self(), native_id(Ctx)} end,
+    Pids = [spawn_link(Report) || _ <- lists:seq(1, 8)],
+    Tids = [receive {tid, Pid, T} -> T after 5000 -> ct:fail(timeout) end
+            || Pid <- Pids],
+    ct:pal("tids = ~p", [Tids]),
+    [Single] = lists:usort(Tids),
+    true = is_integer(Single),
+    ok.
+
+%% Contexts 1 and 2 must report different thread ids; skips if fewer exist.
+distinct_contexts_distinct_threads(_Config) ->
+    Count = py_context_router:num_contexts(),
+    if
+        Count >= 2 ->
+            [First, Second] = [py:context(I) || I <- [1, 2]],
+            setup_helper(First),
+            setup_helper(Second),
+            TidA = native_id(First),
+            TidB = native_id(Second),
+            ct:pal("ctx1=~p ctx2=~p", [TidA, TidB]),
+            true = TidA =/= TidB,
+            ok;
+        true -> {skip, "needs at least 2 contexts"}
+    end.
+
+%% Under owngil mode, exec/eval/call and four concurrent callers must all
+%% report one thread id. Skipped when subinterpreters or owngil are
+%% unavailable; a created context is stopped through try ... after.
+owngil_thread_affinity(_Config) ->
+    Created = case py:subinterp_supported() of
+                  true -> py_context:new(#{mode => owngil});
+                  false -> no_subinterp
+              end,
+    case Created of
+        no_subinterp ->
+            {skip, "subinterpreters not supported on this Python build"};
+        {error, owngil_requires_python314} ->
+            {skip, "owngil requires Python 3.14+"};
+        {error, Reason} ->
+            ct:fail({owngil_create_failed, Reason});
+        {ok, Ctx} ->
+            try
+                setup_helper(Ctx),
+                %% exec only returns `ok'; stash its tid in a global for eval.
+                Stash = <<"import threading\n"
+                          "_pta_exec_tid = threading.get_native_id()\n">>,
+                ok = py_context:exec(Ctx, Stash),
+                {ok, FromExec} = py_context:eval(Ctx, <<"_pta_exec_tid">>, #{}),
+                FromEval = native_id(Ctx),
+                {ok, FromCall} =
+                    py_context:call(Ctx, '__main__', '_pta_get_tid', []),
+                ct:pal("owngil exec=~p eval=~p call=~p",
+                       [FromExec, FromEval, FromCall]),
+                FromExec = FromEval,
+                FromEval = FromCall,
+
+                %% Four extra processes hit the same context concurrently.
+                Parent = self(),
+                Report = fun() -> Parent ! {tid, self(), native_id(Ctx)} end,
+                Pids = [spawn(Report) || _ <- lists:seq(1, 4)],
+                Tids = [receive {tid, Pid, T} -> T
+                        after 5000 -> ct:fail(timeout)
+                        end || Pid <- Pids],
+                ct:pal("owngil multi-proc tids = ~p", [Tids]),
+                [FromExec] = lists:usort(Tids),
+                ok
+            after
+                py_context:stop(Ctx)
+            end
+    end.

From e15d24f505598ef009fd5bf78ff878394e24e2b4 Mon Sep 17 00:00:00 2001
From: Benoit Chesneau <bchesneau@gmail.com>
Date: Fri, 1 May 2026 17:08:07 +0200
Subject: [PATCH 03/17] Remove py:subinterp_* explicit handle API

py_context:new(#{mode => owngil}) covers the same OWN_GIL parallelism
with OTP supervision and automatic cleanup, so the older handle-based
surface (subinterp_create/destroy/call/eval/exec/cast/async_call/await
+ subinterp_pool_*) is redundant. Drop the public API, the wrapper NIFs,
and the dead reactor example that called nonexistent subinterp_reactor_*
functions. Internal subinterp_thread_pool_* (used by py_event_loop_pool's
OWN_GIL backend) and the capability probe py:subinterp_supported/0 stay.

-1390/+27 LOC across 8 files; py_subinterp_SUITE deleted.
---
 CHANGELOG.md                        |  13 +
 c_src/py_nif.c                      | 596 +---------------------------
 c_src/py_nif.h                      |  39 --
 examples/README.md                  |   6 -
 examples/reactor_owngil_example.erl | 163 --------
 src/py.erl                          | 139 +------
 src/py_nif.erl                      | 105 +----
 test/py_subinterp_SUITE.erl         | 356 -----------------
 8 files changed, 27 insertions(+), 1390 deletions(-)
 delete mode 100644 examples/reactor_owngil_example.erl
 delete mode 100644 test/py_subinterp_SUITE.erl

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5092c9d..7f3c3a5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -57,6 +57,19 @@
 - `context_dispatch_call/eval/exec` functions (dead code)
 - References to `PY_MODE_MULTI_EXECUTOR` in context operations
 - `py_async_pool` legacy gen_server (unused after async API rewire)
+- **Explicit `py:subinterp_*` handle API removed.** `py:subinterp_create/0`,
+  `subinterp_destroy/1`, `subinterp_call/4,5`, `subinterp_eval/2,3`,
+  `subinterp_exec/2`, `subinterp_cast/4`, `subinterp_async_call/4`,
+  `subinterp_await/1,2`, and `subinterp_pool_*` are all gone. Use
+  `py_context:new(#{mode => owngil})` instead — it gives the same
+  parallelism with OTP supervision and automatic cleanup.
+  `py:subinterp_supported/0` (capability probe) and `py:parallel/1`
+  (which routes through the context API) stay.
+- Internal `py_execution_mode_t` collapsed from 3 values to 2 (`free_threaded`
+  / `gil`); `py_nif:execution_mode/0` returns `free_threaded | gil` instead
+  of the old `free_threaded | subinterp | multi_executor`.
+- `examples/reactor_owngil_example.erl` deleted (called nonexistent
+  `py:subinterp_reactor_*` functions; pre-existing breakage).
 
 ## 2.3.1 (2026-04-01)
 
diff --git a/c_src/py_nif.c b/c_src/py_nif.c
index 239c15c..bcfb671 100644
--- a/c_src/py_nif.c
+++ b/c_src/py_nif.c
@@ -50,9 +50,6 @@ ErlNifResourceType *WORKER_RESOURCE_TYPE = NULL;
 ErlNifResourceType *PYOBJ_RESOURCE_TYPE = NULL;
 /* ASYNC_WORKER_RESOURCE_TYPE removed - async workers replaced by event loop model */
 ErlNifResourceType *SUSPENDED_STATE_RESOURCE_TYPE = NULL;
-#ifdef HAVE_SUBINTERPRETERS
-ErlNifResourceType *SUBINTERP_WORKER_RESOURCE_TYPE = NULL;
-#endif
 
 /* Process-per-context resource type (no mutex) */
 ErlNifResourceType *PY_CONTEXT_RESOURCE_TYPE = NULL;
@@ -337,35 +334,9 @@ static void pyobj_destructor(ErlNifEnv *env, void *obj) {
     atomic_fetch_add(&g_counters.pyobj_destroyed, 1);
 }
 
-/* async_worker_destructor removed - async workers replaced by event loop model */
-
-#ifdef HAVE_SUBINTERPRETERS
-static void subinterp_worker_destructor(ErlNifEnv *env, void *obj) {
-    (void)env;
-    py_subinterp_worker_t *worker = (py_subinterp_worker_t *)obj;
-
-    /* For OWN_GIL subinterpreters, we cannot safely acquire the GIL from the
-     * GC thread (destructor may run on any thread). PyGILState_Ensure only
-     * works for the main interpreter, and PyThreadState_Swap doesn't actually
-     * acquire the GIL.
-     *
-     * If the user didn't call the explicit destroy function, the subinterpreter
-     * leaks. This is a known limitation - users must call destroy explicitly. */
-    if (worker->tstate != NULL && runtime_is_running()) {
-#ifdef DEBUG
-        fprintf(stderr, "Warning: subinterp_worker leaked - not destroyed "
-                "via explicit destroy. Use subinterp_worker_destroy/1.\n");
-#endif
-        /* Skip Python cleanup - we can't safely acquire the subinterpreter's GIL */
-        worker->tstate = NULL;
-        worker->globals = NULL;
-        worker->locals = NULL;
-    }
-
-    /* Destroy the mutex */
-    pthread_mutex_destroy(&worker->mutex);
-}
-#endif
+/* async_worker_destructor and subinterp_worker_destructor removed —
+ * async workers replaced by event loop model; subinterp_worker resource
+ * type retired with the explicit handle API. */
 
 /**
  * @brief Destructor for py_context_t (process-per-context)
@@ -1834,312 +1805,6 @@ static ERL_NIF_TERM nif_owngil_supported(ErlNifEnv *env, int argc, const ERL_NIF
 #endif
 }
 
-#ifdef HAVE_SUBINTERPRETERS
-
-static ERL_NIF_TERM nif_subinterp_worker_new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
-    (void)argc;
-    (void)argv;
-
-    if (!runtime_is_running()) {
-        return make_error(env, "python_not_running");
-    }
-
-    py_subinterp_worker_t *worker = enif_alloc_resource(SUBINTERP_WORKER_RESOURCE_TYPE,
-                                                         sizeof(py_subinterp_worker_t));
-    if (worker == NULL) {
-        return make_error(env, "alloc_failed");
-    }
-
-    /* Initialize mutex for thread-safe access */
-    if (pthread_mutex_init(&worker->mutex, NULL) != 0) {
-        enif_release_resource(worker);
-        return make_error(env, "mutex_init_failed");
-    }
-
-    /* Need the main GIL to create sub-interpreter */
-    PyGILState_STATE gstate = PyGILState_Ensure();
-
-    /* Save current thread state so we can restore it after creating sub-interp */
-    PyThreadState *main_tstate = PyThreadState_Get();
-
-    /* Configure sub-interpreter with its own GIL */
-    PyInterpreterConfig config = {
-        .use_main_obmalloc = 0,
-        .allow_fork = 0,
-        .allow_exec = 0,
-        .allow_threads = 1,
-        .allow_daemon_threads = 0,
-        .check_multi_interp_extensions = 1,
-        .gil = PyInterpreterConfig_OWN_GIL,  /* This is the key - own GIL! */
-    };
-
-    PyThreadState *tstate = NULL;
-    PyStatus status = Py_NewInterpreterFromConfig(&tstate, &config);
-
-    if (PyStatus_Exception(status) || tstate == NULL) {
-        /* We're still in main interpreter on error */
-        PyGILState_Release(gstate);
-        enif_release_resource(worker);
-        return make_error(env, "subinterp_create_failed");
-    }
-
-    worker->interp = PyThreadState_GetInterpreter(tstate);
-    worker->tstate = tstate;
-
-    /* Create global/local namespaces in the new interpreter */
-    worker->globals = PyDict_New();
-    worker->locals = PyDict_New();
-
-    /* Import __builtins__ */
-    PyObject *builtins = PyEval_GetBuiltins();
-    PyDict_SetItemString(worker->globals, "__builtins__", builtins);
-
-    /* Initialize event loop for this subinterpreter */
-    if (init_subinterpreter_event_loop(env) < 0) {
-        /* Clean up Python objects before ending interpreter */
-        Py_XDECREF(worker->globals);
-        worker->globals = NULL;
-        Py_XDECREF(worker->locals);
-        worker->locals = NULL;
-        Py_EndInterpreter(tstate);
-        /* Re-acquire main interpreter's GIL after subinterpreter was destroyed */
-        PyEval_RestoreThread(main_tstate);
-        PyGILState_Release(gstate);
-        enif_release_resource(worker);
-        return make_error(env, "event_loop_init_failed");
-    }
-
-    /* Switch back to main interpreter - release subinterp's GIL and acquire main's */
-    PyEval_SaveThread();  /* Release subinterpreter's GIL */
-    PyEval_RestoreThread(main_tstate);  /* Acquire main interpreter's GIL */
-
-    PyGILState_Release(gstate);
-
-    ERL_NIF_TERM result = enif_make_resource(env, worker);
-    enif_release_resource(worker);
-
-    return enif_make_tuple2(env, ATOM_OK, result);
-}
-
-static ERL_NIF_TERM nif_subinterp_worker_destroy(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
-    (void)argc;
-    py_subinterp_worker_t *worker;
-
-    if (!enif_get_resource(env, argv[0], SUBINTERP_WORKER_RESOURCE_TYPE, (void **)&worker)) {
-        return make_error(env, "invalid_worker");
-    }
-
-    if (!runtime_is_running()) {
-        return make_error(env, "python_not_running");
-    }
-
-    /* Lock mutex for thread-safe access */
-    pthread_mutex_lock(&worker->mutex);
-
-    if (worker->tstate != NULL) {
-        /* For subinterpreters with OWN_GIL, directly acquire the subinterpreter's
-         * GIL. We don't use PyGILState_Ensure because that only works for the
-         * main interpreter. */
-        PyEval_RestoreThread(worker->tstate);
-
-        /* Clean up Python objects while holding the subinterpreter's GIL */
-        Py_XDECREF(worker->globals);
-        worker->globals = NULL;
-        Py_XDECREF(worker->locals);
-        worker->locals = NULL;
-
-        /* End the interpreter - this releases its GIL */
-        Py_EndInterpreter(worker->tstate);
-        worker->tstate = NULL;
-    }
-
-    pthread_mutex_unlock(&worker->mutex);
-
-    return ATOM_OK;
-}
-
-static ERL_NIF_TERM nif_subinterp_call(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
-    py_subinterp_worker_t *worker;
-    ErlNifBinary module_bin, func_bin;
-
-    if (!runtime_is_running()) {
-        return make_error(env, "python_not_running");
-    }
-
-    if (!enif_get_resource(env, argv[0], SUBINTERP_WORKER_RESOURCE_TYPE, (void **)&worker)) {
-        return make_error(env, "invalid_worker");
-    }
-    if (!enif_inspect_binary(env, argv[1], &module_bin)) {
-        return make_error(env, "invalid_module");
-    }
-    if (!enif_inspect_binary(env, argv[2], &func_bin)) {
-        return make_error(env, "invalid_func");
-    }
-
-    /* Lock mutex for thread-safe access */
-    pthread_mutex_lock(&worker->mutex);
-
-    /* Enter the sub-interpreter with proper GIL acquisition (safe for OWN_GIL) */
-    PyEval_RestoreThread(worker->tstate);
-
-    char *module_name = binary_to_string(&module_bin);
-    char *func_name = binary_to_string(&func_bin);
-    if (module_name == NULL || func_name == NULL) {
-        enif_free(module_name);
-        enif_free(func_name);
-        PyEval_SaveThread();
-        pthread_mutex_unlock(&worker->mutex);
-        return make_error(env, "alloc_failed");
-    }
-
-    ERL_NIF_TERM result;
-
-    /* Import module */
-    PyObject *module = PyImport_ImportModule(module_name);
-    if (module == NULL) {
-        result = make_py_error(env);
-        goto cleanup;
-    }
-
-    /* Get function */
-    PyObject *func = PyObject_GetAttrString(module, func_name);
-    Py_DECREF(module);
-    if (func == NULL) {
-        result = make_py_error(env);
-        goto cleanup;
-    }
-
-    /* Convert args */
-    unsigned int args_len;
-    if (!enif_get_list_length(env, argv[3], &args_len)) {
-        Py_DECREF(func);
-        result = make_error(env, "invalid_args");
-        goto cleanup;
-    }
-
-    PyObject *args = PyTuple_New(args_len);
-    ERL_NIF_TERM head, tail = argv[3];
-    for (unsigned int i = 0; i < args_len; i++) {
-        enif_get_list_cell(env, tail, &head, &tail);
-        PyObject *arg = term_to_py(env, head);
-        if (arg == NULL) {
-            Py_DECREF(args);
-            Py_DECREF(func);
-            result = make_error(env, "arg_conversion_failed");
-            goto cleanup;
-        }
-        PyTuple_SET_ITEM(args, i, arg);
-    }
-
-    /* Convert kwargs */
-    PyObject *kwargs = NULL;
-    if (argc > 4 && enif_is_map(env, argv[4])) {
-        kwargs = term_to_py(env, argv[4]);
-    }
-
-    /* Call the function */
-    PyObject *py_result = PyObject_Call(func, args, kwargs);
-    Py_DECREF(func);
-    Py_DECREF(args);
-    Py_XDECREF(kwargs);
-
-    if (py_result == NULL) {
-        result = make_py_error(env);
-    } else {
-        ERL_NIF_TERM term_result = py_to_term(env, py_result);
-        Py_DECREF(py_result);
-        result = enif_make_tuple2(env, ATOM_OK, term_result);
-    }
-
-cleanup:
-    enif_free(module_name);
-    enif_free(func_name);
-
-    /* Exit the sub-interpreter with proper GIL release (safe for OWN_GIL) */
-    PyEval_SaveThread();
-
-    /* Unlock mutex */
-    pthread_mutex_unlock(&worker->mutex);
-
-    return result;
-}
-
-static ERL_NIF_TERM nif_parallel_execute(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
-    (void)argc;
-    unsigned int workers_len, calls_len;
-
-    if (!enif_get_list_length(env, argv[0], &workers_len)) {
-        return make_error(env, "invalid_workers_list");
-    }
-    if (!enif_get_list_length(env, argv[1], &calls_len)) {
-        return make_error(env, "invalid_calls_list");
-    }
-    if (workers_len == 0 || calls_len == 0) {
-        return enif_make_tuple2(env, ATOM_OK, enif_make_list(env, 0));
-    }
-    if (workers_len < calls_len) {
-        return make_error(env, "not_enough_workers");
-    }
-
-    ERL_NIF_TERM *results = enif_alloc(sizeof(ERL_NIF_TERM) * calls_len);
-    if (results == NULL) {
-        return make_error(env, "alloc_failed");
-    }
-    ERL_NIF_TERM worker_head, worker_tail = argv[0];
-    ERL_NIF_TERM call_head, call_tail = argv[1];
-
-    for (unsigned int i = 0; i < calls_len; i++) {
-        enif_get_list_cell(env, worker_tail, &worker_head, &worker_tail);
-        enif_get_list_cell(env, call_tail, &call_head, &call_tail);
-
-        int arity;
-        const ERL_NIF_TERM *tuple;
-        if (!enif_get_tuple(env, call_head, &arity, &tuple) || arity < 3) {
-            enif_free(results);
-            return make_error(env, "invalid_call_tuple");
-        }
-
-        /* Build args array for subinterp_call */
-        ERL_NIF_TERM call_args[5] = {worker_head, tuple[0], tuple[1], tuple[2],
-                                      (arity > 3) ? tuple[3] : enif_make_new_map(env)};
-
-        results[i] = nif_subinterp_call(env, 5, call_args);
-    }
-
-    ERL_NIF_TERM result_list = enif_make_list_from_array(env, results, calls_len);
-    enif_free(results);
-
-    return enif_make_tuple2(env, ATOM_OK, result_list);
-}
-
-#else /* !HAVE_SUBINTERPRETERS */
-
-/* Stub implementations for older Python versions */
-static ERL_NIF_TERM nif_subinterp_worker_new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
-    (void)argc;
-    (void)argv;
-    return make_error(env, "subinterpreters_not_supported");
-}
-
-static ERL_NIF_TERM nif_subinterp_worker_destroy(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
-    (void)argc;
-    (void)argv;
-    return make_error(env, "subinterpreters_not_supported");
-}
-
-static ERL_NIF_TERM nif_subinterp_call(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
-    (void)argc;
-    (void)argv;
-    return make_error(env, "subinterpreters_not_supported");
-}
-
-static ERL_NIF_TERM nif_parallel_execute(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
-    (void)argc;
-    (void)argv;
-    return make_error(env, "subinterpreters_not_supported");
-}
-
-#endif /* HAVE_SUBINTERPRETERS */
 
 /* ============================================================================
  * Shared-GIL Pool Model for Subinterpreters
@@ -7274,180 +6939,6 @@ static ERL_NIF_TERM nif_ref_call_method(ErlNifEnv *env, int argc, const ERL_NIF_
 
 #ifdef HAVE_SUBINTERPRETERS
 
-/**
- * @brief Destructor for py_subinterp_handle_t resource
- */
-static void subinterp_handle_destructor(ErlNifEnv *env, void *obj) {
-    (void)env;
-    py_subinterp_handle_t *handle = (py_subinterp_handle_t *)obj;
-
-    /* Clean up the namespace in the worker */
-    if (!atomic_load(&handle->destroyed)) {
-        subinterp_thread_handle_destroy(handle);
-    }
-}
-
-/**
- * @brief NIF: Create a new OWN_GIL subinterpreter handle
- *
- * Returns a handle that can be used with subinterp_call/eval/exec.
- * The handle is bound to a worker thread with its own GIL.
- */
-static ERL_NIF_TERM nif_subinterp_thread_create(ErlNifEnv *env, int argc,
-                                                  const ERL_NIF_TERM argv[]) {
-    (void)argc;
-    (void)argv;
-
-    if (!subinterp_thread_pool_is_ready()) {
-        return enif_make_tuple2(env, ATOM_ERROR,
-                                enif_make_atom(env, "pool_not_initialized"));
-    }
-
-    py_subinterp_handle_t *handle = enif_alloc_resource(
-        PY_SUBINTERP_HANDLE_RESOURCE_TYPE, sizeof(py_subinterp_handle_t));
-    if (handle == NULL) {
-        return enif_make_tuple2(env, ATOM_ERROR,
-                                enif_make_atom(env, "alloc_failed"));
-    }
-
-    if (subinterp_thread_handle_create(handle) != 0) {
-        enif_release_resource(handle);
-        return enif_make_tuple2(env, ATOM_ERROR,
-                                enif_make_atom(env, "create_failed"));
-    }
-
-    ERL_NIF_TERM ref = enif_make_resource(env, handle);
-    enif_release_resource(handle);
-
-    return enif_make_tuple2(env, ATOM_OK, ref);
-}
-
-/**
- * @brief NIF: Destroy an OWN_GIL subinterpreter handle
- */
-static ERL_NIF_TERM nif_subinterp_thread_destroy(ErlNifEnv *env, int argc,
-                                                   const ERL_NIF_TERM argv[]) {
-    (void)argc;
-
-    py_subinterp_handle_t *handle;
-    if (!enif_get_resource(env, argv[0], PY_SUBINTERP_HANDLE_RESOURCE_TYPE,
-                           (void **)&handle)) {
-        return enif_make_tuple2(env, ATOM_ERROR,
-                                enif_make_atom(env, "invalid_handle"));
-    }
-
-    subinterp_thread_handle_destroy(handle);
-    return ATOM_OK;
-}
-
-/**
- * @brief NIF: Call a Python function through OWN_GIL subinterpreter
- */
-static ERL_NIF_TERM nif_subinterp_thread_call(ErlNifEnv *env, int argc,
-                                                const ERL_NIF_TERM argv[]) {
-    if (argc < 4 || argc > 5) {
-        return enif_make_badarg(env);
-    }
-
-    py_subinterp_handle_t *handle;
-    if (!enif_get_resource(env, argv[0], PY_SUBINTERP_HANDLE_RESOURCE_TYPE,
-                           (void **)&handle)) {
-        return enif_make_tuple2(env, ATOM_ERROR,
-                                enif_make_atom(env, "invalid_handle"));
-    }
-
-    ERL_NIF_TERM module = argv[1];
-    ERL_NIF_TERM func = argv[2];
-    ERL_NIF_TERM args = argv[3];
-    ERL_NIF_TERM kwargs = argc > 4 ? argv[4] : enif_make_new_map(env);
-
-    return subinterp_thread_call(env, handle, module, func, args, kwargs);
-}
-
-/**
- * @brief NIF: Evaluate Python expression through OWN_GIL subinterpreter
- */
-static ERL_NIF_TERM nif_subinterp_thread_eval(ErlNifEnv *env, int argc,
-                                                const ERL_NIF_TERM argv[]) {
-    if (argc < 2 || argc > 3) {
-        return enif_make_badarg(env);
-    }
-
-    py_subinterp_handle_t *handle;
-    if (!enif_get_resource(env, argv[0], PY_SUBINTERP_HANDLE_RESOURCE_TYPE,
-                           (void **)&handle)) {
-        return enif_make_tuple2(env, ATOM_ERROR,
-                                enif_make_atom(env, "invalid_handle"));
-    }
-
-    ERL_NIF_TERM code = argv[1];
-    ERL_NIF_TERM locals = argc > 2 ? argv[2] : enif_make_new_map(env);
-
-    return subinterp_thread_eval(env, handle, code, locals);
-}
-
-/**
- * @brief NIF: Execute Python statements through OWN_GIL subinterpreter
- */
-static ERL_NIF_TERM nif_subinterp_thread_exec(ErlNifEnv *env, int argc,
-                                                const ERL_NIF_TERM argv[]) {
-    if (argc != 2) {
-        return enif_make_badarg(env);
-    }
-
-    py_subinterp_handle_t *handle;
-    if (!enif_get_resource(env, argv[0], PY_SUBINTERP_HANDLE_RESOURCE_TYPE,
-                           (void **)&handle)) {
-        return enif_make_tuple2(env, ATOM_ERROR,
-                                enif_make_atom(env, "invalid_handle"));
-    }
-
-    return subinterp_thread_exec(env, handle, argv[1]);
-}
-
-/**
- * @brief NIF: Cast (fire-and-forget) through OWN_GIL subinterpreter
- */
-static ERL_NIF_TERM nif_subinterp_thread_cast(ErlNifEnv *env, int argc,
-                                                const ERL_NIF_TERM argv[]) {
-    if (argc != 4) {
-        return enif_make_badarg(env);
-    }
-
-    py_subinterp_handle_t *handle;
-    if (!enif_get_resource(env, argv[0], PY_SUBINTERP_HANDLE_RESOURCE_TYPE,
-                           (void **)&handle)) {
-        return ATOM_OK;  /* Silently ignore for cast */
-    }
-
-    return subinterp_thread_cast(env, handle, argv[1], argv[2], argv[3]);
-}
-
-/**
- * @brief NIF: Async call through OWN_GIL subinterpreter
- */
-static ERL_NIF_TERM nif_subinterp_thread_async_call(ErlNifEnv *env, int argc,
-                                                      const ERL_NIF_TERM argv[]) {
-    if (argc != 6) {
-        return enif_make_badarg(env);
-    }
-
-    py_subinterp_handle_t *handle;
-    if (!enif_get_resource(env, argv[0], PY_SUBINTERP_HANDLE_RESOURCE_TYPE,
-                           (void **)&handle)) {
-        return enif_make_tuple2(env, ATOM_ERROR,
-                                enif_make_atom(env, "invalid_handle"));
-    }
-
-    ErlNifPid caller_pid;
-    if (!enif_get_local_pid(env, argv[4], &caller_pid)) {
-        return enif_make_badarg(env);
-    }
-
-    return subinterp_thread_async_call(env, handle, argv[1], argv[2], argv[3],
-                                        &caller_pid, argv[5]);
-}
-
 /**
  * @brief NIF: Check if OWN_GIL thread pool is available
  */
@@ -7859,53 +7350,6 @@ static ERL_NIF_TERM nif_owngil_apply_paths(ErlNifEnv *env, int argc,
 #else /* !HAVE_SUBINTERPRETERS */
 
 /* Stub implementations for Python < 3.12 */
-static ERL_NIF_TERM nif_subinterp_thread_create(ErlNifEnv *env, int argc,
-                                                  const ERL_NIF_TERM argv[]) {
-    (void)argc; (void)argv;
-    return enif_make_tuple2(env, ATOM_ERROR,
-                            enif_make_atom(env, "not_supported"));
-}
-
-static ERL_NIF_TERM nif_subinterp_thread_destroy(ErlNifEnv *env, int argc,
-                                                   const ERL_NIF_TERM argv[]) {
-    (void)argc; (void)argv;
-    return enif_make_tuple2(env, ATOM_ERROR,
-                            enif_make_atom(env, "not_supported"));
-}
-
-static ERL_NIF_TERM nif_subinterp_thread_call(ErlNifEnv *env, int argc,
-                                                const ERL_NIF_TERM argv[]) {
-    (void)argc; (void)argv;
-    return enif_make_tuple2(env, ATOM_ERROR,
-                            enif_make_atom(env, "not_supported"));
-}
-
-static ERL_NIF_TERM nif_subinterp_thread_eval(ErlNifEnv *env, int argc,
-                                                const ERL_NIF_TERM argv[]) {
-    (void)argc; (void)argv;
-    return enif_make_tuple2(env, ATOM_ERROR,
-                            enif_make_atom(env, "not_supported"));
-}
-
-static ERL_NIF_TERM nif_subinterp_thread_exec(ErlNifEnv *env, int argc,
-                                                const ERL_NIF_TERM argv[]) {
-    (void)argc; (void)argv;
-    return enif_make_tuple2(env, ATOM_ERROR,
-                            enif_make_atom(env, "not_supported"));
-}
-
-static ERL_NIF_TERM nif_subinterp_thread_cast(ErlNifEnv *env, int argc,
-                                                const ERL_NIF_TERM argv[]) {
-    (void)argc; (void)argv;
-    return ATOM_OK;
-}
-
-static ERL_NIF_TERM nif_subinterp_thread_async_call(ErlNifEnv *env, int argc,
-                                                      const ERL_NIF_TERM argv[]) {
-    (void)argc; (void)argv;
-    return enif_make_tuple2(env, ATOM_ERROR,
-                            enif_make_atom(env, "not_supported"));
-}
 
 static ERL_NIF_TERM nif_subinterp_thread_pool_ready(ErlNifEnv *env, int argc,
                                                       const ERL_NIF_TERM argv[]) {
@@ -7992,17 +7436,6 @@ static int load(ErlNifEnv *env, void **priv_data, ERL_NIF_TERM load_info) {
         env, NULL, "py_suspended_state", suspended_state_destructor,
         ERL_NIF_RT_CREATE | ERL_NIF_RT_TAKEOVER, NULL);
 
-#ifdef HAVE_SUBINTERPRETERS
-    SUBINTERP_WORKER_RESOURCE_TYPE = enif_open_resource_type(
-        env, NULL, "py_subinterp_worker", subinterp_worker_destructor,
-        ERL_NIF_RT_CREATE | ERL_NIF_RT_TAKEOVER, NULL);
-
-    /* OWN_GIL subinterpreter handle resource type */
-    PY_SUBINTERP_HANDLE_RESOURCE_TYPE = enif_open_resource_type(
-        env, NULL, "py_subinterp_handle", subinterp_handle_destructor,
-        ERL_NIF_RT_CREATE | ERL_NIF_RT_TAKEOVER, NULL);
-#endif
-
     /* Process-per-context resource type (no mutex) */
     PY_CONTEXT_RESOURCE_TYPE = enif_open_resource_type(
         env, NULL, "py_context", context_destructor,
@@ -8043,12 +7476,6 @@ static int load(ErlNifEnv *env, void **priv_data, ERL_NIF_TERM load_info) {
         PY_SHARED_DICT_RESOURCE_TYPE == NULL) {
         return -1;
     }
-#ifdef HAVE_SUBINTERPRETERS
-    if (SUBINTERP_WORKER_RESOURCE_TYPE == NULL ||
-        PY_SUBINTERP_HANDLE_RESOURCE_TYPE == NULL) {
-        return -1;
-    }
-#endif
 
     /* Initialize atoms */
     ATOM_OK = enif_make_atom(env, "ok");
@@ -8191,29 +7618,16 @@ static ErlNifFunc nif_funcs[] = {
     {"async_gather", 3, nif_async_gather, ERL_NIF_DIRTY_JOB_IO_BOUND},
     {"async_stream", 6, nif_async_stream, ERL_NIF_DIRTY_JOB_IO_BOUND},
 
-    /* Sub-interpreter support (shared GIL pool model) */
+    /* Subinterpreter capability probes */
     {"subinterp_supported", 0, nif_subinterp_supported, 0},
     {"owngil_supported", 0, nif_owngil_supported, 0},
-    {"subinterp_worker_new", 0, nif_subinterp_worker_new, 0},
-    {"subinterp_worker_destroy", 1, nif_subinterp_worker_destroy, 0},
-    {"subinterp_call", 5, nif_subinterp_call, ERL_NIF_DIRTY_JOB_CPU_BOUND},
-    {"parallel_execute", 2, nif_parallel_execute, ERL_NIF_DIRTY_JOB_CPU_BOUND},
 
-    /* OWN_GIL subinterpreter thread pool (true parallelism) */
+    /* OWN_GIL thread pool (used internally by py_event_loop_pool) */
     {"subinterp_thread_pool_start", 0, nif_subinterp_thread_pool_start, 0},
     {"subinterp_thread_pool_start", 1, nif_subinterp_thread_pool_start, 0},
     {"subinterp_thread_pool_stop", 0, nif_subinterp_thread_pool_stop, 0},
     {"subinterp_thread_pool_ready", 0, nif_subinterp_thread_pool_ready, 0},
     {"subinterp_thread_pool_stats", 0, nif_subinterp_thread_pool_stats, 0},
-    {"subinterp_thread_create", 0, nif_subinterp_thread_create, 0},
-    {"subinterp_thread_destroy", 1, nif_subinterp_thread_destroy, 0},
-    {"subinterp_thread_call", 4, nif_subinterp_thread_call, ERL_NIF_DIRTY_JOB_CPU_BOUND},
-    {"subinterp_thread_call", 5, nif_subinterp_thread_call, ERL_NIF_DIRTY_JOB_CPU_BOUND},
-    {"subinterp_thread_eval", 2, nif_subinterp_thread_eval, ERL_NIF_DIRTY_JOB_CPU_BOUND},
-    {"subinterp_thread_eval", 3, nif_subinterp_thread_eval, ERL_NIF_DIRTY_JOB_CPU_BOUND},
-    {"subinterp_thread_exec", 2, nif_subinterp_thread_exec, ERL_NIF_DIRTY_JOB_CPU_BOUND},
-    {"subinterp_thread_cast", 4, nif_subinterp_thread_cast, 0},
-    {"subinterp_thread_async_call", 6, nif_subinterp_thread_async_call, 0},
 
     /* OWN_GIL session management for event loop pool */
     {"owngil_create_session", 1, nif_owngil_create_session, 0},
diff --git a/c_src/py_nif.h b/c_src/py_nif.h
index 45296ac..5abc71d 100644
--- a/c_src/py_nif.h
+++ b/c_src/py_nif.h
@@ -655,40 +655,6 @@ typedef struct {
  * @{
  */
 
-#ifdef HAVE_SUBINTERPRETERS
-/**
- * @struct py_subinterp_worker_t
- * @brief Worker running in an isolated sub-interpreter
- *
- * Sub-interpreters provide true isolation with their own GIL,
- * enabling parallel Python execution on Python 3.12+.
- *
- * The mutex ensures thread-safe access when multiple dirty scheduler
- * threads attempt to use the same worker concurrently.
- *
- * @note Only available when compiled with Python 3.12+
- *
- * @see nif_subinterp_worker_new
- * @see nif_subinterp_call
- */
-typedef struct {
-    /** @brief Mutex for thread-safe access from multiple dirty schedulers */
-    pthread_mutex_t mutex;
-
-    /** @brief Python interpreter state */
-    PyInterpreterState *interp;
-
-    /** @brief Thread state for this interpreter */
-    PyThreadState *tstate;
-
-    /** @brief Global namespace dictionary */
-    PyObject *globals;
-
-    /** @brief Local namespace dictionary */
-    PyObject *locals;
-} py_subinterp_worker_t;
-#endif
-
 /**
  * @enum py_cmd_type_t
  * @brief Command types for thread-per-context dispatch
@@ -1364,11 +1330,6 @@ extern ErlNifResourceType *PYOBJ_RESOURCE_TYPE;
 /** @brief Resource type for suspended_state_t */
 extern ErlNifResourceType *SUSPENDED_STATE_RESOURCE_TYPE;
 
-#ifdef HAVE_SUBINTERPRETERS
-/** @brief Resource type for py_subinterp_worker_t */
-extern ErlNifResourceType *SUBINTERP_WORKER_RESOURCE_TYPE;
-#endif
-
 /** @brief Resource type for py_context_t (process-per-context) */
 extern ErlNifResourceType *PY_CONTEXT_RESOURCE_TYPE;
 
diff --git a/examples/README.md b/examples/README.md
index 2538292..977ac38 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -87,12 +87,6 @@ Simple echo server using Reactor API.
 escript examples/reactor_echo.erl
 ```
 
-### reactor_owngil_example.erl
-Reactor with OWN_GIL subinterpreters (Python 3.14+).
-```bash
-escript examples/reactor_owngil_example.erl
-```
-
 ## Benchmarks
 
 ### benchmark.erl
diff --git a/examples/reactor_owngil_example.erl b/examples/reactor_owngil_example.erl
deleted file mode 100644
index 06bce8b..0000000
--- a/examples/reactor_owngil_example.erl
+++ /dev/null
@@ -1,163 +0,0 @@
-%% @doc Example: OWN_GIL reactor with dedicated threads.
-%%
-%% Each subinterpreter handle runs in a dedicated pthread with its own GIL.
-%% This provides true parallelism for CPU-bound protocol processing.
-%%
-%% Best for: ML inference, heavy parsing, CPU-bound protocol logic.
-%%
-%% Note: Requires Python 3.12+ with subinterpreter support.
-
--module(reactor_owngil_example).
--export([start/0, start/1, stop/1]).
-
-%% Protocol that simulates CPU-intensive work
--define(CPU_PROTOCOL, <<"
-import erlang.reactor as reactor
-import hashlib
-
-class CPUProtocol(reactor.Protocol):
-    '''Protocol with CPU-intensive hashing.'''
-
-    def __init__(self):
-        super().__init__()
-        self.iterations = 10000
-
-    def connection_made(self, fd, client_info):
-        super().connection_made(fd, client_info)
-
-    def data_received(self, data):
-        # CPU-intensive hashing (runs in parallel due to OWN_GIL)
-        result = bytes(data)
-        for _ in range(self.iterations):
-            result = hashlib.sha256(result).digest()
-
-        self.write_buffer.extend(result)
-        return 'write_pending'
-
-    def write_ready(self):
-        if self.write_buffer:
-            written = self.write(bytes(self.write_buffer))
-            del self.write_buffer[:written]
-            if self.write_buffer:
-                return 'continue'
-        return 'read_pending'
-
-reactor.set_protocol_factory(CPUProtocol)
-">>).
-
-%% @doc Start with default settings (4 handles).
-start() ->
-    start(#{handles => 4, port => 8081}).
-
-%% @doc Start OWN_GIL reactor.
-%%
-%% Options:
-%%   handles - Number of subinterpreter handles (default: 4)
-%%   port - Port to listen on (default: 8081)
-%%
-%% Returns: {ok, State} where State can be passed to stop/1
-start(Opts) ->
-    NumHandles = maps:get(handles, Opts, 4),
-    Port = maps:get(port, Opts, 8081),
-
-    %% Start OWN_GIL thread pool
-    ok = py:subinterp_pool_start(NumHandles),
-
-    %% Create subinterpreter handles - each with its own pthread + GIL
-    Handles = [begin
-        {ok, Handle} = py:subinterp_create(),
-        %% Initialize reactor protocol in this subinterpreter
-        ok = py:subinterp_exec(Handle, ?CPU_PROTOCOL),
-        Handle
-    end || _ <- lists:seq(1, NumHandles)],
-
-    %% Start acceptor
-    {ok, ListenSock} = gen_tcp:listen(Port, [
-        binary,
-        {active, false},
-        {reuseaddr, true},
-        {backlog, 64}
-    ]),
-
-    Acceptor = spawn_link(fun() ->
-        accept_loop(ListenSock, Handles, 1)
-    end),
-
-    io:format("OWN_GIL reactor started on port ~p with ~p handles~n", [Port, NumHandles]),
-    io:format("Each handle runs in its own pthread with dedicated GIL~n"),
-
-    {ok, #{handles => Handles, acceptor => Acceptor, socket => ListenSock}}.
-
-%% @doc Stop the OWN_GIL reactor.
-stop(#{handles := Handles, acceptor := Acceptor, socket := Socket}) ->
-    exit(Acceptor, shutdown),
-    gen_tcp:close(Socket),
-    [py:subinterp_destroy(H) || H <- Handles],
-    py:subinterp_pool_stop(),
-    ok.
-
-accept_loop(ListenSock, Handles, Idx) ->
-    case gen_tcp:accept(ListenSock) of
-        {ok, Socket} ->
-            {ok, Fd} = prim_inet:getfd(Socket),
-            Handle = lists:nth(Idx, Handles),
-            ClientInfo = get_client_info(Socket),
-
-            %% Initialize connection via OWN_GIL reactor API
-            ok = py:subinterp_reactor_init(Handle, Fd, ClientInfo),
-
-            %% Spawn handler for this connection
-            spawn_link(fun() -> handle_connection(Handle, Fd, Socket) end),
-
-            NextIdx = (Idx rem length(Handles)) + 1,
-            accept_loop(ListenSock, Handles, NextIdx);
-
-        {error, closed} ->
-            ok
-    end.
-
-get_client_info(Socket) ->
-    case inet:peername(Socket) of
-        {ok, {Addr, Port}} ->
-            #{addr => inet:ntoa(Addr), port => Port, type => tcp};
-        _ ->
-            #{type => tcp}
-    end.
-
-handle_connection(Handle, Fd, Socket) ->
-    %% Simple blocking receive for example purposes
-    case gen_tcp:recv(Socket, 0, 30000) of
-        {ok, Data} ->
-            %% Dispatch to OWN_GIL subinterpreter
-            case py:subinterp_reactor_read(Handle, Fd, Data) of
-                {ok, <<"write_pending">>} ->
-                    handle_write(Handle, Fd, Socket);
-                {ok, <<"continue">>} ->
-                    handle_connection(Handle, Fd, Socket);
-                {ok, <<"close">>} ->
-                    py:subinterp_reactor_close(Handle, Fd),
-                    gen_tcp:close(Socket);
-                {error, _Reason} ->
-                    py:subinterp_reactor_close(Handle, Fd),
-                    gen_tcp:close(Socket)
-            end;
-        {error, closed} ->
-            py:subinterp_reactor_close(Handle, Fd);
-        {error, _} ->
-            py:subinterp_reactor_close(Handle, Fd),
-            gen_tcp:close(Socket)
-    end.
-
-handle_write(Handle, Fd, Socket) ->
-    case py:subinterp_reactor_write(Handle, Fd) of
-        {ok, <<"read_pending">>} ->
-            handle_connection(Handle, Fd, Socket);
-        {ok, <<"continue">>} ->
-            handle_write(Handle, Fd, Socket);
-        {ok, <<"close">>} ->
-            py:subinterp_reactor_close(Handle, Fd),
-            gen_tcp:close(Socket);
-        _ ->
-            py:subinterp_reactor_close(Handle, Fd),
-            gen_tcp:close(Socket)
-    end.
diff --git a/src/py.erl b/src/py.erl
index 4c4b5d9..c16f944 100644
--- a/src/py.erl
+++ b/src/py.erl
@@ -76,26 +76,9 @@
     async_await/2,
     async_gather/1,
     async_gather/2,
-    %% Parallel execution (Python 3.12+ sub-interpreters)
+    %% Parallel execution + capability probe
     parallel/1,
     subinterp_supported/0,
-    %% OWN_GIL subinterpreter API (true parallelism)
-    subinterp_create/0,
-    subinterp_destroy/1,
-    subinterp_call/4,
-    subinterp_call/5,
-    subinterp_eval/2,
-    subinterp_eval/3,
-    subinterp_exec/2,
-    subinterp_cast/4,
-    subinterp_async_call/4,
-    subinterp_await/1,
-    subinterp_await/2,
-    subinterp_pool_start/0,
-    subinterp_pool_start/1,
-    subinterp_pool_stop/0,
-    subinterp_pool_ready/0,
-    subinterp_pool_stats/0,
     %% Virtual environment
     ensure_venv/2,
     ensure_venv/3,
@@ -836,126 +819,6 @@ parallel(Calls) when is_list(Calls) ->
             end
     end.
 
-%%% ============================================================================
-%%% OWN_GIL Subinterpreter API (True Parallelism)
-%%% ============================================================================
-
-%% @doc Create an isolated subinterpreter with OWN_GIL.
-%% Returns a handle for making calls. The subinterpreter runs
-%% in a dedicated pthread with true parallelism.
-%%
-%% Requires the thread pool to be started first via subinterp_pool_start/0.
-%%
-%% Example:
-%% ```
-%% ok = py:subinterp_pool_start().
-%% {ok, Sub} = py:subinterp_create().
-%% {ok, Result} = py:subinterp_call(Sub, math, sqrt, [16.0]).
-%% ok = py:subinterp_destroy(Sub).
-%% '''
--spec subinterp_create() -> {ok, reference()} | {error, term()}.
-subinterp_create() ->
-    py_nif:subinterp_thread_create().
-
-%% @doc Destroy a subinterpreter handle.
-%% Cleans up namespace, releases worker binding.
--spec subinterp_destroy(reference()) -> ok.
-subinterp_destroy(Handle) ->
-    py_nif:subinterp_thread_destroy(Handle),
-    ok.
-
-%% @doc Call a function in a subinterpreter (blocking).
--spec subinterp_call(reference(), py_module(), py_func(), py_args()) ->
-    {ok, term()} | {error, term()}.
-subinterp_call(Handle, Module, Func, Args) ->
-    subinterp_call(Handle, Module, Func, Args, #{}).
-
-%% @doc Call a function in a subinterpreter with kwargs (blocking).
--spec subinterp_call(reference(), py_module(), py_func(), py_args(), py_kwargs()) ->
-    {ok, term()} | {error, term()}.
-subinterp_call(Handle, Module, Func, Args, Kwargs) ->
-    ModuleBin = ensure_binary(Module),
-    FuncBin = ensure_binary(Func),
-    py_nif:subinterp_thread_call(Handle, ModuleBin, FuncBin, Args, Kwargs).
-
-%% @doc Evaluate expression in subinterpreter (blocking).
--spec subinterp_eval(reference(), binary() | string()) ->
-    {ok, term()} | {error, term()}.
-subinterp_eval(Handle, Code) ->
-    subinterp_eval(Handle, Code, #{}).
-
-%% @doc Evaluate expression with locals in subinterpreter (blocking).
--spec subinterp_eval(reference(), binary() | string(), map()) ->
-    {ok, term()} | {error, term()}.
-subinterp_eval(Handle, Code, Locals) ->
-    CodeBin = ensure_binary(Code),
-    py_nif:subinterp_thread_eval(Handle, CodeBin, Locals).
-
-%% @doc Execute statements in subinterpreter (blocking, no return).
--spec subinterp_exec(reference(), binary() | string()) -> ok | {error, term()}.
-subinterp_exec(Handle, Code) ->
-    CodeBin = ensure_binary(Code),
-    py_nif:subinterp_thread_exec(Handle, CodeBin).
-
-%% @doc Cast a call to subinterpreter (fire-and-forget, no result).
-%% Returns immediately. Use for side-effects where result is not needed.
--spec subinterp_cast(reference(), py_module(), py_func(), py_args()) -> ok.
-subinterp_cast(Handle, Module, Func, Args) ->
-    ModuleBin = ensure_binary(Module),
-    FuncBin = ensure_binary(Func),
-    py_nif:subinterp_thread_cast(Handle, ModuleBin, FuncBin, Args).
-
-%% @doc Async call - returns immediately with a reference.
-%% Use subinterp_await/1,2 to get the result.
-%% Worker uses erlang.send() to deliver result.
--spec subinterp_async_call(reference(), py_module(), py_func(), py_args()) -> reference().
-subinterp_async_call(Handle, Module, Func, Args) ->
-    ModuleBin = ensure_binary(Module),
-    FuncBin = ensure_binary(Func),
-    Ref = make_ref(),
-    py_nif:subinterp_thread_async_call(Handle, ModuleBin, FuncBin, Args, self(), Ref),
-    Ref.
-
-%% @doc Wait for async call result.
--spec subinterp_await(reference()) -> {ok, term()} | {error, term()}.
-subinterp_await(Ref) ->
-    subinterp_await(Ref, ?DEFAULT_TIMEOUT).
-
-%% @doc Wait for async call result with timeout.
--spec subinterp_await(reference(), timeout()) -> {ok, term()} | {error, term()}.
-subinterp_await(Ref, Timeout) ->
-    receive
-        {py_subinterp_result, Ref, Result} -> Result
-    after Timeout ->
-        {error, timeout}
-    end.
-
-%% @doc Start the OWN_GIL subinterpreter thread pool with default workers.
-%% Must be called before creating subinterpreter handles.
--spec subinterp_pool_start() -> ok | {error, term()}.
-subinterp_pool_start() ->
-    py_nif:subinterp_thread_pool_start().
-
-%% @doc Start the OWN_GIL subinterpreter thread pool with N workers.
--spec subinterp_pool_start(non_neg_integer()) -> ok | {error, term()}.
-subinterp_pool_start(NumWorkers) ->
-    py_nif:subinterp_thread_pool_start(NumWorkers).
-
-%% @doc Stop the OWN_GIL subinterpreter thread pool.
--spec subinterp_pool_stop() -> ok.
-subinterp_pool_stop() ->
-    py_nif:subinterp_thread_pool_stop().
-
-%% @doc Check if the OWN_GIL thread pool is ready.
--spec subinterp_pool_ready() -> boolean().
-subinterp_pool_ready() ->
-    py_nif:subinterp_thread_pool_ready().
-
-%% @doc Get OWN_GIL thread pool statistics.
--spec subinterp_pool_stats() -> map().
-subinterp_pool_stats() ->
-    py_nif:subinterp_thread_pool_stats().
-
 %%% ============================================================================
 %%% Virtual Environment Support
 %%% ============================================================================
diff --git a/src/py_nif.erl b/src/py_nif.erl
index ee0e04b..13e2c5b 100644
--- a/src/py_nif.erl
+++ b/src/py_nif.erl
@@ -52,28 +52,15 @@
     async_call/6,
     async_gather/3,
     async_stream/6,
-    %% Sub-interpreters (Python 3.12+) - shared GIL pool model
+    %% Subinterpreter capability probes (Python 3.12+ / 3.14+)
     subinterp_supported/0,
     owngil_supported/0,
-    subinterp_worker_new/0,
-    subinterp_worker_destroy/1,
-    subinterp_call/5,
-    parallel_execute/2,
-    %% OWN_GIL subinterpreter thread pool (true parallelism)
+    %% OWN_GIL thread pool (internal, used by py_event_loop_pool)
     subinterp_thread_pool_start/0,
     subinterp_thread_pool_start/1,
     subinterp_thread_pool_stop/0,
     subinterp_thread_pool_ready/0,
     subinterp_thread_pool_stats/0,
-    subinterp_thread_create/0,
-    subinterp_thread_destroy/1,
-    subinterp_thread_call/4,
-    subinterp_thread_call/5,
-    subinterp_thread_eval/2,
-    subinterp_thread_eval/3,
-    subinterp_thread_exec/2,
-    subinterp_thread_cast/4,
-    subinterp_thread_async_call/6,
     %% OWN_GIL session management for event loop pool
     owngil_create_session/1,
     owngil_submit_task/7,
@@ -501,116 +488,40 @@ subinterp_supported() ->
 owngil_supported() ->
     ?NIF_STUB.
 
-%% @doc Create a new sub-interpreter worker with its own GIL.
-%% Returns an opaque reference to be used with subinterp functions.
--spec subinterp_worker_new() -> {ok, reference()} | {error, term()}.
-subinterp_worker_new() ->
-    ?NIF_STUB.
-
-%% @doc Destroy a sub-interpreter worker.
--spec subinterp_worker_destroy(reference()) -> ok | {error, term()}.
-subinterp_worker_destroy(_WorkerRef) ->
-    ?NIF_STUB.
-
-%% @doc Call a Python function in a sub-interpreter.
-%% Args: WorkerRef, Module (binary), Func (binary), Args (list), Kwargs (map)
--spec subinterp_call(reference(), binary(), binary(), list(), map()) ->
-    {ok, term()} | {error, term()}.
-subinterp_call(_WorkerRef, _Module, _Func, _Args, _Kwargs) ->
-    ?NIF_STUB.
-
-%% @doc Execute multiple calls in parallel across sub-interpreters.
-%% Args: WorkerRefs (list of refs), Calls (list of {Module, Func, Args})
-%% Returns: List of results (one per call)
--spec parallel_execute([reference()], [{binary(), binary(), list()}]) ->
-    {ok, list()} | {error, term()}.
-parallel_execute(_WorkerRefs, _Calls) ->
-    ?NIF_STUB.
-
 %%% ============================================================================
-%%% OWN_GIL Subinterpreter Thread Pool (True Parallelism)
+%%% OWN_GIL Thread Pool (internal, used by py_event_loop_pool)
 %%% ============================================================================
 
 %% @doc Start the OWN_GIL subinterpreter thread pool with default workers.
-%% Creates a pool of pthreads, each with an OWN_GIL subinterpreter.
+%% @private
 -spec subinterp_thread_pool_start() -> ok | {error, term()}.
 subinterp_thread_pool_start() ->
     ?NIF_STUB.
 
 %% @doc Start the OWN_GIL subinterpreter thread pool with N workers.
+%% @private
 -spec subinterp_thread_pool_start(non_neg_integer()) -> ok | {error, term()}.
 subinterp_thread_pool_start(_NumWorkers) ->
     ?NIF_STUB.
 
 %% @doc Stop the OWN_GIL subinterpreter thread pool.
+%% @private
 -spec subinterp_thread_pool_stop() -> ok.
 subinterp_thread_pool_stop() ->
     ?NIF_STUB.
 
 %% @doc Check if the OWN_GIL thread pool is ready.
+%% @private
 -spec subinterp_thread_pool_ready() -> boolean().
 subinterp_thread_pool_ready() ->
     ?NIF_STUB.
 
 %% @doc Get OWN_GIL thread pool statistics.
+%% @private
 -spec subinterp_thread_pool_stats() -> map().
 subinterp_thread_pool_stats() ->
     ?NIF_STUB.
 
-%% @doc Create a new OWN_GIL subinterpreter handle.
-%% The handle is bound to a worker thread and has isolated namespace.
--spec subinterp_thread_create() -> {ok, reference()} | {error, term()}.
-subinterp_thread_create() ->
-    ?NIF_STUB.
-
-%% @doc Destroy an OWN_GIL subinterpreter handle.
--spec subinterp_thread_destroy(reference()) -> ok | {error, term()}.
-subinterp_thread_destroy(_Handle) ->
-    ?NIF_STUB.
-
-%% @doc Call a Python function through OWN_GIL subinterpreter (blocking).
--spec subinterp_thread_call(reference(), binary(), binary(), list()) ->
-    {ok, term()} | {error, term()}.
-subinterp_thread_call(_Handle, _Module, _Func, _Args) ->
-    ?NIF_STUB.
-
-%% @doc Call a Python function through OWN_GIL subinterpreter with kwargs.
--spec subinterp_thread_call(reference(), binary(), binary(), list(), map()) ->
-    {ok, term()} | {error, term()}.
-subinterp_thread_call(_Handle, _Module, _Func, _Args, _Kwargs) ->
-    ?NIF_STUB.
-
-%% @doc Evaluate Python expression through OWN_GIL subinterpreter.
--spec subinterp_thread_eval(reference(), binary()) ->
-    {ok, term()} | {error, term()}.
-subinterp_thread_eval(_Handle, _Code) ->
-    ?NIF_STUB.
-
-%% @doc Evaluate Python expression with locals through OWN_GIL subinterpreter.
--spec subinterp_thread_eval(reference(), binary(), map()) ->
-    {ok, term()} | {error, term()}.
-subinterp_thread_eval(_Handle, _Code, _Locals) ->
-    ?NIF_STUB.
-
-%% @doc Execute Python statements through OWN_GIL subinterpreter (no return).
--spec subinterp_thread_exec(reference(), binary()) -> ok | {error, term()}.
-subinterp_thread_exec(_Handle, _Code) ->
-    ?NIF_STUB.
-
-%% @doc Cast (fire-and-forget) through OWN_GIL subinterpreter.
-%% Returns immediately, result is discarded.
--spec subinterp_thread_cast(reference(), binary(), binary(), list()) -> ok.
-subinterp_thread_cast(_Handle, _Module, _Func, _Args) ->
-    ?NIF_STUB.
-
-%% @doc Async call through OWN_GIL subinterpreter.
-%% Args: Handle, Module, Func, Args, CallerPid, Ref
-%% Result is sent to CallerPid as {py_subinterp_result, Ref, Result}.
--spec subinterp_thread_async_call(reference(), binary(), binary(), list(), pid(), reference()) ->
-    ok | {error, term()}.
-subinterp_thread_async_call(_Handle, _Module, _Func, _Args, _CallerPid, _Ref) ->
-    ?NIF_STUB.
-
 %%% ============================================================================
 %%% OWN_GIL Session Management (for event loop pool)
 %%% ============================================================================
diff --git a/test/py_subinterp_SUITE.erl b/test/py_subinterp_SUITE.erl
deleted file mode 100644
index 83dcc74..0000000
--- a/test/py_subinterp_SUITE.erl
+++ /dev/null
@@ -1,356 +0,0 @@
-%% Copyright 2026 Benoit Chesneau
-%%
-%% Licensed under the Apache License, Version 2.0 (the "License");
-%% you may not use this file except in compliance with the License.
-%% You may obtain a copy of the License at
-%%
-%%     http://www.apache.org/licenses/LICENSE-2.0
-%%
-%% Unless required by applicable law or agreed to in writing, software
-%% distributed under the License is distributed on an "AS IS" BASIS,
-%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-%% See the License for the specific language governing permissions and
-%% limitations under the License.
-
-%%% @doc Test suite for OWN_GIL subinterpreter thread pool API.
-%%%
-%%% Tests the py:subinterp_* functions which provide true parallelism
-%%% using Python subinterpreters with OWN_GIL (Python 3.12+).
--module(py_subinterp_SUITE).
-
--include_lib("common_test/include/ct.hrl").
--include_lib("stdlib/include/assert.hrl").
-
--export([
-    all/0,
-    groups/0,
-    init_per_suite/1,
-    end_per_suite/1,
-    init_per_group/2,
-    end_per_group/2,
-    init_per_testcase/2,
-    end_per_testcase/2
-]).
-
-%% Test cases
--export([
-    test_pool_not_ready/1,
-    test_pool_start_stop/1,
-    test_pool_stats/1,
-    test_create_destroy_handle/1,
-    test_simple_call/1,
-    test_call_with_args/1,
-    test_call_builtin/1,
-    test_eval_expression/1,
-    test_eval_with_locals/1,
-    test_exec_statements/1,
-    test_cast_fire_and_forget/1,
-    test_namespace_isolation/1,
-    test_multiple_handles/1,
-    test_parallel_execution/1
-]).
-
-%%% ============================================================================
-%%% CT Callbacks
-%%% ============================================================================
-
-all() ->
-    case py:subinterp_supported() of
-        true ->
-            [{group, pool_lifecycle},
-             {group, handle_lifecycle},
-             {group, execution},
-             {group, isolation}];
-        false ->
-            ct:pal("Skipping subinterpreter tests - not supported on this Python version"),
-            []
-    end.
-
-groups() ->
-    [{pool_lifecycle, [sequence], [
-        test_pool_not_ready,
-        test_pool_start_stop,
-        test_pool_stats
-    ]},
-     {handle_lifecycle, [sequence], [
-        test_create_destroy_handle,
-        test_multiple_handles
-    ]},
-     {execution, [parallel], [
-        test_simple_call,
-        test_call_with_args,
-        test_call_builtin,
-        test_eval_expression,
-        test_eval_with_locals,
-        test_exec_statements,
-        test_cast_fire_and_forget
-    ]},
-     {isolation, [sequence], [
-        test_namespace_isolation,
-        test_parallel_execution
-    ]}].
-
-init_per_suite(Config) ->
-    %% Ensure erlang_python application is started
-    case application:ensure_all_started(erlang_python) of
-        {ok, _} -> ok;
-        {error, {already_started, _}} -> ok
-    end,
-    Config.
-
-end_per_suite(_Config) ->
-    %% Stop pool if running
-    catch py:subinterp_pool_stop(),
-    ok.
-
-init_per_group(pool_lifecycle, Config) ->
-    %% Pool tests manage their own pool lifecycle
-    Config;
-init_per_group(_Group, Config) ->
-    %% Ensure pool is started for other groups
-    case py:subinterp_pool_ready() of
-        true -> ok;
-        false ->
-            ok = py:subinterp_pool_start(4)
-    end,
-    Config.
-
-end_per_group(pool_lifecycle, _Config) ->
-    %% Clean up pool after lifecycle tests
-    catch py:subinterp_pool_stop(),
-    ok;
-end_per_group(_Group, _Config) ->
-    ok.
-
-init_per_testcase(_TestCase, Config) ->
-    Config.
-
-end_per_testcase(_TestCase, _Config) ->
-    ok.
-
-%%% ============================================================================
-%%% Pool Lifecycle Tests
-%%% ============================================================================
-
-test_pool_not_ready(_Config) ->
-    %% Pool should not be ready initially (after stop in end_per_suite)
-    ?assertEqual(false, py:subinterp_pool_ready()),
-
-    %% Creating handle should fail when pool not ready
-    Result = py:subinterp_create(),
-    ?assertMatch({error, _}, Result),
-    ok.
-
-test_pool_start_stop(_Config) ->
-    %% Start with default workers
-    ?assertEqual(ok, py:subinterp_pool_start()),
-    ?assertEqual(true, py:subinterp_pool_ready()),
-
-    %% Stop
-    ?assertEqual(ok, py:subinterp_pool_stop()),
-    ?assertEqual(false, py:subinterp_pool_ready()),
-
-    %% Start with specific number of workers
-    ?assertEqual(ok, py:subinterp_pool_start(2)),
-    ?assertEqual(true, py:subinterp_pool_ready()),
-
-    Stats = py:subinterp_pool_stats(),
-    ?assertEqual(2, maps:get(num_workers, Stats)),
-
-    %% Stop for next tests
-    ?assertEqual(ok, py:subinterp_pool_stop()),
-    ok.
-
-test_pool_stats(_Config) ->
-    %% Start pool
-    ?assertEqual(ok, py:subinterp_pool_start(4)),
-
-    Stats = py:subinterp_pool_stats(),
-    ?assertEqual(4, maps:get(num_workers, Stats)),
-    ?assertEqual(true, maps:get(initialized, Stats)),
-    ?assertEqual(0, maps:get(total_requests, Stats)),
-    ?assertEqual(0, maps:get(total_errors, Stats)),
-
-    %% Stop for next group
-    ?assertEqual(ok, py:subinterp_pool_stop()),
-    ok.
-
-%%% ============================================================================
-%%% Handle Lifecycle Tests
-%%% ============================================================================
-
-test_create_destroy_handle(_Config) ->
-    %% Create handle
-    {ok, Handle} = py:subinterp_create(),
-    ?assert(is_reference(Handle)),
-
-    %% Destroy handle
-    ?assertEqual(ok, py:subinterp_destroy(Handle)),
-
-    %% Creating another handle should work
-    {ok, Handle2} = py:subinterp_create(),
-    ?assert(is_reference(Handle2)),
-    ?assertEqual(ok, py:subinterp_destroy(Handle2)),
-    ok.
-
-test_multiple_handles(_Config) ->
-    %% Create multiple handles
-    Handles = [begin
-        {ok, H} = py:subinterp_create(),
-        H
-    end || _ <- lists:seq(1, 8)],
-
-    ?assertEqual(8, length(Handles)),
-
-    %% Destroy all handles
-    [py:subinterp_destroy(H) || H <- Handles],
-    ok.
-
-%%% ============================================================================
-%%% Execution Tests
-%%% ============================================================================
-
-test_simple_call(_Config) ->
-    {ok, Handle} = py:subinterp_create(),
-
-    %% Simple math operation
-    Result = py:subinterp_call(Handle, math, sqrt, [16.0]),
-    ?assertMatch({ok, _}, Result),
-
-    py:subinterp_destroy(Handle),
-    ok.
-
-test_call_with_args(_Config) ->
-    {ok, Handle} = py:subinterp_create(),
-
-    %% Call with multiple args - max function
-    Result = py:subinterp_call(Handle, builtins, max, [[1, 5, 3, 9, 2]]),
-    case Result of
-        {ok, 9} -> ok;
-        {ok, _} -> ok; % Accept any successful result
-        {error, _} = Err -> ct:pal("Call failed: ~p", [Err])
-    end,
-
-    py:subinterp_destroy(Handle),
-    ok.
-
-test_call_builtin(_Config) ->
-    {ok, Handle} = py:subinterp_create(),
-
-    %% Call builtin len
-    Result = py:subinterp_call(Handle, builtins, len, [<<"hello">>]),
-    case Result of
-        {ok, 5} -> ok;
-        {ok, _} -> ok;
-        {error, _} = Err -> ct:pal("Call failed: ~p", [Err])
-    end,
-
-    py:subinterp_destroy(Handle),
-    ok.
-
-test_eval_expression(_Config) ->
-    {ok, Handle} = py:subinterp_create(),
-
-    %% Simple expression
-    Result = py:subinterp_eval(Handle, <<"1 + 2 + 3">>),
-    ?assertMatch({ok, 6}, Result),
-
-    py:subinterp_destroy(Handle),
-    ok.
-
-test_eval_with_locals(_Config) ->
-    {ok, Handle} = py:subinterp_create(),
-
-    %% Expression with local variables
-    Result = py:subinterp_eval(Handle, <<"x + y">>, #{x => 10, y => 20}),
-    case Result of
-        {ok, 30} -> ok;
-        {ok, _} -> ok;
-        {error, _} = Err -> ct:pal("Eval failed: ~p", [Err])
-    end,
-
-    py:subinterp_destroy(Handle),
-    ok.
-
-test_exec_statements(_Config) ->
-    {ok, Handle} = py:subinterp_create(),
-
-    %% Execute Python statements
-    Result = py:subinterp_exec(Handle, <<"x = 5\ny = 10\nresult = x + y">>),
-    ?assertMatch({ok, _}, Result),
-
-    py:subinterp_destroy(Handle),
-    ok.
-
-test_cast_fire_and_forget(_Config) ->
-    {ok, Handle} = py:subinterp_create(),
-
-    %% Cast should return immediately
-    ?assertEqual(ok, py:subinterp_cast(Handle, math, sqrt, [100.0])),
-
-    %% Small delay to let cast execute
-    timer:sleep(50),
-
-    py:subinterp_destroy(Handle),
-    ok.
-
-%%% ============================================================================
-%%% Isolation Tests
-%%% ============================================================================
-
-test_namespace_isolation(_Config) ->
-    %% Create two handles
-    {ok, Handle1} = py:subinterp_create(),
-    {ok, Handle2} = py:subinterp_create(),
-
-    %% Set variable in Handle1
-    py:subinterp_exec(Handle1, <<"test_var = 42">>),
-
-    %% Try to access in Handle2 - should not be visible
-    Result = py:subinterp_eval(Handle2, <<"test_var">>),
-    ?assertMatch({error, _}, Result),
-
-    py:subinterp_destroy(Handle1),
-    py:subinterp_destroy(Handle2),
-    ok.
-
-test_parallel_execution(_Config) ->
-    %% Create handles
-    {ok, H1} = py:subinterp_create(),
-    {ok, H2} = py:subinterp_create(),
-
-    Parent = self(),
-
-    %% Start parallel execution
-    Start = erlang:monotonic_time(millisecond),
-
-    %% Both should execute concurrently with different GILs
-    spawn(fun() ->
-        %% Simulate CPU work
-        Result = py:subinterp_eval(H1, <<"sum(range(100000))">>),
-        Parent ! {done, 1, Result}
-    end),
-
-    spawn(fun() ->
-        %% Simulate CPU work
-        Result = py:subinterp_eval(H2, <<"sum(range(100000))">>),
-        Parent ! {done, 2, Result}
-    end),
-
-    %% Collect results
-    R1 = receive {done, 1, Res1} -> Res1 after 5000 -> timeout end,
-    R2 = receive {done, 2, Res2} -> Res2 after 5000 -> timeout end,
-
-    End = erlang:monotonic_time(millisecond),
-    Duration = End - Start,
-
-    ct:pal("Parallel execution took ~p ms", [Duration]),
-    ct:pal("Results: ~p, ~p", [R1, R2]),
-
-    %% Both should succeed
-    ?assertMatch({ok, _}, R1),
-    ?assertMatch({ok, _}, R2),
-
-    py:subinterp_destroy(H1),
-    py:subinterp_destroy(H2),
-    ok.

From db4a6f2588ecf12ca3081fa54a7d33d1f01b4462 Mon Sep 17 00:00:00 2001
From: Benoit Chesneau <bchesneau@gmail.com>
Date: Fri, 1 May 2026 18:27:09 +0200
Subject: [PATCH 04/17] Fix stale doc/example snippets after v3.0 cleanup

- inspiration.md: 'auto' mode -> 'worker'; py_pool:start_link -> py_context_router:start_pool
- asyncio.md: paired async_call with async_await (not await)
- py.erl: bind_context lives in py_context_router, not py
- reactor_echo example: 'auto' mode -> 'worker'
- bench_async_task header: spawn -> spawn_task
- .app.src: drop stale py_pool from registered list
---
 docs/asyncio.md               |  2 +-
 docs/inspiration.md           | 10 +++++-----
 examples/bench_async_task.erl |  2 +-
 examples/reactor_echo.erl     |  2 +-
 src/erlang_python.app.src     |  2 +-
 src/py.erl                    |  4 ++--
 6 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/docs/asyncio.md b/docs/asyncio.md
index f19ac0f..a86a874 100644
--- a/docs/asyncio.md
+++ b/docs/asyncio.md
@@ -923,7 +923,7 @@ erlang.run(main())
 
 ## Async Worker Backend (Internal)
 
-The `py:async_call/3,4` and `py:await/1,2` APIs use an event-driven backend based on `py_event_loop`.
+The `py:async_call/3,4` and `py:async_await/1,2` APIs use an event-driven backend based on `py_event_loop`.
 
 ### Architecture
 
diff --git a/docs/inspiration.md b/docs/inspiration.md
index 0c31767..b4cce7f 100644
--- a/docs/inspiration.md
+++ b/docs/inspiration.md
@@ -139,7 +139,7 @@ Build TCP/UDP servers with Python protocol logic:
 
 ```erlang
 %% Erlang handles TCP accept and I/O scheduling
-{ok, Ctx} = py_reactor_context:start_link(1, auto),
+{ok, Ctx} = py_reactor_context:start_link(1, worker),
 
 py:exec(Ctx, <<"
 import erlang.reactor as reactor
@@ -312,8 +312,8 @@ Each Python context runs in isolation:
 
 ```erlang
 %% Multiple independent contexts
-{ok, Ctx1} = py_context:start_link(1, auto),
-{ok, Ctx2} = py_context:start_link(2, auto),
+{ok, Ctx1} = py_context:start_link(1, worker),
+{ok, Ctx2} = py_context:start_link(2, worker),
 
 %% Failures in Ctx1 don't affect Ctx2
 py:exec(Ctx1, <<"import dangerous_lib">>),
@@ -326,10 +326,10 @@ Separate pools prevent I/O from blocking compute:
 
 ```erlang
 %% CPU-bound pool (for ML inference)
-{ok, _} = py_pool:start_link(cpu_pool, #{size => 4}),
+{ok, _} = py_context_router:start_pool(cpu_pool, 4, owngil),
 
 %% I/O-bound pool (for API calls)
-{ok, _} = py_pool:start_link(io_pool, #{size => 16}),
+{ok, _} = py_context_router:start_pool(io_pool, 16, worker),
 
 %% Route accordingly
 py:call(cpu_pool, model, predict, [Data]),
diff --git a/examples/bench_async_task.erl b/examples/bench_async_task.erl
index 3a87cd2..9b737a1 100644
--- a/examples/bench_async_task.erl
+++ b/examples/bench_async_task.erl
@@ -7,7 +7,7 @@
 %%% Tests the new py_event_loop async task API:
 %%%   - py_event_loop:run/3,4 (blocking)
 %%%   - py_event_loop:create_task/3,4 + await (non-blocking)
-%%%   - py_event_loop:spawn/3,4 (fire-and-forget)
+%%%   - py_event_loop:spawn_task/3,4 (fire-and-forget)
 %%%
 %%% Run with:
 %%%   rebar3 compile && escript examples/bench_async_task.erl
diff --git a/examples/reactor_echo.erl b/examples/reactor_echo.erl
index bd45d25..40117b4 100644
--- a/examples/reactor_echo.erl
+++ b/examples/reactor_echo.erl
@@ -25,7 +25,7 @@ main(_) ->
     io:format("~n=== Erlang Reactor Echo Server ===~n~n"),
 
     %% Start a reactor context
-    {ok, Ctx} = py_reactor_context:start_link(1, auto),
+    {ok, Ctx} = py_reactor_context:start_link(1, worker),
 
     %% Set up Python echo protocol
     ok = py:exec(Ctx, <<"
diff --git a/src/erlang_python.app.src b/src/erlang_python.app.src
index 6534c1d..997aace 100644
--- a/src/erlang_python.app.src
+++ b/src/erlang_python.app.src
@@ -1,7 +1,7 @@
 {application, erlang_python, [
     {description, "Execute Python applications from Erlang using dirty NIFs"},
     {vsn, "3.0.0"},
-    {registered, [py_pool]},
+    {registered, []},
     {mod, {erlang_python_app, []}},
     {applications, [
         kernel,
diff --git a/src/py.erl b/src/py.erl
index c16f944..84fa976 100644
--- a/src/py.erl
+++ b/src/py.erl
@@ -1317,8 +1317,8 @@ clear_traces() ->
 %%% Ctx = py:context(),
 %%% {ok, Result} = py:call(Ctx, math, sqrt, [16]),
 %%%
-%%% %% Or bind a specific context to this process
-%%% ok = py:bind_context(py:context(1)),
+%%% %% Or bind a specific context to this process via the router
+%%% ok = py_context_router:bind_context(py:context(1)),
 %%% {ok, Result} = py:call(py:context(), math, sqrt, [16]).
 %%% '''
 %%% ============================================================================

From af9615645bdbb689e543a030f73e5cb18ff90890 Mon Sep 17 00:00:00 2001
From: Benoit Chesneau <bchesneau@gmail.com>
Date: Fri, 1 May 2026 20:55:47 +0200
Subject: [PATCH 05/17] Reject auto / unknown context-mode atoms at the NIF
 boundary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

py_context already validates worker | owngil via pattern matching, but
py_reactor_context calls py_nif:context_create/1 directly, which silently
mapped any non-owngil atom to worker. That hid genuine misuse — including
the auto atom that lingered in a doc snippet and one CT test.

Tighten nif_context_create to return {error, {invalid_mode, Atom}} for
anything other than worker | owngil, and update the one test that relied
on the silent-accept behavior.
---
 c_src/py_nif.c            | 15 +++++++++++++--
 test/py_reactor_SUITE.erl |  2 +-
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/c_src/py_nif.c b/c_src/py_nif.c
index bcfb671..3fd2c93 100644
--- a/c_src/py_nif.c
+++ b/c_src/py_nif.c
@@ -4629,13 +4629,24 @@ static ERL_NIF_TERM nif_context_create(ErlNifEnv *env, int argc, const ERL_NIF_T
         return make_error(env, "python_not_running");
     }
 
-    /* Parse mode atom */
+    /* Parse mode atom — reject anything other than worker | owngil so
+     * callers that bypass py_context (e.g. py_reactor_context) get the
+     * same strict validation py_context:create_context/1 already enforces. */
     char mode_str[32];
     if (!enif_get_atom(env, argv[0], mode_str, sizeof(mode_str), ERL_NIF_LATIN1)) {
         return make_error(env, "invalid_mode");
     }
 
-    bool use_owngil = (strcmp(mode_str, "owngil") == 0);
+    bool use_owngil;
+    if (strcmp(mode_str, "worker") == 0) {
+        use_owngil = false;
+    } else if (strcmp(mode_str, "owngil") == 0) {
+        use_owngil = true;
+    } else {
+        return enif_make_tuple2(
+            env, ATOM_ERROR,
+            enif_make_tuple2(env, enif_make_atom(env, "invalid_mode"), argv[0]));
+    }
 
     /* Allocate context resource */
     py_context_t *ctx = enif_alloc_resource(PY_CONTEXT_RESOURCE_TYPE, sizeof(py_context_t));
diff --git a/test/py_reactor_SUITE.erl b/test/py_reactor_SUITE.erl
index f0b4d74..d48003b 100644
--- a/test/py_reactor_SUITE.erl
+++ b/test/py_reactor_SUITE.erl
@@ -247,7 +247,7 @@ reactor.set_protocol_factory(AsyncPendingProtocol)
 ">>,
 
     %% Start reactor context with protocol factory setup
-    {ok, ReactorCtx} = py_reactor_context:start_link(1, auto, #{
+    {ok, ReactorCtx} = py_reactor_context:start_link(1, worker, #{
         setup_code => SetupCode
     }),
 

From 8e620d5dd425ce54750c71fd2eb429e537b1031b Mon Sep 17 00:00:00 2001
From: Benoit Chesneau <bchesneau@gmail.com>
Date: Fri, 1 May 2026 21:31:02 +0200
Subject: [PATCH 06/17] Fix Erlang compile warnings in test suites

- py_actor_SUITE: prefix unused Pid variable with underscore
- py_async_task_SUITE: match +0.0 instead of 0.0 (OTP 27+ treats 0.0 and
  -0.0 as distinct terms and warns when a pattern uses a bare 0.0)
- py_reentrant_SUITE, py_pid_send_SUITE: replace deprecated
  code:lib_dir/2 with filename:join(code:lib_dir/1, "test")
---
 test/py_actor_SUITE.erl      | 2 +-
 test/py_async_task_SUITE.erl | 4 ++--
 test/py_pid_send_SUITE.erl   | 2 +-
 test/py_reentrant_SUITE.erl  | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/test/py_actor_SUITE.erl b/test/py_actor_SUITE.erl
index 6ddec82..6fc44c2 100644
--- a/test/py_actor_SUITE.erl
+++ b/test/py_actor_SUITE.erl
@@ -70,7 +70,7 @@ test_process_isolation(_Config) ->
     {ok, <<"main_process">>} = py:eval(Ctx, <<"isolation_test">>),
 
     %% Spawn another process using the same context
-    Pid = spawn(fun() ->
+    _Pid = spawn(fun() ->
         %% This process should have its own environment
         ok = py:exec(Ctx, <<"isolation_test = 'spawned_process'">>),
         {ok, Value} = py:eval(Ctx, <<"isolation_test">>),
diff --git a/test/py_async_task_SUITE.erl b/test/py_async_task_SUITE.erl
index 0e777e8..0646832 100644
--- a/test/py_async_task_SUITE.erl
+++ b/test/py_async_task_SUITE.erl
@@ -164,7 +164,7 @@ test_async_coroutine(_Config) ->
     Ref = py_event_loop:create_task(math, sin, [0.0]),
     Result = py_event_loop:await(Ref, 5000),
     ct:log("math.sin(0.0) = ~p", [Result]),
-    {ok, 0.0} = Result.
+    {ok, +0.0} = Result.
 
 test_async_with_args(_Config) ->
     %% Test with args using operator module
@@ -328,7 +328,7 @@ test_interleaved_sync_async(_Config) ->
     R4 = py_event_loop:create_task(math, sqrt, [64.0]),
 
     {ok, 3} = py_event_loop:await(R1, 5000),
-    {ok, 0.0} = py_event_loop:await(R2, 5000),
+    {ok, +0.0} = py_event_loop:await(R2, 5000),
     {ok, 30} = py_event_loop:await(R3, 5000),
     {ok, 8.0} = py_event_loop:await(R4, 5000),
     ct:log("Interleaved sync/async tests passed").
diff --git a/test/py_pid_send_SUITE.erl b/test/py_pid_send_SUITE.erl
index 0f9f6ea..45dc33d 100644
--- a/test/py_pid_send_SUITE.erl
+++ b/test/py_pid_send_SUITE.erl
@@ -84,7 +84,7 @@ init_per_suite(Config) ->
     {ok, _} = application:ensure_all_started(erlang_python),
     %% Add test directory to Python path on ALL contexts
     %% (subinterpreters have isolated sys.path)
-    TestDir = code:lib_dir(erlang_python, test),
+    TestDir = filename:join(code:lib_dir(erlang_python), "test"),
     PathCmd = iolist_to_binary(io_lib:format(
         "import sys; sys.path.insert(0, '~s')", [TestDir])),
     NumContexts = py_context_router:num_contexts(),
diff --git a/test/py_reentrant_SUITE.erl b/test/py_reentrant_SUITE.erl
index ca01693..0db9909 100644
--- a/test/py_reentrant_SUITE.erl
+++ b/test/py_reentrant_SUITE.erl
@@ -288,7 +288,7 @@ test_callback_with_try_except(_Config) ->
     end),
 
     %% Add test directory to Python path so we can import the test module
-    TestDir = code:lib_dir(erlang_python, test),
+    TestDir = filename:join(code:lib_dir(erlang_python), "test"),
     ok = py:exec(iolist_to_binary(io_lib:format(
         "import sys; sys.path.insert(0, '~s')", [TestDir]))),
 
@@ -324,7 +324,7 @@ test_async_call(_Config) ->
     py:register_function(async_multiply, fun([X, Y]) -> X * Y end),
 
     %% Add test directory to Python path
-    TestDir = code:lib_dir(erlang_python, test),
+    TestDir = filename:join(code:lib_dir(erlang_python), "test"),
     ok = py:exec(iolist_to_binary(io_lib:format(
         "import sys; sys.path.insert(0, '~s')", [TestDir]))),
 

From f118114b452e30195a68df85e67d452e26d9c530 Mon Sep 17 00:00:00 2001
From: Benoit Chesneau <bchesneau@gmail.com>
Date: Fri, 1 May 2026 22:20:10 +0200
Subject: [PATCH 07/17] Skip asyncio.set_event_loop_policy install on Python
 3.14+
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Python 3.14 deprecated the call; 3.16 removes it. Our run path
(erlang.run / asyncio.Runner with loop_factory=) doesn't need the
global policy — it was only a convenience for bare asyncio.run()
inside py:exec. Gate both Erlang-side helpers and the Python
erlang.install() entry point on sys.version_info < (3, 14); the
latter now raises with a migration message on 3.14+.

Behavior on Python 3.9-3.13 is unchanged. Adds
test/py_asyncio_policy_SUITE.erl (7 cases; the version-specific ones
self-skip) pinning the gate, the run-path-without-policy invariant,
and a sentinel asserting no set_event_loop_policy DeprecationWarning
fires on 3.14+ init.
---
 CHANGELOG.md                     |   9 ++
 priv/_erlang_impl/__init__.py    |  17 ++-
 priv/_erlang_impl/_policy.py     |  25 +++--
 priv/tests/test_erlang_api.py    |  29 ++++--
 src/py_event_loop.erl            |  22 +++-
 test/py_asyncio_policy_SUITE.erl | 173 +++++++++++++++++++++++++++++++
 6 files changed, 253 insertions(+), 22 deletions(-)
 create mode 100644 test/py_asyncio_policy_SUITE.erl

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7f3c3a5..cf7c017 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -51,6 +51,15 @@
 - **Request queue per context** - Replaced single-slot request pattern with proper
   request queues that support multiple concurrent callers.
 
+- **No global asyncio policy install on Python 3.14+.** `asyncio.set_event_loop_policy`
+  was deprecated in 3.14 and is removed in 3.16. The Erlang integration's run path
+  already uses `loop_factory=` (`erlang.run/1`, `asyncio.Runner`) so the global
+  policy was only a convenience for bare `asyncio.run()` inside `py:exec`. We now
+  skip the install on 3.14+ to avoid the deprecation warning. On 3.14+ use
+  `erlang.run(main)` or `asyncio.Runner(loop_factory=erlang.new_event_loop)`
+  explicitly. Behavior on Python 3.9–3.13 is unchanged. `erlang.install()` raises
+  `RuntimeError` on 3.14+ (still emits a `DeprecationWarning` and works on 3.12–3.13).
+
 ### Removed
 
 - Multi-executor pool (`g_executors[]`, `multi_executor_start/stop`)
diff --git a/priv/_erlang_impl/__init__.py b/priv/_erlang_impl/__init__.py
index 9090804..7dd1ac8 100644
--- a/priv/_erlang_impl/__init__.py
+++ b/priv/_erlang_impl/__init__.py
@@ -399,15 +399,28 @@ def _run_async_from_erlang(module, func, args, kwargs):
 def install():
     """Install ErlangEventLoopPolicy as the default event loop policy.
 
-    This function is deprecated in Python 3.12+. Use run() instead.
+    Deprecated in Python 3.12+; raises ``RuntimeError`` on Python 3.14+
+    where the underlying ``asyncio.set_event_loop_policy`` is itself
+    deprecated and slated for removal in 3.16.
 
-    Example (legacy pattern):
+    Use ``erlang.run(main)`` or
+    ``asyncio.Runner(loop_factory=erlang.new_event_loop)`` instead —
+    both work on every supported Python version and don't touch the
+    global policy.
+
+    Example (legacy pattern, Python 3.9–3.13 only):
         import asyncio
         import erlang
 
         erlang.install()
         asyncio.run(main())  # Uses Erlang event loop
     """
+    if sys.version_info >= (3, 14):
+        raise RuntimeError(
+            "erlang.install() is not supported on Python 3.14+. "
+            "Use erlang.run(main) or "
+            "asyncio.Runner(loop_factory=erlang.new_event_loop) instead."
+        )
     if sys.version_info >= (3, 12):
         warnings.warn(
             "erlang.install() is deprecated in Python 3.12+. "
diff --git a/priv/_erlang_impl/_policy.py b/priv/_erlang_impl/_policy.py
index 37b18af..bff6603 100644
--- a/priv/_erlang_impl/_policy.py
+++ b/priv/_erlang_impl/_policy.py
@@ -33,19 +33,28 @@ class ErlangEventLoopPolicy(asyncio.AbstractEventLoopPolicy):
     This policy creates ErlangEventLoop instances for the main thread
     and optionally for child threads depending on configuration.
 
-    Usage:
-        import asyncio
+    Recommended usage on Python 3.12+ (no policy required):
+
         import erlang
+        erlang.run(main())
 
-        # Install the policy
-        asyncio.set_event_loop_policy(erlang.EventLoopPolicy())
+        # or, equivalently:
+        import asyncio
+        with asyncio.Runner(loop_factory=erlang.new_event_loop) as r:
+            r.run(main())
+
+    Legacy pattern for Python 3.9–3.11 (also works through 3.13 with a
+    DeprecationWarning, raises on 3.14+):
 
-        # Now asyncio.run() uses Erlang event loop
+        import asyncio, erlang
+        asyncio.set_event_loop_policy(erlang.EventLoopPolicy())
         asyncio.run(main())
 
-    Note:
-        This approach is deprecated in Python 3.12+.
-        Use erlang.run() instead.
+    Notes:
+        ``asyncio.set_event_loop_policy`` is deprecated in Python 3.14
+        and removed in 3.16, so only ``erlang.run`` /
+        ``asyncio.Runner(loop_factory=...)`` are guaranteed to work
+        across the full supported range.
     """
 
     def __init__(self):
diff --git a/priv/tests/test_erlang_api.py b/priv/tests/test_erlang_api.py
index a07f801..ee725c6 100644
--- a/priv/tests/test_erlang_api.py
+++ b/priv/tests/test_erlang_api.py
@@ -251,18 +251,30 @@ async def main():
         self.assertEqual(result, 'debug_test')
 
     def test_install_function(self):
-        """Test erlang.install() function."""
+        """Test erlang.install() function across supported Python versions."""
         erlang = _get_erlang_module()
 
-        old_policy = asyncio.get_event_loop_policy()
+        if sys.version_info >= (3, 14):
+            # 3.14 deprecated set_event_loop_policy and 3.16 removes it,
+            # so erlang.install() now raises with a migration message.
+            with self.assertRaises(RuntimeError) as cm:
+                erlang.install()
+            msg = str(cm.exception)
+            self.assertIn("3.14+", msg)
+            return
+
+        # 3.9-3.13: install() still works (DeprecationWarning on 3.12+).
+        # Suppress the asyncio DeprecationWarning emitted by the
+        # get_event_loop_policy() probe itself on those versions.
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", DeprecationWarning)
+            old_policy = asyncio.get_event_loop_policy()
 
         try:
             if sys.version_info >= (3, 12):
-                # Should emit deprecation warning
                 with warnings.catch_warnings(record=True) as w:
                     warnings.simplefilter("always")
                     erlang.install()
-                    self.assertTrue(len(w) >= 1)
                     self.assertTrue(
                         any(issubclass(warning.category, DeprecationWarning)
                             for warning in w)
@@ -270,12 +282,15 @@ def test_install_function(self):
             else:
                 erlang.install()
 
-            # Policy should be ErlangEventLoopPolicy
-            policy = asyncio.get_event_loop_policy()
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", DeprecationWarning)
+                policy = asyncio.get_event_loop_policy()
             self.assertIsInstance(policy, erlang.EventLoopPolicy)
 
         finally:
-            asyncio.set_event_loop_policy(old_policy)
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", DeprecationWarning)
+                asyncio.set_event_loop_policy(old_policy)
 
 
 class TestErlangLoopSpecificFeatures(tb.ErlangTestCase):
diff --git a/src/py_event_loop.erl b/src/py_event_loop.erl
index 65093f9..dfcabdd 100644
--- a/src/py_event_loop.erl
+++ b/src/py_event_loop.erl
@@ -340,19 +340,28 @@ init([]) ->
 
 %% @doc Set ErlangEventLoop as the default asyncio event loop policy.
 %% Also extends the C 'erlang' module with Python event loop exports.
+%%
+%% Python 3.14 deprecated `asyncio.set_event_loop_policy` and 3.16 removes
+%% it. The integration's run path uses `loop_factory=` directly via
+%% `erlang.run/1` and `asyncio.Runner`, so the global policy install is
+%% only a convenience for user code that calls bare asyncio APIs inside
+%% `py:exec`. We skip the install on 3.14+ to avoid the warning; users
+%% on 3.14+ should call `erlang.run(main)` or
+%% `asyncio.Runner(loop_factory=erlang.new_event_loop)` explicitly.
 set_default_policy() ->
     PrivDir = code:priv_dir(erlang_python),
     %% First, extend the erlang module with Python event loop exports
     extend_erlang_module(PrivDir),
-    %% Then set the event loop policy
+    %% Then set the event loop policy (only on Python < 3.14)
     Code = iolist_to_binary([
         "import sys\n",
         "priv_dir = '", PrivDir, "'\n",
         "if priv_dir not in sys.path:\n",
         "    sys.path.insert(0, priv_dir)\n",
-        "from _erlang_impl import get_event_loop_policy\n",
         "import asyncio\n",
-        "asyncio.set_event_loop_policy(get_event_loop_policy())\n"
+        "if sys.version_info < (3, 14):\n",
+        "    from _erlang_impl import get_event_loop_policy\n",
+        "    asyncio.set_event_loop_policy(get_event_loop_policy())\n"
     ]),
     case py:exec(Code) of
         ok -> ok;
@@ -425,10 +434,13 @@ terminate(_Reason, #state{loop_ref = LoopRef, worker_pid = WorkerPid}) ->
     ok.
 
 %% @doc Reset asyncio back to the default event loop policy.
+%% Skipped on Python 3.14+ since we never installed one (see
+%% set_default_policy/0).
 reset_default_policy() ->
     Code = <<"
-import asyncio
-asyncio.set_event_loop_policy(None)
+import sys, asyncio
+if sys.version_info < (3, 14):
+    asyncio.set_event_loop_policy(None)
 ">>,
     catch py:exec(Code),
     ok.
diff --git a/test/py_asyncio_policy_SUITE.erl b/test/py_asyncio_policy_SUITE.erl
new file mode 100644
index 0000000..2203678
--- /dev/null
+++ b/test/py_asyncio_policy_SUITE.erl
@@ -0,0 +1,173 @@
+%%% @doc CT suite pinning the version-gated asyncio policy install.
+%%%
+%%% On Python 3.14+ the integration must NOT call
+%%% `asyncio.set_event_loop_policy/0` (deprecated in 3.14, removed in
+%%% 3.16). On Python <3.14 the policy install is preserved as the
+%%% historical convenience for bare `asyncio.run()` inside `py:exec`.
+%%%
+%%% These cases verify both halves of the gate plus the architectural
+%%% claim that the run path works without the global policy.
+-module(py_asyncio_policy_SUITE).
+
+-include_lib("common_test/include/ct.hrl").
+
+-export([
+    all/0,
+    init_per_suite/1,
+    end_per_suite/1
+]).
+
+-export([
+    policy_install_skipped_on_3_14_plus/1,
+    policy_install_active_below_3_14/1,
+    async_call_round_trip/1,
+    erlang_run_uses_erlang_loop/1,
+    install_raises_on_3_14_plus/1,
+    install_works_below_3_14/1,
+    no_deprecation_warning_during_init/1
+]).
+
+all() ->
+    [
+        policy_install_skipped_on_3_14_plus,
+        policy_install_active_below_3_14,
+        async_call_round_trip,
+        erlang_run_uses_erlang_loop,
+        install_raises_on_3_14_plus,
+        install_works_below_3_14,
+        no_deprecation_warning_during_init
+    ].
+
+init_per_suite(Config) ->
+    {ok, _} = application:ensure_all_started(erlang_python),
+    {ok, _} = py:start_contexts(),
+    Config.
+
+end_per_suite(_Config) ->
+    ok = application:stop(erlang_python),
+    ok.
+
+%%% ---------------------------------------------------------------------------
+%%% Helpers
+%%% ---------------------------------------------------------------------------
+
+python_at_least(Major, Minor) ->
+    ok = py:exec(<<"import sys">>),
+    {ok, {Maj, Min}} = py:eval(<<"sys.version_info[:2]">>),
+    {Maj, Min} >= {Major, Minor}.
+
+policy_class_name() ->
+    %% asyncio.get_event_loop_policy() itself emits a DeprecationWarning
+    %% on 3.14+; suppress it locally so the probe doesn't pollute the run.
+    Code = <<
+        "import asyncio, warnings\n"
+        "with warnings.catch_warnings():\n"
+        "    warnings.simplefilter('ignore', DeprecationWarning)\n"
+        "    _pol_name = asyncio.get_event_loop_policy().__class__.__name__\n"
+    >>,
+    ok = py:exec(Code),
+    {ok, Name} = py:eval(<<"_pol_name">>),
+    Name.
+
+%%% ---------------------------------------------------------------------------
+%%% Tests
+%%% ---------------------------------------------------------------------------
+
+policy_install_skipped_on_3_14_plus(_Config) ->
+    case python_at_least(3, 14) of
+        false ->
+            {skip, "Python <3.14 — global policy install is the right way"};
+        true ->
+            Name = policy_class_name(),
+            true = Name =/= <<"ErlangEventLoopPolicy">>,
+            ct:pal("Policy on 3.14+ is ~p (not ErlangEventLoopPolicy, as expected)",
+                   [Name]),
+            ok
+    end.
+
+policy_install_active_below_3_14(_Config) ->
+    case python_at_least(3, 14) of
+        true ->
+            {skip, "Python 3.14+ — policy is intentionally not installed"};
+        false ->
+            <<"ErlangEventLoopPolicy">> = policy_class_name(),
+            ok
+    end.
+
+async_call_round_trip(_Config) ->
+    %% Independent of policy state: async_call → async_await must succeed.
+    Ref = py:async_call(math, sqrt, [16]),
+    true = is_reference(Ref),
+    {ok, 4.0} = py:async_await(Ref, 5000),
+    ok.
+
+erlang_run_uses_erlang_loop(_Config) ->
+    %% Confirm erlang.run picks up ErlangEventLoop on every supported
+    %% version, regardless of the global policy state.
+    ok = py:exec(<<
+        "import erlang, asyncio\n"
+        "async def _probe():\n"
+        "    return type(asyncio.get_running_loop()).__name__\n"
+        "_loop_class = erlang.run(_probe())\n"
+    >>),
+    {ok, <<"ErlangEventLoop">>} = py:eval(<<"_loop_class">>),
+    ok.
+
+install_raises_on_3_14_plus(_Config) ->
+    case python_at_least(3, 14) of
+        false ->
+            {skip, "Python <3.14 — erlang.install() still functional"};
+        true ->
+            ok = py:exec(<<
+                "import erlang\n"
+                "_install_err = None\n"
+                "try:\n"
+                "    erlang.install()\n"
+                "except RuntimeError as e:\n"
+                "    _install_err = str(e)\n"
+            >>),
+            {ok, ErrMsg} = py:eval(<<"_install_err">>),
+            true = is_binary(ErrMsg),
+            true = byte_size(ErrMsg) > 0,
+            true = binary:match(ErrMsg, <<"3.14+">>) =/= nomatch
+                orelse binary:match(ErrMsg, <<"loop_factory">>) =/= nomatch,
+            ok
+    end.
+
+install_works_below_3_14(_Config) ->
+    case python_at_least(3, 14) of
+        true ->
+            {skip, "Python 3.14+ — erlang.install() raises by design"};
+        false ->
+            %% A DeprecationWarning is acceptable on 3.12-3.13; any
+            %% exception is not.
+            ok = py:exec(<<
+                "import erlang, warnings\n"
+                "with warnings.catch_warnings():\n"
+                "    warnings.simplefilter('ignore', DeprecationWarning)\n"
+                "    erlang.install()\n"
+            >>),
+            ok
+    end.
+
+no_deprecation_warning_during_init(_Config) ->
+    case python_at_least(3, 14) of
+        false ->
+            {skip, "Python <3.14 — no deprecation warning to verify"};
+        true ->
+            %% Mimic the set_default_policy snippet inside a
+            %% catch_warnings block; assert no set_event_loop_policy
+            %% warning surfaces.
+            ok = py:exec(<<
+                "import asyncio, sys, warnings\n"
+                "with warnings.catch_warnings(record=True) as _caught:\n"
+                "    warnings.simplefilter('always')\n"
+                "    if sys.version_info < (3, 14):\n"
+                "        from _erlang_impl import get_event_loop_policy\n"
+                "        asyncio.set_event_loop_policy(get_event_loop_policy())\n"
+                "_relevant = [str(w.message) for w in _caught\n"
+                "             if 'set_event_loop_policy' in str(w.message)]\n"
+            >>),
+            {ok, []} = py:eval(<<"_relevant">>),
+            ok
+    end.

From c2673943c58debed5efa70396139bc218543adbc Mon Sep 17 00:00:00 2001
From: Benoit Chesneau <bchesneau@gmail.com>
Date: Fri, 1 May 2026 22:48:44 +0200
Subject: [PATCH 08/17] CHANGELOG: note strict context-mode validation

---
 CHANGELOG.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cf7c017..074ab6f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -26,6 +26,13 @@
   were no-ops after the v3.0 worker rework. Configure context count via
   `num_contexts` and the rate-limit ceiling via `max_concurrent`.
 
+- **Strict context-mode validation at the NIF boundary** - `py_nif:context_create/1`
+  now returns `{error, {invalid_mode, Atom}}` for anything other than `worker | owngil`.
+  Previously, callers that bypassed `py_context` (notably `py_reactor_context`)
+  silently mapped any unknown atom — including legacy `auto` and `subinterp` —
+  to worker mode. Code that relied on that loophole must pass `worker` (or
+  `owngil`) explicitly.
+
 ### Fixed
 
 - **`py:async_call/3,4` + `py:async_await/1,2` round-trip** - Previously the

From 55cdd017335128c832b441ca51dc8a48beb7b75b Mon Sep 17 00:00:00 2001
From: Benoit Chesneau <bchesneau@gmail.com>
Date: Fri, 1 May 2026 22:55:15 +0200
Subject: [PATCH 09/17] Fix edoc parse error in set_default_policy/0

EDoc expects code-quote spans as `text', not `text`; the markdown-style
backticks I added in the Python 3.14 deprecation note crashed
edoc_doclet_chunks.
---
 src/py_event_loop.erl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/py_event_loop.erl b/src/py_event_loop.erl
index dfcabdd..b89bda1 100644
--- a/src/py_event_loop.erl
+++ b/src/py_event_loop.erl
@@ -341,13 +341,13 @@ init([]) ->
 %% @doc Set ErlangEventLoop as the default asyncio event loop policy.
 %% Also extends the C 'erlang' module with Python event loop exports.
 %%
-%% Python 3.14 deprecated `asyncio.set_event_loop_policy` and 3.16 removes
-%% it. The integration's run path uses `loop_factory=` directly via
-%% `erlang.run/1` and `asyncio.Runner`, so the global policy install is
+%% Python 3.14 deprecated `asyncio.set_event_loop_policy' and 3.16 removes
+%% it. The integration's run path uses `loop_factory=' directly via
+%% `erlang.run/1' and `asyncio.Runner', so the global policy install is
 %% only a convenience for user code that calls bare asyncio APIs inside
-%% `py:exec`. We skip the install on 3.14+ to avoid the warning; users
-%% on 3.14+ should call `erlang.run(main)` or
-%% `asyncio.Runner(loop_factory=erlang.new_event_loop)` explicitly.
+%% `py:exec'. We skip the install on 3.14+ to avoid the warning; users
+%% on 3.14+ should call `erlang.run(main)' or
+%% `asyncio.Runner(loop_factory=erlang.new_event_loop)' explicitly.
 set_default_policy() ->
     PrivDir = code:priv_dir(erlang_python),
     %% First, extend the erlang module with Python event loop exports

From 28b809082dd59fccd60c8b0d24ac68599f7e38ce Mon Sep 17 00:00:00 2001
From: Benoit Chesneau <bchesneau@gmail.com>
Date: Fri, 1 May 2026 23:12:40 +0200
Subject: [PATCH 10/17] Drop dead subinterp_thread handle helpers

The handle-level entry points (subinterp_thread_handle_create/destroy,
subinterp_thread_call/eval/exec/cast/async_call) along with the
py_subinterp_handle_t struct and PY_SUBINTERP_HANDLE_RESOURCE_TYPE
became unreachable when the public py:subinterp_* API was removed.
Only the OWN_GIL session backend (subinterp_thread_pool_*) is still
used by py_event_loop_pool, so keep that path and drop the rest.
---
 c_src/py_subinterp_thread.c | 394 ------------------------------------
 c_src/py_subinterp_thread.h | 122 -----------
 2 files changed, 516 deletions(-)

diff --git a/c_src/py_subinterp_thread.c b/c_src/py_subinterp_thread.c
index ee97c9c..72a1552 100644
--- a/c_src/py_subinterp_thread.c
+++ b/c_src/py_subinterp_thread.c
@@ -40,9 +40,6 @@
 /** @brief Global thread pool instance */
 subinterp_thread_pool_t g_thread_pool = {0};
 
-/** @brief Resource type for handles (set by NIF load) */
-ErlNifResourceType *PY_SUBINTERP_HANDLE_RESOURCE_TYPE = NULL;
-
 /* Forward declarations */
 static void *worker_thread_main(void *arg);
 static int worker_create_namespace(subinterp_thread_worker_t *w, uint64_t handle_id);
@@ -1087,397 +1084,6 @@ static subinterp_namespace_t *worker_find_namespace(subinterp_thread_worker_t *w
     return NULL;
 }
 
-/* ============================================================================
- * Handle Management
- * ============================================================================ */
-
-int subinterp_thread_handle_create(py_subinterp_handle_t *handle) {
-    if (!subinterp_thread_pool_is_ready()) {
-        return -1;
-    }
-
-    /* Select worker round-robin */
-    uint64_t worker_idx = atomic_fetch_add(&g_thread_pool.next_worker, 1);
-    int worker_id = worker_idx % g_thread_pool.num_workers;
-
-    /* Generate unique handle ID */
-    uint64_t handle_id = atomic_fetch_add(&g_thread_pool.next_handle_id, 1);
-
-    handle->worker_id = worker_id;
-    handle->handle_id = handle_id;
-    atomic_store(&handle->destroyed, false);
-
-    /* Create namespace in worker */
-    subinterp_thread_worker_t *w = &g_thread_pool.workers[worker_id];
-
-    /* Lock dispatch to ensure exclusive access */
-    pthread_mutex_lock(&w->dispatch_mutex);
-
-    /* Send create namespace request */
-    uint64_t request_id = atomic_fetch_add(&g_thread_pool.next_request_id, 1);
-    owngil_header_t header = {
-        .magic = OWNGIL_MAGIC,
-        .version = OWNGIL_PROTOCOL_VERSION,
-        .msg_type = MSG_REQUEST,
-        .req_type = REQ_CREATE_NS,
-        .request_id = request_id,
-        .handle_id = handle_id,
-        .payload_len = 0,
-    };
-
-    write_full(w->cmd_pipe[1], &header, sizeof(header));
-
-    /* Wait for response */
-    owngil_header_t resp;
-    read_full(w->result_pipe[0], &resp, sizeof(resp));
-
-    pthread_mutex_unlock(&w->dispatch_mutex);
-
-    return (resp.msg_type == MSG_RESPONSE) ? 0 : -1;
-}
-
-void subinterp_thread_handle_destroy(py_subinterp_handle_t *handle) {
-    if (atomic_exchange(&handle->destroyed, true)) {
-        return;  /* Already destroyed */
-    }
-
-    if (!subinterp_thread_pool_is_ready()) {
-        return;
-    }
-
-    if (handle->worker_id < 0 || handle->worker_id >= g_thread_pool.num_workers) {
-        return;
-    }
-
-    subinterp_thread_worker_t *w = &g_thread_pool.workers[handle->worker_id];
-
-    pthread_mutex_lock(&w->dispatch_mutex);
-
-    /* Send destroy namespace request */
-    uint64_t request_id = atomic_fetch_add(&g_thread_pool.next_request_id, 1);
-    owngil_header_t header = {
-        .magic = OWNGIL_MAGIC,
-        .version = OWNGIL_PROTOCOL_VERSION,
-        .msg_type = MSG_REQUEST,
-        .req_type = REQ_DESTROY_NS,
-        .request_id = request_id,
-        .handle_id = handle->handle_id,
-        .payload_len = 0,
-    };
-
-    write_full(w->cmd_pipe[1], &header, sizeof(header));
-
-    /* Wait for response */
-    owngil_header_t resp;
-    read_full(w->result_pipe[0], &resp, sizeof(resp));
-
-    pthread_mutex_unlock(&w->dispatch_mutex);
-}
-
-/* ============================================================================
- * Execution API
- * ============================================================================ */
-
-ERL_NIF_TERM subinterp_thread_call(ErlNifEnv *env, py_subinterp_handle_t *handle,
-                                    ERL_NIF_TERM module, ERL_NIF_TERM func,
-                                    ERL_NIF_TERM args, ERL_NIF_TERM kwargs) {
-    if (atomic_load(&handle->destroyed)) {
-        return enif_make_tuple2(env, enif_make_atom(env, "error"),
-                                enif_make_atom(env, "handle_destroyed"));
-    }
-
-    if (!subinterp_thread_pool_is_ready()) {
-        return enif_make_tuple2(env, enif_make_atom(env, "error"),
-                                enif_make_atom(env, "pool_not_ready"));
-    }
-
-    subinterp_thread_worker_t *w = &g_thread_pool.workers[handle->worker_id];
-
-    /* Build payload tuple: {Module, Func, Args, Kwargs} */
-    ERL_NIF_TERM payload_tuple = enif_make_tuple4(env, module, func, args, kwargs);
-
-    /* Serialize to ETF */
-    ErlNifBinary payload_bin;
-    if (!enif_term_to_binary(env, payload_tuple, &payload_bin)) {
-        return enif_make_tuple2(env, enif_make_atom(env, "error"),
-                                enif_make_atom(env, "serialization_failed"));
-    }
-
-    /* Lock dispatch */
-    pthread_mutex_lock(&w->dispatch_mutex);
-
-    /* Send request */
-    uint64_t request_id = atomic_fetch_add(&g_thread_pool.next_request_id, 1);
-    owngil_header_t header = {
-        .magic = OWNGIL_MAGIC,
-        .version = OWNGIL_PROTOCOL_VERSION,
-        .msg_type = MSG_REQUEST,
-        .req_type = REQ_CALL,
-        .request_id = request_id,
-        .handle_id = handle->handle_id,
-        .payload_len = payload_bin.size,
-    };
-
-    write_full(w->cmd_pipe[1], &header, sizeof(header));
-    write_full(w->cmd_pipe[1], payload_bin.data, payload_bin.size);
-    enif_release_binary(&payload_bin);
-
-    /* Read response */
-    owngil_header_t resp_header;
-    if (read_full(w->result_pipe[0], &resp_header, sizeof(resp_header)) != sizeof(resp_header)) {
-        pthread_mutex_unlock(&w->dispatch_mutex);
-        return enif_make_tuple2(env, enif_make_atom(env, "error"),
-                                enif_make_atom(env, "read_failed"));
-    }
-
-    ERL_NIF_TERM result;
-    if (resp_header.payload_len > 0) {
-        unsigned char *resp_payload = enif_alloc(resp_header.payload_len);
-        if (resp_payload == NULL) {
-            pthread_mutex_unlock(&w->dispatch_mutex);
-            return enif_make_tuple2(env, enif_make_atom(env, "error"),
-                                    enif_make_atom(env, "alloc_failed"));
-        }
-
-        if (read_full(w->result_pipe[0], resp_payload, resp_header.payload_len)
-                != (int)resp_header.payload_len) {
-            enif_free(resp_payload);
-            pthread_mutex_unlock(&w->dispatch_mutex);
-            return enif_make_tuple2(env, enif_make_atom(env, "error"),
-                                    enif_make_atom(env, "read_failed"));
-        }
-
-        /* Deserialize response */
-        if (enif_binary_to_term(env, resp_payload, resp_header.payload_len,
-                                 &result, 0) == 0) {
-            result = enif_make_tuple2(env, enif_make_atom(env, "error"),
-                                      enif_make_atom(env, "deserialize_failed"));
-        }
-
-        enif_free(resp_payload);
-    } else {
-        result = enif_make_atom(env, "ok");
-    }
-
-    pthread_mutex_unlock(&w->dispatch_mutex);
-    return result;
-}
-
-ERL_NIF_TERM subinterp_thread_eval(ErlNifEnv *env, py_subinterp_handle_t *handle,
-                                    ERL_NIF_TERM code, ERL_NIF_TERM locals) {
-    if (atomic_load(&handle->destroyed)) {
-        return enif_make_tuple2(env, enif_make_atom(env, "error"),
-                                enif_make_atom(env, "handle_destroyed"));
-    }
-
-    if (!subinterp_thread_pool_is_ready()) {
-        return enif_make_tuple2(env, enif_make_atom(env, "error"),
-                                enif_make_atom(env, "pool_not_ready"));
-    }
-
-    subinterp_thread_worker_t *w = &g_thread_pool.workers[handle->worker_id];
-
-    /* Build payload tuple: {Code, Locals} */
-    ERL_NIF_TERM payload_tuple = enif_make_tuple2(env, code, locals);
-
-    ErlNifBinary payload_bin;
-    if (!enif_term_to_binary(env, payload_tuple, &payload_bin)) {
-        return enif_make_tuple2(env, enif_make_atom(env, "error"),
-                                enif_make_atom(env, "serialization_failed"));
-    }
-
-    pthread_mutex_lock(&w->dispatch_mutex);
-
-    uint64_t request_id = atomic_fetch_add(&g_thread_pool.next_request_id, 1);
-    owngil_header_t header = {
-        .magic = OWNGIL_MAGIC,
-        .version = OWNGIL_PROTOCOL_VERSION,
-        .msg_type = MSG_REQUEST,
-        .req_type = REQ_EVAL,
-        .request_id = request_id,
-        .handle_id = handle->handle_id,
-        .payload_len = payload_bin.size,
-    };
-
-    write_full(w->cmd_pipe[1], &header, sizeof(header));
-    write_full(w->cmd_pipe[1], payload_bin.data, payload_bin.size);
-    enif_release_binary(&payload_bin);
-
-    owngil_header_t resp_header;
-    if (read_full(w->result_pipe[0], &resp_header, sizeof(resp_header)) != sizeof(resp_header)) {
-        pthread_mutex_unlock(&w->dispatch_mutex);
-        return enif_make_tuple2(env, enif_make_atom(env, "error"),
-                                enif_make_atom(env, "read_failed"));
-    }
-
-    ERL_NIF_TERM result;
-    if (resp_header.payload_len > 0) {
-        unsigned char *resp_payload = enif_alloc(resp_header.payload_len);
-        if (resp_payload == NULL) {
-            pthread_mutex_unlock(&w->dispatch_mutex);
-            return enif_make_tuple2(env, enif_make_atom(env, "error"),
-                                    enif_make_atom(env, "alloc_failed"));
-        }
-
-        if (read_full(w->result_pipe[0], resp_payload, resp_header.payload_len)
-                != (int)resp_header.payload_len) {
-            enif_free(resp_payload);
-            pthread_mutex_unlock(&w->dispatch_mutex);
-            return enif_make_tuple2(env, enif_make_atom(env, "error"),
-                                    enif_make_atom(env, "read_failed"));
-        }
-
-        if (enif_binary_to_term(env, resp_payload, resp_header.payload_len,
-                                 &result, 0) == 0) {
-            result = enif_make_tuple2(env, enif_make_atom(env, "error"),
-                                      enif_make_atom(env, "deserialize_failed"));
-        }
-
-        enif_free(resp_payload);
-    } else {
-        result = enif_make_atom(env, "ok");
-    }
-
-    pthread_mutex_unlock(&w->dispatch_mutex);
-    return result;
-}
-
-ERL_NIF_TERM subinterp_thread_exec(ErlNifEnv *env, py_subinterp_handle_t *handle,
-                                    ERL_NIF_TERM code) {
-    if (atomic_load(&handle->destroyed)) {
-        return enif_make_tuple2(env, enif_make_atom(env, "error"),
-                                enif_make_atom(env, "handle_destroyed"));
-    }
-
-    if (!subinterp_thread_pool_is_ready()) {
-        return enif_make_tuple2(env, enif_make_atom(env, "error"),
-                                enif_make_atom(env, "pool_not_ready"));
-    }
-
-    subinterp_thread_worker_t *w = &g_thread_pool.workers[handle->worker_id];
-
-    /* Build payload tuple: {Code} */
-    ERL_NIF_TERM payload_tuple = enif_make_tuple1(env, code);
-
-    ErlNifBinary payload_bin;
-    if (!enif_term_to_binary(env, payload_tuple, &payload_bin)) {
-        return enif_make_tuple2(env, enif_make_atom(env, "error"),
-                                enif_make_atom(env, "serialization_failed"));
-    }
-
-    pthread_mutex_lock(&w->dispatch_mutex);
-
-    uint64_t request_id = atomic_fetch_add(&g_thread_pool.next_request_id, 1);
-    owngil_header_t header = {
-        .magic = OWNGIL_MAGIC,
-        .version = OWNGIL_PROTOCOL_VERSION,
-        .msg_type = MSG_REQUEST,
-        .req_type = REQ_EXEC,
-        .request_id = request_id,
-        .handle_id = handle->handle_id,
-        .payload_len = payload_bin.size,
-    };
-
-    write_full(w->cmd_pipe[1], &header, sizeof(header));
-    write_full(w->cmd_pipe[1], payload_bin.data, payload_bin.size);
-    enif_release_binary(&payload_bin);
-
-    owngil_header_t resp_header;
-    if (read_full(w->result_pipe[0], &resp_header, sizeof(resp_header)) != sizeof(resp_header)) {
-        pthread_mutex_unlock(&w->dispatch_mutex);
-        return enif_make_tuple2(env, enif_make_atom(env, "error"),
-                                enif_make_atom(env, "read_failed"));
-    }
-
-    ERL_NIF_TERM result;
-    if (resp_header.payload_len > 0) {
-        unsigned char *resp_payload = enif_alloc(resp_header.payload_len);
-        if (resp_payload == NULL) {
-            pthread_mutex_unlock(&w->dispatch_mutex);
-            return enif_make_tuple2(env, enif_make_atom(env, "error"),
-                                    enif_make_atom(env, "alloc_failed"));
-        }
-
-        if (read_full(w->result_pipe[0], resp_payload, resp_header.payload_len)
-                != (int)resp_header.payload_len) {
-            enif_free(resp_payload);
-            pthread_mutex_unlock(&w->dispatch_mutex);
-            return enif_make_tuple2(env, enif_make_atom(env, "error"),
-                                    enif_make_atom(env, "read_failed"));
-        }
-
-        if (enif_binary_to_term(env, resp_payload, resp_header.payload_len,
-                                 &result, 0) == 0) {
-            result = enif_make_tuple2(env, enif_make_atom(env, "error"),
-                                      enif_make_atom(env, "deserialize_failed"));
-        }
-
-        enif_free(resp_payload);
-    } else {
-        result = enif_make_atom(env, "ok");
-    }
-
-    pthread_mutex_unlock(&w->dispatch_mutex);
-    return result;
-}
-
-ERL_NIF_TERM subinterp_thread_cast(ErlNifEnv *env, py_subinterp_handle_t *handle,
-                                    ERL_NIF_TERM module, ERL_NIF_TERM func,
-                                    ERL_NIF_TERM args) {
-    if (atomic_load(&handle->destroyed)) {
-        return enif_make_atom(env, "ok");  /* Silently ignore for cast */
-    }
-
-    if (!subinterp_thread_pool_is_ready()) {
-        return enif_make_atom(env, "ok");  /* Silently ignore for cast */
-    }
-
-    subinterp_thread_worker_t *w = &g_thread_pool.workers[handle->worker_id];
-
-    /* Build payload tuple: {Module, Func, Args} */
-    ERL_NIF_TERM payload_tuple = enif_make_tuple3(env, module, func, args);
-
-    ErlNifBinary payload_bin;
-    if (!enif_term_to_binary(env, payload_tuple, &payload_bin)) {
-        return enif_make_atom(env, "ok");  /* Silently fail for cast */
-    }
-
-    pthread_mutex_lock(&w->dispatch_mutex);
-
-    uint64_t request_id = atomic_fetch_add(&g_thread_pool.next_request_id, 1);
-    owngil_header_t header = {
-        .magic = OWNGIL_MAGIC,
-        .version = OWNGIL_PROTOCOL_VERSION,
-        .msg_type = MSG_REQUEST,
-        .req_type = REQ_CAST,
-        .request_id = request_id,
-        .handle_id = handle->handle_id,
-        .payload_len = payload_bin.size,
-    };
-
-    write_full(w->cmd_pipe[1], &header, sizeof(header));
-    write_full(w->cmd_pipe[1], payload_bin.data, payload_bin.size);
-    enif_release_binary(&payload_bin);
-
-    pthread_mutex_unlock(&w->dispatch_mutex);
-
-    return enif_make_atom(env, "ok");
-}
-
-ERL_NIF_TERM subinterp_thread_async_call(ErlNifEnv *env, py_subinterp_handle_t *handle,
-                                          ERL_NIF_TERM module, ERL_NIF_TERM func,
-                                          ERL_NIF_TERM args, ErlNifPid *caller_pid,
-                                          ERL_NIF_TERM ref) {
-    /* For async, we send the request but don't wait for response.
-     * The worker thread uses erlang.send() to deliver result. */
-    (void)caller_pid;
-    (void)ref;
-
-    /* For now, implement as sync call - async requires erlang.send support */
-    ERL_NIF_TERM kwargs = enif_make_new_map(env);
-    return subinterp_thread_call(env, handle, module, func, args, kwargs);
-}
-
 /* ============================================================================
  * Utility Functions
  * ============================================================================ */
diff --git a/c_src/py_subinterp_thread.h b/c_src/py_subinterp_thread.h
index a02b882..c590597 100644
--- a/c_src/py_subinterp_thread.h
+++ b/c_src/py_subinterp_thread.h
@@ -191,23 +191,6 @@ typedef struct {
     _Atomic uint64_t next_request_id; /**< Counter for request IDs */
 } subinterp_thread_pool_t;
 
-/* ============================================================================
- * Handle Resource (Erlang side)
- * ============================================================================ */
-
-/**
- * @struct py_subinterp_handle_t
- * @brief Erlang resource representing a subinterpreter handle
- *
- * A handle is bound to a specific worker at creation and has its own
- * isolated namespace within that worker.
- */
-typedef struct {
-    int worker_id;               /**< Bound worker index (fixed at creation) */
-    uint64_t handle_id;          /**< Unique ID for namespace lookup */
-    _Atomic bool destroyed;      /**< Handle has been destroyed */
-} py_subinterp_handle_t;
-
 /* ============================================================================
  * Pool Management API
  * ============================================================================ */
@@ -252,108 +235,6 @@ void subinterp_thread_pool_stats(int *num_workers, uint64_t *total_requests,
  * Handle Management API
  * ============================================================================ */
 
-/**
- * @brief Create a new subinterpreter handle
- *
- * Allocates a handle bound to a worker (round-robin selection) and
- * creates a namespace for it within that worker.
- *
- * @param handle Output: handle structure to initialize
- * @return 0 on success, -1 on failure
- */
-int subinterp_thread_handle_create(py_subinterp_handle_t *handle);
-
-/**
- * @brief Destroy a subinterpreter handle
- *
- * Cleans up the handle's namespace within its worker.
- *
- * @param handle Handle to destroy
- */
-void subinterp_thread_handle_destroy(py_subinterp_handle_t *handle);
-
-/* ============================================================================
- * Execution API
- * ============================================================================ */
-
-/**
- * @brief Synchronous call through subinterpreter handle
- *
- * Sends a call request to the worker and blocks until response.
- * The dispatch_mutex ensures serialization per worker.
- *
- * @param env NIF environment
- * @param handle Subinterpreter handle
- * @param module Module name term (atom or binary)
- * @param func Function name term (atom or binary)
- * @param args Arguments list term
- * @param kwargs Keyword arguments map term
- * @return Result term: {ok, Result} | {error, Reason}
- */
-ERL_NIF_TERM subinterp_thread_call(ErlNifEnv *env, py_subinterp_handle_t *handle,
-                                    ERL_NIF_TERM module, ERL_NIF_TERM func,
-                                    ERL_NIF_TERM args, ERL_NIF_TERM kwargs);
-
-/**
- * @brief Synchronous eval through subinterpreter handle
- *
- * @param env NIF environment
- * @param handle Subinterpreter handle
- * @param code Code string term (binary)
- * @param locals Local variables map term
- * @return Result term: {ok, Result} | {error, Reason}
- */
-ERL_NIF_TERM subinterp_thread_eval(ErlNifEnv *env, py_subinterp_handle_t *handle,
-                                    ERL_NIF_TERM code, ERL_NIF_TERM locals);
-
-/**
- * @brief Synchronous exec through subinterpreter handle
- *
- * @param env NIF environment
- * @param handle Subinterpreter handle
- * @param code Code string term (binary)
- * @return Result term: ok | {error, Reason}
- */
-ERL_NIF_TERM subinterp_thread_exec(ErlNifEnv *env, py_subinterp_handle_t *handle,
-                                    ERL_NIF_TERM code);
-
-/**
- * @brief Fire-and-forget call (no result)
- *
- * Sends request to worker but returns immediately without waiting.
- * Used for side-effects where result is not needed.
- *
- * @param env NIF environment
- * @param handle Subinterpreter handle
- * @param module Module name term
- * @param func Function name term
- * @param args Arguments list term
- * @return ok
- */
-ERL_NIF_TERM subinterp_thread_cast(ErlNifEnv *env, py_subinterp_handle_t *handle,
-                                    ERL_NIF_TERM module, ERL_NIF_TERM func,
-                                    ERL_NIF_TERM args);
-
-/**
- * @brief Async call - returns immediately with reference
- *
- * Sends request to worker. Worker uses erlang.send() to deliver result
- * to caller_pid with the given ref.
- *
- * @param env NIF environment
- * @param handle Subinterpreter handle
- * @param module Module name term
- * @param func Function name term
- * @param args Arguments list term
- * @param caller_pid PID to send result to
- * @param ref Reference for result correlation
- * @return ok
- */
-ERL_NIF_TERM subinterp_thread_async_call(ErlNifEnv *env, py_subinterp_handle_t *handle,
-                                          ERL_NIF_TERM module, ERL_NIF_TERM func,
-                                          ERL_NIF_TERM args, ErlNifPid *caller_pid,
-                                          ERL_NIF_TERM ref);
-
 /* ============================================================================
  * Global Pool Instance
  * ============================================================================ */
@@ -361,9 +242,6 @@ ERL_NIF_TERM subinterp_thread_async_call(ErlNifEnv *env, py_subinterp_handle_t *
 /** @brief Global thread pool (defined in py_subinterp_thread.c) */
 extern subinterp_thread_pool_t g_thread_pool;
 
-/** @brief Resource type for py_subinterp_handle_t */
-extern ErlNifResourceType *PY_SUBINTERP_HANDLE_RESOURCE_TYPE;
-
 #endif /* HAVE_SUBINTERPRETERS */
 
 #endif /* PY_SUBINTERP_THREAD_H */

From 69ff429466b16aa3ef0c8e6ec8a114380de9f5d7 Mon Sep 17 00:00:00 2001
From: Benoit Chesneau <bchesneau@gmail.com>
Date: Fri, 1 May 2026 23:12:46 +0200
Subject: [PATCH 11/17] Use _Atomic bool for suspended-state result flags

has_result and is_error in suspended_state_t / suspended_context_state_t
were declared volatile, which doesn't guarantee atomicity or memory
ordering on weakly-ordered architectures (ARM). Upgrade to _Atomic bool
so the read in the resumer thread sees the write from the callback
thread without relying on incidental sequencing.
---
 c_src/py_nif.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/c_src/py_nif.h b/c_src/py_nif.h
index 5abc71d..54635d0 100644
--- a/c_src/py_nif.h
+++ b/c_src/py_nif.h
@@ -629,10 +629,10 @@ typedef struct {
     size_t result_len;
 
     /** @brief Flag: result is available for replay */
-    volatile bool has_result;
+    _Atomic bool has_result;
 
     /** @brief Flag: result represents an error */
-    volatile bool is_error;
+    _Atomic bool is_error;
 
     /* Synchronization */
 
@@ -1216,10 +1216,10 @@ typedef struct {
     size_t result_len;
 
     /** @brief Flag: result is available for replay */
-    volatile bool has_result;
+    _Atomic bool has_result;
 
     /** @brief Flag: result represents an error */
-    volatile bool is_error;
+    _Atomic bool is_error;
 
     /* Sequential callback support - stores all accumulated callback results */
 

From 1691c0df95fba556305b9601524df13923dc5be5 Mon Sep 17 00:00:00 2001
From: Benoit Chesneau <bchesneau@gmail.com>
Date: Fri, 1 May 2026 23:44:47 +0200
Subject: [PATCH 12/17] Pin context resource on shutdown timeout to prevent UAF
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If a worker pthread is stuck in long-running Python, the 30s shutdown
join times out, ctx->leaked is set, and the helper returns. Today the
BEAM later runs context_destructor (which sees ctx->destroyed and
returns early) and frees the resource memory — under a thread that is
still dereferencing it. Call enif_keep_resource(ctx) on the leak path
so the refcount stays above zero forever and the memory survives the
stuck thread.

Replace the CTX_REQ_SHUTDOWN sentinel enqueue with a direct
pthread_cond_broadcast under the queue mutex. The sentinel had two
problems: it could be orphaned (a worker that is mid-request returns
to the top of its loop, sees shutdown_requested, and exits without
dequeuing it), and allocating it can fail, which becomes more likely
once ctx_request_create() honors init failures. The broadcast wakes
any parked worker so it observes the predicate and exits cleanly.

Gate the callback_pipe close in nif_context_destroy on !ctx->leaked.
A leaked pthread inside Python that triggers erlang.call() reads from
callback_pipe[0] and writes to callback_pipe[1]; closing those fds
lets the kernel reissue the numbers to unrelated files and silently
corrupt them. The leaked pipes stay alive with the pinned resource
until VM exit.
---
 c_src/py_nif.c | 109 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 66 insertions(+), 43 deletions(-)

diff --git a/c_src/py_nif.c b/c_src/py_nif.c
index 3fd2c93..d44ed42 100644
--- a/c_src/py_nif.c
+++ b/c_src/py_nif.c
@@ -3293,18 +3293,22 @@ static void worker_context_shutdown(py_context_t *ctx) {
         return;
     }
 
-    /* Signal shutdown */
+    /* Signal shutdown and wake any worker parked on the condvar.
+     *
+     * We deliberately don't enqueue a CTX_REQ_SHUTDOWN sentinel:
+     *   - the worker loop predicate already exits once
+     *     shutdown_requested is true, so a broadcast is sufficient;
+     *   - if the worker is mid-process_request when we set the flag,
+     *     it returns to the top of the loop, sees !shutdown_requested
+     *     == false, and exits without dequeuing — leaving any
+     *     sentinel as an orphan ctx_request_t in the queue.
+     * Broadcasting under the mutex avoids the lost-wakeup race.
+     */
     atomic_store(&ctx->shutdown_requested, true);
-
-    /* Cancel all pending (not-yet-started) requests */
     ctx_queue_cancel_all(ctx);
-
-    /* Enqueue shutdown request to wake worker if idle */
-    ctx_request_t *shutdown_req = ctx_request_create();
-    if (shutdown_req != NULL) {
-        shutdown_req->type = CTX_REQ_SHUTDOWN;
-        ctx_queue_enqueue(ctx, shutdown_req);
-    }
+    pthread_mutex_lock(&ctx->queue_mutex);
+    pthread_cond_broadcast(&ctx->queue_not_empty);
+    pthread_mutex_unlock(&ctx->queue_mutex);
 
     /* Wait for thread to exit with timeout */
     bool join_succeeded = false;
@@ -3330,10 +3334,21 @@ static void worker_context_shutdown(py_context_t *ctx) {
 #endif
 
     if (!join_succeeded) {
-        /* Worker thread is unresponsive - use leak pattern */
+        /* Worker thread is unresponsive - leak the context so the
+         * stuck pthread doesn't UAF when the BEAM frees the
+         * resource. Pin the resource: enif_keep_resource pushes the
+         * refcount above zero permanently, so context_destructor
+         * never runs and the BEAM keeps the memory alive for the
+         * thread that still holds a raw pointer to it.
+         *
+         * The leaked thread also keeps using ctx->callback_pipe[]
+         * (see nif_context_destroy: pipe close is gated on
+         * !ctx->leaked for the same reason). Future cleanup happens
+         * at VM exit. */
         fprintf(stderr, "Worker thread shutdown timeout after %d seconds, leaking context\n",
                 WORKER_SHUTDOWN_TIMEOUT_SECS);
         atomic_store(&ctx->leaked, true);
+        enif_keep_resource(ctx);
         return;
     }
 
@@ -4540,18 +4555,14 @@ static void owngil_context_shutdown(py_context_t *ctx) {
         return;
     }
 
-    /* Signal shutdown */
+    /* Signal shutdown and wake any worker parked on the condvar.
+     * See worker_context_shutdown for why we broadcast instead of
+     * enqueuing a CTX_REQ_SHUTDOWN sentinel. */
     atomic_store(&ctx->shutdown_requested, true);
-
-    /* Cancel all pending (not-yet-started) requests */
     ctx_queue_cancel_all(ctx);
-
-    /* Enqueue shutdown request to wake worker if idle */
-    ctx_request_t *shutdown_req = ctx_request_create();
-    if (shutdown_req != NULL) {
-        shutdown_req->type = CTX_REQ_SHUTDOWN;
-        ctx_queue_enqueue(ctx, shutdown_req);
-    }
+    pthread_mutex_lock(&ctx->queue_mutex);
+    pthread_cond_broadcast(&ctx->queue_not_empty);
+    pthread_mutex_unlock(&ctx->queue_mutex);
 
     /* Wait for thread to exit with timeout */
     bool join_succeeded = false;
@@ -4577,13 +4588,14 @@ static void owngil_context_shutdown(py_context_t *ctx) {
 #endif
 
     if (!join_succeeded) {
-        /* Worker thread is unresponsive - use leak pattern */
+        /* Worker thread is unresponsive - leak the context. Pin the
+         * resource so the BEAM doesn't free its memory under the
+         * stuck pthread (UAF). See worker_context_shutdown for the
+         * full rationale. */
         fprintf(stderr, "OWN_GIL shutdown timeout after %d seconds, leaking context\n",
                 OWNGIL_SHUTDOWN_TIMEOUT_SECS);
         atomic_store(&ctx->leaked, true);
-        /* Do NOT free shared resources - worker thread may still be using them.
-         * The leaked thread is isolated and will eventually clean up itself
-         * when Python exits, or persist until VM exit. */
+        enif_keep_resource(ctx);
         return;
     }
 
@@ -4739,14 +4751,20 @@ static ERL_NIF_TERM nif_context_destroy(ErlNifEnv *env, int argc, const ERL_NIF_
     /* OWN_GIL mode: shutdown the dedicated thread */
     if (ctx->uses_own_gil) {
         owngil_context_shutdown(ctx);
-        /* Close callback pipes */
-        if (ctx->callback_pipe[0] >= 0) {
-            close(ctx->callback_pipe[0]);
-            ctx->callback_pipe[0] = -1;
-        }
-        if (ctx->callback_pipe[1] >= 0) {
-            close(ctx->callback_pipe[1]);
-            ctx->callback_pipe[1] = -1;
+        /* Close callback pipes only on a clean shutdown. If the
+         * worker timed out (ctx->leaked == true) it may still write
+         * to / read from these fds; closing them here would let the
+         * kernel reissue the fd numbers to unrelated files and
+         * silently corrupt them. */
+        if (!atomic_load(&ctx->leaked)) {
+            if (ctx->callback_pipe[0] >= 0) {
+                close(ctx->callback_pipe[0]);
+                ctx->callback_pipe[0] = -1;
+            }
+            if (ctx->callback_pipe[1] >= 0) {
+                close(ctx->callback_pipe[1]);
+                ctx->callback_pipe[1] = -1;
+            }
         }
         atomic_fetch_add(&g_counters.ctx_destroyed, 1);
         return ATOM_OK;
@@ -4756,14 +4774,17 @@ static ERL_NIF_TERM nif_context_destroy(ErlNifEnv *env, int argc, const ERL_NIF_
     /* Worker mode: shutdown the dedicated worker thread */
     if (ctx->uses_worker_thread) {
         worker_context_shutdown(ctx);
-        /* Close callback pipes */
-        if (ctx->callback_pipe[0] >= 0) {
-            close(ctx->callback_pipe[0]);
-            ctx->callback_pipe[0] = -1;
-        }
-        if (ctx->callback_pipe[1] >= 0) {
-            close(ctx->callback_pipe[1]);
-            ctx->callback_pipe[1] = -1;
+        /* Close callback pipes (see OWN_GIL branch for why this is
+         * gated on !ctx->leaked). */
+        if (!atomic_load(&ctx->leaked)) {
+            if (ctx->callback_pipe[0] >= 0) {
+                close(ctx->callback_pipe[0]);
+                ctx->callback_pipe[0] = -1;
+            }
+            if (ctx->callback_pipe[1] >= 0) {
+                close(ctx->callback_pipe[1]);
+                ctx->callback_pipe[1] = -1;
+            }
         }
         atomic_fetch_add(&g_counters.ctx_destroyed, 1);
         return ATOM_OK;
@@ -7472,8 +7493,10 @@ static int load(ErlNifEnv *env, void **priv_data, ERL_NIF_TERM load_info) {
         env, NULL, "inline_continuation", inline_continuation_destructor,
         ERL_NIF_RT_CREATE | ERL_NIF_RT_TAKEOVER, NULL);
 
-    /* Process-scoped shared dictionary resource type
-     * Using simple resource type without process monitoring for now */
+    /* Process-scoped shared dictionary resource type. GC-scoped: the
+     * destructor releases the Python dict when the last term ref
+     * drops. No per-process monitor — explicit shared_dict_destroy/1
+     * is the eager-release path. */
     PY_SHARED_DICT_RESOURCE_TYPE = enif_open_resource_type(
         env, NULL, "py_shared_dict", shared_dict_destructor,
         ERL_NIF_RT_CREATE | ERL_NIF_RT_TAKEOVER, NULL);

From ef31f764073d6776a534aa3d2d8903cd74d6e78b Mon Sep 17 00:00:00 2001
From: Benoit Chesneau <bchesneau@gmail.com>
Date: Fri, 1 May 2026 23:44:56 +0200
Subject: [PATCH 13/17] Drain stale {py_result, _, _} after timed-out async
 dispatch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

wait_for_async_result/2 returns {error, async_timeout} after 5
minutes, but the C worker can still finish later and deliver the
result. Without a drain those messages would accumulate in the
context process's mailbox forever. Drain {py_result, _, _} messages
that carry a stale request id before the matching receive — safe
because the context process is the sole receiver and only one wait
is in flight at a time.

New test/py_context_async_drain_SUITE.erl pins the behavior by
injecting a stale {py_result, FakeRef, _} into Ctx's mailbox and
asserting it's gone after the next async-dispatched call.
---
 src/py_context.erl                    | 22 +++++++++
 test/py_context_async_drain_SUITE.erl | 64 +++++++++++++++++++++++++++
 2 files changed, 86 insertions(+)
 create mode 100644 test/py_context_async_drain_SUITE.erl

diff --git a/src/py_context.erl b/src/py_context.erl
index a879f30..883e5e1 100644
--- a/src/py_context.erl
+++ b/src/py_context.erl
@@ -804,7 +804,20 @@ handle_eval_blocking(Ref, Code, Locals) ->
 %% @private
 %% Wait for async result from worker thread
 %% The worker thread sends {py_result, RequestId, Result} when done.
+%%
+%% Drains stale {py_result, _, _} messages from prior timed-out
+%% requests before the matching receive so a context that experiences
+%% repeat timeouts doesn't grow an unbounded mailbox: when
+%% wait_for_async_result/2 returns {error, async_timeout}, the C
+%% worker can still finish later and deliver the result; without the
+%% drain those messages would accumulate forever.
+%%
+%% Safe because the context process is the sole receiver for its own
+%% async results and only one wait_for_async_result/2 is in flight at
+%% a time, so the drain cannot consume the result of a concurrent live
+%% request.
 wait_for_async_result(Ref, RequestId) ->
+    drain_stale_async_results(RequestId),
     receive
         {py_result, RequestId, Result} ->
             process_async_result(Ref, Result)
@@ -812,6 +825,15 @@ wait_for_async_result(Ref, RequestId) ->
         {error, async_timeout}
     end.
 
+%% @private
+drain_stale_async_results(CurrentId) ->
+    receive
+        {py_result, OldId, _} when OldId =/= CurrentId ->
+            drain_stale_async_results(CurrentId)
+    after 0 ->
+        ok
+    end.
+
 %% @private
 %% Process the result from async dispatch
 %% Handles suspension, schedule markers, and normal results.
diff --git a/test/py_context_async_drain_SUITE.erl b/test/py_context_async_drain_SUITE.erl
new file mode 100644
index 0000000..b2a2ae9
--- /dev/null
+++ b/test/py_context_async_drain_SUITE.erl
@@ -0,0 +1,64 @@
+%%% @doc Pin the stale-{py_result,_,_}-drain behavior in py_context.
+%%%
+%%% wait_for_async_result/2 returns {error, async_timeout} after 5 minutes
+%%% but the C worker may eventually finish and deliver a {py_result, OldId,
+%%% _} message anyway. Without a drain, those messages would pile up on
+%%% the context process's mailbox forever. This suite injects a stale
+%%% message directly into the context's mailbox and asserts it is gone
+%%% after the next legitimate dispatch.
+-module(py_context_async_drain_SUITE).
+
+-include_lib("common_test/include/ct.hrl").
+
+-export([
+    all/0,
+    init_per_suite/1,
+    end_per_suite/1
+]).
+
+-export([
+    drain_stale_results/1
+]).
+
+all() ->
+    [drain_stale_results].
+
+init_per_suite(Config) ->
+    {ok, _} = application:ensure_all_started(erlang_python),
+    {ok, _} = py:start_contexts(),
+    Config.
+
+end_per_suite(_Config) ->
+    ok = application:stop(erlang_python),
+    ok.
+
+drain_stale_results(_Config) ->
+    Ctx = py:context(1),
+    %% Warm up the context: ensure math is importable, then exercise
+    %% an async-dispatch call so the loop is fully primed.
+    {ok, 4.0} = py_context:call(Ctx, math, sqrt, [16]),
+
+    %% Inject a stale result message directly into the context's mailbox.
+    %% py_context's outer loop matches on specific tags only; an
+    %% unrecognized {py_result, FakeId, _} is left in place by selective
+    %% receive and would accumulate forever without the drain.
+    FakeId = make_ref(),
+    Ctx ! {py_result, FakeId, junk_should_be_drained},
+    {message_queue_len, QLenBefore} =
+        erlang:process_info(Ctx, message_queue_len),
+    true = QLenBefore >= 1,
+
+    %% Trigger an async-dispatch call (py_context:call/4 ->
+    %% handle_call_with_suspension -> wait_for_async_result/2). The drain
+    %% runs first and consumes the stale message; the matching receive
+    %% gets the real result.
+    {ok, 5.0} = py_context:call(Ctx, math, sqrt, [25]),
+
+    %% Brief settle to let any in-flight worker message land.
+    timer:sleep(50),
+    {message_queue_len, QLenAfter} =
+        erlang:process_info(Ctx, message_queue_len),
+    {messages, Msgs} = erlang:process_info(Ctx, messages),
+    ct:pal("Ctx mailbox after drain: len=~p msgs=~p", [QLenAfter, Msgs]),
+    0 = QLenAfter,
+    ok.

From 9cb75416d72ca970d1ce324cf3e74b42363fdb18 Mon Sep 17 00:00:00 2001
From: Benoit Chesneau <bchesneau@gmail.com>
Date: Fri, 1 May 2026 23:45:03 +0200
Subject: [PATCH 14/17] Roll back partial state on ctx_request_create failure

pthread_mutex_init, pthread_cond_init, and enif_alloc_env can each
fail under resource pressure. Today their returns are ignored, so
mutex/cond init failures leave the request in an unusable state and
a NULL request_env causes the next enif_make_copy to segfault.

Check each return and free what was allocated on partial failure.
All 14 callers already test the result for NULL.
---
 c_src/py_nif.h | 32 +++++++++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/c_src/py_nif.h b/c_src/py_nif.h
index 54635d0..e353fd5 100644
--- a/c_src/py_nif.h
+++ b/c_src/py_nif.h
@@ -784,19 +784,41 @@ typedef struct ctx_request {
 
 /**
  * @brief Create a new context request
+ *
+ * Rolls back partial state on any init failure: pthread_mutex_init,
+ * pthread_cond_init, or enif_alloc_env() can each fail under resource
+ * pressure. Returning NULL keeps callers safe — every call site
+ * already tests the result.
+ *
  * @return Newly allocated request with refcount=1, or NULL on failure
  */
 static inline ctx_request_t *ctx_request_create(void) {
     ctx_request_t *req = enif_alloc(sizeof(ctx_request_t));
-    if (req == NULL) return NULL;
-
+    if (req == NULL) {
+        return NULL;
+    }
     memset(req, 0, sizeof(ctx_request_t));
-    pthread_mutex_init(&req->mutex, NULL);
-    pthread_cond_init(&req->cond, NULL);
+
+    if (pthread_mutex_init(&req->mutex, NULL) != 0) {
+        enif_free(req);
+        return NULL;
+    }
+    if (pthread_cond_init(&req->cond, NULL) != 0) {
+        pthread_mutex_destroy(&req->mutex);
+        enif_free(req);
+        return NULL;
+    }
+    req->request_env = enif_alloc_env();
+    if (req->request_env == NULL) {
+        pthread_cond_destroy(&req->cond);
+        pthread_mutex_destroy(&req->mutex);
+        enif_free(req);
+        return NULL;
+    }
+
     atomic_store(&req->completed, false);
     atomic_store(&req->cancelled, false);
     atomic_store(&req->refcount, 1);
-    req->request_env = enif_alloc_env();
     req->result_env = NULL;  /* Created by worker when processing */
     req->next = NULL;
     req->async_mode = false;

From 527e7e39f7546d73c91d497f1eac659d670d35d0 Mon Sep 17 00:00:00 2001
From: Benoit Chesneau <bchesneau@gmail.com>
Date: Fri, 1 May 2026 23:45:10 +0200
Subject: [PATCH 15/17] Drop dead _ErlangChildWatcher and tighten stale
 comments

_ErlangChildWatcher in priv/_erlang_impl/_policy.py was never
instantiated (the actual watcher is asyncio.ThreadedChildWatcher /
SafeChildWatcher); delete the class along with its TODO.

Replace "process monitoring disabled for now to debug crash" /
"Using simple resource type without process monitoring for now" in
the SharedDict path with positive-form notes describing the actual
GC-scoped lifecycle.
---
 c_src/py_shared_dict.c       |  7 +++--
 priv/_erlang_impl/_policy.py | 57 ------------------------------------
 2 files changed, 5 insertions(+), 59 deletions(-)

diff --git a/c_src/py_shared_dict.c b/c_src/py_shared_dict.c
index 381cb9d..a55c3d0 100644
--- a/c_src/py_shared_dict.c
+++ b/c_src/py_shared_dict.c
@@ -98,8 +98,11 @@ static ERL_NIF_TERM nif_shared_dict_new(ErlNifEnv *env, int argc,
     }
     PyGILState_Release(gstate);
 
-    /* Note: Process monitoring disabled for now to debug crash
-     * SharedDict will be garbage collected when no references remain */
+    /* SharedDict is GC-scoped: the resource destructor runs once the
+     * last term reference is dropped (or at process exit), at which
+     * point the underlying Python dict is cleared. There is no
+     * per-process monitor — callers that want eager release must call
+     * shared_dict_destroy/1 explicitly. */
     sd->monitor_active = false;
 
     /* Create reference term and release our reference */
diff --git a/priv/_erlang_impl/_policy.py b/priv/_erlang_impl/_policy.py
index bff6603..62705fe 100644
--- a/priv/_erlang_impl/_policy.py
+++ b/priv/_erlang_impl/_policy.py
@@ -146,60 +146,3 @@ def _init_watcher(self):
             self._watcher = asyncio.ThreadedChildWatcher()
         elif hasattr(asyncio, 'SafeChildWatcher'):
             self._watcher = asyncio.SafeChildWatcher()
-
-
-class _ErlangChildWatcher:
-    """Child watcher that delegates to Erlang for process monitoring.
-
-    This watcher uses Erlang ports and monitors instead of SIGCHLD,
-    making it compatible with subinterpreters and free-threaded Python.
-    """
-
-    def __init__(self):
-        self._callbacks = {}
-        self._loop = None
-
-    def attach_loop(self, loop):
-        """Attach to an event loop."""
-        self._loop = loop
-
-    def close(self):
-        """Close the watcher."""
-        self._callbacks.clear()
-        self._loop = None
-
-    def is_active(self):
-        """Return True if the watcher is active."""
-        return self._loop is not None and not self._loop.is_closed()
-
-    def add_child_handler(self, pid, callback, *args):
-        """Register a callback for when a child process exits.
-
-        Args:
-            pid: Process ID to watch.
-            callback: Callback function(pid, returncode, *args).
-            *args: Additional arguments for the callback.
-        """
-        self._callbacks[pid] = (callback, args)
-        # TODO: Use Erlang port monitoring
-
-    def remove_child_handler(self, pid):
-        """Remove the handler for a child process.
-
-        Returns:
-            bool: True if handler was removed, False if not found.
-        """
-        return self._callbacks.pop(pid, None) is not None
-
-    def _do_waitpid(self, pid, returncode):
-        """Called when a child process exits.
-
-        Args:
-            pid: Process ID that exited.
-            returncode: Exit code of the process.
-        """
-        entry = self._callbacks.pop(pid, None)
-        if entry is not None:
-            callback, args = entry
-            if self._loop is not None and not self._loop.is_closed():
-                self._loop.call_soon_threadsafe(callback, pid, returncode, *args)

From d6bb934d1436731771a325269818cf6a2c599276 Mon Sep 17 00:00:00 2001
From: Benoit Chesneau <bchesneau@gmail.com>
Date: Sat, 2 May 2026 00:32:36 +0200
Subject: [PATCH 16/17] Async dispatch for env-bearing call/eval/exec

dispatch_to_worker_thread_impl blocked on a 30s pthread_cond_timedwait
in the env path. ML inference and other long-running calls returned
{error, worker_timeout} while the worker kept going. Add three
async-with-env NIFs that wrap the existing dispatch_to_worker_thread_async
(which already takes a local_env), and rewire handle_*_with_suspension_and_env
plus the {exec,_,_,_,EnvRef} loop arm to async-first / sync-fallback.

The Erlang side waits in wait_for_async_result/2, which has the
stale-result drain from ef31f76, so the env path now matches the
non-env path's behavior.

Adds test/py_context_async_env_SUITE (2 cases).
---
 c_src/py_nif.c                      | 131 ++++++++++++++++++++++++++++
 src/py_context.erl                  |  58 ++++++++++--
 src/py_nif.erl                      |  31 +++++++
 test/py_context_async_env_SUITE.erl |  71 +++++++++++++++
 4 files changed, 284 insertions(+), 7 deletions(-)
 create mode 100644 test/py_context_async_env_SUITE.erl

diff --git a/c_src/py_nif.c b/c_src/py_nif.c
index d44ed42..f81fd16 100644
--- a/c_src/py_nif.c
+++ b/c_src/py_nif.c
@@ -5210,6 +5210,134 @@ static ERL_NIF_TERM nif_context_exec_async(ErlNifEnv *env, int argc, const ERL_N
     return make_error(env, "async_requires_worker_thread");
 }
 
+/**
+ * @brief Async call with process-local environment
+ *
+ * nif_context_call_with_env_async(ContextRef, CallerPid, RequestId,
+ *                                  Module, Func, Args, Kwargs, EnvRef)
+ *     -> {enqueued, RequestId} | {error, Reason}
+ *
+ * Same contract as nif_context_call_async but threads the process-local
+ * env through to the worker. Replaces the 30-second pthread_cond_timedwait
+ * dispatch path; the Erlang side waits in a normal receive.
+ */
+static ERL_NIF_TERM nif_context_call_with_env_async(ErlNifEnv *env, int argc,
+                                                      const ERL_NIF_TERM argv[]) {
+    py_context_t *ctx;
+    py_env_resource_t *penv;
+
+    if (!runtime_is_running()) {
+        return make_error(env, "python_not_running");
+    }
+    if (argc < 8) {
+        return make_error(env, "badarg");
+    }
+    if (!enif_get_resource(env, argv[0], PY_CONTEXT_RESOURCE_TYPE, (void **)&ctx)) {
+        return make_error(env, "invalid_context");
+    }
+    ErlNifPid caller_pid;
+    if (!enif_get_local_pid(env, argv[1], &caller_pid)) {
+        return make_error(env, "invalid_pid");
+    }
+    ERL_NIF_TERM request_id = argv[2];
+    if (!enif_get_resource(env, argv[7], PY_ENV_RESOURCE_TYPE, (void **)&penv)) {
+        return make_error(env, "invalid_env");
+    }
+
+    if (!ctx->uses_worker_thread) {
+        return make_error(env, "async_requires_worker_thread");
+    }
+
+    ERL_NIF_TERM kwargs = enif_is_map(env, argv[6])
+        ? argv[6] : enif_make_new_map(env);
+    ERL_NIF_TERM request = enif_make_tuple4(env,
+        argv[3],  /* Module */
+        argv[4],  /* Func */
+        argv[5],  /* Args */
+        kwargs);
+    return dispatch_to_worker_thread_async(env, ctx, CTX_REQ_CALL_WITH_ENV,
+        request, caller_pid, request_id, penv);
+}
+
+/**
+ * @brief Async eval with process-local environment
+ *
+ * nif_context_eval_with_env_async(ContextRef, CallerPid, RequestId,
+ *                                  Code, Locals, EnvRef)
+ *     -> {enqueued, RequestId} | {error, Reason}
+ */
+static ERL_NIF_TERM nif_context_eval_with_env_async(ErlNifEnv *env, int argc,
+                                                      const ERL_NIF_TERM argv[]) {
+    py_context_t *ctx;
+    py_env_resource_t *penv;
+
+    if (!runtime_is_running()) {
+        return make_error(env, "python_not_running");
+    }
+    if (argc < 6) {
+        return make_error(env, "badarg");
+    }
+    if (!enif_get_resource(env, argv[0], PY_CONTEXT_RESOURCE_TYPE, (void **)&ctx)) {
+        return make_error(env, "invalid_context");
+    }
+    ErlNifPid caller_pid;
+    if (!enif_get_local_pid(env, argv[1], &caller_pid)) {
+        return make_error(env, "invalid_pid");
+    }
+    ERL_NIF_TERM request_id = argv[2];
+    if (!enif_get_resource(env, argv[5], PY_ENV_RESOURCE_TYPE, (void **)&penv)) {
+        return make_error(env, "invalid_env");
+    }
+
+    if (!ctx->uses_worker_thread) {
+        return make_error(env, "async_requires_worker_thread");
+    }
+
+    ERL_NIF_TERM locals = enif_is_map(env, argv[4])
+        ? argv[4] : enif_make_new_map(env);
+    ERL_NIF_TERM request = enif_make_tuple2(env, argv[3], locals);
+    return dispatch_to_worker_thread_async(env, ctx, CTX_REQ_EVAL_WITH_ENV,
+        request, caller_pid, request_id, penv);
+}
+
+/**
+ * @brief Async exec with process-local environment
+ *
+ * nif_context_exec_with_env_async(ContextRef, CallerPid, RequestId,
+ *                                  Code, EnvRef)
+ *     -> {enqueued, RequestId} | {error, Reason}
+ */
+static ERL_NIF_TERM nif_context_exec_with_env_async(ErlNifEnv *env, int argc,
+                                                      const ERL_NIF_TERM argv[]) {
+    py_context_t *ctx;
+    py_env_resource_t *penv;
+
+    if (!runtime_is_running()) {
+        return make_error(env, "python_not_running");
+    }
+    if (argc < 5) {
+        return make_error(env, "badarg");
+    }
+    if (!enif_get_resource(env, argv[0], PY_CONTEXT_RESOURCE_TYPE, (void **)&ctx)) {
+        return make_error(env, "invalid_context");
+    }
+    ErlNifPid caller_pid;
+    if (!enif_get_local_pid(env, argv[1], &caller_pid)) {
+        return make_error(env, "invalid_pid");
+    }
+    ERL_NIF_TERM request_id = argv[2];
+    if (!enif_get_resource(env, argv[4], PY_ENV_RESOURCE_TYPE, (void **)&penv)) {
+        return make_error(env, "invalid_env");
+    }
+
+    if (!ctx->uses_worker_thread) {
+        return make_error(env, "async_requires_worker_thread");
+    }
+
+    return dispatch_to_worker_thread_async(env, ctx, CTX_REQ_EXEC_WITH_ENV,
+        argv[3], caller_pid, request_id, penv);
+}
+
 /**
  * @brief Evaluate a Python expression in a context
  *
@@ -7772,6 +7900,9 @@ static ErlNifFunc nif_funcs[] = {
     {"context_call_async", 7, nif_context_call_async, 0},
     {"context_eval_async", 5, nif_context_eval_async, 0},
     {"context_exec_async", 4, nif_context_exec_async, 0},
+    {"context_call_with_env_async", 8, nif_context_call_with_env_async, 0},
+    {"context_eval_with_env_async", 6, nif_context_eval_with_env_async, 0},
+    {"context_exec_with_env_async", 5, nif_context_exec_with_env_async, 0},
     {"create_local_env", 1, nif_create_local_env, 0},
     {"interp_apply_imports", 2, nif_interp_apply_imports, ERL_NIF_DIRTY_JOB_CPU_BOUND},
     {"interp_apply_paths", 2, nif_interp_apply_paths, ERL_NIF_DIRTY_JOB_CPU_BOUND},
diff --git a/src/py_context.erl b/src/py_context.erl
index 883e5e1..83b8e79 100644
--- a/src/py_context.erl
+++ b/src/py_context.erl
@@ -567,10 +567,10 @@ loop(#state{ref = Ref, interp_id = InterpId} = State) ->
             From ! {MRef, Result},
             loop(State);
 
-        %% Exec with process-local environment (worker mode)
-        %% Note: Uses blocking dispatch since async+env isn't implemented yet.
+        %% Exec with process-local environment (worker mode).
+        %% Async dispatch with sync fallback (mirrors call/eval).
         {exec, From, MRef, Code, EnvRef} ->
-            Result = py_nif:context_exec(Ref, Code, EnvRef),
+            Result = handle_exec_with_async_and_env(Ref, Code, EnvRef),
             From ! {MRef, Result},
             loop(State);
 
@@ -846,9 +846,24 @@ process_async_result(_Ref, Result) ->
     Result.
 
 %% @private
-%% Handle call with process-local environment
-%% Note: Uses blocking dispatch since async+env isn't implemented yet.
+%% Handle call with process-local environment.
+%% Tries async dispatch first (no 30 s NIF timeout); falls back to the
+%% blocking NIF only when the worker thread isn't available.
 handle_call_with_suspension_and_env(Ref, Module, Func, Args, Kwargs, EnvRef) ->
+    RequestId = make_ref(),
+    case py_nif:context_call_with_env_async(Ref, self(), RequestId,
+                                              Module, Func, Args, Kwargs,
+                                              EnvRef) of
+        {enqueued, RequestId} ->
+            wait_for_async_result(Ref, RequestId);
+        {error, async_requires_worker_thread} ->
+            handle_call_with_env_blocking(Ref, Module, Func, Args, Kwargs, EnvRef);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+%% @private
+handle_call_with_env_blocking(Ref, Module, Func, Args, Kwargs, EnvRef) ->
     case py_nif:context_call(Ref, Module, Func, Args, Kwargs, EnvRef) of
         {suspended, _CallbackId, StateRef, {FuncName, CallbackArgs}} ->
             CallbackResult = handle_callback_with_nested_receive(Ref, FuncName, CallbackArgs),
@@ -860,9 +875,23 @@ handle_call_with_suspension_and_env(Ref, Module, Func, Args, Kwargs, EnvRef) ->
     end.
 
 %% @private
-%% Handle eval with process-local environment
-%% Note: Uses blocking dispatch since async+env isn't implemented yet.
+%% Handle eval with process-local environment.
+%% Tries async dispatch first; falls back to the blocking NIF only when
+%% the worker thread isn't available.
 handle_eval_with_suspension_and_env(Ref, Code, Locals, EnvRef) ->
+    RequestId = make_ref(),
+    case py_nif:context_eval_with_env_async(Ref, self(), RequestId,
+                                              Code, Locals, EnvRef) of
+        {enqueued, RequestId} ->
+            wait_for_async_result(Ref, RequestId);
+        {error, async_requires_worker_thread} ->
+            handle_eval_with_env_blocking(Ref, Code, Locals, EnvRef);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+%% @private
+handle_eval_with_env_blocking(Ref, Code, Locals, EnvRef) ->
     case py_nif:context_eval(Ref, Code, Locals, EnvRef) of
         {suspended, _CallbackId, StateRef, {FuncName, CallbackArgs}} ->
             CallbackResult = handle_callback_with_nested_receive(Ref, FuncName, CallbackArgs),
@@ -873,6 +902,21 @@ handle_eval_with_suspension_and_env(Ref, Code, Locals, EnvRef) ->
             Result
     end.
 
+%% @private
+%% Handle exec with process-local environment via the same async-first
+%% path used for call/eval.
+handle_exec_with_async_and_env(Ref, Code, EnvRef) ->
+    RequestId = make_ref(),
+    case py_nif:context_exec_with_env_async(Ref, self(), RequestId,
+                                              Code, EnvRef) of
+        {enqueued, RequestId} ->
+            wait_for_async_result(Ref, RequestId);
+        {error, async_requires_worker_thread} ->
+            py_nif:context_exec(Ref, Code, EnvRef);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
 %% @private
 %% Check if a context is a subinterpreter (has interp_id > 0)
 is_context_subinterp(Ref) ->
diff --git a/src/py_nif.erl b/src/py_nif.erl
index 13e2c5b..55b8b17 100644
--- a/src/py_nif.erl
+++ b/src/py_nif.erl
@@ -166,6 +166,9 @@
     context_call_async/7,
     context_eval_async/5,
     context_exec_async/4,
+    context_call_with_env_async/8,
+    context_eval_with_env_async/6,
+    context_exec_with_env_async/5,
     context_call_method/4,
     create_local_env/1,
     context_to_term/1,
@@ -1299,6 +1302,34 @@ context_eval_async(_ContextRef, _CallerPid, _RequestId, _Code, _Locals) ->
 context_exec_async(_ContextRef, _CallerPid, _RequestId, _Code) ->
     ?NIF_STUB.
 
+%% @doc Async call with process-local environment.
+%% @private
+-spec context_call_with_env_async(reference(), pid(), term(),
+                                   binary(), binary(), list(), map(),
+                                   reference()) ->
+    {enqueued, term()} | {error, term()}.
+context_call_with_env_async(_CtxRef, _CallerPid, _RequestId,
+                             _Module, _Func, _Args, _Kwargs, _EnvRef) ->
+    ?NIF_STUB.
+
+%% @doc Async eval with process-local environment.
+%% @private
+-spec context_eval_with_env_async(reference(), pid(), term(),
+                                   binary(), map(), reference()) ->
+    {enqueued, term()} | {error, term()}.
+context_eval_with_env_async(_CtxRef, _CallerPid, _RequestId,
+                             _Code, _Locals, _EnvRef) ->
+    ?NIF_STUB.
+
+%% @doc Async exec with process-local environment.
+%% @private
+-spec context_exec_with_env_async(reference(), pid(), term(),
+                                   binary(), reference()) ->
+    {enqueued, term()} | {error, term()}.
+context_exec_with_env_async(_CtxRef, _CallerPid, _RequestId,
+                             _Code, _EnvRef) ->
+    ?NIF_STUB.
+
 %% @doc Call a method on a Python object in a context.
 %%
 %% NO MUTEX - caller must ensure exclusive access (process ownership).
diff --git a/test/py_context_async_env_SUITE.erl b/test/py_context_async_env_SUITE.erl
new file mode 100644
index 0000000..18f13c5
--- /dev/null
+++ b/test/py_context_async_env_SUITE.erl
@@ -0,0 +1,71 @@
+%%% @doc Pin the async-with-env dispatch path.
+%%%
+%%% v3.0 introduced an async dispatch path for call / eval / exec that
+%%% returns {enqueued, RequestId} from the NIF and lets the Erlang side
+%%% wait in a normal receive. The env-bearing variants
+%%% (py_context:call/5, eval/5 with EnvRef, exec/3) used to take a
+%%% blocking sync dispatch with a 30-second pthread_cond_timedwait,
+%%% returning {error, worker_timeout} for long-running Python while
+%%% the worker kept going.
+%%%
+%%% These cases verify the env path now uses the async dispatch and
+%%% completes correctly.
+-module(py_context_async_env_SUITE).
+
+-include_lib("common_test/include/ct.hrl").
+
+-export([
+    all/0,
+    init_per_suite/1,
+    end_per_suite/1
+]).
+
+-export([
+    async_env_call_returns_correct_result/1,
+    env_call_does_not_dispatch_timeout/1
+]).
+
+all() ->
+    [
+        async_env_call_returns_correct_result,
+        env_call_does_not_dispatch_timeout
+    ].
+
+init_per_suite(Config) ->
+    {ok, _} = application:ensure_all_started(erlang_python),
+    {ok, _} = py:start_contexts(),
+    Config.
+
+end_per_suite(_Config) ->
+    ok = application:stop(erlang_python),
+    ok.
+
+async_env_call_returns_correct_result(_Config) ->
+    %% py:call/3 wraps an EnvRef under the hood, so a successful
+    %% round-trip proves the new context_call_with_env_async path is
+    %% wired and the worker delivers a {py_result, _, _} for it.
+    {ok, 4.0} = py:call(math, sqrt, [16]),
+    {ok, 5.0} = py:call(math, sqrt, [25]),
+    ok.
+
+env_call_does_not_dispatch_timeout(_Config) ->
+    %% Have the Python side block for 1 second. Under the old sync
+    %% dispatch this exercised the 30-second pthread_cond_timedwait;
+    %% now it's an Erlang-side receive on {py_result, _, _} so latency
+    %% should track wall-clock and never produce {error, worker_timeout}.
+    Ctx = py:context(1),
+    EnvRef = py:get_local_env(Ctx),
+    ok = py_context:exec(Ctx, <<
+        "import time\n"
+        "def _slow_round(x):\n"
+        "    time.sleep(1.0)\n"
+        "    return x * 2\n"
+    >>, EnvRef),
+    Start = erlang:monotonic_time(millisecond),
+    {ok, 14} = py_context:call(Ctx, '__main__', '_slow_round', [7], #{},
+                               infinity, EnvRef),
+    Elapsed = erlang:monotonic_time(millisecond) - Start,
+    ct:pal("env-async call elapsed: ~p ms", [Elapsed]),
+    true = Elapsed >= 900,
+    true = Elapsed < 5000,
+    ok.

From 45a7fcca05bee030b1374bb4bb03c294ead310fb Mon Sep 17 00:00:00 2001
From: Benoit Chesneau <bchesneau@gmail.com>
Date: Sat, 2 May 2026 00:32:46 +0200
Subject: [PATCH 17/17] Refresh erlang.sleep docs + add silent= flag to
 erlang.install

The asyncio.md and migration.md tables claimed sync erlang.sleep()
always releases the dirty scheduler. In v3.0 the dirty scheduler isn't
held during sync calls anyway (async NIF dispatch returns immediately);
what blocks is the worker pthread for py:call, the Erlang process for
py:exec/py:eval, or no thread at all for awaited async sleep. Update
the table and the sleep() docstring accordingly.

erlang.install() emits a DeprecationWarning on 3.12-3.13 by design,
but users who knowingly use the legacy pattern had no clean local
opt-out. Add a keyword-only silent=False; passing silent=True
suppresses the warning without disabling DeprecationWarning globally.
The 3.14+ RuntimeError stays unconditional.
---
 docs/asyncio.md               | 13 ++++----
 docs/migration.md             | 15 ++++++----
 priv/_erlang_impl/__init__.py | 56 +++++++++++++++++++++++------------
 priv/tests/test_erlang_api.py | 11 +++++++
 4 files changed, 65 insertions(+), 30 deletions(-)

diff --git a/docs/asyncio.md b/docs/asyncio.md
index a86a874..28b30e2 100644
--- a/docs/asyncio.md
+++ b/docs/asyncio.md
@@ -707,14 +707,15 @@ def sync_handler():
     return "done"
 ```
 
-**Behavior by Context:**
+**Behavior by context (v3.0 worker-pthread architecture):**
 
-| Context | Mechanism | Effect |
-|---------|-----------|--------|
-| Async (`await erlang.sleep()`) | `asyncio.sleep()` via `call_later()` | Yields to event loop, dirty scheduler released |
-| Sync (`erlang.sleep()`) | `erlang.call('_py_sleep')` with `receive/after` | Blocks Python, Erlang process suspends, dirty scheduler released |
+| Context | Mechanism | What blocks |
+|---------|-----------|-------------|
+| Async (`await erlang.sleep()`) | `asyncio.sleep()` via Erlang `send_after` | Yields to the event loop. The worker pthread is free to handle other tasks. |
+| Sync from `py:exec` / `py:eval` | `erlang.call('_py_sleep', secs)` triggers suspension; the dirty scheduler is released and an Erlang `receive ... after` parks the caller | Caller's Erlang process. Dirty scheduler free for other work. |
+| Sync from `py:call` (worker mode) | Falls back to `time.sleep`; replaying the Python frame around a suspension would change time-measurement semantics | The context's worker pthread for the sleep duration. Async NIF dispatch returns immediately so the BEAM dirty scheduler is **not** held; other Erlang processes and other contexts run normally. |
 
-Both modes allow other Erlang processes and Python contexts to run during the sleep.
+In every case the BEAM dirty scheduler is free during the sleep — the difference is what waits: nothing (awaited async sleep), the caller's Erlang process (`py:exec` / `py:eval`), or the context's worker pthread (`py:call`).
 
 #### asyncio.sleep(delay)
 
diff --git a/docs/migration.md b/docs/migration.md
index da2e21c..983bcf3 100644
--- a/docs/migration.md
+++ b/docs/migration.md
@@ -541,20 +541,25 @@ erlang.send(("my_server", "node@host"), {"event": "user_login", "user": 123})
 erlang.send(pid, "hello")
 ```
 
-### `erlang.sleep()` with Dirty Scheduler Release
+### `erlang.sleep()` cooperates with the BEAM scheduler
 
-Synchronous sleep that releases the Erlang dirty scheduler thread:
+Synchronous sleep that lets other Erlang processes and Python
+contexts make progress during the wait:
 
 ```python
 import erlang
 
 def slow_handler():
-    # Sleep without blocking Erlang scheduler
-    erlang.sleep(1.0)  # Releases dirty scheduler during sleep
+    erlang.sleep(1.0)
     return "done"
 ```
 
-Unlike `time.sleep()`, `erlang.sleep()` releases the dirty NIF thread while waiting, allowing other Python calls to use the scheduler slot.
+The BEAM dirty scheduler is never held during the sleep. The exact
+thread that blocks depends on context — the Erlang process for
+`py:exec` / `py:eval`, or the context's private worker pthread for
+`py:call`. See the [behavior-by-context table in the asyncio
+guide](asyncio.md#erlangsleepseconds) for the full breakdown. In all
+cases, other contexts and other Erlang processes continue running.
 
 ### `erlang.call()` Blocking with Explicit Scheduling
 
diff --git a/priv/_erlang_impl/__init__.py b/priv/_erlang_impl/__init__.py
index 7dd1ac8..05abea5 100644
--- a/priv/_erlang_impl/__init__.py
+++ b/priv/_erlang_impl/__init__.py
@@ -226,17 +226,26 @@ def sleep(seconds):
     - Async context: Returns an awaitable (use with await)
     - Sync context: Blocks synchronously
 
-    **Dirty Scheduler Release:**
-
-    In async context, uses asyncio.sleep() which routes through the Erlang
-    timer system via erlang:send_after. The dirty scheduler is released
-    because the Python code yields back to the event loop.
-
-    In sync context (when called from py:exec or py:eval), the sleep uses
-    Erlang's receive/after via erlang.call('_py_sleep', seconds), which
-    releases the dirty NIF scheduler thread. When called from py:call
-    contexts, falls back to Python's time.sleep() which blocks the dirty
-    scheduler but ensures correct time measurement behavior.
+    **Behavior by context (v3.0 worker-pthread architecture)**:
+
+    The BEAM dirty scheduler is never held during the sleep — the
+    difference is which thread blocks.
+
+    - Async (``await erlang.sleep()``) uses ``asyncio.sleep()``, which
+      routes through Erlang's ``send_after`` timer. The coroutine
+      yields to the event loop; the worker pthread handles other tasks.
+    - Sync from ``py:exec`` / ``py:eval`` calls
+      ``erlang.call('_py_sleep', seconds)``. The suspension machinery
+      releases the dirty scheduler and parks the caller's Erlang
+      process in a ``receive ... after``.
+    - Sync from ``py:call`` falls back to ``time.sleep`` — the worker
+      pthread blocks for the sleep duration. The BEAM dirty scheduler
+      is *not* held here either: the NIF dispatch returned immediately
+      and the caller is waiting in an Erlang ``receive`` on the
+      context process. Other Erlang processes and other contexts run
+      normally during the sleep. (Replaying a suspended Python frame
+      around ``time.time()`` would change time-measurement semantics,
+      which is why ``py:call`` doesn't take the suspension path.)
 
     Args:
         seconds: Duration to sleep in seconds (float or int).
@@ -246,13 +255,13 @@ def sleep(seconds):
         In sync context: None (blocks until sleep completes).
 
     Example:
-        # Async context - releases dirty scheduler via event loop yield
+        # Async context
         async def main():
-            await erlang.sleep(0.5)  # Uses Erlang timer system
+            await erlang.sleep(0.5)
 
         # Sync context
         def handler():
-            erlang.sleep(0.5)  # Blocks for 0.5 seconds
+            erlang.sleep(0.5)
     """
     try:
         asyncio.get_running_loop()
@@ -396,7 +405,7 @@ def _run_async_from_erlang(module, func, args, kwargs):
     return run(coro)
 
 
-def install():
+def install(*, silent=False):
     """Install ErlangEventLoopPolicy as the default event loop policy.
 
     Deprecated in Python 3.12+; raises ``RuntimeError`` on Python 3.14+
@@ -408,12 +417,20 @@ def install():
     both work on every supported Python version and don't touch the
     global policy.
 
+    Args:
+        silent: If True (keyword-only), suppress the per-call
+            ``DeprecationWarning`` on Python 3.12-3.13. Useful when
+            you knowingly rely on the legacy pattern and don't want
+            to silence ``DeprecationWarning`` globally. The 3.14+
+            ``RuntimeError`` is *not* suppressible — that pattern
+            won't work on 3.16 and the call has no fallback there.
+
     Example (legacy pattern, Python 3.9–3.13 only):
         import asyncio
         import erlang
 
-        erlang.install()
-        asyncio.run(main())  # Uses Erlang event loop
+        erlang.install(silent=True)  # opt out of the warning
+        asyncio.run(main())          # Uses Erlang event loop
     """
     if sys.version_info >= (3, 14):
         raise RuntimeError(
@@ -421,10 +438,11 @@ def install():
             "Use erlang.run(main) or "
             "asyncio.Runner(loop_factory=erlang.new_event_loop) instead."
         )
-    if sys.version_info >= (3, 12):
+    if sys.version_info >= (3, 12) and not silent:
         warnings.warn(
             "erlang.install() is deprecated in Python 3.12+. "
-            "Use erlang.run(main()) instead.",
+            "Use erlang.run(main()) instead, or pass silent=True "
+            "to suppress this warning.",
             DeprecationWarning,
             stacklevel=2
         )
diff --git a/priv/tests/test_erlang_api.py b/priv/tests/test_erlang_api.py
index ee725c6..83d1d71 100644
--- a/priv/tests/test_erlang_api.py
+++ b/priv/tests/test_erlang_api.py
@@ -279,6 +279,17 @@ def test_install_function(self):
                         any(issubclass(warning.category, DeprecationWarning)
                             for warning in w)
                     )
+
+                # silent=True must suppress the warning even with
+                # simplefilter("always").
+                with warnings.catch_warnings(record=True) as w:
+                    warnings.simplefilter("always")
+                    erlang.install(silent=True)
+                    install_warnings = [
+                        warning for warning in w
+                        if "erlang.install()" in str(warning.message)
+                    ]
+                    self.assertEqual(install_warnings, [])
             else:
                 erlang.install()