From 60f15e213164476c35111fe67b4f363b6afd1395 Mon Sep 17 00:00:00 2001 From: Nick Lang Date: Thu, 4 Jun 2026 11:20:47 -0600 Subject: [PATCH 1/4] feat: env-var configurable httpx pool config in split_pdf_hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make httpx.AsyncClient pool config in split_pdf_hook.run_tasks configurable via env vars: - UNSTRUCTURED_CLIENT_MAX_CONNECTIONS (default 100) - UNSTRUCTURED_CLIENT_MAX_KEEPALIVE_CONNECTIONS (default 20) - UNSTRUCTURED_CLIENT_KEEPALIVE_EXPIRY (default 5.0 seconds) Defaults match httpx's built-in defaults, so this is fully backward compatible. Also extends the existing split_pdf event=batch_async_start DEBUG log and event=plan_created INFO log to include the resolved pool values, making the active config visible in production logs. When the SDK is used in an environment where load balancing happens at TCP-connect time rather than per-request (a common Kubernetes setup with simple Services), httpx's default keepalive pooling can lock onto a subset of backends. Newly added backends never receive traffic because existing connections stay glued to the originally-resolved set. Allowing operators to force shorter keepalive (e.g. UNSTRUCTURED_CLIENT_MAX_KEEPALIVE_CONNECTIONS=1 plus a low UNSTRUCTURED_CLIENT_KEEPALIVE_EXPIRY) makes the client re-establish connections more frequently, redistributing across the available backends. Defaults are unchanged — this purely adds knobs. 
Co-Authored-By: Claude Opus 4.7 --- .../unit/test_split_pdf_hook.py | 38 +++++++++++++++++++ .../_hooks/custom/split_pdf_hook.py | 37 ++++++++++++++++-- 2 files changed, 72 insertions(+), 3 deletions(-) diff --git a/_test_unstructured_client/unit/test_split_pdf_hook.py b/_test_unstructured_client/unit/test_split_pdf_hook.py index bf0a83da..e61b1e5a 100644 --- a/_test_unstructured_client/unit/test_split_pdf_hook.py +++ b/_test_unstructured_client/unit/test_split_pdf_hook.py @@ -482,6 +482,44 @@ async def test_remaining_tasks_cancelled_when_fails_disallowed(): assert len(tasks) > cancelled_counter["cancelled"] > 0 +@pytest.mark.asyncio +async def test_unit_run_tasks_pool_limits_configurable_via_env( + monkeypatch: pytest.MonkeyPatch, +): + """Env vars override the httpx.AsyncClient connection-pool limits. + + Operators running the SDK in a Kubernetes Deployment with + connect-time-only load balancing (kube-proxy ClusterIP, meshless) + need to be able to shrink the keepalive pool so connections recycle + frequently and redistribute across backend pods. 
+ """ + monkeypatch.setenv("UNSTRUCTURED_CLIENT_MAX_CONNECTIONS", "7") + monkeypatch.setenv("UNSTRUCTURED_CLIENT_MAX_KEEPALIVE_CONNECTIONS", "1") + monkeypatch.setenv("UNSTRUCTURED_CLIENT_KEEPALIVE_EXPIRY", "30.0") + + captured: dict[str, httpx.Limits] = {} + real_async_client = httpx.AsyncClient + + def _capturing_async_client(*args, **kwargs): + captured["limits"] = kwargs.get("limits") + return real_async_client(*args, **kwargs) + + with patch( + "unstructured_client._hooks.custom.split_pdf_hook.httpx.AsyncClient", + side_effect=_capturing_async_client, + ): + await run_tasks( + [partial(_request_mock, fails=False, content="ok")], + allow_failed=True, + ) + + limits = captured["limits"] + assert isinstance(limits, httpx.Limits) + assert limits.max_connections == 7 + assert limits.max_keepalive_connections == 1 + assert limits.keepalive_expiry == 30.0 + + @patch("unstructured_client._hooks.custom.form_utils.Path") def test_unit_get_split_pdf_cache_tmp_data_dir_uses_dir_from_form_data(mock_path: MagicMock): """Test get_split_pdf_cache_tmp_data_dir uses the directory from the form data.""" diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py index 7a6b3489..45d74362 100644 --- a/src/unstructured_client/_hooks/custom/split_pdf_hook.py +++ b/src/unstructured_client/_hooks/custom/split_pdf_hook.py @@ -177,6 +177,28 @@ async def _order_keeper(index: int, coro: Awaitable) -> Tuple[int, httpx.Respons return index, response +def _resolve_pool_limits() -> httpx.Limits: + """Resolve httpx connection-pool limits from environment variables. + + Defaults match httpx's built-in defaults (max_connections=100, + max_keepalive_connections=20, keepalive_expiry=5.0) so behavior is + unchanged for callers that do not set the env vars. Operators running + the SDK inside a Kubernetes Deployment that load-balances only at + TCP-connect time (e.g. 
kube-proxy ClusterIP, no service mesh) can + lower these values to force frequent reconnects and redistribute + traffic across backend pods. + """ + return httpx.Limits( + max_connections=int(os.getenv("UNSTRUCTURED_CLIENT_MAX_CONNECTIONS", "100")), + max_keepalive_connections=int( + os.getenv("UNSTRUCTURED_CLIENT_MAX_KEEPALIVE_CONNECTIONS", "20") + ), + keepalive_expiry=float( + os.getenv("UNSTRUCTURED_CLIENT_KEEPALIVE_EXPIRY", "5.0") + ), + ) + + async def run_tasks( coroutines: list[partial[Coroutine[Any, Any, httpx.Response]]], allow_failed: bool = False, @@ -205,16 +227,21 @@ async def run_tasks( client_timeout_minutes = int(timeout_var) client_timeout = httpx.Timeout(60 * client_timeout_minutes) + limits = _resolve_pool_limits() + logger.debug( - "split_pdf event=batch_async_start operation_id=%s chunk_count=%d concurrency=%d client_timeout=%s allow_failed=%s", + "split_pdf event=batch_async_start operation_id=%s chunk_count=%d concurrency=%d client_timeout=%s allow_failed=%s pool_max_connections=%s pool_max_keepalive=%s pool_keepalive_expiry=%s", operation_id, len(coroutines), concurrency_level, client_timeout, allow_failed, + limits.max_connections, + limits.max_keepalive_connections, + limits.keepalive_expiry, ) - async with httpx.AsyncClient(timeout=client_timeout) as client: + async with httpx.AsyncClient(timeout=client_timeout, limits=limits) as client: armed_coroutines = [coro(async_client=client, limiter=limiter) for coro in coroutines] # type: ignore tasks = [ asyncio.create_task(_order_keeper(index, coro)) @@ -770,8 +797,9 @@ def _before_request_unlocked( ) self.coroutines_to_execute[operation_id].append(coroutine) + plan_limits = _resolve_pool_limits() logger.info( - "split_pdf event=plan_created operation_id=%s filename=%s strategy=%s page_range=%s-%s page_count=%d split_size=%d chunk_count=%d concurrency=%d allow_failed=%s cache_mode=%s timeout_seconds=%s retry_config_mode=%s", + "split_pdf event=plan_created operation_id=%s filename=%s 
strategy=%s page_range=%s-%s page_count=%d split_size=%d chunk_count=%d concurrency=%d allow_failed=%s cache_mode=%s timeout_seconds=%s retry_config_mode=%s pool_max_connections=%d pool_max_keepalive=%d pool_keepalive_expiry=%.1fs", operation_id, Path(pdf_file_meta["filename"]).name, form_data.get("strategy"), @@ -790,6 +818,9 @@ def _before_request_unlocked( self._retry_config_observability_mode( self.operation_retry_configs.get(operation_id), ), + plan_limits.max_connections, + plan_limits.max_keepalive_connections, + plan_limits.keepalive_expiry, ) self.pending_operation_ids[operation_id] = operation_id From 2d7f5f4d05ccd754de2d7bf63dbd9d918e5e0bf4 Mon Sep 17 00:00:00 2001 From: Nick Lang Date: Thu, 4 Jun 2026 16:07:47 -0600 Subject: [PATCH 2/4] feat: env-var configurable TLS certs for httpx client MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses review feedback on the pool-config PR: extend the same env-var-driven approach to TLS verification and client certificates so operators can plug in a custom CA bundle or mTLS cert without modifying SDK code. New env vars (all unset by default → httpx defaults): - UNSTRUCTURED_CLIENT_TLS_CA_BUNDLE: path to a CA bundle file that overrides the system trust store. Typical use: internal CA for a corporate proxy or private hosting endpoint. - UNSTRUCTURED_CLIENT_TLS_VERIFY: set to a falsy value ("false"/"0"/"no"/"off", case-insensitive, or an empty string) to disable server cert verification entirely. Dev-only path. - UNSTRUCTURED_CLIENT_TLS_CLIENT_CERT: path to a client cert PEM file for mTLS. - UNSTRUCTURED_CLIENT_TLS_CLIENT_KEY: optional separate key file path. If unset, httpx reads the key from the cert PEM. If the CA bundle is set, it wins over the verify flag — explicit trust store beats "disable verify". Resolved config flows into both the existing batch_async_start DEBUG log and the plan_created INFO log as `tls=<summary>`, using human-readable descriptors that don't leak filesystem paths. 
Tests cover: defaults, CA bundle path, verify-false (parameterized over case/synonyms), verify-true (parameterized over truthy values), CA-bundle-wins-over-verify, client cert alone, cert+key split, and a mocked-AsyncClient end-to-end check that the resolved config reaches the httpx.AsyncClient kwargs. 108/108 split_pdf_hook tests pass. Co-Authored-By: Claude Opus 4.7 --- .../unit/test_split_pdf_hook.py | 145 ++++++++++++++++++ .../_hooks/custom/split_pdf_hook.py | 77 +++++++++- 2 files changed, 219 insertions(+), 3 deletions(-) diff --git a/_test_unstructured_client/unit/test_split_pdf_hook.py b/_test_unstructured_client/unit/test_split_pdf_hook.py index e61b1e5a..2c566ccb 100644 --- a/_test_unstructured_client/unit/test_split_pdf_hook.py +++ b/_test_unstructured_client/unit/test_split_pdf_hook.py @@ -520,6 +520,151 @@ def _capturing_async_client(*args, **kwargs): assert limits.keepalive_expiry == 30.0 +def test_unit_resolve_tls_config_defaults_unchanged_without_env( + monkeypatch: pytest.MonkeyPatch, +): + """No env vars set → verify=True, cert=None (httpx defaults). Backward + compatibility check — callers that don't opt into TLS config see no + behavior change.""" + from unstructured_client._hooks.custom.split_pdf_hook import _resolve_tls_config + + for var in ( + "UNSTRUCTURED_CLIENT_TLS_CA_BUNDLE", + "UNSTRUCTURED_CLIENT_TLS_VERIFY", + "UNSTRUCTURED_CLIENT_TLS_CLIENT_CERT", + "UNSTRUCTURED_CLIENT_TLS_CLIENT_KEY", + ): + monkeypatch.delenv(var, raising=False) + + verify, cert = _resolve_tls_config() + assert verify is True + assert cert is None + + +def test_unit_resolve_tls_config_ca_bundle_from_env( + monkeypatch: pytest.MonkeyPatch, tmp_path +): + """UNSTRUCTURED_CLIENT_TLS_CA_BUNDLE → verify=. 
Use case: internal + CA bundle for a corporate proxy.""" + from unstructured_client._hooks.custom.split_pdf_hook import _resolve_tls_config + + ca_bundle = tmp_path / "custom-ca.pem" + ca_bundle.write_text("-----BEGIN CERTIFICATE-----\nfake\n-----END CERTIFICATE-----\n") + monkeypatch.setenv("UNSTRUCTURED_CLIENT_TLS_CA_BUNDLE", str(ca_bundle)) + + verify, cert = _resolve_tls_config() + assert verify == str(ca_bundle) + assert cert is None + + +@pytest.mark.parametrize("value", ["false", "False", "0", "no", "off", "FALSE"]) +def test_unit_resolve_tls_config_verify_disabled_via_env( + monkeypatch: pytest.MonkeyPatch, value: str +): + """UNSTRUCTURED_CLIENT_TLS_VERIFY= → verify=False. CA bundle + env takes precedence if both are set, but on its own this disables server + cert validation entirely. Dev-only path.""" + from unstructured_client._hooks.custom.split_pdf_hook import _resolve_tls_config + + monkeypatch.delenv("UNSTRUCTURED_CLIENT_TLS_CA_BUNDLE", raising=False) + monkeypatch.setenv("UNSTRUCTURED_CLIENT_TLS_VERIFY", value) + + verify, cert = _resolve_tls_config() + assert verify is False + + +@pytest.mark.parametrize("value", ["true", "1", "yes", "on", "anything else"]) +def test_unit_resolve_tls_config_verify_truthy_values_keep_verification( + monkeypatch: pytest.MonkeyPatch, value: str +): + """Any non-falsy value of UNSTRUCTURED_CLIENT_TLS_VERIFY keeps verification on.""" + from unstructured_client._hooks.custom.split_pdf_hook import _resolve_tls_config + + monkeypatch.delenv("UNSTRUCTURED_CLIENT_TLS_CA_BUNDLE", raising=False) + monkeypatch.setenv("UNSTRUCTURED_CLIENT_TLS_VERIFY", value) + + verify, cert = _resolve_tls_config() + assert verify is True + + +def test_unit_resolve_tls_config_ca_bundle_wins_over_verify_flag( + monkeypatch: pytest.MonkeyPatch, tmp_path +): + """If both are set, the CA bundle path is used and the verify flag is + ignored — explicit trust store takes precedence over "disable verify".""" + from 
unstructured_client._hooks.custom.split_pdf_hook import _resolve_tls_config + + ca_bundle = tmp_path / "ca.pem" + ca_bundle.write_text("...") + monkeypatch.setenv("UNSTRUCTURED_CLIENT_TLS_CA_BUNDLE", str(ca_bundle)) + monkeypatch.setenv("UNSTRUCTURED_CLIENT_TLS_VERIFY", "false") + + verify, cert = _resolve_tls_config() + assert verify == str(ca_bundle) + + +def test_unit_resolve_tls_config_client_cert_only(monkeypatch: pytest.MonkeyPatch): + """UNSTRUCTURED_CLIENT_TLS_CLIENT_CERT alone → cert=. httpx will + read the private key from the same PEM file.""" + from unstructured_client._hooks.custom.split_pdf_hook import _resolve_tls_config + + monkeypatch.setenv("UNSTRUCTURED_CLIENT_TLS_CLIENT_CERT", "/etc/ssl/client.pem") + monkeypatch.delenv("UNSTRUCTURED_CLIENT_TLS_CLIENT_KEY", raising=False) + + verify, cert = _resolve_tls_config() + assert cert == "/etc/ssl/client.pem" + + +def test_unit_resolve_tls_config_client_cert_and_key_split( + monkeypatch: pytest.MonkeyPatch, +): + """Both _CLIENT_CERT and _CLIENT_KEY → cert=(cert_path, key_path). For + PKI setups where cert and key live in separate files.""" + from unstructured_client._hooks.custom.split_pdf_hook import _resolve_tls_config + + monkeypatch.setenv("UNSTRUCTURED_CLIENT_TLS_CLIENT_CERT", "/etc/ssl/client.crt") + monkeypatch.setenv("UNSTRUCTURED_CLIENT_TLS_CLIENT_KEY", "/etc/ssl/client.key") + + verify, cert = _resolve_tls_config() + assert cert == ("/etc/ssl/client.crt", "/etc/ssl/client.key") + + +@pytest.mark.asyncio +async def test_unit_run_tasks_forwards_tls_config_to_httpx_async_client( + monkeypatch: pytest.MonkeyPatch, tmp_path +): + """run_tasks() actually wires verify+cert into httpx.AsyncClient(). End-to- + end check that _resolve_tls_config is called and its result reaches the + client construction. 
AsyncClient is fully mocked so we don't have to feed + httpx a real cert chain.""" + monkeypatch.setenv("UNSTRUCTURED_CLIENT_TLS_VERIFY", "false") + + captured: dict = {} + + class _MockAsyncClient: + def __init__(self, *args, **kwargs): + captured["verify"] = kwargs.get("verify") + captured["cert"] = kwargs.get("cert") + + async def __aenter__(self): + return self + + async def __aexit__(self, *exc): + return False + + with patch( + "unstructured_client._hooks.custom.split_pdf_hook.httpx.AsyncClient", + new=_MockAsyncClient, + ): + await run_tasks( + [partial(_request_mock, fails=False, content="ok")], + allow_failed=True, + ) + + assert captured["verify"] is False + assert captured["cert"] is None + + @patch("unstructured_client._hooks.custom.form_utils.Path") def test_unit_get_split_pdf_cache_tmp_data_dir_uses_dir_from_form_data(mock_path: MagicMock): """Test get_split_pdf_cache_tmp_data_dir uses the directory from the form data.""" diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py index 45d74362..4612b2de 100644 --- a/src/unstructured_client/_hooks/custom/split_pdf_hook.py +++ b/src/unstructured_client/_hooks/custom/split_pdf_hook.py @@ -199,6 +199,71 @@ def _resolve_pool_limits() -> httpx.Limits: ) +def _resolve_tls_config() -> tuple[Union[bool, str], Optional[Union[str, tuple[str, str]]]]: + """Resolve httpx TLS server-verification and client-certificate config + from environment variables. + + Returns a (verify, cert) tuple suitable for `httpx.AsyncClient(verify=..., cert=...)`. + + Server verification (`verify`): + - `UNSTRUCTURED_CLIENT_TLS_CA_BUNDLE` (path): use this CA bundle file + instead of the system trust store. Typical use: custom internal CA. + - `UNSTRUCTURED_CLIENT_TLS_VERIFY` (`"false"`, `"0"`, `"no"`, `"off"`): + disable certificate verification entirely. Intended for local dev + against self-signed test endpoints; **do not use in production**. 
+ - Otherwise: `True` (httpx default — use system trust store). + + Client certificate (`cert`, for mTLS): + - `UNSTRUCTURED_CLIENT_TLS_CLIENT_CERT` (path): PEM file. By default + httpx will read the private key from the same file. + - `UNSTRUCTURED_CLIENT_TLS_CLIENT_KEY` (path, optional): use this + separate key file. Required only if cert and key are split. + - Otherwise: `None`. + + Defaults match httpx's built-in defaults so behavior is unchanged for + callers that don't set any of these variables. + """ + verify: Union[bool, str] + if ca_bundle := os.getenv("UNSTRUCTURED_CLIENT_TLS_CA_BUNDLE"): + verify = ca_bundle + elif (verify_env := os.getenv("UNSTRUCTURED_CLIENT_TLS_VERIFY")) is not None: + verify = verify_env.strip().lower() not in ("false", "0", "no", "off", "") + else: + verify = True + + cert: Optional[Union[str, tuple[str, str]]] = None + if client_cert := os.getenv("UNSTRUCTURED_CLIENT_TLS_CLIENT_CERT"): + if client_key := os.getenv("UNSTRUCTURED_CLIENT_TLS_CLIENT_KEY"): + cert = (client_cert, client_key) + else: + cert = client_cert + + return verify, cert + + +def _describe_tls_config( + verify: Union[bool, str], cert: Optional[Union[str, tuple[str, str]]] +) -> str: + """Short human-readable summary of the TLS config, safe for log output. 
+ Emits "system-trust" / "no-verify" / "ca-bundle" rather than the actual + file path, so logs don't leak filesystem layout.""" + if verify is False: + verify_desc = "no-verify" + elif verify is True: + verify_desc = "system-trust" + else: + verify_desc = "custom-ca-bundle" + + if cert is None: + cert_desc = "none" + elif isinstance(cert, tuple): + cert_desc = "cert+key" + else: + cert_desc = "cert-only" + + return f"verify={verify_desc} client_cert={cert_desc}" + + async def run_tasks( coroutines: list[partial[Coroutine[Any, Any, httpx.Response]]], allow_failed: bool = False, @@ -228,9 +293,10 @@ async def run_tasks( client_timeout = httpx.Timeout(60 * client_timeout_minutes) limits = _resolve_pool_limits() + verify, cert = _resolve_tls_config() logger.debug( - "split_pdf event=batch_async_start operation_id=%s chunk_count=%d concurrency=%d client_timeout=%s allow_failed=%s pool_max_connections=%s pool_max_keepalive=%s pool_keepalive_expiry=%s", + "split_pdf event=batch_async_start operation_id=%s chunk_count=%d concurrency=%d client_timeout=%s allow_failed=%s pool_max_connections=%s pool_max_keepalive=%s pool_keepalive_expiry=%s tls=%s", operation_id, len(coroutines), concurrency_level, @@ -239,9 +305,12 @@ async def run_tasks( limits.max_connections, limits.max_keepalive_connections, limits.keepalive_expiry, + _describe_tls_config(verify, cert), ) - async with httpx.AsyncClient(timeout=client_timeout, limits=limits) as client: + async with httpx.AsyncClient( + timeout=client_timeout, limits=limits, verify=verify, cert=cert + ) as client: armed_coroutines = [coro(async_client=client, limiter=limiter) for coro in coroutines] # type: ignore tasks = [ asyncio.create_task(_order_keeper(index, coro)) @@ -798,8 +867,9 @@ def _before_request_unlocked( self.coroutines_to_execute[operation_id].append(coroutine) plan_limits = _resolve_pool_limits() + plan_verify, plan_cert = _resolve_tls_config() logger.info( - "split_pdf event=plan_created operation_id=%s filename=%s 
strategy=%s page_range=%s-%s page_count=%d split_size=%d chunk_count=%d concurrency=%d allow_failed=%s cache_mode=%s timeout_seconds=%s retry_config_mode=%s pool_max_connections=%d pool_max_keepalive=%d pool_keepalive_expiry=%.1fs", + "split_pdf event=plan_created operation_id=%s filename=%s strategy=%s page_range=%s-%s page_count=%d split_size=%d chunk_count=%d concurrency=%d allow_failed=%s cache_mode=%s timeout_seconds=%s retry_config_mode=%s pool_max_connections=%d pool_max_keepalive=%d pool_keepalive_expiry=%.1fs tls=%s", operation_id, Path(pdf_file_meta["filename"]).name, form_data.get("strategy"), @@ -821,6 +891,7 @@ def _before_request_unlocked( plan_limits.max_connections, plan_limits.max_keepalive_connections, plan_limits.keepalive_expiry, + _describe_tls_config(plan_verify, plan_cert), ) self.pending_operation_ids[operation_id] = operation_id From 789a732b0ca3c900bae7fa7fa4d86d4f9396d7b1 Mon Sep 17 00:00:00 2001 From: Nick Lang Date: Fri, 5 Jun 2026 14:28:48 -0600 Subject: [PATCH 3/4] feat: use standard SSL_CERT_FILE / REQUESTS_CA_BUNDLE for trust store MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address Trahey's review feedback: 1. Drop the word "Client" from the trust-store env vars. In TLS, "client" specifically means mTLS client authentication, so a name like UNSTRUCTURED_CLIENT_TLS_VERIFY is ambiguous when it's really about server-cert verification. 2. Honor the standard env vars other libraries already respect (SSL_CERT_FILE first, then REQUESTS_CA_BUNDLE). A single env-var setting now applies uniformly across Python tooling. The mTLS client-auth env vars (UNSTRUCTURED_CLIENT_TLS_CLIENT_CERT / _CLIENT_KEY) keep their names — the word "CLIENT" there refers to TLS client authentication, which is its correct usage. The disable-verify knob is removed entirely; the standard env vars have no such escape hatch by design. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../unit/test_split_pdf_hook.py | 85 +++++++++---------- .../_hooks/custom/split_pdf_hook.py | 39 ++++----- 2 files changed, 55 insertions(+), 69 deletions(-) diff --git a/_test_unstructured_client/unit/test_split_pdf_hook.py b/_test_unstructured_client/unit/test_split_pdf_hook.py index 2c566ccb..9a69d9a8 100644 --- a/_test_unstructured_client/unit/test_split_pdf_hook.py +++ b/_test_unstructured_client/unit/test_split_pdf_hook.py @@ -520,6 +520,14 @@ def _capturing_async_client(*args, **kwargs): assert limits.keepalive_expiry == 30.0 +_TLS_ENV_VARS = ( + "SSL_CERT_FILE", + "REQUESTS_CA_BUNDLE", + "UNSTRUCTURED_CLIENT_TLS_CLIENT_CERT", + "UNSTRUCTURED_CLIENT_TLS_CLIENT_KEY", +) + + def test_unit_resolve_tls_config_defaults_unchanged_without_env( monkeypatch: pytest.MonkeyPatch, ): @@ -528,12 +536,7 @@ def test_unit_resolve_tls_config_defaults_unchanged_without_env( behavior change.""" from unstructured_client._hooks.custom.split_pdf_hook import _resolve_tls_config - for var in ( - "UNSTRUCTURED_CLIENT_TLS_CA_BUNDLE", - "UNSTRUCTURED_CLIENT_TLS_VERIFY", - "UNSTRUCTURED_CLIENT_TLS_CLIENT_CERT", - "UNSTRUCTURED_CLIENT_TLS_CLIENT_KEY", - ): + for var in _TLS_ENV_VARS: monkeypatch.delenv(var, raising=False) verify, cert = _resolve_tls_config() @@ -541,66 +544,55 @@ def test_unit_resolve_tls_config_defaults_unchanged_without_env( assert cert is None -def test_unit_resolve_tls_config_ca_bundle_from_env( +def test_unit_resolve_tls_config_ssl_cert_file_from_env( monkeypatch: pytest.MonkeyPatch, tmp_path ): - """UNSTRUCTURED_CLIENT_TLS_CA_BUNDLE → verify=. Use case: internal - CA bundle for a corporate proxy.""" + """SSL_CERT_FILE (stdlib ssl convention) → verify=. 
Use case: + internal CA bundle shared with other Python tooling.""" from unstructured_client._hooks.custom.split_pdf_hook import _resolve_tls_config + for var in _TLS_ENV_VARS: + monkeypatch.delenv(var, raising=False) + ca_bundle = tmp_path / "custom-ca.pem" ca_bundle.write_text("-----BEGIN CERTIFICATE-----\nfake\n-----END CERTIFICATE-----\n") - monkeypatch.setenv("UNSTRUCTURED_CLIENT_TLS_CA_BUNDLE", str(ca_bundle)) + monkeypatch.setenv("SSL_CERT_FILE", str(ca_bundle)) verify, cert = _resolve_tls_config() assert verify == str(ca_bundle) assert cert is None -@pytest.mark.parametrize("value", ["false", "False", "0", "no", "off", "FALSE"]) -def test_unit_resolve_tls_config_verify_disabled_via_env( - monkeypatch: pytest.MonkeyPatch, value: str +def test_unit_resolve_tls_config_requests_ca_bundle_from_env( + monkeypatch: pytest.MonkeyPatch, tmp_path ): - """UNSTRUCTURED_CLIENT_TLS_VERIFY= → verify=False. CA bundle - env takes precedence if both are set, but on its own this disables server - cert validation entirely. 
Dev-only path.""" + """REQUESTS_CA_BUNDLE (requests/httpx-ecosystem convention) → verify=.""" from unstructured_client._hooks.custom.split_pdf_hook import _resolve_tls_config - monkeypatch.delenv("UNSTRUCTURED_CLIENT_TLS_CA_BUNDLE", raising=False) - monkeypatch.setenv("UNSTRUCTURED_CLIENT_TLS_VERIFY", value) - - verify, cert = _resolve_tls_config() - assert verify is False - - -@pytest.mark.parametrize("value", ["true", "1", "yes", "on", "anything else"]) -def test_unit_resolve_tls_config_verify_truthy_values_keep_verification( - monkeypatch: pytest.MonkeyPatch, value: str -): - """Any non-falsy value of UNSTRUCTURED_CLIENT_TLS_VERIFY keeps verification on.""" - from unstructured_client._hooks.custom.split_pdf_hook import _resolve_tls_config + for var in _TLS_ENV_VARS: + monkeypatch.delenv(var, raising=False) - monkeypatch.delenv("UNSTRUCTURED_CLIENT_TLS_CA_BUNDLE", raising=False) - monkeypatch.setenv("UNSTRUCTURED_CLIENT_TLS_VERIFY", value) + ca_bundle = tmp_path / "custom-ca.pem" + ca_bundle.write_text("-----BEGIN CERTIFICATE-----\nfake\n-----END CERTIFICATE-----\n") + monkeypatch.setenv("REQUESTS_CA_BUNDLE", str(ca_bundle)) verify, cert = _resolve_tls_config() - assert verify is True + assert verify == str(ca_bundle) + assert cert is None -def test_unit_resolve_tls_config_ca_bundle_wins_over_verify_flag( - monkeypatch: pytest.MonkeyPatch, tmp_path +def test_unit_resolve_tls_config_ssl_cert_file_wins_over_requests_ca_bundle( + monkeypatch: pytest.MonkeyPatch, ): - """If both are set, the CA bundle path is used and the verify flag is - ignored — explicit trust store takes precedence over "disable verify".""" + """If both standard env vars are set, SSL_CERT_FILE takes precedence — + it's the lower-level stdlib convention.""" from unstructured_client._hooks.custom.split_pdf_hook import _resolve_tls_config - ca_bundle = tmp_path / "ca.pem" - ca_bundle.write_text("...") - monkeypatch.setenv("UNSTRUCTURED_CLIENT_TLS_CA_BUNDLE", str(ca_bundle)) - 
monkeypatch.setenv("UNSTRUCTURED_CLIENT_TLS_VERIFY", "false") + monkeypatch.setenv("SSL_CERT_FILE", "/etc/ssl/stdlib-ca.pem") + monkeypatch.setenv("REQUESTS_CA_BUNDLE", "/etc/ssl/requests-ca.pem") - verify, cert = _resolve_tls_config() - assert verify == str(ca_bundle) + verify, _ = _resolve_tls_config() + assert verify == "/etc/ssl/stdlib-ca.pem" def test_unit_resolve_tls_config_client_cert_only(monkeypatch: pytest.MonkeyPatch): @@ -637,7 +629,10 @@ async def test_unit_run_tasks_forwards_tls_config_to_httpx_async_client( end check that _resolve_tls_config is called and its result reaches the client construction. AsyncClient is fully mocked so we don't have to feed httpx a real cert chain.""" - monkeypatch.setenv("UNSTRUCTURED_CLIENT_TLS_VERIFY", "false") + ca_bundle = tmp_path / "ca.pem" + ca_bundle.write_text("-----BEGIN CERTIFICATE-----\nfake\n-----END CERTIFICATE-----\n") + monkeypatch.setenv("SSL_CERT_FILE", str(ca_bundle)) + monkeypatch.setenv("UNSTRUCTURED_CLIENT_TLS_CLIENT_CERT", "/etc/ssl/client.pem") captured: dict = {} @@ -661,8 +656,8 @@ async def __aexit__(self, *exc): allow_failed=True, ) - assert captured["verify"] is False - assert captured["cert"] is None + assert captured["verify"] == str(ca_bundle) + assert captured["cert"] == "/etc/ssl/client.pem" @patch("unstructured_client._hooks.custom.form_utils.Path") diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py index 4612b2de..912da332 100644 --- a/src/unstructured_client/_hooks/custom/split_pdf_hook.py +++ b/src/unstructured_client/_hooks/custom/split_pdf_hook.py @@ -200,20 +200,20 @@ def _resolve_pool_limits() -> httpx.Limits: def _resolve_tls_config() -> tuple[Union[bool, str], Optional[Union[str, tuple[str, str]]]]: - """Resolve httpx TLS server-verification and client-certificate config + """Resolve httpx TLS trust-store and mTLS client-certificate config from environment variables. 
Returns a (verify, cert) tuple suitable for `httpx.AsyncClient(verify=..., cert=...)`. - Server verification (`verify`): - - `UNSTRUCTURED_CLIENT_TLS_CA_BUNDLE` (path): use this CA bundle file - instead of the system trust store. Typical use: custom internal CA. - - `UNSTRUCTURED_CLIENT_TLS_VERIFY` (`"false"`, `"0"`, `"no"`, `"off"`): - disable certificate verification entirely. Intended for local dev - against self-signed test endpoints; **do not use in production**. + Trust store (`verify`) — honors the same standard env vars other + libraries use, so a single env-var setting applies uniformly across + tools: + - `SSL_CERT_FILE` (path): stdlib `ssl` convention. + - `REQUESTS_CA_BUNDLE` (path): `requests` / `httpx`-ecosystem + convention. Checked if `SSL_CERT_FILE` is unset. - Otherwise: `True` (httpx default — use system trust store). - Client certificate (`cert`, for mTLS): + mTLS client certificate (`cert`): - `UNSTRUCTURED_CLIENT_TLS_CLIENT_CERT` (path): PEM file. By default httpx will read the private key from the same file. - `UNSTRUCTURED_CLIENT_TLS_CLIENT_KEY` (path, optional): use this @@ -223,13 +223,9 @@ def _resolve_tls_config() -> tuple[Union[bool, str], Optional[Union[str, tuple[s Defaults match httpx's built-in defaults so behavior is unchanged for callers that don't set any of these variables. 
""" - verify: Union[bool, str] - if ca_bundle := os.getenv("UNSTRUCTURED_CLIENT_TLS_CA_BUNDLE"): - verify = ca_bundle - elif (verify_env := os.getenv("UNSTRUCTURED_CLIENT_TLS_VERIFY")) is not None: - verify = verify_env.strip().lower() not in ("false", "0", "no", "off", "") - else: - verify = True + verify: Union[bool, str] = ( + os.getenv("SSL_CERT_FILE") or os.getenv("REQUESTS_CA_BUNDLE") or True + ) cert: Optional[Union[str, tuple[str, str]]] = None if client_cert := os.getenv("UNSTRUCTURED_CLIENT_TLS_CLIENT_CERT"): @@ -245,14 +241,9 @@ def _describe_tls_config( verify: Union[bool, str], cert: Optional[Union[str, tuple[str, str]]] ) -> str: """Short human-readable summary of the TLS config, safe for log output. - Emits "system-trust" / "no-verify" / "ca-bundle" rather than the actual - file path, so logs don't leak filesystem layout.""" - if verify is False: - verify_desc = "no-verify" - elif verify is True: - verify_desc = "system-trust" - else: - verify_desc = "custom-ca-bundle" + Emits "system-trust" / "custom-ca-bundle" rather than the actual file + path, so logs don't leak filesystem layout.""" + verify_desc = "system-trust" if verify is True else "custom-ca-bundle" if cert is None: cert_desc = "none" @@ -261,7 +252,7 @@ def _describe_tls_config( else: cert_desc = "cert-only" - return f"verify={verify_desc} client_cert={cert_desc}" + return f"trust_store={verify_desc} mtls_cert={cert_desc}" async def run_tasks( From 60fad51d3fb4471400354e188a87ab24063250b3 Mon Sep 17 00:00:00 2001 From: Nick Lang Date: Fri, 5 Jun 2026 15:11:37 -0600 Subject: [PATCH 4/4] chore: bump to 0.45.0 to release pool + TLS env-var config Per Austin's review note: the repo no longer auto-generates, so this PR owns its version bump and changelog entry directly. 
- src/unstructured_client/_version.py: 0.44.1 -> 0.45.0 - CHANGELOG.md: 0.45.0 Features section covering pool limits, trust store (SSL_CERT_FILE / REQUESTS_CA_BUNDLE), mTLS client cert, and the extended plan_created observability log - RELEASES.md: append matching Speakeasy-style v0.45.0 entry Minor bump because all additions are new optional env vars with defaults that match httpx (fully backward compatible). Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 8 ++++++++ RELEASES.md | 10 ++++++++++ src/unstructured_client/_version.py | 4 ++-- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a5d7c651..0661441e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +## 0.45.0 + +### Features +* Make the split-PDF `httpx.AsyncClient` connection-pool limits configurable via env vars: `UNSTRUCTURED_CLIENT_MAX_CONNECTIONS` (default `100`), `UNSTRUCTURED_CLIENT_MAX_KEEPALIVE_CONNECTIONS` (default `20`), and `UNSTRUCTURED_CLIENT_KEEPALIVE_EXPIRY` (default `5.0`s). Defaults match httpx, so behavior is unchanged unless set. Useful when deploying behind a connect-time-only load balancer (e.g. Kubernetes ClusterIP without a mesh) where shorter keepalives force connections to redistribute across backend pods. +* Honor the standard `SSL_CERT_FILE` / `REQUESTS_CA_BUNDLE` env vars to point the split-PDF `httpx.AsyncClient` at a custom trust store, so a single env-var setting applies uniformly across Python tooling. +* Add `UNSTRUCTURED_CLIENT_TLS_CLIENT_CERT` and `UNSTRUCTURED_CLIENT_TLS_CLIENT_KEY` env vars to wire an mTLS client certificate into the split-PDF `httpx.AsyncClient` (single PEM, or split cert + key files). +* Extend the split-PDF `event=plan_created` log to include the resolved pool limits and trust-store / mTLS mode so the active config is visible in production logs. 
+ ## 0.44.1 ### Features diff --git a/RELEASES.md b/RELEASES.md index 10855b06..51643612 100644 --- a/RELEASES.md +++ b/RELEASES.md @@ -1231,3 +1231,13 @@ Based on: - [python v0.44.1] . ### Releases - [PyPI v0.44.1] https://pypi.org/project/unstructured-client/0.44.1 - . + +## 2026-06-05 00:00:00 +### Changes +Based on: +- OpenAPI Doc +- Speakeasy CLI 1.601.0 (2.680.0) https://github.com/speakeasy-api/speakeasy +### Generated +- [python v0.45.0] . +### Releases +- [PyPI v0.45.0] https://pypi.org/project/unstructured-client/0.45.0 - . diff --git a/src/unstructured_client/_version.py b/src/unstructured_client/_version.py index cffd0888..8a14e983 100644 --- a/src/unstructured_client/_version.py +++ b/src/unstructured_client/_version.py @@ -3,10 +3,10 @@ import importlib.metadata __title__: str = "unstructured-client" -__version__: str = "0.44.1" +__version__: str = "0.45.0" __openapi_doc_version__: str = "1.2.31" __gen_version__: str = "2.680.0" -__user_agent__: str = "speakeasy-sdk/python 0.44.1 2.680.0 1.2.31 unstructured-client" +__user_agent__: str = "speakeasy-sdk/python 0.45.0 2.680.0 1.2.31 unstructured-client" try: if __package__ is not None: