diff --git a/.github/workflows/code-coverage.yml b/.github/workflows/code-coverage.yml index 3a1050573..f4882f669 100644 --- a/.github/workflows/code-coverage.yml +++ b/.github/workflows/code-coverage.yml @@ -16,18 +16,23 @@ on: # the merge has already happened and the coverage check has no power # to block. Hence we deliberately don't subscribe to `push:main`. # -# Serialise E2E runs per ref so a force-push (or a fast follow-up commit) -# on a PR cancels the previous run instead of racing it against shared -# warehouse state (Delta tables, UC Volume files, etc.). -# -# Merge-queue runs are NOT cancelled — each queue entry needs its own -# clean CI signal so a regression on entry N doesn't get hidden by -# entry N+1 arriving seconds later. (Concurrent queue runs can still -# collide on shared warehouse state, but that's the cost of preserving -# per-entry signal; the uuid-suffix conventions in the e2e tests are -# what keep them isolated.) +# Concurrency groups: +# - pull_request: per-ref + cancel-in-progress. A force-push or fast +# follow-up commit on a PR cancels the previous run instead of +# racing it against shared warehouse state (Delta tables, UC Volume +# files, telemetry endpoints, etc.). +# - merge_group: serialised globally with a fixed group name. The +# warehouse can't tolerate two parallel queue entries hammering +# telemetry / retry paths simultaneously — we have observed flaky +# retry-test failures (extra `/telemetry-ext` retries inflating +# mock.call_count) under that load. Running queue entries one at a +# time costs queue throughput (one entry at a time, ~17 min each) +# but keeps signal trustworthy. cancel-in-progress is off so each +# entry gets a complete run. +# - workflow_dispatch: shares the merge_group group; manual triggers +# are rare enough that serialising them with the queue is fine. concurrency: - group: e2e-${{ github.workflow }}-${{ github.ref }} + group: ${{ github.event_name == 'pull_request' && format('e2e-pr-{0}', github.ref) || 'e2e-mq-serial' }} cancel-in-progress: ${{ github.event_name == 'pull_request' }} jobs: diff --git a/tests/e2e/common/retry_test_mixins.py b/tests/e2e/common/retry_test_mixins.py index af635331d..08876e145 100755 --- a/tests/e2e/common/retry_test_mixins.py +++ b/tests/e2e/common/retry_test_mixins.py @@ -17,6 +17,7 @@ ) from databricks.sql.telemetry.telemetry_client import ( NoopTelemetryClient, + TelemetryClient, TelemetryClientFactory, _TelemetryClientHolder, ) @@ -27,8 +28,22 @@ def _isolated_from_telemetry(): # Tests that mock urllib3 globally (via _get_conn / _validate_conn) also # intercept background telemetry pushes from the shared # TelemetryClientFactory executor — inflating mock.call_count and breaking - # assertions like `call_count == 6`. Drain the factory and force any new - # connection to use NoopTelemetryClient for the duration of the test. + # assertions like `call_count == 6`. Three layers of defence: + # + # 1. Drain TelemetryClientFactory and override initialize_telemetry_client + # so new connections install NoopTelemetryClient (which submits nothing). + # 2. Patch TelemetryClient._send_telemetry to a no-op as a backstop — covers + # any real TelemetryClient instance that slips in (e.g. a stale module- + # global, a code path that bypasses initialize_telemetry_client, or + # anything created before this context entered). + # 3. Patch TelemetryClient._export_event to a no-op so even if a real + # client receives an event, the event never reaches the queue and the + # flush logic never fires. + # + # Without layer 2/3 we have observed `/telemetry-ext` requests showing up + # in merge_group runs (concurrent CI load on the warehouse stresses paths + # that single-test runs don't hit), inflating retry-test counts and + # producing flaky AssertionErrors. with TelemetryClientFactory._lock: saved_clients = TelemetryClientFactory._clients saved_executor = TelemetryClientFactory._executor @@ -55,11 +70,21 @@ def _install_noop(*args, host_url=None, **kwargs): NoopTelemetryClient() ) + def _noop_send(self, events): + return None + + def _noop_export(self, event): + return None + try: with patch.object( TelemetryClientFactory, "initialize_telemetry_client", staticmethod(_install_noop), + ), patch.object( + TelemetryClient, "_send_telemetry", _noop_send + ), patch.object( + TelemetryClient, "_export_event", _noop_export ): yield finally: