From 5054436dacb5424e8c37ef4bda702e4de9d45817 Mon Sep 17 00:00:00 2001 From: Hari Dara Date: Thu, 23 Apr 2026 20:00:54 +0530 Subject: [PATCH] HBASE-30112 Fix for flapping TestAsyncTableScan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tool: Claude Code Model: Opus 4.6 (1M context) Prompt: You need to diagnose the HBase test TestAsyncTableScan that is flapping. It had multiple failures and errors in the PR validation build, but none when I ran locally. The source code is at @../../src/apache/hbase/ and the logs for good run is at @good-run and bad run is at @bad-run. Can you compare the logs and try to understand why the run failed and look at the code for a potential fix by focussing solely on these logs alone? Summary of the fix by AI: ## Summary **Root cause**: A race condition between span export and test span isolation causes stale `TracedScanResultConsumer#onComplete` spans from a previous test to appear in the current test's span exporter. **The race**: In `TracedScanResultConsumer.onComplete()`, the `TraceUtil.trace()` wrapper calls `delegate.onComplete()` (which sets `finished=true` and `notifyAll()`) **before** calling `span.end()` (which exports the span). The test thread can wake up from `getAll()`, finish the current test, start the next test (which calls `clearSpans()`), and only then does the ForkJoinPool thread call `span.end()` — exporting the stale span into the next test's freshly cleared exporter. **Why it flaps**: The race is timing-dependent. On CI (slower machines), the ForkJoinPool thread is more likely to be preempted between `delegate.onComplete()` and `span.end()`, giving the test thread time to proceed to the next test. Locally (faster machines), the window is too small to hit. **Why only `scan=batch`**: The batch scan mode produces more results (each result is split in two), making each test slightly longer and increasing the probability that the ForkJoinPool thread scheduling creates the right timing for the race. **The fix**: Modified `assertTraceContinuity()` and `assertTraceError()` in `TestAsyncTableScan.java` to include `hasParentSpanId(scanOperationSpanId)` in the matchers used for both filtering and `waitForSpan`. This ensures the assertions only consider spans that belong to the current test's SCAN operation, ignoring any stale spans from previous tests. The `waitForSpan` with the more specific matcher also correctly waits for the current test's span rather than being satisfied by a stale one. --- .../hbase/client/TestAsyncTableScan.java | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestAsyncTableScan.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestAsyncTableScan.java index 7cdb58c12604..30e3a95a7c1f 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestAsyncTableScan.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestAsyncTableScan.java @@ -124,17 +124,17 @@ protected void assertTraceContinuity() { .max((a, b) -> Long.compare(a.getEndEpochNanos(), b.getEndEpochNanos())) .map(SpanData::getSpanId).get(); - final Matcher onNextMatcher = hasName("TracedScanResultConsumer#onNext"); + final Matcher onNextMatcher = + allOf(hasName("TracedScanResultConsumer#onNext"), hasParentSpanId(scanOperationSpanId)); waitForSpan(onNextMatcher); - spanStream().filter(onNextMatcher::matches) - .forEach(span -> assertThat(span, allOf(onNextMatcher, hasParentSpanId(scanOperationSpanId), - hasStatusWithCode(StatusCode.OK), hasEnded()))); + spanStream().filter(onNextMatcher::matches).forEach( + span -> assertThat(span, allOf(onNextMatcher, hasStatusWithCode(StatusCode.OK), hasEnded()))); - final Matcher onCompleteMatcher = hasName("TracedScanResultConsumer#onComplete"); + final Matcher onCompleteMatcher = + allOf(hasName("TracedScanResultConsumer#onComplete"), hasParentSpanId(scanOperationSpanId)); waitForSpan(onCompleteMatcher); - spanStream().filter(onCompleteMatcher::matches) - .forEach(span -> assertThat(span, allOf(onCompleteMatcher, - hasParentSpanId(scanOperationSpanId), hasStatusWithCode(StatusCode.OK), hasEnded()))); + spanStream().filter(onCompleteMatcher::matches).forEach(span -> assertThat(span, + allOf(onCompleteMatcher, hasStatusWithCode(StatusCode.OK), hasEnded()))); } @Override @@ -163,10 +163,10 @@ protected void assertTraceContinuity() { .max((a, b) -> Long.compare(a.getEndEpochNanos(), b.getEndEpochNanos())) .map(SpanData::getSpanId).get(); - final Matcher onErrorMatcher = hasName("TracedScanResultConsumer#onError"); + final Matcher onErrorMatcher = + allOf(hasName("TracedScanResultConsumer#onError"), hasParentSpanId(scanOperationSpanId)); waitForSpan(onErrorMatcher); - spanStream().filter(onErrorMatcher::matches) - .forEach(span -> assertThat(span, allOf(onErrorMatcher, hasParentSpanId(scanOperationSpanId), - hasStatusWithCode(StatusCode.OK), hasEnded()))); + spanStream().filter(onErrorMatcher::matches).forEach(span -> assertThat(span, + allOf(onErrorMatcher, hasStatusWithCode(StatusCode.OK), hasEnded()))); } }