Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions core/src/subgraph/runner/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1303,12 +1303,14 @@ where
.unfail_non_deterministic_error(&block_ptr)
.await?;

// Stop trying to unfail.
self.state.should_try_unfail_non_deterministic = false;
// Stop trying unless we're still behind the error block.
if outcome != UnfailOutcome::BehindErrorBlock {
self.state.should_try_unfail_non_deterministic = false;

if let UnfailOutcome::Unfailed = outcome {
self.metrics.subgraph.deployment_status.running();
self.state.backoff.reset();
if outcome == UnfailOutcome::Unfailed {
self.metrics.subgraph.deployment_status.running();
self.state.backoff.reset();
}
}
}

Expand Down
4 changes: 4 additions & 0 deletions graph/src/components/store/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -668,8 +668,12 @@ pub enum EntityOperation {

/// Result of an attempt to unfail a subgraph deployment.
///
/// The variants are plain, fieldless markers, so the type is cheap to copy and
/// supports total equality.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum UnfailOutcome {
    /// Nothing to do - no error exists, or error is of wrong type (e.g., deterministic).
    Noop,
    /// Successfully unfailed the subgraph.
    Unfailed,
    /// The deployment head is still behind the error block, retry on subsequent blocks.
    BehindErrorBlock,
}

#[derive(Clone, PartialEq, Eq, Debug)]
Expand Down
5 changes: 3 additions & 2 deletions store/postgres/src/deployment_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1872,7 +1872,8 @@ impl DeploymentStore {

Ok(UnfailOutcome::Unfailed)
}
// NOOP, the deployment head is still before where non-deterministic error happened.
// The deployment head is still before the block where the non-deterministic error happened.
// Return BehindErrorBlock so the caller knows to retry on subsequent blocks.
block_range => {
info!(
self.logger,
Expand All @@ -1884,7 +1885,7 @@ impl DeploymentStore {
"error_block_hash" => subgraph_error.block_hash.as_ref().map(|hash| format!("0x{}", hex::encode(hash))),
);

Ok(UnfailOutcome::Noop)
Ok(UnfailOutcome::BehindErrorBlock)
}
}
}.scope_boxed()).await
Expand Down
4 changes: 2 additions & 2 deletions store/test-store/tests/postgres/subgraph.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1215,14 +1215,14 @@ fn fail_unfail_non_deterministic_error_noop() {
// Fail the subgraph with a non-deterministic error, but with an advanced block.
writable.fail_subgraph(error).await.unwrap();

// Since the block range of the block won't match the deployment head, this will be NOOP.
// Since the deployment head is behind the error block, this returns BehindErrorBlock.
let outcome = writable
.unfail_non_deterministic_error(&BLOCKS[1])
.await
.unwrap();

// State remains the same, apart from a new error added to the database.
assert_eq!(outcome, UnfailOutcome::Noop);
assert_eq!(outcome, UnfailOutcome::BehindErrorBlock);
assert_eq!(count().await, 2);
let vi = get_version_info(&store, NAME).await;
assert_eq!(NAME, vi.deployment_id.as_str());
Expand Down
74 changes: 74 additions & 0 deletions tests/tests/runner_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1418,3 +1418,77 @@ async fn skip_duplicates() {
})
);
}

/// Regression test for issue #6205: the unfail mechanism is expected to keep
/// retrying until the deployment head reaches the block the error was
/// recorded at.
///
/// Steps:
/// 1. Sync the deployment to block 1.
/// 2. Record a non-deterministic fatal error at block 3 (ahead of the head).
/// 3. Run the runner up to block 3.
/// 4. While the head is behind block 3, unfail should report
///    `BehindErrorBlock` and keep trying.
/// 5. Once block 3 is reached, unfail should succeed and health → Healthy.
#[graph::test]
async fn non_deterministic_unfail_retries_until_error_block() -> anyhow::Result<()> {
    let RunnerTestRecipe { stores, test_info } =
        RunnerTestRecipe::new("non_deterministic_unfail_retries", "end-block").await;

    // Chain layout: genesis followed by blocks 1, 2 and 3.
    let genesis_block = genesis();
    let mut first = empty_block(genesis_block.ptr(), test_ptr(1));
    let mut second = empty_block(first.ptr(), test_ptr(2));
    let mut third = empty_block(second.ptr(), test_ptr(3));

    // Every non-genesis block gets a log trigger to exercise normal block processing.
    for (block, payload) in [(&mut first, "a"), (&mut second, "b"), (&mut third, "c")] {
        push_test_log(block, payload);
    }

    let chain = chain(
        &test_info.test_name,
        vec![genesis_block, first, second, third],
        &stores,
        None,
    )
    .await;
    let ctx = fixture::setup(&test_info, &stores, &chain, None, None).await;

    // Bring the deployment head up to block 1 before injecting the failure.
    ctx.start_and_sync_to(test_ptr(1)).await;

    // Non-deterministic fatal error located at block 3, i.e. ahead of the current head.
    let injected_error = SubgraphError {
        subgraph_id: ctx.deployment.hash.clone(),
        message: "injected transient error".to_string(),
        block_ptr: Some(test_ptr(3)),
        handler: None,
        deterministic: false,
    };

    let writable = ctx
        .store
        .clone()
        .writable(ctx.logger.clone(), ctx.deployment.id, Arc::new(vec![]))
        .await
        .unwrap();
    writable.fail_subgraph(injected_error).await.unwrap();
    writable.flush().await.unwrap();

    // Precondition: the deployment is failed, and the failure is non-deterministic.
    let status_before = ctx.indexing_status().await;
    assert!(status_before.health == SubgraphHealth::Failed);
    let recorded_error = status_before.fatal_error.as_ref().unwrap();
    assert!(!recorded_error.deterministic);

    // Drive the runner up to block 3; unfail is expected to keep retrying while
    // the head is behind the error block and to succeed once it gets there.
    let _finished_runner = ctx.runner(test_ptr(3)).await.run_for_test(false).await?;

    // Postcondition: the deployment is healthy again and carries no fatal error.
    let status_after = ctx.indexing_status().await;
    assert!(status_after.health == SubgraphHealth::Healthy);
    assert!(status_after.fatal_error.is_none());

    Ok(())
}
Loading