From 3f2fe14b96732e67e4684f9de8426d1b9c8906c4 Mon Sep 17 00:00:00 2001 From: Yongzao <532741407@qq.com> Date: Thu, 2 Jul 2026 20:27:09 +0800 Subject: [PATCH] Fix region-group cleanup: recover-safe submission + retry that actually re-runs the delete Follow-up to #18033. The submit-and-return region-group cleanup had two recovery/correctness defects, both found by adversarial review: 1. The `!isStateDeserialized()` guard around submitting the cleanup RemoveRegionGroupProcedures (in DeleteDatabaseProcedure and CreateRegionGroupsProcedure) could permanently skip the submission. The executor persists a procedure AT a state before that state's body runs, so a leader switch/restart landing on that state means the submission has not happened yet -- the guard then skips it forever while the next state drops the partition table, orphaning every region's peers/data on disk with no record. Removed the guard: always submit. Re-submitting on recovery is safe (each RemoveRegionGroupProcedure gets a fresh procId and does an idempotent delete), whereas skipping leaks. 2. RemoveRegionGroupProcedure's "retry until deleted" was a no-op after any DataNode-side FAIL. The DataNode dedups delete tasks by taskId (which was getProcId()) and caches the terminal result forever, so every same-taskId retry returned the cached FAIL without ever re-running the delete -- an infinite silent loop with the region data already orphaned. Each genuine re-attempt now uses a fresh DataNode-side taskId so the DataNode actually re-executes the idempotent delete. The taskId is derived deterministically from the (replicated) procId and a persisted monotonic sequence, packed into the negative i64 space: negative taskIds are disjoint from all procIds (>= 0), which add/remove-peer tasks use as their taskId against the same DataNode task map, so there is no collision; and because it depends only on replicated/persisted state it is stable and never regresses across a leader change. The in-flight taskId is persisted so a leader change re-polls the same task instead of double-submitting. RemoveRegionGroupProcedure now retries a replica indefinitely (a group delete must not leave data on disk) instead of failing after a fixed budget. --- .../confignode/i18n/ProcedureMessages.java | 2 +- .../confignode/i18n/ProcedureMessages.java | 2 +- .../region/CreateRegionGroupsProcedure.java | 28 ++- .../region/RemoveRegionGroupProcedure.java | 180 ++++++++++++++---- .../impl/schema/DeleteDatabaseProcedure.java | 27 ++- .../RemoveRegionGroupProcedureTest.java | 28 +++ 6 files changed, 218 insertions(+), 49 deletions(-) diff --git a/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java b/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java index 4d3cea6f8ebf..87e18f5ba906 100644 --- a/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java +++ b/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java @@ -623,7 +623,7 @@ public final class ProcedureMessages { public static final String PID_REMOVEREGIONGROUP_STATE_FAILED = "[pid{}][RemoveRegionGroup] state {} failed"; public static final String PID_REMOVEREGIONGROUP_DELETE_REPLICA_FAILED = - "[pid{}][RemoveRegionGroup] failed to delete a replica of region {} (attempt {}/{}), will retry. reason: {}"; + "[pid{}][RemoveRegionGroup] failed to delete a replica of region {} (attempt {}), will keep retrying until it is deleted. reason: {}"; public static final String PID_REMOVEREGIONGROUP_SUCCESS_PROCEDURE_TOOK = "[pid{}][RemoveRegionGroup] success, region group {} has been deleted. Procedure took {} (started at {})."; public static final String PID_MIGRATEREGION_STARTED_WILL_BE_MIGRATED_FROM_DATANODE_TO = diff --git a/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java b/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java index 9602839fae94..1cb059d6c72f 100644 --- a/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java +++ b/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java @@ -621,7 +621,7 @@ public final class ProcedureMessages { public static final String PID_REMOVEREGIONGROUP_STATE_FAILED = "[pid{}][RemoveRegionGroup] 状态 {} 失败"; public static final String PID_REMOVEREGIONGROUP_DELETE_REPLICA_FAILED = - "[pid{}][RemoveRegionGroup] 删除 region {} 的一个副本失败(第 {}/{} 次尝试),将重试。原因:{}"; + "[pid{}][RemoveRegionGroup] 删除 region {} 的一个副本失败(第 {} 次尝试),将持续重试直到删除成功。原因:{}"; public static final String PID_REMOVEREGIONGROUP_SUCCESS_PROCEDURE_TOOK = "[pid{}][RemoveRegionGroup] 成功,region group {} 已删除。过程耗时 {}(开始于 {})。"; public static final String PID_MIGRATEREGION_STARTED_WILL_BE_MIGRATED_FROM_DATANODE_TO = diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/CreateRegionGroupsProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/CreateRegionGroupsProcedure.java index cb26afc97dd0..276cdf432d9e 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/CreateRegionGroupsProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/CreateRegionGroupsProcedure.java @@ -108,8 +108,11 @@ protected Flow executeFromState( case SHUNT_REGION_REPLICAS: persistPlan = new CreateRegionGroupsPlan(); final OfferRegionMaintainTasksPlan offerPlan = new OfferRegionMaintainTasksPlan(); - // RegionGroups that failed to reach a serving quorum are removed via a child - // RemoveRegionGroupProcedure, which deletes every replica that did get created. + // RegionGroups that failed to reach a serving quorum have their redundant (already-created) + // replicas removed via an independent root RemoveRegionGroupProcedure. Submitting them as + // root procedures (instead of children) keeps this procedure from waiting for or being + // failed by the cleanup: each one retries forever until those replicas are deleted, while + // this procedure proceeds to activate the region groups that did form a quorum. final List removeRegionGroupProcedures = new ArrayList<>(); // Filter those RegionGroups that created successfully createRegionGroupsPlan @@ -197,7 +200,26 @@ protected Flow executeFromState( LOGGER.warn( ConfigNodeMessages.FAILED_IN_THE_WRITE_API_EXECUTING_THE_CONSENSUS_LAYER_DUE, e); } - removeRegionGroupProcedures.forEach(this::addChildProcedure); + // Submit the redundant-replica cleanups as independent root procedures. This is + // intentionally NOT guarded by isStateDeserialized(): the executor persists a procedure at + // a state BEFORE that state's body has run (it advances the state on the previous cycle, + // then may stop at the inter-state boundary on a leader switch — see + // ProcedureExecutor#executeProcedure), so a recovery that lands on SHUNT_REGION_REPLICAS + // means the submissions have NOT happened yet. Skipping them would leave the + // already-created + // replicas of sub-quorum region groups on disk with no cleanup and no partition-table + // record + // (the else branch above never persisted them). Re-submitting on recovery is safe instead: + // the cleanups are recomputed from the serialized failedRegionReplicaSets, each gets a + // fresh + // procId and performs an idempotent delete, so a duplicate is harmless whereas a skip + // leaks. + removeRegionGroupProcedures.forEach( + removeRegionGroupProcedure -> + env.getConfigManager() + .getProcedureManager() + .getExecutor() + .submitProcedure(removeRegionGroupProcedure)); setNextState(CreateRegionGroupsState.REBALANCE_DATA_PARTITION_POLICY); break; case REBALANCE_DATA_PARTITION_POLICY: diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/RemoveRegionGroupProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/RemoveRegionGroupProcedure.java index 2dd2e0d6dd31..d417099e97b3 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/RemoveRegionGroupProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/RemoveRegionGroupProcedure.java @@ -59,11 +59,21 @@ * already-absent peer, so it works for a group of any size — including a sub-quorum group that * never finished forming. The DataNode runs the deletion asynchronously and this procedure polls * for the result, so a slow deletion is never wrongly reported as finished. + * + *

This procedure is submitted as an independent root procedure (not a child) by its callers, + * which only enqueue the deletion and return immediately. It therefore owns the deletion end to + * end: on any failure it retries the current replica forever (backing off between attempts) instead + * of giving up, because there is no parent left to fall back to and the region's peer/data must not + * be left on disk. Each genuine re-attempt uses a FRESH DataNode-side taskId (the DataNode dedups + * by taskId and caches a terminal result forever, so reusing one taskId would make every retry a + * no-op that never re-runs the delete); the in-flight taskId is persisted so a leader change + * re-polls the same task rather than double-submitting. It carries its own {@link + * TRegionReplicaSet} copy, so it can finish even after the caller has dropped the partition table, + * and it survives ConfigNode leader change / restart. */ public class RemoveRegionGroupProcedure extends RegionOperationProcedure { private static final Logger LOGGER = LoggerFactory.getLogger(RemoveRegionGroupProcedure.class); - private static final int MAX_DELETE_REPLICA_RETRY = 3; private static final long DELETE_REPLICA_RETRY_INTERVAL_MS = 5_000; private TRegionReplicaSet regionReplicaSet; @@ -75,10 +85,32 @@ public class RemoveRegionGroupProcedure extends RegionOperationProcedure negative) + PROC_ID_BITS + SEQ_BITS must be <= 64. + private static final int SEQ_BITS = 20; + private static final int PROC_ID_BITS = 43; + public RemoveRegionGroupProcedure() { super(); } @@ -93,15 +125,27 @@ void setCurrentReplicaIndex(int currentReplicaIndex) { this.currentReplicaIndex = currentReplicaIndex; } + @TestOnly + void setDeleteTaskState(long deleteTaskSeq, boolean deleteTaskSubmitted) { + this.deleteTaskSeq = deleteTaskSeq; + this.deleteTaskSubmitted = deleteTaskSubmitted; + } + + @TestOnly + long deleteTaskIdForTest() { + return deleteTaskId(); + } + @Override protected Flow executeFromState(ConfigNodeProcedureEnv env, RemoveRegionGroupState state) throws InterruptedException { final List dataNodeLocations = regionReplicaSet == null ? null : regionReplicaSet.getDataNodeLocations(); if (dataNodeLocations == null) { - // A null replica set means deserialization failed; fail loudly instead of silently reporting - // the group as deleted, otherwise the parent would drop the partition table while the region - // data is still on disk. + // A null replica set means deserialization failed. Retrying cannot recover the lost + // locations, + // so fail loudly instead of silently reporting the group as deleted (which would leave the + // region's peer/data on disk with no record of where it lives). setFailure( new ProcedureException(ProcedureMessages.UNSUPPORTED_STATE + "missing regionReplicaSet")); return Flow.NO_MORE_STATE; @@ -137,26 +181,37 @@ protected Flow executeFromState(ConfigNodeProcedureEnv env, RemoveRegionGroupSta regionId, simplifiedLocation(targetDataNode)); - // deleteLocalPeer is idempotent (it tolerates an already-absent peer) and the DataNode - // dedups by taskId, so re-submitting after a leader change or a retry is safe. + // Start a fresh attempt (fresh taskId) unless we are resuming an already-submitted one + // after + // a leader change, in which case we re-poll the SAME task rather than submitting a + // duplicate. + if (!deleteTaskSubmitted) { + deleteTaskSeq++; + deleteTaskSubmitted = true; + } + final long deleteTaskId = deleteTaskId(); + + // deleteLocalPeer is idempotent (it tolerates an already-absent peer), and re-submitting + // the + // same taskId re-polls the same DataNode task, so resuming after a leader change is safe. final TSStatus submitStatus; final TRegionMigrateResult result; try { submitStatus = - handler.submitDeleteOldRegionPeerTask(getProcId(), targetDataNode, regionId); + handler.submitDeleteOldRegionPeerTask(deleteTaskId, targetDataNode, regionId); setKillPoint(state); if (submitStatus.getCode() != SUCCESS_STATUS.getStatusCode()) { - return retryCurrentReplicaOrFail( + return retryCurrentReplica( String.format( "submit delete task for region %s to DataNode %s failed: %s", regionId, simplifiedLocation(targetDataNode), submitStatus)); } - result = handler.waitTaskFinish(getProcId(), targetDataNode); + result = handler.waitTaskFinish(deleteTaskId, targetDataNode); } catch (InterruptedException e) { throw e; } catch (Exception e) { LOGGER.error(ProcedureMessages.PID_REMOVEREGIONGROUP_STATE_FAILED, getProcId(), state, e); - return retryCurrentReplicaOrFail( + return retryCurrentReplica( String.format( "delete region %s from DataNode %s threw %s", regionId, simplifiedLocation(targetDataNode), e)); @@ -164,23 +219,24 @@ protected Flow executeFromState(ConfigNodeProcedureEnv env, RemoveRegionGroupSta switch (result.getTaskStatus()) { case SUCCESS: - // Advance to the next replica with a fresh retry budget. + // Advance to the next replica with a fresh retry counter and a fresh delete task. currentReplicaIndex++; attemptedForCurrentReplica = 0; + deleteTaskSubmitted = false; setNextState(RemoveRegionGroupState.DELETE_REGION_REPLICAS); return Flow.HAS_MORE_STATE; case PROCESSING: // waitTaskFinish() only returns PROCESSING when its polling loop was interrupted, i.e. // this ConfigNode is shutting down / losing leadership. The delete task is still - // running - // on the DataNode, so persist and re-poll after recovery: stay on this replica without - // advancing it and without consuming a retry attempt. + // running on the DataNode, so persist and re-poll after recovery: stay on this replica + // without advancing it, without consuming a retry attempt, and keeping deleteTaskSeq / + // deleteTaskSubmitted so the re-poll targets the same in-flight task. setNextState(RemoveRegionGroupState.DELETE_REGION_REPLICAS); return Flow.HAS_MORE_STATE; case TASK_NOT_EXIST: case FAIL: default: - return retryCurrentReplicaOrFail( + return retryCurrentReplica( String.format( "delete region %s from DataNode %s, task status is %s", regionId, simplifiedLocation(targetDataNode), result.getTaskStatus())); @@ -192,30 +248,57 @@ protected Flow executeFromState(ConfigNodeProcedureEnv env, RemoveRegionGroupSta } /** - * Retry the replica at {@link #currentReplicaIndex} after a backoff, or fail the whole procedure - * once the per-replica retry budget is exhausted. Failing (rather than skipping the replica) - * keeps the parent from dropping the partition table while a region's peer/data is still on disk. + * Retry the replica at {@link #currentReplicaIndex} after a backoff. This procedure never gives + * up on a replica: because it is submitted as an independent root procedure, there is no parent + * to fall back to, and skipping or failing would leave the region's peer/data on disk. So it + * backs off and re-runs the same state until the replica is deleted, which eventually succeeds + * once the target DataNode is reachable: the delete is idempotent, and clearing {@link + * #deleteTaskSubmitted} here makes the next attempt use a FRESH DataNode-side taskId (a new + * {@link #deleteTaskSeq}), so the DataNode actually re-executes the delete instead of returning a + * cached terminal result for the previous taskId. */ - private Flow retryCurrentReplicaOrFail(String reason) throws InterruptedException { + private Flow retryCurrentReplica(String reason) throws InterruptedException { attemptedForCurrentReplica++; - if (attemptedForCurrentReplica <= MAX_DELETE_REPLICA_RETRY) { - LOGGER.warn( - ProcedureMessages.PID_REMOVEREGIONGROUP_DELETE_REPLICA_FAILED, - getProcId(), - regionId, - attemptedForCurrentReplica, - MAX_DELETE_REPLICA_RETRY + 1, - reason); - Thread.sleep(DELETE_REPLICA_RETRY_INTERVAL_MS); - setNextState(RemoveRegionGroupState.DELETE_REGION_REPLICAS); - return Flow.HAS_MORE_STATE; + LOGGER.warn( + ProcedureMessages.PID_REMOVEREGIONGROUP_DELETE_REPLICA_FAILED, + getProcId(), + regionId, + attemptedForCurrentReplica, + reason); + // Force a fresh delete task on the next attempt so the DataNode re-runs the delete rather than + // replaying a cached FAIL/SUCCESS for this taskId. + deleteTaskSubmitted = false; + Thread.sleep(DELETE_REPLICA_RETRY_INTERVAL_MS); + setNextState(RemoveRegionGroupState.DELETE_REGION_REPLICAS); + return Flow.HAS_MORE_STATE; + } + + /** + * The DataNode-side taskId for the current attempt, derived from this procedure's (globally + * unique, consensus-replicated) procId and its monotonic {@link #deleteTaskSeq}. It is packed + * into the NEGATIVE i64 space, which is disjoint from every real procId (all {@code >= 0}); other + * region-maintain procedures (add/remove peer) use {@code getProcId()} directly as the taskId + * against the same DataNode task map, so a negative id can never collide with theirs. Unlike + * minting from the procedure-store id allocator, this needs nothing extra replicated: procId is + * already replicated and deleteTaskSeq is persisted with this procedure, so the taskId is stable + * across a leader change and never regresses. + * + *

Layout: sign bit set (=> negative) | {@value PROC_ID_BITS} bits of procId | {@value + * SEQ_BITS} bits of deleteTaskSeq. The bounds are astronomically beyond any real cluster (a + * procId needs 2^43 procedures; a single group delete needs 2^20 retries), and are asserted + * rather than silently wrapped so a violation fails the procedure loudly instead of emitting a + * colliding id. + */ + private long deleteTaskId() { + final long procId = getProcId(); + if (procId < 0 || procId >= (1L << PROC_ID_BITS) || deleteTaskSeq >= (1L << SEQ_BITS)) { + throw new IllegalStateException( + String.format( + "cannot derive a collision-free delete taskId: procId=%d, deleteTaskSeq=%d exceed the " + + "%d/%d-bit budget", + procId, deleteTaskSeq, PROC_ID_BITS, SEQ_BITS)); } - setFailure( - new ProcedureException( - String.format( - "[pid%d][RemoveRegionGroup] gave up after %d attempts: %s", - getProcId(), attemptedForCurrentReplica, reason))); - return Flow.NO_MORE_STATE; + return Long.MIN_VALUE | (procId << SEQ_BITS) | deleteTaskSeq; } @Override @@ -243,6 +326,11 @@ public void serialize(DataOutputStream stream) throws IOException { super.serialize(stream); ThriftCommonsSerDeUtils.serializeTRegionReplicaSet(regionReplicaSet, stream); ReadWriteIOUtils.write(currentReplicaIndex, stream); + // Persist the delete-task cursor so a leader change re-derives the SAME in-flight taskId and + // re-polls it (deleteTaskSubmitted == true) instead of submitting a duplicate, and so the + // monotonic deleteTaskSeq never regresses. + ReadWriteIOUtils.write(deleteTaskSeq, stream); + ReadWriteIOUtils.write(deleteTaskSubmitted, stream); } @Override @@ -252,6 +340,14 @@ public void deserialize(ByteBuffer byteBuffer) { regionReplicaSet = ThriftCommonsSerDeUtils.deserializeTRegionReplicaSet(byteBuffer); regionId = regionReplicaSet.getRegionId(); currentReplicaIndex = ReadWriteIOUtils.readInt(byteBuffer); + // deleteTaskSeq/deleteTaskSubmitted were appended after the first version of this procedure. + // That first version only ever existed on the unreleased branch that added this procedure + // (never in a release), but a dev/CI cluster could persist a blob without these trailing + // fields; tolerate it by defaulting to "no in-flight task" instead of reading past the end. + if (byteBuffer.hasRemaining()) { + deleteTaskSeq = ReadWriteIOUtils.readLong(byteBuffer); + deleteTaskSubmitted = ReadWriteIOUtils.readBool(byteBuffer); + } } catch (ThriftSerDeException e) { LOGGER.error(ProcedureMessages.ERROR_IN_DESERIALIZE, this.getClass(), e); } @@ -264,12 +360,14 @@ public boolean equals(Object obj) { } RemoveRegionGroupProcedure procedure = (RemoveRegionGroupProcedure) obj; return this.currentReplicaIndex == procedure.currentReplicaIndex + && this.deleteTaskSeq == procedure.deleteTaskSeq + && this.deleteTaskSubmitted == procedure.deleteTaskSubmitted && Objects.equals(this.regionReplicaSet, procedure.regionReplicaSet); } @Override public int hashCode() { - return Objects.hash(regionReplicaSet, currentReplicaIndex); + return Objects.hash(regionReplicaSet, currentReplicaIndex, deleteTaskSeq, deleteTaskSubmitted); } @Override @@ -279,6 +377,10 @@ public String toString() { + regionReplicaSet + ", currentReplicaIndex=" + currentReplicaIndex + + ", deleteTaskSeq=" + + deleteTaskSeq + + ", deleteTaskSubmitted=" + + deleteTaskSubmitted + '}'; } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteDatabaseProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteDatabaseProcedure.java index 4b979105c2fb..e40bcfbc24cb 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteDatabaseProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteDatabaseProcedure.java @@ -103,10 +103,24 @@ protected Flow executeFromState(final ConfigNodeProcedureEnv env, final DeleteDa "[DeleteDatabaseProcedure] Delete DatabaseSchema: {}", deleteDatabaseSchema.getName()); - // Delete every region group (both schema and data regions) of this database via a - // RemoveRegionGroupProcedure child. The DatabasePartitionTable (handled in the next - // state) is only removed once these children have finished, so a slow region deletion is - // always completed before the coordinator forgets about it. + // Enqueue deletion of every region group (both schema and data regions) of this database. + // Each is submitted as an INDEPENDENT root RemoveRegionGroupProcedure rather than a + // child: + // this procedure only submits the deletions and then returns, so it can neither wait for + // nor be failed/rolled-back by a slow or failing region deletion. Each carries its own + // copy of the replica set, so the deletion still completes (and survives leader change / + // restart) even after the next state drops the partition table. + // + // Submission is intentionally NOT guarded by isStateDeserialized(): the executor persists + // a procedure at a state BEFORE that state's body has run (it advances the state on the + // previous cycle, then may stop at the inter-state boundary on a leader switch — see + // ProcedureExecutor#executeProcedure). So a recovery that lands on this state means the + // submission has NOT happened yet; skipping it would drop every region group's cleanup + // while the next state still drops the partition table, orphaning the region peers/data + // on + // disk with no record of where they live. Re-submitting on recovery is safe instead: + // every RemoveRegionGroupProcedure gets a fresh procId and performs an idempotent delete, + // so a duplicate is harmless whereas a skip leaks data. final List regionReplicaSets = env.getAllReplicaSets(deleteDatabaseSchema.getName()); regionReplicaSets.forEach( @@ -115,7 +129,10 @@ protected Flow executeFromState(final ConfigNodeProcedureEnv env, final DeleteDa env.getConfigManager() .getLoadManager() .removeRegionGroupRelatedCache(regionReplicaSet.getRegionId()); - addChildProcedure(new RemoveRegionGroupProcedure(regionReplicaSet)); + env.getConfigManager() + .getProcedureManager() + .getExecutor() + .submitProcedure(new RemoveRegionGroupProcedure(regionReplicaSet)); }); setNextState(DeleteDatabaseState.DELETE_DATABASE_CONFIG); break; diff --git a/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/procedure/impl/region/RemoveRegionGroupProcedureTest.java b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/procedure/impl/region/RemoveRegionGroupProcedureTest.java index 3ca6efaf0375..e239d64c12aa 100644 --- a/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/procedure/impl/region/RemoveRegionGroupProcedureTest.java +++ b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/procedure/impl/region/RemoveRegionGroupProcedureTest.java @@ -59,6 +59,9 @@ public void serDeTest() throws Exception { // A non-zero cursor so the round-trip actually exercises currentReplicaIndex (de)serialization; // equals/hashCode include it, so a dropped/garbled cursor would fail the assertion. procedure.setCurrentReplicaIndex(1); + // Non-default delete-task cursor so the round-trip exercises deleteTaskSeq/deleteTaskSubmitted + // too; equals/hashCode include them, so a dropped/garbled value would fail the assertion. + procedure.setDeleteTaskState(42L, true); try (PublicBAOS byteArrayOutputStream = new PublicBAOS(); DataOutputStream outputStream = new DataOutputStream(byteArrayOutputStream)) { procedure.serialize(outputStream); @@ -69,4 +72,29 @@ public void serDeTest() throws Exception { Assert.assertEquals(procedure, ProcedureFactory.getInstance().create(buffer)); } } + + @Test + public void deleteTaskIdIsNegativeAndUnique() { + // The DataNode taskResultMap is keyed only by taskId and is shared with add/remove-peer tasks, + // which use a procedure's (non-negative) procId directly as the taskId. So a delete taskId must + // be strictly negative (disjoint from every procId) and distinct per (procId, deleteTaskSeq), + // otherwise a later peer op could be silently deduped against a lingering delete-task entry. + final TRegionReplicaSet regionReplicaSet = + new TRegionReplicaSet( + new TConsensusGroupId(TConsensusGroupType.DataRegion, 1), + Arrays.asList(new TDataNodeLocation())); + final java.util.Set seen = new java.util.HashSet<>(); + for (long procId : new long[] {0L, 1L, 100L, 1L << 20, (1L << 43) - 1}) { + for (long seq : new long[] {1L, 2L, 100L, (1L << 20) - 1}) { + final RemoveRegionGroupProcedure procedure = + new RemoveRegionGroupProcedure(regionReplicaSet); + procedure.setProcId(procId); + procedure.setDeleteTaskState(seq, true); + final long taskId = procedure.deleteTaskIdForTest(); + Assert.assertTrue("taskId must be negative: " + taskId, taskId < 0); + Assert.assertTrue( + "taskId must be unique for (" + procId + "," + seq + ")", seen.add(taskId)); + } + } + } }