diff --git a/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppCommonConfig.java b/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppCommonConfig.java index bd2fb84e0b600..ec67e5f451bd6 100644 --- a/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppCommonConfig.java +++ b/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppCommonConfig.java @@ -85,6 +85,12 @@ public CommonConfig setMemtableSizeThreshold(long memtableSizeThreshold) { return this; } + @Override + public CommonConfig setMetadataLeaseFenceMs(long metadataLeaseFenceMs) { + setProperty("metadata_lease_fence_ms", String.valueOf(metadataLeaseFenceMs)); + return this; + } + @Override public CommonConfig setPartitionInterval(long partitionInterval) { setProperty("time_partition_interval", String.valueOf(partitionInterval)); diff --git a/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppSharedCommonConfig.java b/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppSharedCommonConfig.java index b1c2a4f8d6be5..48544b901d4a5 100644 --- a/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppSharedCommonConfig.java +++ b/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppSharedCommonConfig.java @@ -61,6 +61,13 @@ public CommonConfig setMemtableSizeThreshold(long memtableSizeThreshold) { return this; } + @Override + public CommonConfig setMetadataLeaseFenceMs(long metadataLeaseFenceMs) { + cnConfig.setMetadataLeaseFenceMs(metadataLeaseFenceMs); + dnConfig.setMetadataLeaseFenceMs(metadataLeaseFenceMs); + return this; + } + @Override public CommonConfig setPartitionInterval(long partitionInterval) { cnConfig.setPartitionInterval(partitionInterval); diff --git a/integration-test/src/main/java/org/apache/iotdb/it/env/remote/config/RemoteCommonConfig.java b/integration-test/src/main/java/org/apache/iotdb/it/env/remote/config/RemoteCommonConfig.java 
index ba1f7106dd647..752dcd009db0a 100644 --- a/integration-test/src/main/java/org/apache/iotdb/it/env/remote/config/RemoteCommonConfig.java +++ b/integration-test/src/main/java/org/apache/iotdb/it/env/remote/config/RemoteCommonConfig.java @@ -44,6 +44,11 @@ public CommonConfig setMemtableSizeThreshold(long memtableSizeThreshold) { return this; } + @Override + public CommonConfig setMetadataLeaseFenceMs(long metadataLeaseFenceMs) { + return this; + } + @Override public CommonConfig setPartitionInterval(long partitionInterval) { return this; diff --git a/integration-test/src/main/java/org/apache/iotdb/itbase/env/CommonConfig.java b/integration-test/src/main/java/org/apache/iotdb/itbase/env/CommonConfig.java index 5a7a004fa88a2..0ad3c23af16fe 100644 --- a/integration-test/src/main/java/org/apache/iotdb/itbase/env/CommonConfig.java +++ b/integration-test/src/main/java/org/apache/iotdb/itbase/env/CommonConfig.java @@ -32,6 +32,8 @@ public interface CommonConfig { CommonConfig setMemtableSizeThreshold(long memtableSizeThreshold); + CommonConfig setMetadataLeaseFenceMs(long metadataLeaseFenceMs); + CommonConfig setPartitionInterval(long partitionInterval); CommonConfig setCompressor(String compressor); diff --git a/integration-test/src/test/java/org/apache/iotdb/relational/it/schema/IoTDBTableDDLHAIT.java b/integration-test/src/test/java/org/apache/iotdb/relational/it/schema/IoTDBTableDDLHAIT.java new file mode 100644 index 0000000000000..de0092352274f --- /dev/null +++ b/integration-test/src/test/java/org/apache/iotdb/relational/it/schema/IoTDBTableDDLHAIT.java @@ -0,0 +1,241 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.relational.it.schema; + +import org.apache.iotdb.consensus.ConsensusFactory; +import org.apache.iotdb.it.env.EnvFactory; +import org.apache.iotdb.it.env.cluster.node.DataNodeWrapper; +import org.apache.iotdb.it.framework.IoTDBTestRunner; +import org.apache.iotdb.itbase.category.TableClusterIT; +import org.apache.iotdb.itbase.env.BaseEnv; + +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.runner.RunWith; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.Connection; +import java.sql.ResultSet; +import java.sql.Statement; +import java.util.concurrent.Callable; + +import static org.junit.Assert.assertTrue; + +/** + * Verifies that table-model DDL statements still take effect while one of the three DataNodes + * is stopped. + */ +@RunWith(IoTDBTestRunner.class) +@Category({TableClusterIT.class}) +public class IoTDBTableDDLHAIT { + + private static final Logger LOGGER = LoggerFactory.getLogger(IoTDBTableDDLHAIT.class); + + @BeforeClass + public static void setUp() throws Exception { + // Pin the fence threshold explicitly (20s, the default): the ConfigNode treats the stopped + // DataNode as self-fenced after T_proceed = fence + ~5s internal margin. Live DataNodes keep + // heartbeating (~1s), so they do not spuriously fence. + // Use 3 replicas so metadata/data-region operations such as DELETE DEVICES can still succeed + // after one DataNode is stopped.
+ EnvFactory.getEnv() + .getConfig() + .getCommonConfig() + .setMetadataLeaseFenceMs(20000) // default value + .setConfigNodeConsensusProtocolClass(ConsensusFactory.RATIS_CONSENSUS) + .setSchemaRegionConsensusProtocolClass(ConsensusFactory.RATIS_CONSENSUS) + .setDataRegionConsensusProtocolClass(ConsensusFactory.IOT_CONSENSUS) + .setSchemaReplicationFactor(3) + .setDataReplicationFactor(3); + EnvFactory.getEnv().initClusterEnvironment(1, 3); + } + + @AfterClass + public static void tearDown() throws Exception { + EnvFactory.getEnv().cleanClusterEnvironment(); + } + + @Test + public void tableDdlSucceedsWhileOneDataNodeIsDown() throws Exception { + final String databaseName = "test_table_ddl_ha"; + final String tableName = "table_ddl_ha"; + final String createdAfterDownTableName = "table_ddl_ha_created_after_down"; + final DataNodeWrapper liveDataNode = EnvFactory.getEnv().getDataNodeWrapper(0); + final DataNodeWrapper victimDataNode = EnvFactory.getEnv().getDataNodeWrapper(2); + + // Pin the connection to a DataNode we will keep alive, so stopping the victim cannot break it. + try (final Connection connection = + EnvFactory.getEnv() + .getConnection(liveDataNode, "root", "root", BaseEnv.TABLE_SQL_DIALECT); + final Statement statement = connection.createStatement()) { + statement.execute("CREATE DATABASE " + databaseName); + statement.execute("USE " + databaseName); + statement.execute("CREATE TABLE " + tableName + " (dev STRING TAG, s1 INT32 FIELD)"); + statement.execute( + "INSERT INTO " + + tableName + + "(time, dev, s1) VALUES(1, 'dev01', 1), (2, 'dev02', 2), (3, 'dev03', 3)"); + + // ready for the drop database + statement.execute("CREATE TABLE TABLE1 (dev STRING TAG, s1 INT32 FIELD)"); + statement.execute( + "INSERT INTO TABLE1 (time, dev, s1) VALUES(1, 'dev01', 1), (2, 'dev02', 2), (3, 'dev03', 3)"); + // Take one DataNode down.
Its last successful ConfigNode contact is now frozen; after + // T_proceed the ConfigNode can treat it as self-fenced and stop waiting for its ack. + victimDataNode.stop(); + Assert.assertFalse("victim DataNode should be stopped", victimDataNode.isAlive()); + + // The DDL broadcast can no longer reach the stopped DataNode. Previously this hard-failed; + // now it must still succeed (after blocking ~T_proceed while the fence is proven). + LOGGER.info("0. start to test high availability of creating table procedure"); + assertStatementEffect( + statement, + "CREATE TABLE " + + createdAfterDownTableName + + " (region STRING TAG, temperature FLOAT FIELD)", + () -> tableExists(statement, createdAfterDownTableName), + "CREATE TABLE must succeed with one DataNode down"); + + LOGGER.info("1. start to test high availability of adding column procedure"); + assertStatementEffect( + statement, + "ALTER TABLE " + tableName + " ADD COLUMN s2 INT32 FIELD", + () -> columnHasType(statement, tableName, "s2", "INT32"), + "ADD COLUMN must succeed with one DataNode down"); + + LOGGER.info("2. start to test high availability of altering column type procedure"); + assertStatementEffect( + statement, + "ALTER TABLE " + tableName + " ALTER COLUMN s2 SET DATA TYPE INT64", + () -> columnHasType(statement, tableName, "s2", "INT64"), + "ALTER COLUMN TYPE must succeed with one DataNode down"); + + LOGGER.info("3. start to test high availability of altering table ttl procedure"); + assertStatementEffect( + statement, + "ALTER TABLE " + tableName + " SET PROPERTIES ttl = 864000", + () -> tableHasTtl(statement, tableName, "864000"), + "ALTER TABLE TTL must succeed with one DataNode down"); + + LOGGER.info("4.
start to test high availability of resetting table ttl procedure"); + assertStatementEffect( + statement, + "ALTER TABLE " + tableName + " SET PROPERTIES ttl = 'INF'", + () -> tableHasTtl(statement, tableName, "INF"), + "ALTER TABLE TTL reset must succeed with one DataNode down"); + + LOGGER.info("5. start to test high availability of deleting devices procedure"); + assertStatementEffect( + statement, + "DELETE DEVICES FROM " + tableName + " WHERE dev = 'dev02'", + () -> !deviceExists(statement, tableName, "dev02"), + "DELETE DEVICES must succeed with one DataNode down"); + + LOGGER.info("6. start to test high availability of dropping table procedure"); + assertStatementEffect( + statement, + "DROP TABLE " + tableName, + () -> !tableExists(statement, tableName), + "DROP TABLE must succeed with one DataNode down"); + + LOGGER.info("7. start to test high availability of dropping database procedure"); + assertStatementEffect( + statement, + "DROP DATABASE " + databaseName, + () -> !databaseExists(statement, databaseName), + "DROP DATABASE must succeed with one DataNode down"); + } + } + + /** Executes {@code sql} and then asserts that {@code effect} evaluates to true. 
*/ + private void assertStatementEffect( + final Statement statement, + final String sql, + final Callable<Boolean> effect, + final String message) + throws Exception { + statement.execute(sql); + assertTrue(message, effect.call()); + } + + private boolean tableExists(final Statement statement, final String tableName) throws Exception { + try (final ResultSet resultSet = statement.executeQuery("SHOW TABLES")) { + while (resultSet.next()) { + if (tableName.equalsIgnoreCase(resultSet.getString(1))) { + return true; + } + } + } + return false; + } + + private boolean columnHasType( + final Statement statement, + final String tableName, + final String columnName, + final String dataType) + throws Exception { + try (final ResultSet resultSet = statement.executeQuery("DESCRIBE " + tableName)) { + while (resultSet.next()) { + if (columnName.equalsIgnoreCase(resultSet.getString(1))) { + return
dataType.equalsIgnoreCase(resultSet.getString(2)); + } + } + } + return false; + } + + private boolean tableHasTtl(final Statement statement, final String tableName, final String ttl) + throws Exception { + try (final ResultSet resultSet = statement.executeQuery("SHOW TABLES")) { + while (resultSet.next()) { + if (tableName.equalsIgnoreCase(resultSet.getString(1))) { + return ttl.equalsIgnoreCase(resultSet.getString(2)); + } + } + } + return false; + } + + private boolean deviceExists( + final Statement statement, final String tableName, final String device) throws Exception { + try (final ResultSet resultSet = + statement.executeQuery( + "SHOW DEVICES FROM " + tableName + " WHERE dev = '" + device + "'")) { + return resultSet.next(); + } + } + + private boolean databaseExists(final Statement statement, final String databaseName) + throws Exception { + try (final ResultSet resultSet = statement.executeQuery("SHOW DATABASES")) { + while (resultSet.next()) { + if (databaseName.equalsIgnoreCase(resultSet.getString(1))) { + return true; + } + } + } + return false; + } +} diff --git a/iotdb-client/service-rpc/src/main/java/org/apache/iotdb/rpc/TSStatusCode.java b/iotdb-client/service-rpc/src/main/java/org/apache/iotdb/rpc/TSStatusCode.java index 95327bc59db97..384f38028e84a 100644 --- a/iotdb-client/service-rpc/src/main/java/org/apache/iotdb/rpc/TSStatusCode.java +++ b/iotdb-client/service-rpc/src/main/java/org/apache/iotdb/rpc/TSStatusCode.java @@ -90,6 +90,7 @@ public enum TSStatusCode { TYPE_NOT_FOUND(528), DATABASE_CONFLICT(529), DATABASE_MODEL(530), + METADATA_LEASE_FENCED(531), TABLE_NOT_EXISTS(550), TABLE_ALREADY_EXISTS(551), diff --git a/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java b/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java index 87e18f5ba906d..f0263b376215a 100644 ---
a/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java +++ b/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java @@ -435,6 +435,7 @@ public final class ProcedureMessages { "Failed to pre-release {} for table {}.{} to DataNode, failure results: {}"; public static final String FAILED_TO_PRE_SET_TEMPLATE_ON_PATH_DUE_TO = "Failed to pre set template {} on path {} due to {}"; + public static final String FAILED_TO_PROVE_DN_IS_FENCED = "Failed to prove DN is fenced"; public static final String FAILED_TO_PUSH_CONSUMER_GROUP_META_TO_DATANODES_DETAILS = "Failed to push consumer group meta to dataNodes, details: %s"; public static final String FAILED_TO_PUSH_PIPE_META_LIST_TO_DATA_NODES_WILL = @@ -510,7 +511,7 @@ public final class ProcedureMessages { public static final String FAILED_TO_SYNC_TABLE_PRE_CREATE_INFO_TO_DATANODE_FAILURE = "Failed to sync table {}.{} pre-create info to DataNode, failure results: {}"; public static final String FAILED_TO_SYNC_TABLE_ROLLBACK_CREATE_INFO_TO_DATANODE_FAILURE = - "Failed to sync table {}.{} rollback-create info to DataNode {}, failure results: "; + "Failed to sync table {}.{} rollback-create info to DataNode, failure results: {}"; public static final String FAILED_TO_SYNC_TEMPLATE_COMMIT_SET_INFO_ON_PATH_TO = "Failed to sync template {} commit-set info on path {} to DataNode {}"; public static final String FAILED_TO_SYNC_TEMPLATE_PRE_SET_INFO_ON_PATH_TO = @@ -575,8 +576,10 @@ public final class ProcedureMessages { "Invalidate view schemaengine cache failed"; public static final String INVALIDATING_CACHE_FOR_COLUMN_IN_WHEN_DROPPING_COLUMN = "Invalidating cache for column {} in {}.{} when dropping column"; - public static final String INVALIDATING_CACHE_FOR_TABLE_WHEN_DROPPING_TABLE = - "Invalidating cache for table {}.{} when dropping table"; + public static final String PRE_RELEASE_DELETE_TABLE_WHEN_DROPPING_TABLE = + "pre release delete table {}.{} 
when dropping table"; + public static final String COMMIT_RELEASE_DELETE_TABLE_WHEN_DROPPING_TABLE = + "commit release delete table {}.{} when dropping table"; public static final String INVALID_DATA_TYPE_CANNOT_BE_USED_AS_A_NEW_TYPE = "Invalid data type cannot be used as a new type"; public static final String IO_ERROR_WHEN_DESERIALIZE_AUTHPLAN = @@ -845,6 +848,8 @@ public final class ProcedureMessages { public static final String ROLLBACK_CREATETABLE_COSTS_MS = "Rollback CreateTable-{} costs {}ms."; public static final String ROLLBACK_CREATE_TABLE_FAILED = "Rollback create table failed"; public static final String ROLLBACK_DROPTABLE_COSTS_MS = "Rollback DropTable-{} costs {}ms."; + public static final String ROLLBACK_PRE_DELETE_TABLE_FAILED = + "Rollback pre-delete table %s.%s failed, please manually drop the table"; public static final String ROLLBACK_PRE_RELEASE = "Rollback pre-release "; public static final String ROLLBACK_PRE_RELEASE_TEMPLATE_FAILED = "Rollback pre release template failed"; diff --git a/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java b/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java index 1cb059d6c72f8..5b8c1a85c7fc4 100644 --- a/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java +++ b/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java @@ -435,6 +435,7 @@ public final class ProcedureMessages { "Failed to pre-release {} for table {}.{} to DataNode, failure results: {}"; public static final String FAILED_TO_PRE_SET_TEMPLATE_ON_PATH_DUE_TO = "Failed to pre set template {} on path {} due to {}"; + public static final String FAILED_TO_PROVE_DN_IS_FENCED = "不能证明一个不可达的DN已经处于隔离状态"; public static final String FAILED_TO_PUSH_CONSUMER_GROUP_META_TO_DATANODES_DETAILS = "Failed to push consumer group meta to dataNodes, details: %s"; public static final String 
FAILED_TO_PUSH_PIPE_META_LIST_TO_DATA_NODES_WILL = diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/CnToDnAsyncRequestType.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/CnToDnAsyncRequestType.java index 2d44c214967f6..cc28fed0ff56e 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/CnToDnAsyncRequestType.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/CnToDnAsyncRequestType.java @@ -45,6 +45,7 @@ public enum CnToDnAsyncRequestType { CHANGE_REGION_LEADER, // Cache + INVALIDATE_PARTITION_CACHE, INVALIDATE_SCHEMA_CACHE, INVALIDATE_LAST_CACHE, CLEAR_CACHE, @@ -121,7 +122,7 @@ public enum CnToDnAsyncRequestType { // Table UPDATE_TABLE, - INVALIDATE_TABLE_CACHE, + PRE_DELETE_TABLE, DELETE_DATA_FOR_DROP_TABLE, DELETE_DEVICES_FOR_DROP_TABLE, INVALIDATE_COLUMN_CACHE, diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/CnToDnInternalServiceAsyncRequestManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/CnToDnInternalServiceAsyncRequestManager.java index 4048016548a16..4c7930240e867 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/CnToDnInternalServiceAsyncRequestManager.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/CnToDnInternalServiceAsyncRequestManager.java @@ -310,6 +310,11 @@ protected void initActionMapBuilder() { (req, client, handler) -> client.fetchSchemaBlackList( (TFetchSchemaBlackListReq) req, (FetchSchemaBlackListRPCHandler) handler)); + actionMapBuilder.put( + CnToDnAsyncRequestType.INVALIDATE_PARTITION_CACHE, + (req, client, handler) -> + client.invalidatePartitionCache( + (TInvalidateCacheReq) req, (DataNodeTSStatusRPCHandler) handler)); actionMapBuilder.put( CnToDnAsyncRequestType.INVALIDATE_SCHEMA_CACHE, (req, client, handler) -> @@ -442,7 
+447,7 @@ protected void initActionMapBuilder() { (req, client, handler) -> client.updateTable((TUpdateTableReq) req, (DataNodeTSStatusRPCHandler) handler)); actionMapBuilder.put( - CnToDnAsyncRequestType.INVALIDATE_TABLE_CACHE, + CnToDnAsyncRequestType.PRE_DELETE_TABLE, (req, client, handler) -> client.invalidateTableCache( (TInvalidateTableCacheReq) req, (DataNodeTSStatusRPCHandler) handler)); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/heartbeat/DataNodeHeartbeatHandler.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/heartbeat/DataNodeHeartbeatHandler.java index 3114d60ca3a59..4d9df1235209b 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/heartbeat/DataNodeHeartbeatHandler.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/heartbeat/DataNodeHeartbeatHandler.java @@ -27,6 +27,7 @@ import org.apache.iotdb.commons.cluster.RegionStatus; import org.apache.iotdb.confignode.conf.ConfigNodeConfig; import org.apache.iotdb.confignode.conf.ConfigNodeDescriptor; +import org.apache.iotdb.confignode.manager.lease.DataNodeContactTracker; import org.apache.iotdb.confignode.manager.load.LoadManager; import org.apache.iotdb.confignode.manager.load.cache.consensus.ConsensusGroupHeartbeatSample; import org.apache.iotdb.confignode.manager.load.cache.node.NodeHeartbeatSample; @@ -93,6 +94,11 @@ public void onComplete(TDataNodeHeartbeatResp heartbeatResp) { } private void cacheNodeHeartbeatSample(TDataNodeHeartbeatResp heartbeatResp) { + // A successful response confirms ConfigNode->DataNode contact; stamp it on the ConfigNode clock + // for the metadata-lease verdict. Kept separate from the load-cache samples (which record the + // echoed send-time) and deliberately not touched in onError, so failures never advance it. 
+ final DataNodeContactTracker contactTracker = DataNodeContactTracker.getInstance(); + contactTracker.recordSuccessfulResponse(nodeId); loadManager .getLoadCache() .cacheDataNodeHeartbeatSample(nodeId, new NodeHeartbeatSample(heartbeatResp)); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/DataNodeAsyncRequestRPCHandler.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/DataNodeAsyncRequestRPCHandler.java index 38a7462002408..7733d1f145ae6 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/DataNodeAsyncRequestRPCHandler.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/DataNodeAsyncRequestRPCHandler.java @@ -236,6 +236,7 @@ public static DataNodeAsyncRequestRPCHandler buildHandler( case SET_SYSTEM_STATUS: case NOTIFY_REGION_MIGRATION: case UPDATE_REGION_ROUTE_MAP: + case INVALIDATE_PARTITION_CACHE: case INVALIDATE_SCHEMA_CACHE: case INVALIDATE_MATCHED_SCHEMA_CACHE: case UPDATE_TEMPLATE: @@ -243,7 +244,7 @@ public static DataNodeAsyncRequestRPCHandler buildHandler( case KILL_QUERY_INSTANCE: case RESET_PEER_LIST: case TEST_CONNECTION: - case INVALIDATE_TABLE_CACHE: + case PRE_DELETE_TABLE: case DELETE_DATA_FOR_DROP_TABLE: case DELETE_DEVICES_FOR_DROP_TABLE: case INVALIDATE_COLUMN_CACHE: diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/DataNodeTSStatusRPCHandler.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/DataNodeTSStatusRPCHandler.java index 04130e42664fc..629e7fc393f90 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/DataNodeTSStatusRPCHandler.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/DataNodeTSStatusRPCHandler.java @@ -26,9 +26,11 @@ import 
org.apache.iotdb.rpc.RpcUtils; import org.apache.iotdb.rpc.TSStatusCode; +import org.apache.thrift.transport.TTransportException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.net.ConnectException; import java.util.Map; import java.util.concurrent.CountDownLatch; @@ -76,11 +78,14 @@ public void onError(Exception e) { + ", exception: " + e.getMessage(); logFailure(errorMsg); + // the DN throw Exception -> TApplicationException + // the DN crash -> TTransportException or ConnectException + int code = + e instanceof TTransportException || e instanceof ConnectException + ? TSStatusCode.CAN_NOT_CONNECT_DATANODE.getStatusCode() + : TSStatusCode.EXECUTE_STATEMENT_ERROR.getStatusCode(); - responseMap.put( - requestId, - new TSStatus( - RpcUtils.getStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR.getStatusCode(), errorMsg))); + responseMap.put(requestId, new TSStatus(RpcUtils.getStatus(code, errorMsg))); countDownLatch.countDown(); } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/ConfigPhysicalPlan.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/ConfigPhysicalPlan.java index b7453fb987665..0d9ca912571a4 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/ConfigPhysicalPlan.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/ConfigPhysicalPlan.java @@ -116,6 +116,7 @@ import org.apache.iotdb.confignode.consensus.request.write.table.RenameTableColumnPlan; import org.apache.iotdb.confignode.consensus.request.write.table.RenameTablePlan; import org.apache.iotdb.confignode.consensus.request.write.table.RollbackCreateTablePlan; +import org.apache.iotdb.confignode.consensus.request.write.table.RollbackPreDeleteTablePlan; import org.apache.iotdb.confignode.consensus.request.write.table.SetTableColumnCommentPlan; import 
org.apache.iotdb.confignode.consensus.request.write.table.SetTableCommentPlan; import org.apache.iotdb.confignode.consensus.request.write.table.SetTablePropertiesPlan; @@ -444,6 +445,9 @@ public static ConfigPhysicalPlan create(final ByteBuffer buffer) throws IOExcept case PreDeleteView: plan = new PreDeleteViewPlan(); break; + case RollbackPreDeleteTable: + plan = new RollbackPreDeleteTablePlan(); + break; case CommitDeleteTable: plan = new CommitDeleteTablePlan(configPhysicalPlanType); break; diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/ConfigPhysicalPlanType.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/ConfigPhysicalPlanType.java index dce1db12cd032..1be9518141483 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/ConfigPhysicalPlanType.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/ConfigPhysicalPlanType.java @@ -232,6 +232,7 @@ public enum ConfigPhysicalPlanType { RenameViewColumn((short) 877), AlterColumnDataType((short) 878), PreAlterColumnDataType((short) 879), + RollbackPreDeleteTable((short) 880), /** Deprecated types for sync, restored them for upgrade. 
*/ @Deprecated diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/read/table/FetchTablePlan.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/read/table/FetchTablePlan.java index a69eda99d79af..7d7dabdf89579 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/read/table/FetchTablePlan.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/read/table/FetchTablePlan.java @@ -19,6 +19,7 @@ package org.apache.iotdb.confignode.consensus.request.read.table; +import org.apache.iotdb.commons.schema.table.TableNodeStatus; import org.apache.iotdb.confignode.consensus.request.ConfigPhysicalPlanType; import org.apache.iotdb.confignode.consensus.request.read.ConfigPhysicalReadPlan; @@ -28,13 +29,20 @@ public class FetchTablePlan extends ConfigPhysicalReadPlan { private final Map<String, Set<String>> fetchTableMap; + private final Set<TableNodeStatus> tableNodeStatusSet; - public FetchTablePlan(final Map<String, Set<String>> fetchTableMap) { + public FetchTablePlan( + final Map<String, Set<String>> fetchTableMap, final Set<TableNodeStatus> tableNodeStatus) { super(ConfigPhysicalPlanType.FetchTable); this.fetchTableMap = fetchTableMap; + this.tableNodeStatusSet = tableNodeStatus; } public Map<String, Set<String>> getFetchTableMap() { return fetchTableMap; } + + public Set<TableNodeStatus> getTableNodeStatusSet() { + return tableNodeStatusSet; + } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/write/table/RollbackPreDeleteTablePlan.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/write/table/RollbackPreDeleteTablePlan.java new file mode 100644 index 0000000000000..3d30cf1262e9b --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/write/table/RollbackPreDeleteTablePlan.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.confignode.consensus.request.write.table; + +import org.apache.iotdb.confignode.consensus.request.ConfigPhysicalPlanType; + +public class RollbackPreDeleteTablePlan extends AbstractTablePlan { + + public RollbackPreDeleteTablePlan() { + super(ConfigPhysicalPlanType.RollbackPreDeleteTable); + } + + public RollbackPreDeleteTablePlan(final String database, final String tableName) { + super(ConfigPhysicalPlanType.RollbackPreDeleteTable, database, tableName); + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/statemachine/ConfigRegionStateMachine.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/statemachine/ConfigRegionStateMachine.java index ad0a82bcf56b3..3674b531fbd12 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/statemachine/ConfigRegionStateMachine.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/statemachine/ConfigRegionStateMachine.java @@ -37,6 +37,7 @@ import org.apache.iotdb.confignode.i18n.ConfigNodeMessages; import org.apache.iotdb.confignode.manager.ConfigManager; import org.apache.iotdb.confignode.manager.consensus.ConsensusManager; +import
org.apache.iotdb.confignode.manager.lease.DataNodeContactTracker; import org.apache.iotdb.confignode.manager.pipe.agent.PipeConfigNodeAgent; import org.apache.iotdb.confignode.persistence.executor.ConfigPlanExecutor; import org.apache.iotdb.confignode.persistence.schema.ConfigNodeSnapshotParser; @@ -304,6 +305,15 @@ public void notifyLeaderReady() { ConfigNodeMessages.CURRENT_NODE_NODEID_IP_PORT_BECOMES_CONFIG_REGION_LEADER, ConfigNodeDescriptor.getInstance().getConf().getConfigNodeId(), currentNodeTEndPoint); + + // Reset every DataNode's last-contact time to now on (re)acquiring leadership. Otherwise a + // stale timestamp left over from a previous leadership term (while another ConfigNode was + // contacting the DataNodes) could let the metadata-broadcast verdict wrongly judge a live + // DataNode as fenced. + DataNodeContactTracker.getInstance() + .onLeadershipAcquired( + configManager.getNodeManager().getRegisteredDataNodeLocations().keySet()); + // Bump the epoch eagerly so that any in-flight services of an older epoch are invalidated // immediately, even before the (serialized) become-leader orchestration gets to run.
final long epoch = nextLeaderServicesEpoch(); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ConfigManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ConfigManager.java index b11c83d5784e9..178ba7ddd025e 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ConfigManager.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ConfigManager.java @@ -58,6 +58,7 @@ import org.apache.iotdb.commons.pipe.sink.payload.airgap.AirGapPseudoTPipeTransferRequest; import org.apache.iotdb.commons.schema.SchemaConstant; import org.apache.iotdb.commons.schema.table.AlterOrDropTableOperationType; +import org.apache.iotdb.commons.schema.table.TableNodeStatus; import org.apache.iotdb.commons.schema.table.TreeViewSchema; import org.apache.iotdb.commons.schema.table.TsTable; import org.apache.iotdb.commons.schema.table.TsTableInternalRPCUtil; @@ -168,6 +169,7 @@ import org.apache.iotdb.confignode.rpc.thrift.TCreateTableViewReq; import org.apache.iotdb.confignode.rpc.thrift.TCreateTopicReq; import org.apache.iotdb.confignode.rpc.thrift.TCreateTriggerReq; +import org.apache.iotdb.confignode.rpc.thrift.TDataNodeLeaseRecoveryResp; import org.apache.iotdb.confignode.rpc.thrift.TDataNodeRegisterReq; import org.apache.iotdb.confignode.rpc.thrift.TDataNodeRestartReq; import org.apache.iotdb.confignode.rpc.thrift.TDataNodeRestartResp; @@ -286,6 +288,7 @@ import java.util.Collection; import java.util.Collections; import java.util.Comparator; +import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; @@ -3164,19 +3167,41 @@ public TDescTable4InformationSchemaResp describeTable4InformationSchema() { } @Override - public TFetchTableResp fetchTables(final Map> fetchTableMap) { + public TFetchTableResp fetchTables( + final Map> fetchTableMap, TableNodeStatus tableNodeStatus) { final TSStatus status = confirmLeader(); if 
(status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { return new TFetchTableResp(status); } - fetchTableMap.forEach( - (key, value) -> - value.removeIf( - table -> - procedureManager - .checkDuplicateTableTask(key, null, table, null, null, null) - .getRight())); - return clusterSchemaManager.fetchTables(fetchTableMap); + switch (tableNodeStatus) { + case USING: + fetchTableMap.forEach( + (key, value) -> + value.removeIf( + table -> + procedureManager + .checkDuplicateTableTask(key, null, table, null, null, null) + .getRight())); + return clusterSchemaManager.fetchTables(fetchTableMap, EnumSet.of(TableNodeStatus.USING)); + case PRE_DELETE: + // When fetching tables for PRE_DELETE, checkDuplicateTableTask is not needed: + // just read the current tables, matching both the USING and PRE_DELETE statuses + return clusterSchemaManager.fetchTables( + fetchTableMap, EnumSet.of(TableNodeStatus.USING, TableNodeStatus.PRE_DELETE)); + case PRE_CREATE: + default: + throw new UnsupportedOperationException(); + } + } + + public TDataNodeLeaseRecoveryResp reloadCacheAfterLeaseRecovery() { + final TSStatus status = confirmLeader(); + if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + return new TDataNodeLeaseRecoveryResp().setStatus(status); + } + return new TDataNodeLeaseRecoveryResp() + .setStatus(RpcUtils.SUCCESS_STATUS) + .setTableInfo(clusterSchemaManager.getAllTableInfoForDataNodeActivation()); } @Override diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/IManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/IManager.java index 1dc097b34440b..bae45596812e1 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/IManager.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/IManager.java @@ -35,6 +35,7 @@ import org.apache.iotdb.commons.cluster.NodeStatus; import org.apache.iotdb.commons.path.PartialPath; import 
org.apache.iotdb.commons.path.PathPatternTree; +import org.apache.iotdb.commons.schema.table.TableNodeStatus; import org.apache.iotdb.confignode.audit.CNAuditLogger; import org.apache.iotdb.confignode.consensus.request.read.ainode.GetAINodeConfigurationPlan; import org.apache.iotdb.confignode.consensus.request.read.database.CountDatabasePlan; @@ -920,7 +921,8 @@ TDescTableResp describeTable( TDescTable4InformationSchemaResp describeTable4InformationSchema(); - TFetchTableResp fetchTables(final Map> fetchTableMap); + TFetchTableResp fetchTables( + final Map> fetchTableMap, TableNodeStatus tableNodeStatus); TSStatus pushHeartbeat(final int dataNodeId, final TPipeHeartbeatResp resp); } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java index 28514043188f2..d9e5bec8161c4 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java @@ -2394,12 +2394,17 @@ public TDeleteTableDeviceResp deleteDevices( } } + // only care about the AbstractAlterOrDropTableProcedure(except the drop table/view) + // and the DeleteDatabaseProcedure public Map> getAllExecutingTables() { final Map> result = new HashMap<>(); for (final Procedure procedure : executor.getProcedures().values()) { if (procedure.isFinished()) { continue; } + if (procedure instanceof DropTableProcedure) { + continue; + } // CreateTableOrViewProcedure is covered by the default process, thus we can ignore it here // Note that if a table is creating there will not be a working table, and the DN will either // be updated by commit or fetch the CN tables diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/ClusterCachePropagator.java 
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/ClusterCachePropagator.java new file mode 100644 index 0000000000000..e69095a1e8143 --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/ClusterCachePropagator.java @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.confignode.manager.lease; + +import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation; +import org.apache.iotdb.common.rpc.thrift.TSStatus; +import org.apache.iotdb.commons.conf.CommonDescriptor; +import org.apache.iotdb.confignode.manager.IManager; +import org.apache.iotdb.confignode.manager.lease.MetadataBroadcastVerdict.DataNodeState; +import org.apache.iotdb.confignode.manager.lease.MetadataBroadcastVerdict.Verdict; +import org.apache.iotdb.rpc.TSStatusCode; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import java.util.function.IntToLongFunction; +import java.util.function.LongSupplier; +import java.util.function.Supplier; + +/** + * Broadcasts one Tier-A cache invalidation and turns the per-DataNode responses into a {@link + * Verdict}. 
An unreachable DataNode can be skipped only after it is provably self-fenced. + */ +public class ClusterCachePropagator { + + /** + * {@code T_proceed = T_fence + margin}. The margin covers heartbeat-recording granularity and + * scheduling jitter. + */ + private static final long DEFAULT_PROCEED_MARGIN_MS = 5_000L; + + /** How often to retry while waiting for unacked DataNodes to ack or cross T_proceed. */ + private static final long RETRY_INTERVAL_MS = 1_000L; + + /** Broadcasts the cache invalidation to {@code targets} and returns the per-nodeId responses. */ + @FunctionalInterface + public interface CacheBroadcast { + Map sendTo(Map targets); + } + + /** Injectable sleep so the retry loop can be driven deterministically in tests. */ + @FunctionalInterface + interface Sleeper { + void sleepMs(long ms) throws InterruptedException; + } + + private final Supplier> registeredDataNodes; + private final IntToLongFunction elapsedMsSinceLastSuccessfulHeartbeatResponse; + private final LongSupplier fenceTimeoutMs; + private final LongSupplier nanoClock; + private final Sleeper sleeper; + + public ClusterCachePropagator(final IManager configManager) { + this( + () -> configManager.getNodeManager().getRegisteredDataNodeLocations(), + nodeId -> DataNodeContactTracker.getInstance().getMillisSinceLastSuccessfulResponse(nodeId), + () -> + CommonDescriptor.getInstance().getConfig().getMetadataLeaseFenceMs() + + DEFAULT_PROCEED_MARGIN_MS, + System::nanoTime, + Thread::sleep); + } + + ClusterCachePropagator( + final Supplier> registeredDataNodes, + final IntToLongFunction elapsedMsSinceLastSuccessfulHeartbeatResponse, + final LongSupplier fenceTimeoutMs, + final LongSupplier nanoClock, + final Sleeper sleeper) { + this.registeredDataNodes = registeredDataNodes; + this.elapsedMsSinceLastSuccessfulHeartbeatResponse = + elapsedMsSinceLastSuccessfulHeartbeatResponse; + this.fenceTimeoutMs = fenceTimeoutMs; + this.nanoClock = nanoClock; + this.sleeper = sleeper; + } + + /** + * 
Broadcast once and classify the result. {@code waitBudgetExhausted} turns a would-be {@link + * Verdict#WAIT} into {@link Verdict#FAIL}. + */ + public Verdict propagateOnce(final CacheBroadcast broadcast, final boolean waitBudgetExhausted) { + final Map targets = registeredDataNodes.get(); + final Map responses = broadcast.sendTo(targets); + final long fenceTimeOutsMs = this.fenceTimeoutMs.getAsLong(); + final List states = new ArrayList<>(targets.size()); + for (final Integer nodeId : targets.keySet()) { + final TSStatus status = responses.get(nodeId); + // A missing response or CAN_NOT_CONNECT_DATANODE marks the DataNode as not acked; + // any other non-success code is treated as a DataNode internal execution failure + boolean executeSuccess; + if (status == null) { + executeSuccess = false; + } else { + switch (TSStatusCode.representOf(status.getCode())) { + case SUCCESS_STATUS: + executeSuccess = true; + break; + case CAN_NOT_CONNECT_DATANODE: + executeSuccess = false; + break; + default: + // A DataNode responded with an internal execution failure: fail the broadcast + return Verdict.FAIL; + } + } + states.add( + new DataNodeState( + executeSuccess, elapsedMsSinceLastSuccessfulHeartbeatResponse.applyAsLong(nodeId))); + } + return MetadataBroadcastVerdict.decide(states, fenceTimeOutsMs, waitBudgetExhausted); + } + + /** + * Broadcast and retry until the verdict is {@link Verdict#PROCEED} or {@link Verdict#FAIL}. + * Blocks the calling thread for up to {@code T_proceed}. 
+ */ + public boolean propagate(final CacheBroadcast broadcast) { + final long deadlineNanos = + nanoClock.getAsLong() + TimeUnit.MILLISECONDS.toNanos(fenceTimeoutMs.getAsLong()); + while (true) { + final boolean waitBudgetExhausted = nanoClock.getAsLong() >= deadlineNanos; + final Verdict verdict = propagateOnce(broadcast, waitBudgetExhausted); + if (verdict == Verdict.PROCEED) { + return true; + } + if (verdict == Verdict.FAIL) { + return false; + } + try { + sleeper.sleepMs(RETRY_INTERVAL_MS); + } catch (final InterruptedException e) { + Thread.currentThread().interrupt(); + return false; + } + } + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTracker.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTracker.java new file mode 100644 index 0000000000000..939287baa5873 --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTracker.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.confignode.manager.lease; + +import java.util.Collection; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.function.LongSupplier; + +/** + * Tracks, per DataNode, the time the ConfigNode last received a successful heartbeat + * response from it, stamped with the ConfigNode's own monotonic clock at receipt. + * + *

This is the sound signal for deciding whether an unreachable DataNode has self-fenced (used by + * the metadata-lease verdict). It must be kept separate from the load-cache {@code + * NodeHeartbeatSample}s, which (a) record the heartbeat send time echoed back by the + * DataNode — not response receipt — and (b) are advanced to the current time by failure ({@code + * onError}) samples. Either property would break the verdict: send-time can make the ConfigNode + * believe a DataNode is fenced while it just renewed from a delayed heartbeat, and failure-advanced + * time would keep the age from ever growing. + * + *

By construction there is no method that advances the time on failure: only {@link + * #recordSuccessfulResponse(int)} and {@link #onLeadershipAcquired} update it. A never-contacted + * DataNode reads as age 0 (treated as just-contacted), so it is never wrongly declared fenced. + */ +public class DataNodeContactTracker { + + private final LongSupplier nanoClock; + + private final Map lastSuccessfulResponseNanos = new ConcurrentHashMap<>(); + + private DataNodeContactTracker() { + this(System::nanoTime); + } + + DataNodeContactTracker(final LongSupplier nanoClock) { + this.nanoClock = nanoClock; + } + + /** Record that a successful heartbeat response from the DataNode was just received. */ + public void recordSuccessfulResponse(final int dataNodeId) { + lastSuccessfulResponseNanos.put(dataNodeId, nanoClock.getAsLong()); + } + + /** + * Milliseconds since the ConfigNode last received a successful heartbeat response from the + * DataNode. Returns 0 (treated as just-contacted) if never recorded — conservative, so an unknown + * DataNode is never declared fenced. + */ + public long getMillisSinceLastSuccessfulResponse(final int dataNodeId) { + final Long lastNanos = lastSuccessfulResponseNanos.get(dataNodeId); + if (lastNanos == null) { + return 0L; + } + final long elapsedNanos = nanoClock.getAsLong() - lastNanos; + return elapsedNanos > 0 ? elapsedNanos / 1_000_000L : 0L; + } + + /** + * On leadership acquisition, conservatively reset contact ages so this leader does not judge any + * DataNode as fenced based on contact records left from a previous leadership term. 
+ */ + public void onLeadershipAcquired(final Collection registeredDataNodeIds) { + final long now = nanoClock.getAsLong(); + for (final Integer dataNodeId : registeredDataNodeIds) { + lastSuccessfulResponseNanos.put(dataNodeId, now); + } + } + + public void removeDataNode(final int dataNodeId) { + lastSuccessfulResponseNanos.remove(dataNodeId); + } + + public static DataNodeContactTracker getInstance() { + return DataNodeContactTrackerHolder.INSTANCE; + } + + private static final class DataNodeContactTrackerHolder { + private static final DataNodeContactTracker INSTANCE = new DataNodeContactTracker(); + + private DataNodeContactTrackerHolder() {} + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/MetadataBroadcastVerdict.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/MetadataBroadcastVerdict.java new file mode 100644 index 0000000000000..622d432c2315b --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/MetadataBroadcastVerdict.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.confignode.manager.lease; + +import java.util.Collection; + +/** + * Pure decision logic for metadata-broadcast verdicts. + * + *

A DataNode that failed to ack is assumed to support self-fencing (enforced by the caller). The + * overall verdict is {@code PROCEED} when every unacked DataNode has been silent for at least + * {@code T_proceed}; otherwise {@code WAIT} until the retry budget is exhausted, then {@code FAIL}. + */ +public final class MetadataBroadcastVerdict { + + public enum Verdict { + PROCEED, + WAIT, + FAIL + } + + private MetadataBroadcastVerdict() {} + + /** Per-DataNode inputs for one broadcast round. */ + public static final class DataNodeState { + private final boolean executeSuccess; + private final long hbAgeMs; + + public DataNodeState(final boolean executeSuccess, final long hbAgeMs) { + this.executeSuccess = executeSuccess; + this.hbAgeMs = hbAgeMs; + } + } + + public static Verdict decide( + final Collection states, + final long fenceTimeOutsMs, + final boolean waitBudgetExhausted) { + for (final DataNodeState state : states) { + if (!state.executeSuccess && state.hbAgeMs < fenceTimeOutsMs) { + return waitBudgetExhausted ? 
Verdict.FAIL : Verdict.WAIT; + } + } + return Verdict.PROCEED; + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/schema/ClusterSchemaManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/schema/ClusterSchemaManager.java index 6b0c82ff1f056..a9c9c978527d2 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/schema/ClusterSchemaManager.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/schema/ClusterSchemaManager.java @@ -29,6 +29,7 @@ import org.apache.iotdb.commons.schema.SchemaConstant; import org.apache.iotdb.commons.schema.table.Audit; import org.apache.iotdb.commons.schema.table.NonCommittableTsTable; +import org.apache.iotdb.commons.schema.table.PreDeleteTsTable; import org.apache.iotdb.commons.schema.table.TableNodeStatus; import org.apache.iotdb.commons.schema.table.TreeViewSchema; import org.apache.iotdb.commons.schema.table.TsTable; @@ -1369,10 +1370,13 @@ public TDescTable4InformationSchemaResp describeTables4InformationSchema() { } } - public TFetchTableResp fetchTables(final Map> fetchTableMap) { + public TFetchTableResp fetchTables( + final Map> fetchTableMap, Set tableNodeStatus) { try { return ((FetchTableResp) - configManager.getConsensusManager().read(new FetchTablePlan(fetchTableMap))) + configManager + .getConsensusManager() + .read(new FetchTablePlan(fetchTableMap, tableNodeStatus))) .convertToTFetchTableResp(); } catch (final ConsensusException e) { LOGGER.warn(ConfigNodeMessages.FAILED_IN_THE_READ_API_EXECUTING_THE_CONSENSUS_LAYER_DUE, e); @@ -1391,23 +1395,42 @@ public byte[] getAllTableInfoForDataNodeActivation() { final Map> alteringTables = configManager.getProcedureManager().getAllExecutingTables(); final Map> usingTableMap = clusterSchemaInfo.getAllUsingTables(); - final Map> preCreateTableMap = clusterSchemaInfo.getAllPreCreateTables(); - alteringTables.forEach( - (k, v) -> { - final List 
preCreateList = - preCreateTableMap.computeIfAbsent(k, database -> new ArrayList<>()); - if (Objects.isNull(v)) { - usingTableMap - .remove(k) - .forEach( - table -> preCreateList.add(new NonCommittableTsTable(table.getTableName()))); - } else { - preCreateList.addAll( - v.stream().map(NonCommittableTsTable::new).collect(Collectors.toList())); - } - }); - return TsTableInternalRPCUtil.serializeTableInitializationInfo( - usingTableMap, preCreateTableMap); + final Map> allPreDeleteTables = clusterSchemaInfo.getAllPreDeleteTables(); + // specialStatusMap holds the pre-create, pre-delete and altering (NonCommittableTsTable) tables + final Map> specialStatusMap = clusterSchemaInfo.getAllPreCreateTables(); + + for (Map.Entry> databaseEntry : alteringTables.entrySet()) { + String databaseName = databaseEntry.getKey(); + List alteringTableList = databaseEntry.getValue(); + List speicalMapList = + specialStatusMap.computeIfAbsent(databaseName, name -> new ArrayList<>()); + + // 1. a null alteringTableList means that a drop-database procedure is in progress + if (Objects.isNull(alteringTableList)) { + List relatedTables = usingTableMap.remove(databaseName); + relatedTables.forEach( + table -> speicalMapList.add(new NonCommittableTsTable(table.getTableName()))); + } else { + // 2. the table already exists and a procedure is modifying it, + // so both the usingTableMap and the specialStatusMap hold it + speicalMapList.addAll( + alteringTableList.stream() + .map(NonCommittableTsTable::new) + .collect(Collectors.toList())); + } + } + // 3. 
deal with the pre_delete status table, add the PreDeleteTsTable table + for (Map.Entry> entry : allPreDeleteTables.entrySet()) { + String databaseName = entry.getKey(); + List preDeleteTables = entry.getValue(); + specialStatusMap + .computeIfAbsent(databaseName, name -> new ArrayList<>()) + .addAll( + preDeleteTables.stream() + .map(tsTable -> new PreDeleteTsTable(tsTable.getTableName())) + .collect(Collectors.toList())); + } + return TsTableInternalRPCUtil.serializeTableInitializationInfo(usingTableMap, specialStatusMap); } // endregion diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/executor/ConfigPlanExecutor.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/executor/ConfigPlanExecutor.java index 4a70ace8ca19e..2dd49c1db36ed 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/executor/ConfigPlanExecutor.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/executor/ConfigPlanExecutor.java @@ -133,6 +133,7 @@ import org.apache.iotdb.confignode.consensus.request.write.table.RenameTableColumnPlan; import org.apache.iotdb.confignode.consensus.request.write.table.RenameTablePlan; import org.apache.iotdb.confignode.consensus.request.write.table.RollbackCreateTablePlan; +import org.apache.iotdb.confignode.consensus.request.write.table.RollbackPreDeleteTablePlan; import org.apache.iotdb.confignode.consensus.request.write.table.SetTableColumnCommentPlan; import org.apache.iotdb.confignode.consensus.request.write.table.SetTableCommentPlan; import org.apache.iotdb.confignode.consensus.request.write.table.SetTablePropertiesPlan; @@ -606,6 +607,8 @@ public TSStatus executeNonQueryPlan(ConfigPhysicalPlan physicalPlan) case PreDeleteTable: case PreDeleteView: return clusterSchemaInfo.preDeleteTable((PreDeleteTablePlan) physicalPlan); + case RollbackPreDeleteTable: + return 
clusterSchemaInfo.rollbackPreDeleteTable((RollbackPreDeleteTablePlan) physicalPlan); case CommitDeleteTable: case CommitDeleteView: return clusterSchemaInfo.dropTable((CommitDeleteTablePlan) physicalPlan); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/schema/ClusterSchemaInfo.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/schema/ClusterSchemaInfo.java index b4e2f349f015a..4abea616a2e85 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/schema/ClusterSchemaInfo.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/schema/ClusterSchemaInfo.java @@ -66,6 +66,7 @@ import org.apache.iotdb.confignode.consensus.request.write.table.RenameTableColumnPlan; import org.apache.iotdb.confignode.consensus.request.write.table.RenameTablePlan; import org.apache.iotdb.confignode.consensus.request.write.table.RollbackCreateTablePlan; +import org.apache.iotdb.confignode.consensus.request.write.table.RollbackPreDeleteTablePlan; import org.apache.iotdb.confignode.consensus.request.write.table.SetTableColumnCommentPlan; import org.apache.iotdb.confignode.consensus.request.write.table.SetTableCommentPlan; import org.apache.iotdb.confignode.consensus.request.write.table.SetTablePropertiesPlan; @@ -1187,6 +1188,13 @@ public TSStatus commitCreateTable(final CommitCreateTablePlan plan) { getQualifiedDatabasePartialPath(plan.getDatabase()), plan.getTableName())); } + public TSStatus rollbackPreDeleteTable(final RollbackPreDeleteTablePlan plan) { + return executeWithLock( + () -> + tableModelMTree.rollbackPreDeleteTable( + getQualifiedDatabasePartialPath(plan.getDatabase()), plan.getTableName())); + } + public TSStatus preDeleteTable(final PreDeleteTablePlan plan) { return executeWithLock( () -> @@ -1343,7 +1351,8 @@ public FetchTableResp fetchTables(final FetchTablePlan plan) { database2Tables.getKey(), 
tableModelMTree.getSpecificTablesUnderSpecificDatabase( getQualifiedDatabasePartialPath(database2Tables.getKey()), - database2Tables.getValue())); + database2Tables.getValue(), + plan.getTableNodeStatusSet())); } catch (final DatabaseNotSetException ignore) { // continue } @@ -1447,7 +1456,19 @@ public Map> getAllUsingTables() { public Map> getAllPreCreateTables() { databaseReadWriteLock.readLock().lock(); try { - return tableModelMTree.getAllPreCreateTables(); + return tableModelMTree.getAllSpecialStatusTables(TableNodeStatus.PRE_CREATE); + } catch (final MetadataException e) { + LOGGER.warn(e.getMessage(), e); + throw new RuntimeException(e); + } finally { + databaseReadWriteLock.readLock().unlock(); + } + } + + public Map> getAllPreDeleteTables() { + databaseReadWriteLock.readLock().lock(); + try { + return tableModelMTree.getAllSpecialStatusTables(TableNodeStatus.PRE_DELETE); } catch (final MetadataException e) { LOGGER.warn(e.getMessage(), e); throw new RuntimeException(e); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/schema/ConfigMTree.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/schema/ConfigMTree.java index cf8d9d5e2ed3a..43e72240a0459 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/schema/ConfigMTree.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/schema/ConfigMTree.java @@ -33,6 +33,7 @@ import org.apache.iotdb.commons.schema.node.role.IDatabaseMNode; import org.apache.iotdb.commons.schema.node.utils.IMNodeFactory; import org.apache.iotdb.commons.schema.node.utils.IMNodeIterator; +import org.apache.iotdb.commons.schema.table.PreDeleteTsTable; import org.apache.iotdb.commons.schema.table.TableNodeStatus; import org.apache.iotdb.commons.schema.table.TreeViewSchema; import org.apache.iotdb.commons.schema.table.TsTable; @@ -754,6 +755,16 @@ public void preDeleteTable( 
tableNode.setStatus(TableNodeStatus.PRE_DELETE); } + public void rollbackPreDeleteTable(final PartialPath database, final String tableName) + throws MetadataException { + final IConfigMNode databaseNode = getDatabaseNodeByDatabasePath(database).getAsMNode(); + if (!databaseNode.hasChild(tableName)) { + return; + } + final ConfigTableNode tableNode = (ConfigTableNode) databaseNode.getChild(tableName); + tableNode.setStatus(TableNodeStatus.USING); + } + public void dropTable(final PartialPath database, final String tableName) throws MetadataException { final IConfigMNode databaseNode = getDatabaseNodeByDatabasePath(database).getAsMNode(); @@ -845,19 +856,25 @@ public List> getAllTablesUnderSpecificDatabase( } public Map getSpecificTablesUnderSpecificDatabase( - final PartialPath databasePath, final Set tables) throws MetadataException { + final PartialPath databasePath, + final Set tables, + final Set statusSet) + throws MetadataException { final IConfigMNode databaseNode = getDatabaseNodeByDatabasePath(databasePath).getAsMNode(); final Map result = new HashMap<>(); - tables.forEach( - table -> { - final IConfigMNode child = databaseNode.getChildren().get(table); - if (child instanceof ConfigTableNode - && ((ConfigTableNode) child).getStatus().equals(TableNodeStatus.USING)) { - result.put(table, ((ConfigTableNode) child).getTable()); - } else { - result.put(table, null); - } - }); + for (final String tableName : tables) { + final IConfigMNode child = databaseNode.getChildren().get(tableName); + if (child instanceof ConfigTableNode + && statusSet.contains(((ConfigTableNode) child).getStatus())) { + TsTable table = + ((ConfigTableNode) child).getStatus() == TableNodeStatus.PRE_DELETE + ? 
new PreDeleteTsTable(tableName) + : ((ConfigTableNode) child).getTable(); + result.put(tableName, table); + } else { + result.put(tableName, null); + } + } return result; } @@ -893,7 +910,12 @@ public Map> getAllUsingTables() { })); } - public Map> getAllPreCreateTables() throws MetadataException { + public Map> getAllSpecialStatusTables(TableNodeStatus tableNodeStatus) + throws MetadataException { + if (TableNodeStatus.PRE_CREATE != tableNodeStatus + && TableNodeStatus.PRE_DELETE != tableNodeStatus) { + throw new SemanticException("Invalid table status " + tableNodeStatus); + } final Map> result = new HashMap<>(); final List databaseList = getAllDatabasePaths(true); for (final PartialPath databasePath : databaseList) { @@ -902,7 +924,7 @@ public Map> getAllPreCreateTables() throws MetadataExcepti for (final IConfigMNode child : databaseNode.getChildren().values()) { if (child instanceof ConfigTableNode) { final ConfigTableNode tableNode = (ConfigTableNode) child; - if (!tableNode.getStatus().equals(TableNodeStatus.PRE_CREATE)) { + if (!tableNode.getStatus().equals(tableNodeStatus)) { continue; } result.computeIfAbsent(database, k -> new ArrayList<>()).add(tableNode.getTable()); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java index ab4f367df9242..55263db0cd4eb 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java @@ -22,7 +22,6 @@ import org.apache.iotdb.common.rpc.thrift.TConfigNodeLocation; import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; import org.apache.iotdb.common.rpc.thrift.TConsensusGroupType; -import org.apache.iotdb.common.rpc.thrift.TDataNodeConfiguration; import 
org.apache.iotdb.common.rpc.thrift.TDataNodeLocation; import org.apache.iotdb.common.rpc.thrift.TRegionReplicaSet; import org.apache.iotdb.common.rpc.thrift.TSStatus; @@ -36,9 +35,7 @@ import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType; import org.apache.iotdb.confignode.client.async.CnToDnInternalServiceAsyncRequestManager; import org.apache.iotdb.confignode.client.async.handlers.DataNodeAsyncRequestContext; -import org.apache.iotdb.confignode.client.sync.CnToDnSyncRequestType; import org.apache.iotdb.confignode.client.sync.SyncConfigNodeClientPool; -import org.apache.iotdb.confignode.client.sync.SyncDataNodeClientPool; import org.apache.iotdb.confignode.consensus.request.write.confignode.RemoveConfigNodePlan; import org.apache.iotdb.confignode.consensus.request.write.database.DeleteDatabasePlan; import org.apache.iotdb.confignode.consensus.request.write.database.PreDeleteDatabasePlan; @@ -47,6 +44,7 @@ import org.apache.iotdb.confignode.exception.AddPeerException; import org.apache.iotdb.confignode.manager.ConfigManager; import org.apache.iotdb.confignode.manager.consensus.ConsensusManager; +import org.apache.iotdb.confignode.manager.lease.ClusterCachePropagator; import org.apache.iotdb.confignode.manager.load.LoadManager; import org.apache.iotdb.confignode.manager.load.cache.region.RegionHeartbeatSample; import org.apache.iotdb.confignode.manager.node.NodeManager; @@ -97,7 +95,6 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; -import java.util.Arrays; import java.util.HashMap; import java.util.LinkedHashSet; import java.util.List; @@ -165,74 +162,33 @@ public void preDeleteDatabase( getPartitionManager().preDeleteDatabase(deleteSgName, preDeleteType); } - /** - * @param databaseName database name - * @return ALL SUCCESS OR NOT - * @throws IOException IOE - * @throws TException Thrift IOE - */ public boolean invalidateCache(final String databaseName) throws IOException, TException { - final List 
allDataNodes = getNodeManager().getRegisteredDataNodes(); final TInvalidateCacheReq invalidateCacheReq = new TInvalidateCacheReq(); invalidateCacheReq.setStorageGroup(true); invalidateCacheReq.setFullPath(databaseName); - for (final TDataNodeConfiguration dataNodeConfiguration : allDataNodes) { - final int dataNodeId = dataNodeConfiguration.getLocation().getDataNodeId(); - - // If the node is not alive, retry for up to 10 times - NodeStatus nodeStatus = getLoadManager().getNodeStatus(dataNodeId); - int retryNum = 10; - if (nodeStatus == NodeStatus.Unknown) { - for (int i = 0; i < retryNum && nodeStatus == NodeStatus.Unknown; i++) { - try { - TimeUnit.MILLISECONDS.sleep(500); - } catch (final InterruptedException e) { - LOG.error("Sleep failed in ConfigNodeProcedureEnv: ", e); - Thread.currentThread().interrupt(); - break; - } - nodeStatus = getLoadManager().getNodeStatus(dataNodeId); - } - } - - if (nodeStatus == NodeStatus.Unknown) { - LOG.warn( - "Invalidate cache failed, because DataNode {} is Unknown", - dataNodeConfiguration.getLocation().getInternalEndPoint()); - return false; - } - - // Always invalidate PartitionCache first - final TSStatus invalidatePartitionStatus = - (TSStatus) - SyncDataNodeClientPool.getInstance() - .sendSyncRequestToDataNodeWithRetry( - dataNodeConfiguration.getLocation().getInternalEndPoint(), - invalidateCacheReq, - CnToDnSyncRequestType.INVALIDATE_PARTITION_CACHE); - - final TSStatus invalidateSchemaStatus = - (TSStatus) - SyncDataNodeClientPool.getInstance() - .sendSyncRequestToDataNodeWithRetry( - dataNodeConfiguration.getLocation().getInternalEndPoint(), - invalidateCacheReq, - CnToDnSyncRequestType.INVALIDATE_SCHEMA_CACHE); - - if (!verifySucceed(invalidatePartitionStatus, invalidateSchemaStatus)) { - LOG.error( - "Invalidate cache failed, invalidate partition cache status is {}, invalidate schemaengine cache status is {}", - invalidatePartitionStatus, - invalidateSchemaStatus); - return false; - } + // The per-round cache 
invalidation is sent asynchronously so the lease framework can collect a + // cluster-wide ack map and decide whether unAcked DataNodes are safely fenced. + final ClusterCachePropagator propagator = new ClusterCachePropagator(configManager); + if (!propagator.propagate( + targets -> + invalidateDatabaseCacheOnce( + targets, invalidateCacheReq, CnToDnAsyncRequestType.INVALIDATE_PARTITION_CACHE))) { + return false; } - return true; - } - - public boolean verifySucceed(TSStatus... status) { - return Arrays.stream(status) - .allMatch(tsStatus -> tsStatus.getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode()); + return propagator.propagate( + targets -> + invalidateDatabaseCacheOnce( + targets, invalidateCacheReq, CnToDnAsyncRequestType.INVALIDATE_SCHEMA_CACHE)); + } + + private Map invalidateDatabaseCacheOnce( + final Map targets, + final TInvalidateCacheReq invalidateCacheReq, + final CnToDnAsyncRequestType requestType) { + final DataNodeAsyncRequestContext clientHandler = + new DataNodeAsyncRequestContext<>(requestType, invalidateCacheReq, targets); + CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); + return clientHandler.getResponseMap(); } /** diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java index 6782c1b652a3b..03ad7f3312c6c 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java @@ -40,6 +40,7 @@ import org.apache.iotdb.confignode.i18n.ConfigNodeMessages; import org.apache.iotdb.confignode.i18n.ProcedureMessages; import org.apache.iotdb.confignode.manager.ConfigManager; +import org.apache.iotdb.confignode.manager.lease.DataNodeContactTracker; import 
org.apache.iotdb.confignode.manager.load.balancer.region.GreedyCopySetRegionGroupAllocator; import org.apache.iotdb.confignode.manager.load.balancer.region.IRegionGroupAllocator; import org.apache.iotdb.confignode.manager.load.cache.node.NodeHeartbeatSample; @@ -455,6 +456,9 @@ public void removeDataNodePersistence(List removedDataNodes) PartitionMetrics.unbindDataNodePartitionMetricsWhenUpdate( MetricService.getInstance(), NodeUrlUtils.convertTEndPointUrl(dataNodeLocation.getClientRpcEndPoint())); + // Drop the removed DataNode's metadata-lease contact/capability state so it is not retained, + // and a future DataNode reusing the id cannot inherit stale fencing history. + DataNodeContactTracker.getInstance().removeDataNode(dataNodeLocation.getDataNodeId()); } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/AlterLogicalViewProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/AlterLogicalViewProcedure.java index 8c8d2019f4de8..b5783e908a944 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/AlterLogicalViewProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/AlterLogicalViewProcedure.java @@ -31,8 +31,6 @@ import org.apache.iotdb.commons.path.PathPatternTree; import org.apache.iotdb.commons.schema.view.viewExpression.ViewExpression; import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType; -import org.apache.iotdb.confignode.client.async.CnToDnInternalServiceAsyncRequestManager; -import org.apache.iotdb.confignode.client.async.handlers.DataNodeAsyncRequestContext; import org.apache.iotdb.confignode.i18n.ProcedureMessages; import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; import org.apache.iotdb.confignode.procedure.exception.ProcedureException; @@ -42,7 +40,6 @@ import org.apache.iotdb.db.exception.BatchProcessException; import 
org.apache.iotdb.db.exception.metadata.view.ViewNotExistException; import org.apache.iotdb.mpp.rpc.thrift.TAlterViewReq; -import org.apache.iotdb.mpp.rpc.thrift.TInvalidateMatchedSchemaCacheReq; import org.apache.iotdb.rpc.TSStatusCode; import org.apache.tsfile.utils.ReadWriteIOUtils; @@ -123,27 +120,15 @@ protected Flow executeFromState( } private void invalidateCache(final ConfigNodeProcedureEnv env) { - final Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.INVALIDATE_MATCHED_SCHEMA_CACHE, - new TInvalidateMatchedSchemaCacheReq(patternTreeBytes), - dataNodeLocationMap); - CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - final Map statusMap = clientHandler.getResponseMap(); - for (final TSStatus status : statusMap.values()) { + if (!SchemaUtils.invalidateMatchedSchemaCache( + env.getConfigManager(), patternTreeBytes, false)) { // all dataNodes must clear the related schemaengine cache - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.error( - ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_VIEW, - viewPathToSourceMap.keySet()); - setFailure( - new ProcedureException( - new MetadataException( - ProcedureMessages.INVALIDATE_VIEW_SCHEMAENGINE_CACHE_FAILED))); - return; - } + LOGGER.error( + ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_VIEW, + viewPathToSourceMap.keySet()); + setFailure( + new ProcedureException( + new MetadataException(ProcedureMessages.INVALIDATE_VIEW_SCHEMAENGINE_CACHE_FAILED))); } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/AlterTimeSeriesDataTypeProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/AlterTimeSeriesDataTypeProcedure.java index 
3a4431047c218..7d6cc571767fe 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/AlterTimeSeriesDataTypeProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/AlterTimeSeriesDataTypeProcedure.java @@ -28,8 +28,6 @@ import org.apache.iotdb.commons.path.PathDeserializeUtil; import org.apache.iotdb.commons.path.PathPatternTree; import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType; -import org.apache.iotdb.confignode.client.async.CnToDnInternalServiceAsyncRequestManager; -import org.apache.iotdb.confignode.client.async.handlers.DataNodeAsyncRequestContext; import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeAlterTimeSeriesPlan; import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeEnrichedPlan; import org.apache.iotdb.confignode.i18n.ProcedureMessages; @@ -42,7 +40,6 @@ import org.apache.iotdb.consensus.exception.ConsensusException; import org.apache.iotdb.db.exception.metadata.PathNotExistException; import org.apache.iotdb.mpp.rpc.thrift.TAlterTimeSeriesReq; -import org.apache.iotdb.mpp.rpc.thrift.TInvalidateMatchedSchemaCacheReq; import org.apache.iotdb.pipe.api.exception.PipeException; import org.apache.iotdb.rpc.RpcUtils; import org.apache.iotdb.rpc.TSStatusCode; @@ -249,26 +246,17 @@ public static void invalidateCache( final String requestMessage, final Consumer setFailure, final boolean needLock) { - final Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.INVALIDATE_MATCHED_SCHEMA_CACHE, - new TInvalidateMatchedSchemaCacheReq(measurementPathBytes).setNeedLock(needLock), - dataNodeLocationMap); - CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - final Map statusMap = 
clientHandler.getResponseMap(); - for (final TSStatus status : statusMap.values()) { + // Proceed past provably-fenced DataNodes instead of hard-failing on the first unreachable one + // (see SchemaUtils.invalidateMatchedSchemaCache). Runs before the physical datatype change, so + // the "alter only after PROCEED" ordering holds. + if (!SchemaUtils.invalidateMatchedSchemaCache( + env.getConfigManager(), measurementPathBytes, needLock)) { // All dataNodes must clear the related schemaEngine cache - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.error( - ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_TIMESERIES, - requestMessage); - setFailure.accept( - new ProcedureException( - new MetadataException(ProcedureMessages.INVALIDATE_SCHEMAENGINE_CACHE_FAILED))); - return; - } + LOGGER.error( + ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_TIMESERIES, requestMessage); + setFailure.accept( + new ProcedureException( + new MetadataException(ProcedureMessages.INVALIDATE_SCHEMAENGINE_CACHE_FAILED))); } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteLogicalViewProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteLogicalViewProcedure.java index 5846c79b5219f..c634c21d92423 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteLogicalViewProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteLogicalViewProcedure.java @@ -27,8 +27,6 @@ import org.apache.iotdb.commons.path.PartialPath; import org.apache.iotdb.commons.path.PathPatternTree; import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType; -import org.apache.iotdb.confignode.client.async.CnToDnInternalServiceAsyncRequestManager; -import org.apache.iotdb.confignode.client.async.handlers.DataNodeAsyncRequestContext; import 
org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeDeleteLogicalViewPlan; import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeEnrichedPlan; import org.apache.iotdb.confignode.i18n.ProcedureMessages; @@ -41,7 +39,6 @@ import org.apache.iotdb.db.exception.metadata.view.ViewNotExistException; import org.apache.iotdb.mpp.rpc.thrift.TConstructViewSchemaBlackListReq; import org.apache.iotdb.mpp.rpc.thrift.TDeleteViewSchemaReq; -import org.apache.iotdb.mpp.rpc.thrift.TInvalidateMatchedSchemaCacheReq; import org.apache.iotdb.mpp.rpc.thrift.TRollbackViewSchemaBlackListReq; import org.apache.iotdb.pipe.api.exception.PipeException; import org.apache.iotdb.rpc.TSStatusCode; @@ -167,26 +164,15 @@ protected List processResponseOfOneDataNode( } private void invalidateCache(final ConfigNodeProcedureEnv env) { - final Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.INVALIDATE_MATCHED_SCHEMA_CACHE, - new TInvalidateMatchedSchemaCacheReq(patternTreeBytes), - dataNodeLocationMap); - CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - final Map statusMap = clientHandler.getResponseMap(); - for (final TSStatus status : statusMap.values()) { + if (!SchemaUtils.invalidateMatchedSchemaCache( + env.getConfigManager(), patternTreeBytes, false)) { // all dataNodes must clear the related schemaengine cache - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.error( - ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_VIEW, requestMessage); - setFailure( - new ProcedureException( - new MetadataException( - ProcedureMessages.INVALIDATE_VIEW_SCHEMAENGINE_CACHE_FAILED))); - return; - } + LOGGER.error( + ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_VIEW, requestMessage); + setFailure( + new 
ProcedureException( + new MetadataException(ProcedureMessages.INVALIDATE_VIEW_SCHEMAENGINE_CACHE_FAILED))); + return; } setNextState(DeleteLogicalViewState.DELETE_VIEW_SCHEMA); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteTimeSeriesProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteTimeSeriesProcedure.java index 1cd5efe639cd8..107984dc603d1 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteTimeSeriesProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteTimeSeriesProcedure.java @@ -27,8 +27,6 @@ import org.apache.iotdb.commons.path.PartialPath; import org.apache.iotdb.commons.path.PathPatternTree; import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType; -import org.apache.iotdb.confignode.client.async.CnToDnInternalServiceAsyncRequestManager; -import org.apache.iotdb.confignode.client.async.handlers.DataNodeAsyncRequestContext; import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeDeleteTimeSeriesPlan; import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeEnrichedPlan; import org.apache.iotdb.confignode.i18n.ProcedureMessages; @@ -42,7 +40,6 @@ import org.apache.iotdb.mpp.rpc.thrift.TConstructSchemaBlackListReq; import org.apache.iotdb.mpp.rpc.thrift.TDeleteDataForDeleteSchemaReq; import org.apache.iotdb.mpp.rpc.thrift.TDeleteTimeSeriesReq; -import org.apache.iotdb.mpp.rpc.thrift.TInvalidateMatchedSchemaCacheReq; import org.apache.iotdb.mpp.rpc.thrift.TRollbackSchemaBlackListReq; import org.apache.iotdb.pipe.api.exception.PipeException; import org.apache.iotdb.rpc.TSStatusCode; @@ -197,26 +194,18 @@ public static void invalidateCache( final String requestMessage, final Consumer setFailure, final boolean needLock) { - final Map dataNodeLocationMap = - 
env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.INVALIDATE_MATCHED_SCHEMA_CACHE, - new TInvalidateMatchedSchemaCacheReq(patternTreeBytes).setNeedLock(needLock), - dataNodeLocationMap); - CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - final Map statusMap = clientHandler.getResponseMap(); - for (final TSStatus status : statusMap.values()) { + // Proceed once every unreachable DataNode is provably self-fenced (it fails closed on its + // schema cache and resyncs on recovery, so it cannot serve the to-be-deleted/altered series), + // instead of hard-failing on the first unreachable DataNode. This runs before the physical + // delete in the state machine, so the "delete only after PROCEED" ordering holds. + if (!SchemaUtils.invalidateMatchedSchemaCache( + env.getConfigManager(), patternTreeBytes, needLock)) { // All dataNodes must clear the related schemaEngine cache - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.error( - ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_TIMESERIES, - requestMessage); - setFailure.accept( - new ProcedureException( - new MetadataException(ProcedureMessages.INVALIDATE_SCHEMAENGINE_CACHE_FAILED))); - return; - } + LOGGER.error( + ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_TIMESERIES, requestMessage); + setFailure.accept( + new ProcedureException( + new MetadataException(ProcedureMessages.INVALIDATE_SCHEMAENGINE_CACHE_FAILED))); } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SchemaUtils.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SchemaUtils.java index 4b8d0a533afe3..a8b5437e585ad 100644 --- 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SchemaUtils.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SchemaUtils.java @@ -36,6 +36,7 @@ import org.apache.iotdb.confignode.consensus.request.ConfigPhysicalPlan; import org.apache.iotdb.confignode.i18n.ProcedureMessages; import org.apache.iotdb.confignode.manager.ConfigManager; +import org.apache.iotdb.confignode.manager.lease.ClusterCachePropagator; import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; import org.apache.iotdb.consensus.exception.ConsensusException; import org.apache.iotdb.db.exception.metadata.PathNotExistException; @@ -43,7 +44,9 @@ import org.apache.iotdb.mpp.rpc.thrift.TCheckSchemaRegionUsingTemplateResp; import org.apache.iotdb.mpp.rpc.thrift.TCountPathsUsingTemplateReq; import org.apache.iotdb.mpp.rpc.thrift.TCountPathsUsingTemplateResp; +import org.apache.iotdb.mpp.rpc.thrift.TInvalidateMatchedSchemaCacheReq; import org.apache.iotdb.mpp.rpc.thrift.TUpdateTableReq; +import org.apache.iotdb.mpp.rpc.thrift.TUpdateTemplateReq; import org.apache.iotdb.rpc.RpcUtils; import org.apache.iotdb.rpc.TSStatusCode; @@ -60,6 +63,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.function.Supplier; import java.util.stream.Collectors; public class SchemaUtils { @@ -240,23 +244,32 @@ protected void onAllReplicasetFailure( } } - public static Map preReleaseTable( - final String database, - final TsTable table, - final ConfigManager configManager, - final String oldName) { + /** Build the PRE_UPDATE_TABLE request used to pre-release a table change to DataNodes. 
*/ + public static TUpdateTableReq BuildPreUpdateTableReq( + final String database, final TsTable table, final String oldName) { final TUpdateTableReq req = new TUpdateTableReq(); req.setType(TsTableInternalRPCType.PRE_UPDATE_TABLE.getOperationType()); req.setTableInfo(TsTableInternalRPCUtil.serializeSingleTsTableWithDatabase(database, table)); req.setOldName(oldName); + return req; + } - final Map dataNodeLocationMap = - configManager.getNodeManager().getRegisteredDataNodeLocations(); + /** + * Broadcast a table update to exactly {@code targets} and return the full per-nodeId response map + * (both successes and failures). Used by {@link + * org.apache.iotdb.confignode.manager.lease.ClusterCachePropagator}, which needs to know which + * DataNodes acknowledged in order to decide whether it is safe to proceed past the rest. + */ + public static Map broadcastTableUpdate( + final TUpdateTableReq req, final Map targets) { final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.UPDATE_TABLE, req, dataNodeLocationMap); + new DataNodeAsyncRequestContext<>(CnToDnAsyncRequestType.UPDATE_TABLE, req, targets); CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - return clientHandler.getResponseMap().entrySet().stream() + return clientHandler.getResponseMap(); + } + + private static Map failedOnly(final Map responses) { + return responses.entrySet().stream() .filter(entry -> entry.getValue().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); } @@ -289,11 +302,9 @@ public static Map commitReleaseTable( .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); } - public static Map rollbackPreRelease( - final String database, - final String tableName, - final ConfigManager configManager, - final @Nullable String oldName) { + /** Build the ROLLBACK_UPDATE_TABLE request used to roll back a 
pre-released table change. */ + public static TUpdateTableReq rollbackUpdateTableReq( + final String database, final String tableName, final String oldName) { final TUpdateTableReq req = new TUpdateTableReq(); req.setType(TsTableInternalRPCType.ROLLBACK_UPDATE_TABLE.getOperationType()); final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); @@ -305,16 +316,73 @@ public static Map rollbackPreRelease( } req.setTableInfo(outputStream.toByteArray()); req.setOldName(oldName); + return req; + } - final Map dataNodeLocationMap = - configManager.getNodeManager().getRegisteredDataNodeLocations(); - final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.UPDATE_TABLE, req, dataNodeLocationMap); - CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - return clientHandler.getResponseMap().entrySet().stream() - .filter(entry -> entry.getValue().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) - .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + public static Map rollbackPreRelease( + final String database, + final String tableName, + final ConfigManager configManager, + final @Nullable String oldName) { + return failedOnly( + broadcastTableUpdate( + rollbackUpdateTableReq(database, tableName, oldName), + configManager.getNodeManager().getRegisteredDataNodeLocations())); + } + + /** + * Broadcast an INVALIDATE_MATCHED_SCHEMA_CACHE to all DataNodes through {@link + * ClusterCachePropagator}: proceed once every unreachable DataNode is provably self-fenced (it + * fails closed on its schema cache and resyncs on recovery, so it cannot serve the + * deleted/altered series), instead of hard-failing on the first unreachable DataNode. Returns + * whether it is safe to proceed; the caller maps {@code false} to its own failure. + * + *

The propagator may re-broadcast while waiting for unacked DataNodes, so a fresh request with + * a duplicated buffer is built on each attempt — a consumed buffer can never be re-sent as an + * empty (and silently-successful) invalidation. + */ + public static boolean invalidateMatchedSchemaCache( + final ConfigManager configManager, + final ByteBuffer patternTreeBytes, + final boolean needLock) { + return new ClusterCachePropagator(configManager) + .propagate( + targets -> { + final DataNodeAsyncRequestContext + clientHandler = + new DataNodeAsyncRequestContext<>( + CnToDnAsyncRequestType.INVALIDATE_MATCHED_SCHEMA_CACHE, + new TInvalidateMatchedSchemaCacheReq(patternTreeBytes.duplicate()) + .setNeedLock(needLock), + targets); + CnToDnInternalServiceAsyncRequestManager.getInstance() + .sendAsyncRequestWithRetry(clientHandler); + return clientHandler.getResponseMap(); + }); + } + + /** + * Broadcast an UPDATE_TEMPLATE to all DataNodes through {@link ClusterCachePropagator}: proceed + * once every unreachable DataNode is provably self-fenced (it fails closed on its template cache + * and resyncs on recovery), instead of hard-failing on the first unreachable DataNode. Returns + * whether it is safe to proceed. + * + *

The request is rebuilt from {@code requestSupplier} on every attempt: the propagator may + * re-broadcast while waiting, and {@code TUpdateTemplateReq}'s binary field is backed by a {@link + * ByteBuffer}, so reusing one request could re-send a consumed (empty) payload. + */ + public static boolean broadcastTemplateUpdate( + final ConfigManager configManager, final Supplier requestSupplier) { + return new ClusterCachePropagator(configManager) + .propagate( + targets -> { + final DataNodeAsyncRequestContext clientHandler = + new DataNodeAsyncRequestContext<>( + CnToDnAsyncRequestType.UPDATE_TEMPLATE, requestSupplier.get(), targets); + CnToDnInternalServiceAsyncRequestManager.getInstance() + .sendAsyncRequestWithRetry(clientHandler); + return clientHandler.getResponseMap(); + }); } public static TSStatus executeInConsensusLayer( diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTTLProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTTLProcedure.java index dca79a02366f6..31641528269af 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTTLProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTTLProcedure.java @@ -34,6 +34,7 @@ import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeEnrichedPlan; import org.apache.iotdb.confignode.i18n.ConfigNodeMessages; import org.apache.iotdb.confignode.i18n.ProcedureMessages; +import org.apache.iotdb.confignode.manager.lease.ClusterCachePropagator; import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; import org.apache.iotdb.confignode.procedure.exception.ProcedureException; import org.apache.iotdb.confignode.procedure.impl.StateMachineProcedure; @@ -115,13 +116,8 @@ void setConfigNodeTTL(final ConfigNodeProcedureEnv env) { } void updateDataNodeTTL(final ConfigNodeProcedureEnv env) { - 
final Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - final DataNodeAsyncRequestContext clientHandler = - sendTTLRequest( - dataNodeLocationMap, - buildSetTTLReq(plan.getPathPattern(), plan.getTTL(), plan.isDataBase())); - if (hasFailedDataNode(clientHandler)) { + if (!broadcastTTLAndDecide( + env, buildSetTTLReq(plan.getPathPattern(), plan.getTTL(), plan.isDataBase()))) { LOGGER.error(ProcedureMessages.FAILED_TO_UPDATE_TTL_CACHE_OF_DATANODE); setFailure( new ProcedureException( @@ -129,6 +125,17 @@ void updateDataNodeTTL(final ConfigNodeProcedureEnv env) { } } + /** + * Broadcast the TTL update to all DataNodes and decide whether it is safe to proceed: proceed + * once every unreachable DataNode is provably self-fenced (it fails closed on TTL in compaction + * and resyncs on recovery) instead of hard-failing on the first unreachable DataNode. + * Package-private and overridable for tests. + */ + boolean broadcastTTLAndDecide(final ConfigNodeProcedureEnv env, final TSetTTLReq req) { + return new ClusterCachePropagator(env.getConfigManager()) + .propagate(targets -> sendTTLRequest(targets, req).getResponseMap()); + } + private void capturePreviousTTLState(final ConfigNodeProcedureEnv env) { if (previousTTLStateCaptured) { return; @@ -168,19 +175,6 @@ private TSetTTLReq buildSetTTLReq( Collections.singletonList(String.join(".", pathPattern)), ttl, isDataBase); } - private boolean hasFailedDataNode( - final DataNodeAsyncRequestContext clientHandler) { - if (!clientHandler.getRequestIndices().isEmpty()) { - return true; - } - for (TSStatus status : clientHandler.getResponseMap().values()) { - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - return true; - } - } - return false; - } - private long getTTLOrDefault(final ConfigNodeProcedureEnv env, final String[] pathPattern) { final long ttl = env.getConfigManager().getTTLManager().getTTL(pathPattern); return ttl == TTLCache.NULL_TTL ? 
TTL_NOT_EXIST : ttl; @@ -220,30 +214,20 @@ private void restoreTTLOnConfigNode( } private void rollbackDataNodeTTL(final ConfigNodeProcedureEnv env) throws ProcedureException { - final Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - restoreTTLOnDataNodes(dataNodeLocationMap, plan.getPathPattern(), previousTTL); + restoreTTLOnDataNodes(env, plan.getPathPattern(), previousTTL); if (plan.isDataBase()) { restoreTTLOnDataNodes( - dataNodeLocationMap, - getDatabaseWildcardPathPattern(plan.getPathPattern()), - previousDatabaseWildcardTTL); + env, getDatabaseWildcardPathPattern(plan.getPathPattern()), previousDatabaseWildcardTTL); } } private void restoreTTLOnDataNodes( - final Map dataNodeLocationMap, - final String[] pathPattern, - final long ttl) + final ConfigNodeProcedureEnv env, final String[] pathPattern, final long ttl) throws ProcedureException { - if (dataNodeLocationMap.isEmpty()) { - return; - } - final DataNodeAsyncRequestContext clientHandler = - sendTTLRequest( - dataNodeLocationMap, - buildSetTTLReq(pathPattern, ttl == TTL_NOT_EXIST ? TTLCache.NULL_TTL : ttl, false)); - if (hasFailedDataNode(clientHandler)) { + // Same proceed-past-fenced semantics as the forward update: a down DataNode must not block + // rollback (it resyncs TTL on recovery); only a live unacked DataNode fails it. + if (!broadcastTTLAndDecide( + env, buildSetTTLReq(pathPattern, ttl == TTL_NOT_EXIST ? 
TTLCache.NULL_TTL : ttl, false))) { throw new ProcedureException( new MetadataException( "Rollback dataNode ttl cache failed for " + String.join(".", pathPattern))); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTemplateProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTemplateProcedure.java index 55fffedad6145..8f24fe92eefe7 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTemplateProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTemplateProcedure.java @@ -215,30 +215,25 @@ private void preReleaseTemplate(final ConfigNodeProcedureEnv env) { return; } - final TUpdateTemplateReq req = new TUpdateTemplateReq(); - req.setType(TemplateInternalRPCUpdateType.ADD_TEMPLATE_PRE_SET_INFO.toByte()); - req.setTemplateInfo( - TemplateInternalRPCUtil.generateAddTemplateSetInfoBytes(template, templateSetPath)); - - final Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.UPDATE_TEMPLATE, req, dataNodeLocationMap); - CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - final Map statusMap = clientHandler.getResponseMap(); - for (final Map.Entry entry : statusMap.entrySet()) { - if (entry.getValue().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.warn( - ProcedureMessages.FAILED_TO_SYNC_TEMPLATE_PRE_SET_INFO_ON_PATH_TO, - templateName, - templateSetPath, - dataNodeLocationMap.get(entry.getKey())); - setFailure( - new ProcedureException( - new MetadataException(ProcedureMessages.PRE_SET_TEMPLATE_FAILED))); - return; - } + // Proceed once every unreachable DataNode is provably self-fenced (it fails closed on its + // template cache and 
resyncs on recovery) instead of hard-failing on the first unreachable one. + if (!SchemaUtils.broadcastTemplateUpdate( + env.getConfigManager(), + () -> { + final TUpdateTemplateReq req = new TUpdateTemplateReq(); + req.setType(TemplateInternalRPCUpdateType.ADD_TEMPLATE_PRE_SET_INFO.toByte()); + req.setTemplateInfo( + TemplateInternalRPCUtil.generateAddTemplateSetInfoBytes(template, templateSetPath)); + return req; + })) { + LOGGER.warn( + ProcedureMessages.FAILED_TO_SYNC_TEMPLATE_PRE_SET_INFO_ON_PATH_TO, + templateName, + templateSetPath, + "an unreachable DataNode is not provably fenced"); + setFailure( + new ProcedureException(new MetadataException(ProcedureMessages.PRE_SET_TEMPLATE_FAILED))); + return; } setNextState(SetTemplateState.VALIDATE_TIMESERIES_EXISTENCE); } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/UnsetTemplateProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/UnsetTemplateProcedure.java index 1fd7aefb33065..d7bb9d0894660 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/UnsetTemplateProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/UnsetTemplateProcedure.java @@ -155,29 +155,24 @@ private void invalidateCache(final ConfigNodeProcedureEnv env) { } private void executeInvalidateCache(final ConfigNodeProcedureEnv env) throws ProcedureException { - final Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - final TUpdateTemplateReq invalidateTemplateSetInfoReq = new TUpdateTemplateReq(); - invalidateTemplateSetInfoReq.setType( - TemplateInternalRPCUpdateType.INVALIDATE_TEMPLATE_SET_INFO.toByte()); - invalidateTemplateSetInfoReq.setTemplateInfo(getInvalidateTemplateSetInfo()); - final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - 
CnToDnAsyncRequestType.UPDATE_TEMPLATE, - invalidateTemplateSetInfoReq, - dataNodeLocationMap); - CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - final Map statusMap = clientHandler.getResponseMap(); - for (final TSStatus status : statusMap.values()) { + // Proceed once every unreachable DataNode is provably self-fenced (it fails closed on its + // template cache and resyncs on recovery) instead of hard-failing on the first unreachable one. + if (!SchemaUtils.broadcastTemplateUpdate( + env.getConfigManager(), + () -> { + final TUpdateTemplateReq invalidateTemplateSetInfoReq = new TUpdateTemplateReq(); + invalidateTemplateSetInfoReq.setType( + TemplateInternalRPCUpdateType.INVALIDATE_TEMPLATE_SET_INFO.toByte()); + invalidateTemplateSetInfoReq.setTemplateInfo(getInvalidateTemplateSetInfo()); + return invalidateTemplateSetInfoReq; + })) { // all dataNodes must clear the related template cache - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.error( - ProcedureMessages.FAILED_TO_INVALIDATE_TEMPLATE_CACHE_OF_TEMPLATE_SET_ON, - template.getName(), - path); - throw new ProcedureException( - new MetadataException(ProcedureMessages.INVALIDATE_TEMPLATE_CACHE_FAILED)); - } + LOGGER.error( + ProcedureMessages.FAILED_TO_INVALIDATE_TEMPLATE_CACHE_OF_TEMPLATE_SET_ON, + template.getName(), + path); + throw new ProcedureException( + new MetadataException(ProcedureMessages.INVALIDATE_TEMPLATE_CACHE_FAILED)); } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/AbstractAlterOrDropTableProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/AbstractAlterOrDropTableProcedure.java index 7cf1ff1c24f83..406bdfd5909c3 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/AbstractAlterOrDropTableProcedure.java +++ 
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/AbstractAlterOrDropTableProcedure.java @@ -27,11 +27,13 @@ import org.apache.iotdb.commons.schema.table.TsTable; import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType; import org.apache.iotdb.confignode.i18n.ProcedureMessages; +import org.apache.iotdb.confignode.manager.lease.ClusterCachePropagator; import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; import org.apache.iotdb.confignode.procedure.exception.ProcedureException; import org.apache.iotdb.confignode.procedure.impl.StateMachineProcedure; import org.apache.iotdb.confignode.procedure.impl.schema.DataNodeTSStatusTaskExecutor; import org.apache.iotdb.confignode.procedure.impl.schema.SchemaUtils; +import org.apache.iotdb.mpp.rpc.thrift.TUpdateTableReq; import org.apache.tsfile.utils.ReadWriteIOUtils; import org.slf4j.Logger; @@ -91,17 +93,22 @@ protected void preRelease(final ConfigNodeProcedureEnv env) { } protected void preRelease(final ConfigNodeProcedureEnv env, final @Nullable String oldName) { - final Map failedResults = - SchemaUtils.preReleaseTable(database, table, env.getConfigManager(), oldName); - - if (!failedResults.isEmpty()) { - // All dataNodes must clear the related schema cache + // Proceed once every unreachable DataNode is provably self-fenced instead of hard-failing the + // DDL: a fenced DataNode fails closed on its now-stale table cache and resyncs on lease + // recovery, so it cannot serve dirty schema. Only fail if an unacked DataNode is not provably + // fenced (it may still be serving clients). 
+ final TUpdateTableReq req = SchemaUtils.BuildPreUpdateTableReq(database, table, oldName); + final boolean proceeded = + new ClusterCachePropagator(env.getConfigManager()) + .propagate(targets -> SchemaUtils.broadcastTableUpdate(req, targets)); + + if (!proceeded) { LOGGER.warn( ProcedureMessages.FAILED_TO_PRE_RELEASE_FOR_TABLE_TO_DATANODE_FAILURE_RESULTS, getActionMessage(), database, table.getTableName(), - failedResults); + ProcedureMessages.FAILED_TO_PROVE_DN_IS_FENCED); setFailure( new ProcedureException( new MetadataException( @@ -138,18 +145,21 @@ protected void rollbackPreRelease(final ConfigNodeProcedureEnv env) { protected void rollbackPreRelease( final ConfigNodeProcedureEnv env, final @Nullable String tableName) { - final Map failedResults = - SchemaUtils.rollbackPreRelease( - database, table.getTableName(), env.getConfigManager(), tableName); - - if (!failedResults.isEmpty()) { - // All dataNodes must clear the related schema cache + // A down DataNode must not block rollback either: proceed past provably-fenced DataNodes (which + // resync on recovery) and only fail on an unacked DataNode that is not provably fenced. 
+ final TUpdateTableReq req = + SchemaUtils.rollbackUpdateTableReq(database, table.getTableName(), tableName); + final boolean proceeded = + new ClusterCachePropagator(env.getConfigManager()) + .propagate(targets -> SchemaUtils.broadcastTableUpdate(req, targets)); + + if (!proceeded) { LOGGER.warn( ProcedureMessages.FAILED_TO_ROLLBACK_PRE_RELEASE_FOR_TABLE_INFO_TO_DATANODE, getActionMessage(), database, table.getTableName(), - failedResults); + "an unreachable DataNode is not provably fenced"); setFailure( new ProcedureException( new MetadataException( diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/CreateTableProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/CreateTableProcedure.java index 05e0facb3018e..0d6db3a9c4fd2 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/CreateTableProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/CreateTableProcedure.java @@ -29,6 +29,7 @@ import org.apache.iotdb.confignode.consensus.request.write.table.RollbackCreateTablePlan; import org.apache.iotdb.confignode.exception.DatabaseNotExistsException; import org.apache.iotdb.confignode.i18n.ProcedureMessages; +import org.apache.iotdb.confignode.manager.lease.ClusterCachePropagator; import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; import org.apache.iotdb.confignode.procedure.exception.ProcedureException; import org.apache.iotdb.confignode.procedure.impl.StateMachineProcedure; @@ -36,6 +37,7 @@ import org.apache.iotdb.confignode.procedure.state.schema.CreateTableState; import org.apache.iotdb.confignode.procedure.store.ProcedureType; import org.apache.iotdb.confignode.rpc.thrift.TDatabaseSchema; +import org.apache.iotdb.mpp.rpc.thrift.TUpdateTableReq; import org.apache.iotdb.rpc.TSStatusCode; import 
org.apache.tsfile.utils.ReadWriteIOUtils; @@ -151,16 +153,22 @@ protected void preCreateTable(final ConfigNodeProcedureEnv env) { } private void preReleaseTable(final ConfigNodeProcedureEnv env) { - final Map failedResults = - SchemaUtils.preReleaseTable(database, table, env.getConfigManager(), null); - - if (!failedResults.isEmpty()) { - // All dataNodes must clear the related schema cache + // Broadcast the pre-update to all DataNodes. Instead of failing whenever any DataNode is + // unreachable, proceed once every unacked DataNode is provably self-fenced: such a DataNode + // fails closed on its (now-stale) table cache and resyncs on lease recovery, so it cannot serve + // dirty schema. Only fail if an unacked DataNode is not provably fenced (it may still be + // serving clients). + final TUpdateTableReq req = SchemaUtils.BuildPreUpdateTableReq(database, table, null); + final boolean proceeded = + new ClusterCachePropagator(env.getConfigManager()) + .propagate(targets -> SchemaUtils.broadcastTableUpdate(req, targets)); + + if (!proceeded) { LOGGER.warn( ProcedureMessages.FAILED_TO_SYNC_TABLE_PRE_CREATE_INFO_TO_DATANODE_FAILURE, database, table.getTableName(), - failedResults); + "an unreachable DataNode is not provably fenced"); setFailure( new ProcedureException(new MetadataException(ProcedureMessages.PRE_CREATE_TABLE_FAILED))); return; @@ -240,17 +248,19 @@ protected void rollbackCreate(final ConfigNodeProcedureEnv env) { } private void rollbackPreRelease(final ConfigNodeProcedureEnv env) { - final Map failedResults = - SchemaUtils.rollbackPreRelease( - database, table.getTableName(), env.getConfigManager(), null); - - if (!failedResults.isEmpty()) { - // All dataNodes must clear the related schema cache + // A down DataNode must not block rollback if it is already provably self-fenced. 
+ final TUpdateTableReq req = + SchemaUtils.rollbackUpdateTableReq(database, table.getTableName(), null); + final boolean proceeded = + new ClusterCachePropagator(env.getConfigManager()) + .propagate(targets -> SchemaUtils.broadcastTableUpdate(req, targets)); + + if (!proceeded) { LOGGER.warn( ProcedureMessages.FAILED_TO_SYNC_TABLE_ROLLBACK_CREATE_INFO_TO_DATANODE_FAILURE, database, table.getTableName(), - failedResults); + ProcedureMessages.FAILED_TO_PROVE_DN_IS_FENCED); setFailure( new ProcedureException( new MetadataException(ProcedureMessages.ROLLBACK_CREATE_TABLE_FAILED))); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/DeleteDevicesProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/DeleteDevicesProcedure.java index 66c1687d5d086..9f2bb0bbfc177 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/DeleteDevicesProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/DeleteDevicesProcedure.java @@ -32,6 +32,7 @@ import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeEnrichedPlan; import org.apache.iotdb.confignode.i18n.ProcedureMessages; import org.apache.iotdb.confignode.manager.ClusterManager; +import org.apache.iotdb.confignode.manager.lease.ClusterCachePropagator; import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; import org.apache.iotdb.confignode.procedure.exception.ProcedureException; import org.apache.iotdb.confignode.procedure.impl.schema.DataNodeTSStatusTaskExecutor; @@ -222,32 +223,35 @@ protected void onAllReplicasetFailure( } private void invalidateCache(final ConfigNodeProcedureEnv env) { - final Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - 
CnToDnAsyncRequestType.INVALIDATE_MATCHED_TABLE_DEVICE_CACHE, - new TTableDeviceInvalidateCacheReq(database, tableName, ByteBuffer.wrap(patternBytes)), - dataNodeLocationMap); - CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - final Map statusMap = clientHandler.getResponseMap(); - for (final TSStatus status : statusMap.values()) { - // All dataNodes must clear the related schemaEngine cache - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.error( - ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_DEVICES_IN_TABLE, - database, - tableName); - setFailure( - new ProcedureException( - new MetadataException(ProcedureMessages.INVALIDATE_SCHEMAENGINE_CACHE_FAILED))); - return; - } + TTableDeviceInvalidateCacheReq req = + new TTableDeviceInvalidateCacheReq(database, tableName, ByteBuffer.wrap(patternBytes)); + boolean proceeded = + new ClusterCachePropagator(env.getConfigManager()) + .propagate(targets -> broadCastInvalidateCache(req, targets)); + + if (!proceeded) { + LOGGER.error( + ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_DEVICES_IN_TABLE, + database, + tableName); + setFailure( + new ProcedureException( + new MetadataException(ProcedureMessages.INVALIDATE_SCHEMAENGINE_CACHE_FAILED))); + return; } setNextState(DELETE_DATA); } + private Map broadCastInvalidateCache( + final TTableDeviceInvalidateCacheReq req, final Map targets) { + final DataNodeAsyncRequestContext clientHandler = + new DataNodeAsyncRequestContext<>( + CnToDnAsyncRequestType.INVALIDATE_MATCHED_TABLE_DEVICE_CACHE, req, targets); + CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); + return clientHandler.getResponseMap(); + } + private void deleteData(final ConfigNodeProcedureEnv env) { new TableRegionTaskExecutor<>( "delete data for table device", diff --git 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/DropTableColumnProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/DropTableColumnProcedure.java index 4e4d0e758bf7f..d06a275513d73 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/DropTableColumnProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/DropTableColumnProcedure.java @@ -33,6 +33,7 @@ import org.apache.iotdb.confignode.consensus.request.write.table.view.CommitDeleteViewColumnPlan; import org.apache.iotdb.confignode.consensus.request.write.table.view.PreDeleteViewColumnPlan; import org.apache.iotdb.confignode.i18n.ProcedureMessages; +import org.apache.iotdb.confignode.manager.lease.ClusterCachePropagator; import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; import org.apache.iotdb.confignode.procedure.exception.ProcedureException; import org.apache.iotdb.confignode.procedure.impl.schema.SchemaUtils; @@ -150,36 +151,29 @@ private void checkAndPreDeleteColumn(final ConfigNodeProcedureEnv env) { } private void invalidateCache(final ConfigNodeProcedureEnv env) { - final Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.INVALIDATE_COLUMN_CACHE, - new TInvalidateColumnCacheReq(database, tableName, columnName, isAttributeColumn), - dataNodeLocationMap); - CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - final Map statusMap = clientHandler.getResponseMap(); - for (final TSStatus status : statusMap.values()) { - // All dataNodes must clear the related schemaEngine cache - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.error( - 
ProcedureMessages.FAILED_TO_INVALIDATE_COLUMN_S_CACHE_OF_TABLE, - isAttributeColumn ? "attribute" : "measurement", - columnName, - database, - tableName); - setFailure( - new ProcedureException( - new MetadataException( - String.format( - ProcedureMessages.INVALIDATE_COLUMN_CACHE_FAILED_FOR_TABLE, - columnName, - database, - tableName)))); - return; - } - } + TInvalidateColumnCacheReq req = + new TInvalidateColumnCacheReq(database, tableName, columnName, isAttributeColumn); + final boolean proceeded = + new ClusterCachePropagator(env.getConfigManager()) + .propagate(targets -> broadCastInvalidateCache(req, targets)); + if (!proceeded) { + LOGGER.error( + ProcedureMessages.FAILED_TO_INVALIDATE_COLUMN_S_CACHE_OF_TABLE, + isAttributeColumn ? "attribute" : "measurement", + columnName, + database, + tableName); + setFailure( + new ProcedureException( + new MetadataException( + String.format( + ProcedureMessages.INVALIDATE_COLUMN_CACHE_FAILED_FOR_TABLE, + columnName, + database, + tableName)))); + return; + } // View does not need to be executed on regions setNextState( this instanceof DropViewColumnProcedure @@ -187,6 +181,16 @@ private void invalidateCache(final ConfigNodeProcedureEnv env) { : DropTableColumnState.EXECUTE_ON_REGIONS); } + private Map broadCastInvalidateCache( + TInvalidateColumnCacheReq req, Map targets) { + + final DataNodeAsyncRequestContext clientHandler = + new DataNodeAsyncRequestContext<>( + CnToDnAsyncRequestType.INVALIDATE_COLUMN_CACHE, req, targets); + CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); + return clientHandler.getResponseMap(); + } + private void executeOnRegions(final ConfigNodeProcedureEnv env) { final Map relatedRegionGroup = isAttributeColumn diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/DropTableProcedure.java 
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/DropTableProcedure.java index ac3ca2ef54fe7..94c36a3b7920d 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/DropTableProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/DropTableProcedure.java @@ -25,14 +25,17 @@ import org.apache.iotdb.common.rpc.thrift.TSStatus; import org.apache.iotdb.commons.exception.IoTDBException; import org.apache.iotdb.commons.exception.MetadataException; +import org.apache.iotdb.commons.schema.table.PreDeleteTsTable; import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType; import org.apache.iotdb.confignode.client.async.CnToDnInternalServiceAsyncRequestManager; import org.apache.iotdb.confignode.client.async.handlers.DataNodeAsyncRequestContext; import org.apache.iotdb.confignode.consensus.request.write.table.CommitDeleteTablePlan; import org.apache.iotdb.confignode.consensus.request.write.table.PreDeleteTablePlan; +import org.apache.iotdb.confignode.consensus.request.write.table.RollbackPreDeleteTablePlan; import org.apache.iotdb.confignode.consensus.request.write.table.view.CommitDeleteViewPlan; import org.apache.iotdb.confignode.consensus.request.write.table.view.PreDeleteViewPlan; import org.apache.iotdb.confignode.i18n.ProcedureMessages; +import org.apache.iotdb.confignode.manager.lease.ClusterCachePropagator; import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; import org.apache.iotdb.confignode.procedure.exception.ProcedureException; import org.apache.iotdb.confignode.procedure.impl.schema.SchemaUtils; @@ -67,10 +70,9 @@ public DropTableProcedure( super(database, tableName, queryId, isGeneratedByPipe); } - // Not used @Override protected String getActionMessage() { - return null; + return "drop table"; } @Override @@ -86,12 +88,10 @@ protected Flow executeFromState(final 
ConfigNodeProcedureEnv env, final DropTabl tableName); checkAndPreDeleteTable(env); break; - case INVALIDATE_CACHE: + case PRE_DELETE: LOGGER.info( - ProcedureMessages.INVALIDATING_CACHE_FOR_TABLE_WHEN_DROPPING_TABLE, - database, - tableName); - invalidateCache(env); + ProcedureMessages.PRE_RELEASE_DELETE_TABLE_WHEN_DROPPING_TABLE, database, tableName); + preDelete(env); break; case DELETE_DATA: LOGGER.info(ProcedureMessages.DELETING_DATA_FOR_TABLE, database, tableName); @@ -107,6 +107,13 @@ protected Flow executeFromState(final ConfigNodeProcedureEnv env, final DropTabl case DROP_TABLE: LOGGER.info(ProcedureMessages.DROPPING_TABLE_ON_CONFIGNODE, database, tableName); dropTable(env); + break; + case COMMIT_DELETE: + LOGGER.info( + ProcedureMessages.COMMIT_RELEASE_DELETE_TABLE_WHEN_DROPPING_TABLE, + database, + tableName); + commitRelease(env); return Flow.NO_MORE_STATE; default: setFailure(new ProcedureException(ProcedureMessages.UNRECOGNIZED_DROPTABLESTATE + state)); @@ -132,40 +139,39 @@ private void checkAndPreDeleteTable(final ConfigNodeProcedureEnv env) { env, LOGGER); if (status.getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - setNextState(DropTableState.INVALIDATE_CACHE); + setNextState(DropTableState.PRE_DELETE); + table = new PreDeleteTsTable(tableName); } else { setFailure(new ProcedureException(new IoTDBException(status))); } } - private void invalidateCache(final ConfigNodeProcedureEnv env) { - final Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.INVALIDATE_TABLE_CACHE, - new TInvalidateTableCacheReq(database, tableName), - dataNodeLocationMap); - CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - final Map statusMap = clientHandler.getResponseMap(); - for (final TSStatus status : statusMap.values()) { - // All dataNodes must 
clear the related schemaEngine cache - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.error( - ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_TABLE, - database, - tableName); - setFailure( - new ProcedureException( - new MetadataException(ProcedureMessages.INVALIDATE_SCHEMAENGINE_CACHE_FAILED))); - return; - } - } + private void preDelete(final ConfigNodeProcedureEnv env) { + TInvalidateTableCacheReq req = new TInvalidateTableCacheReq(database, tableName); + final boolean proceeded = + new ClusterCachePropagator(env.getConfigManager()) + .propagate(targets -> broadCastInvalidateCache(req, targets)); + if (!proceeded) { + LOGGER.error( + ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_TABLE, database, tableName); + setFailure( + new ProcedureException( + new MetadataException(ProcedureMessages.INVALIDATE_SCHEMAENGINE_CACHE_FAILED))); + return; + } setNextState( this instanceof DropViewProcedure ? DropTableState.DROP_TABLE : DropTableState.DELETE_DATA); } + private Map broadCastInvalidateCache( + final TInvalidateTableCacheReq req, final Map targets) { + final DataNodeAsyncRequestContext clientHandler = + new DataNodeAsyncRequestContext<>(CnToDnAsyncRequestType.PRE_DELETE_TABLE, req, targets); + CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); + return clientHandler.getResponseMap(); + } + private void deleteData(final ConfigNodeProcedureEnv env) { final Map relatedDataRegionGroup = env.getConfigManager().getRelatedDataRegionGroup4TableModel(database); @@ -215,19 +221,29 @@ private void dropTable(final ConfigNodeProcedureEnv env) { isGeneratedByPipe); if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { setFailure(new ProcedureException(new IoTDBException(status))); + } else { + setNextState(DropTableState.COMMIT_DELETE); } } @Override protected boolean isRollbackSupported(final DropTableState state) { - return false; + return state == 
DropTableState.CHECK_AND_INVALIDATE_TABLE || state == DropTableState.PRE_DELETE; } @Override - protected void rollbackState( - final ConfigNodeProcedureEnv configNodeProcedureEnv, final DropTableState dropTableState) + protected void rollbackState(final ConfigNodeProcedureEnv env, final DropTableState state) throws IOException, InterruptedException, ProcedureException { - // Do nothing + if (state == DropTableState.PRE_DELETE) { + final TSStatus status = + SchemaUtils.executeInConsensusLayer( + new RollbackPreDeleteTablePlan(database, tableName), env, LOGGER); + if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + throw new ProcedureException( + String.format(ProcedureMessages.ROLLBACK_PRE_DELETE_TABLE_FAILED, database, tableName)); + } + } + // CHECK_AND_INVALIDATE_TABLE: consensus plan failed so no state changed, nothing to revert } @Override diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/schema/DropTableState.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/schema/DropTableState.java index 1a4a7b222dc33..804e02344acf4 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/schema/DropTableState.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/schema/DropTableState.java @@ -21,8 +21,9 @@ public enum DropTableState { CHECK_AND_INVALIDATE_TABLE, - INVALIDATE_CACHE, + PRE_DELETE, DELETE_DATA, DELETE_DEVICES, DROP_TABLE, + COMMIT_DELETE } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/thrift/ConfigNodeRPCServiceProcessor.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/thrift/ConfigNodeRPCServiceProcessor.java index 1d0b7612dbd6e..d23c4db9d4891 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/thrift/ConfigNodeRPCServiceProcessor.java +++ 
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/thrift/ConfigNodeRPCServiceProcessor.java @@ -46,6 +46,7 @@ import org.apache.iotdb.commons.path.PartialPath; import org.apache.iotdb.commons.path.PathPatternTree; import org.apache.iotdb.commons.schema.SchemaConstant; +import org.apache.iotdb.commons.schema.table.TableNodeStatus; import org.apache.iotdb.commons.utils.AuthUtils; import org.apache.iotdb.commons.utils.StatusUtils; import org.apache.iotdb.commons.utils.TestOnly; @@ -127,6 +128,7 @@ import org.apache.iotdb.confignode.rpc.thrift.TCreateTopicReq; import org.apache.iotdb.confignode.rpc.thrift.TCreateTriggerReq; import org.apache.iotdb.confignode.rpc.thrift.TDataNodeConfigurationResp; +import org.apache.iotdb.confignode.rpc.thrift.TDataNodeLeaseRecoveryResp; import org.apache.iotdb.confignode.rpc.thrift.TDataNodeRegisterReq; import org.apache.iotdb.confignode.rpc.thrift.TDataNodeRegisterResp; import org.apache.iotdb.confignode.rpc.thrift.TDataNodeRemoveReq; @@ -336,6 +338,13 @@ public TDataNodeRestartResp restartDataNode(TDataNodeRestartReq req) { return resp; } + @Override + public TDataNodeLeaseRecoveryResp reloadCacheAfterLeaseRecovery() throws TException { + final TDataNodeLeaseRecoveryResp resp = configManager.reloadCacheAfterLeaseRecovery(); + LOGGER.info("Execute getMetaDataCache with result {}", resp.getStatus()); + return resp; + } + @Override public TAINodeRegisterResp registerAINode(TAINodeRegisterReq req) { TAINodeRegisterResp resp = @@ -1495,8 +1504,9 @@ public TDescTable4InformationSchemaResp descTables4InformationSchema() { } @Override - public TFetchTableResp fetchTables(final Map> fetchTableMap) { - return configManager.fetchTables(fetchTableMap); + public TFetchTableResp fetchTables( + final Map> fetchTableMap, final byte tableNodeStatus) { + return configManager.fetchTables(fetchTableMap, TableNodeStatus.deserialize(tableNodeStatus)); } @Override diff --git 
a/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/ClusterCachePropagatorTest.java b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/ClusterCachePropagatorTest.java new file mode 100644 index 0000000000000..c8fea39c72971 --- /dev/null +++ b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/ClusterCachePropagatorTest.java @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.confignode.manager.lease; + +import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation; +import org.apache.iotdb.common.rpc.thrift.TSStatus; +import org.apache.iotdb.confignode.manager.lease.MetadataBroadcastVerdict.Verdict; +import org.apache.iotdb.rpc.TSStatusCode; + +import org.junit.Assert; +import org.junit.Test; + +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.function.IntToLongFunction; + +public class ClusterCachePropagatorTest { + + private final long FENCE_TIMEOUT_MS = 25_000L; + + private TSStatus success() { + return new TSStatus(TSStatusCode.SUCCESS_STATUS.getStatusCode()); + } + + private TSStatus error() { + return new TSStatus(TSStatusCode.INTERNAL_SERVER_ERROR.getStatusCode()); + } + + private TSStatus canNotConnect() { + return new TSStatus(TSStatusCode.CAN_NOT_CONNECT_DATANODE.getStatusCode()); + } + + private Map twoDataNodes() { + final Map map = new HashMap<>(); + map.put(1, new TDataNodeLocation().setDataNodeId(1)); + map.put(2, new TDataNodeLocation().setDataNodeId(2)); + return map; + } + + /** Build a propagator whose loop seams are inert (only propagateOnce is exercised). */ + private ClusterCachePropagator propagator(final IntToLongFunction hbAgeMs) { + return new ClusterCachePropagator( + this::twoDataNodes, hbAgeMs, () -> FENCE_TIMEOUT_MS, () -> 0L, ms -> {}); + } + + @Test + public void allAckedProceeds() { + final ClusterCachePropagator p = propagator(id -> 0L); + final Verdict v = + p.propagateOnce( + targets -> { + final Map r = new HashMap<>(); + r.put(1, success()); + r.put(2, success()); + return r; + }, + false); + Assert.assertEquals(Verdict.PROCEED, v); + } + + @Test + public void unreachableButProvablyFencedProceeds() { + final ClusterCachePropagator p = propagator(id -> id == 2 ? 
FENCE_TIMEOUT_MS + 1 : 0L); + final Verdict v = p.propagateOnce(targets -> ackOnly(1), false); + Assert.assertEquals(Verdict.PROCEED, v); + } + + @Test + public void unreachableNotYetFencedWaits() { + final ClusterCachePropagator p = propagator(id -> id == 2 ? 10_000L : 0L); + Assert.assertEquals(Verdict.WAIT, p.propagateOnce(targets -> ackOnly(1), false)); + } + + @Test + public void unreachableNotYetFencedFailsWhenBudgetExhausted() { + final ClusterCachePropagator p = propagator(id -> id == 2 ? 10_000L : 0L); + Assert.assertEquals(Verdict.FAIL, p.propagateOnce(targets -> ackOnly(1), true)); + } + + @Test + public void canNotConnectResponseIsUnackedAndCanProceedAfterFenceTimeout() { + final ClusterCachePropagator p = propagator(id -> id == 2 ? 999_999L : 0L); + final Verdict v = + p.propagateOnce( + targets -> { + final Map r = new HashMap<>(); + r.put(1, success()); + r.put(2, canNotConnect()); + return r; + }, + false); + Assert.assertEquals(Verdict.PROCEED, v); + } + + @Test + public void canNotConnectResponseWaitsBeforeFenceTimeout() { + final ClusterCachePropagator p = propagator(id -> id == 2 ? 1_000L : 0L); + final Verdict v = + p.propagateOnce( + targets -> { + final Map r = new HashMap<>(); + r.put(1, success()); + r.put(2, canNotConnect()); + return r; + }, + false); + Assert.assertEquals(Verdict.WAIT, v); + } + + @Test + public void internalFailureFailsImmediately() { + final ClusterCachePropagator p = propagator(id -> id == 2 ? FENCE_TIMEOUT_MS + 1 : 0L); + final Verdict v = + p.propagateOnce( + targets -> { + final Map r = new HashMap<>(); + r.put(1, success()); + r.put(2, error()); + return r; + }, + false); + Assert.assertEquals(Verdict.FAIL, v); + } + + @Test + public void loopReturnsTrueWhenItEventuallyProceeds() { + final AtomicInteger calls = new AtomicInteger(); + final AtomicLong nanos = new AtomicLong(); + final ClusterCachePropagator p = + new ClusterCachePropagator( + this::twoDataNodes, + id -> id == 2 ? 
10_000L : 0L, // DN2 not fenced, so round 1 must WAIT + () -> FENCE_TIMEOUT_MS, + nanos::get, + ms -> nanos.addAndGet(ms * 1_000_000L)); + // Round 1: DN2 unreachable -> WAIT. Round 2: DN2 acks -> PROCEED. + final boolean proceeded = + p.propagate(targets -> calls.incrementAndGet() == 1 ? ackOnly(1) : ackBoth()); + Assert.assertTrue(proceeded); + Assert.assertEquals(2, calls.get()); + } + + @Test + public void loopReturnsFalseWhenBudgetExhausted() { + final AtomicLong nanos = new AtomicLong(); + final ClusterCachePropagator p = + new ClusterCachePropagator( + this::twoDataNodes, + id -> id == 2 ? 10_000L : 0L, // DN2 never fenced (alive but not acking) -> WAIT forever + () -> FENCE_TIMEOUT_MS, + nanos::get, + ms -> nanos.addAndGet(ms * 1_000_000L)); + // DN2 keeps failing to ack; the fake clock advances on each sleep until the wait budget runs + // out, at which point the loop must give up with FAIL. + Assert.assertFalse(p.propagate(targets -> ackOnly(1))); + } + + private Map ackOnly(final int nodeId) { + final Map r = new HashMap<>(); + r.put(nodeId, success()); + return r; + } + + private Map ackBoth() { + final Map r = new HashMap<>(); + r.put(1, success()); + r.put(2, success()); + return r; + } +} diff --git a/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTrackerTest.java b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTrackerTest.java new file mode 100644 index 0000000000000..80b4498d13bf2 --- /dev/null +++ b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTrackerTest.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.confignode.manager.lease; + +import org.junit.Test; + +import java.util.Arrays; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; + +import static org.junit.Assert.assertEquals; + +public class DataNodeContactTrackerTest { + + private static final int DN = 3; + + @Test + public void reportsMillisSinceLastSuccessfulResponse() { + final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final DataNodeContactTracker tracker = new DataNodeContactTracker(nowNanos::get); + tracker.recordSuccessfulResponse(DN); + nowNanos.addAndGet(TimeUnit.MILLISECONDS.toNanos(1234)); + assertEquals(1234L, tracker.getMillisSinceLastSuccessfulResponse(DN)); + } + + @Test + public void ageKeepsGrowingWithoutSuccessfulResponse() { + // Failures must NOT refresh the contact time. This is enforced structurally: only + // recordSuccessfulResponse updates it, so with no further success the age keeps growing. 
+ final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final DataNodeContactTracker tracker = new DataNodeContactTracker(nowNanos::get); + tracker.recordSuccessfulResponse(DN); + nowNanos.addAndGet(TimeUnit.SECONDS.toNanos(30)); + assertEquals(30_000L, tracker.getMillisSinceLastSuccessfulResponse(DN)); + } + + @Test + public void leadershipAcquisitionResetsContactToNow() { + final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final DataNodeContactTracker tracker = new DataNodeContactTracker(nowNanos::get); + tracker.recordSuccessfulResponse(DN); + nowNanos.addAndGet(TimeUnit.SECONDS.toNanos(30)); // would otherwise look stale + tracker.onLeadershipAcquired(Arrays.asList(DN, 4)); + assertEquals(0L, tracker.getMillisSinceLastSuccessfulResponse(DN)); + assertEquals(0L, tracker.getMillisSinceLastSuccessfulResponse(4)); + } + + @Test + public void neverContactedReadsAsZeroSoVerdictTreatsAsRecent() { + // Conservative: an unknown DataNode must NOT look fenced (else the verdict would wrongly + // proceed past it), so its age reads as 0 until a real success/expiry is observed. + final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final DataNodeContactTracker tracker = new DataNodeContactTracker(nowNanos::get); + assertEquals(0L, tracker.getMillisSinceLastSuccessfulResponse(999)); + } +} diff --git a/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/MetadataBroadcastVerdictTest.java b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/MetadataBroadcastVerdictTest.java new file mode 100644 index 0000000000000..c28fd1040c882 --- /dev/null +++ b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/MetadataBroadcastVerdictTest.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.confignode.manager.lease; + +import org.apache.iotdb.confignode.manager.lease.MetadataBroadcastVerdict.DataNodeState; +import org.apache.iotdb.confignode.manager.lease.MetadataBroadcastVerdict.Verdict; + +import org.junit.Test; + +import java.util.Arrays; +import java.util.Collections; + +import static org.junit.Assert.assertEquals; + +public class MetadataBroadcastVerdictTest { + + private static final long T_PROCEED_MS = 25_000L; + + private static DataNodeState acked() { + return new DataNodeState(true, 0L); + } + + private static DataNodeState fencedSafe() { + return new DataNodeState(false, T_PROCEED_MS + 1); + } + + private static DataNodeState freshUnacked() { + return new DataNodeState(false, 1_000L); + } + + @Test + public void allAckedProceeds() { + assertEquals( + Verdict.PROCEED, + MetadataBroadcastVerdict.decide(Arrays.asList(acked(), acked()), T_PROCEED_MS, false)); + } + + @Test + public void unackedButAllFencedSafeProceeds() { + assertEquals( + Verdict.PROCEED, + MetadataBroadcastVerdict.decide(Arrays.asList(acked(), fencedSafe()), T_PROCEED_MS, false)); + } + + @Test + public void freshUnackedWaitsWhileBudgetRemains() { + assertEquals( + Verdict.WAIT, + MetadataBroadcastVerdict.decide( + 
Collections.singletonList(freshUnacked()), T_PROCEED_MS, false)); + } + + @Test + public void freshUnackedFailsWhenWaitBudgetExhausted() { + assertEquals( + Verdict.FAIL, + MetadataBroadcastVerdict.decide( + Collections.singletonList(freshUnacked()), T_PROCEED_MS, true)); + } +} diff --git a/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTTLProcedureTest.java b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTTLProcedureTest.java index cb09c23659c39..a1813c1642cca 100644 --- a/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTTLProcedureTest.java +++ b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTTLProcedureTest.java @@ -25,8 +25,6 @@ import org.apache.iotdb.commons.exception.IllegalPathException; import org.apache.iotdb.commons.path.PartialPath; import org.apache.iotdb.commons.schema.ttl.TTLCache; -import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType; -import org.apache.iotdb.confignode.client.async.handlers.DataNodeAsyncRequestContext; import org.apache.iotdb.confignode.consensus.request.write.database.SetTTLPlan; import org.apache.iotdb.confignode.manager.ConfigManager; import org.apache.iotdb.confignode.manager.TTLManager; @@ -355,30 +353,11 @@ TSStatus writeConfigNodePlan(final ConfigNodeProcedureEnv env, final SetTTLPlan } @Override - DataNodeAsyncRequestContext sendTTLRequest( - final Map dataNodeLocationMap, final TSetTTLReq req) { + boolean broadcastTTLAndDecide(final ConfigNodeProcedureEnv env, final TSetTTLReq req) { requests.add(copyRequest(req)); - - final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.SET_TTL, copyRequest(req), dataNodeLocationMap); - final List requestIds = new ArrayList<>(clientHandler.getNodeLocationMap().keySet()); - final boolean shouldFail = failFirstDataNodeUpdate && requestCount++ == 0; 
- - for (Integer requestId : requestIds) { - clientHandler - .getResponseMap() - .put( - requestId, - new TSStatus( - shouldFail - ? TSStatusCode.EXECUTE_STATEMENT_ERROR.getStatusCode() - : TSStatusCode.SUCCESS_STATUS.getStatusCode())); - if (!shouldFail) { - clientHandler.getNodeLocationMap().remove(requestId); - } - } - return clientHandler; + // Simulate a live, un-acked DataNode on the first broadcast: the propagator verdict is FAIL + // (which triggers rollback). Later broadcasts (the rollback restore) proceed. + return !(failFirstDataNodeUpdate && requestCount++ == 0); } private SetTTLPlan copyPlan(final SetTTLPlan plan) { diff --git a/iotdb-core/datanode/src/main/i18n/en/org/apache/iotdb/db/i18n/DataNodeSchemaMessages.java b/iotdb-core/datanode/src/main/i18n/en/org/apache/iotdb/db/i18n/DataNodeSchemaMessages.java index 2b5fe9d72f6eb..b705d3efda654 100644 --- a/iotdb-core/datanode/src/main/i18n/en/org/apache/iotdb/db/i18n/DataNodeSchemaMessages.java +++ b/iotdb-core/datanode/src/main/i18n/en/org/apache/iotdb/db/i18n/DataNodeSchemaMessages.java @@ -121,7 +121,8 @@ public final class DataNodeSchemaMessages { public static final String UPDATE_MLOG_DESCRIPTION_FAILED = "Update {} failed because {}"; public static final String DIRECT_BUFFER_MEMORY_EXCEEDED = "Total allocated memory for direct buffer will be "; - public static final String DIRECT_BUFFER_MEMORY_LIMIT = ", which is greater than limit mem cost: "; + public static final String DIRECT_BUFFER_MEMORY_LIMIT = + ", which is greater than limit mem cost: "; // ======================== SchemaRegion Snapshot ======================== @@ -144,8 +145,7 @@ public final class DataNodeSchemaMessages { "Snapshot creation of schemaRegion {} costs {}ms."; public static final String SUCCESSFULLY_CREATE_SNAPSHOT = "Successfully create snapshot of schemaRegion {}"; - public static final String START_LOADING_SNAPSHOT = - "Start loading snapshot of schemaRegion {}"; + public static final String START_LOADING_SNAPSHOT = 
"Start loading snapshot of schemaRegion {}"; public static final String DEVICE_ATTR_SNAPSHOT_LOADING_COST = "Device attribute snapshot loading of schemaRegion {} costs {}ms."; public static final String DEVICE_ATTR_UPDATER_SNAPSHOT_LOADING_COST = @@ -265,8 +265,7 @@ public final class DataNodeSchemaMessages { "Failed to rename {} to {} while creating mTree snapshot."; public static final String FAILED_TO_CREATE_MTREE_SNAPSHOT = "Failed to create mTree snapshot due to {}"; - public static final String SERIALIZE_ERROR_INFO = - "Error occurred during serializing MemMTree."; + public static final String SERIALIZE_ERROR_INFO = "Error occurred during serializing MemMTree."; public static final String UNRECOGNIZED_MNODE_TYPE = "Unrecognized MNode type "; // ======================== View ======================== @@ -274,8 +273,7 @@ public final class DataNodeSchemaMessages { public static final String IS_NO_VIEW = "[%s] is no view."; public static final String VIEW_NOT_SUPPORTED = "View is not supported."; public static final String VIEW_DOES_NOT_SUPPORT_ALIAS = "View doesn't support alias"; - public static final String CANNOT_CONSTRUCT_ABSTRACT_CLASS = - "Can not construct abstract class."; + public static final String CANNOT_CONSTRUCT_ABSTRACT_CLASS = "Can not construct abstract class."; // ======================== PBTree ======================== @@ -288,8 +286,7 @@ public final class DataNodeSchemaMessages { "PBTree File [{}] will be overwritten since already exists."; public static final String SCHEMA_FILE_WRONG_VERSION = "SchemaFile with wrong version, please check or upgrade."; - public static final String NODE_NO_CHILD_IN_PBTREE = - "Node [%s] has no child in pbtree file."; + public static final String NODE_NO_CHILD_IN_PBTREE = "Node [%s] has no child in pbtree file."; public static final String SCHEMA_FILE_INSPECTED = "SchemaFile[%s] had been inspected."; public static final String FAILED_TO_CREATE_SCHEMA_FILE_SNAPSHOT = "Failed to create SchemaFile snapshot due to 
{}"; @@ -306,8 +303,7 @@ public final class DataNodeSchemaMessages { "AliasIndexPage can only extend to buffer with same capacity."; public static final String SEGMENTS_SPLIT_SAME_CAPACITY = "Segments only splits with same capacity."; - public static final String SEGMENT_SPLIT_NO_RECORDS = - "Segment can not be split with no records."; + public static final String SEGMENT_SPLIT_NO_RECORDS = "Segment can not be split with no records."; public static final String SEGMENT_SPLIT_ONLY_ONE_RECORD = "Segment can not be split with only one record."; public static final String INTERNAL_PAGE_EXTEND_CAPACITY = @@ -332,8 +328,7 @@ public final class DataNodeSchemaMessages { public static final String CHILD_SHALL_NOT_HAVE_SEGMENT_ADDRESS = "A child in newChildBuffer shall not have segmentAddress."; public static final String PAGE_INDEX_OUT_OF_RANGE = "Page index %d out of range."; - public static final String ROOT_PAGE_SHALL_NOT_BE_MIGRATED = - "Root page shall not be migrated."; + public static final String ROOT_PAGE_SHALL_NOT_BE_MIGRATED = "Root page shall not be migrated."; public static final String SUBORDINATE_INDEX_NOT_ON_SINGLE_PAGE = "Subordinate index shall not build upon single page segment."; public static final String SUBORDINATE_INDEX_BROKEN = @@ -457,8 +452,7 @@ public final class DataNodeSchemaMessages { public static final String COMMIT_MARK_WITHOUT_PREPARE = "COMMIT_MARK without PREPARE_MARK"; public static final String EXTRANEOUS_BYTE_AFTER_PREPARE = "an extraneous byte rather than COMMIT_MARK after PREPARE_MARK"; - public static final String NOT_ENDED_BY_MARK = - "not ended by COMMIT_MARK nor PREPARE_MARK."; + public static final String NOT_ENDED_BY_MARK = "not ended by COMMIT_MARK nor PREPARE_MARK."; // ======================== Additional MNodeContainer ======================== @@ -512,8 +506,7 @@ public final class DataNodeSchemaMessages { // ======================== Additional CachedMTreeStore ======================== - public static final String 
ERROR_DURING_PBTREE_CLEAR = - "Error occurred during PBTree clear, {}"; + public static final String ERROR_DURING_PBTREE_CLEAR = "Error occurred during PBTree clear, {}"; public static final String ERROR_DURING_MTREE_FLUSH_SCHEMA_REGION = "Error occurred during MTree flush, current SchemaRegionId is {}"; public static final String ERROR_DURING_MTREE_FLUSH_SCHEMA_REGION_BECAUSE = @@ -532,17 +525,14 @@ public final class DataNodeSchemaMessages { // ======================== FakeCRC32Deserializer ======================== - public static final String READ_LOG_LENGTH_NEGATIVE_LOG = - "Read log length {} is negative."; + public static final String READ_LOG_LENGTH_NEGATIVE_LOG = "Read log length {} is negative."; // ======================== SchemaLogReader ======================== - public static final String FILE_CORRUPTED = - "File {} is corrupted. The uncorrupted size is {}."; + public static final String FILE_CORRUPTED = "File {} is corrupted. The uncorrupted size is {}."; public static final String LOG_FILE_END_CORRUPTED_TRUNCATE = "The end of log file {} is corrupted. Start truncate it. The unbroken size is {}. 
The file size is {}."; - public static final String FAIL_TO_TRUNCATE_LOG_FILE = - "Fail to truncate log file to size {}"; + public static final String FAIL_TO_TRUNCATE_LOG_FILE = "Fail to truncate log file to size {}"; // ======================== SchemaRegionPlanDeserializer ======================== @@ -553,28 +543,20 @@ public final class DataNodeSchemaMessages { public static final String TIMESERIES_NUM_UPPER_LIMIT = "The number of timeseries has reached the upper limit"; - public static final String ALIAS_DUPLICATED_DETAIL = - ", fullPath: "; - public static final String ALIAS_DUPLICATED_OTHER_MEASUREMENT = - ", otherMeasurement: "; - public static final String START_CREATE_TABLE_DEVICE = - "Start to create table device {}.{}"; - public static final String TABLE_DEVICE_ALREADY_EXISTS = - "Table device {}.{} already exists"; - public static final String TABLE_DEVICE_CREATED = - "Table device {}.{} created"; + public static final String ALIAS_DUPLICATED_DETAIL = ", fullPath: "; + public static final String ALIAS_DUPLICATED_OTHER_MEASUREMENT = ", otherMeasurement: "; + public static final String START_CREATE_TABLE_DEVICE = "Start to create table device {}.{}"; + public static final String TABLE_DEVICE_ALREADY_EXISTS = "Table device {}.{} already exists"; + public static final String TABLE_DEVICE_CREATED = "Table device {}.{} created"; // ======================== CachedMTreeStore / Scheduler ======================== - public static final String MTREE_FLUSH_COST = - "It takes {}ms to flush MTree in SchemaRegion {}"; + public static final String MTREE_FLUSH_COST = "It takes {}ms to flush MTree in SchemaRegion {}"; // ======================== DataNodeTableCache ======================== - public static final String INIT_TABLE_CACHE_SUCCESS = - "Init DataNodeTableCache successfully"; - public static final String PRE_UPDATE_TABLE_SUCCESS = - "Pre-update table {}.{} successfully"; + public static final String INIT_TABLE_CACHE_SUCCESS = "Init DataNodeTableCache 
successfully"; + public static final String PRE_UPDATE_TABLE_SUCCESS = "Pre-update table {}.{} successfully"; public static final String PRE_RENAME_OLD_TABLE_SUCCESS = "Pre-rename old table {}.{} successfully"; public static final String ROLLBACK_UPDATE_TABLE_SUCCESS = @@ -585,14 +567,18 @@ public final class DataNodeSchemaMessages { "Commit-update table {}.{} successfully, {}"; public static final String COMMIT_UPDATE_TABLE_SUCCESS = "Commit-update table {}.{} successfully."; - public static final String RENAME_OLD_TABLE_SUCCESS = - "Rename old table {}.{} successfully."; + public static final String RENAME_OLD_TABLE_SUCCESS = "Rename old table {}.{} successfully."; + public static final String COMMIT_DELETE_TABLE_SUCCESS = + "commit delete table {}.{} successfully."; + public static final String FAILED_TO_REFRESH_CACHE_FROM_CN = + "Failed to refresh DataNodeTableCache from ConfigNode"; public static final String INTERRUPTED_ACQUIRE_SEMAPHORE_GET_TABLES = "Interrupted when trying to acquire semaphore when trying to get tables from configNode, ignore."; public static final String UPDATE_TABLE_BY_FETCH_WITH_DETAIL = "Update table {}.{} by table fetch, {}"; - public static final String UPDATE_TABLE_BY_FETCH = - "Update table {}.{} by table fetch."; + public static final String UPDATE_TABLE_BY_FETCH = "Update table {}.{} by table fetch."; + public static final String THE_TABLE_IS_IN_PRE_DELETE_STATE = + "The table %s.%s is in the pre-delete state. Please wait a few seconds. 
If the table is still in this state, please drop it again."; public static final String COMPARE_TABLE_ADDED = "Added table: "; public static final String COMPARE_TABLE_REMOVED = "Removed table: "; public static final String COMPARE_TABLE_NAME = "Table name: "; @@ -602,6 +588,21 @@ public final class DataNodeSchemaMessages { public static final String COMPARE_TABLE_ADDED_COLUMNS = " Added column(s): "; public static final String COMPARE_TABLE_NOT_MODIFIED = " Not modified"; + // ======================== MetadataLeaseManager ======================== + + public static final String FAILED_TO_SUBMIT_METADATA_PULL_TASK = + "Failed to submit metadata pull task."; + public static final String UNEXPECTED_METADATA_STATE = + "Unexpected metadata state {}, because no other clear-and-pull thread should exist."; + public static final String METADATA_LEASE_CACHE_CLEARING_IN_PROGRESS = + "Metadata state is {}, another thread may be set the metadata status. Retry later."; + public static final String FAILED_TO_CLEAR_METADATA_CACHE = "Failed to clear metadata cache."; + public static final String FAILED_TO_MARK_METADATA_STATE_AS_PULLING = + "Failed to mark metadata state {} as PULLING because another metadata pull thread is active."; + public static final String FAILED_TO_PULL_OR_INIT_METADATA = "Failed to pull or init metadata."; + public static final String METADATA_LEASE_IS_FENCED = + "Metadata lease is fenced. 
The local metadata cache is unavailable."; + // ======================== ClusterTemplateManager ======================== public static final String ILLEGAL_PATH_LOG = "illegal path {}"; diff --git a/iotdb-core/datanode/src/main/i18n/zh/org/apache/iotdb/db/i18n/DataNodeSchemaMessages.java b/iotdb-core/datanode/src/main/i18n/zh/org/apache/iotdb/db/i18n/DataNodeSchemaMessages.java index de8b762c10b78..415e2580a1380 100644 --- a/iotdb-core/datanode/src/main/i18n/zh/org/apache/iotdb/db/i18n/DataNodeSchemaMessages.java +++ b/iotdb-core/datanode/src/main/i18n/zh/org/apache/iotdb/db/i18n/DataNodeSchemaMessages.java @@ -562,28 +562,22 @@ public final class DataNodeSchemaMessages { // ======================== DataNodeTableCache 相关消息 ======================== - public static final String INIT_TABLE_CACHE_SUCCESS = - "DataNodeTableCache 初始化成功"; - public static final String PRE_UPDATE_TABLE_SUCCESS = - "预更新表 {}.{} 成功"; - public static final String PRE_RENAME_OLD_TABLE_SUCCESS = - "预重命名旧表 {}.{} 成功"; - public static final String ROLLBACK_UPDATE_TABLE_SUCCESS = - "回滚更新表 {}.{} 成功"; - public static final String ROLLBACK_RENAME_OLD_TABLE_SUCCESS = - "回滚重命名旧表 {}.{} 成功。"; - public static final String COMMIT_UPDATE_TABLE_SUCCESS_WITH_DETAIL = - "提交更新表 {}.{} 成功,{}"; - public static final String COMMIT_UPDATE_TABLE_SUCCESS = - "提交更新表 {}.{} 成功。"; - public static final String RENAME_OLD_TABLE_SUCCESS = - "重命名旧表 {}.{} 成功。"; + public static final String INIT_TABLE_CACHE_SUCCESS = "DataNodeTableCache 初始化成功"; + public static final String PRE_UPDATE_TABLE_SUCCESS = "预更新表 {}.{} 成功"; + public static final String PRE_RENAME_OLD_TABLE_SUCCESS = "预重命名旧表 {}.{} 成功"; + public static final String ROLLBACK_UPDATE_TABLE_SUCCESS = "回滚更新表 {}.{} 成功"; + public static final String ROLLBACK_RENAME_OLD_TABLE_SUCCESS = "回滚重命名旧表 {}.{} 成功。"; + public static final String COMMIT_UPDATE_TABLE_SUCCESS_WITH_DETAIL = "提交更新表 {}.{} 成功,{}"; + public static final String COMMIT_UPDATE_TABLE_SUCCESS = "提交更新表 {}.{} 成功。"; + public 
static final String RENAME_OLD_TABLE_SUCCESS = "重命名旧表 {}.{} 成功。"; + public static final String FAILED_TO_REFRESH_CACHE_FROM_CN = + "从configNode拉取元数据更新DataNodeTableCache失败"; public static final String INTERRUPTED_ACQUIRE_SEMAPHORE_GET_TABLES = "尝试获取信号量以从 ConfigNode 获取表时被中断,已忽略。"; - public static final String UPDATE_TABLE_BY_FETCH_WITH_DETAIL = - "通过表拉取更新表 {}.{},{}"; - public static final String UPDATE_TABLE_BY_FETCH = - "通过表拉取更新表 {}.{}。"; + public static final String UPDATE_TABLE_BY_FETCH_WITH_DETAIL = "通过表拉取更新表 {}.{},{}"; + public static final String UPDATE_TABLE_BY_FETCH = "通过表拉取更新表 {}.{}。"; + public static final String THE_TABLE_IS_IN_PRE_DELETE_STATE = + "表 %s.%s 处于预删除的状态,请稍等,如之后重试还是此状态,请输入sql再次删除"; public static final String COMPARE_TABLE_ADDED = "新增表:"; public static final String COMPARE_TABLE_REMOVED = "已移除表:"; public static final String COMPARE_TABLE_NAME = "表名:"; @@ -597,5 +591,17 @@ public final class DataNodeSchemaMessages { public static final String ILLEGAL_PATH_LOG = "非法路径 {}"; + // ======================== MetadataLeaseManager 相关消息 ======================== + + public static final String FAILED_TO_SUBMIT_METADATA_PULL_TASK = "提交元数据拉取任务失败。"; + public static final String UNEXPECTED_METADATA_STATE = "元数据状态异常,当前为 {},不应存在其他元数据拉取线程。"; + public static final String METADATA_LEASE_CACHE_CLEARING_IN_PROGRESS = + "当前元数据状态为 {},可能有其他线程正在清理元数据缓存,请稍后重试。"; + public static final String FAILED_TO_CLEAR_METADATA_CACHE = "清理元数据缓存失败。"; + public static final String FAILED_TO_MARK_METADATA_STATE_AS_PULLING = + "无法将元数据状态 {} 标记为 PULLING,因为已有其他元数据拉取线程正在运行。"; + public static final String FAILED_TO_PULL_OR_INIT_METADATA = "拉取元数据失败。"; + public static final String METADATA_LEASE_IS_FENCED = "元数据租约已过期, 本地缓存不可用"; + private DataNodeSchemaMessages() {} } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/auth/ClusterAuthorityFetcher.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/auth/ClusterAuthorityFetcher.java index 641ead173d2be..af3c149f2bf44 100644 --- 
a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/auth/ClusterAuthorityFetcher.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/auth/ClusterAuthorityFetcher.java @@ -57,6 +57,7 @@ import org.apache.iotdb.db.queryengine.plan.relational.type.AuthorRType; import org.apache.iotdb.db.queryengine.plan.statement.StatementType; import org.apache.iotdb.db.queryengine.plan.statement.sys.AuthorStatement; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; import org.apache.iotdb.rpc.RpcUtils; import org.apache.iotdb.rpc.TSStatusCode; @@ -518,13 +519,24 @@ public void refreshToken() { heartBeatTimeStamp = currentTime; } - private void checkCacheAvailable() { - if (cacheOutDate) { + // Package-private for testing (ClusterAuthorityFetcherLeaseTest). + void checkCacheAvailable() { + // cacheOutDate is set by refreshToken() only when a heartbeat finally arrives after a long gap, + // so it cannot catch an *ongoing* ConfigNode partition (no heartbeat arrives, refreshToken() is + // never called). isFenced() is evaluated on this DataNode's own clock and fires without any + // heartbeat: while fenced we drop the permission cache and force a re-fetch from the + // ConfigNode, which fails closed while partitioned, so a missed REVOKE cannot keep authorizing + // a privilege. 
+ if (cacheOutDate || isMetadataLeaseFenced()) { iAuthorCache.invalidAllCache(); } cacheOutDate = false; } + boolean isMetadataLeaseFenced() { + return MetadataLeaseManager.getInstance().isFenced(); + } + @TestOnly public void setAcceptCache(boolean acceptCache) { this.acceptCache = acceptCache; diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/client/ConfigNodeClient.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/client/ConfigNodeClient.java index f6a079bd2d609..33c643ed85990 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/client/ConfigNodeClient.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/client/ConfigNodeClient.java @@ -84,6 +84,7 @@ import org.apache.iotdb.confignode.rpc.thrift.TCreateTopicReq; import org.apache.iotdb.confignode.rpc.thrift.TCreateTriggerReq; import org.apache.iotdb.confignode.rpc.thrift.TDataNodeConfigurationResp; +import org.apache.iotdb.confignode.rpc.thrift.TDataNodeLeaseRecoveryResp; import org.apache.iotdb.confignode.rpc.thrift.TDataNodeRegisterReq; import org.apache.iotdb.confignode.rpc.thrift.TDataNodeRegisterResp; import org.apache.iotdb.confignode.rpc.thrift.TDataNodeRemoveReq; @@ -552,6 +553,12 @@ public TDataNodeRestartResp restartDataNode(TDataNodeRestartReq req) throws TExc () -> client.restartDataNode(req), resp -> !updateConfigNodeLeader(resp.status)); } + @Override + public TDataNodeLeaseRecoveryResp reloadCacheAfterLeaseRecovery() throws TException { + return executeRemoteCallWithRetry( + () -> client.reloadCacheAfterLeaseRecovery(), resp -> !updateConfigNodeLeader(resp.status)); + } + @Override public TAINodeRegisterResp registerAINode(TAINodeRegisterReq req) throws TException { throw new UnsupportedOperationException(UNSUPPORTED_INVOCATION); @@ -1527,10 +1534,11 @@ public TDescTable4InformationSchemaResp descTables4InformationSchema() throws TE } @Override - public TFetchTableResp fetchTables(final Map> fetchTableMap) + 
public TFetchTableResp fetchTables(Map> fetchTableMap, byte tableNodeStatus) throws TException { return executeRemoteCallWithRetry( - () -> client.fetchTables(fetchTableMap), resp -> !updateConfigNodeLeader(resp.status)); + () -> client.fetchTables(fetchTableMap, tableNodeStatus), + resp -> !updateConfigNodeLeader(resp.status)); } @Override diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index ec3bbff0bb78d..a1b9d911802b0 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -192,6 +192,7 @@ import org.apache.iotdb.db.queryengine.plan.statement.crud.InsertRowStatement; import org.apache.iotdb.db.queryengine.plan.statement.crud.QueryStatement; import org.apache.iotdb.db.schemaengine.SchemaEngine; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; import org.apache.iotdb.db.schemaengine.schemaregion.ISchemaRegion; import org.apache.iotdb.db.schemaengine.schemaregion.read.resp.info.ITimeSeriesSchemaInfo; import org.apache.iotdb.db.schemaengine.schemaregion.read.resp.reader.ISchemaReader; @@ -1895,7 +1896,8 @@ public TSStatus invalidateTableCache(final TInvalidateTableCacheReq req) { .takeWriteLock(SchemaLockType.VALIDATE_VS_DELETION_TABLE); try { TableDeviceSchemaCache.getInstance() - .invalidate(PathUtils.unQualifyDatabaseName(req.getDatabase()), req.getTableName()); + .invalidateAndPreDelete( + PathUtils.unQualifyDatabaseName(req.getDatabase()), req.getTableName()); return StatusUtils.OK; } finally { DataNodeSchemaLockManager.getInstance() @@ -2283,6 +2285,10 @@ private PathPatternTree filterPathPatternTree(PathPatternTree patternTree, Strin public TDataNodeHeartbeatResp 
getDataNodeHeartBeat(TDataNodeHeartbeatReq req) throws TException { TDataNodeHeartbeatResp resp = new TDataNodeHeartbeatResp(); + // Renew the metadata lease: receiving a ConfigNode heartbeat means this DataNode is still in + // contact with the cluster and may keep trusting its ConfigNode-pushed metadata caches. + MetadataLeaseManager.getInstance().triggerCheckWithHeartBeat(); + // Judging leader if necessary if (req.isNeedJudgeLeader()) { // Always get logical clock before judging leader @@ -2330,6 +2336,9 @@ public TDataNodeHeartbeatResp getDataNodeHeartBeat(TDataNodeHeartbeatReq req) th AuthorityChecker.getAuthorityFetcher().refreshToken(); resp.setHeartbeatTimestamp(req.getHeartbeatTimestamp()); resp.setStatus(commonConfig.getNodeStatus().getStatus()); + // Advertise that this DataNode supports metadata-lease self-fencing, so the ConfigNode may + // treat it as safely fenced when unreachable (older DataNodes that omit this are handled + // strictly). NOTE(review): no adjacent code sets such a flag on resp; confirm. if (commonConfig.getStatusReason() != null) { resp.setStatusReason(commonConfig.getStatusReason()); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/analyze/cache/partition/PartitionCache.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/analyze/cache/partition/PartitionCache.java index 97121eb82e86e..11c7c50e12840 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/analyze/cache/partition/PartitionCache.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/analyze/cache/partition/PartitionCache.java @@ -60,6 +60,7 @@ import org.apache.iotdb.db.protocol.client.ConfigNodeInfo; import org.apache.iotdb.db.protocol.session.IClientSession; import org.apache.iotdb.db.protocol.session.SessionManager; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; import org.apache.iotdb.db.schemaengine.schemaregion.utils.MetaUtils; import org.apache.iotdb.db.service.metrics.CacheMetrics; 
import org.apache.iotdb.rpc.TSStatusCode; @@ -143,6 +144,10 @@ public PartitionCache() { this.cacheMetrics = new CacheMetrics(); } + protected void failIfMetadataLeaseFenced() { + MetadataLeaseManager.getInstance().failIfMetadataLeaseFenced(); + } + // region database cache /** @@ -158,6 +163,7 @@ public Map> getDatabaseToDevice( final boolean tryToFetch, final boolean isAutoCreate, final String userName) { + failIfMetadataLeaseFenced(); final DatabaseCacheResult> result = new DatabaseCacheResult>() { @Override @@ -182,6 +188,7 @@ public Map getDeviceToDatabase( final boolean tryToFetch, final boolean isAutoCreate, final String userName) { + failIfMetadataLeaseFenced(); final DatabaseCacheResult result = new DatabaseCacheResult() { @Override @@ -516,6 +523,7 @@ private void getDatabaseCacheResult( public void checkAndAutoCreateDatabase( final String database, final boolean isAutoCreate, final String userName) { + failIfMetadataLeaseFenced(); boolean isExisted = containsDatabase(database); if (!isExisted) { try { @@ -577,6 +585,7 @@ public List getRegionReplicaSet(List conse // try to get regionReplicaSet from cache regionReplicaSetLock.readLock().lock(); try { + failIfMetadataLeaseFenced(); result = getRegionReplicaSetInternal(consensusGroupIds); } finally { regionReplicaSetLock.readLock().unlock(); @@ -680,6 +689,7 @@ public SchemaPartition getSchemaPartition( final Map> databaseToDeviceMap) { schemaPartitionCacheLock.readLock().lock(); try { + failIfMetadataLeaseFenced(); if (databaseToDeviceMap.isEmpty()) { cacheMetrics.record(false, CacheMetrics.SCHEMA_PARTITION_CACHE_NAME); return null; @@ -752,6 +762,7 @@ public SchemaPartition getSchemaPartition( public SchemaPartition getSchemaPartition(String database) { schemaPartitionCacheLock.readLock().lock(); try { + failIfMetadataLeaseFenced(); SchemaPartitionTable schemaPartitionTable = schemaPartitionCache.getIfPresent(database); if (null == schemaPartitionTable) { // if database not find, then return cache miss. 
@@ -841,6 +852,7 @@ public DataPartition getDataPartition( Map> databaseToQueryParamsMap) { dataPartitionCacheLock.readLock().lock(); try { + failIfMetadataLeaseFenced(); if (databaseToQueryParamsMap.isEmpty()) { cacheMetrics.record(false, CacheMetrics.DATA_PARTITION_CACHE_NAME); return null; diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/execution/config/executor/ClusterConfigTaskExecutor.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/execution/config/executor/ClusterConfigTaskExecutor.java index 9a31dae2baefb..2b70b9b5ae831 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/execution/config/executor/ClusterConfigTaskExecutor.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/execution/config/executor/ClusterConfigTaskExecutor.java @@ -84,6 +84,7 @@ import org.apache.iotdb.commons.schema.table.AlterOrDropTableOperationType; import org.apache.iotdb.commons.schema.table.Audit; import org.apache.iotdb.commons.schema.table.InformationSchema; +import org.apache.iotdb.commons.schema.table.TableNodeStatus; import org.apache.iotdb.commons.schema.table.TsTable; import org.apache.iotdb.commons.schema.table.TsTableInternalRPCUtil; import org.apache.iotdb.commons.schema.table.column.TsTableColumnSchema; @@ -4796,10 +4797,12 @@ public SettableFuture showTables( } @Override - public TFetchTableResp fetchTables(final Map> fetchTableMap) { + public TFetchTableResp fetchTables( + final Map> fetchTableMap, final TableNodeStatus tableNodeStatus) { try (final ConfigNodeClient configNodeClient = CONFIG_NODE_CLIENT_MANAGER.borrowClient(ConfigNodeInfo.CONFIG_REGION_ID)) { - final TFetchTableResp fetchTableResp = configNodeClient.fetchTables(fetchTableMap); + final TFetchTableResp fetchTableResp = + configNodeClient.fetchTables(fetchTableMap, tableNodeStatus.getStatus()); if (TSStatusCode.SUCCESS_STATUS.getStatusCode() != 
fetchTableResp.getStatus().getCode()) { LOGGER.warn(DataNodeQueryMessages.FAILED_TO_FETCHTABLES_STATUS_IS, fetchTableResp); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/execution/config/executor/IConfigTaskExecutor.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/execution/config/executor/IConfigTaskExecutor.java index 46f82cefe28cf..964b1c527792b 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/execution/config/executor/IConfigTaskExecutor.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/execution/config/executor/IConfigTaskExecutor.java @@ -27,6 +27,7 @@ import org.apache.iotdb.commons.queryengine.common.SessionInfo; import org.apache.iotdb.commons.queryengine.common.SqlDialect; import org.apache.iotdb.commons.schema.cache.CacheClearOptions; +import org.apache.iotdb.commons.schema.table.TableNodeStatus; import org.apache.iotdb.commons.schema.table.TsTable; import org.apache.iotdb.commons.schema.table.column.TsTableColumnSchema; import org.apache.iotdb.confignode.rpc.thrift.TDatabaseSchema; @@ -372,7 +373,8 @@ SettableFuture describeTable( SettableFuture showTables( final String database, final Predicate checkCanShowTable, final boolean isDetails); - TFetchTableResp fetchTables(final Map> fetchTableMap); + TFetchTableResp fetchTables( + final Map> fetchTableMap, final TableNodeStatus tableNodeStatus); SettableFuture alterTableRenameTable( final String database, diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/relational/metadata/TableMetadataImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/relational/metadata/TableMetadataImpl.java index 60b6b6ebc6181..3eb05e7c5ebd6 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/relational/metadata/TableMetadataImpl.java +++ 
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/relational/metadata/TableMetadataImpl.java @@ -59,6 +59,7 @@ import org.apache.iotdb.db.queryengine.plan.relational.metadata.fetcher.TableHeaderSchemaValidator; import org.apache.iotdb.db.queryengine.plan.relational.security.AccessControl; import org.apache.iotdb.db.schemaengine.table.DataNodeTableCache; +import org.apache.iotdb.db.schemaengine.table.ITableCache; import org.apache.iotdb.udf.api.customizer.analysis.AggregateFunctionAnalysis; import org.apache.iotdb.udf.api.customizer.analysis.ScalarFunctionAnalysis; import org.apache.iotdb.udf.api.customizer.parameter.FunctionArguments; @@ -98,7 +99,7 @@ public class TableMetadataImpl implements Metadata { private final IPartitionFetcher partitionFetcher = ClusterPartitionFetcher.getInstance(); - private final DataNodeTableCache tableCache = DataNodeTableCache.getInstance(); + private final ITableCache tableCache = DataNodeTableCache.getInstance(); @Override public TableFunction getTableFunction(String functionName) { diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/relational/metadata/fetcher/cache/TableDeviceSchemaCache.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/relational/metadata/fetcher/cache/TableDeviceSchemaCache.java index a328ffa221deb..eb436bfaf7d7b 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/relational/metadata/fetcher/cache/TableDeviceSchemaCache.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/relational/metadata/fetcher/cache/TableDeviceSchemaCache.java @@ -26,6 +26,7 @@ import org.apache.iotdb.commons.path.PartialPath; import org.apache.iotdb.commons.path.PathPatternUtil; import org.apache.iotdb.commons.queryengine.plan.relational.metadata.QualifiedObjectName; +import org.apache.iotdb.commons.schema.table.PreDeleteTsTable; import org.apache.iotdb.commons.service.metric.MetricService; 
import org.apache.iotdb.commons.utils.PathUtils; import org.apache.iotdb.db.conf.DataNodeMemoryConfig; @@ -633,11 +634,12 @@ public void invalidate(final @Nonnull String database) { } // Only used by table model - public void invalidate(final String database, final String tableName) { + public void invalidateAndPreDelete(final String database, final String tableName) { readWriteLock.writeLock().lock(); try { // Table cache's invalidate must be guarded by this lock - DataNodeTableCache.getInstance().invalid(database, tableName); + DataNodeTableCache.getInstance() + .preUpdateTable(database, new PreDeleteTsTable(tableName), null); dualKeyCache.invalidate(new TableId(database, tableName)); } finally { readWriteLock.writeLock().unlock(); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/relational/metadata/fetcher/cache/TreeDeviceSchemaCacheManager.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/relational/metadata/fetcher/cache/TreeDeviceSchemaCacheManager.java index c2588606cea69..28fcbf01b23e8 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/relational/metadata/fetcher/cache/TreeDeviceSchemaCacheManager.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/relational/metadata/fetcher/cache/TreeDeviceSchemaCacheManager.java @@ -29,6 +29,7 @@ import org.apache.iotdb.db.queryengine.common.schematree.ClusterSchemaTree; import org.apache.iotdb.db.queryengine.common.schematree.IMeasurementSchemaInfo; import org.apache.iotdb.db.queryengine.plan.analyze.schema.ISchemaComputation; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; import org.apache.iotdb.db.schemaengine.template.ClusterTemplateManager; import org.apache.iotdb.db.schemaengine.template.ITemplateManager; @@ -63,7 +64,7 @@ public class TreeDeviceSchemaCacheManager { // cache update or clean have higher priority than cache read private final ReentrantReadWriteLock 
readWriteLock = new ReentrantReadWriteLock(false); - private TreeDeviceSchemaCacheManager() { + TreeDeviceSchemaCacheManager() { tableDeviceSchemaCache = TableDeviceSchemaCache.getInstance(); } @@ -71,6 +72,10 @@ public static TreeDeviceSchemaCacheManager getInstance() { return TreeDeviceSchemaCacheManagerHolder.INSTANCE; } + void failIfMetadataLeaseFenced() { + MetadataLeaseManager.getInstance().failIfMetadataLeaseFenced(); + } + /** singleton pattern. */ private static class TreeDeviceSchemaCacheManagerHolder { private static final TreeDeviceSchemaCacheManager INSTANCE = new TreeDeviceSchemaCacheManager(); @@ -100,6 +105,7 @@ public void releaseWriteLock() { * @return timeseries partialPath and its SchemaEntity */ public ClusterSchemaTree get(final PartialPath devicePath, final String[] measurements) { + failIfMetadataLeaseFenced(); final ClusterSchemaTree tree = new ClusterSchemaTree(); final IDeviceSchema schema = tableDeviceSchemaCache.getDeviceSchema(devicePath.getNodes()); if (!(schema instanceof TreeDeviceNormalSchema)) { @@ -129,6 +135,7 @@ public ClusterSchemaTree get(final PartialPath devicePath, final String[] measur * @return empty if cache miss or the device path is not a template activated path */ public ClusterSchemaTree getMatchedTemplateSchema(final PartialPath devicePath) { + failIfMetadataLeaseFenced(); final ClusterSchemaTree tree = new ClusterSchemaTree(); final IDeviceSchema schema = tableDeviceSchemaCache.getDeviceSchema(devicePath.getNodes()); if (!(schema instanceof TreeDeviceTemplateSchema)) { @@ -148,6 +155,7 @@ public ClusterSchemaTree getMatchedTemplateSchema(final PartialPath devicePath) * @return empty if cache miss */ public ClusterSchemaTree getMatchedNormalSchema(final PartialPath fullPath) { + failIfMetadataLeaseFenced(); final ClusterSchemaTree tree = new ClusterSchemaTree(); final IDeviceSchema schema = tableDeviceSchemaCache.getDeviceSchema( @@ -167,6 +175,7 @@ public ClusterSchemaTree getMatchedNormalSchema(final PartialPath 
fullPath) { } public List computeWithoutTemplate(final ISchemaComputation schemaComputation) { + failIfMetadataLeaseFenced(); final List indexOfMissingMeasurements = new ArrayList<>(); final String[] measurements = schemaComputation.getMeasurements(); if (measurements == null) { @@ -207,6 +216,7 @@ public Pair, List> computeSourceOfLogicalView( if (!schemaComputation.hasLogicalViewNeedProcess()) { return new Pair<>(new ArrayList<>(), new ArrayList<>()); } + failIfMetadataLeaseFenced(); final List indexOfMissingMeasurements = new ArrayList<>(); final Pair beginToEnd = @@ -263,6 +273,7 @@ public Pair, List> computeSourceOfLogicalView( } public List computeWithTemplate(final ISchemaComputation computation) { + failIfMetadataLeaseFenced(); final List indexOfMissingMeasurements = new ArrayList<>(); final String[] measurements = computation.getMeasurements(); final IDeviceSchema deviceSchema = diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/schemaengine/lease/MetadataLeaseManager.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/schemaengine/lease/MetadataLeaseManager.java new file mode 100644 index 0000000000000..a673f8aa37a26 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/schemaengine/lease/MetadataLeaseManager.java @@ -0,0 +1,287 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.schemaengine.lease; + +import org.apache.iotdb.commons.concurrent.IoTDBThreadPoolFactory; +import org.apache.iotdb.commons.conf.CommonDescriptor; +import org.apache.iotdb.commons.exception.MetadataLeaseFencedException; +import org.apache.iotdb.commons.utils.TestOnly; +import org.apache.iotdb.db.i18n.DataNodeSchemaMessages; +import org.apache.iotdb.db.queryengine.plan.analyze.ClusterPartitionFetcher; +import org.apache.iotdb.db.queryengine.plan.relational.metadata.fetcher.cache.TreeDeviceSchemaCacheManager; +import org.apache.iotdb.db.schemaengine.table.DataNodeTableCache; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicStampedReference; +import java.util.function.LongSupplier; + +import static org.apache.iotdb.commons.concurrent.ThreadName.RELOAD_TABLE_METADATA_CACHE; + +/** + * Tracks the DataNode's "metadata lease" with the ConfigNode. The ConfigNode periodically sends + * heartbeats to the DataNode; while these arrive the DataNode may trust its ConfigNode-pushed + * metadata caches (table/tree schema, device attributes, templates, TTL, permissions, ...). 
If no + * heartbeat is received within {@code metadata_lease_fence_ms} ({@code T_fence}), the lease has + * expired and the DataNode must self-fence: stop trusting those caches so a partitioned DataNode + * cannot serve stale schema and generate dirty data. + * + *

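<p>This class tracks the lease state and, when a ConfigNode heartbeat arrives while the lease is + * expired or the state is not yet back to normal, schedules cache clearing and a metadata re-pull + * on a background executor. The clear/pull actions themselves and the fail-closed checks on the + * read/write/auth paths are provided by the respective subsystems. + * + *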

A monotonic clock ({@link System#nanoTime()}) is used so the lease is immune to wall-clock + * adjustments. The clock and fence threshold are injectable for testing. + */ +public class MetadataLeaseManager { + + @FunctionalInterface + interface MetadataAction { + void execute(); + } + + private final Logger LOGGER = LoggerFactory.getLogger(MetadataLeaseManager.class); + + private final List clearCacheList; + private final List pullMetaList; + + private final LongSupplier nanoClock; + private final LongSupplier fenceThresholdMs; + + private volatile long lastConfigNodeHeartbeatNanos; + + AtomicBoolean hasPullTaskNowRef; + AtomicStampedReference metadataStateRef; + ExecutorService pullExecutorService; + + private enum MetadataState { + NORMAL, + CACHE_CLEARING, + CACHE_CLEARED, + NEED_CLEAR, + PULLING, + PULL_OR_INIT_FAILED; + } + + private MetadataLeaseManager() { + this( + System::nanoTime, + () -> CommonDescriptor.getInstance().getConfig().getMetadataLeaseFenceMs(), + defaultClearCacheList(), + defaultPullMetaList(), + IoTDBThreadPoolFactory.newCachedThreadPool(RELOAD_TABLE_METADATA_CACHE.getName())); + } + + private static List defaultClearCacheList() { + return Arrays.asList( + () -> ClusterPartitionFetcher.getInstance().invalidAllCache(), + () -> DataNodeTableCache.getInstance().invalidateAll(), + () -> TreeDeviceSchemaCacheManager.getInstance().cleanUp()); + } + + private static List defaultPullMetaList() { + return Collections.singletonList( + () -> DataNodeTableCache.getInstance().reloadTableCacheAfterLeaseRecovery()); + } + + MetadataLeaseManager( + final LongSupplier nanoClock, + final LongSupplier fenceThresholdMs, + final List clearCacheList, + final List pullMetaList, + final ExecutorService pullExecutorService) { + this.nanoClock = nanoClock; + this.fenceThresholdMs = fenceThresholdMs; + this.clearCacheList = new ArrayList<>(clearCacheList); + this.pullMetaList = new ArrayList<>(pullMetaList); + // Startup registration performs a full re-sync, so treat 
construction time as a fresh contact. + this.lastConfigNodeHeartbeatNanos = nanoClock.getAsLong(); + + metadataStateRef = new AtomicStampedReference<>(MetadataState.NORMAL, 0); + hasPullTaskNowRef = new AtomicBoolean(false); + this.pullExecutorService = pullExecutorService; + } + + /** Renew the lease: record that a ConfigNode heartbeat has just been received */ + public void triggerCheckWithHeartBeat() { + if (metadataStateRef.getReference() == MetadataState.NORMAL && !hasOutOfLease()) { + // If the lease is about to expire, a cache-clear thread may race with a new CN heartbeat. + // In that case the heartbeat only refreshes the timestamp; + // And the state is no longer NORMAL, so the next heartbeat will schedule + // metadata pulling and make the cache available again + this.lastConfigNodeHeartbeatNanos = nanoClock.getAsLong(); + return; + } + boolean hasPullTaskNow = hasPullTaskNowRef.get(); + if (hasPullTaskNow) { + return; + } + if (hasPullTaskNowRef.compareAndSet(false, true)) { + try { + pullExecutorService.submit(this::clearCacheAndPullMetaData); + } catch (Exception e) { + LOGGER.error(DataNodeSchemaMessages.FAILED_TO_SUBMIT_METADATA_PULL_TASK, e); + hasPullTaskNowRef.set(false); + } + } + } + + private void clearCacheAndPullMetaData() { + try { + int[] stamp = new int[1]; + MetadataState metadataState = metadataStateRef.get(stamp); + if (metadataState == MetadataState.PULLING || metadataState == MetadataState.CACHE_CLEARING) { + LOGGER.error(DataNodeSchemaMessages.UNEXPECTED_METADATA_STATE, metadataState); + return; + } + + // clear the cache + if (metadataState == MetadataState.NORMAL || metadataState == MetadataState.NEED_CLEAR) { + if (!tryClearCache(metadataState, stamp[0])) { + LOGGER.warn( + DataNodeSchemaMessages.METADATA_LEASE_CACHE_CLEARING_IN_PROGRESS, metadataState); + return; + } + } + + pullMetaDataAndInit(); + } finally { + hasPullTaskNowRef.set(false); + } + } + + /** + * Attempts to CAS {@code currentState} → {@code CACHE_CLEARING}, then 
executes all cache-clear + * actions. Sets state to {@code CACHE_CLEARED} on success, or {@code NEED_CLEAR} on failure. + */ + private boolean tryClearCache(final MetadataState currentState, final int currentStamp) { + if (!metadataStateRef.compareAndSet( + currentState, MetadataState.CACHE_CLEARING, currentStamp, currentStamp + 1)) { + return false; + } + try { + clearCacheList.forEach(MetadataAction::execute); + } catch (Exception e) { + metadataStateRef.set(MetadataState.NEED_CLEAR, metadataStateRef.getStamp() + 1); + LOGGER.error(DataNodeSchemaMessages.FAILED_TO_CLEAR_METADATA_CACHE, e); + throw e; + } + metadataStateRef.set(MetadataState.CACHE_CLEARED, metadataStateRef.getStamp() + 1); + return true; + } + + private void pullMetaDataAndInit() { + int[] stamp = new int[1]; + MetadataState metadataState = metadataStateRef.get(stamp); + if (metadataState != MetadataState.CACHE_CLEARED + && metadataState != MetadataState.PULL_OR_INIT_FAILED) { + LOGGER.error(DataNodeSchemaMessages.UNEXPECTED_METADATA_STATE, metadataState); + return; + } + + if (!metadataStateRef.compareAndSet( + metadataState, MetadataState.PULLING, stamp[0], stamp[0] + 1)) { + LOGGER.error(DataNodeSchemaMessages.FAILED_TO_MARK_METADATA_STATE_AS_PULLING, metadataState); + return; + } + + for (final MetadataAction action : pullMetaList) { + try { + action.execute(); + } catch (final Exception e) { + metadataStateRef.set(MetadataState.PULL_OR_INIT_FAILED, metadataStateRef.getStamp() + 1); + LOGGER.error(DataNodeSchemaMessages.FAILED_TO_PULL_OR_INIT_METADATA, e); + throw e; + } + } + this.lastConfigNodeHeartbeatNanos = nanoClock.getAsLong(); + metadataStateRef.set(MetadataState.NORMAL, metadataStateRef.getStamp() + 1); + } + + private boolean hasOutOfLease() { + return getMillisSinceLastConfigNodeHeartbeat() > fenceThresholdMs.getAsLong(); + } + + /** Milliseconds elapsed since the last ConfigNode heartbeat was received (never negative). 
*/ + public long getMillisSinceLastConfigNodeHeartbeat() { + final long elapsedNanos = nanoClock.getAsLong() - lastConfigNodeHeartbeatNanos; + return elapsedNanos > 0 ? elapsedNanos / 1_000_000L : 0L; + } + + public boolean isFenced() { + int[] stampHolder = new int[1]; + MetadataState metadataState = metadataStateRef.get(stampHolder); + if (metadataState != MetadataState.NORMAL) { + return true; + } + + // NORMAL and within lease means the metadata cache is available + if (!hasOutOfLease()) { + return false; + } + + // Do not clear cache in caller threads. Mark the lease as fenced only; the heartbeat worker + // serializes cache clearing and metadata pulling. The stamp prevents an old isFenced() caller + // from changing a newly recovered NORMAL state back to NEED_CLEAR. + metadataStateRef.compareAndSet( + MetadataState.NORMAL, MetadataState.NEED_CLEAR, stampHolder[0], stampHolder[0] + 1); + return true; + } + + /** + * Fail closed when the metadata lease has expired: a fenced DataNode may hold a stale + * table-schema cache (it could have missed a ConfigNode invalidation while partitioned), so + * refuse to serve it rather than risk validating writes/queries against stale schema and + * producing dirty data. + */ + public void failIfMetadataLeaseFenced() { + if (isFenced()) { + throw new MetadataLeaseFencedException(DataNodeSchemaMessages.METADATA_LEASE_IS_FENCED); + } + } + + /** Force the lease to appear expired, for tests that exercise fail-closed behavior. 
*/ + @TestOnly + public void recoveryLeaseForTest(boolean recovery) { + if (recovery) { + this.lastConfigNodeHeartbeatNanos = nanoClock.getAsLong(); + metadataStateRef.set(MetadataState.NORMAL, 0); + } else { + this.lastConfigNodeHeartbeatNanos = + nanoClock.getAsLong() - (fenceThresholdMs.getAsLong() + 1_000L) * 1_000_000L; + } + } + + public static MetadataLeaseManager getInstance() { + return MetadataLeaseManagerHolder.INSTANCE; + } + + private static final class MetadataLeaseManagerHolder { + private static final MetadataLeaseManager INSTANCE = new MetadataLeaseManager(); + + private MetadataLeaseManagerHolder() {} + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/schemaengine/table/DataNodeTableCache.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/schemaengine/table/DataNodeTableCache.java index 11e80f5a63e8c..444ddac5b1da2 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/schemaengine/table/DataNodeTableCache.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/schemaengine/table/DataNodeTableCache.java @@ -20,17 +20,30 @@ package org.apache.iotdb.db.schemaengine.table; import org.apache.iotdb.calc.plan.relational.metadata.CommonMetadataUtils; +import org.apache.iotdb.commons.client.IClientManager; +import org.apache.iotdb.commons.client.exception.ClientManagerException; +import org.apache.iotdb.commons.consensus.ConfigRegionId; +import org.apache.iotdb.commons.exception.IoTDBRuntimeException; +import org.apache.iotdb.commons.exception.SemanticException; import org.apache.iotdb.commons.schema.table.NonCommittableTsTable; +import org.apache.iotdb.commons.schema.table.PreDeleteTsTable; +import org.apache.iotdb.commons.schema.table.TableNodeStatus; import org.apache.iotdb.commons.schema.table.TsTable; import org.apache.iotdb.commons.schema.table.TsTableInternalRPCUtil; import org.apache.iotdb.commons.schema.table.column.TsTableColumnSchema; import org.apache.iotdb.commons.utils.PathUtils; +import 
org.apache.iotdb.confignode.rpc.thrift.TDataNodeLeaseRecoveryResp; import org.apache.iotdb.confignode.rpc.thrift.TFetchTableResp; import org.apache.iotdb.db.conf.IoTDBDescriptor; import org.apache.iotdb.db.i18n.DataNodeSchemaMessages; +import org.apache.iotdb.db.protocol.client.ConfigNodeClient; +import org.apache.iotdb.db.protocol.client.ConfigNodeClientManager; +import org.apache.iotdb.db.protocol.client.ConfigNodeInfo; import org.apache.iotdb.db.queryengine.plan.execution.config.executor.ClusterConfigTaskExecutor; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; import org.apache.iotdb.rpc.TSStatusCode; +import org.apache.thrift.TException; import org.apache.tsfile.utils.Pair; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -48,14 +61,19 @@ import java.util.concurrent.Semaphore; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.function.Function; import java.util.stream.Collectors; +import static org.apache.iotdb.db.i18n.DataNodeSchemaMessages.FAILED_TO_REFRESH_CACHE_FROM_CN; + /** It contains all tables' latest column schema */ public class DataNodeTableCache implements ITableCache { private static final Logger LOGGER = LoggerFactory.getLogger(DataNodeTableCache.class); + private static final IClientManager CONFIG_NODE_CLIENT_MANAGER = + ConfigNodeClientManager.getInstance(); /** Instance-specific version counter for optimistic locking mechanisms. 
*/ private final AtomicLong instanceVersion = new AtomicLong(0); @@ -64,7 +82,7 @@ public class DataNodeTableCache implements ITableCache { private final Map> databaseTableMap = new ConcurrentHashMap<>(); // The database is without "root" - private final Map>> preUpdateTableMap = + private final Map>> specialStatusMap = new ConcurrentHashMap<>(); private final ReentrantReadWriteLock readWriteLock = new ReentrantReadWriteLock(); @@ -72,9 +90,7 @@ public class DataNodeTableCache implements ITableCache { new Semaphore( IoTDBDescriptor.getInstance().getConfig().getDataNodeTableCacheSemaphorePermitNum()); - private DataNodeTableCache() { - // Do nothing - } + private DataNodeTableCache() {} private static final class DataNodeTableCacheHolder { private static final DataNodeTableCache INSTANCE = new DataNodeTableCache(); @@ -82,10 +98,14 @@ private static final class DataNodeTableCacheHolder { private DataNodeTableCacheHolder() {} } - public static DataNodeTableCache getInstance() { + public static ITableCache getInstance() { return DataNodeTableCacheHolder.INSTANCE; } + void failIfMetadataLeaseFenced() { + MetadataLeaseManager.getInstance().failIfMetadataLeaseFenced(); + } + @Override public void init(final byte[] tableInitializationBytes) { readWriteLock.writeLock().lock(); @@ -96,7 +116,7 @@ public void init(final byte[] tableInitializationBytes) { final Pair>, Map>> tableInfo = TsTableInternalRPCUtil.deserializeTableInitializationInfo(tableInitializationBytes); final Map> usingMap = tableInfo.left; - final Map> preCreateMap = tableInfo.right; + final Map> specialStatusMap = tableInfo.right; usingMap.forEach( (key, value) -> databaseTableMap.put( @@ -108,9 +128,9 @@ public void init(final byte[] tableInitializationBytes) { Function.identity(), (v1, v2) -> v2, ConcurrentHashMap::new)))); - preCreateMap.forEach( + specialStatusMap.forEach( (key, value) -> - preUpdateTableMap.put( + this.specialStatusMap.put( PathUtils.unQualifyDatabaseName(key), value.stream() .collect( 
@@ -125,12 +145,43 @@ public void init(final byte[] tableInitializationBytes) { } } + // No need to acquire a lock here; reloadTableCacheAfterLeaseRecovery is within a critical section + // protected by metadataLeaseManager + @Override + public void reloadTableCacheAfterLeaseRecovery() { + try (ConfigNodeClient configNodeClient = + CONFIG_NODE_CLIENT_MANAGER.borrowClient(ConfigNodeInfo.CONFIG_REGION_ID)) { + final TDataNodeLeaseRecoveryResp resp = configNodeClient.reloadCacheAfterLeaseRecovery(); + if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + throw new IoTDBRuntimeException(resp.getStatus().getMessage(), resp.getStatus().getCode()); + } + if (resp.isSetTableInfo()) { + init(resp.getTableInfo()); + } + } catch (final ClientManagerException | TException e) { + throw new RuntimeException(FAILED_TO_REFRESH_CACHE_FROM_CN, e); + } + } + + /** + * The case that pre update Table and pre delete Table procedures targeting the same table are + * executed serially by CN. + * + *

<p>Consider the scenario: + * + * <ol> + * <li>Drop the table first: the DN executed the pre-update step but missed the commit phase. + * <li>Create the table second: the DN executed the pre-update step, which overwrites the result + * of the drop table procedure in {@link #specialStatusMap}. + * </ol>
+ */ @Override public void preUpdateTable(String database, final TsTable table, final String oldName) { database = PathUtils.unQualifyDatabaseName(database); readWriteLock.writeLock().lock(); try { - preUpdateTableMap + failIfMetadataLeaseFenced(); + specialStatusMap .computeIfAbsent(database, k -> new ConcurrentHashMap<>()) .compute( table.getTableName(), @@ -144,11 +195,15 @@ public void preUpdateTable(String database, final TsTable table, final String ol } }); LOGGER.info(DataNodeSchemaMessages.PRE_UPDATE_TABLE_SUCCESS, database, table.getTableName()); - + if (table instanceof PreDeleteTsTable) { + if (databaseTableMap.containsKey(database)) { + databaseTableMap.get(database).remove(table.getTableName()); + } + } // If rename table if (Objects.nonNull(oldName)) { final TsTable oldTable = databaseTableMap.get(database).remove(oldName); - preUpdateTableMap + specialStatusMap .computeIfAbsent(database, k -> new ConcurrentHashMap<>()) .compute( oldName, @@ -173,13 +228,20 @@ public void rollbackUpdateTable(String database, final String tableName, final S database = PathUtils.unQualifyDatabaseName(database); readWriteLock.writeLock().lock(); try { - removeTableFromPreUpdateMap(database, tableName); + failIfMetadataLeaseFenced(); + // if rollback the drop table procedure, do nothing, + // wait for triggering the action of pull table from CN + final TsTable table = getTableFromSpecialStatusMap(database, tableName); + if (table instanceof PreDeleteTsTable) { + return; + } + removeTableFromSpecialStatusMap(database, tableName); LOGGER.info(DataNodeSchemaMessages.ROLLBACK_UPDATE_TABLE_SUCCESS, database, tableName); // If rename table if (Objects.nonNull(oldName)) { // Equals to commit update - final TsTable oldTable = getTableFromPreUpdateMap(database, oldName); + final TsTable oldTable = getTableFromSpecialStatusMap(database, oldName); if (Objects.isNull(oldTable)) { LOGGER.info( "Skip rollback renaming old table {}.{} because it has been handled.", @@ -199,28 +261,16 @@ 
public void rollbackUpdateTable(String database, final String tableName, final S .computeIfAbsent(database, k -> new ConcurrentHashMap<>()) .put(tableName, oldTable); LOGGER.info(DataNodeSchemaMessages.ROLLBACK_RENAME_OLD_TABLE_SUCCESS, database, oldName); - removeTableFromPreUpdateMap(database, oldName); + removeTableFromSpecialStatusMap(database, oldName); } } finally { readWriteLock.writeLock().unlock(); } } - private void removeTableFromPreUpdateMap(final String database, final String tableName) { - preUpdateTableMap.computeIfPresent( - database, - (k, v) -> { - final Pair tableVersionPair = v.get(tableName); - if (Objects.nonNull(tableVersionPair)) { - tableVersionPair.setLeft(null); - } - return v; - }); - } - - private @Nullable TsTable getTableFromPreUpdateMap( + private @Nullable TsTable getTableFromSpecialStatusMap( final String database, final String tableName) { - final Map> tableMap = preUpdateTableMap.get(database); + final Map> tableMap = specialStatusMap.get(database); if (Objects.isNull(tableMap)) { return null; } @@ -228,18 +278,33 @@ private void removeTableFromPreUpdateMap(final String database, final String tab return Objects.nonNull(tableVersionPair) ? 
tableVersionPair.getLeft() : null; } + private void removeTableFromSpecialStatusMap(final String database, final String tableName) { + specialStatusMap.computeIfPresent( + database, + (k, v) -> { + v.computeIfPresent( + tableName, + (innerKey, tableVersionPair) -> { + tableVersionPair.setLeft(null); + return tableVersionPair; + }); + return v; + }); + } + @Override public void commitUpdateTable( String database, final String tableName, final @Nullable String oldName) { database = PathUtils.unQualifyDatabaseName(database); readWriteLock.writeLock().lock(); try { - final TsTable newTable = getTableFromPreUpdateMap(database, tableName); + failIfMetadataLeaseFenced(); + final TsTable newTable = getTableFromSpecialStatusMap(database, tableName); if (Objects.isNull(newTable)) { LOGGER.info( "Skip commit-update table {}.{} because it has been handled.", database, tableName); if (Objects.nonNull(oldName)) { - removeTableFromPreUpdateMap(database, oldName); + removeTableFromSpecialStatusMap(database, oldName); } return; } @@ -251,6 +316,10 @@ public void commitUpdateTable( if (newTable instanceof NonCommittableTsTable) { return; } + if (newTable instanceof PreDeleteTsTable) { + commitDeleteTable(database, tableName); + return; + } final TsTable oldTable = databaseTableMap .computeIfAbsent(database, k -> new ConcurrentHashMap<>()) @@ -264,9 +333,9 @@ public void commitUpdateTable( } else if (LOGGER.isInfoEnabled()) { LOGGER.info(DataNodeSchemaMessages.COMMIT_UPDATE_TABLE_SUCCESS, database, tableName); } - removeTableFromPreUpdateMap(database, tableName); + removeTableFromSpecialStatusMap(database, tableName); if (Objects.nonNull(oldName)) { - removeTableFromPreUpdateMap(database, oldName); + removeTableFromSpecialStatusMap(database, oldName); LOGGER.info(DataNodeSchemaMessages.RENAME_OLD_TABLE_SUCCESS, database, oldName); } instanceVersion.incrementAndGet(); @@ -275,31 +344,38 @@ public void commitUpdateTable( } } + private void commitDeleteTable(String database, final 
String tableName) { + if (databaseTableMap.containsKey(database)) { + databaseTableMap.get(database).remove(tableName); + } + removeTableFromSpecialStatusMap(database, tableName); + LOGGER.info(DataNodeSchemaMessages.COMMIT_DELETE_TABLE_SUCCESS, database, tableName); + } + @Override public void invalid(String database) { database = PathUtils.unQualifyDatabaseName(database); readWriteLock.writeLock().lock(); try { databaseTableMap.remove(database); - preUpdateTableMap.remove(database); + specialStatusMap.remove(database); instanceVersion.incrementAndGet(); } finally { readWriteLock.writeLock().unlock(); } } - @GuardedBy("TableDeviceSchemaCache#writeLock") + /** + * Drop the entire cache. Used on metadata-lease recovery: after the DataNode was fenced it may + * have missed ConfigNode pushes, so the cached schema is no longer trustworthy and must be + * re-fetched lazily on the next lookup. + */ @Override - public void invalid(String database, final String tableName) { - database = PathUtils.unQualifyDatabaseName(database); + public void invalidateAll() { readWriteLock.writeLock().lock(); try { - if (databaseTableMap.containsKey(database)) { - databaseTableMap.get(database).remove(tableName); - } - if (preUpdateTableMap.containsKey(database)) { - preUpdateTableMap.get(database).remove(tableName); - } + databaseTableMap.clear(); + specialStatusMap.clear(); instanceVersion.incrementAndGet(); } finally { readWriteLock.writeLock().unlock(); @@ -318,9 +394,9 @@ public void invalid(String database, final String tableName, final String column copyTable.removeColumnSchema(columnName); databaseTableMap.get(database).put(tableName, copyTable); } - if (preUpdateTableMap.containsKey(database) - && preUpdateTableMap.get(database).containsKey(tableName)) { - final Pair tableVersionPair = preUpdateTableMap.get(database).get(tableName); + if (specialStatusMap.containsKey(database) + && specialStatusMap.get(database).containsKey(tableName)) { + final Pair tableVersionPair = 
specialStatusMap.get(database).get(tableName); if (Objects.nonNull(tableVersionPair.getLeft())) { final TsTable copyTable = new TsTable(tableVersionPair.getLeft()); copyTable.removeColumnSchema(columnName); @@ -338,25 +414,36 @@ public long getInstanceVersion() { return instanceVersion.get(); } + @Override public TsTable getTableInWrite(final String database, final String tableName) { final TsTable result = getTableInCache(database, tableName); return Objects.nonNull(result) ? result : getTable(database, tableName, false); } + @Override public TsTable getTable(final String database, final String tableName) { return getTable(database, tableName, true); } /** * The following logic can handle the cases when configNode failed to clear some table in {@link - * #preUpdateTableMap}, due to the failure of "commit" or rollback of "pre-update". + * #specialStatusMap}, due to the failure of "commit" or rollback of "pre-update". */ + @Override public TsTable getTable(String database, final String tableName, final boolean force) { database = PathUtils.unQualifyDatabaseName(database); - final Map> preUpdateTables = - mayGetTableInPreUpdateMap(database, tableName); - if (Objects.nonNull(preUpdateTables) && !preUpdateTables.isEmpty()) { - updateTable(getTablesInConfigNode(preUpdateTables), preUpdateTables); + final AtomicReference tableStatusRef = new AtomicReference<>(); + final Map> specialStatusMap = + mayGetTableInSpecialStatusMap(database, tableName, tableStatusRef); + + if (Objects.nonNull(specialStatusMap) && !specialStatusMap.isEmpty()) { + Map> fetchedTables = + getTablesInConfigNode(specialStatusMap, tableStatusRef.get()); + if (tableStatusRef.get() == TableNodeStatus.USING) { + updateUsingTable(fetchedTables, specialStatusMap); + } else { + updateDeleteTable(fetchedTables, database, tableName); + } } final TsTable table = getTableInCache(database, tableName); if (Objects.isNull(table) && force) { @@ -365,39 +452,58 @@ public TsTable getTable(String database, final 
String tableName, final boolean f return table; } - private Map> mayGetTableInPreUpdateMap( - final String database, final String tableName) { + private Map> mayGetTableInSpecialStatusMap( + final String database, + final String tableName, + final AtomicReference tableNodeStatus) { readWriteLock.readLock().lock(); try { - return preUpdateTableMap.containsKey(database) - && preUpdateTableMap.get(database).containsKey(tableName) - && Objects.nonNull(preUpdateTableMap.get(database).get(tableName).getLeft()) - ? preUpdateTableMap.entrySet().stream() - .filter( - entry -> { - entry - .getValue() - .entrySet() - .removeIf(tableEntry -> Objects.isNull(tableEntry.getValue().getLeft())); - return !entry.getValue().isEmpty(); - }) - .collect( - Collectors.toMap( - Map.Entry::getKey, - entry -> - entry.getValue().entrySet().stream() - .collect( - Collectors.toMap( - Map.Entry::getKey, - innerEntry -> innerEntry.getValue().getRight())))) - : null; + failIfMetadataLeaseFenced(); + final Map> targetDatabaseMap = specialStatusMap.get(database); + if (Objects.isNull(targetDatabaseMap)) { + return null; + } + + final Pair targetTablePair = targetDatabaseMap.get(tableName); + if (Objects.isNull(targetTablePair) || Objects.isNull(targetTablePair.getLeft())) { + return null; + } + final boolean targetIsPreDelete = targetTablePair.getLeft() instanceof PreDeleteTsTable; + final Map> result = new HashMap<>(); + for (final Map.Entry>> databaseEntry : + specialStatusMap.entrySet()) { + final Map tableVersionMap = + getSpecificStatusTable(databaseEntry, targetIsPreDelete); + if (!tableVersionMap.isEmpty()) { + result.put(databaseEntry.getKey(), tableVersionMap); + } + } + tableNodeStatus.set(targetIsPreDelete ? 
TableNodeStatus.PRE_DELETE : TableNodeStatus.USING); + return result; } finally { readWriteLock.readLock().unlock(); } } + private Map getSpecificStatusTable( + Map.Entry>> databaseEntry, + boolean targetIsPreDelete) { + final Map tableVersionMap = new HashMap<>(); + for (final Map.Entry> tableEntry : + databaseEntry.getValue().entrySet()) { + final TsTable candidate = tableEntry.getValue().getLeft(); + if (Objects.isNull(candidate)) { + continue; + } + if ((candidate instanceof PreDeleteTsTable) == targetIsPreDelete) { + tableVersionMap.put(tableEntry.getKey(), tableEntry.getValue().getRight()); + } + } + return tableVersionMap; + } + private Map> getTablesInConfigNode( - final Map> tableInput) { + final Map> tableInput, final TableNodeStatus tableNodeStatus) { Map> result = Collections.emptyMap(); boolean acquired = false; try { @@ -408,7 +514,8 @@ private Map> getTablesInConfigNode( .fetchTables( tableInput.entrySet().stream() .collect( - Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().keySet()))); + Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().keySet())), + tableNodeStatus); if (TSStatusCode.SUCCESS_STATUS.getStatusCode() == resp.getStatus().getCode()) { result = TsTableInternalRPCUtil.deserializeTsTableFetchResult(resp.getTableInfoMap()); } @@ -423,20 +530,21 @@ private Map> getTablesInConfigNode( return result; } - private void updateTable( + private void updateUsingTable( final Map> fetchedTables, final Map> previousVersions) { readWriteLock.writeLock().lock(); try { + failIfMetadataLeaseFenced(); final AtomicBoolean isUpdated = new AtomicBoolean(false); fetchedTables.forEach( (qualifiedDatabase, tableInfoMap) -> { final String database = PathUtils.unQualifyDatabaseName(qualifiedDatabase); - if (preUpdateTableMap.containsKey(database)) { + if (specialStatusMap.containsKey(database)) { tableInfoMap.forEach( (tableName, tsTable) -> { final Pair existingPair = - preUpdateTableMap.get(database).get(tableName); + 
specialStatusMap.get(database).get(tableName); if (Objects.isNull(existingPair) || Objects.isNull(existingPair.getLeft()) || !Objects.equals( @@ -478,6 +586,76 @@ private void updateTable( } } + /** fetch the pre delete table to update */ + private void updateDeleteTable( + Map> fetchedTables, + String targetDatabase, + final String targetTable) { + readWriteLock.writeLock().lock(); + try { + failIfMetadataLeaseFenced(); + boolean isUpdated = false; + boolean targetTableIsStillDeleting = false; + + for (final Map.Entry> databaseEntry : fetchedTables.entrySet()) { + final String currentDatabase = PathUtils.unQualifyDatabaseName(databaseEntry.getKey()); + + final Map> existingDatabaseMap = + this.specialStatusMap.get(currentDatabase); + if (Objects.isNull(existingDatabaseMap)) { + continue; + } + for (final Map.Entry tableEntry : databaseEntry.getValue().entrySet()) { + final String currentTableName = tableEntry.getKey(); + + final Pair existingPair = existingDatabaseMap.get(currentTableName); + if (Objects.isNull(existingPair) + || Objects.isNull(existingPair.getLeft()) + || !(existingPair.getLeft() instanceof PreDeleteTsTable)) { + continue; + } + + final TsTable fetchedTable = tableEntry.getValue(); + // case 1. the table is still in the pre delete status, do not update + // and only remind user of it + // the CN may be still in drop table procedure or has finished the procedure with error + if (fetchedTable instanceof PreDeleteTsTable) { + if (targetDatabase.equals(currentDatabase) && targetTable.equals(currentTableName)) { + targetTableIsStillDeleting = true; + } + continue; + } + + isUpdated = true; + // case 2. the TsTable is normal TsTable, means that the drop table procedure rollback + // recovery it in databaseTableMap + if (Objects.nonNull(fetchedTable)) { + databaseTableMap + .computeIfAbsent(currentDatabase, k -> new ConcurrentHashMap<>()) + .put(currentTableName, fetchedTable); + } else if (databaseTableMap.containsKey(currentDatabase)) { + // case 3. 
the CN do not hold the table, means that the table has been deleted + databaseTableMap.get(currentDatabase).remove(currentTableName); + } + // case 2 and case 3, remove table from specialStatusMap + existingPair.setLeft(null); + } + } + if (isUpdated) { + instanceVersion.incrementAndGet(); + } + if (targetTableIsStillDeleting) { + throw new SemanticException( + String.format( + DataNodeSchemaMessages.THE_TABLE_IS_IN_PRE_DELETE_STATE, + targetDatabase, + targetTable)); + } + } finally { + readWriteLock.writeLock().unlock(); + } + } + private String compareTable(final TsTable oldTable, final TsTable newTable) { if (Objects.isNull(oldTable)) { return DataNodeSchemaMessages.COMPARE_TABLE_ADDED + newTable; @@ -559,6 +737,7 @@ private String compareTable(final TsTable oldTable, final TsTable newTable) { private TsTable getTableInCache(final String database, final String tableName) { readWriteLock.readLock().lock(); try { + failIfMetadataLeaseFenced(); final TsTable result = databaseTableMap.containsKey(database) ? 
databaseTableMap.get(database).get(tableName) @@ -572,13 +751,16 @@ private TsTable getTableInCache(final String database, final String tableName) { } public boolean isDatabaseExist(final String database) { + failIfMetadataLeaseFenced(); if (databaseTableMap.containsKey(database)) { return true; } - if (getTablesInConfigNode(Collections.singletonMap(database, Collections.emptyMap())) + if (getTablesInConfigNode( + Collections.singletonMap(database, Collections.emptyMap()), TableNodeStatus.USING) .containsKey(database)) { readWriteLock.readLock().lock(); try { + failIfMetadataLeaseFenced(); databaseTableMap.computeIfAbsent(database, k -> new ConcurrentHashMap<>()); return true; } finally { @@ -589,6 +771,7 @@ public boolean isDatabaseExist(final String database) { } // Database shall not start with "root" + @Override public String tryGetInternColumnName( final @Nonnull String database, final @Nonnull String tableName, diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/schemaengine/table/ITableCache.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/schemaengine/table/ITableCache.java index 694be6f3b2adb..63763553981e3 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/schemaengine/table/ITableCache.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/schemaengine/table/ITableCache.java @@ -21,6 +21,7 @@ import org.apache.iotdb.commons.schema.table.TsTable; +import javax.annotation.Nonnull; import javax.annotation.Nullable; public interface ITableCache { @@ -40,7 +41,22 @@ void commitUpdateTable( */ void invalid(final String database); - void invalid(final String database, final String tableName); - void invalid(final String database, final String tableName, final String columnName); + + void invalidateAll(); + + TsTable getTableInWrite(final String database, final String tableName); + + TsTable getTable(final String database, final String tableName); + + TsTable getTable(String database, final String tableName, 
final boolean force); + + String tryGetInternColumnName( + final @Nonnull String database, + final @Nonnull String tableName, + final @Nonnull String columnName); + + boolean isDatabaseExist(final String database); + + void reloadTableCacheAfterLeaseRecovery(); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/DataNodeMetricsHelper.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/DataNodeMetricsHelper.java index 2183d6ac8812b..e2204e8cf0b57 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/DataNodeMetricsHelper.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/DataNodeMetricsHelper.java @@ -108,6 +108,9 @@ public static void bind() { // bind memory related metrics metricService.addMetricSet(GlobalMemoryMetrics.getInstance()); + + // bind metadata lease (ConfigNode heartbeat freshness) metrics + metricService.addMetricSet(new MetadataLeaseMetrics()); } private static void initSystemMetrics(MetricService metricService) { diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/MetadataLeaseMetrics.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/MetadataLeaseMetrics.java new file mode 100644 index 0000000000000..99b33befc9acf --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/MetadataLeaseMetrics.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.service.metrics; + +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; +import org.apache.iotdb.metrics.AbstractMetricService; +import org.apache.iotdb.metrics.metricsets.IMetricSet; +import org.apache.iotdb.metrics.utils.MetricLevel; +import org.apache.iotdb.metrics.utils.MetricType; + +/** + * Exposes the DataNode's metadata-lease state for observability: how long it has been since the + * last ConfigNode heartbeat was received. A value approaching {@code metadata_lease_fence_ms} + * indicates the DataNode is about to (or has) self-fenced its ConfigNode-pushed metadata caches. 
+ */ +public class MetadataLeaseMetrics implements IMetricSet { + + private static final String METADATA_LEASE_HEARTBEAT_AGE_MS = "metadata_lease_heartbeat_age_ms"; + + @Override + public void bindTo(final AbstractMetricService metricService) { + metricService.createAutoGauge( + METADATA_LEASE_HEARTBEAT_AGE_MS, + MetricLevel.IMPORTANT, + MetadataLeaseManager.getInstance(), + MetadataLeaseManager::getMillisSinceLastConfigNodeHeartbeat); + } + + @Override + public void unbindFrom(final AbstractMetricService metricService) { + metricService.remove(MetricType.AUTO_GAUGE, METADATA_LEASE_HEARTBEAT_AGE_MS); + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/compaction/execute/utils/MultiTsFileDeviceIterator.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/compaction/execute/utils/MultiTsFileDeviceIterator.java index a639fba299cb9..3ce86271968f5 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/compaction/execute/utils/MultiTsFileDeviceIterator.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/compaction/execute/utils/MultiTsFileDeviceIterator.java @@ -28,6 +28,7 @@ import org.apache.iotdb.commons.schema.table.column.TsTableColumnSchema; import org.apache.iotdb.commons.utils.CommonDateTimeUtils; import org.apache.iotdb.db.queryengine.plan.analyze.cache.schema.DataNodeTTLCache; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; import org.apache.iotdb.db.schemaengine.table.DataNodeTableCache; import org.apache.iotdb.db.storageengine.dataregion.compaction.io.CompactionTsFileReader; import org.apache.iotdb.db.storageengine.dataregion.compaction.schedule.constant.CompactionType; @@ -236,7 +237,15 @@ public Pair nextDevice() throws IllegalPathException, IOExce IDeviceID deviceID = currentDevice.left; boolean isAligned = currentDevice.right; ignoreAllNullRows = !isAligned || 
deviceID.getTableName().startsWith("root."); - if (!ignoreAllNullRows) { + if (MetadataLeaseManager.getInstance().isFenced()) { + // Metadata lease fenced: this DataNode may hold a stale TTL (it could have missed a + // ConfigNode + // TTL update while partitioned). A too-short stale TTL would make compaction permanently + // delete data that a missed TTL-increase says to keep, so use an infinite TTL: compaction + // deletes nothing by TTL while fenced, and real TTL deletion resumes once the lease recovers + // and the cache resyncs. (Checked first so the table path also avoids the fenced cache.) + ttlForCurrentDevice = Long.MAX_VALUE; + } else if (!ignoreAllNullRows) { ttlForCurrentDevice = DataNodeTTLCache.getInstance().getTTLForTable(databaseName, deviceID.getTableName()); } else { diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/auth/ClusterAuthorityFetcherLeaseTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/auth/ClusterAuthorityFetcherLeaseTest.java new file mode 100644 index 0000000000000..a166ccdcacbe1 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/auth/ClusterAuthorityFetcherLeaseTest.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.auth; + +import org.apache.iotdb.commons.auth.entity.User; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseTestUtils; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import static org.apache.iotdb.db.schemaengine.lease.MetadataLeaseTestUtils.T_FENCE_MS; + +public class ClusterAuthorityFetcherLeaseTest { + + private TestClock clock; + private MetadataLeaseManager leaseManager; + + @Before + public void setUp() { + clock = new TestClock(); + leaseManager = MetadataLeaseTestUtils.newManager(clock::nowNanos); + } + + @Test + public void fencedLeaseDropsPermissionCache() { + final ClusterAuthorityFetcher fetcher = + new TestingClusterAuthorityFetcher(new BasicAuthorityCache(), leaseManager); + final User user = new User("user_fenced", "password"); + fetcher.getAuthorCache().putUserCache(user.getName(), user); + Assert.assertNotNull(fetcher.getAuthorCache().getUserCache(user.getName())); + + clock.addMillis(T_FENCE_MS + 1); + fetcher.checkCacheAvailable(); + + Assert.assertNull( + "a fenced DataNode must drop its permission cache so a missed REVOKE cannot keep authorizing", + fetcher.getAuthorCache().getUserCache(user.getName())); + } + + @Test + public void activeLeaseKeepsPermissionCache() { + final ClusterAuthorityFetcher fetcher = + new TestingClusterAuthorityFetcher(new BasicAuthorityCache(), leaseManager); + final User user = new User("user_active", "password"); + fetcher.getAuthorCache().putUserCache(user.getName(), user); + + // An active lease (a ConfigNode heartbeat was just received) must not needlessly drop the + // cache. 
+ clock.addMillis(1_000L); + fetcher.checkCacheAvailable(); + + Assert.assertNotNull( + "an active lease must not needlessly drop the permission cache", + fetcher.getAuthorCache().getUserCache(user.getName())); + } + + private static class TestClock { + private long nowNanos = 100_000_000_000L; + + private long nowNanos() { + return nowNanos; + } + + private void addMillis(final long millis) { + nowNanos += millis * 1_000_000L; + } + } + + private static class TestingClusterAuthorityFetcher extends ClusterAuthorityFetcher { + + private final MetadataLeaseManager leaseManager; + + private TestingClusterAuthorityFetcher( + final IAuthorCache authorCache, final MetadataLeaseManager leaseManager) { + super(authorCache); + this.leaseManager = leaseManager; + } + + @Override + boolean isMetadataLeaseFenced() { + return leaseManager.isFenced(); + } + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/queryengine/plan/analyze/cache/PartitionCacheLeaseTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/queryengine/plan/analyze/cache/PartitionCacheLeaseTest.java new file mode 100644 index 0000000000000..fbfcaa58a2492 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/queryengine/plan/analyze/cache/PartitionCacheLeaseTest.java @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.queryengine.plan.analyze.cache; + +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupType; +import org.apache.iotdb.common.rpc.thrift.TTimePartitionSlot; +import org.apache.iotdb.commons.exception.MetadataLeaseFencedException; +import org.apache.iotdb.commons.partition.DataPartitionQueryParam; +import org.apache.iotdb.db.auth.AuthorityChecker; +import org.apache.iotdb.db.queryengine.plan.analyze.cache.partition.PartitionCache; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseTestUtils; +import org.apache.iotdb.rpc.TSStatusCode; + +import org.apache.tsfile.file.metadata.IDeviceID; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicLong; + +import static org.apache.iotdb.db.schemaengine.lease.MetadataLeaseTestUtils.T_FENCE_MS; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThrows; + +public class PartitionCacheLeaseTest { + + private final AtomicLong nowNanos = new AtomicLong(100_000_000_000L); + + private MetadataLeaseManager leaseManager; + private PartitionCache partitionCache; + private IDeviceID deviceID; + private TConsensusGroupId consensusGroupId; + + @Before + public void setUp() { + nowNanos.set(100_000_000_000L); + leaseManager = 
MetadataLeaseTestUtils.newManager(nowNanos); + partitionCache = new TestingPartitionCache(leaseManager); + deviceID = IDeviceID.Factory.DEFAULT_FACTORY.create("root.sg.d1"); + consensusGroupId = new TConsensusGroupId(TConsensusGroupType.DataRegion, 1); + } + + @After + public void tearDown() { + partitionCache.invalidAllCache(); + } + + @Test + public void fencedLeaseFailsClosedForPartitionCache() { + nowNanos.addAndGet((T_FENCE_MS + 1) * 1_000_000L); + + assertLeaseFenced( + () -> + partitionCache.getDatabaseToDevice( + Collections.singletonList(deviceID), false, false, AuthorityChecker.SUPER_USER)); + assertLeaseFenced( + () -> + partitionCache.getDeviceToDatabase( + Collections.singletonList(deviceID), false, false, AuthorityChecker.SUPER_USER)); + assertLeaseFenced( + () -> + partitionCache.checkAndAutoCreateDatabase( + "root.sg", false, AuthorityChecker.SUPER_USER)); + assertLeaseFenced( + () -> partitionCache.getRegionReplicaSet(Collections.singletonList(consensusGroupId))); + assertLeaseFenced(() -> partitionCache.getSchemaPartition(databaseToDeviceMap())); + assertLeaseFenced(() -> partitionCache.getSchemaPartition("root.sg")); + assertLeaseFenced(() -> partitionCache.getDataPartition(dataQueryMap())); + } + + private static void assertLeaseFenced(final Runnable runnable) { + final MetadataLeaseFencedException e = + assertThrows(MetadataLeaseFencedException.class, runnable::run); + assertEquals(TSStatusCode.METADATA_LEASE_FENCED.getStatusCode(), e.getErrorCode()); + } + + private Map> databaseToDeviceMap() { + final Map> map = new HashMap<>(); + map.put("root.sg", Collections.singletonList(deviceID)); + return map; + } + + private Map> dataQueryMap() { + final DataPartitionQueryParam param = new DataPartitionQueryParam(); + param.setDeviceID(deviceID); + param.setTimePartitionSlotList(Collections.singletonList(new TTimePartitionSlot(0))); + + final Map> map = new HashMap<>(); + map.put("root.sg", Collections.singletonList(param)); + return map; + } + + 
private static class TestingPartitionCache extends PartitionCache { + + private final MetadataLeaseManager leaseManager; + + private TestingPartitionCache(final MetadataLeaseManager leaseManager) { + this.leaseManager = leaseManager; + } + + @Override + protected void failIfMetadataLeaseFenced() { + MetadataLeaseTestUtils.failIfMetadataLeaseFenced(leaseManager); + } + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/queryengine/plan/relational/metadata/fetcher/cache/TreeDeviceSchemaCacheManagerLeaseTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/queryengine/plan/relational/metadata/fetcher/cache/TreeDeviceSchemaCacheManagerLeaseTest.java new file mode 100644 index 0000000000000..df689c257d414 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/queryengine/plan/relational/metadata/fetcher/cache/TreeDeviceSchemaCacheManagerLeaseTest.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.queryengine.plan.relational.metadata.fetcher.cache; + +import org.apache.iotdb.commons.exception.IllegalPathException; +import org.apache.iotdb.commons.exception.MetadataLeaseFencedException; +import org.apache.iotdb.commons.path.MeasurementPath; +import org.apache.iotdb.commons.path.PartialPath; +import org.apache.iotdb.db.queryengine.common.schematree.ClusterSchemaTree; +import org.apache.iotdb.db.queryengine.plan.analyze.schema.ISchemaComputation; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseTestUtils; +import org.apache.iotdb.rpc.TSStatusCode; + +import org.apache.tsfile.enums.TSDataType; +import org.apache.tsfile.write.schema.MeasurementSchema; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.mockito.Mockito; + +import java.util.Collections; +import java.util.concurrent.atomic.AtomicLong; + +import static org.apache.iotdb.db.schemaengine.lease.MetadataLeaseTestUtils.T_FENCE_MS; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThrows; + +public class TreeDeviceSchemaCacheManagerLeaseTest { + + private final AtomicLong nowNanos = new AtomicLong(100_000_000_000L); + + private MetadataLeaseManager leaseManager; + private TreeDeviceSchemaCacheManager manager; + + @Before + public void setUp() throws IllegalPathException { + nowNanos.set(100_000_000_000L); + leaseManager = MetadataLeaseTestUtils.newManager(nowNanos); + manager = new TestingTreeDeviceSchemaCacheManager(leaseManager); + manager.cleanUp(); + + final ClusterSchemaTree tree = new ClusterSchemaTree(); + tree.appendSingleMeasurement( + new PartialPath("root.sg1.d1.s1"), + new MeasurementSchema("s1", TSDataType.INT32), + null, + null, + null, + false); + tree.setDatabases(Collections.singleton("root.sg1")); + manager.put(tree); + } + + @After + public void tearDown() { + manager.cleanUp(); + } + + @Test + public void 
fencedLeaseFailsClosedForTreeSchemaCache() throws IllegalPathException { + final PartialPath device1 = new PartialPath("root.sg1.d1"); + final String[] measurements = new String[] {"s1"}; + + nowNanos.addAndGet((T_FENCE_MS + 1) * 1_000_000L); + assertLeaseFenced(() -> manager.get(device1, measurements)); + assertLeaseFenced( + () -> { + try { + manager.getMatchedNormalSchema(new MeasurementPath("root.sg1.d1.s1")); + } catch (IllegalPathException e) { + throw new RuntimeException(e); + } + }); + assertLeaseFenced(() -> manager.getMatchedTemplateSchema(device1)); + assertLeaseFenced(() -> manager.computeWithoutTemplate(Mockito.mock(ISchemaComputation.class))); + assertLeaseFenced(() -> manager.computeWithTemplate(Mockito.mock(ISchemaComputation.class))); + final ISchemaComputation logicalViewComputation = Mockito.mock(ISchemaComputation.class); + Mockito.when(logicalViewComputation.hasLogicalViewNeedProcess()).thenReturn(true); + assertLeaseFenced(() -> manager.computeSourceOfLogicalView(logicalViewComputation)); + } + + private static void assertLeaseFenced(final Runnable runnable) { + final MetadataLeaseFencedException e = + assertThrows(MetadataLeaseFencedException.class, runnable::run); + assertEquals(TSStatusCode.METADATA_LEASE_FENCED.getStatusCode(), e.getErrorCode()); + } + + private static class TestingTreeDeviceSchemaCacheManager extends TreeDeviceSchemaCacheManager { + + private final MetadataLeaseManager leaseManager; + + private TestingTreeDeviceSchemaCacheManager(final MetadataLeaseManager leaseManager) { + this.leaseManager = leaseManager; + } + + @Override + void failIfMetadataLeaseFenced() { + MetadataLeaseTestUtils.failIfMetadataLeaseFenced(leaseManager); + } + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/lease/MetadataLeaseManagerTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/lease/MetadataLeaseManagerTest.java new file mode 100644 index 0000000000000..9ea9c580bd6b5 --- 
/dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/lease/MetadataLeaseManagerTest.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.schemaengine.lease; + +import org.junit.Test; + +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; + +import static org.apache.iotdb.db.schemaengine.lease.MetadataLeaseTestUtils.T_FENCE_MS; +import static org.apache.iotdb.db.schemaengine.lease.MetadataLeaseTestUtils.newManager; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +public class MetadataLeaseManagerTest { + + @Test + public void isNotFencedWithinLease() { + final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final MetadataLeaseManager manager = newManager(nowNanos, () -> {}, () -> {}); + + nowNanos.addAndGet(TimeUnit.MILLISECONDS.toNanos(1234)); + + assertFalse(manager.isFenced()); + assertEquals(1234L, manager.getMillisSinceLastConfigNodeHeartbeat()); + } + + @Test + public void recoversAfterHeartbeatWhenLeaseExpired() { + final 
AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final MetadataLeaseManager manager = newManager(nowNanos, () -> {}, () -> {}); + + nowNanos.addAndGet(TimeUnit.MILLISECONDS.toNanos(T_FENCE_MS + 1)); + assertTrue(manager.isFenced()); + + manager.recoveryLeaseForTest(true); + + assertFalse(manager.isFenced()); + } + + @Test + public void retriesCacheClearInHeartbeatWorkerAfterFailure() { + final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final AtomicInteger clearAttempts = new AtomicInteger(); + final MetadataLeaseManager manager = + newManager( + nowNanos, + () -> { + if (clearAttempts.getAndIncrement() == 0) { + throw new RuntimeException("mock clear cache failure"); + } + }, + () -> {}); + + nowNanos.addAndGet(TimeUnit.MILLISECONDS.toNanos(T_FENCE_MS + 1)); + assertTrue(manager.isFenced()); + assertEquals(0, clearAttempts.get()); + + manager.triggerCheckWithHeartBeat(); + assertTrue(manager.isFenced()); + assertEquals(1, clearAttempts.get()); + + manager.triggerCheckWithHeartBeat(); + assertFalse(manager.isFenced()); + assertEquals(2, clearAttempts.get()); + } + + @Test + public void retriesMetadataPullAfterFailure() { + final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final AtomicInteger pullAttempts = new AtomicInteger(); + final MetadataLeaseManager manager = + newManager( + nowNanos, + () -> {}, + () -> { + if (pullAttempts.getAndIncrement() == 0) { + throw new RuntimeException("mock pull failure"); + } + }); + + nowNanos.addAndGet(TimeUnit.MILLISECONDS.toNanos(T_FENCE_MS + 1)); + assertTrue(manager.isFenced()); + + manager.triggerCheckWithHeartBeat(); + assertTrue(manager.isFenced()); + + manager.triggerCheckWithHeartBeat(); + assertFalse(manager.isFenced()); + assertEquals(2, pullAttempts.get()); + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/lease/MetadataLeaseTestUtils.java 
b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/lease/MetadataLeaseTestUtils.java new file mode 100644 index 0000000000000..1ae72821a77c3 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/lease/MetadataLeaseTestUtils.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.schemaengine.lease; + +import org.apache.iotdb.commons.exception.MetadataLeaseFencedException; + +import com.google.common.util.concurrent.MoreExecutors; + +import java.util.Collections; +import java.util.concurrent.atomic.AtomicLong; +import java.util.function.LongSupplier; + +public final class MetadataLeaseTestUtils { + + public static final long T_FENCE_MS = 20_000L; + + private MetadataLeaseTestUtils() {} + + public static MetadataLeaseManager newManager(final AtomicLong nowNanos) { + return newManager(nowNanos::get); + } + + public static MetadataLeaseManager newManager(final LongSupplier nowNanos) { + return newManager(nowNanos, () -> {}, () -> {}); + } + + public static void failIfMetadataLeaseFenced(final MetadataLeaseManager manager) { + if (manager.isFenced()) { + throw new MetadataLeaseFencedException( + "Metadata lease is fenced. The local metadata cache is unavailable."); + } + } + + static MetadataLeaseManager newManager( + final AtomicLong nowNanos, + final MetadataLeaseManager.MetadataAction clearAction, + final MetadataLeaseManager.MetadataAction pullAction) { + return newManager(nowNanos::get, clearAction, pullAction); + } + + static MetadataLeaseManager newManager( + final LongSupplier nowNanos, + final MetadataLeaseManager.MetadataAction clearAction, + final MetadataLeaseManager.MetadataAction pullAction) { + return new MetadataLeaseManager( + nowNanos, + () -> T_FENCE_MS, + Collections.singletonList(clearAction), + Collections.singletonList(pullAction), + MoreExecutors.newDirectExecutorService()); + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/table/DataNodeTableCacheLeaseTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/table/DataNodeTableCacheLeaseTest.java new file mode 100644 index 0000000000000..8007581f9289c --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/table/DataNodeTableCacheLeaseTest.java @@ 
-0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.schemaengine.table; + +import org.apache.iotdb.commons.exception.MetadataLeaseFencedException; +import org.apache.iotdb.commons.schema.table.TsTable; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseTestUtils; +import org.apache.iotdb.rpc.TSStatusCode; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.mockito.Mockito; + +import java.util.concurrent.atomic.AtomicLong; + +import static org.apache.iotdb.db.schemaengine.lease.MetadataLeaseTestUtils.T_FENCE_MS; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThrows; + +public class DataNodeTableCacheLeaseTest { + + private final AtomicLong nowNanos = new AtomicLong(100_000_000_000L); + + private MetadataLeaseManager leaseManager; + private DataNodeTableCache tableCache; + + @Before + public void setUp() { + nowNanos.set(100_000_000_000L); + leaseManager = MetadataLeaseTestUtils.newManager(nowNanos); + tableCache = Mockito.spy((DataNodeTableCache) DataNodeTableCache.getInstance()); + tableCache.invalidateAll(); + 
Mockito.doAnswer( + invocation -> { + MetadataLeaseTestUtils.failIfMetadataLeaseFenced(leaseManager); + return null; + }) + .when(tableCache) + .failIfMetadataLeaseFenced(); + } + + @After + public void tearDown() { + tableCache.invalidateAll(); + } + + @Test + public void fencedLeaseFailsClosedForReadApis() { + nowNanos.addAndGet((T_FENCE_MS + 1) * 1_000_000L); + assertLeaseFenced(() -> tableCache.getTableInWrite("root.db", "t")); + assertLeaseFenced(() -> tableCache.getTable("root.db", "t", false)); + assertLeaseFenced(() -> tableCache.isDatabaseExist("root.db")); + } + + @Test + public void fencedLeaseFailsClosedForUpdateApis() { + final TsTable table = new TsTable("t"); + + nowNanos.addAndGet((T_FENCE_MS + 1) * 1_000_000L); + assertLeaseFenced(() -> tableCache.preUpdateTable("root.db", table, null)); + assertLeaseFenced(() -> tableCache.rollbackUpdateTable("root.db", "t", null)); + assertLeaseFenced(() -> tableCache.commitUpdateTable("root.db", "t", null)); + } + + private static void assertLeaseFenced(final Runnable runnable) { + final MetadataLeaseFencedException e = + assertThrows(MetadataLeaseFencedException.class, runnable::run); + assertEquals(TSStatusCode.METADATA_LEASE_FENCED.getStatusCode(), e.getErrorCode()); + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/table/DataNodeTableCacheTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/table/DataNodeTableCacheTest.java index 4b33991e3d7ed..a3baad292c78f 100644 --- a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/table/DataNodeTableCacheTest.java +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/table/DataNodeTableCacheTest.java @@ -39,7 +39,7 @@ public class DataNodeTableCacheTest { @Test public void interruptedFetchDoesNotLeakSemaphorePermit() throws Exception { - final DataNodeTableCache cache = DataNodeTableCache.getInstance(); + final ITableCache cache = DataNodeTableCache.getInstance(); 
cache.invalid(DATABASE); try { final Semaphore fetchTableSemaphore = getFetchTableSemaphore(cache); @@ -60,7 +60,7 @@ public void interruptedFetchDoesNotLeakSemaphorePermit() throws Exception { @Test public void commitUpdateTableIsIdempotent() { - final DataNodeTableCache cache = DataNodeTableCache.getInstance(); + final ITableCache cache = DataNodeTableCache.getInstance(); cache.invalid(TABLE_CACHE_TEST_DATABASE); try { cache.preUpdateTable(TABLE_CACHE_TEST_DATABASE, createTable(TABLE_NAME), null); @@ -77,7 +77,7 @@ public void commitUpdateTableIsIdempotent() { @Test public void commitAfterRollbackUpdateTableIsIgnored() { - final DataNodeTableCache cache = DataNodeTableCache.getInstance(); + final ITableCache cache = DataNodeTableCache.getInstance(); cache.invalid(TABLE_CACHE_TEST_DATABASE); try { cache.preUpdateTable(TABLE_CACHE_TEST_DATABASE, createTable(TABLE_NAME), null); @@ -91,7 +91,7 @@ public void commitAfterRollbackUpdateTableIsIgnored() { } } - private Semaphore getFetchTableSemaphore(final DataNodeTableCache cache) throws Exception { + private Semaphore getFetchTableSemaphore(final ITableCache cache) throws Exception { final Field field = DataNodeTableCache.class.getDeclaredField("fetchTableSemaphore"); field.setAccessible(true); return (Semaphore) field.get(cache); diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/utils/EnvironmentUtils.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/utils/EnvironmentUtils.java index 4c2e1c9c95b98..83e0106e2aeb6 100644 --- a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/utils/EnvironmentUtils.java +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/utils/EnvironmentUtils.java @@ -353,7 +353,7 @@ public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOEx public static void envSetUp() { logger.debug("EnvironmentUtil setup..."); config.setThriftServerAwaitTimeForStopService(60); - + 
CommonDescriptor.getInstance().getConfig().setMetadataLeaseFenceMs(Long.MAX_VALUE); createAllDir(); try { diff --git a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template index 64e35fddd75c2..2ec0b2e010fcf 100644 --- a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template +++ b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template @@ -752,6 +752,16 @@ failure_detector_phi_threshold=30 # Datatype: long failure_detector_phi_acceptable_pause_in_ms=10000 +# A DataNode self-fences its ConfigNode-pushed metadata caches (table/tree schema, templates, TTL, +# permissions, ...) if it has not received a ConfigNode heartbeat within this duration, so a +# partitioned DataNode stops trusting stale caches. Kept aligned with +# failure_detector_fixed_threshold_in_ms so a DataNode fences itself around the same time the +# cluster would consider it down. The ConfigNode also uses this to decide how long it must wait +# before treating an unreachable DataNode as safely fenced. 
+# effectiveMode: restart +# Datatype: long +metadata_lease_fence_ms=20000 + # Whether to enable topology probing between DataNodes # effectiveMode: hot_reload # Datatype: Boolean diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java index e9e01efd2d48b..0435fc1f5e85a 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java @@ -105,6 +105,7 @@ public enum ThreadName { CONFIG_NODE_TIMEOUT_EXECUTOR("ProcedureTimeoutExecutor"), CONFIG_NODE_WORKER_THREAD_MONITOR("ProcedureWorkerThreadMonitor"), CONFIG_NODE_RETRY_FAILED_TASK("Cluster-RetryFailedTasks-Service"), + RELOAD_TABLE_METADATA_CACHE("Reload-table-metadata-cache"), // -------------------------- IoTConsensusV2 -------------------------- IOT_CONSENSUS_V2_RPC_SERVICE("IoTConsensusV2RPC-Service"), IOT_CONSENSUS_V2_RPC_PROCESSOR("IoTConsensusV2RPC-Processor"), @@ -383,7 +384,8 @@ public enum ThreadName { CONFIG_NODE_PROCEDURE_WORKER, CONFIG_NODE_WORKER_THREAD_MONITOR, CONFIG_NODE_TIMEOUT_EXECUTOR, - CONFIG_NODE_RETRY_FAILED_TASK)); + CONFIG_NODE_RETRY_FAILED_TASK, + RELOAD_TABLE_METADATA_CACHE)); private static final Set metricsThreadNames = new HashSet<>( diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonConfig.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonConfig.java index 257b2a8ad5176..8442937bbb75d 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonConfig.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonConfig.java @@ -477,6 +477,13 @@ public class CommonConfig { private volatile long remoteWriteMaxRetryDurationInMs = 60000; + // The DataNode self-fences its ConfigNode-pushed metadata caches 
(table/tree schema, template, + // TTL, permission, ...) if it has not received a ConfigNode heartbeat within this duration. Kept + // aligned with the failure detector threshold so a partitioned DataNode stops trusting stale + // caches around the same time the cluster would consider it dead. Also used by the ConfigNode to + // derive how long it must wait before treating an unreachable DataNode as safely fenced. + private volatile long metadataLeaseFenceMs = 20_000; + private final RateLimiter querySamplingRateLimiter = RateLimiter.create(160); // if querySamplingRateLimiter < 0, means that there is no rate limit, we need to full sample all // the queries @@ -2907,6 +2914,14 @@ public void setRemoteWriteMaxRetryDurationInMs(long remoteWriteMaxRetryDurationI this.remoteWriteMaxRetryDurationInMs = remoteWriteMaxRetryDurationInMs; } + public long getMetadataLeaseFenceMs() { + return metadataLeaseFenceMs; + } + + public void setMetadataLeaseFenceMs(long metadataLeaseFenceMs) { + this.metadataLeaseFenceMs = metadataLeaseFenceMs; + } + public int getArenaNum() { return arenaNum; } diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonDescriptor.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonDescriptor.java index 9d7c6bdffc26b..dba01c67bd25b 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonDescriptor.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonDescriptor.java @@ -340,6 +340,11 @@ public void loadCommonProps(TrimProperties properties) throws IOException { properties.getProperty( "path_log_max_size", String.valueOf(config.getPathLogMaxSize())))); + config.setMetadataLeaseFenceMs( + Long.parseLong( + properties.getProperty( + "metadata_lease_fence_ms", String.valueOf(config.getMetadataLeaseFenceMs())))); + loadRetryProperties(properties); loadBinaryAllocatorProps(properties); } diff --git 
a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/exception/MetadataLeaseFencedException.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/exception/MetadataLeaseFencedException.java new file mode 100644 index 0000000000000..b87f76bb18f12 --- /dev/null +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/exception/MetadataLeaseFencedException.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.commons.exception; + +import org.apache.iotdb.rpc.TSStatusCode; + +public class MetadataLeaseFencedException extends IoTDBRuntimeException { + + public MetadataLeaseFencedException(String message) { + super(message, TSStatusCode.METADATA_LEASE_FENCED.getStatusCode()); + } + + public MetadataLeaseFencedException(Throwable cause) { + super(cause, TSStatusCode.METADATA_LEASE_FENCED.getStatusCode()); + } +} diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/schema/table/NonCommittableTsTable.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/schema/table/NonCommittableTsTable.java index 5f22c86474f3c..71989fa54ac57 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/schema/table/NonCommittableTsTable.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/schema/table/NonCommittableTsTable.java @@ -31,6 +31,8 @@ * version. */ public class NonCommittableTsTable extends TsTable { + public static final int NON_COMMITTABLE_MARKER = -1; + public NonCommittableTsTable(final String tableName) { super(tableName); } @@ -38,6 +40,6 @@ public NonCommittableTsTable(final String tableName) { @Override public void serialize(final OutputStream stream) throws IOException { ReadWriteIOUtils.write(tableName, stream); - ReadWriteIOUtils.write(-1, stream); + ReadWriteIOUtils.write(NON_COMMITTABLE_MARKER, stream); } } diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/schema/table/PreDeleteTsTable.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/schema/table/PreDeleteTsTable.java new file mode 100644 index 0000000000000..2ed1aeff3506b --- /dev/null +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/schema/table/PreDeleteTsTable.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.commons.schema.table; + +import org.apache.tsfile.utils.ReadWriteIOUtils; + +import java.io.IOException; +import java.io.OutputStream; + +public class PreDeleteTsTable extends TsTable { + public static final int PRE_DELETE_MARKER = -2; + + public PreDeleteTsTable(final String tableName) { + super(tableName); + } + + @Override + public void serialize(final OutputStream stream) throws IOException { + ReadWriteIOUtils.write(tableName, stream); + ReadWriteIOUtils.write(PRE_DELETE_MARKER, stream); + } +} diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/schema/table/TableNodeStatus.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/schema/table/TableNodeStatus.java index 33aad560c0f45..48c85a4081b91 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/schema/table/TableNodeStatus.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/schema/table/TableNodeStatus.java @@ -39,6 +39,10 @@ public enum TableNodeStatus { this.status = status; } + public byte getStatus() { + return status; + } + public void serialize(final OutputStream outputStream) throws IOException { ReadWriteIOUtils.write(status, outputStream); } @@ -57,7 +61,11 @@ public static TableNodeStatus 
deserialize(final InputStream inputStream) throws } public static TableNodeStatus deserialize(final ByteBuffer buffer) { - switch (ReadWriteIOUtils.readByte(buffer)) { + return deserialize(ReadWriteIOUtils.readByte(buffer)); + } + + public static TableNodeStatus deserialize(final byte status) { + switch (status) { case 0: return PRE_CREATE; case 1: diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/schema/table/TsTable.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/schema/table/TsTable.java index 833c37a59407f..3546413b10324 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/schema/table/TsTable.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/schema/table/TsTable.java @@ -391,9 +391,12 @@ public void serialize(final OutputStream stream) throws IOException { public static TsTable deserialize(final InputStream inputStream) throws IOException { final String name = ReadWriteIOUtils.readString(inputStream); final int columnNum = ReadWriteIOUtils.readInt(inputStream); - if (columnNum < 0) { + if (columnNum == NonCommittableTsTable.NON_COMMITTABLE_MARKER) { return new NonCommittableTsTable(name); } + if (columnNum == PreDeleteTsTable.PRE_DELETE_MARKER) { + return new PreDeleteTsTable(name); + } final TsTable table = new TsTable(name); for (int i = 0; i < columnNum; i++) { table.addColumnSchema(TsTableColumnSchemaUtil.deserialize(inputStream)); @@ -405,9 +408,12 @@ public static TsTable deserialize(final InputStream inputStream) throws IOExcept public static TsTable deserialize(final ByteBuffer buffer) { final String name = ReadWriteIOUtils.readString(buffer); final int columnNum = ReadWriteIOUtils.readInt(buffer); - if (columnNum < 0) { + if (columnNum == NonCommittableTsTable.NON_COMMITTABLE_MARKER) { return new NonCommittableTsTable(name); } + if (columnNum == PreDeleteTsTable.PRE_DELETE_MARKER) { + return new PreDeleteTsTable(name); + } final TsTable table = new 
TsTable(name); for (int i = 0; i < columnNum; i++) { table.addColumnSchema(TsTableColumnSchemaUtil.deserialize(buffer)); diff --git a/iotdb-protocol/thrift-confignode/src/main/thrift/confignode.thrift b/iotdb-protocol/thrift-confignode/src/main/thrift/confignode.thrift index a414584f80ec9..4b0b995c16778 100644 --- a/iotdb-protocol/thrift-confignode/src/main/thrift/confignode.thrift +++ b/iotdb-protocol/thrift-confignode/src/main/thrift/confignode.thrift @@ -159,6 +159,11 @@ struct TDataNodeRestartResp { 4: optional list correctConsensusGroups } +struct TDataNodeLeaseRecoveryResp{ + 1: required common.TSStatus status + 2: optional binary tableInfo +} + struct TDataNodeRemoveReq { 1: required list dataNodeLocations } @@ -1343,6 +1348,11 @@ service IConfigNodeRPCService { TDataNodeRestartResp restartDataNode(TDataNodeRestartReq req) + /** + * Get all metadata caches when the heartbeat renews the lease + */ + TDataNodeLeaseRecoveryResp reloadCacheAfterLeaseRecovery(); + // ====================================================== // AINode // ====================================================== @@ -2095,7 +2105,7 @@ service IConfigNodeRPCService { TDescTable4InformationSchemaResp descTables4InformationSchema() - TFetchTableResp fetchTables(map> fetchTableMap) + TFetchTableResp fetchTables(map> fetchTableMap, byte tableNodeStatus) TDeleteTableDeviceResp deleteDevice(TDeleteTableDeviceReq req)