diff --git a/CMakeLists.txt b/CMakeLists.txt
index 546a14494..b5a438713 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -161,6 +161,7 @@ option(
 )
 message(STATUS "Custom communicator support is turned ${QUEST_ENABLE_SUBCOMM}. Set QUEST_ENABLE_SUBCOMM to modify.")
 
+
 # GPU Acceleration
 option(
   QUEST_ENABLE_CUDA
@@ -184,6 +185,20 @@ option(
 message(STATUS "AMD GPU acceleration is turned ${QUEST_ENABLE_HIP}. Set QUEST_ENABLE_HIP to modify.")
 
 
+# GPU Performance Tuning
+# (We do not print this value when configuring CMake as it is for advanced users only)
+
+set(quest_tpb_description # (the games we play for multi-line set() strings!)
+  "The default number of threads per block QuEST will use when offloading to a GPU. Set to 128 by default. "
+  "Must be a multiple of 32 (on NVIDIA GPUs) or 64 (on AMD GPUs). Can be overridden at executable launch "
+  "via an environment variable of the same name, or during runtime via a corresponding API setter function."
+)
+set(QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK 128 
+  CACHE STRING
+  "${quest_tpb_description}")
+mark_as_advanced(QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK)
+
+
 # Deprecated API
 option(
   QUEST_ENABLE_DEPRECATED_API
@@ -197,9 +212,15 @@ option(
   "Whether to disable compile-time warnings ordinarily triggered by use of the deprecated API. Turned OFF by default."
   OFF
 )
-message(STATUS "Disabling of deprecated API warnings is turned ${QUEST_DISABLE_DEPRECATION_WARNINGS}. Set QUEST_DISABLE_DEPRECATION_WARNINGS to modify.")
+message(STATUS 
+  "Disabling of deprecated API warnings is turned ${QUEST_DISABLE_DEPRECATION_WARNINGS}. "
+  "Set QUEST_DISABLE_DEPRECATION_WARNINGS to modify."
+)
 
 option(QUEST_INSTALL_BINARIES "Whether to include example and user binaries in the install." OFF)
+if (QUEST_INSTALL_BINARIES)
+  message(STATUS "Including example and user binaries in the install (if built).")
+endif()
 
 
 
@@ -222,10 +243,12 @@ if (QUEST_ENABLE_CUQUANTUM AND NOT QUEST_ENABLE_CUDA)
   message(FATAL_ERROR "Use of cuQuantum requires CUDA.")
 endif()
 
+
 if (QUEST_ENABLE_SUBCOMM AND NOT QUEST_ENABLE_MPI)
   message(FATAL_ERROR "Distribution must be enabled to make use of a user-defined communicator for QuEST.")
 endif()
 
+
 if(WIN32)
   
   # Force MSVC to export all symbols in a shared library, like GCC and clang
@@ -243,6 +266,37 @@ if(WIN32)
 endif()
 
 
+# validate numTPB even when GPU not compiled
+if (QUEST_ENABLE_HIP)
+  set(quest_warp_size 64)
+  set(quest_gpu_model "AMD GPUs (via HIP)")
+else()
+  set(quest_warp_size 32)
+  set(quest_gpu_model "NVIDIA GPUs (via CUDA), or when not targeting GPUs")
+endif()
+math(EXPR quest_tpb_remainder "${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK} % ${quest_warp_size}")
+if ((NOT (quest_tpb_remainder EQUAL 0)) OR NOT (QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK GREATER 0))
+  message(FATAL_ERROR
+    "QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK was set to ${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}, "
+    "but it must be a positive multiple of ${quest_warp_size} when compiling for ${quest_gpu_model}."
+  )
+endif()
+
+
+# warn when numTPB will be later overridden by the current environment variable
+if(
+  DEFINED ENV{QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK} 
+  AND NOT "$ENV{QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}" STREQUAL ""
+  AND NOT "$ENV{QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}" STREQUAL "${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}"
+)
+  message(WARNING 
+    "The CMake option QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK=${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK} "
+    "differs from the current environment variable (of the same name) value of $ENV{QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}. "
+    "If not cleared before QuEST is launched, the latter will override the former."
+  )
+endif()
+
+
 # Encourage high-performance Release build
 
 # Taken from Kitware's exmaple of problematic code at
@@ -508,18 +562,19 @@ if (QUEST_ENABLE_CUDA OR QUEST_ENABLE_HIP)
 else()
   set(QUEST_COMPILE_CUDA 0)
 endif()
+set(QUEST_COMPILE_HIP ${QUEST_ENABLE_HIP})
+
+
+# non-binary set vars which will be written to config.h.in (with a differing name) 
+set(QUEST_UNSPECIFIED_DEFAULT_NUM_GPU_THREADS_PER_BLOCK ${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK})
 
 
-# these vars are already set, but repeated here for clarity
+# these vars are already set (cmake name matches the macro name), but repeated here for clarity
 set(QUEST_FLOAT_PRECISION ${QUEST_FLOAT_PRECISION})
 set(QUEST_ENABLE_NUMA ${QUEST_ENABLE_NUMA})
 set(QUEST_DISABLE_DEPRECATION_WARNINGS ${QUEST_DISABLE_DEPRECATION_WARNINGS})
 
 
-# these do not appear in src but are saved for record-keeping in config.h.in
-set(QUEST_COMPILE_HIP ${QUEST_ENABLE_HIP})
-
-
 
 # ============================
 # Pass files to library
diff --git a/docs/cmake.md b/docs/cmake.md
index e49157f4e..fec90d76a 100644
--- a/docs/cmake.md
+++ b/docs/cmake.md
@@ -48,7 +48,7 @@ make
 | `QUEST_DISABLE_DEPRECATION_WARNINGS` | (`OFF`), `ON` | Whether to disable the compile-time deprecation warnings when using the deprecated (v3) API. |
 | `USER_SOURCE_NAMES` | (Undefined), String | The source file for a user program which will be compiled alongside QuEST. `USER_OUTPUT_EXE_NAME` *must* also be defined. |
 | `USER_OUTPUT_EXE_NAME` | (Undefined), String | The name of the executable which will be created from the provided `USER_SOURCE_NAMES`. `USER_SOURCE_NAMES` *must* also be defined. |
-
+| `QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK` | (128), Number | The default number of threads per block QuEST will use when offloading to a GPU. *Must* be a multiple of 32 (on NVIDIA GPUs) or 64 (on AMD GPUs). This CMake variable sets the default if not later overridden. The number can be overridden at process launch time using an [environment variable](https://quest-kit.github.io/QuEST/group__modes.html#gaf1b71f54d270d3353fe072c66827339b) of the same name, or during runtime using [`setQuESTNumGpuThreadsPerBlock()`](https://quest-kit.github.io/QuEST/group__experimental.html#gae35a55c6d9366ce677e6aaaf4c1ff5ef). |
 
 
 
diff --git a/docs/launch.md b/docs/launch.md
index 9d5e6ac22..3eb8493ee 100644
--- a/docs/launch.md
+++ b/docs/launch.md
@@ -270,6 +270,7 @@ QuEST execution can be configured prior to runtime using the below [environment
 
 - [`QUEST_PERMIT_NODES_TO_SHARE_GPU`](https://quest-kit.github.io/QuEST/group__modes.html#ga84b134d552464a82d29517e1ce1309a7)
 - [`QUEST_DEFAULT_VALIDATION_EPSILON`](https://quest-kit.github.io/QuEST/group__modes.html#gac4ab30619e411c965377c910680e242c)
+- [`QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK`](https://quest-kit.github.io/QuEST/group__modes.html#gaf1b71f54d270d3353fe072c66827339b)
 
 Note the unit tests in the preceding section accept additional environment variables.
 
diff --git a/examples/extended/set_num_gpu_threads.c b/examples/extended/set_num_gpu_threads.c
new file mode 100644
index 000000000..1b3dc175f
--- /dev/null
+++ b/examples/extended/set_num_gpu_threads.c
@@ -0,0 +1,91 @@
+/** @file
+ * 
+ * An example of using QuEST's experimental
+ * setQuESTNumGpuThreadsPerBlock() function
+ * to change the parallelisation granularity
+ * of GPU simulation
+ * 
+ * @author Tyson Jones
+ */
+
+#include "quest.h"
+#include <stdio.h>
+#include <time.h>
+
+
+const int NUM_REPS = 10;
+const int NUM_QUBITS = 25;  // 512 MiB (at double precision)
+
+
+void simulation(Qureg qureg)
+{
+    // put your favourite QuEST simulation here
+    initRandomPureState(qureg);
+    applyFullQuantumFourierTransform(qureg, /*inverse=*/false);
+    calcTotalProb(qureg);
+}
+
+
+void benchmark(Qureg qureg, int numThreadsPerBlock)
+{
+    printf("Using %d threads per block... ", numThreadsPerBlock);
+    fflush(stdout);
+
+    setQuESTNumGpuThreadsPerBlock(numThreadsPerBlock);
+
+    // warmup
+    for (int r=0; r<NUM_REPS; r++)
+        simulation(qureg);
+    syncQuESTEnv();
+
+    double start = (double) clock();
+
+    for (int r = 0; r < NUM_REPS; r++)
+        simulation(qureg);
+    syncQuESTEnv();
+
+    double end = (double) clock();
+    double dur = (end - start) / CLOCKS_PER_SEC;
+    double av = dur / NUM_REPS;
+
+    printf("took %fs\n", av);
+}
+
+
+int main(void)
+{
+    initQuESTEnv();
+
+    // This example is pointless without a GPU!
+    if (!getQuESTEnv().isGpuAccelerated)
+    {
+        printf(
+            "GPU acceleration is not enabled, and so changing the number "
+            "of threads per block has no effect. Exiting...\n");
+        finalizeQuESTEnv();
+        return 0;
+    }
+
+    int initNumTPB = getQuESTNumGpuThreadsPerBlock();
+    printf("Initial numThreadsPerBlock: %d\n\n", initNumTPB);
+
+    // Create a statevector parallelised only by the GPU
+    Qureg qureg = createCustomQureg(NUM_QUBITS, 0, 0, 1, 0);
+    reportQuregParams(qureg);
+
+    // Benchmark sensible parameters
+    int goodTPB[] = {64, 128, 256, 512, 1024};
+    for (int i = 0; i < 5; i++)
+        benchmark(qureg, goodTPB[i]);
+
+    // Try silly parameters
+    setQuESTValidationOff();
+    int badTPB[] = {31, 15, 5, 1};
+    for (int i = 0; i < 4; i++)
+        benchmark(qureg, badTPB[i]);
+
+    destroyQureg(qureg);
+    finalizeQuESTEnv();
+
+    return 0;
+}
diff --git a/examples/extended/set_num_gpu_threads.cpp b/examples/extended/set_num_gpu_threads.cpp
new file mode 100644
index 000000000..c298d736d
--- /dev/null
+++ b/examples/extended/set_num_gpu_threads.cpp
@@ -0,0 +1,91 @@
+/** @file
+ * 
+ * An example of using QuEST's experimental
+ * setQuESTNumGpuThreadsPerBlock() function
+ * to change the parallelisation granularity
+ * of GPU simulation
+ * 
+ * @author Tyson Jones
+ */
+
+#include "quest.h"
+#include <iostream>
+#include <chrono>
+
+
+const int NUM_REPS = 10;
+const int NUM_QUBITS = 25;  // 512 MiB (at double precision)
+
+
+void simulation(Qureg qureg)
+{
+    // put your favourite QuEST simulation here
+    initRandomPureState(qureg);
+    applyFullQuantumFourierTransform(qureg, /*inverse=*/false);
+    calcTotalProb(qureg);
+}
+
+
+void benchmark(Qureg qureg, int numThreadsPerBlock)
+{
+    std::cout << "Using " << numThreadsPerBlock << " threads per block... " << std::flush;
+
+    setQuESTNumGpuThreadsPerBlock(numThreadsPerBlock);
+
+    // warmup
+    for (int r=0; r<NUM_REPS; r++)
+        simulation(qureg);
+    syncQuESTEnv();
+
+    using clock = std::chrono::steady_clock;
+    auto start = clock::now();
+
+    for (int r=0; r<NUM_REPS; r++)
+        simulation(qureg);
+    syncQuESTEnv();
+
+    auto end = clock::now();
+    auto dur = std::chrono::duration<double>(end - start).count();
+    auto av  = dur / NUM_REPS;
+
+    std::cout << " took " << av << "s" << std::endl;
+}
+
+
+int main()
+{
+    initQuESTEnv();
+
+    // This example is pointless without a GPU!
+    if (!getQuESTEnv().isGpuAccelerated) {
+        std::cout 
+            << "GPU acceleration is not enabled, and so changing the number "
+            << "of threads per block has no effect. Exiting..."
+            << std::endl;
+        finalizeQuESTEnv();
+        return 0;
+    }
+
+    // The initial number of threads per block is informed by the optional environment
+    // variable QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK. If not specified, QuEST will
+    // use the value of the CMake option of the same name passed during compilation,
+    // which itself will has a default of 128
+    auto initNumTPB = getQuESTNumGpuThreadsPerBlock();
+    std::cout << "Initial numThreadsPerBlock: " << initNumTPB << "\n\n";
+
+    // Create a statevector parallelised only by the GPU
+    Qureg qureg = createCustomQureg(NUM_QUBITS, 0, 0, 1, 0);
+    reportQuregParams(qureg);
+
+    // Benchmark QuEST with sensible numbers of threads per block (multiples of warp size)
+    for (auto numTPB : {64, 128, 256, 512, 1024})
+        benchmark(qureg, numTPB);
+
+    // Try silly parameters ¯\_(ツ)_/¯
+    setQuESTValidationOff();
+    for (auto numTPB : {31, 15, 5, 1})
+        benchmark(qureg, numTPB);
+    
+    finalizeQuESTEnv();
+    return 0;
+}
diff --git a/quest/include/config.h.in b/quest/include/config.h.in
index 3f301c250..1bb8a0470 100644
--- a/quest/include/config.h.in
+++ b/quest/include/config.h.in
@@ -83,14 +83,15 @@
 #cmakedefine01 QUEST_COMPILE_SUBCOMM
 #cmakedefine01 QUEST_COMPILE_CUDA
 #cmakedefine01 QUEST_COMPILE_CUQUANTUM
+#cmakedefine01 QUEST_COMPILE_HIP
 
 
 // crucial to QuEST source (informs optional NUMA usage)
 #cmakedefine01 QUEST_ENABLE_NUMA
 
 
-// not consulted by src (included for book-keeping)
-#cmakedefine01 QUEST_COMPILE_HIP
+// default parameters which may have been tuned for performance when building the library
+#cmakedefine QUEST_UNSPECIFIED_DEFAULT_NUM_GPU_THREADS_PER_BLOCK @QUEST_UNSPECIFIED_DEFAULT_NUM_GPU_THREADS_PER_BLOCK@
 
 
 
diff --git a/quest/include/experimental.h b/quest/include/experimental.h
index 2fabdc34f..8c2cc4e0a 100644
--- a/quest/include/experimental.h
+++ b/quest/include/experimental.h
@@ -44,7 +44,6 @@ void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, in
 
 
 #if QUEST_COMPILE_SUBCOMM
-
 /** @notyetdoced
  * 
  *  Advanced initialiser which allows the user to provide an MPI communicator for QuEST to use.
@@ -61,10 +60,46 @@ void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, in
  * @author Oliver Brown
  */
 void initCustomMpiCommQuESTEnv(MPI_Comm questComm, int useGpuAccel, int useMultithread);
-
 #endif // QUEST_COMPILE_SUBCOMM
 
 
+/** @notyetdoced
+ * 
+ * @author Oliver Brown
+ */
+int getQuESTNumGpuThreadsPerBlock();
+
+
+/** Overrides the number of CUDA threads per block (or @p blockDim) used by QuEST's GPU-accelerated backend.
+ * 
+ * This changes the GPU parallelisation granularity and can affect performance, and is useful
+ * for performance tuning or diagnostics. Before this function is called, QuEST will use the
+ * number as specified by the environment variable @p QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK,
+ * if defined. Otherwise, it will use the value specified by the CMake/compile option of the
+ * same name, which itself presently defaults to @p 128. After this function is called, QuEST
+ * will adopt @p numThreadsPerBlock for the remainder of execution, or until this function is
+ * called again.
+ * 
+ * Practical values of @p numThreadsPerBlock can vary with the simulation size, the user's GPU hardware,
+ * and whether it is NVIDIA or AMD, which have respective warp sizes of @p 32 and @p 64.
+ * 
+ * @note
+ * This function has no effect when QuEST is not deployed with GPU-acceleration enabled.
+ *
+ * @param[in] numThreadsPerBlock the new block size.
+ * @throws @validationerror
+ * - if the @p QuESTEnv is not initialised.
+ * - if @p numThreadsPerBlock is negative.
+ * - if @p numThreadsPerBlock is not a multiple of the GPU warp size.
+ * - if @p numThreadsPerBlock exceeds the maximum @p blockDim imposed by the GPU hardware.
+ * @see
+ * - QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK
+ * @author Oliver Brown
+ * @author Tyson Jones
+ */
+void setQuESTNumGpuThreadsPerBlock(int numThreadsPerBlock);
+
+
 // end de-mangler
 #ifdef __cplusplus
 }
diff --git a/quest/include/modes.h b/quest/include/modes.h
index 285b1cb5d..25ad8bb54 100644
--- a/quest/include/modes.h
+++ b/quest/include/modes.h
@@ -43,6 +43,10 @@
      *  - forbid sharing: @p 0, @p '0', @p '', @p , (unspecified)
      *  - permit sharing: @p 1, @p '1'
      * 
+     * @constraints
+     * The function initQuESTEnv() will throw a validation error if any of the below are not satisfied.
+     *   - The specified string does not evaluate to an integer @p 0 or @p 1.
+     * 
      * @author Tyson Jones
      */
     const int QUEST_PERMIT_NODES_TO_SHARE_GPU = 0;
@@ -68,7 +72,7 @@
      *    default validation epsilon.
      * 
      * @constraints
-     * The function initQuESTEnv() will throw a validation error if:
+     * The function initQuESTEnv() will throw a validation error if any of the below are not satisfied.
      *   - The specified epsilon must be `0` or positive.
      *   - The specified epsilon must not exceed that maximum or minimum value which can be stored
      *     in a `qreal`, which is specific to its precision.
@@ -78,6 +82,40 @@
     const qreal QUEST_DEFAULT_VALIDATION_EPSILON = 0;
 
 
+    /** @envvardoc
+     * 
+     * Specifies the default number of threads per block (or "block dimension") used by GPU acceleration. 
+     * 
+     * The number of dispatched CUDA threads per block controls the parallelisation granularity of
+     * QuEST's GPU backend, affecting performance.
+     * Specifying `QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK` to a valid, positive integer overrides
+     * QuEST's default otherwise set during compilation via a CMake option of the same name. If 
+     * that CMake option was not set, the default is assumed to be @p 128.
+     * 
+     * The number specified by this environment variable will be used as the block dimension by all of
+     * QuEST's GPU backend functions, unless overridden at runtime via setQuESTNumGpuThreadsPerBlock().
+     * The actual number of threads per block used at any time can be queried via 
+     * getQuESTNumGpuThreadsPerBlock(), or reported by reportQuESTEnv().
+     * 
+     * @envvarvalues
+     *  - use internal default of `128`: @p '', @p , (unspecified)
+     *  - use number `x`: @p x, @p 'x', @p '+x'
+     * 
+     * @constraints
+     * The function initQuESTEnv() will throw a validation error if any of the below are not satisfied.
+     *   - The specified number must be a positive integer.
+     *   - The specified number must not exceed the minimum or maximum value which can be stored in an @p int.
+     *   - The specified number must be divisible by the GPU warp size, which is 32 or 64, depending on
+     *     whether deployed to an NVIDIA or AMD GPU. This restriction is imposed even when QuEST is not
+     *     deployed with GPU-acceleration.
+     *   - The specified number exceeds the maximum imposed by the available GPU hardware.
+     * 
+     * @author Oliver Brown
+     * @author Tyson Jones
+     */
+    const qreal QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK = 0;
+
+
 #endif
 
 
diff --git a/quest/include/precision.h b/quest/include/precision.h
index 2c89545f7..7b932e678 100644
--- a/quest/include/precision.h
+++ b/quest/include/precision.h
@@ -126,13 +126,13 @@
  */
 
 #if QUEST_FLOAT_PRECISION == 1
-    #define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-5
+    #define QUEST_UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-5
 
 #elif QUEST_FLOAT_PRECISION == 2
-    #define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-12
+    #define QUEST_UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-12
 
 #elif QUEST_FLOAT_PRECISION == 4
-    #define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-13
+    #define QUEST_UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-13
 
 #endif
 
diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index abf3127e8..c59334b55 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -79,7 +79,10 @@ void validateAndInitCustomQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuA
     validate_envNeverInit(global_envPtr != nullptr, global_hasEnvBeenFinalized, caller);
 
     // load env-vars before validating deployment mode, because some env vars can
-    // affect validation (such as QUEST_PERMIT_NODES_TO_SHARE_GPU)
+    // affect validation (such as QUEST_PERMIT_NODES_TO_SHARE_GPU). note that
+    // some env-vars (like QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK) will be here
+    // validated to have a correct format (like an int), but the validity of its
+    // actual value will be checked later (since it requires deciding GPU-accel).
     envvars_validateAndLoadEnvVars(caller);
     validateconfig_setEpsilonToDefault();
 
@@ -131,6 +134,11 @@ void validateAndInitCustomQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuA
     /// should we warn here if each machine contains
     /// more GPUs than deployed MPI-processes (some GPUs idle)?
 
+    // validate the initial numTPB env-var (if specified) is valid
+    int initNumThreadsPerBlock = envvars_getDefaultNumGpuThreadsPerBlock();
+    validate_numGpuThreadsPerBlock(initNumThreadsPerBlock, useGpuAccel, caller);
+    gpu_setNumThreadsPerBlock(initNumThreadsPerBlock);
+
     // cuQuantum is always used in GPU-accelerated envs when available
     bool useCuQuantum = useGpuAccel && gpu_isCuQuantumCompiled();
     if (useCuQuantum) {
@@ -157,7 +165,7 @@ void validateAndInitCustomQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuA
     global_envPtr->isGpuAccelerated    = useGpuAccel;
     global_envPtr->isDistributed       = useDistrib;
     global_envPtr->isMpiUserOwned      = userOwnsMpi;
-    global_envPtr->isMpiGpuAware        = isMpiGpuAware;
+    global_envPtr->isMpiGpuAware       = isMpiGpuAware;
     global_envPtr->isCuQuantumEnabled  = useCuQuantum;
     global_envPtr->isGpuSharingEnabled = permitGpuSharing;
 
@@ -200,10 +208,11 @@ void printCompilationInfo() {
 
     print_table(
         "compilation", {
+        {"isOmpCompiled",         cpu_isOpenmpCompiled()},
         {"isMpiCompiled",         comm_isMpiCompiled()},
         {"isMpiSubCommCompiled",  comm_isMpiSubCommCompiled()},
         {"isGpuCompiled",         gpu_isGpuCompiled()},
-        {"isOmpCompiled",         cpu_isOpenmpCompiled()},
+        {"isHipCompiled",         gpu_isHipCompiled()},
         {"isCuQuantumCompiled",   gpu_isCuQuantumCompiled()},
     });
 }
@@ -213,9 +222,9 @@ void printDeploymentInfo() {
 
     print_table(
         "deployment", {
+        {"isOmpEnabled",        global_envPtr->isMultithreaded},
         {"isMpiEnabled",        global_envPtr->isDistributed},
         {"isGpuEnabled",        global_envPtr->isGpuAccelerated},
-        {"isOmpEnabled",        global_envPtr->isMultithreaded},
         {"isCuQuantumEnabled",  global_envPtr->isCuQuantumEnabled},
     });
 }
@@ -265,6 +274,7 @@ void printGpuInfo() {
         {"gpuMemory",     isGpu?  printer_getMemoryWithUnitStr(gpu_getTotalMemoryInBytes())            + pg : na},
         {"gpuMemoryFree", isGpu?  printer_getMemoryWithUnitStr(gpu_getCurrentAvailableMemoryInBytes()) + pg : na},
         {"gpuCache",      isGpu?  printer_getMemoryWithUnitStr(gpu_getCacheMemoryInBytes())            + pg : na},
+        {"numThreadsPerBlock", isGpu? printer_toStr(gpu_getNumThreadsPerBlock()) : na},
     });
 }
 
diff --git a/quest/src/api/experimental.cpp b/quest/src/api/experimental.cpp
index 1ad6fdb42..a6f883656 100644
--- a/quest/src/api/experimental.cpp
+++ b/quest/src/api/experimental.cpp
@@ -13,6 +13,7 @@
 
 #include "quest/src/core/validation.hpp"
 #include "quest/src/comm/comm_config.hpp"
+#include "quest/src/gpu/gpu_config.hpp"
 
 #if QUEST_COMPILE_SUBCOMM && ! QUEST_COMPILE_MPI
     #error "Macro QUEST_COMPILE_SUBCOMM was true, but QUEST_COMPILE_MPI was illegally false."
@@ -59,7 +60,7 @@ void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, in
 
 
 #if QUEST_COMPILE_SUBCOMM // hide MPI_Comm
-
+ 
 void initCustomMpiCommQuESTEnv(MPI_Comm userQuestComm, int useGpuAccel, int useMultithread) {
 
     // useDistrib and userOwnsMpi are implied by the user of this initialiser
@@ -81,9 +82,26 @@ void initCustomMpiCommQuESTEnv(MPI_Comm userQuestComm, int useGpuAccel, int useM
     // perform remaining validation (some is harmlessly repeated) and init QuEST env
     validateAndInitCustomQuESTEnv(useDistrib, userOwnsMpi, useGpuAccel, useMultithread, __func__);
 }
-
 #endif // QUEST_COMPILE_SUBCOMM
 
 
+int getQuESTNumGpuThreadsPerBlock() {
+    validate_envIsInit(__func__);
+    
+    return gpu_getNumThreadsPerBlock();
+}
+
+
+void setQuESTNumGpuThreadsPerBlock(int numTPB) {
+    validate_envIsInit(__func__);
+
+    // validation messages and queries depend upon GPU usage
+    bool gpuIsActive = getQuESTEnv().isGpuAccelerated;
+    validate_numGpuThreadsPerBlock(numTPB, gpuIsActive, __func__);
+
+    gpu_setNumThreadsPerBlock(numTPB);
+}
+
+
 // end de-mangler
 }
diff --git a/quest/src/comm/comm_config.cpp b/quest/src/comm/comm_config.cpp
index 5c59477ca..4b76ca71e 100644
--- a/quest/src/comm/comm_config.cpp
+++ b/quest/src/comm/comm_config.cpp
@@ -209,6 +209,14 @@ bool comm_isMpiInit() {
 }
 
 
+bool comm_isMpiUserOwned() {
+
+    // this isn't presently used by the code base; I'm just naughtily silencing
+    // "unused var" warning when compiling without MPI :^)
+    return global_isMpiUserOwned;
+}
+
+
 
 /*
  * QUEST COMMUNICATION MANAGEMENT
diff --git a/quest/src/comm/comm_config.hpp b/quest/src/comm/comm_config.hpp
index 826ebdf1c..cc009ab9a 100644
--- a/quest/src/comm/comm_config.hpp
+++ b/quest/src/comm/comm_config.hpp
@@ -17,6 +17,7 @@ bool comm_isMpiCompiled();
 bool comm_isMpiSubCommCompiled();
 bool comm_isMpiGpuAware();
 bool comm_isMpiInit();
+bool comm_isMpiUserOwned();
 
 // control of QuEST's (possibly more limited) MPI env
 bool comm_isActive();
diff --git a/quest/src/core/envvars.cpp b/quest/src/core/envvars.cpp
index bd9f87b6f..c1d3e81ed 100644
--- a/quest/src/core/envvars.cpp
+++ b/quest/src/core/envvars.cpp
@@ -6,12 +6,14 @@
  * @author Tyson Jones
  */
 
+#include "quest/include/config.h"
 #include "quest/include/precision.h"
 #include "quest/include/types.h"
 
 #include "quest/src/core/errors.hpp"
 #include "quest/src/core/parser.hpp"
 #include "quest/src/core/validation.hpp"
+#include "quest/src/gpu/gpu_config.hpp"
 
 #include <string>
 #include <cstdlib>
@@ -26,8 +28,9 @@ using std::string;
 
 
 namespace envvar_names {
-    string QUEST_PERMIT_NODES_TO_SHARE_GPU = "QUEST_PERMIT_NODES_TO_SHARE_GPU";
-    string QUEST_DEFAULT_VALIDATION_EPSILON = "QUEST_DEFAULT_VALIDATION_EPSILON";
+    string QUEST_PERMIT_NODES_TO_SHARE_GPU         = "QUEST_PERMIT_NODES_TO_SHARE_GPU";
+    string QUEST_DEFAULT_VALIDATION_EPSILON        = "QUEST_DEFAULT_VALIDATION_EPSILON";
+    string QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK = "QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK";
 }
 
 
@@ -45,7 +48,11 @@ namespace envvar_values {
 
     // by default, the initial validation epsilon (before being overriden
     // by users at runtime) should depend on qreal (i.e. FLOAT_PRECISION)
-    qreal QUEST_DEFAULT_VALIDATION_EPSILON = UNSPECIFIED_DEFAULT_VALIDATION_EPSILON;
+    qreal QUEST_DEFAULT_VALIDATION_EPSILON = QUEST_UNSPECIFIED_DEFAULT_VALIDATION_EPSILON;
+
+    // by default, the initial number of GPU threads per block is informed by
+    // the below cmake variable (before being overridden by env-var or at runtime)
+    int QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK = QUEST_UNSPECIFIED_DEFAULT_NUM_GPU_THREADS_PER_BLOCK;
 }
 
 
@@ -123,6 +130,21 @@ void validateAndSetDefaultValidationEpsilon(const char* caller) {
 }
 
 
+void validateAndSetDefaultNumGpuThreadsPerBlock(const char* caller) {
+
+    // permit unspecified, falling back to the hardcoded default
+    string name = envvar_names::QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK;
+    if (!isEnvVarSpecified(name))
+        return;
+
+    string value = getSpecifiedEnvVarValue(name);
+    validate_envVarDefaultNumGpuThreadsPerBlockIsAnInt(value, caller);
+
+    // overwrite default env-var value
+    envvar_values::QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK = parser_parseInteger(value);
+}
+
+
 
 /*
  * PUBLIC
@@ -138,6 +160,7 @@ void envvars_validateAndLoadEnvVars(const char* caller) {
     // load all env-vars
     validateAndSetWhetherGpuSharingIsPermitted(caller);
     validateAndSetDefaultValidationEpsilon(caller);
+    validateAndSetDefaultNumGpuThreadsPerBlock(caller);
 
     // ensure no re-loading
     global_areEnvVarsLoaded = true;
@@ -156,3 +179,10 @@ qreal envvars_getDefaultValidationEpsilon() {
 
     return envvar_values::QUEST_DEFAULT_VALIDATION_EPSILON;
 }
+
+
+int envvars_getDefaultNumGpuThreadsPerBlock() {
+    assertEnvVarsAreLoaded();
+
+    return envvar_values::QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK;
+}
diff --git a/quest/src/core/envvars.hpp b/quest/src/core/envvars.hpp
index 555e76f15..4862e8d08 100644
--- a/quest/src/core/envvars.hpp
+++ b/quest/src/core/envvars.hpp
@@ -15,6 +15,7 @@
 namespace envvar_names { 
     extern std::string QUEST_PERMIT_NODES_TO_SHARE_GPU;
     extern std::string QUEST_DEFAULT_VALIDATION_EPSILON;
+    extern std::string QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK;
 }
 
 
@@ -33,5 +34,7 @@ bool envvars_getWhetherGpuSharingIsPermitted();
 
 qreal envvars_getDefaultValidationEpsilon();
 
+int envvars_getDefaultNumGpuThreadsPerBlock();
+
 
 #endif // ENVVARS_HPP
diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
index 8879fc7a1..807cad105 100644
--- a/quest/src/core/errors.cpp
+++ b/quest/src/core/errors.cpp
@@ -650,6 +650,11 @@ void error_gpuUnexpectedlyInaccessible() {
     raiseInternalError("A function internally assumed (as a precondition) that QuEST was compiled with GPU-acceleration enabled, and that one was physically accessible, though this was untrue.");
 }
 
+void error_gpuNumThreadsPerBlockNotSet() {
+
+    raiseInternalError("A function queried the GPU numThreadsPerBlock before it had been set (intendedly by QuESTEnv initialisation).");
+}
+
 void error_gpuMemSyncQueriedButEnvNotGpuAccelerated() {
 
     raiseInternalError("A function checked whether persistent GPU memory (such as in a CompMatr) had been synchronised, but the QuEST environment is not GPU accelerated.");  
@@ -884,6 +889,16 @@ void error_attemptedToParseRealFromInvalidString() {
     raiseInternalError("A function attempted to parse a string to a qreal but the string was not validly formatted. This should have been caught by prior user validation.");
 }
 
+void error_attemptedToParseIntegerFromInvalidString() {
+
+    raiseInternalError("A function attempted to parse a string to an int but the string was not validly formatted. This should have been caught by prior user validation.");
+}
+
+void error_attemptedToParseOutOfRangeInteger() {
+
+    raiseInternalError("A function attempted to parse a string to an integer but the numerical value of the string literal exceeded the range of the integer. This should have been caught by prior validation.");
+}
+
 void error_attemptedToParseOutOfRangeReal() {
 
     raiseInternalError("A function attempted to parse a string to a qreal but the numerical value of the string literal exceeded the range of the qreal. This should have been caught by prior user validation.");
diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp
index 33cc182c7..f91f890b0 100644
--- a/quest/src/core/errors.hpp
+++ b/quest/src/core/errors.hpp
@@ -249,12 +249,16 @@ void error_gpuCopyButMatrixNotGpuAccelerated();
 
 void error_gpuMemSyncQueriedButEnvNotGpuAccelerated();
 
+void error_gpuNumThreadsPerBlockNotSet();
+
 void error_gpuUnexpectedlyInaccessible();
 
 void error_gpuDeadCopyMatrixFunctionCalled();
 
 void error_gpuDenseMatrixConjugatedAndTransposed();
 
+void error_gpuBadNumThreadsPerBlock();
+
 void assert_gpuIsAccessible();
 
 void assert_gpuHasBeenBound(bool isBound);
@@ -365,6 +369,10 @@ void error_attemptedToParseComplexFromInvalidString();
 
 void error_attemptedToParseRealFromInvalidString();
 
+void error_attemptedToParseIntegerFromInvalidString();
+
+void error_attemptedToParseOutOfRangeInteger();
+
 void error_attemptedToParseOutOfRangeReal();
 
 void error_attemptedToParsePauliStringFromInvalidString();
diff --git a/quest/src/core/parser.cpp b/quest/src/core/parser.cpp
index 140d77745..9d9194a3f 100644
--- a/quest/src/core/parser.cpp
+++ b/quest/src/core/parser.cpp
@@ -82,6 +82,9 @@ namespace patterns {
     // full complex; any format, importantly in order of decreasing specificity. do not consult for captured groups
     string num = group(comp) + "|" + group(imag) + "|" + group(real);
 
+    // full signed integer
+    string signedInt = optSign + "[0-9]+";
+
     // no capturing because 'num' pollutes captured groups, and pauli syntax overlaps real integers
     string pauli = "[" + parser_RECOGNISED_PAULI_CHARS + "]";
     string paulis = group(optSpace + pauli + optSpace) + "+";
@@ -96,6 +99,7 @@ namespace regexes {
     regex imag(patterns::imag);
     regex comp(patterns::comp);
     regex num(patterns::num);
+    regex signedInt(patterns::signedInt);
     regex paulis(patterns::paulis);
     regex weightedPaulis(patterns::weightedPaulis);
 }
@@ -173,6 +177,63 @@ int getNumPaulisInLine(string line) {
 
 
 
+/*
+ * INTEGER PARSING
+ */
+
+
+bool parser_isAnySizedInteger(string str) {
+
+    smatch match;
+    return regex_match(str, match, regexes::signedInt);
+}
+
+
+bool parser_isValidInteger(string str) {
+
+    // reject str if it doesn't match regex
+    if (!parser_isAnySizedInteger(str))
+        return false;
+
+    // remove whitespace which stoi() below cannot handle after the sign
+    removeWhiteSpace(str);
+
+    // check number is in-range of int via duck-typing
+    try {
+        std::stoi(str);
+    } catch (const out_of_range&) {
+        return false;
+
+    // error if our regex permitted an unparsable string
+    } catch (const invalid_argument&) {
+        error_attemptedToParseIntegerFromInvalidString();
+    }
+
+    return true;
+}
+
+
+int parser_parseInteger(string str) {
+
+    if (!parser_isValidInteger(str))
+        error_attemptedToParseIntegerFromInvalidString();
+
+    removeWhiteSpace(str); // stoi can't handle
+
+    try {
+        return std::stoi(str);
+    } catch (const invalid_argument&) {
+        error_attemptedToParseIntegerFromInvalidString();
+    } catch (const out_of_range&) {
+        error_attemptedToParseOutOfRangeInteger();
+    }
+
+    // unreachable
+    return -1;
+}
+
+
+
 /*
  * REAL NUMBER PARSING
  */
diff --git a/quest/src/core/parser.hpp b/quest/src/core/parser.hpp
index 4a9df2d02..3d34588ae 100644
--- a/quest/src/core/parser.hpp
+++ b/quest/src/core/parser.hpp
@@ -20,12 +20,16 @@ using std::string;
  * PARSING NUMBERS
  */
 
+bool parser_isAnySizedInteger(string str);
+bool parser_isValidInteger(string str);
+
 bool parser_isAnySizedReal(string str);
 bool parser_isAnySizedComplex(string str);
 
 bool parser_isValidReal(string str);
 bool parser_isValidComplex(string str);
 
+int parser_parseInteger(string str);
 qreal parser_parseReal(string str);
 qcomp parser_parseComplex(string str);
 
diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp
index c727ad1c5..30119f2dd 100644
--- a/quest/src/core/validation.cpp
+++ b/quest/src/core/validation.cpp
@@ -159,6 +159,31 @@ namespace report {
     string INVALID_REPORTED_PAULI_STR_STYLE_FLAG =
         "Given an unrecognised style flag (${FLAG}). Legal flags are 0 and 1.";
 
+    // substrings re-used below
+    string _invalid_num_tpb_prefix =
+        "An invalid number of GPU threads per block (${NUM_TPB}) was passed, or specified via environment variable " + envvar_names::QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK + ", or compiled into the QuEST library through the CMake option of the same name.";
+    string _num_tpb_warp_indivisible_infix =
+        "The specified number does not divide evenly into the warp size of ${CUDA_WARP_SIZE} (NVIDIA GPUs) or ${HIP_WARP_SIZE} (AMD GPUs).";
+    string _num_tpb_warp_negative_infix =
+        "The specified number must be positive.";
+    string _num_tpb_ineffectual_suffix =
+        "Note GPU acceleration is not active so this parameter has no effect anyway.";
+
+    string GPU_NUM_THREADS_PER_BLOCK_IS_NOT_POSITIVE =
+        _invalid_num_tpb_prefix + " " + _num_tpb_warp_negative_infix;
+
+    string GPU_NUM_THREADS_PER_BLOCK_IS_NOT_POSITIVE_BUT_GPU_NOT_ACTIVE_ANYWAY =
+        _invalid_num_tpb_prefix + " " + _num_tpb_warp_negative_infix + " " + _num_tpb_ineffectual_suffix;
+
+    string GPU_NUM_THREADS_PER_BLOCK_IS_NOT_WARP_DIVISIBLE =
+        _invalid_num_tpb_prefix + " " + _num_tpb_warp_indivisible_infix;
+
+    string GPU_NUM_THREADS_PER_BLOCK_IS_NOT_WARP_DIVISIBLE_BUT_GPU_NOT_AVAILABLE_ANYWAY =
+        _invalid_num_tpb_prefix + " " + _num_tpb_warp_indivisible_infix + " " + _num_tpb_ineffectual_suffix;
+
+    string GPU_NUM_THREADS_PER_BLOCK_EXCEEDS_HARDWARE_MAX =
+        _invalid_num_tpb_prefix + " Exceeds the hardware-imposed maximum of ${MAX_TPB}.";
+
 
     /*
      * QUREG CREATION
@@ -1147,6 +1172,13 @@ namespace report {
 
     string DEFAULT_EPSILON_ENV_VAR_IS_NEGATIVE =
         "The optional '" + envvar_names::QUEST_DEFAULT_VALIDATION_EPSILON + "' environment variable was negative. The value must be zero or positive.";
+
+    string DEFAULT_NUM_GPU_THREADS_PER_BLOCK_ENV_VAR_NOT_AN_INT =
+        "The optional '" + envvar_names::QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK + "' environment variable was not a recognisable integer.";
+
+    string DEFAULT_NUM_GPU_THREADS_PER_BLOCK_ENV_VAR_EXCEEDS_INT_RANGE =
+        "The optional '" + envvar_names::QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK + "' environment variable was larger (in magnitude) than the maximum value which can be stored in an integer.";
+
 }
 
 
@@ -1647,6 +1679,49 @@ void validate_reportedPauliStrStyleFlag(int flag, const char* caller) {
     assertThat(flag==0 || flag==1, report::INVALID_REPORTED_PAULI_STR_STYLE_FLAG, {{"${FLAG}",flag}}, caller);
 }
 
+void validate_numGpuThreadsPerBlock(int numTPB, bool isGpuActive, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    // var 'isGpuActive' indicates that the GPU backend is compiled, a physical
+    // GPU is available, AND that the QuESTEnv has GPU-acceleration enabled, i.e.
+    // isGPuActive = gpu_isGpuCompiled() && gpu_isGpuAvailable() && env.isGpuAccelerated,
+    // though is established before QuESTEnv initialisation has completed.
+
+    // validate numTPB > 0 with an error message that points out TPB may be redundant
+    tokenSubs vars = {{"${NUM_TPB}", numTPB}};
+    auto errorMsg = isGpuActive? 
+        report::GPU_NUM_THREADS_PER_BLOCK_IS_NOT_POSITIVE :
+        report::GPU_NUM_THREADS_PER_BLOCK_IS_NOT_POSITIVE_BUT_GPU_NOT_ACTIVE_ANYWAY;
+    assertThat(numTPB > 0, errorMsg, vars, caller);
+
+    // prepare to validate TPB is warp-divisible, again pointing out redundancy...
+    vars["${CUDA_WARP_SIZE}"] = gpu_CUDA_WARP_SIZE;
+    vars["${HIP_WARP_SIZE}"] = gpu_HIP_WARP_SIZE;
+    errorMsg = isGpuActive? 
+        report::GPU_NUM_THREADS_PER_BLOCK_IS_NOT_WARP_DIVISIBLE :
+        report::GPU_NUM_THREADS_PER_BLOCK_IS_NOT_WARP_DIVISIBLE_BUT_GPU_NOT_AVAILABLE_ANYWAY;
+
+    // ... but note that when the GPU backend isn't compiled, we don't know whether the
+    // user has an NVIDIA or AMD GPU, which have distinct warps of 32 (CUDA) and 64 (HIP),
+    // and so choose the smaller divisor (32,CUDA), ergo potentially permitting warp TPB
+    // that are incompatible with HIP. An extremely unimportant subtlety!
+    static_assert(gpu_HIP_WARP_SIZE >= gpu_CUDA_WARP_SIZE);
+    int warpSize = gpu_isHipCompiled()? gpu_HIP_WARP_SIZE : gpu_CUDA_WARP_SIZE;
+    assertThat(numTPB % warpSize == 0, errorMsg, vars, caller);
+
+    // the final check of max numTBP requires querying the hardware device, which obviously
+    // isn't possible if not available (and is pointless if available but we're not using!)
+    if (!isGpuActive)
+        return;
+
+    // otherwise, we verify numTPB doesn't exceed the hardware-declared maximum
+    auto maxNumTPB = gpu_getMaxNumThreadsPerBlock();
+    vars = {{"${NUM_TPB}", numTPB}, {"${MAX_TPB}", maxNumTPB}};
+    assertThat(numTPB <= maxNumTPB, report::GPU_NUM_THREADS_PER_BLOCK_EXCEEDS_HARDWARE_MAX, vars, caller);
+}
+
 
 
 /*
@@ -4991,6 +5066,9 @@ void validate_tempAllocSucceeded(bool succeeded, size_t numBytes, const char* ca
 
 void validate_envVarPermitNodesToShareGpu(string varValue, const char* caller) {
 
+    // this presently does absolutely nothing; environment variables are
+    // loaded during QuESTEnv initialisation, before which there is no
+    // way to disable validation... but we keep for clarity/consistency!
     if (!global_isValidationEnabled)
         return;
 
@@ -5002,6 +5080,9 @@ void validate_envVarPermitNodesToShareGpu(string varValue, const char* caller) {
 
 void validate_envVarDefaultValidationEpsilon(string varValue, const char* caller) {
 
+    // this presently does absolutely nothing; environment variables are
+    // loaded during QuESTEnv initialisation, before which there is no
+    // way to disable validation... but we keep for clarity/consistency!
     if (!global_isValidationEnabled)
         return;
 
@@ -5011,3 +5092,17 @@ void validate_envVarDefaultValidationEpsilon(string varValue, const char* caller
     qreal eps = parser_parseReal(varValue);
     assertThat(eps >= 0, report::DEFAULT_EPSILON_ENV_VAR_IS_NEGATIVE, caller);
 }
+
+void validate_envVarDefaultNumGpuThreadsPerBlockIsAnInt(string varValue, const char* caller) {
+
+    // this presently does absolutely nothing; environment variables are
+    // loaded during QuESTEnv initialisation, before which there is no
+    // way to disable validation... but we keep for clarity/consistency!
+    if (!global_isValidationEnabled)
+        return;
+
+    // we here only validate that the value is a valid signed integer;
+    // validation of its GPU-compatibility is performed by another func
+    assertThat(parser_isAnySizedInteger(varValue), report::DEFAULT_NUM_GPU_THREADS_PER_BLOCK_ENV_VAR_NOT_AN_INT, caller);
+    assertThat(parser_isValidInteger(varValue), report::DEFAULT_NUM_GPU_THREADS_PER_BLOCK_ENV_VAR_EXCEEDS_INT_RANGE, caller);
+}
diff --git a/quest/src/core/validation.hpp b/quest/src/core/validation.hpp
index 787316326..58a0b632f 100644
--- a/quest/src/core/validation.hpp
+++ b/quest/src/core/validation.hpp
@@ -113,6 +113,8 @@ void validate_numPauliChars(const char* paulis, const char* caller);
 
 void validate_reportedPauliStrStyleFlag(int flag, const char* caller);
 
+void validate_numGpuThreadsPerBlock(int numTBP, bool isGpuActive, const char* caller);
+
 
 
 /*
@@ -554,6 +556,8 @@ void validate_envVarPermitNodesToShareGpu(string varValue, const char* caller);
 
 void validate_envVarDefaultValidationEpsilon(string varValue, const char* caller);
 
+void validate_envVarDefaultNumGpuThreadsPerBlockIsAnInt(string varValue, const char* caller);
+
 
 
 #endif // VALIDATION_HPP
\ No newline at end of file
diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp
index 9666040b5..bd51236bb 100644
--- a/quest/src/cpu/cpu_config.cpp
+++ b/quest/src/cpu/cpu_config.cpp
@@ -79,9 +79,7 @@ int cpu_getAvailableNumThreads() {
 #if QUEST_COMPILE_OMP
     int n = -1;
 
-    #pragma omp parallel shared(n)
-    #pragma omp single
-    n = omp_get_num_threads();
+    n = omp_get_max_threads();
 
     return n;
 #else
diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp
index 4e03217e5..001cc62c0 100644
--- a/quest/src/gpu/gpu_config.cpp
+++ b/quest/src/gpu/gpu_config.cpp
@@ -174,6 +174,11 @@ bool gpu_isCuQuantumCompiled() {
 }
 
 
+bool gpu_isHipCompiled() {
+    return (bool) (QUEST_COMPILE_CUDA && QUEST_COMPILE_HIP);
+}
+
+
 int gpu_getNumberOfLocalGpus() {
 #if QUEST_COMPILE_CUDA
 
@@ -331,6 +336,40 @@ qindex gpu_getMaxNumConcurrentThreads() {
  */
 
 
+// the default numTPB is not known until runtime since the initial value
+// (provided either by the CMake var, or the environment variable) must
+// be validated during QuEST initialisation.
+static int global_numThreadsPerBlock = -1;
+
+
+int gpu_getNumThreadsPerBlock() {
+    if (global_numThreadsPerBlock == -1)
+        error_gpuNumThreadsPerBlockNotSet();
+
+    return global_numThreadsPerBlock;
+}
+
+
+void gpu_setNumThreadsPerBlock(int newNumTPB) {
+
+    global_numThreadsPerBlock = newNumTPB;
+}
+
+
+int gpu_getMaxNumThreadsPerBlock() {
+#if QUEST_COMPILE_CUDA
+
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, getBoundGpuId());
+    return prop.maxThreadsPerBlock; // HIP compatible
+
+#else
+    error_gpuQueriedButGpuNotCompiled();
+    return -1;
+#endif
+}
+
+
 std::array<char,17> getBoundGpuUuid() {
 #if QUEST_COMPILE_CUDA
     assert_gpuHasBeenBound(hasGpuBeenBound);
diff --git a/quest/src/gpu/gpu_config.hpp b/quest/src/gpu/gpu_config.hpp
index 9a2565544..98cb9c8a3 100644
--- a/quest/src/gpu/gpu_config.hpp
+++ b/quest/src/gpu/gpu_config.hpp
@@ -20,6 +20,15 @@
 
 
 
+/*
+ * CONSTANTS
+ */
+
+constexpr int gpu_CUDA_WARP_SIZE = 32;
+constexpr int gpu_HIP_WARP_SIZE = 64;
+
+
+
 /*
  * CUDA ERROR HANDLING
  */
@@ -43,6 +52,8 @@ bool gpu_isGpuCompiled();
 
 bool gpu_isCuQuantumCompiled();
 
+bool gpu_isHipCompiled();
+
 bool gpu_isGpuAvailable();
 
 bool gpu_isDirectGpuCommPossible();
@@ -65,6 +76,12 @@ qindex gpu_getMaxNumConcurrentThreads();
  * ENVIRONMENT MANAGEMENT
  */
 
+int gpu_getNumThreadsPerBlock();
+
+void gpu_setNumThreadsPerBlock(int newThreadsPerBlock);
+
+int gpu_getMaxNumThreadsPerBlock();
+
 void gpu_bindLocalGPUsToNodes();
 
 bool gpu_areAnyNodesBoundToSameGpu();
@@ -76,7 +93,6 @@ void gpu_initCuQuantum();
 void gpu_finalizeCuQuantum();
 
 
-
 /*
  * MEMORY MANAGEMENT
  */
@@ -122,4 +138,4 @@ size_t gpu_getCacheMemoryInBytes();
 
 
 
-#endif // GPU_CONFIG_HPP
\ No newline at end of file
+#endif // GPU_CONFIG_HPP
diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh
index 1578ce200..b6954f701 100644
--- a/quest/src/gpu/gpu_kernels.cuh
+++ b/quest/src/gpu/gpu_kernels.cuh
@@ -42,23 +42,19 @@
  * THREAD MANAGEMENT
  */
 
-
-const int NUM_THREADS_PER_BLOCK = 128;
-
-
 __forceinline__ __device__ qindex getThreadInd() {
     return blockIdx.x*blockDim.x + threadIdx.x;
 }
 
 
-__host__ qindex getNumBlocks(qindex numThreads) {
+__host__ qindex getNumBlocks(qindex numThreads, int numThreadsPerBlock) {
 
     /// @todo
     /// improve this with cudaOccupancyMaxPotentialBlockSize(),
     /// making it function specific
 
     // CUDA ceil
-    return ceil(numThreads / static_cast<qreal>(NUM_THREADS_PER_BLOCK));
+    return ceil(numThreads / static_cast<qreal>(numThreadsPerBlock));
 }
 
 
@@ -305,7 +301,11 @@ __global__ void kernel_statevec_anyCtrlFewTargDenseMatr(
     // must be strictly through compile-time-known indices, otherwise it will auto-
     // spill to local memory). Hence, this _subA() function is not a subroutine 
     // despite some logic being common to non-compile-time _subB(), and hence
-    // why the loops below are explicitly compile-time unrolled
+    // why the loops below are explicitly compile-time unrolled. Beware that when
+    // numThreadsPerBlock is increased from 128, this kernel will still behave
+    // correctly, but privateCache below will spill over into local memory at a
+    // performance penalty for NumTargs <= 5, with spillage occurring for fewer
+    // NumTargs as numThreadsPerBlock increases.
     REGISTER gpu_qcomp privateCache[1 << NumTargs];
 
     // we know NumTargs <= 5, though NumCtrls is permitted anything (including -1)
diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp
index 7567d3c46..9b8e819b5 100644
--- a/quest/src/gpu/gpu_subroutines.cpp
+++ b/quest/src/gpu/gpu_subroutines.cpp
@@ -66,7 +66,6 @@
 using std::vector;
 
 
-
 /*
  * GETTERS
  */
@@ -141,13 +140,14 @@ qindex gpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstLis
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(qubits.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex sendInd = getSubBufferSendInd(qureg);
 
     devints sortedQubits = getDevInts(util_getSorted(qubits));
     qindex qubitStateMask  = util_getBitMask(qubits, qubitStates);
 
-    kernel_statevec_packAmpsIntoBuffer <NumQubits> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_packAmpsIntoBuffer <NumQubits> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + sendInd, numThreads, 
         getPtr(sortedQubits), qubits.size(), qubitStateMask
     );
@@ -169,10 +169,11 @@ qindex gpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 8;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex sendInd = getSubBufferSendInd(qureg);
 
-    kernel_statevec_packPairSummedAmpsIntoBuffer <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_packPairSummedAmpsIntoBuffer <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + sendInd, numThreads, 
         qubit1, qubit2, qubit3, bit2
     );
@@ -208,12 +209,13 @@ void gpu_statevec_anyCtrlSwap_subA(Qureg qureg, ConstList64 ctrls, ConstList64 c
 #elif QUEST_COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(2 + ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints sortedQubits = getDevInts(util_getSorted(ctrls, {targ2, targ1}));
     qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ2, targ1}, {0, 1});
 
-    kernel_statevec_anyCtrlSwap_subA <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlSwap_subA <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads, 
         getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ1, targ2
     );
@@ -232,13 +234,14 @@ void gpu_statevec_anyCtrlSwap_subB(Qureg qureg, ConstList64 ctrls, ConstList64 c
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
     devints sortedCtrls = getDevInts(util_getSorted(ctrls));
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
 
-    kernel_statevec_anyCtrlSwap_subB <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlSwap_subB <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, 
         getPtr(sortedCtrls), ctrls.size(), ctrlStateMask
     );
@@ -257,13 +260,14 @@ void gpu_statevec_anyCtrlSwap_subC(Qureg qureg, ConstList64 ctrls, ConstList64 c
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(1 + ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
     devints sortedQubits = getDevInts(util_getSorted(ctrls, {targ}));
     qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {targState});
 
-    kernel_statevec_anyCtrlSwap_subC <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlSwap_subC <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, 
         getPtr(sortedQubits), ctrls.size(), qubitStateMask
     );
@@ -300,14 +304,15 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, ConstList64 ctrls, C
 #elif QUEST_COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 1);
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints sortedQubits = getDevInts(util_getSorted(ctrls, {targ}));
     qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0});
 
     auto [m00, m01, m10, m11] = getFlattenedGpuQcompMatrix<2>(matr.elems); // explicit template for MSVC, grr!
 
-    kernel_statevec_anyCtrlOneTargDenseMatr_subA <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlOneTargDenseMatr_subA <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads, 
         getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ, 
         m00, m01, m10, m11
@@ -327,13 +332,14 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, ConstList64 ctrls, C
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
     devints sortedCtrls = getDevInts(util_getSorted(ctrls));
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
 
-    kernel_statevec_anyCtrlOneTargDenseMatr_subB <NumCtrls> <<<numBlocks,NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlOneTargDenseMatr_subB <NumCtrls> <<<numBlocks,numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, 
         getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, 
         getGpuQcomp(fac0), getGpuQcomp(fac1)
@@ -370,7 +376,8 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, Co
 #elif QUEST_COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 2);
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints sortedQubits = getDevInts(util_getSorted(ctrls, {targ1,targ2}));
     qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1,targ2}, {0,0});
@@ -378,7 +385,7 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, Co
     // unpack matrix elems which are more efficiently accessed by kernels as args than shared mem (... maybe...)
     auto m = getFlattenedGpuQcompMatrix<4>(matr.elems); // explicit template for MSVC, grr!
 
-    kernel_statevec_anyCtrlTwoTargDenseMatr_sub <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlTwoTargDenseMatr_sub <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads, 
         getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ1, targ2,
         m[0], m[1], m[2],  m[3],  m[4],  m[5],  m[6],  m[7],
@@ -455,9 +462,12 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, Co
     if constexpr (NumTargs != -1) {
 
         // when NumTargs <= 5, each thread has a private array stored in the registers,
-        // enabling rapid IO. Given NUM_THREADS_PER_BLOCK = 128, the maximum size of 
-        // this array per-block is 16 * 128 * 2^5 B = 64 KiB which exceeds shared
-        // memory capacity, but does NOT exceed maximum register capacity.
+        // enabling rapid IO. When using the default numThreadsPerBlock = 128, the max
+        // size of this array per-block is 16 * 128 * 2^5 B = 64 KiB which exceeds shared
+        // memory capacity, but does NOT exceed maximum register capacity. When the user
+        // increases numThreadsPerBlock, the thread-private array in the below kernel
+        // will spill from registers into local memory, degrading performance, but
+        // behaving correctly and stably.
 
         /// @todo
         /// We should really check the above claims, otherwise the thread-private arrays could
@@ -465,11 +475,12 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, Co
         /// global memory) and greatly sabotage performance on some GPUs.
 
         qindex numThreads = numBatches;
-        qindex numBlocks = getNumBlocks(numThreads);
+        int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+        qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
         kernel_statevec_anyCtrlFewTargDenseMatr
             <NumCtrls, NumTargs, ApplyConj, ApplyTransp> 
-            <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+            <<<numBlocks, numThreadsPerBlock>>> (
                 ampsPtr, numThreads, 
                 qubitsPtr, nCtrls, qubitStateMask, 
                 targsPtr, matrPtr
@@ -488,6 +499,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, Co
         // where we assign one-block per multiprocessor because we are anyway memory-
         // bandwidth bound (so we don't expect many interweaved blocks per MP).
         qindex numThreads = gpu_getMaxNumConcurrentThreads();
+        int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
         
         // use strictly 2^# threads to maintain precondition of all kernels
         if (!isPowerOf2(numThreads))
@@ -499,15 +511,15 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, Co
 
         // evenly distribute the batches between threads, and the threads unevenly between blocks
         qindex numBatchesPerThread = numBatches / numThreads; // divides evenly
-        qindex numBlocks = getNumBlocks(numThreads);
+        qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
         // expand the cache if necessary
-        qindex numKernelInvocations = numBlocks * NUM_THREADS_PER_BLOCK;
+        qindex numKernelInvocations = numBlocks * numThreadsPerBlock;
         qcomp* cache = gpu_getCacheOfSize(powerOf2(targs.size()), numKernelInvocations);
 
         kernel_statevec_anyCtrlManyTargDenseMatr 
             <NumCtrls, ApplyConj, ApplyTransp> 
-            <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+            <<<numBlocks, numThreadsPerBlock>>> (
                 getGpuQcompPtr(cache),
                 ampsPtr, numThreads, numBatchesPerThread, 
                 qubitsPtr, nCtrls, qubitStateMask, 
@@ -569,13 +581,14 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, Con
     /// efficient (because of improved parallelisation granularity) 
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints deviceCtrls = getDevInts(util_getSorted(ctrls));
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
     auto elems = getGpuQcompArray<2>(matr.elems); // explicit template for MSVC, grr!
 
-    kernel_statevec_anyCtrlOneTargDiagMatr_sub <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlOneTargDiagMatr_sub <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode,
         getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ, elems[0], elems[1]
     );
@@ -639,13 +652,14 @@ void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, Con
     /// efficient (because of improved parallelisation granularity) 
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints deviceCtrls = getDevInts(util_getSorted(ctrls));
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
     auto elems = getGpuQcompArray<4>(matr.elems); // explicit template for MSVC, grr!
 
-    kernel_statevec_anyCtrlTwoTargDiagMatr_sub <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlTwoTargDiagMatr_sub <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode,
         getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ1, targ2,
         elems[0], elems[1], elems[2], elems[3]
@@ -707,13 +721,14 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, Con
     /// efficient (because of improved parallelisation granularity) 
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints deviceTargs = getDevInts(targs);
     devints deviceCtrls = getDevInts(util_getSorted(ctrls));
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
 
-    kernel_statevec_anyCtrlAnyTargDiagMatr_sub <NumCtrls, NumTargs, ApplyConj, HasPower> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlAnyTargDiagMatr_sub <NumCtrls, NumTargs, ApplyConj, HasPower> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode,
         getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, getPtr(deviceTargs), targs.size(), 
         getGpuQcompPtr(util_getGpuMemPtr(matr)), getGpuQcomp(exponent)
@@ -764,11 +779,12 @@ void gpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     kernel_densmatr_allTargDiagMatr_sub 
         <HasPower, ApplyLeft, ApplyRight, ConjRight> 
-        <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+        <<<numBlocks, numThreadsPerBlock>>> (
             getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode,
             getGpuQcompPtr(util_getGpuMemPtr(matr)), matr.numElems, getGpuQcomp(exponent)
     );
@@ -826,8 +842,9 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, ConstList64 ct
     // faster than when giving threads many pair-amps to modify, due to memory movements
 
     qindex numThreads = (qureg.numAmpsPerNode / powerOf2(ctrls.size())) / 2; // divides evenly
-    qindex numBlocks = getNumBlocks(numThreads);
-    kernel_statevector_anyCtrlPauliTensorOrGadget_subA <NumCtrls, NumTargs> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
+    kernel_statevector_anyCtrlPauliTensorOrGadget_subA <NumCtrls, NumTargs> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads,
         getPtr(deviceQubits), ctrls.size(), qubitStateMask, 
         getPtr(deviceTargs), deviceTargs.size(),
@@ -848,7 +865,8 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, ConstList64 ct
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
     qcomp powI = util_getPowerOfI(y.size());
@@ -858,7 +876,7 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, ConstList64 ct
     devints sortedCtrls = getDevInts(util_getSorted(ctrls));
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
 
-    kernel_statevector_anyCtrlPauliTensorOrGadget_subB <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevector_anyCtrlPauliTensorOrGadget_subB <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, 
         getPtr(sortedCtrls), ctrls.size(), ctrlStateMask,
         maskXY, maskYZ, bufferMaskXY,
@@ -889,13 +907,14 @@ void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, ConstList64 c
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints sortedCtrls = getDevInts(util_getSorted(ctrls));
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
     qindex targMask = util_getBitMask(targs);
 
-    kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads,
         getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, targMask,
         getGpuQcomp(fac0), getGpuQcomp(fac1)
@@ -922,7 +941,8 @@ void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector<qcomp> coeffs
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = outQureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     // extract amp ptrs from qureg list
     vector<gpu_qcomp*> ptrs;
@@ -934,7 +954,7 @@ void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector<qcomp> coeffs
     devgpuqcompptrs devQuregAmps = ptrs;
     devcomps devCoeffs = coeffs;
 
-    kernel_statevec_setQuregToWeightedSum_sub <NumQuregs> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_setQuregToWeightedSum_sub <NumQuregs> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(outQureg.gpuAmps), numThreads,
         getPtr(devCoeffs), getPtr(devQuregAmps), inQuregs.size()
     );
@@ -962,9 +982,10 @@ void gpu_densmatr_mixQureg_subB(qreal outProb, Qureg outQureg, qreal inProb, Qur
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = outQureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
-    kernel_densmatr_mixQureg_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_mixQureg_subB <<<numBlocks, numThreadsPerBlock>>> (
         outProb, getGpuQcompPtr(outQureg.gpuAmps), inProb, getGpuQcompPtr(inQureg.gpuAmps),
         numThreads, inQureg.numAmps
     );
@@ -980,9 +1001,10 @@ void gpu_densmatr_mixQureg_subC(qreal outProb, Qureg outQureg, qreal inProb) {
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = outQureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
-    kernel_densmatr_mixQureg_subC <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_mixQureg_subC <<<numBlocks, numThreadsPerBlock>>> (
         outProb, getGpuQcompPtr(outQureg.gpuAmps), inProb, getGpuQcompPtr(outQureg.gpuCommBuffer),
         numThreads, outQureg.rank, powerOf2(outQureg.numQubits), outQureg.logNumAmpsPerNode        
     );
@@ -1012,12 +1034,13 @@ void gpu_densmatr_oneQubitDephasing_subA(Qureg qureg, int ketQubit, qreal prob)
 #elif QUEST_COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     auto fac = util_getOneQubitDephasingFactor(prob);
     int braQubit = util_getBraQubit(ketQubit, qureg);
 
-    kernel_densmatr_oneQubitDephasing_subA <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDephasing_subA <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braQubit, fac
     );
 
@@ -1038,12 +1061,13 @@ void gpu_densmatr_oneQubitDephasing_subB(Qureg qureg, int ketQubit, qreal prob)
 #elif QUEST_COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     auto fac = util_getOneQubitDephasingFactor(prob);
     int braBit = util_getRankBitOfBraQubit(ketQubit, qureg);
 
-    kernel_densmatr_oneQubitDephasing_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDephasing_subB <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braBit, fac
     );
 
@@ -1083,13 +1107,14 @@ void gpu_densmatr_twoQubitDephasing_subB(Qureg qureg, int ketQubitA, int ketQubi
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     auto term = util_getTwoQubitDephasingTerm(prob);
     int braQubitA = util_getBraQubit(ketQubitA, qureg);
     int braQubitB = util_getBraQubit(ketQubitB, qureg);
 
-    kernel_densmatr_twoQubitDephasing_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDephasing_subB <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, // numAmps, not numCols
         ketQubitA, ketQubitB, braQubitA, braQubitB, term
     );
@@ -1111,12 +1136,13 @@ void gpu_densmatr_oneQubitDepolarising_subA(Qureg qureg, int ketQubit, qreal pro
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQubit = util_getBraQubit(ketQubit, qureg);
     auto factors = util_getOneQubitDepolarisingFactors(prob);
 
-    kernel_densmatr_oneQubitDepolarising_subA <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDepolarising_subA <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braQubit, factors.c1, factors.c2, factors.c3
     );
 
@@ -1131,13 +1157,14 @@ void gpu_densmatr_oneQubitDepolarising_subB(Qureg qureg, int ketQubit, qreal pro
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
     int braBit = util_getRankBitOfBraQubit(ketQubit, qureg);
     auto factors = util_getOneQubitDepolarisingFactors(prob);
 
-    kernel_densmatr_oneQubitDepolarising_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDepolarising_subB <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, 
         ketQubit, braBit, factors.c1, factors.c2, factors.c3
     );
@@ -1159,13 +1186,14 @@ void gpu_densmatr_twoQubitDepolarising_subA(Qureg qureg, int ketQb1, int ketQb2,
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQb1 = util_getBraQubit(ketQb1, qureg);
     int braQb2 = util_getBraQubit(ketQb2, qureg);
     auto c3 = util_getTwoQubitDepolarisingFactors(prob).c3;
 
-    kernel_densmatr_twoQubitDepolarising_subA <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDepolarising_subA <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads,
         ketQb1, ketQb2, braQb1, braQb2, c3
     );
@@ -1181,7 +1209,8 @@ void gpu_densmatr_twoQubitDepolarising_subB(Qureg qureg, int ketQb1, int ketQb2,
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 16;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQb1 = util_getBraQubit(ketQb1, qureg);
     int braQb2 = util_getBraQubit(ketQb2, qureg);
@@ -1190,7 +1219,7 @@ void gpu_densmatr_twoQubitDepolarising_subB(Qureg qureg, int ketQb1, int ketQb2,
     // each kernel invocation sums all 4 amps together, so adjusts c1
     qreal altc1 = factors.c1 - factors.c2;
 
-    kernel_densmatr_twoQubitDepolarising_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDepolarising_subB <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads,
         ketQb1, ketQb2, braQb1, braQb2, altc1, factors.c2
     );
@@ -1206,13 +1235,14 @@ void gpu_densmatr_twoQubitDepolarising_subC(Qureg qureg, int ketQb1, int ketQb2,
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQb1 = util_getBraQubit(ketQb1, qureg);
     int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);
     auto c3 = util_getTwoQubitDepolarisingFactors(prob).c3;
 
-    kernel_densmatr_twoQubitDepolarising_subC <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDepolarising_subC <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads,
         ketQb1, ketQb2, braQb1, braBit2, c3
     );
@@ -1228,14 +1258,15 @@ void gpu_densmatr_twoQubitDepolarising_subD(Qureg qureg, int ketQb1, int ketQb2,
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 8;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex offset = getBufferRecvInd();
 
     int braQb1 = util_getBraQubit(ketQb1, qureg);
     int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);
     auto factors = util_getTwoQubitDepolarisingFactors(prob);
 
-    kernel_densmatr_twoQubitDepolarising_subD <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDepolarising_subD <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + offset, numThreads,
         ketQb1, ketQb2, braQb1, braBit2, factors.c1, factors.c2
     );
@@ -1251,7 +1282,8 @@ void gpu_densmatr_twoQubitDepolarising_subE(Qureg qureg, int ketQb1, int ketQb2,
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg);
     int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);
@@ -1260,7 +1292,7 @@ void gpu_densmatr_twoQubitDepolarising_subE(Qureg qureg, int ketQb1, int ketQb2,
     qreal fac0 = 1 + factors.c3;
     qreal fac1 = factors.c1 - fac0;
 
-    kernel_densmatr_twoQubitDepolarising_subE <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDepolarising_subE <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads,
         ketQb1, ketQb2, braBit1, braBit2, fac0, fac1
     );
@@ -1276,14 +1308,15 @@ void gpu_densmatr_twoQubitDepolarising_subF(Qureg qureg, int ketQb1, int ketQb2,
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex offset = getBufferRecvInd();
 
     int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg);
     int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);
     auto c2 = util_getTwoQubitDepolarisingFactors(prob).c2;
 
-    kernel_densmatr_twoQubitDepolarising_subF <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDepolarising_subF <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + offset, numThreads,
         ketQb1, ketQb2, braBit1, braBit2, c2
     );
@@ -1305,12 +1338,13 @@ void gpu_densmatr_oneQubitPauliChannel_subA(Qureg qureg, int ketQubit, qreal pI,
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQubit = util_getBraQubit(ketQubit, qureg);
     auto factors = util_getOneQubitPauliChannelFactors(pI, pX, pY, pZ);
 
-    kernel_densmatr_oneQubitPauliChannel_subA <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitPauliChannel_subA <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braQubit, 
         factors.c1, factors.c2, factors.c3, factors.c4
     );
@@ -1326,13 +1360,14 @@ void gpu_densmatr_oneQubitPauliChannel_subB(Qureg qureg, int ketQubit, qreal pI,
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
     int braBit = util_getRankBitOfBraQubit(ketQubit, qureg);
     auto factors = util_getOneQubitPauliChannelFactors(pI, pX, pY, pZ);
 
-    kernel_densmatr_oneQubitPauliChannel_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitPauliChannel_subB <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, 
         ketQubit, braBit, factors.c1, factors.c2, factors.c3, factors.c4
     );
@@ -1354,12 +1389,13 @@ void gpu_densmatr_oneQubitDamping_subA(Qureg qureg, int ketQubit, qreal prob) {
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQubit = util_getBraQubit(ketQubit, qureg);
     auto factors = util_getOneQubitDampingFactors(prob);
 
-    kernel_densmatr_oneQubitDamping_subA <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDamping_subA <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads,
         ketQubit, braQubit, prob, factors.c1, factors.c2
     );
@@ -1375,11 +1411,12 @@ void gpu_densmatr_oneQubitDamping_subB(Qureg qureg, int qubit, qreal prob) {
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     auto c2 = util_getOneQubitDampingFactors(prob).c2;
 
-    kernel_densmatr_oneQubitDamping_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDamping_subB <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads, qubit, c2
     );
 
@@ -1394,12 +1431,13 @@ void gpu_densmatr_oneQubitDamping_subC(Qureg qureg, int ketQubit, qreal prob) {
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     auto braBit = util_getRankBitOfBraQubit(ketQubit, qureg);
     auto c1 = util_getOneQubitDampingFactors(prob).c1;
 
-    kernel_densmatr_oneQubitDamping_subC <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDamping_subC <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braBit, c1
     );
 
@@ -1414,10 +1452,11 @@ void gpu_densmatr_oneQubitDamping_subD(Qureg qureg, int qubit, qreal prob) {
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
-    kernel_densmatr_oneQubitDamping_subD <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDamping_subD <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, 
         qubit, prob
     );
@@ -1442,13 +1481,14 @@ void gpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, ConstList64 ta
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = outQureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints devTargs = getDevInts(targs);
     devints devPairTargs = getDevInts(pairTargs);
     devints devAllTargs = getDevInts(util_getSorted(targs, pairTargs));
 
-    kernel_densmatr_partialTrace_sub <NumTargs> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_partialTrace_sub <NumTargs> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(inQureg.gpuAmps), getGpuQcompPtr(outQureg.gpuAmps), numThreads,
         getPtr(devTargs), getPtr(devPairTargs), getPtr(devAllTargs), targs.size()
     );
@@ -1562,13 +1602,14 @@ void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu
 #if QUEST_COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     // allocate exponentially-big temporary memory (error if failed)
     devints devQubits = getDevInts(qubits);
     devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws
 
-    kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub<NumQubits> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub<NumQubits> <<<numBlocks, numThreadsPerBlock>>> (
         getPtr(devProbs), getGpuQcompPtr(qureg.gpuAmps), numThreads, 
         qureg.rank, qureg.logNumAmpsPerNode, getPtr(devQubits), devQubits.size()
     );
@@ -1596,7 +1637,8 @@ void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu
     // we decouple numColsPerNode and numThreads for clarity
     // (and in case parallelisation granularity ever changes);
     qindex numThreads = powerOf2(qureg.logNumColsPerNode);
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     
     qindex firstDiagInd = util_getLocalIndexOfFirstDiagonalAmp(qureg);
     qindex numAmpsPerCol = powerOf2(qureg.numQubits);
@@ -1605,7 +1647,7 @@ void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu
     devints devQubits = getDevInts(qubits);
     devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws
 
-    kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub<NumQubits> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub<NumQubits> <<<numBlocks, numThreadsPerBlock>>> (
         getPtr(devProbs), getGpuQcompPtr(qureg.gpuAmps), 
         numThreads, firstDiagInd, numAmpsPerCol,
         qureg.rank, qureg.logNumAmpsPerNode, 
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index d617ba8df..59341759f 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -7,6 +7,7 @@ target_sources(tests
   debug.cpp
   decoherence.cpp
   environment.cpp
+  experimental.cpp
   initialisations.cpp
   matrices.cpp
   multiplication.cpp
diff --git a/tests/unit/decoherence.cpp b/tests/unit/decoherence.cpp
index f36c491bb..60b4cd640 100644
--- a/tests/unit/decoherence.cpp
+++ b/tests/unit/decoherence.cpp
@@ -38,7 +38,8 @@ using std::vector;
  */
 
 
-#define TEST_CATEGORY "[unit][decoherence]"
+#define TEST_CATEGORY \
+    LABEL_UNIT_TAG "[decoherence]"
 
 
 void TEST_ON_CACHED_QUREGS(auto apiFunc, vector<int> targs, vector<qmatrix> kraus) {
diff --git a/tests/unit/experimental.cpp b/tests/unit/experimental.cpp
new file mode 100644
index 000000000..b36f67ad1
--- /dev/null
+++ b/tests/unit/experimental.cpp
@@ -0,0 +1,133 @@
+/** @file
+ * Unit tests of the environment module.
+ *
+ * @author Oliver Brown
+ * @author Tyson Jones
+ * 
+ * @defgroup unitexperi Experimental
+ * @ingroup unittests
+ */
+
+#include "quest.h"
+
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/matchers/catch_matchers_string.hpp>
+#include <catch2/generators/catch_generators_range.hpp>
+
+#include "tests/utils/macros.hpp"
+#include "tests/utils/config.hpp"
+
+using Catch::Matchers::ContainsSubstring;
+
+
+
+/*
+ * UTILITIES
+ */
+
+#define TEST_CATEGORY \
+    LABEL_UNIT_TAG "[experimental]"
+
+
+
+/** 
+ * TESTS
+ * 
+ * @ingroup unitexperi
+ * @{
+ */
+
+
+TEST_CASE( "setQuESTNumGpuThreadsPerBlock", TEST_CATEGORY ) {
+
+    // remember the default number for later restoration (hence static)
+    static int initNumTPB = getQuESTNumGpuThreadsPerBlock();
+
+    SECTION( LABEL_CORRECTNESS ) {
+
+        // begin at 64 (AMD min, larger than NVIDIA min of 32),
+        // stop at 1024 (should be less than dev-specific max)
+        int inNumTPB = GENERATE( 64, 128, 256, 512, 1024 ); 
+        setQuESTNumGpuThreadsPerBlock(inNumTPB);
+
+        int outNumTPB = getQuESTNumGpuThreadsPerBlock();
+        REQUIRE( inNumTPB == outNumTPB );
+        
+        // BEWARE that we do not here test whether all QuEST
+        // operators succeed with the various numTBP; that must
+        // be ad hoc asssesed via updating the numTBP env-var
+        // before launching the entirety of the tests
+    }
+
+    SECTION( LABEL_VALIDATION ) {
+
+        SECTION( "Negative" ) {
+
+            int badNumTPB = GENERATE( 0, -1, -9999 );
+            REQUIRE_THROWS_WITH( setQuESTNumGpuThreadsPerBlock(badNumTPB), ContainsSubstring( "must be positive" ) );
+        }
+
+        SECTION( "Indivisible by warp size" ) {
+
+            // If HIP status was attached to QuESTEnv, we could do:
+            //     QuESTEnv env = getQuESTEnv();
+            //     int warpSize = (env.isGpuAccelerated && env.isHipCompiled)? 64 : 32;
+            // Since this currently isn't the case, we assume a warp size of 32,
+            // which will mean when this test is run on AMD GPUs, the below tested
+            // badNumTBP won't be as interestingly/rigorously spread
+            int warpSize = 32;
+
+            int badNumTPB = GENERATE_COPY( warpSize - 1, warpSize + 1, warpSize + warpSize/2, 3*warpSize + warpSize/2 );
+
+            REQUIRE_THROWS_WITH( setQuESTNumGpuThreadsPerBlock(badNumTPB), ContainsSubstring( "Number does not divide evenly into the warp size" ) );
+        }
+
+        SECTION( "Exceeds device maximum" ) {
+
+            int badNumTPB = 999999; // exceeds expected 1024 max
+
+            // Cannot be tested (since validation not imposed) when GPU is not actively used
+            if (getQuESTEnv().isGpuAccelerated)
+                REQUIRE_THROWS_WITH( setQuESTNumGpuThreadsPerBlock(badNumTPB), ContainsSubstring( "Exceeds the hardware-imposed maximum" ) );
+
+            SUCCEED( );
+        }
+    }
+
+    // restore numTBP, so as not to interfere with other tests
+    setQuESTNumGpuThreadsPerBlock(initNumTPB);
+}
+
+
+TEST_CASE( "getQuESTNumGpuThreadsPerBlock", TEST_CATEGORY ) {
+
+    SECTION( LABEL_CORRECTNESS ) {
+
+        // check initial value matches either the env-var (if set),
+        // or the fixed default in the codebase (hardcoded in test utils)
+        int defaultNum = getDefaultNumGpuThreadsPerBlock(); // test util via env-var
+        int reportedNum = getQuESTNumGpuThreadsPerBlock();  // QuEST API
+
+        REQUIRE( defaultNum == reportedNum );
+
+        // further testing of this function appears in setQuESTNumGpuThreadsPerBlock()
+    }
+
+    SECTION( LABEL_VALIDATION ) {
+
+        // there is none (except untestable env is init!)
+        SUCCEED( );
+    }
+}
+
+
+/** @} (end defgroup) */
+
+
+
+/**
+ * @todo
+ * UNTESTED FUNCTIONS
+ */
+
+// nothing! :^)
diff --git a/tests/utils/config.cpp b/tests/utils/config.cpp
index 30a3844ba..d8eeab605 100644
--- a/tests/utils/config.cpp
+++ b/tests/utils/config.cpp
@@ -40,9 +40,7 @@ int getIntEnvVarValueOrDefault(string name, int defaultValue) {
 
 
 /*
- * PUBLIC
- *
- * which each call std::getenv only once
+ * PUBLIC TEST ENV VARS
  */
 
 int getNumQubitsInUnitTestedQuregs() {
@@ -74,3 +72,20 @@ bool getWhetherToTestAllDeployments() {
     static bool value = getIntEnvVarValueOrDefault("QUEST_TEST_TRY_ALL_DEPLOYMENTS", 1);
     return value;
 }
+
+
+
+/*
+ * PUBLIC QUEST ENV VARS
+ */
+
+int getDefaultNumGpuThreadsPerBlock() {
+
+    // when the env-var is not present, we MUST return the default assumed by the QuEST src code,
+    // which at the time of writing, is a fixed 128 (rather than hardware-specific value)
+    const int compileTimeDefaultTPB = 128;
+
+    // when the env-var is present, we consult that, just like QuEST
+    static int value = getIntEnvVarValueOrDefault("QUEST_NUM_GPU_THREADS_PER_BLOCK", compileTimeDefaultTPB);
+    return value;
+}
diff --git a/tests/utils/config.hpp b/tests/utils/config.hpp
index 10a61f67a..80be56e01 100644
--- a/tests/utils/config.hpp
+++ b/tests/utils/config.hpp
@@ -82,12 +82,16 @@
  * ACCESSING ENV-VARS 
  */
 
+// test env-vars
 int getNumQubitsInUnitTestedQuregs();
 int getMaxNumTestedQubitPermutations();
 int getMaxNumTestedSuperoperatorTargets();
 int getNumTestedMixedDeploymentRepetitions();
 bool getWhetherToTestAllDeployments();
 
+// quest env-vars
+int getDefaultNumGpuThreadsPerBlock();
+
 
 #endif // CONFIG_PP