beehive-lab · orionpapadakis · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026 · Jun 11, 2026
diff --git a/.github/workflows/build-and-run.yml b/.github/workflows/build-and-run.yml
@@ -177,15 +177,62 @@ jobs:
           flags: --with-prefill-decode --batch-prefill-size 32 --cuda-graphs
           metrics_file: ${{ runner.temp }}/metrics-ptx-llama-1b-f16-batch-prefill-decode-cuda-graphs.json
 
-      - name: FP16 - Run Qwen3-4B-f16.gguf
+      - name: FP16 - Run Qwen3-0.6B-f16.gguf
         uses: ./.github/actions/run-inference
         with:
           backend: ${{ matrix.backend.name }}
-          model_file: Qwen3-4B-f16.gguf
-          model: Qwen3-4B
+          model_file: Qwen3-0.6B-f16.gguf
+          model: Qwen3-0.6B
           quantization: F16
           configuration: standard
-          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-4b-f16-standard.json
+          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-0-6b-f16-standard.json
+
+      - name: FP16 - Run Qwen3-0.6B-f16.gguf - Prefill-Decode
+        uses: ./.github/actions/run-inference
+        with:
+          backend: ${{ matrix.backend.name }}
+          model_file: Qwen3-0.6B-f16.gguf
+          model: Qwen3-0.6B
+          quantization: F16
+          configuration: prefill-decode
+          flags: --with-prefill-decode
+          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-0-6b-f16-prefill-decode.json
+
+      - name: FP16 - Run Qwen3-0.6B-f16.gguf - Batch-Prefill-Decode
+        uses: ./.github/actions/run-inference
+        with:
+          backend: ${{ matrix.backend.name }}
+          model_file: Qwen3-0.6B-f16.gguf
+          model: Qwen3-0.6B
+          quantization: F16
+          configuration: batch-prefill-decode
+          flags: --with-prefill-decode --batch-prefill-size 32
+          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-0-6b-f16-batch-prefill-decode.json
+
+      # PTX-only: the two --cuda-graphs variants below run only when matrix.backend.name == 'ptx'
+      - name: PTX - FP16 - Run Qwen3-0.6B-f16.gguf - Prefill-Decode-CUDA-Graphs
+        if: matrix.backend.name == 'ptx'
+        uses: ./.github/actions/run-inference
+        with:
+          backend: ${{ matrix.backend.name }}
+          model_file: Qwen3-0.6B-f16.gguf
+          model: Qwen3-0.6B
+          quantization: F16
+          configuration: prefill-decode-cuda-graphs
+          flags: --with-prefill-decode --cuda-graphs
+          metrics_file: ${{ runner.temp }}/metrics-ptx-qwen3-0-6b-f16-prefill-decode-cuda-graphs.json
+
+      - name: PTX - FP16 - Run Qwen3-0.6B-f16.gguf - Batch-Prefill-Decode-CUDA-Graphs
+        if: matrix.backend.name == 'ptx'
+        uses: ./.github/actions/run-inference
+        with:
+          backend: ${{ matrix.backend.name }}
+          model_file: Qwen3-0.6B-f16.gguf
+          model: Qwen3-0.6B
+          quantization: F16
+          configuration: batch-prefill-decode-cuda-graphs
+          flags: --with-prefill-decode --batch-prefill-size 32 --cuda-graphs
+          metrics_file: ${{ runner.temp }}/metrics-ptx-qwen3-0-6b-f16-batch-prefill-decode-cuda-graphs.json
 
       - name: FP16 - Run Mistral-7B-Instruct-v0.3.fp16.gguf
         uses: ./.github/actions/run-inference
@@ -358,6 +405,53 @@ jobs:
           configuration: standard
           metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-0-6b-q8-standard.json
 
+      - name: Q8 - Run Qwen3-0.6B-Q8_0.gguf - Prefill-Decode
+        uses: ./.github/actions/run-inference
+        with:
+          backend: ${{ matrix.backend.name }}
+          model_file: Qwen3-0.6B-Q8_0.gguf
+          model: Qwen3-0.6B
+          quantization: Q8_0
+          configuration: prefill-decode
+          flags: --with-prefill-decode
+          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-0-6b-q8-prefill-decode.json
+
+      - name: Q8 - Run Qwen3-0.6B-Q8_0.gguf - Batch-Prefill-Decode
+        uses: ./.github/actions/run-inference
+        with:
+          backend: ${{ matrix.backend.name }}
+          model_file: Qwen3-0.6B-Q8_0.gguf
+          model: Qwen3-0.6B
+          quantization: Q8_0
+          configuration: batch-prefill-decode
+          flags: --with-prefill-decode --batch-prefill-size 32
+          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-0-6b-q8-batch-prefill-decode.json
+
+      # PTX-only: the two --cuda-graphs variants below run only when matrix.backend.name == 'ptx'
+      - name: PTX - Q8 - Run Qwen3-0.6B-Q8_0.gguf - Prefill-Decode-CUDA-Graphs
+        if: matrix.backend.name == 'ptx'
+        uses: ./.github/actions/run-inference
+        with:
+          backend: ${{ matrix.backend.name }}
+          model_file: Qwen3-0.6B-Q8_0.gguf
+          model: Qwen3-0.6B
+          quantization: Q8_0
+          configuration: prefill-decode-cuda-graphs
+          flags: --with-prefill-decode --cuda-graphs
+          metrics_file: ${{ runner.temp }}/metrics-ptx-qwen3-0-6b-q8-prefill-decode-cuda-graphs.json
+
+      - name: PTX - Q8 - Run Qwen3-0.6B-Q8_0.gguf - Batch-Prefill-Decode-CUDA-Graphs
+        if: matrix.backend.name == 'ptx'
+        uses: ./.github/actions/run-inference
+        with:
+          backend: ${{ matrix.backend.name }}
+          model_file: Qwen3-0.6B-Q8_0.gguf
+          model: Qwen3-0.6B
+          quantization: Q8_0
+          configuration: batch-prefill-decode-cuda-graphs
+          flags: --with-prefill-decode --batch-prefill-size 32 --cuda-graphs
+          metrics_file: ${{ runner.temp }}/metrics-ptx-qwen3-0-6b-q8-batch-prefill-decode-cuda-graphs.json
+
       - name: Q8 - Run Phi-3-mini-4k-instruct-Q8_0.gguf
         uses: ./.github/actions/run-inference
         with:

diff --git a/src/main/java/org/beehive/gpullama3/Options.java b/src/main/java/org/beehive/gpullama3/Options.java
@@ -11,8 +11,8 @@ public record Options(Path modelPath, String prompt, String systemPrompt, String
 
     public Options {
         require(interactive || prompt != null, "Missing argument: --prompt is required in --instruct mode e.g. --prompt \"Why is the sky blue?\"");
-        require(0 <= temperature, "Invalid argument: --temperature must be non-negative");
-        require(0 <= topp && topp <= 1, "Invalid argument: --top-p must be within [0, 1]");
+        require(Float.isNaN(temperature) || 0 <= temperature, "Invalid argument: --temperature must be non-negative");
+        require(Float.isNaN(topp) || 0 <= topp && topp <= 1, "Invalid argument: --top-p must be within [0, 1]");
         require(batchPrefillSize >= 1, "Invalid argument: --batch-prefill-size must be >= 1");
         require(batchPrefillSize == 1 || withPrefillDecode, "Invalid argument: --batch-prefill-size requires --with-prefill-decode");
         // Publish to system properties so TornadoVMMasterPlan and Llama read the right values
@@ -44,8 +44,8 @@ public static void printUsage(PrintStream out) {
         out.println("  --prompt, -p <string>         input prompt");
         out.println("  --system-prompt, -sp <string> (optional) system prompt (Llama models)");
         out.println("  --suffix <string>             suffix for fill-in-the-middle request (Codestral)");
-        out.println("  --temperature, -temp <float>  temperature in [0,inf], default 0.1");
-        out.println("  --top-p <float>               p value in top-p (nucleus) sampling in [0,1] default 0.95");
+        out.println("  --temperature, -temp <float>  temperature in [0,inf], default: auto-detected from model family");
+        out.println("  --top-p <float>               p value in top-p (nucleus) sampling in [0,1], default: auto-detected from model family");
         out.println("  --seed <long>                 random seed, default System.nanoTime()");
         out.println("  --max-tokens, -n <int>        number of steps to run for < 0 = limited by context length, default " + DEFAULT_MAX_TOKENS);
         out.println("  --stream <boolean>            print tokens during generation; may cause encoding artifacts for non ASCII text, default true");
@@ -59,8 +59,8 @@ public static Options getDefaultOptions() {
         String prompt = "Tell me a story with Java"; // Hardcoded for testing
         String systemPrompt = null;
         String suffix = null;
-        float temperature = 0.1f;
-        float topp = 0.95f;
+        float temperature = Float.NaN; // resolved from model family after loading
+        float topp = Float.NaN;        // resolved from model family after loading
         Path modelPath = null;
         long seed = System.nanoTime();
         int maxTokens = DEFAULT_MAX_TOKENS;
@@ -76,8 +76,8 @@ public static Options parseOptions(String[] args) {
         String prompt = "Tell me a story with Java"; // Hardcoded for testing
         String systemPrompt = null;
         String suffix = null;
-        float temperature = 0.1f;
-        float topp = 0.95f;
+        float temperature = Float.NaN; // resolved from model family after loading
+        float topp = Float.NaN;        // resolved from model family after loading
         Path modelPath = null;
         long seed = System.nanoTime();
         int maxTokens = DEFAULT_MAX_TOKENS;

diff --git a/src/main/java/org/beehive/gpullama3/inference/sampler/Sampler.java b/src/main/java/org/beehive/gpullama3/inference/sampler/Sampler.java
@@ -122,7 +122,13 @@ static Sampler selectSampler(int vocabularySize, float temperature, float topp,
     }
 
     static Sampler createSampler(Model model, Options options) {
-        return selectSampler(model.configuration().vocabularySize(), options.temperature(), options.topp(), options.seed());
+        // Options carries NaN for temperature/top-p when the value is to be resolved from the
+        // model family; any NaN is replaced here by the default of this model's chat format.
+        float temperature = options.temperature();
+        float topp = options.topp();
+        if (Float.isNaN(temperature)) { temperature = (float) model.chatFormat().defaultTemperature(); }
+        if (Float.isNaN(topp)) { topp = (float) model.chatFormat().defaultTopP(); }
+        return selectSampler(model.configuration().vocabularySize(), temperature, topp, options.seed());
     }
 
     /**

diff --git a/src/main/java/org/beehive/gpullama3/inference/state/Qwen3State.java b/src/main/java/org/beehive/gpullama3/inference/state/Qwen3State.java
@@ -35,6 +35,18 @@ public Qwen3State(Configuration config, int batchsize) {
         this.tempKcur = new FloatArray(nEmbdHead);
     }
 
+    @Override
+    protected int batchQDim(Configuration config) {
+        final Qwen3Configuration qwen3 = (Qwen3Configuration) config; // the cast fails for a non-Qwen3 configuration
+        return qwen3.numberOfHeadsKey() * qwen3.numberOfHeads(); // replaces the State default of config.dim()
+    }
+
+    @Override
+    protected int batchKvDim(Configuration config) {
+        final Qwen3Configuration qwen3 = (Qwen3Configuration) config; // the cast fails for a non-Qwen3 configuration
+        return qwen3.numberOfHeadsValue() * qwen3.numberOfKeyValueHeads(); // replaces the State default derived from config.dim()
+    }
+
     @Override
     protected StateFields createStateFields(Configuration configuration) {
         StateFields fields = new StateFields();

diff --git a/src/main/java/org/beehive/gpullama3/inference/state/State.java b/src/main/java/org/beehive/gpullama3/inference/state/State.java
@@ -77,10 +77,10 @@ public abstract class State {
     public final HalfFloatArray embeddingXBatch;    // B × dim  (FP16 input)
     public final FloatArray wrapXBatch;             // B × dim  (live activations / Q8_0 dequant)
     public final HalfFloatArray wrapXbFP16Batch;    // B × dim  (RMSNorm output, FP16)
-    public final FloatArray wrapQBatch;             // B × dim
+    public final FloatArray wrapQBatch;             // B × qDim (Q projection)
     public final FloatArray wrapKBatch;             // B × kvDim
     public final FloatArray wrapVBatch;             // B × kvDim
-    public final FloatArray wrapXbBatch;            // B × dim  (attention output)
+    public final FloatArray wrapXbBatch;            // B × qDim  (attention output)
     public final FloatArray wrapHbBatch;            // B × hiddenDim
     public final FloatArray attnScaleBatch;         // B        (per-token RMS scale, attn)
     public final FloatArray ffnScaleBatch;          // B        (per-token RMS scale, FFN)
@@ -135,14 +135,15 @@ protected State(Configuration config, int batchsize) {
 
         int gpuBatchSize = Integer.getInteger("llama.prefillBatchSize", 1);
         if (gpuBatchSize > 1) {
-            int kvDim = (config.dim() * config.numberOfKeyValueHeads()) / config.numberOfHeads();
+            int qDim  = batchQDim(config);
+            int kvDim = batchKvDim(config);
             this.embeddingXBatch = new HalfFloatArray(gpuBatchSize * config.dim());
             this.wrapXBatch = new FloatArray(gpuBatchSize * config.dim());
             this.wrapXbFP16Batch = new HalfFloatArray(gpuBatchSize * config.dim());
-            this.wrapQBatch = new FloatArray(gpuBatchSize * config.dim());
+            this.wrapQBatch = new FloatArray(gpuBatchSize * qDim);
             this.wrapKBatch = new FloatArray(gpuBatchSize * kvDim);
             this.wrapVBatch = new FloatArray(gpuBatchSize * kvDim);
-            this.wrapXbBatch = new FloatArray(gpuBatchSize * config.dim());
+            this.wrapXbBatch = new FloatArray(gpuBatchSize * qDim);
             this.wrapHbBatch = new FloatArray(gpuBatchSize * config.hiddenDim());
             this.attnScaleBatch = new FloatArray(gpuBatchSize);
             this.ffnScaleBatch = new FloatArray(gpuBatchSize);
@@ -162,6 +163,16 @@ protected State(Configuration config, int batchsize) {
         }
     }
 
+    /** Per-token width of the Q projection, used to size wrapQBatch and wrapXbBatch; subclasses such as Qwen3State override it. */
+    protected int batchQDim(Configuration config) {
+        return config.dim(); // invoked from the State constructor, so overrides should depend only on the config argument
+    }
+
+    /** Per-token width of the K and V projections, used to size wrapKBatch and wrapVBatch; subclasses such as Qwen3State override it. */
+    protected int batchKvDim(Configuration config) {
+        return (config.dim() * config.numberOfKeyValueHeads()) / config.numberOfHeads(); // integer division; invoked from the State constructor
+    }
+
     // Abstract method - subclasses implement their specific allocation logic and sizes
     protected abstract StateFields createStateFields(Configuration config);