From a3e0b5bba4fefa10f4a97a407a2fe9c64ebae037 Mon Sep 17 00:00:00 2001
From: Michal Harakal <michal.harakal@googlemail.com>
Date: Fri, 26 Jun 2026 14:21:30 +0200
Subject: [PATCH] =?UTF-8?q?release:=200.32.4=20=E2=80=94=20streaming=20det?=
 =?UTF-8?q?okenization=20preserves=20word=20spaces?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tokenizer.decodeToken(id): per-token streaming decode that keeps each
SentencePiece piece's leading word-boundary space (llama.cpp token_to_piece
semantics), so a generation loop decoding one token at a time no longer runs
words together ("the process" -> "theprocess"). SentencePieceTokenizer
overrides it to skip the sequence-level addSpacePrefix strip; adds
decode(ids, stripLeadingSpace). Backward-compatible (decode(IntArray) unchanged).

Version bump + CHANGELOG/README/docs version snippets -> 0.32.4. antora.yml is
version: ~ (branch-tracked). skainet-io-core is not API-tracked, no dump change.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md                                  | 13 ++++++++++
 README.md                                     |  9 ++++++-
 .../modules/ROOT/pages/how-to/io-readers.adoc |  4 +--
 .../pages/how-to/java-model-training.adoc     |  2 +-
 .../ROOT/pages/how-to/minerva-export.adoc     |  2 +-
 .../tutorials/image-data-getting-started.adoc |  2 +-
 .../pages/tutorials/java-getting-started.adoc |  4 +--
 gradle.properties                             |  2 +-
 .../io/tokenizer/SentencePieceTokenizer.kt    | 24 +++++++++++++-----
 .../kotlin/sk/ainet/io/tokenizer/Tokenizer.kt | 16 ++++++++++++
 .../SentencePieceTokenizerCoreTest.kt         | 25 +++++++++++++++++++
 11 files changed, 88 insertions(+), 15 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0e5100a0..cf033410 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,19 @@
 
 ## [Unreleased]
 
+## [0.32.4] - 2026-06-26
+
+### Fixed
+
+- **Streaming detokenization preserves word-boundary spaces (`Tokenizer.decodeToken`).** A generation
+  loop that decodes one token at a time (`decode(tokenId)`) ran words together (`"the process"` →
+  `"theprocess"`): the single-token path delegated to the sequence-level
+  `SentencePieceTokenizer.decode(IntArray)`, whose `addSpacePrefix` leading-space strip is only
+  correct once per sequence. Adds `Tokenizer.decodeToken(id)` (default = `decode(intArrayOf(id))`) and
+  a `SentencePieceTokenizer` override that decodes a single token without the leading strip (llama.cpp
+  `token_to_piece` semantics), plus a `decode(ids, stripLeadingSpace)` overload. Streaming
+  consumers must switch to `decodeToken`; per-token `decode(intArrayOf(id))` still strips the space.
+
 ## [0.32.3] - 2026-06-25
 
 ### Added
diff --git a/README.md b/README.md
index 5755da3e..59beabfe 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ Add the core dependencies (Gradle Kotlin DSL):
 ```kotlin
 dependencies {
     // Recommended: import the umbrella BOM and drop versions on the engine modules.
-    implementation(platform("sk.ainet:skainet-bom:0.32.3"))
+    implementation(platform("sk.ainet:skainet-bom:0.32.4"))
 
     implementation("sk.ainet.core:skainet-lang-core")
     implementation("sk.ainet.core:skainet-backend-cpu")
@@ -241,6 +241,13 @@ Runnable examples:
 
 ---
 
+## What's New in 0.32.4
+
+- **Streaming detokenization keeps word spaces (`Tokenizer.decodeToken`).** Decoding generated tokens
+  one at a time no longer runs words together (`"the process"` → `"theprocess"`). The new
+  `decodeToken(id)` keeps each SentencePiece piece's leading space (llama.cpp `token_to_piece`
+  semantics); `decode(IntArray)` still strips the single sequence-leading space as before.
+
 ## What's New in 0.32.3
 
 - **Graph-output pruning for export (`ComputeGraph.prunedToOutputs`).** Trims a traced decoder's
diff --git a/docs/modules/ROOT/pages/how-to/io-readers.adoc b/docs/modules/ROOT/pages/how-to/io-readers.adoc
index 5093883f..62d6d4a5 100644
--- a/docs/modules/ROOT/pages/how-to/io-readers.adoc
+++ b/docs/modules/ROOT/pages/how-to/io-readers.adoc
@@ -20,7 +20,7 @@ Add the following dependencies to your `build.gradle.kts`:
 [source,kotlin]
 ----
 dependencies {
-    implementation(platform("sk.ainet:skainet-bom:0.32.3"))
+    implementation(platform("sk.ainet:skainet-bom:0.32.4"))
 
     implementation("sk.ainet.core:skainet-io-gguf")
     implementation("org.jetbrains.kotlinx:kotlinx-io-core:0.8.2")
@@ -32,7 +32,7 @@ dependencies {
 [source,kotlin]
 ----
 dependencies {
-    implementation(platform("sk.ainet:skainet-bom:0.32.3"))
+    implementation(platform("sk.ainet:skainet-bom:0.32.4"))
 
     implementation("sk.ainet.core:skainet-io-onnx")
     implementation("org.jetbrains.kotlinx:kotlinx-io-core:0.8.2")
diff --git a/docs/modules/ROOT/pages/how-to/java-model-training.adoc b/docs/modules/ROOT/pages/how-to/java-model-training.adoc
index 4b0407b1..309827f5 100644
--- a/docs/modules/ROOT/pages/how-to/java-model-training.adoc
+++ b/docs/modules/ROOT/pages/how-to/java-model-training.adoc
@@ -23,7 +23,7 @@ This guide covers building neural networks, defining loss functions and optimize
         <dependency>
             <groupId>sk.ainet</groupId>
             <artifactId>skainet-bom</artifactId>
-            <version>0.32.3</version>
+            <version>0.32.4</version>
             <type>pom</type>
             <scope>import</scope>
         </dependency>
diff --git a/docs/modules/ROOT/pages/how-to/minerva-export.adoc b/docs/modules/ROOT/pages/how-to/minerva-export.adoc
index ab99753b..a25e452f 100644
--- a/docs/modules/ROOT/pages/how-to/minerva-export.adoc
+++ b/docs/modules/ROOT/pages/how-to/minerva-export.adoc
@@ -38,7 +38,7 @@ For a published application, use the SKaiNET BOM and the Minerva artifact:
 [source,kotlin]
 ----
 dependencies {
-    implementation(platform("sk.ainet:skainet-bom:0.32.3"))
+    implementation(platform("sk.ainet:skainet-bom:0.32.4"))
     implementation("sk.ainet.core:skainet-compile-minerva")
 }
 ----
diff --git a/docs/modules/ROOT/pages/tutorials/image-data-getting-started.adoc b/docs/modules/ROOT/pages/tutorials/image-data-getting-started.adoc
index 15fe5f4b..ecfab5ca 100644
--- a/docs/modules/ROOT/pages/tutorials/image-data-getting-started.adoc
+++ b/docs/modules/ROOT/pages/tutorials/image-data-getting-started.adoc
@@ -32,7 +32,7 @@ For a JVM project, add the image/data modules alongside the CPU backend:
 [source,kotlin]
 ----
 dependencies {
-    implementation(platform("sk.ainet:skainet-bom:0.32.3"))
+    implementation(platform("sk.ainet:skainet-bom:0.32.4"))
 
     implementation("sk.ainet:skainet-backend-cpu-jvm")
     implementation("sk.ainet:skainet-io-image-jvm")
diff --git a/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc b/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc
index 90ed31ab..dd161769 100644
--- a/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc
+++ b/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc
@@ -46,7 +46,7 @@ The `skainet-bom` manages all SKaiNET module versions so you never have to keep
 ----
 <project>
     <properties>
-        <skainet.version>0.32.3</skainet.version>
+        <skainet.version>0.32.4</skainet.version>
     </properties>
 
     <dependencyManagement>
@@ -144,7 +144,7 @@ repositories {
 
 dependencies {
     // Import BOM for version alignment
-    implementation(platform("sk.ainet:skainet-bom:0.32.3"))
+    implementation(platform("sk.ainet:skainet-bom:0.32.4"))
 
     // Core tensor library
     implementation("sk.ainet:skainet-lang-core-jvm")
diff --git a/gradle.properties b/gradle.properties
index f0673801..ce9dd46e 100644
--- a/gradle.properties
+++ b/gradle.properties
@@ -1,5 +1,5 @@
 GROUP=sk.ainet.core
-VERSION_NAME=0.32.3
+VERSION_NAME=0.32.4
 POM_DESCRIPTION=SKaiNET
 
 POM_URL=https://github.com/SKaiNET-developers/skainet/
diff --git a/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizer.kt b/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizer.kt
index 16d711cb..174d18dc 100644
--- a/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizer.kt
+++ b/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizer.kt
@@ -86,7 +86,22 @@ public class SentencePieceTokenizer(
         return IntArray(out.size) { out[it] }
     }
 
-    override fun decode(ids: IntArray): String {
+    override fun decode(ids: IntArray): String = decode(ids, stripLeadingSpace = addSpacePrefix)
+
+    /**
+     * Decode a single token to its surface piece without stripping the leading
+     * word-boundary space — see [Tokenizer.decodeToken]. Required for correct
+     * spacing when a caller decodes generated tokens one at a time.
+     */
+    override fun decodeToken(id: Int): String = decode(intArrayOf(id), stripLeadingSpace = false)
+
+    /**
+     * Decode [ids] to text. When [stripLeadingSpace] is true a single leading
+     * space (the artefact of SentencePiece's `addSpacePrefix` at encode time) is
+     * removed — correct for a whole sequence, but NOT for an individual streaming
+     * token, which must keep its leading space.
+     */
+    public fun decode(ids: IntArray, stripLeadingSpace: Boolean): String {
         val sb = StringBuilder()
         val byteBuf = ArrayList<Byte>()
         for (id in ids) {
@@ -104,11 +119,8 @@ public class SentencePieceTokenizer(
         }
         if (byteBuf.isNotEmpty()) sb.append(flushBytes(byteBuf))
 
-        var result = sb.toString().replace(WHITESPACE_ESCAPE, ' ')
-        if (addSpacePrefix && result.startsWith(' ')) {
-            result = result.substring(1)
-        }
-        return result
+        val result = sb.toString().replace(WHITESPACE_ESCAPE, ' ')
+        return if (stripLeadingSpace && result.startsWith(' ')) result.substring(1) else result
     }
 
     // ------------------------------------------------------------------
diff --git a/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/Tokenizer.kt b/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/Tokenizer.kt
index 2100d8d1..9c772d85 100644
--- a/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/Tokenizer.kt
+++ b/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/Tokenizer.kt
@@ -15,4 +15,20 @@ public interface Tokenizer {
 
     public fun encode(text: String): IntArray
     public fun decode(ids: IntArray): String
+
+    /**
+     * Decode a single token to its surface piece for **streaming** generation.
+     *
+     * Unlike [decode], this must NOT apply any sequence-level leading-space
+     * normalisation: each piece keeps its own leading word-boundary space, so
+     * concatenating a stream of per-token pieces reconstructs spacing (llama.cpp
+     * `token_to_piece` semantics). Decoding tokens one at a time through [decode]
+     * would strip every word's leading space and run the words together
+     * (`"the process"` → `"theprocess"`).
+     *
+     * The default decodes the 1-element array; implementations whose [decode]
+     * strips a leading space (e.g. SentencePiece with `addSpacePrefix`) override
+     * this to skip that strip.
+     */
+    public fun decodeToken(id: Int): String = decode(intArrayOf(id))
 }
diff --git a/skainet-io/skainet-io-core/src/commonTest/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizerCoreTest.kt b/skainet-io/skainet-io-core/src/commonTest/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizerCoreTest.kt
index 2ff50d7d..cef3d8c3 100644
--- a/skainet-io/skainet-io-core/src/commonTest/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizerCoreTest.kt
+++ b/skainet-io/skainet-io-core/src/commonTest/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizerCoreTest.kt
@@ -127,6 +127,31 @@ class SentencePieceTokenizerCoreTest {
         assertTrue(buildToyTokenizer().vocabSize >= 3 + 256 + 10)
     }
 
+    @Test
+    fun `streaming decodeToken keeps each word-boundary space`() {
+        val tok = buildToyTokenizer()
+        val ids = tok.encode("Hello world") // -> [▁Hello, ▁world]
+
+        // Streaming: each per-token piece keeps its own leading space, so a
+        // consumer that appends piece-by-piece reconstructs the spacing.
+        val streamed = ids.joinToString("") { tok.decodeToken(it) }
+        assertEquals(" Hello world", streamed)
+        assertEquals(tok.decode(ids), streamed.trimStart())
+
+        // Regression guard: decoding each token through the sequence-level
+        // decode() strips every leading space and runs the words together.
+        val buggy = ids.joinToString("") { tok.decode(intArrayOf(it)) }
+        assertEquals("Helloworld", buggy)
+    }
+
+    @Test
+    fun `decode with stripLeadingSpace=false keeps the leading space`() {
+        val tok = buildToyTokenizer()
+        val ids = tok.encode("Hello")
+        assertEquals("Hello", tok.decode(ids)) // default strips
+        assertEquals(" Hello", tok.decode(ids, stripLeadingSpace = false))
+    }
+
     @Test
     fun `bos and eos ids are exposed`() {
         val tok = buildToyTokenizer()