SKaiNET-developers · michalharakal · Jun 26, 2026 · Jun 26, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,19 @@
 
 ## [Unreleased]
 
+## [0.32.4] - 2026-06-26
+
+### Fixed
+
+- **Streaming detokenization preserves word-boundary spaces (`Tokenizer.decodeToken`).** A generation
+  loop that decodes one token at a time (`decode(intArrayOf(id))`) ran words together (`"the process"`
+  came out as `"theprocess"`): the single-token path went through the sequence-level
+  `SentencePieceTokenizer.decode(IntArray)`, whose `addSpacePrefix` leading-space strip is only
+  correct once per sequence. Adds `Tokenizer.decodeToken(id)` (default = `decode(intArrayOf(id))`) and
+  a `SentencePieceTokenizer` override that decodes a single token without the leading strip (llama.cpp
+  `token_to_piece` semantics), plus a `decode(ids, stripLeadingSpace)` overload. Streaming consumers
+  get correct spacing by calling `decodeToken`; per-token `decode(intArrayOf(id))` still strips.
+
 ## [0.32.3] - 2026-06-25
 
 ### Added

diff --git a/README.md b/README.md
@@ -36,7 +36,7 @@ Add the core dependencies (Gradle Kotlin DSL):
 ```kotlin
 dependencies {
     // Recommended: import the umbrella BOM and drop versions on the engine modules.
-    implementation(platform("sk.ainet:skainet-bom:0.32.3"))
+    implementation(platform("sk.ainet:skainet-bom:0.32.4"))
 
     implementation("sk.ainet.core:skainet-lang-core")
     implementation("sk.ainet.core:skainet-backend-cpu")
@@ -241,6 +241,13 @@ Runnable examples:
 
 ---
 
+## What's New in 0.32.4
+
+- **Streaming detokenization keeps word spaces (`Tokenizer.decodeToken`).** Decoding generated tokens
+  one at a time via the new `decodeToken(id)` no longer runs words together (`"the process"` used to
+  come out as `"theprocess"`): it keeps each SentencePiece piece's leading space (llama.cpp
+  `token_to_piece` semantics). `decode(IntArray)` still strips the single sequence-leading space.
+
 ## What's New in 0.32.3
 
 - **Graph-output pruning for export (`ComputeGraph.prunedToOutputs`).** Trims a traced decoder's

diff --git a/docs/modules/ROOT/pages/how-to/io-readers.adoc b/docs/modules/ROOT/pages/how-to/io-readers.adoc
@@ -20,7 +20,7 @@ Add the following dependencies to your `build.gradle.kts`:
 [source,kotlin]
 ----
 dependencies {
-    implementation(platform("sk.ainet:skainet-bom:0.32.3"))
+    implementation(platform("sk.ainet:skainet-bom:0.32.4"))
 
     implementation("sk.ainet.core:skainet-io-gguf")
     implementation("org.jetbrains.kotlinx:kotlinx-io-core:0.8.2")
@@ -32,7 +32,7 @@ dependencies {
 [source,kotlin]
 ----
 dependencies {
-    implementation(platform("sk.ainet:skainet-bom:0.32.3"))
+    implementation(platform("sk.ainet:skainet-bom:0.32.4"))
 
     implementation("sk.ainet.core:skainet-io-onnx")
     implementation("org.jetbrains.kotlinx:kotlinx-io-core:0.8.2")

diff --git a/docs/modules/ROOT/pages/how-to/java-model-training.adoc b/docs/modules/ROOT/pages/how-to/java-model-training.adoc
@@ -23,7 +23,7 @@ This guide covers building neural networks, defining loss functions and optimize
         <dependency>
             <groupId>sk.ainet</groupId>
             <artifactId>skainet-bom</artifactId>
-            <version>0.32.3</version>
+            <version>0.32.4</version>
             <type>pom</type>
             <scope>import</scope>
         </dependency>

diff --git a/docs/modules/ROOT/pages/how-to/minerva-export.adoc b/docs/modules/ROOT/pages/how-to/minerva-export.adoc
@@ -38,7 +38,7 @@ For a published application, use the SKaiNET BOM and the Minerva artifact:
 [source,kotlin]
 ----
 dependencies {
-    implementation(platform("sk.ainet:skainet-bom:0.32.3"))
+    implementation(platform("sk.ainet:skainet-bom:0.32.4"))
     implementation("sk.ainet.core:skainet-compile-minerva")
 }
 ----

diff --git a/docs/modules/ROOT/pages/tutorials/image-data-getting-started.adoc b/docs/modules/ROOT/pages/tutorials/image-data-getting-started.adoc
@@ -32,7 +32,7 @@ For a JVM project, add the image/data modules alongside the CPU backend:
 [source,kotlin]
 ----
 dependencies {
-    implementation(platform("sk.ainet:skainet-bom:0.32.3"))
+    implementation(platform("sk.ainet:skainet-bom:0.32.4"))
 
     implementation("sk.ainet:skainet-backend-cpu-jvm")
     implementation("sk.ainet:skainet-io-image-jvm")

diff --git a/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc b/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc
@@ -46,7 +46,7 @@ The `skainet-bom` manages all SKaiNET module versions so you never have to keep
 ----
 <project>
     <properties>
-        <skainet.version>0.32.3</skainet.version>
+        <skainet.version>0.32.4</skainet.version>
     </properties>
 
     <dependencyManagement>
@@ -144,7 +144,7 @@ repositories {
 
 dependencies {
     // Import BOM for version alignment
-    implementation(platform("sk.ainet:skainet-bom:0.32.3"))
+    implementation(platform("sk.ainet:skainet-bom:0.32.4"))
 
     // Core tensor library
     implementation("sk.ainet:skainet-lang-core-jvm")

diff --git a/gradle.properties b/gradle.properties
@@ -1,5 +1,5 @@
 GROUP=sk.ainet.core
-VERSION_NAME=0.32.3
+VERSION_NAME=0.32.4
 POM_DESCRIPTION=SKaiNET
 
 POM_URL=https://github.com/SKaiNET-developers/skainet/

diff --git a/...-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizer.kt b/...-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizer.kt
@@ -86,7 +86,22 @@ public class SentencePieceTokenizer(
         return IntArray(out.size) { out[it] }
     }
 
-    override fun decode(ids: IntArray): String {
+    override fun decode(ids: IntArray): String = decode(ids, stripLeadingSpace = addSpacePrefix) // strip iff addSpacePrefix
+
+    /**
+     * Streaming decode of one token: regular decode of a 1-element array with the leading-space
+     * strip disabled, so each piece keeps its word-boundary space — see [Tokenizer.decodeToken].
+     * NOTE(review): bytes are flushed per call; confirm UTF-8 split across byte tokens survives.
+     */
+    override fun decodeToken(id: Int): String = decode(intArrayOf(id), stripLeadingSpace = false)
+
+    /**
+     * Decode [ids] to text. When [stripLeadingSpace] is true a single leading
+     * space (the artefact of SentencePiece's `addSpacePrefix` at encode time) is
+     * removed — correct for a whole sequence, but NOT for an individual streaming
+     * token, which must keep its leading space.
+     */
+    public fun decode(ids: IntArray, stripLeadingSpace: Boolean): String {
         val sb = StringBuilder()
         val byteBuf = ArrayList<Byte>()
         for (id in ids) {
@@ -104,11 +119,8 @@ public class SentencePieceTokenizer(
         }
         if (byteBuf.isNotEmpty()) sb.append(flushBytes(byteBuf))
 
-        var result = sb.toString().replace(WHITESPACE_ESCAPE, ' ')
-        if (addSpacePrefix && result.startsWith(' ')) {
-            result = result.substring(1)
-        }
-        return result
+        val result = sb.toString().replace(WHITESPACE_ESCAPE, ' ')
+        return if (stripLeadingSpace && result.startsWith(' ')) result.substring(1) else result
     }
 
     // ------------------------------------------------------------------

diff --git a/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/Tokenizer.kt b/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/Tokenizer.kt
@@ -15,4 +15,20 @@ public interface Tokenizer {
 
     public fun encode(text: String): IntArray
     public fun decode(ids: IntArray): String
+
+    /**
+     * Decode a single token to its surface piece for **streaming** generation.
+     *
+     * Unlike [decode], this must NOT apply any sequence-level leading-space
+     * normalisation: each piece keeps its own leading word-boundary space, so
+     * concatenating a stream of per-token pieces reconstructs spacing (llama.cpp
+     * `token_to_piece` semantics). With an implementation whose [decode] strips a
+     * leading space, decoding tokens one at a time through [decode] loses every
+     * word's leading space and runs the words together (`"the process"` → `"theprocess"`).
+     *
+     * The default decodes the 1-element array, which is only correct when [decode]
+     * performs no such strip; implementations that do strip (e.g. SentencePiece
+     * with `addSpacePrefix`) must override this to skip it.
+     */
+    public fun decodeToken(id: Int): String = decode(intArrayOf(id))
 }
diff --git a/...net-io-core/src/commonTest/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizerCoreTest.kt b/...net-io-core/src/commonTest/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizerCoreTest.kt
@@ -127,6 +127,31 @@ class SentencePieceTokenizerCoreTest {
         assertTrue(buildToyTokenizer().vocabSize >= 3 + 256 + 10)
     }
 
+    @Test
+    fun `streaming decodeToken keeps each word-boundary space`() {
+        val tok = buildToyTokenizer()
+        val ids = tok.encode("Hello world") // -> [▁Hello, ▁world]
+
+        // Streaming path: every piece keeps its leading space (the first one too), so
+        // plain concatenation equals the sequence-level decode plus one leading space.
+        val streamed = ids.joinToString("") { tok.decodeToken(it) }
+        assertEquals(" Hello world", streamed)
+        assertEquals(tok.decode(ids), streamed.trimStart())
+
+        // Contrast case, pins why decodeToken exists: pushing each token through the
+        // sequence-level decode() strips every leading space and runs the words together.
+        val buggy = ids.joinToString("") { tok.decode(intArrayOf(it)) }
+        assertEquals("Helloworld", buggy)
+    }
+
+    @Test
+    fun `decode with stripLeadingSpace=false keeps the leading space`() {
+        val tok = buildToyTokenizer()
+        val ids = tok.encode("Hello")
+        assertEquals("Hello", tok.decode(ids)) // one-arg decode strips the sequence-leading space
+        assertEquals(" Hello", tok.decode(ids, stripLeadingSpace = false)) // explicit opt-out keeps it
+    }
+
     @Test
     fun `bos and eos ids are exposed`() {
         val tok = buildToyTokenizer()