diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0e5100a0..cf033410 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,19 @@
## [Unreleased]
+## [0.32.4] - 2026-06-26
+
+### Fixed
+
+- **Streaming detokenization preserves word-boundary spaces (`Tokenizer.decodeToken`).** A generation
+ loop that decodes one token at a time (`decode(tokenId)`) ran words together (`"the process"` →
+ `"theprocess"`): the single-token path delegated to the sequence-level
+ `SentencePieceTokenizer.decode(IntArray)`, whose `addSpacePrefix` leading-space strip is only
+ correct once per sequence. Adds `Tokenizer.decodeToken(id)` (default = `decode(intArrayOf(id))`) and
+ a `SentencePieceTokenizer` override that decodes a single token without the leading strip (llama.cpp
+ `token_to_piece` semantics), plus a `decode(ids, stripLeadingSpace)` overload. Streaming consumers
+ must call `decodeToken(id)` to get correct spacing; per-token `decode(intArrayOf(id))` still strips.
+
## [0.32.3] - 2026-06-25
### Added
diff --git a/README.md b/README.md
index 5755da3e..59beabfe 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ Add the core dependencies (Gradle Kotlin DSL):
```kotlin
dependencies {
// Recommended: import the umbrella BOM and drop versions on the engine modules.
- implementation(platform("sk.ainet:skainet-bom:0.32.3"))
+ implementation(platform("sk.ainet:skainet-bom:0.32.4"))
implementation("sk.ainet.core:skainet-lang-core")
implementation("sk.ainet.core:skainet-backend-cpu")
@@ -241,6 +241,13 @@ Runnable examples:
---
+## What's New in 0.32.4
+
+- **Streaming detokenization keeps word spaces (`Tokenizer.decodeToken`).** Decoding generated tokens
+ one at a time with `decode(intArrayOf(id))` ran words together (`"the process"` → `"theprocess"`).
+ Use the new `decodeToken(id)` instead: it keeps each SentencePiece piece's leading space (llama.cpp
+ `token_to_piece` semantics); `decode(IntArray)` still strips the single sequence-leading space as before.
+
## What's New in 0.32.3
- **Graph-output pruning for export (`ComputeGraph.prunedToOutputs`).** Trims a traced decoder's
diff --git a/docs/modules/ROOT/pages/how-to/io-readers.adoc b/docs/modules/ROOT/pages/how-to/io-readers.adoc
index 5093883f..62d6d4a5 100644
--- a/docs/modules/ROOT/pages/how-to/io-readers.adoc
+++ b/docs/modules/ROOT/pages/how-to/io-readers.adoc
@@ -20,7 +20,7 @@ Add the following dependencies to your `build.gradle.kts`:
[source,kotlin]
----
dependencies {
- implementation(platform("sk.ainet:skainet-bom:0.32.3"))
+ implementation(platform("sk.ainet:skainet-bom:0.32.4"))
implementation("sk.ainet.core:skainet-io-gguf")
implementation("org.jetbrains.kotlinx:kotlinx-io-core:0.8.2")
@@ -32,7 +32,7 @@ dependencies {
[source,kotlin]
----
dependencies {
- implementation(platform("sk.ainet:skainet-bom:0.32.3"))
+ implementation(platform("sk.ainet:skainet-bom:0.32.4"))
implementation("sk.ainet.core:skainet-io-onnx")
implementation("org.jetbrains.kotlinx:kotlinx-io-core:0.8.2")
diff --git a/docs/modules/ROOT/pages/how-to/java-model-training.adoc b/docs/modules/ROOT/pages/how-to/java-model-training.adoc
index 4b0407b1..309827f5 100644
--- a/docs/modules/ROOT/pages/how-to/java-model-training.adoc
+++ b/docs/modules/ROOT/pages/how-to/java-model-training.adoc
@@ -23,7 +23,7 @@ This guide covers building neural networks, defining loss functions and optimize
sk.ainet
skainet-bom
- 0.32.3
+ 0.32.4
pom
import
diff --git a/docs/modules/ROOT/pages/how-to/minerva-export.adoc b/docs/modules/ROOT/pages/how-to/minerva-export.adoc
index ab99753b..a25e452f 100644
--- a/docs/modules/ROOT/pages/how-to/minerva-export.adoc
+++ b/docs/modules/ROOT/pages/how-to/minerva-export.adoc
@@ -38,7 +38,7 @@ For a published application, use the SKaiNET BOM and the Minerva artifact:
[source,kotlin]
----
dependencies {
- implementation(platform("sk.ainet:skainet-bom:0.32.3"))
+ implementation(platform("sk.ainet:skainet-bom:0.32.4"))
implementation("sk.ainet.core:skainet-compile-minerva")
}
----
diff --git a/docs/modules/ROOT/pages/tutorials/image-data-getting-started.adoc b/docs/modules/ROOT/pages/tutorials/image-data-getting-started.adoc
index 15fe5f4b..ecfab5ca 100644
--- a/docs/modules/ROOT/pages/tutorials/image-data-getting-started.adoc
+++ b/docs/modules/ROOT/pages/tutorials/image-data-getting-started.adoc
@@ -32,7 +32,7 @@ For a JVM project, add the image/data modules alongside the CPU backend:
[source,kotlin]
----
dependencies {
- implementation(platform("sk.ainet:skainet-bom:0.32.3"))
+ implementation(platform("sk.ainet:skainet-bom:0.32.4"))
implementation("sk.ainet:skainet-backend-cpu-jvm")
implementation("sk.ainet:skainet-io-image-jvm")
diff --git a/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc b/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc
index 90ed31ab..dd161769 100644
--- a/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc
+++ b/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc
@@ -46,7 +46,7 @@ The `skainet-bom` manages all SKaiNET module versions so you never have to keep
----
- 0.32.3
+ 0.32.4
@@ -144,7 +144,7 @@ repositories {
dependencies {
// Import BOM for version alignment
- implementation(platform("sk.ainet:skainet-bom:0.32.3"))
+ implementation(platform("sk.ainet:skainet-bom:0.32.4"))
// Core tensor library
implementation("sk.ainet:skainet-lang-core-jvm")
diff --git a/gradle.properties b/gradle.properties
index f0673801..ce9dd46e 100644
--- a/gradle.properties
+++ b/gradle.properties
@@ -1,5 +1,5 @@
GROUP=sk.ainet.core
-VERSION_NAME=0.32.3
+VERSION_NAME=0.32.4
POM_DESCRIPTION=SKaiNET
POM_URL=https://github.com/SKaiNET-developers/skainet/
diff --git a/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizer.kt b/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizer.kt
index 16d711cb..174d18dc 100644
--- a/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizer.kt
+++ b/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizer.kt
@@ -86,7 +86,22 @@ public class SentencePieceTokenizer(
return IntArray(out.size) { out[it] }
}
- override fun decode(ids: IntArray): String {
+ override fun decode(ids: IntArray): String = decode(ids, stripLeadingSpace = addSpacePrefix)
+
+ /**
+ * Decode a single token to its surface piece without stripping the leading
+ * word-boundary space — see [Tokenizer.decodeToken]. Required for correct
+ * spacing when a caller decodes generated tokens one at a time.
+ *
+ * Delegates to the two-argument `decode` with `stripLeadingSpace = false`, so the
+ * result is the one-element decode minus the leading-space strip.
+ *
+ * NOTE(review): that `decode` appears to buffer byte pieces and flush them with
+ * `flushBytes`; a character spread over several byte-fallback tokens would presumably
+ * be flushed one token at a time here — confirm `flushBytes` copes with a partial
+ * UTF-8 sequence.
+ *
+ * @param id token id to decode.
+ * @return the token's text, with any leading space kept.
+ */
+ override fun decodeToken(id: Int): String = decode(intArrayOf(id), stripLeadingSpace = false)
+
+ /**
+ * Decode [ids] to text. When [stripLeadingSpace] is true a single leading
+ * space (the artefact of SentencePiece's `addSpacePrefix` at encode time) is
+ * removed — correct for a whole sequence, but NOT for an individual streaming
+ * token, which must keep its leading space.
+ */
+ public fun decode(ids: IntArray, stripLeadingSpace: Boolean): String {
val sb = StringBuilder()
val byteBuf = ArrayList()
for (id in ids) {
@@ -104,11 +119,8 @@ public class SentencePieceTokenizer(
}
if (byteBuf.isNotEmpty()) sb.append(flushBytes(byteBuf))
- var result = sb.toString().replace(WHITESPACE_ESCAPE, ' ')
- if (addSpacePrefix && result.startsWith(' ')) {
- result = result.substring(1)
- }
- return result
+ val result = sb.toString().replace(WHITESPACE_ESCAPE, ' ')
+ return if (stripLeadingSpace && result.startsWith(' ')) result.substring(1) else result
}
// ------------------------------------------------------------------
diff --git a/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/Tokenizer.kt b/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/Tokenizer.kt
index 2100d8d1..9c772d85 100644
--- a/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/Tokenizer.kt
+++ b/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/Tokenizer.kt
@@ -15,4 +15,20 @@ public interface Tokenizer {
public fun encode(text: String): IntArray
public fun decode(ids: IntArray): String
+
+ /**
+ * Decode a single token to its surface piece for **streaming** generation.
+ *
+ * Unlike [decode], this must NOT apply any sequence-level leading-space
+ * normalisation: each piece keeps its own leading word-boundary space, so
+ * concatenating a stream of per-token pieces reconstructs spacing (llama.cpp
+ * `token_to_piece` semantics). Decoding tokens one at a time through [decode]
+ * would strip every word's leading space and run the words together
+ * (`"the process"` → `"theprocess"`).
+ *
+ * The default decodes the 1-element array; implementations whose [decode]
+ * strips a leading space (e.g. SentencePiece with `addSpacePrefix`) override
+ * this to skip that strip.
+ *
+ * @param id the token id to decode.
+ * @return the text for that one token; with the default implementation this is
+ * exactly `decode(intArrayOf(id))`.
+ */
+ public fun decodeToken(id: Int): String = decode(intArrayOf(id))
}
diff --git a/skainet-io/skainet-io-core/src/commonTest/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizerCoreTest.kt b/skainet-io/skainet-io-core/src/commonTest/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizerCoreTest.kt
index 2ff50d7d..cef3d8c3 100644
--- a/skainet-io/skainet-io-core/src/commonTest/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizerCoreTest.kt
+++ b/skainet-io/skainet-io-core/src/commonTest/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizerCoreTest.kt
@@ -127,6 +127,31 @@ class SentencePieceTokenizerCoreTest {
assertTrue(buildToyTokenizer().vocabSize >= 3 + 256 + 10)
}
+ @Test
+ fun `streaming decodeToken keeps each word-boundary space`() {
+ val tok = buildToyTokenizer()
+ val ids = tok.encode("Hello world") // expected pieces: [▁Hello, ▁world] (depends on the toy vocab)
+
+ // Streaming: each per-token piece keeps its own leading space, so a
+ // consumer that appends piece-by-piece reconstructs the spacing.
+ val streamed = ids.joinToString("") { tok.decodeToken(it) }
+ assertEquals(" Hello world", streamed)
+ assertEquals(tok.decode(ids), streamed.trimStart())
+
+ // Pins the pitfall decodeToken avoids: sequence-level decode() applied per
+ // token strips each piece's leading space and runs the words together.
+ val buggy = ids.joinToString("") { tok.decode(intArrayOf(it)) }
+ assertEquals("Helloworld", buggy)
+ }
+
+ @Test
+ fun `decode with stripLeadingSpace=false keeps the leading space`() {
+ val tok = buildToyTokenizer()
+ val ids = tok.encode("Hello")
+ assertEquals("Hello", tok.decode(ids)) // one-arg decode strips (implies the toy tokenizer's addSpacePrefix is on)
+ assertEquals(" Hello", tok.decode(ids, stripLeadingSpace = false))
+ }
+
@Test
fun `bos and eos ids are exposed`() {
val tok = buildToyTokenizer()