From a3e0b5bba4fefa10f4a97a407a2fe9c64ebae037 Mon Sep 17 00:00:00 2001 From: Michal Harakal Date: Fri, 26 Jun 2026 14:21:30 +0200 Subject: [PATCH] =?UTF-8?q?release:=200.32.4=20=E2=80=94=20streaming=20det?= =?UTF-8?q?okenization=20preserves=20word=20spaces?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tokenizer.decodeToken(id): per-token streaming decode that keeps each SentencePiece piece's leading word-boundary space (llama.cpp token_to_piece semantics), so a generation loop decoding one token at a time no longer runs words together ("the process" -> "theprocess"). SentencePieceTokenizer overrides it to skip the sequence-level addSpacePrefix strip; adds decode(ids, stripLeadingSpace). Backward-compatible (decode(IntArray) unchanged). Version bump + CHANGELOG/README/docs version snippets -> 0.32.4. antora.yml is version: ~ (branch-tracked). skainet-io-core is not API-tracked, no dump change. Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 13 ++++++++++ README.md | 9 ++++++- .../modules/ROOT/pages/how-to/io-readers.adoc | 4 +-- .../pages/how-to/java-model-training.adoc | 2 +- .../ROOT/pages/how-to/minerva-export.adoc | 2 +- .../tutorials/image-data-getting-started.adoc | 2 +- .../pages/tutorials/java-getting-started.adoc | 4 +-- gradle.properties | 2 +- .../io/tokenizer/SentencePieceTokenizer.kt | 24 +++++++++++++----- .../kotlin/sk/ainet/io/tokenizer/Tokenizer.kt | 16 ++++++++++++ .../SentencePieceTokenizerCoreTest.kt | 25 +++++++++++++++++++ 11 files changed, 88 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e5100a0..cf033410 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,19 @@ ## [Unreleased] +## [0.32.4] - 2026-06-26 + +### Fixed + +- **Streaming detokenization preserves word-boundary spaces (`Tokenizer.decodeToken`).** A generation + loop that decodes one token at a time (`decode(tokenId)`) ran words together (`"the process"` → + `"theprocess"`): the 
single-token path delegated to the sequence-level + `SentencePieceTokenizer.decode(IntArray)`, whose `addSpacePrefix` leading-space strip is only + correct once per sequence. Adds `Tokenizer.decodeToken(id)` (default = `decode(intArrayOf(id))`) and + a `SentencePieceTokenizer` override that decodes a single token without the leading strip (llama.cpp + `token_to_piece` semantics), plus a `decode(ids, stripLeadingSpace)` overload. Streaming consumers + that call `decodeToken` now reconstruct spacing correctly. + ## [0.32.3] - 2026-06-25 ### Added diff --git a/README.md b/README.md index 5755da3e..59beabfe 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ Add the core dependencies (Gradle Kotlin DSL): ```kotlin dependencies { // Recommended: import the umbrella BOM and drop versions on the engine modules. - implementation(platform("sk.ainet:skainet-bom:0.32.3")) + implementation(platform("sk.ainet:skainet-bom:0.32.4")) implementation("sk.ainet.core:skainet-lang-core") implementation("sk.ainet.core:skainet-backend-cpu") @@ -241,6 +241,13 @@ Runnable examples: --- +## What's New in 0.32.4 + +- **Streaming detokenization keeps word spaces (`Tokenizer.decodeToken`).** Decoding generated tokens + one at a time no longer runs words together (`"the process"` → `"theprocess"`). The new + `decodeToken(id)` keeps each SentencePiece piece's leading space (llama.cpp `token_to_piece` + semantics); `decode(IntArray)` still strips the single sequence-leading space as before.
+ ## What's New in 0.32.3 - **Graph-output pruning for export (`ComputeGraph.prunedToOutputs`).** Trims a traced decoder's diff --git a/docs/modules/ROOT/pages/how-to/io-readers.adoc b/docs/modules/ROOT/pages/how-to/io-readers.adoc index 5093883f..62d6d4a5 100644 --- a/docs/modules/ROOT/pages/how-to/io-readers.adoc +++ b/docs/modules/ROOT/pages/how-to/io-readers.adoc @@ -20,7 +20,7 @@ Add the following dependencies to your `build.gradle.kts`: [source,kotlin] ---- dependencies { - implementation(platform("sk.ainet:skainet-bom:0.32.3")) + implementation(platform("sk.ainet:skainet-bom:0.32.4")) implementation("sk.ainet.core:skainet-io-gguf") implementation("org.jetbrains.kotlinx:kotlinx-io-core:0.8.2") @@ -32,7 +32,7 @@ dependencies { [source,kotlin] ---- dependencies { - implementation(platform("sk.ainet:skainet-bom:0.32.3")) + implementation(platform("sk.ainet:skainet-bom:0.32.4")) implementation("sk.ainet.core:skainet-io-onnx") implementation("org.jetbrains.kotlinx:kotlinx-io-core:0.8.2") diff --git a/docs/modules/ROOT/pages/how-to/java-model-training.adoc b/docs/modules/ROOT/pages/how-to/java-model-training.adoc index 4b0407b1..309827f5 100644 --- a/docs/modules/ROOT/pages/how-to/java-model-training.adoc +++ b/docs/modules/ROOT/pages/how-to/java-model-training.adoc @@ -23,7 +23,7 @@ This guide covers building neural networks, defining loss functions and optimize sk.ainet skainet-bom - 0.32.3 + 0.32.4 pom import diff --git a/docs/modules/ROOT/pages/how-to/minerva-export.adoc b/docs/modules/ROOT/pages/how-to/minerva-export.adoc index ab99753b..a25e452f 100644 --- a/docs/modules/ROOT/pages/how-to/minerva-export.adoc +++ b/docs/modules/ROOT/pages/how-to/minerva-export.adoc @@ -38,7 +38,7 @@ For a published application, use the SKaiNET BOM and the Minerva artifact: [source,kotlin] ---- dependencies { - implementation(platform("sk.ainet:skainet-bom:0.32.3")) + implementation(platform("sk.ainet:skainet-bom:0.32.4")) implementation("sk.ainet.core:skainet-compile-minerva") 
} ---- diff --git a/docs/modules/ROOT/pages/tutorials/image-data-getting-started.adoc b/docs/modules/ROOT/pages/tutorials/image-data-getting-started.adoc index 15fe5f4b..ecfab5ca 100644 --- a/docs/modules/ROOT/pages/tutorials/image-data-getting-started.adoc +++ b/docs/modules/ROOT/pages/tutorials/image-data-getting-started.adoc @@ -32,7 +32,7 @@ For a JVM project, add the image/data modules alongside the CPU backend: [source,kotlin] ---- dependencies { - implementation(platform("sk.ainet:skainet-bom:0.32.3")) + implementation(platform("sk.ainet:skainet-bom:0.32.4")) implementation("sk.ainet:skainet-backend-cpu-jvm") implementation("sk.ainet:skainet-io-image-jvm") diff --git a/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc b/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc index 90ed31ab..dd161769 100644 --- a/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc +++ b/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc @@ -46,7 +46,7 @@ The `skainet-bom` manages all SKaiNET module versions so you never have to keep ---- - 0.32.3 + 0.32.4 @@ -144,7 +144,7 @@ repositories { dependencies { // Import BOM for version alignment - implementation(platform("sk.ainet:skainet-bom:0.32.3")) + implementation(platform("sk.ainet:skainet-bom:0.32.4")) // Core tensor library implementation("sk.ainet:skainet-lang-core-jvm") diff --git a/gradle.properties b/gradle.properties index f0673801..ce9dd46e 100644 --- a/gradle.properties +++ b/gradle.properties @@ -1,5 +1,5 @@ GROUP=sk.ainet.core -VERSION_NAME=0.32.3 +VERSION_NAME=0.32.4 POM_DESCRIPTION=SKaiNET POM_URL=https://github.com/SKaiNET-developers/skainet/ diff --git a/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizer.kt b/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizer.kt index 16d711cb..174d18dc 100644 --- a/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizer.kt 
+++ b/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizer.kt @@ -86,7 +86,22 @@ public class SentencePieceTokenizer( return IntArray(out.size) { out[it] } } - override fun decode(ids: IntArray): String { + override fun decode(ids: IntArray): String = decode(ids, stripLeadingSpace = addSpacePrefix) + + /** + * Decode a single token to its surface piece without stripping the leading + * word-boundary space — see [Tokenizer.decodeToken]. Required for correct + * spacing when a caller decodes generated tokens one at a time. + */ + override fun decodeToken(id: Int): String = decode(intArrayOf(id), stripLeadingSpace = false) + + /** + * Decode [ids] to text. When [stripLeadingSpace] is true a single leading + * space (the artefact of SentencePiece's `addSpacePrefix` at encode time) is + * removed — correct for a whole sequence, but NOT for an individual streaming + * token, which must keep its leading space. + */ + public fun decode(ids: IntArray, stripLeadingSpace: Boolean): String { val sb = StringBuilder() val byteBuf = ArrayList() for (id in ids) { @@ -104,11 +119,8 @@ public class SentencePieceTokenizer( } if (byteBuf.isNotEmpty()) sb.append(flushBytes(byteBuf)) - var result = sb.toString().replace(WHITESPACE_ESCAPE, ' ') - if (addSpacePrefix && result.startsWith(' ')) { - result = result.substring(1) - } - return result + val result = sb.toString().replace(WHITESPACE_ESCAPE, ' ') + return if (stripLeadingSpace && result.startsWith(' ')) result.substring(1) else result } // ------------------------------------------------------------------ diff --git a/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/Tokenizer.kt b/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/Tokenizer.kt index 2100d8d1..9c772d85 100644 --- a/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/Tokenizer.kt +++ 
b/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/Tokenizer.kt @@ -15,4 +15,20 @@ public interface Tokenizer { public fun encode(text: String): IntArray public fun decode(ids: IntArray): String + + /** + * Decode a single token to its surface piece for **streaming** generation. + * + * Unlike [decode], this must NOT apply any sequence-level leading-space + * normalisation: each piece keeps its own leading word-boundary space, so + * concatenating a stream of per-token pieces reconstructs spacing (llama.cpp + * `token_to_piece` semantics). Decoding tokens one at a time through [decode] + * would strip every word's leading space and run the words together + * (`"the process"` → `"theprocess"`). + * + * The default decodes the 1-element array; implementations whose [decode] + * strips a leading space (e.g. SentencePiece with `addSpacePrefix`) override + * this to skip that strip. + */ + public fun decodeToken(id: Int): String = decode(intArrayOf(id)) } diff --git a/skainet-io/skainet-io-core/src/commonTest/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizerCoreTest.kt b/skainet-io/skainet-io-core/src/commonTest/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizerCoreTest.kt index 2ff50d7d..cef3d8c3 100644 --- a/skainet-io/skainet-io-core/src/commonTest/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizerCoreTest.kt +++ b/skainet-io/skainet-io-core/src/commonTest/kotlin/sk/ainet/io/tokenizer/SentencePieceTokenizerCoreTest.kt @@ -127,6 +127,31 @@ class SentencePieceTokenizerCoreTest { assertTrue(buildToyTokenizer().vocabSize >= 3 + 256 + 10) } + @Test + fun `streaming decodeToken keeps each word-boundary space`() { + val tok = buildToyTokenizer() + val ids = tok.encode("Hello world") // -> [▁Hello, ▁world] + + // Streaming: each per-token piece keeps its own leading space, so a + // consumer that appends piece-by-piece reconstructs the spacing. 
+ val streamed = ids.joinToString("") { tok.decodeToken(it) } + assertEquals(" Hello world", streamed) + assertEquals(tok.decode(ids), streamed.trimStart()) + + // Regression guard: decoding each token through the sequence-level + // decode() strips every leading space and runs the words together. + val buggy = ids.joinToString("") { tok.decode(intArrayOf(it)) } + assertEquals("Helloworld", buggy) + } + + @Test + fun `decode with stripLeadingSpace=false keeps the leading space`() { + val tok = buildToyTokenizer() + val ids = tok.encode("Hello") + assertEquals("Hello", tok.decode(ids)) // default strips + assertEquals(" Hello", tok.decode(ids, stripLeadingSpace = false)) + } + @Test fun `bos and eos ids are exposed`() { val tok = buildToyTokenizer()