Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions kotlin-js-store/wasm/yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# yarn lockfile v1


ws@8.18.3:
version "8.18.3"
resolved "https://registry.yarnpkg.com/ws/-/ws-8.18.3.tgz#b56b88abffde62791c639170400c93dcb0c95472"
integrity sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==
ws@8.20.1:
version "8.20.1"
resolved "https://registry.yarnpkg.com/ws/-/ws-8.20.1.tgz#91a9ae2b312ccf98e0a85ec499b48cef45ab0ddb"
integrity sha512-It4dO0K5v//JtTXuPkfEOaI3uUN87iYPnqo/ZzqCoG3g8uhA66QUMs/SrM0YK7/NAu+r4LMh/9dq2A7k+rHs+w==
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ set(SKAINET_KERNEL_SOURCES
src/skainet_smoke.c
src/q4k_matmul.c
src/q5k_matmul.c
src/q6k_matmul.c
src/fp32_matmul.c
src/bf16_matmul.c
src/q8_0_matmul.c
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,33 @@ SKAINET_API void skainet_q5k_matmul(
int32_t output_offset
);

/*
* Q6_K matrix-vector multiply.
*
* output[output_offset + o] = sum_j input[input_offset + j] *
* dequant(weight[block, o, j])
*
* Block layout: canonical ggml Q6_K, 256 elements per super-block, 210
* bytes per block (128 B `ql` low nibbles + 64 B `qh` high-2-bit plane +
* 16 B int8 `scales` + 2 B `d` FP16). Each 6-bit code is
* `lowNibble | (highBits << 4)`, dequantized as `d * scale * (code - 32)`
* (signed, range [-32, 31]; Q6_K has no per-block min). Packed weights
* laid out as
* weight + weight_byte_offset + (block_idx * output_dim + o) * 210
*
* input_dim must be a multiple of 256.
*/
SKAINET_API void skainet_q6k_matmul(
const float* input,
int32_t input_offset,
const uint8_t* weight,
int32_t weight_byte_offset,
int32_t input_dim,
int32_t output_dim,
float* output,
int32_t output_offset
);

/*
* Row-major FP32 SGEMM: C(m, n) = A(m, k) * B(k, n).
*
Expand Down
146 changes: 146 additions & 0 deletions skainet-backends/skainet-backend-native-cpu/native/src/q6k_matmul.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
#include "skainet_kernels.h"
#include "skainet_simd.h"

#include <stddef.h>
#include <stdint.h>

#define Q6K_BLOCK_SIZE 256
#define Q6K_BYTES_PER_BLOCK 210
#define Q6K_QL_OFFSET 0
#define Q6K_QH_OFFSET 128
#define Q6K_SCALES_OFFSET 192
#define Q6K_D_OFFSET 208

/*
* IEEE 754 binary16 (LE byte order) -> binary32 conversion.
* Byte-for-byte identical to the Q5_K / Q4_K converter (kept scalar to
* preserve bit-exact FP16 parity with the Panama / scalar references).
*/
static inline float skainet_q6k_half_to_float(uint16_t hbits) {
const uint32_t sign = (hbits >> 15) & 0x1u;
const uint32_t exp = (hbits >> 10) & 0x1Fu;
const uint32_t frac = hbits & 0x3FFu;

if (exp == 0u) {
if (frac == 0u) {
union { uint32_t u; float f; } v = { sign << 31 };
return v.f;
}
float f = ((float) frac) / 1024.0f * (1.0f / 16384.0f);
return sign ? -f : f;
}
if (exp == 0x1Fu) {
union { uint32_t u; float f; } v;
v.u = (sign << 31) | 0x7F800000u | (frac ? 0x00400000u : 0u);
return v.f;
}
union { uint32_t u; float f; } v;
v.u = (sign << 31) | ((exp - 15u + 127u) << 23) | (frac << 13);
return v.f;
}

/*
* Dequantize one 256-element Q6_K super-block into scratch[256].
* Direct transcription of ScalarQ6_KMatmulKernel.dequantBlock /
* ggml dequantize_row_q6_K: two 128-element halves, each split into two
* 16-element scale groups carrying four strided sub-codes (q1..q4).
*
* The 6-bit code is `lowNibble(ql) | (twoHighBits(qh) << 4)`, biased by
* -32, and `scales` are SIGNED int8. Per-element value = d * scale * code.
*/
static inline void skainet_q6k_dequant_block(const uint8_t* SKAINET_RESTRICT block,
float* SKAINET_RESTRICT scratch) {
const uint8_t* ql0 = block + Q6K_QL_OFFSET;
const uint8_t* qh0 = block + Q6K_QH_OFFSET;
const int8_t* sc0 = (const int8_t*)(block + Q6K_SCALES_OFFSET);
const uint16_t d_bits = (uint16_t) block[Q6K_D_OFFSET]
| ((uint16_t) block[Q6K_D_OFFSET + 1] << 8);
const float d = skainet_q6k_half_to_float(d_bits);

for (int half = 0; half < 2; ++half) {
const uint8_t* ql = ql0 + half * 64;
const uint8_t* qh = qh0 + half * 32;
const int8_t* sc = sc0 + half * 8;
float* out = scratch + half * 128;
for (int is = 0; is < 2; ++is) {
const float sc1 = d * (float) sc[is + 0];
const float sc2 = d * (float) sc[is + 2];
const float sc3 = d * (float) sc[is + 4];
const float sc4 = d * (float) sc[is + 6];
const int l_start = is * 16;
for (int l = l_start; l < l_start + 16; ++l) {
const int q_l0 = ql[l];
const int q_l32 = ql[l + 32];
const int q_h = qh[l];
const int q1 = ((q_l0 & 0x0F) | ((q_h & 0x03) << 4)) - 32;
const int q2 = ((q_l32 & 0x0F) | (((q_h >> 2) & 0x03) << 4)) - 32;
const int q3 = ((q_l0 >> 4) | (((q_h >> 4) & 0x03) << 4)) - 32;
const int q4 = ((q_l32 >> 4) | (((q_h >> 6) & 0x03) << 4)) - 32;
out[l + 0] = sc1 * (float) q1;
out[l + 32] = sc2 * (float) q2;
out[l + 64] = sc3 * (float) q3;
out[l + 96] = sc4 * (float) q4;
}
}
}
}

/*
* Native Q6_K matrix-vector multiply matching the
* sk.ainet.backend.api.kernel.Q6KMatmulKernel SPI contract. A single
* input row times an `outputDim x inputDim` Q6_K-packed weight tensor
* laid out (blockIdx * outputDim + o) * 210 bytes.
*
* The 6-bit bit-assembly is kept scalar (cheap byte shuffling that the
* compiler auto-vectorizes under -O3) and materialized into a 256-float
* scratch block; the hot dot product against the input window is the
* NEON path (vfmaq_f32 + horizontal add) behind __ARM_NEON. On non-ARM
* targets the dot is a straight-line loop that auto-vectorizes too.
*/
SKAINET_API void skainet_q6k_matmul(
const float* SKAINET_RESTRICT input,
int32_t input_offset,
const uint8_t* SKAINET_RESTRICT weight,
int32_t weight_byte_offset,
int32_t input_dim,
int32_t output_dim,
float* SKAINET_RESTRICT output,
int32_t output_offset
) {
if (output_dim <= 0 || input_dim <= 0) return;

const int32_t blocks_per_input_dim = input_dim / Q6K_BLOCK_SIZE;
const float* in_base = input + input_offset;
float* out_base = output + output_offset;

float scratch[Q6K_BLOCK_SIZE];

for (int32_t o = 0; o < output_dim; ++o) {
float acc = 0.0f;

for (int32_t block_idx = 0; block_idx < blocks_per_input_dim; ++block_idx) {
const uint8_t* block = weight + weight_byte_offset
+ (size_t)(block_idx * output_dim + o) * Q6K_BYTES_PER_BLOCK;

skainet_q6k_dequant_block(block, scratch);

const float* in_block = in_base + (size_t) block_idx * Q6K_BLOCK_SIZE;

#ifdef SKAINET_HAVE_NEON
float32x4_t vacc = vdupq_n_f32(0.0f);
for (int i = 0; i < Q6K_BLOCK_SIZE; i += 4) {
const float32x4_t vi = vld1q_f32(in_block + i);
const float32x4_t vw = vld1q_f32(scratch + i);
vacc = vfmaq_f32(vacc, vi, vw);
}
acc += skainet_neon_hadd_f32(vacc);
#else
for (int i = 0; i < Q6K_BLOCK_SIZE; ++i) {
acc += in_block[i] * scratch[i];
}
#endif
}

out_base[o] = acc;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import sk.ainet.backend.api.kernel.Q4KMatmulKernel
import sk.ainet.backend.api.kernel.Q4KMemSegMatmulKernel
import sk.ainet.backend.api.kernel.Q4_0MatmulKernel
import sk.ainet.backend.api.kernel.Q5KMatmulKernel
import sk.ainet.backend.api.kernel.Q6KMatmulKernel
import sk.ainet.backend.api.kernel.Q8_0MatmulKernel

/**
Expand Down Expand Up @@ -73,7 +74,7 @@ import sk.ainet.backend.api.kernel.Q8_0MatmulKernel
* - PR 2: real Q4_K matmul wired into the heap SPI.
* - PR 3: MemSeg-input zero-copy sibling.
* - PR 5: native FP32 matmul wired into [matmulFp32].
* - Later: native `matmulQ6K`, `matmulQ8_0` (need new SPI accessors).
* - Now: native `matmulQ5K`, `matmulQ6K`, `matmulQ8_0`, `matmulQ4_0` all wired.
*/
public object NativeKernelProvider : KernelProvider, MemSegKernelProvider {
override val name: String = "native-ffm"
Expand Down Expand Up @@ -101,4 +102,7 @@ public object NativeKernelProvider : KernelProvider, MemSegKernelProvider {

override fun matmulQ5K(): Q5KMatmulKernel? =
if (NativeQ5KMatmulKernel.isAvailable()) NativeQ5KMatmulKernel else null

override fun matmulQ6K(): Q6KMatmulKernel? =
if (NativeQ6KMatmulKernel.isAvailable()) NativeQ6KMatmulKernel else null
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
package sk.ainet.exec.kernel

import java.lang.foreign.Arena
import java.lang.foreign.FunctionDescriptor
import java.lang.foreign.Linker
import java.lang.foreign.MemorySegment
import java.lang.foreign.ValueLayout
import java.lang.invoke.MethodHandle
import sk.ainet.backend.api.kernel.Q6KMatmulKernel

/**
* Native (FFM) implementation of [Q6KMatmulKernel].
*
* Wraps the bundled C symbol
*
* void skainet_q6k_matmul(
* const float* input, int32_t input_offset,
* const uint8_t* weight, int32_t weight_byte_offset,
* int32_t input_dim, int32_t output_dim,
* float* output, int32_t output_offset);
*
* Canonical 256-element / 210-byte Q6_K super-block: `ql` low nibbles +
* `qh` high-2-bit plane + 16 int8 `scales` + FP16 `d`; each 6-bit code is
* dequantized as `d * scale * (code - 32)`. Numerical parity vs
* [PanamaVectorQ6_KMatmulKernel] is asserted by [NativeQ6KMatmulKernelParityTest].
*
* Scalar 6-bit bit-assembly (`-O3 -ffast-math`, auto-vectorized) feeding a
* NEON dot product behind `__ARM_NEON`.
*/
internal object NativeQ6KMatmulKernel : Q6KMatmulKernel {

private const val BLOCK_SIZE = 256
private const val BYTES_PER_BLOCK = 210

fun isAvailable(): Boolean = handle != null

override fun matmul(
input: FloatArray, inputOffset: Int,
weight: ByteArray, weightByteOffset: Int,
inputDim: Int, outputDim: Int,
output: FloatArray, outputOffset: Int,
) {
require(inputDim % BLOCK_SIZE == 0) {
"NativeQ6KMatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim"
}
if (outputDim == 0 || inputDim == 0) return
val mh = handle
?: error("NativeQ6KMatmulKernel.matmul invoked while native library unavailable")

Arena.ofConfined().use { arena ->
val inSeg = arena.allocate(
inputDim.toLong() * java.lang.Float.BYTES,
ValueLayout.JAVA_FLOAT.byteAlignment(),
)
val outSeg = arena.allocate(
outputDim.toLong() * java.lang.Float.BYTES,
ValueLayout.JAVA_FLOAT.byteAlignment(),
)
val weightBytesUsed = ((inputDim / BLOCK_SIZE).toLong() * outputDim) * BYTES_PER_BLOCK.toLong()
val weightSeg = arena.allocate(weightBytesUsed, 1L)

MemorySegment.copy(input, inputOffset, inSeg, ValueLayout.JAVA_FLOAT, 0L, inputDim)
MemorySegment.copy(weight, weightByteOffset, weightSeg, ValueLayout.JAVA_BYTE, 0L, weightBytesUsed.toInt())

mh.invoke(
inSeg, 0,
weightSeg, 0,
inputDim, outputDim,
outSeg, 0,
)

MemorySegment.copy(outSeg, ValueLayout.JAVA_FLOAT, 0L, output, outputOffset, outputDim)
}
}

private val handle: MethodHandle? by lazy {
val lookup = NativeLibraryLoader.lookup() ?: return@lazy null
val symbol = lookup.find("skainet_q6k_matmul").orElse(null) ?: return@lazy null
val descriptor = FunctionDescriptor.ofVoid(
ValueLayout.ADDRESS, // input
ValueLayout.JAVA_INT, // input_offset
ValueLayout.ADDRESS, // weight
ValueLayout.JAVA_INT, // weight_byte_offset
ValueLayout.JAVA_INT, // input_dim
ValueLayout.JAVA_INT, // output_dim
ValueLayout.ADDRESS, // output
ValueLayout.JAVA_INT, // output_offset
)
runCatching { Linker.nativeLinker().downcallHandle(symbol, descriptor) }.getOrNull()
}
}
Loading
Loading