Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 28 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -224,8 +224,35 @@ Runnable examples:

- Built-in loaders: MNIST, Fashion-MNIST, CIFAR-10
- URI-backed data sources: `file://`, `https://`, `hf+https://`, and `hf://...`
- Dataset operations: deterministic shuffle/split, stratified split, filter/map/transform views, batch flows, and epoch flows
- Raw dataset parsers: CSV, TSV, JSON arrays/objects, JSON Lines (`.jsonl`, `.ndjson`)
- Type-safe transform DSLs: image/tensor transforms plus suspendable raw data pipelines
- Formats: GGUF, ONNX, SafeTensors, JSON, Image (JPEG, PNG)
- Type-safe transform DSL: resize, crop, normalize, toTensor

```kotlin
val raw = JvmDataSourceResolver().rawDataset {
from("hf://datasets/org/repo@main/train.jsonl")
format(DataFormat.JSON_LINES)
cachePolicy(CachePolicy.Use)
}

val withoutLabel = dataPipeline<RawDataset>()
.stage(
dataTransformer(
name = "drop-label",
outputSchema = { schema -> DataSchema(schema.columns - "label") }
) { dataset ->
val columns = dataset.schema.columns - "label"
dataset.copy(
schema = DataSchema(columns),
rows = dataset.rows.map { row ->
RawDataRow(row.values.filterKeys { key -> key in columns })
}
)
}
)
.execute(raw)
```


### Edge AI: Arduino / C99 Export
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -390,37 +390,39 @@ public open class DefaultCpuOpsBase(protected val dataFactory: TensorDataFactory
require(a.dtype == b.dtype) { "DType mismatch: ${a.dtype} vs ${b.dtype}" }

// Packed-quant fast path (FP32 input × packed weight), resolved via KernelRegistry.
chooseQuantizedMatmulHeap(a, b)?.let { return it }
KernelProfile.timeQuant { chooseQuantizedMatmulHeap(a, b) }?.let { return it }

// Fast path: 2D × 2D with FloatArray backing — direct buffer access, no per-element allocation
if (a.rank == 2 && b.rank == 2
&& (a.dtype == FP32::class)
&& a.data is FloatArrayTensorData<*> && b.data is FloatArrayTensorData<*>
) {
val aBuf = (a.data as FloatArrayTensorData<*>).buffer
val bBuf = (b.data as FloatArrayTensorData<*>).buffer
val m = a.shape[0]
val k = a.shape[1]
val n = b.shape[1]
require(k == b.shape[0]) { "Matrix multiplication shape mismatch: ${a.shape} vs ${b.shape}" }
val out = FloatArray(m * n)
for (i in 0 until m) {
val aOff = i * k
for (j in 0 until n) {
var sum = 0f
for (p in 0 until k) {
sum += aBuf[aOff + p] * bBuf[p * n + j]
return KernelProfile.timeFp32 {
val aBuf = (a.data as FloatArrayTensorData<*>).buffer
val bBuf = (b.data as FloatArrayTensorData<*>).buffer
val m = a.shape[0]
val k = a.shape[1]
val n = b.shape[1]
require(k == b.shape[0]) { "Matrix multiplication shape mismatch: ${a.shape} vs ${b.shape}" }
val out = FloatArray(m * n)
for (i in 0 until m) {
val aOff = i * k
for (j in 0 until n) {
var sum = 0f
for (p in 0 until k) {
sum += aBuf[aOff + p] * bBuf[p * n + j]
}
out[i * n + j] = sum
}
out[i * n + j] = sum
}
@Suppress("UNCHECKED_CAST")
val outData = dataFactory.fromFloatArray<T, Float>(Shape(m, n), a.dtype, out) as sk.ainet.lang.tensor.data.TensorData<T, V>
newTensor(outData, a.dtype, a, b)
}
@Suppress("UNCHECKED_CAST")
val outData = dataFactory.fromFloatArray<T, Float>(Shape(m, n), a.dtype, out) as sk.ainet.lang.tensor.data.TensorData<T, V>
return newTensor(outData, a.dtype, a, b)
}

// Generic fallback for batched / non-float / non-2D cases
return matmulGeneric(a, b)
return KernelProfile.timeGeneric { matmulGeneric(a, b) }
}

private fun <T : DType, V> matmulGeneric(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package sk.ainet.exec.tensor.ops

import kotlin.time.TimeSource

/**
* Lightweight, always-on accumulating profiler for the matmul dispatch paths.
* Diagnostic only — used to localize where native decode time goes (quant-NEON
* vs FP32-scalar vs generic) before investing in a kernel rewrite. The clock
* read per call is negligible next to a matmul. Read [report] after a run and
* [reset] between phases (e.g. to separate prefill from decode).
*/
public object KernelProfile {
private val clock = TimeSource.Monotonic

public var quantNanos: Long = 0; private set
public var quantCalls: Long = 0; private set
public var fp32Nanos: Long = 0; private set
public var fp32Calls: Long = 0; private set
public var genericNanos: Long = 0; private set
public var genericCalls: Long = 0; private set

public fun <R> timeQuant(body: () -> R): R {
val mark = clock.markNow(); val r = body()
quantNanos += mark.elapsedNow().inWholeNanoseconds; quantCalls++; return r
}

public fun <R> timeFp32(body: () -> R): R {
val mark = clock.markNow(); val r = body()
fp32Nanos += mark.elapsedNow().inWholeNanoseconds; fp32Calls++; return r
}

public fun <R> timeGeneric(body: () -> R): R {
val mark = clock.markNow(); val r = body()
genericNanos += mark.elapsedNow().inWholeNanoseconds; genericCalls++; return r
}

public fun reset() {
quantNanos = 0; quantCalls = 0
fp32Nanos = 0; fp32Calls = 0
genericNanos = 0; genericCalls = 0
}

public fun report(): String {
fun ms(ns: Long) = ns / 1_000_000.0
val total = quantNanos + fp32Nanos + genericNanos
fun pct(ns: Long) = if (total > 0) 100.0 * ns / total else 0.0
return buildString {
appendLine("[KernelProfile] matmul time breakdown:")
appendLine(" quant-NEON : ${ms(quantNanos)} ms over $quantCalls calls (${pct(quantNanos)}%)")
appendLine(" fp32-scalar : ${ms(fp32Nanos)} ms over $fp32Calls calls (${pct(fp32Nanos)}%)")
appendLine(" generic : ${ms(genericNanos)} ms over $genericCalls calls (${pct(genericNanos)}%)")
append(" matmul total : ${ms(total)} ms")
}
}
}
177 changes: 111 additions & 66 deletions skainet-backends/skainet-backend-native-cpu/native/src/q4k_matmul.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <math.h>

#define Q4K_BLOCK_SIZE 256
#define Q4K_SUB_BLOCK_SIZE 32
Expand Down Expand Up @@ -60,21 +62,48 @@ static inline void skainet_q4k_decode_scales(
}
}

/*
* Quantize one 256-float input block to symmetric int8 (Q8) with a single
* per-block scale d_in = maxabs/127, q8[i] = round(in[i]/d_in). Returns d_in
* (0 if the block is all-zero, with q8 zeroed). Mirrors ggml's block_q8_K
* activation quantization — the source of the (small, well-understood) error
* vs the exact float kernel, and what unlocks the int8 dot-product fast path.
*/
static inline float skainet_q8_quantize_block(const float* SKAINET_RESTRICT in, int8_t* SKAINET_RESTRICT q8) {
float maxabs = 0.0f;
for (int i = 0; i < Q4K_BLOCK_SIZE; ++i) {
const float a = fabsf(in[i]);
if (a > maxabs) maxabs = a;
}
if (maxabs == 0.0f) {
for (int i = 0; i < Q4K_BLOCK_SIZE; ++i) q8[i] = 0;
return 0.0f;
}
const float d_in = maxabs / 127.0f;
const float inv = 127.0f / maxabs;
for (int i = 0; i < Q4K_BLOCK_SIZE; ++i) {
int v = (int) lrintf(in[i] * inv);
if (v > 127) v = 127; else if (v < -127) v = -127;
q8[i] = (int8_t) v;
}
return d_in;
}

/*
* Native Q4_K matrix-vector multiply matching the
* sk.ainet.backend.api.kernel.Q4KMatmulKernel SPI contract. Single
* input row times an `outputDim x inputDim` Q4_K-packed weight tensor
* laid out (blockIdx * outputDim + o) * 144 bytes.
*
* Lazy-dmin pattern: per sub-block accumulate
* codeSum[s] = sum_i input[i] * code[i]
* inputSum[s] = sum_i input[i]
* and combine once via
* acc += d * scaleIdx[s] * codeSum[s] - dMin * minIdx[s] * inputSum[s]
* sk.ainet.backend.api.kernel.Q4KMatmulKernel SPI contract. Single input row
* times an `outputDim x inputDim` Q4_K-packed weight laid out
* (blockIdx * outputDim + o) * 144 bytes.
*
* Scalar single-threaded for PR 2; the tight inner loop is
* straight-line FP arithmetic so -O3 auto-vectorizes the
* codeSum/inputSum accumulators on AVX2/NEON.
* Fused int8 dot path (ggml-style): the input row is quantized to Q8 ONCE per
* 256-block (reused across all output rows), then each weight sub-block is an
* int8 dot-product against the Q8 activation:
* acc += d_in[b] * ( d * Σ_s scaleIdx[s]*intDot[s] - dMin * Σ_s minIdx[s]*intSum[s] )
* where intDot[s] = Σ q8[i]*code[i] and intSum[s] = Σ q8[i] over the sub-block.
* On AArch64 with dotprod (asimddp) the inner dot uses vdotq_s32 (16 int8 MACs
* per instruction); otherwise a scalar integer fallback (auto-vectorized).
* The index mapping (groups, lo/hi sub-blocks, input alignment) is identical to
* the previous float kernel, which was parity-checked against Panama.
*/
SKAINET_API void skainet_q4k_matmul(
const float* SKAINET_RESTRICT input,
Expand All @@ -92,86 +121,102 @@ SKAINET_API void skainet_q4k_matmul(
const float* in_base = input + input_offset;
float* out_base = output + output_offset;

/* Pre-quantize the whole input row to Q8 once (reused across all o). */
int8_t* q8 = (int8_t*) malloc((size_t) input_dim * sizeof(int8_t));
float* d_in = (float*) malloc((size_t) blocks_per_input_dim * sizeof(float));
if (q8 == NULL || d_in == NULL) { free(q8); free(d_in); return; }
for (int32_t b = 0; b < blocks_per_input_dim; ++b) {
d_in[b] = skainet_q8_quantize_block(in_base + (size_t) b * Q4K_BLOCK_SIZE,
q8 + (size_t) b * Q4K_BLOCK_SIZE);
}

int scale_idx[Q4K_SUB_BLOCKS];
int min_idx[Q4K_SUB_BLOCKS];

for (int32_t o = 0; o < output_dim; ++o) {
float acc = 0.0f;

for (int32_t block_idx = 0; block_idx < blocks_per_input_dim; ++block_idx) {
const uint8_t* block = weight + weight_byte_offset
+ (size_t)(block_idx * output_dim + o) * Q4K_BYTES_PER_BLOCK;

/* d, dMin (FP16 LE -> FP32). */
/*
* Loop order: block OUTER, output row INNER. The weight is packed
* block-major — (blockIdx * output_dim + o) * 144 — so for a fixed block,
* consecutive `o` are exactly 144 bytes apart: the weight bytes are read
* strictly sequentially (prefetch- and cache-line-friendly). The reverse
* order (o outer) strides output_dim*144 bytes per step (~295 KB on the
* down-proj), which on an in-order A55 with small caches makes every weight
* read a cold miss and dominates runtime regardless of inner-loop compute.
* out_base[o] is accumulated across blocks (output_dim*4 bytes stays hot in
* cache); the accumulation order over blocks is unchanged, so this is
* numerically identical to the o-outer form.
*/
for (int32_t o = 0; o < output_dim; ++o) out_base[o] = 0.0f;

for (int32_t block_idx = 0; block_idx < blocks_per_input_dim; ++block_idx) {
const int8_t* q8_block = q8 + (size_t) block_idx * Q4K_BLOCK_SIZE;
const float di = d_in[block_idx];
const uint8_t* block = weight + weight_byte_offset
+ (size_t)(block_idx * output_dim) * Q4K_BYTES_PER_BLOCK;

for (int32_t o = 0; o < output_dim; ++o, block += Q4K_BYTES_PER_BLOCK) {
const uint16_t d_bits = (uint16_t) block[0] | ((uint16_t) block[1] << 8);
const uint16_t d_min_bits = (uint16_t) block[2] | ((uint16_t) block[3] << 8);
const float d = skainet_half_to_float(d_bits);
const float d_min = skainet_half_to_float(d_min_bits);

/* 12 bytes of packed (scaleIdx, minIdx) -> 8 ints each. */
skainet_q4k_decode_scales(block + 4, scale_idx, min_idx);

const uint8_t* qs = block + 16;
const float* in_block = in_base + (size_t) block_idx * Q4K_BLOCK_SIZE;

/* 4 strided qs groups; group j carries sub-blocks 2j (lo) and 2j+1 (hi). */
int64_t block_scale_dot = 0;
int64_t block_min_sum = 0;

for (int group_j = 0; group_j < 4; ++group_j) {
const uint8_t* qs_group = qs + group_j * Q4K_SUB_BLOCK_SIZE;
const uint8_t* qs_group = qs + group_j * Q4K_SUB_BLOCK_SIZE;
const int sb_lo = 2 * group_j;
const int sb_hi = sb_lo + 1;
const float* in_lo = in_block + sb_lo * Q4K_SUB_BLOCK_SIZE;
const float* in_hi = in_block + sb_hi * Q4K_SUB_BLOCK_SIZE;
const int8_t* q8_lo = q8_block + sb_lo * Q4K_SUB_BLOCK_SIZE;
const int8_t* q8_hi = q8_block + sb_hi * Q4K_SUB_BLOCK_SIZE;

float code_sum_lo = 0.0f, input_sum_lo = 0.0f;
float code_sum_hi = 0.0f, input_sum_hi = 0.0f;
int32_t dot_lo = 0, sum_lo = 0, dot_hi = 0, sum_hi = 0;

#ifdef SKAINET_HAVE_NEON
float32x4_t cacc_lo = vdupq_n_f32(0.0f), iacc_lo = vdupq_n_f32(0.0f);
float32x4_t cacc_hi = vdupq_n_f32(0.0f), iacc_hi = vdupq_n_f32(0.0f);
#ifdef SKAINET_HAVE_DOTPROD
int32x4_t acc_dot_lo = vdupq_n_s32(0), acc_dot_hi = vdupq_n_s32(0);
int32_t acc_sum_lo = 0, acc_sum_hi = 0;
for (int off = 0; off < Q4K_SUB_BLOCK_SIZE; off += 16) {
const uint8x16_t packed = vld1q_u8(qs_group + off);
const uint8x16_t lo_nib = vandq_u8(packed, vdupq_n_u8(0x0F));
const uint8x16_t hi_nib = vshrq_n_u8(packed, 4);
float32x4_t cl[4], ch[4];
skainet_neon_u8x16_to_f32x4x4(lo_nib, cl);
skainet_neon_u8x16_to_f32x4x4(hi_nib, ch);
for (int q = 0; q < 4; ++q) {
const float32x4_t v_lo = vld1q_f32(in_lo + off + q * 4);
const float32x4_t v_hi = vld1q_f32(in_hi + off + q * 4);
cacc_lo = vfmaq_f32(cacc_lo, v_lo, cl[q]);
iacc_lo = vaddq_f32(iacc_lo, v_lo);
cacc_hi = vfmaq_f32(cacc_hi, v_hi, ch[q]);
iacc_hi = vaddq_f32(iacc_hi, v_hi);
}
const int8x16_t code_lo = vreinterpretq_s8_u8(vandq_u8(packed, vdupq_n_u8(0x0F)));
const int8x16_t code_hi = vreinterpretq_s8_u8(vshrq_n_u8(packed, 4));
const int8x16_t a_lo = vld1q_s8(q8_lo + off);
const int8x16_t a_hi = vld1q_s8(q8_hi + off);
acc_dot_lo = vdotq_s32(acc_dot_lo, code_lo, a_lo);
acc_dot_hi = vdotq_s32(acc_dot_hi, code_hi, a_hi);
acc_sum_lo += vaddlvq_s8(a_lo);
acc_sum_hi += vaddlvq_s8(a_hi);
}
code_sum_lo = skainet_neon_hadd_f32(cacc_lo);
input_sum_lo = skainet_neon_hadd_f32(iacc_lo);
code_sum_hi = skainet_neon_hadd_f32(cacc_hi);
input_sum_hi = skainet_neon_hadd_f32(iacc_hi);
dot_lo = vaddvq_s32(acc_dot_lo);
dot_hi = vaddvq_s32(acc_dot_hi);
sum_lo = acc_sum_lo;
sum_hi = acc_sum_hi;
#else
/* 32 iterations — auto-vectorizes cleanly under -O3. */
for (int i = 0; i < Q4K_SUB_BLOCK_SIZE; ++i) {
const uint8_t b = qs_group[i];
const float code_lo = (float)(b & 0x0F);
const float code_hi = (float)(b >> 4);
const float v_lo = in_lo[i];
const float v_hi = in_hi[i];
code_sum_lo += v_lo * code_lo;
input_sum_lo += v_lo;
code_sum_hi += v_hi * code_hi;
input_sum_hi += v_hi;
const uint8_t pb = qs_group[i];
const int code_lo = (int)(pb & 0x0F);
const int code_hi = (int)(pb >> 4);
const int a_lo = (int) q8_lo[i];
const int a_hi = (int) q8_hi[i];
dot_lo += a_lo * code_lo;
sum_lo += a_lo;
dot_hi += a_hi * code_hi;
sum_hi += a_hi;
}
#endif

const float scale_lo = d * (float) scale_idx[sb_lo];
const float offset_lo = d_min * (float) min_idx[sb_lo];
const float scale_hi = d * (float) scale_idx[sb_hi];
const float offset_hi = d_min * (float) min_idx[sb_hi];
acc += code_sum_lo * scale_lo - input_sum_lo * offset_lo;
acc += code_sum_hi * scale_hi - input_sum_hi * offset_hi;
block_scale_dot += (int64_t) scale_idx[sb_lo] * dot_lo
+ (int64_t) scale_idx[sb_hi] * dot_hi;
block_min_sum += (int64_t) min_idx[sb_lo] * sum_lo
+ (int64_t) min_idx[sb_hi] * sum_hi;
}
}

out_base[o] = acc;
out_base[o] += di * (d * (float) block_scale_dot - d_min * (float) block_min_sum);
}
}

free(q8);
free(d_in);
}
Loading
Loading