diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 55540edc..585d564b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -13,10 +13,11 @@ jobs: name: Build runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 - - uses: actions/setup-node@v4 + - uses: actions/checkout@v6 + - uses: actions/setup-node@v6 with: - node-version: "20" + node-version: "22" + package-manager-cache: false - name: Install modules run: npm ci - name: Build @@ -24,21 +25,22 @@ jobs: - name: Download latest llama.cpp release env: CI: true + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: node ./dist/cli/cli.js source download --release latest --skipBuild --noBundle --noUsageExample --updateBinariesReleaseMetadataAndSaveGitBundle - name: Upload build artifact - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: include-hidden-files: true name: "build" path: "dist" - name: Upload packed templates artifact - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: include-hidden-files: true name: "build-templates" path: "templates/packed" - name: Upload llama.cpp artifact - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: include-hidden-files: true name: "llama.cpp" @@ -77,19 +79,20 @@ jobs: artifact: "mac-arm64" steps: - - uses: actions/checkout@v4 - - uses: actions/setup-node@v4 + - uses: actions/checkout@v6 + - uses: actions/setup-node@v6 with: - node-version: "20" + node-version: "22" + package-manager-cache: false - name: Download build artifact - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v8 with: name: build path: dist - name: Download llama.cpp artifact - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v8 with: name: llama.cpp path: llama @@ -314,7 +317,7 @@ jobs: # - name: Cache UPX # id: cache-upx -# uses: actions/cache@v4 +# uses: actions/cache@v5 # with: # path: "upxInstallations/**" # key: cache-upx-${{ runner.os 
}}-${{ github.workflow }} @@ -361,7 +364,7 @@ jobs: # chmod -x ./bins/linux-x64-cuda/llama-addon.node - name: Publish artifact - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: include-hidden-files: true name: "bins-${{ matrix.config.artifact }}" @@ -383,19 +386,20 @@ jobs: outputs: next-version: ${{ steps.save-next-version.outputs.next-version }} steps: - - uses: actions/checkout@v4 - - uses: actions/setup-node@v4 + - uses: actions/checkout@v6 + - uses: actions/setup-node@v6 with: node-version: "22" + package-manager-cache: false - name: Install modules run: npm ci - name: Download build artifact - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v8 with: name: build path: dist - name: Download llama.cpp artifact - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v8 with: name: llama.cpp path: llama @@ -418,7 +422,7 @@ jobs: echo "Next release version: \`$(cat ./resolvedNextVersion.txt)\`" >> $GITHUB_STEP_SUMMARY fi - name: Upload resolved release artifact - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: include-hidden-files: true name: "resolved-next-release" @@ -430,19 +434,20 @@ jobs: needs: - build steps: - - uses: actions/checkout@v4 - - uses: actions/setup-node@v4 + - uses: actions/checkout@v6 + - uses: actions/setup-node@v6 with: - node-version: "20" + node-version: "22" + package-manager-cache: false - name: Download build artifact - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v8 with: name: build path: dist - name: Download llama.cpp artifact - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v8 with: name: llama.cpp path: llama @@ -469,19 +474,20 @@ jobs: needs: - build steps: - - uses: actions/checkout@v4 - - uses: actions/setup-node@v4 + - uses: actions/checkout@v6 + - uses: actions/setup-node@v6 with: - node-version: "20" + node-version: "22" + package-manager-cache: false - name: Download build artifact - uses: 
actions/download-artifact@v4 + uses: actions/download-artifact@v8 with: name: build path: dist - name: Download llama.cpp artifact - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v8 with: name: llama.cpp path: llama @@ -507,7 +513,7 @@ jobs: - name: Cache models id: cache-restore-test-models - uses: actions/cache/restore@v4 + uses: actions/cache/restore@v5 with: path: "test/.models/**.gguf" key: cache-test-models-${{ runner.os }}-${{ github.workflow }} @@ -524,7 +530,7 @@ jobs: - name: Save cached models id: cache-save-test-models if: steps.download-all-test-models.outcome == 'success' && always() - uses: actions/cache/save@v4 + uses: actions/cache/save@v5 with: path: "test/.models/**.gguf" key: cache-test-models-${{ runner.os }}-${{ github.workflow }} @@ -550,17 +556,18 @@ jobs: outputs: package-version: ${{ steps.set-package-version.outputs.package-version }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: lfs: true - - uses: actions/setup-node@v4 + - uses: actions/setup-node@v6 with: node-version: "22" + package-manager-cache: false - name: Update npm run: npm install -g npm@latest - name: Install modules run: npm ci - - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@v8 with: path: artifacts - name: Move artifacts @@ -684,12 +691,13 @@ jobs: os: macos-15-intel steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: lfs: true - - uses: actions/setup-node@v4 + - uses: actions/setup-node@v6 with: - node-version: "20" + node-version: "22" + package-manager-cache: false - name: Install dependencies on Ubuntu if: matrix.config.name == 'Ubuntu' @@ -721,7 +729,7 @@ jobs: ls ./release - name: Upload artifacts - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: include-hidden-files: true name: "electron-app-example-${{ matrix.config.name }}" @@ -775,17 +783,18 @@ jobs: # Can be replaced with YAML anchors when this will be supported by GitHub Actions: # 
https://github.com/actions/runner/issues/1182#issuecomment-2317953582 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: lfs: true fetch-depth: 0 fetch-tags: true - - uses: actions/setup-node@v4 + - uses: actions/setup-node@v6 with: - node-version: "20" + node-version: "22" + package-manager-cache: false - name: Install modules run: npm ci - - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@v8 with: path: artifacts - name: Move artifacts @@ -817,12 +826,12 @@ jobs: npm run docs:build - name: Upload docs to GitHub Pages - uses: actions/upload-pages-artifact@v3 + uses: actions/upload-pages-artifact@v5 with: name: pages-docs path: docs-site - name: Deploy docs to GitHub Pages - uses: actions/deploy-pages@v4 + uses: actions/deploy-pages@v5 with: artifact_name: pages-docs - name: Update feed @@ -855,17 +864,18 @@ jobs: # Can be replaced with YAML anchors when this will be supported by GitHub Actions: # https://github.com/actions/runner/issues/1182#issuecomment-2317953582 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: lfs: true fetch-depth: 0 fetch-tags: true - - uses: actions/setup-node@v4 + - uses: actions/setup-node@v6 with: - node-version: "20" + node-version: "22" + package-manager-cache: false - name: Install modules run: npm ci - - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@v8 with: path: artifacts - name: Move artifacts @@ -897,12 +907,12 @@ jobs: npm run docs:build - name: Upload docs to GitHub Pages - uses: actions/upload-pages-artifact@v3 + uses: actions/upload-pages-artifact@v5 with: name: pages-docs path: docs-site - name: Deploy docs to GitHub Pages - uses: actions/deploy-pages@v4 + uses: actions/deploy-pages@v5 with: artifact_name: pages-docs - name: Update feed @@ -921,15 +931,16 @@ jobs: # pull-requests: write # discussions: write # steps: -# - uses: actions/checkout@v4 -# - uses: actions/setup-node@v4 +# - uses: actions/checkout@v6 +# - uses: 
actions/setup-node@v6 # with: -# node-version: "20" +# node-version: "22" +# package-manager-cache: false # - name: Install modules # run: npm ci # # - name: Pull artifact from broken release -# uses: actions/download-artifact@v4 +# uses: actions/download-artifact@v8 # with: # name: resolved-next-release # github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 65d916f3..5c85b98e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -12,10 +12,10 @@ jobs: name: Test runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 - - uses: actions/setup-node@v4 + - uses: actions/checkout@v6 + - uses: actions/setup-node@v6 with: - node-version: "20" + node-version: "22" - name: Install modules run: npm ci - name: ESLint @@ -27,14 +27,14 @@ jobs: name: Test docs compilation runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: lfs: true fetch-depth: 0 fetch-tags: true - - uses: actions/setup-node@v4 + - uses: actions/setup-node@v6 with: - node-version: "20" + node-version: "22" - name: Install modules run: npm ci - name: Build diff --git a/.vitepress/config/apiReferenceSidebar.ts b/.vitepress/config/apiReferenceSidebar.ts index 6b8c35ed..428d3aae 100644 --- a/.vitepress/config/apiReferenceSidebar.ts +++ b/.vitepress/config/apiReferenceSidebar.ts @@ -53,6 +53,7 @@ const chatWrappersOrder = [ "Llama3ChatWrapper", "Llama2ChatWrapper", "MistralChatWrapper", + "Gemma4ChatWrapper", "GemmaChatWrapper", "ChatMLChatWrapper", "FalconChatWrapper", diff --git a/docs/guide/electron.md b/docs/guide/electron.md index cc24321b..0e2bacb9 100644 --- a/docs/guide/electron.md +++ b/docs/guide/electron.md @@ -65,10 +65,11 @@ jobs: os: macos-13 steps: - - uses: actions/checkout@v4 - - uses: actions/setup-node@v4 + - uses: actions/checkout@v6 + - uses: actions/setup-node@v6 with: - node-version: "20" + node-version: "22" + package-manager-cache: false - name: Install 
dependencies on Ubuntu if: matrix.config.name == 'Ubuntu' @@ -87,7 +88,7 @@ jobs: run: npm run build - name: Upload artifacts - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: include-hidden-files: true name: "electron-app-${{ matrix.config.name }}" diff --git a/llama/CMakeLists.txt b/llama/CMakeLists.txt index 76949abe..49d0d638 100644 --- a/llama/CMakeLists.txt +++ b/llama/CMakeLists.txt @@ -120,8 +120,8 @@ list(REMOVE_DUPLICATES GPU_INFO_HEADERS) list(REMOVE_DUPLICATES GPU_INFO_SOURCES) list(REMOVE_DUPLICATES GPU_INFO_EXTRA_LIBS) -addVariantSuffix(llama ${NLC_VARIANT}) -addVariantSuffix(ggml ${NLC_VARIANT}) +addVariantSuffix(llama "${NLC_VARIANT}") +addVariantSuffix(ggml "${NLC_VARIANT}") file(GLOB SOURCE_FILES "addon/*.cpp" "addon/**/*.cpp" ${GPU_INFO_SOURCES}) diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp index 4a7de9aa..9427e8ff 100644 --- a/llama/addon/AddonContext.cpp +++ b/llama/addon/AddonContext.cpp @@ -2,6 +2,7 @@ #include #include #include "common/common.h" +#include "llama-context.h" #include "llama-vocab.h" #include "llama.h" @@ -107,7 +108,7 @@ class AddonContextLoadContextWorker : public Napi::AsyncWorker { try { context->ctx = llama_init_from_model(context->model->model, context->context_params); - context->contextLoaded = context->ctx != nullptr && context->ctx != NULL; + context->contextLoaded = context->ctx != nullptr; } catch (const std::exception& e) { SetError(e.what()); } catch(...) 
{ @@ -115,9 +116,9 @@ class AddonContextLoadContextWorker : public Napi::AsyncWorker { } } void OnOK() { - if (context->contextLoaded) { + if (context->contextLoaded && !context->model->model_params.no_alloc) { uint64_t contextMemorySize = llama_state_get_size(context->ctx); - adjustNapiExternalMemoryAdd(Env(), contextMemorySize); + adjustNapiExternalMemoryAdd(context->Env(), contextMemorySize); context->loadedContextMemorySize = contextMemorySize; } @@ -150,35 +151,15 @@ class AddonContextUnloadContextWorker : public Napi::AsyncWorker { void Execute() { try { - llama_free(context->ctx); - context->contextLoaded = false; - - try { - if (context->has_batch) { - llama_batch_free(context->batch); - context->has_batch = false; - context->batch_n_tokens = 0; - } - - context->dispose(); - } catch (const std::exception& e) { - SetError(e.what()); - } catch(...) { - SetError("Unknown error when calling \"llama_batch_free\""); - } + context->disposeMemory(); } catch (const std::exception& e) { SetError(e.what()); } catch(...) 
{ - SetError("Unknown error when calling \"llama_free\""); + SetError("Unknown error while disposing context memory"); } } void OnOK() { - adjustNapiExternalMemorySubtract(Env(), context->loadedContextMemorySize); - context->loadedContextMemorySize = 0; - - adjustNapiExternalMemorySubtract(Env(), context->batchMemorySize); - context->batchMemorySize = 0; - + context->disposeMT(); deferred.Resolve(Env().Undefined()); } void OnError(const Napi::Error& err) { @@ -251,22 +232,8 @@ class AddonContextSampleTokenWorker : public Napi::AsyncWorker { sampler->rebuildChainIfNeeded(); - const auto * logits = llama_get_logits_ith(ctx->ctx, batchLogitIndex); - const int n_vocab = llama_vocab_n_tokens(ctx->model->vocab); - - auto & candidates = sampler->tokenCandidates; - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates[token_id] = llama_token_data{token_id, logits[token_id], 0.0f}; - } - - llama_token_data_array cur_p = { - /* .data = */ candidates.data(), - /* .size = */ candidates.size(), - /* .selected = */ -1, - /* .sorted = */ false, - }; - - llama_sampler_apply(sampler->chain, &cur_p); + llama_token_data_array cur_p; + sampler->sample(ctx->ctx, batchLogitIndex, cur_p, returnProbabilities || returnConfidence); if (!(cur_p.selected >= 0 && cur_p.selected < (int32_t)cur_p.size)) { no_output = true; @@ -403,7 +370,7 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap 1 && info[1].IsObject()) { - Napi::Object options = info[1].As(); + const auto options = info[1].As(); if (options.Has("contextSize")) { context_params.n_ctx = options.Get("contextSize").As().Uint32Value(); @@ -427,16 +394,26 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap().Value(); - context_params.flash_attn_type = flashAttention ? 
LLAMA_FLASH_ATTN_TYPE_ENABLED : LLAMA_FLASH_ATTN_TYPE_DISABLED; + const auto flashAttention = options.Get("flashAttention"); + + if (flashAttention.IsString() && flashAttention.As().Utf8Value() == "auto") { + context_params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; + } else { + const bool flashAttentionEnabled = flashAttention.As().Value(); + context_params.flash_attn_type = flashAttentionEnabled + ? LLAMA_FLASH_ATTN_TYPE_ENABLED + : LLAMA_FLASH_ATTN_TYPE_DISABLED; + } } if (options.Has("threads")) { - const auto n_threads = options.Get("threads").As().Int32Value(); - const auto resolved_n_threads = n_threads == 0 ? std::max((int32_t)std::thread::hardware_concurrency(), context_params.n_threads) : n_threads; + const auto threads = options.Get("threads").As().Int32Value(); + const auto resolvedThreads = threads == 0 + ? std::max((int32_t)std::thread::hardware_concurrency(), context_params.n_threads) + : threads; - context_params.n_threads = resolved_n_threads; - context_params.n_threads_batch = resolved_n_threads; + context_params.n_threads = resolvedThreads; + context_params.n_threads_batch = resolvedThreads; } if (options.Has("performanceTracking")) { @@ -444,14 +421,14 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap().Int32Value(); + const auto keyType = options.Get("kvCacheKeyType").As().Int32Value(); if (keyType >= 0 && keyType < GGML_TYPE_COUNT) { context_params.type_k = static_cast(keyType); } } if (options.Has("kvCacheValueType") && options.Get("kvCacheValueType").IsNumber()) { - auto valueType = options.Get("kvCacheValueType").As().Int32Value(); + const auto valueType = options.Get("kvCacheValueType").As().Int32Value(); if (valueType >= 0 && valueType < GGML_TYPE_COUNT) { context_params.type_v = static_cast(valueType); } @@ -463,67 +440,123 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap lock(disposeMutex); + + if (!has_batch) { return; } - disposed = true; - if (contextLoaded) { - 
contextLoaded = false; - llama_free(ctx); + llama_batch_free(batch); + has_batch = false; + batch_n_tokens = 0; +} - adjustNapiExternalMemorySubtract(Env(), loadedContextMemorySize); - loadedContextMemorySize = 0; - } +void AddonContext::disposeBatchMT() { + uint64_t currentBatchMemorySize = 0; - model->Unref(); + disposeBatchMemory(); + + { + std::lock_guard lock(disposeMutex); + currentBatchMemorySize = batchMemorySize; + batchMemorySize = 0; + } - disposeBatch(); + if (currentBatchMemorySize > 0) { + adjustNapiExternalMemorySubtract(Env(), currentBatchMemorySize); + } } -void AddonContext::disposeBatch() { - if (!has_batch) { - return; + +void AddonContext::disposeMemory() { + llama_context* currentCtx = nullptr; + + { + std::lock_guard lock(disposeMutex); + + if (!memoryDisposed) { + memoryDisposed = true; + currentCtx = ctx; + ctx = nullptr; + contextLoaded = false; + } } - llama_batch_free(batch); - has_batch = false; - batch_n_tokens = 0; + disposeBatchMemory(); - adjustNapiExternalMemorySubtract(Env(), batchMemorySize); - batchMemorySize = 0; + if (currentCtx != nullptr) { + llama_free(currentCtx); + } } -Napi::Value AddonContext::Init(const Napi::CallbackInfo& info) { - if (disposed) { - Napi::Error::New(info.Env(), "Context is disposed").ThrowAsJavaScriptException(); - return info.Env().Undefined(); +void AddonContext::disposeMT() { + uint64_t currentLoadedContextMemorySize = 0; + uint64_t currentBatchMemorySize = 0; + bool shouldUnrefModel = false; + + disposeMemory(); + + { + std::lock_guard lock(disposeMutex); + + if (disposed) { + return; + } + + disposed = true; + shouldUnrefModel = true; + + if (!model->model_params.no_alloc) { + currentLoadedContextMemorySize = loadedContextMemorySize; + loadedContextMemorySize = 0; + } + + currentBatchMemorySize = batchMemorySize; + batchMemorySize = 0; } - AddonContextLoadContextWorker* worker = new AddonContextLoadContextWorker(this->Env(), this); - worker->Queue(); - return worker->GetPromise(); + if 
(currentLoadedContextMemorySize > 0) { + adjustNapiExternalMemorySubtract(Env(), currentLoadedContextMemorySize); + } + + if (currentBatchMemorySize > 0) { + adjustNapiExternalMemorySubtract(Env(), currentBatchMemorySize); + } + + if (shouldUnrefModel) { + model->Unref(); + } } + Napi::Value AddonContext::Dispose(const Napi::CallbackInfo& info) { if (disposed) { return info.Env().Undefined(); } - if (contextLoaded) { - contextLoaded = false; - + if (contextLoaded || has_batch) { AddonContextUnloadContextWorker* worker = new AddonContextUnloadContextWorker(this->Env(), this); worker->Queue(); return worker->GetPromise(); - } else { - dispose(); + } + + disposeMT(); + + Napi::Promise::Deferred deferred = Napi::Promise::Deferred::New(info.Env()); + deferred.Resolve(info.Env().Undefined()); + return deferred.Promise(); +} - Napi::Promise::Deferred deferred = Napi::Promise::Deferred::New(info.Env()); - deferred.Resolve(info.Env().Undefined()); - return deferred.Promise(); +Napi::Value AddonContext::Init(const Napi::CallbackInfo& info) { + if (disposed) { + Napi::Error::New(info.Env(), "Context is disposed").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } else { + AddonContextLoadContextWorker* worker = new AddonContextLoadContextWorker(this->Env(), this); + worker->Queue(); + return worker->GetPromise(); } } @@ -568,7 +601,7 @@ Napi::Value AddonContext::DisposeBatch(const Napi::CallbackInfo& info) { return info.Env().Undefined(); } - disposeBatch(); + disposeBatchMT(); return info.Env().Undefined(); } @@ -728,6 +761,49 @@ Napi::Value AddonContext::GetStateSize(const Napi::CallbackInfo& info) { return Napi::Number::From(info.Env(), llama_state_get_size(ctx)); } +Napi::Value AddonContext::GetMemoryBreakdown(const Napi::CallbackInfo& info) { + if (disposed) { + Napi::Error::New(info.Env(), "Context is disposed").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } + + if (!contextLoaded || ctx == nullptr) { + Napi::Error::New(info.Env(), 
"Context is not loaded").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } + + std::size_t cpuRam = 0; + std::size_t gpuVram = 0; + + for (const auto& [bufferType, memoryBreakdown] : ctx->memory_breakdown()) { + const std::size_t size = memoryBreakdown.context + memoryBreakdown.compute; + if (size == 0) { + continue; + } + + if (ggml_backend_buft_is_host(bufferType)) { + cpuRam += size; + } else { + ggml_backend_dev_t device = ggml_backend_buft_get_device(bufferType); + if (device != nullptr) { + auto deviceType = ggml_backend_dev_type(device); + if (deviceType == GGML_BACKEND_DEVICE_TYPE_GPU || deviceType == GGML_BACKEND_DEVICE_TYPE_IGPU) { + gpuVram += size; + } else { + cpuRam += size; + } + } else { + cpuRam += size; + } + } + } + + Napi::Object result = Napi::Object::New(info.Env()); + result.Set("cpuRam", Napi::Number::New(info.Env(), cpuRam)); + result.Set("gpuVram", Napi::Number::New(info.Env(), gpuVram)); + return result; +} + Napi::Value AddonContext::GetThreads(const Napi::CallbackInfo& info) { if (disposed) { Napi::Error::New(info.Env(), "Context is disposed").ThrowAsJavaScriptException(); @@ -1062,6 +1138,7 @@ void AddonContext::init(Napi::Object exports) { InstanceMethod("sampleToken", &AddonContext::SampleToken), InstanceMethod("getEmbedding", &AddonContext::GetEmbedding), InstanceMethod("getStateSize", &AddonContext::GetStateSize), + InstanceMethod("getMemoryBreakdown", &AddonContext::GetMemoryBreakdown), InstanceMethod("getThreads", &AddonContext::GetThreads), InstanceMethod("setThreads", &AddonContext::SetThreads), InstanceMethod("printTimings", &AddonContext::PrintTimings), diff --git a/llama/addon/AddonContext.h b/llama/addon/AddonContext.h index 2e677203..6f5f193b 100644 --- a/llama/addon/AddonContext.h +++ b/llama/addon/AddonContext.h @@ -20,14 +20,18 @@ class AddonContext : public Napi::ObjectWrap { uint64_t loadedContextMemorySize = 0; bool contextLoaded = false; + std::mutex disposeMutex; bool disposed = false; + bool 
memoryDisposed = false; AddonContext(const Napi::CallbackInfo& info); ~AddonContext(); - void dispose(); - void disposeBatch(); + void disposeMemory(); + void disposeMT(); + void disposeBatchMemory(); + void disposeBatchMT(); Napi::Value Init(const Napi::CallbackInfo& info); Napi::Value Dispose(const Napi::CallbackInfo& info); @@ -46,6 +50,7 @@ class AddonContext : public Napi::ObjectWrap { Napi::Value GetEmbedding(const Napi::CallbackInfo& info); Napi::Value GetStateSize(const Napi::CallbackInfo& info); + Napi::Value GetMemoryBreakdown(const Napi::CallbackInfo& info); Napi::Value GetThreads(const Napi::CallbackInfo& info); Napi::Value SetThreads(const Napi::CallbackInfo& info); diff --git a/llama/addon/AddonGgufMetadata.cpp b/llama/addon/AddonGgufMetadata.cpp new file mode 100644 index 00000000..9eec39da --- /dev/null +++ b/llama/addon/AddonGgufMetadata.cpp @@ -0,0 +1,285 @@ +#include +#include +#include +#include +#include + +#include "AddonGgufMetadata.h" +#include "gguf.h" +#include "llama.h" + + +static constexpr const char* kGgufSplitNoKey = "split.no"; +static constexpr const char* kGgufSplitCountKey = "split.count"; +static constexpr const char* kGgufSplitTensorsCountKey = "split.tensors.count"; + +static std::optional getOptionalGgufU16(const gguf_context* metadata, const char* key) { + const int64_t keyId = gguf_find_key(metadata, key); + if (keyId < 0) { + return std::nullopt; + } + + return gguf_get_val_u16(metadata, keyId); +} + +static std::string getSplitPrefixFromPath(const std::string& path, const uint16_t splitNo, const uint16_t splitCount) { + std::vector splitPrefix(path.size() + 1, '\0'); + if (llama_split_prefix(splitPrefix.data(), splitPrefix.size(), path.c_str(), splitNo, splitCount) <= 0) { + throw std::runtime_error("Invalid split GGUF path: " + path); + } + + return std::string(splitPrefix.data()); +} + +static std::string getSplitPath(const std::string& splitPrefix, const uint16_t splitNo, const uint16_t splitCount) { + std::vector 
splitPath(splitPrefix.size() + 32, '\0'); + if (llama_split_path(splitPath.data(), splitPath.size(), splitPrefix.c_str(), splitNo, splitCount) <= 0) { + throw std::runtime_error("Failed to construct GGUF split path for split " + std::to_string(splitNo)); + } + + return std::string(splitPath.data()); +} + + +AddonGgufMetadata::AddonGgufMetadata(const Napi::CallbackInfo& info) + : Napi::ObjectWrap(info), + ggufMetadata(gguf_init_empty()) { + if (ggufMetadata.get() == nullptr) { + throw std::runtime_error("Failed to create an empty GGUF context"); + } +} +AddonGgufMetadata::~AddonGgufMetadata() { + dispose(); +} + +void AddonGgufMetadata::dispose() { + if (disposed) { + return; + } + + disposed = true; + ggufMetadata.reset(); +} + +Napi::Value AddonGgufMetadata::Dispose(const Napi::CallbackInfo& info) { + dispose(); + return info.Env().Undefined(); +} + +class AddonGgufMetadataInitWorker : public Napi::AsyncWorker { + public: + AddonGgufMetadata* addonGgufMetadata; + std::vector sources; + std::vector>> bufferRefs; + + AddonGgufMetadataInitWorker(const Napi::Env& env, AddonGgufMetadata* addonGgufMetadata) + : Napi::AsyncWorker(env, "AddonGgufMetadataInitWorker"), + addonGgufMetadata(addonGgufMetadata), + deferred(Napi::Promise::Deferred::New(env)) { + addonGgufMetadata->Ref(); + } + ~AddonGgufMetadataInitWorker() { + addonGgufMetadata->Unref(); + } + + Napi::Promise GetPromise() { + return deferred.Promise(); + } + + protected: + Napi::Promise::Deferred deferred; + + void Execute() { + try { + gguf_context_ptr& ggufMetadata = addonGgufMetadata->ggufMetadata; + + auto loadMetadataSource = [](const AddonGgufMetadataSource& itemSource, ggml_context_ptr& tensorContextGuard) { + struct ggml_context* tensorContext = nullptr; + struct gguf_init_params ggufParams = { + /* .no_alloc = */ true, + /* .ctx = */ &tensorContext, + }; + gguf_context_ptr metadata( + itemSource.type == AddonGgufMetadataSourceType::buffer + ? 
gguf_init_from_buffer(itemSource.buffer.data, itemSource.buffer.length, ggufParams) + : gguf_init_from_file(itemSource.path.c_str(), ggufParams) + ); + tensorContextGuard.reset(tensorContext); + + if (metadata.get() == nullptr || tensorContext == nullptr) { + throw std::runtime_error("Failed to parse GGUF metadata buffer"); + } + + return metadata; + }; + + std::vector resolvedSources = sources; + if (!sources.empty()) { + ggml_context_ptr initialTensorContextGuard; + gguf_context_ptr initialMetadata = loadMetadataSource(sources.front(), initialTensorContextGuard); + const std::optional splitCount = getOptionalGgufU16(initialMetadata.get(), kGgufSplitCountKey); + + if (splitCount.has_value() && splitCount.value() > 1) { + if (sources.size() == 1) { + if (sources.front().type != AddonGgufMetadataSourceType::path) { + throw std::runtime_error( + "Loading split GGUF metadata from source buffers requires all split parts to be provided" + ); + } + + const std::optional splitNo = getOptionalGgufU16(initialMetadata.get(), kGgufSplitNoKey); + if (!splitNo.has_value()) { + throw std::runtime_error("Missing split.no metadata in split GGUF source"); + } + + const std::string splitPrefix = getSplitPrefixFromPath( + sources.front().path, + splitNo.value(), + splitCount.value() + ); + + resolvedSources.clear(); + resolvedSources.reserve(splitCount.value()); + + for (uint16_t splitIndex = 0; splitIndex < splitCount.value(); ++splitIndex) { + resolvedSources.emplace_back(AddonGgufMetadataSource( + getSplitPath(splitPrefix, splitIndex, splitCount.value()) + )); + } + } else if (sources.size() != splitCount.value()) { + throw std::runtime_error( + "Expected " + std::to_string(splitCount.value()) + + " split GGUF sources, but got " + std::to_string(sources.size()) + ); + } + } + } + + bool hasCopiedMetadata = false; + int32_t mergedTensorCount = 0; + std::optional mergedSplitCount; + for (size_t sourceIndex = 0; sourceIndex < resolvedSources.size(); sourceIndex++) { + const auto& 
itemSource = resolvedSources[sourceIndex]; + ggml_context_ptr tensorContextGuard; + gguf_context_ptr metadata = loadMetadataSource(itemSource, tensorContextGuard); + + if (!hasCopiedMetadata) { + gguf_set_kv(ggufMetadata.get(), metadata.get()); + hasCopiedMetadata = true; + mergedSplitCount = getOptionalGgufU16(metadata.get(), kGgufSplitCountKey); + } + + if (mergedSplitCount.has_value() && mergedSplitCount.value() > 1) { + const std::optional splitNo = getOptionalGgufU16(metadata.get(), kGgufSplitNoKey); + if (!splitNo.has_value()) { + throw std::runtime_error("Missing split.no metadata in split GGUF source"); + } else if (splitNo.value() != sourceIndex) { + throw std::runtime_error( + "Invalid split GGUF source order: expected split index " + std::to_string(sourceIndex) + + ", but got " + std::to_string(splitNo.value()) + ); + } + + const std::optional splitCount = getOptionalGgufU16(metadata.get(), kGgufSplitCountKey); + if (!splitCount.has_value()) { + throw std::runtime_error("Missing split.count metadata in split GGUF source"); + } else if (splitCount.value() != mergedSplitCount.value()) { + throw std::runtime_error( + "Inconsistent split.count metadata in split GGUF source: expected " + + std::to_string(mergedSplitCount.value()) + ", but got " + std::to_string(splitCount.value()) + ); + } + } + + for (ggml_tensor* tensor = ggml_get_first_tensor(tensorContextGuard.get()); tensor != nullptr; + tensor = ggml_get_next_tensor(tensorContextGuard.get(), tensor)) { + gguf_add_tensor(ggufMetadata.get(), tensor); + mergedTensorCount++; + } + } + + if (mergedSplitCount.has_value() && mergedSplitCount.value() > 1) { + // mirror `gguf_merge` in `llama.cpp/tools/gguf-split/gguf-split.cpp`: + // copy the KV metadata from the first split, append tensors from all splits, + // then normalize the split bookkeeping so the merged context behaves like + // a single spliced GGUF instead of shard 0 with extra tensors appended. 
+ gguf_set_val_u16(ggufMetadata.get(), kGgufSplitNoKey, 0); + gguf_set_val_u16(ggufMetadata.get(), kGgufSplitCountKey, 0); + gguf_set_val_i32(ggufMetadata.get(), kGgufSplitTensorsCountKey, mergedTensorCount); + } + } catch (const std::exception& e) { + SetError(e.what()); + } catch (...) { + SetError("Unknown error when loading GGUF metadata from the given sources"); + } + } + void OnOK() { + deferred.Resolve(Env().Undefined()); + } + void OnError(const Napi::Error& err) { + deferred.Reject(err.Value()); + } +}; + +Napi::Value AddonGgufMetadata::Init(const Napi::CallbackInfo& info) { + if (disposed) { + Napi::Error::New(info.Env(), "Metadata is disposed").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } + + std::vector metadataSources; + std::vector>> bufferRefs; + + if (info.Length() == 0 || !info[0].IsArray()) { + Napi::TypeError::New(info.Env(), "Expected an array of sources as the first argument").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } + + const auto sourceBufferValues = info[0].As(); + const uint32_t sourcesCount = sourceBufferValues.Length(); + + if (sourcesCount == 0) { + Napi::TypeError::New(info.Env(), "Expected source array to contain at least one item").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } + + metadataSources.reserve(sourcesCount); + bufferRefs.reserve(sourcesCount); + + for (uint32_t i = 0; i < sourcesCount; i++) { + const auto sourceBufferValue = sourceBufferValues.Get(i); + if (sourceBufferValue.IsBuffer()) { + const auto sourceBuffer = sourceBufferValue.As>(); + metadataSources.emplace_back(AddonGgufMetadataSource(AddonGgufMetadataSourceBuffer(sourceBuffer.Data(), sourceBuffer.Length()))); + bufferRefs.emplace_back(Napi::Persistent(sourceBuffer)); + } else if (sourceBufferValue.IsString()) { + const auto sourcePath = sourceBufferValue.As().Utf8Value(); + metadataSources.emplace_back(AddonGgufMetadataSource(sourcePath)); + } else { + Napi::TypeError::New(info.Env(), "Expected 
every source array item to be a Buffer or a string").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } + } + + + AddonGgufMetadataInitWorker* worker = new AddonGgufMetadataInitWorker(info.Env(), this); + worker->sources.swap(metadataSources); + worker->bufferRefs.swap(bufferRefs); + + worker->Queue(); + return worker->GetPromise(); +} + +void AddonGgufMetadata::init(Napi::Object exports) { + exports.Set( + "AddonGgufMetadata", + DefineClass( + exports.Env(), + "AddonGgufMetadata", + { + InstanceMethod("init", &AddonGgufMetadata::Init), + InstanceMethod("dispose", &AddonGgufMetadata::Dispose), + } + ) + ); +} diff --git a/llama/addon/AddonGgufMetadata.h b/llama/addon/AddonGgufMetadata.h new file mode 100644 index 00000000..11521543 --- /dev/null +++ b/llama/addon/AddonGgufMetadata.h @@ -0,0 +1,48 @@ +#pragma once + +#include + +#include "ggml-cpp.h" +#include "napi.h" + + +enum class AddonGgufMetadataSourceType { + path = 0, + buffer = 1 +}; + +struct AddonGgufMetadataSourceBuffer { + const uint8_t* data = nullptr; + std::size_t length = 0; + + AddonGgufMetadataSourceBuffer() = default; + AddonGgufMetadataSourceBuffer(const uint8_t* data, std::size_t length) : data(data), length(length) { + } +}; + +struct AddonGgufMetadataSource { + AddonGgufMetadataSourceType type = AddonGgufMetadataSourceType::path; + std::string path; + AddonGgufMetadataSourceBuffer buffer; + + AddonGgufMetadataSource() = default; + explicit AddonGgufMetadataSource(std::string path) : type(AddonGgufMetadataSourceType::path), path(std::move(path)) { + } + explicit AddonGgufMetadataSource(AddonGgufMetadataSourceBuffer buffer) : type(AddonGgufMetadataSourceType::buffer), buffer(buffer) { + } +}; + +class AddonGgufMetadata : public Napi::ObjectWrap { + public: + gguf_context_ptr ggufMetadata; + bool disposed = false; + + AddonGgufMetadata(const Napi::CallbackInfo& info); + ~AddonGgufMetadata(); + void dispose(); + + Napi::Value Init(const Napi::CallbackInfo& info); + Napi::Value 
Dispose(const Napi::CallbackInfo& info); + + static void init(Napi::Object exports); +}; diff --git a/llama/addon/AddonModel.cpp b/llama/addon/AddonModel.cpp index 94b4e576..72cfc79e 100644 --- a/llama/addon/AddonModel.cpp +++ b/llama/addon/AddonModel.cpp @@ -4,9 +4,12 @@ #include "globals/addonProgress.h" #include "common/common.h" #include "llama.h" +#include "llama-model.h" +#include "gguf.h" #include "AddonModel.h" #include "AddonModelData.h" #include "AddonModelLora.h" +#include "AddonGgufMetadata.h" static Napi::Value getNapiToken(const Napi::CallbackInfo& info, const llama_vocab* vocab, llama_token token) { if (token < 0 || token == LLAMA_TOKEN_NULL) { @@ -69,18 +72,37 @@ static bool llamaModelParamsProgressCallback(float progress, void * user_data) { return !(addonModel->abortModelLoad); } +struct ModelEstimatorTensorAccessState { + bool accessedTensorData = false; +}; + +static void markUnexpectedTensorDataAccess(struct ggml_tensor * /* tensor */, void * userData) { + auto * tensorAccessState = static_cast(userData); + if (tensorAccessState != nullptr) { + tensorAccessState->accessedTensorData = true; + } +} + class AddonModelLoadModelWorker : public Napi::AsyncWorker { public: AddonModel* model; + AddonGgufMetadata* ggufMetadata = nullptr; - AddonModelLoadModelWorker(const Napi::Env& env, AddonModel* model) + AddonModelLoadModelWorker(const Napi::Env& env, AddonModel* model, AddonGgufMetadata* ggufMetadata) : Napi::AsyncWorker(env, "AddonModelLoadModelWorker"), model(model), + ggufMetadata(ggufMetadata), deferred(Napi::Promise::Deferred::New(env)) { model->Ref(); + if (ggufMetadata != nullptr) { + ggufMetadata->Ref(); + } } ~AddonModelLoadModelWorker() { model->Unref(); + if (ggufMetadata != nullptr) { + ggufMetadata->Unref(); + } } Napi::Promise GetPromise() { @@ -92,10 +114,41 @@ class AddonModelLoadModelWorker : public Napi::AsyncWorker { void Execute() { try { - model->model = llama_model_load_from_file(model->modelPath.c_str(), model->model_params); 
- model->vocab = llama_model_get_vocab(model->model); + if (model->modelPath != "" && ggufMetadata == nullptr) { + model->model = llama_model_load_from_file(model->modelPath.c_str(), model->model_params); + } else { + if (!model->model_params.no_alloc) { + throw std::runtime_error("Loading a model from source buffers requires no_alloc=true"); + } else if (ggufMetadata->disposed || ggufMetadata->ggufMetadata.get() == nullptr) { + throw std::runtime_error("GGUF metadata is disposed"); + } - model->modelLoaded = model->model != nullptr && model->model != NULL; + ModelEstimatorTensorAccessState tensorAccessState; + model->model = llama_model_init_from_user( + ggufMetadata->ggufMetadata.get(), + markUnexpectedTensorDataAccess, + &tensorAccessState, + model->model_params + ); + + if (tensorAccessState.accessedTensorData) { + if (model->model != nullptr) { + llama_model_free(model->model); + model->model = nullptr; + } + + throw std::runtime_error( + "Unexpected tensor data access when loading a model from source buffers with no_alloc=true" + ); + } + } + + if (model->model != nullptr) { + model->vocab = llama_model_get_vocab(model->model); + model->modelLoaded = true; + } else { + model->modelLoaded = false; + } } catch (const std::exception& e) { SetError(e.what()); } catch(...) 
{ @@ -103,9 +156,9 @@ class AddonModelLoadModelWorker : public Napi::AsyncWorker { } } void OnOK() { - if (model->modelLoaded) { + if (model->modelLoaded && !model->model_params.no_alloc) { uint64_t modelSize = llama_model_size(model->model); - adjustNapiExternalMemoryAdd(Env(), modelSize); + adjustNapiExternalMemoryAdd(model->Env(), modelSize); model->loadedModelSize = modelSize; } @@ -116,6 +169,9 @@ class AddonModelLoadModelWorker : public Napi::AsyncWorker { } void OnError(const Napi::Error& err) { deferred.Reject(err.Value()); + if (model->onLoadProgressEventCallbackSet) { + model->addonThreadSafeOnLoadProgressEventCallback.Release(); + } } }; @@ -142,20 +198,15 @@ class AddonModelUnloadModelWorker : public Napi::AsyncWorker { void Execute() { try { - llama_model_free(model->model); - model->modelLoaded = false; - - model->dispose(); + model->disposeMemory(); } catch (const std::exception& e) { SetError(e.what()); } catch(...) { - SetError("Unknown error when calling \"llama_model_free\""); + SetError("Unknown error when disposing model memory"); } } void OnOK() { - adjustNapiExternalMemorySubtract(Env(), model->loadedModelSize); - model->loadedModelSize = 0; - + model->disposeMT(); deferred.Resolve(Env().Undefined()); } void OnError(const Napi::Error& err) { @@ -203,12 +254,18 @@ class AddonModelLoadLoraWorker : public Napi::AsyncWorker { } modelLora->lora_adapter = loraAdapter; - modelLora->model->Ref(); - if (modelLora->model->data != nullptr) { - modelLora->model->data->loraAdapters.insert(modelLora); - } else { - modelLora->dispose(true); + bool hasModelData = false; + { + std::lock_guard modelLock(modelLora->model->disposeMutex); + hasModelData = !modelLora->model->disposed + && !modelLora->model->memoryDisposed + && modelLora->model->data != nullptr; + } + + if (!hasModelData) { + llama_adapter_lora_free(modelLora->lora_adapter); + modelLora->lora_adapter = nullptr; SetError("Model data is not initialized"); } } catch (const std::exception& e) { @@ 
-218,6 +275,32 @@ class AddonModelLoadLoraWorker : public Napi::AsyncWorker { } } void OnOK() { + bool shouldReject = false; + + { + std::lock_guard modelLock(modelLora->model->disposeMutex); + std::lock_guard loraLock(modelLora->disposeMutex); + + shouldReject = modelLora->disposed + || modelLora->model->disposed + || modelLora->model->memoryDisposed + || modelLora->model->data == nullptr; + + if (!shouldReject) { + modelLora->model->Ref(); + modelLora->hasModelRef = true; + modelLora->Ref(); + modelLora->hasSelfRef = true; + modelLora->model->data->addLora(modelLora); + } + } + + if (shouldReject) { + modelLora->disposeMemory(); + deferred.Reject(Napi::Error::New(Env(), "Model or LoRA was disposed before LoRA load completed").Value()); + return; + } + deferred.Resolve(Env().Undefined()); } void OnError(const Napi::Error& err) { @@ -225,11 +308,11 @@ class AddonModelLoadLoraWorker : public Napi::AsyncWorker { } }; -AddonModel::AddonModel(const Napi::CallbackInfo& info) : Napi::ObjectWrap(info) { +AddonModel::AddonModel(const Napi::CallbackInfo& info) : + Napi::ObjectWrap(info) { data = new AddonModelData(); model_params = llama_model_default_params(); - // Get the model path modelPath = info[0].As().Utf8Value(); if (info.Length() > 1 && info[1].IsObject()) { @@ -264,6 +347,10 @@ AddonModel::AddonModel(const Napi::CallbackInfo& info) : Napi::ObjectWrap().Value(); } + if (options.Has("noAlloc")) { + model_params.no_alloc = options.Get("noAlloc").As().Value(); + } + if (options.Has("onLoadProgress")) { auto onLoadProgressJSCallback = options.Get("onLoadProgress").As(); if (onLoadProgressJSCallback.IsFunction()) { @@ -351,35 +438,102 @@ AddonModel::AddonModel(const Napi::CallbackInfo& info) : Napi::ObjectWrap lock(disposeMutex); + + if (memoryDisposed) { + return; + } + + memoryDisposed = true; + + currentData = data; + currentModel = model; + model = nullptr; + vocab = nullptr; + modelLoaded = false; } - disposed = true; - - if (data != nullptr) { - auto currentData 
= data; + if (currentData != nullptr) { + currentData->disposeMemory(); + } + + if (currentModel != nullptr) { + llama_model_free(currentModel); + } +} + +void AddonModel::disposeMT() { + AddonModelData* currentData = nullptr; + uint64_t currentLoadedModelSize = 0; + bool shouldUnrefAddonExports = false; + + disposeMemory(); + + { + std::lock_guard lock(disposeMutex); + + if (disposed) { + return; + } + + disposed = true; + currentData = data; data = nullptr; - delete currentData; + + if (!model_params.no_alloc) { + currentLoadedModelSize = loadedModelSize; + loadedModelSize = 0; + } + + shouldUnrefAddonExports = hasAddonExportsRef; + hasAddonExportsRef = false; } - if (modelLoaded) { - modelLoaded = false; - llama_model_free(model); + if (currentData != nullptr) { + currentData->disposeMT(); + delete currentData; + } - adjustNapiExternalMemorySubtract(Env(), loadedModelSize); - loadedModelSize = 0; + if (currentLoadedModelSize > 0) { + adjustNapiExternalMemorySubtract(Env(), currentLoadedModelSize); } - if (hasAddonExportsRef) { + if (shouldUnrefAddonExports) { addonExportsRef.Unref(); - hasAddonExportsRef = false; + } +} + +Napi::Value AddonModel::Dispose(const Napi::CallbackInfo& info) { + if (disposed) { + return info.Env().Undefined(); + } + + if (modelLoaded) { + AddonModelUnloadModelWorker* worker = new AddonModelUnloadModelWorker(this->Env(), this); + worker->Queue(); + return worker->GetPromise(); + } else { + disposeMT(); + + Napi::Promise::Deferred deferred = Napi::Promise::Deferred::New(info.Env()); + deferred.Resolve(info.Env().Undefined()); + return deferred.Promise(); } } @@ -389,11 +543,25 @@ Napi::Value AddonModel::Init(const Napi::CallbackInfo& info) { return info.Env().Undefined(); } - AddonModelLoadModelWorker* worker = new AddonModelLoadModelWorker(this->Env(), this); + AddonGgufMetadata* ggufMetadata = nullptr; + if (info.Length() > 0 && !info[0].IsUndefined()) { + ggufMetadata = Napi::ObjectWrap::Unwrap(info[0].As()); + if (ggufMetadata == 
nullptr || ggufMetadata->ggufMetadata.get() == nullptr) { + Napi::TypeError::New(info.Env(), "Invalid GGUF metadata object").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } + } + + AddonModelLoadModelWorker* worker = new AddonModelLoadModelWorker(this->Env(), this, ggufMetadata); worker->Queue(); return worker->GetPromise(); } Napi::Value AddonModel::LoadLora(const Napi::CallbackInfo& info) { + if (disposed) { + Napi::Error::New(info.Env(), "Model is disposed").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } + AddonModelLora* modelLora = Napi::ObjectWrap::Unwrap(info[0].As()); AddonModelLoadLoraWorker* worker = new AddonModelLoadLoraWorker(this->Env(), modelLora); worker->Queue(); @@ -403,26 +571,6 @@ Napi::Value AddonModel::AbortActiveModelLoad(const Napi::CallbackInfo& info) { abortModelLoad = true; return info.Env().Undefined(); } -Napi::Value AddonModel::Dispose(const Napi::CallbackInfo& info) { - if (disposed) { - return info.Env().Undefined(); - } - - if (modelLoaded) { - modelLoaded = false; - - AddonModelUnloadModelWorker* worker = new AddonModelUnloadModelWorker(this->Env(), this); - worker->Queue(); - return worker->GetPromise(); - } else { - dispose(); - - Napi::Promise::Deferred deferred = Napi::Promise::Deferred::New(info.Env()); - deferred.Resolve(info.Env().Undefined()); - return deferred.Promise(); - } -} - Napi::Value AddonModel::Tokenize(const Napi::CallbackInfo& info) { if (disposed) { Napi::Error::New(info.Env(), "Model is disposed").ThrowAsJavaScriptException(); @@ -515,6 +663,48 @@ Napi::Value AddonModel::GetModelDescription(const Napi::CallbackInfo& info) { return Napi::String::New(info.Env(), model_desc, actual_length); } +Napi::Value AddonModel::GetMemoryBreakdown(const Napi::CallbackInfo& info) { + if (disposed) { + Napi::Error::New(info.Env(), "Model is disposed").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } + + if (!modelLoaded || model == nullptr) { + 
Napi::Error::New(info.Env(), "Model is not loaded").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } + + std::size_t cpuRam = 0; + std::size_t gpuVram = 0; + + for (const auto& [bufferType, size] : model->memory_breakdown()) { + if (size == 0) { + continue; + } + + if (ggml_backend_buft_is_host(bufferType)) { + cpuRam += size; + } else { + ggml_backend_dev_t device = ggml_backend_buft_get_device(bufferType); + if (device != nullptr) { + auto deviceType = ggml_backend_dev_type(device); + if (deviceType == GGML_BACKEND_DEVICE_TYPE_GPU || deviceType == GGML_BACKEND_DEVICE_TYPE_IGPU) { + gpuVram += size; + } else { + cpuRam += size; + } + } else { + cpuRam += size; + } + } + } + + Napi::Object result = Napi::Object::New(info.Env()); + result.Set("cpuRam", Napi::Number::New(info.Env(), cpuRam)); + result.Set("gpuVram", Napi::Number::New(info.Env(), gpuVram)); + return result; +} + Napi::Value AddonModel::TokenBos(const Napi::CallbackInfo& info) { if (disposed) { Napi::Error::New(info.Env(), "Model is disposed").ThrowAsJavaScriptException(); @@ -669,6 +859,7 @@ void AddonModel::init(Napi::Object exports) { InstanceMethod("getTotalSize", &AddonModel::GetTotalSize), InstanceMethod("getTotalParameters", &AddonModel::GetTotalParameters), InstanceMethod("getModelDescription", &AddonModel::GetModelDescription), + InstanceMethod("getMemoryBreakdown", &AddonModel::GetMemoryBreakdown), InstanceMethod("tokenBos", &AddonModel::TokenBos), InstanceMethod("tokenEos", &AddonModel::TokenEos), InstanceMethod("tokenNl", &AddonModel::TokenNl), diff --git a/llama/addon/AddonModel.h b/llama/addon/AddonModel.h index b6661d35..ff5b70dd 100644 --- a/llama/addon/AddonModel.h +++ b/llama/addon/AddonModel.h @@ -1,9 +1,17 @@ #pragma once + +#include +#include +#include +#include +#include + #include "llama.h" #include "napi.h" #include "addonGlobals.h" #include "globals/addonProgress.h" + class AddonModel : public Napi::ObjectWrap { public: llama_model_params model_params; @@ 
-14,6 +22,7 @@ class AddonModel : public Napi::ObjectWrap { Napi::Reference addonExportsRef; bool hasAddonExportsRef = false; AddonModelData* data; + std::mutex disposeMutex; std::string modelPath; bool modelLoaded = false; @@ -26,10 +35,12 @@ class AddonModel : public Napi::ObjectWrap { bool hasLoadAbortSignal = false; bool disposed = false; + bool memoryDisposed = false; AddonModel(const Napi::CallbackInfo& info); ~AddonModel(); - void dispose(); + void disposeMemory(); + void disposeMT(); Napi::Value Init(const Napi::CallbackInfo& info); Napi::Value LoadLora(const Napi::CallbackInfo& info); @@ -42,6 +53,7 @@ class AddonModel : public Napi::ObjectWrap { Napi::Value GetTotalSize(const Napi::CallbackInfo& info); Napi::Value GetTotalParameters(const Napi::CallbackInfo& info); Napi::Value GetModelDescription(const Napi::CallbackInfo& info); + Napi::Value GetMemoryBreakdown(const Napi::CallbackInfo& info); Napi::Value TokenBos(const Napi::CallbackInfo& info); Napi::Value TokenEos(const Napi::CallbackInfo& info); diff --git a/llama/addon/AddonModelData.cpp b/llama/addon/AddonModelData.cpp index 3c1758a3..633e0162 100644 --- a/llama/addon/AddonModelData.cpp +++ b/llama/addon/AddonModelData.cpp @@ -1,4 +1,4 @@ -#include +#include #include "addonGlobals.h" #include "AddonModelData.h" @@ -8,18 +8,59 @@ AddonModelData::AddonModelData() { } AddonModelData::~AddonModelData() { - std::set currentLoraAdapters; - currentLoraAdapters.swap(loraAdapters); + disposeMT(); +} - for (auto lora : currentLoraAdapters) { - lora->dispose(true); - } - currentLoraAdapters.clear(); +void AddonModelData::addLora(AddonModelLora* lora) { + std::lock_guard lock(loraAdaptersMutex); + loraAdapters.insert(lora); } void AddonModelData::removeLora(AddonModelLora* lora) { + std::lock_guard lock(loraAdaptersMutex); auto pos = loraAdapters.find(lora); if (pos != loraAdapters.end()) { loraAdapters.erase(pos); } -} \ No newline at end of file + + pendingFinalization.erase( + 
std::remove(pendingFinalization.begin(), pendingFinalization.end(), lora), + pendingFinalization.end() + ); +} + +void AddonModelData::disposeMemory() { + std::vector currentLoraAdapters; + + { + std::lock_guard lock(loraAdaptersMutex); + currentLoraAdapters.reserve(loraAdapters.size()); + pendingFinalization.reserve(pendingFinalization.size() + loraAdapters.size()); + + for (auto* lora : loraAdapters) { + currentLoraAdapters.push_back(lora); + pendingFinalization.push_back(lora); + } + + loraAdapters.clear(); + } + + for (auto* lora : currentLoraAdapters) { + lora->disposeMemory(); + } +} + +void AddonModelData::disposeMT() { + std::vector currentPendingFinalization; + + disposeMemory(); + + { + std::lock_guard lock(loraAdaptersMutex); + currentPendingFinalization.swap(pendingFinalization); + } + + for (auto* lora : currentPendingFinalization) { + lora->disposeMT(true); + } +} diff --git a/llama/addon/AddonModelData.h b/llama/addon/AddonModelData.h index 78c82497..24c8ca2f 100644 --- a/llama/addon/AddonModelData.h +++ b/llama/addon/AddonModelData.h @@ -1,15 +1,22 @@ #pragma once #include +#include +#include #include "llama.h" #include "napi.h" #include "addonGlobals.h" class AddonModelData { public: + std::mutex loraAdaptersMutex; std::set loraAdapters; + std::vector pendingFinalization; AddonModelData(); ~AddonModelData(); + void addLora(AddonModelLora* lora); void removeLora(AddonModelLora* lora); -}; \ No newline at end of file + void disposeMemory(); + void disposeMT(); +}; diff --git a/llama/addon/AddonModelLora.cpp b/llama/addon/AddonModelLora.cpp index cf70f939..123384ee 100644 --- a/llama/addon/AddonModelLora.cpp +++ b/llama/addon/AddonModelLora.cpp @@ -26,7 +26,7 @@ class AddonModelLoraUnloadLoraWorker : public Napi::AsyncWorker { void Execute() { try { - addonLora->dispose(); + addonLora->disposeMemory(); } catch (const std::exception& e) { SetError(e.what()); } catch(...) 
{ @@ -34,6 +34,7 @@ class AddonModelLoraUnloadLoraWorker : public Napi::AsyncWorker { } } void OnOK() { + addonLora->disposeMT(); deferred.Resolve(Env().Undefined()); } void OnError(const Napi::Error& err) { @@ -48,19 +49,56 @@ AddonModelLora::AddonModelLora(const Napi::CallbackInfo& info) : Napi::ObjectWra } AddonModelLora::~AddonModelLora() { - dispose(); + disposeMT(); } -void AddonModelLora::dispose(bool skipErase) { +void AddonModelLora::disposeMemory() { + std::lock_guard lock(disposeMutex); + + if (memoryDisposed) { + return; + } + + memoryDisposed = true; + if (lora_adapter != nullptr) { + llama_adapter_lora_free(lora_adapter); lora_adapter = nullptr; + } +} + +void AddonModelLora::disposeMT(bool skipErase) { + bool shouldRemoveFromModel = false; + bool shouldUnrefModel = false; + bool shouldUnrefSelf = false; + + disposeMemory(); - if (!skipErase && model->data != nullptr) { - model->data->removeLora(this); + { + std::lock_guard lock(disposeMutex); + + if (disposed) { + return; } + disposed = true; + shouldRemoveFromModel = !skipErase; + shouldUnrefModel = hasModelRef; + shouldUnrefSelf = hasSelfRef; + hasModelRef = false; + hasSelfRef = false; + } + + if (shouldRemoveFromModel && model->data != nullptr) { + model->data->removeLora(this); + } + + if (shouldUnrefModel) { model->Unref(); } + if (shouldUnrefSelf) { + Unref(); + } } Napi::Value AddonModelLora::GetFilePath(const Napi::CallbackInfo& info) { @@ -77,13 +115,27 @@ void AddonModelLora::SetUsages(const Napi::CallbackInfo& info, const Napi::Value } Napi::Value AddonModelLora::Dispose(const Napi::CallbackInfo& info) { + if (disposed) { + Napi::Promise::Deferred deferred = Napi::Promise::Deferred::New(info.Env()); + deferred.Resolve(info.Env().Undefined()); + return deferred.Promise(); + } + + if (lora_adapter == nullptr) { + disposeMT(); + + Napi::Promise::Deferred deferred = Napi::Promise::Deferred::New(info.Env()); + deferred.Resolve(info.Env().Undefined()); + return deferred.Promise(); + } + 
AddonModelLoraUnloadLoraWorker* worker = new AddonModelLoraUnloadLoraWorker(this->Env(), this); worker->Queue(); return worker->GetPromise(); } Napi::Value AddonModelLora::GetDisposed(const Napi::CallbackInfo& info) { - return Napi::Boolean::New(info.Env(), lora_adapter == nullptr); + return Napi::Boolean::New(info.Env(), disposed); } void AddonModelLora::init(Napi::Object exports) { diff --git a/llama/addon/AddonModelLora.h b/llama/addon/AddonModelLora.h index a90344f0..3b44ec71 100644 --- a/llama/addon/AddonModelLora.h +++ b/llama/addon/AddonModelLora.h @@ -1,4 +1,5 @@ #pragma once +#include #include "llama.h" #include "napi.h" #include "addonGlobals.h" @@ -9,11 +10,17 @@ class AddonModelLora : public Napi::ObjectWrap { llama_adapter_lora * lora_adapter; std::string loraFilePath; uint32_t usages = 0; + std::mutex disposeMutex; + bool disposed = false; + bool memoryDisposed = false; + bool hasSelfRef = false; + bool hasModelRef = false; AddonModelLora(const Napi::CallbackInfo& info); ~AddonModelLora(); - void dispose(bool skipErase = false); + void disposeMemory(); + void disposeMT(bool skipErase = false); Napi::Value GetFilePath(const Napi::CallbackInfo& info); diff --git a/llama/addon/AddonSampler.cpp b/llama/addon/AddonSampler.cpp index cd65d066..cfea8fa4 100644 --- a/llama/addon/AddonSampler.cpp +++ b/llama/addon/AddonSampler.cpp @@ -122,10 +122,6 @@ void AddonSampler::rebuildChainIfNeeded() { llama_sampler_chain_add(chain, dryRepeatPenaltySampler); } - if (grammarEvaluationState != nullptr) { - llama_sampler_chain_add(chain, grammarEvaluationState->sampler); - } - if (greedySampler != nullptr) { if (xtcSampler != nullptr) { llama_sampler_chain_add(chain, xtcSampler); @@ -174,6 +170,96 @@ void AddonSampler::acceptToken(llama_token token) { } } +void AddonSampler::sample(struct llama_context* llamaContext, int32_t batchLogitIndex, llama_token_data_array& curP, bool forceGrammar) { + setTokenCandidates(llamaContext, batchLogitIndex, curP); + + if (forceGrammar 
&& grammarEvaluationState != nullptr && grammarEvaluationState->sampler != nullptr) { + llama_sampler_apply(grammarEvaluationState->sampler, &curP); + llama_sampler_apply(chain, &curP); + return; + } + + if (grammarEvaluationState == nullptr || grammarEvaluationState->sampler == nullptr) { + llama_sampler_apply(chain, &curP); + return; + } + + // test whether the sampled token would be accepted by the grammar, + // and otherwise apply the grammar first and then the rest of the chain + { + llama_sampler_apply(chain, &curP); + if (!(curP.selected >= 0 && curP.selected < (int32_t)curP.size)) { + return; + } + + llama_token_data singleTokenData = { curP.data[curP.selected].id, 1.0f, 0.0f }; + llama_token_data_array singleTokenDataArray = { &singleTokenData, 1, -1, false }; + + llama_sampler_apply(grammarEvaluationState->sampler, &singleTokenDataArray); + + const bool isValid = singleTokenData.logit != -INFINITY; + if (isValid) { + return; + } + + setTokenCandidates(llamaContext, batchLogitIndex, curP); + + llama_sampler_apply(grammarEvaluationState->sampler, &curP); + llama_sampler_apply(chain, &curP); + } +} + +void AddonSampler::setTokenCandidates(struct llama_context* llamaContext, int32_t batchLogitIndex, llama_token_data_array& curP) { + const float* sampledProbs = llama_get_sampled_probs_ith(llamaContext, batchLogitIndex); + const float* sampledLogits = llama_get_sampled_logits_ith(llamaContext, batchLogitIndex); + const llama_token* sampledIds = llama_get_sampled_candidates_ith(llamaContext, batchLogitIndex); + + const llama_model* model = llama_get_model(llamaContext); + const llama_vocab* vocab = llama_model_get_vocab(model); + + if (sampledProbs != nullptr) { + const uint32_t sampledProbsSize = llama_get_sampled_probs_count_ith(llamaContext, batchLogitIndex); + curP.size = sampledProbsSize; + + if (tokenCandidates.size() < sampledProbsSize) { + tokenCandidates.resize(sampledProbsSize); + } + + for (uint32_t i = 0; i < sampledProbsSize; i++) { + 
tokenCandidates[i] = llama_token_data { sampledIds[i], sampledLogits[i], sampledProbs[i] }; + } + } else if (sampledLogits != nullptr) { + const uint32_t sampledLogitsSize = llama_get_sampled_logits_count_ith(llamaContext, batchLogitIndex); + curP.size = sampledLogitsSize; + if (tokenCandidates.size() < sampledLogitsSize) { + tokenCandidates.resize(sampledLogitsSize); + } + + for (uint32_t i = 0; i < sampledLogitsSize; i++) { + tokenCandidates[i] = llama_token_data { sampledIds[i], sampledLogits[i], 0.0f }; + } + } else { + const auto* logits = llama_get_logits_ith(llamaContext, batchLogitIndex); + if (logits != nullptr) { + const auto vocabLength = llama_vocab_n_tokens(vocab); + curP.size = vocabLength; + if (tokenCandidates.size() < vocabLength) { + tokenCandidates.resize(vocabLength); + } + + for (llama_token tokenId = 0; tokenId < vocabLength; tokenId++) { + tokenCandidates[tokenId] = llama_token_data { tokenId, logits[tokenId], 0.0f }; + } + } else { + curP.size = 0; + } + } + + curP.data = tokenCandidates.data(); + curP.selected = -1; + curP.sorted = false; +} + Napi::Value AddonSampler::Dispose(const Napi::CallbackInfo& info) { dispose(); return info.Env().Undefined(); @@ -337,6 +423,8 @@ Napi::Value AddonSampler::ApplyConfig(const Napi::CallbackInfo& info) { if (config.Has("repeatPenaltyTokens")) { Napi::Uint32Array repeat_penalty_tokens_uint32_array = config.Get("repeatPenaltyTokens").As(); + const size_t repeatPenaltyTokensLength = repeat_penalty_tokens_uint32_array.ElementLength(); + const uint32_t* repeatPenaltyTokens = repeat_penalty_tokens_uint32_array.Data(); auto repeatPenalty = config.Has("repeatPenalty") ? 
config.Get("repeatPenalty").As().FloatValue() : 1; @@ -370,19 +458,19 @@ Napi::Value AddonSampler::ApplyConfig(const Napi::CallbackInfo& info) { existingSamplerMatchesConfig &= repeatPenalty_frequencyPenalty == repeatPenaltyFrequencyPenalty; if (existingSamplerMatchesConfig) { - if (repeat_penalty_tokens_uint32_array.ElementLength() > 0) { - const auto firstToken = static_cast(repeat_penalty_tokens_uint32_array[0]); + if (repeatPenaltyTokensLength > 0) { + const auto firstToken = static_cast(repeatPenaltyTokens[0]); if (repeatPenalty_lastTokens.rat(0) != firstToken && repeatPenalty_lastTokens.size() == repeatPenalty_maxTokens && - repeat_penalty_tokens_uint32_array.ElementLength() == repeatPenalty_maxTokens + repeatPenaltyTokensLength == static_cast(repeatPenalty_maxTokens) ) { - const auto lastToken = static_cast(repeat_penalty_tokens_uint32_array[repeat_penalty_tokens_uint32_array.ElementLength() - 1]); + const auto lastToken = static_cast(repeatPenaltyTokens[repeatPenaltyTokensLength - 1]); llama_sampler_accept(repeatPenaltySampler, lastToken); repeatPenalty_lastTokens.push_back(lastToken); } } - for (size_t i = 0; i < repeat_penalty_tokens_uint32_array.ElementLength() && existingSamplerMatchesConfig; i++) { - auto token = static_cast(repeat_penalty_tokens_uint32_array[i]); + for (size_t i = 0; i < repeatPenaltyTokensLength && existingSamplerMatchesConfig; i++) { + auto token = static_cast(repeatPenaltyTokens[i]); if (i < repeatPenalty_lastTokens.size()) { existingSamplerMatchesConfig &= repeatPenalty_lastTokens.rat(i) == token; @@ -411,9 +499,9 @@ Napi::Value AddonSampler::ApplyConfig(const Napi::CallbackInfo& info) { ); repeatPenalty_lastTokens = RingBuffer(repeatPenaltyMaxTokens); - for (size_t i = 0; i < repeat_penalty_tokens_uint32_array.ElementLength(); i++) { - llama_sampler_accept(repeatPenaltySampler, static_cast(repeat_penalty_tokens_uint32_array[i])); - repeatPenalty_lastTokens.push_back(static_cast(repeat_penalty_tokens_uint32_array[i])); + for 
(size_t i = 0; i < repeatPenaltyTokensLength; i++) { + llama_sampler_accept(repeatPenaltySampler, static_cast(repeatPenaltyTokens[i])); + repeatPenalty_lastTokens.push_back(static_cast(repeatPenaltyTokens[i])); } repeatPenalty_maxTokens = repeatPenaltyMaxTokens; @@ -584,8 +672,6 @@ Napi::Value AddonSampler::ApplyConfig(const Napi::CallbackInfo& info) { Napi::ObjectWrap::Unwrap(config.Get("grammarEvaluationState").As()); if (grammarEvaluationState != configGrammarEvaluationState) { - freeChain(); - if (grammarEvaluationState != nullptr) { grammarEvaluationState->Unref(); grammarEvaluationState = nullptr; @@ -595,7 +681,6 @@ Napi::Value AddonSampler::ApplyConfig(const Napi::CallbackInfo& info) { grammarEvaluationState->Ref(); } } else if (grammarEvaluationState != nullptr) { - freeChain(); grammarEvaluationState->Unref(); grammarEvaluationState = nullptr; } diff --git a/llama/addon/AddonSampler.h b/llama/addon/AddonSampler.h index 3218b11d..77735d04 100644 --- a/llama/addon/AddonSampler.h +++ b/llama/addon/AddonSampler.h @@ -64,6 +64,8 @@ class AddonSampler : public Napi::ObjectWrap { void freeChain(); void rebuildChainIfNeeded(); void acceptToken(llama_token token); + void sample(struct llama_context* llamaContext, int32_t batchLogitIndex, llama_token_data_array& curP, bool forceGrammar); + void setTokenCandidates(struct llama_context* llamaContext, int32_t batchLogitIndex, llama_token_data_array& curP); Napi::Value Dispose(const Napi::CallbackInfo& info); Napi::Value ApplyConfig(const Napi::CallbackInfo& info); diff --git a/llama/addon/addon.cpp b/llama/addon/addon.cpp index 2af3d909..1f5c2a75 100644 --- a/llama/addon/addon.cpp +++ b/llama/addon/addon.cpp @@ -1,19 +1,22 @@ -#include "addonGlobals.h" -#include "AddonModel.h" -#include "AddonModelLora.h" +#include +#include +#include + +#include "AddonContext.h" +#include "AddonGgufMetadata.h" #include "AddonGrammar.h" #include "AddonGrammarEvaluationState.h" +#include "AddonModel.h" +#include "AddonModelLora.h" 
#include "AddonSampler.h" -#include "AddonContext.h" +#include "addonGlobals.h" #include "globals/addonLog.h" #include "globals/addonProgress.h" #include "globals/getGpuInfo.h" +#include "globals/getProcessMemoryInfo.h" #include "globals/getSwapInfo.h" -#include "globals/getMemoryInfo.h" - -#include -#include -#include +#include "globals/getSystemMemoryInfo.h" +#include "globals/addonEnv.h" std::mutex backendMutex; bool backendInitialized = false; @@ -168,9 +171,9 @@ class AddonBackendUnloadWorker : public Napi::AsyncWorker { Napi::Promise::Deferred deferred; void Execute() { - std::lock_guard lock(backendMutex); - try { + std::lock_guard lock(backendMutex); + if (backendInitialized) { backendInitialized = false; llama_backend_free(); @@ -236,7 +239,7 @@ Napi::Value addonSetNuma(const Napi::CallbackInfo& info) { } Napi::Value markLoaded(const Napi::CallbackInfo& info) { - static std::atomic_bool loaded{false}; + static std::atomic_bool loaded(false); return Napi::Boolean::New(info.Env(), loaded.exchange(true)); } @@ -306,12 +309,15 @@ Napi::Object registerCallback(Napi::Env env, Napi::Object exports) { Napi::PropertyDescriptor::Function("getGpuType", getGpuType), Napi::PropertyDescriptor::Function("ensureGpuDeviceIsSupported", ensureGpuDeviceIsSupported), Napi::PropertyDescriptor::Function("getSwapInfo", getSwapInfo), - Napi::PropertyDescriptor::Function("getMemoryInfo", getMemoryInfo), + Napi::PropertyDescriptor::Function("getProcessMemoryInfo", getProcessMemoryInfo), + Napi::PropertyDescriptor::Function("getSystemMemoryInfo", getSystemMemoryInfo), Napi::PropertyDescriptor::Function("loadBackends", addonLoadBackends), Napi::PropertyDescriptor::Function("setNuma", addonSetNuma), + Napi::PropertyDescriptor::Function("setEnv", addonSetEnv), Napi::PropertyDescriptor::Function("init", addonInit), Napi::PropertyDescriptor::Function("dispose", addonDispose), }); + AddonGgufMetadata::init(exports); AddonModel::init(exports); AddonModelLora::init(exports); 
AddonGrammar::init(exports); diff --git a/llama/addon/addonGlobals.cpp b/llama/addon/addonGlobals.cpp index 2d73c466..69584fbc 100644 --- a/llama/addon/addonGlobals.cpp +++ b/llama/addon/addonGlobals.cpp @@ -4,18 +4,24 @@ #include "napi.h" void adjustNapiExternalMemoryAdd(Napi::Env env, uint64_t size) { - const uint64_t chunkSize = std::numeric_limits::max(); + constexpr uint64_t chunkSize = (std::min)( + static_cast(512LL * 1024LL * 1024LL), + static_cast(std::numeric_limits::max()) + ); while (size > 0) { - int64_t adjustSize = std::min(size, chunkSize); + int64_t adjustSize = (std::min)(size, chunkSize); Napi::MemoryManagement::AdjustExternalMemory(env, adjustSize); size -= adjustSize; } } void adjustNapiExternalMemorySubtract(Napi::Env env, uint64_t size) { - const uint64_t chunkSize = std::numeric_limits::max(); + constexpr uint64_t chunkSize = (std::min)( + static_cast(512LL * 1024LL * 1024LL), + static_cast(std::numeric_limits::max()) + ); while (size > 0) { - int64_t adjustSize = std::min(size, chunkSize); + int64_t adjustSize = (std::min)(size, chunkSize); Napi::MemoryManagement::AdjustExternalMemory(env, -adjustSize); size -= adjustSize; } diff --git a/llama/addon/addonGlobals.h b/llama/addon/addonGlobals.h index 1a4dd8d1..5edffa37 100644 --- a/llama/addon/addonGlobals.h +++ b/llama/addon/addonGlobals.h @@ -1,6 +1,7 @@ #pragma once #include "napi.h" +class AddonGgufMetadata; class AddonModel; class AddonModelLora; class AddonModelData; diff --git a/llama/addon/globals/addonEnv.cpp b/llama/addon/globals/addonEnv.cpp new file mode 100644 index 00000000..16ce15a0 --- /dev/null +++ b/llama/addon/globals/addonEnv.cpp @@ -0,0 +1,49 @@ +#include "addonEnv.h" + +#include +#include + +#ifdef _WIN32 +#include +#else +#include +#endif + + +Napi::Value addonSetEnv(const Napi::CallbackInfo& info) { + const std::string varName = info[0].As().Utf8Value(); + + const bool shouldUnset = info.Length() == 1 || ( + info.Length() > 1 && ( + info[1].IsUndefined() || + 
info[1].IsNull() || + (info[1].IsString() && info[1].As().Utf8Value().empty()) + ) + ); + if (shouldUnset) { +#ifdef _WIN32 + if (_putenv_s(varName.c_str(), "") != 0) { + return Napi::Boolean::New(info.Env(), false); + } +#else + if (unsetenv(varName.c_str()) != 0) { + return Napi::Boolean::New(info.Env(), false); + } +#endif + } else { + const std::string varValue = info[1].As().Utf8Value(); + const bool overwrite = info.Length() > 2 && info[2].IsBoolean() ? info[2].As().Value() : true; + +#ifdef _WIN32 + if (_putenv_s(varName.c_str(), varValue.c_str()) != 0) { + return Napi::Boolean::New(info.Env(), false); + } +#else + if (setenv(varName.c_str(), varValue.c_str(), overwrite ? 1 : 0) != 0) { + return Napi::Boolean::New(info.Env(), false); + } +#endif + } + + return Napi::Boolean::New(info.Env(), true); +} diff --git a/llama/addon/globals/addonEnv.h b/llama/addon/globals/addonEnv.h new file mode 100644 index 00000000..31c03193 --- /dev/null +++ b/llama/addon/globals/addonEnv.h @@ -0,0 +1,4 @@ +#pragma once +#include "napi.h" + +Napi::Value addonSetEnv(const Napi::CallbackInfo& info); diff --git a/llama/addon/globals/getGpuInfo.cpp b/llama/addon/globals/getGpuInfo.cpp index 34c0a94a..9ded4cba 100644 --- a/llama/addon/globals/getGpuInfo.cpp +++ b/llama/addon/globals/getGpuInfo.cpp @@ -37,7 +37,10 @@ Napi::Value getGpuVramInfo(const Napi::CallbackInfo& info) { used += deviceTotal - deviceFree; #if defined(__arm64__) || defined(__aarch64__) - if (std::string(ggml_backend_dev_name(device)) == "Metal") { + ggml_backend_reg_t backend = ggml_backend_dev_backend_reg(device); + const auto backendName = std::string(backend == nullptr ? 
"" : ggml_backend_reg_name(backend)); + + if (backendName == "MTL" || backendName == "Metal") { unifiedVramSize += deviceTotal; } #endif @@ -121,6 +124,22 @@ std::pair getGpuDevice() { } Napi::Value getGpuType(const Napi::CallbackInfo& info) { + for (size_t i = 0; i < ggml_backend_reg_count(); i++) { + ggml_backend_reg_t backend = ggml_backend_reg_get(i); + const auto backendName = std::string(ggml_backend_reg_name(backend)); + + if (backendName == "MTL" || backendName == "Metal") { + return Napi::String::New(info.Env(), "metal"); + } else if (backendName == "Vulkan") { + return Napi::String::New(info.Env(), "vulkan"); + } + + // else if ( + // backendName == "CUDA" || backendName == "ROCm" || backendName == "MUSA") { + // return Napi::String::New(info.Env(), "cuda"); + // } + } + const auto gpuDeviceRes = getGpuDevice(); const auto device = gpuDeviceRes.first; const auto deviceType = gpuDeviceRes.second; diff --git a/llama/addon/globals/getMemoryInfo.h b/llama/addon/globals/getMemoryInfo.h deleted file mode 100644 index fbcf2de4..00000000 --- a/llama/addon/globals/getMemoryInfo.h +++ /dev/null @@ -1,4 +0,0 @@ -#pragma once -#include "napi.h" - -Napi::Value getMemoryInfo(const Napi::CallbackInfo& info); diff --git a/llama/addon/globals/getMemoryInfo.cpp b/llama/addon/globals/getProcessMemoryInfo.cpp similarity index 95% rename from llama/addon/globals/getMemoryInfo.cpp rename to llama/addon/globals/getProcessMemoryInfo.cpp index 1825c425..21f292af 100644 --- a/llama/addon/globals/getMemoryInfo.cpp +++ b/llama/addon/globals/getProcessMemoryInfo.cpp @@ -1,4 +1,4 @@ -#include "getMemoryInfo.h" +#include "getProcessMemoryInfo.h" #include "addonLog.h" #ifdef __APPLE__ @@ -16,7 +16,7 @@ #endif -Napi::Value getMemoryInfo(const Napi::CallbackInfo& info) { +Napi::Value getProcessMemoryInfo(const Napi::CallbackInfo& info) { uint64_t totalMemoryUsage = 0; #ifdef __APPLE__ diff --git a/llama/addon/globals/getProcessMemoryInfo.h b/llama/addon/globals/getProcessMemoryInfo.h new 
file mode 100644 index 00000000..098a68b4 --- /dev/null +++ b/llama/addon/globals/getProcessMemoryInfo.h @@ -0,0 +1,4 @@ +#pragma once +#include "napi.h" + +Napi::Value getProcessMemoryInfo(const Napi::CallbackInfo& info); diff --git a/llama/addon/globals/getSystemMemoryInfo.cpp b/llama/addon/globals/getSystemMemoryInfo.cpp new file mode 100644 index 00000000..3567c04c --- /dev/null +++ b/llama/addon/globals/getSystemMemoryInfo.cpp @@ -0,0 +1,464 @@ +#include "getSystemMemoryInfo.h" +#include "addonLog.h" + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __APPLE__ +#include +#include +#elif __linux__ +#include +#include +#include +#elif _WIN32 +#include +#include +#endif + +struct AddonSystemMemoryInfo { + std::optional total = std::nullopt; + std::optional wired = std::nullopt; + std::optional free = std::nullopt; +}; + +static std::optional multiplyUint64(uint64_t left, uint64_t right) { + if (left == 0 || right == 0) { + return uint64_t(0); + } + + if (left > ((std::numeric_limits::max)() / right)) { + return std::nullopt; + } + + return left * right; +} + +#ifdef __APPLE__ +struct AddonHostPortScopeExit { + mach_port_t hostPort; + + explicit AddonHostPortScopeExit(mach_port_t hostPort) + : hostPort(hostPort) { + } + + ~AddonHostPortScopeExit() { + if (hostPort != MACH_PORT_NULL) + mach_port_deallocate(mach_task_self(), hostPort); + } +}; +#endif + +#ifdef __linux__ +static bool isLinuxMeminfoWhitespace(char character) { + return character == ' ' || character == '\t'; +} + +static std::string toLowerAscii(std::string value) { + for (char& character : value) { + character = static_cast(std::tolower(static_cast(character))); + } + + return value; +} + +static std::optional parseLinuxMeminfoNumericValue(const std::string& value, size_t& index) { + long double parsedValue = 0; + bool sawDigit = false; + + while (index < value.size() && std::isdigit(static_cast(value[index]))) { + sawDigit = true; + parsedValue = (parsedValue * 10) + 
(value[index] - '0'); + index++; + } + + if (index < value.size() && value[index] == '.') { + index++; + + long double fractionalScale = 1; + while (index < value.size() && std::isdigit(static_cast(value[index]))) { + sawDigit = true; + fractionalScale *= 10; + parsedValue += static_cast(value[index] - '0') / fractionalScale; + index++; + } + } + + if (!sawDigit) { + return std::nullopt; + } + + return parsedValue; +} + +static std::optional getLinuxMeminfoUnitMultiplier(const std::string& rawUnit) { + if (rawUnit == "") { + return uint64_t(1); + } + + if (rawUnit == "kB") { + return uint64_t(1024); + } + + const std::string lowercaseUnit = toLowerAscii(rawUnit); + if (lowercaseUnit == "b" || lowercaseUnit == "byte" || lowercaseUnit == "bytes") { + return uint64_t(1); + } else if (lowercaseUnit == "kib" || lowercaseUnit == "ki") { + return uint64_t(1024); + } else if (lowercaseUnit == "kb" || lowercaseUnit == "k") { + return uint64_t(1000); + } else if (lowercaseUnit == "mb" || lowercaseUnit == "m") { + return uint64_t(1000) * 1000; + } else if (lowercaseUnit == "gb" || lowercaseUnit == "g") { + return uint64_t(1000) * 1000 * 1000; + } else if (lowercaseUnit == "tb" || lowercaseUnit == "t") { + return uint64_t(1000) * 1000 * 1000 * 1000; + } else if (lowercaseUnit == "pb" || lowercaseUnit == "p") { + return uint64_t(1000) * 1000 * 1000 * 1000 * 1000; + } else if (lowercaseUnit == "mib" || lowercaseUnit == "mi") { + return uint64_t(1024) * 1024; + } else if (lowercaseUnit == "gib" || lowercaseUnit == "gi") { + return uint64_t(1024) * 1024 * 1024; + } else if (lowercaseUnit == "tib" || lowercaseUnit == "ti") { + return uint64_t(1024) * 1024 * 1024 * 1024; + } else if (lowercaseUnit == "pib" || lowercaseUnit == "pi") { + return uint64_t(1024) * 1024 * 1024 * 1024 * 1024; + } + + return std::nullopt; +} + +static std::optional parseLinuxMeminfoValueBytes(const std::string& line, std::string& key) { + const size_t separatorIndex = line.find(':'); + if (separatorIndex == 
std::string::npos || separatorIndex == 0) { + return std::nullopt; + } + + key = line.substr(0, separatorIndex); + + size_t index = separatorIndex + 1; + while (index < line.size() && isLinuxMeminfoWhitespace(line[index])) { + index++; + } + + const std::optional parsedValue = parseLinuxMeminfoNumericValue(line, index); + if (!parsedValue.has_value()) { + return std::nullopt; + } + + while (index < line.size() && isLinuxMeminfoWhitespace(line[index])) { + index++; + } + + const size_t unitStartIndex = index; + while (index < line.size() && !isLinuxMeminfoWhitespace(line[index])) { + index++; + } + + const std::string unit = line.substr(unitStartIndex, index - unitStartIndex); + + while (index < line.size() && isLinuxMeminfoWhitespace(line[index])) { + index++; + } + + if (index != line.size()) { + return std::nullopt; + } + + const std::optional unitMultiplier = getLinuxMeminfoUnitMultiplier(unit); + if (!unitMultiplier.has_value()) { + return std::nullopt; + } + + const long double bytesValue = parsedValue.value() * unitMultiplier.value(); + if (!std::isfinite(bytesValue) || bytesValue < 0) { + return std::nullopt; + } + + const long double roundedBytesValue = std::round(bytesValue); + if (roundedBytesValue > static_cast((std::numeric_limits::max)())) { + return std::nullopt; + } + + return static_cast(roundedBytesValue); +} + +static std::optional parseLinuxVmstatValuePages(const std::string& line, std::string& key) { + const size_t separatorIndex = line.find_first_of(" \t"); + if (separatorIndex == std::string::npos || separatorIndex == 0) { + return std::nullopt; + } + + key = line.substr(0, separatorIndex); + + size_t index = separatorIndex; + while (index < line.size() && isLinuxMeminfoWhitespace(line[index])) { + index++; + } + + if (index == line.size()) { + return std::nullopt; + } + + uint64_t pageCount = 0; + bool sawDigit = false; + while (index < line.size() && std::isdigit(static_cast(line[index]))) { + sawDigit = true; + + const uint64_t digit = 
uint64_t(line[index] - '0'); + if (pageCount > (((std::numeric_limits::max)() - digit) / 10)) { + return std::nullopt; + } + + pageCount = (pageCount * 10) + digit; + index++; + } + + if (!sawDigit) { + return std::nullopt; + } + + while (index < line.size() && isLinuxMeminfoWhitespace(line[index])) { + index++; + } + + if (index != line.size()) { + return std::nullopt; + } + + return pageCount; +} + +static std::optional getLinuxPageSize() { + static const std::optional cachedLinuxPageSize = []() -> std::optional { + const long pageSize = sysconf(_SC_PAGESIZE); + if (pageSize <= 0) { + return std::nullopt; + } + + return uint64_t(pageSize); + }(); + + return cachedLinuxPageSize; +} +#endif + +static AddonSystemMemoryInfo retrieveSystemMemoryInfo() { + AddonSystemMemoryInfo systemMemoryInfo; + +#ifdef __APPLE__ + { + static const std::optional cachedPhysicalMemory = []() -> std::optional { + uint64_t physicalMemory = 0; + size_t physicalMemorySize = sizeof(physicalMemory); + if (sysctlbyname("hw.memsize", &physicalMemory, &physicalMemorySize, NULL, 0) == 0) { + return physicalMemory; + } + + return std::nullopt; + }(); + + if (cachedPhysicalMemory.has_value()) { + systemMemoryInfo.total = cachedPhysicalMemory.value(); + } else { + addonLog(GGML_LOG_LEVEL_ERROR, "Failed to get total system memory"); + } + } + + mach_port_t hostPort = mach_host_self(); + AddonHostPortScopeExit hostPortGuard(hostPort); + if (vm_page_size != 0) { + vm_statistics64_data_t vmStats; + mach_msg_type_number_t infoCount = HOST_VM_INFO64_COUNT; + if (host_statistics64(hostPort, HOST_VM_INFO64, (host_info64_t)&vmStats, &infoCount) == KERN_SUCCESS) { + const uint64_t freeBytes = (uint64_t(vmStats.free_count) * vm_page_size); + const uint64_t wiredBytes = uint64_t(vmStats.wire_count) * vm_page_size; + + systemMemoryInfo.free = freeBytes; + systemMemoryInfo.wired = wiredBytes; + } else { + addonLog(GGML_LOG_LEVEL_ERROR, "Failed to get wired system memory"); + } + } else { + 
addonLog(GGML_LOG_LEVEL_ERROR, "Failed to get system page size"); + } +#elif __linux__ + { + struct sysinfo linuxSystemInfo; + if (sysinfo(&linuxSystemInfo) == 0) { + if (systemMemoryInfo.total.has_value()) { + systemMemoryInfo.total = multiplyUint64(uint64_t(linuxSystemInfo.totalram), uint64_t(linuxSystemInfo.mem_unit)); + } + + if (!systemMemoryInfo.free.has_value()) { + systemMemoryInfo.free = multiplyUint64(uint64_t(linuxSystemInfo.freeram), uint64_t(linuxSystemInfo.mem_unit)); + } + } + } + + const std::optional pageSize = getLinuxPageSize(); + std::optional nrMlockPages = std::nullopt; + if (pageSize.has_value()) { + std::ifstream procVmstat("/proc/vmstat"); + if (procVmstat.is_open()) { + std::string line; + + while (std::getline(procVmstat, line)) { + std::string key; + const std::optional valuePages = parseLinuxVmstatValuePages(line, key); + if (!valuePages.has_value()) { + continue; + } + + if (key == "nr_unevictable" && !systemMemoryInfo.wired.has_value()) { + systemMemoryInfo.wired = multiplyUint64(valuePages.value(), pageSize.value()); + } else if (key == "nr_mlock") { + nrMlockPages = valuePages.value(); + } else if (key == "nr_free_pages" && !systemMemoryInfo.free.has_value()) { + systemMemoryInfo.free = multiplyUint64(valuePages.value(), pageSize.value()); + } + + if (systemMemoryInfo.wired.has_value() && systemMemoryInfo.free.has_value()) { + break; + } + } + } + } + + if (!systemMemoryInfo.total.has_value() || !systemMemoryInfo.free.has_value() || !systemMemoryInfo.wired.has_value()) { + std::ifstream procMeminfo("/proc/meminfo"); + if (procMeminfo.is_open()) { + std::string line; + std::optional mlockedBytes = std::nullopt; + + while (std::getline(procMeminfo, line)) { + std::string key; + const std::optional valueBytes = parseLinuxMeminfoValueBytes(line, key); + if (!valueBytes.has_value()) { + continue; + } + + if (key == "MemTotal" && !systemMemoryInfo.total.has_value()) { + systemMemoryInfo.total = valueBytes.value(); + } else if (key == 
"Unevictable" && !systemMemoryInfo.wired.has_value()) { + systemMemoryInfo.wired = valueBytes.value(); + } else if (key == "MemFree" && !systemMemoryInfo.free.has_value()) { + systemMemoryInfo.free = valueBytes.value(); + } else if (key == "Mlocked") { + mlockedBytes = valueBytes.value(); + } + } + + if (!systemMemoryInfo.wired.has_value() && mlockedBytes.has_value()) { + systemMemoryInfo.wired = mlockedBytes.value(); + } + } + } + + if (!systemMemoryInfo.wired.has_value() && pageSize.has_value() && nrMlockPages.has_value()) { + systemMemoryInfo.wired = multiplyUint64(nrMlockPages.value(), pageSize.value()); + } + + if (!systemMemoryInfo.total.has_value()) { + addonLog(GGML_LOG_LEVEL_ERROR, "Failed to get total system memory"); + } + + if (!systemMemoryInfo.free.has_value()) { + addonLog(GGML_LOG_LEVEL_ERROR, "Failed to get free system memory"); + } + + if (!systemMemoryInfo.wired.has_value()) { + addonLog(GGML_LOG_LEVEL_ERROR, "Failed to get wired system memory"); + } +#elif _WIN32 + MEMORYSTATUSEX memoryStatus; + memoryStatus.dwLength = sizeof(MEMORYSTATUSEX); + + if (GlobalMemoryStatusEx(&memoryStatus)) { + systemMemoryInfo.total = memoryStatus.ullTotalPhys; + systemMemoryInfo.free = memoryStatus.ullAvailPhys; + } else { + addonLog(GGML_LOG_LEVEL_ERROR, "Failed to get free system memory"); + } + + PERFORMANCE_INFORMATION perfInfo; + perfInfo.cb = sizeof(PERFORMANCE_INFORMATION); + if (GetPerformanceInfo(&perfInfo, sizeof(perfInfo))) { + systemMemoryInfo.wired = uint64_t(perfInfo.KernelNonpaged) * uint64_t(perfInfo.PageSize); + } else { + addonLog(GGML_LOG_LEVEL_ERROR, "Failed to get wired system memory"); + } +#endif + + if (systemMemoryInfo.total.has_value() && systemMemoryInfo.wired.has_value()) + systemMemoryInfo.wired = (std::min)(systemMemoryInfo.wired.value(), systemMemoryInfo.total.value()); + + if (systemMemoryInfo.total.has_value() && systemMemoryInfo.free.has_value()) + systemMemoryInfo.free = (std::min)(systemMemoryInfo.free.value(), 
systemMemoryInfo.total.value()); + + return systemMemoryInfo; +} + +class AddonGetSystemMemoryInfoWorker : public Napi::AsyncWorker { + public: + explicit AddonGetSystemMemoryInfoWorker(const Napi::Env& env) + : Napi::AsyncWorker(env, "AddonGetSystemMemoryInfoWorker"), + deferred(Napi::Promise::Deferred::New(env)) { + } + + Napi::Promise GetPromise() { + return deferred.Promise(); + } + + protected: + Napi::Promise::Deferred deferred; + AddonSystemMemoryInfo systemMemoryInfo; + + void Execute() override { + try { + systemMemoryInfo = retrieveSystemMemoryInfo(); + } catch (const std::exception& e) { + SetError(e.what()); + } catch (...) { + SetError("Unknown error when getting system memory info"); + } + } + + void OnOK() override { + Napi::Object result = Napi::Object::New(Env()); + result.Set( + "total", + systemMemoryInfo.total.has_value() + ? Napi::Number::New(Env(), systemMemoryInfo.total.value()) + : Env().Null() + ); + result.Set( + "wired", + systemMemoryInfo.wired.has_value() + ? Napi::Number::New(Env(), systemMemoryInfo.wired.value()) + : Env().Null() + ); + result.Set( + "free", + systemMemoryInfo.free.has_value() + ? 
Napi::Number::New(Env(), systemMemoryInfo.free.value()) + : Env().Null() + ); + deferred.Resolve(result); + } + + void OnError(const Napi::Error& err) override { + deferred.Reject(err.Value()); + } +}; + +Napi::Value getSystemMemoryInfo(const Napi::CallbackInfo& info) { + AddonGetSystemMemoryInfoWorker* worker = new AddonGetSystemMemoryInfoWorker(info.Env()); + worker->Queue(); + return worker->GetPromise(); +} diff --git a/llama/addon/globals/getSystemMemoryInfo.h b/llama/addon/globals/getSystemMemoryInfo.h new file mode 100644 index 00000000..7de09bfe --- /dev/null +++ b/llama/addon/globals/getSystemMemoryInfo.h @@ -0,0 +1,5 @@ +#pragma once + +#include "napi.h" + +Napi::Value getSystemMemoryInfo(const Napi::CallbackInfo& info); diff --git a/llama/cmake/addVariantSuffix.cmake b/llama/cmake/addVariantSuffix.cmake index 064c5b62..53b5e881 100644 --- a/llama/cmake/addVariantSuffix.cmake +++ b/llama/cmake/addVariantSuffix.cmake @@ -1,5 +1,5 @@ function(addVariantSuffix originalTarget variantSuffix) - if (NOT TARGET ${originalTarget} OR ${variantSuffix} STREQUAL "") + if (NOT TARGET ${originalTarget} OR "${variantSuffix}" STREQUAL "") return() endif() diff --git a/llama/patches/PR-22566.diff b/llama/patches/PR-22566.diff new file mode 100644 index 00000000..d33e776a --- /dev/null +++ b/llama/patches/PR-22566.diff @@ -0,0 +1,76 @@ +diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp +index 474cabdfc09..35fef097f5b 100644 +--- a/src/llama-model-loader.cpp ++++ b/src/llama-model-loader.cpp +@@ -698,8 +698,13 @@ llama_model_loader::llama_model_loader( + llm_kv = LLM_KV(llm_arch_from_string(arch_name)); + } + +- n_kv = gguf_get_n_kv(metadata); +- n_tensors = weights_map.size(); ++ n_kv = gguf_get_n_kv(metadata); ++ if (files.empty()) { ++ n_tensors = gguf_get_n_tensors(metadata); ++ } else { ++ n_tensors = weights_map.size(); ++ GGML_ASSERT(files.size() != 1 || static_cast(n_tensors) == gguf_get_n_tensors(metadata)); ++ } + + fver = (enum llama_fver) 
gguf_get_version(metadata); + +@@ -1213,13 +1218,20 @@ struct ggml_tensor * llama_model_loader::create_tensor( + }; + + if (files.empty()) { +- if (flags & TENSOR_SKIP_IF_VIRTUAL) { +- return nullptr; +- } + ggml_type type = GGML_TYPE_F32; + const int64_t tid = gguf_find_tensor(metadata, tn.str().c_str()); + if (tid != -1) { + type = gguf_get_tensor_type(metadata, tid); ++ } else if (flags & TENSOR_SKIP_IF_VIRTUAL) { ++ return nullptr; ++ } else { ++ if (flags & TENSOR_NOT_REQUIRED) { ++ return nullptr; ++ } ++ ++ if (no_alloc) { ++ throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str())); ++ } + } + + // for tensors that are not required some of the dimensions can be invalid: +@@ -1237,7 +1249,13 @@ struct ggml_tensor * llama_model_loader::create_tensor( + for (size_t dim = 0; dim < GGML_MAX_DIMS; dim++) { + t_meta.ne[dim] = dim < ne.size() ? ne.begin()[dim] : 1; + GGML_ASSERT(t_meta.ne[dim] >= 1); +- t_meta.nb[dim] = dim == 0 ? ggml_type_size(type) : t_meta.ne[dim-1]*t_meta.nb[dim-1]; ++ } ++ t_meta.nb[0] = ggml_type_size(type); ++ t_meta.nb[1] = t_meta.nb[0] * (t_meta.ne[0] / ggml_blck_size(type)); ++ GGML_ASSERT(t_meta.nb[0] >= 1); ++ GGML_ASSERT(t_meta.nb[1] >= 1); ++ for (size_t dim = 2; dim < GGML_MAX_DIMS; ++dim) { ++ t_meta.nb[dim] = t_meta.nb[dim - 1] * t_meta.ne[dim - 1]; + GGML_ASSERT(t_meta.nb[dim] >= 1); + } + ggml_set_name(&t_meta, tn.str().c_str()); +@@ -1245,6 +1263,16 @@ struct ggml_tensor * llama_model_loader::create_tensor( + ggml_backend_buffer_type_t buft = buft_for_tensor(&t_meta); + GGML_ASSERT(buft != nullptr); + ggml_context * ctx = ctx_for_buft(buft); ++ ++ if (flags & TENSOR_DUPLICATED) { ++ ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str()); ++ if (t) { ++ return t; ++ } ++ } else { ++ n_created++; ++ } ++ + ggml_tensor * ret = ggml_dup_tensor(ctx, &t_meta); + ggml_set_name(ret, tn.str().c_str()); + return ret; diff --git a/llama/toolchains/llvm.win32.host-x64.target-x64.cmake 
b/llama/toolchains/llvm.win32.host-x64.target-x64.cmake index 2107f8e1..91a61167 100644 --- a/llama/toolchains/llvm.win32.host-x64.target-x64.cmake +++ b/llama/toolchains/llvm.win32.host-x64.target-x64.cmake @@ -13,8 +13,3 @@ llvmUseGnuModeCompilers("x64") include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.ensureNinjaPath.cmake") ensureNinjaPath() - -set(arch_c_flags "-march=native") - -set(CMAKE_C_FLAGS_INIT "${arch_c_flags}") -set(CMAKE_CXX_FLAGS_INIT "${arch_c_flags}") diff --git a/src/bindings/AddonTypes.ts b/src/bindings/AddonTypes.ts index 294749d5..89822375 100644 --- a/src/bindings/AddonTypes.ts +++ b/src/bindings/AddonTypes.ts @@ -1,40 +1,48 @@ import {Token} from "../types.js"; import {LlamaNuma} from "./types.js"; +export type AddonModelParams = { + gpuLayers?: number, + vocabOnly?: boolean, + noAlloc?: boolean, + useMmap?: boolean, + useDirectIo?: boolean, + useMlock?: boolean, + checkTensors?: boolean, + overridesList?: Array<[key: string, value: number | bigint | boolean | string, type: 0 | 1 | undefined]> +}; + +export type AddonContextParams = { + contextSize?: number, + batchSize?: number, + sequences?: number, + flashAttention?: boolean | "auto", + logitsAll?: boolean, + embeddings?: boolean, + ranking?: boolean, + threads?: number, + performanceTracking?: boolean, + kvCacheKeyType?: number, + kvCacheValueType?: number, + swaFullCache?: boolean +}; export type BindingModule = { + AddonGgufMetadata: { + new (): AddonGgufMetadata + }, AddonModel: { - new (modelPath: string, params: { + new (modelPath: string, params: AddonModelParams & { addonExports?: BindingModule, - gpuLayers?: number, - vocabOnly?: boolean, - useMmap?: boolean, - useDirectIo?: boolean, - useMlock?: boolean, - checkTensors?: boolean, onLoadProgress?(loadPercentage: number): void, - hasLoadAbortSignal?: boolean, - overridesList?: Array<[key: string, value: number | bigint | boolean | string, type: 0 | 1 | undefined]> + hasLoadAbortSignal?: boolean }): AddonModel }, AddonModelLora: 
{ new (model: AddonModel, filePath: string): AddonModelLora }, AddonContext: { - new (model: AddonModel, params: { - contextSize?: number, - batchSize?: number, - sequences?: number, - flashAttention?: boolean, - logitsAll?: boolean, - embeddings?: boolean, - ranking?: boolean, - threads?: number, - performanceTracking?: boolean, - kvCacheKeyType?: number, - kvCacheValueType?: number, - swaFullCache?: boolean - }): AddonContext + new (model: AddonModel, params: AddonContextParams): AddonContext }, AddonContextSequenceCheckpoint: { new (): AddonContextSequenceCheckpoint @@ -89,17 +97,28 @@ export type BindingModule = { maxSize: number, free: number }, - getMemoryInfo(): { + getSystemMemoryInfo(): Promise<{ + total: number | null, + wired: number | null, + free: number | null + }>, + getProcessMemoryInfo(): { total: number }, init(): Promise, setNuma(numa?: LlamaNuma): void, + setEnv(key: string, value?: string | null, overwrite?: boolean): boolean, loadBackends(forceLoadLibrariesSearchPath?: string): void, dispose(): Promise }; +export type AddonGgufMetadata = { + init(source: Array): Promise, + dispose(): Promise +}; + export type AddonModel = { - init(): Promise, + init(source?: AddonGgufMetadata): Promise, loadLora(lora: AddonModelLora): Promise, abortActiveModelLoad(): void, dispose(): Promise, @@ -110,6 +129,10 @@ export type AddonModel = { getTotalSize(): number, getTotalParameters(): number, getModelDescription(): ModelTypeDescription, + getMemoryBreakdown(): { + cpuRam: number, + gpuVram: number + }, tokenBos(): Token, tokenEos(): Token, tokenNl(): Token, @@ -158,6 +181,10 @@ export type AddonContext = { getSequenceKvCacheMaxPosition(sequenceId: number): number, getEmbedding(inputTokensLength: number, maxVectorSize?: number): Float64Array, getStateSize(): number, + getMemoryBreakdown(): { + cpuRam: number, + gpuVram: number + }, getThreads(): number, setThreads(threads: number): void, printTimings(): void, diff --git a/src/bindings/Llama.ts 
b/src/bindings/Llama.ts index eee3ada6..205b5775 100644 --- a/src/bindings/Llama.ts +++ b/src/bindings/Llama.ts @@ -14,10 +14,13 @@ import {getTempDir, FsPathHandle} from "../utils/getTempDir.js"; import {BindingModule} from "./AddonTypes.js"; import { BuildGpu, BuildMetadataFile, LlamaGpuType, LlamaLocks, LlamaLogLevel, - LlamaLogLevelGreaterThan, LlamaLogLevelGreaterThanOrEqual, LlamaNuma + LlamaLogLevelGreaterThan, LlamaLogLevelGreaterThanOrEqual, LlamaNuma, + RamState } from "./types.js"; -import {MemoryOrchestrator, MemoryReservation} from "./utils/MemoryOrchestrator.js"; +import {MemoryOrchestrator} from "./utils/MemoryOrchestrator.js"; import {registerDisposeBeforeExit, unregisterDisposeBeforeExit} from "./utils/disposeBeforeExit.js"; +import {LlamaExperimentalOptions} from "./getLlama.js"; +import {getPlatform} from "./utils/getPlatform.js"; export const LlamaLogLevelToAddonLogLevel: ReadonlyMap = new Map([ [LlamaLogLevel.disabled, 0], @@ -41,9 +44,7 @@ export class Llama { /** @internal */ public readonly _memoryLock = {}; /** @internal */ public readonly _consts: ReturnType; /** @internal */ public readonly _vramOrchestrator: MemoryOrchestrator; - /** @internal */ public _vramPadding: MemoryReservation; /** @internal */ public readonly _ramOrchestrator: MemoryOrchestrator; - /** @internal */ public readonly _ramPadding: MemoryReservation; /** @internal */ public readonly _swapOrchestrator: MemoryOrchestrator; /** @internal */ public readonly _debug: boolean; /** @internal */ public readonly _threadsSplitter: ThreadsSplitter; @@ -52,6 +53,7 @@ export class Llama { /** @internal */ public _hadErrorLogs: boolean = false; /** @internal */ private readonly _gpu: LlamaGpuType; /** @internal */ private readonly _numa: LlamaNuma; + /** @internal */ private readonly _experimentalOptions?: LlamaExperimentalOptions; /** @internal */ private readonly _buildType: "localBuild" | "prebuilt"; /** @internal */ private readonly _cmakeOptions: Readonly>; /** @internal */ 
private readonly _supportsGpuOffloading: boolean; @@ -78,7 +80,7 @@ export class Llama { private constructor({ bindings, bindingPath, extBackendsPath, logLevel, logger, buildType, cmakeOptions, llamaCppRelease, debug, tempDir, numa, buildGpu, - maxThreads, vramOrchestrator, vramPadding, ramOrchestrator, ramPadding, swapOrchestrator, skipLlamaInit + maxThreads, vramOrchestrator, ramOrchestrator, swapOrchestrator, skipLlamaInit, experimentalOptions }: { bindings: BindingModule, bindingPath: string, @@ -97,11 +99,10 @@ export class Llama { buildGpu: BuildGpu, maxThreads?: number, vramOrchestrator: MemoryOrchestrator, - vramPadding: MemoryReservation, ramOrchestrator: MemoryOrchestrator, - ramPadding: MemoryReservation, swapOrchestrator: MemoryOrchestrator, - skipLlamaInit: boolean + skipLlamaInit: boolean, + experimentalOptions?: LlamaExperimentalOptions }) { this._dispatchPendingLogMicrotask = this._dispatchPendingLogMicrotask.bind(this); this._onAddonLog = this._onAddonLog.bind(this); @@ -113,6 +114,7 @@ export class Llama { this._logLevel = this._debug ? LlamaLogLevel.debug : (logLevel ?? LlamaLogLevel.debug); + this._experimentalOptions = experimentalOptions; this._selfWeakRef = new WeakRef(this); const previouslyLoaded = bindings.markLoaded(); @@ -122,6 +124,14 @@ export class Llama { this._bindings.setLoggerLogLevel(LlamaLogLevelToAddonLogLevel.get(this._logLevel) ?? 
defaultLogLevel); } + let disabledResidencySets = false; + if (getPlatform() === "mac" && buildGpu === "metal" && this._experimentalOptions?.metalSkipDisablingResidencySets !== true && + process.env["GGML_METAL_NO_RESIDENCY"] == null + ) { + this._log(LlamaLogLevel.debug, "Disabling residency sets"); + disabledResidencySets = bindings.setEnv("GGML_METAL_NO_RESIDENCY", "1"); + } + bindings.loadBackends(); let loadedGpu = bindings.getGpuType(); if (loadedGpu == null || (loadedGpu === false && buildGpu !== false)) { @@ -140,6 +150,9 @@ export class Llama { if (this._numa !== false) bindings.setNuma(numa); + if (disabledResidencySets) + bindings.setEnv("GGML_METAL_NO_RESIDENCY", undefined); + this._gpu = bindings.getGpuType() ?? false; this._supportsGpuOffloading = bindings.getSupportsGpuOffloading(); this._supportsMmap = bindings.getSupportsMmap(); @@ -148,9 +161,7 @@ export class Llama { this._mathCores = Math.floor(bindings.getMathCores()); this._consts = bindings.getConsts(); this._vramOrchestrator = vramOrchestrator; - this._vramPadding = vramPadding; this._ramOrchestrator = ramOrchestrator; - this._ramPadding = ramPadding; this._swapOrchestrator = swapOrchestrator; this._threadsSplitter = new ThreadsSplitter( maxThreads ?? ( @@ -299,7 +310,17 @@ export class Llama { * See `vramPadding` on `getLlama` for more information. */ public get vramPaddingSize() { - return this._vramPadding.size; + return this._vramOrchestrator.padding; + } + + /** + * RAM padding used for memory size calculations, as these calculations are not always accurate. + * This is set by default to ensure stability, but can be configured when you call `getLlama`. + * + * See `ramPadding` on `getLlama` for more information. + */ + public get ramPaddingSize() { + return this._ramOrchestrator.padding; } /** @@ -308,17 +329,24 @@ export class Llama { * `unifiedSize` represents the amount of VRAM that is shared between the CPU and GPU. * On SoC devices, this is usually the same as `total`. 
*/ - public async getVramState() { + public async getVramState(): Promise<{ + total: number, + used: number, + free: number, + unifiedSize: number + }> { this._ensureNotDisposed(); - const {total, used, unifiedSize} = this._bindings.getGpuVramInfo(); + return getBalancedVramState(this._bindings, true); + } - return { - total, - used, - free: Math.max(0, total - used), - unifiedSize - }; + /** + * Get the state of the system RAM + */ + public async getRamState(): Promise { + this._ensureNotDisposed(); + + return getBalancedRamState(this._bindings); } /** @@ -359,6 +387,90 @@ export class Llama { }; } + /** + * Get the total memory usage of this Llama instance + */ + public async getLlamaMemoryUsage() { + return { + gpuVram: this._vramOrchestrator.markedMemory, + cpuRam: this._ramOrchestrator.markedMemory + }; + } + + /** + * Cap the amount of VRAM that this Llama instance is allowed to use in bytes. + * This is useful for constraining the resource usage of models and contexts created with the Llama instance. + * + * Capping to a value that's too low may cause model loads and context creations to either fail or not fully offload to VRAM, + * causing inference to be significantly slower. + * + * Setting a cap will only affect future model loads and context creations. + * + * Use with caution. + * Setting to `null` disables the cap. + * + * Defaults to `null`. + */ + public async setVramCap(bytes: number | null) { + this._ensureNotDisposed(); + if (bytes != null && bytes < 0) + throw new RangeError("VRAM cap must be a non-negative number or null"); + else if (bytes != null) + bytes = Math.floor(bytes); + + this._vramOrchestrator.memoryCap = bytes; + } + + /** + * Get the current VRAM cap in bytes. See {@link setVramCap `setVramCap`} for more information. + * + * Defaults to `null`, which means no cap is set. + */ + public getVramCap() { + return this._vramOrchestrator.memoryCap; + } + + /** + * Cap the amount of RAM that this Llama instance is allowed to use in bytes. 
+ * This is useful for constraining the resource usage of models and contexts created with the Llama instance. + * + * Capping to a value that's too low may cause model loads and context creations to fail. + * Capping to any value will exclude swap from the resource calculations, + * so extremely large models may not load at all even if you have enough swap available. + * + * Setting a cap will only affect future model loads and context creations. + * + * On unified memory systems, capping the RAM may also effectively cap the VRAM, as they are shared. + * On such systems, it's recommended to either cap the VRAM or the RAM (but not both), + * and if you need to cap both then make sure to set the RAM cap to a value greater than the VRAM cap. + * > **Note:** You can detect a unified memory system by checking whether `getVramState().unifiedSize` is greater than 0. + * + * Use with caution. + * Setting to `null` disables the cap. + * + * Defaults to `null`. + */ + public async setRamCap(bytes: number | null) { + this._ensureNotDisposed(); + + if (bytes != null && bytes < 0) + throw new RangeError("RAM cap must be a non-negative number or null"); + else if (bytes != null) + bytes = Math.floor(bytes); + + this._ramOrchestrator.memoryCap = bytes; + this._swapOrchestrator.memoryCap = bytes == null ? null : 0; // if RAM is capped, we can't count on swap for calculation + } + + /** + * Get the current RAM cap in bytes. See {@link setRamCap `setRamCap`} for more information. + * + * Defaults to `null`, which means no cap is set. 
+ */ + public getRamCap() { + return this._ramOrchestrator.memoryCap; + } + public async getGpuDeviceNames() { this._ensureNotDisposed(); @@ -406,6 +518,13 @@ export class Llama { return new LlamaGrammar(this, options); } + /** @internal */ + public async _getRawVramState() { + this._ensureNotDisposed(); + + return getBalancedVramState(this._bindings, false); + } + /** @internal */ public async _init() { await this._bindings.init(); @@ -419,6 +538,14 @@ export class Llama { this._onAddonLog(LlamaLogLevelToAddonLogLevel.get(level) ?? defaultLogLevel, message + "\n"); } + /** + * Check whether a message with the given log level would be logged by the Llama instance + * @internal + */ + public _shouldLog(level: LlamaLogLevel) { + return LlamaLogLevelGreaterThanOrEqual(level, this._logLevel); + } + /** @internal */ public _createTempFilePath() { if (this._tempDir == null) @@ -522,7 +649,7 @@ export class Llama { /** @internal */ public static async _create({ bindings, bindingPath, extBackendsPath, buildType, buildMetadata, logLevel, logger, vramPadding, ramPadding, maxThreads, - skipLlamaInit = false, debug, numa, tempDir + skipLlamaInit = false, debug, numa, tempDir, experimentalOptions }: { bindings: BindingModule, bindingPath: string, @@ -537,24 +664,16 @@ export class Llama { skipLlamaInit?: boolean, debug: boolean, numa?: LlamaNuma, - tempDir?: string | string[] | false + tempDir?: string | string[] | false, + experimentalOptions?: LlamaExperimentalOptions }) { - const vramOrchestrator = new MemoryOrchestrator(() => { - const {total, used, unifiedSize} = bindings.getGpuVramInfo(); - + const vramOrchestrator = new MemoryOrchestrator(getBalancedVramState.bind(undefined, bindings, true)); + const ramOrchestrator = new MemoryOrchestrator(async () => { + const {total, wired} = await getBalancedRamState(bindings); + return { total, - free: Math.max(0, total - used), - unifiedSize - }; - }); - const ramOrchestrator = new MemoryOrchestrator(() => { - const used = 
process.memoryUsage().rss; - const total = os.totalmem(); - - return { - total, - free: Math.max(0, total - used), + free: total - wired, unifiedSize: total }; }); @@ -576,11 +695,10 @@ export class Llama { }; }); - let resolvedRamPadding: MemoryReservation; if (ramPadding instanceof Function) - resolvedRamPadding = ramOrchestrator.reserveMemory(ramPadding((await ramOrchestrator.getMemoryState()).total)); + ramOrchestrator.padding = ramPadding((await ramOrchestrator.getMemoryState()).total); else - resolvedRamPadding = ramOrchestrator.reserveMemory(ramPadding); + ramOrchestrator.padding = ramPadding; const resolvedTempDir = tempDir === false ? undefined @@ -608,23 +726,18 @@ export class Llama { buildGpu: buildMetadata.buildOptions.gpu, vramOrchestrator, maxThreads, - vramPadding: vramOrchestrator.reserveMemory(0), ramOrchestrator, - ramPadding: resolvedRamPadding, swapOrchestrator, - skipLlamaInit + skipLlamaInit, + experimentalOptions }); if (llama.gpu === false || vramPadding === 0) { - // do nothing since `llama._vramPadding` is already set to 0 + // do nothing since `llama._vramOrchestrator.padding` is already set to 0 } else if (vramPadding instanceof Function) { - const currentVramPadding = llama._vramPadding; - llama._vramPadding = vramOrchestrator.reserveMemory(vramPadding((await vramOrchestrator.getMemoryState()).total)); - currentVramPadding.dispose(); + vramOrchestrator.padding = vramPadding((await vramOrchestrator.getMemoryState()).total); } else { - const currentVramPadding = llama._vramPadding; - llama._vramPadding = vramOrchestrator.reserveMemory(vramPadding); - currentVramPadding.dispose(); + vramOrchestrator.padding = vramPadding; } if (!skipLlamaInit) @@ -726,6 +839,10 @@ function getTransformedLogLevel(level: LlamaLogLevel, message: string, gpu: Buil return LlamaLogLevel.info; else if (level === LlamaLogLevel.warn && message.startsWith("llama_model_loader: direct I/O is not available, using mmap")) return LlamaLogLevel.info; + else if (level === 
LlamaLogLevel.warn && message.startsWith("str: cannot properly format tensor name ")) + return LlamaLogLevel.info; + else if (level === LlamaLogLevel.warn && message.startsWith("llama_kv_cache: the V embeddings have different sizes across layers and FA is not enabled - padding V cache to")) + return LlamaLogLevel.info; else if (gpu === false && level === LlamaLogLevel.warn && message.startsWith("llama_adapter_lora_init_impl: lora for '") && message.endsWith("' cannot use buft 'CPU_REPACK', fallback to CPU")) return LlamaLogLevel.info; else if (gpu === "metal" && level === LlamaLogLevel.warn && message.startsWith("ggml_metal_device_init: tensor API disabled for")) @@ -733,3 +850,55 @@ function getTransformedLogLevel(level: LlamaLogLevel, message: string, gpu: Buil return level; } + +async function getBalancedVramState(bindings: BindingModule, balanceUnifiedMemory: boolean = true) { + const {total, used, unifiedSize} = bindings.getGpuVramInfo(); + let currentUsed = used; + + if (unifiedSize !== 0 && balanceUnifiedMemory) { + try { + const systemMemoryInfo = await bindings.getSystemMemoryInfo(); + systemMemoryInfo.total ??= os.totalmem(); + + const nonUnifiedMemoryRam = systemMemoryInfo.total - unifiedSize; + const lockedUnifiedVram = Math.max(0, Math.min(systemMemoryInfo.wired ?? 
0, systemMemoryInfo.total) - nonUnifiedMemoryRam); + + currentUsed = Math.max(currentUsed, Math.min(lockedUnifiedVram, unifiedSize)); + } catch (err) { + // do nothing + } + } + + return { + total, + used: currentUsed, + free: Math.max(0, total - currentUsed), + unifiedSize + }; +} + + +async function getBalancedRamState(bindings: BindingModule): Promise { + let total: number | null = null; + let free: number | null = null; + let wired: number | null = null; + + try { + const systemMemoryInfo = await bindings.getSystemMemoryInfo(); + total = systemMemoryInfo.total; + free = systemMemoryInfo.free; + wired = systemMemoryInfo.wired; + } catch (error) { + // do nothing + } + + total ??= os.totalmem(); + free ??= os.freemem(); + wired = Math.min(wired ?? 0, total); + + return { + total, + free, + wired + }; +} diff --git a/src/bindings/getLlama.ts b/src/bindings/getLlama.ts index f9c900fe..8732d9a9 100644 --- a/src/bindings/getLlama.ts +++ b/src/bindings/getLlama.ts @@ -3,8 +3,8 @@ import path from "path"; import console from "console"; import {createRequire} from "module"; import { - builtinLlamaCppGitHubRepo, builtinLlamaCppRelease, defaultLlamaCppLogLevel, defaultLlamaCppGitHubRepo, defaultLlamaCppGpuSupport, - defaultLlamaCppRelease, defaultSkipDownload, llamaLocalBuildBinsDirectory, recommendedBaseDockerImage, defaultLlamaCppDebugMode + builtinLlamaCppGitHubRepo, defaultLlamaCppLogLevel, defaultLlamaCppGitHubRepo, defaultLlamaCppGpuSupport, + defaultSkipDownload, llamaLocalBuildBinsDirectory, recommendedBaseDockerImage, defaultLlamaCppDebugMode } from "../config.js"; import {getConsoleLogPrefix} from "../utils/getConsoleLogPrefix.js"; import {waitForLockfileRelease} from "../utils/waitForLockfileRelease.js"; @@ -31,6 +31,7 @@ import {testBindingBinary} from "./utils/testBindingBinary.js"; import {BinaryPlatformInfo, getPlatformInfo} from "./utils/getPlatformInfo.js"; import {hasBuildingFromSourceDependenciesInstalled} from 
"./utils/hasBuildingFromSourceDependenciesInstalled.js"; import {resolveActualBindingBinaryPath} from "./utils/resolveActualBindingBinaryPath.js"; +import {defaultLlamaCppRelease, builtinLlamaCppRelease} from "./utils/binariesGithubRelease.js"; const require = createRequire(import.meta.url); @@ -122,9 +123,12 @@ export type LlamaOptions = { /** * Print binary compilation progress logs. - * Enabled by default. + * + * When set to "stderr", progress logs will be printed to stderr instead of stdout. + * + * Defaults to `"stderr"`. */ - progressLogs?: boolean, + progressLogs?: boolean | "stderr", /** * Don't download llama.cpp source if it's not found. @@ -228,7 +232,16 @@ export type LlamaOptions = { * Hidden since currently unused - defaults to `false` for now * @internal */ - tempDir?: string | string[] | false + tempDir?: string | string[] | false, + + /** + * Experimental options that may be removed in the future without a major version bump, so use with caution. + * @deprecated Any options under this field are experimental and may be removed in the future without a major version bump, + * so use with caution. + * @experimental Any options under this field are experimental and may be removed in the future without a major version bump, + * so use with caution. + */ + experimental?: LlamaExperimentalOptions }; export type LastBuildOptions = { @@ -250,11 +263,13 @@ export type LastBuildOptions = { usePrebuiltBinaries?: boolean, /** - * If a local build is not found, and prebuilt binaries are not found, when building from source, - * print binary compilation progress logs. - * Enabled by default. + * Print binary compilation progress logs. + * + * When set to "stderr", progress logs will be printed to stderr instead of stdout. + * + * Defaults to `"stderr"`. */ - progressLogs?: boolean, + progressLogs?: boolean | "stderr", /** * If a local build is not found, and prebuilt binaries are not found, don't download llama.cpp source if it's not found. 
@@ -358,7 +373,25 @@ export type LastBuildOptions = { * Hidden since currently unused - defaults to `false` for now * @internal */ - tempDir?: string | string[] | false + tempDir?: string | string[] | false, + + /** + * Experimental options that may be removed in the future without a major version bump, so use with caution. + */ + experimental?: LlamaExperimentalOptions +}; + +export type LlamaExperimentalOptions = { + /** + * Skip disabling Metal residency sets on macOS when using Metal, + * which will keep the model data wired (force it to stay in memory) but can negatively affect system performance. + * + * Doing this may cause issues where the system thinks it has less available memory to load a model even after disposing a previous one. + * + * Defaults to `false`. + * @experimental This is an experimental option that may be removed in the future without a major version bump, so use with caution. + */ + metalSkipDisablingResidencySets?: boolean }; export const getLlamaFunctionName = "getLlama"; @@ -411,7 +444,7 @@ export async function getLlama(options?: LlamaOptions | "lastBuild", lastBuildOp logLevel: lastBuildOptions?.logLevel ?? defaultLlamaCppLogLevel, logger: lastBuildOptions?.logger ?? Llama.defaultConsoleLogger, usePrebuiltBinaries: lastBuildOptions?.usePrebuiltBinaries ?? true, - progressLogs: lastBuildOptions?.progressLogs ?? true, + progressLogs: lastBuildOptions?.progressLogs ?? "stderr", skipDownload: lastBuildOptions?.skipDownload ?? defaultSkipDownload, maxThreads: lastBuildOptions?.maxThreads, vramPadding: lastBuildOptions?.vramPadding ?? defaultLlamaVramPadding, @@ -419,7 +452,8 @@ export async function getLlama(options?: LlamaOptions | "lastBuild", lastBuildOp debug: lastBuildOptions?.debug ?? defaultLlamaCppDebugMode, numa: lastBuildOptions?.numa, tempDir: lastBuildOptions?.tempDir ?? 
defaultTempDir, - dryRun + dryRun, + experimental: lastBuildOptions?.experimental }; if (lastBuildInfo == null) @@ -448,7 +482,8 @@ export async function getLlama(options?: LlamaOptions | "lastBuild", lastBuildOp debug: lastBuildOptions?.debug ?? defaultLlamaCppDebugMode, numa: lastBuildOptions?.numa, tempDir: lastBuildOptions?.tempDir ?? defaultTempDir, - skipLlamaInit: dryRun + skipLlamaInit: dryRun, + experimentalOptions: lastBuildOptions?.experimental }); if (dryRun) @@ -476,7 +511,7 @@ export async function getLlamaForOptions({ cmakeOptions = {}, existingPrebuiltBinaryMustMatchBuildOptions = false, usePrebuiltBinaries = true, - progressLogs = true, + progressLogs = "stderr", skipDownload = defaultSkipDownload, maxThreads, vramPadding = defaultLlamaVramPadding, @@ -484,7 +519,8 @@ export async function getLlamaForOptions({ debug = defaultLlamaCppDebugMode, numa = false, tempDir = defaultTempDir, - dryRun = false + dryRun = false, + experimental: experimentalOptions }: LlamaOptions, { updateLastBuildInfoOnCompile = false, skipLlamaInit = false, @@ -503,7 +539,7 @@ export async function getLlamaForOptions({ if (cmakeOptions == null) cmakeOptions = {}; if (existingPrebuiltBinaryMustMatchBuildOptions == null) existingPrebuiltBinaryMustMatchBuildOptions = false; if (usePrebuiltBinaries == null) usePrebuiltBinaries = true; - if (progressLogs == null) progressLogs = true; + if (progressLogs == null) progressLogs = "stderr"; if (skipDownload == null) skipDownload = defaultSkipDownload; if (vramPadding == null) vramPadding = defaultLlamaVramPadding; if (ramPadding == null) ramPadding = defaultLlamaRamPadding; @@ -559,7 +595,8 @@ export async function getLlamaForOptions({ debug, numa, tempDir, - dryRun + dryRun, + experimental: experimentalOptions }); } catch (err) { return await getLlamaForOptions({ @@ -578,7 +615,8 @@ export async function getLlamaForOptions({ debug, numa, tempDir, - dryRun + dryRun, + experimental: experimentalOptions }); } } else @@ -626,7 +664,8 @@ 
export async function getLlamaForOptions({ debug, numa, tempDir, - pipeBinaryTestErrorLogs + pipeBinaryTestErrorLogs, + experimentalOptions }); if (llama != null) { @@ -644,7 +683,7 @@ export async function getLlamaForOptions({ if (isGithubReleaseNeedsResolving(llamaCppInfo.release)) { const [owner, name] = defaultLlamaCppGitHubRepo.split("/"); - llamaCppInfo.release = await resolveGithubRelease(owner!, name!, llamaCppInfo.release); + llamaCppInfo.release = (await resolveGithubRelease(owner!, name!, llamaCppInfo.release)).tag; } } @@ -662,7 +701,8 @@ export async function getLlamaForOptions({ skipLlamaInit, debug, numa, - tempDir + tempDir, + experimentalOptions }); } catch (err) { console.error( @@ -689,7 +729,7 @@ export async function getLlamaForOptions({ } } - if (shouldLogNoGlibcWarningIfNoBuildIsAvailable && progressLogs) + if (shouldLogNoGlibcWarningIfNoBuildIsAvailable && progressLogs !== false) await logNoGlibcWarning(); if (!canBuild || build === "autoAttempt") @@ -705,7 +745,7 @@ export async function getLlamaForOptions({ if (isGithubReleaseNeedsResolving(llamaCppInfo.release)) { const [owner, name] = defaultLlamaCppGitHubRepo.split("/"); - llamaCppInfo.release = await resolveGithubRelease(owner!, name!, llamaCppInfo.release); + llamaCppInfo.release = (await resolveGithubRelease(owner!, name!, llamaCppInfo.release)).tag; } } @@ -740,7 +780,8 @@ export async function getLlamaForOptions({ skipLlamaInit, debug, numa, - tempDir + tempDir, + experimentalOptions }); } catch (err) { console.error( @@ -787,14 +828,15 @@ async function loadExistingLlamaBinary({ debug, numa, tempDir, - pipeBinaryTestErrorLogs + pipeBinaryTestErrorLogs, + experimentalOptions }: { buildOptions: BuildOptions, canUsePrebuiltBinaries: boolean, logLevel: Required["logLevel"], logger: Required["logger"], existingPrebuiltBinaryMustMatchBuildOptions: boolean, - progressLogs: boolean, + progressLogs: boolean | "stderr", platform: BinaryPlatform, platformInfo: BinaryPlatformInfo, 
skipLlamaInit: boolean, @@ -805,7 +847,8 @@ async function loadExistingLlamaBinary({ debug: boolean, numa?: LlamaNuma, tempDir: LlamaOptions["tempDir"], - pipeBinaryTestErrorLogs: boolean + pipeBinaryTestErrorLogs: boolean, + experimentalOptions?: LlamaExperimentalOptions }) { const buildFolderName = await getBuildFolderNameForBuildOptions(buildOptions); @@ -843,26 +886,27 @@ async function loadExistingLlamaBinary({ skipLlamaInit, debug, numa, - tempDir + tempDir, + experimentalOptions }); - } else if (progressLogs) { + } else if (progressLogs !== false) { console.warn( getConsoleLogPrefix() + "The local build binary was not built in the current system and is incompatible with it" ); if (canUsePrebuiltBinaries) - console.info(getConsoleLogPrefix() + "Falling back to prebuilt binaries"); + console.warn(getConsoleLogPrefix() + "Falling back to prebuilt binaries"); else if (fallbackMessage != null) - console.info(getConsoleLogPrefix() + fallbackMessage); + console.warn(getConsoleLogPrefix() + fallbackMessage); } } catch (err) { const binaryDescription = describeBinary(buildOptions); console.error(getConsoleLogPrefix() + `Failed to load a local build ${binaryDescription}. Error:`, err); if (canUsePrebuiltBinaries) - console.info(getConsoleLogPrefix() + "Falling back to prebuilt binaries"); + console.warn(getConsoleLogPrefix() + "Falling back to prebuilt binaries"); else if (fallbackMessage != null) - console.info(getConsoleLogPrefix() + fallbackMessage); + console.warn(getConsoleLogPrefix() + fallbackMessage); } } @@ -910,9 +954,10 @@ async function loadExistingLlamaBinary({ skipLlamaInit, debug, numa, - tempDir + tempDir, + experimentalOptions }); - } else if (progressLogs) { + } else if (progressLogs !== false) { const binaryDescription = describeBinary({ ...buildOptions, customCmakeOptions: existingPrebuiltBinaryMustMatchBuildOptions @@ -942,7 +987,7 @@ async function loadExistingLlamaBinary({ : "" ) + ". 
Error:", err); } - } else if (progressLogs) + } else if (progressLogs !== false) console.warn( getConsoleLogPrefix() + "A prebuilt binary was not found" + ( fallbackMessage != null @@ -967,7 +1012,8 @@ async function buildAndLoadLlamaBinary({ skipLlamaInit, debug, numa, - tempDir + tempDir, + experimentalOptions }: { buildOptions: BuildOptions, skipDownload: boolean, @@ -980,7 +1026,8 @@ async function buildAndLoadLlamaBinary({ skipLlamaInit: boolean, debug: boolean, numa?: LlamaNuma, - tempDir: LlamaOptions["tempDir"] + tempDir: LlamaOptions["tempDir"], + experimentalOptions?: LlamaExperimentalOptions }) { const buildFolderName = await getBuildFolderNameForBuildOptions(buildOptions); @@ -1016,7 +1063,8 @@ async function buildAndLoadLlamaBinary({ skipLlamaInit, debug, numa, - tempDir + tempDir, + experimentalOptions }); } diff --git a/src/bindings/types.ts b/src/bindings/types.ts index da689d8a..e4c3400c 100644 --- a/src/bindings/types.ts +++ b/src/bindings/types.ts @@ -12,7 +12,7 @@ export const nodeLlamaCppGpuOffStringOptions = ["false", "off", "none", "disable export type BuildGpu = (typeof buildGpuOptions)[number]; export type BuildOptions = { customCmakeOptions: Map, - progressLogs: boolean, + progressLogs: boolean | "stderr", platform: BinaryPlatform, platformInfo: BinaryPlatformInfo, arch: typeof process.arch, @@ -29,6 +29,24 @@ export type BuildOptionsJSON = Omit & { customCmakeOptions: Record }; +export type RamState = { + /** + * The total amount of memory in bytes + */ + total: number, + + /** + * The amount of free memory in bytes. + * This memory is immediately available for allocation. 
+ */ + free: number, + + /** + * The amount of wired memory (memory that cannot be evicted or moved to disk) in bytes + */ + wired: number +}; + export function parseNodeLlamaCppGpuOption(option: (typeof nodeLlamaCppGpuOptions)[number] | (typeof nodeLlamaCppGpuOffStringOptions)[number]): BuildGpu | "auto" { function optionIsGpuOff(opt: typeof option): opt is (typeof nodeLlamaCppGpuOffStringOptions)[number] { return nodeLlamaCppGpuOffStringOptions.includes(opt as (typeof nodeLlamaCppGpuOffStringOptions)[number]); @@ -140,5 +158,10 @@ export function LlamaLogLevelGreaterThanOrEqual(a: LlamaLogLevel, b: LlamaLogLev } export const enum LlamaLocks { - loadToMemory = "loadToMemory" + loadToMemory = "loadToMemory", + addonInit = "addonInit" +} + +export function doesLlamaBackendNeedAddonInitLock(gpu: LlamaGpuType): boolean { + return gpu === "vulkan"; } diff --git a/src/bindings/utils/MemoryOrchestrator.ts b/src/bindings/utils/MemoryOrchestrator.ts index 992f336e..f2b870dd 100644 --- a/src/bindings/utils/MemoryOrchestrator.ts +++ b/src/bindings/utils/MemoryOrchestrator.ts @@ -1,13 +1,20 @@ import {EventRelay} from "lifecycle-utils"; +import {Promisable} from "../../utils/transformPromisable.js"; export class MemoryOrchestrator { - /** @internal */ private readonly _getMemoryState: () => {free: number, total: number, unifiedSize: number}; + /** @internal */ private readonly _getMemoryState: () => Promisable<{free: number, total: number, unifiedSize: number}>; /** @internal */ private _reservedMemory: number = 0; + /** @internal */ public _markedMemory: number = 0; + /** @internal */ private _memoryCap: number | null = null; + /** @internal */ private _padding: number = 0; public readonly onMemoryReservationRelease = new EventRelay(); + public readonly onMemoryMarkingRelease = new EventRelay(); - public constructor(getMemoryState: () => {free: number, total: number, unifiedSize: number}) { + public constructor(getMemoryState: () => Promisable<{free: number, total: number, 
unifiedSize: number}>) { this._getMemoryState = getMemoryState; + + this._onMarkFinalized = this._onMarkFinalized.bind(this); } public reserveMemory(bytes: number) { @@ -19,8 +26,45 @@ export class MemoryOrchestrator { }); } + public markAllocation(bytes: number) { + this._markedMemory += bytes; + + return MemoryMarking._create(bytes, this); + } + + public set padding(bytes: number) { + this._padding = bytes; + } + + public get padding() { + return this._padding; + } + + public set memoryCap(maxBytes: number | null) { + this._memoryCap = maxBytes ?? null; + } + + public get memoryCap() { + return this._memoryCap; + } + + public get markedMemory() { + return this._markedMemory; + } + public async getMemoryState() { - const {free, total, unifiedSize} = this._getMemoryState(); + const state = this._getMemoryState(); + let {free, total, unifiedSize} = state instanceof Promise + ? await state + : state; + + free = Math.max(0, free - this._padding); + + if (this._memoryCap != null) { + total = Math.min(total, this._memoryCap); + free = Math.max(0, Math.min(free, this._memoryCap, total - this._markedMemory)); + unifiedSize = Math.min(unifiedSize, this._memoryCap); + } return { free: Math.max(0, free - this._reservedMemory), @@ -28,6 +72,12 @@ export class MemoryOrchestrator { unifiedSize }; } + + /** @internal */ + public _onMarkFinalized(bytes: number) { + this._markedMemory -= bytes; + this.onMemoryMarkingRelease.dispatchEvent(); + } } export class MemoryReservation { @@ -62,3 +112,41 @@ export class MemoryReservation { return new MemoryReservation(bytes, dispose); } } + +export class MemoryMarking { + /** @internal */ private readonly _size: number; + /** @internal */ private _orchestrator?: MemoryOrchestrator; + /** @internal */ private _finalizationRegistry: FinalizationRegistry; + + private constructor(size: number, orchestrator: MemoryOrchestrator) { + this._size = size; + this._orchestrator = orchestrator; + this._finalizationRegistry = new 
FinalizationRegistry(orchestrator._onMarkFinalized); + this._finalizationRegistry.register(this, size, this); + } + + public get size(): number { + return this._size; + } + + public get disposed(): boolean { + return this._orchestrator == null; + } + + public [Symbol.dispose](): void { + this.dispose(); + } + + public dispose(): void { + if (this._orchestrator != null) { + this._orchestrator._onMarkFinalized(this._size); + this._finalizationRegistry.unregister(this); + } + + this._orchestrator = undefined; + } + + public static _create(bytes: number, orchestrator: MemoryOrchestrator): MemoryMarking { + return new MemoryMarking(bytes, orchestrator); + } +} diff --git a/src/bindings/utils/applyLlamaCppRepoPatches.ts b/src/bindings/utils/applyLlamaCppRepoPatches.ts new file mode 100644 index 00000000..cdb942bf --- /dev/null +++ b/src/bindings/utils/applyLlamaCppRepoPatches.ts @@ -0,0 +1,102 @@ +import path from "path"; +import fs from "fs-extra"; +import {simpleGit} from "simple-git"; +import {GitHubClient} from "../../utils/GitHubClient.js"; +import {defaultLlamaCppRepoSkipPatches, llamaCppDirectory, llamaCppPatchesDirectory} from "../../config.js"; +import {getConsoleLogPrefix} from "../../utils/getConsoleLogPrefix.js"; + +type RepoPatch = { + filename: string, + title: string, + canSkip(repoPath: string, lastCommitDate?: Date): Promise +}; + +const patches: RepoPatch[] = [{ + // https://github.com/ggml-org/llama.cpp/pull/22566 + filename: "PR-22566.diff", + title: "fix: consistent memory breakdown for models loaded with `no_alloc`", + async canSkip(repoPath, lastCommitDate) { + const llamaModelLoaderCpp = await fs.readFile(path.join(repoPath, "src", "llama-model-loader.cpp"), "utf8"); + if (llamaModelLoaderCpp.includes("n_tensors = gguf_get_n_tensors(metadata);")) + return true; + + if (lastCommitDate == null) + return false; + + try { + const githubClient = new GitHubClient(); + const pullRequestStatus = await githubClient.getPullRequestStatus({ + owner: 
"ggml-org", + repo: "llama.cpp", + id: "22566" + }); + + if (pullRequestStatus.merged && pullRequestStatus.merged_at != null) { + const mergedAt = new Date(pullRequestStatus.merged_at); + if (+mergedAt >= +lastCommitDate) + return true; + } + } catch (err) { + // do nothing + } + + return false; + } +}]; + +export function hasLlamaCppRepoPatchesToApply() { + return patches.length > 0; +} + +export async function applyLlamaCppRepoPatches(lastCommitDate?: Date, throwOnError: boolean = false, progressLogs: boolean | "stderr" = "stderr") { + if (!hasLlamaCppRepoPatchesToApply() || (defaultLlamaCppRepoSkipPatches.length === 1 && defaultLlamaCppRepoSkipPatches[0] === "*")) + return; + + if (!(await fs.pathExists(llamaCppPatchesDirectory)) || !(await fs.pathExists(llamaCppDirectory))) + return; + + const git = simpleGit({baseDir: llamaCppDirectory}); + for (const patch of patches) { + const patchPath = path.join(path.resolve(llamaCppPatchesDirectory), patch.filename); + + const filenameWithoutDiff = patch.filename.toLowerCase().endsWith(".diff") + ? 
patch.filename.slice(0, -".diff".length) + : patch.filename; + + if (defaultLlamaCppRepoSkipPatches.includes(filenameWithoutDiff) || defaultLlamaCppRepoSkipPatches.includes(patch.filename)) + continue; + + try { + if (!(await fs.pathExists(patchPath))) { + if (progressLogs !== false) + console.warn(`Patch file "${patch.filename}" not found, skipping patch "${patch.title}"`); + + continue; + } + + if (await patch.canSkip(llamaCppDirectory, lastCommitDate)) + continue; + } catch (err) { + if (progressLogs !== false) + console.warn( + getConsoleLogPrefix(), + `Failed testing whether patch "${patch.filename}": "${patch.title}" can be skipped:`, + String(err) + ); + } + + try { + await git.applyPatch(patchPath, {"--ignore-whitespace": null}); + } catch (err) { + if (progressLogs !== false) + console.error( + getConsoleLogPrefix(), + `Failed to apply patch "${patch.filename}": "${patch.title}", building llama.cpp may fail.`, + String(err) + ); + + if (throwOnError) + throw err; + } + } +} diff --git a/src/bindings/utils/binariesGithubRelease.ts b/src/bindings/utils/binariesGithubRelease.ts index 7cdede45..81b53203 100644 --- a/src/bindings/utils/binariesGithubRelease.ts +++ b/src/bindings/utils/binariesGithubRelease.ts @@ -1,5 +1,5 @@ import fs from "fs-extra"; -import {binariesGithubReleasePath} from "../../config.js"; +import {binariesGithubReleasePath, llamaCppReleaseEnvVar} from "../../config.js"; type BinariesGithubReleaseFile = { release: "latest" | string @@ -20,3 +20,6 @@ export async function setBinariesGithubRelease(release: BinariesGithubReleaseFil spaces: 4 }); } + +export const builtinLlamaCppRelease = await getBinariesGithubRelease(); +export const defaultLlamaCppRelease = llamaCppReleaseEnvVar || builtinLlamaCppRelease; diff --git a/src/bindings/utils/cloneLlamaCppRepo.ts b/src/bindings/utils/cloneLlamaCppRepo.ts index d6f3f370..ff9c2322 100644 --- a/src/bindings/utils/cloneLlamaCppRepo.ts +++ b/src/bindings/utils/cloneLlamaCppRepo.ts @@ -4,9 +4,9 @@ 
import chalk from "chalk"; import fs from "fs-extra"; import which from "which"; import { - defaultLlamaCppGitHubRepo, defaultLlamaCppRelease, enableRecursiveClone, llamaCppDirectory, llamaCppDirectoryInfoFilePath + defaultLlamaCppGitHubRepo, enableRecursiveClone, llamaCppDirectory, llamaCppDirectoryInfoFilePath } from "../../config.js"; -import {getGitBundlePathForRelease} from "../../utils/gitReleaseBundles.js"; +import {getGitBundlePathForRelease, isGitBundleCompatible} from "../../utils/gitReleaseBundles.js"; import {withLockfile} from "../../utils/withLockfile.js"; import {waitForLockfileRelease} from "../../utils/waitForLockfileRelease.js"; import {getConsoleLogPrefix} from "../../utils/getConsoleLogPrefix.js"; @@ -15,6 +15,8 @@ import {isGithubReleaseNeedsResolving, resolveGithubRelease} from "../../utils/r import withStatusLogs from "../../utils/withStatusLogs.js"; import {withProgressLog} from "../../utils/withProgressLog.js"; import {logDistroInstallInstruction} from "./logDistroInstallInstruction.js"; +import {applyLlamaCppRepoPatches, hasLlamaCppRepoPatchesToApply} from "./applyLlamaCppRepoPatches.js"; +import {defaultLlamaCppRelease} from "./binariesGithubRelease.js"; type ClonedLlamaCppRepoTagFile = { tag: string, @@ -23,14 +25,14 @@ type ClonedLlamaCppRepoTagFile = { export async function cloneLlamaCppRepo( - githubOwner: string, githubRepo: string, tag: string, useBundles: boolean = true, progressLogs: boolean = true, + githubOwner: string, githubRepo: string, tag: string, useBundles: boolean = true, progressLogs: boolean | "stderr" = "stderr", recursive: boolean = enableRecursiveClone ) { const gitBundleForTag = !useBundles ? 
null : await getGitBundlePathForRelease(githubOwner, githubRepo, tag); const remoteGitUrl = `https://github.com/${githubOwner}/${githubRepo}.git`; async function withGitCloneProgress(cloneName: string, callback: (gitWithCloneProgress: SimpleGit) => Promise): Promise { - if (!progressLogs) + if (progressLogs === false) return await callback(simpleGit({})); const repoText = `${githubOwner}/${githubRepo} (${cloneName})`; @@ -92,7 +94,7 @@ export async function cloneLlamaCppRepo( await fs.remove(llamaCppDirectory); await fs.remove(llamaCppDirectoryInfoFilePath); - if (progressLogs) + if (progressLogs !== false) console.error(getConsoleLogPrefix() + "Failed to clone git bundle, cloning from GitHub instead", err); await printCloneErrorHelp(String(err)); @@ -174,29 +176,36 @@ export async function isLlamaCppRepoCloned(waitForLock: boolean = true) { return repoGitExists && releaseInfoFileExists; } -export async function ensureLlamaCppRepoIsCloned({progressLogs = true}: {progressLogs?: boolean} = {}) { +export async function ensureLlamaCppRepoIsCloned({progressLogs = "stderr"}: {progressLogs?: boolean | "stderr"} = {}) { if (await isLlamaCppRepoCloned(true)) return; const [githubOwner, githubRepo] = defaultLlamaCppGitHubRepo.split("/"); - if (progressLogs) - console.log(getConsoleLogPrefix() + chalk.blue("Cloning llama.cpp")); + if (progressLogs !== false) + console.warn(getConsoleLogPrefix() + chalk.blue("Cloning llama.cpp")); let releaseTag = defaultLlamaCppRelease; + let releaseDate: Date | undefined = undefined; - if (isGithubReleaseNeedsResolving(releaseTag)) { + if (isGithubReleaseNeedsResolving(releaseTag) || ( + hasLlamaCppRepoPatchesToApply() && + !(await isGitBundleCompatible(githubOwner!, githubRepo!, releaseTag)) + )) { await withStatusLogs({ loading: chalk.blue("Fetching llama.cpp info"), success: chalk.blue("Fetched llama.cpp info"), fail: chalk.blue("Failed to fetch llama.cpp info"), - disableLogs: !progressLogs + disableLogs: progressLogs === false }, async 
() => { - releaseTag = await resolveGithubRelease(githubOwner!, githubRepo!, releaseTag); + const release = await resolveGithubRelease(githubOwner!, githubRepo!, releaseTag); + releaseTag = release.tag; + releaseDate = release.date; }); } await cloneLlamaCppRepo(githubOwner!, githubRepo!, releaseTag, true, progressLogs); + await applyLlamaCppRepoPatches(releaseDate, false, progressLogs); } async function updateClonedLlamaCppRepoTagFile(githubOwner: string, githubRepo: string, tag: string) { diff --git a/src/bindings/utils/compileLLamaCpp.ts b/src/bindings/utils/compileLLamaCpp.ts index 6857890b..ec16120d 100644 --- a/src/bindings/utils/compileLLamaCpp.ts +++ b/src/bindings/utils/compileLLamaCpp.ts @@ -169,7 +169,11 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions } else if (!cmakeCustomOptions.has("GGML_BACKEND_DL")) cmakeCustomOptions.set("GGML_BACKEND_DL", "ON"); } - } + } else if (!cmakeCustomOptions.has("GGML_NATIVE") && + buildOptions.platform === platform && + buildOptions.arch === process.arch + ) + cmakeCustomOptions.set("GGML_NATIVE", "ON"); await fs.remove(outDirectory); @@ -277,8 +281,8 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions ) )) { for (const {nvccPath, cudaHomePath} of await getCudaNvccPaths()) { - if (buildOptions.progressLogs) - console.info( + if (buildOptions.progressLogs !== false) + console.warn( getConsoleLogPrefix(true) + `Trying to compile again with "CUDACXX=${nvccPath}" and "CUDA_PATH=${cudaHomePath}" environment variables` ); @@ -294,7 +298,7 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions ignoreWorkarounds: [...ignoreWorkarounds, "cudaArchitecture"] }); } catch (err) { - if (buildOptions.progressLogs) + if (buildOptions.progressLogs !== false) console.error(getConsoleLogPrefix(true, false), err); } } @@ -307,13 +311,13 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions 
err.combinedStd.toLowerCase().includes("compiler is out of heap space".toLowerCase()) ) ) { - if (buildOptions.progressLogs) { + if (buildOptions.progressLogs !== false) { if (ignoreWorkarounds.includes("reduceParallelBuildThreads")) - console.info( + console.warn( getConsoleLogPrefix(true) + "Trying to compile again with a single build thread" ); else - console.info( + console.warn( getConsoleLogPrefix(true) + "Trying to compile again with reduced parallel build threads" ); } @@ -329,7 +333,7 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions ] }); } catch (err) { - if (buildOptions.progressLogs) + if (buildOptions.progressLogs !== false) console.error(getConsoleLogPrefix(true, false), err); } } @@ -346,8 +350,8 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions documentationPageUrls.Vulkan ); else if (useWindowsLlvm && !ciMode) { - if (buildOptions.progressLogs) - console.info(getConsoleLogPrefix(true) + "Trying to compile again without LLVM"); + if (buildOptions.progressLogs !== false) + console.warn(getConsoleLogPrefix(true) + "Trying to compile again without LLVM"); try { return await compileLlamaCpp(buildOptions, { @@ -355,7 +359,7 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions ignoreWorkarounds: [...ignoreWorkarounds, "avoidWindowsLlvm"] }); } catch (err) { - if (buildOptions.progressLogs) + if (buildOptions.progressLogs !== false) console.error(getConsoleLogPrefix(true, false), err); } } diff --git a/src/bindings/utils/getBuildFolderNameForBuildOptions.ts b/src/bindings/utils/getBuildFolderNameForBuildOptions.ts index 1d08164d..a6864a3b 100644 --- a/src/bindings/utils/getBuildFolderNameForBuildOptions.ts +++ b/src/bindings/utils/getBuildFolderNameForBuildOptions.ts @@ -1,6 +1,7 @@ import {hashString} from "../../utils/hashString.js"; import {BuildOptions} from "../types.js"; -import {builtinLlamaCppGitHubRepo, builtinLlamaCppRelease} from 
"../../config.js"; +import {builtinLlamaCppGitHubRepo} from "../../config.js"; +import {builtinLlamaCppRelease} from "./binariesGithubRelease.js"; export async function getBuildFolderNameForBuildOptions(buildOptions: BuildOptions) { const nameParts: string[] = [buildOptions.platform, buildOptions.arch]; diff --git a/src/bindings/utils/getCanUsePrebuiltBinaries.ts b/src/bindings/utils/getCanUsePrebuiltBinaries.ts index 31e3ba9c..c8f37fe7 100644 --- a/src/bindings/utils/getCanUsePrebuiltBinaries.ts +++ b/src/bindings/utils/getCanUsePrebuiltBinaries.ts @@ -1,4 +1,5 @@ -import {builtinLlamaCppGitHubRepo, builtinLlamaCppRelease} from "../../config.js"; +import {builtinLlamaCppGitHubRepo} from "../../config.js"; +import {builtinLlamaCppRelease} from "./binariesGithubRelease.js"; import {getClonedLlamaCppRepoReleaseInfo} from "./cloneLlamaCppRepo.js"; export async function getCanUsePrebuiltBinaries() { diff --git a/src/bindings/utils/testBindingBinary.ts b/src/bindings/utils/testBindingBinary.ts index 4a40b12b..c26f4636 100644 --- a/src/bindings/utils/testBindingBinary.ts +++ b/src/bindings/utils/testBindingBinary.ts @@ -305,6 +305,12 @@ if (process.env.TEST_BINDING_CP === "true" && (process.parentPort != null || pro binding.getGpuVramInfo(); binding.getGpuDeviceInfo(); + try { + await binding.getSystemMemoryInfo(); + } catch (err) { + // do nothing + } + const gpuType = binding.getGpuType(); void (gpuType as BuildGpu satisfies typeof gpuType); if (gpuType !== message.gpu) diff --git a/src/chatWrappers/Gemma4ChatWrapper.ts b/src/chatWrappers/Gemma4ChatWrapper.ts new file mode 100644 index 00000000..ed1c9a34 --- /dev/null +++ b/src/chatWrappers/Gemma4ChatWrapper.ts @@ -0,0 +1,256 @@ +import {ChatWrapper, ChatWrapperJinjaMatchConfiguration} from "../ChatWrapper.js"; +import { + ChatModelFunctionCall, ChatModelFunctions, ChatModelResponse, ChatWrapperGenerateContextStateOptions, ChatWrapperGeneratedContextState, + ChatWrapperSettings +} from "../types.js"; +import 
{LlamaText, SpecialToken, SpecialTokensText} from "../utils/LlamaText.js"; +import {jsonDumps} from "./utils/jsonDumps.js"; + +// source: https://ai.google.dev/gemma/docs/core/prompt-formatting-gemma4 +export class Gemma4ChatWrapper extends ChatWrapper { + public readonly wrapperName: string = "Gemma 4"; + + public readonly reasoning: boolean; + public readonly keepOnlyLastThought: boolean; + + public override readonly settings: ChatWrapperSettings = { + supportsSystemMessages: true, + functions: { + call: { + optionalPrefixSpace: false, + prefix: LlamaText(new SpecialTokensText("<|tool_call>call:")), + paramsPrefix: "{", + suffix: LlamaText(new SpecialTokensText("}")), + emptyCallParamsPlaceholder: undefined + }, + result: { + prefix: LlamaText(new SpecialTokensText("response:"), "{{functionName}}", "{"), + suffix: LlamaText(new SpecialTokensText("}")) + } + }, + segments: { + reiterateStackAfterFunctionCalls: true, + thought: { + prefix: LlamaText(new SpecialTokensText("<|channel>thought\n")), + suffix: LlamaText(new SpecialTokensText("")) + } + } + }; + + public constructor(options: { + /** + * Whether to promote the model to perform reasoning. + * + * Defaults to `true`. + */ + reasoning?: boolean, + + /** + * Whether to keep only the chain of thought from the last model response. + * + * Setting this to `false` will keep all the chain of thoughts from the model responses in the context state. + * + * Defaults to `true`. + */ + keepOnlyLastThought?: boolean + } = {}) { + super(); + + const { + reasoning = true, + keepOnlyLastThought = true + } = options; + + this.reasoning = reasoning; + this.keepOnlyLastThought = keepOnlyLastThought; + } + + public override generateContextState({ + chatHistory, availableFunctions, documentFunctionParams + }: ChatWrapperGenerateContextStateOptions): ChatWrapperGeneratedContextState { + const hasFunctions = Object.keys(availableFunctions ?? 
{}).length > 0; + const modifiedChatHistory = chatHistory.slice(); + + let systemMessage: LlamaText = LlamaText(); + if (modifiedChatHistory[0]?.type === "system") { + systemMessage = LlamaText.fromJSON(modifiedChatHistory[0].text); + modifiedChatHistory.shift(); + } + + if (hasFunctions) + systemMessage = LlamaText([ + systemMessage, + this.generateAvailableFunctionsSystemText(availableFunctions ?? {}, {documentParams: documentFunctionParams}) + ]); + + if (this.reasoning) + systemMessage = LlamaText([ + new SpecialTokensText("<|think|>"), + systemMessage + ]); + + if (systemMessage.values.length > 0) + modifiedChatHistory.unshift({ + type: "system", + text: systemMessage.toJSON() + }); + + const contextContent: LlamaText[] = [ + LlamaText(new SpecialToken("BOS")) + ]; + + for (let i = 0; i < modifiedChatHistory.length; i++) { + const isLastItem = i === modifiedChatHistory.length - 1; + const item = modifiedChatHistory[i]; + + if (item == null) + continue; + + if (item.type === "system") + contextContent.push( + LlamaText([ + new SpecialTokensText("<|turn>system\n"), + LlamaText.fromJSON(item.text), + isLastItem + ? LlamaText([]) + : new SpecialTokensText("\n") + ]) + ); + else if (item.type === "user") + contextContent.push( + LlamaText([ + new SpecialTokensText("<|turn>user\n"), + item.text, + isLastItem + ? 
LlamaText([]) + : new SpecialTokensText("\n") + ]) + ); + else if (item.type === "model") + contextContent.push(this._getModelResponse(item.response, true, isLastItem, this.keepOnlyLastThought)); + else + void (item satisfies never); + } + + return { + contextText: LlamaText(contextContent), + stopGenerationTriggers: [ + LlamaText(new SpecialToken("EOS")), + LlamaText(new SpecialToken("EOT")), + LlamaText(new SpecialTokensText("")), + LlamaText(new SpecialTokensText("\n")), + LlamaText("<|return|>") + ] + }; + } + + public override generateAvailableFunctionsSystemText(availableFunctions: ChatModelFunctions, {documentParams = true}: { + documentParams?: boolean + }): LlamaText { + return LlamaText( + Object.entries(availableFunctions) + .map(([name, definition]) => { + return LlamaText([ + new SpecialTokensText("<|tool>"), + "declaration:", name, "{", + jsonDumps({ + description: definition.description || undefined, + parameters: documentParams + ? (definition.params || {}) + : undefined + }), + "}", new SpecialTokensText("") + ]); + }) + ); + } + + public override generateModelResponseText(modelResponse: ChatModelResponse["response"], useRawValues: boolean = true): LlamaText { + return this._getModelResponse(modelResponse, useRawValues, false, false); + } + + /** @internal */ + private _getModelResponse( + modelResponse: ChatModelResponse["response"], + useRawValues: boolean, + isLastItem: boolean, + keepOnlyLastThought: boolean + ) { + const res: LlamaText[] = [ + LlamaText(new SpecialTokensText("<|turn>model\n")) + ]; + const pendingFunctionCalls: ChatModelFunctionCall[] = []; + + const addPendingFunctions = () => { + if (pendingFunctionCalls.length === 0) + return; + + res.push(this.generateFunctionCallsAndResults(pendingFunctionCalls, useRawValues)); + + pendingFunctionCalls.length = 0; + }; + + for (let index = 0; index < modelResponse.length; index++) { + const isLastResponse = index === modelResponse.length - 1; + const response = modelResponse[index]; + + 
if (response == null) + continue; + else if (response === "" && (!isLastResponse || !isLastItem)) + continue; + + if (typeof response === "string") { + addPendingFunctions(); + res.push(LlamaText(response)); + } else if (response.type === "segment") { + addPendingFunctions(); + + if (response.ended && response.raw != null && useRawValues) + res.push(LlamaText.fromJSON(response.raw)); + else if (response.segmentType === "thought") { + if (keepOnlyLastThought && !isLastItem) + continue; + + res.push( + LlamaText([ + new SpecialTokensText("<|channel>thought"), + response.text, + (isLastItem && !response.ended) + ? LlamaText([]) + : new SpecialTokensText("") + ]) + ); + } else if (response.segmentType === "comment") + continue; // unsupported + else + void (response.segmentType satisfies never); + } else if (response.type === "functionCall") { + if (response.startsNewChunk) + addPendingFunctions(); + + pendingFunctionCalls.push(response); + } else + void (response satisfies never); + } + + addPendingFunctions(); + + if (!isLastItem) + res.push(LlamaText(new SpecialTokensText("\n"))); + + return LlamaText(res); + } + + /** @internal */ + public static override _getOptionConfigurationsToTestIfCanSupersedeJinjaTemplate(): ChatWrapperJinjaMatchConfiguration { + return [ + [{}, {}], + [{reasoning: false}, {}], + [ + {}, + {}, + {additionalRenderParameters: {"enable_thinking": true}} + ] + ]; + } +} diff --git a/src/chatWrappers/utils/resolveChatWrapper.ts b/src/chatWrappers/utils/resolveChatWrapper.ts index 8cadf9e8..b1faa330 100644 --- a/src/chatWrappers/utils/resolveChatWrapper.ts +++ b/src/chatWrappers/utils/resolveChatWrapper.ts @@ -7,6 +7,7 @@ import {FalconChatWrapper} from "../FalconChatWrapper.js"; import {FunctionaryChatWrapper} from "../FunctionaryChatWrapper.js"; import {AlpacaChatWrapper} from "../AlpacaChatWrapper.js"; import {GemmaChatWrapper} from "../GemmaChatWrapper.js"; +import {Gemma4ChatWrapper} from "../Gemma4ChatWrapper.js"; import 
{JinjaTemplateChatWrapper, JinjaTemplateChatWrapperOptions} from "../generic/JinjaTemplateChatWrapper.js"; import {TemplateChatWrapper} from "../generic/TemplateChatWrapper.js"; import {getConsoleLogPrefix} from "../../utils/getConsoleLogPrefix.js"; @@ -27,7 +28,7 @@ import type {GgufFileInfo} from "../../gguf/types/GgufFileInfoTypes.js"; export const specializedChatWrapperTypeNames = Object.freeze([ "general", "deepSeek", "qwen", "llama3.2-lightweight", "llama3.1", "llama3", "llama2Chat", "mistral", "alpacaChat", "functionary", - "chatML", "falconChat", "gemma", "harmony", "seed" + "chatML", "falconChat", "gemma4", "gemma", "harmony", "seed" ] as const); export type SpecializedChatWrapperTypeName = (typeof specializedChatWrapperTypeNames)[number]; @@ -56,6 +57,7 @@ export const chatWrappers = Object.freeze({ "functionary": FunctionaryChatWrapper, "chatML": ChatMLChatWrapper, "falconChat": FalconChatWrapper, + "gemma4": Gemma4ChatWrapper, "gemma": GemmaChatWrapper, "harmony": HarmonyChatWrapper, "seed": SeedChatWrapper, @@ -70,7 +72,8 @@ const chatWrapperToConfigType = new Map( ); const specializedChatWrapperRelatedTexts = { - "harmony": ["gpt", "gpt-oss"] + "harmony": ["gpt", "gpt-oss"], + "gemma4": ["gemma 4", "gemma-4"] } satisfies Partial>; export type BuiltInChatWrapperType = InstanceType; @@ -364,6 +367,8 @@ export function resolveChatWrapper( return createSpecializedChatWrapper(Llama3ChatWrapper); else if (includesText(modelNames, ["Mistral", "Mistral Large", "Mistral Large Instruct", "Mistral-Large", "Codestral"])) return createSpecializedChatWrapper(MistralChatWrapper); + else if (includesText(modelNames, ["Gemma 4", "Gemma-4", "gemma-4"])) + return createSpecializedChatWrapper(Gemma4ChatWrapper); else if (includesText(modelNames, ["Gemma", "Gemma 2"])) return createSpecializedChatWrapper(GemmaChatWrapper); else if (includesText(modelNames, ["gpt-oss", "Gpt Oss", "Gpt-Oss", "openai_gpt-oss", "Openai_Gpt Oss", "openai.gpt-oss", "Openai.Gpt Oss"])) @@ -381,6 
+386,8 @@ export function resolveChatWrapper( return createSpecializedChatWrapper(SeedChatWrapper); else if (modelJinjaTemplate.includes("<|start|>") && modelJinjaTemplate.includes("<|channel|>")) return createSpecializedChatWrapper(HarmonyChatWrapper); + else if (modelJinjaTemplate.includes("<|turn>") && modelJinjaTemplate.includes("<|tool_call>call:")) + return createSpecializedChatWrapper(Gemma4ChatWrapper); else if (modelJinjaTemplate.includes("<|im_start|>")) return createSpecializedChatWrapper(ChatMLChatWrapper); else if (modelJinjaTemplate.includes("[INST]")) @@ -430,9 +437,12 @@ export function resolveChatWrapper( return createSpecializedChatWrapper(FunctionaryChatWrapper); else if (lowercaseName === "dolphin" && splitLowercaseSubType.includes("mistral")) return createSpecializedChatWrapper(ChatMLChatWrapper); - else if (lowercaseName === "gemma") + else if (lowercaseName === "gemma") { + if (firstSplitLowercaseSubType === "4") + return createSpecializedChatWrapper(Gemma4ChatWrapper); + return createSpecializedChatWrapper(GemmaChatWrapper); - else if (splitLowercaseSubType.includes("chatml")) + } else if (splitLowercaseSubType.includes("chatml")) return createSpecializedChatWrapper(ChatMLChatWrapper); } } @@ -454,6 +464,8 @@ export function resolveChatWrapper( return createSpecializedChatWrapper(FalconChatWrapper); else if (arch === "gemma" || arch === "gemma2") return createSpecializedChatWrapper(GemmaChatWrapper); + else if (arch === "gemma4") + return createSpecializedChatWrapper(Gemma4ChatWrapper); } return null; diff --git a/src/cli/commands/ChatCommand.ts b/src/cli/commands/ChatCommand.ts index 88ec258e..55bd27e6 100644 --- a/src/cli/commands/ChatCommand.ts +++ b/src/cli/commands/ChatCommand.ts @@ -5,6 +5,7 @@ import {CommandModule} from "yargs"; import chalk from "chalk"; import fs from "fs-extra"; import prettyMilliseconds from "pretty-ms"; +import bytes from "bytes"; import {chatCommandHistoryFilePath, defaultChatSystemPrompt, 
documentationPageUrls} from "../../config.js"; import {getIsInDocumentationMode} from "../../state.js"; import {ReplHistory} from "../../utils/ReplHistory.js"; @@ -50,6 +51,8 @@ type ChatCommand = { kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType, kvCacheValueType?: "currentQuant" | keyof typeof GgmlType, swaFullCache?: boolean, + maxRam?: string, + maxVram?: string, noTrimWhitespace: boolean, grammar: "text" | Parameters[1], jsonSchemaGrammarFile?: string, @@ -80,7 +83,7 @@ type ChatCommand = { numa?: LlamaNuma, meter: boolean, timing: boolean, - noMmap: boolean, + mmap?: boolean, useDirectIo: boolean, printTimings: boolean }; @@ -172,8 +175,7 @@ export const ChatCommand: CommandModule = { .option("flashAttention", { alias: "fa", type: "boolean", - default: false, - description: "Enable flash attention" + description: "Force enable flash attention. Flash attention is enabled by default when supported. You can force disable flash attention via `--no-fa`" }) .option("kvCacheKeyType", { alias: "kvckt", @@ -201,6 +203,16 @@ export const ChatCommand: CommandModule = { default: false, description: "Disable SWA (Sliding Window Attention) on supported models" }) + .option("maxRam", { + alias: ["ram"], + type: "string", + description: "Maximum RAM to use for all the resources allocated by `node-llama-cpp`" + }) + .option("maxVram", { + alias: ["vram"], + type: "string", + description: "Maximum VRAM to use for all the resources allocated by `node-llama-cpp`" + }) .option("noTrimWhitespace", { type: "boolean", alias: ["noTrim"], @@ -383,10 +395,9 @@ export const ChatCommand: CommandModule = { default: false, description: "Print how how long it took to generate each response" }) - .option("noMmap", { + .option("mmap", { type: "boolean", - default: false, - description: "Disable mmap (memory-mapped file) usage" + description: "Force mmap (memory-mapped file) usage. You can force disable mmap usage with `--no-mmap`. 
By default, mmap usage is automatically determined by `node-llama-cpp`" }) .option("useDirectIo", { type: "boolean", @@ -403,22 +414,22 @@ export const ChatCommand: CommandModule = { async handler({ modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, - noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, + maxRam, maxVram, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens, reasoningBudget, noHistory, - environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, useDirectIo, + environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, mmap, useDirectIo, printTimings }) { try { await RunChat({ modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, contextSize, - batchSize, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, - threads, temperature, minP, topK, topP, seed, xtc, + batchSize, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, maxRam, maxVram, + noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, 
tokenPredictionModelContextSize, - debug, numa, meter, timing, noMmap, useDirectIo, printTimings + debug, numa, meter, timing, mmap, useDirectIo, printTimings }); } catch (err) { await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing @@ -431,17 +442,20 @@ export const ChatCommand: CommandModule = { async function RunChat({ modelPath: modelArg, header: headerArg, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, - contextSize, batchSize, kvCacheKeyType, kvCacheValueType, flashAttention, swaFullCache, noTrimWhitespace, grammar: grammarArg, - jsonSchemaGrammarFile: jsonSchemaGrammarFilePath, + contextSize, batchSize, kvCacheKeyType, kvCacheValueType, flashAttention, swaFullCache, maxRam, maxVram, + noTrimWhitespace, grammar: grammarArg, jsonSchemaGrammarFile: jsonSchemaGrammarFilePath, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, - tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, useDirectIo, printTimings + tokenPredictionModelContextSize, debug, numa, meter, timing, mmap, useDirectIo, printTimings }: ChatCommand) { if (contextSize === -1) contextSize = undefined; if (gpuLayers === -1) gpuLayers = undefined; if (reasoningBudget === -1) reasoningBudget = undefined; + const resolvedMaxRam = (typeof maxRam === "string" && maxRam !== "") ? bytes.parse(maxRam) ?? undefined : undefined; + const resolvedMaxVram = (typeof maxVram === "string" && maxVram !== "") ? bytes.parse(maxVram) ?? 
undefined : undefined; + const headers = resolveHeaderFlag(headerArg); const trimWhitespace = !noTrimWhitespace; @@ -462,7 +476,14 @@ async function RunChat({ numa }); const logBatchSize = batchSize != null; - const useMmap = !noMmap && llama.supportsMmap; + const useMmap = !llama.supportsMmap + ? false + : typeof mmap === "boolean" + ? mmap + : "auto"; + + await llama.setVramCap(resolvedMaxVram ?? null); + await llama.setRamCap(resolvedMaxRam ?? null); const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, @@ -673,7 +694,10 @@ async function RunChat({ printBos: true, printEos: true, logBatchSize, - tokenMeterEnabled: meter + tokenMeterEnabled: meter, + resolvedMaxRam, + resolvedMaxVram, + swaFullCache }); printInfoLine({ title: "Chat", diff --git a/src/cli/commands/CompleteCommand.ts b/src/cli/commands/CompleteCommand.ts index f3fca2a9..2f1c1da3 100644 --- a/src/cli/commands/CompleteCommand.ts +++ b/src/cli/commands/CompleteCommand.ts @@ -5,6 +5,7 @@ import {CommandModule} from "yargs"; import chalk from "chalk"; import fs from "fs-extra"; import prettyMilliseconds from "pretty-ms"; +import bytes from "bytes"; import {getLlama} from "../../bindings/getLlama.js"; import { BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, LlamaNuma, llamaNumaOptions, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption, @@ -38,6 +39,8 @@ type CompleteCommand = { kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType, kvCacheValueType?: "currentQuant" | keyof typeof GgmlType, swaFullCache?: boolean, + maxRam?: string, + maxVram?: string, threads?: number, temperature: number, minP: number, @@ -62,7 +65,7 @@ type CompleteCommand = { numa?: LlamaNuma, meter: boolean, timing: boolean, - noMmap: boolean, + mmap?: boolean, useDirectIo: boolean, printTimings: boolean }; @@ -129,8 +132,7 @@ export const CompleteCommand: CommandModule = { .option("flashAttention", { alias: "fa", type: "boolean", - default: false, - description: "Enable flash 
attention" + description: "Force enable flash attention. Flash attention is enabled by default when supported. You can force disable flash attention via `--no-fa`" }) .option("kvCacheKeyType", { alias: "kvckt", @@ -158,6 +160,16 @@ export const CompleteCommand: CommandModule = { default: false, description: "Disable SWA (Sliding Window Attention) on supported models" }) + .option("maxRam", { + alias: ["ram"], + type: "string", + description: "Maximum RAM to use for all the resources allocated by `node-llama-cpp`" + }) + .option("maxVram", { + alias: ["vram"], + type: "string", + description: "Maximum VRAM to use for all the resources allocated by `node-llama-cpp`" + }) .option("threads", { type: "number", defaultDescription: "Number of cores that are useful for math on the current machine", @@ -303,10 +315,9 @@ export const CompleteCommand: CommandModule = { default: false, description: "Print how how long it took to generate each response" }) - .option("noMmap", { + .option("mmap", { type: "boolean", - default: false, - description: "Disable mmap (memory-mapped file) usage" + description: "Force mmap (memory-mapped file) usage. You can force disable mmap usage with `--no-mmap`. 
By default, mmap usage is automatically determined by `node-llama-cpp`" }) .option("useDirectIo", { type: "boolean", @@ -322,20 +333,20 @@ export const CompleteCommand: CommandModule = { }, async handler({ modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, - flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, threads, temperature, minP, topK, + flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, maxRam, maxVram, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, - debug, numa, meter, timing, noMmap, useDirectIo, printTimings + debug, numa, meter, timing, mmap, useDirectIo, printTimings }) { try { await RunCompletion({ modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, - kvCacheKeyType, kvCacheValueType, swaFullCache, + kvCacheKeyType, kvCacheValueType, swaFullCache, maxRam, maxVram, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens, - tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, useDirectIo, printTimings + tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, mmap, useDirectIo, printTimings }); } catch (err) { await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing @@ -348,15 +359,18 @@ export const CompleteCommand: CommandModule = { async function RunCompletion({ modelPath: modelArg, header: headerArg, gpu, systemInfo, text, 
textFile, contextSize, batchSize, flashAttention, - kvCacheKeyType, kvCacheValueType, swaFullCache, + kvCacheKeyType, kvCacheValueType, swaFullCache, maxRam, maxVram, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, - tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, useDirectIo, printTimings + tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, mmap, useDirectIo, printTimings }: CompleteCommand) { if (contextSize === -1) contextSize = undefined; if (gpuLayers === -1) gpuLayers = undefined; + const resolvedMaxRam = (typeof maxRam === "string" && maxRam !== "") ? bytes.parse(maxRam) ?? undefined : undefined; + const resolvedMaxVram = (typeof maxVram === "string" && maxVram !== "") ? bytes.parse(maxVram) ?? undefined : undefined; + const headers = resolveHeaderFlag(headerArg); if (debug) @@ -376,7 +390,14 @@ async function RunCompletion({ numa }); const logBatchSize = batchSize != null; - const useMmap = !noMmap && llama.supportsMmap; + const useMmap = !llama.supportsMmap + ? false + : typeof mmap === "boolean" + ? mmap + : "auto"; + + await llama.setVramCap(resolvedMaxVram ?? null); + await llama.setRamCap(resolvedMaxRam ?? 
null); const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, @@ -552,7 +573,10 @@ async function RunCompletion({ useDirectIo, minTitleLength: "Complete".length + 1, logBatchSize, - tokenMeterEnabled: meter + tokenMeterEnabled: meter, + resolvedMaxRam, + resolvedMaxVram, + swaFullCache }); printInfoLine({ title: "Complete", diff --git a/src/cli/commands/DebugCommand.ts b/src/cli/commands/DebugCommand.ts index d2ee7117..5cb48384 100644 --- a/src/cli/commands/DebugCommand.ts +++ b/src/cli/commands/DebugCommand.ts @@ -1,4 +1,3 @@ -import os from "os"; import {CommandModule} from "yargs"; import chalk from "chalk"; import {getLlama} from "../../bindings/getLlama.js"; @@ -37,9 +36,8 @@ async function DebugVramFunction() { const llama = await getLlama("lastBuild"); const vramStatus = await llama.getVramState(); - const totalMemory = os.totalmem(); - const freeMemory = os.freemem(); - const usedMemory = totalMemory - freeMemory; + const ramStatus = await llama.getRamState(); + const usedMemory = ramStatus.total - ramStatus.free; const getPercentageString = (amount: number, total: number) => { if (total === 0) @@ -54,8 +52,9 @@ async function DebugVramFunction() { console.info(`${chalk.yellow("Used VRAM:")} ${getPercentageString(vramStatus.used, vramStatus.total)}% ${chalk.gray("(" + toBytes(vramStatus.used) + "/" + toBytes(vramStatus.total) + ")")}`); console.info(`${chalk.yellow("Free VRAM:")} ${getPercentageString(vramStatus.free, vramStatus.total)}% ${chalk.gray("(" + toBytes(vramStatus.free) + "/" + toBytes(vramStatus.total) + ")")}`); console.info(); - console.info(`${chalk.yellow("Used RAM:")} ${getPercentageString(usedMemory, totalMemory)}% ${chalk.gray("(" + toBytes(usedMemory) + "/" + toBytes(totalMemory) + ")")}`); - console.info(`${chalk.yellow("Free RAM:")} ${getPercentageString(freeMemory, totalMemory)}% ${chalk.gray("(" + toBytes(freeMemory) + "/" + toBytes(totalMemory) + ")")}`); + 
console.info(`${chalk.yellow("Used RAM:")} ${getPercentageString(usedMemory, ramStatus.total)}% ${chalk.gray("(" + toBytes(usedMemory) + "/" + toBytes(ramStatus.total) + ")")}`); + console.info(`${chalk.yellow("Free RAM:")} ${getPercentageString(ramStatus.free, ramStatus.total)}% ${chalk.gray("(" + toBytes(ramStatus.free) + "/" + toBytes(ramStatus.total) + ")")}`); + console.info(`${chalk.yellow("Wired RAM:")} ${getPercentageString(ramStatus.wired, ramStatus.total)}% ${chalk.gray("(" + toBytes(ramStatus.wired) + "/" + toBytes(ramStatus.total) + ")")}`); } async function DebugCmakeOptionsFunction() { diff --git a/src/cli/commands/InfillCommand.ts b/src/cli/commands/InfillCommand.ts index 7bdb7bd6..13ea8e31 100644 --- a/src/cli/commands/InfillCommand.ts +++ b/src/cli/commands/InfillCommand.ts @@ -5,6 +5,7 @@ import {CommandModule} from "yargs"; import chalk from "chalk"; import fs from "fs-extra"; import prettyMilliseconds from "pretty-ms"; +import bytes from "bytes"; import {getLlama} from "../../bindings/getLlama.js"; import { BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, LlamaNuma, llamaNumaOptions, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption, @@ -40,6 +41,8 @@ type InfillCommand = { kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType, kvCacheValueType?: "currentQuant" | keyof typeof GgmlType, swaFullCache?: boolean, + maxRam?: string, + maxVram?: string, threads?: number, temperature: number, minP: number, @@ -64,7 +67,7 @@ type InfillCommand = { numa?: LlamaNuma, meter: boolean, timing: boolean, - noMmap: boolean, + mmap?: boolean, useDirectIo: boolean, printTimings: boolean }; @@ -139,8 +142,7 @@ export const InfillCommand: CommandModule = { .option("flashAttention", { alias: "fa", type: "boolean", - default: false, - description: "Enable flash attention" + description: "Force enable flash attention. Flash attention is enabled by default when supported. 
You can force disable flash attention via `--no-fa`" }) .option("kvCacheKeyType", { alias: "kvckt", @@ -168,6 +170,16 @@ export const InfillCommand: CommandModule = { default: false, description: "Disable SWA (Sliding Window Attention) on supported models" }) + .option("maxRam", { + alias: ["ram"], + type: "string", + description: "Maximum RAM to use for all the resources allocated by `node-llama-cpp`" + }) + .option("maxVram", { + alias: ["vram"], + type: "string", + description: "Maximum VRAM to use for all the resources allocated by `node-llama-cpp`" + }) .option("threads", { type: "number", defaultDescription: "Number of cores that are useful for math on the current machine", @@ -313,10 +325,9 @@ export const InfillCommand: CommandModule = { default: false, description: "Print how how long it took to generate each response" }) - .option("noMmap", { + .option("mmap", { type: "boolean", - default: false, - description: "Disable mmap (memory-mapped file) usage" + description: "Force mmap (memory-mapped file) usage. You can force disable mmap usage with `--no-mmap`. 
By default, mmap usage is automatically determined by `node-llama-cpp`" }) .option("useDirectIo", { type: "boolean", @@ -332,20 +343,20 @@ export const InfillCommand: CommandModule = { }, async handler({ modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, - flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, threads, temperature, minP, topK, + flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, maxRam, maxVram, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, - debug, numa, meter, timing, noMmap, useDirectIo, printTimings + debug, numa, meter, timing, mmap, useDirectIo, printTimings }) { try { await RunInfill({ modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention, - kvCacheKeyType, kvCacheValueType, swaFullCache, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, - lastTokensRepeatPenalty, + kvCacheKeyType, kvCacheValueType, swaFullCache, maxRam, maxVram, + threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens, - tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, useDirectIo, printTimings + tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, mmap, useDirectIo, printTimings }); } catch (err) { await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing @@ -358,14 +369,17 @@ export 
const InfillCommand: CommandModule = { async function RunInfill({ modelPath: modelArg, header: headerArg, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention, - kvCacheKeyType, kvCacheValueType, swaFullCache, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, + kvCacheKeyType, kvCacheValueType, swaFullCache, maxRam, maxVram, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, - tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, useDirectIo, printTimings + tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, mmap, useDirectIo, printTimings }: InfillCommand) { if (contextSize === -1) contextSize = undefined; if (gpuLayers === -1) gpuLayers = undefined; + const resolvedMaxRam = (typeof maxRam === "string" && maxRam !== "") ? bytes.parse(maxRam) ?? undefined : undefined; + const resolvedMaxVram = (typeof maxVram === "string" && maxVram !== "") ? bytes.parse(maxVram) ?? undefined : undefined; + const headers = resolveHeaderFlag(headerArg); if (debug) @@ -385,7 +399,14 @@ async function RunInfill({ numa }); const logBatchSize = batchSize != null; - const useMmap = !noMmap && llama.supportsMmap; + const useMmap = !llama.supportsMmap + ? false + : typeof mmap === "boolean" + ? mmap + : "auto"; + + await llama.setVramCap(resolvedMaxVram ?? null); + await llama.setRamCap(resolvedMaxRam ?? 
null); const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, @@ -574,7 +595,10 @@ async function RunInfill({ useMmap, useDirectIo, logBatchSize, - tokenMeterEnabled: meter + tokenMeterEnabled: meter, + resolvedMaxRam, + resolvedMaxVram, + swaFullCache }); printInfoLine({ title: "Infill", diff --git a/src/cli/commands/inspect/commands/InspectEstimateCommand.ts b/src/cli/commands/inspect/commands/InspectEstimateCommand.ts index 5915632f..8f152a02 100644 --- a/src/cli/commands/inspect/commands/InspectEstimateCommand.ts +++ b/src/cli/commands/inspect/commands/InspectEstimateCommand.ts @@ -2,6 +2,7 @@ import process from "process"; import {CommandModule} from "yargs"; import chalk from "chalk"; import fs from "fs-extra"; +import bytes from "bytes"; import {readGgufFileInfo} from "../../../../gguf/readGgufFileInfo.js"; import {resolveHeaderFlag} from "../../../utils/resolveHeaderFlag.js"; import {withCliCommandDescriptionDocsUrl} from "../../../utils/withCliCommandDescriptionDocsUrl.js"; @@ -32,11 +33,14 @@ type InspectEstimateCommand = { gpu?: BuildGpu | "auto", gpuLayers?: number | "max", contextSize?: number | "train", + flashAttention?: boolean, embedding?: boolean, - noMmap?: boolean, + mmap?: boolean, kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType, kvCacheValueType?: "currentQuant" | keyof typeof GgmlType, - swaFullCache?: boolean + swaFullCache?: boolean, + maxRam?: string, + maxVram?: string }; export const InspectEstimateCommand: CommandModule = { @@ -108,6 +112,11 @@ export const InspectEstimateCommand: CommandModule !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[] ] as const, default: "F16" as const, - description: "Experimental. The type of the key for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors" + description: "Experimental. The type of the key for the context KV cache tensors. 
Use `currentQuant` to use the same type as the current quantization of the model weights tensors", + group: "Optional:" }) .option("kvCacheValueType", { alias: "kvcvt", @@ -138,24 +147,41 @@ export const InspectEstimateCommand: CommandModule !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[] ] as const, default: "F16" as const, - description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors" + description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors", + group: "Optional:" }) .option("swaFullCache", { alias: "noSwa", type: "boolean", default: false, - description: "Disable SWA (Sliding Window Attention) on supported models" + description: "Disable SWA (Sliding Window Attention) on supported models", + group: "Optional:" + }) + .option("maxRam", { + alias: ["ram"], + type: "string", + description: "Maximum RAM to use for the model and the context. If the estimated RAM usage exceeds this value, the compatibility score will be reduced. This is useful for estimating compatibility with devices that have limited RAM. You can set this to a value like `16GB` or `512MB`.", + group: "Optional:" + }) + .option("maxVram", { + alias: ["vram"], + type: "string", + description: "Experimental. Maximum VRAM to use for the model and the context. If the estimated VRAM usage exceeds this value, the compatibility score will be reduced. This is useful for estimating compatibility with devices that have limited VRAM. 
You can set this to a value like `8GB` or `256MB`.", + group: "Optional:" }); }, async handler({ - modelPath: ggufPath, header: headerArg, gpu, gpuLayers, contextSize: contextSizeArg, embedding, noMmap, - kvCacheKeyType, kvCacheValueType, swaFullCache + modelPath: ggufPath, header: headerArg, gpu, gpuLayers, contextSize: contextSizeArg, flashAttention, embedding, mmap, + kvCacheKeyType, kvCacheValueType, swaFullCache, maxRam, maxVram }: InspectEstimateCommand) { if (gpuLayers === -1) gpuLayers = undefined; if (gpuLayers === -2) gpuLayers = "max"; if (contextSizeArg === -1) contextSizeArg = undefined; if (contextSizeArg === -2) contextSizeArg = "train"; + const resolvedMaxRam = (typeof maxRam === "string" && maxRam !== "") ? (bytes.parse(maxRam) ?? undefined) : undefined; + const resolvedMaxVram = (typeof maxVram === "string" && maxVram !== "") ? (bytes.parse(maxVram) ?? undefined) : undefined; + const headers = resolveHeaderFlag(headerArg); const [resolvedModelDestination, resolvedGgufPath] = isModelUri(ggufPath) @@ -182,7 +208,14 @@ export const InspectEstimateCommand: CommandModule 0) { + console.warn( + chalk.yellow( + "Warning: Both RAM and VRAM limits are set, but the RAM limit is lower than the VRAM limit.\n" + + "On unified memory systems, this may cause the effective VRAM limit to be the same as the RAM limit" + ) + ); + console.warn(); + } + + const compatibilityScore = await withOra({ + loading: chalk.blue("Resolving config"), + success: chalk.blue("Resolved config"), + fail: chalk.blue("Failed to resolve config"), + noSuccessLiveStatus: true + }, async () => { + return await ggufInsights.configurationResolver.resolveAndScoreConfig({ + flashAttention: flashAttention == null + ? "auto" + : flashAttention, + targetContextSize: contextSize, + targetGpuLayers: gpuLayers, + embeddingContext: embedding, + useMmap, + kvCacheKeyType: kvCacheKeyType === "currentQuant" + ? 
ggufInsights.dominantTensorType + : resolveGgmlTypeOption(kvCacheKeyType), + kvCacheValueType: kvCacheValueType === "currentQuant" + ? ggufInsights.dominantTensorType + : resolveGgmlTypeOption(kvCacheValueType), + swaFullCache + }); + }); + + logCompatibilityScore( + "Resolved config", + longestTitle, + compatibilityScore, + ggufInsights, + llama, + flashAttention == null + ? "auto" + : flashAttention + ); } }; @@ -291,7 +374,7 @@ function logCompatibilityScore( compatibilityScore: Awaited>, ggufInsights: GgufInsights, llama: Llama, - flashAttention: boolean + flashAttention: boolean | "auto" ) { printInfoLine({ title, @@ -319,9 +402,18 @@ function logCompatibilityScore( title: "RAM usage", value: () => toBytes(compatibilityScore.resolvedValues.totalRamUsage) }, { - show: flashAttention, title: "Flash attention", - value: "enabled" + value: flashAttention === "auto" + ? "auto" + : flashAttention + ? "enabled" + : "disabled" + }, { + show: llama.supportsMmap, + title: "mmap", + value: compatibilityScore.resolvedValues.useMmap + ? 
"enabled" + : "disabled" }] }); } diff --git a/src/cli/commands/inspect/commands/InspectGpuCommand.ts b/src/cli/commands/inspect/commands/InspectGpuCommand.ts index 1f9c08fe..8861acb3 100644 --- a/src/cli/commands/inspect/commands/InspectGpuCommand.ts +++ b/src/cli/commands/inspect/commands/InspectGpuCommand.ts @@ -4,7 +4,7 @@ import chalk from "chalk"; import {getLlamaForOptions} from "../../../../bindings/getLlama.js"; import {detectAvailableComputeLayers} from "../../../../bindings/utils/detectAvailableComputeLayers.js"; import {getPlatform} from "../../../../bindings/utils/getPlatform.js"; -import {BuildGpu, LlamaLogLevel} from "../../../../bindings/types.js"; +import {BuildGpu, LlamaLogLevel, RamState} from "../../../../bindings/types.js"; import {getPrettyBuildGpuName} from "../../../../bindings/consts.js"; import {getModuleVersion} from "../../../../utils/getModuleVersion.js"; import {withCliCommandDescriptionDocsUrl} from "../../../utils/withCliCommandDescriptionDocsUrl.js"; @@ -210,7 +210,7 @@ export const InspectGpuCommand: CommandModule = { } console.info(); - await logRamUsage(lastLlama?.cpuMathCores); + await logRamUsage(lastLlama, lastLlama?.cpuMathCores); if (lastLlama != null) { await logSwapUsage(lastLlama); @@ -287,10 +287,15 @@ async function logGpuVramUsage(llama: Llama) { } catch (err) {} } -async function logRamUsage(cpuMathCores?: number) { - const totalMemory = os.totalmem(); - const freeMemory = os.freemem(); - const usedMemory = totalMemory - freeMemory; +async function logRamUsage(llama?: Llama, cpuMathCores?: number) { + const ramState: RamState = llama != null + ? 
await llama.getRamState() + : { + total: os.totalmem(), + free: os.freemem(), + wired: 0 + }; + const usedMemory = ramState.total - ramState.free; const cpuDeviceNames = Array.from( new Set( os.cpus() @@ -305,8 +310,11 @@ async function logRamUsage(cpuMathCores?: number) { if (cpuMathCores != null) console.info(`${chalk.yellow("Math cores:")} ${cpuMathCores}`); - console.info(`${chalk.yellow("Used RAM:")} ${getPercentageString(usedMemory, totalMemory)}% ${chalk.gray("(" + toBytes(usedMemory) + "/" + toBytes(totalMemory) + ")")}`); - console.info(`${chalk.yellow("Free RAM:")} ${getPercentageString(freeMemory, totalMemory)}% ${chalk.gray("(" + toBytes(freeMemory) + "/" + toBytes(totalMemory) + ")")}`); + console.info(`${chalk.yellow("Used RAM:")} ${getPercentageString(usedMemory, ramState.total)}% ${chalk.gray("(" + toBytes(usedMemory) + "/" + toBytes(ramState.total) + ")")}`); + console.info(`${chalk.yellow("Free RAM:")} ${getPercentageString(ramState.free, ramState.total)}% ${chalk.gray("(" + toBytes(ramState.free) + "/" + toBytes(ramState.total) + ")")}`); + + if (llama != null) + console.info(`${chalk.yellow("Wired RAM:")} ${getPercentageString(ramState.wired, ramState.total)}% ${chalk.gray("(" + toBytes(ramState.wired) + "/" + toBytes(ramState.total) + ")")}`); } async function logSwapUsage(llama: Llama) { diff --git a/src/cli/commands/inspect/commands/InspectMeasureCommand.ts b/src/cli/commands/inspect/commands/InspectMeasureCommand.ts index 60dcace7..9712b227 100644 --- a/src/cli/commands/inspect/commands/InspectMeasureCommand.ts +++ b/src/cli/commands/inspect/commands/InspectMeasureCommand.ts @@ -2,10 +2,10 @@ import path from "path"; import process from "process"; import {fileURLToPath} from "url"; import {fork} from "node:child_process"; -import os from "os"; import {CommandModule} from "yargs"; import chalk from "chalk"; import stripAnsi from "strip-ansi"; +import bytes from "bytes"; import {readGgufFileInfo} from "../../../../gguf/readGgufFileInfo.js"; 
import {resolveCommandGgufPath} from "../../../utils/resolveCommandGgufPath.js"; import {getLlama} from "../../../../bindings/getLlama.js"; @@ -18,12 +18,13 @@ import {resolveHeaderFlag} from "../../../utils/resolveHeaderFlag.js"; import {getPrettyBuildGpuName} from "../../../../bindings/consts.js"; import {getReadablePath} from "../../../utils/getReadablePath.js"; import {withCliCommandDescriptionDocsUrl} from "../../../utils/withCliCommandDescriptionDocsUrl.js"; -import {documentationPageUrls} from "../../../../config.js"; +import {documentationPageUrls, minAllowedContextSizeInCalculations} from "../../../../config.js"; import {Llama} from "../../../../bindings/Llama.js"; import {toBytes} from "../../../utils/toBytes.js"; import {padSafeContextSize} from "../../../../evaluator/LlamaContext/utils/padSafeContextSize.js"; import {getPlatform} from "../../../../bindings/utils/getPlatform.js"; import {GgmlType, resolveGgmlTypeOption} from "../../../../gguf/types/GgufTensorInfoTypes.js"; +import {LlamaContext, LlamaEmbeddingContext} from "../../../../apiDocsIndex.js"; type InspectMeasureCommand = { modelPath?: string, @@ -34,14 +35,17 @@ type InspectMeasureCommand = { minContextSize: number, maxContextSize?: number, flashAttention?: boolean, + embedding?: boolean, kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType, kvCacheValueType?: "currentQuant" | keyof typeof GgmlType, swaFullCache?: boolean, + maxRam?: string, + maxVram?: string, batchSize?: number, measures: number, memory: "vram" | "ram" | "all", - noMmap: boolean, - noDirectIo: boolean, + mmap?: boolean, + useDirectIo: boolean, printHeaderBeforeEachLayer?: boolean, evaluateText?: string, repeatEvaluateText?: number @@ -112,6 +116,12 @@ export const InspectMeasureCommand: CommandModule default: false, description: "Enable flash attention for the context" }) + .option("embedding", { + alias: "e", + type: "boolean", + description: "Whether to create an embedding context instead of a regular context", + 
default: false + }) .option("kvCacheKeyType", { alias: "kvckt", type: "string", @@ -138,6 +148,16 @@ export const InspectMeasureCommand: CommandModule default: false, description: "Disable SWA (Sliding Window Attention) on supported models" }) + .option("maxRam", { + alias: ["ram"], + type: "string", + description: "Maximum RAM to use for all the resources allocated by `node-llama-cpp`" + }) + .option("maxVram", { + alias: ["vram"], + type: "string", + description: "Maximum VRAM to use for all the resources allocated by `node-llama-cpp`" + }) .option("batchSize", { alias: "b", type: "number", @@ -155,15 +175,14 @@ export const InspectMeasureCommand: CommandModule default: "vram" as const, description: "Type of memory to measure" }) - .option("noMmap", { + .option("mmap", { type: "boolean", - default: false, - description: "Disable mmap (memory-mapped file) usage" + description: "Force mmap (memory-mapped file) usage. You can force disable mmap usage with `--no-mmap`. By default, mmap usage is automatically determined by `node-llama-cpp`" }) - .option("noDirectIo", { + .option("useDirectIo", { type: "boolean", default: false, - description: "Disable Direct I/O usage when available" + description: "Use Direct I/O usage when available" }) .option("printHeaderBeforeEachLayer", { alias: "ph", @@ -184,15 +203,18 @@ export const InspectMeasureCommand: CommandModule }); }, async handler({ - modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, flashAttention, - kvCacheKeyType, kvCacheValueType, swaFullCache, - batchSize, measures = 10, memory: measureMemoryType, noMmap, noDirectIo, printHeaderBeforeEachLayer = true, evaluateText, + modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, flashAttention, embedding, + kvCacheKeyType, kvCacheValueType, swaFullCache, maxRam, maxVram, + batchSize, measures = 10, memory: measureMemoryType, mmap, useDirectIo, printHeaderBeforeEachLayer = true, 
evaluateText, repeatEvaluateText }: InspectMeasureCommand) { if (maxLayers === -1) maxLayers = undefined; if (maxContextSize === -1) maxContextSize = undefined; if (minLayers < 1) minLayers = 1; + const resolvedMaxRam = (typeof maxRam === "string" && maxRam !== "") ? (bytes.parse(maxRam) ?? undefined) : undefined; + const resolvedMaxVram = (typeof maxVram === "string" && maxVram !== "") ? (bytes.parse(maxVram) ?? undefined) : undefined; + const exitAfterEachMeasurement = measureMemoryType === "ram" || measureMemoryType === "all"; const headers = resolveHeaderFlag(headerArg); @@ -206,9 +228,15 @@ export const InspectMeasureCommand: CommandModule logLevel: LlamaLogLevel.error }); + await llama.setVramCap(resolvedMaxVram ?? null); + await llama.setRamCap(resolvedMaxRam ?? null); + const platform = getPlatform(); - const useMmap = !noMmap && llama.supportsMmap; - const useDirectIo = !noDirectIo; + const useMmap = !llama.supportsMmap + ? false + : typeof mmap === "boolean" + ? mmap + : "auto"; const resolvedGgufPath = await resolveCommandGgufPath(ggufPath, llama, headers, { flashAttention, swaFullCache, useMmap, kvCacheKeyType, kvCacheValueType }); @@ -218,9 +246,11 @@ export const InspectMeasureCommand: CommandModule console.info(chalk.yellow("mmap:") + " " + ( !llama.supportsMmap ? "unsupported" - : useMmap - ? "enabled" - : "disabled" + : useMmap === "auto" + ? "auto" + : useMmap === true + ? "enabled" + : "disabled" )); if (platform !== "mac") // Direct I/O is not supported on macOS @@ -239,10 +269,21 @@ export const InspectMeasureCommand: CommandModule sourceType: "filesystem" }); const ggufInsights = await GgufInsights.from(ggufMetadata, llama); - const totalVram = (await llama.getVramState()).total; - const totalRam = os.totalmem(); - - let lastGpuLayers = maxLayers ?? ggufInsights.totalLayers; + const totalVram = (await llama._getRawVramState()).total; + const totalRam = (await llama.getRamState()).total; + + let lastGpuLayers = maxLayers ?? 
( + resolvedMaxVram == null + ? ggufInsights.totalLayers + : (await ggufInsights.configurationResolver.resolveModelGpuLayersV2({ + fitContext: { + contextSize: minAllowedContextSizeInCalculations + } + }, { + useMmap, + defaultContextFlashAttention: flashAttention ?? undefined + })).gpuLayers + ); let previousContextSizeCheck: undefined | number = undefined; const resolvedKvCacheKeyType = kvCacheKeyType === "currentQuant" @@ -283,16 +324,19 @@ export const InspectMeasureCommand: CommandModule maxContextSize, minContextSize, flashAttention, + embedding, kvCacheKeyType: resolvedKvCacheKeyType, kvCacheValueType: resolvedKvCacheValueType, swaFullCache, + maxRam: resolvedMaxRam, + maxVram: resolvedMaxVram, batchSize, tests: measures, evaluateText: evaluateText == null ? undefined : evaluateText.repeat(repeatEvaluateText ?? 1), exitAfterMeasurement: exitAfterEachMeasurement, - onInfo({gpuLayers, result}) { + async onInfo({gpuLayers, result}) { if (lastGpuLayers !== gpuLayers) { lastGpuLayers = gpuLayers; previousContextSizeCheck = undefined; @@ -336,9 +380,9 @@ export const InspectMeasureCommand: CommandModule previousContextSizeCheck = result.contextSize; hadSuccessInThisProcess = true; - const modelResourceEstimation = ggufInsights.estimateModelResourceRequirements({ + const modelResourceEstimation = await ggufInsights.estimateModelResourceRequirementsV2({ gpuLayers: lastGpuLayers, - useMmap + useMmap: result.useMmap }); const modelVramEstimation = modelResourceEstimation.gpuVram; const modelVramEstimationDiffBytes = (modelVramEstimation < result.modelVramUsage ? "-" : "") + @@ -354,12 +398,13 @@ export const InspectMeasureCommand: CommandModule const contextResourceEstimation = previousContextSizeCheck == null ? 
undefined - : ggufInsights.estimateContextResourceRequirements({ + : await ggufInsights.estimateContextResourceRequirementsV2({ contextSize: previousContextSizeCheck, modelGpuLayers: lastGpuLayers, flashAttention, swaFullCache, - batchSize + batchSize, + isEmbeddingContext: embedding }); const contextVramEstimation = contextResourceEstimation?.gpuVram; @@ -399,7 +444,11 @@ export const InspectMeasureCommand: CommandModule type: previousContextSizeCheck == null ? "Model" : "Context", - gpuLayers: String(lastGpuLayers), + gpuLayers: String(lastGpuLayers).padEnd("Layers".length - 1, " ") + ( + result.useMmap + ? chalk.gray("M") + : " " + ), contextSize: previousContextSizeCheck != null ? String(previousContextSizeCheck) : undefined, @@ -541,13 +590,18 @@ function renderDiffPercentageWithColors(percentage: number, { greenBright = 2, green = 6, yellow = 10, - yellowBright = 14 + yellowBright = 14, + nanIsZero = true }: { greenBright?: number, green?: number, yellow?: number, - yellowBright?: number + yellowBright?: number, + nanIsZero?: boolean } = {}): string { + if (nanIsZero && Number.isNaN(percentage)) + percentage = 0; + const percentageText = percentage.toFixed(2).padStart(5, "0") + "%"; const absPercentage = Math.abs(percentage); @@ -569,10 +623,12 @@ const expectedFileName = "InspectMeasureCommand"; async function measureModel({ modelPath, useMmap, useDirectIo, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers, - flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false, onInfo + flashAttention, embedding, kvCacheKeyType, kvCacheValueType, swaFullCache, maxRam, maxVram, batchSize, evaluateText, + exitAfterMeasurement = false, + onInfo }: { modelPath: string, - useMmap?: boolean, + useMmap?: "auto" | boolean, useDirectIo?: boolean, gpu?: BuildGpu | "auto", tests: number, @@ -582,9 +638,12 @@ async function measureModel({ maxGpuLayers: number, minGpuLayers?: 
number, flashAttention?: boolean, + embedding?: boolean, kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, swaFullCache?: boolean, + maxRam?: number, + maxVram?: number, batchSize?: number, evaluateText?: string, exitAfterMeasurement?: boolean, @@ -602,13 +661,14 @@ async function measureModel({ modelVramUsage: number, modelRamUsage: number, contextSize?: number, + useMmap: boolean, contextVramUsage?: number, contextRamUsage?: number, contextStateSize?: number, totalVramUsage: number, totalRamUsage: number } - }): void + }): void | Promise }) { if (!detectedFileName.startsWith(expectedFileName)) { console.warn( @@ -679,7 +739,7 @@ async function measureModel({ cleanup(); } - subProcess.on("message", (message: ChildToParentMessage) => { + subProcess.on("message", async (message: ChildToParentMessage) => { if (message.type === "ready") { forkSucceeded = true; subProcess.send({ @@ -694,9 +754,12 @@ async function measureModel({ maxGpuLayers, minGpuLayers, flashAttention, + embedding, kvCacheKeyType, kvCacheValueType, swaFullCache, + maxRam, + maxVram, batchSize, evaluateText, exitAfterMeasurement @@ -716,7 +779,7 @@ async function measureModel({ } else if (message.type === "error") { lastGpuLayers = message.gpuLayers; - onInfo({ + await onInfo({ gpuLayers: lastGpuLayers, result: { type: "error", @@ -727,13 +790,14 @@ async function measureModel({ } else if (message.type === "stats") { lastGpuLayers = message.gpuLayers; - onInfo({ + await onInfo({ gpuLayers: message.gpuLayers, result: { type: "success", modelVramUsage: message.modelVramUsage, modelRamUsage: message.modelRamUsage, contextSize: message.contextSize, + useMmap: message.useMmap, contextVramUsage: message.contextVramUsage, contextRamUsage: message.contextRamUsage, contextStateSize: message.contextStateSize, @@ -746,7 +810,7 @@ async function measureModel({ subProcess.on("exit", (code) => { if (code !== 0 || !isPlannedExit) - onInfo({ + void onInfo({ gpuLayers: lastGpuLayers, result: { type: "crash", @@ 
-759,7 +823,7 @@ async function measureModel({ if (subProcess.killed || subProcess.exitCode != null) { if (subProcess.exitCode !== 0 || !isPlannedExit) - onInfo({ + void onInfo({ gpuLayers: lastGpuLayers, result: { type: "crash", @@ -799,12 +863,12 @@ async function runTestWorkerLogic() { } async function testContextSizes({ - model, modelVramUsage, modelRamUsage, startContextSize, maxContextSize, minContextSize, tests, flashAttention, + model, modelVramUsage, modelRamUsage, startContextSize, maxContextSize, minContextSize, tests, flashAttention, embedding, kvCacheKeyType, kvCacheValueType, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false }: { model: LlamaModel, modelVramUsage: number, modelRamUsage: number, startContextSize?: number, maxContextSize?: number, - minContextSize?: number, tests: number, flashAttention?: boolean, kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, - swaFullCache?: boolean, batchSize?: number, evaluateText?: string, + minContextSize?: number, tests: number, flashAttention?: boolean, embedding?: boolean, + kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, swaFullCache?: boolean, batchSize?: number, evaluateText?: string, exitAfterMeasurement?: boolean }) { let measurementsDone: number = 0; @@ -825,29 +889,42 @@ async function runTestWorkerLogic() { currentContextSizeCheck = null; try { - const preContextVramUsage = (await llama.getVramState()).used; + const preContextVramUsage = (await llama._getRawVramState()).used; const preContextRamUsage = getMemoryUsage(llama); - const context = await model.createContext({ - contextSize: currentContextSizeCheck ?? ( - maxContextSize != null - ? 
{max: maxContextSize} - : undefined - ), - ignoreMemorySafetyChecks: currentContextSizeCheck != null, - flashAttention, - experimentalKvCacheKeyType: kvCacheKeyType, - experimentalKvCacheValueType: kvCacheValueType, - swaFullCache, - batchSize, - failedCreationRemedy: false - }); - if (evaluateText != null && evaluateText != "") { - const sequence = context.getSequence(); - await sequence.evaluateWithoutGeneratingNewTokens(model.tokenize(evaluateText)); - } + let context: LlamaContext | LlamaEmbeddingContext | undefined = undefined; + if (!embedding) { + context = await model.createContext({ + contextSize: currentContextSizeCheck ?? ( + maxContextSize != null + ? {max: maxContextSize} + : undefined + ), + ignoreMemorySafetyChecks: currentContextSizeCheck != null, + flashAttention, + experimentalKvCacheKeyType: kvCacheKeyType, + experimentalKvCacheValueType: kvCacheValueType, + swaFullCache, + batchSize, + failedCreationRemedy: false + }); + + if (evaluateText != null && evaluateText != "") { + const sequence = context.getSequence(); + await sequence.evaluateWithoutGeneratingNewTokens(model.tokenize(evaluateText)); + } + } else + context = await model.createEmbeddingContext({ + contextSize: currentContextSizeCheck ?? ( + maxContextSize != null + ? {max: maxContextSize} + : undefined + ), + ignoreMemorySafetyChecks: currentContextSizeCheck != null, + batchSize + }); - const postContextVramUsage = (await llama.getVramState()).used; + const postContextVramUsage = (await llama._getRawVramState()).used; const postContextRamUsage = getMemoryUsage(llama); measurementsDone++; @@ -856,14 +933,21 @@ async function runTestWorkerLogic() { gpuLayers: model.gpuLayers, modelVramUsage, modelRamUsage, - contextSize: context.contextSize, + contextSize: context instanceof LlamaContext + ? 
context.contextSize + : context._llamaContext.contextSize, + useMmap: model.useMmap, contextVramUsage: postContextVramUsage - preContextVramUsage, contextRamUsage: postContextRamUsage - preContextRamUsage, - contextStateSize: context.stateSize, + contextStateSize: context instanceof LlamaContext + ? context.stateSize + : context._llamaContext.stateSize, totalVramUsage: postContextVramUsage, totalRamUsage: postContextRamUsage }); - currentContextSizeCheck = context.contextSize; + currentContextSizeCheck = context instanceof LlamaContext + ? context.contextSize + : context._llamaContext.contextSize; await context.dispose(); } catch (err) { @@ -892,34 +976,52 @@ async function runTestWorkerLogic() { } async function testWithGpuLayers({ - modelPath, useMmap, useDirectIo, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, - kvCacheKeyType, kvCacheValueType, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false + modelPath, useMmap, useDirectIo, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, embedding, + kvCacheKeyType, kvCacheValueType, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false, isFirstLoad }: { - modelPath: string, useMmap?: boolean, useDirectIo?: boolean, gpuLayers: number, tests: number, startContextSize?: number, - maxContextSize?: number, minContextSize?: number, flashAttention?: boolean, kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, - swaFullCache?: boolean, batchSize?: number, - evaluateText?: string, exitAfterMeasurement?: boolean + modelPath: string, useMmap?: "auto" | boolean, useDirectIo?: boolean, gpuLayers: number, tests: number, startContextSize?: number, + maxContextSize?: number, minContextSize?: number, flashAttention?: boolean, embedding?: boolean, + kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, swaFullCache?: boolean, batchSize?: number, + evaluateText?: string, exitAfterMeasurement?: boolean, + isFirstLoad: boolean }) { 
try { - const preModelVramUsage = (await llama.getVramState()).used; + const preModelVramUsage = (await llama._getRawVramState()).used; const preModelRamUsage = getMemoryUsage(llama); - const model = await llama.loadModel({ - modelPath, - useMmap, - useDirectIo, - gpuLayers, - defaultContextFlashAttention: flashAttention, - experimentalDefaultContextKvCacheKeyType: kvCacheKeyType, - experimentalDefaultContextKvCacheValueType: kvCacheValueType, - defaultContextSwaFullCache: swaFullCache, - ignoreMemorySafetyChecks: true - }); - const postModelVramUsage = (await llama.getVramState()).used; + let model: LlamaModel | undefined = undefined; + + for (let triesLeft = 2; triesLeft > 0; triesLeft--) { + try { + model = await llama.loadModel({ + modelPath, + useMmap, + useDirectIo, + gpuLayers, + defaultContextFlashAttention: flashAttention, + experimentalDefaultContextKvCacheKeyType: kvCacheKeyType, + experimentalDefaultContextKvCacheValueType: kvCacheValueType, + defaultContextSwaFullCache: swaFullCache + }); + break; + } catch (err) { + if (isFirstLoad || triesLeft === 1) + throw err; + + // wait for the locked memory to free up before trying again + await new Promise((accept) => setTimeout(accept, 6 * 1000)); + } + } + + if (model == null) + throw new Error("Failed to load model"); + + const postModelVramUsage = (await llama._getRawVramState()).used; const postModelRamUsage = getMemoryUsage(llama); sendInfoBack({ type: "stats", gpuLayers: model.gpuLayers, + useMmap: model.useMmap, modelVramUsage: postModelVramUsage - preModelVramUsage, modelRamUsage: postModelRamUsage - preModelRamUsage, totalVramUsage: postModelVramUsage, @@ -934,6 +1036,7 @@ async function runTestWorkerLogic() { maxContextSize, minContextSize, flashAttention, + embedding, kvCacheKeyType, kvCacheValueType, swaFullCache, @@ -975,6 +1078,9 @@ async function runTestWorkerLogic() { continue; } + await llama.setVramCap(message.maxVram ?? null); + await llama.setRamCap(message.maxRam ?? 
null); + const measurementsDone = await testWithGpuLayers({ modelPath: message.modelPath, useMmap: message.useMmap, @@ -987,12 +1093,14 @@ async function runTestWorkerLogic() { maxContextSize: message.maxContextSize, minContextSize: message.minContextSize, flashAttention: message.flashAttention, + embedding: message.embedding, kvCacheKeyType: message.kvCacheKeyType, kvCacheValueType: message.kvCacheValueType, swaFullCache: message.swaFullCache, batchSize: message.batchSize, evaluateText: message.evaluateText, - exitAfterMeasurement: message.exitAfterMeasurement + exitAfterMeasurement: message.exitAfterMeasurement, + isFirstLoad: gpuLayers == message.maxGpuLayers }); if (measurementsDone > 0 && message.exitAfterMeasurement) { @@ -1077,15 +1185,18 @@ function getNextItemInCheckContextSizesPlan(plan: number[], currentSize: number) type ParentToChildMessage = { type: "start", modelPath: string, - useMmap?: boolean, + useMmap?: "auto" | boolean, useDirectIo?: boolean, tests: number, maxGpuLayers: number, minGpuLayers?: number, flashAttention?: boolean, + embedding?: boolean, kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, swaFullCache?: boolean, + maxRam?: number, + maxVram?: number, batchSize?: number, initialMaxContextSize?: number, maxContextSize?: number, @@ -1104,6 +1215,7 @@ type ChildToParentMessage = { modelVramUsage: number, modelRamUsage: number, contextSize?: number, + useMmap: boolean, contextVramUsage?: number, contextRamUsage?: number, contextStateSize?: number, @@ -1123,7 +1235,7 @@ function padStartAnsi(text: string, length: number, padChar: string = " ") { } function getMemoryUsage(llama: Llama) { - const totalMemoryUsage = llama._bindings.getMemoryInfo().total; + const totalMemoryUsage = llama._bindings.getProcessMemoryInfo().total; const vramUsage = llama._bindings.getGpuVramInfo(); let memoryUsage = totalMemoryUsage; diff --git a/src/cli/commands/source/commands/BuildCommand.ts b/src/cli/commands/source/commands/BuildCommand.ts index 
e592fa00..6053627a 100644 --- a/src/cli/commands/source/commands/BuildCommand.ts +++ b/src/cli/commands/source/commands/BuildCommand.ts @@ -4,7 +4,7 @@ import chalk from "chalk"; import {compileLlamaCpp} from "../../../../bindings/utils/compileLLamaCpp.js"; import withOra from "../../../../utils/withOra.js"; import {clearTempFolder} from "../../../../utils/clearTempFolder.js"; -import {builtinLlamaCppGitHubRepo, builtinLlamaCppRelease, isCI, defaultLlamaCppGpuSupport, documentationPageUrls} from "../../../../config.js"; +import {builtinLlamaCppGitHubRepo, isCI, defaultLlamaCppGpuSupport, documentationPageUrls} from "../../../../config.js"; import {downloadCmakeIfNeeded} from "../../../../utils/cmake.js"; import withStatusLogs from "../../../../utils/withStatusLogs.js"; import {logBinaryUsageExampleToConsole} from "../../../../bindings/utils/logBinaryUsageExampleToConsole.js"; @@ -18,6 +18,7 @@ import {getConsoleLogPrefix} from "../../../../utils/getConsoleLogPrefix.js"; import {getPrettyBuildGpuName} from "../../../../bindings/consts.js"; import {getPlatformInfo} from "../../../../bindings/utils/getPlatformInfo.js"; import {withCliCommandDescriptionDocsUrl} from "../../../utils/withCliCommandDescriptionDocsUrl.js"; +import {builtinLlamaCppRelease} from "../../../../bindings/utils/binariesGithubRelease.js"; type BuildCommand = { arch?: typeof process.arch, diff --git a/src/cli/commands/source/commands/DownloadCommand.ts b/src/cli/commands/source/commands/DownloadCommand.ts index d4e42bab..e787d7e0 100644 --- a/src/cli/commands/source/commands/DownloadCommand.ts +++ b/src/cli/commands/source/commands/DownloadCommand.ts @@ -3,17 +3,16 @@ import {CommandModule} from "yargs"; import fs from "fs-extra"; import chalk from "chalk"; import { - defaultLlamaCppGitHubRepo, defaultLlamaCppRelease, isCI, llamaCppDirectory, llamaCppDirectoryInfoFilePath, - defaultLlamaCppGpuSupport, documentationPageUrls + defaultLlamaCppGitHubRepo, isCI, llamaCppDirectory, 
llamaCppDirectoryInfoFilePath, defaultLlamaCppGpuSupport, documentationPageUrls } from "../../../../config.js"; import {compileLlamaCpp} from "../../../../bindings/utils/compileLLamaCpp.js"; import withOra from "../../../../utils/withOra.js"; import {clearTempFolder} from "../../../../utils/clearTempFolder.js"; -import {setBinariesGithubRelease} from "../../../../bindings/utils/binariesGithubRelease.js"; +import {defaultLlamaCppRelease, setBinariesGithubRelease} from "../../../../bindings/utils/binariesGithubRelease.js"; import {downloadCmakeIfNeeded} from "../../../../utils/cmake.js"; import withStatusLogs from "../../../../utils/withStatusLogs.js"; import {getIsInDocumentationMode} from "../../../../state.js"; -import {getGitBundlePathForRelease, unshallowAndSquashCurrentRepoAndSaveItAsReleaseBundle} from "../../../../utils/gitReleaseBundles.js"; +import {getGitBundlePathForRelease, isGitBundleCompatible, unshallowAndSquashCurrentRepoAndSaveItAsReleaseBundle} from "../../../../utils/gitReleaseBundles.js"; import {cloneLlamaCppRepo} from "../../../../bindings/utils/cloneLlamaCppRepo.js"; import {getPlatform} from "../../../../bindings/utils/getPlatform.js"; import {resolveCustomCmakeOptions} from "../../../../bindings/utils/resolveCustomCmakeOptions.js"; @@ -26,6 +25,7 @@ import {getConsoleLogPrefix} from "../../../../utils/getConsoleLogPrefix.js"; import {getPrettyBuildGpuName} from "../../../../bindings/consts.js"; import {getPlatformInfo} from "../../../../bindings/utils/getPlatformInfo.js"; import {withCliCommandDescriptionDocsUrl} from "../../../utils/withCliCommandDescriptionDocsUrl.js"; +import {applyLlamaCppRepoPatches, hasLlamaCppRepoPatchesToApply} from "../../../../bindings/utils/applyLlamaCppRepoPatches.js"; type DownloadCommandArgs = { repo?: string, @@ -147,14 +147,20 @@ export async function DownloadLlamaCppCommand(args: DownloadCommandArgs) { let githubReleaseTag: string | null = (useBundle && (await getGitBundlePathForRelease(githubOwner, 
githubRepo, release)) != null) ? release : null; + let githubReleaseDate: Date | undefined = undefined; - if (githubReleaseTag == null) + if (githubReleaseTag == null || ( + hasLlamaCppRepoPatchesToApply() && + !(await isGitBundleCompatible(githubOwner, githubRepo, githubReleaseTag)) + )) await withOra({ loading: chalk.blue("Fetching llama.cpp info"), success: chalk.blue("Fetched llama.cpp info"), fail: chalk.blue("Failed to fetch llama.cpp info") }, async () => { - githubReleaseTag = await resolveGithubRelease(githubOwner, githubRepo, release); + const githubRelease = await resolveGithubRelease(githubOwner, githubRepo, release); + githubReleaseTag = githubRelease.tag; + githubReleaseDate = githubRelease.date; }); await clearTempFolder(); @@ -170,6 +176,14 @@ export async function DownloadLlamaCppCommand(args: DownloadCommandArgs) { await cloneLlamaCppRepo(githubOwner, githubRepo, githubReleaseTag!, useBundle); + if (isCI && updateBinariesReleaseMetadataAndSaveGitBundle) { + await setBinariesGithubRelease(githubReleaseTag!); + await unshallowAndSquashCurrentRepoAndSaveItAsReleaseBundle(); + + await applyLlamaCppRepoPatches(githubReleaseDate, true); + } else + await applyLlamaCppRepoPatches(githubReleaseDate, false); + if (!skipBuild) { for (let i = 0; i < buildGpusToTry.length; i++) { const gpuToTry = buildGpusToTry[i]; @@ -263,11 +277,6 @@ export async function DownloadLlamaCppCommand(args: DownloadCommandArgs) { logBinaryUsageExampleToConsole(buildOptions, gpu !== "auto", true); } - if (isCI && updateBinariesReleaseMetadataAndSaveGitBundle) { - await setBinariesGithubRelease(githubReleaseTag!); - await unshallowAndSquashCurrentRepoAndSaveItAsReleaseBundle(); - } - console.log(); console.log(); console.log(`${chalk.yellow("Repo:")} ${repo}`); diff --git a/src/cli/utils/interactivelyAskForModel.ts b/src/cli/utils/interactivelyAskForModel.ts index bd1cfb71..2a118ed1 100644 --- a/src/cli/utils/interactivelyAskForModel.ts +++ 
b/src/cli/utils/interactivelyAskForModel.ts @@ -60,7 +60,7 @@ export async function interactivelyAskForModel({ modelsDirectory, allowLocalModels = true, downloadIntent = true, - flashAttention = false, + flashAttention = "auto", swaFullCache = false, useMmap, kvCacheKeyType, @@ -70,9 +70,9 @@ export async function interactivelyAskForModel({ modelsDirectory?: string, allowLocalModels?: boolean, downloadIntent?: boolean, - flashAttention?: boolean, + flashAttention?: "auto" | boolean, swaFullCache?: boolean, - useMmap?: boolean, + useMmap?: "auto" | boolean, kvCacheKeyType?: "currentQuant" | GgmlType, kvCacheValueType?: "currentQuant" | GgmlType }): Promise { @@ -126,7 +126,7 @@ export async function interactivelyAskForModel({ progressUpdater.setProgress(readItems / ggufFileNames.length, renderProgress()); const compatibilityScore = await ggufInsights?.configurationResolver.scoreModelConfigurationCompatibility({ - flashAttention: flashAttention && ggufInsights?.flashAttentionSupported, + flashAttention, swaFullCache, useMmap, kvCacheKeyType: kvCacheKeyType === "currentQuant" @@ -233,7 +233,7 @@ export async function interactivelyAskForModel({ try { while (true) { - const minWidth = Math.min(80 + (flashAttention ? 26 : 0), process.stdout.columns - 1); + const minWidth = Math.min(80 + (flashAttention !== false ? 26 : 0), process.stdout.columns - 1); const selectedItem = await basicChooseFromListConsoleInteraction({ title(item, rerender) { const title = chalk.bold("Select a model:") + " "; @@ -258,13 +258,19 @@ export async function interactivelyAskForModel({ chalk.dim("(" + toBytes(vramState.used) + "/" + toBytes(vramState.total) + ")") + " " ) + ( - !flashAttention + flashAttention === false ? "" : ( " " + chalk.bgGray( " " + - chalk.yellow("Flash attention:") + " " + "enabled" + + chalk.yellow("Flash attention:") + " " + ( + flashAttention === "auto" + ? "auto" + : flashAttention === true + ? 
"enabled" + : "disabled" + ) + " " ) ) @@ -424,8 +430,8 @@ async function askForModelUriOrPath(allowLocalModels: boolean): Promise void, abortSignal: AbortSignal, llama: Llama, flashAttention: boolean, - swaFullCache: boolean, useMmap: boolean | undefined, + item: ModelOption, focused: boolean, rerender: () => void, abortSignal: AbortSignal, llama: Llama, flashAttention: "auto" | boolean, + swaFullCache: boolean, useMmap: "auto" | boolean | undefined, kvCacheKeyType?: "currentQuant" | GgmlType, kvCacheValueType?: "currentQuant" | GgmlType ) { if (item.type === "localModel") { @@ -584,9 +590,9 @@ async function selectFileForModelRecommendation({ llama: Llama, abortSignal: AbortSignal, rerenderOption(): void, - flashAttention: boolean, + flashAttention: "auto" | boolean, swaFullCache: boolean, - useMmap?: boolean, + useMmap?: "auto" | boolean, kvCacheKeyType?: "currentQuant" | GgmlType, kvCacheValueType?: "currentQuant" | GgmlType }) { diff --git a/src/cli/utils/printCommonInfoLines.ts b/src/cli/utils/printCommonInfoLines.ts index 47a81c42..42e4eb5d 100644 --- a/src/cli/utils/printCommonInfoLines.ts +++ b/src/cli/utils/printCommonInfoLines.ts @@ -15,17 +15,23 @@ export async function printCommonInfoLines({ logBatchSize = false, tokenMeterEnabled = false, printBos = false, - printEos = false + printEos = false, + resolvedMaxRam, + resolvedMaxVram, + swaFullCache }: { context: LlamaContext, draftContext?: LlamaContext, minTitleLength?: number, - useMmap?: boolean, + useMmap?: "auto" | boolean, useDirectIo?: boolean, logBatchSize?: boolean, tokenMeterEnabled?: boolean, printBos?: boolean, - printEos?: boolean + printEos?: boolean, + resolvedMaxRam?: number, + resolvedMaxVram?: number, + swaFullCache?: boolean }) { const platform = getPlatform(); const llama = context._llama; @@ -62,6 +68,28 @@ export async function printCommonInfoLines({ }] }); } + if (resolvedMaxRam != null || resolvedMaxVram != null || swaFullCache === true) + printInfoLine({ + title: "Options", + 
padTitle: padTitle, + info: [{ + show: resolvedMaxRam != null, + title: "Max RAM", + value: toBytes(resolvedMaxRam ?? 0) + }, { + show: resolvedMaxVram != null, + title: "Max VRAM", + value: toBytes(resolvedMaxVram ?? 0) + }, { + show: swaFullCache === true, + title: "SWA", + value: model.fileInsights.swaSize == null + ? "unsupported" + : swaFullCache === true + ? "disabled" + : "enabled" + }] + }); printInfoLine({ title: "Model", padTitle: padTitle, @@ -81,9 +109,13 @@ export async function printCommonInfoLines({ title: "mmap", value: !model._llama.supportsMmap ? "unsupported" - : (useMmap || useMmap == null) + : useMmap === true ? "enabled" - : "disabled" + : (useMmap === "auto" || useMmap == null) + ? model.useMmap + ? "auto (enabled)" + : "auto (disabled)" + : "disabled" }, { title: "Direct I/O", show: platform !== "mac", // Direct IO is not supported on macOS @@ -119,9 +151,12 @@ export async function printCommonInfoLines({ title: "Batch size", value: context.batchSize.toLocaleString("en-US") }, { - show: context.flashAttention, title: "Flash attention", - value: "enabled" + value: context.flashAttention === "auto" + ? "auto" + : context.flashAttention === true + ? "enabled" + : "disabled" }, { show: tokenMeterEnabled, title: "Token meter", @@ -178,9 +213,12 @@ export async function printCommonInfoLines({ title: "Batch size", value: draftContext.batchSize.toLocaleString("en-US") }, { - show: draftContext.flashAttention, title: "Flash attention", - value: "enabled" + value: draftContext.flashAttention === "auto" + ? "auto" + : draftContext.flashAttention === true + ? 
"enabled" + : "disabled" }, { show: tokenMeterEnabled, title: "Token meter", diff --git a/src/cli/utils/resolveCommandGgufPath.ts b/src/cli/utils/resolveCommandGgufPath.ts index a11cfa96..cd50ddca 100644 --- a/src/cli/utils/resolveCommandGgufPath.ts +++ b/src/cli/utils/resolveCommandGgufPath.ts @@ -14,10 +14,10 @@ import {getReadablePath} from "./getReadablePath.js"; import {interactivelyAskForModel} from "./interactivelyAskForModel.js"; export async function resolveCommandGgufPath(ggufPath: string | undefined, llama: Llama, fetchHeaders?: Record, { - targetDirectory = cliModelsDirectory, flashAttention = false, swaFullCache = false, useMmap, consoleTitle = "File", + targetDirectory = cliModelsDirectory, flashAttention = "auto", swaFullCache = false, useMmap, consoleTitle = "File", kvCacheKeyType, kvCacheValueType }: { - targetDirectory?: string, flashAttention?: boolean, swaFullCache?: boolean, useMmap?: boolean, consoleTitle?: string, + targetDirectory?: string, flashAttention?: "auto" | boolean, swaFullCache?: boolean, useMmap?: "auto" | boolean, consoleTitle?: string, kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType, kvCacheValueType?: "currentQuant" | keyof typeof GgmlType } = {}) { @@ -85,6 +85,60 @@ export async function resolveCommandGgufPath(ggufPath: string | undefined, llama } await fs.remove(downloader.entrypointFilePath); + } else if (downloader.totalFiles > 1) { + const filesInfo = downloader.filesInfo; + const fileStats = await Promise.all( + filesInfo.map(async (info) => { + try { + return { + stats: await fs.stat(info.filePath), + info + }; + } catch (err) { + return { + stats: null, + info + }; + } + }) + ); + + const totalFileSizes = fileStats.reduce((res, {stats}) => res + (stats?.size ?? 
0), 0); + const allFilesExist = fileStats.every(({stats}) => stats != null); + if (allFilesExist && totalFileSizes === downloader.totalSize) { + console.info(`${chalk.yellow(consoleTitle + ":")} ${getReadablePath(downloader.entrypointFilePath)}`); + + return downloader.entrypointFilePath; + } else if (allFilesExist && totalFileSizes !== downloader.totalSize) { + const res = await ConsoleInteraction.yesNoQuestion( + "There are already downloaded local files for this model that are different from the remote ones.\n" + + "Download them and override the existing files?" + ); + + if (!res) { + console.info("Loading the existing files"); + console.info(`${chalk.yellow(consoleTitle + ":")} ${getReadablePath(downloader.entrypointFilePath)}`); + + return downloader.entrypointFilePath; + } else + await Promise.all( + fileStats.map(({stats, info}) => { + if (stats == null) + return undefined; + + return fs.remove(info.filePath); + }) + ); + } else + await Promise.all( + fileStats.map(async ({stats, info}) => { + if (stats == null) + return; + + if (stats.size !== info.totalSize) + await fs.remove(info.filePath); + }) + ); } const consoleInteraction = new ConsoleInteraction(); diff --git a/src/config.ts b/src/config.ts index 5337d012..c302e179 100644 --- a/src/config.ts +++ b/src/config.ts @@ -4,7 +4,6 @@ import * as os from "os"; import process from "process"; import envVar from "env-var"; import {nanoid} from "nanoid"; -import {getBinariesGithubRelease} from "./bindings/utils/binariesGithubRelease.js"; import { nodeLlamaCppGpuOptions, LlamaLogLevel, LlamaLogLevelValues, parseNodeLlamaCppGpuOption, nodeLlamaCppGpuOffStringOptions } from "./bindings/types.js"; @@ -24,6 +23,7 @@ export const projectTemplatesDirectory = path.join(__dirname, "..", "templates") export const localTempDirectory = path.join(__dirname, "..", ".temp"); export const packedProjectTemplatesDirectory = path.join(projectTemplatesDirectory, "packed"); export const llamaCppDirectory = 
path.join(llamaDirectory, "llama.cpp"); +export const llamaCppPatchesDirectory = path.join(llamaDirectory, "patches"); export const llamaCppGrammarsDirectory = path.join(llamaDirectory, "llama.cpp", "grammars"); export const tempDownloadDirectory = path.join(os.tmpdir(), "node-llama-cpp", nanoid()); export const cliHomedirDirectory = path.join(os.homedir(), ".node-llama-cpp"); @@ -40,7 +40,7 @@ export const localXpacksCacheDirectory = path.join(xpackDirectory, "cache"); export const buildMetadataFileName = "_nlcBuildMetadata.json"; export const xpmVersion = "^0.16.3"; export const builtinLlamaCppGitHubRepo = "ggml-org/llama.cpp"; -export const builtinLlamaCppRelease = await getBinariesGithubRelease(); +// builtinLlamaCppRelease moved to binariesGithubRelease.ts export const isCI = env.get("CI") .default("false") @@ -52,9 +52,13 @@ export const useCiLogs = isCI || isRunningInsideGoogleColab; export const defaultLlamaCppGitHubRepo = env.get("NODE_LLAMA_CPP_REPO") .default(builtinLlamaCppGitHubRepo) .asString(); -export const defaultLlamaCppRelease = env.get("NODE_LLAMA_CPP_REPO_RELEASE") - .default(builtinLlamaCppRelease) +// defaultLlamaCppRelease moved to binariesGithubRelease.ts +export const llamaCppReleaseEnvVar = env.get("NODE_LLAMA_CPP_REPO_RELEASE") .asString(); +export const defaultLlamaCppRepoSkipPatches = env.get("NODE_LLAMA_CPP_REPO_SKIP_PATCHES") + .default("") + .asString() + .split(";"); export const defaultLlamaCppGpuSupport = parseNodeLlamaCppGpuOption( env.get("NODE_LLAMA_CPP_GPU") .default("auto") diff --git a/src/evaluator/LlamaChat/LlamaChat.ts b/src/evaluator/LlamaChat/LlamaChat.ts index 4ecf3399..5f716e0d 100644 --- a/src/evaluator/LlamaChat/LlamaChat.ts +++ b/src/evaluator/LlamaChat/LlamaChat.ts @@ -522,6 +522,7 @@ const defaultSegmentBudgetSize = (contextSize: number) => ( ? 
contextSize * 0.5 : contextSize * 0.75 ); +const maxGrammarEndReevaluateCheckpointRestoreEvaluationBudget = 10; export class LlamaChat { @@ -2920,7 +2921,12 @@ class GenerateResponseState Math.floor(maxGrammarEndReevaluateCheckpointRestoreEvaluationBudget / 2) ) await this.llamaChat.sequence._takeNamedCheckpoint( internalCheckpoints.chatGrammarEnd.name, @@ -3383,7 +3389,10 @@ class GenerateResponseState Math.floor(maxGrammarEndReevaluateCheckpointRestoreEvaluationBudget / 2) ) return this.llamaChat.sequence._takeNamedCheckpoint( internalCheckpoints.chatGrammarEnd.name, diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts index 248c763e..cb3be164 100644 --- a/src/evaluator/LlamaContext/LlamaContext.ts +++ b/src/evaluator/LlamaContext/LlamaContext.ts @@ -14,8 +14,9 @@ import {ThreadsSplitterConsumer} from "../../utils/ThreadsSplitter.js"; import {pushAll} from "../../utils/pushAll.js"; import {safeEventCallback} from "../../utils/safeEventCallback.js"; import {GgufArchitectureType} from "../../gguf/types/GgufMetadataTypes.js"; -import {LlamaLogLevel} from "../../bindings/types.js"; +import {LlamaLocks, LlamaLogLevel} from "../../bindings/types.js"; import {GgmlType, resolveGgmlTypeOption} from "../../gguf/types/GgufTensorInfoTypes.js"; +import {MemoryMarking} from "../../bindings/utils/MemoryOrchestrator.js"; import { BatchingOptions, BatchItem, ContextShiftOptions, ContextTokensDeleteRange, ControlledEvaluateIndexOutput, ControlledEvaluateInputItem, EvaluationPriority, LlamaContextOptions, LlamaContextSequenceDryRepeatPenalty, LlamaContextSequenceRepeatPenalty, PrioritizedBatchItem, @@ -71,7 +72,7 @@ export class LlamaContext { /** @internal */ private readonly _model: LlamaModel; /** @internal */ private readonly _contextSize: number; /** @internal */ private readonly _batchSize: number; - /** @internal */ private readonly _flashAttention: boolean; + /** @internal */ private readonly _flashAttention: "auto" | boolean; 
/** @internal */ private readonly _idealThreads: number; /** @internal */ private readonly _minThreads: number; /** @internal */ private readonly _performanceTracking: boolean; @@ -86,6 +87,8 @@ export class LlamaContext { /** @internal */ private readonly _disposeAggregator = new AsyncDisposeAggregator(); /** @internal */ private readonly _modelPreventDisposalHandle: DisposalPreventionHandle; /** @internal */ private readonly _loraAdapters = new Set(); + /** @internal */ public _vramConsumptionMarking?: MemoryMarking; + /** @internal */ public _ramConsumptionMarking?: MemoryMarking; /** @internal */ private _nextGeneratedSequenceId = 0; /** @internal */ private _dispatchDecodeScheduled = false; /** @internal */ private _batchDispatchPending = false; @@ -121,7 +124,7 @@ export class LlamaContext { sequences: number, contextSize: number, batchSize: number, - flashAttention: boolean, + flashAttention: "auto" | boolean, experimentalKvCacheKeyType: GgmlType, experimentalKvCacheValueType: GgmlType }) { @@ -163,7 +166,9 @@ export class LlamaContext { : 0 ), sequences: this._totalSequences, - flashAttention: this._flashAttention, + flashAttention: this._flashAttention === "auto" + ? "auto" + : Boolean(this._flashAttention), threads: this._idealThreads, embeddings: _embeddings, ranking: _ranking, @@ -194,6 +199,8 @@ export class LlamaContext { this._disposeAggregator.add(async () => { await this._backendContextDisposeGuard.acquireDisposeLock(); await this._ctx.dispose(); + this._vramConsumptionMarking?.dispose(); + this._ramConsumptionMarking?.dispose(); this._modelPreventDisposalHandle.dispose(); }); } @@ -228,7 +235,7 @@ export class LlamaContext { return this._batchSize; } - public get flashAttention(): boolean { + public get flashAttention(): "auto" | boolean { return this._flashAttention; } @@ -888,9 +895,10 @@ export class LlamaContext { const kvUnified = false; const sequences = Math.max(1, Math.floor(options.sequences ?? 
getDefaultContextSequences())); - const flashAttention = _model.flashAttentionSupported - ? Boolean(options.flashAttention ?? _model.defaultContextFlashAttention) - : false; + const flashAttentionOption = options.flashAttention ?? _model.defaultContextFlashAttention; + const flashAttention = flashAttentionOption === "auto" + ? "auto" + : Boolean(flashAttentionOption); const kvCacheKeyType = options.experimentalKvCacheKeyType === "currentQuant" ? _model.fileInsights.dominantTensorType ?? _model.defaultContextKvCacheKeyType : resolveGgmlTypeOption(options.experimentalKvCacheKeyType) ?? _model.defaultContextKvCacheKeyType; @@ -940,7 +948,7 @@ export class LlamaContext { async function createContext(contextSize: number) { const batchSize = options.batchSize ?? getDefaultContextBatchSize({contextSize, sequences}); - const resourceRequirementsEstimation = _model.fileInsights.estimateContextResourceRequirements({ + const resourceRequirementsEstimation = await _model.fileInsights.estimateContextResourceRequirementsV2({ contextSize, sequences, isEmbeddingContext: options._embeddings, @@ -967,13 +975,19 @@ export class LlamaContext { : _model._llama._vramOrchestrator.reserveMemory(resourceRequirementsEstimation.gpuVram); const contextCreationRamReservation = options.ignoreMemorySafetyChecks ? 
null - : _model._llama._vramOrchestrator.reserveMemory(resourceRequirementsEstimation.cpuRam); + : _model._llama._ramOrchestrator.reserveMemory(resourceRequirementsEstimation.cpuRam); try { if (createSignal?.aborted) throw createSignal.reason; - const contextLoaded = await context._ctx.init(); + const initLock = await acquireLock([_model._llama._memoryLock, LlamaLocks.addonInit]); + let contextLoaded: boolean = false; + try { + contextLoaded = await context._ctx.init(); + } finally { + initLock.dispose(); + } if (createSignal?.aborted) { if (contextLoaded) @@ -983,6 +997,9 @@ export class LlamaContext { } else if (!contextLoaded) throw new Error("Failed to create context"); + const memoryBreakdown = context._ctx.getMemoryBreakdown(); + context._vramConsumptionMarking = _model._llama._vramOrchestrator.markAllocation(memoryBreakdown.gpuVram); + context._ramConsumptionMarking = _model._llama._ramOrchestrator.markAllocation(memoryBreakdown.cpuRam); contextCreationVramReservation?.dispose?.(); contextCreationRamReservation?.dispose?.(); @@ -1097,7 +1114,7 @@ export class LlamaContextSequence { }; this._gcRegistry = new FinalizationRegistry(this._context._reclaimUnusedSequenceId); - this._gcRegistry.register(this, sequenceId); + this._gcRegistry.register(this, sequenceId, this); this._disposeAggregator.add(() => this._gcRegistry.unregister(this)); this._disposeAggregator.add(this.onDispose.dispatchEvent); @@ -1945,6 +1962,28 @@ export class LlamaContextSequence { return this._checkpoints.memoryUsage; } + /** + * Check how many tokens will be evaluated when trying to restore to checkpoint at a given index + */ + public getRestoreToPrefixIndexEvaluationSize(tokenIndex: number) { + if (tokenIndex < 0) + return 0; + else if (tokenIndex >= this.nextTokenIndex) + return -1; + + if (!this.needsCheckpoints) + return 0; + + if (tokenIndex >= this.stateCellsStartIndex) + return 0; + + const checkpoint = this._checkpoints.getLastCheckpoint(tokenIndex, this.contextSize); + if 
(checkpoint == null) + return tokenIndex; + + return Math.max(0, tokenIndex - checkpoint.maxPos); + } + /** @internal */ private async _takeCheckpoint(name: string | undefined, maxNamedCheckpoints: number) { if (!this.needsCheckpoints || this._nextTokenIndex === 0 || this._checkpoints.hasCheckpoint(name, this._nextTokenIndex - 1)) diff --git a/src/evaluator/LlamaContext/types.ts b/src/evaluator/LlamaContext/types.ts index a0a64f02..23480cf5 100644 --- a/src/evaluator/LlamaContext/types.ts +++ b/src/evaluator/LlamaContext/types.ts @@ -50,18 +50,14 @@ export type LlamaContextOptions = { /** * Flash attention is an optimization in the attention mechanism that makes inference faster, more efficient and uses less memory. - * - * The support for flash attention is currently experimental and may not always work as expected. - * Use with caution. + * + * When set to `"auto"`, flash attention will automatically be used when supported by the model and hardware capabilities. * * This option will be ignored if flash attention is not supported by the model. * - * Defaults to `false` (inherited from the model option `defaultContextFlashAttention`). - * - * Upon flash attention exiting the experimental status, the default value will become `true` - * (the inherited value from the model option `defaultContextFlashAttention` will become `true`). + * Defaults to `"auto"` (inherited from the model option `defaultContextFlashAttention`). */ - flashAttention?: boolean, + flashAttention?: "auto" | boolean, /** * number of threads to use to evaluate tokens. 
diff --git a/src/evaluator/LlamaEmbeddingContext.ts b/src/evaluator/LlamaEmbeddingContext.ts index cd91d747..3394ed76 100644 --- a/src/evaluator/LlamaEmbeddingContext.ts +++ b/src/evaluator/LlamaEmbeddingContext.ts @@ -50,7 +50,7 @@ export type LlamaEmbeddingContextOptions = { * @see [Using Embedding](https://node-llama-cpp.withcat.ai/guide/embedding) tutorial */ export class LlamaEmbeddingContext { - /** @internal */ private readonly _llamaContext: LlamaContext; + /** @internal */ public readonly _llamaContext: LlamaContext; /** @internal */ private readonly _sequence: LlamaContextSequence; /** @internal */ private readonly _disposeAggregator = new AsyncDisposeAggregator(); diff --git a/src/evaluator/LlamaModel/LlamaModel.ts b/src/evaluator/LlamaModel/LlamaModel.ts index 84dfa65e..d14dc183 100644 --- a/src/evaluator/LlamaModel/LlamaModel.ts +++ b/src/evaluator/LlamaModel/LlamaModel.ts @@ -1,6 +1,6 @@ import process from "process"; import path from "path"; -import {AsyncDisposeAggregator, DisposedError, EventRelay, withLock} from "lifecycle-utils"; +import {acquireLock, AsyncDisposeAggregator, DisposedError, EventRelay, withLock} from "lifecycle-utils"; import {removeNullFields} from "../../utils/removeNullFields.js"; import {Token, Tokenizer} from "../../types.js"; import {AddonModel, AddonModelLora, ModelTypeDescription} from "../../bindings/AddonTypes.js"; @@ -8,7 +8,7 @@ import {DisposalPreventionHandle, DisposeGuard} from "../../utils/DisposeGuard.j import {LlamaLocks, LlamaLogLevel, LlamaVocabularyType, LlamaVocabularyTypeValues} from "../../bindings/types.js"; import {GgufFileInfo} from "../../gguf/types/GgufFileInfoTypes.js"; import {readGgufFileInfo} from "../../gguf/readGgufFileInfo.js"; -import {GgufInsights} from "../../gguf/insights/GgufInsights.js"; +import {GgufInsights, GgufInsightsResourceRequirements} from "../../gguf/insights/GgufInsights.js"; import {getConsoleLogPrefix} from "../../utils/getConsoleLogPrefix.js"; import {Writable} from 
"../../utils/utilTypes.js"; import {getReadablePath} from "../../cli/utils/getReadablePath.js"; @@ -20,6 +20,7 @@ import {OverridesObject} from "../../utils/OverridesObject.js"; import {maxRecentDetokenizerTokens} from "../../consts.js"; import {LlamaRankingContext, LlamaRankingContextOptions} from "../LlamaRankingContext.js"; import {GgmlType, resolveGgmlTypeOption} from "../../gguf/types/GgufTensorInfoTypes.js"; +import {MemoryMarking} from "../../bindings/utils/MemoryOrchestrator.js"; import {TokenAttribute, TokenAttributes} from "./utils/TokenAttributes.js"; import type {Llama} from "../../bindings/Llama.js"; import type {BuiltinSpecialTokenValue} from "../../utils/LlamaText.js"; @@ -72,10 +73,12 @@ export type LlamaModelOptions = { * * When using mmap, you might notice a delay the first time you actually use the model, * which is caused by the OS itself loading the model into memory. + * + * When this option is set to `"auto"`, mmap may be disabled in scenarios where doing so allows more layers to be offloaded to the GPU. * - * Defaults to `true` if the current system supports it. + * Defaults to `"auto"` if the current system supports it. */ - useMmap?: boolean, + useMmap?: "auto" | boolean, /** * Direct I/O is a method of reading and writing data to and from the storage device directly to the application memory, @@ -113,20 +116,15 @@ export type LlamaModelOptions = { * * Flash attention is an optimization in the attention mechanism that makes inference faster, more efficient and uses less memory. * - * The support for flash attention is currently experimental and may not always work as expected. - * Use with caution. - * * This option will be ignored if flash attention is not supported by the model. * * Enabling this affects the calculations of default values for the model and contexts created with it * as flash attention reduces the amount of memory required, * which allows for more layers to be offloaded to the GPU and for context sizes to be bigger. 
* - * Defaults to `false`. - * - * Upon flash attention exiting the experimental status, the default value will become `true`. + * Defaults to `"auto"`. */ - defaultContextFlashAttention?: boolean, + defaultContextFlashAttention?: "auto" | boolean, /** * The default type of the key for the KV cache tensors used for contexts created with this model. @@ -198,9 +196,9 @@ export type LlamaModelOptions = { metadataOverrides?: OverridesObject }; -const defaultUseMmap = true; +const defaultUseMmap = "auto" as const satisfies NonNullable; const defaultUseDirectIo = false; -const defaultContextFlashAttentionEnabled = false; +const defaultContextFlashAttentionOptionDefault = "auto" as const satisfies NonNullable; const defaultContextSwaFullCache = false; export class LlamaModel { @@ -212,18 +210,21 @@ export class LlamaModel { /** @internal */ private readonly _fileInfo: GgufFileInfo; /** @internal */ private readonly _fileInsights: GgufInsights; /** @internal */ private readonly _gpuLayers: number; + /** @internal */ public readonly _useMmap: boolean; /** @internal */ private readonly _vocabOnly: boolean; /** @internal */ private readonly _filename?: string; /** @internal */ private readonly _disposedState: DisposedState = {disposed: false}; /** @internal */ private readonly _disposeAggregator = new AsyncDisposeAggregator(); /** @internal */ private readonly _llamaPreventDisposalHandle: DisposalPreventionHandle; - /** @internal */ private readonly _defaultContextFlashAttentionOptionEnabled: boolean; - /** @internal */ private readonly _defaultContextFlashAttention: boolean; + /** @internal */ private readonly _defaultContextFlashAttentionOptionEnabled: "auto" | boolean; + /** @internal */ private readonly _defaultContextFlashAttention: "auto" | boolean; /** @internal */ private readonly _defaultContextSwaFullCache: boolean; /** @internal */ private readonly _defaultContextKvCacheKeyType: GgmlType; /** @internal */ private readonly _defaultContextKvCacheValueType: GgmlType; 
/** @internal */ private readonly _flashAttentionSupported: boolean; /** @internal */ private readonly _loraAdapters = new Map(); + /** @internal */ public _vramConsumptionMarking?: MemoryMarking; + /** @internal */ public _ramConsumptionMarking?: MemoryMarking; /** @internal */ private _typeDescription?: ModelTypeDescription; /** @internal */ private _trainContextSize?: number; /** @internal */ private _embeddingVectorSize?: number; @@ -233,9 +234,11 @@ export class LlamaModel { public readonly onDispose = new EventRelay(); private constructor({ - modelPath, gpuLayers, vocabOnly = false, useMmap, useDirectIo, useMlock, checkTensors, onLoadProgress, loadSignal, metadataOverrides + modelPath, gpuLayers, vocabOnly = false, useMmap, useDirectIo, useMlock = false, checkTensors, onLoadProgress, loadSignal, + metadataOverrides }: LlamaModelOptions & { - gpuLayers: number + gpuLayers: number, + useMmap: boolean }, { _llama, _fileInfo, @@ -250,8 +253,8 @@ export class LlamaModel { _llama: Llama, _fileInfo: GgufFileInfo, _fileInsights: GgufInsights, - _defaultContextFlashAttentionOptionEnabled: boolean, - _defaultContextFlashAttention: boolean, + _defaultContextFlashAttentionOptionEnabled: "auto" | boolean, + _defaultContextFlashAttention: "auto" | boolean, _defaultContextSwaFullCache: boolean, _defaultContextKvCacheKeyType: GgmlType, _defaultContextKvCacheValueType: GgmlType, @@ -262,6 +265,7 @@ export class LlamaModel { this._modelPath = path.resolve(process.cwd(), modelPath); this._fileInsights = _fileInsights; this._gpuLayers = gpuLayers; + this._useMmap = useMmap ?? false; this._vocabOnly = vocabOnly ?? 
false; this._backendModelDisposeGuard = new DisposeGuard([this._llama._backendDisposeGuard]); this._llamaPreventDisposalHandle = this._llama._backendDisposeGuard.createPreventDisposalHandle(); @@ -313,6 +317,8 @@ export class LlamaModel { this._disposeAggregator.add(async () => { await this._backendModelDisposeGuard.acquireDisposeLock(); await this._model.dispose(); + this._vramConsumptionMarking?.dispose(); + this._ramConsumptionMarking?.dispose(); this._llamaPreventDisposalHandle.dispose(); }); @@ -375,6 +381,16 @@ export class LlamaModel { return this._gpuLayers; } + /** + * Whether the model is loaded using mmap (memory-mapped file) or not. + * + * When Direct I/O (setting the `useDirectIo` option to `true`) is used it'll override mmap and this value may be out of sync + * with the actual usage of mmap for the loading of this model instance. + */ + public get useMmap(): boolean { + return this._useMmap; + } + /** * Total model size in memory in bytes. * @@ -567,7 +583,7 @@ export class LlamaModel { if (token == null) return false; - if (this.getTokenAttributes(token).control) + if (this.getTokenAttributes(token).control || this.isEogToken(token)) return true; const normalText = this.detokenize([token], false); @@ -658,7 +674,7 @@ export class LlamaModel { } try { - if (this._defaultContextFlashAttentionOptionEnabled && !this._flashAttentionSupported) { + if (this._defaultContextFlashAttentionOptionEnabled === true && !this._flashAttentionSupported) { if (this.fileInfo.metadata?.general?.architecture === GgufArchitectureType.grok) warnings.push("Flash attention is incompatible with Grok and thus was turned off"); else if (this.fileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma2) @@ -762,7 +778,11 @@ export class LlamaModel { experimentalDefaultContextKvCacheKeyType, experimentalDefaultContextKvCacheValueType } = modelOptions; - const useMmap = _llama.supportsMmap && (modelOptions.useMmap ?? 
defaultUseMmap); + const useMmap = !_llama.supportsMmap + ? false + : typeof modelOptions.useMmap === "boolean" + ? modelOptions.useMmap + : defaultUseMmap; const useDirectIo = modelOptions.useDirectIo ?? defaultUseDirectIo; const fileInfo = await readGgufFileInfo(modelOptions.modelPath, { @@ -771,9 +791,12 @@ export class LlamaModel { }); applyGgufMetadataOverrides(fileInfo, modelOptions.metadataOverrides); const ggufInsights = await GgufInsights.from(fileInfo, _llama); + ggufInsights._defaultUseMmap = useMmap === "auto" + ? true + : useMmap; const flashAttentionSupported = ggufInsights.flashAttentionSupported; const resolvedDefaultContextFlashAttention = flashAttentionSupported - ? (defaultContextFlashAttention ?? defaultContextFlashAttentionEnabled) + ? defaultContextFlashAttention ?? defaultContextFlashAttentionOptionDefault : false; const resolvedDefaultContextSwaFullCache = modelOptions.defaultContextSwaFullCache ?? defaultContextSwaFullCache; const resolvedDefaultContextKvCacheKeyType = experimentalDefaultContextKvCacheKeyType === "currentQuant" @@ -782,24 +805,39 @@ export class LlamaModel { const resolvedDefaultContextKvCacheValueType = experimentalDefaultContextKvCacheValueType === "currentQuant" ? ggufInsights.dominantTensorType ?? GgmlType.F16 : resolveGgmlTypeOption(experimentalDefaultContextKvCacheValueType) ?? 
GgmlType.F16; - const gpuLayers = await ggufInsights.configurationResolver.resolveModelGpuLayers(modelOptions.gpuLayers, { - ignoreMemorySafetyChecks: modelOptions.ignoreMemorySafetyChecks, - defaultContextFlashAttention: resolvedDefaultContextFlashAttention, - defaultContextSwaFullCache: resolvedDefaultContextSwaFullCache, - defaultContextKvCacheKeyType: resolvedDefaultContextKvCacheKeyType, - defaultContextKvCacheValueType: resolvedDefaultContextKvCacheValueType, - useMmap - }); - const resourceRequirementsEstimation = ggufInsights.estimateModelResourceRequirements({ - gpuLayers: gpuLayers, - useMmap - }); + + let gpuLayers: number; + let resolvedUseMmap: boolean; + let resourceRequirementsEstimation: GgufInsightsResourceRequirements; + const simulatorSession = ggufInsights._createSimulatorSession(); + try { + const layersResolution = await ggufInsights.configurationResolver.resolveModelGpuLayersV2(modelOptions.gpuLayers, { + ignoreMemorySafetyChecks: modelOptions.ignoreMemorySafetyChecks, + defaultContextFlashAttention: resolvedDefaultContextFlashAttention, + defaultContextSwaFullCache: resolvedDefaultContextSwaFullCache, + defaultContextKvCacheKeyType: resolvedDefaultContextKvCacheKeyType, + defaultContextKvCacheValueType: resolvedDefaultContextKvCacheValueType, + useMmap, + + _simulatorSession: simulatorSession + }); + gpuLayers = layersResolution.gpuLayers; + resolvedUseMmap = layersResolution.useMmap; + resourceRequirementsEstimation = await ggufInsights.estimateModelResourceRequirementsV2({ + gpuLayers, + useMmap: resolvedUseMmap, + + _simulatorSession: simulatorSession + }); + } finally { + simulatorSession.dispose(); + } - const model = new LlamaModel({...modelOptions, gpuLayers, useMmap, useDirectIo}, { + const model = new LlamaModel({...modelOptions, gpuLayers, useMmap: resolvedUseMmap, useDirectIo}, { _fileInfo: fileInfo, _fileInsights: ggufInsights, _llama, - _defaultContextFlashAttentionOptionEnabled: defaultContextFlashAttention ?? 
false, + _defaultContextFlashAttentionOptionEnabled: defaultContextFlashAttention ?? defaultContextFlashAttentionOptionDefault, _flashAttentionSupported: flashAttentionSupported, _defaultContextFlashAttention: resolvedDefaultContextFlashAttention, _defaultContextSwaFullCache: resolvedDefaultContextSwaFullCache, @@ -839,7 +877,13 @@ export class LlamaModel { logWarnings(ggufInsights.getWarnings(modelOptions.modelPath)); try { - const modelLoaded = await model._model.init(); + const initLock = await acquireLock([_llama._memoryLock, LlamaLocks.addonInit]); + let modelLoaded: boolean = false; + try { + modelLoaded = await model._model.init(); + } finally { + initLock.dispose(); + } if (loadSignal?.aborted) { if (modelLoaded) @@ -853,6 +897,12 @@ export class LlamaModel { logWarnings(model.getWarnings()); + const memoryBreakdown = model._model.getMemoryBreakdown(); + model._vramConsumptionMarking = _llama._vramOrchestrator.markAllocation(memoryBreakdown.gpuVram); + model._ramConsumptionMarking = _llama._ramOrchestrator.markAllocation(memoryBreakdown.cpuRam); + modelCreationVramReservation?.dispose?.(); + modelCreationRamReservation?.dispose?.(); + return model; } finally { loadSignal?.removeEventListener("abort", onAbort); diff --git a/src/gguf/insights/GgufInsights.ts b/src/gguf/insights/GgufInsights.ts index ed364c35..5384e0a5 100644 --- a/src/gguf/insights/GgufInsights.ts +++ b/src/gguf/insights/GgufInsights.ts @@ -1,4 +1,7 @@ +import {acquireLock, withLock} from "lifecycle-utils"; +import bytes from "bytes"; import {Llama} from "../../bindings/Llama.js"; +import {doesLlamaBackendNeedAddonInitLock, LlamaLocks, LlamaLogLevel} from "../../bindings/types.js"; import {getLlamaWithoutBackend} from "../../bindings/utils/getLlamaWithoutBackend.js"; import {getDefaultContextBatchSize, getDefaultContextSequences} from "../../evaluator/LlamaContext/LlamaContext.js"; import {GgufFileInfo} from "../types/GgufFileInfoTypes.js"; @@ -6,8 +9,13 @@ import {GgmlType, GgufTensorInfo} 
from "../types/GgufTensorInfoTypes.js"; import {GgufArchitectureType} from "../types/GgufMetadataTypes.js"; import {getReadablePath} from "../../cli/utils/getReadablePath.js"; import {padSafeContextSize} from "../../evaluator/LlamaContext/utils/padSafeContextSize.js"; +import {removeNullFields, removeUndefinedFields} from "../../utils/removeNullFields.js"; +import {LruCache} from "../../utils/LruCache.js"; import {GgufInsightsConfigurationResolver} from "./GgufInsightsConfigurationResolver.js"; import {GgufInsightsTokens} from "./GgufInsightsTokens.js"; +import type {Promisable} from "../../utils/transformPromisable.js"; +import type {LlamaContextOptions} from "../../evaluator/LlamaContext/types.js"; +import type {AddonContextParams, AddonGgufMetadata, AddonModel, AddonModelParams} from "../../bindings/AddonTypes.js"; export type GgufInsightsResourceRequirements = { cpuRam: number, @@ -20,9 +28,15 @@ export class GgufInsights { /** @internal */ private _totalFileLayers: number | null = null; /** @internal */ private _supportsRanking?: boolean; /** @internal */ private _dominantTensorType?: GgmlType; + /** @internal */ private _addonMetadata?: AddonGgufMetadata; + /** @internal */ public _defaultUseMmap?: boolean; /** @internal */ public readonly _ggufFileInfo: GgufFileInfo; /** @internal */ private readonly _configurationResolver: GgufInsightsConfigurationResolver; /** @internal */ private readonly _tokens: GgufInsightsTokens; + /** @internal */ private readonly _exactModelResourceRequirementsCache = new LruCache(40); + /** @internal */ private readonly _exactContextResourceRequirementsCache = new LruCache(40); + /** @internal */ private readonly _simulationSession: GgufInsightsSimulatorSession; + /** @internal */ private readonly _locks = {}; private constructor(ggufFileInfo: GgufFileInfo, llama: Llama) { this._llama = llama; @@ -31,6 +45,7 @@ export class GgufInsights { this._modelSize = calculateTensorsSize(ggufFileInfo.fullTensorInfo ?? 
[], llama, true, true); this._configurationResolver = GgufInsightsConfigurationResolver._create(this); this._tokens = GgufInsightsTokens._create(this); + this._simulationSession = this._createSimulatorSession(); } /** @@ -40,16 +55,23 @@ export class GgufInsights { */ public getWarnings(modelFilePath?: string) { const warnings: string[] = []; - const modelFilePathText = (modelFilePath != null && modelFilePath !== "") - ? ` ("${getReadablePath(modelFilePath)}")` - : ""; + const resolvedModelFilePath = modelFilePath || ( + this._ggufFileInfo.source?.type === "path" + ? this._ggufFileInfo.source.path + : undefined + ); + const modelFileSourceText = (resolvedModelFilePath != null && resolvedModelFilePath !== "") + ? ` ("${getReadablePath(resolvedModelFilePath)}")` + : this._ggufFileInfo.source?.type === "uri" + ? ` ("${getReadablePath(this._ggufFileInfo.source.uri)}")` + : ""; if (this._ggufFileInfo?.metadata?.tokenizer?.ggml?.model === "gpt2" && this._ggufFileInfo?.metadata?.tokenizer?.ggml?.model == null ) { // equivalent to the warning in `llama.cpp` under `llm_load_vocab`: "missing pre-tokenizer type, using: 'default'" warnings.push( - `This model file${modelFilePathText} is missing a pre-tokenizer configuration. ` + + `This model file${modelFileSourceText} is missing a pre-tokenizer configuration. ` + "This may cause incorrect tokenization and thus degrade the generation quality. 
" + "Consider using a newer model or regenerating this GGUF model file" ); @@ -214,8 +236,37 @@ export class GgufInsights { return slidingWindow; } + /** @internal */ + public _getAddonMetadata(): Promisable { + if (this._addonMetadata != null || this._ggufFileInfo.sourceData.length === 0) + return this._addonMetadata; + + return withLock([this._locks, "addonMetadata"], async () => { + if (this._addonMetadata != null) + return this._addonMetadata; + + const initInput: Array = []; + for (const data of this._ggufFileInfo.sourceData) { + if (data.type === "buffer") + initInput.push(data.buffer); + else if (data.type === "path") + initInput.push(data.path); + else + void (data satisfies never); + } + + const addonMetadata = new this._llama._bindings.AddonGgufMetadata(); + await addonMetadata.init(initInput); + this._addonMetadata = addonMetadata; + return addonMetadata; + }); + } + + /** + * @deprecated Use `estimateModelResourceRequirementsV2` instead + */ public estimateModelResourceRequirements({ - gpuLayers, useMmap = this._llama.supportsMmap, gpuSupportsMmap = this._llama.gpuSupportsMmap + gpuLayers, useMmap = this._getUseMmap(), gpuSupportsMmap = this._llama.gpuSupportsMmap }: { gpuLayers: number, useMmap?: boolean, gpuSupportsMmap?: boolean }): GgufInsightsResourceRequirements { @@ -227,10 +278,100 @@ export class GgufInsights { }; } + public async estimateModelResourceRequirementsV2(options: { + gpuLayers: number, useMmap?: boolean, gpuSupportsMmap?: boolean, + + /** @internal */ + _simulatorSession?: GgufInsightsSimulatorSession + }): Promise { + const { + gpuLayers, useMmap = this._getUseMmap(), gpuSupportsMmap = this._llama.gpuSupportsMmap, + + _simulatorSession + } = options; + + try { + const simulationResult = await this._simulateModelResourceUsage({ + gpuLayers, + useMmap, + simulatorSession: _simulatorSession + }); + + if (simulationResult != null) { + if (!useMmap || !gpuSupportsMmap) + return simulationResult; + + // adjust for the missing mmap 
simulation implementation + const standardEstimation = this.estimateModelResourceRequirements({ + gpuLayers, + useMmap, + gpuSupportsMmap + }); + + return { + gpuVram: Math.max(simulationResult.gpuVram, standardEstimation.gpuVram), + cpuRam: Math.min(simulationResult.cpuRam, standardEstimation.cpuRam) + }; + } + } catch (error: any) { + this._llama._log(LlamaLogLevel.warn, error?.message ?? String(error)); + } + + return this.estimateModelResourceRequirements({ + gpuLayers, + useMmap, + gpuSupportsMmap + }); + } + + /** @internal */ + public async _simulateModelResourceUsage({ + gpuLayers, + useMmap = this._getUseMmap(), + simulatorSession = this._simulationSession + }: { + gpuLayers: number, + useMmap?: boolean, + simulatorSession?: GgufInsightsSimulatorSession + }): Promise { + const cacheKey = [gpuLayers, Number(useMmap)].join(":"); + const cachedValue = this._exactModelResourceRequirementsCache.get(cacheKey); + if (cachedValue != null) + return {...cachedValue}; + + const lock = await acquireLock([this._locks, "_simulateModelResourceUsage", cacheKey]); + try { + const cachedValue = this._exactModelResourceRequirementsCache.get(cacheKey); + if (cachedValue != null) + return {...cachedValue}; + + const simulatorSource = await this._resolveSimulatorSource(); + if (simulatorSource == null) + return null; + + let resourceRequirements: GgufInsightsResourceRequirements; + try { + resourceRequirements = await simulatorSession.estimateModelResources({ + modelSource: simulatorSource, + gpuLayers, + useMmap + }); + } catch (error: any) { + throw new Error("Failed simulating model resource usage. Falling back to estimation heuristic. Error: " + (error?.message ?? String(error))); + } + + this._exactModelResourceRequirementsCache.set(cacheKey, resourceRequirements); + return {...resourceRequirements}; + } finally { + lock.dispose(); + } + } + /** * Estimates the memory required to create a context of the given parameters based on the implementation details of `llama.cpp`. 
* The calculation doesn't include a precise estimation of the graph overhead memory, so it uses a rough estimate for that. * The estimation for the graph overhead memory will be improved in the future to be more precise, but it's good enough for now. + * @deprecated Use `estimateContextResourceRequirementsV2` instead */ public estimateContextResourceRequirements({ contextSize, modelGpuLayers, batchSize, sequences, isEmbeddingContext = false, includeGraphOverhead = true, flashAttention = false, @@ -248,29 +389,48 @@ export class GgufInsights { const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? []; const slidingWindow = this.swaSize ?? 0; const kvUnified = false; - const usingSWA = !swaFullCache && slidingWindow > 0 && slidingWindow < contextSize && + const totalFileLayers = this._getTotalFileLayers(); + const hasSwaAttention = slidingWindow > 0; + const usingReducedSWA = hasSwaAttention && !swaFullCache && slidingWindow < contextSize && (this.trainContextSize == null || slidingWindow < this.trainContextSize); - const swaPattern = getSwaPatternForArchitecture( - this._ggufFileInfo.metadata?.general?.architecture, - this._ggufFileInfo.architectureMetadata?.attention?.sliding_window_pattern - ); - const nonSwaPercent = swaPattern <= 1 - ? 1 - : (1 / (swaPattern + (flashAttention ? -0.5 : -1))); + let graphRelevantTensorCount = 0; + let graphRelevantTensorElements = 0; + let totalTensorElements = 0; + + for (const singleTensorInfo of tensorInfo) { + let tensorElements = 0; + for (const dim of singleTensorInfo.dimensions) + tensorElements += Number(dim); + + totalTensorElements += tensorElements; - // source: `llama_kv_cache_unified::get_padding` in `llama-kv-cache.cpp` - const kvCachePadding = 1; + if (!isGraphRelevantTensor(singleTensorInfo.name)) + continue; + + graphRelevantTensorCount++; + graphRelevantTensorElements += tensorElements; + } + + const effectiveGraphTensorCount = graphRelevantTensorCount > 0 + ? 
graphRelevantTensorCount + : tensorInfo.length; + const effectiveGraphTensorElements = graphRelevantTensorCount > 0 + ? graphRelevantTensorElements + : totalTensorElements; + + const paddedContextSize = padSafeContextSize(contextSize, "up"); const actualContextSize = kvUnified ? padSafeContextSize(sequences * contextSize, "up") - : sequences * padSafeContextSize(contextSize, "up"); - const kvSize = usingSWA - ? ( - (1 - nonSwaPercent) * Math.min(actualContextSize, ggmlPad(sequences * slidingWindow + batchSize, kvCachePadding)) + - nonSwaPercent * actualContextSize - ) - : actualContextSize; - - const totalFileLayers = this._getTotalFileLayers(); + : sequences * paddedContextSize; + const fullAttentionKvSize = actualContextSize; + const swaBatchSize = hasSwaAttention && !swaFullCache + ? batchSize + 1 + : batchSize; + const swaKvSize = !hasSwaAttention + ? actualContextSize + : !usingReducedSWA + ? actualContextSize + : Math.min(actualContextSize, ggmlPad((sequences * slidingWindow) + swaBatchSize, 256)); const totalLayersIncludingOutput = totalFileLayers + 1; const finalModelGpuLayers = Math.max( 0, @@ -284,13 +444,17 @@ export class GgufInsights { gpuKVCacheSize, cpuKVCacheSize, gpuRecurrentStateSize, - cpuRecurrentStateSize + cpuRecurrentStateSize, + maxAttentionLayerKvSize, + maxAttentionLayerHeadCountKv } = this._estimateContextCacheMemorySplitInBytes({ - kvSize, + fullAttentionKvSize, + swaKvSize, sequences, totalFileLayers, finalModelGpuLayers, usingGpu, + flashAttention, kvCacheKeyType, kvCacheValueType }); @@ -324,37 +488,82 @@ export class GgufInsights { const estimateGraphOverheadMemory = (): number => { const s1MB = Math.pow(1024, 2); - const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? []; const expertCount = llmData?.expert_count ?? 0; const headCount = llmData?.attention?.head_count ?? 0; const embeddingLength = llmData?.embedding_length ?? 
0; + const activeGraphTokens = roundUpToMultiple( + Math.max(1, Math.min(paddedContextSize, batchSize)), + Math.max(1, sequences) + ); + const graphContextSize = resolveGraphContextSizeForOverheadEstimation({ + fullAttentionKvSize, + trainContextSize: this.trainContextSize, + flashAttention, + headCount, + batchSize, + paddedContextSize, + sequences + }); let defaultCalculationAdjustment = 0; + const totalElements = effectiveGraphTensorCount === 0 + ? this.totalLayers * ( + ( + (llmData.embedding_length ?? 0) + + (llmData.feed_forward_length ?? 0) + ) / 2 + ) + : effectiveGraphTensorElements; + const tensorBasedGraphOverhead = (tensorElementMultiplier: number) => ( + (totalElements * tensorElementMultiplier * (graphContextSize / 4096)) + defaultCalculationAdjustment + ); + const batchLocalTensorBasedGraphOverhead = (tensorElementMultiplier: number) => ( + (totalElements * tensorElementMultiplier * (activeGraphTokens / 4096)) + defaultCalculationAdjustment + ); if (batchSize == null) return 0; + const genericNonFlashAttentionWorkspaceEstimate = !flashAttention + ? estimateNonFlashAttentionWorkspace({ + trainContextSize: this.trainContextSize, + fullAttentionKvSize, + swaKvSize, + hasSwaAttention, + maxAttentionLayerKvSize, + maxAttentionLayerHeadCountKv, + activeGraphTokens, + headCount + }) + : 0; + if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.llama) { if (expertCount > 0) { const expertsUsedCount = this._ggufFileInfo.architectureMetadata.expert_used_count ?? 
2; - return int32TBytes * batchSize * (((expertsUsedCount + 1) * embeddingLength) + (kvSize * headCount)); + return Math.max( + int32TBytes * batchSize * (((expertsUsedCount + 1) * embeddingLength) + (graphContextSize * headCount)), + genericNonFlashAttentionWorkspaceEstimate + ); } - return int32TBytes * batchSize * (embeddingLength + (kvSize * headCount)); + return Math.max( + int32TBytes * batchSize * (embeddingLength + (graphContextSize * headCount)), + genericNonFlashAttentionWorkspaceEstimate + ); } else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.qwen2) { if (modelGpuLayers === this.totalLayers) { defaultCalculationAdjustment -= (s1MB * 340) * ( this.trainContextSize == null ? 1 - : kvSize / this.trainContextSize + : graphContextSize / this.trainContextSize ); } else { defaultCalculationAdjustment -= (s1MB * 250) + ( (s1MB * 50) * ( this.trainContextSize == null ? 1 - : kvSize / this.trainContextSize + : graphContextSize / this.trainContextSize ) ); } @@ -367,7 +576,7 @@ export class GgufInsights { (s1MB * 270) * ( this.trainContextSize == null ? 1 - : kvSize / this.trainContextSize + : graphContextSize / this.trainContextSize ) ); } else { @@ -375,14 +584,25 @@ export class GgufInsights { (s1MB * 150) * ( this.trainContextSize == null ? 1 - : Math.max(0, (1 - (kvSize / this.trainContextSize))) + : Math.max(0, (1 - (graphContextSize / this.trainContextSize))) ) ); } + } else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.gemma3) { + const trainContextSize = Math.max(1, this.trainContextSize ?? 
graphContextSize); + const contextRatio = Math.min(1, Math.max(0, graphContextSize / trainContextSize)); + + return Math.max( + int32TBytes * batchSize * graphContextSize * headCount * (0.08 + Math.pow(contextRatio, 2)), + genericNonFlashAttentionWorkspaceEstimate + ); } else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.stablelm) { const headCount = this._ggufFileInfo.architectureMetadata.attention?.head_count ?? 0; - return (int32TBytes * batchSize * kvSize * headCount) - (50 * s1MB); + return Math.max( + (int32TBytes * batchSize * graphContextSize * headCount) - (50 * s1MB), + genericNonFlashAttentionWorkspaceEstimate + ); // if (modelGpuLayers === this.totalLayers) { // defaultCalculationAdjustment += -(s1MB * 20) + ( @@ -402,34 +622,54 @@ export class GgufInsights { // ); // } } else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.qwen3) { - return int32TBytes * batchSize * (embeddingLength + (kvSize * headCount)); + return Math.max( + int32TBytes * batchSize * (embeddingLength + (graphContextSize * headCount)), + genericNonFlashAttentionWorkspaceEstimate + ); + } else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.gemma4) { + const trainContextSize = Math.max(1, this.trainContextSize ?? 
graphContextSize); + const contextRatio = Math.min(1, Math.max(0, graphContextSize / trainContextSize)); + const gemma4DenseShortContextScale = 0.4; + const gemma4DenseContextScaleExponent = 3; + const gemma4DenseEstimate = int32TBytes * batchSize * graphContextSize * headCount * + (gemma4DenseShortContextScale + Math.pow(contextRatio, gemma4DenseContextScaleExponent)); + + if (expertCount > 0) { + const tensorBasedEstimate = tensorBasedGraphOverhead(77.655); + const moeBlendWeight = Math.sqrt(contextRatio); + + return Math.max( + gemma4DenseEstimate, + (gemma4DenseEstimate + ((tensorBasedEstimate - gemma4DenseEstimate) * moeBlendWeight)) * 1.01, + genericNonFlashAttentionWorkspaceEstimate + ); + } + + return Math.max(gemma4DenseEstimate, genericNonFlashAttentionWorkspaceEstimate); } else if (expertCount > 0) { const expertsUsedCount = this._ggufFileInfo.architectureMetadata.expert_used_count ?? 2; - return int32TBytes * batchSize * (((expertsUsedCount + 1) * embeddingLength) + (kvSize * headCount)); + return Math.max( + int32TBytes * batchSize * (((expertsUsedCount + 1) * embeddingLength) + (graphContextSize * headCount)), + genericNonFlashAttentionWorkspaceEstimate + ); } - const totalElements = tensorInfo.length === 0 - ? this.totalLayers * ( - ( - (llmData.embedding_length ?? 0) + - (llmData.feed_forward_length ?? 0) - ) / 2 - ) - : tensorInfo.reduce((res, tensor) => { - return res + tensor.dimensions.reduce((res: number, dim) => res + Number(dim), 0); - }, 0); - if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.phi3) { // magic numbers for estimation. will be improved in the future - return (totalElements * 123 * (kvSize / 4096)) + defaultCalculationAdjustment; + return Math.max(tensorBasedGraphOverhead(123), genericNonFlashAttentionWorkspaceEstimate); } else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.cohere2) { // magic numbers for estimation. 
will be improved in the future - return (totalElements * 148 * (kvSize / 4096)) + defaultCalculationAdjustment; + return Math.max(tensorBasedGraphOverhead(148), genericNonFlashAttentionWorkspaceEstimate); } // magic numbers for estimation. will be improved in the future - return (totalElements * 77.655 * (kvSize / 4096)) + defaultCalculationAdjustment; + return Math.max( + !flashAttention + ? batchLocalTensorBasedGraphOverhead(77.655) + : tensorBasedGraphOverhead(77.655), + genericNonFlashAttentionWorkspaceEstimate + ); }; // source: `llama_context::graph_max_nodes` in `llama-context.cpp` @@ -449,10 +689,10 @@ export class GgufInsights { this._ggufFileInfo.metadata?.general?.architecture, Math.min(actualContextSize, batchSize) ); - const maxNodes = Math.max(maxNodesMultiplier.min, maxNodesMultiplier.multiplier * tensorInfo.length); + const maxNodes = Math.max(maxNodesMultiplier.min, maxNodesMultiplier.multiplier * effectiveGraphTensorCount); const cpuNodes = totalFileLayers === 0 ? 0 - : maxNodesMultiplier.multiplier * (tensorInfo.length * (finalCpuLayers / totalFileLayers)); + : maxNodesMultiplier.multiplier * (effectiveGraphTensorCount * (finalCpuLayers / totalFileLayers)); const gpuNodes = maxNodes - cpuNodes; const gpuComputeBufferSize = (this._llama._consts.ggmlTensorOverhead * gpuNodes) + @@ -460,7 +700,7 @@ export class GgufInsights { const cpuComputeBufferSize = (this._llama._consts.ggmlTensorOverhead * cpuNodes) + this._llama._bindings.getGgmlGraphOverheadCustom(cpuNodes, false); - const graphOverheadMemory = (flashAttention || !includeGraphOverhead) + const graphOverheadMemory = !includeGraphOverhead ? 
0 : estimateGraphOverheadMemory(); const graphOverheadGpuSize = (usingGpu && totalFileLayers > 0) @@ -481,6 +721,148 @@ export class GgufInsights { }; } + public async estimateContextResourceRequirementsV2(options: { + contextSize: number, modelGpuLayers: number, batchSize?: number, sequences?: number, isEmbeddingContext?: boolean, + flashAttention?: LlamaContextOptions["flashAttention"], swaFullCache?: boolean, + kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, + useMmap?: boolean, + + /** @internal */ + _simulatorSession?: GgufInsightsSimulatorSession + }): Promise { + const { + contextSize, modelGpuLayers, batchSize, sequences, isEmbeddingContext = false, flashAttention = "auto", + swaFullCache = false, + kvCacheKeyType = GgmlType.F16, kvCacheValueType = GgmlType.F16, + useMmap, + + _simulatorSession + } = options; + + try { + const simulationResult = await this._simulateContextResourceUsage({ + contextSize, + modelGpuLayers, + batchSize, + sequences, + isEmbeddingContext, + flashAttention, + swaFullCache, + useMmap, + simulatorSession: _simulatorSession, + kvCacheKeyType, + kvCacheValueType + }); + if (simulationResult != null) + return simulationResult; + } catch (error: any) { + this._llama._log(LlamaLogLevel.warn, error?.message ?? String(error)); + } + + return this.estimateContextResourceRequirements({ + contextSize, + modelGpuLayers, + batchSize, + sequences, + isEmbeddingContext, + flashAttention: flashAttention === true, + swaFullCache, + kvCacheKeyType, + kvCacheValueType + }); + } + + /** @internal */ + public _getUseMmap(useMmapOption?: boolean) { + return useMmapOption ?? this._defaultUseMmap ?? 
this._llama.supportsMmap; + } + + /** @internal */ + public async _simulateContextResourceUsage({ + contextSize, modelGpuLayers, batchSize, sequences, isEmbeddingContext = false, flashAttention = "auto", + swaFullCache = false, useMmap = this._getUseMmap(), + kvCacheKeyType = GgmlType.F16, kvCacheValueType = GgmlType.F16, + simulatorSession = this._simulationSession + }: { + contextSize: number, modelGpuLayers: number, batchSize?: number, sequences?: number, isEmbeddingContext?: boolean, + flashAttention?: LlamaContextOptions["flashAttention"], swaFullCache?: boolean, useMmap?: boolean, + kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, + simulatorSession?: GgufInsightsSimulatorSession + }): Promise { + if (sequences == null) sequences = getDefaultContextSequences(); + if (batchSize == null) batchSize = getDefaultContextBatchSize({contextSize, sequences}); + + const kvUnified = false; + + const cacheKey = [ + contextSize, + modelGpuLayers, + batchSize, + sequences, + Number(isEmbeddingContext), + flashAttention === "auto" + ? "auto" + : String(flashAttention), + Number(swaFullCache), + Number(useMmap), + kvCacheKeyType, + kvCacheValueType + ].join(":"); + const cachedValue = this._exactContextResourceRequirementsCache.get(cacheKey); + if (cachedValue != null) + return {...cachedValue}; + + const lock = await acquireLock([this._locks, "_simulateContextResourceUsage", cacheKey]); + try { + const cachedValue = this._exactContextResourceRequirementsCache.get(cacheKey); + if (cachedValue != null) + return {...cachedValue}; + + const simulatorSource = await this._resolveSimulatorSource(); + if (simulatorSource == null) + return null; + + let contextResources: GgufInsightsResourceRequirements; + try { + const paddedContextSize = padSafeContextSize(contextSize, "up"); + const actualContextSize = kvUnified + ? 
padSafeContextSize(sequences * contextSize, "up") + : sequences * paddedContextSize; + const actualBatchSize = Math.max(batchSize, sequences) + ( + (!swaFullCache && this.swaSize != null && this.swaSize > 0) + ? 1 // +1 to handle edge cases with SWA KV cache + : 0 + ); + + contextResources = await simulatorSession.estimateContextResources({ + modelSource: simulatorSource, + gpuLayers: modelGpuLayers, + contextSize: actualContextSize, + batchSize: actualBatchSize, + sequences, + isEmbeddingContext, + flashAttention, + swaFullCache, + useMmap, + kvCacheKeyType, + kvCacheValueType + }); + } catch (error: any) { + throw new Error("Failed simulating context resource usage. Falling back to estimation heuristic. Error: " + (error?.message ?? String(error))); + } + + const resourceRequirements = { + cpuRam: contextResources.cpuRam, + gpuVram: contextResources.gpuVram + } satisfies GgufInsightsResourceRequirements; + + this._exactContextResourceRequirementsCache.set(cacheKey, resourceRequirements); + return {...resourceRequirements}; + } finally { + lock.dispose(); + } + } + /** * Get the split tensor resources for CPU and GPU based on the number of GPU layers * @internal @@ -500,7 +882,7 @@ export class GgufInsights { } const fileLayers = this._getFileLayers(); - const startGpuLayer = Math.max(0, fileLayers - gpuLayers); + const startGpuLayer = Math.max(0, fileLayers - gpuLayers + 1); const gpuTensors: GgufTensorInfo[] = []; const cpuTensors: GgufTensorInfo[] = []; @@ -523,7 +905,7 @@ export class GgufInsights { // in the implementation of `llm_load_tensors`, layers with `LLM_TENSOR_LAYER_OUTPUT` are always // loaded with `model.dev_output`, which is set to the GPU only if all the layers are on the GPU } else if (isOutputLayer(singleTensorInfo.name)) { - if (gpuLayers === this.totalLayers) { + if (gpuLayers > 0) { gpuTensors.push(singleTensorInfo); continue; } else { @@ -580,19 +962,23 @@ export class GgufInsights { } private _estimateContextCacheMemorySplitInBytes({ - 
kvSize, + fullAttentionKvSize, + swaKvSize, sequences, totalFileLayers, finalModelGpuLayers, usingGpu, + flashAttention, kvCacheKeyType = GgmlType.F16, kvCacheValueType = GgmlType.F16 }: { - kvSize: number, + fullAttentionKvSize: number, + swaKvSize: number, sequences: number, totalFileLayers: number, finalModelGpuLayers: number, usingGpu: boolean, + flashAttention: boolean, kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType }) { @@ -603,8 +989,24 @@ export class GgufInsights { const nEmbdHeadK = this._ggufFileInfo.architectureMetadata.attention?.key_length ?? ((nHead == 0) ? 0 : (nEmbd / nHead)); const nHeadKv: number | number[] = this._ggufFileInfo.architectureMetadata.attention?.head_count_kv ?? nHead; const nEmbdHeadV = this._ggufFileInfo.architectureMetadata.attention?.value_length ?? ((nHead == 0) ? 0 : nEmbd / nHead); + const nEmbdHeadKSwa = this._ggufFileInfo.architectureMetadata.attention?.key_length_swa; + const nEmbdHeadVSwa = this._ggufFileInfo.architectureMetadata.attention?.value_length_swa; + const sharedKvLayers = this._ggufFileInfo.architectureMetadata.attention?.shared_kv_layers; + const slidingWindowPattern = this._ggufFileInfo.architectureMetadata.attention?.sliding_window_pattern; const keyTypeSize = this._llama._bindings.getTypeSizeForGgmlType(kvCacheKeyType) ?? this._llama._consts.ggmlTypeF16Size; const valueTypeSize = this._llama._bindings.getTypeSizeForGgmlType(kvCacheValueType) ?? 
this._llama._consts.ggmlTypeF16Size; + const nHeadKvValues = nHeadKv as unknown; + let maxLayerValueEmbedding = 0; + + if (!flashAttention && nHeadKvValues instanceof Array) { + for (let i = 0; i < totalFileLayers; i++) { + const layerHeadCountKv = resolveLayerHeadCountKv(nHeadKvValues, i, nHead); + const isSwaLayer = isSwaLayerAtIndex(architecture, slidingWindowPattern, i); + const layerValueEmbedding = resolveLayerHeadDimension(nEmbdHeadV, nEmbdHeadVSwa, isSwaLayer) * layerHeadCountKv; + + maxLayerValueEmbedding = Math.max(maxLayerValueEmbedding, layerValueEmbedding); + } + } // source: `llama_model::load_tensors` in `llama-model.cpp` // repeating layers are assigned to GPU from `i_gpu_start = n_layer + 1 - n_gpu_layers` @@ -618,6 +1020,8 @@ export class GgufInsights { let cpuKvElementsV = 0; let gpuRecurrentLayers = 0; let cpuRecurrentLayers = 0; + let maxAttentionLayerKvSize = 0; + let maxAttentionLayerHeadCountKv = 0; for (let i = 0; i < totalFileLayers; i++) { const isGpuLayer = i >= gpuRepeatingLayerStart; @@ -629,9 +1033,22 @@ export class GgufInsights { else cpuRecurrentLayers++; } else { + if (!doesLayerOwnKvCache(totalFileLayers, i, sharedKvLayers)) + continue; + const nHeadKvLayer = resolveLayerHeadCountKv(nHeadKv, i, nHead); - const layerElementsK = nEmbdHeadK * nHeadKvLayer * kvSize; - const layerElementsV = nEmbdHeadV * nHeadKvLayer * kvSize; + const isSwaLayer = isSwaLayerAtIndex(architecture, slidingWindowPattern, i); + const layerKvSize = isSwaLayer + ? swaKvSize + : fullAttentionKvSize; + maxAttentionLayerKvSize = Math.max(maxAttentionLayerKvSize, layerKvSize); + maxAttentionLayerHeadCountKv = Math.max(maxAttentionLayerHeadCountKv, nHeadKvLayer); + const layerElementsK = resolveLayerHeadDimension(nEmbdHeadK, nEmbdHeadKSwa, isSwaLayer) * nHeadKvLayer * layerKvSize; + const layerElementsV = layerKvSize * ( + maxLayerValueEmbedding > 0 + ? 
maxLayerValueEmbedding + : (resolveLayerHeadDimension(nEmbdHeadV, nEmbdHeadVSwa, isSwaLayer) * nHeadKvLayer) + ); if (isGpuLayer) { gpuKvElementsK += layerElementsK; @@ -658,7 +1075,9 @@ export class GgufInsights { gpuKVCacheSize, cpuKVCacheSize, gpuRecurrentStateSize, - cpuRecurrentStateSize + cpuRecurrentStateSize, + maxAttentionLayerKvSize, + maxAttentionLayerHeadCountKv }; } @@ -726,6 +1145,23 @@ export class GgufInsights { return this._totalFileLayers; } + /** @internal */ + private async _resolveSimulatorSource(): Promise { + const addonMetadata = await this._getAddonMetadata(); + if (addonMetadata != null) + return addonMetadata; + + if (this._ggufFileInfo.source?.type === "path") + return this._ggufFileInfo.source.path; + + return null; + } + + /** @internal */ + public _createSimulatorSession(lruCacheSize: number = 10) { + return new GgufInsightsSimulatorSession(this._llama, lruCacheSize); + } + /** * @param ggufFileInfo * @param llama - If you already have a `Llama` instance, pass it to reuse it for the `GgufInsights` instance. @@ -742,6 +1178,195 @@ export class GgufInsights { } } +export class GgufInsightsSimulatorSession { + private readonly _llama: Llama; + private readonly _modelPromises: LruCache>; + private _disposed = false; + + public constructor(llama: Llama, lruCacheSize: number = 10) { + this._llama = llama; + this._modelPromises = new LruCache(lruCacheSize); + } + + public async estimateModelResources({ + modelSource, + gpuLayers, + useMmap = false + }: { + modelSource: string | AddonGgufMetadata, + gpuLayers: number, + useMmap?: boolean + }): Promise { + const model = await this._getModel({source: modelSource, gpuLayers, useMmap}); + const memoryBreakdown = model.getMemoryBreakdown(); + if (this._llama._shouldLog(LlamaLogLevel.debug)) + this._llama._log(LlamaLogLevel.debug, "Simulating model resource usage. 
" + [ + `gpuLayers=${gpuLayers}`, + `useMmap=${useMmap}`, + `memoryBreakdownCpuRam=${bytes(memoryBreakdown.cpuRam)}`, + `memoryBreakdownGpuVram=${bytes(memoryBreakdown.gpuVram)}` + ].join(" ")); + return memoryBreakdown; + } + + public async estimateContextResources({ + modelSource, + gpuLayers, + contextSize, + batchSize, + sequences, + isEmbeddingContext = false, + flashAttention = "auto", + swaFullCache = false, + useMmap = false, + kvCacheKeyType = GgmlType.F16, + kvCacheValueType = GgmlType.F16 + }: { + modelSource: string | AddonGgufMetadata, + gpuLayers: number, + contextSize: number, + batchSize: number, + sequences: number, + isEmbeddingContext?: boolean, + flashAttention?: LlamaContextOptions["flashAttention"], + swaFullCache?: boolean, + useMmap?: boolean, + kvCacheKeyType?: GgmlType, + kvCacheValueType?: GgmlType + }): Promise { + const model = await this._getModel({source: modelSource, gpuLayers, useMmap}); + const context = new this._llama._bindings.AddonContext(model, removeUndefinedFields({ + contextSize, + batchSize, + sequences, + embeddings: isEmbeddingContext, + flashAttention: flashAttention === "auto" + ? "auto" + : flashAttention, + kvCacheKeyType, + kvCacheValueType, + swaFullCache + } satisfies AddonContextParams)); + + try { + const loadingLock = doesLlamaBackendNeedAddonInitLock(this._llama.gpu) + ? await acquireLock([this._llama._memoryLock, LlamaLocks.addonInit]) + : undefined; + try { + const contextLoaded = await context.init(); + if (!contextLoaded) + throw new Error("Failed to create context"); + } finally { + loadingLock?.dispose(); + } + + const memoryBreakdown = context.getMemoryBreakdown(); + if (this._llama._shouldLog(LlamaLogLevel.debug)) + this._llama._log(LlamaLogLevel.debug, "Simulating context resource usage. 
" + [ + `gpuLayers=${gpuLayers}`, + `contextSize=${contextSize.toLocaleString("en-US", {notation: "compact"})}`, + `batchSize=${batchSize}`, + `sequences=${sequences}`, + `isEmbeddingContext=${isEmbeddingContext}`, + `flashAttention=${flashAttention}`, + `swaFullCache=${swaFullCache}`, + `kvCacheKeyType=${kvCacheKeyType}`, + `kvCacheValueType=${kvCacheValueType}`, + `useMmap=${useMmap}`, + `memoryBreakdownCpuRam=${bytes(memoryBreakdown.cpuRam)}`, + `memoryBreakdownGpuVram=${bytes(memoryBreakdown.gpuVram)}` + ].join(" ")); + return memoryBreakdown; + } finally { + await context.dispose(); + } + } + + public [Symbol.asyncDispose]() { + return this.dispose(); + } + + public async dispose() { + if (this._disposed) + return; + + this._disposed = true; + + const modelPromises = [...this._modelPromises.values()].map((modelPromise) => modelPromise.catch(() => void 0)); + this._modelPromises.clear(); + const loadedModels = (await Promise.all(modelPromises)).filter((model) => model != null); + + await Promise.all(loadedModels.map((model) => model.dispose().catch(() => void 0))); + } + + private async _getModel({ + source, + gpuLayers, + useMmap = this._llama.supportsMmap + }: { + source: string | AddonGgufMetadata, + gpuLayers: number, + useMmap?: boolean + }) { + if (this._disposed) + throw new Error("simulator session is disposed"); + + const cacheKey = String(gpuLayers) + ":" + String(useMmap); + const existingModelPromise = this._modelPromises.get(cacheKey); + if (existingModelPromise != null) + return await existingModelPromise; + + if (this._llama._shouldLog(LlamaLogLevel.debug)) + this._llama._log(LlamaLogLevel.debug, `Loading model for simulator session. 
gpuLayers=${gpuLayers} useMmap=${useMmap}`); + const modelPromise = this._loadModel({ + source, + gpuLayers, + useMmap + }); + this._modelPromises.set(cacheKey, modelPromise); + + try { + return await modelPromise; + } catch (error) { + this._modelPromises.delete(cacheKey); + throw error; + } + } + + private async _loadModel({ + source, gpuLayers, useMmap = false + }: { + source: string | AddonGgufMetadata, gpuLayers: number, useMmap?: boolean + }) { + const model = new this._llama._bindings.AddonModel( + typeof source === "string" + ? source + : "", + removeNullFields({ + gpuLayers, + noAlloc: true, + useMmap, + useMlock: false + } satisfies AddonModelParams) + ); + + const loadingLock = doesLlamaBackendNeedAddonInitLock(this._llama.gpu) + ? await acquireLock([this._llama._memoryLock, LlamaLocks.addonInit]) + : undefined; + try { + const modelLoaded = typeof source === "string" + ? await model.init() + : await model.init(source); + if (!modelLoaded) + throw new Error("Failed to load model"); + } finally { + loadingLock?.dispose(); + } + + return model; + } +} + function parseTensorName(tensorName?: string): { layerNumber: number | undefined } { @@ -941,49 +1566,212 @@ function isTokenEmbedLayer(layerName: string) { return firstPart === "token_embd"; } +function isGraphRelevantTensor(tensorName: string): boolean { + return isInputLayer(tensorName) || + isOutputLayer(tensorName) || + tensorName.startsWith("blk.") || + tensorName.startsWith("enc.blk.") || + tensorName.startsWith("dec.blk."); +} + function ggmlPad(value: number, padding: number): number { return ((value + padding - 1) & ~(padding - 1)); } -function getSwaPatternForArchitecture(architecture?: GgufArchitectureType, slidingWindowPattern?: number | number[]): number { - if (typeof slidingWindowPattern === "number") - return slidingWindowPattern; +function roundUpToMultiple(value: number, multiple: number): number { + if (multiple <= 1) + return value; + + return Math.ceil(value / multiple) * multiple; +} 
+ +function resolveGraphContextSizeForOverheadEstimation({ + fullAttentionKvSize, + trainContextSize, + flashAttention, + headCount, + batchSize, + paddedContextSize, + sequences +}: { + fullAttentionKvSize: number, + trainContextSize: number | undefined, + flashAttention: boolean, + headCount: number, + batchSize: number, + paddedContextSize: number, + sequences: number +}) { + // heuristic coefficients fit to estimate llama.cpp graph-reserve behavior + const flashAttentionMinContextMultiplier = 0.5; + const flashAttentionMaxContextMultiplier = 0.78; + const flashAttentionMinHeadCountForScaling = 4; + const flashAttentionContextRatioLog2Cap = 2; + const flashAttentionContextRatioLog2Scale = 0.05; + const longContextOverflowStartRatio = 1.25; + const longContextOverflowGrowthScale = 0.1; + const longContextMaxMultiplierIncrease = 0.4; + + const normalizedTrainContextSize = trainContextSize == null || trainContextSize <= 0 + ? Math.max(1, fullAttentionKvSize) + : trainContextSize; + const contextRatio = Math.max(1, fullAttentionKvSize / normalizedTrainContextSize); + + if (flashAttention) { + const activeGraphTokens = roundUpToMultiple( + Math.max(1, Math.min(paddedContextSize, batchSize)), + Math.max(1, sequences) + ); + const flashContextMultiplierBase = + flashAttentionMinContextMultiplier + (1 / Math.max(flashAttentionMinHeadCountForScaling, headCount)); + const flashContextMultiplierLongContextAdjustment = + Math.min(flashAttentionContextRatioLog2Cap, Math.log2(contextRatio)) * flashAttentionContextRatioLog2Scale; + const flashContextMultiplier = Math.max( + flashAttentionMinContextMultiplier, + Math.min( + flashAttentionMaxContextMultiplier, + flashContextMultiplierBase + flashContextMultiplierLongContextAdjustment + ) + ); + + return activeGraphTokens * flashContextMultiplier; + } + + const contextOverflow = Math.max(0, contextRatio - longContextOverflowStartRatio); + const longContextMultiplier = 1 + Math.min( + longContextMaxMultiplierIncrease, + 
longContextOverflowGrowthScale * contextOverflow * contextOverflow + ); + + return fullAttentionKvSize * longContextMultiplier; +} + +function estimateNonFlashAttentionWorkspace({ + trainContextSize, + fullAttentionKvSize, + swaKvSize, + hasSwaAttention, + maxAttentionLayerKvSize, + maxAttentionLayerHeadCountKv, + activeGraphTokens, + headCount +}: { + trainContextSize: number | undefined, + fullAttentionKvSize: number, + swaKvSize: number, + hasSwaAttention: boolean, + maxAttentionLayerKvSize: number, + maxAttentionLayerHeadCountKv: number, + activeGraphTokens: number, + headCount: number +}) { + const floatBytes = 4; // sizeof(float) + const strongGqaMaxKvToQHeadRatio = 0.5; + const minAttentionScoreWorkspaceScale = 0.4; + const additionalAttentionScoreWorkspaceScale = 0.6; + + if (maxAttentionLayerKvSize <= 0 || activeGraphTokens <= 0 || headCount <= 0) + return 0; + + const attentionScoresWorkspace = floatBytes * activeGraphTokens * maxAttentionLayerKvSize * headCount; + const attentionMaskWorkspace = floatBytes * activeGraphTokens * ( + hasSwaAttention + ? fullAttentionKvSize + swaKvSize + : maxAttentionLayerKvSize + ); + + if (!hasSwaAttention) + // source: non-FA reserve path in `llm_graph_context::build_attn_mha` + `build_attn_inp_kq_mask` in `llama-graph.cpp` + // reserves the full KQ tensor and the matching F32 attention mask for the ubatch-local graph + return attentionScoresWorkspace + attentionMaskWorkspace; + + // the explicit KQ workspace floor matches the non-FA reserve path well for MHA-like layouts, + // but it becomes too aggressive for strong GQA / MQA hybrid models where KV heads are much fewer than Q heads + if (maxAttentionLayerHeadCountKv / headCount < strongGqaMaxKvToQHeadRatio) + return attentionMaskWorkspace; + + const normalizedTrainContextSize = Math.max(1, trainContextSize ?? 
maxAttentionLayerKvSize); + const contextRatio = Math.min(1, Math.max(0, maxAttentionLayerKvSize / normalizedTrainContextSize)); + const attentionScoreWorkspaceScale = + minAttentionScoreWorkspaceScale + (additionalAttentionScoreWorkspaceScale * contextRatio); + + return (attentionScoresWorkspace * attentionScoreWorkspaceScale) + attentionMaskWorkspace; +} + +function isSwaLayerAtIndex( + architecture: GgufArchitectureType | undefined, + slidingWindowPattern: number | number[] | undefined, + layerIndex: number +): boolean { + if (layerIndex < 0) + return false; + + if (slidingWindowPattern instanceof Array) + return Boolean(slidingWindowPattern[layerIndex]); + const [defaultPattern, denseFirst] = getSwaPatternForArchitecture(architecture); + const pattern = typeof slidingWindowPattern === "number" + ? Math.max(0, Math.floor(slidingWindowPattern)) + : defaultPattern; + + if (pattern === 0) + return true; + + return denseFirst + ? (layerIndex % pattern !== 0) + : (layerIndex % pattern < (pattern - 1)); +} + +function getSwaPatternForArchitecture(architecture?: GgufArchitectureType): [pattern: number, denseFirst: boolean] { // source: `llama_model::load_hparams` in `llama-model.cpp` - calls to `hparams.set_swa_pattern` switch (architecture) { case GgufArchitectureType.llama4: - return 4; + return [4, false]; case GgufArchitectureType.afmoe: - return 4; + return [4, false]; case GgufArchitectureType.modernBert: - return 3; + return [3, true]; case GgufArchitectureType.phi3: - return 1; + return [1, false]; case GgufArchitectureType.plamo3: - return 8; + return [8, false]; case GgufArchitectureType.gemma2: - return 2; + return [2, false]; case GgufArchitectureType.gemma3: - return 6; + return [6, false]; case GgufArchitectureType.gemma3n: - return 5; + return [5, false]; case GgufArchitectureType.gemmaEmbedding: - return 6; + return [6, false]; case GgufArchitectureType.cohere2: - return 4; + return [4, false]; case GgufArchitectureType.olmo2: - return 4; + return [4, 
false]; case GgufArchitectureType.exaone4: - return 4; + return [4, false]; case GgufArchitectureType.exaoneMoe: - return 4; + return [4, false]; case GgufArchitectureType.gptOss: - return 2; + return [2, false]; case GgufArchitectureType.smallthinker: - return 4; + return [4, true]; } - return 1; + return [1, false]; +} + +function resolveLayerHeadDimension(defaultValue: number, swaValue: number | undefined, isSwaLayer: boolean): number { + if (isSwaLayer && swaValue != null) + return swaValue; + + return defaultValue; +} + +function doesLayerOwnKvCache(totalLayers: number, layerIndex: number, sharedKvLayers: number | undefined): boolean { + if (sharedKvLayers == null || sharedKvLayers <= 0) + return true; + + return layerIndex < Math.max(0, totalLayers - sharedKvLayers); } function resolveLayerHeadCountKv(nHeadKv: number | number[], layerIndex: number, nHead: number): number { @@ -1007,8 +1795,9 @@ function getRecurrentLayersPattern( architectureMetadata: GgufFileInfo["architectureMetadata"] ): RecurrentLayersPattern { const nHeadKv = architectureMetadata?.attention?.head_count_kv; + const nHeadKvValues: number | number[] | undefined = nHeadKv; const feedForwardLength = architectureMetadata?.feed_forward_length as number | number[] | undefined; - const hasRecurrentHeadCountKvEntry = Array.isArray(nHeadKv) && nHeadKv.some((value) => value === 0); + const hasRecurrentHeadCountKvEntry = nHeadKvValues instanceof Array && nHeadKvValues.some((value: number) => value === 0); if (architecture === GgufArchitectureType.falconH1) // source: `llama_model::load_hparams` in `llama-model.cpp`: @@ -1019,10 +1808,10 @@ function getRecurrentLayersPattern( // source: `llama_model::load_hparams` in `llama-model.cpp`: // `case LLM_ARCH_NEMOTRON_H / LLM_ARCH_NEMOTRON_H_MOE`: // `recurrent_layer_arr[i] = (n_head_kv(i) == 0 && n_ff(i) == 0)` - if (Array.isArray(nHeadKv)) + if (nHeadKvValues instanceof Array) return { type: "headCountKvAndFeedForward", - headCountKvValues: nHeadKv, + 
headCountKvValues: nHeadKvValues, feedForwardLength }; @@ -1055,10 +1844,10 @@ function getRecurrentLayersPattern( interval: Math.max(1, Math.floor(architectureMetadata?.full_attention_interval)) }; - if (hasRecurrentHeadCountKvEntry) + if (nHeadKvValues instanceof Array && hasRecurrentHeadCountKvEntry) return { type: "headCountKvArray", - values: nHeadKv + values: nHeadKvValues }; return "none"; @@ -1081,7 +1870,7 @@ function isLayerRecurrent(pattern: RecurrentLayersPattern, layerIndex: number): function resolveLayerFeedForwardLength(feedForwardLength: number | number[] | undefined, layerIndex: number): number { if (typeof feedForwardLength === "number") return feedForwardLength; - else if (Array.isArray(feedForwardLength)) + else if (feedForwardLength instanceof Array) return feedForwardLength[layerIndex] ?? 0; return 0; diff --git a/src/gguf/insights/GgufInsightsConfigurationResolver.ts b/src/gguf/insights/GgufInsightsConfigurationResolver.ts index ea41dcfa..23be6b82 100644 --- a/src/gguf/insights/GgufInsightsConfigurationResolver.ts +++ b/src/gguf/insights/GgufInsightsConfigurationResolver.ts @@ -8,7 +8,7 @@ import {resolveModelGpuLayersOption} from "./utils/resolveModelGpuLayersOption.j import {resolveContextContextSizeOption} from "./utils/resolveContextContextSizeOption.js"; import {scoreLevels} from "./utils/scoreLevels.js"; import {getRamUsageFromUnifiedVram} from "./utils/getRamUsageFromUnifiedVram.js"; -import type {GgufInsights} from "./GgufInsights.js"; +import type {GgufInsights, GgufInsightsSimulatorSession} from "./GgufInsights.js"; export const defaultTrainContextSizeForEstimationPurposes = 4096; const defaultContextSizeForUnfitContextSizeConfiguration = 2048; @@ -39,23 +39,23 @@ export class GgufInsightsConfigurationResolver { targetGpuLayers, targetContextSize, embeddingContext = false, - flashAttention = false, + flashAttention = "auto", kvCacheKeyType, kvCacheValueType, swaFullCache = false, - useMmap = this._ggufInsights._llama.supportsMmap + 
useMmap = "auto" }: { targetGpuLayers?: number | "max", targetContextSize?: number, embeddingContext?: boolean, - flashAttention?: boolean, + flashAttention?: LlamaContextOptions["flashAttention"], kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, swaFullCache?: boolean, - useMmap?: boolean + useMmap?: "auto" | boolean } = {}, { getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), - getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()), + getRamState = (() => this._ggufInsights._llama._ramOrchestrator.getMemoryState()), getSwapState = (() => this._ggufInsights._llama._swapOrchestrator.getMemoryState()), llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize, llamaGpu = this._ggufInsights._llama.gpu, @@ -114,7 +114,7 @@ export class GgufInsightsConfigurationResolver { public async scoreModelConfigurationCompatibility({ contextSize = Math.min(4096, this._ggufInsights.trainContextSize ?? 4096), embeddingContext = false, - flashAttention = false, + flashAttention = "auto", kvCacheKeyType, kvCacheValueType, swaFullCache = false, @@ -122,11 +122,11 @@ export class GgufInsightsConfigurationResolver { maximumUnfitConfigurationResourceMultiplier = 100, forceStrictContextSize = false, forceGpuLayers, - useMmap = this._ggufInsights._llama.supportsMmap + useMmap = "auto" }: { contextSize?: number, embeddingContext?: boolean, - flashAttention?: boolean, + flashAttention?: LlamaContextOptions["flashAttention"], kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, swaFullCache?: boolean, @@ -141,7 +141,7 @@ export class GgufInsightsConfigurationResolver { forceStrictContextSize?: boolean, forceGpuLayers?: number | "max", - useMmap?: boolean + useMmap?: "auto" | boolean } = {}, { getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()), @@ -179,6 +179,7 @@ export class 
GgufInsightsConfigurationResolver { resolvedValues: { gpuLayers: number, contextSize: number, + useMmap: boolean, modelRamUsage: number, contextRamUsage: number, @@ -201,10 +202,14 @@ export class GgufInsightsConfigurationResolver { let resolvedGpuLayers = (forceGpuLayers == null || forceGpuLayers == "max") ? this.ggufInsights.totalLayers : forceGpuLayers; + let resolvedUseMmap = useMmap === "auto" + ? this._ggufInsights._getUseMmap() + : useMmap; let gpuLayersFitMemory = false; + const simulatorSession = this._ggufInsights._createSimulatorSession(); try { - resolvedGpuLayers = await this.resolveModelGpuLayers( + const layersResolution = await this.resolveModelGpuLayersV2( forceGpuLayers != null ? forceGpuLayers : embeddingContext @@ -229,9 +234,13 @@ export class GgufInsightsConfigurationResolver { defaultContextKvCacheKeyType: kvCacheKeyType, defaultContextKvCacheValueType: kvCacheValueType, ignoreMemorySafetyChecks: forceGpuLayers != null, - useMmap + useMmap, + + _simulatorSession: simulatorSession } ); + resolvedGpuLayers = layersResolution.gpuLayers; + resolvedUseMmap = layersResolution.useMmap; gpuLayersFitMemory = true; } catch (err) { if (!(err instanceof InsufficientMemoryError)) @@ -239,9 +248,11 @@ export class GgufInsightsConfigurationResolver { } const canUseGpu = llamaSupportsGpuOffloading && llamaGpu !== false; - const estimatedModelResourceUsage = this._ggufInsights.estimateModelResourceRequirements({ + const estimatedModelResourceUsage = await this._ggufInsights.estimateModelResourceRequirementsV2({ gpuLayers: resolvedGpuLayers, - useMmap + useMmap: resolvedUseMmap, + + _simulatorSession: simulatorSession }); let resolvedContextSize = forceStrictContextSize @@ -263,8 +274,10 @@ export class GgufInsightsConfigurationResolver { total: ramState.total, free: Math.max( 0, - ramState.free - estimatedModelResourceUsage.cpuRam + - (-getRamUsageFromUnifiedVram(estimatedModelResourceUsage.gpuVram, vramState)) + ramState.free - ( + 
estimatedModelResourceUsage.cpuRam + + getRamUsageFromUnifiedVram(estimatedModelResourceUsage.gpuVram, vramState) + ) ) }), getSwapState: async () => ({ @@ -274,7 +287,7 @@ export class GgufInsightsConfigurationResolver { swapState.free - Math.max( 0, estimatedModelResourceUsage.cpuRam + - (-getRamUsageFromUnifiedVram(estimatedModelResourceUsage.gpuVram, vramState)) + + getRamUsageFromUnifiedVram(estimatedModelResourceUsage.gpuVram, vramState) + (-ramState.free) ) ) @@ -287,7 +300,10 @@ export class GgufInsightsConfigurationResolver { flashAttention, kvCacheKeyType, kvCacheValueType, - swaFullCache + swaFullCache, + useMmap: resolvedUseMmap, + + _simulatorSession: simulatorSession }); contextFitsMemory = true; @@ -302,14 +318,17 @@ export class GgufInsightsConfigurationResolver { throw err; } - const estimatedContextResourceUsage = this._ggufInsights.estimateContextResourceRequirements({ + const estimatedContextResourceUsage = await this._ggufInsights.estimateContextResourceRequirementsV2({ contextSize: resolvedContextSize, isEmbeddingContext: embeddingContext, modelGpuLayers: resolvedGpuLayers, flashAttention, swaFullCache, kvCacheKeyType, - kvCacheValueType + kvCacheValueType, + + _simulatorSession: simulatorSession, + useMmap: resolvedUseMmap }); const rankPoints = { @@ -387,6 +406,7 @@ export class GgufInsightsConfigurationResolver { resolvedValues: { gpuLayers: resolvedGpuLayers, contextSize: resolvedContextSize, + useMmap: resolvedUseMmap, modelRamUsage: estimatedModelResourceUsage.cpuRam, contextRamUsage: estimatedContextResourceUsage.cpuRam, @@ -399,22 +419,63 @@ export class GgufInsightsConfigurationResolver { }; } - public async resolveModelGpuLayers(gpuLayers?: LlamaModelOptions["gpuLayers"], { - ignoreMemorySafetyChecks = false, - getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), - llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize, llamaGpu = this._ggufInsights._llama.gpu, - llamaSupportsGpuOffloading = 
this._ggufInsights._llama.supportsGpuOffloading, - defaultContextFlashAttention = false, - defaultContextKvCacheKeyType, - defaultContextKvCacheValueType, - defaultContextSwaFullCache = false, - useMmap = this._ggufInsights._llama.supportsMmap - }: { + /** + * @deprecated use `resolveModelGpuLayersV2` instead + */ + public async resolveModelGpuLayers(gpuLayers?: LlamaModelOptions["gpuLayers"], options: { ignoreMemorySafetyChecks?: boolean, getVramState?(): Promise<{total: number, free: number}>, - llamaVramPaddingSize?: number, llamaGpu?: BuildGpu, llamaSupportsGpuOffloading?: boolean, defaultContextFlashAttention?: boolean, + llamaVramPaddingSize?: number, llamaGpu?: BuildGpu, llamaSupportsGpuOffloading?: boolean, defaultContextFlashAttention?: LlamaContextOptions["flashAttention"], defaultContextKvCacheKeyType?: GgmlType, defaultContextKvCacheValueType?: GgmlType, defaultContextSwaFullCache?: boolean, useMmap?: boolean } = {}) { + const { + ignoreMemorySafetyChecks = false, + getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), + llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize, llamaGpu = this._ggufInsights._llama.gpu, + llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading, + defaultContextFlashAttention = "auto", + defaultContextKvCacheKeyType, + defaultContextKvCacheValueType, + defaultContextSwaFullCache = false, + useMmap = this._ggufInsights._defaultUseMmap ?? 
this._ggufInsights._llama.supportsMmap + } = options; + + return (await this.resolveModelGpuLayersV2(gpuLayers, { + ignoreMemorySafetyChecks, + getVramState, + llamaVramPaddingSize, llamaGpu, + llamaSupportsGpuOffloading, + defaultContextFlashAttention, + defaultContextKvCacheKeyType, + defaultContextKvCacheValueType, + defaultContextSwaFullCache, + useMmap + })).gpuLayers; + } + + public async resolveModelGpuLayersV2(gpuLayers?: LlamaModelOptions["gpuLayers"], options: { + ignoreMemorySafetyChecks?: boolean, getVramState?(): Promise<{total: number, free: number}>, + llamaVramPaddingSize?: number, llamaGpu?: BuildGpu, llamaSupportsGpuOffloading?: boolean, defaultContextFlashAttention?: LlamaContextOptions["flashAttention"], + defaultContextKvCacheKeyType?: GgmlType, defaultContextKvCacheValueType?: GgmlType, defaultContextSwaFullCache?: boolean, + useMmap?: "auto" | boolean, + + /** @internal */ + _simulatorSession?: GgufInsightsSimulatorSession + } = {}) { + const { + ignoreMemorySafetyChecks = false, + getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), + llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize, llamaGpu = this._ggufInsights._llama.gpu, + llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading, + defaultContextFlashAttention = "auto", + defaultContextKvCacheKeyType, + defaultContextKvCacheValueType, + defaultContextSwaFullCache = false, + useMmap = "auto", + + _simulatorSession + } = options; + return resolveModelGpuLayersOption(gpuLayers, { ggufInsights: this._ggufInsights, ignoreMemorySafetyChecks, @@ -426,7 +487,9 @@ export class GgufInsightsConfigurationResolver { defaultContextKvCacheKeyType, defaultContextKvCacheValueType, defaultContextSwaFullCache, - useMmap + useMmap, + simulatorSession: _simulatorSession, + vramCapIsSet: this._ggufInsights._llama.getVramCap() != null }); } @@ -435,28 +498,14 @@ export class GgufInsightsConfigurationResolver { * * If there's no context size 
that can fit the available resources, an `InsufficientMemoryError` is thrown. */ - public async resolveContextContextSize(contextSize: LlamaContextOptions["contextSize"], { - modelGpuLayers, - batchSize, - modelTrainContextSize, - flashAttention = false, - kvCacheKeyType, - kvCacheValueType, - swaFullCache = false, - getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), - getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()), - getSwapState = (() => this._ggufInsights._llama._swapOrchestrator.getMemoryState()), - llamaGpu = this._ggufInsights._llama.gpu, - ignoreMemorySafetyChecks = false, - isEmbeddingContext = false, - sequences = getDefaultContextSequences() - }: { + public async resolveContextContextSize(contextSize: LlamaContextOptions["contextSize"], options: { modelGpuLayers: number, modelTrainContextSize: number, - flashAttention?: boolean, + flashAttention?: LlamaContextOptions["flashAttention"], kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, swaFullCache?: boolean, + useMmap?: boolean, batchSize?: LlamaContextOptions["batchSize"], sequences?: number, getVramState?(): Promise<{total: number, free: number, unifiedSize: number}>, @@ -464,8 +513,31 @@ export class GgufInsightsConfigurationResolver { getSwapState?(): Promise<{total: number, free: number}>, llamaGpu?: BuildGpu, ignoreMemorySafetyChecks?: boolean, - isEmbeddingContext?: boolean + isEmbeddingContext?: boolean, + + /** @internal */ + _simulatorSession?: GgufInsightsSimulatorSession }) { + const { + modelGpuLayers, + batchSize, + modelTrainContextSize, + flashAttention = "auto", + kvCacheKeyType, + kvCacheValueType, + swaFullCache = false, + useMmap = this._ggufInsights._defaultUseMmap ?? 
this._ggufInsights._llama.supportsMmap, + getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), + getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()), + getSwapState = (() => this._ggufInsights._llama._swapOrchestrator.getMemoryState()), + llamaGpu = this._ggufInsights._llama.gpu, + ignoreMemorySafetyChecks = false, + isEmbeddingContext = false, + sequences = getDefaultContextSequences(), + + _simulatorSession + } = options; + return await resolveContextContextSizeOption({ contextSize, batchSize, @@ -477,12 +549,17 @@ export class GgufInsightsConfigurationResolver { kvCacheKeyType, kvCacheValueType, swaFullCache, + useMmap, getVramState, getRamState, getSwapState, llamaGpu, ignoreMemorySafetyChecks, - isEmbeddingContext + isEmbeddingContext, + + simulatorSession: _simulatorSession, + ramCapIsSet: this._ggufInsights._llama.getRamCap() != null, + vramCapIsSet: this._ggufInsights._llama.getVramCap() != null }); } diff --git a/src/gguf/insights/utils/resolveContextContextSizeOption.ts b/src/gguf/insights/utils/resolveContextContextSizeOption.ts index ba0e4ae7..f1ccf5e1 100644 --- a/src/gguf/insights/utils/resolveContextContextSizeOption.ts +++ b/src/gguf/insights/utils/resolveContextContextSizeOption.ts @@ -1,5 +1,5 @@ import {LlamaContextOptions} from "../../../evaluator/LlamaContext/types.js"; -import {GgufInsights} from "../GgufInsights.js"; +import {GgufInsights, GgufInsightsSimulatorSession} from "../GgufInsights.js"; import {BuildGpu} from "../../../bindings/types.js"; import {minAllowedContextSizeInCalculations} from "../../../config.js"; import {getDefaultContextBatchSize, getDefaultModelContextSize} from "../../../evaluator/LlamaContext/LlamaContext.js"; @@ -9,173 +9,214 @@ import type {GgmlType} from "../../types/GgufTensorInfoTypes.js"; const defaultMaxContextSizeSwapUse = 2048; -export async function resolveContextContextSizeOption({ - contextSize, batchSize, sequences, modelFileInsights, 
modelGpuLayers, modelTrainContextSize, flashAttention, - kvCacheKeyType, kvCacheValueType, swaFullCache, - getVramState, getRamState, getSwapState, ignoreMemorySafetyChecks = false, isEmbeddingContext = false, - maxContextSizeSwapUse = defaultMaxContextSizeSwapUse -}: { +export async function resolveContextContextSizeOption(options: { contextSize?: LlamaContextOptions["contextSize"], batchSize?: LlamaContextOptions["batchSize"], sequences: number, modelFileInsights: GgufInsights, modelGpuLayers: number, modelTrainContextSize: number, - flashAttention: boolean, + flashAttention: LlamaContextOptions["flashAttention"], kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, swaFullCache: boolean, + useMmap?: boolean, getVramState(): Promise<{total: number, free: number, unifiedSize: number}>, getRamState(): Promise<{total: number, free: number}>, getSwapState(): Promise<{total: number, free: number}>, llamaGpu: BuildGpu, ignoreMemorySafetyChecks?: boolean, isEmbeddingContext?: boolean, - maxContextSizeSwapUse?: number + maxContextSizeSwapUse?: number, + simulatorSession?: GgufInsightsSimulatorSession, + ramCapIsSet?: boolean, + vramCapIsSet?: boolean }): Promise { - if (contextSize == null) - contextSize = "auto"; + const { + contextSize: _contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, flashAttention, + kvCacheKeyType, kvCacheValueType, swaFullCache, useMmap = modelFileInsights._llama.supportsMmap, + getVramState, getRamState, getSwapState, ignoreMemorySafetyChecks = false, isEmbeddingContext = false, + maxContextSizeSwapUse = defaultMaxContextSizeSwapUse, + + simulatorSession: _simulatorSession, + ramCapIsSet = false, + vramCapIsSet = false + } = options; + let contextSize = _contextSize; + + const simulatorSession = _simulatorSession ?? 
modelFileInsights._createSimulatorSession(); + + try { + if (contextSize == null) + contextSize = "auto"; + + if (typeof contextSize === "number") { + const resolvedContextSize = Math.max(1, Math.floor(contextSize)); + + if (ignoreMemorySafetyChecks) + return resolvedContextSize; + + const [ + vramState, + ramState, + swapState + ] = await Promise.all([ + getVramState(), + getRamState(), + getSwapState() + ]); + const contextResourceRequirements = await modelFileInsights.estimateContextResourceRequirementsV2({ + contextSize: resolvedContextSize, + batchSize: batchSize ?? getDefaultContextBatchSize({contextSize: resolvedContextSize, sequences}), + modelGpuLayers: modelGpuLayers, + sequences, + flashAttention, + kvCacheKeyType, + kvCacheValueType, + swaFullCache, + isEmbeddingContext, + + _simulatorSession: simulatorSession, + useMmap + }); - if (typeof contextSize === "number") { - const resolvedContextSize = Math.max(1, Math.floor(contextSize)); + if (contextResourceRequirements.gpuVram > vramState.free) + throw new InsufficientMemoryError(`A context size of ${resolvedContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available VRAM` + getCapErrorMessage(ramCapIsSet, vramCapIsSet)); + else if (contextResourceRequirements.cpuRam > ( + ramState.free + swapState.free - getRamUsageFromUnifiedVram(contextResourceRequirements.gpuVram, vramState) + )) + throw new InsufficientMemoryError(`A context size of ${resolvedContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available RAM${swapState.total > 0 ? " (including swap)" : ""}` + getCapErrorMessage(ramCapIsSet, vramCapIsSet)); - if (ignoreMemorySafetyChecks) return resolvedContextSize; + } else if (contextSize === "auto" || typeof contextSize === "object") { + const [ + vramState, + ramState, + swapState + ] = await Promise.all([ + getVramState(), + getRamState(), + getSwapState() + ]); + + const maxContextSize = contextSize === "auto" + ? 
getDefaultModelContextSize({trainContextSize: modelTrainContextSize}) + : Math.min( + contextSize.max ?? getDefaultModelContextSize({trainContextSize: modelTrainContextSize}), + getDefaultModelContextSize({trainContextSize: modelTrainContextSize}) + ); + + const minContextSize = contextSize === "auto" + ? minAllowedContextSizeInCalculations + : Math.max( + contextSize.min ?? minAllowedContextSizeInCalculations, + minAllowedContextSizeInCalculations + ); + + let highestCompatibleContextSize: number | null = null; + let step = -Math.max(1, Math.floor((maxContextSize - minContextSize) / 4)); + for (let testContextSize = maxContextSize; testContextSize >= minContextSize && testContextSize <= maxContextSize;) { + const contextResourceRequirements = await modelFileInsights.estimateContextResourceRequirementsV2({ + contextSize: testContextSize, + batchSize: batchSize ?? getDefaultContextBatchSize({contextSize: testContextSize, sequences}), + modelGpuLayers: modelGpuLayers, + sequences, + flashAttention, + kvCacheKeyType, + kvCacheValueType, + swaFullCache, + isEmbeddingContext, + + _simulatorSession: simulatorSession, + useMmap + }); + + if (contextResourceRequirements.gpuVram <= vramState.free && + contextResourceRequirements.cpuRam <= ( + ramState.free - getRamUsageFromUnifiedVram(contextResourceRequirements.gpuVram, vramState) + ( + testContextSize <= maxContextSizeSwapUse + ? 
swapState.free + : 0 + ) + ) + ) { + if (highestCompatibleContextSize == null || testContextSize >= highestCompatibleContextSize) { + highestCompatibleContextSize = testContextSize; + + if (step === -1 || testContextSize === maxContextSize) + break; + else if (step < 0) + step = Math.max(1, Math.floor(-step / 2)); + } else if (testContextSize < highestCompatibleContextSize) { + testContextSize = highestCompatibleContextSize; + step = Math.max(1, Math.floor(Math.abs(step) / 2)); + } + } else if (step > 0) + step = -Math.max(1, Math.floor(step / 2)); + + if (testContextSize == minContextSize && step === -1) + break; + + testContextSize += step; + if (testContextSize < minContextSize) { + testContextSize = minContextSize; + step = Math.max(1, Math.floor(Math.abs(step) / 2)); + } else if (testContextSize > maxContextSize) { + testContextSize = maxContextSize; + step = -Math.max(1, Math.floor(Math.abs(step) / 2)); + } + } + + if (highestCompatibleContextSize != null) + return highestCompatibleContextSize; + + if (ignoreMemorySafetyChecks) + return minContextSize; - const [ - vramState, - ramState, - swapState - ] = await Promise.all([ - getVramState(), - getRamState(), - getSwapState() - ]); - const contextResourceRequirements = modelFileInsights.estimateContextResourceRequirements({ - contextSize: resolvedContextSize, - batchSize: batchSize ?? getDefaultContextBatchSize({contextSize: resolvedContextSize, sequences}), - modelGpuLayers: modelGpuLayers, - sequences, - flashAttention, - kvCacheKeyType, - kvCacheValueType, - swaFullCache, - isEmbeddingContext - }); - - if (contextResourceRequirements.gpuVram > vramState.free) - throw new InsufficientMemoryError(`A context size of ${resolvedContextSize}${sequences > 1 ? 
` with ${sequences} sequences` : ""} is too large for the available VRAM`); - else if (contextResourceRequirements.cpuRam > ( - ramState.free + swapState.free - getRamUsageFromUnifiedVram(contextResourceRequirements.gpuVram, vramState) - )) - throw new InsufficientMemoryError(`A context size of ${resolvedContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available RAM${swapState.total > 0 ? " (including swap)" : ""}`); - - return resolvedContextSize; - } else if (contextSize === "auto" || typeof contextSize === "object") { - const [ - vramState, - ramState, - swapState - ] = await Promise.all([ - getVramState(), - getRamState(), - getSwapState() - ]); - - const maxContextSize = contextSize === "auto" - ? getDefaultModelContextSize({trainContextSize: modelTrainContextSize}) - : Math.min( - contextSize.max ?? getDefaultModelContextSize({trainContextSize: modelTrainContextSize}), - getDefaultModelContextSize({trainContextSize: modelTrainContextSize}) - ); - - const minContextSize = contextSize === "auto" - ? minAllowedContextSizeInCalculations - : Math.max( - contextSize.min ?? minAllowedContextSizeInCalculations, - minAllowedContextSizeInCalculations - ); - - let highestCompatibleContextSize: number | null = null; - let step = -Math.max(1, Math.floor((maxContextSize - minContextSize) / 4)); - for (let testContextSize = maxContextSize; testContextSize >= minContextSize && testContextSize <= maxContextSize;) { - const contextResourceRequirements = modelFileInsights.estimateContextResourceRequirements({ - contextSize: testContextSize, - batchSize: batchSize ?? getDefaultContextBatchSize({contextSize: testContextSize, sequences}), + const minContextSizeResourceRequirements = await modelFileInsights.estimateContextResourceRequirementsV2({ + contextSize: minContextSize, + batchSize: batchSize ?? 
getDefaultContextBatchSize({contextSize: minContextSize, sequences}), modelGpuLayers: modelGpuLayers, sequences, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, - isEmbeddingContext + isEmbeddingContext, + + _simulatorSession: simulatorSession, + useMmap }); - if (contextResourceRequirements.gpuVram <= vramState.free && - contextResourceRequirements.cpuRam <= ( - ramState.free - getRamUsageFromUnifiedVram(contextResourceRequirements.gpuVram, vramState) + ( - testContextSize <= maxContextSizeSwapUse - ? swapState.free - : 0 - ) - ) - ) { - if (highestCompatibleContextSize == null || testContextSize >= highestCompatibleContextSize) { - highestCompatibleContextSize = testContextSize; - - if (step === -1) - break; - else if (step < 0) - step = Math.max(1, Math.floor(-step / 2)); - } - } else if (step > 0) - step = -Math.max(1, Math.floor(step / 2)); - - if (testContextSize == minContextSize && step === -1) - break; - - testContextSize += step; - if (testContextSize < minContextSize) { - testContextSize = minContextSize; - step = Math.max(1, Math.floor(Math.abs(step) / 2)); - } else if (testContextSize > maxContextSize) { - testContextSize = maxContextSize; - step = -Math.max(1, Math.floor(Math.abs(step) / 2)); - } + const unifiedRamUsage = getRamUsageFromUnifiedVram(minContextSizeResourceRequirements.gpuVram, vramState); + if (minContextSizeResourceRequirements.gpuVram > vramState.free && + minContextSizeResourceRequirements.cpuRam > ramState.free + swapState.free - unifiedRamUsage + ) + throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available VRAM and RAM${swapState.total > 0 ? " (including swap)" : ""}` + getCapErrorMessage(ramCapIsSet, vramCapIsSet)); + else if (minContextSizeResourceRequirements.gpuVram > vramState.free) + throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? 
` with ${sequences} sequences` : ""} is too large for the available VRAM` + getCapErrorMessage(ramCapIsSet, vramCapIsSet)); + else if (minContextSizeResourceRequirements.cpuRam > ramState.free + swapState.free - unifiedRamUsage) + throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available RAM${swapState.total > 0 ? " (including swap)" : ""}` + getCapErrorMessage(ramCapIsSet, vramCapIsSet)); + else if (minContextSizeResourceRequirements.cpuRam > ramState.free - unifiedRamUsage) + throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available RAM` + getCapErrorMessage(ramCapIsSet, vramCapIsSet)); + else + throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available resources` + getCapErrorMessage(ramCapIsSet, vramCapIsSet)); } - if (highestCompatibleContextSize != null) - return highestCompatibleContextSize; - - if (ignoreMemorySafetyChecks) - return minContextSize; - - const minContextSizeResourceRequirements = modelFileInsights.estimateContextResourceRequirements({ - contextSize: minContextSize, - batchSize: batchSize ?? getDefaultContextBatchSize({contextSize: minContextSize, sequences}), - modelGpuLayers: modelGpuLayers, - sequences, - flashAttention, - kvCacheKeyType, - kvCacheValueType, - swaFullCache, - isEmbeddingContext - }); - - const unifiedRamUsage = getRamUsageFromUnifiedVram(minContextSizeResourceRequirements.gpuVram, vramState); - if (minContextSizeResourceRequirements.gpuVram > vramState.free && - minContextSizeResourceRequirements.cpuRam > ramState.free + swapState.free - unifiedRamUsage - ) - throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? 
` with ${sequences} sequences` : ""} is too large for the available VRAM and RAM${swapState.total > 0 ? " (including swap)" : ""}`); - else if (minContextSizeResourceRequirements.gpuVram > vramState.free) - throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available VRAM`); - else if (minContextSizeResourceRequirements.cpuRam > ramState.free + swapState.free - unifiedRamUsage) - throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available RAM${swapState.total > 0 ? " (including swap)" : ""}`); - else if (minContextSizeResourceRequirements.cpuRam > ramState.free - unifiedRamUsage) - throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available RAM`); - else - throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? 
` with ${sequences} sequences` : ""} is too large for the available resources`); + throw new Error(`Invalid context size: "${contextSize}"`); + } finally { + if (_simulatorSession == null) + await simulatorSession.dispose(); } +} + +function getCapErrorMessage(ramCapIsSet: boolean, vramCapIsSet: boolean) { + if (ramCapIsSet && vramCapIsSet) + return " (RAM and VRAM caps are set, consider increasing or removing the caps to allow more memory to be used)"; + else if (vramCapIsSet) + return " (VRAM cap is set, consider increasing or removing the cap to allow more VRAM to be used)"; + else if (ramCapIsSet) + return " (RAM cap is set, consider increasing or removing the cap to allow more RAM to be used)"; - throw new Error(`Invalid context size: "${contextSize}"`); + return ""; } diff --git a/src/gguf/insights/utils/resolveModelGpuLayersOption.ts b/src/gguf/insights/utils/resolveModelGpuLayersOption.ts index 5c544744..8bb226ce 100644 --- a/src/gguf/insights/utils/resolveModelGpuLayersOption.ts +++ b/src/gguf/insights/utils/resolveModelGpuLayersOption.ts @@ -1,104 +1,236 @@ import {LlamaModelOptions} from "../../../evaluator/LlamaModel/LlamaModel.js"; import {BuildGpu} from "../../../bindings/types.js"; import {InsufficientMemoryError} from "../../../utils/InsufficientMemoryError.js"; -import {findBestOption} from "../../../utils/findBestOption.js"; +import {findFirstNonNullBestOptionAsync} from "../../../utils/findBestOption.js"; import {getDefaultContextBatchSize, getDefaultModelContextSize} from "../../../evaluator/LlamaContext/LlamaContext.js"; import {minAllowedContextSizeInCalculations} from "../../../config.js"; import {scoreLevels} from "./scoreLevels.js"; +import type {LlamaContextOptions} from "../../../evaluator/LlamaContext/types.js"; import type {GgmlType} from "../../types/GgufTensorInfoTypes.js"; -import type {GgufInsights} from "../GgufInsights.js"; +import type {GgufInsights, GgufInsightsSimulatorSession} from "../GgufInsights.js"; const 
fitContextExtraMemoryPaddingPercentage = 0.5; +const vramWastePercentageToPreferDisablingMmap = 0.2; +const contextSizeMissPercentageToPreferDisablingMmap = 0.2; -export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions["gpuLayers"], { - ggufInsights, ignoreMemorySafetyChecks = false, getVramState, llamaVramPaddingSize, - llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention, - defaultContextKvCacheKeyType, defaultContextKvCacheValueType, defaultContextSwaFullCache, useMmap -}: { +export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions["gpuLayers"], options: { ggufInsights: GgufInsights, ignoreMemorySafetyChecks?: boolean, getVramState(): Promise<{total: number, free: number}>, llamaVramPaddingSize: number, llamaGpu: BuildGpu, - llamaSupportsGpuOffloading: boolean, defaultContextFlashAttention: boolean, + llamaSupportsGpuOffloading: boolean, defaultContextFlashAttention: LlamaContextOptions["flashAttention"], defaultContextKvCacheKeyType?: GgmlType, defaultContextKvCacheValueType?: GgmlType, defaultContextSwaFullCache: boolean, - useMmap?: boolean -}): Promise { - if (gpuLayers == null) - gpuLayers = "auto"; - - if (!llamaSupportsGpuOffloading) - return 0; - - if (gpuLayers === "max" || typeof gpuLayers === "number") { - const resolvedGpuLayers = typeof gpuLayers === "number" - ? 
Math.max(0, Math.min(ggufInsights.totalLayers, gpuLayers)) - : ggufInsights.totalLayers; - - if (ignoreMemorySafetyChecks) - return resolvedGpuLayers; - - const vramState = await getVramState(); - const maxLayersRequirements = getVramRequiredForGpuLayers({ - gpuLayers: resolvedGpuLayers, - ggufInsights, - currentVram: vramState.free, - defaultContextFlashAttention, - defaultContextKvCacheKeyType, - defaultContextKvCacheValueType, - defaultContextSwaFullCache, - useMmap - }); - - if (maxLayersRequirements == null) - throw new InsufficientMemoryError("Not enough VRAM to fit the model with the specified settings"); - - return resolvedGpuLayers; - } else if (gpuLayers === "auto" || typeof gpuLayers === "object") { - if (llamaGpu === false) - return 0; - - const vramState = await getVramState(); - if (vramState.total === 0) - return 0; - - let freeVram = vramState.free; - if (typeof gpuLayers === "object" && gpuLayers.fitContext?.contextSize != null) { - freeVram -= llamaVramPaddingSize * fitContextExtraMemoryPaddingPercentage; + useMmap?: "auto" | boolean, simulatorSession?: GgufInsightsSimulatorSession, vramCapIsSet?: boolean +}): Promise<{gpuLayers: number, useMmap: boolean}> { + const { + ggufInsights, ignoreMemorySafetyChecks = false, getVramState, llamaVramPaddingSize, + llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention, + defaultContextKvCacheKeyType, defaultContextKvCacheValueType, defaultContextSwaFullCache, useMmap = "auto", + simulatorSession: _simulatorSession, vramCapIsSet = false + } = options; + + const simulatorSession = _simulatorSession ?? ggufInsights._createSimulatorSession(); + + try { + if (gpuLayers == null) + gpuLayers = "auto"; + + if (!llamaSupportsGpuOffloading) + return {gpuLayers: 0, useMmap: useMmap === "auto" ? ggufInsights._getUseMmap() : useMmap}; + + if (gpuLayers === "max" || typeof gpuLayers === "number") { + const resolvedGpuLayers = typeof gpuLayers === "number" + ? 
Math.max(0, Math.min(ggufInsights.totalLayers, gpuLayers)) + : ggufInsights.totalLayers; + const vramState = await getVramState(); + + const getVramNeeds = (useMmap: boolean) => getVramRequiredForGpuLayers({ + gpuLayers: resolvedGpuLayers, + ggufInsights, + currentVram: vramState.free, + defaultContextFlashAttention, + defaultContextKvCacheKeyType, + defaultContextKvCacheValueType, + defaultContextSwaFullCache, + useMmap, + simulatorSession + }); + const getPreferredResolvedLayers = async () => { + if (useMmap !== "auto") + return await getVramNeeds(useMmap); + + const [ + withMmap, + withoutMmap + ] = await Promise.all([ + getVramNeeds(true), + getVramNeeds(false) + ]); + + if (withoutMmap != null && withMmap == null) + return withoutMmap; + else if (withoutMmap != null && withMmap != null && + typeof gpuLayers === "number" && + withoutMmap.totalVram <= withMmap.totalVram * (1 - vramWastePercentageToPreferDisablingMmap) + ) + return withoutMmap; + else if (withoutMmap != null && withMmap != null && + withoutMmap.gpuLayers > withMmap.gpuLayers + ) + return withoutMmap; + else if (withoutMmap != null && withMmap != null && + withoutMmap.contextSize >= withMmap.contextSize * (1 + contextSizeMissPercentageToPreferDisablingMmap) + ) + return withoutMmap; + + return withMmap ?? withoutMmap; + }; + + if (ignoreMemorySafetyChecks) + return { + gpuLayers: resolvedGpuLayers, + useMmap: useMmap === "auto" + ? gpuLayers === "max" + ? true + : (await getPreferredResolvedLayers())?.useMmap ?? false + : useMmap + }; + + const maxLayersRequirements = (useMmap !== "auto" || gpuLayers === "max") + ? await getVramNeeds( + useMmap === "auto" + ? 
ggufInsights._getUseMmap() + : useMmap + ) + : await getPreferredResolvedLayers(); + + if (maxLayersRequirements == null) + throw new InsufficientMemoryError("Not enough VRAM to fit the model with the specified settings" + getCapErrorMessage(vramCapIsSet)); + + return { + gpuLayers: resolvedGpuLayers, + useMmap: maxLayersRequirements.useMmap + }; + } else if (gpuLayers === "auto" || typeof gpuLayers === "object") { + if (llamaGpu === false) + return {gpuLayers: 0, useMmap: useMmap === "auto" ? ggufInsights._getUseMmap() : useMmap}; + + const vramState = await getVramState(); + if (vramState.total === 0) + return {gpuLayers: 0, useMmap: useMmap === "auto" ? ggufInsights._getUseMmap() : useMmap}; + + let freeVram = vramState.free; + if (typeof gpuLayers === "object" && gpuLayers.fitContext?.contextSize != null) { + freeVram -= llamaVramPaddingSize * fitContextExtraMemoryPaddingPercentage; + + if (freeVram < 0) + freeVram = 0; + } - if (freeVram < 0) - freeVram = 0; + const getGpuLayersForMmapOptions = (useMmap: boolean) => getBestGpuLayersForFreeVram({ + ggufInsights, + freeVram, + fitContext: typeof gpuLayers === "object" + ? gpuLayers.fitContext + : undefined, + minGpuLayers: typeof gpuLayers === "object" + ? gpuLayers.min + : undefined, + maxGpuLayers: typeof gpuLayers === "object" + ? gpuLayers.max + : undefined, + defaultContextFlashAttention, + defaultContextKvCacheKeyType, + defaultContextKvCacheValueType, + defaultContextSwaFullCache, + useMmap, + simulatorSession + }); + const getGpuLayersForMmapOptionsWithResourceRequirements = async (useMmap: boolean) => { + const resolvedLayers = await getGpuLayersForMmapOptions(useMmap); + if (resolvedLayers == null) + return null; + + return getVramRequiredForGpuLayers({ + gpuLayers: resolvedLayers, + ggufInsights, + currentVram: freeVram, + fitContext: typeof gpuLayers === "object" + ? 
gpuLayers.fitContext + : undefined, + defaultContextFlashAttention, + defaultContextSwaFullCache, + defaultContextKvCacheKeyType, + defaultContextKvCacheValueType, + useMmap, + simulatorSession + }); + }; + const getPreferredResolvedLayers = async () => { + if (useMmap !== "auto") + return { + gpuLayers: await getGpuLayersForMmapOptions(useMmap), + useMmap + }; + + const [ + withMmap, + withoutMmap + ] = await Promise.all([ + getGpuLayersForMmapOptionsWithResourceRequirements(true), + getGpuLayersForMmapOptionsWithResourceRequirements(false) + ]); + + if (withoutMmap != null && withMmap == null) + return withoutMmap; + else if (withoutMmap != null && withMmap != null && + typeof gpuLayers === "number" && + withoutMmap.totalVram <= withMmap.totalVram * (1 - vramWastePercentageToPreferDisablingMmap) + ) + return withoutMmap; + else if (withoutMmap != null && withMmap != null && + withoutMmap.gpuLayers > withMmap.gpuLayers + ) + return withoutMmap; + else if (withoutMmap != null && withMmap != null && + withoutMmap.contextSize >= withMmap.contextSize * (1 + contextSizeMissPercentageToPreferDisablingMmap) + ) + return withoutMmap; + + return withMmap ?? withoutMmap; + }; + + const bestGpuLayersOption = await getPreferredResolvedLayers(); + + const hasGpuLayersRequirements = typeof gpuLayers === "object" && + (gpuLayers.min != null || gpuLayers.max != null || gpuLayers.fitContext?.contextSize != null); + + if (!ignoreMemorySafetyChecks && bestGpuLayersOption == null && hasGpuLayersRequirements) + throw new InsufficientMemoryError("Not enough VRAM to fit the model with the specified settings" + getCapErrorMessage(vramCapIsSet)); + + return { + gpuLayers: bestGpuLayersOption?.gpuLayers ?? 0, + useMmap: bestGpuLayersOption?.useMmap ?? ( + useMmap === "auto" + ? ggufInsights._getUseMmap() + : useMmap + ) + }; } - - const bestGpuLayersOption = getBestGpuLayersForFreeVram({ - ggufInsights, - freeVram, - fitContext: typeof gpuLayers === "object" - ? 
gpuLayers.fitContext - : undefined, - minGpuLayers: typeof gpuLayers === "object" - ? gpuLayers.min - : undefined, - maxGpuLayers: typeof gpuLayers === "object" - ? gpuLayers.max - : undefined, - defaultContextFlashAttention, - defaultContextKvCacheKeyType, - defaultContextKvCacheValueType, - defaultContextSwaFullCache, - useMmap - }); - - const hasGpuLayersRequirements = typeof gpuLayers === "object" && - (gpuLayers.min != null || gpuLayers.max != null || gpuLayers.fitContext?.contextSize != null); - - if (!ignoreMemorySafetyChecks && bestGpuLayersOption == null && hasGpuLayersRequirements) - throw new InsufficientMemoryError("Not enough VRAM to fit the model with the specified settings"); - - return bestGpuLayersOption ?? 0; + + throw new Error(`Invalid gpuLayers value: ${gpuLayers}`); + } finally { + if (_simulatorSession == null) + await simulatorSession.dispose(); } +} + +function getCapErrorMessage(vramCapIsSet: boolean) { + if (vramCapIsSet) + return " (VRAM cap is set, consider increasing or removing the cap to fit more layers)"; - throw new Error(`Invalid gpuLayers value: ${gpuLayers}`); + return ""; } -function getBestGpuLayersForFreeVram({ +async function getBestGpuLayersForFreeVram({ ggufInsights, freeVram, fitContext, @@ -108,32 +240,35 @@ function getBestGpuLayersForFreeVram({ defaultContextKvCacheKeyType, defaultContextKvCacheValueType, defaultContextSwaFullCache, - useMmap + useMmap, + simulatorSession }: { ggufInsights: GgufInsights, freeVram: number, fitContext?: {contextSize?: number, embeddingContext?: boolean}, minGpuLayers?: number, maxGpuLayers?: number, - defaultContextFlashAttention: boolean, + defaultContextFlashAttention: LlamaContextOptions["flashAttention"], defaultContextKvCacheKeyType?: GgmlType, defaultContextKvCacheValueType?: GgmlType, defaultContextSwaFullCache: boolean, - useMmap?: boolean + useMmap?: boolean, + simulatorSession?: GgufInsightsSimulatorSession }) { - return findBestOption({ - *generator() { - const minLayers = 
Math.floor(Math.max(0, minGpuLayers ?? 0)); - const maxLayers = Math.floor(Math.min(ggufInsights.totalLayers, maxGpuLayers ?? ggufInsights.totalLayers)); + const minLayers = Math.floor(Math.max(0, minGpuLayers ?? 0)); + const maxLayers = Math.floor(Math.min(ggufInsights.totalLayers, maxGpuLayers ?? ggufInsights.totalLayers)); + return (await findFirstNonNullBestOptionAsync({ + prefill: Math.max(1, Math.min(100, Math.ceil((maxLayers - minLayers) / 3))), + *generator() { for (let layers = maxLayers; layers >= minLayers; layers--) { yield { gpuLayers: layers }; } }, - score(option) { - const layersRequirements = getVramRequiredForGpuLayers({ + async score(option) { + const layersRequirements = await getVramRequiredForGpuLayers({ gpuLayers: option.gpuLayers, ggufInsights, currentVram: freeVram, @@ -142,7 +277,8 @@ function getBestGpuLayersForFreeVram({ defaultContextSwaFullCache, defaultContextKvCacheKeyType, defaultContextKvCacheValueType, - useMmap + useMmap, + simulatorSession }); if (layersRequirements == null) @@ -153,7 +289,7 @@ function getBestGpuLayersForFreeVram({ trainContextSize: getDefaultModelContextSize({trainContextSize: ggufInsights.trainContextSize}) }); } - })?.gpuLayers ?? null; + }))?.gpuLayers ?? null; } function scoreGpuLayersAndContextCombination({gpuLayers, contextSize}: {gpuLayers: number, contextSize: number}, { @@ -180,10 +316,13 @@ function scoreGpuLayersAndContextCombination({gpuLayers, contextSize}: {gpuLayer return scoreLevels(contextSize, [{ start: 0, - points: 2 + points: 8 + }, { + start: 512, + points: 8 }, { start: 1024, - points: 4 + points: 8 }, { start: 2048, points: gpuLayersPercentage < 0.1 ? 
1 : 8 @@ -200,24 +339,27 @@ function scoreGpuLayersAndContextCombination({gpuLayers, contextSize}: {gpuLayer return scoreGpuLayers() + scoreContextSize(); } -function getVramRequiredForGpuLayers({ - gpuLayers, ggufInsights, currentVram, fitContext, defaultContextFlashAttention = false, - defaultContextKvCacheKeyType, defaultContextKvCacheValueType, defaultContextSwaFullCache = false, useMmap +async function getVramRequiredForGpuLayers({ + gpuLayers, ggufInsights, currentVram, fitContext, defaultContextFlashAttention = "auto", + defaultContextKvCacheKeyType, defaultContextKvCacheValueType, defaultContextSwaFullCache = false, useMmap = ggufInsights._getUseMmap(), + simulatorSession }: { gpuLayers: number, ggufInsights: GgufInsights, currentVram: number, fitContext?: {contextSize?: number, embeddingContext?: boolean}, - defaultContextFlashAttention: boolean, defaultContextKvCacheKeyType?: GgmlType, defaultContextKvCacheValueType?: GgmlType, - defaultContextSwaFullCache: boolean, useMmap?: boolean + defaultContextFlashAttention: LlamaContextOptions["flashAttention"], defaultContextKvCacheKeyType?: GgmlType, defaultContextKvCacheValueType?: GgmlType, + defaultContextSwaFullCache: boolean, useMmap?: boolean, + simulatorSession?: GgufInsightsSimulatorSession }) { - const modelVram = ggufInsights.estimateModelResourceRequirements({ + const modelVram = (await ggufInsights.estimateModelResourceRequirementsV2({ gpuLayers, - useMmap - }).gpuVram; + useMmap, + _simulatorSession: simulatorSession + })).gpuVram; if (modelVram > currentVram) return null; if (fitContext != null && fitContext.contextSize != null) { - const contextVram = ggufInsights.estimateContextResourceRequirements({ + const contextVram = (await ggufInsights.estimateContextResourceRequirementsV2({ contextSize: fitContext.contextSize, batchSize: getDefaultContextBatchSize({contextSize: fitContext.contextSize, sequences: 1}), modelGpuLayers: gpuLayers, @@ -226,21 +368,26 @@ function getVramRequiredForGpuLayers({ 
flashAttention: defaultContextFlashAttention, kvCacheKeyType: defaultContextKvCacheKeyType, kvCacheValueType: defaultContextKvCacheValueType, - swaFullCache: defaultContextSwaFullCache - }).gpuVram; + swaFullCache: defaultContextSwaFullCache, + + _simulatorSession: simulatorSession, + useMmap + })).gpuVram; const totalVram = modelVram + contextVram; if (totalVram > currentVram) return null; return { + gpuLayers, contextSize: fitContext.contextSize, contextVram, - totalVram + totalVram, + useMmap }; } - const maxContext = findMaxPossibleContextSizeForVram({ + const maxContext = await findMaxPossibleContextSizeForVram({ gpuLayers, ggufInsights, vram: currentVram - modelVram, @@ -248,24 +395,30 @@ function getVramRequiredForGpuLayers({ flashAttention: defaultContextFlashAttention, kvCacheKeyType: defaultContextKvCacheKeyType, kvCacheValueType: defaultContextKvCacheValueType, - swaFullCache: defaultContextSwaFullCache + swaFullCache: defaultContextSwaFullCache, + useMmap, + simulatorSession }); if (maxContext == null || modelVram + maxContext.vram > currentVram) return null; return { + gpuLayers, contextSize: maxContext.contextSize, contextVram: maxContext.vram, - totalVram: modelVram + maxContext.vram + totalVram: modelVram + maxContext.vram, + useMmap }; } -function findMaxPossibleContextSizeForVram({ - gpuLayers, ggufInsights, vram, isEmbeddingContext, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache +async function findMaxPossibleContextSizeForVram({ + gpuLayers, ggufInsights, vram, isEmbeddingContext, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, + useMmap, simulatorSession }: { - gpuLayers: number, ggufInsights: GgufInsights, vram: number, isEmbeddingContext: boolean, flashAttention: boolean, - kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, swaFullCache: boolean + gpuLayers: number, ggufInsights: GgufInsights, vram: number, isEmbeddingContext: boolean, + flashAttention: LlamaContextOptions["flashAttention"], 
kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, swaFullCache: boolean, + useMmap?: boolean, simulatorSession?: GgufInsightsSimulatorSession }) { const maxContextSize = getDefaultModelContextSize({trainContextSize: ggufInsights.trainContextSize}); @@ -273,8 +426,8 @@ function findMaxPossibleContextSizeForVram({ maxValue: maxContextSize, minValue: minAllowedContextSizeInCalculations, minStep: 1, - test(contextSize) { - const contextVram = ggufInsights.estimateContextResourceRequirements({ + async test(contextSize) { + const contextVram = (await ggufInsights.estimateContextResourceRequirementsV2({ contextSize, batchSize: getDefaultContextBatchSize({contextSize, sequences: 1}), modelGpuLayers: gpuLayers, @@ -283,8 +436,11 @@ function findMaxPossibleContextSizeForVram({ flashAttention, kvCacheKeyType, kvCacheValueType, - swaFullCache - }).gpuVram; + swaFullCache, + + _simulatorSession: simulatorSession, + useMmap: useMmap + })).gpuVram; if (contextVram <= vram) return { @@ -297,7 +453,7 @@ function findMaxPossibleContextSizeForVram({ }); } -function findMaxValidValue({ +async function findMaxValidValue({ maxValue, minValue, minStep = 1, @@ -306,29 +462,26 @@ function findMaxValidValue({ maxValue: number, minValue: number, minStep?: number, - test(value: number): T | null -}): T | null { + test(value: number): Promise +}): Promise { let step = -Math.max(minStep, Math.floor((maxValue - minValue) / 4)); let bestValue: null | {value: number, result: T} = null; for (let value = maxValue; value >= minValue;) { const result: T | null = (bestValue != null && value === bestValue.value) ? 
bestValue.result - : test(value); + : await test(value); - if (result != null) { - if (bestValue == null || value >= bestValue.value) { - bestValue = {value: value, result: result}; + if (result != null && (bestValue == null || value >= bestValue.value)) { + bestValue = {value: value, result: result}; - if (step === -minStep) - break; - else if (step < 0) - step = Math.max(minStep, Math.floor(-step / 2)); - } + if (step === -minStep || value === maxValue) + break; + else if (step < 0) + step = Math.max(minStep, Math.floor(-step / 2)); } else if (bestValue != null && value < bestValue.value) { value = bestValue.value; step = Math.max(minStep, Math.floor(Math.abs(step) / 2)); - continue; } else if (step > 0) step = -Math.max(minStep, Math.floor(step / 2)); @@ -337,7 +490,9 @@ function findMaxValidValue({ value += step; if (value < minValue) { - value = minValue; + value = bestValue != null + ? Math.max(bestValue.value, minValue) + : minValue; step = Math.max(minStep, Math.floor(Math.abs(step) / 2)); } else if (value > maxValue) { value = maxValue; diff --git a/src/gguf/parser/GgufV2Parser.ts b/src/gguf/parser/GgufV2Parser.ts index cdd66bc4..69b7b824 100644 --- a/src/gguf/parser/GgufV2Parser.ts +++ b/src/gguf/parser/GgufV2Parser.ts @@ -62,7 +62,7 @@ export class GgufV2Parser { tensorInfo: tensorReadResult?.tensorInfo, metadataSize: headerReadResult.headerSize + initialOffset, tensorInfoSize: tensorReadResult?.tensorInfoSize, - tensorDataOffset: tensorReadResult?.tensorDataOffset + infoEndOffset: tensorReadResult?.infoEndOffset }; } @@ -201,7 +201,7 @@ export class GgufV2Parser { return { tensorInfo, tensorInfoSize: readOffset.offset - initialOffset, - tensorDataOffset: fileTensorDataOffset + infoEndOffset: fileTensorDataOffset }; } }); diff --git a/src/gguf/parser/parseGguf.ts b/src/gguf/parser/parseGguf.ts index 8f7a9919..ef4d2ead 100644 --- a/src/gguf/parser/parseGguf.ts +++ b/src/gguf/parser/parseGguf.ts @@ -3,8 +3,10 @@ import {getConsoleLogPrefix} from 
"../../utils/getConsoleLogPrefix.js"; import {UnsupportedError} from "../../utils/UnsupportedError.js"; import {GgufReadOffset} from "../utils/GgufReadOffset.js"; import {GgufFileReader} from "../fileReaders/GgufFileReader.js"; -import {GgufFileInfo, GgufVersionParserOptions, GgufVersionParserResult} from "../types/GgufFileInfoTypes.js"; +import {GgufFileInfo, GgufFileInfoSourceData, GgufVersionParserOptions, GgufVersionParserResult} from "../types/GgufFileInfoTypes.js"; import {getGgufMetadataArchitectureData} from "../utils/getGgufMetadataArchitectureData.js"; +import {GgufFsFileReader} from "../fileReaders/GgufFsFileReader.js"; +import {Promisable, transformPromisable} from "../../utils/transformPromisable.js"; import {GgufV2Parser} from "./GgufV2Parser.js"; import {GgufV3Parser} from "./GgufV3Parser.js"; @@ -33,11 +35,21 @@ export async function parseGguf({ logWarnings }); const architectureMetadata = getGgufMetadataArchitectureData(ggufInfo.metadata); + const sourceData: Promisable | undefined = ggufInfo.infoEndOffset == null + ? undefined + : (fileReader instanceof GgufFsFileReader) + ? { + type: "path", + path: fileReader.filePath, + length: ggufInfo.infoEndOffset + } + : transformPromisable(fileReader.readByteRange(0, ggufInfo.infoEndOffset), createGgufFileInfoSourceDataFromBuffer); return { version: magicAndVersion.version, tensorCount: ggufInfo.tensorCount, metadata: ggufInfo.metadata, + infoEndOffset: ggufInfo.infoEndOffset, architectureMetadata: architectureMetadata, tensorInfo: ggufInfo.tensorInfo, metadataSize: ggufInfo.metadataSize, @@ -45,6 +57,11 @@ export async function parseGguf({ totalTensorInfoSize: ggufInfo.tensorInfoSize, totalTensorCount: ggufInfo.tensorCount, totalMetadataSize: ggufInfo.metadataSize, + sourceData: sourceData == null + ? [] + : sourceData instanceof Promise + ? 
[await sourceData] + : [sourceData], fullTensorInfo: ggufInfo.tensorInfo, tensorInfoSize: ggufInfo.tensorInfoSize }; @@ -87,3 +104,10 @@ async function parseGgufUsingASpecificVersionParser( return await (new GgufV3Parser(specificVersionParserOptions)).parse(); } } + +function createGgufFileInfoSourceDataFromBuffer(buffer: Buffer): GgufFileInfoSourceData { + return { + type: "buffer", + buffer + }; +} diff --git a/src/gguf/readGgufFileInfo.ts b/src/gguf/readGgufFileInfo.ts index f57a0669..1f18c6e8 100644 --- a/src/gguf/readGgufFileInfo.ts +++ b/src/gguf/readGgufFileInfo.ts @@ -85,6 +85,11 @@ export async function readGgufFileInfo(pathOrUri: string, { endpoints?: ModelDownloadEndpoints } = {}) { const useNetworkReader = sourceType === "network" || (sourceType == null && (isUrl(pathOrUri) || isModelUri(pathOrUri))); + function createSource(pathOrUri: string) { + return useNetworkReader + ? {type: "uri" as const, uri: pathOrUri} + : {type: "path" as const, path: pathOrUri}; + } async function createFileReader(pathOrUri: string) { if (useNetworkReader) { @@ -125,6 +130,8 @@ export async function readGgufFileInfo(pathOrUri: string, { (tensor as Writable).filePart = splitPartNumber; } + (res as Writable).source = createSource(pathOrUri); + return res; } @@ -147,6 +154,7 @@ export async function readGgufFileInfo(pathOrUri: string, { version: first.version, tensorCount: first.tensorCount, metadata: first.metadata, + infoEndOffset: first.infoEndOffset, architectureMetadata: first.architectureMetadata, tensorInfo: first.tensorInfo, metadataSize: first.metadataSize, @@ -159,6 +167,8 @@ export async function readGgufFileInfo(pathOrUri: string, { fullTensorInfo: first.fullTensorInfo == null ? undefined : [first, ...rest].flatMap((part) => (part.fullTensorInfo ?? 
[])), - tensorInfoSize: first.tensorInfoSize + tensorInfoSize: first.tensorInfoSize, + source: createSource(pathOrUri), + sourceData: [first, ...rest].flatMap((part) => part.sourceData) } satisfies GgufFileInfo; } diff --git a/src/gguf/types/GgufFileInfoTypes.ts b/src/gguf/types/GgufFileInfoTypes.ts index 0ff9a20c..f506698f 100644 --- a/src/gguf/types/GgufFileInfoTypes.ts +++ b/src/gguf/types/GgufFileInfoTypes.ts @@ -16,6 +16,13 @@ export type GgufFileInfo = { readonly metadata: GgufMetadata, readonly metadataSize: number, + /** + * Offset in bytes from the start of the file to the end of the preserved GGUF info section. + * This includes the header, key-value metadata, tensor info and the alignment padding up to the tensor data section. + * Can be null if `readTensorInfo` is set to `false`. + */ + readonly infoEndOffset?: number, + /** Same value as `metadata[metadata.general.architecture]`, but with merged types for convenience */ readonly architectureMetadata: MergeOptionalUnionTypes>, @@ -60,7 +67,41 @@ export type GgufFileInfo = { * * When no splicing is done, this will be the same as `tensorInfoSize`. */ - readonly totalTensorInfoSize?: number + readonly totalTensorInfoSize?: number, + + /** + * An array of source data entries from which the file info was read. + * Each entry can be either a file path or a read-only buffer containing the raw GGUF metadata section part of the file + * (including the header, key-value pairs, tensor info and alignment padding up to the tensor data section). + * + * For a single source file, this array will contain only a single entry, + * but for spliced metadata from multiple file parts, this array will contain an entry for each part, in the order they were spliced. + * + * When `readTensorInfo` is set to `false`, this will be an empty array. + */ + readonly sourceData: GgufFileInfoSourceData[], + + /** + * Indication of the source of the GGUF file info, such as the file path or URI it was read from. 
+ */ + readonly source?: GgufFileInfoSource +}; + +export type GgufFileInfoSource = { + type: "path", + path: string +} | { + type: "uri", + uri: string +}; + +export type GgufFileInfoSourceData = { + type: "path", + path: string, + length: number +} | { + type: "buffer", + buffer: Readonly }; @@ -97,5 +138,5 @@ export type GgufVersionParserResult = { tensorInfo?: GgufTensorInfo[], metadataSize: number, tensorInfoSize?: number, - tensorDataOffset?: number + infoEndOffset?: number }; diff --git a/src/gguf/types/GgufMetadataTypes.ts b/src/gguf/types/GgufMetadataTypes.ts index 249cad32..3d53e764 100644 --- a/src/gguf/types/GgufMetadataTypes.ts +++ b/src/gguf/types/GgufMetadataTypes.ts @@ -47,6 +47,7 @@ export const enum GgufArchitectureType { gemma2 = "gemma2", gemma3 = "gemma3", gemma3n = "gemma3n", + gemma4 = "gemma4", gemmaEmbedding = "gemma-embedding", starcoder2 = "starcoder2", mamba = "mamba", @@ -64,6 +65,7 @@ export const enum GgufArchitectureType { arctic = "arctic", deepseek = "deepseek", deepseek2 = "deepseek2", + deepseek2ocr = "deepseek2-ocr", chatglm = "chatglm", glm4 = "glm4", glm4moe = "glm4moe", @@ -195,7 +197,8 @@ export enum GgufFileType { MOSTLY_TQ1_0 = 36, MOSTLY_TQ2_0 = 37, MOSTLY_MXFP4_MOE = 38, - MOSTLY_NVFP4 = 39 + MOSTLY_NVFP4 = 39, + MOSTLY_Q1_0 = 40 } @@ -299,13 +302,13 @@ export const enum GgufMetadataTokenizerTokenType { export type GgufMetadataTokenizer = { readonly ggml: { - readonly model: "no_vocab" | "none" | "llama" | "gpt2" | "bert" | "rwkv" | "t5" | "plamo2" | string, + readonly model: "no_vocab" | "none" | "llama" | "gpt2" | "bert" | "rwkv" | "t5" | "plamo2" | "gemma4" | string, readonly pre?: "default" | "llama3" | "llama-v3" | "llama-bpe" | "deepseek-llm" | "deepseek-coder" | "falcon" | "falcon3" | "pixtral" | "mpt" | "starcoder" | "gpt-2" | "phi-2" | "jina-es" | "jina-de" | "jina-v1-en" | "jina-v2-es" | "jina-v2-de" | "jina-v2-code" | "refact" | "command-r" | "qwen2" | "stablelm2" | "olmo" | "dbrx" | "smaug-bpe" | "poro-chat" 
| "chatglm-bpe" | "viking" | "jais" | "tekken" | "smollm" | "codeshell" | "bloom" | "gpt3-finnish" | "exaone" | "exaone4" | "chameleon" | "minerva-7b" | "megrez" | "gpt-4o" | "superbpe" | "trillion" | "bailingmoe" | "a.x-4.0" | "mellum" | "modern-bert" | - "roberta-bpe" | "deepseek-r1-qwen" | "kormo" | "qwen35" | string, + "roberta-bpe" | "deepseek-r1-qwen" | "kormo" | "qwen35" | "gemma4" | string, readonly tokens: readonly string[], readonly token_type: GgufMetadataTokenizerTokenType[], readonly token_type_count?: number, @@ -378,8 +381,11 @@ export type GgufMetadataDefaultArchitectureType = { readonly layer_norm_rms_epsilon?: number, readonly key_length?: number, readonly value_length?: number, + readonly key_length_swa?: number, + readonly value_length_swa?: number, readonly sliding_window?: number, readonly sliding_window_pattern?: number | number[], + readonly shared_kv_layers?: number, readonly causal?: boolean }, diff --git a/src/gguf/types/GgufTensorInfoTypes.ts b/src/gguf/types/GgufTensorInfoTypes.ts index 39e2b984..42b038a5 100644 --- a/src/gguf/types/GgufTensorInfoTypes.ts +++ b/src/gguf/types/GgufTensorInfoTypes.ts @@ -61,7 +61,8 @@ export enum GgmlType { IQ4_NL_4_8 = 37, IQ4_NL_8_8 = 38, MXFP4 = 39, // MXFP4 (1 block) - NVFP4 = 40 // NVFP4 (4 blocks, E4M3 scale) + NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale) + Q1_0 = 41 } export function resolveGgmlTypeOption(option?: keyof typeof GgmlType | GgmlType) { diff --git a/src/gguf/utils/ggufQuantNames.ts b/src/gguf/utils/ggufQuantNames.ts index 3e2c5c65..b56a166a 100644 --- a/src/gguf/utils/ggufQuantNames.ts +++ b/src/gguf/utils/ggufQuantNames.ts @@ -1,10 +1,11 @@ import {GgufFileType} from "../types/GgufMetadataTypes.js"; export const ggufQuantNames = new Map([ + ["Q1_0", GgufFileType.MOSTLY_Q1_0], ["Q4_0", GgufFileType.MOSTLY_Q4_0], ["Q4_1", GgufFileType.MOSTLY_Q4_1], - ["MXFP4", GgufFileType.MOSTLY_MXFP4_MOE], - ["NVFP4", GgufFileType.MOSTLY_MXFP4_MOE], + ["MXFP4_MOE", GgufFileType.MOSTLY_MXFP4_MOE], + 
["NVFP4", GgufFileType.MOSTLY_NVFP4], ["Q5_0", GgufFileType.MOSTLY_Q5_0], ["Q5_1", GgufFileType.MOSTLY_Q5_1], ["IQ2_XXS", GgufFileType.MOSTLY_IQ2_XXS], diff --git a/src/index.ts b/src/index.ts index 66d254fb..0577ae03 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,10 +1,11 @@ import {DisposedError} from "lifecycle-utils"; import {Llama} from "./bindings/Llama.js"; -import {getLlama, type LlamaOptions, type LastBuildOptions} from "./bindings/getLlama.js"; +import {getLlama, type LlamaOptions, type LastBuildOptions, type LlamaExperimentalOptions} from "./bindings/getLlama.js"; import {getLlamaGpuTypes} from "./bindings/utils/getLlamaGpuTypes.js"; import {NoBinaryFoundError} from "./bindings/utils/NoBinaryFoundError.js"; import { - type LlamaGpuType, type LlamaNuma, LlamaLogLevel, LlamaLogLevelGreaterThan, LlamaLogLevelGreaterThanOrEqual, LlamaVocabularyType + type LlamaGpuType, type LlamaNuma, type RamState, LlamaLogLevel, LlamaLogLevelGreaterThan, LlamaLogLevelGreaterThanOrEqual, + LlamaVocabularyType } from "./bindings/types.js"; import {resolveModelFile, type ResolveModelFileOptions} from "./utils/resolveModelFile.js"; import {LlamaModel, LlamaModelInfillTokens, type LlamaModelOptions, LlamaModelTokens} from "./evaluator/LlamaModel/LlamaModel.js"; @@ -62,6 +63,7 @@ import {FalconChatWrapper} from "./chatWrappers/FalconChatWrapper.js"; import {AlpacaChatWrapper} from "./chatWrappers/AlpacaChatWrapper.js"; import {FunctionaryChatWrapper} from "./chatWrappers/FunctionaryChatWrapper.js"; import {GemmaChatWrapper} from "./chatWrappers/GemmaChatWrapper.js"; +import {Gemma4ChatWrapper} from "./chatWrappers/Gemma4ChatWrapper.js"; import {HarmonyChatWrapper} from "./chatWrappers/HarmonyChatWrapper.js"; import {TemplateChatWrapper, type TemplateChatWrapperOptions} from "./chatWrappers/generic/TemplateChatWrapper.js"; import { @@ -108,7 +110,7 @@ import { type GbnfJsonBasicStringSchema, type GbnfJsonFormatStringSchema, type GbnfJsonObjectSchema, type GbnfJsonOneOfSchema, 
type GbnfJsonSchema, type GbnfJsonSchemaImmutableType, type GbnfJsonSchemaToType } from "./utils/gbnfJson/types.js"; -import {type GgufFileInfo} from "./gguf/types/GgufFileInfoTypes.js"; +import {type GgufFileInfo, type GgufFileInfoSource, type GgufFileInfoSourceData} from "./gguf/types/GgufFileInfoTypes.js"; import { type GgufMetadata, type GgufMetadataLlmToType, GgufArchitectureType, GgufFileType, GgufMetadataTokenizerTokenType, GgufMetadataArchitecturePoolingType, type GgufMetadataGeneral, type GgufMetadataTokenizer, type GgufMetadataDefaultArchitectureType, @@ -129,8 +131,10 @@ export { getLlamaGpuTypes, type LlamaOptions, type LastBuildOptions, + type LlamaExperimentalOptions, type LlamaGpuType, type LlamaNuma, + type RamState, type LlamaClasses, LlamaLogLevel, NoBinaryFoundError, @@ -231,6 +235,7 @@ export { AlpacaChatWrapper, FunctionaryChatWrapper, GemmaChatWrapper, + Gemma4ChatWrapper, HarmonyChatWrapper, TemplateChatWrapper, type TemplateChatWrapperOptions, @@ -300,6 +305,8 @@ export { LlamaLogLevelGreaterThanOrEqual, readGgufFileInfo, type GgufFileInfo, + type GgufFileInfoSource, + type GgufFileInfoSourceData, type GgufMetadata, type GgufTensorInfo, type GgufMetadataLlmToType, diff --git a/src/utils/GitHubClient.ts b/src/utils/GitHubClient.ts new file mode 100644 index 00000000..fcce828d --- /dev/null +++ b/src/utils/GitHubClient.ts @@ -0,0 +1,209 @@ +import {getConsoleLogPrefix} from "./getConsoleLogPrefix.js"; + +const defaultGitHubApiBase = "https://api.github.com"; +const defaultGitHubApiVersion: GitHubApiVersion = "2022-11-28"; + +type GitHubApiVersion = "2022-11-28" | (string & {}); + +type GitHubClientOptions = { + token?: string, + + /** + * GitHub REST API base URL. + * + * Defaults to `https://api.github.com`. + */ + apiBase?: string, + + /** + * GitHub REST API version header. + * + * Defaults to `"2022-11-28"`. 
+ */ + apiVersion?: GitHubApiVersion, + + userAgent?: string, + + /** + * When the API ratelimit is exceeded, retry with a GitHub token if one is found + * in the environment variables (`GITHUB_TOKEN` or `GH_TOKEN`). + * + * Defaults to `true`. + */ + retryWithToken?: boolean +}; + +export type GitHubRelease = { + url: string, + "html_url": string, + "assets_url": string, + "upload_url": string, + + id: number, + "node_id": string, + + "tag_name": string, + "target_commitish": string, + name: string | null, + body: string | null, + + draft: boolean, + prerelease: boolean, + + "created_at": string, // ISO date-time + "published_at": string | null, // ISO date-time + + author: GitHubUser | null, + + assets: GitHubReleaseAsset[], + + "tarball_url": string | null, + "zipball_url": string | null +}; + +export type GitHubUser = { + login: string, + id: number, + "node_id": string, + "avatar_url": string, + "html_url": string, + type: string, + "site_admin": boolean +}; + +export type GitHubReleaseAsset = { + url: string, + id: number, + "node_id": string, + + name: string, + label: string | null, + "content_type": string, + state: string, + size: number, + "download_count": number, + + "browser_download_url": string, + + "created_at": string, // ISO date-time + "updated_at": string, // ISO date-time + + uploader: GitHubUser | null +}; + +export type GitHubApiError = Error & { + status: number, + url: string, + bodyText?: string, + headers?: Record +}; + +export type GitHubPullRequestStatus = { + merged: boolean, + mergeable: true | false | null, + merged_at: string | null +}; + +export class GitHubClient { + private readonly _clientOptions: GitHubClientOptions; + + public constructor(clientOptions: GitHubClientOptions = {}) { + this._clientOptions = clientOptions; + } + + public async getLatestRelease({ + owner, repo + }: { + owner: string, + repo: string + }): Promise { + return this._fetchJson( + 
`/repos/${encodeURIComponent(owner)}/${encodeURIComponent(repo)}/releases/latest` + ); + } + + public async getReleaseByTag({ + owner, repo, tag + }: { + owner: string, + repo: string, + tag: string + }): Promise { + return this._fetchJson( + `/repos/${encodeURIComponent(owner)}/${encodeURIComponent(repo)}/releases/tags/${encodeURIComponent(tag)}` + ); + } + + public async getPullRequestStatus({ + owner, repo, id + }: { + owner: string, + repo: string, + id: string + }): Promise { + return this._fetchJson( + `/repos/${encodeURIComponent(owner)}/${encodeURIComponent(repo)}/pulls/${encodeURIComponent(id)}` + ); + } + + private async _fetchJson( + path: string + ): Promise { + const url = this._getApiBase() + path; + + const headers: Record = { + Accept: "application/vnd.github+json", + "X-GitHub-Api-Version": this._clientOptions.apiVersion ?? defaultGitHubApiVersion + }; + + if (this._clientOptions.token != null && this._clientOptions.token !== "") + headers.Authorization = "Bearer " + this._clientOptions.token; + + if (this._clientOptions.userAgent != null && this._clientOptions.userAgent !== "") + headers["User-Agent"] = this._clientOptions.userAgent; + + + let res = await fetch(url, { + method: "GET", + headers + }); + + if (!res.ok && res.status === 403 && headers.Authorization == null && this._clientOptions.retryWithToken !== false && + res.headers.get("X-RateLimit-Remaining") === "0" + ) { + const tokenFromEnv = process.env.GITHUB_TOKEN || process.env.GH_TOKEN; + if (tokenFromEnv != null && tokenFromEnv !== "") { + headers.Authorization = "Bearer " + tokenFromEnv; + + res = await fetch(url, { + method: "GET", + headers + }); + } else + console.warn(getConsoleLogPrefix(), "GitHub API rate limit exceeded. 
Try setting the `GITHUB_TOKEN` environment variable to increase the rate limit"); + } + + if (!res.ok) { + const err = new Error( + `GitHub API error ${res.status} ${res.statusText}` + ) as GitHubApiError; + + err.status = res.status; + err.url = url; + err.headers = Object.fromEntries(res.headers.entries()); + try { + err.bodyText = await res.text(); + } catch { + err.bodyText = undefined; + } + + throw err; + } + + return (await res.json()) as T; + } + + private _getApiBase() { + return this._clientOptions?.apiBase ?? defaultGitHubApiBase; + } +} diff --git a/src/utils/LlamaText.ts b/src/utils/LlamaText.ts index 6675b762..70ba0ec2 100644 --- a/src/utils/LlamaText.ts +++ b/src/utils/LlamaText.ts @@ -122,6 +122,10 @@ class LlamaText { return LlamaTextConstructor.compare(this, other); } + public trim(): LlamaText { + return this.trimStart().trimEnd(); + } + public trimStart(): LlamaText { const newValues = this.values.slice(); diff --git a/src/utils/LruCache.ts b/src/utils/LruCache.ts index 6d6b40a2..7f44cd17 100644 --- a/src/utils/LruCache.ts +++ b/src/utils/LruCache.ts @@ -52,6 +52,10 @@ export class LruCache { return this._cache.keys(); } + public values() { + return this._cache.values(); + } + public delete(key: Key) { this._cache.delete(key); } diff --git a/src/utils/ThreadsSplitter.ts b/src/utils/ThreadsSplitter.ts index 4b207b0f..7f08e29d 100644 --- a/src/utils/ThreadsSplitter.ts +++ b/src/utils/ThreadsSplitter.ts @@ -140,10 +140,10 @@ export class ThreadsSplitterConsumer { this._threadsSplitter._addThreadDemand(this._demandedThreads); this._wantedThreadsGcRegistry = new FinalizationRegistry(this._threadsSplitter._removeWantedThreads); - this._wantedThreadsGcRegistry.register(this, this._wantedThreads); + this._wantedThreadsGcRegistry.register(this, this._wantedThreads, this); this._demandedThreadsGcRegistry = new FinalizationRegistry(this._threadsSplitter._removeThreadDemand); - this._demandedThreadsGcRegistry.register(this, this._demandedThreads); + 
this._demandedThreadsGcRegistry.register(this, this._demandedThreads, this); } public [Symbol.dispose]() { diff --git a/src/utils/cmake.ts b/src/utils/cmake.ts index 0155b627..1cded63d 100644 --- a/src/utils/cmake.ts +++ b/src/utils/cmake.ts @@ -68,13 +68,13 @@ export async function getCmakePath() { throw new Error("cmake not found"); } -export async function downloadCmakeIfNeeded(wrapWithStatusLogs: boolean = false) { +export async function downloadCmakeIfNeeded(wrapWithStatusLogs: boolean | "stderr" = "stderr") { try { await getCmakePath(); return; } catch (err) {} - if (!wrapWithStatusLogs) + if (wrapWithStatusLogs === false) await downloadCmake({progressLogs: wrapWithStatusLogs}); else { try { @@ -140,7 +140,7 @@ async function findExistingCmake() { return undefined; } -async function downloadCmake({progressLogs = true}: {progressLogs?: boolean} = {}) { +async function downloadCmake({progressLogs = "stderr"}: {progressLogs?: boolean | "stderr"} = {}) { await withLockfile({ resourcePath: path.join(xpackDirectory, "cmakeInstall") }, async () => { diff --git a/src/utils/createModelDownloader.ts b/src/utils/createModelDownloader.ts index 4acc0b23..1c48018d 100644 --- a/src/utils/createModelDownloader.ts +++ b/src/utils/createModelDownloader.ts @@ -294,6 +294,19 @@ export class ModelDownloader { .reduce((acc, transferredBytes) => acc + transferredBytes, 0); } + /** + * Info about all the files that will be saved to the download directory, + * including their filenames, full paths, total sizes and downloaded sizes. 
+ */ + public get filesInfo() { + return this._specificFileDownloaders.map((downloader) => ({ + fileName: downloader.fileName!, + filePath: path.join(this._dirPath, downloader.fileName!), + totalSize: downloader.status.totalBytes, + downloadedSize: downloader.status.transferredBytes + })); + } + /** * @returns The path to the entrypoint file that should be used to load the model */ @@ -361,9 +374,6 @@ export class ModelDownloader { /** @internal */ private async resolveTryHeaders() { - if (this._tokens == null) - return; - pushAll( this._tryHeaders, await resolveModelFileAccessTokensTryHeaders(this._modelUrl, this._tokens, this._endpoints, this._headers) diff --git a/src/utils/findBestOption.ts b/src/utils/findBestOption.ts index a14b2f16..8348ed31 100644 --- a/src/utils/findBestOption.ts +++ b/src/utils/findBestOption.ts @@ -19,3 +19,118 @@ export function findBestOption({generator, score}: { return bestOption; } + +/** + * This algorithm assumes that the first non-null score is the best one and from there + * it then starts iterating by 1 index forward to find the actual best option. + * + * It prefills the next `prefill` options to jump ahead an score fewer options to find the best one faster and more efficiently. + */ +export async function findFirstNonNullBestOptionAsync({generator, score, prefill}: { + generator: () => Generator, + score: (option: O) => Promise, + prefill: number +}): Promise { + const iterator = generator(); + let iteratorDone = false; + const options: O[] = []; + const scores = new Map(); + + function getIndex(index: number) { + if (index < options.length) + return options[index]!; + + if (iteratorDone) + return undefined; + + while (options.length <= index) { + const nextOption = iterator.next(); + if (nextOption.done) { + iteratorDone = true; + return undefined; + } + + options.push(nextOption.value); + } + + return options[index]; + } + + let step = Math.max(1, (Number.isFinite(prefill) ? 
Math.floor(prefill) : 5)); + let currentIndex = 0; + let bestIndex: number | null = null; + let bestScore: number | null = null; + while (true) { + if (currentIndex < 0) + currentIndex = 0; + + const option = getIndex(currentIndex); + if (option == null) + break; + + const currentScore = scores.get(currentIndex) ?? await score(option); + if (!scores.has(currentIndex)) + scores.set(currentIndex, currentScore); + + if (currentScore == null) { + if (step < 0) + step = Math.max(1, Math.floor(-step / 2)); + + while (bestIndex != null && currentIndex + step >= bestIndex && step !== 1) + step = Math.max(1, Math.floor(step / 2)); + + let nextIndex = currentIndex + step; + if (getIndex(nextIndex) == null) { + nextIndex = options.length - 1; + if (currentIndex === nextIndex) + break; + } + currentIndex = nextIndex; + } else if (bestScore == null || currentScore > bestScore) { + bestIndex = currentIndex; + bestScore = currentScore; + + step = -Math.max(1, Math.floor(Math.abs(step) / 2)); + + let nextIndex = currentIndex + step; + if (nextIndex < 0) { + nextIndex = 0; + step = Math.max(1, Math.floor(Math.abs(step) / 2)); + nextIndex = currentIndex + step; + } + + if (getIndex(nextIndex) == null) { + nextIndex = options.length - 1; + if (currentIndex === nextIndex) + break; + } + + currentIndex = nextIndex; + } else if (bestIndex != null && currentScore < bestScore && currentIndex > bestIndex) { + step = -Math.max(1, Math.floor(Math.abs(currentIndex - bestIndex) / 2)); + currentIndex = bestIndex + step; + } else if (bestIndex != null && currentScore < bestScore && currentIndex < bestIndex) { + if (step < 0) + step = Math.max(1, Math.floor(Math.abs(bestIndex - currentIndex) / 2)); + + currentIndex = currentIndex + step; + } else if (currentScore === bestScore && currentIndex === bestIndex && + (step === 1 || currentIndex === 0) + ) { + if (scores.has(currentIndex + 1) || (iteratorDone && currentIndex === options.length - 1)) + break; + + step = 1; + currentIndex = bestIndex + 
step; + } else + currentIndex = currentIndex + step; + + if (iteratorDone && scores.size === options.length && bestIndex != null) + break; + } + + if (bestIndex == null) + return null; + + return options[bestIndex] ?? null; +} diff --git a/src/utils/getBuildDefaults.ts b/src/utils/getBuildDefaults.ts index 97f61b02..dc4e6337 100644 --- a/src/utils/getBuildDefaults.ts +++ b/src/utils/getBuildDefaults.ts @@ -1,4 +1,5 @@ -import {defaultLlamaCppGitHubRepo, defaultLlamaCppGpuSupport, defaultLlamaCppRelease} from "../config.js"; +import {defaultLlamaCppRelease} from "../bindings/utils/binariesGithubRelease.js"; +import {defaultLlamaCppGitHubRepo, defaultLlamaCppGpuSupport} from "../config.js"; export async function getBuildDefaults() { return { diff --git a/src/utils/getTempDir.ts b/src/utils/getTempDir.ts index 2d505091..5f93121d 100644 --- a/src/utils/getTempDir.ts +++ b/src/utils/getTempDir.ts @@ -112,7 +112,7 @@ export class FsPathHandle { this._finalizationRegistry = new FinalizationRegistry(removePathUsageSync); addPathUsage(this.path); - this._finalizationRegistry.register(this, this.path); + this._finalizationRegistry.register(this, this.path, this); } public async dispose() { diff --git a/src/utils/gitReleaseBundles.ts b/src/utils/gitReleaseBundles.ts index e2159932..bc8dd5fb 100644 --- a/src/utils/gitReleaseBundles.ts +++ b/src/utils/gitReleaseBundles.ts @@ -108,22 +108,26 @@ async function unshallowAndSquashCurrentRepoWithSubmodulesAndSaveItAsReleaseBund } export async function getGitBundlePathForRelease(githubOwner: string, githubRepo: string, release: string) { + if (!(await isGitBundleCompatible(githubOwner, githubRepo, release))) + return null; + + if (!(await fs.pathExists(currentReleaseGitBundlePath))) + return null; + + return currentReleaseGitBundlePath; +} + +export async function isGitBundleCompatible(githubOwner: string, githubRepo: string, release: string) { const [builtinGithubOwner, builtinGithubRepo] = builtinLlamaCppGitHubRepo.split("/"); if 
(githubOwner !== builtinGithubOwner || githubRepo !== builtinGithubRepo) - return null; + return false; const currentBundleRelease = await getBinariesGithubRelease(); if (isGithubReleaseNeedsResolving(currentBundleRelease)) - return null; - - if (currentBundleRelease !== release) - return null; + return false; - if (!(await fs.pathExists(currentReleaseGitBundlePath))) - return null; - - return currentReleaseGitBundlePath; + return currentBundleRelease === release; } async function getCurrentTagOrBranch() { diff --git a/src/utils/modelFileAccessTokens.ts b/src/utils/modelFileAccessTokens.ts index acb1bde7..3018c0f3 100644 --- a/src/utils/modelFileAccessTokens.ts +++ b/src/utils/modelFileAccessTokens.ts @@ -17,25 +17,26 @@ export async function resolveModelFileAccessTokensTryHeaders( ) { const res: Record[] = []; - if (tokens == null || !isUrl(modelUrl)) + if (!isUrl(modelUrl)) return res; - const {huggingFace} = tokens; + const {huggingFace} = tokens ?? {}; if (isHuggingFaceUrl(modelUrl, endpoints)) { - const hfToken = resolveHfToken(huggingFace); + const hfToken = await resolveHfToken(huggingFace); - res.push({ - ...(baseHeaders ?? {}), - "Authorization": `Bearer ${hfToken}` - }); + if (hfToken != null) + res.push({ + ...(baseHeaders ?? 
{}), + "Authorization": `Bearer ${hfToken}` + }); } return res; } async function resolveHfToken(providedToken?: string) { - if (providedToken !== null) + if (providedToken !== undefined) // if `null` then don't try to resolve from env var or file return providedToken; if (process.env.HF_TOKEN != null) diff --git a/src/utils/parseModelUri.ts b/src/utils/parseModelUri.ts index 7cf1696f..2a46050f 100644 --- a/src/utils/parseModelUri.ts +++ b/src/utils/parseModelUri.ts @@ -217,15 +217,15 @@ async function fetchHuggingFaceModelManifest({ ].join(""); const headersToTry = [ {}, - await resolveModelFileAccessTokensTryHeaders(manifestUrl, tokens, endpoints) + ...await resolveModelFileAccessTokensTryHeaders(manifestUrl, tokens, endpoints) ]; let rateLimitPendingRetries = 0; - + for (let i = 0; i < headersToTry.length * (1 + rateLimitPendingRetries); i++) { const headers = headersToTry[i % headersToTry.length]; if (headers == null) continue; - + let response: Awaited> | undefined; try { response = await fetch(manifestUrl, { diff --git a/src/utils/resolveGithubRelease.ts b/src/utils/resolveGithubRelease.ts index ac280b94..cabc4022 100644 --- a/src/utils/resolveGithubRelease.ts +++ b/src/utils/resolveGithubRelease.ts @@ -1,6 +1,10 @@ import {getConsoleLogPrefix} from "./getConsoleLogPrefix.js"; +import {GitHubClient, GitHubRelease} from "./GitHubClient.js"; -export async function resolveGithubRelease(githubOwner: string, githubRepo: string, release: string) { +export async function resolveGithubRelease(githubOwner: string, githubRepo: string, release: string): Promise<{ + tag: string, + date: Date +}> { const githubClient = new GitHubClient(); const repo = githubOwner + "/" + githubRepo; @@ -30,176 +34,13 @@ export async function resolveGithubRelease(githubOwner: string, githubRepo: stri if (githubRelease.tag_name == null) throw new Error(`Failed to find tag of release "${release}" of "${repo}"`); - return githubRelease.tag_name; + return { + tag: githubRelease.tag_name, + 
date: new Date(githubRelease.created_at) + }; } export function isGithubReleaseNeedsResolving(release: string) { return release === "latest"; } -const defaultGitHubApiBase = "https://api.github.com"; -const defaultGitHubApiVersion: GitHubApiVersion = "2022-11-28"; - -type GitHubApiVersion = "2022-11-28" | (string & {}); - -type GitHubClientOptions = { - token?: string, - - /** - * GitHub REST API base URL. - * - * Defaults to `https://api.github.com`. - */ - apiBase?: string, - - /** - * GitHub REST API version header. - * - * Defaults to `"2022-11-28"`. - */ - apiVersion?: GitHubApiVersion, - - userAgent?: string -}; - -type GitHubRelease = { - url: string, - "html_url": string, - "assets_url": string, - "upload_url": string, - - id: number, - "node_id": string, - - "tag_name": string, - "target_commitish": string, - name: string | null, - body: string | null, - - draft: boolean, - prerelease: boolean, - - "created_at": string, // ISO date-time - "published_at": string | null, // ISO date-time - - author: GitHubUser | null, - - assets: GitHubReleaseAsset[], - - "tarball_url": string | null, - "zipball_url": string | null -}; - -type GitHubUser = { - login: string, - id: number, - "node_id": string, - "avatar_url": string, - "html_url": string, - type: string, - "site_admin": boolean -}; - -type GitHubReleaseAsset = { - url: string, - id: number, - "node_id": string, - - name: string, - label: string | null, - "content_type": string, - state: string, - size: number, - "download_count": number, - - "browser_download_url": string, - - "created_at": string, // ISO date-time - "updated_at": string, // ISO date-time - - uploader: GitHubUser | null -}; - -type GitHubApiError = Error & { - status: number, - url: string, - bodyText?: string, - headers?: Record -}; - -class GitHubClient { - private readonly _clientOptions: GitHubClientOptions; - - public constructor(clientOptions: GitHubClientOptions = {}) { - this._clientOptions = clientOptions; - } - - public async 
getLatestRelease({ - owner, repo - }: { - owner: string, - repo: string - }): Promise { - return this._fetchJson( - `/repos/${encodeURIComponent(owner)}/${encodeURIComponent(repo)}/releases/latest` - ); - } - - public async getReleaseByTag({ - owner, repo, tag - }: { - owner: string, - repo: string, - tag: string - }): Promise { - return this._fetchJson( - `/repos/${encodeURIComponent(owner)}/${encodeURIComponent(repo)}/releases/tags/${encodeURIComponent(tag)}` - ); - } - - private async _fetchJson( - path: string - ): Promise { - const url = this._getApiBase() + path; - - const headers: Record = { - Accept: "application/vnd.github+json", - "X-GitHub-Api-Version": this._clientOptions.apiVersion ?? defaultGitHubApiVersion - }; - - if (this._clientOptions.token != null && this._clientOptions.token !== "") - headers.Authorization = "Bearer " + this._clientOptions.token; - - if (this._clientOptions.userAgent != null && this._clientOptions.userAgent !== "") - headers["User-Agent"] = this._clientOptions.userAgent; - - - const res = await fetch(url, { - method: "GET", - headers - }); - - if (!res.ok) { - const err = new Error( - `GitHub API error ${res.status} ${res.statusText}` - ) as GitHubApiError; - - err.status = res.status; - err.url = url; - err.headers = Object.fromEntries(res.headers.entries()); - try { - err.bodyText = await res.text(); - } catch { - err.bodyText = undefined; - } - - throw err; - } - - return (await res.json()) as T; - } - - private _getApiBase() { - return this._clientOptions?.apiBase ?? 
defaultGitHubApiBase; - } -} diff --git a/src/utils/spawnCommand.ts b/src/utils/spawnCommand.ts index e894acfd..f85e6f8a 100644 --- a/src/utils/spawnCommand.ts +++ b/src/utils/spawnCommand.ts @@ -1,7 +1,7 @@ import spawn from "cross-spawn"; export function spawnCommand( - command: string, args: string[], cwd: string, env = process.env, progressLogs: boolean = true + command: string, args: string[], cwd: string, env = process.env, progressLogs: boolean | "stderr" = "stderr" ) { function getCommandString() { let res = command; @@ -66,10 +66,14 @@ export function spawnCommand( reject(createError(`Command ${getCommandString()} closed with code ${code}`)); }); - if (progressLogs) { + if (progressLogs === true) { child.stdout?.pipe(process.stdout); child.stderr?.pipe(process.stderr); process.stdin.pipe(child.stdin!); + } else if (progressLogs === "stderr") { + child.stdout?.pipe(process.stderr); + child.stderr?.pipe(process.stderr); + process.stdin.pipe(child.stdin!); } else { child.stderr?.pipe(process.stderr); } diff --git a/src/utils/withProgressLog.ts b/src/utils/withProgressLog.ts index 790140f8..11b622ad 100644 --- a/src/utils/withProgressLog.ts +++ b/src/utils/withProgressLog.ts @@ -128,7 +128,7 @@ export async function withProgressLog({ currentProgressBarText !== lastLogProgressBarText || (progress === 1 && lastLogProgress !== 1) ) { - console.log(getConsoleLogPrefix() + getLoadingText()); + console.warn(getConsoleLogPrefix() + getLoadingText()); lastLogProgress = currentProgress; lastLogProgressBarText = currentProgressBarText; } @@ -137,22 +137,22 @@ export async function withProgressLog({ } }; - console.log(getConsoleLogPrefix() + getLoadingText()); + console.warn(getConsoleLogPrefix() + getLoadingText()); try { const res = await callback(progressUpdater); - console.log(getConsoleLogPrefix() + `${logSymbols.success} ${successText}`); + console.warn(getConsoleLogPrefix() + `${logSymbols.success} ${successText}`); return res; } catch (er) { - 
console.log(getConsoleLogPrefix() + `${logSymbols.error} ${failText}`); + console.warn(getConsoleLogPrefix() + `${logSymbols.error} ${failText}`); throw er; } } - const updateManager = UpdateManager.getInstance(); + const updateManager = UpdateManager.getInstance(process.stderr); let etaUpdateTimeout: ReturnType | undefined = undefined; function getProgressLine() { @@ -280,7 +280,7 @@ export async function withProgressLog({ updateManager.unhook(true); if (moveCursorUpAfterUnhook) - process.stdout.moveCursor(0, -1); + process.stderr.moveCursor(0, -1); } } diff --git a/src/utils/withStatusLogs.ts b/src/utils/withStatusLogs.ts index f227d94c..1b2d6179 100644 --- a/src/utils/withStatusLogs.ts +++ b/src/utils/withStatusLogs.ts @@ -16,22 +16,22 @@ export default async function withStatusLogs( if (typeof messageAndOptions !== "string" && messageAndOptions.disableLogs) return await callback(); - console.log(getConsoleLogPrefix() + `${chalk.cyan(clockChar)} ${typeof messageAndOptions === "string" ? messageAndOptions : messageAndOptions.loading}`); + console.warn(getConsoleLogPrefix() + `${chalk.cyan(clockChar)} ${typeof messageAndOptions === "string" ? 
messageAndOptions : messageAndOptions.loading}`); try { const res = await callback(); if (typeof messageAndOptions !== "string") - console.log(getConsoleLogPrefix() + `${logSymbols.success} ${messageAndOptions.success}`); + console.warn(getConsoleLogPrefix() + `${logSymbols.success} ${messageAndOptions.success}`); else - console.log(getConsoleLogPrefix() + `${logSymbols.success} ${messageAndOptions}`); + console.warn(getConsoleLogPrefix() + `${logSymbols.success} ${messageAndOptions}`); return res; } catch (er) { if (typeof messageAndOptions !== "string") - console.log(getConsoleLogPrefix() + `${logSymbols.error} ${messageAndOptions.fail}`); + console.warn(getConsoleLogPrefix() + `${logSymbols.error} ${messageAndOptions.fail}`); else - console.log(getConsoleLogPrefix() + `${logSymbols.error} ${messageAndOptions}`); + console.warn(getConsoleLogPrefix() + `${logSymbols.error} ${messageAndOptions}`); throw er; } diff --git a/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts b/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts index c1e427c2..c014c79c 100644 --- a/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts +++ b/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts @@ -7,10 +7,11 @@ import {defaultLlamaVramPadding} from "../../../src/bindings/getLlama.js"; import {BuildGpu} from "../../../src/bindings/types.js"; describe("functionary", () => { - describe("model options", () => { - describe("Resolve the correct number of GPU layers", async () => { + describe("model options", async () => { + const llama = await getTestLlama(); + + describe.skipIf(llama.gpu === false)("Resolve the correct number of GPU layers", async () => { const modelPath = await getModelFile("functionary-small-v2.5.Q4_0.gguf"); - const llama = await getTestLlama(); const fileInfo = await readGgufFileInfo(modelPath); const ggufInsights = await GgufInsights.from(fileInfo, llama); @@ -28,7 +29,10 @@ 
describe("functionary", () => { totalSwap?: number, freeSwap?: number, ignoreMemorySafetyChecks?: boolean, llamaGpu?: BuildGpu }) { - const resolvedGpuLayers = await ggufInsights.configurationResolver.resolveModelGpuLayers(gpuLayers, { + const { + gpuLayers: resolvedGpuLayers, + useMmap: resolvedUseMmap + } = await ggufInsights.configurationResolver.resolveModelGpuLayersV2(gpuLayers, { ignoreMemorySafetyChecks, getVramState: async () => ({ total: llamaGpu === false ? 0 : totalVram, @@ -37,13 +41,13 @@ describe("functionary", () => { llamaVramPaddingSize: defaultLlamaVramPadding(llamaGpu === false ? 0 : totalVram), llamaGpu, llamaSupportsGpuOffloading: llamaGpu !== false, - useMmap: true + useMmap: "auto" }); async function resolveAutoContextSize() { const resolvedConfig = await ggufInsights.configurationResolver.resolveAndScoreConfig({ targetGpuLayers: resolvedGpuLayers, - useMmap: true + useMmap: resolvedUseMmap }, { llamaGpu, getVramState: async () => ({ @@ -71,7 +75,8 @@ describe("functionary", () => { return { gpuLayers: resolvedGpuLayers, - contextSize: await resolveAutoContextSize() + contextSize: await resolveAutoContextSize(), + useMmap: resolvedUseMmap }; } @@ -84,6 +89,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers(0, { @@ -92,6 +98,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { @@ -102,6 +109,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); @@ -114,7 +122,8 @@ describe("functionary", () => { freeRam: s1GB * 6 }); expect(res.gpuLayers).to.eql(0); - 
expect(res.contextSize).to.toMatchInlineSnapshot("7680"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers(0, { @@ -125,6 +134,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { @@ -137,6 +147,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); @@ -152,6 +163,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("2048"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers(0, { @@ -163,7 +175,8 @@ describe("functionary", () => { freeSwap: s1GB * 1 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2048"); + expect(res.contextSize).to.toMatchInlineSnapshot("1024"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { @@ -178,6 +191,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); }); @@ -191,6 +205,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(16); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers(16, { @@ -233,6 +248,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(16, { @@ -243,6 +259,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); 
expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); @@ -255,7 +272,8 @@ describe("functionary", () => { freeRam: s1GB * 4.5 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("4608"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers(16, { @@ -307,6 +325,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(16, { @@ -318,7 +337,8 @@ describe("functionary", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7680"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); @@ -333,6 +353,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(16); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(16, { @@ -343,7 +364,8 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 7.3 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("1792"); + expect(res.contextSize).to.toMatchInlineSnapshot("6144"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(16, { @@ -354,7 +376,8 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 5.3 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("5632"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers(16, { @@ -410,6 +433,7 @@ describe("functionary", () => { }); 
expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("4352"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(16, { @@ -422,7 +446,8 @@ describe("functionary", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2048"); + expect(res.contextSize).to.toMatchInlineSnapshot("1024"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); @@ -438,6 +463,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(16); expect(res.contextSize).to.toMatchInlineSnapshot("2048"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers(16, { @@ -497,6 +523,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("2048"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(16, { @@ -511,6 +538,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); }); @@ -524,6 +552,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(32); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers(32, { @@ -542,6 +571,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(32); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { @@ -552,6 +582,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(32, { @@ -562,6 +593,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); 
expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); @@ -575,6 +607,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(32); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers(32, { @@ -597,6 +630,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(32); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { @@ -608,7 +642,8 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2048"); + expect(res.contextSize).to.toMatchInlineSnapshot("1024"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(32, { @@ -619,7 +654,8 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("1024"); + expect(res.contextSize).to.toMatchInlineSnapshot("397"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(32, { @@ -632,6 +668,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); }); @@ -664,6 +701,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(33); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { @@ -674,6 +712,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(33, { @@ -684,6 +723,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); 
expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); @@ -697,6 +737,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(33); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(33, { @@ -707,6 +748,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(33); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(33, { @@ -717,6 +759,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(33); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers(33, { @@ -739,6 +782,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(33); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { @@ -751,6 +795,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(33, { @@ -761,7 +806,8 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("1024"); + expect(res.contextSize).to.toMatchInlineSnapshot("397"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(33, { @@ -772,7 +818,8 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2048"); + expect(res.contextSize).to.toMatchInlineSnapshot("1024"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(33, { @@ -783,7 +830,8 @@ 
describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7680"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(33, { @@ -795,7 +843,8 @@ describe("functionary", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7680"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); @@ -809,7 +858,8 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("6144"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(33, { @@ -820,7 +870,8 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("3072"); + expect(res.contextSize).to.toMatchInlineSnapshot("6144"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(33, { @@ -831,7 +882,8 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("1280"); + expect(res.contextSize).to.toMatchInlineSnapshot("3584"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers(33, { @@ -856,6 +908,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(33); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } }); }); @@ -899,6 +952,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(33); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + 
expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("max", { @@ -908,7 +962,8 @@ describe("functionary", () => { freeRam: s1GB * 1 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("501"); + expect(res.contextSize).to.toMatchInlineSnapshot("512"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("max", { @@ -918,12 +973,13 @@ describe("functionary", () => { freeRam: s1GB * 1 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("1024"); + expect(res.contextSize).to.toMatchInlineSnapshot("1280"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); describe('attempts to resolve "auto"', () => { - test("8GB RAM", async () => { + test("8GB RAM", {timeout: 1000 * 60 * 2}, async () => { { const res = await resolveGpuLayers("auto", { totalVram: s1GB * 6, @@ -933,6 +989,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -943,6 +1000,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -953,6 +1011,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -963,6 +1022,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { 
@@ -973,6 +1033,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("11"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -981,8 +1042,9 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("14"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("15"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -991,8 +1053,9 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("15"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers("auto", { @@ -1001,8 +1064,9 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers("auto", { @@ -1011,8 +1075,9 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers("auto", { @@ -1021,8 +1086,9 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("22"); 
expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers("auto", { @@ -1031,8 +1097,9 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("22"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("24"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1041,8 +1108,9 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("24"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); + expect(res.contextSize).to.toMatchInlineSnapshot("1280"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1051,8 +1119,9 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("25"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); + expect(res.contextSize).to.toMatchInlineSnapshot("3840"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1062,7 +1131,8 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("4608"); + expect(res.contextSize).to.toMatchInlineSnapshot("7168"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1072,7 +1142,8 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7936"); + 
expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1083,10 +1154,11 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); - test("5GB RAM", async () => { + test("5GB RAM", {timeout: 1000 * 60 * 2}, async () => { { const res = await resolveGpuLayers("auto", { totalVram: s1GB * 6, @@ -1095,7 +1167,8 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2048"); + expect(res.contextSize).to.toMatchInlineSnapshot("1024"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1105,7 +1178,8 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); - expect(res.contextSize).to.toMatchInlineSnapshot("2048"); + expect(res.contextSize).to.toMatchInlineSnapshot("1024"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1115,7 +1189,8 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); - expect(res.contextSize).to.toMatchInlineSnapshot("5120"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1126,6 +1201,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1136,6 +1212,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("11"); 
expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1144,8 +1221,9 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("14"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("15"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1154,8 +1232,9 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("15"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers("auto", { @@ -1164,8 +1243,9 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers("auto", { @@ -1174,8 +1254,9 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers("auto", { @@ -1184,8 +1265,9 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("22"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await 
resolveGpuLayers("auto", { @@ -1194,8 +1276,9 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("22"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("24"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1204,8 +1287,9 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("24"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); + expect(res.contextSize).to.toMatchInlineSnapshot("1280"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1214,8 +1298,9 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("25"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); + expect(res.contextSize).to.toMatchInlineSnapshot("3840"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1225,7 +1310,8 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("4608"); + expect(res.contextSize).to.toMatchInlineSnapshot("7168"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1235,7 +1321,8 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7936"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1246,12 +1333,13 @@ 
describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); }); describe("attempts to resolve {min?: number, max?: number}", () => { - test("8GB RAM", async () => { + test("8GB RAM", {timeout: 1000 * 60 * 2}, async () => { { const res = await resolveGpuLayers({max: 4}, { totalVram: s1GB * 6, @@ -1261,6 +1349,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers({min: 0, max: 4}, { @@ -1271,6 +1360,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers({min: 2}, { @@ -1304,6 +1394,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(16); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers({min: 16}, { @@ -1324,8 +1415,9 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.be.gte(16); - expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("22"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers({min: 16, max: 24}, { @@ -1336,8 +1428,9 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); - expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("22"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await 
resolveGpuLayers({min: 16, max: 24}, { @@ -1349,11 +1442,12 @@ describe("functionary", () => { expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); - expect(res.contextSize).to.toMatchInlineSnapshot("4608"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); - test("5GB RAM", async () => { + test("5GB RAM", {timeout: 1000 * 60 * 2}, async () => { { const res = await resolveGpuLayers({max: 4}, { totalVram: s1GB * 6, @@ -1362,7 +1456,8 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2048"); + expect(res.contextSize).to.toMatchInlineSnapshot("1024"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers({min: 0, max: 4}, { @@ -1372,7 +1467,8 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2048"); + expect(res.contextSize).to.toMatchInlineSnapshot("1024"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers({min: 2}, { @@ -1406,6 +1502,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(16); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers({min: 16}, { @@ -1426,8 +1523,9 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.be.gte(16); - expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("22"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers({min: 16, max: 24}, { @@ -1438,8 +1536,9 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.be.gte(16); 
expect(res.gpuLayers).to.be.lte(24); - expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("22"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers({min: 16, max: 24}, { @@ -1451,13 +1550,14 @@ describe("functionary", () => { expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); - expect(res.contextSize).to.toMatchInlineSnapshot("4608"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); }); describe("attempts to resolve {fitContext?: {contextSize?: number}}", () => { - test("8GB RAM", async () => { + test("8GB RAM", {timeout: 1000 * 60 * 2}, async () => { { const contextSize = 4096; const res = await resolveGpuLayers({fitContext: {contextSize}}, { @@ -1469,6 +1569,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1479,8 +1580,9 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); - expect(res.contextSize).to.toMatchInlineSnapshot("6912"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("24"); + expect(res.contextSize).to.toMatchInlineSnapshot("7936"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1491,8 +1593,9 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("9"); + expect(res.contextSize).to.toMatchInlineSnapshot("7424"); + 
expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1503,8 +1606,9 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1517,6 +1621,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1529,6 +1634,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1546,7 +1652,7 @@ describe("functionary", () => { } }); - test("7GB RAM", async () => { + test("7GB RAM", {timeout: 1000 * 60 * 2}, async () => { { const contextSize = 4096; const res = await resolveGpuLayers({fitContext: {contextSize}}, { @@ -1558,6 +1664,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1568,8 +1675,9 @@ describe("functionary", () => { totalRam: s1GB * 7, freeRam: s1GB * 7 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); - expect(res.contextSize).to.toMatchInlineSnapshot("6912"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("24"); + expect(res.contextSize).to.toMatchInlineSnapshot("7936"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { @@ 
-1580,8 +1688,9 @@ describe("functionary", () => { totalRam: s1GB * 7, freeRam: s1GB * 7 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("9"); + expect(res.contextSize).to.toMatchInlineSnapshot("7424"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1592,8 +1701,9 @@ describe("functionary", () => { totalRam: s1GB * 7, freeRam: s1GB * 7 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1606,6 +1716,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1618,6 +1729,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { diff --git a/test/modelDependent/functionary/gguf/__snapshots__/ggufParser.test.ts.snap b/test/modelDependent/functionary/gguf/__snapshots__/ggufParser.test.ts.snap index c69768c2..5b57f7a2 100644 --- a/test/modelDependent/functionary/gguf/__snapshots__/ggufParser.test.ts.snap +++ b/test/modelDependent/functionary/gguf/__snapshots__/ggufParser.test.ts.snap @@ -63,6 +63,7 @@ exports[`gguf > parser > should fetch GGUF metadata 1`] = ` "offset": 328548352, }, ], + "infoEndOffset": 7836512, "metadata": { "general": { "architecture": "llama", @@ -156,6 +157,23 @@ exports[`gguf > parser > should fetch GGUF metadata 1`] = ` }, }, 
"metadataSize": 7819208, + "source": { + "path": { + "_type": "path", + "path": "/functionary-small-v2.5.Q4_0.gguf", + }, + "type": "path", + }, + "sourceData": [ + { + "length": 7836512, + "path": { + "_type": "path", + "path": "/functionary-small-v2.5.Q4_0.gguf", + }, + "type": "path", + }, + ], "splicedParts": 1, "tensorCount": 291, "tensorInfo": [ @@ -274,6 +292,7 @@ exports[`gguf > parser > should parse local gguf model 1`] = ` "offset": 328548352, }, ], + "infoEndOffset": 7836512, "metadata": { "general": { "architecture": "llama", @@ -367,6 +386,16 @@ exports[`gguf > parser > should parse local gguf model 1`] = ` }, }, "metadataSize": 7819208, + "sourceData": [ + { + "length": 7836512, + "path": { + "_type": "path", + "path": "/functionary-small-v2.5.Q4_0.gguf", + }, + "type": "path", + }, + ], "splicedParts": 1, "tensorCount": 291, "tensorInfo": [ diff --git a/test/modelDependent/functionary/gguf/ggufInsights.test.ts b/test/modelDependent/functionary/gguf/ggufInsights.test.ts index fcd56a3a..adf8a1b8 100644 --- a/test/modelDependent/functionary/gguf/ggufInsights.test.ts +++ b/test/modelDependent/functionary/gguf/ggufInsights.test.ts @@ -25,48 +25,48 @@ describe("gguf", async () => { expect(ggufInsights.modelSize).toMatchInlineSnapshot("4653375488"); }); - test("estimated model memory footprint stays the same", async () => { + test("estimated model memory footprint stays the same", {timeout: 1000 * 60 * 5}, async () => { const llama = await getTestLlama(); const ggufMetadataParseResult = await readGgufFileInfo(modelPath); const ggufInsights = await GgufInsights.from(ggufMetadataParseResult, llama); - expect(makeEstimationReadable(ggufInsights.estimateModelResourceRequirements({gpuLayers: 0}))).toMatchInlineSnapshot(` + expect(makeEstimationReadable(await ggufInsights.estimateModelResourceRequirementsV2({gpuLayers: 0}))).toMatchInlineSnapshot(` { "cpuRam": "4.33GB", "gpuVram": "0B", } `); - 
expect(makeEstimationReadable(ggufInsights.estimateModelResourceRequirements({gpuLayers: 1}))).toMatchInlineSnapshot(` + expect(makeEstimationReadable(await ggufInsights.estimateModelResourceRequirementsV2({gpuLayers: 1}))).toMatchInlineSnapshot(` { - "cpuRam": "4.22GB", - "gpuVram": "528.01MB", + "cpuRam": "3.93GB", + "gpuVram": "442.52MB", } `); - expect(makeEstimationReadable(ggufInsights.estimateModelResourceRequirements({gpuLayers: 8}))).toMatchInlineSnapshot(` + expect(makeEstimationReadable(await ggufInsights.estimateModelResourceRequirementsV2({gpuLayers: 8}))).toMatchInlineSnapshot(` { - "cpuRam": "3.42GB", - "gpuVram": "1.32GB", + "cpuRam": "3.13GB", + "gpuVram": "1.2GB", } `); - expect(makeEstimationReadable(ggufInsights.estimateModelResourceRequirements({gpuLayers: 16}))).toMatchInlineSnapshot(` + expect(makeEstimationReadable(await ggufInsights.estimateModelResourceRequirementsV2({gpuLayers: 16}))).toMatchInlineSnapshot(` { - "cpuRam": "2.51GB", - "gpuVram": "2.34GB", + "cpuRam": "2.22GB", + "gpuVram": "2.23GB", } `); - expect(makeEstimationReadable(ggufInsights.estimateModelResourceRequirements({gpuLayers: 24}))).toMatchInlineSnapshot(` + expect(makeEstimationReadable(await ggufInsights.estimateModelResourceRequirementsV2({gpuLayers: 24}))).toMatchInlineSnapshot(` { - "cpuRam": "1.59GB", - "gpuVram": "3.14GB", + "cpuRam": "1.3GB", + "gpuVram": "3.03GB", } `); - expect(makeEstimationReadable(ggufInsights.estimateModelResourceRequirements({gpuLayers: 32}))).toMatchInlineSnapshot(` + expect(makeEstimationReadable(await ggufInsights.estimateModelResourceRequirementsV2({gpuLayers: 32}))).toMatchInlineSnapshot(` { - "cpuRam": "692.8MB", - "gpuVram": "4.06GB", + "cpuRam": "398.84MB", + "gpuVram": "3.94GB", } `); - expect(makeEstimationReadable(ggufInsights.estimateModelResourceRequirements({gpuLayers: 33}))).toMatchInlineSnapshot(` + expect(makeEstimationReadable(await ggufInsights.estimateModelResourceRequirementsV2({gpuLayers: 
33}))).toMatchInlineSnapshot(` { "cpuRam": "281.81MB", "gpuVram": "4.06GB", @@ -95,7 +95,7 @@ describe("gguf", async () => { const s330MB = 330 * Math.pow(1024, 2); const s5MB = 5 * Math.pow(1024, 2); - const estimatedModelResourceUsage = ggufInsights.estimateModelResourceRequirements({ + const estimatedModelResourceUsage = await ggufInsights.estimateModelResourceRequirementsV2({ gpuLayers: ggufInsights.totalLayers }); expect(toBytes(estimatedModelResourceUsage.gpuVram)).toMatchInlineSnapshot('"4.06GB"'); @@ -149,7 +149,7 @@ describe("gguf", async () => { const vramUsageDiff = currentVramUsage - initialVramUsage; const s200MB = 200 * Math.pow(1024, 2); - const calculatedVramUsage = ggufInsights.estimateModelResourceRequirements({gpuLayers: 16}).gpuVram; + const calculatedVramUsage = (await ggufInsights.estimateModelResourceRequirementsV2({gpuLayers: 16})).gpuVram; expect(Math.abs(vramUsageDiff - calculatedVramUsage)).to.be.lte(s200MB); @@ -168,7 +168,7 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "1.75GB", + "cpuRam": "1.76GB", "gpuVram": "0B", } `); @@ -213,8 +213,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "1.74GB", - "gpuVram": "266.78MB", + "cpuRam": "1.75GB", + "gpuVram": "267.03MB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -259,7 +259,7 @@ describe("gguf", async () => { }))).toMatchInlineSnapshot(` { "cpuRam": "1.03GB", - "gpuVram": "990.98MB", + "gpuVram": "994.98MB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -304,7 +304,7 @@ describe("gguf", async () => { }))).toMatchInlineSnapshot(` { "cpuRam": "282.5MB", - "gpuVram": "1.72GB", + "gpuVram": "1.73GB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -349,7 +349,7 @@ describe("gguf", async () => { }))).toMatchInlineSnapshot(` { "cpuRam": "250.5MB", - "gpuVram": "1.75GB", + 
"gpuVram": "1.76GB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ diff --git a/test/modelDependent/gemma4-e2b/functions.test.ts b/test/modelDependent/gemma4-e2b/functions.test.ts new file mode 100644 index 00000000..1a17245d --- /dev/null +++ b/test/modelDependent/gemma4-e2b/functions.test.ts @@ -0,0 +1,60 @@ +import {describe, expect, test} from "vitest"; +import {defineChatSessionFunction, Gemma4ChatWrapper, LlamaChatSession} from "../../../src/index.js"; +import {getModelFile} from "../../utils/modelFiles.js"; +import {getTestLlama} from "../../utils/getTestLlama.js"; + +describe("gemma4 e2b", () => { + describe("functions", () => { + test("auto-resolves Gemma4ChatWrapper and can call a function", {timeout: 1000 * 60 * 60 * 2}, async () => { + const modelPath = await getModelFile("gemma-4-E2B-it-Q4_K_M.gguf"); + const llama = await getTestLlama(); + + const model = await llama.loadModel({ + modelPath + }); + const context = await model.createContext({ + contextSize: 2048 + }); + const chatSession = new LlamaChatSession({ + contextSequence: context.getSequence() + }); + + expect(chatSession.chatWrapper).toBeInstanceOf(Gemma4ChatWrapper); + + let functionCallCount = 0; + const promptOptions: Parameters<typeof chatSession.prompt>[1] = { + maxTokens: 200, + functions: { + getNthWord: defineChatSessionFunction({ + description: "Get an n-th word", + params: { + type: "object", + properties: { + n: { + enum: [1, 2, 3, 4] + } + }, + required: ["n"] + }, + handler(params) { + functionCallCount++; + return ["very", "secret", "this", "hello"][params.n - 1]; + } + }) + } + }; + const response = await chatSession.prompt("What is the second word?", promptOptions); + + expect(functionCallCount).toBeGreaterThan(0); + expect(functionCallCount).toBeLessThanOrEqual(2); + expect(response.toLowerCase()).toContain("secret"); + + const followUpResponse = await chatSession.prompt("Explain what this word means in one short sentence.", { + ...promptOptions, + maxTokens: 60
+ }); + + expect(followUpResponse.length).toBeGreaterThan(10); + }); + }); +}); diff --git a/test/modelDependent/llama3.1/controlledEvaluate.test.ts b/test/modelDependent/llama3.1/controlledEvaluate.test.ts index fbaff03b..b3e917b5 100644 --- a/test/modelDependent/llama3.1/controlledEvaluate.test.ts +++ b/test/modelDependent/llama3.1/controlledEvaluate.test.ts @@ -75,11 +75,11 @@ describe("llama 3.1", () => { item.next.probabilities = new Map( [...item.next.probabilities.entries()] .slice(0, 10) - .map(([token, probability]) => [token, parseFloat(probability.toFixed(7))]) + .map(([token, probability]) => [token, simplifyFloat(probability)]) ); if (item.next?.confidence != null) - item.next.confidence = parseFloat(item.next.confidence.toFixed(7)); + item.next.confidence = simplifyFloat(item.next.confidence); return item; }); @@ -96,73 +96,73 @@ describe("llama 3.1", () => { { "next": { "probabilities": Map { - 35308 => 0.5205752, - 27096 => 0.2434221, - 11 => 0.0222422, - 198 => 0.0119651, - 863 => 0.0083929, - 374 => 0.0083748, - 1131 => 0.0068622, - 25 => 0.0062526, - 7940 => 0.0053943, - 1 => 0.0051856, + 35308 => 0.522, + 27096 => 0.243, + 11 => 0.0221, + 198 => 0.012, + 374 => 0.00837, + 863 => 0.00836, + 1131 => 0.00682, + 25 => 0.00624, + 7940 => 0.00539, + 13 => 0.00517, }, }, }, { "next": { "probabilities": Map { - 927 => 0.9811716, - 198 => 0.003379, - 6288 => 0.0032698, - 279 => 0.0006585, - 1633 => 0.0003187, - 1035 => 0.0003126, - 13 => 0.0002916, - 264 => 0.0002902, - 297 => 0.0002849, - 720 => 0.0002489, + 927 => 0.981, + 198 => 0.00338, + 6288 => 0.00328, + 279 => 0.000653, + 1633 => 0.00032, + 1035 => 0.000312, + 13 => 0.000291, + 264 => 0.000289, + 297 => 0.000283, + 720 => 0.00025, }, "token": 927, }, }, { "next": { - "confidence": 0.9307394, + "confidence": 0.931, "token": 279, }, }, { "next": { - "confidence": 0.9596596, + "confidence": 0.96, "probabilities": Map { - 16053 => 0.9596596, - 1208 => 0.0047719, - 198 => 0.0031805, - 5679 => 
0.0029246, - 65536 => 0.0019735, - 6435 => 0.000917, - 2697 => 0.0006723, - 720 => 0.0005984, - 21811 => 0.0005529, - 45363 => 0.0005513, + 16053 => 0.96, + 1208 => 0.00473, + 198 => 0.00318, + 5679 => 0.0029, + 65536 => 0.00197, + 6435 => 0.000912, + 2697 => 0.000666, + 720 => 0.000598, + 21811 => 0.000549, + 45363 => 0.000549, }, }, }, { "next": { - "confidence": 0.9871598, + "confidence": 0.987, "probabilities": Map { - 5679 => 0.9871598, - 21811 => 0.0014282, - 198 => 0.0009355, - 8415 => 0.0007248, - 12875 => 0.0003796, - 4194 => 0.0003463, - 720 => 0.0002809, - 14588 => 0.0002761, - 9522 => 0.0002418, - 627 => 0.0002038, + 5679 => 0.987, + 21811 => 0.00143, + 198 => 0.000937, + 8415 => 0.000724, + 12875 => 0.00038, + 4194 => 0.000344, + 720 => 0.000282, + 14588 => 0.000276, + 9522 => 0.000241, + 627 => 0.000204, }, "token": 5679, }, @@ -172,3 +172,11 @@ describe("llama 3.1", () => { }); }); }); + +function simplifyFloat(value: number) { + if (value === 0) + return 0; + + const step = 10 ** (Math.floor(Math.log10(Math.abs(value))) - 2); + return Number.parseFloat((Math.round(value / step) * step).toPrecision(12)); +} diff --git a/test/modelDependent/llama3.1/evaluateWithMetadata.test.ts b/test/modelDependent/llama3.1/evaluateWithMetadata.test.ts index 44305d77..ec703f04 100644 --- a/test/modelDependent/llama3.1/evaluateWithMetadata.test.ts +++ b/test/modelDependent/llama3.1/evaluateWithMetadata.test.ts @@ -98,151 +98,151 @@ describe("llama 3.1", () => { [ { "probabilities": Map { - 578 => 0.4305387, - 1115 => 0.130273, - 1102 => 0.0517783, - 763 => 0.0429566, - 1283 => 0.0294619, - 2100 => 0.0294103, - 15636 => 0.0263193, - 2030 => 0.0218532, - 320 => 0.0168992, - 1628 => 0.011877, + 578 => 0.43, + 1115 => 0.13, + 1102 => 0.0516, + 763 => 0.0429, + 2100 => 0.0294, + 1283 => 0.0294, + 15636 => 0.0263, + 2030 => 0.0219, + 320 => 0.0169, + 1628 => 0.0119, }, "token": 578, }, { "probabilities": Map { - 16053 => 0.4229744, - 4062 => 0.303401, - 39935 => 0.0602281, 
- 2944 => 0.0372685, - 5679 => 0.0237816, - 11914 => 0.0162851, - 2144 => 0.0146596, - 1121 => 0.0069732, - 17571 => 0.0057899, - 3446 => 0.0049125, + 16053 => 0.422, + 4062 => 0.304, + 39935 => 0.0603, + 2944 => 0.0376, + 5679 => 0.0237, + 11914 => 0.0161, + 2144 => 0.0147, + 1121 => 0.00699, + 17571 => 0.00574, + 3446 => 0.00492, }, "token": 16053, }, { "probabilities": Map { - 5679 => 0.9981223, - 12875 => 0.0001592, - 18964 => 0.0001154, - 39935 => 0.0001146, - 13 => 0.0001047, - 627 => 0.0000926, + 5679 => 0.998, + 12875 => 0.000159, + 18964 => 0.000115, + 39935 => 0.000115, + 13 => 0.000105, + 627 => 0.0000928, 656 => 0.0000625, 893 => 0.0000563, 198 => 0.0000522, - 374 => 0.0000518, + 374 => 0.0000521, }, "token": 5679, }, { "probabilities": Map { - 374 => 0.8128683, - 1587 => 0.0480889, - 596 => 0.0247298, - 1120 => 0.0222965, - 3250 => 0.0215258, - 706 => 0.0161501, - 15849 => 0.0086884, - 1053 => 0.0059099, - 55064 => 0.0037784, - 11 => 0.0036557, + 374 => 0.812, + 1587 => 0.0482, + 596 => 0.0247, + 1120 => 0.0224, + 3250 => 0.0216, + 706 => 0.0162, + 15849 => 0.00871, + 1053 => 0.00591, + 55064 => 0.00378, + 11 => 0.00368, }, "token": 374, }, { "probabilities": Map { - 2288 => 0.2759203, - 1120 => 0.166673, - 539 => 0.1576579, - 779 => 0.1335195, - 264 => 0.055744, - 1101 => 0.0292486, - 16053 => 0.0176843, - 5042 => 0.0158506, - 1193 => 0.0146031, - 2744 => 0.0140961, + 2288 => 0.276, + 1120 => 0.167, + 539 => 0.158, + 779 => 0.133, + 264 => 0.0558, + 1101 => 0.0293, + 16053 => 0.0178, + 5042 => 0.0159, + 1193 => 0.0146, + 2744 => 0.0141, }, "token": 2288, }, { "probabilities": Map { - 16053 => 0.9066879, - 13326 => 0.0635879, - 19781 => 0.0071462, - 17551 => 0.0020222, - 10968 => 0.0012692, - 11920 => 0.0011004, - 6435 => 0.0010057, - 34386 => 0.0007741, - 1208 => 0.0006092, - 25366 => 0.0005664, + 16053 => 0.907, + 13326 => 0.0635, + 19781 => 0.00713, + 17551 => 0.00202, + 10968 => 0.00126, + 11920 => 0.0011, + 6435 => 0.001, + 34386 => 0.000775, + 
1208 => 0.000609, + 25366 => 0.000566, }, "token": 16053, }, { "probabilities": Map { - 311 => 0.9882948, - 1524 => 0.0061879, - 11 => 0.002568, - 323 => 0.000522, - 13 => 0.0003525, - 627 => 0.0003204, - 1606 => 0.0002628, + 311 => 0.988, + 1524 => 0.00617, + 11 => 0.00258, + 323 => 0.000525, + 13 => 0.000354, + 627 => 0.000322, + 1606 => 0.000265, 2288 => 0.000258, - 369 => 0.0001243, - 320 => 0.0001019, + 369 => 0.000125, + 320 => 0.000102, }, "token": 311, }, { "probabilities": Map { - 2512 => 0.749257, - 1524 => 0.0991107, - 656 => 0.0322866, - 636 => 0.0240931, - 7940 => 0.014378, - 33586 => 0.0108598, - 387 => 0.0086719, - 1781 => 0.0058546, - 1629 => 0.0054801, - 3351 => 0.0051043, + 2512 => 0.75, + 1524 => 0.0987, + 656 => 0.0324, + 636 => 0.0241, + 7940 => 0.0144, + 33586 => 0.0109, + 387 => 0.00867, + 1781 => 0.00585, + 1629 => 0.00549, + 3351 => 0.00512, }, "token": 2512, }, { "probabilities": Map { - 922 => 0.9522551, - 1606 => 0.0149839, - 11 => 0.0139898, - 430 => 0.002966, - 627 => 0.0023101, - 13 => 0.0018821, - 1524 => 0.0018027, - 369 => 0.0017665, - 323 => 0.0009226, - 382 => 0.0008453, + 922 => 0.952, + 1606 => 0.015, + 11 => 0.014, + 430 => 0.00297, + 627 => 0.00232, + 13 => 0.00189, + 1524 => 0.0018, + 369 => 0.00177, + 323 => 0.000927, + 382 => 0.000848, }, "token": 922, }, { "probabilities": Map { - 279 => 0.6508359, - 4205 => 0.3128611, - 1148 => 0.0113738, - 1690 => 0.0044254, - 904 => 0.0030366, - 1202 => 0.0026803, - 264 => 0.0011148, - 1790 => 0.0010861, - 813 => 0.0010576, - 1524 => 0.0007703, + 279 => 0.652, + 4205 => 0.312, + 1148 => 0.0114, + 1690 => 0.00443, + 904 => 0.00304, + 1202 => 0.00267, + 264 => 0.00111, + 1790 => 0.00108, + 813 => 0.00105, + 1524 => 0.000764, }, "token": 279, }, @@ -280,43 +280,43 @@ describe("llama 3.1", () => { expect(res).toMatchInlineSnapshot(` [ { - "confidence": 0.4305387, + "confidence": 0.43, "token": 578, }, { - "confidence": 0.4229744, + "confidence": 0.422, "token": 16053, }, { - "confidence": 
0.9981223, + "confidence": 0.998, "token": 5679, }, { - "confidence": 0.8128683, + "confidence": 0.812, "token": 374, }, { - "confidence": 0.2759203, + "confidence": 0.276, "token": 2288, }, { - "confidence": 0.9066879, + "confidence": 0.907, "token": 16053, }, { - "confidence": 0.9882948, + "confidence": 0.988, "token": 311, }, { - "confidence": 0.749257, + "confidence": 0.75, "token": 2512, }, { - "confidence": 0.9522551, + "confidence": 0.952, "token": 922, }, { - "confidence": 0.6508359, + "confidence": 0.652, "token": 279, }, ] @@ -353,162 +353,162 @@ describe("llama 3.1", () => { expect(res).toMatchInlineSnapshot(` [ { - "confidence": 0.4305387, + "confidence": 0.43, "probabilities": Map { - 578 => 0.4305387, - 1115 => 0.130273, - 1102 => 0.0517783, - 763 => 0.0429566, - 1283 => 0.0294619, - 2100 => 0.0294103, - 15636 => 0.0263193, - 2030 => 0.0218532, - 320 => 0.0168992, - 1628 => 0.011877, + 578 => 0.43, + 1115 => 0.13, + 1102 => 0.0516, + 763 => 0.0429, + 2100 => 0.0294, + 1283 => 0.0294, + 15636 => 0.0263, + 2030 => 0.0219, + 320 => 0.0169, + 1628 => 0.0119, }, "token": 578, }, { - "confidence": 0.4229744, + "confidence": 0.422, "probabilities": Map { - 16053 => 0.4229744, - 4062 => 0.303401, - 39935 => 0.0602281, - 2944 => 0.0372685, - 5679 => 0.0237816, - 11914 => 0.0162851, - 2144 => 0.0146596, - 1121 => 0.0069732, - 17571 => 0.0057899, - 3446 => 0.0049125, + 16053 => 0.422, + 4062 => 0.304, + 39935 => 0.0603, + 2944 => 0.0376, + 5679 => 0.0237, + 11914 => 0.0161, + 2144 => 0.0147, + 1121 => 0.00699, + 17571 => 0.00574, + 3446 => 0.00492, }, "token": 16053, }, { - "confidence": 0.9981223, + "confidence": 0.998, "probabilities": Map { - 5679 => 0.9981223, - 12875 => 0.0001592, - 18964 => 0.0001154, - 39935 => 0.0001146, - 13 => 0.0001047, - 627 => 0.0000926, + 5679 => 0.998, + 12875 => 0.000159, + 18964 => 0.000115, + 39935 => 0.000115, + 13 => 0.000105, + 627 => 0.0000928, 656 => 0.0000625, 893 => 0.0000563, 198 => 0.0000522, - 374 => 0.0000518, + 374 
=> 0.0000521, }, "token": 5679, }, { - "confidence": 0.8128683, + "confidence": 0.812, "probabilities": Map { - 374 => 0.8128683, - 1587 => 0.0480889, - 596 => 0.0247298, - 1120 => 0.0222965, - 3250 => 0.0215258, - 706 => 0.0161501, - 15849 => 0.0086884, - 1053 => 0.0059099, - 55064 => 0.0037784, - 11 => 0.0036557, + 374 => 0.812, + 1587 => 0.0482, + 596 => 0.0247, + 1120 => 0.0224, + 3250 => 0.0216, + 706 => 0.0162, + 15849 => 0.00871, + 1053 => 0.00591, + 55064 => 0.00378, + 11 => 0.00368, }, "token": 374, }, { - "confidence": 0.2759203, + "confidence": 0.276, "probabilities": Map { - 2288 => 0.2759203, - 1120 => 0.166673, - 539 => 0.1576579, - 779 => 0.1335195, - 264 => 0.055744, - 1101 => 0.0292486, - 16053 => 0.0176843, - 5042 => 0.0158506, - 1193 => 0.0146031, - 2744 => 0.0140961, + 2288 => 0.276, + 1120 => 0.167, + 539 => 0.158, + 779 => 0.133, + 264 => 0.0558, + 1101 => 0.0293, + 16053 => 0.0178, + 5042 => 0.0159, + 1193 => 0.0146, + 2744 => 0.0141, }, "token": 2288, }, { - "confidence": 0.9066879, + "confidence": 0.907, "probabilities": Map { - 16053 => 0.9066879, - 13326 => 0.0635879, - 19781 => 0.0071462, - 17551 => 0.0020222, - 10968 => 0.0012692, - 11920 => 0.0011004, - 6435 => 0.0010057, - 34386 => 0.0007741, - 1208 => 0.0006092, - 25366 => 0.0005664, + 16053 => 0.907, + 13326 => 0.0635, + 19781 => 0.00713, + 17551 => 0.00202, + 10968 => 0.00126, + 11920 => 0.0011, + 6435 => 0.001, + 34386 => 0.000775, + 1208 => 0.000609, + 25366 => 0.000566, }, "token": 16053, }, { - "confidence": 0.9882948, + "confidence": 0.988, "probabilities": Map { - 311 => 0.9882948, - 1524 => 0.0061879, - 11 => 0.002568, - 323 => 0.000522, - 13 => 0.0003525, - 627 => 0.0003204, - 1606 => 0.0002628, + 311 => 0.988, + 1524 => 0.00617, + 11 => 0.00258, + 323 => 0.000525, + 13 => 0.000354, + 627 => 0.000322, + 1606 => 0.000265, 2288 => 0.000258, - 369 => 0.0001243, - 320 => 0.0001019, + 369 => 0.000125, + 320 => 0.000102, }, "token": 311, }, { - "confidence": 0.749257, + 
"confidence": 0.75, "probabilities": Map { - 2512 => 0.749257, - 1524 => 0.0991107, - 656 => 0.0322866, - 636 => 0.0240931, - 7940 => 0.014378, - 33586 => 0.0108598, - 387 => 0.0086719, - 1781 => 0.0058546, - 1629 => 0.0054801, - 3351 => 0.0051043, + 2512 => 0.75, + 1524 => 0.0987, + 656 => 0.0324, + 636 => 0.0241, + 7940 => 0.0144, + 33586 => 0.0109, + 387 => 0.00867, + 1781 => 0.00585, + 1629 => 0.00549, + 3351 => 0.00512, }, "token": 2512, }, { - "confidence": 0.9522551, + "confidence": 0.952, "probabilities": Map { - 922 => 0.9522551, - 1606 => 0.0149839, - 11 => 0.0139898, - 430 => 0.002966, - 627 => 0.0023101, - 13 => 0.0018821, - 1524 => 0.0018027, - 369 => 0.0017665, - 323 => 0.0009226, - 382 => 0.0008453, + 922 => 0.952, + 1606 => 0.015, + 11 => 0.014, + 430 => 0.00297, + 627 => 0.00232, + 13 => 0.00189, + 1524 => 0.0018, + 369 => 0.00177, + 323 => 0.000927, + 382 => 0.000848, }, "token": 922, }, { - "confidence": 0.6508359, + "confidence": 0.652, "probabilities": Map { - 279 => 0.6508359, - 4205 => 0.3128611, - 1148 => 0.0113738, - 1690 => 0.0044254, - 904 => 0.0030366, - 1202 => 0.0026803, - 264 => 0.0011148, - 1790 => 0.0010861, - 813 => 0.0010576, - 1524 => 0.0007703, + 279 => 0.652, + 4205 => 0.312, + 1148 => 0.0114, + 1690 => 0.00443, + 904 => 0.00304, + 1202 => 0.00267, + 264 => 0.00111, + 1790 => 0.00108, + 813 => 0.00105, + 1524 => 0.000764, }, "token": 279, }, @@ -568,10 +568,18 @@ function simplifyRes [token, parseFloat(probability.toFixed(7))]) + .map(([token, probability]) => [token, simplifyFloat(probability)]) ); if (item.confidence != null) - item.confidence = parseFloat(item.confidence.toFixed(7)); + item.confidence = simplifyFloat(item.confidence); } } + +function simplifyFloat(value: number) { + if (value === 0) + return 0; + + const step = 10 ** (Math.floor(Math.log10(Math.abs(value))) - 2); + return Number.parseFloat((Math.round(value / step) * step).toPrecision(12)); +} diff --git 
a/test/modelDependent/llama3.2/promptCompletion.test.ts b/test/modelDependent/llama3.2/promptCompletion.test.ts index 8c76051d..9df6b337 100644 --- a/test/modelDependent/llama3.2/promptCompletion.test.ts +++ b/test/modelDependent/llama3.2/promptCompletion.test.ts @@ -24,7 +24,7 @@ describe("llama 3.2", () => { chatWrapper: resolveChatWrapper(model, { customWrapperSettings: { "llama3.2-lightweight": { - todayDate: new Date("2025-01-01T00:00:00Z") + todayDate: new Date("2025-01-01T00:00:00") } } }) @@ -34,7 +34,7 @@ describe("llama 3.2", () => { chatWrapper: resolveChatWrapper(model, { customWrapperSettings: { "llama3.2-lightweight": { - todayDate: new Date("2025-01-01T00:00:00Z") + todayDate: new Date("2025-01-01T00:00:00") } } }) diff --git a/test/modelDependent/llama3.2/sequenceState.test.ts b/test/modelDependent/llama3.2/sequenceState.test.ts index c3e71ffe..a7b67b49 100644 --- a/test/modelDependent/llama3.2/sequenceState.test.ts +++ b/test/modelDependent/llama3.2/sequenceState.test.ts @@ -1,6 +1,6 @@ import {describe, expect, test} from "vitest"; import fs from "fs-extra"; -import {LlamaChatSession, TokenMeter} from "../../../src/index.js"; +import {LlamaChatSession, resolveChatWrapper, TokenMeter} from "../../../src/index.js"; import {getModelFile} from "../../utils/modelFiles.js"; import {getTestLlama} from "../../utils/getTestLlama.js"; import {getTempTestFilePath} from "../../utils/helpers/getTempTestDir.js"; @@ -8,6 +8,8 @@ import {toBytes} from "../../../src/cli/utils/toBytes.js"; describe("llama 3.2", () => { describe("chatSession", () => { + const todayDate = new Date("2026-04-28T00:00:00Z"); + describe("sequence state", () => { test("save and load a state works properly", {timeout: 1000 * 60 * 60 * 2}, async (test) => { const modelPath = await getModelFile("Llama-3.2-3B-Instruct.Q4_K_M.gguf"); @@ -24,10 +26,12 @@ describe("llama 3.2", () => { const contextSequence2 = context.getSequence(); const chatSession1 = new LlamaChatSession({ - contextSequence: 
contextSequence1 + contextSequence: contextSequence1, + chatWrapper: resolveChatWrapper(model, {customWrapperSettings: {"llama3.2-lightweight": {todayDate}}}) }); const chatSession2 = new LlamaChatSession({ - contextSequence: contextSequence2 + contextSequence: contextSequence2, + chatWrapper: resolveChatWrapper(model, {customWrapperSettings: {"llama3.2-lightweight": {todayDate}}}) }); const [ @@ -37,8 +41,8 @@ describe("llama 3.2", () => { chatSession1.prompt("Remember: locks are not doors", {maxTokens: 4}), chatSession2.prompt("Remember: giraffes are not elephants", {maxTokens: 5}) ]); - expect(res1).to.toMatchInlineSnapshot("\"That's a clever\""); - expect(res2).to.toMatchInlineSnapshot('"I appreciate the reminder."'); + expect(res1).to.toMatchInlineSnapshot("\"That's a common\""); + expect(res2).to.match(/I appreciate the reminder|I'll keep that in/); const stateFile1Path = await getTempTestFilePath("state1"); @@ -73,7 +77,8 @@ describe("llama 3.2", () => { `); const chatSession1_1 = new LlamaChatSession({ - contextSequence: contextSequence1 + contextSequence: contextSequence1, + chatWrapper: resolveChatWrapper(model, {customWrapperSettings: {"llama3.2-lightweight": {todayDate}}}) }); const res1_1 = await chatSession1_1.prompt("What's the exact thing I told you to remember?", {maxTokens: 10}); expect(res1_1).to.toMatchInlineSnapshot("\"You didn't tell me to remember anything. 
This\""); @@ -102,7 +107,8 @@ describe("llama 3.2", () => { `); const chatSession1_2 = new LlamaChatSession({ - contextSequence: contextSequence1 + contextSequence: contextSequence1, + chatWrapper: resolveChatWrapper(model, {customWrapperSettings: {"llama3.2-lightweight": {todayDate}}}) }); chatSession1_2.setChatHistory(chatSession1.getChatHistory()); const res1_2 = await chatSession1_2.prompt("What's the exact thing I told you to remember?", {maxTokens: 12}); @@ -136,11 +142,12 @@ describe("llama 3.2", () => { const contextSequence2 = context2.getSequence(); const chatSession1 = new LlamaChatSession({ - contextSequence: contextSequence1 + contextSequence: contextSequence1, + chatWrapper: resolveChatWrapper(model, {customWrapperSettings: {"llama3.2-lightweight": {todayDate}}}) }); const res1 = await chatSession1.prompt("Remember: locks are not doors", {maxTokens: 4}); - expect(res1).to.toMatchInlineSnapshot("\"That's a clever\""); + expect(res1).to.toMatchInlineSnapshot("\"That's a common\""); const stateFile1Path = await getTempTestFilePath("state1"); @@ -163,7 +170,8 @@ describe("llama 3.2", () => { const chatSession2 = new LlamaChatSession({ - contextSequence: contextSequence2 + contextSequence: contextSequence2, + chatWrapper: resolveChatWrapper(model, {customWrapperSettings: {"llama3.2-lightweight": {todayDate}}}) }); chatSession2.setChatHistory(chatSession1.getChatHistory()); await contextSequence2.loadStateFromFile(stateFile1Path, {acceptRisk: true}); @@ -198,11 +206,12 @@ describe("llama 3.2", () => { expect(context2.contextSize).to.eql(256); // the context is actually bigger due to `llama.cpp`'s padding const chatSession1 = new LlamaChatSession({ - contextSequence: contextSequence1 + contextSequence: contextSequence1, + chatWrapper: resolveChatWrapper(model, {customWrapperSettings: {"llama3.2-lightweight": {todayDate}}}) }); const res1 = await chatSession1.prompt("Remember: locks are not doors. 
Also, write a long poem about it", {maxTokens: 154}); - expect(res1).toMatch(/^(A clever reminder indeed.|A clever reminder, indeed.|A wise phrase to ponder|A wise phrase indeed)/); + expect(res1).toMatch(/^(A clever reminder indeed.|A clever reminder, indeed.|A wise phrase to ponder|A wise phrase indeed|A poetic reminder|A poetic task)/); const stateFile1Path = await getTempTestFilePath("state1"); @@ -225,7 +234,8 @@ describe("llama 3.2", () => { const chatSession2 = new LlamaChatSession({ - contextSequence: contextSequence2 + contextSequence: contextSequence2, + chatWrapper: resolveChatWrapper(model, {customWrapperSettings: {"llama3.2-lightweight": {todayDate}}}) }); chatSession2.setChatHistory(chatSession1.getChatHistory()); try { @@ -256,11 +266,12 @@ describe("llama 3.2", () => { const contextSequence2 = context2.getSequence(); const chatSession1 = new LlamaChatSession({ - contextSequence: contextSequence1 + contextSequence: contextSequence1, + chatWrapper: resolveChatWrapper(model, {customWrapperSettings: {"llama3.2-lightweight": {todayDate}}}) }); const res1 = await chatSession1.prompt("Remember: locks are not doors", {maxTokens: 4}); - expect(res1).to.toMatchInlineSnapshot("\"That's a clever\""); + expect(res1).to.toMatchInlineSnapshot("\"That's a common\""); const stateFile1Path = await getTempTestFilePath("state1"); @@ -283,7 +294,8 @@ describe("llama 3.2", () => { const chatSession2 = new LlamaChatSession({ - contextSequence: contextSequence2 + contextSequence: contextSequence2, + chatWrapper: resolveChatWrapper(model, {customWrapperSettings: {"llama3.2-lightweight": {todayDate}}}) }); chatSession2.setChatHistory(chatSession1.getChatHistory()); try { diff --git a/test/modelDependent/qwen3.5-0.8b/functions.test.ts b/test/modelDependent/qwen3.5-0.8b/functions.test.ts index ab6d8148..fae80858 100644 --- a/test/modelDependent/qwen3.5-0.8b/functions.test.ts +++ b/test/modelDependent/qwen3.5-0.8b/functions.test.ts @@ -29,6 +29,9 @@ describe("qwen3.5 0.8b", () 
=> { properties: { n: { enum: [1, 2, 3, 4] + }, + dummy: { + const: "This is a long text, I'll write it" } } }, @@ -48,12 +51,12 @@ describe("qwen3.5 0.8b", () => { The second word is "secret"." `); - expect(chatSession.sequence.tokenMeter.usedInputTokens).toMatchInlineSnapshot("372"); - expect(chatSession.sequence.lastCheckpointIndex).toMatchInlineSnapshot("393"); - expect(chatSession.sequence.nextTokenIndex).toMatchInlineSnapshot("405"); + expect(chatSession.sequence.tokenMeter.usedInputTokens).toMatchInlineSnapshot("389"); + expect(chatSession.sequence.lastCheckpointIndex).toMatchInlineSnapshot("425"); + expect(chatSession.sequence.nextTokenIndex).toMatchInlineSnapshot("437"); const initialMeterState = chatSession.sequence.tokenMeter.getState(); - const res2 = await chatSession.prompt("Explain what this word means", { + const res2 = await chatSession.prompt("Explain what this word means. short", { ...promptOptions, maxTokens: 15 }); @@ -62,12 +65,12 @@ describe("qwen3.5 0.8b", () => { expect(res2).to.toMatchInlineSnapshot(` " - The word "secret" means something that is hidden or" + "Secret" means something that is hidden or not known" `); - expect(diffMeterState.usedInputTokens).toMatchInlineSnapshot("73"); - expect(diffMeterState.usedInputTokens).to.be.lessThanOrEqual(80); - expect(chatSession.sequence.lastCheckpointIndex).toMatchInlineSnapshot("414"); - expect(chatSession.sequence.nextTokenIndex).toMatchInlineSnapshot("429"); + expect(diffMeterState.usedInputTokens).toMatchInlineSnapshot("90"); + expect(diffMeterState.usedInputTokens).to.be.lessThanOrEqual(95); + expect(chatSession.sequence.lastCheckpointIndex).toMatchInlineSnapshot("448"); + expect(chatSession.sequence.nextTokenIndex).toMatchInlineSnapshot("463"); }); }); }); diff --git a/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts b/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts index 6b88dc77..7abfdf29 100644 --- 
a/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts +++ b/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts @@ -7,10 +7,11 @@ import {BuildGpu} from "../../../src/bindings/types.js"; import {defaultLlamaVramPadding} from "../../../src/bindings/getLlama.js"; describe("stableCode", () => { - describe("model options", () => { - describe("Resolve the correct number of GPU layers", async () => { + describe("model options", async () => { + const llama = await getTestLlama(); + + describe.skipIf(llama.gpu === false)("Resolve the correct number of GPU layers", async (scope) => { const modelPath = await getModelFile("stable-code-3b-Q5_K_M.gguf"); - const llama = await getTestLlama(); const fileInfo = await readGgufFileInfo(modelPath); const ggufInsights = await GgufInsights.from(fileInfo, llama); @@ -28,7 +29,7 @@ describe("stableCode", () => { totalSwap?: number, freeSwap?: number, ignoreMemorySafetyChecks?: boolean, llamaGpu?: BuildGpu }) { - const resolvedGpuLayers = await ggufInsights.configurationResolver.resolveModelGpuLayers(gpuLayers, { + const {gpuLayers: resolvedGpuLayers, useMmap: resolvedUseMmap} = await ggufInsights.configurationResolver.resolveModelGpuLayersV2(gpuLayers, { ignoreMemorySafetyChecks, getVramState: async () => ({ total: llamaGpu === false ? 0 : totalVram, @@ -37,13 +38,13 @@ describe("stableCode", () => { llamaVramPaddingSize: defaultLlamaVramPadding(llamaGpu === false ? 
0 : totalVram), llamaGpu, llamaSupportsGpuOffloading: llamaGpu !== false, - useMmap: true + useMmap: "auto" }); async function resolveAutoContextSize() { const resolvedConfig = await ggufInsights.configurationResolver.resolveAndScoreConfig({ targetGpuLayers: resolvedGpuLayers, - useMmap: true + useMmap: resolvedUseMmap }, { llamaGpu, getVramState: async () => ({ @@ -71,7 +72,8 @@ describe("stableCode", () => { return { gpuLayers: resolvedGpuLayers, - contextSize: await resolveAutoContextSize() + contextSize: await resolveAutoContextSize(), + useMmap: resolvedUseMmap }; } @@ -83,6 +85,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers(0, { @@ -91,6 +94,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { @@ -101,6 +105,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); @@ -111,7 +116,8 @@ describe("stableCode", () => { freeVram: s1GB * 3 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("8960"); + expect(res.contextSize).to.toMatchInlineSnapshot("13824"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } try { await resolveGpuLayers(16, { @@ -142,7 +148,8 @@ describe("stableCode", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("179"); + expect(res.contextSize).to.toMatchInlineSnapshot("2816"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } @@ -154,6 +161,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(0); 
expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(16, { @@ -164,6 +172,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); @@ -174,7 +183,8 @@ describe("stableCode", () => { freeVram: s1GB * 6 }); expect(res.gpuLayers).to.eql(32); - expect(res.contextSize).to.toMatchInlineSnapshot("11520"); + expect(res.contextSize).to.toMatchInlineSnapshot("14080"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers(32, { @@ -193,6 +203,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(32); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { @@ -203,6 +214,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(32, { @@ -213,6 +225,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); @@ -223,7 +236,8 @@ describe("stableCode", () => { freeVram: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("11264"); + expect(res.contextSize).to.toMatchInlineSnapshot("13312"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers(33, { @@ -242,6 +256,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(33); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { @@ -252,6 +267,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(0); 
expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(33, { @@ -262,10 +278,11 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); - it('attempts to resolve "max"', async () => { + it('attempts to resolve "max"', {timeout: 1000 * 60 * 2}, async () => { try { await resolveGpuLayers("max", { totalVram: s1GB * 6, @@ -304,6 +321,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(33); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("max", { @@ -311,7 +329,8 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("5888"); + expect(res.contextSize).to.toMatchInlineSnapshot("6912"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("max", { @@ -319,7 +338,8 @@ describe("stableCode", () => { freeVram: s1GB * 4.4 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("6912"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("max", { @@ -327,11 +347,12 @@ describe("stableCode", () => { freeVram: s1GB * 4.8 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("7936"); + expect(res.contextSize).to.toMatchInlineSnapshot("9472"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); - it('attempts to resolve "auto"', async () => { + it('attempts to resolve "auto"', {timeout: 1000 * 60 * 2}, async () => { { const res = await resolveGpuLayers("auto", { totalVram: s1GB * 6, @@ -339,30 +360,34 @@ describe("stableCode", 
() => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { totalVram: s1GB * 6, freeVram: s1GB * 0.4 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("2"); - expect(res.contextSize).to.toMatchInlineSnapshot("14848"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("1"); + expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { totalVram: s1GB * 6, freeVram: s1GB * 0.8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("2"); - expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); + expect(res.contextSize).to.toMatchInlineSnapshot("15104"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers("auto", { totalVram: s1GB * 6, freeVram: s1GB * 1.4 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("6"); - expect(res.contextSize).to.toMatchInlineSnapshot("8960"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("10"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers("auto", { @@ -371,6 +396,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); expect(res.contextSize).to.toMatchInlineSnapshot("1536"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -378,7 +404,8 @@ describe("stableCode", () => { freeVram: s1GB * 3.1 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("3328"); + expect(res.contextSize).to.toMatchInlineSnapshot("3840"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -386,7 
+413,8 @@ describe("stableCode", () => { freeVram: s1GB * 3.3 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("3840"); + expect(res.contextSize).to.toMatchInlineSnapshot("4608"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -394,7 +422,8 @@ describe("stableCode", () => { freeVram: s1GB * 3.5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("4352"); + expect(res.contextSize).to.toMatchInlineSnapshot("5120"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -402,7 +431,8 @@ describe("stableCode", () => { freeVram: s1GB * 3.8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5376"); + expect(res.contextSize).to.toMatchInlineSnapshot("6144"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -410,7 +440,8 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5888"); + expect(res.contextSize).to.toMatchInlineSnapshot("6912"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -418,7 +449,8 @@ describe("stableCode", () => { freeVram: s1GB * 4.3 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("6656"); + expect(res.contextSize).to.toMatchInlineSnapshot("7936"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -426,7 +458,8 @@ describe("stableCode", () => { freeVram: s1GB * 4.5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7168"); + 
expect(res.contextSize).to.toMatchInlineSnapshot("8448"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -434,7 +467,8 @@ describe("stableCode", () => { freeVram: s1GB * 4.8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7936"); + expect(res.contextSize).to.toMatchInlineSnapshot("9472"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -442,7 +476,8 @@ describe("stableCode", () => { freeVram: s1GB * 5.2 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("9216"); + expect(res.contextSize).to.toMatchInlineSnapshot("10752"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -450,7 +485,8 @@ describe("stableCode", () => { freeVram: s1GB * 5.8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("10752"); + expect(res.contextSize).to.toMatchInlineSnapshot("12800"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -458,11 +494,12 @@ describe("stableCode", () => { freeVram: s1GB * 6 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("11264"); + expect(res.contextSize).to.toMatchInlineSnapshot("13312"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); - it("attempts to resolve {min?: number, max?: number}", async () => { + it("attempts to resolve {min?: number, max?: number}", {timeout: 1000 * 60 * 2}, async () => { { const res = await resolveGpuLayers({max: 4}, { totalVram: s1GB * 6, @@ -470,6 +507,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res 
= await resolveGpuLayers({min: 0, max: 4}, { @@ -478,6 +516,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers({min: 2}, { @@ -504,7 +543,8 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("14592"); + expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers({min: 16}, { @@ -522,7 +562,8 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5888"); + expect(res.contextSize).to.toMatchInlineSnapshot("6912"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers({min: 16, max: 24}, { @@ -532,7 +573,8 @@ describe("stableCode", () => { expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); expect(res.gpuLayers).to.toMatchInlineSnapshot("24"); - expect(res.contextSize).to.toMatchInlineSnapshot("8448"); + expect(res.contextSize).to.toMatchInlineSnapshot("9984"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers({min: 16, max: 24}, { @@ -541,12 +583,13 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); - expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); + expect(res.contextSize).to.toMatchInlineSnapshot("10752"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } }); - it("attempts to resolve {fitContext?: {contextSize?: number}}", async () => { + it("attempts to resolve {fitContext?: {contextSize?: number}}", 
{timeout: 1000 * 60 * 2}, async () => { { const contextSize = 4096; const res = await resolveGpuLayers({fitContext: {contextSize}}, { @@ -556,6 +599,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -565,7 +609,8 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5888"); + expect(res.contextSize).to.toMatchInlineSnapshot("6912"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -574,8 +619,9 @@ describe("stableCode", () => { totalVram: s1GB * 2, freeVram: s1GB * 1 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); - expect(res.contextSize).to.toMatchInlineSnapshot("6144"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("6"); + expect(res.contextSize).to.toMatchInlineSnapshot("11008"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -584,8 +630,9 @@ describe("stableCode", () => { totalVram: s1GB * 6, freeVram: s1GB * 4 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("22"); - expect(res.contextSize).to.toMatchInlineSnapshot("9216"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("24"); + expect(res.contextSize).to.toMatchInlineSnapshot("11776"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -595,7 +642,8 @@ describe("stableCode", () => { freeVram: s1GB * 1 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("3"); - expect(res.contextSize).to.toMatchInlineSnapshot("11264"); + expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { @@ 
-606,6 +654,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -627,6 +676,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } }); diff --git a/test/standalone/chatWrappers/Gemma4ChatWrapper.test.ts b/test/standalone/chatWrappers/Gemma4ChatWrapper.test.ts new file mode 100644 index 00000000..9aeb8e5f --- /dev/null +++ b/test/standalone/chatWrappers/Gemma4ChatWrapper.test.ts @@ -0,0 +1,362 @@ +import {describe, expect, test} from "vitest"; +import {ChatHistoryItem, defineChatSessionFunction, Gemma4ChatWrapper} from "../../../src/index.js"; +import {defaultChatSystemPrompt} from "../../../src/config.js"; + + +describe("Gemma4ChatWrapper", () => { + const conversationHistory: ChatHistoryItem[] = [{ + type: "system", + text: defaultChatSystemPrompt + }, { + type: "user", + text: "Hi there!" + }, { + type: "model", + response: [ + { + type: "segment", + segmentType: "thought", + text: "Let me think how to respond to this.", + ended: true + }, + "Hello!" + ] + }, { + type: "user", + text: "How are you?" + }, { + type: "model", + response: [ + { + type: "segment", + segmentType: "thought", + text: "Let me think how to answer", + ended: true + }, + { + type: "segment", + segmentType: "comment", + text: "This is a question about my state", + ended: true + }, + "I'm good, how are you?" 
+ ] + }]; + + const functions = { + getRandomNumber: defineChatSessionFunction({ + description: "Get a random number", + params: { + type: "object", + properties: { + min: { + type: "number" + }, + max: { + type: "number" + } + } + }, + async handler(params) { + return Math.floor(Math.random() * (params.max - params.min + 1) + params.min); + } + }), + notifyOwner: defineChatSessionFunction({ + description: "Send a notification to the owner, and create sub notifications", + params: { + $ref: "#/$defs/notification", + $defs: { + notification: { + type: "object", + properties: { + message: { + type: "string" + }, + subNotifications: { + type: "array", + items: { + $ref: "#/$defs/notification" + } + } + } + } + } + }, + handler(notification) { + return "Notification created: " + notification.message; + } + }), + notifyOwner2: defineChatSessionFunction({ + description: "Send a notification to the owner, and create sub notifications", + params: { + $ref: "#/$defs/notification", + $defs: { + notification: { + type: "object", + properties: { + message: { + type: "string", + description: "Notification message" + }, + subNotifications: { + type: "array", + description: "Sub notifications", + items: { + $ref: "#/$defs/notification" + } + } + } + } + } + }, + handler(notification) { + return "Notification created: " + notification.message; + } + }), + func1: defineChatSessionFunction({ + description: "Some function", + params: { + type: "object", + properties: { + message: { + type: "string", + description: "Some message", + minLength: 3, + maxLength: 10 + }, + words: { + type: "array", + description: "Some words", + items: { + type: "string" + }, + minItems: 2, + maxItems: 5 + }, + headers: { + type: "object", + description: "Some headers", + additionalProperties: { + type: "string" + }, + minProperties: 4, + maxProperties: 12 + }, + mappings: { + type: "object", + description: "Some mappings", + properties: { + a: { + type: "boolean" + }, + b: { + type: "number" + }, + c: { 
+ type: ["string", "null"] + } + }, + additionalProperties: { + type: "string" + }, + minProperties: 4, + maxProperties: 12 + } + } + }, + handler(params) { + + } + }) + }; + const conversationHistory2: ChatHistoryItem[] = [{ + type: "system", + text: defaultChatSystemPrompt + }, { + type: "user", + text: "Hi there!" + }, { + type: "model", + response: ["Hello!"] + }, { + type: "user", + text: "Role a dice twice and tell me the total result" + }, { + type: "model", + response: [ + { + type: "functionCall", + name: "getRandomNumber", + description: "Get a random number", + params: { + min: 1, + max: 6 + }, + result: 3 + }, + { + type: "functionCall", + name: "getRandomNumber", + description: "Get a random number", + params: { + min: 1, + max: 6 + }, + result: 4 + }, + "The total result of rolling the dice twice is 3 + 4 = 7." + ] + }]; + + test("should generate valid context text", () => { + const chatWrapper = new Gemma4ChatWrapper({keepOnlyLastThought: false}); + const {contextText} = chatWrapper.generateContextState({chatHistory: conversationHistory}); + + expect(contextText).toMatchInlineSnapshot(` + LlamaText([ + new SpecialToken("BOS"), + new SpecialTokensText("<|turn>system + <|think|>"), + "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible. + If a question does not make any sense, or is not factually coherent, explain why instead of answering something incorrectly. 
If you don't know the answer to a question, don't share false information.", + new SpecialTokensText(" + <|turn>user + "), + "Hi there!", + new SpecialTokensText(" + <|turn>model + <|channel>thought"), + "Let me think how to respond to this.", + new SpecialTokensText(""), + "Hello!", + new SpecialTokensText(" + <|turn>user + "), + "How are you?", + new SpecialTokensText(" + <|turn>model + <|channel>thought"), + "Let me think how to answer", + new SpecialTokensText(""), + "I'm good, how are you?", + ]) + `); + + const chatWrapper2 = new Gemma4ChatWrapper(); + const {contextText: contextText2} = chatWrapper2.generateContextState({ + chatHistory: conversationHistory2, + availableFunctions: functions + }); + + expect(contextText2).toMatchInlineSnapshot(` + LlamaText([ + new SpecialToken("BOS"), + new SpecialTokensText("<|turn>system + <|think|>"), + "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible. + If a question does not make any sense, or is not factually coherent, explain why instead of answering something incorrectly. 
If you don't know the answer to a question, don't share false information.", + new SpecialTokensText("<|tool>"), + "declaration:getRandomNumber{{"description": "Get a random number", "parameters": {"type": "object", "properties": {"min": {"type": "number"}, "max": {"type": "number"}}}}}", + new SpecialTokensText("<|tool>"), + "declaration:notifyOwner{{"description": "Send a notification to the owner, and create sub notifications", "parameters": {"$ref": "#/$defs/notification", "$defs": {"notification": {"type": "object", "properties": {"message": {"type": "string"}, "subNotifications": {"type": "array", "items": {"$ref": "#/$defs/notification"}}}}}}}}", + new SpecialTokensText("<|tool>"), + "declaration:notifyOwner2{{"description": "Send a notification to the owner, and create sub notifications", "parameters": {"$ref": "#/$defs/notification", "$defs": {"notification": {"type": "object", "properties": {"message": {"type": "string", "description": "Notification message"}, "subNotifications": {"type": "array", "description": "Sub notifications", "items": {"$ref": "#/$defs/notification"}}}}}}}}", + new SpecialTokensText("<|tool>"), + "declaration:func1{{"description": "Some function", "parameters": {"type": "object", "properties": {"message": {"type": "string", "description": "Some message", "minLength": 3, "maxLength": 10}, "words": {"type": "array", "description": "Some words", "items": {"type": "string"}, "minItems": 2, "maxItems": 5}, "headers": {"type": "object", "description": "Some headers", "additionalProperties": {"type": "string"}, "minProperties": 4, "maxProperties": 12}, "mappings": {"type": "object", "description": "Some mappings", "properties": {"a": {"type": "boolean"}, "b": {"type": "number"}, "c": {"type": ["string", "null"]}}, "additionalProperties": {"type": "string"}, "minProperties": 4, "maxProperties": 12}}}}}", + new SpecialTokensText(" + <|turn>user + "), + "Hi there!", + new SpecialTokensText(" + <|turn>model + "), + "Hello!", + new 
SpecialTokensText(" + <|turn>user + "), + "Role a dice twice and tell me the total result", + new SpecialTokensText(" + <|turn>model + <|tool_call>call:"), + "getRandomNumber{{"min": 1, "max": 6}", + new SpecialTokensText("}response:"), + "getRandomNumber{3", + new SpecialTokensText("}<|tool_call>call:"), + "getRandomNumber{{"min": 1, "max": 6}", + new SpecialTokensText("}response:"), + "getRandomNumber{4", + new SpecialTokensText("}"), + "The total result of rolling the dice twice is 3 + 4 = 7.", + ]) + `); + + const chatWrapper3 = new Gemma4ChatWrapper(); + const {contextText: contextText3} = chatWrapper3.generateContextState({chatHistory: conversationHistory}); + const {contextText: contextText3WithOpenModelResponse} = chatWrapper3.generateContextState({ + chatHistory: [ + ...conversationHistory, + { + type: "model", + response: [] + } + ] + }); + + expect(contextText3).toMatchInlineSnapshot(` + LlamaText([ + new SpecialToken("BOS"), + new SpecialTokensText("<|turn>system + <|think|>"), + "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible. + If a question does not make any sense, or is not factually coherent, explain why instead of answering something incorrectly. If you don't know the answer to a question, don't share false information.", + new SpecialTokensText(" + <|turn>user + "), + "Hi there!", + new SpecialTokensText(" + <|turn>model + "), + "Hello!", + new SpecialTokensText(" + <|turn>user + "), + "How are you?", + new SpecialTokensText(" + <|turn>model + <|channel>thought"), + "Let me think how to answer", + new SpecialTokensText(""), + "I'm good, how are you?", + ]) + `); + + expect(contextText3WithOpenModelResponse).toMatchInlineSnapshot(` + LlamaText([ + new SpecialToken("BOS"), + new SpecialTokensText("<|turn>system + <|think|>"), + "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible. 
+ If a question does not make any sense, or is not factually coherent, explain why instead of answering something incorrectly. If you don't know the answer to a question, don't share false information.", + new SpecialTokensText(" + <|turn>user + "), + "Hi there!", + new SpecialTokensText(" + <|turn>model + "), + "Hello!", + new SpecialTokensText(" + <|turn>user + "), + "How are you?", + new SpecialTokensText(" + <|turn>model + "), + "I'm good, how are you?", + new SpecialTokensText(" + <|turn>model + "), + ]) + `); + }); +}); diff --git a/test/standalone/chatWrappers/HarmonyChatWrapper.test.ts b/test/standalone/chatWrappers/HarmonyChatWrapper.test.ts index 92fdc1a9..6fedeed0 100644 --- a/test/standalone/chatWrappers/HarmonyChatWrapper.test.ts +++ b/test/standalone/chatWrappers/HarmonyChatWrapper.test.ts @@ -4,7 +4,7 @@ import {defaultChatSystemPrompt} from "../../../src/config.js"; describe("HarmonyChatWrapper", () => { - const todayDate = new Date("2025-08-05T00:00:00Z"); + const todayDate = new Date("2025-08-05T00:00:00"); const conversationHistory: ChatHistoryItem[] = [{ type: "system", diff --git a/test/standalone/chatWrappers/Llama3_1ChatWrapper.test.ts b/test/standalone/chatWrappers/Llama3_1ChatWrapper.test.ts index fab30cc9..d6f853de 100644 --- a/test/standalone/chatWrappers/Llama3_1ChatWrapper.test.ts +++ b/test/standalone/chatWrappers/Llama3_1ChatWrapper.test.ts @@ -4,7 +4,7 @@ import {defaultChatSystemPrompt} from "../../../src/config.js"; describe("Llama3_1ChatWrapper", () => { - const todayDate = new Date("2024-07-26T00:00:00Z"); + const todayDate = new Date("2024-07-26T00:00:00"); const conversationHistory: ChatHistoryItem[] = [ ...(new Llama3_1ChatWrapper({todayDate})).generateInitialChatHistory({systemPrompt: defaultChatSystemPrompt}), { type: "user", diff --git a/test/standalone/chatWrappers/utils/jinjaTemplates.ts b/test/standalone/chatWrappers/utils/jinjaTemplates.ts index f63e3748..4f7f09e8 100644 --- 
a/test/standalone/chatWrappers/utils/jinjaTemplates.ts +++ b/test/standalone/chatWrappers/utils/jinjaTemplates.ts @@ -1636,3 +1636,538 @@ export const harmonyJinjaTemplate5 = ` <|start|>assistant {%- endif -%} `.slice(1, -1); + +export const gemma4JinjaTemplate1 = ` +{%- macro format_parameters(properties, required) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'OBJECT' -%} + ,properties:{ + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + {%- elif value is mapping -%} + {{- format_parameters(value, value['required'] | default([])) -}} + {%- endif -%} + } + {%- if value['required'] -%} + ,required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + ,items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if 
ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 
'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{ bos_token }} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {{- messages[0]['content'] | trim -}} 
+ {%- set loop_messages = messages[1:] -%} + {%- endif -%} + + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + + {{- '\n' -}} +{%- endif %} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {{- '<|turn>' + role + '\n' }} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- if message['tool_responses'] -%} + {#- Tool Response handling -#} + {%- for tool_response in message['tool_responses'] -%} + {{- '<|tool_response>' -}} + {%- if tool_response['response'] is mapping -%} + {{- 'response:' + tool_response['name'] | default('unknown') + '{' -}} + {%- for key, value in tool_response['response'] | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_response['name'] | default('unknown') + '{value:' + format_argument(tool_response['response'], escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + + 
{%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '\n\n<|image|>\n\n' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '\n\n<|video|>\n\n' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- if not (message['tool_responses'] and not message['content']) -%} + {{- '\n' -}} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} +`.slice(1); + +export const gemma4JinjaTemplate2 = ` +{%- macro format_parameters(properties, required) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ 
format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'OBJECT' -%} + ,properties:{ + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + {%- elif value is mapping -%} + {{- format_parameters(value, value['required'] | default([])) -}} + {%- endif -%} + } + {%- if value['required'] -%} + ,required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + ,items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- 
tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for 
part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{ bos_token }} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {{- messages[0]['content'] | trim -}} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + + {{- '\n' -}} +{%- endif %} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {{- '<|turn>' + role + '\n' }} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- 
function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- if message['tool_responses'] -%} + {#- Tool Response handling -#} + {%- for tool_response in message['tool_responses'] -%} + {{- '<|tool_response>' -}} + {%- if tool_response['response'] is mapping -%} + {{- 'response:' + tool_response['name'] | default('unknown') + '{' -}} + {%- for key, value in tool_response['response'] | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_response['name'] | default('unknown') + '{value:' + format_argument(tool_response['response'], escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '\n\n<|image|>\n\n' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '\n\n<|video|>\n\n' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- if not (message['tool_responses'] and not message['content']) -%} + {{- '\n' -}} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} + {%- if not 
enable_thinking | default(false) -%} + {{- '<|channel>thought\n' -}} + {%- endif -%} +{%- endif -%} +`.slice(1); diff --git a/test/standalone/chatWrappers/utils/resolveChatWrapper.test.ts b/test/standalone/chatWrappers/utils/resolveChatWrapper.test.ts index f8e0e94d..e68f9024 100644 --- a/test/standalone/chatWrappers/utils/resolveChatWrapper.test.ts +++ b/test/standalone/chatWrappers/utils/resolveChatWrapper.test.ts @@ -1,10 +1,12 @@ import {describe, expect, test} from "vitest"; import { AlpacaChatWrapper, ChatMLChatWrapper, DeepSeekChatWrapper, FalconChatWrapper, FunctionaryChatWrapper, GemmaChatWrapper, - GeneralChatWrapper, Llama2ChatWrapper, Llama3_1ChatWrapper, MistralChatWrapper, QwenChatWrapper, resolveChatWrapper, HarmonyChatWrapper + Gemma4ChatWrapper, GeneralChatWrapper, Llama2ChatWrapper, Llama3_1ChatWrapper, MistralChatWrapper, QwenChatWrapper, + resolveChatWrapper, HarmonyChatWrapper } from "../../../../src/index.js"; import { - harmonyJinjaTemplate, harmonyJinjaTemplate2, harmonyJinjaTemplate3, harmonyJinjaTemplate4, harmonyJinjaTemplate5 + harmonyJinjaTemplate, harmonyJinjaTemplate2, harmonyJinjaTemplate3, harmonyJinjaTemplate4, harmonyJinjaTemplate5, + gemma4JinjaTemplate1, gemma4JinjaTemplate2 } from "./jinjaTemplates.js"; @@ -783,6 +785,32 @@ describe("resolveChatWrapper", () => { expect(chatWrapper).to.be.instanceof(GemmaChatWrapper); }); + test("should resolve to specialized Gemma4ChatWrapper", () => { + const chatWrapper = resolveChatWrapper({ + customWrapperSettings: { + jinjaTemplate: { + template: gemma4JinjaTemplate1 + } + }, + fallbackToOtherWrappersOnJinjaError: false + }); + + expect(chatWrapper).to.be.instanceof(Gemma4ChatWrapper); + }); + + test("should resolve to specialized Gemma4ChatWrapper 2", () => { + const chatWrapper = resolveChatWrapper({ + customWrapperSettings: { + jinjaTemplate: { + template: gemma4JinjaTemplate2 + } + }, + fallbackToOtherWrappersOnJinjaError: false + }); + + 
expect(chatWrapper).to.be.instanceof(Gemma4ChatWrapper); + }); + test("should resolve to specialized GeneralChatWrapper", () => { const chatWrapper = resolveChatWrapper({ customWrapperSettings: { diff --git a/test/standalone/gguf/__snapshots__/ggufStandaloneParser.test.ts.snap b/test/standalone/gguf/__snapshots__/ggufStandaloneParser.test.ts.snap index 4a441965..7bf467a0 100644 --- a/test/standalone/gguf/__snapshots__/ggufStandaloneParser.test.ts.snap +++ b/test/standalone/gguf/__snapshots__/ggufStandaloneParser.test.ts.snap @@ -60,6 +60,7 @@ exports[`gguf > parser > should parse remote gguf model 1`] = ` "offset": 1097564160, }, ], + "infoEndOffset": 2585504, "metadata": { "falcon": { "attention": { @@ -135,6 +136,15 @@ exports[`gguf > parser > should parse remote gguf model 1`] = ` }, }, "metadataSize": 2547826, + "sourceData": [ + { + "buffer": { + "_type": "buffer", + "length": 2585504, + }, + "type": "buffer", + }, + ], "splicedParts": 1, "tensorCount": 644, "tensorInfo": [ @@ -206,6 +216,7 @@ exports[`gguf > parser > should parse remote gguf model without tensor info 1`] "tensor_data_layout": "jploski", }, "fullTensorInfo": undefined, + "infoEndOffset": undefined, "metadata": { "falcon": { "attention": { @@ -281,6 +292,7 @@ exports[`gguf > parser > should parse remote gguf model without tensor info 1`] }, }, "metadataSize": 2547826, + "sourceData": [], "splicedParts": 1, "tensorCount": 644, "tensorInfo": undefined, diff --git a/test/utils/helpers/simplifyGgufInfoForTestSnapshot.ts b/test/utils/helpers/simplifyGgufInfoForTestSnapshot.ts index 37b44bf1..dd7abf73 100644 --- a/test/utils/helpers/simplifyGgufInfoForTestSnapshot.ts +++ b/test/utils/helpers/simplifyGgufInfoForTestSnapshot.ts @@ -1,4 +1,5 @@ -import {GgufFileInfo} from "../../../src/gguf/types/GgufFileInfoTypes.js"; +import path from "path"; +import {GgufFileInfo, GgufFileInfoSourceData, GgufFileInfoSource} from "../../../src/gguf/types/GgufFileInfoTypes.js"; export function 
simplifyGgufInfoForTestSnapshot(ggufFileInfo: GgufFileInfo) { const ggufFileInfoCopy = structuredClone(ggufFileInfo); @@ -13,6 +14,9 @@ export function simplifyGgufInfoForTestSnapshot(ggufFileInfo: GgufFileInfo) { shortenArray(ggufFileInfoCopy.tensorInfo, 4); shortenArray(ggufFileInfoCopy.fullTensorInfo, 4); + simplifySource(ggufFileInfoCopy.source); + simplifySourceData(ggufFileInfoCopy.sourceData); + return ggufFileInfoCopy; } @@ -22,3 +26,29 @@ function shortenArray(array?: readonly any[], maxSize: number = 10) { (array as any[]).splice(maxSize); } + +function simplifySourceData(sourceData: GgufFileInfoSourceData[]) { + for (const entry of sourceData) { + if (entry.type === "buffer") + entry.buffer = { + _type: "buffer", + length: entry.buffer.length + } as any; + else if (entry.type === "path") + entry.path = { + _type: "path", + path: "/" + path.basename(entry.path) + } as any; + } +} + +function simplifySource(source?: GgufFileInfoSource) { + if (source == null) + return; + + if (source.type === "path") + source.path = { + _type: "path", + path: "/" + path.basename(source.path) + } as any; +} diff --git a/test/utils/modelFiles.ts b/test/utils/modelFiles.ts index b8d4accc..525dfb1a 100644 --- a/test/utils/modelFiles.ts +++ b/test/utils/modelFiles.ts @@ -22,7 +22,8 @@ const supportedModels = { "nomic-embed-text-v1.5.Q4_K_M.gguf": "https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF/resolve/main/nomic-embed-text-v1.5.Q4_K_M.gguf?download=true", "bge-reranker-v2-m3-Q8_0.gguf": "https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF/resolve/main/bge-reranker-v2-m3-Q8_0.gguf?download=true", "Qwen3-0.6B-Q8_0.gguf": "https://huggingface.co/Qwen/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q8_0.gguf?download=true", - "Qwen3.5-0.8B-Q8_0.gguf": "https://huggingface.co/unsloth/Qwen3.5-0.8B-GGUF/resolve/main/Qwen3.5-0.8B-Q8_0.gguf?download=true" + "Qwen3.5-0.8B-Q8_0.gguf": 
"https://huggingface.co/unsloth/Qwen3.5-0.8B-GGUF/resolve/main/Qwen3.5-0.8B-Q8_0.gguf?download=true", + "gemma-4-E2B-it-Q4_K_M.gguf": "https://huggingface.co/unsloth/gemma-4-E2B-it-GGUF/resolve/main/gemma-4-E2B-it-Q4_K_M.gguf?download=true" } as const; export async function getModelFile(modelName: keyof typeof supportedModels) {