From 965339da332f26b06bbb3ae783f9ad20f8bebad3 Mon Sep 17 00:00:00 2001 From: Adi Eldar Date: Mon, 27 Apr 2026 13:25:15 +0300 Subject: [PATCH 1/2] Add support for harrier-v1-270m model --- .../kusto/includes/slm-embeddings-fl-adx.md | 20 ++++++++++------- .../includes/slm-embeddings-fl-fabric.md | 22 +++++++++++-------- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/data-explorer/kusto/includes/slm-embeddings-fl-adx.md b/data-explorer/kusto/includes/slm-embeddings-fl-adx.md index 29e4c476ba..c7bd768424 100644 --- a/data-explorer/kusto/includes/slm-embeddings-fl-adx.md +++ b/data-explorer/kusto/includes/slm-embeddings-fl-adx.md @@ -1,10 +1,10 @@ --- ms.topic: include -ms.date: 01/08/2026 +ms.date: 04/27/2026 --- The function `slm_embeddings_fl()` is a [UDF (user-defined function)](../query/functions/user-defined-functions.md) that generates text embeddings using local Small Language Models (SLM). This function converts text into numerical vector representations that can be used for semantic search, similarity analysis, and other natural language processing tasks. -Currently the function supports [jina-v2-small](https://huggingface.co/jinaai/jina-embeddings-v2-small-en) and [e5-small-v2](https://huggingface.co/intfloat/e5-small-v2) models. +Currently the function supports [harrier-v1-270m](https://huggingface.co/microsoft/harrier-oss-v1-270m), [jina-v2-small](https://huggingface.co/jinaai/jina-embeddings-v2-small-en), and [e5-small-v2](https://huggingface.co/intfloat/e5-small-v2) models. 
[!INCLUDE [python-zone-pivot-fabric](../includes/python-zone-pivot-fabric.md)] * Alter the cluster's [callout policy](../management/callout-policy.md) to allow access to the external artifacts (which are referenced in the KQL code below): @@ -28,12 +28,13 @@ Note that this change requires [AllDatabasesAdmin](../access-control/role-based- |*text_col*| `string` | :heavy_check_mark:|The name of the column containing the text to embed.| |*embeddings_col*| `string` | :heavy_check_mark:|The name of the column to store the output embeddings.| |*batch_size*| `int` ||The number of texts to process in each batch. Default is 32.| -|*model_name*| `string` ||The name of the embedding model to use. Supported values are `jina-v2-small` (default) and `e5-small-v2`.| -|*prefix*| `string` ||The text prefix to add before each input. Default is `query:`. For E5 model, use `query:` for search queries and `passage:` for documents to be searched. This parameter is ignored for Jina model.| +|*model_name*| `string` ||The name of the embedding model to use. Supported values are `harrier-v1-270m` (default), `jina-v2-small`, and `e5-small-v2`.| +|*prefix*| `string` ||The text prefix to add before each input. Default is `query:`. For the Harrier and E5 models, use `query:` for search queries and `passage:` for documents to be searched (for the Harrier model, `passage:` maps to the empty task). This parameter is ignored for the Jina model.| ## Function definition -You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows: +* You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database. +* To optimize storage and latency, you can delete external artifacts for models that aren't used. 
### [Query-defined](#tab/query-defined) @@ -43,7 +44,7 @@ Define the function using the following [let statement](../query/let-statement.m > A [let statement](../query/let-statement.md) can't run on its own. It must be followed by a [tabular expression statement](../query/tabular-expression-statements.md). To run a working example of `slm_embeddings_fl()`, see [Example](#example). ~~~kusto -let slm_embeddings_fl = (tbl:(*), text_col:string, embeddings_col:string, batch_size:int=32, model_name:string='jina-v2-small', prefix:string='query:') +let slm_embeddings_fl = (tbl:(*), text_col:string, embeddings_col:string, batch_size:int=32, model_name:string='harrier-v1-270m', prefix:string='query:') { let kwargs = bag_pack('text_col', text_col, 'embeddings_col', embeddings_col, 'batch_size', batch_size, 'model_name', model_name, 'prefix', prefix); let code = ```if 1: @@ -71,6 +72,7 @@ let slm_embeddings_fl = (tbl:(*), text_col:string, embeddings_col:string, batch_ | evaluate hint.distribution=per_node python(typeof(*), code, kwargs, external_artifacts = bag_pack( 'embedding_engine.zip', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/embedding_engine.zip', 'tokenizers-0.22.1.whl', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/tokenizers-0.22.1-cp39-abi3-win_amd64.whl', + 'harrier-v1-270m.zip', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/harrier-v1-270m.zip', 'jina-v2-small.zip', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/jina-v2-small.zip', 'e5-small-v2.zip', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/e5-small-v2.zip')) }; @@ -86,7 +88,7 @@ Define the stored function once using the following [`.create function`](../mana ~~~kusto .create-or-alter function with (folder = "Packages\\AI", docstring = "Embedding using local SLM") -slm_embeddings_fl(tbl:(*), text_col:string, embeddings_col:string, batch_size:int=32, model_name:string='jina-v2-small', prefix:string='query:') 
+slm_embeddings_fl(tbl:(*), text_col:string, embeddings_col:string, batch_size:int=32, model_name:string='harrier-v1-270m', prefix:string='query:') { let kwargs = bag_pack('text_col', text_col, 'embeddings_col', embeddings_col, 'batch_size', batch_size, 'model_name', model_name, 'prefix', prefix); let code = ```if 1: @@ -114,6 +116,7 @@ slm_embeddings_fl(tbl:(*), text_col:string, embeddings_col:string, batch_size:in | evaluate hint.distribution=per_node python(typeof(*), code, kwargs, external_artifacts = bag_pack( 'embedding_engine.zip', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/embedding_engine.zip', 'tokenizers-0.22.1.whl', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/tokenizers-0.22.1-cp39-abi3-win_amd64.whl', + 'harrier-v1-270m.zip', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/harrier-v1-270m.zip', 'jina-v2-small.zip', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/jina-v2-small.zip', 'e5-small-v2.zip', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/e5-small-v2.zip')) } @@ -132,7 +135,7 @@ The following example uses the [invoke operator](../query/invoke-operator.md) to To use a query-defined function, invoke it after the embedded function definition. 
~~~kusto -let slm_embeddings_fl=(tbl:(*), text_col:string, embeddings_col:string, batch_size:int=32, model_name:string='jina-v2-small', prefix:string='query:') +let slm_embeddings_fl=(tbl:(*), text_col:string, embeddings_col:string, batch_size:int=32, model_name:string='harrier-v1-270m', prefix:string='query:') { let kwargs = bag_pack('text_col', text_col, 'embeddings_col', embeddings_col, 'batch_size', batch_size, 'model_name', model_name, 'prefix', prefix); let code = ```if 1: @@ -160,6 +163,7 @@ let slm_embeddings_fl=(tbl:(*), text_col:string, embeddings_col:string, batch_si | evaluate hint.distribution=per_node python(typeof(*), code, kwargs, external_artifacts = bag_pack( 'embedding_engine.zip', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/embedding_engine.zip', 'tokenizers-0.22.1.whl', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/tokenizers-0.22.1-cp39-abi3-win_amd64.whl', + 'harrier-v1-270m.zip', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/harrier-v1-270m.zip', 'jina-v2-small.zip', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/jina-v2-small.zip', 'e5-small-v2.zip', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/e5-small-v2.zip')) }; diff --git a/data-explorer/kusto/includes/slm-embeddings-fl-fabric.md b/data-explorer/kusto/includes/slm-embeddings-fl-fabric.md index f162a5d011..d12aa57a09 100644 --- a/data-explorer/kusto/includes/slm-embeddings-fl-fabric.md +++ b/data-explorer/kusto/includes/slm-embeddings-fl-fabric.md @@ -1,10 +1,10 @@ --- ms.topic: include -ms.date: 01/08/2026 +ms.date: 04/27/2026 --- The function `slm_embeddings_fl()` is a [UDF (user-defined function)](../query/functions/user-defined-functions.md) that generates text embeddings using local Small Language Models (SLM). This function converts text into numerical vector representations that can be used for semantic search, similarity analysis, and other natural language processing tasks. 
-Currently the function supports [jina-v2-small](https://huggingface.co/jinaai/jina-embeddings-v2-small-en) and [e5-small-v2](https://huggingface.co/intfloat/e5-small-v2) models. +Currently the function supports [harrier-v1-270m](https://huggingface.co/microsoft/harrier-oss-v1-270m), [jina-v2-small](https://huggingface.co/jinaai/jina-embeddings-v2-small-en), and [e5-small-v2](https://huggingface.co/intfloat/e5-small-v2) models. [!INCLUDE [python-zone-pivot-fabric](../includes/python-zone-pivot-fabric.md)] * Create a lakehouse to host the external artifacts (which are referenced in the KQL code below), preferably in the same workspace as your eventhouse. @@ -22,14 +22,15 @@ Currently the function supports [jina-v2-small](https://huggingface.co/jinaai/ji |*text_col*| `string` | :heavy_check_mark:|The name of the column containing the text to embed.| |*embeddings_col*| `string` | :heavy_check_mark:|The name of the column to store the output embeddings.| |*batch_size*| `int` ||The number of texts to process in each batch. Default is 32.| -|*model_name*| `string` ||The name of the embedding model to use. Supported values are `jina-v2-small` (default) and `e5-small-v2`.| -|*prefix*| `string` ||The text prefix to add before each input. Default is `query:`. For E5 model, use `query:` for search queries and `passage:` for documents to be searched. This parameter is ignored for Jina model.| +|*model_name*| `string` ||The name of the embedding model to use. Supported values are `harrier-v1-270m` (default), `jina-v2-small`, and `e5-small-v2`.| +|*prefix*| `string` ||The text prefix to add before each input. Default is `query:`. For the Harrier and E5 models, use `query:` for search queries and `passage:` for documents to be searched (for the Harrier model, `passage:` maps to the empty task). 
This parameter is ignored for the Jina model.| ## Function definition -* Download the four artifacts in the KQL code below (at the end of the code block see the external_artifacts parameter that reference artifacts, e.g https://artifactswestus.z22.web.core.windows.net/models/SLM/embedding_engine.zip) and upload them to your lakehouse. +* Download the artifacts in the KQL code below (at the end of the code block see the external_artifacts parameter that reference artifacts, e.g https://artifactswestus.z22.web.core.windows.net/models/SLM/embedding_engine.zip) and upload them to your lakehouse. * In the KQL code below update the artifacts paths to their one lake paths (e.g. https://msit-onelake.dfs.fabric.microsoft.com/MY_WORKSPACE/MY_LAKEHOUSE.Lakehouse/Files/models/SLM/embedding_engine.zip). -* You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows: +* You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database. +* To optimize storage and latency you can delete external artifacts for models that are not used. ### [Query-defined](#tab/query-defined) @@ -39,7 +40,7 @@ Define the function using the following [let statement](../query/let-statement.m > A [let statement](../query/let-statement.md) can't run on its own. It must be followed by a [tabular expression statement](../query/tabular-expression-statements.md). To run a working example of `slm_embeddings_fl()`, see [Example](#example). 
~~~kusto -let slm_embeddings_fl = (tbl:(*), text_col:string, embeddings_col:string, batch_size:int=32, model_name:string='jina-v2-small', prefix:string='query:') +let slm_embeddings_fl = (tbl:(*), text_col:string, embeddings_col:string, batch_size:int=32, model_name:string='harrier-v1-270m', prefix:string='query:') { let kwargs = bag_pack('text_col', text_col, 'embeddings_col', embeddings_col, 'batch_size', batch_size, 'model_name', model_name, 'prefix', prefix); let code = ```if 1: @@ -67,6 +68,7 @@ let slm_embeddings_fl = (tbl:(*), text_col:string, embeddings_col:string, batch_ | evaluate hint.distribution=per_node python(typeof(*), code, kwargs, external_artifacts = bag_pack( 'embedding_engine.zip', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/embedding_engine.zip;impersonate', 'tokenizers-0.22.1.whl', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/tokenizers-0.22.1-cp39-abi3-win_amd64.whl;impersonate', + 'harrier-v1-270m.zip', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/harrier-v1-270m.zip;impersonate', 'jina-v2-small.zip', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/jina-v2-small.zip;impersonate', 'e5-small-v2.zip', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/e5-small-v2.zip;impersonate')) }; @@ -82,7 +84,7 @@ Define the stored function once using the following [`.create function`](../mana ~~~kusto .create-or-alter function with (folder = "Packages\\AI", docstring = "Embedding using local SLM") -slm_embeddings_fl(tbl:(*), text_col:string, embeddings_col:string, batch_size:int=32, model_name:string='jina-v2-small', prefix:string='query:') +slm_embeddings_fl(tbl:(*), text_col:string, embeddings_col:string, batch_size:int=32, model_name:string='harrier-v1-270m', prefix:string='query:') { let kwargs = bag_pack('text_col', text_col, 'embeddings_col', embeddings_col, 'batch_size', batch_size, 'model_name', model_name, 'prefix', prefix); let code = ```if 1: @@ -110,6 +112,7 @@ 
slm_embeddings_fl(tbl:(*), text_col:string, embeddings_col:string, batch_size:in | evaluate hint.distribution=per_node python(typeof(*), code, kwargs, external_artifacts = bag_pack( 'embedding_engine.zip', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/embedding_engine.zip;impersonate', 'tokenizers-0.22.1.whl', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/tokenizers-0.22.1-cp39-abi3-win_amd64.whl;impersonate', + 'harrier-v1-270m.zip', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/harrier-v1-270m.zip;impersonate', 'jina-v2-small.zip', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/jina-v2-small.zip;impersonate', 'e5-small-v2.zip', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/e5-small-v2.zip;impersonate')) } @@ -128,7 +131,7 @@ The following example uses the [invoke operator](../query/invoke-operator.md) to To use a query-defined function, invoke it after the embedded function definition. ~~~kusto -let slm_embeddings_fl = (tbl:(*), text_col:string, embeddings_col:string, batch_size:int=32, model_name:string='jina-v2-small', prefix:string='query:') +let slm_embeddings_fl = (tbl:(*), text_col:string, embeddings_col:string, batch_size:int=32, model_name:string='harrier-v1-270m', prefix:string='query:') { let kwargs = bag_pack('text_col', text_col, 'embeddings_col', embeddings_col, 'batch_size', batch_size, 'model_name', model_name, 'prefix', prefix); let code = ```if 1: @@ -156,6 +159,7 @@ let slm_embeddings_fl = (tbl:(*), text_col:string, embeddings_col:string, batch_ | evaluate hint.distribution=per_node python(typeof(*), code, kwargs, external_artifacts = bag_pack( 'embedding_engine.zip', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/embedding_engine.zip;impersonate', 'tokenizers-0.22.1.whl', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/tokenizers-0.22.1-cp39-abi3-win_amd64.whl;impersonate', + 'harrier-v1-270m.zip', 
'https://artifactswestus.z22.web.core.windows.net/models/SLM/harrier-v1-270m.zip;impersonate', 'jina-v2-small.zip', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/jina-v2-small.zip;impersonate', 'e5-small-v2.zip', 'https://artifactswestus.z22.web.core.windows.net/models/SLM/e5-small-v2.zip;impersonate')) }; From 548c8e81904ec6afb130448e219bd6b3a8e3be74 Mon Sep 17 00:00:00 2001 From: Diana Richards Date: Mon, 27 Apr 2026 09:35:03 -0500 Subject: [PATCH 2/2] raising acrolinx --- data-explorer/kusto/includes/slm-embeddings-fl-fabric.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-explorer/kusto/includes/slm-embeddings-fl-fabric.md b/data-explorer/kusto/includes/slm-embeddings-fl-fabric.md index d12aa57a09..647863ead1 100644 --- a/data-explorer/kusto/includes/slm-embeddings-fl-fabric.md +++ b/data-explorer/kusto/includes/slm-embeddings-fl-fabric.md @@ -30,7 +30,7 @@ Currently the function supports [harrier-v1-270m](https://huggingface.co/microso * Download the artifacts in the KQL code below (at the end of the code block see the external_artifacts parameter that reference artifacts, e.g https://artifactswestus.z22.web.core.windows.net/models/SLM/embedding_engine.zip) and upload them to your lakehouse. * In the KQL code below update the artifacts paths to their one lake paths (e.g. https://msit-onelake.dfs.fabric.microsoft.com/MY_WORKSPACE/MY_LAKEHOUSE.Lakehouse/Files/models/SLM/embedding_engine.zip). * You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database. -* To optimize storage and latency you can delete external artifacts for models that are not used. +* To optimize storage and latency you can delete external artifacts for models that aren't used. ### [Query-defined](#tab/query-defined)