diff --git a/docs/docs.json b/docs/docs.json index 2e9f8eb4..8122d6c9 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -157,6 +157,7 @@ "pages": [ "geneva/index", "geneva/overview/index", + "geneva/getting-started", { "group": "Transforms", "pages": [ @@ -181,26 +182,27 @@ { "group": "Running Jobs", "pages": [ - "geneva/jobs/contexts", "geneva/jobs/backfilling", "geneva/jobs/bulk-load-columns", "geneva/jobs/materialized-views", + "geneva/jobs/advanced-job-configuration", "geneva/jobs/lifecycle", "geneva/jobs/conflicts", "geneva/jobs/performance", "geneva/jobs/job_metrics", "geneva/jobs/console", - "geneva/jobs/troubleshooting" + "geneva/jobs/troubleshooting", + "geneva/jobs/contexts" ] }, { "group": "Deployment", "pages": [ - "geneva/deployment/index", "geneva/deployment/helm", "geneva/jobs/startup", "geneva/deployment/dependency-verification", "geneva/udfs/advanced-configuration", + "geneva/deployment/index", "geneva/deployment/troubleshooting" ] }, diff --git a/docs/geneva/deployment/dependency-verification.mdx b/docs/geneva/deployment/dependency-verification.mdx index d9a2126f..400e7f88 100644 --- a/docs/geneva/deployment/dependency-verification.mdx +++ b/docs/geneva/deployment/dependency-verification.mdx @@ -1,13 +1,13 @@ --- title: Dependency Verification sidebarTitle: Dependency Verification -description: Diagnose and resolve package version mismatches between local and Ray worker environments. +description: Diagnose and resolve package version mismatches between local and distributed worker environments. icon: magnifying-glass-chart --- import { PyQuickFixManifest, PyEnvVarsViaCluster, PyPipManifest, PyCondaClusterPath, PyCondaClusterInline } from '/snippets/geneva_dependency_verification.mdx'; -When running Geneva UDFs on Ray, your code is serialized locally and executed on remote workers. If the worker environment differs from your local environment, you may encounter subtle and difficult-to-debug errors. 
+When running Geneva UDFs on distributed workers, your code is serialized locally and executed on remote workers. If the worker environment differs from your local environment, you may encounter subtle and difficult-to-debug errors. ## Example environment mismatch errors diff --git a/docs/geneva/deployment/helm.mdx b/docs/geneva/deployment/helm.mdx index 4d068c93..4255c80d 100644 --- a/docs/geneva/deployment/helm.mdx +++ b/docs/geneva/deployment/helm.mdx @@ -64,14 +64,14 @@ geneva: azure: # Azure managed identity client ID for the Geneva client. - # This identity should have a federated credential for the Geneva namespace + # This identity should have a federated credential for the LanceDB namespace # and Storage Blob Data Contributor role on the storage account. clientPrincipalId: "" ``` 3. Install kuberay operator ```bash -export NAMESPACE=geneva +export NAMESPACE=lancedb helm repo add kuberay https://ray-project.github.io/kuberay-helm/ helm repo update @@ -89,4 +89,96 @@ kubectl apply -f nvidia-device-plugin.yml 5. Install Geneva Helm chart ```bash helm install geneva ./geneva -n $NAMESPACE --create-namespace +``` + +## Default cluster and manifest + +In LanceDB Enterprise, backfill and refresh jobs run on a **default cluster** (the compute +pool jobs run on) and a **default manifest** (the Python dependency environment — image and +packages). Configuring these in the LanceDB Enterprise chart lets jobs run out of the box +without per-job configuration. 
They are set under `geneva.defaults` in the chart's +`values.yaml`: + +```yaml +geneva: + defaults: + cluster: + cluster_type: external_ray + name: deployment-default + ray_address: "ray://raycluster-kuberay-head-svc.lancedb.svc.cluster.local:10001" + manifest: + name: deployment-default + pip: [geneva, pyarrow, lancedb, pylance] + head_image: rayproject/ray:2.54.0-py312 + worker_image: rayproject/ray:2.54.0-py312 + skip_site_packages: true +``` + +If no default is configured, jobs must specify a manifest explicitly. Individual transforms can override the default manifest by pinning one +with `@udf` / `@chunker` / `@udtf` (see +[Advanced Job Configuration](/geneva/jobs/advanced-job-configuration)); to override the cluster +at runtime, use an [Advanced Execution Context](/geneva/jobs/contexts). + +## Providing a Ray cluster + +The LanceDB Helm chart can be configured to deploy a static KubeRay cluster, provision KubeRay clusters on demand per job, or +use an existing Ray cluster. + +### Use default LanceDB Enterprise Ray cluster (default) + +By default, LanceDB Enterprise will use a shared, statically provisioned Ray cluster for job execution. + +This can be enabled in the Helm chart by setting the following values. + +```yaml +raycluster: + enabled: true + +global: + rayclusterUri: "ray://raycluster-kuberay-head-svc.lancedb.svc.cluster.local:10001" +``` + +Configuration for the Ray cluster can be specified by modifying raycluster.yaml Helm values. + +### Provision KubeRay clusters on demand + +Set `global.rayclusterUri` to an empty value to provision ephemeral KubeRay clusters on-demand for each execution job. The default KubeRay cluster configuration +is specified in `geneva.defaults.cluster`, i.e. 
+ +```yaml +geneva: + defaults: + cluster: + cluster_type: kuberay + name: deployment-default + kuberay: + namespace: lancedb + config_method: IN_CLUSTER + head_group: + service_account: geneva-service-account + num_cpus: 2 + memory: 8Gi + image: rayproject/ray:2.54.0-py312 + worker_groups: + - name: cpu + service_account: geneva-service-account + num_cpus: 4 + memory: 8Gi + replicas: 2 + min_replicas: 0 + max_replicas: 4 + idle_timeout_seconds: 60 + node_selector: + geneva.lancedb.com/ray-worker-cpu: "true" + image: rayproject/ray:2.54.0-py312 +``` + +### Use an external Ray cluster + +Self-managed enterprise customers can bring an existing Ray cluster to run Geneva jobs. Simply set the rayclusterUri property in the Helm chart +to a Ray address that can be accessed from the LanceDB Enterprise deployment. + +```yaml +global: + rayclusterUri: "ray://my-ray-cluster.my-ns.svc.cluster.local:10001" ``` \ No newline at end of file diff --git a/docs/geneva/deployment/index.mdx b/docs/geneva/deployment/index.mdx index 101e0677..7c6ca892 100644 --- a/docs/geneva/deployment/index.mdx +++ b/docs/geneva/deployment/index.mdx @@ -32,7 +32,7 @@ via the instructions below. In the following sections we'll use these variables: ```bash -NAMESPACE=geneva # replace with your actual namespace if different +NAMESPACE=lancedb # replace with your actual namespace if different KSA_NAME=geneva-ray-runner # replace with an identity name ``` @@ -142,7 +142,7 @@ Geneva needs the ability to deploy a KubeRay cluster and submit jobs to Ray. The In the following sections we'll use these variables: ```bash -NAMESPACE=geneva # replace with your actual namespace if different +NAMESPACE=lancedb # replace with your actual namespace if different KSA_NAME=geneva-ray-runner # replace with an identity name PROJECT_ID=... 
# replace with your google cloud project name GSA_EMAIL=${KSA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com @@ -218,7 +218,7 @@ Geneva can be used to provision Ray clusters running in Amazon Web Services (AWS In the following sections we'll use these variables: ```bash -NAMESPACE=geneva # replace with your actual namespace if different +NAMESPACE=lancedb # replace with your actual namespace if different CLUSTER=geneva # replace with your actual namespace if different KSA_NAME=geneva-ray-runner # replace with an identity name ``` @@ -428,7 +428,7 @@ worker_spec = _WorkerGroupSpec( with ray_cluster( name="my-ray-cluster", - namespace="geneva", + namespace="lancedb", cluster_name="geneva", config_method=K8sConfigMethod.EKS_AUTH, region="us-east-1", diff --git a/docs/geneva/end-to-end.mdx b/docs/geneva/end-to-end.mdx index 0e63e216..15e4576f 100644 --- a/docs/geneva/end-to-end.mdx +++ b/docs/geneva/end-to-end.mdx @@ -1,36 +1,36 @@ --- title: End-to-End Example sidebarTitle: End-to-end example -description: A complete Geneva example — raw data on S3, a backfill using a KubeRay cluster, and a materialized view with embeddings. +description: A complete Geneva example on LanceDB Enterprise — create a table, backfill computed columns, and build a materialized view with embeddings. icon: play --- -This example walks through a complete Geneva workflow: creating a raw table in S3, adding computed columns via a backfill on a KubeRay cluster, and materializing a view with embeddings for downstream search. +This example walks through a complete Geneva workflow on LanceDB Enterprise: creating a raw +table, adding computed columns with a distributed backfill, and materializing a view with +embeddings for downstream search. -The dataset is a product catalog with titles and descriptions. We'll compute a `word_count` feature column, then create a materialized view that adds text embeddings. +The dataset is a product catalog with titles and descriptions. 
We'll compute a `word_count` +feature column, then create a materialized view that adds text embeddings. ## 0. What you need to run this -Before running this example, you'll need: - -- **An S3 bucket** (or other cloud object storage) that Geneva can read and write to -- **A Kubernetes cluster** with the [KubeRay operator installed](/geneva/deployment/) and [Geneva deployed](/geneva/deployment/helm/) -- **IAM credentials** granting your Geneva client access to the bucket and Kubernetes API — see [Manual Deployment](/geneva/deployment/) for the required permissions - -Fill in these constants and the rest of the code will run as-is: - -```python -CLOUD_OBJECT_STORAGE_LOCATION = "s3://my-bucket/product-catalog" # your S3/GCS/ABS path -K8S_NAMESPACE = "geneva" # namespace where Geneva is deployed -``` +All you need is an existing **LanceDB Enterprise deployment**. Distributed job execution, +clusters, and dependency manifests are managed for you — there is no Kubernetes or cluster +setup in this example. ## 1. Connect and create a table ```python +import os import pyarrow as pa import geneva -db = geneva.connect(CLOUD_OBJECT_STORAGE_LOCATION) +# Connect to LanceDB Enterprise +db = geneva.connect( + uri="db://my-db", + host_override=os.getenv("LANCEDB_URI"), + api_key=os.getenv("LANCEDB_API_KEY"), +) # Create a raw product table schema = pa.schema([ @@ -84,111 +84,74 @@ def price_tier(price: float) -> str: return "premium" ``` -## 3. Register columns and run a backfill on KubeRay +## 3. 
Register columns and run a backfill -```python -from geneva.cluster.builder import KubeRayClusterBuilder -from geneva.manifest.builder import PipManifestBuilder -import sys -import ray -from importlib.metadata import version -from geneva.utils.ray import get_ray_image - -# Define the Python environment for workers -manifest = ( - PipManifestBuilder.create("product-catalog-manifest") - .pip([ - "sentence-transformers==3.3.1", - "torch==2.5.1", - f"geneva[udf-text-sentence-transformers]=={version('geneva')}", - ]) - .build() -) -db.define_manifest("product-catalog-manifest", manifest) - -# Make sure to use an image that: -# 1. matches your local ray and python versions -# 2. matches the architecture of the k8s nodes. These are in AWS and are -# x64 nodes, so we set arm=False. -# get_ray_image() helps you get the right image. -ray_image = get_ray_image( - ray.__version__, - f"{sys.version_info.major}.{sys.version_info.minor}", - arm=False, -) - -# Define the KubeRay cluster -cluster = ( - KubeRayClusterBuilder.create("product-catalog-cluster") - .namespace(K8S_NAMESPACE) - .head_group(cpus=4, memory="8Gi", image=ray_image) - .add_worker_group( - KubeRayClusterBuilder.cpu_worker() - .cpus(4) - .memory("16Gi") - .min_replicas(1) - .max_replicas(3) - .image(ray_image) - .build() - ) - .build() -) -db.define_cluster("product-catalog-cluster", cluster) +Register the UDFs as virtual columns and trigger a backfill. The job runs on your +deployment's default distributed execution environment — no cluster or context to configure. +```python # Add computed columns table.add_columns({ "word_count": word_count, "price_tier": price_tier, }) -# Run the backfill on the KubeRay cluster -with db.context(cluster="product-catalog-cluster", manifest="product-catalog-manifest"): - table.backfill("word_count") - table.backfill("price_tier") +# Run the backfills +table.backfill("word_count") +table.backfill("price_tier") ``` ## 4. 
Create a materialized view with embeddings -The materialized view selects a subset of columns from the source table — here we drop `price` and `price_tier`, keeping only what's needed for search. -After creating a materialized view, we will add and backfill a new column of embeddings. +The embedding model needs extra Python dependencies (`sentence-transformers`, `torch`). Rather +than configuring a deployment-wide environment, we bundle those dependencies **with the UDF** +using `@udf(manifest=...)`. The manifest is snapshotted onto the view, so refreshes use it +automatically. + +The materialized view selects a subset of columns from the source table — here we drop `price` +and `price_tier`, keeping only what's needed for search — plus a derived `embedding` column. ```python -from geneva.udfs import sentence_transformer_udf - -# Define an embedding UDF using a template from LanceDB's UDF library -embed = sentence_transformer_udf( - column="description", - model="BAAI/bge-small-en-v1.5", - normalize=True, - dimension=384, # BAAI/bge-small-en-v1.5 output dimension; avoids loading model locally -) +from geneva import udf +from geneva.manifest import GenevaManifest -# Select only the columns needed for search — price and price_tier are excluded -query = ( - table.search() - .select(["product_id", "title", "description", "category", "word_count"]) +# Bundle the embedding model's dependencies and attach them to the UDF +embed_manifest = ( + GenevaManifest.create_pip("embedding-deps") + .pip(["sentence-transformers==3.3.1", "torch==2.5.1"]) + .build() ) -# Create the materialized view with an embedding column derived from description -db.create_materialized_view( - "products_enriched", - query, - columns={"embedding": embed}, -) +@udf(data_type=pa.list_(pa.float32(), 384), manifest=embed_manifest) +class EmbedDescription: + def __init__(self): + self.model = None + + def __call__(self, description: str) -> list[float]: + if self.model is None: + from 
sentence_transformers import SentenceTransformer + self.model = SentenceTransformer("BAAI/bge-small-en-v1.5") + return self.model.encode(description, normalize_embeddings=True).tolist() + +# Build a query that selects search columns plus the derived embedding +query = table.search(None).select({ + "product_id": "product_id", + "title": "title", + "description": "description", + "category": "category", + "word_count": "word_count", + "embedding": EmbedDescription(), +}) -# Populate the materialized view +# Create the materialized view and populate it — the embedding UDF runs on refresh +db.create_materialized_view("products_enriched", query) enriched = db.open_table("products_enriched") -with db.context(cluster="product-catalog-cluster", manifest="product-catalog-manifest"): - enriched.refresh() - enriched.add_columns({"embedding": embed}) - enriched.backfill("embedding") # refresh (above) adds rows but doesn't compute UDF columns +enriched.refresh() ``` ## 5. Query the enriched table ```python -import pyarrow.compute as pc - enriched = db.open_table("products_enriched") # Vector search @@ -209,7 +172,8 @@ armor_results = ( ## 6. 
Incremental refresh -As new products are added to the source table, refresh the view to compute embeddings for the new rows only: +As new products are added to the source table, backfill the new rows and refresh the view to +compute embeddings for them — only the new rows are processed: ```python # Append new products @@ -225,9 +189,15 @@ new_data = pa.table({ }) table.add(new_data) -with db.context(cluster="product-catalog-cluster", manifest="product-catalog-manifest"): - table.backfill("word_count") # only fills null values - table.backfill("price_tier") - enriched.refresh() - enriched.backfill("embedding") # refresh adds new rows but doesn't compute UDF columns +# Only null/new values are computed +table.backfill("word_count") +table.backfill("price_tier") + +# Incrementally materialize the new rows (including their embeddings) into the view +enriched.refresh() ``` + + +To keep source columns in sync automatically, mark their UDFs with `@udf(auto_backfill=True)`. See +[Backfilling](/geneva/jobs/backfilling/). + diff --git a/docs/geneva/getting-started.mdx b/docs/geneva/getting-started.mdx new file mode 100644 index 00000000..51ae74b5 --- /dev/null +++ b/docs/geneva/getting-started.mdx @@ -0,0 +1,106 @@ +--- +title: Getting Started +sidebarTitle: Getting Started +description: Connect to LanceDB Enterprise, define a UDF, and run a distributed backfill — from a notebook or a script. +icon: rocket +--- + +Connect to your LanceDB Enterprise deployment, define a UDF, and run a distributed +backfill — all from a notebook or a script. No cluster setup required. 
+ + +```python Python icon="python" +import os +import geneva +import pyarrow as pa + +# Connect to LanceDB Enterprise +db = geneva.connect( + uri="db://my-db", + host_override=os.getenv("LANCEDB_URI", "http://localhost:10024"), + api_key=os.getenv("LANCEDB_API_KEY"), +) + +tbl = db.open_table("my_table") + +# Define a User Defined Function (UDF) that counts the words in the text column +@geneva.udf(data_type=pa.int32()) +def word_count(text: str) -> int: + return len(text.split()) + +# Register the UDF as a new virtual column +tbl.add_columns({"word_count": word_count}) + +# Backfill the new column using distributed execution with incremental checkpointing +tbl.backfill("word_count") +``` + + +## Auto-backfill + +With `auto_backfill=True`, LanceDB Enterprise recomputes the column for you whenever the +data or the UDF version changes — no explicit `backfill()` call needed (see +[Backfilling](/geneva/jobs/backfilling/)). + + +```python Python icon="python" +# Change the column to use a new UDF version with auto-backfill enabled +@geneva.udf(data_type=pa.int32(), auto_backfill=True) +def word_count(text: str) -> int: + return len(text.split()) + +tbl.alter_columns({"path": "word_count", "udf": word_count}) + +# Add new rows. word_count is computed automatically in the background. +tbl.add([{"text": "hello world"}]) +``` + + +## Materialized views and chunkers + +A [materialized view](/geneva/jobs/materialized-views/) applies UDFs over a query and +refreshes incrementally. A [chunker](/geneva/udfs/scalar-udtfs) view expands each source +row into many rows (1:N) — useful for splitting documents, videos, or images. 
+ + +```python Python icon="python" +# Materialized view: a query with UDF-computed columns, refreshed incrementally +query = tbl.search(None).select({"text": "text", "word_count": word_count}) +view = db.create_materialized_view("my_view", query) +view.refresh() + +# Chunker view: 1:N row expansion — split each row's text into one row per word +from typing import Iterator, NamedTuple + +class Chunk(NamedTuple): + chunk_index: int + chunk_text: str + +@geneva.chunker +def split_text(text: str) -> Iterator[Chunk]: + for i, word in enumerate(text.split()): + yield Chunk(chunk_index=i, chunk_text=word) + +chunks = db.create_udtf_view( + "my_chunks", + source=tbl.search(None).select(["text"]), + udtf=split_text, +) +chunks.refresh() +``` + + +## Connecting to object storage or a local filesystem + +Geneva can also run directly against cloud object storage or a local path. In this mode, jobs run on a +[distributed execution context](/geneva/jobs/contexts) you provide. + + +```python Python icon="python" +# Cloud object storage (S3, GCS, Azure, or any S3-compatible object store) +db = geneva.connect("s3://my-bucket/my-database") + +# Local filesystem +db = geneva.connect("/path/to/my-database") +``` + diff --git a/docs/geneva/index.mdx b/docs/geneva/index.mdx index e1cd2679..ea85711f 100644 --- a/docs/geneva/index.mdx +++ b/docs/geneva/index.mdx @@ -17,6 +17,10 @@ With an API designed to leverage LanceDB's optimized data storage and retrieval, streamlines prototyping extraction and transformation tasks, performing experiments, exploring your data, scaling up execution, and moving to production. +LanceDB Multimodal Feature Engineering enables researchers to seamlessly transition from +experiments in local notebooks to fully-managed distributed job execution on datasets with +billions of rows. + Feature Engineering and the `geneva` Python package are currently only available as part of [LanceDB Enterprise](/enterprise). 
Please [contact us](mailto:contact@lancedb.com) if you're interested @@ -37,7 +41,7 @@ as columns in a Lance dataset. Adding a feature is straightforward: Register the UDF as a virtual column using `Table.add_columns()`. - (Optional) Configure where the UDF will run: locally, on a Ray cluster, or on a Kubernetes cluster with KubeRay (see [Contexts](/geneva/jobs/contexts)). + (Optional, advanced) Override where the job runs — see [Advanced Execution Contexts](/geneva/jobs/contexts). On LanceDB Enterprise, distributed job execution is fully managed, so most users can skip this step. Trigger a `backfill` operation (see [Backfilling](/geneva/jobs/backfilling/)). @@ -48,13 +52,15 @@ as columns in a Lance dataset. Adding a feature is straightforward: You can build your Python feature generator function in an IDE or a notebook using your project's Python versions and dependencies. `geneva` will automate much of the dependency and version management needed to move from prototype to scale and production. +Ready to write your first feature? Head to [Getting Started](/geneva/getting-started). 
+ ## Continue learning Visit the following pages to learn more about featuring engineering in LanceDB Enterprise: -- **Overview**: [What is Feature Engineering?](/geneva/overview/) · [End-to-end example](/geneva/end-to-end) +- **Get started**: [Getting Started](/geneva/getting-started) · [What is Feature Engineering?](/geneva/overview/) · [End-to-end example](/geneva/end-to-end) - **UDFs**: [Using UDFs](/geneva/udfs/udfs) · [Blob helpers](/geneva/udfs/blobs/) · [Error handling](/geneva/udfs/error_handling) · [Advanced configuration](/geneva/udfs/advanced-configuration) -- **Jobs**: [Backfilling](/geneva/jobs/backfilling/) · [Startup optimizations](/geneva/jobs/startup/) · [Materialized views](/geneva/jobs/materialized-views/) · [Execution contexts](/geneva/jobs/contexts/) · [Geneva console](/geneva/jobs/console) · [Performance](/geneva/jobs/performance/) +- **Jobs**: [Backfilling](/geneva/jobs/backfilling/) · [Materialized views](/geneva/jobs/materialized-views/) · [Startup optimizations](/geneva/jobs/startup/) · [Advanced job configuration](/geneva/jobs/advanced-job-configuration/) · [Advanced execution contexts](/geneva/jobs/contexts/) · [Geneva console](/geneva/jobs/console) · [Performance](/geneva/jobs/performance/) - **Deployment**: [Deployment overview](/geneva/deployment/) · [Helm deployment](/geneva/deployment/helm/) · [Troubleshooting](/geneva/deployment/troubleshooting/) ## API Reference diff --git a/docs/geneva/jobs/advanced-job-configuration.mdx b/docs/geneva/jobs/advanced-job-configuration.mdx new file mode 100644 index 00000000..de619b26 --- /dev/null +++ b/docs/geneva/jobs/advanced-job-configuration.mdx @@ -0,0 +1,157 @@ +--- +title: Advanced Job Configuration +sidebarTitle: Advanced job configuration +description: Pin the dependency manifest a transform's distributed job runs with using @udf, @chunker, and @udtf. 
+icon: sliders +--- + +Enterprise-only + +On LanceDB Enterprise, backfill and refresh jobs run on a managed, distributed execution +environment configured at deployment time: + +- the **default cluster** — the compute pool jobs run on, and +- the **default manifest** — the Python dependency environment (image and packages) the + distributed workers run with. + +These defaults are set in the [LanceDB Helm chart](/geneva/deployment/helm) and cover most +workloads. When a transform needs dependencies that differ from the deployment default, pin a +**manifest** on the transform itself, as described below. + + +To override the **cluster** a job runs on — for example to route an embedding backfill to a +GPU pool — see [Advanced Execution Contexts](/geneva/jobs/contexts). + + +## Pinning a dependency manifest + +A manifest pins the Python image and packages the distributed workers run with. Build one with +the manifest builders, then attach it to your transform with the `manifest=` argument on +`@udf`, `@chunker`, or `@udtf`. The manifest is snapshotted into the column (or view) metadata +when the transform is registered, so every backfill or refresh of that transform uses it +automatically — there is no per-call manifest argument to remember. + + +**Manifests are immutable at the column / view level.** When a transform is registered, its +manifest is snapshotted onto the column (or view) metadata. Changing the deployment-default +manifest — or the `GenevaManifest` object in your code — does **not** affect existing columns +or views: they keep using the snapshot taken at creation time. To move a column or view to a +new manifest, re-point it to a new (or updated) UDF / chunker / UDTF — for example with +`alter_columns()` for a column, or by recreating the view. 
+ + +```python +import pyarrow as pa +from typing import Iterator, NamedTuple +from geneva import udf, chunker, udtf +from geneva.manifest import GenevaManifest + +# Build a manifest that pins the dependencies these transforms need +embed_manifest = ( + GenevaManifest.create_pip("embedding-deps") + .pip(["sentence-transformers==3.3.1", "torch==2.5.1"]) + .build() +) +``` + +### `@udf(manifest=...)` + +Pin dependencies for a 1:1 computed column: + +```python +@udf(data_type=pa.list_(pa.float32(), 384), manifest=embed_manifest) +def embed(text: str) -> list[float]: + from sentence_transformers import SentenceTransformer + model = SentenceTransformer("BAAI/bge-small-en-v1.5") + return model.encode(text, normalize_embeddings=True).tolist() + +tbl.add_columns({"embedding": embed}) +tbl.backfill("embedding") # the backfill job runs with embed_manifest +``` + +### `@chunker(manifest=...)` + +Pin dependencies for a 1:N [chunker](/geneva/udfs/scalar-udtfs) (scalar UDTF): + +```python +class Chunk(NamedTuple): + chunk_index: int + chunk_text: str + +@chunker(manifest=embed_manifest) +def split_document(text: str) -> Iterator[Chunk]: + for i, part in enumerate(text.split("\n\n")): + yield Chunk(chunk_index=i, chunk_text=part) + +view = db.create_udtf_view("chunks", source=tbl.search(None), udtf=split_document) +view.refresh() # the refresh job runs with embed_manifest +``` + +### `@udtf(manifest=...)` + +Pin dependencies for an N:M [batch UDTF](/geneva/udfs/batch-udtfs): + +```python +@udtf( + output_schema=pa.schema([ + pa.field("label", pa.string()), + pa.field("count", pa.int64()), + ]), + manifest=embed_manifest, +) +def group_stats(source) -> Iterator[pa.RecordBatch]: + df = source.to_pandas() + agg = df.groupby("label").size().reset_index(name="count") + yield pa.RecordBatch.from_pandas(agg) + +view = db.create_udtf_view("summaries", source=tbl.search(None), udtf=group_stats) +view.refresh() # the refresh job runs with embed_manifest +``` + +## Capturing your local 
environment for testing + +When iterating locally, you often want the workers to run with the *exact* packages from your +current environment rather than a curated pip list. `Connection.capture_local_environment()` +zips your workspace (and, optionally, your site-packages), uploads the archives through the +connection, and returns a ready-to-use `GenevaManifest` you can attach to a transform with +`manifest=`. + +```python +import os +import pyarrow as pa +import geneva +from geneva import udf + +db = geneva.connect( + uri="db://my-db", + host_override=os.getenv("LANCEDB_URI"), + api_key=os.getenv("LANCEDB_API_KEY"), +) + +# Capture the local workspace; rely on the worker image for site-packages +manifest = db.capture_local_environment(skip_site_packages=True) + +@udf(data_type=pa.string(), manifest=manifest) +def shout(text: str) -> str: + return text.upper() + +tbl = db.open_table("my_table") +tbl.add_columns({"shout": shout}) +tbl.backfill("shout") # workers run with your captured environment +``` + +Pass `skip_site_packages=False` (the default) to also upload your local site-packages. + +## Manifest resolution + +For a given transform, the manifest is resolved in this order (first match wins): + +1. The manifest pinned on the transform via `@udf` / `@chunker` / `@udtf` `manifest=`. +2. For a materialized view, the manifest snapshotted on the view when it was created. +3. The deployment-default manifest from the [LanceDB Helm chart](/geneva/deployment/helm). + + +The `manifest=` argument applies to managed enterprise (`db://`) jobs. For direct +object-storage or local-filesystem connections, configure the dependency environment +explicitly with an [Advanced Execution Context](/geneva/jobs/contexts) instead. 
+ diff --git a/docs/geneva/jobs/backfilling.mdx b/docs/geneva/jobs/backfilling.mdx index b8e92b98..690a0d61 100644 --- a/docs/geneva/jobs/backfilling.mdx +++ b/docs/geneva/jobs/backfilling.mdx @@ -11,6 +11,39 @@ Triggering backfill creates a distributed job to run the UDF and populate the co **Checkpoints**: Each batch of UDF execution is checkpointed so that partial results are not lost in case of job failures. Jobs can resume and avoid most of the expense of having to recalculate values. +## Auto-backfill + +Computed columns can be explicitly backfilled or they can be configured to be backfilled +automatically as data changes. Set `auto_backfill=True` on the UDF, and the column is automatically recomputed +whenever it falls out of sync with its source data. + + +```python Python icon="python" +# Mark the column's UDF for automatic backfill +@udf(data_type=pa.list_(pa.float32(), 1536), version="1", auto_backfill=True) +def embed_udf(text: str) -> list[float]: + return embedding_model.encode(text) + +tbl.add_columns({"embedding": embed_udf}) + +# No explicit backfill() needed — adding rows triggers recomputation automatically +tbl.add(new_rows) +``` + + +The `auto_backfill` flag is recorded in the column metadata when the column is added or +altered. LanceDB Enterprise's managed agent watches for columns that need recomputation and +dispatches a distributed backfill job for you — there is no manual trigger and no status +polling. A column is recomputed when, for example, **new rows are added** (leaving it null for +those rows) or the **UDF version changes** (you bump `version=` and `alter_columns()` to the +new function). + + +Auto-backfill is an enterprise feature. On direct object-storage or local-filesystem +connections there is no managed agent, so `auto_backfill=True` has no effect and you must run +`backfill()` explicitly. + + ## Adaptive checkpoint sizing Geneva can automatically adjust checkpoint sizes during a backfill. 
It starts with small checkpoints (faster proof-of-life) and grows them as it observes stable throughput, while staying within safe bounds. Planning still uses your configured checkpoint size (`checkpoint_size`), but the actual checkpoint chunks can be smaller when adaptive sizing is enabled. diff --git a/docs/geneva/jobs/console.mdx b/docs/geneva/jobs/console.mdx index cb503f64..b94ea6a6 100644 --- a/docs/geneva/jobs/console.mdx +++ b/docs/geneva/jobs/console.mdx @@ -17,16 +17,16 @@ The Geneva Console provides a web-based interface for monitoring and managing Ge ## Getting Started -The Geneva console is installed with the Geneva Helm chart; [contact LanceDB](https://lancedb.com/contact/) for access to the Helm chart). - +The Geneva console is installed with the Geneva Helm chart; [contact LanceDB](https://lancedb.com/contact/) for access to the Helm chart. 1. Install or upgrade the Geneva Helm chart (see [Helm Deployment](/geneva/deployment/helm/)). -2. Forward port 3000 from the geneva-console-ui service: +2. In your web browser, connect to the Geneva Console UI using the external ingress/load balancer URL configured in your deployment. +3. **Fallback (no external ingress):** if your deployment doesn't expose the console via ingress or a load balancer, forward port 3000 from the `geneva-console-ui` service and open `http://localhost:3000`: ```bash -kubectl port-forward -n geneva svc/geneva-console-ui 3000:3000 +kubectl port-forward -n lancedb svc/geneva-console-ui 3000:3000 ``` -(Make sure you're using `-n` to specify the namespace correctly, using the value used when you installed the Helm chart. We advise `geneva`, so it's probably `geneva`.) -3. Open `http://localhost:3000` in your browser. When prompted, enter your bucket and database, like: +(Use `-n` to specify the namespace you installed the Helm chart into. We recommend `lancedb`.) +4. 
When prompted, enter your bucket and database, like: ``` s3://my-bucket/my-db ``` diff --git a/docs/geneva/jobs/contexts.mdx b/docs/geneva/jobs/contexts.mdx index 75180489..f81967c7 100644 --- a/docs/geneva/jobs/contexts.mdx +++ b/docs/geneva/jobs/contexts.mdx @@ -1,15 +1,24 @@ --- -title: Execution Contexts -sidebarTitle: Contexts -description: Learn how to set up your cluster for distributed execution. +title: Advanced Execution Contexts +sidebarTitle: Advanced execution contexts +description: Configure the distributed execution backend — clusters and dependency manifests — for Geneva jobs on object-storage and local-filesystem connections. icon: circle-nodes --- -The APIs on this page require Geneva **v0.10.0** or later. +**This page applies to direct object-storage and local-filesystem connections only.** On +LanceDB Enterprise (`db://`) connections, distributed job execution is fully managed: the +cluster and manifest are configured at deployment time (see +[Helm deployment](/geneva/deployment/helm)), and a transform can pin its own manifest with +`manifest=` (see [Advanced Job Configuration](/geneva/jobs/advanced-job-configuration)). You +do not define execution contexts there. + +When you connect Geneva directly to object storage (`s3://`, `gs://`, …) or a local path, +there is no managed control plane, so you configure the execution backend yourself using the +contexts below. The APIs on this page require Geneva **v0.10.0** or later. -Geneva currently supports one processing backend: **Ray**. There are 3 ways to connect to a Ray cluster: +Geneva's distributed execution backend is **Ray**. There are 3 ways to connect to a Ray cluster: 1. Local Ray 2. KubeRay: create a cluster on demand in your Kubernetes cluster. 
@@ -53,7 +62,7 @@ db = geneva.connect("s3://my-bucket/my-db") cluster_name = "my-geneva-cluster" # lowercase, numbers, hyphens only service_account = "my_k8s_service_account" # k8s service account that Geneva runs as -k8s_namespace = "geneva" # k8s namespace +k8s_namespace = "lancedb" # k8s namespace cluster = ( GenevaCluster.create_kuberay(cluster_name) diff --git a/docs/geneva/jobs/index.mdx b/docs/geneva/jobs/index.mdx index 61c5530e..b2bdec91 100644 --- a/docs/geneva/jobs/index.mdx +++ b/docs/geneva/jobs/index.mdx @@ -19,8 +19,8 @@ Optimize job and session startup times for faster interactive development and pr ## Execution Contexts -### [Execution Contexts](/geneva/jobs/contexts/) -Understand how Geneva automatically packages and deploys your Python execution environment to worker nodes for distributed execution using Ray. +### [Advanced Execution Contexts](/geneva/jobs/contexts/) +Understand how Geneva automatically packages and deploys your Python execution environment to worker nodes for distributed execution. ### [Geneva Console](/geneva/jobs/console/) Set up and access the Geneva Console for monitoring and managing Geneva jobs, clusters, and execution contexts. 
@@ -30,7 +30,7 @@ Set up and access the Geneva Console for monitoring and managing Geneva jobs, cl - **Distributed Processing**: Scale feature computation across multiple nodes - **Checkpointing**: Resume jobs from failures without losing progress - **Incremental Updates**: Only process new or modified data -- **Multiple Backends**: Support for Ray on Kubernetes and standalone clusters +- **Distributed Execution**: Run jobs on Kubernetes or standalone compute clusters - **Environment Management**: Automatic dependency packaging and deployment ## Getting Started diff --git a/docs/geneva/jobs/job_metrics.mdx b/docs/geneva/jobs/job_metrics.mdx index 7102478b..d6cbb34f 100644 --- a/docs/geneva/jobs/job_metrics.mdx +++ b/docs/geneva/jobs/job_metrics.mdx @@ -1,7 +1,7 @@ --- title: Job Metrics (Diagnostics) sidebarTitle: Job Metrics -description: Use metrics from Geneva to diagnose why a backfill/refresh job is slow. +description: Use Geneva Job Metrics to monitor and troubleshoot jobs in real time. icon: chart-simple --- diff --git a/docs/geneva/udfs/index.mdx b/docs/geneva/udfs/index.mdx index 4fdb4673..51c40a78 100644 --- a/docs/geneva/udfs/index.mdx +++ b/docs/geneva/udfs/index.mdx @@ -1,7 +1,7 @@ --- title: Understanding Transforms sidebarTitle: Understanding Transforms -description: Understand the three types of user-defined functions in Geneva — UDFs, scalar UDTFs, and batch UDTFs — and when to use each. +description: Understand the three types of user-defined functions in Geneva — UDFs, chunkers (scalar UDTFs), and batch UDTFs — and when to use each. icon: code-compare --- @@ -10,19 +10,19 @@ Geneva provides three types of user-defined functions for transforming data. Eac ## Choosing the Right Type - **Adding a column to each row?** Use a [**UDF**](/geneva/udfs/udfs). -- **Splitting each row into multiple rows?** Use a [**Scalar UDTF**](/geneva/udfs/scalar-udtfs). +- **Splitting each row into multiple rows?** Use a [**Chunker**](/geneva/udfs/scalar-udtfs). 
- **Computing across rows with a different output shape?** Use a [**Batch UDTF**](/geneva/udfs/batch-udtfs). ## At a Glance -| | UDF | Scalar UDTF | Batch UDTF | +| | UDF | Chunker (Scalar UDTF) | Batch UDTF | |---|---|---|---| | **Cardinality** | 1:1 | 1:N | N:M | -| **Decorator** | `@udf` | `@scalar_udtf` | `@udtf` | +| **Decorator** | `@udf` | `@chunker` | `@udtf` | | **Refresh** | Incremental | Incremental | Full | | **Parallelism** | Fragment-parallel | Fragment-parallel | Partition-parallel | | **Inherited columns** | N/A — adds to existing rows | Automatic from query | Independent output schema | -| **Registration** | [`table.add_columns()`](https://lancedb.github.io/geneva/api/table/#geneva.table.Table.add_columns) | [`db.create_scalar_udtf_view()`](https://lancedb.github.io/geneva/api/connection/#geneva.db.Connection.create_scalar_udtf_view) | [`db.create_udtf_view()`](https://lancedb.github.io/geneva/api/connection/#geneva.db.Connection.create_udtf_view) | +| **Registration** | [`table.add_columns()`](https://lancedb.github.io/geneva/api/table/#geneva.table.Table.add_columns) | [`db.create_udtf_view()`](https://lancedb.github.io/geneva/api/connection/#geneva.db.Connection.create_udtf_view) | [`db.create_udtf_view()`](https://lancedb.github.io/geneva/api/connection/#geneva.db.Connection.create_udtf_view) | ## UDFs (1:1) @@ -40,9 +40,9 @@ Each input row produces exactly one output value. The new column is added to the See [UDFs](/geneva/udfs/udfs) for the full guide. -## Scalar UDTFs (1:N) +## Chunkers (Scalar UDTFs, 1:N) -Scalar UDTFs **expand each source row into multiple output rows**. The output is a materialized view that inherits parent columns and supports incremental refresh. +Chunkers — also called scalar UDTFs — **expand each source row into multiple output rows**. The output is a materialized view that inherits parent columns and supports incremental refresh. 
**Source: `documents`** @@ -51,7 +51,7 @@ Scalar UDTFs **expand each source row into multiple output rows**. The output is | 1 | "Intro to AI" | "Machine learning is..." | | 2 | "Data Guide" | "Data pipelines are..." | -**Derived: `chunks`** (1:N expansion via `@scalar_udtf`) +**Derived: `chunks`** (1:N expansion via `@chunker`) | doc_id | title | chunk_index | chunk_text | |--------|-------|-------------|------------| @@ -66,7 +66,7 @@ Each source row produces **one or more** output rows. Parent columns (`doc_id`, **Use cases**: Document chunking, video segmentation, image tiling. -See [Scalar UDTFs](/geneva/udfs/scalar-udtfs) for the full guide. +See [Chunkers](/geneva/udfs/scalar-udtfs) for the full guide. ## Batch UDTFs (N:M) @@ -99,6 +99,6 @@ See [Batch UDTFs](/geneva/udfs/batch-udtfs) for the full guide. ## API Reference - [UDF](https://lancedb.github.io/geneva/api/udf/) — `@udf` decorator and `UDF` class -- [UDTF](https://lancedb.github.io/geneva/api/udtf/) — `@udtf`, `@scalar_udtf`, `@batch_udtf` decorators and `UDTF`/`ScalarUDTF` classes +- [UDTF](https://lancedb.github.io/geneva/api/udtf/) — `@udtf`, `@chunker`, `@batch_udtf` decorators and `UDTF`/`Chunker` classes - [Table](https://lancedb.github.io/geneva/api/table/) — `add_columns()`, `backfill()` -- [Connection](https://lancedb.github.io/geneva/api/connection/) — `create_udtf_view()`, `create_scalar_udtf_view()` +- [Connection](https://lancedb.github.io/geneva/api/connection/) — `create_udtf_view()`, `create_materialized_view()` diff --git a/docs/geneva/udfs/providers/index.mdx b/docs/geneva/udfs/providers/index.mdx index 36b40397..5328cbbf 100644 --- a/docs/geneva/udfs/providers/index.mdx +++ b/docs/geneva/udfs/providers/index.mdx @@ -64,7 +64,7 @@ table.backfill("summary_openai", where="summary_openai is null") All built-in UDFs share these capabilities: -- **API key handling** — Keys are captured from your local environment at UDF creation time and serialized with the UDF. 
No cluster-level environment configuration required. +- **API key handling** — Keys are captured from your local environment at UDF creation time and securely serialized with the UDF. No cluster-level environment configuration required. - **Retry with backoff** — Transient API errors (rate limits, timeouts, server errors) are automatically retried with exponential backoff. - **Batch processing** — Embedding UDFs batch multiple rows per API call for better throughput. - **L2 normalization** — Embedding UDFs support optional L2 normalization via the `normalize` parameter (disabled by default since both providers return pre-normalized vectors). diff --git a/docs/geneva/udfs/scalar-udtfs.mdx b/docs/geneva/udfs/scalar-udtfs.mdx index 561390d6..d7b7f38c 100644 --- a/docs/geneva/udfs/scalar-udtfs.mdx +++ b/docs/geneva/udfs/scalar-udtfs.mdx @@ -1,7 +1,7 @@ --- -title: Scalar User-Defined Table Functions (UDTFs) -sidebarTitle: Scalar UDTFs -description: Use scalar UDTFs for 1:N row expansion — split videos into clips, chunk documents, or tile images with automatic parent column inheritance and incremental refresh. +title: Chunkers (Scalar UDTFs) +sidebarTitle: Chunkers +description: Use chunkers (scalar UDTFs) for 1:N row expansion — split videos into clips, chunk documents, or tile images with automatic parent column inheritance and incremental refresh. icon: diagram-subtask --- @@ -18,7 +18,9 @@ import { Beta — introduced in Geneva 0.11.0 -Standard UDFs produce exactly **one output value per input row**. Scalar UDTFs enable **1:N row expansion** — each source row can produce multiple output rows. The results are stored as a materialized view with MV-style incremental refresh. +Standard UDFs produce exactly **one output value per input row**. **Chunkers** — also +called scalar UDTFs — enable **1:N row expansion**: each source row can produce multiple +output rows. The results are stored as a materialized view with MV-style incremental refresh. 
| Source Table | Derived Table | Expansion | |---|---|---| @@ -26,9 +28,32 @@ Standard UDFs produce exactly **one output value per input row**. Scalar UDTFs e | 1 document row | → N chunk rows | Text chunking | | 1 image row | → N tile rows | Image tiling | -## Defining a Scalar UDTF +For example, a chunker that splits documents into passages turns a `documents` table into a +`chunks` table, carrying the parent columns into every child row: -Use the `@scalar_udtf` decorator on a function that **yields** output rows. Geneva infers the output schema from the return type annotation. +**Source: `documents`** + +| doc_id | title | text | +|--------|-------|------| +| 1 | "Intro to AI" | "Machine learning is..." | +| 2 | "Data Guide" | "Data pipelines are..." | + +**Derived: `chunks`** (1:N expansion) + +| doc_id | title | chunk_index | chunk_text | +|--------|-------|-------------|------------| +| 1 | "Intro to AI" | 0 | "Machine learning..." | +| 1 | "Intro to AI" | 1 | "Neural networks..." | +| 1 | "Intro to AI" | 2 | "Training data..." | +| 2 | "Data Guide" | 0 | "Data pipelines..." | +| 2 | "Data Guide" | 1 | "ETL processes..." | + +Parent columns (`doc_id`, `title`) are inherited automatically; `chunk_index` and +`chunk_text` are generated by the chunker. + +## Defining a Chunker + +Use the `@chunker` decorator on a function that **yields** output rows. Geneva infers the output schema from the return type annotation. @@ -39,7 +64,7 @@ Use the `@scalar_udtf` decorator on a function that **yields** output rows. Gene Input parameters are bound to source columns **by name** — the parameter `video_path` binds to source column `video_path`, just like standard UDFs. -A scalar UDTF can yield **zero rows** for a source row. The source row is still marked as processed and will not be retried on the next refresh. +A chunker can yield **zero rows** for a source row. The source row is still marked as processed and will not be retried on the next refresh. 
### List return pattern @@ -52,7 +77,7 @@ If you prefer to build the full list in memory rather than yielding, you can ret -### Batched scalar UDTF +### Batched chunker For vectorized processing, use `batch=True`. The function receives Arrow arrays and returns a `RecordBatch` of expanded rows. Because the return type `pa.RecordBatch` cannot be inferred, you must supply `output_schema` explicitly: @@ -62,9 +87,9 @@ For vectorized processing, use `batch=True`. The function receives Arrow arrays -## Creating a Scalar UDTF View +## Creating a Chunker View -Scalar UDTFs use the `create_scalar_udtf_view` API: +Chunkers use the `create_udtf_view` API (passing the chunker as the `udtf` argument): @@ -100,7 +125,7 @@ The first three rows come from the `/v/a.mp4` source row, the last two from `/v/ ## Adding Computed Columns After Creation -Since scalar UDTF views are materialized views, you can add UDF-computed columns to the child table and backfill them: +Since chunker views are materialized views, you can add UDF-computed columns to the child table and backfill them: @@ -108,11 +133,11 @@ Since scalar UDTF views are materialized views, you can add UDF-computed columns -This is a powerful pattern: expand source rows with a scalar UDTF, then enrich the expanded rows with standard UDFs. +This is a powerful pattern: expand source rows with a chunker, then enrich the expanded rows with standard UDFs. ## Incremental Refresh -Scalar UDTFs support **incremental refresh**, just like standard materialized views: +Chunkers support **incremental refresh**, just like standard materialized views: - **New source rows**: The UDTF runs on new rows, inserting child rows. - **Deleted source rows**: Child rows linked to the deleted parent are cascade-deleted. @@ -126,9 +151,9 @@ Scalar UDTFs support **incremental refresh**, just like standard materialized vi Only the new source rows are processed. Existing clips from previous refreshes are untouched. 
-## Chaining UDTF Views +## Chaining Chunker Views -Scalar UDTF views are standard materialized views, so they can serve as the source for further views: +Chunker views are standard materialized views, so they can serve as the source for further views: @@ -144,8 +169,8 @@ Scalar UDTF views are standard materialized views, so they can serve as the sour -For a comparison of all three function types (UDFs, Scalar UDTFs, Batch UDTFs), see [Understanding Transforms](/geneva/udfs). +For a comparison of all three function types (UDFs, Chunkers, Batch UDTFs), see [Understanding Transforms](/geneva/udfs). Reference: -* [`scalar_udtf` API](https://lancedb.github.io/geneva/api/udtf/#geneva.scalar_udtf) -* [`create_scalar_udtf_view` API](https://lancedb.github.io/geneva/api/connection/#geneva.db.Connection.create_scalar_udtf_view) +* [`chunker` API](https://lancedb.github.io/geneva/api/udtf/#geneva.chunker) +* [`create_udtf_view` API](https://lancedb.github.io/geneva/api/connection/#geneva.db.Connection.create_udtf_view) diff --git a/docs/geneva/udfs/udfs.mdx b/docs/geneva/udfs/udfs.mdx index 1a3a3db0..1ab7248d 100644 --- a/docs/geneva/udfs/udfs.mdx +++ b/docs/geneva/udfs/udfs.mdx @@ -352,7 +352,41 @@ For example, this filter would only update the rows where area was currently nul table.backfill("area", where="area is null") ``` +## Auto-backfill + +For columns whose values should always stay in sync with their source data, set +`auto_backfill=True` on the UDF. On LanceDB Enterprise (`db://` connections), the column is +then recomputed for you automatically — you don't need to call `backfill()` yourself. + +```python +@udf(data_type=pa.int32(), version="2", auto_backfill=True) +def area_udf(x: int, y: int) -> int: + return x * y + +# Register a new auto-backfill column... 
+tbl.add_columns({"area": area_udf}) + +# ...or re-point an existing column to an auto-backfill UDF +tbl.alter_columns({"path": "area", "udf": area_udf}) +``` + +### How it works + +The `auto_backfill` flag is recorded in the column's metadata when the column is added or +altered. LanceDB Enterprise's managed agent watches for columns that need recomputation and +dispatches a [distributed backfill job](/geneva/jobs/backfilling/) automatically — there is no +manual trigger and no status polling. A column is recomputed when, for example: + +- **New rows are added** to the table (`tbl.add(...)`), leaving the column null for those rows. +- **The UDF version changes** — you bump `version=` and call `alter_columns()` with the new function. + + +Auto-backfill is an enterprise feature. On direct object-storage or local-filesystem +connections there is no managed agent, so `auto_backfill=True` has no effect and you must run +`backfill()` explicitly. + + Reference: * [`alter_columns` API](https://lancedb.github.io/geneva/api/table/#geneva.table.Table.alter_columns) * [`add_columns` API](https://lancedb.github.io/geneva/api/table/#geneva.table.Table.add_columns) -* [UDF](https://lancedb.github.io/geneva/api/udf/) — full `@udf` decorator reference including `data_type`, `num_gpus`, `batch_size`, and other options +* [UDF](https://lancedb.github.io/geneva/api/udf/) — full `@udf` decorator reference including `data_type`, `num_gpus`, `auto_backfill`, `batch_size`, and other options diff --git a/docs/snippets/geneva_scalar_udtfs.mdx b/docs/snippets/geneva_scalar_udtfs.mdx index 59822d0e..a51feb54 100644 --- a/docs/snippets/geneva_scalar_udtfs.mdx +++ b/docs/snippets/geneva_scalar_udtfs.mdx @@ -2,19 +2,19 @@ export const PyAddColumnsScalarUdtf = "@udf(data_type=pa.list_(pa.float32(), 512))\ndef clip_embedding(clip_bytes: bytes) -> list[float]:\n return embed_model.encode(clip_bytes)\n\n# Add an embedding column to the clips table\nclips.add_columns({\"embedding\": clip_embedding})\n\n# 
Backfill computes embeddings for all existing clips\nclips.backfill(\"embedding\")\n"; -export const PyChainingUdtfViews = "# videos → clips (1:N)\nclips = db.create_scalar_udtf_view(\n \"clips\", source=videos.search(None), scalar_udtf=extract_clips\n)\n\n# clips → frames (1:N)\nframes = db.create_scalar_udtf_view(\n \"frames\", source=clips.search(None), scalar_udtf=extract_frames\n)\n"; +export const PyChainingUdtfViews = "# videos → clips (1:N)\nclips = db.create_udtf_view(\n \"clips\", source=videos.search(None), udtf=extract_clips\n)\n\n# clips → frames (1:N)\nframes = db.create_udtf_view(\n \"frames\", source=clips.search(None), udtf=extract_frames\n)\n"; -export const PyCreateScalarUdtfView = "import geneva\n\ndb = geneva.connect(\"/data/mydb\")\nvideos = db.open_table(\"videos\")\n\n# Create the 1:N materialized view\nclips = db.create_scalar_udtf_view(\n \"clips\",\n source=videos.search(None).select([\"video_path\", \"metadata\"]),\n scalar_udtf=extract_clips,\n)\n\n# Populate — runs the UDTF on every source row\nclips.refresh()\n"; +export const PyCreateScalarUdtfView = "import geneva\n\ndb = geneva.connect(\"/data/mydb\")\nvideos = db.open_table(\"videos\")\n\n# Create the 1:N materialized view\nclips = db.create_udtf_view(\n \"clips\",\n source=videos.search(None).select([\"video_path\", \"metadata\"]),\n udtf=extract_clips,\n)\n\n# Populate — runs the UDTF on every source row\nclips.refresh()\n"; -export const PyDocumentChunkingFull = "from geneva import connect, scalar_udtf, udf\nfrom typing import Iterator, NamedTuple\nimport pyarrow as pa\n\nclass Chunk(NamedTuple):\n chunk_index: int\n chunk_text: str\n\n@scalar_udtf\ndef chunk_document(text: str) -> Iterator[Chunk]:\n \"\"\"Split a document into overlapping chunks.\"\"\"\n words = text.split()\n chunk_size = 500\n overlap = 50\n for i, start in enumerate(range(0, len(words), chunk_size - overlap)):\n chunk_words = words[start:start + chunk_size]\n yield Chunk(chunk_index=i, chunk_text=\" 
\".join(chunk_words))\n\ndb = connect(\"/data/mydb\")\ndocs = db.open_table(\"documents\")\n\n# Create chunked view — inherits doc_id, title, etc. from source\nchunks = db.create_scalar_udtf_view(\n \"doc_chunks\",\n source=docs.search(None).select([\"doc_id\", \"title\", \"text\"]),\n scalar_udtf=chunk_document,\n)\nchunks.refresh()\n\n# Add embeddings to chunks for semantic search\n@udf(data_type=pa.list_(pa.float32(), 1536))\ndef embed_text(chunk_text: str) -> list[float]:\n return embedding_model.encode(chunk_text)\n\nchunks.add_columns({\"embedding\": embed_text})\nchunks.backfill(\"embedding\") # Backfills embeddings on all existing chunks\n\n# Query — parent columns available alongside chunk columns\nchunks.search(None).select([\"doc_id\", \"title\", \"chunk_text\", \"embedding\"]).to_pandas()\n"; +export const PyDocumentChunkingFull = "from geneva import connect, chunker, udf\nfrom typing import Iterator, NamedTuple\nimport pyarrow as pa\n\nclass Chunk(NamedTuple):\n chunk_index: int\n chunk_text: str\n\n@chunker\ndef chunk_document(text: str) -> Iterator[Chunk]:\n \"\"\"Split a document into overlapping chunks.\"\"\"\n words = text.split()\n chunk_size = 500\n overlap = 50\n for i, start in enumerate(range(0, len(words), chunk_size - overlap)):\n chunk_words = words[start:start + chunk_size]\n yield Chunk(chunk_index=i, chunk_text=\" \".join(chunk_words))\n\ndb = connect(\"/data/mydb\")\ndocs = db.open_table(\"documents\")\n\n# Create chunked view — inherits doc_id, title, etc. 
from source\nchunks = db.create_udtf_view(\n \"doc_chunks\",\n source=docs.search(None).select([\"doc_id\", \"title\", \"text\"]),\n udtf=chunk_document,\n)\nchunks.refresh()\n\n# Add embeddings to chunks for semantic search\n@udf(data_type=pa.list_(pa.float32(), 1536))\ndef embed_text(chunk_text: str) -> list[float]:\n return embedding_model.encode(chunk_text)\n\nchunks.add_columns({\"embedding\": embed_text})\nchunks.backfill(\"embedding\") # Backfills embeddings on all existing chunks\n\n# Query — parent columns available alongside chunk columns\nchunks.search(None).select([\"doc_id\", \"title\", \"chunk_text\", \"embedding\"]).to_pandas()\n"; -export const PyDocumentChunkingUdtf = "from geneva import scalar_udtf\nfrom typing import Iterator, NamedTuple\n\nclass Chunk(NamedTuple):\n chunk_index: int\n chunk_text: str\n\n@scalar_udtf\ndef chunk_document(text: str) -> Iterator[Chunk]:\n \"\"\"Split a document into overlapping chunks.\"\"\"\n words = text.split()\n chunk_size = 500\n overlap = 50\n for i, start in enumerate(range(0, len(words), chunk_size - overlap)):\n chunk_words = words[start:start + chunk_size]\n yield Chunk(chunk_index=i, chunk_text=\" \".join(chunk_words))\n"; +export const PyDocumentChunkingUdtf = "from geneva import chunker\nfrom typing import Iterator, NamedTuple\n\nclass Chunk(NamedTuple):\n chunk_index: int\n chunk_text: str\n\n@chunker\ndef chunk_document(text: str) -> Iterator[Chunk]:\n \"\"\"Split a document into overlapping chunks.\"\"\"\n words = text.split()\n chunk_size = 500\n overlap = 50\n for i, start in enumerate(range(0, len(words), chunk_size - overlap)):\n chunk_words = words[start:start + chunk_size]\n yield Chunk(chunk_index=i, chunk_text=\" \".join(chunk_words))\n"; export const PyIncrementalRefresh = "# Add new videos to the source table\nvideos.add(new_video_data)\n\n# Incremental refresh — only processes the new videos\nclips.refresh()\n"; -export const PyScalarUdtfBatch = "@scalar_udtf(batch=True, 
output_schema=clip_schema)\ndef extract_clips(batch: pa.RecordBatch) -> pa.RecordBatch:\n \"\"\"Process rows in batches. Same 1:N semantic per row.\"\"\"\n ...\n"; +export const PyScalarUdtfBatch = "@chunker(batch=True, output_schema=clip_schema)\ndef extract_clips(batch: pa.RecordBatch) -> pa.RecordBatch:\n \"\"\"Process rows in batches. Same 1:N semantic per row.\"\"\"\n ...\n"; -export const PyScalarUdtfIterator = "from geneva import scalar_udtf\nfrom typing import Iterator, NamedTuple\n\nclass Clip(NamedTuple):\n clip_start: float\n clip_end: float\n clip_bytes: bytes\n\n@scalar_udtf\ndef extract_clips(video_path: str, duration: float) -> Iterator[Clip]:\n \"\"\"Yields multiple clips per video.\"\"\"\n clip_length = 10.0\n for start in range(0, int(duration), int(clip_length)):\n end = min(start + clip_length, duration)\n clip_data = extract_video_segment(video_path, start, end)\n yield Clip(clip_start=start, clip_end=end, clip_bytes=clip_data)\n"; +export const PyScalarUdtfIterator = "from geneva import chunker\nfrom typing import Iterator, NamedTuple\n\nclass Clip(NamedTuple):\n clip_start: float\n clip_end: float\n clip_bytes: bytes\n\n@chunker\ndef extract_clips(video_path: str, duration: float) -> Iterator[Clip]:\n \"\"\"Yields multiple clips per video.\"\"\"\n clip_length = 10.0\n for start in range(0, int(duration), int(clip_length)):\n end = min(start + clip_length, duration)\n clip_data = extract_video_segment(video_path, start, end)\n yield Clip(clip_start=start, clip_end=end, clip_bytes=clip_data)\n"; -export const PyScalarUdtfList = "@scalar_udtf\ndef extract_clips(video_path: str, duration: float) -> list[Clip]:\n clips = []\n for start in range(0, int(duration), 10):\n end = min(start + 10, duration)\n clips.append(Clip(clip_start=start, clip_end=end, clip_bytes=b\"...\"))\n return clips\n"; +export const PyScalarUdtfList = "@chunker\ndef extract_clips(video_path: str, duration: float) -> list[Clip]:\n clips = []\n for start in range(0, 
int(duration), 10):\n end = min(start + 10, duration)\n clips.append(Clip(clip_start=start, clip_end=end, clip_bytes=b\"...\"))\n return clips\n"; diff --git a/docs/snippets/geneva_udfs_index.mdx b/docs/snippets/geneva_udfs_index.mdx index 592abeaf..923102a3 100644 --- a/docs/snippets/geneva_udfs_index.mdx +++ b/docs/snippets/geneva_udfs_index.mdx @@ -1,6 +1,6 @@ {/* Auto-generated by scripts/mdx_snippets_gen.py. Do not edit manually. */} -export const PyRegistrationScalarUdtf = "db = geneva.connect(\"/data/mydb\")\ndb.create_scalar_udtf_view(\"my_view\", source=my_source, scalar_udtf=my_scalar_udtf)\n"; +export const PyRegistrationScalarUdtf = "db = geneva.connect(\"/data/mydb\")\ndb.create_udtf_view(\"my_view\", source=my_source, udtf=my_chunker)\n"; export const PyRegistrationUdf = "mock_table.add_columns({\"col\": my_udf})\n"; diff --git a/pyproject.toml b/pyproject.toml index 8957a7d9..9bb160b9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,20 @@ dependencies = [ "pytest>=9.0.1", "pytest-asyncio>=1.3.0", "Pillow>=12.1.1", - "geneva>=0.12.0", + "geneva>=0.13.0b15", "pyyaml>=6.0", ] + +[tool.uv] +# fury.io hosts beta geneva / pylance / lancedb releases. unsafe-best-match +# mimics pip so stable releases still fall back to PyPI. 
+index-strategy = "unsafe-best-match" +prerelease = "allow" + +[[tool.uv.index]] +name = "lancedb" +url = "https://pypi.fury.io/lancedb/" + +[[tool.uv.index]] +name = "lance-format" +url = "https://pypi.fury.io/lance-format/" diff --git a/tests/py/test_geneva_scalar_udtfs.py b/tests/py/test_geneva_scalar_udtfs.py index 4ada4702..30aec525 100644 --- a/tests/py/test_geneva_scalar_udtfs.py +++ b/tests/py/test_geneva_scalar_udtfs.py @@ -10,7 +10,7 @@ def extract_video_segment(path, start, end): return bytes(f"clip:{start}-{end}", "utf-8") # --8<-- [start:scalar_udtf_iterator] - from geneva import scalar_udtf + from geneva import chunker from typing import Iterator, NamedTuple class Clip(NamedTuple): @@ -18,7 +18,7 @@ class Clip(NamedTuple): clip_end: float clip_bytes: bytes - @scalar_udtf + @chunker def extract_clips(video_path: str, duration: float) -> Iterator[Clip]: """Yields multiple clips per video.""" clip_length = 10.0 @@ -28,8 +28,8 @@ def extract_clips(video_path: str, duration: float) -> Iterator[Clip]: yield Clip(clip_start=start, clip_end=end, clip_bytes=clip_data) # --8<-- [end:scalar_udtf_iterator] - from geneva.transformer import ScalarUDTF - assert isinstance(extract_clips, ScalarUDTF) + from geneva.transformer import Chunker + assert isinstance(extract_clips, Chunker) assert extract_clips.input_columns == ["video_path", "duration"] assert set(extract_clips.output_schema.names) == {"clip_start", "clip_end", "clip_bytes"} @@ -40,7 +40,7 @@ def extract_clips(video_path: str, duration: float) -> Iterator[Clip]: def test_scalar_udtf_list_return(): - from geneva import scalar_udtf + from geneva import chunker from typing import NamedTuple class Clip(NamedTuple): @@ -49,7 +49,7 @@ class Clip(NamedTuple): clip_bytes: bytes # --8<-- [start:scalar_udtf_list] - @scalar_udtf + @chunker def extract_clips(video_path: str, duration: float) -> list[Clip]: clips = [] for start in range(0, int(duration), 10): @@ -58,8 +58,8 @@ def extract_clips(video_path: str, 
duration: float) -> list[Clip]: return clips # --8<-- [end:scalar_udtf_list] - from geneva.transformer import ScalarUDTF - assert isinstance(extract_clips, ScalarUDTF) + from geneva.transformer import Chunker + assert isinstance(extract_clips, Chunker) clips = extract_clips.func("/v/a.mp4", 30.0) assert len(clips) == 3 assert all(c.clip_bytes == b"..." for c in clips) @@ -67,7 +67,7 @@ def extract_clips(video_path: str, duration: float) -> list[Clip]: def test_scalar_udtf_batch(): import pyarrow as pa - from geneva import scalar_udtf + from geneva import chunker clip_schema = pa.schema([ ("clip_start", pa.float64()), @@ -75,27 +75,27 @@ def test_scalar_udtf_batch(): ]) # --8<-- [start:scalar_udtf_batch] - @scalar_udtf(batch=True, output_schema=clip_schema) + @chunker(batch=True, output_schema=clip_schema) def extract_clips(batch: pa.RecordBatch) -> pa.RecordBatch: """Process rows in batches. Same 1:N semantic per row.""" ... # --8<-- [end:scalar_udtf_batch] - from geneva.transformer import ScalarUDTF - assert isinstance(extract_clips, ScalarUDTF) + from geneva.transformer import Chunker + assert isinstance(extract_clips, Chunker) assert extract_clips.batch is True def test_create_scalar_udtf_view(monkeypatch): from typing import Iterator, NamedTuple - from geneva.transformer import scalar_udtf + from geneva.transformer import chunker class Clip(NamedTuple): clip_start: float clip_end: float clip_bytes: bytes - @scalar_udtf + @chunker def extract_clips(video_path: str, duration: float) -> Iterator[Clip]: for start in range(0, int(duration), 10): yield Clip(clip_start=start, clip_end=min(start + 10.0, duration), clip_bytes=b"") @@ -104,7 +104,7 @@ def extract_clips(video_path: str, duration: float) -> Iterator[Clip]: from unittest.mock import create_autospec mock_clips = MagicMock() mock_db = create_autospec(geneva.db.Connection, instance=True) - mock_db.create_scalar_udtf_view.return_value = mock_clips + mock_db.create_udtf_view.return_value = mock_clips 
monkeypatch.setattr("geneva.connect", MagicMock(return_value=mock_db)) # --8<-- [start:create_scalar_udtf_view] @@ -114,19 +114,19 @@ def extract_clips(video_path: str, duration: float) -> Iterator[Clip]: videos = db.open_table("videos") # Create the 1:N materialized view - clips = db.create_scalar_udtf_view( + clips = db.create_udtf_view( "clips", source=videos.search(None).select(["video_path", "metadata"]), - scalar_udtf=extract_clips, + udtf=extract_clips, ) # Populate — runs the UDTF on every source row clips.refresh() # --8<-- [end:create_scalar_udtf_view] - call_kwargs = mock_db.create_scalar_udtf_view.call_args + call_kwargs = mock_db.create_udtf_view.call_args assert call_kwargs.args[0] == "clips" - assert call_kwargs.kwargs["scalar_udtf"] is extract_clips + assert call_kwargs.kwargs["udtf"] is extract_clips mock_clips.refresh.assert_called_once() @@ -173,7 +173,7 @@ def test_incremental_refresh(): def test_chaining_udtf_views(monkeypatch): from typing import Iterator, NamedTuple - from geneva.transformer import scalar_udtf + from geneva.transformer import chunker class Clip(NamedTuple): clip_start: float @@ -183,12 +183,12 @@ class Frame(NamedTuple): frame_index: int frame_bytes: bytes - @scalar_udtf + @chunker def extract_clips(video_path: str, duration: float) -> Iterator[Clip]: for start in range(0, int(duration), 10): yield Clip(clip_start=start, clip_end=min(start + 10.0, duration)) - @scalar_udtf + @chunker def extract_frames(clip_start: float, clip_end: float) -> Iterator[Frame]: yield Frame(frame_index=0, frame_bytes=b"") @@ -202,33 +202,33 @@ def extract_frames(clip_start: float, clip_end: float) -> Iterator[Frame]: # --8<-- [start:chaining_udtf_views] # videos → clips (1:N) - clips = db.create_scalar_udtf_view( - "clips", source=videos.search(None), scalar_udtf=extract_clips + clips = db.create_udtf_view( + "clips", source=videos.search(None), udtf=extract_clips ) # clips → frames (1:N) - frames = db.create_scalar_udtf_view( - "frames", 
source=clips.search(None), scalar_udtf=extract_frames + frames = db.create_udtf_view( + "frames", source=clips.search(None), udtf=extract_frames ) # --8<-- [end:chaining_udtf_views] - assert mock_db.create_scalar_udtf_view.call_count == 2 - first_call = mock_db.create_scalar_udtf_view.call_args_list[0] - second_call = mock_db.create_scalar_udtf_view.call_args_list[1] - assert first_call.kwargs["scalar_udtf"] is extract_clips - assert second_call.kwargs["scalar_udtf"] is extract_frames + assert mock_db.create_udtf_view.call_count == 2 + first_call = mock_db.create_udtf_view.call_args_list[0] + second_call = mock_db.create_udtf_view.call_args_list[1] + assert first_call.kwargs["udtf"] is extract_clips + assert second_call.kwargs["udtf"] is extract_frames def test_document_chunking_udtf(): # --8<-- [start:document_chunking_udtf] - from geneva import scalar_udtf + from geneva import chunker from typing import Iterator, NamedTuple class Chunk(NamedTuple): chunk_index: int chunk_text: str - @scalar_udtf + @chunker def chunk_document(text: str) -> Iterator[Chunk]: """Split a document into overlapping chunks.""" words = text.split() @@ -255,14 +255,14 @@ def test_document_chunking_full(monkeypatch): from unittest.mock import create_autospec mock_chunks_table = MagicMock() mock_db = create_autospec(geneva.db.Connection, instance=True) - mock_db.create_scalar_udtf_view.return_value = mock_chunks_table + mock_db.create_udtf_view.return_value = mock_chunks_table monkeypatch.setattr("geneva.connect", MagicMock(return_value=mock_db)) embedding_model = MagicMock() embedding_model.encode.return_value = [0.1] * 1536 # --8<-- [start:document_chunking_full] - from geneva import connect, scalar_udtf, udf + from geneva import connect, chunker, udf from typing import Iterator, NamedTuple import pyarrow as pa @@ -270,7 +270,7 @@ class Chunk(NamedTuple): chunk_index: int chunk_text: str - @scalar_udtf + @chunker def chunk_document(text: str) -> Iterator[Chunk]: """Split a document into 
overlapping chunks.""" words = text.split() @@ -284,10 +284,10 @@ def chunk_document(text: str) -> Iterator[Chunk]: docs = db.open_table("documents") # Create chunked view — inherits doc_id, title, etc. from source - chunks = db.create_scalar_udtf_view( + chunks = db.create_udtf_view( "doc_chunks", source=docs.search(None).select(["doc_id", "title", "text"]), - scalar_udtf=chunk_document, + udtf=chunk_document, ) chunks.refresh() @@ -303,9 +303,9 @@ def embed_text(chunk_text: str) -> list[float]: chunks.search(None).select(["doc_id", "title", "chunk_text", "embedding"]).to_pandas() # --8<-- [end:document_chunking_full] - call_kwargs = mock_db.create_scalar_udtf_view.call_args + call_kwargs = mock_db.create_udtf_view.call_args assert call_kwargs.args[0] == "doc_chunks" - assert call_kwargs.kwargs["scalar_udtf"] is chunk_document + assert call_kwargs.kwargs["udtf"] is chunk_document mock_chunks_table.refresh.assert_called_once() mock_chunks_table.add_columns.assert_called_once() mock_chunks_table.backfill.assert_called_once_with("embedding") diff --git a/tests/py/test_geneva_udfs_index.py b/tests/py/test_geneva_udfs_index.py index 0c31ac96..dffe00d5 100644 --- a/tests/py/test_geneva_udfs_index.py +++ b/tests/py/test_geneva_udfs_index.py @@ -19,20 +19,20 @@ def test_registration_udf(monkeypatch): def test_registration_scalar_udtf(monkeypatch): - # Verifies that Connection.create_scalar_udtf_view exists with the expected signature. + # Verifies that Connection.create_udtf_view accepts a chunker. # If this fails, update the Registration row in docs/geneva/udfs/index.mdx. 
import geneva mock_db = create_autospec(geneva.db.Connection, instance=True) monkeypatch.setattr("geneva.connect", MagicMock(return_value=mock_db)) my_source = MagicMock() - my_scalar_udtf = MagicMock() + my_chunker = MagicMock() # --8<-- [start:registration_scalar_udtf] db = geneva.connect("/data/mydb") - db.create_scalar_udtf_view("my_view", source=my_source, scalar_udtf=my_scalar_udtf) + db.create_udtf_view("my_view", source=my_source, udtf=my_chunker) # --8<-- [end:registration_scalar_udtf] - mock_db.create_scalar_udtf_view.assert_called_once() + mock_db.create_udtf_view.assert_called_once() def test_registration_udtf(monkeypatch): diff --git a/uv.lock b/uv.lock index dc30c60f..0ad41d51 100644 --- a/uv.lock +++ b/uv.lock @@ -12,6 +12,9 @@ resolution-markers = [ "python_full_version < '3.13' and sys_platform != 'darwin' and sys_platform != 'emscripten' and sys_platform != 'win32'", ] +[options] +prerelease-mode = "allow" + [[package]] name = "aiohappyeyeballs" version = "2.6.2" @@ -379,7 +382,7 @@ dependencies = [ [package.metadata] requires-dist = [ - { name = "geneva", specifier = ">=0.12.0" }, + { name = "geneva", specifier = ">=0.13.0b15" }, { name = "lance-namespace", specifier = ">=0.6.1" }, { name = "lancedb", specifier = ">=0.33.0" }, { name = "pandas", specifier = ">=3.0.1" }, @@ -486,8 +489,8 @@ wheels = [ [[package]] name = "geneva" -version = "0.12.0" -source = { registry = "https://pypi.org/simple" } +version = "0.13.0b17" +source = { registry = "https://pypi.fury.io/lancedb/" } dependencies = [ { name = "aiohttp" }, { name = "attrs" }, @@ -497,6 +500,7 @@ dependencies = [ { name = "docker" }, { name = "emoji" }, { name = "fsspec" }, + { name = "httpx" }, { name = "jinja2" }, { name = "kubernetes" }, { name = "lance-namespace" }, @@ -506,7 +510,6 @@ dependencies = [ { name = "numpy" }, { name = "overrides" }, { name = "pip" }, - { name = "pyarrow" }, { name = "pylance" }, { name = "pyyaml" }, { name = "ray", extra = ["client", "default"] }, @@ 
-519,9 +522,9 @@ dependencies = [ { name = "typing-extensions" }, { name = "urllib3" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ee/e7/97297410bf44686d680217b8238995c233d6222f705a03565c038cdee53a/geneva-0.12.0.tar.gz", hash = "sha256:178f96828ed9ceeadd161e230b1bcdb55ed9ccb38e73a0d72a5024a12770c5cf", size = 3325301, upload-time = "2026-04-01T21:19:22.761Z" } +sdist = { url = "https://pypi.fury.io/lancedb/-/ver_uzisX/geneva-0.13.0b17.tar.gz", hash = "sha256:b275fe8ddb1fd02678c2c369434d827d96a047f0ebda85b996ad64ac5a486fe7" } wheels = [ - { url = "https://files.pythonhosted.org/packages/62/56/d0f40f5cb98892e1ad97eac382dc5109f7b7402b36fa00f390319e950d47/geneva-0.12.0-py3-none-any.whl", hash = "sha256:86ce35551a363b1b66af8951ccb9560875b8517cd557987940ab55567401fc88", size = 371857, upload-time = "2026-04-01T21:19:24.504Z" }, + { url = "https://pypi.fury.io/lancedb/-/ver_2kLcAM/geneva-0.13.0b17-py3-none-any.whl", hash = "sha256:c49d19110b41f8fe8b55d7f721668448733be9d0e889f2b8b48833c3cdf4a47b" }, ] [[package]] @@ -596,6 +599,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cf/e6/283326a27da9e2c3038bc93eeea36fb118ce0b2d03922a9cda6688f53c5b/grpcio-1.80.0-cp313-cp313-win_amd64.whl", hash = "sha256:e172cf795a3ba5246d3529e4d34c53db70e888fa582a8ffebd2e6e48bc0cba50", size = 4882833, upload-time = "2026-03-30T08:48:07.363Z" }, ] +[[package]] +name = "httpx" +version = "1.0.dev3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/25/40/b6f25020eeafd822fc473394a5db45a5964a64c88788eb5dc49ffabb64e7/httpx-1.0.dev3.tar.gz", hash = "sha256:e95700e4f9cf6430295f4c195f9cb0ca0549bab4294927f8002bf196851d40db", size = 761377, upload-time = "2025-09-15T16:15:12.087Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/97/8d/ff4f018c9a813994e28fabb998f8d76542fc2b46c1210d4c1b2c615eea7f/httpx-1.0.dev3-py3-none-any.whl", hash = 
"sha256:80b33db1bc8e1fac2a15f419839e324d472d528822608ea6b7a93fed2011722d", size = 34319, upload-time = "2025-09-15T16:15:10.458Z" }, +] + [[package]] name = "idna" version = "3.17" @@ -675,19 +690,19 @@ wheels = [ [[package]] name = "lance-namespace" -version = "0.7.7" +version = "0.8.6" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "lance-namespace-urllib3-client" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/06/5c/9822af615fc1bd3ee1073994696c739aecde377be32435ec3303aed1bc5d/lance_namespace-0.7.7.tar.gz", hash = "sha256:d00b525f2e26993a6c61668e798bca6c808605ab8a79f29f86a1a1af92d91ae2", size = 10754, upload-time = "2026-05-20T17:32:59.45Z" } +sdist = { url = "https://files.pythonhosted.org/packages/af/12/f7ab93b29be3edbf5fc3610714bf2d06088e7f4524bfb38dfd6852458b08/lance_namespace-0.8.6.tar.gz", hash = "sha256:18232e721c8188145f4ec9389cc2dfbeeabf54a619d94885ea1b3375bee9f4af", size = 11529, upload-time = "2026-06-12T17:36:41.651Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/11/43/186acc1156da20c351db196e2b6241b2453b16dc1b4cc8e0a626667ca471/lance_namespace-0.7.7-py3-none-any.whl", hash = "sha256:477a7ca6b5e1f673a2c9ba52f42d6e8e3ff7c27a601392a21eb90fba98d0309b", size = 12581, upload-time = "2026-05-20T17:32:57.389Z" }, + { url = "https://files.pythonhosted.org/packages/a0/1b/5b1668ee2dc8910965f390640359112a31157092fcf8e000b89c79b58708/lance_namespace-0.8.6-py3-none-any.whl", hash = "sha256:571eae34f9aad70e5b05020416c2860889b9ec82993ccd0eb015e7b39c3ea309", size = 13383, upload-time = "2026-06-12T17:36:43.456Z" }, ] [[package]] name = "lance-namespace-urllib3-client" -version = "0.7.7" +version = "0.8.6" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "pydantic" }, @@ -695,15 +710,15 @@ dependencies = [ { name = "typing-extensions" }, { name = "urllib3" }, ] -sdist = { url = 
"https://files.pythonhosted.org/packages/07/95/38ab81ccc1e09beeecd8ddfc61b8bc73831dc5053db1e3f9021f64a4896b/lance_namespace_urllib3_client-0.7.7.tar.gz", hash = "sha256:4d8c066628c17c6a10cf643b51a7f7ae1bfb8a614d9cc54a5af38a4ba2b4b102", size = 202930, upload-time = "2026-05-20T17:32:58.308Z" } +sdist = { url = "https://files.pythonhosted.org/packages/c7/80/fb224b4a89c1c1638cde949cb6cce6c3aca7759effbfea46a3d9c3960b21/lance_namespace_urllib3_client-0.8.6.tar.gz", hash = "sha256:b6fb1d306e74a7576e5309919020be744527de484a63dbf5eed10f8b368548df", size = 228772, upload-time = "2026-06-12T17:36:42.609Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/35/96/5483e48e40433b1d078183c15a92c99e59a156041b0260e7f18ee34e7c08/lance_namespace_urllib3_client-0.7.7-py3-none-any.whl", hash = "sha256:9221c3e00fd89f0c811953d94b32d2ea527765280460a174f5872dc8a74c0ed6", size = 334767, upload-time = "2026-05-20T17:32:55.883Z" }, + { url = "https://files.pythonhosted.org/packages/c5/90/1e27de15cd1b16785a1c7312beb0a59e75c8344a815f600f58173a565bd1/lance_namespace_urllib3_client-0.8.6-py3-none-any.whl", hash = "sha256:9d78249c3fb15aa3d15d668f78f04a275af3d08d800a7027492f37996ac4968b", size = 369950, upload-time = "2026-06-12T17:36:40.438Z" }, ] [[package]] name = "lancedb" -version = "0.33.0" -source = { registry = "https://pypi.org/simple" } +version = "0.33.1b2" +source = { registry = "https://pypi.fury.io/lancedb/" } dependencies = [ { name = "deprecation" }, { name = "lance-namespace" }, @@ -714,12 +729,10 @@ dependencies = [ { name = "tqdm" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/09/2f/d5a4b2a5bb1f800936c76a6d8a4daf127a86fcab621eeb70b574a5adc774/lancedb-0.33.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:d4eaf6fa7c2eac619208f1d396f4de635ee0f535673067118a31c1181575c48b", size = 48338115, upload-time = "2026-05-28T20:37:55.88Z" }, - { url = 
"https://files.pythonhosted.org/packages/07/12/31787b93a856b2c31382c7771dc22fb05575b70b87c9efe454269f4f0948/lancedb-0.33.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c6c2402ed2744245ae76c4167c0461da0a7a80f1608e0ec491c1548ea2b4302", size = 51162262, upload-time = "2026-05-28T20:37:59.101Z" }, - { url = "https://files.pythonhosted.org/packages/49/b7/081cc29f8e06bf12191b99ab3fe702aceebdb0914476b821a8c0445cacc8/lancedb-0.33.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ebf1ffad811e6254a93931a79489ba1f21f48564bdfa06abae846f5fcaaf3e8", size = 54381368, upload-time = "2026-05-28T20:38:02.2Z" }, - { url = "https://files.pythonhosted.org/packages/1c/bd/e0f4bd621f10ecf96a801b0166e87799ed7ca5a9dbabcef9a6c766a58ef3/lancedb-0.33.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:13da39f80adfea59e5831fe64e4166b2d70a2f843e6507bf644c4fe4c350087c", size = 51188986, upload-time = "2026-05-28T20:38:05.375Z" }, - { url = "https://files.pythonhosted.org/packages/d9/1a/a8647a432ac6aa59cdce1fc061a7050ea4278bcab364539b78af2ecf72d2/lancedb-0.33.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:21b712825f0a00225e8974a41352c4ea84b0899ef8c23b17f672fadc38bd8346", size = 54440958, upload-time = "2026-05-28T20:38:08.474Z" }, - { url = "https://files.pythonhosted.org/packages/08/6c/d0cc8da784cd7ed3b4940a5d1f3e7702e2d99a0a348ba81a376eed782810/lancedb-0.33.0-cp39-abi3-win_amd64.whl", hash = "sha256:4ba78c6202b0f6c2ce8edc7aa470e550d2da56271c7cbdd10428613f1f7126f9", size = 58751944, upload-time = "2026-05-28T20:38:11.549Z" }, + { url = "https://pypi.fury.io/lancedb/-/ver_1Nlsr0/lancedb-0.33.1b2-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:ec87aee4fcb644ec84c78cd3f7029b3bf0d412f98cf12374f682c68f14265fef" }, + { url = "https://pypi.fury.io/lancedb/-/ver_1JjYh/lancedb-0.33.1b2-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:da5734098ebf16465e850ea661014fe0992f75234aa5a23f04ec95787bd05570" }, + { url = 
"https://pypi.fury.io/lancedb/-/ver_1a3Ksg/lancedb-0.33.1b2-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:22cde3613d74d93aa27b7f426453017538d03a36c6a1ddc6c9bf41944953079c" }, + { url = "https://pypi.fury.io/lancedb/-/ver_1FdgZt/lancedb-0.33.1b2-cp39-abi3-win_amd64.whl", hash = "sha256:db1de7ddb72916acf3ab66bfa1bfc8f32f22230716ef60fc7a88df3bf41a0572" }, ] [[package]] @@ -1461,20 +1474,20 @@ wheels = [ [[package]] name = "pylance" -version = "3.0.0" -source = { registry = "https://pypi.org/simple" } +version = "8.0.0b12" +source = { registry = "https://pypi.fury.io/lance-format/" } dependencies = [ { name = "lance-namespace" }, { name = "numpy" }, { name = "pyarrow" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/98/71/2af25612cdca7ae9491f0681d1d35f70e53888461f7d79cf071d5a22dff9/pylance-3.0.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:87b5039f176d1bff03b0f7bffbb5ea82b2e3bb20b3b2571898dcde63a650a200", size = 54661673, upload-time = "2026-03-13T15:29:04.457Z" }, - { url = "https://files.pythonhosted.org/packages/8c/92/72006bb16a19ddce1e1ef36a161d59b757596ce131502eb0d1a8384b9e1a/pylance-3.0.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88624cff38e26cd829dd890297032968ea5ea7ffe1e603c706691f5cd161be8d", size = 57023644, upload-time = "2026-03-13T15:33:20.518Z" }, - { url = "https://files.pythonhosted.org/packages/d4/5b/e45140924a9153cd1631b5c84f8a0f0316a08aeab93078744343b232c58f/pylance-3.0.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7e4c7a31a69d1cff6849ecf50c99f239175241570abad52fcaf68c997ec4d15", size = 60657024, upload-time = "2026-03-13T15:39:51.715Z" }, - { url = "https://files.pythonhosted.org/packages/8d/97/0a2aaef51ff2254602276b5b5bf0c91421be9d502c3cff22ce778593bd8d/pylance-3.0.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:8e25d2587517b7aa790e791a45ebefff139c9c8da9dbb018017ec0b13fdc1da0", size = 57052522, upload-time = "2026-03-13T15:31:04.564Z" }, - { url = 
"https://files.pythonhosted.org/packages/62/98/2e62c8f5051d3f803acef616d80ac27df17c7e839e93a57645ee8daeda6d/pylance-3.0.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:8a6eada325c5e4c630002d940ad8c6d4f365c09e3704d11d47a00fc2245ea3a5", size = 60589921, upload-time = "2026-03-13T15:39:56.305Z" }, - { url = "https://files.pythonhosted.org/packages/30/7d/d1f4d4a0613e4d323a1646317f3050f4b567edaf5f33bc3a8ef7146527f1/pylance-3.0.0-cp39-abi3-win_amd64.whl", hash = "sha256:0aee697d2d9ceaaf4a4dd9424b795716953d1da36d0731dcfaaa165d72e987a3", size = 65361200, upload-time = "2026-03-13T16:01:02.409Z" }, + { url = "https://pypi.fury.io/lance-format/-/ver_UmCSh/pylance-8.0.0b12-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:5b26e12b45eb935cee597ac5cac73ae84a4004599fb4fa0dbccaaa59f6d04920" }, + { url = "https://pypi.fury.io/lance-format/-/ver_2jUi0O/pylance-8.0.0b12-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fbeb3c08c799b172cd0b99821ce7e80f33f42f3ae116a6def3e3ea53153ec507" }, + { url = "https://pypi.fury.io/lance-format/-/ver_1ME7R6/pylance-8.0.0b12-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b00af33dd20807a17da1dc59cae6d19332cbe24a3c9962d2ab9d5f5190a8f79d" }, + { url = "https://pypi.fury.io/lance-format/-/ver_w0g3r/pylance-8.0.0b12-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:dd6a6141a44c5187df530d57c92c92c45d9c53be530053ee27ff42a1facacca4" }, + { url = "https://pypi.fury.io/lance-format/-/ver_1DVG4V/pylance-8.0.0b12-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:d62050d32dc819870b927bca8f1538967722444173adc55f7531cf728077bced" }, + { url = "https://pypi.fury.io/lance-format/-/ver_14twi9/pylance-8.0.0b12-cp39-abi3-win_amd64.whl", hash = "sha256:1ffeaf981497661eaf97b22a5b22a1269bdae5732fe0cfea58d7ff58efe18297" }, ] [[package]]