Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions packages/gooddata-eval/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ gd-eval = "gooddata_eval.cli.main:main"
Source = "https://github.com/gooddata/gooddata-python-sdk"

[dependency-groups]
dev = [
"pytest>=8.3.5",
]
test = [
"pytest~=8.3.4",
"pytest-cov~=6.0.0",
Expand Down
236 changes: 236 additions & 0 deletions packages/gooddata-eval/src/gooddata_eval/cli/agentic_runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
# (C) 2026 GoodData Corporation. All rights reserved.
"""Agentic evaluation runner for gd-eval CLI — handles multi-turn agentic test kinds."""

from __future__ import annotations

import time
from typing import Any, TypedDict

from gooddata_eval.core.agentic._langfuse import HttpxLangfuseClient, make_langfuse_client
from gooddata_eval.core.agentic.alert_skill import evaluate_agentic_alert_skill
from gooddata_eval.core.agentic.conversation import ConversationFixture, evaluate_agentic_conversation
from gooddata_eval.core.agentic.general_question import evaluate_agentic_general_question
from gooddata_eval.core.agentic.guardrail import evaluate_agentic_guardrail
from gooddata_eval.core.agentic.metric_skill import evaluate_agentic_metric_skill
from gooddata_eval.core.agentic.search_tool import evaluate_agentic_search_tool
from gooddata_eval.core.agentic.visualization import evaluate_agentic_visualization
from gooddata_eval.core.models import CreatedVisualization, DatasetItem
from gooddata_eval.core.runner import EvalReport, ItemReport

_LfKw = TypedDict(
"_LfKw",
{
"langfuse": Any,
"dataset_item_id": str,
"dataset_name": str,
"run_timestamp": str,
"model_version_override": str | None,
},
total=False,
)

# Test kinds routed to the multi-turn agentic evaluators in this module.
AGENTIC_TEST_KINDS = frozenset(
    (
        # Visualization kinds:
        #   vis_agentic           - production: expected_output.visualization
        #                           (single/multi CreatedVisualization)
        #   agentic_visualization - experimental: expected_output.expected_outputs
        #                           (multi-candidate)
        "vis_agentic",
        "agentic_visualization",
        # Skill / tool kinds.
        "agentic_metric_skill",
        "agentic_alert_skill",
        "agentic_search",
        # Free-text answer kinds.
        "agentic_general_question",
        "agentic_guardrail",
        # Fixture-driven conversation kind.
        "agentic_conversation",
    )
)


def _parse_visualization_expected(expected_output: Any) -> list[CreatedVisualization]:
"""Parse expected_output into a list of CreatedVisualization candidates.

Accepts:
{"expected_outputs": [{"visualization": {...}}, ...]} <- agentic fixture format
{"visualization": {...}} or {"visualization": [{...}]} <- single/multi candidate
[{"visualization": {...}}, ...] <- bare list
"""
if isinstance(expected_output, dict):
raw_list = expected_output.get("expected_outputs")
if raw_list is not None:
return [
CreatedVisualization.model_validate(v.get("visualization", v) if isinstance(v, dict) else v)
for v in raw_list
]
raw_viz = expected_output.get("visualization")
if raw_viz is not None:
if isinstance(raw_viz, list):
return [CreatedVisualization.model_validate(v) for v in raw_viz]
return [CreatedVisualization.model_validate(raw_viz)]
if isinstance(expected_output, list):
return [
CreatedVisualization.model_validate(v.get("visualization", v) if isinstance(v, dict) else v)
for v in expected_output
]
raise ValueError(
f"Cannot parse agentic_visualization expected_output: {type(expected_output).__name__}. "
'Expected {"expected_outputs": [...]} or {"visualization": {...}}.'
)


def _dispatch_agentic(
item: DatasetItem,
host: str,
token: str,
workspace_id: str,
k: int,
langfuse: Any,
run_ts: str,
model_version_override: str | None,
) -> None:
"""Call the appropriate evaluate_agentic_* function for the item's test_kind."""
kind = item.test_kind
eo = item.expected_output
lf_kw: _LfKw = {
"langfuse": langfuse,
"dataset_item_id": item.id,
"dataset_name": item.dataset_name,
"run_timestamp": run_ts,
"model_version_override": model_version_override,
}

if kind in ("vis_agentic", "agentic_visualization"):
evaluate_agentic_visualization(
host=host,
token=token,
workspace_id=workspace_id,
question=item.question,
expected_outputs=_parse_visualization_expected(eo),
k=k,
**lf_kw,
)
elif kind == "agentic_metric_skill":
evaluate_agentic_metric_skill(
host=host,
token=token,
workspace_id=workspace_id,
question=item.question,
expected_output=eo if isinstance(eo, dict) else {},
k=k,
**lf_kw,
)
elif kind == "agentic_alert_skill":
evaluate_agentic_alert_skill(
host=host,
token=token,
workspace_id=workspace_id,
question=item.question,
expected_output=eo if isinstance(eo, dict) else {},
k=k,
**lf_kw,
)
elif kind == "agentic_search":
eo_dict = eo if isinstance(eo, dict) else {}
tool_call = eo_dict.get("tool_call", {})
expected_args = tool_call.get("function_arguments", eo_dict)
evaluate_agentic_search_tool(
host=host,
token=token,
workspace_id=workspace_id,
question=item.question,
expected_tool_call=expected_args,
k=k,
**lf_kw,
)
elif kind == "agentic_general_question":
evaluate_agentic_general_question(
host=host,
token=token,
workspace_id=workspace_id,
question=item.question,
expected_output=eo if isinstance(eo, str) else str(eo),
k=k,
**lf_kw,
)
elif kind == "agentic_guardrail":
evaluate_agentic_guardrail(
host=host,
token=token,
workspace_id=workspace_id,
question=item.question,
expected_output=eo if isinstance(eo, str) else str(eo),
k=k,
**lf_kw,
)
elif kind == "agentic_conversation":
fixture_data = eo.get("fixture") or eo if isinstance(eo, dict) else {}
evaluate_agentic_conversation(
host=host,
token=token,
workspace_id=workspace_id,
fixture=ConversationFixture.model_validate(fixture_data),
**lf_kw,
)
else:
raise ValueError(f"Unknown agentic test kind: {kind!r}")


def run_agentic_items(
    items: list[DatasetItem],
    host: str,
    token: str,
    workspace_id: str,
    *,
    k: int = 2,
    model_version: str | None = None,
    use_langfuse: bool = False,
    run_ts: str,
    on_item_start: Any = None,
    on_item_done: Any = None,
) -> EvalReport:
    """Run agentic items through evaluate_agentic_* and return an EvalReport.

    Each item is dispatched by its ``test_kind``. The outcome is recorded on a
    fresh ``ItemReport``:

    * evaluator returns normally -> ``pass_at_k = True``, ``runs = k``
    * evaluator raises ``AssertionError`` -> ``pass_at_k = False``, ``runs = k``,
      and a ``[agentic] <id> FAIL`` line is printed
    * any other exception -> ``error`` is set to ``"<Type>: <message>"`` and
      ``runs = 0``

    Args:
        items: Dataset items to evaluate, in order.
        host: Forwarded to the evaluators.
        token: Forwarded to the evaluators.
        workspace_id: Forwarded to the evaluators.
        k: Forwarded to the evaluators and recorded as ``runs`` on the item report.
        model_version: Stored as the report's ``model`` and forwarded to the
            evaluators as the model version override.
        use_langfuse: When true, a Langfuse client is created via
            ``make_langfuse_client()`` and passed to every evaluator.
        run_ts: Run timestamp forwarded to the evaluators.
        on_item_start: Optional callback ``(index, total, item)`` invoked
            before each item; exceptions it raises are ignored.
        on_item_done: Optional callback ``(index, total, item_report)`` invoked
            after each item; exceptions it raises are ignored.

    Returns:
        An ``EvalReport`` with one ``ItemReport`` per input item and
        ``wall_clock_s`` set to the elapsed time of the whole run.
    """
    langfuse = make_langfuse_client() if use_langfuse else None

    report = EvalReport(model=model_version)
    total = len(items)
    run_started = time.perf_counter()

    try:
        for index, item in enumerate(items, start=1):
            if on_item_start is not None:
                try:
                    on_item_start(index, total, item)
                except Exception:
                    # Progress callbacks are best-effort; never fail the run over them.
                    pass

            item_report = ItemReport(
                id=item.id,
                dataset_name=item.dataset_name,
                test_kind=item.test_kind,
                question=item.question,
            )
            t0 = time.perf_counter()
            try:
                _dispatch_agentic(item, host, token, workspace_id, k, langfuse, run_ts, model_version)
                item_report.pass_at_k = True
                item_report.runs = k
            except AssertionError as exc:
                # An AssertionError from the evaluator is recorded as a failed item.
                item_report.pass_at_k = False
                item_report.runs = k
                print(f"[agentic] {item.id} FAIL: {exc}", flush=True)
            except Exception as exc:
                # Anything else is an operational error for this item only.
                item_report.error = f"{type(exc).__name__}: {exc}"
                item_report.runs = 0
            finally:
                item_report.latency_s = time.perf_counter() - t0

            if on_item_done is not None:
                try:
                    on_item_done(index, total, item_report)
                except Exception:
                    # Same best-effort policy as on_item_start.
                    pass

            report.items.append(item_report)
    finally:
        # Record total elapsed time so callers that sum wall_clock_s across
        # reports account for the agentic portion of the run.
        report.wall_clock_s = time.perf_counter() - run_started

        if langfuse is not None:
            # Shut the client down even if the loop exited abnormally, and
            # still attempt close() when flush() fails.
            for method_name in ("flush", "close"):
                try:
                    getattr(langfuse, method_name)()
                except Exception:
                    pass

    return report
52 changes: 49 additions & 3 deletions packages/gooddata-eval/src/gooddata_eval/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from gooddata_eval.core.models import ChatResult, DatasetItem
from gooddata_eval.core.reporting.console import render_comparison, render_console
from gooddata_eval.core.reporting.json_report import write_multi_model_report
from gooddata_eval.cli.agentic_runner import AGENTIC_TEST_KINDS, run_agentic_items
from gooddata_eval.core.runner import ItemReport, run_items
from gooddata_eval.core.summary.http_client import SummaryClient
from gooddata_eval.core.workspace import ModelResolutionError, WorkspaceModelController
Expand Down Expand Up @@ -62,6 +63,17 @@ def _build_parser() -> argparse.ArgumentParser:
source = run.add_mutually_exclusive_group(required=True)
source.add_argument("--dataset", help="Path to a folder of dataset JSON files.")
source.add_argument("--langfuse-dataset", dest="langfuse_dataset", help="Langfuse dataset name.")
run.add_argument(
"--kind",
dest="kind",
default="visualization",
metavar="TEST_KIND",
help=(
"Default test kind for dataset items that don't embed one. "
"Use 'vis_agentic', 'agentic_visualization', 'agentic_metric_skill', etc. for multi-turn agentic eval. "
"(default: visualization)"
),
)
run.add_argument(
"--model",
action="append",
Expand Down Expand Up @@ -165,7 +177,7 @@ def _load_dataset(config: RunConfig):

if config.langfuse_dataset is None: # pragma: no cover - argparse mutually-exclusive group guarantees one is set
raise ValueError("Either --dataset or --langfuse-dataset is required.")
return load_langfuse_dataset(config.langfuse_dataset)
return load_langfuse_dataset(config.langfuse_dataset, default_test_kind=config.kind)


def _list_models(host: str, token: str, workspace_id: str | None) -> int:
Expand Down Expand Up @@ -228,6 +240,8 @@ def _run(config: RunConfig) -> int:
return _EXIT_OPERATIONAL_ERROR

items = _load_dataset(config)
agentic_items = [i for i in items if i.test_kind in AGENTIC_TEST_KINDS]
non_agentic_items = [i for i in items if i.test_kind not in AGENTIC_TEST_KINDS]
models = config.models or []
run_ts = datetime.now(timezone.utc).strftime("%Y-%m-%d-%H-%M")
n_models = len(models) if models else 1
Expand Down Expand Up @@ -287,13 +301,30 @@ def on_langfuse_item_done(
) -> None:
_sink.log_item(report, dataset_item_id=report.id)

# --- agentic items (multi-turn, use evaluate_agentic_*) ---
agentic_report = None
if agentic_items:
agentic_report = run_agentic_items(
agentic_items,
host=config.host,
token=config.token,
workspace_id=config.workspace_id,
k=config.runs,
model_version=resolved.model_id,
use_langfuse=config.log_to_langfuse,
run_ts=run_ts,
on_item_start=on_item_start,
on_item_done=on_item_done,
)

# --- non-agentic items (single-turn, use Evaluator) ---
backend = _RoutingBackend(
ChatClient(host=config.host, token=config.token, workspace_id=config.workspace_id),
SummaryClient(host=config.host, token=config.token, workspace_id=config.workspace_id),
)
try:
report = run_items(
items,
single_report = run_items(
non_agentic_items,
backend,
runs=config.runs,
model=resolved.model_id,
Expand All @@ -310,6 +341,20 @@ def on_langfuse_item_done(
if hasattr(backend, "close"):
backend.close()

# merge into a single report for display/export
from gooddata_eval.core.runner import EvalReport # noqa: PLC0415

report = EvalReport(
model=resolved.model_id,
provider_name=resolved.provider_name or resolved.provider_id,
provider_type=resolved.provider_type,
workspace_id=config.workspace_id,
)
if agentic_report is not None:
report.items.extend(agentic_report.items)
report.items.extend(single_report.items)
report.wall_clock_s = (agentic_report.wall_clock_s if agentic_report else 0.0) + single_report.wall_clock_s

skipped_kinds = sorted({i.test_kind for i in report.items if i.skipped})
if skipped_kinds:
print(
Expand Down Expand Up @@ -363,6 +408,7 @@ def main(argv: list[str] | None = None) -> int:
json_path=Path(args.json_path) if args.json_path else None,
log_to_langfuse=args.langfuse,
quiet=args.quiet,
kind=args.kind,
)
return _run(config)
except (
Expand Down
Loading
Loading