From cbb8029c0e70b7df9388da0fa58f7dbf4115f783 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Wed, 25 Mar 2026 16:28:04 +0900 Subject: [PATCH 01/94] feat: enhance deployment configuration and debugging features - Updated the `Readme.md` to reflect changes in the build process and worker scaling options. - Introduced a new environment variable `EVAL_COMPOSE_SCALE_WORKER` to set the default number of worker replicas. - Modified deployment scripts to utilize the new worker scaling configuration, ensuring better management of resources. - Enhanced the `docker-compose.yml` to support multiple Streamlit instances and improved Nginx configuration for load balancing. - Added new helper functions for rendering live container status in the Deployment Debug page, improving operational visibility. Signed-off-by: lei.gu --- evaluation_dashboard_app/Readme.md | 6 +- evaluation_dashboard_app/deploy/.env | 3 +- evaluation_dashboard_app/deploy/.env.example | 3 + evaluation_dashboard_app/deploy/04_START.sh | 12 +- .../deploy/08_REBUILD_AND_START.sh | 2 +- .../deploy/docker-compose.yml | 100 ++++++----- .../deploy/nginx/nginx.conf | 10 +- .../docs/PRODUCTION_DEPLOYMENT.md | 26 ++- evaluation_dashboard_app/lib/deploy_debug.py | 15 +- .../lib/docker_live_structure.py | 161 ++++++++++++++++++ .../lib/mermaid_render.py | 30 ++++ evaluation_dashboard_app/pages/10_Help.py | 30 +--- .../pages/99_Deployment_Debug.py | 66 ++++--- 13 files changed, 341 insertions(+), 123 deletions(-) create mode 100644 evaluation_dashboard_app/lib/docker_live_structure.py create mode 100644 evaluation_dashboard_app/lib/mermaid_render.py diff --git a/evaluation_dashboard_app/Readme.md b/evaluation_dashboard_app/Readme.md index add4149..8dabed9 100644 --- a/evaluation_dashboard_app/Readme.md +++ b/evaluation_dashboard_app/Readme.md @@ -295,7 +295,7 @@ flowchart LR W2 --> DataRoot ``` -- **ビルド**: 上記「ビルド手順」のとおり `evaluation_dashboard_app/` で `docker build ...
-t evaluation-dashboard .`(compose の `streamlit` / `worker` はこのイメージを参照します)。 +- **ビルド**: 上記「ビルド手順」のとおり `evaluation_dashboard_app/` で `docker build ... -t evaluation-dashboard .`(compose の `streamlit1` / `streamlit2` / `worker` はこのイメージを参照します)。 - **推奨フロー(`deploy/` の番号付きスクリプト)**: `deploy/` に移動して順に実行します(すべて `docker compose --env-file .env` を使います)。 | スクリプト | 内容 | @@ -303,11 +303,11 @@ flowchart LR | `01_SETUP_ENV.sh` | `.env` が無ければ `.env.example` から作成(**編集は手動**) | | `02_BUILD.sh` | イメージビルド(引数で `--no-cache` など可) | | `03_INIT_DB.sh` | **初回のみ**: Postgres 起動後に `init_db` でタスク用テーブル作成 | - | `04_START.sh` | スタック起動(例: `./04_START.sh --scale worker=3`) | + | `04_START.sh` | スタック起動(デフォルト worker 数は `.env` の `EVAL_COMPOSE_SCALE_WORKER`、例: `./04_START.sh --scale worker=3` で上書き可) | | `05_STOP.sh` | 停止 | | `06_STATUS.sh` | 状態確認 | | `07_LOGS.sh` | `docker compose logs -f`(省略時は全サービス、例: `./07_LOGS.sh worker`) | - | `08_REBUILD_AND_START.sh` | ビルド後に `up -d` | + | `08_REBUILD_AND_START.sh` | ビルド後に `04_START.sh` と同じ起動(worker 既定本数あり) | | `09_RESTART_WORKER.sh` | ワーカー再起動(コード変更を worker に反映) | - **手動でも同じことは可能**: `cd deploy && cp .env.example .env` → `.env` を編集 → `docker compose --env-file .env up -d`。初回のみ `docker compose --env-file .env run --rm init_db`(`03_INIT_DB.sh` と同等)。 diff --git a/evaluation_dashboard_app/deploy/.env b/evaluation_dashboard_app/deploy/.env index 1bfde14..1e13b72 100644 --- a/evaluation_dashboard_app/deploy/.env +++ b/evaluation_dashboard_app/deploy/.env @@ -26,4 +26,5 @@ RQ_QUEUE=default # NGINX_HTTPS=1 EVAL_DEPLOYMENT_DEBUG_COMPOSE_PROJECT=deploy -EVAL_DEPLOYMENT_DEBUG_EXEC=1 \ No newline at end of file +EVAL_DEPLOYMENT_DEBUG_EXEC=1 +EVAL_COMPOSE_SCALE_WORKER=3 \ No newline at end of file diff --git a/evaluation_dashboard_app/deploy/.env.example b/evaluation_dashboard_app/deploy/.env.example index 1add32b..907a157 100644 --- a/evaluation_dashboard_app/deploy/.env.example +++ b/evaluation_dashboard_app/deploy/.env.example @@ -17,6 +17,9 @@ 
DATABASE_URL=postgresql://eval_user:eval_pass@postgres:5432/eval_dashboard REDIS_URL=redis://redis:6379/0 RQ_QUEUE=default +# Docker Compose: default worker replica count (04_START.sh / 08_REBUILD_AND_START.sh). Streamlit runs as streamlit1 + streamlit2 (see docker-compose.yml + nginx). +EVAL_COMPOSE_SCALE_WORKER=2 + # RQ: max job runtime before the worker kills the job (seconds). Default 7 days if unset. # RQ_JOB_TIMEOUT_SEC=604800 # Optional: longer timeout for build_parquet only (defaults to RQ_JOB_TIMEOUT_SEC if unset) diff --git a/evaluation_dashboard_app/deploy/04_START.sh b/evaluation_dashboard_app/deploy/04_START.sh index e087e35..3281a86 100755 --- a/evaluation_dashboard_app/deploy/04_START.sh +++ b/evaluation_dashboard_app/deploy/04_START.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash # 04 — Start the full stack, or if it is already running: up -d (apply compose/scale) then restart all services. -# Extra args: e.g. ./04_START.sh --scale worker=3 +# Default: 2 worker replicas (EVAL_COMPOSE_SCALE_WORKER in .env). Override: ./04_START.sh --scale worker=1 (last --scale wins). set -euo pipefail DEPLOY_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" cd "$DEPLOY_DIR" @@ -9,12 +9,18 @@ if [[ ! -f .env ]]; then exit 1 fi +set -a +# shellcheck disable=SC1091 +source .env +set +a +WORKER_SCALE="${EVAL_COMPOSE_SCALE_WORKER:-2}" + dc() { docker compose --env-file .env "$@"; } if [[ -n "$(dc ps -q --status running 2>/dev/null || true)" ]]; then echo "Stack already running — updating with up -d, then restarting all services." - dc up -d "$@" + dc up -d --scale "worker=${WORKER_SCALE}" "$@" dc restart else - dc up -d "$@" + dc up -d --scale "worker=${WORKER_SCALE}" "$@" fi diff --git a/evaluation_dashboard_app/deploy/08_REBUILD_AND_START.sh b/evaluation_dashboard_app/deploy/08_REBUILD_AND_START.sh index b216686..2763b55 100755 --- a/evaluation_dashboard_app/deploy/08_REBUILD_AND_START.sh +++ b/evaluation_dashboard_app/deploy/08_REBUILD_AND_START.sh @@ -8,4 +8,4 @@ if [[ ! 
-f .env ]]; then exit 1 fi docker compose --env-file .env build "$@" -docker compose --env-file .env up -d +exec "$DEPLOY_DIR/04_START.sh" diff --git a/evaluation_dashboard_app/deploy/docker-compose.yml b/evaluation_dashboard_app/deploy/docker-compose.yml index 8fd4b87..aed2365 100644 --- a/evaluation_dashboard_app/deploy/docker-compose.yml +++ b/evaluation_dashboard_app/deploy/docker-compose.yml @@ -1,8 +1,9 @@ -# Production-style stack: Nginx -> Streamlit, Redis, Worker, Postgres. +# Production-style stack: Nginx -> Streamlit (x2) + Workers (default scale via EVAL_COMPOSE_SCALE_WORKER in .env / 04_START.sh). # Helper scripts (run from deploy/): 01_SETUP_ENV.sh 02_BUILD.sh 03_INIT_DB.sh 04_START.sh 05_STOP.sh # 06_STATUS.sh 07_LOGS.sh 08_REBUILD_AND_START.sh 09_RESTART_WORKER.sh # Run from deploy/: docker compose --env-file .env up -d -# Scale workers: docker-compose up -d --scale worker=3 (default 1 worker) +# Plain `up -d` uses one worker unless you pass --scale worker=N; 04_START.sh defaults to EVAL_COMPOSE_SCALE_WORKER (2). +# More Streamlit boxes: duplicate x-streamlit-app block as streamlit3, add server to nginx upstream. # Build image from repo root: docker build -t evaluation-dashboard . (see Readme) # # Data is bind-mounted to the host so you can access it directly: @@ -11,7 +12,7 @@ # - ${HOME}/.webauto -> Download/Scenario API credentials (streamlit + worker) # # App source is mounted so you can edit Python code without rebuilding the image. -# Streamlit will reload on file changes. Restart the worker to pick up changes: docker compose restart worker +# Streamlit will reload on file changes. Restart workers: docker compose restart worker # # Deployment debug (pages/99_Deployment_Debug.py; nav hidden outside Docker via CSS; sidebar link on Overview in Docker): Streamlit mounts the host # Docker socket and sets EVAL_DEPLOYMENT_DEBUG_DOCKER=1. Anyone who can use the dashboard @@ -19,6 +20,48 @@ # networks. 
Set EVAL_DEPLOYMENT_DEBUG_COMPOSE_PROJECT in .env (see .env.example). # EVAL_DEPLOYMENT_DEBUG_EXEC=1 in .env enables one-shot shell (docker exec) from the UI. +x-streamlit-app: &streamlit-app + build: + context: .. + dockerfile: Dockerfile + secrets: + - ssh + image: evaluation-dashboard + command: ["/app/docker-entrypoint.sh"] + environment: + - TZ=Asia/Tokyo + - EVAL_DASHBOARD_DATA_ROOT=${EVAL_DASHBOARD_DATA_ROOT:-/app/data} + - EVAL_DASHBOARD_CONFIG=/app/docker_config/autoware_evaluator_dl_config.json + - USE_TASK_QUEUE=${USE_TASK_QUEUE:-true} + - DATABASE_URL=${DATABASE_URL} + - REDIS_URL=${REDIS_URL:-redis://redis:6379/0} + - RQ_QUEUE=${RQ_QUEUE:-default} + - EVAL_DEPLOYMENT_DEBUG_DOCKER=1 + - EVAL_DEPLOYMENT_DEBUG_COMPOSE_PROJECT=${EVAL_DEPLOYMENT_DEBUG_COMPOSE_PROJECT:-} + # One-shot shell in selected container (docker exec). Default off; set to 1 in .env only when needed. + - EVAL_DEPLOYMENT_DEBUG_EXEC=${EVAL_DEPLOYMENT_DEBUG_EXEC:-0} + volumes: + - ../data:/app/data + - ${HOME}/.webauto:/root/.webauto + # Docker-only config (separate from your local configs) + - ./configs:/app/docker_config + # Mount app source so code changes apply without rebuild (Streamlit auto-reloads) + - ../Overview.py:/app/Overview.py + - ../pages:/app/pages + - ../Readme.md:/app/Readme.md + - ../lib:/app/lib + - ../worker:/app/worker + - ../configs:/app/configs + - /var/run/docker.sock:/var/run/docker.sock + env_file: + - .env + depends_on: + redis: + condition: service_started + postgres: + condition: service_healthy + restart: unless-stopped + services: nginx: image: nginx:alpine @@ -27,50 +70,15 @@ services: volumes: - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro depends_on: - - streamlit + - streamlit1 + - streamlit2 restart: unless-stopped - streamlit: - build: - context: .. 
- dockerfile: Dockerfile - secrets: - - ssh - image: evaluation-dashboard - command: ["/app/docker-entrypoint.sh"] - environment: - - TZ=Asia/Tokyo - - EVAL_DASHBOARD_DATA_ROOT=${EVAL_DASHBOARD_DATA_ROOT:-/app/data} - - EVAL_DASHBOARD_CONFIG=/app/docker_config/autoware_evaluator_dl_config.json - - USE_TASK_QUEUE=${USE_TASK_QUEUE:-true} - - DATABASE_URL=${DATABASE_URL} - - REDIS_URL=${REDIS_URL:-redis://redis:6379/0} - - RQ_QUEUE=${RQ_QUEUE:-default} - - EVAL_DEPLOYMENT_DEBUG_DOCKER=1 - - EVAL_DEPLOYMENT_DEBUG_COMPOSE_PROJECT=${EVAL_DEPLOYMENT_DEBUG_COMPOSE_PROJECT:-} - # One-shot shell in selected container (docker exec). Default off; set to 1 in .env only when needed. - - EVAL_DEPLOYMENT_DEBUG_EXEC=${EVAL_DEPLOYMENT_DEBUG_EXEC:-0} - volumes: - - ../data:/app/data - - ${HOME}/.webauto:/root/.webauto - # Docker-only config (separate from your local configs) - - ./configs:/app/docker_config - # Mount app source so code changes apply without rebuild (Streamlit auto-reloads) - - ../Overview.py:/app/Overview.py - - ../pages:/app/pages - - ../Readme.md:/app/Readme.md - - ../lib:/app/lib - - ../worker:/app/worker - - ../configs:/app/configs - - /var/run/docker.sock:/var/run/docker.sock - env_file: - - .env - depends_on: - redis: - condition: service_started - postgres: - condition: service_healthy - restart: unless-stopped + streamlit1: + <<: *streamlit-app + + streamlit2: + <<: *streamlit-app redis: image: redis:7-alpine @@ -144,4 +152,4 @@ services: secrets: ssh: - file: ${HOME}/.ssh/id_rsa \ No newline at end of file + file: ${HOME}/.ssh/id_rsa diff --git a/evaluation_dashboard_app/deploy/nginx/nginx.conf b/evaluation_dashboard_app/deploy/nginx/nginx.conf index a4766dd..ee8278c 100644 --- a/evaluation_dashboard_app/deploy/nginx/nginx.conf +++ b/evaluation_dashboard_app/deploy/nginx/nginx.conf @@ -1,16 +1,14 @@ # Nginx: reverse proxy to Streamlit with WebSocket support. -# For multiple Streamlit replicas, add more "server streamlit:8501" lines in upstream. 
- +# Multiple Streamlit replicas: ip_hash keeps a client on one backend (Streamlit sessions are not shared). events { worker_connections 1024; } http { upstream streamlit { - server streamlit:8501; - # Add more servers for load balancing: - # server streamlit2:8501; - # server streamlit3:8501; + ip_hash; + server streamlit1:8501; + server streamlit2:8501; } server { diff --git a/evaluation_dashboard_app/docs/PRODUCTION_DEPLOYMENT.md b/evaluation_dashboard_app/docs/PRODUCTION_DEPLOYMENT.md index 3e84af6..4bc22fc 100644 --- a/evaluation_dashboard_app/docs/PRODUCTION_DEPLOYMENT.md +++ b/evaluation_dashboard_app/docs/PRODUCTION_DEPLOYMENT.md @@ -72,6 +72,7 @@ Heavy operations (download results, download scenarios, run eval_result, generat | `EVAL_DEPLOYMENT_DEBUG_DOCKER` | Set to `1` in [`deploy/docker-compose.yml`](deploy/docker-compose.yml) for Streamlit; enables the **Docker** tab when the host socket is mounted. Override in `.env` only if you change compose. | `1` in compose | | `EVAL_DEPLOYMENT_DEBUG_COMPOSE_PROJECT` | Compose project name (`docker compose ls`) to filter containers by `com.docker.compose.project`. Strongly recommended when the host runs other stacks. | (empty) | | `EVAL_DEPLOYMENT_DEBUG_EXEC` | When `1`/`true`, the Deployment debug **Docker** tab shows **Run command** (`sh -c` via `docker exec`). Default `0` in compose — enable in `.env` only briefly on trusted networks. | `0` | +| `EVAL_COMPOSE_SCALE_WORKER` | Default number of `worker` replicas when using [`deploy/04_START.sh`](deploy/04_START.sh) / [`08_REBUILD_AND_START.sh`](deploy/08_REBUILD_AND_START.sh). | `2` | ## Build @@ -113,26 +114,23 @@ docker compose build --no-cache docker compose up -d ``` - To run multiple workers, use `--scale worker=N` (e.g. 3 workers): + The stack defaults to **two Streamlit** containers (`streamlit1`, `streamlit2`) behind Nginx and **two workers** (`EVAL_COMPOSE_SCALE_WORKER=2` in `.env`, applied by [`04_START.sh`](deploy/04_START.sh)). 
Override worker count with `--scale worker=N` (last flag wins) or change `EVAL_COMPOSE_SCALE_WORKER`. ```sh - docker-compose up -d --scale worker=3 + docker compose up -d --scale worker=3 ``` - Default is one worker. All worker replicas share the same RQ queue. + All worker replicas share the same RQ queue. 4. **Access the app** - Via Nginx: **http://localhost** (port 80) - - Streamlit directly (if you expose it): port 8501 on the `streamlit` service (not exposed by default when using Nginx) + - Streamlit directly (if you expose ports in compose): 8501 on `streamlit1` / `streamlit2` (not exposed by default when using Nginx) ## Scaling -- **Workers**: Use Docker Compose `--scale` to run more worker containers. From the `deploy/` directory: - - **Default (1 worker):** `docker-compose up -d` - - **N workers:** `docker-compose up -d --scale worker=N` - Example: `docker-compose up -d --scale worker=3` runs three workers; all consume from the same RQ queue. -- **Streamlit replicas**: In `deploy/docker-compose.yml`, duplicate the `streamlit` service (e.g. `streamlit2`) and add `server streamlit2:8501;` to `deploy/nginx/nginx.conf` in the `upstream streamlit` block. +- **Workers**: Default replica count is `EVAL_COMPOSE_SCALE_WORKER` (see `.env.example`; [`04_START.sh`](deploy/04_START.sh) passes `--scale worker=…`). From the `deploy/` directory you can also run `docker compose up -d --scale worker=N` (e.g. three workers); all consume from the same RQ queue. +- **Streamlit replicas**: By default, `streamlit1` and `streamlit2` share one Nginx `upstream` with `ip_hash` for session stickiness. To add more, define another service from the `x-streamlit-app` anchor (e.g. `streamlit3` with `<<: *streamlit-app`) in [`deploy/docker-compose.yml`](deploy/docker-compose.yml), add it to the `nginx` service's `depends_on`, and add `server streamlit3:8501;` (etc.) in [`deploy/nginx/nginx.conf`](deploy/nginx/nginx.conf). ## TLS (HTTPS) @@ -151,18 +149,18 @@ To serve over HTTPS, configure Nginx with SSL certificates (e.g.
Let's Encrypt) | "Failed to enqueue task" | `REDIS_URL` and `DATABASE_URL` are set; Redis and Postgres containers are running; `USE_TASK_QUEUE=true`. | | Tasks stay "pending" | Worker container is running; same `REDIS_URL` and `RQ_QUEUE` as Streamlit; worker logs for errors. | | Postgres connection refused | Postgres is healthy (`docker-compose ps`); `DATABASE_URL` uses hostname `postgres` and correct port (5432). | -| Nginx 502 Bad Gateway | Streamlit container is up and listening on 8501; Nginx `upstream` points to `streamlit:8501`. | +| Nginx 502 Bad Gateway | Streamlit containers are up and listening on 8501; Nginx `upstream` lists `streamlit1:8501` and `streamlit2:8501`. | ## Deployment debug page (Docker socket) The Streamlit page **Deployment debug** (`pages/99_Deployment_Debug.py` — required at top level so `st.page_link` works; default sidebar entry is hidden outside Docker via CSS; **Overview** adds a sidebar link when running in Docker) shows redacted environment variables, Postgres/Redis/RQ checks, task counts, and Docker container status and log tails. -- [`deploy/docker-compose.yml`](deploy/docker-compose.yml) mounts `/var/run/docker.sock` into the `streamlit` service and sets `EVAL_DEPLOYMENT_DEBUG_DOCKER=1`. After `docker compose up -d`, restart or recreate Streamlit if you change compose or env. +- [`deploy/docker-compose.yml`](deploy/docker-compose.yml) mounts `/var/run/docker.sock` into each Streamlit service (`streamlit1`, `streamlit2`) and sets `EVAL_DEPLOYMENT_DEBUG_DOCKER=1`. After `docker compose up -d`, restart or recreate those services if you change compose or env. - Set `EVAL_DEPLOYMENT_DEBUG_COMPOSE_PROJECT` in `.env` to your Compose project name (from `docker compose ls`) so the UI lists only this stack’s containers. If it is unset, the page lists every container visible to the daemon and shows a warning. -- Rebuild the image after adding the `docker` PyPI package to `requirements-docker.txt` (or `docker compose build streamlit`). 
+- Rebuild the image after adding the `docker` PyPI package to `requirements-docker.txt` (or `docker compose build streamlit1`). - **Exec**: set `EVAL_DEPLOYMENT_DEBUG_EXEC=1` in `.env` and recreate Streamlit to enable one-shot `sh -c` commands in the selected container (same power as `docker exec`). Leave at `0` when you only need logs. -**Risk**: any user who can open the app with socket access can read logs for containers matched by the filter. With `EVAL_DEPLOYMENT_DEBUG_EXEC=1`, they can also run shell commands inside those containers. Restrict access with VPN, SSO/auth proxy, or remove the socket mount and debug env from the `streamlit` service in compose if that risk is unacceptable. +**Risk**: any user who can open the app with socket access can read logs for containers matched by the filter. With `EVAL_DEPLOYMENT_DEBUG_EXEC=1`, they can also run shell commands inside those containers. Restrict access with VPN, SSO/auth proxy, or remove the socket mount and debug env from the Streamlit services in compose if that risk is unacceptable. ## Data on the host (bind mounts) @@ -198,7 +196,7 @@ Rebuild the image only when you change dependencies (e.g. 
`requirements-docker.t ``` deploy/ - docker-compose.yml # full stack; streamlit includes Docker socket for Deployment debug + docker-compose.yml # full stack; streamlit1/streamlit2 + Docker socket for Deployment debug .env.example nginx/ nginx.conf diff --git a/evaluation_dashboard_app/lib/deploy_debug.py b/evaluation_dashboard_app/lib/deploy_debug.py index 0edeb76..1336a61 100644 --- a/evaluation_dashboard_app/lib/deploy_debug.py +++ b/evaluation_dashboard_app/lib/deploy_debug.py @@ -223,12 +223,25 @@ def list_containers_for_debug(client) -> Tuple[List[Dict[str, str]], Optional[st rows: List[Dict[str, str]] = [] for c in containers: cid = c.id or "" + attrs = getattr(c, "attrs", None) or {} + state = attrs.get("State") or {} + state_status = (state.get("Status") or getattr(c, "status", "") or "").strip() + health_obj = state.get("Health") or {} + health_s = (health_obj.get("Status") or "").strip() + labels = (attrs.get("Config") or {}).get("Labels") or {} + if not isinstance(labels, dict): + labels = {} + compose_service = (labels.get("com.docker.compose.service") or "").strip() + compose_project = (labels.get("com.docker.compose.project") or "").strip() rows.append( { "id": cid[:12] if len(cid) >= 12 else cid, "full_id": cid, "name": (c.name or "").lstrip("/"), - "status": getattr(c, "status", "") or "", + "state": state_status or "unknown", + "health": health_s if health_s else "—", + "compose_service": compose_service or "—", + "compose_project": compose_project or "—", "image": c.image.tags[0] if c.image and c.image.tags else (c.image.id[:12] if c.image else ""), } ) diff --git a/evaluation_dashboard_app/lib/docker_live_structure.py b/evaluation_dashboard_app/lib/docker_live_structure.py new file mode 100644 index 0000000..55af8de --- /dev/null +++ b/evaluation_dashboard_app/lib/docker_live_structure.py @@ -0,0 +1,161 @@ +""" +Mermaid source for the Deployment debug Docker tab: same subgraph layout as Readme.md (Help). 
+ +Clients → Edge → App Tier → Infrastructure → Workers → Host data, with live container labels. +""" + +from __future__ import annotations + +from collections import defaultdict +from typing import Dict, List, Optional + + +def _by_compose_service(rows: List[Dict[str, str]]) -> Dict[str, List[int]]: + by: Dict[str, List[int]] = defaultdict(list) + for i, r in enumerate(rows): + svc = (r.get("compose_service") or "").strip() + if svc and svc != "—": + by[svc].append(i) + return by + + +def _mermaid_plain(s: str, max_len: int) -> str: + return (s or "")[:max_len].replace('"', "'").replace("\n", " ").replace("#", " ") + + +def _row_mermaid_label(r: Dict[str, str]) -> str: + name = _mermaid_plain(r.get("name"), 38) + stt = _mermaid_plain(r.get("state"), 14) + svc = _mermaid_plain(r.get("compose_service"), 18) or "—" + hl = (r.get("health") or "").strip() + if hl and hl != "—": + return f"{name}
{stt} · {svc}
{_mermaid_plain(hl, 14)}" + return f"{name}
{stt} · {svc}" + + +def _row_class(r: Dict[str, str]) -> str: + s = (r.get("state") or "").lower() + if s == "running": + return "run" + if s in ("exited", "dead"): + return "x" + return "o" + + +def _nid(i: int) -> str: + return f"N{i}" + + +def _nid_list(idxs: List[int]) -> Optional[str]: + if not idxs: + return None + return " & ".join(_nid(i) for i in idxs) + + +def live_containers_mermaid(rows: List[Dict[str, str]]) -> str: + """ + flowchart LR with subgraphs matching Help / Readme.md: + Clients, Edge, App Tier, Infrastructure, Workers, Host data — plus live labels per container. + """ + if not rows: + return 'flowchart LR\n _empty["No containers in filter"]' + + by = _by_compose_service(rows) + nginx = sorted(by.get("nginx", []), key=lambda i: rows[i].get("name", "")) + st: List[int] = [] + for svc in sorted(s for s in by if s.startswith("streamlit")): + st.extend(sorted(by[svc], key=lambda i: rows[i].get("name", ""))) + redis = sorted(by.get("redis", []), key=lambda i: rows[i].get("name", "")) + pg = sorted(by.get("postgres", []), key=lambda i: rows[i].get("name", "")) + init = sorted(by.get("init_db", []), key=lambda i: rows[i].get("name", "")) + workers = sorted(by.get("worker", []), key=lambda i: rows[i].get("name", "")) + known = set(nginx + st + redis + pg + init + workers) + other = [i for i in range(len(rows)) if i not in known] + + def node_line(i: int) -> str: + r = rows[i] + return f' {_nid(i)}["{_row_mermaid_label(r)}"]:::{_row_class(r)}' + + lines: List[str] = [ + "flowchart LR", + " classDef run fill:#c8e6c9,stroke:#2e7d32", + " classDef x fill:#ffcdd2,stroke:#c62828", + " classDef o fill:#e0e0e0,stroke:#616161", + " classDef syn fill:#e3f2fd,stroke:#1565c0", + ' subgraph clients ["Clients"]', + " BR[Browser]:::syn", + " end", + ] + + if nginx: + lines.append(' subgraph edge ["Edge"]') + for i in nginx: + lines.append(node_line(i)) + lines.append(" end") + + if st: + lines.append(' subgraph app ["App Tier"]') + for i in st: + 
lines.append(node_line(i)) + lines.append(" end") + + infra = redis + pg + init + if infra: + lines.append(' subgraph infra ["Infrastructure"]') + for i in infra: + lines.append(node_line(i)) + lines.append(" end") + + if workers: + lines.append(' subgraph workers ["Workers"]') + for i in workers: + lines.append(node_line(i)) + lines.append(" end") + + lines.append(' subgraph volumes ["Host data"]') + lines.append(' DR[Data root
bind-mounted data]:::syn') + lines.append(" end") + + if other: + lines.append(' subgraph misc ["Other"]') + for i in other: + lines.append(node_line(i)) + lines.append(" end") + + lines.append("") + lines.append(" %% Same topology as Readme.md Help") + + nl_nginx = _nid_list(nginx) + nl_st = _nid_list(st) + nl_redis = _nid_list(redis) + nl_pg = _nid_list(pg) + nl_workers = _nid_list(workers) + + if nl_nginx: + lines.append(f" BR --> {nl_nginx}") + if nl_st: + for i in nginx: + lines.append(f" {_nid(i)} --> {nl_st}") + elif nl_st: + lines.append(f" BR --> {nl_st}") + + for i in st: + if nl_redis: + lines.append(f" {_nid(i)} --> {nl_redis}") + if nl_pg: + lines.append(f" {_nid(i)} --> {nl_pg}") + + for i in redis: + if nl_workers: + lines.append(f" {_nid(i)} --> {nl_workers}") + + for i in workers: + if nl_pg: + lines.append(f" {_nid(i)} --> {nl_pg}") + lines.append(f" {_nid(i)} --> DR") + + for i in init: + for j in pg: + lines.append(f" {_nid(i)} -.-> {_nid(j)}") + + return "\n".join(lines) diff --git a/evaluation_dashboard_app/lib/mermaid_render.py b/evaluation_dashboard_app/lib/mermaid_render.py new file mode 100644 index 0000000..47a72d1 --- /dev/null +++ b/evaluation_dashboard_app/lib/mermaid_render.py @@ -0,0 +1,30 @@ +"""Render Mermaid diagrams in Streamlit via Mermaid.js (Streamlit markdown does not run Mermaid).""" + +import json +import uuid + +import streamlit.components.v1 as components + + +def render_mermaid(definition: str, *, height: int = 480) -> None: + """Render a Mermaid diagram inside an HTML iframe (CDN script).""" + defn_json = json.dumps(definition.strip()) + uid = uuid.uuid4().hex[:12] + html = f""" +
+ + +""" + components.html(html, height=height, scrolling=True) diff --git a/evaluation_dashboard_app/pages/10_Help.py b/evaluation_dashboard_app/pages/10_Help.py index 8c9df7f..cfbf8bd 100644 --- a/evaluation_dashboard_app/pages/10_Help.py +++ b/evaluation_dashboard_app/pages/10_Help.py @@ -1,11 +1,9 @@ -import json import re -import uuid from pathlib import Path import streamlit as st -import streamlit.components.v1 as components +from lib.mermaid_render import render_mermaid from lib.page_chrome import inject_app_page_styles, render_page_hero st.set_page_config( @@ -27,30 +25,6 @@ IMAGE_PATTERN = re.compile(r"!\[(.*?)\]\((.*?)\)") -def _render_mermaid(definition: str) -> None: - """Render a Mermaid diagram inside an HTML component (CDN script).""" - defn_json = json.dumps(definition.strip()) - uid = uuid.uuid4().hex[:12] - html = f""" -
- - -""" - components.html(html, height=480, scrolling=True) - - def _render_markdown_with_images(chunk: str) -> None: parts = IMAGE_PATTERN.split(chunk) i = 0 @@ -76,4 +50,4 @@ def _render_markdown_with_images(chunk: str) -> None: if idx % 2 == 0: _render_markdown_with_images(piece) else: - _render_mermaid(piece) + render_mermaid(piece) diff --git a/evaluation_dashboard_app/pages/99_Deployment_Debug.py b/evaluation_dashboard_app/pages/99_Deployment_Debug.py index a85b1fb..7410a02 100644 --- a/evaluation_dashboard_app/pages/99_Deployment_Debug.py +++ b/evaluation_dashboard_app/pages/99_Deployment_Debug.py @@ -5,7 +5,7 @@ sidebar entry is hidden via CSS in lib/ui/styles_global.py; Overview shows a page_link only in Docker. """ import os -from datetime import timedelta +from datetime import datetime, timedelta import pandas as pd import streamlit as st @@ -27,6 +27,8 @@ running_in_docker, task_counts_by_status, ) +from lib.docker_live_structure import live_containers_mermaid +from lib.mermaid_render import render_mermaid from lib.page_chrome import inject_app_page_styles, render_page_hero, section_header st.set_page_config( @@ -106,7 +108,7 @@ def _render_docker_disabled(reason: str) -> None: """ **Enable Docker debug (trusted operators only)** -1. From the `deploy/` directory, ensure `docker-compose.yml` mounts `/var/run/docker.sock` into the `streamlit` service and sets `EVAL_DEPLOYMENT_DEBUG_DOCKER=1`, then run `docker compose up -d` (or `docker compose up -d --force-recreate streamlit` after editing compose). +1. From the `deploy/` directory, ensure `docker-compose.yml` mounts `/var/run/docker.sock` into each Streamlit service (`streamlit1`, `streamlit2`) and sets `EVAL_DEPLOYMENT_DEBUG_DOCKER=1`, then run `docker compose up -d` (or recreate those services after editing compose). 2. Set `EVAL_DEPLOYMENT_DEBUG_COMPOSE_PROJECT` in `.env` to your Compose project name (same value as in `docker compose ls`) so the UI lists only this stack’s containers. 
@@ -158,10 +160,37 @@ def _env_flag(name: str) -> bool: return os.environ.get(name, "").strip().lower() in ("1", "true", "yes") -with tab_docker: - section_header("Containers & logs", "Requires `EVAL_DEPLOYMENT_DEBUG_DOCKER` and `/var/run/docker.sock` in the Streamlit container.") +def _display_columns_for_containers(rows: list) -> pd.DataFrame: + """Column order for the live Docker table (hide internal full_id).""" + df = pd.DataFrame(rows) + if df.empty: + return df + preferred = [ + "name", + "state", + "health", + "compose_service", + "compose_project", + "image", + "id", + ] + cols = [c for c in preferred if c in df.columns] + rest = [c for c in df.columns if c not in cols and c != "full_id"] + return df[cols + rest] + + +def _render_live_stack_mermaid(rows: list) -> None: + """Help-style Mermaid (Clients / Edge / App Tier / …) with live container labels.""" + if not rows: + return + + mh = min(800, 280 + 52 * len(rows)) + render_mermaid(live_containers_mermaid(rows), height=mh) + +with tab_docker: client = docker_client_or_none() + if client is None: if not _env_flag("EVAL_DEPLOYMENT_DEBUG_DOCKER"): _render_docker_disabled( @@ -187,13 +216,6 @@ def _env_flag(name: str) -> bool: ) else: proj = compose_project_filter() - if proj: - st.caption(f"Filtering by Compose project label: `{proj}`") - else: - st.warning( - "Listing all containers on this Docker host. Set `EVAL_DEPLOYMENT_DEBUG_COMPOSE_PROJECT` in `.env` " - "to match `docker compose ls` and restrict the list." 
- ) _use_fragment = getattr(st, "fragment", None) is not None @@ -202,6 +224,7 @@ def _env_flag(name: str) -> bool: @st.fragment(run_every=timedelta(seconds=6)) def _docker_fragment(): rows, list_warn = list_containers_for_debug(client) + st.caption(f"Last refreshed (server clock): **{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}** — updates about every 6 s.") if list_warn and isinstance(list_warn, str) and list_warn.startswith("Docker list failed"): st.error(list_warn) return @@ -210,8 +233,10 @@ def _docker_fragment(): if not rows: st.info("No containers match the current filter.") return - display_df = pd.DataFrame(rows).drop(columns=["full_id"], errors="ignore") + section_header("Live container table", "Sortable columns; `full_id` stays internal for log/exec.") + display_df = _display_columns_for_containers(rows) st.dataframe(display_df, use_container_width=True, hide_index=True) + _render_live_stack_mermaid(rows) options = [f"{r['name']} ({r['id']})" for r in rows] id_by_label = {f"{r['name']} ({r['id']})": r["full_id"] for r in rows} @@ -228,6 +253,7 @@ def _docker_fragment(): full_id = id_by_label[pick] st.session_state.deploy_debug_cid = full_id + section_header("Logs", "Stdout/stderr from the selected container.") tail = st.slider( "Log tail (lines)", min_value=50, @@ -237,13 +263,13 @@ def _docker_fragment(): key="deploy_debug_tail", ) logs = container_logs_tail(client, full_id, tail) - st.markdown("**Logs**") st.code(logs or "(empty)", language=None) _render_docker_exec_ui(client, full_id) _docker_fragment() else: rows, list_warn = list_containers_for_debug(client) + st.caption(f"Loaded at **{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}** — use Refresh to re-query.") if list_warn and isinstance(list_warn, str) and list_warn.startswith("Docker list failed"): st.error(list_warn) elif list_warn: @@ -251,15 +277,14 @@ def _docker_fragment(): if not rows: st.info("No containers match the current filter.") else: - df = pd.DataFrame(rows) - st.dataframe( - 
df.drop(columns=["full_id"], errors="ignore"), - use_container_width=True, - hide_index=True, - ) + _render_live_stack_mermaid(rows) + section_header("Live container table", "Sortable columns; `full_id` stays internal for log/exec.") + display_df = _display_columns_for_containers(rows) + st.dataframe(display_df, use_container_width=True, hide_index=True) options = [f"{r['name']} ({r['id']})" for r in rows] id_by_label = {f"{r['name']} ({r['id']})": r["full_id"] for r in rows} pick = st.selectbox("Container", options=options, key="deploy_debug_pick_legacy") + section_header("Logs", "Stdout/stderr from the selected container.") tail = st.slider( "Log tail (lines)", min_value=50, @@ -270,8 +295,9 @@ def _docker_fragment(): ) full_id_legacy = id_by_label[pick] logs = container_logs_tail(client, full_id_legacy, tail) - st.markdown("**Logs**") st.code(logs or "(empty)", language=None) _render_docker_exec_ui(client, full_id_legacy) if st.button("Refresh container list"): st.rerun() + + st.page_link("pages/10_Help.py", label="Help & guide (full README, including static stack Mermaid)", icon="❔") From 88bb50a7a2d80233628dcf77120377d7e2e30b8e Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Thu, 26 Mar 2026 16:35:18 +0900 Subject: [PATCH 02/94] feat: enhance summary comparison and deployment configurations - Added `summary_delta_overlap_stats` function to compute and display statistics on overlapping keys between summary dataframes, improving delta alignment analysis. - Updated the Overview page to incorporate new summary delta statistics in comparison mode, enhancing user feedback on data overlaps. - Modified the Docker Compose configuration to include a health check for the Redis service, ensuring better monitoring of service health. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/Overview.py | 86 +++++++++++++++++-- .../deploy/docker-compose.yml | 7 +- .../lib/summary_compare.py | 53 ++++++++++++ .../lib/ui/styles_global.py | 34 ++++---- .../pages/1_TP_Summary.py | 14 ++- 5 files changed, 168 insertions(+), 26 deletions(-) diff --git a/evaluation_dashboard_app/Overview.py b/evaluation_dashboard_app/Overview.py index 3a08af7..3eeb5ec 100644 --- a/evaluation_dashboard_app/Overview.py +++ b/evaluation_dashboard_app/Overview.py @@ -6,7 +6,7 @@ import plotly.express as px import plotly.graph_objects as go from lib.user_config import UserConfig -from lib.summary_compare import build_summary_delta +from lib.summary_compare import build_summary_delta, summary_delta_overlap_stats from lib.page_chrome import ( inject_app_page_styles, render_loaded_data_section, @@ -30,12 +30,12 @@ # ====== CONFIG AND CONSTANTS ====== st.set_page_config(page_title="Overview", layout="wide", initial_sidebar_state="expanded") inject_app_page_styles() -if running_in_docker(): - st.sidebar.page_link( - "pages/99_Deployment_Debug.py", - label="Deployment debug", - icon="🐳", - ) +# if running_in_docker(): +# st.sidebar.page_link( +# "pages/99_Deployment_Debug.py", +# label="Deployment debug", +# icon="🐳", +# ) RUN_ROOT = get_data_root() PRODUCT_LABEL_JA = { "Occlusion-Case": "遮蔽ケース", @@ -407,6 +407,78 @@ def safe_load_run(path, label='Run'): for i in range(1, len(all_runs)): _ov_entries.append((f"Candidate · {run_labels[i]}", path_display(all_runs[i]["path"]))) render_loaded_data_section(_ov_entries) + +if mode == "Compare Mode" and compare_run_dirs: + _all_r = st.session_state.get("all_runs") + _lbls = st.session_state.get("run_labels") + if _all_r and _lbls and all(r.get("summary") is not None for r in _all_r): + _cand_stats: list[tuple[str, dict]] = [] + _overlap_rows: list[dict] = [] + _empty_labels: list[str] = [] + _invalid_msgs: list[str] = [] + for i in range(1, 
len(_all_r)): + cand = _lbls[i] + stt = summary_delta_overlap_stats(_all_r[0]["summary"], _all_r[i]["summary"]) + _cand_stats.append((cand, stt)) + if not stt.get("valid"): + _invalid_msgs.append(f"**{cand}:** {stt.get('error', 'Unknown error')}") + continue + join_s = " + ".join(stt["key_cols"]) + _overlap_rows.append( + { + "Candidate": cand, + "Join keys": join_s, + "Baseline rows": stt["n_rows_baseline"], + "Candidate rows": stt["n_rows_candidate"], + "Matched (Δ rows)": stt["n_matched_keys"], + "Keys only in A": stt["n_only_baseline"], + "Keys only in candidate": stt["n_only_candidate"], + } + ) + if stt["matched_empty"]: + _empty_labels.append(cand) + if _invalid_msgs: + st.warning( + "Cannot compute Summary delta alignment for some runs:\n\n" + + "\n\n".join(_invalid_msgs) + ) + if _empty_labels: + _join_cols = next( + (" + ".join(f"`{c}`" for c in s["key_cols"]) for cnd, s in _cand_stats if cnd in _empty_labels and s.get("valid")), + "`id` (or `id` + `perception_label` when both have it)", + ) + st.warning( + "**TP Summary delta views will be empty** for candidate(s) " + f"**{', '.join(_empty_labels)}**: baseline **A** and those runs share **no** overlapping " + f"Summary join keys ({_join_cols}). " + "The inner join drops every row; use **Baseline** or **Candidate** in the TP Summary sidebar, " + "or choose runs whose Summary rows use the same keys. " + "Open **Summary key overlap (delta alignment)** below for row counts and sample keys " + "that appear on only one side." + ) + with st.expander("Summary key overlap (delta alignment) — details", expanded=False): + st.markdown( + "Delta tables on **TP Summary** inner-join baseline **A** to each candidate on the " + "same keys as here: **`id`**, or **`id` + `perception_label`** when both summaries " + "include `perception_label`. Only **matched** keys produce rows; the rest are ignored." 
+ ) + st.dataframe(pd.DataFrame(_overlap_rows), width="stretch", hide_index=True) + for cand, stt in _cand_stats: + if not stt.get("valid"): + continue + sb = stt["sample_only_baseline"] + sc = stt["sample_only_candidate"] + if not sb and not sc: + continue + st.markdown(f"**Examples — candidate {cand}**") + c1, c2 = st.columns(2) + with c1: + st.caption("Up to 5 keys only in baseline A") + st.code("\n".join(sb) if sb else "(none)") + with c2: + st.caption(f"Up to 5 keys only in {cand}") + st.code("\n".join(sc) if sc else "(none)") + share_q = f"mode={'compare' if mode == 'Compare Mode' else 'single'}&run_a={run_a_dir.name}" if mode == "Compare Mode" and compare_run_names: for j, name in enumerate(compare_run_names): diff --git a/evaluation_dashboard_app/deploy/docker-compose.yml b/evaluation_dashboard_app/deploy/docker-compose.yml index aed2365..4b4416b 100644 --- a/evaluation_dashboard_app/deploy/docker-compose.yml +++ b/evaluation_dashboard_app/deploy/docker-compose.yml @@ -83,7 +83,12 @@ services: redis: image: redis:7-alpine restart: unless-stopped - + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 3s + retries: 5 + worker: build: context: .. 
diff --git a/evaluation_dashboard_app/lib/summary_compare.py b/evaluation_dashboard_app/lib/summary_compare.py index bb7272c..e152409 100644 --- a/evaluation_dashboard_app/lib/summary_compare.py +++ b/evaluation_dashboard_app/lib/summary_compare.py @@ -21,3 +21,56 @@ def build_summary_delta(df_a: pd.DataFrame, df_b: pd.DataFrame) -> pd.DataFrame: result[f"{m}_B"] = df_b.loc[common_idx, m] result[f"{m}_delta"] = df_b.loc[common_idx, m] - df_a.loc[common_idx, m] return result.reset_index() + + +def summary_delta_overlap_stats(df_a: pd.DataFrame, df_b: pd.DataFrame) -> dict: + """Describe index overlap used by :func:`build_summary_delta` (same join-key rules).""" + if df_a is None or df_b is None: + return {"valid": False, "error": "Summary dataframe missing.", "key_cols": []} + if "id" not in df_a.columns or "id" not in df_b.columns: + return { + "valid": False, + "error": "Summary must include an `id` column for delta alignment.", + "key_cols": ["id"], + } + if "perception_label" in df_a.columns and "perception_label" in df_b.columns: + key_cols = ["id", "perception_label"] + else: + key_cols = ["id"] + for c in key_cols: + if c not in df_a.columns or c not in df_b.columns: + return { + "valid": False, + "error": f"Join needs column `{c}` in both summaries; one run is missing it.", + "key_cols": key_cols, + } + + idx_a = df_a.set_index(key_cols).index + idx_b = df_b.set_index(key_cols).index + common = idx_a.intersection(idx_b) + only_a = idx_a.difference(idx_b) + only_b = idx_b.difference(idx_a) + + def _sample(idx_diff: pd.Index, k: int = 5) -> list[str]: + if len(idx_diff) == 0: + return [] + out: list[str] = [] + for x in list(idx_diff)[:k]: + if isinstance(x, tuple): + out.append(", ".join(str(p) for p in x)) + else: + out.append(str(x)) + return out + + return { + "valid": True, + "key_cols": key_cols, + "n_rows_baseline": int(len(df_a)), + "n_rows_candidate": int(len(df_b)), + "n_matched_keys": int(len(common)), + "n_only_baseline": int(len(only_a)), + 
"n_only_candidate": int(len(only_b)), + "sample_only_baseline": _sample(only_a), + "sample_only_candidate": _sample(only_b), + "matched_empty": len(common) == 0, + } diff --git a/evaluation_dashboard_app/lib/ui/styles_global.py b/evaluation_dashboard_app/lib/ui/styles_global.py index f4118e8..a66be10 100644 --- a/evaluation_dashboard_app/lib/ui/styles_global.py +++ b/evaluation_dashboard_app/lib/ui/styles_global.py @@ -49,21 +49,21 @@ def inject_app_page_styles() -> None: """, unsafe_allow_html=True, ) - try: - from lib.deploy_debug import running_in_docker + # try: + # from lib.deploy_debug import running_in_docker - if not running_in_docker(): - st.markdown( - """ - - """, - unsafe_allow_html=True, - ) - except Exception: - pass + # if not running_in_docker(): + # st.markdown( + # """ + # + # """, + # unsafe_allow_html=True, + # ) + # except Exception: + # pass diff --git a/evaluation_dashboard_app/pages/1_TP_Summary.py b/evaluation_dashboard_app/pages/1_TP_Summary.py index c08e7a1..2cc61e9 100644 --- a/evaluation_dashboard_app/pages/1_TP_Summary.py +++ b/evaluation_dashboard_app/pages/1_TP_Summary.py @@ -120,6 +120,18 @@ if tp_col not in df_active.columns: st.warning(f"Missing required column: {tp_col}") st.stop() + +if df_active.empty: + if use_delta: + _keys = "id and perception_label" if "perception_label" in df_active.columns else "id" + st.warning( + f"No delta rows: baseline and candidate share no common Summary keys ({_keys}). " + "Pick Baseline or Candidate in the sidebar, or load runs with overlapping rows." 
+ ) + else: + st.warning("The active Summary has no rows for this view.") + st.stop() + tp_values = df_active[tp_col] tp_min_val = float(tp_values.min()) tp_max_val = float(tp_values.max()) @@ -153,7 +165,7 @@ # ========== Data Filtering ========== df_f = df_active[(df_active[tp_col] >= tp_min) & (df_active[tp_col] <= tp_max)].copy() -if clip_vel: +if clip_vel and not df_f.empty: vx_col = "vx_delta" if use_delta else "vx" vy_col = "vy_delta" if use_delta else "vy" for c in (vx_col, vy_col): From c41458cfee04f01545cda2ce8394ab468b2ace3b Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Fri, 27 Mar 2026 16:28:16 +0900 Subject: [PATCH 03/94] feat: enhance Streamlit deployment and debugging capabilities - Added support for multiple Streamlit instances in the Docker configuration, allowing for high availability setups. - Introduced a new `.streamlit/config.toml` file to configure server options, including WebSocket settings for improved performance. - Implemented a `detection_stats_debug.py` module for verbose logging and debugging of the Detection Stats page, aiding in troubleshooting. - Updated the Overview page and other components to utilize session state hydration from URL parameters, improving user experience across multiple replicas. - Enhanced the Readme.md with updated build instructions and environment variable details for better clarity. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../.streamlit/config.toml | 13 + evaluation_dashboard_app/Dockerfile | 3 + evaluation_dashboard_app/Readme.md | 6 +- evaluation_dashboard_app/deploy/.env.example | 8 +- .../deploy/docker-compose.yml | 12 +- .../deploy/nginx/nginx.conf | 22 +- .../lib/detection_stats_debug.py | 160 + .../lib/overview_url_hydrate.py | 73 + .../lib/ui/detection_stats.py | 23 +- .../pages/1_TP_Summary.py | 7 +- .../pages/2_Criteria_Based_Score.py | 8 +- .../pages/3_Detection_Stats.py | 4534 +++++++++-------- 12 files changed, 2653 insertions(+), 2216 deletions(-) create mode 100644 evaluation_dashboard_app/.streamlit/config.toml create mode 100644 evaluation_dashboard_app/lib/detection_stats_debug.py create mode 100644 evaluation_dashboard_app/lib/overview_url_hydrate.py diff --git a/evaluation_dashboard_app/.streamlit/config.toml b/evaluation_dashboard_app/.streamlit/config.toml new file mode 100644 index 0000000..5efa430 --- /dev/null +++ b/evaluation_dashboard_app/.streamlit/config.toml @@ -0,0 +1,13 @@ +# Streamlit project config (used for local `streamlit run` and Docker WORKDIR=/app). +# See https://docs.streamlit.io/develop/api-reference/configuration/config.toml + +[server] +# Container / headless deployments (no local browser) +headless = true + +# Behind nginx or other proxies, per-message WebSocket compression can break or stall +# some setups (see Streamlit troubleshooting: "App is not loading when running remotely"). +enableWebsocketCompression = false + +# cookieSecret: MUST be identical on every Streamlit replica behind a load balancer. +# Set via environment in Docker: STREAMLIT_SERVER_COOKIE_SECRET (see deploy/docker-compose.yml). 
diff --git a/evaluation_dashboard_app/Dockerfile b/evaluation_dashboard_app/Dockerfile index 891c8a8..df7e71e 100644 --- a/evaluation_dashboard_app/Dockerfile +++ b/evaluation_dashboard_app/Dockerfile @@ -42,6 +42,9 @@ RUN --mount=type=secret,id=ssh,dst=/tmp/ssh_key \ COPY requirements-docker.txt . RUN python3 -m pip install --no-cache-dir -r requirements-docker.txt +# Streamlit server options (WebSocket proxy, headless, etc.) +COPY .streamlit/ .streamlit/ + # Copy application code and config COPY Overview.py . COPY pages/ pages/ diff --git a/evaluation_dashboard_app/Readme.md b/evaluation_dashboard_app/Readme.md index 8dabed9..34cba4b 100644 --- a/evaluation_dashboard_app/Readme.md +++ b/evaluation_dashboard_app/Readme.md @@ -295,7 +295,7 @@ flowchart LR W2 --> DataRoot ``` -- **ビルド**: 上記「ビルド手順」のとおり `evaluation_dashboard_app/` で `docker build ... -t evaluation-dashboard .`(compose の `streamlit1` / `streamlit2` / `worker` はこのイメージを参照します)。 +- **ビルド**: 上記「ビルド手順」のとおり `evaluation_dashboard_app/` で `docker build ... 
-t evaluation-dashboard .`(compose の `streamlit1`(既定)・任意の `streamlit2`(`--profile ha`)・`worker` はこのイメージを参照します)。 - **推奨フロー(`deploy/` の番号付きスクリプト)**: `deploy/` に移動して順に実行します(すべて `docker compose --env-file .env` を使います)。 | スクリプト | 内容 | @@ -312,6 +312,10 @@ flowchart LR - **手動でも同じことは可能**: `cd deploy && cp .env.example .env` → `.env` を編集 → `docker compose --env-file .env up -d`。初回のみ `docker compose --env-file .env run --rm init_db`(`03_INIT_DB.sh` と同等)。 - **アクセス**: 本番 compose では **Nginx がポート 80**、Streamlit はプロキシ経由(`docker-compose.yml` / `nginx/nginx.conf` 参照)。ソースや `lib/` はマウントされているため **Streamlit はファイル変更でリロード**しやすい一方、**ワーカーは Python 変更後に再起動**が必要です。 +- **UI がずっとロード中になるとき**: Streamlit はブラウザと **WebSocket** でつながります。対処の目安: (1) **ハードリロード**(キャッシュ削除込み)や別タブで開き直す。(2) **既定は Streamlit アプリ 1 台**(`streamlit1`)のみ Nginx が向き先にしています。2 台目が必要な場合のみ `docker compose --profile ha up -d` と `nginx.conf` の upstream 追記を参照。(3) compose で **`STREAMLIT_SERVER_COOKIE_SECRET`**(`deploy/.env.example`)。(4) **`.streamlit/config.toml`** の `enableWebsocketCompression = false` と Nginx の **`proxy_buffering off`** / `proxy_*_timeout`。(5) ログ: `docker compose logs streamlit1 nginx`。 +- **502 Bad Gateway**: Nginx が **Streamlit に繋がらない**ときに出ます(プロセス落ち・OOM・長時間ブロックで切断など)。`docker compose logs streamlit1` とホストの **`dmesg`(OOM)** を確認。重いページはメモリを食うため、**既定の 1 台構成**と `deploy/nginx/nginx.conf` の単一 upstream を推奨します。 +- **Detection Stats のフリーズ / 502 切り分け**: `.env` に **`EVAL_DETECTION_STATS_DEBUG=1`**(compose の `streamlit1` に渡る)を入れて再起動。ページ下部の **Detection Stats debug** 展開と **`docker compose logs streamlit1`** の stderr に、セクション境界・`getrusage` メモリ・DuckDB 前後の経過時間が出ます。 +- **サブページで「Overview で読み込み」と出るのに Overview は済んでいるとき**: セッション状態は **レプリカごとのメモリ**にあります。Overview は URL に `mode` / `run_a` / `run_b`…を同期するため、**同じ URL のクエリが付いたまま**ならサブページ(Detection Stats など)が **`run_a` から `runA` を再構築**します(`lib/overview_url_hydrate.py`)。一度 **Overview を開いて**アドレスバーに `run_a=` があることを確認してからサブページへ進むか、または **Overview の共有リンク**から開き直してください。 - **設定の二重管理を避ける**: compose 実行時は 
`deploy/configs/autoware_evaluator_dl_config.json` がコンテナ内 `EVAL_DASHBOARD_CONFIG`(`/app/docker_config/...`)としてマウントされます。ホストの `configs/` とは別ファイルなので、Docker 用に変えたい値はこちらを編集します。 - 詳細・環境変数一覧は [docs/PRODUCTION_DEPLOYMENT.md](docs/PRODUCTION_DEPLOYMENT.md) を参照してください。 diff --git a/evaluation_dashboard_app/deploy/.env.example b/evaluation_dashboard_app/deploy/.env.example index 907a157..5cf7d38 100644 --- a/evaluation_dashboard_app/deploy/.env.example +++ b/evaluation_dashboard_app/deploy/.env.example @@ -17,9 +17,15 @@ DATABASE_URL=postgresql://eval_user:eval_pass@postgres:5432/eval_dashboard REDIS_URL=redis://redis:6379/0 RQ_QUEUE=default -# Docker Compose: default worker replica count (04_START.sh / 08_REBUILD_AND_START.sh). Streamlit runs as streamlit1 + streamlit2 (see docker-compose.yml + nginx). +# Docker Compose: default worker replica count (04_START.sh / 08_REBUILD_AND_START.sh). Streamlit defaults to streamlit1 only; optional second app server: compose --profile ha (see docker-compose.yml + nginx.conf). EVAL_COMPOSE_SCALE_WORKER=2 +# Same secret on both Streamlit containers (session cookies / multi-replica). Compose sets a dev default; override in production: openssl rand -hex 32 +# STREAMLIT_SERVER_COOKIE_SECRET= + +# Detection Stats page: stderr timing logs + debug expander (docker compose logs streamlit1) +# EVAL_DETECTION_STATS_DEBUG=1 + # RQ: max job runtime before the worker kills the job (seconds). Default 7 days if unset. # RQ_JOB_TIMEOUT_SEC=604800 # Optional: longer timeout for build_parquet only (defaults to RQ_JOB_TIMEOUT_SEC if unset) diff --git a/evaluation_dashboard_app/deploy/docker-compose.yml b/evaluation_dashboard_app/deploy/docker-compose.yml index 4b4416b..ce2d1fe 100644 --- a/evaluation_dashboard_app/deploy/docker-compose.yml +++ b/evaluation_dashboard_app/deploy/docker-compose.yml @@ -1,4 +1,5 @@ -# Production-style stack: Nginx -> Streamlit (x2) + Workers (default scale via EVAL_COMPOSE_SCALE_WORKER in .env / 04_START.sh). 
+# Production-style stack: Nginx -> Streamlit (default: one app replica) + Workers (default scale via EVAL_COMPOSE_SCALE_WORKER in .env / 04_START.sh). +# Second Streamlit: optional HA profile — `docker compose --profile ha up -d` and uncomment streamlit2 in deploy/nginx/nginx.conf upstream. # Helper scripts (run from deploy/): 01_SETUP_ENV.sh 02_BUILD.sh 03_INIT_DB.sh 04_START.sh 05_STOP.sh # 06_STATUS.sh 07_LOGS.sh 08_REBUILD_AND_START.sh 09_RESTART_WORKER.sh # Run from deploy/: docker compose --env-file .env up -d @@ -30,6 +31,11 @@ x-streamlit-app: &streamlit-app command: ["/app/docker-entrypoint.sh"] environment: - TZ=Asia/Tokyo + # Same value on streamlit1 + streamlit2 so session cookies validate behind nginx (see .streamlit/config.toml). + # Override in .env for production (e.g. openssl rand -hex 32). + - STREAMLIT_SERVER_COOKIE_SECRET=${STREAMLIT_SERVER_COOKIE_SECRET:-evaluationdashboard-streamlit-cookie-secret-change-in-production} + # Verbose stderr logs + timing expander on Detection Stats page (see lib/detection_stats_debug.py) + - EVAL_DETECTION_STATS_DEBUG=${EVAL_DETECTION_STATS_DEBUG:-0} - EVAL_DASHBOARD_DATA_ROOT=${EVAL_DASHBOARD_DATA_ROOT:-/app/data} - EVAL_DASHBOARD_CONFIG=/app/docker_config/autoware_evaluator_dl_config.json - USE_TASK_QUEUE=${USE_TASK_QUEUE:-true} @@ -52,6 +58,7 @@ x-streamlit-app: &streamlit-app - ../lib:/app/lib - ../worker:/app/worker - ../configs:/app/configs + - ../.streamlit:/app/.streamlit - /var/run/docker.sock:/var/run/docker.sock env_file: - .env @@ -71,7 +78,6 @@ services: - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro depends_on: - streamlit1 - - streamlit2 restart: unless-stopped streamlit1: @@ -79,6 +85,8 @@ services: streamlit2: <<: *streamlit-app + profiles: + - ha redis: image: redis:7-alpine diff --git a/evaluation_dashboard_app/deploy/nginx/nginx.conf b/evaluation_dashboard_app/deploy/nginx/nginx.conf index ee8278c..dd03703 100644 --- a/evaluation_dashboard_app/deploy/nginx/nginx.conf +++ 
b/evaluation_dashboard_app/deploy/nginx/nginx.conf @@ -1,30 +1,42 @@ # Nginx: reverse proxy to Streamlit with WebSocket support. -# Multiple Streamlit replicas: ip_hash keeps a client on one backend (Streamlit sessions are not shared). +# +# Default upstream is streamlit1 only. A second replica (streamlit2) is optional in docker-compose +# (profile "ha"); if you add it, duplicate the server line below and use ip_hash for sticky sessions. +# Pointing nginx at a dead/crashed upstream yields 502 — single replica reduces RAM pressure and failure modes. events { - worker_connections 1024; + worker_connections 2048; } http { upstream streamlit { - ip_hash; - server streamlit1:8501; - server streamlit2:8501; + server streamlit1:8501 max_fails=3 fail_timeout=10s; + # Optional second app server (start compose with --profile ha and uncomment): + # server streamlit2:8501 max_fails=3 fail_timeout=10s; + # ip_hash; # required if you use two server lines above } server { listen 80; server_name _; + client_max_body_size 200m; + location / { proxy_pass http://streamlit; proxy_http_version 1.1; + proxy_buffering off; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; proxy_set_header X-Forwarded-Proto $scheme; proxy_set_header Upgrade $http_upgrade; proxy_set_header Connection "upgrade"; + proxy_connect_timeout 60s; + proxy_send_timeout 86400; proxy_read_timeout 86400; + # Large Streamlit responses / occasional upstream quirks + proxy_buffer_size 128k; + proxy_buffers 8 256k; } } } diff --git a/evaluation_dashboard_app/lib/detection_stats_debug.py b/evaluation_dashboard_app/lib/detection_stats_debug.py new file mode 100644 index 0000000..23c1df4 --- /dev/null +++ b/evaluation_dashboard_app/lib/detection_stats_debug.py @@ -0,0 +1,160 @@ +""" +Optional verbose logging for pages/3_Detection_Stats.py (502 / freeze / OOM debugging). 
+ +Enable with environment variable: + EVAL_DETECTION_STATS_DEBUG=1 + +Logs go to stderr (visible in `docker compose logs streamlit1`). +""" + +from __future__ import annotations + +import logging +import os +import resource +import sys +import time +import traceback +from contextlib import contextmanager +from typing import Any, List, Tuple + +_LOG = logging.getLogger("eval_dashboard.detection_stats") +_CONFIGURED = False + + +def detection_stats_debug_enabled() -> bool: + v = os.environ.get("EVAL_DETECTION_STATS_DEBUG", "").strip().lower() + return v in ("1", "true", "yes", "on") + + +def _ensure_logging() -> None: + global _CONFIGURED + if not detection_stats_debug_enabled(): + return + if _CONFIGURED: + return + _LOG.setLevel(logging.DEBUG) + h = logging.StreamHandler(sys.stderr) + h.setFormatter( + logging.Formatter("%(asctime)s [%(levelname)s] detection_stats: %(message)s") + ) + _LOG.addHandler(h) + _LOG.propagate = False + _CONFIGURED = True + + +def ds_dlog(fmt: str, *args: Any) -> None: + """Log one line when debug is enabled.""" + if not detection_stats_debug_enabled(): + return + _ensure_logging() + try: + _LOG.info(fmt, *args) + except Exception: + _LOG.info("%s %s", fmt, args) + + +def ds_debug_init_session_state(session_state: Any) -> None: + """Call once per script run (after set_page_config). 
Resets timing buffer.""" + if not detection_stats_debug_enabled(): + return + session_state["_ds_debug_timings"] = [] + session_state["_ds_debug_run_started"] = time.perf_counter() + ds_dlog("=== Detection Stats script run started ===") + ds_dlog("pid=%s argv[0]=%s", os.getpid(), sys.argv[0] if sys.argv else "") + for key in ( + "EVAL_DETECTION_STATS_DEBUG", + "STREAMLIT_SERVER_COOKIE_SECRET", + "EVAL_DASHBOARD_DATA_ROOT", + ): + v = os.environ.get(key) + if key == "STREAMLIT_SERVER_COOKIE_SECRET" and v: + ds_dlog("env %s=(set len=%s)", key, len(v)) + else: + ds_dlog("env %s=%r", key, v) + + +def ds_debug_log_memory(note: str = "") -> None: + if not detection_stats_debug_enabled(): + return + try: + ru = resource.getrusage(resource.RUSAGE_SELF) + # Linux: ru_maxrss kilobytes; macOS: bytes (best-effort label) + ds_dlog( + "MEM %s ru_maxrss=%s ru_utime=%.3fs ru_stime=%.3fs", + note, + ru.ru_maxrss, + ru.ru_utime, + ru.ru_stime, + ) + except Exception as e: + ds_dlog("MEM %s (unavailable: %s)", note, e) + + +def _append_timing(session_state: Any, name: str, seconds: float) -> None: + if not detection_stats_debug_enabled(): + return + lst = session_state.get("_ds_debug_timings") + if not isinstance(lst, list): + lst = [] + session_state["_ds_debug_timings"] = lst + lst.append((name, seconds)) + + +@contextmanager +def ds_dtimer(name: str, session_state: Any): + """Time a block; record to session_state for the debug expander.""" + if not detection_stats_debug_enabled(): + yield + return + t0 = time.perf_counter() + ds_dlog("TIMER start %s", name) + try: + yield + finally: + dt = time.perf_counter() - t0 + ds_dlog("TIMER end %s (%.3fs)", name, dt) + _append_timing(session_state, name, dt) + + +def ds_debug_log_exception(where: str, exc: BaseException) -> None: + if not detection_stats_debug_enabled(): + return + _ensure_logging() + _LOG.exception("EXCEPTION in %s: %s", where, exc) + + +def ds_debug_render_expander(session_state: Any) -> None: + """Renders a Streamlit 
expander with timings + env (only if debug on).""" + import streamlit as st + + if not detection_stats_debug_enabled(): + return + t_run = session_state.get("_ds_debug_run_started") + total_s = None + if isinstance(t_run, (int, float)): + total_s = time.perf_counter() - float(t_run) + + timings: List[Tuple[str, float]] = session_state.get("_ds_debug_timings") or [] + lines = [ + f"Total wall time (approx): {total_s:.3f}s" if total_s is not None else "Total wall time: n/a", + "", + "Section timings (seconds):", + ] + for name, sec in timings: + lines.append(f" - {name}: {sec:.3f}s") + if not timings: + lines.append(" (no ds_dtimer sections recorded)") + + lines.extend( + [ + "", + "Environment (subset):", + f" EVAL_DETECTION_STATS_DEBUG={os.environ.get('EVAL_DETECTION_STATS_DEBUG', '')!r}", + f" EVAL_DASHBOARD_DATA_ROOT={os.environ.get('EVAL_DASHBOARD_DATA_ROOT', '')!r}", + ] + ) + + with st.expander("Detection Stats debug (EVAL_DETECTION_STATS_DEBUG=1)", expanded=False): + st.code("\n".join(lines), language="text") + st.caption("Check `docker compose logs streamlit1` for the same lines on stderr.") diff --git a/evaluation_dashboard_app/lib/overview_url_hydrate.py b/evaluation_dashboard_app/lib/overview_url_hydrate.py new file mode 100644 index 0000000..5e4b2ed --- /dev/null +++ b/evaluation_dashboard_app/lib/overview_url_hydrate.py @@ -0,0 +1,73 @@ +""" +Rehydrate session_state from Overview URL query params when server-side session is empty. + +Overview syncs `mode`, `run_a`, `run_b`, ... via `st.query_params`. After a load-balancer hop to a +different Streamlit replica, `st.session_state` may not contain `runA` even though the user already +used Overview — the URL still encodes the selection. This module rebuilds `runA` / compare state +from that URL so multipage analysis works without requiring Overview to run again on the same box. 
+""" + +from __future__ import annotations + +import streamlit as st + +from lib.path_utils import get_data_root, list_run_directories +from lib.run_loader import load_run + + +def try_hydrate_session_from_overview_query_params() -> bool: + """ + If `runA` is missing but the URL has Overview-style params (`run_a`, optional `mode` / `run_b`…), + load runs and populate `session_state`. Returns True if `runA` is present afterward. + """ + if "runA" in st.session_state: + return True + params = st.query_params + run_a_name = params.get("run_a") + if not run_a_name: + return False + root = get_data_root() + if not root.exists() or not root.is_dir(): + return False + run_dirs = list_run_directories() + name_to_dir = {p.name: p for p in run_dirs} + if run_a_name not in name_to_dir: + return False + mode_param = (params.get("mode") or "single").lower() + try: + if mode_param == "compare": + url_compare = [ + params.get(k) + for k in ("run_b", "run_c", "run_d", "run_e") + if params.get(k) + ] + valid = [n for n in url_compare if n in name_to_dir] + if not valid: + return False + run_a_dir = name_to_dir[run_a_name] + compare_dirs = [name_to_dir[n] for n in valid] + all_dirs = [run_a_dir] + compare_dirs + run_labels = ["A"] + [chr(66 + i) for i in range(len(compare_dirs))] + all_runs = [load_run(d) for d in all_dirs] + st.session_state.update( + { + "mode": "Compare Mode", + "runA": all_runs[0], + "all_runs": all_runs, + "run_labels": run_labels, + "df_cmp": None, + } + ) + if len(all_runs) >= 2: + st.session_state["runB"] = all_runs[1] + else: + st.session_state["runB"] = None + return True + run_a = load_run(name_to_dir[run_a_name]) + st.session_state["runA"] = run_a + st.session_state["mode"] = "Single Mode" + for key in ("all_runs", "run_labels", "runB", "df_cmp"): + st.session_state.pop(key, None) + return True + except Exception: + return False diff --git a/evaluation_dashboard_app/lib/ui/detection_stats.py b/evaluation_dashboard_app/lib/ui/detection_stats.py index 
780d245..b4d0f1d 100644 --- a/evaluation_dashboard_app/lib/ui/detection_stats.py +++ b/evaluation_dashboard_app/lib/ui/detection_stats.py @@ -2,7 +2,6 @@ from __future__ import annotations -import html from contextlib import contextmanager import streamlit as st @@ -272,25 +271,15 @@ def section_header_html(title: str, caption: str = "") -> str: return f'
{title}
' -def ds_spot_loading_markup(label: str) -> str: - """Compact inline HTML: shows where the app is busy (Streamlit runs top-to-bottom, so this “moves” down the page).""" - safe = html.escape(label) - return f"""
- - Working here - {safe} - -
""" +def ds_spot_loading_markup(_label: str) -> str: + """Spot loader HTML disabled (was: “Working here” + label); returns empty string.""" + return "" @contextmanager -def ds_spot_loading(label: str): - slot = st.empty() - slot.markdown(ds_spot_loading_markup(label), unsafe_allow_html=True) - try: - yield - finally: - slot.empty() +def ds_spot_loading(_label: str): + """Spot loader context manager disabled (no-op); kept for call-site compatibility.""" + yield def detection_stats_page_loading_banner_markup() -> str: """Top-of-page banner while queries and charts stream in.""" diff --git a/evaluation_dashboard_app/pages/1_TP_Summary.py b/evaluation_dashboard_app/pages/1_TP_Summary.py index 2cc61e9..cd955d3 100644 --- a/evaluation_dashboard_app/pages/1_TP_Summary.py +++ b/evaluation_dashboard_app/pages/1_TP_Summary.py @@ -2,15 +2,20 @@ import plotly.express as px import pandas as pd from lib.path_utils import path_display +from lib.overview_url_hydrate import try_hydrate_session_from_overview_query_params from lib.page_chrome import inject_app_page_styles, render_loaded_data_section, render_page_hero, section_header from lib.summary_compare import build_summary_delta st.set_page_config(layout="wide", page_title="TP Summary", page_icon="📈", initial_sidebar_state="expanded") +try_hydrate_session_from_overview_query_params() inject_app_page_styles() # ========== Safety Check ========== if "runA" not in st.session_state: - st.warning("Please load data from the Overview page first.") + st.warning( + "Please load data from the Overview page first. " + "If you already did, open Overview once so the URL includes `run_a=...`, then return (multiple Streamlit replicas)." 
+ ) st.stop() mode = st.session_state.get("mode", "Single Run") diff --git a/evaluation_dashboard_app/pages/2_Criteria_Based_Score.py b/evaluation_dashboard_app/pages/2_Criteria_Based_Score.py index 96d13e3..ba7b553 100644 --- a/evaluation_dashboard_app/pages/2_Criteria_Based_Score.py +++ b/evaluation_dashboard_app/pages/2_Criteria_Based_Score.py @@ -4,6 +4,7 @@ import plotly.express as px import plotly.graph_objects as go from lib.path_utils import path_display +from lib.overview_url_hydrate import try_hydrate_session_from_overview_query_params from lib.page_chrome import ( inject_app_page_styles, render_loaded_data_section, @@ -33,7 +34,7 @@ page_icon="📊", initial_sidebar_state="expanded", ) - +try_hydrate_session_from_overview_query_params() # Plotly theme (multi-run palette aligned with Overview / run cards) _COMPARE_RUN_COLORS = ["#312e81", "#0f766e", "#e86a33", "#6b8e23", "#9b59b6", "#1abc9c"] @@ -143,7 +144,10 @@ def _apply_gate_data_filters( # Safety check # ========================= if "runA" not in st.session_state: - st.warning("Please load data from the Overview page first.") + st.warning( + "Please load data from the Overview page first. " + "If you already did, open Overview once so the URL includes `run_a=...`, then return (multiple Streamlit replicas)." 
+ ) st.stop() mode = st.session_state.get("mode", "Single Run") diff --git a/evaluation_dashboard_app/pages/3_Detection_Stats.py b/evaluation_dashboard_app/pages/3_Detection_Stats.py index c10fcc7..c6fd72c 100644 --- a/evaluation_dashboard_app/pages/3_Detection_Stats.py +++ b/evaluation_dashboard_app/pages/3_Detection_Stats.py @@ -12,6 +12,15 @@ from typing import Optional, List, Tuple from lib.path_utils import path_display +from lib.detection_stats_debug import ( + ds_debug_init_session_state, + ds_debug_log_exception, + ds_debug_log_memory, + ds_debug_render_expander, + ds_dlog, + ds_dtimer, +) +from lib.overview_url_hydrate import try_hydrate_session_from_overview_query_params from lib.parquet_schema import schema_flags from lib.page_chrome import inject_app_page_styles, render_loaded_data_section, render_page_hero from lib.ui.detection_stats import ( @@ -281,11 +290,18 @@ def _scalar_metric_spider_compare( initial_sidebar_state="expanded", ) +try_hydrate_session_from_overview_query_params() +ds_debug_init_session_state(st.session_state) + # ============================= # Session state from Overview (mode, run paths) # ============================= if "runA" not in st.session_state: - st.warning("Please load data from the **Overview** page first (select mode and run(s)).") + st.warning( + "Please load data from the **Overview** page first (select mode and run(s)). " + "If you already did, open **Overview** once so the URL includes `run_a=...` (share link), then return — " + "or hard-refresh. With multiple Streamlit replicas, the server-side session may not follow until the URL is synced." 
+ ) st.stop() inject_app_page_styles() @@ -316,16 +332,24 @@ def list_parquets_in_run(run_path) -> List[str]: return sorted([str(f.resolve()) for f in p.glob("*.parquet")]) # ============================= -# DuckDB Connection +# DuckDB Connection (one in-memory DB per Streamlit browser session) # ============================= -_duckdb_connection: Optional[duckdb.DuckDBPyConnection] = None - def get_duckdb_connection() -> duckdb.DuckDBPyConnection: - """Return a shared DuckDB connection for all queries.""" - global _duckdb_connection - if _duckdb_connection is None: - _duckdb_connection = duckdb.connect() - return _duckdb_connection + """Return a DuckDB connection scoped to this Streamlit session.""" + if "_ds_duckdb" not in st.session_state: + st.session_state["_ds_duckdb"] = duckdb.connect() + return st.session_state["_ds_duckdb"] + + +def _parquet_selection_fingerprint(paths: List[str]) -> Tuple[Tuple[str, float], ...]: + """Path + mtime per file so filter-only reruns skip rebuilding views when data is unchanged.""" + fp: List[Tuple[str, float]] = [] + for p in paths: + try: + fp.append((p, os.path.getmtime(p))) + except OSError: + fp.append((p, 0.0)) + return tuple(fp) # ============================= # Helper Functions @@ -413,12 +437,8 @@ def create_view_eval_flat(con, target_file: str, view_name: str = "view_eval_fla """ con.execute(query) -def create_view_tpr_fpr(con, view_name: str = "view_tpr_fpr_by_class_dist_topic", source_eval_flat: str = "view_eval_flat"): - """Create TPR/FPR view. source_eval_flat is the name of the eval_flat view to read from.""" - query = f""" - CREATE OR REPLACE VIEW {view_name} AS - WITH stats AS ( - SELECT +# Per-(dataset, topic, label, bin, visibility, suite) aggregates — shared by distance-bin rate queries. 
+_TPR_FPR_STATS_SELECT = """SELECT t4dataset_id, topic_name, label, @@ -429,55 +449,110 @@ def create_view_tpr_fpr(con, view_name: str = "view_tpr_fpr_by_class_dist_topic" COUNT(*) FILTER (WHERE source='GT' AND status IN ('TP','FN')) AS gt_total, COUNT(*) FILTER (WHERE source='GT' AND status='TP') AS tp_gt, COUNT(*) FILTER (WHERE source='EST' AND status IN ('TP','FP')) AS est_total, - COUNT(*) FILTER (WHERE source='EST' AND status='FP') AS fp_est - FROM {source_eval_flat} - GROUP BY - t4dataset_id, topic_name, label, distance_bin, bin_idx, + COUNT(*) FILTER (WHERE source='EST' AND status='FP') AS fp_est""" + +_TPR_FPR_STATS_GROUP_BY = """t4dataset_id, topic_name, label, distance_bin, bin_idx, coalesce(try(CAST(visibility AS VARCHAR)), 'not available'), - coalesce(try(CAST(suite_name AS VARCHAR)), '') - ) - SELECT - *, - CASE WHEN gt_total > 0 THEN CAST(tp_gt AS DOUBLE) / gt_total ELSE NULL END AS tpr, - CASE WHEN est_total > 0 THEN CAST(fp_est AS DOUBLE) / est_total ELSE NULL END AS fpr - FROM stats + coalesce(try(CAST(suite_name AS VARCHAR)), '')""" + + +def sql_distance_bin_rates_from_eval_flat( + source_eval_flat: str, + filter_clause: str, + *, + metrics: str = "both", +) -> str: + """TPR/FPR by ``distance_bin`` from ``view_eval_flat`` rows, with filters pushed into the stats CTE. + + Distance charts used to ``SELECT ... FROM view_tpr_fpr_* WHERE ...`` (nested view over parquet). On some + DuckDB builds that plan can **SIGSEGV** the process (container exit **139**). This query inlines the same + stats aggregation and applies ``WHERE`` on the flat view instead. 
""" - con.execute(query) + order_by = "ORDER BY CAST(REPLACE(SPLIT_PART(distance_bin, ',', 1), '[', ' ') AS INTEGER)" + inner = f""" + WITH stats AS ( + {_TPR_FPR_STATS_SELECT} + FROM {source_eval_flat} + WHERE ({filter_clause}) + GROUP BY + {_TPR_FPR_STATS_GROUP_BY} + )""" + if metrics == "both": + return f""" + {inner} + SELECT + distance_bin, + CASE WHEN SUM(gt_total) > 0 THEN CAST(SUM(tp_gt) AS DOUBLE) / SUM(gt_total) ELSE 0 END AS tpr, + CASE WHEN SUM(est_total) > 0 THEN CAST(SUM(fp_est) AS DOUBLE) / SUM(est_total) ELSE 0 END AS fpr + FROM stats + GROUP BY distance_bin + {order_by} + """ + if metrics == "tpr": + return f""" + {inner} + SELECT distance_bin, + CASE WHEN SUM(gt_total) > 0 THEN CAST(SUM(tp_gt) AS DOUBLE) / SUM(gt_total) ELSE 0 END AS tpr + FROM stats + GROUP BY distance_bin + {order_by} + """ + if metrics == "fpr": + return f""" + {inner} + SELECT distance_bin, + CASE WHEN SUM(est_total) > 0 THEN CAST(SUM(fp_est) AS DOUBLE) / SUM(est_total) ELSE 0 END AS fpr + FROM stats + GROUP BY distance_bin + {order_by} + """ + raise ValueError(f"metrics must be 'both', 'tpr', or 'fpr', got {metrics!r}") + def build_filter_clause(filters: dict,*, enable_dist_h: bool = True) -> str: - """Build WHERE clause from filters.""" + """Build WHERE clause from filters. + + For label / suites / visibility: ``None`` means this dimension is inactive (e.g. no suite column). + An empty list ``[]`` means no restriction on that dimension (same as all options selected). + Using ``if filters.get('label')`` would treat ``[]`` as falsy and accidentally drop the filter, + causing full scans (very slow on large Parquet). 
+ """ conditions = [] if filters.get('topic_name') and filters['topic_name'] != '__all__': conditions.append(f"topic_name = '{filters['topic_name']}'") - if filters.get('label'): - if isinstance(filters['label'], list) and len(filters['label']) > 0: - # Escape single quotes in labels - labels_escaped = [str(l).replace("'", "''") for l in filters['label']] - labels_str = "', '".join(labels_escaped) - conditions.append(f"label IN ('{labels_str}')") - elif not isinstance(filters['label'], list) and filters['label'] != '__all__': - label_escaped = str(filters['label']).replace("'", "''") + lbl = filters.get('label') + if lbl is not None: + if isinstance(lbl, list): + if len(lbl) > 0: + labels_escaped = [str(l).replace("'", "''") for l in lbl] + labels_str = "', '".join(labels_escaped) + conditions.append(f"label IN ('{labels_str}')") + elif not isinstance(lbl, list) and lbl != '__all__': + label_escaped = str(lbl).replace("'", "''") conditions.append(f"label = '{label_escaped}'") - if filters.get('suites'): - if isinstance(filters['suites'], list) and len(filters['suites']) > 0: - suite_escaped = [str(s).replace("'", "''") for s in filters['suites']] - suite_str = "', '".join(suite_escaped) - conditions.append(f"COALESCE(CAST(suite_name AS VARCHAR), '') IN ('{suite_str}')") - elif not isinstance(filters['suites'], list) and filters['suites'] != '__all__': - s_escaped = str(filters['suites']).replace("'", "''") + su = filters.get('suites') + if su is not None: + if isinstance(su, list): + if len(su) > 0: + suite_escaped = [str(s).replace("'", "''") for s in su] + suite_str = "', '".join(suite_escaped) + conditions.append(f"COALESCE(CAST(suite_name AS VARCHAR), '') IN ('{suite_str}')") + elif not isinstance(su, list) and su != '__all__': + s_escaped = str(su).replace("'", "''") conditions.append(f"COALESCE(CAST(suite_name AS VARCHAR), '') = '{s_escaped}'") - if filters.get('visibility'): - if isinstance(filters['visibility'], list) and len(filters['visibility']) > 0: - # 
Escape single quotes in visibility values - vis_escaped = [str(v).replace("'", "''") for v in filters['visibility']] - vis_str = "', '".join(vis_escaped) - conditions.append(f"COALESCE(visibility, 'not available') IN ('{vis_str}')") - elif not isinstance(filters['visibility'], list): - vis_escaped = str(filters['visibility']).replace("'", "''") + vis = filters.get('visibility') + if vis is not None: + if isinstance(vis, list): + if len(vis) > 0: + vis_escaped = [str(v).replace("'", "''") for v in vis] + vis_str = "', '".join(vis_escaped) + conditions.append(f"COALESCE(visibility, 'not available') IN ('{vis_str}')") + elif not isinstance(vis, list): + vis_escaped = str(vis).replace("'", "''") conditions.append(f"COALESCE(visibility, 'not available') = '{vis_escaped}'") if enable_dist_h and filters.get('max_eval_range'): @@ -542,1438 +617,1719 @@ def build_filter_clause(filters: dict,*, enable_dist_h: bool = True) -> str: target_files.append(tf) con = get_duckdb_connection() -for i, (path, lbl) in enumerate(zip(target_files, run_labels_list)): - ok, msg = validate_parquet_file(con, path) - if not ok: - st.sidebar.error(f"**Run ({lbl}) file** cannot be read: {msg}") - st.stop() +fp = _parquet_selection_fingerprint(target_files) +cache_hit = st.session_state.get("_ds_parquet_fp") == fp and "_ds_filter_opts" in st.session_state + +ds_dlog( + "duckdb setup: fp=%s cache_hit=%s n_runs=%s target_files=%s", + fp, + cache_hit, + len(target_files), + [os.path.basename(p) for p in target_files], +) +ds_debug_log_memory("before_duckdb_validate_views") + +with ds_dtimer("duckdb_validate_views_list_values_or_cache", st.session_state): + if not cache_hit: + for i, (path, lbl) in enumerate(zip(target_files, run_labels_list)): + ok, msg = validate_parquet_file(con, path) + if not ok: + st.sidebar.error(f"**Run ({lbl}) file** cannot be read: {msg}") + st.stop() + + # One eval_flat view per run. 
(TPR/FPR layered views are not created: Distance queries inline the same + # stats from eval_flat — nested view + aggregate can segfault DuckDB, exit 139.) + try: + for i, path in enumerate(target_files): + v_flat = "view_eval_flat" if i == 0 else f"view_eval_flat_{i}" + create_view_eval_flat(con, path, v_flat) + except Exception as e: + st.error(f"Error creating views: {e}") + st.stop() -# Create one eval_flat + tpr_fpr view per run (view_eval_flat_1, view_tpr_fpr_1, ...) -try: - for i, path in enumerate(target_files): - v_flat = "view_eval_flat" if i == 0 else f"view_eval_flat_{i}" - v_tpr = "view_tpr_fpr_by_class_dist_topic" if i == 0 else f"view_tpr_fpr_{i}" - create_view_eval_flat(con, path, v_flat) - create_view_tpr_fpr(con, v_tpr, source_eval_flat=v_flat) -except Exception as e: - st.error(f"Error creating views: {e}") - st.stop() + # Filter options from first file (applied to all runs) + target_file = target_files[0] + topics = list_values(con, target_file, "topic_name") + labels = list_values(con, target_file, "label") + try: + suite_options = list_values(con, target_file, "COALESCE(CAST(suite_name AS VARCHAR), '')") + except Exception: + suite_options = [] + vis_options = list_values(con, target_file, "COALESCE(CAST(visibility AS VARCHAR), 'not available') AS visibility") + schema = schema_flags(con, target_file) + st.session_state["_ds_parquet_fp"] = fp + st.session_state["_ds_filter_opts"] = { + "topics": topics, + "labels": labels, + "suite_options": suite_options, + "vis_options": vis_options, + "schema": schema, + } + else: + opts = st.session_state["_ds_filter_opts"] + topics = opts["topics"] + labels = opts["labels"] + suite_options = opts["suite_options"] + vis_options = opts["vis_options"] + schema = opts["schema"] + target_file = target_files[0] + +ds_debug_log_memory("after_duckdb_validate_views") -# Filter options from first file (applied to all runs) -target_file = target_files[0] with st.sidebar: - topics = list_values(con, target_file, 
"topic_name") topic_name = st.selectbox("Topic Name", ["__all__"] + topics, key="topic_name") if topics else "__all__" - labels = list_values(con, target_file, "label") - selected_labels = st.multiselect("Label(s)", labels, default=labels[:5] if labels and len(labels) > 5 else (labels or []), key="labels") - try: - suite_options = list_values(con, target_file, "COALESCE(CAST(suite_name AS VARCHAR), '')") - except Exception: - suite_options = [] - selected_suites = st.multiselect("Suites", suite_options, default=suite_options, key="suites", help="Filter by suite(s). Default: all included.") if suite_options else [] - vis_options = list_values(con, target_file, "COALESCE(CAST(visibility AS VARCHAR), 'not available') AS visibility") - selected_visibility = st.multiselect("Visibility", vis_options, default=vis_options, key="visibility") if vis_options else [] + # Widget keys: avoid generic "labels"/"visibility" (session_state collisions, ambiguous with run_labels). + if "ds_filter_class_labels" not in st.session_state and "labels" in st.session_state: + st.session_state["ds_filter_class_labels"] = st.session_state["labels"] + if "ds_filter_visibility" not in st.session_state and "visibility" in st.session_state: + st.session_state["ds_filter_visibility"] = st.session_state["visibility"] + if labels: + if "ds_filter_class_labels" not in st.session_state: + st.session_state["ds_filter_class_labels"] = list(labels) + selected_labels = st.multiselect( + "Label(s)", + labels, + key="ds_filter_class_labels", + ) + else: + selected_labels = [] + if suite_options: + if "suites" not in st.session_state: + st.session_state["suites"] = list(suite_options) + selected_suites = st.multiselect( + "Suites", + suite_options, + key="suites", + help="Filter by suite(s). 
Default: all included.", + ) + else: + selected_suites = [] + if vis_options: + if "ds_filter_visibility" not in st.session_state: + st.session_state["ds_filter_visibility"] = list(vis_options) + selected_visibility = st.multiselect( + "Visibility", + vis_options, + key="ds_filter_visibility", + ) + else: + selected_visibility = [] max_eval_range = st.selectbox("Max Evaluation Range [m]", [50, 80, 100, 120, 150], index=0, key="max_eval_range") -# Build filters (same values for all runs) +# Build filters (same values for all runs). None = dimension unused (no suite/visibility column in UI). filters_base = { 'topic_name': topic_name, 'label': selected_labels, - 'suites': selected_suites, - 'visibility': selected_visibility, + 'suites': selected_suites if suite_options else None, + 'visibility': selected_visibility if vis_options else None, 'max_eval_range': max_eval_range } filters_list = [filters_base] * len(runs) -# Schema flags for optional columns (confidence, velocity, etc.) -schema = schema_flags(con, target_file) +try: + _fcl_preview = build_filter_clause(filters_base) +except Exception as _e_fcl: + _fcl_preview = f"" +ds_dlog("filters_base keys=%s filter_clause_preview=%s", list(filters_base.keys()), _fcl_preview[:800]) -# Banner while the rest of the page (queries + charts) streams in — cleared at end of script. +# Banner while the rest of the page (queries + charts) streams in — cleared in finally (even on errors). 
_ds_loading_banner = st.empty() _ds_loading_banner.markdown(detection_stats_page_loading_banner_markup(), unsafe_allow_html=True) +try: + ds_dlog("main_content_try_enter") + ds_debug_log_memory("main_content_start") -# ============================= -# Main Content -# ============================= - -# ----------------------------- -# KPI strip (TP, FP, FN, TPR, FPR, Precision, Recall, F1) -# ----------------------------- -def _flat_view(i: int) -> str: - return "view_eval_flat" if i == 0 else f"view_eval_flat_{i}" - -def _kpi_row_for_view(con, view: str, filter_clause: str): - """Return dict with tp_gt, fn, tp_est, fp and derived TPR, FPR, Precision, Recall, F1.""" - q = f""" - SELECT - COUNT(*) FILTER (WHERE source = 'GT' AND status = 'TP') AS tp_gt, - COUNT(*) FILTER (WHERE source = 'GT' AND status = 'FN') AS fn, - COUNT(*) FILTER (WHERE source = 'EST' AND status = 'TP') AS tp_est, - COUNT(*) FILTER (WHERE source = 'EST' AND status = 'FP') AS fp - FROM {view} - WHERE {filter_clause} - """ - row = con.execute(q).fetchone() - if not row: - return None - tp_gt, fn, tp_est, fp = int(row[0]), int(row[1]), int(row[2]), int(row[3]) - gt_total = tp_gt + fn - est_total = tp_est + fp - tpr = (tp_gt / gt_total) if gt_total > 0 else None - fpr = (fp / est_total) if est_total > 0 else None - precision = (tp_est / est_total) if est_total > 0 else None - recall = tpr - if precision is not None and recall is not None and (precision + recall) > 0: - f1 = 2 * precision * recall / (precision + recall) - else: - f1 = None - return { - "tp": tp_gt, "fp": fp, "fn": fn, - "tpr": tpr, "fpr": fpr, "precision": precision, "recall": recall, "f1": f1, - } - -# ============================= -# Panel 1: t4dataset Summary -# ============================= -st.markdown(section_header_html("Summary", "Within selected filters and max evaluation range."), unsafe_allow_html=True) -if single_mode: - with ds_spot_loading("Summary · KPI metrics"): - fc = build_filter_clause(filters_base) - kpi = 
_kpi_row_for_view(con, "view_eval_flat", fc) - inject_detection_stats_kpi_styles() - if kpi: - html = '
' + render_kpi_card("Metrics (within filters & max range)", kpi) + "
" - st.markdown(html, unsafe_allow_html=True) + # ============================= + # Main Content + # ============================= + + # ----------------------------- + # KPI strip (TP, FP, FN, TPR, FPR, Precision, Recall, F1) + # ----------------------------- + def _flat_view(i: int) -> str: + return "view_eval_flat" if i == 0 else f"view_eval_flat_{i}" + + def _kpi_row_for_view(con, view: str, filter_clause: str): + """Return dict with tp_gt, fn, tp_est, fp and derived TPR, FPR, Precision, Recall, F1.""" + q = f""" + SELECT + COUNT(*) FILTER (WHERE source = 'GT' AND status = 'TP') AS tp_gt, + COUNT(*) FILTER (WHERE source = 'GT' AND status = 'FN') AS fn, + COUNT(*) FILTER (WHERE source = 'EST' AND status = 'TP') AS tp_est, + COUNT(*) FILTER (WHERE source = 'EST' AND status = 'FP') AS fp + FROM {view} + WHERE {filter_clause} + """ + row = con.execute(q).fetchone() + if not row: + return None + tp_gt, fn, tp_est, fp = int(row[0]), int(row[1]), int(row[2]), int(row[3]) + gt_total = tp_gt + fn + est_total = tp_est + fp + tpr = (tp_gt / gt_total) if gt_total > 0 else None + fpr = (fp / est_total) if est_total > 0 else None + precision = (tp_est / est_total) if est_total > 0 else None + recall = tpr + if precision is not None and recall is not None and (precision + recall) > 0: + f1 = 2 * precision * recall / (precision + recall) + else: + f1 = None + return { + "tp": tp_gt, "fp": fp, "fn": fn, + "tpr": tpr, "fpr": fpr, "precision": precision, "recall": recall, "f1": f1, + } + + # ============================= + # Panel 1: t4dataset Summary + # ============================= + ds_dlog("section: Panel1_Summary_start") + st.markdown(section_header_html("Summary", "Within selected filters and max evaluation range."), unsafe_allow_html=True) + if single_mode: + with ds_spot_loading("Summary · KPI metrics"): + fc = build_filter_clause(filters_base) + kpi = _kpi_row_for_view(con, "view_eval_flat", fc) + inject_detection_stats_kpi_styles() + if kpi: + html = '
' + render_kpi_card("Metrics (within filters & max range)", kpi) + "
" + st.markdown(html, unsafe_allow_html=True) + else: + st.caption("No KPI data.") else: - st.caption("No KPI data.") -else: - with ds_spot_loading("Summary · KPI metrics"): - kpis = [] - for i in range(len(runs)): - fc = build_filter_clause(filters_list[i]) - kpi = _kpi_row_for_view(con, _flat_view(i), fc) - kpis.append((run_labels_list[i], kpi)) - inject_detection_stats_kpi_styles() - baseline = kpis[0][1] if kpis else None - cards_html_parts = [] - for lbl, kpi in kpis: - deltas = None - if baseline and kpi and lbl != run_labels_list[0]: - deltas = { - "tp": kpi["tp"] - baseline["tp"], - "fp": kpi["fp"] - baseline["fp"], - "fn": kpi["fn"] - baseline["fn"], - "tpr": (kpi["tpr"] - baseline["tpr"]) if (kpi.get("tpr") is not None and baseline.get("tpr") is not None) else None, - "fpr": (kpi["fpr"] - baseline["fpr"]) if (kpi.get("fpr") is not None and baseline.get("fpr") is not None) else None, - "precision": (kpi["precision"] - baseline["precision"]) if (kpi.get("precision") is not None and baseline.get("precision") is not None) else None, - "recall": (kpi["recall"] - baseline["recall"]) if (kpi.get("recall") is not None and baseline.get("recall") is not None) else None, - "f1": (kpi["f1"] - baseline["f1"]) if (kpi.get("f1") is not None and baseline.get("f1") is not None) else None, - } - cards_html_parts.append(render_kpi_card(f"Run {lbl}", kpi or {}, f"kpi-run-{lbl}", deltas=deltas)) - st.markdown('
' + "".join(cards_html_parts) + "
", unsafe_allow_html=True) - -if st.checkbox("Debug: Inspect Parquet (All Runs)" if not single_mode else "Debug: Inspect Parquet"): - cols_used = st.columns(len(target_files)) - file_labels = [(f"Run ({run_labels_list[i]}) File", target_files[i]) for i in range(len(target_files))] - schema_results = [] - for col, (label, file_path) in zip(cols_used, file_labels): - with col: - st.markdown(f"### {label}") - # Schema - schema_df = con.execute(""" - DESCRIBE SELECT * FROM read_parquet(?) - """, [file_path]).df() - schema_results.append((label, schema_df)) - st.write("**Schema (Column Names, Types)**") - st.markdown("Shows the schema (column names and their DuckDB/Parquet data types) of the selected Parquet file. Useful to check data structure and types as interpreted by DuckDB.") - st.dataframe(schema_df, width='stretch', hide_index=True) - - # Preview rows - row_options = [10, 20, 50, 100, 200, "All"] - preview_key = f"preview_row_limit_{label.replace(' ', '_').lower()}" - row_choice = st.selectbox(f"Preview rows to show ({label})", row_options, index=1, key=preview_key) - if row_choice == "All": - limit_clause = "" - else: - limit_clause = f"LIMIT {row_choice}" - preview_df = con.execute(f""" - SELECT * - FROM read_parquet(?) - {limit_clause} - """, [file_path]).df() - st.write(f"**Preview (First {row_choice} rows)**") - st.markdown(f"Shows the first {row_choice} preview rows from the Parquet file. Use this preview to examine example data contents and check that your file is as expected.") - st.dataframe(preview_df, width='stretch', hide_index=True) - - # Stats - stats_df = con.execute(""" - SELECT - COUNT(*) AS total_rows, - COUNT(t4dataset_id) AS non_null_ids, - COUNT(DISTINCT t4dataset_id) AS distinct_ids - FROM read_parquet(?) 
- """, [file_path]).df() - st.write("**Stats (Row Count, t4dataset_id non-null count, Distinct t4dataset_id count)**") - st.markdown(""" - - `total_rows`: Total rows in the file - - `non_null_ids`: Rows where t4dataset_id is not null - - `distinct_ids`: Unique t4dataset_id values - - This helps rapidly assess the completeness and distribution of the key ID field. - """) - st.dataframe(stats_df, width='stretch', hide_index=True) - - # --- Show info about schema differences (compare mode only) --- - if not single_mode and len(schema_results) >= 2: - with st.expander("⚖️ Difference between schemas", expanded=(len(schema_results) == 2)): - if len(schema_results) == 2: - label1, df1 = schema_results[0] - label2, df2 = schema_results[1] - names1 = set(df1["column_name"]) - names2 = set(df2["column_name"]) - added, removed = names2 - names1, names1 - names2 - common = names1 & names2 - types1 = {row["column_name"]: row["column_type"] for _, row in df1.iterrows()} - types2 = {row["column_name"]: row["column_type"] for _, row in df2.iterrows()} - dtype_changes = [(c, types1.get(c), types2.get(c)) for c in sorted(common) if types1.get(c) != types2.get(c)] - if not (added or removed or dtype_changes): - st.success("✅ The schemas are identical (column names and types match exactly).") + with ds_spot_loading("Summary · KPI metrics"): + kpis = [] + for i in range(len(runs)): + fc = build_filter_clause(filters_list[i]) + kpi = _kpi_row_for_view(con, _flat_view(i), fc) + kpis.append((run_labels_list[i], kpi)) + inject_detection_stats_kpi_styles() + baseline = kpis[0][1] if kpis else None + cards_html_parts = [] + for lbl, kpi in kpis: + deltas = None + if baseline and kpi and lbl != run_labels_list[0]: + deltas = { + "tp": kpi["tp"] - baseline["tp"], + "fp": kpi["fp"] - baseline["fp"], + "fn": kpi["fn"] - baseline["fn"], + "tpr": (kpi["tpr"] - baseline["tpr"]) if (kpi.get("tpr") is not None and baseline.get("tpr") is not None) else None, + "fpr": (kpi["fpr"] - baseline["fpr"]) if 
(kpi.get("fpr") is not None and baseline.get("fpr") is not None) else None, + "precision": (kpi["precision"] - baseline["precision"]) if (kpi.get("precision") is not None and baseline.get("precision") is not None) else None, + "recall": (kpi["recall"] - baseline["recall"]) if (kpi.get("recall") is not None and baseline.get("recall") is not None) else None, + "f1": (kpi["f1"] - baseline["f1"]) if (kpi.get("f1") is not None and baseline.get("f1") is not None) else None, + } + cards_html_parts.append(render_kpi_card(f"Run {lbl}", kpi or {}, f"kpi-run-{lbl}", deltas=deltas)) + st.markdown('
' + "".join(cards_html_parts) + "
", unsafe_allow_html=True) + + if st.checkbox("Debug: Inspect Parquet (All Runs)" if not single_mode else "Debug: Inspect Parquet"): + cols_used = st.columns(len(target_files)) + file_labels = [(f"Run ({run_labels_list[i]}) File", target_files[i]) for i in range(len(target_files))] + schema_results = [] + for col, (label, file_path) in zip(cols_used, file_labels): + with col: + st.markdown(f"### {label}") + # Schema + schema_df = con.execute(""" + DESCRIBE SELECT * FROM read_parquet(?) + """, [file_path]).df() + schema_results.append((label, schema_df)) + st.write("**Schema (Column Names, Types)**") + st.markdown("Shows the schema (column names and their DuckDB/Parquet data types) of the selected Parquet file. Useful to check data structure and types as interpreted by DuckDB.") + st.dataframe(schema_df, width='stretch', hide_index=True) + + # Preview rows + row_options = [10, 20, 50, 100, 200, "All"] + preview_key = f"preview_row_limit_{label.replace(' ', '_').lower()}" + row_choice = st.selectbox(f"Preview rows to show ({label})", row_options, index=1, key=preview_key) + if row_choice == "All": + limit_clause = "" + else: + limit_clause = f"LIMIT {row_choice}" + preview_df = con.execute(f""" + SELECT * + FROM read_parquet(?) + {limit_clause} + """, [file_path]).df() + st.write(f"**Preview (First {row_choice} rows)**") + st.markdown(f"Shows the first {row_choice} preview rows from the Parquet file. Use this preview to examine example data contents and check that your file is as expected.") + st.dataframe(preview_df, width='stretch', hide_index=True) + + # Stats + stats_df = con.execute(""" + SELECT + COUNT(*) AS total_rows, + COUNT(t4dataset_id) AS non_null_ids, + COUNT(DISTINCT t4dataset_id) AS distinct_ids + FROM read_parquet(?) 
+ """, [file_path]).df() + st.write("**Stats (Row Count, t4dataset_id non-null count, Distinct t4dataset_id count)**") + st.markdown(""" + - `total_rows`: Total rows in the file + - `non_null_ids`: Rows where t4dataset_id is not null + - `distinct_ids`: Unique t4dataset_id values + + This helps rapidly assess the completeness and distribution of the key ID field. + """) + st.dataframe(stats_df, width='stretch', hide_index=True) + + # --- Show info about schema differences (compare mode only) --- + if not single_mode and len(schema_results) >= 2: + with st.expander("⚖️ Difference between schemas", expanded=(len(schema_results) == 2)): + if len(schema_results) == 2: + label1, df1 = schema_results[0] + label2, df2 = schema_results[1] + names1 = set(df1["column_name"]) + names2 = set(df2["column_name"]) + added, removed = names2 - names1, names1 - names2 + common = names1 & names2 + types1 = {row["column_name"]: row["column_type"] for _, row in df1.iterrows()} + types2 = {row["column_name"]: row["column_type"] for _, row in df2.iterrows()} + dtype_changes = [(c, types1.get(c), types2.get(c)) for c in sorted(common) if types1.get(c) != types2.get(c)] + if not (added or removed or dtype_changes): + st.success("✅ The schemas are identical (column names and types match exactly).") + else: + if added: + st.error(f"Columns only in `{label2}`: {', '.join(sorted(added))}") + if removed: + st.error(f"Columns only in `{label1}`: {', '.join(sorted(removed))}") + if dtype_changes: + st.warning("Columns with different types:") + st.dataframe(pd.DataFrame(dtype_changes, columns=["Column", f"Type in {label1}", f"Type in {label2}"]), width='stretch', hide_index=True) else: - if added: - st.error(f"Columns only in `{label2}`: {', '.join(sorted(added))}") - if removed: - st.error(f"Columns only in `{label1}`: {', '.join(sorted(removed))}") - if dtype_changes: - st.warning("Columns with different types:") - st.dataframe(pd.DataFrame(dtype_changes, columns=["Column", f"Type in {label1}", 
f"Type in {label2}"]), width='stretch', hide_index=True) + st.info(f"{len(schema_results)} runs loaded. Compare schemas per run in the columns above.") + + + + ds_dlog("section: Dataset_summary_status_distribution_try") + try: + with ds_spot_loading("Dataset summary & status distribution"): + if single_mode: + query_base = f""" + SELECT COUNT(DISTINCT t4dataset_id) AS id_num, '{os.path.basename(target_file)}' AS series + FROM view_eval_flat + """ + df_summary = con.execute(query_base).df() + query_status = """ + SELECT label, status, COUNT(*) AS num + FROM view_eval_flat + GROUP BY label, status + ORDER BY label, status + """ + df_status = con.execute(query_status).df() else: - st.info(f"{len(schema_results)} runs loaded. Compare schemas per run in the columns above.") - - - -try: - with ds_spot_loading("Dataset summary & status distribution"): + parts = [f"SELECT COUNT(DISTINCT t4dataset_id) AS id_num, '{run_labels_list[i]}' AS series FROM {_flat_view(i)}" for i in range(len(runs))] + query_base = " UNION ALL ".join(parts) + df_summary = con.execute(query_base).df() + parts_status = [f"SELECT '{run_labels_list[i]}' AS dataset, label, status, COUNT(*) AS num FROM {_flat_view(i)} GROUP BY label, status" for i in range(len(runs))] + query_status = " UNION ALL ".join(parts_status) + " ORDER BY dataset, label, status" + df_status = con.execute(query_status).df() + if single_mode: - query_base = f""" - SELECT COUNT(DISTINCT t4dataset_id) AS id_num, '{os.path.basename(target_file)}' AS series - FROM view_eval_flat - """ - df_summary = con.execute(query_base).df() - query_status = """ - SELECT label, status, COUNT(*) AS num - FROM view_eval_flat - GROUP BY label, status - ORDER BY label, status - """ - df_status = con.execute(query_status).df() - else: - parts = [f"SELECT COUNT(DISTINCT t4dataset_id) AS id_num, '{run_labels_list[i]}' AS series FROM {_flat_view(i)}" for i in range(len(runs))] - query_base = " UNION ALL ".join(parts) - df_summary = 
con.execute(query_base).df() - parts_status = [f"SELECT '{run_labels_list[i]}' AS dataset, label, status, COUNT(*) AS num FROM {_flat_view(i)} GROUP BY label, status" for i in range(len(runs))] - query_status = " UNION ALL ".join(parts_status) + " ORDER BY dataset, label, status" - df_status = con.execute(query_status).df() - - if single_mode: - if not df_status.empty: - if st.checkbox("Debug: Inspect Status Count (All Runs)" if not single_mode else "Debug: Inspect Status Count"): - df_status_wide = df_status.pivot_table(index='label', columns='status', values='num', fill_value=0).reset_index() - st.download_button("Download status count (CSV)", data=df_status_wide.to_csv(index=False).encode("utf-8"), file_name="detection_status_count.csv", mime="text/csv", key="dl_status_count") - st.dataframe(df_status_wide, width='stretch', hide_index=True) - status_viz = st.radio( - "Status chart style", - options=["Stacked bar (counts)", "Treemap", "100% stacked (proportions)", "Spider chart (TP, FP & FN)"], - index=0, - horizontal=True, - key="status_dist_viz", - ) - n_labels = df_status["label"].nunique() - use_horizontal = n_labels > 6 - if status_viz == "Stacked bar (counts)": - if use_horizontal: - fig2 = px.bar( - df_status, - y="label", - x="num", - color="status", - barmode="stack", - title="Status Distribution per Label", - labels={"num": "Count", "label": "Label", "status": "Status"}, - color_discrete_map=STATUS_COLORS, - orientation="h", - ) - else: - fig2 = px.bar( - df_status, - x="label", - y="num", - color="status", - barmode="stack", - title="Status Distribution per Label", - labels={"num": "Count", "label": "Label", "status": "Status"}, - color_discrete_map=STATUS_COLORS, - ) - apply_chart_theme(fig2) - st.plotly_chart(fig2, width='stretch') - elif status_viz == "Treemap": - fig2 = px.treemap( - df_status, - path=["label", "status"], - values="num", - color="status", - color_discrete_map=STATUS_COLORS, - title="Status Distribution per Label (area = count)", - 
) - fig2.update_traces( - textinfo="label+value+percent parent", - hovertemplate="%{label}
Count: %{value}", + if not df_status.empty: + if st.checkbox("Debug: Inspect Status Count (All Runs)" if not single_mode else "Debug: Inspect Status Count"): + df_status_wide = df_status.pivot_table(index='label', columns='status', values='num', fill_value=0).reset_index() + st.download_button("Download status count (CSV)", data=df_status_wide.to_csv(index=False).encode("utf-8"), file_name="detection_status_count.csv", mime="text/csv", key="dl_status_count") + st.dataframe(df_status_wide, width='stretch', hide_index=True) + status_viz = st.radio( + "Status chart style", + options=["Stacked bar (counts)", "Treemap", "100% stacked (proportions)", "Spider chart (TP, FP & FN)"], + index=0, + horizontal=True, + key="status_dist_viz", ) - apply_chart_theme(fig2, height=420) - st.plotly_chart(fig2, width='stretch') - elif status_viz == "Spider chart (TP, FP & FN)": - wide = df_status.pivot_table(index="label", columns="status", values="num", fill_value=0) - cats = sorted(wide.index.astype(str).unique()) - if len(cats) > 16: - st.caption("Spider charts work best with ≤16 labels; many classes may look crowded.") - run_single = [os.path.basename(target_file) if target_file else "Run"] - rcols = st.columns(3) - for col_i, st_name in enumerate(["TP", "FP", "FN"]): - vals = wide[st_name] if st_name in wide.columns else pd.Series(0, index=wide.index) - df_m = pd.DataFrame({"label": wide.index.astype(str), "count": vals.values}) - df_m["run"] = run_single[0] - fig_r = _count_spider_compare( - df_m, - cats, - f"{st_name} count per label", - run_single, - f"{st_name} count", - ) - with rcols[col_i]: - st.plotly_chart(fig_r, width='stretch') - else: - # 100% stacked: proportion per label - wide = df_status.pivot_table(index="label", columns="status", values="num", fill_value=0) - wide_pct = wide.div(wide.sum(axis=1), axis=0) - df_pct = wide_pct.reset_index().melt(id_vars="label", var_name="status", value_name="pct") - df_pct = df_pct[df_pct["pct"] > 0] - if not df_pct.empty: + 
n_labels = df_status["label"].nunique() + use_horizontal = n_labels > 6 + if status_viz == "Stacked bar (counts)": if use_horizontal: fig2 = px.bar( - df_pct, + df_status, y="label", - x="pct", + x="num", color="status", barmode="stack", - title="Status proportion per Label (100% stacked)", - labels={"pct": "Proportion", "label": "Label", "status": "Status"}, + title="Status Distribution per Label", + labels={"num": "Count", "label": "Label", "status": "Status"}, color_discrete_map=STATUS_COLORS, orientation="h", ) else: fig2 = px.bar( - df_pct, + df_status, x="label", - y="pct", + y="num", color="status", barmode="stack", - title="Status proportion per Label (100% stacked)", - labels={"pct": "Proportion", "label": "Label", "status": "Status"}, + title="Status Distribution per Label", + labels={"num": "Count", "label": "Label", "status": "Status"}, color_discrete_map=STATUS_COLORS, ) apply_chart_theme(fig2) - if use_horizontal: - fig2.update_layout(xaxis_tickformat=".0%", xaxis_range=[0, 1]) - else: - fig2.update_layout(yaxis_tickformat=".0%", yaxis_range=[0, 1]) st.plotly_chart(fig2, width='stretch') - else: - st.info("No data for proportions.") - else: - st.info("No status count data available") - else: - if not df_status.empty: - if st.checkbox("Debug: Inspect Status Count (All Runs)" if not single_mode else "Debug: Inspect Status Count"): - df_status_wide = df_status.pivot_table(index='label', columns=['dataset', 'status'], values='num', fill_value=0) - df_status_wide.columns = [f"{col[0]} {col[1]}" for col in df_status_wide.columns] - df_status_wide = df_status_wide.reset_index() - st.dataframe(df_status_wide, width='stretch', hide_index=True) - status_viz = st.radio( - "Status chart style", - options=["Stacked bar (counts)", "Treemap", "100% stacked (proportions)", "Spider chart (TP, FP & FN)"], - index=0, - horizontal=True, - key="status_dist_viz_compare", - ) - if status_viz == "Stacked bar (counts)": - fig2 = px.bar( - df_status, - x="label", - y="num", - 
color="status", - barmode="stack", - facet_col="dataset", - title="Status Distribution per Label (by Run)", - category_orders={"dataset": run_labels_list}, - labels={"num": "Count", "label": "Label", "status": "Status"}, - color_discrete_map=STATUS_COLORS, - ) - apply_chart_theme(fig2) - st.plotly_chart(fig2, width='stretch') - elif status_viz == "Spider chart (TP, FP & FN)": - # Same counts as stacked bar: one spider per status (TP / FP / FN), axes = labels, r = count - status_wide = df_status.pivot_table( - index=["dataset", "label"], columns="status", values="num", fill_value=0 - ).reset_index() - cats = sorted(df_status["label"].astype(str).unique()) - if len(cats) > 16: - st.caption("Spider charts work best with ≤16 labels; many classes may look crowded.") - rcols = st.columns(3) - for col_i, st_name in enumerate(["TP", "FP", "FN"]): - col_data = ( - status_wide[st_name] - if st_name in status_wide.columns - else pd.Series(0, index=status_wide.index) - ) - df_m = pd.DataFrame( - { - "run": status_wide["dataset"].astype(str), - "label": status_wide["label"].astype(str), - "count": col_data.values, - } + elif status_viz == "Treemap": + fig2 = px.treemap( + df_status, + path=["label", "status"], + values="num", + color="status", + color_discrete_map=STATUS_COLORS, + title="Status Distribution per Label (area = count)", ) - fig_r = _count_spider_compare( - df_m, - cats, - f"{st_name} count per label (by run)", - run_labels_list, - f"{st_name} count", + fig2.update_traces( + textinfo="label+value+percent parent", + hovertemplate="%{label}
Count: %{value}", ) - with rcols[col_i]: - st.plotly_chart(fig_r, width='stretch') - elif status_viz == "Treemap": - n_runs = len(run_labels_list) - cols = st.columns(min(n_runs, 3)) - for idx, lbl in enumerate(run_labels_list): - df_r = df_status[df_status["dataset"] == lbl] - if not df_r.empty: - fig_t = px.treemap( - df_r, - path=["label", "status"], - values="num", - color="status", - color_discrete_map=STATUS_COLORS, - title=f"{lbl}", - ) - fig_t.update_traces( - textinfo="label+value+percent parent", - hovertemplate="%{label}
Count: %{value}", + apply_chart_theme(fig2, height=420) + st.plotly_chart(fig2, width='stretch') + elif status_viz == "Spider chart (TP, FP & FN)": + wide = df_status.pivot_table(index="label", columns="status", values="num", fill_value=0) + cats = sorted(wide.index.astype(str).unique()) + if len(cats) > 16: + st.caption("Spider charts work best with ≤16 labels; many classes may look crowded.") + run_single = [os.path.basename(target_file) if target_file else "Run"] + rcols = st.columns(3) + for col_i, st_name in enumerate(["TP", "FP", "FN"]): + vals = wide[st_name] if st_name in wide.columns else pd.Series(0, index=wide.index) + df_m = pd.DataFrame({"label": wide.index.astype(str), "count": vals.values}) + df_m["run"] = run_single[0] + fig_r = _count_spider_compare( + df_m, + cats, + f"{st_name} count per label", + run_single, + f"{st_name} count", ) - apply_chart_theme(fig_t, height=360) - with cols[idx % len(cols)]: - st.plotly_chart(fig_t, width='stretch') - else: - # 100% stacked per run (facet) - df_pct_list = [] - for lbl in run_labels_list: - df_r = df_status[df_status["dataset"] == lbl] - wide = df_r.pivot_table(index="label", columns="status", values="num", fill_value=0) - if wide.empty: - continue + with rcols[col_i]: + st.plotly_chart(fig_r, width='stretch') + else: + # 100% stacked: proportion per label + wide = df_status.pivot_table(index="label", columns="status", values="num", fill_value=0) wide_pct = wide.div(wide.sum(axis=1), axis=0) - wide_pct["dataset"] = lbl - wide_pct = wide_pct.reset_index() - df_pct_list.append(wide_pct) - if df_pct_list: - wide_all = pd.concat(df_pct_list, ignore_index=True) - df_pct_melt = wide_all.melt( - id_vars=["label", "dataset"], - value_vars=[c for c in wide_all.columns if c not in ("label", "dataset")], - var_name="status", - value_name="pct", - ) - df_pct_melt = df_pct_melt[df_pct_melt["pct"] > 0] - if not df_pct_melt.empty: - fig2 = px.bar( - df_pct_melt, - x="label", - y="pct", - color="status", - 
barmode="stack", - facet_col="dataset", - category_orders={"dataset": run_labels_list}, - title="Status proportion per Label (100% stacked, by Run)", - labels={"pct": "Proportion", "label": "Label", "status": "Status"}, - color_discrete_map=STATUS_COLORS, - ) + df_pct = wide_pct.reset_index().melt(id_vars="label", var_name="status", value_name="pct") + df_pct = df_pct[df_pct["pct"] > 0] + if not df_pct.empty: + if use_horizontal: + fig2 = px.bar( + df_pct, + y="label", + x="pct", + color="status", + barmode="stack", + title="Status proportion per Label (100% stacked)", + labels={"pct": "Proportion", "label": "Label", "status": "Status"}, + color_discrete_map=STATUS_COLORS, + orientation="h", + ) + else: + fig2 = px.bar( + df_pct, + x="label", + y="pct", + color="status", + barmode="stack", + title="Status proportion per Label (100% stacked)", + labels={"pct": "Proportion", "label": "Label", "status": "Status"}, + color_discrete_map=STATUS_COLORS, + ) apply_chart_theme(fig2) - fig2.update_layout( - yaxis_tickformat=".0%", - yaxis_range=[0, 1], - ) - for ann in fig2.layout.annotations: - ann.text = ann.text.split("=")[-1] + if use_horizontal: + fig2.update_layout(xaxis_tickformat=".0%", xaxis_range=[0, 1]) + else: + fig2.update_layout(yaxis_tickformat=".0%", yaxis_range=[0, 1]) st.plotly_chart(fig2, width='stretch') else: st.info("No data for proportions.") - else: - st.info("No data for proportions.") - else: - st.info("No status count data available") - -except Exception as e: - st.error(f"Error in summary: {e}") - - - -def _tpr_fpr_view(i: int) -> str: - return "view_tpr_fpr_by_class_dist_topic" if i == 0 else f"view_tpr_fpr_{i}" - - -def _distance_bin_order_and_label(bin_str: str) -> Tuple[int, str]: - """Parse distance_bin e.g. '[0,10)' -> (0, '0–10 m'). 
Used for sorting and axis labels.""" - import re - s = str(bin_str).strip() - m = re.match(r"\[(\d+)\s*,\s*(\d+)\)", s) - if m: - lo, hi = int(m.group(1)), int(m.group(2)) - return (lo, f"{lo}–{hi} m") - m = re.match(r"\[(\d+)\s*,\s*inf\)", s, re.I) - if m: - return (int(m.group(1)), f"{m.group(1)}+ m") - return (0, s) - - -# Same 10 m bins as view_tpr_fpr / eval_flat (used for object-count alignment) -_DIST_BIN_CASE = """CASE - WHEN dist_h >= 0 AND dist_h < 10 THEN '[0,10)' - WHEN dist_h >= 10 AND dist_h < 20 THEN '[10,20)' - WHEN dist_h >= 20 AND dist_h < 30 THEN '[20,30)' - WHEN dist_h >= 30 AND dist_h < 40 THEN '[30,40)' - WHEN dist_h >= 40 AND dist_h < 50 THEN '[40,50)' - WHEN dist_h >= 50 AND dist_h < 60 THEN '[50,60)' - WHEN dist_h >= 60 AND dist_h < 70 THEN '[60,70)' - WHEN dist_h >= 70 AND dist_h < 80 THEN '[70,80)' - WHEN dist_h >= 80 AND dist_h < 90 THEN '[80,90)' - WHEN dist_h >= 90 AND dist_h < 100 THEN '[90,100)' - WHEN dist_h >= 100 AND dist_h < 110 THEN '[100,110)' - WHEN dist_h >= 110 AND dist_h < 120 THEN '[110,120)' - WHEN dist_h >= 120 AND dist_h < 130 THEN '[120,130)' - WHEN dist_h >= 130 AND dist_h < 140 THEN '[130,140)' - WHEN dist_h >= 140 AND dist_h < 150 THEN '[140,150)' - WHEN dist_h >= 150 THEN '[150,inf)' - ELSE '[unknown]' END""" - - -# ============================= -# Panel 3–5: Distance — TP/FP rates by bin + object count vs range -# ============================= -st.divider() -st.markdown( - section_header_html( - "Distance: TP/FP rates & object count", - "Same distance bins and chart style (line or bar) for rates and object counts; x-axis order matches across charts.", - ), - unsafe_allow_html=True, -) -rate_by_dist_style = st.radio( - "Chart style", - options=["Line chart (trend)", "Bar chart (histogram)"], - index=0, - horizontal=True, - key="tp_fp_rate_by_dist_style", -) - -filter_clause_base = build_filter_clause(filters_base, enable_dist_h=False) -_dist_slot = st.empty() -_dist_slot.markdown(ds_spot_loading_markup("Distance · 
TP/FP rates & object counts"), unsafe_allow_html=True) -try: - use_line_chart = rate_by_dist_style == "Line chart (trend)" - rate_bin_labels_order: Optional[List[str]] = None - - if single_mode: - # Fetch both TP and FP rate by distance - query_both = f""" - SELECT - distance_bin, - CASE WHEN SUM(gt_total) > 0 THEN CAST(SUM(tp_gt) AS DOUBLE) / SUM(gt_total) ELSE 0 END AS tpr, - CASE WHEN SUM(est_total) > 0 THEN CAST(SUM(fp_est) AS DOUBLE) / SUM(est_total) ELSE 0 END AS fpr - FROM view_tpr_fpr_by_class_dist_topic - WHERE {filter_clause_base} - GROUP BY distance_bin - ORDER BY CAST(REPLACE(SPLIT_PART(distance_bin, ',', 1), '[', ' ') AS INTEGER) - """ - df_both = con.execute(query_both).df() - if not df_both.empty: - df_both["bin_order"], df_both["bin_label"] = zip( - *df_both["distance_bin"].map(_distance_bin_order_and_label) - ) - df_both = df_both.sort_values("bin_order") - x_labels = df_both["bin_label"].tolist() - rate_bin_labels_order = x_labels - - if use_line_chart: - fig = go.Figure() - fig.add_trace( - go.Scatter( - x=x_labels, - y=df_both["tpr"], - name="TP rate", - mode="lines", - line=dict(color=RUN_COLORS[0], width=2.5, shape="spline"), - fill="tozeroy", - fillcolor="rgba(74, 144, 217, 0.2)", - hovertemplate="%{x}
TP rate: %{y:.2%}", - ) - ) - fig.add_trace( - go.Scatter( - x=x_labels, - y=df_both["fpr"], - name="FP rate", - mode="lines", - line=dict(color=RUN_COLORS[1], width=2.5, shape="spline"), - fill="tozeroy", - fillcolor="rgba(232, 106, 51, 0.2)", - hovertemplate="%{x}
FP rate: %{y:.2%}", - ) - ) - apply_chart_theme(fig, height=420) - fig.update_layout( - title=f"TP & FP rate by distance (within {max_eval_range} m)", - xaxis_title="Distance bin", - yaxis_title="Rate", - yaxis_range=[0, 1], - xaxis=dict( - tickangle=-35, - categoryorder="array", - categoryarray=x_labels, - ), - hovermode="x unified", - ) - fig.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") - st.plotly_chart(fig, width='stretch') else: - # Bar chart (histogram): combined TP + FP grouped bars - fig = go.Figure() - fig.add_trace( - go.Bar( - x=x_labels, - y=df_both["tpr"], - name="TP rate", - marker_color=RUN_COLORS[0], - hovertemplate="%{x}
TP rate: %{y:.2%}", - ) - ) - fig.add_trace( - go.Bar( - x=x_labels, - y=df_both["fpr"], - name="FP rate", - marker_color=RUN_COLORS[1], - hovertemplate="%{x}
FP rate: %{y:.2%}", - ) - ) - apply_chart_theme(fig, height=420) - fig.update_layout( - title=f"TP & FP rate by distance (within {max_eval_range} m)", - xaxis_title="Distance bin", - yaxis_title="Rate", - yaxis_range=[0, 1], - barmode="group", - xaxis=dict( - tickangle=-35, - categoryorder="array", - categoryarray=x_labels, - ), - hovermode="x unified", - ) - fig.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") - st.plotly_chart(fig, width='stretch') + st.info("No status count data available") else: - st.info("No distance-bin data available.") - else: - # Compare mode: fetch TP and FP by distance per run - dfs_tpr = [] - for i in range(len(runs)): - fc = build_filter_clause(filters_list[i], enable_dist_h=False) - q = f""" - SELECT distance_bin, - CASE WHEN SUM(gt_total) > 0 THEN CAST(SUM(tp_gt) AS DOUBLE) / SUM(gt_total) ELSE 0 END AS tpr - FROM {_tpr_fpr_view(i)} - WHERE {fc} - GROUP BY distance_bin - ORDER BY CAST(REPLACE(SPLIT_PART(distance_bin, ',', 1), '[', ' ') AS INTEGER) - """ - df_i = con.execute(q).df() - df_i["run"] = run_labels_list[i] - df_i["bin_order"], df_i["bin_label"] = zip(*df_i["distance_bin"].map(_distance_bin_order_and_label)) - df_i = df_i.sort_values("bin_order") - dfs_tpr.append(df_i) - df_tpr_dist = pd.concat(dfs_tpr, ignore_index=True) - - dfs_fpr = [] - for i in range(len(runs)): - fc = build_filter_clause(filters_list[i], enable_dist_h=False) - q = f""" - SELECT distance_bin, - CASE WHEN SUM(est_total) > 0 THEN CAST(SUM(fp_est) AS DOUBLE) / SUM(est_total) ELSE 0 END AS fpr - FROM {_tpr_fpr_view(i)} - WHERE {fc} - GROUP BY distance_bin - ORDER BY CAST(REPLACE(SPLIT_PART(distance_bin, ',', 1), '[', ' ') AS INTEGER) - """ - df_i = con.execute(q).df() - df_i["run"] = run_labels_list[i] - df_i["bin_order"], df_i["bin_label"] = zip(*df_i["distance_bin"].map(_distance_bin_order_and_label)) - df_i = df_i.sort_values("bin_order") - dfs_fpr.append(df_i) - df_fpr_dist = pd.concat(dfs_fpr, ignore_index=True) - - if not 
df_tpr_dist.empty: - rate_bin_labels_order = ( - df_tpr_dist[df_tpr_dist["run"] == run_labels_list[0]] - .sort_values("bin_order")["bin_label"] - .tolist() - ) - _xaxis_dist_bins = ( - dict(tickangle=-35, categoryorder="array", categoryarray=rate_bin_labels_order) - if rate_bin_labels_order - else dict(tickangle=-35) - ) - - if use_line_chart: - if not df_tpr_dist.empty: - fig_tpr = go.Figure() - for i, lbl in enumerate(run_labels_list): - d = df_tpr_dist[df_tpr_dist["run"] == lbl].sort_values("bin_order") - c = RUN_COLORS[i % len(RUN_COLORS)] - r, g, b = int(c[1:3], 16), int(c[3:5], 16), int(c[5:7], 16) - fig_tpr.add_trace( - go.Scatter( - x=d["bin_label"], - y=d["tpr"], - name=lbl, - mode="lines", - line=dict(color=c, width=2.2, shape="spline"), - fill="tozeroy", - fillcolor=f"rgba({r},{g},{b},0.15)", - hovertemplate=f"{lbl}
%{{x}}
TP rate: %{{y:.2%}}", - ) - ) - apply_chart_theme(fig_tpr, height=420) - fig_tpr.update_layout( - title=f"TP rate by distance", - xaxis_title="Distance bin", - yaxis_title="TP rate", - yaxis_range=[0, 1], - xaxis=_xaxis_dist_bins, - hovermode="x unified", + if not df_status.empty: + if st.checkbox("Debug: Inspect Status Count (All Runs)" if not single_mode else "Debug: Inspect Status Count"): + df_status_wide = df_status.pivot_table(index='label', columns=['dataset', 'status'], values='num', fill_value=0) + df_status_wide.columns = [f"{col[0]} {col[1]}" for col in df_status_wide.columns] + df_status_wide = df_status_wide.reset_index() + st.dataframe(df_status_wide, width='stretch', hide_index=True) + status_viz = st.radio( + "Status chart style", + options=["Stacked bar (counts)", "Treemap", "100% stacked (proportions)", "Spider chart (TP, FP & FN)"], + index=0, + horizontal=True, + key="status_dist_viz_compare", ) - fig_tpr.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") - st.plotly_chart(fig_tpr, width='stretch') - else: - st.info("No TP rate by distance data.") - - if not df_fpr_dist.empty: - fig_fpr = go.Figure() - for i, lbl in enumerate(run_labels_list): - d = df_fpr_dist[df_fpr_dist["run"] == lbl].sort_values("bin_order") - c = RUN_COLORS[i % len(RUN_COLORS)] - r, g, b = int(c[1:3], 16), int(c[3:5], 16), int(c[5:7], 16) - fig_fpr.add_trace( - go.Scatter( - x=d["bin_label"], - y=d["fpr"], - name=lbl, - mode="lines", - line=dict(color=c, width=2.2, shape="spline"), - fill="tozeroy", - fillcolor=f"rgba({r},{g},{b},0.15)", - hovertemplate=f"{lbl}
%{{x}}
FP rate: %{{y:.2%}}", - ) + if status_viz == "Stacked bar (counts)": + fig2 = px.bar( + df_status, + x="label", + y="num", + color="status", + barmode="stack", + facet_col="dataset", + title="Status Distribution per Label (by Run)", + category_orders={"dataset": run_labels_list}, + labels={"num": "Count", "label": "Label", "status": "Status"}, + color_discrete_map=STATUS_COLORS, ) - apply_chart_theme(fig_fpr, height=420) - fig_fpr.update_layout( - title=f"FP rate by distance", - xaxis_title="Distance bin", - yaxis_title="FP rate", - yaxis_range=[0, 1], - xaxis=_xaxis_dist_bins, - hovermode="x unified", - ) - fig_fpr.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") - st.plotly_chart(fig_fpr, width='stretch') - else: - st.info("No FP rate by distance data.") - else: - # Bar chart (histogram) for compare: TP then FP, grouped by run - if not df_tpr_dist.empty: - fig_tpr = go.Figure() - for i, lbl in enumerate(run_labels_list): - d = df_tpr_dist[df_tpr_dist["run"] == lbl].sort_values("bin_order") - fig_tpr.add_trace( - go.Bar( - x=d["bin_label"], - y=d["tpr"], - name=lbl, - marker_color=RUN_COLORS[i % len(RUN_COLORS)], - hovertemplate=f"{lbl}
%{{x}}
TP rate: %{{y:.2%}}", + apply_chart_theme(fig2) + st.plotly_chart(fig2, width='stretch') + elif status_viz == "Spider chart (TP, FP & FN)": + # Same counts as stacked bar: one spider per status (TP / FP / FN), axes = labels, r = count + status_wide = df_status.pivot_table( + index=["dataset", "label"], columns="status", values="num", fill_value=0 + ).reset_index() + cats = sorted(df_status["label"].astype(str).unique()) + if len(cats) > 16: + st.caption("Spider charts work best with ≤16 labels; many classes may look crowded.") + rcols = st.columns(3) + for col_i, st_name in enumerate(["TP", "FP", "FN"]): + col_data = ( + status_wide[st_name] + if st_name in status_wide.columns + else pd.Series(0, index=status_wide.index) ) - ) - apply_chart_theme(fig_tpr, height=420) - fig_tpr.update_layout( - title=f"TP rate by distance", - xaxis_title="Distance bin", - yaxis_title="TP rate", - yaxis_range=[0, 1], - barmode="group", - xaxis=_xaxis_dist_bins, - hovermode="x unified", - ) - fig_tpr.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") - st.plotly_chart(fig_tpr, width='stretch') - else: - st.info("No TP rate by distance data.") - - if not df_fpr_dist.empty: - fig_fpr = go.Figure() - for i, lbl in enumerate(run_labels_list): - d = df_fpr_dist[df_fpr_dist["run"] == lbl].sort_values("bin_order") - fig_fpr.add_trace( - go.Bar( - x=d["bin_label"], - y=d["fpr"], - name=lbl, - marker_color=RUN_COLORS[i % len(RUN_COLORS)], - hovertemplate=f"{lbl}
%{{x}}
FP rate: %{{y:.2%}}", + df_m = pd.DataFrame( + { + "run": status_wide["dataset"].astype(str), + "label": status_wide["label"].astype(str), + "count": col_data.values, + } ) - ) - apply_chart_theme(fig_fpr, height=420) - fig_fpr.update_layout( - title=f"FP rate by distance", - xaxis_title="Distance bin", - yaxis_title="FP rate", - yaxis_range=[0, 1], - barmode="group", - xaxis=_xaxis_dist_bins, - hovermode="x unified", - ) - fig_fpr.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") - st.plotly_chart(fig_fpr, width='stretch') + fig_r = _count_spider_compare( + df_m, + cats, + f"{st_name} count per label (by run)", + run_labels_list, + f"{st_name} count", + ) + with rcols[col_i]: + st.plotly_chart(fig_r, width='stretch') + elif status_viz == "Treemap": + n_runs = len(run_labels_list) + cols = st.columns(min(n_runs, 3)) + for idx, lbl in enumerate(run_labels_list): + df_r = df_status[df_status["dataset"] == lbl] + if not df_r.empty: + fig_t = px.treemap( + df_r, + path=["label", "status"], + values="num", + color="status", + color_discrete_map=STATUS_COLORS, + title=f"{lbl}", + ) + fig_t.update_traces( + textinfo="label+value+percent parent", + hovertemplate="%{label}
Count: %{value}", + ) + apply_chart_theme(fig_t, height=360) + with cols[idx % len(cols)]: + st.plotly_chart(fig_t, width='stretch') + else: + # 100% stacked per run (facet) + df_pct_list = [] + for lbl in run_labels_list: + df_r = df_status[df_status["dataset"] == lbl] + wide = df_r.pivot_table(index="label", columns="status", values="num", fill_value=0) + if wide.empty: + continue + wide_pct = wide.div(wide.sum(axis=1), axis=0) + wide_pct["dataset"] = lbl + wide_pct = wide_pct.reset_index() + df_pct_list.append(wide_pct) + if df_pct_list: + wide_all = pd.concat(df_pct_list, ignore_index=True) + df_pct_melt = wide_all.melt( + id_vars=["label", "dataset"], + value_vars=[c for c in wide_all.columns if c not in ("label", "dataset")], + var_name="status", + value_name="pct", + ) + df_pct_melt = df_pct_melt[df_pct_melt["pct"] > 0] + if not df_pct_melt.empty: + fig2 = px.bar( + df_pct_melt, + x="label", + y="pct", + color="status", + barmode="stack", + facet_col="dataset", + category_orders={"dataset": run_labels_list}, + title="Status proportion per Label (100% stacked, by Run)", + labels={"pct": "Proportion", "label": "Label", "status": "Status"}, + color_discrete_map=STATUS_COLORS, + ) + apply_chart_theme(fig2) + fig2.update_layout( + yaxis_tickformat=".0%", + yaxis_range=[0, 1], + ) + for ann in fig2.layout.annotations: + ann.text = ann.text.split("=")[-1] + st.plotly_chart(fig2, width='stretch') + else: + st.info("No data for proportions.") + else: + st.info("No data for proportions.") else: - st.info("No FP rate by distance data.") - - # Object count by same distance bins as TP/FP; same line vs bar style; aligned x-axis - + st.info("No status count data available") + + except Exception as e: + st.error(f"Error in summary: {e}") + + + + def _distance_bin_order_and_label(bin_str: str) -> Tuple[int, str]: + """Parse distance_bin e.g. '[0,10)' -> (0, '0–10 m'). 
Used for sorting and axis labels.""" + import re + s = str(bin_str).strip() + m = re.match(r"\[(\d+)\s*,\s*(\d+)\)", s) + if m: + lo, hi = int(m.group(1)), int(m.group(2)) + return (lo, f"{lo}–{hi} m") + m = re.match(r"\[(\d+)\s*,\s*inf\)", s, re.I) + if m: + return (int(m.group(1)), f"{m.group(1)}+ m") + return (0, s) + + + # Same 10 m bins as eval_flat / TPR-FPR stats (used for object-count alignment) + _DIST_BIN_CASE = """CASE + WHEN dist_h >= 0 AND dist_h < 10 THEN '[0,10)' + WHEN dist_h >= 10 AND dist_h < 20 THEN '[10,20)' + WHEN dist_h >= 20 AND dist_h < 30 THEN '[20,30)' + WHEN dist_h >= 30 AND dist_h < 40 THEN '[30,40)' + WHEN dist_h >= 40 AND dist_h < 50 THEN '[40,50)' + WHEN dist_h >= 50 AND dist_h < 60 THEN '[50,60)' + WHEN dist_h >= 60 AND dist_h < 70 THEN '[60,70)' + WHEN dist_h >= 70 AND dist_h < 80 THEN '[70,80)' + WHEN dist_h >= 80 AND dist_h < 90 THEN '[80,90)' + WHEN dist_h >= 90 AND dist_h < 100 THEN '[90,100)' + WHEN dist_h >= 100 AND dist_h < 110 THEN '[100,110)' + WHEN dist_h >= 110 AND dist_h < 120 THEN '[110,120)' + WHEN dist_h >= 120 AND dist_h < 130 THEN '[120,130)' + WHEN dist_h >= 130 AND dist_h < 140 THEN '[130,140)' + WHEN dist_h >= 140 AND dist_h < 150 THEN '[140,150)' + WHEN dist_h >= 150 THEN '[150,inf)' + ELSE '[unknown]' END""" + + + # ============================= + # Panel 3–5: Distance — TP/FP rates by bin + object count vs range + # ============================= + ds_dlog("section: Panel3_5_Distance_start") + st.divider() + st.markdown( + section_header_html( + "Distance: TP/FP rates & object count", + "Same distance bins and chart style (line or bar) for rates and object counts; x-axis order matches across charts.", + ), + unsafe_allow_html=True, + ) + rate_by_dist_style = st.radio( + "Chart style", + options=["Line chart (trend)", "Bar chart (histogram)"], + index=0, + horizontal=True, + key="tp_fp_rate_by_dist_style", + ) + + filter_clause_base = build_filter_clause(filters_base, enable_dist_h=False) + ds_dlog( + "distance: 
filter_clause_base (no dist_h) len=%s preview=%s", + len(filter_clause_base), + filter_clause_base[:600], + ) + _dist_slot = st.empty() + _dist_slot.markdown(ds_spot_loading_markup("Distance · TP/FP rates & object counts"), unsafe_allow_html=True) try: + ds_dlog("distance_inner_try: single_mode=%s", single_mode) + ds_debug_log_memory("distance_inner_try_start") + use_line_chart = rate_by_dist_style == "Line chart (trend)" + rate_bin_labels_order: Optional[List[str]] = None + if single_mode: - q_oc = f""" - SELECT ({_DIST_BIN_CASE}) AS distance_bin, label, COUNT(*) AS n - FROM view_eval_flat - WHERE {filter_clause_base} - GROUP BY 1, 2 - """ - df_oc = con.execute(q_oc).df() + # Inline stats from view_eval_flat (avoid nested TPR/FPR view — DuckDB can SIGSEGV on that plan). + query_both = sql_distance_bin_rates_from_eval_flat( + "view_eval_flat", filter_clause_base, metrics="both" + ) + ds_dlog("distance: executing query_both (single_mode TPR/FPR by bin, inlined from eval_flat)") + df_both = con.execute(query_both).df() + ds_dlog("distance: query_both done rows=%s cols=%s", len(df_both), list(df_both.columns)) + ds_debug_log_memory("distance_after_query_both") + if not df_both.empty: + df_both["bin_order"], df_both["bin_label"] = zip( + *df_both["distance_bin"].map(_distance_bin_order_and_label) + ) + df_both = df_both.sort_values("bin_order") + x_labels = df_both["bin_label"].tolist() + rate_bin_labels_order = x_labels + + if use_line_chart: + fig = go.Figure() + fig.add_trace( + go.Scatter( + x=x_labels, + y=df_both["tpr"], + name="TP rate", + mode="lines", + line=dict(color=RUN_COLORS[0], width=2.5, shape="spline"), + fill="tozeroy", + fillcolor="rgba(74, 144, 217, 0.2)", + hovertemplate="%{x}
TP rate: %{y:.2%}", + ) + ) + fig.add_trace( + go.Scatter( + x=x_labels, + y=df_both["fpr"], + name="FP rate", + mode="lines", + line=dict(color=RUN_COLORS[1], width=2.5, shape="spline"), + fill="tozeroy", + fillcolor="rgba(232, 106, 51, 0.2)", + hovertemplate="%{x}
FP rate: %{y:.2%}", + ) + ) + apply_chart_theme(fig, height=420) + fig.update_layout( + title=f"TP & FP rate by distance (within {max_eval_range} m)", + xaxis_title="Distance bin", + yaxis_title="Rate", + yaxis_range=[0, 1], + xaxis=dict( + tickangle=-35, + categoryorder="array", + categoryarray=x_labels, + ), + hovermode="x unified", + ) + fig.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") + st.plotly_chart(fig, width='stretch') + else: + # Bar chart (histogram): combined TP + FP grouped bars + fig = go.Figure() + fig.add_trace( + go.Bar( + x=x_labels, + y=df_both["tpr"], + name="TP rate", + marker_color=RUN_COLORS[0], + hovertemplate="%{x}
TP rate: %{y:.2%}", + ) + ) + fig.add_trace( + go.Bar( + x=x_labels, + y=df_both["fpr"], + name="FP rate", + marker_color=RUN_COLORS[1], + hovertemplate="%{x}
FP rate: %{y:.2%}", + ) + ) + apply_chart_theme(fig, height=420) + fig.update_layout( + title=f"TP & FP rate by distance (within {max_eval_range} m)", + xaxis_title="Distance bin", + yaxis_title="Rate", + yaxis_range=[0, 1], + barmode="group", + xaxis=dict( + tickangle=-35, + categoryorder="array", + categoryarray=x_labels, + ), + hovermode="x unified", + ) + fig.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") + st.plotly_chart(fig, width='stretch') + else: + st.info("No distance-bin data available.") else: - dfs_oc = [] + # Compare mode: fetch TP and FP by distance per run + ds_dlog("distance: compare_mode n_runs=%s", len(runs)) + dfs_tpr = [] for i in range(len(runs)): - fc_oc = build_filter_clause(filters_list[i], enable_dist_h=False) - q_oc_i = f""" - SELECT ({_DIST_BIN_CASE}) AS distance_bin, COUNT(*) AS n - FROM {_flat_view(i)} - WHERE {fc_oc} - GROUP BY 1 - """ - df_oci = con.execute(q_oc_i).df() - df_oci["run"] = run_labels_list[i] - dfs_oc.append(df_oci) - df_oc = pd.concat(dfs_oc, ignore_index=True) - - if df_oc.empty: - st.info("No object count data by distance bin.") - else: - df_oc = df_oc.copy() - df_oc["bin_order"], df_oc["bin_label"] = zip(*df_oc["distance_bin"].map(_distance_bin_order_and_label)) - if rate_bin_labels_order: - align_x = list(rate_bin_labels_order) - else: - align_x = ( - df_oc.drop_duplicates("distance_bin") + fc = build_filter_clause(filters_list[i], enable_dist_h=False) + q = sql_distance_bin_rates_from_eval_flat(_flat_view(i), fc, metrics="tpr") + ds_dlog("distance: compare run %s/%s TPR by bin query", i + 1, len(runs)) + df_i = con.execute(q).df() + ds_dlog("distance: compare TPR query run %s rows=%s", i, len(df_i)) + df_i["run"] = run_labels_list[i] + df_i["bin_order"], df_i["bin_label"] = zip(*df_i["distance_bin"].map(_distance_bin_order_and_label)) + df_i = df_i.sort_values("bin_order") + dfs_tpr.append(df_i) + df_tpr_dist = pd.concat(dfs_tpr, ignore_index=True) + ds_dlog("distance: df_tpr_dist 
total_rows=%s", len(df_tpr_dist)) + + dfs_fpr = [] + for i in range(len(runs)): + fc = build_filter_clause(filters_list[i], enable_dist_h=False) + q = sql_distance_bin_rates_from_eval_flat(_flat_view(i), fc, metrics="fpr") + ds_dlog("distance: compare run %s/%s FPR by bin query", i + 1, len(runs)) + df_i = con.execute(q).df() + ds_dlog("distance: compare FPR query run %s rows=%s", i, len(df_i)) + df_i["run"] = run_labels_list[i] + df_i["bin_order"], df_i["bin_label"] = zip(*df_i["distance_bin"].map(_distance_bin_order_and_label)) + df_i = df_i.sort_values("bin_order") + dfs_fpr.append(df_i) + df_fpr_dist = pd.concat(dfs_fpr, ignore_index=True) + + if not df_tpr_dist.empty: + rate_bin_labels_order = ( + df_tpr_dist[df_tpr_dist["run"] == run_labels_list[0]] .sort_values("bin_order")["bin_label"] .tolist() ) - - xaxis_oc = dict(tickangle=-35, categoryorder="array", categoryarray=align_x) - - if single_mode: - pivot_oc = df_oc.pivot_table( - index="bin_label", columns="label", values="n", aggfunc="sum", fill_value=0 - ) - pivot_oc = pivot_oc.reindex(align_x, fill_value=0) - - fig_oc = go.Figure() - if use_line_chart: - for j, lab in enumerate(pivot_oc.columns): - c = RUN_COLORS[j % len(RUN_COLORS)] + _xaxis_dist_bins = ( + dict(tickangle=-35, categoryorder="array", categoryarray=rate_bin_labels_order) + if rate_bin_labels_order + else dict(tickangle=-35) + ) + + if use_line_chart: + if not df_tpr_dist.empty: + fig_tpr = go.Figure() + for i, lbl in enumerate(run_labels_list): + d = df_tpr_dist[df_tpr_dist["run"] == lbl].sort_values("bin_order") + c = RUN_COLORS[i % len(RUN_COLORS)] r, g, b = int(c[1:3], 16), int(c[3:5], 16), int(c[5:7], 16) - nm = str(lab) - fig_oc.add_trace( + fig_tpr.add_trace( go.Scatter( - x=align_x, - y=pivot_oc[lab].values, - name=nm, + x=d["bin_label"], + y=d["tpr"], + name=lbl, mode="lines", line=dict(color=c, width=2.2, shape="spline"), fill="tozeroy", - fillcolor=f"rgba({r},{g},{b},0.12)", - hovertemplate=f"{nm}
%{{x}}
Count: %{{y:.0f}}", + fillcolor=f"rgba({r},{g},{b},0.15)", + hovertemplate=f"{lbl}
%{{x}}
TP rate: %{{y:.2%}}", ) ) + apply_chart_theme(fig_tpr, height=420) + fig_tpr.update_layout( + title=f"TP rate by distance", + xaxis_title="Distance bin", + yaxis_title="TP rate", + yaxis_range=[0, 1], + xaxis=_xaxis_dist_bins, + hovermode="x unified", + ) + fig_tpr.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") + st.plotly_chart(fig_tpr, width='stretch') else: - for j, lab in enumerate(pivot_oc.columns): - c = RUN_COLORS[j % len(RUN_COLORS)] - nm = str(lab) - fig_oc.add_trace( - go.Bar( - x=align_x, - y=pivot_oc[lab].values, - name=nm, - marker_color=c, - hovertemplate=f"{nm}
%{{x}}
Count: %{{y:.0f}}", - ) - ) - apply_chart_theme(fig_oc, height=420) - fig_oc.update_layout( - title=f"Object count by distance bin (within {max_eval_range} m)", - xaxis_title="Distance bin", - yaxis_title="Count", - xaxis=xaxis_oc, - hovermode="x unified", - **({"barmode": "group"} if not use_line_chart else {}), - ) - st.plotly_chart(fig_oc, width='stretch') - else: - pivot_oc = df_oc.pivot_table( - index="bin_label", columns="run", values="n", aggfunc="sum", fill_value=0 - ) - pivot_oc = pivot_oc.reindex(align_x, fill_value=0) - run_cols = [r for r in run_labels_list if r in pivot_oc.columns] - - fig_oc = go.Figure() - if use_line_chart: - for j, rl in enumerate(run_cols): - c = RUN_COLORS[j % len(RUN_COLORS)] + st.info("No TP rate by distance data.") + + if not df_fpr_dist.empty: + fig_fpr = go.Figure() + for i, lbl in enumerate(run_labels_list): + d = df_fpr_dist[df_fpr_dist["run"] == lbl].sort_values("bin_order") + c = RUN_COLORS[i % len(RUN_COLORS)] r, g, b = int(c[1:3], 16), int(c[3:5], 16), int(c[5:7], 16) - fig_oc.add_trace( + fig_fpr.add_trace( go.Scatter( - x=align_x, - y=pivot_oc[rl].values, - name=str(rl), + x=d["bin_label"], + y=d["fpr"], + name=lbl, mode="lines", line=dict(color=c, width=2.2, shape="spline"), fill="tozeroy", fillcolor=f"rgba({r},{g},{b},0.15)", - hovertemplate=f"{rl}
%{{x}}
Count: %{{y:.0f}}", + hovertemplate=f"{lbl}
%{{x}}
FP rate: %{{y:.2%}}", ) ) + apply_chart_theme(fig_fpr, height=420) + fig_fpr.update_layout( + title=f"FP rate by distance", + xaxis_title="Distance bin", + yaxis_title="FP rate", + yaxis_range=[0, 1], + xaxis=_xaxis_dist_bins, + hovermode="x unified", + ) + fig_fpr.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") + st.plotly_chart(fig_fpr, width='stretch') else: - for j, rl in enumerate(run_cols): - c = RUN_COLORS[j % len(RUN_COLORS)] - fig_oc.add_trace( + st.info("No FP rate by distance data.") + else: + # Bar chart (histogram) for compare: TP then FP, grouped by run + if not df_tpr_dist.empty: + fig_tpr = go.Figure() + for i, lbl in enumerate(run_labels_list): + d = df_tpr_dist[df_tpr_dist["run"] == lbl].sort_values("bin_order") + fig_tpr.add_trace( go.Bar( - x=align_x, - y=pivot_oc[rl].values, - name=str(rl), - marker_color=c, - hovertemplate=f"{rl}
%{{x}}
Count: %{{y:.0f}}", + x=d["bin_label"], + y=d["tpr"], + name=lbl, + marker_color=RUN_COLORS[i % len(RUN_COLORS)], + hovertemplate=f"{lbl}
%{{x}}
TP rate: %{{y:.2%}}", ) ) - apply_chart_theme(fig_oc, height=420) - fig_oc.update_layout( - title=f"Object count by distance bin", - xaxis_title="Distance bin", - yaxis_title="Count", - xaxis=xaxis_oc, - hovermode="x unified", - **({"barmode": "group"} if not use_line_chart else {}), - ) - st.plotly_chart(fig_oc, width='stretch') - except Exception as e_oc: - st.error(f"Error (object count by distance bin): {e_oc}") - -except Exception as e: - st.error(f"Error: {e}") -finally: - _dist_slot.empty() -# ============================= -# Panel 2: TP Rate (single) / TP Rate Comparison (compare) -# ============================= -st.markdown( - section_header_html( - "TP Rate" + (" Comparison" if not single_mode else ""), - "TP rate per object class (GT TP / (TP+FN)). Pick a chart style below.", - ), - unsafe_allow_html=True, -) - -_tpr_query = """ -SELECT - label, - CASE - WHEN COUNT(*) FILTER (WHERE source='GT' AND status IN ('TP','FN')) > 0 - THEN CAST(COUNT(*) FILTER (WHERE source='GT' AND status='TP') AS DOUBLE) - / COUNT(*) FILTER (WHERE source='GT' AND status IN ('TP','FN')) - ELSE 0 - END AS tpr -FROM {view} -WHERE {filter_clause} -GROUP BY label -ORDER BY label -""" - -# Compare-mode TP rate spider charts: several distance caps + no cap (sidebar range not used for this view) -TPR_COMPARE_SPIDER_RANGES: List[Tuple[Optional[int], str]] = [ - (50, "≤50 m"), - (80, "≤80 m"), - (100, "≤100 m"), - (120, "≤120 m"), - (150, "≤150 m"), - (None, "All distances"), -] - -if single_mode: - tpr_viz = st.radio( - "TP rate chart style", - options=["Bar chart", "Lollipop (ranked)"], - index=0, - horizontal=True, - key="tpr_viz_single", - ) - try: - with ds_spot_loading("TP rate"): - filter_clause = build_filter_clause(filters_base) - query = _tpr_query.format(view="view_eval_flat", filter_clause=filter_clause) - df_tpr_base = con.execute(query).df() - if not df_tpr_base.empty: - title = f"Total TP rate within {max_eval_range} [m]" - if tpr_viz == "Bar chart": - fig = px.bar( - 
df_tpr_base, - x="label", - y="tpr", - title=title, - labels={"tpr": "TP Rate", "label": "Label"}, - ) - apply_chart_theme(fig) - fig.update_layout(yaxis_range=[0, 1.2]) - fig.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.2)") - st.plotly_chart(fig, width='stretch') + apply_chart_theme(fig_tpr, height=420) + fig_tpr.update_layout( + title=f"TP rate by distance", + xaxis_title="Distance bin", + yaxis_title="TP rate", + yaxis_range=[0, 1], + barmode="group", + xaxis=_xaxis_dist_bins, + hovermode="x unified", + ) + fig_tpr.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") + st.plotly_chart(fig_tpr, width='stretch') + else: + st.info("No TP rate by distance data.") + + if not df_fpr_dist.empty: + fig_fpr = go.Figure() + for i, lbl in enumerate(run_labels_list): + d = df_fpr_dist[df_fpr_dist["run"] == lbl].sort_values("bin_order") + fig_fpr.add_trace( + go.Bar( + x=d["bin_label"], + y=d["fpr"], + name=lbl, + marker_color=RUN_COLORS[i % len(RUN_COLORS)], + hovertemplate=f"{lbl}
%{{x}}
FP rate: %{{y:.2%}}", + ) + ) + apply_chart_theme(fig_fpr, height=420) + fig_fpr.update_layout( + title=f"FP rate by distance", + xaxis_title="Distance bin", + yaxis_title="FP rate", + yaxis_range=[0, 1], + barmode="group", + xaxis=_xaxis_dist_bins, + hovermode="x unified", + ) + fig_fpr.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") + st.plotly_chart(fig_fpr, width='stretch') + else: + st.info("No FP rate by distance data.") + + # Object count by same distance bins as TP/FP; same line vs bar style; aligned x-axis + + try: + if single_mode: + q_oc = f""" + SELECT ({_DIST_BIN_CASE}) AS distance_bin, label, COUNT(*) AS n + FROM view_eval_flat + WHERE {filter_clause_base} + GROUP BY 1, 2 + """ + df_oc = con.execute(q_oc).df() else: - fig = _tpr_lollipop_single(df_tpr_base, title) - st.plotly_chart(fig, width='stretch') - else: - st.info("No data available") - except Exception as e: - st.error(f"Error: {e}") -else: - tpr_opts = ["Spider chart", "Grouped bar", "Heatmap (label × run)", "Line profile"] - tpr_viz = st.radio( - "TP rate chart style", - options=tpr_opts, - index=0, - horizontal=True, - key="tpr_viz_compare", - ) - try: - with ds_spot_loading("TP rate"): - dfs_tpr = [] - for i in range(len(runs)): - fc = build_filter_clause(filters_list[i]) - q = _tpr_query.format(view=_flat_view(i), filter_clause=fc) - df_i = con.execute(q).df() - df_i["run"] = run_labels_list[i] - dfs_tpr.append(df_i) - df_tpr_all = pd.concat(dfs_tpr, ignore_index=True) - if tpr_viz == "Spider chart": - st.caption( - "Six spider charts use **fixed distance cutoffs** (50–150 m) plus **all distances**. " - "Topic / label / suite / visibility filters still apply. " - "Other chart types and the rest of the page use the sidebar **Max Evaluation Range**." 
- ) - fb_all = {**filters_base, "max_eval_range": None} - label_union: set = set() - for i in range(len(runs)): - fc_a = build_filter_clause(fb_all) - q_a = _tpr_query.format(view=_flat_view(i), filter_clause=fc_a) - dfa = con.execute(q_a).df() - label_union |= set(dfa["label"].astype(str)) - cats = sorted(label_union) - if not cats: - st.info("No TP rate data for any distance range with current filters.") + dfs_oc = [] + for i in range(len(runs)): + fc_oc = build_filter_clause(filters_list[i], enable_dist_h=False) + q_oc_i = f""" + SELECT ({_DIST_BIN_CASE}) AS distance_bin, COUNT(*) AS n + FROM {_flat_view(i)} + WHERE {fc_oc} + GROUP BY 1 + """ + df_oci = con.execute(q_oc_i).df() + df_oci["run"] = run_labels_list[i] + dfs_oc.append(df_oci) + df_oc = pd.concat(dfs_oc, ignore_index=True) + + if df_oc.empty: + st.info("No object count data by distance bin.") else: - if len(cats) > 16: - st.caption("Spider charts work best with ≤16 labels; many classes may look crowded.") - for row_start in range(0, len(TPR_COMPARE_SPIDER_RANGES), 3): - row_ranges = TPR_COMPARE_SPIDER_RANGES[row_start : row_start + 3] - cols = st.columns(len(row_ranges)) - for col, (max_r, cap_lbl) in zip(cols, row_ranges): - fb = {**filters_base, "max_eval_range": max_r} - dfs_slice = [] - for i in range(len(runs)): - fc = build_filter_clause(fb) - q = _tpr_query.format(view=_flat_view(i), filter_clause=fc) - dfi = con.execute(q).df() - dfi["run"] = run_labels_list[i] - dfs_slice.append(dfi) - df_slice = pd.concat(dfs_slice, ignore_index=True) - with col: - if df_slice.empty: - st.info(f"No data ({cap_lbl}).") - else: - fig = _tpr_spider_compare( - df_slice, - cats, - f"TP rate ({cap_lbl})", - run_labels_list, - height=360, + df_oc = df_oc.copy() + df_oc["bin_order"], df_oc["bin_label"] = zip(*df_oc["distance_bin"].map(_distance_bin_order_and_label)) + if rate_bin_labels_order: + align_x = list(rate_bin_labels_order) + else: + align_x = ( + df_oc.drop_duplicates("distance_bin") + 
.sort_values("bin_order")["bin_label"] + .tolist() + ) + + xaxis_oc = dict(tickangle=-35, categoryorder="array", categoryarray=align_x) + + if single_mode: + pivot_oc = df_oc.pivot_table( + index="bin_label", columns="label", values="n", aggfunc="sum", fill_value=0 + ) + pivot_oc = pivot_oc.reindex(align_x, fill_value=0) + + fig_oc = go.Figure() + if use_line_chart: + for j, lab in enumerate(pivot_oc.columns): + c = RUN_COLORS[j % len(RUN_COLORS)] + r, g, b = int(c[1:3], 16), int(c[3:5], 16), int(c[5:7], 16) + nm = str(lab) + fig_oc.add_trace( + go.Scatter( + x=align_x, + y=pivot_oc[lab].values, + name=nm, + mode="lines", + line=dict(color=c, width=2.2, shape="spline"), + fill="tozeroy", + fillcolor=f"rgba({r},{g},{b},0.12)", + hovertemplate=f"{nm}
%{{x}}
Count: %{{y:.0f}}", ) - st.plotly_chart(fig, width='stretch') - elif not df_tpr_all.empty: - title = f"Total TP rate within {max_eval_range} [m] by run" - if tpr_viz == "Grouped bar": - fig = px.bar( - df_tpr_all, - x="label", - y="tpr", - color="run", - barmode="group", - title=title, - labels={"tpr": "TP Rate", "label": "Label", "run": "Run"}, - color_discrete_sequence=RUN_COLORS, - ) - apply_chart_theme(fig) - fig.update_layout(yaxis_range=[0, 1.2]) - fig.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.2)") - st.plotly_chart(fig, width='stretch') - elif tpr_viz == "Heatmap (label × run)": - pivot = df_tpr_all.pivot_table(index="label", columns="run", values="tpr", aggfunc="first") - cols_present = [c for c in run_labels_list if c in pivot.columns] - if cols_present: - pivot = pivot[cols_present] - fig = px.imshow( - pivot, - labels=dict(x="Run", y="Label", color="TP rate"), - title=title, - color_continuous_scale="RdYlGn", - zmin=0, - zmax=1, - aspect="auto", - ) - apply_chart_theme(fig, height=max(360, 32 + 22 * len(pivot.index))) - fig.update_layout(xaxis_side="top") - st.plotly_chart(fig, width='stretch') - elif tpr_viz == "Line profile": - fig = px.line( - df_tpr_all, - x="label", - y="tpr", - color="run", - markers=True, - title=title, - labels={"tpr": "TP Rate", "label": "Label", "run": "Run"}, - color_discrete_sequence=RUN_COLORS, - ) - fig.update_traces(line=dict(width=2.5), marker=dict(size=8)) - apply_chart_theme(fig, height=400) - fig.update_layout(yaxis_range=[0, 1.15], xaxis_tickangle=-35, hovermode="x unified") - fig.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.2)") - st.plotly_chart(fig, width='stretch') - else: - st.info("No data available") + ) + else: + for j, lab in enumerate(pivot_oc.columns): + c = RUN_COLORS[j % len(RUN_COLORS)] + nm = str(lab) + fig_oc.add_trace( + go.Bar( + x=align_x, + y=pivot_oc[lab].values, + name=nm, + marker_color=c, + hovertemplate=f"{nm}
%{{x}}
Count: %{{y:.0f}}", + ) + ) + apply_chart_theme(fig_oc, height=420) + fig_oc.update_layout( + title=f"Object count by distance bin (within {max_eval_range} m)", + xaxis_title="Distance bin", + yaxis_title="Count", + xaxis=xaxis_oc, + hovermode="x unified", + **({"barmode": "group"} if not use_line_chart else {}), + ) + st.plotly_chart(fig_oc, width='stretch') + else: + pivot_oc = df_oc.pivot_table( + index="bin_label", columns="run", values="n", aggfunc="sum", fill_value=0 + ) + pivot_oc = pivot_oc.reindex(align_x, fill_value=0) + run_cols = [r for r in run_labels_list if r in pivot_oc.columns] + + fig_oc = go.Figure() + if use_line_chart: + for j, rl in enumerate(run_cols): + c = RUN_COLORS[j % len(RUN_COLORS)] + r, g, b = int(c[1:3], 16), int(c[3:5], 16), int(c[5:7], 16) + fig_oc.add_trace( + go.Scatter( + x=align_x, + y=pivot_oc[rl].values, + name=str(rl), + mode="lines", + line=dict(color=c, width=2.2, shape="spline"), + fill="tozeroy", + fillcolor=f"rgba({r},{g},{b},0.15)", + hovertemplate=f"{rl}
%{{x}}
Count: %{{y:.0f}}", + ) + ) + else: + for j, rl in enumerate(run_cols): + c = RUN_COLORS[j % len(RUN_COLORS)] + fig_oc.add_trace( + go.Bar( + x=align_x, + y=pivot_oc[rl].values, + name=str(rl), + marker_color=c, + hovertemplate=f"{rl}
%{{x}}
Count: %{{y:.0f}}", + ) + ) + apply_chart_theme(fig_oc, height=420) + fig_oc.update_layout( + title=f"Object count by distance bin", + xaxis_title="Distance bin", + yaxis_title="Count", + xaxis=xaxis_oc, + hovermode="x unified", + **({"barmode": "group"} if not use_line_chart else {}), + ) + st.plotly_chart(fig_oc, width='stretch') + except Exception as e_oc: + st.error(f"Error (object count by distance bin): {e_oc}") + except Exception as e: st.error(f"Error: {e}") -# ============================= -# Panel 5: Perception diff vs baseline A (compare mode only) -# ============================= -def _baobab_hierarchy_from_objects( - df_obj: pd.DataFrame, - change_type: str, - root_label: str, - max_scenarios: int, - max_frames: int, -) -> pd.DataFrame: - """ - Build a leaf table for Plotly sunburst/treemap: root → scenario → frame → label. - Caps scenarios and frames per scenario; merges the rest into Other buckets. - """ - if df_obj.empty or "change_type" not in df_obj.columns: - return pd.DataFrame() - sub = df_obj[df_obj["change_type"] == change_type].copy() - if sub.empty: - return pd.DataFrame() - sub["scenario_name"] = sub["scenario_name"].fillna("").astype(str).replace("", "(no scenario)") - sub["label"] = sub["label"].fillna("").astype(str).replace("", "(no label)") - sub["frame_key"] = ( - sub["t4dataset_id"].astype(str) + "|f" + sub["frame_index"].astype(str) - ) - leaf = ( - sub.groupby(["scenario_name", "frame_key", "label"], dropna=False) - .size() - .reset_index(name="n") - ) - if leaf.empty: - return pd.DataFrame() - ms = max(int(max_scenarios), 1) - mf = max(int(max_frames), 1) - scen_tot = leaf.groupby("scenario_name")["n"].sum().sort_values(ascending=False) - top_scen = set(scen_tot.head(ms).index) - leaf["scen_g"] = np.where( - leaf["scenario_name"].isin(top_scen), - leaf["scenario_name"], - "Other scenarios", - ) - parts = [] - for _, g in leaf.groupby("scen_g"): - fr_tot = g.groupby("frame_key")["n"].sum().sort_values(ascending=False) - top_fr = 
set(fr_tot.head(mf).index) - g2 = g.copy() - g2["fr_g"] = np.where(g2["frame_key"].isin(top_fr), g2["frame_key"], "Other frames") - agg = g2.groupby(["scen_g", "fr_g", "label"], as_index=False)["n"].sum() - parts.append(agg) - out = pd.concat(parts, ignore_index=True) - out["root"] = root_label - - def _frame_ring_label(fr_g: str, scen_g: str) -> str: - if fr_g == "Other frames" or str(fr_g) == "Other frames": - return "Other frames" - sfg = str(fr_g) - if "|f" not in sfg: - return sfg - fid = sfg.split("|f", 1)[-1] - if scen_g == "Other scenarios": - t4 = sfg.split("|f", 1)[0] - t4s = t4 if len(t4) <= 14 else ("…" + t4[-12:]) - return f"{t4s}|f{fid}" - return f"f{fid}" - - out["fr_display"] = out.apply( - lambda r: _frame_ring_label(r["fr_g"], r["scen_g"]), axis=1 - ) - return out - - -def _comparison_lens_treemap_df( - names: pd.Series, - improved: pd.Series, - degraded: pd.Series, - root_title: str, -) -> pd.DataFrame: - """Rows for px.treemap path root → Improved|Degraded → item (area = n).""" - rows = [] - for i in range(len(names)): - nm = str(names.iloc[i]).strip() or "—" - if len(nm) > 72: - nm = nm[:69] + "…" - ip = float(improved.iloc[i]) if pd.notna(improved.iloc[i]) else 0.0 - dg = float(degraded.iloc[i]) if pd.notna(degraded.iloc[i]) else 0.0 - if ip > 0: - rows.append( - {"root": root_title, "side": "Improved", "item": nm, "n": ip} - ) - if dg > 0: - rows.append( - {"root": root_title, "side": "Degraded", "item": nm, "n": dg} - ) - return pd.DataFrame(rows) - - -def _plot_comparison_lens_treemap( - tdf: pd.DataFrame, - st_key: str, - title: str, -) -> None: - if tdf is None or tdf.empty: - st.caption("_No data for this view._") - return - fig = px.treemap( - tdf, - path=["root", "side", "item"], - values="n", - color="side", - color_discrete_map={"Improved": IMPROVED_COLOR, "Degraded": DEGRADED_COLOR}, - ) - fig.update_traces( - textfont_size=12, - textinfo="label+value+percent parent", - hovertemplate=( - "%{label}
" - "GT objects: %{value:.0f}
" - "% of parent: %{percentParent}" - ), - marker_line_width=1.5, - marker_line_color="rgba(255,255,255,0.45)", - root_color="rgba(240,240,245,0.95)", - ) - _title_layout = {**PLOTLY_LAYOUT_THEME["title"], "text": title} - apply_chart_theme( - fig, - height=430, - margin=dict(t=20, l=2, r=2, b=2), - paper_bgcolor="rgba(0,0,0,0)", - title=_title_layout, - ) - st.plotly_chart(fig, width='stretch', key=st_key) - - -if not single_mode: - st.divider() + finally: + _dist_slot.empty() + ds_dlog("section: Panel3_5_Distance_end") + # ============================= + # Panel 2: TP Rate (single) / TP Rate Comparison (compare) + # ============================= + ds_dlog("section: Panel2_TP_Rate_start") st.markdown( section_header_html( - "Perception diff (vs baseline A)", - "Per-GT-object comparison vs baseline A: degraded = was TP on A and FN on candidate; improved = was FN on A and TP on candidate. Hotspots prioritize regressions.", + "TP Rate" + (" Comparison" if not single_mode else ""), + "TP rate per object class (GT TP / (TP+FN)). 
Pick a chart style below.", ), unsafe_allow_html=True, ) - for idx in range(1, len(runs)): - lbl = run_labels_list[idx] - _pd_slot = st.empty() - _pd_slot.markdown(ds_spot_loading_markup(f"Perception diff · run {lbl}"), unsafe_allow_html=True) + + _tpr_query = """ + SELECT + label, + CASE + WHEN COUNT(*) FILTER (WHERE source='GT' AND status IN ('TP','FN')) > 0 + THEN CAST(COUNT(*) FILTER (WHERE source='GT' AND status='TP') AS DOUBLE) + / COUNT(*) FILTER (WHERE source='GT' AND status IN ('TP','FN')) + ELSE 0 + END AS tpr + FROM {view} + WHERE {filter_clause} + GROUP BY label + ORDER BY label + """ + + # Compare-mode TP rate spider charts: several distance caps + no cap (sidebar range not used for this view) + TPR_COMPARE_SPIDER_RANGES: List[Tuple[Optional[int], str]] = [ + (50, "≤50 m"), + (80, "≤80 m"), + (100, "≤100 m"), + (120, "≤120 m"), + (150, "≤150 m"), + (None, "All distances"), + ] + + if single_mode: + tpr_viz = st.radio( + "TP rate chart style", + options=["Bar chart", "Lollipop (ranked)"], + index=0, + horizontal=True, + key="tpr_viz_single", + ) try: - filter_clause_comp_p5 = build_filter_clause(filters_list[idx], enable_dist_h=False) - comp_flat = _flat_view(idx) - query = f""" - WITH base_gt AS ( - SELECT - t4dataset_id, - frame_index, - uuid AS gt_uuid, - COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_base, - COALESCE(MAX(try_cast(suite_name AS VARCHAR)), '') AS suite_name, - COALESCE(MAX(try_cast(scenario_name AS VARCHAR)), '') AS scenario_name, - COALESCE(MAX(try_cast(t4dataset_name AS VARCHAR)), '') AS t4dataset_name - FROM view_eval_flat - WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL - AND {filter_clause_base} - GROUP BY 1,2,3 + with ds_spot_loading("TP rate"): + filter_clause = build_filter_clause(filters_base) + query = _tpr_query.format(view="view_eval_flat", filter_clause=filter_clause) + df_tpr_base = con.execute(query).df() + if not df_tpr_base.empty: + title = f"Total TP rate within {max_eval_range} [m]" + if 
tpr_viz == "Bar chart": + fig = px.bar( + df_tpr_base, + x="label", + y="tpr", + title=title, + labels={"tpr": "TP Rate", "label": "Label"}, + ) + apply_chart_theme(fig) + fig.update_layout(yaxis_range=[0, 1.2]) + fig.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.2)") + st.plotly_chart(fig, width='stretch') + else: + fig = _tpr_lollipop_single(df_tpr_base, title) + st.plotly_chart(fig, width='stretch') + else: + st.info("No data available") + except Exception as e: + st.error(f"Error: {e}") + else: + tpr_opts = ["Spider chart", "Grouped bar", "Heatmap (label × run)", "Line profile"] + tpr_viz = st.radio( + "TP rate chart style", + options=tpr_opts, + index=0, + horizontal=True, + key="tpr_viz_compare", + ) + try: + with ds_spot_loading("TP rate"): + dfs_tpr = [] + for i in range(len(runs)): + fc = build_filter_clause(filters_list[i]) + q = _tpr_query.format(view=_flat_view(i), filter_clause=fc) + df_i = con.execute(q).df() + df_i["run"] = run_labels_list[i] + dfs_tpr.append(df_i) + df_tpr_all = pd.concat(dfs_tpr, ignore_index=True) + if tpr_viz == "Spider chart": + st.caption( + "Six spider charts use **fixed distance cutoffs** (50–150 m) plus **all distances**. " + "Topic / label / suite / visibility filters still apply. " + "Other chart types and the rest of the page use the sidebar **Max Evaluation Range**." 
+ ) + fb_all = {**filters_base, "max_eval_range": None} + label_union: set = set() + for i in range(len(runs)): + fc_a = build_filter_clause(fb_all) + q_a = _tpr_query.format(view=_flat_view(i), filter_clause=fc_a) + dfa = con.execute(q_a).df() + label_union |= set(dfa["label"].astype(str)) + cats = sorted(label_union) + if not cats: + st.info("No TP rate data for any distance range with current filters.") + else: + if len(cats) > 16: + st.caption("Spider charts work best with ≤16 labels; many classes may look crowded.") + for row_start in range(0, len(TPR_COMPARE_SPIDER_RANGES), 3): + row_ranges = TPR_COMPARE_SPIDER_RANGES[row_start : row_start + 3] + cols = st.columns(len(row_ranges)) + for col, (max_r, cap_lbl) in zip(cols, row_ranges): + fb = {**filters_base, "max_eval_range": max_r} + dfs_slice = [] + for i in range(len(runs)): + fc = build_filter_clause(fb) + q = _tpr_query.format(view=_flat_view(i), filter_clause=fc) + dfi = con.execute(q).df() + dfi["run"] = run_labels_list[i] + dfs_slice.append(dfi) + df_slice = pd.concat(dfs_slice, ignore_index=True) + with col: + if df_slice.empty: + st.info(f"No data ({cap_lbl}).") + else: + fig = _tpr_spider_compare( + df_slice, + cats, + f"TP rate ({cap_lbl})", + run_labels_list, + height=360, + ) + st.plotly_chart(fig, width='stretch') + elif not df_tpr_all.empty: + title = f"Total TP rate within {max_eval_range} [m] by run" + if tpr_viz == "Grouped bar": + fig = px.bar( + df_tpr_all, + x="label", + y="tpr", + color="run", + barmode="group", + title=title, + labels={"tpr": "TP Rate", "label": "Label", "run": "Run"}, + color_discrete_sequence=RUN_COLORS, + ) + apply_chart_theme(fig) + fig.update_layout(yaxis_range=[0, 1.2]) + fig.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.2)") + st.plotly_chart(fig, width='stretch') + elif tpr_viz == "Heatmap (label × run)": + pivot = df_tpr_all.pivot_table(index="label", columns="run", values="tpr", aggfunc="first") + cols_present = [c for c in run_labels_list if c 
in pivot.columns] + if cols_present: + pivot = pivot[cols_present] + fig = px.imshow( + pivot, + labels=dict(x="Run", y="Label", color="TP rate"), + title=title, + color_continuous_scale="RdYlGn", + zmin=0, + zmax=1, + aspect="auto", + ) + apply_chart_theme(fig, height=max(360, 32 + 22 * len(pivot.index))) + fig.update_layout(xaxis_side="top") + st.plotly_chart(fig, width='stretch') + elif tpr_viz == "Line profile": + fig = px.line( + df_tpr_all, + x="label", + y="tpr", + color="run", + markers=True, + title=title, + labels={"tpr": "TP Rate", "label": "Label", "run": "Run"}, + color_discrete_sequence=RUN_COLORS, + ) + fig.update_traces(line=dict(width=2.5), marker=dict(size=8)) + apply_chart_theme(fig, height=400) + fig.update_layout(yaxis_range=[0, 1.15], xaxis_tickangle=-35, hovermode="x unified") + fig.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.2)") + st.plotly_chart(fig, width='stretch') + else: + st.info("No data available") + except Exception as e: + st.error(f"Error: {e}") + # ============================= + # Panel 5: Perception diff vs baseline A (compare mode only) + # ============================= + def _baobab_hierarchy_from_objects( + df_obj: pd.DataFrame, + change_type: str, + root_label: str, + max_scenarios: int, + max_frames: int, + ) -> pd.DataFrame: + """ + Build a leaf table for Plotly sunburst/treemap: root → scenario → frame → label. + Caps scenarios and frames per scenario; merges the rest into Other buckets. 
+ """ + if df_obj.empty or "change_type" not in df_obj.columns: + return pd.DataFrame() + sub = df_obj[df_obj["change_type"] == change_type].copy() + if sub.empty: + return pd.DataFrame() + sub["scenario_name"] = sub["scenario_name"].fillna("").astype(str).replace("", "(no scenario)") + sub["label"] = sub["label"].fillna("").astype(str).replace("", "(no label)") + sub["frame_key"] = ( + sub["t4dataset_id"].astype(str) + "|f" + sub["frame_index"].astype(str) + ) + leaf = ( + sub.groupby(["scenario_name", "frame_key", "label"], dropna=False) + .size() + .reset_index(name="n") + ) + if leaf.empty: + return pd.DataFrame() + ms = max(int(max_scenarios), 1) + mf = max(int(max_frames), 1) + scen_tot = leaf.groupby("scenario_name")["n"].sum().sort_values(ascending=False) + top_scen = set(scen_tot.head(ms).index) + leaf["scen_g"] = np.where( + leaf["scenario_name"].isin(top_scen), + leaf["scenario_name"], + "Other scenarios", + ) + parts = [] + for _, g in leaf.groupby("scen_g"): + fr_tot = g.groupby("frame_key")["n"].sum().sort_values(ascending=False) + top_fr = set(fr_tot.head(mf).index) + g2 = g.copy() + g2["fr_g"] = np.where(g2["frame_key"].isin(top_fr), g2["frame_key"], "Other frames") + agg = g2.groupby(["scen_g", "fr_g", "label"], as_index=False)["n"].sum() + parts.append(agg) + out = pd.concat(parts, ignore_index=True) + out["root"] = root_label + + def _frame_ring_label(fr_g: str, scen_g: str) -> str: + if fr_g == "Other frames" or str(fr_g) == "Other frames": + return "Other frames" + sfg = str(fr_g) + if "|f" not in sfg: + return sfg + fid = sfg.split("|f", 1)[-1] + if scen_g == "Other scenarios": + t4 = sfg.split("|f", 1)[0] + t4s = t4 if len(t4) <= 14 else ("…" + t4[-12:]) + return f"{t4s}|f{fid}" + return f"f{fid}" + + out["fr_display"] = out.apply( + lambda r: _frame_ring_label(r["fr_g"], r["scen_g"]), axis=1 + ) + return out + + + def _comparison_lens_treemap_df( + names: pd.Series, + improved: pd.Series, + degraded: pd.Series, + root_title: str, + ) -> 
pd.DataFrame: + """Rows for px.treemap path root → Improved|Degraded → item (area = n).""" + rows = [] + for i in range(len(names)): + nm = str(names.iloc[i]).strip() or "—" + if len(nm) > 72: + nm = nm[:69] + "…" + ip = float(improved.iloc[i]) if pd.notna(improved.iloc[i]) else 0.0 + dg = float(degraded.iloc[i]) if pd.notna(degraded.iloc[i]) else 0.0 + if ip > 0: + rows.append( + {"root": root_title, "side": "Improved", "item": nm, "n": ip} + ) + if dg > 0: + rows.append( + {"root": root_title, "side": "Degraded", "item": nm, "n": dg} + ) + return pd.DataFrame(rows) + + + def _plot_comparison_lens_treemap( + tdf: pd.DataFrame, + st_key: str, + title: str, + ) -> None: + if tdf is None or tdf.empty: + st.caption("_No data for this view._") + return + fig = px.treemap( + tdf, + path=["root", "side", "item"], + values="n", + color="side", + color_discrete_map={"Improved": IMPROVED_COLOR, "Degraded": DEGRADED_COLOR}, + ) + fig.update_traces( + textfont_size=12, + textinfo="label+value+percent parent", + hovertemplate=( + "%{label}
" + "GT objects: %{value:.0f}
" + "% of parent: %{percentParent}" ), - comp_gt AS ( - SELECT - t4dataset_id, - frame_index, - uuid AS gt_uuid, - COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_comp, - COALESCE(MAX(try_cast(suite_name AS VARCHAR)), '') AS suite_name, - COALESCE(MAX(try_cast(scenario_name AS VARCHAR)), '') AS scenario_name, - COALESCE(MAX(try_cast(t4dataset_name AS VARCHAR)), '') AS t4dataset_name - FROM {comp_flat} - WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL - AND {filter_clause_comp_p5} - GROUP BY 1,2,3 + marker_line_width=1.5, + marker_line_color="rgba(255,255,255,0.45)", + root_color="rgba(240,240,245,0.95)", + ) + _title_layout = {**PLOTLY_LAYOUT_THEME["title"], "text": title} + apply_chart_theme( + fig, + height=430, + margin=dict(t=20, l=2, r=2, b=2), + paper_bgcolor="rgba(0,0,0,0)", + title=_title_layout, + ) + st.plotly_chart(fig, width='stretch', key=st_key) + + + if not single_mode: + ds_dlog("section: Perception_diff_start") + st.divider() + st.markdown( + section_header_html( + "Perception diff (vs baseline A)", + "Per-GT-object comparison vs baseline A: degraded = was TP on A and FN on candidate; improved = was FN on A and TP on candidate. 
Hotspots prioritize regressions.", ), - joined AS ( + unsafe_allow_html=True, + ) + for idx in range(1, len(runs)): + lbl = run_labels_list[idx] + _pd_slot = st.empty() + _pd_slot.markdown(ds_spot_loading_markup(f"Perception diff · run {lbl}"), unsafe_allow_html=True) + try: + filter_clause_comp_p5 = build_filter_clause(filters_list[idx], enable_dist_h=False) + comp_flat = _flat_view(idx) + query = f""" + WITH base_gt AS ( + SELECT + t4dataset_id, + frame_index, + uuid AS gt_uuid, + COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_base, + COALESCE(MAX(try_cast(suite_name AS VARCHAR)), '') AS suite_name, + COALESCE(MAX(try_cast(scenario_name AS VARCHAR)), '') AS scenario_name, + COALESCE(MAX(try_cast(t4dataset_name AS VARCHAR)), '') AS t4dataset_name + FROM view_eval_flat + WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL + AND {filter_clause_base} + GROUP BY 1,2,3 + ), + comp_gt AS ( + SELECT + t4dataset_id, + frame_index, + uuid AS gt_uuid, + COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_comp, + COALESCE(MAX(try_cast(suite_name AS VARCHAR)), '') AS suite_name, + COALESCE(MAX(try_cast(scenario_name AS VARCHAR)), '') AS scenario_name, + COALESCE(MAX(try_cast(t4dataset_name AS VARCHAR)), '') AS t4dataset_name + FROM {comp_flat} + WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL + AND {filter_clause_comp_p5} + GROUP BY 1,2,3 + ), + joined AS ( + SELECT + COALESCE(CAST(b.t4dataset_id AS VARCHAR), CAST(c.t4dataset_id AS VARCHAR)) AS t4dataset_id, + COALESCE(CAST(b.frame_index AS VARCHAR), CAST(c.frame_index AS VARCHAR)) AS frame_index, + COALESCE(b.gt_uuid, c.gt_uuid) AS gt_uuid, + COALESCE(b.tp_base, FALSE) AS tp_base, + COALESCE(c.tp_comp, FALSE) AS tp_comp, + COALESCE(b.suite_name, c.suite_name, '') AS suite_name, + COALESCE(b.scenario_name, c.scenario_name, '') AS scenario_name, + COALESCE(b.t4dataset_name, c.t4dataset_name, '') AS t4dataset_name + FROM base_gt b + FULL OUTER JOIN comp_gt c + ON b.t4dataset_id = 
c.t4dataset_id + AND b.frame_index = c.frame_index + AND b.gt_uuid = c.gt_uuid + ) SELECT - COALESCE(CAST(b.t4dataset_id AS VARCHAR), CAST(c.t4dataset_id AS VARCHAR)) AS t4dataset_id, - COALESCE(CAST(b.frame_index AS VARCHAR), CAST(c.frame_index AS VARCHAR)) AS frame_index, - COALESCE(b.gt_uuid, c.gt_uuid) AS gt_uuid, - COALESCE(b.tp_base, FALSE) AS tp_base, - COALESCE(c.tp_comp, FALSE) AS tp_comp, - COALESCE(b.suite_name, c.suite_name, '') AS suite_name, - COALESCE(b.scenario_name, c.scenario_name, '') AS scenario_name, - COALESCE(b.t4dataset_name, c.t4dataset_name, '') AS t4dataset_name - FROM base_gt b - FULL OUTER JOIN comp_gt c - ON b.t4dataset_id = c.t4dataset_id - AND b.frame_index = c.frame_index - AND b.gt_uuid = c.gt_uuid - ) - SELECT - t4dataset_id, - CAST(COUNT(*) FILTER (WHERE TRUE) AS DOUBLE) AS total_gt, - CAST(COUNT(*) FILTER (WHERE NOT tp_base AND tp_comp) AS DOUBLE) AS improved_cnt, - CAST(COUNT(*) FILTER (WHERE tp_base AND NOT tp_comp) AS DOUBLE) AS degraded_cnt, - CAST(COUNT(*) FILTER (WHERE tp_base AND tp_comp) AS DOUBLE) AS both_tp_cnt, - CAST(COUNT(*) FILTER (WHERE NOT tp_base AND NOT tp_comp) AS DOUBLE) AS both_fn_cnt, - CAST(SUM((CASE WHEN tp_comp THEN 1 ELSE 0 END) - (CASE WHEN tp_base THEN 1 ELSE 0 END)) AS DOUBLE) AS net_tp_delta, - suite_name, - scenario_name, - t4dataset_name - FROM joined - GROUP BY t4dataset_id, suite_name, scenario_name, t4dataset_name - ORDER BY net_tp_delta DESC - """ - df_improved = con.execute(query).df() - if not df_improved.empty: - query_frame_p5 = f""" - WITH base_gt AS ( - SELECT - t4dataset_id, - frame_index, - uuid AS gt_uuid, - COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_base, - COALESCE(MAX(try_cast(suite_name AS VARCHAR)), '') AS suite_name, - COALESCE(MAX(try_cast(scenario_name AS VARCHAR)), '') AS scenario_name, - COALESCE(MAX(try_cast(t4dataset_name AS VARCHAR)), '') AS t4dataset_name - FROM view_eval_flat - WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL - AND 
{filter_clause_base} - GROUP BY 1, 2, 3 - ), - comp_gt AS ( + t4dataset_id, + CAST(COUNT(*) FILTER (WHERE TRUE) AS DOUBLE) AS total_gt, + CAST(COUNT(*) FILTER (WHERE NOT tp_base AND tp_comp) AS DOUBLE) AS improved_cnt, + CAST(COUNT(*) FILTER (WHERE tp_base AND NOT tp_comp) AS DOUBLE) AS degraded_cnt, + CAST(COUNT(*) FILTER (WHERE tp_base AND tp_comp) AS DOUBLE) AS both_tp_cnt, + CAST(COUNT(*) FILTER (WHERE NOT tp_base AND NOT tp_comp) AS DOUBLE) AS both_fn_cnt, + CAST(SUM((CASE WHEN tp_comp THEN 1 ELSE 0 END) - (CASE WHEN tp_base THEN 1 ELSE 0 END)) AS DOUBLE) AS net_tp_delta, + suite_name, + scenario_name, + t4dataset_name + FROM joined + GROUP BY t4dataset_id, suite_name, scenario_name, t4dataset_name + ORDER BY net_tp_delta DESC + """ + df_improved = con.execute(query).df() + if not df_improved.empty: + query_frame_p5 = f""" + WITH base_gt AS ( + SELECT + t4dataset_id, + frame_index, + uuid AS gt_uuid, + COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_base, + COALESCE(MAX(try_cast(suite_name AS VARCHAR)), '') AS suite_name, + COALESCE(MAX(try_cast(scenario_name AS VARCHAR)), '') AS scenario_name, + COALESCE(MAX(try_cast(t4dataset_name AS VARCHAR)), '') AS t4dataset_name + FROM view_eval_flat + WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL + AND {filter_clause_base} + GROUP BY 1, 2, 3 + ), + comp_gt AS ( + SELECT + t4dataset_id, + frame_index, + uuid AS gt_uuid, + COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_comp, + COALESCE(MAX(try_cast(suite_name AS VARCHAR)), '') AS suite_name, + COALESCE(MAX(try_cast(scenario_name AS VARCHAR)), '') AS scenario_name, + COALESCE(MAX(try_cast(t4dataset_name AS VARCHAR)), '') AS t4dataset_name + FROM {comp_flat} + WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL + AND {filter_clause_comp_p5} + GROUP BY 1, 2, 3 + ), + joined AS ( + SELECT + COALESCE(CAST(b.t4dataset_id AS VARCHAR), CAST(c.t4dataset_id AS VARCHAR)) AS t4dataset_id, + COALESCE(CAST(b.frame_index AS VARCHAR), 
CAST(c.frame_index AS VARCHAR)) AS frame_index, + COALESCE(b.gt_uuid, c.gt_uuid) AS gt_uuid, + COALESCE(b.tp_base, FALSE) AS tp_base, + COALESCE(c.tp_comp, FALSE) AS tp_comp, + COALESCE(b.suite_name, c.suite_name, '') AS suite_name, + COALESCE(b.scenario_name, c.scenario_name, '') AS scenario_name, + COALESCE(b.t4dataset_name, c.t4dataset_name, '') AS t4dataset_name + FROM base_gt b + FULL OUTER JOIN comp_gt c + ON b.t4dataset_id = c.t4dataset_id + AND b.frame_index = c.frame_index + AND b.gt_uuid = c.gt_uuid + ) SELECT t4dataset_id, frame_index, - uuid AS gt_uuid, - COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_comp, - COALESCE(MAX(try_cast(suite_name AS VARCHAR)), '') AS suite_name, - COALESCE(MAX(try_cast(scenario_name AS VARCHAR)), '') AS scenario_name, - COALESCE(MAX(try_cast(t4dataset_name AS VARCHAR)), '') AS t4dataset_name - FROM {comp_flat} - WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL - AND {filter_clause_comp_p5} - GROUP BY 1, 2, 3 - ), - joined AS ( + scenario_name, + suite_name, + t4dataset_name, + CAST(COUNT(*) FILTER (WHERE TRUE) AS DOUBLE) AS total_gt, + CAST(COUNT(*) FILTER (WHERE NOT tp_base AND tp_comp) AS DOUBLE) AS improved_cnt, + CAST(COUNT(*) FILTER (WHERE tp_base AND NOT tp_comp) AS DOUBLE) AS degraded_cnt, + CAST(COUNT(*) FILTER (WHERE tp_base AND tp_comp) AS DOUBLE) AS both_tp_cnt, + CAST(COUNT(*) FILTER (WHERE NOT tp_base AND NOT tp_comp) AS DOUBLE) AS both_fn_cnt, + CAST(SUM((CASE WHEN tp_comp THEN 1 ELSE 0 END) - (CASE WHEN tp_base THEN 1 ELSE 0 END)) AS DOUBLE) AS net_tp_delta + FROM joined + GROUP BY t4dataset_id, frame_index, suite_name, scenario_name, t4dataset_name + ORDER BY net_tp_delta DESC + """ + query_object_p5 = f""" + WITH base_gt AS ( + SELECT + t4dataset_id, + frame_index, + uuid AS gt_uuid, + COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_base, + COALESCE(MAX(try_cast(suite_name AS VARCHAR)), '') AS suite_name, + COALESCE(MAX(try_cast(scenario_name AS VARCHAR)), '') AS scenario_name, + 
COALESCE(MAX(try_cast(t4dataset_name AS VARCHAR)), '') AS t4dataset_name + FROM view_eval_flat + WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL + AND {filter_clause_base} + GROUP BY 1, 2, 3 + ), + comp_gt AS ( + SELECT + t4dataset_id, + frame_index, + uuid AS gt_uuid, + COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_comp, + COALESCE(MAX(try_cast(suite_name AS VARCHAR)), '') AS suite_name, + COALESCE(MAX(try_cast(scenario_name AS VARCHAR)), '') AS scenario_name, + COALESCE(MAX(try_cast(t4dataset_name AS VARCHAR)), '') AS t4dataset_name + FROM {comp_flat} + WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL + AND {filter_clause_comp_p5} + GROUP BY 1, 2, 3 + ), + joined AS ( + SELECT + COALESCE(CAST(b.t4dataset_id AS VARCHAR), CAST(c.t4dataset_id AS VARCHAR)) AS t4dataset_id, + COALESCE(CAST(b.frame_index AS VARCHAR), CAST(c.frame_index AS VARCHAR)) AS frame_index, + COALESCE(b.gt_uuid, c.gt_uuid) AS gt_uuid, + COALESCE(b.tp_base, FALSE) AS tp_base, + COALESCE(c.tp_comp, FALSE) AS tp_comp, + COALESCE(b.suite_name, c.suite_name, '') AS suite_name, + COALESCE(b.scenario_name, c.scenario_name, '') AS scenario_name, + COALESCE(b.t4dataset_name, c.t4dataset_name, '') AS t4dataset_name + FROM base_gt b + FULL OUTER JOIN comp_gt c + ON b.t4dataset_id = c.t4dataset_id + AND b.frame_index = c.frame_index + AND b.gt_uuid = c.gt_uuid + ), + obj_attrs AS ( + SELECT + t4dataset_id, + frame_index, + uuid, + MAX(CAST(label AS VARCHAR)) AS label, + MAX(dist_h) AS dist_h + FROM view_eval_flat + WHERE source = 'GT' + GROUP BY 1, 2, 3 + ) SELECT - COALESCE(CAST(b.t4dataset_id AS VARCHAR), CAST(c.t4dataset_id AS VARCHAR)) AS t4dataset_id, - COALESCE(CAST(b.frame_index AS VARCHAR), CAST(c.frame_index AS VARCHAR)) AS frame_index, - COALESCE(b.gt_uuid, c.gt_uuid) AS gt_uuid, - COALESCE(b.tp_base, FALSE) AS tp_base, - COALESCE(c.tp_comp, FALSE) AS tp_comp, - COALESCE(b.suite_name, c.suite_name, '') AS suite_name, - COALESCE(b.scenario_name, 
c.scenario_name, '') AS scenario_name, - COALESCE(b.t4dataset_name, c.t4dataset_name, '') AS t4dataset_name - FROM base_gt b - FULL OUTER JOIN comp_gt c - ON b.t4dataset_id = c.t4dataset_id - AND b.frame_index = c.frame_index - AND b.gt_uuid = c.gt_uuid + j.t4dataset_id, + j.frame_index, + j.gt_uuid, + COALESCE(e.label, '') AS label, + COALESCE(e.dist_h, 0.0) AS dist_h, + {_DIST_BIN_CASE.replace("dist_h", "COALESCE(e.dist_h, 0.0)")} AS distance_bin, + j.suite_name, + j.scenario_name, + j.t4dataset_name, + CASE + WHEN NOT j.tp_base AND j.tp_comp THEN 'improved' + WHEN j.tp_base AND NOT j.tp_comp THEN 'degraded' + WHEN j.tp_base AND j.tp_comp THEN 'both_tp' + ELSE 'both_fn' + END AS change_type, + j.tp_base, + j.tp_comp + FROM joined j + LEFT JOIN obj_attrs e + ON CAST(j.t4dataset_id AS VARCHAR) = CAST(e.t4dataset_id AS VARCHAR) + AND j.frame_index = CAST(e.frame_index AS VARCHAR) + AND j.gt_uuid = e.uuid + ORDER BY change_type, j.t4dataset_id, j.frame_index + """ + try: + df_by_frame = con.execute(query_frame_p5).df() + except Exception: + df_by_frame = pd.DataFrame() + try: + df_by_object_full = con.execute(query_object_p5).df() + except Exception: + df_by_object_full = pd.DataFrame() + + tot_imp = float(df_improved["improved_cnt"].sum()) + tot_deg = float(df_improved["degraded_cnt"].sum()) + tot_net = tot_imp - tot_deg + net_s = f"+{int(tot_net)}" if tot_net > 0 else str(int(tot_net)) + + with st.expander(f"Run {lbl} vs A", expanded=(len(runs) == 2)): + c1, c2, c3, c4 = st.columns(4) + c1.metric("Improved (FN→TP)", int(tot_imp)) + c2.metric("Degraded (TP→FN)", int(tot_deg)) + c3.metric("Net TP delta", net_s) + c4.caption("Start with scenarios and frames with the most **degraded** counts.") + st.markdown( + f"**Summary:** Net **{net_s}** TP vs baseline A — " + f"**{int(tot_deg)}** degraded vs **{int(tot_imp)}** improved." 
) - SELECT - t4dataset_id, - frame_index, - scenario_name, - suite_name, - t4dataset_name, - CAST(COUNT(*) FILTER (WHERE TRUE) AS DOUBLE) AS total_gt, - CAST(COUNT(*) FILTER (WHERE NOT tp_base AND tp_comp) AS DOUBLE) AS improved_cnt, - CAST(COUNT(*) FILTER (WHERE tp_base AND NOT tp_comp) AS DOUBLE) AS degraded_cnt, - CAST(COUNT(*) FILTER (WHERE tp_base AND tp_comp) AS DOUBLE) AS both_tp_cnt, - CAST(COUNT(*) FILTER (WHERE NOT tp_base AND NOT tp_comp) AS DOUBLE) AS both_fn_cnt, - CAST(SUM((CASE WHEN tp_comp THEN 1 ELSE 0 END) - (CASE WHEN tp_base THEN 1 ELSE 0 END)) AS DOUBLE) AS net_tp_delta - FROM joined - GROUP BY t4dataset_id, frame_index, suite_name, scenario_name, t4dataset_name - ORDER BY net_tp_delta DESC - """ - query_object_p5 = f""" + + b_key = f"p5_baobab_{lbl}_{idx}" + c1b, c2b, c3b = st.columns([1, 1, 1]) + with c1b: + baobab_viz = st.radio( + "Chart type", + ["Sunburst", "Treemap"], + horizontal=True, + key=f"{b_key}_viz", + ) + with c2b: + baobab_ns = st.slider( + "Max scenarios", + min_value=5, + max_value=25, + value=15, + key=f"{b_key}_ns", + ) + with c3b: + baobab_nf = st.slider( + "Max frames / scenario", + min_value=5, + max_value=20, + value=10, + key=f"{b_key}_nf", + ) + if df_by_object_full.empty: + st.caption("No object-level rows for hierarchy.") + else: + path_cols = ["root", "scen_g", "fr_display", "label"] + h_imp = _baobab_hierarchy_from_objects( + df_by_object_full, + "improved", + f"Improved ({lbl} vs A)", + baobab_ns, + baobab_nf, + ) + h_deg = _baobab_hierarchy_from_objects( + df_by_object_full, + "degraded", + f"Degraded ({lbl} vs A)", + baobab_ns, + baobab_nf, + ) + pair_both = (not h_imp.empty) and (not h_deg.empty) + plot_entries = [] + for ct, hdf, cmap in ( + ("improved", h_imp, IMPROVED_SCALE), + ("degraded", h_deg, DEGRADED_SCALE), + ): + if hdf.empty: + plot_entries.append((ct, None)) + continue + title = f"{baobab_viz}: {ct} (n = {int(hdf['n'].sum())} GT objects)" + if baobab_viz == "Sunburst": + fig_b = px.sunburst( + 
hdf, + path=path_cols, + values="n", + color="n", + color_continuous_scale=cmap, + title=title, + ) + h_sb = 480 if pair_both else 620 + apply_chart_theme(fig_b, height=h_sb, margin=dict(t=36, l=4, r=4, b=4)) + else: + fig_b = px.treemap( + hdf, + path=path_cols, + values="n", + color="n", + color_continuous_scale=cmap, + title=title, + ) + h_tr = 440 if pair_both else 520 + apply_chart_theme(fig_b, height=h_tr, margin=dict(t=40, l=4, r=4, b=4)) + plot_entries.append((ct, fig_b)) + + two_up = ( + len(plot_entries) == 2 + and plot_entries[0][1] is not None + and plot_entries[1][1] is not None + ) + if two_up: + bc1, bc2 = st.columns(2, gap="small") + with bc1: + st.plotly_chart( + plot_entries[0][1], + width='stretch', + key=f"{b_key}_fig_{plot_entries[0][0]}", + ) + with bc2: + st.plotly_chart( + plot_entries[1][1], + width='stretch', + key=f"{b_key}_fig_{plot_entries[1][0]}", + ) + else: + for ct, fig_b in plot_entries: + if fig_b is not None: + st.plotly_chart( + fig_b, + width='stretch', + key=f"{b_key}_fig_{ct}", + ) + else: + st.caption(f"No **{ct}** objects to chart.") + + # --- Comparison lens: label / scenario / frame (treemap trio, Baobab-aligned) --- + query_label = f""" WITH base_gt AS ( SELECT t4dataset_id, frame_index, uuid AS gt_uuid, - COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_base, - COALESCE(MAX(try_cast(suite_name AS VARCHAR)), '') AS suite_name, - COALESCE(MAX(try_cast(scenario_name AS VARCHAR)), '') AS scenario_name, - COALESCE(MAX(try_cast(t4dataset_name AS VARCHAR)), '') AS t4dataset_name + COALESCE(MAX(try_cast(label AS VARCHAR)), '') AS label, + COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_base FROM view_eval_flat WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL AND {filter_clause_base} @@ -1984,10 +2340,8 @@ def _plot_comparison_lens_treemap( t4dataset_id, frame_index, uuid AS gt_uuid, - COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_comp, - COALESCE(MAX(try_cast(suite_name AS VARCHAR)), '') AS suite_name, - 
COALESCE(MAX(try_cast(scenario_name AS VARCHAR)), '') AS scenario_name, - COALESCE(MAX(try_cast(t4dataset_name AS VARCHAR)), '') AS t4dataset_name + COALESCE(MAX(try_cast(label AS VARCHAR)), '') AS label, + COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_comp FROM {comp_flat} WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL AND {filter_clause_comp_p5} @@ -1995,853 +2349,659 @@ def _plot_comparison_lens_treemap( ), joined AS ( SELECT - COALESCE(CAST(b.t4dataset_id AS VARCHAR), CAST(c.t4dataset_id AS VARCHAR)) AS t4dataset_id, - COALESCE(CAST(b.frame_index AS VARCHAR), CAST(c.frame_index AS VARCHAR)) AS frame_index, - COALESCE(b.gt_uuid, c.gt_uuid) AS gt_uuid, + COALESCE(b.label, c.label) AS label, COALESCE(b.tp_base, FALSE) AS tp_base, - COALESCE(c.tp_comp, FALSE) AS tp_comp, - COALESCE(b.suite_name, c.suite_name, '') AS suite_name, - COALESCE(b.scenario_name, c.scenario_name, '') AS scenario_name, - COALESCE(b.t4dataset_name, c.t4dataset_name, '') AS t4dataset_name + COALESCE(c.tp_comp, FALSE) AS tp_comp FROM base_gt b FULL OUTER JOIN comp_gt c - ON b.t4dataset_id = c.t4dataset_id - AND b.frame_index = c.frame_index - AND b.gt_uuid = c.gt_uuid - ), - obj_attrs AS ( - SELECT - t4dataset_id, - frame_index, - uuid, - MAX(CAST(label AS VARCHAR)) AS label, - MAX(dist_h) AS dist_h - FROM view_eval_flat - WHERE source = 'GT' - GROUP BY 1, 2, 3 + ON b.t4dataset_id = c.t4dataset_id + AND b.frame_index = c.frame_index + AND b.gt_uuid = c.gt_uuid ) SELECT - j.t4dataset_id, - j.frame_index, - j.gt_uuid, - COALESCE(e.label, '') AS label, - COALESCE(e.dist_h, 0.0) AS dist_h, - {_DIST_BIN_CASE.replace("dist_h", "COALESCE(e.dist_h, 0.0)")} AS distance_bin, - j.suite_name, - j.scenario_name, - j.t4dataset_name, - CASE - WHEN NOT j.tp_base AND j.tp_comp THEN 'improved' - WHEN j.tp_base AND NOT j.tp_comp THEN 'degraded' - WHEN j.tp_base AND j.tp_comp THEN 'both_tp' - ELSE 'both_fn' - END AS change_type, - j.tp_base, - j.tp_comp - FROM joined j - LEFT JOIN 
obj_attrs e - ON CAST(j.t4dataset_id AS VARCHAR) = CAST(e.t4dataset_id AS VARCHAR) - AND j.frame_index = CAST(e.frame_index AS VARCHAR) - AND j.gt_uuid = e.uuid - ORDER BY change_type, j.t4dataset_id, j.frame_index + label, + CAST(COUNT(*) FILTER (WHERE TRUE) AS DOUBLE) AS total_gt, + CAST(COUNT(*) FILTER (WHERE NOT tp_base AND tp_comp) AS DOUBLE) AS improved_cnt, + CAST(COUNT(*) FILTER (WHERE tp_base AND NOT tp_comp) AS DOUBLE) AS degraded_cnt, + CAST(COUNT(*) FILTER (WHERE tp_base AND tp_comp) AS DOUBLE) AS both_tp_cnt, + CAST(COUNT(*) FILTER (WHERE NOT tp_base AND NOT tp_comp) AS DOUBLE) AS both_fn_cnt, + CAST(SUM((CASE WHEN tp_comp THEN 1 ELSE 0 END) - (CASE WHEN tp_base THEN 1 ELSE 0 END)) AS DOUBLE) AS net_tp_delta + FROM joined + GROUP BY label + ORDER BY net_tp_delta DESC """ - try: - df_by_frame = con.execute(query_frame_p5).df() - except Exception: - df_by_frame = pd.DataFrame() - try: - df_by_object_full = con.execute(query_object_p5).df() - except Exception: - df_by_object_full = pd.DataFrame() - - tot_imp = float(df_improved["improved_cnt"].sum()) - tot_deg = float(df_improved["degraded_cnt"].sum()) - tot_net = tot_imp - tot_deg - net_s = f"+{int(tot_net)}" if tot_net > 0 else str(int(tot_net)) - - with st.expander(f"Run {lbl} vs A", expanded=(len(runs) == 2)): - c1, c2, c3, c4 = st.columns(4) - c1.metric("Improved (FN→TP)", int(tot_imp)) - c2.metric("Degraded (TP→FN)", int(tot_deg)) - c3.metric("Net TP delta", net_s) - c4.caption("Start with scenarios and frames with the most **degraded** counts.") - st.markdown( - f"**Summary:** Net **{net_s}** TP vs baseline A — " - f"**{int(tot_deg)}** degraded vs **{int(tot_imp)}** improved." 
- ) - - b_key = f"p5_baobab_{lbl}_{idx}" - c1b, c2b, c3b = st.columns([1, 1, 1]) - with c1b: - baobab_viz = st.radio( - "Chart type", - ["Sunburst", "Treemap"], - horizontal=True, - key=f"{b_key}_viz", - ) - with c2b: - baobab_ns = st.slider( - "Max scenarios", - min_value=5, - max_value=25, - value=15, - key=f"{b_key}_ns", - ) - with c3b: - baobab_nf = st.slider( - "Max frames / scenario", - min_value=5, - max_value=20, - value=10, - key=f"{b_key}_nf", - ) - if df_by_object_full.empty: - st.caption("No object-level rows for hierarchy.") - else: - path_cols = ["root", "scen_g", "fr_display", "label"] - h_imp = _baobab_hierarchy_from_objects( - df_by_object_full, - "improved", - f"Improved ({lbl} vs A)", - baobab_ns, - baobab_nf, - ) - h_deg = _baobab_hierarchy_from_objects( - df_by_object_full, - "degraded", - f"Degraded ({lbl} vs A)", - baobab_ns, - baobab_nf, - ) - pair_both = (not h_imp.empty) and (not h_deg.empty) - plot_entries = [] - for ct, hdf, cmap in ( - ("improved", h_imp, IMPROVED_SCALE), - ("degraded", h_deg, DEGRADED_SCALE), - ): - if hdf.empty: - plot_entries.append((ct, None)) - continue - title = f"{baobab_viz}: {ct} (n = {int(hdf['n'].sum())} GT objects)" - if baobab_viz == "Sunburst": - fig_b = px.sunburst( - hdf, - path=path_cols, - values="n", - color="n", - color_continuous_scale=cmap, - title=title, + df_by_label = pd.DataFrame() + try: + df_by_label = con.execute(query_label).df() + except Exception as e_label: + st.caption(f"Label query: {e_label}") + + scen_agg = pd.DataFrame() + if not df_improved.empty: + scen_agg = ( + df_improved.groupby("scenario_name", dropna=False) + .agg( + improved_cnt=("improved_cnt", "sum"), + degraded_cnt=("degraded_cnt", "sum"), + ) + .reset_index() + ) + scen_agg = scen_agg.sort_values( + by=["degraded_cnt", "improved_cnt"], + ascending=[False, True], + ) + + df_frame_sorted = pd.DataFrame() + if not df_by_frame.empty: + df_frame_sorted = df_by_frame.sort_values( + by=["degraded_cnt", "improved_cnt"], + 
ascending=[False, True], + ).reset_index(drop=True) + + root_lens = f"{lbl} vs A" + lc1, lc2, lc3 = st.columns(3, gap="small") + with lc1: + if not df_by_label.empty: + tdf_l = _comparison_lens_treemap_df( + df_by_label["label"], + df_by_label["improved_cnt"], + df_by_label["degraded_cnt"], + root_lens, + ) + _plot_comparison_lens_treemap( + tdf_l, + f"p5_lens_lab_{lbl}_{idx}", + "By class", ) - h_sb = 480 if pair_both else 620 - apply_chart_theme(fig_b, height=h_sb, margin=dict(t=36, l=4, r=4, b=4)) else: - fig_b = px.treemap( - hdf, - path=path_cols, - values="n", - color="n", - color_continuous_scale=cmap, - title=title, + st.caption("_No label data._") + with lc2: + if not scen_agg.empty: + tdf_s = _comparison_lens_treemap_df( + scen_agg["scenario_name"].astype(str), + scen_agg["improved_cnt"], + scen_agg["degraded_cnt"], + root_lens, ) - h_tr = 440 if pair_both else 520 - apply_chart_theme(fig_b, height=h_tr, margin=dict(t=40, l=4, r=4, b=4)) - plot_entries.append((ct, fig_b)) - - two_up = ( - len(plot_entries) == 2 - and plot_entries[0][1] is not None - and plot_entries[1][1] is not None - ) - if two_up: - bc1, bc2 = st.columns(2, gap="small") - with bc1: - st.plotly_chart( - plot_entries[0][1], + _plot_comparison_lens_treemap( + tdf_s, + f"p5_lens_scen_{lbl}_{idx}", + "By scenario", + ) + else: + st.caption("_No scenario data._") + with lc3: + if not df_frame_sorted.empty: + fr_cap = 36 + fr_top = df_frame_sorted.head(fr_cap).copy() + nms = ( + fr_top["scenario_name"].astype(str).str.slice(0, 26) + + "\n· f" + + fr_top["frame_index"].astype(str) + ).tolist() + ims = fr_top["improved_cnt"].astype(float).tolist() + dgs = fr_top["degraded_cnt"].astype(float).tolist() + rest = df_frame_sorted.iloc[fr_cap:] + if not rest.empty: + io = float(rest["improved_cnt"].sum()) + do = float(rest["degraded_cnt"].sum()) + if io > 0 or do > 0: + nms.append( + f"Other frames\n({len(rest)} frames)" + ) + ims.append(io) + dgs.append(do) + tdf_f = _comparison_lens_treemap_df( + 
pd.Series(nms), + pd.Series(ims), + pd.Series(dgs), + root_lens, + ) + _plot_comparison_lens_treemap( + tdf_f, + f"p5_lens_fr_{lbl}_{idx}", + "By frame", + ) + st.caption( + f"Top **{fr_cap}** frames by degraded, plus **Other frames** " + f"so totals match **By class** / **By scenario**." + ) + else: + st.caption("_No frame data._") + + with st.expander("Tables behind the lens (label / scenario / frame)"): + if not df_by_label.empty: + st.markdown("**Per label**") + st.dataframe( + df_by_label, width='stretch', - key=f"{b_key}_fig_{plot_entries[0][0]}", + hide_index=True, ) - with bc2: - st.plotly_chart( - plot_entries[1][1], + if not scen_agg.empty: + st.markdown("**Per scenario**") + st.dataframe(scen_agg, width='stretch', hide_index=True) + if not df_frame_sorted.empty: + st.markdown("**Per frame** (sorted by degraded)") + st.dataframe( + df_frame_sorted.head(200), width='stretch', - key=f"{b_key}_fig_{plot_entries[1][0]}", + hide_index=True, ) - else: - for ct, fig_b in plot_entries: - if fig_b is not None: - st.plotly_chart( - fig_b, - width='stretch', - key=f"{b_key}_fig_{ct}", - ) - else: - st.caption(f"No **{ct}** objects to chart.") - - # --- Comparison lens: label / scenario / frame (treemap trio, Baobab-aligned) --- - query_label = f""" - WITH base_gt AS ( - SELECT - t4dataset_id, - frame_index, - uuid AS gt_uuid, - COALESCE(MAX(try_cast(label AS VARCHAR)), '') AS label, - COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_base - FROM view_eval_flat - WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL - AND {filter_clause_base} - GROUP BY 1, 2, 3 - ), - comp_gt AS ( - SELECT - t4dataset_id, - frame_index, - uuid AS gt_uuid, - COALESCE(MAX(try_cast(label AS VARCHAR)), '') AS label, - COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_comp - FROM {comp_flat} - WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL - AND {filter_clause_comp_p5} - GROUP BY 1, 2, 3 - ), - joined AS ( - SELECT - COALESCE(b.label, c.label) AS label, - 
COALESCE(b.tp_base, FALSE) AS tp_base, - COALESCE(c.tp_comp, FALSE) AS tp_comp - FROM base_gt b - FULL OUTER JOIN comp_gt c - ON b.t4dataset_id = c.t4dataset_id - AND b.frame_index = c.frame_index - AND b.gt_uuid = c.gt_uuid - ) - SELECT - label, - CAST(COUNT(*) FILTER (WHERE TRUE) AS DOUBLE) AS total_gt, - CAST(COUNT(*) FILTER (WHERE NOT tp_base AND tp_comp) AS DOUBLE) AS improved_cnt, - CAST(COUNT(*) FILTER (WHERE tp_base AND NOT tp_comp) AS DOUBLE) AS degraded_cnt, - CAST(COUNT(*) FILTER (WHERE tp_base AND tp_comp) AS DOUBLE) AS both_tp_cnt, - CAST(COUNT(*) FILTER (WHERE NOT tp_base AND NOT tp_comp) AS DOUBLE) AS both_fn_cnt, - CAST(SUM((CASE WHEN tp_comp THEN 1 ELSE 0 END) - (CASE WHEN tp_base THEN 1 ELSE 0 END)) AS DOUBLE) AS net_tp_delta - FROM joined - GROUP BY label - ORDER BY net_tp_delta DESC - """ - df_by_label = pd.DataFrame() - try: - df_by_label = con.execute(query_label).df() - except Exception as e_label: - st.caption(f"Label query: {e_label}") - - scen_agg = pd.DataFrame() - if not df_improved.empty: - scen_agg = ( - df_improved.groupby("scenario_name", dropna=False) - .agg( - improved_cnt=("improved_cnt", "sum"), - degraded_cnt=("degraded_cnt", "sum"), + + with st.expander("Full dataset breakdown (per t4dataset_id row)"): + st.dataframe(df_improved, width='stretch', hide_index=True) + + # --- Drill-down: filters + objects --- + with st.expander("Drill-down: objects"): + scen_key = f"p5_scen_{lbl}_{idx}" + t4_key = f"p5_t4_{lbl}_{idx}" + lab_key = f"p5_lab_{lbl}_{idx}" + for k, default in ((scen_key, []), (t4_key, []), (lab_key, [])): + if k not in st.session_state: + st.session_state[k] = default + + scenarios_all = sorted( + df_improved["scenario_name"].dropna().astype(str).unique().tolist() ) - .reset_index() - ) - scen_agg = scen_agg.sort_values( - by=["degraded_cnt", "improved_cnt"], - ascending=[False, True], - ) - - df_frame_sorted = pd.DataFrame() - if not df_by_frame.empty: - df_frame_sorted = df_by_frame.sort_values( - by=["degraded_cnt", 
"improved_cnt"], - ascending=[False, True], - ).reset_index(drop=True) - - root_lens = f"{lbl} vs A" - lc1, lc2, lc3 = st.columns(3, gap="small") - with lc1: - if not df_by_label.empty: - tdf_l = _comparison_lens_treemap_df( - df_by_label["label"], - df_by_label["improved_cnt"], - df_by_label["degraded_cnt"], - root_lens, + t4_all = sorted( + df_improved["t4dataset_name"].dropna().astype(str).unique().tolist() ) - _plot_comparison_lens_treemap( - tdf_l, - f"p5_lens_lab_{lbl}_{idx}", - "By class", + labels_all = ( + sorted(df_by_object_full["label"].dropna().astype(str).unique().tolist()) + if not df_by_object_full.empty + else [] ) - else: - st.caption("_No label data._") - with lc2: - if not scen_agg.empty: - tdf_s = _comparison_lens_treemap_df( - scen_agg["scenario_name"].astype(str), - scen_agg["improved_cnt"], - scen_agg["degraded_cnt"], - root_lens, + # Keep prior picks valid so Streamlit does not reset widgets when options refresh + scenarios_opts = sorted( + set(scenarios_all) | set(st.session_state.get(scen_key, []) or []) ) - _plot_comparison_lens_treemap( - tdf_s, - f"p5_lens_scen_{lbl}_{idx}", - "By scenario", + t4_opts = sorted(set(t4_all) | set(st.session_state.get(t4_key, []) or [])) + labels_opts = sorted( + set(labels_all) | set(st.session_state.get(lab_key, []) or []) ) - else: - st.caption("_No scenario data._") - with lc3: - if not df_frame_sorted.empty: - fr_cap = 36 - fr_top = df_frame_sorted.head(fr_cap).copy() - nms = ( - fr_top["scenario_name"].astype(str).str.slice(0, 26) - + "\n· f" - + fr_top["frame_index"].astype(str) - ).tolist() - ims = fr_top["improved_cnt"].astype(float).tolist() - dgs = fr_top["degraded_cnt"].astype(float).tolist() - rest = df_frame_sorted.iloc[fr_cap:] - if not rest.empty: - io = float(rest["improved_cnt"].sum()) - do = float(rest["degraded_cnt"].sum()) - if io > 0 or do > 0: - nms.append( - f"Other frames\n({len(rest)} frames)" + + pr1, pr2 = st.columns(2) + with pr1: + if st.button( + "Preset: top 5 degraded 
scenarios", + key=f"p5_pre_scen_{lbl}_{idx}", + ): + if not df_improved.empty: + sa = ( + df_improved.groupby("scenario_name", dropna=False)[ + "degraded_cnt" + ] + .sum() + .sort_values(ascending=False) + .head(5) + ) + st.session_state[scen_key] = [ + str(x) for x in sa.index.tolist() + ] + st.rerun() + fr_multiselect_key = f"p5_frkeys_{lbl}_{idx}" + if fr_multiselect_key not in st.session_state: + st.session_state[fr_multiselect_key] = [] + frame_key_labels = {} + if not df_frame_sorted.empty: + for _, rw in df_frame_sorted.head(40).iterrows(): + fk = f"{rw['t4dataset_id']}|{rw['frame_index']}" + # Use scenario_name (not suite_name) for frame option labels + frame_key_labels[fk] = ( + f"{str(rw.get('scenario_name', ''))[:36]} | " + f"f{rw['frame_index']} | deg {int(rw['degraded_cnt'])}" ) - ims.append(io) - dgs.append(do) - tdf_f = _comparison_lens_treemap_df( - pd.Series(nms), - pd.Series(ims), - pd.Series(dgs), - root_lens, - ) - _plot_comparison_lens_treemap( - tdf_f, - f"p5_lens_fr_{lbl}_{idx}", - "By frame", - ) - st.caption( - f"Top **{fr_cap}** frames by degraded, plus **Other frames** " - f"so totals match **By class** / **By scenario**." 
+ with pr2: + if st.button( + "Preset: top 10 degraded frames (object filter)", + key=f"p5_pre_fr_{lbl}_{idx}", + ): + if frame_key_labels: + topk = list(frame_key_labels.keys())[:10] + st.session_state[fr_multiselect_key] = topk + st.rerun() + + colf1, colf2, colf3 = st.columns(3) + with colf1: + if scenarios_opts: + st.multiselect( + "Filter scenario_name", + scenarios_opts, + key=scen_key, + ) + else: + st.caption("No scenarios.") + with colf2: + if t4_opts: + st.multiselect( + "Filter t4dataset_name", + t4_opts, + key=t4_key, + ) + else: + st.caption("No t4dataset_name.") + with colf3: + if labels_opts: + st.multiselect( + "Filter label", + labels_opts, + key=lab_key, + ) + else: + st.caption("No labels.") + + prev_fr = st.session_state.get(fr_multiselect_key) or [] + base_frame_keys = list(frame_key_labels.keys()) + for k in prev_fr: + if k not in frame_key_labels: + frame_key_labels[k] = f"(selected) frame {str(k).split('|')[-1]}" + frame_opts_keys = base_frame_keys + [ + k for k in prev_fr if k not in base_frame_keys + ] + if frame_opts_keys: + st.multiselect( + "Limit objects to frames (optional)", + options=frame_opts_keys, + format_func=lambda k: frame_key_labels.get(k, k), + key=fr_multiselect_key, + ) + + change_type_filter = st.selectbox( + "Change type", + ["degraded", "improved", "all", "both_tp", "both_fn"], + key=f"change_type_{lbl}_{idx}", + help="Filter objects by TP change between runs.", ) - else: - st.caption("_No frame data._") - - with st.expander("Tables behind the lens (label / scenario / frame)"): - if not df_by_label.empty: - st.markdown("**Per label**") - st.dataframe( - df_by_label, - width='stretch', - hide_index=True, + sort_obj = st.selectbox( + "Sort objects by", + [ + "degraded_priority_then_dist", + "frame_then_uuid", + "label_then_dist", + ], + key=f"p5_sort_{lbl}_{idx}", ) - if not scen_agg.empty: - st.markdown("**Per scenario**") - st.dataframe(scen_agg, width='stretch', hide_index=True) - if not df_frame_sorted.empty: - 
st.markdown("**Per frame** (sorted by degraded)") - st.dataframe( - df_frame_sorted.head(200), - width='stretch', - hide_index=True, + + df_obj_show = ( + df_by_object_full.copy() + if not df_by_object_full.empty + else pd.DataFrame() ) - - with st.expander("Full dataset breakdown (per t4dataset_id row)"): - st.dataframe(df_improved, width='stretch', hide_index=True) - - # --- Drill-down: filters + objects --- - with st.expander("Drill-down: objects"): - scen_key = f"p5_scen_{lbl}_{idx}" - t4_key = f"p5_t4_{lbl}_{idx}" - lab_key = f"p5_lab_{lbl}_{idx}" - for k, default in ((scen_key, []), (t4_key, []), (lab_key, [])): - if k not in st.session_state: - st.session_state[k] = default - - scenarios_all = sorted( - df_improved["scenario_name"].dropna().astype(str).unique().tolist() - ) - t4_all = sorted( - df_improved["t4dataset_name"].dropna().astype(str).unique().tolist() - ) - labels_all = ( - sorted(df_by_object_full["label"].dropna().astype(str).unique().tolist()) - if not df_by_object_full.empty - else [] - ) - # Keep prior picks valid so Streamlit does not reset widgets when options refresh - scenarios_opts = sorted( - set(scenarios_all) | set(st.session_state.get(scen_key, []) or []) - ) - t4_opts = sorted(set(t4_all) | set(st.session_state.get(t4_key, []) or [])) - labels_opts = sorted( - set(labels_all) | set(st.session_state.get(lab_key, []) or []) - ) - - pr1, pr2 = st.columns(2) - with pr1: - if st.button( - "Preset: top 5 degraded scenarios", - key=f"p5_pre_scen_{lbl}_{idx}", - ): - if not df_improved.empty: - sa = ( - df_improved.groupby("scenario_name", dropna=False)[ - "degraded_cnt" - ] - .sum() - .sort_values(ascending=False) - .head(5) - ) - st.session_state[scen_key] = [ - str(x) for x in sa.index.tolist() + if not df_obj_show.empty: + ss = st.session_state.get(scen_key) or [] + if ss: + df_obj_show = df_obj_show[ + df_obj_show["scenario_name"].astype(str).isin(ss) ] - st.rerun() - fr_multiselect_key = f"p5_frkeys_{lbl}_{idx}" - if 
fr_multiselect_key not in st.session_state: - st.session_state[fr_multiselect_key] = [] - frame_key_labels = {} - if not df_frame_sorted.empty: - for _, rw in df_frame_sorted.head(40).iterrows(): - fk = f"{rw['t4dataset_id']}|{rw['frame_index']}" - # Use scenario_name (not suite_name) for frame option labels - frame_key_labels[fk] = ( - f"{str(rw.get('scenario_name', ''))[:36]} | " - f"f{rw['frame_index']} | deg {int(rw['degraded_cnt'])}" - ) - with pr2: - if st.button( - "Preset: top 10 degraded frames (object filter)", - key=f"p5_pre_fr_{lbl}_{idx}", - ): - if frame_key_labels: - topk = list(frame_key_labels.keys())[:10] - st.session_state[fr_multiselect_key] = topk - st.rerun() - - colf1, colf2, colf3 = st.columns(3) - with colf1: - if scenarios_opts: - st.multiselect( - "Filter scenario_name", - scenarios_opts, - key=scen_key, - ) - else: - st.caption("No scenarios.") - with colf2: - if t4_opts: - st.multiselect( - "Filter t4dataset_name", - t4_opts, - key=t4_key, - ) - else: - st.caption("No t4dataset_name.") - with colf3: - if labels_opts: - st.multiselect( - "Filter label", - labels_opts, - key=lab_key, - ) - else: - st.caption("No labels.") - - prev_fr = st.session_state.get(fr_multiselect_key) or [] - base_frame_keys = list(frame_key_labels.keys()) - for k in prev_fr: - if k not in frame_key_labels: - frame_key_labels[k] = f"(selected) frame {str(k).split('|')[-1]}" - frame_opts_keys = base_frame_keys + [ - k for k in prev_fr if k not in base_frame_keys - ] - if frame_opts_keys: - st.multiselect( - "Limit objects to frames (optional)", - options=frame_opts_keys, - format_func=lambda k: frame_key_labels.get(k, k), - key=fr_multiselect_key, + tt = st.session_state.get(t4_key) or [] + if tt: + df_obj_show = df_obj_show[ + df_obj_show["t4dataset_name"].astype(str).isin(tt) + ] + ll = st.session_state.get(lab_key) or [] + if ll: + df_obj_show = df_obj_show[ + df_obj_show["label"].astype(str).isin(ll) + ] + fk_sel = st.session_state.get(fr_multiselect_key) or [] 
+ if fk_sel: + fk_set = set(fk_sel) + df_obj_show = df_obj_show[ + ( + df_obj_show["t4dataset_id"].astype(str) + + "|" + + df_obj_show["frame_index"].astype(str) + ).isin(fk_set) + ] + if change_type_filter != "all": + df_obj_show = df_obj_show[ + df_obj_show["change_type"] == change_type_filter + ] + if sort_obj == "degraded_priority_then_dist": + df_obj_show = df_obj_show.copy() + df_obj_show["_prio"] = df_obj_show["change_type"].map( + { + "degraded": 0, + "improved": 1, + "both_tp": 2, + "both_fn": 3, + } + ) + df_obj_show = df_obj_show.sort_values( + by=["_prio", "dist_h"], + ascending=[True, True], + ).drop(columns=["_prio"], errors="ignore") + elif sort_obj == "frame_then_uuid": + df_obj_show = df_obj_show.sort_values( + by=["t4dataset_id", "frame_index", "gt_uuid"] + ) + else: + df_obj_show = df_obj_show.sort_values( + by=["label", "dist_h", "t4dataset_id", "frame_index"] + ) + + n_show = 200 + st.caption( + f"Showing up to {n_show} rows; use **Download CSV** for the full filtered list." 
) - - change_type_filter = st.selectbox( - "Change type", - ["degraded", "improved", "all", "both_tp", "both_fn"], - key=f"change_type_{lbl}_{idx}", - help="Filter objects by TP change between runs.", - ) - sort_obj = st.selectbox( - "Sort objects by", - [ - "degraded_priority_then_dist", - "frame_then_uuid", - "label_then_dist", - ], - key=f"p5_sort_{lbl}_{idx}", - ) - - df_obj_show = ( - df_by_object_full.copy() - if not df_by_object_full.empty - else pd.DataFrame() - ) - if not df_obj_show.empty: - ss = st.session_state.get(scen_key) or [] - if ss: - df_obj_show = df_obj_show[ - df_obj_show["scenario_name"].astype(str).isin(ss) - ] - tt = st.session_state.get(t4_key) or [] - if tt: - df_obj_show = df_obj_show[ - df_obj_show["t4dataset_name"].astype(str).isin(tt) - ] - ll = st.session_state.get(lab_key) or [] - if ll: - df_obj_show = df_obj_show[ - df_obj_show["label"].astype(str).isin(ll) - ] - fk_sel = st.session_state.get(fr_multiselect_key) or [] - if fk_sel: - fk_set = set(fk_sel) - df_obj_show = df_obj_show[ - ( - df_obj_show["t4dataset_id"].astype(str) - + "|" - + df_obj_show["frame_index"].astype(str) - ).isin(fk_set) - ] - if change_type_filter != "all": - df_obj_show = df_obj_show[ - df_obj_show["change_type"] == change_type_filter - ] - if sort_obj == "degraded_priority_then_dist": - df_obj_show = df_obj_show.copy() - df_obj_show["_prio"] = df_obj_show["change_type"].map( - { - "degraded": 0, - "improved": 1, - "both_tp": 2, - "both_fn": 3, - } + if not df_obj_show.empty: + st.download_button( + label="Download filtered objects (CSV)", + data=df_obj_show.to_csv(index=False).encode("utf-8"), + file_name=f"perception_diff_{lbl}_vs_A_objects.csv", + mime="text/csv", + key=f"p5_dl_{lbl}_{idx}", ) - df_obj_show = df_obj_show.sort_values( - by=["_prio", "dist_h"], - ascending=[True, True], - ).drop(columns=["_prio"], errors="ignore") - elif sort_obj == "frame_then_uuid": - df_obj_show = df_obj_show.sort_values( - by=["t4dataset_id", "frame_index", "gt_uuid"] 
+ st.dataframe( + df_obj_show.head(n_show), + width='stretch', + hide_index=True, ) else: - df_obj_show = df_obj_show.sort_values( - by=["label", "dist_h", "t4dataset_id", "frame_index"] - ) - - n_show = 200 - st.caption( - f"Showing up to {n_show} rows; use **Download CSV** for the full filtered list." - ) - if not df_obj_show.empty: - st.download_button( - label="Download filtered objects (CSV)", - data=df_obj_show.to_csv(index=False).encode("utf-8"), - file_name=f"perception_diff_{lbl}_vs_A_objects.csv", - mime="text/csv", - key=f"p5_dl_{lbl}_{idx}", - ) - st.dataframe( - df_obj_show.head(n_show), - width='stretch', - hide_index=True, - ) - else: - st.caption("No objects match filters.") - - with st.expander("Full frame table (sort: degraded desc)"): - if not df_frame_sorted.empty: - st.dataframe(df_frame_sorted, width='stretch', hide_index=True) - else: - st.caption("No frame breakdown.") - else: - st.caption(f"Run {lbl} vs A: No data.") - except Exception as e: - st.error(f"Error (Run {lbl} vs A): {e}") - finally: - _pd_slot.empty() - -# ============================= -# Single mode: Frame / Object level — Where are the misses? 
-# ============================= -if single_mode: - st.markdown(section_header_html("Frame / Object level: Where are the misses?"), unsafe_allow_html=True) - _fn_slot = st.empty() - _fn_slot.markdown(ds_spot_loading_markup("FN by frame & object"), unsafe_allow_html=True) - try: - with st.expander("FN by frame and by object", expanded=True): - query_fn_frame = f""" - SELECT - t4dataset_id, - frame_index, - COALESCE(MAX(CAST(scenario_name AS VARCHAR)), '') AS scenario_name, - COALESCE(MAX(CAST(suite_name AS VARCHAR)), '') AS suite_name, - COALESCE(MAX(CAST(t4dataset_name AS VARCHAR)), '') AS t4dataset_name, - COUNT(*) AS fn_cnt - FROM view_eval_flat - WHERE source = 'GT' AND status = 'FN' AND {filter_clause_base} - GROUP BY t4dataset_id, frame_index - ORDER BY fn_cnt DESC - """ - df_fn_frame = con.execute(query_fn_frame).df() - query_fn_object = f""" - SELECT - t4dataset_id, - frame_index, - uuid, - COALESCE(CAST(label AS VARCHAR), '') AS label, - dist_h, - COALESCE(CAST(scenario_name AS VARCHAR), '') AS scenario_name, - COALESCE(CAST(suite_name AS VARCHAR), '') AS suite_name - FROM view_eval_flat - WHERE source = 'GT' AND status = 'FN' AND {filter_clause_base} - ORDER BY t4dataset_id, frame_index, uuid - """ - df_fn_object = con.execute(query_fn_object).df() - if not df_fn_frame.empty: - st.markdown("**FN count by frame**") - st.download_button("Download FN by frame (CSV)", data=df_fn_frame.to_csv(index=False).encode("utf-8"), file_name="fn_by_frame.csv", mime="text/csv", key="dl_fn_frame") - st.dataframe(df_fn_frame, width='stretch', hide_index=True) - else: - st.caption("No FN by frame.") - if not df_fn_object.empty: - st.markdown("**FN objects**") - if len(df_fn_object) > 500: - st.caption(f"Showing first 500 of {len(df_fn_object)} FN objects.") - st.dataframe(df_fn_object.head(500), width='stretch', hide_index=True) + st.caption("No objects match filters.") + + with st.expander("Full frame table (sort: degraded desc)"): + if not df_frame_sorted.empty: + 
st.dataframe(df_frame_sorted, width='stretch', hide_index=True) + else: + st.caption("No frame breakdown.") else: - st.dataframe(df_fn_object, width='stretch', hide_index=True) - else: - st.caption("No FN objects.") - except Exception as e: - st.error(f"Error in FN by frame/object: {e}") - finally: - _fn_slot.empty() - -# ============================= -# Panel 6: Mean Error (single) / Mean Error Comparison (compare) -# ============================= -st.divider() -st.markdown( - section_header_html( - "Mean Error" + (" Comparison" if not single_mode else ""), - "Mean absolute error on TP matches (X/Y in m, Yaw in rad)." - + (" Compare mode: choose grouped bars or spider charts." if not single_mode else ""), - ), - unsafe_allow_html=True, -) - -try: - sample_query = "SELECT * FROM view_eval_flat LIMIT 1" - sample_df = con.execute(sample_query).df() - has_error_cols = all(col in sample_df.columns for col in ['x_error', 'y_error', 'yaw_error']) -except Exception: - has_error_cols = False - -if not has_error_cols: - st.info("Error columns (x_error, y_error, yaw_error) not found in data. Skipping error analysis.") -else: + st.caption(f"Run {lbl} vs A: No data.") + except Exception as e: + st.error(f"Error (Run {lbl} vs A): {e}") + finally: + _pd_slot.empty() + + # ============================= + # Single mode: Frame / Object level — Where are the misses? 
+ # ============================= if single_mode: + ds_dlog("section: Frame_FN_misses_start") + st.markdown(section_header_html("Frame / Object level: Where are the misses?"), unsafe_allow_html=True) + _fn_slot = st.empty() + _fn_slot.markdown(ds_spot_loading_markup("FN by frame & object"), unsafe_allow_html=True) try: - with ds_spot_loading("Mean error"): - query = f""" + with st.expander("FN by frame and by object", expanded=True): + query_fn_frame = f""" SELECT - label, - AVG(ABS(CAST(x_error AS DOUBLE))) FILTER ( - WHERE status = 'TP' AND x_error IS NOT NULL - ) AS mean_abs_x_error, - AVG(ABS(CAST(y_error AS DOUBLE))) FILTER ( - WHERE status = 'TP' AND y_error IS NOT NULL - ) AS mean_abs_y_error, - AVG(ABS(CAST(yaw_error AS DOUBLE))) FILTER ( - WHERE status = 'TP' AND yaw_error IS NOT NULL - ) AS mean_abs_yaw_error + t4dataset_id, + frame_index, + COALESCE(MAX(CAST(scenario_name AS VARCHAR)), '') AS scenario_name, + COALESCE(MAX(CAST(suite_name AS VARCHAR)), '') AS suite_name, + COALESCE(MAX(CAST(t4dataset_name AS VARCHAR)), '') AS t4dataset_name, + COUNT(*) AS fn_cnt FROM view_eval_flat - WHERE {filter_clause_base} - GROUP BY label - ORDER BY label + WHERE source = 'GT' AND status = 'FN' AND {filter_clause_base} + GROUP BY t4dataset_id, frame_index + ORDER BY fn_cnt DESC """ - df_error_base = con.execute(query).df() - if not df_error_base.empty: - fig = go.Figure() - fig.add_trace(go.Bar( - x=df_error_base['label'], - y=df_error_base['mean_abs_x_error'], - name='X Error', - marker_color=RUN_COLORS[0], - )) - fig.add_trace(go.Bar( - x=df_error_base['label'], - y=df_error_base['mean_abs_y_error'], - name='Y Error', - marker_color=RUN_COLORS[1], - )) - fig.add_trace(go.Bar( - x=df_error_base['label'], - y=df_error_base['mean_abs_yaw_error'], - name='Yaw Error', - marker_color=RUN_COLORS[2], - )) - apply_chart_theme(fig) - fig.update_layout( - title=f"Mean Error within {max_eval_range} [m]", - xaxis_title="Label", - yaxis_title="Error [m] or [rad]", - 
barmode='group' - ) - st.plotly_chart(fig, width="stretch") - else: - st.info("No data available") + df_fn_frame = con.execute(query_fn_frame).df() + query_fn_object = f""" + SELECT + t4dataset_id, + frame_index, + uuid, + COALESCE(CAST(label AS VARCHAR), '') AS label, + dist_h, + COALESCE(CAST(scenario_name AS VARCHAR), '') AS scenario_name, + COALESCE(CAST(suite_name AS VARCHAR), '') AS suite_name + FROM view_eval_flat + WHERE source = 'GT' AND status = 'FN' AND {filter_clause_base} + ORDER BY t4dataset_id, frame_index, uuid + """ + df_fn_object = con.execute(query_fn_object).df() + if not df_fn_frame.empty: + st.markdown("**FN count by frame**") + st.download_button("Download FN by frame (CSV)", data=df_fn_frame.to_csv(index=False).encode("utf-8"), file_name="fn_by_frame.csv", mime="text/csv", key="dl_fn_frame") + st.dataframe(df_fn_frame, width='stretch', hide_index=True) + else: + st.caption("No FN by frame.") + if not df_fn_object.empty: + st.markdown("**FN objects**") + if len(df_fn_object) > 500: + st.caption(f"Showing first 500 of {len(df_fn_object)} FN objects.") + st.dataframe(df_fn_object.head(500), width='stretch', hide_index=True) + else: + st.dataframe(df_fn_object, width='stretch', hide_index=True) + else: + st.caption("No FN objects.") except Exception as e: - st.error(f"Error: {e}") + st.error(f"Error in FN by frame/object: {e}") + finally: + _fn_slot.empty() + + # ============================= + # Panel 6: Mean Error (single) / Mean Error Comparison (compare) + # ============================= + ds_dlog("section: Panel6_Mean_Error_start") + st.divider() + st.markdown( + section_header_html( + "Mean Error" + (" Comparison" if not single_mode else ""), + "Mean absolute error on TP matches (X/Y in m, Yaw in rad)." + + (" Compare mode: choose grouped bars or spider charts." 
if not single_mode else ""), + ), + unsafe_allow_html=True, + ) + + try: + sample_query = "SELECT * FROM view_eval_flat LIMIT 1" + sample_df = con.execute(sample_query).df() + has_error_cols = all(col in sample_df.columns for col in ['x_error', 'y_error', 'yaw_error']) + except Exception: + has_error_cols = False + + if not has_error_cols: + st.info("Error columns (x_error, y_error, yaw_error) not found in data. Skipping error analysis.") else: - try: - with ds_spot_loading("Mean error"): - dfs_err = [] - for i in range(len(runs)): - fc = build_filter_clause(filters_list[i]) - q = f""" + if single_mode: + try: + with ds_spot_loading("Mean error"): + query = f""" SELECT label, - AVG(ABS(CAST(x_error AS DOUBLE))) FILTER (WHERE status = 'TP' AND x_error IS NOT NULL) AS mean_abs_x_error, - AVG(ABS(CAST(y_error AS DOUBLE))) FILTER (WHERE status = 'TP' AND y_error IS NOT NULL) AS mean_abs_y_error, - AVG(ABS(CAST(yaw_error AS DOUBLE))) FILTER (WHERE status = 'TP' AND yaw_error IS NOT NULL) AS mean_abs_yaw_error - FROM {_flat_view(i)} - WHERE {fc} + AVG(ABS(CAST(x_error AS DOUBLE))) FILTER ( + WHERE status = 'TP' AND x_error IS NOT NULL + ) AS mean_abs_x_error, + AVG(ABS(CAST(y_error AS DOUBLE))) FILTER ( + WHERE status = 'TP' AND y_error IS NOT NULL + ) AS mean_abs_y_error, + AVG(ABS(CAST(yaw_error AS DOUBLE))) FILTER ( + WHERE status = 'TP' AND yaw_error IS NOT NULL + ) AS mean_abs_yaw_error + FROM view_eval_flat + WHERE {filter_clause_base} GROUP BY label ORDER BY label """ - df_i = con.execute(q).df() - df_i["run"] = run_labels_list[i] - dfs_err.append(df_i) - df_err_melt = pd.concat(dfs_err, ignore_index=True) - if not df_err_melt.empty: - mean_err_viz = st.radio( - "Mean error chart style", - options=["Spider chart (X, Y & Yaw)", "Grouped bar"], - index=0, - horizontal=True, - key="mean_err_compare_viz", - ) - if mean_err_viz == "Grouped bar": - for err_type, col in [ - ("X Error", "mean_abs_x_error"), - ("Y Error", "mean_abs_y_error"), - ("Yaw Error", 
"mean_abs_yaw_error"), - ]: - fig = px.bar( - df_err_melt, - x="label", - y=col, - color="run", - barmode="group", - title=f"Mean {err_type} within {max_eval_range} [m] by run", - labels={"label": "Label", col: err_type, "run": "Run"}, - color_discrete_sequence=RUN_COLORS, - ) - apply_chart_theme(fig) - st.plotly_chart(fig, width="stretch") + df_error_base = con.execute(query).df() + if not df_error_base.empty: + fig = go.Figure() + fig.add_trace(go.Bar( + x=df_error_base['label'], + y=df_error_base['mean_abs_x_error'], + name='X Error', + marker_color=RUN_COLORS[0], + )) + fig.add_trace(go.Bar( + x=df_error_base['label'], + y=df_error_base['mean_abs_y_error'], + name='Y Error', + marker_color=RUN_COLORS[1], + )) + fig.add_trace(go.Bar( + x=df_error_base['label'], + y=df_error_base['mean_abs_yaw_error'], + name='Yaw Error', + marker_color=RUN_COLORS[2], + )) + apply_chart_theme(fig) + fig.update_layout( + title=f"Mean Error within {max_eval_range} [m]", + xaxis_title="Label", + yaxis_title="Error [m] or [rad]", + barmode='group' + ) + st.plotly_chart(fig, width="stretch") else: - st.caption( - f"Three spiders: mean |error| per label per run (TP only), within **{max_eval_range} m** " - "(same as sidebar max range)." 
+ st.info("No data available") + except Exception as e: + st.error(f"Error: {e}") + else: + try: + with ds_spot_loading("Mean error"): + dfs_err = [] + for i in range(len(runs)): + fc = build_filter_clause(filters_list[i]) + q = f""" + SELECT + label, + AVG(ABS(CAST(x_error AS DOUBLE))) FILTER (WHERE status = 'TP' AND x_error IS NOT NULL) AS mean_abs_x_error, + AVG(ABS(CAST(y_error AS DOUBLE))) FILTER (WHERE status = 'TP' AND y_error IS NOT NULL) AS mean_abs_y_error, + AVG(ABS(CAST(yaw_error AS DOUBLE))) FILTER (WHERE status = 'TP' AND yaw_error IS NOT NULL) AS mean_abs_yaw_error + FROM {_flat_view(i)} + WHERE {fc} + GROUP BY label + ORDER BY label + """ + df_i = con.execute(q).df() + df_i["run"] = run_labels_list[i] + dfs_err.append(df_i) + df_err_melt = pd.concat(dfs_err, ignore_index=True) + if not df_err_melt.empty: + mean_err_viz = st.radio( + "Mean error chart style", + options=["Spider chart (X, Y & Yaw)", "Grouped bar"], + index=0, + horizontal=True, + key="mean_err_compare_viz", ) - cats = sorted(df_err_melt["label"].astype(str).unique()) - if len(cats) > 16: - st.caption("Spider charts work best with ≤16 labels; many classes may look crowded.") - rcols = st.columns(3) - err_specs = [ - ( - f"Mean |x error| (within {max_eval_range} m)", - "mean_abs_x_error", - "Mean |x error| (m)", - ".3f", - ), - ( - f"Mean |y error| (within {max_eval_range} m)", - "mean_abs_y_error", - "Mean |y error| (m)", - ".3f", - ), - ( - f"Mean |yaw error| (within {max_eval_range} m)", - "mean_abs_yaw_error", - "Mean |yaw error| (rad)", - ".4f", - ), - ] - for ci, (chart_title, col, hover_lbl, tfmt) in enumerate(err_specs): - fig_r = _scalar_metric_spider_compare( - df_err_melt, - cats, - chart_title, - run_labels_list, - col, - hover_lbl, - height=400, - tickformat=tfmt, + if mean_err_viz == "Grouped bar": + for err_type, col in [ + ("X Error", "mean_abs_x_error"), + ("Y Error", "mean_abs_y_error"), + ("Yaw Error", "mean_abs_yaw_error"), + ]: + fig = px.bar( + df_err_melt, + 
x="label", + y=col, + color="run", + barmode="group", + title=f"Mean {err_type} within {max_eval_range} [m] by run", + labels={"label": "Label", col: err_type, "run": "Run"}, + color_discrete_sequence=RUN_COLORS, + ) + apply_chart_theme(fig) + st.plotly_chart(fig, width="stretch") + else: + st.caption( + f"Three spiders: mean |error| per label per run (TP only), within **{max_eval_range} m** " + "(same as sidebar max range)." ) - with rcols[ci]: - st.plotly_chart(fig_r, width='stretch') - else: - st.info("No data available") - except Exception as e: - st.error(f"Error: {e}") - - st.markdown(section_header_html("Difference of mean absolute error (each run − Baseline A)"), unsafe_allow_html=True) - for idx in range(1, len(runs)): - lbl = run_labels_list[idx] - _med_slot = st.empty() - _med_slot.markdown(ds_spot_loading_markup(f"Mean error diff · run {lbl}"), unsafe_allow_html=True) - try: - fc_c = build_filter_clause(filters_list[idx]) - query = f""" - WITH topic_a AS ( - SELECT label, - AVG(ABS(x_error)) FILTER (WHERE status = 'TP') AS x_a, - AVG(ABS(y_error)) FILTER (WHERE status = 'TP') AS y_a, - AVG(ABS(yaw_error)) FILTER (WHERE status = 'TP') AS yaw_a - FROM view_eval_flat - WHERE {filter_clause_base} - GROUP BY label - ), - topic_c AS ( - SELECT label, - AVG(ABS(x_error)) FILTER (WHERE status = 'TP') AS x_c, - AVG(ABS(y_error)) FILTER (WHERE status = 'TP') AS y_c, - AVG(ABS(yaw_error)) FILTER (WHERE status = 'TP') AS yaw_c - FROM {_flat_view(idx)} - WHERE {fc_c} - GROUP BY label - ) - SELECT a.label, - (c.x_c - a.x_a) AS x_diff, - (c.y_c - a.y_a) AS y_diff, - (c.yaw_c - a.yaw_a) AS yaw_diff - FROM topic_a a - JOIN topic_c c USING (label) - ORDER BY label - """ - df_ed = con.execute(query).df() - if not df_ed.empty: - with st.expander(f"Run {lbl} − A", expanded=(len(runs) == 2)): - fig = go.Figure() - fig.add_trace(go.Bar(x=df_ed["label"], y=df_ed["x_diff"], name="X Diff", marker_color=RUN_COLORS[0])) - fig.add_trace(go.Bar(x=df_ed["label"], y=df_ed["y_diff"], 
name="Y Diff", marker_color=RUN_COLORS[1])) - fig.add_trace(go.Bar(x=df_ed["label"], y=df_ed["yaw_diff"], name="Yaw Diff", marker_color=RUN_COLORS[2])) - apply_chart_theme(fig) - fig.update_layout(title=f"Error diff ({lbl} − A) within {max_eval_range} [m]", xaxis_title="Label", yaxis_title="Error Difference [m] or [rad]", barmode="group") - st.plotly_chart(fig, width="stretch") + cats = sorted(df_err_melt["label"].astype(str).unique()) + if len(cats) > 16: + st.caption("Spider charts work best with ≤16 labels; many classes may look crowded.") + rcols = st.columns(3) + err_specs = [ + ( + f"Mean |x error| (within {max_eval_range} m)", + "mean_abs_x_error", + "Mean |x error| (m)", + ".3f", + ), + ( + f"Mean |y error| (within {max_eval_range} m)", + "mean_abs_y_error", + "Mean |y error| (m)", + ".3f", + ), + ( + f"Mean |yaw error| (within {max_eval_range} m)", + "mean_abs_yaw_error", + "Mean |yaw error| (rad)", + ".4f", + ), + ] + for ci, (chart_title, col, hover_lbl, tfmt) in enumerate(err_specs): + fig_r = _scalar_metric_spider_compare( + df_err_melt, + cats, + chart_title, + run_labels_list, + col, + hover_lbl, + height=400, + tickformat=tfmt, + ) + with rcols[ci]: + st.plotly_chart(fig_r, width='stretch') + else: + st.info("No data available") except Exception as e: - st.error(f"Error (Run {lbl} − A): {e}") - finally: - _med_slot.empty() + st.error(f"Error: {e}") + + st.markdown(section_header_html("Difference of mean absolute error (each run − Baseline A)"), unsafe_allow_html=True) + for idx in range(1, len(runs)): + lbl = run_labels_list[idx] + _med_slot = st.empty() + _med_slot.markdown(ds_spot_loading_markup(f"Mean error diff · run {lbl}"), unsafe_allow_html=True) + try: + fc_c = build_filter_clause(filters_list[idx]) + query = f""" + WITH topic_a AS ( + SELECT label, + AVG(ABS(x_error)) FILTER (WHERE status = 'TP') AS x_a, + AVG(ABS(y_error)) FILTER (WHERE status = 'TP') AS y_a, + AVG(ABS(yaw_error)) FILTER (WHERE status = 'TP') AS yaw_a + FROM view_eval_flat 
+ WHERE {filter_clause_base} + GROUP BY label + ), + topic_c AS ( + SELECT label, + AVG(ABS(x_error)) FILTER (WHERE status = 'TP') AS x_c, + AVG(ABS(y_error)) FILTER (WHERE status = 'TP') AS y_c, + AVG(ABS(yaw_error)) FILTER (WHERE status = 'TP') AS yaw_c + FROM {_flat_view(idx)} + WHERE {fc_c} + GROUP BY label + ) + SELECT a.label, + (c.x_c - a.x_a) AS x_diff, + (c.y_c - a.y_a) AS y_diff, + (c.yaw_c - a.yaw_a) AS yaw_diff + FROM topic_a a + JOIN topic_c c USING (label) + ORDER BY label + """ + df_ed = con.execute(query).df() + if not df_ed.empty: + with st.expander(f"Run {lbl} − A", expanded=(len(runs) == 2)): + fig = go.Figure() + fig.add_trace(go.Bar(x=df_ed["label"], y=df_ed["x_diff"], name="X Diff", marker_color=RUN_COLORS[0])) + fig.add_trace(go.Bar(x=df_ed["label"], y=df_ed["y_diff"], name="Y Diff", marker_color=RUN_COLORS[1])) + fig.add_trace(go.Bar(x=df_ed["label"], y=df_ed["yaw_diff"], name="Yaw Diff", marker_color=RUN_COLORS[2])) + apply_chart_theme(fig) + fig.update_layout(title=f"Error diff ({lbl} − A) within {max_eval_range} [m]", xaxis_title="Label", yaxis_title="Error Difference [m] or [rad]", barmode="group") + st.plotly_chart(fig, width="stretch") + except Exception as e: + st.error(f"Error (Run {lbl} − A): {e}") + finally: + _med_slot.empty() + + ds_dlog("main_content_try_exit_ok") + ds_debug_log_memory("main_content_end") + +except Exception as _e_ds_main: + ds_debug_log_exception("detection_stats_main_try", _e_ds_main) + raise -_ds_loading_banner.empty() +finally: + try: + ds_debug_render_expander(st.session_state) + except Exception as _e_dbg_exp: + ds_debug_log_exception("ds_debug_render_expander", _e_dbg_exp) + ds_dlog("main_content_finally_banner_clear") + _ds_loading_banner.empty() + ds_dlog("detection_stats_script_run_complete") From 875d03774d1b5305c9ca5449277319d79f83ba08 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Mon, 30 Mar 2026 14:49:41 +0900 Subject: [PATCH 04/94] feat: add T4 visualizer client and integrate with bounding box 
viewer - Introduced a new HTTP client for the T4 Visualizer FastAPI server, enabling interaction with its health, datasets, and rendering endpoints. - Enhanced the Bounding Box Viewer page to fetch and display camera renders from the T4 server, incorporating user-configurable options for dataset and scenario selection. - Added unit and integration tests for the T4 visualizer client, ensuring robust functionality and error handling. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../lib/t4_visualizer_client.py | 228 ++++++++++++++++++ .../pages/4_Bounding_Box_Viewer.py | 139 ++++++++++- evaluation_dashboard_app/tests/conftest.py | 8 + .../tests/test_t4_visualizer_client.py | 173 +++++++++++++ 4 files changed, 547 insertions(+), 1 deletion(-) create mode 100644 evaluation_dashboard_app/lib/t4_visualizer_client.py create mode 100644 evaluation_dashboard_app/tests/conftest.py create mode 100644 evaluation_dashboard_app/tests/test_t4_visualizer_client.py diff --git a/evaluation_dashboard_app/lib/t4_visualizer_client.py b/evaluation_dashboard_app/lib/t4_visualizer_client.py new file mode 100644 index 0000000..ede0837 --- /dev/null +++ b/evaluation_dashboard_app/lib/t4_visualizer_client.py @@ -0,0 +1,228 @@ +"""HTTP client for the T4 Visualizer FastAPI server (render_frame over HTTP). + +Default base URL: ``T4_VISUALIZER_BASE_URL`` environment variable, or ``http://127.0.0.1:8000``. + +Does not import t4_devkit or t4_visualizer; only uses ``requests`` against the server's +``GET /health``, ``GET /datasets``, and ``POST /render`` endpoints. 
+""" + +from __future__ import annotations + +import base64 +import os +from dataclasses import asdict, dataclass, field +from typing import Any, List, Mapping, Optional, Tuple + +import requests + +DEFAULT_BASE_URL = "http://127.0.0.1:8000" +ENV_BASE_URL = "T4_VISUALIZER_BASE_URL" + + +class T4VisualizerError(Exception): + """Raised when the T4 visualizer HTTP API returns an error or invalid response.""" + + def __init__( + self, + message: str, + *, + status_code: Optional[int] = None, + response_text: str = "", + ) -> None: + super().__init__(message) + self.status_code = status_code + self.response_text = response_text + + +@dataclass +class TargetObjectIn: + """One object to draw on the render (matches server ``TargetObjectIn``).""" + + uuid: str = "" + x: float = 0.0 + y: float = 0.0 + z: float = 0.0 + label: str = "" + width: float = 0.0 + length: float = 0.0 + height: float = 0.0 + yaw: float = 0.0 + + +@dataclass +class RenderRequest: + """Request body for ``POST /render`` (matches server ``RenderRequest``).""" + + t4dataset_id: str + scenario_name: str + frame_index: int + target_objects: List[TargetObjectIn] = field(default_factory=list) + cameras: Optional[List[str]] = None + show_annotations: bool = True + version: Optional[str] = None + crop_cameras: bool = False + crop_padding: int = 40 + crop_min_size: int = 300 + + +@dataclass +class ImageResult: + """One rendered PNG in the response.""" + + label: str + png_base64: str + + +@dataclass +class RenderResult: + """Parsed ``POST /render`` JSON response.""" + + sample_token: str + timestamp_us: int + images: List[ImageResult] + + def decode_png(self, label: str) -> bytes: + """Decode base64 PNG bytes for the image with the given label.""" + for img in self.images: + if img.label == label: + return base64.b64decode(img.png_base64) + raise KeyError(f"No image with label {label!r}") + + def decode_all_images(self) -> List[Tuple[str, bytes]]: + """Decode all images to ``(label, png_bytes)``.""" + return 
[(img.label, base64.b64decode(img.png_base64)) for img in self.images] + + +def _default_base_url() -> str: + return os.environ.get(ENV_BASE_URL, DEFAULT_BASE_URL).rstrip("/") + + +def _serialize_target_object(o: TargetObjectIn) -> dict: + d = asdict(o) + return d + + +def render_request_to_json_body(req: RenderRequest) -> dict: + """Build a JSON-serializable dict for ``POST /render``.""" + out: dict = { + "t4dataset_id": req.t4dataset_id, + "scenario_name": req.scenario_name, + "frame_index": req.frame_index, + "target_objects": [_serialize_target_object(o) for o in req.target_objects], + "show_annotations": req.show_annotations, + "crop_cameras": req.crop_cameras, + "crop_padding": req.crop_padding, + "crop_min_size": req.crop_min_size, + } + if req.cameras is not None: + out["cameras"] = req.cameras + if req.version is not None: + out["version"] = req.version + return out + + +def target_object_from_gt_row(row: Mapping[str, Any]) -> dict: + """Map a GT / eval parquet row to one ``target_objects`` entry for ``RenderRequest``. + + Uses ``uuid`` or ``gt_uuid`` for the instance id; position from ``x``, ``y``, ``z``; + optional bbox fields default to ``0.0`` when missing. 
+ """ + raw_id = row.get("uuid") + if raw_id is None or raw_id == "": + raw_id = row.get("gt_uuid") + uuid_str = "" if raw_id is None else str(raw_id) + + def _float(key: str, default: float = 0.0) -> float: + v = row.get(key) + if v is None: + return default + return float(v) + + return { + "uuid": uuid_str, + "x": _float("x"), + "y": _float("y"), + "z": _float("z"), + "label": str(row.get("label") or ""), + "width": _float("width"), + "length": _float("length"), + "height": _float("height"), + "yaw": _float("yaw"), + } + + +class T4VisualizerClient: + """Thin HTTP client for the T4 Visualizer server.""" + + def __init__( + self, + base_url: Optional[str] = None, + *, + timeout: float = 120.0, + session: Optional[requests.Session] = None, + ) -> None: + raw = base_url if base_url is not None else _default_base_url() + self.base_url = raw.rstrip("/") + self.timeout = timeout + self._session = session if session is not None else requests.Session() + + def _url(self, path: str) -> str: + if not path.startswith("/"): + path = "/" + path + return f"{self.base_url}{path}" + + def _raise_for_status(self, resp: requests.Response) -> None: + if resp.ok: + return + text = (resp.text or "")[:2000] + raise T4VisualizerError( + f"T4 visualizer HTTP {resp.status_code}: {text[:500]}", + status_code=resp.status_code, + response_text=text, + ) + + def health(self) -> dict: + """GET /health.""" + resp = self._session.get(self._url("/health"), timeout=self.timeout) + self._raise_for_status(resp) + try: + return resp.json() + except ValueError as exc: + raise T4VisualizerError("Invalid JSON from /health") from exc + + def list_datasets(self) -> dict: + """GET /datasets — returns at least ``data_dir`` and ``datasets``.""" + resp = self._session.get(self._url("/datasets"), timeout=self.timeout) + self._raise_for_status(resp) + try: + return resp.json() + except ValueError as exc: + raise T4VisualizerError("Invalid JSON from /datasets") from exc + + def render(self, payload: 
RenderRequest) -> RenderResult: + """POST /render with a :class:`RenderRequest`.""" + body = render_request_to_json_body(payload) + resp = self._session.post( + self._url("/render"), + json=body, + timeout=self.timeout, + ) + self._raise_for_status(resp) + try: + data = resp.json() + except ValueError as exc: + raise T4VisualizerError("Invalid JSON from /render") from exc + + try: + images_raw = data["images"] + imgs = [ + ImageResult(label=str(x["label"]), png_base64=str(x["png_base64"])) + for x in images_raw + ] + return RenderResult( + sample_token=str(data["sample_token"]), + timestamp_us=int(data["timestamp_us"]), + images=imgs, + ) + except (KeyError, TypeError, ValueError) as exc: + raise T4VisualizerError(f"Unexpected /render response shape: {data!r}") from exc diff --git a/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py b/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py index 8df5009..9b46beb 100644 --- a/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py +++ b/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py @@ -12,6 +12,15 @@ from lib.parquet_schema import schema_flags from lib.page_chrome import inject_app_page_styles, render_loaded_data_section, render_page_hero from lib.ui.bounding_box_viewer_ui import bev_overlay_line_and_status_legend_markup, bev_status_legend_markup +from lib.t4_visualizer_client import ( + DEFAULT_BASE_URL, + ENV_BASE_URL, + RenderRequest, + TargetObjectIn, + T4VisualizerClient, + T4VisualizerError, + target_object_from_gt_row, +) st.set_page_config( layout="wide", @@ -329,9 +338,14 @@ def list_parquets_in_run(run_path) -> List[str]: params.extend(selected_visibility) select_extras = (", " + ", ".join(hover_extra_cols)) if hover_extra_cols else "" +# Optional columns for T4 server overlay (z/height) and resolving dataset / scenario per row +_geom_for_t4 = [c for c in ("z", "height") if c in cols and c not in hover_extra_cols] +_geom_select = (", " + ", ".join(_geom_for_t4)) if _geom_for_t4 else "" 
+_t4_meta_cols = [c for c in ("t4dataset_id", "t4dataset_name", "scenario_name") if c in cols] +_t4_meta_select = (", " + ", ".join(_t4_meta_cols)) if _t4_meta_cols else "" sql = f""" SELECT frame_index, x, y, length, width, yaw, label, topic_name, source, status, uuid -{select_vis}{select_extras} +{select_vis}{select_extras}{_geom_select}{_t4_meta_select} FROM parquet_scan(?) WHERE {" AND ".join(where)} ORDER BY frame_index @@ -447,6 +461,129 @@ def get_color(source, status): return color_map.get((source, status), "#999999") with k4: st.metric("TP (EST)", tp_est_count) with k5: st.metric("TPR", f"{tpr_frame:.2%}" if tpr_frame is not None else "—") +# ---------------------------- +# T4 visualizer (HTTP server): camera PNGs for current frame +# ---------------------------- +def _bbox_resolve_t4_dataset_id(dff: pd.DataFrame) -> str: + if dff is None or dff.empty: + return "" + if "t4dataset_id" in dff.columns and dff["t4dataset_id"].notna().any(): + return str(dff["t4dataset_id"].dropna().astype(str).iloc[0]) + if "t4dataset_name" in dff.columns and dff["t4dataset_name"].notna().any(): + return str(dff["t4dataset_name"].dropna().iloc[0]) + return "" + + +def _bbox_resolve_t4_scenario(dff: pd.DataFrame, scenario_from_sidebar: Optional[str]) -> str: + if scenario_from_sidebar is not None and str(scenario_from_sidebar).strip() != "": + return str(scenario_from_sidebar) + if dff is not None and not dff.empty and "scenario_name" in dff.columns and dff["scenario_name"].notna().any(): + return str(dff["scenario_name"].dropna().iloc[0]) + return "" + + +with st.expander("T4 visualizer — camera renders (external server)", expanded=False): + if "bbox_t4_base_url" not in st.session_state: + st.session_state["bbox_t4_base_url"] = ( + (os.environ.get(ENV_BASE_URL) or DEFAULT_BASE_URL).strip() or DEFAULT_BASE_URL + ) + base_url_t4 = st.text_input( + "T4 server base URL", + key="bbox_t4_base_url", + help=f"Default from env `{ENV_BASE_URL}`; must reach the FastAPI app (GET /health, 
POST /render).", + ) + c1, c2, c3 = st.columns(3) + with c1: + t4_crop = st.checkbox("Crop cameras", value=True, key="bbox_t4_crop_cameras") + with c2: + t4_show_ann = st.checkbox("Show dataset annotations", value=True, key="bbox_t4_show_ann") + with c3: + t4_overlay_gt = st.checkbox("Draw GT rows as target boxes", value=True, key="bbox_t4_overlay_gt") + + _ds_t4 = _bbox_resolve_t4_dataset_id(df_frame) + if not _ds_t4 and selected_t4dataset is not None: + _ds_t4 = str(selected_t4dataset) + _sc_t4 = _bbox_resolve_t4_scenario(df_frame, selected_scenario) + + st.caption( + f"API params: **t4dataset_id** `{_ds_t4 or '—'}` · **scenario_name** `{_sc_t4 or '—'}` · **frame_index** `{frame}`" + ) + if not _ds_t4 or not _sc_t4: + st.info( + "Set a scene with **scenario_name** in the sidebar and ensure parquet includes **t4dataset_id** or " + "**t4dataset_name** (or pick **t4dataset_name** when multiple datasets exist). " + "The T4 server must have that dataset under its `--data-dir`." + ) + + fetch_t4 = st.button("Fetch camera renders from T4 server", key="bbox_t4_fetch_btn", type="primary") + + if fetch_t4: + if not _ds_t4 or not _sc_t4: + st.error("Cannot call T4 server: missing t4dataset id or scenario name.") + else: + try: + client = T4VisualizerClient( + base_url=(base_url_t4 or "").strip() or DEFAULT_BASE_URL, + timeout=120.0, + ) + targets: List[TargetObjectIn] = [] + if t4_overlay_gt: + for _, row in df_frame[df_frame["source"] == "GT"].iterrows(): + d = target_object_from_gt_row(row.to_dict()) + targets.append(TargetObjectIn(**d)) + req = RenderRequest( + t4dataset_id=_ds_t4, + scenario_name=_sc_t4, + frame_index=int(frame), + target_objects=targets, + crop_cameras=t4_crop, + show_annotations=t4_show_ann, + ) + with st.spinner("Calling T4 visualizer (POST /render)…"): + t4_res = client.render(req) + st.session_state["bbox_t4_last_images"] = t4_res.decode_all_images() + st.session_state["bbox_t4_last_meta"] = { + "sample_token": t4_res.sample_token, + 
"timestamp_us": t4_res.timestamp_us, + "frame_index": int(frame), + "t4dataset_id": _ds_t4, + "scenario_name": _sc_t4, + } + except T4VisualizerError as ex: + st.session_state.pop("bbox_t4_last_images", None) + st.session_state.pop("bbox_t4_last_meta", None) + st.error(f"T4 server error ({ex.status_code}): {ex}") + except OSError as ex: + st.session_state.pop("bbox_t4_last_images", None) + st.session_state.pop("bbox_t4_last_meta", None) + st.error(f"Network error: {ex}") + + _meta = st.session_state.get("bbox_t4_last_meta") + _imgs = st.session_state.get("bbox_t4_last_images") + if _meta and _imgs: + if int(_meta.get("frame_index", -1)) != int(frame): + st.warning( + f"Images below are from **frame {_meta['frame_index']}**; current slider is **{frame}**. " + "Click **Fetch** again to update." + ) + st.success( + f"**sample_token:** `{_meta.get('sample_token', '')}` · " + f"**timestamp_us:** `{_meta.get('timestamp_us', '')}`" + ) + _nc = min(3, max(1, len(_imgs))) + for _row_start in range(0, len(_imgs), _nc): + _cols_img = st.columns(_nc) + for _j, _k in enumerate(range(_row_start, min(_row_start + _nc, len(_imgs)))): + _lbl, _png = _imgs[_k] + with _cols_img[_j]: + st.caption(_lbl) + st.image(_png, use_container_width=True) + + st.caption( + "Runs the Tier4 HTTP visualizer (`t4-server`); does not bundle t4_devkit. " + "Point **T4 server base URL** at your instance or set `T4_VISUALIZER_BASE_URL`." 
+ ) + # ---------------------------- # Quick view: switch between "All (comparison)" and single-run view # ---------------------------- diff --git a/evaluation_dashboard_app/tests/conftest.py b/evaluation_dashboard_app/tests/conftest.py new file mode 100644 index 0000000..e24af09 --- /dev/null +++ b/evaluation_dashboard_app/tests/conftest.py @@ -0,0 +1,8 @@ +"""Pytest configuration for evaluation_dashboard_app tests.""" + + +def pytest_configure(config): + config.addinivalue_line( + "markers", + "integration: tests that require a live service or network (opt-in)", + ) diff --git a/evaluation_dashboard_app/tests/test_t4_visualizer_client.py b/evaluation_dashboard_app/tests/test_t4_visualizer_client.py new file mode 100644 index 0000000..0e1f169 --- /dev/null +++ b/evaluation_dashboard_app/tests/test_t4_visualizer_client.py @@ -0,0 +1,173 @@ +"""Tests for lib/t4_visualizer_client.py. + +Unit tests use mocks (no network). Optional integration tests call a live server when +``T4_VISUALIZER_BASE_URL`` points at a reachable instance (e.g. ``t4-server``); they +skip if the server is down. 
+""" + +from __future__ import annotations + +import base64 +import os +from unittest.mock import MagicMock + +import pytest + +from lib.t4_visualizer_client import ( + ENV_BASE_URL, + RenderRequest, + T4VisualizerClient, + T4VisualizerError, + TargetObjectIn, + target_object_from_gt_row, +) + + +# Minimal valid 1x1 PNG (transparent pixel) +_TINY_PNG_BYTES = ( + b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01" + b"\x08\x06\x00\x00\x00\x1f\x15\xc4\x89\x00\x00\x00\nIDATx\x9cc\x00\x01" + b"\x00\x00\x05\x00\x01\r\n-\xdb\x00\x00\x00\x00IEND\xaeB`\x82" +) +_TINY_PNG_B64 = base64.b64encode(_TINY_PNG_BYTES).decode("ascii") + + +def _ok_response(json_data): + r = MagicMock() + r.ok = True + r.status_code = 200 + r.text = "" + r.json.return_value = json_data + return r + + +def _err_response(status_code: int, text: str = "not found"): + r = MagicMock() + r.ok = False + r.status_code = status_code + r.text = text + return r + + +def test_health_success(): + session = MagicMock() + session.get.return_value = _ok_response({"status": "ok"}) + c = T4VisualizerClient(base_url="http://test:9999", session=session) + assert c.health() == {"status": "ok"} + session.get.assert_called_once() + assert "health" in session.get.call_args[0][0] + + +def test_list_datasets_success(): + session = MagicMock() + session.get.return_value = _ok_response( + {"data_dir": "/data", "datasets": ["ds_a", "ds_b"]} + ) + c = T4VisualizerClient(base_url="http://test", session=session) + d = c.list_datasets() + assert d["datasets"] == ["ds_a", "ds_b"] + assert d["data_dir"] == "/data" + + +def test_render_success_decode(): + session = MagicMock() + session.post.return_value = _ok_response( + { + "sample_token": "tok1", + "timestamp_us": 1234567890000000, + "images": [{"label": "CAM_FRONT", "png_base64": _TINY_PNG_B64}], + } + ) + c = T4VisualizerClient(base_url="http://test", session=session) + req = RenderRequest( + t4dataset_id="ds1", + scenario_name="scene-1", + frame_index=0, + 
target_objects=[TargetObjectIn(uuid="u1", x=1.0, y=2.0, z=0.5, label="car")], + ) + out = c.render(req) + assert out.sample_token == "tok1" + assert out.timestamp_us == 1234567890000000 + assert len(out.images) == 1 + raw = out.decode_png("CAM_FRONT") + assert raw == _TINY_PNG_BYTES + all_pairs = out.decode_all_images() + assert all_pairs == [("CAM_FRONT", _TINY_PNG_BYTES)] + + +def test_render_http_error(): + session = MagicMock() + session.post.return_value = _err_response(404, "Dataset 'x' not found") + c = T4VisualizerClient(base_url="http://test", session=session) + req = RenderRequest(t4dataset_id="x", scenario_name="s", frame_index=0) + with pytest.raises(T4VisualizerError) as ei: + c.render(req) + assert ei.value.status_code == 404 + assert "404" in str(ei.value) or "not found" in ei.value.response_text.lower() + + +def test_render_invalid_json_body(): + session = MagicMock() + r = MagicMock() + r.ok = True + r.status_code = 200 + r.json.side_effect = ValueError("bad json") + session.post.return_value = r + c = T4VisualizerClient(base_url="http://test", session=session) + with pytest.raises(T4VisualizerError, match="Invalid JSON"): + c.render(RenderRequest(t4dataset_id="a", scenario_name="b", frame_index=0)) + + +def test_target_object_from_gt_row_full(): + row = { + "uuid": "abc-123", + "x": 10.5, + "y": -2.0, + "z": 0.1, + "label": "pedestrian", + "width": 0.5, + "length": 0.6, + "height": 1.7, + "yaw": 0.25, + } + d = target_object_from_gt_row(row) + assert d["uuid"] == "abc-123" + assert d["x"] == 10.5 + assert d["y"] == -2.0 + assert d["z"] == 0.1 + assert d["label"] == "pedestrian" + assert d["width"] == 0.5 + assert d["length"] == 0.6 + assert d["height"] == 1.7 + assert d["yaw"] == 0.25 + + +def test_target_object_from_gt_row_gt_uuid_partial(): + row = {"gt_uuid": "g1", "x": 1.0, "y": 2.0, "label": "car"} + d = target_object_from_gt_row(row) + assert d["uuid"] == "g1" + assert d["z"] == 0.0 + assert d["width"] == 0.0 + assert d["length"] == 0.0 + 
assert d["height"] == 0.0 + assert d["yaw"] == 0.0 + + +def test_target_object_from_gt_row_uuid_precedence(): + row = {"uuid": "u", "gt_uuid": "g", "x": 0, "y": 0} + d = target_object_from_gt_row(row) + assert d["uuid"] == "u" + + +@pytest.mark.integration +def test_live_health_if_configured(): + """Skips unless T4_VISUALIZER_BASE_URL is set and server responds.""" + base = os.environ.get(ENV_BASE_URL) + if not base: + pytest.skip(f"Set {ENV_BASE_URL} to run integration test against a live server") + client = T4VisualizerClient(base_url=base, timeout=5.0) + try: + h = client.health() + except (T4VisualizerError, OSError) as e: + pytest.skip(f"Server not reachable: {e}") + assert h.get("status") == "ok" From 0602e6b5a0c4c20b5c7d80e18246575c1933a975 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Mon, 30 Mar 2026 17:40:00 +0900 Subject: [PATCH 05/94] feat: add T4 dataset embedding and visualization features - Introduced `t4_dataset_embed.py` to build embeddable T4 dataset metadata, including structured records and query strings for integration with the T4 visualizer client. - Added a new Streamlit page `11_T4_Dataset_Server.py` to exercise the T4 visualizer HTTP API, enabling users to interact with health checks, dataset listings, and rendering requests. - Updated the default base URL in `t4_visualizer_client.py` for improved connectivity to the T4 server. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../lib/t4_dataset_embed.py | 97 +++++++ .../lib/t4_visualizer_client.py | 2 +- .../pages/11_T4_Dataset_Server.py | 265 ++++++++++++++++++ 3 files changed, 363 insertions(+), 1 deletion(-) create mode 100644 evaluation_dashboard_app/lib/t4_dataset_embed.py create mode 100644 evaluation_dashboard_app/pages/11_T4_Dataset_Server.py diff --git a/evaluation_dashboard_app/lib/t4_dataset_embed.py b/evaluation_dashboard_app/lib/t4_dataset_embed.py new file mode 100644 index 0000000..ff090e5 --- /dev/null +++ b/evaluation_dashboard_app/lib/t4_dataset_embed.py @@ -0,0 +1,97 @@ +"""Build embeddable T4 dataset metadata: JSON records, query strings, and ``POST /render`` bodies. + +Use with :mod:`lib.t4_visualizer_client` when wiring eval parquet rows or dashboards to ``t4-server``. +""" + +from __future__ import annotations + +from typing import Any, List, Mapping, Optional, Sequence +from urllib.parse import quote + +from lib.t4_visualizer_client import ( + RenderRequest, + TargetObjectIn, + render_request_to_json_body, + target_object_from_gt_row, +) + + +def t4_dataset_context( + t4dataset_id: str, + scenario_name: str, + *, + frame_index: Optional[int] = None, + data_dir: Optional[str] = None, + sample_token: Optional[str] = None, +) -> dict[str, Any]: + """Structured record for logging, sidecar JSON, or UI state.""" + out: dict[str, Any] = { + "t4dataset_id": t4dataset_id, + "scenario_name": scenario_name, + } + if frame_index is not None: + out["frame_index"] = int(frame_index) + if data_dir: + out["data_dir"] = data_dir + if sample_token: + out["sample_token"] = sample_token + return out + + +def t4_share_query_params( + t4dataset_id: str, + scenario_name: str, + frame_index: int = 0, +) -> str: + """Query string without leading ``?`` (for bookmarks or deep links).""" + return ( + f"t4dataset_id={quote(str(t4dataset_id), safe='')}" + f"&scenario_name={quote(str(scenario_name), safe='')}" + 
f"&frame_index={int(frame_index)}" + ) + + +def target_objects_from_rows(rows: Sequence[Mapping[str, Any]]) -> List[dict[str, Any]]: + """Map each row to a ``target_objects`` dict (see :func:`target_object_from_gt_row`).""" + return [target_object_from_gt_row(r) for r in rows] + + +def build_render_request_embed( + t4dataset_id: str, + scenario_name: str, + frame_index: int, + *, + target_rows: Optional[Sequence[Mapping[str, Any]]] = None, + target_objects: Optional[Sequence[TargetObjectIn]] = None, + show_annotations: bool = True, + crop_cameras: bool = False, + crop_padding: int = 40, + crop_min_size: int = 300, + cameras: Optional[List[str]] = None, + version: Optional[str] = None, +) -> dict[str, Any]: + """Return ``context`` plus a ``post_render_json`` body ready for ``POST /render``.""" + to_list: List[TargetObjectIn] = [] + if target_objects is not None: + to_list = list(target_objects) + elif target_rows is not None: + for r in target_rows: + d = target_object_from_gt_row(r) + to_list.append(TargetObjectIn(**d)) + req = RenderRequest( + t4dataset_id=t4dataset_id, + scenario_name=scenario_name, + frame_index=int(frame_index), + target_objects=to_list, + show_annotations=show_annotations, + crop_cameras=crop_cameras, + crop_padding=crop_padding, + crop_min_size=crop_min_size, + cameras=cameras, + version=version, + ) + body = render_request_to_json_body(req) + return { + "context": t4_dataset_context(t4dataset_id, scenario_name, frame_index=frame_index), + "post_render_json": body, + } diff --git a/evaluation_dashboard_app/lib/t4_visualizer_client.py b/evaluation_dashboard_app/lib/t4_visualizer_client.py index ede0837..399ef17 100644 --- a/evaluation_dashboard_app/lib/t4_visualizer_client.py +++ b/evaluation_dashboard_app/lib/t4_visualizer_client.py @@ -15,7 +15,7 @@ import requests -DEFAULT_BASE_URL = "http://127.0.0.1:8000" +DEFAULT_BASE_URL = "http://10.0.6.148:8000" ENV_BASE_URL = "T4_VISUALIZER_BASE_URL" diff --git 
a/evaluation_dashboard_app/pages/11_T4_Dataset_Server.py b/evaluation_dashboard_app/pages/11_T4_Dataset_Server.py new file mode 100644 index 0000000..4cd5dea --- /dev/null +++ b/evaluation_dashboard_app/pages/11_T4_Dataset_Server.py @@ -0,0 +1,265 @@ +""" +Exercise the T4 visualizer HTTP API (``t4-server``): ``GET /health``, ``GET /datasets``, ``POST /render``. +Build embeddable JSON / query strings for T4 dataset context and render payloads. +""" +from __future__ import annotations + +import json +import os +import shlex +from typing import Any, List, Optional + +import streamlit as st + +from lib.page_chrome import inject_app_page_styles, render_page_hero, section_header +from lib.t4_dataset_embed import ( + build_render_request_embed, + t4_dataset_context, + t4_share_query_params, + target_objects_from_rows, +) +from lib.t4_visualizer_client import ( + DEFAULT_BASE_URL, + ENV_BASE_URL, + RenderRequest, + T4VisualizerClient, + T4VisualizerError, + TargetObjectIn, + render_request_to_json_body, + target_object_from_gt_row, +) + +st.set_page_config( + page_title="T4 dataset server", + page_icon="📡", + layout="wide", + initial_sidebar_state="expanded", +) +inject_app_page_styles() + +render_page_hero( + kicker="Integration", + title="T4 dataset server & embed helpers", + description=( + "Call the Tier4 visualizer HTTP service (same client as Bounding Box Viewer): health, dataset list, " + "and camera render. Generate JSON and query strings to embed T4dataset id, scenario, and frame " + "in tooling or documentation." 
+ ), + mode="Single Run", +) + +if "t4_test_base_url" not in st.session_state: + st.session_state["t4_test_base_url"] = os.environ.get(ENV_BASE_URL, DEFAULT_BASE_URL).rstrip("/") + +base_url = st.sidebar.text_input( + "Server base URL", + key="t4_test_base_url", + help=f"Override env {ENV_BASE_URL} for this session.", +) +timeout_s = st.sidebar.number_input("HTTP timeout (s)", min_value=5.0, max_value=600.0, value=120.0, step=5.0) + + +def _client() -> T4VisualizerClient: + return T4VisualizerClient(base_url=(base_url or "").strip() or DEFAULT_BASE_URL, timeout=float(timeout_s)) + + +tab_health, tab_ds, tab_render, tab_embed = st.tabs( + ["Health", "Datasets", "Render", "Embed JSON"] +) + +with tab_health: + section_header("/health", "GET — server liveness and any metadata the service returns.") + if st.button("GET /health", type="primary", key="t4_btn_health"): + try: + h = _client().health() + st.success("OK") + st.json(h) + except T4VisualizerError as ex: + st.error(f"{ex} (status={ex.status_code})") + if ex.response_text: + st.code(ex.response_text[:4000], language="text") + except OSError as ex: + st.error(f"Network error: {ex}") + +with tab_ds: + section_header("/datasets", "GET — ``data_dir`` and registered dataset ids under the server.") + if st.button("GET /datasets", type="primary", key="t4_btn_datasets"): + try: + d = _client().list_datasets() + st.success("OK") + st.json(d) + ds = d.get("datasets") + if isinstance(ds, list) and ds: + st.caption(f"{len(ds)} dataset id(s) returned.") + except T4VisualizerError as ex: + st.error(f"{ex} (status={ex.status_code})") + if ex.response_text: + st.code(ex.response_text[:4000], language="text") + except OSError as ex: + st.error(f"Network error: {ex}") + +with tab_render: + section_header("POST /render", "Request camera PNGs; optional ``target_objects`` from JSON below.") + c1, c2, c3 = st.columns(3) + with c1: + ds_id = st.text_input("t4dataset_id", value="", key="t4_render_ds", placeholder="dataset folder id") + 
with c2: + scen = st.text_input("scenario_name", value="", key="t4_render_scen", placeholder="scenario") + with c3: + frame = st.number_input("frame_index", min_value=0, value=0, step=1, key="t4_render_frame") + + tgt_json = st.text_area( + "target_objects (JSON array, optional)", + value="[]", + height=140, + key="t4_render_targets", + help="List of objects with uuid/x/y/z/label/width/length/height/yaw (matches GT row shape).", + ) + o1, o2, o3 = st.columns(3) + with o1: + crop = st.checkbox("crop_cameras", value=False, key="t4_render_crop") + with o2: + show_ann = st.checkbox("show_annotations", value=True, key="t4_render_ann") + with o3: + overlay_gt = st.checkbox("Use target_objects in request", value=True, key="t4_render_use_tgt") + + req: Optional[RenderRequest] = None + parse_err: Optional[str] = None + if overlay_gt: + try: + raw = json.loads(tgt_json or "[]") + if not isinstance(raw, list): + parse_err = "target_objects JSON must be an array" + else: + objs: List[TargetObjectIn] = [] + for item in raw: + if not isinstance(item, dict): + parse_err = "each target must be an object" + break + d = target_object_from_gt_row(item) + objs.append(TargetObjectIn(**d)) + if parse_err is None: + req = RenderRequest( + t4dataset_id=ds_id.strip(), + scenario_name=scen.strip(), + frame_index=int(frame), + target_objects=objs, + crop_cameras=crop, + show_annotations=show_ann, + ) + except json.JSONDecodeError as ex: + parse_err = f"Invalid JSON: {ex}" + else: + req = RenderRequest( + t4dataset_id=ds_id.strip(), + scenario_name=scen.strip(), + frame_index=int(frame), + target_objects=[], + crop_cameras=crop, + show_annotations=show_ann, + ) + + if parse_err: + st.warning(parse_err) + + col_go, col_prev = st.columns([1, 2]) + with col_go: + do_render = st.button("POST /render", type="primary", key="t4_btn_render", disabled=req is None) + with col_prev: + if req is not None: + with st.expander("Request body preview", expanded=False): + 
st.json(render_request_to_json_body(req)) + + if do_render and req is not None: + try: + with st.spinner("Rendering…"): + res = _client().render(req) + imgs = res.decode_all_images() + st.caption(f"sample_token={res.sample_token!r} · timestamp_us={res.timestamp_us}") + if not imgs: + st.info("No images in response.") + else: + n = min(len(imgs), 6) + cols = st.columns(n) + for i in range(n): + label, png = imgs[i] + cols[i].image(png, caption=label, use_container_width=True) + if len(imgs) > n: + st.caption(f"Showing first {n} of {len(imgs)} images.") + except T4VisualizerError as ex: + st.error(f"{ex} (status={ex.status_code})") + if ex.response_text: + st.code(ex.response_text[:4000], language="text") + except OSError as ex: + st.error(f"Network error: {ex}") + +with tab_embed: + section_header( + "Embed helpers", + "Copy structured context, query strings, and full ``POST /render`` JSON for scripts or docs.", + ) + e1, e2, e3 = st.columns(3) + with e1: + emb_ds = st.text_input("t4dataset_id", value="", key="t4_emb_ds") + with e2: + emb_scen = st.text_input("scenario_name", value="", key="t4_emb_scen") + with e3: + emb_frame = st.number_input("frame_index", min_value=0, value=0, step=1, key="t4_emb_frame") + + emb_ta = st.text_area( + "Optional GT rows as JSON array (for target_objects_from_rows)", + value="[]", + height=120, + key="t4_emb_rows", + ) + + rows_err: Optional[str] = None + rows_list: List[dict[str, Any]] = [] + try: + parsed = json.loads(emb_ta or "[]") + if not isinstance(parsed, list): + rows_err = "Must be a JSON array" + else: + for i, row in enumerate(parsed): + if not isinstance(row, dict): + rows_err = f"Item {i} is not an object" + break + if rows_err is None: + rows_list = [r for r in parsed if isinstance(r, dict)] + except json.JSONDecodeError as ex: + rows_err = str(ex) + + if rows_err: + st.warning(rows_err) + + ctx = t4_dataset_context(emb_ds.strip(), emb_scen.strip(), frame_index=int(emb_frame)) + q = 
t4_share_query_params(emb_ds.strip(), emb_scen.strip(), frame_index=int(emb_frame)) + + st.subheader("t4_dataset_context") + st.json(ctx) + + st.subheader("Shareable query fragment") + st.code(q, language="text") + + full = build_render_request_embed( + emb_ds.strip(), + emb_scen.strip(), + int(emb_frame), + target_rows=rows_list if rows_list else None, + show_annotations=True, + crop_cameras=False, + ) + st.subheader("context + post_render_json") + st.json(full) + + if rows_list: + st.subheader("target_objects_from_rows (preview)") + st.json(target_objects_from_rows(rows_list)) + + curl_base = shlex.quote((base_url or "").strip() or DEFAULT_BASE_URL) + body_s = json.dumps(full["post_render_json"]) + st.subheader("Example curl") + st.code( + f"curl -sS {curl_base}/render -H 'Content-Type: application/json' -d {shlex.quote(body_s)}", + language="bash", + ) From 2e407494883b218589570d58edc238dd8cb583e2 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Mon, 30 Mar 2026 18:06:36 +0900 Subject: [PATCH 06/94] feat: add dataset scenarios retrieval to T4 visualizer client - Implemented `list_dataset_scenarios` method in `T4VisualizerClient` to fetch scene names and frame counts for a given dataset. - Updated the Streamlit page `11_T4_Dataset_Server.py` to include a new tab for retrieving and displaying dataset scenarios. - Added unit tests for the new scenarios retrieval functionality to ensure correct behavior and response handling. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../lib/t4_visualizer_client.py | 23 ++++++++ .../pages/11_T4_Dataset_Server.py | 59 +++++++++++++++++-- .../tests/test_t4_visualizer_client.py | 27 +++++++++ 3 files changed, 104 insertions(+), 5 deletions(-) diff --git a/evaluation_dashboard_app/lib/t4_visualizer_client.py b/evaluation_dashboard_app/lib/t4_visualizer_client.py index 399ef17..94c8304 100644 --- a/evaluation_dashboard_app/lib/t4_visualizer_client.py +++ b/evaluation_dashboard_app/lib/t4_visualizer_client.py @@ -199,6 +199,29 @@ def list_datasets(self) -> dict: except ValueError as exc: raise T4VisualizerError("Invalid JSON from /datasets") from exc + def list_dataset_scenarios( + self, t4dataset_id: str, version: Optional[str] = None + ) -> dict: + """GET /datasets/{t4dataset_id}/scenarios — scene names and ``nbr_samples`` (frame counts). + + Response keys typically include ``t4dataset_id``, ``scenarios`` (list of dicts with + ``name``, ``token``, ``description``, ``nbr_samples``), and optional ``version``. 
+ """ + from urllib.parse import quote + + tid = quote(str(t4dataset_id), safe="") + params = {"version": version} if version is not None else None + resp = self._session.get( + self._url(f"/datasets/{tid}/scenarios"), + params=params, + timeout=self.timeout, + ) + self._raise_for_status(resp) + try: + return resp.json() + except ValueError as exc: + raise T4VisualizerError("Invalid JSON from /datasets/.../scenarios") from exc + def render(self, payload: RenderRequest) -> RenderResult: """POST /render with a :class:`RenderRequest`.""" body = render_request_to_json_body(payload) diff --git a/evaluation_dashboard_app/pages/11_T4_Dataset_Server.py b/evaluation_dashboard_app/pages/11_T4_Dataset_Server.py index 4cd5dea..dc1b6e7 100644 --- a/evaluation_dashboard_app/pages/11_T4_Dataset_Server.py +++ b/evaluation_dashboard_app/pages/11_T4_Dataset_Server.py @@ -1,5 +1,6 @@ """ -Exercise the T4 visualizer HTTP API (``t4-server``): ``GET /health``, ``GET /datasets``, ``POST /render``. +Exercise the T4 visualizer HTTP API (``t4-server``): ``GET /health``, ``GET /datasets``, +``GET /datasets/{t4dataset_id}/scenarios``, and ``POST /render``. Build embeddable JSON / query strings for T4 dataset context and render payloads. """ from __future__ import annotations @@ -9,6 +10,7 @@ import shlex from typing import Any, List, Optional +import pandas as pd import streamlit as st from lib.page_chrome import inject_app_page_styles, render_page_hero, section_header @@ -42,8 +44,8 @@ title="T4 dataset server & embed helpers", description=( "Call the Tier4 visualizer HTTP service (same client as Bounding Box Viewer): health, dataset list, " - "and camera render. Generate JSON and query strings to embed T4dataset id, scenario, and frame " - "in tooling or documentation." + "scenarios per dataset (names and frame counts), camera render. Generate JSON and query strings to " + "embed T4dataset id, scenario, and frame in tooling or documentation." 
), mode="Single Run", ) @@ -63,8 +65,8 @@ def _client() -> T4VisualizerClient: return T4VisualizerClient(base_url=(base_url or "").strip() or DEFAULT_BASE_URL, timeout=float(timeout_s)) -tab_health, tab_ds, tab_render, tab_embed = st.tabs( - ["Health", "Datasets", "Render", "Embed JSON"] +tab_health, tab_ds, tab_scenarios, tab_render, tab_embed = st.tabs( + ["Health", "Datasets", "Scenarios", "Render", "Embed JSON"] ) with tab_health: @@ -98,6 +100,53 @@ def _client() -> T4VisualizerClient: except OSError as ex: st.error(f"Network error: {ex}") +with tab_scenarios: + section_header( + "/datasets/{t4dataset_id}/scenarios", + "GET — scene **name** (use as ``scenario_name`` in ``POST /render``), token, description, " + "and **nbr_samples** (frame count; valid ``frame_index`` is ``0 .. nbr_samples - 1``).", + ) + s1, s2 = st.columns([2, 1]) + with s1: + scen_ds_id = st.text_input( + "t4dataset_id", + value="", + key="t4_scenarios_ds", + placeholder="dataset id as listed by GET /datasets", + ) + with s2: + scen_version = st.text_input( + "version (optional)", + value="", + key="t4_scenarios_ver", + help="Same as Tier4 / POST /render ``version`` (annotation dir); leave empty to omit.", + ) + + if st.button("GET /datasets/…/scenarios", type="primary", key="t4_btn_scenarios"): + _tid = (scen_ds_id or "").strip() + if not _tid: + st.warning("Enter a t4dataset_id.") + else: + try: + _ver = (scen_version or "").strip() or None + out = _client().list_dataset_scenarios(_tid, version=_ver) + st.success("OK") + st.json(out) + rows = out.get("scenarios") + if isinstance(rows, list) and rows: + st.subheader("Scenarios table") + st.dataframe(pd.DataFrame(rows), use_container_width=True, hide_index=True) + st.caption( + "Use **name** as **scenario_name** when calling **Render** or **Embed JSON**. " + "**nbr_samples** is the number of frames in that scene." 
+ ) + except T4VisualizerError as ex: + st.error(f"{ex} (status={ex.status_code})") + if ex.response_text: + st.code(ex.response_text[:4000], language="text") + except OSError as ex: + st.error(f"Network error: {ex}") + with tab_render: section_header("POST /render", "Request camera PNGs; optional ``target_objects`` from JSON below.") c1, c2, c3 = st.columns(3) diff --git a/evaluation_dashboard_app/tests/test_t4_visualizer_client.py b/evaluation_dashboard_app/tests/test_t4_visualizer_client.py index 0e1f169..26aabc6 100644 --- a/evaluation_dashboard_app/tests/test_t4_visualizer_client.py +++ b/evaluation_dashboard_app/tests/test_t4_visualizer_client.py @@ -69,6 +69,33 @@ def test_list_datasets_success(): assert d["data_dir"] == "/data" +def test_list_dataset_scenarios_success(): + session = MagicMock() + session.get.return_value = _ok_response( + { + "t4dataset_id": "ds1", + "scenarios": [ + { + "name": "scene-a", + "token": "tok", + "description": "", + "nbr_samples": 42, + } + ], + "version": None, + } + ) + c = T4VisualizerClient(base_url="http://test", session=session) + out = c.list_dataset_scenarios("ds1") + assert out["t4dataset_id"] == "ds1" + assert len(out["scenarios"]) == 1 + assert out["scenarios"][0]["name"] == "scene-a" + assert out["scenarios"][0]["nbr_samples"] == 42 + session.get.assert_called_once() + call_url = session.get.call_args[0][0] + assert "ds1" in call_url and "scenarios" in call_url + + def test_render_success_decode(): session = MagicMock() session.post.return_value = _ok_response( From 3f8d3cfa4bf89851fc5d943041ae1f001c421fd1 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Tue, 31 Mar 2026 10:27:17 +0900 Subject: [PATCH 07/94] feat: update T4 visualizer client and Streamlit page for enhanced debugging - Changed the default base URL in `t4_visualizer_client.py` to `localhost` for local development. - Added optional fields to `RenderResult` for server-reported timings and raw JSON response. 
- Introduced `render_response_json_for_debug` function to truncate base64 image data for easier debugging. - Updated `11_T4_Dataset_Server.py` to cache API results and improve user interaction with dataset and scenario selections. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../lib/t4_visualizer_client.py | 41 ++- .../pages/11_T4_Dataset_Server.py | 252 ++++++++++++------ 2 files changed, 206 insertions(+), 87 deletions(-) diff --git a/evaluation_dashboard_app/lib/t4_visualizer_client.py b/evaluation_dashboard_app/lib/t4_visualizer_client.py index 94c8304..333b1e8 100644 --- a/evaluation_dashboard_app/lib/t4_visualizer_client.py +++ b/evaluation_dashboard_app/lib/t4_visualizer_client.py @@ -15,7 +15,7 @@ import requests -DEFAULT_BASE_URL = "http://10.0.6.148:8000" +DEFAULT_BASE_URL = "http://localhost:8000" ENV_BASE_URL = "T4_VISUALIZER_BASE_URL" @@ -80,6 +80,11 @@ class RenderResult: sample_token: str timestamp_us: int images: List[ImageResult] + raw_json: Optional[dict] = None + # Optional server-reported timings (newer t4-server JSON body) + elapsed_ms: Optional[float] = None + tier4_load_ms: Optional[float] = None + render_ms: Optional[float] = None def decode_png(self, label: str) -> bytes: """Decode base64 PNG bytes for the image with the given label.""" @@ -93,6 +98,29 @@ def decode_all_images(self) -> List[Tuple[str, bytes]]: return [(img.label, base64.b64decode(img.png_base64)) for img in self.images] +def render_response_json_for_debug( + data: Mapping[str, Any], *, max_b64_preview: int = 120 +) -> dict[str, Any]: + """Copy of a ``POST /render`` JSON object with ``png_base64`` truncated for UI/debug.""" + out: dict[str, Any] = dict(data) + imgs = out.get("images") + if not isinstance(imgs, list): + return out + trimmed: list[Any] = [] + for item in imgs: + if not isinstance(item, dict): + trimmed.append(item) + continue + row = dict(item) + b64 = row.get("png_base64") + if isinstance(b64, str) and len(b64) > max_b64_preview: + 
row["png_base64"] = f"{b64[:max_b64_preview]}…" + row["png_base64_len"] = len(b64) + trimmed.append(row) + out["images"] = trimmed + return out + + def _default_base_url() -> str: return os.environ.get(ENV_BASE_URL, DEFAULT_BASE_URL).rstrip("/") @@ -242,10 +270,21 @@ def render(self, payload: RenderRequest) -> RenderResult: ImageResult(label=str(x["label"]), png_base64=str(x["png_base64"])) for x in images_raw ] + + def _opt_float(key: str) -> Optional[float]: + v = data.get(key) + if v is None: + return None + return float(v) + return RenderResult( sample_token=str(data["sample_token"]), timestamp_us=int(data["timestamp_us"]), images=imgs, + raw_json=dict(data), + elapsed_ms=_opt_float("elapsed_ms"), + tier4_load_ms=_opt_float("tier4_load_ms"), + render_ms=_opt_float("render_ms"), ) except (KeyError, TypeError, ValueError) as exc: raise T4VisualizerError(f"Unexpected /render response shape: {data!r}") from exc diff --git a/evaluation_dashboard_app/pages/11_T4_Dataset_Server.py b/evaluation_dashboard_app/pages/11_T4_Dataset_Server.py index dc1b6e7..58de3ca 100644 --- a/evaluation_dashboard_app/pages/11_T4_Dataset_Server.py +++ b/evaluation_dashboard_app/pages/11_T4_Dataset_Server.py @@ -28,6 +28,7 @@ T4VisualizerError, TargetObjectIn, render_request_to_json_body, + render_response_json_for_debug, target_object_from_gt_row, ) @@ -44,8 +45,8 @@ title="T4 dataset server & embed helpers", description=( "Call the Tier4 visualizer HTTP service (same client as Bounding Box Viewer): health, dataset list, " - "scenarios per dataset (names and frame counts), camera render. Generate JSON and query strings to " - "embed T4dataset id, scenario, and frame in tooling or documentation." + "scenarios per dataset (names and frame counts), camera render. Fetch lists, pick ids from the server " + "or type your own, then render or copy embed JSON." 
), mode="Single Run", ) @@ -53,6 +54,16 @@ if "t4_test_base_url" not in st.session_state: st.session_state["t4_test_base_url"] = os.environ.get(ENV_BASE_URL, DEFAULT_BASE_URL).rstrip("/") +# Cached API results for pickers +if "t4_dataset_ids" not in st.session_state: + st.session_state["t4_dataset_ids"] = [] +if "t4_last_datasets_payload" not in st.session_state: + st.session_state["t4_last_datasets_payload"] = None +if "t4_scenario_rows" not in st.session_state: + st.session_state["t4_scenario_rows"] = [] +if "t4_last_scenarios_payload" not in st.session_state: + st.session_state["t4_last_scenarios_payload"] = None + base_url = st.sidebar.text_input( "Server base URL", key="t4_test_base_url", @@ -65,34 +76,36 @@ def _client() -> T4VisualizerClient: return T4VisualizerClient(base_url=(base_url or "").strip() or DEFAULT_BASE_URL, timeout=float(timeout_s)) -tab_health, tab_ds, tab_scenarios, tab_render, tab_embed = st.tabs( - ["Health", "Datasets", "Scenarios", "Render", "Embed JSON"] -) +def _on_dataset_pick() -> None: + sel = st.session_state.get("t4_pick_ds", "—") + if sel != "—": + st.session_state["t4_ctx_ds"] = sel -with tab_health: - section_header("/health", "GET — server liveness and any metadata the service returns.") - if st.button("GET /health", type="primary", key="t4_btn_health"): - try: - h = _client().health() - st.success("OK") - st.json(h) - except T4VisualizerError as ex: - st.error(f"{ex} (status={ex.status_code})") - if ex.response_text: - st.code(ex.response_text[:4000], language="text") - except OSError as ex: - st.error(f"Network error: {ex}") -with tab_ds: - section_header("/datasets", "GET — ``data_dir`` and registered dataset ids under the server.") +def _on_scenario_pick() -> None: + sel = st.session_state.get("t4_pick_scen", "—") + if sel != "—": + st.session_state["t4_ctx_scen"] = sel + + +# --- Shared context (dataset, version, scenario, frame) --------------------------------- +section_header( + "Context", + "Fetch lists from the 
server, then choose **t4dataset_id** and **scenario_name** from the dropdowns " + "or type any value in the text fields.", +) + +row_fetch = st.columns([1, 1, 2]) +with row_fetch[0]: if st.button("GET /datasets", type="primary", key="t4_btn_datasets"): try: d = _client().list_datasets() - st.success("OK") - st.json(d) + st.session_state["t4_last_datasets_payload"] = d ds = d.get("datasets") - if isinstance(ds, list) and ds: - st.caption(f"{len(ds)} dataset id(s) returned.") + st.session_state["t4_dataset_ids"] = [str(x) for x in ds] if isinstance(ds, list) else [] + st.session_state["t4_scenario_rows"] = [] + st.session_state["t4_last_scenarios_payload"] = None + st.success(f"OK — {len(st.session_state['t4_dataset_ids'])} dataset id(s).") except T4VisualizerError as ex: st.error(f"{ex} (status={ex.status_code})") if ex.response_text: @@ -100,46 +113,19 @@ def _client() -> T4VisualizerClient: except OSError as ex: st.error(f"Network error: {ex}") -with tab_scenarios: - section_header( - "/datasets/{t4dataset_id}/scenarios", - "GET — scene **name** (use as ``scenario_name`` in ``POST /render``), token, description, " - "and **nbr_samples** (frame count; valid ``frame_index`` is ``0 .. 
nbr_samples - 1``).", - ) - s1, s2 = st.columns([2, 1]) - with s1: - scen_ds_id = st.text_input( - "t4dataset_id", - value="", - key="t4_scenarios_ds", - placeholder="dataset id as listed by GET /datasets", - ) - with s2: - scen_version = st.text_input( - "version (optional)", - value="", - key="t4_scenarios_ver", - help="Same as Tier4 / POST /render ``version`` (annotation dir); leave empty to omit.", - ) - +with row_fetch[1]: if st.button("GET /datasets/…/scenarios", type="primary", key="t4_btn_scenarios"): - _tid = (scen_ds_id or "").strip() + _tid = (st.session_state.get("t4_ctx_ds") or "").strip() if not _tid: - st.warning("Enter a t4dataset_id.") + st.warning("Set **t4dataset_id** first.") else: try: - _ver = (scen_version or "").strip() or None + _ver = (st.session_state.get("t4_ctx_ver") or "").strip() or None out = _client().list_dataset_scenarios(_tid, version=_ver) - st.success("OK") - st.json(out) + st.session_state["t4_last_scenarios_payload"] = out rows = out.get("scenarios") - if isinstance(rows, list) and rows: - st.subheader("Scenarios table") - st.dataframe(pd.DataFrame(rows), use_container_width=True, hide_index=True) - st.caption( - "Use **name** as **scenario_name** when calling **Render** or **Embed JSON**. " - "**nbr_samples** is the number of frames in that scene." 
- ) + st.session_state["t4_scenario_rows"] = rows if isinstance(rows, list) else [] + st.success(f"OK — {len(st.session_state['t4_scenario_rows'])} scenario(s).") except T4VisualizerError as ex: st.error(f"{ex} (status={ex.status_code})") if ex.response_text: @@ -147,15 +133,97 @@ def _client() -> T4VisualizerClient: except OSError as ex: st.error(f"Network error: {ex}") +with row_fetch[2]: + if st.session_state.get("t4_last_datasets_payload") is not None: + with st.expander("Last GET /datasets JSON", expanded=False): + st.json(st.session_state["t4_last_datasets_payload"]) + if st.session_state.get("t4_last_scenarios_payload") is not None: + with st.expander("Last GET /datasets/…/scenarios JSON", expanded=False): + st.json(st.session_state["t4_last_scenarios_payload"]) + +_ids = st.session_state["t4_dataset_ids"] +_ds_options = ["—"] + sorted(_ids) +_name_rows = st.session_state["t4_scenario_rows"] +_scen_names: List[str] = [] +for r in _name_rows: + if isinstance(r, dict) and r.get("name") is not None: + _scen_names.append(str(r["name"])) +_scen_options = ["—"] + sorted(set(_scen_names)) + +c1, c2, c3, c4 = st.columns(4) +with c1: + st.selectbox( + "Pick dataset (from last /datasets)", + options=_ds_options, + key="t4_pick_ds", + on_change=_on_dataset_pick, + help="Choose a server-reported id, or leave as — and type below.", + ) + st.text_input( + "t4dataset_id", + key="t4_ctx_ds", + placeholder="uuid or folder id", + ) +with c2: + st.text_input( + "version (optional)", + key="t4_ctx_ver", + help="Annotation dir version; passed to scenarios and render when non-empty.", + ) +with c3: + st.selectbox( + "Pick scenario (from last /scenarios)", + options=_scen_options, + key="t4_pick_scen", + on_change=_on_scenario_pick, + help="Choose **name** from the server, or type any scenario below.", + ) + st.text_input( + "scenario_name", + key="t4_ctx_scen", + placeholder="scene name for POST /render", + ) +with c4: + st.number_input("frame_index", min_value=0, value=0, 
step=1, key="t4_ctx_frame") + +if _name_rows: + st.caption( + "Valid **frame_index** for each scene is **0 … nbr_samples − 1** (see table). " + "Use **Render & embed** to request PNGs." + ) + st.dataframe(pd.DataFrame(_name_rows), use_container_width=True, hide_index=True) + +st.divider() + +tab_overview, tab_render = st.tabs(["Overview", "Render & embed JSON"]) + +with tab_overview: + section_header("/health", "GET — server liveness.") + if st.button("GET /health", type="primary", key="t4_btn_health"): + try: + h = _client().health() + st.success("OK") + st.json(h) + except T4VisualizerError as ex: + st.error(f"{ex} (status={ex.status_code})") + if ex.response_text: + st.code(ex.response_text[:4000], language="text") + except OSError as ex: + st.error(f"Network error: {ex}") + with tab_render: section_header("POST /render", "Request camera PNGs; optional ``target_objects`` from JSON below.") - c1, c2, c3 = st.columns(3) - with c1: - ds_id = st.text_input("t4dataset_id", value="", key="t4_render_ds", placeholder="dataset folder id") - with c2: - scen = st.text_input("scenario_name", value="", key="t4_render_scen", placeholder="scenario") - with c3: - frame = st.number_input("frame_index", min_value=0, value=0, step=1, key="t4_render_frame") + ds_id = (st.session_state.get("t4_ctx_ds") or "").strip() + scen = (st.session_state.get("t4_ctx_scen") or "").strip() + frame = int(st.session_state.get("t4_ctx_frame") or 0) + ver_raw = (st.session_state.get("t4_ctx_ver") or "").strip() + version_opt: Optional[str] = ver_raw if ver_raw else None + + st.caption( + f"Using context: **t4dataset_id**=`{ds_id or '…'}` · **scenario_name**=`{scen or '…'}` · " + f"**frame_index**={frame}" + + (f" · **version**=`{version_opt}`" if version_opt else "") + ) tgt_json = st.text_area( "target_objects (JSON array, optional)", @@ -189,23 +257,25 @@ def _client() -> T4VisualizerClient: objs.append(TargetObjectIn(**d)) if parse_err is None: req = RenderRequest( - t4dataset_id=ds_id.strip(), - 
scenario_name=scen.strip(), - frame_index=int(frame), + t4dataset_id=ds_id, + scenario_name=scen, + frame_index=frame, target_objects=objs, crop_cameras=crop, show_annotations=show_ann, + version=version_opt, ) except json.JSONDecodeError as ex: parse_err = f"Invalid JSON: {ex}" else: req = RenderRequest( - t4dataset_id=ds_id.strip(), - scenario_name=scen.strip(), - frame_index=int(frame), + t4dataset_id=ds_id, + scenario_name=scen, + frame_index=frame, target_objects=[], crop_cameras=crop, show_annotations=show_ann, + version=version_opt, ) if parse_err: @@ -224,7 +294,20 @@ def _client() -> T4VisualizerClient: with st.spinner("Rendering…"): res = _client().render(req) imgs = res.decode_all_images() - st.caption(f"sample_token={res.sample_token!r} · timestamp_us={res.timestamp_us}") + cap_parts = [ + f"sample_token={res.sample_token!r}", + f"timestamp_us={res.timestamp_us}", + ] + if res.elapsed_ms is not None: + cap_parts.append(f"elapsed_ms={res.elapsed_ms}") + if res.tier4_load_ms is not None: + cap_parts.append(f"tier4_load_ms={res.tier4_load_ms}") + if res.render_ms is not None: + cap_parts.append(f"render_ms={res.render_ms}") + st.caption(" · ".join(cap_parts)) + if res.raw_json is not None: + with st.expander("Response JSON (debug)", expanded=False): + st.json(render_response_json_for_debug(res.raw_json)) if not imgs: st.info("No images in response.") else: @@ -242,18 +325,15 @@ def _client() -> T4VisualizerClient: except OSError as ex: st.error(f"Network error: {ex}") -with tab_embed: + st.divider() section_header( "Embed helpers", - "Copy structured context, query strings, and full ``POST /render`` JSON for scripts or docs.", + "Same **context** fields as above. 
Copy structured context, query strings, and full ``POST /render`` JSON.", ) - e1, e2, e3 = st.columns(3) - with e1: - emb_ds = st.text_input("t4dataset_id", value="", key="t4_emb_ds") - with e2: - emb_scen = st.text_input("scenario_name", value="", key="t4_emb_scen") - with e3: - emb_frame = st.number_input("frame_index", min_value=0, value=0, step=1, key="t4_emb_frame") + + emb_ds = (st.session_state.get("t4_ctx_ds") or "").strip() + emb_scen = (st.session_state.get("t4_ctx_scen") or "").strip() + emb_frame = int(st.session_state.get("t4_ctx_frame") or 0) emb_ta = st.text_area( "Optional GT rows as JSON array (for target_objects_from_rows)", @@ -281,8 +361,8 @@ def _client() -> T4VisualizerClient: if rows_err: st.warning(rows_err) - ctx = t4_dataset_context(emb_ds.strip(), emb_scen.strip(), frame_index=int(emb_frame)) - q = t4_share_query_params(emb_ds.strip(), emb_scen.strip(), frame_index=int(emb_frame)) + ctx = t4_dataset_context(emb_ds, emb_scen, frame_index=emb_frame) + q = t4_share_query_params(emb_ds, emb_scen, frame_index=emb_frame) st.subheader("t4_dataset_context") st.json(ctx) @@ -291,9 +371,9 @@ def _client() -> T4VisualizerClient: st.code(q, language="text") full = build_render_request_embed( - emb_ds.strip(), - emb_scen.strip(), - int(emb_frame), + emb_ds, + emb_scen, + emb_frame, target_rows=rows_list if rows_list else None, show_annotations=True, crop_cameras=False, From d71dc83ae97748dfebfce936068d5da87d893d94 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Tue, 31 Mar 2026 11:51:03 +0900 Subject: [PATCH 08/94] feat: enhance T4 dataset embedding and Streamlit integration - Added `t4_share_query_params_from_post_render_json` function to generate query strings from JSON bodies for easier integration with the T4 visualizer. - Implemented `_hydrate_t4_from_url` function in `11_T4_Dataset_Server.py` to populate session state from URL parameters, improving user experience. 
- Updated rendering logic to utilize session state values for annotations and cropping options, enhancing the rendering process. - Improved curl command generation for better usability in API interactions. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../lib/t4_dataset_embed.py | 7 + .../pages/11_T4_Dataset_Server.py | 95 +++++-- .../pages/4_Bounding_Box_Viewer.py | 239 +++++++++++++----- evaluation_dashboard_app/pages/5_Tools.py | 118 --------- .../pages/99_Deployment_Debug.py | 8 +- 5 files changed, 265 insertions(+), 202 deletions(-) delete mode 100644 evaluation_dashboard_app/pages/5_Tools.py diff --git a/evaluation_dashboard_app/lib/t4_dataset_embed.py b/evaluation_dashboard_app/lib/t4_dataset_embed.py index ff090e5..c49d3bf 100644 --- a/evaluation_dashboard_app/lib/t4_dataset_embed.py +++ b/evaluation_dashboard_app/lib/t4_dataset_embed.py @@ -5,6 +5,7 @@ from __future__ import annotations +import json from typing import Any, List, Mapping, Optional, Sequence from urllib.parse import quote @@ -51,6 +52,12 @@ def t4_share_query_params( ) +def t4_share_query_params_from_post_render_json(body: Mapping[str, Any]) -> str: + """Query string (no ``?``) with a single ``render_json`` param: same object as curl ``-d`` / ``post_render_json``.""" + compact = json.dumps(dict(body), separators=(",", ":"), ensure_ascii=False) + return f"render_json={quote(compact, safe='')}" + + def target_objects_from_rows(rows: Sequence[Mapping[str, Any]]) -> List[dict[str, Any]]: """Map each row to a ``target_objects`` dict (see :func:`target_object_from_gt_row`).""" return [target_object_from_gt_row(r) for r in rows] diff --git a/evaluation_dashboard_app/pages/11_T4_Dataset_Server.py b/evaluation_dashboard_app/pages/11_T4_Dataset_Server.py index 58de3ca..7297d87 100644 --- a/evaluation_dashboard_app/pages/11_T4_Dataset_Server.py +++ b/evaluation_dashboard_app/pages/11_T4_Dataset_Server.py @@ -7,7 +7,6 @@ import json import os -import shlex from typing import Any, 
List, Optional import pandas as pd @@ -64,6 +63,51 @@ if "t4_last_scenarios_payload" not in st.session_state: st.session_state["t4_last_scenarios_payload"] = None + +def _hydrate_t4_from_url() -> None: + """Fill context + render/embed widgets from ``?render_json=…`` (same JSON as curl ``-d``).""" + qp = st.query_params + raw = qp.get("render_json") + if raw is None: + return + if isinstance(raw, list): + raw = raw[0] if raw else None + if not raw: + return + sig = f"render_json:{raw}" + if st.session_state.get("_t4_hydrate_sig") == sig: + return + try: + body = json.loads(str(raw)) + except json.JSONDecodeError: + return + if not isinstance(body, dict): + return + st.session_state["t4_ctx_ds"] = str(body.get("t4dataset_id", "")) + st.session_state["t4_ctx_scen"] = str(body.get("scenario_name", "")) + try: + st.session_state["t4_ctx_frame"] = int(body.get("frame_index", 0)) + except (TypeError, ValueError): + st.session_state["t4_ctx_frame"] = 0 + ver = body.get("version") + st.session_state["t4_ctx_ver"] = "" if ver is None else str(ver) + to = body.get("target_objects") + if isinstance(to, list): + tgt = json.dumps(to, ensure_ascii=False, indent=2) + st.session_state["t4_emb_rows"] = tgt + st.session_state["t4_render_targets"] = tgt + st.session_state["t4_render_use_tgt"] = len(to) > 0 + else: + st.session_state["t4_emb_rows"] = "[]" + st.session_state["t4_render_targets"] = "[]" + st.session_state["t4_render_use_tgt"] = False + st.session_state["t4_render_crop"] = bool(body.get("crop_cameras", False)) + st.session_state["t4_render_ann"] = bool(body.get("show_annotations", True)) + st.session_state["_t4_hydrate_sig"] = sig + + +_hydrate_t4_from_url() + base_url = st.sidebar.text_input( "Server base URL", key="t4_test_base_url", @@ -76,6 +120,11 @@ def _client() -> T4VisualizerClient: return T4VisualizerClient(base_url=(base_url or "").strip() or DEFAULT_BASE_URL, timeout=float(timeout_s)) +def _bash_single_quoted(s: str) -> str: + """Wrap *s* for safe use as a 
bash single-quoted string (e.g. ``-d '…'``).""" + return "'" + s.replace("'", "'\"'\"'") + "'" + + def _on_dataset_pick() -> None: sel = st.session_state.get("t4_pick_ds", "—") if sel != "—": @@ -191,7 +240,7 @@ def _on_scenario_pick() -> None: "Valid **frame_index** for each scene is **0 … nbr_samples − 1** (see table). " "Use **Render & embed** to request PNGs." ) - st.dataframe(pd.DataFrame(_name_rows), use_container_width=True, hide_index=True) + st.dataframe(pd.DataFrame(_name_rows), width='stretch', hide_index=True) st.divider() @@ -315,7 +364,7 @@ def _on_scenario_pick() -> None: cols = st.columns(n) for i in range(n): label, png = imgs[i] - cols[i].image(png, caption=label, use_container_width=True) + cols[i].image(png, caption=label, width='stretch') if len(imgs) > n: st.caption(f"Showing first {n} of {len(imgs)} images.") except T4VisualizerError as ex: @@ -362,33 +411,37 @@ def _on_scenario_pick() -> None: st.warning(rows_err) ctx = t4_dataset_context(emb_ds, emb_scen, frame_index=emb_frame) - q = t4_share_query_params(emb_ds, emb_scen, frame_index=emb_frame) - - st.subheader("t4_dataset_context") - st.json(ctx) - - st.subheader("Shareable query fragment") - st.code(q, language="text") - + emb_ver = (st.session_state.get("t4_ctx_ver") or "").strip() full = build_render_request_embed( emb_ds, emb_scen, emb_frame, target_rows=rows_list if rows_list else None, - show_annotations=True, - crop_cameras=False, + show_annotations=bool(st.session_state.get("t4_render_ann", True)), + crop_cameras=bool(st.session_state.get("t4_render_crop", False)), + version=emb_ver if emb_ver else None, + ) + viz_base = (base_url or "").strip().rstrip("/") or DEFAULT_BASE_URL + q = t4_share_query_params(emb_ds, emb_scen, frame_index=emb_frame) + render_get_url = f"{viz_base}/render?{q}" + + st.subheader("Render GET URL") + st.caption( + "GET-style URL on the **visualizer server** (same **Server base URL** as API calls). 
" + "Requires **GET /render** with ``t4dataset_id``, ``scenario_name``, ``frame_index``; otherwise use **curl** (POST JSON) below." ) - st.subheader("context + post_render_json") - st.json(full) + st.markdown(f"[{render_get_url}]({render_get_url})") if rows_list: st.subheader("target_objects_from_rows (preview)") st.json(target_objects_from_rows(rows_list)) - curl_base = shlex.quote((base_url or "").strip() or DEFAULT_BASE_URL) - body_s = json.dumps(full["post_render_json"]) - st.subheader("Example curl") - st.code( - f"curl -sS {curl_base}/render -H 'Content-Type: application/json' -d {shlex.quote(body_s)}", - language="bash", + curl_base = (base_url or "").strip() or DEFAULT_BASE_URL + body_pretty = json.dumps(full["post_render_json"], indent=2, ensure_ascii=False) + curl_lines = ( + f"curl -sS {curl_base}/render \\\n" + f" -H 'Content-Type: application/json' \\\n" + f" -d {_bash_single_quoted(body_pretty)}" ) + st.subheader("curl") + st.code(curl_lines, language="bash") diff --git a/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py b/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py index 9b46beb..0e4f894 100644 --- a/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py +++ b/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py @@ -1,5 +1,8 @@ +import html import duckdb +import requests import streamlit as st +import streamlit.components.v1 as components import plotly.graph_objects as go import plotly.express as px import numpy as np @@ -12,6 +15,7 @@ from lib.parquet_schema import schema_flags from lib.page_chrome import inject_app_page_styles, render_loaded_data_section, render_page_hero from lib.ui.bounding_box_viewer_ui import bev_overlay_line_and_status_legend_markup, bev_status_legend_markup +from lib.t4_dataset_embed import t4_share_query_params from lib.t4_visualizer_client import ( DEFAULT_BASE_URL, ENV_BASE_URL, @@ -320,6 +324,36 @@ def list_parquets_in_run(run_path) -> List[str]: ) compare_view_mode = "overlay" if "Overlay" in 
compare_view_mode else "side_by_side" +# --- T4 visualizer (base URL + preview mode in sidebar) +with st.sidebar: + st.markdown("##### T4 visualizer") + st.caption("Choose HTML iframe or in-app PNGs (POST); both use the same server URL.") + if "bbox_t4_base_url" not in st.session_state: + st.session_state["bbox_t4_base_url"] = ( + (os.environ.get(ENV_BASE_URL) or DEFAULT_BASE_URL).strip() or DEFAULT_BASE_URL + ) + st.text_input( + "T4 server base URL", + key="bbox_t4_base_url", + help=f"Default from env `{ENV_BASE_URL}`; needs **GET /render/html** (iframe) and **POST /render** (PNG mode).", + ) + _t4_mode = st.radio( + "T4 preview", + ["html_iframe", "post_png"], + format_func=lambda m: ( + "HTML iframe (/render/html)" if m == "html_iframe" else "POST /render (PNGs here)" + ), + key="bbox_t4_preview_mode", + horizontal=True, + ) + if _t4_mode == "post_png": + _t4p1, _t4p2 = st.columns(2) + with _t4p1: + st.checkbox("Crop cameras", value=True, key="bbox_t4_crop_cameras") + st.checkbox("Show dataset annotations", value=True, key="bbox_t4_show_ann") + with _t4p2: + st.checkbox("Draw GT rows as target boxes", value=True, key="bbox_t4_overlay_gt") + # ---------------------------- # Build query safely & load data @@ -482,51 +516,112 @@ def _bbox_resolve_t4_scenario(dff: pd.DataFrame, scenario_from_sidebar: Optional return "" -with st.expander("T4 visualizer — camera renders (external server)", expanded=False): - if "bbox_t4_base_url" not in st.session_state: - st.session_state["bbox_t4_base_url"] = ( - (os.environ.get(ENV_BASE_URL) or DEFAULT_BASE_URL).strip() or DEFAULT_BASE_URL - ) - base_url_t4 = st.text_input( - "T4 server base URL", - key="bbox_t4_base_url", - help=f"Default from env `{ENV_BASE_URL}`; must reach the FastAPI app (GET /health, POST /render).", +def _bbox_t4_request_key( + ds: str, + sc: str, + frame_idx: int, + base_url: str, + crop: bool, + show_ann: bool, + overlay_gt: bool, +) -> Tuple[Any, ...]: + return ( + str(ds), + str(sc), + int(frame_idx), + 
str(base_url).rstrip("/"), + bool(crop), + bool(show_ann), + bool(overlay_gt), + ) + + +st.markdown("##### T4 camera renders") +_t4_preview_mode = st.session_state.get("bbox_t4_preview_mode", "html_iframe") +if _t4_preview_mode == "html_iframe": + st.caption( + "Mode: **HTML iframe** (**GET /render/html**). Loads in the browser without blocking the rest of the page. " + f"**T4 server base URL** is in the sidebar (or `{ENV_BASE_URL}`)." + ) +else: + st.caption( + "Mode: **POST /render** — camera PNGs fetched in-app (~2s). " + f"**T4 server base URL** is in the sidebar (or `{ENV_BASE_URL}`)." ) - c1, c2, c3 = st.columns(3) - with c1: - t4_crop = st.checkbox("Crop cameras", value=True, key="bbox_t4_crop_cameras") - with c2: - t4_show_ann = st.checkbox("Show dataset annotations", value=True, key="bbox_t4_show_ann") - with c3: - t4_overlay_gt = st.checkbox("Draw GT rows as target boxes", value=True, key="bbox_t4_overlay_gt") - - _ds_t4 = _bbox_resolve_t4_dataset_id(df_frame) - if not _ds_t4 and selected_t4dataset is not None: - _ds_t4 = str(selected_t4dataset) - _sc_t4 = _bbox_resolve_t4_scenario(df_frame, selected_scenario) +base_url_t4 = (st.session_state.get("bbox_t4_base_url") or "").strip() or DEFAULT_BASE_URL + +_ds_t4 = _bbox_resolve_t4_dataset_id(df_frame) +if not _ds_t4 and selected_t4dataset is not None: + _ds_t4 = str(selected_t4dataset) +_sc_t4 = _bbox_resolve_t4_scenario(df_frame, selected_scenario) + +if not _ds_t4: + for _k in ( + "bbox_t4_last_images", + "bbox_t4_last_meta", + "bbox_t4_success_key", + "bbox_t4_error_key", + "bbox_t4_error_msg", + ): + st.session_state.pop(_k, None) + st.caption("T4 camera preview is not available for this scene.") + with st.expander("Details", expanded=False): + st.markdown( + "Needs parquet **t4dataset_id** or **t4dataset_name** (or **t4dataset_name** in the sidebar when " + "multiple datasets exist). " + "The Tier4 HTTP visualizer (`t4-server`) must serve that dataset. 
" + f"Set **T4 server base URL** in the sidebar or `{ENV_BASE_URL}`." + ) +elif _t4_preview_mode == "html_iframe": + _q = t4_share_query_params(_ds_t4, _sc_t4, int(frame)) + _render_html_url = f"{base_url_t4.rstrip('/')}/render/html?{_q}" st.caption( - f"API params: **t4dataset_id** `{_ds_t4 or '—'}` · **scenario_name** `{_sc_t4 or '—'}` · **frame_index** `{frame}`" + f"**Request:** t4dataset_id `{_ds_t4}` · scenario_name `{_sc_t4 or '—'}` · frame_index `{frame}`" + ) + st.markdown(f"[Open in new tab]({_render_html_url})") + _iframe_h = 900 + components.html( + f'', + height=_iframe_h + 24, + scrolling=True, ) - if not _ds_t4 or not _sc_t4: - st.info( - "Set a scene with **scenario_name** in the sidebar and ensure parquet includes **t4dataset_id** or " - "**t4dataset_name** (or pick **t4dataset_name** when multiple datasets exist). " - "The T4 server must have that dataset under its `--data-dir`." +elif not _sc_t4: + st.caption("POST /render mode needs **scenario_name** (sidebar or parquet) for this scene.") + with st.expander("Details", expanded=False): + st.markdown( + "Pick a **Scenario name** in the sidebar or ensure parquet includes **scenario_name**. " + "Alternatively switch to **HTML iframe** mode if the server accepts an empty scenario for your dataset." 
) +else: + t4_crop = bool(st.session_state.get("bbox_t4_crop_cameras", True)) + t4_show_ann = bool(st.session_state.get("bbox_t4_show_ann", True)) + t4_overlay_gt = bool(st.session_state.get("bbox_t4_overlay_gt", True)) + + _req_key = _bbox_t4_request_key( + _ds_t4, + _sc_t4, + int(frame), + base_url_t4, + t4_crop, + t4_show_ann, + t4_overlay_gt, + ) + _ok_key = st.session_state.get("bbox_t4_success_key") + _bad_key = st.session_state.get("bbox_t4_error_key") - fetch_t4 = st.button("Fetch camera renders from T4 server", key="bbox_t4_fetch_btn", type="primary") + _should_fetch = _req_key != _ok_key and _req_key != _bad_key - if fetch_t4: - if not _ds_t4 or not _sc_t4: - st.error("Cannot call T4 server: missing t4dataset id or scenario name.") - else: - try: + if _should_fetch: + try: + with st.spinner("Loading T4 camera renders… (usually ~2 seconds)"): client = T4VisualizerClient( - base_url=(base_url_t4 or "").strip() or DEFAULT_BASE_URL, + base_url=base_url_t4, timeout=120.0, ) - targets: List[TargetObjectIn] = [] + targets = [] if t4_overlay_gt: for _, row in df_frame[df_frame["source"] == "GT"].iterrows(): d = target_object_from_gt_row(row.to_dict()) @@ -539,9 +634,19 @@ def _bbox_resolve_t4_scenario(dff: pd.DataFrame, scenario_from_sidebar: Optional crop_cameras=t4_crop, show_annotations=t4_show_ann, ) - with st.spinner("Calling T4 visualizer (POST /render)…"): - t4_res = client.render(req) - st.session_state["bbox_t4_last_images"] = t4_res.decode_all_images() + t4_res = client.render(req) + _imgs = t4_res.decode_all_images() + if not _imgs: + st.session_state.pop("bbox_t4_last_images", None) + st.session_state.pop("bbox_t4_last_meta", None) + st.session_state["bbox_t4_error_key"] = _req_key + st.session_state["bbox_t4_error_msg"] = ( + "T4 server returned no camera images for this frame. " + "Check that the dataset and scenario exist on the server and the frame index is valid." 
+ ) + st.session_state.pop("bbox_t4_success_key", None) + else: + st.session_state["bbox_t4_last_images"] = _imgs st.session_state["bbox_t4_last_meta"] = { "sample_token": t4_res.sample_token, "timestamp_us": t4_res.timestamp_us, @@ -549,26 +654,47 @@ def _bbox_resolve_t4_scenario(dff: pd.DataFrame, scenario_from_sidebar: Optional "t4dataset_id": _ds_t4, "scenario_name": _sc_t4, } - except T4VisualizerError as ex: - st.session_state.pop("bbox_t4_last_images", None) - st.session_state.pop("bbox_t4_last_meta", None) - st.error(f"T4 server error ({ex.status_code}): {ex}") - except OSError as ex: - st.session_state.pop("bbox_t4_last_images", None) - st.session_state.pop("bbox_t4_last_meta", None) - st.error(f"Network error: {ex}") + st.session_state["bbox_t4_success_key"] = _req_key + st.session_state.pop("bbox_t4_error_key", None) + st.session_state.pop("bbox_t4_error_msg", None) + except T4VisualizerError as ex: + st.session_state.pop("bbox_t4_last_images", None) + st.session_state.pop("bbox_t4_last_meta", None) + st.session_state.pop("bbox_t4_success_key", None) + st.session_state["bbox_t4_error_key"] = _req_key + st.session_state["bbox_t4_error_msg"] = f"T4 server error ({ex.status_code}): {ex}" + except (OSError, requests.RequestException) as ex: + st.session_state.pop("bbox_t4_last_images", None) + st.session_state.pop("bbox_t4_last_meta", None) + st.session_state.pop("bbox_t4_success_key", None) + st.session_state["bbox_t4_error_key"] = _req_key + st.session_state["bbox_t4_error_msg"] = f"Network error: {ex}" + except Exception as ex: + st.session_state.pop("bbox_t4_last_images", None) + st.session_state.pop("bbox_t4_last_meta", None) + st.session_state.pop("bbox_t4_success_key", None) + st.session_state["bbox_t4_error_key"] = _req_key + st.session_state["bbox_t4_error_msg"] = f"T4 render failed: {ex}" _meta = st.session_state.get("bbox_t4_last_meta") _imgs = st.session_state.get("bbox_t4_last_images") - if _meta and _imgs: - if int(_meta.get("frame_index", -1)) 
!= int(frame): - st.warning( - f"Images below are from **frame {_meta['frame_index']}**; current slider is **{frame}**. " - "Click **Fetch** again to update." + _show_err = st.session_state.get("bbox_t4_error_msg") + + st.caption( + f"**Request:** t4dataset_id `{_ds_t4}` · scenario_name `{_sc_t4}` · frame_index `{frame}`" + ) + if _req_key == st.session_state.get("bbox_t4_error_key") and _show_err: + st.caption("T4 camera preview could not be loaded.") + with st.expander("Details", expanded=False): + st.caption( + f"t4dataset_id `{_ds_t4}` · scenario_name `{_sc_t4}` · frame_index `{frame}` · " + f"server `{base_url_t4}`" ) - st.success( - f"**sample_token:** `{_meta.get('sample_token', '')}` · " - f"**timestamp_us:** `{_meta.get('timestamp_us', '')}`" + st.markdown(_show_err) + elif _meta and _imgs: + st.caption( + f"**sample_token** `{_meta.get('sample_token', '')}` · " + f"**timestamp_us** `{_meta.get('timestamp_us', '')}`" ) _nc = min(3, max(1, len(_imgs))) for _row_start in range(0, len(_imgs), _nc): @@ -579,11 +705,6 @@ def _bbox_resolve_t4_scenario(dff: pd.DataFrame, scenario_from_sidebar: Optional st.caption(_lbl) st.image(_png, use_container_width=True) - st.caption( - "Runs the Tier4 HTTP visualizer (`t4-server`); does not bundle t4_devkit. " - "Point **T4 server base URL** at your instance or set `T4_VISUALIZER_BASE_URL`." 
- ) - # ---------------------------- # Quick view: switch between "All (comparison)" and single-run view # ---------------------------- diff --git a/evaluation_dashboard_app/pages/5_Tools.py b/evaluation_dashboard_app/pages/5_Tools.py deleted file mode 100644 index 0dc8958..0000000 --- a/evaluation_dashboard_app/pages/5_Tools.py +++ /dev/null @@ -1,118 +0,0 @@ -import streamlit as st -import re -import subprocess - -from lib.page_chrome import inject_app_page_styles, render_page_hero - -st.set_page_config( - page_title="lsim_analysis_tool runner", - page_icon="⚙️", - layout="centered", -) -inject_app_page_styles() -render_page_hero( - kicker="CLI bridge", - title="lsim_analysis_tool runner", - description=( - "Paste Autoware Evaluator report or suite URLs, generate shell snippets, and run analysis commands " - "from a simple form." - ), - mode="Single Run", -) - -# Constants and regexes -JOB_RE = re.compile(r"/reports/([0-9a-fA-F-]{36})") -SUITE_RE = re.compile(r"/suites/([0-9a-fA-F-]{36})") -DEFAULT_REPORT_URL = ( - "https://evaluation.tier4.jp/evaluation/reports/" - "71b8eec9-7e28-5f9c-9b89-8e88545e742f?project_id=x2_dev" -) -DEFAULT_SUITE_URL = ( - "https://evaluation.tier4.jp/evaluation/suites/" - "1af11feb-362d-4c48-b258-02cd433a3866?project_id=x2_dev" -) -DEFAULT_OUTPUT = "~/data/x2gen2/evaluator_summary/NO_shorten_left_lower_gpu2_No3/" - -def extract_job_id(report_url): - m = JOB_RE.search(report_url or "") - return m.group(1) if m else "" - -def extract_suite_id(suite_url): - m = SUITE_RE.search(suite_url or "") - return m.group(1) if m else "" - -# App state initialization -if 'report_url' not in st.session_state: - st.session_state['report_url'] = DEFAULT_REPORT_URL -if 'suite_url' not in st.session_state: - st.session_state['suite_url'] = DEFAULT_SUITE_URL - -# Layout inputs -with st.form(key="eval_runner_form"): - col1, col2 = st.columns([1, 1]) - with col1: - project_id = st.text_input("Project ID", value="x2_dev", key="project_id") - setup_bash = 
st.text_area( - "setup.bash path", - value="/home/leigu/pilot-auto.x2.v4.3/install/setup.bash", - key="setup_bash", - height=120, - placeholder="Enter full path(s) to your setup.bash file(s), one per line." - ) - output_dir = st.text_area( - "Output Directory", - value=DEFAULT_OUTPUT, - key="output_dir", - height=120, - placeholder="Enter one or more output directories, one per line." - ) - with col2: - report_url = st.text_area( - "Report URL", - value=st.session_state['report_url'], - key="report_url", - height=120, - placeholder="Paste the full Evaluation Report URL here." - ) - suite_url = st.text_area( - "Suite URL", - value=st.session_state['suite_url'], - key="suite_url", - height=120, - placeholder="Paste the full Evaluation Suite URL here." - ) - - # Job ID and Suite ID auto-extracted from URL text fields live as you type - # So always extract from form inputs (not session state nor callbacks) - job_id = extract_job_id(report_url) - suite_id = extract_suite_id(suite_url) - - st.text_input("Job ID", value=job_id, key="job_id", disabled=True) - st.text_input("Suite ID", value=suite_id, key="suite_id", disabled=True) - - # Build command - cmd = ( - f"./perception_evaluation_result_creator2.sh " - f"{setup_bash} " - f"./perception_eval_result_summarizer.py " - f"{project_id} " - f"{job_id} " - f"{suite_id} " - f"{output_dir}" - ) - - # Submit button as required for Streamlit forms - submitted = st.form_submit_button("Run in Terminal") - -# "Run in Terminal" logic -if submitted: - st.info(f"Command to run (copy below and paste into your terminal):\n\n{cmd}") - - -st.markdown(""" ---- -**Instructions:** -- Enter your parameters above. -- Job ID / Suite ID are automatically parsed when you enter the Evaluation URLs. -- Click **Run in Terminal** to show the command for copy-paste. 
-""") \ No newline at end of file diff --git a/evaluation_dashboard_app/pages/99_Deployment_Debug.py b/evaluation_dashboard_app/pages/99_Deployment_Debug.py index 7410a02..575c356 100644 --- a/evaluation_dashboard_app/pages/99_Deployment_Debug.py +++ b/evaluation_dashboard_app/pages/99_Deployment_Debug.py @@ -60,7 +60,7 @@ with tab_env: section_header("Deployment environment", "Sensitive connection strings are redacted.") env_df = pd.DataFrame(redacted_deployment_env_rows(), columns=["Variable", "Value"]) - st.dataframe(env_df, use_container_width=True, hide_index=True) + st.dataframe(env_df, width='stretch', hide_index=True) with tab_dep: section_header("Postgres") @@ -95,7 +95,7 @@ cdf = pd.DataFrame( [{"status": k, "count": v} for k, v in sorted(counts.items())] ) - st.dataframe(cdf, use_container_width=True, hide_index=True) + st.dataframe(cdf, width='stretch', hide_index=True) elif ok_t: st.success("No task rows yet (empty table).") else: @@ -235,7 +235,7 @@ def _docker_fragment(): return section_header("Live container table", "Sortable columns; `full_id` stays internal for log/exec.") display_df = _display_columns_for_containers(rows) - st.dataframe(display_df, use_container_width=True, hide_index=True) + st.dataframe(display_df, width='stretch', hide_index=True) _render_live_stack_mermaid(rows) options = [f"{r['name']} ({r['id']})" for r in rows] @@ -280,7 +280,7 @@ def _docker_fragment(): _render_live_stack_mermaid(rows) section_header("Live container table", "Sortable columns; `full_id` stays internal for log/exec.") display_df = _display_columns_for_containers(rows) - st.dataframe(display_df, use_container_width=True, hide_index=True) + st.dataframe(display_df, width='stretch', hide_index=True) options = [f"{r['name']} ({r['id']})" for r in rows] id_by_label = {f"{r['name']} ({r['id']})": r["full_id"] for r in rows} pick = st.selectbox("Container", options=options, key="deploy_debug_pick_legacy") From 0e1f13c9486d8bbe42b4db54d954575f73cc0478 Mon Sep 17 
00:00:00 2001 From: "lei.gu" Date: Tue, 31 Mar 2026 14:40:47 +0900 Subject: [PATCH 09/94] fix: refine captions in Bounding Box Viewer for clarity - Updated captions in the T4 camera render section to simplify language and improve readability. - Removed redundant details about HTTP methods while maintaining essential information about rendering modes. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py b/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py index 0e4f894..03f51cb 100644 --- a/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py +++ b/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py @@ -540,12 +540,12 @@ def _bbox_t4_request_key( _t4_preview_mode = st.session_state.get("bbox_t4_preview_mode", "html_iframe") if _t4_preview_mode == "html_iframe": st.caption( - "Mode: **HTML iframe** (**GET /render/html**). Loads in the browser without blocking the rest of the page. " + "Mode: **HTML iframe**. Loads in the browser without blocking the rest of the page. " f"**T4 server base URL** is in the sidebar (or `{ENV_BASE_URL}`)." ) else: st.caption( - "Mode: **POST /render** — camera PNGs fetched in-app (~2s). " + "Mode: **POST** — camera PNGs fetched in-app. " f"**T4 server base URL** is in the sidebar (or `{ENV_BASE_URL}`)." ) From 9652107f0c27f79d4dcdab229525048f3ec80721 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Tue, 31 Mar 2026 15:29:00 +0900 Subject: [PATCH 10/94] feat: add dataset availability check to T4 visualizer client and update Bounding Box Viewer - Implemented `dataset_availability` method in `T4VisualizerClient` to check if a dataset is available on the server. - Updated the Bounding Box Viewer to utilize the new availability check before rendering, enhancing user feedback on dataset status. 
- Refined captions in the sidebar to clarify the dataset availability process and its impact on rendering options. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../lib/t4_visualizer_client.py | 21 +- .../pages/4_Bounding_Box_Viewer.py | 315 ++++++++++-------- 2 files changed, 201 insertions(+), 135 deletions(-) diff --git a/evaluation_dashboard_app/lib/t4_visualizer_client.py b/evaluation_dashboard_app/lib/t4_visualizer_client.py index 333b1e8..0e13307 100644 --- a/evaluation_dashboard_app/lib/t4_visualizer_client.py +++ b/evaluation_dashboard_app/lib/t4_visualizer_client.py @@ -3,7 +3,8 @@ Default base URL: ``T4_VISUALIZER_BASE_URL`` environment variable, or ``http://127.0.0.1:8000``. Does not import t4_devkit or t4_visualizer; only uses ``requests`` against the server's -``GET /health``, ``GET /datasets``, and ``POST /render`` endpoints. +``GET /health``, ``GET /datasets``, ``GET /datasets/{id}/availability``, ``GET /datasets/{id}/scenarios``, +and ``POST /render`` endpoints. """ from __future__ import annotations @@ -250,6 +251,24 @@ def list_dataset_scenarios( except ValueError as exc: raise T4VisualizerError("Invalid JSON from /datasets/.../scenarios") from exc + def dataset_availability(self, t4dataset_id: str) -> dict: + """GET /datasets/{t4dataset_id}/availability — whether the dataset is on disk for this server. + + Typical JSON: ``t4dataset_id``, ``available`` (bool), ``dataset_path`` (str or null). 
+ """ + from urllib.parse import quote + + tid = quote(str(t4dataset_id), safe="") + resp = self._session.get( + self._url(f"/datasets/{tid}/availability"), + timeout=self.timeout, + ) + self._raise_for_status(resp) + try: + return resp.json() + except ValueError as exc: + raise T4VisualizerError("Invalid JSON from /datasets/.../availability") from exc + def render(self, payload: RenderRequest) -> RenderResult: """POST /render with a :class:`RenderRequest`.""" body = render_request_to_json_body(payload) diff --git a/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py b/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py index 03f51cb..64ebf9e 100644 --- a/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py +++ b/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py @@ -327,7 +327,7 @@ def list_parquets_in_run(run_path) -> List[str]: # --- T4 visualizer (base URL + preview mode in sidebar) with st.sidebar: st.markdown("##### T4 visualizer") - st.caption("Choose HTML iframe or in-app PNGs (POST); both use the same server URL.") + st.caption("Uses **GET /datasets/{id}/availability** first; preview runs only if the server reports the dataset is available.") if "bbox_t4_base_url" not in st.session_state: st.session_state["bbox_t4_base_url"] = ( (os.environ.get(ENV_BASE_URL) or DEFAULT_BASE_URL).strip() or DEFAULT_BASE_URL @@ -536,18 +536,7 @@ def _bbox_t4_request_key( ) -st.markdown("##### T4 camera renders") _t4_preview_mode = st.session_state.get("bbox_t4_preview_mode", "html_iframe") -if _t4_preview_mode == "html_iframe": - st.caption( - "Mode: **HTML iframe**. Loads in the browser without blocking the rest of the page. " - f"**T4 server base URL** is in the sidebar (or `{ENV_BASE_URL}`)." - ) -else: - st.caption( - "Mode: **POST** — camera PNGs fetched in-app. " - f"**T4 server base URL** is in the sidebar (or `{ENV_BASE_URL}`)." 
- ) base_url_t4 = (st.session_state.get("bbox_t4_base_url") or "").strip() or DEFAULT_BASE_URL @@ -563,6 +552,7 @@ def _bbox_t4_request_key( "bbox_t4_success_key", "bbox_t4_error_key", "bbox_t4_error_msg", + "bbox_t4_availability", ): st.session_state.pop(_k, None) st.caption("T4 camera preview is not available for this scene.") @@ -573,137 +563,194 @@ def _bbox_t4_request_key( "The Tier4 HTTP visualizer (`t4-server`) must serve that dataset. " f"Set **T4 server base URL** in the sidebar or `{ENV_BASE_URL}`." ) -elif _t4_preview_mode == "html_iframe": - _q = t4_share_query_params(_ds_t4, _sc_t4, int(frame)) - _render_html_url = f"{base_url_t4.rstrip('/')}/render/html?{_q}" - st.caption( - f"**Request:** t4dataset_id `{_ds_t4}` · scenario_name `{_sc_t4 or '—'}` · frame_index `{frame}`" - ) - st.markdown(f"[Open in new tab]({_render_html_url})") - _iframe_h = 900 - components.html( - f'', - height=_iframe_h + 24, - scrolling=True, - ) -elif not _sc_t4: - st.caption("POST /render mode needs **scenario_name** (sidebar or parquet) for this scene.") - with st.expander("Details", expanded=False): - st.markdown( - "Pick a **Scenario name** in the sidebar or ensure parquet includes **scenario_name**. " - "Alternatively switch to **HTML iframe** mode if the server accepts an empty scenario for your dataset." 
- ) else: - t4_crop = bool(st.session_state.get("bbox_t4_crop_cameras", True)) - t4_show_ann = bool(st.session_state.get("bbox_t4_show_ann", True)) - t4_overlay_gt = bool(st.session_state.get("bbox_t4_overlay_gt", True)) - - _req_key = _bbox_t4_request_key( - _ds_t4, - _sc_t4, - int(frame), - base_url_t4, - t4_crop, - t4_show_ann, - t4_overlay_gt, - ) - _ok_key = st.session_state.get("bbox_t4_success_key") - _bad_key = st.session_state.get("bbox_t4_error_key") - - _should_fetch = _req_key != _ok_key and _req_key != _bad_key - - if _should_fetch: + _t4_avail_cache_key = f"{base_url_t4.rstrip('/')}|{_ds_t4}" + _cached_av = st.session_state.get("bbox_t4_availability") + _need_avail_fetch = _cached_av is None or _cached_av.get("cache_key") != _t4_avail_cache_key + if _need_avail_fetch: try: - with st.spinner("Loading T4 camera renders… (usually ~2 seconds)"): - client = T4VisualizerClient( - base_url=base_url_t4, - timeout=120.0, - ) - targets = [] - if t4_overlay_gt: - for _, row in df_frame[df_frame["source"] == "GT"].iterrows(): - d = target_object_from_gt_row(row.to_dict()) - targets.append(TargetObjectIn(**d)) - req = RenderRequest( - t4dataset_id=_ds_t4, - scenario_name=_sc_t4, - frame_index=int(frame), - target_objects=targets, - crop_cameras=t4_crop, - show_annotations=t4_show_ann, - ) - t4_res = client.render(req) - _imgs = t4_res.decode_all_images() - if not _imgs: - st.session_state.pop("bbox_t4_last_images", None) - st.session_state.pop("bbox_t4_last_meta", None) - st.session_state["bbox_t4_error_key"] = _req_key - st.session_state["bbox_t4_error_msg"] = ( - "T4 server returned no camera images for this frame. " - "Check that the dataset and scenario exist on the server and the frame index is valid." 
- ) - st.session_state.pop("bbox_t4_success_key", None) - else: - st.session_state["bbox_t4_last_images"] = _imgs - st.session_state["bbox_t4_last_meta"] = { - "sample_token": t4_res.sample_token, - "timestamp_us": t4_res.timestamp_us, - "frame_index": int(frame), - "t4dataset_id": _ds_t4, - "scenario_name": _sc_t4, - } - st.session_state["bbox_t4_success_key"] = _req_key - st.session_state.pop("bbox_t4_error_key", None) - st.session_state.pop("bbox_t4_error_msg", None) + with st.spinner("Checking T4 dataset on the server…"): + _av_client = T4VisualizerClient(base_url=base_url_t4, timeout=30.0) + _av_data = _av_client.dataset_availability(_ds_t4) + st.session_state["bbox_t4_availability"] = { + "cache_key": _t4_avail_cache_key, + "ok": True, + "available": bool(_av_data.get("available")), + "data": _av_data, + "error": None, + } except T4VisualizerError as ex: - st.session_state.pop("bbox_t4_last_images", None) - st.session_state.pop("bbox_t4_last_meta", None) - st.session_state.pop("bbox_t4_success_key", None) - st.session_state["bbox_t4_error_key"] = _req_key - st.session_state["bbox_t4_error_msg"] = f"T4 server error ({ex.status_code}): {ex}" + st.session_state["bbox_t4_availability"] = { + "cache_key": _t4_avail_cache_key, + "ok": False, + "available": False, + "data": None, + "error": f"T4 server error ({ex.status_code}): {ex}", + } except (OSError, requests.RequestException) as ex: - st.session_state.pop("bbox_t4_last_images", None) - st.session_state.pop("bbox_t4_last_meta", None) - st.session_state.pop("bbox_t4_success_key", None) - st.session_state["bbox_t4_error_key"] = _req_key - st.session_state["bbox_t4_error_msg"] = f"Network error: {ex}" + st.session_state["bbox_t4_availability"] = { + "cache_key": _t4_avail_cache_key, + "ok": False, + "available": False, + "data": None, + "error": f"Network error: {ex}", + } except Exception as ex: - st.session_state.pop("bbox_t4_last_images", None) - st.session_state.pop("bbox_t4_last_meta", None) - 
st.session_state.pop("bbox_t4_success_key", None) - st.session_state["bbox_t4_error_key"] = _req_key - st.session_state["bbox_t4_error_msg"] = f"T4 render failed: {ex}" - - _meta = st.session_state.get("bbox_t4_last_meta") - _imgs = st.session_state.get("bbox_t4_last_images") - _show_err = st.session_state.get("bbox_t4_error_msg") - - st.caption( - f"**Request:** t4dataset_id `{_ds_t4}` · scenario_name `{_sc_t4}` · frame_index `{frame}`" - ) - if _req_key == st.session_state.get("bbox_t4_error_key") and _show_err: - st.caption("T4 camera preview could not be loaded.") + st.session_state["bbox_t4_availability"] = { + "cache_key": _t4_avail_cache_key, + "ok": False, + "available": False, + "data": None, + "error": f"Availability check failed: {ex}", + } + + _av = st.session_state.get("bbox_t4_availability") or {} + + if not _av.get("ok"): + st.caption("T4 preview skipped — could not verify dataset on the visualizer server.") with st.expander("Details", expanded=False): - st.caption( - f"t4dataset_id `{_ds_t4}` · scenario_name `{_sc_t4}` · frame_index `{frame}` · " - f"server `{base_url_t4}`" + st.markdown(_av.get("error") or "Unknown error.") + elif not _av.get("available"): + st.caption("T4 preview skipped — this dataset is not on the visualizer server host.") + with st.expander("Details", expanded=False): + _d = _av.get("data") + if isinstance(_d, dict) and _d: + st.json(_d) + else: + st.markdown( + "The server reported **available: false** (no local dataset path for this id on the machine " + "running `t4-server`)." + ) + elif _t4_preview_mode == "html_iframe": + _q = t4_share_query_params(_ds_t4, _sc_t4, int(frame)) + _render_html_url = f"{base_url_t4.rstrip('/')}/render/html?{_q}" + st.markdown(f"[Open in new tab]({_render_html_url})") + _iframe_h = 900 + # Iframe shell: neutral gray while the document loads (avoid #141418 — reads as a black box for ~2s until + # the large /render/html response paints; inner page still sets its own dark background). 
+ components.html( + f'', + height=_iframe_h + 24, + scrolling=True, + ) + elif not _sc_t4: + st.caption("POST /render mode needs **scenario_name** (sidebar or parquet) for this scene.") + with st.expander("Details", expanded=False): + st.markdown( + "Pick a **Scenario name** in the sidebar or ensure parquet includes **scenario_name**. " + "Alternatively switch to **HTML iframe** mode if the server accepts an empty scenario for your dataset." ) - st.markdown(_show_err) - elif _meta and _imgs: + else: + t4_crop = bool(st.session_state.get("bbox_t4_crop_cameras", True)) + t4_show_ann = bool(st.session_state.get("bbox_t4_show_ann", True)) + t4_overlay_gt = bool(st.session_state.get("bbox_t4_overlay_gt", True)) + + _req_key = _bbox_t4_request_key( + _ds_t4, + _sc_t4, + int(frame), + base_url_t4, + t4_crop, + t4_show_ann, + t4_overlay_gt, + ) + _ok_key = st.session_state.get("bbox_t4_success_key") + _bad_key = st.session_state.get("bbox_t4_error_key") + + _should_fetch = _req_key != _ok_key and _req_key != _bad_key + + if _should_fetch: + try: + with st.spinner("Loading T4 camera renders… (usually ~2 seconds)"): + client = T4VisualizerClient( + base_url=base_url_t4, + timeout=120.0, + ) + targets = [] + if t4_overlay_gt: + for _, row in df_frame[df_frame["source"] == "GT"].iterrows(): + d = target_object_from_gt_row(row.to_dict()) + targets.append(TargetObjectIn(**d)) + req = RenderRequest( + t4dataset_id=_ds_t4, + scenario_name=_sc_t4, + frame_index=int(frame), + target_objects=targets, + crop_cameras=t4_crop, + show_annotations=t4_show_ann, + ) + t4_res = client.render(req) + _imgs = t4_res.decode_all_images() + if not _imgs: + st.session_state.pop("bbox_t4_last_images", None) + st.session_state.pop("bbox_t4_last_meta", None) + st.session_state["bbox_t4_error_key"] = _req_key + st.session_state["bbox_t4_error_msg"] = ( + "T4 server returned no camera images for this frame. " + "Check that the dataset and scenario exist on the server and the frame index is valid." 
+ ) + st.session_state.pop("bbox_t4_success_key", None) + else: + st.session_state["bbox_t4_last_images"] = _imgs + st.session_state["bbox_t4_last_meta"] = { + "sample_token": t4_res.sample_token, + "timestamp_us": t4_res.timestamp_us, + "frame_index": int(frame), + "t4dataset_id": _ds_t4, + "scenario_name": _sc_t4, + } + st.session_state["bbox_t4_success_key"] = _req_key + st.session_state.pop("bbox_t4_error_key", None) + st.session_state.pop("bbox_t4_error_msg", None) + except T4VisualizerError as ex: + st.session_state.pop("bbox_t4_last_images", None) + st.session_state.pop("bbox_t4_last_meta", None) + st.session_state.pop("bbox_t4_success_key", None) + st.session_state["bbox_t4_error_key"] = _req_key + st.session_state["bbox_t4_error_msg"] = f"T4 server error ({ex.status_code}): {ex}" + except (OSError, requests.RequestException) as ex: + st.session_state.pop("bbox_t4_last_images", None) + st.session_state.pop("bbox_t4_last_meta", None) + st.session_state.pop("bbox_t4_success_key", None) + st.session_state["bbox_t4_error_key"] = _req_key + st.session_state["bbox_t4_error_msg"] = f"Network error: {ex}" + except Exception as ex: + st.session_state.pop("bbox_t4_last_images", None) + st.session_state.pop("bbox_t4_last_meta", None) + st.session_state.pop("bbox_t4_success_key", None) + st.session_state["bbox_t4_error_key"] = _req_key + st.session_state["bbox_t4_error_msg"] = f"T4 render failed: {ex}" + + _meta = st.session_state.get("bbox_t4_last_meta") + _imgs = st.session_state.get("bbox_t4_last_images") + _show_err = st.session_state.get("bbox_t4_error_msg") + st.caption( - f"**sample_token** `{_meta.get('sample_token', '')}` · " - f"**timestamp_us** `{_meta.get('timestamp_us', '')}`" + f"**Request:** t4dataset_id `{_ds_t4}` · scenario_name `{_sc_t4}` · frame_index `{frame}`" ) - _nc = min(3, max(1, len(_imgs))) - for _row_start in range(0, len(_imgs), _nc): - _cols_img = st.columns(_nc) - for _j, _k in enumerate(range(_row_start, min(_row_start + _nc, 
len(_imgs)))): - _lbl, _png = _imgs[_k] - with _cols_img[_j]: - st.caption(_lbl) - st.image(_png, use_container_width=True) + if _req_key == st.session_state.get("bbox_t4_error_key") and _show_err: + st.caption("T4 camera preview could not be loaded.") + with st.expander("Details", expanded=False): + st.caption( + f"t4dataset_id `{_ds_t4}` · scenario_name `{_sc_t4}` · frame_index `{frame}` · " + f"server `{base_url_t4}`" + ) + st.markdown(_show_err) + elif _meta and _imgs: + st.caption( + f"**sample_token** `{_meta.get('sample_token', '')}` · " + f"**timestamp_us** `{_meta.get('timestamp_us', '')}`" + ) + _nc = min(3, max(1, len(_imgs))) + for _row_start in range(0, len(_imgs), _nc): + _cols_img = st.columns(_nc) + for _j, _k in enumerate(range(_row_start, min(_row_start + _nc, len(_imgs)))): + _lbl, _png = _imgs[_k] + with _cols_img[_j]: + st.caption(_lbl) + st.image(_png, use_container_width=True) # ---------------------------- # Quick view: switch between "All (comparison)" and single-run view From 37673ce6be86606a1aaebbfe351309632d3ccd63 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Tue, 31 Mar 2026 16:52:42 +0900 Subject: [PATCH 11/94] feat: add T4 visualizer base URL configuration for Docker deployment - Introduced `T4_VISUALIZER_BASE_URL` in both `.env` and `.env.example` files to specify the base URL for the T4 visualizer server, enhancing integration with the Bounding Box Viewer. - Updated `docker-compose.yml` to include `extra_hosts` configuration for `host.docker.internal`, ensuring proper network routing in Docker environments. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/deploy/.env | 4 ++++ evaluation_dashboard_app/deploy/.env.example | 4 ++++ evaluation_dashboard_app/deploy/docker-compose.yml | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/evaluation_dashboard_app/deploy/.env b/evaluation_dashboard_app/deploy/.env index 1e13b72..eab4de3 100644 --- a/evaluation_dashboard_app/deploy/.env +++ b/evaluation_dashboard_app/deploy/.env @@ -17,6 +17,10 @@ DATABASE_URL=postgresql://eval_user:eval_pass@postgres:5432/eval_dashboard REDIS_URL=redis://redis:6379/0 RQ_QUEUE=default +# T4 visualizer server base URL used by Bounding Box Viewer / T4 pages +# For Docker-on-Linux, host.docker.internal is mapped via docker-compose extra_hosts +T4_VISUALIZER_BASE_URL=http://http://10.0.6.148:8000 + # Optional: per-user task visibility (company auth / WebAutoAuth) # Header name set by auth proxy with current user id (e.g. X-Forwarded-User) # AUTH_USER_HEADER=X-Forwarded-User diff --git a/evaluation_dashboard_app/deploy/.env.example b/evaluation_dashboard_app/deploy/.env.example index 5cf7d38..50d5a65 100644 --- a/evaluation_dashboard_app/deploy/.env.example +++ b/evaluation_dashboard_app/deploy/.env.example @@ -17,6 +17,10 @@ DATABASE_URL=postgresql://eval_user:eval_pass@postgres:5432/eval_dashboard REDIS_URL=redis://redis:6379/0 RQ_QUEUE=default +# T4 visualizer server base URL used by Bounding Box Viewer / T4 pages +# In Docker, set to a host-reachable endpoint (compose maps host.docker.internal) +T4_VISUALIZER_BASE_URL=http://host.docker.internal:8000 + # Docker Compose: default worker replica count (04_START.sh / 08_REBUILD_AND_START.sh). Streamlit defaults to streamlit1 only; optional second app server: compose --profile ha (see docker-compose.yml + nginx.conf). 
EVAL_COMPOSE_SCALE_WORKER=2 diff --git a/evaluation_dashboard_app/deploy/docker-compose.yml b/evaluation_dashboard_app/deploy/docker-compose.yml index ce2d1fe..59c7f8d 100644 --- a/evaluation_dashboard_app/deploy/docker-compose.yml +++ b/evaluation_dashboard_app/deploy/docker-compose.yml @@ -60,6 +60,8 @@ x-streamlit-app: &streamlit-app - ../configs:/app/configs - ../.streamlit:/app/.streamlit - /var/run/docker.sock:/var/run/docker.sock + extra_hosts: + - "host.docker.internal:host-gateway" env_file: - .env depends_on: @@ -124,6 +126,8 @@ services: - ../lib:/app/lib - ../worker:/app/worker - ../configs:/app/configs + extra_hosts: + - "host.docker.internal:host-gateway" env_file: - .env depends_on: From 4225a4c0d7b455fdf71677391a55f9661e7d4c77 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Tue, 31 Mar 2026 17:02:55 +0900 Subject: [PATCH 12/94] fix: correct T4 visualizer base URL in .env file - Removed duplicate 'http://' from the T4 visualizer base URL in the .env configuration, ensuring proper URL formatting for the Bounding Box Viewer integration. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/deploy/.env | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluation_dashboard_app/deploy/.env b/evaluation_dashboard_app/deploy/.env index eab4de3..75a1ec2 100644 --- a/evaluation_dashboard_app/deploy/.env +++ b/evaluation_dashboard_app/deploy/.env @@ -19,7 +19,7 @@ RQ_QUEUE=default # T4 visualizer server base URL used by Bounding Box Viewer / T4 pages # For Docker-on-Linux, host.docker.internal is mapped via docker-compose extra_hosts -T4_VISUALIZER_BASE_URL=http://http://10.0.6.148:8000 +T4_VISUALIZER_BASE_URL=http://10.0.6.148:8000 # Optional: per-user task visibility (company auth / WebAutoAuth) # Header name set by auth proxy with current user id (e.g. 
X-Forwarded-User) From c60f31c9919cdeee3e8e6ecb01d0f83188bc7b3f Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Wed, 1 Apr 2026 13:58:38 +0900 Subject: [PATCH 13/94] feat: update Streamlit server configuration for Docker deployment - Modified the Docker entrypoint script to run Streamlit in headless mode, improving compatibility for remote deployments. - Added a new `.streamlit/config.toml` file to configure server options, including WebSocket compression settings for enhanced performance. - Removed unnecessary copying of the `.streamlit` directory in the Dockerfile, streamlining the build process. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/Dockerfile | 2 -- evaluation_dashboard_app/{ => deploy}/.streamlit/config.toml | 4 ++-- evaluation_dashboard_app/docker-entrypoint.sh | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) rename evaluation_dashboard_app/{ => deploy}/.streamlit/config.toml (88%) diff --git a/evaluation_dashboard_app/Dockerfile b/evaluation_dashboard_app/Dockerfile index df7e71e..4a05e90 100644 --- a/evaluation_dashboard_app/Dockerfile +++ b/evaluation_dashboard_app/Dockerfile @@ -42,8 +42,6 @@ RUN --mount=type=secret,id=ssh,dst=/tmp/ssh_key \ COPY requirements-docker.txt . RUN python3 -m pip install --no-cache-dir -r requirements-docker.txt -# Streamlit server options (WebSocket proxy, headless, etc.) -COPY .streamlit/ .streamlit/ # Copy application code and config COPY Overview.py . 
diff --git a/evaluation_dashboard_app/.streamlit/config.toml b/evaluation_dashboard_app/deploy/.streamlit/config.toml similarity index 88% rename from evaluation_dashboard_app/.streamlit/config.toml rename to evaluation_dashboard_app/deploy/.streamlit/config.toml index 5efa430..14d8726 100644 --- a/evaluation_dashboard_app/.streamlit/config.toml +++ b/evaluation_dashboard_app/deploy/.streamlit/config.toml @@ -2,8 +2,8 @@ # See https://docs.streamlit.io/develop/api-reference/configuration/config.toml [server] -# Container / headless deployments (no local browser) -headless = true +# Local default: open browser when running outside Docker +headless = false # Behind nginx or other proxies, per-message WebSocket compression can break or stall # some setups (see Streamlit troubleshooting: "App is not loading when running remotely"). diff --git a/evaluation_dashboard_app/docker-entrypoint.sh b/evaluation_dashboard_app/docker-entrypoint.sh index c37a1b5..8f8357d 100644 --- a/evaluation_dashboard_app/docker-entrypoint.sh +++ b/evaluation_dashboard_app/docker-entrypoint.sh @@ -5,4 +5,4 @@ if [[ -n "${ROS_DISTRO}" && -f "/opt/ros/${ROS_DISTRO}/setup.bash" ]]; then source "/opt/ros/${ROS_DISTRO}/setup.bash" fi -exec streamlit run Overview.py --server.address=0.0.0.0 --server.port=8501 "$@" +exec streamlit run Overview.py --server.address=0.0.0.0 --server.port=8501 --server.headless=true "$@" From 57c480a00ba860a66b7f4e574bfd55fb8e55f3c9 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Thu, 2 Apr 2026 13:16:25 +0900 Subject: [PATCH 14/94] feat: add embedded viewer option to Bounding Box Viewer - Implemented an embedded viewer for the T4 visualizer within the Bounding Box Viewer, allowing users to view the T4 three viewer directly in the application. - Added functionality to generate and display the viewer URL based on dataset and scenario parameters, enhancing user experience and accessibility. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../pages/4_Bounding_Box_Viewer.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py b/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py index 64ebf9e..b4a4495 100644 --- a/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py +++ b/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py @@ -621,6 +621,22 @@ def _bbox_t4_request_key( "The server reported **available: false** (no local dataset path for this id on the machine " "running `t4-server`)." ) + else: + _q_three = t4_share_query_params(_ds_t4, _sc_t4, int(frame)) + _viewer_three_url = f"{base_url_t4.rstrip('/')}/viewer/three?{_q_three}" + st.caption("Embedded viewer (/viewer/three)") + st.markdown(f"[Open embedded viewer in new tab]({_viewer_three_url})") + _viewer_three_h = 700 + components.html( + f'', + height=_viewer_three_h + 24, + scrolling=True, + ) + + if not _av.get("ok") or not _av.get("available"): + pass elif _t4_preview_mode == "html_iframe": _q = t4_share_query_params(_ds_t4, _sc_t4, int(frame)) _render_html_url = f"{base_url_t4.rstrip('/')}/render/html?{_q}" From f289c6de015994ae9a40f2338ff2fb30363ae09f Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Thu, 2 Apr 2026 14:20:38 +0900 Subject: [PATCH 15/94] feat: enhance Bounding Box Viewer with three-layer payload functionality - Added a new function `_build_three_layer_payload` to construct payloads for the T4 three viewer, enabling the display of ground truth, predicted, and matched bounding boxes. - Implemented client-side matching of ground truth and predicted boxes based on UUIDs for improved visualization accuracy. - Updated the iframe integration to include payload data for enhanced debugging and user interaction. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../pages/4_Bounding_Box_Viewer.py | 106 +++++++++++++++++- 1 file changed, 103 insertions(+), 3 deletions(-) diff --git a/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py b/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py index b4a4495..edfa23d 100644 --- a/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py +++ b/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py @@ -1,4 +1,5 @@ import html +import json import duckdb import requests import streamlit as st @@ -536,6 +537,53 @@ def _bbox_t4_request_key( ) +def _build_three_layer_payload(df_frame: pd.DataFrame) -> dict: + """Build GT/Pred/Matched overlay payload for `/viewer/three` iframe.""" + if df_frame is None or df_frame.empty: + return {"type": "bbox_layers_clear"} + + def _row_to_box(row: pd.Series) -> dict: + return { + "x": float(row.get("x", 0.0) or 0.0), + "y": float(row.get("y", 0.0) or 0.0), + "z": float(row.get("z", 0.0) or 0.0), + "width": float(row.get("width", 0.0) or 0.0), + "length": float(row.get("length", 0.0) or 0.0), + "height": float(row.get("height", 1.5) or 1.5), + "yaw": float(row.get("yaw", 0.0) or 0.0), + "label": str(row.get("label", "") or ""), + "uuid": str(row.get("uuid", "") or ""), + "status": str(row.get("status", "") or ""), + } + + gt_df = df_frame[df_frame["source"] == "GT"].copy() + pred_df = df_frame[df_frame["source"] == "EST"].copy() + gt_boxes = [_row_to_box(r) for _, r in gt_df.iterrows()] + pred_boxes = [_row_to_box(r) for _, r in pred_df.iterrows()] + + # Client-side matching: pair GT/EST by UUID for rows marked TP. 
+ gt_tp_idx: dict[str, int] = {} + for i, b in enumerate(gt_boxes): + if b["status"] == "TP" and b["uuid"]: + gt_tp_idx.setdefault(b["uuid"], i) + pred_tp_idx: dict[str, int] = {} + for i, b in enumerate(pred_boxes): + if b["status"] == "TP" and b["uuid"]: + pred_tp_idx.setdefault(b["uuid"], i) + matched_pairs = [] + for u, gi in gt_tp_idx.items(): + pi = pred_tp_idx.get(u) + if pi is not None: + matched_pairs.append({"gt_idx": int(gi), "pred_idx": int(pi)}) + + return { + "type": "bbox_layers", + "gt": gt_boxes, + "pred": pred_boxes, + "matched_pairs": matched_pairs, + } + + _t4_preview_mode = st.session_state.get("bbox_t4_preview_mode", "html_iframe") base_url_t4 = (st.session_state.get("bbox_t4_base_url") or "").strip() or DEFAULT_BASE_URL @@ -626,11 +674,63 @@ def _bbox_t4_request_key( _viewer_three_url = f"{base_url_t4.rstrip('/')}/viewer/three?{_q_three}" st.caption("Embedded viewer (/viewer/three)") st.markdown(f"[Open embedded viewer in new tab]({_viewer_three_url})") + _layer_payload = _build_three_layer_payload(df_frame) + _payload_json = json.dumps(_layer_payload, ensure_ascii=True) + _payload_b64 = _payload_json.encode("utf-8").hex() + with st.expander("Three.js layer debug", expanded=False): + st.write( + { + "viewer_url": _viewer_three_url, + "payload_type": _layer_payload.get("type"), + "gt_count": len(_layer_payload.get("gt", [])), + "pred_count": len(_layer_payload.get("pred", [])), + "matched_pairs_count": len(_layer_payload.get("matched_pairs", [])), + } + ) _viewer_three_h = 700 + _iframe_src = html.escape(_viewer_three_url, quote=True) components.html( - f'', + ( + f'' + "" + ), height=_viewer_three_h + 24, scrolling=True, ) From a7c7f4c44f69c62768db957c7436b9718e84aa7b Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Thu, 2 Apr 2026 16:35:58 +0900 Subject: [PATCH 16/94] feat: introduce T4 three layers functionality for enhanced 3D visualization - Added a new module `t4_three_layers.py` to handle the construction of payloads for the T4 three 
viewer, enabling the display of ground truth, predicted, and matched bounding boxes. - Refactored the Bounding Box Viewer to utilize the new functions for resolving dataset IDs and scenarios, improving code organization and reusability. - Implemented a new page `5_T4_3D_Viewer.py` for a dedicated Three.js viewer, enhancing user experience with integrated visualizations. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../lib/t4_three_layers.py | 158 ++++++ .../pages/4_Bounding_Box_Viewer.py | 139 +----- .../pages/5_T4_3D_Viewer.py | 456 ++++++++++++++++++ 3 files changed, 623 insertions(+), 130 deletions(-) create mode 100644 evaluation_dashboard_app/lib/t4_three_layers.py create mode 100644 evaluation_dashboard_app/pages/5_T4_3D_Viewer.py diff --git a/evaluation_dashboard_app/lib/t4_three_layers.py b/evaluation_dashboard_app/lib/t4_three_layers.py new file mode 100644 index 0000000..de18291 --- /dev/null +++ b/evaluation_dashboard_app/lib/t4_three_layers.py @@ -0,0 +1,158 @@ +"""T4 `/viewer/three` embed: GT / pred / matched 3D box layers via postMessage.""" + +from __future__ import annotations + +import html +import json +from typing import TYPE_CHECKING + +import streamlit.components.v1 as components + +if TYPE_CHECKING: + import pandas as pd + + +def resolve_t4_dataset_id(dff: "pd.DataFrame") -> str: + """Parquet **t4dataset_id** or **t4dataset_name** for the current frame (empty if missing).""" + if dff is None or dff.empty: + return "" + if "t4dataset_id" in dff.columns and dff["t4dataset_id"].notna().any(): + return str(dff["t4dataset_id"].dropna().astype(str).iloc[0]) + if "t4dataset_name" in dff.columns and dff["t4dataset_name"].notna().any(): + return str(dff["t4dataset_name"].dropna().iloc[0]) + return "" + + +def resolve_t4_scenario(dff: "pd.DataFrame", scenario_from_sidebar: str | None) -> str: + if scenario_from_sidebar is not None and str(scenario_from_sidebar).strip() != "": + return str(scenario_from_sidebar) + if dff is not 
None and not dff.empty and "scenario_name" in dff.columns and dff["scenario_name"].notna().any(): + return str(dff["scenario_name"].dropna().iloc[0]) + return "" + + +def _single_frame_layer_dict(df_frame: "pd.DataFrame") -> dict: + """Per-frame gt / pred / matched_pairs (no ``type`` field); used by single- and all-frame payloads.""" + if df_frame is None or df_frame.empty: + return {"gt": [], "pred": [], "matched_pairs": []} + + def _row_to_box(row: "pd.Series") -> dict: + return { + "x": float(row.get("x", 0.0) or 0.0), + "y": float(row.get("y", 0.0) or 0.0), + "z": float(row.get("z", 0.0) or 0.0), + "width": float(row.get("width", 0.0) or 0.0), + "length": float(row.get("length", 0.0) or 0.0), + "height": float(row.get("height", 1.5) or 1.5), + "yaw": float(row.get("yaw", 0.0) or 0.0), + "label": str(row.get("label", "") or ""), + "uuid": str(row.get("uuid", "") or ""), + "status": str(row.get("status", "") or ""), + } + + gt_df = df_frame[df_frame["source"] == "GT"].copy() + pred_df = df_frame[df_frame["source"] == "EST"].copy() + gt_boxes = [_row_to_box(r) for _, r in gt_df.iterrows()] + pred_boxes = [_row_to_box(r) for _, r in pred_df.iterrows()] + + gt_tp_idx: dict[str, int] = {} + for i, b in enumerate(gt_boxes): + if b["status"] == "TP" and b["uuid"]: + gt_tp_idx.setdefault(b["uuid"], i) + pred_tp_idx: dict[str, int] = {} + for i, b in enumerate(pred_boxes): + if b["status"] == "TP" and b["uuid"]: + pred_tp_idx.setdefault(b["uuid"], i) + matched_pairs = [] + for u, gi in gt_tp_idx.items(): + pi = pred_tp_idx.get(u) + if pi is not None: + matched_pairs.append({"gt_idx": int(gi), "pred_idx": int(pi)}) + + return { + "gt": gt_boxes, + "pred": pred_boxes, + "matched_pairs": matched_pairs, + } + + +def build_three_layer_payload(df_frame: "pd.DataFrame") -> dict: + """Build GT/Pred/Matched overlay payload for `/viewer/three` iframe (single frame).""" + if df_frame is None or df_frame.empty: + return {"type": "bbox_layers_clear"} + inner = 
_single_frame_layer_dict(df_frame) + return { + "type": "bbox_layers", + "gt": inner["gt"], + "pred": inner["pred"], + "matched_pairs": inner["matched_pairs"], + } + + +def build_three_layer_payload_all_frames(df: "pd.DataFrame") -> dict: + """Build payload with eval layers for every ``frame_index`` in *df* (viewer picks by internal frame).""" + if df is None or df.empty: + return {"type": "bbox_layers_by_frame", "frames": {}} + if "frame_index" not in df.columns: + return {"type": "bbox_layers_by_frame", "frames": {}} + frames: dict[str, dict] = {} + for fi, group in df.groupby("frame_index", sort=True): + try: + key = str(int(fi)) + except (TypeError, ValueError): + continue + frames[key] = _single_frame_layer_dict(group) + return {"type": "bbox_layers_by_frame", "frames": frames} + + +def render_t4_three_js_embed(viewer_three_url: str, layer_payload: dict, height: int = 700) -> None: + """Iframe to T4 three viewer + postMessage with bbox layer payload (GT, pred, matched pairs).""" + _payload_json = json.dumps(layer_payload, ensure_ascii=True) + _payload_b64 = _payload_json.encode("utf-8").hex() + _iframe_src = html.escape(viewer_three_url, quote=True) + components.html( + ( + f'' + "" + ), + height=height + 24, + scrolling=True, + ) diff --git a/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py b/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py index edfa23d..1d36567 100644 --- a/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py +++ b/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py @@ -1,5 +1,4 @@ import html -import json import duckdb import requests import streamlit as st @@ -17,6 +16,7 @@ from lib.page_chrome import inject_app_page_styles, render_loaded_data_section, render_page_hero from lib.ui.bounding_box_viewer_ui import bev_overlay_line_and_status_legend_markup, bev_status_legend_markup from lib.t4_dataset_embed import t4_share_query_params +from lib.t4_three_layers import resolve_t4_dataset_id, resolve_t4_scenario from 
lib.t4_visualizer_client import ( DEFAULT_BASE_URL, ENV_BASE_URL, @@ -499,24 +499,6 @@ def get_color(source, status): return color_map.get((source, status), "#999999") # ---------------------------- # T4 visualizer (HTTP server): camera PNGs for current frame # ---------------------------- -def _bbox_resolve_t4_dataset_id(dff: pd.DataFrame) -> str: - if dff is None or dff.empty: - return "" - if "t4dataset_id" in dff.columns and dff["t4dataset_id"].notna().any(): - return str(dff["t4dataset_id"].dropna().astype(str).iloc[0]) - if "t4dataset_name" in dff.columns and dff["t4dataset_name"].notna().any(): - return str(dff["t4dataset_name"].dropna().iloc[0]) - return "" - - -def _bbox_resolve_t4_scenario(dff: pd.DataFrame, scenario_from_sidebar: Optional[str]) -> str: - if scenario_from_sidebar is not None and str(scenario_from_sidebar).strip() != "": - return str(scenario_from_sidebar) - if dff is not None and not dff.empty and "scenario_name" in dff.columns and dff["scenario_name"].notna().any(): - return str(dff["scenario_name"].dropna().iloc[0]) - return "" - - def _bbox_t4_request_key( ds: str, sc: str, @@ -537,61 +519,14 @@ def _bbox_t4_request_key( ) -def _build_three_layer_payload(df_frame: pd.DataFrame) -> dict: - """Build GT/Pred/Matched overlay payload for `/viewer/three` iframe.""" - if df_frame is None or df_frame.empty: - return {"type": "bbox_layers_clear"} - - def _row_to_box(row: pd.Series) -> dict: - return { - "x": float(row.get("x", 0.0) or 0.0), - "y": float(row.get("y", 0.0) or 0.0), - "z": float(row.get("z", 0.0) or 0.0), - "width": float(row.get("width", 0.0) or 0.0), - "length": float(row.get("length", 0.0) or 0.0), - "height": float(row.get("height", 1.5) or 1.5), - "yaw": float(row.get("yaw", 0.0) or 0.0), - "label": str(row.get("label", "") or ""), - "uuid": str(row.get("uuid", "") or ""), - "status": str(row.get("status", "") or ""), - } - - gt_df = df_frame[df_frame["source"] == "GT"].copy() - pred_df = df_frame[df_frame["source"] == 
"EST"].copy() - gt_boxes = [_row_to_box(r) for _, r in gt_df.iterrows()] - pred_boxes = [_row_to_box(r) for _, r in pred_df.iterrows()] - - # Client-side matching: pair GT/EST by UUID for rows marked TP. - gt_tp_idx: dict[str, int] = {} - for i, b in enumerate(gt_boxes): - if b["status"] == "TP" and b["uuid"]: - gt_tp_idx.setdefault(b["uuid"], i) - pred_tp_idx: dict[str, int] = {} - for i, b in enumerate(pred_boxes): - if b["status"] == "TP" and b["uuid"]: - pred_tp_idx.setdefault(b["uuid"], i) - matched_pairs = [] - for u, gi in gt_tp_idx.items(): - pi = pred_tp_idx.get(u) - if pi is not None: - matched_pairs.append({"gt_idx": int(gi), "pred_idx": int(pi)}) - - return { - "type": "bbox_layers", - "gt": gt_boxes, - "pred": pred_boxes, - "matched_pairs": matched_pairs, - } - - _t4_preview_mode = st.session_state.get("bbox_t4_preview_mode", "html_iframe") base_url_t4 = (st.session_state.get("bbox_t4_base_url") or "").strip() or DEFAULT_BASE_URL -_ds_t4 = _bbox_resolve_t4_dataset_id(df_frame) +_ds_t4 = resolve_t4_dataset_id(df_frame) if not _ds_t4 and selected_t4dataset is not None: _ds_t4 = str(selected_t4dataset) -_sc_t4 = _bbox_resolve_t4_scenario(df_frame, selected_scenario) +_sc_t4 = resolve_t4_scenario(df_frame, selected_scenario) if not _ds_t4: for _k in ( @@ -672,68 +607,12 @@ def _row_to_box(row: pd.Series) -> dict: else: _q_three = t4_share_query_params(_ds_t4, _sc_t4, int(frame)) _viewer_three_url = f"{base_url_t4.rstrip('/')}/viewer/three?{_q_three}" - st.caption("Embedded viewer (/viewer/three)") - st.markdown(f"[Open embedded viewer in new tab]({_viewer_three_url})") - _layer_payload = _build_three_layer_payload(df_frame) - _payload_json = json.dumps(_layer_payload, ensure_ascii=True) - _payload_b64 = _payload_json.encode("utf-8").hex() - with st.expander("Three.js layer debug", expanded=False): - st.write( - { - "viewer_url": _viewer_three_url, - "payload_type": _layer_payload.get("type"), - "gt_count": len(_layer_payload.get("gt", [])), - "pred_count": 
len(_layer_payload.get("pred", [])), - "matched_pairs_count": len(_layer_payload.get("matched_pairs", [])), - } - ) - _viewer_three_h = 700 - _iframe_src = html.escape(_viewer_three_url, quote=True) - components.html( - ( - f'' - "" - ), - height=_viewer_three_h + 24, - scrolling=True, - ) + st.caption("**3D viewer** (Three.js, GT / pred / matched layers) lives on a dedicated page.") + c3d_a, c3d_b = st.columns([1, 2]) + with c3d_a: + st.page_link("pages/5_T4_3D_Viewer.py", label="Open T4 3D Viewer", icon="🧊") + with c3d_b: + st.markdown(f"[Open `/viewer/three` in new tab]({_viewer_three_url})") if not _av.get("ok") or not _av.get("available"): pass diff --git a/evaluation_dashboard_app/pages/5_T4_3D_Viewer.py b/evaluation_dashboard_app/pages/5_T4_3D_Viewer.py new file mode 100644 index 0000000..b65358d --- /dev/null +++ b/evaluation_dashboard_app/pages/5_T4_3D_Viewer.py @@ -0,0 +1,456 @@ +"""T4 dataset Three.js viewer: GT / prediction / matched 3D boxes via postMessage to `/viewer/three`.""" + +import duckdb +import requests +import streamlit as st +import numpy as np +import pandas as pd +import os +from pathlib import Path +from typing import Any, List + +from lib.path_utils import path_display +from lib.page_chrome import inject_app_page_styles, render_loaded_data_section, render_page_hero +from lib.t4_dataset_embed import t4_share_query_params +from lib.t4_three_layers import ( + build_three_layer_payload_all_frames, + render_t4_three_js_embed, + resolve_t4_dataset_id, + resolve_t4_scenario, +) +from lib.t4_visualizer_client import ( + DEFAULT_BASE_URL, + ENV_BASE_URL, + T4VisualizerClient, + T4VisualizerError, +) + +st.set_page_config( + layout="wide", + page_title="T4 3D Viewer", + page_icon="🧊", + initial_sidebar_state="expanded", +) +inject_app_page_styles() + +# ============================= +# Session state from Overview (run path) +# ============================= +if "runA" not in st.session_state: + st.warning("Please load data from the **Overview** 
page first (select mode and run(s)).") + st.stop() + +runA = st.session_state["runA"] +mode = st.session_state.get("mode", "Single Mode") +if mode == "Compare Mode": + all_runs = st.session_state.get("all_runs") + run_labels_state = st.session_state.get("run_labels") + if all_runs and run_labels_state and len(all_runs) >= 2: + runs = all_runs + run_labels_list = run_labels_state + else: + runB = st.session_state.get("runB") + runs = [runA] if runB is None else [runA, runB] + run_labels_list = ["A"] if len(runs) == 1 else ["A", "B"] +else: + runs = [runA] + run_labels_list = ["A"] + + +def list_parquets_in_run(run_path) -> List[str]: + p = Path(run_path) + if not p.is_dir(): + return [] + return sorted([str(f.resolve()) for f in p.glob("*.parquet")]) + + +parquet_lists = [list_parquets_in_run(r["path"]) for r in runs] +for i, (r, pl) in enumerate(zip(runs, parquet_lists)): + if not pl: + lbl = run_labels_list[i] if i < len(run_labels_list) else str(i) + st.error( + f"No parquet files in run ({lbl}): {path_display(r['path'])}. " + "Add a .parquet file or generate one from the Download page." + ) + st.stop() + +multi_run = len(runs) >= 2 + +_ld_entries = [] +for i, r in enumerate(runs): + lbl = run_labels_list[i] if i < len(run_labels_list) else str(i) + if lbl == "A": + _ltitle = "Baseline · A" + else: + _ltitle = f"Candidate · {lbl}" + _ld_entries.append((_ltitle, path_display(r["path"]))) +render_loaded_data_section(_ld_entries) +render_page_hero( + kicker="T4 visualizer", + title="T4 3D bounding box viewer", + description=( + "Embedded **Three.js** view with GT, prediction (EST), and UUID-matched pairs from parquet (**postMessage**). " + "Scrub **time inside the viewer** (bottom slider); eval boxes follow that frame. Same filters as the BEV page." 
+ ), + mode=mode, +) + +# ---------------------------- +# Sidebar (Filters) — shared keys with Bounding Box Viewer +# ---------------------------- +with st.sidebar: + st.markdown("##### Filters") + st.caption("Same scene / topic / labels as the BEV viewer. Frame / playback: use the **3D viewer** controls.") + + if multi_run: + runs_to_show = st.multiselect( + "Runs to show", + run_labels_list, + default=run_labels_list, + key="bbox_viewer_runs_to_show", + ) + if not runs_to_show: + st.warning("Select at least one run.") + st.stop() + else: + runs_to_show = run_labels_list + + selected_files = {} + for i, lbl in enumerate(run_labels_list): + if lbl not in runs_to_show: + continue + pl = parquet_lists[i] + if len(pl) == 1: + selected_files[lbl] = pl[0] + else: + selected_files[lbl] = st.selectbox( + f"File (Run {lbl})", + pl, + format_func=os.path.basename, + key=f"bbox_viewer_file_{lbl}", + ) + + first_shown = runs_to_show[0] if runs_to_show else run_labels_list[0] + filter_file = selected_files.get(first_shown) or parquet_lists[run_labels_list.index(first_shown)][0] + +con = duckdb.connect() + +cols = con.execute("DESCRIBE SELECT * FROM parquet_scan(?)", [filter_file]).df()["column_name"].tolist() +has_visibility = "visibility" in cols +has_suite_name = "suite_name" in cols +has_scenario_name = "scenario_name" in cols +has_t4dataset_name = "t4dataset_name" in cols +hover_extra_cols = [c for c in ["z", "height", "vx", "vy", "confidence", "pointcloud_num"] if c in cols] + +scene_where = "1=1" +scene_params: List[str] = [filter_file] + +if has_suite_name: + suite_list = con.execute( + "SELECT DISTINCT suite_name AS v FROM parquet_scan(?) 
WHERE suite_name IS NOT NULL ORDER BY v", + [filter_file], + ).df()["v"].dropna().astype(str).tolist() +else: + suite_list = [] + +if "bbox_viewer_link_suite" in st.session_state: + _lsu = st.session_state.pop("bbox_viewer_link_suite", None) + if suite_list and _lsu is not None and str(_lsu) in suite_list: + st.session_state["bbox_viewer_suite"] = str(_lsu) + +with st.sidebar: + selected_suite = None + selected_scenario = None + if suite_list: + selected_suite = st.selectbox( + "Suite name", + suite_list, + key="bbox_viewer_suite", + ) + if has_scenario_name: + if selected_suite is not None: + scenario_list = con.execute( + "SELECT DISTINCT scenario_name AS v FROM parquet_scan(?) WHERE suite_name = ? AND scenario_name IS NOT NULL ORDER BY v", + [filter_file, selected_suite], + ).df()["v"].dropna().astype(str).tolist() + else: + scenario_list = con.execute( + "SELECT DISTINCT scenario_name AS v FROM parquet_scan(?) WHERE scenario_name IS NOT NULL ORDER BY v", + [filter_file], + ).df()["v"].dropna().astype(str).tolist() + if scenario_list: + if "bbox_viewer_link_scenario" in st.session_state: + _lsc = st.session_state.pop("bbox_viewer_link_scenario", None) + if _lsc is not None and str(_lsc) in scenario_list: + st.session_state["bbox_viewer_scenario"] = str(_lsc) + selected_scenario = st.selectbox( + "Scenario name", + scenario_list, + key="bbox_viewer_scenario", + ) + t4dataset_list: List[str] = [] + if has_t4dataset_name: + t4_where_parts = ["t4dataset_name IS NOT NULL"] + t4_params: List[Any] = [filter_file] + if selected_suite is not None: + t4_where_parts.insert(0, "suite_name = ?") + t4_params.append(selected_suite) + if selected_scenario is not None: + t4_where_parts.insert(0, "scenario_name = ?") + t4_params.insert(1, selected_scenario) + t4_where = " AND ".join(t4_where_parts) + t4dataset_list = con.execute( + f"SELECT DISTINCT t4dataset_name AS v FROM parquet_scan(?) 
WHERE {t4_where} ORDER BY v", + t4_params, + ).df()["v"].dropna().astype(str).tolist() + has_multiple_t4dataset = len(t4dataset_list) > 1 + selected_t4dataset = None + if has_multiple_t4dataset and t4dataset_list: + if "bbox_viewer_link_t4dataset" in st.session_state: + _lt4 = st.session_state.pop("bbox_viewer_link_t4dataset", None) + if _lt4 is not None and str(_lt4) in t4dataset_list: + st.session_state["bbox_viewer_t4dataset"] = str(_lt4) + selected_t4dataset = st.selectbox( + "t4dataset_name", + t4dataset_list, + key="bbox_viewer_t4dataset", + ) + +if selected_suite is not None: + scene_where = "suite_name = ?" + scene_params = [filter_file, selected_suite] +if selected_scenario is not None: + scene_where = scene_where + " AND scenario_name = ?" if scene_where != "1=1" else "scenario_name = ?" + scene_params = scene_params + [selected_scenario] +if selected_t4dataset is not None: + scene_where = scene_where + " AND t4dataset_name = ?" if scene_where != "1=1" else "t4dataset_name = ?" + scene_params = scene_params + [selected_t4dataset] +if scene_where == "1=1": + scene_params = [filter_file] + +topic_names = con.execute( + f"SELECT DISTINCT topic_name AS v FROM parquet_scan(?) WHERE {scene_where} ORDER BY v", + scene_params, +).df()["v"].dropna().tolist() +if not topic_names: + for key in ( + "bbox_viewer_scenario", + "bbox_viewer_suite", + "bbox_viewer_link_suite", + "bbox_viewer_link_scenario", + "bbox_viewer_link_t4dataset", + ): + if key in st.session_state: + del st.session_state[key] + st.warning( + "No topic_name for the selected scene (from Detection Stats link). " + "Cleared scene selection; please choose a scene from the sidebar." + ) + st.rerun() + +with st.sidebar: + selected_topic = st.selectbox("topic_name (single)", topic_names) + +labels = con.execute( + f"SELECT DISTINCT label AS v FROM parquet_scan(?) WHERE {scene_where} AND topic_name=? 
ORDER BY v", + scene_params + [selected_topic], +).df()["v"].dropna().tolist() +if not labels: + st.warning("No label for selected topic.") + st.stop() + +with st.sidebar: + selected_labels = st.multiselect("label(s)", labels, default=labels) + +selected_visibility = None +if has_visibility: + vis_list = con.execute( + f"SELECT DISTINCT COALESCE(visibility,'UNKNOWN') AS v FROM parquet_scan(?) WHERE {scene_where} AND topic_name=? ORDER BY v", + scene_params + [selected_topic], + ).df()["v"].tolist() + with st.sidebar: + if vis_list: + selected_visibility = st.multiselect("visibility", vis_list, default=vis_list) + else: + st.info("No visibility values found — skipping.") +else: + with st.sidebar: + st.info("No 'visibility' column found — skipping visibility filter.") + +if not selected_labels: + st.warning("No label selected.") + st.stop() + +with st.sidebar: + st.markdown("##### T4 server") + st.caption("**GET /datasets/{id}/availability** must succeed before the iframe loads.") + if "bbox_t4_base_url" not in st.session_state: + st.session_state["bbox_t4_base_url"] = ( + (os.environ.get(ENV_BASE_URL) or DEFAULT_BASE_URL).strip() or DEFAULT_BASE_URL + ) + st.text_input( + "T4 server base URL", + key="bbox_t4_base_url", + help=f"Default from env `{ENV_BASE_URL}`. 
Embeds `/viewer/three` and posts GT / pred / matched bbox layers.", + ) + +# ---------------------------- +# Load data (same SQL as Bounding Box Viewer) +# ---------------------------- +where = [scene_where, "topic_name = ?"] +params = scene_params + [selected_topic] +where.append(f"label IN ({','.join(['?']*len(selected_labels))})") +params.extend(selected_labels) + +select_vis = ", visibility" if has_visibility else "" +if has_visibility and selected_visibility: + where.append(f"COALESCE(visibility,'UNKNOWN') IN ({','.join(['?']*len(selected_visibility))})") + params.extend(selected_visibility) + +select_extras = (", " + ", ".join(hover_extra_cols)) if hover_extra_cols else "" +_geom_for_t4 = [c for c in ("z", "height") if c in cols and c not in hover_extra_cols] +_geom_select = (", " + ", ".join(_geom_for_t4)) if _geom_for_t4 else "" +_t4_meta_cols = [c for c in ("t4dataset_id", "t4dataset_name", "scenario_name") if c in cols] +_t4_meta_select = (", " + ", ".join(_t4_meta_cols)) if _t4_meta_cols else "" +sql = f""" +SELECT frame_index, x, y, length, width, yaw, label, topic_name, source, status, uuid +{select_vis}{select_extras}{_geom_select}{_t4_meta_select} +FROM parquet_scan(?) 
+WHERE {" AND ".join(where)} +ORDER BY frame_index +""" + +files_to_load: List[tuple] = [(selected_files[lbl], lbl) for lbl in runs_to_show if lbl in selected_files] +base_params = scene_params[1:] + [selected_topic] + list(selected_labels) +if has_visibility and selected_visibility: + base_params = base_params + list(selected_visibility) + +dfs = [] +for file_path, run_label in files_to_load: + qparams = [file_path] + base_params + df_part = con.execute(sql, qparams).df() + if not df_part.empty: + df_part = df_part.copy() + df_part["run"] = run_label + dfs.append(df_part) + +if not dfs: + st.warning("No data matches the selected filters.") + st.stop() + +df = pd.concat(dfs, ignore_index=True) +if len(files_to_load) == 1: + df["run"] = df["run"].iloc[0] + +if "frame_index" in df.columns and not np.issubdtype(df["frame_index"].dtype, np.integer): + df["frame_index"] = ( + pd.to_numeric(df["frame_index"], errors="coerce").fillna(0).astype(int) + ) + +if len(files_to_load) == 1: + st.info(f"**Currently showing:** Run {files_to_load[0][1]} only") +else: + run_names = [f[1] for f in files_to_load] + st.info(f"**Currently showing:** Runs {', '.join(run_names)} — 3D layers include boxes from all selected runs.") + +f_min, f_max = int(df.frame_index.min()), int(df.frame_index.max()) + +# One reference slice for resolving t4dataset_id / scenario_name (same as iframe entry frame). 
+_ref_frame = f_min +df_frame = df[df.frame_index == _ref_frame] +if df_frame.empty and not df.empty: + df_frame = df.iloc[:1].copy() + +# ---------------------------- +# T4 Three.js embed +# ---------------------------- +base_url_t4 = (st.session_state.get("bbox_t4_base_url") or "").strip() or DEFAULT_BASE_URL + +_ds_t4 = resolve_t4_dataset_id(df_frame) +if not _ds_t4 and selected_t4dataset is not None: + _ds_t4 = str(selected_t4dataset) +_sc_t4 = resolve_t4_scenario(df_frame, selected_scenario) + +if not _ds_t4: + for _k in ( + "bbox_t4_last_images", + "bbox_t4_last_meta", + "bbox_t4_success_key", + "bbox_t4_error_key", + "bbox_t4_error_msg", + "bbox_t4_availability", + ): + st.session_state.pop(_k, None) + st.warning( + "Cannot resolve a T4 dataset id for this frame. Needs parquet **t4dataset_id** or **t4dataset_name**, " + f"or **t4dataset_name** in the sidebar when multiple datasets exist. Set **T4 server base URL** or `{ENV_BASE_URL}`." + ) +else: + _t4_avail_cache_key = f"{base_url_t4.rstrip('/')}|{_ds_t4}" + _cached_av = st.session_state.get("bbox_t4_availability") + _need_avail_fetch = _cached_av is None or _cached_av.get("cache_key") != _t4_avail_cache_key + if _need_avail_fetch: + try: + with st.spinner("Checking T4 dataset on the server…"): + _av_client = T4VisualizerClient(base_url=base_url_t4, timeout=30.0) + _av_data = _av_client.dataset_availability(_ds_t4) + st.session_state["bbox_t4_availability"] = { + "cache_key": _t4_avail_cache_key, + "ok": True, + "available": bool(_av_data.get("available")), + "data": _av_data, + "error": None, + } + except T4VisualizerError as ex: + st.session_state["bbox_t4_availability"] = { + "cache_key": _t4_avail_cache_key, + "ok": False, + "available": False, + "data": None, + "error": f"T4 server error ({ex.status_code}): {ex}", + } + except (OSError, requests.RequestException) as ex: + st.session_state["bbox_t4_availability"] = { + "cache_key": _t4_avail_cache_key, + "ok": False, + "available": False, + "data": 
None, + "error": f"Network error: {ex}", + } + except Exception as ex: + st.session_state["bbox_t4_availability"] = { + "cache_key": _t4_avail_cache_key, + "ok": False, + "available": False, + "data": None, + "error": f"Availability check failed: {ex}", + } + + _av = st.session_state.get("bbox_t4_availability") or {} + + if not _av.get("ok"): + st.error("Could not verify the dataset on the T4 visualizer server.") + with st.expander("Details", expanded=False): + st.markdown(_av.get("error") or "Unknown error.") + elif not _av.get("available"): + st.warning("This dataset is not available on the visualizer server host.") + with st.expander("Details", expanded=False): + _d = _av.get("data") + if isinstance(_d, dict) and _d: + st.json(_d) + else: + st.markdown( + "The server reported **available: false** (no local dataset path for this id on the machine " + "running `t4-server`)." + ) + else: + # Fixed entry frame so Streamlit slider does not reload the iframe; eval layers use bbox_layers_by_frame. + _iframe_entry_frame = int(df["frame_index"].min()) + _q_three = t4_share_query_params(_ds_t4, _sc_t4, _iframe_entry_frame) + _viewer_three_url = f"{base_url_t4.rstrip('/')}/viewer/three?{_q_three}" + _layer_payload = build_three_layer_payload_all_frames(df) + + _viewer_three_h = 1200 + render_t4_three_js_embed(_viewer_three_url, _layer_payload, height=_viewer_three_h) + +st.page_link("pages/4_Bounding_Box_Viewer.py", label="Back to Bounding Box & BEV viewer", icon="🖼️") From 9351490e8e7333cb6f82461ab3abbb572a2afc8a Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Thu, 2 Apr 2026 18:34:38 +0900 Subject: [PATCH 17/94] feat: enhance iframe integration in T4 three viewer - Updated the iframe element in `t4_three_layers.py` to include the `allowfullscreen` attribute, improving user experience by enabling fullscreen mode for the T4 three viewer. - Added `allow` attribute to specify permissions for fullscreen functionality, enhancing the viewer's usability. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/lib/t4_three_layers.py | 1 + evaluation_dashboard_app/pages/5_T4_3D_Viewer.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/evaluation_dashboard_app/lib/t4_three_layers.py b/evaluation_dashboard_app/lib/t4_three_layers.py index de18291..8aa37a3 100644 --- a/evaluation_dashboard_app/lib/t4_three_layers.py +++ b/evaluation_dashboard_app/lib/t4_three_layers.py @@ -114,6 +114,7 @@ def render_t4_three_js_embed(viewer_three_url: str, layer_payload: dict, height: ( f'' "" + ), + height=height + 8, + scrolling=False, + ) + + def _render_tlr_viewer_tab(detail_sources: dict[str, pd.DataFrame | None], *, key_prefix: str) -> None: st.subheader("Embedded traffic light viewer") st.caption("Pick a dataset from the current TLR details, then load the external `/viewer/tlr` page inline.") @@ -144,6 +251,7 @@ def _render_tlr_viewer_tab(detail_sources: dict[str, pd.DataFrame | None], *, ke dataset_rows = dataset_rows.sort_values(["scenario", "frame_index"]).reset_index(drop=True) selected_row = dataset_rows.iloc[0] selected_frame = int(selected_row["frame_index"]) + payload = _build_tlr_eval_payload_by_frame(dataset_rows) viewer_url = f"{base_url.rstrip('/')}/viewer/tlr?t4dataset_id={quote(selected_dataset, safe='')}&frame_index={selected_frame}" st.markdown(f"[Open `/viewer/tlr` in new tab]({viewer_url})") @@ -157,14 +265,7 @@ def _render_tlr_viewer_tab(detail_sources: dict[str, pd.DataFrame | None], *, ke with st.expander("Matching rows", expanded=False): st.dataframe(dataset_rows[preview_cols].sort_values(["scenario", "frame_index"]), width="stretch", hide_index=True) - iframe_h = 1400 - components.html( - f'', - height=iframe_h + 8, - scrolling=False, - ) + _render_tlr_viewer_embed(viewer_url, payload, iframe_id=f"{key_prefix}_iframe", height=1600) def _render_single_tabs(analyzer, tab_criteria, tab_vehicle, tab_critical, tab_details, tab_tlr_viewer): From 
dad0b693b58054f36b2f24431841caa320776a18 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Tue, 21 Apr 2026 15:13:00 +0900 Subject: [PATCH 39/94] feat: add filtering and sorting options to TLR analysis details - Enhanced the TLR analysis page by introducing filters for scenarios, vehicle statuses, and traffic light types, allowing users to narrow down displayed data. - Implemented sorting options for the filtered results, improving data organization and accessibility. - Updated download functionality to reflect filtered data in CSV and JSON exports, ensuring consistency in user experience. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../pages/9_TLR_Analysis.py | 72 ++++++++++++++++++- 1 file changed, 69 insertions(+), 3 deletions(-) diff --git a/evaluation_dashboard_app/pages/9_TLR_Analysis.py b/evaluation_dashboard_app/pages/9_TLR_Analysis.py index b6e449a..6f9519b 100644 --- a/evaluation_dashboard_app/pages/9_TLR_Analysis.py +++ b/evaluation_dashboard_app/pages/9_TLR_Analysis.py @@ -326,12 +326,78 @@ def _render_single_tabs(analyzer, tab_criteria, tab_vehicle, tab_critical, tab_d details_df = analyzer.get_vehicle_status_details_df() if details_df is not None and not details_df.empty: st.caption("One row per frame. 
Use filters to narrow down by scenario, status, or traffic light type.") - st.dataframe(details_df, width='stretch', hide_index=True) + filtered_details = details_df.copy() + all_scenarios = sorted(filtered_details["scenario"].dropna().astype(str).unique().tolist()) + all_statuses = sorted(filtered_details["status"].dropna().astype(str).unique().tolist()) + all_tlr_types = sorted(filtered_details["traffic_light_type"].dropna().astype(str).unique().tolist()) + + with st.expander("Filters & sort", expanded=False): + f1, f2, f3 = st.columns(3) + with f1: + sel_scenarios = st.multiselect( + "Scenario(s)", + options=all_scenarios, + default=[], + key="tlr_single_tab_filter_scenario", + help="Leave empty to show all scenarios.", + ) + with f2: + sel_statuses = st.multiselect( + "Vehicle status", + options=all_statuses, + default=[], + key="tlr_single_tab_filter_status", + help="Leave empty to show all statuses.", + ) + with f3: + sel_tlr_types = st.multiselect( + "Traffic light type", + options=all_tlr_types, + default=[], + key="tlr_single_tab_filter_tlr_type", + help="Leave empty to show all traffic light types.", + ) + sort_by = st.selectbox( + "Sort by", + [ + "Scenario, then frame index", + "Frame index only", + "Vehicle status, then scenario, frame index", + "Traffic light type, then scenario, frame index", + ], + key="tlr_single_tab_sort_by", + ) + + if sel_scenarios: + filtered_details = filtered_details[filtered_details["scenario"].astype(str).isin(sel_scenarios)] + if sel_statuses: + filtered_details = filtered_details[filtered_details["status"].astype(str).isin(sel_statuses)] + if sel_tlr_types: + filtered_details = filtered_details[ + filtered_details["traffic_light_type"].astype(str).isin(sel_tlr_types) + ] + + if sort_by == "Scenario, then frame index": + filtered_details = filtered_details.sort_values(["scenario", "frame_index"]).reset_index(drop=True) + elif sort_by == "Frame index only": + filtered_details = filtered_details.sort_values(["frame_index", 
"scenario"]).reset_index(drop=True) + elif sort_by == "Vehicle status, then scenario, frame index": + filtered_details = filtered_details.sort_values(["status", "scenario", "frame_index"]).reset_index(drop=True) + else: + filtered_details = filtered_details.sort_values( + ["traffic_light_type", "scenario", "frame_index"] + ).reset_index(drop=True) + + st.dataframe(filtered_details, width='stretch', hide_index=True) + caption = f"Showing **{len(filtered_details)}** frame(s). Total before filters: {len(details_df)}." + if sel_scenarios or sel_statuses or sel_tlr_types: + caption += " Filters applied." + st.caption(caption) dl_col_csv, dl_col_json = st.columns(2) with dl_col_csv: st.download_button( "Download as CSV", - data=details_df.to_csv(index=False).encode("utf-8"), + data=filtered_details.to_csv(index=False).encode("utf-8"), file_name="tlr_details.csv", mime="text/csv", key="tlr_dl_single_tab_csv", @@ -339,7 +405,7 @@ def _render_single_tabs(analyzer, tab_criteria, tab_vehicle, tab_critical, tab_d with dl_col_json: st.download_button( "Download as JSON", - data=_dataframe_to_json_bytes(details_df, export_kind="single_dataset_details"), + data=_dataframe_to_json_bytes(filtered_details, export_kind="single_dataset_details"), file_name="tlr_details.json", mime="application/json", key="tlr_dl_single_tab_json", From 1ec74b67498bbb10ffb2037d29ceb769a323bae1 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Wed, 22 Apr 2026 15:33:05 +0900 Subject: [PATCH 40/94] feat: add English and Japanese README support in Help page - Introduced a language selection feature in the Help page, allowing users to switch between English and Japanese README files. - Updated the description to reflect the new language toggle functionality. - Ensured that the correct README file is loaded based on user selection, enhancing accessibility for diverse users. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/Readme.en.md | 372 ++++++++++++++++++++++ evaluation_dashboard_app/pages/10_Help.py | 21 +- 2 files changed, 390 insertions(+), 3 deletions(-) create mode 100644 evaluation_dashboard_app/Readme.en.md diff --git a/evaluation_dashboard_app/Readme.en.md b/evaluation_dashboard_app/Readme.en.md new file mode 100644 index 0000000..5d94cf1 --- /dev/null +++ b/evaluation_dashboard_app/Readme.en.md @@ -0,0 +1,372 @@ +# Evaluation Dashboard + +## Required Installation + +This dashboard and evaluation tool require the following prerequisites and Python packages. + +### Python packages (local development / full functionality) +The easiest way is to install from the single `requirements.txt` at the repository root, including private dependencies. + +```sh +cd evaluation_dashboard_app +pip install -r requirements.txt +``` + +Example if you want to install packages manually in separate steps: + +```sh +# Basic +pip install \ + streamlit pandas plotly duckdb numpy \ + requests pyyaml matplotlib shapely + +# Download / Scenario API authentication +pip install git+ssh://git@github.com/tier4/webauto-auth-py.git + +# Production task queue (when USE_TASK_QUEUE=true) +pip install rq psycopg2-binary +``` + +In the **Docker image**, public dependencies are installed from [`requirements-docker.txt`](requirements-docker.txt), and private packages such as `webauto-auth` and the evaluation dependencies are added during build time using SSH secrets (see [`Dockerfile`](Dockerfile)). + +PDF export uses Plotly/Kaleido static image rendering, so **Chrome is also installed in the Docker image**. If you see `Kaleido requires Google Chrome to be installed` in the deployment environment, **rebuild and redeploy** with the latest image. 
+ +```sh +# Install CLI tool (if you use it for generating evaluation command lines) +pipx install git+ssh://git@github.com/tier4/v_and_v_util.git +``` + +### pilot-auto / perception_eval (only needed when generating Summary / Score) +- A pilot-auto environment with `perception_eval` available is required. See "Usage" below. +- If importing `perception_eval` fails, generation of `Summary.csv` / `Score.csv` stops. + +### Configuration file +- Input values are saved in `configs/autoware_evaluator_dl_config.json` (created / updated automatically). + +## Overview +This is an evaluation dashboard built with Streamlit. It reads evaluation results under `data/` (`Summary.csv`, `Score.csv`, `.parquet`) and visualizes them across multiple pages. In addition, `pages/6_Download.py` supports bulk collection of evaluation results such as `result.txt`, automatic generation of `Summary.csv` / `Score.csv`, and searching / downloading result directories. The **TLR (Traffic Light Recognition) Analysis** page can visualize criteria matrices, vehicle state vs. signal type, important zones, and more for traffic-light recognition evaluation. To use it, you must first download scenario data from **tab 2 "Download Scenarios"** on the Download page. + +## Usage + +1. To generate summary or score files from `pages/6_Download.py` ("Generate Summary.csv / Score.csv"), you must **activate the pilot-auto (ROS 2) environment in advance** with the following command: + ``` + source path_to_pilot/install/setup.sh + ``` + This step is required for "Summary / Score CSV generation" in `pages/6_Download.py`. + +2. Start Streamlit from `evaluation_dashboard_app/`. + ``` + streamlit run Overview.py + ``` + +3. Choose pages and filters from the sidebar to explore the data. + +### Visualization quick start (recommended workflow) + +The recommended flow from downloading logs for a test to generating summaries and then reviewing the details in Overview is the following three-step process: + +1. 
**Download the target test logs from the Download page** +2. **Generate summary / score files from "Eval Results" on the Download page** +3. **Select that log (Run) on the Overview page and inspect the details** + +Below is a summary of what to do and what to watch out for in each step. + +#### Step 1: Download logs from the Download page + +- **Page**: Open **Download** (`6_Download.py`) from the sidebar. +- **Tab**: Select **"Download Results"**. +- **Inputs**: + - Enter **Project ID** and **Job ID**. Optionally specify a Suite ID if needed. + - For **Output Path**, specify **a folder dedicated to this test**. + To make it show up as a selectable "Run" in Overview, it is recommended to place one folder per test directly under `data/`. + Example: `./data/my_test_20250203` +- **Download Type**: + - **Archives (ZIP)**: Downloads ZIP archives, extracts them, and takes data for the selected phase. Suitable for full local analysis. + - **Result JSON only**: Downloads only the result JSON. Lightweight and useful when you only want summary / score generation. +- **Run**: Click "Download Results" and wait for completion. +- **Result**: Under the specified Output Path, logs and, when needed, source files such as `result.txt` and `score.json` are stored in a directory structure based on the job / suite. + +![Download page settings (Download Results tab)](docs/images/download_config.png) + +![After download finishes](docs/images/download_result.png) + +#### Step 2: Generate summary analysis results in Eval Results + +- **Page**: Stay on the same **Download** page. +- **Tab**: Switch to **"Eval Results (per directory)"** or **"Eval Results"**. +- **Root directory to evaluate**: + - Specify **the same path used as Output Path in Step 1**. + Example: `./data/my_test_20250203` +- **Options**: + - **Search subdirectories**: Searches subdirectories for `result.txt` / `score.json`. Usually this should be enabled. 
+ - **Only generate Summary.csv and Score.csv**: + If each directory already contains `result.txt` or `score.json`, enabling this skips re-running `perception_eval` and generates **only `Summary.csv` and `Score.csv`** from the existing results. + On the first run, if `result.txt` and related outputs do not exist yet, leave this unchecked and run the full evaluation with "Run eval_result for all directories". +- **Run**: + - Click either "Run eval_result for all directories" or "Generate Summary and Score CSV only". +- **Result**: **`Summary.csv` and `Score.csv`** are generated directly under the specified root directory. + These files are the "summary analysis results" used by Overview and pages such as TP Summary and Criteria Based Score. + +![Eval Results tab (summary / score generation)](docs/images/eval_result.png) + +If `perception_eval` is used during Summary / Score generation, you must run `source path_to_pilot/install/setup.sh` in advance as described in "Usage". + +#### Step 3: Select the log in Overview and inspect the details + +- **Page**: Open **Overview** (`Overview.py`) from the sidebar. +- **Selecting a Run**: + - Overview treats **each direct subdirectory under `data/`** as one "Run". + - If the Output Path in Step 1 was `./data/<test_name>`, that `<test_name>` appears in the sidebar dropdown for **"Baseline (A)"**. + - Choose the log (Run) you want to inspect in **Baseline (A)**. + If you want to compare runs, switch to **Compare Mode** and choose another Run in **Candidate (B)**. +- **Displayed contents**: + - Overall metrics based on the selected Run's **Summary.csv** are shown, such as TP mean and XRMS / YRMS / XSTD / YSTD. + - By filtering with Perception Label / Product Label, you can inspect label-specific TP and metric breakdowns.
+ - Other pages such as TP Summary, Criteria Based Score, Detection Stats, and Bounding Box Viewer share the Run selected in Overview through `st.session_state`, so it is best to **select the Run in Overview first** and then move to the detailed pages. + +![Overview page (Run selection and metrics display)](docs/images/overview.png) + +**Key point**: +- Whenever you add a new test, use `./data/<test_name>` as the Output Path in Download, then use that same path in Eval Results to generate Summary / Score. The new test will appear in the Overview Run list, and you can inspect it immediately. + +## Main Features +- Select a Run on the Overview page, switch between single-run and compare mode, and display overall metrics +- When the production task queue is enabled, track heavy jobs from the UI such as "Recent tasks" +- TP / position / velocity statistical viewers (scatter plots and distributions) +- Criteria-based evaluation viewer (metric distributions, averages, and box plots) +- Detection statistics comparison viewer (for example TP / FP distance-bin comparison) +- BEV bounding-box visualization +- TLR (Traffic Light Recognition) evaluation analysis: criteria matrices, vehicle state vs. signal type, important zones. Requires scenario data downloaded from tab 2 of the Download page. +- Evaluation command generation tool +- **Docker production**: Navigate from Overview to **Deployment debug** (Postgres / Redis / RQ and optional Docker operations) + +## Directory Structure +```text +evaluation_dashboard_app/ + Overview.py + pages/ + 1_TP_Summary.py … 10_Help.py, 99_Deployment_Debug.py (sidebar order follows the page numbers) + lib/ + worker/ # Production: RQ tasks and worker entrypoint + configs/ + autoware_evaluator_dl_config.json + deploy/ # Production: compose, nginx, numbered shell steps + docker-compose.yml + .env.example + 01_SETUP_ENV.sh ...
09_RESTART_WORKER.sh + configs/ + autoware_evaluator_dl_config.json # Mounted inside the container at /app/docker_config during compose runs + nginx/ + data/ + <run_name>/ + Summary.csv + Score.csv + *.parquet +``` + +## Page Guide + +The sidebar order follows the numbering of **`number_name.py` files directly under `pages/`**. **Deployment debug** (`99_Deployment_Debug.py`) must stay directly under `pages/` because it is registered through `st.page_link`. Outside Docker, `inject_app_page_styles` hides that sidebar item with CSS. Inside Docker, there is an explicit link from **Overview**. + +Many visualization pages rely on `st.session_state`, so it is best to **select the mode (single / compare) and Run in Overview first**. In compare mode, Baseline (A) and Candidate (B...) are shared across pages. + +### `Overview.py` (entry point) +- Starting point for **shared filters** such as single / compare mode, Run selection, and Perception / Product labels. +- **Shareable URL**: The same view can be reproduced using query parameters like `mode`, `run_a`, `run_b`, and so on. Some other pages follow the same pattern. +- When running in Docker, the sidebar shows a link to **Deployment debug** (`pages/99_Deployment_Debug.py`). + +### `pages/1_TP_Summary.py` +- **Prerequisite**: Data must already be loaded in Overview. **`Summary.csv` is required**. If a Run does not have it, TP Summary is unavailable, while Detection Stats / BB Viewer can still work with only parquet files and show guidance accordingly. +- In compare mode, **deltas between runs** can be reflected in plots. +- `TP` range, velocity outlier clipping, scatter plots (`xrms`-`yrms`, `vx`-`vy`), and distribution histograms. + +### `pages/2_Criteria_Based_Score.py` +- A criteria evaluation viewer based on **`Score.csv`**. Follows the mode selected in Overview. +- Criteria block switching, metric distributions, group averages, box plots, and scenario-level comparisons.
+- Includes UI for **Absolute gates** (sign-off by threshold pass / fail) and gate comparison across multiple Runs. + +### `pages/3_Detection_Stats.py` +- Aggregates detection evaluation data using **`.parquet` + DuckDB**. Supports filters, hierarchical views, scenario breakdown, and **comparison across multiple Runs** when Overview is in compare mode. +- Distance-bin comparison by status such as TP / FP and color schemes for perception diffs (improved / worsened). + +### `pages/4_Bounding_Box_Viewer.py` +- **Prerequisite**: A Run must already be selected in Overview. +- Displays bounding boxes on a **BEV** from `.parquet`. Supports filtering by t4dataset, topic, label, visibility, and more. In compare mode, it can handle multiple Runs. + +### `pages/5_Tools.py` +- Evaluation command generation tool +- Extract Job ID / Suite ID from Report / Suite URLs + +### `pages/6_Download.py` +- Main integration point with the evaluator. The **tabs** are organized as follows: + + | Tab | Contents | + |------|------| + | **Download Results** | Retrieve job results such as archive ZIPs or Result JSON. Output Path is restricted under the data root. | + | **Download Scenarios** | Download scenario data. Required by **TLR Analysis**. | + | **View Downloads** | Review downloaded jobs and scenarios. | + | **Eval Results** | Run evaluation or generate **Summary.csv / Score.csv** from `result.txt` / `score.json` under a root directory. | + +- When **`USE_TASK_QUEUE=true`** (Redis + Worker + Postgres), heavy work is queued to workers, and you can track status from the UI through **Recent tasks** and related sections. + +### `pages/7_Data_Management.py` +- Displays the list of Runs under the data root, including size, update time, and whether Summary / Score / Parquet files exist. +- Download outputs as a **ZIP**, copy **share links** for Overview, and **delete** Runs to manage storage in a multi-user server environment. 
+ +### `pages/8_Parquet_Debug.py` +- For development and troubleshooting. Reads **`.parquet` / `.pkl` / `result.json`** from file paths and shows schemas, keys, criteria state, and optional quick plots. +- Useful for debugging pipeline outputs inside the dashboard. + +### `pages/9_TLR_Analysis.py` +- **TLR (Traffic Light Recognition)** evaluation: criteria matrices, vehicle state vs. signal type, important zones, and more. Supports single / compare mode and **shareable URLs** such as `mode`, `path_a`, `path_b`. +- **Prerequisite**: Download scenario data from **Download Scenarios** on the **Download** page and select the TLR result directory as a Run. + +### `pages/10_Help.py` +- Displays the repository **README inside the app** so setup instructions, workflows, and documentation can be read directly in the browser. +- Since **Mermaid diagrams** in Markdown are not rendered by default in Streamlit, this page renders them with JavaScript (Mermaid.js). + +### `pages/99_Deployment_Debug.py` (Docker only) +- Available only when Streamlit is running **inside a container**. With local `streamlit run`, it stops at a guidance message. +- Because it must be registered as **`pages/*.py` directly under the folder** for `st.page_link`, the corresponding auto-navigation item is **hidden with CSS outside Docker**. In Docker, you can also open it from the **Overview** sidebar via "Deployment debug". +- Lets you inspect the state of Postgres / Redis / RQ, task counts, and, depending on configuration, the host Docker container list, recent logs, and restricted `docker exec`. +- In production, mounting the **Docker socket grants strong privileges**, so check the authentication, VPN, and `EVAL_DEPLOYMENT_DEBUG_*` settings in [docs/PRODUCTION_DEPLOYMENT.md](docs/PRODUCTION_DEPLOYMENT.md). 
+ +## Data Formats (high level) +- `Summary.csv`: `id`, `TP`, `xstd`, `xrms`, `ystd`, `yrms`, `vx`, `vy`, `perception_label`, `product_label` +- `Score.csv`: Criteria evaluation metric blocks (`Scenario`, `Option`, `GT_OBJ`, then `criteria0..n`) +- `.parquet`: Fields used for detection statistics / bounding-box viewing, such as `x`, `y`, `length`, `width`, `yaw`, `label`, `source`, `status` + +# Docker Usage Guide + +The image is **ROS-based**, so the container environment matches the host ROS environment. + +### Build Steps + +Because private repositories (`tier4/webauto-auth-py`, `tier4/v_and_v_util`) are used, you must provide a **GitHub SSH key** during build time. +Use `~/.ssh/id_rsa` directly. No ssh-agent is required. + +```sh +cd evaluation_dashboard_app + +# Recommended: add --no-cache if you want to rebuild with the latest dependencies every time. +# If ROS is Humble (can be omitted) +docker build --no-cache --secret id=ssh,src=$HOME/.ssh/id_rsa -t evaluation-dashboard . + +# If you want to switch ROS_DISTRO to Iron / Jazzy etc. +docker build --build-arg ROS_DISTRO=iron --secret id=ssh,src=$HOME/.ssh/id_rsa -t evaluation-dashboard . +``` + +### Production deployment + +For multi-user / production use, the recommended setup is **Nginx -> Streamlit -> Redis (task queue) -> Worker -> Postgres**. Heavy jobs such as downloads, evaluation, Summary / Score CSV generation, and parquet generation are executed by workers instead of the UI process, and task state is stored in Postgres. 
+ +**Target Architecture:** + +```mermaid +flowchart LR + subgraph clients [Clients] + Browser[Browser] + end + subgraph edge [Edge] + Nginx[Nginx] + end + subgraph app [App Tier] + S1[Streamlit 1] + S2[Streamlit 2] + end + subgraph infra [Infrastructure] + Redis[Redis] + Postgres[Postgres] + end + subgraph workers [Workers] + W1[Worker 1] + W2[Worker N] + end + Browser --> Nginx + Nginx --> S1 + Nginx --> S2 + S1 --> Redis + S2 --> Redis + S1 --> Postgres + S2 --> Postgres + Redis --> W1 + Redis --> W2 + W1 --> Postgres + W2 --> Postgres + W1 --> DataRoot[Data root] + W2 --> DataRoot +``` + +- **Build**: As described above in "Build Steps", run `docker build ... -t evaluation-dashboard .` in `evaluation_dashboard_app/`. The compose services `streamlit1` (default), optional `streamlit2` (`--profile ha`), and `worker` all use this image. +- **Recommended flow (`deploy/` numbered scripts)**: Move into `deploy/` and run the scripts in order. All of them use `docker compose --env-file .env`. + + | Script | Description | + |-----------|------| + | `01_SETUP_ENV.sh` | Create `.env` from `.env.example` if it does not exist. **You still edit it manually.** | + | `02_BUILD.sh` | Build the image. You can pass arguments such as `--no-cache`. | + | `03_INIT_DB.sh` | **First time only**: after Postgres starts, run `init_db` to create task tables. | + | `04_START.sh` | Start the stack. Default worker count comes from `.env` `EVAL_COMPOSE_SCALE_WORKER`; for example `./04_START.sh --scale worker=3` overrides it. | + | `05_STOP.sh` | Stop the stack. | + | `06_STATUS.sh` | Check service status. | + | `07_LOGS.sh` | Run `docker compose logs -f`. Without arguments it shows all services; for example `./07_LOGS.sh worker`. | + | `08_REBUILD_AND_START.sh` | Build and then start the stack, same startup behavior as `04_START.sh`. | + | `09_RESTART_WORKER.sh` | Restart workers so code changes are reflected on the worker side. 
| + +- **Manual setup is also possible**: `cd deploy && cp .env.example .env` -> edit `.env` -> `docker compose --env-file .env up -d`. For first-time setup only, run `docker compose --env-file .env run --rm init_db` (equivalent to `03_INIT_DB.sh`). +- **Access**: In production compose, **Nginx listens on port 80**, and Streamlit is accessed through the proxy (see `docker-compose.yml` / `nginx/nginx.conf`). Since the source code and `lib/` are mounted, **Streamlit reloads easily when files change**, but **workers must be restarted after Python code changes**. +- **If the UI keeps loading forever**: Streamlit communicates with the browser over **WebSocket**. Suggested checks: (1) do a **hard reload** including cache reset or reopen in another tab, (2) by default Nginx points only to **one Streamlit app** (`streamlit1`), and a second instance should be enabled only when needed with `docker compose --profile ha up -d` plus upstream changes in `nginx.conf`, (3) set **`STREAMLIT_SERVER_COOKIE_SECRET`** in `deploy/.env.example`, (4) use `.streamlit/config.toml` `enableWebsocketCompression = false` and Nginx `proxy_buffering off` plus suitable `proxy_*_timeout`, and (5) check logs with `docker compose logs streamlit1 nginx`. +- **502 Bad Gateway**: This happens when Nginx **cannot reach Streamlit** because the process exited, was killed by OOM, or stayed blocked for too long. Check `docker compose logs streamlit1` and host **`dmesg`** for OOM messages. Heavy pages can consume significant memory, so the **default single-instance setup** and the single upstream in `deploy/nginx/nginx.conf` are recommended. +- **Troubleshooting Detection Stats freezes / 502**: Set **`EVAL_DETECTION_STATS_DEBUG=1`** in `.env` so it is passed into the compose `streamlit1` service, then restart. 
The **Detection Stats debug** expander at the bottom of the page and the stderr of **`docker compose logs streamlit1`** will show section boundaries, `getrusage` memory values, and elapsed time before / after DuckDB calls. +- **If a subpage says "load in Overview" even though Overview was already opened**: Session state is stored **in memory per replica**. Overview also syncs `mode` / `run_a` / `run_b`... into the URL, so when those query parameters remain in the address bar, subpages such as Detection Stats can **rebuild `run_a` into `runA`** via `lib/overview_url_hydrate.py`. Open **Overview once**, confirm the address bar contains `run_a=`, then move to the subpage, or reopen from the **Overview share link**. +- **Avoid duplicate config management**: During compose runs, `deploy/configs/autoware_evaluator_dl_config.json` is mounted inside the container as `EVAL_DASHBOARD_CONFIG` (`/app/docker_config/...`). This is a separate file from the host `configs/` version, so edit the one under `deploy/configs/` for Docker-specific settings. +- For detailed settings and environment variables, see [docs/PRODUCTION_DEPLOYMENT.md](docs/PRODUCTION_DEPLOYMENT.md). + +### Startup and data mount (single container) + +Always mount the `data/` directory so data is persisted and visible. + +```sh +docker run -p 8501:8501 \ + -v "$(pwd)/data:/app/data" \ + -v ~/.webauto:/root/.webauto \ + evaluation-dashboard +``` + +### Example: run in background (`-d`) + +If you want to start the container in detached mode, add `-d` and optionally set `--name`. If you want to synchronize the entire `/app` tree, including code and notebooks, with the host, use the following form. 
+ +```sh +docker run -d --name evaluation-dashboard \ + -p 8501:8501 \ + -v "$(pwd):/app" \ + -v ~/.webauto:/root/.webauto \ + evaluation-dashboard +``` + +### Multi-user deployment + +If multiple people access the same server for downloads, evaluation, result review, sharing, and data management, refer to the following points. + +- **Data root**: You can set the evaluation data root with environment variable `EVAL_DASHBOARD_DATA_ROOT` (default is `data`). Example: `-e EVAL_DASHBOARD_DATA_ROOT=/var/eval_dashboard/data` +- **Path restriction**: The Download Output Path and Eval Root directory are restricted under this data root, and path traversal is rejected. +- **Data Management page**: Lets you view the Run list, show sizes, delete Runs, and copy share links. You can remove unnecessary Runs to manage disk usage. +- **Sharing results**: By adding `?mode=...&run_a=...&run_b=...` to the Overview URL, you can share the same Run view. Links can be copied from Data Management or "Share this view" in Overview. +- See [docs/MULTI_USER_DEPLOYMENT.md](docs/MULTI_USER_DEPLOYMENT.md) for more details. + +### Debugging and shell access + +If you want shell access inside a running container, use one of the following methods. + +**1. Enter bash by container ID** +```sh +docker ps # check the [CONTAINER ID] +docker exec -it [CONTAINER ID] /bin/bash +``` + +**2. 
Start directly with bash as the entrypoint** +```sh +docker run -it --entrypoint bash \ + -v "$(pwd)/data:/app/data" \ + evaluation-dashboard +``` diff --git a/evaluation_dashboard_app/pages/10_Help.py b/evaluation_dashboard_app/pages/10_Help.py index cfbf8bd..857b7b9 100644 --- a/evaluation_dashboard_app/pages/10_Help.py +++ b/evaluation_dashboard_app/pages/10_Help.py @@ -16,13 +16,17 @@ render_page_hero( kicker="Documentation", title="Help & guide", - description="In-app copy of the project README — setup, pages, and workflows for the evaluation dashboard.", + description="In-app copy of the project README with a simple Japanese / English switch.", mode="Single Run", ) # Streamlit markdown does not run Mermaid; split fenced ```mermaid blocks and render via Mermaid.js. MERMAID_FENCE = re.compile(r"```mermaid\s*\n([\s\S]*?)```", re.IGNORECASE) IMAGE_PATTERN = re.compile(r"!\[(.*?)\]\((.*?)\)") +README_FILES = { + "Japanese": Path("Readme.md"), + "English": Path("Readme.en.md"), +} def _render_markdown_with_images(chunk: str) -> None: @@ -43,8 +47,19 @@ def _render_markdown_with_images(chunk: str) -> None: break -readme_path = Path("Readme.md") -content = readme_path.read_text(encoding="utf-8") +language = st.radio( + "README language", + options=list(README_FILES.keys()), + horizontal=True, + label_visibility="collapsed", +) + +selected_readme_path = README_FILES[language] +if not selected_readme_path.exists(): + st.error(f"README file not found: {selected_readme_path}") + st.stop() + +content = selected_readme_path.read_text(encoding="utf-8") for idx, piece in enumerate(MERMAID_FENCE.split(content)): if idx % 2 == 0: From fc33b9a34aef164533ffc50590c6a1541cfb5012 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Wed, 22 Apr 2026 15:38:04 +0900 Subject: [PATCH 41/94] feat: add English README support in Docker configuration - Updated Dockerfile to include the English README file (Readme.en.md) for better documentation accessibility. 
- Modified docker-compose.yml to mount the English README file, ensuring it is available within the container. Signed-off-by: lei.gu --- evaluation_dashboard_app/Dockerfile | 2 ++ evaluation_dashboard_app/deploy/docker-compose.yml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/evaluation_dashboard_app/Dockerfile b/evaluation_dashboard_app/Dockerfile index ce43856..e76b5f7 100644 --- a/evaluation_dashboard_app/Dockerfile +++ b/evaluation_dashboard_app/Dockerfile @@ -71,6 +71,8 @@ RUN python3 -m pip install --no-cache-dir -r requirements-docker.txt # Copy application code and config COPY Overview.py . +COPY Readme.md . +COPY Readme.en.md . COPY pages/ pages/ COPY lib/ lib/ COPY worker/ worker/ diff --git a/evaluation_dashboard_app/deploy/docker-compose.yml b/evaluation_dashboard_app/deploy/docker-compose.yml index 59c7f8d..b55f647 100644 --- a/evaluation_dashboard_app/deploy/docker-compose.yml +++ b/evaluation_dashboard_app/deploy/docker-compose.yml @@ -55,6 +55,7 @@ x-streamlit-app: &streamlit-app - ../Overview.py:/app/Overview.py - ../pages:/app/pages - ../Readme.md:/app/Readme.md + - ../Readme.en.md:/app/Readme.en.md - ../lib:/app/lib - ../worker:/app/worker - ../configs:/app/configs @@ -123,6 +124,7 @@ services: - ../Overview.py:/app/Overview.py - ../pages:/app/pages - ../Readme.md:/app/Readme.md + - ../Readme.en.md:/app/Readme.en.md - ../lib:/app/lib - ../worker:/app/worker - ../configs:/app/configs From edeed0b0dc4c9c665995b9674662df864800c961 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Thu, 30 Apr 2026 10:07:34 +0900 Subject: [PATCH 42/94] feat: implement combined download, evaluation, and parquet generation workflow - Added a new task type "download_and_eval" to streamline the process of downloading results, running evaluations, and generating parquet files in a single workflow. 
- Implemented the `run_download_and_eval` function to handle the combined operations, including error handling and progress reporting. - Updated the UI to include a button for initiating the combined workflow, enhancing user experience by allowing one-click execution. - Enhanced the task summary display to show detailed results for the combined operation, including download success, evaluation summary, and parquet generation status. Signed-off-by: lei.gu --- evaluation_dashboard_app/lib/db.py | 1 + evaluation_dashboard_app/lib/download_core.py | 135 +++++++++++ evaluation_dashboard_app/pages/6_Download.py | 224 ++++++++++++++++++ .../slides/evaluation_dashboard_intro_ja.md | 202 ++++++++++++++++ evaluation_dashboard_app/worker/tasks.py | 78 ++++++ 5 files changed, 640 insertions(+) create mode 100644 evaluation_dashboard_app/slides/evaluation_dashboard_intro_ja.md diff --git a/evaluation_dashboard_app/lib/db.py b/evaluation_dashboard_app/lib/db.py index 7110d13..930de51 100644 --- a/evaluation_dashboard_app/lib/db.py +++ b/evaluation_dashboard_app/lib/db.py @@ -29,6 +29,7 @@ def _task_log_timestamp_prefix() -> str: "run_eval_dirs", "generate_summary_csv", "build_parquet", + "download_and_eval", ) TASK_STATUSES = ("pending", "running", "completed", "failed") diff --git a/evaluation_dashboard_app/lib/download_core.py b/evaluation_dashboard_app/lib/download_core.py index 5dbfc64..1314321 100644 --- a/evaluation_dashboard_app/lib/download_core.py +++ b/evaluation_dashboard_app/lib/download_core.py @@ -530,3 +530,138 @@ def run_download_scenarios( organize_files_into_directories(out_dir) total_attempted = len(log_dicts) return (failure_count, total_attempted, rows) + + +def run_download_and_eval( + project_id: str, + job_id: str, + suite_id: Optional[str], + output_path: str, + download_type: str = "archives", + phase: str = "perception.object_recognition.tracking.objects", + *, + skip_large_file: bool = False, + large_file_mb: 
float = 50.0, + keep_zip_files: bool = False, + suite_ids: Optional[List[str]] = None, + run_eval: bool = True, + generate_parquet: bool = True, + eval_recursive: bool = True, + eval_overwrite: bool = False, + on_progress: Optional[Callable[[str], None]] = None, + on_warning: Optional[Callable[[str], None]] = None, +) -> Dict[str, Any]: + """ + Combined workflow: Download results, then optionally run eval and generate parquet. + + Returns dict with: + - download_success: bool + - download_summary: dict with success/fail counts + - eval_summary: dict with directories_processed, etc. (if run_eval=True) + - parquet_path: str (if generate_parquet=True) + """ + from lib import eval_summary + + # Try to import parquet generation + pkl_archive_to_parquet = None + try: + from lib.perception_catalog_io import pkl_archive_to_parquet as _p2p + pkl_archive_to_parquet = _p2p + except ImportError: + pass + + result: Dict[str, Any] = { + "download_success": False, + "download_summary": {}, + "eval_summary": {}, + "parquet_path": "", + "errors": [], + } + + # Step 1: Download + if on_progress: + on_progress("Starting download phase...") + + try: + failure_count, total_attempted, rows = run_download_results( + project_id=project_id, + job_id=job_id, + suite_id=suite_id, + output_path=output_path, + download_type=download_type, + phase=phase, + skip_large_file=skip_large_file, + large_file_mb=large_file_mb, + keep_zip_files=keep_zip_files, + suite_ids=suite_ids, + on_progress=on_progress, + on_warning=on_warning, + ) + success_count = total_attempted - failure_count + result["download_summary"] = { + "total": total_attempted, + "success": success_count, + "failed": failure_count, + "rows": rows, + } + + # Check if download was successful (at least some files downloaded) + result["download_success"] = success_count > 0 + if failure_count > 0 and success_count == 0: + result["errors"].append(f"Download failed: {failure_count} of {total_attempted} scenarios failed") + return result + 
if success_count == 0: + result["errors"].append("Download: No scenarios were successfully downloaded") + return result + + except Exception as e: + result["errors"].append(f"Download exception: {e}") + return result + + # Step 2: Run eval (if requested and download succeeded) + if run_eval and result["download_success"]: + if on_progress: + on_progress("Download complete. Starting eval phase...") + + try: + eval_root = output_path + target_dirs = eval_summary.find_eval_result_dirs(eval_root, recursive=eval_recursive) + if target_dirs: + total = len(target_dirs) + for i, result_dir in enumerate(target_dirs): + if on_progress: + on_progress(f"Eval: Processing {i+1}/{total}: {result_dir}") + eval_summary.run_eval_result_for_dir(result_dir, overwrite=eval_overwrite) + + # Generate summary CSVs + csv_info = eval_summary.generate_summary_and_score_csv(eval_root) + result["eval_summary"] = { + "directories_processed": total, + "summary_path": csv_info.get("summary_path", eval_root), + "summary_rows": csv_info.get("summary_rows", 0), + "score_rows": csv_info.get("score_rows", 0), + } + else: + if on_warning: + on_warning("No eval result directories found") + except Exception as e: + result["errors"].append(f"Eval exception: {e}") + + # Step 3: Generate parquet (if requested and download succeeded) + if generate_parquet and result["download_success"] and pkl_archive_to_parquet: + if on_progress: + on_progress("Generating parquet...") + + try: + parquet_path = pkl_archive_to_parquet( + output_path, + on_progress=None, + on_skip=None, + project_id=project_id, + job_id=job_id, + ) + result["parquet_path"] = parquet_path + except Exception as e: + result["errors"].append(f"Parquet exception: {e}") + + return result diff --git a/evaluation_dashboard_app/pages/6_Download.py b/evaluation_dashboard_app/pages/6_Download.py index 83bae3b..0f74f8b 100644 --- a/evaluation_dashboard_app/pages/6_Download.py +++ b/evaluation_dashboard_app/pages/6_Download.py @@ -783,6 +783,7 @@ def 
_task_type_label(task_type: str) -> str: "run_eval_dirs": "Run eval dirs", "generate_summary_csv": "Generate summary CSV", "build_parquet": "Build parquet", + "download_and_eval": "Download + Eval", } return labels.get(task_type, task_type or "Task") @@ -801,6 +802,14 @@ def _task_summary(t: Dict[str, Any]) -> str: return params.get("eval_root", "") if task_type == "build_parquet": return params.get("pkl_dir", "") + if task_type == "download_and_eval": + out = params.get("output_path") or params.get("job_id") or "" + parts = ["download"] + if params.get("run_eval"): + parts.append("eval") + if params.get("generate_parquet"): + parts.append("parquet") + return f"job_id={params.get('job_id', '')} [{'+'.join(parts)}] → {out}" return "" @@ -899,6 +908,40 @@ def _render_result_summary(summary: Dict[str, Any]) -> None: path = summary.get("output_path", "") st.subheader("Summary") st.write(f"- Output: `{path}`") + elif job == "download_and_eval": + dl_summary = summary.get("download_summary", {}) + eval_summary_data = summary.get("eval_summary", {}) + parquet_path = summary.get("parquet_path", "") + errors = summary.get("errors", []) + + st.subheader("Download + Eval + Parquet Summary") + + # Download summary + dl_success = summary.get("download_success", False) + if dl_success: + st.write("✅ **Download: SUCCESS**") + st.write(f" - Total: **{dl_summary.get('total', 0)}**, Success: **{dl_summary.get('success', 0)}**, Failed: **{dl_summary.get('failed', 0)}**") + else: + st.write("❌ **Download: FAILED**") + if errors: + for err in errors: + st.write(f" - {err}") + + # Eval summary + if eval_summary_data: + st.write("✅ **Eval: SUCCESS**") + st.write(f" - Directories processed: **{eval_summary_data.get('directories_processed', 0)}**") + st.write(f" - Summary.csv: **{eval_summary_data.get('summary_rows', 0)}** rows, Score.csv: **{eval_summary_data.get('score_rows', 0)}** rows") + + # Parquet summary + if parquet_path: + st.write(f"✅ **Parquet: SUCCESS** → `{parquet_path}`") + 
+ # Show errors + if errors: + st.error("Errors during execution:") + for err in errors: + st.write(f"- {err}") else: st.json(summary) @@ -1518,6 +1561,187 @@ def on_suite_id_change(): st.error(f"❌ Error: {str(e)}") st.exception(e) + # === Combined Download + Eval + Parquet Button === + st.divider() + st.subheader("🚀 Combined Workflow: Download + Eval + Parquet") + st.caption("Download results, run evaluation, and generate parquet in one click. Eval only runs if download succeeds.") + + # Options for combined workflow + col_combo1, col_combo2 = st.columns(2) + with col_combo1: + combined_run_eval = st.checkbox( + "Run evaluation (eval_result + Summary/Score CSV)", + value=True, + key="combined_run_eval", + help="Run eval_result on downloaded directories and generate Summary.csv/Score.csv" + ) + with col_combo2: + combined_generate_parquet = st.checkbox( + "Generate parquet", + value=CATALOG_IO_AVAILABLE, + key="combined_generate_parquet", + help="Build scene_result.parquet from .pkl files" if CATALOG_IO_AVAILABLE else "Install perception_catalog_analyzer to enable", + disabled=not CATALOG_IO_AVAILABLE, + ) + + combined_eval_recursive = st.checkbox( + "Search subdirectories for eval", + value=True, + key="combined_eval_recursive", + help="Recursively search for result directories" + ) + + if st.button("📥 Download + Eval + Parquet", type="primary", key="download_and_eval_btn"): + st.session_state.stop_downloads = False + if not all([project_id, st.session_state.job_id]): + st.error("Please fill in all required fields: Project ID and Job ID") + st.stop() + resolved_output, path_err = resolve_under_data_root(output_path, allow_create=True) + if path_err: + st.error(f"Output path is invalid: {path_err}. 
Use a path under the server data root.") + st.stop() + resolved_path_str = str(resolved_output) + set_config_value("output_path", to_data_relative(resolved_output)) + set_config_value("environment", environment) + set_config_value("project_id", project_id) + set_config_value("job_id", st.session_state.job_id) + set_config_value("suite_id", suite_id) + set_config_value("suite_ids", selected_suite_ids) + set_config_value("download_type", download_type) + if download_type == "Archives (ZIP)": + set_config_value("phase", phase) + set_config_value("skip_large_file", skip_large_file) + set_config_value("large_file_mb", large_file_mb) + set_config_value("keep_zip_files", keep_zip_files) + + if is_task_queue_enabled(): + # Enqueue combined task + params = { + "output_path": resolved_path_str, + "project_id": project_id, + "job_id": st.session_state.job_id, + "suite_id": suite_id or "", + "suite_ids": selected_suite_ids or None, + "download_type": "archives" if download_type == "Archives (ZIP)" else "result_json", + "phase": phase if download_type == "Archives (ZIP)" else "", + "skip_large_file": skip_large_file, + "large_file_mb": large_file_mb, + "keep_zip_files": keep_zip_files, + "run_eval": combined_run_eval, + "generate_parquet": combined_generate_parquet, + "eval_recursive": combined_eval_recursive, + "eval_overwrite": False, + } + task_id = _enqueue_task("download_and_eval", params) + if task_id: + st.success("Combined task queued. It will appear in the **Task status** section below; the list updates automatically.") + st.info("The task will: 1) Download results → 2) Run eval (if download succeeds) → 3) Generate parquet (if download succeeds)") + else: + st.error("Failed to enqueue task. 
Check REDIS_URL and DATABASE_URL.") + st.stop() + + # Inline execution (non-task-queue mode) + os.makedirs(resolved_path_str, exist_ok=True) + try: + job_result = JobResult( + environment=environment, + project_id=project_id, + job_id=st.session_state.job_id, + suite_id=suite_id, + suite_ids=selected_suite_ids, + output_path=resolved_path_str, + ) + + # Progress containers + progress_placeholder = st.empty() + status_placeholder = st.empty() + + def inline_progress(msg: str): + status_placeholder.info(msg) + + # Step 1: Download + progress_placeholder.info("📥 Step 1/3: Downloading results...") + download_successful = False + if download_type == "Archives (ZIP)": + with st.expander("Downloading Archives", expanded=True): + remain_list = job_result.download_archive_and_unzip( + phase, + skip_large_file=skip_large_file, + large_file_mb=large_file_mb, + keep_zip_files=keep_zip_files, + ) + download_successful = len(remain_list) > 0 + st.success(f"✅ Downloaded and extracted {len(remain_list)} archives") + else: + with st.expander("Downloading Result JSON", expanded=True): + log_dicts = job_result.download_result_json() + download_successful = len(log_dicts) > 0 + st.success(f"✅ Downloaded {len(log_dicts)} JSON files") + + if not download_successful: + st.error("❌ Download failed. 
Cannot continue with evaluation.") + st.stop() + + # Step 2: Run eval + if combined_run_eval: + progress_placeholder.info("🧮 Step 2/3: Running evaluation...") + target_dirs = find_eval_result_dirs(resolved_path_str, recursive=combined_eval_recursive) + if target_dirs: + eval_results = [] + eval_progress = st.progress(0) + for i, result_dir in enumerate(target_dirs): + eval_progress.progress((i + 1) / len(target_dirs), f"Evaluating {i+1}/{len(target_dirs)}") + eval_results.append(run_eval_result_for_dir(result_dir, overwrite=False)) + eval_progress.empty() + + success_eval = sum(1 for r in eval_results if r["status"] == "success") + failed_eval = sum(1 for r in eval_results if r["status"] == "failed") + st.success(f"✅ Eval complete: {success_eval} success, {failed_eval} failed") + + # Generate summary CSVs + with st.spinner("Generating Summary.csv and Score.csv..."): + csv_info = generate_summary_and_score_csv(resolved_path_str) + st.success(f"Generated Summary.csv ({csv_info['summary_rows']} rows) and Score.csv ({csv_info['score_rows']} rows)") + else: + st.warning("⚠️ No eval result directories found") + + # Step 3: Generate parquet + if combined_generate_parquet and CATALOG_IO_AVAILABLE: + progress_placeholder.info("📦 Step 3/3: Generating parquet...") + pkl_dir = Path(resolved_path_str) + all_pkl_files = list(pkl_dir.rglob("*.pkl")) + list(pkl_dir.rglob("*.pkl.z")) + pkl_count = len(all_pkl_files) + if pkl_count > 0: + with st.spinner(f"Processing {pkl_count} pkl files..."): + parquet_path = pkl_archive_to_parquet( + pkl_dir, + on_progress=None, + on_skip=None, + project_id=project_id, + job_id=st.session_state.job_id, + ) + st.success(f"✅ Parquet generated: {parquet_path}") + else: + st.warning("⚠️ No .pkl files found for parquet generation") + + progress_placeholder.empty() + status_placeholder.empty() + st.success("🎉 Combined workflow complete!") + + # Show file tree + with st.expander("📁 File Structure"): + for root, dirs, files in os.walk(resolved_path_str): 
+ level = root.replace(resolved_path_str, '').count(os.sep) + indent = ' ' * 4 * level + st.text(f"{indent}{os.path.basename(root)}/") + subindent = ' ' * 4 * (level + 1) + for file in files: + st.text(f"{subindent}{file}") + + except Exception as e: + st.error(f"❌ Error: {str(e)}") + st.exception(e) + # Information section with st.expander("ℹ️ How to use"): st.markdown(""" diff --git a/evaluation_dashboard_app/slides/evaluation_dashboard_intro_ja.md b/evaluation_dashboard_app/slides/evaluation_dashboard_intro_ja.md new file mode 100644 index 0000000..a9e3ea5 --- /dev/null +++ b/evaluation_dashboard_app/slides/evaluation_dashboard_intro_ja.md @@ -0,0 +1,202 @@ +--- +marp: true +theme: default +paginate: true +size: 16:9 +--- + +# 評価業務を回す統合ダッシュボード + +### Perception Evaluation Dashboard 紹介 + +- 発表者: (お名前) +- 日付: 2026-04-22 + +--- + +## 今日お話しすること + +1. 背景: 何が課題だったか +2. システム全体像と基本導線 +3. 主要機能の紹介 +4. 技術構成(運用アーキテクチャ) +5. 価値と今後の展開 + +--- + +## 背景: 何が困っていたのか + +- 評価結果の取得、整形、可視化、比較、共有が分断 +- ツール間移動や手作業が多く、時間がかかる +- 比較条件が揃わず、議論が噛み合わない +- 属人化しやすく、再現性が下がる + +--- + +## このシステムの狙い + +**評価業務の一連の流れを一つの場所で回すこと** + +- 取得 +- 整形(Summary / Score / parquet) +- 分析(単体・比較) +- 共有 +- データ管理 + +> 「見るための道具」ではなく「評価業務の基盤」 + +--- + +## システム全体像 + +- Download: 結果・シナリオの取得 +- Eval Results: CSV/評価データ生成 +- Overview: Run/比較条件の統一 +- 各分析ページ: 観点別の深掘り +- Data Management: 共有・整理・運用 + +--- + +## 典型的なユーザー導線(3ステップ) + +1. Downloadで対象結果を取得 +2. Summary.csv / Score.csv を生成 +3. 
OverviewでRun選択 → 分析ページへ + +**効果:** 作業切替が減り、初心者でも入りやすい + +--- + +## Overviewの役割(ハブ) + +- Single / Compare モード切替 +- Baseline / Candidate の比較前提を統一 +- Perception Label / Product Label で共通フィルタ +- 共有URLで表示状態を再現 + +**ポイント:** 前提を揃えて議論のズレを防ぐ + +--- + +## Downloadの価値 + +- ダウンロード前後の作業を一体化 +- 取得だけでなく、後続分析で使える形まで整備 +- 重い処理はタスク化し、進捗を可視化 + +**運用効果:** 属人化の低減、日常業務の安定化 + +--- + +## TP Summary / Criteria Based Score + +### TP Summary +- Summary.csvベースの全体傾向把握 +- 平均だけでなく分布や外れ値を確認 + +### Criteria Based Score +- Score.csvベースの基準別評価 +- しきい値(ゲート)を使った合否判断に有効 + +--- + +## Detection Stats + +- parquet + DuckDBで詳細分析 +- TP/FPなどの状態別・距離ビン別比較 +- 全体値では見えない偏りを発見 + +**使いどころ:** 「差がある」から「どこで差がある」へ + +--- + +## Bounding Box Viewer + +- BEV上でバウンディングボックスを可視化 +- topic / label / visibility で絞り込み +- Compareで挙動差を視覚的に確認 + +**意義:** 数値の裏にある実体を理解する + +--- + +## TLR Analysis + +- 信号認識評価に特化 +- criteriaマトリクス、車両状態×信号種別、zone分析 +- 比較時は差分ヒートマップで把握 + +**強み:** ドメイン特化で弱点を構造的に把握 + +--- + +## Prediction Evaluation + +- minADE / minFDEなどを距離・方向・ラベルで分解 +- リング表示などで偏りを直感的に把握 + +**意義:** 全体平均だけでなく改善対象を特定できる + +--- + +## Data Management + +- Run一覧(サイズ/更新日時/成果物有無)を可視化 +- ZIP化、共有リンク生成、不要Run削除 + +**実運用で重要:** 分析品質を落とさない整理整頓 + +--- + +## Help / Debugページ + +- Help: アプリ内で使い方を参照 +- Parquet Debug: スキーマやデータ切り分け +- Deployment Debug: Postgres / Redis / Worker状態確認 + +**設計思想:** 使う人だけでなく支える人の導線も用意 + +--- + +## 技術構成(フロント〜バック) + +- フロント: Streamlit +- 非同期: Redis + Worker +- 状態管理: Postgres +- 配置: Nginx配下に複数Streamlit + Worker群 + +**狙い:** UIを軽く保ち、重い処理はバックグラウンド化 + +--- + +## このアーキテクチャのメリット + +- 重い処理で画面が固まりにくい +- 複数人運用でも影響を分離しやすい +- 障害切り分けがしやすい +- スケールしやすい(UIと処理を分離) + +--- + +## まとめ + +このシステムは、 + +- 取得 +- 整形 +- 分析 +- 共有 +- 運用 + +を一体化した**評価業務の基盤**です。 + +> **評価を見える化するだけでなく、回せるようにするシステム** + +--- + +## Q&A + +- ご質問はこのセクションへご記入ください +- 時間内に回答しきれない場合は後ほどフォローします + +ありがとうございました。 diff --git a/evaluation_dashboard_app/worker/tasks.py b/evaluation_dashboard_app/worker/tasks.py index a27cb80..2ada23f 100644 --- a/evaluation_dashboard_app/worker/tasks.py +++ 
b/evaluation_dashboard_app/worker/tasks.py @@ -270,6 +270,83 @@ def job_download_scenarios(task_id: str, parameters: Dict[str, Any]) -> None: raise +def job_download_and_eval(task_id: str, parameters: Dict[str, Any]) -> None: + """Download results, then run eval and parquet generation. Stops on download failure.""" + update_task_status(task_id, "running") + append_task_log(task_id, "Starting download_and_eval combined workflow") + try: + from lib import download_core + output_path = parameters.get("output_path") + project_id = parameters.get("project_id") + job_id = parameters.get("job_id") + suite_id = parameters.get("suite_id") + suite_ids = parameters.get("suite_ids") + download_type = parameters.get("download_type", "archives") + phase = parameters.get("phase", "perception.object_recognition.tracking.objects") + skip_large_file = parameters.get("skip_large_file", False) + large_file_mb = float(parameters.get("large_file_mb", 50.0)) + keep_zip_files = parameters.get("keep_zip_files", False) + run_eval = parameters.get("run_eval", True) + generate_parquet = parameters.get("generate_parquet", True) + eval_recursive = parameters.get("eval_recursive", True) + eval_overwrite = parameters.get("eval_overwrite", False) + + if not all([output_path, project_id, job_id]): + update_task_status(task_id, "failed", error_message="Missing output_path, project_id, or job_id") + return + + on_progress = lambda msg: _progress_callback(task_id, msg) + on_warning = lambda msg: append_task_log(task_id, msg) + + result = download_core.run_download_and_eval( + project_id=project_id, + job_id=job_id, + suite_id=suite_id, + output_path=output_path, + download_type=download_type, + phase=phase, + skip_large_file=skip_large_file, + large_file_mb=large_file_mb, + keep_zip_files=keep_zip_files, + suite_ids=suite_ids, + run_eval=run_eval, + generate_parquet=generate_parquet, + eval_recursive=eval_recursive, + eval_overwrite=eval_overwrite, + on_progress=on_progress, + on_warning=on_warning, + 
) + + # Build result summary + summary = { + "job": "download_and_eval", + "download_success": result.get("download_success", False), + "download_summary": result.get("download_summary", {}), + "eval_summary": result.get("eval_summary", {}), + "parquet_path": result.get("parquet_path", ""), + "errors": result.get("errors", []), + } + update_task_result_summary(task_id, summary) + + if not result.get("download_success"): + err_msg = result.get("errors", ["Download failed"])[0] + append_task_log(task_id, f"Stopped: {err_msg}") + update_task_status(task_id, "failed", result_path=output_path, error_message=err_msg) + elif result.get("errors"): + # Partial success with some errors + errs = "; ".join(result["errors"][:5]) + append_task_log(task_id, f"Completed with errors: {errs}") + update_task_status(task_id, "completed", result_path=output_path) + else: + append_task_log(task_id, "Download and eval completed successfully") + update_task_status(task_id, "completed", result_path=output_path) + + except Exception as e: + append_task_log(task_id, f"Failed: {e}") + update_task_status(task_id, "failed", error_message=str(e)) + raise + + # Map task_type (from Postgres) to job function TASK_JOB_MAP = { "generate_summary_csv": job_generate_summary_csv, @@ -277,6 +354,7 @@ def job_download_scenarios(task_id: str, parameters: Dict[str, Any]) -> None: "build_parquet": job_build_parquet, "download_results": job_download_results, "download_scenarios": job_download_scenarios, + "download_and_eval": job_download_and_eval, } From 6070bb702f5786c05ba12a2edb5b7c10c3151b2e Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Thu, 30 Apr 2026 10:49:34 +0900 Subject: [PATCH 43/94] fix: update .dockerignore to include README files - Modified .dockerignore to ensure Readme.md and Readme.en.md are not ignored, allowing them to be included in the Docker build context for better documentation access. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/.dockerignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/evaluation_dashboard_app/.dockerignore b/evaluation_dashboard_app/.dockerignore index 08992ee..ea95081 100644 --- a/evaluation_dashboard_app/.dockerignore +++ b/evaluation_dashboard_app/.dockerignore @@ -3,6 +3,8 @@ __pycache__ .git .gitignore *.md +!Readme.md +!Readme.en.md .env .venv venv From 2efd229aea5609fbe9bb124b36398c4286d41143 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Thu, 30 Apr 2026 14:51:19 +0900 Subject: [PATCH 44/94] feat: enhance evaluation result handling and subprocess execution - Updated the `run_download_and_eval` function to track evaluation statuses, including success, failure, and skipped results, improving error reporting. - Refactored `run_eval_result_for_dir` to support both inline and subprocess execution, allowing for better handling of native crashes. - Introduced detailed logging for subprocess failures, including stdout and stderr outputs, enhancing debugging capabilities. - Modified the worker tasks to aggregate evaluation results and log failures, providing clearer insights into processing outcomes. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/lib/download_core.py | 18 ++- evaluation_dashboard_app/lib/eval_summary.py | 105 +++++++++++++++++- evaluation_dashboard_app/worker/run_worker.py | 2 + evaluation_dashboard_app/worker/tasks.py | 12 +- 4 files changed, 133 insertions(+), 4 deletions(-) diff --git a/evaluation_dashboard_app/lib/download_core.py b/evaluation_dashboard_app/lib/download_core.py index 1314321..9ed5cd8 100644 --- a/evaluation_dashboard_app/lib/download_core.py +++ b/evaluation_dashboard_app/lib/download_core.py @@ -628,19 +628,35 @@ def run_download_and_eval( target_dirs = eval_summary.find_eval_result_dirs(eval_root, recursive=eval_recursive) if target_dirs: total = len(target_dirs) + eval_statuses: List[Dict[str, Any]] = [] for i, result_dir in enumerate(target_dirs): if on_progress: on_progress(f"Eval: Processing {i+1}/{total}: {result_dir}") - eval_summary.run_eval_result_for_dir(result_dir, overwrite=eval_overwrite) + status = eval_summary.run_eval_result_for_dir(result_dir, overwrite=eval_overwrite) + eval_statuses.append(status) + if status.get("status") == "failed" and on_warning: + on_warning(f"Eval failed for {result_dir}: {status.get('detail', '')}") # Generate summary CSVs csv_info = eval_summary.generate_summary_and_score_csv(eval_root) + failed = [s for s in eval_statuses if s.get("status") == "failed"] + skipped = [s for s in eval_statuses if s.get("status") == "skipped"] + succeeded = [s for s in eval_statuses if s.get("status") == "success"] result["eval_summary"] = { "directories_processed": total, + "success": len(succeeded), + "failed": len(failed), + "skipped": len(skipped), "summary_path": csv_info.get("summary_path", eval_root), "summary_rows": csv_info.get("summary_rows", 0), "score_rows": csv_info.get("score_rows", 0), } + if failed: + first = failed[0] + result["errors"].append( + f"Eval failed for {len(failed)} of {total} directories; " + f"first: {first.get('path', '')} 
({first.get('detail', '')})" + ) else: if on_warning: on_warning("No eval result directories found") diff --git a/evaluation_dashboard_app/lib/eval_summary.py b/evaluation_dashboard_app/lib/eval_summary.py index 4080003..ee89af7 100644 --- a/evaluation_dashboard_app/lib/eval_summary.py +++ b/evaluation_dashboard_app/lib/eval_summary.py @@ -5,6 +5,9 @@ import glob import json import os +import signal +import subprocess +import sys from pathlib import Path from typing import Any, Dict, List @@ -28,8 +31,8 @@ def find_eval_result_dirs(root_dir: str, recursive: bool = True) -> List[str]: return sorted(result_dirs) -def run_eval_result_for_dir(result_dir: str, overwrite: bool = False) -> Dict[str, Any]: - """Run eval_result and generate score.json for one directory. Returns status dict.""" +def _run_eval_result_for_dir_inline(result_dir: str, overwrite: bool = False) -> Dict[str, Any]: + """Run eval_result in the current process and generate score.json for one directory.""" result_file = os.path.join(result_dir, "result.txt") score_file = os.path.join(result_dir, "score.json") if os.path.exists(result_file) and not overwrite: @@ -59,6 +62,89 @@ def run_eval_result_for_dir(result_dir: str, overwrite: bool = False) -> Dict[st return {"path": result_dir, "status": "failed", "detail": str(e)} +def _signal_detail(returncode: int) -> str: + """Return a human-readable detail string for a subprocess return code.""" + if returncode < 0: + sig_num = -returncode + elif returncode > 128: + sig_num = returncode - 128 + else: + return f"exit code {returncode}" + try: + sig_name = signal.Signals(sig_num).name + except ValueError: + sig_name = f"signal {sig_num}" + return f"{sig_name} ({sig_num})" + + +def _write_eval_subprocess_failure( + result_dir: str, + message: str, + stdout: str = "", + stderr: str = "", +) -> None: + """Persist native-crash details where the UI and user can inspect them.""" + result_path = Path(result_dir) / "result.txt" + log_path = Path(result_dir) / 
"eval_subprocess.log" + detail = f"Error: {message}\n" + with open(result_path, "w", encoding="utf-8") as f: + f.write(detail) + with open(log_path, "w", encoding="utf-8") as f: + f.write(detail) + if stdout: + f.write("\n--- stdout ---\n") + f.write(stdout) + if stderr: + f.write("\n--- stderr ---\n") + f.write(stderr) + + +def _run_eval_result_for_dir_subprocess(result_dir: str, overwrite: bool = False) -> Dict[str, Any]: + """Run one scenario eval in a child Python process so native crashes are contained.""" + env = os.environ.copy() + env.setdefault("PYTHONFAULTHANDLER", "1") + cmd = [ + sys.executable, + "-m", + "lib.eval_summary", + "__run_eval_dir", + result_dir, + "1" if overwrite else "0", + ] + completed = subprocess.run( + cmd, + cwd=os.fspath(Path(__file__).resolve().parents[1]), + env=env, + text=True, + capture_output=True, + ) + if completed.returncode == 0: + for line in reversed(completed.stdout.splitlines()): + if line.startswith("__EVAL_RESULT_JSON__"): + try: + return json.loads(line.removeprefix("__EVAL_RESULT_JSON__")) + except json.JSONDecodeError: + break + return {"path": result_dir, "status": "success", "detail": "completed"} + + detail = f"eval subprocess failed with {_signal_detail(completed.returncode)}" + _write_eval_subprocess_failure( + result_dir, + detail, + stdout=completed.stdout, + stderr=completed.stderr, + ) + return {"path": result_dir, "status": "failed", "detail": detail} + + +def run_eval_result_for_dir(result_dir: str, overwrite: bool = False) -> Dict[str, Any]: + """Run eval_result and generate score.json for one directory. 
Returns status dict.""" + isolated = os.environ.get("EVAL_RUN_ISOLATED_SUBPROCESS", "1").lower() + if isolated in ("0", "false", "no"): + return _run_eval_result_for_dir_inline(result_dir, overwrite=overwrite) + return _run_eval_result_for_dir_subprocess(result_dir, overwrite=overwrite) + + def generate_summary_and_score_csv(input_path: str) -> Dict[str, Any]: """ Generate Summary.csv and Score.csv in input_path from each subdirectory's result.txt and score.json. @@ -202,3 +288,18 @@ def _infer_suite_name(dir_name: str) -> str: "summary_rows": len(summary_lines), "score_rows": len(score_lines), } + + +def _main() -> int: + if len(sys.argv) >= 2 and sys.argv[1] == "__run_eval_dir": + result_dir = sys.argv[2] + overwrite = len(sys.argv) >= 4 and sys.argv[3] == "1" + result = _run_eval_result_for_dir_inline(result_dir, overwrite=overwrite) + print("__EVAL_RESULT_JSON__" + json.dumps(result, ensure_ascii=False)) + return 0 + print("Usage: python -m lib.eval_summary __run_eval_dir ", file=sys.stderr) + return 2 + + +if __name__ == "__main__": + raise SystemExit(_main()) diff --git a/evaluation_dashboard_app/worker/run_worker.py b/evaluation_dashboard_app/worker/run_worker.py index 4a4b8d2..f98a2fd 100644 --- a/evaluation_dashboard_app/worker/run_worker.py +++ b/evaluation_dashboard_app/worker/run_worker.py @@ -6,11 +6,13 @@ import os import sys +import faulthandler _APP_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) if _APP_ROOT not in sys.path: sys.path.insert(0, _APP_ROOT) os.chdir(_APP_ROOT) +faulthandler.enable(all_threads=True) def main(): from rq import Worker diff --git a/evaluation_dashboard_app/worker/tasks.py b/evaluation_dashboard_app/worker/tasks.py index 2ada23f..8fbef57 100644 --- a/evaluation_dashboard_app/worker/tasks.py +++ b/evaluation_dashboard_app/worker/tasks.py @@ -76,17 +76,27 @@ def job_run_eval_dirs(task_id: str, parameters: Dict[str, Any]) -> None: return total = len(target_dirs) append_task_log(task_id, f"Processing {total} 
directories") + statuses = [] for i, result_dir in enumerate(target_dirs): pct = 100.0 * (i + 1) / total if total else 0 update_task_progress(task_id, message=f"Processing {i+1}/{total}: {result_dir}", pct=pct) append_task_log(task_id, f"Processing {i+1}/{total}: {result_dir}") - eval_summary.run_eval_result_for_dir(result_dir, overwrite=overwrite) + status = eval_summary.run_eval_result_for_dir(result_dir, overwrite=overwrite) + statuses.append(status) + if status.get("status") == "failed": + append_task_log(task_id, f"Eval failed for {result_dir}: {status.get('detail', '')}") append_task_log(task_id, "Generating summary CSV") info = eval_summary.generate_summary_and_score_csv(eval_root) result_path = info.get("summary_path", eval_root) + failed = [s for s in statuses if s.get("status") == "failed"] + skipped = [s for s in statuses if s.get("status") == "skipped"] + succeeded = [s for s in statuses if s.get("status") == "success"] summary = { "job": "run_eval_dirs", "directories_processed": total, + "success": len(succeeded), + "failed": len(failed), + "skipped": len(skipped), "summary_path": result_path, "summary_rows": info.get("summary_rows", 0), "score_rows": info.get("score_rows", 0), From ec27fa6584b558f44745924617cb74795f0b4ca2 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Thu, 30 Apr 2026 15:29:56 +0900 Subject: [PATCH 45/94] refactor: streamline error handling and progress reporting in data processing - Removed several unused helper functions related to sanitizing dynamic objects and results, simplifying the codebase. - Enhanced the `build_scene_dataframe_from_pkl_dir` function to include improved error handling and progress reporting through callback functions for skipped files. - Introduced a mechanism to track and report skipped files with reasons, improving user feedback during data processing. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../lib/perception_catalog_io.py | 137 ++++-------------- .../lib/specsheet_report.py | 15 +- 2 files changed, 42 insertions(+), 110 deletions(-) diff --git a/evaluation_dashboard_app/lib/perception_catalog_io.py b/evaluation_dashboard_app/lib/perception_catalog_io.py index 2809cc0..3a59504 100644 --- a/evaluation_dashboard_app/lib/perception_catalog_io.py +++ b/evaluation_dashboard_app/lib/perception_catalog_io.py @@ -256,105 +256,6 @@ def _normalize_loaded_pkl( return data -def _enum_like(value: Any) -> Any: - """Return an enum-like object with a `.value` attribute when the input is missing.""" - return value if value is not None else SimpleNamespace(value=None) - - -def _label_like(value: Any = "UNKNOWN") -> Any: - """Return a label-like object that matches analyzer expectations.""" - return SimpleNamespace(value=value) - - -def _sanitize_dynamic_object(dynamic_object: Any) -> int: - """Repair common missing fields on dynamic objects used by scene2df().""" - repairs = 0 - if dynamic_object is None: - return repairs - - try: - from perception_eval.common.schema import FrameID - except ImportError: - FrameID = None - - if getattr(dynamic_object, "frame_id", None) is None and FrameID is not None: - dynamic_object.frame_id = FrameID.BASE_LINK - repairs += 1 - - state = getattr(dynamic_object, "state", None) - if state is not None and getattr(state, "shape_type", None) is None: - state.shape_type = _label_like("UNKNOWN") - repairs += 1 - - semantic_label = getattr(dynamic_object, "semantic_label", None) - if semantic_label is None: - dynamic_object.semantic_label = SimpleNamespace(label=_label_like("UNKNOWN")) - repairs += 1 - elif getattr(semantic_label, "label", None) is None: - semantic_label.label = _label_like("UNKNOWN") - repairs += 1 - - return repairs - - -def _sanitize_object_result(result: Any) -> int: - """Repair object-result fields that the analyzer expects to expose `.value`.""" - 
repairs = 0 - if result is None: - return repairs - - if getattr(result, "center_distance", None) is None: - result.center_distance = _enum_like(None) - repairs += 1 - if getattr(result, "plane_distance", None) is None: - result.plane_distance = _enum_like(None) - repairs += 1 - - repairs += _sanitize_dynamic_object(getattr(result, "estimated_object", None)) - repairs += _sanitize_dynamic_object(getattr(result, "ground_truth_object", None)) - return repairs - - -def _sanitize_pass_fail_result(pass_fail_result: Any) -> int: - """Repair pass/fail result containers before handing them to scene2df().""" - repairs = 0 - if pass_fail_result is None: - return repairs - - for attr in ("tp_object_results", "fp_object_results"): - for result in getattr(pass_fail_result, attr, []) or []: - repairs += _sanitize_object_result(result) - - for obj in getattr(pass_fail_result, "fn_objects", []) or []: - repairs += _sanitize_dynamic_object(obj) - - return repairs - - -def _sanitize_loaded_pkl(data: Any) -> int: - """Walk normalized PKL data and repair common missing fields for analyzer compatibility.""" - repairs = 0 - - scenarios = data - if not isinstance(scenarios, Iterable) or hasattr(scenarios, "frame_results"): - scenarios = [scenarios] - - for scenario in scenarios: - frame_results_dict = getattr(scenario, "frame_results", None) - if isinstance(frame_results_dict, dict): - frame_lists = frame_results_dict.values() - elif isinstance(scenario, list): - frame_lists = [scenario] - else: - frame_lists = [[scenario]] - - for frame_list in frame_lists: - for frame in frame_list or []: - repairs += _sanitize_pass_fail_result(getattr(frame, "pass_fail_result", None)) - - return repairs - - def _require_analyzer() -> None: if not _ANALYZER_AVAILABLE: raise ImportError( @@ -436,15 +337,27 @@ def build_scene_dataframe_from_pkl_dir( total = len(pkl_files) df = SceneDataFrame(current=pd.DataFrame()) + + def _report_progress(done: int) -> None: + if on_progress: + on_progress(done, total) 
+ for i, pkl_file in enumerate(pkl_files): - if str(pkl_file).lower().endswith(".pkl.z"): - try: - data = joblib.load(pkl_file) - except NameError: - raise ImportError("joblib is required for .pkl.z: pip install joblib") - else: - with open(pkl_file, "rb") as f: - data = pickle.load(f) + try: + if str(pkl_file).lower().endswith(".pkl.z"): + try: + data = joblib.load(pkl_file) + except NameError: + raise ImportError("joblib is required for .pkl.z: pip install joblib") + else: + with open(pkl_file, "rb") as f: + data = pickle.load(f) + except Exception as e: + if on_skip: + on_skip(pkl_file, f"failed to load: {e}") + _report_progress(i + 1) + continue + raise data = _normalize_loaded_pkl( data, pkl_file=pkl_file, @@ -456,6 +369,7 @@ def build_scene_dataframe_from_pkl_dir( except Exception as e: if on_skip: on_skip(pkl_file, f"failed to convert: {e}") + _report_progress(i + 1) continue raise del data @@ -463,17 +377,22 @@ def build_scene_dataframe_from_pkl_dir( if skip_empty: if on_skip: on_skip(pkl_file, "empty") + del df_ + gc.collect() + _report_progress(i + 1) continue if skip_bad_dtype and hasattr(df_, "current") and "x_error" in getattr(df_.current, "columns", []): if df_.current["x_error"].dtype != "float64": if on_skip: on_skip(pkl_file, f"bad dtype x_error={df_.current['x_error'].dtype}") + del df_ + gc.collect() + _report_progress(i + 1) continue df = df.concatenate(df_) del df_ gc.collect() - if on_progress: - on_progress(i + 1, total) + _report_progress(i + 1) return df diff --git a/evaluation_dashboard_app/lib/specsheet_report.py b/evaluation_dashboard_app/lib/specsheet_report.py index 65ff686..d940916 100644 --- a/evaluation_dashboard_app/lib/specsheet_report.py +++ b/evaluation_dashboard_app/lib/specsheet_report.py @@ -172,11 +172,24 @@ def ensure_specsheet_csvs( _copy_parquet_to_csv(fallback, current_csv) else: _notify(progress_callback, "No CSV found. 
Building CSV from pkl / pkl.z files") + skip_counts: dict[str, int] = {} def _on_progress(done: int, total: int) -> None: _notify(progress_callback, f"Processing pkl files {done}/{total}") - df = build_scene_dataframe_from_pkl_dir(run_dir, on_progress=_on_progress) + def _on_skip(path: str | Path, reason: str) -> None: + skip_counts[reason] = skip_counts.get(reason, 0) + 1 + + df = build_scene_dataframe_from_pkl_dir( + run_dir, + on_progress=_on_progress, + on_skip=_on_skip, + ) + if skip_counts: + details = ", ".join( + f"{count} {reason}" for reason, count in sorted(skip_counts.items()) + ) + _notify(progress_callback, f"Skipped pkl files: {details}") df.to_csv(run_dir) if not current_csv.exists(): raise FileNotFoundError(f"Failed to generate {current_csv}") From 3c2c7735f0934dd51e4236bc86779d4f7e7abb05 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Thu, 30 Apr 2026 16:31:07 +0900 Subject: [PATCH 46/94] feat: add compatibility layer for template updates in specsheet report - Introduced a new function `_update_template_compat` to handle variations in the `update_template` function's signature across different analyzer versions. - Enhanced the `generate_specsheet_pdf` function to utilize the new compatibility layer, ensuring seamless integration of template directory handling. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../lib/specsheet_report.py | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/evaluation_dashboard_app/lib/specsheet_report.py b/evaluation_dashboard_app/lib/specsheet_report.py index d940916..a0f00f7 100644 --- a/evaluation_dashboard_app/lib/specsheet_report.py +++ b/evaluation_dashboard_app/lib/specsheet_report.py @@ -1,6 +1,7 @@ from __future__ import annotations from contextlib import contextmanager +import inspect import re from pathlib import Path from typing import Callable, Iterable, Sequence @@ -151,6 +152,28 @@ def _prefer_cjk_font_stack(html_lines: Sequence[str]) -> list[str]: return [line.replace(generic, preferred) for line in rendered] +def _update_template_compat( + update_template_func: Callable[..., Sequence[str]], + project_id: str, + version: str, + *, + template_dir: Path, +) -> Sequence[str]: + """Call update_template across analyzer versions with different signatures.""" + try: + parameters = inspect.signature(update_template_func).parameters + except (TypeError, ValueError): + parameters = {} + + supports_template_dir = ( + "template_dir" in parameters + or any(param.kind == inspect.Parameter.VAR_KEYWORD for param in parameters.values()) + ) + if supports_template_dir: + return update_template_func(project_id, version, template_dir=str(template_dir)) + return update_template_func(project_id, version) + + def ensure_specsheet_csvs( run_dir: str | Path, *, @@ -260,7 +283,12 @@ def generate_specsheet_pdf( _notify(progress_callback, "Rendering PDF") template_dir = Path(template_module.__file__).resolve().parent.parent / "template" html = _prefer_cjk_font_stack( - update_template(project_id, version, template_dir=str(template_dir)) + _update_template_compat( + update_template, + project_id, + version, + template_dir=template_dir, + ) ) specsheet( html=html, From e45502f6b121eaac50be13487a6e81a6c436b271 Mon Sep 17 00:00:00 2001 From: "lei.gu" 
Date: Fri, 1 May 2026 13:54:36 +0900 Subject: [PATCH 47/94] fix: enhance figure generation logic in detection perception diff - Updated the `_build_detection_perception_diff_figures` function to ensure figures are only generated if the relevant dataframes contain the necessary columns. - Added checks for empty dataframes and the presence of the "n" column before appending figures to the results list, preventing potential errors and improving robustness. - Modified the `_comparison_lens_treemap_figure` function to return None for empty dataframes or those lacking the "n" column, enhancing error handling. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../lib/overview_pdf_report.py | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/evaluation_dashboard_app/lib/overview_pdf_report.py b/evaluation_dashboard_app/lib/overview_pdf_report.py index ce4cf2c..0332ee4 100644 --- a/evaluation_dashboard_app/lib/overview_pdf_report.py +++ b/evaluation_dashboard_app/lib/overview_pdf_report.py @@ -1355,7 +1355,7 @@ def _build_detection_perception_diff_figures( continue h_imp = _baobab_hierarchy_from_objects(df_obj, "improved", f"Improved ({lbl} vs A)", 15, 10) h_deg = _baobab_hierarchy_from_objects(df_obj, "degraded", f"Degraded ({lbl} vs A)", 15, 10) - if not h_imp.empty: + if not h_imp.empty and "n" in h_imp.columns: fig_imp = px.sunburst( h_imp, path=["root", "scen_g", "fr_display", "label"], @@ -1366,7 +1366,7 @@ def _build_detection_perception_diff_figures( ) _apply_detection_theme(fig_imp, f"Sunburst: improved ({lbl} vs A)") figures.append((fig_imp, f"Perception diff sunburst for improved objects: {lbl} vs baseline A.")) - if not h_deg.empty: + if not h_deg.empty and "n" in h_deg.columns: fig_deg = px.sunburst( h_deg, path=["root", "scen_g", "fr_display", "label"], @@ -1387,7 +1387,9 @@ def _build_detection_perception_diff_figures( df_by_label["degraded_cnt"], root_lens, ) - 
figures.append((_comparison_lens_treemap_figure(tdf_l, "By class"), f"Perception diff comparison lens by class: {lbl} vs baseline A.")) + fig_l = _comparison_lens_treemap_figure(tdf_l, "By class") + if fig_l is not None: + figures.append((fig_l, f"Perception diff comparison lens by class: {lbl} vs baseline A.")) if not scen_agg.empty: tdf_s = _comparison_lens_treemap_df( scen_agg["scenario_name"].astype(str), @@ -1395,7 +1397,9 @@ def _build_detection_perception_diff_figures( scen_agg["degraded_cnt"], root_lens, ) - figures.append((_comparison_lens_treemap_figure(tdf_s, "By scenario"), f"Perception diff comparison lens by scenario: {lbl} vs baseline A.")) + fig_s = _comparison_lens_treemap_figure(tdf_s, "By scenario") + if fig_s is not None: + figures.append((fig_s, f"Perception diff comparison lens by scenario: {lbl} vs baseline A.")) if not df_frame_sorted.empty: fr_cap = 36 fr_top = df_frame_sorted.head(fr_cap).copy() @@ -1411,7 +1415,9 @@ def _build_detection_perception_diff_figures( ims.append(io) dgs.append(do) tdf_f = _comparison_lens_treemap_df(pd.Series(nms), pd.Series(ims), pd.Series(dgs), root_lens) - figures.append((_comparison_lens_treemap_figure(tdf_f, "By frame"), f"Perception diff comparison lens by frame: {lbl} vs baseline A.")) + fig_f = _comparison_lens_treemap_figure(tdf_f, "By frame") + if fig_f is not None: + figures.append((fig_f, f"Perception diff comparison lens by frame: {lbl} vs baseline A.")) return figures @@ -1651,10 +1657,14 @@ def _comparison_lens_treemap_df(names: pd.Series, improved: pd.Series, degraded: rows.append({"root": root_label, "side": "Improved", "item": name, "n": float(imp)}) if deg > 0: rows.append({"root": root_label, "side": "Degraded", "item": name, "n": float(deg)}) + if not rows: + return pd.DataFrame(columns=["root", "side", "item", "n"]) return pd.DataFrame(rows) -def _comparison_lens_treemap_figure(tdf: pd.DataFrame, title: str) -> go.Figure: +def _comparison_lens_treemap_figure(tdf: pd.DataFrame, title: str) 
-> Optional[go.Figure]: + if tdf.empty or "n" not in tdf.columns: + return None fig = px.treemap( tdf, path=["root", "side", "item"], From daa8361ec233ef57af44bbfd728fa482425732d4 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Fri, 1 May 2026 14:06:59 +0900 Subject: [PATCH 48/94] feat: implement evaluator API and run evaluator workflow - Introduced a new Evaluator API wrapper for job scheduling and status polling, enhancing the evaluation process. - Added a new task `job_run_evaluator_and_process` to streamline the workflow of scheduling an evaluator job, polling for completion, downloading results, running evaluations, and generating parquet files. - Updated the UI to include a new tab for running the evaluator process, allowing users to configure and execute the complete workflow in one click. - Enhanced the task summary display to provide detailed results for the new evaluator workflow, improving user feedback and accessibility. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/lib/evaluator_api.py | 518 ++++++++++++++++++ evaluation_dashboard_app/pages/6_Download.py | 378 ++++++++++++- evaluation_dashboard_app/worker/tasks.py | 258 +++++++++ 3 files changed, 1151 insertions(+), 3 deletions(-) create mode 100644 evaluation_dashboard_app/lib/evaluator_api.py diff --git a/evaluation_dashboard_app/lib/evaluator_api.py b/evaluation_dashboard_app/lib/evaluator_api.py new file mode 100644 index 0000000..3ceb243 --- /dev/null +++ b/evaluation_dashboard_app/lib/evaluator_api.py @@ -0,0 +1,518 @@ +""" +Evaluator API wrapper for job scheduling and status polling. +Based on evaluator_run_api.py from EvaluatorRunnerUITest, extended with polling support. 
+""" + +from __future__ import annotations + +import json +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Callable, Optional + +import requests +import webautoauth.requests +from requests.adapters import HTTPAdapter +from urllib3.util import Retry + +EVALUATION_API_BASE_URL = "https://evaluation.ci.web.auto/v3" +EVALUATION_REPORT_BASE_URL = "https://evaluation.tier4.jp/evaluation/reports" +DEFAULT_WEBAUTO_AUTH_PATH = Path.home() / ".webauto" / "auth.toml" + + +@dataclass(frozen=True) +class TestCaseDefinition: + test_id: str + project_id: str + catalog_id: str + integration_id: str + suite_ids: list[str] + catalog_display_name_prefix: str = "" + + +class EvaluationAPIError(RuntimeError): + """Raised when the evaluation API returns an unexpected response.""" + + +def load_test_cases(path: Path | str) -> dict[str, dict[str, Any]]: + path = Path(path) + with path.open("r", encoding="utf-8") as file: + return json.load(file) + + +def resolve_test_case(test_id: str, source: Any) -> TestCaseDefinition: + test_cases = normalize_test_case_mapping(source) + if test_id not in test_cases: + raise KeyError(f"Unknown test_id: {test_id}") + data = test_cases[test_id] + return make_test_case_definition(test_id, data) + + +def make_test_case_definition(test_id: str, data: dict[str, Any]) -> TestCaseDefinition: + return TestCaseDefinition( + test_id=test_id, + project_id=data["project_id"], + catalog_id=data["catalog_id"], + integration_id=data["integration_id"], + suite_ids=list(data.get("suite_ids", [])), + catalog_display_name_prefix=data.get("catalog_display_name_prefix", ""), + ) + + +def normalize_test_case_mapping(source: Any) -> dict[str, dict[str, Any]]: + """Normalize a test-case source into a mapping keyed by test_id.""" + if isinstance(source, dict): + return source + if isinstance(source, (str, Path)): + return load_test_cases(Path(source)) + raise TypeError("test case source must be a dict or JSON file path") + + +def 
normalize_test_case_definition( + test_case: Any, *, test_id: str = "custom" +) -> TestCaseDefinition: + """Normalize one test case definition.""" + if isinstance(test_case, TestCaseDefinition): + return test_case + if isinstance(test_case, dict): + return make_test_case_definition(test_id, test_case) + raise TypeError("test_case must be a TestCaseDefinition or dict") + + +def get_job_report_url(project_id: str, job_id: str) -> str: + return f"{EVALUATION_REPORT_BASE_URL}/{job_id}/?project_id={project_id}" + + +def get_suite_report_url(project_id: str, job_id: str, suite_report_id: str) -> str: + return f"{EVALUATION_REPORT_BASE_URL}/{job_id}/tests/{suite_report_id}?project_id={project_id}" + + +def extract_job_id(url: str) -> str: + if "/reports/" in url: + url = url.split("/reports/")[1] + if "/" in url: + url = url.split("/")[0] + if "?" in url: + url = url.split("?")[0] + return url + + +def extract_project_id(url: str) -> str: + if "project_id=" in url: + return url.split("project_id=")[1] + return url + + +def _make_session(auth_path: Path | str | None = DEFAULT_WEBAUTO_AUTH_PATH): + """Build authenticated session for evaluation.ci.web.auto API.""" + headers = { + "Content-Type": "application/json", + "accept": "application/json", + } + if auth_path is not None: + auth_path = Path(auth_path).expanduser().resolve() + if not auth_path.exists(): + raise FileNotFoundError(f"webauto auth config not found: {auth_path}") + from webautoauth.token import HttpService, TokenSource, load_config + + config = load_config() + token_source = TokenSource(HttpService(config)) + session = webautoauth.requests.make_session(token_source) + presigned = requests.Session() + retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504]) + presigned.mount("http://", HTTPAdapter(max_retries=retries)) + presigned.mount("https://", HTTPAdapter(max_retries=retries)) + return session, presigned, headers + + +def get_evaluator_session(environment: str = "default"): + 
"""Public API: same session as worker. Returns (session, presigned, headers).""" + import os + os.environ["AUTH_PROFILE"] = environment + return _make_session() + + +class EvaluationRunAPI: + """Minimal wrapper for scheduling evaluation jobs and collecting reports.""" + + def __init__( + self, + api_base_url: str = EVALUATION_API_BASE_URL, + *, + auth_path: Path | str | None = DEFAULT_WEBAUTO_AUTH_PATH, + test_cases: Optional[dict[str, dict[str, Any]]] = None, + ) -> None: + self.api_base_url = api_base_url.rstrip("/") + self._session, self._presigned, self._headers = _make_session(auth_path) + self.test_cases = test_cases or {} + + def request(self, url: str, params: Optional[dict[str, Any]] = None, method: str = "GET"): + if method == "GET": + from urllib.parse import urlencode + if params: + return self._session.get(f"{url}?{urlencode(params)}", headers=self._headers) + return self._session.get(url, headers=self._headers) + + if method == "POST": + if params is None: + return self._session.post(url, headers=self._headers) + return self._session.post( + url, + data=json.dumps(params).encode("utf-8"), + headers=self._headers, + ) + + raise ValueError(f"Unsupported method: {method}") + + def schedule_job( + self, + *, + project_id: str, + catalog_id: str, + integration_id: str, + target_name: str, + suite_ids: Optional[list[str]] = None, + max_retries: int = 1, + description: str = "no description", + clean_build: bool = False, + debug: bool = False, + release: bool = False, + record_caret: bool = False, + log_expiration_time_in_days: float = 14.0, + is_tag: bool = False, + ) -> dict[str, Any]: + payload = { + "build_options": { + "clean_build": clean_build, + "debug": debug, + }, + "catalog_id": catalog_id, + "description": description, + "integration_id": integration_id, + "release": release, + "source": {"git_tag" if is_tag else "git_branch": str(target_name)}, + "suite_ids": suite_ids or [], + "test_options": { + "max_retries": max_retries, + "record_caret": 
record_caret, + "log_expiration_time": int(log_expiration_time_in_days * 24 * 60 * 60), + }, + } + if record_caret: + payload["build_options"]["developer_option_names"] = [ + "webauto:ci:caret_enabled" + ] + + url = f"{self.api_base_url}/projects/{project_id}/jobs/schedule" + response = self.request(url, payload, method="POST") + if response is None: + raise EvaluationAPIError("No response returned from evaluation API") + if response.status_code != 202: + raise EvaluationAPIError( + f"Failed to schedule job: status={response.status_code}, body={response.text}" + ) + return json.loads(response.content) + + def schedule_job_by_test_id( + self, + test_id: str, + *, + target_name: str, + test_cases: Any = None, + max_retries: int = 1, + description: str = "no description", + clean_build: bool = False, + debug: bool = False, + release: bool = False, + record_caret: bool = False, + log_expiration_time_in_days: float = 14.0, + is_tag: bool = False, + ) -> dict[str, Any]: + if test_cases is None: + if not self.test_cases: + raise ValueError( + "No test case source provided. Pass `test_cases=...` or use schedule_job()." 
+ ) + source = self.test_cases + else: + source = test_cases + + test_case = resolve_test_case(test_id, source) + return self.schedule_job( + project_id=test_case.project_id, + catalog_id=test_case.catalog_id, + integration_id=test_case.integration_id, + target_name=target_name, + suite_ids=test_case.suite_ids, + max_retries=max_retries, + description=description, + clean_build=clean_build, + debug=debug, + release=release, + record_caret=record_caret, + log_expiration_time_in_days=log_expiration_time_in_days, + is_tag=is_tag, + ) + + def schedule_job_by_definition( + self, + test_case: TestCaseDefinition | dict[str, Any], + *, + target_name: str, + test_id: str = "custom", + max_retries: int = 1, + description: str = "no description", + clean_build: bool = False, + debug: bool = False, + release: bool = False, + record_caret: bool = False, + log_expiration_time_in_days: float = 14.0, + is_tag: bool = False, + ) -> dict[str, Any]: + definition = normalize_test_case_definition(test_case, test_id=test_id) + return self.schedule_job( + project_id=definition.project_id, + catalog_id=definition.catalog_id, + integration_id=definition.integration_id, + target_name=target_name, + suite_ids=definition.suite_ids, + max_retries=max_retries, + description=description, + clean_build=clean_build, + debug=debug, + release=release, + record_caret=record_caret, + log_expiration_time_in_days=log_expiration_time_in_days, + is_tag=is_tag, + ) + + def get_job_status(self, project_id: str, job_id: str) -> dict[str, Any]: + """Get current job status from the API.""" + url = f"{self.api_base_url}/projects/{project_id}/jobs/{job_id}/report" + response = self.request(url, {}) + if response is None: + raise EvaluationAPIError("No response returned from evaluation API") + if response.status_code != 200: + raise EvaluationAPIError( + f"Failed to get job status: status={response.status_code}, body={response.text}" + ) + return json.loads(response.content) + + def is_job_completed(self, 
project_id: str, job_id: str) -> tuple[bool, str, dict[str, Any]]: + """ + Check if a job has completed (success or failure). + Returns (is_completed, status, report_data). + Status can be: 'pending', 'running', 'succeeded', 'failed', 'canceled', 'unknown' + """ + report = self.get_job_status(project_id, job_id) + + # Check test status first (this is the actual evaluation result) + test = report.get("test") or {} + test_status = test.get("status", "") + + # Check build status as fallback + build = report.get("build") or {} + build_status = build.get("status", "") + + # Determine overall status + if test_status: + status = test_status + elif build_status: + status = build_status + else: + status = report.get("status", "unknown") + + # Check if completed (not pending/running) + is_completed = status.lower() in ("succeeded", "failed", "canceled", "cancelled") + + return is_completed, status, report + + def wait_for_job_completion( + self, + project_id: str, + job_id: str, + poll_interval: float = 60.0, + max_wait_seconds: float = 3600.0 * 24 * 7, # Default 1 week + on_progress: Optional[Callable[[str], None]] = None, + on_check: Optional[Callable[[str, float], None]] = None, + ) -> dict[str, Any]: + """ + Poll job status until completion or timeout. 
+ + Args: + project_id: Project ID + job_id: Job ID to wait for + poll_interval: Seconds between status checks (default 60s) + max_wait_seconds: Maximum seconds to wait (default 1 week) + on_progress: Callback for progress messages (receives message string) + on_check: Callback after each check (receives status string, elapsed seconds) + + Returns: + Final job report dict + + Raises: + EvaluationAPIError: If timeout or API error + """ + start_time = time.time() + last_status = "unknown" + + if on_progress: + on_progress(f"Waiting for evaluator job {job_id} to complete...") + + while True: + elapsed = time.time() - start_time + + # Check timeout + if elapsed > max_wait_seconds: + raise EvaluationAPIError( + f"Timeout waiting for job {job_id} after {elapsed:.0f}s" + ) + + try: + is_completed, status, report = self.is_job_completed(project_id, job_id) + last_status = status + + if on_check: + on_check(status, elapsed) + + if is_completed: + if on_progress: + on_progress(f"Job {job_id} completed with status: {status}") + return report + + # Log progress periodically (every 5 minutes or on status change) + if on_progress and (elapsed < 60 or int(elapsed) % 300 < poll_interval): + on_progress( + f"Job {job_id} status: {status} (elapsed: {elapsed/3600:.1f}h)" + ) + + except Exception as e: + if on_progress: + on_progress(f"Error checking job status: {e}") + # Continue polling on transient errors + + time.sleep(poll_interval) + + def get_report_list( + self, + project_id: str, + *, + status: str = "all", + max_results: Optional[int] = None, + catalog_id: Optional[str] = None, + ) -> list[dict[str, Any]]: + reports: list[dict[str, Any]] = [] + next_token = "" + url = f"{self.api_base_url}/projects/{project_id}/jobs/reports" + while True: + params = { + "next_token": next_token, + "size": 100, + "status": status, + } + if catalog_id is not None: + params["catalog_id"] = catalog_id + + response = self.request(url, params) + if response is None: + raise EvaluationAPIError("No 
response returned from evaluation API") + if response.status_code != 200: + raise EvaluationAPIError( + f"Failed to fetch report list: status={response.status_code}, body={response.text}" + ) + + data = json.loads(response.content) + reports.extend(data.get("reports", [])) + next_token = data.get("next_token", "") + if next_token == "": + return reports + if max_results is not None and len(reports) >= max_results: + return reports[:max_results] + + def get_suite_reports(self, project_id: str, job_id: str) -> list[dict[str, Any]]: + return self._get_paginated_reports( + f"{self.api_base_url}/projects/{project_id}/jobs/{job_id}/test/suite/reports" + ) + + def get_spec_reports(self, project_id: str, job_id: str) -> list[dict[str, Any]]: + return self._get_paginated_reports( + f"{self.api_base_url}/projects/{project_id}/jobs/{job_id}/test/spec/reports" + ) + + def get_case_reports(self, project_id: str, job_id: str) -> list[dict[str, Any]]: + return self._get_paginated_reports( + f"{self.api_base_url}/projects/{project_id}/jobs/{job_id}/test/case/reports" + ) + + def get_build_reports(self, project_id: str, job_id: str) -> dict[str, Any]: + url = f"{self.api_base_url}/projects/{project_id}/jobs/{job_id}/build/reports" + response = self.request(url, {}) + if response is None: + raise EvaluationAPIError("No response returned from evaluation API") + if response.status_code != 200: + raise EvaluationAPIError( + f"Failed to fetch build reports: status={response.status_code}, body={response.text}" + ) + return json.loads(response.content) + + def get_job_report(self, project_id: str, job_id: str) -> dict[str, Any]: + url = f"{self.api_base_url}/projects/{project_id}/jobs/{job_id}/report" + response = self.request(url, {}) + if response is None: + raise EvaluationAPIError("No response returned from evaluation API") + if response.status_code != 200: + raise EvaluationAPIError( + f"Failed to fetch job report: status={response.status_code}, body={response.text}" + ) + return 
json.loads(response.content) + + def get_suite_summary( + self, + project_id: str, + job_id: str, + *, + use_available_case_results: bool = False, + ) -> list[dict[str, Any]]: + mode = "available_case_results" if use_available_case_results else "case_results" + summaries: list[dict[str, Any]] = [] + for suite_report in self.get_suite_reports(project_id, job_id): + if mode not in suite_report: + continue + + result = suite_report[mode] + cancellation_count = result.get("cancellation_count", 0) + summaries.append( + { + "name": suite_report["suite"]["display_name"], + "all": result["total_count"] + cancellation_count, + "success": result["success_count"], + "fail": result["failure_count"] + cancellation_count, + "cancel": cancellation_count, + "simulation": suite_report["simulation"]["name"], + "url": get_suite_report_url(project_id, job_id, suite_report["id"]), + } + ) + return summaries + + def _get_paginated_reports(self, url: str) -> list[dict[str, Any]]: + reports: list[dict[str, Any]] = [] + next_token = "" + while True: + params = { + "next_token": next_token, + "size": 100, + } + response = self.request(url, params) + if response is None: + raise EvaluationAPIError("No response returned from evaluation API") + if response.status_code != 200: + raise EvaluationAPIError( + f"Failed to fetch paginated reports: status={response.status_code}, body={response.text}" + ) + + data = json.loads(response.content) + reports.extend(data.get("reports", [])) + next_token = data.get("next_token", "") + if next_token == "": + return reports diff --git a/evaluation_dashboard_app/pages/6_Download.py b/evaluation_dashboard_app/pages/6_Download.py index 0f74f8b..ef19fd4 100644 --- a/evaluation_dashboard_app/pages/6_Download.py +++ b/evaluation_dashboard_app/pages/6_Download.py @@ -784,6 +784,7 @@ def _task_type_label(task_type: str) -> str: "generate_summary_csv": "Generate summary CSV", "build_parquet": "Build parquet", "download_and_eval": "Download + Eval", + 
"run_evaluator_and_process": "Run Evaluator + Process", } return labels.get(task_type, task_type or "Task") @@ -810,6 +811,11 @@ def _task_summary(t: Dict[str, Any]) -> str: if params.get("generate_parquet"): parts.append("parquet") return f"job_id={params.get('job_id', '')} [{'+'.join(parts)}] → {out}" + if task_type == "run_evaluator_and_process": + target = params.get("target_name", "") + is_tag = params.get("is_tag", False) + target_type = "tag" if is_tag else "branch" + return f"{target_type}={target} → {params.get('output_path', '')}" return "" @@ -942,6 +948,45 @@ def _render_result_summary(summary: Dict[str, Any]) -> None: st.error("Errors during execution:") for err in errors: st.write(f"- {err}") + elif job == "run_evaluator_and_process": + evaluator_job_id = summary.get("evaluator_job_id", "") + evaluator_report_url = summary.get("evaluator_report_url", "") + evaluator_status = summary.get("evaluator_status", "unknown") + dl_summary = summary.get("download_summary", {}) + eval_summary_data = summary.get("eval_summary", {}) + parquet_path = summary.get("parquet_path", "") + + st.subheader("Run Evaluator + Download + Eval + Parquet Summary") + + # Evaluator summary + st.write("🎯 **Evaluator**") + st.write(f" - Job ID: `{evaluator_job_id}`") + st.write(f" - Status: **{evaluator_status}**") + if evaluator_report_url: + st.markdown(f" - Report: [Open]({evaluator_report_url})") + + # Download summary + dl_total = dl_summary.get("total", 0) + dl_success = dl_summary.get("success", 0) + dl_failed = dl_summary.get("failed", 0) + st.write("📥 **Download**") + st.write(f" - Total: **{dl_total}**, Success: **{dl_success}**, Failed: **{dl_failed}**") + + # Eval summary + if eval_summary_data: + st.write("🧮 **Evaluation**") + st.write(f" - Directories processed: **{eval_summary_data.get('directories_processed', 0)}**") + st.write(f" - Success: **{eval_summary_data.get('success', 0)}**, Failed: **{eval_summary_data.get('failed', 0)}**") + st.write(f" - Summary.csv: 
**{eval_summary_data.get('summary_rows', 0)}** rows, Score.csv: **{eval_summary_data.get('score_rows', 0)}** rows") + + # Parquet summary + if parquet_path: + st.write("📦 **Parquet**") + st.write(f" - Output: `{parquet_path}`") + + # Show report URL prominently + if evaluator_report_url: + st.markdown(f"### [📊 View Evaluator Report]({evaluator_report_url})") else: st.json(summary) @@ -1375,8 +1420,8 @@ def on_suite_id_change(): st.markdown('

Pick a workflow

', unsafe_allow_html=True) -tab1, tab2, tab3, tab4 = st.tabs( - ["📥 Download Results", "🗺️ Download Scenarios", "📊 View Downloads", "🧮 Eval Results"] +tab1, tab2, tab3, tab4, tab5 = st.tabs( + ["📥 Download Results", "🗺️ Download Scenarios", "📊 View Downloads", "🧮 Eval Results", "🚀 Run Evaluator + Process"] ) @@ -2330,4 +2375,331 @@ def _update_progress_status(done: int, total_dirs: int): if notify_when_done: _emit_eval_finished_notification( f"Eval run finished with CSV error. Success: {success_count}, Skipped: {skipped_count}, Failed: {failed_count}. {e}" - ) \ No newline at end of file + ) + + +# === TAB 5: Run Evaluator + Download + Eval + Parquet === +with tab5: + st.header("🚀 Run Evaluator + Download + Eval + Parquet") + st.caption( + "Complete workflow: Schedule an evaluator job, wait for completion, download results, " + "run evaluation, and generate parquet - all in one click." + ) + + # Load catalog presets from sibling EvaluatorRunnerUITest directory + CATALOGS_PATH = Path("/home/leigu/EvaluatorRunnerUITest/catalogs.json") + try: + with open(CATALOGS_PATH, "r", encoding="utf-8") as f: + CATALOG_PRESETS = json.load(f) + catalog_names = [c["display_name"] for c in CATALOG_PRESETS] + except Exception: + CATALOG_PRESETS = [] + catalog_names = [] + + # Evaluator configuration + st.subheader("Evaluator Configuration") + + # Project ID + eval_project_id = st.text_input( + "Project ID", + value=get_config_value("eval_project_id", "x2_dev"), + help="Evaluator project ID (e.g., x2_dev)" + ) + set_config_value("eval_project_id", eval_project_id) + + # Catalog selection + if catalog_names: + selected_catalog_name = st.selectbox( + "Catalog (from presets)", + options=catalog_names, + index=0, + help="Select a catalog from presets" + ) + selected_catalog = next((c for c in CATALOG_PRESETS if c["display_name"] == selected_catalog_name), None) + if selected_catalog: + catalog_id = selected_catalog["catalog_id"] + integration_id = selected_catalog["integration_id"] + with 
st.expander("Selected Catalog Details"): + st.json(selected_catalog) + else: + st.warning("No catalog presets found. Enter manually below.") + catalog_id = None + integration_id = None + + # Manual override + with st.expander("Manual Override"): + manual_catalog_id = st.text_input( + "Catalog ID (override)", + value=get_config_value("manual_catalog_id", ""), + help="Override catalog ID" + ) + set_config_value("manual_catalog_id", manual_catalog_id) + manual_integration_id = st.text_input( + "Integration ID (override)", + value=get_config_value("manual_integration_id", ""), + help="Override integration ID" + ) + set_config_value("manual_integration_id", manual_integration_id) + if manual_catalog_id: + catalog_id = manual_catalog_id + if manual_integration_id: + integration_id = manual_integration_id + + # Branch/Tag configuration + st.subheader("Branch Configuration") + target_name = st.text_input( + "Branch or Tag Name", + value=get_config_value("target_name", "beta/v4.3.2"), + help="Git branch name or tag to evaluate" + ) + set_config_value("target_name", target_name) + + is_tag = st.checkbox( + "Use as git tag (instead of branch)", + value=get_config_value("is_tag", False), + key="is_tag_checkbox" + ) + set_config_value("is_tag", is_tag) + + description = st.text_input( + "Description", + value=get_config_value("eval_description", ""), + help="Description for this evaluation run" + ) + if not description: + description = f"Auto-eval from dashboard at {datetime.now().isoformat()}" + set_config_value("eval_description", description) + + # Scheduling options + with st.expander("Advanced Scheduling Options"): + max_retries = st.number_input( + "Max Retries", + value=0, + min_value=0, + max_value=10, + help="Number of retries on failure" + ) + clean_build = st.checkbox( + "Clean Build", + value=get_config_value("clean_build", False), + help="Clean build before evaluation" + ) + set_config_value("clean_build", clean_build) + debug = st.checkbox( + "Debug Mode", + 
value=get_config_value("debug_mode", False), + help="Run in debug mode" + ) + set_config_value("debug_mode", debug) + + # Output path + st.subheader("Output Configuration") + eval_output_path = st.text_input( + "Output Path", + value=get_config_value("eval_output_path", "evaluator_run"), + help="Folder under data directory to save results" + ) + set_config_value("eval_output_path", eval_output_path) + + # Download options + with st.expander("Download Options"): + eval_download_type = st.radio( + "Download Type", + ["Archives (ZIP)", "Result JSON only"], + index=0, + horizontal=True, + help="What to download from evaluator results" + ) + if eval_download_type == "Archives (ZIP)": + eval_phase = st.text_input( + "Phase to extract", + value=get_config_value("eval_phase", "perception.object_recognition.tracking.objects"), + help="Phase name to extract from archives" + ) + eval_skip_large = st.checkbox( + "Skip large files", + value=get_config_value("eval_skip_large", False), + help="Skip large ZIP files" + ) + eval_large_mb = st.number_input( + "Large file threshold (MB)", + value=50.0, + min_value=1.0, + max_value=5000.0, + help="ZIP files larger than this will be skipped" + ) + set_config_value("eval_skip_large", eval_skip_large) + set_config_value("eval_large_mb", eval_large_mb) + else: + eval_phase = "" + eval_skip_large = False + eval_large_mb = 50.0 + set_config_value("eval_download_type", eval_download_type) + set_config_value("eval_phase", eval_phase) + + # Evaluator polling options + with st.expander("Evaluator Polling Options"): + st.caption("How long to wait for evaluator to complete") + poll_interval = st.number_input( + "Poll interval (seconds)", + value=60.0, + min_value=10.0, + max_value=600.0, + step=10.0, + help="How often to check evaluator status" + ) + max_wait_hours = st.number_input( + "Max wait time (hours)", + value=168.0, + min_value=1.0, + max_value=720.0, + step=1.0, + help="Maximum hours to wait for evaluator (default 168h = 1 week)" + ) + 
max_wait_seconds = max_wait_hours * 3600 + set_config_value("poll_interval", poll_interval) + + # Post-evaluator options + st.subheader("Post-Evaluator Processing") + eval_run_eval = st.checkbox( + "Run evaluation (eval_result + Summary/Score CSV)", + value=True, + key="eval_run_eval_checkbox", + help="Run eval_result on downloaded directories" + ) + eval_generate_parquet = st.checkbox( + "Generate parquet", + value=CATALOG_IO_AVAILABLE, + key="eval_generate_parquet_checkbox", + help="Build scene_result.parquet from .pkl files" if CATALOG_IO_AVAILABLE else "Install perception_catalog_analyzer to enable", + disabled=not CATALOG_IO_AVAILABLE + ) + eval_recursive = st.checkbox( + "Search subdirectories for eval", + value=True, + key="eval_recursive_checkbox", + help="Recursively search for result directories" + ) + + # Run button + st.divider() + + if st.button("🚀 Run Evaluator + Download + Eval + Parquet", type="primary", key="run_evaluator_full_btn"): + # Validate inputs + if not eval_project_id: + st.error("Project ID is required") + st.stop() + if not catalog_id: + st.error("Catalog ID is required (select preset or enter manually)") + st.stop() + if not integration_id: + st.error("Integration ID is required (select preset or enter manually)") + st.stop() + if not target_name: + st.error("Branch or Tag name is required") + st.stop() + + # Resolve output path + resolved_output, path_err = resolve_under_data_root(eval_output_path, allow_create=True) + if path_err: + st.error(f"Output path is invalid: {path_err}") + st.stop() + resolved_path_str = str(resolved_output) + + # Prepare parameters + params = { + "project_id": eval_project_id, + "catalog_id": catalog_id, + "integration_id": integration_id, + "suite_ids": None, # Can be configured later if needed + "target_name": target_name, + "description": description, + "output_path": resolved_path_str, + "environment": environment, + # Scheduling options + "max_retries": max_retries, + "clean_build": clean_build, + 
"debug": debug, + "is_tag": is_tag, + # Download options + "download_type": "archives" if eval_download_type == "Archives (ZIP)" else "result_json", + "phase": eval_phase, + "skip_large_file": eval_skip_large, + "large_file_mb": eval_large_mb, + "keep_zip_files": False, + # Polling options + "poll_interval": poll_interval, + "max_wait_seconds": max_wait_seconds, + # Post-evaluator options + "run_eval": eval_run_eval, + "generate_parquet": eval_generate_parquet, + "eval_recursive": eval_recursive, + "eval_overwrite": False, + } + + if is_task_queue_enabled(): + task_id = _enqueue_task("run_evaluator_and_process", params) + if task_id: + st.success(f"Task queued: {task_id}") + st.info( + "The workflow will:\n" + "1. Schedule evaluator job\n" + "2. Poll until evaluator completes (may take hours)\n" + "3. Download results\n" + "4. Run eval (if enabled)\n" + "5. Generate parquet (if enabled)\n\n" + "Check the **Task status** section below for progress." + ) + # Show preview of params + with st.expander("Task Parameters Preview"): + st.json({ + "project_id": params["project_id"], + "catalog_id": params["catalog_id"], + "integration_id": params["integration_id"], + "target_name": params["target_name"], + "is_tag": params["is_tag"], + "output_path": params["output_path"], + "poll_interval": params["poll_interval"], + "max_wait_hours": params["max_wait_seconds"] / 3600, + "run_eval": params["run_eval"], + "generate_parquet": params["generate_parquet"], + }) + else: + st.error("Failed to enqueue task. Check REDIS_URL and DATABASE_URL.") + else: + st.error( + "Task queue is not enabled. Please set USE_TASK_QUEUE=true in your environment. " + "This workflow requires background task execution because the evaluator can take a long time." + ) + + # Information + with st.expander("ℹ️ How this workflow works"): + st.markdown(""" + **Workflow Steps:** + + 1. 
**Schedule Evaluator Job** + - Submits job to Evaluator API with selected catalog and branch + - Returns immediately with a job_id + + 2. **Wait for Completion** + - Polls evaluator status every {poll_interval}s + - Maximum wait time: {max_wait_hours}h (configurable) + - Progress updates are logged to the task + + 3. **Download Results** + - Downloads archives or result JSON from completed job + - Extracts and organizes files by scenario + + 4. **Run Evaluation** (if enabled) + - Processes result.json files + - Generates Summary.csv and Score.csv + + 5. **Generate Parquet** (if enabled) + - Converts .pkl files to scene_result.parquet + + **Important Notes:** + - This workflow runs in the background via the worker + - You can close the browser and check progress later + - The evaluator job itself may take hours depending on queue and run time + - If evaluator fails or times out, download/eval will not proceed + """) diff --git a/evaluation_dashboard_app/worker/tasks.py b/evaluation_dashboard_app/worker/tasks.py index 8fbef57..cb26646 100644 --- a/evaluation_dashboard_app/worker/tasks.py +++ b/evaluation_dashboard_app/worker/tasks.py @@ -357,6 +357,263 @@ def job_download_and_eval(task_id: str, parameters: Dict[str, Any]) -> None: raise +def job_run_evaluator_and_process(task_id: str, parameters: Dict[str, Any]) -> None: + """ + Full combined workflow: Run Evaluator + Download + Eval + Parquet. + + Steps: + 1. Schedule evaluator job (get job_id) + 2. Poll until evaluator completes + 3. Download results + 4. Run eval + 5. 
Generate parquet + """ + update_task_status(task_id, "running") + append_task_log(task_id, "Starting run_evaluator_and_process workflow") + + try: + from lib import evaluator_api + from lib import download_core + + # Import eval_summary + eval_summary = _import_eval_summary() + pkl_archive_to_parquet = _import_catalog_io() + + # Extract parameters + project_id = parameters.get("project_id") + catalog_id = parameters.get("catalog_id") + integration_id = parameters.get("integration_id") + suite_ids = parameters.get("suite_ids") + target_name = parameters.get("target_name") # branch name or tag + description = parameters.get("description", "no description") + output_path = parameters.get("output_path") + + # Eval options + run_eval = parameters.get("run_eval", True) + generate_parquet = parameters.get("generate_parquet", True) + eval_recursive = parameters.get("eval_recursive", True) + eval_overwrite = parameters.get("eval_overwrite", False) + + # Download options + download_type = parameters.get("download_type", "archives") + phase = parameters.get("phase", "perception.object_recognition.tracking.objects") + skip_large_file = parameters.get("skip_large_file", False) + large_file_mb = float(parameters.get("large_file_mb", 50.0)) + keep_zip_files = parameters.get("keep_zip_files", False) + + # Evaluator polling options + poll_interval = float(parameters.get("poll_interval", 60.0)) + max_wait_seconds = float(parameters.get("max_wait_seconds", 3600.0 * 24 * 7)) # 1 week default + + # Scheduling options + max_retries = parameters.get("max_retries", 1) + clean_build = parameters.get("clean_build", False) + debug = parameters.get("debug", False) + is_tag = parameters.get("is_tag", False) + + if not all([project_id, catalog_id, integration_id, target_name, output_path]): + update_task_status(task_id, "failed", error_message="Missing required parameters") + return + + environment = parameters.get("environment", "default") + + def on_progress(msg: str) -> None: + 
append_task_log(task_id, msg) + update_task_progress(task_id, message=msg) + + def on_warning(msg: str) -> None: + append_task_log(task_id, f"WARNING: {msg}") + + # Step 1: Schedule evaluator job + on_progress("Step 1/5: Scheduling evaluator job...") + append_task_log(task_id, f"Project: {project_id}, Catalog: {catalog_id}, Target: {target_name}") + + try: + import os + os.environ["AUTH_PROFILE"] = environment + api = evaluator_api.EvaluationRunAPI() + + result = api.schedule_job( + project_id=project_id, + catalog_id=catalog_id, + integration_id=integration_id, + target_name=target_name, + suite_ids=suite_ids, + max_retries=max_retries, + description=description, + clean_build=clean_build, + debug=debug, + is_tag=is_tag, + ) + except Exception as e: + update_task_status(task_id, "failed", error_message=f"Failed to schedule evaluator job: {e}") + return + + job_id = result.get("job_id") + if not job_id: + update_task_status(task_id, "failed", error_message="No job_id returned from evaluator API") + return + + report_url = evaluator_api.get_job_report_url(project_id, job_id) + append_task_log(task_id, f"Scheduled evaluator job: {job_id}") + append_task_log(task_id, f"Report URL: {report_url}") + update_task_progress(task_id, message=f"Evaluator job scheduled: {job_id}", pct=5) + + # Step 2: Poll for evaluator completion + on_progress("Step 2/5: Waiting for evaluator to complete...") + append_task_log(task_id, "This may take a while depending on evaluator queue and run time...") + + def on_eval_progress(status: str, elapsed: float) -> None: + hours = elapsed / 3600 + msg = f"Evaluator status: {status} (elapsed: {hours:.1f}h)" + append_task_log(task_id, msg) + # Progress: 5% to 40% during evaluation wait + pct = min(5 + (elapsed / max_wait_seconds) * 35, 40) + update_task_progress(task_id, message=f"Evaluator: {status} ({hours:.1f}h elapsed)", pct=pct) + + try: + final_report = api.wait_for_job_completion( + project_id=project_id, + job_id=job_id, + 
poll_interval=poll_interval, + max_wait_seconds=max_wait_seconds, + on_check=on_eval_progress, + ) + except evaluator_api.EvaluationAPIError as e: + append_task_log(task_id, f"Evaluator wait error: {e}") + update_task_status(task_id, "failed", error_message=f"Evaluator failed or timed out: {e}") + return + + # Check if evaluator succeeded + test = final_report.get("test") or {} + test_status = test.get("status", "unknown") + if test_status not in ("succeeded", "success"): + update_task_status(task_id, "failed", error_message=f"Evaluator job failed with status: {test_status}") + return + + update_task_progress(task_id, message="Evaluator completed successfully", pct=40) + append_task_log(task_id, f"Evaluator completed with status: {test_status}") + + # Step 3: Download results + on_progress("Step 3/5: Downloading results...") + update_task_progress(task_id, message="Downloading results...", pct=45) + + try: + dl_result = download_core.run_download_results( + project_id=project_id, + job_id=job_id, + suite_id=None, + output_path=output_path, + download_type=download_type, + phase=phase, + skip_large_file=skip_large_file, + large_file_mb=large_file_mb, + keep_zip_files=keep_zip_files, + suite_ids=suite_ids, + on_progress=on_progress, + on_warning=on_warning, + ) + failure_count, total_attempted, rows = dl_result + success_count = total_attempted - failure_count + download_success = success_count > 0 + + if not download_success: + update_task_status(task_id, "failed", + error_message=f"Download failed: {failure_count} of {total_attempted} scenarios failed") + return + + except Exception as e: + update_task_status(task_id, "failed", error_message=f"Download failed: {e}") + return + + update_task_progress(task_id, message=f"Download complete: {success_count}/{total_attempted} succeeded", pct=60) + + # Step 4: Run eval + if run_eval: + on_progress("Step 4/5: Running evaluation...") + update_task_progress(task_id, message="Running evaluation...", pct=65) + + target_dirs = 
eval_summary.find_eval_result_dirs(output_path, recursive=eval_recursive) + if target_dirs: + total = len(target_dirs) + eval_statuses = [] + for i, result_dir in enumerate(target_dirs): + pct = 65 + (i / total) * 20 + update_task_progress(task_id, message=f"Evaluating {i+1}/{total}: {result_dir}", pct=pct) + status = eval_summary.run_eval_result_for_dir(result_dir, overwrite=eval_overwrite) + eval_statuses.append(status) + if status.get("status") == "failed": + append_task_log(task_id, f"Eval failed for {result_dir}: {status.get('detail', '')}") + + # Generate summary CSVs + csv_info = eval_summary.generate_summary_and_score_csv(output_path) + failed = [s for s in eval_statuses if s.get("status") == "failed"] + skipped = [s for s in eval_statuses if s.get("status") == "skipped"] + succeeded = [s for s in eval_statuses if s.get("status") == "success"] + + eval_result_summary = { + "directories_processed": total, + "success": len(succeeded), + "failed": len(failed), + "skipped": len(skipped), + "summary_path": csv_info.get("summary_path", output_path), + "summary_rows": csv_info.get("summary_rows", 0), + "score_rows": csv_info.get("score_rows", 0), + } + append_task_log(task_id, f"Eval complete: {len(succeeded)}/{total} succeeded") + else: + eval_result_summary = {"directories_processed": 0, "success": 0, "failed": 0, "skipped": 0} + append_task_log(task_id, "No eval result directories found") + else: + eval_result_summary = {} + + update_task_progress(task_id, message="Evaluation complete", pct=85) + + # Step 5: Generate parquet + parquet_path = "" + if generate_parquet and pkl_archive_to_parquet: + on_progress("Step 5/5: Generating parquet...") + update_task_progress(task_id, message="Generating parquet...", pct=90) + + try: + parquet_path = pkl_archive_to_parquet( + output_path, + on_progress=None, + on_skip=None, + project_id=project_id, + job_id=job_id, + ) + append_task_log(task_id, f"Parquet generated: {parquet_path}") + except Exception as e: + 
append_task_log(task_id, f"Parquet generation failed: {e}") + parquet_path = "" + + update_task_progress(task_id, message="All steps complete", pct=100) + + # Build final summary + summary = { + "job": "run_evaluator_and_process", + "evaluator_job_id": job_id, + "evaluator_report_url": report_url, + "evaluator_status": test_status, + "download_summary": { + "total": total_attempted, + "success": success_count, + "failed": failure_count, + }, + "eval_summary": eval_result_summary, + "parquet_path": parquet_path, + } + update_task_result_summary(task_id, summary) + append_task_log(task_id, "Workflow complete!") + update_task_status(task_id, "completed", result_path=output_path) + + except Exception as e: + append_task_log(task_id, f"Failed: {e}") + update_task_status(task_id, "failed", error_message=str(e)) + raise + + # Map task_type (from Postgres) to job function TASK_JOB_MAP = { "generate_summary_csv": job_generate_summary_csv, @@ -365,6 +622,7 @@ def job_download_and_eval(task_id: str, parameters: Dict[str, Any]) -> None: "download_results": job_download_results, "download_scenarios": job_download_scenarios, "download_and_eval": job_download_and_eval, + "run_evaluator_and_process": job_run_evaluator_and_process, } From 69010f04c8f5481af9739a7721ab41017b0ecb59 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Fri, 1 May 2026 14:28:45 +0900 Subject: [PATCH 49/94] feat: add catalog presets and enhance loading mechanism - Introduced a new `catalogs.json` file containing various evaluation catalogs for testing and performance metrics. - Updated the Dockerfile and docker-compose.yml to include the new catalogs.json file in the application context. - Implemented a new loading mechanism in the 6_Download.py page to dynamically load catalog presets from multiple potential paths, improving flexibility and error handling. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/Dockerfile | 1 + evaluation_dashboard_app/catalogs.json | 38 +++++++++ .../deploy/docker-compose.yml | 2 + evaluation_dashboard_app/lib/db.py | 1 + evaluation_dashboard_app/pages/6_Download.py | 77 ++++++++++++++++--- 5 files changed, 109 insertions(+), 10 deletions(-) create mode 100644 evaluation_dashboard_app/catalogs.json diff --git a/evaluation_dashboard_app/Dockerfile b/evaluation_dashboard_app/Dockerfile index e76b5f7..95dc9fb 100644 --- a/evaluation_dashboard_app/Dockerfile +++ b/evaluation_dashboard_app/Dockerfile @@ -73,6 +73,7 @@ RUN python3 -m pip install --no-cache-dir -r requirements-docker.txt COPY Overview.py . COPY Readme.md . COPY Readme.en.md . +COPY catalogs.json . COPY pages/ pages/ COPY lib/ lib/ COPY worker/ worker/ diff --git a/evaluation_dashboard_app/catalogs.json b/evaluation_dashboard_app/catalogs.json new file mode 100644 index 0000000..4270cc5 --- /dev/null +++ b/evaluation_dashboard_app/catalogs.json @@ -0,0 +1,38 @@ +[ + { + "display_name": "Build Test Catalog", + "catalog_id": "bd0569ec-9826-44ac-8780-45b4cea624e6", + "description": "Try this catalog for testing build integration", + "integration_id": "900d2096-a112-48f0-a65e-27e122aad86a" + }, + { + "display_name": "Performance Test", + "catalog_id": "e36d75b9-6c3a-4970-9b9b-5cd13f7a9da3", + "description": "To calculate perception metrics and create metrics report", + "integration_id": "96ad8fba-0228-4c2b-9166-07d4de1a0760" + }, + { + "display_name": "Old performance test", + "catalog_id": "e2efe01d-e0c6-4d49-8223-817ff5d73204", + "description": "Run perception metrics test we have done previously", + "integration_id": "6126e86f-615f-4b84-9643-91b88db606bd" + }, + { + "display_name": "Devops Test", + "catalog_id": "ab0f8498-cc1b-4726-836f-e18e8bcb3200", + "description": "Edge case for devops integration", + "integration_id": "295cff78-9bc9-4d60-b7aa-f95be6ff96a4" + }, + { + "display_name": 
"Usecase Performance Catalog", + "catalog_id": "09039022-ec91-41bf-9e93-fdefccdfc9bc", + "description": "[WIP] Run evaluation based on planning scene catalog.", + "integration_id": "51f89d37-5c65-4449-9add-8971d0a79a7a" + }, + { + "display_name": "L4 regression test", + "catalog_id": "14b1d54b-5c9f-4cbf-a7e1-0eebceb1d30f", + "description": "[WARN] This is a regression test for L4, please do not use it for other purposes", + "integration_id": "c5f58b3c-8974-4f33-a8fa-e1f443320cfd" + } +] diff --git a/evaluation_dashboard_app/deploy/docker-compose.yml b/evaluation_dashboard_app/deploy/docker-compose.yml index b55f647..17bd525 100644 --- a/evaluation_dashboard_app/deploy/docker-compose.yml +++ b/evaluation_dashboard_app/deploy/docker-compose.yml @@ -53,6 +53,7 @@ x-streamlit-app: &streamlit-app - ./configs:/app/docker_config # Mount app source so code changes apply without rebuild (Streamlit auto-reloads) - ../Overview.py:/app/Overview.py + - ../catalogs.json:/app/catalogs.json - ../pages:/app/pages - ../Readme.md:/app/Readme.md - ../Readme.en.md:/app/Readme.en.md @@ -122,6 +123,7 @@ services: - ./configs:/app/docker_config # Mount app source so code changes apply without rebuild (restart worker to pick up: docker compose restart worker) - ../Overview.py:/app/Overview.py + - ../catalogs.json:/app/catalogs.json - ../pages:/app/pages - ../Readme.md:/app/Readme.md - ../Readme.en.md:/app/Readme.en.md diff --git a/evaluation_dashboard_app/lib/db.py b/evaluation_dashboard_app/lib/db.py index 930de51..0276c72 100644 --- a/evaluation_dashboard_app/lib/db.py +++ b/evaluation_dashboard_app/lib/db.py @@ -30,6 +30,7 @@ def _task_log_timestamp_prefix() -> str: "generate_summary_csv", "build_parquet", "download_and_eval", + "run_evaluator_and_process", ) TASK_STATUSES = ("pending", "running", "completed", "failed") diff --git a/evaluation_dashboard_app/pages/6_Download.py b/evaluation_dashboard_app/pages/6_Download.py index ef19fd4..a9a5705 100644 --- 
a/evaluation_dashboard_app/pages/6_Download.py +++ b/evaluation_dashboard_app/pages/6_Download.py @@ -103,6 +103,60 @@ def _parse_rq_timeout_sec(raw: Optional[str], *, default: int, minimum: int) -> else: _BUILD_PARQUET_JOB_TIMEOUT_SEC = _RQ_DEFAULT_JOB_TIMEOUT_SEC +_APP_ROOT = Path(__file__).resolve().parents[1] +_CATALOGS_FILENAME = "catalogs.json" +_LEGACY_CATALOGS_PATH = Path("/home/leigu/EvaluatorRunnerUITest/catalogs.json") + + +def _catalog_preset_candidate_paths() -> List[Path]: + """Return catalog preset paths in priority order.""" + paths: List[Path] = [] + env_path = os.environ.get("EVAL_CATALOGS_PATH") + if env_path: + paths.append(Path(env_path).expanduser()) + + paths.extend( + [ + _APP_ROOT / _CATALOGS_FILENAME, + Path.cwd() / _CATALOGS_FILENAME, + _LEGACY_CATALOGS_PATH, + ] + ) + + unique_paths: List[Path] = [] + seen = set() + for path in paths: + key = os.fspath(path) + if key not in seen: + unique_paths.append(path) + seen.add(key) + return unique_paths + + +def _load_catalog_presets() -> tuple[List[Dict[str, Any]], Optional[Path], Optional[str]]: + """Load evaluator catalog presets from the first available catalogs.json.""" + required_keys = {"display_name", "catalog_id", "integration_id"} + for path in _catalog_preset_candidate_paths(): + if not path.is_file(): + continue + + try: + with path.open("r", encoding="utf-8") as f: + presets = json.load(f) + if not isinstance(presets, list): + raise ValueError("catalog preset file must contain a JSON list") + + valid_presets = [ + preset + for preset in presets + if isinstance(preset, dict) and required_keys.issubset(preset) + ] + return valid_presets, path, None + except Exception as exc: + return [], path, str(exc) + + return [], None, None + def _enqueue_task( task_type: str, @@ -2386,15 +2440,9 @@ def _update_progress_status(done: int, total_dirs: int): "run evaluation, and generate parquet - all in one click." 
) - # Load catalog presets from sibling EvaluatorRunnerUITest directory - CATALOGS_PATH = Path("/home/leigu/EvaluatorRunnerUITest/catalogs.json") - try: - with open(CATALOGS_PATH, "r", encoding="utf-8") as f: - CATALOG_PRESETS = json.load(f) - catalog_names = [c["display_name"] for c in CATALOG_PRESETS] - except Exception: - CATALOG_PRESETS = [] - catalog_names = [] + # Load catalog presets from the app root, with env/cwd/legacy fallbacks. + CATALOG_PRESETS, CATALOGS_PATH, catalog_load_error = _load_catalog_presets() + catalog_names = [c["display_name"] for c in CATALOG_PRESETS] # Evaluator configuration st.subheader("Evaluator Configuration") @@ -2422,7 +2470,16 @@ def _update_progress_status(done: int, total_dirs: int): with st.expander("Selected Catalog Details"): st.json(selected_catalog) else: - st.warning("No catalog presets found. Enter manually below.") + if catalog_load_error and CATALOGS_PATH is not None: + st.warning( + f"Catalog presets could not be loaded from `{CATALOGS_PATH}`: " + f"{catalog_load_error}. Enter manually below." + ) + else: + st.warning( + f"No catalog presets found. Expected `{_APP_ROOT / _CATALOGS_FILENAME}`. " + "Enter manually below." + ) catalog_id = None integration_id = None From 8e923da8c03a05d5792f2f16ba8ce8ee9e226679 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Fri, 1 May 2026 16:07:41 +0900 Subject: [PATCH 50/94] feat: enhance job status handling in evaluator API - Introduced functions to normalize and extract job statuses from evaluation reports, improving the robustness of status handling. - Added terminal and success job status checks to streamline evaluation result processing. - Refactored the `job_run_evaluator_and_process` function to utilize the new status extraction logic, enhancing clarity and maintainability. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/lib/evaluator_api.py | 67 ++++++++++++++----- evaluation_dashboard_app/worker/tasks.py | 7 +- 2 files changed, 52 insertions(+), 22 deletions(-) diff --git a/evaluation_dashboard_app/lib/evaluator_api.py b/evaluation_dashboard_app/lib/evaluator_api.py index 3ceb243..1b99be7 100644 --- a/evaluation_dashboard_app/lib/evaluator_api.py +++ b/evaluation_dashboard_app/lib/evaluator_api.py @@ -19,6 +19,11 @@ EVALUATION_API_BASE_URL = "https://evaluation.ci.web.auto/v3" EVALUATION_REPORT_BASE_URL = "https://evaluation.tier4.jp/evaluation/reports" DEFAULT_WEBAUTO_AUTH_PATH = Path.home() / ".webauto" / "auth.toml" +SUCCESS_JOB_STATUSES = frozenset({"succeeded", "success"}) +FAILED_JOB_STATUSES = frozenset( + {"failed", "failure", "error", "canceled", "cancelled", "aborted"} +) +TERMINAL_JOB_STATUSES = SUCCESS_JOB_STATUSES | FAILED_JOB_STATUSES @dataclass(frozen=True) @@ -35,6 +40,48 @@ class EvaluationAPIError(RuntimeError): """Raised when the evaluation API returns an unexpected response.""" +def normalize_job_status(status: Any) -> str: + if status is None: + return "" + return str(status).strip().lower() + + +def extract_job_status(report: dict[str, Any]) -> str: + """Return the best evaluator status from known report response shapes.""" + if not isinstance(report, dict): + return "unknown" + + status_paths = ( + ("test", "status"), + ("build", "status"), + ("job", "status"), + ("evaluation", "status"), + ("status",), + ("state",), + ) + for path in status_paths: + current: Any = report + for key in path: + if not isinstance(current, dict): + current = None + break + current = current.get(key) + + status = normalize_job_status(current) + if status: + return status + + return "unknown" + + +def is_terminal_job_status(status: Any) -> bool: + return normalize_job_status(status) in TERMINAL_JOB_STATUSES + + +def is_success_job_status(status: Any) -> bool: + return 
normalize_job_status(status) in SUCCESS_JOB_STATUSES + + def load_test_cases(path: Path | str) -> dict[str, dict[str, Any]]: path = Path(path) with path.open("r", encoding="utf-8") as file: @@ -307,24 +354,8 @@ def is_job_completed(self, project_id: str, job_id: str) -> tuple[bool, str, dic """ report = self.get_job_status(project_id, job_id) - # Check test status first (this is the actual evaluation result) - test = report.get("test") or {} - test_status = test.get("status", "") - - # Check build status as fallback - build = report.get("build") or {} - build_status = build.get("status", "") - - # Determine overall status - if test_status: - status = test_status - elif build_status: - status = build_status - else: - status = report.get("status", "unknown") - - # Check if completed (not pending/running) - is_completed = status.lower() in ("succeeded", "failed", "canceled", "cancelled") + status = extract_job_status(report) + is_completed = is_terminal_job_status(status) return is_completed, status, report diff --git a/evaluation_dashboard_app/worker/tasks.py b/evaluation_dashboard_app/worker/tasks.py index cb26646..0cbc66e 100644 --- a/evaluation_dashboard_app/worker/tasks.py +++ b/evaluation_dashboard_app/worker/tasks.py @@ -484,10 +484,9 @@ def on_eval_progress(status: str, elapsed: float) -> None: update_task_status(task_id, "failed", error_message=f"Evaluator failed or timed out: {e}") return - # Check if evaluator succeeded - test = final_report.get("test") or {} - test_status = test.get("status", "unknown") - if test_status not in ("succeeded", "success"): + # Check if evaluator succeeded using the same status extraction as the poller. 
+ test_status = evaluator_api.extract_job_status(final_report) + if not evaluator_api.is_success_job_status(test_status): update_task_status(task_id, "failed", error_message=f"Evaluator job failed with status: {test_status}") return From fa8e8788ec05a2f71dc1f0c0521006fc97d30a93 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Fri, 1 May 2026 16:17:36 +0900 Subject: [PATCH 51/94] feat: enhance job status extraction and download handling in evaluator workflow - Added support for additional job statuses, including "timed_out" and "timeout", to improve status reporting. - Refactored the `extract_job_status` function to utilize a new helper function for better clarity and maintainability. - Implemented a timeout mechanism in the `job_run_evaluator_and_process` function to handle download readiness, improving robustness against delays. - Enhanced error handling during downloads to provide clearer feedback on failures, ensuring better user experience. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/lib/evaluator_api.py | 85 +++++++++++++++---- evaluation_dashboard_app/worker/tasks.py | 74 ++++++++++------ 2 files changed, 116 insertions(+), 43 deletions(-) diff --git a/evaluation_dashboard_app/lib/evaluator_api.py b/evaluation_dashboard_app/lib/evaluator_api.py index 1b99be7..c339b1b 100644 --- a/evaluation_dashboard_app/lib/evaluator_api.py +++ b/evaluation_dashboard_app/lib/evaluator_api.py @@ -21,9 +21,26 @@ DEFAULT_WEBAUTO_AUTH_PATH = Path.home() / ".webauto" / "auth.toml" SUCCESS_JOB_STATUSES = frozenset({"succeeded", "success"}) FAILED_JOB_STATUSES = frozenset( - {"failed", "failure", "error", "canceled", "cancelled", "aborted"} + { + "failed", + "failure", + "error", + "canceled", + "cancelled", + "aborted", + "timed_out", + "timeout", + } ) TERMINAL_JOB_STATUSES = SUCCESS_JOB_STATUSES | FAILED_JOB_STATUSES +_TEST_STATUS_PATHS = (("test", "status"),) +_OVERALL_STATUS_PATHS = ( + ("job", "status"), + ("evaluation", 
"status"), + ("status",), + ("state",), +) +_BUILD_STATUS_PATHS = (("build", "status"),) @dataclass(frozen=True) @@ -46,20 +63,8 @@ def normalize_job_status(status: Any) -> str: return str(status).strip().lower() -def extract_job_status(report: dict[str, Any]) -> str: - """Return the best evaluator status from known report response shapes.""" - if not isinstance(report, dict): - return "unknown" - - status_paths = ( - ("test", "status"), - ("build", "status"), - ("job", "status"), - ("evaluation", "status"), - ("status",), - ("state",), - ) - for path in status_paths: +def _get_first_status(report: dict[str, Any], paths: tuple[tuple[str, ...], ...]) -> str: + for path in paths: current: Any = report for key in path: if not isinstance(current, dict): @@ -71,6 +76,26 @@ def extract_job_status(report: dict[str, Any]) -> str: if status: return status + return "" + + +def extract_job_status(report: dict[str, Any]) -> str: + """Return the best evaluator status from known report response shapes.""" + if not isinstance(report, dict): + return "unknown" + + test_status = _get_first_status(report, _TEST_STATUS_PATHS) + if test_status: + return test_status + + overall_status = _get_first_status(report, _OVERALL_STATUS_PATHS) + if overall_status: + return overall_status + + build_status = _get_first_status(report, _BUILD_STATUS_PATHS) + if build_status: + return f"build:{build_status}" + return "unknown" @@ -82,6 +107,33 @@ def is_success_job_status(status: Any) -> bool: return normalize_job_status(status) in SUCCESS_JOB_STATUSES +def get_job_completion(report: dict[str, Any]) -> tuple[bool, str]: + """ + Return (is_completed, status) for an evaluator job report. + + Build success only means the build phase is done; evaluator jobs can still be + running suites/tests after that. Build failure is terminal because tests cannot + proceed, but build success must not unlock downloads by itself. 
+ """ + if not isinstance(report, dict): + return False, "unknown" + + status = extract_job_status(report) + test_status = _get_first_status(report, _TEST_STATUS_PATHS) + if test_status: + return is_terminal_job_status(test_status), status + + overall_status = _get_first_status(report, _OVERALL_STATUS_PATHS) + if overall_status and is_terminal_job_status(overall_status): + return True, status + + build_status = _get_first_status(report, _BUILD_STATUS_PATHS) + if build_status in FAILED_JOB_STATUSES: + return True, status + + return False, status + + def load_test_cases(path: Path | str) -> dict[str, dict[str, Any]]: path = Path(path) with path.open("r", encoding="utf-8") as file: @@ -354,8 +406,7 @@ def is_job_completed(self, project_id: str, job_id: str) -> tuple[bool, str, dic """ report = self.get_job_status(project_id, job_id) - status = extract_job_status(report) - is_completed = is_terminal_job_status(status) + is_completed, status = get_job_completion(report) return is_completed, status, report diff --git a/evaluation_dashboard_app/worker/tasks.py b/evaluation_dashboard_app/worker/tasks.py index 0cbc66e..5b06c88 100644 --- a/evaluation_dashboard_app/worker/tasks.py +++ b/evaluation_dashboard_app/worker/tasks.py @@ -6,6 +6,7 @@ import os import re import sys +import time from typing import Any, Dict # App root on path for lib imports @@ -404,6 +405,10 @@ def job_run_evaluator_and_process(task_id: str, parameters: Dict[str, Any]) -> N # Evaluator polling options poll_interval = float(parameters.get("poll_interval", 60.0)) max_wait_seconds = float(parameters.get("max_wait_seconds", 3600.0 * 24 * 7)) # 1 week default + download_ready_timeout = float(parameters.get("download_ready_timeout", 1800.0)) + download_ready_poll_interval = float( + parameters.get("download_ready_poll_interval", min(max(poll_interval, 10.0), 60.0)) + ) # Scheduling options max_retries = parameters.get("max_retries", 1) @@ -497,33 +502,50 @@ def on_eval_progress(status: str, elapsed: float) 
-> None: on_progress("Step 3/5: Downloading results...") update_task_progress(task_id, message="Downloading results...", pct=45) - try: - dl_result = download_core.run_download_results( - project_id=project_id, - job_id=job_id, - suite_id=None, - output_path=output_path, - download_type=download_type, - phase=phase, - skip_large_file=skip_large_file, - large_file_mb=large_file_mb, - keep_zip_files=keep_zip_files, - suite_ids=suite_ids, - on_progress=on_progress, - on_warning=on_warning, - ) - failure_count, total_attempted, rows = dl_result - success_count = total_attempted - failure_count - download_success = success_count > 0 - - if not download_success: - update_task_status(task_id, "failed", - error_message=f"Download failed: {failure_count} of {total_attempted} scenarios failed") - return + download_deadline = time.time() + download_ready_timeout + while True: + try: + dl_result = download_core.run_download_results( + project_id=project_id, + job_id=job_id, + suite_id=None, + output_path=output_path, + download_type=download_type, + phase=phase, + skip_large_file=skip_large_file, + large_file_mb=large_file_mb, + keep_zip_files=keep_zip_files, + suite_ids=suite_ids, + on_progress=on_progress, + on_warning=on_warning, + ) + failure_count, total_attempted, rows = dl_result + success_count = total_attempted - failure_count + download_success = success_count > 0 - except Exception as e: - update_task_status(task_id, "failed", error_message=f"Download failed: {e}") - return + if not download_success: + update_task_status(task_id, "failed", + error_message=f"Download failed: {failure_count} of {total_attempted} scenarios failed") + return + break + + except RuntimeError as e: + if "No case reports found" not in str(e) or time.time() >= download_deadline: + update_task_status(task_id, "failed", error_message=f"Download failed: {e}") + return + + wait_seconds = min( + download_ready_poll_interval, + max(1.0, download_deadline - time.time()), + ) + msg = f"Case reports 
are not ready yet; retrying download in {wait_seconds:.0f}s" + append_task_log(task_id, f"{msg}. Detail: {e}") + update_task_progress(task_id, message=msg, pct=45) + time.sleep(wait_seconds) + + except Exception as e: + update_task_status(task_id, "failed", error_message=f"Download failed: {e}") + return update_task_progress(task_id, message=f"Download complete: {success_count}/{total_attempted} succeeded", pct=60) From 99c5e21b814889af359026416e92461dc3c30ec1 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Thu, 7 May 2026 14:34:20 +0900 Subject: [PATCH 52/94] feat: update description handling in download page for auto-evaluation - Changed the default value of the description input to an auto-generated timestamp, ensuring clarity on when evaluations are created. - Removed redundant logic that checked for an empty description, simplifying the code and enhancing user experience with a consistent auto-generated message. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/pages/6_Download.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/evaluation_dashboard_app/pages/6_Download.py b/evaluation_dashboard_app/pages/6_Download.py index a9a5705..7350f21 100644 --- a/evaluation_dashboard_app/pages/6_Download.py +++ b/evaluation_dashboard_app/pages/6_Download.py @@ -2518,14 +2518,13 @@ def _update_progress_status(done: int, total_dirs: int): ) set_config_value("is_tag", is_tag) + # Description - always use fresh timestamp for new tasks + default_description = f"Auto-eval from dashboard at {datetime.now().isoformat()}" description = st.text_input( "Description", - value=get_config_value("eval_description", ""), - help="Description for this evaluation run" + value=default_description, + help="Description for this evaluation run (auto-generated with current timestamp)" ) - if not description: - description = f"Auto-eval from dashboard at {datetime.now().isoformat()}" - set_config_value("eval_description", 
description) # Scheduling options with st.expander("Advanced Scheduling Options"): From 2a0248c7262d5e5a37271c942ebb91b4f06ec89c Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Thu, 7 May 2026 15:59:46 +0900 Subject: [PATCH 53/94] feat: simplify download page and introduce evaluator workflow - Removed the fifth tab from the download page, streamlining the user interface and focusing on essential workflows. - Added a new Evaluator Workflow page, providing a comprehensive end-to-end process for running evaluator jobs and processing results, enhancing user experience and functionality. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/pages/6_Download.py | 332 +--------- .../pages/7_Evaluator_Workflow.py | 575 ++++++++++++++++++ 2 files changed, 578 insertions(+), 329 deletions(-) create mode 100644 evaluation_dashboard_app/pages/7_Evaluator_Workflow.py diff --git a/evaluation_dashboard_app/pages/6_Download.py b/evaluation_dashboard_app/pages/6_Download.py index 7350f21..a1d7519 100644 --- a/evaluation_dashboard_app/pages/6_Download.py +++ b/evaluation_dashboard_app/pages/6_Download.py @@ -1474,8 +1474,8 @@ def on_suite_id_change(): st.markdown('

Pick a workflow

', unsafe_allow_html=True) -tab1, tab2, tab3, tab4, tab5 = st.tabs( - ["📥 Download Results", "🗺️ Download Scenarios", "📊 View Downloads", "🧮 Eval Results", "🚀 Run Evaluator + Process"] +tab1, tab2, tab3, tab4 = st.tabs( + ["📥 Download Results", "🗺️ Download Scenarios", "📊 View Downloads", "🧮 Eval Results"] ) @@ -2432,330 +2432,4 @@ def _update_progress_status(done: int, total_dirs: int): ) -# === TAB 5: Run Evaluator + Download + Eval + Parquet === -with tab5: - st.header("🚀 Run Evaluator + Download + Eval + Parquet") - st.caption( - "Complete workflow: Schedule an evaluator job, wait for completion, download results, " - "run evaluation, and generate parquet - all in one click." - ) - - # Load catalog presets from the app root, with env/cwd/legacy fallbacks. - CATALOG_PRESETS, CATALOGS_PATH, catalog_load_error = _load_catalog_presets() - catalog_names = [c["display_name"] for c in CATALOG_PRESETS] - - # Evaluator configuration - st.subheader("Evaluator Configuration") - - # Project ID - eval_project_id = st.text_input( - "Project ID", - value=get_config_value("eval_project_id", "x2_dev"), - help="Evaluator project ID (e.g., x2_dev)" - ) - set_config_value("eval_project_id", eval_project_id) - - # Catalog selection - if catalog_names: - selected_catalog_name = st.selectbox( - "Catalog (from presets)", - options=catalog_names, - index=0, - help="Select a catalog from presets" - ) - selected_catalog = next((c for c in CATALOG_PRESETS if c["display_name"] == selected_catalog_name), None) - if selected_catalog: - catalog_id = selected_catalog["catalog_id"] - integration_id = selected_catalog["integration_id"] - with st.expander("Selected Catalog Details"): - st.json(selected_catalog) - else: - if catalog_load_error and CATALOGS_PATH is not None: - st.warning( - f"Catalog presets could not be loaded from `{CATALOGS_PATH}`: " - f"{catalog_load_error}. Enter manually below." - ) - else: - st.warning( - f"No catalog presets found. 
Expected `{_APP_ROOT / _CATALOGS_FILENAME}`. " - "Enter manually below." - ) - catalog_id = None - integration_id = None - - # Manual override - with st.expander("Manual Override"): - manual_catalog_id = st.text_input( - "Catalog ID (override)", - value=get_config_value("manual_catalog_id", ""), - help="Override catalog ID" - ) - set_config_value("manual_catalog_id", manual_catalog_id) - manual_integration_id = st.text_input( - "Integration ID (override)", - value=get_config_value("manual_integration_id", ""), - help="Override integration ID" - ) - set_config_value("manual_integration_id", manual_integration_id) - if manual_catalog_id: - catalog_id = manual_catalog_id - if manual_integration_id: - integration_id = manual_integration_id - - # Branch/Tag configuration - st.subheader("Branch Configuration") - target_name = st.text_input( - "Branch or Tag Name", - value=get_config_value("target_name", "beta/v4.3.2"), - help="Git branch name or tag to evaluate" - ) - set_config_value("target_name", target_name) - - is_tag = st.checkbox( - "Use as git tag (instead of branch)", - value=get_config_value("is_tag", False), - key="is_tag_checkbox" - ) - set_config_value("is_tag", is_tag) - - # Description - always use fresh timestamp for new tasks - default_description = f"Auto-eval from dashboard at {datetime.now().isoformat()}" - description = st.text_input( - "Description", - value=default_description, - help="Description for this evaluation run (auto-generated with current timestamp)" - ) - - # Scheduling options - with st.expander("Advanced Scheduling Options"): - max_retries = st.number_input( - "Max Retries", - value=0, - min_value=0, - max_value=10, - help="Number of retries on failure" - ) - clean_build = st.checkbox( - "Clean Build", - value=get_config_value("clean_build", False), - help="Clean build before evaluation" - ) - set_config_value("clean_build", clean_build) - debug = st.checkbox( - "Debug Mode", - value=get_config_value("debug_mode", False), - help="Run 
in debug mode" - ) - set_config_value("debug_mode", debug) - - # Output path - st.subheader("Output Configuration") - eval_output_path = st.text_input( - "Output Path", - value=get_config_value("eval_output_path", "evaluator_run"), - help="Folder under data directory to save results" - ) - set_config_value("eval_output_path", eval_output_path) - - # Download options - with st.expander("Download Options"): - eval_download_type = st.radio( - "Download Type", - ["Archives (ZIP)", "Result JSON only"], - index=0, - horizontal=True, - help="What to download from evaluator results" - ) - if eval_download_type == "Archives (ZIP)": - eval_phase = st.text_input( - "Phase to extract", - value=get_config_value("eval_phase", "perception.object_recognition.tracking.objects"), - help="Phase name to extract from archives" - ) - eval_skip_large = st.checkbox( - "Skip large files", - value=get_config_value("eval_skip_large", False), - help="Skip large ZIP files" - ) - eval_large_mb = st.number_input( - "Large file threshold (MB)", - value=50.0, - min_value=1.0, - max_value=5000.0, - help="ZIP files larger than this will be skipped" - ) - set_config_value("eval_skip_large", eval_skip_large) - set_config_value("eval_large_mb", eval_large_mb) - else: - eval_phase = "" - eval_skip_large = False - eval_large_mb = 50.0 - set_config_value("eval_download_type", eval_download_type) - set_config_value("eval_phase", eval_phase) - - # Evaluator polling options - with st.expander("Evaluator Polling Options"): - st.caption("How long to wait for evaluator to complete") - poll_interval = st.number_input( - "Poll interval (seconds)", - value=60.0, - min_value=10.0, - max_value=600.0, - step=10.0, - help="How often to check evaluator status" - ) - max_wait_hours = st.number_input( - "Max wait time (hours)", - value=168.0, - min_value=1.0, - max_value=720.0, - step=1.0, - help="Maximum hours to wait for evaluator (default 168h = 1 week)" - ) - max_wait_seconds = max_wait_hours * 3600 - 
set_config_value("poll_interval", poll_interval) - - # Post-evaluator options - st.subheader("Post-Evaluator Processing") - eval_run_eval = st.checkbox( - "Run evaluation (eval_result + Summary/Score CSV)", - value=True, - key="eval_run_eval_checkbox", - help="Run eval_result on downloaded directories" - ) - eval_generate_parquet = st.checkbox( - "Generate parquet", - value=CATALOG_IO_AVAILABLE, - key="eval_generate_parquet_checkbox", - help="Build scene_result.parquet from .pkl files" if CATALOG_IO_AVAILABLE else "Install perception_catalog_analyzer to enable", - disabled=not CATALOG_IO_AVAILABLE - ) - eval_recursive = st.checkbox( - "Search subdirectories for eval", - value=True, - key="eval_recursive_checkbox", - help="Recursively search for result directories" - ) - - # Run button - st.divider() - - if st.button("🚀 Run Evaluator + Download + Eval + Parquet", type="primary", key="run_evaluator_full_btn"): - # Validate inputs - if not eval_project_id: - st.error("Project ID is required") - st.stop() - if not catalog_id: - st.error("Catalog ID is required (select preset or enter manually)") - st.stop() - if not integration_id: - st.error("Integration ID is required (select preset or enter manually)") - st.stop() - if not target_name: - st.error("Branch or Tag name is required") - st.stop() - - # Resolve output path - resolved_output, path_err = resolve_under_data_root(eval_output_path, allow_create=True) - if path_err: - st.error(f"Output path is invalid: {path_err}") - st.stop() - resolved_path_str = str(resolved_output) - - # Prepare parameters - params = { - "project_id": eval_project_id, - "catalog_id": catalog_id, - "integration_id": integration_id, - "suite_ids": None, # Can be configured later if needed - "target_name": target_name, - "description": description, - "output_path": resolved_path_str, - "environment": environment, - # Scheduling options - "max_retries": max_retries, - "clean_build": clean_build, - "debug": debug, - "is_tag": is_tag, - # 
Download options - "download_type": "archives" if eval_download_type == "Archives (ZIP)" else "result_json", - "phase": eval_phase, - "skip_large_file": eval_skip_large, - "large_file_mb": eval_large_mb, - "keep_zip_files": False, - # Polling options - "poll_interval": poll_interval, - "max_wait_seconds": max_wait_seconds, - # Post-evaluator options - "run_eval": eval_run_eval, - "generate_parquet": eval_generate_parquet, - "eval_recursive": eval_recursive, - "eval_overwrite": False, - } - - if is_task_queue_enabled(): - task_id = _enqueue_task("run_evaluator_and_process", params) - if task_id: - st.success(f"Task queued: {task_id}") - st.info( - "The workflow will:\n" - "1. Schedule evaluator job\n" - "2. Poll until evaluator completes (may take hours)\n" - "3. Download results\n" - "4. Run eval (if enabled)\n" - "5. Generate parquet (if enabled)\n\n" - "Check the **Task status** section below for progress." - ) - # Show preview of params - with st.expander("Task Parameters Preview"): - st.json({ - "project_id": params["project_id"], - "catalog_id": params["catalog_id"], - "integration_id": params["integration_id"], - "target_name": params["target_name"], - "is_tag": params["is_tag"], - "output_path": params["output_path"], - "poll_interval": params["poll_interval"], - "max_wait_hours": params["max_wait_seconds"] / 3600, - "run_eval": params["run_eval"], - "generate_parquet": params["generate_parquet"], - }) - else: - st.error("Failed to enqueue task. Check REDIS_URL and DATABASE_URL.") - else: - st.error( - "Task queue is not enabled. Please set USE_TASK_QUEUE=true in your environment. " - "This workflow requires background task execution because the evaluator can take a long time." - ) - - # Information - with st.expander("ℹ️ How this workflow works"): - st.markdown(""" - **Workflow Steps:** - - 1. **Schedule Evaluator Job** - - Submits job to Evaluator API with selected catalog and branch - - Returns immediately with a job_id - - 2. 
**Wait for Completion** - - Polls evaluator status every {poll_interval}s - - Maximum wait time: {max_wait_hours}h (configurable) - - Progress updates are logged to the task - - 3. **Download Results** - - Downloads archives or result JSON from completed job - - Extracts and organizes files by scenario - - 4. **Run Evaluation** (if enabled) - - Processes result.json files - - Generates Summary.csv and Score.csv - - 5. **Generate Parquet** (if enabled) - - Converts .pkl files to scene_result.parquet - - **Important Notes:** - - This workflow runs in the background via the worker - - You can close the browser and check progress later - - The evaluator job itself may take hours depending on queue and run time - - If evaluator fails or times out, download/eval will not proceed - """) + diff --git a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py b/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py new file mode 100644 index 0000000..bf08200 --- /dev/null +++ b/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py @@ -0,0 +1,575 @@ +""" +Evaluator Workflow Page +======================= +Complete end-to-end workflow for running evaluator jobs and processing results. 
+""" + +import streamlit as st +from datetime import datetime, timedelta, timezone +from pathlib import Path +from typing import Optional + +from lib.WebAPI import scenarioAPI +from lib.user_config import UserConfig + +# Initialize or load user config +_user_config = UserConfig(warning_fn=st.warning) + +def get_config_value(key, default=None): + return _user_config.get(key, default) + +def set_config_value(key, value): + _user_config.set(key, value) + +from lib.path_utils import get_data_root, resolve_under_data_root +from lib.page_chrome import inject_app_page_styles +from lib.db import ( + create_task, + is_task_queue_enabled, + list_recent_tasks, +) + +try: + from lib.perception_catalog_io import pkl_archive_to_parquet + CATALOG_IO_AVAILABLE = True +except ImportError: + CATALOG_IO_AVAILABLE = False + +# JST timezone for display +_JST = timezone(timedelta(hours=9)) +_TASK_LIST_MAX_ROWS = 200 + + +def _to_jst(dt): + if dt is None: + return None + try: + if getattr(dt, "tzinfo", None) is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt.astimezone(_JST) + except Exception: + return None + + +def _load_catalog_presets(): + import os + import json + _APP_ROOT = Path(__file__).parent.parent + _CATALOGS_FILENAME = "catalogs.json" + search_paths = [ + _APP_ROOT / _CATALOGS_FILENAME, + Path(os.environ.get("CATALOGS_PATH", "")), + Path.cwd() / _CATALOGS_FILENAME, + ] + catalogs = [] + loaded_path = None + load_error = None + for p in search_paths: + if p.exists() and p.is_file(): + try: + with open(p, "r", encoding="utf-8") as f: + data = json.load(f) + catalogs = data.get("catalogs", []) if isinstance(data, dict) else (data if isinstance(data, list) else []) + loaded_path = str(p) + load_error = None + break + except Exception as e: + load_error = str(e) + presets = [] + for c in catalogs: + if isinstance(c, dict): + name = c.get("display_name") or c.get("name") or c.get("catalog_id", "Unknown") + presets.append({**c, "display_name": name}) + return presets, 
loaded_path, load_error + + +def _enqueue_task(queue_name: str, params: dict) -> Optional[str]: + try: + from redis import Redis + from rq import Queue + import os + redis_url = os.environ.get("REDIS_URL", "redis://localhost:6379") + redis_conn = Redis.from_url(redis_url) + q = Queue(name=queue_name, connection=redis_conn, default_timeout="7d") + from worker.tasks import job_run_evaluator_and_process + job = q.enqueue(job_run_evaluator_and_process, kwargs=params, job_timeout="7d", result_ttl="7d") + user_id = None + try: + from lib.auth import get_current_user_id + user_id = get_current_user_id() + except Exception: + pass + create_task( + task_id=job.id, + task_type="run_evaluator_and_process", + description=f"Evaluator workflow: {params.get('description', 'no description')}", + parameters=params, + created_by=user_id, + ) + return job.id + except Exception as e: + st.error(f"Failed to enqueue task: {e}") + return None + + +def render_task_row(task): + """Render a single task row.""" + status = task.get("status", "unknown") + task_id = task.get("task_id", "") + description = task.get("description", "")[:70] + created = task.get("created_at") + created_str = _to_jst(created).strftime("%m/%d %H:%M") if created else "N/A" + + status_config = { + "running": {"color": "#f59e0b", "bg": "#fffbeb"}, + "finished": {"color": "#10b981", "bg": "#ecfdf5"}, + "failed": {"color": "#ef4444", "bg": "#fef2f2"}, + "queued": {"color": "#6b7280", "bg": "#f9fafb"}, + } + cfg = status_config.get(status, status_config["queued"]) + + st.markdown( + f""" +
+
+ {task_id[:24]}... +
{description}
+
🕐 {created_str}
+
+
{status}
+
+ """, + unsafe_allow_html=True, + ) + + +# Page config +st.set_page_config(page_title="Evaluator Workflow", layout="wide", initial_sidebar_state="expanded") +inject_app_page_styles() + +# Beautiful CSS +st.markdown(""" + +""", unsafe_allow_html=True) + +# Load catalog presets +CATALOG_PRESETS, CATALOGS_PATH, catalog_load_error = _load_catalog_presets() +catalog_names = [c["display_name"] for c in CATALOG_PRESETS] + +# ============================================ +# HERO +# ============================================ +st.markdown(""" +
+

🚀 Evaluator Workflow

+

Schedule jobs, download results, and generate reports — all in one click

+
+""", unsafe_allow_html=True) + +# ============================================ +# SIDEBAR +# ============================================ +st.sidebar.markdown("### ⚙️ Configuration") + +eval_project_id = st.sidebar.text_input("Project ID", value=get_config_value("eval_project_id", "x2_dev")) +set_config_value("eval_project_id", eval_project_id) + +if catalog_names: + selected_catalog_name = st.sidebar.selectbox("Catalog", options=catalog_names, index=0) + selected_catalog = next((c for c in CATALOG_PRESETS if c["display_name"] == selected_catalog_name), None) + if selected_catalog: + catalog_id = selected_catalog["catalog_id"] + integration_id = selected_catalog["integration_id"] + + # Display catalog info + st.sidebar.markdown("#### 📋 Catalog Info") + info_cols = st.sidebar.columns(2) + with info_cols[0]: + st.markdown(f"**ID:** `{catalog_id}`") + with info_cols[1]: + st.markdown(f"**Integration:** `{integration_id}`") + if selected_catalog.get("description"): + st.sidebar.markdown(f"📝 {selected_catalog['description']}") + if selected_catalog.get("tags"): + st.sidebar.markdown(f"🏷️ Tags: {', '.join(selected_catalog['tags'])}") +else: + catalog_id = None + integration_id = None + +with st.sidebar.expander("Manual override"): + manual_catalog_id = st.text_input("Catalog ID", value="") + manual_integration_id = st.text_input("Integration ID", value="") + if manual_catalog_id: + catalog_id = manual_catalog_id + if manual_integration_id: + integration_id = manual_integration_id + +target_name = st.sidebar.text_input("Branch or Tag", value=get_config_value("target_name", "beta/v4.3.2")) +set_config_value("target_name", target_name) + +# Auto-generate output folder based on branch name and timestamp +def _make_default_output_path(branch_name): + import re + clean_branch = re.sub(r'[^\w]', '_', branch_name.strip('/')) if branch_name else "eval" + clean_branch = re.sub(r'_+', '_', clean_branch).strip('_') + ts = datetime.now().strftime("%Y%m%d_%H%M%S") + return 
f"eval_{clean_branch}_{ts}" + +# Always auto-generate fresh output path +eval_output_path = st.sidebar.text_input("📁 Output folder", value=_make_default_output_path(target_name), key="eval_output_path") + +eval_download_type = get_config_value("eval_download_type", "Archives (ZIP)") +eval_phase = get_config_value("eval_phase", "perception.object_recognition.tracking.objects") +poll_interval = int(get_config_value("poll_interval", 60)) +max_wait_hours = int(get_config_value("max_wait_hours", 24)) +environment = get_config_value("environment", "") + +with st.sidebar.expander("Advanced"): + eval_download_type = st.radio("Download", ["Archives (ZIP)", "Result JSON"], index=0, horizontal=True) + set_config_value("eval_download_type", eval_download_type) + if eval_download_type == "Archives (ZIP)": + eval_phase = st.text_input("Phase", value=eval_phase) + set_config_value("eval_phase", eval_phase) + poll_interval = st.slider("Poll interval (s)", 10, 300, poll_interval, step=10) + set_config_value("poll_interval", poll_interval) + max_wait_hours = st.slider("Max wait (h)", 1, 168, max_wait_hours) + set_config_value("max_wait_hours", max_wait_hours) + +# ============================================ +# MAIN CONTENT +# ============================================ + +# Validation +validation_errors = [] +if not eval_project_id: + validation_errors.append("Project ID") +if not catalog_id: + validation_errors.append("Catalog ID") +if not integration_id: + validation_errors.append("Integration ID") +if not target_name: + validation_errors.append("Target") + +if validation_errors: + for err in validation_errors: + st.error(f"❌ {err}") + st.stop() + +resolved_output, path_err = resolve_under_data_root(eval_output_path, allow_create=True) +if path_err: + st.error(f"❌ {path_err}") + st.stop() +resolved_path_str = str(resolved_output) +max_wait_seconds = max_wait_hours * 3600 + +# Pipeline visualization +st.markdown(""" +
+
1
📤 Schedule
+
+
2
⏳ Wait
+
+
3
📥 Download
+
+
4
📊 Evaluate
+
+
5
📦 Parquet
+
+""", unsafe_allow_html=True) + +# Options +col1, col2, col3 = st.columns(3) +with col1: + eval_run_eval = st.checkbox("📊 Run Evaluation", value=True) +with col2: + eval_generate_parquet = st.checkbox("📦 Generate Parquet", value=CATALOG_IO_AVAILABLE, disabled=not CATALOG_IO_AVAILABLE) +with col3: + eval_recursive = st.checkbox("🔍 Recursive Scan", value=True) + +# START BUTTON +st.markdown("
", unsafe_allow_html=True) +st.markdown(""" + +""", unsafe_allow_html=True) +clicked = st.button("🚀 Start Evaluator Workflow", type="primary", use_container_width=True) + +if clicked: + if not is_task_queue_enabled(): + st.error("❌ Task queue not enabled. Set `USE_TASK_QUEUE=true` and `REDIS_URL`.") + st.stop() + + task_id = _enqueue_task("run_evaluator_and_process", { + "project_id": eval_project_id, + "catalog_id": catalog_id, + "integration_id": integration_id, + "suite_ids": None, + "target_name": target_name, + "description": f"Eval {datetime.now().strftime('%Y-%m-%d %H:%M')}", + "output_path": resolved_path_str, + "environment": environment, + "max_retries": 0, + "clean_build": False, + "debug": False, + "is_tag": False, + "download_type": "archives" if eval_download_type == "Archives (ZIP)" else "result_json", + "phase": eval_phase, + "skip_large_file": False, + "large_file_mb": 50.0, + "keep_zip_files": False, + "poll_interval": poll_interval, + "max_wait_seconds": max_wait_seconds, + "run_eval": eval_run_eval, + "generate_parquet": eval_generate_parquet, + "eval_recursive": eval_recursive, + "eval_overwrite": False, + }) + + if task_id: + st.success(f"✅ Workflow queued! Task: `{task_id[:24]}...`") + st.info("💡 Running in background — close browser, check Task Status below.") + else: + st.error("❌ Failed to enqueue task. Check worker logs.") + +# ============================================ +# TASK STATUS +# ============================================ +st.markdown('
📋 Recent Tasks
', unsafe_allow_html=True) + +if not is_task_queue_enabled(): + st.info("Task queue not enabled. Set `USE_TASK_QUEUE=true` to track background tasks.") +else: + tasks = list_recent_tasks(limit=_TASK_LIST_MAX_ROWS) + running = len([t for t in tasks if t.get("status") == "running"]) + finished = len([t for t in tasks if t.get("status") == "finished"]) + failed = len([t for t in tasks if t.get("status") == "failed"]) + + # Metrics row + m1, m2, m3, m4 = st.columns([1, 1, 1, 2]) + m1.metric("⏳ Running", running) + m2.metric("✅ Finished", finished) + m3.metric("❌ Failed", failed) + filter_status = m4.selectbox("Filter", ["All", "Running", "Finished", "Failed", "Queued"], index=0, label_visibility="collapsed") + + filtered = tasks if filter_status == "All" else [t for t in tasks if t.get("status") == filter_status.lower()] + + # Task list + for task in filtered[:10]: + status = task.get("status", "unknown") + task_id_str = task.get("task_id", "")[:24] + desc = task.get("description", "No description")[:60] + created = task.get("created_at") + created_str = _to_jst(created).strftime("%m/%d %H:%M") if created else "N/A" + + st.markdown(f""" +
+
+ {task_id_str}... +
{desc}
+
🕐 {created_str}
+
+
{status}
+
+ """, unsafe_allow_html=True) + + if task.get("error"): + with st.expander("❌ Error details"): + st.code(task["error"]) + + if len(filtered) > 10: + st.caption(f"Showing 10 of {len(filtered)} tasks") + +st.sidebar.divider() +st.sidebar.caption("💡 Runs async — close browser safely") From 80a80f39d3faa8fbfebcdbb37846621eb4673398 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Wed, 13 May 2026 14:58:33 +0900 Subject: [PATCH 54/94] feat: refactor evaluator workflow UI and enhance styling - Removed inline CSS for the hero section and replaced it with a new `render_page_hero` function for better modularity. - Introduced a new CSS style block for the pipeline visualization, improving the layout and visual appeal of the evaluator workflow. - Streamlined the code by eliminating unnecessary markdown for the hero section, enhancing readability and maintainability. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../pages/7_Evaluator_Workflow.py | 231 ++++-------------- 1 file changed, 49 insertions(+), 182 deletions(-) diff --git a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py b/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py index bf08200..86f8e0c 100644 --- a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py +++ b/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py @@ -22,7 +22,7 @@ def set_config_value(key, value): _user_config.set(key, value) from lib.path_utils import get_data_root, resolve_under_data_root -from lib.page_chrome import inject_app_page_styles +from lib.page_chrome import inject_app_page_styles, render_page_hero from lib.db import ( create_task, is_task_queue_enabled, @@ -164,181 +164,6 @@ def render_task_row(task): st.set_page_config(page_title="Evaluator Workflow", layout="wide", initial_sidebar_state="expanded") inject_app_page_styles() -# Beautiful CSS -st.markdown(""" - -""", unsafe_allow_html=True) - # Load catalog presets CATALOG_PRESETS, CATALOGS_PATH, catalog_load_error = _load_catalog_presets() 
catalog_names = [c["display_name"] for c in CATALOG_PRESETS] @@ -346,12 +171,11 @@ def render_task_row(task): # ============================================ # HERO # ============================================ -st.markdown(""" -
-

🚀 Evaluator Workflow

-

Schedule jobs, download results, and generate reports — all in one click

-
-""", unsafe_allow_html=True) +render_page_hero( + kicker="Workflow automation", + title="Evaluator Workflow", + description="Schedule jobs, download results, and generate reports — all in one click", +) # ============================================ # SIDEBAR @@ -451,6 +275,49 @@ def _make_default_output_path(branch_name): # Pipeline visualization st.markdown(""" +
1
📤 Schedule
From bd6b8bf84604f559bf20bccea74a6d6b883e989c Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Wed, 13 May 2026 16:13:05 +0900 Subject: [PATCH 55/94] feat: integrate task history and result summary rendering in UI - Added new `task_history.py` and `task_result_summary.py` modules to encapsulate task history and result summary rendering logic. - Updated `__init__.py` to include new rendering functions for task details and summaries, enhancing modularity and reusability. - Refactored the download and evaluator workflow pages to utilize the new task rendering functions, improving code organization and user experience. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/lib/ui/__init__.py | 7 + .../lib/ui/task_history.py | 280 ++++++++++++ .../lib/ui/task_result_summary.py | 178 ++++++++ evaluation_dashboard_app/pages/6_Download.py | 411 +----------------- .../pages/7_Evaluator_Workflow.py | 160 +++---- evaluation_dashboard_app/worker/tasks.py | 257 +++++++++-- 6 files changed, 755 insertions(+), 538 deletions(-) create mode 100644 evaluation_dashboard_app/lib/ui/task_history.py create mode 100644 evaluation_dashboard_app/lib/ui/task_result_summary.py diff --git a/evaluation_dashboard_app/lib/ui/__init__.py b/evaluation_dashboard_app/lib/ui/__init__.py index 6bae170..762606b 100644 --- a/evaluation_dashboard_app/lib/ui/__init__.py +++ b/evaluation_dashboard_app/lib/ui/__init__.py @@ -35,6 +35,8 @@ ) from lib.ui.styles_download import inject_download_page_styles from lib.ui.styles_global import inject_app_page_styles +from lib.ui.task_history import get_task_list_current_user, render_task_detail_content, render_task_list +from lib.ui.task_result_summary import render_summary_table, render_task_result_summary __all__ = [ "ImpressiveProgressHUD", @@ -62,6 +64,11 @@ "render_job_json_summary_panel", "render_recent_scenario_downloads_intro", "render_scenario_download_summary_panel", + "get_task_list_current_user", + 
"render_summary_table", + "render_task_detail_content", + "render_task_list", + "render_task_result_summary", "render_kpi_card", "section_header_html", ] diff --git a/evaluation_dashboard_app/lib/ui/task_history.py b/evaluation_dashboard_app/lib/ui/task_history.py new file mode 100644 index 0000000..a0c2136 --- /dev/null +++ b/evaluation_dashboard_app/lib/ui/task_history.py @@ -0,0 +1,280 @@ +"""Shared task history/list rendering used across pages.""" + +from __future__ import annotations + +import json +from datetime import datetime, timedelta, timezone +from typing import Any, Dict, List, Optional + +import streamlit as st + +from lib.auth import get_current_user_id, is_auth_enabled +from lib.db import delete_task, get_task +from lib.ui.download_ui import TaskCardMode, render_task_list_empty_state, task_list_card_markup +from lib.ui.task_result_summary import render_task_result_summary + +_JST = timezone(timedelta(hours=9)) + + +def _to_jst(dt: Any) -> Optional[datetime]: + """Convert datetime to JST for display. 
Naive datetimes are assumed UTC.""" + if dt is None: + return None + if not hasattr(dt, "astimezone"): + return None + try: + if getattr(dt, "tzinfo", None) is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt.astimezone(_JST) + except Exception: + return None + + +def _task_type_label(task_type: str) -> str: + labels = { + "download_results": "Download results", + "download_scenarios": "Download scenarios", + "run_eval_dirs": "Run eval dirs", + "generate_summary_csv": "Generate summary CSV", + "build_parquet": "Build parquet", + "download_and_eval": "Download + Eval", + "run_evaluator_and_process": "Run Evaluator + Process", + } + return labels.get(task_type, task_type or "Task") + + +def _task_summary(t: Dict[str, Any]) -> str: + params = t.get("parameters") or {} + task_type = t.get("type", "") + if task_type == "download_results": + out = params.get("output_path") or params.get("job_id") or "" + return f"job_id={params.get('job_id', '')} → {out}" + if task_type == "download_scenarios": + out = params.get("output_dir") or params.get("output_path") or "" + return f"job_id={params.get('job_id', '')} → {out}" + if task_type in ("run_eval_dirs", "generate_summary_csv"): + return params.get("eval_root", "") + if task_type == "build_parquet": + return params.get("pkl_dir", "") + if task_type == "download_and_eval": + out = params.get("output_path") or params.get("job_id") or "" + parts = ["download"] + if params.get("run_eval"): + parts.append("eval") + if params.get("generate_parquet"): + parts.append("parquet") + return f"job_id={params.get('job_id', '')} [{'+'.join(parts)}] → {out}" + if task_type == "run_evaluator_and_process": + target = params.get("target_name", "") + target_type = "tag" if params.get("is_tag", False) else "branch" + return f"{target_type}={target} → {params.get('output_path', '')}" + return "" + + +def _task_time_str(t: Dict[str, Any]) -> str: + created = t.get("created_at") + dt = _to_jst(created) if created else None + if not dt: + 
return "—" + try: + return dt.strftime("%b %d, %H:%M") + except Exception: + return str(created)[:16] if created else "—" + + +def _task_duration(t: Dict[str, Any]) -> Optional[str]: + created = t.get("created_at") + updated = t.get("updated_at") + if not created or not updated: + return None + try: + start = created.timestamp() if hasattr(created, "timestamp") else None + end = updated.timestamp() if hasattr(updated, "timestamp") else None + if start is None or end is None: + return None + secs = int(end - start) + if secs < 60: + return f"{secs}s" + if secs < 3600: + return f"{secs // 60}m {secs % 60}s" + return f"{secs // 3600}h {(secs % 3600) // 60}m" + except Exception: + return None + + +def render_task_detail_content(t: Dict[str, Any]) -> None: + """Render full task detail content.""" + try: + _render_task_detail_content_impl(t) + except Exception as e: + st.error(f"Could not load task details: {e}") + import traceback + st.code(traceback.format_exc(), language=None) + + +def _render_task_detail_content_impl(t: Dict[str, Any]) -> None: + status = t.get("status", "") + created_jst = _to_jst(t.get("created_at")) + updated_jst = _to_jst(t.get("updated_at")) + time_parts = [] + if created_jst: + try: + time_parts.append(f"Created: {created_jst.strftime('%Y-%m-%d %H:%M:%S')} JST") + except Exception: + time_parts.append(f"Created: {t.get('created_at')}") + if updated_jst and updated_jst != created_jst: + try: + time_parts.append(f"Updated: {updated_jst.strftime('%Y-%m-%d %H:%M:%S')} JST") + except Exception: + time_parts.append(f"Updated: {t.get('updated_at')}") + if time_parts: + st.caption(" · ".join(time_parts)) + + result_summary_raw = t.get("result_summary") + if result_summary_raw: + try: + result_summary = json.loads(result_summary_raw) if isinstance(result_summary_raw, str) else result_summary_raw + render_task_result_summary(result_summary) + st.markdown("---") + except (TypeError, ValueError): + pass + if t.get("result_path"): + st.text_input( + "Result 
path", + value=t["result_path"], + key=f"rp_modal_{str(t.get('id'))}", + disabled=True, + label_visibility="collapsed", + ) + if status == "failed" and t.get("error_message"): + st.error(t.get("error_message")) + progress_message = (t.get("progress_message") or "").strip() + if progress_message: + st.info(progress_message) + log_output = (t.get("log_output") or "").strip() + if log_output: + st.caption("Log output") + st.code(log_output, language=None) + params = t.get("parameters") or {} + if params: + st.caption("Parameters") + st.json(params) + + +def _open_task_detail(task_id: str) -> None: + st.session_state["_task_detail_id"] = str(task_id) + + +def _render_one_task_row( + t: Dict[str, Any], + current_user: Optional[str], + use_dialog: bool, + *, + mode: TaskCardMode, +) -> None: + task_id = t.get("id", "") + status = t.get("status", "") + status_labels = {"pending": "Pending", "running": "Running", "completed": "Completed", "failed": "Failed"} + status_label = status_labels.get(status, status) + summary = _task_summary(t) + sid = str(task_id) + summary_short = ( + (summary[:72] + "…") if mode == "history" and summary and len(summary) > 72 else (summary if mode == "history" else "—") + ) or "—" + progress_msg = (t.get("progress_message") or "").strip() + card = task_list_card_markup( + task_id=sid, + type_label=_task_type_label(t.get("type", "")), + status=status, + status_label=status_label, + time_str=_task_time_str(t), + duration=_task_duration(t) or "—", + summary_short=summary_short, + progress_pct=t.get("progress_pct"), + progress_message=progress_msg, + mode=mode, + ) + st.markdown(f'
{card}
', unsafe_allow_html=True) + + if use_dialog: + bv, bd, _sp = st.columns([1.15, 1.15, 4]) + with bv: + st.button("View", key=f"view_{sid}", on_click=_open_task_detail, args=(sid,)) + with bd: + stop_lbl = "Stop" if status in ("pending", "running") else "Remove" + stop_help = ( + "Cancels the Redis/RQ job when possible, then removes this row from the list." + if status in ("pending", "running") + else "Remove this row from the task list." + ) + if st.button(stop_lbl, key=f"del_{sid}", type="secondary", help=stop_help): + delete_task(sid, session_id=current_user) + st.rerun() + else: + bd, _sp = st.columns([1.15, 4]) + with bd: + stop_lbl = "Stop" if status in ("pending", "running") else "Remove" + stop_help = ( + "Cancels the Redis/RQ job when possible, then removes this row from the list." + if status in ("pending", "running") + else "Remove this row from the task list." + ) + if st.button(stop_lbl, key=f"del_{sid}", type="secondary", help=stop_help): + delete_task(sid, session_id=current_user) + st.rerun() + + if not use_dialog: + with st.expander("More", expanded=False): + render_task_detail_content(t) + + +def render_task_list(tasks: List[Dict[str, Any]], current_user: Optional[str]) -> bool: + """Render the shared active/history task list. 
Returns True if any active tasks exist.""" + if current_user: + st.caption(f"Logged in as **{current_user}** · your recent tasks only") + if not tasks: + render_task_list_empty_state() + return False + + active = [t for t in tasks if t.get("status") in ("pending", "running")] + history = [t for t in tasks if t.get("status") not in ("pending", "running")] + use_dialog = callable(getattr(st, "dialog", None)) + + for t in active: + _render_one_task_row(t, current_user, use_dialog, mode="active_compact") + + if not active: + st.caption("No queued or running jobs.") + + if history: + with st.expander(f"Task history ({len(history)})", expanded=False): + for t in history: + _render_one_task_row(t, current_user, use_dialog, mode="history") + + if use_dialog and st.session_state.get("_task_detail_id"): + task_id = st.session_state["_task_detail_id"] + try: + detail_task = next((x for x in tasks if str(x.get("id")) == task_id), None) + if detail_task is None: + detail_task = get_task(task_id) + if detail_task: + + @st.dialog("Task details", width="large") + def _task_detail_modal(): + render_task_detail_content(detail_task) + if st.button("Close"): + st.session_state.pop("_task_detail_id", None) + st.rerun() + + _task_detail_modal() + except Exception as e: + st.error(f"Could not open task details: {e}") + finally: + st.session_state.pop("_task_detail_id", None) + + return len(active) > 0 + + +def get_task_list_current_user() -> Optional[str]: + """Return current user id when auth is enabled, else None.""" + return get_current_user_id() if is_auth_enabled() else None diff --git a/evaluation_dashboard_app/lib/ui/task_result_summary.py b/evaluation_dashboard_app/lib/ui/task_result_summary.py new file mode 100644 index 0000000..37a9234 --- /dev/null +++ b/evaluation_dashboard_app/lib/ui/task_result_summary.py @@ -0,0 +1,178 @@ +"""Shared task result-summary renderers used by background task pages.""" + +from typing import Any, Dict, List, Optional + +import pandas as pd +import 
streamlit as st + + +def render_summary_table(rows: Optional[List[Dict[str, Any]]]) -> None: + """Render a summary table from rows (e.g. Scenario Name, Scenario ID, Status) when present.""" + if not rows: + return + try: + df = pd.DataFrame(rows) + st.subheader("Download Status") + st.dataframe(df, width="stretch") + except Exception: + pass + + +def render_task_result_summary(summary: Dict[str, Any]) -> None: + """Render a result summary block from task result_summary JSON.""" + job = summary.get("job", "") + if job == "download_results": + total = summary.get("total", 0) + success = summary.get("success", 0) + failed = summary.get("failed", 0) + out = summary.get("output_path", "") + st.subheader("Summary") + st.write(f"- Total scenarios processed: **{total}**") + st.write(f"- Successfully downloaded: **{success}**") + if failed: + st.write(f"- Failed: **{failed}**") + st.write(f"- Output directory: `{out}`") + if success > 0: + st.info("To generate the final summary CSV files, go to the **Eval Results** tab and run the evaluation.") + render_summary_table(summary.get("rows")) + elif job == "download_scenarios": + total = summary.get("total", 0) + success = summary.get("success", 0) + failed = summary.get("failed", 0) + out = summary.get("output_path", "") + st.subheader("Summary") + st.write(f"- Total scenarios: **{total}**") + st.write(f"- Successfully downloaded: **{success}**") + if failed: + st.write(f"- Failed: **{failed}**") + st.write(f"- Result JSON files: **{total}** downloaded.") + st.write(f"- Output directory: `{out}`") + if success > 0: + st.info("To generate summary CSV files, go to the **Eval Results** tab and run the evaluation.") + render_summary_table(summary.get("rows")) + elif job == "run_eval_dirs": + dirs = summary.get("directories_processed", 0) + path = summary.get("summary_path", "") + srows = summary.get("summary_rows", 0) + scrows = summary.get("score_rows", 0) + st.subheader("Eval Summary") + st.write(f"- Directories processed: 
**{dirs}**") + st.write(f"- Generated Summary.csv (**{srows}** rows) and Score.csv (**{scrows}** rows) in `{path}`") + elif job == "generate_summary_csv": + path = summary.get("summary_path", "") + srows = summary.get("summary_rows", 0) + scrows = summary.get("score_rows", 0) + st.subheader("Summary") + st.write(f"- Generated Summary.csv (**{srows}** rows) and Score.csv (**{scrows}** rows) in `{path}`") + elif job == "build_parquet": + path = summary.get("output_path", "") + st.subheader("Summary") + st.write(f"- Output: `{path}`") + elif job == "download_and_eval": + dl_summary = summary.get("download_summary", {}) + eval_summary_data = summary.get("eval_summary", {}) + parquet_path = summary.get("parquet_path", "") + errors = summary.get("errors", []) + + st.subheader("Download + Eval + Parquet Summary") + + dl_success = summary.get("download_success", False) + if dl_success: + st.write("✅ **Download: SUCCESS**") + st.write( + f" - Total: **{dl_summary.get('total', 0)}**, " + f"Success: **{dl_summary.get('success', 0)}**, " + f"Failed: **{dl_summary.get('failed', 0)}**" + ) + else: + st.write("❌ **Download: FAILED**") + if errors: + for err in errors: + st.write(f" - {err}") + + if eval_summary_data: + st.write("✅ **Eval: SUCCESS**") + st.write(f" - Directories processed: **{eval_summary_data.get('directories_processed', 0)}**") + st.write( + f" - Summary.csv: **{eval_summary_data.get('summary_rows', 0)}** rows, " + f"Score.csv: **{eval_summary_data.get('score_rows', 0)}** rows" + ) + + if parquet_path: + st.write(f"✅ **Parquet: SUCCESS** → `{parquet_path}`") + + if errors: + st.error("Errors during execution:") + for err in errors: + st.write(f"- {err}") + elif job == "run_evaluator_and_process": + evaluator_job_id = summary.get("evaluator_job_id", "") + evaluator_report_url = summary.get("evaluator_report_url", "") + evaluator_status = summary.get("evaluator_status", "unknown") + evaluator_build_status = summary.get("evaluator_build_status", "") + 
evaluator_test_status = summary.get("evaluator_test_status", "") + evaluator_fail_message = summary.get("evaluator_fail_message", "") + evaluator_case_totals = summary.get("evaluator_case_totals", {}) + evaluator_suites = summary.get("evaluator_suites", []) + evaluator_failed_cases = summary.get("evaluator_failed_cases", []) + dl_summary = summary.get("download_summary", {}) + download_rows = summary.get("download_rows", []) + eval_summary_data = summary.get("eval_summary", {}) + parquet_path = summary.get("parquet_path", "") + + st.subheader("Run Evaluator + Download + Eval + Parquet Summary") + + st.write("🎯 **Evaluator**") + st.write(f" - Job ID: `{evaluator_job_id}`") + st.write(f" - Status: **{evaluator_status}**") + if evaluator_build_status: + st.write(f" - Build: **{evaluator_build_status}**") + if evaluator_test_status: + st.write(f" - Test: **{evaluator_test_status}**") + if evaluator_case_totals: + st.write( + " - Case results: " + f"**{evaluator_case_totals.get('success', 0)}** success, " + f"**{evaluator_case_totals.get('failed', 0)}** failed, " + f"**{evaluator_case_totals.get('canceled', 0)}** canceled " + f"(total **{evaluator_case_totals.get('total', 0)}**)" + ) + if evaluator_fail_message: + st.write(f" - Message: `{evaluator_fail_message}`") + if evaluator_report_url: + st.markdown(f" - Report: [Open]({evaluator_report_url})") + if evaluator_suites: + st.caption("Evaluator suite summary") + st.dataframe(pd.DataFrame(evaluator_suites), width="stretch", hide_index=True) + if evaluator_failed_cases: + st.caption("Failed cases from evaluator") + st.dataframe(pd.DataFrame(evaluator_failed_cases), width="stretch", hide_index=True) + + dl_total = dl_summary.get("total", 0) + dl_success = dl_summary.get("success", 0) + dl_failed = dl_summary.get("failed", 0) + st.write("📥 **Download**") + st.write(f" - Total: **{dl_total}**, Success: **{dl_success}**, Failed: **{dl_failed}**") + if download_rows: + render_summary_table(download_rows) + + if 
eval_summary_data: + st.write("🧮 **Evaluation**") + st.write(f" - Directories processed: **{eval_summary_data.get('directories_processed', 0)}**") + st.write( + f" - Success: **{eval_summary_data.get('success', 0)}**, " + f"Failed: **{eval_summary_data.get('failed', 0)}**" + ) + st.write( + f" - Summary.csv: **{eval_summary_data.get('summary_rows', 0)}** rows, " + f"Score.csv: **{eval_summary_data.get('score_rows', 0)}** rows" + ) + + if parquet_path: + st.write("📦 **Parquet**") + st.write(f" - Output: `{parquet_path}`") + + if evaluator_report_url: + st.markdown(f"### [📊 View Evaluator Report]({evaluator_report_url})") + else: + st.json(summary) diff --git a/evaluation_dashboard_app/pages/6_Download.py b/evaluation_dashboard_app/pages/6_Download.py index a1d7519..3215f83 100644 --- a/evaluation_dashboard_app/pages/6_Download.py +++ b/evaluation_dashboard_app/pages/6_Download.py @@ -42,7 +42,6 @@ def _to_jst(dt: Any) -> Optional[datetime]: from lib.page_chrome import inject_app_page_styles from lib.ui.download_ui import ( ImpressiveProgressHUD, - TaskCardMode, render_detailed_scenario_download_panel, render_download_hero, render_download_status_table_intro, @@ -51,9 +50,8 @@ def _to_jst(dt: Any) -> Optional[datetime]: render_job_json_summary_panel, render_recent_scenario_downloads_intro, render_scenario_download_summary_panel, - render_task_list_empty_state, - task_list_card_markup, ) +from lib.ui.task_history import get_task_list_current_user, render_task_list from lib.ui.styles_download import inject_download_page_styles from lib.db import ( create_task, @@ -829,408 +827,10 @@ def download_scenarios( -def _task_type_label(task_type: str) -> str: - """Human-readable label for task type.""" - labels = { - "download_results": "Download results", - "download_scenarios": "Download scenarios", - "run_eval_dirs": "Run eval dirs", - "generate_summary_csv": "Generate summary CSV", - "build_parquet": "Build parquet", - "download_and_eval": "Download + Eval", - 
"run_evaluator_and_process": "Run Evaluator + Process", - } - return labels.get(task_type, task_type or "Task") - - -def _task_summary(t: Dict[str, Any]) -> str: - """One-line summary from task parameters (job_id, output_path, etc.).""" - params = t.get("parameters") or {} - task_type = t.get("type", "") - if task_type == "download_results": - out = params.get("output_path") or params.get("job_id") or "" - return f"job_id={params.get('job_id', '')} → {out}" - if task_type == "download_scenarios": - out = params.get("output_dir") or params.get("output_path") or "" - return f"job_id={params.get('job_id', '')} → {out}" - if task_type in ("run_eval_dirs", "generate_summary_csv"): - return params.get("eval_root", "") - if task_type == "build_parquet": - return params.get("pkl_dir", "") - if task_type == "download_and_eval": - out = params.get("output_path") or params.get("job_id") or "" - parts = ["download"] - if params.get("run_eval"): - parts.append("eval") - if params.get("generate_parquet"): - parts.append("parquet") - return f"job_id={params.get('job_id', '')} [{'+'.join(parts)}] → {out}" - if task_type == "run_evaluator_and_process": - target = params.get("target_name", "") - is_tag = params.get("is_tag", False) - target_type = "tag" if is_tag else "branch" - return f"{target_type}={target} → {params.get('output_path', '')}" - return "" - - -def _task_time_str(t: Dict[str, Any]) -> str: - """Format task created_at for display in JST (e.g. 
'Feb 24, 16:45').""" - created = t.get("created_at") - dt = _to_jst(created) if created else None - if not dt: - return "—" - try: - return dt.strftime("%b %d, %H:%M") - except Exception: - return str(created)[:16] if created else "—" - - -def _task_duration(t: Dict[str, Any]) -> Optional[str]: - """Format duration from created_at to updated_at if both exist.""" - created = t.get("created_at") - updated = t.get("updated_at") - if not created or not updated: - return None - try: - start = created.timestamp() if hasattr(created, "timestamp") else None - end = updated.timestamp() if hasattr(updated, "timestamp") else None - if start is None or end is None: - return None - secs = int(end - start) - if secs < 60: - return f"{secs}s" - if secs < 3600: - return f"{secs // 60}m {secs % 60}s" - return f"{secs // 3600}h {(secs % 3600) // 60}m" - except Exception: - return None - - -def _render_summary_table(rows: Optional[List[Dict[str, Any]]]) -> None: - """Render a summary table from rows (e.g. 
Scenario Name, Scenario ID, Status) when present.""" - if not rows: - return - try: - df = pd.DataFrame(rows) - st.subheader("Download Status") - st.dataframe(df, width="stretch") - except Exception: - pass - - -def _render_result_summary(summary: Dict[str, Any]) -> None: - """Render a result summary block (like local mode) from task result_summary JSON.""" - job = summary.get("job", "") - if job == "download_results": - total = summary.get("total", 0) - success = summary.get("success", 0) - failed = summary.get("failed", 0) - out = summary.get("output_path", "") - st.subheader("Summary") - st.write(f"- Total scenarios processed: **{total}**") - st.write(f"- Successfully downloaded: **{success}**") - if failed: - st.write(f"- Failed: **{failed}**") - st.write(f"- Output directory: `{out}`") - if success > 0: - st.info("To generate the final summary CSV files, go to the **Eval Results** tab and run the evaluation.") - _render_summary_table(summary.get("rows")) - elif job == "download_scenarios": - total = summary.get("total", 0) - success = summary.get("success", 0) - failed = summary.get("failed", 0) - out = summary.get("output_path", "") - st.subheader("Summary") - st.write(f"- Total scenarios: **{total}**") - st.write(f"- Successfully downloaded: **{success}**") - if failed: - st.write(f"- Failed: **{failed}**") - st.write(f"- Result JSON files: **{total}** downloaded.") - st.write(f"- Output directory: `{out}`") - if success > 0: - st.info("To generate summary CSV files, go to the **Eval Results** tab and run the evaluation.") - _render_summary_table(summary.get("rows")) - elif job == "run_eval_dirs": - dirs = summary.get("directories_processed", 0) - path = summary.get("summary_path", "") - srows = summary.get("summary_rows", 0) - scrows = summary.get("score_rows", 0) - st.subheader("Eval Summary") - st.write(f"- Directories processed: **{dirs}**") - st.write(f"- Generated Summary.csv (**{srows}** rows) and Score.csv (**{scrows}** rows) in `{path}`") - elif job 
== "generate_summary_csv": - path = summary.get("summary_path", "") - srows = summary.get("summary_rows", 0) - scrows = summary.get("score_rows", 0) - st.subheader("Summary") - st.write(f"- Generated Summary.csv (**{srows}** rows) and Score.csv (**{scrows}** rows) in `{path}`") - elif job == "build_parquet": - path = summary.get("output_path", "") - st.subheader("Summary") - st.write(f"- Output: `{path}`") - elif job == "download_and_eval": - dl_summary = summary.get("download_summary", {}) - eval_summary_data = summary.get("eval_summary", {}) - parquet_path = summary.get("parquet_path", "") - errors = summary.get("errors", []) - - st.subheader("Download + Eval + Parquet Summary") - - # Download summary - dl_success = summary.get("download_success", False) - if dl_success: - st.write("✅ **Download: SUCCESS**") - st.write(f" - Total: **{dl_summary.get('total', 0)}**, Success: **{dl_summary.get('success', 0)}**, Failed: **{dl_summary.get('failed', 0)}**") - else: - st.write("❌ **Download: FAILED**") - if errors: - for err in errors: - st.write(f" - {err}") - - # Eval summary - if eval_summary_data: - st.write("✅ **Eval: SUCCESS**") - st.write(f" - Directories processed: **{eval_summary_data.get('directories_processed', 0)}**") - st.write(f" - Summary.csv: **{eval_summary_data.get('summary_rows', 0)}** rows, Score.csv: **{eval_summary_data.get('score_rows', 0)}** rows") - - # Parquet summary - if parquet_path: - st.write(f"✅ **Parquet: SUCCESS** → `{parquet_path}`") - - # Show errors - if errors: - st.error("Errors during execution:") - for err in errors: - st.write(f"- {err}") - elif job == "run_evaluator_and_process": - evaluator_job_id = summary.get("evaluator_job_id", "") - evaluator_report_url = summary.get("evaluator_report_url", "") - evaluator_status = summary.get("evaluator_status", "unknown") - dl_summary = summary.get("download_summary", {}) - eval_summary_data = summary.get("eval_summary", {}) - parquet_path = summary.get("parquet_path", "") - - 
st.subheader("Run Evaluator + Download + Eval + Parquet Summary") - - # Evaluator summary - st.write("🎯 **Evaluator**") - st.write(f" - Job ID: `{evaluator_job_id}`") - st.write(f" - Status: **{evaluator_status}**") - if evaluator_report_url: - st.markdown(f" - Report: [Open]({evaluator_report_url})") - - # Download summary - dl_total = dl_summary.get("total", 0) - dl_success = dl_summary.get("success", 0) - dl_failed = dl_summary.get("failed", 0) - st.write("📥 **Download**") - st.write(f" - Total: **{dl_total}**, Success: **{dl_success}**, Failed: **{dl_failed}**") - - # Eval summary - if eval_summary_data: - st.write("🧮 **Evaluation**") - st.write(f" - Directories processed: **{eval_summary_data.get('directories_processed', 0)}**") - st.write(f" - Success: **{eval_summary_data.get('success', 0)}**, Failed: **{eval_summary_data.get('failed', 0)}**") - st.write(f" - Summary.csv: **{eval_summary_data.get('summary_rows', 0)}** rows, Score.csv: **{eval_summary_data.get('score_rows', 0)}** rows") - - # Parquet summary - if parquet_path: - st.write("📦 **Parquet**") - st.write(f" - Output: `{parquet_path}`") - - # Show report URL prominently - if evaluator_report_url: - st.markdown(f"### [📊 View Evaluator Report]({evaluator_report_url})") - else: - st.json(summary) - - -def _render_task_detail_content(t: Dict[str, Any]) -> None: - """Render full task detail (summary, path, error, log, params) into current container.""" - try: - _render_task_detail_content_impl(t) - except Exception as e: - st.error(f"Could not load task details: {e}") - import traceback - st.code(traceback.format_exc(), language=None) - - -def _render_task_detail_content_impl(t: Dict[str, Any]) -> None: - """Implementation of task detail rendering (called inside try/except).""" - status = t.get("status", "") - created_jst = _to_jst(t.get("created_at")) - updated_jst = _to_jst(t.get("updated_at")) - time_parts = [] - if created_jst: - try: - time_parts.append(f"Created: {created_jst.strftime('%Y-%m-%d 
%H:%M:%S')} JST") - except Exception: - time_parts.append(f"Created: {t.get('created_at')}") - if updated_jst and updated_jst != created_jst: - try: - time_parts.append(f"Updated: {updated_jst.strftime('%Y-%m-%d %H:%M:%S')} JST") - except Exception: - time_parts.append(f"Updated: {t.get('updated_at')}") - if time_parts: - st.caption(" · ".join(time_parts)) - result_summary_raw = t.get("result_summary") - if result_summary_raw: - try: - result_summary = json.loads(result_summary_raw) if isinstance(result_summary_raw, str) else result_summary_raw - _render_result_summary(result_summary) - st.markdown("---") - except (TypeError, ValueError): - pass - if t.get("result_path"): - st.text_input("Result path", value=t["result_path"], key=f"rp_modal_{str(t.get('id'))}", disabled=True, label_visibility="collapsed") - if status == "failed" and t.get("error_message"): - st.error(t.get("error_message")) - log_output = (t.get("log_output") or "").strip() - if log_output: - st.caption("Log output") - st.code(log_output, language=None) - params = t.get("parameters") or {} - if params: - st.caption("Parameters") - st.json(params) - - -def _open_task_detail(task_id: str) -> None: - st.session_state["_task_detail_id"] = str(task_id) - - -def _render_one_task_row( - t: Dict[str, Any], - current_user: Optional[str], - use_dialog: bool, - *, - mode: TaskCardMode, -) -> None: - """One task: compact card + View/Delete (and inline More when no dialog).""" - task_id = t.get("id", "") - task_type = t.get("type", "") - status = t.get("status", "") - status_labels = {"pending": "Pending", "running": "Running", "completed": "Completed", "failed": "Failed"} - status_label = status_labels.get(status, status) - type_label = _task_type_label(task_type) - summary = _task_summary(t) - duration = _task_duration(t) or "—" - time_str = _task_time_str(t) - sid = str(task_id) - if mode == "history": - summary_short = (summary[:72] + "…") if summary and len(summary) > 72 else (summary or "—") - else: - 
summary_short = "—" - progress_msg = (t.get("progress_message") or "").strip() - _card = task_list_card_markup( - task_id=sid, - type_label=type_label, - status=status, - status_label=status_label, - time_str=time_str, - duration=duration, - summary_short=summary_short, - progress_pct=t.get("progress_pct"), - progress_message=progress_msg, - mode=mode, - ) - st.markdown(f'
{_card}
', unsafe_allow_html=True) - - if use_dialog: - bv, bd, _sp = st.columns([1.15, 1.15, 4]) - with bv: - st.button("View", key=f"view_{sid}", on_click=_open_task_detail, args=(sid,)) - with bd: - _stop_lbl = "Stop" if status in ("pending", "running") else "Remove" - _stop_help = ( - "Cancels the Redis/RQ job when possible, then removes this row from the list." - if status in ("pending", "running") - else "Remove this row from the task list." - ) - if st.button( - _stop_lbl, - key=f"del_{sid}", - type="secondary", - help=_stop_help, - ): - delete_task(sid, session_id=current_user) - st.rerun() - else: - bd, _sp = st.columns([1.15, 4]) - with bd: - _stop_lbl = "Stop" if status in ("pending", "running") else "Remove" - _stop_help = ( - "Cancels the Redis/RQ job when possible, then removes this row from the list." - if status in ("pending", "running") - else "Remove this row from the task list." - ) - if st.button( - _stop_lbl, - key=f"del_{sid}", - type="secondary", - help=_stop_help, - ): - delete_task(sid, session_id=current_user) - st.rerun() - - if not use_dialog: - with st.expander("More", expanded=False): - _render_task_detail_content(t) - - -def _render_task_list(tasks: List[Dict[str, Any]], current_user: Optional[str]) -> bool: - """Active tasks visible; completed/failed in a collapsed expander. 
True if any active.""" - if current_user: - st.caption(f"Logged in as **{current_user}** · your recent tasks only") - if not tasks: - render_task_list_empty_state() - return False - - active = [t for t in tasks if t.get("status") in ("pending", "running")] - history = [t for t in tasks if t.get("status") not in ("pending", "running")] - use_dialog = callable(getattr(st, "dialog", None)) - - for t in active: - _render_one_task_row(t, current_user, use_dialog, mode="active_compact") - - if not active: - st.caption("No queued or running jobs.") - - if history: - with st.expander(f"Task history ({len(history)})", expanded=False): - for t in history: - _render_one_task_row(t, current_user, use_dialog, mode="history") - - # Modal for task detail when dialog is available - if use_dialog and st.session_state.get("_task_detail_id"): - _task_id = st.session_state["_task_detail_id"] - try: - detail_task = next((x for x in tasks if str(x.get("id")) == _task_id), None) - if detail_task is None: - detail_task = get_task(_task_id) - if detail_task: - - @st.dialog("Task details", width="large") - def _task_detail_modal(): - _render_task_detail_content(detail_task) - if st.button("Close"): - st.session_state.pop("_task_detail_id", None) - st.rerun() - - _task_detail_modal() - except Exception as e: - st.error(f"Could not open task details: {e}") - finally: - # Clear so X/outside click or error doesn't leave page stuck; next run shows main content - st.session_state.pop("_task_detail_id", None) - - return len(active) > 0 - - # Task queue status (production deployment); per-user when auth is enabled _current_user = None if is_task_queue_enabled(): - _current_user = get_current_user_id() if is_auth_enabled() else None + _current_user = get_task_list_current_user() render_download_task_section_header( since_days=_TASK_LIST_SINCE_DAYS, max_rows=_TASK_LIST_MAX_ROWS, @@ -1245,7 +845,7 @@ def _task_list_poll(): session_id=_current_user, since_days=_TASK_LIST_SINCE_DAYS, ) - 
_render_task_list(_t, _current_user) + render_task_list(_t, _current_user) _task_list_poll() except (TypeError, AttributeError): _use_fragment = False @@ -1255,7 +855,7 @@ def _task_list_poll(): session_id=_current_user, since_days=_TASK_LIST_SINCE_DAYS, ) - has_active = _render_task_list(tasks, _current_user) + has_active = render_task_list(tasks, _current_user) if st.button("Refresh task list", key="refresh_tasks"): st.rerun() if has_active: @@ -2430,6 +2030,3 @@ def _update_progress_status(done: int, total_dirs: int): _emit_eval_finished_notification( f"Eval run finished with CSV error. Success: {success_count}, Skipped: {skipped_count}, Failed: {failed_count}. {e}" ) - - - diff --git a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py b/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py index 86f8e0c..0ce686c 100644 --- a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py +++ b/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py @@ -10,6 +10,9 @@ from typing import Optional from lib.WebAPI import scenarioAPI +from lib.ui.download_ui import render_download_task_section_header +from lib.ui.task_history import get_task_list_current_user, render_task_list +from lib.ui.styles_download import inject_download_page_styles from lib.user_config import UserConfig # Initialize or load user config @@ -38,6 +41,7 @@ def set_config_value(key, value): # JST timezone for display _JST = timezone(timedelta(hours=9)) _TASK_LIST_MAX_ROWS = 200 +_TASK_LIST_SINCE_DAYS = 7 def _to_jst(dt): @@ -83,86 +87,41 @@ def _load_catalog_presets(): return presets, loaded_path, load_error -def _enqueue_task(queue_name: str, params: dict) -> Optional[str]: +def _enqueue_task(task_type: str, params: dict) -> Optional[str]: try: + session_id = get_task_list_current_user() + task_id = create_task(task_type, params, session_id=session_id) + if not task_id: + return None from redis import Redis from rq import Queue import os redis_url = os.environ.get("REDIS_URL", 
"redis://localhost:6379") redis_conn = Redis.from_url(redis_url) - q = Queue(name=queue_name, connection=redis_conn, default_timeout="7d") - from worker.tasks import job_run_evaluator_and_process - job = q.enqueue(job_run_evaluator_and_process, kwargs=params, job_timeout="7d", result_ttl="7d") - user_id = None - try: - from lib.auth import get_current_user_id - user_id = get_current_user_id() - except Exception: - pass - create_task( - task_id=job.id, - task_type="run_evaluator_and_process", - description=f"Evaluator workflow: {params.get('description', 'no description')}", - parameters=params, - created_by=user_id, + q = Queue(name=os.environ.get("RQ_QUEUE", "default"), connection=redis_conn, default_timeout="7d") + from worker.tasks import run_job + job = q.enqueue( + run_job, + task_id, + task_type, + params, + job_timeout="7d", + result_ttl="7d", ) - return job.id + rq_id = getattr(job, "id", None) + if rq_id: + from lib.db import update_task_rq_job_id + update_task_rq_job_id(task_id, str(rq_id)) + return task_id except Exception as e: st.error(f"Failed to enqueue task: {e}") return None -def render_task_row(task): - """Render a single task row.""" - status = task.get("status", "unknown") - task_id = task.get("task_id", "") - description = task.get("description", "")[:70] - created = task.get("created_at") - created_str = _to_jst(created).strftime("%m/%d %H:%M") if created else "N/A" - - status_config = { - "running": {"color": "#f59e0b", "bg": "#fffbeb"}, - "finished": {"color": "#10b981", "bg": "#ecfdf5"}, - "failed": {"color": "#ef4444", "bg": "#fef2f2"}, - "queued": {"color": "#6b7280", "bg": "#f9fafb"}, - } - cfg = status_config.get(status, status_config["queued"]) - - st.markdown( - f""" -
-
- {task_id[:24]}... -
{description}
-
🕐 {created_str}
-
-
{status}
-
- """, - unsafe_allow_html=True, - ) - - # Page config st.set_page_config(page_title="Evaluator Workflow", layout="wide", initial_sidebar_state="expanded") inject_app_page_styles() +inject_download_page_styles() # Load catalog presets CATALOG_PRESETS, CATALOGS_PATH, catalog_load_error = _load_catalog_presets() @@ -393,50 +352,39 @@ def _make_default_output_path(branch_name): # ============================================ # TASK STATUS # ============================================ -st.markdown('
📋 Recent Tasks
', unsafe_allow_html=True) - if not is_task_queue_enabled(): st.info("Task queue not enabled. Set `USE_TASK_QUEUE=true` to track background tasks.") else: - tasks = list_recent_tasks(limit=_TASK_LIST_MAX_ROWS) - running = len([t for t in tasks if t.get("status") == "running"]) - finished = len([t for t in tasks if t.get("status") == "finished"]) - failed = len([t for t in tasks if t.get("status") == "failed"]) - - # Metrics row - m1, m2, m3, m4 = st.columns([1, 1, 1, 2]) - m1.metric("⏳ Running", running) - m2.metric("✅ Finished", finished) - m3.metric("❌ Failed", failed) - filter_status = m4.selectbox("Filter", ["All", "Running", "Finished", "Failed", "Queued"], index=0, label_visibility="collapsed") - - filtered = tasks if filter_status == "All" else [t for t in tasks if t.get("status") == filter_status.lower()] - - # Task list - for task in filtered[:10]: - status = task.get("status", "unknown") - task_id_str = task.get("task_id", "")[:24] - desc = task.get("description", "No description")[:60] - created = task.get("created_at") - created_str = _to_jst(created).strftime("%m/%d %H:%M") if created else "N/A" - - st.markdown(f""" -
-
- {task_id_str}... -
{desc}
-
🕐 {created_str}
-
-
{status}
-
- """, unsafe_allow_html=True) - - if task.get("error"): - with st.expander("❌ Error details"): - st.code(task["error"]) - - if len(filtered) > 10: - st.caption(f"Showing 10 of {len(filtered)} tasks") + current_user = get_task_list_current_user() + render_download_task_section_header( + since_days=_TASK_LIST_SINCE_DAYS, + max_rows=_TASK_LIST_MAX_ROWS, + ) + use_fragment = getattr(st, "fragment", None) is not None + if use_fragment: + try: + @st.fragment(run_every=timedelta(seconds=3)) + def _task_list_poll(): + current_tasks = list_recent_tasks( + limit=_TASK_LIST_MAX_ROWS, + session_id=current_user, + since_days=_TASK_LIST_SINCE_DAYS, + ) + render_task_list(current_tasks, current_user) + _task_list_poll() + except (TypeError, AttributeError): + use_fragment = False + if not use_fragment: + tasks = list_recent_tasks( + limit=_TASK_LIST_MAX_ROWS, + session_id=current_user, + since_days=_TASK_LIST_SINCE_DAYS, + ) + has_active = render_task_list(tasks, current_user) + if st.button("Refresh task list", key="refresh_tasks_workflow"): + st.rerun() + if has_active: + st.info("You have running tasks. 
Refresh the page to see latest status and logs.") st.sidebar.divider() st.sidebar.caption("💡 Runs async — close browser safely") diff --git a/evaluation_dashboard_app/worker/tasks.py b/evaluation_dashboard_app/worker/tasks.py index 5b06c88..2b07e0d 100644 --- a/evaluation_dashboard_app/worker/tasks.py +++ b/evaluation_dashboard_app/worker/tasks.py @@ -155,6 +155,98 @@ def _progress_callback(task_id: str, message: str) -> None: update_task_progress(task_id, message=message) +def _is_failed_case_status(case_report: Dict[str, Any]) -> bool: + """Best-effort failure check for case report payloads.""" + result = case_report.get("result") or {} + status = (result.get("status") or case_report.get("status") or "").strip().lower() + return status in {"failed", "failure", "error", "timed_out", "timeout", "canceled", "cancelled", "aborted"} + + +def _summarize_suite_reports(suite_rows: Any, *, limit: int = 10) -> list[Dict[str, Any]]: + """Normalize suite rows into a compact summary suitable for task result_summary.""" + normalized = [] + for row in suite_rows or []: + normalized.append( + { + "suite_name": row.get("name", ""), + "total": int(row.get("all", 0) or 0), + "success": int(row.get("success", 0) or 0), + "failed": int(row.get("fail", 0) or 0), + "canceled": int(row.get("cancel", 0) or 0), + "simulation": row.get("simulation", ""), + "url": row.get("url", ""), + } + ) + normalized.sort(key=lambda item: (-item["failed"], item["suite_name"])) + return normalized[:limit] + + +def _suite_case_totals(suite_rows: Any) -> Dict[str, int]: + """Aggregate totals from full suite rows.""" + totals = {"total": 0, "success": 0, "failed": 0, "canceled": 0} + for row in suite_rows or []: + totals["total"] += int(row.get("all", 0) or 0) + totals["success"] += int(row.get("success", 0) or 0) + totals["failed"] += int(row.get("fail", 0) or 0) + totals["canceled"] += int(row.get("cancel", 0) or 0) + return totals + + +def _extract_failed_case_details(case_reports: Any, *, limit: int = 
12) -> list[Dict[str, Any]]: + """Return a compact list of failed cases for UI/log display.""" + failed = [] + for report in case_reports or []: + if not _is_failed_case_status(report): + continue + failed.append( + { + "scenario_name": ((report.get("scenario") or {}).get("display_name", "")), + "suite_name": ((report.get("suite") or {}).get("display_name", "")), + "status": report.get("status", ""), + "fail_message": report.get("fail_message", ""), + "failure_cause_labels": report.get("failure_cause_labels", []), + "archive_log_id": (((report.get("logs") or {}).get("simulation_archive") or {}).get("id", "")), + "result_json_log_id": (((report.get("logs") or {}).get("simulation_result_json") or {}).get("id", "")), + } + ) + failed.sort(key=lambda item: (item["suite_name"], item["scenario_name"], item["fail_message"])) + return failed[:limit] + + +def _build_evaluator_result_summary( + *, + job_id: str, + report_url: str, + evaluator_status: str, + final_report: Dict[str, Any], + suite_rows: Any = None, + failed_cases: Any = None, +) -> Dict[str, Any]: + """Build a compact evaluator summary that the task detail UI can render.""" + build = final_report.get("build") or {} + test = final_report.get("test") or {} + available = test.get("available_case_results") or test.get("case_results") or {} + case_totals = _suite_case_totals(suite_rows) + if not any(case_totals.values()): + case_totals = { + "total": int(available.get("total_count", 0) or 0), + "success": int(available.get("success_count", 0) or 0), + "failed": int(available.get("failure_count", 0) or 0), + "canceled": int(available.get("cancellation_count", 0) or 0), + } + return { + "evaluator_job_id": job_id, + "evaluator_report_url": report_url, + "evaluator_status": evaluator_status, + "evaluator_build_status": build.get("status", ""), + "evaluator_test_status": test.get("status", ""), + "evaluator_fail_message": final_report.get("fail_message", ""), + "evaluator_case_totals": case_totals, + "evaluator_suites": 
_summarize_suite_reports(suite_rows), + "evaluator_failed_cases": _extract_failed_case_details(failed_cases), + } + + def job_download_results(task_id: str, parameters: Dict[str, Any]) -> None: """Download job results (archives or result JSON) and extract/organize. Requires auth.""" update_task_status(task_id, "running") @@ -421,6 +513,8 @@ def job_run_evaluator_and_process(task_id: str, parameters: Dict[str, Any]) -> N return environment = parameters.get("environment", "default") + os.environ["AUTH_PROFILE"] = environment + os.environ["EVALUATOR_ENVIRONMENT"] = environment def on_progress(msg: str) -> None: append_task_log(task_id, msg) @@ -434,8 +528,6 @@ def on_warning(msg: str) -> None: append_task_log(task_id, f"Project: {project_id}, Catalog: {catalog_id}, Target: {target_name}") try: - import os - os.environ["AUTH_PROFILE"] = environment api = evaluator_api.EvaluationRunAPI() result = api.schedule_job( @@ -463,10 +555,21 @@ def on_warning(msg: str) -> None: append_task_log(task_id, f"Scheduled evaluator job: {job_id}") append_task_log(task_id, f"Report URL: {report_url}") update_task_progress(task_id, message=f"Evaluator job scheduled: {job_id}", pct=5) + summary = { + "job": "run_evaluator_and_process", + "evaluator_job_id": job_id, + "evaluator_report_url": report_url, + "evaluator_status": "scheduled", + "download_summary": {"total": 0, "success": 0, "failed": 0}, + "eval_summary": {}, + "parquet_path": "", + } + update_task_result_summary(task_id, summary) # Step 2: Poll for evaluator completion on_progress("Step 2/5: Waiting for evaluator to complete...") append_task_log(task_id, "This may take a while depending on evaluator queue and run time...") + last_suite_snapshot = {"key": None, "time": 0.0} def on_eval_progress(status: str, elapsed: float) -> None: hours = elapsed / 3600 @@ -475,6 +578,61 @@ def on_eval_progress(status: str, elapsed: float) -> None: # Progress: 5% to 40% during evaluation wait pct = min(5 + (elapsed / max_wait_seconds) * 35, 40) 
update_task_progress(task_id, message=f"Evaluator: {status} ({hours:.1f}h elapsed)", pct=pct) + summary["evaluator_status"] = status + + should_snapshot = elapsed < 60 or (elapsed - last_suite_snapshot["time"]) >= 600 + if not should_snapshot: + update_task_result_summary(task_id, summary) + return + + try: + suite_rows = api.get_suite_summary(project_id, job_id, use_available_case_results=True) + except Exception: + update_task_result_summary(task_id, summary) + return + + suite_summary = _summarize_suite_reports(suite_rows) + totals = _suite_case_totals(suite_rows) + snapshot_key = ( + totals["total"], + totals["success"], + totals["failed"], + totals["canceled"], + tuple((row["suite_name"], row["failed"]) for row in suite_summary if row["failed"] > 0), + ) + last_suite_snapshot["time"] = elapsed + if snapshot_key == last_suite_snapshot["key"]: + summary["evaluator_case_totals"] = totals + summary["evaluator_suites"] = suite_summary + update_task_result_summary(task_id, summary) + return + + last_suite_snapshot["key"] = snapshot_key + summary["evaluator_case_totals"] = totals + summary["evaluator_suites"] = suite_summary + if totals["total"] > 0: + failing = [row for row in suite_summary if row["failed"] > 0] + if failing: + top = ", ".join(f"{row['suite_name']}={row['failed']}" for row in failing[:3]) + append_task_log( + task_id, + ( + "Evaluator progress snapshot: " + f"{totals['success']}/{totals['total']} success, " + f"{totals['failed']} failed, {totals['canceled']} canceled. " + f"Failing suites: {top}" + ), + ) + else: + append_task_log( + task_id, + ( + "Evaluator progress snapshot: " + f"{totals['success']}/{totals['total']} success, " + f"{totals['failed']} failed, {totals['canceled']} canceled." 
+ ), + ) + update_task_result_summary(task_id, summary) try: final_report = api.wait_for_job_completion( @@ -489,14 +647,54 @@ def on_eval_progress(status: str, elapsed: float) -> None: update_task_status(task_id, "failed", error_message=f"Evaluator failed or timed out: {e}") return - # Check if evaluator succeeded using the same status extraction as the poller. test_status = evaluator_api.extract_job_status(final_report) - if not evaluator_api.is_success_job_status(test_status): - update_task_status(task_id, "failed", error_message=f"Evaluator job failed with status: {test_status}") - return - - update_task_progress(task_id, message="Evaluator completed successfully", pct=40) - append_task_log(task_id, f"Evaluator completed with status: {test_status}") + try: + suite_rows = api.get_suite_summary(project_id, job_id, use_available_case_results=True) + except Exception as e: + append_task_log(task_id, f"Could not fetch suite summary: {e}") + suite_rows = [] + try: + case_reports = api.get_case_reports(project_id, job_id) + except Exception as e: + append_task_log(task_id, f"Could not fetch case reports: {e}") + case_reports = [] + + evaluator_summary = _build_evaluator_result_summary( + job_id=job_id, + report_url=report_url, + evaluator_status=test_status, + final_report=final_report, + suite_rows=suite_rows, + failed_cases=case_reports, + ) + summary.update(evaluator_summary) + update_task_result_summary(task_id, summary) + + fail_message = summary.get("evaluator_fail_message", "") + if evaluator_api.is_success_job_status(test_status): + update_task_progress(task_id, message="Evaluator completed successfully", pct=40) + append_task_log(task_id, f"Evaluator completed with status: {test_status}") + else: + append_task_log(task_id, f"Evaluator completed with non-success status: {test_status}") + if fail_message: + append_task_log(task_id, f"Evaluator fail message: {fail_message}") + case_totals = summary.get("evaluator_case_totals", {}) + append_task_log( + task_id, + 
( + "Evaluator result summary: " + f"{case_totals.get('success', 0)}/{case_totals.get('total', 0)} success, " + f"{case_totals.get('failed', 0)} failed, {case_totals.get('canceled', 0)} canceled" + ), + ) + failed_cases = summary.get("evaluator_failed_cases", []) + for case in failed_cases[:5]: + detail = case.get("fail_message", "") or case.get("status", "") + append_task_log( + task_id, + f"Failed case: {case.get('suite_name', '')} / {case.get('scenario_name', '')} - {detail}", + ) + update_task_progress(task_id, message=f"Evaluator finished with status {test_status}; trying download", pct=40) # Step 3: Download results on_progress("Step 3/5: Downloading results...") @@ -524,14 +722,23 @@ def on_eval_progress(status: str, elapsed: float) -> None: download_success = success_count > 0 if not download_success: + evaluator_msg = "" + if not evaluator_api.is_success_job_status(test_status): + evaluator_msg = f" Evaluator status was {test_status}." update_task_status(task_id, "failed", - error_message=f"Download failed: {failure_count} of {total_attempted} scenarios failed") + error_message=f"Download failed: {failure_count} of {total_attempted} scenarios failed.{evaluator_msg}") return break except RuntimeError as e: if "No case reports found" not in str(e) or time.time() >= download_deadline: - update_task_status(task_id, "failed", error_message=f"Download failed: {e}") + evaluator_msg = "" + if not evaluator_api.is_success_job_status(test_status): + evaluator_msg = ( + f" Evaluator status was {test_status}. " + "This usually means the job failed before producing downloadable case logs." 
+ ) + update_task_status(task_id, "failed", error_message=f"Download failed: {e}{evaluator_msg}") return wait_seconds = min( @@ -548,6 +755,13 @@ def on_eval_progress(status: str, elapsed: float) -> None: return update_task_progress(task_id, message=f"Download complete: {success_count}/{total_attempted} succeeded", pct=60) + summary["download_summary"] = { + "total": total_attempted, + "success": success_count, + "failed": failure_count, + } + summary["download_rows"] = rows[:500] + update_task_result_summary(task_id, summary) # Step 4: Run eval if run_eval: @@ -589,6 +803,8 @@ def on_eval_progress(status: str, elapsed: float) -> None: eval_result_summary = {} update_task_progress(task_id, message="Evaluation complete", pct=85) + summary["eval_summary"] = eval_result_summary + update_task_result_summary(task_id, summary) # Step 5: Generate parquet parquet_path = "" @@ -610,23 +826,14 @@ def on_eval_progress(status: str, elapsed: float) -> None: parquet_path = "" update_task_progress(task_id, message="All steps complete", pct=100) + summary["parquet_path"] = parquet_path # Build final summary - summary = { - "job": "run_evaluator_and_process", - "evaluator_job_id": job_id, - "evaluator_report_url": report_url, - "evaluator_status": test_status, - "download_summary": { - "total": total_attempted, - "success": success_count, - "failed": failure_count, - }, - "eval_summary": eval_result_summary, - "parquet_path": parquet_path, - } update_task_result_summary(task_id, summary) - append_task_log(task_id, "Workflow complete!") + if evaluator_api.is_success_job_status(test_status): + append_task_log(task_id, "Workflow complete!") + else: + append_task_log(task_id, "Workflow complete. 
Evaluator job had failed test cases, but downloadable results were processed.") update_task_status(task_id, "completed", result_path=output_path) except Exception as e: From 52224bf165b9f19eaa54df9ac6ea638a3a3f6f29 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Wed, 13 May 2026 21:26:40 +0900 Subject: [PATCH 56/94] feat: enhance evaluator report processing and display - Added new utility functions for parsing and formatting evaluator API timestamps, improving the handling of date and time data. - Implemented functions to extract and normalize job report metadata, including case totals and status color variants, enhancing the clarity of job summaries. - Improved the display logic for failed case rows and suite summaries, ensuring better organization and readability of evaluation results. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/pages/6_Download.py | 697 +++++++++++++++++++ 1 file changed, 697 insertions(+) diff --git a/evaluation_dashboard_app/pages/6_Download.py b/evaluation_dashboard_app/pages/6_Download.py index 3215f83..f65aa3b 100644 --- a/evaluation_dashboard_app/pages/6_Download.py +++ b/evaluation_dashboard_app/pages/6_Download.py @@ -62,6 +62,7 @@ def _to_jst(dt: Any) -> Optional[datetime]: update_task_rq_job_id, ) from lib import download_core +from lib import evaluator_api from lib.auth import get_current_user_id, is_auth_enabled try: @@ -878,6 +879,701 @@ def _run_eval_result_worker(result_dir: str, overwrite: bool) -> Dict[str, Any]: return run_eval_result_for_dir(result_dir, overwrite=overwrite) +def _parse_api_dt(value: Any) -> Optional[datetime]: + """Parse evaluator API timestamps into timezone-aware datetimes.""" + if value is None: + return None + if isinstance(value, datetime): + if getattr(value, "tzinfo", None) is None: + return value.replace(tzinfo=timezone.utc) + return value + try: + text = str(value).strip() + if not text: + return None + if text.endswith("Z"): + text = text[:-1] + "+00:00" 
+ dt = datetime.fromisoformat(text) + if getattr(dt, "tzinfo", None) is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt + except Exception: + return None + + +def _format_jst_time(value: Any, *, include_seconds: bool = False) -> str: + """Format timestamps for display in JST.""" + dt = _to_jst(_parse_api_dt(value)) + if not dt: + return "—" + return dt.strftime("%Y-%m-%d %H:%M:%S JST" if include_seconds else "%Y-%m-%d %H:%M JST") + + +def _format_relative_time(value: Any) -> str: + """Human-friendly age/duration from a timestamp until now.""" + dt = _parse_api_dt(value) + if not dt: + return "—" + now = datetime.now(timezone.utc) + secs = max(0, int((now - dt.astimezone(timezone.utc)).total_seconds())) + if secs < 60: + return f"{secs}s ago" + if secs < 3600: + return f"{secs // 60}m ago" + if secs < 86400: + return f"{secs // 3600}h ago" + return f"{secs // 86400}d ago" + + +def _format_duration(start_value: Any, end_value: Any) -> str: + """Format elapsed duration between two evaluator timestamps.""" + start = _parse_api_dt(start_value) + end = _parse_api_dt(end_value) + if not start or not end: + return "—" + secs = max(0, int((end - start).total_seconds())) + if secs < 60: + return f"{secs}s" + if secs < 3600: + return f"{secs // 60}m {secs % 60}s" + return f"{secs // 3600}h {(secs % 3600) // 60}m" + + +def _extract_git_target(report: Dict[str, Any]) -> str: + """Return a compact branch/tag label from evaluator job report metadata.""" + source = ((report.get("event") or {}).get("source") or {}) + git_ref = str(source.get("git_ref") or "").strip() + if git_ref.startswith("refs/heads/"): + return git_ref[len("refs/heads/"):] + if git_ref.startswith("refs/tags/"): + return git_ref[len("refs/tags/"):] + return git_ref or str(source.get("git_sha") or "").strip()[:12] or "—" + + +def _extract_case_totals(report: Dict[str, Any]) -> Dict[str, int]: + """Return total/success/failed/canceled counts from job report.""" + test = report.get("test") or {} + result = 
test.get("available_case_results") or test.get("case_results") or {} + return { + "total": int(result.get("total_count", 0) or 0), + "success": int(result.get("success_count", 0) or 0), + "failed": int(result.get("failure_count", 0) or 0), + "canceled": int(result.get("cancellation_count", 0) or 0), + } + + +def _extract_failed_case_rows(case_reports: List[Dict[str, Any]], *, limit: int = 50) -> List[Dict[str, Any]]: + """Normalize failed case rows for display tables.""" + rows: List[Dict[str, Any]] = [] + for report in case_reports: + status = str(report.get("status") or "").strip().lower() + result_status = str(((report.get("result") or {}).get("status") or "")).strip().lower() + if status not in evaluator_api.FAILED_JOB_STATUSES and result_status not in evaluator_api.FAILED_JOB_STATUSES: + continue + logs = report.get("logs") or {} + rows.append( + { + "Suite": ((report.get("suite") or {}).get("display_name") or ""), + "Scenario": ((report.get("scenario") or {}).get("display_name") or ""), + "Status": report.get("status", ""), + "Fail message": report.get("fail_message", ""), + "Cause": ", ".join(report.get("failure_cause_labels", []) or []), + "Archive log": "yes" if ((logs.get("simulation_archive") or {}).get("id")) else "no", + "Result JSON": "yes" if ((logs.get("simulation_result_json") or {}).get("id")) else "no", + } + ) + rows.sort(key=lambda row: (row["Suite"], row["Scenario"], row["Fail message"])) + return rows[:limit] + + +def _extract_suite_rows(suite_rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Normalize suite summary rows for display tables.""" + rows = [ + { + "Suite": row.get("name", ""), + "Total": int(row.get("all", 0) or 0), + "Success": int(row.get("success", 0) or 0), + "Failed": int(row.get("fail", 0) or 0), + "Canceled": int(row.get("cancel", 0) or 0), + "Simulation": row.get("simulation", ""), + "Report": row.get("url", ""), + } + for row in suite_rows or [] + ] + rows.sort(key=lambda row: (-row["Failed"], row["Suite"])) + 
return rows + + +def _status_color_variant(status: str) -> str: + """Map evaluator status to a style token used by the recent-job cards.""" + normalized = evaluator_api.normalize_job_status(status) + if normalized in evaluator_api.SUCCESS_JOB_STATUSES: + return "success" + if normalized in evaluator_api.FAILED_JOB_STATUSES: + return "failed" + if normalized in ("started", "running", "pending", "queued", "created"): + return "running" + return "unknown" + + +def _summarize_recent_job(report: Dict[str, Any]) -> Dict[str, Any]: + """Compact summary for one evaluator job card.""" + status = evaluator_api.extract_job_status(report) + totals = _extract_case_totals(report) + source = ((report.get("event") or {}).get("source") or {}) + integration = ((report.get("event") or {}).get("integration") or {}) + git_url = str(source.get("git_web_url") or source.get("git_url") or "").strip() + source_label = git_url.rstrip("/").split("/")[-1] if git_url else str(integration.get("type") or "—") + return { + "job_id": report.get("job_id") or report.get("id") or "", + "status": status, + "status_variant": _status_color_variant(status), + "build_status": ((report.get("build") or {}).get("status") or ""), + "test_status": ((report.get("test") or {}).get("status") or ""), + "target": _extract_git_target(report), + "catalog": ((report.get("catalog") or {}).get("display_name") or ""), + "description": report.get("description", ""), + "source_label": source_label, + "source_type": str(integration.get("type") or ""), + "scheduled_at": report.get("scheduled_at"), + "started_at": report.get("started_at"), + "finished_at": report.get("finished_at"), + "duration": _format_duration(report.get("started_at"), report.get("finished_at")), + "created_label": _format_relative_time(report.get("scheduled_at") or report.get("started_at")), + "report_url": evaluator_api.get_job_report_url(report.get("project_id", ""), report.get("job_id") or report.get("id") or ""), + "fail_message": 
report.get("fail_message", ""), + "total": totals["total"], + "success": totals["success"], + "failed": totals["failed"], + "canceled": totals["canceled"], + "git_sha": str(source.get("git_sha") or "")[:12], + "git_ref_url": source.get("git_ref_url", ""), + "git_commit_url": source.get("git_commit_url", ""), + "source_url": git_url, + } + + +@st.cache_data(ttl=30, show_spinner=False) +def _fetch_recent_evaluator_jobs(project_id: str, environment: str, limit: int) -> List[Dict[str, Any]]: + """Fetch recent evaluator jobs and normalize them for list rendering.""" + if not project_id: + return [] + os.environ["AUTH_PROFILE"] = environment or ENVIRONMENT + api = evaluator_api.EvaluationRunAPI() + reports = api.get_report_list(project_id, status="all", max_results=max(1, int(limit))) + reports = sorted( + reports, + key=lambda report: _parse_api_dt(report.get("scheduled_at") or report.get("started_at") or report.get("finished_at")) or datetime.min.replace(tzinfo=timezone.utc), + reverse=True, + ) + normalized = [] + for report in reports[:limit]: + normalized.append(_summarize_recent_job(report)) + return normalized + + +@st.cache_data(ttl=30, show_spinner=False) +def _fetch_evaluator_job_detail(project_id: str, environment: str, job_id: str) -> Dict[str, Any]: + """Fetch deep evaluator detail for one job on demand.""" + if not project_id or not job_id: + return {} + os.environ["AUTH_PROFILE"] = environment or ENVIRONMENT + api = evaluator_api.EvaluationRunAPI() + report = api.get_job_report(project_id, job_id) + suite_rows = api.get_suite_summary(project_id, job_id, use_available_case_results=True) + case_reports = api.get_case_reports(project_id, job_id) + summary = _summarize_recent_job(report) + return { + **summary, + "suite_rows": _extract_suite_rows(suite_rows), + "failed_case_rows": _extract_failed_case_rows(case_reports), + "raw_report": report, + } + + +def _inject_recent_evaluator_jobs_styles() -> None: + """Task-adjacent styles for the recent evaluator jobs 
section.""" + st.markdown( + """ + + """, + unsafe_allow_html=True, + ) + + +def _render_recent_evaluator_job_card(job: Dict[str, Any]) -> None: + """Render one recent evaluator job as a single-row list item.""" + variant = html.escape(job.get("status_variant", "unknown")) + status = html.escape(job.get("status", "unknown") or "unknown") + title_text = html.escape(job.get("target", "—")) + description = html.escape(job.get("description", "") or "") + catalog = html.escape(job.get("catalog", "") or "—") + scheduled = html.escape(_format_jst_time(job.get("scheduled_at"))) + duration = html.escape(job.get("duration", "—")) + job_id = html.escape(str(job.get("job_id", ""))) + build_status = html.escape(job.get("build_status", "") or "—") + test_status = html.escape(job.get("test_status", "") or "—") + created_label = html.escape(job.get("created_label", "—")) + git_sha = html.escape(job.get("git_sha", "") or "—") + source_label = html.escape(job.get("source_label", "") or "—") + source_type = html.escape(job.get("source_type", "") or "") + report_url = html.escape(job.get("report_url", "") or "") + source_url = html.escape(job.get("git_ref_url", "") or job.get("source_url", "") or "") + running_dot = '' if job.get("status_variant") == "running" else '' + counts = ( + f'S {int(job.get("success", 0))} · ' + f'F {int(job.get("failed", 0))} · ' + f'C {int(job.get("canceled", 0))} / ' + f'{int(job.get("total", 0))}' + ) + title_html = f'{title_text}' if report_url else title_text + source_html = ( + f'{source_label}' + if source_url else source_label + ) + st.markdown( + f""" +
+
+
+
{title_html}
+
{description if description else f"job {job_id[:8]}"}
+
+
+ {running_dot}{status} +
+
+ {scheduled}
{created_label} · {duration} +
+
+ {catalog}
{source_html}{f" ({source_type})" if source_type else ""} +
+
+ build {build_status} · test {test_status} · {git_sha}
+ {counts} +
+
+ job {job_id[:8]} +
+
+
+ """, + unsafe_allow_html=True, + ) + + +def _render_recent_evaluator_job_detail(project_id: str, environment: str, job: Dict[str, Any]) -> None: + """Render detailed evaluator-job information inside an expander.""" + job_id = str(job.get("job_id", "") or "") + if not job_id: + st.warning("Missing job id.") + return + try: + detail = _fetch_evaluator_job_detail(project_id, environment, job_id) + except Exception as e: + st.error(f"Could not fetch evaluator details: {e}") + return + + st.markdown("**Overview**") + top_cols = st.columns(4) + top_cols[0].metric("Total", int(detail.get("total", 0))) + top_cols[1].metric("Success", int(detail.get("success", 0))) + top_cols[2].metric("Failed", int(detail.get("failed", 0))) + top_cols[3].metric("Canceled", int(detail.get("canceled", 0))) + + overview_left, overview_right = st.columns([1.3, 1.1]) + with overview_left: + st.write(f"Status: `{detail.get('status', 'unknown')}`") + st.write(f"Build/Test: `{detail.get('build_status', '—')}` / `{detail.get('test_status', '—')}`") + st.write(f"Target: `{detail.get('target', '—')}`") + st.write(f"Catalog: `{detail.get('catalog', '—')}`") + st.write(f"Source: `{detail.get('source_label', '—')}`") + with overview_right: + st.write(f"Scheduled: `{_format_jst_time(detail.get('scheduled_at'), include_seconds=True)}`") + st.write(f"Started: `{_format_jst_time(detail.get('started_at'), include_seconds=True)}`") + st.write(f"Finished: `{_format_jst_time(detail.get('finished_at'), include_seconds=True)}`") + st.write(f"Duration: `{detail.get('duration', '—')}`") + st.write(f"SHA: `{detail.get('git_sha', '—')}`") + + action_cols = st.columns([1.2, 1.2, 4]) + report_url = detail.get("report_url", "") + source_url = detail.get("source_url", "") or detail.get("git_ref_url", "") + with action_cols[0]: + if report_url: + st.link_button("Open report", report_url, use_container_width=True) + with action_cols[1]: + if source_url: + st.link_button("Open source", source_url, 
use_container_width=True) + + if detail.get("fail_message"): + st.warning(detail.get("fail_message")) + + suite_rows = detail.get("suite_rows") or [] + with st.expander(f"Suites ({len(suite_rows)})", expanded=bool(suite_rows)): + if suite_rows: + st.dataframe(pd.DataFrame(suite_rows), width="stretch", hide_index=True) + else: + st.caption("No suite summary available.") + + failed_case_rows = detail.get("failed_case_rows") or [] + with st.expander(f"Failed Cases ({len(failed_case_rows)})", expanded=bool(failed_case_rows)): + if failed_case_rows: + st.dataframe(pd.DataFrame(failed_case_rows), width="stretch", hide_index=True) + else: + st.caption("No failed cases in the current report.") + + with st.expander("Raw JSON", expanded=False): + st.json(detail.get("raw_report", {})) + + +def _render_recent_evaluator_jobs_section(project_id: str, environment: str) -> None: + """Render a direct evaluator-jobs browser above the download tabs.""" + _inject_recent_evaluator_jobs_styles() + show_section = st.toggle( + "Show recent evaluator jobs", + value=st.session_state.get("recent_eval_jobs_show", False), + key="recent_eval_jobs_show", + help="Load recent evaluator jobs only when you want to browse them.", + ) + if not show_section: + return + + st.subheader("Recent evaluator jobs") + st.caption("Compact browser for recent evaluator jobs. 
Select one job to inspect detailed suite and failed-case information.") + + control_cols = st.columns([1.2, 1.8, 1.6, 1.1]) + with control_cols[0]: + limit = int( + st.selectbox( + "Jobs", + options=[6, 12, 20, 30], + index=1, + key="recent_eval_jobs_limit", + help="How many recent evaluator jobs to fetch for this project.", + ) + ) + with control_cols[1]: + status_filter = st.multiselect( + "Status filter", + options=["running", "success", "failed", "canceled", "unknown"], + default=[], + key="recent_eval_jobs_status_filter", + help="Leave empty to show all recent jobs.", + ) + with control_cols[2]: + branch_filter = st.text_input( + "Branch/tag contains", + value=st.session_state.get("recent_eval_jobs_branch_filter", ""), + key="recent_eval_jobs_branch_filter", + help="Optional substring filter for branch or tag name.", + ).strip() + with control_cols[3]: + if st.button("Refresh jobs", key="refresh_recent_eval_jobs", use_container_width=True): + _fetch_recent_evaluator_jobs.clear() + _fetch_evaluator_job_detail.clear() + st.rerun() + + page_key = "recent_eval_jobs_page" + if page_key not in st.session_state: + st.session_state[page_key] = 1 + + def _render_job_list() -> None: + if not project_id: + st.info("Enter a project id in the sidebar to browse recent evaluator jobs.") + return + current_page = max(1, int(st.session_state.get(page_key, 1))) + fetch_limit = max(limit * (current_page + 1), limit + 1) + try: + jobs = _fetch_recent_evaluator_jobs(project_id, environment, fetch_limit) + except Exception as e: + st.error(f"Could not fetch recent evaluator jobs: {e}") + return + + if branch_filter: + branch_filter_lower = branch_filter.lower() + jobs = [job for job in jobs if branch_filter_lower in str(job.get("target", "")).lower()] + if status_filter: + selected = set(status_filter) + jobs = [job for job in jobs if job.get("status_variant") in selected or evaluator_api.normalize_job_status(job.get("status", "")) in selected] + + if not jobs: + 
st.session_state[page_key] = 1 + st.markdown('
No recent evaluator jobs matched the current filters.
', unsafe_allow_html=True) + return + + total_loaded = len(jobs) + has_next_page = total_loaded > current_page * limit + max_known_page = max(1, (total_loaded + limit - 1) // limit) + if current_page > max_known_page: + current_page = max_known_page + st.session_state[page_key] = current_page + start_idx = (current_page - 1) * limit + end_idx = start_idx + limit + visible_jobs = jobs[start_idx:end_idx] + if not visible_jobs and current_page > 1: + current_page = max(1, current_page - 1) + st.session_state[page_key] = current_page + start_idx = (current_page - 1) * limit + end_idx = start_idx + limit + visible_jobs = jobs[start_idx:end_idx] + has_next_page = total_loaded > current_page * limit + + pager_cols = st.columns([1.1, 1.4, 4.5, 1.1, 1.1]) + with pager_cols[0]: + if st.button("Prev", key="recent_eval_jobs_prev", use_container_width=True, disabled=current_page <= 1): + st.session_state[page_key] = max(1, current_page - 1) + st.rerun() + with pager_cols[1]: + st.markdown( + f"
Page {current_page}
", + unsafe_allow_html=True, + ) + with pager_cols[3]: + if st.button("Next", key="recent_eval_jobs_next", use_container_width=True, disabled=not has_next_page): + st.session_state[page_key] = current_page + 1 + st.rerun() + with pager_cols[4]: + st.markdown( + f"
{total_loaded}+ loaded
", + unsafe_allow_html=True, + ) + + selected_job_id = st.session_state.get("recent_eval_jobs_selected") + if selected_job_id and not any(str(job.get("job_id", "")) == str(selected_job_id) for job in jobs): + st.session_state.pop("recent_eval_jobs_selected", None) + selected_job_id = None + + st.markdown('
', unsafe_allow_html=True) + for job in visible_jobs: + row_cols = st.columns([9.2, 1.0]) + with row_cols[0]: + _render_recent_evaluator_job_card(job) + with row_cols[1]: + if st.button("View", key=f"recent_eval_view_{job['job_id']}", use_container_width=True): + st.session_state["recent_eval_jobs_selected"] = str(job["job_id"]) + _fetch_evaluator_job_detail.clear() + st.rerun() + st.markdown("
", unsafe_allow_html=True) + + selected_job_id = st.session_state.get("recent_eval_jobs_selected") + if selected_job_id: + selected_job = next((job for job in jobs if str(job.get("job_id", "")) == str(selected_job_id)), None) + if selected_job: + if callable(getattr(st, "dialog", None)): + try: + @st.dialog(f"Job details · {selected_job.get('target', '—')}", width="large") + def _recent_eval_job_dialog() -> None: + _render_recent_evaluator_job_detail(project_id, environment, selected_job) + if st.button("Close", key="recent_eval_jobs_close_detail", use_container_width=True): + st.session_state.pop("recent_eval_jobs_selected", None) + st.rerun() + + _recent_eval_job_dialog() + finally: + st.session_state.pop("recent_eval_jobs_selected", None) + else: + st.markdown('
', unsafe_allow_html=True) + hdr_cols = st.columns([4.4, 1.1]) + with hdr_cols[0]: + st.subheader(f"Job details · {selected_job.get('target', '—')}") + with hdr_cols[1]: + if st.button("Close", key="recent_eval_jobs_close_detail_fallback", use_container_width=True): + st.session_state.pop("recent_eval_jobs_selected", None) + st.rerun() + _render_recent_evaluator_job_detail(project_id, environment, selected_job) + st.markdown("
", unsafe_allow_html=True) + + _render_job_list() + + # Sidebar for configuration with st.sidebar: st.header("Configuration") @@ -1072,6 +1768,7 @@ def on_suite_id_change(): skip_large_file = False large_file_mb = 50.0 # Doesn't apply +_render_recent_evaluator_jobs_section(project_id, environment) st.markdown('

Pick a workflow

', unsafe_allow_html=True) tab1, tab2, tab3, tab4 = st.tabs( From 1d9b9f1b68f8294c71c07372d47549ce55d1bc14 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Wed, 13 May 2026 21:51:22 +0900 Subject: [PATCH 57/94] feat: enhance download workflow with suite selection and job run dialog - Introduced a new function to extract suite selection options from evaluator suite summary rows, improving user interface for suite downloads. - Refactored the job rendering function to include a dialog for configuring download options, enhancing user experience with clearer workflow controls. - Added input fields for output path, download type, and suite selection, allowing users to customize their download process effectively. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/pages/6_Download.py | 516 ++++++++++++++++--- 1 file changed, 454 insertions(+), 62 deletions(-) diff --git a/evaluation_dashboard_app/pages/6_Download.py b/evaluation_dashboard_app/pages/6_Download.py index f65aa3b..becbb25 100644 --- a/evaluation_dashboard_app/pages/6_Download.py +++ b/evaluation_dashboard_app/pages/6_Download.py @@ -909,6 +909,22 @@ def _format_jst_time(value: Any, *, include_seconds: bool = False) -> str: return dt.strftime("%Y-%m-%d %H:%M:%S JST" if include_seconds else "%Y-%m-%d %H:%M JST") +def _format_jst_time_compact(value: Any) -> str: + """Compact timestamp for dense recent-job rows.""" + dt = _to_jst(_parse_api_dt(value)) + if not dt: + return "—" + return dt.strftime("%m-%d %H:%M") + + +def _format_jst_time_title(value: Any) -> str: + """Readable timestamp for fallback job titles.""" + dt = _to_jst(_parse_api_dt(value)) + if not dt: + return "unknown time" + return f"{dt.year}/{dt.month}/{dt.day} {dt.hour}:{dt.minute:02d}:{dt.second:02d}" + + def _format_relative_time(value: Any) -> str: """Human-friendly age/duration from a timestamp until now.""" dt = _parse_api_dt(value) @@ -950,6 +966,38 @@ def _extract_git_target(report: Dict[str, Any]) -> 
str: return git_ref or str(source.get("git_sha") or "").strip()[:12] or "—" +def _extract_catalog_url(report: Dict[str, Any]) -> str: + """Return a best-effort catalog URL for linking from recent evaluator jobs.""" + catalog = report.get("catalog") or {} + direct_url = str( + catalog.get("web_url") + or catalog.get("url") + or catalog.get("catalog_url") + or "" + ).strip() + if direct_url: + return direct_url + + project_id = str(report.get("project_id") or "").strip() + catalog_id = str( + catalog.get("catalog_id") + or catalog.get("id") + or "" + ).strip() + if project_id and catalog_id: + return f"https://evaluation.tier4.jp/evaluation/vehicle_catalogs/{catalog_id}?project_id={project_id}" + return "" + + +def _extract_job_title(report: Dict[str, Any]) -> str: + """Prefer evaluator description for display title, with a readable fallback.""" + description = str(report.get("description") or "").strip() + if description: + return description + started_like = report.get("started_at") or report.get("scheduled_at") or report.get("finished_at") + return f"no description (Started at {_format_jst_time_title(started_like)})" + + def _extract_case_totals(report: Dict[str, Any]) -> Dict[str, int]: """Return total/success/failed/canceled counts from job report.""" test = report.get("test") or {} @@ -1004,6 +1052,24 @@ def _extract_suite_rows(suite_rows: List[Dict[str, Any]]) -> List[Dict[str, Any] return rows +def _extract_suite_selection_options(suite_rows: List[Dict[str, Any]]) -> List[Dict[str, str]]: + """Build suite picker options from evaluator suite summary rows.""" + options: List[Dict[str, str]] = [] + seen_ids = set() + for row in suite_rows or []: + report_url = str(row.get("url") or "").strip() + suite_id = "" + if "/tests/" in report_url: + tail = report_url.split("/tests/", 1)[1] + suite_id = tail.split("?", 1)[0].split("/", 1)[0].strip() + if not suite_id or suite_id in seen_ids: + continue + seen_ids.add(suite_id) + suite_name = str(row.get("name") or 
suite_id).strip() + options.append({"id": suite_id, "label": f"{suite_name} ({suite_id})"}) + return options + + def _status_color_variant(status: str) -> str: """Map evaluator status to a style token used by the recent-job cards.""" normalized = evaluator_api.normalize_job_status(status) @@ -1016,25 +1082,43 @@ def _status_color_variant(status: str) -> str: return "unknown" +def _status_display_label(status: str) -> str: + """Short status label for compact list rows.""" + normalized = evaluator_api.normalize_job_status(status) + if normalized in ("succeeded", "success"): + return "success" + if normalized in ("failed", "failure", "error"): + return "failed" + if normalized in ("canceled", "cancelled", "aborted"): + return "canceled" + if normalized in ("started", "running"): + return "running" + if normalized in ("pending", "queued", "created"): + return "queued" + return normalized or "unknown" + + def _summarize_recent_job(report: Dict[str, Any]) -> Dict[str, Any]: """Compact summary for one evaluator job card.""" status = evaluator_api.extract_job_status(report) totals = _extract_case_totals(report) source = ((report.get("event") or {}).get("source") or {}) - integration = ((report.get("event") or {}).get("integration") or {}) git_url = str(source.get("git_web_url") or source.get("git_url") or "").strip() - source_label = git_url.rstrip("/").split("/")[-1] if git_url else str(integration.get("type") or "—") + source_repo_label = git_url.rstrip("/").split("/")[-1] if git_url else "—" + git_ref_label = _extract_git_target(report) return { "job_id": report.get("job_id") or report.get("id") or "", + "title": _extract_job_title(report), "status": status, "status_variant": _status_color_variant(status), "build_status": ((report.get("build") or {}).get("status") or ""), "test_status": ((report.get("test") or {}).get("status") or ""), - "target": _extract_git_target(report), + "target": git_ref_label, "catalog": ((report.get("catalog") or {}).get("display_name") or 
""), + "catalog_url": _extract_catalog_url(report), "description": report.get("description", ""), - "source_label": source_label, - "source_type": str(integration.get("type") or ""), + "source_label": git_ref_label, + "source_repo_label": source_repo_label, "scheduled_at": report.get("scheduled_at"), "started_at": report.get("started_at"), "finished_at": report.get("finished_at"), @@ -1124,12 +1208,12 @@ def _inject_recent_evaluator_jobs_styles() -> None: .evj-top { justify-content: space-between; } .evj-row { display: grid; - grid-template-columns: minmax(220px, 1.6fr) minmax(110px, 0.8fr) minmax(150px, 1fr) minmax(120px, 0.9fr) minmax(170px, 1.2fr) auto; - gap: 10px; + grid-template-columns: minmax(180px, 1.35fr) minmax(86px, 0.5fr) minmax(108px, 0.7fr) minmax(180px, 1.25fr) minmax(190px, 1.15fr); + gap: 8px; align-items: center; } .evj-title { - font-size: 0.94rem; + font-size: 0.9rem; font-weight: 800; color: #0f172a; margin: 0; @@ -1145,24 +1229,29 @@ def _inject_recent_evaluator_jobs_styles() -> None: .evj-name { min-width: 0; } - .evj-name-sub { - margin-top: 0.15rem; - font-size: 0.78rem; - color: #64748b; + .evj-name .evj-title, + .evj-name .evj-name-sub, + .evj-ref-cell, + .evj-ref-cell .evj-name-sub { white-space: nowrap; overflow: hidden; text-overflow: ellipsis; } + .evj-name-sub { + margin-top: 0.15rem; + font-size: 0.74rem; + color: #64748b; + } .evj-status { display: inline-flex; align-items: center; - gap: 6px; - padding: 0.28rem 0.7rem; + gap: 5px; + padding: 0.24rem 0.5rem; border-radius: 999px; - font-size: 0.76rem; + font-size: 0.7rem; font-weight: 800; - text-transform: uppercase; - letter-spacing: 0.04em; + text-transform: lowercase; + letter-spacing: 0.01em; border: 1px solid transparent; } .evj-status--running { color: #9a6700; background: #fff7db; border-color: rgba(245, 158, 11, 0.28); } @@ -1185,17 +1274,6 @@ def _inject_recent_evaluator_jobs_styles() -> None: 50% { transform: scale(1.2); opacity: 1; } 100% { transform: scale(0.9); 
opacity: 0.55; } } - .evj-chip { - display: inline-flex; - align-items: center; - padding: 0.22rem 0.52rem; - border-radius: 999px; - background: #f8fafc; - border: 1px solid rgba(148, 163, 184, 0.24); - color: #334155; - font-size: 0.74rem; - font-weight: 700; - } .evj-meta { color: #475569; font-size: 0.82rem; @@ -1206,9 +1284,23 @@ def _inject_recent_evaluator_jobs_styles() -> None: gap: 8px; margin-top: 0.7rem; } + .evj-toolbar-note { + margin: 0.15rem 0 0.35rem; + font-size: 0.72rem; + font-weight: 700; + letter-spacing: 0.02em; + color: #64748b; + text-transform: uppercase; + } + .evj-pager-note { + margin-top: 0.28rem; + font-size: 0.76rem; + color: #475569; + white-space: nowrap; + } .evj-cell { min-width: 0; - font-size: 0.82rem; + font-size: 0.78rem; color: #334155; } .evj-cell a { @@ -1246,10 +1338,48 @@ def _inject_recent_evaluator_jobs_styles() -> None: .evj-inline-stats { display: flex; flex-wrap: wrap; - gap: 8px; - font-size: 0.82rem; + gap: 6px; + font-size: 0.76rem; color: #334155; } + [class*="st-key-recent_eval_view_"] button, + [class*="st-key-recent_eval_run_"] button, + [class*="st-key-recent_eval_jobs_prev"] button, + [class*="st-key-recent_eval_jobs_next"] button, + [class*="st-key-refresh_recent_eval_jobs"] button { + min-height: 2rem; + padding: 0.18rem 0.58rem; + border-radius: 999px; + font-size: 0.72rem; + font-weight: 700; + box-shadow: none; + } + [class*="st-key-recent_eval_view_"] button, + [class*="st-key-recent_eval_jobs_prev"] button, + [class*="st-key-recent_eval_jobs_next"] button, + [class*="st-key-refresh_recent_eval_jobs"] button { + border-color: rgba(148, 163, 184, 0.34); + color: #334155; + background: #ffffff; + } + [class*="st-key-recent_eval_view_"] button:hover, + [class*="st-key-recent_eval_jobs_prev"] button:hover, + [class*="st-key-recent_eval_jobs_next"] button:hover, + [class*="st-key-refresh_recent_eval_jobs"] button:hover { + border-color: rgba(15, 118, 110, 0.28); + color: #0f766e; + background: #f8fffd; + } 
+ [class*="st-key-recent_eval_run_"] button { + border-color: rgba(13, 148, 136, 0.22); + background: linear-gradient(180deg, #f0fdfa, #ecfeff); + color: #0f766e; + } + [class*="st-key-recent_eval_run_"] button:hover { + border-color: rgba(13, 148, 136, 0.34); + background: linear-gradient(180deg, #ccfbf1, #ecfeff); + color: #115e59; + } .evj-stat-label { display: block; font-size: 0.68rem; @@ -1292,11 +1422,12 @@ def _inject_recent_evaluator_jobs_styles() -> None: def _render_recent_evaluator_job_card(job: Dict[str, Any]) -> None: """Render one recent evaluator job as a single-row list item.""" variant = html.escape(job.get("status_variant", "unknown")) - status = html.escape(job.get("status", "unknown") or "unknown") - title_text = html.escape(job.get("target", "—")) + status = html.escape(_status_display_label(job.get("status", "unknown") or "unknown")) + title_text = html.escape(job.get("title", "—")) description = html.escape(job.get("description", "") or "") catalog = html.escape(job.get("catalog", "") or "—") - scheduled = html.escape(_format_jst_time(job.get("scheduled_at"))) + catalog_url = html.escape(job.get("catalog_url", "") or "") + scheduled = html.escape(_format_jst_time_compact(job.get("scheduled_at"))) duration = html.escape(job.get("duration", "—")) job_id = html.escape(str(job.get("job_id", ""))) build_status = html.escape(job.get("build_status", "") or "—") @@ -1304,10 +1435,10 @@ def _render_recent_evaluator_job_card(job: Dict[str, Any]) -> None: created_label = html.escape(job.get("created_label", "—")) git_sha = html.escape(job.get("git_sha", "") or "—") source_label = html.escape(job.get("source_label", "") or "—") - source_type = html.escape(job.get("source_type", "") or "") report_url = html.escape(job.get("report_url", "") or "") source_url = html.escape(job.get("git_ref_url", "") or job.get("source_url", "") or "") running_dot = '' if job.get("status_variant") == "running" else '' + meta_line = f"id {job_id[:8]}" counts = ( f'S 
{int(job.get("success", 0))} · ' f'F {int(job.get("failed", 0))} · ' @@ -1319,30 +1450,31 @@ def _render_recent_evaluator_job_card(job: Dict[str, Any]) -> None: f'{source_label}' if source_url else source_label ) + catalog_html = ( + f'{catalog}' + if catalog_url else catalog + ) st.markdown( f"""
{title_html}
-
{description if description else f"job {job_id[:8]}"}
+
{meta_line}
{running_dot}{status}
- {scheduled}
{created_label} · {duration} + {scheduled}
{duration} · {created_label}
-
- {catalog}
{source_html}{f" ({source_type})" if source_type else ""} +
+ {catalog_html}
{source_html}
build {build_status} · test {test_status} · {git_sha}
{counts}
-
- job {job_id[:8]} -
""", @@ -1372,10 +1504,11 @@ def _render_recent_evaluator_job_detail(project_id: str, environment: str, job: overview_left, overview_right = st.columns([1.3, 1.1]) with overview_left: st.write(f"Status: `{detail.get('status', 'unknown')}`") + st.write(f"Title: `{detail.get('title', '—')}`") st.write(f"Build/Test: `{detail.get('build_status', '—')}` / `{detail.get('test_status', '—')}`") - st.write(f"Target: `{detail.get('target', '—')}`") + st.write(f"Ref: `{detail.get('target', '—')}`") st.write(f"Catalog: `{detail.get('catalog', '—')}`") - st.write(f"Source: `{detail.get('source_label', '—')}`") + st.write(f"Repo: `{detail.get('source_repo_label', '—')}`") with overview_right: st.write(f"Scheduled: `{_format_jst_time(detail.get('scheduled_at'), include_seconds=True)}`") st.write(f"Started: `{_format_jst_time(detail.get('started_at'), include_seconds=True)}`") @@ -1385,11 +1518,15 @@ def _render_recent_evaluator_job_detail(project_id: str, environment: str, job: action_cols = st.columns([1.2, 1.2, 4]) report_url = detail.get("report_url", "") + catalog_url = detail.get("catalog_url", "") source_url = detail.get("source_url", "") or detail.get("git_ref_url", "") with action_cols[0]: if report_url: st.link_button("Open report", report_url, use_container_width=True) with action_cols[1]: + if catalog_url: + st.link_button("Open catalog", catalog_url, use_container_width=True) + with action_cols[2]: if source_url: st.link_button("Open source", source_url, use_container_width=True) @@ -1414,7 +1551,181 @@ def _render_recent_evaluator_job_detail(project_id: str, environment: str, job: st.json(detail.get("raw_report", {})) -def _render_recent_evaluator_jobs_section(project_id: str, environment: str) -> None: +def _render_recent_evaluator_job_run_dialog( + project_id: str, + environment: str, + job: Dict[str, Any], + *, + output_path_default: str, + download_type_default: str, + phase_default: str, + skip_large_file_default: bool, + large_file_mb_default: float, + 
keep_zip_files_default: bool, +) -> None: + """Render the dialog used to enqueue Download + Eval + Parquet from a recent job row.""" + job_id = str(job.get("job_id", "") or "") + if not job_id: + st.error("Missing evaluator job id.") + return + + detail = _fetch_evaluator_job_detail(project_id, environment, job_id) + suite_options = _extract_suite_selection_options(detail.get("suite_rows") or []) + suite_label_to_id = {opt["label"]: opt["id"] for opt in suite_options} + suite_labels = [opt["label"] for opt in suite_options] + + st.caption("Confirm the workflow options for this evaluator job, then start a background task.") + summary_cols = st.columns([1.45, 1.15, 1.35, 1.05]) + summary_cols[0].markdown(f"**Title** \n`{detail.get('title', '—')}`") + summary_cols[1].markdown(f"**Status** \n`{detail.get('status', 'unknown')}`") + summary_cols[2].markdown(f"**Catalog** \n`{detail.get('catalog', '—')}`") + summary_cols[3].markdown(f"**Cases** \n`{int(detail.get('total', 0))}`") + + with st.form(key=f"recent_eval_run_form_{job_id}", border=False): + run_output_path = st.text_input( + "Output path", + value=output_path_default, + help="Folder under the data directory. 
This uses the same safe path rules as the main download workflow.", + ) + + selected_suite_labels = st.multiselect( + "Suites to download (optional)", + options=suite_labels, + default=[], + help="Leave empty to download all suites from this evaluator job.", + ) + + run_download_type = st.radio( + "Download type", + ["Archives (ZIP)", "Result JSON only"], + index=0 if download_type_default == "Archives (ZIP)" else 1, + horizontal=True, + ) + + run_phase = "" + run_skip_large_file = False + run_large_file_mb = 50.0 + run_keep_zip_files = False + if run_download_type == "Archives (ZIP)": + run_phase = st.text_input( + "Phase to extract", + value=phase_default, + help="Enter the phase name to extract from archives.", + ) + opt_cols = st.columns([1.2, 1.3, 1.2]) + with opt_cols[0]: + run_skip_large_file = st.checkbox( + "Skip large files", + value=skip_large_file_default, + help="Skip unusually large archives during download.", + ) + with opt_cols[1]: + run_large_file_mb = st.number_input( + "Skip threshold (MB)", + min_value=1.0, + max_value=5000.0, + step=1.0, + value=float(large_file_mb_default), + ) + with opt_cols[2]: + run_keep_zip_files = st.checkbox( + "Keep ZIP files", + value=keep_zip_files_default, + help="Keep downloaded ZIPs after extraction.", + ) + + run_cols = st.columns([1.25, 1.25, 1.1]) + with run_cols[0]: + run_eval = st.checkbox( + "Run evaluation", + value=True, + help="Run eval_result and generate Summary.csv / Score.csv after download.", + ) + with run_cols[1]: + generate_parquet = st.checkbox( + "Generate parquet", + value=CATALOG_IO_AVAILABLE, + disabled=not CATALOG_IO_AVAILABLE, + help="Build scene_result.parquet from .pkl files." 
if CATALOG_IO_AVAILABLE else "Install perception_catalog_analyzer to enable this.", + ) + with run_cols[2]: + eval_recursive = st.checkbox( + "Recursive eval", + value=True, + help="Search subdirectories for evaluation result folders.", + ) + + action_cols = st.columns([1.15, 1.15, 3.7]) + cancel_clicked = action_cols[0].form_submit_button("Cancel", use_container_width=True) + start_clicked = action_cols[1].form_submit_button("Start", type="primary", use_container_width=True) + + if cancel_clicked: + st.session_state.pop("recent_eval_jobs_run_selected", None) + st.rerun() + + if not start_clicked: + return + + resolved_output, path_err = resolve_under_data_root(run_output_path, allow_create=True) + if path_err: + st.error(f"Output path is invalid: {path_err}") + return + + selected_suite_ids = [suite_label_to_id[label] for label in selected_suite_labels] + resolved_path_str = str(resolved_output) + set_config_value("output_path", to_data_relative(resolved_output)) + set_config_value("environment", environment) + set_config_value("project_id", project_id) + set_config_value("job_id", job_id) + set_config_value("suite_id", "") + set_config_value("suite_ids", selected_suite_ids) + set_config_value("download_type", run_download_type) + if run_download_type == "Archives (ZIP)": + set_config_value("phase", run_phase) + set_config_value("skip_large_file", run_skip_large_file) + set_config_value("large_file_mb", run_large_file_mb) + set_config_value("keep_zip_files", run_keep_zip_files) + + params = { + "output_path": resolved_path_str, + "project_id": project_id, + "job_id": job_id, + "suite_id": "", + "suite_ids": selected_suite_ids or None, + "download_type": "archives" if run_download_type == "Archives (ZIP)" else "result_json", + "phase": run_phase if run_download_type == "Archives (ZIP)" else "", + "skip_large_file": run_skip_large_file if run_download_type == "Archives (ZIP)" else False, + "large_file_mb": run_large_file_mb if run_download_type == "Archives (ZIP)" 
else 50.0, + "keep_zip_files": run_keep_zip_files if run_download_type == "Archives (ZIP)" else False, + "run_eval": run_eval, + "generate_parquet": generate_parquet, + "eval_recursive": eval_recursive, + "eval_overwrite": False, + } + task_id = _enqueue_task("download_and_eval", params) + if not task_id: + st.error("Failed to enqueue task. Check REDIS_URL and DATABASE_URL.") + return + + st.session_state["recent_eval_jobs_flash"] = ( + f"Queued Download + Eval + Parquet for `{detail.get('title', job_id)}`. " + f"Task id: `{task_id}`." + ) + st.session_state.pop("recent_eval_jobs_run_selected", None) + st.rerun() + + +def _render_recent_evaluator_jobs_section( + project_id: str, + environment: str, + *, + output_path_default: str, + download_type_default: str, + phase_default: str, + skip_large_file_default: bool, + large_file_mb_default: float, + keep_zip_files_default: bool, +) -> None: """Render a direct evaluator-jobs browser above the download tabs.""" _inject_recent_evaluator_jobs_styles() show_section = st.toggle( @@ -1428,35 +1739,47 @@ def _render_recent_evaluator_jobs_section(project_id: str, environment: str) -> st.subheader("Recent evaluator jobs") st.caption("Compact browser for recent evaluator jobs. Select one job to inspect detailed suite and failed-case information.") + flash_message = st.session_state.pop("recent_eval_jobs_flash", None) + if flash_message: + st.success(flash_message) - control_cols = st.columns([1.2, 1.8, 1.6, 1.1]) + control_cols = st.columns([0.95, 1.45, 1.65, 0.9]) with control_cols[0]: + st.markdown('
Rows
', unsafe_allow_html=True) limit = int( st.selectbox( - "Jobs", + "Rows", options=[6, 12, 20, 30], index=1, key="recent_eval_jobs_limit", help="How many recent evaluator jobs to fetch for this project.", + label_visibility="collapsed", ) ) with control_cols[1]: + st.markdown('
Status
', unsafe_allow_html=True) status_filter = st.multiselect( - "Status filter", + "Status", options=["running", "success", "failed", "canceled", "unknown"], default=[], key="recent_eval_jobs_status_filter", help="Leave empty to show all recent jobs.", + label_visibility="collapsed", + placeholder="All statuses", ) with control_cols[2]: + st.markdown('
Branch Or Tag
', unsafe_allow_html=True) branch_filter = st.text_input( "Branch/tag contains", value=st.session_state.get("recent_eval_jobs_branch_filter", ""), key="recent_eval_jobs_branch_filter", help="Optional substring filter for branch or tag name.", + label_visibility="collapsed", + placeholder="Filter by branch or tag", ).strip() with control_cols[3]: - if st.button("Refresh jobs", key="refresh_recent_eval_jobs", use_container_width=True): + st.markdown('
Actions
', unsafe_allow_html=True) + if st.button("Refresh", key="refresh_recent_eval_jobs", use_container_width=True): _fetch_recent_evaluator_jobs.clear() _fetch_evaluator_job_detail.clear() st.rerun() @@ -1506,23 +1829,23 @@ def _render_job_list() -> None: visible_jobs = jobs[start_idx:end_idx] has_next_page = total_loaded > current_page * limit - pager_cols = st.columns([1.1, 1.4, 4.5, 1.1, 1.1]) + pager_cols = st.columns([0.9, 1.1, 5.2, 0.9, 1.2]) with pager_cols[0]: - if st.button("Prev", key="recent_eval_jobs_prev", use_container_width=True, disabled=current_page <= 1): + if st.button("Back", key="recent_eval_jobs_prev", use_container_width=True, disabled=current_page <= 1): st.session_state[page_key] = max(1, current_page - 1) st.rerun() with pager_cols[1]: st.markdown( - f"
Page {current_page}
", + f"
{current_page}
", unsafe_allow_html=True, ) with pager_cols[3]: - if st.button("Next", key="recent_eval_jobs_next", use_container_width=True, disabled=not has_next_page): + if st.button("More", key="recent_eval_jobs_next", use_container_width=True, disabled=not has_next_page): st.session_state[page_key] = current_page + 1 st.rerun() with pager_cols[4]: st.markdown( - f"
{total_loaded}+ loaded
", + f"
{len(visible_jobs)} shown · {total_loaded}+ loaded
", unsafe_allow_html=True, ) @@ -1531,16 +1854,28 @@ def _render_job_list() -> None: st.session_state.pop("recent_eval_jobs_selected", None) selected_job_id = None + selected_run_job_id = st.session_state.get("recent_eval_jobs_run_selected") + if selected_run_job_id and not any(str(job.get("job_id", "")) == str(selected_run_job_id) for job in jobs): + st.session_state.pop("recent_eval_jobs_run_selected", None) + selected_run_job_id = None + st.markdown('
', unsafe_allow_html=True) for job in visible_jobs: - row_cols = st.columns([9.2, 1.0]) + row_cols = st.columns([9.4, 2.0]) with row_cols[0]: _render_recent_evaluator_job_card(job) with row_cols[1]: - if st.button("View", key=f"recent_eval_view_{job['job_id']}", use_container_width=True): - st.session_state["recent_eval_jobs_selected"] = str(job["job_id"]) - _fetch_evaluator_job_detail.clear() - st.rerun() + action_cols = st.columns([1.0, 1.0], gap="small") + with action_cols[0]: + if st.button("Details", key=f"recent_eval_view_{job['job_id']}", use_container_width=True): + st.session_state["recent_eval_jobs_selected"] = str(job["job_id"]) + _fetch_evaluator_job_detail.clear() + st.rerun() + with action_cols[1]: + if st.button("Run", key=f"recent_eval_run_{job['job_id']}", use_container_width=True): + st.session_state["recent_eval_jobs_run_selected"] = str(job["job_id"]) + _fetch_evaluator_job_detail.clear() + st.rerun() st.markdown("
", unsafe_allow_html=True) selected_job_id = st.session_state.get("recent_eval_jobs_selected") @@ -1549,7 +1884,7 @@ def _render_job_list() -> None: if selected_job: if callable(getattr(st, "dialog", None)): try: - @st.dialog(f"Job details · {selected_job.get('target', '—')}", width="large") + @st.dialog(f"Job details · {selected_job.get('title', '—')}", width="large") def _recent_eval_job_dialog() -> None: _render_recent_evaluator_job_detail(project_id, environment, selected_job) if st.button("Close", key="recent_eval_jobs_close_detail", use_container_width=True): @@ -1563,7 +1898,7 @@ def _recent_eval_job_dialog() -> None: st.markdown('
', unsafe_allow_html=True) hdr_cols = st.columns([4.4, 1.1]) with hdr_cols[0]: - st.subheader(f"Job details · {selected_job.get('target', '—')}") + st.subheader(f"Job details · {selected_job.get('title', '—')}") with hdr_cols[1]: if st.button("Close", key="recent_eval_jobs_close_detail_fallback", use_container_width=True): st.session_state.pop("recent_eval_jobs_selected", None) @@ -1571,6 +1906,52 @@ def _recent_eval_job_dialog() -> None: _render_recent_evaluator_job_detail(project_id, environment, selected_job) st.markdown("
", unsafe_allow_html=True) + selected_run_job_id = st.session_state.get("recent_eval_jobs_run_selected") + if selected_run_job_id: + selected_run_job = next((job for job in jobs if str(job.get("job_id", "")) == str(selected_run_job_id)), None) + if selected_run_job: + if callable(getattr(st, "dialog", None)): + try: + @st.dialog(f"Download + Eval + Parquet · {selected_run_job.get('title', '—')}", width="large") + def _recent_eval_run_dialog() -> None: + _render_recent_evaluator_job_run_dialog( + project_id, + environment, + selected_run_job, + output_path_default=output_path_default, + download_type_default=download_type_default, + phase_default=phase_default, + skip_large_file_default=skip_large_file_default, + large_file_mb_default=large_file_mb_default, + keep_zip_files_default=keep_zip_files_default, + ) + + _recent_eval_run_dialog() + finally: + if st.session_state.get("recent_eval_jobs_run_selected") == str(selected_run_job_id): + st.session_state.pop("recent_eval_jobs_run_selected", None) + else: + st.markdown('
', unsafe_allow_html=True) + hdr_cols = st.columns([4.4, 1.1]) + with hdr_cols[0]: + st.subheader(f"Download + Eval + Parquet · {selected_run_job.get('title', '—')}") + with hdr_cols[1]: + if st.button("Close", key="recent_eval_jobs_close_run_fallback", use_container_width=True): + st.session_state.pop("recent_eval_jobs_run_selected", None) + st.rerun() + _render_recent_evaluator_job_run_dialog( + project_id, + environment, + selected_run_job, + output_path_default=output_path_default, + download_type_default=download_type_default, + phase_default=phase_default, + skip_large_file_default=skip_large_file_default, + large_file_mb_default=large_file_mb_default, + keep_zip_files_default=keep_zip_files_default, + ) + st.markdown("
", unsafe_allow_html=True) + _render_job_list() @@ -1768,7 +2149,18 @@ def on_suite_id_change(): skip_large_file = False large_file_mb = 50.0 # Doesn't apply -_render_recent_evaluator_jobs_section(project_id, environment) +_render_recent_evaluator_jobs_section( + project_id, + environment, + output_path_default=output_path, + download_type_default=download_type, + phase_default=phase if download_type == "Archives (ZIP)" else get_config_value( + "phase", "perception.object_recognition.tracking.objects" + ), + skip_large_file_default=skip_large_file, + large_file_mb_default=large_file_mb, + keep_zip_files_default=bool(get_config_value("keep_zip_files", False)) if download_type == "Archives (ZIP)" else False, +) st.markdown('

Pick a workflow

', unsafe_allow_html=True) tab1, tab2, tab3, tab4 = st.tabs( From 262c0846b6c3094eda4b3571a7ed76497afcfe00 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Thu, 14 May 2026 08:40:36 +0900 Subject: [PATCH 58/94] feat: enhance job status visualization in download page - Added new CSS styles for job status indicators, improving the visual representation of job statuses (success, failed, unknown, running). - Updated the job rendering function to include status marks, enhancing user experience by providing clearer feedback on job states. - Introduced animations for the running status mark, adding a dynamic element to the UI. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/pages/6_Download.py | 51 +++++++++++++++++++- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/evaluation_dashboard_app/pages/6_Download.py b/evaluation_dashboard_app/pages/6_Download.py index becbb25..126b6e4 100644 --- a/evaluation_dashboard_app/pages/6_Download.py +++ b/evaluation_dashboard_app/pages/6_Download.py @@ -1075,6 +1075,8 @@ def _status_color_variant(status: str) -> str: normalized = evaluator_api.normalize_job_status(status) if normalized in evaluator_api.SUCCESS_JOB_STATUSES: return "success" + if normalized in ("canceled", "cancelled", "aborted"): + return "canceled" if normalized in evaluator_api.FAILED_JOB_STATUSES: return "failed" if normalized in ("started", "running", "pending", "queued", "created"): @@ -1257,7 +1259,42 @@ def _inject_recent_evaluator_jobs_styles() -> None: .evj-status--running { color: #9a6700; background: #fff7db; border-color: rgba(245, 158, 11, 0.28); } .evj-status--success { color: #047857; background: #dcfce7; border-color: rgba(16, 185, 129, 0.28); } .evj-status--failed { color: #b91c1c; background: #fee2e2; border-color: rgba(239, 68, 68, 0.28); } + .evj-status--canceled { color: #7c3aed; background: #f3e8ff; border-color: rgba(124, 58, 237, 0.24); } .evj-status--unknown { color: #475569; background: 
#f1f5f9; border-color: rgba(148, 163, 184, 0.28); } + .evj-status-mark { + display: inline-flex; + align-items: center; + justify-content: center; + width: 14px; + height: 14px; + border-radius: 999px; + font-size: 0.62rem; + font-weight: 900; + line-height: 1; + border: 1px solid currentColor; + flex: 0 0 auto; + } + .evj-status-mark--success { + background: rgba(4, 120, 87, 0.08); + } + .evj-status-mark--failed { + background: rgba(185, 28, 28, 0.08); + } + .evj-status-mark--canceled { + background: rgba(124, 58, 237, 0.08); + } + .evj-status-mark--unknown { + background: rgba(71, 85, 105, 0.08); + } + .evj-status-mark--running { + position: relative; + border-radius: 999px; + border: 1.5px solid rgba(154, 103, 0, 0.18); + border-top-color: currentColor; + border-right-color: currentColor; + background: transparent; + animation: evj-spin 0.9s linear infinite; + } .evj-dot { width: 8px; height: 8px; @@ -1274,6 +1311,10 @@ def _inject_recent_evaluator_jobs_styles() -> None: 50% { transform: scale(1.2); opacity: 1; } 100% { transform: scale(0.9); opacity: 0.55; } } + @keyframes evj-spin { + 0% { transform: rotate(0deg); } + 100% { transform: rotate(360deg); } + } .evj-meta { color: #475569; font-size: 0.82rem; @@ -1437,7 +1478,13 @@ def _render_recent_evaluator_job_card(job: Dict[str, Any]) -> None: source_label = html.escape(job.get("source_label", "") or "—") report_url = html.escape(job.get("report_url", "") or "") source_url = html.escape(job.get("git_ref_url", "") or job.get("source_url", "") or "") - running_dot = '' if job.get("status_variant") == "running" else '' + status_variant = job.get("status_variant", "unknown") + status_mark = { + "running": '', + "success": '', + "failed": '', + "canceled": '', + }.get(status_variant, '') meta_line = f"id {job_id[:8]}" counts = ( f'S {int(job.get("success", 0))} · ' @@ -1463,7 +1510,7 @@ def _render_recent_evaluator_job_card(job: Dict[str, Any]) -> None:
{meta_line}
- {running_dot}{status} + {status_mark}{status}
{scheduled}
{duration} · {created_label} From f6502acfc33af02769f36a9901f3d084de639f35 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Thu, 14 May 2026 09:57:01 +0900 Subject: [PATCH 59/94] feat: enhance TP Summary visualizations and layout - Refactored the TP Summary section to allow for multiple candidate comparisons, improving clarity in delta views. - Updated figure captions to provide more descriptive context for comparisons between baseline and candidates. - Introduced a new function to apply a compact layout to charts, enhancing visual appeal and space efficiency. - Adjusted marker sizes and opacity for scatter plots to improve readability and aesthetics. - Streamlined the metric distribution section with a more compact layout for histograms and violin plots. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../lib/overview_pdf_report.py | 141 ++++++++++++++---- .../pages/1_TP_Summary.py | 91 +++++++---- 2 files changed, 168 insertions(+), 64 deletions(-) diff --git a/evaluation_dashboard_app/lib/overview_pdf_report.py b/evaluation_dashboard_app/lib/overview_pdf_report.py index 0332ee4..d3f636f 100644 --- a/evaluation_dashboard_app/lib/overview_pdf_report.py +++ b/evaluation_dashboard_app/lib/overview_pdf_report.py @@ -322,13 +322,21 @@ def _build_tp_summary_section( tables.append(metrics_table) if len(summaries) >= 2: - focus_idx = min(len(summaries) - 1, 1) - candidate_label = labels[focus_idx] - delta_df = build_summary_delta(summaries[0], summaries[focus_idx]) - if not delta_df.empty: - figures.extend(_build_tp_default_compare_figures(delta_df, candidate_label)) - else: - figures.append((_make_text_placeholder_figure("No overlapping Summary rows for delta view."), "Delta view is empty because the baseline and candidate do not share Summary keys.")) + baseline_lbl = labels[0] + for cand_idx in range(1, len(summaries)): + cand_lbl = labels[cand_idx] + delta_df = build_summary_delta(summaries[0], summaries[cand_idx]) + if delta_df.empty: + 
figures.append( + ( + _make_text_placeholder_figure( + f"No overlapping Summary rows for delta ({cand_lbl} vs {baseline_lbl})." + ), + f"Delta view is empty because baseline {baseline_lbl} and candidate {cand_lbl} do not share Summary keys.", + ) + ) + else: + figures.extend(_build_tp_default_compare_figures(delta_df, cand_lbl)) else: figures.extend(_build_tp_default_single_figures(summaries[0])) @@ -709,7 +717,9 @@ def _build_tp_default_compare_figures(df_delta: pd.DataFrame, candidate_label: s ) fig_rms_x.update_traces(marker=dict(size=8, opacity=0.6)) _apply_tp_clean_theme(fig_rms_x) - figures.append((fig_rms_x, "Default compare TP Summary view: X RMS candidate vs baseline.")) + figures.append( + (fig_rms_x, f"TP Summary compare ({candidate_label} vs baseline): X RMS scatter, colored by TP delta.") + ) fig_rms_y = px.scatter( df_delta, @@ -729,10 +739,22 @@ def _build_tp_default_compare_figures(df_delta: pd.DataFrame, candidate_label: s ) fig_rms_y.update_traces(marker=dict(size=8, opacity=0.6)) _apply_tp_clean_theme(fig_rms_y) - figures.append((fig_rms_y, "Default compare TP Summary view: Y RMS candidate vs baseline.")) + figures.append( + (fig_rms_y, f"TP Summary compare ({candidate_label} vs baseline): Y RMS scatter, colored by TP delta.") + ) - figures.append((_build_tp_distribution_figure(df_delta, "TP_delta"), "Default compare TP distribution view (metric = TP_delta).")) - figures.append((_build_tp_violin_figure(df_delta, "TP_delta"), "Default compare TP density violin for metric = TP_delta.")) + figures.append( + ( + _build_tp_distribution_figure(df_delta, "TP_delta"), + f"TP Summary compare ({candidate_label} vs baseline): TP delta distribution.", + ) + ) + figures.append( + ( + _build_tp_violin_figure(df_delta, "TP_delta"), + f"TP Summary compare ({candidate_label} vs baseline): TP delta violin.", + ) + ) return figures @@ -835,7 +857,9 @@ def _build_criteria_default_compare_figures(views: Sequence[Tuple[str, pd.DataFr figures: List[Tuple[go.Figure, 
str]] = [] metric = "pass_rate" group_by = "GT_OBJ" + run_order = [lbl for lbl, _ in views] combined = pd.concat([df.assign(Run=lbl) for lbl, df in views], ignore_index=True) + combined["Run"] = pd.Categorical(combined["Run"], categories=run_order, ordered=True) px_map = {lbl: _COMPARE_RUN_COLORS[i % len(_COMPARE_RUN_COLORS)] for i, (lbl, _) in enumerate(views)} fig_hist = px.histogram( @@ -843,6 +867,7 @@ def _build_criteria_default_compare_figures(views: Sequence[Tuple[str, pd.DataFr x=metric, color="Run", color_discrete_map=px_map, + category_orders={"Run": run_order}, nbins=30, barmode="overlay", opacity=0.55, @@ -851,13 +876,18 @@ def _build_criteria_default_compare_figures(views: Sequence[Tuple[str, pd.DataFr _apply_criteria_theme(fig_hist, f"{metric} · row-level distribution") figures.append((fig_hist, "Default compare overlay view for pass-rate distribution.")) - df_avg = combined.groupby([group_by, "Run"], as_index=False)[metric].mean().sort_values(metric, ascending=False) + df_avg = combined.groupby([group_by, "Run"], as_index=False)[metric].mean() + obj_means = df_avg.groupby(group_by, as_index=False)[metric].mean().sort_values(metric, ascending=False) + obj_order = [x for x in obj_means[group_by].tolist() if x in set(df_avg[group_by])] + df_avg[group_by] = pd.Categorical(df_avg[group_by], categories=obj_order, ordered=True) + df_avg = df_avg.sort_values([group_by, "Run"]) fig_bar = px.bar( df_avg, x=group_by, y=metric, color="Run", color_discrete_map=px_map, + category_orders={group_by: obj_order, "Run": run_order}, barmode="group", text_auto=".2f", ) @@ -870,6 +900,7 @@ def _build_criteria_default_compare_figures(views: Sequence[Tuple[str, pd.DataFr y="pass_rate", color="Run", color_discrete_map=px_map, + category_orders={group_by: obj_order, "Run": run_order}, points="all", ) _apply_criteria_theme(fig_box, "Pass rate overview") @@ -877,7 +908,16 @@ def _build_criteria_default_compare_figures(views: Sequence[Tuple[str, pd.DataFr scenario_delta = 
_build_criteria_compare_delta_figure(views) if scenario_delta is not None: - figures.append((scenario_delta, "Default compare per-scenario delta view for candidate B vs baseline A.")) + base_l = run_order[0] + if len(run_order) == 2: + cap = f"Default compare per-scenario delta view for candidate {run_order[1]} vs baseline {base_l}." + else: + rest = ", ".join(run_order[1:]) + cap = ( + f"Default compare per-scenario delta vs baseline {base_l} " + f"for candidates {rest} (grouped bars)." + ) + figures.append((scenario_delta, cap)) return figures @@ -899,26 +939,34 @@ def _build_criteria_compare_table(views: Sequence[Tuple[str, pd.DataFrame]]) -> for g in merges[1:]: per_scenario = per_scenario.merge(g, on="Scenario", how="inner") base = labels[0] - focus = labels[1] - delta_col = f"delta_{focus}" - per_scenario[delta_col] = per_scenario[f"pr_{focus}"] - per_scenario[f"pr_{base}"] - per_scenario = per_scenario.reindex(per_scenario[delta_col].abs().sort_values(ascending=False).index).head(20) - rows = [["Scenario", f"Pass rate ({base})", f"Pass rate ({focus})", f"Delta ({focus} - {base})"]] + delta_cols: List[str] = [] + for cand in labels[1:]: + dcol = f"delta_{cand}" + per_scenario[dcol] = per_scenario[f"pr_{cand}"] - per_scenario[f"pr_{base}"] + delta_cols.append(dcol) + rank_key = per_scenario[delta_cols].abs().max(axis=1) + per_scenario = per_scenario.reindex(rank_key.sort_values(ascending=False).index).head(20) + header: List[str] = ["Scenario", f"Pass rate ({base})"] + for cand in labels[1:]: + header.extend([f"Pass rate ({cand})", f"Δ({cand} - {base})"]) + rows = [header] for _, row in per_scenario.iterrows(): - rows.append([ - _shorten_scenario_name(str(row["Scenario"])), - _fmt_number(row[f"pr_{base}"]), - _fmt_number(row[f"pr_{focus}"]), - _fmt_number(row[delta_col]), - ]) - return {"rows": rows, "col_width_weights": [0.52, 0.16, 0.16, 0.16]} + cells: List[str] = [_shorten_scenario_name(str(row["Scenario"])), _fmt_number(row[f"pr_{base}"])] + for cand in 
labels[1:]: + cells.extend([_fmt_number(row[f"pr_{cand}"]), _fmt_number(row[f"delta_{cand}"])]) + rows.append(cells) + ncols = len(header) + scen_w = 0.34 if ncols > 4 else 0.52 + rest_w = (1.0 - scen_w) / max(ncols - 1, 1) + weights = [scen_w] + [rest_w] * (ncols - 1) + return {"rows": rows, "col_width_weights": weights} def _build_criteria_compare_delta_figure(views: Sequence[Tuple[str, pd.DataFrame]]) -> Optional[go.Figure]: if len(views) < 2: return None labels = [lbl for lbl, _ in views] - focus = labels[1] + base = labels[0] merges = [] for lbl, df in views: g = df.groupby("Scenario", as_index=False)["pass_rate"].mean() @@ -928,17 +976,44 @@ def _build_criteria_compare_delta_figure(views: Sequence[Tuple[str, pd.DataFrame per_scenario = per_scenario.merge(g, on="Scenario", how="inner") if per_scenario.empty: return None - delta_col = f"delta_{focus}" - per_scenario[delta_col] = per_scenario[f"pr_{focus}"] - per_scenario[f"pr_{labels[0]}"] - vis = per_scenario.reindex(per_scenario[delta_col].abs().sort_values(ascending=False).index).head(20) + long_rows: List[dict] = [] + delta_cols: List[str] = [] + for cand in labels[1:]: + dcol = f"delta_{cand}" + per_scenario[dcol] = per_scenario[f"pr_{cand}"] - per_scenario[f"pr_{base}"] + delta_cols.append(dcol) + rank_key = per_scenario[delta_cols].abs().max(axis=1) + vis = per_scenario.reindex(rank_key.sort_values(ascending=False).index).head(20) + scen_order = [_shorten_scenario_name(str(s)) for s in vis["Scenario"].tolist()] + for _, row in vis.iterrows(): + scen_disp = _shorten_scenario_name(str(row["Scenario"])) + for cand in labels[1:]: + long_rows.append( + { + "Scenario": scen_disp, + "vs_baseline": f"Δ({cand} - {base})", + "delta": float(row[f"delta_{cand}"]), + } + ) + melted = pd.DataFrame(long_rows) + if melted.empty: + return None + legend_order = [f"Δ({cand} - {base})" for cand in labels[1:]] + color_map = { + leg: _COMPARE_RUN_COLORS[(i + 1) % len(_COMPARE_RUN_COLORS)] + for i, leg in 
enumerate(legend_order) + } fig = px.bar( - vis, + melted, x="Scenario", - y=delta_col, - color=delta_col, - color_continuous_scale="RdYlGn", + y="delta", + color="vs_baseline", + color_discrete_map=color_map, + category_orders={"Scenario": scen_order, "vs_baseline": legend_order}, + barmode="group", text_auto=".2f", ) + fig.update_layout(coloraxis_showscale=False, legend_title_text="") _apply_criteria_theme(fig, "Pass rate delta by scenario") return fig diff --git a/evaluation_dashboard_app/pages/1_TP_Summary.py b/evaluation_dashboard_app/pages/1_TP_Summary.py index cd955d3..1425fca 100644 --- a/evaluation_dashboard_app/pages/1_TP_Summary.py +++ b/evaluation_dashboard_app/pages/1_TP_Summary.py @@ -76,6 +76,18 @@ mode=mode, ) + +def _apply_compact_chart_layout(fig, *, height: int = 300) -> None: + """Keep TP Summary charts visually lighter and more compact.""" + fig.update_layout( + template="plotly_white", + height=height, + margin=dict(t=48, b=40, l=48, r=18), + paper_bgcolor="rgba(248,250,252,0.9)", + plot_bgcolor="rgba(255,255,255,0.95)", + font=dict(family="system-ui, sans-serif", size=12, color="#334155"), + ) + # ========== View Selector ========== st.sidebar.markdown("##### Scope") if mode == "Compare Mode" and all_runs and run_labels and delta_by_label: @@ -211,7 +223,8 @@ section_header("Position RMS (X vs Y)", "Lateral vs longitudinal RMS error; color encodes TP or ΔTP.") # Always compare the two sources side by side (before and after/delta) if use_delta: - # Show both reference and target RMS comparisons for X and Y, as well as their deltas + # Show both reference and target RMS comparisons in a tighter 2-up row. 
+ rms_left, rms_right = st.columns(2) fig_rms_x_compare = px.scatter( df_f, x="xrms_B", @@ -225,11 +238,14 @@ "xrms_delta": "Δ X RMS", "yrms_delta": "Δ Y RMS", }, - title=f"Scatter: X RMS ({cand}) vs X RMS (A)", + title=f"X RMS · {cand} vs A", color_continuous_scale="Viridis", ) - fig_rms_x_compare.update_traces(marker=dict(size=8, opacity=0.6)) - st.plotly_chart(fig_rms_x_compare, width="stretch") + fig_rms_x_compare.update_traces(marker=dict(size=7, opacity=0.58)) + _apply_compact_chart_layout(fig_rms_x_compare, height=290) + with rms_left: + st.plotly_chart(fig_rms_x_compare, width="stretch") + fig_rms_y_compare = px.scatter( df_f, x="yrms_B", @@ -243,11 +259,13 @@ "xrms_delta": "Δ X RMS", "yrms_delta": "Δ Y RMS", }, - title=f"Scatter: Y RMS ({cand}) vs Y RMS (A)", + title=f"Y RMS · {cand} vs A", color_continuous_scale="Viridis", ) - fig_rms_y_compare.update_traces(marker=dict(size=8, opacity=0.6)) - st.plotly_chart(fig_rms_y_compare, width="stretch") + fig_rms_y_compare.update_traces(marker=dict(size=7, opacity=0.58)) + _apply_compact_chart_layout(fig_rms_y_compare, height=290) + with rms_right: + st.plotly_chart(fig_rms_y_compare, width="stretch") else: # Just show the submission's RMS (x/y) for standard analysis fig_rms = px.scatter( @@ -263,13 +281,14 @@ }, color_continuous_scale="Viridis", ) - fig_rms.update_traces(marker=dict(size=8, opacity=0.7)) + fig_rms.update_traces(marker=dict(size=8, opacity=0.68)) + _apply_compact_chart_layout(fig_rms, height=320) st.plotly_chart(fig_rms, width="stretch") with col2: section_header("Velocity (vx vs vy)", "Planar velocity colored by TP or ΔTP.") - def plot_velocity(df, vx, vy, vx_label, vy_label): + def plot_velocity(df, vx, vy, vx_label, vy_label, *, title: str): fig = px.scatter( df, x=vx, @@ -282,18 +301,32 @@ def plot_velocity(df, vx, vy, vx_label, vy_label): tp_col: "TP", }, color_continuous_scale="Plasma", - title=f"{vx_label} vs {vy_label}", + title=title, ) - st.plotly_chart(fig, width="stretch") + 
fig.update_traces(marker=dict(size=7, opacity=0.58)) + _apply_compact_chart_layout(fig, height=290 if use_delta else 320) + return fig if use_delta: - plot_velocity(df_f, "vx", "vy", "Vx (A)", "Vy (A)") - plot_velocity(df_f, "vx_B", "vy_B", f"Vx ({cand})", f"Vy ({cand})") + vel_left, vel_right = st.columns(2) + with vel_left: + st.plotly_chart( + plot_velocity(df_f, "vx", "vy", "Vx (A)", "Vy (A)", title="Velocity · A"), + width="stretch", + ) + with vel_right: + st.plotly_chart( + plot_velocity(df_f, "vx_B", "vy_B", f"Vx ({cand})", f"Vy ({cand})", title=f"Velocity · {cand}"), + width="stretch", + ) else: - plot_velocity(df_f, "vx", "vy", "Vx", "Vy") + st.plotly_chart( + plot_velocity(df_f, "vx", "vy", "Vx", "Vy", title="Velocity"), + width="stretch", + ) # ========== Metric Distribution ========== -section_header("Metric distribution", "Histogram + marginal box for any Summary column or delta column.") +section_header("Metric distribution", "Compact secondary views for a selected Summary metric.") metrics = ["xstd", "ystd", "xrms", "yrms", "vx", "vy", "TP"] metrics_delta = [f"{m}_delta" for m in metrics] metric_options = metrics_delta if use_delta else metrics @@ -304,46 +337,42 @@ def plot_velocity(df, vx, vy, vx_label, vy_label): default_index = 0 metric = st.selectbox("Select metric", metric_options, index=default_index) -# Show a simple, single-color (monochrome) distribution for clarity +dist_left, dist_right = st.columns(2) + fig_hist = px.histogram( df_f, x=metric, - nbins=40, + nbins=36, color_discrete_sequence=["#0d9488"], marginal="box", opacity=0.88, + title=f"{metric} distribution", ) fig_hist.update_layout( - template="plotly_white", showlegend=False, bargap=0.04, xaxis_title=metric, yaxis_title="Count", - paper_bgcolor="rgba(248,250,252,0.9)", - plot_bgcolor="rgba(255,255,255,0.95)", - font=dict(family="system-ui, sans-serif", size=12, color="#334155"), - margin=dict(t=36, b=48, l=56, r=28), ) -st.plotly_chart(fig_hist, width="stretch") 
+_apply_compact_chart_layout(fig_hist, height=280) +with dist_left: + st.plotly_chart(fig_hist, width="stretch") -section_header("Density (violin)", "Shape of the selected metric including outliers.") fig_density = px.violin( df_f, y=metric, box=True, - points="all", + points="outliers", color_discrete_sequence=["#312e81"], + title=f"{metric} density", ) fig_density.update_layout( - template="plotly_white", yaxis_title=metric, showlegend=False, - paper_bgcolor="rgba(248,250,252,0.9)", - plot_bgcolor="rgba(255,255,255,0.95)", - font=dict(family="system-ui, sans-serif", size=12, color="#334155"), - margin=dict(t=36, b=48, l=56, r=28), ) -st.plotly_chart(fig_density, width="stretch") +_apply_compact_chart_layout(fig_density, height=280) +with dist_right: + st.plotly_chart(fig_density, width="stretch") # ========== Scenario-level Delta Analysis (Compare Mode) ========== df_cmp = df_active if use_delta else None From 4805eed8bad1b8a7ec1c1f5e8a8b56c70154db06 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Thu, 14 May 2026 10:02:26 +0900 Subject: [PATCH 60/94] feat: improve pagination and styling for recent evaluator jobs - Added new CSS styles for pagination buttons, enhancing the visual representation and interactivity of job navigation. - Updated the job fetching logic to improve pagination handling, allowing for better user experience when browsing through recent evaluator jobs. - Refactored the rendering of pagination controls to include dynamic button states, improving clarity and usability. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/pages/6_Download.py | 52 ++++++++++++++------ 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/evaluation_dashboard_app/pages/6_Download.py b/evaluation_dashboard_app/pages/6_Download.py index 126b6e4..79bc03a 100644 --- a/evaluation_dashboard_app/pages/6_Download.py +++ b/evaluation_dashboard_app/pages/6_Download.py @@ -1387,6 +1387,7 @@ def _inject_recent_evaluator_jobs_styles() -> None: [class*="st-key-recent_eval_run_"] button, [class*="st-key-recent_eval_jobs_prev"] button, [class*="st-key-recent_eval_jobs_next"] button, + [class*="st-key-recent_eval_jobs_pagebtn_"] button, [class*="st-key-refresh_recent_eval_jobs"] button { min-height: 2rem; padding: 0.18rem 0.58rem; @@ -1398,6 +1399,7 @@ def _inject_recent_evaluator_jobs_styles() -> None: [class*="st-key-recent_eval_view_"] button, [class*="st-key-recent_eval_jobs_prev"] button, [class*="st-key-recent_eval_jobs_next"] button, + [class*="st-key-recent_eval_jobs_pagebtn_"] button, [class*="st-key-refresh_recent_eval_jobs"] button { border-color: rgba(148, 163, 184, 0.34); color: #334155; @@ -1406,11 +1408,17 @@ def _inject_recent_evaluator_jobs_styles() -> None: [class*="st-key-recent_eval_view_"] button:hover, [class*="st-key-recent_eval_jobs_prev"] button:hover, [class*="st-key-recent_eval_jobs_next"] button:hover, + [class*="st-key-recent_eval_jobs_pagebtn_"] button:hover, [class*="st-key-refresh_recent_eval_jobs"] button:hover { border-color: rgba(15, 118, 110, 0.28); color: #0f766e; background: #f8fffd; } + [class*="st-key-recent_eval_jobs_pagebtn_active_"] button { + border-color: rgba(13, 148, 136, 0.26); + background: linear-gradient(180deg, #f0fdfa, #ecfeff); + color: #0f766e; + } [class*="st-key-recent_eval_run_"] button { border-color: rgba(13, 148, 136, 0.22); background: linear-gradient(180deg, #f0fdfa, #ecfeff); @@ -1840,7 +1848,7 @@ def _render_job_list() -> None: st.info("Enter a project 
id in the sidebar to browse recent evaluator jobs.") return current_page = max(1, int(st.session_state.get(page_key, 1))) - fetch_limit = max(limit * (current_page + 1), limit + 1) + fetch_limit = max(limit * 3, limit * (current_page + 2), limit + 1) try: jobs = _fetch_recent_evaluator_jobs(project_id, environment, fetch_limit) except Exception as e: @@ -1876,25 +1884,39 @@ def _render_job_list() -> None: visible_jobs = jobs[start_idx:end_idx] has_next_page = total_loaded > current_page * limit - pager_cols = st.columns([0.9, 1.1, 5.2, 0.9, 1.2]) + if current_page == 1: + page_numbers = list(range(1, min(3, max_known_page) + 1)) + else: + page_numbers = list( + range( + max(1, current_page - 1), + min(max_known_page, current_page + 1) + 1, + ) + ) + pager_cols = st.columns([0.8, 0.9, 0.9, 0.9, 0.8, 5.7]) with pager_cols[0]: - if st.button("Back", key="recent_eval_jobs_prev", use_container_width=True, disabled=current_page <= 1): + if st.button("‹", key="recent_eval_jobs_prev", use_container_width=True, disabled=current_page <= 1): st.session_state[page_key] = max(1, current_page - 1) st.rerun() - with pager_cols[1]: - st.markdown( - f"
{current_page}
", - unsafe_allow_html=True, - ) - with pager_cols[3]: - if st.button("More", key="recent_eval_jobs_next", use_container_width=True, disabled=not has_next_page): + for idx, page_num in enumerate(page_numbers[:3], start=1): + with pager_cols[idx]: + btn_key = ( + f"recent_eval_jobs_pagebtn_active_{page_num}" + if page_num == current_page + else f"recent_eval_jobs_pagebtn_{page_num}" + ) + if st.button( + str(page_num), + key=btn_key, + use_container_width=True, + disabled=page_num == current_page, + ): + st.session_state[page_key] = page_num + st.rerun() + with pager_cols[4]: + if st.button("›", key="recent_eval_jobs_next", use_container_width=True, disabled=not has_next_page): st.session_state[page_key] = current_page + 1 st.rerun() - with pager_cols[4]: - st.markdown( - f"
{len(visible_jobs)} shown · {total_loaded}+ loaded
", - unsafe_allow_html=True, - ) selected_job_id = st.session_state.get("recent_eval_jobs_selected") if selected_job_id and not any(str(job.get("job_id", "")) == str(selected_job_id) for job in jobs): From 707dd1658f1e89343aafacc2463cff384a95d9fc Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Thu, 14 May 2026 11:18:40 +0900 Subject: [PATCH 61/94] feat: enhance job search and filtering capabilities in download page - Introduced new functions for normalizing UI status filters into API values, improving the accuracy of job status searches. - Added functionality to escape wildcard characters in search inputs, enhancing the robustness of search filters. - Implemented a comprehensive mapping of quick-search UI options to server-side filters, allowing users to search by various job attributes. - Enhanced date-range filtering for job searches, providing users with more precise control over the evaluation timeline. - Refactored the job fetching logic to support pagination and additional filtering options, improving the overall user experience when browsing recent evaluator jobs. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/lib/evaluator_api.py | 29 ++ evaluation_dashboard_app/pages/6_Download.py | 273 ++++++++++++++++-- 2 files changed, 272 insertions(+), 30 deletions(-) diff --git a/evaluation_dashboard_app/lib/evaluator_api.py b/evaluation_dashboard_app/lib/evaluator_api.py index c339b1b..ba2082a 100644 --- a/evaluation_dashboard_app/lib/evaluator_api.py +++ b/evaluation_dashboard_app/lib/evaluator_api.py @@ -512,6 +512,35 @@ def get_report_list( if max_results is not None and len(reports) >= max_results: return reports[:max_results] + def search_report_list( + self, + project_id: str, + *, + filters: Optional[list[dict[str, Any]]] = None, + sort: Optional[list[dict[str, Any]]] = None, + next_token: str = "", + size: int = 100, + ) -> dict[str, Any]: + url = f"{self.api_base_url}/projects/{project_id}/jobs/reports/search" + payload: dict[str, Any] = { + "size": max(1, min(int(size), 100)), + } + if next_token: + payload["next_token"] = next_token + if filters: + payload["filters"] = filters + if sort: + payload["sort"] = sort + + response = self.request(url, payload, method="POST") + if response is None: + raise EvaluationAPIError("No response returned from evaluation API") + if response.status_code != 200: + raise EvaluationAPIError( + f"Failed to search report list: status={response.status_code}, body={response.text}" + ) + return json.loads(response.content) + def get_suite_reports(self, project_id: str, job_id: str) -> list[dict[str, Any]]: return self._get_paginated_reports( f"{self.api_base_url}/projects/{project_id}/jobs/{job_id}/test/suite/reports" diff --git a/evaluation_dashboard_app/pages/6_Download.py b/evaluation_dashboard_app/pages/6_Download.py index 79bc03a..e99d8ad 100644 --- a/evaluation_dashboard_app/pages/6_Download.py +++ b/evaluation_dashboard_app/pages/6_Download.py @@ -1100,6 +1100,119 @@ def _status_display_label(status: str) -> str: return normalized or "unknown" 
+def _status_filter_values(selected_statuses: List[str]) -> List[str]: + """Normalize UI status filters into API status values.""" + values: List[str] = [] + for raw in selected_statuses: + normalized = evaluator_api.normalize_job_status(raw) + if normalized == "unknown" or not normalized: + continue + if normalized == "running": + values.extend(["running", "started"]) + elif normalized == "success": + values.extend(["success", "succeeded"]) + elif normalized == "failed": + values.extend(["failed", "failure", "error"]) + elif normalized == "canceled": + values.extend(["canceled", "cancelled", "aborted"]) + else: + values.append(normalized) + return sorted(set(values)) + + +def _escape_search_match_value(value: str) -> str: + """Escape wildcard characters for API Match filters.""" + return ( + value.replace("\\", "\\\\") + .replace("*", "\\*") + .replace("?", "\\?") + ) + + +def _build_recent_job_search_filter( + search_text: str, + search_scope: str, +) -> tuple[Optional[Dict[str, Any]], str]: + """Map quick-search UI to one server-side filter and a client-side needle.""" + needle = search_text.strip() + if not needle: + return None, "" + + if search_scope == "Branch/tag": + return ( + { + "field": "event.source.git_ref", + "operator": "Match", + "values": [f"*{_escape_search_match_value(needle)}*"], + }, + needle.lower(), + ) + if search_scope == "Description": + return ( + { + "field": "description", + "operator": "Match", + "values": [f"*{_escape_search_match_value(needle)}*"], + }, + needle.lower(), + ) + if search_scope == "Job ID": + return ( + { + "field": "job_id", + "operator": "In", + "values": [needle], + }, + needle.lower(), + ) + if search_scope == "Git SHA": + return ( + { + "field": "event.source.git_sha", + "operator": "Match", + "values": [f"*{_escape_search_match_value(needle)}*"], + }, + needle.lower(), + ) + if search_scope == "Fail message": + return ( + { + "field": "fail_message", + "operator": "Match", + "values": 
[f"*{_escape_search_match_value(needle)}*"], + }, + needle.lower(), + ) + return None, needle.lower() + + +def _build_recent_job_date_filters( + date_from: Optional[datetime.date], + date_to: Optional[datetime.date], +) -> List[Dict[str, Any]]: + """Build scheduled_at date-range filters for the search API.""" + filters: List[Dict[str, Any]] = [] + if date_from: + start_dt = datetime(date_from.year, date_from.month, date_from.day, 0, 0, 0, tzinfo=_JST) + filters.append( + { + "field": "scheduled_at", + "operator": "Gte", + "values": [start_dt.astimezone(timezone.utc).isoformat()], + } + ) + if date_to: + end_dt = datetime(date_to.year, date_to.month, date_to.day, 23, 59, 59, tzinfo=_JST) + filters.append( + { + "field": "scheduled_at", + "operator": "Lte", + "values": [end_dt.astimezone(timezone.utc).isoformat()], + } + ) + return filters + + def _summarize_recent_job(report: Dict[str, Any]) -> Dict[str, Any]: """Compact summary for one evaluator job card.""" status = evaluator_api.extract_job_status(report) @@ -1140,22 +1253,56 @@ def _summarize_recent_job(report: Dict[str, Any]) -> Dict[str, Any]: @st.cache_data(ttl=30, show_spinner=False) -def _fetch_recent_evaluator_jobs(project_id: str, environment: str, limit: int) -> List[Dict[str, Any]]: - """Fetch recent evaluator jobs and normalize them for list rendering.""" +def _fetch_recent_evaluator_job_pages( + project_id: str, + environment: str, + page_size: int, + pages_to_fetch: int, + status_values: tuple[str, ...] = (), + extra_filters: tuple[tuple[str, str, tuple[Any, ...]], ...] 
= (), +) -> List[Dict[str, Any]]: + """Fetch recent evaluator jobs from the search endpoint page-by-page.""" if not project_id: return [] os.environ["AUTH_PROFILE"] = environment or ENVIRONMENT api = evaluator_api.EvaluationRunAPI() - reports = api.get_report_list(project_id, status="all", max_results=max(1, int(limit))) - reports = sorted( - reports, - key=lambda report: _parse_api_dt(report.get("scheduled_at") or report.get("started_at") or report.get("finished_at")) or datetime.min.replace(tzinfo=timezone.utc), - reverse=True, - ) - normalized = [] - for report in reports[:limit]: - normalized.append(_summarize_recent_job(report)) - return normalized + filters: List[Dict[str, Any]] = [] + if status_values: + filters.append( + { + "field": "status", + "operator": "In", + "values": list(status_values), + } + ) + for field, operator, values in extra_filters: + filters.append( + { + "field": field, + "operator": operator, + "values": list(values), + } + ) + next_token = "" + pages: List[Dict[str, Any]] = [] + for _ in range(max(1, int(pages_to_fetch))): + data = api.search_report_list( + project_id, + filters=filters or None, + next_token=next_token, + size=max(1, min(int(page_size), 100)), + ) + reports = data.get("reports", []) or [] + pages.append( + { + "jobs": [_summarize_recent_job(report) for report in reports], + "next_token": data.get("next_token", "") or "", + } + ) + next_token = data.get("next_token", "") or "" + if not next_token: + break + return pages @st.cache_data(ttl=30, show_spinner=False) @@ -1493,7 +1640,7 @@ def _render_recent_evaluator_job_card(job: Dict[str, Any]) -> None: "failed": '', "canceled": '', }.get(status_variant, '') - meta_line = f"id {job_id[:8]}" + meta_line = job_id counts = ( f'S {int(job.get("success", 0))} · ' f'F {int(job.get("failed", 0))} · ' @@ -1798,7 +1945,7 @@ def _render_recent_evaluator_jobs_section( if flash_message: st.success(flash_message) - control_cols = st.columns([0.95, 1.45, 1.65, 0.9]) + control_cols = 
st.columns([0.8, 1.1, 1.25, 1.55, 1.2, 1.2, 0.75]) with control_cols[0]: st.markdown('
Rows
', unsafe_allow_html=True) limit = int( @@ -1823,43 +1970,109 @@ def _render_recent_evaluator_jobs_section( placeholder="All statuses", ) with control_cols[2]: - st.markdown('
Branch Or Tag
', unsafe_allow_html=True) - branch_filter = st.text_input( - "Branch/tag contains", - value=st.session_state.get("recent_eval_jobs_branch_filter", ""), - key="recent_eval_jobs_branch_filter", - help="Optional substring filter for branch or tag name.", + st.markdown('
Search In
', unsafe_allow_html=True) + search_scope = st.selectbox( + "Search in", + options=["Branch/tag", "Description", "Job ID", "Git SHA", "Fail message"], + index=0, + key="recent_eval_jobs_search_scope", + help="Choose which evaluator field the quick search should target.", label_visibility="collapsed", - placeholder="Filter by branch or tag", - ).strip() + ) with control_cols[3]: + st.markdown('
Search
', unsafe_allow_html=True) + search_text = st.text_input( + "Search", + value=st.session_state.get("recent_eval_jobs_search_text", ""), + key="recent_eval_jobs_search_text", + help="Server-side search across the selected field.", + label_visibility="collapsed", + placeholder="Type to search evaluator jobs", + ).strip() + with control_cols[4]: + st.markdown('
From
', unsafe_allow_html=True) + date_from = st.date_input( + "From", + value=st.session_state.get("recent_eval_jobs_date_from", None), + key="recent_eval_jobs_date_from", + label_visibility="collapsed", + help="Scheduled-at lower bound in JST.", + ) + with control_cols[5]: + st.markdown('
To
', unsafe_allow_html=True) + date_to = st.date_input( + "To", + value=st.session_state.get("recent_eval_jobs_date_to", None), + key="recent_eval_jobs_date_to", + label_visibility="collapsed", + help="Scheduled-at upper bound in JST.", + ) + with control_cols[6]: st.markdown('
Actions
', unsafe_allow_html=True) if st.button("Refresh", key="refresh_recent_eval_jobs", use_container_width=True): - _fetch_recent_evaluator_jobs.clear() + _fetch_recent_evaluator_job_pages.clear() _fetch_evaluator_job_detail.clear() st.rerun() page_key = "recent_eval_jobs_page" if page_key not in st.session_state: st.session_state[page_key] = 1 + if date_from and date_to and date_from > date_to: + st.warning("`From` date must be earlier than or equal to `To` date.") + return def _render_job_list() -> None: if not project_id: st.info("Enter a project id in the sidebar to browse recent evaluator jobs.") return current_page = max(1, int(st.session_state.get(page_key, 1))) - fetch_limit = max(limit * 3, limit * (current_page + 2), limit + 1) + pages_to_fetch = max(3, current_page + 2) + if search_text or status_filter or date_from or date_to: + pages_to_fetch = max(pages_to_fetch, 6) + server_status_values = tuple(_status_filter_values(status_filter)) + server_search_filter, search_needle = _build_recent_job_search_filter(search_text, search_scope) + server_date_filters = _build_recent_job_date_filters(date_from, date_to) + extra_filters: List[Dict[str, Any]] = [] + if server_search_filter: + extra_filters.append(server_search_filter) + extra_filters.extend(server_date_filters) + extra_filter_tuples = tuple( + ( + str(f["field"]), + str(f["operator"]), + tuple(f.get("values", []) or []), + ) + for f in extra_filters + ) try: - jobs = _fetch_recent_evaluator_jobs(project_id, environment, fetch_limit) + fetched_pages = _fetch_recent_evaluator_job_pages( + project_id, + environment, + limit, + pages_to_fetch, + status_values=server_status_values, + extra_filters=extra_filter_tuples, + ) except Exception as e: st.error(f"Could not fetch recent evaluator jobs: {e}") return - if branch_filter: - branch_filter_lower = branch_filter.lower() - jobs = [job for job in jobs if branch_filter_lower in str(job.get("target", "")).lower()] + jobs = [job for page in fetched_pages for job in 
page.get("jobs", [])] + has_more_from_api = bool(fetched_pages and fetched_pages[-1].get("next_token")) + + if search_needle: + if search_scope == "Branch/tag": + jobs = [job for job in jobs if search_needle in str(job.get("target", "")).lower()] + elif search_scope == "Description": + jobs = [job for job in jobs if search_needle in str(job.get("description", "")).lower() or search_needle in str(job.get("title", "")).lower()] + elif search_scope == "Job ID": + jobs = [job for job in jobs if search_needle in str(job.get("job_id", "")).lower()] + elif search_scope == "Git SHA": + jobs = [job for job in jobs if search_needle in str(job.get("git_sha", "")).lower()] + elif search_scope == "Fail message": + jobs = [job for job in jobs if search_needle in str(job.get("fail_message", "")).lower()] if status_filter: - selected = set(status_filter) + selected = {evaluator_api.normalize_job_status(v) for v in status_filter} jobs = [job for job in jobs if job.get("status_variant") in selected or evaluator_api.normalize_job_status(job.get("status", "")) in selected] if not jobs: @@ -1868,7 +2081,7 @@ def _render_job_list() -> None: return total_loaded = len(jobs) - has_next_page = total_loaded > current_page * limit + has_next_page = total_loaded > current_page * limit or has_more_from_api max_known_page = max(1, (total_loaded + limit - 1) // limit) if current_page > max_known_page: current_page = max_known_page From fc818641d6d8348b7ce8af4378d2aa4bb0f28128 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Thu, 14 May 2026 13:13:33 +0900 Subject: [PATCH 62/94] feat: enhance job search functionality with user filtering and recent search history - Added support for searching jobs by the user who scheduled them, improving the filtering options available to users. - Implemented functions to manage recent job search history, allowing users to quickly access previous searches for enhanced usability. 
- Updated the UI to include a dropdown for recent searches, streamlining the job search process and improving user experience. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/pages/6_Download.py | 216 ++++++++++++++++++- 1 file changed, 205 insertions(+), 11 deletions(-) diff --git a/evaluation_dashboard_app/pages/6_Download.py b/evaluation_dashboard_app/pages/6_Download.py index e99d8ad..2befa03 100644 --- a/evaluation_dashboard_app/pages/6_Download.py +++ b/evaluation_dashboard_app/pages/6_Download.py @@ -1057,7 +1057,7 @@ def _extract_suite_selection_options(suite_rows: List[Dict[str, Any]]) -> List[D options: List[Dict[str, str]] = [] seen_ids = set() for row in suite_rows or []: - report_url = str(row.get("url") or "").strip() + report_url = str(row.get("url") or row.get("Report") or "").strip() suite_id = "" if "/tests/" in report_url: tail = report_url.split("/tests/", 1)[1] @@ -1065,7 +1065,7 @@ def _extract_suite_selection_options(suite_rows: List[Dict[str, Any]]) -> List[D if not suite_id or suite_id in seen_ids: continue seen_ids.add(suite_id) - suite_name = str(row.get("name") or suite_id).strip() + suite_name = str(row.get("name") or row.get("Suite") or suite_id).strip() options.append({"id": suite_id, "label": f"{suite_name} ({suite_id})"}) return options @@ -1132,6 +1132,7 @@ def _escape_search_match_value(value: str) -> str: def _build_recent_job_search_filter( search_text: str, search_scope: str, + user_directory: Optional[Dict[str, Dict[str, str]]] = None, ) -> tuple[Optional[Dict[str, Any]], str]: """Map quick-search UI to one server-side filter and a client-side needle.""" needle = search_text.strip() @@ -1186,6 +1187,123 @@ def _build_recent_job_search_filter( return None, needle.lower() +def _recent_job_search_history_key(scope: str) -> str: + return f"recent_eval_jobs_search_history::{scope}" + + +def _get_recent_job_search_history(scope: str) -> List[str]: + stored = 
get_config_value(_recent_job_search_history_key(scope), []) or [] + if not isinstance(stored, list): + return [] + return [str(v).strip() for v in stored if str(v).strip()] + + +def _save_recent_job_search_history(scope: str, value: str, *, max_items: int = 8) -> None: + text = str(value).strip() + if not text: + return + history = _get_recent_job_search_history(scope) + updated = [text] + [item for item in history if item != text] + set_config_value(_recent_job_search_history_key(scope), updated[:max_items]) + + +def _get_recent_eval_user_directory() -> Dict[str, Dict[str, str]]: + stored = get_config_value("recent_eval_jobs_user_directory", {}) or {} + if not isinstance(stored, dict): + return {} + normalized: Dict[str, Dict[str, str]] = {} + for subject_id, info in stored.items(): + if not isinstance(info, dict): + continue + normalized[str(subject_id)] = { + "name": str(info.get("name") or "").strip(), + "email": str(info.get("email") or "").strip(), + "subject_id": str(info.get("subject_id") or subject_id).strip(), + } + return normalized + + +def _save_recent_eval_user_directory(directory: Dict[str, Dict[str, str]]) -> None: + set_config_value("recent_eval_jobs_user_directory", directory) + + +@st.cache_data(ttl=24 * 3600, show_spinner=False) +def _fetch_auth_member_profile(subject_id: str, environment: str) -> Dict[str, str]: + subject = str(subject_id or "").strip() + if not subject: + return {} + org_id = os.environ.get( + "WEBAUTO_ORGANIZATION_ID", + "5a21621d-6968-4f7d-94f8-99cfb77b6e71", + ).strip() + if not org_id: + return {"subject_id": subject, "name": subject, "email": ""} + os.environ["AUTH_PROFILE"] = environment or ENVIRONMENT + from webautoauth.token import HttpService, TokenSource, load_config + + config = load_config() + token_source = TokenSource(HttpService(config)) + access_token = token_source.get_token().access_token + quoted_subject = urllib.parse.quote(subject, safe="") + url = 
f"https://auth.web.auto/v2/organizations/{org_id}/members/{quoted_subject}" + response = requests.get( + url, + headers={"Authorization": f"Bearer {access_token}", "accept": "application/json"}, + timeout=10, + ) + response.raise_for_status() + data = response.json() + return { + "subject_id": str(data.get("subject_id") or subject), + "name": str(data.get("name") or subject).strip(), + "email": str(data.get("email") or "").strip(), + } + + +def _hydrate_recent_eval_user_directory( + jobs: List[Dict[str, Any]], + environment: str, +) -> Dict[str, Dict[str, str]]: + directory = _get_recent_eval_user_directory() + unresolved = sorted( + { + str(job.get("scheduled_by") or "").strip() + for job in jobs + if str(job.get("scheduled_by") or "").strip() + and str(job.get("scheduled_by") or "").strip() not in directory + } + ) + if not unresolved: + return directory + + updates: Dict[str, Dict[str, str]] = {} + with ThreadPoolExecutor(max_workers=min(6, len(unresolved))) as executor: + future_map = { + executor.submit(_fetch_auth_member_profile, subject_id, environment): subject_id + for subject_id in unresolved + } + for future in as_completed(future_map): + subject_id = future_map[future] + try: + profile = future.result() + except Exception: + profile = { + "subject_id": subject_id, + "name": subject_id, + "email": "", + } + updates[subject_id] = { + "subject_id": str(profile.get("subject_id") or subject_id).strip(), + "name": str(profile.get("name") or subject_id).strip(), + "email": str(profile.get("email") or "").strip(), + } + + if updates: + directory = {**directory, **updates} + _save_recent_eval_user_directory(directory) + return directory + + def _build_recent_job_date_filters( date_from: Optional[datetime.date], date_to: Optional[datetime.date], @@ -1239,6 +1357,7 @@ def _summarize_recent_job(report: Dict[str, Any]) -> Dict[str, Any]: "finished_at": report.get("finished_at"), "duration": _format_duration(report.get("started_at"), report.get("finished_at")), 
"created_label": _format_relative_time(report.get("scheduled_at") or report.get("started_at")), + "scheduled_by": str(report.get("scheduled_by") or ""), "report_url": evaluator_api.get_job_report_url(report.get("project_id", ""), report.get("job_id") or report.get("id") or ""), "fail_message": report.get("fail_message", ""), "total": totals["total"], @@ -1357,7 +1476,7 @@ def _inject_recent_evaluator_jobs_styles() -> None: .evj-top { justify-content: space-between; } .evj-row { display: grid; - grid-template-columns: minmax(180px, 1.35fr) minmax(86px, 0.5fr) minmax(108px, 0.7fr) minmax(180px, 1.25fr) minmax(190px, 1.15fr); + grid-template-columns: minmax(180px, 1.3fr) minmax(86px, 0.5fr) minmax(108px, 0.7fr) minmax(180px, 1.15fr) minmax(130px, 0.9fr) minmax(180px, 1.1fr); gap: 8px; align-items: center; } @@ -1615,7 +1734,7 @@ def _inject_recent_evaluator_jobs_styles() -> None: ) -def _render_recent_evaluator_job_card(job: Dict[str, Any]) -> None: +def _render_recent_evaluator_job_card(job: Dict[str, Any], *, user_label: str = "Unknown") -> None: """Render one recent evaluator job as a single-row list item.""" variant = html.escape(job.get("status_variant", "unknown")) status = html.escape(_status_display_label(job.get("status", "unknown") or "unknown")) @@ -1631,6 +1750,7 @@ def _render_recent_evaluator_job_card(job: Dict[str, Any]) -> None: created_label = html.escape(job.get("created_label", "—")) git_sha = html.escape(job.get("git_sha", "") or "—") source_label = html.escape(job.get("source_label", "") or "—") + user_text = html.escape(user_label or "Unknown") report_url = html.escape(job.get("report_url", "") or "") source_url = html.escape(job.get("git_ref_url", "") or job.get("source_url", "") or "") status_variant = job.get("status_variant", "unknown") @@ -1673,6 +1793,9 @@ def _render_recent_evaluator_job_card(job: Dict[str, Any]) -> None:
{catalog_html}
{source_html}
+
+ {user_text} +
build {build_status} · test {test_status} · {git_sha}
{counts} @@ -1790,11 +1913,21 @@ def _render_recent_evaluator_job_run_dialog( help="Folder under the data directory. This uses the same safe path rules as the main download workflow.", ) + if not suite_labels: + hint_cols = st.columns([1.2, 2.8]) + with hint_cols[0]: + if st.form_submit_button("Refresh suites", use_container_width=True): + _fetch_evaluator_job_detail.clear() + st.rerun() + with hint_cols[1]: + st.caption("No suite candidates were available yet for this job. Refresh to re-read suite data from the evaluator API.") + selected_suite_labels = st.multiselect( "Suites to download (optional)", options=suite_labels, default=[], help="Leave empty to download all suites from this evaluator job.", + disabled=not suite_labels, ) run_download_type = st.radio( @@ -1944,8 +2077,9 @@ def _render_recent_evaluator_jobs_section( flash_message = st.session_state.pop("recent_eval_jobs_flash", None) if flash_message: st.success(flash_message) + user_directory = _get_recent_eval_user_directory() - control_cols = st.columns([0.8, 1.1, 1.25, 1.55, 1.2, 1.2, 0.75]) + control_cols = st.columns([0.75, 1.0, 1.15, 1.45, 1.25, 1.0, 1.0, 0.75]) with control_cols[0]: st.markdown('
Rows
', unsafe_allow_html=True) limit = int( @@ -1989,7 +2123,38 @@ def _render_recent_evaluator_jobs_section( label_visibility="collapsed", placeholder="Type to search evaluator jobs", ).strip() + recent_candidates = _get_recent_job_search_history(search_scope) + selected_user_name = "" + if recent_candidates: + recent_choice = st.selectbox( + "Recent searches", + options=[""] + recent_candidates, + index=0, + key=f"recent_eval_jobs_search_recent::{search_scope}", + help="Reuse a previously entered search for this field.", + ) + if recent_choice and recent_choice != search_text: + st.session_state["recent_eval_jobs_search_text"] = recent_choice + st.rerun() + user_candidates = sorted( + { + info.get("name", "").strip() + for info in user_directory.values() + if info.get("name", "").strip() + }, + key=str.lower, + ) with control_cols[4]: + st.markdown('
User
', unsafe_allow_html=True) + selected_user_name = st.selectbox( + "User", + options=[""] + user_candidates, + index=0, + key="recent_eval_jobs_user_filter", + help="Filter jobs by resolved scheduled user name.", + label_visibility="collapsed", + ) + with control_cols[5]: st.markdown('
From
', unsafe_allow_html=True) date_from = st.date_input( "From", @@ -1998,7 +2163,7 @@ def _render_recent_evaluator_jobs_section( label_visibility="collapsed", help="Scheduled-at lower bound in JST.", ) - with control_cols[5]: + with control_cols[6]: st.markdown('
To
', unsafe_allow_html=True) date_to = st.date_input( "To", @@ -2007,7 +2172,7 @@ def _render_recent_evaluator_jobs_section( label_visibility="collapsed", help="Scheduled-at upper bound in JST.", ) - with control_cols[6]: + with control_cols[7]: st.markdown('
Actions
', unsafe_allow_html=True) if st.button("Refresh", key="refresh_recent_eval_jobs", use_container_width=True): _fetch_recent_evaluator_job_pages.clear() @@ -2022,19 +2187,36 @@ def _render_recent_evaluator_jobs_section( return def _render_job_list() -> None: + nonlocal user_directory if not project_id: st.info("Enter a project id in the sidebar to browse recent evaluator jobs.") return current_page = max(1, int(st.session_state.get(page_key, 1))) pages_to_fetch = max(3, current_page + 2) - if search_text or status_filter or date_from or date_to: + if search_text or status_filter or date_from or date_to or selected_user_name: pages_to_fetch = max(pages_to_fetch, 6) server_status_values = tuple(_status_filter_values(status_filter)) - server_search_filter, search_needle = _build_recent_job_search_filter(search_text, search_scope) + server_search_filter, search_needle = _build_recent_job_search_filter(search_text, search_scope, user_directory) + selected_user_ids = sorted( + { + subject_id + for subject_id, info in user_directory.items() + if selected_user_name + and selected_user_name.lower() == str(info.get("name") or "").strip().lower() + } + ) server_date_filters = _build_recent_job_date_filters(date_from, date_to) extra_filters: List[Dict[str, Any]] = [] if server_search_filter: extra_filters.append(server_search_filter) + if selected_user_ids: + extra_filters.append( + { + "field": "scheduled_by", + "operator": "In", + "values": selected_user_ids, + } + ) extra_filters.extend(server_date_filters) extra_filter_tuples = tuple( ( @@ -2056,8 +2238,11 @@ def _render_job_list() -> None: except Exception as e: st.error(f"Could not fetch recent evaluator jobs: {e}") return + if search_text: + _save_recent_job_search_history(search_scope, search_text) jobs = [job for page in fetched_pages for job in page.get("jobs", [])] + user_directory = _hydrate_recent_eval_user_directory(jobs, environment) has_more_from_api = bool(fetched_pages and fetched_pages[-1].get("next_token")) 
if search_needle: @@ -2071,6 +2256,12 @@ def _render_job_list() -> None: jobs = [job for job in jobs if search_needle in str(job.get("git_sha", "")).lower()] elif search_scope == "Fail message": jobs = [job for job in jobs if search_needle in str(job.get("fail_message", "")).lower()] + if selected_user_name: + selected_lower = selected_user_name.lower() + jobs = [ + job for job in jobs + if selected_lower == str((user_directory.get(str(job.get("scheduled_by") or "").strip(), {}) or {}).get("name", "")).strip().lower() + ] if status_filter: selected = {evaluator_api.normalize_job_status(v) for v in status_filter} jobs = [job for job in jobs if job.get("status_variant") in selected or evaluator_api.normalize_job_status(job.get("status", "")) in selected] @@ -2143,9 +2334,12 @@ def _render_job_list() -> None: st.markdown('
', unsafe_allow_html=True) for job in visible_jobs: - row_cols = st.columns([9.4, 2.0]) + subject_id = str(job.get("scheduled_by") or "").strip() + user_info = user_directory.get(subject_id, {}) + user_label = str(user_info.get("name") or subject_id or "Unknown").strip() + row_cols = st.columns([9.8, 2.0]) with row_cols[0]: - _render_recent_evaluator_job_card(job) + _render_recent_evaluator_job_card(job, user_label=user_label) with row_cols[1]: action_cols = st.columns([1.0, 1.0], gap="small") with action_cols[0]: From 92b5351764ddc577160cd4a560196354f61a95cd Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Thu, 14 May 2026 15:05:52 +0900 Subject: [PATCH 63/94] feat: implement recent evaluator jobs UI and integrate into workflow - Added a new module for the Recent Evaluator Jobs UI, providing functionality to display and manage recent job data. - Integrated the recent evaluator jobs section into the Evaluator Workflow page, enhancing user experience by allowing quick access to job details. - Configured the UI to utilize global settings for job management, improving modularity and reusability across the application. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../lib/ui/recent_evaluator_jobs.py | 1609 +++++++++++++++++ evaluation_dashboard_app/pages/6_Download.py | 13 - .../pages/7_Evaluator_Workflow.py | 22 + 3 files changed, 1631 insertions(+), 13 deletions(-) create mode 100644 evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py diff --git a/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py b/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py new file mode 100644 index 0000000..7abdcea --- /dev/null +++ b/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py @@ -0,0 +1,1609 @@ +"""Shared Recent Evaluator Jobs UI.""" + +from __future__ import annotations + +import html +import os +import urllib.parse +from concurrent.futures import ThreadPoolExecutor, as_completed +from datetime import datetime, timedelta, timezone +from typing import Any, Callable, Dict, List, Optional + +import pandas as pd +import requests +import streamlit as st + +from lib import evaluator_api +from lib.path_utils import resolve_under_data_root, to_data_relative + +_JST = timezone(timedelta(hours=9)) +_CONFIG_GETTER: Callable[[str, Any], Any] = lambda key, default=None: default +_CONFIG_SETTER: Callable[[str, Any], None] = lambda key, value: None +_ENQUEUE_TASK: Callable[[str, Dict[str, Any]], Optional[str]] = lambda task_type, params: None +CATALOG_IO_AVAILABLE = False +ENVIRONMENT = "default" + + +def configure_recent_evaluator_jobs_ui(*, get_config_value: Callable[[str, Any], Any], set_config_value: Callable[[str, Any], None], enqueue_task: Callable[[str, Dict[str, Any]], Optional[str]], catalog_io_available: bool, environment: str = "default") -> None: + global _CONFIG_GETTER, _CONFIG_SETTER, _ENQUEUE_TASK, CATALOG_IO_AVAILABLE, ENVIRONMENT + _CONFIG_GETTER = get_config_value + _CONFIG_SETTER = set_config_value + _ENQUEUE_TASK = enqueue_task + CATALOG_IO_AVAILABLE = bool(catalog_io_available) + ENVIRONMENT = environment or "default" + + +def 
get_config_value(key: str, default: Any = None) -> Any: + return _CONFIG_GETTER(key, default) + + +def set_config_value(key: str, value: Any) -> None: + _CONFIG_SETTER(key, value) + + +def _enqueue_task(task_type: str, params: Dict[str, Any]) -> Optional[str]: + return _ENQUEUE_TASK(task_type, params) + + +def _to_jst(dt: Any) -> Optional[datetime]: + if dt is None: + return None + if not hasattr(dt, "astimezone"): + return None + try: + if getattr(dt, "tzinfo", None) is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt.astimezone(_JST) + except Exception: + return None + +def _parse_api_dt(value: Any) -> Optional[datetime]: + """Parse evaluator API timestamps into timezone-aware datetimes.""" + if value is None: + return None + if isinstance(value, datetime): + if getattr(value, "tzinfo", None) is None: + return value.replace(tzinfo=timezone.utc) + return value + try: + text = str(value).strip() + if not text: + return None + if text.endswith("Z"): + text = text[:-1] + "+00:00" + dt = datetime.fromisoformat(text) + if getattr(dt, "tzinfo", None) is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt + except Exception: + return None + + +def _format_jst_time(value: Any, *, include_seconds: bool = False) -> str: + """Format timestamps for display in JST.""" + dt = _to_jst(_parse_api_dt(value)) + if not dt: + return "—" + return dt.strftime("%Y-%m-%d %H:%M:%S JST" if include_seconds else "%Y-%m-%d %H:%M JST") + + +def _format_jst_time_compact(value: Any) -> str: + """Compact timestamp for dense recent-job rows.""" + dt = _to_jst(_parse_api_dt(value)) + if not dt: + return "—" + return dt.strftime("%m-%d %H:%M") + + +def _format_jst_time_title(value: Any) -> str: + """Readable timestamp for fallback job titles.""" + dt = _to_jst(_parse_api_dt(value)) + if not dt: + return "unknown time" + return f"{dt.year}/{dt.month}/{dt.day} {dt.hour}:{dt.minute:02d}:{dt.second:02d}" + + +def _format_relative_time(value: Any) -> str: + """Human-friendly age/duration 
from a timestamp until now.""" + dt = _parse_api_dt(value) + if not dt: + return "—" + now = datetime.now(timezone.utc) + secs = max(0, int((now - dt.astimezone(timezone.utc)).total_seconds())) + if secs < 60: + return f"{secs}s ago" + if secs < 3600: + return f"{secs // 60}m ago" + if secs < 86400: + return f"{secs // 3600}h ago" + return f"{secs // 86400}d ago" + + +def _format_duration(start_value: Any, end_value: Any) -> str: + """Format elapsed duration between two evaluator timestamps.""" + start = _parse_api_dt(start_value) + end = _parse_api_dt(end_value) + if not start or not end: + return "—" + secs = max(0, int((end - start).total_seconds())) + if secs < 60: + return f"{secs}s" + if secs < 3600: + return f"{secs // 60}m {secs % 60}s" + return f"{secs // 3600}h {(secs % 3600) // 60}m" + + +def _extract_git_target(report: Dict[str, Any]) -> str: + """Return a compact branch/tag label from evaluator job report metadata.""" + source = ((report.get("event") or {}).get("source") or {}) + git_ref = str(source.get("git_ref") or "").strip() + if git_ref.startswith("refs/heads/"): + return git_ref[len("refs/heads/"):] + if git_ref.startswith("refs/tags/"): + return git_ref[len("refs/tags/"):] + return git_ref or str(source.get("git_sha") or "").strip()[:12] or "—" + + +def _extract_catalog_url(report: Dict[str, Any]) -> str: + """Return a best-effort catalog URL for linking from recent evaluator jobs.""" + catalog = report.get("catalog") or {} + direct_url = str( + catalog.get("web_url") + or catalog.get("url") + or catalog.get("catalog_url") + or "" + ).strip() + if direct_url: + return direct_url + + project_id = str(report.get("project_id") or "").strip() + catalog_id = str( + catalog.get("catalog_id") + or catalog.get("id") + or "" + ).strip() + if project_id and catalog_id: + return f"https://evaluation.tier4.jp/evaluation/vehicle_catalogs/{catalog_id}?project_id={project_id}" + return "" + + +def _extract_job_title(report: Dict[str, Any]) -> str: + """Prefer 
evaluator description for display title, with a readable fallback.""" + description = str(report.get("description") or "").strip() + if description: + return description + started_like = report.get("started_at") or report.get("scheduled_at") or report.get("finished_at") + return f"no description (Started at {_format_jst_time_title(started_like)})" + + +def _extract_case_totals(report: Dict[str, Any]) -> Dict[str, int]: + """Return total/success/failed/canceled counts from job report.""" + test = report.get("test") or {} + result = test.get("available_case_results") or test.get("case_results") or {} + return { + "total": int(result.get("total_count", 0) or 0), + "success": int(result.get("success_count", 0) or 0), + "failed": int(result.get("failure_count", 0) or 0), + "canceled": int(result.get("cancellation_count", 0) or 0), + } + + +def _extract_failed_case_rows(case_reports: List[Dict[str, Any]], *, limit: int = 50) -> List[Dict[str, Any]]: + """Normalize failed case rows for display tables.""" + rows: List[Dict[str, Any]] = [] + for report in case_reports: + status = str(report.get("status") or "").strip().lower() + result_status = str(((report.get("result") or {}).get("status") or "")).strip().lower() + if status not in evaluator_api.FAILED_JOB_STATUSES and result_status not in evaluator_api.FAILED_JOB_STATUSES: + continue + logs = report.get("logs") or {} + rows.append( + { + "Suite": ((report.get("suite") or {}).get("display_name") or ""), + "Scenario": ((report.get("scenario") or {}).get("display_name") or ""), + "Status": report.get("status", ""), + "Fail message": report.get("fail_message", ""), + "Cause": ", ".join(report.get("failure_cause_labels", []) or []), + "Archive log": "yes" if ((logs.get("simulation_archive") or {}).get("id")) else "no", + "Result JSON": "yes" if ((logs.get("simulation_result_json") or {}).get("id")) else "no", + } + ) + rows.sort(key=lambda row: (row["Suite"], row["Scenario"], row["Fail message"])) + return rows[:limit] + + 
+def _extract_suite_rows(suite_rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Normalize suite summary rows for display tables.""" + rows = [ + { + "Suite": row.get("name", ""), + "Total": int(row.get("all", 0) or 0), + "Success": int(row.get("success", 0) or 0), + "Failed": int(row.get("fail", 0) or 0), + "Canceled": int(row.get("cancel", 0) or 0), + "Simulation": row.get("simulation", ""), + "Report": row.get("url", ""), + } + for row in suite_rows or [] + ] + rows.sort(key=lambda row: (-row["Failed"], row["Suite"])) + return rows + + +def _extract_suite_selection_options(suite_rows: List[Dict[str, Any]]) -> List[Dict[str, str]]: + """Build suite picker options from evaluator suite summary rows.""" + options: List[Dict[str, str]] = [] + seen_ids = set() + for row in suite_rows or []: + report_url = str(row.get("url") or row.get("Report") or "").strip() + suite_id = "" + if "/tests/" in report_url: + tail = report_url.split("/tests/", 1)[1] + suite_id = tail.split("?", 1)[0].split("/", 1)[0].strip() + if not suite_id or suite_id in seen_ids: + continue + seen_ids.add(suite_id) + suite_name = str(row.get("name") or row.get("Suite") or suite_id).strip() + options.append({"id": suite_id, "label": f"{suite_name} ({suite_id})"}) + return options + + +def _status_color_variant(status: str) -> str: + """Map evaluator status to a style token used by the recent-job cards.""" + normalized = evaluator_api.normalize_job_status(status) + if normalized in evaluator_api.SUCCESS_JOB_STATUSES: + return "success" + if normalized in ("canceled", "cancelled", "aborted"): + return "canceled" + if normalized in evaluator_api.FAILED_JOB_STATUSES: + return "failed" + if normalized in ("started", "running", "pending", "queued", "created"): + return "running" + return "unknown" + + +def _status_display_label(status: str) -> str: + """Short status label for compact list rows.""" + normalized = evaluator_api.normalize_job_status(status) + if normalized in ("succeeded", "success"): 
+ return "success" + if normalized in ("failed", "failure", "error"): + return "failed" + if normalized in ("canceled", "cancelled", "aborted"): + return "canceled" + if normalized in ("started", "running"): + return "running" + if normalized in ("pending", "queued", "created"): + return "queued" + return normalized or "unknown" + + +def _status_filter_values(selected_statuses: List[str]) -> List[str]: + """Normalize UI status filters into API status values.""" + values: List[str] = [] + for raw in selected_statuses: + normalized = evaluator_api.normalize_job_status(raw) + if normalized == "unknown" or not normalized: + continue + if normalized == "running": + values.extend(["running", "started"]) + elif normalized == "success": + values.extend(["success", "succeeded"]) + elif normalized == "failed": + values.extend(["failed", "failure", "error"]) + elif normalized == "canceled": + values.extend(["canceled", "cancelled", "aborted"]) + else: + values.append(normalized) + return sorted(set(values)) + + +def _escape_search_match_value(value: str) -> str: + """Escape wildcard characters for API Match filters.""" + return ( + value.replace("\\", "\\\\") + .replace("*", "\\*") + .replace("?", "\\?") + ) + + +def _build_recent_job_search_filter( + search_text: str, + search_scope: str, + user_directory: Optional[Dict[str, Dict[str, str]]] = None, +) -> tuple[Optional[Dict[str, Any]], str]: + """Map quick-search UI to one server-side filter and a client-side needle.""" + needle = search_text.strip() + if not needle: + return None, "" + + if search_scope == "Branch/tag": + return ( + { + "field": "event.source.git_ref", + "operator": "Match", + "values": [f"*{_escape_search_match_value(needle)}*"], + }, + needle.lower(), + ) + if search_scope == "Description": + return ( + { + "field": "description", + "operator": "Match", + "values": [f"*{_escape_search_match_value(needle)}*"], + }, + needle.lower(), + ) + if search_scope == "Job ID": + return ( + { + "field": "job_id", + 
"operator": "In", + "values": [needle], + }, + needle.lower(), + ) + if search_scope == "Git SHA": + return ( + { + "field": "event.source.git_sha", + "operator": "Match", + "values": [f"*{_escape_search_match_value(needle)}*"], + }, + needle.lower(), + ) + if search_scope == "Fail message": + return ( + { + "field": "fail_message", + "operator": "Match", + "values": [f"*{_escape_search_match_value(needle)}*"], + }, + needle.lower(), + ) + return None, needle.lower() + + +def _recent_job_search_history_key(scope: str) -> str: + return f"recent_eval_jobs_search_history::{scope}" + + +def _get_recent_job_search_history(scope: str) -> List[str]: + stored = get_config_value(_recent_job_search_history_key(scope), []) or [] + if not isinstance(stored, list): + return [] + return [str(v).strip() for v in stored if str(v).strip()] + + +def _save_recent_job_search_history(scope: str, value: str, *, max_items: int = 8) -> None: + text = str(value).strip() + if not text: + return + history = _get_recent_job_search_history(scope) + updated = [text] + [item for item in history if item != text] + set_config_value(_recent_job_search_history_key(scope), updated[:max_items]) + + +def _get_recent_eval_user_directory() -> Dict[str, Dict[str, str]]: + stored = get_config_value("recent_eval_jobs_user_directory", {}) or {} + if not isinstance(stored, dict): + return {} + normalized: Dict[str, Dict[str, str]] = {} + for subject_id, info in stored.items(): + if not isinstance(info, dict): + continue + normalized[str(subject_id)] = { + "name": str(info.get("name") or "").strip(), + "email": str(info.get("email") or "").strip(), + "subject_id": str(info.get("subject_id") or subject_id).strip(), + } + return normalized + + +def _save_recent_eval_user_directory(directory: Dict[str, Dict[str, str]]) -> None: + set_config_value("recent_eval_jobs_user_directory", directory) + + +@st.cache_data(ttl=24 * 3600, show_spinner=False) +def _fetch_auth_member_profile(subject_id: str, environment: str) 
-> Dict[str, str]: + subject = str(subject_id or "").strip() + if not subject: + return {} + org_id = os.environ.get( + "WEBAUTO_ORGANIZATION_ID", + "5a21621d-6968-4f7d-94f8-99cfb77b6e71", + ).strip() + if not org_id: + return {"subject_id": subject, "name": subject, "email": ""} + os.environ["AUTH_PROFILE"] = environment or ENVIRONMENT + from webautoauth.token import HttpService, TokenSource, load_config + + config = load_config() + token_source = TokenSource(HttpService(config)) + access_token = token_source.get_token().access_token + quoted_subject = urllib.parse.quote(subject, safe="") + url = f"https://auth.web.auto/v2/organizations/{org_id}/members/{quoted_subject}" + response = requests.get( + url, + headers={"Authorization": f"Bearer {access_token}", "accept": "application/json"}, + timeout=10, + ) + response.raise_for_status() + data = response.json() + return { + "subject_id": str(data.get("subject_id") or subject), + "name": str(data.get("name") or subject).strip(), + "email": str(data.get("email") or "").strip(), + } + + +def _hydrate_recent_eval_user_directory( + jobs: List[Dict[str, Any]], + environment: str, +) -> Dict[str, Dict[str, str]]: + directory = _get_recent_eval_user_directory() + unresolved = sorted( + { + str(job.get("scheduled_by") or "").strip() + for job in jobs + if str(job.get("scheduled_by") or "").strip() + and str(job.get("scheduled_by") or "").strip() not in directory + } + ) + if not unresolved: + return directory + + updates: Dict[str, Dict[str, str]] = {} + with ThreadPoolExecutor(max_workers=min(6, len(unresolved))) as executor: + future_map = { + executor.submit(_fetch_auth_member_profile, subject_id, environment): subject_id + for subject_id in unresolved + } + for future in as_completed(future_map): + subject_id = future_map[future] + try: + profile = future.result() + except Exception: + profile = { + "subject_id": subject_id, + "name": subject_id, + "email": "", + } + updates[subject_id] = { + "subject_id": 
str(profile.get("subject_id") or subject_id).strip(), + "name": str(profile.get("name") or subject_id).strip(), + "email": str(profile.get("email") or "").strip(), + } + + if updates: + directory = {**directory, **updates} + _save_recent_eval_user_directory(directory) + return directory + + +def _build_recent_job_date_filters( + date_from: Optional[datetime.date], + date_to: Optional[datetime.date], +) -> List[Dict[str, Any]]: + """Build scheduled_at date-range filters for the search API.""" + filters: List[Dict[str, Any]] = [] + if date_from: + start_dt = datetime(date_from.year, date_from.month, date_from.day, 0, 0, 0, tzinfo=_JST) + filters.append( + { + "field": "scheduled_at", + "operator": "Gte", + "values": [start_dt.astimezone(timezone.utc).isoformat()], + } + ) + if date_to: + end_dt = datetime(date_to.year, date_to.month, date_to.day, 23, 59, 59, tzinfo=_JST) + filters.append( + { + "field": "scheduled_at", + "operator": "Lte", + "values": [end_dt.astimezone(timezone.utc).isoformat()], + } + ) + return filters + + +def _summarize_recent_job(report: Dict[str, Any]) -> Dict[str, Any]: + """Compact summary for one evaluator job card.""" + status = evaluator_api.extract_job_status(report) + totals = _extract_case_totals(report) + source = ((report.get("event") or {}).get("source") or {}) + git_url = str(source.get("git_web_url") or source.get("git_url") or "").strip() + source_repo_label = git_url.rstrip("/").split("/")[-1] if git_url else "—" + git_ref_label = _extract_git_target(report) + return { + "job_id": report.get("job_id") or report.get("id") or "", + "title": _extract_job_title(report), + "status": status, + "status_variant": _status_color_variant(status), + "build_status": ((report.get("build") or {}).get("status") or ""), + "test_status": ((report.get("test") or {}).get("status") or ""), + "target": git_ref_label, + "catalog": ((report.get("catalog") or {}).get("display_name") or ""), + "catalog_url": _extract_catalog_url(report), + "description": 
report.get("description", ""), + "source_label": git_ref_label, + "source_repo_label": source_repo_label, + "scheduled_at": report.get("scheduled_at"), + "started_at": report.get("started_at"), + "finished_at": report.get("finished_at"), + "duration": _format_duration(report.get("started_at"), report.get("finished_at")), + "created_label": _format_relative_time(report.get("scheduled_at") or report.get("started_at")), + "scheduled_by": str(report.get("scheduled_by") or ""), + "report_url": evaluator_api.get_job_report_url(report.get("project_id", ""), report.get("job_id") or report.get("id") or ""), + "fail_message": report.get("fail_message", ""), + "total": totals["total"], + "success": totals["success"], + "failed": totals["failed"], + "canceled": totals["canceled"], + "git_sha": str(source.get("git_sha") or "")[:12], + "git_ref_url": source.get("git_ref_url", ""), + "git_commit_url": source.get("git_commit_url", ""), + "source_url": git_url, + } + + +@st.cache_data(ttl=30, show_spinner=False) +def _fetch_recent_evaluator_job_pages( + project_id: str, + environment: str, + page_size: int, + pages_to_fetch: int, + status_values: tuple[str, ...] = (), + extra_filters: tuple[tuple[str, str, tuple[Any, ...]], ...] 
= (), +) -> List[Dict[str, Any]]: + """Fetch recent evaluator jobs from the search endpoint page-by-page.""" + if not project_id: + return [] + os.environ["AUTH_PROFILE"] = environment or ENVIRONMENT + api = evaluator_api.EvaluationRunAPI() + filters: List[Dict[str, Any]] = [] + if status_values: + filters.append( + { + "field": "status", + "operator": "In", + "values": list(status_values), + } + ) + for field, operator, values in extra_filters: + filters.append( + { + "field": field, + "operator": operator, + "values": list(values), + } + ) + next_token = "" + pages: List[Dict[str, Any]] = [] + for _ in range(max(1, int(pages_to_fetch))): + data = api.search_report_list( + project_id, + filters=filters or None, + next_token=next_token, + size=max(1, min(int(page_size), 100)), + ) + reports = data.get("reports", []) or [] + pages.append( + { + "jobs": [_summarize_recent_job(report) for report in reports], + "next_token": data.get("next_token", "") or "", + } + ) + next_token = data.get("next_token", "") or "" + if not next_token: + break + return pages + + +@st.cache_data(ttl=30, show_spinner=False) +def _fetch_evaluator_job_detail(project_id: str, environment: str, job_id: str) -> Dict[str, Any]: + """Fetch deep evaluator detail for one job on demand.""" + if not project_id or not job_id: + return {} + os.environ["AUTH_PROFILE"] = environment or ENVIRONMENT + api = evaluator_api.EvaluationRunAPI() + report = api.get_job_report(project_id, job_id) + suite_rows = api.get_suite_summary(project_id, job_id, use_available_case_results=True) + case_reports = api.get_case_reports(project_id, job_id) + summary = _summarize_recent_job(report) + return { + **summary, + "suite_rows": _extract_suite_rows(suite_rows), + "failed_case_rows": _extract_failed_case_rows(case_reports), + "raw_report": report, + } + + +def _inject_recent_evaluator_jobs_styles() -> None: + """Task-adjacent styles for the recent evaluator jobs section.""" + st.markdown( + """ + + """, + 
unsafe_allow_html=True, + ) + + +def _render_recent_evaluator_job_card(job: Dict[str, Any], *, user_label: str = "Unknown") -> None: + """Render one recent evaluator job as a single-row list item.""" + variant = html.escape(job.get("status_variant", "unknown")) + status = html.escape(_status_display_label(job.get("status", "unknown") or "unknown")) + title_text = html.escape(job.get("title", "—")) + description = html.escape(job.get("description", "") or "") + catalog = html.escape(job.get("catalog", "") or "—") + catalog_url = html.escape(job.get("catalog_url", "") or "") + scheduled = html.escape(_format_jst_time_compact(job.get("scheduled_at"))) + duration = html.escape(job.get("duration", "—")) + job_id = html.escape(str(job.get("job_id", ""))) + build_status = html.escape(job.get("build_status", "") or "—") + test_status = html.escape(job.get("test_status", "") or "—") + created_label = html.escape(job.get("created_label", "—")) + git_sha = html.escape(job.get("git_sha", "") or "—") + source_label = html.escape(job.get("source_label", "") or "—") + user_text = html.escape(user_label or "Unknown") + report_url = html.escape(job.get("report_url", "") or "") + source_url = html.escape(job.get("git_ref_url", "") or job.get("source_url", "") or "") + status_variant = job.get("status_variant", "unknown") + status_mark = { + "running": '', + "success": '', + "failed": '', + "canceled": '', + }.get(status_variant, '') + meta_line = job_id + counts = ( + f'S {int(job.get("success", 0))} · ' + f'F {int(job.get("failed", 0))} · ' + f'C {int(job.get("canceled", 0))} / ' + f'{int(job.get("total", 0))}' + ) + title_html = f'{title_text}' if report_url else title_text + source_html = ( + f'{source_label}' + if source_url else source_label + ) + catalog_html = ( + f'{catalog}' + if catalog_url else catalog + ) + st.markdown( + f""" +
+
+
+
{title_html}
+
{meta_line}
+
+
+ {status_mark}{status} +
+
+ {scheduled}
{duration} · {created_label} +
+
+ {catalog_html}
{source_html} +
+
+ {user_text} +
+
+ build {build_status} · test {test_status} · {git_sha}
+ {counts} +
+
+
+ """, + unsafe_allow_html=True, + ) + + +def _render_recent_evaluator_job_detail(project_id: str, environment: str, job: Dict[str, Any]) -> None: + """Render detailed evaluator-job information inside an expander.""" + job_id = str(job.get("job_id", "") or "") + if not job_id: + st.warning("Missing job id.") + return + try: + detail = _fetch_evaluator_job_detail(project_id, environment, job_id) + except Exception as e: + st.error(f"Could not fetch evaluator details: {e}") + return + + st.markdown("**Overview**") + top_cols = st.columns(4) + top_cols[0].metric("Total", int(detail.get("total", 0))) + top_cols[1].metric("Success", int(detail.get("success", 0))) + top_cols[2].metric("Failed", int(detail.get("failed", 0))) + top_cols[3].metric("Canceled", int(detail.get("canceled", 0))) + + overview_left, overview_right = st.columns([1.3, 1.1]) + with overview_left: + st.write(f"Status: `{detail.get('status', 'unknown')}`") + st.write(f"Title: `{detail.get('title', '—')}`") + st.write(f"Build/Test: `{detail.get('build_status', '—')}` / `{detail.get('test_status', '—')}`") + st.write(f"Ref: `{detail.get('target', '—')}`") + st.write(f"Catalog: `{detail.get('catalog', '—')}`") + st.write(f"Repo: `{detail.get('source_repo_label', '—')}`") + with overview_right: + st.write(f"Scheduled: `{_format_jst_time(detail.get('scheduled_at'), include_seconds=True)}`") + st.write(f"Started: `{_format_jst_time(detail.get('started_at'), include_seconds=True)}`") + st.write(f"Finished: `{_format_jst_time(detail.get('finished_at'), include_seconds=True)}`") + st.write(f"Duration: `{detail.get('duration', '—')}`") + st.write(f"SHA: `{detail.get('git_sha', '—')}`") + + action_cols = st.columns([1.2, 1.2, 4]) + report_url = detail.get("report_url", "") + catalog_url = detail.get("catalog_url", "") + source_url = detail.get("source_url", "") or detail.get("git_ref_url", "") + with action_cols[0]: + if report_url: + st.link_button("Open report", report_url, use_container_width=True) + with 
action_cols[1]: + if catalog_url: + st.link_button("Open catalog", catalog_url, use_container_width=True) + with action_cols[2]: + if source_url: + st.link_button("Open source", source_url, use_container_width=True) + + if detail.get("fail_message"): + st.warning(detail.get("fail_message")) + + suite_rows = detail.get("suite_rows") or [] + with st.expander(f"Suites ({len(suite_rows)})", expanded=bool(suite_rows)): + if suite_rows: + st.dataframe(pd.DataFrame(suite_rows), width="stretch", hide_index=True) + else: + st.caption("No suite summary available.") + + failed_case_rows = detail.get("failed_case_rows") or [] + with st.expander(f"Failed Cases ({len(failed_case_rows)})", expanded=bool(failed_case_rows)): + if failed_case_rows: + st.dataframe(pd.DataFrame(failed_case_rows), width="stretch", hide_index=True) + else: + st.caption("No failed cases in the current report.") + + with st.expander("Raw JSON", expanded=False): + st.json(detail.get("raw_report", {})) + + +def _render_recent_evaluator_job_run_dialog( + project_id: str, + environment: str, + job: Dict[str, Any], + *, + output_path_default: str, + download_type_default: str, + phase_default: str, + skip_large_file_default: bool, + large_file_mb_default: float, + keep_zip_files_default: bool, +) -> None: + """Render the dialog used to enqueue Download + Eval + Parquet from a recent job row.""" + job_id = str(job.get("job_id", "") or "") + if not job_id: + st.error("Missing evaluator job id.") + return + + detail = _fetch_evaluator_job_detail(project_id, environment, job_id) + suite_options = _extract_suite_selection_options(detail.get("suite_rows") or []) + suite_label_to_id = {opt["label"]: opt["id"] for opt in suite_options} + suite_labels = [opt["label"] for opt in suite_options] + + st.caption("Confirm the workflow options for this evaluator job, then start a background task.") + summary_cols = st.columns([1.45, 1.15, 1.35, 1.05]) + summary_cols[0].markdown(f"**Title** \n`{detail.get('title', '—')}`") + 
summary_cols[1].markdown(f"**Status** \n`{detail.get('status', 'unknown')}`") + summary_cols[2].markdown(f"**Catalog** \n`{detail.get('catalog', '—')}`") + summary_cols[3].markdown(f"**Cases** \n`{int(detail.get('total', 0))}`") + + with st.form(key=f"recent_eval_run_form_{job_id}", border=False): + run_output_path = st.text_input( + "Output path", + value=output_path_default, + help="Folder under the data directory. This uses the same safe path rules as the main download workflow.", + ) + + if not suite_labels: + hint_cols = st.columns([1.2, 2.8]) + with hint_cols[0]: + if st.form_submit_button("Refresh suites", use_container_width=True): + _fetch_evaluator_job_detail.clear() + st.rerun() + with hint_cols[1]: + st.caption("No suite candidates were available yet for this job. Refresh to re-read suite data from the evaluator API.") + + selected_suite_labels = st.multiselect( + "Suites to download (optional)", + options=suite_labels, + default=[], + help="Leave empty to download all suites from this evaluator job.", + disabled=not suite_labels, + ) + + run_download_type = st.radio( + "Download type", + ["Archives (ZIP)", "Result JSON only"], + index=0 if download_type_default == "Archives (ZIP)" else 1, + horizontal=True, + ) + + run_phase = "" + run_skip_large_file = False + run_large_file_mb = 50.0 + run_keep_zip_files = False + if run_download_type == "Archives (ZIP)": + run_phase = st.text_input( + "Phase to extract", + value=phase_default, + help="Enter the phase name to extract from archives.", + ) + opt_cols = st.columns([1.2, 1.3, 1.2]) + with opt_cols[0]: + run_skip_large_file = st.checkbox( + "Skip large files", + value=skip_large_file_default, + help="Skip unusually large archives during download.", + ) + with opt_cols[1]: + run_large_file_mb = st.number_input( + "Skip threshold (MB)", + min_value=1.0, + max_value=5000.0, + step=1.0, + value=float(large_file_mb_default), + ) + with opt_cols[2]: + run_keep_zip_files = st.checkbox( + "Keep ZIP files", + 
value=keep_zip_files_default, + help="Keep downloaded ZIPs after extraction.", + ) + + run_cols = st.columns([1.25, 1.25, 1.1]) + with run_cols[0]: + run_eval = st.checkbox( + "Run evaluation", + value=True, + help="Run eval_result and generate Summary.csv / Score.csv after download.", + ) + with run_cols[1]: + generate_parquet = st.checkbox( + "Generate parquet", + value=CATALOG_IO_AVAILABLE, + disabled=not CATALOG_IO_AVAILABLE, + help="Build scene_result.parquet from .pkl files." if CATALOG_IO_AVAILABLE else "Install perception_catalog_analyzer to enable this.", + ) + with run_cols[2]: + eval_recursive = st.checkbox( + "Recursive eval", + value=True, + help="Search subdirectories for evaluation result folders.", + ) + + action_cols = st.columns([1.15, 1.15, 3.7]) + cancel_clicked = action_cols[0].form_submit_button("Cancel", use_container_width=True) + start_clicked = action_cols[1].form_submit_button("Start", type="primary", use_container_width=True) + + if cancel_clicked: + st.session_state.pop("recent_eval_jobs_run_selected", None) + st.rerun() + + if not start_clicked: + return + + resolved_output, path_err = resolve_under_data_root(run_output_path, allow_create=True) + if path_err: + st.error(f"Output path is invalid: {path_err}") + return + + selected_suite_ids = [suite_label_to_id[label] for label in selected_suite_labels] + resolved_path_str = str(resolved_output) + set_config_value("output_path", to_data_relative(resolved_output)) + set_config_value("environment", environment) + set_config_value("project_id", project_id) + set_config_value("job_id", job_id) + set_config_value("suite_id", "") + set_config_value("suite_ids", selected_suite_ids) + set_config_value("download_type", run_download_type) + if run_download_type == "Archives (ZIP)": + set_config_value("phase", run_phase) + set_config_value("skip_large_file", run_skip_large_file) + set_config_value("large_file_mb", run_large_file_mb) + set_config_value("keep_zip_files", run_keep_zip_files) + + 
params = { + "output_path": resolved_path_str, + "project_id": project_id, + "job_id": job_id, + "suite_id": "", + "suite_ids": selected_suite_ids or None, + "download_type": "archives" if run_download_type == "Archives (ZIP)" else "result_json", + "phase": run_phase if run_download_type == "Archives (ZIP)" else "", + "skip_large_file": run_skip_large_file if run_download_type == "Archives (ZIP)" else False, + "large_file_mb": run_large_file_mb if run_download_type == "Archives (ZIP)" else 50.0, + "keep_zip_files": run_keep_zip_files if run_download_type == "Archives (ZIP)" else False, + "run_eval": run_eval, + "generate_parquet": generate_parquet, + "eval_recursive": eval_recursive, + "eval_overwrite": False, + } + task_id = _enqueue_task("download_and_eval", params) + if not task_id: + st.error("Failed to enqueue task. Check REDIS_URL and DATABASE_URL.") + return + + st.session_state["recent_eval_jobs_flash"] = ( + f"Queued Download + Eval + Parquet for `{detail.get('title', job_id)}`. " + f"Task id: `{task_id}`." + ) + st.session_state.pop("recent_eval_jobs_run_selected", None) + st.rerun() + + +def _render_recent_evaluator_jobs_section( + project_id: str, + environment: str, + *, + output_path_default: str, + download_type_default: str, + phase_default: str, + skip_large_file_default: bool, + large_file_mb_default: float, + keep_zip_files_default: bool, +) -> None: + """Render a direct evaluator-jobs browser above the download tabs.""" + _inject_recent_evaluator_jobs_styles() + show_section = st.toggle( + "Show recent evaluator jobs", + value=st.session_state.get("recent_eval_jobs_show", False), + key="recent_eval_jobs_show", + help="Load recent evaluator jobs only when you want to browse them.", + ) + if not show_section: + return + + st.subheader("Recent evaluator jobs") + st.caption("Compact browser for recent evaluator jobs. 
Select one job to inspect detailed suite and failed-case information.") + flash_message = st.session_state.pop("recent_eval_jobs_flash", None) + if flash_message: + st.success(flash_message) + user_directory = _get_recent_eval_user_directory() + + control_cols = st.columns([0.75, 1.0, 1.15, 1.45, 1.25, 1.0, 1.0, 0.75]) + with control_cols[0]: + st.markdown('
Rows
', unsafe_allow_html=True) + limit = int( + st.selectbox( + "Rows", + options=[6, 12, 20, 30], + index=1, + key="recent_eval_jobs_limit", + help="How many recent evaluator jobs to fetch for this project.", + label_visibility="collapsed", + ) + ) + with control_cols[1]: + st.markdown('
Status
', unsafe_allow_html=True) + status_filter = st.multiselect( + "Status", + options=["running", "success", "failed", "canceled", "unknown"], + default=[], + key="recent_eval_jobs_status_filter", + help="Leave empty to show all recent jobs.", + label_visibility="collapsed", + placeholder="All statuses", + ) + with control_cols[2]: + st.markdown('
Search In
', unsafe_allow_html=True) + search_scope = st.selectbox( + "Search in", + options=["Branch/tag", "Description", "Job ID", "Git SHA", "Fail message"], + index=0, + key="recent_eval_jobs_search_scope", + help="Choose which evaluator field the quick search should target.", + label_visibility="collapsed", + ) + with control_cols[3]: + st.markdown('
Search
', unsafe_allow_html=True) + search_text = st.text_input( + "Search", + value=st.session_state.get("recent_eval_jobs_search_text", ""), + key="recent_eval_jobs_search_text", + help="Server-side search across the selected field.", + label_visibility="collapsed", + placeholder="Type to search evaluator jobs", + ).strip() + recent_candidates = _get_recent_job_search_history(search_scope) + selected_user_name = "" + if recent_candidates: + recent_choice = st.selectbox( + "Recent searches", + options=[""] + recent_candidates, + index=0, + key=f"recent_eval_jobs_search_recent::{search_scope}", + help="Reuse a previously entered search for this field.", + ) + if recent_choice and recent_choice != search_text: + st.session_state["recent_eval_jobs_search_text"] = recent_choice + st.rerun() + user_candidates = sorted( + { + info.get("name", "").strip() + for info in user_directory.values() + if info.get("name", "").strip() + }, + key=str.lower, + ) + with control_cols[4]: + st.markdown('
User
', unsafe_allow_html=True) + selected_user_name = st.selectbox( + "User", + options=[""] + user_candidates, + index=0, + key="recent_eval_jobs_user_filter", + help="Filter jobs by resolved scheduled user name.", + label_visibility="collapsed", + ) + with control_cols[5]: + st.markdown('
From
', unsafe_allow_html=True) + date_from = st.date_input( + "From", + value=st.session_state.get("recent_eval_jobs_date_from", None), + key="recent_eval_jobs_date_from", + label_visibility="collapsed", + help="Scheduled-at lower bound in JST.", + ) + with control_cols[6]: + st.markdown('
To
', unsafe_allow_html=True) + date_to = st.date_input( + "To", + value=st.session_state.get("recent_eval_jobs_date_to", None), + key="recent_eval_jobs_date_to", + label_visibility="collapsed", + help="Scheduled-at upper bound in JST.", + ) + with control_cols[7]: + st.markdown('
Actions
', unsafe_allow_html=True) + if st.button("Refresh", key="refresh_recent_eval_jobs", use_container_width=True): + _fetch_recent_evaluator_job_pages.clear() + _fetch_evaluator_job_detail.clear() + st.rerun() + + page_key = "recent_eval_jobs_page" + if page_key not in st.session_state: + st.session_state[page_key] = 1 + if date_from and date_to and date_from > date_to: + st.warning("`From` date must be earlier than or equal to `To` date.") + return + + def _render_job_list() -> None: + nonlocal user_directory + if not project_id: + st.info("Enter a project id in the sidebar to browse recent evaluator jobs.") + return + current_page = max(1, int(st.session_state.get(page_key, 1))) + pages_to_fetch = max(3, current_page + 2) + if search_text or status_filter or date_from or date_to or selected_user_name: + pages_to_fetch = max(pages_to_fetch, 6) + server_status_values = tuple(_status_filter_values(status_filter)) + server_search_filter, search_needle = _build_recent_job_search_filter(search_text, search_scope, user_directory) + selected_user_ids = sorted( + { + subject_id + for subject_id, info in user_directory.items() + if selected_user_name + and selected_user_name.lower() == str(info.get("name") or "").strip().lower() + } + ) + server_date_filters = _build_recent_job_date_filters(date_from, date_to) + extra_filters: List[Dict[str, Any]] = [] + if server_search_filter: + extra_filters.append(server_search_filter) + if selected_user_ids: + extra_filters.append( + { + "field": "scheduled_by", + "operator": "In", + "values": selected_user_ids, + } + ) + extra_filters.extend(server_date_filters) + extra_filter_tuples = tuple( + ( + str(f["field"]), + str(f["operator"]), + tuple(f.get("values", []) or []), + ) + for f in extra_filters + ) + try: + fetched_pages = _fetch_recent_evaluator_job_pages( + project_id, + environment, + limit, + pages_to_fetch, + status_values=server_status_values, + extra_filters=extra_filter_tuples, + ) + except Exception as e: + 
st.error(f"Could not fetch recent evaluator jobs: {e}") + return + if search_text: + _save_recent_job_search_history(search_scope, search_text) + + jobs = [job for page in fetched_pages for job in page.get("jobs", [])] + user_directory = _hydrate_recent_eval_user_directory(jobs, environment) + has_more_from_api = bool(fetched_pages and fetched_pages[-1].get("next_token")) + + if search_needle: + if search_scope == "Branch/tag": + jobs = [job for job in jobs if search_needle in str(job.get("target", "")).lower()] + elif search_scope == "Description": + jobs = [job for job in jobs if search_needle in str(job.get("description", "")).lower() or search_needle in str(job.get("title", "")).lower()] + elif search_scope == "Job ID": + jobs = [job for job in jobs if search_needle in str(job.get("job_id", "")).lower()] + elif search_scope == "Git SHA": + jobs = [job for job in jobs if search_needle in str(job.get("git_sha", "")).lower()] + elif search_scope == "Fail message": + jobs = [job for job in jobs if search_needle in str(job.get("fail_message", "")).lower()] + if selected_user_name: + selected_lower = selected_user_name.lower() + jobs = [ + job for job in jobs + if selected_lower == str((user_directory.get(str(job.get("scheduled_by") or "").strip(), {}) or {}).get("name", "")).strip().lower() + ] + if status_filter: + selected = {evaluator_api.normalize_job_status(v) for v in status_filter} + jobs = [job for job in jobs if job.get("status_variant") in selected or evaluator_api.normalize_job_status(job.get("status", "")) in selected] + + if not jobs: + st.session_state[page_key] = 1 + st.markdown('
No recent evaluator jobs matched the current filters.
', unsafe_allow_html=True) + return + + total_loaded = len(jobs) + has_next_page = total_loaded > current_page * limit or has_more_from_api + max_known_page = max(1, (total_loaded + limit - 1) // limit) + if current_page > max_known_page: + current_page = max_known_page + st.session_state[page_key] = current_page + start_idx = (current_page - 1) * limit + end_idx = start_idx + limit + visible_jobs = jobs[start_idx:end_idx] + if not visible_jobs and current_page > 1: + current_page = max(1, current_page - 1) + st.session_state[page_key] = current_page + start_idx = (current_page - 1) * limit + end_idx = start_idx + limit + visible_jobs = jobs[start_idx:end_idx] + has_next_page = total_loaded > current_page * limit + + if current_page == 1: + page_numbers = list(range(1, min(3, max_known_page) + 1)) + else: + page_numbers = list( + range( + max(1, current_page - 1), + min(max_known_page, current_page + 1) + 1, + ) + ) + pager_cols = st.columns([0.8, 0.9, 0.9, 0.9, 0.8, 5.7]) + with pager_cols[0]: + if st.button("‹", key="recent_eval_jobs_prev", use_container_width=True, disabled=current_page <= 1): + st.session_state[page_key] = max(1, current_page - 1) + st.rerun() + for idx, page_num in enumerate(page_numbers[:3], start=1): + with pager_cols[idx]: + btn_key = ( + f"recent_eval_jobs_pagebtn_active_{page_num}" + if page_num == current_page + else f"recent_eval_jobs_pagebtn_{page_num}" + ) + if st.button( + str(page_num), + key=btn_key, + use_container_width=True, + disabled=page_num == current_page, + ): + st.session_state[page_key] = page_num + st.rerun() + with pager_cols[4]: + if st.button("›", key="recent_eval_jobs_next", use_container_width=True, disabled=not has_next_page): + st.session_state[page_key] = current_page + 1 + st.rerun() + + selected_job_id = st.session_state.get("recent_eval_jobs_selected") + if selected_job_id and not any(str(job.get("job_id", "")) == str(selected_job_id) for job in jobs): + st.session_state.pop("recent_eval_jobs_selected", None) 
+ selected_job_id = None + + selected_run_job_id = st.session_state.get("recent_eval_jobs_run_selected") + if selected_run_job_id and not any(str(job.get("job_id", "")) == str(selected_run_job_id) for job in jobs): + st.session_state.pop("recent_eval_jobs_run_selected", None) + selected_run_job_id = None + + st.markdown('
', unsafe_allow_html=True) + for job in visible_jobs: + subject_id = str(job.get("scheduled_by") or "").strip() + user_info = user_directory.get(subject_id, {}) + user_label = str(user_info.get("name") or subject_id or "Unknown").strip() + row_cols = st.columns([9.8, 2.0]) + with row_cols[0]: + _render_recent_evaluator_job_card(job, user_label=user_label) + with row_cols[1]: + action_cols = st.columns([1.0, 1.0], gap="small") + with action_cols[0]: + if st.button("Details", key=f"recent_eval_view_{job['job_id']}", use_container_width=True): + st.session_state["recent_eval_jobs_selected"] = str(job["job_id"]) + _fetch_evaluator_job_detail.clear() + st.rerun() + with action_cols[1]: + if st.button("Run", key=f"recent_eval_run_{job['job_id']}", use_container_width=True): + st.session_state["recent_eval_jobs_run_selected"] = str(job["job_id"]) + _fetch_evaluator_job_detail.clear() + st.rerun() + st.markdown("
", unsafe_allow_html=True) + + selected_job_id = st.session_state.get("recent_eval_jobs_selected") + if selected_job_id: + selected_job = next((job for job in jobs if str(job.get("job_id", "")) == str(selected_job_id)), None) + if selected_job: + if callable(getattr(st, "dialog", None)): + try: + @st.dialog(f"Job details · {selected_job.get('title', '—')}", width="large") + def _recent_eval_job_dialog() -> None: + _render_recent_evaluator_job_detail(project_id, environment, selected_job) + if st.button("Close", key="recent_eval_jobs_close_detail", use_container_width=True): + st.session_state.pop("recent_eval_jobs_selected", None) + st.rerun() + + _recent_eval_job_dialog() + finally: + st.session_state.pop("recent_eval_jobs_selected", None) + else: + st.markdown('
', unsafe_allow_html=True) + hdr_cols = st.columns([4.4, 1.1]) + with hdr_cols[0]: + st.subheader(f"Job details · {selected_job.get('title', '—')}") + with hdr_cols[1]: + if st.button("Close", key="recent_eval_jobs_close_detail_fallback", use_container_width=True): + st.session_state.pop("recent_eval_jobs_selected", None) + st.rerun() + _render_recent_evaluator_job_detail(project_id, environment, selected_job) + st.markdown("
", unsafe_allow_html=True) + + selected_run_job_id = st.session_state.get("recent_eval_jobs_run_selected") + if selected_run_job_id: + selected_run_job = next((job for job in jobs if str(job.get("job_id", "")) == str(selected_run_job_id)), None) + if selected_run_job: + if callable(getattr(st, "dialog", None)): + try: + @st.dialog(f"Download + Eval + Parquet · {selected_run_job.get('title', '—')}", width="large") + def _recent_eval_run_dialog() -> None: + _render_recent_evaluator_job_run_dialog( + project_id, + environment, + selected_run_job, + output_path_default=output_path_default, + download_type_default=download_type_default, + phase_default=phase_default, + skip_large_file_default=skip_large_file_default, + large_file_mb_default=large_file_mb_default, + keep_zip_files_default=keep_zip_files_default, + ) + + _recent_eval_run_dialog() + finally: + if st.session_state.get("recent_eval_jobs_run_selected") == str(selected_run_job_id): + st.session_state.pop("recent_eval_jobs_run_selected", None) + else: + st.markdown('
', unsafe_allow_html=True) + hdr_cols = st.columns([4.4, 1.1]) + with hdr_cols[0]: + st.subheader(f"Download + Eval + Parquet · {selected_run_job.get('title', '—')}") + with hdr_cols[1]: + if st.button("Close", key="recent_eval_jobs_close_run_fallback", use_container_width=True): + st.session_state.pop("recent_eval_jobs_run_selected", None) + st.rerun() + _render_recent_evaluator_job_run_dialog( + project_id, + environment, + selected_run_job, + output_path_default=output_path_default, + download_type_default=download_type_default, + phase_default=phase_default, + skip_large_file_default=skip_large_file_default, + large_file_mb_default=large_file_mb_default, + keep_zip_files_default=keep_zip_files_default, + ) + st.markdown("
", unsafe_allow_html=True) + + _render_job_list() diff --git a/evaluation_dashboard_app/pages/6_Download.py b/evaluation_dashboard_app/pages/6_Download.py index 2befa03..3b366f4 100644 --- a/evaluation_dashboard_app/pages/6_Download.py +++ b/evaluation_dashboard_app/pages/6_Download.py @@ -2625,19 +2625,6 @@ def on_suite_id_change(): skip_large_file = False large_file_mb = 50.0 # Doesn't apply -_render_recent_evaluator_jobs_section( - project_id, - environment, - output_path_default=output_path, - download_type_default=download_type, - phase_default=phase if download_type == "Archives (ZIP)" else get_config_value( - "phase", "perception.object_recognition.tracking.objects" - ), - skip_large_file_default=skip_large_file, - large_file_mb_default=large_file_mb, - keep_zip_files_default=bool(get_config_value("keep_zip_files", False)) if download_type == "Archives (ZIP)" else False, -) - st.markdown('

Pick a workflow

', unsafe_allow_html=True) tab1, tab2, tab3, tab4 = st.tabs( ["📥 Download Results", "🗺️ Download Scenarios", "📊 View Downloads", "🧮 Eval Results"] diff --git a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py b/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py index 0ce686c..8b07f3d 100644 --- a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py +++ b/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py @@ -11,6 +11,10 @@ from lib.WebAPI import scenarioAPI from lib.ui.download_ui import render_download_task_section_header +from lib.ui.recent_evaluator_jobs import ( + _render_recent_evaluator_jobs_section, + configure_recent_evaluator_jobs_ui, +) from lib.ui.task_history import get_task_list_current_user, render_task_list from lib.ui.styles_download import inject_download_page_styles from lib.user_config import UserConfig @@ -193,6 +197,13 @@ def _make_default_output_path(branch_name): poll_interval = int(get_config_value("poll_interval", 60)) max_wait_hours = int(get_config_value("max_wait_hours", 24)) environment = get_config_value("environment", "") +configure_recent_evaluator_jobs_ui( + get_config_value=get_config_value, + set_config_value=set_config_value, + enqueue_task=_enqueue_task, + catalog_io_available=CATALOG_IO_AVAILABLE, + environment=environment, +) with st.sidebar.expander("Advanced"): eval_download_type = st.radio("Download", ["Archives (ZIP)", "Result JSON"], index=0, horizontal=True) @@ -349,6 +360,17 @@ def _make_default_output_path(branch_name): else: st.error("❌ Failed to enqueue task. 
Check worker logs.") +_render_recent_evaluator_jobs_section( + eval_project_id, + environment, + output_path_default=eval_output_path, + download_type_default=eval_download_type, + phase_default=eval_phase, + skip_large_file_default=False, + large_file_mb_default=50.0, + keep_zip_files_default=False, +) + # ============================================ # TASK STATUS # ============================================ From b38e89a613ea8e452ccab8dc99f174da61249f59 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Thu, 14 May 2026 15:18:54 +0900 Subject: [PATCH 64/94] feat: enhance recent evaluator jobs section with toggle and title options - Added parameters to control the visibility of the recent evaluator jobs section, allowing users to toggle its display and customize the title visibility. - Improved the logic for showing recent evaluator jobs, ensuring a more intuitive user experience. - Updated the Evaluator Workflow page to reflect these changes, enhancing overall usability. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../lib/ui/recent_evaluator_jobs.py | 26 +- .../pages/7_Evaluator_Workflow.py | 1109 ++++++++++++----- 2 files changed, 814 insertions(+), 321 deletions(-) diff --git a/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py b/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py index 7abdcea..3c2e821 100644 --- a/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py +++ b/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py @@ -1238,20 +1238,28 @@ def _render_recent_evaluator_jobs_section( skip_large_file_default: bool, large_file_mb_default: float, keep_zip_files_default: bool, + show_toggle: bool = True, + default_visible: bool = False, + show_title: bool = True, ) -> None: """Render a direct evaluator-jobs browser above the download tabs.""" _inject_recent_evaluator_jobs_styles() - show_section = st.toggle( - "Show recent evaluator jobs", - value=st.session_state.get("recent_eval_jobs_show", False), - 
key="recent_eval_jobs_show", - help="Load recent evaluator jobs only when you want to browse them.", - ) + if show_toggle: + show_section = st.toggle( + "Show recent evaluator jobs", + value=st.session_state.get("recent_eval_jobs_show", default_visible), + key="recent_eval_jobs_show", + help="Load recent evaluator jobs only when you want to browse them.", + ) + else: + show_section = True + st.session_state["recent_eval_jobs_show"] = True if not show_section: return - st.subheader("Recent evaluator jobs") - st.caption("Compact browser for recent evaluator jobs. Select one job to inspect detailed suite and failed-case information.") + if show_title: + st.subheader("Recent evaluator jobs") + st.caption("Compact browser for recent evaluator jobs. Select one job to inspect detailed suite and failed-case information.") flash_message = st.session_state.pop("recent_eval_jobs_flash", None) if flash_message: st.success(flash_message) @@ -1367,7 +1375,7 @@ def _render_recent_evaluator_jobs_section( def _render_job_list() -> None: nonlocal user_directory if not project_id: - st.info("Enter a project id in the sidebar to browse recent evaluator jobs.") + st.info("Enter a project id to browse recent evaluator jobs.") return current_page = max(1, int(st.session_state.get(page_key, 1))) pages_to_fetch = max(3, current_page + 2) diff --git a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py b/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py index 8b07f3d..5c9048c 100644 --- a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py +++ b/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py @@ -1,16 +1,32 @@ """ -Evaluator Workflow Page -======================= -Complete end-to-end workflow for running evaluator jobs and processing results. 
+Evaluator Workflow page: +- browse finished local runs and launch compare views +- monitor server-side tasks +- start new evaluator pipelines +- run download/eval from existing evaluator jobs """ -import streamlit as st +from __future__ import annotations + +import html +import json +import os +import urllib.parse from datetime import datetime, timedelta, timezone from pathlib import Path -from typing import Optional +from typing import Dict, List, Optional + +import streamlit as st -from lib.WebAPI import scenarioAPI -from lib.ui.download_ui import render_download_task_section_header +from lib.db import create_task, is_task_queue_enabled, list_recent_tasks, update_task_rq_job_id +from lib.page_chrome import inject_app_page_styles, render_page_hero, section_header +from lib.path_utils import ( + format_size, + get_data_root_display, + get_run_info, + list_run_directories, + resolve_under_data_root, +) from lib.ui.recent_evaluator_jobs import ( _render_recent_evaluator_jobs_section, configure_recent_evaluator_jobs_ui, @@ -19,35 +35,38 @@ from lib.ui.styles_download import inject_download_page_styles from lib.user_config import UserConfig -# Initialize or load user config -_user_config = UserConfig(warning_fn=st.warning) - -def get_config_value(key, default=None): - return _user_config.get(key, default) - -def set_config_value(key, value): - _user_config.set(key, value) - -from lib.path_utils import get_data_root, resolve_under_data_root -from lib.page_chrome import inject_app_page_styles, render_page_hero -from lib.db import ( - create_task, - is_task_queue_enabled, - list_recent_tasks, -) - try: from lib.perception_catalog_io import pkl_archive_to_parquet + CATALOG_IO_AVAILABLE = True except ImportError: CATALOG_IO_AVAILABLE = False -# JST timezone for display _JST = timezone(timedelta(hours=9)) _TASK_LIST_MAX_ROWS = 200 _TASK_LIST_SINCE_DAYS = 7 +st.set_page_config( + page_title="Evaluator Workflow", + layout="wide", + initial_sidebar_state="collapsed", +) 
+inject_app_page_styles() +inject_download_page_styles() + + +_user_config = UserConfig(warning_fn=st.warning) + + +def get_config_value(key: str, default=None): + return _user_config.get(key, default) + + +def set_config_value(key: str, value) -> None: + _user_config.set(key, value) + + def _to_jst(dt): if dt is None: return None @@ -60,34 +79,38 @@ def _to_jst(dt): def _load_catalog_presets(): - import os - import json - _APP_ROOT = Path(__file__).parent.parent - _CATALOGS_FILENAME = "catalogs.json" + app_root = Path(__file__).parent.parent + catalogs_filename = "catalogs.json" search_paths = [ - _APP_ROOT / _CATALOGS_FILENAME, + app_root / catalogs_filename, Path(os.environ.get("CATALOGS_PATH", "")), - Path.cwd() / _CATALOGS_FILENAME, + Path.cwd() / catalogs_filename, ] catalogs = [] loaded_path = None load_error = None - for p in search_paths: - if p.exists() and p.is_file(): + for path in search_paths: + if path.exists() and path.is_file(): try: - with open(p, "r", encoding="utf-8") as f: - data = json.load(f) - catalogs = data.get("catalogs", []) if isinstance(data, dict) else (data if isinstance(data, list) else []) - loaded_path = str(p) - load_error = None - break - except Exception as e: - load_error = str(e) + with open(path, "r", encoding="utf-8") as handle: + data = json.load(handle) + if isinstance(data, dict): + catalogs = data.get("catalogs", []) + elif isinstance(data, list): + catalogs = data + else: + catalogs = [] + loaded_path = str(path) + load_error = None + break + except Exception as exc: + load_error = str(exc) presets = [] - for c in catalogs: - if isinstance(c, dict): - name = c.get("display_name") or c.get("name") or c.get("catalog_id", "Unknown") - presets.append({**c, "display_name": name}) + for item in catalogs: + if not isinstance(item, dict): + continue + display_name = item.get("display_name") or item.get("name") or item.get("catalog_id", "Unknown") + presets.append({**item, "display_name": display_name}) return presets, 
loaded_path, load_error @@ -97,14 +120,19 @@ def _enqueue_task(task_type: str, params: dict) -> Optional[str]: task_id = create_task(task_type, params, session_id=session_id) if not task_id: return None + from redis import Redis from rq import Queue - import os + from worker.tasks import run_job + redis_url = os.environ.get("REDIS_URL", "redis://localhost:6379") redis_conn = Redis.from_url(redis_url) - q = Queue(name=os.environ.get("RQ_QUEUE", "default"), connection=redis_conn, default_timeout="7d") - from worker.tasks import run_job - job = q.enqueue( + queue = Queue( + name=os.environ.get("RQ_QUEUE", "default"), + connection=redis_conn, + default_timeout="7d", + ) + job = queue.enqueue( run_job, task_id, task_type, @@ -114,277 +142,436 @@ def _enqueue_task(task_type: str, params: dict) -> Optional[str]: ) rq_id = getattr(job, "id", None) if rq_id: - from lib.db import update_task_rq_job_id update_task_rq_job_id(task_id, str(rq_id)) return task_id - except Exception as e: - st.error(f"Failed to enqueue task: {e}") + except Exception as exc: + st.error(f"Failed to enqueue task: {exc}") return None -# Page config -st.set_page_config(page_title="Evaluator Workflow", layout="wide", initial_sidebar_state="expanded") -inject_app_page_styles() -inject_download_page_styles() +def _make_default_output_path(branch_name: str) -> str: + import re -# Load catalog presets -CATALOG_PRESETS, CATALOGS_PATH, catalog_load_error = _load_catalog_presets() -catalog_names = [c["display_name"] for c in CATALOG_PRESETS] + clean_branch = re.sub(r"[^\w]", "_", branch_name.strip("/")) if branch_name else "eval" + clean_branch = re.sub(r"_+", "_", clean_branch).strip("_") + ts = datetime.now().strftime("%Y%m%d_%H%M%S") + return f"eval_{clean_branch}_{ts}" -# ============================================ -# HERO -# ============================================ -render_page_hero( - kicker="Workflow automation", - title="Evaluator Workflow", - description="Schedule jobs, download results, and 
generate reports — all in one click", -) -# ============================================ -# SIDEBAR -# ============================================ -st.sidebar.markdown("### ⚙️ Configuration") +def _format_run_mtime(mtime: float) -> str: + if not mtime: + return "—" + try: + return datetime.fromtimestamp(mtime, tz=_JST).strftime("%Y-%m-%d %H:%M JST") + except Exception: + return "—" + + +def _build_overview_url(run_a: str, run_b: Optional[str] = None) -> str: + query = {"mode": "compare" if run_b else "single", "run_a": run_a} + if run_b: + query["run_b"] = run_b + return f"/?{urllib.parse.urlencode(query)}" + + +def _inject_workflow_page_styles() -> None: + st.markdown( + """ + + """, + unsafe_allow_html=True, + ) -eval_project_id = st.sidebar.text_input("Project ID", value=get_config_value("eval_project_id", "x2_dev")) -set_config_value("eval_project_id", eval_project_id) -if catalog_names: - selected_catalog_name = st.sidebar.selectbox("Catalog", options=catalog_names, index=0) - selected_catalog = next((c for c in CATALOG_PRESETS if c["display_name"] == selected_catalog_name), None) - if selected_catalog: - catalog_id = selected_catalog["catalog_id"] - integration_id = selected_catalog["integration_id"] - - # Display catalog info - st.sidebar.markdown("#### 📋 Catalog Info") - info_cols = st.sidebar.columns(2) - with info_cols[0]: - st.markdown(f"**ID:** `{catalog_id}`") - with info_cols[1]: - st.markdown(f"**Integration:** `{integration_id}`") - if selected_catalog.get("description"): - st.sidebar.markdown(f"📝 {selected_catalog['description']}") - if selected_catalog.get("tags"): - st.sidebar.markdown(f"🏷️ Tags: {', '.join(selected_catalog['tags'])}") -else: - catalog_id = None - integration_id = None - -with st.sidebar.expander("Manual override"): - manual_catalog_id = st.text_input("Catalog ID", value="") - manual_integration_id = st.text_input("Integration ID", value="") - if manual_catalog_id: - catalog_id = manual_catalog_id - if manual_integration_id: - 
integration_id = manual_integration_id - -target_name = st.sidebar.text_input("Branch or Tag", value=get_config_value("target_name", "beta/v4.3.2")) -set_config_value("target_name", target_name) - -# Auto-generate output folder based on branch name and timestamp -def _make_default_output_path(branch_name): - import re - clean_branch = re.sub(r'[^\w]', '_', branch_name.strip('/')) if branch_name else "eval" - clean_branch = re.sub(r'_+', '_', clean_branch).strip('_') - ts = datetime.now().strftime("%Y%m%d_%H%M%S") - return f"eval_{clean_branch}_{ts}" +def _render_local_run_card(run: Dict[str, object]) -> None: + name = html.escape(str(run["name"])) + rel_path = html.escape(str(run["path_display"])) + modified = html.escape(str(run["modified"])) + size = html.escape(str(run["size"])) + flags = [ + ("Summary", bool(run["has_summary"])), + ("Score", bool(run["has_score"])), + ("Parquet", bool(run["has_parquet"])), + ] + flag_html = "".join( + f'{label}' + for label, enabled in flags + ) + st.markdown( + f""" +
+
+
+
{name}
+
{rel_path}
+
+
+ {modified} + last updated +
+
+ {size} + disk usage +
+
+
{flag_html}
+
+
+
+ """, + unsafe_allow_html=True, + ) -# Always auto-generate fresh output path -eval_output_path = st.sidebar.text_input("📁 Output folder", value=_make_default_output_path(target_name), key="eval_output_path") -eval_download_type = get_config_value("eval_download_type", "Archives (ZIP)") -eval_phase = get_config_value("eval_phase", "perception.object_recognition.tracking.objects") -poll_interval = int(get_config_value("poll_interval", 60)) -max_wait_hours = int(get_config_value("max_wait_hours", 24)) -environment = get_config_value("environment", "") -configure_recent_evaluator_jobs_ui( - get_config_value=get_config_value, - set_config_value=set_config_value, - enqueue_task=_enqueue_task, - catalog_io_available=CATALOG_IO_AVAILABLE, - environment=environment, -) +def _render_local_runs_section() -> None: + section_header( + "Local Runs", + f"Finished runs already stored under `{get_data_root_display()}/`. Search, browse, and pick two to compare.", + ) + run_dirs = list_run_directories() + if not run_dirs: + st.markdown('
No finished runs were found on this server yet.
', unsafe_allow_html=True) + return + + runs: List[Dict[str, object]] = [] + for run_path in run_dirs: + info = get_run_info(run_path) + runs.append( + { + "name": info["name"], + "path_display": f"{get_data_root_display()}/{info['name']}", + "size": format_size(info["size_bytes"]), + "mtime": float(info["mtime"] or 0), + "modified": _format_run_mtime(info["mtime"]), + "has_summary": bool(info["has_summary"]), + "has_score": bool(info["has_score"]), + "has_parquet": bool(info["has_parquet"]), + } + ) + runs.sort(key=lambda row: (-float(row["mtime"]), str(row["name"]).lower())) + + control_cols = st.columns([1.7, 0.8, 0.8, 0.55]) + with control_cols[0]: + st.markdown('
Search
', unsafe_allow_html=True) + run_search = st.text_input( + "Search runs", + value=st.session_state.get("workflow_runs_search", ""), + key="workflow_runs_search", + label_visibility="collapsed", + placeholder="Filter by run name", + ).strip().lower() + with control_cols[1]: + st.markdown('
Summary
', unsafe_allow_html=True) + require_summary = st.selectbox( + "Require summary", + options=["Any", "Yes", "No"], + index=0, + key="workflow_runs_summary_filter", + label_visibility="collapsed", + ) + with control_cols[2]: + st.markdown('
Parquet
', unsafe_allow_html=True) + require_parquet = st.selectbox( + "Require parquet", + options=["Any", "Yes", "No"], + index=0, + key="workflow_runs_parquet_filter", + label_visibility="collapsed", + ) + with control_cols[3]: + st.markdown('
Rows
', unsafe_allow_html=True) + page_size = int( + st.selectbox( + "Rows per page", + options=[6, 10, 14, 20], + index=1, + key="workflow_runs_page_size", + label_visibility="collapsed", + ) + ) -with st.sidebar.expander("Advanced"): - eval_download_type = st.radio("Download", ["Archives (ZIP)", "Result JSON"], index=0, horizontal=True) - set_config_value("eval_download_type", eval_download_type) - if eval_download_type == "Archives (ZIP)": - eval_phase = st.text_input("Phase", value=eval_phase) - set_config_value("eval_phase", eval_phase) - poll_interval = st.slider("Poll interval (s)", 10, 300, poll_interval, step=10) - set_config_value("poll_interval", poll_interval) - max_wait_hours = st.slider("Max wait (h)", 1, 168, max_wait_hours) - set_config_value("max_wait_hours", max_wait_hours) + filtered = runs + if run_search: + filtered = [row for row in filtered if run_search in str(row["name"]).lower()] + if require_summary != "Any": + want = require_summary == "Yes" + filtered = [row for row in filtered if bool(row["has_summary"]) == want] + if require_parquet != "Any": + want = require_parquet == "Yes" + filtered = [row for row in filtered if bool(row["has_parquet"]) == want] + + run_names = [str(row["name"]) for row in filtered] + compare_ready = [ + str(row["name"]) + for row in filtered + if bool(row["has_summary"]) or bool(row["has_score"]) or bool(row["has_parquet"]) + ] + if "workflow_compare_run_a" not in st.session_state: + st.session_state["workflow_compare_run_a"] = compare_ready[0] if compare_ready else "" + if "workflow_compare_run_b" not in st.session_state: + st.session_state["workflow_compare_run_b"] = compare_ready[1] if len(compare_ready) > 1 else "" + + st.markdown('
', unsafe_allow_html=True) + st.markdown('

Quick compare tray

', unsafe_allow_html=True) + compare_cols = st.columns([1.35, 1.35, 0.95, 0.95]) + with compare_cols[0]: + st.markdown('
Baseline A
', unsafe_allow_html=True) + run_a = st.selectbox( + "Baseline A", + options=[""] + compare_ready, + index=([""] + compare_ready).index(st.session_state.get("workflow_compare_run_a", "")) + if st.session_state.get("workflow_compare_run_a", "") in compare_ready + else 0, + key="workflow_compare_run_a", + label_visibility="collapsed", + ) + with compare_cols[1]: + st.markdown('
Candidate B
', unsafe_allow_html=True) + run_b_options = [""] + [name for name in compare_ready if name != run_a] + current_b = st.session_state.get("workflow_compare_run_b", "") + st.selectbox( + "Candidate B", + options=run_b_options, + index=run_b_options.index(current_b) if current_b in run_b_options else 0, + key="workflow_compare_run_b", + label_visibility="collapsed", + ) + run_b = st.session_state.get("workflow_compare_run_b", "") + with compare_cols[2]: + st.markdown('
Single
', unsafe_allow_html=True) + if run_a: + st.link_button("Open run", _build_overview_url(run_a), use_container_width=True) + else: + st.button("Open run", disabled=True, use_container_width=True, key="workflow_open_run_disabled") + with compare_cols[3]: + st.markdown('
Compare
', unsafe_allow_html=True) + if run_a and run_b: + st.link_button("Compare", _build_overview_url(run_a, run_b), use_container_width=True) + else: + st.button("Compare", disabled=True, use_container_width=True, key="workflow_compare_run_disabled") + st.markdown("
", unsafe_allow_html=True) + + if not filtered: + st.markdown('
No local runs matched the current filters.
', unsafe_allow_html=True) + return + + page_key = "workflow_runs_page" + current_page = max(1, int(st.session_state.get(page_key, 1))) + page_count = max(1, (len(filtered) + page_size - 1) // page_size) + if current_page > page_count: + current_page = page_count + st.session_state[page_key] = current_page + start_idx = (current_page - 1) * page_size + visible_runs = filtered[start_idx:start_idx + page_size] + + pager_cols = st.columns([0.7, 0.8, 0.8, 0.8, 5.9]) + with pager_cols[0]: + if st.button("‹", key="workflow_runs_prev", use_container_width=True, disabled=current_page <= 1): + st.session_state[page_key] = current_page - 1 + st.rerun() + page_numbers = ( + list(range(1, min(3, page_count) + 1)) + if current_page == 1 + else list(range(max(1, current_page - 1), min(page_count, current_page + 1) + 1)) + ) + for idx, page_num in enumerate(page_numbers[:3], start=1): + with pager_cols[idx]: + if st.button( + str(page_num), + key=f"workflow_runs_page_{page_num}", + use_container_width=True, + disabled=page_num == current_page, + ): + st.session_state[page_key] = page_num + st.rerun() + with pager_cols[4]: + if st.button("›", key="workflow_runs_next", use_container_width=True, disabled=current_page >= page_count): + st.session_state[page_key] = current_page + 1 + st.rerun() -# ============================================ -# MAIN CONTENT -# ============================================ - -# Validation -validation_errors = [] -if not eval_project_id: - validation_errors.append("Project ID") -if not catalog_id: - validation_errors.append("Catalog ID") -if not integration_id: - validation_errors.append("Integration ID") -if not target_name: - validation_errors.append("Target") - -if validation_errors: - for err in validation_errors: - st.error(f"❌ {err}") - st.stop() - -resolved_output, path_err = resolve_under_data_root(eval_output_path, allow_create=True) -if path_err: - st.error(f"❌ {path_err}") - st.stop() -resolved_path_str = str(resolved_output) -max_wait_seconds 
= max_wait_hours * 3600 - -# Pipeline visualization -st.markdown(""" - -
-
1
📤 Schedule
-
-
2
⏳ Wait
-
-
3
📥 Download
-
-
4
📊 Evaluate
-
-
5
📦 Parquet
-
-""", unsafe_allow_html=True) - -# Options -col1, col2, col3 = st.columns(3) -with col1: - eval_run_eval = st.checkbox("📊 Run Evaluation", value=True) -with col2: - eval_generate_parquet = st.checkbox("📦 Generate Parquet", value=CATALOG_IO_AVAILABLE, disabled=not CATALOG_IO_AVAILABLE) -with col3: - eval_recursive = st.checkbox("🔍 Recursive Scan", value=True) - -# START BUTTON -st.markdown("
", unsafe_allow_html=True) -st.markdown(""" - -""", unsafe_allow_html=True) -clicked = st.button("🚀 Start Evaluator Workflow", type="primary", use_container_width=True) - -if clicked: + st.markdown('
', unsafe_allow_html=True) + for run in visible_runs: + row_cols = st.columns([8.9, 2.6]) + with row_cols[0]: + _render_local_run_card(run) + with row_cols[1]: + action_cols = st.columns([1.0, 1.0, 1.0], gap="small") + with action_cols[0]: + st.link_button( + "Open", + _build_overview_url(str(run["name"])), + use_container_width=True, + ) + with action_cols[1]: + if st.button(f"A", key=f"workflow_pick_a_{run['name']}", use_container_width=True): + st.session_state["workflow_compare_run_a"] = str(run["name"]) + st.rerun() + with action_cols[2]: + if st.button(f"B", key=f"workflow_pick_b_{run['name']}", use_container_width=True): + st.session_state["workflow_compare_run_b"] = str(run["name"]) + st.rerun() + st.markdown("
", unsafe_allow_html=True) + + +def _render_current_tasks_section() -> None: + section_header( + "Current Tasks", + "Jobs queued or running on this server, with recent history folded underneath.", + ) if not is_task_queue_enabled(): - st.error("❌ Task queue not enabled. Set `USE_TASK_QUEUE=true` and `REDIS_URL`.") - st.stop() - - task_id = _enqueue_task("run_evaluator_and_process", { - "project_id": eval_project_id, - "catalog_id": catalog_id, - "integration_id": integration_id, - "suite_ids": None, - "target_name": target_name, - "description": f"Eval {datetime.now().strftime('%Y-%m-%d %H:%M')}", - "output_path": resolved_path_str, - "environment": environment, - "max_retries": 0, - "clean_build": False, - "debug": False, - "is_tag": False, - "download_type": "archives" if eval_download_type == "Archives (ZIP)" else "result_json", - "phase": eval_phase, - "skip_large_file": False, - "large_file_mb": 50.0, - "keep_zip_files": False, - "poll_interval": poll_interval, - "max_wait_seconds": max_wait_seconds, - "run_eval": eval_run_eval, - "generate_parquet": eval_generate_parquet, - "eval_recursive": eval_recursive, - "eval_overwrite": False, - }) - - if task_id: - st.success(f"✅ Workflow queued! Task: `{task_id[:24]}...`") - st.info("💡 Running in background — close browser, check Task Status below.") - else: - st.error("❌ Failed to enqueue task. Check worker logs.") + st.info("Task queue not enabled. Set `USE_TASK_QUEUE=true` to track background tasks.") + return -_render_recent_evaluator_jobs_section( - eval_project_id, - environment, - output_path_default=eval_output_path, - download_type_default=eval_download_type, - phase_default=eval_phase, - skip_large_file_default=False, - large_file_mb_default=50.0, - keep_zip_files_default=False, -) - -# ============================================ -# TASK STATUS -# ============================================ -if not is_task_queue_enabled(): - st.info("Task queue not enabled. 
Set `USE_TASK_QUEUE=true` to track background tasks.") -else: current_user = get_task_list_current_user() - render_download_task_section_header( - since_days=_TASK_LIST_SINCE_DAYS, - max_rows=_TASK_LIST_MAX_ROWS, - ) use_fragment = getattr(st, "fragment", None) is not None if use_fragment: try: + @st.fragment(run_every=timedelta(seconds=3)) def _task_list_poll(): current_tasks = list_recent_tasks( @@ -393,20 +580,318 @@ def _task_list_poll(): since_days=_TASK_LIST_SINCE_DAYS, ) render_task_list(current_tasks, current_user) + _task_list_poll() + return except (TypeError, AttributeError): use_fragment = False - if not use_fragment: - tasks = list_recent_tasks( - limit=_TASK_LIST_MAX_ROWS, - session_id=current_user, - since_days=_TASK_LIST_SINCE_DAYS, + + tasks = list_recent_tasks( + limit=_TASK_LIST_MAX_ROWS, + session_id=current_user, + since_days=_TASK_LIST_SINCE_DAYS, + ) + has_active = render_task_list(tasks, current_user) + if st.button("Refresh tasks", key="workflow_refresh_tasks"): + st.rerun() + if has_active: + st.caption("Active jobs are shown live when possible. 
Use refresh if this browser does not support fragments.") + + +def _render_start_workflow_section( + catalog_presets: List[Dict[str, str]], + catalogs_path: Optional[str], + catalog_load_error: Optional[str], +) -> Dict[str, object]: + section_header( + "Start Workflow", + "Schedule a fresh evaluator run here, or use the recent evaluator jobs browser below to run Download + Eval from an existing report.", + ) + + if catalog_load_error: + st.warning(f"Could not read catalog presets: {catalog_load_error}") + elif catalogs_path: + st.caption(f"Catalog presets loaded from `{catalogs_path}`.") + + catalog_names = [item["display_name"] for item in catalog_presets] + default_project = get_config_value("eval_project_id", "x2_dev") + default_target = get_config_value("target_name", "beta/v4.3.2") + default_download_type = get_config_value("eval_download_type", "Archives (ZIP)") + default_phase = get_config_value( + "eval_phase", + "perception.object_recognition.tracking.objects", + ) + default_poll_interval = int(get_config_value("poll_interval", 60)) + default_max_wait_hours = int(get_config_value("max_wait_hours", 24)) + default_environment = get_config_value("environment", "") + default_output = get_config_value("eval_output_path", _make_default_output_path(default_target)) + + top_cols = st.columns([1.0, 1.5, 1.2]) + with top_cols[0]: + st.markdown('
Project
', unsafe_allow_html=True) + project_id = st.text_input( + "Project ID", + value=default_project, + key="workflow_project_id", + label_visibility="collapsed", + ).strip() + with top_cols[1]: + st.markdown('
Catalog
', unsafe_allow_html=True) + selected_catalog_name = st.selectbox( + "Catalog", + options=catalog_names if catalog_names else ["No catalog presets"], + index=0, + key="workflow_catalog_name", + label_visibility="collapsed", ) - has_active = render_task_list(tasks, current_user) - if st.button("Refresh task list", key="refresh_tasks_workflow"): - st.rerun() - if has_active: - st.info("You have running tasks. Refresh the page to see latest status and logs.") + selected_catalog = next( + (item for item in catalog_presets if item["display_name"] == selected_catalog_name), + None, + ) + with top_cols[2]: + st.markdown('
Branch or tag
', unsafe_allow_html=True) + target_name = st.text_input( + "Branch or Tag", + value=default_target, + key="workflow_target_name", + label_visibility="collapsed", + placeholder="beta/v4.3.2", + ).strip() + + detail_cols = st.columns([1.25, 0.8, 0.95]) + with detail_cols[0]: + st.markdown('
Output folder
', unsafe_allow_html=True) + output_path = st.text_input( + "Output folder", + value=default_output, + key="workflow_output_path", + label_visibility="collapsed", + placeholder=_make_default_output_path(target_name), + ).strip() + with detail_cols[1]: + st.markdown('
Environment
', unsafe_allow_html=True) + environment = st.selectbox( + "Environment", + options=["", "dev", "stg", "prd"], + index=["", "dev", "stg", "prd"].index(default_environment) if default_environment in ("", "dev", "stg", "prd") else 0, + key="workflow_environment", + label_visibility="collapsed", + format_func=lambda value: value or "default", + ) + with detail_cols[2]: + st.markdown('
Description
', unsafe_allow_html=True) + description = st.text_input( + "Description", + value=get_config_value("workflow_description", ""), + key="workflow_description", + label_visibility="collapsed", + placeholder="Optional label for the evaluator run", + ).strip() -st.sidebar.divider() -st.sidebar.caption("💡 Runs async — close browser safely") + if selected_catalog: + info_cols = st.columns([1.2, 1.15, 2.2]) + with info_cols[0]: + st.markdown(f'
Catalog ID
{html.escape(str(selected_catalog.get("catalog_id", "—")))}
', unsafe_allow_html=True) + with info_cols[1]: + st.markdown(f'
Integration
{html.escape(str(selected_catalog.get("integration_id", "—")))}
', unsafe_allow_html=True) + with info_cols[2]: + desc = str(selected_catalog.get("description") or "").strip() or "Preset selected for quick scheduling." + st.markdown(f'
Preset
{html.escape(desc)}
', unsafe_allow_html=True) + + with st.expander("Advanced options", expanded=False): + adv_cols = st.columns([1.0, 1.2, 0.8, 0.8]) + with adv_cols[0]: + download_type = st.radio( + "Download type", + ["Archives (ZIP)", "Result JSON"], + horizontal=True, + index=0 if default_download_type == "Archives (ZIP)" else 1, + key="workflow_download_type", + ) + with adv_cols[1]: + phase = st.text_input( + "Phase", + value=default_phase, + key="workflow_phase", + disabled=download_type != "Archives (ZIP)", + ) + with adv_cols[2]: + poll_interval = st.slider( + "Poll interval (s)", + min_value=10, + max_value=300, + value=default_poll_interval, + step=10, + key="workflow_poll_interval", + ) + with adv_cols[3]: + max_wait_hours = st.slider( + "Max wait (h)", + min_value=1, + max_value=168, + value=default_max_wait_hours, + key="workflow_max_wait_hours", + ) + + option_cols = st.columns(4) + with option_cols[0]: + run_eval = st.checkbox("Run evaluation", value=True, key="workflow_run_eval") + with option_cols[1]: + generate_parquet = st.checkbox( + "Generate parquet", + value=CATALOG_IO_AVAILABLE, + disabled=not CATALOG_IO_AVAILABLE, + key="workflow_generate_parquet", + ) + with option_cols[2]: + eval_recursive = st.checkbox("Recursive scan", value=True, key="workflow_eval_recursive") + with option_cols[3]: + is_tag = st.checkbox("Target is tag", value=False, key="workflow_is_tag") + + start_cols = st.columns([1.45, 0.95]) + with start_cols[0]: + st.markdown('
', unsafe_allow_html=True) + st.markdown('

Schedule evaluator + download + eval

', unsafe_allow_html=True) + st.markdown( + '

This starts the same background pipeline as the previous workflow launcher, but keeps the controls on-page. Output path, evaluator polling, and eval/parquet behavior are all preserved.

', + unsafe_allow_html=True, + ) + start_clicked = st.button( + "Start evaluator workflow", + key="workflow_start_btn", + type="primary", + use_container_width=True, + ) + st.markdown("
", unsafe_allow_html=True) + with start_cols[1]: + st.markdown( + """ +
+ Already have an evaluator report? +

Use the recent evaluator jobs section right below. Every row can open details or run Download + Eval + Parquet directly, using the same defaults configured on this page.

+
+ """, + unsafe_allow_html=True, + ) + + set_config_value("eval_project_id", project_id) + set_config_value("target_name", target_name) + set_config_value("eval_output_path", output_path) + set_config_value("eval_download_type", download_type) + set_config_value("eval_phase", phase) + set_config_value("poll_interval", poll_interval) + set_config_value("max_wait_hours", max_wait_hours) + set_config_value("environment", environment) + set_config_value("workflow_description", description) + + catalog_id = str((selected_catalog or {}).get("catalog_id") or "").strip() + integration_id = str((selected_catalog or {}).get("integration_id") or "").strip() + errors = [] + if not project_id: + errors.append("Project ID") + if not catalog_id: + errors.append("Catalog") + if not integration_id: + errors.append("Integration ID") + if not target_name: + errors.append("Branch or tag") + + resolved_output = None + path_error = "" + if output_path: + resolved_output, path_error = resolve_under_data_root(output_path, allow_create=True) + if path_error: + errors.append(path_error) + else: + errors.append("Output folder") + + if start_clicked: + if errors: + for err in errors: + st.error(f"Missing or invalid: {err}") + elif not is_task_queue_enabled(): + st.error("Task queue not enabled. 
Set `USE_TASK_QUEUE=true` and `REDIS_URL`.") + else: + task_id = _enqueue_task( + "run_evaluator_and_process", + { + "project_id": project_id, + "catalog_id": catalog_id, + "integration_id": integration_id, + "suite_ids": None, + "target_name": target_name, + "description": description or f"Eval {datetime.now().strftime('%Y-%m-%d %H:%M')}", + "output_path": str(resolved_output), + "environment": environment, + "max_retries": 0, + "clean_build": False, + "debug": False, + "is_tag": is_tag, + "download_type": "archives" if download_type == "Archives (ZIP)" else "result_json", + "phase": phase, + "skip_large_file": False, + "large_file_mb": 50.0, + "keep_zip_files": False, + "poll_interval": int(poll_interval), + "max_wait_seconds": int(max_wait_hours) * 3600, + "run_eval": bool(run_eval), + "generate_parquet": bool(generate_parquet), + "eval_recursive": bool(eval_recursive), + "eval_overwrite": False, + }, + ) + if task_id: + st.success(f"Workflow queued. Task id: `{task_id}`") + else: + st.error("Failed to enqueue task. 
Check worker logs.") + + return { + "project_id": project_id, + "environment": environment, + "output_path_default": output_path or _make_default_output_path(target_name), + "download_type_default": download_type, + "phase_default": phase, + "skip_large_file_default": False, + "large_file_mb_default": 50.0, + "keep_zip_files_default": False, + } + + +_inject_workflow_page_styles() +render_page_hero( + kicker="Workflow automation", + title="Evaluator Workflow", + description="Browse finished runs, watch background tasks, launch fresh evaluator pipelines, and reuse existing evaluator reports from one aligned workspace.", +) + +catalog_presets, catalogs_path, catalog_load_error = _load_catalog_presets() + +_render_local_runs_section() +_render_current_tasks_section() +start_defaults = _render_start_workflow_section(catalog_presets, catalogs_path, catalog_load_error) + +configure_recent_evaluator_jobs_ui( + get_config_value=get_config_value, + set_config_value=set_config_value, + enqueue_task=_enqueue_task, + catalog_io_available=CATALOG_IO_AVAILABLE, + environment=str(start_defaults["environment"] or ""), +) + +section_header( + "Recent Evaluator Jobs", + "Direct evaluator browser for starting Download + Eval from existing reports. 
Shown by default here so the existing-job path is one click away.", +) +_render_recent_evaluator_jobs_section( + str(start_defaults["project_id"] or ""), + str(start_defaults["environment"] or ""), + output_path_default=str(start_defaults["output_path_default"]), + download_type_default=str(start_defaults["download_type_default"]), + phase_default=str(start_defaults["phase_default"]), + skip_large_file_default=bool(start_defaults["skip_large_file_default"]), + large_file_mb_default=float(start_defaults["large_file_mb_default"]), + keep_zip_files_default=bool(start_defaults["keep_zip_files_default"]), + show_toggle=False, + default_visible=True, + show_title=False, +) From 767e4f50bd149e8ab8afb5213ac9be74b0782a1f Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Thu, 14 May 2026 16:56:40 +0900 Subject: [PATCH 65/94] feat: enhance authentication handling and user identity extraction - Introduced new functions for reading Streamlit headers and extracting user identity from bearer tokens, improving authentication capabilities. - Added JWT payload decoding functionality for better display of user information without signature verification. - Updated the Evaluator Workflow page to include a new method for resolving subject names, integrating external authentication services for enhanced user data retrieval. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/lib/auth.py | 134 ++++++++++++++++-- .../pages/7_Evaluator_Workflow.py | 34 +++++ 2 files changed, 154 insertions(+), 14 deletions(-) diff --git a/evaluation_dashboard_app/lib/auth.py b/evaluation_dashboard_app/lib/auth.py index b15d5dc..b3e2ac6 100644 --- a/evaluation_dashboard_app/lib/auth.py +++ b/evaluation_dashboard_app/lib/auth.py @@ -4,8 +4,10 @@ a header with the user identity. When enabled, users see only their own tasks. """ +import base64 +import json import os -from typing import Optional +from typing import Any, Dict, Optional # Header name set by auth proxy (e.g. 
X-Forwarded-User, X-Auth-User). Empty = no auth filtering. AUTH_USER_HEADER = os.environ.get("AUTH_USER_HEADER", "").strip() @@ -14,6 +16,75 @@ AUTH_DEFAULT_USER = os.environ.get("AUTH_DEFAULT_USER", "").strip() or None +def _read_streamlit_headers() -> Dict[str, str]: + """Best-effort request headers from Streamlit context.""" + try: + import streamlit as st + + ctx = getattr(st, "context", None) + headers = getattr(ctx, "headers", None) if ctx else None + if callable(headers): + headers = headers() + if isinstance(headers, dict): + normalized: Dict[str, str] = {} + for key, value in headers.items(): + if not isinstance(key, str): + continue + normalized[key] = str(value) + return normalized + except Exception: + pass + return {} + + +def _decode_jwt_payload(token: str) -> Dict[str, Any]: + """Best-effort JWT payload decode without signature verification, for display only.""" + raw = str(token or "").strip() + if not raw: + return {} + parts = raw.split(".") + if len(parts) < 2: + return {} + payload = parts[1] + padding = "=" * (-len(payload) % 4) + try: + decoded = base64.urlsafe_b64decode(payload + padding) + data = json.loads(decoded.decode("utf-8")) + return data if isinstance(data, dict) else {} + except Exception: + return {} + + +def _extract_identity_from_bearer_token(headers: Dict[str, str]) -> Dict[str, Any]: + """Extract subject / email / display name from an Oathkeeper-style bearer token.""" + authz = str(headers.get("Authorization") or headers.get("authorization") or "").strip() + if not authz.lower().startswith("bearer "): + return {} + token = authz.split(" ", 1)[1].strip() + payload = _decode_jwt_payload(token) + if not payload: + return {} + + session = payload.get("session") or {} + identity = session.get("identity") or {} + traits = identity.get("traits") or {} + name = traits.get("name") or {} + full_name = " ".join( + part for part in [str(name.get("first") or "").strip(), str(name.get("last") or "").strip()] if part + ).strip() + 
display_name = ( + full_name + or str(traits.get("display_name") or "").strip() + or str(traits.get("email") or "").strip() + ) + return { + "subject_id": str(payload.get("sub") or session.get("account", {}).get("subject_id") or "").strip(), + "email": str(traits.get("email") or "").strip(), + "name": display_name, + "claims": payload, + } + + def get_current_user_id() -> Optional[str]: """ Return the current user identifier, or None if auth is not configured. @@ -24,22 +95,57 @@ def get_current_user_id() -> Optional[str]: """ if not AUTH_USER_HEADER and not AUTH_DEFAULT_USER: return None - # Try to read header (Streamlit 1.37+) - try: - import streamlit as st - ctx = getattr(st, "context", None) - headers = getattr(ctx, "headers", None) if ctx else None - if callable(headers): - headers = headers() - if isinstance(headers, dict): - value = headers.get(AUTH_USER_HEADER) or headers.get(AUTH_USER_HEADER.lower()) - if value and isinstance(value, str) and value.strip(): - return value.strip() - except Exception: - pass + headers = _read_streamlit_headers() + value = headers.get(AUTH_USER_HEADER) or headers.get(AUTH_USER_HEADER.lower()) + if value and isinstance(value, str) and value.strip(): + return value.strip() return AUTH_DEFAULT_USER def is_auth_enabled() -> bool: """True if AUTH_USER_HEADER or AUTH_DEFAULT_USER is set (per-user task filtering).""" return bool(AUTH_USER_HEADER or AUTH_DEFAULT_USER) + + +def get_current_user_session_info() -> Dict[str, Any]: + """ + Return best-effort request/session auth info for UI debugging. + + This reflects what the Streamlit app can observe from the incoming request, + not the evaluator token used by background workers. 
+ """ + headers = _read_streamlit_headers() + configured_value = "" + configured_source = "unavailable" + if AUTH_USER_HEADER: + raw_value = headers.get(AUTH_USER_HEADER) or headers.get(AUTH_USER_HEADER.lower()) or "" + configured_value = str(raw_value).strip() + if configured_value: + configured_source = f"header:{AUTH_USER_HEADER}" + else: + configured_source = f"header:{AUTH_USER_HEADER} (missing)" + if not configured_value and AUTH_DEFAULT_USER: + configured_value = AUTH_DEFAULT_USER + configured_source = "AUTH_DEFAULT_USER" + + authz = headers.get("Authorization") or headers.get("authorization") or "" + cookie = headers.get("Cookie") or headers.get("cookie") or "" + bearer_identity = _extract_identity_from_bearer_token(headers) + if not configured_value and bearer_identity.get("subject_id"): + configured_value = str(bearer_identity.get("subject_id") or "").strip() + configured_source = "authorization:bearer" + safe_header_keys = sorted( + key for key in headers.keys() if key.lower() not in {"authorization", "cookie"} + ) + return { + "user_id": configured_value or None, + "source": configured_source, + "auth_user_header": AUTH_USER_HEADER or "", + "default_user": AUTH_DEFAULT_USER, + "has_authorization_header": bool(str(authz).strip()), + "has_cookie_header": bool(str(cookie).strip()), + "header_keys": safe_header_keys, + "bearer_subject_id": str(bearer_identity.get("subject_id") or "").strip(), + "bearer_email": str(bearer_identity.get("email") or "").strip(), + "bearer_name": str(bearer_identity.get("name") or "").strip(), + } diff --git a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py b/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py index 5c9048c..77bdd98 100644 --- a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py +++ b/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py @@ -17,6 +17,7 @@ from typing import Dict, List, Optional import streamlit as st +import requests from lib.db import create_task, is_task_queue_enabled, 
list_recent_tasks, update_task_rq_job_id from lib.page_chrome import inject_app_page_styles, render_page_hero, section_header @@ -174,6 +175,39 @@ def _build_overview_url(run_a: str, run_b: Optional[str] = None) -> str: return f"/?{urllib.parse.urlencode(query)}" +@st.cache_data(ttl=24 * 3600, show_spinner=False) +def _resolve_subject_name(subject_id: str, environment: str) -> Dict[str, str]: + subject = str(subject_id or "").strip() + if not subject or not subject.startswith("t4:"): + return {"subject_id": subject, "name": subject, "email": ""} + org_id = os.environ.get( + "WEBAUTO_ORGANIZATION_ID", + "5a21621d-6968-4f7d-94f8-99cfb77b6e71", + ).strip() + if not org_id: + return {"subject_id": subject, "name": subject, "email": ""} + os.environ["AUTH_PROFILE"] = environment or "default" + from webautoauth.token import HttpService, TokenSource, load_config + + config = load_config() + token_source = TokenSource(HttpService(config)) + access_token = token_source.get_token().access_token + quoted_subject = urllib.parse.quote(subject, safe="") + url = f"https://auth.web.auto/v2/organizations/{org_id}/members/{quoted_subject}" + response = requests.get( + url, + headers={"Authorization": f"Bearer {access_token}", "accept": "application/json"}, + timeout=10, + ) + response.raise_for_status() + data = response.json() + return { + "subject_id": str(data.get("subject_id") or subject).strip(), + "name": str(data.get("name") or subject).strip(), + "email": str(data.get("email") or "").strip(), + } + + def _inject_workflow_page_styles() -> None: st.markdown( """ From 6223fa03a538f38a3eac48016587267326462f5e Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Thu, 14 May 2026 17:18:50 +0900 Subject: [PATCH 66/94] feat: update recent evaluator jobs UI and enhance local runs section - Modified the options for the recent evaluator jobs limit selection to include 10, 20, 50, and 100 rows for better user flexibility. 
- Removed the recent searches dropdown to streamline the search experience, focusing on a more efficient job search process. - Introduced a new function to load local runs, improving the organization and display of finished runs with enhanced sorting and metadata. - Updated CSS styles for better visual representation of local runs, including new styles for run rows and launcher components. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/lib/auth.py | 91 +-- .../lib/ui/recent_evaluator_jobs.py | 20 +- .../lib/ui/task_history.py | 3 - evaluation_dashboard_app/pages/6_Download.py | 4 +- .../pages/7_Evaluator_Workflow.py | 749 ++++++++++-------- 5 files changed, 455 insertions(+), 412 deletions(-) diff --git a/evaluation_dashboard_app/lib/auth.py b/evaluation_dashboard_app/lib/auth.py index b3e2ac6..c29c6eb 100644 --- a/evaluation_dashboard_app/lib/auth.py +++ b/evaluation_dashboard_app/lib/auth.py @@ -16,6 +16,15 @@ AUTH_DEFAULT_USER = os.environ.get("AUTH_DEFAULT_USER", "").strip() or None +def _first_nonempty_string(*values: Any) -> str: + """Return the first non-empty string-like value, else empty string.""" + for value in values: + text = str(value or "").strip() + if text: + return text + return "" + + def _read_streamlit_headers() -> Dict[str, str]: """Best-effort request headers from Streamlit context.""" try: @@ -56,7 +65,7 @@ def _decode_jwt_payload(token: str) -> Dict[str, Any]: def _extract_identity_from_bearer_token(headers: Dict[str, str]) -> Dict[str, Any]: - """Extract subject / email / display name from an Oathkeeper-style bearer token.""" + """Extract subject / email / username / display name from common bearer token claims.""" authz = str(headers.get("Authorization") or headers.get("authorization") or "").strip() if not authz.lower().startswith("bearer "): return {} @@ -69,17 +78,41 @@ def _extract_identity_from_bearer_token(headers: Dict[str, str]) -> Dict[str, An identity = session.get("identity") or {} 
traits = identity.get("traits") or {} name = traits.get("name") or {} + oauth_username = _first_nonempty_string( + payload.get("preferred_username"), + payload.get("username"), + payload.get("upn"), + payload.get("unique_name"), + payload.get("cognito:username"), + traits.get("username"), + identity.get("username"), + ) full_name = " ".join( part for part in [str(name.get("first") or "").strip(), str(name.get("last") or "").strip()] if part ).strip() - display_name = ( - full_name - or str(traits.get("display_name") or "").strip() - or str(traits.get("email") or "").strip() + display_name = _first_nonempty_string( + payload.get("name"), + full_name, + traits.get("display_name"), + identity.get("display_name"), + oauth_username, + traits.get("email"), + ) + email = _first_nonempty_string( + payload.get("email"), + payload.get("upn"), + traits.get("email"), + identity.get("email"), + ) + subject_id = _first_nonempty_string( + payload.get("sub"), + session.get("account", {}).get("subject_id"), + identity.get("id"), ) return { - "subject_id": str(payload.get("sub") or session.get("account", {}).get("subject_id") or "").strip(), - "email": str(traits.get("email") or "").strip(), + "subject_id": subject_id, + "email": email, + "username": oauth_username, "name": display_name, "claims": payload, } @@ -105,47 +138,3 @@ def get_current_user_id() -> Optional[str]: def is_auth_enabled() -> bool: """True if AUTH_USER_HEADER or AUTH_DEFAULT_USER is set (per-user task filtering).""" return bool(AUTH_USER_HEADER or AUTH_DEFAULT_USER) - - -def get_current_user_session_info() -> Dict[str, Any]: - """ - Return best-effort request/session auth info for UI debugging. - - This reflects what the Streamlit app can observe from the incoming request, - not the evaluator token used by background workers. 
- """ - headers = _read_streamlit_headers() - configured_value = "" - configured_source = "unavailable" - if AUTH_USER_HEADER: - raw_value = headers.get(AUTH_USER_HEADER) or headers.get(AUTH_USER_HEADER.lower()) or "" - configured_value = str(raw_value).strip() - if configured_value: - configured_source = f"header:{AUTH_USER_HEADER}" - else: - configured_source = f"header:{AUTH_USER_HEADER} (missing)" - if not configured_value and AUTH_DEFAULT_USER: - configured_value = AUTH_DEFAULT_USER - configured_source = "AUTH_DEFAULT_USER" - - authz = headers.get("Authorization") or headers.get("authorization") or "" - cookie = headers.get("Cookie") or headers.get("cookie") or "" - bearer_identity = _extract_identity_from_bearer_token(headers) - if not configured_value and bearer_identity.get("subject_id"): - configured_value = str(bearer_identity.get("subject_id") or "").strip() - configured_source = "authorization:bearer" - safe_header_keys = sorted( - key for key in headers.keys() if key.lower() not in {"authorization", "cookie"} - ) - return { - "user_id": configured_value or None, - "source": configured_source, - "auth_user_header": AUTH_USER_HEADER or "", - "default_user": AUTH_DEFAULT_USER, - "has_authorization_header": bool(str(authz).strip()), - "has_cookie_header": bool(str(cookie).strip()), - "header_keys": safe_header_keys, - "bearer_subject_id": str(bearer_identity.get("subject_id") or "").strip(), - "bearer_email": str(bearer_identity.get("email") or "").strip(), - "bearer_name": str(bearer_identity.get("name") or "").strip(), - } diff --git a/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py b/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py index 3c2e821..2bb5f45 100644 --- a/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py +++ b/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py @@ -654,7 +654,7 @@ def _inject_recent_evaluator_jobs_styles() -> None: .evj-top { justify-content: space-between; } .evj-row { display: grid; - 
grid-template-columns: minmax(180px, 1.3fr) minmax(86px, 0.5fr) minmax(108px, 0.7fr) minmax(180px, 1.15fr) minmax(130px, 0.9fr) minmax(180px, 1.1fr); + grid-template-columns: minmax(180px, 1.25fr) minmax(86px, 0.48fr) minmax(172px, 0.95fr) minmax(170px, 1.05fr) minmax(120px, 0.8fr) minmax(160px, 1fr); gap: 8px; align-items: center; } @@ -966,7 +966,7 @@ def _render_recent_evaluator_job_card(job: Dict[str, Any], *, user_label: str = {status_mark}{status}
- {scheduled}
{duration} · {created_label} + {scheduled} ({created_label})
{duration}
{catalog_html}
{source_html} @@ -1271,7 +1271,7 @@ def _render_recent_evaluator_jobs_section( limit = int( st.selectbox( "Rows", - options=[6, 12, 20, 30], + options=[10, 20, 50, 100], index=1, key="recent_eval_jobs_limit", help="How many recent evaluator jobs to fetch for this project.", @@ -1309,19 +1309,7 @@ def _render_recent_evaluator_jobs_section( label_visibility="collapsed", placeholder="Type to search evaluator jobs", ).strip() - recent_candidates = _get_recent_job_search_history(search_scope) selected_user_name = "" - if recent_candidates: - recent_choice = st.selectbox( - "Recent searches", - options=[""] + recent_candidates, - index=0, - key=f"recent_eval_jobs_search_recent::{search_scope}", - help="Reuse a previously entered search for this field.", - ) - if recent_choice and recent_choice != search_text: - st.session_state["recent_eval_jobs_search_text"] = recent_choice - st.rerun() user_candidates = sorted( { info.get("name", "").strip() @@ -1534,7 +1522,7 @@ def _render_job_list() -> None: _fetch_evaluator_job_detail.clear() st.rerun() with action_cols[1]: - if st.button("Run", key=f"recent_eval_run_{job['job_id']}", use_container_width=True): + if st.button("Start", key=f"recent_eval_run_{job['job_id']}", use_container_width=True): st.session_state["recent_eval_jobs_run_selected"] = str(job["job_id"]) _fetch_evaluator_job_detail.clear() st.rerun() diff --git a/evaluation_dashboard_app/lib/ui/task_history.py b/evaluation_dashboard_app/lib/ui/task_history.py index a0c2136..100df7b 100644 --- a/evaluation_dashboard_app/lib/ui/task_history.py +++ b/evaluation_dashboard_app/lib/ui/task_history.py @@ -243,9 +243,6 @@ def render_task_list(tasks: List[Dict[str, Any]], current_user: Optional[str]) - for t in active: _render_one_task_row(t, current_user, use_dialog, mode="active_compact") - if not active: - st.caption("No queued or running jobs.") - if history: with st.expander(f"Task history ({len(history)})", expanded=False): for t in history: diff --git 
a/evaluation_dashboard_app/pages/6_Download.py b/evaluation_dashboard_app/pages/6_Download.py index 3b366f4..c34f531 100644 --- a/evaluation_dashboard_app/pages/6_Download.py +++ b/evaluation_dashboard_app/pages/6_Download.py @@ -39,7 +39,9 @@ def _to_jst(dt: Any) -> Optional[datetime]: from lib.user_config import UserConfig from lib.path_utils import get_data_root, resolve_under_data_root, to_data_relative from lib.eval_summary import find_eval_result_dirs, run_eval_result_for_dir, generate_summary_and_score_csv -from lib.page_chrome import inject_app_page_styles +from lib.page_chrome import ( + inject_app_page_styles, +) from lib.ui.download_ui import ( ImpressiveProgressHUD, render_detailed_scenario_download_panel, diff --git a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py b/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py index 77bdd98..05f7a9d 100644 --- a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py +++ b/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py @@ -20,7 +20,11 @@ import requests from lib.db import create_task, is_task_queue_enabled, list_recent_tasks, update_task_rq_job_id -from lib.page_chrome import inject_app_page_styles, render_page_hero, section_header +from lib.page_chrome import ( + inject_app_page_styles, + render_page_hero, + section_header, +) from lib.path_utils import ( format_size, get_data_root_display, @@ -159,6 +163,40 @@ def _make_default_output_path(branch_name: str) -> str: return f"eval_{clean_branch}_{ts}" +def _catalog_preset_emoji(preset_name: str, *, has_custom_catalog: bool = False) -> str: + mapping = { + "Build Test Catalog": "🛠️", + "Performance Test": "📈", + "Old performance test": "🕰️", + "Devops Test": "⚙️", + "Usecase Performance Catalog": "🧭", + "L4 regression test": "⚠️", + } + normalized = str(preset_name or "").strip() + if normalized in mapping: + return mapping[normalized] + if has_custom_catalog: + return "🧩" + return "📦" + + +def _make_auto_workflow_description( + target_name: 
str, + preset_name: str = "", + *, + has_custom_catalog: bool = False, +) -> str: + import re + + clean_target = str(target_name or "").strip() or "default" + clean_target = re.sub(r"\s+", " ", clean_target) + stamp = datetime.now().strftime("%m-%d %H:%M") + return ( + f"🚀 evaluator workflow [{clean_target}] [{stamp}] " + f"{_catalog_preset_emoji(preset_name, has_custom_catalog=has_custom_catalog)}" + ) + + def _format_run_mtime(mtime: float) -> str: if not mtime: return "—" @@ -168,13 +206,37 @@ def _format_run_mtime(mtime: float) -> str: return "—" -def _build_overview_url(run_a: str, run_b: Optional[str] = None) -> str: - query = {"mode": "compare" if run_b else "single", "run_a": run_a} - if run_b: - query["run_b"] = run_b +def _build_overview_url(run_a: str, compare_runs: Optional[List[str]] = None) -> str: + query = {"mode": "single", "run_a": run_a} + valid_compare_runs = [str(name).strip() for name in (compare_runs or []) if str(name).strip()] + if valid_compare_runs: + query["mode"] = "compare" + for idx, run_name in enumerate(valid_compare_runs[:4]): + query[f"run_{chr(98 + idx)}"] = run_name return f"/?{urllib.parse.urlencode(query)}" +@st.cache_data(ttl=15, show_spinner=False) +def _load_local_runs() -> List[Dict[str, object]]: + runs: List[Dict[str, object]] = [] + for run_path in list_run_directories(): + info = get_run_info(run_path) + runs.append( + { + "name": info["name"], + "path_display": f"{get_data_root_display()}/{info['name']}", + "size": format_size(info["size_bytes"]), + "mtime": float(info["mtime"] or 0), + "modified": _format_run_mtime(info["mtime"]), + "has_summary": bool(info["has_summary"]), + "has_score": bool(info["has_score"]), + "has_parquet": bool(info["has_parquet"]), + } + ) + runs.sort(key=lambda row: (-float(row["mtime"]), str(row["name"]).lower())) + return runs + + @st.cache_data(ttl=24 * 3600, show_spinner=False) def _resolve_subject_name(subject_id: str, environment: str) -> Dict[str, str]: subject = str(subject_id or 
"").strip() @@ -250,59 +312,46 @@ def _inject_workflow_page_styles() -> None: display: block; margin-top: 0.35rem; } - .wf-run-card { - border: 1px solid rgba(148, 163, 184, 0.2); - background: linear-gradient(135deg, #ffffff 0%, #f8fafc 100%); - border-radius: 16px; - padding: 0.95rem 1rem; - } - .wf-run-row { - display: grid; - grid-template-columns: minmax(0, 2.3fr) minmax(110px, 0.95fr) minmax(92px, 0.85fr) minmax(0, 1.1fr); - gap: 0.95rem; - align-items: center; - } .wf-run-name { min-width: 0; } .wf-run-title { - font-size: 0.96rem; - line-height: 1.25; + font-size: 0.8rem; + line-height: 1.2; font-weight: 700; color: #0f172a; white-space: nowrap; overflow: hidden; text-overflow: ellipsis; } - .wf-run-sub { - margin-top: 0.18rem; - color: #64748b; - font-size: 0.78rem; - white-space: nowrap; - overflow: hidden; - text-overflow: ellipsis; + .wf-run-title a { + color: inherit; + text-decoration: none; + } + .wf-run-title a:hover { + text-decoration: underline; } .wf-run-cell { min-width: 0; color: #0f172a; - font-size: 0.85rem; - line-height: 1.35; + font-size: 0.78rem; + line-height: 1.15; } - .wf-run-cell strong { - display: block; - font-size: 0.88rem; + .wf-run-text { + padding-top: 0.26rem; } .wf-run-flags { display: flex; flex-wrap: wrap; - gap: 0.36rem; + gap: 0.24rem; + padding-top: 0.18rem; } .wf-flag { display: inline-flex; align-items: center; - padding: 0.22rem 0.5rem; + padding: 0.12rem 0.38rem; border-radius: 999px; - font-size: 0.72rem; + font-size: 0.68rem; font-weight: 700; letter-spacing: 0.02em; background: #e2e8f0; @@ -315,46 +364,61 @@ def _inject_workflow_page_styles() -> None: .wf-compare-bar { border: 1px solid rgba(148, 163, 184, 0.24); background: linear-gradient(135deg, #f8fafc 0%, #ecfeff 100%); - border-radius: 16px; - padding: 0.95rem 1rem; - margin: 0.45rem 0 0.75rem 0; + border-radius: 12px; + padding: 0.7rem 0.85rem; + margin: 0.35rem 0 0.55rem 0; } .wf-compare-title { margin: 0; - font-size: 0.84rem; + font-size: 0.8rem; 
font-weight: 800; color: #0f172a; letter-spacing: 0.01em; } - .wf-start-note { + [class*="st-key-workflow_compare_pick__"] label[data-testid="stWidgetLabel"] { + display: none; + } + [class*="st-key-workflow_compare_pick__"] div[data-testid="stCheckbox"] { + display: flex; + justify-content: center; + padding-top: 0.1rem; + } + [class*="st-key-workflow_compare_pick__"] input[type="checkbox"] { + transform: scale(1.2); + } + [class*="st-key-workflow_runs_page_select"] div[data-baseweb="select"] { + min-height: 2rem; + } + .wf-launcher { border: 1px solid rgba(20, 184, 166, 0.22); background: linear-gradient(135deg, #f0fdfa 0%, #ffffff 100%); - border-radius: 16px; - padding: 1rem; - min-height: 100%; + border-radius: 14px; + padding: 0.85rem 1rem; + margin-bottom: 0.8rem; } - .wf-start-note strong { + .wf-launcher-title { + margin: 0; + font-size: 0.95rem; + font-weight: 800; color: #0f172a; } - .wf-start-note p { + .wf-launcher-copy { + margin: 0.25rem 0 0 0; + font-size: 0.84rem; + color: #475569; + } + .wf-launcher-meta { + margin-top: 0.55rem; + font-size: 0.78rem; color: #475569; - font-size: 0.9rem; - line-height: 1.55; - margin: 0.45rem 0 0 0; } .wf-empty { border: 1px dashed rgba(148, 163, 184, 0.45); - border-radius: 16px; + border-radius: 12px; background: rgba(248, 250, 252, 0.8); - padding: 1rem; + padding: 0.8rem 0.9rem; color: #475569; - font-size: 0.9rem; - } - @media (max-width: 1080px) { - .wf-run-row { - grid-template-columns: 1fr; - gap: 0.55rem; - } + font-size: 0.84rem; } """, @@ -362,9 +426,18 @@ def _inject_workflow_page_styles() -> None: ) -def _render_local_run_card(run: Dict[str, object]) -> None: - name = html.escape(str(run["name"])) - rel_path = html.escape(str(run["path_display"])) +def _render_local_runs_header() -> None: + header_cols = st.columns([0.7, 2.45, 1.35, 0.95, 1.55], gap="small") + header_cols[0].markdown('
Pick
', unsafe_allow_html=True) + header_cols[1].markdown('
Name
', unsafe_allow_html=True) + header_cols[2].markdown('
Updated
', unsafe_allow_html=True) + header_cols[3].markdown('
Size
', unsafe_allow_html=True) + header_cols[4].markdown('
Files
', unsafe_allow_html=True) + + +def _render_local_run_row(run: Dict[str, object], *, selected: bool) -> bool: + name_raw = str(run["name"]) + name = html.escape(name_raw) modified = html.escape(str(run["modified"])) size = html.escape(str(run["size"])) flags = [ @@ -376,60 +449,34 @@ def _render_local_run_card(run: Dict[str, object]) -> None: f'{label}' for label, enabled in flags ) - st.markdown( - f""" -
-
-
-
{name}
-
{rel_path}
-
-
- {modified} - last updated -
-
- {size} - disk usage -
-
-
{flag_html}
-
-
-
- """, - unsafe_allow_html=True, - ) + checkbox_key = f"workflow_compare_pick::{name_raw}" + if checkbox_key not in st.session_state: + st.session_state[checkbox_key] = bool(selected) + row_cols = st.columns([0.7, 2.45, 1.35, 0.95, 1.55], gap="small") + with row_cols[0]: + checked = st.checkbox("Select run", key=checkbox_key, label_visibility="collapsed") + with row_cols[1]: + st.markdown( + f'', + unsafe_allow_html=True, + ) + with row_cols[2]: + st.markdown(f'
{modified}
', unsafe_allow_html=True) + with row_cols[3]: + st.markdown(f'
{size}
', unsafe_allow_html=True) + with row_cols[4]: + st.markdown(f'
{flag_html}
', unsafe_allow_html=True) + return bool(checked) def _render_local_runs_section() -> None: - section_header( - "Local Runs", - f"Finished runs already stored under `{get_data_root_display()}/`. Search, browse, and pick two to compare.", - ) - run_dirs = list_run_directories() - if not run_dirs: + section_header("Local Runs", "") + runs = _load_local_runs() + if not runs: st.markdown('
No finished runs were found on this server yet.
', unsafe_allow_html=True) return - runs: List[Dict[str, object]] = [] - for run_path in run_dirs: - info = get_run_info(run_path) - runs.append( - { - "name": info["name"], - "path_display": f"{get_data_root_display()}/{info['name']}", - "size": format_size(info["size_bytes"]), - "mtime": float(info["mtime"] or 0), - "modified": _format_run_mtime(info["mtime"]), - "has_summary": bool(info["has_summary"]), - "has_score": bool(info["has_score"]), - "has_parquet": bool(info["has_parquet"]), - } - ) - runs.sort(key=lambda row: (-float(row["mtime"]), str(row["name"]).lower())) - - control_cols = st.columns([1.7, 0.8, 0.8, 0.55]) + control_cols = st.columns([2.2, 0.8, 0.8, 0.7]) with control_cols[0]: st.markdown('
Search
', unsafe_allow_html=True) run_search = st.text_input( @@ -441,94 +488,43 @@ def _render_local_runs_section() -> None: ).strip().lower() with control_cols[1]: st.markdown('
Summary
', unsafe_allow_html=True) - require_summary = st.selectbox( - "Require summary", - options=["Any", "Yes", "No"], - index=0, + require_summary = st.toggle( + "Summary only", key="workflow_runs_summary_filter", label_visibility="collapsed", ) with control_cols[2]: st.markdown('
Parquet
', unsafe_allow_html=True) - require_parquet = st.selectbox( - "Require parquet", - options=["Any", "Yes", "No"], - index=0, + require_parquet = st.toggle( + "Parquet only", key="workflow_runs_parquet_filter", label_visibility="collapsed", ) with control_cols[3]: st.markdown('
Rows
', unsafe_allow_html=True) - page_size = int( - st.selectbox( - "Rows per page", - options=[6, 10, 14, 20], - index=1, - key="workflow_runs_page_size", - label_visibility="collapsed", - ) - ) + page_size = int(st.selectbox("Rows", options=[10, 20, 50, 100], index=0, key="workflow_runs_page_size", label_visibility="collapsed")) filtered = runs if run_search: filtered = [row for row in filtered if run_search in str(row["name"]).lower()] - if require_summary != "Any": - want = require_summary == "Yes" - filtered = [row for row in filtered if bool(row["has_summary"]) == want] - if require_parquet != "Any": - want = require_parquet == "Yes" - filtered = [row for row in filtered if bool(row["has_parquet"]) == want] - - run_names = [str(row["name"]) for row in filtered] + if require_summary: + filtered = [row for row in filtered if bool(row["has_summary"])] + if require_parquet: + filtered = [row for row in filtered if bool(row["has_parquet"])] + compare_ready = [ str(row["name"]) for row in filtered if bool(row["has_summary"]) or bool(row["has_score"]) or bool(row["has_parquet"]) ] - if "workflow_compare_run_a" not in st.session_state: - st.session_state["workflow_compare_run_a"] = compare_ready[0] if compare_ready else "" - if "workflow_compare_run_b" not in st.session_state: - st.session_state["workflow_compare_run_b"] = compare_ready[1] if len(compare_ready) > 1 else "" + if "workflow_compare_runs" not in st.session_state: + st.session_state["workflow_compare_runs"] = compare_ready[:1] - st.markdown('
', unsafe_allow_html=True) - st.markdown('

Quick compare tray

', unsafe_allow_html=True) - compare_cols = st.columns([1.35, 1.35, 0.95, 0.95]) - with compare_cols[0]: - st.markdown('
Baseline A
', unsafe_allow_html=True) - run_a = st.selectbox( - "Baseline A", - options=[""] + compare_ready, - index=([""] + compare_ready).index(st.session_state.get("workflow_compare_run_a", "")) - if st.session_state.get("workflow_compare_run_a", "") in compare_ready - else 0, - key="workflow_compare_run_a", - label_visibility="collapsed", - ) - with compare_cols[1]: - st.markdown('
Candidate B
', unsafe_allow_html=True) - run_b_options = [""] + [name for name in compare_ready if name != run_a] - current_b = st.session_state.get("workflow_compare_run_b", "") - st.selectbox( - "Candidate B", - options=run_b_options, - index=run_b_options.index(current_b) if current_b in run_b_options else 0, - key="workflow_compare_run_b", - label_visibility="collapsed", - ) - run_b = st.session_state.get("workflow_compare_run_b", "") - with compare_cols[2]: - st.markdown('
Single
', unsafe_allow_html=True) - if run_a: - st.link_button("Open run", _build_overview_url(run_a), use_container_width=True) - else: - st.button("Open run", disabled=True, use_container_width=True, key="workflow_open_run_disabled") - with compare_cols[3]: - st.markdown('
Compare
', unsafe_allow_html=True) - if run_a and run_b: - st.link_button("Compare", _build_overview_url(run_a, run_b), use_container_width=True) - else: - st.button("Compare", disabled=True, use_container_width=True, key="workflow_compare_run_disabled") - st.markdown("
", unsafe_allow_html=True) + compare_selected = [ + name for name in st.session_state.get("workflow_compare_runs", []) + if name in compare_ready + ] + st.session_state["workflow_compare_runs"] = compare_selected if not filtered: st.markdown('
No local runs matched the current filters.
', unsafe_allow_html=True) @@ -542,61 +538,68 @@ def _render_local_runs_section() -> None: st.session_state[page_key] = current_page start_idx = (current_page - 1) * page_size visible_runs = filtered[start_idx:start_idx + page_size] + visible_names = {str(run["name"]) for run in visible_runs} + next_selected = [name for name in compare_selected if name not in visible_names] + for run in visible_runs: + run_name = str(run["name"]) + checkbox_key = f"workflow_compare_pick::{run_name}" + is_checked = bool(st.session_state.get(checkbox_key, run_name in compare_selected)) + if is_checked and run_name in compare_ready: + next_selected.append(run_name) + st.session_state["workflow_compare_runs"] = [name for name in compare_ready if name in next_selected] - pager_cols = st.columns([0.7, 0.8, 0.8, 0.8, 5.9]) + st.markdown('
', unsafe_allow_html=True) + st.markdown('

Compare

', unsafe_allow_html=True) + compare_cols = st.columns([3.4, 1.0]) + with compare_cols[0]: + st.markdown('
Selected runs
', unsafe_allow_html=True) + selected_runs = list(st.session_state.get("workflow_compare_runs", [])) + if selected_runs: + st.caption(" | ".join(selected_runs)) + with compare_cols[1]: + st.markdown('
Action
', unsafe_allow_html=True) + if len(selected_runs) >= 2: + st.link_button("Compare", _build_overview_url(selected_runs[0], selected_runs[1:]), use_container_width=True) + elif len(selected_runs) == 1: + st.link_button("Open", _build_overview_url(selected_runs[0]), use_container_width=True) + else: + st.button("Open", disabled=True, use_container_width=True, key="workflow_compare_run_disabled") + st.markdown("
", unsafe_allow_html=True) + + pager_cols = st.columns([0.9, 1.2, 4.1]) with pager_cols[0]: - if st.button("‹", key="workflow_runs_prev", use_container_width=True, disabled=current_page <= 1): - st.session_state[page_key] = current_page - 1 - st.rerun() - page_numbers = ( - list(range(1, min(3, page_count) + 1)) - if current_page == 1 - else list(range(max(1, current_page - 1), min(page_count, current_page + 1) + 1)) - ) - for idx, page_num in enumerate(page_numbers[:3], start=1): - with pager_cols[idx]: - if st.button( - str(page_num), - key=f"workflow_runs_page_{page_num}", - use_container_width=True, - disabled=page_num == current_page, - ): - st.session_state[page_key] = page_num - st.rerun() - with pager_cols[4]: - if st.button("›", key="workflow_runs_next", use_container_width=True, disabled=current_page >= page_count): - st.session_state[page_key] = current_page + 1 - st.rerun() + st.markdown('
Page
', unsafe_allow_html=True) + selected_page = st.selectbox( + "Page", + options=list(range(1, page_count + 1)), + index=max(0, current_page - 1), + key="workflow_runs_page_select", + label_visibility="collapsed", + ) + if selected_page != current_page: + st.session_state[page_key] = int(selected_page) + current_page = int(selected_page) + start_idx = (current_page - 1) * page_size + visible_runs = filtered[start_idx:start_idx + page_size] + visible_names = {str(run["name"]) for run in visible_runs} + with pager_cols[1]: + st.markdown('
Rows
', unsafe_allow_html=True) + st.caption(str(len(visible_runs))) + with pager_cols[2]: + st.markdown('
Total
', unsafe_allow_html=True) + st.caption(f"{len(filtered)} runs") - st.markdown('
', unsafe_allow_html=True) + _render_local_runs_header() + next_selected = [name for name in st.session_state.get("workflow_compare_runs", []) if name not in visible_names] for run in visible_runs: - row_cols = st.columns([8.9, 2.6]) - with row_cols[0]: - _render_local_run_card(run) - with row_cols[1]: - action_cols = st.columns([1.0, 1.0, 1.0], gap="small") - with action_cols[0]: - st.link_button( - "Open", - _build_overview_url(str(run["name"])), - use_container_width=True, - ) - with action_cols[1]: - if st.button(f"A", key=f"workflow_pick_a_{run['name']}", use_container_width=True): - st.session_state["workflow_compare_run_a"] = str(run["name"]) - st.rerun() - with action_cols[2]: - if st.button(f"B", key=f"workflow_pick_b_{run['name']}", use_container_width=True): - st.session_state["workflow_compare_run_b"] = str(run["name"]) - st.rerun() - st.markdown("
", unsafe_allow_html=True) + run_name = str(run["name"]) + if _render_local_run_row(run, selected=run_name in st.session_state.get("workflow_compare_runs", [])) and run_name in compare_ready: + next_selected.append(run_name) + st.session_state["workflow_compare_runs"] = [name for name in compare_ready if name in next_selected] def _render_current_tasks_section() -> None: - section_header( - "Current Tasks", - "Jobs queued or running on this server, with recent history folded underneath.", - ) + section_header("Current Tasks", "") if not is_task_queue_enabled(): st.info("Task queue not enabled. Set `USE_TASK_QUEUE=true` to track background tasks.") return @@ -632,16 +635,30 @@ def _task_list_poll(): st.caption("Active jobs are shown live when possible. Use refresh if this browser does not support fragments.") -def _render_start_workflow_section( +def _get_start_workflow_defaults() -> Dict[str, object]: + default_target = get_config_value("target_name", "beta/v4.3.2") + saved_output = str(get_config_value("eval_output_path", "") or "").strip() + default_output = saved_output if saved_output and saved_output != "evaluator_run" else _make_default_output_path(default_target) + return { + "project_id": get_config_value("eval_project_id", "x2_dev"), + "environment": get_config_value("environment", ""), + "output_path_default": default_output, + "download_type_default": get_config_value("eval_download_type", "Archives (ZIP)"), + "phase_default": get_config_value( + "eval_phase", + "perception.object_recognition.tracking.objects", + ), + "skip_large_file_default": True, + "large_file_mb_default": 50.0, + "keep_zip_files_default": False, + } + + +def _render_start_workflow_form( catalog_presets: List[Dict[str, str]], catalogs_path: Optional[str], catalog_load_error: Optional[str], ) -> Dict[str, object]: - section_header( - "Start Workflow", - "Schedule a fresh evaluator run here, or use the recent evaluator jobs browser below to run Download + Eval from an existing 
report.", - ) - if catalog_load_error: st.warning(f"Could not read catalog presets: {catalog_load_error}") elif catalogs_path: @@ -658,7 +675,8 @@ def _render_start_workflow_section( default_poll_interval = int(get_config_value("poll_interval", 60)) default_max_wait_hours = int(get_config_value("max_wait_hours", 24)) default_environment = get_config_value("environment", "") - default_output = get_config_value("eval_output_path", _make_default_output_path(default_target)) + saved_output = str(get_config_value("eval_output_path", "") or "").strip() + default_output = saved_output if saved_output and saved_output != "evaluator_run" else _make_default_output_path(default_target) top_cols = st.columns([1.0, 1.5, 1.2]) with top_cols[0]: @@ -670,18 +688,25 @@ def _render_start_workflow_section( label_visibility="collapsed", ).strip() with top_cols[1]: - st.markdown('
Catalog
', unsafe_allow_html=True) + st.markdown('
Catalog preset
', unsafe_allow_html=True) selected_catalog_name = st.selectbox( - "Catalog", - options=catalog_names if catalog_names else ["No catalog presets"], + "Catalog preset", + options=[""] + catalog_names if catalog_names else [""], index=0, key="workflow_catalog_name", label_visibility="collapsed", + format_func=lambda value: value or "Optional preset", ) selected_catalog = next( (item for item in catalog_presets if item["display_name"] == selected_catalog_name), None, ) + if "workflow_last_catalog_preset" not in st.session_state: + st.session_state["workflow_last_catalog_preset"] = "" + if st.session_state["workflow_last_catalog_preset"] != selected_catalog_name and selected_catalog: + st.session_state["workflow_catalog_id"] = str(selected_catalog.get("catalog_id") or "") + st.session_state["workflow_integration_id"] = str(selected_catalog.get("integration_id") or "") + st.session_state["workflow_last_catalog_preset"] = selected_catalog_name with top_cols[2]: st.markdown('
Branch or tag
', unsafe_allow_html=True) target_name = st.text_input( @@ -692,8 +717,26 @@ def _render_start_workflow_section( placeholder="beta/v4.3.2", ).strip() - detail_cols = st.columns([1.25, 0.8, 0.95]) + detail_cols = st.columns([1.05, 1.05, 1.2, 0.8, 0.95]) with detail_cols[0]: + st.markdown('
Catalog ID
', unsafe_allow_html=True) + catalog_id = st.text_input( + "Catalog ID", + value=str(get_config_value("workflow_catalog_id", "") or ""), + key="workflow_catalog_id", + label_visibility="collapsed", + placeholder="vehicle catalog id", + ).strip() + with detail_cols[1]: + st.markdown('
Integration ID
', unsafe_allow_html=True) + integration_id = st.text_input( + "Integration ID", + value=str(get_config_value("workflow_integration_id", "") or ""), + key="workflow_integration_id", + label_visibility="collapsed", + placeholder="integration id", + ).strip() + with detail_cols[2]: st.markdown('
Output folder
', unsafe_allow_html=True) output_path = st.text_input( "Output folder", @@ -702,7 +745,7 @@ def _render_start_workflow_section( label_visibility="collapsed", placeholder=_make_default_output_path(target_name), ).strip() - with detail_cols[1]: + with detail_cols[3]: st.markdown('
Environment
', unsafe_allow_html=True) environment = st.selectbox( "Environment", @@ -712,7 +755,7 @@ def _render_start_workflow_section( label_visibility="collapsed", format_func=lambda value: value or "default", ) - with detail_cols[2]: + with detail_cols[4]: st.markdown('
Description
', unsafe_allow_html=True) description = st.text_input( "Description", @@ -723,14 +766,8 @@ def _render_start_workflow_section( ).strip() if selected_catalog: - info_cols = st.columns([1.2, 1.15, 2.2]) - with info_cols[0]: - st.markdown(f'
Catalog ID
{html.escape(str(selected_catalog.get("catalog_id", "—")))}
', unsafe_allow_html=True) - with info_cols[1]: - st.markdown(f'
Integration
{html.escape(str(selected_catalog.get("integration_id", "—")))}
', unsafe_allow_html=True) - with info_cols[2]: - desc = str(selected_catalog.get("description") or "").strip() or "Preset selected for quick scheduling." - st.markdown(f'
Preset
{html.escape(desc)}
', unsafe_allow_html=True) + desc = str(selected_catalog.get("description") or "").strip() or "Preset selected for quick scheduling." + st.caption(f"Preset: {desc}") with st.expander("Advanced options", expanded=False): adv_cols = st.columns([1.0, 1.2, 0.8, 0.8]) @@ -782,32 +819,6 @@ def _render_start_workflow_section( with option_cols[3]: is_tag = st.checkbox("Target is tag", value=False, key="workflow_is_tag") - start_cols = st.columns([1.45, 0.95]) - with start_cols[0]: - st.markdown('
', unsafe_allow_html=True) - st.markdown('

Schedule evaluator + download + eval

', unsafe_allow_html=True) - st.markdown( - '

This starts the same background pipeline as the previous workflow launcher, but keeps the controls on-page. Output path, evaluator polling, and eval/parquet behavior are all preserved.

', - unsafe_allow_html=True, - ) - start_clicked = st.button( - "Start evaluator workflow", - key="workflow_start_btn", - type="primary", - use_container_width=True, - ) - st.markdown("
", unsafe_allow_html=True) - with start_cols[1]: - st.markdown( - """ -
- Already have an evaluator report? -

Use the recent evaluator jobs section right below. Every row can open details or run Download + Eval + Parquet directly, using the same defaults configured on this page.

-
- """, - unsafe_allow_html=True, - ) - set_config_value("eval_project_id", project_id) set_config_value("target_name", target_name) set_config_value("eval_output_path", output_path) @@ -817,9 +828,8 @@ def _render_start_workflow_section( set_config_value("max_wait_hours", max_wait_hours) set_config_value("environment", environment) set_config_value("workflow_description", description) - - catalog_id = str((selected_catalog or {}).get("catalog_id") or "").strip() - integration_id = str((selected_catalog or {}).get("integration_id") or "").strip() + set_config_value("workflow_catalog_id", catalog_id) + set_config_value("workflow_integration_id", integration_id) errors = [] if not project_id: errors.append("Project ID") @@ -839,58 +849,114 @@ def _render_start_workflow_section( else: errors.append("Output folder") - if start_clicked: - if errors: - for err in errors: - st.error(f"Missing or invalid: {err}") - elif not is_task_queue_enabled(): - st.error("Task queue not enabled. Set `USE_TASK_QUEUE=true` and `REDIS_URL`.") - else: - task_id = _enqueue_task( - "run_evaluator_and_process", - { - "project_id": project_id, - "catalog_id": catalog_id, - "integration_id": integration_id, - "suite_ids": None, - "target_name": target_name, - "description": description or f"Eval {datetime.now().strftime('%Y-%m-%d %H:%M')}", - "output_path": str(resolved_output), - "environment": environment, - "max_retries": 0, - "clean_build": False, - "debug": False, - "is_tag": is_tag, - "download_type": "archives" if download_type == "Archives (ZIP)" else "result_json", - "phase": phase, - "skip_large_file": False, - "large_file_mb": 50.0, - "keep_zip_files": False, - "poll_interval": int(poll_interval), - "max_wait_seconds": int(max_wait_hours) * 3600, - "run_eval": bool(run_eval), - "generate_parquet": bool(generate_parquet), - "eval_recursive": bool(eval_recursive), - "eval_overwrite": False, - }, - ) - if task_id: - st.success(f"Workflow queued. 
Task id: `{task_id}`") - else: - st.error("Failed to enqueue task. Check worker logs.") - return { "project_id": project_id, "environment": environment, "output_path_default": output_path or _make_default_output_path(target_name), "download_type_default": download_type, "phase_default": phase, - "skip_large_file_default": False, + "skip_large_file_default": True, "large_file_mb_default": 50.0, "keep_zip_files_default": False, + "dialog_payload": { + "errors": errors, + "project_id": project_id, + "catalog_id": catalog_id, + "integration_id": integration_id, + "catalog_preset_name": selected_catalog_name, + "has_custom_catalog": bool(catalog_id and not selected_catalog), + "target_name": target_name, + "description": description, + "resolved_output": str(resolved_output) if resolved_output else "", + "environment": environment, + "is_tag": is_tag, + "download_type": download_type, + "phase": phase, + "poll_interval": int(poll_interval), + "max_wait_hours": int(max_wait_hours), + "run_eval": bool(run_eval), + "generate_parquet": bool(generate_parquet), + "eval_recursive": bool(eval_recursive), + }, } +def _render_workflow_launcher_section( + catalog_presets: List[Dict[str, str]], + catalogs_path: Optional[str], + catalog_load_error: Optional[str], +) -> Dict[str, object]: + section_header("Run Evaluator Workflow", "") + start_defaults = _get_start_workflow_defaults() + new_job_clicked = st.button( + "Start new workflow", + key="workflow_open_start_dialog", + type="primary", + use_container_width=False, + ) + + if new_job_clicked and callable(getattr(st, "dialog", None)): + @st.dialog("Start evaluator workflow", width="large") + def _workflow_start_dialog() -> None: + st.caption("This is the full launcher for creating a new evaluator job, downloading results, and optionally running eval/parquet.") + payload = _render_start_workflow_form(catalog_presets, catalogs_path, catalog_load_error) + submit_cols = st.columns([1.15, 1.15, 3.7]) + close_clicked = 
submit_cols[0].button("Close", key="workflow_close_start_dialog", use_container_width=True) + start_clicked = submit_cols[1].button("Start workflow", key="workflow_start_btn_dialog", type="primary", use_container_width=True) + if close_clicked: + st.rerun() + if start_clicked: + dialog_payload = dict(payload.get("dialog_payload") or {}) + errors = dialog_payload.get("errors", []) + if errors: + for err in errors: + st.error(f"Missing or invalid: {err}") + elif not is_task_queue_enabled(): + st.error("Task queue not enabled. Set `USE_TASK_QUEUE=true` and `REDIS_URL`.") + else: + task_id = _enqueue_task( + "run_evaluator_and_process", + { + "project_id": dialog_payload["project_id"], + "catalog_id": dialog_payload["catalog_id"], + "integration_id": dialog_payload["integration_id"], + "suite_ids": None, + "target_name": dialog_payload["target_name"], + "description": dialog_payload["description"] or _make_auto_workflow_description( + dialog_payload["target_name"], + dialog_payload.get("catalog_preset_name", ""), + has_custom_catalog=bool(dialog_payload.get("has_custom_catalog", False)), + ), + "output_path": dialog_payload["resolved_output"], + "environment": dialog_payload["environment"], + "max_retries": 0, + "clean_build": False, + "debug": False, + "is_tag": dialog_payload["is_tag"], + "download_type": "archives" if dialog_payload["download_type"] == "Archives (ZIP)" else "result_json", + "phase": dialog_payload["phase"], + "skip_large_file": False, + "large_file_mb": 50.0, + "keep_zip_files": False, + "poll_interval": dialog_payload["poll_interval"], + "max_wait_seconds": dialog_payload["max_wait_hours"] * 3600, + "run_eval": dialog_payload["run_eval"], + "generate_parquet": dialog_payload["generate_parquet"], + "eval_recursive": dialog_payload["eval_recursive"], + "eval_overwrite": False, + }, + ) + if task_id: + st.success(f"Workflow queued. Task id: `{task_id}`") + st.rerun() + else: + st.error("Failed to enqueue task. 
Check worker logs.") + + _workflow_start_dialog() + + return start_defaults + + _inject_workflow_page_styles() render_page_hero( kicker="Workflow automation", @@ -900,32 +966,33 @@ def _render_start_workflow_section( catalog_presets, catalogs_path, catalog_load_error = _load_catalog_presets() -_render_local_runs_section() -_render_current_tasks_section() -start_defaults = _render_start_workflow_section(catalog_presets, catalogs_path, catalog_load_error) +tab_tasks, tab_local = st.tabs(["Run Tasks", "Local Runs"]) -configure_recent_evaluator_jobs_ui( - get_config_value=get_config_value, - set_config_value=set_config_value, - enqueue_task=_enqueue_task, - catalog_io_available=CATALOG_IO_AVAILABLE, - environment=str(start_defaults["environment"] or ""), -) +with tab_tasks: + _render_current_tasks_section() + start_defaults = _render_workflow_launcher_section(catalog_presets, catalogs_path, catalog_load_error) -section_header( - "Recent Evaluator Jobs", - "Direct evaluator browser for starting Download + Eval from existing reports. 
Shown by default here so the existing-job path is one click away.", -) -_render_recent_evaluator_jobs_section( - str(start_defaults["project_id"] or ""), - str(start_defaults["environment"] or ""), - output_path_default=str(start_defaults["output_path_default"]), - download_type_default=str(start_defaults["download_type_default"]), - phase_default=str(start_defaults["phase_default"]), - skip_large_file_default=bool(start_defaults["skip_large_file_default"]), - large_file_mb_default=float(start_defaults["large_file_mb_default"]), - keep_zip_files_default=bool(start_defaults["keep_zip_files_default"]), - show_toggle=False, - default_visible=True, - show_title=False, -) + configure_recent_evaluator_jobs_ui( + get_config_value=get_config_value, + set_config_value=set_config_value, + enqueue_task=_enqueue_task, + catalog_io_available=CATALOG_IO_AVAILABLE, + environment=str(start_defaults["environment"] or ""), + ) + + _render_recent_evaluator_jobs_section( + str(start_defaults["project_id"] or ""), + str(start_defaults["environment"] or ""), + output_path_default=str(start_defaults["output_path_default"]), + download_type_default=str(start_defaults["download_type_default"]), + phase_default=str(start_defaults["phase_default"]), + skip_large_file_default=bool(start_defaults["skip_large_file_default"]), + large_file_mb_default=float(start_defaults["large_file_mb_default"]), + keep_zip_files_default=bool(start_defaults["keep_zip_files_default"]), + show_toggle=False, + default_visible=True, + show_title=False, + ) + +with tab_local: + _render_local_runs_section() From 42b6250ccd079ba53b3a739334a7b2903c46b837 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Fri, 15 May 2026 13:34:47 +0900 Subject: [PATCH 67/94] feat: improve error handling and user feedback in evaluator jobs section - Added a new function to generate user-friendly error messages for various network and service issues, enhancing the clarity of feedback when loading evaluator jobs. 
- Updated the loading spinner messages to reflect the presence of filters, providing better context during data fetching. - Implemented warnings for empty responses from the evaluator server, improving user awareness of potential issues. - Refined the display of messages for no recent evaluator jobs based on filter criteria, enhancing user experience. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../lib/ui/recent_evaluator_jobs.py | 51 +++++++++++++++---- .../pages/7_Evaluator_Workflow.py | 21 ++++---- 2 files changed, 52 insertions(+), 20 deletions(-) diff --git a/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py b/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py index 2bb5f45..65308a6 100644 --- a/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py +++ b/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py @@ -45,6 +45,20 @@ def _enqueue_task(task_type: str, params: Dict[str, Any]) -> Optional[str]: return _ENQUEUE_TASK(task_type, params) +def _friendly_request_error_message(exc: Exception) -> str: + text = str(exc or "").strip() + lowered = text.lower() + if "temporary failure in name resolution" in lowered or "failed to resolve" in lowered or "name resolution" in lowered: + return "Could not load evaluator jobs because the network appears to be unavailable." + if "auth.web.auto" in lowered or "/token" in lowered: + return "Could not load evaluator jobs because the sign-in service is currently unavailable." + if "connection refused" in lowered or "max retries exceeded" in lowered or "newconnectionerror" in lowered: + return "Could not connect to the evaluator service right now. Please try again in a moment." + if "timed out" in lowered or "timeout" in lowered: + return "Loading evaluator jobs took too long. Please try again." + return "Could not load evaluator jobs right now. Please check the network connection and try again." 
+ + def _to_jst(dt: Any) -> Optional[datetime]: if dt is None: return None @@ -1400,17 +1414,27 @@ def _render_job_list() -> None: ) for f in extra_filters ) + fetch_help = "Loading evaluator jobs..." + if search_text or status_filter or date_from or date_to or selected_user_name: + fetch_help = "Loading evaluator jobs with filters..." try: - fetched_pages = _fetch_recent_evaluator_job_pages( - project_id, - environment, - limit, - pages_to_fetch, - status_values=server_status_values, - extra_filters=extra_filter_tuples, - ) + with st.spinner(fetch_help): + fetched_pages = _fetch_recent_evaluator_job_pages( + project_id, + environment, + limit, + pages_to_fetch, + status_values=server_status_values, + extra_filters=extra_filter_tuples, + ) + except requests.Timeout: + st.error("Timed out while loading evaluator jobs. The evaluator server may be slow right now. Try Refresh.") + return + except requests.RequestException as e: + st.error(_friendly_request_error_message(e)) + return except Exception as e: - st.error(f"Could not fetch recent evaluator jobs: {e}") + st.error(_friendly_request_error_message(e)) return if search_text: _save_recent_job_search_history(search_scope, search_text) @@ -1419,6 +1443,10 @@ def _render_job_list() -> None: user_directory = _hydrate_recent_eval_user_directory(jobs, environment) has_more_from_api = bool(fetched_pages and fetched_pages[-1].get("next_token")) + if not fetched_pages: + st.warning("No response was returned from the evaluator server. Try Refresh.") + return + if search_needle: if search_scope == "Branch/tag": jobs = [job for job in jobs if search_needle in str(job.get("target", "")).lower()] @@ -1442,7 +1470,10 @@ def _render_job_list() -> None: if not jobs: st.session_state[page_key] = 1 - st.markdown('
No recent evaluator jobs matched the current filters.
', unsafe_allow_html=True) + empty_message = "No recent evaluator jobs were returned." + if search_text or status_filter or date_from or date_to or selected_user_name: + empty_message = "No recent evaluator jobs matched the current filters." + st.markdown(f'
{html.escape(empty_message)}
', unsafe_allow_html=True) return total_loaded = len(jobs) diff --git a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py b/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py index 05f7a9d..13b18b2 100644 --- a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py +++ b/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py @@ -637,12 +637,10 @@ def _task_list_poll(): def _get_start_workflow_defaults() -> Dict[str, object]: default_target = get_config_value("target_name", "beta/v4.3.2") - saved_output = str(get_config_value("eval_output_path", "") or "").strip() - default_output = saved_output if saved_output and saved_output != "evaluator_run" else _make_default_output_path(default_target) return { "project_id": get_config_value("eval_project_id", "x2_dev"), "environment": get_config_value("environment", ""), - "output_path_default": default_output, + "output_path_default": _make_default_output_path(default_target), "download_type_default": get_config_value("eval_download_type", "Archives (ZIP)"), "phase_default": get_config_value( "eval_phase", @@ -675,8 +673,7 @@ def _render_start_workflow_form( default_poll_interval = int(get_config_value("poll_interval", 60)) default_max_wait_hours = int(get_config_value("max_wait_hours", 24)) default_environment = get_config_value("environment", "") - saved_output = str(get_config_value("eval_output_path", "") or "").strip() - default_output = saved_output if saved_output and saved_output != "evaluator_run" else _make_default_output_path(default_target) + default_output = _make_default_output_path(default_target) top_cols = st.columns([1.0, 1.5, 1.2]) with top_cols[0]: @@ -722,7 +719,7 @@ def _render_start_workflow_form( st.markdown('
Catalog ID
', unsafe_allow_html=True) catalog_id = st.text_input( "Catalog ID", - value=str(get_config_value("workflow_catalog_id", "") or ""), + value="", key="workflow_catalog_id", label_visibility="collapsed", placeholder="vehicle catalog id", @@ -731,7 +728,7 @@ def _render_start_workflow_form( st.markdown('
Integration ID
', unsafe_allow_html=True) integration_id = st.text_input( "Integration ID", - value=str(get_config_value("workflow_integration_id", "") or ""), + value="", key="workflow_integration_id", label_visibility="collapsed", placeholder="integration id", @@ -821,15 +818,12 @@ def _render_start_workflow_form( set_config_value("eval_project_id", project_id) set_config_value("target_name", target_name) - set_config_value("eval_output_path", output_path) set_config_value("eval_download_type", download_type) set_config_value("eval_phase", phase) set_config_value("poll_interval", poll_interval) set_config_value("max_wait_hours", max_wait_hours) set_config_value("environment", environment) set_config_value("workflow_description", description) - set_config_value("workflow_catalog_id", catalog_id) - set_config_value("workflow_integration_id", integration_id) errors = [] if not project_id: errors.append("Project ID") @@ -896,6 +890,13 @@ def _render_workflow_launcher_section( ) if new_job_clicked and callable(getattr(st, "dialog", None)): + fresh_target = str(get_config_value("target_name", "beta/v4.3.2") or "beta/v4.3.2") + st.session_state["workflow_catalog_name"] = "" + st.session_state["workflow_last_catalog_preset"] = "" + st.session_state["workflow_catalog_id"] = "" + st.session_state["workflow_integration_id"] = "" + st.session_state["workflow_output_path"] = _make_default_output_path(fresh_target) + @st.dialog("Start evaluator workflow", width="large") def _workflow_start_dialog() -> None: st.caption("This is the full launcher for creating a new evaluator job, downloading results, and optionally running eval/parquet.") From 8339f9a00b5411e9a314ef3f2bb11cf2e18c3fba Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Fri, 15 May 2026 17:03:47 +0900 Subject: [PATCH 68/94] feat: enhance evaluator workflow with catalog integration and retest functionality - Introduced functions to load and fetch catalog presets, improving the management of available catalogs for evaluator jobs. 
- Added a new dialog for retesting evaluator jobs, allowing users to reuse build artifacts from previous jobs. - Updated the UI to include server catalog options, enhancing user experience by providing more flexible catalog selection. - Improved state management for catalog selections and integration resolution, ensuring a smoother workflow process. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/lib/evaluator_api.py | 15 +- .../lib/ui/recent_evaluator_jobs.py | 331 +++++++++++++++- .../pages/7_Evaluator_Workflow.py | 365 +++++++++++++----- evaluation_dashboard_app/worker/tasks.py | 17 +- 4 files changed, 628 insertions(+), 100 deletions(-) diff --git a/evaluation_dashboard_app/lib/evaluator_api.py b/evaluation_dashboard_app/lib/evaluator_api.py index ba2082a..726a369 100644 --- a/evaluation_dashboard_app/lib/evaluator_api.py +++ b/evaluation_dashboard_app/lib/evaluator_api.py @@ -269,8 +269,9 @@ def schedule_job( *, project_id: str, catalog_id: str, - integration_id: str, - target_name: str, + integration_id: Optional[str] = None, + target_name: Optional[str] = None, + source_job_id: Optional[str] = None, suite_ids: Optional[list[str]] = None, max_retries: int = 1, description: str = "no description", @@ -281,6 +282,8 @@ def schedule_job( log_expiration_time_in_days: float = 14.0, is_tag: bool = False, ) -> dict[str, Any]: + if not source_job_id and not target_name: + raise ValueError("Either target_name or source_job_id must be provided.") payload = { "build_options": { "clean_build": clean_build, @@ -288,9 +291,7 @@ def schedule_job( }, "catalog_id": catalog_id, "description": description, - "integration_id": integration_id, "release": release, - "source": {"git_tag" if is_tag else "git_branch": str(target_name)}, "suite_ids": suite_ids or [], "test_options": { "max_retries": max_retries, @@ -298,6 +299,12 @@ def schedule_job( "log_expiration_time": int(log_expiration_time_in_days * 24 * 60 * 60), }, } + if 
integration_id: + payload["integration_id"] = integration_id + if source_job_id: + payload["source_job_id"] = str(source_job_id) + if target_name: + payload["source"] = {"git_tag" if is_tag else "git_branch": str(target_name)} if record_caret: payload["build_options"]["developer_option_names"] = [ "webauto:ci:caret_enabled" diff --git a/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py b/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py index 65308a6..3c12854 100644 --- a/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py +++ b/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py @@ -7,6 +7,7 @@ import urllib.parse from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime, timedelta, timezone +from pathlib import Path from typing import Any, Callable, Dict, List, Optional import pandas as pd @@ -59,6 +60,68 @@ def _friendly_request_error_message(exc: Exception) -> str: return "Could not load evaluator jobs right now. Please check the network connection and try again." 
+def _load_catalog_presets() -> List[Dict[str, str]]: + """Load catalog presets from the app-level catalogs.json file if available.""" + app_root = Path(__file__).resolve().parents[2] + search_paths = [ + app_root / "catalogs.json", + Path(os.environ.get("CATALOGS_PATH", "")), + Path.cwd() / "catalogs.json", + ] + for path in search_paths: + if not path or not str(path): + continue + try: + if not path.exists() or not path.is_file(): + continue + import json + + with path.open("r", encoding="utf-8") as handle: + data = json.load(handle) + raw_catalogs = data.get("catalogs", []) if isinstance(data, dict) else data + presets: List[Dict[str, str]] = [] + for item in raw_catalogs or []: + if not isinstance(item, dict): + continue + display_name = ( + str(item.get("display_name") or item.get("name") or item.get("catalog_id") or "") + .strip() + ) + if not display_name: + continue + presets.append({**item, "display_name": display_name}) + return presets + except Exception: + continue + return [] + + +def _retest_catalog_emoji(preset_name: str, *, has_custom_catalog: bool = False) -> str: + mapping = { + "Build Test Catalog": "🛠️", + "Performance Test": "📈", + "Old performance test": "🕰️", + "Devops Test": "⚙️", + "Usecase Performance Catalog": "🧭", + "L4 regression test": "⚠️", + } + normalized = str(preset_name or "").strip() + if normalized in mapping: + return mapping[normalized] + if has_custom_catalog: + return "🧩" + return "📦" + + +def _make_retest_description(target_name: str, preset_name: str = "", *, has_custom_catalog: bool = False) -> str: + clean_target = " ".join(str(target_name or "").strip().split()) or "artifact" + stamp = datetime.now().strftime("%m-%d %H:%M") + return ( + f"♻️ evaluator artifact retest [{clean_target}] [{stamp}] " + f"{_retest_catalog_emoji(preset_name, has_custom_catalog=has_custom_catalog)}" + ) + + def _to_jst(dt: Any) -> Optional[datetime]: if dt is None: return None @@ -843,6 +906,7 @@ def _inject_recent_evaluator_jobs_styles() -> 
None: } [class*="st-key-recent_eval_view_"] button, [class*="st-key-recent_eval_run_"] button, + [class*="st-key-recent_eval_retest_"] button, [class*="st-key-recent_eval_jobs_prev"] button, [class*="st-key-recent_eval_jobs_next"] button, [class*="st-key-recent_eval_jobs_pagebtn_"] button, @@ -855,6 +919,7 @@ def _inject_recent_evaluator_jobs_styles() -> None: box-shadow: none; } [class*="st-key-recent_eval_view_"] button, + [class*="st-key-recent_eval_retest_"] button, [class*="st-key-recent_eval_jobs_prev"] button, [class*="st-key-recent_eval_jobs_next"] button, [class*="st-key-recent_eval_jobs_pagebtn_"] button, @@ -864,6 +929,7 @@ def _inject_recent_evaluator_jobs_styles() -> None: background: #ffffff; } [class*="st-key-recent_eval_view_"] button:hover, + [class*="st-key-recent_eval_retest_"] button:hover, [class*="st-key-recent_eval_jobs_prev"] button:hover, [class*="st-key-recent_eval_jobs_next"] button:hover, [class*="st-key-recent_eval_jobs_pagebtn_"] button:hover, @@ -887,6 +953,16 @@ def _inject_recent_evaluator_jobs_styles() -> None: background: linear-gradient(180deg, #ccfbf1, #ecfeff); color: #115e59; } + [class*="st-key-recent_eval_retest_"] button { + border-color: rgba(251, 191, 36, 0.22); + background: linear-gradient(180deg, #fffbeb, #fff7ed); + color: #b45309; + } + [class*="st-key-recent_eval_retest_"] button:hover { + border-color: rgba(245, 158, 11, 0.34); + background: linear-gradient(180deg, #fef3c7, #fff7ed); + color: #92400e; + } .evj-stat-label { display: block; font-size: 0.68rem; @@ -1242,6 +1318,209 @@ def _render_recent_evaluator_job_run_dialog( st.rerun() +def _render_recent_evaluator_job_retest_dialog( + project_id: str, + environment: str, + job: Dict[str, Any], + *, + output_path_default: str, + phase_default: str, +) -> None: + """Render a compact workflow launcher that reuses build artifacts from a prior evaluator job.""" + job_id = str(job.get("job_id", "") or "") + if not job_id: + st.error("Missing evaluator job id.") + 
return + + detail = _fetch_evaluator_job_detail(project_id, environment, job_id) + raw_report = detail.get("raw_report") or {} + raw_catalog = raw_report.get("catalog") or {} + suite_options = _extract_suite_selection_options(detail.get("suite_rows") or []) + suite_label_to_id = {opt["label"]: opt["id"] for opt in suite_options} + suite_labels = [opt["label"] for opt in suite_options] + preset_entries = _load_catalog_presets() + preset_names = [str(entry.get("display_name") or "").strip() for entry in preset_entries if str(entry.get("display_name") or "").strip()] + preset_by_name = {str(entry.get("display_name") or "").strip(): entry for entry in preset_entries} + + original_catalog_name = str(raw_catalog.get("display_name") or detail.get("catalog") or "").strip() + original_catalog_id = str(raw_catalog.get("id") or "").strip() + default_preset_name = original_catalog_name if original_catalog_name in preset_by_name else "" + + import re + + default_output_path = output_path_default + if not default_output_path: + clean_target = re.sub(r"[^\w]+", "_", str(detail.get("target") or job_id).strip()).strip("_") or "artifact" + default_output_path = f"retest_{clean_target}_{datetime.now().strftime('%Y%m%d_%H%M%S')}" + + st.caption("Schedule a new evaluator workflow that reuses build artifacts from this job, then download and process the new results.") + summary_cols = st.columns([1.35, 1.0, 1.25, 1.2]) + summary_cols[0].markdown(f"**Source job** \n`{job_id}`") + summary_cols[1].markdown(f"**Ref** \n`{detail.get('target', '—')}`") + summary_cols[2].markdown(f"**Original catalog** \n`{original_catalog_name or '—'}`") + summary_cols[3].markdown(f"**Suites found** \n`{len(suite_labels)}`") + + preset_key = f"recent_eval_retest_catalog_preset_{job_id}" + last_preset_key = f"recent_eval_retest_last_catalog_preset_{job_id}" + catalog_id_key = f"recent_eval_retest_catalog_id_{job_id}" + if preset_key not in st.session_state: + st.session_state[preset_key] = default_preset_name + 
if last_preset_key not in st.session_state: + st.session_state[last_preset_key] = "" + if catalog_id_key not in st.session_state: + st.session_state[catalog_id_key] = original_catalog_id + + selected_preset_name = st.selectbox( + "Catalog preset", + options=[""] + preset_names, + index=([""] + preset_names).index(default_preset_name) if default_preset_name in preset_names else 0, + key=preset_key, + help="Choose a preset catalog, or leave this empty and enter a catalog id manually.", + format_func=lambda value: value or "Custom / manual", + ) + selected_preset = preset_by_name.get(selected_preset_name or "", {}) + if st.session_state[last_preset_key] != selected_preset_name and selected_preset_name: + st.session_state[catalog_id_key] = str(selected_preset.get("catalog_id") or "") + st.session_state[last_preset_key] = selected_preset_name + elif st.session_state[last_preset_key] != selected_preset_name and not selected_preset_name: + st.session_state[catalog_id_key] = original_catalog_id + st.session_state[last_preset_key] = selected_preset_name + catalog_id = st.text_input( + "Catalog ID", + value="", + key=catalog_id_key, + help="You can switch to a different catalog while still reusing the build artifacts from the source job.", + ).strip() + + selected_suite_labels = st.multiselect( + "Suites to run", + options=suite_labels, + default=suite_labels, + help="Defaults to the suite set found on the source job. 
Clear the list to let the evaluator use its default suite selection.", + disabled=not suite_labels, + ) + description = st.text_input( + "Description", + value="", + help="Leave empty to use an automatic evaluator artifact-retest name.", + ).strip() + retest_output_path = st.text_input( + "Output path", + value=default_output_path, + help="Folder under the data directory for the downloaded retest results.", + ) + run_download_type = st.radio( + "Download type", + ["Archives (ZIP)", "Result JSON only"], + index=0, + horizontal=True, + ) + run_phase = "" + if run_download_type == "Archives (ZIP)": + run_phase = st.text_input( + "Phase to extract", + value=phase_default, + help="Enter the phase name to extract from archives.", + ) + + run_cols = st.columns([1.2, 1.2, 1.0]) + with run_cols[0]: + run_eval = st.checkbox( + "Run evaluation", + value=True, + help="Run eval_result and generate Summary.csv / Score.csv after download.", + ) + with run_cols[1]: + generate_parquet = st.checkbox( + "Generate parquet", + value=CATALOG_IO_AVAILABLE, + disabled=not CATALOG_IO_AVAILABLE, + help="Build scene_result.parquet from .pkl files." 
if CATALOG_IO_AVAILABLE else "Install perception_catalog_analyzer to enable this.", + ) + with run_cols[2]: + eval_recursive = st.checkbox( + "Recursive eval", + value=True, + help="Search subdirectories for evaluation result folders.", + ) + + action_cols = st.columns([1.15, 1.15, 3.7]) + cancel_clicked = action_cols[0].button("Cancel", key=f"recent_eval_retest_cancel_{job_id}", use_container_width=True) + start_clicked = action_cols[1].button("Retest", key=f"recent_eval_retest_start_{job_id}", type="primary", use_container_width=True) + + if cancel_clicked: + st.session_state.pop("recent_eval_jobs_retest_selected", None) + st.rerun() + + if not start_clicked: + return + + final_catalog_id = str(selected_preset.get("catalog_id") or catalog_id or "").strip() + if not final_catalog_id: + st.error("Catalog ID is required.") + return + + resolved_output, path_err = resolve_under_data_root(retest_output_path, allow_create=True) + if path_err: + st.error(f"Output path is invalid: {path_err}") + return + + selected_suite_ids = [suite_label_to_id[label] for label in selected_suite_labels] + resolved_path_str = str(resolved_output) + has_custom_catalog = bool(final_catalog_id and not selected_preset_name) + final_description = description or _make_retest_description( + str(detail.get("target") or job_id), + selected_preset_name, + has_custom_catalog=has_custom_catalog, + ) + + task_id = _enqueue_task( + "run_evaluator_and_process", + { + "project_id": project_id, + "catalog_id": final_catalog_id, + "integration_id": "", + "source_job_id": job_id, + "suite_ids": selected_suite_ids or None, + "target_name": "", + "description": final_description, + "output_path": resolved_path_str, + "environment": environment, + "max_retries": 0, + "clean_build": False, + "debug": False, + "is_tag": False, + "download_type": "archives" if run_download_type == "Archives (ZIP)" else "result_json", + "phase": run_phase, + "skip_large_file": False, + "large_file_mb": 50.0, + "keep_zip_files": 
False, + "poll_interval": 60, + "max_wait_seconds": 6 * 3600, + "run_eval": run_eval, + "generate_parquet": generate_parquet, + "eval_recursive": eval_recursive, + "eval_overwrite": False, + }, + ) + if not task_id: + st.error("Failed to enqueue task. Check REDIS_URL and DATABASE_URL.") + return + + set_config_value("output_path", to_data_relative(resolved_output)) + set_config_value("environment", environment) + set_config_value("project_id", project_id) + set_config_value("catalog_id", final_catalog_id) + set_config_value("suite_ids", selected_suite_ids) + + st.session_state["recent_eval_jobs_flash"] = ( + f"Queued artifact retest for `{detail.get('title', job_id)}`. " + f"Task id: `{task_id}`." + ) + st.session_state.pop("recent_eval_jobs_retest_selected", None) + st.rerun() + + def _render_recent_evaluator_jobs_section( project_id: str, environment: str, @@ -1537,16 +1816,21 @@ def _render_job_list() -> None: st.session_state.pop("recent_eval_jobs_run_selected", None) selected_run_job_id = None + selected_retest_job_id = st.session_state.get("recent_eval_jobs_retest_selected") + if selected_retest_job_id and not any(str(job.get("job_id", "")) == str(selected_retest_job_id) for job in jobs): + st.session_state.pop("recent_eval_jobs_retest_selected", None) + selected_retest_job_id = None + st.markdown('
', unsafe_allow_html=True) for job in visible_jobs: subject_id = str(job.get("scheduled_by") or "").strip() user_info = user_directory.get(subject_id, {}) user_label = str(user_info.get("name") or subject_id or "Unknown").strip() - row_cols = st.columns([9.8, 2.0]) + row_cols = st.columns([9.2, 2.6]) with row_cols[0]: _render_recent_evaluator_job_card(job, user_label=user_label) with row_cols[1]: - action_cols = st.columns([1.0, 1.0], gap="small") + action_cols = st.columns([1.0, 1.0, 1.0], gap="small") with action_cols[0]: if st.button("Details", key=f"recent_eval_view_{job['job_id']}", use_container_width=True): st.session_state["recent_eval_jobs_selected"] = str(job["job_id"]) @@ -1557,6 +1841,11 @@ def _render_job_list() -> None: st.session_state["recent_eval_jobs_run_selected"] = str(job["job_id"]) _fetch_evaluator_job_detail.clear() st.rerun() + with action_cols[2]: + if st.button("Retest", key=f"recent_eval_retest_{job['job_id']}", use_container_width=True): + st.session_state["recent_eval_jobs_retest_selected"] = str(job["job_id"]) + _fetch_evaluator_job_detail.clear() + st.rerun() st.markdown("
", unsafe_allow_html=True) selected_job_id = st.session_state.get("recent_eval_jobs_selected") @@ -1633,4 +1922,42 @@ def _recent_eval_run_dialog() -> None: ) st.markdown("
", unsafe_allow_html=True) + selected_retest_job_id = st.session_state.get("recent_eval_jobs_retest_selected") + if selected_retest_job_id: + selected_retest_job = next((job for job in jobs if str(job.get("job_id", "")) == str(selected_retest_job_id)), None) + if selected_retest_job: + if callable(getattr(st, "dialog", None)): + try: + @st.dialog(f"Artifact retest · {selected_retest_job.get('title', '—')}", width="large") + def _recent_eval_retest_dialog() -> None: + _render_recent_evaluator_job_retest_dialog( + project_id, + environment, + selected_retest_job, + output_path_default=output_path_default, + phase_default=phase_default, + ) + + _recent_eval_retest_dialog() + finally: + if st.session_state.get("recent_eval_jobs_retest_selected") == str(selected_retest_job_id): + st.session_state.pop("recent_eval_jobs_retest_selected", None) + else: + st.markdown('
', unsafe_allow_html=True) + hdr_cols = st.columns([4.4, 1.1]) + with hdr_cols[0]: + st.subheader(f"Artifact retest · {selected_retest_job.get('title', '—')}") + with hdr_cols[1]: + if st.button("Close", key="recent_eval_jobs_close_retest_fallback", use_container_width=True): + st.session_state.pop("recent_eval_jobs_retest_selected", None) + st.rerun() + _render_recent_evaluator_job_retest_dialog( + project_id, + environment, + selected_retest_job, + output_path_default=output_path_default, + phase_default=phase_default, + ) + st.markdown("
", unsafe_allow_html=True) + _render_job_list() diff --git a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py b/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py index 13b18b2..9af2634 100644 --- a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py +++ b/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py @@ -119,6 +119,74 @@ def _load_catalog_presets(): return presets, loaded_path, load_error +def _fetch_server_catalogs(project_id: str, environment: str) -> List[Dict[str, str]]: + """Fetch available catalogs for the project on demand.""" + if not project_id: + return [] + import os + from lib.WebAPI import catalogAPI + + os.environ["AUTH_PROFILE"] = environment or "default" + response = catalogAPI(project_id=project_id).list_catalogs() + response.raise_for_status() + data = response.json() + raw_catalogs = data.get("catalogs", []) if isinstance(data, dict) else data + options: List[Dict[str, str]] = [] + for item in raw_catalogs or []: + if not isinstance(item, dict): + continue + catalog_id = str(item.get("id") or item.get("catalog_id") or "").strip() + display_name = str(item.get("display_name") or item.get("name") or catalog_id).strip() + if not catalog_id or not display_name: + continue + options.append( + { + "catalog_id": catalog_id, + "display_name": display_name, + "description": str(item.get("description") or "").strip(), + } + ) + options.sort(key=lambda item: item["display_name"].lower()) + return options + + +def _resolve_integration_id_for_catalog(project_id: str, environment: str, catalog_id: str) -> str: + """Resolve the most relevant active integration for a catalog.""" + if not project_id or not catalog_id: + return "" + from lib import evaluator_api + + os.environ["AUTH_PROFILE"] = environment or "default" + api = evaluator_api.EvaluationRunAPI() + url = f"{api.api_base_url}/projects/{project_id}/integrations" + response = api.request(url, {"catalog_id": catalog_id, "size": 100}, method="GET") + if response is None: + raise 
RuntimeError("No response returned while loading integrations.") + if response.status_code != 200: + raise RuntimeError(f"Failed to load integrations: status={response.status_code}") + + payload = json.loads(response.content) + integrations = payload.get("integrations", []) or [] + active = [ + item for item in integrations + if isinstance(item, dict) + and str(item.get("catalog_id") or "").strip() == catalog_id + and not bool(item.get("deleted")) + ] + if not active: + raise RuntimeError("No active integration was found for the selected catalog.") + + def _sort_key(item: Dict[str, object]) -> tuple: + return ( + str(item.get("updated_at") or ""), + int(item.get("version_id") or 0), + str(item.get("id") or ""), + ) + + active.sort(key=_sort_key, reverse=True) + return str(active[0].get("id") or "").strip() + + def _enqueue_task(task_type: str, params: dict) -> Optional[str]: try: session_id = get_task_list_current_user() @@ -476,33 +544,68 @@ def _render_local_runs_section() -> None: st.markdown('
No finished runs were found on this server yet.
', unsafe_allow_html=True) return - control_cols = st.columns([2.2, 0.8, 0.8, 0.7]) - with control_cols[0]: - st.markdown('
Search
', unsafe_allow_html=True) - run_search = st.text_input( - "Search runs", - value=st.session_state.get("workflow_runs_search", ""), - key="workflow_runs_search", - label_visibility="collapsed", - placeholder="Filter by run name", - ).strip().lower() - with control_cols[1]: - st.markdown('
Summary
', unsafe_allow_html=True) - require_summary = st.toggle( - "Summary only", - key="workflow_runs_summary_filter", - label_visibility="collapsed", - ) - with control_cols[2]: - st.markdown('
Parquet
', unsafe_allow_html=True) - require_parquet = st.toggle( - "Parquet only", - key="workflow_runs_parquet_filter", - label_visibility="collapsed", - ) - with control_cols[3]: - st.markdown('
Rows
', unsafe_allow_html=True) - page_size = int(st.selectbox("Rows", options=[10, 20, 50, 100], index=0, key="workflow_runs_page_size", label_visibility="collapsed")) + if "workflow_runs_search_applied" not in st.session_state: + st.session_state["workflow_runs_search_applied"] = st.session_state.get("workflow_runs_search", "") + if "workflow_runs_summary_filter_applied" not in st.session_state: + st.session_state["workflow_runs_summary_filter_applied"] = bool(st.session_state.get("workflow_runs_summary_filter", False)) + if "workflow_runs_parquet_filter_applied" not in st.session_state: + st.session_state["workflow_runs_parquet_filter_applied"] = bool(st.session_state.get("workflow_runs_parquet_filter", False)) + if "workflow_runs_page_size_applied" not in st.session_state: + st.session_state["workflow_runs_page_size_applied"] = int(st.session_state.get("workflow_runs_page_size", 10) or 10) + + with st.form("workflow_local_runs_filters", border=False): + control_cols = st.columns([2.2, 0.8, 0.8, 0.7, 0.8]) + with control_cols[0]: + st.markdown('
Search
', unsafe_allow_html=True) + run_search_input = st.text_input( + "Search runs", + value=st.session_state.get("workflow_runs_search_applied", ""), + key="workflow_runs_search", + label_visibility="collapsed", + placeholder="Filter by run name", + ) + with control_cols[1]: + st.markdown('
Summary
', unsafe_allow_html=True) + require_summary_input = st.toggle( + "Summary only", + value=bool(st.session_state.get("workflow_runs_summary_filter_applied", False)), + key="workflow_runs_summary_filter", + label_visibility="collapsed", + ) + with control_cols[2]: + st.markdown('
Parquet
', unsafe_allow_html=True) + require_parquet_input = st.toggle( + "Parquet only", + value=bool(st.session_state.get("workflow_runs_parquet_filter_applied", False)), + key="workflow_runs_parquet_filter", + label_visibility="collapsed", + ) + with control_cols[3]: + st.markdown('
Rows
', unsafe_allow_html=True) + page_size_input = int( + st.selectbox( + "Rows", + options=[10, 20, 50, 100], + index=[10, 20, 50, 100].index(int(st.session_state.get("workflow_runs_page_size_applied", 10) or 10)), + key="workflow_runs_page_size", + label_visibility="collapsed", + ) + ) + with control_cols[4]: + st.markdown('
Apply
', unsafe_allow_html=True) + apply_filters = st.form_submit_button("Apply", use_container_width=True) + + if apply_filters: + st.session_state["workflow_runs_search_applied"] = run_search_input + st.session_state["workflow_runs_summary_filter_applied"] = bool(require_summary_input) + st.session_state["workflow_runs_parquet_filter_applied"] = bool(require_parquet_input) + st.session_state["workflow_runs_page_size_applied"] = int(page_size_input) + st.session_state["workflow_runs_page"] = 1 + + run_search = str(st.session_state.get("workflow_runs_search_applied", "")).strip().lower() + require_summary = bool(st.session_state.get("workflow_runs_summary_filter_applied", False)) + require_parquet = bool(st.session_state.get("workflow_runs_parquet_filter_applied", False)) + page_size = int(st.session_state.get("workflow_runs_page_size_applied", 10) or 10) filtered = runs if run_search: @@ -539,32 +642,6 @@ def _render_local_runs_section() -> None: start_idx = (current_page - 1) * page_size visible_runs = filtered[start_idx:start_idx + page_size] visible_names = {str(run["name"]) for run in visible_runs} - next_selected = [name for name in compare_selected if name not in visible_names] - for run in visible_runs: - run_name = str(run["name"]) - checkbox_key = f"workflow_compare_pick::{run_name}" - is_checked = bool(st.session_state.get(checkbox_key, run_name in compare_selected)) - if is_checked and run_name in compare_ready: - next_selected.append(run_name) - st.session_state["workflow_compare_runs"] = [name for name in compare_ready if name in next_selected] - - st.markdown('
', unsafe_allow_html=True) - st.markdown('

Compare

', unsafe_allow_html=True) - compare_cols = st.columns([3.4, 1.0]) - with compare_cols[0]: - st.markdown('
Selected runs
', unsafe_allow_html=True) - selected_runs = list(st.session_state.get("workflow_compare_runs", [])) - if selected_runs: - st.caption(" | ".join(selected_runs)) - with compare_cols[1]: - st.markdown('
Action
', unsafe_allow_html=True) - if len(selected_runs) >= 2: - st.link_button("Compare", _build_overview_url(selected_runs[0], selected_runs[1:]), use_container_width=True) - elif len(selected_runs) == 1: - st.link_button("Open", _build_overview_url(selected_runs[0]), use_container_width=True) - else: - st.button("Open", disabled=True, use_container_width=True, key="workflow_compare_run_disabled") - st.markdown("
", unsafe_allow_html=True) pager_cols = st.columns([0.9, 1.2, 4.1]) with pager_cols[0]: @@ -597,6 +674,24 @@ def _render_local_runs_section() -> None: next_selected.append(run_name) st.session_state["workflow_compare_runs"] = [name for name in compare_ready if name in next_selected] + st.markdown('
', unsafe_allow_html=True) + st.markdown('

Compare

', unsafe_allow_html=True) + compare_cols = st.columns([3.4, 1.0]) + with compare_cols[0]: + st.markdown('
Selected runs
', unsafe_allow_html=True) + selected_runs = list(st.session_state.get("workflow_compare_runs", [])) + if selected_runs: + st.caption(" | ".join(selected_runs)) + with compare_cols[1]: + st.markdown('
Action
', unsafe_allow_html=True) + if len(selected_runs) >= 2: + st.link_button("Compare", _build_overview_url(selected_runs[0], selected_runs[1:]), use_container_width=True) + elif len(selected_runs) == 1: + st.link_button("Open", _build_overview_url(selected_runs[0]), use_container_width=True) + else: + st.button("Open", disabled=True, use_container_width=True, key="workflow_compare_run_disabled") + st.markdown("
", unsafe_allow_html=True) + def _render_current_tasks_section() -> None: section_header("Current Tasks", "") @@ -675,7 +770,34 @@ def _render_start_workflow_form( default_environment = get_config_value("environment", "") default_output = _make_default_output_path(default_target) - top_cols = st.columns([1.0, 1.5, 1.2]) + if "workflow_server_catalogs" not in st.session_state: + st.session_state["workflow_server_catalogs"] = [] + if "workflow_server_catalog_error" not in st.session_state: + st.session_state["workflow_server_catalog_error"] = "" + if "workflow_selected_server_catalog_id" not in st.session_state: + st.session_state["workflow_selected_server_catalog_id"] = "" + if "workflow_catalog_id" not in st.session_state: + st.session_state["workflow_catalog_id"] = "" + if "workflow_integration_id" not in st.session_state: + st.session_state["workflow_integration_id"] = "" + if "workflow_catalog_resolution_error" not in st.session_state: + st.session_state["workflow_catalog_resolution_error"] = "" + if "workflow_last_catalog_selection" not in st.session_state: + st.session_state["workflow_last_catalog_selection"] = "" + + server_catalogs = st.session_state.get("workflow_server_catalogs", []) or [] + server_catalog_labels = [ + f"{item['display_name']} ({item['catalog_id']})" for item in server_catalogs + ] + catalog_options = [""] + catalog_names + [ + label for label in server_catalog_labels if label not in catalog_names + ] + preset_by_label = {item["display_name"]: item for item in catalog_presets} + server_by_label = { + f"{item['display_name']} ({item['catalog_id']})": item for item in server_catalogs + } + + top_cols = st.columns([1.0, 1.9, 1.2]) with top_cols[0]: st.markdown('
Project
', unsafe_allow_html=True) project_id = st.text_input( @@ -685,25 +807,60 @@ def _render_start_workflow_form( label_visibility="collapsed", ).strip() with top_cols[1]: - st.markdown('
Catalog preset
', unsafe_allow_html=True) - selected_catalog_name = st.selectbox( - "Catalog preset", - options=[""] + catalog_names if catalog_names else [""], - index=0, - key="workflow_catalog_name", - label_visibility="collapsed", - format_func=lambda value: value or "Optional preset", - ) - selected_catalog = next( - (item for item in catalog_presets if item["display_name"] == selected_catalog_name), - None, - ) + st.markdown('
Catalog
', unsafe_allow_html=True) + catalog_picker_cols = st.columns([4.2, 1.1], gap="small") + with catalog_picker_cols[0]: + selected_catalog_name = st.selectbox( + "Catalog", + options=catalog_options if catalog_options else [""], + index=catalog_options.index(st.session_state.get("workflow_catalog_name", "")) if st.session_state.get("workflow_catalog_name", "") in catalog_options else 0, + key="workflow_catalog_name", + label_visibility="collapsed", + format_func=lambda value: value or "Choose a catalog", + ) + with catalog_picker_cols[1]: + fetch_catalogs_clicked = st.button( + "Fetch", + key="workflow_fetch_server_catalogs", + use_container_width=True, + ) + if fetch_catalogs_clicked: + try: + current_environment = str(st.session_state.get("workflow_environment", default_environment) or "") + st.session_state["workflow_server_catalogs"] = _fetch_server_catalogs(project_id, current_environment) + st.session_state["workflow_server_catalog_error"] = "" + except Exception as exc: + st.session_state["workflow_server_catalogs"] = [] + st.session_state["workflow_server_catalog_error"] = str(exc) + selected_catalog = preset_by_label.get(selected_catalog_name) + selected_server_catalog = server_by_label.get(selected_catalog_name) if "workflow_last_catalog_preset" not in st.session_state: st.session_state["workflow_last_catalog_preset"] = "" if st.session_state["workflow_last_catalog_preset"] != selected_catalog_name and selected_catalog: st.session_state["workflow_catalog_id"] = str(selected_catalog.get("catalog_id") or "") st.session_state["workflow_integration_id"] = str(selected_catalog.get("integration_id") or "") + st.session_state["workflow_selected_server_catalog_id"] = "" + st.session_state["workflow_catalog_resolution_error"] = "" st.session_state["workflow_last_catalog_preset"] = selected_catalog_name + elif selected_server_catalog: + st.session_state["workflow_catalog_id"] = str(selected_server_catalog.get("catalog_id") or "") + 
st.session_state["workflow_selected_server_catalog_id"] = str(selected_server_catalog.get("catalog_id") or "") + current_environment = str(st.session_state.get("workflow_environment", default_environment) or "") + if st.session_state["workflow_last_catalog_selection"] != selected_catalog_name: + try: + st.session_state["workflow_integration_id"] = _resolve_integration_id_for_catalog( + project_id, + current_environment, + st.session_state["workflow_catalog_id"], + ) + st.session_state["workflow_catalog_resolution_error"] = "" + except Exception as exc: + st.session_state["workflow_integration_id"] = "" + st.session_state["workflow_catalog_resolution_error"] = str(exc) + st.session_state["workflow_last_catalog_selection"] = selected_catalog_name + elif st.session_state["workflow_last_catalog_selection"] != selected_catalog_name: + st.session_state["workflow_catalog_resolution_error"] = "" + st.session_state["workflow_last_catalog_selection"] = selected_catalog_name with top_cols[2]: st.markdown('
Branch or tag
', unsafe_allow_html=True) target_name = st.text_input( @@ -714,26 +871,15 @@ def _render_start_workflow_form( placeholder="beta/v4.3.2", ).strip() - detail_cols = st.columns([1.05, 1.05, 1.2, 0.8, 0.95]) - with detail_cols[0]: - st.markdown('
Catalog ID
', unsafe_allow_html=True) - catalog_id = st.text_input( - "Catalog ID", - value="", - key="workflow_catalog_id", - label_visibility="collapsed", - placeholder="vehicle catalog id", - ).strip() - with detail_cols[1]: - st.markdown('
Integration ID
', unsafe_allow_html=True) - integration_id = st.text_input( - "Integration ID", - value="", - key="workflow_integration_id", - label_visibility="collapsed", - placeholder="integration id", - ).strip() - with detail_cols[2]: + catalog_id = str(st.session_state.get("workflow_catalog_id") or "").strip() + integration_id = str(st.session_state.get("workflow_integration_id") or "").strip() + + if st.session_state.get("workflow_server_catalog_error"): + st.warning(f"Could not fetch catalogs: {st.session_state['workflow_server_catalog_error']}") + catalog_id = str(st.session_state.get("workflow_catalog_id") or "").strip() + + picker_cols = st.columns([1.2, 1.2, 1.75]) + with picker_cols[0]: st.markdown('
Output folder
', unsafe_allow_html=True) output_path = st.text_input( "Output folder", @@ -742,7 +888,7 @@ def _render_start_workflow_form( label_visibility="collapsed", placeholder=_make_default_output_path(target_name), ).strip() - with detail_cols[3]: + with picker_cols[1]: st.markdown('
Environment
', unsafe_allow_html=True) environment = st.selectbox( "Environment", @@ -752,7 +898,7 @@ def _render_start_workflow_form( label_visibility="collapsed", format_func=lambda value: value or "default", ) - with detail_cols[4]: + with picker_cols[2]: st.markdown('
Description
', unsafe_allow_html=True) description = st.text_input( "Description", @@ -762,9 +908,23 @@ def _render_start_workflow_form( placeholder="Optional label for the evaluator run", ).strip() + confirm_cols = st.columns([1.0, 1.0]) + with confirm_cols[0]: + if catalog_id: + st.caption(f"Catalog ID: `{catalog_id}`") + with confirm_cols[1]: + if integration_id: + st.caption(f"Integration ID: `{integration_id}`") + if st.session_state.get("workflow_catalog_resolution_error"): + st.warning(f"Could not resolve integration automatically: {st.session_state['workflow_catalog_resolution_error']}") + if selected_catalog: desc = str(selected_catalog.get("description") or "").strip() or "Preset selected for quick scheduling." st.caption(f"Preset: {desc}") + elif selected_server_catalog: + desc = str(selected_server_catalog.get("description") or "").strip() + if desc: + st.caption(f"Fetched catalog: {desc}") with st.expander("Advanced options", expanded=False): adv_cols = st.columns([1.0, 1.2, 0.8, 0.8]) @@ -837,7 +997,7 @@ def _render_start_workflow_form( resolved_output = None path_error = "" if output_path: - resolved_output, path_error = resolve_under_data_root(output_path, allow_create=True) + resolved_output, path_error = resolve_under_data_root(output_path, allow_create=False) if path_error: errors.append(path_error) else: @@ -882,6 +1042,8 @@ def _render_workflow_launcher_section( ) -> Dict[str, object]: section_header("Run Evaluator Workflow", "") start_defaults = _get_start_workflow_defaults() + if "workflow_start_dialog_open" not in st.session_state: + st.session_state["workflow_start_dialog_open"] = False new_job_clicked = st.button( "Start new workflow", key="workflow_open_start_dialog", @@ -890,11 +1052,18 @@ def _render_workflow_launcher_section( ) if new_job_clicked and callable(getattr(st, "dialog", None)): + st.session_state["workflow_start_dialog_open"] = True fresh_target = str(get_config_value("target_name", "beta/v4.3.2") or "beta/v4.3.2") 
st.session_state["workflow_catalog_name"] = "" st.session_state["workflow_last_catalog_preset"] = "" st.session_state["workflow_catalog_id"] = "" st.session_state["workflow_integration_id"] = "" + st.session_state["workflow_server_catalogs"] = [] + st.session_state["workflow_server_catalog_error"] = "" + st.session_state["workflow_selected_server_catalog_id"] = "" + st.session_state["workflow_selected_server_catalog_label"] = "" + st.session_state["workflow_catalog_resolution_error"] = "" + st.session_state["workflow_last_catalog_selection"] = "" st.session_state["workflow_output_path"] = _make_default_output_path(fresh_target) @st.dialog("Start evaluator workflow", width="large") @@ -905,6 +1074,7 @@ def _workflow_start_dialog() -> None: close_clicked = submit_cols[0].button("Close", key="workflow_close_start_dialog", use_container_width=True) start_clicked = submit_cols[1].button("Start workflow", key="workflow_start_btn_dialog", type="primary", use_container_width=True) if close_clicked: + st.session_state["workflow_start_dialog_open"] = False st.rerun() if start_clicked: dialog_payload = dict(payload.get("dialog_payload") or {}) @@ -948,6 +1118,7 @@ def _workflow_start_dialog() -> None: }, ) if task_id: + st.session_state["workflow_start_dialog_open"] = False st.success(f"Workflow queued. 
Task id: `{task_id}`") st.rerun() else: @@ -996,4 +1167,16 @@ def _workflow_start_dialog() -> None: ) with tab_local: - _render_local_runs_section() + use_fragment = getattr(st, "fragment", None) is not None + if use_fragment: + try: + + @st.fragment + def _local_runs_fragment(): + _render_local_runs_section() + + _local_runs_fragment() + except (TypeError, AttributeError): + _render_local_runs_section() + else: + _render_local_runs_section() diff --git a/evaluation_dashboard_app/worker/tasks.py b/evaluation_dashboard_app/worker/tasks.py index 2b07e0d..794dafa 100644 --- a/evaluation_dashboard_app/worker/tasks.py +++ b/evaluation_dashboard_app/worker/tasks.py @@ -476,6 +476,7 @@ def job_run_evaluator_and_process(task_id: str, parameters: Dict[str, Any]) -> N project_id = parameters.get("project_id") catalog_id = parameters.get("catalog_id") integration_id = parameters.get("integration_id") + source_job_id = parameters.get("source_job_id") suite_ids = parameters.get("suite_ids") target_name = parameters.get("target_name") # branch name or tag description = parameters.get("description", "no description") @@ -507,8 +508,10 @@ def job_run_evaluator_and_process(task_id: str, parameters: Dict[str, Any]) -> N clean_build = parameters.get("clean_build", False) debug = parameters.get("debug", False) is_tag = parameters.get("is_tag", False) - - if not all([project_id, catalog_id, integration_id, target_name, output_path]): + + has_source_job = bool(source_job_id) + has_fresh_source = bool(integration_id and target_name) + if not project_id or not catalog_id or not output_path or (not has_source_job and not has_fresh_source): update_task_status(task_id, "failed", error_message="Missing required parameters") return @@ -525,7 +528,13 @@ def on_warning(msg: str) -> None: # Step 1: Schedule evaluator job on_progress("Step 1/5: Scheduling evaluator job...") - append_task_log(task_id, f"Project: {project_id}, Catalog: {catalog_id}, Target: {target_name}") + if source_job_id: + 
append_task_log( + task_id, + f"Project: {project_id}, Catalog: {catalog_id}, Reuse build from job: {source_job_id}", + ) + else: + append_task_log(task_id, f"Project: {project_id}, Catalog: {catalog_id}, Target: {target_name}") try: api = evaluator_api.EvaluationRunAPI() @@ -535,6 +544,7 @@ def on_warning(msg: str) -> None: catalog_id=catalog_id, integration_id=integration_id, target_name=target_name, + source_job_id=source_job_id, suite_ids=suite_ids, max_retries=max_retries, description=description, @@ -560,6 +570,7 @@ def on_warning(msg: str) -> None: "evaluator_job_id": job_id, "evaluator_report_url": report_url, "evaluator_status": "scheduled", + "source_job_id": source_job_id or "", "download_summary": {"total": 0, "success": 0, "failed": 0}, "eval_summary": {}, "parquet_path": "", From 49f730b1966e2ab93cd2cbca11611a43b868a400 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Mon, 18 May 2026 10:02:56 +0900 Subject: [PATCH 69/94] feat: update evaluator workflow form with new options and UI adjustments - Changed the default search scope in the recent evaluator jobs section to enhance user experience. - Introduced a new option to skip large files during workflow execution, improving flexibility for users. - Updated the layout of the workflow form, including adjustments to the environment and phase selection, for better usability. - Enhanced the advanced options section by adding a checkbox for skipping large files and adjusting column widths for improved layout. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../lib/ui/recent_evaluator_jobs.py | 2 +- .../pages/7_Evaluator_Workflow.py | 39 +++++++++++-------- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py b/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py index 3c12854..ca2a991 100644 --- a/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py +++ b/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py @@ -1587,7 +1587,7 @@ def _render_recent_evaluator_jobs_section( search_scope = st.selectbox( "Search in", options=["Branch/tag", "Description", "Job ID", "Git SHA", "Fail message"], - index=0, + index=1, key="recent_eval_jobs_search_scope", help="Choose which evaluator field the quick search should target.", label_visibility="collapsed", diff --git a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py b/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py index 9af2634..4c945ad 100644 --- a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py +++ b/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py @@ -769,6 +769,7 @@ def _render_start_workflow_form( default_max_wait_hours = int(get_config_value("max_wait_hours", 24)) default_environment = get_config_value("environment", "") default_output = _make_default_output_path(default_target) + default_skip_large_file = True if "workflow_server_catalogs" not in st.session_state: st.session_state["workflow_server_catalogs"] = [] @@ -889,14 +890,12 @@ def _render_start_workflow_form( placeholder=_make_default_output_path(target_name), ).strip() with picker_cols[1]: - st.markdown('
Environment
', unsafe_allow_html=True) - environment = st.selectbox( - "Environment", - options=["", "dev", "stg", "prd"], - index=["", "dev", "stg", "prd"].index(default_environment) if default_environment in ("", "dev", "stg", "prd") else 0, - key="workflow_environment", + st.markdown('
Phase
', unsafe_allow_html=True) + phase = st.text_input( + "Phase", + value=default_phase, + key="workflow_phase", label_visibility="collapsed", - format_func=lambda value: value or "default", ) with picker_cols[2]: st.markdown('
Description
', unsafe_allow_html=True) @@ -927,7 +926,7 @@ def _render_start_workflow_form( st.caption(f"Fetched catalog: {desc}") with st.expander("Advanced options", expanded=False): - adv_cols = st.columns([1.0, 1.2, 0.8, 0.8]) + adv_cols = st.columns([1.0, 1.0, 0.8, 0.8]) with adv_cols[0]: download_type = st.radio( "Download type", @@ -937,11 +936,12 @@ def _render_start_workflow_form( key="workflow_download_type", ) with adv_cols[1]: - phase = st.text_input( - "Phase", - value=default_phase, - key="workflow_phase", - disabled=download_type != "Archives (ZIP)", + environment = st.selectbox( + "Environment", + options=["", "dev", "stg", "prd"], + index=["", "dev", "stg", "prd"].index(default_environment) if default_environment in ("", "dev", "stg", "prd") else 0, + key="workflow_environment", + format_func=lambda value: value or "default", ) with adv_cols[2]: poll_interval = st.slider( @@ -961,7 +961,7 @@ def _render_start_workflow_form( key="workflow_max_wait_hours", ) - option_cols = st.columns(4) + option_cols = st.columns(5) with option_cols[0]: run_eval = st.checkbox("Run evaluation", value=True, key="workflow_run_eval") with option_cols[1]: @@ -972,8 +972,14 @@ def _render_start_workflow_form( key="workflow_generate_parquet", ) with option_cols[2]: - eval_recursive = st.checkbox("Recursive scan", value=True, key="workflow_eval_recursive") + skip_large_file = st.checkbox( + "Skip large files", + value=default_skip_large_file, + key="workflow_skip_large_file", + ) with option_cols[3]: + eval_recursive = st.checkbox("Recursive scan", value=True, key="workflow_eval_recursive") + with option_cols[4]: is_tag = st.checkbox("Target is tag", value=False, key="workflow_is_tag") set_config_value("eval_project_id", project_id) @@ -1030,6 +1036,7 @@ def _render_start_workflow_form( "max_wait_hours": int(max_wait_hours), "run_eval": bool(run_eval), "generate_parquet": bool(generate_parquet), + "skip_large_file": bool(skip_large_file), "eval_recursive": bool(eval_recursive), }, } @@ 
-1106,7 +1113,7 @@ def _workflow_start_dialog() -> None: "is_tag": dialog_payload["is_tag"], "download_type": "archives" if dialog_payload["download_type"] == "Archives (ZIP)" else "result_json", "phase": dialog_payload["phase"], - "skip_large_file": False, + "skip_large_file": bool(dialog_payload.get("skip_large_file", True)), "large_file_mb": 50.0, "keep_zip_files": False, "poll_interval": dialog_payload["poll_interval"], From e430e355683c3b86e2fb725e77bcb38df905aa30 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Mon, 18 May 2026 13:26:15 +0900 Subject: [PATCH 70/94] feat: enhance task management and metadata handling in evaluation dashboard - Updated the `get_task` function to include `session_id` in the returned task row for better tracking. - Introduced `count_recent_tasks` function to provide total task counts based on filters, improving task overview capabilities. - Enhanced `list_recent_tasks` to support pagination with `offset` parameter, allowing for more efficient task retrieval. - Added a new module for handling run metadata, including functions for reading, writing, and merging run metadata, improving data management for evaluation runs. - Updated UI components to reflect new task history range options and pagination controls, enhancing user experience in task management. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/lib/db.py | 48 +- evaluation_dashboard_app/lib/run_metadata.py | 381 ++++++++++ .../lib/ui/download_ui.py | 12 +- .../lib/ui/recent_evaluator_jobs.py | 6 +- evaluation_dashboard_app/pages/6_Download.py | 59 +- .../pages/7_Evaluator_Workflow.py | 495 +++++++++++- evaluation_dashboard_app/worker/tasks.py | 704 +++++++++++++++++- 7 files changed, 1662 insertions(+), 43 deletions(-) create mode 100644 evaluation_dashboard_app/lib/run_metadata.py diff --git a/evaluation_dashboard_app/lib/db.py b/evaluation_dashboard_app/lib/db.py index 0276c72..4ce178f 100644 --- a/evaluation_dashboard_app/lib/db.py +++ b/evaluation_dashboard_app/lib/db.py @@ -406,7 +406,7 @@ def update_task_result_summary(task_id: str, summary: Dict[str, Any]) -> bool: def get_task(task_id: str) -> Optional[Dict[str, Any]]: - """Return task row as dict (includes ``rq_job_id`` for RQ cancel / reconcile).""" + """Return task row as dict (includes ``rq_job_id`` and ``session_id`` when available).""" url = get_database_url() if not url: return None @@ -422,7 +422,7 @@ def get_task(task_id: str) -> Optional[Dict[str, Any]]: cur.execute( """SELECT id, type, status, parameters, result_path, error_message, progress_message, progress_pct, log_output, result_summary, rq_job_id, - created_at, updated_at + session_id, created_at, updated_at FROM tasks WHERE id = %s""", (task_id,), ) @@ -438,6 +438,7 @@ def get_task(task_id: str) -> Optional[Dict[str, Any]]: def list_recent_tasks( limit: int = 50, + offset: int = 0, session_id: Optional[str] = None, since_days: Optional[int] = None, ) -> List[Dict[str, Any]]: @@ -471,13 +472,13 @@ def list_recent_tasks( ) params.append(int(since_days)) where = (" WHERE " + " AND ".join(conditions)) if conditions else "" - params.append(limit) + params.extend([max(0, int(limit)), max(0, int(offset))]) cur.execute( f""" SELECT {cols} FROM tasks{where} ORDER BY created_at DESC - LIMIT %s + 
LIMIT %s OFFSET %s """, params, ) @@ -496,6 +497,45 @@ def list_recent_tasks( return rows +def count_recent_tasks( + session_id: Optional[str] = None, + since_days: Optional[int] = None, +) -> int: + """Return total task count for the same filter shape as ``list_recent_tasks``.""" + url = get_database_url() + if not url: + return 0 + try: + import psycopg2 + except ImportError: + return 0 + try: + conn = psycopg2.connect(url) + try: + with conn.cursor() as cur: + conditions: List[str] = [] + params: List[Any] = [] + if session_id is not None: + conditions.append("session_id = %s") + params.append(session_id) + if since_days is not None: + conditions.append( + "created_at >= NOW() - (%s::integer * INTERVAL '1 day')" + ) + params.append(int(since_days)) + where = (" WHERE " + " AND ".join(conditions)) if conditions else "" + cur.execute( + f"SELECT COUNT(*) FROM tasks{where}", + params, + ) + row = cur.fetchone() + return int(row[0]) if row and row[0] is not None else 0 + finally: + conn.close() + except Exception: + return 0 + + def delete_task(task_id: str, session_id: Optional[str] = None) -> bool: """Delete a task row. 
For pending/running, cancels the RQ job first when ``rq_job_id`` is set.""" url = get_database_url() diff --git a/evaluation_dashboard_app/lib/run_metadata.py b/evaluation_dashboard_app/lib/run_metadata.py new file mode 100644 index 0000000..91e9211 --- /dev/null +++ b/evaluation_dashboard_app/lib/run_metadata.py @@ -0,0 +1,381 @@ +"""Helpers for durable per-run metadata stored alongside local run folders.""" + +from __future__ import annotations + +import json +import os +from datetime import datetime, timezone +from pathlib import Path +from tempfile import NamedTemporaryFile +from typing import Any, Dict, Iterable, Optional + +from lib.path_utils import get_data_root, path_display, to_data_relative + +RUN_METADATA_FILENAME = ".run_metadata.json" +RUN_METADATA_SCHEMA_VERSION = 1 + + +def _utc_now_iso() -> str: + return datetime.now(timezone.utc).replace(microsecond=0).isoformat() + + +def _json_safe(value: Any) -> Any: + if isinstance(value, dict): + return {str(key): _json_safe(val) for key, val in value.items()} + if isinstance(value, (list, tuple)): + return [_json_safe(item) for item in value] + if isinstance(value, Path): + return str(value) + if isinstance(value, datetime): + if value.tzinfo is None: + value = value.replace(tzinfo=timezone.utc) + return value.astimezone(timezone.utc).replace(microsecond=0).isoformat() + return value + + +def _deep_merge(base: Dict[str, Any], patch: Dict[str, Any]) -> Dict[str, Any]: + merged = dict(base) + for key, value in patch.items(): + if isinstance(value, dict) and isinstance(merged.get(key), dict): + merged[key] = _deep_merge(merged[key], value) + else: + merged[key] = value + return merged + + +def normalize_run_path(path_like: str | Path, *, allow_missing: bool = True) -> Optional[Path]: + raw = str(path_like or "").strip() + if not raw: + return None + try: + candidate = Path(raw) + if not candidate.is_absolute(): + candidate = get_data_root() / candidate + resolved = candidate.resolve(strict=False) + try: + 
resolved.relative_to(get_data_root()) + except ValueError: + return None + if not allow_missing and not resolved.exists(): + return None + return resolved + except Exception: + return None + + +def find_run_directory(path_like: str | Path, *, create_missing: bool = False) -> Optional[Path]: + resolved = normalize_run_path(path_like, allow_missing=True) + if resolved is None: + return None + try: + rel = resolved.relative_to(get_data_root()) + except ValueError: + return None + if not rel.parts: + return None + run_dir = get_data_root() / rel.parts[0] + if create_missing: + run_dir.mkdir(parents=True, exist_ok=True) + elif not run_dir.exists(): + return None + return run_dir + + +def resolve_run_directory_from_task_parameters( + parameters: Dict[str, Any], + *, + create_missing: bool = False, +) -> Optional[Path]: + for key in ("output_path", "output_dir", "eval_root", "pkl_dir", "result_path"): + path_value = parameters.get(key) + if not path_value: + continue + run_dir = find_run_directory(path_value, create_missing=create_missing) + if run_dir is not None: + return run_dir + return None + + +def metadata_path_for_run(run_path: Path) -> Path: + return run_path / RUN_METADATA_FILENAME + + +def read_run_metadata(run_path: Path) -> Dict[str, Any]: + meta_path = metadata_path_for_run(run_path) + if not meta_path.exists(): + return {} + try: + with meta_path.open("r", encoding="utf-8") as fh: + payload = json.load(fh) + return payload if isinstance(payload, dict) else {} + except Exception: + return {} + + +def write_run_metadata(run_path: Path, metadata: Dict[str, Any], *, create_missing: bool = False) -> Dict[str, Any]: + if create_missing: + run_path.mkdir(parents=True, exist_ok=True) + elif not run_path.exists(): + raise FileNotFoundError(str(run_path)) + + payload = dict(metadata) + payload["schema_version"] = RUN_METADATA_SCHEMA_VERSION + payload["run_name"] = run_path.name + payload["run_path"] = to_data_relative(run_path) + payload["run_path_display"] = 
path_display(run_path) + payload["updated_at"] = _utc_now_iso() + payload.setdefault("created_at", payload["updated_at"]) + + meta_path = metadata_path_for_run(run_path) + with NamedTemporaryFile("w", encoding="utf-8", dir=str(run_path), delete=False) as tmp: + json.dump(_json_safe(payload), tmp, ensure_ascii=False, indent=2, sort_keys=True) + tmp.write("\n") + tmp_path = Path(tmp.name) + try: + os.chmod(tmp_path, 0o644) + except Exception: + pass + tmp_path.replace(meta_path) + try: + os.chmod(meta_path, 0o644) + except Exception: + pass + return payload + + +def upsert_run_metadata(run_path: Path, patch: Dict[str, Any], *, create_missing: bool = False) -> Dict[str, Any]: + existing = read_run_metadata(run_path) + merged = _deep_merge(existing, _json_safe(patch)) + if "created_at" not in merged: + merged["created_at"] = _utc_now_iso() + return write_run_metadata(run_path, merged, create_missing=create_missing) + + +def flatten_metadata_text(value: Any) -> Iterable[str]: + if value is None: + return [] + if isinstance(value, dict): + parts = [] + for key, item in value.items(): + parts.append(str(key)) + parts.extend(flatten_metadata_text(item)) + return parts + if isinstance(value, (list, tuple, set)): + parts = [] + for item in value: + parts.extend(flatten_metadata_text(item)) + return parts + text = str(value).strip() + return [text] if text else [] + + +def build_run_search_blob(run_path: Path, metadata: Dict[str, Any], extra_values: Optional[Iterable[Any]] = None) -> str: + parts = [run_path.name, to_data_relative(run_path), path_display(run_path)] + parts.extend(flatten_metadata_text(metadata)) + if extra_values: + for value in extra_values: + parts.extend(flatten_metadata_text(value)) + return " ".join(part for part in parts if part).lower() + + +def _as_dict(value: Any) -> Dict[str, Any]: + if isinstance(value, dict): + return value + if isinstance(value, str): + try: + parsed = json.loads(value) + return parsed if isinstance(parsed, dict) else {} + except 
Exception: + return {} + return {} + + +def resolve_run_directory_from_task_row(task_row: Dict[str, Any]) -> Optional[Path]: + params = _as_dict(task_row.get("parameters")) + run_dir = resolve_run_directory_from_task_parameters(params, create_missing=False) + if run_dir is not None: + return run_dir + result_path = task_row.get("result_path") + if result_path: + return find_run_directory(result_path, create_missing=False) + summary = _as_dict(task_row.get("result_summary")) + for key in ("output_path", "summary_path", "parquet_path"): + path_value = summary.get(key) + if path_value: + run_dir = find_run_directory(path_value, create_missing=False) + if run_dir is not None: + return run_dir + return None + + +def build_metadata_patch_from_task_row(task_row: Dict[str, Any]) -> Dict[str, Any]: + params = _as_dict(task_row.get("parameters")) + summary = _as_dict(task_row.get("result_summary")) + task_type = str(task_row.get("type") or "").strip() + request_output = str( + params.get("output_path") + or params.get("output_dir") + or params.get("eval_root") + or params.get("pkl_dir") + or task_row.get("result_path") + or "" + ).strip() + + patch: Dict[str, Any] = { + "source_mode": task_type, + "task": { + "id": str(task_row.get("id") or "").strip(), + "type": task_type, + "status": str(task_row.get("status") or "").strip(), + "requested_by": str(task_row.get("session_id") or "").strip(), + "created_at": task_row.get("created_at"), + "updated_at": task_row.get("updated_at"), + "result_path": str(task_row.get("result_path") or "").strip(), + "error_message": str(task_row.get("error_message") or "").strip(), + "progress_message": str(task_row.get("progress_message") or "").strip(), + "progress_pct": task_row.get("progress_pct"), + }, + "request": { + "environment": str(params.get("environment") or "default").strip() or "default", + "project_id": str(params.get("project_id") or "").strip(), + "job_id": str(params.get("job_id") or "").strip(), + "catalog_id": 
str(params.get("catalog_id") or "").strip(), + "integration_id": str(params.get("integration_id") or "").strip(), + "source_job_id": str(params.get("source_job_id") or "").strip(), + "target_name": str(params.get("target_name") or "").strip(), + "description": str(params.get("description") or "").strip(), + "suite_id": str(params.get("suite_id") or "").strip(), + "suite_ids": list(params.get("suite_ids") or []), + "download_type": str(params.get("download_type") or "").strip(), + "phase": str(params.get("phase") or "").strip(), + "skip_large_file": bool(params.get("skip_large_file", False)), + "large_file_mb": params.get("large_file_mb"), + "keep_zip_files": bool(params.get("keep_zip_files", False)), + "run_eval": bool(params.get("run_eval", False)), + "generate_parquet": bool(params.get("generate_parquet", False)), + "eval_recursive": bool(params.get("eval_recursive", False)), + "eval_overwrite": bool(params.get("eval_overwrite", False)), + "max_retries": params.get("max_retries"), + "clean_build": bool(params.get("clean_build", False)), + "debug": bool(params.get("debug", False)), + "is_tag": bool(params.get("is_tag", False)), + "scenario_name_filter": str(params.get("scenario_name_filter") or "").strip(), + "selected_ids": list(params.get("selected_ids") or []), + "output_path": request_output, + "parameters": params, + }, + "backfilled_from_task_history": True, + } + + if task_type == "download_results": + patch["download"] = { + "mode": "download_results", + "total": summary.get("total", 0), + "success": summary.get("success", 0), + "failed": summary.get("failed", 0), + "rows": list(summary.get("rows") or [])[:100], + "download_type": str(params.get("download_type") or "").strip(), + "phase": str(params.get("phase") or "").strip(), + "skip_large_file": bool(params.get("skip_large_file", False)), + "large_file_mb": params.get("large_file_mb"), + "keep_zip_files": bool(params.get("keep_zip_files", False)), + } + elif task_type == "download_scenarios": + 
patch["scenario_download"] = { + "total": summary.get("total", 0), + "success": summary.get("success", 0), + "failed": summary.get("failed", 0), + "rows": list(summary.get("rows") or [])[:100], + "overwrite": bool(params.get("overwrite", False)), + "scenario_name_filter": str(params.get("scenario_name_filter") or "").strip(), + "selected_ids": list(params.get("selected_ids") or []), + } + elif task_type == "run_eval_dirs": + patch["evaluation"] = { + "directories_processed": summary.get("directories_processed", 0), + "success": summary.get("success", 0), + "failed": summary.get("failed", 0), + "skipped": summary.get("skipped", 0), + "summary_path": str(summary.get("summary_path") or "").strip(), + "summary_rows": summary.get("summary_rows", 0), + "score_rows": summary.get("score_rows", 0), + "enabled": True, + "recursive": bool(params.get("recursive", True)), + "overwrite": bool(params.get("overwrite", False)), + } + elif task_type == "generate_summary_csv": + patch["evaluation"] = { + "summary_path": str(summary.get("summary_path") or "").strip(), + "summary_rows": summary.get("summary_rows", 0), + "score_rows": summary.get("score_rows", 0), + "enabled": True, + } + elif task_type == "build_parquet": + patch["parquet"] = { + "enabled": True, + "path": str(summary.get("output_path") or "").strip(), + } + elif task_type == "download_and_eval": + patch["download"] = { + "mode": "download_and_eval", + **_as_dict(summary.get("download_summary")), + "download_type": str(params.get("download_type") or "").strip(), + "phase": str(params.get("phase") or "").strip(), + "skip_large_file": bool(params.get("skip_large_file", False)), + "large_file_mb": params.get("large_file_mb"), + "keep_zip_files": bool(params.get("keep_zip_files", False)), + } + patch["evaluation"] = { + **_as_dict(summary.get("eval_summary")), + "enabled": bool(params.get("run_eval", False)), + "recursive": bool(params.get("eval_recursive", False)), + "overwrite": bool(params.get("eval_overwrite", False)), 
+ } + patch["parquet"] = { + "enabled": bool(params.get("generate_parquet", False)), + "path": str(summary.get("parquet_path") or "").strip(), + } + errors = list(summary.get("errors") or []) + if errors: + patch["errors"] = errors + elif task_type == "run_evaluator_and_process": + patch["evaluator"] = { + "job_id": str(summary.get("evaluator_job_id") or params.get("job_id") or "").strip(), + "report_url": str(summary.get("evaluator_report_url") or "").strip(), + "status": str(summary.get("evaluator_status") or "").strip(), + "scheduled_by": str(summary.get("evaluator_scheduled_by") or "").strip(), + "build_status": str(summary.get("evaluator_build_status") or "").strip(), + "test_status": str(summary.get("evaluator_test_status") or "").strip(), + "fail_message": str(summary.get("evaluator_fail_message") or "").strip(), + "case_totals": _as_dict(summary.get("evaluator_case_totals")), + "suites": list(summary.get("evaluator_suites") or []), + "failed_cases": list(summary.get("evaluator_failed_cases") or []), + "catalog_id": str(params.get("catalog_id") or "").strip(), + "integration_id": str(params.get("integration_id") or "").strip(), + "source_job_id": str(params.get("source_job_id") or "").strip(), + "target_name": str(params.get("target_name") or "").strip(), + "description": str(params.get("description") or "").strip(), + "is_tag": bool(params.get("is_tag", False)), + } + patch["download"] = { + "mode": "run_evaluator_and_process", + **_as_dict(summary.get("download_summary")), + "rows": list(summary.get("download_rows") or [])[:100], + "download_type": str(params.get("download_type") or "").strip(), + "phase": str(params.get("phase") or "").strip(), + "skip_large_file": bool(params.get("skip_large_file", False)), + "large_file_mb": params.get("large_file_mb"), + "keep_zip_files": bool(params.get("keep_zip_files", False)), + } + patch["evaluation"] = { + **_as_dict(summary.get("eval_summary")), + "enabled": bool(params.get("run_eval", False)), + "recursive": 
bool(params.get("eval_recursive", False)), + "overwrite": bool(params.get("eval_overwrite", False)), + } + patch["parquet"] = { + "enabled": bool(params.get("generate_parquet", False)), + "path": str(summary.get("parquet_path") or "").strip(), + } + + return patch diff --git a/evaluation_dashboard_app/lib/ui/download_ui.py b/evaluation_dashboard_app/lib/ui/download_ui.py index 100d89f..28c6c23 100644 --- a/evaluation_dashboard_app/lib/ui/download_ui.py +++ b/evaluation_dashboard_app/lib/ui/download_ui.py @@ -127,15 +127,15 @@ def render_download_hero(*, queue_enabled: bool) -> None: ) -def render_download_task_section_header(*, since_days: int = 7, max_rows: int = 200) -> None: +def render_download_task_section_header(*, since_days: Optional[int] = 7, max_rows: int = 200) -> None: """Lightweight title for the worker task list (no extra card chrome — task rows are the cards).""" - days = int(since_days) cap = int(max_rows) st.subheader("Recent tasks") - st.caption( - f"Queued/running jobs below; completed or failed in **Task history**. " - f"Last **{days}** days, up to **{cap}** rows." - ) + if since_days is None: + window = "All time" + else: + window = f"Last **{int(since_days)}** days" + st.caption(f"Queued/running jobs below; completed or failed in **Task history**. 
{window}, up to **{cap}** rows.") def _coerce_progress_fraction(progress_pct: Optional[Any]) -> Optional[float]: diff --git a/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py b/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py index ca2a991..8228393 100644 --- a/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py +++ b/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py @@ -1030,9 +1030,9 @@ def _render_recent_evaluator_job_card(job: Dict[str, Any], *, user_label: str = }.get(status_variant, '') meta_line = job_id counts = ( - f'S {int(job.get("success", 0))} · ' - f'F {int(job.get("failed", 0))} · ' - f'C {int(job.get("canceled", 0))} / ' + f'✅ {int(job.get("success", 0))} · ' + f'❌ {int(job.get("failed", 0))} · ' + f'⏹ {int(job.get("canceled", 0))} / ' f'{int(job.get("total", 0))}' ) title_html = f'{title_text}' if report_url else title_text diff --git a/evaluation_dashboard_app/pages/6_Download.py b/evaluation_dashboard_app/pages/6_Download.py index c34f531..ad44b35 100644 --- a/evaluation_dashboard_app/pages/6_Download.py +++ b/evaluation_dashboard_app/pages/6_Download.py @@ -56,6 +56,7 @@ def _to_jst(dt: Any) -> Optional[datetime]: from lib.ui.task_history import get_task_list_current_user, render_task_list from lib.ui.styles_download import inject_download_page_styles from lib.db import ( + count_recent_tasks, create_task, delete_task, get_task, @@ -76,6 +77,12 @@ def _to_jst(dt: Any) -> Optional[datetime]: # Task queue panel: time window + row cap (must match header + list_recent_tasks) _TASK_LIST_SINCE_DAYS = 7 _TASK_LIST_MAX_ROWS = 200 +_TASK_HISTORY_RANGE_OPTIONS = { + "7 days": 7, + "30 days": 30, + "90 days": 90, + "All": None, +} def _parse_rq_timeout_sec(raw: Optional[str], *, default: int, minimum: int) -> int: if raw is None or not str(raw).strip(): @@ -838,15 +845,58 @@ def download_scenarios( since_days=_TASK_LIST_SINCE_DAYS, max_rows=_TASK_LIST_MAX_ROWS, ) + if "download_task_history_range" not in st.session_state: + 
st.session_state["download_task_history_range"] = "7 days" + if "download_task_history_page_size" not in st.session_state: + st.session_state["download_task_history_page_size"] = 20 + if "download_task_history_page" not in st.session_state: + st.session_state["download_task_history_page"] = 1 + + _control_cols = st.columns([1.3, 1.0, 1.0, 2.7]) + with _control_cols[0]: + _selected_range = st.selectbox( + "Task history range", + options=list(_TASK_HISTORY_RANGE_OPTIONS.keys()), + key="download_task_history_range", + ) + with _control_cols[1]: + _page_size = int( + st.selectbox( + "Task rows", + options=[20, 50, 100], + key="download_task_history_page_size", + ) + ) + _since_days = _TASK_HISTORY_RANGE_OPTIONS.get(_selected_range, _TASK_LIST_SINCE_DAYS) + _total_tasks = count_recent_tasks(session_id=_current_user, since_days=_since_days) + _page_count = max(1, (_total_tasks + _page_size - 1) // _page_size) if _total_tasks else 1 + _current_page = min(max(1, int(st.session_state.get("download_task_history_page", 1))), _page_count) + st.session_state["download_task_history_page"] = _current_page + with _control_cols[2]: + _selected_page = st.selectbox( + "Task page", + options=list(range(1, _page_count + 1)), + index=_current_page - 1, + key="download_task_history_page_select", + ) + if int(_selected_page) != _current_page: + _current_page = int(_selected_page) + st.session_state["download_task_history_page"] = _current_page + with _control_cols[3]: + _range_label = _selected_range if _since_days is not None else "all time" + st.caption(f"Showing **{_total_tasks}** tasks across **{_page_count}** page(s) for **{_range_label}**.") + + _offset = (_current_page - 1) * _page_size _use_fragment = getattr(st, "fragment", None) is not None if _use_fragment: try: @st.fragment(run_every=timedelta(seconds=3)) def _task_list_poll(): _t = list_recent_tasks( - limit=_TASK_LIST_MAX_ROWS, + limit=_page_size, + offset=_offset, session_id=_current_user, - 
since_days=_TASK_LIST_SINCE_DAYS, + since_days=_since_days, ) render_task_list(_t, _current_user) _task_list_poll() @@ -854,9 +904,10 @@ def _task_list_poll(): _use_fragment = False if not _use_fragment: tasks = list_recent_tasks( - limit=_TASK_LIST_MAX_ROWS, + limit=_page_size, + offset=_offset, session_id=_current_user, - since_days=_TASK_LIST_SINCE_DAYS, + since_days=_since_days, ) has_active = render_task_list(tasks, _current_user) if st.button("Refresh task list", key="refresh_tasks"): diff --git a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py b/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py index 4c945ad..5d8cd98 100644 --- a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py +++ b/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py @@ -19,7 +19,7 @@ import streamlit as st import requests -from lib.db import create_task, is_task_queue_enabled, list_recent_tasks, update_task_rq_job_id +from lib.db import count_recent_tasks, create_task, is_task_queue_enabled, list_recent_tasks, update_task_rq_job_id from lib.page_chrome import ( inject_app_page_styles, render_page_hero, @@ -32,6 +32,10 @@ list_run_directories, resolve_under_data_root, ) +from lib.run_metadata import ( + build_run_search_blob, + read_run_metadata, +) from lib.ui.recent_evaluator_jobs import ( _render_recent_evaluator_jobs_section, configure_recent_evaluator_jobs_ui, @@ -50,6 +54,12 @@ _JST = timezone(timedelta(hours=9)) _TASK_LIST_MAX_ROWS = 200 _TASK_LIST_SINCE_DAYS = 7 +_TASK_HISTORY_RANGE_OPTIONS = { + "7 days": 7, + "30 days": 30, + "90 days": 90, + "All": None, +} st.set_page_config( @@ -284,21 +294,148 @@ def _build_overview_url(run_a: str, compare_runs: Optional[List[str]] = None) -> return f"/?{urllib.parse.urlencode(query)}" +def _format_metadata_time(value: object) -> str: + if not value: + return "—" + if isinstance(value, datetime): + dt = value + else: + try: + dt = datetime.fromisoformat(str(value).replace("Z", "+00:00")) + except Exception: + return 
str(value) + if getattr(dt, "tzinfo", None) is None: + dt = dt.replace(tzinfo=timezone.utc) + try: + return dt.astimezone(_JST).strftime("%Y-%m-%d %H:%M JST") + except Exception: + return str(value) + + +def _metadata_text(value: object) -> str: + text = str(value or "").strip() + return text or "—" + + +def _run_user_label(subject_id: str, environment: str) -> str: + subject = str(subject_id or "").strip() + if not subject: + return "—" + if not subject.startswith("t4:"): + return subject + try: + profile = _resolve_subject_name(subject, environment or "default") + name = str(profile.get("name") or subject).strip() + return name or subject + except Exception: + return subject + + +def _catalog_url(project_id: str, catalog_id: str, metadata_url: str = "") -> str: + direct_url = str(metadata_url or "").strip() + if direct_url: + return direct_url + project = str(project_id or "").strip() + catalog = str(catalog_id or "").strip() + if project and catalog: + return f"https://evaluation.tier4.jp/evaluation/vehicle_catalogs/{catalog}?project_id={project}" + return "" + + @st.cache_data(ttl=15, show_spinner=False) def _load_local_runs() -> List[Dict[str, object]]: runs: List[Dict[str, object]] = [] for run_path in list_run_directories(): info = get_run_info(run_path) + metadata = read_run_metadata(run_path) + task_meta = metadata.get("task") if isinstance(metadata.get("task"), dict) else {} + request_meta = metadata.get("request") if isinstance(metadata.get("request"), dict) else {} + evaluator_meta = metadata.get("evaluator") if isinstance(metadata.get("evaluator"), dict) else {} + description = str( + request_meta.get("description") + or evaluator_meta.get("description") + or "" + ).strip() + requested_by = str( + evaluator_meta.get("scheduled_by") + or task_meta.get("requested_by") + or "" + ).strip() + environment = str(request_meta.get("environment") or "default").strip() or "default" + requested_by_label = _run_user_label(requested_by, environment) + task_type = 
str(task_meta.get("type") or metadata.get("source_mode") or "").strip() + task_status = str(task_meta.get("status") or "").strip() + evaluator_job_id = str( + evaluator_meta.get("job_id") + or request_meta.get("job_id") + or "" + ).strip() + evaluator_report_url = str(evaluator_meta.get("report_url") or "").strip() + evaluator_target = str( + evaluator_meta.get("target") + or request_meta.get("target_name") + or "" + ).strip() + catalog_id = str( + evaluator_meta.get("catalog_id") + or request_meta.get("catalog_id") + or "" + ).strip() + catalog_name = str(evaluator_meta.get("catalog_name") or "").strip() + catalog_label = catalog_name or catalog_id + catalog_url = _catalog_url( + str(request_meta.get("project_id") or "").strip(), + catalog_id, + str(evaluator_meta.get("catalog_url") or "").strip(), + ) + case_totals = evaluator_meta.get("case_totals") if isinstance(evaluator_meta.get("case_totals"), dict) else {} + passed_count = int(case_totals.get("success", 0) or 0) + failed_count = int(case_totals.get("failed", 0) or 0) + canceled_count = int(case_totals.get("canceled", 0) or 0) + search_blob = build_run_search_blob( + run_path, + metadata, + extra_values=[ + description, + requested_by, + requested_by_label, + task_type, + task_status, + evaluator_job_id, + catalog_id, + catalog_name, + evaluator_target, + ], + ) runs.append( { "name": info["name"], "path_display": f"{get_data_root_display()}/{info['name']}", "size": format_size(info["size_bytes"]), "mtime": float(info["mtime"] or 0), + "mtime_date": _to_jst(datetime.fromtimestamp(float(info["mtime"] or 0), tz=timezone.utc)).date() if info["mtime"] else None, "modified": _format_run_mtime(info["mtime"]), "has_summary": bool(info["has_summary"]), "has_score": bool(info["has_score"]), "has_parquet": bool(info["has_parquet"]), + "metadata": metadata, + "description": description, + "requested_by": requested_by, + "requested_by_label": requested_by_label, + "environment": environment, + "task_type": task_type, + 
"task_status": task_status, + "evaluator_job_id": evaluator_job_id, + "evaluator_report_url": evaluator_report_url, + "evaluator_target": evaluator_target, + "catalog_id": catalog_id, + "catalog_name": catalog_name, + "catalog_label": catalog_label, + "catalog_url": catalog_url, + "passed_count": passed_count, + "failed_count": failed_count, + "canceled_count": canceled_count, + "search_blob": search_blob, } ) runs.sort(key=lambda row: (-float(row["mtime"]), str(row["name"]).lower())) @@ -404,15 +541,19 @@ def _inject_workflow_page_styles() -> None: color: #0f172a; font-size: 0.78rem; line-height: 1.15; + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; } .wf-run-text { padding-top: 0.26rem; } .wf-run-flags { display: flex; - flex-wrap: wrap; + flex-wrap: nowrap; gap: 0.24rem; padding-top: 0.18rem; + overflow: hidden; } .wf-flag { display: inline-flex; @@ -424,6 +565,7 @@ def _inject_workflow_page_styles() -> None: letter-spacing: 0.02em; background: #e2e8f0; color: #475569; + white-space: nowrap; } .wf-flag--ok { background: #dcfce7; @@ -495,19 +637,37 @@ def _inject_workflow_page_styles() -> None: def _render_local_runs_header() -> None: - header_cols = st.columns([0.7, 2.45, 1.35, 0.95, 1.55], gap="small") + header_cols = st.columns([0.45, 2.25, 1.05, 1.65, 1.25, 1.05, 1.18, 1.35, 0.75, 0.72], gap="small") header_cols[0].markdown('
Pick
', unsafe_allow_html=True) header_cols[1].markdown('
Name
', unsafe_allow_html=True) - header_cols[2].markdown('
Updated
', unsafe_allow_html=True) - header_cols[3].markdown('
Size
', unsafe_allow_html=True) - header_cols[4].markdown('
Files
', unsafe_allow_html=True) + header_cols[2].markdown('
User
', unsafe_allow_html=True) + header_cols[3].markdown('
Catalog
', unsafe_allow_html=True) + header_cols[4].markdown('
Evaluator
', unsafe_allow_html=True) + header_cols[5].markdown('
Result
', unsafe_allow_html=True) + header_cols[6].markdown('
Updated
', unsafe_allow_html=True) + header_cols[7].markdown('
Files
', unsafe_allow_html=True) + header_cols[8].markdown('
Size
', unsafe_allow_html=True) + header_cols[9].markdown('
Details
', unsafe_allow_html=True) def _render_local_run_row(run: Dict[str, object], *, selected: bool) -> bool: name_raw = str(run["name"]) name = html.escape(name_raw) modified = html.escape(str(run["modified"])) - size = html.escape(str(run["size"])) + user_label = html.escape(str(run.get("requested_by_label") or "—")) + catalog_label = html.escape(str(run.get("catalog_label") or run.get("catalog_id") or "—")) + catalog_url = html.escape(str(run.get("catalog_url") or "")) + evaluator_job_id = str(run.get("evaluator_job_id") or "").strip() + evaluator_report_url = str(run.get("evaluator_report_url") or "").strip() + evaluator_target = html.escape(str(run.get("evaluator_target") or "—")) + evaluator_label = html.escape(evaluator_job_id[:8] + "..." if len(evaluator_job_id) > 11 else (evaluator_job_id or "—")) + result_label = html.escape( + f"✅ {int(run.get('passed_count') or 0)} ❌ {int(run.get('failed_count') or 0)} ⏹ {int(run.get('canceled_count') or 0)}" + ) + description = str(run.get("description") or "").strip() + task_type = str(run.get("task_type") or "").strip() + task_status = str(run.get("task_status") or "").strip() + meta_bits = [bit for bit in [description, evaluator_target if evaluator_target != "—" else "", task_type, task_status] if bit] flags = [ ("Summary", bool(run["has_summary"])), ("Score", bool(run["has_score"])), @@ -517,26 +677,183 @@ def _render_local_run_row(run: Dict[str, object], *, selected: bool) -> bool: f'{label}' for label, enabled in flags ) + size_label = html.escape(str(run["size"])) checkbox_key = f"workflow_compare_pick::{name_raw}" if checkbox_key not in st.session_state: st.session_state[checkbox_key] = bool(selected) - row_cols = st.columns([0.7, 2.45, 1.35, 0.95, 1.55], gap="small") + row_cols = st.columns([0.45, 2.25, 1.05, 1.65, 1.25, 1.05, 1.18, 1.35, 0.75, 0.72], gap="small") with row_cols[0]: checked = st.checkbox("Select run", key=checkbox_key, label_visibility="collapsed") with row_cols[1]: - st.markdown( - f'', - 
unsafe_allow_html=True, - ) + title_html = f'' + if meta_bits: + meta_html = html.escape(" · ".join(meta_bits[:3])) + title_html += f'
{meta_html}
' + st.markdown(title_html, unsafe_allow_html=True) with row_cols[2]: - st.markdown(f'
{modified}
', unsafe_allow_html=True) + st.markdown(f'
{user_label}
', unsafe_allow_html=True) with row_cols[3]: - st.markdown(f'
{size}
', unsafe_allow_html=True) + if catalog_url and catalog_label != "—": + st.markdown( + f'', + unsafe_allow_html=True, + ) + else: + st.markdown(f'
{catalog_label}
', unsafe_allow_html=True) with row_cols[4]: + if evaluator_report_url and evaluator_job_id: + evaluator_html = f'' + else: + evaluator_html = f'
{evaluator_label}
' + if evaluator_target != "—": + evaluator_html += f'
{evaluator_target}
' + st.markdown(evaluator_html, unsafe_allow_html=True) + with row_cols[5]: + st.markdown(f'
{result_label}
', unsafe_allow_html=True) + with row_cols[6]: + st.markdown(f'
{modified}
', unsafe_allow_html=True) + with row_cols[7]: st.markdown(f'
{flag_html}
', unsafe_allow_html=True) + with row_cols[8]: + st.markdown(f'
{size_label}
', unsafe_allow_html=True) + with row_cols[9]: + if st.button("Details", key=f"workflow_run_details::{name_raw}", use_container_width=True): + st.session_state["workflow_local_run_detail"] = name_raw return bool(checked) +def _render_local_run_details(run: Dict[str, object]) -> None: + metadata = run.get("metadata") if isinstance(run.get("metadata"), dict) else {} + task_meta = metadata.get("task") if isinstance(metadata.get("task"), dict) else {} + request_meta = metadata.get("request") if isinstance(metadata.get("request"), dict) else {} + evaluator_meta = metadata.get("evaluator") if isinstance(metadata.get("evaluator"), dict) else {} + download_meta = metadata.get("download") if isinstance(metadata.get("download"), dict) else {} + scenario_download_meta = metadata.get("scenario_download") if isinstance(metadata.get("scenario_download"), dict) else {} + evaluation_meta = metadata.get("evaluation") if isinstance(metadata.get("evaluation"), dict) else {} + parquet_meta = metadata.get("parquet") if isinstance(metadata.get("parquet"), dict) else {} + + with st.container(border=True): + title_cols = st.columns([3.4, 1.0]) + with title_cols[0]: + st.markdown(f"### Local Run Details: `{run['name']}`") + with title_cols[1]: + if st.button("Clear", key=f"workflow_clear_run_details::{run['name']}", use_container_width=True): + st.session_state["workflow_local_run_detail"] = "" + st.rerun() + + if not metadata: + st.info("This run was created before metadata capture was added. 
Showing only filesystem information.") + + top_cols = st.columns(4) + top_cols[0].metric("Updated", _metadata_text(run.get("modified"))) + top_cols[1].metric("Size", _metadata_text(run.get("size"))) + top_cols[2].metric("Task type", _metadata_text(task_meta.get("type") or metadata.get("source_mode"))) + top_cols[3].metric("Task status", _metadata_text(task_meta.get("status"))) + + run_cols = st.columns(2) + with run_cols[0]: + st.caption("Run folder") + st.code(str(run.get("path_display") or run.get("name") or ""), language=None) + with run_cols[1]: + st.caption("Available outputs") + st.write( + " | ".join( + label + for label, enabled in [ + ("Summary.csv", bool(run.get("has_summary"))), + ("Score.csv", bool(run.get("has_score"))), + ("Parquet", bool(run.get("has_parquet"))), + ] + if enabled + ) + or "—" + ) + + requested_by = str(task_meta.get("requested_by") or "").strip() + requested_by = str( + evaluator_meta.get("scheduled_by") + or requested_by + or "" + ).strip() + requested_by_label = requested_by or "—" + request_environment = str(request_meta.get("environment") or "default").strip() or "default" + requested_by_label = _run_user_label(requested_by, request_environment) + + task_cols = st.columns(4) + task_cols[0].text_input("Requested by", value=requested_by_label, disabled=True, key=f"run_detail_user::{run['name']}") + task_cols[1].text_input("Task ID", value=_metadata_text(task_meta.get("id")), disabled=True, key=f"run_detail_tid::{run['name']}") + task_cols[2].text_input("Created", value=_format_metadata_time(task_meta.get("created_at") or metadata.get("created_at")), disabled=True, key=f"run_detail_created::{run['name']}") + task_cols[3].text_input("Updated", value=_format_metadata_time(task_meta.get("updated_at") or metadata.get("updated_at")), disabled=True, key=f"run_detail_updated::{run['name']}") + task_error = str(task_meta.get("error_message") or "").strip() + if task_error: + st.error(task_error) + + request_cols = st.columns(4) + 
request_cols[0].text_input("Project", value=_metadata_text(request_meta.get("project_id")), disabled=True, key=f"run_detail_project::{run['name']}") + request_cols[1].text_input("Environment", value=_metadata_text(request_environment), disabled=True, key=f"run_detail_env::{run['name']}") + request_cols[2].text_input("Catalog ID", value=_metadata_text(evaluator_meta.get("catalog_id") or request_meta.get("catalog_id")), disabled=True, key=f"run_detail_catalog::{run['name']}") + request_cols[3].text_input("Integration ID", value=_metadata_text(evaluator_meta.get("integration_id") or request_meta.get("integration_id")), disabled=True, key=f"run_detail_integration::{run['name']}") + + detail_cols = st.columns(3) + detail_cols[0].text_input("Evaluator job ID", value=_metadata_text(evaluator_meta.get("job_id") or request_meta.get("job_id")), disabled=True, key=f"run_detail_job::{run['name']}") + detail_cols[1].text_input("Source job ID", value=_metadata_text(evaluator_meta.get("source_job_id") or request_meta.get("source_job_id")), disabled=True, key=f"run_detail_source_job::{run['name']}") + detail_cols[2].text_input("Target", value=_metadata_text(evaluator_meta.get("target") or request_meta.get("target_name")), disabled=True, key=f"run_detail_target::{run['name']}") + + st.text_input("Description", value=_metadata_text(request_meta.get("description") or evaluator_meta.get("description")), disabled=True, key=f"run_detail_desc::{run['name']}") + + if evaluator_meta: + eval_cols = st.columns(4) + eval_cols[0].text_input("Evaluator status", value=_metadata_text(evaluator_meta.get("status")), disabled=True, key=f"run_detail_estatus::{run['name']}") + eval_cols[1].text_input("Build status", value=_metadata_text(evaluator_meta.get("build_status")), disabled=True, key=f"run_detail_build::{run['name']}") + eval_cols[2].text_input("Test status", value=_metadata_text(evaluator_meta.get("test_status")), disabled=True, key=f"run_detail_test::{run['name']}") + 
eval_cols[3].text_input("Report URL", value=_metadata_text(evaluator_meta.get("report_url")), disabled=True, key=f"run_detail_report::{run['name']}") + fail_message = str(evaluator_meta.get("fail_message") or "").strip() + if fail_message: + st.warning(fail_message) + case_totals = evaluator_meta.get("case_totals") if isinstance(evaluator_meta.get("case_totals"), dict) else {} + if case_totals: + case_cols = st.columns(4) + case_cols[0].metric("Cases total", str(case_totals.get("total", 0))) + case_cols[1].metric("Cases success", str(case_totals.get("success", 0))) + case_cols[2].metric("Cases failed", str(case_totals.get("failed", 0))) + case_cols[3].metric("Cases canceled", str(case_totals.get("canceled", 0))) + + if download_meta or scenario_download_meta: + active_download_meta = download_meta or scenario_download_meta + download_cols = st.columns(4) + download_cols[0].text_input("Download mode", value=_metadata_text(active_download_meta.get("mode") or metadata.get("source_mode")), disabled=True, key=f"run_detail_dl_mode::{run['name']}") + download_cols[1].text_input("Download type", value=_metadata_text(download_meta.get("download_type") or request_meta.get("download_type")), disabled=True, key=f"run_detail_dl_type::{run['name']}") + download_cols[2].text_input("Phase", value=_metadata_text(download_meta.get("phase") or request_meta.get("phase")), disabled=True, key=f"run_detail_phase::{run['name']}") + download_cols[3].text_input("Skip large files", value="Yes" if bool(download_meta.get("skip_large_file") or request_meta.get("skip_large_file")) else "No", disabled=True, key=f"run_detail_skip::{run['name']}") + + count_cols = st.columns(3) + count_cols[0].metric("Download total", str(active_download_meta.get("total", 0))) + count_cols[1].metric("Download success", str(active_download_meta.get("success", 0))) + count_cols[2].metric("Download failed", str(active_download_meta.get("failed", 0))) + + if evaluation_meta: + eval_run_cols = st.columns(4) + 
eval_run_cols[0].text_input("Eval enabled", value="Yes" if bool(evaluation_meta.get("enabled") or request_meta.get("run_eval")) else "No", disabled=True, key=f"run_detail_eval_enabled::{run['name']}") + eval_run_cols[1].text_input("Recursive", value="Yes" if bool(evaluation_meta.get("recursive") or request_meta.get("eval_recursive")) else "No", disabled=True, key=f"run_detail_eval_recursive::{run['name']}") + eval_run_cols[2].text_input("Summary rows", value=str(evaluation_meta.get("summary_rows", "—")), disabled=True, key=f"run_detail_summary_rows::{run['name']}") + eval_run_cols[3].text_input("Score rows", value=str(evaluation_meta.get("score_rows", "—")), disabled=True, key=f"run_detail_score_rows::{run['name']}") + + if parquet_meta: + st.text_input("Parquet path", value=_metadata_text(parquet_meta.get("path")), disabled=True, key=f"run_detail_parquet::{run['name']}") + + suites = evaluator_meta.get("suites") if isinstance(evaluator_meta.get("suites"), list) else [] + failed_cases = evaluator_meta.get("failed_cases") if isinstance(evaluator_meta.get("failed_cases"), list) else [] + if suites: + with st.expander("Evaluator suites", expanded=False): + st.dataframe(suites, width="stretch", hide_index=True) + if failed_cases: + with st.expander("Failed cases", expanded=False): + st.dataframe(failed_cases, width="stretch", hide_index=True) + + with st.expander("Raw run metadata", expanded=False): + st.json(metadata or {}) + + def _render_local_runs_section() -> None: section_header("Local Runs", "") runs = _load_local_runs() @@ -550,11 +867,45 @@ def _render_local_runs_section() -> None: st.session_state["workflow_runs_summary_filter_applied"] = bool(st.session_state.get("workflow_runs_summary_filter", False)) if "workflow_runs_parquet_filter_applied" not in st.session_state: st.session_state["workflow_runs_parquet_filter_applied"] = bool(st.session_state.get("workflow_runs_parquet_filter", False)) + if "workflow_runs_user_filter_applied" not in st.session_state: + 
st.session_state["workflow_runs_user_filter_applied"] = str(st.session_state.get("workflow_runs_user_filter", "All users")) + if "workflow_runs_date_from_applied" not in st.session_state: + st.session_state["workflow_runs_date_from_applied"] = st.session_state.get("workflow_runs_date_from", None) + if "workflow_runs_date_to_applied" not in st.session_state: + st.session_state["workflow_runs_date_to_applied"] = st.session_state.get("workflow_runs_date_to", None) if "workflow_runs_page_size_applied" not in st.session_state: st.session_state["workflow_runs_page_size_applied"] = int(st.session_state.get("workflow_runs_page_size", 10) or 10) + current_user_id = str(get_task_list_current_user() or "").strip() + user_options = ["All users"] + if current_user_id: + user_options.append("My runs") + unique_users = [] + seen_users = set() + user_option_subject_map = {"All users": "", "My runs": current_user_id} + for row in runs: + subject_id = str(row.get("requested_by") or "").strip() + label = str(row.get("requested_by_label") or "").strip() + if not subject_id: + continue + option = label or "Unknown" + deduped_option = option + suffix = 2 + while deduped_option in seen_users and user_option_subject_map.get(deduped_option) != subject_id: + deduped_option = f"{option} [{suffix}]" + suffix += 1 + if deduped_option not in seen_users: + unique_users.append(deduped_option) + seen_users.add(deduped_option) + user_option_subject_map[deduped_option] = subject_id + user_options.extend(unique_users) + applied_user_option = st.session_state.get("workflow_runs_user_filter_applied", "All users") + if applied_user_option not in user_options: + applied_user_option = "All users" + st.session_state["workflow_runs_user_filter_applied"] = applied_user_option + with st.form("workflow_local_runs_filters", border=False): - control_cols = st.columns([2.2, 0.8, 0.8, 0.7, 0.8]) + control_cols = st.columns([1.8, 1.25, 1.05, 1.05, 0.72, 0.72, 0.65, 0.76]) with control_cols[0]: st.markdown('
Search
', unsafe_allow_html=True) run_search_input = st.text_input( @@ -562,9 +913,36 @@ def _render_local_runs_section() -> None: value=st.session_state.get("workflow_runs_search_applied", ""), key="workflow_runs_search", label_visibility="collapsed", - placeholder="Filter by run name", + placeholder="Filter by name, description, job id, catalog, user", ) with control_cols[1]: + st.markdown('
User
', unsafe_allow_html=True) + user_filter_input = st.selectbox( + "User", + options=user_options, + index=user_options.index(applied_user_option), + key="workflow_runs_user_filter", + label_visibility="collapsed", + ) + with control_cols[2]: + st.markdown('
From
', unsafe_allow_html=True) + date_from_input = st.date_input( + "From", + value=st.session_state.get("workflow_runs_date_from_applied", None), + key="workflow_runs_date_from", + label_visibility="collapsed", + help="Run modified-date lower bound in JST.", + ) + with control_cols[3]: + st.markdown('
To
', unsafe_allow_html=True) + date_to_input = st.date_input( + "To", + value=st.session_state.get("workflow_runs_date_to_applied", None), + key="workflow_runs_date_to", + label_visibility="collapsed", + help="Run modified-date upper bound in JST.", + ) + with control_cols[4]: st.markdown('
Summary
', unsafe_allow_html=True) require_summary_input = st.toggle( "Summary only", @@ -572,7 +950,7 @@ def _render_local_runs_section() -> None: key="workflow_runs_summary_filter", label_visibility="collapsed", ) - with control_cols[2]: + with control_cols[5]: st.markdown('
Parquet
', unsafe_allow_html=True) require_parquet_input = st.toggle( "Parquet only", @@ -580,7 +958,7 @@ def _render_local_runs_section() -> None: key="workflow_runs_parquet_filter", label_visibility="collapsed", ) - with control_cols[3]: + with control_cols[6]: st.markdown('
Rows
', unsafe_allow_html=True) page_size_input = int( st.selectbox( @@ -591,25 +969,44 @@ def _render_local_runs_section() -> None: label_visibility="collapsed", ) ) - with control_cols[4]: + with control_cols[7]: st.markdown('
Apply
', unsafe_allow_html=True) apply_filters = st.form_submit_button("Apply", use_container_width=True) if apply_filters: st.session_state["workflow_runs_search_applied"] = run_search_input + st.session_state["workflow_runs_user_filter_applied"] = user_filter_input + st.session_state["workflow_runs_date_from_applied"] = date_from_input + st.session_state["workflow_runs_date_to_applied"] = date_to_input st.session_state["workflow_runs_summary_filter_applied"] = bool(require_summary_input) st.session_state["workflow_runs_parquet_filter_applied"] = bool(require_parquet_input) st.session_state["workflow_runs_page_size_applied"] = int(page_size_input) st.session_state["workflow_runs_page"] = 1 run_search = str(st.session_state.get("workflow_runs_search_applied", "")).strip().lower() + selected_user_filter = str(st.session_state.get("workflow_runs_user_filter_applied", "All users")).strip() + selected_date_from = st.session_state.get("workflow_runs_date_from_applied", None) + selected_date_to = st.session_state.get("workflow_runs_date_to_applied", None) require_summary = bool(st.session_state.get("workflow_runs_summary_filter_applied", False)) require_parquet = bool(st.session_state.get("workflow_runs_parquet_filter_applied", False)) page_size = int(st.session_state.get("workflow_runs_page_size_applied", 10) or 10) + if selected_date_from and selected_date_to and selected_date_from > selected_date_to: + st.warning("`From` date must be earlier than or equal to `To` date.") + return + filtered = runs if run_search: - filtered = [row for row in filtered if run_search in str(row["name"]).lower()] + filtered = [row for row in filtered if run_search in str(row.get("search_blob") or row["name"]).lower()] + if selected_user_filter == "My runs" and current_user_id: + filtered = [row for row in filtered if str(row.get("requested_by") or "").strip() == current_user_id] + elif selected_user_filter not in ("", "All users", "My runs"): + selected_subject_id = 
str(user_option_subject_map.get(selected_user_filter) or "").strip() + filtered = [row for row in filtered if str(row.get("requested_by") or "").strip() == selected_subject_id] + if selected_date_from: + filtered = [row for row in filtered if row.get("mtime_date") and row["mtime_date"] >= selected_date_from] + if selected_date_to: + filtered = [row for row in filtered if row.get("mtime_date") and row["mtime_date"] <= selected_date_to] if require_summary: filtered = [row for row in filtered if bool(row["has_summary"])] if require_parquet: @@ -692,6 +1089,12 @@ def _render_local_runs_section() -> None: st.button("Open", disabled=True, use_container_width=True, key="workflow_compare_run_disabled") st.markdown("
", unsafe_allow_html=True) + detail_run_name = str(st.session_state.get("workflow_local_run_detail") or "").strip() + if detail_run_name: + detail_run = next((row for row in runs if str(row["name"]) == detail_run_name), None) + if detail_run is not None: + _render_local_run_details(detail_run) + def _render_current_tasks_section() -> None: section_header("Current Tasks", "") @@ -700,6 +1103,48 @@ def _render_current_tasks_section() -> None: return current_user = get_task_list_current_user() + if "workflow_task_history_range" not in st.session_state: + st.session_state["workflow_task_history_range"] = "7 days" + if "workflow_task_history_page_size" not in st.session_state: + st.session_state["workflow_task_history_page_size"] = 20 + if "workflow_task_history_page" not in st.session_state: + st.session_state["workflow_task_history_page"] = 1 + + control_cols = st.columns([1.3, 1.0, 1.0, 2.7]) + with control_cols[0]: + selected_range = st.selectbox( + "History range", + options=list(_TASK_HISTORY_RANGE_OPTIONS.keys()), + key="workflow_task_history_range", + ) + with control_cols[1]: + page_size = int( + st.selectbox( + "Rows", + options=[20, 50, 100], + key="workflow_task_history_page_size", + ) + ) + since_days = _TASK_HISTORY_RANGE_OPTIONS.get(selected_range, _TASK_LIST_SINCE_DAYS) + total_tasks = count_recent_tasks(session_id=current_user, since_days=since_days) + page_count = max(1, (total_tasks + page_size - 1) // page_size) if total_tasks else 1 + current_page = min(max(1, int(st.session_state.get("workflow_task_history_page", 1))), page_count) + st.session_state["workflow_task_history_page"] = current_page + with control_cols[2]: + selected_page = st.selectbox( + "Page", + options=list(range(1, page_count + 1)), + index=current_page - 1, + key="workflow_task_history_page_select", + ) + if int(selected_page) != current_page: + current_page = int(selected_page) + st.session_state["workflow_task_history_page"] = current_page + with control_cols[3]: + label = 
selected_range if since_days is not None else "all time" + st.caption(f"Showing **{total_tasks}** tasks across **{page_count}** page(s) for **{label}**.") + + offset = (current_page - 1) * page_size use_fragment = getattr(st, "fragment", None) is not None if use_fragment: try: @@ -707,9 +1152,10 @@ def _render_current_tasks_section() -> None: @st.fragment(run_every=timedelta(seconds=3)) def _task_list_poll(): current_tasks = list_recent_tasks( - limit=_TASK_LIST_MAX_ROWS, + limit=page_size, + offset=offset, session_id=current_user, - since_days=_TASK_LIST_SINCE_DAYS, + since_days=since_days, ) render_task_list(current_tasks, current_user) @@ -719,9 +1165,10 @@ def _task_list_poll(): use_fragment = False tasks = list_recent_tasks( - limit=_TASK_LIST_MAX_ROWS, + limit=page_size, + offset=offset, session_id=current_user, - since_days=_TASK_LIST_SINCE_DAYS, + since_days=since_days, ) has_active = render_task_list(tasks, current_user) if st.button("Refresh tasks", key="workflow_refresh_tasks"): diff --git a/evaluation_dashboard_app/worker/tasks.py b/evaluation_dashboard_app/worker/tasks.py index 794dafa..17591ce 100644 --- a/evaluation_dashboard_app/worker/tasks.py +++ b/evaluation_dashboard_app/worker/tasks.py @@ -7,14 +7,25 @@ import re import sys import time -from typing import Any, Dict +from typing import Any, Dict, Optional # App root on path for lib imports _APP_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) if _APP_ROOT not in sys.path: sys.path.insert(0, _APP_ROOT) -from lib.db import update_task_status, update_task_progress, append_task_log, update_task_result_summary +from lib.db import ( + append_task_log, + get_task, + update_task_progress, + update_task_result_summary, + update_task_status, +) +from lib.run_metadata import ( + read_run_metadata, + resolve_run_directory_from_task_parameters, + upsert_run_metadata, +) # Optional imports for tasks that need them def _import_eval_summary(): @@ -29,14 +40,170 @@ def _import_catalog_io(): 
return None +def _copy_task_parameters(parameters: Dict[str, Any]) -> Dict[str, Any]: + copied: Dict[str, Any] = {} + for key, value in (parameters or {}).items(): + if isinstance(value, (dict, list, tuple, str, int, float, bool)) or value is None: + copied[key] = value + else: + copied[key] = str(value) + return copied + + +def _task_row_payload(task_id: str) -> Dict[str, Any]: + row = get_task(task_id) or {} + return { + "id": str(row.get("id") or task_id), + "type": str(row.get("type") or "").strip(), + "status": str(row.get("status") or "").strip(), + "requested_by": str(row.get("session_id") or "").strip(), + "created_at": row.get("created_at"), + "updated_at": row.get("updated_at"), + "result_path": str(row.get("result_path") or "").strip(), + "error_message": str(row.get("error_message") or "").strip(), + "progress_message": str(row.get("progress_message") or "").strip(), + "progress_pct": row.get("progress_pct"), + } + + +def _task_request_payload(parameters: Dict[str, Any]) -> Dict[str, Any]: + params = _copy_task_parameters(parameters) + return { + "environment": str(params.get("environment") or "default").strip() or "default", + "project_id": str(params.get("project_id") or "").strip(), + "job_id": str(params.get("job_id") or "").strip(), + "catalog_id": str(params.get("catalog_id") or "").strip(), + "integration_id": str(params.get("integration_id") or "").strip(), + "source_job_id": str(params.get("source_job_id") or "").strip(), + "target_name": str(params.get("target_name") or "").strip(), + "description": str(params.get("description") or "").strip(), + "suite_id": str(params.get("suite_id") or "").strip(), + "suite_ids": list(params.get("suite_ids") or []), + "download_type": str(params.get("download_type") or "").strip(), + "phase": str(params.get("phase") or "").strip(), + "skip_large_file": bool(params.get("skip_large_file", False)), + "large_file_mb": params.get("large_file_mb"), + "keep_zip_files": bool(params.get("keep_zip_files", False)), + 
"run_eval": bool(params.get("run_eval", False)), + "generate_parquet": bool(params.get("generate_parquet", False)), + "eval_recursive": bool(params.get("eval_recursive", False)), + "eval_overwrite": bool(params.get("eval_overwrite", False)), + "max_retries": params.get("max_retries"), + "clean_build": bool(params.get("clean_build", False)), + "debug": bool(params.get("debug", False)), + "is_tag": bool(params.get("is_tag", False)), + "scenario_name_filter": str(params.get("scenario_name_filter") or "").strip(), + "selected_ids": list(params.get("selected_ids") or []), + "output_path": str( + params.get("output_path") + or params.get("output_dir") + or params.get("eval_root") + or params.get("pkl_dir") + or "" + ).strip(), + "parameters": params, + } + + +def _build_run_metadata_patch(task_id: str, parameters: Dict[str, Any], *, task_type: str) -> Dict[str, Any]: + return { + "source_mode": task_type, + "task": _task_row_payload(task_id), + "request": _task_request_payload(parameters), + } + + +def _update_run_metadata( + task_id: str, + parameters: Dict[str, Any], + *, + task_type: str, + create_missing: bool = False, + extra: Optional[Dict[str, Any]] = None, +) -> None: + run_dir = resolve_run_directory_from_task_parameters(parameters, create_missing=create_missing) + if run_dir is None: + return + patch = _build_run_metadata_patch(task_id, parameters, task_type=task_type) + if extra: + patch.update(extra) + try: + upsert_run_metadata(run_dir, patch, create_missing=create_missing) + except Exception: + pass + + +def _append_run_event( + task_id: str, + parameters: Dict[str, Any], + *, + task_type: str, + message: str, +) -> None: + run_dir = resolve_run_directory_from_task_parameters(parameters, create_missing=False) + if run_dir is None: + return + try: + metadata = read_run_metadata(run_dir) + events = list(metadata.get("events") or []) + events.append({"at": _task_row_payload(task_id).get("updated_at"), "message": message}) + if len(events) > 50: + events = 
events[-50:] + upsert_run_metadata( + run_dir, + { + "events": events, + "task": _task_row_payload(task_id), + }, + create_missing=False, + ) + except Exception: + pass + + +def _mark_run_status( + task_id: str, + parameters: Dict[str, Any], + *, + task_type: str, + status: str, + error_message: str = "", + result_path: str = "", + extra: Optional[Dict[str, Any]] = None, + create_missing: bool = False, +) -> None: + patch: Dict[str, Any] = { + "task": { + "status": status, + } + } + if error_message: + patch["task"]["error_message"] = error_message + if result_path: + patch["task"]["result_path"] = result_path + if extra: + patch.update(extra) + _update_run_metadata( + task_id, + parameters, + task_type=task_type, + create_missing=create_missing, + extra=patch, + ) + + def job_generate_summary_csv(task_id: str, parameters: Dict[str, Any]) -> None: """Generate Summary.csv and Score.csv under eval_root.""" update_task_status(task_id, "running") append_task_log(task_id, "Starting generate_summary_csv") + _mark_run_status(task_id, parameters, task_type="generate_summary_csv", status="running") try: eval_summary = _import_eval_summary() eval_root = parameters.get("eval_root") if not eval_root: + _mark_run_status( + task_id, parameters, task_type="generate_summary_csv", status="failed", error_message="Missing eval_root" + ) update_task_status(task_id, "failed", error_message="Missing eval_root") return append_task_log(task_id, f"Generating summary under {eval_root}") @@ -51,10 +218,28 @@ def job_generate_summary_csv(task_id: str, parameters: Dict[str, Any]) -> None: "score_rows": info.get("score_rows", 0), }, ) + _update_run_metadata( + task_id, + parameters, + task_type="generate_summary_csv", + extra={ + "evaluation": { + "summary_path": result_path, + "summary_rows": info.get("summary_rows", 0), + "score_rows": info.get("score_rows", 0), + } + }, + ) append_task_log(task_id, f"Done. 
Output: {result_path}") + _mark_run_status( + task_id, parameters, task_type="generate_summary_csv", status="completed", result_path=str(result_path or "") + ) update_task_status(task_id, "completed", result_path=result_path) except Exception as e: append_task_log(task_id, f"Failed: {e}") + _mark_run_status( + task_id, parameters, task_type="generate_summary_csv", status="failed", error_message=str(e) + ) update_task_status(task_id, "failed", error_message=str(e)) raise @@ -63,16 +248,21 @@ def job_run_eval_dirs(task_id: str, parameters: Dict[str, Any]) -> None: """Run eval_result for each dir under eval_root, then generate Summary/Score CSV.""" update_task_status(task_id, "running") append_task_log(task_id, "Starting run_eval_dirs") + _mark_run_status(task_id, parameters, task_type="run_eval_dirs", status="running") try: eval_summary = _import_eval_summary() eval_root = parameters.get("eval_root") recursive = parameters.get("recursive", True) overwrite = parameters.get("overwrite", False) if not eval_root: + _mark_run_status(task_id, parameters, task_type="run_eval_dirs", status="failed", error_message="Missing eval_root") update_task_status(task_id, "failed", error_message="Missing eval_root") return target_dirs = eval_summary.find_eval_result_dirs(eval_root, recursive=recursive) if not target_dirs: + _mark_run_status( + task_id, parameters, task_type="run_eval_dirs", status="failed", error_message="No result directories found" + ) update_task_status(task_id, "failed", error_message="No result directories found") return total = len(target_dirs) @@ -103,10 +293,28 @@ def job_run_eval_dirs(task_id: str, parameters: Dict[str, Any]) -> None: "score_rows": info.get("score_rows", 0), } update_task_result_summary(task_id, summary) + _update_run_metadata( + task_id, + parameters, + task_type="run_eval_dirs", + extra={ + "evaluation": { + "directories_processed": total, + "success": len(succeeded), + "failed": len(failed), + "skipped": len(skipped), + "summary_path": 
result_path, + "summary_rows": info.get("summary_rows", 0), + "score_rows": info.get("score_rows", 0), + } + }, + ) append_task_log(task_id, f"Done. Output: {result_path}") + _mark_run_status(task_id, parameters, task_type="run_eval_dirs", status="completed", result_path=result_path) update_task_status(task_id, "completed", result_path=result_path) except Exception as e: append_task_log(task_id, f"Failed: {e}") + _mark_run_status(task_id, parameters, task_type="run_eval_dirs", status="failed", error_message=str(e)) update_task_status(task_id, "failed", error_message=str(e)) raise @@ -115,13 +323,18 @@ def job_build_parquet(task_id: str, parameters: Dict[str, Any]) -> None: """Build scene_result parquet from pkl directory.""" update_task_status(task_id, "running") append_task_log(task_id, "Starting build_parquet") + _mark_run_status(task_id, parameters, task_type="build_parquet", status="running") try: pkl_archive_to_parquet = _import_catalog_io() if pkl_archive_to_parquet is None: + _mark_run_status( + task_id, parameters, task_type="build_parquet", status="failed", error_message="perception_catalog_io not available" + ) update_task_status(task_id, "failed", error_message="perception_catalog_io not available") return pkl_dir = parameters.get("pkl_dir") if not pkl_dir: + _mark_run_status(task_id, parameters, task_type="build_parquet", status="failed", error_message="Missing pkl_dir") update_task_status(task_id, "failed", error_message="Missing pkl_dir") return append_task_log(task_id, f"Building parquet from {pkl_dir}") @@ -135,10 +348,23 @@ def job_build_parquet(task_id: str, parameters: Dict[str, Any]) -> None: job_id=job_id, ) update_task_result_summary(task_id, {"job": "build_parquet", "output_path": parquet_path}) + _update_run_metadata( + task_id, + parameters, + task_type="build_parquet", + extra={ + "parquet": { + "enabled": True, + "path": parquet_path, + } + }, + ) append_task_log(task_id, f"Done. 
Output: {parquet_path}") + _mark_run_status(task_id, parameters, task_type="build_parquet", status="completed", result_path=parquet_path) update_task_status(task_id, "completed", result_path=parquet_path) except Exception as e: append_task_log(task_id, f"Failed: {e}") + _mark_run_status(task_id, parameters, task_type="build_parquet", status="failed", error_message=str(e)) update_task_status(task_id, "failed", error_message=str(e)) raise @@ -213,6 +439,35 @@ def _extract_failed_case_details(case_reports: Any, *, limit: int = 12) -> list[ return failed[:limit] +def _extract_git_target_from_report(report: Dict[str, Any]) -> str: + """Compact branch/tag label from evaluator report metadata.""" + source = ((report.get("event") or {}).get("source") or {}) + git_ref = str(source.get("git_ref") or "").strip() + if git_ref.startswith("refs/heads/"): + return git_ref[len("refs/heads/"):] + if git_ref.startswith("refs/tags/"): + return git_ref[len("refs/tags/"):] + return git_ref or str(source.get("git_sha") or "").strip()[:12] or "" + + +def _extract_catalog_url_from_report(report: Dict[str, Any]) -> str: + """Best-effort catalog URL matching the recent evaluator jobs list.""" + catalog = report.get("catalog") or {} + direct_url = str( + catalog.get("web_url") + or catalog.get("url") + or catalog.get("catalog_url") + or "" + ).strip() + if direct_url: + return direct_url + project_id = str(report.get("project_id") or "").strip() + catalog_id = str(catalog.get("catalog_id") or catalog.get("id") or "").strip() + if project_id and catalog_id: + return f"https://evaluation.tier4.jp/evaluation/vehicle_catalogs/{catalog_id}?project_id={project_id}" + return "" + + def _build_evaluator_result_summary( *, job_id: str, @@ -238,6 +493,12 @@ def _build_evaluator_result_summary( "evaluator_job_id": job_id, "evaluator_report_url": report_url, "evaluator_status": evaluator_status, + "evaluator_scheduled_by": final_report.get("scheduled_by", ""), + "evaluator_catalog_id": 
((final_report.get("catalog") or {}).get("id") or ""), + "evaluator_catalog_name": ((final_report.get("catalog") or {}).get("display_name") or ""), + "evaluator_catalog_version_id": ((final_report.get("catalog") or {}).get("version_id") or ""), + "evaluator_catalog_url": _extract_catalog_url_from_report(final_report), + "evaluator_target": _extract_git_target_from_report(final_report), "evaluator_build_status": build.get("status", ""), "evaluator_test_status": test.get("status", ""), "evaluator_fail_message": final_report.get("fail_message", ""), @@ -247,15 +508,66 @@ def _build_evaluator_result_summary( } +def _fetch_evaluator_context( + *, + project_id: str, + job_id: str, + environment: str, +) -> Dict[str, Any]: + """Best-effort evaluator metadata for tasks that start from an existing evaluator job.""" + if not project_id or not job_id: + return {} + try: + from lib import evaluator_api + + os.environ["AUTH_PROFILE"] = environment or "default" + api = evaluator_api.EvaluationRunAPI() + report = api.get_job_status(project_id, job_id) + status = evaluator_api.extract_job_status(report) + build = report.get("build") or {} + test = report.get("test") or {} + available = test.get("available_case_results") or test.get("case_results") or {} + return { + "job_id": job_id, + "report_url": evaluator_api.get_job_report_url(project_id, job_id), + "status": status, + "scheduled_by": str(report.get("scheduled_by") or "").strip(), + "catalog_id": str(((report.get("catalog") or {}).get("id") or "")).strip(), + "catalog_name": str(((report.get("catalog") or {}).get("display_name") or "")).strip(), + "catalog_version_id": (report.get("catalog") or {}).get("version_id"), + "catalog_url": _extract_catalog_url_from_report(report), + "target": _extract_git_target_from_report(report), + "build_status": str(build.get("status") or "").strip(), + "test_status": str(test.get("status") or "").strip(), + "fail_message": str(report.get("fail_message") or "").strip(), + "case_totals": { + 
"total": int(available.get("total_count", 0) or 0), + "success": int(available.get("success_count", 0) or 0), + "failed": int(available.get("failure_count", 0) or 0), + "canceled": int(available.get("cancellation_count", 0) or 0), + }, + } + except Exception: + return {} + + def job_download_results(task_id: str, parameters: Dict[str, Any]) -> None: """Download job results (archives or result JSON) and extract/organize. Requires auth.""" update_task_status(task_id, "running") append_task_log(task_id, "Starting download_results") + _mark_run_status( + task_id, + parameters, + task_type="download_results", + status="running", + create_missing=True, + ) try: from lib import download_core # noqa: F401 output_path = parameters.get("output_path") project_id = parameters.get("project_id") job_id = parameters.get("job_id") + environment = str(parameters.get("environment") or "default").strip() or "default" suite_id = parameters.get("suite_id") suite_ids = parameters.get("suite_ids") # optional list download_type = parameters.get("download_type", "archives") # archives | result_json @@ -264,8 +576,25 @@ def job_download_results(task_id: str, parameters: Dict[str, Any]) -> None: large_file_mb = float(parameters.get("large_file_mb", 50.0)) keep_zip_files = parameters.get("keep_zip_files", False) if not all([output_path, project_id, job_id]): + _mark_run_status( + task_id, + parameters, + task_type="download_results", + status="failed", + error_message="Missing output_path, project_id, or job_id", + create_missing=True, + ) update_task_status(task_id, "failed", error_message="Missing output_path, project_id, or job_id") return + evaluator_context = _fetch_evaluator_context(project_id=project_id, job_id=job_id, environment=environment) + if evaluator_context: + _update_run_metadata( + task_id, + parameters, + task_type="download_results", + create_missing=True, + extra={"evaluator": evaluator_context}, + ) on_progress = lambda msg: _progress_callback(task_id, msg) on_warning = 
lambda msg: append_task_log(task_id, msg) failure_count, total_attempted, rows = download_core.run_download_results( @@ -292,22 +621,67 @@ def job_download_results(task_id: str, parameters: Dict[str, Any]) -> None: "rows": rows[:500], } update_task_result_summary(task_id, summary) + _update_run_metadata( + task_id, + parameters, + task_type="download_results", + create_missing=True, + extra={ + "download": { + "mode": "download_results", + "total": total_attempted, + "success": success_count, + "failed": failure_count, + "rows": rows[:100], + "download_type": download_type, + "phase": phase, + "skip_large_file": bool(skip_large_file), + "large_file_mb": large_file_mb, + "keep_zip_files": bool(keep_zip_files), + } + }, + ) append_task_log(task_id, "Download and extract completed") if success_count == 0 and failure_count > 0: err_msg = f"Download completed with {failure_count} failures. See task log for details." + _mark_run_status( + task_id, + parameters, + task_type="download_results", + status="failed", + result_path=output_path, + error_message=err_msg, + ) update_task_status(task_id, "failed", result_path=output_path, error_message=err_msg) else: + _mark_run_status( + task_id, + parameters, + task_type="download_results", + status="completed", + result_path=output_path, + ) update_task_status(task_id, "completed", result_path=output_path) except ImportError: + _mark_run_status( + task_id, + parameters, + task_type="download_results", + status="failed", + error_message="Download worker not available: lib.download_core not implemented", + create_missing=True, + ) update_task_status( task_id, "failed", error_message="Download worker not available: lib.download_core not implemented", ) except NotImplementedError as e: + _mark_run_status(task_id, parameters, task_type="download_results", status="failed", error_message=str(e)) update_task_status(task_id, "failed", error_message=str(e)) except Exception as e: append_task_log(task_id, f"Failed: {e}") + 
_mark_run_status(task_id, parameters, task_type="download_results", status="failed", error_message=str(e)) update_task_status(task_id, "failed", error_message=str(e)) raise @@ -316,19 +690,44 @@ def job_download_scenarios(task_id: str, parameters: Dict[str, Any]) -> None: """Download scenarios from job to output_dir. Requires auth.""" update_task_status(task_id, "running") append_task_log(task_id, "Starting download_scenarios") + _mark_run_status( + task_id, + parameters, + task_type="download_scenarios", + status="running", + create_missing=True, + ) try: from lib import download_core # noqa: F401 output_dir = parameters.get("output_dir") or parameters.get("output_path") project_id = parameters.get("project_id") job_id = parameters.get("job_id") + environment = str(parameters.get("environment") or "default").strip() or "default" suite_id = parameters.get("suite_id") suite_ids = parameters.get("suite_ids") overwrite = parameters.get("overwrite", False) scenario_name_filter = parameters.get("scenario_name_filter") selected_ids = parameters.get("selected_ids") if not all([output_dir, project_id, job_id]): + _mark_run_status( + task_id, + parameters, + task_type="download_scenarios", + status="failed", + error_message="Missing output_dir, project_id, or job_id", + create_missing=True, + ) update_task_status(task_id, "failed", error_message="Missing output_dir, project_id, or job_id") return + evaluator_context = _fetch_evaluator_context(project_id=project_id, job_id=job_id, environment=environment) + if evaluator_context: + _update_run_metadata( + task_id, + parameters, + task_type="download_scenarios", + create_missing=True, + extra={"evaluator": evaluator_context}, + ) on_progress = lambda msg: _progress_callback(task_id, msg) on_warning = lambda msg: append_task_log(task_id, msg) failure_count, total_attempted, rows = download_core.run_download_scenarios( @@ -353,22 +752,64 @@ def job_download_scenarios(task_id: str, parameters: Dict[str, Any]) -> None: "rows": 
rows[:500], } update_task_result_summary(task_id, summary) + _update_run_metadata( + task_id, + parameters, + task_type="download_scenarios", + create_missing=True, + extra={ + "scenario_download": { + "total": total_attempted, + "success": success_count, + "failed": failure_count, + "overwrite": bool(overwrite), + "scenario_name_filter": str(scenario_name_filter or "").strip(), + "selected_ids": list(selected_ids or []), + "rows": rows[:100], + } + }, + ) append_task_log(task_id, "Download scenarios completed") if failure_count > 0: err_msg = f"Download completed with {failure_count} failures. See task log for details." + _mark_run_status( + task_id, + parameters, + task_type="download_scenarios", + status="failed", + result_path=output_dir, + error_message=err_msg, + ) update_task_status(task_id, "failed", result_path=output_dir, error_message=err_msg) else: + _mark_run_status( + task_id, + parameters, + task_type="download_scenarios", + status="completed", + result_path=output_dir, + ) update_task_status(task_id, "completed", result_path=output_dir) except ImportError: + _mark_run_status( + task_id, + parameters, + task_type="download_scenarios", + status="failed", + error_message="Download worker not available: lib.download_core not implemented", + create_missing=True, + ) update_task_status( task_id, "failed", error_message="Download worker not available: lib.download_core not implemented", ) except NotImplementedError as e: + _mark_run_status(task_id, parameters, task_type="download_scenarios", status="failed", error_message=str(e)) update_task_status(task_id, "failed", error_message=str(e)) except Exception as e: append_task_log(task_id, f"Failed: {e}") + _mark_run_status(task_id, parameters, task_type="download_scenarios", status="failed", error_message=str(e)) update_task_status(task_id, "failed", error_message=str(e)) raise @@ -377,11 +818,19 @@ def job_download_and_eval(task_id: str, parameters: Dict[str, Any]) -> None: """Download results, then run eval 
and parquet generation. Stops on download failure.""" update_task_status(task_id, "running") append_task_log(task_id, "Starting download_and_eval combined workflow") + _mark_run_status( + task_id, + parameters, + task_type="download_and_eval", + status="running", + create_missing=True, + ) try: from lib import download_core output_path = parameters.get("output_path") project_id = parameters.get("project_id") job_id = parameters.get("job_id") + environment = str(parameters.get("environment") or "default").strip() or "default" suite_id = parameters.get("suite_id") suite_ids = parameters.get("suite_ids") download_type = parameters.get("download_type", "archives") @@ -395,8 +844,25 @@ def job_download_and_eval(task_id: str, parameters: Dict[str, Any]) -> None: eval_overwrite = parameters.get("eval_overwrite", False) if not all([output_path, project_id, job_id]): + _mark_run_status( + task_id, + parameters, + task_type="download_and_eval", + status="failed", + error_message="Missing output_path, project_id, or job_id", + create_missing=True, + ) update_task_status(task_id, "failed", error_message="Missing output_path, project_id, or job_id") return + evaluator_context = _fetch_evaluator_context(project_id=project_id, job_id=job_id, environment=environment) + if evaluator_context: + _update_run_metadata( + task_id, + parameters, + task_type="download_and_eval", + create_missing=True, + extra={"evaluator": evaluator_context}, + ) on_progress = lambda msg: _progress_callback(task_id, msg) on_warning = lambda msg: append_task_log(task_id, msg) @@ -430,22 +896,74 @@ def job_download_and_eval(task_id: str, parameters: Dict[str, Any]) -> None: "errors": result.get("errors", []), } update_task_result_summary(task_id, summary) + _update_run_metadata( + task_id, + parameters, + task_type="download_and_eval", + create_missing=True, + extra={ + "download": { + "mode": "download_and_eval", + **(result.get("download_summary", {}) or {}), + "download_type": download_type, + "phase": 
phase, + "skip_large_file": bool(skip_large_file), + "large_file_mb": large_file_mb, + "keep_zip_files": bool(keep_zip_files), + }, + "evaluation": { + **(result.get("eval_summary", {}) or {}), + "enabled": bool(run_eval), + "recursive": bool(eval_recursive), + "overwrite": bool(eval_overwrite), + }, + "parquet": { + "enabled": bool(generate_parquet), + "path": result.get("parquet_path", ""), + }, + "errors": list(result.get("errors", []) or []), + }, + ) if not result.get("download_success"): err_msg = result.get("errors", ["Download failed"])[0] append_task_log(task_id, f"Stopped: {err_msg}") + _mark_run_status( + task_id, + parameters, + task_type="download_and_eval", + status="failed", + result_path=output_path, + error_message=err_msg, + ) update_task_status(task_id, "failed", result_path=output_path, error_message=err_msg) elif result.get("errors"): # Partial success with some errors errs = "; ".join(result["errors"][:5]) append_task_log(task_id, f"Completed with errors: {errs}") + _mark_run_status( + task_id, + parameters, + task_type="download_and_eval", + status="completed", + result_path=output_path, + error_message=errs, + ) update_task_status(task_id, "completed", result_path=output_path) else: append_task_log(task_id, "Download and eval completed successfully") + _mark_run_status( + task_id, + parameters, + task_type="download_and_eval", + status="completed", + result_path=output_path, + ) update_task_status(task_id, "completed", result_path=output_path) except Exception as e: append_task_log(task_id, f"Failed: {e}") + _mark_run_status(task_id, parameters, task_type="download_and_eval", status="failed", error_message=str(e)) update_task_status(task_id, "failed", error_message=str(e)) raise @@ -463,6 +981,13 @@ def job_run_evaluator_and_process(task_id: str, parameters: Dict[str, Any]) -> N """ update_task_status(task_id, "running") append_task_log(task_id, "Starting run_evaluator_and_process workflow") + _mark_run_status( + task_id, + parameters, + 
task_type="run_evaluator_and_process", + status="running", + create_missing=True, + ) try: from lib import evaluator_api @@ -512,6 +1037,14 @@ def job_run_evaluator_and_process(task_id: str, parameters: Dict[str, Any]) -> N has_source_job = bool(source_job_id) has_fresh_source = bool(integration_id and target_name) if not project_id or not catalog_id or not output_path or (not has_source_job and not has_fresh_source): + _mark_run_status( + task_id, + parameters, + task_type="run_evaluator_and_process", + status="failed", + error_message="Missing required parameters", + create_missing=True, + ) update_task_status(task_id, "failed", error_message="Missing required parameters") return @@ -521,10 +1054,12 @@ def job_run_evaluator_and_process(task_id: str, parameters: Dict[str, Any]) -> N def on_progress(msg: str) -> None: append_task_log(task_id, msg) + _append_run_event(task_id, parameters, task_type="run_evaluator_and_process", message=msg) update_task_progress(task_id, message=msg) def on_warning(msg: str) -> None: append_task_log(task_id, f"WARNING: {msg}") + _append_run_event(task_id, parameters, task_type="run_evaluator_and_process", message=f"WARNING: {msg}") # Step 1: Schedule evaluator job on_progress("Step 1/5: Scheduling evaluator job...") @@ -553,11 +1088,27 @@ def on_warning(msg: str) -> None: is_tag=is_tag, ) except Exception as e: + _mark_run_status( + task_id, + parameters, + task_type="run_evaluator_and_process", + status="failed", + error_message=f"Failed to schedule evaluator job: {e}", + create_missing=True, + ) update_task_status(task_id, "failed", error_message=f"Failed to schedule evaluator job: {e}") return job_id = result.get("job_id") if not job_id: + _mark_run_status( + task_id, + parameters, + task_type="run_evaluator_and_process", + status="failed", + error_message="No job_id returned from evaluator API", + create_missing=True, + ) update_task_status(task_id, "failed", error_message="No job_id returned from evaluator API") return @@ -576,6 
+1127,25 @@ def on_warning(msg: str) -> None: "parquet_path": "", } update_task_result_summary(task_id, summary) + _update_run_metadata( + task_id, + parameters, + task_type="run_evaluator_and_process", + create_missing=True, + extra={ + "evaluator": { + "job_id": job_id, + "report_url": report_url, + "status": "scheduled", + "catalog_id": catalog_id, + "integration_id": integration_id or "", + "source_job_id": source_job_id or "", + "target_name": target_name or "", + "description": description or "", + "is_tag": bool(is_tag), + } + }, + ) # Step 2: Poll for evaluator completion on_progress("Step 2/5: Waiting for evaluator to complete...") @@ -615,6 +1185,18 @@ def on_eval_progress(status: str, elapsed: float) -> None: if snapshot_key == last_suite_snapshot["key"]: summary["evaluator_case_totals"] = totals summary["evaluator_suites"] = suite_summary + _update_run_metadata( + task_id, + parameters, + task_type="run_evaluator_and_process", + extra={ + "evaluator": { + "status": status, + "case_totals": totals, + "suites": suite_summary, + } + }, + ) update_task_result_summary(task_id, summary) return @@ -643,6 +1225,18 @@ def on_eval_progress(status: str, elapsed: float) -> None: f"{totals['failed']} failed, {totals['canceled']} canceled." 
), ) + _update_run_metadata( + task_id, + parameters, + task_type="run_evaluator_and_process", + extra={ + "evaluator": { + "status": status, + "case_totals": totals, + "suites": suite_summary, + } + }, + ) update_task_result_summary(task_id, summary) try: @@ -655,6 +1249,13 @@ def on_eval_progress(status: str, elapsed: float) -> None: ) except evaluator_api.EvaluationAPIError as e: append_task_log(task_id, f"Evaluator wait error: {e}") + _mark_run_status( + task_id, + parameters, + task_type="run_evaluator_and_process", + status="failed", + error_message=f"Evaluator failed or timed out: {e}", + ) update_task_status(task_id, "failed", error_message=f"Evaluator failed or timed out: {e}") return @@ -680,6 +1281,30 @@ def on_eval_progress(status: str, elapsed: float) -> None: ) summary.update(evaluator_summary) update_task_result_summary(task_id, summary) + _update_run_metadata( + task_id, + parameters, + task_type="run_evaluator_and_process", + extra={ + "evaluator": { + "job_id": job_id, + "report_url": report_url, + "status": test_status, + "scheduled_by": summary.get("evaluator_scheduled_by", ""), + "catalog_id": summary.get("evaluator_catalog_id", ""), + "catalog_name": summary.get("evaluator_catalog_name", ""), + "catalog_version_id": summary.get("evaluator_catalog_version_id", ""), + "catalog_url": summary.get("evaluator_catalog_url", ""), + "target": summary.get("evaluator_target", ""), + "build_status": summary.get("evaluator_build_status", ""), + "test_status": summary.get("evaluator_test_status", ""), + "fail_message": summary.get("evaluator_fail_message", ""), + "case_totals": summary.get("evaluator_case_totals", {}), + "suites": summary.get("evaluator_suites", []), + "failed_cases": summary.get("evaluator_failed_cases", []), + } + }, + ) fail_message = summary.get("evaluator_fail_message", "") if evaluator_api.is_success_job_status(test_status): @@ -736,6 +1361,14 @@ def on_eval_progress(status: str, elapsed: float) -> None: evaluator_msg = "" if not 
evaluator_api.is_success_job_status(test_status): evaluator_msg = f" Evaluator status was {test_status}." + _mark_run_status( + task_id, + parameters, + task_type="run_evaluator_and_process", + status="failed", + error_message=f"Download failed: {failure_count} of {total_attempted} scenarios failed.{evaluator_msg}", + result_path=output_path, + ) update_task_status(task_id, "failed", error_message=f"Download failed: {failure_count} of {total_attempted} scenarios failed.{evaluator_msg}") return @@ -749,6 +1382,14 @@ def on_eval_progress(status: str, elapsed: float) -> None: f" Evaluator status was {test_status}. " "This usually means the job failed before producing downloadable case logs." ) + _mark_run_status( + task_id, + parameters, + task_type="run_evaluator_and_process", + status="failed", + error_message=f"Download failed: {e}{evaluator_msg}", + result_path=output_path, + ) update_task_status(task_id, "failed", error_message=f"Download failed: {e}{evaluator_msg}") return @@ -762,6 +1403,14 @@ def on_eval_progress(status: str, elapsed: float) -> None: time.sleep(wait_seconds) except Exception as e: + _mark_run_status( + task_id, + parameters, + task_type="run_evaluator_and_process", + status="failed", + error_message=f"Download failed: {e}", + result_path=output_path, + ) update_task_status(task_id, "failed", error_message=f"Download failed: {e}") return @@ -773,6 +1422,25 @@ def on_eval_progress(status: str, elapsed: float) -> None: } summary["download_rows"] = rows[:500] update_task_result_summary(task_id, summary) + _update_run_metadata( + task_id, + parameters, + task_type="run_evaluator_and_process", + extra={ + "download": { + "mode": "run_evaluator_and_process", + "total": total_attempted, + "success": success_count, + "failed": failure_count, + "download_type": download_type, + "phase": phase, + "skip_large_file": bool(skip_large_file), + "large_file_mb": large_file_mb, + "keep_zip_files": bool(keep_zip_files), + "rows": rows[:100], + } + }, + ) # Step 
4: Run eval if run_eval: @@ -816,6 +1484,19 @@ def on_eval_progress(status: str, elapsed: float) -> None: update_task_progress(task_id, message="Evaluation complete", pct=85) summary["eval_summary"] = eval_result_summary update_task_result_summary(task_id, summary) + _update_run_metadata( + task_id, + parameters, + task_type="run_evaluator_and_process", + extra={ + "evaluation": { + **eval_result_summary, + "enabled": bool(run_eval), + "recursive": bool(eval_recursive), + "overwrite": bool(eval_overwrite), + } + }, + ) # Step 5: Generate parquet parquet_path = "" @@ -841,14 +1522,33 @@ def on_eval_progress(status: str, elapsed: float) -> None: # Build final summary update_task_result_summary(task_id, summary) + _update_run_metadata( + task_id, + parameters, + task_type="run_evaluator_and_process", + extra={ + "parquet": { + "enabled": bool(generate_parquet), + "path": parquet_path, + } + }, + ) if evaluator_api.is_success_job_status(test_status): append_task_log(task_id, "Workflow complete!") else: append_task_log(task_id, "Workflow complete. Evaluator job had failed test cases, but downloadable results were processed.") + _mark_run_status( + task_id, + parameters, + task_type="run_evaluator_and_process", + status="completed", + result_path=output_path, + ) update_task_status(task_id, "completed", result_path=output_path) except Exception as e: append_task_log(task_id, f"Failed: {e}") + _mark_run_status(task_id, parameters, task_type="run_evaluator_and_process", status="failed", error_message=str(e)) update_task_status(task_id, "failed", error_message=str(e)) raise From 3a62b7162442f734a65bce927020938731be078e Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Mon, 18 May 2026 13:42:02 +0900 Subject: [PATCH 71/94] feat: update Dockerfile and requirements for enhanced functionality - Added `polars` as a new dependency in `requirements-docker.txt` to support data manipulation features. 
- Updated comments in the Dockerfile for clarity on the installation process of public dependencies and runtime dependencies. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/Dockerfile | 3 +- .../pages/7_Evaluator_Workflow.py | 143 ++++++++++++++++-- .../requirements-docker.txt | 1 + 3 files changed, 135 insertions(+), 12 deletions(-) diff --git a/evaluation_dashboard_app/Dockerfile b/evaluation_dashboard_app/Dockerfile index 95dc9fb..b45eb83 100644 --- a/evaluation_dashboard_app/Dockerfile +++ b/evaluation_dashboard_app/Dockerfile @@ -64,7 +64,8 @@ RUN --mount=type=secret,id=ssh,dst=/tmp/ssh_key \ && python3 -m pip install --no-cache-dir --no-deps -e /opt/perception_catalog_analyzer \ && rm -rf /root/.ssh # Clean up private key ASAP for security -# Install public dependencies (after SSH deps so SSH failures surface fast) +# Install public dependencies (after SSH deps so SSH failures surface fast). +# Keep analyzer runtime deps that are not installed via `-e --no-deps` here too. COPY requirements-docker.txt . 
RUN python3 -m pip install --no-cache-dir -r requirements-docker.txt diff --git a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py b/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py index 5d8cd98..4f7dac8 100644 --- a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py +++ b/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py @@ -37,6 +37,8 @@ read_run_metadata, ) from lib.ui.recent_evaluator_jobs import ( + _fetch_evaluator_job_detail, + _render_recent_evaluator_job_retest_dialog, _render_recent_evaluator_jobs_section, configure_recent_evaluator_jobs_ui, ) @@ -442,6 +444,24 @@ def _load_local_runs() -> List[Dict[str, object]]: return runs +@st.cache_data(ttl=300, show_spinner=False) +def _load_local_run_source_ref(project_id: str, environment: str, evaluator_job_id: str) -> Dict[str, str]: + project = str(project_id or "").strip() + env = str(environment or "default").strip() or "default" + job_id = str(evaluator_job_id or "").strip() + if not project or not job_id: + return {"title": "", "label": "", "url": ""} + try: + detail = _fetch_evaluator_job_detail(project, env, job_id) + except Exception: + return {"title": "", "label": "", "url": ""} + return { + "title": str(detail.get("title") or "").strip(), + "label": str(detail.get("source_label") or detail.get("target") or "").strip(), + "url": str(detail.get("git_ref_url") or detail.get("source_url") or "").strip(), + } + + @st.cache_data(ttl=24 * 3600, show_spinner=False) def _resolve_subject_name(subject_id: str, environment: str) -> Dict[str, str]: subject = str(subject_id or "").strip() @@ -548,6 +568,16 @@ def _inject_workflow_page_styles() -> None: .wf-run-text { padding-top: 0.26rem; } + .wf-run-code { + padding-top: 0.22rem; + font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; + font-size: 0.74rem; + line-height: 1.22; + color: #0f172a; + white-space: normal; + overflow-wrap: anywhere; + word-break: break-all; + } 
.wf-run-flags { display: flex; flex-wrap: nowrap; @@ -637,7 +667,7 @@ def _inject_workflow_page_styles() -> None: def _render_local_runs_header() -> None: - header_cols = st.columns([0.45, 2.25, 1.05, 1.65, 1.25, 1.05, 1.18, 1.35, 0.75, 0.72], gap="small") + header_cols = st.columns([0.45, 2.35, 0.95, 1.55, 1.45, 1.0, 1.08, 1.3, 0.72, 0.72], gap="small") header_cols[0].markdown('
Pick
', unsafe_allow_html=True) header_cols[1].markdown('
Name
', unsafe_allow_html=True) header_cols[2].markdown('
User
', unsafe_allow_html=True) @@ -659,15 +689,22 @@ def _render_local_run_row(run: Dict[str, object], *, selected: bool) -> bool: catalog_url = html.escape(str(run.get("catalog_url") or "")) evaluator_job_id = str(run.get("evaluator_job_id") or "").strip() evaluator_report_url = str(run.get("evaluator_report_url") or "").strip() - evaluator_target = html.escape(str(run.get("evaluator_target") or "—")) - evaluator_label = html.escape(evaluator_job_id[:8] + "..." if len(evaluator_job_id) > 11 else (evaluator_job_id or "—")) + evaluator_target = str(run.get("evaluator_target") or "").strip() + description = str(run.get("description") or "").strip() + source_ref = _load_local_run_source_ref( + str(run.get("metadata", {}).get("request", {}).get("project_id") or ""), + str(run.get("environment") or "default"), + evaluator_job_id, + ) if evaluator_job_id else {"title": "", "label": "", "url": ""} + evaluator_title = html.escape(source_ref.get("title") or description or evaluator_job_id or "—") + source_label = html.escape(source_ref.get("label") or evaluator_target or "—") + source_url = html.escape(source_ref.get("url") or "") result_label = html.escape( f"✅ {int(run.get('passed_count') or 0)} ❌ {int(run.get('failed_count') or 0)} ⏹ {int(run.get('canceled_count') or 0)}" ) - description = str(run.get("description") or "").strip() task_type = str(run.get("task_type") or "").strip() task_status = str(run.get("task_status") or "").strip() - meta_bits = [bit for bit in [description, evaluator_target if evaluator_target != "—" else "", task_type, task_status] if bit] + meta_bits = [bit for bit in [description, task_type, task_status] if bit] flags = [ ("Summary", bool(run["has_summary"])), ("Score", bool(run["has_score"])), @@ -681,7 +718,7 @@ def _render_local_run_row(run: Dict[str, object], *, selected: bool) -> bool: checkbox_key = f"workflow_compare_pick::{name_raw}" if checkbox_key not in st.session_state: st.session_state[checkbox_key] = bool(selected) - row_cols = 
st.columns([0.45, 2.25, 1.05, 1.65, 1.25, 1.05, 1.18, 1.35, 0.75, 0.72], gap="small") + row_cols = st.columns([0.45, 2.35, 0.95, 1.55, 1.45, 1.0, 1.08, 1.3, 0.72, 0.72], gap="small") with row_cols[0]: checked = st.checkbox("Select run", key=checkbox_key, label_visibility="collapsed") with row_cols[1]: @@ -702,11 +739,13 @@ def _render_local_run_row(run: Dict[str, object], *, selected: bool) -> bool: st.markdown(f'
{catalog_label}
', unsafe_allow_html=True) with row_cols[4]: if evaluator_report_url and evaluator_job_id: - evaluator_html = f'' + evaluator_html = f'' else: - evaluator_html = f'
{evaluator_label}
' - if evaluator_target != "—": - evaluator_html += f'
{evaluator_target}
' + evaluator_html = f'
{evaluator_title}
' + if source_url and source_label != "—": + evaluator_html += f'' + elif source_label != "—": + evaluator_html += f'
{source_label}
' st.markdown(evaluator_html, unsafe_allow_html=True) with row_cols[5]: st.markdown(f'
{result_label}
', unsafe_allow_html=True) @@ -731,6 +770,20 @@ def _render_local_run_details(run: Dict[str, object]) -> None: scenario_download_meta = metadata.get("scenario_download") if isinstance(metadata.get("scenario_download"), dict) else {} evaluation_meta = metadata.get("evaluation") if isinstance(metadata.get("evaluation"), dict) else {} parquet_meta = metadata.get("parquet") if isinstance(metadata.get("parquet"), dict) else {} + project_id = str(request_meta.get("project_id") or "").strip() + request_environment = str(request_meta.get("environment") or "default").strip() or "default" + evaluator_job_id = str(evaluator_meta.get("job_id") or request_meta.get("job_id") or "").strip() + evaluator_report_url = str(evaluator_meta.get("report_url") or "").strip() + evaluator_target = str(evaluator_meta.get("target") or evaluator_meta.get("target_name") or request_meta.get("target_name") or "").strip() + evaluator_detail = {} + if project_id and evaluator_job_id: + try: + evaluator_detail = _fetch_evaluator_job_detail(project_id, request_environment, evaluator_job_id) + except Exception: + evaluator_detail = {} + source_url = str(evaluator_detail.get("source_url") or evaluator_detail.get("git_ref_url") or "").strip() + catalog_url = str(evaluator_detail.get("catalog_url") or "").strip() + source_label = str(evaluator_detail.get("source_label") or evaluator_target or "").strip() with st.container(border=True): title_cols = st.columns([3.4, 1.0]) @@ -776,7 +829,6 @@ def _render_local_run_details(run: Dict[str, object]) -> None: or "" ).strip() requested_by_label = requested_by or "—" - request_environment = str(request_meta.get("environment") or "default").strip() or "default" requested_by_label = _run_user_label(requested_by, request_environment) task_cols = st.columns(4) @@ -801,6 +853,36 @@ def _render_local_run_details(run: Dict[str, object]) -> None: st.text_input("Description", value=_metadata_text(request_meta.get("description") or evaluator_meta.get("description")), 
disabled=True, key=f"run_detail_desc::{run['name']}") + if evaluator_job_id: + action_cols = st.columns([1.15, 1.15, 1.15, 2.55]) + with action_cols[0]: + if evaluator_report_url: + st.link_button("Open report", evaluator_report_url, use_container_width=True) + with action_cols[1]: + if source_url: + st.link_button("Open source", source_url, use_container_width=True) + with action_cols[2]: + if catalog_url: + st.link_button("Open catalog", catalog_url, use_container_width=True) + with action_cols[3]: + if st.button("Artifact retest", key=f"workflow_local_run_retest::{run['name']}", type="primary", use_container_width=True): + st.session_state["workflow_local_run_retest"] = str(run["name"]) + st.rerun() + + info_cols = st.columns([1.6, 2.4]) + info_cols[0].text_input( + "Evaluator job", + value=evaluator_job_id, + disabled=True, + key=f"run_detail_job_full::{run['name']}", + ) + info_cols[1].text_input( + "Source ref", + value=_metadata_text(source_label or evaluator_target), + disabled=True, + key=f"run_detail_source_ref::{run['name']}", + ) + if evaluator_meta: eval_cols = st.columns(4) eval_cols[0].text_input("Evaluator status", value=_metadata_text(evaluator_meta.get("status")), disabled=True, key=f"run_detail_estatus::{run['name']}") @@ -853,6 +935,45 @@ def _render_local_run_details(run: Dict[str, object]) -> None: with st.expander("Raw run metadata", expanded=False): st.json(metadata or {}) + selected_retest_run = str(st.session_state.get("workflow_local_run_retest") or "").strip() + if selected_retest_run == str(run["name"]) and evaluator_job_id: + dialog_job = { + "job_id": evaluator_job_id, + "title": str(evaluator_detail.get("title") or run.get("description") or run["name"]), + } + if callable(getattr(st, "dialog", None)): + try: + @st.dialog(f"Artifact retest · {dialog_job['title']}", width="large") + def _workflow_local_run_retest_dialog() -> None: + _render_recent_evaluator_job_retest_dialog( + project_id, + request_environment, + dialog_job, + 
output_path_default="", + phase_default=str(request_meta.get("phase") or "perception.object_recognition.tracking.objects"), + ) + + _workflow_local_run_retest_dialog() + finally: + if st.session_state.get("workflow_local_run_retest") == str(run["name"]): + st.session_state.pop("workflow_local_run_retest", None) + else: + st.markdown("---") + fallback_cols = st.columns([4.2, 1.0]) + with fallback_cols[0]: + st.subheader(f"Artifact retest · {dialog_job['title']}") + with fallback_cols[1]: + if st.button("Close", key=f"workflow_local_run_retest_close::{run['name']}", use_container_width=True): + st.session_state.pop("workflow_local_run_retest", None) + st.rerun() + _render_recent_evaluator_job_retest_dialog( + project_id, + request_environment, + dialog_job, + output_path_default="", + phase_default=str(request_meta.get("phase") or "perception.object_recognition.tracking.objects"), + ) + def _render_local_runs_section() -> None: section_header("Local Runs", "") diff --git a/evaluation_dashboard_app/requirements-docker.txt b/evaluation_dashboard_app/requirements-docker.txt index 40d9cf6..7486fc0 100644 --- a/evaluation_dashboard_app/requirements-docker.txt +++ b/evaluation_dashboard_app/requirements-docker.txt @@ -9,6 +9,7 @@ duckdb>=0.9.0 numpy>=1.24.0 matplotlib>=3.7.0 shapely>=2.0.0 +polars>=1.0.0 requests>=2.31.0 PyYAML>=6.0 reportlab>=4.0.0 From cfaa789ada851c757544439a4300f1d5f644cc23 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Mon, 18 May 2026 14:11:31 +0900 Subject: [PATCH 72/94] feat: enhance compatibility functions for template and data handling - Introduced compatibility functions to handle variations in function signatures across analyzer versions, improving flexibility in template updates and data processing. - Updated `_update_template_compat`, `_scene_dataframe_from_dir_compat`, `_get_blocks_compat`, and `_specsheet_compat` to support keyword arguments and positional parameters dynamically. 
- Refactored `generate_specsheet_pdf` to utilize the new compatibility functions, ensuring consistent behavior regardless of the underlying implementation. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../lib/specsheet_report.py | 183 +++++++++++++++++- 1 file changed, 174 insertions(+), 9 deletions(-) diff --git a/evaluation_dashboard_app/lib/specsheet_report.py b/evaluation_dashboard_app/lib/specsheet_report.py index a0f00f7..93062b2 100644 --- a/evaluation_dashboard_app/lib/specsheet_report.py +++ b/evaluation_dashboard_app/lib/specsheet_report.py @@ -158,6 +158,7 @@ def _update_template_compat( version: str, *, template_dir: Path, + context_dir: Path, ) -> Sequence[str]: """Call update_template across analyzer versions with different signatures.""" try: @@ -165,13 +166,170 @@ def _update_template_compat( except (TypeError, ValueError): parameters = {} - supports_template_dir = ( - "template_dir" in parameters - or any(param.kind == inspect.Parameter.VAR_KEYWORD for param in parameters.values()) + semantic_kwargs = { + "project_id": project_id, + "pilot_auto_version": version, + "version": version, + "devops_data": {}, + "devops_plot_path": None, + "performance_trend_data": [], + "map_trend_plot_path": context_dir / "map_trend.png", + "prediction_trend_plot_path": context_dir / "prediction_trend.png", + "devops_trend_data": [], + "devops_trend_plot_path": context_dir / "devops_trend.png", + "job_ids": [], + "template_name": "static_body.html", + "extensions": ["html"], + "template_dir": str(template_dir), + "show_other_infos": False, + } + + accepts_kwargs = any( + param.kind == inspect.Parameter.VAR_KEYWORD for param in parameters.values() ) - if supports_template_dir: - return update_template_func(project_id, version, template_dir=str(template_dir)) - return update_template_func(project_id, version) + if accepts_kwargs or not parameters: + return update_template_func(**semantic_kwargs) + + args: list[object] = [] + kwargs: dict[str, 
object] = {} + for name, param in parameters.items(): + if name not in semantic_kwargs: + continue + value = semantic_kwargs[name] + if param.kind in ( + inspect.Parameter.POSITIONAL_ONLY, + inspect.Parameter.POSITIONAL_OR_KEYWORD, + ): + args.append(value) + elif param.kind == inspect.Parameter.KEYWORD_ONLY: + kwargs[name] = value + return update_template_func(*args, **kwargs) + +def _scene_dataframe_from_dir_compat( + scene_dataframe_cls, + run_path: Path, + *, + topic_name: str, +): + """Call SceneDataFrame.from_dir across analyzer versions with/without topic.""" + from_dir = scene_dataframe_cls.from_dir + try: + parameters = inspect.signature(from_dir).parameters + except (TypeError, ValueError): + parameters = {} + + required_parameters = [ + param + for param in parameters.values() + if param.kind in ( + inspect.Parameter.POSITIONAL_ONLY, + inspect.Parameter.POSITIONAL_OR_KEYWORD, + inspect.Parameter.KEYWORD_ONLY, + ) + and param.default is inspect.Parameter.empty + ] + accepts_varargs = any( + param.kind in (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD) + for param in parameters.values() + ) + + if accepts_varargs or len(required_parameters) >= 2: + return from_dir(run_path, topic_name) + return from_dir(run_path) + + +def _get_blocks_compat( + get_blocks_func: Callable[..., tuple[Sequence[str], Sequence[str]]], + *, + df, + labels: Sequence[str], + metrics: Sequence[str], + topic_name: str, + outdir: Path, + evaluation_type: str, +): + """Call get_blocks across analyzer versions with different keyword support.""" + semantic_kwargs = { + "df": df, + "labels": list(labels), + "metrics": list(metrics), + "topic_name": topic_name, + "topic": topic_name, + "path": outdir, + "outdir": outdir, + "evaluation_type": evaluation_type, + } + try: + parameters = inspect.signature(get_blocks_func).parameters + except (TypeError, ValueError): + parameters = {} + + accepts_kwargs = any( + param.kind == inspect.Parameter.VAR_KEYWORD for param in 
parameters.values() + ) + if accepts_kwargs or not parameters: + return get_blocks_func(**semantic_kwargs) + + args: list[object] = [] + kwargs: dict[str, object] = {} + for name, param in parameters.items(): + if name not in semantic_kwargs: + continue + value = semantic_kwargs[name] + if param.kind in ( + inspect.Parameter.POSITIONAL_ONLY, + inspect.Parameter.POSITIONAL_OR_KEYWORD, + ): + args.append(value) + elif param.kind == inspect.Parameter.KEYWORD_ONLY: + kwargs[name] = value + return get_blocks_func(*args, **kwargs) + + +def _specsheet_compat( + specsheet_func: Callable[..., None], + *, + html: Sequence[str], + abstract_html: Sequence[str], + detailed_html: Sequence[str], + outdir: Path, + report_name: str, +) -> None: + """Call specsheet across analyzer versions with path/outdir differences.""" + semantic_kwargs = { + "html": list(html), + "abstract_html": list(abstract_html), + "detailed_html": list(detailed_html), + "path": outdir, + "outdir": outdir, + "report_name": report_name, + } + try: + parameters = inspect.signature(specsheet_func).parameters + except (TypeError, ValueError): + parameters = {} + + accepts_kwargs = any( + param.kind == inspect.Parameter.VAR_KEYWORD for param in parameters.values() + ) + if accepts_kwargs or not parameters: + specsheet_func(**semantic_kwargs) + return + + args: list[object] = [] + kwargs: dict[str, object] = {} + for name, param in parameters.items(): + if name not in semantic_kwargs: + continue + value = semantic_kwargs[name] + if param.kind in ( + inspect.Parameter.POSITIONAL_ONLY, + inspect.Parameter.POSITIONAL_OR_KEYWORD, + ): + args.append(value) + elif param.kind == inspect.Parameter.KEYWORD_ONLY: + kwargs[name] = value + specsheet_func(*args, **kwargs) def ensure_specsheet_csvs( @@ -264,14 +422,19 @@ def generate_specsheet_pdf( specsheet_dir.mkdir(parents=True, exist_ok=True) _notify(progress_callback, "Loading CSV files") - df = SceneDataFrame.from_dir(run_path) + df = _scene_dataframe_from_dir_compat( + 
SceneDataFrame, + run_path, + topic_name=topic_name, + ) metrics = list(DEFAULT_SPECSHEET_METRICS) if getattr(df, "future", None) is not None: metrics.extend(FUTURE_SPECSHEET_METRICS) _notify(progress_callback, "Building abstract and detail sections") with _patch_block_generation_progress(progress_callback): - abstract, detailed = get_blocks( + abstract, detailed = _get_blocks_compat( + get_blocks, df=df, labels=list(labels), metrics=metrics, @@ -288,9 +451,11 @@ def generate_specsheet_pdf( project_id, version, template_dir=template_dir, + context_dir=specsheet_dir, ) ) - specsheet( + _specsheet_compat( + specsheet, html=html, abstract_html=abstract, detailed_html=detailed, From 0efe1a195e4f5ef6fcefe64d3ed5ef4882873d52 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Mon, 18 May 2026 15:12:10 +0900 Subject: [PATCH 73/94] feat: enhance recent evaluator jobs functionality with new source reference handling - Introduced new functions for formatting source reference text and HTML, improving the display of job metadata in the UI. - Updated the recent evaluator jobs rendering logic to include git SHA and commit URL, enhancing traceability of job sources. - Refactored local run details to utilize the new source reference formatting, ensuring consistent presentation across the application. - Improved error handling in job detail fetching to maintain robustness during data retrieval. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/lib/run_metadata.py | 10 + .../lib/ui/recent_evaluator_jobs.py | 132 +++++++- .../pages/7_Evaluator_Workflow.py | 286 ++++++++++++++---- evaluation_dashboard_app/worker/tasks.py | 49 ++- 4 files changed, 410 insertions(+), 67 deletions(-) diff --git a/evaluation_dashboard_app/lib/run_metadata.py b/evaluation_dashboard_app/lib/run_metadata.py index 91e9211..f5821b1 100644 --- a/evaluation_dashboard_app/lib/run_metadata.py +++ b/evaluation_dashboard_app/lib/run_metadata.py @@ -343,6 +343,7 @@ def build_metadata_patch_from_task_row(task_row: Dict[str, Any]) -> Dict[str, An "job_id": str(summary.get("evaluator_job_id") or params.get("job_id") or "").strip(), "report_url": str(summary.get("evaluator_report_url") or "").strip(), "status": str(summary.get("evaluator_status") or "").strip(), + "title": str(summary.get("evaluator_title") or params.get("description") or "").strip(), "scheduled_by": str(summary.get("evaluator_scheduled_by") or "").strip(), "build_status": str(summary.get("evaluator_build_status") or "").strip(), "test_status": str(summary.get("evaluator_test_status") or "").strip(), @@ -351,9 +352,18 @@ def build_metadata_patch_from_task_row(task_row: Dict[str, Any]) -> Dict[str, An "suites": list(summary.get("evaluator_suites") or []), "failed_cases": list(summary.get("evaluator_failed_cases") or []), "catalog_id": str(params.get("catalog_id") or "").strip(), + "catalog_name": str(summary.get("evaluator_catalog_name") or "").strip(), + "catalog_version_id": str(summary.get("evaluator_catalog_version_id") or "").strip(), + "catalog_url": str(summary.get("evaluator_catalog_url") or "").strip(), "integration_id": str(params.get("integration_id") or "").strip(), "source_job_id": str(params.get("source_job_id") or "").strip(), "target_name": str(params.get("target_name") or "").strip(), + "target": str(summary.get("evaluator_target") or params.get("target_name") or 
"").strip(), + "git_sha": str(summary.get("evaluator_git_sha") or "").strip(), + "git_ref_url": str(summary.get("evaluator_git_ref_url") or "").strip(), + "git_commit_url": str(summary.get("evaluator_git_commit_url") or "").strip(), + "source_url": str(summary.get("evaluator_source_url") or "").strip(), + "source_repo_label": str(summary.get("evaluator_source_repo_label") or "").strip(), "description": str(params.get("description") or "").strip(), "is_tag": bool(params.get("is_tag", False)), } diff --git a/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py b/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py index 8228393..00e2b0f 100644 --- a/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py +++ b/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py @@ -122,6 +122,10 @@ def _make_retest_description(target_name: str, preset_name: str = "", *, has_cus ) +def _retest_suite_selection_key(job_id: str) -> str: + return f"recent_eval_retest_suite_selection_{job_id}" + + def _to_jst(dt: Any) -> Optional[datetime]: if dt is None: return None @@ -325,6 +329,90 @@ def _extract_suite_selection_options(suite_rows: List[Dict[str, Any]]) -> List[D return options +def _short_git_sha(sha: str, *, length: int = 8) -> str: + return str(sha or "").strip()[: max(1, int(length))] + + +def _format_source_ref_text(source_label: str, git_sha: str) -> str: + label = str(source_label or "").strip() + short_sha = _short_git_sha(git_sha) + if label and short_sha: + return f"{label} ({short_sha})" + return label or short_sha or "—" + + +def _format_source_ref_html( + source_label: str, + source_url: str, + git_sha: str, + git_commit_url: str, +) -> str: + label = html.escape(str(source_label or "").strip() or "—") + ref_url = html.escape(str(source_url or "").strip()) + short_sha = html.escape(_short_git_sha(git_sha)) + commit_url = html.escape(str(git_commit_url or "").strip()) + + if ref_url and label != "—": + label_html = f'{label}' + else: + label_html = label + + 
if short_sha: + sha_html = ( + f'{short_sha}' + if commit_url + else short_sha + ) + if label_html and label_html != "—": + return f"{label_html} ({sha_html})" + return sha_html + + return label_html + + +def _extract_retest_parent_job_id(report: Dict[str, Any]) -> str: + """Return the upstream source_job_id when this evaluator job was itself a retest.""" + event = report.get("event") or {} + candidates = ( + event.get("source_job_id"), + ((event.get("source_job") or {}).get("id") if isinstance(event.get("source_job"), dict) else ""), + report.get("source_job_id"), + ) + for candidate in candidates: + value = str(candidate or "").strip() + if value: + return value + return "" + + +def _resolve_retest_source_job_id( + project_id: str, + environment: str, + job_id: str, + *, + detail: Optional[Dict[str, Any]] = None, + max_depth: int = 5, +) -> str: + """Unwrap retest chains so scheduling reuses the earliest known source job.""" + current_job_id = str(job_id or "").strip() + current_detail = detail or {} + seen_job_ids: set[str] = set() + + while current_job_id and current_job_id not in seen_job_ids and len(seen_job_ids) < max_depth: + seen_job_ids.add(current_job_id) + raw_report = current_detail.get("raw_report") if isinstance(current_detail, dict) else {} + parent_job_id = _extract_retest_parent_job_id(raw_report or {}) + if not parent_job_id or parent_job_id in seen_job_ids: + return current_job_id + current_job_id = parent_job_id + try: + current_detail = _fetch_evaluator_job_detail(project_id, environment, current_job_id) + except Exception: + return current_job_id + + return current_job_id or str(job_id or "").strip() + + def _status_color_variant(status: str) -> str: """Map evaluator status to a style token used by the recent-job cards.""" normalized = evaluator_api.normalize_job_status(status) @@ -765,6 +853,13 @@ def _inject_recent_evaluator_jobs_styles() -> None: font-size: 0.74rem; color: #64748b; } + .evj-name-sub a { + color: inherit; + text-decoration: 
none; + } + .evj-name-sub a:hover { + text-decoration: underline; + } .evj-status { display: inline-flex; align-items: center; @@ -1016,11 +1111,12 @@ def _render_recent_evaluator_job_card(job: Dict[str, Any], *, user_label: str = build_status = html.escape(job.get("build_status", "") or "—") test_status = html.escape(job.get("test_status", "") or "—") created_label = html.escape(job.get("created_label", "—")) - git_sha = html.escape(job.get("git_sha", "") or "—") - source_label = html.escape(job.get("source_label", "") or "—") + git_sha = str(job.get("git_sha", "") or "").strip() + source_label = str(job.get("source_label", "") or "—").strip() user_text = html.escape(user_label or "Unknown") report_url = html.escape(job.get("report_url", "") or "") - source_url = html.escape(job.get("git_ref_url", "") or job.get("source_url", "") or "") + source_url = str(job.get("git_ref_url", "") or job.get("source_url", "") or "").strip() + git_commit_url = str(job.get("git_commit_url", "") or "").strip() status_variant = job.get("status_variant", "unknown") status_mark = { "running": '', @@ -1036,10 +1132,7 @@ def _render_recent_evaluator_job_card(job: Dict[str, Any], *, user_label: str = f'{int(job.get("total", 0))}' ) title_html = f'{title_text}' if report_url else title_text - source_html = ( - f'{source_label}' - if source_url else source_label - ) + source_html = _format_source_ref_html(source_label, source_url, git_sha, git_commit_url) catalog_html = ( f'{catalog}' if catalog_url else catalog @@ -1065,7 +1158,7 @@ def _render_recent_evaluator_job_card(job: Dict[str, Any], *, user_label: str = {user_text}
- build {build_status} · test {test_status} · {git_sha}
+ build {build_status} · test {test_status}
{counts}
@@ -1099,7 +1192,7 @@ def _render_recent_evaluator_job_detail(project_id: str, environment: str, job: st.write(f"Status: `{detail.get('status', 'unknown')}`") st.write(f"Title: `{detail.get('title', '—')}`") st.write(f"Build/Test: `{detail.get('build_status', '—')}` / `{detail.get('test_status', '—')}`") - st.write(f"Ref: `{detail.get('target', '—')}`") + st.write(f"Ref: `{_format_source_ref_text(detail.get('target', ''), detail.get('git_sha', ''))}`") st.write(f"Catalog: `{detail.get('catalog', '—')}`") st.write(f"Repo: `{detail.get('source_repo_label', '—')}`") with overview_right: @@ -1107,7 +1200,6 @@ def _render_recent_evaluator_job_detail(project_id: str, environment: str, job: st.write(f"Started: `{_format_jst_time(detail.get('started_at'), include_seconds=True)}`") st.write(f"Finished: `{_format_jst_time(detail.get('finished_at'), include_seconds=True)}`") st.write(f"Duration: `{detail.get('duration', '—')}`") - st.write(f"SHA: `{detail.get('git_sha', '—')}`") action_cols = st.columns([1.2, 1.2, 4]) report_url = detail.get("report_url", "") @@ -1335,6 +1427,12 @@ def _render_recent_evaluator_job_retest_dialog( detail = _fetch_evaluator_job_detail(project_id, environment, job_id) raw_report = detail.get("raw_report") or {} raw_catalog = raw_report.get("catalog") or {} + resolved_source_job_id = _resolve_retest_source_job_id( + project_id, + environment, + job_id, + detail=detail, + ) suite_options = _extract_suite_selection_options(detail.get("suite_rows") or []) suite_label_to_id = {opt["label"]: opt["id"] for opt in suite_options} suite_labels = [opt["label"] for opt in suite_options] @@ -1359,16 +1457,21 @@ def _render_recent_evaluator_job_retest_dialog( summary_cols[1].markdown(f"**Ref** \n`{detail.get('target', '—')}`") summary_cols[2].markdown(f"**Original catalog** \n`{original_catalog_name or '—'}`") summary_cols[3].markdown(f"**Suites found** \n`{len(suite_labels)}`") + if resolved_source_job_id and resolved_source_job_id != job_id: + 
st.caption(f"Using upstream source job `{resolved_source_job_id}` for scheduling because this job is already a retest.") preset_key = f"recent_eval_retest_catalog_preset_{job_id}" last_preset_key = f"recent_eval_retest_last_catalog_preset_{job_id}" catalog_id_key = f"recent_eval_retest_catalog_id_{job_id}" + suite_selection_key = _retest_suite_selection_key(job_id) if preset_key not in st.session_state: st.session_state[preset_key] = default_preset_name if last_preset_key not in st.session_state: st.session_state[last_preset_key] = "" if catalog_id_key not in st.session_state: st.session_state[catalog_id_key] = original_catalog_id + if suite_selection_key not in st.session_state: + st.session_state[suite_selection_key] = [] selected_preset_name = st.selectbox( "Catalog preset", @@ -1395,8 +1498,8 @@ def _render_recent_evaluator_job_retest_dialog( selected_suite_labels = st.multiselect( "Suites to run", options=suite_labels, - default=suite_labels, - help="Defaults to the suite set found on the source job. Clear the list to let the evaluator use its default suite selection.", + key=suite_selection_key, + help="Defaults to empty. 
Leave it empty to let the evaluator use its default suite selection, or choose specific suites to rerun.", disabled=not suite_labels, ) description = st.text_input( @@ -1449,6 +1552,7 @@ def _render_recent_evaluator_job_retest_dialog( start_clicked = action_cols[1].button("Retest", key=f"recent_eval_retest_start_{job_id}", type="primary", use_container_width=True) if cancel_clicked: + st.session_state.pop(suite_selection_key, None) st.session_state.pop("recent_eval_jobs_retest_selected", None) st.rerun() @@ -1480,7 +1584,7 @@ def _render_recent_evaluator_job_retest_dialog( "project_id": project_id, "catalog_id": final_catalog_id, "integration_id": "", - "source_job_id": job_id, + "source_job_id": resolved_source_job_id or job_id, "suite_ids": selected_suite_ids or None, "target_name": "", "description": final_description, @@ -1517,6 +1621,7 @@ def _render_recent_evaluator_job_retest_dialog( f"Queued artifact retest for `{detail.get('title', job_id)}`. " f"Task id: `{task_id}`." ) + st.session_state.pop(suite_selection_key, None) st.session_state.pop("recent_eval_jobs_retest_selected", None) st.rerun() @@ -1843,6 +1948,7 @@ def _render_job_list() -> None: st.rerun() with action_cols[2]: if st.button("Retest", key=f"recent_eval_retest_{job['job_id']}", use_container_width=True): + st.session_state.pop(_retest_suite_selection_key(str(job["job_id"])), None) st.session_state["recent_eval_jobs_retest_selected"] = str(job["job_id"]) _fetch_evaluator_job_detail.clear() st.rerun() diff --git a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py b/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py index 4f7dac8..c2431e3 100644 --- a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py +++ b/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py @@ -9,9 +9,12 @@ from __future__ import annotations import html +import io import json import os +import re import urllib.parse +import zipfile from datetime import datetime, timedelta, timezone from pathlib import Path from 
typing import Dict, List, Optional @@ -26,10 +29,12 @@ section_header, ) from lib.path_utils import ( + delete_run, format_size, get_data_root_display, get_run_info, list_run_directories, + resolve_run_subdirectory, resolve_under_data_root, ) from lib.run_metadata import ( @@ -38,6 +43,8 @@ ) from lib.ui.recent_evaluator_jobs import ( _fetch_evaluator_job_detail, + _format_source_ref_html, + _format_source_ref_text, _render_recent_evaluator_job_retest_dialog, _render_recent_evaluator_jobs_section, configure_recent_evaluator_jobs_ui, @@ -373,6 +380,12 @@ def _load_local_runs() -> List[Dict[str, object]]: or "" ).strip() evaluator_report_url = str(evaluator_meta.get("report_url") or "").strip() + evaluator_title = str( + evaluator_meta.get("title") + or description + or evaluator_job_id + or "" + ).strip() evaluator_target = str( evaluator_meta.get("target") or request_meta.get("target_name") @@ -429,7 +442,13 @@ def _load_local_runs() -> List[Dict[str, object]]: "task_status": task_status, "evaluator_job_id": evaluator_job_id, "evaluator_report_url": evaluator_report_url, + "evaluator_title": evaluator_title, "evaluator_target": evaluator_target, + "evaluator_git_sha": str(evaluator_meta.get("git_sha") or "").strip(), + "evaluator_git_ref_url": str(evaluator_meta.get("git_ref_url") or "").strip(), + "evaluator_git_commit_url": str(evaluator_meta.get("git_commit_url") or "").strip(), + "evaluator_source_url": str(evaluator_meta.get("source_url") or "").strip(), + "evaluator_source_repo_label": str(evaluator_meta.get("source_repo_label") or "").strip(), "catalog_id": catalog_id, "catalog_name": catalog_name, "catalog_label": catalog_label, @@ -444,24 +463,6 @@ def _load_local_runs() -> List[Dict[str, object]]: return runs -@st.cache_data(ttl=300, show_spinner=False) -def _load_local_run_source_ref(project_id: str, environment: str, evaluator_job_id: str) -> Dict[str, str]: - project = str(project_id or "").strip() - env = str(environment or "default").strip() or 
"default" - job_id = str(evaluator_job_id or "").strip() - if not project or not job_id: - return {"title": "", "label": "", "url": ""} - try: - detail = _fetch_evaluator_job_detail(project, env, job_id) - except Exception: - return {"title": "", "label": "", "url": ""} - return { - "title": str(detail.get("title") or "").strip(), - "label": str(detail.get("source_label") or detail.get("target") or "").strip(), - "url": str(detail.get("git_ref_url") or detail.get("source_url") or "").strip(), - } - - @st.cache_data(ttl=24 * 3600, show_spinner=False) def _resolve_subject_name(subject_id: str, environment: str) -> Dict[str, str]: subject = str(subject_id or "").strip() @@ -533,6 +534,13 @@ def _inject_workflow_page_styles() -> None: color: #64748b; font-size: 0.78rem; } + .wf-meta-inline a { + color: inherit; + text-decoration: none; + } + .wf-meta-inline a:hover { + text-decoration: underline; + } .wf-run-list { display: block; margin-top: 0.35rem; @@ -556,6 +564,10 @@ def _inject_workflow_page_styles() -> None: .wf-run-title a:hover { text-decoration: underline; } + .wf-run-title--muted, + .wf-run-title--muted a { + color: #94a3b8 !important; + } .wf-run-cell { min-width: 0; color: #0f172a; @@ -565,9 +577,15 @@ def _inject_workflow_page_styles() -> None: overflow: hidden; text-overflow: ellipsis; } + .wf-run-cell--muted { + color: #94a3b8; + } .wf-run-text { padding-top: 0.26rem; } + .wf-meta-inline--muted { + color: #94a3b8; + } .wf-run-code { padding-top: 0.22rem; font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; @@ -601,6 +619,15 @@ def _inject_workflow_page_styles() -> None: background: #dcfce7; color: #166534; } + .wf-run-flags--muted { + opacity: 0.58; + } + .wf-unavailable-note { + margin-top: 0.18rem; + font-size: 0.68rem; + color: #94a3b8; + letter-spacing: 0.01em; + } .wf-compare-bar { border: 1px solid rgba(148, 163, 184, 0.24); background: linear-gradient(135deg, #f8fafc 0%, #ecfeff 100%); 
@@ -666,8 +693,104 @@ def _inject_workflow_page_styles() -> None: ) +def _build_local_run_artifact_list(run_name: str) -> tuple[Optional[Path], list[tuple[Path, str]], str]: + run_path, err = resolve_run_subdirectory(run_name) + if err: + return None, [], err + assert run_path is not None + to_zip: list[tuple[Path, str]] = [] + summary_file = run_path / "Summary.csv" + score_file = run_path / "Score.csv" + if summary_file.is_file(): + to_zip.append((summary_file, "Summary.csv")) + if score_file.is_file(): + to_zip.append((score_file, "Score.csv")) + for pq in sorted(run_path.glob("*.parquet"), key=lambda p: p.name.lower()): + to_zip.append((pq, pq.name)) + return run_path, to_zip, "" + + +def _render_local_run_download_dialog(run_name: str) -> None: + run_path, to_zip, err = _build_local_run_artifact_list(run_name) + if err: + st.error(err) + return + if run_path is None: + st.error("Run path could not be resolved.") + return + + prepared_key = f"workflow_zip_prepared::{run_name}" + st.caption("Download the generated local artifacts for this run as one ZIP.") + if not to_zip: + st.info("This run has no Summary.csv, Score.csv, or top-level `.parquet` files.") + return + + st.caption(f"**{len(to_zip)}** file(s): {', '.join(arc for _, arc in to_zip)}") + prepared = st.session_state.get(prepared_key) + + if st.button("Prepare ZIP", key=f"workflow_prepare_zip::{run_name}", use_container_width=True): + buf = io.BytesIO() + zip_errors: list[str] = [] + included: list[str] = [] + with st.spinner("Building ZIP…"): + with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf: + for fpath, arcname in to_zip: + try: + zf.write(fpath, arcname=arcname) + included.append(arcname) + except OSError as exc: + zip_errors.append(f"{arcname}: {exc}") + for msg in zip_errors: + st.warning(msg) + if included: + safe_stem = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", run_name).strip() or "run" + st.session_state[prepared_key] = { + "data": buf.getvalue(), + "file_name": 
f"{safe_stem}_artifacts.zip", + } + prepared = st.session_state.get(prepared_key) + else: + st.session_state.pop(prepared_key, None) + prepared = None + st.error("Could not add any files to the ZIP.") + + if prepared and prepared.get("data"): + st.download_button( + label=f"Download {prepared['file_name']}", + data=prepared["data"], + file_name=prepared["file_name"], + mime="application/zip", + key=f"workflow_dl_zip::{run_name}", + use_container_width=True, + ) + + +def _render_local_run_delete_dialog(run_name: str) -> None: + st.warning("This deletes the local run directory permanently.") + confirm = st.text_input( + "Type the run name to confirm", + value="", + placeholder=run_name, + key=f"workflow_delete_confirm::{run_name}", + ).strip() + if st.button("Delete run", key=f"workflow_delete_btn::{run_name}", type="primary", use_container_width=True): + if confirm != run_name: + st.error("Confirmation text does not match the run name.") + return + ok, msg = delete_run(run_name) + if ok: + st.session_state.pop("workflow_local_run_detail", None) + st.session_state.pop("workflow_local_run_download", None) + st.session_state.pop("workflow_local_run_delete", None) + st.session_state.pop(f"workflow_zip_prepared::{run_name}", None) + st.success(msg) + _load_local_runs.clear() + st.rerun() + st.error(msg) + + def _render_local_runs_header() -> None: - header_cols = st.columns([0.45, 2.35, 0.95, 1.55, 1.45, 1.0, 1.08, 1.3, 0.72, 0.72], gap="small") + header_cols = st.columns([0.45, 2.35, 0.72, 1.45, 1.55, 1.0, 1.0, 1.22, 0.68, 1.55], gap="small") header_cols[0].markdown('
Pick
', unsafe_allow_html=True) header_cols[1].markdown('
Name
', unsafe_allow_html=True) header_cols[2].markdown('
User
', unsafe_allow_html=True) @@ -677,7 +800,7 @@ def _render_local_runs_header() -> None: header_cols[6].markdown('
Updated
', unsafe_allow_html=True) header_cols[7].markdown('
Files
', unsafe_allow_html=True) header_cols[8].markdown('
Size
', unsafe_allow_html=True) - header_cols[9].markdown('
Details
', unsafe_allow_html=True) + header_cols[9].markdown('
Actions
', unsafe_allow_html=True) def _render_local_run_row(run: Dict[str, object], *, selected: bool) -> bool: @@ -691,73 +814,91 @@ def _render_local_run_row(run: Dict[str, object], *, selected: bool) -> bool: evaluator_report_url = str(run.get("evaluator_report_url") or "").strip() evaluator_target = str(run.get("evaluator_target") or "").strip() description = str(run.get("description") or "").strip() - source_ref = _load_local_run_source_ref( - str(run.get("metadata", {}).get("request", {}).get("project_id") or ""), - str(run.get("environment") or "default"), - evaluator_job_id, - ) if evaluator_job_id else {"title": "", "label": "", "url": ""} - evaluator_title = html.escape(source_ref.get("title") or description or evaluator_job_id or "—") - source_label = html.escape(source_ref.get("label") or evaluator_target or "—") - source_url = html.escape(source_ref.get("url") or "") + evaluator_title = html.escape(str(run.get("evaluator_title") or description or evaluator_job_id or "—")) + source_label = str(run.get("evaluator_target") or evaluator_target or "—").strip() + source_url = str(run.get("evaluator_git_ref_url") or run.get("evaluator_source_url") or "").strip() + source_git_sha = str(run.get("evaluator_git_sha") or "").strip() + source_commit_url = str(run.get("evaluator_git_commit_url") or "").strip() result_label = html.escape( f"✅ {int(run.get('passed_count') or 0)} ❌ {int(run.get('failed_count') or 0)} ⏹ {int(run.get('canceled_count') or 0)}" ) task_type = str(run.get("task_type") or "").strip() task_status = str(run.get("task_status") or "").strip() - meta_bits = [bit for bit in [description, task_type, task_status] if bit] + meta_bits = [bit for bit in [task_type, task_status] if bit] flags = [ ("Summary", bool(run["has_summary"])), ("Score", bool(run["has_score"])), ("Parquet", bool(run["has_parquet"])), ] + compare_available = any(enabled for _, enabled in flags) + title_class = "wf-run-title wf-run-text" + ("" if compare_available else " 
wf-run-title--muted") + cell_class = "wf-run-cell wf-run-text" + ("" if compare_available else " wf-run-cell--muted") + meta_class = "wf-meta-inline" + ("" if compare_available else " wf-meta-inline--muted") + flag_wrap_class = "wf-run-flags" + ("" if compare_available else " wf-run-flags--muted") flag_html = "".join( f'{label}' for label, enabled in flags ) + if not compare_available: + flag_html += '
Unavailable for compare
' size_label = html.escape(str(run["size"])) checkbox_key = f"workflow_compare_pick::{name_raw}" - if checkbox_key not in st.session_state: + if not compare_available: + st.session_state[checkbox_key] = False + elif checkbox_key not in st.session_state: st.session_state[checkbox_key] = bool(selected) - row_cols = st.columns([0.45, 2.35, 0.95, 1.55, 1.45, 1.0, 1.08, 1.3, 0.72, 0.72], gap="small") + row_cols = st.columns([0.45, 2.35, 0.72, 1.45, 1.55, 1.0, 1.0, 1.22, 0.68, 1.55], gap="small") with row_cols[0]: - checked = st.checkbox("Select run", key=checkbox_key, label_visibility="collapsed") + checked = st.checkbox( + "Select run", + key=checkbox_key, + label_visibility="collapsed", + disabled=not compare_available, + ) with row_cols[1]: - title_html = f'' + title_html = f'' if meta_bits: meta_html = html.escape(" · ".join(meta_bits[:3])) - title_html += f'
{meta_html}
' + title_html += f'
{meta_html}
' st.markdown(title_html, unsafe_allow_html=True) with row_cols[2]: - st.markdown(f'
{user_label}
', unsafe_allow_html=True) + st.markdown(f'
{user_label}
', unsafe_allow_html=True) with row_cols[3]: if catalog_url and catalog_label != "—": st.markdown( - f'', + f'', unsafe_allow_html=True, ) else: - st.markdown(f'
{catalog_label}
', unsafe_allow_html=True) + st.markdown(f'
{catalog_label}
', unsafe_allow_html=True) with row_cols[4]: if evaluator_report_url and evaluator_job_id: - evaluator_html = f'' + evaluator_html = f'' else: - evaluator_html = f'
{evaluator_title}
' - if source_url and source_label != "—": - evaluator_html += f'' - elif source_label != "—": - evaluator_html += f'
{source_label}
' + evaluator_html = f'
{evaluator_title}
' + source_ref_html = _format_source_ref_html(source_label, source_url, source_git_sha, source_commit_url) + if source_ref_html and source_ref_html != "—": + evaluator_html += f'
{source_ref_html}
' st.markdown(evaluator_html, unsafe_allow_html=True) with row_cols[5]: - st.markdown(f'
{result_label}
', unsafe_allow_html=True) + st.markdown(f'
{result_label}
', unsafe_allow_html=True) with row_cols[6]: - st.markdown(f'
{modified}
', unsafe_allow_html=True) + st.markdown(f'
{modified}
', unsafe_allow_html=True) with row_cols[7]: - st.markdown(f'
{flag_html}
', unsafe_allow_html=True) + st.markdown(f'
{flag_html}
', unsafe_allow_html=True) with row_cols[8]: - st.markdown(f'
{size_label}
', unsafe_allow_html=True) + st.markdown(f'
{size_label}
', unsafe_allow_html=True) with row_cols[9]: - if st.button("Details", key=f"workflow_run_details::{name_raw}", use_container_width=True): - st.session_state["workflow_local_run_detail"] = name_raw + action_cols = st.columns([1.0, 1.0, 1.0], gap="small") + with action_cols[0]: + if st.button("Info", key=f"workflow_run_details::{name_raw}", use_container_width=True): + st.session_state["workflow_local_run_detail"] = name_raw + with action_cols[1]: + if st.button("ZIP", key=f"workflow_run_download::{name_raw}", use_container_width=True): + st.session_state["workflow_local_run_download"] = name_raw + with action_cols[2]: + if st.button("Delete", key=f"workflow_run_delete::{name_raw}", use_container_width=True): + st.session_state["workflow_local_run_delete"] = name_raw return bool(checked) @@ -781,9 +922,17 @@ def _render_local_run_details(run: Dict[str, object]) -> None: evaluator_detail = _fetch_evaluator_job_detail(project_id, request_environment, evaluator_job_id) except Exception: evaluator_detail = {} - source_url = str(evaluator_detail.get("source_url") or evaluator_detail.get("git_ref_url") or "").strip() + source_url = str( + evaluator_meta.get("git_ref_url") + or evaluator_meta.get("source_url") + or evaluator_detail.get("source_url") + or evaluator_detail.get("git_ref_url") + or "" + ).strip() catalog_url = str(evaluator_detail.get("catalog_url") or "").strip() - source_label = str(evaluator_detail.get("source_label") or evaluator_target or "").strip() + source_label = str(evaluator_meta.get("target") or evaluator_detail.get("source_label") or evaluator_target or "").strip() + source_git_sha = str(evaluator_meta.get("git_sha") or evaluator_detail.get("git_sha") or "").strip() + source_ref_text = _format_source_ref_text(source_label or evaluator_target, source_git_sha) with st.container(border=True): title_cols = st.columns([3.4, 1.0]) @@ -866,6 +1015,7 @@ def _render_local_run_details(run: Dict[str, object]) -> None: st.link_button("Open catalog", 
catalog_url, use_container_width=True) with action_cols[3]: if st.button("Artifact retest", key=f"workflow_local_run_retest::{run['name']}", type="primary", use_container_width=True): + st.session_state.pop(f"recent_eval_retest_suite_selection_{evaluator_job_id}", None) st.session_state["workflow_local_run_retest"] = str(run["name"]) st.rerun() @@ -878,7 +1028,7 @@ def _render_local_run_details(run: Dict[str, object]) -> None: ) info_cols[1].text_input( "Source ref", - value=_metadata_text(source_label or evaluator_target), + value=_metadata_text(source_ref_text), disabled=True, key=f"run_detail_source_ref::{run['name']}", ) @@ -1210,6 +1360,38 @@ def _render_local_runs_section() -> None: st.button("Open", disabled=True, use_container_width=True, key="workflow_compare_run_disabled") st.markdown("", unsafe_allow_html=True) + download_run_name = str(st.session_state.get("workflow_local_run_download") or "").strip() + if download_run_name: + if callable(getattr(st, "dialog", None)): + @st.dialog(f"Download artifacts · {download_run_name}", width="large") + def _workflow_local_run_download_dialog() -> None: + _render_local_run_download_dialog(download_run_name) + if st.button("Close", key=f"workflow_local_run_download_close::{download_run_name}", use_container_width=True): + st.session_state.pop("workflow_local_run_download", None) + st.rerun() + + _workflow_local_run_download_dialog() + else: + st.markdown("---") + st.subheader(f"Download artifacts · {download_run_name}") + _render_local_run_download_dialog(download_run_name) + + delete_run_name = str(st.session_state.get("workflow_local_run_delete") or "").strip() + if delete_run_name: + if callable(getattr(st, "dialog", None)): + @st.dialog(f"Delete local run · {delete_run_name}", width="large") + def _workflow_local_run_delete_dialog() -> None: + _render_local_run_delete_dialog(delete_run_name) + if st.button("Cancel", key=f"workflow_local_run_delete_close::{delete_run_name}", use_container_width=True): + 
st.session_state.pop("workflow_local_run_delete", None) + st.rerun() + + _workflow_local_run_delete_dialog() + else: + st.markdown("---") + st.subheader(f"Delete local run · {delete_run_name}") + _render_local_run_delete_dialog(delete_run_name) + detail_run_name = str(st.session_state.get("workflow_local_run_detail") or "").strip() if detail_run_name: detail_run = next((row for row in runs if str(row["name"]) == detail_run_name), None) diff --git a/evaluation_dashboard_app/worker/tasks.py b/evaluation_dashboard_app/worker/tasks.py index 17591ce..44286b3 100644 --- a/evaluation_dashboard_app/worker/tasks.py +++ b/evaluation_dashboard_app/worker/tasks.py @@ -450,6 +450,15 @@ def _extract_git_target_from_report(report: Dict[str, Any]) -> str: return git_ref or str(source.get("git_sha") or "").strip()[:12] or "" +def _extract_job_title_from_report(report: Dict[str, Any]) -> str: + """Prefer evaluator description for display title, with a readable fallback.""" + description = str(report.get("description") or "").strip() + if description: + return description + started_like = report.get("started_at") or report.get("scheduled_at") or report.get("finished_at") + return f"no description ({started_like or 'unknown start'})" + + def _extract_catalog_url_from_report(report: Dict[str, Any]) -> str: """Best-effort catalog URL matching the recent evaluator jobs list.""" catalog = report.get("catalog") or {} @@ -468,6 +477,21 @@ def _extract_catalog_url_from_report(report: Dict[str, Any]) -> str: return "" +def _extract_source_metadata_from_report(report: Dict[str, Any]) -> Dict[str, str]: + """Best-effort source metadata for local run rendering without refetching.""" + source = ((report.get("event") or {}).get("source") or {}) + git_url = str(source.get("git_web_url") or source.get("git_url") or "").strip() + return { + "title": _extract_job_title_from_report(report), + "target": _extract_git_target_from_report(report), + "git_sha": str(source.get("git_sha") or "").strip(), + 
"git_ref_url": str(source.get("git_ref_url") or "").strip(), + "git_commit_url": str(source.get("git_commit_url") or "").strip(), + "source_url": git_url, + "source_repo_label": git_url.rstrip("/").split("/")[-1] if git_url else "", + } + + def _build_evaluator_result_summary( *, job_id: str, @@ -482,6 +506,7 @@ def _build_evaluator_result_summary( test = final_report.get("test") or {} available = test.get("available_case_results") or test.get("case_results") or {} case_totals = _suite_case_totals(suite_rows) + source_meta = _extract_source_metadata_from_report(final_report) if not any(case_totals.values()): case_totals = { "total": int(available.get("total_count", 0) or 0), @@ -498,7 +523,13 @@ def _build_evaluator_result_summary( "evaluator_catalog_name": ((final_report.get("catalog") or {}).get("display_name") or ""), "evaluator_catalog_version_id": ((final_report.get("catalog") or {}).get("version_id") or ""), "evaluator_catalog_url": _extract_catalog_url_from_report(final_report), - "evaluator_target": _extract_git_target_from_report(final_report), + "evaluator_title": source_meta.get("title", ""), + "evaluator_target": source_meta.get("target", ""), + "evaluator_git_sha": source_meta.get("git_sha", ""), + "evaluator_git_ref_url": source_meta.get("git_ref_url", ""), + "evaluator_git_commit_url": source_meta.get("git_commit_url", ""), + "evaluator_source_url": source_meta.get("source_url", ""), + "evaluator_source_repo_label": source_meta.get("source_repo_label", ""), "evaluator_build_status": build.get("status", ""), "evaluator_test_status": test.get("status", ""), "evaluator_fail_message": final_report.get("fail_message", ""), @@ -527,6 +558,7 @@ def _fetch_evaluator_context( build = report.get("build") or {} test = report.get("test") or {} available = test.get("available_case_results") or test.get("case_results") or {} + source_meta = _extract_source_metadata_from_report(report) return { "job_id": job_id, "report_url": 
evaluator_api.get_job_report_url(project_id, job_id), @@ -536,7 +568,13 @@ def _fetch_evaluator_context( "catalog_name": str(((report.get("catalog") or {}).get("display_name") or "")).strip(), "catalog_version_id": (report.get("catalog") or {}).get("version_id"), "catalog_url": _extract_catalog_url_from_report(report), - "target": _extract_git_target_from_report(report), + "title": source_meta.get("title", ""), + "target": source_meta.get("target", ""), + "git_sha": source_meta.get("git_sha", ""), + "git_ref_url": source_meta.get("git_ref_url", ""), + "git_commit_url": source_meta.get("git_commit_url", ""), + "source_url": source_meta.get("source_url", ""), + "source_repo_label": source_meta.get("source_repo_label", ""), "build_status": str(build.get("status") or "").strip(), "test_status": str(test.get("status") or "").strip(), "fail_message": str(report.get("fail_message") or "").strip(), @@ -1143,6 +1181,7 @@ def on_warning(msg: str) -> None: "target_name": target_name or "", "description": description or "", "is_tag": bool(is_tag), + "title": description or "", } }, ) @@ -1290,12 +1329,18 @@ def on_eval_progress(status: str, elapsed: float) -> None: "job_id": job_id, "report_url": report_url, "status": test_status, + "title": summary.get("evaluator_title", ""), "scheduled_by": summary.get("evaluator_scheduled_by", ""), "catalog_id": summary.get("evaluator_catalog_id", ""), "catalog_name": summary.get("evaluator_catalog_name", ""), "catalog_version_id": summary.get("evaluator_catalog_version_id", ""), "catalog_url": summary.get("evaluator_catalog_url", ""), "target": summary.get("evaluator_target", ""), + "git_sha": summary.get("evaluator_git_sha", ""), + "git_ref_url": summary.get("evaluator_git_ref_url", ""), + "git_commit_url": summary.get("evaluator_git_commit_url", ""), + "source_url": summary.get("evaluator_source_url", ""), + "source_repo_label": summary.get("evaluator_source_repo_label", ""), "build_status": summary.get("evaluator_build_status", ""), 
"test_status": summary.get("evaluator_test_status", ""), "fail_message": summary.get("evaluator_fail_message", ""), From db82d8dfc9831b825124bf5b1d34c284dbf1c1ee Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Tue, 19 May 2026 10:08:23 +0900 Subject: [PATCH 74/94] feat: add trend data functionality to evaluation dashboard - Introduced a toggle for including trend data in the release spec-sheet generation, allowing users to save and reuse trend metadata. - Implemented a text area for inputting trend metadata in YAML format, with validation and error handling for user input. - Enhanced the spec-sheet generation process to accommodate trend metadata, ensuring proper handling and storage of trend-related information. - Created a new page for displaying trend insights, integrating performance metrics and case-level pass rates from saved trend metadata files. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/Overview.py | 45 ++ .../lib/specsheet_report.py | 273 +++++++++- .../pages/13_Trend_Insights.py | 513 ++++++++++++++++++ 3 files changed, 823 insertions(+), 8 deletions(-) create mode 100644 evaluation_dashboard_app/pages/13_Trend_Insights.py diff --git a/evaluation_dashboard_app/Overview.py b/evaluation_dashboard_app/Overview.py index 359c86d..530f054 100644 --- a/evaluation_dashboard_app/Overview.py +++ b/evaluation_dashboard_app/Overview.py @@ -2,6 +2,7 @@ import pandas as pd import io import zipfile +import yaml from pathlib import Path from lib.run_loader import load_run from lib.path_utils import get_data_root, get_data_root_display, list_run_directories, path_display @@ -14,10 +15,12 @@ DEFAULT_SPECSHEET_LABELS, DEFAULT_SPECSHEET_PROJECT_ID, DEFAULT_SPECSHEET_TOPIC, + DEFAULT_TREND_METADATA_TEXT, collect_candidate_specsheet_labels, generate_specsheet_pdf, get_specsheet_artifact_paths, is_specsheet_pdf_fresh, + parse_trend_metadata_text, progress_fraction_from_message, ) from lib.page_chrome import ( @@ -759,6 +762,35 @@ 
def _update_pdf_status(message: str) -> None: if not selected_specsheet_run_paths: st.info("Pick at least one run to build the release spec-sheet.") +specsheet_trend_enabled = st.toggle( + "Include trend data", + value=bool(st.session_state.get("specsheet_include_trend", False)), + key="specsheet_include_trend", + help="Release-report mode only. Saves `metadata.yaml` next to the generated `summary.json` and reuses all saved trend metadata files under the data root.", +) + +trend_metadata_payload = None +if specsheet_trend_enabled: + st.caption( + "Trend mode uses a slim analyzer-compatible `metadata.yaml`. Extra evaluator fields are ignored." + ) + trend_metadata_text = st.text_area( + "Trend metadata YAML", + value=st.session_state.get("specsheet_trend_metadata_text", DEFAULT_TREND_METADATA_TEXT), + key="specsheet_trend_metadata_text", + height=180, + help="Required keys: tags, pilot_auto_version, data_count, description, date.", + ) + try: + trend_metadata_payload = parse_trend_metadata_text(trend_metadata_text) + st.success("Trend metadata looks valid.") + st.code( + yaml.safe_dump(trend_metadata_payload, allow_unicode=True, sort_keys=False), + language="yaml", + ) + except Exception as trend_exc: + st.error(f"Trend metadata error: {trend_exc}") + specsheet_action_col1, specsheet_action_col2 = st.columns([1.2, 2.8]) with specsheet_action_col1: if st.button("Generate Release Spec-sheet PDF", type="primary", use_container_width=True): @@ -775,11 +807,18 @@ def _update_pdf_status(message: str) -> None: raise ValueError("At least one label is required.") if not selected_specsheet_run_paths: raise ValueError("At least one run must be selected.") + if specsheet_trend_enabled and len(selected_specsheet_run_paths) != 1: + raise ValueError("Trend-enabled release spec-sheet generation currently supports exactly one run.") + if specsheet_trend_enabled and trend_metadata_payload is None: + raise ValueError("Valid trend metadata is required when trend mode is enabled.") 
stage_progress = { "Using existing up-to-date spec-sheet PDF": 1.0, "Loading CSV files": 0.15, "Building abstract and detail sections": 0.2, + "Saving trend metadata": 0.9, + "Collecting trend history": 0.92, + "Rendering trend plots": 0.94, "Rendering PDF": 0.95, "Spec-sheet PDF is ready": 1.0, } @@ -810,6 +849,8 @@ def _update_specsheet_status(message: str) -> None: version=specsheet_version, labels=specsheet_labels, topic_name=specsheet_topic_name, + include_trend=specsheet_trend_enabled, + trend_metadata=trend_metadata_payload, force=True, progress_callback=_update_specsheet_status, ) @@ -835,6 +876,8 @@ def _update_specsheet_status(message: str) -> None: "version": specsheet_version, "topic_name": specsheet_topic_name, "labels": list(specsheet_labels), + "include_trend": specsheet_trend_enabled, + "trend_metadata": trend_metadata_payload if specsheet_trend_enabled else None, "artifact_kind": "zip" if len(generated_pdfs) > 1 else "pdf", } st.session_state["specsheet_pdf_report_name"] = download_name @@ -863,6 +906,8 @@ def _update_specsheet_status(message: str) -> None: "version": specsheet_version, "topic_name": specsheet_topic_name, "labels": list(specsheet_labels), + "include_trend": specsheet_trend_enabled, + "trend_metadata": trend_metadata_payload if specsheet_trend_enabled else None, "artifact_kind": "zip" if len(selected_specsheet_run_paths) > 1 else "pdf", } _specsheet_ready = ( diff --git a/evaluation_dashboard_app/lib/specsheet_report.py b/evaluation_dashboard_app/lib/specsheet_report.py index 93062b2..d868dfe 100644 --- a/evaluation_dashboard_app/lib/specsheet_report.py +++ b/evaluation_dashboard_app/lib/specsheet_report.py @@ -2,13 +2,16 @@ from contextlib import contextmanager import inspect +import json import re from pathlib import Path -from typing import Callable, Iterable, Sequence +from typing import Any, Callable, Iterable, Sequence import pandas as pd +import yaml from lib.perception_catalog_io import build_scene_dataframe_from_pkl_dir 
+from lib.path_utils import get_data_root DEFAULT_SPECSHEET_TOPIC = "perception.object_recognition.tracking.objects" DEFAULT_SPECSHEET_PROJECT_ID = "x2_dev" @@ -32,6 +35,16 @@ "minFDE@3s", "minFDE@5s", ] +TREND_METADATA_FILENAME = "metadata.yaml" +TREND_SUMMARY_FILENAME = "summary.json" +DEFAULT_TREND_METADATA_TEXT = """tags: [trend] +pilot_auto_version: "Pilot.Auto v4.3.0 (centerpoint x2/2.3.1)" +data_count: 99,776+ +description: データの追加 +date: 2025.11.7 +""" +_TREND_DATE_PATTERN = re.compile(r"^\d{4}\.\d{1,2}\.\d{1,2}$") +_TREND_DATA_COUNT_PATTERN = re.compile(r"^\d[\d,]*\+?$") def get_specsheet_artifact_paths(run_dir: str | Path) -> dict[str, Path]: @@ -42,6 +55,9 @@ def get_specsheet_artifact_paths(run_dir: str | Path) -> dict[str, Path]: "future_csv": run_path / "future.csv", "current_parquet": run_path / "current.parquet", "future_parquet": run_path / "future.parquet", + "resource_dir": run_path / "resources", + "trend_metadata": run_path / "resources" / TREND_METADATA_FILENAME, + "trend_summary": run_path / "resources" / TREND_SUMMARY_FILENAME, "specsheet_dir": run_path / "specsheet", "specsheet_pdf": run_path / "specsheet" / "specsheet.pdf", } @@ -152,6 +168,225 @@ def _prefer_cjk_font_stack(html_lines: Sequence[str]) -> list[str]: return [line.replace(generic, preferred) for line in rendered] +def parse_trend_metadata_text(text: str) -> dict[str, Any]: + """Parse and validate manual trend metadata YAML input.""" + raw = yaml.safe_load(text or "") + if not isinstance(raw, dict): + raise ValueError("Trend metadata must be a YAML object with key/value pairs.") + + tags = raw.get("tags") + if isinstance(tags, str): + tags = [tags] + if not isinstance(tags, list) or not any(str(tag).strip() == "trend" for tag in tags): + raise ValueError("Trend metadata must include `tags: [trend]`.") + + pilot_auto_version = str(raw.get("pilot_auto_version") or "").strip() + if not pilot_auto_version: + raise ValueError("Trend metadata requires a non-empty 
`pilot_auto_version`.") + + data_count = str(raw.get("data_count") or "").strip() + if not data_count or not _TREND_DATA_COUNT_PATTERN.match(data_count): + raise ValueError( + "Trend metadata `data_count` must look like `99,776+` or `12345`." + ) + + description = str(raw.get("description") or "").strip() + date = str(raw.get("date") or "").strip() + if not date or not _TREND_DATE_PATTERN.match(date): + raise ValueError("Trend metadata `date` must look like `2025.11.7`.") + + return { + "tags": ["trend"], + "pilot_auto_version": pilot_auto_version, + "data_count": data_count, + "description": description, + "date": date, + } + + +def write_trend_metadata(run_dir: str | Path, metadata: dict[str, Any]) -> Path: + paths = get_specsheet_artifact_paths(run_dir) + resource_dir = paths["resource_dir"] + metadata_path = paths["trend_metadata"] + resource_dir.mkdir(parents=True, exist_ok=True) + with metadata_path.open("w", encoding="utf-8") as fh: + yaml.safe_dump(metadata, fh, allow_unicode=True, sort_keys=False) + return metadata_path + + +def discover_trend_metadata_files(root_dir: str | Path | None = None) -> list[Path]: + base_dir = Path(root_dir) if root_dir is not None else get_data_root() + if not base_dir.exists(): + return [] + + matches: list[Path] = [] + for metadata_path in base_dir.rglob(TREND_METADATA_FILENAME): + if not metadata_path.is_file(): + continue + if not (metadata_path.parent / TREND_SUMMARY_FILENAME).exists(): + continue + matches.append(metadata_path) + return sorted(dict.fromkeys(path.resolve() for path in matches), key=lambda p: str(p)) + + +def load_trend_metadata_file(metadata_path: str | Path) -> dict[str, Any]: + with Path(metadata_path).open("r", encoding="utf-8") as fh: + data = yaml.safe_load(fh) or {} + if not isinstance(data, dict): + raise ValueError(f"Invalid trend metadata file: {metadata_path}") + return data + + +def load_trend_summary_file(summary_path: str | Path) -> dict[str, Any]: + with Path(summary_path).open("r", 
encoding="utf-8") as fh: + data = json.load(fh) + if not isinstance(data, dict): + raise ValueError(f"Invalid trend summary file: {summary_path}") + return data + + +def _trend_version_sort_key(pilot_auto_version: str) -> tuple[tuple[int, int, int], str, tuple[int, int, int]]: + pattern = r"v(\d+)\.(\d+)\.(\d+)\s*\(([^ ]+)\s+(.+)\)" + match = re.search(pattern, str(pilot_auto_version or "")) + if not match: + return ((999, 999, 999), str(pilot_auto_version or ""), (999, 999, 999)) + + major = int(match.group(1)) + minor = int(match.group(2)) + patch = int(match.group(3)) + ml_model_type = match.group(4) + ml_model_info = match.group(5) + try: + _, ml_model_version = ml_model_info.split("/") + ml_major, ml_minor, ml_patch = ml_model_version.split(".") + ml_version = (int(ml_major), int(ml_minor), int(ml_patch)) + except ValueError: + ml_version = (999, 999, 999) + return ((major, minor, patch), ml_model_type, ml_version) + + +def _load_only_full_summary(summary_path: Path) -> list[dict[str, Any]]: + summary = load_trend_summary_file(summary_path) + data_list: list[dict[str, Any]] = [] + for block in summary.get("blocks", []): + if block.get("header") != "全数データセット評価": + continue + for tables in block.get("tables", []): + table_data = tables.get("data", {}) + if isinstance(table_data, dict) and table_data: + data_list.append(table_data) + return data_list + + +def load_performance_trend_data(metadata_list: Sequence[Path]) -> list[dict[str, str | int | float]]: + trend_data_rows: list[dict[str, Any]] = [] + for metadata_path in metadata_list: + metadata = load_trend_metadata_file(metadata_path) + if "trend" not in [str(tag).strip() for tag in metadata.get("tags", [])]: + continue + summary_path = Path(metadata_path).parent / TREND_SUMMARY_FILENAME + if not summary_path.exists(): + continue + summary_list = _load_only_full_summary(summary_path) + if not summary_list: + continue + trend_data_rows.append( + { + "version": metadata.get("pilot_auto_version"), + 
"data_count": metadata.get("data_count"), + "description": metadata.get("description"), + "date": metadata.get("date"), + "summary": summary_list, + } + ) + + trend_data_rows.sort(key=lambda row: _trend_version_sort_key(str(row.get("version") or ""))) + + output: list[dict[str, str | int | float]] = [] + for row in trend_data_rows: + summary = row.get("summary") or [] + if len(summary) != 1: + raise ValueError( + f"Expected exactly one summary block for version {row.get('version')}, but got {len(summary)}" + ) + metrics = summary[0] + + def _avg(metric_name: str) -> float: + values = metrics.get(metric_name, {}) + if not isinstance(values, dict) or not values: + return float("nan") + numeric = pd.to_numeric(pd.Series(list(values.values())), errors="coerce") + return float(numeric.mean()) + + output.append( + { + "version": row.get("version"), + "data_count": row.get("data_count"), + "description": row.get("description"), + "date": row.get("date"), + "mAP": _avg("mAP"), + "minADE@1s": _avg("minADE@1s"), + "minFDE@1s": _avg("minFDE@1s"), + "minADE@3s": _avg("minADE@3s"), + "minFDE@3s": _avg("minFDE@3s"), + "minADE@5s": _avg("minADE@5s"), + "minFDE@5s": _avg("minFDE@5s"), + } + ) + return output + + +def load_devops_trend_data(metadata_list: Sequence[Path]) -> list[dict[str, Any]]: + return [] + + +def _build_trend_context( + metadata_list: Sequence[Path], + output_dir: Path, + progress_callback: Callable[[str], None] | None = None, +) -> dict[str, object]: + if not metadata_list: + return { + "performance_trend_data": [], + "map_trend_plot_path": output_dir / "map_trend.png", + "prediction_trend_plot_path": output_dir / "prediction_trend.png", + "devops_trend_data": [], + "devops_trend_plot_path": output_dir / "devops_trend.png", + "job_ids": [], + } + + try: + from perception_catalog_analyzer.plot.map_trend import generate_map_trend_plot + from perception_catalog_analyzer.plot.prediction_trend import generate_prediction_trend_plot + except ImportError as exc: + 
raise RuntimeError( + "perception_catalog_analyzer trend support is unavailable. " + f"Original error: {exc!s}" + ) from exc + + output_dir.mkdir(parents=True, exist_ok=True) + _notify(progress_callback, "Collecting trend history") + performance_trend_data = load_performance_trend_data(list(metadata_list)) + map_trend_plot_path = output_dir / "map_trend.png" + prediction_trend_plot_path = output_dir / "prediction_trend.png" + if performance_trend_data: + _notify(progress_callback, "Rendering trend plots") + generate_map_trend_plot(performance_trend_data, map_trend_plot_path) + generate_prediction_trend_plot(performance_trend_data, prediction_trend_plot_path) + + devops_trend_data = load_devops_trend_data(list(metadata_list)) + devops_trend_plot_path = output_dir / "devops_trend.png" + + return { + "performance_trend_data": performance_trend_data, + "map_trend_plot_path": map_trend_plot_path, + "prediction_trend_plot_path": prediction_trend_plot_path, + "devops_trend_data": devops_trend_data, + "devops_trend_plot_path": devops_trend_plot_path, + "job_ids": [], + } + + def _update_template_compat( update_template_func: Callable[..., Sequence[str]], project_id: str, @@ -159,6 +394,7 @@ def _update_template_compat( *, template_dir: Path, context_dir: Path, + trend_context: dict[str, object] | None = None, ) -> Sequence[str]: """Call update_template across analyzer versions with different signatures.""" try: @@ -166,22 +402,27 @@ def _update_template_compat( except (TypeError, ValueError): parameters = {} + trend_context = trend_context or {} semantic_kwargs = { "project_id": project_id, "pilot_auto_version": version, "version": version, "devops_data": {}, "devops_plot_path": None, - "performance_trend_data": [], - "map_trend_plot_path": context_dir / "map_trend.png", - "prediction_trend_plot_path": context_dir / "prediction_trend.png", - "devops_trend_data": [], - "devops_trend_plot_path": context_dir / "devops_trend.png", - "job_ids": [], + "performance_trend_data": 
trend_context.get("performance_trend_data", []), + "map_trend_plot_path": trend_context.get("map_trend_plot_path", context_dir / "map_trend.png"), + "prediction_trend_plot_path": trend_context.get( + "prediction_trend_plot_path", context_dir / "prediction_trend.png" + ), + "devops_trend_data": trend_context.get("devops_trend_data", []), + "devops_trend_plot_path": trend_context.get( + "devops_trend_plot_path", context_dir / "devops_trend.png" + ), + "job_ids": trend_context.get("job_ids", []), "template_name": "static_body.html", "extensions": ["html"], "template_dir": str(template_dir), - "show_other_infos": False, + "show_other_infos": bool(trend_context.get("performance_trend_data")), } accepts_kwargs = any( @@ -392,6 +633,8 @@ def generate_specsheet_pdf( version: str, labels: Sequence[str], topic_name: str = DEFAULT_SPECSHEET_TOPIC, + include_trend: bool = False, + trend_metadata: dict[str, Any] | None = None, force: bool = False, progress_callback: Callable[[str], None] | None = None, ) -> tuple[Path, bool]: @@ -443,6 +686,19 @@ def generate_specsheet_pdf( evaluation_type="full", ) + trend_context: dict[str, object] | None = None + if include_trend: + if trend_metadata is None: + raise ValueError("Trend metadata is required when trend mode is enabled.") + _notify(progress_callback, "Saving trend metadata") + write_trend_metadata(run_path, trend_metadata) + metadata_list = discover_trend_metadata_files() + trend_context = _build_trend_context( + metadata_list, + specsheet_dir, + progress_callback=progress_callback, + ) + _notify(progress_callback, "Rendering PDF") template_dir = Path(template_module.__file__).resolve().parent.parent / "template" html = _prefer_cjk_font_stack( @@ -452,6 +708,7 @@ def generate_specsheet_pdf( version, template_dir=template_dir, context_dir=specsheet_dir, + trend_context=trend_context, ) ) _specsheet_compat( diff --git a/evaluation_dashboard_app/pages/13_Trend_Insights.py b/evaluation_dashboard_app/pages/13_Trend_Insights.py new 
file mode 100644 index 0000000..75848fb --- /dev/null +++ b/evaluation_dashboard_app/pages/13_Trend_Insights.py @@ -0,0 +1,513 @@ +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go +import streamlit as st + +from lib.page_chrome import inject_app_page_styles, render_page_hero, section_header +from lib.specsheet_report import ( + discover_trend_metadata_files, + load_performance_trend_data, + load_trend_metadata_file, + load_trend_summary_file, +) + +st.set_page_config(page_title="Trend Insights", layout="wide", initial_sidebar_state="expanded") +inject_app_page_styles() + + +def _run_name_from_metadata_path(metadata_path: Path) -> str: + if metadata_path.parent.name == "resources": + return metadata_path.parent.parent.name + return metadata_path.parent.name + + +def _parse_data_count(value: Any) -> int | None: + text = str(value or "").strip().replace(",", "").replace("+", "") + if not text: + return None + try: + return int(text) + except ValueError: + return None + + +def _build_entry_frame(metadata_files: list[Path]) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + performance_rows = load_performance_trend_data(metadata_files) + by_version = {str(row.get("version")): row for row in performance_rows} + + entry_rows: list[dict[str, Any]] = [] + case_rows: list[dict[str, Any]] = [] + metric_rows: list[dict[str, Any]] = [] + + for metadata_path in metadata_files: + metadata = load_trend_metadata_file(metadata_path) + summary_path = metadata_path.parent / "summary.json" + summary = load_trend_summary_file(summary_path) + version = str(metadata.get("pilot_auto_version") or "") + trend_row = by_version.get(version, {}) + data_count_raw = str(metadata.get("data_count") or "") + entry_row = { + "run_name": _run_name_from_metadata_path(metadata_path), + "version": version, + "date": str(metadata.get("date") or ""), + "description": 
str(metadata.get("description") or ""), + "data_count": data_count_raw, + "data_count_num": _parse_data_count(data_count_raw), + "metadata_path": str(metadata_path), + "summary_path": str(summary_path), + "summary_kind": "performance_blocks" if isinstance(summary.get("blocks"), list) else "case_pass_rate", + "summary_blocks": len(summary.get("blocks", [])) if isinstance(summary.get("blocks"), list) else 0, + "mAP": trend_row.get("mAP"), + "minADE@1s": trend_row.get("minADE@1s"), + "minADE@3s": trend_row.get("minADE@3s"), + "minADE@5s": trend_row.get("minADE@5s"), + "minFDE@1s": trend_row.get("minFDE@1s"), + "minFDE@3s": trend_row.get("minFDE@3s"), + "minFDE@5s": trend_row.get("minFDE@5s"), + "overall_pass_rate": None, + "scenario_count": None, + } + + if entry_row["summary_kind"] == "case_pass_rate": + total_passed = 0 + total_count = 0 + for major_category, mid_categories in summary.items(): + if not isinstance(mid_categories, dict): + continue + for mid_category, cases in mid_categories.items(): + if not isinstance(cases, dict): + continue + for case_name, result in cases.items(): + if not isinstance(result, dict): + continue + passed = int(result.get("passed", 0) or 0) + total = int(result.get("total", 0) or 0) + total_passed += passed + total_count += total + case_rows.append( + { + "run_name": entry_row["run_name"], + "version": version, + "date": entry_row["date"], + "description": entry_row["description"], + "major_category": major_category, + "mid_category": mid_category, + "case_name": case_name, + "passed": passed, + "total": total, + "pass_rate": (passed / total * 100.0) if total > 0 else None, + } + ) + entry_row["scenario_count"] = total_count + entry_row["overall_pass_rate"] = (total_passed / total_count * 100.0) if total_count > 0 else None + else: + blocks = summary.get("blocks", []) + for block in blocks: + block_header = str(block.get("header") or "") + for table in block.get("tables", []): + table_data = table.get("data", {}) + if not 
isinstance(table_data, dict): + continue + for metric_name, labels in table_data.items(): + if not isinstance(labels, dict): + continue + for label_name, value in labels.items(): + metric_rows.append( + { + "run_name": entry_row["run_name"], + "version": version, + "date": entry_row["date"], + "description": entry_row["description"], + "block_header": block_header, + "metric_name": metric_name, + "label_name": label_name, + "value": value, + } + ) + + entry_rows.append(entry_row) + + entry_df = pd.DataFrame(entry_rows) + if not entry_df.empty: + entry_df["date_sort"] = pd.to_datetime(entry_df["date"], format="%Y.%m.%d", errors="coerce") + case_df = pd.DataFrame(case_rows) + if not case_df.empty: + case_df["date_sort"] = pd.to_datetime(case_df["date"], format="%Y.%m.%d", errors="coerce") + metric_df = pd.DataFrame(metric_rows) + if not metric_df.empty: + metric_df["date_sort"] = pd.to_datetime(metric_df["date"], format="%Y.%m.%d", errors="coerce") + metric_df["value"] = pd.to_numeric(metric_df["value"], errors="coerce") + return entry_df, case_df, metric_df + + +render_page_hero( + kicker="Release Analytics", + title="Trend Insights", + description="Inspect every saved release-trend entry, including full performance metrics, case-level pass rates, and the raw analyzer summary payloads.", +) + +section_header( + "Trend Inventory", + "Trend entries are discovered from saved `metadata.yaml` files that sit beside analyzer-compatible `summary.json` files under the dashboard data root.", +) + +metadata_files = discover_trend_metadata_files() +if not metadata_files: + st.info("No saved trend metadata was found yet. 
Generate a release spec-sheet with trend mode enabled first.") + st.stop() + +try: + entry_df, case_df, metric_df = _build_entry_frame(metadata_files) +except Exception as exc: + st.error(f"Could not build trend insights: {exc}") + st.stop() + +top1, top2, top3, top4, top5 = st.columns(5) +top1.metric("Trend Entries", f"{len(entry_df):,}") +top2.metric("Unique Versions", f"{entry_df['version'].nunique():,}" if not entry_df.empty else "0") +top3.metric( + "Performance Entries", + f"{int((entry_df['summary_kind'] == 'performance_blocks').sum()):,}" if not entry_df.empty else "0", +) +top4.metric( + "Pass-rate Entries", + f"{int((entry_df['summary_kind'] == 'case_pass_rate').sum()):,}" if not entry_df.empty else "0", +) +top5.metric( + "Latest Date", + entry_df.sort_values("date_sort")["date"].iloc[-1] if not entry_df.empty else "n/a", +) + +inventory_df = entry_df.sort_values(["date_sort", "version", "run_name"], ascending=[False, False, False]).drop( + columns=["date_sort"], + errors="ignore", +) +st.dataframe(inventory_df, use_container_width=True, hide_index=True) + +section_header( + "Performance Trend", + "Full-performance summaries expose mAP, precision, recall, FNR, localization error, and prediction metrics by label.", +) + +perf_entries = entry_df[entry_df["summary_kind"] == "performance_blocks"].sort_values("date_sort") +prediction_cols = [ + "minADE@1s", + "minADE@3s", + "minADE@5s", + "minFDE@1s", + "minFDE@3s", + "minFDE@5s", +] + +if not perf_entries.empty and perf_entries[prediction_cols].notna().any().any(): + pred_card_col1, pred_card_col2, pred_card_col3 = st.columns(3) + latest_pred_row = perf_entries.dropna(subset=["minADE@3s", "minFDE@5s"], how="all").iloc[-1] + pred_card_col1.metric( + "Latest minADE@3s", + f"{latest_pred_row['minADE@3s']:.2f} m" if pd.notna(latest_pred_row["minADE@3s"]) else "n/a", + help="Mid-horizon trajectory accuracy. 
Lower is better.", + ) + pred_card_col2.metric( + "Latest minFDE@5s", + f"{latest_pred_row['minFDE@5s']:.2f} m" if pd.notna(latest_pred_row["minFDE@5s"]) else "n/a", + help="Longest-horizon endpoint accuracy. Lower is better.", + ) + pred_card_col3.metric( + "Latest Data Count", + f"{int(latest_pred_row['data_count_num']):,}" if pd.notna(latest_pred_row["data_count_num"]) else "n/a", + help="Release-scale sample count paired with the prediction metrics below.", + ) + +perf_col1, perf_col2 = st.columns([1.1, 1.0]) +with perf_col1: + if not perf_entries.empty and perf_entries["mAP"].notna().any(): + fig = go.Figure() + fig.add_bar( + x=perf_entries["version"], + y=perf_entries["data_count_num"], + name="Data Count", + marker_color="#f4a7a7", + opacity=0.5, + yaxis="y2", + ) + fig.add_trace( + go.Scatter( + x=perf_entries["version"], + y=perf_entries["mAP"], + name="mAP", + mode="lines+markers", + line=dict(color="#0f766e", width=3), + ) + ) + fig.update_layout( + title="mAP vs Data Count", + xaxis_title="Pilot.Auto Version", + yaxis_title="mAP", + yaxis2=dict(title="Data Count", overlaying="y", side="right", showgrid=False), + height=520, + legend=dict( + orientation="h", + yanchor="bottom", + y=0.94, + x=0, + xanchor="left", + ), + margin=dict(l=20, r=20, t=90, b=20), + ) + st.plotly_chart(fig, use_container_width=True) + else: + st.info("No full-performance trend entries are available yet.") + +with perf_col2: + if not perf_entries.empty and perf_entries[prediction_cols].notna().any().any(): + pred_story = perf_entries[["version", "date", "description", "run_name", "data_count", "data_count_num"] + prediction_cols].copy() + pred_fig = go.Figure() + pred_fig.add_bar( + x=pred_story["version"], + y=pred_story["data_count_num"], + name="Data Count", + marker_color="#fbbf24", + opacity=0.20, + yaxis="y2", + hovertemplate="%{x}
Data Count: %{y:,}", + ) + + series_specs = [ + ("minADE@1s", "#0f766e", "solid"), + ("minADE@3s", "#14b8a6", "solid"), + ("minADE@5s", "#99f6e4", "solid"), + ("minFDE@1s", "#1d4ed8", "dot"), + ("minFDE@3s", "#60a5fa", "dot"), + ("minFDE@5s", "#bfdbfe", "dot"), + ] + for metric_name, color, dash in series_specs: + pred_fig.add_trace( + go.Scatter( + x=pred_story["version"], + y=pred_story[metric_name], + name=metric_name, + mode="lines+markers", + line=dict(color=color, width=3 if metric_name.endswith("@3s") else 2, dash=dash), + marker=dict(size=8), + hovertemplate=( + "%{x}
" + + metric_name + + ": %{y:.2f} m
" + + "Date: %{customdata[0]}
" + + "Run: %{customdata[1]}
" + + "Data Count: %{customdata[2]}" + ), + customdata=pred_story[["date", "run_name", "data_count"]].to_numpy(), + ) + ) + + pred_fig.add_vrect( + x0=-0.5, + x1=len(pred_story) - 0.5, + fillcolor="#f8fafc", + opacity=0.35, + line_width=0, + layer="below", + ) + pred_fig.update_layout( + title="Prediction Quality Story: All Horizons with Data Count", + xaxis_title="Pilot.Auto Version", + yaxis_title="Prediction Error (m)", + yaxis2=dict(title="Data Count", overlaying="y", side="right", showgrid=False), + height=520, + legend=dict( + orientation="h", + yanchor="bottom", + y=0.94, + x=0, + xanchor="left", + ), + margin=dict(l=20, r=20, t=100, b=20), + plot_bgcolor="#ffffff", + paper_bgcolor="#ffffff", + ) + pred_fig.update_xaxes(showgrid=False) + pred_fig.update_yaxes(gridcolor="rgba(148, 163, 184, 0.18)") + st.plotly_chart(pred_fig, use_container_width=True) + + st.caption( + "Solid teal lines show ADE across 1s, 3s, and 5s. Dotted blue lines show FDE at the same horizons. The amber bars in the background show data count so scale changes are visible without leaving the chart." 
+ ) + else: + st.info("No usable prediction trend values are available yet.") + +if not metric_df.empty: + perf_drill_col1, perf_drill_col2 = st.columns([1.0, 1.0]) + with perf_drill_col1: + available_metrics = sorted(metric_df["metric_name"].dropna().unique().tolist()) + selected_metric = st.selectbox("Metric Drilldown", available_metrics, index=available_metrics.index("mAP") if "mAP" in available_metrics else 0) + with perf_drill_col2: + metric_labels = sorted(metric_df.loc[metric_df["metric_name"] == selected_metric, "label_name"].dropna().unique().tolist()) + selected_label = st.selectbox("Label", metric_labels, index=0) + + metric_slice = metric_df[ + (metric_df["metric_name"] == selected_metric) + & (metric_df["label_name"] == selected_label) + & (metric_df["block_header"] == "全数データセット評価") + ].sort_values("date_sort") + if not metric_slice.empty: + drill_fig = px.line( + metric_slice, + x="version", + y="value", + markers=True, + hover_data=["date", "description", "run_name"], + title=f"{selected_metric} for {selected_label}", + ) + drill_fig.update_layout(margin=dict(l=20, r=20, t=60, b=20)) + st.plotly_chart(drill_fig, use_container_width=True) + + metric_pivot = metric_df[ + (metric_df["metric_name"] == selected_metric) + & (metric_df["block_header"] == "全数データセット評価") + ].pivot_table( + index=["version", "date"], + columns="label_name", + values="value", + aggfunc="first", + ).reset_index() + st.dataframe(metric_pivot, use_container_width=True, hide_index=True) + +section_header( + "Pass Rate Trend", + "Nested release summaries expose per-case pass rates. 
We aggregate them into overall and category-level views here.", +) + +pass_entries = entry_df[entry_df["summary_kind"] == "case_pass_rate"].sort_values("date_sort") +pass_col1, pass_col2 = st.columns([1.1, 1.0]) +with pass_col1: + if not pass_entries.empty and pass_entries["overall_pass_rate"].notna().any(): + pass_fig = go.Figure() + pass_fig.add_bar( + x=pass_entries["version"], + y=pass_entries["scenario_count"], + name="Scenario Count", + marker_color="#86efac", + opacity=0.55, + yaxis="y2", + ) + pass_fig.add_trace( + go.Scatter( + x=pass_entries["version"], + y=pass_entries["overall_pass_rate"], + name="Overall Pass Rate", + mode="lines+markers", + line=dict(color="#1d4ed8", width=3), + ) + ) + pass_fig.update_layout( + title="Overall Pass Rate vs Scenario Count", + xaxis_title="Pilot.Auto Version", + yaxis_title="Pass Rate (%)", + yaxis2=dict(title="Scenario Count", overlaying="y", side="right", showgrid=False), + legend=dict(orientation="h"), + margin=dict(l=20, r=20, t=60, b=20), + ) + st.plotly_chart(pass_fig, use_container_width=True) + else: + st.info("No case-pass-rate summaries are available yet.") + +with pass_col2: + if not case_df.empty: + category_level = st.selectbox("Category Level", ["major_category", "mid_category"], index=0) + category_summary = ( + case_df.groupby(["version", "date", category_level], dropna=False)[["passed", "total"]] + .sum() + .reset_index() + ) + category_summary["pass_rate"] = category_summary["passed"] / category_summary["total"] * 100.0 + category_summary["label"] = category_summary[category_level].astype(str) + pass_cat_fig = px.line( + category_summary, + x="version", + y="pass_rate", + color="label", + markers=True, + hover_data=["date", "passed", "total"], + title=f"Pass Rate by {category_level}", + ) + pass_cat_fig.update_layout(margin=dict(l=20, r=20, t=60, b=20)) + st.plotly_chart(pass_cat_fig, use_container_width=True) + +if not case_df.empty: + st.markdown("**Case Explorer**") + case_filter_col1, 
case_filter_col2, case_filter_col3 = st.columns(3) + with case_filter_col1: + selected_major = st.selectbox( + "Major Category", + ["All"] + sorted(case_df["major_category"].dropna().unique().tolist()), + ) + case_df_filtered = case_df.copy() + if selected_major != "All": + case_df_filtered = case_df_filtered[case_df_filtered["major_category"] == selected_major] + + with case_filter_col2: + selected_mid = st.selectbox( + "Mid Category", + ["All"] + sorted(case_df_filtered["mid_category"].dropna().unique().tolist()), + ) + if selected_mid != "All": + case_df_filtered = case_df_filtered[case_df_filtered["mid_category"] == selected_mid] + + with case_filter_col3: + selected_case = st.selectbox( + "Case", + ["All"] + sorted(case_df_filtered["case_name"].dropna().unique().tolist()), + ) + if selected_case != "All": + case_df_filtered = case_df_filtered[case_df_filtered["case_name"] == selected_case] + + case_rollup = case_df_filtered.sort_values(["date_sort", "version", "case_name"]).drop( + columns=["date_sort"], + errors="ignore", + ) + st.dataframe(case_rollup, use_container_width=True, hide_index=True) + + if selected_case != "All": + case_line = case_df_filtered.sort_values("date_sort") + if not case_line.empty: + case_fig = px.line( + case_line, + x="version", + y="pass_rate", + markers=True, + hover_data=["date", "passed", "total", "run_name"], + title=f"Case Trend: {selected_case}", + ) + case_fig.update_layout(margin=dict(l=20, r=20, t=60, b=20)) + st.plotly_chart(case_fig, use_container_width=True) + +section_header( + "Raw Summary Browser", + "Inspect the raw metadata and summary payload for any entry. 
This is useful when validating what will flow into release spec-sheets and trend charts.", +) + +selection_df = entry_df.sort_values(["date_sort", "version", "run_name"], ascending=[False, False, False]).reset_index(drop=True) +selection_labels = [ + f"{row.run_name} | {row.version} | {row.date} | {row.summary_kind}" + for row in selection_df.itertuples() +] +selected_label = st.selectbox("Trend entry", selection_labels) +selected_row = selection_df.iloc[selection_labels.index(selected_label)] +selected_metadata = load_trend_metadata_file(selected_row["metadata_path"]) +selected_summary = load_trend_summary_file(selected_row["summary_path"]) + +detail_col1, detail_col2 = st.columns([1.0, 1.25]) +with detail_col1: + st.markdown("**Metadata YAML**") + st.code(json.dumps(selected_metadata, ensure_ascii=False, indent=2), language="json") + +with detail_col2: + st.markdown("**Summary JSON**") + st.code(json.dumps(selected_summary, ensure_ascii=False, indent=2)[:30000], language="json") From 843be0e01f58e262bccd124d85b9de12d76fcd38 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Tue, 19 May 2026 12:09:07 +0900 Subject: [PATCH 75/94] feat: implement trend release group classification and performance metrics extraction - Added a new `TrendReleaseGroup` data class to encapsulate metadata for trend releases. - Implemented functions to classify trend summaries and discover trend release groups from metadata files. - Enhanced performance metrics extraction from summary data, allowing for detailed analysis of evaluation results. - Updated the Trend Insights page to utilize the new trend release group structure, improving data organization and display. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../lib/specsheet_report.py | 149 +++++ .../pages/13_Trend_Insights.py | 549 ++++++++++-------- 2 files changed, 454 insertions(+), 244 deletions(-) diff --git a/evaluation_dashboard_app/lib/specsheet_report.py b/evaluation_dashboard_app/lib/specsheet_report.py index d868dfe..a74b7ee 100644 --- a/evaluation_dashboard_app/lib/specsheet_report.py +++ b/evaluation_dashboard_app/lib/specsheet_report.py @@ -1,6 +1,7 @@ from __future__ import annotations from contextlib import contextmanager +from dataclasses import dataclass import inspect import json import re @@ -47,6 +48,16 @@ _TREND_DATA_COUNT_PATTERN = re.compile(r"^\d[\d,]*\+?$") +@dataclass +class TrendReleaseGroup: + group_key: str + display_name: str + topic_name: str + group_kind: str + base_dir: Path + jobs: dict[str, dict[str, Any]] + + def get_specsheet_artifact_paths(run_dir: str | Path) -> dict[str, Path]: run_path = Path(run_dir) return { @@ -245,6 +256,77 @@ def load_trend_summary_file(summary_path: str | Path) -> dict[str, Any]: return data +def classify_trend_summary(summary: dict[str, Any]) -> str: + blocks = summary.get("blocks") + if isinstance(blocks, list): + headers = [str(block.get("header") or "") for block in blocks] + if "全数データセット評価" in headers: + return "full" + if "ユースケース評価" in headers: + return "usecase" + return "performance_blocks" + if isinstance(summary, dict) and summary: + return "devops" + return "unknown" + + +def discover_trend_release_groups(root_dir: str | Path | None = None) -> list[TrendReleaseGroup]: + metadata_files = discover_trend_metadata_files(root_dir) + grouped: dict[str, TrendReleaseGroup] = {} + + for metadata_path in metadata_files: + summary_path = metadata_path.parent / TREND_SUMMARY_FILENAME + summary = load_trend_summary_file(summary_path) + role = classify_trend_summary(summary) + metadata = load_trend_metadata_file(metadata_path) + + if metadata_path.parent.name == "resources": + run_dir = 
metadata_path.parent.parent + group_key = f"run::{run_dir.resolve()}" + display_name = run_dir.name + topic_name = str(metadata.get("topic_name") or "standalone") + group_kind = "standalone_run" + base_dir = run_dir + else: + job_dir = metadata_path.parent + topic_dir = job_dir.parent + combined_dir = topic_dir.parent + group_key = f"group::{combined_dir.resolve()}::{topic_dir.name}" + display_name = combined_dir.name + topic_name = topic_dir.name + group_kind = "library_pdf_group" + base_dir = combined_dir + + if group_key not in grouped: + grouped[group_key] = TrendReleaseGroup( + group_key=group_key, + display_name=display_name, + topic_name=topic_name, + group_kind=group_kind, + base_dir=base_dir, + jobs={}, + ) + grouped[group_key].jobs[role] = { + "role": role, + "job_id": metadata_path.parent.name if metadata_path.parent.name != "resources" else run_dir.name, + "metadata_path": metadata_path.resolve(), + "summary_path": summary_path.resolve(), + "metadata": metadata, + "summary": summary, + } + + def _sort_key(group: TrendReleaseGroup) -> tuple[str, str]: + dates = [ + str(job["metadata"].get("date") or "") + for job in group.jobs.values() + if isinstance(job.get("metadata"), dict) + ] + newest = max(dates) if dates else "" + return (newest, group.display_name) + + return sorted(grouped.values(), key=_sort_key) + + def _trend_version_sort_key(pilot_auto_version: str) -> tuple[tuple[int, int, int], str, tuple[int, int, int]]: pattern = r"v(\d+)\.(\d+)\.(\d+)\s*\(([^ ]+)\s+(.+)\)" match = re.search(pattern, str(pilot_auto_version or "")) @@ -278,6 +360,73 @@ def _load_only_full_summary(summary_path: Path) -> list[dict[str, Any]]: return data_list +def extract_performance_metrics_from_summary(summary: dict[str, Any]) -> dict[str, float]: + """Return averaged full-performance metrics from a full summary payload.""" + data_list: list[dict[str, Any]] = [] + for block in summary.get("blocks", []): + if block.get("header") != "全数データセット評価": + continue + for table in 
block.get("tables", []): + table_data = table.get("data", {}) + if isinstance(table_data, dict) and table_data: + data_list.append(table_data) + + if len(data_list) != 1: + raise ValueError(f"Expected exactly one full summary table, but got {len(data_list)}") + metrics = data_list[0] + + def _avg(metric_name: str) -> float: + values = metrics.get(metric_name, {}) + if not isinstance(values, dict) or not values: + return float("nan") + numeric = pd.to_numeric(pd.Series(list(values.values())), errors="coerce") + return float(numeric.mean()) + + return { + "mAP": _avg("mAP"), + "precision": _avg("precision"), + "recall": _avg("recall"), + "FNR": _avg("FNR"), + "x_error": _avg("x_error"), + "y_error": _avg("y_error"), + "yaw_error": _avg("yaw_error"), + "speed_error": _avg("speed_error"), + "minADE@1s": _avg("minADE@1s"), + "minFDE@1s": _avg("minFDE@1s"), + "minADE@3s": _avg("minADE@3s"), + "minFDE@3s": _avg("minFDE@3s"), + "minADE@5s": _avg("minADE@5s"), + "minFDE@5s": _avg("minFDE@5s"), + } + + +def extract_devops_case_rows(summary: dict[str, Any]) -> list[dict[str, Any]]: + """Flatten nested devops/pass-rate summary into case rows.""" + rows: list[dict[str, Any]] = [] + for major_category, mid_categories in summary.items(): + if not isinstance(mid_categories, dict): + continue + for mid_category, cases in mid_categories.items(): + if not isinstance(cases, dict): + continue + for case_name, result in cases.items(): + if not isinstance(result, dict): + continue + passed = int(result.get("passed", 0) or 0) + total = int(result.get("total", 0) or 0) + rows.append( + { + "major_category": major_category, + "mid_category": mid_category, + "case_name": case_name, + "passed": passed, + "total": total, + "pass_rate": (passed / total * 100.0) if total > 0 else None, + } + ) + return rows + + def load_performance_trend_data(metadata_list: Sequence[Path]) -> list[dict[str, str | int | float]]: trend_data_rows: list[dict[str, Any]] = [] for metadata_path in metadata_list: diff 
--git a/evaluation_dashboard_app/pages/13_Trend_Insights.py b/evaluation_dashboard_app/pages/13_Trend_Insights.py index 75848fb..dbd50a5 100644 --- a/evaluation_dashboard_app/pages/13_Trend_Insights.py +++ b/evaluation_dashboard_app/pages/13_Trend_Insights.py @@ -11,22 +11,16 @@ from lib.page_chrome import inject_app_page_styles, render_page_hero, section_header from lib.specsheet_report import ( - discover_trend_metadata_files, - load_performance_trend_data, - load_trend_metadata_file, - load_trend_summary_file, + TrendReleaseGroup, + discover_trend_release_groups, + extract_devops_case_rows, + extract_performance_metrics_from_summary, ) st.set_page_config(page_title="Trend Insights", layout="wide", initial_sidebar_state="expanded") inject_app_page_styles() -def _run_name_from_metadata_path(metadata_path: Path) -> str: - if metadata_path.parent.name == "resources": - return metadata_path.parent.parent.name - return metadata_path.parent.name - - def _parse_data_count(value: Any) -> int | None: text = str(value or "").strip().replace(",", "").replace("+", "") if not text: @@ -37,78 +31,61 @@ def _parse_data_count(value: Any) -> int | None: return None -def _build_entry_frame(metadata_files: list[Path]) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: - performance_rows = load_performance_trend_data(metadata_files) - by_version = {str(row.get("version")): row for row in performance_rows} +def _select_primary_metadata(group: TrendReleaseGroup) -> dict[str, Any]: + for role in ("full", "usecase", "devops", "performance_blocks", "unknown"): + if role in group.jobs: + return group.jobs[role]["metadata"] + return {} - entry_rows: list[dict[str, Any]] = [] + +def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + release_rows: list[dict[str, Any]] = [] case_rows: list[dict[str, Any]] = [] metric_rows: list[dict[str, Any]] = [] - for metadata_path in metadata_files: - metadata = 
load_trend_metadata_file(metadata_path) - summary_path = metadata_path.parent / "summary.json" - summary = load_trend_summary_file(summary_path) - version = str(metadata.get("pilot_auto_version") or "") - trend_row = by_version.get(version, {}) - data_count_raw = str(metadata.get("data_count") or "") - entry_row = { - "run_name": _run_name_from_metadata_path(metadata_path), + for group in groups: + primary_metadata = _select_primary_metadata(group) + version = str(primary_metadata.get("pilot_auto_version") or "") + date = str(primary_metadata.get("date") or "") + description = str(primary_metadata.get("description") or "") + data_count = str(primary_metadata.get("data_count") or "") + release_row = { + "group_key": group.group_key, + "release_name": group.display_name, + "topic_name": group.topic_name, + "group_kind": group.group_kind, "version": version, - "date": str(metadata.get("date") or ""), - "description": str(metadata.get("description") or ""), - "data_count": data_count_raw, - "data_count_num": _parse_data_count(data_count_raw), - "metadata_path": str(metadata_path), - "summary_path": str(summary_path), - "summary_kind": "performance_blocks" if isinstance(summary.get("blocks"), list) else "case_pass_rate", - "summary_blocks": len(summary.get("blocks", [])) if isinstance(summary.get("blocks"), list) else 0, - "mAP": trend_row.get("mAP"), - "minADE@1s": trend_row.get("minADE@1s"), - "minADE@3s": trend_row.get("minADE@3s"), - "minADE@5s": trend_row.get("minADE@5s"), - "minFDE@1s": trend_row.get("minFDE@1s"), - "minFDE@3s": trend_row.get("minFDE@3s"), - "minFDE@5s": trend_row.get("minFDE@5s"), + "date": date, + "description": description, + "data_count": data_count, + "data_count_num": _parse_data_count(data_count), + "full_job_id": group.jobs.get("full", {}).get("job_id"), + "usecase_job_id": group.jobs.get("usecase", {}).get("job_id"), + "devops_job_id": group.jobs.get("devops", {}).get("job_id"), + "mAP": None, + "precision": None, + "recall": None, + 
"FNR": None, + "x_error": None, + "y_error": None, + "yaw_error": None, + "speed_error": None, + "minADE@1s": None, + "minADE@3s": None, + "minADE@5s": None, + "minFDE@1s": None, + "minFDE@3s": None, + "minFDE@5s": None, "overall_pass_rate": None, "scenario_count": None, + "role_count": len(group.jobs), + "roles": ", ".join(sorted(group.jobs.keys())), } - if entry_row["summary_kind"] == "case_pass_rate": - total_passed = 0 - total_count = 0 - for major_category, mid_categories in summary.items(): - if not isinstance(mid_categories, dict): - continue - for mid_category, cases in mid_categories.items(): - if not isinstance(cases, dict): - continue - for case_name, result in cases.items(): - if not isinstance(result, dict): - continue - passed = int(result.get("passed", 0) or 0) - total = int(result.get("total", 0) or 0) - total_passed += passed - total_count += total - case_rows.append( - { - "run_name": entry_row["run_name"], - "version": version, - "date": entry_row["date"], - "description": entry_row["description"], - "major_category": major_category, - "mid_category": mid_category, - "case_name": case_name, - "passed": passed, - "total": total, - "pass_rate": (passed / total * 100.0) if total > 0 else None, - } - ) - entry_row["scenario_count"] = total_count - entry_row["overall_pass_rate"] = (total_passed / total_count * 100.0) if total_count > 0 else None - else: - blocks = summary.get("blocks", []) - for block in blocks: + if "full" in group.jobs: + full_summary = group.jobs["full"]["summary"] + release_row.update(extract_performance_metrics_from_summary(full_summary)) + for block in full_summary.get("blocks", []): block_header = str(block.get("header") or "") for table in block.get("tables", []): table_data = table.get("data", {}) @@ -120,82 +97,108 @@ def _build_entry_frame(metadata_files: list[Path]) -> tuple[pd.DataFrame, pd.Dat for label_name, value in labels.items(): metric_rows.append( { - "run_name": entry_row["run_name"], + "group_key": 
group.group_key, + "release_name": group.display_name, "version": version, - "date": entry_row["date"], - "description": entry_row["description"], + "date": date, + "description": description, "block_header": block_header, "metric_name": metric_name, "label_name": label_name, - "value": value, + "value": pd.to_numeric(value, errors="coerce"), } ) - entry_rows.append(entry_row) - - entry_df = pd.DataFrame(entry_rows) - if not entry_df.empty: - entry_df["date_sort"] = pd.to_datetime(entry_df["date"], format="%Y.%m.%d", errors="coerce") + if "devops" in group.jobs: + flattened = extract_devops_case_rows(group.jobs["devops"]["summary"]) + if flattened: + total_passed = sum(int(row["passed"]) for row in flattened) + total_count = sum(int(row["total"]) for row in flattened) + release_row["scenario_count"] = total_count + release_row["overall_pass_rate"] = (total_passed / total_count * 100.0) if total_count > 0 else None + for row in flattened: + case_rows.append( + { + "group_key": group.group_key, + "release_name": group.display_name, + "version": version, + "date": date, + "description": description, + **row, + } + ) + + release_rows.append(release_row) + + release_df = pd.DataFrame(release_rows) + if not release_df.empty: + release_df["date_sort"] = pd.to_datetime(release_df["date"], format="%Y.%m.%d", errors="coerce") case_df = pd.DataFrame(case_rows) if not case_df.empty: case_df["date_sort"] = pd.to_datetime(case_df["date"], format="%Y.%m.%d", errors="coerce") metric_df = pd.DataFrame(metric_rows) if not metric_df.empty: metric_df["date_sort"] = pd.to_datetime(metric_df["date"], format="%Y.%m.%d", errors="coerce") - metric_df["value"] = pd.to_numeric(metric_df["value"], errors="coerce") - return entry_df, case_df, metric_df + return release_df, case_df, metric_df render_page_hero( kicker="Release Analytics", title="Trend Insights", - description="Inspect every saved release-trend entry, including full performance metrics, case-level pass rates, and the raw analyzer 
summary payloads.", + description="Inspect grouped release trend data the same way the catalog analyzer models it: one release group with sibling full, usecase, and devops job folders under the same topic.", ) section_header( - "Trend Inventory", - "Trend entries are discovered from saved `metadata.yaml` files that sit beside analyzer-compatible `summary.json` files under the dashboard data root.", + "Release Inventory", + "Each row below is one grouped release entry. When full, usecase, and devops sibling folders exist under the same combined PDF group and topic, they are merged into one release view.", ) -metadata_files = discover_trend_metadata_files() -if not metadata_files: +groups = discover_trend_release_groups() +if not groups: st.info("No saved trend metadata was found yet. Generate a release spec-sheet with trend mode enabled first.") st.stop() try: - entry_df, case_df, metric_df = _build_entry_frame(metadata_files) + release_df, case_df, metric_df = _build_release_frames(groups) except Exception as exc: st.error(f"Could not build trend insights: {exc}") st.stop() top1, top2, top3, top4, top5 = st.columns(5) -top1.metric("Trend Entries", f"{len(entry_df):,}") -top2.metric("Unique Versions", f"{entry_df['version'].nunique():,}" if not entry_df.empty else "0") -top3.metric( - "Performance Entries", - f"{int((entry_df['summary_kind'] == 'performance_blocks').sum()):,}" if not entry_df.empty else "0", -) -top4.metric( - "Pass-rate Entries", - f"{int((entry_df['summary_kind'] == 'case_pass_rate').sum()):,}" if not entry_df.empty else "0", -) -top5.metric( - "Latest Date", - entry_df.sort_values("date_sort")["date"].iloc[-1] if not entry_df.empty else "n/a", -) - -inventory_df = entry_df.sort_values(["date_sort", "version", "run_name"], ascending=[False, False, False]).drop( - columns=["date_sort"], - errors="ignore", +top1.metric("Release Groups", f"{len(release_df):,}") +top2.metric("Unique Versions", f"{release_df['version'].nunique():,}" if not 
release_df.empty else "0") +top3.metric("Groups with Full", f"{int(release_df['full_job_id'].notna().sum()):,}" if not release_df.empty else "0") +top4.metric("Groups with DevOps", f"{int(release_df['devops_job_id'].notna().sum()):,}" if not release_df.empty else "0") +top5.metric("Latest Date", release_df.sort_values("date_sort")["date"].iloc[-1] if not release_df.empty else "n/a") + +inventory_cols = [ + "version", + "date", + "description", + "data_count", + "roles", + "full_job_id", + "usecase_job_id", + "devops_job_id", + "release_name", + "topic_name", + "group_kind", +] +st.dataframe( + release_df.sort_values(["date_sort", "version", "release_name"], ascending=[False, False, False])[inventory_cols], + use_container_width=True, + hide_index=True, ) -st.dataframe(inventory_df, use_container_width=True, hide_index=True) section_header( "Performance Trend", - "Full-performance summaries expose mAP, precision, recall, FNR, localization error, and prediction metrics by label.", + "Full-performance summaries are now plotted one release group at a time, even when they arrived with sibling usecase and devops folders.", ) -perf_entries = entry_df[entry_df["summary_kind"] == "performance_blocks"].sort_values("date_sort") +perf_entries = release_df[release_df["full_job_id"].notna()].sort_values( + ["date_sort", "version", "release_name"], + ascending=[True, True, True], +) prediction_cols = [ "minADE@1s", "minADE@3s", @@ -211,17 +214,14 @@ def _build_entry_frame(metadata_files: list[Path]) -> tuple[pd.DataFrame, pd.Dat pred_card_col1.metric( "Latest minADE@3s", f"{latest_pred_row['minADE@3s']:.2f} m" if pd.notna(latest_pred_row["minADE@3s"]) else "n/a", - help="Mid-horizon trajectory accuracy. Lower is better.", ) pred_card_col2.metric( "Latest minFDE@5s", f"{latest_pred_row['minFDE@5s']:.2f} m" if pd.notna(latest_pred_row["minFDE@5s"]) else "n/a", - help="Longest-horizon endpoint accuracy. 
Lower is better.", ) pred_card_col3.metric( "Latest Data Count", f"{int(latest_pred_row['data_count_num']):,}" if pd.notna(latest_pred_row["data_count_num"]) else "n/a", - help="Release-scale sample count paired with the prediction metrics below.", ) perf_col1, perf_col2 = st.columns([1.1, 1.0]) @@ -243,6 +243,8 @@ def _build_entry_frame(metadata_files: list[Path]) -> tuple[pd.DataFrame, pd.Dat name="mAP", mode="lines+markers", line=dict(color="#0f766e", width=3), + customdata=perf_entries[["release_name", "date", "data_count"]].to_numpy(), + hovertemplate="%{x}
mAP: %{y:.3f}
Release: %{customdata[0]}
Date: %{customdata[1]}
Data Count: %{customdata[2]}", ) ) fig.update_layout( @@ -251,22 +253,16 @@ def _build_entry_frame(metadata_files: list[Path]) -> tuple[pd.DataFrame, pd.Dat yaxis_title="mAP", yaxis2=dict(title="Data Count", overlaying="y", side="right", showgrid=False), height=520, - legend=dict( - orientation="h", - yanchor="bottom", - y=0.94, - x=0, - xanchor="left", - ), + legend=dict(orientation="h", yanchor="bottom", y=0.94, x=0, xanchor="left"), margin=dict(l=20, r=20, t=90, b=20), ) st.plotly_chart(fig, use_container_width=True) else: - st.info("No full-performance trend entries are available yet.") + st.info("No grouped full-performance trend entries are available yet.") with perf_col2: if not perf_entries.empty and perf_entries[prediction_cols].notna().any().any(): - pred_story = perf_entries[["version", "date", "description", "run_name", "data_count", "data_count_num"] + prediction_cols].copy() + pred_story = perf_entries[["version", "date", "description", "release_name", "data_count", "data_count_num"] + prediction_cols].copy() pred_fig = go.Figure() pred_fig.add_bar( x=pred_story["version"], @@ -277,7 +273,6 @@ def _build_entry_frame(metadata_files: list[Path]) -> tuple[pd.DataFrame, pd.Dat yaxis="y2", hovertemplate="%{x}
Data Count: %{y:,}", ) - series_specs = [ ("minADE@1s", "#0f766e", "solid"), ("minADE@3s", "#14b8a6", "solid"), @@ -295,39 +290,21 @@ def _build_entry_frame(metadata_files: list[Path]) -> tuple[pd.DataFrame, pd.Dat mode="lines+markers", line=dict(color=color, width=3 if metric_name.endswith("@3s") else 2, dash=dash), marker=dict(size=8), + customdata=pred_story[["date", "release_name", "data_count"]].to_numpy(), hovertemplate=( "%{x}
" + metric_name - + ": %{y:.2f} m
" - + "Date: %{customdata[0]}
" - + "Run: %{customdata[1]}
" - + "Data Count: %{customdata[2]}" + + ": %{y:.2f} m
Date: %{customdata[0]}
Release: %{customdata[1]}
Data Count: %{customdata[2]}" ), - customdata=pred_story[["date", "run_name", "data_count"]].to_numpy(), ) ) - - pred_fig.add_vrect( - x0=-0.5, - x1=len(pred_story) - 0.5, - fillcolor="#f8fafc", - opacity=0.35, - line_width=0, - layer="below", - ) pred_fig.update_layout( title="Prediction Quality Story: All Horizons with Data Count", xaxis_title="Pilot.Auto Version", yaxis_title="Prediction Error (m)", yaxis2=dict(title="Data Count", overlaying="y", side="right", showgrid=False), height=520, - legend=dict( - orientation="h", - yanchor="bottom", - y=0.94, - x=0, - xanchor="left", - ), + legend=dict(orientation="h", yanchor="bottom", y=0.94, x=0, xanchor="left"), margin=dict(l=20, r=20, t=100, b=20), plot_bgcolor="#ffffff", paper_bgcolor="#ffffff", @@ -335,56 +312,92 @@ def _build_entry_frame(metadata_files: list[Path]) -> tuple[pd.DataFrame, pd.Dat pred_fig.update_xaxes(showgrid=False) pred_fig.update_yaxes(gridcolor="rgba(148, 163, 184, 0.18)") st.plotly_chart(pred_fig, use_container_width=True) - st.caption( - "Solid teal lines show ADE across 1s, 3s, and 5s. Dotted blue lines show FDE at the same horizons. The amber bars in the background show data count so scale changes are visible without leaving the chart." + "Each point is one grouped release. The chart now keeps sibling full/usecase/devops folders together so the performance story stays release-centric." 
) else: - st.info("No usable prediction trend values are available yet.") + st.info("No usable grouped prediction trend values are available yet.") if not metric_df.empty: - perf_drill_col1, perf_drill_col2 = st.columns([1.0, 1.0]) - with perf_drill_col1: - available_metrics = sorted(metric_df["metric_name"].dropna().unique().tolist()) - selected_metric = st.selectbox("Metric Drilldown", available_metrics, index=available_metrics.index("mAP") if "mAP" in available_metrics else 0) - with perf_drill_col2: - metric_labels = sorted(metric_df.loc[metric_df["metric_name"] == selected_metric, "label_name"].dropna().unique().tolist()) - selected_label = st.selectbox("Label", metric_labels, index=0) - - metric_slice = metric_df[ - (metric_df["metric_name"] == selected_metric) - & (metric_df["label_name"] == selected_label) - & (metric_df["block_header"] == "全数データセット評価") - ].sort_values("date_sort") - if not metric_slice.empty: - drill_fig = px.line( - metric_slice, - x="version", - y="value", - markers=True, - hover_data=["date", "description", "run_name"], - title=f"{selected_metric} for {selected_label}", + atlas_df = metric_df[metric_df["block_header"] == "全数データセット評価"].copy() + atlas_df = atlas_df.sort_values(["date_sort", "version", "release_name"], ascending=[True, True, True]) + latest_group_key = perf_entries.iloc[-1]["group_key"] if not perf_entries.empty else None + previous_group_key = perf_entries.iloc[-2]["group_key"] if len(perf_entries) >= 2 else None + latest_release_name = perf_entries.iloc[-1]["release_name"] if not perf_entries.empty else "" + + atlas_col1, atlas_col2 = st.columns([1.0, 1.0]) + if latest_group_key is not None: + latest_matrix = atlas_df[atlas_df["group_key"] == latest_group_key].pivot_table( + index="metric_name", + columns="label_name", + values="value", + aggfunc="first", + ).dropna(how="all") + if not latest_matrix.empty: + latest_min = latest_matrix.min(axis=1) + latest_range = (latest_matrix.max(axis=1) - latest_min).replace(0, 1) + 
latest_norm = latest_matrix.sub(latest_min, axis=0).div(latest_range, axis=0) + atlas_fig = px.imshow( + latest_norm, + aspect="auto", + color_continuous_scale=["#f8fafc", "#8dd3c7", "#0f766e"], + text_auto=".2f", + ) + atlas_fig.update_traces( + text=latest_matrix.round(2).astype(str), + hovertemplate="Metric: %{y}
Label: %{x}
Value: %{text}", + ) + atlas_fig.update_layout( + title=f"Latest Release Metric Atlas: {latest_release_name}", + margin=dict(l=20, r=20, t=70, b=20), + coloraxis_colorbar=dict(title="Relative"), + ) + atlas_col1.plotly_chart(atlas_fig, use_container_width=True) + else: + atlas_col1.info("No latest metric atlas is available yet.") + + if latest_group_key is not None and previous_group_key is not None: + latest_matrix = atlas_df[atlas_df["group_key"] == latest_group_key].pivot_table( + index="metric_name", + columns="label_name", + values="value", + aggfunc="first", ) - drill_fig.update_layout(margin=dict(l=20, r=20, t=60, b=20)) - st.plotly_chart(drill_fig, use_container_width=True) - - metric_pivot = metric_df[ - (metric_df["metric_name"] == selected_metric) - & (metric_df["block_header"] == "全数データセット評価") - ].pivot_table( - index=["version", "date"], + previous_matrix = atlas_df[atlas_df["group_key"] == previous_group_key].pivot_table( + index="metric_name", columns="label_name", values="value", aggfunc="first", - ).reset_index() - st.dataframe(metric_pivot, use_container_width=True, hide_index=True) + ) + delta_matrix = latest_matrix.subtract(previous_matrix, fill_value=pd.NA).dropna(how="all") + if not delta_matrix.empty: + delta_fig = px.imshow( + delta_matrix, + aspect="auto", + color_continuous_scale=["#7f1d1d", "#f8fafc", "#14532d"], + color_continuous_midpoint=0, + text_auto=".2f", + ) + delta_fig.update_layout( + title="Release-over-Release Metric Delta", + margin=dict(l=20, r=20, t=70, b=20), + coloraxis_colorbar=dict(title="Delta"), + ) + atlas_col2.plotly_chart(delta_fig, use_container_width=True) + else: + atlas_col2.info("No previous release is available for metric delta yet.") + else: + atlas_col2.info("Metric delta becomes available after at least two grouped full releases exist.") section_header( "Pass Rate Trend", - "Nested release summaries expose per-case pass rates. 
We aggregate them into overall and category-level views here.", + "DevOps-style nested summaries are also grouped by release, so one pass-rate point represents the same release group as the matching performance metrics.", ) -pass_entries = entry_df[entry_df["summary_kind"] == "case_pass_rate"].sort_values("date_sort") +pass_entries = release_df[release_df["devops_job_id"].notna()].sort_values( + ["date_sort", "version", "release_name"], + ascending=[True, True, True], +) pass_col1, pass_col2 = st.columns([1.1, 1.0]) with pass_col1: if not pass_entries.empty and pass_entries["overall_pass_rate"].notna().any(): @@ -404,6 +417,8 @@ def _build_entry_frame(metadata_files: list[Path]) -> tuple[pd.DataFrame, pd.Dat name="Overall Pass Rate", mode="lines+markers", line=dict(color="#1d4ed8", width=3), + customdata=pass_entries[["release_name", "date"]].to_numpy(), + hovertemplate="%{x}
Pass Rate: %{y:.1f}%
Release: %{customdata[0]}
Date: %{customdata[1]}", ) ) pass_fig.update_layout( @@ -411,103 +426,149 @@ def _build_entry_frame(metadata_files: list[Path]) -> tuple[pd.DataFrame, pd.Dat xaxis_title="Pilot.Auto Version", yaxis_title="Pass Rate (%)", yaxis2=dict(title="Scenario Count", overlaying="y", side="right", showgrid=False), - legend=dict(orientation="h"), - margin=dict(l=20, r=20, t=60, b=20), + height=520, + legend=dict(orientation="h", yanchor="bottom", y=0.94, x=0, xanchor="left"), + margin=dict(l=20, r=20, t=90, b=20), ) st.plotly_chart(pass_fig, use_container_width=True) else: - st.info("No case-pass-rate summaries are available yet.") + st.info("No grouped pass-rate summaries are available yet.") with pass_col2: if not case_df.empty: - category_level = st.selectbox("Category Level", ["major_category", "mid_category"], index=0) - category_summary = ( - case_df.groupby(["version", "date", category_level], dropna=False)[["passed", "total"]] + major_summary = ( + case_df.groupby(["version", "date", "release_name", "major_category"], dropna=False)[["passed", "total"]] .sum() .reset_index() ) - category_summary["pass_rate"] = category_summary["passed"] / category_summary["total"] * 100.0 - category_summary["label"] = category_summary[category_level].astype(str) + major_summary["pass_rate"] = major_summary["passed"] / major_summary["total"] * 100.0 + major_summary["label"] = major_summary["major_category"].astype(str) pass_cat_fig = px.line( - category_summary, + major_summary, x="version", y="pass_rate", color="label", markers=True, - hover_data=["date", "passed", "total"], - title=f"Pass Rate by {category_level}", + hover_data=["date", "release_name", "passed", "total"], + title="Major Category Pass Rate", ) pass_cat_fig.update_layout(margin=dict(l=20, r=20, t=60, b=20)) st.plotly_chart(pass_cat_fig, use_container_width=True) if not case_df.empty: - st.markdown("**Case Explorer**") - case_filter_col1, case_filter_col2, case_filter_col3 = st.columns(3) - with case_filter_col1: - 
selected_major = st.selectbox( - "Major Category", - ["All"] + sorted(case_df["major_category"].dropna().unique().tolist()), + pass_detail_col1, pass_detail_col2 = st.columns([1.0, 1.0]) + with pass_detail_col1: + mid_summary = ( + case_df.groupby(["mid_category", "version"], dropna=False)[["passed", "total"]] + .sum() + .reset_index() ) - case_df_filtered = case_df.copy() - if selected_major != "All": - case_df_filtered = case_df_filtered[case_df_filtered["major_category"] == selected_major] - - with case_filter_col2: - selected_mid = st.selectbox( - "Mid Category", - ["All"] + sorted(case_df_filtered["mid_category"].dropna().unique().tolist()), + mid_summary["pass_rate"] = mid_summary["passed"] / mid_summary["total"] * 100.0 + mid_matrix = mid_summary.pivot_table( + index="mid_category", + columns="version", + values="pass_rate", + aggfunc="first", ) - if selected_mid != "All": - case_df_filtered = case_df_filtered[case_df_filtered["mid_category"] == selected_mid] + mid_fig = px.imshow( + mid_matrix, + aspect="auto", + color_continuous_scale=["#7f1d1d", "#fef3c7", "#166534"], + zmin=0, + zmax=100, + text_auto=".1f", + ) + mid_fig.update_layout( + title="Mid Category Pass Rate Matrix", + margin=dict(l=20, r=20, t=70, b=20), + coloraxis_colorbar=dict(title="%"), + ) + st.plotly_chart(mid_fig, use_container_width=True) - with case_filter_col3: - selected_case = st.selectbox( - "Case", - ["All"] + sorted(case_df_filtered["case_name"].dropna().unique().tolist()), + with pass_detail_col2: + latest_devops_group = pass_entries.iloc[-1]["group_key"] if not pass_entries.empty else None + latest_case_df = case_df[case_df["group_key"] == latest_devops_group].copy() + latest_major_mid = ( + latest_case_df.groupby(["major_category", "mid_category"], dropna=False)[["passed", "total"]] + .sum() + .reset_index() ) + latest_major_mid["pass_rate"] = latest_major_mid["passed"] / latest_major_mid["total"] * 100.0 + if not latest_major_mid.empty: + sunburst_fig = px.sunburst( + 
latest_major_mid, + path=["major_category", "mid_category"], + values="total", + color="pass_rate", + color_continuous_scale=["#7f1d1d", "#fef3c7", "#166534"], + range_color=(0, 100), + title="Latest Release Pass-Rate Hierarchy", + ) + sunburst_fig.update_layout(margin=dict(l=20, r=20, t=70, b=20)) + st.plotly_chart(sunburst_fig, use_container_width=True) + else: + st.info("No latest release pass-rate hierarchy is available yet.") + + st.markdown("**Case Explorer**") + filter_col1, filter_col2, filter_col3 = st.columns(3) + with filter_col1: + selected_major = st.selectbox("Major Category", ["All"] + sorted(case_df["major_category"].dropna().unique().tolist())) + case_filtered = case_df.copy() + if selected_major != "All": + case_filtered = case_filtered[case_filtered["major_category"] == selected_major] + with filter_col2: + selected_mid = st.selectbox("Mid Category", ["All"] + sorted(case_filtered["mid_category"].dropna().unique().tolist())) + if selected_mid != "All": + case_filtered = case_filtered[case_filtered["mid_category"] == selected_mid] + with filter_col3: + selected_case = st.selectbox("Case", ["All"] + sorted(case_filtered["case_name"].dropna().unique().tolist())) if selected_case != "All": - case_df_filtered = case_df_filtered[case_df_filtered["case_name"] == selected_case] + case_filtered = case_filtered[case_filtered["case_name"] == selected_case] - case_rollup = case_df_filtered.sort_values(["date_sort", "version", "case_name"]).drop( - columns=["date_sort"], - errors="ignore", + st.dataframe( + case_filtered.sort_values(["date_sort", "version", "case_name"]).drop(columns=["date_sort"], errors="ignore"), + use_container_width=True, + hide_index=True, ) - st.dataframe(case_rollup, use_container_width=True, hide_index=True) - - if selected_case != "All": - case_line = case_df_filtered.sort_values("date_sort") - if not case_line.empty: - case_fig = px.line( - case_line, - x="version", - y="pass_rate", - markers=True, - hover_data=["date", "passed", 
"total", "run_name"], - title=f"Case Trend: {selected_case}", - ) - case_fig.update_layout(margin=dict(l=20, r=20, t=60, b=20)) - st.plotly_chart(case_fig, use_container_width=True) section_header( - "Raw Summary Browser", - "Inspect the raw metadata and summary payload for any entry. This is useful when validating what will flow into release spec-sheets and trend charts.", + "Grouped Raw Browser", + "Inspect one grouped release and its child roles directly. This makes it obvious which full, usecase, and devops job folders are contributing to the combined trend view.", ) -selection_df = entry_df.sort_values(["date_sort", "version", "run_name"], ascending=[False, False, False]).reset_index(drop=True) +selection_df = release_df.sort_values(["date_sort", "version", "release_name"], ascending=[False, False, False]).reset_index(drop=True) selection_labels = [ - f"{row.run_name} | {row.version} | {row.date} | {row.summary_kind}" + f"{row.release_name} | {row.version} | {row.date} | roles: {row.roles}" for row in selection_df.itertuples() ] -selected_label = st.selectbox("Trend entry", selection_labels) -selected_row = selection_df.iloc[selection_labels.index(selected_label)] -selected_metadata = load_trend_metadata_file(selected_row["metadata_path"]) -selected_summary = load_trend_summary_file(selected_row["summary_path"]) +selected_label = st.selectbox("Release Group", selection_labels) +selected_release = selection_df.iloc[selection_labels.index(selected_label)] +selected_group = next(group for group in groups if group.group_key == selected_release["group_key"]) + +group_manifest = { + "display_name": selected_group.display_name, + "topic_name": selected_group.topic_name, + "group_kind": selected_group.group_kind, + "base_dir": str(selected_group.base_dir), + "jobs": { + role: { + "job_id": payload["job_id"], + "metadata_path": str(payload["metadata_path"]), + "summary_path": str(payload["summary_path"]), + } + for role, payload in selected_group.jobs.items() + }, +} 
-detail_col1, detail_col2 = st.columns([1.0, 1.25]) +detail_col1, detail_col2 = st.columns([0.9, 1.1]) with detail_col1: - st.markdown("**Metadata YAML**") - st.code(json.dumps(selected_metadata, ensure_ascii=False, indent=2), language="json") + st.markdown("**Release Group Manifest**") + st.code(json.dumps(group_manifest, ensure_ascii=False, indent=2), language="json") + role_choice = st.selectbox("Child Role", sorted(selected_group.jobs.keys())) with detail_col2: - st.markdown("**Summary JSON**") - st.code(json.dumps(selected_summary, ensure_ascii=False, indent=2)[:30000], language="json") + st.markdown("**Selected Child Summary JSON**") + st.code( + json.dumps(selected_group.jobs[role_choice]["summary"], ensure_ascii=False, indent=2)[:30000], + language="json", + ) From 6916073aca8844a185b3ace4c50a7dbe35d299c6 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Wed, 20 May 2026 12:01:06 +0900 Subject: [PATCH 76/94] feat: update user label handling and enhance UI styles in evaluator workflow - Changed default user label to "(Auto)" in recent evaluator job card rendering for improved clarity. - Updated user label retrieval logic to default to "(Auto)" when no name is available, enhancing consistency. - Added new CSS styles for linked references and button adjustments in the evaluator workflow, improving UI aesthetics and usability. - Enhanced local run details rendering to include project ID and improved source reference handling, ensuring better traceability. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../lib/ui/recent_evaluator_jobs.py | 25 +- ...{7_Evaluator_Workflow.py => 6_Workflow.py} | 508 +++++++++++++----- 2 files changed, 400 insertions(+), 133 deletions(-) rename evaluation_dashboard_app/pages/{7_Evaluator_Workflow.py => 6_Workflow.py} (82%) diff --git a/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py b/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py index 00e2b0f..732015a 100644 --- a/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py +++ b/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py @@ -1097,7 +1097,7 @@ def _inject_recent_evaluator_jobs_styles() -> None: ) -def _render_recent_evaluator_job_card(job: Dict[str, Any], *, user_label: str = "Unknown") -> None: +def _render_recent_evaluator_job_card(job: Dict[str, Any], *, user_label: str = "(Auto)") -> None: """Render one recent evaluator job as a single-row list item.""" variant = html.escape(job.get("status_variant", "unknown")) status = html.escape(_status_display_label(job.get("status", "unknown") or "unknown")) @@ -1113,7 +1113,7 @@ def _render_recent_evaluator_job_card(job: Dict[str, Any], *, user_label: str = created_label = html.escape(job.get("created_label", "—")) git_sha = str(job.get("git_sha", "") or "").strip() source_label = str(job.get("source_label", "") or "—").strip() - user_text = html.escape(user_label or "Unknown") + user_text = html.escape(user_label or "(Auto)") report_url = html.escape(job.get("report_url", "") or "") source_url = str(job.get("git_ref_url", "") or job.get("source_url", "") or "").strip() git_commit_url = str(job.get("git_commit_url", "") or "").strip() @@ -1125,12 +1125,19 @@ def _render_recent_evaluator_job_card(job: Dict[str, Any], *, user_label: str = "canceled": '', }.get(status_variant, '') meta_line = job_id - counts = ( - f'✅ {int(job.get("success", 0))} · ' - f'❌ {int(job.get("failed", 0))} · ' - f'⏹ {int(job.get("canceled", 0))} / ' - 
f'{int(job.get("total", 0))}' - ) + total = int(job.get("total", 0) or 0) + success = int(job.get("success", 0) or 0) + failed = int(job.get("failed", 0) or 0) + canceled = int(job.get("canceled", 0) or 0) + if status_variant == "running" and total == 0 and success == 0 and failed == 0 and canceled == 0: + counts = "Running..." + else: + counts = ( + f'✅ {success} · ' + f'❌ {failed} · ' + f'⏹ {canceled} / ' + f'{total}' + ) title_html = f'{title_text}' if report_url else title_text source_html = _format_source_ref_html(source_label, source_url, git_sha, git_commit_url) catalog_html = ( @@ -1930,7 +1937,7 @@ def _render_job_list() -> None: for job in visible_jobs: subject_id = str(job.get("scheduled_by") or "").strip() user_info = user_directory.get(subject_id, {}) - user_label = str(user_info.get("name") or subject_id or "Unknown").strip() + user_label = str(user_info.get("name") or subject_id or "(Auto)").strip() row_cols = st.columns([9.2, 2.6]) with row_cols[0]: _render_recent_evaluator_job_card(job, user_label=user_label) diff --git a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py b/evaluation_dashboard_app/pages/6_Workflow.py similarity index 82% rename from evaluation_dashboard_app/pages/7_Evaluator_Workflow.py rename to evaluation_dashboard_app/pages/6_Workflow.py index c2431e3..63038f3 100644 --- a/evaluation_dashboard_app/pages/7_Evaluator_Workflow.py +++ b/evaluation_dashboard_app/pages/6_Workflow.py @@ -40,6 +40,7 @@ from lib.run_metadata import ( build_run_search_blob, read_run_metadata, + upsert_run_metadata, ) from lib.ui.recent_evaluator_jobs import ( _fetch_evaluator_job_detail, @@ -329,7 +330,7 @@ def _metadata_text(value: object) -> str: def _run_user_label(subject_id: str, environment: str) -> str: subject = str(subject_id or "").strip() if not subject: - return "—" + return "(Auto)" if not subject.startswith("t4:"): return subject try: @@ -337,7 +338,7 @@ def _run_user_label(subject_id: str, environment: str) -> str: name = 
str(profile.get("name") or subject).strip() return name or subject except Exception: - return subject + return "(Auto)" def _catalog_url(project_id: str, catalog_id: str, metadata_url: str = "") -> str: @@ -351,6 +352,31 @@ def _catalog_url(project_id: str, catalog_id: str, metadata_url: str = "") -> st return "" +@st.cache_data(ttl=24 * 3600, show_spinner=False) +def _catalog_preset_name_map() -> Dict[str, str]: + presets, _, _ = _load_catalog_presets() + mapping: Dict[str, str] = {} + for item in presets: + if not isinstance(item, dict): + continue + catalog_id = str(item.get("catalog_id") or "").strip() + display_name = str(item.get("display_name") or item.get("name") or "").strip() + if catalog_id and display_name: + mapping[catalog_id] = display_name + return mapping + + +def _catalog_label_for_run(catalog_id: str, catalog_name: str) -> str: + resolved_name = str(catalog_name or "").strip() + if resolved_name: + return resolved_name + catalog = str(catalog_id or "").strip() + if not catalog: + return "—" + preset_match = _catalog_preset_name_map().get(catalog, "").strip() + return preset_match or catalog + + @st.cache_data(ttl=15, show_spinner=False) def _load_local_runs() -> List[Dict[str, object]]: runs: List[Dict[str, object]] = [] @@ -397,7 +423,7 @@ def _load_local_runs() -> List[Dict[str, object]]: or "" ).strip() catalog_name = str(evaluator_meta.get("catalog_name") or "").strip() - catalog_label = catalog_name or catalog_id + catalog_label = _catalog_label_for_run(catalog_id, catalog_name) catalog_url = _catalog_url( str(request_meta.get("project_id") or "").strip(), catalog_id, @@ -425,6 +451,7 @@ def _load_local_runs() -> List[Dict[str, object]]: runs.append( { "name": info["name"], + "run_path": run_path, "path_display": f"{get_data_root_display()}/{info['name']}", "size": format_size(info["size_bytes"]), "mtime": float(info["mtime"] or 0), @@ -438,12 +465,14 @@ def _load_local_runs() -> List[Dict[str, object]]: "requested_by": requested_by, 
"requested_by_label": requested_by_label, "environment": environment, + "project_id": str(request_meta.get("project_id") or "").strip(), "task_type": task_type, "task_status": task_status, "evaluator_job_id": evaluator_job_id, "evaluator_report_url": evaluator_report_url, "evaluator_title": evaluator_title, "evaluator_target": evaluator_target, + "branch_label": evaluator_target, "evaluator_git_sha": str(evaluator_meta.get("git_sha") or "").strip(), "evaluator_git_ref_url": str(evaluator_meta.get("git_ref_url") or "").strip(), "evaluator_git_commit_url": str(evaluator_meta.get("git_commit_url") or "").strip(), @@ -529,6 +558,32 @@ def _inject_workflow_page_styles() -> None: font-size: 0.9rem; line-height: 1.5; } + .wf-filter-strip, + .wf-pager-strip { + border: none; + background: linear-gradient(180deg, rgba(248,250,252,0.72) 0%, rgba(248,250,252,0.28) 100%); + border-radius: 14px; + padding: 0.72rem 0.78rem 0.28rem 0.78rem; + box-shadow: none; + margin-bottom: 0.32rem; + } + .wf-filter-strip { + margin-top: 0.12rem; + } + .wf-pager-strip { + padding-top: 0.28rem; + padding-bottom: 0.28rem; + } + .wf-pager-summary { + padding-top: 0.2rem; + color: #475569; + font-size: 0.82rem; + line-height: 1.35; + } + .wf-pager-summary strong { + color: #0f172a; + font-weight: 700; + } .wf-meta-inline { margin-top: 0.2rem; color: #64748b; @@ -541,6 +596,20 @@ def _inject_workflow_page_styles() -> None: .wf-meta-inline a:hover { text-decoration: underline; } + .wf-linked-ref { + margin-top: 0.35rem; + color: #475569; + font-size: 0.82rem; + line-height: 1.35; + } + .wf-linked-ref a { + color: #0f766e; + text-decoration: none; + font-weight: 600; + } + .wf-linked-ref a:hover { + text-decoration: underline; + } .wf-run-list { display: block; margin-top: 0.35rem; @@ -629,11 +698,11 @@ def _inject_workflow_page_styles() -> None: letter-spacing: 0.01em; } .wf-compare-bar { - border: 1px solid rgba(148, 163, 184, 0.24); - background: linear-gradient(135deg, #f8fafc 0%, #ecfeff 100%); 
+ border: none; + background: linear-gradient(135deg, rgba(248,250,252,0.65) 0%, rgba(236,254,255,0.55) 100%); border-radius: 12px; - padding: 0.7rem 0.85rem; - margin: 0.35rem 0 0.55rem 0; + padding: 0.62rem 0.78rem; + margin: 0.18rem 0 0.4rem 0; } .wf-compare-title { margin: 0; @@ -654,7 +723,30 @@ def _inject_workflow_page_styles() -> None: transform: scale(1.2); } [class*="st-key-workflow_runs_page_select"] div[data-baseweb="select"] { - min-height: 2rem; + min-height: 1.72rem; + } + [class*="st-key-workflow_runs_page_select"] [data-baseweb="select"] > div { + min-height: 1.72rem; + font-size: 0.8rem; + } + [class*="st-key-workflow_runs_page_prev"] button, + [class*="st-key-workflow_runs_page_next"] button { + min-height: 1.72rem; + height: 1.72rem; + padding: 0 0.3rem; + font-size: 0.8rem; + line-height: 1; + } + [class*="st-key-workflow_run_details__"] button, + [class*="st-key-workflow_run_download__"] button, + [class*="st-key-workflow_run_delete__"] button, + [class*="st-key-workflow_local_run_retest__"] button { + white-space: nowrap; + min-height: 2.2rem; + font-size: 0.72rem; + padding-left: 0.35rem; + padding-right: 0.35rem; + letter-spacing: 0.01em; } .wf-launcher { border: 1px solid rgba(20, 184, 166, 0.22); @@ -803,6 +895,60 @@ def _render_local_runs_header() -> None: header_cols[9].markdown('
Actions
', unsafe_allow_html=True) +def _run_needs_source_backfill(run: Dict[str, object]) -> bool: + return bool( + str(run.get("evaluator_job_id") or "").strip() + and str(run.get("project_id") or "").strip() + and ( + not str(run.get("evaluator_git_ref_url") or "").strip() + or not str(run.get("evaluator_git_commit_url") or "").strip() + or not str(run.get("evaluator_source_url") or "").strip() + or not str(run.get("evaluator_git_sha") or "").strip() + ) + ) + + +def _backfill_local_run_source_metadata(runs: List[Dict[str, object]]) -> Dict[str, int]: + updated = 0 + skipped = 0 + failed = 0 + for run in runs: + if not _run_needs_source_backfill(run): + skipped += 1 + continue + run_path = run.get("run_path") + if not isinstance(run_path, Path): + failed += 1 + continue + project_id = str(run.get("project_id") or "").strip() + environment = str(run.get("environment") or "default").strip() or "default" + evaluator_job_id = str(run.get("evaluator_job_id") or "").strip() + try: + detail = _fetch_evaluator_job_detail(project_id, environment, evaluator_job_id) + except Exception: + failed += 1 + continue + + patch = { + "evaluator": { + "target": str(detail.get("source_label") or run.get("evaluator_target") or "").strip(), + "git_sha": str(detail.get("git_sha") or run.get("evaluator_git_sha") or "").strip(), + "git_ref_url": str(detail.get("git_ref_url") or run.get("evaluator_git_ref_url") or "").strip(), + "git_commit_url": str(detail.get("git_commit_url") or run.get("evaluator_git_commit_url") or "").strip(), + "source_url": str(detail.get("source_url") or run.get("evaluator_source_url") or "").strip(), + "source_repo_label": str(detail.get("source_repo_label") or run.get("evaluator_source_repo_label") or "").strip(), + "catalog_name": str(detail.get("catalog") or run.get("catalog_name") or "").strip(), + "catalog_url": str(detail.get("catalog_url") or run.get("catalog_url") or "").strip(), + } + } + try: + upsert_run_metadata(run_path, patch, create_missing=False) + 
updated += 1 + except Exception: + failed += 1 + return {"updated": updated, "skipped": skipped, "failed": failed} + + def _render_local_run_row(run: Dict[str, object], *, selected: bool) -> bool: name_raw = str(run["name"]) name = html.escape(name_raw) @@ -889,15 +1035,15 @@ def _render_local_run_row(run: Dict[str, object], *, selected: bool) -> bool: with row_cols[8]: st.markdown(f'
{size_label}
', unsafe_allow_html=True) with row_cols[9]: - action_cols = st.columns([1.0, 1.0, 1.0], gap="small") + action_cols = st.columns([0.78, 0.82, 0.82], gap="small") with action_cols[0]: - if st.button("Info", key=f"workflow_run_details::{name_raw}", use_container_width=True): + if st.button("ℹ", key=f"workflow_run_details::{name_raw}", use_container_width=True, help="Show run details"): st.session_state["workflow_local_run_detail"] = name_raw with action_cols[1]: - if st.button("ZIP", key=f"workflow_run_download::{name_raw}", use_container_width=True): + if st.button("⬇", key=f"workflow_run_download::{name_raw}", use_container_width=True, help="Prepare ZIP download"): st.session_state["workflow_local_run_download"] = name_raw with action_cols[2]: - if st.button("Delete", key=f"workflow_run_delete::{name_raw}", use_container_width=True): + if st.button("🗑", key=f"workflow_run_delete::{name_raw}", use_container_width=True, help="Delete this local run"): st.session_state["workflow_local_run_delete"] = name_raw return bool(checked) @@ -929,10 +1075,16 @@ def _render_local_run_details(run: Dict[str, object]) -> None: or evaluator_detail.get("git_ref_url") or "" ).strip() + source_commit_url = str( + evaluator_meta.get("git_commit_url") + or evaluator_detail.get("git_commit_url") + or "" + ).strip() catalog_url = str(evaluator_detail.get("catalog_url") or "").strip() source_label = str(evaluator_meta.get("target") or evaluator_detail.get("source_label") or evaluator_target or "").strip() source_git_sha = str(evaluator_meta.get("git_sha") or evaluator_detail.get("git_sha") or "").strip() source_ref_text = _format_source_ref_text(source_label or evaluator_target, source_git_sha) + source_ref_html = _format_source_ref_html(source_label or evaluator_target, source_url, source_git_sha, source_commit_url) with st.container(border=True): title_cols = st.columns([3.4, 1.0]) @@ -1032,6 +1184,11 @@ def _render_local_run_details(run: Dict[str, object]) -> None: disabled=True, 
key=f"run_detail_source_ref::{run['name']}", ) + if source_ref_html and source_ref_html != "—": + st.markdown( + f'
GitHub: {source_ref_html}
', + unsafe_allow_html=True, + ) if evaluator_meta: eval_cols = st.columns(4) @@ -1131,21 +1288,29 @@ def _render_local_runs_section() -> None: if not runs: st.markdown('
No finished runs were found on this server yet.
', unsafe_allow_html=True) return - - if "workflow_runs_search_applied" not in st.session_state: - st.session_state["workflow_runs_search_applied"] = st.session_state.get("workflow_runs_search", "") - if "workflow_runs_summary_filter_applied" not in st.session_state: - st.session_state["workflow_runs_summary_filter_applied"] = bool(st.session_state.get("workflow_runs_summary_filter", False)) - if "workflow_runs_parquet_filter_applied" not in st.session_state: - st.session_state["workflow_runs_parquet_filter_applied"] = bool(st.session_state.get("workflow_runs_parquet_filter", False)) - if "workflow_runs_user_filter_applied" not in st.session_state: - st.session_state["workflow_runs_user_filter_applied"] = str(st.session_state.get("workflow_runs_user_filter", "All users")) - if "workflow_runs_date_from_applied" not in st.session_state: - st.session_state["workflow_runs_date_from_applied"] = st.session_state.get("workflow_runs_date_from", None) - if "workflow_runs_date_to_applied" not in st.session_state: - st.session_state["workflow_runs_date_to_applied"] = st.session_state.get("workflow_runs_date_to", None) - if "workflow_runs_page_size_applied" not in st.session_state: - st.session_state["workflow_runs_page_size_applied"] = int(st.session_state.get("workflow_runs_page_size", 10) or 10) + missing_source_runs = sum(1 for run in runs if _run_needs_source_backfill(run)) + local_runs_toolbar_cols = st.columns([4.2, 1.2]) + with local_runs_toolbar_cols[0]: + if missing_source_runs: + st.caption(f"{missing_source_runs} run(s) are missing stored GitHub metadata.") + with local_runs_toolbar_cols[1]: + if missing_source_runs and st.button( + "Backfill GitHub", + key="workflow_backfill_local_run_source_meta", + use_container_width=True, + ): + with st.spinner("Backfilling missing GitHub metadata for local runs..."): + result = _backfill_local_run_source_metadata(runs) + _load_local_runs.clear() + if result["failed"]: + st.warning( + f"Backfill updated {result['updated']} 
run(s), skipped {result['skipped']} run(s), failed on {result['failed']} run(s)." + ) + else: + st.success( + f"Backfill updated {result['updated']} run(s); {result['skipped']} run(s) already had metadata." + ) + st.rerun() current_user_id = str(get_task_list_current_user() or "").strip() user_options = ["All users"] @@ -1153,13 +1318,15 @@ def _render_local_runs_section() -> None: user_options.append("My runs") unique_users = [] seen_users = set() - user_option_subject_map = {"All users": "", "My runs": current_user_id} + user_option_subject_map = {"All users": "", "My runs": current_user_id, "(Auto)": "__auto__"} for row in runs: subject_id = str(row.get("requested_by") or "").strip() label = str(row.get("requested_by_label") or "").strip() + option = label or "(Auto)" if not subject_id: + if "(Auto)" not in user_options: + user_options.append("(Auto)") continue - option = label or "Unknown" deduped_option = option suffix = 2 while deduped_option in seen_users and user_option_subject_map.get(deduped_option) != subject_id: @@ -1170,97 +1337,170 @@ def _render_local_runs_section() -> None: seen_users.add(deduped_option) user_option_subject_map[deduped_option] = subject_id user_options.extend(unique_users) - applied_user_option = st.session_state.get("workflow_runs_user_filter_applied", "All users") - if applied_user_option not in user_options: - applied_user_option = "All users" - st.session_state["workflow_runs_user_filter_applied"] = applied_user_option - - with st.form("workflow_local_runs_filters", border=False): - control_cols = st.columns([1.8, 1.25, 1.05, 1.05, 0.72, 0.72, 0.65, 0.76]) - with control_cols[0]: - st.markdown('
Search
', unsafe_allow_html=True) - run_search_input = st.text_input( - "Search runs", - value=st.session_state.get("workflow_runs_search_applied", ""), - key="workflow_runs_search", - label_visibility="collapsed", - placeholder="Filter by name, description, job id, catalog, user", - ) - with control_cols[1]: - st.markdown('
User
', unsafe_allow_html=True) - user_filter_input = st.selectbox( - "User", - options=user_options, - index=user_options.index(applied_user_option), - key="workflow_runs_user_filter", - label_visibility="collapsed", - ) - with control_cols[2]: - st.markdown('
From
', unsafe_allow_html=True) - date_from_input = st.date_input( - "From", - value=st.session_state.get("workflow_runs_date_from_applied", None), - key="workflow_runs_date_from", - label_visibility="collapsed", - help="Run modified-date lower bound in JST.", - ) - with control_cols[3]: - st.markdown('
To
', unsafe_allow_html=True) - date_to_input = st.date_input( - "To", - value=st.session_state.get("workflow_runs_date_to_applied", None), - key="workflow_runs_date_to", - label_visibility="collapsed", - help="Run modified-date upper bound in JST.", - ) - with control_cols[4]: - st.markdown('
Summary
', unsafe_allow_html=True) - require_summary_input = st.toggle( - "Summary only", - value=bool(st.session_state.get("workflow_runs_summary_filter_applied", False)), - key="workflow_runs_summary_filter", - label_visibility="collapsed", - ) - with control_cols[5]: - st.markdown('
Parquet
', unsafe_allow_html=True) - require_parquet_input = st.toggle( - "Parquet only", - value=bool(st.session_state.get("workflow_runs_parquet_filter_applied", False)), - key="workflow_runs_parquet_filter", + + catalog_options = ["All catalogs"] + catalog_option_id_map = {"All catalogs": ""} + unique_catalogs = [] + seen_catalogs = set() + for row in runs: + catalog_id = str(row.get("catalog_id") or "").strip() + catalog_label = str(row.get("catalog_label") or row.get("catalog_name") or catalog_id or "—").strip() + if not catalog_id: + continue + option = catalog_label or catalog_id + deduped_option = option + suffix = 2 + while deduped_option in seen_catalogs and catalog_option_id_map.get(deduped_option) != catalog_id: + deduped_option = f"{option} [{suffix}]" + suffix += 1 + if deduped_option not in seen_catalogs: + unique_catalogs.append(deduped_option) + seen_catalogs.add(deduped_option) + catalog_option_id_map[deduped_option] = catalog_id + catalog_options.extend(sorted(unique_catalogs, key=str.lower)) + + current_user_option = str(st.session_state.get("workflow_runs_user_filter", "All users")) + if current_user_option not in user_options: + current_user_option = "All users" + st.session_state["workflow_runs_user_filter"] = current_user_option + current_catalog_option = str(st.session_state.get("workflow_runs_catalog_filter", "All catalogs")) + if current_catalog_option not in catalog_options: + current_catalog_option = "All catalogs" + st.session_state["workflow_runs_catalog_filter"] = current_catalog_option + branch_options = ["All branches"] + unique_branches = sorted( + { + str(row.get("branch_label") or row.get("evaluator_target") or "").strip() + for row in runs + if str(row.get("branch_label") or row.get("evaluator_target") or "").strip() + }, + key=str.lower, + ) + branch_options.extend(unique_branches) + current_branch_option = str(st.session_state.get("workflow_runs_branch_filter", "All branches")) + if current_branch_option not in branch_options: + 
current_branch_option = "All branches" + st.session_state["workflow_runs_branch_filter"] = current_branch_option + + st.markdown('
', unsafe_allow_html=True) + control_cols = st.columns([1.7, 1.15, 1.1, 0.95, 0.95]) + with control_cols[0]: + st.markdown('
Search
', unsafe_allow_html=True) + run_search_input = st.text_input( + "Search runs", + value=st.session_state.get("workflow_runs_search", ""), + key="workflow_runs_search", + label_visibility="collapsed", + placeholder="Filter by name, description, job id, catalog, user", + ) + with control_cols[1]: + st.markdown('
Catalog
', unsafe_allow_html=True) + catalog_filter_input = st.selectbox( + "Catalog", + options=catalog_options, + index=catalog_options.index(current_catalog_option), + key="workflow_runs_catalog_filter", + label_visibility="collapsed", + ) + with control_cols[2]: + st.markdown('
Branch
', unsafe_allow_html=True) + branch_filter_input = st.selectbox( + "Branch", + options=branch_options, + index=branch_options.index(current_branch_option), + key="workflow_runs_branch_filter", + label_visibility="collapsed", + ) + with control_cols[3]: + st.markdown('
User
', unsafe_allow_html=True) + user_filter_input = st.selectbox( + "User", + options=user_options, + index=user_options.index(current_user_option), + key="workflow_runs_user_filter", + label_visibility="collapsed", + ) + with control_cols[4]: + st.markdown('
Rows
', unsafe_allow_html=True) + page_size_input = int( + st.selectbox( + "Rows", + options=[10, 20, 50, 100], + index=[10, 20, 50, 100].index(int(st.session_state.get("workflow_runs_page_size", 10) or 10)), + key="workflow_runs_page_size", label_visibility="collapsed", ) - with control_cols[6]: - st.markdown('
Rows
', unsafe_allow_html=True) - page_size_input = int( - st.selectbox( - "Rows", - options=[10, 20, 50, 100], - index=[10, 20, 50, 100].index(int(st.session_state.get("workflow_runs_page_size_applied", 10) or 10)), - key="workflow_runs_page_size", - label_visibility="collapsed", - ) - ) - with control_cols[7]: - st.markdown('
Apply
', unsafe_allow_html=True) - apply_filters = st.form_submit_button("Apply", use_container_width=True) - - if apply_filters: - st.session_state["workflow_runs_search_applied"] = run_search_input - st.session_state["workflow_runs_user_filter_applied"] = user_filter_input - st.session_state["workflow_runs_date_from_applied"] = date_from_input - st.session_state["workflow_runs_date_to_applied"] = date_to_input - st.session_state["workflow_runs_summary_filter_applied"] = bool(require_summary_input) - st.session_state["workflow_runs_parquet_filter_applied"] = bool(require_parquet_input) - st.session_state["workflow_runs_page_size_applied"] = int(page_size_input) + ) + + second_control_cols = st.columns([0.92, 0.92, 0.6, 0.6, 2.4]) + with second_control_cols[0]: + st.markdown('
From
', unsafe_allow_html=True) + date_from_input = st.date_input( + "From", + value=st.session_state.get("workflow_runs_date_from", None), + key="workflow_runs_date_from", + label_visibility="collapsed", + help="Run modified-date lower bound in JST.", + ) + with second_control_cols[1]: + st.markdown('
To
', unsafe_allow_html=True) + date_to_input = st.date_input( + "To", + value=st.session_state.get("workflow_runs_date_to", None), + key="workflow_runs_date_to", + label_visibility="collapsed", + help="Run modified-date upper bound in JST.", + ) + with second_control_cols[2]: + st.markdown('
Summary
', unsafe_allow_html=True) + require_summary_input = st.toggle( + "Summary only", + value=bool(st.session_state.get("workflow_runs_summary_filter", False)), + key="workflow_runs_summary_filter", + label_visibility="collapsed", + ) + with second_control_cols[3]: + st.markdown('
Parquet
', unsafe_allow_html=True) + require_parquet_input = st.toggle( + "Parquet only", + value=bool(st.session_state.get("workflow_runs_parquet_filter", False)), + key="workflow_runs_parquet_filter", + label_visibility="collapsed", + ) + with second_control_cols[4]: + st.markdown( + '
Pick a catalog, branch, or user directly, or narrow with text and dates.
', + unsafe_allow_html=True, + ) + st.markdown('
', unsafe_allow_html=True) + + current_filter_signature = ( + str(run_search_input or ""), + str(catalog_filter_input or "All catalogs"), + str(branch_filter_input or "All branches"), + str(user_filter_input or "All users"), + date_from_input, + date_to_input, + bool(require_summary_input), + bool(require_parquet_input), + int(page_size_input), + ) + previous_filter_signature = st.session_state.get("workflow_runs_filter_signature") + if previous_filter_signature is None: + st.session_state["workflow_runs_filter_signature"] = current_filter_signature + elif previous_filter_signature != current_filter_signature: + st.session_state["workflow_runs_filter_signature"] = current_filter_signature st.session_state["workflow_runs_page"] = 1 - run_search = str(st.session_state.get("workflow_runs_search_applied", "")).strip().lower() - selected_user_filter = str(st.session_state.get("workflow_runs_user_filter_applied", "All users")).strip() - selected_date_from = st.session_state.get("workflow_runs_date_from_applied", None) - selected_date_to = st.session_state.get("workflow_runs_date_to_applied", None) - require_summary = bool(st.session_state.get("workflow_runs_summary_filter_applied", False)) - require_parquet = bool(st.session_state.get("workflow_runs_parquet_filter_applied", False)) - page_size = int(st.session_state.get("workflow_runs_page_size_applied", 10) or 10) + run_search = str(run_search_input).strip().lower() + selected_catalog_filter = str(catalog_filter_input).strip() + selected_branch_filter = str(branch_filter_input).strip() + selected_user_filter = str(user_filter_input).strip() + selected_date_from = date_from_input + selected_date_to = date_to_input + require_summary = bool(require_summary_input) + require_parquet = bool(require_parquet_input) + page_size = int(page_size_input) if selected_date_from and selected_date_to and selected_date_from > selected_date_to: st.warning("`From` date must be earlier than or equal to `To` date.") @@ -1269,8 +1509,18 @@ 
def _render_local_runs_section() -> None: filtered = runs if run_search: filtered = [row for row in filtered if run_search in str(row.get("search_blob") or row["name"]).lower()] + if selected_catalog_filter not in ("", "All catalogs"): + selected_catalog_id = str(catalog_option_id_map.get(selected_catalog_filter) or "").strip() + filtered = [row for row in filtered if str(row.get("catalog_id") or "").strip() == selected_catalog_id] + if selected_branch_filter not in ("", "All branches"): + filtered = [ + row for row in filtered + if str(row.get("branch_label") or row.get("evaluator_target") or "").strip() == selected_branch_filter + ] if selected_user_filter == "My runs" and current_user_id: filtered = [row for row in filtered if str(row.get("requested_by") or "").strip() == current_user_id] + elif selected_user_filter == "(Auto)": + filtered = [row for row in filtered if not str(row.get("requested_by") or "").strip()] elif selected_user_filter not in ("", "All users", "My runs"): selected_subject_id = str(user_option_subject_map.get(selected_user_filter) or "").strip() filtered = [row for row in filtered if str(row.get("requested_by") or "").strip() == selected_subject_id] @@ -1311,14 +1561,19 @@ def _render_local_runs_section() -> None: visible_runs = filtered[start_idx:start_idx + page_size] visible_names = {str(run["name"]) for run in visible_runs} - pager_cols = st.columns([0.9, 1.2, 4.1]) + visible_end = min(len(filtered), start_idx + len(visible_runs)) + st.markdown('
', unsafe_allow_html=True) + pager_cols = st.columns([0.65, 1.0, 0.65, 3.2]) with pager_cols[0]: - st.markdown('
Page
', unsafe_allow_html=True) + if st.button("‹", key="workflow_runs_page_prev", use_container_width=True, disabled=current_page <= 1): + current_page -= 1 + st.session_state[page_key] = current_page + st.rerun() + with pager_cols[1]: selected_page = st.selectbox( "Page", options=list(range(1, page_count + 1)), index=max(0, current_page - 1), - key="workflow_runs_page_select", label_visibility="collapsed", ) if selected_page != current_page: @@ -1327,12 +1582,17 @@ def _render_local_runs_section() -> None: start_idx = (current_page - 1) * page_size visible_runs = filtered[start_idx:start_idx + page_size] visible_names = {str(run["name"]) for run in visible_runs} - with pager_cols[1]: - st.markdown('
Rows
', unsafe_allow_html=True) - st.caption(str(len(visible_runs))) with pager_cols[2]: - st.markdown('
Total
', unsafe_allow_html=True) - st.caption(f"{len(filtered)} runs") + if st.button("›", key="workflow_runs_page_next", use_container_width=True, disabled=current_page >= page_count): + current_page += 1 + st.session_state[page_key] = current_page + st.rerun() + with pager_cols[3]: + st.markdown( + f'
{start_idx + 1}{visible_end} of {len(filtered)} runs · {page_size} per page
', + unsafe_allow_html=True, + ) + st.markdown('
', unsafe_allow_html=True) _render_local_runs_header() next_selected = [name for name in st.session_state.get("workflow_compare_runs", []) if name not in visible_names] From 747bb1e4d128d5ed46d6d4aff0c7dbe2a2c39b90 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Wed, 20 May 2026 15:33:12 +0900 Subject: [PATCH 77/94] feat: add new visualization functions and enhance trend insights analysis - Introduced several new functions for visualizing pass rates and scenario counts, including `_build_pass_combo_chart`, `_build_latest_hierarchy_bars`, and `_build_metric_timeline_heatmap`. - Enhanced the `_build_minade_horizon_heatmaps` and `_build_minade_label_profile` functions for improved metric analysis and visualization. - Implemented helper functions for formatting release names and calculating pass rates, streamlining data presentation in the trend insights dashboard. - Updated the layout and styling of visualizations to improve clarity and user engagement. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../pages/13_Trend_Insights.py | 671 +++++++++++++----- 1 file changed, 504 insertions(+), 167 deletions(-) diff --git a/evaluation_dashboard_app/pages/13_Trend_Insights.py b/evaluation_dashboard_app/pages/13_Trend_Insights.py index dbd50a5..1ae6ae8 100644 --- a/evaluation_dashboard_app/pages/13_Trend_Insights.py +++ b/evaluation_dashboard_app/pages/13_Trend_Insights.py @@ -38,6 +38,223 @@ def _select_primary_metadata(group: TrendReleaseGroup) -> dict[str, Any]: return {} +def _release_display_name(version: Any, date: Any, description: Any = "") -> str: + version_text = str(version or "").strip() or "Unknown Version" + date_text = str(date or "").strip() + description_text = str(description or "").strip() + suffix = f" | {date_text}" if date_text else "" + if description_text: + suffix += f" | {description_text}" + return f"{version_text}{suffix}" + + +def _with_pass_rate(frame: pd.DataFrame, *, passed_col: str = "passed", total_col: str = "total") -> 
pd.DataFrame: + enriched = frame.copy() + total = pd.to_numeric(enriched[total_col], errors="coerce") + passed = pd.to_numeric(enriched[passed_col], errors="coerce") + enriched["pass_rate"] = (passed / total.replace(0, pd.NA)) * 100.0 + return enriched + + +def _update_version_axis(fig: go.Figure, versions: list[str]) -> None: + fig.update_xaxes(categoryorder="array", categoryarray=versions) + + +def _build_pass_combo_chart( + frame: pd.DataFrame, + *, + title: str, + versions: list[str], + line_y_col: str = "pass_rate", + series_col: str | None = None, + scenario_count_col: str = "total", + hover_cols: list[str] | None = None, +) -> go.Figure: + fig = go.Figure() + show_legend = series_col is not None + scenario_totals = ( + frame.groupby("version", dropna=False)[scenario_count_col] + .sum() + .reindex(versions) + .fillna(0) + ) + fig.add_bar( + x=versions, + y=scenario_totals.tolist(), + name="Scenario Count", + marker_color="#bfdbfe", + opacity=0.32, + yaxis="y2", + hovertemplate="%{x}
Scenario Count: %{y:.0f}", + ) + + hover_cols = hover_cols or ["date", "release_name", "passed", "total"] + plot_df = frame.copy() + if series_col is None: + fig.add_trace( + go.Scatter( + x=plot_df["version"], + y=plot_df[line_y_col], + name=title, + mode="lines+markers", + line=dict(color="#1d4ed8", width=3), + marker=dict(size=8, color="#1d4ed8"), + customdata=plot_df[hover_cols].to_numpy() if hover_cols else None, + hovertemplate="%{x}
Pass Rate: %{y:.1f}%
Date: %{customdata[0]}
Release: %{customdata[1]}", + ) + ) + else: + palette = px.colors.qualitative.Bold + px.colors.qualitative.Safe + px.colors.qualitative.Set2 + for idx, series_name in enumerate(plot_df[series_col].dropna().astype(str).unique().tolist()): + series_df = plot_df[plot_df[series_col].astype(str) == series_name] + color = palette[idx % len(palette)] + fig.add_trace( + go.Scatter( + x=series_df["version"], + y=series_df[line_y_col], + name=series_name, + mode="lines+markers", + line=dict(color=color, width=3), + marker=dict(size=7, color=color), + customdata=series_df[hover_cols].to_numpy() if hover_cols else None, + hovertemplate=( + "%{x}
" + + f"{series_col.replace('_', ' ').title()}: {series_name}
" + + "Pass Rate: %{y:.1f}%
" + + "Date: %{customdata[0]}
" + + "Release: %{customdata[1]}
" + + "Passed: %{customdata[2]:.0f}
" + + "Total: %{customdata[3]:.0f}" + ), + ) + ) + + fig.update_layout( + title=title, + xaxis_title="Pilot.Auto Version", + yaxis_title="Pass Rate (%)", + yaxis2=dict(title="Scenario Count", overlaying="y", side="right", showgrid=False), + height=440, + showlegend=show_legend, + legend=dict(orientation="h", yanchor="top", y=-0.22, x=0, xanchor="left"), + margin=dict(l=20, r=20, t=80, b=90), + plot_bgcolor="#ffffff", + paper_bgcolor="#ffffff", + ) + fig.update_xaxes(showgrid=False, categoryorder="array", categoryarray=versions) + fig.update_yaxes(range=[0, 100], gridcolor="rgba(148, 163, 184, 0.18)") + return fig + + +def _build_latest_hierarchy_bars(frame: pd.DataFrame, latest_release_name: str) -> go.Figure: + bars = frame.copy() + bars["major_category"] = bars["major_category"].fillna("Unspecified") + bars["mid_category"] = bars["mid_category"].fillna("Unspecified") + bars["label"] = bars["major_category"] + " / " + bars["mid_category"] + bars = bars.sort_values(["major_category", "pass_rate", "total"], ascending=[True, False, False]) + fig = px.bar( + bars, + x="pass_rate", + y="label", + color="major_category", + orientation="h", + hover_data=["passed", "total"], + text=bars["pass_rate"].map(lambda value: f"{value:.1f}%" if pd.notna(value) else "n/a"), + title=f"Latest Release Pass-Rate Hierarchy: {latest_release_name}", + ) + fig.update_layout( + margin=dict(l=20, r=20, t=70, b=20), + xaxis_title="Pass Rate (%)", + yaxis_title="Major / Mid Category", + legend_title_text="Major Category", + ) + fig.update_traces(textposition="outside", cliponaxis=False) + return fig + + +def _build_metric_timeline_heatmap( + frame: pd.DataFrame, + *, + value_col: str, + title: str, + color_title: str, +) -> go.Figure: + matrix = frame.pivot_table( + index="label_name", + columns="release_axis", + values=value_col, + aggfunc="first", + ).dropna(how="all") + fig = px.imshow( + matrix, + aspect="auto", + color_continuous_scale=["#7f1d1d", "#f8fafc", "#14532d"] if "delta" in 
value_col else ["#f8fafc", "#8dd3c7", "#0f766e"], + color_continuous_midpoint=0 if "delta" in value_col else None, + text_auto=".3f", + ) + fig.update_layout( + title=title, + margin=dict(l=20, r=20, t=70, b=20), + coloraxis_colorbar=dict(title=color_title), + ) + return fig + + +def _build_metric_label_lines( + frame: pd.DataFrame, + *, + title: str, + ordered_axes: list[str], +) -> go.Figure: + fig = px.line( + frame, + x="release_axis", + y="value", + color="label_name", + markers=True, + hover_data=["version", "date", "release_name"], + title=title, + ) + fig.update_layout(margin=dict(l=20, r=20, t=70, b=20), legend_title_text="Label") + fig.update_xaxes(categoryorder="array", categoryarray=ordered_axes) + return fig + + +def _build_minade_horizon_heatmaps(frame: pd.DataFrame) -> list[tuple[str, go.Figure]]: + figures: list[tuple[str, go.Figure]] = [] + for metric_name in ("minADE@1s", "minADE@3s", "minADE@5s"): + metric_df = frame[frame["metric_name"] == metric_name].copy() + if metric_df.empty: + continue + fig = _build_metric_timeline_heatmap( + metric_df, + value_col="value", + title=f"{metric_name} Timeline Heatmap", + color_title=metric_name, + ) + figures.append((metric_name, fig)) + return figures + + +def _build_minade_label_profile(frame: pd.DataFrame, *, selected_label: str, ordered_axes: list[str]) -> go.Figure: + profile_df = frame[ + (frame["metric_name"].isin(["minADE@1s", "minADE@3s", "minADE@5s"])) + & (frame["label_name"] == selected_label) + ].copy() + fig = px.line( + profile_df, + x="release_axis", + y="value", + color="metric_name", + markers=True, + hover_data=["version", "date", "release_name"], + title=f"{selected_label} minADE Horizon Profile", + ) + fig.update_layout(margin=dict(l=20, r=20, t=70, b=20), legend_title_text="Horizon") + fig.update_xaxes(categoryorder="array", categoryarray=ordered_axes) + return fig + + def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: 
release_rows: list[dict[str, Any]] = [] case_rows: list[dict[str, Any]] = [] @@ -133,12 +350,24 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame release_df = pd.DataFrame(release_rows) if not release_df.empty: release_df["date_sort"] = pd.to_datetime(release_df["date"], format="%Y.%m.%d", errors="coerce") + release_df["release_display"] = release_df.apply( + lambda row: _release_display_name(row["version"], row["date"], row["description"]), + axis=1, + ) case_df = pd.DataFrame(case_rows) if not case_df.empty: case_df["date_sort"] = pd.to_datetime(case_df["date"], format="%Y.%m.%d", errors="coerce") + case_df["release_display"] = case_df.apply( + lambda row: _release_display_name(row["version"], row["date"], row["description"]), + axis=1, + ) metric_df = pd.DataFrame(metric_rows) if not metric_df.empty: metric_df["date_sort"] = pd.to_datetime(metric_df["date"], format="%Y.%m.%d", errors="coerce") + metric_df["release_display"] = metric_df.apply( + lambda row: _release_display_name(row["version"], row["date"], row["description"]), + axis=1, + ) return release_df, case_df, metric_df @@ -172,15 +401,17 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame top5.metric("Latest Date", release_df.sort_values("date_sort")["date"].iloc[-1] if not release_df.empty else "n/a") inventory_cols = [ + "release_display", "version", "date", "description", "data_count", + "mAP", + "overall_pass_rate", "roles", "full_job_id", "usecase_job_id", "devops_job_id", - "release_name", "topic_name", "group_kind", ] @@ -321,73 +552,16 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame if not metric_df.empty: atlas_df = metric_df[metric_df["block_header"] == "全数データセット評価"].copy() atlas_df = atlas_df.sort_values(["date_sort", "version", "release_name"], ascending=[True, True, True]) + atlas_df["release_axis"] = atlas_df["version"].astype(str) + " | " + atlas_df["date"].astype(str) latest_group_key = 
perf_entries.iloc[-1]["group_key"] if not perf_entries.empty else None previous_group_key = perf_entries.iloc[-2]["group_key"] if len(perf_entries) >= 2 else None - latest_release_name = perf_entries.iloc[-1]["release_name"] if not perf_entries.empty else "" - - atlas_col1, atlas_col2 = st.columns([1.0, 1.0]) - if latest_group_key is not None: - latest_matrix = atlas_df[atlas_df["group_key"] == latest_group_key].pivot_table( - index="metric_name", - columns="label_name", - values="value", - aggfunc="first", - ).dropna(how="all") - if not latest_matrix.empty: - latest_min = latest_matrix.min(axis=1) - latest_range = (latest_matrix.max(axis=1) - latest_min).replace(0, 1) - latest_norm = latest_matrix.sub(latest_min, axis=0).div(latest_range, axis=0) - atlas_fig = px.imshow( - latest_norm, - aspect="auto", - color_continuous_scale=["#f8fafc", "#8dd3c7", "#0f766e"], - text_auto=".2f", - ) - atlas_fig.update_traces( - text=latest_matrix.round(2).astype(str), - hovertemplate="Metric: %{y}
Label: %{x}
Value: %{text}", - ) - atlas_fig.update_layout( - title=f"Latest Release Metric Atlas: {latest_release_name}", - margin=dict(l=20, r=20, t=70, b=20), - coloraxis_colorbar=dict(title="Relative"), - ) - atlas_col1.plotly_chart(atlas_fig, use_container_width=True) - else: - atlas_col1.info("No latest metric atlas is available yet.") - - if latest_group_key is not None and previous_group_key is not None: - latest_matrix = atlas_df[atlas_df["group_key"] == latest_group_key].pivot_table( - index="metric_name", - columns="label_name", - values="value", - aggfunc="first", - ) - previous_matrix = atlas_df[atlas_df["group_key"] == previous_group_key].pivot_table( - index="metric_name", - columns="label_name", - values="value", - aggfunc="first", - ) - delta_matrix = latest_matrix.subtract(previous_matrix, fill_value=pd.NA).dropna(how="all") - if not delta_matrix.empty: - delta_fig = px.imshow( - delta_matrix, - aspect="auto", - color_continuous_scale=["#7f1d1d", "#f8fafc", "#14532d"], - color_continuous_midpoint=0, - text_auto=".2f", - ) - delta_fig.update_layout( - title="Release-over-Release Metric Delta", - margin=dict(l=20, r=20, t=70, b=20), - coloraxis_colorbar=dict(title="Delta"), - ) - atlas_col2.plotly_chart(delta_fig, use_container_width=True) - else: - atlas_col2.info("No previous release is available for metric delta yet.") - else: - atlas_col2.info("Metric delta becomes available after at least two grouped full releases exist.") + latest_release_name = perf_entries.iloc[-1]["version"] if not perf_entries.empty else "" + release_manifest = ( + atlas_df[["group_key", "release_axis", "version", "date", "release_name", "release_display"]] + .drop_duplicates() + .reset_index(drop=True) + ) + ordered_release_axes = release_manifest["release_axis"].tolist() section_header( "Pass Rate Trend", @@ -398,117 +572,283 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame ["date_sort", "version", "release_name"], ascending=[True, True, True], ) 
-pass_col1, pass_col2 = st.columns([1.1, 1.0]) -with pass_col1: - if not pass_entries.empty and pass_entries["overall_pass_rate"].notna().any(): - pass_fig = go.Figure() - pass_fig.add_bar( - x=pass_entries["version"], - y=pass_entries["scenario_count"], - name="Scenario Count", - marker_color="#86efac", - opacity=0.55, - yaxis="y2", - ) - pass_fig.add_trace( - go.Scatter( - x=pass_entries["version"], - y=pass_entries["overall_pass_rate"], - name="Overall Pass Rate", - mode="lines+markers", - line=dict(color="#1d4ed8", width=3), - customdata=pass_entries[["release_name", "date"]].to_numpy(), - hovertemplate="%{x}
Pass Rate: %{y:.1f}%
Release: %{customdata[0]}
Date: %{customdata[1]}", - ) - ) - pass_fig.update_layout( - title="Overall Pass Rate vs Scenario Count", - xaxis_title="Pilot.Auto Version", - yaxis_title="Pass Rate (%)", - yaxis2=dict(title="Scenario Count", overlaying="y", side="right", showgrid=False), - height=520, - legend=dict(orientation="h", yanchor="bottom", y=0.94, x=0, xanchor="left"), - margin=dict(l=20, r=20, t=90, b=20), - ) - st.plotly_chart(pass_fig, use_container_width=True) - else: - st.info("No grouped pass-rate summaries are available yet.") +ordered_versions = pass_entries["version"].drop_duplicates().tolist() +overall_plot_df = pd.DataFrame() +major_summary = pd.DataFrame() +mid_summary = pd.DataFrame() -with pass_col2: - if not case_df.empty: - major_summary = ( - case_df.groupby(["version", "date", "release_name", "major_category"], dropna=False)[["passed", "total"]] - .sum() - .reset_index() - ) - major_summary["pass_rate"] = major_summary["passed"] / major_summary["total"] * 100.0 - major_summary["label"] = major_summary["major_category"].astype(str) - pass_cat_fig = px.line( +if not pass_entries.empty and pass_entries["overall_pass_rate"].notna().any(): + overall_plot_df = pass_entries[ + ["version", "date", "release_name", "overall_pass_rate", "scenario_count"] + ].rename(columns={"overall_pass_rate": "pass_rate", "scenario_count": "total"}).copy() + +if not case_df.empty: + major_summary = ( + case_df.groupby(["version", "date", "release_name", "major_category"], dropna=False)[["passed", "total"]] + .sum() + .reset_index() + ) + major_summary = _with_pass_rate(major_summary) + + mid_summary = ( + case_df.groupby( + ["version", "date", "release_name", "major_category", "mid_category"], + dropna=False, + )[["passed", "total"]] + .sum() + .reset_index() + ) + mid_summary = _with_pass_rate(mid_summary) + +if not overall_plot_df.empty: + st.plotly_chart( + _build_pass_combo_chart( + overall_plot_df, + title="Overall Pass Rate", + versions=ordered_versions, + series_col=None, + 
hover_cols=["date", "release_name"], + ), + use_container_width=True, + ) +else: + st.info("No grouped pass-rate summaries are available yet.") + +if not major_summary.empty: + st.plotly_chart( + _build_pass_combo_chart( major_summary, - x="version", - y="pass_rate", - color="label", - markers=True, - hover_data=["date", "release_name", "passed", "total"], title="Major Category Pass Rate", - ) - pass_cat_fig.update_layout(margin=dict(l=20, r=20, t=60, b=20)) - st.plotly_chart(pass_cat_fig, use_container_width=True) + versions=ordered_versions, + series_col="major_category", + ), + use_container_width=True, + ) + +if not mid_summary.empty: + mid_summary_all = mid_summary.drop(columns=["major_category"], errors="ignore") + st.plotly_chart( + _build_pass_combo_chart( + mid_summary_all, + title="Mid Category Pass Rate", + versions=ordered_versions, + series_col="mid_category", + ), + use_container_width=True, + ) + st.caption( + "These three charts share the same grouped DevOps source, version order, scenario-count backdrop, and pass-rate scale so you can compare overall, major-category, and mid-category movement directly." 
+ ) + +section_header( + "Deep Dive Explorer", + "Use this final section when you want to inspect the latest release state, compare label-level metric atlases against a baseline, or browse grouped raw details.", +) if not case_df.empty: - pass_detail_col1, pass_detail_col2 = st.columns([1.0, 1.0]) - with pass_detail_col1: - mid_summary = ( - case_df.groupby(["mid_category", "version"], dropna=False)[["passed", "total"]] - .sum() - .reset_index() - ) - mid_summary["pass_rate"] = mid_summary["passed"] / mid_summary["total"] * 100.0 - mid_matrix = mid_summary.pivot_table( - index="mid_category", - columns="version", - values="pass_rate", - aggfunc="first", - ) - mid_fig = px.imshow( - mid_matrix, - aspect="auto", - color_continuous_scale=["#7f1d1d", "#fef3c7", "#166534"], - zmin=0, - zmax=100, - text_auto=".1f", - ) - mid_fig.update_layout( - title="Mid Category Pass Rate Matrix", - margin=dict(l=20, r=20, t=70, b=20), - coloraxis_colorbar=dict(title="%"), - ) - st.plotly_chart(mid_fig, use_container_width=True) - - with pass_detail_col2: - latest_devops_group = pass_entries.iloc[-1]["group_key"] if not pass_entries.empty else None - latest_case_df = case_df[case_df["group_key"] == latest_devops_group].copy() - latest_major_mid = ( - latest_case_df.groupby(["major_category", "mid_category"], dropna=False)[["passed", "total"]] - .sum() - .reset_index() + latest_devops_group = pass_entries.iloc[-1]["group_key"] if not pass_entries.empty else None + latest_case_df = case_df[case_df["group_key"] == latest_devops_group].copy() + latest_major_mid = ( + latest_case_df.groupby(["major_category", "mid_category"], dropna=False)[["passed", "total"]] + .sum() + .reset_index() + ) + latest_major_mid = _with_pass_rate(latest_major_mid) + st.markdown("**Latest Release Snapshot**") + if not latest_major_mid.empty: + latest_view_mode = st.radio( + "Latest Snapshot View", + ["Bars", "Treemap", "Icicle", "Sunburst"], + horizontal=True, ) - latest_major_mid["pass_rate"] = 
latest_major_mid["passed"] / latest_major_mid["total"] * 100.0 - if not latest_major_mid.empty: - sunburst_fig = px.sunburst( + if latest_view_mode == "Bars": + latest_fig = _build_latest_hierarchy_bars(latest_major_mid, pass_entries.iloc[-1]["version"]) + elif latest_view_mode == "Treemap": + latest_fig = px.treemap( latest_major_mid, path=["major_category", "mid_category"], values="total", color="pass_rate", color_continuous_scale=["#7f1d1d", "#fef3c7", "#166534"], range_color=(0, 100), - title="Latest Release Pass-Rate Hierarchy", + title=f"Latest Release Pass-Rate Treemap: {pass_entries.iloc[-1]['version']}", ) - sunburst_fig.update_layout(margin=dict(l=20, r=20, t=70, b=20)) - st.plotly_chart(sunburst_fig, use_container_width=True) + latest_fig.update_layout(margin=dict(l=20, r=20, t=70, b=20)) + elif latest_view_mode == "Icicle": + latest_fig = px.icicle( + latest_major_mid, + path=["major_category", "mid_category"], + values="total", + color="pass_rate", + color_continuous_scale=["#7f1d1d", "#fef3c7", "#166534"], + range_color=(0, 100), + title=f"Latest Release Pass-Rate Icicle: {pass_entries.iloc[-1]['version']}", + ) + latest_fig.update_layout(margin=dict(l=20, r=20, t=70, b=20)) else: - st.info("No latest release pass-rate hierarchy is available yet.") + latest_fig = px.sunburst( + latest_major_mid, + path=["major_category", "mid_category"], + values="total", + color="pass_rate", + color_continuous_scale=["#7f1d1d", "#fef3c7", "#166534"], + range_color=(0, 100), + title=f"Latest Release Pass-Rate Sunburst: {pass_entries.iloc[-1]['version']}", + ) + latest_fig.update_layout(margin=dict(l=20, r=20, t=70, b=20)) + st.plotly_chart(latest_fig, use_container_width=True) + else: + st.info("No latest release pass-rate hierarchy is available yet.") +if not metric_df.empty: + st.markdown("**Metric Atlas Explorer**") + explorer_note_col1, explorer_note_col2 = st.columns([1.2, 1.0]) + with explorer_note_col1: + st.caption( + "Choose one metric and inspect how each 
label evolved across releases. " + "Use the baseline comparison modes for fast regression checks, and use the combined minADE view when you want to compare prediction horizons together." + ) + with explorer_note_col2: + atlas_mode = st.radio( + "Atlas Explorer Mode", + ["Latest Atlas", "Latest vs Previous", "Timeline Heatmap", "Label Trend Lines", "Combined minADE Explorer", "Chosen Baseline Delta"], + horizontal=True, + ) + + metric_options = sorted(atlas_df["metric_name"].dropna().unique().tolist()) + atlas_control_col1, atlas_control_col2 = st.columns([1.0, 1.0]) + with atlas_control_col1: + selected_metric = st.selectbox("Metric", metric_options) + with atlas_control_col2: + baseline_labels = release_manifest["release_axis"].tolist() + default_baseline_index = max(0, len(baseline_labels) - 2) + selected_baseline_axis = st.selectbox( + "Baseline Release", + baseline_labels, + index=default_baseline_index if baseline_labels else 0, + ) + + metric_trend_df = atlas_df[atlas_df["metric_name"] == selected_metric].copy() + if not metric_trend_df.empty: + latest_metric_df = metric_trend_df[metric_trend_df["group_key"] == latest_group_key].copy() + baseline_metric_df = metric_trend_df[metric_trend_df["release_axis"] == selected_baseline_axis].copy() + + if atlas_mode == "Latest Atlas": + latest_matrix = atlas_df[atlas_df["group_key"] == latest_group_key].pivot_table( + index="metric_name", + columns="label_name", + values="value", + aggfunc="first", + ).dropna(how="all") + if not latest_matrix.empty: + latest_min = latest_matrix.min(axis=1) + latest_range = (latest_matrix.max(axis=1) - latest_min).replace(0, 1) + latest_norm = latest_matrix.sub(latest_min, axis=0).div(latest_range, axis=0) + explorer_fig = px.imshow( + latest_norm, + aspect="auto", + color_continuous_scale=["#f8fafc", "#8dd3c7", "#0f766e"], + text_auto=".2f", + ) + explorer_fig.update_traces( + text=latest_matrix.round(2).astype(str), + hovertemplate="Metric: %{y}
Label: %{x}
Value: %{text}", + ) + explorer_fig.update_layout( + title=f"Latest Release Metric Atlas: {latest_release_name}", + margin=dict(l=20, r=20, t=70, b=20), + coloraxis_colorbar=dict(title="Relative"), + ) + else: + explorer_fig = None + st.info("No latest metric atlas is available yet.") + elif atlas_mode == "Latest vs Previous": + if latest_group_key is not None and previous_group_key is not None: + latest_matrix = atlas_df[atlas_df["group_key"] == latest_group_key].pivot_table( + index="metric_name", + columns="label_name", + values="value", + aggfunc="first", + ) + previous_matrix = atlas_df[atlas_df["group_key"] == previous_group_key].pivot_table( + index="metric_name", + columns="label_name", + values="value", + aggfunc="first", + ) + delta_matrix = latest_matrix.subtract(previous_matrix, fill_value=pd.NA).dropna(how="all") + if not delta_matrix.empty: + explorer_fig = px.imshow( + delta_matrix, + aspect="auto", + color_continuous_scale=["#7f1d1d", "#f8fafc", "#14532d"], + color_continuous_midpoint=0, + text_auto=".2f", + ) + explorer_fig.update_layout( + title=f"Release-over-Release Metric Delta: {latest_release_name}", + margin=dict(l=20, r=20, t=70, b=20), + coloraxis_colorbar=dict(title="Delta"), + ) + else: + explorer_fig = None + st.info("No previous release is available for metric delta yet.") + else: + explorer_fig = None + st.info("Metric delta becomes available after at least two grouped full releases exist.") + elif atlas_mode == "Timeline Heatmap": + explorer_fig = _build_metric_timeline_heatmap( + metric_trend_df, + value_col="value", + title=f"{selected_metric} Timeline Heatmap by Label", + color_title=selected_metric, + ) + elif atlas_mode == "Label Trend Lines": + explorer_fig = _build_metric_label_lines( + metric_trend_df, + title=f"{selected_metric} Label Trend Lines", + ordered_axes=ordered_release_axes, + ) + elif atlas_mode == "Combined minADE Explorer": + minade_metrics = {"minADE@1s", "minADE@3s", "minADE@5s"} + minade_df = 
atlas_df[atlas_df["metric_name"].isin(minade_metrics)].copy() + if minade_df.empty: + st.info("No minADE trend data is available yet.") + else: + heatmaps = _build_minade_horizon_heatmaps(minade_df) + heatmap_cols = st.columns(len(heatmaps)) if heatmaps else [] + for col, (_, heatmap_fig) in zip(heatmap_cols, heatmaps): + col.plotly_chart(heatmap_fig, use_container_width=True) + label_options = sorted(minade_df["label_name"].dropna().unique().tolist()) + selected_label = st.selectbox("minADE Label Focus", label_options) + profile_fig = _build_minade_label_profile( + minade_df, + selected_label=selected_label, + ordered_axes=ordered_release_axes, + ) + st.plotly_chart(profile_fig, use_container_width=True) + explorer_fig = None + else: + delta_df = latest_metric_df[["label_name", "value"]].merge( + baseline_metric_df[["label_name", "value"]], + on="label_name", + how="outer", + suffixes=("_latest", "_baseline"), + ) + delta_df["delta_value"] = delta_df["value_latest"] - delta_df["value_baseline"] + delta_df["release_axis"] = f"{latest_release_name} vs baseline" + explorer_fig = _build_metric_timeline_heatmap( + delta_df, + value_col="delta_value", + title=f"{selected_metric} Latest vs Baseline Delta by Label", + color_title="Delta", + ) + if explorer_fig is not None: + st.plotly_chart(explorer_fig, use_container_width=True) + else: + st.info("No metric atlas trend data is available for the selected metric yet.") + +if not case_df.empty: st.markdown("**Case Explorer**") filter_col1, filter_col2, filter_col3 = st.columns(3) with filter_col1: @@ -531,14 +871,11 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame hide_index=True, ) -section_header( - "Grouped Raw Browser", - "Inspect one grouped release and its child roles directly. 
This makes it obvious which full, usecase, and devops job folders are contributing to the combined trend view.", -) +st.markdown("**Grouped Raw Browser**") selection_df = release_df.sort_values(["date_sort", "version", "release_name"], ascending=[False, False, False]).reset_index(drop=True) selection_labels = [ - f"{row.release_name} | {row.version} | {row.date} | roles: {row.roles}" + f"{row.release_display} | roles: {row.roles}" for row in selection_df.itertuples() ] selected_label = st.selectbox("Release Group", selection_labels) From f76364951ce7ae6144fce17bd49808df18a00a62 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Thu, 21 May 2026 10:35:04 +0900 Subject: [PATCH 78/94] feat: enhance trend insights visualizations and refactor metric handling - Updated the `_build_metric_timeline_heatmap` and `_build_minade_label_profile` functions to improve axis formatting and layout for better visualization clarity. - Introduced new helper functions for sorting and labeling horizon metrics, enhancing the organization of metric data. - Refactored the `_build_prediction_label_profile` and `_build_prediction_release_label_profile` functions to accommodate dynamic metric families and improve data presentation. - Enhanced the overall structure of the trend insights page to streamline metric analysis and visualization. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../pages/13_Trend_Insights.py | 727 ++++++++++-------- 1 file changed, 385 insertions(+), 342 deletions(-) diff --git a/evaluation_dashboard_app/pages/13_Trend_Insights.py b/evaluation_dashboard_app/pages/13_Trend_Insights.py index 1ae6ae8..fa2c4db 100644 --- a/evaluation_dashboard_app/pages/13_Trend_Insights.py +++ b/evaluation_dashboard_app/pages/13_Trend_Insights.py @@ -160,7 +160,7 @@ def _build_latest_hierarchy_bars(frame: pd.DataFrame, latest_release_name: str) orientation="h", hover_data=["passed", "total"], text=bars["pass_rate"].map(lambda value: f"{value:.1f}%" if pd.notna(value) else "n/a"), - title=f"Latest Release Pass-Rate Hierarchy: {latest_release_name}", + title=f"Defect Evaluation by Category: {latest_release_name}", ) fig.update_layout( margin=dict(l=20, r=20, t=70, b=20), @@ -197,6 +197,8 @@ def _build_metric_timeline_heatmap( margin=dict(l=20, r=20, t=70, b=20), coloraxis_colorbar=dict(title=color_title), ) + fig.update_xaxes(tickangle=-30, automargin=True) + fig.update_yaxes(automargin=True) return fig @@ -216,29 +218,45 @@ def _build_metric_label_lines( title=title, ) fig.update_layout(margin=dict(l=20, r=20, t=70, b=20), legend_title_text="Label") - fig.update_xaxes(categoryorder="array", categoryarray=ordered_axes) + fig.update_xaxes(categoryorder="array", categoryarray=ordered_axes, tickangle=-30, automargin=True) return fig -def _build_minade_horizon_heatmaps(frame: pd.DataFrame) -> list[tuple[str, go.Figure]]: - figures: list[tuple[str, go.Figure]] = [] - for metric_name in ("minADE@1s", "minADE@3s", "minADE@5s"): - metric_df = frame[frame["metric_name"] == metric_name].copy() - if metric_df.empty: - continue - fig = _build_metric_timeline_heatmap( - metric_df, - value_col="value", - title=f"{metric_name} Timeline Heatmap", - color_title=metric_name, +def _horizon_metric_sort_key(metric_name: str) -> tuple[float, str]: + horizon_text = str(metric_name).rsplit("@", 
1)[-1].removesuffix("s") + try: + return float(horizon_text), str(metric_name) + except ValueError: + return float("inf"), str(metric_name) + + +def _horizon_metric_label(metric_name: str) -> str: + return str(metric_name).rsplit("@", 1)[-1] if "@" in str(metric_name) else str(metric_name) + + +def _available_prediction_metric_groups(frame: pd.DataFrame) -> dict[str, tuple[str, ...]]: + groups: dict[str, tuple[str, ...]] = {} + metric_series = frame["metric_name"].dropna().astype(str) + for metric_family in ("minADE", "minFDE"): + metric_names = sorted( + metric_series[metric_series.str.startswith(f"{metric_family}@")].unique().tolist(), + key=_horizon_metric_sort_key, ) - figures.append((metric_name, fig)) - return figures + if metric_names: + groups[metric_family] = tuple(metric_names) + return groups -def _build_minade_label_profile(frame: pd.DataFrame, *, selected_label: str, ordered_axes: list[str]) -> go.Figure: +def _build_prediction_label_profile( + frame: pd.DataFrame, + *, + selected_label: str, + metric_family: str, + metric_names: tuple[str, ...], + ordered_axes: list[str], +) -> go.Figure: profile_df = frame[ - (frame["metric_name"].isin(["minADE@1s", "minADE@3s", "minADE@5s"])) + (frame["metric_name"].isin(metric_names)) & (frame["label_name"] == selected_label) ].copy() fig = px.line( @@ -248,10 +266,53 @@ def _build_minade_label_profile(frame: pd.DataFrame, *, selected_label: str, ord color="metric_name", markers=True, hover_data=["version", "date", "release_name"], - title=f"{selected_label} minADE Horizon Profile", + title=f"{selected_label} {metric_family} Horizon Profile", ) fig.update_layout(margin=dict(l=20, r=20, t=70, b=20), legend_title_text="Horizon") - fig.update_xaxes(categoryorder="array", categoryarray=ordered_axes) + fig.update_xaxes(categoryorder="array", categoryarray=ordered_axes, tickangle=-30, automargin=True) + return fig + + +def _build_prediction_release_label_profile( + frame: pd.DataFrame, + *, + metric_family: str, + 
selected_release_axis: str, + selected_labels: list[str], + metric_names: tuple[str, ...], +) -> go.Figure | None: + release_df = frame[ + (frame["release_axis"] == selected_release_axis) + & (frame["label_name"].isin(selected_labels)) + & (frame["metric_name"].isin(metric_names)) + ].copy() + if release_df.empty: + return None + + release_df["horizon"] = release_df["metric_name"].map(_horizon_metric_label) + release_df["horizon_sort"] = release_df["metric_name"].map(lambda name: _horizon_metric_sort_key(str(name))[0]) + release_df = release_df.sort_values(["label_name", "horizon_sort"]) + fig = px.line( + release_df, + x="horizon", + y="value", + color="label_name", + markers=True, + category_orders={"horizon": [_horizon_metric_label(metric_name) for metric_name in metric_names]}, + hover_data=["version", "date", "release_name"], + title=f"{metric_family} by Label and Horizon", + ) + fig.update_layout( + height=460, + margin=dict(l=20, r=20, t=70, b=30), + legend_title_text="Label", + xaxis_title="Prediction Horizon", + yaxis_title=f"{metric_family} (m)", + plot_bgcolor="#ffffff", + paper_bgcolor="#ffffff", + ) + fig.update_xaxes(showgrid=False) + fig.update_yaxes(gridcolor="rgba(148, 163, 184, 0.18)") return fig @@ -374,13 +435,10 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame render_page_hero( kicker="Release Analytics", title="Trend Insights", - description="Inspect grouped release trend data the same way the catalog analyzer models it: one release group with sibling full, usecase, and devops job folders under the same topic.", + description="Release-level trends across grouped full, usecase, and devops runs.", ) -section_header( - "Release Inventory", - "Each row below is one grouped release entry. 
When full, usecase, and devops sibling folders exist under the same combined PDF group and topic, they are merged into one release view.", -) +section_header("Release Inventory") groups = discover_trend_release_groups() if not groups: @@ -401,7 +459,6 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame top5.metric("Latest Date", release_df.sort_values("date_sort")["date"].iloc[-1] if not release_df.empty else "n/a") inventory_cols = [ - "release_display", "version", "date", "description", @@ -421,10 +478,7 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame hide_index=True, ) -section_header( - "Performance Trend", - "Full-performance summaries are now plotted one release group at a time, even when they arrived with sibling usecase and devops folders.", -) +section_header("mAP Trend") perf_entries = release_df[release_df["full_job_id"].notna()].sort_values( ["date_sort", "version", "release_name"], @@ -438,124 +492,130 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame "minFDE@3s", "minFDE@5s", ] +if not perf_entries.empty and perf_entries["mAP"].notna().any(): + latest_map_row = perf_entries.dropna(subset=["mAP"]).iloc[-1] + map_card_col1, map_card_col2 = st.columns(2) + map_card_col1.metric( + "Latest mAP", + f"{latest_map_row['mAP']:.3f}" if pd.notna(latest_map_row["mAP"]) else "n/a", + ) + map_card_col2.metric( + "Latest Data Count", + f"{int(latest_map_row['data_count_num']):,}" if pd.notna(latest_map_row["data_count_num"]) else "n/a", + ) + fig = go.Figure() + fig.add_bar( + x=perf_entries["version"], + y=perf_entries["data_count_num"], + name="Data Count", + marker_color="#f4a7a7", + opacity=0.5, + yaxis="y2", + ) + fig.add_trace( + go.Scatter( + x=perf_entries["version"], + y=perf_entries["mAP"], + name="mAP", + mode="lines+markers", + line=dict(color="#0f766e", width=3), + customdata=perf_entries[["release_name", "date", "data_count"]].to_numpy(), + 
hovertemplate="%{x}
mAP: %{y:.3f}
Release: %{customdata[0]}
Date: %{customdata[1]}
Data Count: %{customdata[2]}", + ) + ) + fig.update_layout( + title="mAP Trend", + xaxis_title="Pilot.Auto Version", + yaxis_title="mAP", + yaxis2=dict(title="Data Count", overlaying="y", side="right", showgrid=False), + height=460, + legend=dict(orientation="h", yanchor="bottom", y=0.94, x=0, xanchor="left"), + margin=dict(l=20, r=20, t=90, b=20), + ) + st.plotly_chart(fig, use_container_width=True) +else: + st.info("No grouped mAP trend entries are available yet.") + +section_header("Prediction Trend") if not perf_entries.empty and perf_entries[prediction_cols].notna().any().any(): pred_card_col1, pred_card_col2, pred_card_col3 = st.columns(3) - latest_pred_row = perf_entries.dropna(subset=["minADE@3s", "minFDE@5s"], how="all").iloc[-1] + latest_pred_row = perf_entries.dropna(subset=prediction_cols, how="all").iloc[-1] + latest_minade_mean = pd.to_numeric(latest_pred_row[["minADE@1s", "minADE@3s", "minADE@5s"]], errors="coerce").mean() + latest_minfde_mean = pd.to_numeric(latest_pred_row[["minFDE@1s", "minFDE@3s", "minFDE@5s"]], errors="coerce").mean() pred_card_col1.metric( - "Latest minADE@3s", - f"{latest_pred_row['minADE@3s']:.2f} m" if pd.notna(latest_pred_row["minADE@3s"]) else "n/a", + "Mean minADE", + f"{latest_minade_mean:.2f} m" if pd.notna(latest_minade_mean) else "n/a", ) pred_card_col2.metric( - "Latest minFDE@5s", - f"{latest_pred_row['minFDE@5s']:.2f} m" if pd.notna(latest_pred_row["minFDE@5s"]) else "n/a", + "Mean minFDE", + f"{latest_minfde_mean:.2f} m" if pd.notna(latest_minfde_mean) else "n/a", ) pred_card_col3.metric( "Latest Data Count", f"{int(latest_pred_row['data_count_num']):,}" if pd.notna(latest_pred_row["data_count_num"]) else "n/a", ) - -perf_col1, perf_col2 = st.columns([1.1, 1.0]) -with perf_col1: - if not perf_entries.empty and perf_entries["mAP"].notna().any(): - fig = go.Figure() - fig.add_bar( - x=perf_entries["version"], - y=perf_entries["data_count_num"], - name="Data Count", - marker_color="#f4a7a7", - opacity=0.5, - 
yaxis="y2", - ) - fig.add_trace( + pred_story = perf_entries[["version", "date", "description", "release_name", "data_count", "data_count_num"] + prediction_cols].copy() + pred_fig = go.Figure() + pred_fig.add_bar( + x=pred_story["version"], + y=pred_story["data_count_num"], + name="Data Count", + marker_color="#fbbf24", + opacity=0.20, + yaxis="y2", + hovertemplate="%{x}
Data Count: %{y:,}", + ) + series_specs = [ + ("minADE@1s", "#0f766e", "solid"), + ("minADE@3s", "#14b8a6", "solid"), + ("minADE@5s", "#99f6e4", "solid"), + ("minFDE@1s", "#1d4ed8", "dot"), + ("minFDE@3s", "#60a5fa", "dot"), + ("minFDE@5s", "#bfdbfe", "dot"), + ] + for metric_name, color, dash in series_specs: + pred_fig.add_trace( go.Scatter( - x=perf_entries["version"], - y=perf_entries["mAP"], - name="mAP", + x=pred_story["version"], + y=pred_story[metric_name], + name=metric_name, mode="lines+markers", - line=dict(color="#0f766e", width=3), - customdata=perf_entries[["release_name", "date", "data_count"]].to_numpy(), - hovertemplate="%{x}
mAP: %{y:.3f}
Release: %{customdata[0]}
Date: %{customdata[1]}
Data Count: %{customdata[2]}", - ) - ) - fig.update_layout( - title="mAP vs Data Count", - xaxis_title="Pilot.Auto Version", - yaxis_title="mAP", - yaxis2=dict(title="Data Count", overlaying="y", side="right", showgrid=False), - height=520, - legend=dict(orientation="h", yanchor="bottom", y=0.94, x=0, xanchor="left"), - margin=dict(l=20, r=20, t=90, b=20), - ) - st.plotly_chart(fig, use_container_width=True) - else: - st.info("No grouped full-performance trend entries are available yet.") - -with perf_col2: - if not perf_entries.empty and perf_entries[prediction_cols].notna().any().any(): - pred_story = perf_entries[["version", "date", "description", "release_name", "data_count", "data_count_num"] + prediction_cols].copy() - pred_fig = go.Figure() - pred_fig.add_bar( - x=pred_story["version"], - y=pred_story["data_count_num"], - name="Data Count", - marker_color="#fbbf24", - opacity=0.20, - yaxis="y2", - hovertemplate="%{x}
Data Count: %{y:,}", - ) - series_specs = [ - ("minADE@1s", "#0f766e", "solid"), - ("minADE@3s", "#14b8a6", "solid"), - ("minADE@5s", "#99f6e4", "solid"), - ("minFDE@1s", "#1d4ed8", "dot"), - ("minFDE@3s", "#60a5fa", "dot"), - ("minFDE@5s", "#bfdbfe", "dot"), - ] - for metric_name, color, dash in series_specs: - pred_fig.add_trace( - go.Scatter( - x=pred_story["version"], - y=pred_story[metric_name], - name=metric_name, - mode="lines+markers", - line=dict(color=color, width=3 if metric_name.endswith("@3s") else 2, dash=dash), - marker=dict(size=8), - customdata=pred_story[["date", "release_name", "data_count"]].to_numpy(), - hovertemplate=( - "%{x}
" - + metric_name - + ": %{y:.2f} m
Date: %{customdata[0]}
Release: %{customdata[1]}
Data Count: %{customdata[2]}" - ), - ) + line=dict(color=color, width=3 if metric_name.endswith("@3s") else 2, dash=dash), + marker=dict(size=8), + customdata=pred_story[["date", "release_name", "data_count"]].to_numpy(), + hovertemplate=( + "%{x}
" + + metric_name + + ": %{y:.2f} m
Date: %{customdata[0]}
Release: %{customdata[1]}
Data Count: %{customdata[2]}" + ), ) - pred_fig.update_layout( - title="Prediction Quality Story: All Horizons with Data Count", - xaxis_title="Pilot.Auto Version", - yaxis_title="Prediction Error (m)", - yaxis2=dict(title="Data Count", overlaying="y", side="right", showgrid=False), - height=520, - legend=dict(orientation="h", yanchor="bottom", y=0.94, x=0, xanchor="left"), - margin=dict(l=20, r=20, t=100, b=20), - plot_bgcolor="#ffffff", - paper_bgcolor="#ffffff", - ) - pred_fig.update_xaxes(showgrid=False) - pred_fig.update_yaxes(gridcolor="rgba(148, 163, 184, 0.18)") - st.plotly_chart(pred_fig, use_container_width=True) - st.caption( - "Each point is one grouped release. The chart now keeps sibling full/usecase/devops folders together so the performance story stays release-centric." ) - else: - st.info("No usable grouped prediction trend values are available yet.") + pred_fig.update_layout( + title="Prediction Error Trend", + xaxis_title="Pilot.Auto Version", + yaxis_title="Prediction Error (m)", + yaxis2=dict(title="Data Count", overlaying="y", side="right", showgrid=False), + height=480, + legend=dict(orientation="h", yanchor="bottom", y=0.94, x=0, xanchor="left"), + margin=dict(l=20, r=20, t=100, b=20), + plot_bgcolor="#ffffff", + paper_bgcolor="#ffffff", + ) + pred_fig.update_xaxes(showgrid=False) + pred_fig.update_yaxes(gridcolor="rgba(148, 163, 184, 0.18)") + st.plotly_chart(pred_fig, use_container_width=True) +else: + st.info("No usable grouped prediction trend values are available yet.") + +atlas_df = pd.DataFrame() +release_manifest = pd.DataFrame() +ordered_release_axes: list[str] = [] if not metric_df.empty: atlas_df = metric_df[metric_df["block_header"] == "全数データセット評価"].copy() atlas_df = atlas_df.sort_values(["date_sort", "version", "release_name"], ascending=[True, True, True]) atlas_df["release_axis"] = atlas_df["version"].astype(str) + " | " + atlas_df["date"].astype(str) - latest_group_key = perf_entries.iloc[-1]["group_key"] if not 
perf_entries.empty else None - previous_group_key = perf_entries.iloc[-2]["group_key"] if len(perf_entries) >= 2 else None - latest_release_name = perf_entries.iloc[-1]["version"] if not perf_entries.empty else "" release_manifest = ( atlas_df[["group_key", "release_axis", "version", "date", "release_name", "release_display"]] .drop_duplicates() @@ -563,10 +623,7 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame ) ordered_release_axes = release_manifest["release_axis"].tolist() -section_header( - "Pass Rate Trend", - "DevOps-style nested summaries are also grouped by release, so one pass-rate point represents the same release group as the matching performance metrics.", -) +section_header("Pass Rate Trend") pass_entries = release_df[release_df["devops_job_id"].notna()].sort_values( ["date_sort", "version", "release_name"], @@ -636,276 +693,262 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame ), use_container_width=True, ) - st.caption( - "These three charts share the same grouped DevOps source, version order, scenario-count backdrop, and pass-rate scale so you can compare overall, major-category, and mid-category movement directly." 
- ) -section_header( - "Deep Dive Explorer", - "Use this final section when you want to inspect the latest release state, compare label-level metric atlases against a baseline, or browse grouped raw details.", -) +section_header("Defect Evaluation") -if not case_df.empty: - latest_devops_group = pass_entries.iloc[-1]["group_key"] if not pass_entries.empty else None - latest_case_df = case_df[case_df["group_key"] == latest_devops_group].copy() - latest_major_mid = ( - latest_case_df.groupby(["major_category", "mid_category"], dropna=False)[["passed", "total"]] +if not case_df.empty and not pass_entries.empty: + defect_release_options = pass_entries["release_display"].tolist() + selected_defect_release = st.selectbox( + "Version", + defect_release_options, + index=len(defect_release_options) - 1, + key="defect_evaluation_release", + ) + selected_defect_row = pass_entries.iloc[defect_release_options.index(selected_defect_release)] + selected_defect_case_df = case_df[case_df["group_key"] == selected_defect_row["group_key"]].copy() + selected_major_mid = ( + selected_defect_case_df.groupby(["major_category", "mid_category"], dropna=False)[["passed", "total"]] .sum() .reset_index() ) - latest_major_mid = _with_pass_rate(latest_major_mid) - st.markdown("**Latest Release Snapshot**") - if not latest_major_mid.empty: + selected_major_mid = _with_pass_rate(selected_major_mid) + if not selected_major_mid.empty: latest_view_mode = st.radio( - "Latest Snapshot View", + "View", ["Bars", "Treemap", "Icicle", "Sunburst"], horizontal=True, ) if latest_view_mode == "Bars": - latest_fig = _build_latest_hierarchy_bars(latest_major_mid, pass_entries.iloc[-1]["version"]) + latest_fig = _build_latest_hierarchy_bars(selected_major_mid, selected_defect_row["version"]) elif latest_view_mode == "Treemap": latest_fig = px.treemap( - latest_major_mid, + selected_major_mid, path=["major_category", "mid_category"], values="total", color="pass_rate", color_continuous_scale=["#7f1d1d", "#fef3c7", 
"#166534"], range_color=(0, 100), - title=f"Latest Release Pass-Rate Treemap: {pass_entries.iloc[-1]['version']}", + title=f"Defect Evaluation Treemap: {selected_defect_row['version']}", ) latest_fig.update_layout(margin=dict(l=20, r=20, t=70, b=20)) elif latest_view_mode == "Icicle": latest_fig = px.icicle( - latest_major_mid, + selected_major_mid, path=["major_category", "mid_category"], values="total", color="pass_rate", color_continuous_scale=["#7f1d1d", "#fef3c7", "#166534"], range_color=(0, 100), - title=f"Latest Release Pass-Rate Icicle: {pass_entries.iloc[-1]['version']}", + title=f"Defect Evaluation Icicle: {selected_defect_row['version']}", ) latest_fig.update_layout(margin=dict(l=20, r=20, t=70, b=20)) else: latest_fig = px.sunburst( - latest_major_mid, + selected_major_mid, path=["major_category", "mid_category"], values="total", color="pass_rate", color_continuous_scale=["#7f1d1d", "#fef3c7", "#166534"], range_color=(0, 100), - title=f"Latest Release Pass-Rate Sunburst: {pass_entries.iloc[-1]['version']}", + title=f"Defect Evaluation Sunburst: {selected_defect_row['version']}", ) latest_fig.update_layout(margin=dict(l=20, r=20, t=70, b=20)) st.plotly_chart(latest_fig, use_container_width=True) else: - st.info("No latest release pass-rate hierarchy is available yet.") + st.info("No defect evaluation hierarchy is available yet.") +else: + st.info("No defect evaluation summaries are available yet.") + +if not atlas_df.empty: + release_options = release_manifest["release_axis"].tolist() + section_header("Release Details") + selected_detail_release = st.selectbox( + "Version", + release_options, + index=len(release_options) - 1, + key="deep_dive_release_detail", + ) + horizon_metric_groups = _available_prediction_metric_groups(atlas_df) + available_horizon_families = [metric_family for metric_family in ("minADE", "minFDE") if metric_family in horizon_metric_groups] + horizon_labels = sorted( + atlas_df[ + atlas_df["metric_name"].isin( + [metric_name for 
metric_names in horizon_metric_groups.values() for metric_name in metric_names] + ) + ]["label_name"] + .dropna() + .astype(str) + .unique() + .tolist() + ) -if not metric_df.empty: - st.markdown("**Metric Atlas Explorer**") - explorer_note_col1, explorer_note_col2 = st.columns([1.2, 1.0]) - with explorer_note_col1: - st.caption( - "Choose one metric and inspect how each label evolved across releases. " - "Use the baseline comparison modes for fast regression checks, and use the combined minADE view when you want to compare prediction horizons together." + selected_atlas_group_key = release_manifest.loc[ + release_manifest["release_axis"] == selected_detail_release, + "group_key", + ].iloc[0] + latest_matrix = atlas_df[atlas_df["group_key"] == selected_atlas_group_key].pivot_table( + index="metric_name", + columns="label_name", + values="value", + aggfunc="first", + ).dropna(how="all") + if not latest_matrix.empty: + latest_min = latest_matrix.min(axis=1) + latest_range = (latest_matrix.max(axis=1) - latest_min).replace(0, 1) + latest_norm = latest_matrix.sub(latest_min, axis=0).div(latest_range, axis=0) + latest_atlas_fig = px.imshow( + latest_norm, + aspect="auto", + color_continuous_scale=["#f8fafc", "#8dd3c7", "#0f766e"], + text_auto=".2f", ) - with explorer_note_col2: - atlas_mode = st.radio( - "Atlas Explorer Mode", - ["Latest Atlas", "Latest vs Previous", "Timeline Heatmap", "Label Trend Lines", "Combined minADE Explorer", "Chosen Baseline Delta"], - horizontal=True, + latest_atlas_fig.update_traces( + text=latest_matrix.round(2).astype(str), + hovertemplate="Metric: %{y}
Label: %{x}
Value: %{text}", + ) + latest_atlas_fig.update_layout( + title="Metric Atlas", + margin=dict(l=20, r=20, t=70, b=20), + coloraxis_colorbar=dict(title="Relative"), ) + latest_atlas_fig.update_xaxes(automargin=True) + latest_atlas_fig.update_yaxes(automargin=True) + st.plotly_chart(latest_atlas_fig, use_container_width=True) + else: + st.info("No metric atlas is available for the selected release yet.") + + if available_horizon_families and horizon_labels: + release_detail_cols = st.columns(len(available_horizon_families)) + for col, metric_family in zip(release_detail_cols, available_horizon_families): + metric_names = horizon_metric_groups[metric_family] + family_df = atlas_df[atlas_df["metric_name"].isin(metric_names)].copy() + release_fig = _build_prediction_release_label_profile( + family_df, + metric_family=metric_family, + selected_release_axis=selected_detail_release, + selected_labels=horizon_labels, + metric_names=metric_names, + ) + with col: + if release_fig is not None: + st.plotly_chart(release_fig, use_container_width=True) + else: + st.info(f"No {metric_family} horizon values are available for the selected release.") + + section_header("Trend Details") + if available_horizon_families and horizon_labels: + selected_horizon_label = st.selectbox( + "Label Trend Focus", + horizon_labels, + key="prediction_horizon_label_focus", + ) + trend_profile_cols = st.columns(len(available_horizon_families)) + for col, metric_family in zip(trend_profile_cols, available_horizon_families): + metric_names = horizon_metric_groups[metric_family] + family_df = atlas_df[atlas_df["metric_name"].isin(metric_names)].copy() + profile_fig = _build_prediction_label_profile( + family_df, + selected_label=selected_horizon_label, + metric_family=metric_family, + metric_names=metric_names, + ordered_axes=ordered_release_axes, + ) + with col: + st.plotly_chart(profile_fig, use_container_width=True) + else: + st.info("No minADE/minFDE horizon trend data is available yet.") + + 
trend_mode = st.radio( + "Trend View", + ["Timeline Heatmap", "Label Trend Lines"], + horizontal=True, + key="detailed_metric_trend_view", + ) metric_options = sorted(atlas_df["metric_name"].dropna().unique().tolist()) - atlas_control_col1, atlas_control_col2 = st.columns([1.0, 1.0]) - with atlas_control_col1: - selected_metric = st.selectbox("Metric", metric_options) - with atlas_control_col2: - baseline_labels = release_manifest["release_axis"].tolist() - default_baseline_index = max(0, len(baseline_labels) - 2) - selected_baseline_axis = st.selectbox( - "Baseline Release", - baseline_labels, - index=default_baseline_index if baseline_labels else 0, - ) + selected_metric = st.selectbox("Metric", metric_options, key="detailed_metric_trend_metric") metric_trend_df = atlas_df[atlas_df["metric_name"] == selected_metric].copy() if not metric_trend_df.empty: - latest_metric_df = metric_trend_df[metric_trend_df["group_key"] == latest_group_key].copy() - baseline_metric_df = metric_trend_df[metric_trend_df["release_axis"] == selected_baseline_axis].copy() - - if atlas_mode == "Latest Atlas": - latest_matrix = atlas_df[atlas_df["group_key"] == latest_group_key].pivot_table( - index="metric_name", - columns="label_name", - values="value", - aggfunc="first", - ).dropna(how="all") - if not latest_matrix.empty: - latest_min = latest_matrix.min(axis=1) - latest_range = (latest_matrix.max(axis=1) - latest_min).replace(0, 1) - latest_norm = latest_matrix.sub(latest_min, axis=0).div(latest_range, axis=0) - explorer_fig = px.imshow( - latest_norm, - aspect="auto", - color_continuous_scale=["#f8fafc", "#8dd3c7", "#0f766e"], - text_auto=".2f", - ) - explorer_fig.update_traces( - text=latest_matrix.round(2).astype(str), - hovertemplate="Metric: %{y}
Label: %{x}
Value: %{text}", - ) - explorer_fig.update_layout( - title=f"Latest Release Metric Atlas: {latest_release_name}", - margin=dict(l=20, r=20, t=70, b=20), - coloraxis_colorbar=dict(title="Relative"), - ) - else: - explorer_fig = None - st.info("No latest metric atlas is available yet.") - elif atlas_mode == "Latest vs Previous": - if latest_group_key is not None and previous_group_key is not None: - latest_matrix = atlas_df[atlas_df["group_key"] == latest_group_key].pivot_table( - index="metric_name", - columns="label_name", - values="value", - aggfunc="first", - ) - previous_matrix = atlas_df[atlas_df["group_key"] == previous_group_key].pivot_table( - index="metric_name", - columns="label_name", - values="value", - aggfunc="first", - ) - delta_matrix = latest_matrix.subtract(previous_matrix, fill_value=pd.NA).dropna(how="all") - if not delta_matrix.empty: - explorer_fig = px.imshow( - delta_matrix, - aspect="auto", - color_continuous_scale=["#7f1d1d", "#f8fafc", "#14532d"], - color_continuous_midpoint=0, - text_auto=".2f", - ) - explorer_fig.update_layout( - title=f"Release-over-Release Metric Delta: {latest_release_name}", - margin=dict(l=20, r=20, t=70, b=20), - coloraxis_colorbar=dict(title="Delta"), - ) - else: - explorer_fig = None - st.info("No previous release is available for metric delta yet.") - else: - explorer_fig = None - st.info("Metric delta becomes available after at least two grouped full releases exist.") - elif atlas_mode == "Timeline Heatmap": + if trend_mode == "Timeline Heatmap": explorer_fig = _build_metric_timeline_heatmap( metric_trend_df, value_col="value", title=f"{selected_metric} Timeline Heatmap by Label", color_title=selected_metric, ) - elif atlas_mode == "Label Trend Lines": + else: explorer_fig = _build_metric_label_lines( metric_trend_df, title=f"{selected_metric} Label Trend Lines", ordered_axes=ordered_release_axes, ) - elif atlas_mode == "Combined minADE Explorer": - minade_metrics = {"minADE@1s", "minADE@3s", "minADE@5s"} - 
minade_df = atlas_df[atlas_df["metric_name"].isin(minade_metrics)].copy() - if minade_df.empty: - st.info("No minADE trend data is available yet.") - else: - heatmaps = _build_minade_horizon_heatmaps(minade_df) - heatmap_cols = st.columns(len(heatmaps)) if heatmaps else [] - for col, (_, heatmap_fig) in zip(heatmap_cols, heatmaps): - col.plotly_chart(heatmap_fig, use_container_width=True) - label_options = sorted(minade_df["label_name"].dropna().unique().tolist()) - selected_label = st.selectbox("minADE Label Focus", label_options) - profile_fig = _build_minade_label_profile( - minade_df, - selected_label=selected_label, - ordered_axes=ordered_release_axes, - ) - st.plotly_chart(profile_fig, use_container_width=True) - explorer_fig = None - else: - delta_df = latest_metric_df[["label_name", "value"]].merge( - baseline_metric_df[["label_name", "value"]], - on="label_name", - how="outer", - suffixes=("_latest", "_baseline"), - ) - delta_df["delta_value"] = delta_df["value_latest"] - delta_df["value_baseline"] - delta_df["release_axis"] = f"{latest_release_name} vs baseline" - explorer_fig = _build_metric_timeline_heatmap( - delta_df, - value_col="delta_value", - title=f"{selected_metric} Latest vs Baseline Delta by Label", - color_title="Delta", - ) - if explorer_fig is not None: - st.plotly_chart(explorer_fig, use_container_width=True) + st.plotly_chart(explorer_fig, use_container_width=True) else: - st.info("No metric atlas trend data is available for the selected metric yet.") + st.info("No detailed trend data is available for the selected metric yet.") +elif not metric_df.empty: + st.info("No full-dataset metric atlas data is available yet.") if not case_df.empty: - st.markdown("**Case Explorer**") - filter_col1, filter_col2, filter_col3 = st.columns(3) - with filter_col1: - selected_major = st.selectbox("Major Category", ["All"] + sorted(case_df["major_category"].dropna().unique().tolist())) - case_filtered = case_df.copy() - if selected_major != "All": - 
case_filtered = case_filtered[case_filtered["major_category"] == selected_major] - with filter_col2: - selected_mid = st.selectbox("Mid Category", ["All"] + sorted(case_filtered["mid_category"].dropna().unique().tolist())) - if selected_mid != "All": - case_filtered = case_filtered[case_filtered["mid_category"] == selected_mid] - with filter_col3: - selected_case = st.selectbox("Case", ["All"] + sorted(case_filtered["case_name"].dropna().unique().tolist())) - if selected_case != "All": - case_filtered = case_filtered[case_filtered["case_name"] == selected_case] - - st.dataframe( - case_filtered.sort_values(["date_sort", "version", "case_name"]).drop(columns=["date_sort"], errors="ignore"), - use_container_width=True, - hide_index=True, - ) - -st.markdown("**Grouped Raw Browser**") + with st.expander("Case Explorer", expanded=False): + filter_col1, filter_col2, filter_col3 = st.columns(3) + with filter_col1: + selected_major = st.selectbox("Major Category", ["All"] + sorted(case_df["major_category"].dropna().unique().tolist())) + case_filtered = case_df.copy() + if selected_major != "All": + case_filtered = case_filtered[case_filtered["major_category"] == selected_major] + with filter_col2: + selected_mid = st.selectbox("Mid Category", ["All"] + sorted(case_filtered["mid_category"].dropna().unique().tolist())) + if selected_mid != "All": + case_filtered = case_filtered[case_filtered["mid_category"] == selected_mid] + with filter_col3: + selected_case = st.selectbox("Case", ["All"] + sorted(case_filtered["case_name"].dropna().unique().tolist())) + if selected_case != "All": + case_filtered = case_filtered[case_filtered["case_name"] == selected_case] + + st.dataframe( + case_filtered.sort_values(["date_sort", "version", "case_name"]).drop(columns=["date_sort"], errors="ignore"), + use_container_width=True, + hide_index=True, + ) -selection_df = release_df.sort_values(["date_sort", "version", "release_name"], ascending=[False, False, False]).reset_index(drop=True) 
-selection_labels = [ - f"{row.release_display} | roles: {row.roles}" - for row in selection_df.itertuples() -] -selected_label = st.selectbox("Release Group", selection_labels) -selected_release = selection_df.iloc[selection_labels.index(selected_label)] -selected_group = next(group for group in groups if group.group_key == selected_release["group_key"]) - -group_manifest = { - "display_name": selected_group.display_name, - "topic_name": selected_group.topic_name, - "group_kind": selected_group.group_kind, - "base_dir": str(selected_group.base_dir), - "jobs": { - role: { - "job_id": payload["job_id"], - "metadata_path": str(payload["metadata_path"]), - "summary_path": str(payload["summary_path"]), - } - for role, payload in selected_group.jobs.items() - }, -} - -detail_col1, detail_col2 = st.columns([0.9, 1.1]) -with detail_col1: - st.markdown("**Release Group Manifest**") - st.code(json.dumps(group_manifest, ensure_ascii=False, indent=2), language="json") - role_choice = st.selectbox("Child Role", sorted(selected_group.jobs.keys())) - -with detail_col2: - st.markdown("**Selected Child Summary JSON**") - st.code( - json.dumps(selected_group.jobs[role_choice]["summary"], ensure_ascii=False, indent=2)[:30000], - language="json", - ) +with st.expander("Grouped Raw Browser", expanded=False): + selection_df = release_df.sort_values( + ["date_sort", "version", "release_name"], + ascending=[False, False, False], + ).reset_index(drop=True) + selection_labels = [ + f"{row.release_display} | roles: {row.roles}" + for row in selection_df.itertuples() + ] + selected_label = st.selectbox("Release Group", selection_labels) + selected_release = selection_df.iloc[selection_labels.index(selected_label)] + selected_group = next(group for group in groups if group.group_key == selected_release["group_key"]) + + group_manifest = { + "display_name": selected_group.display_name, + "topic_name": selected_group.topic_name, + "group_kind": selected_group.group_kind, + "base_dir": 
str(selected_group.base_dir), + "jobs": { + role: { + "job_id": payload["job_id"], + "metadata_path": str(payload["metadata_path"]), + "summary_path": str(payload["summary_path"]), + } + for role, payload in selected_group.jobs.items() + }, + } + + detail_col1, detail_col2 = st.columns([0.9, 1.1]) + with detail_col1: + st.markdown("**Release Group Manifest**") + st.code(json.dumps(group_manifest, ensure_ascii=False, indent=2), language="json") + role_choice = st.selectbox("Child Role", sorted(selected_group.jobs.keys())) + + with detail_col2: + st.markdown("**Selected Child Summary JSON**") + st.code( + json.dumps(selected_group.jobs[role_choice]["summary"], ensure_ascii=False, indent=2)[:30000], + language="json", + ) From db1e3eb1fbbb72f9bc56ec0bd1f0163413227065 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Thu, 21 May 2026 13:10:23 +0900 Subject: [PATCH 79/94] feat: refactor latest hierarchy bars function and enhance case pass rate display - Updated the `_build_latest_hierarchy_bars` function to accept dynamic category columns, improving flexibility in visualizing hierarchical data. - Enhanced the handling of missing values in category columns, ensuring clearer data representation. - Added a new section to display the lowest case pass rates, providing additional insights into case performance. - Adjusted the layout to include an additional filter for minor categories in the case explorer, improving user interaction. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../lib/specsheet_report.py | 181 ++++++++++++++---- .../pages/13_Trend_Insights.py | 143 +++++++++++--- 2 files changed, 266 insertions(+), 58 deletions(-) diff --git a/evaluation_dashboard_app/lib/specsheet_report.py b/evaluation_dashboard_app/lib/specsheet_report.py index a74b7ee..826ea11 100644 --- a/evaluation_dashboard_app/lib/specsheet_report.py +++ b/evaluation_dashboard_app/lib/specsheet_report.py @@ -38,6 +38,7 @@ ] TREND_METADATA_FILENAME = "metadata.yaml" TREND_SUMMARY_FILENAME = "summary.json" +FULL_DATASET_EVALUATION_HEADER = "全数データセット評価" DEFAULT_TREND_METADATA_TEXT = """tags: [trend] pilot_auto_version: "Pilot.Auto v4.3.0 (centerpoint x2/2.3.1)" data_count: 99,776+ @@ -347,32 +348,59 @@ def _trend_version_sort_key(pilot_auto_version: str) -> tuple[tuple[int, int, in return ((major, minor, patch), ml_model_type, ml_version) -def _load_only_full_summary(summary_path: Path) -> list[dict[str, Any]]: - summary = load_trend_summary_file(summary_path) +def _canonical_summary_table_key(table_data: dict[str, Any]) -> str: + return json.dumps(table_data, ensure_ascii=False, sort_keys=True, allow_nan=True) + + +def _deduplicate_summary_tables(data_list: Sequence[dict[str, Any]]) -> list[dict[str, Any]]: + deduplicated: list[dict[str, Any]] = [] + seen: set[str] = set() + for table_data in data_list: + key = _canonical_summary_table_key(table_data) + if key in seen: + continue + seen.add(key) + deduplicated.append(table_data) + return deduplicated + + +def _extract_full_metric_tables(summary: dict[str, Any]) -> list[dict[str, Any]]: data_list: list[dict[str, Any]] = [] - for block in summary.get("blocks", []): - if block.get("header") != "全数データセット評価": + blocks = summary.get("blocks", []) + if not isinstance(blocks, list): + return data_list + for block in blocks: + if not isinstance(block, dict): continue - for tables in block.get("tables", []): + if block.get("header") != 
FULL_DATASET_EVALUATION_HEADER: + continue + if block.get("mode") not in (None, "metrics"): + continue + if block.get("evaluation_type") not in (None, "full"): + continue + block_tables = block.get("tables", []) + if not isinstance(block_tables, list): + continue + for tables in block_tables: + if not isinstance(tables, dict): + continue table_data = tables.get("data", {}) if isinstance(table_data, dict) and table_data: data_list.append(table_data) - return data_list + return _deduplicate_summary_tables(data_list) + + +def _load_only_full_summary(summary_path: Path) -> list[dict[str, Any]]: + summary = load_trend_summary_file(summary_path) + return _extract_full_metric_tables(summary) def extract_performance_metrics_from_summary(summary: dict[str, Any]) -> dict[str, float]: """Return averaged full-performance metrics from a full summary payload.""" - data_list: list[dict[str, Any]] = [] - for block in summary.get("blocks", []): - if block.get("header") != "全数データセット評価": - continue - for table in block.get("tables", []): - table_data = table.get("data", {}) - if isinstance(table_data, dict) and table_data: - data_list.append(table_data) + data_list = _extract_full_metric_tables(summary) if len(data_list) != 1: - raise ValueError(f"Expected exactly one full summary table, but got {len(data_list)}") + raise ValueError(f"Expected exactly one distinct full summary table, but got {len(data_list)}") metrics = data_list[0] def _avg(metric_name: str) -> float: @@ -406,24 +434,37 @@ def extract_devops_case_rows(summary: dict[str, Any]) -> list[dict[str, Any]]: for major_category, mid_categories in summary.items(): if not isinstance(mid_categories, dict): continue - for mid_category, cases in mid_categories.items(): - if not isinstance(cases, dict): + for mid_category, minor_or_cases in mid_categories.items(): + if not isinstance(minor_or_cases, dict): continue - for case_name, result in cases.items(): - if not isinstance(result, dict): + for minor_or_case_name, 
result_or_cases in minor_or_cases.items(): + if not isinstance(result_or_cases, dict): continue - passed = int(result.get("passed", 0) or 0) - total = int(result.get("total", 0) or 0) - rows.append( - { - "major_category": major_category, - "mid_category": mid_category, - "case_name": case_name, - "passed": passed, - "total": total, - "pass_rate": (passed / total * 100.0) if total > 0 else None, - } - ) + if {"passed", "total"}.intersection(result_or_cases.keys()): + case_items = [(minor_or_case_name, result_or_cases)] + minor_category = minor_or_case_name + else: + case_items = [ + (case_name, result) + for case_name, result in result_or_cases.items() + if isinstance(result, dict) + ] + minor_category = minor_or_case_name + + for case_name, result in case_items: + passed = int(result.get("passed", 0) or 0) + total = int(result.get("total", 0) or 0) + rows.append( + { + "major_category": major_category, + "mid_category": mid_category, + "minor_category": minor_category, + "case_name": case_name, + "passed": passed, + "total": total, + "pass_rate": (passed / total * 100.0) if total > 0 else None, + } + ) return rows @@ -456,7 +497,8 @@ def load_performance_trend_data(metadata_list: Sequence[Path]) -> list[dict[str, summary = row.get("summary") or [] if len(summary) != 1: raise ValueError( - f"Expected exactly one summary block for version {row.get('version')}, but got {len(summary)}" + f"Expected exactly one distinct summary block for version {row.get('version')}, " + f"but got {len(summary)}" ) metrics = summary[0] @@ -486,7 +528,66 @@ def _avg(metric_name: str) -> float: def load_devops_trend_data(metadata_list: Sequence[Path]) -> list[dict[str, Any]]: - return [] + trend_data_rows: list[dict[str, Any]] = [] + for metadata_path in metadata_list: + metadata = load_trend_metadata_file(metadata_path) + if "trend" not in [str(tag).strip() for tag in metadata.get("tags", [])]: + continue + summary_path = Path(metadata_path).parent / TREND_SUMMARY_FILENAME + if not 
summary_path.exists(): + continue + summary = load_trend_summary_file(summary_path) + if classify_trend_summary(summary) != "devops": + continue + + rows = extract_devops_case_rows(summary) + if not rows: + continue + overall_passed = sum(int(row["passed"]) for row in rows) + overall_total = sum(int(row["total"]) for row in rows) + trend_data_rows.append( + { + "version": metadata.get("pilot_auto_version"), + "data_count": metadata.get("data_count"), + "description": metadata.get("description"), + "date": metadata.get("date"), + "overall_pass_rate": (overall_passed / overall_total * 100.0) + if overall_total > 0 + else 0.0, + "scenario_count": overall_total, + "devops_data": summary, + } + ) + + trend_data_rows.sort(key=lambda row: _trend_version_sort_key(str(row.get("version") or ""))) + return trend_data_rows + + +def _add_devops_detail_trend_rates(devops_trend_data: Sequence[dict[str, Any]]) -> list[str]: + cases: set[str] = set() + for row in devops_trend_data: + devops_data = row.get("devops_data", {}) + if not isinstance(devops_data, dict): + continue + for mid_categories in devops_data.values(): + if not isinstance(mid_categories, dict): + continue + for sub_category, sub_categories in mid_categories.items(): + if not isinstance(sub_categories, dict): + continue + total_passed = sum( + int(result.get("passed", 0) or 0) + for result in sub_categories.values() + if isinstance(result, dict) + ) + total = sum( + int(result.get("total", 0) or 0) + for result in sub_categories.values() + if isinstance(result, dict) + ) + row[sub_category] = total_passed / total * 100.0 if total > 0 else 0.0 + cases.add(str(sub_category)) + return sorted(cases) def _build_trend_context( @@ -507,6 +608,10 @@ def _build_trend_context( try: from perception_catalog_analyzer.plot.map_trend import generate_map_trend_plot from perception_catalog_analyzer.plot.prediction_trend import generate_prediction_trend_plot + from perception_catalog_analyzer.plot.devops_trend import ( + 
generate_devops_trend_detail_plot, + generate_devops_trend_plot, + ) except ImportError as exc: raise RuntimeError( "perception_catalog_analyzer trend support is unavailable. " @@ -525,6 +630,16 @@ def _build_trend_context( devops_trend_data = load_devops_trend_data(list(metadata_list)) devops_trend_plot_path = output_dir / "devops_trend.png" + if devops_trend_data: + _notify(progress_callback, "Rendering pass-rate trend plots") + generate_devops_trend_plot(devops_trend_data, devops_trend_plot_path) + detail_cases = _add_devops_detail_trend_rates(devops_trend_data) + if detail_cases: + generate_devops_trend_detail_plot( + devops_trend_data, + detail_cases, + devops_trend_plot_path, + ) return { "performance_trend_data": performance_trend_data, diff --git a/evaluation_dashboard_app/pages/13_Trend_Insights.py b/evaluation_dashboard_app/pages/13_Trend_Insights.py index fa2c4db..99a84fb 100644 --- a/evaluation_dashboard_app/pages/13_Trend_Insights.py +++ b/evaluation_dashboard_app/pages/13_Trend_Insights.py @@ -146,29 +146,77 @@ def _build_pass_combo_chart( return fig -def _build_latest_hierarchy_bars(frame: pd.DataFrame, latest_release_name: str) -> go.Figure: +def _build_defect_hierarchy_bars( + frame: pd.DataFrame, + *, + category_cols: list[str], + title: str, + color_col: str = "major_category", + label_cols: list[str] | None = None, + color_map: dict[str, str] | None = None, +) -> go.Figure: bars = frame.copy() - bars["major_category"] = bars["major_category"].fillna("Unspecified") - bars["mid_category"] = bars["mid_category"].fillna("Unspecified") - bars["label"] = bars["major_category"] + " / " + bars["mid_category"] - bars = bars.sort_values(["major_category", "pass_rate", "total"], ascending=[True, False, False]) + for category_col in category_cols: + bars[category_col] = bars[category_col].fillna("Unspecified") + label_cols = label_cols or category_cols + bars["full_label"] = bars[label_cols].astype(str).agg(" / ".join, axis=1) + bars["label"] = 
bars["full_label"] + bars = bars.sort_values(category_cols + ["pass_rate", "total"], ascending=[True] * len(category_cols) + [False, False]) fig = px.bar( bars, - x="pass_rate", - y="label", - color="major_category", - orientation="h", - hover_data=["passed", "total"], + x="label", + y="pass_rate", + color=color_col, + color_discrete_map=color_map, + hover_data={"label": False, "full_label": True, "passed": True, "total": True}, text=bars["pass_rate"].map(lambda value: f"{value:.1f}%" if pd.notna(value) else "n/a"), - title=f"Defect Evaluation by Category: {latest_release_name}", + title=title, ) fig.update_layout( - margin=dict(l=20, r=20, t=70, b=20), - xaxis_title="Pass Rate (%)", - yaxis_title="Major / Mid Category", - legend_title_text="Major Category", + height=500, + margin=dict(l=20, r=20, t=70, b=140), + xaxis_title=" / ".join(label.replace("_", " ").title() for label in label_cols), + yaxis_title="Pass Rate (%)", + legend_title_text=color_col.replace("_", " ").title(), ) fig.update_traces(textposition="outside", cliponaxis=False) + fig.update_xaxes(tickangle=-35, automargin=True) + fig.update_yaxes(range=[0, 100], automargin=True) + return fig + + +def _build_defect_case_bars( + frame: pd.DataFrame, + *, + ordered_mid_categories: list[str], + max_cases: int = 20, +) -> go.Figure: + case_bars = frame.copy() + case_bars["minor_category"] = case_bars["minor_category"].fillna(case_bars["case_name"]) + case_bars["mid_order"] = case_bars["mid_category"].map( + {mid_category: idx for idx, mid_category in enumerate(ordered_mid_categories)} + ) + case_bars = case_bars.sort_values(["mid_order", "pass_rate", "total"], ascending=[True, True, False]) + case_bars = case_bars.head(max_cases) + fig = px.bar( + case_bars, + x="minor_category", + y="pass_rate", + color="mid_category", + hover_data=["major_category", "mid_category", "passed", "total"], + text=case_bars["pass_rate"].map(lambda value: f"{value:.1f}%" if pd.notna(value) else "n/a"), + title="Case Pass Rates", 
+ ) + fig.update_layout( + height=500, + margin=dict(l=20, r=20, t=70, b=140), + xaxis_title="Case", + yaxis_title="Pass Rate (%)", + legend_title_text="Mid Category", + ) + fig.update_traces(textposition="outside", cliponaxis=False) + fig.update_xaxes(tickangle=-35, automargin=True, categoryorder="array", categoryarray=case_bars["minor_category"].tolist()) + fig.update_yaxes(range=[0, 100], automargin=True) return fig @@ -706,8 +754,9 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame ) selected_defect_row = pass_entries.iloc[defect_release_options.index(selected_defect_release)] selected_defect_case_df = case_df[case_df["group_key"] == selected_defect_row["group_key"]].copy() + defect_category_cols = ["major_category", "mid_category", "minor_category"] selected_major_mid = ( - selected_defect_case_df.groupby(["major_category", "mid_category"], dropna=False)[["passed", "total"]] + selected_defect_case_df.groupby(defect_category_cols, dropna=False)[["passed", "total"]] .sum() .reset_index() ) @@ -719,41 +768,81 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame horizontal=True, ) if latest_view_mode == "Bars": - latest_fig = _build_latest_hierarchy_bars(selected_major_mid, selected_defect_row["version"]) + mid_level = ( + selected_defect_case_df.groupby(["major_category", "mid_category"], dropna=False)[["passed", "total"]] + .sum() + .reset_index() + ) + mid_level = _with_pass_rate(mid_level) + mid_level = mid_level.sort_values( + ["major_category", "mid_category", "pass_rate", "total"], + ascending=[True, True, False, False], + ) + ordered_mid_categories = mid_level["mid_category"].tolist() + st.plotly_chart( + _build_defect_hierarchy_bars( + mid_level, + category_cols=["major_category", "mid_category"], + color_col="major_category", + title="Major / Mid", + ), + use_container_width=True, + ) + st.plotly_chart( + _build_defect_case_bars( + selected_defect_case_df, + 
ordered_mid_categories=ordered_mid_categories, + ), + use_container_width=True, + ) elif latest_view_mode == "Treemap": latest_fig = px.treemap( selected_major_mid, - path=["major_category", "mid_category"], + path=defect_category_cols, values="total", color="pass_rate", color_continuous_scale=["#7f1d1d", "#fef3c7", "#166534"], range_color=(0, 100), - title=f"Defect Evaluation Treemap: {selected_defect_row['version']}", ) latest_fig.update_layout(margin=dict(l=20, r=20, t=70, b=20)) + st.plotly_chart(latest_fig, use_container_width=True) elif latest_view_mode == "Icicle": latest_fig = px.icicle( selected_major_mid, - path=["major_category", "mid_category"], + path=defect_category_cols, values="total", color="pass_rate", color_continuous_scale=["#7f1d1d", "#fef3c7", "#166534"], range_color=(0, 100), - title=f"Defect Evaluation Icicle: {selected_defect_row['version']}", ) latest_fig.update_layout(margin=dict(l=20, r=20, t=70, b=20)) + st.plotly_chart(latest_fig, use_container_width=True) else: latest_fig = px.sunburst( selected_major_mid, - path=["major_category", "mid_category"], + path=defect_category_cols, values="total", color="pass_rate", color_continuous_scale=["#7f1d1d", "#fef3c7", "#166534"], range_color=(0, 100), - title=f"Defect Evaluation Sunburst: {selected_defect_row['version']}", ) latest_fig.update_layout(margin=dict(l=20, r=20, t=70, b=20)) - st.plotly_chart(latest_fig, use_container_width=True) + st.plotly_chart(latest_fig, use_container_width=True) + + case_pass_rate = selected_defect_case_df.copy() + case_pass_rate["case"] = case_pass_rate["minor_category"].fillna(case_pass_rate["case_name"]) + case_pass_rate = case_pass_rate.sort_values(["pass_rate", "total"], ascending=[True, False]) + with st.expander("Case Pass Rates", expanded=False): + st.dataframe( + case_pass_rate[ + ["major_category", "mid_category", "case", "pass_rate", "passed", "total"] + ], + use_container_width=True, + hide_index=True, + column_config={ + "pass_rate": 
st.column_config.NumberColumn("pass_rate", format="%.1f%%"), + }, + ) else: st.info("No defect evaluation hierarchy is available yet.") else: @@ -891,7 +980,7 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame if not case_df.empty: with st.expander("Case Explorer", expanded=False): - filter_col1, filter_col2, filter_col3 = st.columns(3) + filter_col1, filter_col2, filter_col3, filter_col4 = st.columns(4) with filter_col1: selected_major = st.selectbox("Major Category", ["All"] + sorted(case_df["major_category"].dropna().unique().tolist())) case_filtered = case_df.copy() @@ -902,6 +991,10 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame if selected_mid != "All": case_filtered = case_filtered[case_filtered["mid_category"] == selected_mid] with filter_col3: + selected_minor = st.selectbox("Minor Category", ["All"] + sorted(case_filtered["minor_category"].dropna().unique().tolist())) + if selected_minor != "All": + case_filtered = case_filtered[case_filtered["minor_category"] == selected_minor] + with filter_col4: selected_case = st.selectbox("Case", ["All"] + sorted(case_filtered["case_name"].dropna().unique().tolist())) if selected_case != "All": case_filtered = case_filtered[case_filtered["case_name"] == selected_case] From 3b354f8e28d7a58a40cacd4a7dfe44826fcd4f0e Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Fri, 22 May 2026 13:06:34 +0900 Subject: [PATCH 80/94] feat: add comprehensive documentation for evaluation dashboard - Introduced multiple new HTML pages including guides on getting started, deployment, data reports, and a detailed page guide for users. - Added a JavaScript file for smooth scrolling effects in the documentation. - Created a CSS file to enhance the visual styling of the documentation pages. - Developed a specsheet pipeline explainer to clarify the process and components involved in generating specsheet reports. 
- Ensured all new pages are linked appropriately for easy navigation within the documentation. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../docs/guide/data_reports.html | 177 ++++ .../docs/guide/deployment.html | 189 ++++ .../docs/guide/getting_started.html | 200 ++++ evaluation_dashboard_app/docs/guide/guide.js | 10 + .../docs/guide/index.html | 142 +++ .../docs/guide/pages.html | 195 ++++ .../docs/guide/styles.css | 848 +++++++++++++++++ .../docs/guide/visual_systems.html | 554 +++++++++++ .../docs/specsheet_pipeline_explainer.html | 897 ++++++++++++++++++ 9 files changed, 3212 insertions(+) create mode 100644 evaluation_dashboard_app/docs/guide/data_reports.html create mode 100644 evaluation_dashboard_app/docs/guide/deployment.html create mode 100644 evaluation_dashboard_app/docs/guide/getting_started.html create mode 100644 evaluation_dashboard_app/docs/guide/guide.js create mode 100644 evaluation_dashboard_app/docs/guide/index.html create mode 100644 evaluation_dashboard_app/docs/guide/pages.html create mode 100644 evaluation_dashboard_app/docs/guide/styles.css create mode 100644 evaluation_dashboard_app/docs/guide/visual_systems.html create mode 100644 evaluation_dashboard_app/docs/specsheet_pipeline_explainer.html diff --git a/evaluation_dashboard_app/docs/guide/data_reports.html b/evaluation_dashboard_app/docs/guide/data_reports.html new file mode 100644 index 0000000..3cb9db6 --- /dev/null +++ b/evaluation_dashboard_app/docs/guide/data_reports.html @@ -0,0 +1,177 @@ + + + + + + Evaluation Dashboard Data and Reports + + + +
+
+
Data and Reports
+

Artifacts

+

+ The dashboard is driven by files. Understanding which file powers which page makes the app much easier to use and debug. +

+
+
+ + +
+
+
+
+
Run Model
+

A run is a folder under the data root.

+

+ The default data root is data/. In production it can be changed with + EVAL_DASHBOARD_DATA_ROOT. Download and Eval paths are restricted under this root. +

+
+
+
+
data/
+
my_test_20250203/
+
Summary.csv
+
Score.csv
+
result.txt / score.json / logs...
+
current.parquet / future.parquet
+
resources/metadata.yaml + summary.json
+
specsheet/specsheet.pdf
+
+
+

Why one folder per test?

+

+ It keeps Overview selection simple, makes Data Management safer, and lets users share links using stable run names. + If output is scattered across arbitrary folders, users cannot easily know what to select or delete. +

+
+
+
+
+ +
+
+
+
Core Files
+

Which artifact powers which page?

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Artifact | Created By | Used By | Meaning
Summary.csv | Download -> Eval Results | Overview, TP Summary | Object-level summary metrics such as TP, x/y RMS, x/y STD, velocity, perception labels, and product labels.
Score.csv | Download -> Eval Results | Criteria Based Score | Criteria block metrics including scenario, option, GT object, distance, NM, TP/TN, pass rate, thresholds, and counts.
.parquet | Download/eval/parquet build workflows | Detection Stats, Bounding Box Viewer, Prediction Evaluation, Debug | Structured frame/object rows: position, dimensions, yaw, label, status, source, scenario metadata, and prediction metrics.
metadata.yaml + summary.json | Specsheet/trend generation or analyzer output | Trend Insights, Specsheet trend export | Release identity and trend summary payloads. The summary shape decides the full/usecase/devops role.
+
+
+ +
+
+
+
Trend Data
+

Trend summaries are classified by JSON shape.

+

+ Trend Insights scans the data root for metadata.yaml files that have sibling summary.json. + It then classifies the summary and groups related jobs into releases. +

+
+
+
+

Full performance

+

Summary has blocks containing the header 全数データセット評価. Used for mAP, precision, recall, error, and prediction trends.

+
+
+

Usecase

+

Summary has blocks containing ユースケース評価. It participates in release grouping and inventory.

+
+
+

DevOps pass-rate

+

Summary is a nested dictionary without blocks, with category results containing passed and total.

+
+
+
+
+ +
+
+
+
Report Outputs
+

Reports are optional outputs, not the main app path.

+

+ Users can explore directly in Streamlit, then export when they need a portable artifact for review. +

+
+
+
+

Dashboard PDF

+

Generated from the current Overview selection and filters. Best for summarizing the dashboard state as a curated report.

+
+
+

Release Specsheet PDF

+

Advanced release-oriented report generated through perception_catalog_analyzer. It can include trend pages when trend metadata is enabled.

+ +
+
+

ZIP outputs

+

Data Management can package outputs for download, useful when moving run artifacts out of a shared server.

+
+
+
+
+ +
+
+
+
Debugging by Artifact
+

When a page is empty, first check the file it needs.

+
+ + + + + + + + + +
Symptom | Likely missing | Fix
Overview summary is sparse | Summary.csv | Generate Summary.csv from Download -> Eval Results.
Criteria page has no rows | Score.csv | Generate Score.csv from result files or score JSON.
Detection/BEV pages cannot load | Parquet files | Build or place parquet artifacts under the expected data root/run path.
Trend Insights has no releases | metadata.yaml + summary.json | Generate or copy trend-compatible release outputs under the data root.
Specsheet trend section says no data | Trend rows or PNG plots | Check trend classification and generated plot files in specsheet/.
+
+
+
+ + + + diff --git a/evaluation_dashboard_app/docs/guide/deployment.html b/evaluation_dashboard_app/docs/guide/deployment.html new file mode 100644 index 0000000..156d06f --- /dev/null +++ b/evaluation_dashboard_app/docs/guide/deployment.html @@ -0,0 +1,189 @@ + + + + + + Evaluation Dashboard Deployment Guide + + + +
+
+
Deployment and Operations
+

Deploy

+

+ Run locally for development. Use the production compose stack when multiple people need a shared server, + background workers, task history, and operational visibility. +

+
+
+ + +
+
+
+
+
Local Development
+

Fastest way to run the app.

+

+ Local mode is best for development, one-person analysis, and debugging. Heavy jobs run in the Streamlit process + unless task queue environment variables are enabled. +

+
+
+
+
local start
+
cd evaluation_dashboard_app
+pip install -r requirements.txt
+streamlit run Overview.py
+
+
+

Local prerequisites

+
    +
  • Python packages from requirements.txt.
  • +
  • pilot-auto / perception_eval environment only when generating Summary/Score.
  • +
  • Evaluator API credentials when using Download pages.
  • +
  • Chrome availability for some static image/PDF export flows.
  • +
+
+
+
+
+ +
+
+
+
Single Docker Container
+

Portable app container for simple usage.

+
+
+
+

Build image

+

Private dependencies may require passing a GitHub SSH key as a Docker build secret.

+
+
build
+
docker build --no-cache \
+  --secret id=ssh,src=$HOME/.ssh/id_rsa \
+  -t evaluation-dashboard .
+
+
+
+

Run with persistent data

+

Always mount the data directory so runs survive container restarts.

+
+
run
+
docker run -p 8501:8501 \
+  -v "$(pwd)/data:/app/data" \
+  -v ~/.webauto:/root/.webauto \
+  evaluation-dashboard
+
+
+
+
+
+ +
+
+
+
Production Stack
+

Nginx to Streamlit to Redis workers to Postgres.

+

+ In production, heavy operations should not block Streamlit. The app enqueues jobs to Redis, workers execute them, + and Postgres stores task state for Recent Tasks and operational visibility. +

+
+
+
Browser: Team users open the shared app.
+
Nginx: Reverse proxy, optional TLS/load balancing.
+
Streamlit: UI, filters, enqueue requests, task status.
+
Redis + Worker: RQ queue and heavy background jobs.
+
Postgres + Data: Task metadata and shared run artifacts.
+
+
+
recommended numbered scripts
+
cd deploy
+./01_SETUP_ENV.sh       # create .env if missing, then edit manually
+./02_BUILD.sh --no-cache
+./03_INIT_DB.sh         # first time only
+./04_START.sh           # start nginx, streamlit, redis, postgres, workers
+./06_STATUS.sh          # inspect service status
+./07_LOGS.sh worker     # tail logs for a service
+
+
+
+ +
+
+
+
Environment Variables
+

The settings that matter most.

+
+ + + + + + + + + + + +
Variable | Purpose
EVAL_DASHBOARD_DATA_ROOT | Shared evaluation data root. Streamlit and workers must see the same path.
USE_TASK_QUEUE | Enable Redis/RQ worker mode. Recommended for production.
DATABASE_URL | Postgres task metadata connection string.
REDIS_URL | Redis queue connection string.
RQ_JOB_TIMEOUT_SEC | Long timeout for downloads/eval jobs; the default is intentionally much longer than RQ's built-in default.
EVAL_DASHBOARD_CONFIG | Docker-specific JSON config path mounted from deploy/configs/.
EVAL_DEPLOYMENT_DEBUG_EXEC | Enables Docker exec from Deployment Debug. Keep off unless briefly needed on a trusted network.
+
+
+ +
+
+
+
Multi-User Operation
+

A shared server, not per-user accounts.

+

+ The app is designed as a local-team tool. Everyone who can access the server can see shared data and use server-side API credentials. +

+
+
+

Shared data

All run folders under the data root are visible to all users.

+

Path safety

Download and eval paths are resolved under the data root; traversal is rejected.

+

Shared credentials

Download API credentials are mounted server-side, not entered by each user.

+

Share links

Users share Overview URLs with mode, run_a, and run_b.

+
+
+ Access control lives outside the app: use VPN, firewall, SSO proxy, or network controls if the server should only be reachable by your team. +
+
+
+ +
+
+
+
Operations Checklist
+

What to check when production feels unhealthy.

+
+ + + + + + + + + + +
Issue | Check
Failed to enqueue task | Confirm Redis, Postgres, USE_TASK_QUEUE=true, and matching URLs.
Tasks stay pending | Confirm the worker is running, uses the same RQ_QUEUE, and its logs show no import/config errors.
Nginx 502 | Confirm Streamlit is listening on 8501, was not OOM-killed, and the Nginx upstream matches the service names.
Subpage forgets Overview state | Use the Overview share link with run_a query params, especially with multiple Streamlit replicas.
Detection Stats freezes | Set EVAL_DETECTION_STATS_DEBUG=1 and inspect section timing/memory output.
PDF Chrome/Kaleido error | Rebuild the image so Chrome is installed in the Docker environment.
+
+
+
+ + + + diff --git a/evaluation_dashboard_app/docs/guide/getting_started.html b/evaluation_dashboard_app/docs/guide/getting_started.html new file mode 100644 index 0000000..b11f54b --- /dev/null +++ b/evaluation_dashboard_app/docs/guide/getting_started.html @@ -0,0 +1,200 @@ + + + + + + How to Use the Evaluation Dashboard + + + +
+
+
User Workflow
+

How to Use

+

+ This is the practical path for users: get evaluator data, generate artifacts, view a run, + compare candidate results, and share the exact dashboard state with teammates. +

+
+
+ + +
+
+
+
+
Workflow 1
+

Download evaluator results into a run folder.

+

+ A run is normally one direct subdirectory under data/. Use one folder per test so + it is easy to select in Overview and safe to delete later. +

+
+
+
Open Download — Use pages/6_Download.py from the sidebar.
+
Select Download Results — Enter Project ID, Job ID, and optional Suite ID.
+
Choose Output Path — Recommended: data/<test_name>.
+
Pick Download Type — Archives for full local analysis, Result JSON only for lightweight summary generation.
+
Run Download — Wait for completion or watch Recent Tasks when queue mode is enabled.
+
+
+
+ Download Results configuration +
Use a dedicated output folder under the data root. That folder becomes the run you select later.
+
+
+

Decision: Archives or Result JSON only?

+
    +
  • Archives (ZIP): best for complete local investigation. It downloads and extracts richer source data.
  • +
  • Result JSON only: faster and lighter. Good when you mainly need summary and score generation.
  • +
  • Scenario downloads: use the Download Scenarios tab when TLR Analysis needs scenario data.
  • +
+
+
+
+
+ +
+
+
+
Workflow 2
+

Generate Summary.csv and Score.csv.

+

+ Most analysis pages need generated CSV artifacts. Stay on Download, switch to Eval Results, + and point the root directory to the same folder you used as the download output path. +

+
+
+
+
1
+

Root directory to evaluate

+

Use the same path, for example data/my_test_20250203. This keeps generated artifacts next to the run.

+
+
+
2
+

Search subdirectories

+

Usually enable this. It lets the app find result.txt or score.json in job/suite subfolders.

+
+
+
3
+

Choose generation mode

+

If results already exist, generate only Summary/Score. If not, run full eval_result generation.

+
+
+
+
+ Eval Results screen +
Eval Results produces the CSVs consumed by Overview, TP Summary, and Criteria pages.
+
+
+ Environment note: when generation uses perception_eval, activate the pilot-auto ROS environment first: +
+
before running generation
+
source path_to_pilot/install/setup.sh
+
+
+
+
+
+ +
+
+
+
Workflow 3
+

Select the run in Overview and explore.

+

+ Overview is the state hub. Many pages use the run selection and compare mode from Overview, + so users should start there before opening detail pages. +

+
+
+
+

Single-run review

+
    +
  1. Open Overview.
  2. +
  3. Select Single Mode.
  4. +
  5. Choose your run as Baseline (A).
  6. +
  7. Apply Perception Label or Product Label filters if needed.
  8. +
  9. Move to TP Summary, Criteria, Detection Stats, Bounding Box Viewer, or Prediction Evaluation.
  10. +
+
+
+ Overview screen +
Overview gives the first read: summary metrics, filters, report export, and links to specialized pages.
+
+
+
+
+ +
+
+
+
Workflow 4
+

Compare baseline A against candidate B.

+

+ Compare mode lets users answer the product question: did this candidate improve, regress, + or change behavior in a specific slice? +

+
+
+
+

How to set up compare mode

+
    +
  1. Open Overview.
  2. +
  3. Switch to Compare Mode.
  4. +
  5. Select Baseline (A), usually the current accepted run.
  6. +
  7. Select Candidate (B), usually the new run.
  8. +
  9. Check the summary metric deltas before going deeper.
  10. +
+
+
+

Where compare mode is most useful

+
    +
  • TP Summary: TP and kinematic metric deltas.
  • +
  • Criteria Score: pass-rate changes and absolute gate comparison.
  • +
  • Detection Stats: TP/FP distance-bin and status distribution differences.
  • +
  • Bounding Box Viewer: spatial inspection across runs.
  • +
  • Prediction Evaluation: ADE/FDE delta matrices and distance bins.
  • +
+
+
+
+ Sharing: Overview stores mode and run choices in URL query parameters such as + ?mode=compare&run_a=old_run&run_b=new_run. Copy that link to let another user open the same comparison. +
+
+
+ + +
+ + + + diff --git a/evaluation_dashboard_app/docs/guide/guide.js b/evaluation_dashboard_app/docs/guide/guide.js new file mode 100644 index 0000000..196ab83 --- /dev/null +++ b/evaluation_dashboard_app/docs/guide/guide.js @@ -0,0 +1,10 @@ +const observer = new IntersectionObserver((entries) => { + entries.forEach((entry) => { + if (entry.isIntersecting) { + entry.target.classList.add("in"); + observer.unobserve(entry.target); + } + }); +}, { threshold: 0.12 }); + +document.querySelectorAll(".reveal").forEach((el) => observer.observe(el)); diff --git a/evaluation_dashboard_app/docs/guide/index.html b/evaluation_dashboard_app/docs/guide/index.html new file mode 100644 index 0000000..143db20 --- /dev/null +++ b/evaluation_dashboard_app/docs/guide/index.html @@ -0,0 +1,142 @@ + + + + + + Evaluation Dashboard Engineer Guide + + + +
+
+
+
Engineer Documentation
+

Evaluation Dashboard
Engineer Guide

+

+ This guide is the entry point for engineers who operate, debug, extend, and deploy the + evaluation dashboard. The home page gives the system map; each substantial topic lives in + its own focused chapter. +

+ +
+
+

Guide Structure

+
    +
  • Home: system role, ownership map, and chapter routing.
  • +
  • Workflow: Download -> Eval Results -> Overview -> Compare.
  • +
  • Page Guide: page-by-page artifact and state contracts.
  • +
  • Data/Reports: run artifacts, trend data, dashboard PDF, specsheet.
  • +
  • Deployment: local, Docker, production, task queue, multi-user operations.
  • +
  • Diagrams: real sequence/system diagrams for key flows.
  • +
+
+
+
+ + + +
+
+
+
+
System Role
+

The dashboard turns evaluator outputs into explorable engineering evidence.

+

+ It reads run folders under the configured data root, generates dashboard artifacts when + needed, shares selected run state across Streamlit pages, and provides local or production + workflows for comparison, report generation, T4 visualization, and release trend review. +

+
+
+
Overview.py — Run selection, compare mode, filters, share links, dashboard PDF, and specsheet entry.
+
pages/ — Numbered Streamlit pages. Filename order is part of the navigation contract.
+
lib/ — Data loading, plotting, reporting, T4 clients, task queue integration, and shared UI utilities.
+
deploy/ — Docker Compose, Nginx, Redis/RQ workers, Postgres, and production scripts.
+
+
+
+ +
+ +
+ +
+
+
+
Primary Flow
+

The common operational path is still one clear chain.

+

+ The detailed instructions live in the Workflow chapter, but the mental model is simple: + create or choose a run folder, generate the dashboard artifacts, select run A in Overview, + optionally select candidate B, then use the dedicated pages for deeper analysis. +

+ +
+
+
Download — Project/Job/Suite results into a run folder.
+
Eval Results — Generate Summary, Score, and parquet artifacts.
+
Overview — Select run A and synchronize state.
+
Compare — Add candidate B when needed.
+
Detail Pages — Investigate the specific signal.
+
+
+
+
+ +
+
+ Evaluation Dashboard Engineer Guide +

This home page routes to the detailed chapters instead of duplicating them.

+
+
+ + + diff --git a/evaluation_dashboard_app/docs/guide/pages.html b/evaluation_dashboard_app/docs/guide/pages.html new file mode 100644 index 0000000..d75757c --- /dev/null +++ b/evaluation_dashboard_app/docs/guide/pages.html @@ -0,0 +1,195 @@ + + + + + + Evaluation Dashboard Page Guide + + + +
+
+
Page-by-Page Guide
+

Pages

+

+ A detailed guide to every dashboard page: what it needs, what it shows, how it behaves in compare mode, + and when users should open it. +

+
+
+ + +
+
+
+
+
State Model
+

Start in Overview, then go deep.

+

+ Overview sets mode, selected runs, and shared filters. Detail pages often read those values from + st.session_state, so opening Overview first prevents confusing “please load data” messages. +

+
+
+
Overview — Select run A, optional run B, labels, and mode.
+
Shared state — The app stores run objects and filters in session state.
+
Detail pages — Pages read the active run and specialize the analysis.
+
URL sharing — Overview can encode mode and run names into query params.
+
Team review — Users open the same linked comparison on the shared server.
+
+
+
+ +
+
+
+
Core Pages
+

The pages most users touch first.

+
+
+
+

Overview

+

Use when: starting any review, choosing runs, comparing A/B, exporting dashboard PDFs, or generating release specsheets.

+
    +
  • Inputs: run folders under data root, Summary.csv, labels, compare mode.
  • +
  • Shows: summary metrics, label/product filters, A/B charts, dashboard report export, specsheet export.
  • +
  • Watch out: if a run has no Summary.csv, high-level summary metrics are limited.
  • +
+
+
+

Download

+

Use when: acquiring evaluator results, scenario data, or generating Summary/Score artifacts.

+
    +
  • Tabs: Download Results, Download Scenarios, View Downloads, Eval Results.
  • +
  • Outputs: downloaded archives, result JSON, scenario data, Summary.csv, Score.csv.
  • +
  • Queue behavior: with USE_TASK_QUEUE=true, heavy tasks run in workers and appear in Recent Tasks.
  • +
+
+
+

Evaluator Workflow

+

Use when: you want a more guided operational flow for local runs, background tasks, fresh evaluator pipelines, and report reuse.

+
    +
  • Good for: launching longer evaluator workflows without jumping between many manual steps.
  • +
  • Depends on: evaluator API configuration, task queue for long-running jobs in production.
  • +
+
+
+

Data Management

+

Use when: managing a shared server or cleaning up old run outputs.

+
    +
  • Shows: run folders, sizes, modified time, Summary/Score/parquet presence.
  • +
  • Actions: create share links, download ZIP outputs, delete run folders under the data root.
  • +
  • Safety: deletion is restricted to run-level directories under the data root.
  • +
+
+
+
+
+ +
+
+
+
Metric Analysis Pages
+

Turn CSV and parquet artifacts into engineering signals.

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Page | Prerequisite | Main Use | Compare Behavior
TP Summary | Summary.csv | TP, RMS/STD, velocity scatter, metric distribution, density, scenario delta ranking. | Shows candidate-vs-baseline deltas such as Delta TP and metric shifts.
Criteria Based Score | Score.csv | Criteria block selection, pass-rate distribution, group means, box plots, absolute gates. | Compares pass-rate changes, gate pass/fail status, and per-scenario deltas.
Detection Stats | Parquet files | TP/FP/FN rates, distance bins, status distribution, object counts, label and scenario breakdown. | Side-by-side and delta-oriented detection metrics across selected runs.
Prediction Evaluation | Prediction parquet/artifacts | Specsheet-aligned ADE/FDE, label matrices, distance bins, polar/radial breakdowns. | ADE/FDE delta matrix and per-distance comparisons between A and B.
Trend Insights | Trend metadata.yaml + summary.json | Release inventory, mAP trend, prediction trend, pass-rate trend, defect evaluation, metric atlas. | Not A/B in the same way; it groups release history over versions.
+
+
+ +
+
+
+
Spatial and Visual Pages
+

Use these when numbers are not enough.

+
+
+
+

Bounding Box Viewer

+

BEV inspection from parquet data. Filter by t4dataset, topic, label, visibility, source, status, frame, and run. Best for understanding where misses and false positives happen spatially.

+
+
+

T4 3D Viewer

+

3D-oriented visual inspection and T4 visualizer integration. Best when BEV alone is not enough and users need camera or rendered context.

+
+
+

T4 Dataset Server

+

Integration helper for liveness checks, render requests, target object JSON, and camera PNG embed workflows. More operational than analysis-focused.

+
+
+
+
+ +
+
+
+
Specialized Pages
+

Tools for narrower investigations and operations.

+
+
+
+

TLR Analysis

+

Traffic Light Recognition evaluation. Use after downloading scenario data from Download Scenarios. It visualizes criteria matrices, vehicle status vs signal type, important zones, and compare-mode deltas.

+
+
+

Parquet Debug

+

Developer troubleshooting page for parquet, pkl, and result JSON. Use it when a page fails to parse data, schemas look suspicious, or criteria state needs low-level inspection.

+
+
+

Help

+

In-app README viewer with Japanese/English switching. Useful when users are inside Streamlit and need setup or workflow reminders without leaving the app.

+
+
+

Deployment Debug

+

Docker-only operations page. Checks environment, Postgres, Redis, RQ, task rows, container status, logs, and optional restricted exec. Keep access controlled.

+
+
+
+
+
+ + + + diff --git a/evaluation_dashboard_app/docs/guide/styles.css b/evaluation_dashboard_app/docs/guide/styles.css new file mode 100644 index 0000000..3459c9f --- /dev/null +++ b/evaluation_dashboard_app/docs/guide/styles.css @@ -0,0 +1,848 @@ +:root { + --bg: #f6f8fb; + --paper: #ffffff; + --ink: #101827; + --muted: #5d697d; + --line: #dce4f0; + --blue: #2563eb; + --teal: #0f766e; + --cyan: #0891b2; + --gold: #b7791f; + --red: #be123c; + --violet: #6d28d9; + --dark: #111827; + --shadow: 0 20px 60px rgba(17, 24, 39, .12); + --soft-shadow: 0 12px 30px rgba(17, 24, 39, .07); + --radius: 8px; +} + +* { box-sizing: border-box; } +html { scroll-behavior: smooth; } +body { + margin: 0; + color: var(--ink); + background: + radial-gradient(circle at 14% 8%, rgba(37, 99, 235, .12), transparent 26rem), + radial-gradient(circle at 86% 16%, rgba(15, 118, 110, .12), transparent 24rem), + linear-gradient(180deg, #f8fbff 0%, #ffffff 34%, #f6f8fb 100%); + font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif; + line-height: 1.58; +} + +body::before { + content: ""; + position: fixed; + inset: 0; + pointer-events: none; + z-index: -1; + background-image: + linear-gradient(rgba(17, 24, 39, .042) 1px, transparent 1px), + linear-gradient(90deg, rgba(17, 24, 39, .042) 1px, transparent 1px); + background-size: 44px 44px; + mask-image: linear-gradient(180deg, rgba(0,0,0,.7), transparent 70%); +} + +a { color: inherit; } +code, pre { font-family: "SFMono-Regular", Consolas, "Liberation Mono", monospace; } +[hidden] { display: none !important; } + +.shell { + width: min(1180px, calc(100% - 36px)); + margin: 0 auto; +} + +.hero { + min-height: 72vh; + display: grid; + align-items: center; + padding: 56px 0 36px; +} + +.hero.compact { + min-height: 46vh; +} + +.hero-grid { + display: grid; + grid-template-columns: minmax(0, 1fr) minmax(330px, 440px); + gap: 46px; + align-items: center; +} + +.eyebrow { + display: inline-flex; + align-items: 
center; + gap: 10px; + color: var(--teal); + font-size: .78rem; + font-weight: 900; + letter-spacing: .14em; + text-transform: uppercase; +} + +.signal { + width: 11px; + height: 11px; + border-radius: 99px; + background: var(--teal); + animation: ping 1.9s infinite; +} + +h1 { + margin: 16px 0 18px; + font-size: clamp(3.2rem, 7.6vw, 7.8rem); + line-height: .88; + letter-spacing: 0; + max-width: 980px; +} + +h2 { + margin: 0 0 16px; + font-size: clamp(2rem, 4vw, 4.2rem); + line-height: 1; + letter-spacing: 0; +} + +h3 { + margin: 0 0 10px; + font-size: 1.12rem; + line-height: 1.24; +} + +p { margin: 0; } + +.lead { + max-width: 860px; + color: var(--muted); + font-size: 1.15rem; +} + +.actions { + display: flex; + gap: 12px; + flex-wrap: wrap; + margin-top: 28px; +} + +.button { + display: inline-flex; + align-items: center; + gap: 10px; + min-height: 44px; + padding: 11px 15px; + border-radius: var(--radius); + border: 1px solid var(--line); + background: var(--paper); + color: var(--ink); + text-decoration: none; + font-weight: 820; + box-shadow: 0 8px 18px rgba(17, 24, 39, .07); +} + +.button.primary { + color: white; + background: var(--dark); + border-color: var(--dark); +} + +.button:hover { transform: translateY(-1px); } + +.language-console { + display: inline-flex; + align-items: center; + gap: 8px; + flex-wrap: wrap; + margin-top: 24px; + padding: 8px; + border: 1px solid var(--line); + border-radius: var(--radius); + background: rgba(255,255,255,.78); + box-shadow: var(--soft-shadow); + color: var(--muted); + font-weight: 850; + font-size: .9rem; +} + +.lang-button { + min-height: 34px; + border: 1px solid var(--line); + border-radius: 7px; + padding: 7px 11px; + background: white; + color: #334155; + font: inherit; + font-weight: 900; + cursor: pointer; +} + +.lang-button.active { + color: white; + background: var(--dark); + border-color: var(--dark); +} + +.hero-console { + border-top: 4px solid var(--teal); +} + +.metric-grid { + margin-top: 26px; +} 
+ +.metric { + min-height: 140px; + display: flex; + flex-direction: column; + gap: 10px; +} + +.metric strong { + font-size: 1.05rem; +} + +.metric span { + color: var(--muted); +} + +nav { + position: sticky; + top: 0; + z-index: 30; + background: rgba(248, 251, 255, .88); + border-block: 1px solid rgba(220,228,240,.9); + backdrop-filter: blur(14px); +} + +.nav-inner { + display: flex; + gap: 8px; + align-items: center; + padding: 12px 0; + overflow-x: auto; +} + +.nav-inner a { + text-decoration: none; + white-space: nowrap; + color: #334155; + font-size: .88rem; + font-weight: 820; + padding: 8px 10px; + border-radius: 7px; +} + +.nav-inner a:hover, .nav-inner a.active { + background: white; + color: var(--blue); +} + +section { + padding: 78px 0; + position: relative; +} + +.section-head { + max-width: 900px; + margin-bottom: 32px; +} + +.kicker { + color: var(--blue); + font-size: .78rem; + font-weight: 950; + letter-spacing: .14em; + text-transform: uppercase; + margin-bottom: 12px; +} + +.grid { display: grid; gap: 18px; } +.cols-2 { grid-template-columns: repeat(2, minmax(0, 1fr)); } +.cols-3 { grid-template-columns: repeat(3, minmax(0, 1fr)); } +.cols-4 { grid-template-columns: repeat(4, minmax(0, 1fr)); } + +.card { + background: rgba(255,255,255,.92); + border: 1px solid var(--line); + border-radius: var(--radius); + padding: 20px; + box-shadow: var(--soft-shadow); +} + +.card p, .card li { color: var(--muted); } +.card ul { margin: 12px 0 0; padding-left: 18px; } + +.number { + width: 34px; + height: 34px; + display: inline-grid; + place-items: center; + border-radius: 8px; + background: var(--blue); + color: white; + font-weight: 950; + margin-bottom: 13px; +} + +.flow { + display: grid; + grid-template-columns: repeat(5, minmax(132px, 1fr)); + gap: 12px; + align-items: stretch; + margin-top: 24px; +} + +.compact-flow { + grid-template-columns: 1fr; + margin-top: 0; +} + +.compact-flow .step { + min-height: auto; +} + +.compact-flow .step::after { + 
display: none; +} + +.step { + min-height: 155px; + padding: 16px; + border: 1px solid var(--line); + border-radius: var(--radius); + background: white; + position: relative; + overflow: hidden; + animation: lift .6s ease both; +} + +.step:nth-child(2) { animation-delay: .08s; } +.step:nth-child(3) { animation-delay: .16s; } +.step:nth-child(4) { animation-delay: .24s; } +.step:nth-child(5) { animation-delay: .32s; } + +.step::after { + content: ""; + position: absolute; + top: 50%; + right: -23px; + width: 42px; + height: 2px; + background: linear-gradient(90deg, var(--blue), transparent); +} + +.step:last-child::after { display: none; } +.step strong { display: block; margin-bottom: 8px; } +.step span { color: var(--muted); font-size: .9rem; } + +.split { + display: grid; + grid-template-columns: minmax(0, 1.05fr) minmax(0, .95fr); + gap: 20px; + align-items: start; +} + +.media { + overflow: hidden; + border-radius: var(--radius); + border: 1px solid var(--line); + background: white; + box-shadow: var(--soft-shadow); +} + +.media img { + width: 100%; + display: block; + object-fit: cover; +} + +.media-grid { + display: grid; + grid-template-columns: repeat(3, minmax(0, 1fr)); + gap: 18px; + margin-top: 28px; +} + +.caption { + color: var(--muted); + font-size: .86rem; + padding: 12px 14px; + border-top: 1px solid var(--line); + background: #fbfdff; +} + +.code { + background: #101827; + color: #dbeafe; + border-radius: var(--radius); + border: 1px solid rgba(255,255,255,.08); + overflow: hidden; + box-shadow: 0 16px 44px rgba(15, 23, 42, .18); +} + +.code-title { + display: flex; + justify-content: space-between; + gap: 12px; + padding: 12px 15px; + border-bottom: 1px solid rgba(255,255,255,.1); + color: #bfdbfe; + font-weight: 850; + font-size: .88rem; +} + +pre { + margin: 0; + padding: 17px; + overflow: auto; + font-size: .82rem; + line-height: 1.55; +} + +.tree { + background: white; + border: 1px solid var(--line); + border-radius: var(--radius); + padding: 
18px; + font-family: "SFMono-Regular", Consolas, monospace; + overflow-x: auto; + color: #334155; + box-shadow: var(--soft-shadow); +} + +.tree div { + white-space: nowrap; + opacity: 0; + transform: translateX(-10px); + animation: treeIn .45s ease forwards; +} + +.tree div:nth-child(2) { animation-delay: .04s; } +.tree div:nth-child(3) { animation-delay: .08s; } +.tree div:nth-child(4) { animation-delay: .12s; } +.tree div:nth-child(5) { animation-delay: .16s; } +.tree div:nth-child(6) { animation-delay: .20s; } +.tree div:nth-child(7) { animation-delay: .24s; } +.tree div:nth-child(8) { animation-delay: .28s; } +.tree div:nth-child(9) { animation-delay: .32s; } +.tree div:nth-child(10) { animation-delay: .36s; } + +.pill { + display: inline-flex; + align-items: center; + gap: 8px; + min-height: 28px; + padding: 5px 9px; + border: 1px solid var(--line); + border-radius: 999px; + background: #fbfdff; + color: #334155; + font-size: .8rem; + font-weight: 780; + margin: 3px 4px 3px 0; +} + +.callout { + border-left: 5px solid var(--teal); + background: #ecfdf5; + color: #123f38; + border-radius: var(--radius); + padding: 18px; +} + +.callout.warn { + border-left-color: var(--gold); + background: #fff8e7; + color: #513a13; +} + +.sequence { + display: grid; + grid-template-columns: repeat(var(--cols, 5), minmax(110px, 1fr)); + gap: 10px; + margin: 22px 0; + position: relative; +} + +.actor { + min-height: 78px; + border: 1px solid var(--line); + border-radius: var(--radius); + background: white; + display: grid; + place-items: center; + text-align: center; + padding: 12px; + box-shadow: var(--soft-shadow); + font-weight: 900; +} + +.actor small { + display: block; + color: var(--muted); + font-weight: 750; + margin-top: 4px; +} + +.message { + grid-column: 1 / -1; + display: grid; + grid-template-columns: subgrid; + min-height: 44px; + align-items: center; +} + +.arrow { + height: 28px; + border-top: 2px solid var(--blue); + position: relative; + display: flex; + 
align-items: flex-start; + justify-content: center; + color: var(--muted); + font-size: .78rem; + font-weight: 800; + padding-top: 6px; +} + +.arrow::after { + content: ""; + position: absolute; + right: -2px; + top: -6px; + border-left: 9px solid var(--blue); + border-top: 5px solid transparent; + border-bottom: 5px solid transparent; +} + +.arrow.back { + border-color: var(--teal); +} + +.arrow.back::after { + right: auto; + left: -2px; + border-left: 0; + border-right: 9px solid var(--teal); +} + +.span-1-2 { grid-column: 1 / 3; } +.span-2-3 { grid-column: 2 / 4; } +.span-3-4 { grid-column: 3 / 5; } +.span-4-5 { grid-column: 4 / 6; } +.span-1-3 { grid-column: 1 / 4; } +.span-2-4 { grid-column: 2 / 5; } +.span-3-5 { grid-column: 3 / 6; } +.span-2-5 { grid-column: 2 / 6; } + +.system-map { + display: grid; + grid-template-columns: repeat(4, minmax(0, 1fr)); + gap: 14px; + align-items: stretch; + margin-top: 22px; +} + +.system-node { + background: white; + border: 1px solid var(--line); + border-radius: var(--radius); + padding: 16px; + min-height: 132px; + box-shadow: var(--soft-shadow); + position: relative; + overflow: hidden; +} + +.system-node::before { + content: ""; + position: absolute; + inset: 0 auto 0 0; + width: 5px; + background: var(--blue); +} + +.system-node.teal::before { background: var(--teal); } +.system-node.gold::before { background: var(--gold); } +.system-node.cyan::before { background: var(--cyan); } +.system-node.violet::before { background: var(--violet); } +.system-node.red::before { background: var(--red); } + +.system-node p { + color: var(--muted); +} + +.mini-diagram { + border: 1px solid var(--line); + border-radius: var(--radius); + background: white; + padding: 18px; + box-shadow: var(--soft-shadow); + overflow-x: auto; +} + +.real-diagram { + background: white; + border: 1px solid var(--line); + border-radius: var(--radius); + box-shadow: var(--soft-shadow); + overflow: auto; + margin: 22px 0; +} + +.real-diagram svg { + 
display: block; + min-width: 980px; + width: 100%; + height: auto; +} + +.svg-title { + font: 800 18px Inter, ui-sans-serif, system-ui, sans-serif; + fill: #101827; +} + +.svg-actor { + fill: #ffffff; + stroke: #cbd5e1; + stroke-width: 1.2; +} + +.svg-actor-text { + font: 800 13px Inter, ui-sans-serif, system-ui, sans-serif; + fill: #101827; +} + +.svg-small { + font: 700 11px Inter, ui-sans-serif, system-ui, sans-serif; + fill: #64748b; +} + +.svg-line { + stroke: #cbd5e1; + stroke-width: 1.2; + stroke-dasharray: 5 6; +} + +.svg-msg { + stroke: #2563eb; + stroke-width: 2; + fill: none; + marker-end: url(#arrow-blue); +} + +.svg-msg-return { + stroke: #0f766e; + stroke-width: 2; + fill: none; + stroke-dasharray: 7 5; + marker-end: url(#arrow-teal); +} + +.svg-note { + fill: #f8fafc; + stroke: #dbe5f2; +} + +.svg-note-warn { + fill: #fff8e7; + stroke: #f2d38b; +} + +.svg-note-text { + font: 700 12px Inter, ui-sans-serif, system-ui, sans-serif; + fill: #334155; +} + +.svg-step { + font: 800 12px Inter, ui-sans-serif, system-ui, sans-serif; + fill: #1e3a8a; +} + +.payload-grid { + display: grid; + grid-template-columns: repeat(3, minmax(0, 1fr)); + gap: 14px; + margin-top: 18px; +} + +.payload-card { + background: #101827; + color: #dbeafe; + border-radius: var(--radius); + border: 1px solid rgba(255,255,255,.08); + overflow: hidden; +} + +.payload-card h3 { + padding: 12px 14px; + border-bottom: 1px solid rgba(255,255,255,.1); + color: #bfdbfe; + font-size: .92rem; +} + +.payload-card pre { + font-size: .76rem; +} + +.legend-row { + display: flex; + gap: 10px; + flex-wrap: wrap; + margin-top: 12px; +} + +.legend-item { + display: inline-flex; + align-items: center; + gap: 8px; + color: var(--muted); + font-weight: 760; + font-size: .86rem; +} + +.legend-swatch { + width: 22px; + height: 4px; + border-radius: 999px; + background: var(--blue); +} + +.legend-swatch.return { + background: repeating-linear-gradient(90deg, var(--teal) 0 7px, transparent 7px 12px); + 
border: 1px solid rgba(15,118,110,.25); +} + +.swimlanes { + display: grid; + gap: 12px; +} + +.swimlane { + display: grid; + grid-template-columns: 180px minmax(0, 1fr); + gap: 12px; + align-items: stretch; +} + +.swimlane-label { + border-radius: var(--radius); + background: var(--dark); + color: white; + padding: 14px; + display: grid; + align-items: center; + font-weight: 900; +} + +.swimlane-flow { + display: flex; + gap: 10px; + flex-wrap: wrap; + align-items: center; + border: 1px solid var(--line); + background: #fbfdff; + border-radius: var(--radius); + padding: 12px; +} + +.chip { + border: 1px solid var(--line); + border-radius: 8px; + background: white; + padding: 9px 10px; + font-size: .84rem; + font-weight: 820; + color: #334155; +} + +.table, +.table-wrap table { + width: 100%; + border-collapse: collapse; + overflow: hidden; + border-radius: var(--radius); + background: white; + border: 1px solid var(--line); + box-shadow: var(--soft-shadow); +} + +.table-wrap { + overflow-x: auto; + border-radius: var(--radius); +} + +.table th, .table td, +.table-wrap th, .table-wrap td { + text-align: left; + padding: 12px 14px; + border-bottom: 1px solid var(--line); + vertical-align: top; +} + +.table th, +.table-wrap th { + background: #f8fafc; + font-size: .82rem; + text-transform: uppercase; + letter-spacing: .08em; +} + +.table td, +.table-wrap td { color: var(--muted); } + +.feature-card { + border-top: 5px solid var(--blue); +} +.feature-card.analysis { border-top-color: var(--teal); } +.feature-card.spatial { border-top-color: var(--cyan); } +.feature-card.ops { border-top-color: var(--gold); } +.feature-card.advanced { border-top-color: var(--violet); } + +.footer { + padding: 48px 0 70px; + border-top: 1px solid var(--line); + color: var(--muted); +} + +.reveal { + opacity: 0; + transform: translateY(18px); + transition: opacity .6s ease, transform .6s ease; +} + +.reveal.in { + opacity: 1; + transform: translateY(0); +} + +@keyframes ping { + 0% { 
box-shadow: 0 0 0 0 rgba(15, 118, 110, .45); } + 72% { box-shadow: 0 0 0 13px rgba(15, 118, 110, 0); } + 100% { box-shadow: 0 0 0 0 rgba(15, 118, 110, 0); } +} + +@keyframes lift { + from { opacity: 0; transform: translateY(18px); } + to { opacity: 1; transform: translateY(0); } +} + +@keyframes treeIn { + to { opacity: 1; transform: translateX(0); } +} + +@media (max-width: 980px) { + .hero-grid, .split, .cols-2, .cols-3, .cols-4, .media-grid { + grid-template-columns: 1fr; + } + .flow { grid-template-columns: 1fr; } + .step::after { display: none; } +} + +@media (prefers-reduced-motion: reduce) { + *, *::before, *::after { + animation-duration: 0.01ms !important; + animation-iteration-count: 1 !important; + transition-duration: 0.01ms !important; + scroll-behavior: auto !important; + } +} diff --git a/evaluation_dashboard_app/docs/guide/visual_systems.html b/evaluation_dashboard_app/docs/guide/visual_systems.html new file mode 100644 index 0000000..dae1445 --- /dev/null +++ b/evaluation_dashboard_app/docs/guide/visual_systems.html @@ -0,0 +1,554 @@ + + + + + + Evaluation Dashboard Technical Diagrams + + + +
+
+
Technical Diagrams
+

Real Flows

+

+ Precise diagrams for the parts users and maintainers actually ask about: download/eval execution, + compare-mode state, page artifact dependencies, T4 camera rendering, T4 Three.js 3D overlays, + production queueing, and report generation. +

+
+
+ + +
+
+
+
+
Artifact Dependency Map
+

Which generated files unlock which pages?

+

+ This is the first diagram to check when a user asks why a page is empty. + Most UI behavior follows directly from whether these files exist under the selected run. +

+
+
+ + + + + + + + + + Run folder artifact dependency map + + + data/<run>/ + Summary.csv + Score.csv + *.parquet + result.txt / score.json + resources/metadata.yaml + resources/summary.json + specsheet/*.png + specsheet/specsheet.pdf + + + Overview + run selection + summary + + + TP Summary + TP/RMS/velocity charts + + + Criteria Score + pass rate + gates + + + Detection Stats + DuckDB + parquet scan + + + Bounding Box / T4 3D + BEV + Three.js overlays + + + Prediction Evaluation + ADE/FDE matrices + + + Trend Insights + release metadata + summaries + + + Download / Workflow + creates / refreshes files + + + Reports + dashboard PDF + specsheet + + + + + + + + + + + Blue = page reads artifact. Dashed green = workflow produces or refreshes artifact. + +
+
+
+ +
+
+
+
Real Sequence
+

Download Results -> Eval Results -> Overview selection.

+

+ This sequence shows both inline and production task-queue modes. In production, the UI does not run long jobs directly. +

+
+
+ + + + + + + + + + Sequence: user downloads a job, generates CSV artifacts, then opens Overview + + + + User + + + + + Streamlit + Download page + + + + + Redis / RQ + queue mode + + + + + Worker + heavy tasks + + + + + Evaluator + API / files + + + + + Data + root + + + + + 1. Submit Project ID, Job ID, output path + + + 2a. Queue task if USE_TASK_QUEUE=true + + 2b. Worker consumes RQ job + + + If queue mode is off, Streamlit + runs this work inline. + + + 3. Download archives / result JSON / scenario data + + 4. API response / downloaded files + + + 5. Write result.txt, score.json, extracted archives + + + 6. User runs Eval Results for same root + + 7. Generate Summary.csv + Score.csv + + 8. Overview lists data/<run> and reads generated artifacts + +
+
+
+ +
+
+
+
T4 Camera Rendering
+

Bounding Box Viewer / T4 Dataset Server: HTTP render path.

+

+ This path is for camera PNGs or HTML camera render previews. It is separate from the Three.js 3D overlay path below. +

+
+
+ + + + + + + + + + Sequence: camera preview render through T4 visualizer HTTP API + + + Streamlit + + + T4 Client + requests wrapper + + + T4 Server + FastAPI + + + Dataset + local T4 files + + + Browser + PNG / iframe + + + + 1. User selects server base URL, dataset, scenario, frame + + 2. GET /health, /datasets, /datasets/{id}/scenarios + + 3. Server reads available datasets and scene metadata + + 4. JSON lists: ids, scenarios, frame counts + + + 5. Build RenderRequest from UI and optional GT rows + + 6. POST /render {dataset, scenario, frame, target_objects} + + 7. Load camera/sample data and draw annotations + + 8. RenderResult JSON with images[].png_base64 + + 9. Streamlit decodes/display PNGs or embeds /render/html iframe + +
+
+
+

RenderRequest body

+
{
+  "t4dataset_id": "...",
+  "scenario_name": "...",
+  "frame_index": 42,
+  "target_objects": [{ "uuid": "...", "x": 1.2 }],
+  "show_annotations": true,
+  "crop_cameras": false
+}
+
+
+

RenderResult response

+
{
+  "sample_token": "...",
+  "timestamp_us": 123,
+  "images": [
+    { "label": "CAM_FRONT", "png_base64": "..." }
+  ],
+  "elapsed_ms": 812.4
+}
+
+
+

Source code touchpoints

+
lib/t4_visualizer_client.py
+lib/t4_dataset_embed.py
+pages/11_T4_Dataset_Server.py
+pages/4_Bounding_Box_Viewer.py
+
+
+
+
+ +
+
+
+
T4 3D Rendering
+

Three.js overlay path: parquet -> all-frame layers -> iframe postMessage.

+

+ This is the precise flow used by pages/5_T4_3D_Viewer.py. The app deliberately uses + the viewer’s own frame slider: Streamlit loads the iframe once, sends all frame overlays, and the + viewer selects overlays internally as the user scrubs time. +

+
+
+ + + + + + + + + + Sequence: T4 3D Viewer iframe + postMessage overlay synchronization + + + User + browser + + + + Streamlit + 5_T4_3D_Viewer + + + + DuckDB + parquet_scan + + + + Layer Builder + t4_three_layers + + + + T4 Server + /viewer/three + + + + Three.js iframe + viewer runtime + + + + 1. Open T4 3D Viewer after Overview selected run(s) + + + 2. DESCRIBE + SELECT parquet_scan(?) with filters + + + 3. DataFrame rows: frame_index, source GT/EST, status, geometry + + + Filters come from shared BEV keys: + suite, scenario, t4dataset, topic, label, visibility, runs. + frame_index is normalized to int. + + + 4. GET /datasets/{t4dataset_id}/availability + + 5. { available: true/false, dataset path metadata } + + + 6. build_three_layer_payload_all_frames(df) + + + For each frame_index: + source == GT -> gt[] boxes + source == EST -> pred[] boxes + TP pair_uuid/uuid -> matched_pairs[] + + + 7. Payload: { type: "bbox_layers_by_frame", frames: { "0": ... } } + + + 8. iframe src = {base}/viewer/three?t4dataset_id=...&scenario_name=...&frame_index=min + + + 9. Viewer loads dataset/scenario and its own time slider + + + 10. JS hex-decodes payload and iframe.contentWindow.postMessage(payload, targetOrigin) + + + 11. User scrubs inside viewer; runtime selects frames[frame_index] without Streamlit rerun + +
+ +
+
+

Layer payload

+
{
+  "type": "bbox_layers_by_frame",
+  "frames": {
+    "42": {
+      "gt": [{ "x": 1.0, "source": "GT" }],
+      "pred": [{ "x": 1.2, "source": "EST" }],
+      "matched_pairs": [
+        { "gt_idx": 0, "pred_idx": 0, "pair_uuid": "..." }
+      ]
+    }
+  }
+}
+
+
+

Iframe URL

+
viewer_three_url =
+  T4_VISUALIZER_BASE_URL
+  + "/viewer/three?"
+  + "t4dataset_id=..."
+  + "&scenario_name=..."
+  + "&frame_index=min_frame"
+
+
+

Post timing

+
post("iframe-load")
+retry every 250ms up to 12 times
+post("initial-delay-300ms")
+post("initial-delay-1200ms")
+
+targetOrigin = new URL(iframe.src).origin
+
+
+ +
+ Key distinction: the 3D viewer does not call POST /render for every frame. + It embeds /viewer/three once and sends all-frame overlay data via postMessage. + Camera PNG rendering is a separate HTTP render path. +
+
+
+ +
+
+
+
Compare Mode State
+

Overview is the state source for comparison pages.

+
+
+ + + + + + + Compare mode data/state propagation + + + Overview + mode = Compare Mode + runA = Baseline + runB / all_runs = Candidates + run_labels = A, B, C... + label filters + query params run_a/run_b + + + Session State + URL Hydration + st.session_state stores run objects + overview_url_hydrate can rebuild + state from query parameters + important with multiple Streamlit + replicas or direct subpage links + + + Pages consume shared state + TP Summary -> ΔTP / metric deltas + Criteria -> pass-rate / gate deltas + Detection Stats -> status + distance diffs + Bounding Box -> side-by-side/overlay BEV + T4 3D -> selected run layers + Prediction -> ADE/FDE delta matrices + + + write + + read + +
+
+
+ +
+
+
+
Report Generation
+

Dashboard PDF and specsheet PDF are different engines.

+
+
+ + + + + + + + + + Two report paths + + + Overview selection + mode, run(s), filters + Summary.csv charts + + + Dashboard PDF + lib/overview_pdf_report.py + curated dashboard snapshot + current view + selected filters + + + Release Specsheet PDF + lib/specsheet_report.py + perception_catalog_analyzer + blocks + trend plots + template + + + overview_report.pdf + dashboard narrative + + + specsheet.pdf + release specsheet + + + + + + Specsheet is an advanced report path. Most users first use the dashboard pages and dashboard PDF. + For full detail, open the Specsheet Pipeline Explorer. + +
+ +
+
+
+ + + + + diff --git a/evaluation_dashboard_app/docs/specsheet_pipeline_explainer.html b/evaluation_dashboard_app/docs/specsheet_pipeline_explainer.html new file mode 100644 index 0000000..41ed0dc --- /dev/null +++ b/evaluation_dashboard_app/docs/specsheet_pipeline_explainer.html @@ -0,0 +1,897 @@ + + + + + + Specsheet Pipeline Explorer + + + +
+
+
+
Evaluation Dashboard App
+

Specsheet Pipeline Explorer

+

+ A visual engineering guide to how the Streamlit dashboard calls + perception_catalog_analyzer, builds release specsheet PDFs, and turns saved + trend metadata into mAP, prediction, and pass-rate trend sections. +

+ +
+
+
specsheet.pdf
+
Overview.py
+
specsheet_report.py
+
metadata + summary
+
analyzer library
+
+
+
+ + + +
+
+
+
+
One Button, Many Systems
+

The dashboard is the conductor; the analyzer is the orchestra.

+

+ The app owns the Streamlit UI, local run selection, path safety, progress reporting, + trend metadata capture, and compatibility glue. The library owns the domain-heavy work: + SceneDataFrame loading, metric block generation, HTML templating, matplotlib plots, and PDF rendering. +

+
+ +
+
+ 1. User selects a run + Overview.py gathers project, version, topic, labels, and optional trend metadata. +
+
+ 2. Wrapper prepares files + ensure_specsheet_csvs() creates current.csv / future.csv when missing. +
+
+ 3. Analyzer builds blocks + SceneDataFrame.from_dir() and get_blocks() produce abstract and detailed HTML. +
+
+ 4. Trend context is assembled + Saved metadata.yaml + summary.json files become trend rows and plot PNGs. +
+
+ 5. PDF is rendered + update_template() creates the body HTML, then specsheet() writes PDF output. +
+
+
+
+ +
+
+
+
Code Boundary
+

How the current app connects to the library.

+

+ The integration lives mostly in lib/specsheet_report.py. It imports analyzer functions lazily, + then calls them through compatibility adapters so different analyzer versions can still work. +

+
+ +
+
Streamlit UI
Overview.py
+
+
Inputs: Project ID, version, topic name, labels, run path.
+
Trend toggle: Validates YAML and asks for exactly one trend-enabled run.
+
Progress: Maps generator callback messages to the progress bar.
+
+
+ +
+
Dashboard Wrapper
lib/specsheet_report.py
+
+
Artifact paths: Defines CSV, parquet, resources, specsheet dir, and PDF paths.
+
Compatibility: Adapts analyzer signatures with inspect.signature().
+
Trend bridge: Discovers trend files, classifies summaries, and renders trend plots.
+
+
+ +
+
Analyzer Library
perception_catalog_analyzer
+
+
Dataframe: SceneDataFrame.from_dir() reads current/future CSVs.
+
Specsheet blocks: get_blocks() emits abstract and detailed HTML sections.
+
Rendering: update_template() + specsheet() build HTML and PDF.
+
+
+ +
+
Local Artifacts
data/...
+
+
Run files: current.parquet, future.parquet, generated CSVs.
+
Trend files: metadata.yaml and summary.json beside each trend job.
+
Output: specsheet/specsheet.html, PNG plots, and specsheet.pdf.
+
+
+
+
+ +
+
+
+
File Geography
+

Two trend folder shapes are supported.

+

+ The app can read trend files from standalone dashboard-generated runs and grouped library-style release folders. + The loader only requires sibling metadata.yaml and summary.json; the grouping logic uses folder shape to connect full/usecase/devops jobs into one release. +

+
+ +
+
+

Standalone run shape

+
+
data/my_run/
+
current.csv
+
future.csv
+
resources/
+
metadata.yaml
+
summary.json
+
specsheet/specsheet.pdf
+
+
+
+

Trend release shape

+
+
data/trend_release_full_usecase_devops/
+
perception.object_recognition.objects/
+
<full_job_id>/metadata.yaml + summary.json
+
<usecase_job_id>/metadata.yaml + summary.json
+
<devops_job_id>/metadata.yaml + summary.json
+
specsheet/
+
map_trend.png, devops_trend.png, specsheet.pdf
+
+
+
+ +
+ Important: discover_trend_metadata_files() scans the data root for every + metadata.yaml that has a sibling summary.json. Then + discover_trend_release_groups() decides whether that metadata belongs to a standalone run or a grouped release. +
+
+
+ +
+
+
+
Trend Engine
+

Trend data is not one format. It is classified by summary shape.

+

+ The metadata provides release identity. The summary payload determines the role: full, usecase, devops, or unknown. + That role decides which charts and sections can be produced. +

+
+ +
+
+
metadata.yaml
+
tags: [trend]
+pilot_auto_version: "Pilot.Auto v4.3.0 (centerpoint x2/2.3.1)"
+data_count: "99,776+"
+description: "データの追加"
+date: "2025.11.7"
+
+
+
classification logic
+
# summary has blocks with 全数データセット評価
+role = "full"
+
+# summary has blocks with ユースケース評価
+role = "usecase"
+
+# summary is a non-empty nested dict, no blocks
+role = "devops"
+
+
+ +
+
+
F
+

Full performance trend

+

Extracts mAP, precision, recall, error metrics, and prediction metrics from the full summary block. These rows feed map_trend.png and prediction_trend.png.

+
+
+
U
+

Usecase summaries

+

Classified and grouped for inventory views. The current PDF trend context does not render a dedicated usecase trend plot.

+
+
+
D
+

DevOps pass-rate trend

+

Flattens nested pass/fail totals, calculates overall pass rate, and feeds devops_trend.png plus devops_trend_detail.png.

+
+
+
+
+ +
+
+
+
Rendering
+

The final PDF is assembled from three streams of HTML.

+

+ The analyzer’s template body contributes the cover, executive summary, trend sections, DevOps section, and dataset information. + The analyzer’s block generator contributes abstract and detailed metric sections. +

+
+ +
+
+

ensure_specsheet_csvs()

+

Creates current.csv and optional future.csv from parquet or pkl-derived data so the analyzer can load a consistent run directory.

+
+
+

_get_blocks_compat(get_blocks, ...)

+

Calls the analyzer to create abstract and detailed HTML fragments for labels, metrics, and evaluation type.

+
+
+

_update_template_compat(update_template, ...)

+

Passes project/version plus trend context into the analyzer’s Jinja template. Paths are resolved to generated PNGs when they exist.

+
+
+

_specsheet_compat(specsheet, ...)

+

Hands body HTML, abstract HTML, and detailed HTML to the analyzer renderer, which writes specsheet.html and specsheet.pdf.

+
+
+
+
+ +
+
+
+
What We Fixed
+

Pass Rate Trend was visible in Trend Insights but missing in the PDF.

+

+ Trend Insights already knew how to flatten DevOps summaries. The PDF exporter had the right template fields, but its DevOps trend loader returned an empty list and no pass-rate PNGs were generated. +

+
+ +
+
+

Before

+
+
old exporter behavior
+
def load_devops_trend_data(metadata_list):
+    return []
+
+devops_trend_plot_path = output_dir / "devops_trend.png"
+# no plot generation
+
+
+
+

After

+
+
new exporter behavior
+
summary = load_trend_summary_file(summary_path)
+if classify_trend_summary(summary) == "devops":
+    rows = extract_devops_case_rows(summary)
+    overall_pass_rate = sum(passed) / sum(total) * 100
+    generate_devops_trend_plot(...)
+    generate_devops_trend_detail_plot(...)
+
+
+
+ +
+ Template rule: the analyzer template only renders Pass Rate Trend when + devops_trend_data is non-empty and devops_trend.png exists. It only renders Pass Rate Detail when + devops_trend_detail.png exists too. +
+
+
+ +
+
+
+
Debug Playbook
+

When a trend section disappears, check the contract.

+
+ +
+
+
1
+

Find metadata

+

Confirm the data root contains metadata.yaml with a sibling summary.json.

+
+
+
2
+

Check role

+

Full summaries need blocks containing 全数データセット評価. DevOps summaries should be a non-empty nested dict without blocks, carrying passed and total counts.

+
+
+
3
+

Check generated PNGs

+

The PDF template needs map_trend.png, prediction_trend.png, devops_trend.png, and detail PNGs when enabled.

+
+
+
4
+

Check template context

+

show_other_infos gates the executive summary trend pages. Empty trend data is rendered as “該当データなし。” (“No matching data.”)

+
+
+ +
+
quick local verification
+
PYTHONPATH=. python - <<'PY'
+from pathlib import Path
+from lib.specsheet_report import _build_trend_context, discover_trend_metadata_files
+
+ctx = _build_trend_context(discover_trend_metadata_files(), Path('/tmp/specsheet-trend-check'))
+print(len(ctx['performance_trend_data']), len(ctx['devops_trend_data']))
+PY
+
+
+
+
+ +
+
+ Specsheet Pipeline Explorer +

+ Generated for the evaluation dashboard repository. Key files: + Overview.py, lib/specsheet_report.py, + pages/13_Trend_Insights.py, and perception_catalog_analyzer.template. +

+
+
+ + From d8a9301cbf81585237d82c49d12735e90577ba4c Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Fri, 22 May 2026 16:48:36 +0900 Subject: [PATCH 81/94] feat: enhance trend insights functionality and improve metadata handling - Updated the trend metadata caption to clarify the selection process for PDF body runs. - Introduced new functions for assembling trend release groups, including validation for required roles and metadata handling. - Enhanced the user interface with a form for creating release trend groups, allowing for dynamic input of summary sources and job IDs. - Improved error handling and user feedback during the trend group creation process. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/Overview.py | 2 +- .../pages/13_Trend_Insights.py | 144 +++++++++++++++++- 2 files changed, 144 insertions(+), 2 deletions(-) diff --git a/evaluation_dashboard_app/Overview.py b/evaluation_dashboard_app/Overview.py index 530f054..fcd5df7 100644 --- a/evaluation_dashboard_app/Overview.py +++ b/evaluation_dashboard_app/Overview.py @@ -772,7 +772,7 @@ def _update_pdf_status(message: str) -> None: trend_metadata_payload = None if specsheet_trend_enabled: st.caption( - "Trend mode uses a slim analyzer-compatible `metadata.yaml`. Extra evaluator fields are ignored." + "Select the full/performance run for the PDF body. Other full/usecase/devops trend runs are discovered from matching metadata under the data root." 
) trend_metadata_text = st.text_area( "Trend metadata YAML", diff --git a/evaluation_dashboard_app/pages/13_Trend_Insights.py b/evaluation_dashboard_app/pages/13_Trend_Insights.py index 99a84fb..093a4e4 100644 --- a/evaluation_dashboard_app/pages/13_Trend_Insights.py +++ b/evaluation_dashboard_app/pages/13_Trend_Insights.py @@ -1,6 +1,8 @@ from __future__ import annotations import json +import re +import shutil from pathlib import Path from typing import Any @@ -10,11 +12,18 @@ import streamlit as st from lib.page_chrome import inject_app_page_styles, render_page_hero, section_header +from lib.path_utils import get_data_root, path_display, resolve_under_data_root from lib.specsheet_report import ( + DEFAULT_TREND_METADATA_TEXT, + TREND_METADATA_FILENAME, + TREND_SUMMARY_FILENAME, TrendReleaseGroup, + classify_trend_summary, discover_trend_release_groups, extract_devops_case_rows, extract_performance_metrics_from_summary, + load_trend_summary_file, + parse_trend_metadata_text, ) st.set_page_config(page_title="Trend Insights", layout="wide", initial_sidebar_state="expanded") @@ -38,6 +47,137 @@ def _select_primary_metadata(group: TrendReleaseGroup) -> dict[str, Any]: return {} +def _safe_path_part(value: Any, fallback: str) -> str: + text = str(value or "").strip() + text = re.sub(r"[^\w.\-]+", "_", text).strip("._") + return text or fallback + + +def _resolve_summary_json_input(user_path: str) -> tuple[Path | None, str]: + resolved, err = resolve_under_data_root(user_path, allow_missing=False) + if err: + return None, err + assert resolved is not None + if resolved.is_file(): + if resolved.name != TREND_SUMMARY_FILENAME: + return None, f"Expected a {TREND_SUMMARY_FILENAME} file: {path_display(resolved)}" + return resolved, "" + for candidate in ( + resolved / TREND_SUMMARY_FILENAME, + resolved / "resources" / TREND_SUMMARY_FILENAME, + ): + if candidate.exists(): + return candidate, "" + return None, f"No {TREND_SUMMARY_FILENAME} found in {path_display(resolved)} or 
its resources/ folder." + + +def _default_job_id_from_summary(summary_path: Path) -> str: + if summary_path.parent.name == "resources": + return summary_path.parent.parent.name + return summary_path.parent.name + + +def _assemble_trend_release_group( + *, + release_name: str, + topic_name: str, + role_sources: dict[str, str], + role_job_ids: dict[str, str], + metadata: dict[str, Any], +) -> Path: + data_root = get_data_root() + release_dir = data_root / _safe_path_part(release_name, "trend_release") + topic_dir = release_dir / _safe_path_part(topic_name, "perception.object_recognition.objects") + expected_roles = {"full", "usecase", "devops"} + seen_roles: dict[str, Path] = {} + + for expected_role, source_text in role_sources.items(): + summary_path, err = _resolve_summary_json_input(source_text) + if err: + raise ValueError(f"{expected_role}: {err}") + assert summary_path is not None + summary = load_trend_summary_file(summary_path) + actual_role = classify_trend_summary(summary) + if actual_role != expected_role: + raise ValueError( + f"{expected_role}: {path_display(summary_path)} classified as `{actual_role}`, " + f"not `{expected_role}`." 
+ ) + seen_roles[actual_role] = summary_path + + missing = sorted(expected_roles - set(seen_roles)) + if missing: + raise ValueError(f"Missing required trend roles: {', '.join(missing)}") + + for role, summary_path in seen_roles.items(): + job_id = _safe_path_part(role_job_ids.get(role) or _default_job_id_from_summary(summary_path), role) + job_dir = topic_dir / job_id + job_dir.mkdir(parents=True, exist_ok=True) + shutil.copy2(summary_path, job_dir / TREND_SUMMARY_FILENAME) + with (job_dir / TREND_METADATA_FILENAME).open("w", encoding="utf-8") as fh: + import yaml + + yaml.safe_dump(metadata, fh, allow_unicode=True, sort_keys=False) + return release_dir + + +def _render_release_trend_builder() -> None: + section_header("Build Release Trend Group") + with st.expander("Assemble full/usecase/devops summaries into one release", expanded=False): + st.caption( + "Use this after the three evaluator jobs have analyzer-compatible summary.json files. " + "Each source can be a job folder, a run folder containing resources/summary.json, or the summary.json file itself." 
+ ) + with st.form("release_trend_builder_form"): + form_col1, form_col2 = st.columns([1.1, 1.2]) + with form_col1: + release_name = st.text_input( + "Release folder name", + value="trend_release___", + ) + topic_name = st.text_input( + "Topic folder", + value="perception.object_recognition.objects", + ) + full_source = st.text_input("Full summary source") + usecase_source = st.text_input("Usecase summary source") + devops_source = st.text_input("DevOps summary source") + with form_col2: + full_job_id = st.text_input("Full job id override", value="") + usecase_job_id = st.text_input("Usecase job id override", value="") + devops_job_id = st.text_input("DevOps job id override", value="") + metadata_text = st.text_area( + "Release metadata YAML", + value=DEFAULT_TREND_METADATA_TEXT, + height=180, + help="Required keys: tags, pilot_auto_version, data_count, description, date.", + ) + submitted = st.form_submit_button("Create Release Trend Group", type="primary") + + if submitted: + try: + metadata = parse_trend_metadata_text(metadata_text) + created_dir = _assemble_trend_release_group( + release_name=release_name, + topic_name=topic_name, + role_sources={ + "full": full_source, + "usecase": usecase_source, + "devops": devops_source, + }, + role_job_ids={ + "full": full_job_id, + "usecase": usecase_job_id, + "devops": devops_job_id, + }, + metadata=metadata, + ) + st.success(f"Created release trend group at `{path_display(created_dir)}`. 
Refreshing inventory...") + st.rerun() + except Exception as exc: + st.error(f"Could not create release trend group: {exc}") + + def _release_display_name(version: Any, date: Any, description: Any = "") -> str: version_text = str(version or "").strip() or "Unknown Version" date_text = str(date or "").strip() @@ -486,11 +626,13 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame description="Release-level trends across grouped full, usecase, and devops runs.", ) +_render_release_trend_builder() + section_header("Release Inventory") groups = discover_trend_release_groups() if not groups: - st.info("No saved trend metadata was found yet. Generate a release spec-sheet with trend mode enabled first.") + st.info("No saved trend metadata was found yet. Use the release trend builder above after the three job summaries are available.") st.stop() try: From a5ab019568810a0b60a2d1fbea91706723fb9fe0 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Mon, 25 May 2026 15:55:57 +0900 Subject: [PATCH 82/94] feat: enhance evaluation dashboard with release specsheet workflow and metadata handling - Introduced a new workflow for scheduling release evaluator jobs, processing them as app-native runs, and generating a release specsheet. - Added functions for writing trend metadata and validating trend summaries, improving metadata management. - Enhanced the user interface to support dynamic input for release metadata, including release group, version, and description. - Updated various components to utilize the new run display name function for better clarity in run selections. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/Overview.py | 50 +- evaluation_dashboard_app/deploy/04_START.sh | 10 +- .../deploy/docker-compose.yml | 9 +- .../deploy/nginx/nginx.conf | 12 +- evaluation_dashboard_app/lib/db.py | 1 + .../lib/overview_url_hydrate.py | 4 +- evaluation_dashboard_app/lib/path_utils.py | 55 +- .../lib/specsheet_report.py | 163 +++++- .../lib/ui/task_history.py | 5 + .../lib/ui/task_result_summary.py | 44 ++ .../pages/12_Prediction_Evaluation.py | 11 +- evaluation_dashboard_app/pages/6_Workflow.py | 256 +++++++-- evaluation_dashboard_app/worker/tasks.py | 497 ++++++++++++++++++ 13 files changed, 1028 insertions(+), 89 deletions(-) diff --git a/evaluation_dashboard_app/Overview.py b/evaluation_dashboard_app/Overview.py index fcd5df7..d3e527b 100644 --- a/evaluation_dashboard_app/Overview.py +++ b/evaluation_dashboard_app/Overview.py @@ -1,11 +1,12 @@ import streamlit as st import pandas as pd import io +import urllib.parse import zipfile import yaml from pathlib import Path from lib.run_loader import load_run -from lib.path_utils import get_data_root, get_data_root_display, list_run_directories, path_display +from lib.path_utils import get_data_root, get_data_root_display, get_run_display_name, list_run_directories, path_display import plotly.express as px import plotly.graph_objects as go from lib.user_config import UserConfig @@ -278,7 +279,7 @@ def show_grouped_metrics_plot_multi(df_list, run_labels, group_col, label_map=No # List run directories (subdirectories in RUN_ROOT) run_dirs = list_run_directories() -run_names = [p.name for p in run_dirs] +run_names = [get_run_display_name(p) for p in run_dirs] if not run_dirs: st.warning(f"No runs found in '{get_data_root_display()}'.\n\nPlease add at least one sub-directory with evaluation results, e.g. 
`{get_data_root_display()}/my_eval_run/`.") @@ -300,8 +301,9 @@ def show_grouped_metrics_plot_multi(df_list, run_labels, group_col, label_map=No saved_run_a = url_run_a run_a_index = run_names.index(saved_run_a) if saved_run_a in run_names else 0 -run_a_dir = st.sidebar.selectbox("Baseline (A)", run_dirs, index=run_a_index, format_func=lambda p: p.name) -user_config.set("overview_run_a", run_a_dir.name) +run_a_dir = st.sidebar.selectbox("Baseline (A)", run_dirs, index=run_a_index, format_func=get_run_display_name) +run_a_name = get_run_display_name(run_a_dir) +user_config.set("overview_run_a", run_a_name) compare_run_names = [] # list of run names for candidates B, C, D, ... if mode == "Compare Mode": @@ -331,10 +333,10 @@ def show_grouped_metrics_plot_multi(df_list, run_labels, group_col, label_map=No f"Candidate ({letter})", run_dirs, index=idx, - format_func=lambda p: p.name, + format_func=get_run_display_name, key=f"compare_run_select_{i}", ) - new_compare_run_names.append(selected.name) + new_compare_run_names.append(get_run_display_name(selected)) with col_rm: if len(compare_run_names) > 1: if st.button("✕", key=f"compare_remove_{i}", help="Remove this run"): @@ -348,7 +350,7 @@ def show_grouped_metrics_plot_multi(df_list, run_labels, group_col, label_map=No st.session_state["overview_compare_run_names"] = compare_run_names if st.sidebar.button("➕ Add run", help="Add another run to compare"): - used = {run_a_dir.name} | set(compare_run_names) + used = {run_a_name} | set(compare_run_names) next_name = next((n for n in run_names if n not in used), run_names[0]) new_list = compare_run_names + [next_name] st.session_state["overview_compare_run_names"] = new_list @@ -361,13 +363,13 @@ def show_grouped_metrics_plot_multi(df_list, run_labels, group_col, label_map=No compare_run_dirs = [] if mode == "Compare Mode" and compare_run_names: - name_to_dir = {p.name: p for p in run_dirs} + name_to_dir = {get_run_display_name(p): p for p in run_dirs} compare_run_dirs = 
[name_to_dir[n] for n in compare_run_names if n in name_to_dir] # ====== SYNC URL (NON-DESTRUCTIVE) ====== query = { "mode": "compare" if mode == "Compare Mode" else "single", - "run_a": run_a_dir.name, + "run_a": run_a_name, } for j, name in enumerate(compare_run_names): query[f"run_{chr(98 + j)}"] = name # run_b, run_c, ... @@ -495,10 +497,14 @@ def safe_load_run(path, label='Run'): st.caption(f"Up to 5 keys only in {cand}") st.code("\n".join(sc) if sc else "(none)") -share_q = f"mode={'compare' if mode == 'Compare Mode' else 'single'}&run_a={run_a_dir.name}" +share_query = { + "mode": "compare" if mode == "Compare Mode" else "single", + "run_a": run_a_name, +} if mode == "Compare Mode" and compare_run_names: for j, name in enumerate(compare_run_names): - share_q += f"&run_{chr(98 + j)}={name}" + share_query[f"run_{chr(98 + j)}"] = name +share_q = urllib.parse.urlencode(share_query) render_share_link_callout( share_q, caption="Append to your server URL (e.g. `https://host:8501/?` + query). 
Build links from Data Management too.", @@ -649,7 +655,7 @@ def show_tp_mean_by_label_compare(df_list, run_labels, label_col, label_jp_map=N _specsheet_run_records = _report_runs _specsheet_run_labels = _report_labels _specsheet_run_options = { - f"{label} · {record['path'].name}": record["path"] + f"{label} · {get_run_display_name(record['path'])}": record["path"] for label, record in zip(_specsheet_run_labels, _specsheet_run_records) } _specsheet_run_option_keys = list(_specsheet_run_options.keys()) @@ -663,7 +669,7 @@ def show_tp_mean_by_label_compare(df_list, run_labels, label_col, label_jp_map=N ) _specsheet_label_options = list(dict.fromkeys(_default_specsheet_labels + _detected_specsheet_labels)) _single_specsheet_run_path = _specsheet_run_records[0]["path"] -_default_specsheet_version = st.session_state.get("specsheet_version", _single_specsheet_run_path.name) +_default_specsheet_version = st.session_state.get("specsheet_version", get_run_display_name(_single_specsheet_run_path)) pdf_col1, pdf_col2 = st.columns([1.2, 2.8]) with pdf_col1: @@ -684,7 +690,7 @@ def _update_pdf_status(message: str) -> None: ) st.session_state["overview_pdf_report_bytes"] = pdf_bytes st.session_state["overview_pdf_report_key"] = _report_key - run_names_for_file = [r["path"].name for r in _report_runs if r.get("path") is not None] + run_names_for_file = [get_run_display_name(r["path"]) for r in _report_runs if r.get("path") is not None] st.session_state["overview_pdf_report_name"] = make_report_filename(run_names_for_file) _pdf_status.success("PDF report is ready.") except Exception as e: @@ -729,6 +735,12 @@ def _update_pdf_status(message: str) -> None: if key in _specsheet_run_options ] _active_specsheet_paths = [get_specsheet_artifact_paths(path) for path in selected_specsheet_run_paths] +_selected_trend_metadata_text = "" +if len(_active_specsheet_paths) == 1 and _active_specsheet_paths[0]["trend_metadata"].exists(): + try: + _selected_trend_metadata_text = 
_active_specsheet_paths[0]["trend_metadata"].read_text(encoding="utf-8") + except Exception: + _selected_trend_metadata_text = "" specsheet_cfg_col1, specsheet_cfg_col2, specsheet_cfg_col3 = st.columns([1.4, 1.2, 1.4]) with specsheet_cfg_col1: @@ -764,7 +776,7 @@ def _update_pdf_status(message: str) -> None: specsheet_trend_enabled = st.toggle( "Include trend data", - value=bool(st.session_state.get("specsheet_include_trend", False)), + value=bool(st.session_state.get("specsheet_include_trend", bool(_selected_trend_metadata_text))), key="specsheet_include_trend", help="Release-report mode only. Saves `metadata.yaml` next to the generated `summary.json` and reuses all saved trend metadata files under the data root.", ) @@ -776,7 +788,10 @@ def _update_pdf_status(message: str) -> None: ) trend_metadata_text = st.text_area( "Trend metadata YAML", - value=st.session_state.get("specsheet_trend_metadata_text", DEFAULT_TREND_METADATA_TEXT), + value=st.session_state.get( + "specsheet_trend_metadata_text", + _selected_trend_metadata_text or DEFAULT_TREND_METADATA_TEXT, + ), key="specsheet_trend_metadata_text", height=180, help="Required keys: tags, pilot_auto_version, data_count, description, date.", @@ -816,6 +831,7 @@ def _update_pdf_status(message: str) -> None: "Using existing up-to-date spec-sheet PDF": 1.0, "Loading CSV files": 0.15, "Building abstract and detail sections": 0.2, + "Validating full trend summary": 0.9, "Saving trend metadata": 0.9, "Collecting trend history": 0.92, "Rendering trend plots": 0.94, @@ -842,7 +858,7 @@ def _update_specsheet_status(message: str) -> None: generated_pdfs: list[tuple[Path, bool]] = [] for idx, run_path in enumerate(selected_specsheet_run_paths, start=1): - _update_specsheet_status(f"Run {idx}/{len(selected_specsheet_run_paths)}: {run_path.name}") + _update_specsheet_status(f"Run {idx}/{len(selected_specsheet_run_paths)}: {get_run_display_name(run_path)}") pdf_path, generated = generate_specsheet_pdf( run_path, 
project_id=specsheet_project_id, diff --git a/evaluation_dashboard_app/deploy/04_START.sh b/evaluation_dashboard_app/deploy/04_START.sh index 3281a86..3b3814c 100755 --- a/evaluation_dashboard_app/deploy/04_START.sh +++ b/evaluation_dashboard_app/deploy/04_START.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# 04 — Start the full stack, or if it is already running: up -d (apply compose/scale) then restart all services. +# 04 — Start or update the full stack with docker compose up -d. # Default: 2 worker replicas (EVAL_COMPOSE_SCALE_WORKER in .env). Override: ./04_START.sh --scale worker=1 (last --scale wins). set -euo pipefail DEPLOY_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -17,10 +17,4 @@ WORKER_SCALE="${EVAL_COMPOSE_SCALE_WORKER:-2}" dc() { docker compose --env-file .env "$@"; } -if [[ -n "$(dc ps -q --status running 2>/dev/null || true)" ]]; then - echo "Stack already running — updating with up -d, then restarting all services." - dc up -d --scale "worker=${WORKER_SCALE}" "$@" - dc restart -else - dc up -d --scale "worker=${WORKER_SCALE}" "$@" -fi +dc up -d --scale "worker=${WORKER_SCALE}" "$@" diff --git a/evaluation_dashboard_app/deploy/docker-compose.yml b/evaluation_dashboard_app/deploy/docker-compose.yml index 17bd525..0abe63b 100644 --- a/evaluation_dashboard_app/deploy/docker-compose.yml +++ b/evaluation_dashboard_app/deploy/docker-compose.yml @@ -71,6 +71,12 @@ x-streamlit-app: &streamlit-app condition: service_started postgres: condition: service_healthy + healthcheck: + test: ["CMD-SHELL", "curl -fsS http://localhost:8501/_stcore/health >/dev/null || curl -fsS http://localhost:8501/healthz >/dev/null"] + interval: 5s + timeout: 3s + retries: 30 + start_period: 20s restart: unless-stopped services: @@ -81,7 +87,8 @@ services: volumes: - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro depends_on: - - streamlit1 + streamlit1: + condition: service_healthy restart: unless-stopped streamlit1: diff --git a/evaluation_dashboard_app/deploy/nginx/nginx.conf 
b/evaluation_dashboard_app/deploy/nginx/nginx.conf index dd03703..de69355 100644 --- a/evaluation_dashboard_app/deploy/nginx/nginx.conf +++ b/evaluation_dashboard_app/deploy/nginx/nginx.conf @@ -8,12 +8,9 @@ events { } http { - upstream streamlit { - server streamlit1:8501 max_fails=3 fail_timeout=10s; - # Optional second app server (start compose with --profile ha and uncomment): - # server streamlit2:8501 max_fails=3 fail_timeout=10s; - # ip_hash; # required if you use two server lines above - } + # Docker's embedded DNS. Resolve Streamlit at request time so nginx does not keep + # a stale container IP after `docker compose up -d` recreates streamlit1. + resolver 127.0.0.11 valid=10s ipv6=off; server { listen 80; @@ -22,7 +19,8 @@ http { client_max_body_size 200m; location / { - proxy_pass http://streamlit; + set $streamlit_upstream streamlit1:8501; + proxy_pass http://$streamlit_upstream; proxy_http_version 1.1; proxy_buffering off; proxy_set_header Host $host; diff --git a/evaluation_dashboard_app/lib/db.py b/evaluation_dashboard_app/lib/db.py index 4ce178f..c941970 100644 --- a/evaluation_dashboard_app/lib/db.py +++ b/evaluation_dashboard_app/lib/db.py @@ -31,6 +31,7 @@ def _task_log_timestamp_prefix() -> str: "build_parquet", "download_and_eval", "run_evaluator_and_process", + "run_release_specsheet_workflow", ) TASK_STATUSES = ("pending", "running", "completed", "failed") diff --git a/evaluation_dashboard_app/lib/overview_url_hydrate.py b/evaluation_dashboard_app/lib/overview_url_hydrate.py index 5e4b2ed..ca20a60 100644 --- a/evaluation_dashboard_app/lib/overview_url_hydrate.py +++ b/evaluation_dashboard_app/lib/overview_url_hydrate.py @@ -11,7 +11,7 @@ import streamlit as st -from lib.path_utils import get_data_root, list_run_directories +from lib.path_utils import get_data_root, get_run_display_name, list_run_directories from lib.run_loader import load_run @@ -30,7 +30,7 @@ def try_hydrate_session_from_overview_query_params() -> bool: if not root.exists() 
or not root.is_dir(): return False run_dirs = list_run_directories() - name_to_dir = {p.name: p for p in run_dirs} + name_to_dir = {get_run_display_name(p): p for p in run_dirs} if run_a_name not in name_to_dir: return False mode_param = (params.get("mode") or "single").lower() diff --git a/evaluation_dashboard_app/lib/path_utils.py b/evaluation_dashboard_app/lib/path_utils.py index ca698a4..fc766e3 100644 --- a/evaluation_dashboard_app/lib/path_utils.py +++ b/evaluation_dashboard_app/lib/path_utils.py @@ -112,12 +112,45 @@ def resolve_under_data_root( return None, str(e) +def _looks_like_analysis_run(path: Path) -> bool: + return ( + (path / "Summary.csv").exists() + or (path / "Score.csv").exists() + or any(path.glob("*.parquet")) + or (path / "current.csv").exists() + or (path / "future.csv").exists() + ) + + +def get_run_display_name(run_path: Path) -> str: + """Return a stable run selector name relative to the data root.""" + root = get_data_root() + try: + return run_path.resolve().relative_to(root).as_posix() + except Exception: + return run_path.name + + def list_run_directories() -> List[Path]: - """Return sorted list of run directories (immediate subdirs of data root) that exist.""" + """Return sorted run directories, including release analysis children.""" root = get_data_root() if not root.exists(): return [] - return sorted([p for p in root.iterdir() if p.is_dir()]) + runs: List[Path] = [] + seen = set() + for child in sorted([p for p in root.iterdir() if p.is_dir()]): + resolved = child.resolve() + if resolved not in seen: + runs.append(child) + seen.add(resolved) + for release_child_name in ("performance", "devops"): + release_child = child / release_child_name + if release_child.is_dir() and _looks_like_analysis_run(release_child): + release_resolved = release_child.resolve() + if release_resolved not in seen: + runs.append(release_child) + seen.add(release_resolved) + return sorted(runs, key=get_run_display_name) def count_tlr_scenarios(path: Path) 
-> int: @@ -174,7 +207,7 @@ def get_run_info(run_path: Path) -> dict: has_score = (run_path / "Score.csv").exists() has_parquet = any(run_path.glob("*.parquet")) return { - "name": run_path.name, + "name": get_run_display_name(run_path), "path": run_path, "size_bytes": size_bytes, "mtime": mtime, @@ -186,23 +219,25 @@ def get_run_info(run_path: Path) -> dict: def resolve_run_subdirectory(run_name: str) -> Tuple[Optional[Path], str]: """ - Resolve a run directory by name (must be a direct child of data root). + Resolve a run directory by display name under the data root. Returns (path, "") on success, or (None, error_message). """ root = get_data_root() if not run_name or run_name.strip() != run_name: return None, "Invalid run name." - if os.sep in run_name or "/" in run_name or ".." in run_name: + if "\x00" in run_name or "\\" in run_name: return None, "Invalid run name." - run_path = root / run_name - if not run_path.exists(): - return None, f"Run does not exist: {run_name}" - if not run_path.is_dir(): - return None, "Not a directory." + run_path = (root / run_name).resolve() try: run_path.relative_to(root) except ValueError: return None, "Run is not under data root." + if run_path == root: + return None, "Invalid run name." + if not run_path.exists(): + return None, f"Run does not exist: {run_name}" + if not run_path.is_dir(): + return None, "Not a directory." 
return run_path, "" diff --git a/evaluation_dashboard_app/lib/specsheet_report.py b/evaluation_dashboard_app/lib/specsheet_report.py index 826ea11..ff82318 100644 --- a/evaluation_dashboard_app/lib/specsheet_report.py +++ b/evaluation_dashboard_app/lib/specsheet_report.py @@ -207,13 +207,18 @@ def parse_trend_metadata_text(text: str) -> dict[str, Any]: if not date or not _TREND_DATE_PATTERN.match(date): raise ValueError("Trend metadata `date` must look like `2025.11.7`.") - return { + parsed = { "tags": ["trend"], "pilot_auto_version": pilot_auto_version, "data_count": data_count, "description": description, "date": date, } + for optional_key in ("release_group", "topic_name"): + optional_value = str(raw.get(optional_key) or "").strip() + if optional_value: + parsed[optional_key] = optional_value + return parsed def write_trend_metadata(run_dir: str | Path, metadata: dict[str, Any]) -> Path: @@ -274,6 +279,7 @@ def classify_trend_summary(summary: dict[str, Any]) -> str: def discover_trend_release_groups(root_dir: str | Path | None = None) -> list[TrendReleaseGroup]: metadata_files = discover_trend_metadata_files(root_dir) grouped: dict[str, TrendReleaseGroup] = {} + standalone_records: list[dict[str, Any]] = [] for metadata_path in metadata_files: summary_path = metadata_path.parent / TREND_SUMMARY_FILENAME @@ -288,6 +294,22 @@ def discover_trend_release_groups(root_dir: str | Path | None = None) -> list[Tr topic_name = str(metadata.get("topic_name") or "standalone") group_kind = "standalone_run" base_dir = run_dir + standalone_records.append( + { + "group_key": group_key, + "display_name": display_name, + "topic_name": topic_name, + "group_kind": group_kind, + "base_dir": base_dir, + "role": role, + "job_id": run_dir.name, + "metadata_path": metadata_path, + "summary_path": summary_path, + "metadata": metadata, + "summary": summary, + } + ) + continue else: job_dir = metadata_path.parent topic_dir = job_dir.parent @@ -316,6 +338,77 @@ def 
discover_trend_release_groups(root_dir: str | Path | None = None) -> list[Tr "summary": summary, } + standalone_by_release: dict[tuple[str, str, str, str, str, str], list[dict[str, Any]]] = {} + for record in standalone_records: + metadata = record["metadata"] + release_key = ( + str(metadata.get("release_group") or ""), + str(record["topic_name"] or ""), + str(metadata.get("pilot_auto_version") or ""), + str(metadata.get("date") or ""), + str(metadata.get("description") or ""), + str(metadata.get("data_count") or ""), + ) + standalone_by_release.setdefault(release_key, []).append(record) + + for release_key, records in standalone_by_release.items(): + role_counts: dict[str, int] = {} + for record in records: + role = str(record["role"]) + role_counts[role] = role_counts.get(role, 0) + 1 + + can_group = len(records) > 1 and all(count == 1 for count in role_counts.values()) + if can_group: + sample = records[0] + metadata = sample["metadata"] + release_label = ( + str(metadata.get("release_group") or "").strip() + or str(metadata.get("pilot_auto_version") or "").strip() + or "standalone_release" + ) + date_label = str(metadata.get("date") or "").strip() + display_name = f"{release_label} | {date_label}" if date_label else release_label + group_key = "standalone_group::" + "::".join(release_key) + grouped[group_key] = TrendReleaseGroup( + group_key=group_key, + display_name=display_name, + topic_name=str(sample["topic_name"]), + group_kind="standalone_release_group", + base_dir=Path(root_dir) if root_dir is not None else get_data_root(), + jobs={}, + ) + target_group = grouped[group_key] + for record in records: + target_group.jobs[str(record["role"])] = { + "role": record["role"], + "job_id": record["job_id"], + "metadata_path": record["metadata_path"].resolve(), + "summary_path": record["summary_path"].resolve(), + "metadata": record["metadata"], + "summary": record["summary"], + } + continue + + for record in records: + group_key = str(record["group_key"]) + 
grouped[group_key] = TrendReleaseGroup( + group_key=group_key, + display_name=str(record["display_name"]), + topic_name=str(record["topic_name"]), + group_kind=str(record["group_kind"]), + base_dir=record["base_dir"], + jobs={ + str(record["role"]): { + "role": record["role"], + "job_id": record["job_id"], + "metadata_path": record["metadata_path"].resolve(), + "summary_path": record["summary_path"].resolve(), + "metadata": record["metadata"], + "summary": record["summary"], + } + }, + ) + def _sort_key(group: TrendReleaseGroup) -> tuple[str, str]: dates = [ str(job["metadata"].get("date") or "") @@ -395,6 +488,22 @@ def _load_only_full_summary(summary_path: Path) -> list[dict[str, Any]]: return _extract_full_metric_tables(summary) +def ensure_full_trend_summary(summary_path: str | Path) -> Path: + """Validate that analyzer block generation produced a full trend summary.""" + path = Path(summary_path) + if not path.exists(): + raise FileNotFoundError( + f"Full trend summary was not created: {path}. " + "The analyzer must write resources/summary.json before trend PDF generation." 
+ ) + summary = load_trend_summary_file(path) + role = classify_trend_summary(summary) + if role != "full": + raise ValueError(f"Expected a full trend summary at {path}, but it classified as `{role}`.") + extract_performance_metrics_from_summary(summary) + return path + + def extract_performance_metrics_from_summary(summary: dict[str, Any]) -> dict[str, float]: """Return averaged full-performance metrics from a full summary payload.""" data_list = _extract_full_metric_tables(summary) @@ -743,6 +852,55 @@ def _scene_dataframe_from_dir_compat( return from_dir(run_path) +_CURRENT_NUMERIC_COLUMNS = { + "unix_time", + "x", + "y", + "confidence", + "pointcloud_num", + "visibility", + "x_error", + "y_error", + "yaw_error", + "speed_error", + "frame_index", +} +_FUTURE_NUMERIC_COLUMNS = { + "x", + "y", + "tx", + "ty", + "confidence", + "visibility", + "relative_time", + "pair_dt_sec", +} + + +def _coerce_numeric_columns(frame: pd.DataFrame, columns: set[str]) -> pd.DataFrame: + if frame.empty: + return frame + coerced = frame.copy() + for column in sorted(columns.intersection(coerced.columns)): + coerced[column] = pd.to_numeric(coerced[column], errors="coerce") + return coerced + + +def _coerce_specsheet_scene_numeric_columns(df): + """Normalize analyzer-loaded CSV values before NumPy-heavy specsheet metrics.""" + if hasattr(df, "current"): + df.current = _coerce_numeric_columns(df.current, _CURRENT_NUMERIC_COLUMNS) + if getattr(df, "future", None) is not None: + df.future = _coerce_numeric_columns(df.future, _FUTURE_NUMERIC_COLUMNS) + return df + if isinstance(df, pd.DataFrame): + return _coerce_numeric_columns( + df, + _CURRENT_NUMERIC_COLUMNS | _FUTURE_NUMERIC_COLUMNS, + ) + return df + + def _get_blocks_compat( get_blocks_func: Callable[..., tuple[Sequence[str], Sequence[str]]], *, @@ -934,6 +1092,7 @@ def generate_specsheet_pdf( run_path, topic_name=topic_name, ) + df = _coerce_specsheet_scene_numeric_columns(df) metrics = list(DEFAULT_SPECSHEET_METRICS) if 
getattr(df, "future", None) is not None: metrics.extend(FUTURE_SPECSHEET_METRICS) @@ -954,6 +1113,8 @@ def generate_specsheet_pdf( if include_trend: if trend_metadata is None: raise ValueError("Trend metadata is required when trend mode is enabled.") + _notify(progress_callback, "Validating full trend summary") + ensure_full_trend_summary(paths["trend_summary"]) _notify(progress_callback, "Saving trend metadata") write_trend_metadata(run_path, trend_metadata) metadata_list = discover_trend_metadata_files() diff --git a/evaluation_dashboard_app/lib/ui/task_history.py b/evaluation_dashboard_app/lib/ui/task_history.py index 100df7b..4471f8a 100644 --- a/evaluation_dashboard_app/lib/ui/task_history.py +++ b/evaluation_dashboard_app/lib/ui/task_history.py @@ -39,6 +39,7 @@ def _task_type_label(task_type: str) -> str: "build_parquet": "Build parquet", "download_and_eval": "Download + Eval", "run_evaluator_and_process": "Run Evaluator + Process", + "run_release_specsheet_workflow": "Release Specsheet", } return labels.get(task_type, task_type or "Task") @@ -68,6 +69,10 @@ def _task_summary(t: Dict[str, Any]) -> str: target = params.get("target_name", "") target_type = "tag" if params.get("is_tag", False) else "branch" return f"{target_type}={target} → {params.get('output_path', '')}" + if task_type == "run_release_specsheet_workflow": + target = params.get("target_name", "") + target_type = "tag" if params.get("is_tag", False) else "branch" + return f"{target_type}={target} → {params.get('output_path', '')}" return "" diff --git a/evaluation_dashboard_app/lib/ui/task_result_summary.py b/evaluation_dashboard_app/lib/ui/task_result_summary.py index 37a9234..b7e0038 100644 --- a/evaluation_dashboard_app/lib/ui/task_result_summary.py +++ b/evaluation_dashboard_app/lib/ui/task_result_summary.py @@ -174,5 +174,49 @@ def render_task_result_summary(summary: Dict[str, Any]) -> None: if evaluator_report_url: st.markdown(f"### [📊 View Evaluator Report]({evaluator_report_url})") + 
elif job == "run_release_specsheet_workflow": + st.subheader("Release Specsheet Summary") + st.write(f"📁 **Release root:** `{summary.get('release_root', '')}`") + st.write(f"🏷️ **Version:** `{summary.get('version', '')}`") + evaluator_jobs = summary.get("evaluator_jobs", {}) + if evaluator_jobs: + rows = [] + for role, payload in evaluator_jobs.items(): + rows.append( + { + "role": role, + "job_id": payload.get("job_id", ""), + "status": payload.get("status", ""), + "catalog_id": payload.get("catalog_id", ""), + "suite_count": payload.get("suite_count", ""), + "description": payload.get("description", ""), + "report_url": payload.get("report_url", ""), + } + ) + st.dataframe(pd.DataFrame(rows), width="stretch", hide_index=True) + analysis_artifacts = summary.get("analysis_artifacts", {}) + if analysis_artifacts: + st.write("🔎 **Detailed analysis artifacts:**") + rows = [] + for role, payload in analysis_artifacts.items(): + download = payload.get("download", {}) if isinstance(payload.get("download"), dict) else {} + eval_summary = payload.get("eval", {}) if isinstance(payload.get("eval"), dict) else {} + warnings = payload.get("warnings", []) if isinstance(payload.get("warnings"), list) else [] + rows.append( + { + "role": role, + "path": payload.get("path", ""), + "download_success": download.get("success", ""), + "download_total": download.get("total", ""), + "summary_rows": eval_summary.get("summary_rows", ""), + "score_rows": eval_summary.get("score_rows", ""), + "parquet_path": payload.get("parquet_path", ""), + "warnings": "; ".join(str(item) for item in warnings[:3]), + } + ) + st.dataframe(pd.DataFrame(rows), width="stretch", hide_index=True) + specsheet_pdf = summary.get("specsheet_pdf", "") + if specsheet_pdf: + st.write(f"✅ **Specsheet PDF:** `{specsheet_pdf}`") else: st.json(summary) diff --git a/evaluation_dashboard_app/pages/12_Prediction_Evaluation.py b/evaluation_dashboard_app/pages/12_Prediction_Evaluation.py index 2a96c54..515990c 100644 --- 
a/evaluation_dashboard_app/pages/12_Prediction_Evaluation.py +++ b/evaluation_dashboard_app/pages/12_Prediction_Evaluation.py @@ -13,7 +13,7 @@ from lib.overview_url_hydrate import try_hydrate_session_from_overview_query_params from lib.page_chrome import inject_app_page_styles, render_loaded_data_section, render_page_hero, section_header -from lib.path_utils import list_run_directories, path_display +from lib.path_utils import get_run_display_name, list_run_directories, path_display from lib.prediction_eval import build_specsheet_aligned_prediction_artifacts @@ -503,7 +503,7 @@ def merge_polar_compare(polar_a: pd.DataFrame, polar_b: pd.DataFrame) -> pd.Data run_dirs = list_run_directories() run_dirs = [p for p in run_dirs if _run_has_prediction_source(p)] -run_names = [p.name for p in run_dirs] +run_names = [get_run_display_name(p) for p in run_dirs] if not run_names: st.warning("No run directories with `future.parquet` or `future.csv` found under `data/`.") st.stop() @@ -512,7 +512,8 @@ def merge_polar_compare(polar_a: pd.DataFrame, polar_b: pd.DataFrame) -> pd.Data mode_default = "Compare Mode" if st.session_state.get("mode") == "Compare Mode" else "Single Run" mode = st.sidebar.selectbox("Mode", ["Single Run", "Compare Mode"], index=0 if mode_default == "Single Run" else 1) -default_run_name = st.session_state.get("runA", {}).get("path").name if st.session_state.get("runA") else run_names[0] +session_run_path = st.session_state.get("runA", {}).get("path") if st.session_state.get("runA") else None +default_run_name = get_run_display_name(session_run_path) if isinstance(session_run_path, Path) else run_names[0] if default_run_name not in run_names: default_run_name = run_names[0] @@ -530,8 +531,8 @@ def merge_polar_compare(polar_a: pd.DataFrame, polar_b: pd.DataFrame) -> pd.Data default_b = compare_candidates[0] selected_run_b = st.sidebar.selectbox("Candidate (B)", compare_candidates, index=compare_candidates.index(default_b)) -run_path_a = next(p for p in 
run_dirs if p.name == selected_run_a) -run_path_b = next((p for p in run_dirs if p.name == selected_run_b), None) +run_path_a = next(p for p in run_dirs if get_run_display_name(p) == selected_run_a) +run_path_b = next((p for p in run_dirs if get_run_display_name(p) == selected_run_b), None) metadata_a = load_prediction_metadata(str(run_path_a)) cache_ready_a = prediction_artifacts_ready(run_path_a) metadata_b = load_prediction_metadata(str(run_path_b)) if run_path_b is not None else None diff --git a/evaluation_dashboard_app/pages/6_Workflow.py b/evaluation_dashboard_app/pages/6_Workflow.py index 63038f3..9b5db30 100644 --- a/evaluation_dashboard_app/pages/6_Workflow.py +++ b/evaluation_dashboard_app/pages/6_Workflow.py @@ -64,6 +64,10 @@ _JST = timezone(timedelta(hours=9)) _TASK_LIST_MAX_ROWS = 200 _TASK_LIST_SINCE_DAYS = 7 +_RELEASE_PERFORMANCE_CATALOG_ID = "e36d75b9-6c3a-4970-9b9b-5cd13f7a9da3" +_RELEASE_PERFORMANCE_INTEGRATION_ID = "96ad8fba-0228-4c2b-9166-07d4de1a0760" +_RELEASE_DEVOPS_CATALOG_ID = "ab0f8498-cc1b-4726-836f-e18e8bcb3200" +_RELEASE_DEVOPS_INTEGRATION_ID = "295cff78-9bc9-4d60-b7aa-f95be6ff96a4" _TASK_HISTORY_RANGE_OPTIONS = { "7 days": 7, "30 days": 30, @@ -251,6 +255,11 @@ def _make_default_output_path(branch_name: str) -> str: return f"eval_{clean_branch}_{ts}" +def _safe_output_part(value: object, fallback: str) -> str: + text = re.sub(r"[^\w.\-]+", "_", str(value or "").strip()).strip("._") + return text or fallback + + def _catalog_preset_emoji(preset_name: str, *, has_custom_catalog: bool = False) -> str: mapping = { "Build Test Catalog": "🛠️", @@ -285,6 +294,13 @@ def _make_auto_workflow_description( ) +def _make_auto_release_workflow_description(target_name: str) -> str: + clean_target = str(target_name or "").strip() or "default" + clean_target = re.sub(r"\s+", " ", clean_target) + stamp = datetime.now().strftime("%m-%d %H:%M") + return f"🚀 release workflow [{clean_target}] [{stamp}]" + + def _format_run_mtime(mtime: float) -> str: if 
not mtime: return "—" @@ -1808,6 +1824,17 @@ def _render_start_workflow_form( f"{item['display_name']} ({item['catalog_id']})": item for item in server_catalogs } + release_mode = st.checkbox( + "Release data workflow: schedule Performance Test + Devops Test", + value=bool(st.session_state.get("workflow_release_mode", False)), + key="workflow_release_mode", + help="Queues the two standard release evaluator jobs, processes both as normal app runs, then generates a release specsheet with trend data.", + ) + if release_mode: + st.info( + "Release mode uses the app-native flow: schedule Performance Test and Devops Test, create normal CSV/parquet analysis folders, write release metadata, and generate the trend-enabled specsheet PDF." + ) + top_cols = st.columns([1.0, 1.9, 1.2]) with top_cols[0]: st.markdown('
Project
', unsafe_allow_html=True) @@ -1828,12 +1855,14 @@ def _render_start_workflow_form( key="workflow_catalog_name", label_visibility="collapsed", format_func=lambda value: value or "Choose a catalog", + disabled=release_mode, ) with catalog_picker_cols[1]: fetch_catalogs_clicked = st.button( "Fetch", key="workflow_fetch_server_catalogs", use_container_width=True, + disabled=release_mode, ) if fetch_catalogs_clicked: try: @@ -1891,21 +1920,36 @@ def _render_start_workflow_form( picker_cols = st.columns([1.2, 1.2, 1.75]) with picker_cols[0]: - st.markdown('
Output folder
', unsafe_allow_html=True) + st.markdown( + f'
{"Release output folder" if release_mode else "Output folder"}
', + unsafe_allow_html=True, + ) output_path = st.text_input( - "Output folder", + "Release output folder" if release_mode else "Output folder", value=default_output, key="workflow_output_path", label_visibility="collapsed", placeholder=_make_default_output_path(target_name), + help=( + "Folder under data/. Release mode creates metadata.yaml, performance/, devops/, and specsheet/ in this single folder." + if release_mode + else "Output folder under the data directory." + ), ).strip() with picker_cols[1]: st.markdown('
Phase
', unsafe_allow_html=True) + phase_value = "perception.object_recognition.tracking.objects" if release_mode else default_phase phase = st.text_input( "Phase", - value=default_phase, + value=phase_value, key="workflow_phase", label_visibility="collapsed", + disabled=release_mode, + help=( + "Release mode uses this standard phase automatically for both detailed-analysis downloads." + if release_mode + else None + ), ) with picker_cols[2]: st.markdown('
Description
', unsafe_allow_html=True) @@ -1917,12 +1961,73 @@ def _render_start_workflow_form( placeholder="Optional label for the evaluator run", ).strip() + trend_metadata: Dict[str, object] = {} + if release_mode: + release_cols = st.columns([1.15, 1.1, 0.8]) + with release_cols[0]: + release_group = st.text_input( + "Release group", + value=st.session_state.get("workflow_release_group", _safe_output_part(target_name, "release")), + key="workflow_release_group", + help="Stable key used to group this release with older app-generated trend history.", + ).strip() + with release_cols[1]: + pilot_auto_version = st.text_input( + "Pilot.Auto version", + value=st.session_state.get("workflow_release_pilot_auto_version", ""), + key="workflow_release_pilot_auto_version", + placeholder='Pilot.Auto v4.4.0 (bevfusion x2/2.5.1)', + ).strip() + with release_cols[2]: + release_date = st.text_input( + "Release date", + value=st.session_state.get("workflow_release_date", datetime.now(_JST).strftime("%Y.%m.%d")), + key="workflow_release_date", + placeholder="2026.5.22", + ).strip() + release_meta_cols = st.columns([0.8, 1.1, 1.1]) + with release_meta_cols[0]: + data_count = st.text_input( + "Data count", + value=st.session_state.get("workflow_release_data_count", ""), + key="workflow_release_data_count", + placeholder="123,708+", + ).strip() + with release_meta_cols[1]: + release_description = st.text_input( + "Release description", + value=st.session_state.get("workflow_release_description", ""), + key="workflow_release_description", + ).strip() + with release_meta_cols[2]: + release_topic_name = st.text_input( + "Trend topic", + value=st.session_state.get("workflow_release_topic_name", "perception.object_recognition.objects"), + key="workflow_release_topic_name", + ).strip() + trend_metadata = { + "tags": ["trend"], + "release_group": release_group, + "pilot_auto_version": pilot_auto_version, + "data_count": data_count, + "description": release_description, + "date": release_date, + 
"topic_name": release_topic_name, + } + st.caption( + "Normal detailed-analysis outputs are generated automatically under `performance/` and `devops/`; the release PDF is copied to `specsheet/`." + ) + confirm_cols = st.columns([1.0, 1.0]) with confirm_cols[0]: - if catalog_id: + if release_mode: + st.caption(f"Performance catalog: `{_RELEASE_PERFORMANCE_CATALOG_ID}`") + elif catalog_id: st.caption(f"Catalog ID: `{catalog_id}`") with confirm_cols[1]: - if integration_id: + if release_mode: + st.caption(f"DevOps catalog: `{_RELEASE_DEVOPS_CATALOG_ID}`") + elif integration_id: st.caption(f"Integration ID: `{integration_id}`") if st.session_state.get("workflow_catalog_resolution_error"): st.warning(f"Could not resolve integration automatically: {st.session_state['workflow_catalog_resolution_error']}") @@ -1944,6 +2049,12 @@ def _render_start_workflow_form( horizontal=True, index=0 if default_download_type == "Archives (ZIP)" else 1, key="workflow_download_type", + disabled=release_mode, + help=( + "Release mode always downloads archives so Summary.csv, Score.csv, and parquet can be generated." 
+ if release_mode + else None + ), ) with adv_cols[1]: environment = st.selectbox( @@ -1973,29 +2084,45 @@ def _render_start_workflow_form( option_cols = st.columns(5) with option_cols[0]: - run_eval = st.checkbox("Run evaluation", value=True, key="workflow_run_eval") + run_eval = st.checkbox( + "Run evaluation", + value=False if release_mode else True, + key="workflow_run_eval", + disabled=release_mode, + help="Release mode runs evaluation automatically for both release jobs.", + ) with option_cols[1]: generate_parquet = st.checkbox( "Generate parquet", - value=CATALOG_IO_AVAILABLE, - disabled=not CATALOG_IO_AVAILABLE, + value=False if release_mode else CATALOG_IO_AVAILABLE, + disabled=release_mode or not CATALOG_IO_AVAILABLE, key="workflow_generate_parquet", + help="Release mode generates detailed-analysis CSV/parquet automatically under performance/ and devops/.", ) with option_cols[2]: skip_large_file = st.checkbox( "Skip large files", - value=default_skip_large_file, + value=False if release_mode else default_skip_large_file, key="workflow_skip_large_file", + disabled=release_mode, + help="Release mode keeps the standard release artifacts needed for analysis.", ) with option_cols[3]: - eval_recursive = st.checkbox("Recursive scan", value=True, key="workflow_eval_recursive") + eval_recursive = st.checkbox( + "Recursive scan", + value=False if release_mode else True, + key="workflow_eval_recursive", + disabled=release_mode, + help="Not used in release mode.", + ) with option_cols[4]: is_tag = st.checkbox("Target is tag", value=False, key="workflow_is_tag") set_config_value("eval_project_id", project_id) set_config_value("target_name", target_name) - set_config_value("eval_download_type", download_type) - set_config_value("eval_phase", phase) + if not release_mode: + set_config_value("eval_download_type", download_type) + set_config_value("eval_phase", phase) set_config_value("poll_interval", poll_interval) set_config_value("max_wait_hours", max_wait_hours) 
set_config_value("environment", environment) @@ -2003,17 +2130,26 @@ def _render_start_workflow_form( errors = [] if not project_id: errors.append("Project ID") - if not catalog_id: + if not release_mode and not catalog_id: errors.append("Catalog") - if not integration_id: + if not release_mode and not integration_id: errors.append("Integration ID") if not target_name: errors.append("Branch or tag") + if release_mode: + if not trend_metadata.get("release_group"): + errors.append("Release group") + if not trend_metadata.get("pilot_auto_version"): + errors.append("Pilot.Auto version") + if not trend_metadata.get("data_count"): + errors.append("Data count") + if not trend_metadata.get("date"): + errors.append("Release date") resolved_output = None path_error = "" if output_path: - resolved_output, path_error = resolve_under_data_root(output_path, allow_create=False) + resolved_output, path_error = resolve_under_data_root(output_path, allow_missing=True) if path_error: errors.append(path_error) else: @@ -2044,10 +2180,12 @@ def _render_start_workflow_form( "phase": phase, "poll_interval": int(poll_interval), "max_wait_hours": int(max_wait_hours), - "run_eval": bool(run_eval), - "generate_parquet": bool(generate_parquet), - "skip_large_file": bool(skip_large_file), - "eval_recursive": bool(eval_recursive), + "run_eval": False if release_mode else bool(run_eval), + "generate_parquet": False if release_mode else bool(generate_parquet), + "skip_large_file": False if release_mode else bool(skip_large_file), + "eval_recursive": False if release_mode else bool(eval_recursive), + "release_mode": bool(release_mode), + "trend_metadata": trend_metadata if release_mode else {}, }, } @@ -2102,36 +2240,78 @@ def _workflow_start_dialog() -> None: elif not is_task_queue_enabled(): st.error("Task queue not enabled. 
Set `USE_TASK_QUEUE=true` and `REDIS_URL`.") else: + common_params = { + "project_id": dialog_payload["project_id"], + "suite_ids": None, + "target_name": dialog_payload["target_name"], + "environment": dialog_payload["environment"], + "max_retries": 0, + "clean_build": False, + "debug": False, + "release": False, + "record_caret": False, + "log_expiration_time_in_days": 14.0, + "is_tag": dialog_payload["is_tag"], + "download_type": "archives" if dialog_payload["download_type"] == "Archives (ZIP)" else "result_json", + "phase": dialog_payload["phase"], + "skip_large_file": bool(dialog_payload.get("skip_large_file", True)), + "large_file_mb": 50.0, + "keep_zip_files": False, + "poll_interval": dialog_payload["poll_interval"], + "max_wait_seconds": dialog_payload["max_wait_hours"] * 3600, + "run_eval": dialog_payload["run_eval"], + "generate_parquet": dialog_payload["generate_parquet"], + "eval_recursive": dialog_payload["eval_recursive"], + "eval_overwrite": False, + } + if dialog_payload.get("release_mode"): + base_description = dialog_payload["description"] or _make_auto_release_workflow_description( + dialog_payload["target_name"] + ) + trend_metadata = dict(dialog_payload.get("trend_metadata") or {}) + task_id = _enqueue_task( + "run_release_specsheet_workflow", + { + "project_id": dialog_payload["project_id"], + "target_name": dialog_payload["target_name"], + "description": base_description, + "output_path": dialog_payload["resolved_output"], + "environment": dialog_payload["environment"], + "is_tag": dialog_payload["is_tag"], + "poll_interval": dialog_payload["poll_interval"], + "max_wait_seconds": dialog_payload["max_wait_hours"] * 3600, + "trend_metadata": trend_metadata, + "version": trend_metadata.get("pilot_auto_version", ""), + "topic": trend_metadata.get("topic_name", "perception.object_recognition.objects"), + "performance_catalog_id": _RELEASE_PERFORMANCE_CATALOG_ID, + "performance_integration_id": _RELEASE_PERFORMANCE_INTEGRATION_ID, + 
"devops_catalog_id": _RELEASE_DEVOPS_CATALOG_ID, + "devops_integration_id": _RELEASE_DEVOPS_INTEGRATION_ID, + "analysis_phase": "perception.object_recognition.tracking.objects", + "overwrite": True, + }, + ) + if task_id: + st.session_state["workflow_start_dialog_open"] = False + st.success(f"Release specsheet workflow queued. Task id: `{task_id}`") + st.rerun() + else: + st.error("Failed to enqueue release specsheet workflow. Check worker logs.") + return + task_id = _enqueue_task( "run_evaluator_and_process", { - "project_id": dialog_payload["project_id"], + **common_params, "catalog_id": dialog_payload["catalog_id"], "integration_id": dialog_payload["integration_id"], - "suite_ids": None, - "target_name": dialog_payload["target_name"], + "catalog_preset_name": dialog_payload.get("catalog_preset_name", ""), "description": dialog_payload["description"] or _make_auto_workflow_description( dialog_payload["target_name"], dialog_payload.get("catalog_preset_name", ""), has_custom_catalog=bool(dialog_payload.get("has_custom_catalog", False)), ), "output_path": dialog_payload["resolved_output"], - "environment": dialog_payload["environment"], - "max_retries": 0, - "clean_build": False, - "debug": False, - "is_tag": dialog_payload["is_tag"], - "download_type": "archives" if dialog_payload["download_type"] == "Archives (ZIP)" else "result_json", - "phase": dialog_payload["phase"], - "skip_large_file": bool(dialog_payload.get("skip_large_file", True)), - "large_file_mb": 50.0, - "keep_zip_files": False, - "poll_interval": dialog_payload["poll_interval"], - "max_wait_seconds": dialog_payload["max_wait_hours"] * 3600, - "run_eval": dialog_payload["run_eval"], - "generate_parquet": dialog_payload["generate_parquet"], - "eval_recursive": dialog_payload["eval_recursive"], - "eval_overwrite": False, }, ) if task_id: diff --git a/evaluation_dashboard_app/worker/tasks.py b/evaluation_dashboard_app/worker/tasks.py index 44286b3..058e09f 100644 --- 
a/evaluation_dashboard_app/worker/tasks.py +++ b/evaluation_dashboard_app/worker/tasks.py @@ -5,10 +5,15 @@ import os import re +import json +import shutil import sys import time +from pathlib import Path from typing import Any, Dict, Optional +import yaml + # App root on path for lib imports _APP_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) if _APP_ROOT not in sys.path: @@ -26,6 +31,12 @@ resolve_run_directory_from_task_parameters, upsert_run_metadata, ) +from lib.specsheet_report import write_trend_metadata + +_RELEASE_PERFORMANCE_CATALOG_ID = "e36d75b9-6c3a-4970-9b9b-5cd13f7a9da3" +_RELEASE_PERFORMANCE_INTEGRATION_ID = "96ad8fba-0228-4c2b-9166-07d4de1a0760" +_RELEASE_DEVOPS_CATALOG_ID = "ab0f8498-cc1b-4726-836f-e18e8bcb3200" +_RELEASE_DEVOPS_INTEGRATION_ID = "295cff78-9bc9-4d60-b7aa-f95be6ff96a4" # Optional imports for tasks that need them def _import_eval_summary(): @@ -1006,6 +1017,432 @@ def job_download_and_eval(task_id: str, parameters: Dict[str, Any]) -> None: raise +def _write_release_metadata_file(path: Path, metadata: Dict[str, Any]) -> Path: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as fh: + yaml.safe_dump(metadata, fh, allow_unicode=True, sort_keys=False) + return path + + +def _build_devops_trend_summary_from_suites(rows: list[dict[str, Any]]) -> Dict[str, Any]: + summary_payload: Dict[str, Any] = {"DevOps": {"Suite pass rate": {}}} + for row in rows or []: + suite_name = str(row.get("name") or row.get("suite_name") or row.get("simulation") or "suite").strip() + total = int(row.get("all", 0) or row.get("total", 0) or 0) + passed = int(row.get("success", 0) or row.get("passed", 0) or 0) + if total <= 0: + failed = int(row.get("fail", 0) or row.get("failed", 0) or 0) + canceled = int(row.get("cancel", 0) or row.get("canceled", 0) or 0) + total = passed + failed + canceled + if total <= 0: + continue + summary_payload["DevOps"]["Suite pass rate"][suite_name] = { + "passed": passed, + 
"total": total, + } + return summary_payload + + +def _write_devops_trend_summary(path: Path, rows: list[dict[str, Any]]) -> Path | None: + summary_payload = _build_devops_trend_summary_from_suites(rows) + if not summary_payload["DevOps"]["Suite pass rate"]: + return None + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as fh: + json.dump(summary_payload, fh, ensure_ascii=False, indent=2) + return path + + +def _build_release_analysis_artifacts( + *, + task_id: str, + project_id: str, + job_id: str, + role: str, + output_path: Path, + phase: str, +) -> Dict[str, Any]: + """Create the normal app analysis files for a release job.""" + from lib import download_core + + eval_summary = _import_eval_summary() + pkl_archive_to_parquet = _import_catalog_io() + output_path.mkdir(parents=True, exist_ok=True) + result: Dict[str, Any] = { + "path": str(output_path), + "download": {}, + "eval": {}, + "parquet_path": "", + "warnings": [], + } + + def _on_progress(msg: str) -> None: + append_task_log(task_id, f"{role}: {msg}") + + def _on_warning(msg: str) -> None: + result["warnings"].append(msg) + append_task_log(task_id, f"WARNING: {role}: {msg}") + + failure_count, total_attempted, rows = download_core.run_download_results( + project_id=project_id, + job_id=job_id, + suite_id=None, + output_path=str(output_path), + download_type="archives", + phase=phase, + skip_large_file=False, + large_file_mb=50.0, + keep_zip_files=False, + suite_ids=None, + on_progress=_on_progress, + on_warning=_on_warning, + ) + success_count = total_attempted - failure_count + result["download"] = { + "total": total_attempted, + "success": success_count, + "failed": failure_count, + "rows": rows[:100], + } + if success_count <= 0: + raise RuntimeError(f"{role}: download produced no successful case artifacts.") + + if eval_summary: + target_dirs = eval_summary.find_eval_result_dirs(str(output_path), recursive=True) + statuses = [] + for result_dir in target_dirs: 
+ statuses.append(eval_summary.run_eval_result_for_dir(result_dir, overwrite=False)) + if target_dirs: + csv_info = eval_summary.generate_summary_and_score_csv(str(output_path)) + result["eval"] = { + "directories_processed": len(target_dirs), + "success": sum(1 for item in statuses if item.get("status") == "success"), + "failed": sum(1 for item in statuses if item.get("status") == "failed"), + "skipped": sum(1 for item in statuses if item.get("status") == "skipped"), + "summary_path": csv_info.get("summary_path", ""), + "summary_rows": csv_info.get("summary_rows", 0), + "score_rows": csv_info.get("score_rows", 0), + } + else: + result["eval"] = { + "directories_processed": 0, + "success": 0, + "failed": 0, + "skipped": 0, + } + + if pkl_archive_to_parquet: + try: + result["parquet_path"] = pkl_archive_to_parquet( + str(output_path), + on_progress=None, + on_skip=None, + project_id=project_id, + job_id=job_id, + ) or "" + except Exception as exc: + warning = f"Parquet generation failed: {exc}" + result["warnings"].append(warning) + append_task_log(task_id, f"WARNING: {role}: {warning}") + + append_task_log( + task_id, + ( + f"{role}: analysis artifacts ready at {output_path} " + f"({success_count}/{total_attempted} downloads)" + ), + ) + return result + + +def job_run_release_specsheet_workflow(task_id: str, parameters: Dict[str, Any]) -> None: + """Schedule the standard release evaluator jobs, process them as app-native runs, then build a release specsheet.""" + update_task_status(task_id, "running") + append_task_log(task_id, "Starting release specsheet workflow") + _mark_run_status( + task_id, + parameters, + task_type="run_release_specsheet_workflow", + status="running", + create_missing=True, + ) + try: + from lib import evaluator_api + from lib.specsheet_report import ( + DEFAULT_SPECSHEET_LABELS, + DEFAULT_SPECSHEET_TOPIC, + generate_specsheet_pdf, + ) + + project_id = str(parameters.get("project_id") or "").strip() + target_name = 
str(parameters.get("target_name") or "").strip() + output_path = str(parameters.get("output_path") or "").strip() + environment = str(parameters.get("environment") or "default").strip() or "default" + is_tag = bool(parameters.get("is_tag", False)) + metadata = parameters.get("trend_metadata") if isinstance(parameters.get("trend_metadata"), dict) else {} + version = str(parameters.get("version") or metadata.get("pilot_auto_version") or "").strip() + topic = str(parameters.get("topic") or metadata.get("topic_name") or DEFAULT_SPECSHEET_TOPIC).strip() + description = str(parameters.get("description") or target_name or "").strip() + poll_interval = float(parameters.get("poll_interval", 60.0)) + max_wait_seconds = float(parameters.get("max_wait_seconds", 3600.0 * 24 * 7)) + analysis_phase = str( + parameters.get("analysis_phase") + or "perception.object_recognition.tracking.objects" + ).strip() + labels = parameters.get("labels") or DEFAULT_SPECSHEET_LABELS + labels = [str(label).strip() for label in labels if str(label).strip()] + if not labels: + labels = list(DEFAULT_SPECSHEET_LABELS) + + if not project_id or not target_name or not output_path or not version: + raise ValueError("Missing project_id, target_name, output_path, or Pilot.Auto version.") + if "trend" not in [str(tag).strip() for tag in metadata.get("tags", [])]: + raise ValueError("Release metadata must include tags: [trend].") + + release_root = Path(output_path) + release_root.mkdir(parents=True, exist_ok=True) + _write_release_metadata_file(release_root / "metadata.yaml", metadata) + release_specsheet_dir = release_root / "specsheet" + performance_path = release_root / "performance" + devops_path = release_root / "devops" + os.environ["AUTH_PROFILE"] = environment + os.environ["EVALUATOR_ENVIRONMENT"] = environment + + api = evaluator_api.EvaluationRunAPI() + jobs = [ + { + "role": "performance", + "label": "Performance Test", + "catalog_id": str(parameters.get("performance_catalog_id") or 
_RELEASE_PERFORMANCE_CATALOG_ID), + "integration_id": str(parameters.get("performance_integration_id") or _RELEASE_PERFORMANCE_INTEGRATION_ID), + }, + { + "role": "devops", + "label": "Devops Test", + "catalog_id": str(parameters.get("devops_catalog_id") or _RELEASE_DEVOPS_CATALOG_ID), + "integration_id": str(parameters.get("devops_integration_id") or _RELEASE_DEVOPS_INTEGRATION_ID), + }, + ] + summary: Dict[str, Any] = { + "job": "run_release_specsheet_workflow", + "release_root": str(release_root), + "version": version, + "topic": topic, + "evaluator_jobs": {}, + "analysis_artifacts": {}, + "specsheet_pdf": "", + } + update_task_result_summary(task_id, summary) + update_task_progress(task_id, message="Scheduling release evaluator jobs", pct=2) + + for item in jobs: + append_task_log(task_id, f"Scheduling {item['label']}: catalog={item['catalog_id']}") + schedule_description = f"{description} | {item['label']}" + result = api.schedule_job( + project_id=project_id, + catalog_id=item["catalog_id"], + integration_id=item["integration_id"], + target_name=target_name, + suite_ids=None, + max_retries=0, + description=schedule_description, + clean_build=True, + debug=False, + release=False, + record_caret=False, + log_expiration_time_in_days=10.0, + is_tag=is_tag, + ) + job_id = str(result.get("job_id") or "").strip() + if not job_id: + raise RuntimeError(f"No job_id returned for {item['label']}.") + item["job_id"] = job_id + report_url = evaluator_api.get_job_report_url(project_id, job_id) + summary["evaluator_jobs"][item["role"]] = { + "job_id": job_id, + "report_url": report_url, + "catalog_id": item["catalog_id"], + "integration_id": item["integration_id"], + "status": "scheduled", + "description": schedule_description, + } + append_task_log(task_id, f"Scheduled {item['label']}: {job_id}") + update_task_result_summary(task_id, summary) + + for idx, item in enumerate(jobs, start=1): + job_id = str(item["job_id"]) + label = str(item["label"]) + base_pct = 5 + (idx - 1) 
* 20 + + def _on_check(status: str, elapsed: float, *, role: str = str(item["role"]), pct_base: float = base_pct) -> None: + pct = min(pct_base + (elapsed / max_wait_seconds) * 18, pct_base + 18) + summary["evaluator_jobs"][role]["status"] = status + update_task_progress( + task_id, + message=f"{label}: {status} ({elapsed / 3600:.1f}h elapsed)", + pct=pct, + ) + update_task_result_summary(task_id, summary) + + append_task_log(task_id, f"Waiting for {label}: {job_id}") + final_report = api.wait_for_job_completion( + project_id=project_id, + job_id=job_id, + poll_interval=poll_interval, + max_wait_seconds=max_wait_seconds, + on_check=_on_check, + ) + status = evaluator_api.extract_job_status(final_report) + summary["evaluator_jobs"][item["role"]]["status"] = status + append_task_log(task_id, f"{label} completed with status: {status}") + try: + suite_rows = api.get_suite_summary(project_id, job_id, use_available_case_results=True) + except Exception as exc: + append_task_log(task_id, f"WARNING: Could not fetch suite summary for {label}: {exc}") + suite_rows = [] + item["suite_rows"] = suite_rows + summary["evaluator_jobs"][item["role"]]["suite_count"] = len(suite_rows) + update_task_result_summary(task_id, summary) + + update_task_progress(task_id, message="Building normal CSV/parquet analysis artifacts", pct=48) + role_paths = {"performance": performance_path, "devops": devops_path} + for item in jobs: + role = str(item["role"]) + analysis_path = role_paths[role] + artifact_summary = _build_release_analysis_artifacts( + task_id=task_id, + project_id=project_id, + job_id=str(item["job_id"]), + role=role, + output_path=analysis_path, + phase=analysis_phase, + ) + summary["analysis_artifacts"][role] = artifact_summary + update_task_result_summary(task_id, summary) + + child_params = { + **parameters, + "output_path": str(analysis_path), + "catalog_id": item["catalog_id"], + "integration_id": item["integration_id"], + "job_id": item["job_id"], + "download_type": 
"archives", + "phase": analysis_phase, + "run_eval": True, + "generate_parquet": True, + "eval_recursive": True, + } + _mark_run_status( + task_id, + child_params, + task_type="run_release_specsheet_workflow", + status="completed", + result_path=str(analysis_path), + create_missing=True, + extra={ + "release_specsheet": { + "root": str(release_root), + "role": role, + "metadata": metadata, + }, + "evaluator": { + "job_id": str(item["job_id"]), + "report_url": summary["evaluator_jobs"][role].get("report_url", ""), + "status": summary["evaluator_jobs"][role].get("status", ""), + "catalog_id": item["catalog_id"], + "integration_id": item["integration_id"], + "target_name": target_name, + "description": schedule_description, + "title": schedule_description, + }, + "download": { + **artifact_summary.get("download", {}), + "mode": "release_specsheet", + "download_type": "archives", + "phase": analysis_phase, + }, + "evaluation": { + **artifact_summary.get("eval", {}), + "enabled": True, + "recursive": True, + }, + "parquet": { + "enabled": True, + "path": artifact_summary.get("parquet_path", ""), + }, + }, + ) + + update_task_progress(task_id, message="Writing release trend summaries", pct=78) + write_trend_metadata(devops_path, metadata) + devops_job = next(item for item in jobs if item["role"] == "devops") + devops_summary_path = _write_devops_trend_summary( + devops_path / "resources" / "summary.json", + list(devops_job.get("suite_rows") or []), + ) + if devops_summary_path is None: + append_task_log(task_id, "WARNING: DevOps trend summary had no suite pass-rate rows.") + else: + append_task_log(task_id, f"DevOps trend summary written: {devops_summary_path}") + + update_task_progress(task_id, message="Generating app-native release specsheet", pct=82) + specsheet_pdf, generated = generate_specsheet_pdf( + performance_path, + project_id=project_id, + version=version, + labels=labels, + topic_name=topic, + include_trend=True, + trend_metadata=metadata, + 
force=bool(parameters.get("overwrite", True)), + progress_callback=lambda msg: append_task_log(task_id, f"specsheet: {msg}"), + ) + release_specsheet_dir.mkdir(parents=True, exist_ok=True) + release_pdf = release_specsheet_dir / "specsheet.pdf" + shutil.copy2(specsheet_pdf, release_pdf) + for asset in ("map_trend.png", "prediction_trend.png", "devops_trend.png"): + asset_path = specsheet_pdf.parent / asset + if asset_path.exists(): + shutil.copy2(asset_path, release_specsheet_dir / asset) + summary["specsheet_pdf"] = str(release_pdf) + summary["performance_specsheet_pdf"] = str(specsheet_pdf) + summary["specsheet_generated"] = bool(generated) + + update_task_progress(task_id, message="Release specsheet ready", pct=100) + update_task_result_summary(task_id, summary) + _mark_run_status( + task_id, + parameters, + task_type="run_release_specsheet_workflow", + status="completed", + result_path=str(release_pdf), + extra={ + "release_specsheet": { + "root": str(release_root), + "specsheet_pdf": str(release_pdf), + "performance_specsheet_pdf": str(specsheet_pdf), + "evaluator_jobs": summary["evaluator_jobs"], + "analysis_artifacts": summary["analysis_artifacts"], + "metadata": metadata, + } + }, + ) + append_task_log(task_id, f"Release specsheet PDF ready: {release_pdf}") + update_task_status(task_id, "completed", result_path=str(release_pdf)) + except Exception as e: + append_task_log(task_id, f"Failed: {e}") + _mark_run_status( + task_id, + parameters, + task_type="run_release_specsheet_workflow", + status="failed", + error_message=str(e), + create_missing=True, + ) + update_task_status(task_id, "failed", error_message=str(e)) + raise + + def job_run_evaluator_and_process(task_id: str, parameters: Dict[str, Any]) -> None: """ Full combined workflow: Run Evaluator + Download + Eval + Parquet. 
@@ -1044,6 +1481,33 @@ def job_run_evaluator_and_process(task_id: str, parameters: Dict[str, Any]) -> N target_name = parameters.get("target_name") # branch name or tag description = parameters.get("description", "no description") output_path = parameters.get("output_path") + trend_metadata = parameters.get("trend_metadata") if isinstance(parameters.get("trend_metadata"), dict) else None + trend_role = str(parameters.get("trend_role") or "").strip() + + def _write_devops_trend_summary_from_suites(rows: list[dict[str, Any]]) -> None: + if not output_path: + return + summary_payload: Dict[str, Any] = {"DevOps": {"Suite pass rate": {}}} + for row in rows or []: + suite_name = str(row.get("name") or row.get("suite_name") or row.get("simulation") or "suite").strip() + total = int(row.get("all", 0) or row.get("total", 0) or 0) + passed = int(row.get("success", 0) or row.get("passed", 0) or 0) + if total <= 0: + failed = int(row.get("fail", 0) or row.get("failed", 0) or 0) + canceled = int(row.get("cancel", 0) or row.get("canceled", 0) or 0) + total = passed + failed + canceled + if total <= 0: + continue + summary_payload["DevOps"]["Suite pass rate"][suite_name] = { + "passed": passed, + "total": total, + } + if not summary_payload["DevOps"]["Suite pass rate"]: + return + resource_dir = Path(output_path) / "resources" + resource_dir.mkdir(parents=True, exist_ok=True) + with (resource_dir / "summary.json").open("w", encoding="utf-8") as fh: + json.dump(summary_payload, fh, ensure_ascii=False, indent=2) # Eval options run_eval = parameters.get("run_eval", True) @@ -1071,6 +1535,9 @@ def job_run_evaluator_and_process(task_id: str, parameters: Dict[str, Any]) -> N clean_build = parameters.get("clean_build", False) debug = parameters.get("debug", False) is_tag = parameters.get("is_tag", False) + release = bool(parameters.get("release", False)) + record_caret = bool(parameters.get("record_caret", False)) + log_expiration_time_in_days = 
float(parameters.get("log_expiration_time_in_days", 14.0)) has_source_job = bool(source_job_id) has_fresh_source = bool(integration_id and target_name) @@ -1123,6 +1590,9 @@ def on_warning(msg: str) -> None: description=description, clean_build=clean_build, debug=debug, + release=release, + record_caret=record_caret, + log_expiration_time_in_days=log_expiration_time_in_days, is_tag=is_tag, ) except Exception as e: @@ -1185,6 +1655,25 @@ def on_warning(msg: str) -> None: } }, ) + + if trend_metadata: + try: + write_trend_metadata(output_path, trend_metadata) + append_task_log(task_id, "Saved release trend metadata.") + _update_run_metadata( + task_id, + parameters, + task_type="run_evaluator_and_process", + extra={ + "trend": { + "enabled": True, + "role": trend_role, + "metadata": trend_metadata, + } + }, + ) + except Exception as e: + append_task_log(task_id, f"WARNING: Could not save release trend metadata: {e}") # Step 2: Poll for evaluator completion on_progress("Step 2/5: Waiting for evaluator to complete...") @@ -1310,6 +1799,13 @@ def on_eval_progress(status: str, elapsed: float) -> None: append_task_log(task_id, f"Could not fetch case reports: {e}") case_reports = [] + if trend_metadata and trend_role == "devops": + try: + _write_devops_trend_summary_from_suites(suite_rows) + append_task_log(task_id, "Saved DevOps trend summary.") + except Exception as e: + append_task_log(task_id, f"WARNING: Could not save DevOps trend summary: {e}") + evaluator_summary = _build_evaluator_result_summary( job_id=job_id, report_url=report_url, @@ -1606,6 +2102,7 @@ def on_eval_progress(status: str, elapsed: float) -> None: "download_results": job_download_results, "download_scenarios": job_download_scenarios, "download_and_eval": job_download_and_eval, + "run_release_specsheet_workflow": job_run_release_specsheet_workflow, "run_evaluator_and_process": job_run_evaluator_and_process, } From 56817183492c141e5f36b93e65e197202b4f0131 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: 
Tue, 26 May 2026 15:03:25 +0900 Subject: [PATCH 83/94] feat: enhance evaluation dashboard with new release container detection and query handling - Added a function to identify release containers based on the presence of specific directories and metadata files. - Updated the run directory listing logic to exclude release containers from the displayed runs. - Improved query parameter handling in the Data Management page for better URL construction and clarity. - Enhanced the Trend Insights page to display major metrics (mAP, precision, recall) and updated visualizations accordingly. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/lib/path_utils.py | 10 ++- .../pages/13_Trend_Insights.py | 66 ++++++++++------ evaluation_dashboard_app/pages/6_Workflow.py | 52 +++++++++++- .../pages/7_Data_Management.py | 6 +- evaluation_dashboard_app/worker/tasks.py | 79 +++++++++++++------ 5 files changed, 157 insertions(+), 56 deletions(-) diff --git a/evaluation_dashboard_app/lib/path_utils.py b/evaluation_dashboard_app/lib/path_utils.py index fc766e3..72a2c9b 100644 --- a/evaluation_dashboard_app/lib/path_utils.py +++ b/evaluation_dashboard_app/lib/path_utils.py @@ -122,6 +122,14 @@ def _looks_like_analysis_run(path: Path) -> bool: ) +def _looks_like_release_container(path: Path) -> bool: + return ( + (path / "metadata.yaml").exists() + and any((path / name).is_dir() for name in ("performance", "devops")) + and not _looks_like_analysis_run(path) + ) + + def get_run_display_name(run_path: Path) -> str: """Return a stable run selector name relative to the data root.""" root = get_data_root() @@ -140,7 +148,7 @@ def list_run_directories() -> List[Path]: seen = set() for child in sorted([p for p in root.iterdir() if p.is_dir()]): resolved = child.resolve() - if resolved not in seen: + if resolved not in seen and not _looks_like_release_container(child): runs.append(child) seen.add(resolved) for release_child_name in ("performance", 
"devops"): diff --git a/evaluation_dashboard_app/pages/13_Trend_Insights.py b/evaluation_dashboard_app/pages/13_Trend_Insights.py index 093a4e4..084897b 100644 --- a/evaluation_dashboard_app/pages/13_Trend_Insights.py +++ b/evaluation_dashboard_app/pages/13_Trend_Insights.py @@ -654,6 +654,8 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame "description", "data_count", "mAP", + "precision", + "recall", "overall_pass_rate", "roles", "full_job_id", @@ -668,12 +670,13 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame hide_index=True, ) -section_header("mAP Trend") +section_header("Major Metrics Trend") perf_entries = release_df[release_df["full_job_id"].notna()].sort_values( ["date_sort", "version", "release_name"], ascending=[True, True, True], ) +major_metric_cols = ["mAP", "precision", "recall"] prediction_cols = [ "minADE@1s", "minADE@3s", @@ -682,16 +685,19 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame "minFDE@3s", "minFDE@5s", ] -if not perf_entries.empty and perf_entries["mAP"].notna().any(): - latest_map_row = perf_entries.dropna(subset=["mAP"]).iloc[-1] - map_card_col1, map_card_col2 = st.columns(2) - map_card_col1.metric( - "Latest mAP", - f"{latest_map_row['mAP']:.3f}" if pd.notna(latest_map_row["mAP"]) else "n/a", - ) - map_card_col2.metric( +if not perf_entries.empty and perf_entries[major_metric_cols].notna().any().any(): + latest_major_row = perf_entries.dropna(subset=major_metric_cols, how="all").iloc[-1] + metric_card_cols = st.columns(4) + for metric_col, card_col in zip(major_metric_cols, metric_card_cols[:3]): + metric_series = perf_entries.dropna(subset=[metric_col]) + latest_metric_value = metric_series[metric_col].iloc[-1] if not metric_series.empty else pd.NA + card_col.metric( + f"Latest {metric_col}", + f"{latest_metric_value:.3f}" if pd.notna(latest_metric_value) else "n/a", + ) + metric_card_cols[3].metric( "Latest Data Count", - 
f"{int(latest_map_row['data_count_num']):,}" if pd.notna(latest_map_row["data_count_num"]) else "n/a", + f"{int(latest_major_row['data_count_num']):,}" if pd.notna(latest_major_row["data_count_num"]) else "n/a", ) fig = go.Figure() fig.add_bar( @@ -702,21 +708,35 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame opacity=0.5, yaxis="y2", ) - fig.add_trace( - go.Scatter( - x=perf_entries["version"], - y=perf_entries["mAP"], - name="mAP", - mode="lines+markers", - line=dict(color="#0f766e", width=3), - customdata=perf_entries[["release_name", "date", "data_count"]].to_numpy(), - hovertemplate="%{x}
mAP: %{y:.3f}
Release: %{customdata[0]}
Date: %{customdata[1]}
Data Count: %{customdata[2]}", + metric_styles = { + "mAP": {"color": "#0f766e", "dash": "solid"}, + "precision": {"color": "#1d4ed8", "dash": "solid"}, + "recall": {"color": "#be123c", "dash": "dot"}, + } + for metric_col in major_metric_cols: + fig.add_trace( + go.Scatter( + x=perf_entries["version"], + y=perf_entries[metric_col], + name=metric_col, + mode="lines+markers", + line=dict( + color=metric_styles[metric_col]["color"], + width=3, + dash=metric_styles[metric_col]["dash"], + ), + customdata=perf_entries[["release_name", "date", "data_count"]].to_numpy(), + hovertemplate=( + "%{x}
" + + metric_col + + ": %{y:.3f}
Release: %{customdata[0]}
Date: %{customdata[1]}
Data Count: %{customdata[2]}" + ), + ) ) - ) fig.update_layout( - title="mAP Trend", + title="Major Detection Metrics Trend", xaxis_title="Pilot.Auto Version", - yaxis_title="mAP", + yaxis_title="Score", yaxis2=dict(title="Data Count", overlaying="y", side="right", showgrid=False), height=460, legend=dict(orientation="h", yanchor="bottom", y=0.94, x=0, xanchor="left"), @@ -724,7 +744,7 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame ) st.plotly_chart(fig, use_container_width=True) else: - st.info("No grouped mAP trend entries are available yet.") + st.info("No grouped major metric trend entries are available yet.") section_header("Prediction Trend") diff --git a/evaluation_dashboard_app/pages/6_Workflow.py b/evaluation_dashboard_app/pages/6_Workflow.py index 9b5db30..d32e92a 100644 --- a/evaluation_dashboard_app/pages/6_Workflow.py +++ b/evaluation_dashboard_app/pages/6_Workflow.py @@ -301,6 +301,14 @@ def _make_auto_release_workflow_description(target_name: str) -> str: return f"🚀 release workflow [{clean_target}] [{stamp}]" +def _make_default_release_pilot_auto_version(target_name: str) -> str: + target = str(target_name or "").strip() + match = re.search(r"v?(\d+\.\d+\.\d+)", target) + if match: + return f"Pilot.Auto v{match.group(1)}" + return f"Pilot.Auto {target}" if target else "Pilot.Auto release" + + def _format_run_mtime(mtime: float) -> str: if not mtime: return "—" @@ -1963,6 +1971,16 @@ def _render_start_workflow_form( trend_metadata: Dict[str, object] = {} if release_mode: + release_version_default = _make_default_release_pilot_auto_version(target_name) + release_description_default = f"{target_name} release data update" if target_name else "Release data update" + release_data_count_default = "99,776+" + if not st.session_state.get("workflow_release_pilot_auto_version"): + st.session_state["workflow_release_pilot_auto_version"] = release_version_default + if not st.session_state.get("workflow_release_description"): 
+ st.session_state["workflow_release_description"] = release_description_default + if not st.session_state.get("workflow_release_data_count"): + st.session_state["workflow_release_data_count"] = release_data_count_default + release_cols = st.columns([1.15, 1.1, 0.8]) with release_cols[0]: release_group = st.text_input( @@ -1974,7 +1992,7 @@ def _render_start_workflow_form( with release_cols[1]: pilot_auto_version = st.text_input( "Pilot.Auto version", - value=st.session_state.get("workflow_release_pilot_auto_version", ""), + value=st.session_state.get("workflow_release_pilot_auto_version", release_version_default), key="workflow_release_pilot_auto_version", placeholder='Pilot.Auto v4.4.0 (bevfusion x2/2.5.1)', ).strip() @@ -1989,14 +2007,14 @@ def _render_start_workflow_form( with release_meta_cols[0]: data_count = st.text_input( "Data count", - value=st.session_state.get("workflow_release_data_count", ""), + value=st.session_state.get("workflow_release_data_count", release_data_count_default), key="workflow_release_data_count", placeholder="123,708+", ).strip() with release_meta_cols[1]: release_description = st.text_input( "Release description", - value=st.session_state.get("workflow_release_description", ""), + value=st.session_state.get("workflow_release_description", release_description_default), key="workflow_release_description", ).strip() with release_meta_cols[2]: @@ -2014,9 +2032,29 @@ def _render_start_workflow_form( "date": release_date, "topic_name": release_topic_name, } + existing_job_cols = st.columns(2) + with existing_job_cols[0]: + performance_job_id = st.text_input( + "Existing Performance job ID", + value=st.session_state.get("workflow_release_performance_job_id", ""), + key="workflow_release_performance_job_id", + placeholder="Leave empty to schedule a new Performance job", + help="Use this when the release Performance evaluator job is already scheduled or finished.", + ).strip() + with existing_job_cols[1]: + devops_job_id = st.text_input( + 
"Existing DevOps job ID", + value=st.session_state.get("workflow_release_devops_job_id", ""), + key="workflow_release_devops_job_id", + placeholder="Leave empty to schedule a new DevOps job", + help="Use this when the release DevOps evaluator job is already scheduled or finished.", + ).strip() st.caption( - "Normal detailed-analysis outputs are generated automatically under `performance/` and `devops/`; the release PDF is copied to `specsheet/`." + "Normal detailed-analysis outputs are generated automatically under `performance/` and `devops/`; existing job IDs are waited on if still running and downloaded if already finished." ) + else: + performance_job_id = "" + devops_job_id = "" confirm_cols = st.columns([1.0, 1.0]) with confirm_cols[0]: @@ -2186,6 +2224,8 @@ def _render_start_workflow_form( "eval_recursive": False if release_mode else bool(eval_recursive), "release_mode": bool(release_mode), "trend_metadata": trend_metadata if release_mode else {}, + "performance_job_id": performance_job_id if release_mode else "", + "devops_job_id": devops_job_id if release_mode else "", }, } @@ -2219,6 +2259,8 @@ def _render_workflow_launcher_section( st.session_state["workflow_selected_server_catalog_label"] = "" st.session_state["workflow_catalog_resolution_error"] = "" st.session_state["workflow_last_catalog_selection"] = "" + st.session_state["workflow_release_performance_job_id"] = "" + st.session_state["workflow_release_devops_job_id"] = "" st.session_state["workflow_output_path"] = _make_default_output_path(fresh_target) @st.dialog("Start evaluator workflow", width="large") @@ -2285,8 +2327,10 @@ def _workflow_start_dialog() -> None: "topic": trend_metadata.get("topic_name", "perception.object_recognition.objects"), "performance_catalog_id": _RELEASE_PERFORMANCE_CATALOG_ID, "performance_integration_id": _RELEASE_PERFORMANCE_INTEGRATION_ID, + "performance_job_id": dialog_payload.get("performance_job_id", ""), "devops_catalog_id": _RELEASE_DEVOPS_CATALOG_ID, 
"devops_integration_id": _RELEASE_DEVOPS_INTEGRATION_ID, + "devops_job_id": dialog_payload.get("devops_job_id", ""), "analysis_phase": "perception.object_recognition.tracking.objects", "overwrite": True, }, diff --git a/evaluation_dashboard_app/pages/7_Data_Management.py b/evaluation_dashboard_app/pages/7_Data_Management.py index 050089d..4b45b86 100644 --- a/evaluation_dashboard_app/pages/7_Data_Management.py +++ b/evaluation_dashboard_app/pages/7_Data_Management.py @@ -5,6 +5,7 @@ import io import re +import urllib.parse import zipfile import streamlit as st from pathlib import Path @@ -79,9 +80,10 @@ key="share_run_b", ) mode = "compare" if share_compare and share_run_b else "single" -q = f"mode={mode}&run_a={share_run_a}" +query = {"mode": mode, "run_a": share_run_a} if mode == "compare": - q += f"&run_b={share_run_b}" + query["run_b"] = share_run_b +q = urllib.parse.urlencode(query) st.code(q, language=None) st.caption("Example: `https://your-server:8501/?` + the query above.") diff --git a/evaluation_dashboard_app/worker/tasks.py b/evaluation_dashboard_app/worker/tasks.py index 058e09f..d146fc2 100644 --- a/evaluation_dashboard_app/worker/tasks.py +++ b/evaluation_dashboard_app/worker/tasks.py @@ -1061,6 +1061,8 @@ def _build_release_analysis_artifacts( role: str, output_path: Path, phase: str, + progress_start: float = 48.0, + progress_end: float = 78.0, ) -> Dict[str, Any]: """Create the normal app analysis files for a release job.""" from lib import download_core @@ -1078,11 +1080,22 @@ def _build_release_analysis_artifacts( def _on_progress(msg: str) -> None: append_task_log(task_id, f"{role}: {msg}") + progress_msg = f"{role}: {msg}" + pct = progress_start + match = re.search(r"Downloading\s+(\d+)\s*/\s*(\d+)", msg) + if match: + current = int(match.group(1)) + total = max(1, int(match.group(2))) + pct = progress_start + ((current - 1) / total) * max(0.0, progress_end - progress_start) + elif "Extracting" in msg or "Organizing" in msg: + pct = 
progress_end + update_task_progress(task_id, message=progress_msg, pct=min(progress_end, pct)) def _on_warning(msg: str) -> None: result["warnings"].append(msg) append_task_log(task_id, f"WARNING: {role}: {msg}") + update_task_progress(task_id, message=f"{role}: finding downloadable case logs", pct=progress_start) failure_count, total_attempted, rows = download_core.run_download_results( project_id=project_id, job_id=job_id, @@ -1108,6 +1121,7 @@ def _on_warning(msg: str) -> None: raise RuntimeError(f"{role}: download produced no successful case artifacts.") if eval_summary: + update_task_progress(task_id, message=f"{role}: running eval_result", pct=progress_end) target_dirs = eval_summary.find_eval_result_dirs(str(output_path), recursive=True) statuses = [] for result_dir in target_dirs: @@ -1133,6 +1147,7 @@ def _on_warning(msg: str) -> None: if pkl_archive_to_parquet: try: + update_task_progress(task_id, message=f"{role}: generating parquet", pct=progress_end) result["parquet_path"] = pkl_archive_to_parquet( str(output_path), on_progress=None, @@ -1215,12 +1230,14 @@ def job_run_release_specsheet_workflow(task_id: str, parameters: Dict[str, Any]) "label": "Performance Test", "catalog_id": str(parameters.get("performance_catalog_id") or _RELEASE_PERFORMANCE_CATALOG_ID), "integration_id": str(parameters.get("performance_integration_id") or _RELEASE_PERFORMANCE_INTEGRATION_ID), + "job_id": str(parameters.get("performance_job_id") or "").strip(), }, { "role": "devops", "label": "Devops Test", "catalog_id": str(parameters.get("devops_catalog_id") or _RELEASE_DEVOPS_CATALOG_ID), "integration_id": str(parameters.get("devops_integration_id") or _RELEASE_DEVOPS_INTEGRATION_ID), + "job_id": str(parameters.get("devops_job_id") or "").strip(), }, ] summary: Dict[str, Any] = { @@ -1233,40 +1250,48 @@ def job_run_release_specsheet_workflow(task_id: str, parameters: Dict[str, Any]) "specsheet_pdf": "", } update_task_result_summary(task_id, summary) - 
update_task_progress(task_id, message="Scheduling release evaluator jobs", pct=2) + update_task_progress(task_id, message="Preparing release evaluator jobs", pct=2) for item in jobs: - append_task_log(task_id, f"Scheduling {item['label']}: catalog={item['catalog_id']}") schedule_description = f"{description} | {item['label']}" - result = api.schedule_job( - project_id=project_id, - catalog_id=item["catalog_id"], - integration_id=item["integration_id"], - target_name=target_name, - suite_ids=None, - max_retries=0, - description=schedule_description, - clean_build=True, - debug=False, - release=False, - record_caret=False, - log_expiration_time_in_days=10.0, - is_tag=is_tag, - ) - job_id = str(result.get("job_id") or "").strip() - if not job_id: - raise RuntimeError(f"No job_id returned for {item['label']}.") - item["job_id"] = job_id + item["description"] = schedule_description + job_id = str(item.get("job_id") or "").strip() + if job_id: + append_task_log(task_id, f"Using existing {item['label']}: {job_id}") + status = "existing" + else: + append_task_log(task_id, f"Scheduling {item['label']}: catalog={item['catalog_id']}") + result = api.schedule_job( + project_id=project_id, + catalog_id=item["catalog_id"], + integration_id=item["integration_id"], + target_name=target_name, + suite_ids=None, + max_retries=0, + description=schedule_description, + clean_build=True, + debug=False, + release=False, + record_caret=False, + log_expiration_time_in_days=10.0, + is_tag=is_tag, + ) + job_id = str(result.get("job_id") or "").strip() + if not job_id: + raise RuntimeError(f"No job_id returned for {item['label']}.") + item["job_id"] = job_id + status = "scheduled" report_url = evaluator_api.get_job_report_url(project_id, job_id) summary["evaluator_jobs"][item["role"]] = { "job_id": job_id, "report_url": report_url, "catalog_id": item["catalog_id"], "integration_id": item["integration_id"], - "status": "scheduled", + "status": status, "description": schedule_description, } - 
append_task_log(task_id, f"Scheduled {item['label']}: {job_id}") + if status == "scheduled": + append_task_log(task_id, f"Scheduled {item['label']}: {job_id}") update_task_result_summary(task_id, summary) for idx, item in enumerate(jobs, start=1): @@ -1306,7 +1331,7 @@ def _on_check(status: str, elapsed: float, *, role: str = str(item["role"]), pct update_task_progress(task_id, message="Building normal CSV/parquet analysis artifacts", pct=48) role_paths = {"performance": performance_path, "devops": devops_path} - for item in jobs: + for artifact_idx, item in enumerate(jobs): role = str(item["role"]) analysis_path = role_paths[role] artifact_summary = _build_release_analysis_artifacts( @@ -1316,6 +1341,8 @@ def _on_check(status: str, elapsed: float, *, role: str = str(item["role"]), pct role=role, output_path=analysis_path, phase=analysis_phase, + progress_start=48 + (20 * artifact_idx), + progress_end=64 + (14 * artifact_idx), ) summary["analysis_artifacts"][role] = artifact_summary update_task_result_summary(task_id, summary) @@ -1352,8 +1379,8 @@ def _on_check(status: str, elapsed: float, *, role: str = str(item["role"]), pct "catalog_id": item["catalog_id"], "integration_id": item["integration_id"], "target_name": target_name, - "description": schedule_description, - "title": schedule_description, + "description": str(item.get("description") or ""), + "title": str(item.get("description") or ""), }, "download": { **artifact_summary.get("download", {}), From 18ac1b55f14cd54ee3ca5b4d3df83a85aefdb261 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Tue, 26 May 2026 16:01:20 +0900 Subject: [PATCH 84/94] fix: update startup script to recreate Nginx container after Streamlit - Modified the deployment script to ensure Nginx is recreated after Streamlit starts, preventing stale container IP issues. - Added a command to force-recreate the Nginx container without dependencies, ensuring it mounts the current configuration. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/deploy/04_START.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/evaluation_dashboard_app/deploy/04_START.sh b/evaluation_dashboard_app/deploy/04_START.sh index 3b3814c..451c209 100755 --- a/evaluation_dashboard_app/deploy/04_START.sh +++ b/evaluation_dashboard_app/deploy/04_START.sh @@ -18,3 +18,7 @@ WORKER_SCALE="${EVAL_COMPOSE_SCALE_WORKER:-2}" dc() { docker compose --env-file .env "$@"; } dc up -d --scale "worker=${WORKER_SCALE}" "$@" + +# Nginx resolves Docker service names at startup. Recreate it after Streamlit is +# up so it remounts the current nginx.conf and cannot keep a stale container IP. +dc up -d --no-deps --force-recreate nginx From bad17acf227b551b7fc4f72e07d5a80909c5e79a Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Tue, 26 May 2026 17:43:48 +0900 Subject: [PATCH 85/94] feat: improve progress tracking in release analysis artifacts generation - Enhanced progress calculation for download and evaluation phases, ensuring accurate updates during the artifact generation process. - Added logging for the number of directories processed during evaluation, improving visibility into the workflow. - Updated task progress messages to reflect the current state of processing, providing clearer feedback to users. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/worker/tasks.py | 33 +++++++++++++++++++----- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/evaluation_dashboard_app/worker/tasks.py b/evaluation_dashboard_app/worker/tasks.py index d146fc2..7dd461b 100644 --- a/evaluation_dashboard_app/worker/tasks.py +++ b/evaluation_dashboard_app/worker/tasks.py @@ -1078,6 +1078,10 @@ def _build_release_analysis_artifacts( "warnings": [], } + progress_span = max(0.0, progress_end - progress_start) + download_end = progress_start + progress_span * 0.55 + eval_end = progress_start + progress_span * 0.90 + def _on_progress(msg: str) -> None: append_task_log(task_id, f"{role}: {msg}") progress_msg = f"{role}: {msg}" @@ -1086,10 +1090,10 @@ def _on_progress(msg: str) -> None: if match: current = int(match.group(1)) total = max(1, int(match.group(2))) - pct = progress_start + ((current - 1) / total) * max(0.0, progress_end - progress_start) + pct = progress_start + ((current - 1) / total) * max(0.0, download_end - progress_start) elif "Extracting" in msg or "Organizing" in msg: - pct = progress_end - update_task_progress(task_id, message=progress_msg, pct=min(progress_end, pct)) + pct = download_end + update_task_progress(task_id, message=progress_msg, pct=min(download_end, pct)) def _on_warning(msg: str) -> None: result["warnings"].append(msg) @@ -1121,12 +1125,27 @@ def _on_warning(msg: str) -> None: raise RuntimeError(f"{role}: download produced no successful case artifacts.") if eval_summary: - update_task_progress(task_id, message=f"{role}: running eval_result", pct=progress_end) target_dirs = eval_summary.find_eval_result_dirs(str(output_path), recursive=True) statuses = [] - for result_dir in target_dirs: - statuses.append(eval_summary.run_eval_result_for_dir(result_dir, overwrite=False)) + total = len(target_dirs) + if target_dirs: + append_task_log(task_id, f"{role}: running eval_result for {total} directories") + 
else: + update_task_progress(task_id, message=f"{role}: no eval_result directories found", pct=eval_end) + for i, result_dir in enumerate(target_dirs): + pct = download_end + (i / total) * max(0.0, eval_end - download_end) if total else eval_end + message = f"{role}: eval_result {i + 1}/{total}: {result_dir}" + update_task_progress(task_id, message=message, pct=pct) + append_task_log(task_id, message) + status = eval_summary.run_eval_result_for_dir(result_dir, overwrite=False) + statuses.append(status) + if status.get("status") == "failed": + append_task_log( + task_id, + f"WARNING: {role}: eval_result failed for {result_dir}: {status.get('detail', '')}", + ) if target_dirs: + update_task_progress(task_id, message=f"{role}: generating Summary.csv / Score.csv", pct=eval_end) csv_info = eval_summary.generate_summary_and_score_csv(str(output_path)) result["eval"] = { "directories_processed": len(target_dirs), @@ -1147,7 +1166,7 @@ def _on_warning(msg: str) -> None: if pkl_archive_to_parquet: try: - update_task_progress(task_id, message=f"{role}: generating parquet", pct=progress_end) + update_task_progress(task_id, message=f"{role}: generating parquet", pct=eval_end) result["parquet_path"] = pkl_archive_to_parquet( str(output_path), on_progress=None, From 507efca09451831666b3d905831bdb06500fe1d1 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Thu, 28 May 2026 11:02:09 +0900 Subject: [PATCH 86/94] feat: add specsheet guide and enhance documentation structure - Introduced a new specsheet guide page detailing the specsheet export process, including file structures and integration with the analyzer library. - Updated navigation links across multiple documentation pages to include the new specsheet guide for improved accessibility. - Enhanced existing documentation with clearer descriptions of metrics and processes related to the specsheet, including updates to the data reports and deployment guides. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../docs/guide/data_reports.html | 72 +- .../docs/guide/deployment.html | 1 + .../docs/guide/getting_started.html | 16 +- .../docs/guide/index.html | 20 +- .../docs/guide/pages.html | 11 +- .../docs/guide/specsheet.html | 274 ++++++ .../docs/guide/visual_systems.html | 5 +- .../docs/specsheet_pipeline_explainer.html | 892 +----------------- .../lib/criteria_absolute_gates.py | 17 +- evaluation_dashboard_app/lib/download_core.py | 12 +- evaluation_dashboard_app/lib/eval_summary.py | 87 +- .../lib/overview_pdf_report.py | 88 +- .../lib/perception_eval_result_summarizer.py | 204 ++-- evaluation_dashboard_app/lib/run_loader.py | 25 +- evaluation_dashboard_app/lib/score_schema.py | 155 +++ .../pages/2_Criteria_Based_Score.py | 126 ++- evaluation_dashboard_app/pages/6_Workflow.py | 211 +++-- evaluation_dashboard_app/worker/tasks.py | 52 +- 18 files changed, 1021 insertions(+), 1247 deletions(-) create mode 100644 evaluation_dashboard_app/docs/guide/specsheet.html create mode 100644 evaluation_dashboard_app/lib/score_schema.py diff --git a/evaluation_dashboard_app/docs/guide/data_reports.html b/evaluation_dashboard_app/docs/guide/data_reports.html index 3cb9db6..a9a5d14 100644 --- a/evaluation_dashboard_app/docs/guide/data_reports.html +++ b/evaluation_dashboard_app/docs/guide/data_reports.html @@ -22,6 +22,7 @@

Artifacts

How to Use Pages Data & Reports + Specsheet Deployment Diagrams @@ -79,7 +80,7 @@

Which artifact powers which page?

Score.csv Download -> Eval Results Criteria Based Score - Criteria block metrics including scenario, option, GT object, distance, NM, TP/TN, pass rate, thresholds, and counts. + Criteria block metrics including scenario, optional dataset ID, option, GT object, criteria label, NM, TP/TN, ADD, AIL, UIL, PFN/PFP, Practical Pass Rate, thresholds, and counts. .parquet @@ -98,6 +99,73 @@

Which artifact powers which page?

+
+
+
+
Score.csv Structure
+

Score.csv is the Criteria page source.

+

+ Each row describes one scenario result and then repeats the criteria metric block for each + available criteria range. +

+
+ + + + + + + + + + + + + + + + + + + +
PartFieldsHow the dashboard uses it
Row identityScenario, optional DatasetUsed for scenario filters, scenario leaderboards, compare joins, gates, and PDF tables.
Scenario contextOption, GT_OBJUsed for grouping charts and understanding the matching policy/object class behind the row.
Criteria blockDistance, NM, TP/TN, ADD, AIL, UIL, PFN/PFP, UUID Num, Practical Pass Rate, MAX_DIST_THRESH, OBJ_CNTSUsed by Criteria Based Score for distributions, deltas, absolute gates, and scenario-level ranking.
+
+ Tip: if two rows share a scenario name but have different dataset IDs, the app keeps + them separate in Criteria comparisons and gate summaries. +
+
+
+ +
+
+
+
Pass Metrics
+

TP is not the same metric as Practical Pass Rate.

+
+ + + + + + + + + + + + + + + + +
Dashboard labelSourceCalculation / meaningUsed by
TP, TP meanSummary.csvComes from summarize_ratio() as TP rate. AIL and ADD are not added to this metric.Overview, TP Summary, dashboard PDF TP sections.
pass_rate, Pass rate meanScore.csv Practical Pass Rate(TP/TN + ADD + AIL) / NM * 100. AIL and ADD are pass-side outcomes for this practical score.Criteria Based Score, absolute pass/fail gates, Criteria PDF sections.
+
+ Important: when reviewing pass/fail gates, read “pass rate” as
+
+
+
@@ -142,7 +210,7 @@

Dashboard PDF

Release Specsheet PDF

Advanced release-oriented report generated through perception_catalog_analyzer. It can include trend pages when trend metadata is enabled.

- +

ZIP outputs

diff --git a/evaluation_dashboard_app/docs/guide/deployment.html b/evaluation_dashboard_app/docs/guide/deployment.html index 156d06f..19b9c23 100644 --- a/evaluation_dashboard_app/docs/guide/deployment.html +++ b/evaluation_dashboard_app/docs/guide/deployment.html @@ -23,6 +23,7 @@

Deploy

How to Use Pages Data & Reports + Specsheet Deployment Diagrams
diff --git a/evaluation_dashboard_app/docs/guide/getting_started.html b/evaluation_dashboard_app/docs/guide/getting_started.html index b11f54b..4940f26 100644 --- a/evaluation_dashboard_app/docs/guide/getting_started.html +++ b/evaluation_dashboard_app/docs/guide/getting_started.html @@ -23,6 +23,7 @@

How to Use

How to Use Pages Data & Reports + Specsheet Deployment Diagrams
@@ -40,7 +41,7 @@

Download evaluator results into a run folder.

-
Open DownloadUse pages/6_Download.py from the sidebar.
+
Open Workflow or DownloadUse Evaluator Workflow for the guided path, or pages/6_Download.py for manual tabs.
Select Download ResultsEnter Project ID, Job ID, and optional Suite ID.
Choose Output PathRecommended: data/<test_name>.
Pick Download TypeArchives for full local analysis, Result JSON only for lightweight summary generation.
@@ -54,8 +55,8 @@

Download evaluator results into a run folder.

Decision: Archives or Result JSON only?

    -
  • Archives (ZIP): best for complete local investigation. It downloads and extracts richer source data.
  • -
  • Result JSON only: faster and lighter. Good when you mainly need summary and score generation.
  • +
  • Archives (ZIP): best for complete local investigation, eval_result, parquet generation, and visual inspection.
  • +
  • Result JSON only: faster and lighter. Good when you mainly need downloaded result JSON, not full local analysis.
  • Scenario downloads: use the Download Scenarios tab when TLR Analysis needs scenario data.
@@ -90,6 +91,11 @@

Choose generation mode

If results already exist, generate only Summary/Score. If not, run full eval_result generation.

+
+ Score.csv identity: Criteria pages identify rows by scenario. When a + Dataset field is available, scenario and dataset are treated together so repeated + scenario names from different datasets stay separate. +
Eval Results screen @@ -160,7 +166,7 @@

How to set up compare mode

Where compare mode is most useful

  • TP Summary: TP and kinematic metric deltas.
  • -
  • Criteria Score: pass-rate changes and absolute gate comparison.
  • +
  • Criteria Score: Practical Pass Rate changes and absolute gate comparison.
  • Detection Stats: TP/FP distance-bin and status distribution differences.
  • Bounding Box Viewer: spatial inspection across runs.
  • Prediction Evaluation: ADE/FDE delta matrices and distance bins.
  • @@ -184,7 +190,7 @@

    A good investigation has a rhythm.

    StepPageWhat to look for 1OverviewHigh-level TP, error metrics, filters, and immediate A/B signal. - 2Criteria ScorePass-rate distribution, failing scenarios, absolute gates, and scenario leaderboard. + 2Criteria ScorePractical Pass Rate distribution, failing scenarios, absolute gates, and scenario leaderboard. 3Detection StatsStatus distribution, TP/FP by distance, label/scenario concentration, and object-count shifts. 4Bounding Box ViewerFrame-level spatial causes: missed objects, false positives, geometry, visibility, source/status. 5Prediction EvaluationADE/FDE behavior by label, horizon, distance bin, and polar region. diff --git a/evaluation_dashboard_app/docs/guide/index.html b/evaluation_dashboard_app/docs/guide/index.html index 143db20..154bb61 100644 --- a/evaluation_dashboard_app/docs/guide/index.html +++ b/evaluation_dashboard_app/docs/guide/index.html @@ -3,17 +3,17 @@ - Evaluation Dashboard Engineer Guide + Evaluation Dashboard Guide
    -
    Engineer Documentation
    -

    Evaluation Dashboard
    Engineer Guide

    +
    Dashboard Documentation
    +

    Evaluation Dashboard
    Guide

    - This guide is the entry point for engineers who operate, debug, extend, and deploy the + This guide is the entry point for using, debugging, extending, and deploying the evaluation dashboard. The home page gives the system map; each substantial topic lives in its own focused chapter.

    @@ -40,12 +40,12 @@

    Guide Structure

    @@ -54,7 +54,7 @@

    Guide Structure

    System Role
    -

    The dashboard turns evaluator outputs into explorable engineering evidence.

    +

    The dashboard turns evaluator outputs into explorable review evidence.

    It reads run folders under the configured data root, generates dashboard artifacts when needed, shares selected run state across Streamlit pages, and provides local or production @@ -74,7 +74,7 @@

    The dashboard turns evaluator outputs into explorable engineering evidence.<
    Chapter Map
    -

    Open the chapter that matches the engineering question.

    +

    Open the chapter that matches your question.

    @@ -97,7 +97,7 @@

    How should this be deployed?

    How does the system really flow?

    Artifact maps, Download/Eval sequence, Compare state propagation, T4 camera rendering, T4 Three.js overlays, and report generation.

    - +

    How does specsheet work?

    Focused deep dive for release specsheet generation, trend context, DevOps/pass-rate plots, and generated PDF sections.

    @@ -133,7 +133,7 @@

    The common operational path is still one clear chain.

    - Evaluation Dashboard Engineer Guide + Evaluation Dashboard Guide

    This home page routes to the detailed chapters instead of duplicating them.

    diff --git a/evaluation_dashboard_app/docs/guide/pages.html b/evaluation_dashboard_app/docs/guide/pages.html index d75757c..3f19d17 100644 --- a/evaluation_dashboard_app/docs/guide/pages.html +++ b/evaluation_dashboard_app/docs/guide/pages.html @@ -23,6 +23,7 @@

    Pages

    How to Use Pages Data & Reports + Specsheet Deployment Diagrams
    @@ -71,6 +72,7 @@

    Download

    • Tabs: Download Results, Download Scenarios, View Downloads, Eval Results.
    • Outputs: downloaded archives, result JSON, scenario data, Summary.csv, Score.csv.
    • +
    • Score.csv: contains scenario identity, optional dataset ID, criteria blocks, and Practical Pass Rate.
    • Queue behavior: with USE_TASK_QUEUE=true, heavy tasks run in workers and appear in Recent Tasks.
    @@ -79,6 +81,7 @@

    Evaluator Workflow

    Use when: you want a more guided operational flow for local runs, background tasks, fresh evaluator pipelines, and report reuse.

    • Good for: launching longer evaluator workflows without jumping between many manual steps.
    • +
    • Outputs: downloaded artifacts, optional eval_result, Summary.csv, Score.csv, optional parquet, and report assets.
    • Depends on: evaluator API configuration, task queue for long-running jobs in production.
    @@ -99,7 +102,7 @@

    Data Management

    Metric Analysis Pages
    -

    Turn CSV and parquet artifacts into engineering signals.

    +

    Turn CSV and parquet artifacts into review signals.

    @@ -107,14 +110,14 @@

    Turn CSV and parquet artifacts into engineering signals.

    - + - - + + diff --git a/evaluation_dashboard_app/docs/guide/specsheet.html b/evaluation_dashboard_app/docs/guide/specsheet.html new file mode 100644 index 0000000..3f3fafa --- /dev/null +++ b/evaluation_dashboard_app/docs/guide/specsheet.html @@ -0,0 +1,274 @@ + + + + + + Specsheet Details + + + +
    +
    +
    Specsheet
    +

    Specsheet Details

    +

    + The specsheet export creates a release-oriented PDF from the selected run, optional trend metadata, + and the external perception_catalog_analyzer library. +

    +
    +
    + + + +
    +
    +
    +
    +
    Overview
    +

    What the specsheet export does.

    +

    + The dashboard handles the UI, selected run path, metadata, progress, and local artifact setup. + The analyzer library handles the metric blocks, template rendering, plots, and PDF output. +

    +
    +
    +
    + 1. Select a run + Overview.py gathers project, version, topic, labels, and optional trend metadata. +
    +
    + 2. Prepare files + ensure_specsheet_csvs() creates current.csv and future.csv when needed. +
    +
    + 3. Build blocks + SceneDataFrame.from_dir() and get_blocks() produce abstract and detailed sections. +
    +
    + 4. Add trend context + metadata.yaml and summary.json files are classified and converted into trend rows and plots. +
    +
    + 5. Render PDF + update_template() creates HTML, then specsheet() writes specsheet.pdf. +
    +
    +
    +
    + +
    +
    +
    +
    App and Library
    +

    The integration boundary is mostly in lib/specsheet_report.py.

    +
    +
    PagePrerequisiteMain UseCompare Behavior
    TP Summary Summary.csvTP, RMS/STD, velocity scatter, metric distribution, density, scenario delta ranking.TP rate, RMS/STD, velocity scatter, metric distribution, density, scenario delta ranking. Shows candidate-vs-baseline deltas such as Delta TP and metric shifts.
    Criteria Based Score Score.csvCriteria block selection, pass-rate distribution, group means, box plots, absolute gates.Compares pass-rate changes, gate pass/fail status, and per-scenario deltas.Criteria block selection, Practical Pass Rate distribution, group means, box plots, absolute gates.Compares Practical Pass Rate changes, gate pass/fail status, and per-scenario deltas. Uses Scenario + Dataset when Dataset exists.
    Detection Stats
    + + + + + + + + + + + + + + + + + + + + + + + +
    LayerKey file/moduleResponsibility
    Streamlit UIOverview.pyCollects project identity, version, topic, labels, selected run, trend toggle, and user-facing progress.
    Dashboard wrapperlib/specsheet_report.pyDefines artifact paths, adapts analyzer signatures, discovers trend files, classifies summaries, and prepares plot paths.
    Analyzer libraryperception_catalog_analyzerLoads scene data, generates specsheet metric blocks, renders template HTML, creates plots, and writes the final PDF.
    Local artifactsdata/<run>/...Stores run CSV/parquet files, trend metadata, summary files, generated PNGs, HTML, and specsheet.pdf.
    +
    +
+ +
+
+
+
Files
+

Specsheet input and output files live inside the run folder.

+

+ Trend files can come from a standalone dashboard run or from a grouped release folder. In both cases, + each trend item needs a metadata.yaml file next to its summary.json. +

+
+
+
+

Standalone run shape

+
+
data/my_run/
+
current.csv
+
future.csv
+
resources/
+
metadata.yaml
+
summary.json
+
specsheet/
+
specsheet.html
+
specsheet.pdf
+
+
+
+

Grouped release shape

+
+
data/trend_release_full_usecase_devops/
+
perception.object_recognition.objects/
+
<full_job_id>/metadata.yaml + summary.json
+
<usecase_job_id>/metadata.yaml + summary.json
+
<devops_job_id>/metadata.yaml + summary.json
+
specsheet/
+
map_trend.png
+
devops_trend.png
+
specsheet.pdf
+
+
+
+
+ Note: discover_trend_metadata_files() scans the data root for + metadata/summary pairs, and discover_trend_release_groups() decides how those files + should be grouped for the release PDF. +
+
+
+ +
+
+
+
Trend Data
+

Trend summaries are classified by JSON shape.

+

+ Metadata provides release identity. The summary payload decides whether the item is a full, + usecase, devops, or unknown trend source. +

+
+ + + + + + + + + + + + + + + + + + + +
RoleHow it is recognizedSpecsheet use
Full performancesummary.json has blocks containing 全数データセット評価.Feeds mAP, precision, recall, error, and prediction trend sections.
Usecasesummary.json has blocks containing ユースケース評価.Participates in release grouping and inventory context.
DevOps pass-rateSummary is a nested dictionary without blocks, with category results containing passed and total.Feeds overall pass-rate trend and pass-rate detail plots.
+
+
+ +
+
+
+
PDF Assembly
+

The final PDF is assembled from analyzer HTML plus dashboard trend context.

+
+
+
+

get_blocks()

+

+ Creates abstract and detailed metric fragments for labels, metrics, and evaluation type. + These fragments become the main technical body of the PDF. +

+
+
+

update_template()

+

+ Receives project/version metadata and trend context, then renders the analyzer template body. + Generated PNG paths are included when trend plots exist. +

+
+
+

specsheet()

+

+ Combines body HTML, abstract HTML, and detailed HTML, then writes + specsheet/specsheet.html and specsheet/specsheet.pdf. +

+
+
+

Trend plots

+

+ Full performance trends can generate map_trend.png and + prediction_trend.png. DevOps summaries can generate + devops_trend.png and devops_trend_detail.png. +

+
+
+
+
+ +
+
+
+
Debugging
+

When a specsheet section is missing, check the data contract.

+
+ + + + + + + + + + + + + + + + + + + + + + + + +
SymptomWhat to checkExpected condition
No trend sectionTrend metadata discoveryAt least one metadata.yaml has a sibling summary.json.
Full trend is missingFull summary rolesummary.json has full-performance blocks and generated full trend rows.
Pass Rate Trend is missingDevOps summary and plot filesDevOps rows are non-empty and devops_trend.png exists in the specsheet output folder.
PDF says no dataTemplate contextThe relevant trend list is non-empty before calling update_template().
+
+
quick local verification
+
PYTHONPATH=. python - <<'PY'
+from pathlib import Path
+from lib.specsheet_report import _build_trend_context, discover_trend_metadata_files
+
+ctx = _build_trend_context(discover_trend_metadata_files(), Path("/tmp/specsheet-trend-check"))
+print(len(ctx["performance_trend_data"]), len(ctx["devops_trend_data"]))
+PY
+
+
+
+ + + + + + diff --git a/evaluation_dashboard_app/docs/guide/visual_systems.html b/evaluation_dashboard_app/docs/guide/visual_systems.html index dae1445..384021c 100644 --- a/evaluation_dashboard_app/docs/guide/visual_systems.html +++ b/evaluation_dashboard_app/docs/guide/visual_systems.html @@ -24,6 +24,7 @@

Real Flows

How to Use Pages Data & Reports + Specsheet Deployment Diagrams @@ -538,11 +539,11 @@

Dashboard PDF and specsheet PDF are different engines.

Specsheet is an advanced report path. Most users first use the dashboard pages and dashboard PDF. - For full detail, open the Specsheet Pipeline Explorer. + For full detail, open the Specsheet guide page. diff --git a/evaluation_dashboard_app/docs/specsheet_pipeline_explainer.html b/evaluation_dashboard_app/docs/specsheet_pipeline_explainer.html index 41ed0dc..0fb205f 100644 --- a/evaluation_dashboard_app/docs/specsheet_pipeline_explainer.html +++ b/evaluation_dashboard_app/docs/specsheet_pipeline_explainer.html @@ -3,895 +3,11 @@ - Specsheet Pipeline Explorer - + + Specsheet Details + -
-
-
-
Evaluation Dashboard App
-

Specsheet Pipeline Explorer

-

- A visual engineering guide to how the Streamlit dashboard calls - perception_catalog_analyzer, builds release specsheet PDFs, and turns saved - trend metadata into mAP, prediction, and pass-rate trend sections. -

- -
-
-
specsheet.pdf
-
Overview.py
-
specsheet_report.py
-
metadata + summary
-
analyzer library
-
-
-
- - - -
-
-
-
-
One Button, Many Systems
-

The dashboard is the conductor; the analyzer is the orchestra.

-

- The app owns the Streamlit UI, local run selection, path safety, progress reporting, - trend metadata capture, and compatibility glue. The library owns the domain-heavy work: - SceneDataFrame loading, metric block generation, HTML templating, matplotlib plots, and PDF rendering. -

-
- -
-
- 1. User selects a run - Overview.py gathers project, version, topic, labels, and optional trend metadata. -
-
- 2. Wrapper prepares files - ensure_specsheet_csvs() creates current.csv / future.csv when missing. -
-
- 3. Analyzer builds blocks - SceneDataFrame.from_dir() and get_blocks() produce abstract and detailed HTML. -
-
- 4. Trend context is assembled - Saved metadata.yaml + summary.json files become trend rows and plot PNGs. -
-
- 5. PDF is rendered - update_template() creates the body HTML, then specsheet() writes PDF output. -
-
-
-
- -
-
-
-
Code Boundary
-

How the current app connects to the library.

-

- The integration lives mostly in lib/specsheet_report.py. It imports analyzer functions lazily, - then calls them through compatibility adapters so different analyzer versions can still work. -

-
- -
-
Streamlit UI
Overview.py
-
-
InputsProject ID, version, topic name, labels, run path.
-
Trend toggleValidates YAML and asks for exactly one trend-enabled run.
-
ProgressMaps generator callback messages to the progress bar.
-
-
- -
-
Dashboard Wrapper
lib/specsheet_report.py
-
-
Artifact pathsDefines CSV, parquet, resources, specsheet dir, and PDF paths.
-
CompatibilityAdapts analyzer signatures with inspect.signature().
-
Trend bridgeDiscovers trend files, classifies summaries, and renders trend plots.
-
-
- -
-
Analyzer Library
perception_catalog_analyzer
-
-
DataframeSceneDataFrame.from_dir() reads current/future CSVs.
-
Specsheet blocksget_blocks() emits abstract and detailed HTML sections.
-
Renderingupdate_template() + specsheet() build HTML and PDF.
-
-
- -
-
Local Artifacts
data/...
-
-
Run filescurrent.parquet, future.parquet, generated CSVs.
-
Trend filesmetadata.yaml and summary.json beside each trend job.
-
Outputspecsheet/specsheet.html, PNG plots, and specsheet.pdf.
-
-
-
-
- -
-
-
-
File Geography
-

Two trend folder shapes are supported.

-

- The app can read trend files from standalone dashboard-generated runs and grouped library-style release folders. - The loader only requires sibling metadata.yaml and summary.json; the grouping logic uses folder shape to connect full/usecase/devops jobs into one release. -

-
- -
-
-

Standalone run shape

-
-
data/my_run/
-
current.csv
-
future.csv
-
resources/
-
metadata.yaml
-
summary.json
-
specsheet/specsheet.pdf
-
-
-
-

Trend release shape

-
-
data/trend_release_full_usecase_devops/
-
perception.object_recognition.objects/
-
<full_job_id>/metadata.yaml + summary.json
-
<usecase_job_id>/metadata.yaml + summary.json
-
<devops_job_id>/metadata.yaml + summary.json
-
specsheet/
-
map_trend.png, devops_trend.png, specsheet.pdf
-
-
-
- -
- Important: discover_trend_metadata_files() scans the data root for every - metadata.yaml that has a sibling summary.json. Then - discover_trend_release_groups() decides whether that metadata belongs to a standalone run or a grouped release. -
-
-
- -
-
-
-
Trend Engine
-

Trend data is not one format. It is classified by summary shape.

-

- The metadata provides release identity. The summary payload determines the role: full, usecase, devops, or unknown. - That role decides which charts and sections can be produced. -

-
- -
-
-
metadata.yaml
-
tags: [trend]
-pilot_auto_version: "Pilot.Auto v4.3.0 (centerpoint x2/2.3.1)"
-data_count: "99,776+"
-description: "データの追加"
-date: "2025.11.7"
-
-
-
classification logic
-
# summary has blocks with 全数データセット評価
-role = "full"
-
-# summary has blocks with ユースケース評価
-role = "usecase"
-
-# summary is a non-empty nested dict, no blocks
-role = "devops"
-
-
- -
-
-
F
-

Full performance trend

-

Extracts mAP, precision, recall, error metrics, and prediction metrics from the full summary block. These rows feed map_trend.png and prediction_trend.png.

-
-
-
U
-

Usecase summaries

-

Classified and grouped for inventory views. The current PDF trend context does not render a dedicated usecase trend plot.

-
-
-
D
-

DevOps pass-rate trend

-

Flattens nested pass/fail totals, calculates overall pass rate, and feeds devops_trend.png plus devops_trend_detail.png.

-
-
-
-
- -
-
-
-
Rendering
-

The final PDF is assembled from three streams of HTML.

-

- The analyzer’s template body contributes the cover, executive summary, trend sections, DevOps section, and dataset information. - The analyzer’s block generator contributes abstract and detailed metric sections. -

-
- -
-
-

ensure_specsheet_csvs()

-

Creates current.csv and optional future.csv from parquet or pkl-derived data so the analyzer can load a consistent run directory.

-
-
-

_get_blocks_compat(get_blocks, ...)

-

Calls the analyzer to create abstract and detailed HTML fragments for labels, metrics, and evaluation type.

-
-
-

_update_template_compat(update_template, ...)

-

Passes project/version plus trend context into the analyzer’s Jinja template. Paths are resolved to generated PNGs when they exist.

-
-
-

_specsheet_compat(specsheet, ...)

-

Hands body HTML, abstract HTML, and detailed HTML to the analyzer renderer, which writes specsheet.html and specsheet.pdf.

-
-
-
-
- -
-
-
-
What We Fixed
-

Pass Rate Trend was visible in Trend Insights but missing in the PDF.

-

- Trend Insights already knew how to flatten DevOps summaries. The PDF exporter had the right template fields, but its DevOps trend loader returned an empty list and no pass-rate PNGs were generated. -

-
- -
-
-

Before

-
-
old exporter behavior
-
def load_devops_trend_data(metadata_list):
-    return []
-
-devops_trend_plot_path = output_dir / "devops_trend.png"
-# no plot generation
-
-
-
-

After

-
-
new exporter behavior
-
summary = load_trend_summary_file(summary_path)
-if classify_trend_summary(summary) == "devops":
-    rows = extract_devops_case_rows(summary)
-    overall_pass_rate = sum(passed) / sum(total) * 100
-    generate_devops_trend_plot(...)
-    generate_devops_trend_detail_plot(...)
-
-
-
- -
- Template rule: the analyzer template only renders Pass Rate Trend when - devops_trend_data is non-empty and devops_trend.png exists. It only renders Pass Rate Detail when - devops_trend_detail.png exists too. -
-
-
- -
-
-
-
Debug Playbook
-

When a trend section disappears, check the contract.

-
- -
-
-
1
-

Find metadata

-

Confirm the data root contains metadata.yaml with a sibling summary.json.

-
-
-
2
-

Check role

-

Full summaries need blocks. DevOps summaries should be a nested dict with passed and total.

-
-
-
3
-

Check generated PNGs

-

The PDF template needs map_trend.png, prediction_trend.png, devops_trend.png, and detail PNGs when enabled.

-
-
-
4
-

Check template context

-

show_other_infos gates the executive summary trend pages. Empty trend data means “該当データなし。”

-
-
- -
-
quick local verification
-
PYTHONPATH=. python - <<'PY'
-from pathlib import Path
-from lib.specsheet_report import _build_trend_context, discover_trend_metadata_files
-
-ctx = _build_trend_context(discover_trend_metadata_files(), Path('/tmp/specsheet-trend-check'))
-print(len(ctx['performance_trend_data']), len(ctx['devops_trend_data']))
-PY
-
-
-
-
- -
-
- Specsheet Pipeline Explorer -

- Generated for the evaluation dashboard repository. Key files: - Overview.py, lib/specsheet_report.py, - pages/13_Trend_Insights.py, and perception_catalog_analyzer.template. -

-
-
+

Open Specsheet Details

diff --git a/evaluation_dashboard_app/lib/criteria_absolute_gates.py b/evaluation_dashboard_app/lib/criteria_absolute_gates.py index c23eaa9..2a10770 100644 --- a/evaluation_dashboard_app/lib/criteria_absolute_gates.py +++ b/evaluation_dashboard_app/lib/criteria_absolute_gates.py @@ -11,6 +11,8 @@ import pandas as pd +from lib.score_schema import score_base_cols, score_identity_cols + MetricOp = Literal["<=", ">="] MAX_CRITERIA_DEFAULT = 32 @@ -22,11 +24,11 @@ def infer_criteria_count( max_criteria: int = MAX_CRITERIA_DEFAULT, ) -> int: """ - Number of criteria blocks in a raw Score dataframe (first 3 cols are base). + Number of criteria blocks in a raw Score dataframe. """ if df_raw is None or df_raw.shape[1] < 3: return 1 - n = (df_raw.shape[1] - 3) // block_size + n = (df_raw.shape[1] - len(score_base_cols(df_raw))) // block_size n = max(1, n) return int(min(n, max_criteria)) @@ -65,7 +67,7 @@ def evaluate_scenario_gates( raise ValueError(f"Metric column {metric_gate.column!r} not in df_view") empty_cols = [ - "Scenario", + *score_identity_cols(df_view), "agg_pass_rate", "metric_agg", "scenario_pass", @@ -82,7 +84,12 @@ def evaluate_scenario_gates( d[metric_gate.column] = pd.to_numeric(d[metric_gate.column], errors="coerce") rows: list[dict[str, Any]] = [] - for scen, grp in d.groupby("Scenario", observed=True): + identity_cols = score_identity_cols(d) + for key, grp in d.groupby(identity_cols, observed=True): + if len(identity_cols) == 1: + identity_values = {"Scenario": key[0] if isinstance(key, tuple) else key} + else: + identity_values = dict(zip(identity_cols, key)) rc = len(grp) pr = grp["pass_rate"] mean_pr = float(pr.mean()) @@ -113,7 +120,7 @@ def evaluate_scenario_gates( rows.append( { - "Scenario": scen, + **identity_values, "row_count": rc, "agg_pass_rate": mean_pr, "metric_agg": m_agg, diff --git a/evaluation_dashboard_app/lib/download_core.py b/evaluation_dashboard_app/lib/download_core.py index 9ed5cd8..b2707b9 100644 --- 
a/evaluation_dashboard_app/lib/download_core.py +++ b/evaluation_dashboard_app/lib/download_core.py @@ -667,12 +667,20 @@ def run_download_and_eval( if generate_parquet and result["download_success"] and pkl_archive_to_parquet: if on_progress: on_progress("Generating parquet...") + + def _on_parquet_progress(done: int, total: int) -> None: + if on_progress: + on_progress(f"Parquet: Processing {done}/{total} pkl files") + + def _on_parquet_skip(path: str, reason: str) -> None: + if on_warning: + on_warning(f"Parquet skipped {path}: {reason}") try: parquet_path = pkl_archive_to_parquet( output_path, - on_progress=None, - on_skip=None, + on_progress=_on_parquet_progress, + on_skip=_on_parquet_skip, project_id=project_id, job_id=job_id, ) diff --git a/evaluation_dashboard_app/lib/eval_summary.py b/evaluation_dashboard_app/lib/eval_summary.py index ee89af7..f198d14 100644 --- a/evaluation_dashboard_app/lib/eval_summary.py +++ b/evaluation_dashboard_app/lib/eval_summary.py @@ -8,12 +8,35 @@ import signal import subprocess import sys +import tempfile from pathlib import Path from typing import Any, Dict, List from lib.perception_eval_result_summarizer import run_eval_result, generate_score_json +def _write_text_atomic(path: str, content: str) -> None: + """Write text by replacing the target, so read-only existing files do not block writable dirs.""" + target = Path(path) + tmp_name = "" + try: + with tempfile.NamedTemporaryFile( + "w", + encoding="utf-8", + dir=os.fspath(target.parent), + delete=False, + ) as f: + tmp_name = f.name + f.write(content) + os.replace(tmp_name, target) + finally: + if tmp_name and os.path.exists(tmp_name): + try: + os.unlink(tmp_name) + except OSError: + pass + + def find_eval_result_dirs(root_dir: str, recursive: bool = True) -> List[str]: """Return sorted list of directories under root_dir that contain scenario.yaml and scene_result.pkl.""" if not os.path.isdir(root_dir): @@ -159,6 +182,43 @@ def _infer_suite_name(dir_name: str) -> str: 
return parts[0] return base + def _dataset_id_from_case_dir(case_dir: str) -> str: + """Resolve the real T4 dataset id for Score.csv; blank if unavailable.""" + case_path = Path(case_dir) + metadata_path = case_path / "t4_metadata.json" + if metadata_path.exists(): + try: + with open(metadata_path, "r", encoding="utf-8") as f: + meta = json.load(f) + dataset_id = str(meta.get("t4_dataset_id") or "").strip() + if dataset_id: + return dataset_id + except (OSError, json.JSONDecodeError, TypeError, AttributeError): + pass + + scenario_path = case_path / "scenario.yaml" + if scenario_path.exists(): + try: + import yaml + + with open(scenario_path, "r", encoding="utf-8") as f: + scenario = yaml.safe_load(f) or {} + datasets = scenario.get("Evaluation", {}).get("Datasets", []) + if isinstance(datasets, list): + for item in datasets: + if isinstance(item, dict) and item: + dataset_id = str(next(iter(item.keys())) or "").strip() + if dataset_id: + return dataset_id + elif isinstance(datasets, dict): + dataset_id = str(next(iter(datasets.keys()), "") or "").strip() + if dataset_id: + return dataset_id + except (ImportError, OSError, TypeError, AttributeError): + pass + + return "" + result_folders = glob.glob(os.path.join(input_path, "*/")) result_folders.sort() result_entries: List[Dict[str, str]] = [] @@ -180,6 +240,14 @@ def _infer_suite_name(dir_name: str) -> str: summary_lines: List[str] = [] score_lines: List[str] = [] + score_header = "Scenario, Dataset, Option, GT_OBJ," + for _ in range(4): + score_header += ( + "Distance, NM, TP/TN, ADD, AIL, UIL, PFN/PFP, UUID Num, " + "Practical Pass Rate, MAX_DIST_THRESH,OBJ_CNTS," + ) + score_header += "\n" + for entry in result_entries: folder = entry["path"] suite_name = entry["suite"] @@ -231,7 +299,11 @@ def _infer_suite_name(dir_name: str) -> str: with open(score_json_path, "r", encoding="utf-8") as f: dic = json.load(f) - line = f"{Path(folder).name}," + folder_name = Path(folder).name + dataset_id = 
_dataset_id_from_case_dir(folder) + + line = f"{folder_name}," + line += f"{dataset_id}," line += f"{dic.get('Option', '')}," line += f"{dic.get('criteria0', {}).get('GT_OBJ', '')}," @@ -270,17 +342,14 @@ def _infer_suite_name(dir_name: str) -> str: obj_cnts = v.get("OBJ_CNTS", {}) if isinstance(obj_cnts, dict): - obj_parts = [f"{obj}:{cnt}" for obj, cnt in obj_cnts.items()] - line += ";".join(obj_parts) - if not is_last: - line += "," + obj_parts = [f"{obj}:{cnt};" for obj, cnt in obj_cnts.items()] + line += "".join(obj_parts) + line += "," score_lines.append(line + "\n") - with open(os.path.join(input_path, "Summary.csv"), mode="w", encoding="utf-8") as f: - f.writelines(summary_lines) - with open(os.path.join(input_path, "Score.csv"), mode="w", encoding="utf-8") as f: - f.writelines(score_lines) + _write_text_atomic(os.path.join(input_path, "Summary.csv"), "".join(summary_lines)) + _write_text_atomic(os.path.join(input_path, "Score.csv"), score_header + "".join(score_lines)) return { "summary_path": os.path.join(input_path, "Summary.csv"), diff --git a/evaluation_dashboard_app/lib/overview_pdf_report.py b/evaluation_dashboard_app/lib/overview_pdf_report.py index d3f636f..1547f55 100644 --- a/evaluation_dashboard_app/lib/overview_pdf_report.py +++ b/evaluation_dashboard_app/lib/overview_pdf_report.py @@ -11,7 +11,14 @@ import plotly.express as px import plotly.graph_objects as go -from lib.criteria_absolute_gates import infer_criteria_count +from lib.score_schema import ( + SCORE_BLOCK_SIZE, + SCORE_NUM_COLS, + SCORE_VIEW_METRIC_COLS, + build_score_view, + infer_score_criteria_count, + score_identity_cols, +) from lib.summary_compare import build_summary_delta PRODUCT_LABEL_JA_DEFAULT = { @@ -41,33 +48,9 @@ _COMPARE_RUN_COLORS = ["#312e81", "#0f766e", "#e86a33", "#6b8e23", "#9b59b6", "#1abc9c"] _OVERVIEW_COMPARE_COLORS = ["#31356E", "#008E9B", "#E86A33", "#6B8E23", "#9B59B6", "#1ABC9C"] -_BASE_COLS = ["Scenario", "Option", "GT_OBJ"] -_CRITERIA_COLS = [ - 
"distance", - "nm", - "tp_tn", - "add", - "ail", - "uil", - "pfn_pfp", - "uuid_num", - "pass_rate", - "max_dist_thresh", - "obj_cnts", -] -_NUM_COLS = [ - "distance", - "nm", - "tp_tn", - "add", - "ail", - "uil", - "pfn_pfp", - "uuid_num", - "pass_rate", - "max_dist_thresh", -] -_BLOCK_SIZE = len(_CRITERIA_COLS) +_CRITERIA_COLS = SCORE_VIEW_METRIC_COLS +_NUM_COLS = SCORE_NUM_COLS +_BLOCK_SIZE = SCORE_BLOCK_SIZE _DEFAULT_MAX_EVAL_RANGE = 50 _DISTANCE_BIN_CASE = """CASE WHEN dist_h < 10 THEN '[0,10)' @@ -476,7 +459,7 @@ def _build_criteria_section(run_records: Sequence[dict], run_labels: Sequence[st "fallback_note": "Criteria section skipped because Score.csv is missing.", } - criteria_count = min(infer_criteria_count(rec["score"], _BLOCK_SIZE) for _, rec in score_runs) + criteria_count = min(infer_score_criteria_count(rec["score"]) for _, rec in score_runs) if criteria_count <= 0: return { "summary": "Score.csv was loaded, but no criteria blocks were detected.", @@ -922,22 +905,26 @@ def _build_criteria_default_compare_figures(views: Sequence[Tuple[str, pd.DataFr def _build_criteria_single_table(df_view: pd.DataFrame) -> List[List[str]]: - scenario_metric = df_view.groupby("Scenario", as_index=False)["pass_rate"].mean().sort_values("pass_rate", ascending=False).head(20) - rows = [["Scenario", "Pass rate mean"]] + key_cols = score_identity_cols(df_view) + scenario_metric = df_view.groupby(key_cols, as_index=False)["pass_rate"].mean().sort_values("pass_rate", ascending=False).head(20) + rows = [key_cols + ["Pass rate mean"]] for _, row in scenario_metric.iterrows(): - rows.append([_shorten_scenario_name(str(row["Scenario"])), _fmt_number(row["pass_rate"])]) - return {"rows": rows, "col_width_weights": [0.72, 0.28]} + rows.append([_shorten_scenario_name(str(row[c])) for c in key_cols] + [_fmt_number(row["pass_rate"])]) + first_w = 0.56 if len(key_cols) > 1 else 0.72 + rest_w = (1.0 - first_w) / len(key_cols) + return {"rows": rows, "col_width_weights": [first_w] + 
[rest_w] * len(key_cols)} def _build_criteria_compare_table(views: Sequence[Tuple[str, pd.DataFrame]]) -> List[List[str]]: labels = [lbl for lbl, _ in views] + key_cols = score_identity_cols(views[0][1]) merges = [] for lbl, df in views: - g = df.groupby("Scenario", as_index=False)["pass_rate"].mean() + g = df.groupby(key_cols, as_index=False)["pass_rate"].mean() merges.append(g.rename(columns={"pass_rate": f"pr_{lbl}"})) per_scenario = merges[0] for g in merges[1:]: - per_scenario = per_scenario.merge(g, on="Scenario", how="inner") + per_scenario = per_scenario.merge(g, on=key_cols, how="inner") base = labels[0] delta_cols: List[str] = [] for cand in labels[1:]: @@ -946,17 +933,17 @@ def _build_criteria_compare_table(views: Sequence[Tuple[str, pd.DataFrame]]) -> delta_cols.append(dcol) rank_key = per_scenario[delta_cols].abs().max(axis=1) per_scenario = per_scenario.reindex(rank_key.sort_values(ascending=False).index).head(20) - header: List[str] = ["Scenario", f"Pass rate ({base})"] + header: List[str] = key_cols + [f"Pass rate ({base})"] for cand in labels[1:]: header.extend([f"Pass rate ({cand})", f"Δ({cand} - {base})"]) rows = [header] for _, row in per_scenario.iterrows(): - cells: List[str] = [_shorten_scenario_name(str(row["Scenario"])), _fmt_number(row[f"pr_{base}"])] + cells: List[str] = [_shorten_scenario_name(str(row[c])) for c in key_cols] + [_fmt_number(row[f"pr_{base}"])] for cand in labels[1:]: cells.extend([_fmt_number(row[f"pr_{cand}"]), _fmt_number(row[f"delta_{cand}"])]) rows.append(cells) ncols = len(header) - scen_w = 0.34 if ncols > 4 else 0.52 + scen_w = 0.28 if ncols > 5 else 0.44 rest_w = (1.0 - scen_w) / max(ncols - 1, 1) weights = [scen_w] + [rest_w] * (ncols - 1) return {"rows": rows, "col_width_weights": weights} @@ -967,13 +954,14 @@ def _build_criteria_compare_delta_figure(views: Sequence[Tuple[str, pd.DataFrame return None labels = [lbl for lbl, _ in views] base = labels[0] + key_cols = score_identity_cols(views[0][1]) merges = [] 
for lbl, df in views: - g = df.groupby("Scenario", as_index=False)["pass_rate"].mean() + g = df.groupby(key_cols, as_index=False)["pass_rate"].mean() merges.append(g.rename(columns={"pass_rate": f"pr_{lbl}"})) per_scenario = merges[0] for g in merges[1:]: - per_scenario = per_scenario.merge(g, on="Scenario", how="inner") + per_scenario = per_scenario.merge(g, on=key_cols, how="inner") if per_scenario.empty: return None long_rows: List[dict] = [] @@ -984,9 +972,14 @@ def _build_criteria_compare_delta_figure(views: Sequence[Tuple[str, pd.DataFrame delta_cols.append(dcol) rank_key = per_scenario[delta_cols].abs().max(axis=1) vis = per_scenario.reindex(rank_key.sort_values(ascending=False).index).head(20) - scen_order = [_shorten_scenario_name(str(s)) for s in vis["Scenario"].tolist()] + if "Dataset" in key_cols: + scenario_labels = vis["Scenario"].astype(str) + " [" + vis["Dataset"].astype(str) + "]" + else: + scenario_labels = vis["Scenario"].astype(str) + scen_order = [_shorten_scenario_name(str(s)) for s in scenario_labels.tolist()] for _, row in vis.iterrows(): - scen_disp = _shorten_scenario_name(str(row["Scenario"])) + scen_raw = f"{row['Scenario']} [{row['Dataset']}]" if "Dataset" in key_cols else row["Scenario"] + scen_disp = _shorten_scenario_name(str(scen_raw)) for cand in labels[1:]: long_rows.append( { @@ -1890,16 +1883,7 @@ def _make_text_placeholder_figure(text: str) -> go.Figure: def _build_score_view(df_raw: pd.DataFrame, criteria_idx: int) -> pd.DataFrame: - start = 3 + criteria_idx * _BLOCK_SIZE - end = start + _BLOCK_SIZE - df_view = df_raw.iloc[:, :3].copy() - df_view.columns = _BASE_COLS - block = df_raw.iloc[:, start:end].copy() - block.columns = _CRITERIA_COLS - df_view = pd.concat([df_view, block], axis=1) - for column in _NUM_COLS: - df_view[column] = pd.to_numeric(df_view[column], errors="coerce") - return df_view + return build_score_view(df_raw, criteria_idx) def _create_eval_flat_view(con: duckdb.DuckDBPyConnection, parquet_path: str, 
view_name: str) -> None: diff --git a/evaluation_dashboard_app/lib/perception_eval_result_summarizer.py b/evaluation_dashboard_app/lib/perception_eval_result_summarizer.py index 770fd43..6083498 100644 --- a/evaluation_dashboard_app/lib/perception_eval_result_summarizer.py +++ b/evaluation_dashboard_app/lib/perception_eval_result_summarizer.py @@ -498,86 +498,146 @@ def calc_score_single(df, result_directory): return {} found_gt, pos, prev_frame, uuid_list, obj_idx = False, [], -1, [], 0 res, obj_group, criteria_max_dist = get_option_and_object_group(result_directory) + + frame_data = {} for i in range(total_row_num): - if ( - isnull(df.loc[(i, "ground_truth"), "timestamp"]) - # or df.loc[(i, "ground_truth"), "frame"] == prev_frame - ): - continue + frame_num = df.loc[(i, "estimation"), "frame"] + if isnull(frame_num): + frame_num = df.loc[(i, "ground_truth"), "frame"] - if df.loc[(i, "ground_truth"), "frame"] == prev_frame: - obj_idx += 1 - else: - obj_idx = 0 - - prev_frame = df.loc[(i, "ground_truth"), "frame"] - act_x = df.loc[(i, "ground_truth"), "x"] - act_y = df.loc[(i, "ground_truth"), "y"] - act_dist = math.sqrt(act_x**2 + act_y**2) - act_vx = df.loc[(i, "ground_truth"), "vx"] - act_vy = df.loc[(i, "ground_truth"), "vy"] - # act_vel = math.sqrt(act_vx**2 + act_vy**2) - point = {"x": -act_y, "y": act_x, "dist": act_dist, "vx": -act_vx, "vy": act_vy} - - if act_dist < criteria_max_dist[0]: - key = "criteria0" - dist_err_torelance = 2 - elif act_dist < criteria_max_dist[1]: - key = "criteria1" - dist_err_torelance = 3 - elif act_dist < criteria_max_dist[2]: - key = "criteria2" - dist_err_torelance = 5 - elif act_dist < criteria_max_dist[3]: - key = "criteria3" - dist_err_torelance = 5 - else: - raise ValueError("act_dist is out of range") - - act_label = df.loc[(i, "ground_truth"), "label"] - if not found_gt: - found_gt = True - res["criteria0"]["GT_OBJ"] = df.loc[(i, "ground_truth"), "label"] - res["criteria1"]["GT_OBJ"] = df.loc[(i, "ground_truth"), 
"label"] - res["criteria2"]["GT_OBJ"] = df.loc[(i, "ground_truth"), "label"] + if frame_num not in frame_data: + frame_data[frame_num] = {"ground_truth": [], "estimation": []} if not isnull(df.loc[(i, "estimation"), "timestamp"]): - est_label = df.loc[(i, "estimation"), "label"] - if act_label != "false_positive": - est_x = df.loc[(i, "estimation"), "x"] - est_y = df.loc[(i, "estimation"), "y"] - diff_dist = math.sqrt((act_x - est_x) ** 2 + (act_y - est_y) ** 2) - est_uuid = df.loc[(i, "estimation"), "uuid"] - if est_uuid not in uuid_list: - uuid_list.append(est_uuid) - # print("param:", df.loc[(i, "estimation"), "timestamp"], act_x, act_y, act_label, est_x, est_y, est_label, diff_dist) + frame_data[frame_num]["estimation"].append( + { + "index": i, + "x": df.loc[(i, "estimation"), "x"], + "y": df.loc[(i, "estimation"), "y"], + "label": df.loc[(i, "estimation"), "label"], + "uuid": df.loc[(i, "estimation"), "uuid"], + "timestamp": df.loc[(i, "estimation"), "timestamp"], + } + ) - if act_label == est_label: - if diff_dist < dist_err_torelance: - status = "TP/TN" + if not isnull(df.loc[(i, "ground_truth"), "timestamp"]): + frame_data[frame_num]["ground_truth"].append( + { + "index": i, + "x": df.loc[(i, "ground_truth"), "x"], + "y": df.loc[(i, "ground_truth"), "y"], + "label": df.loc[(i, "ground_truth"), "label"], + "vx": df.loc[(i, "ground_truth"), "vx"], + "vy": df.loc[(i, "ground_truth"), "vy"], + "frame": df.loc[(i, "ground_truth"), "frame"], + } + ) + + for frame_num in sorted(frame_data.keys()): + frame_gt_list = frame_data[frame_num]["ground_truth"] + frame_est_list = frame_data[frame_num]["estimation"] + + for gt_obj in frame_gt_list: + i = gt_obj["index"] + act_uuid = df.loc[(i, "ground_truth"), "uuid"] + est_uuid = "" + prev_frame = df.loc[(i, "ground_truth"), "frame"] + + act_x = gt_obj["x"] + act_y = gt_obj["y"] + act_dist = math.sqrt(act_x**2 + act_y**2) + act_vx = gt_obj["vx"] + act_vy = gt_obj["vy"] + point = {"x": -act_y, "y": act_x, "dist": act_dist, 
"vx": -act_vx, "vy": act_vy} + + if act_dist < criteria_max_dist[0]: + key = "criteria0" + dist_err_torelance = 2 + elif act_dist < criteria_max_dist[1]: + key = "criteria1" + dist_err_torelance = 3 + elif act_dist < criteria_max_dist[2]: + key = "criteria2" + dist_err_torelance = 5 + elif act_dist < criteria_max_dist[3]: + key = "criteria3" + dist_err_torelance = 5 + else: + raise ValueError("act_dist is out of range") + + act_label = gt_obj["label"] + if not found_gt: + found_gt = True + res["criteria0"]["GT_OBJ"] = act_label + res["criteria1"]["GT_OBJ"] = act_label + res["criteria2"]["GT_OBJ"] = act_label + + if not isnull(df.loc[(i, "estimation"), "timestamp"]): + est_label = df.loc[(i, "estimation"), "label"] + est_uuid = df.loc[(i, "estimation"), "uuid"] + if act_label != "false_positive": + est_x = df.loc[(i, "estimation"), "x"] + est_y = df.loc[(i, "estimation"), "y"] + diff_dist = math.sqrt((act_x - est_x) ** 2 + (act_y - est_y) ** 2) + + if est_uuid not in uuid_list: + uuid_list.append(est_uuid) + + if act_label == est_label: + if diff_dist < dist_err_torelance: + status = "TP/TN" + else: + status = "ADD" + elif est_label in obj_group[act_label]: + status = "AIL" else: - status = "ADD" - elif est_label in obj_group[act_label]: - status = "AIL" + status = "UIL" else: - status = "UIL" - else: - status = "PFN/PFP" - res[key]["OBJ_CNTS"].setdefault(est_label, 0) - res[key]["OBJ_CNTS"][est_label] += 1 - else: - if act_label != "false_positive": - status = "PFN/PFP" + status = "PFN/PFP" + res[key]["OBJ_CNTS"].setdefault(est_label, 0) + res[key]["OBJ_CNTS"][est_label] += 1 else: - status = "TP/TN" - res[key][status] += 1 - res[key]["NM"] += 1 - res[key]["UUID_NUM"] = len(uuid_list) - point["status"] = status - point["uuid_num"] = len(uuid_list) - if obj_idx == len(pos): - pos.append([]) - pos[obj_idx].append(point) + if act_label != "false_positive": + closest_dist = float("inf") + closest_est = None + + for est_obj in frame_est_list: + diff_dist = 
math.sqrt((act_x - est_obj["x"]) ** 2 + (act_y - est_obj["y"]) ** 2) + if diff_dist < closest_dist: + closest_dist = diff_dist + closest_est = est_obj + + if closest_est is not None and closest_dist < 1.0: + est_label = closest_est["label"] + est_uuid = closest_est["uuid"] + + if est_uuid is not None and est_uuid not in uuid_list: + uuid_list.append(est_uuid) + + if act_label == est_label: + if closest_dist < dist_err_torelance: + status = "TP/TN" + else: + status = "ADD" + elif est_label in obj_group[act_label]: + status = "AIL" + else: + status = "UIL" + + res[key]["OBJ_CNTS"].setdefault(est_label, 0) + res[key]["OBJ_CNTS"][est_label] += 1 + else: + status = "PFN/PFP" + else: + status = "TP/TN" + + res[key][status] += 1 + res[key]["NM"] += 1 + res[key]["UUID_NUM"] = len(uuid_list) + point["status"] = status + point["act_uuid"] = act_uuid + point["est_uuid"] = est_uuid + pos.append(point) with open(result_directory + "score.json", "w") as file: file.write(json.dumps(res, indent=4)) diff --git a/evaluation_dashboard_app/lib/run_loader.py b/evaluation_dashboard_app/lib/run_loader.py index d240511..fc99168 100644 --- a/evaluation_dashboard_app/lib/run_loader.py +++ b/evaluation_dashboard_app/lib/run_loader.py @@ -1,5 +1,6 @@ from pathlib import Path import pandas as pd +from lib.score_schema import read_score_csv SUMMARY_DTYPES = { "id": "string", @@ -29,17 +30,7 @@ def load_run(run_dir: Path): if not summary_path.exists(): if _has_parquet_files(run_dir): # Parquet-only run: allow load for Detection Stats and Bounding Box Viewer - score = pd.read_csv( - score_path, - header=None, - engine="python", - names=[ - "Scenario", "Option", "GT_OBJ", "Distance0", "NM0", "TP/TN0", "ADD0", "AIL0", "UIL0", "PFN/PFP0", "UUID Num0", "Practical Pass Rate0", "MAX_DIST_THRESH0", "OBJ_CNTS0", - "Distance1", "NM1", "TP/TN1", "ADD1", "AIL1", "UIL1", "PFN/PFP1", "UUID Num1", "Practical Pass Rate1", "MAX_DIST_THRESH1", "OBJ_CNTS1", - "Distance2", "NM2", "TP/TN2", "ADD2", "AIL2", "UIL2", 
"PFN/PFP2", "UUID Num2", "Practical Pass Rate2", "MAX_DIST_THRESH2", "OBJ_CNTS2", - "Distance3", "NM3", "TP/TN3", "ADD3", "AIL3", "UIL3", "PFN/PFP3", "UUID Num3", "Practical Pass Rate3", "MAX_DIST_THRESH3", "OBJ_CNTS3", - ] - ) if score_path.exists() else None + score = read_score_csv(score_path) return { "path": run_dir, "summary": None, @@ -60,17 +51,7 @@ def load_run(run_dir: Path): if col not in summary.columns: summary[col] = pd.Series([""] * len(summary), dtype="string") - score = pd.read_csv( - score_path, - header=None, - engine="python", - names=[ - "Scenario", "Option", "GT_OBJ", "Distance0", "NM0", "TP/TN0", "ADD0", "AIL0", "UIL0", "PFN/PFP0", "UUID Num0", "Practical Pass Rate0", "MAX_DIST_THRESH0", "OBJ_CNTS0", - "Distance1", "NM1", "TP/TN1", "ADD1", "AIL1", "UIL1", "PFN/PFP1", "UUID Num1", "Practical Pass Rate1", "MAX_DIST_THRESH1", "OBJ_CNTS1", - "Distance2", "NM2", "TP/TN2", "ADD2", "AIL2", "UIL2", "PFN/PFP2", "UUID Num2", "Practical Pass Rate2", "MAX_DIST_THRESH2", "OBJ_CNTS2", - "Distance3", "NM3", "TP/TN3", "ADD3", "AIL3", "UIL3", "PFN/PFP3", "UUID Num3", "Practical Pass Rate3", "MAX_DIST_THRESH3", "OBJ_CNTS3", - ] - ) if score_path.exists() else None + score = read_score_csv(score_path) return { "path": run_dir, diff --git a/evaluation_dashboard_app/lib/score_schema.py b/evaluation_dashboard_app/lib/score_schema.py new file mode 100644 index 0000000..5aef313 --- /dev/null +++ b/evaluation_dashboard_app/lib/score_schema.py @@ -0,0 +1,155 @@ +from __future__ import annotations + +from pathlib import Path + +import pandas as pd + +SCORE_BASE_COLS = ["Scenario", "Option", "GT_OBJ"] +SCORE_BASE_COLS_WITH_DATASET = ["Scenario", "Dataset", "Option", "GT_OBJ"] + +SCORE_SOURCE_METRIC_COLS = [ + "Distance", + "NM", + "TP/TN", + "ADD", + "AIL", + "UIL", + "PFN/PFP", + "UUID Num", + "Practical Pass Rate", + "MAX_DIST_THRESH", + "OBJ_CNTS", +] + +SCORE_VIEW_METRIC_COLS = [ + "distance", + "nm", + "tp_tn", + "add", + "ail", + "uil", + "pfn_pfp", + "uuid_num", 
+ "pass_rate", + "max_dist_thresh", + "obj_cnts", +] + +SCORE_NUM_COLS = [ + "nm", + "tp_tn", + "add", + "ail", + "uil", + "pfn_pfp", + "uuid_num", + "pass_rate", + "max_dist_thresh", +] + +SCORE_BLOCK_SIZE = len(SCORE_VIEW_METRIC_COLS) + + +def _looks_like_header(row: pd.Series) -> bool: + first = str(row.iloc[0]).strip() if len(row) else "" + return first == "Scenario" + + +def _looks_like_criteria_cell(value: object) -> bool: + text = str(value).strip() + return text.startswith("criteria") + + +def _drop_extra_empty_trailing_columns(df: pd.DataFrame, base_count: int) -> pd.DataFrame: + while ( + df.shape[1] + and df.iloc[:, -1].isna().all() + and (df.shape[1] - base_count) % SCORE_BLOCK_SIZE != 0 + ): + df = df.iloc[:, :-1] + return df + + +def _infer_base_count(df: pd.DataFrame, header_row: pd.Series | None) -> int: + if header_row is not None: + header_values = [str(x).strip() for x in header_row.tolist()] + if len(header_values) >= 4 and header_values[1] == "Dataset": + return 4 + return 3 + + if df.empty: + return 3 + first = df.iloc[0] + if len(first) > 4 and _looks_like_criteria_cell(first.iloc[4]): + return 4 + if len(first) > 3 and _looks_like_criteria_cell(first.iloc[3]): + return 3 + + ncols = df.shape[1] + if ncols >= 4 and (ncols - 4) % SCORE_BLOCK_SIZE == 0: + return 4 + return 3 + + +def score_raw_columns(has_dataset: bool, criteria_count: int) -> list[str]: + cols = list(SCORE_BASE_COLS_WITH_DATASET if has_dataset else SCORE_BASE_COLS) + for i in range(criteria_count): + cols.extend(f"{name}{i}" for name in SCORE_SOURCE_METRIC_COLS) + return cols + + +def read_score_csv(score_path: Path) -> pd.DataFrame | None: + if not score_path.exists(): + return None + + raw = pd.read_csv(score_path, header=None, engine="python") + if raw.empty: + return raw + + header_row = raw.iloc[0] if _looks_like_header(raw.iloc[0]) else None + if header_row is not None: + raw = raw.iloc[1:].reset_index(drop=True) + + base_count = _infer_base_count(raw, header_row) + raw 
= _drop_extra_empty_trailing_columns(raw, base_count) + criteria_count = max(1, (raw.shape[1] - base_count) // SCORE_BLOCK_SIZE) + expected_cols = base_count + criteria_count * SCORE_BLOCK_SIZE + raw = raw.iloc[:, :expected_cols].copy() + raw.columns = score_raw_columns(base_count == 4, criteria_count) + return raw.reset_index(drop=True) + + +def score_base_cols(df_raw: pd.DataFrame) -> list[str]: + if df_raw is not None and "Dataset" in df_raw.columns: + return list(SCORE_BASE_COLS_WITH_DATASET) + return list(SCORE_BASE_COLS) + + +def infer_score_criteria_count( + df_raw: pd.DataFrame, + max_criteria: int = 32, +) -> int: + if df_raw is None or df_raw.empty: + return 1 + base_count = len(score_base_cols(df_raw)) + n = (df_raw.shape[1] - base_count) // SCORE_BLOCK_SIZE + n = max(1, n) + return int(min(n, max_criteria)) + + +def build_score_view(df_raw: pd.DataFrame, criteria_idx: int) -> pd.DataFrame: + base_cols = score_base_cols(df_raw) + start = len(base_cols) + criteria_idx * SCORE_BLOCK_SIZE + end = start + SCORE_BLOCK_SIZE + + df_view = df_raw.loc[:, base_cols].copy() + block = df_raw.iloc[:, start:end].copy() + block.columns = SCORE_VIEW_METRIC_COLS + df_view = pd.concat([df_view, block], axis=1) + for column in SCORE_NUM_COLS: + df_view[column] = pd.to_numeric(df_view[column], errors="coerce") + return df_view + + +def score_identity_cols(df: pd.DataFrame) -> list[str]: + return ["Scenario", "Dataset"] if df is not None and "Dataset" in df.columns else ["Scenario"] diff --git a/evaluation_dashboard_app/pages/2_Criteria_Based_Score.py b/evaluation_dashboard_app/pages/2_Criteria_Based_Score.py index ba7b553..5229f46 100644 --- a/evaluation_dashboard_app/pages/2_Criteria_Based_Score.py +++ b/evaluation_dashboard_app/pages/2_Criteria_Based_Score.py @@ -25,7 +25,15 @@ export_gate_result, failing_scenarios_table, gate_summary, - infer_criteria_count, +) +from lib.score_schema import ( + SCORE_BLOCK_SIZE, + SCORE_NUM_COLS, + SCORE_VIEW_METRIC_COLS, + 
build_score_view, + infer_score_criteria_count, + score_base_cols, + score_identity_cols, ) st.set_page_config( @@ -112,7 +120,15 @@ def _filter_df_view_by_perception_labels( allowed = set(s["id"].unique()) if not allowed: return df_view.iloc[0:0].copy() - return df_view.loc[df_view["Scenario"].astype(str).isin(allowed)].copy() + + scenario_key = df_view["Scenario"].astype(str) + mask = scenario_key.isin(allowed) + if "Dataset" in df_view.columns: + # Older generated Score.csv files stored the final scenario suffix in Dataset, + # while Summary.csv kept the full id. Keep matching those files too. + composite_key = scenario_key + "_" + df_view["Dataset"].astype(str) + mask = mask | composite_key.isin(allowed) + return df_view.loc[mask].copy() def _filter_df_view_by_scenarios(df_view: pd.DataFrame, selected_scenarios: list) -> pd.DataFrame: @@ -193,54 +209,14 @@ def _apply_gate_data_filters( # Constants # ========================= -BASE_COLS = ["Scenario", "Option", "GT_OBJ"] - -CRITERIA_COLS = [ - "distance", - "nm", - "tp_tn", - "add", - "ail", - "uil", - "pfn_pfp", - "uuid_num", - "pass_rate", - "max_dist_thresh", - "obj_cnts", -] - -BLOCK_COLS = [ - "distance", - "nm", - "tp_tn", - "add", - "ail", - "uil", - "pfn_pfp", - "uuid_num", - "pass_rate", - "max_dist_thresh", - "obj_cnts", -] - -BLOCK_SIZE = len(CRITERIA_COLS) - -NUM_COLS = [ - "distance", - "nm", - "tp_tn", - "add", - "ail", - "uil", - "pfn_pfp", - "uuid_num", - "pass_rate", - "max_dist_thresh", -] - -_criteria_n_a = infer_criteria_count(df_raw_A, BLOCK_SIZE) +BASE_COLS = score_base_cols(df_raw_A) +CRITERIA_COLS = SCORE_VIEW_METRIC_COLS +BLOCK_SIZE = SCORE_BLOCK_SIZE +NUM_COLS = SCORE_NUM_COLS + +_criteria_n_a = infer_score_criteria_count(df_raw_A) if mode == "Compare Mode" and compare_runs: - CRITERIA_COUNT = min(infer_criteria_count(r["score"], BLOCK_SIZE) for r in compare_runs) + CRITERIA_COUNT = min(infer_score_criteria_count(r["score"]) for r in compare_runs) else: CRITERIA_COUNT = _criteria_n_a @@ 
-255,19 +231,16 @@ def _apply_gate_data_filters( def build_view(df_raw, criteria_idx): - start = 3 + criteria_idx * BLOCK_SIZE - end = start + BLOCK_SIZE + return build_score_view(df_raw, criteria_idx) - df_view = df_raw.iloc[:, :3].copy() - df_view.columns = BASE_COLS - block = df_raw.iloc[:, start:end].copy() - block.columns = BLOCK_COLS - - df_view = pd.concat([df_view, block], axis=1) - for c in NUM_COLS: - df_view[c] = pd.to_numeric(df_view[c], errors="coerce") - return df_view +def _add_scenario_display(df: pd.DataFrame) -> pd.DataFrame: + d = df.copy() + if "Dataset" in d.columns: + d["ScenarioDisplay"] = d["Scenario"].astype(str) + " [" + d["Dataset"].astype(str) + "]" + else: + d["ScenarioDisplay"] = d["Scenario"].astype(str) + return d st.sidebar.divider() @@ -457,13 +430,14 @@ def _gate_compare_overlap_stats(result_a: pd.DataFrame, result_b: pd.DataFrame) """Classify scenarios on inner join (same Scenario id in both gate tables).""" if result_a is None or result_b is None or result_a.empty or result_b.empty: return None - a = result_a[["Scenario", "scenario_pass"]].copy() - b = result_b[["Scenario", "scenario_pass"]].copy() + key_cols = [c for c in score_identity_cols(result_a) if c in result_b.columns] + a = result_a[key_cols + ["scenario_pass"]].copy() + b = result_b[key_cols + ["scenario_pass"]].copy() a["pass_a"] = a["scenario_pass"].map(bool) b["pass_b"] = b["scenario_pass"].map(bool) outer = a.drop(columns=["scenario_pass"]).merge( b.drop(columns=["scenario_pass"]), - on="Scenario", + on=key_cols, how="outer", indicator=True, ) @@ -504,7 +478,10 @@ def _overlap_scenario_lists(merged: pd.DataFrame) -> dict[str, list[str]]: "a_fail_b_pass": [], "a_pass_b_fail": [], } - scen = merged["Scenario"].astype(str) + if "Dataset" in merged.columns: + scen = merged["Scenario"].astype(str) + " [" + merged["Dataset"].astype(str) + "]" + else: + scen = merged["Scenario"].astype(str) pa = merged["pass_a"].map(bool) pb = merged["pass_b"].map(bool) return { @@ 
-1089,14 +1066,16 @@ def _render_absolute_gates_section( "Per-scenario pass rate", "Scenarios present in every run (inner join) — filter to focus on regressions or wins.", ) + scenario_key_cols = score_identity_cols(df_views[0]) merges = [] for i, lbl in enumerate(cl): - g = df_views[i].groupby("Scenario", as_index=False)["pass_rate"].mean() + g = df_views[i].groupby(scenario_key_cols, as_index=False)["pass_rate"].mean() g = g.rename(columns={"pass_rate": f"pr_{lbl}"}) merges.append(g) per_scenario = merges[0] for g in merges[1:]: - per_scenario = per_scenario.merge(g, on="Scenario", how="inner") + per_scenario = per_scenario.merge(g, on=scenario_key_cols, how="inner") + per_scenario = _add_scenario_display(per_scenario) pr_base = f"pr_{cl[0]}" delta_col = f"delta_{focus_cand}" for lbl in cand_only: @@ -1138,7 +1117,7 @@ def _render_absolute_gates_section( elif filter_method == "Custom contains string": search = st.text_input("Show scenarios with name containing (case-insensitive):", "") per_scenario_vis = ( - per_scenario[per_scenario["Scenario"].str.contains(search, case=False, na=False)] + per_scenario[per_scenario["ScenarioDisplay"].str.contains(search, case=False, na=False)] if search else per_scenario ) @@ -1149,7 +1128,7 @@ def _render_absolute_gates_section( col_to_run = {f"pr_{lbl}": run_names[i] for i, lbl in enumerate(cl)} per_scenario_vis_long = pd.melt( per_scenario_vis, - id_vars=["Scenario"], + id_vars=scenario_key_cols + ["ScenarioDisplay"], value_vars=pr_cols_melt, var_name="_k", value_name="pass_rate", @@ -1159,7 +1138,7 @@ def _render_absolute_gates_section( fig = px.bar( per_scenario_vis_long, - x="Scenario", + x="ScenarioDisplay", y="pass_rate", color="Run", color_discrete_map=_px_map, @@ -1175,7 +1154,7 @@ def _render_absolute_gates_section( ) fig2 = px.bar( per_scenario_vis.reindex(per_scenario_vis[delta_col].abs().sort_values(ascending=False).index), - x="Scenario", + x="ScenarioDisplay", y=delta_col, color=delta_col, 
color_continuous_scale="RdYlGn", @@ -1184,7 +1163,7 @@ def _render_absolute_gates_section( _plotly_apply_theme(fig2, "Pass rate delta by scenario") st.plotly_chart(fig2, width="stretch") - table_cols = ["Scenario"] + pr_cols_melt + [f"delta_{lbl}" for lbl in cand_only] + table_cols = scenario_key_cols + pr_cols_melt + [f"delta_{lbl}" for lbl in cand_only] table_cols = [c for c in table_cols if c in per_scenario_vis.columns] with st.expander("Show Table: Per Scenario Pass Rates and Deltas"): st.dataframe(per_scenario_vis[table_cols], width="stretch") @@ -1199,7 +1178,7 @@ def _render_absolute_gates_section( per_scenario_vis, x=pr_base, y=f"pr_{focus_cand}", - text="Scenario", + text="ScenarioDisplay", labels={ pr_base: f"Baseline ({cl[0]}) Pass Rate", f"pr_{focus_cand}": f"Candidate ({focus_cand}) Pass Rate", @@ -1380,7 +1359,8 @@ def _render_absolute_gates_section( st.plotly_chart(fig, width="stretch") section_header("Scenario leaderboard", "Mean pass rate per scenario — tune N and sort direction.") - scenario_metric = df_view.groupby("Scenario", as_index=False)["pass_rate"].mean() + scenario_key_cols = score_identity_cols(df_view) + scenario_metric = df_view.groupby(scenario_key_cols, as_index=False)["pass_rate"].mean() top_n = st.number_input("Top N scenarios", min_value=5, max_value=100, value=20, key="single_top_n") sort_order = st.radio("Order", ["Highest first", "Lowest first"], horizontal=True, key="single_scen_order") scenario_metric = scenario_metric.sort_values( diff --git a/evaluation_dashboard_app/pages/6_Workflow.py b/evaluation_dashboard_app/pages/6_Workflow.py index d32e92a..99ebe08 100644 --- a/evaluation_dashboard_app/pages/6_Workflow.py +++ b/evaluation_dashboard_app/pages/6_Workflow.py @@ -2246,8 +2246,7 @@ def _render_workflow_launcher_section( use_container_width=False, ) - if new_job_clicked and callable(getattr(st, "dialog", None)): - st.session_state["workflow_start_dialog_open"] = True + def _reset_start_workflow_state() -> None: 
fresh_target = str(get_config_value("target_name", "beta/v4.3.2") or "beta/v4.3.2") st.session_state["workflow_catalog_name"] = "" st.session_state["workflow_last_catalog_preset"] = "" @@ -2263,109 +2262,131 @@ def _render_workflow_launcher_section( st.session_state["workflow_release_devops_job_id"] = "" st.session_state["workflow_output_path"] = _make_default_output_path(fresh_target) - @st.dialog("Start evaluator workflow", width="large") - def _workflow_start_dialog() -> None: - st.caption("This is the full launcher for creating a new evaluator job, downloading results, and optionally running eval/parquet.") - payload = _render_start_workflow_form(catalog_presets, catalogs_path, catalog_load_error) - submit_cols = st.columns([1.15, 1.15, 3.7]) - close_clicked = submit_cols[0].button("Close", key="workflow_close_start_dialog", use_container_width=True) - start_clicked = submit_cols[1].button("Start workflow", key="workflow_start_btn_dialog", type="primary", use_container_width=True) - if close_clicked: - st.session_state["workflow_start_dialog_open"] = False - st.rerun() - if start_clicked: - dialog_payload = dict(payload.get("dialog_payload") or {}) - errors = dialog_payload.get("errors", []) - if errors: - for err in errors: - st.error(f"Missing or invalid: {err}") - elif not is_task_queue_enabled(): - st.error("Task queue not enabled. 
Set `USE_TASK_QUEUE=true` and `REDIS_URL`.") - else: - common_params = { - "project_id": dialog_payload["project_id"], - "suite_ids": None, - "target_name": dialog_payload["target_name"], - "environment": dialog_payload["environment"], - "max_retries": 0, - "clean_build": False, - "debug": False, - "release": False, - "record_caret": False, - "log_expiration_time_in_days": 14.0, - "is_tag": dialog_payload["is_tag"], - "download_type": "archives" if dialog_payload["download_type"] == "Archives (ZIP)" else "result_json", - "phase": dialog_payload["phase"], - "skip_large_file": bool(dialog_payload.get("skip_large_file", True)), - "large_file_mb": 50.0, - "keep_zip_files": False, - "poll_interval": dialog_payload["poll_interval"], - "max_wait_seconds": dialog_payload["max_wait_hours"] * 3600, - "run_eval": dialog_payload["run_eval"], - "generate_parquet": dialog_payload["generate_parquet"], - "eval_recursive": dialog_payload["eval_recursive"], - "eval_overwrite": False, - } - if dialog_payload.get("release_mode"): - base_description = dialog_payload["description"] or _make_auto_release_workflow_description( - dialog_payload["target_name"] - ) - trend_metadata = dict(dialog_payload.get("trend_metadata") or {}) - task_id = _enqueue_task( - "run_release_specsheet_workflow", - { - "project_id": dialog_payload["project_id"], - "target_name": dialog_payload["target_name"], - "description": base_description, - "output_path": dialog_payload["resolved_output"], - "environment": dialog_payload["environment"], - "is_tag": dialog_payload["is_tag"], - "poll_interval": dialog_payload["poll_interval"], - "max_wait_seconds": dialog_payload["max_wait_hours"] * 3600, - "trend_metadata": trend_metadata, - "version": trend_metadata.get("pilot_auto_version", ""), - "topic": trend_metadata.get("topic_name", "perception.object_recognition.objects"), - "performance_catalog_id": _RELEASE_PERFORMANCE_CATALOG_ID, - "performance_integration_id": _RELEASE_PERFORMANCE_INTEGRATION_ID, - 
"performance_job_id": dialog_payload.get("performance_job_id", ""), - "devops_catalog_id": _RELEASE_DEVOPS_CATALOG_ID, - "devops_integration_id": _RELEASE_DEVOPS_INTEGRATION_ID, - "devops_job_id": dialog_payload.get("devops_job_id", ""), - "analysis_phase": "perception.object_recognition.tracking.objects", - "overwrite": True, - }, - ) - if task_id: - st.session_state["workflow_start_dialog_open"] = False - st.success(f"Release specsheet workflow queued. Task id: `{task_id}`") - st.rerun() - else: - st.error("Failed to enqueue release specsheet workflow. Check worker logs.") - return - + def _render_start_workflow_controls(*, key_suffix: str = "dialog") -> None: + st.caption("This is the full launcher for creating a new evaluator job, downloading results, and optionally running eval/parquet.") + payload = _render_start_workflow_form(catalog_presets, catalogs_path, catalog_load_error) + submit_cols = st.columns([1.15, 1.15, 3.7]) + close_clicked = submit_cols[0].button( + "Close", + key=f"workflow_close_start_{key_suffix}", + use_container_width=True, + ) + start_clicked = submit_cols[1].button( + "Start workflow", + key=f"workflow_start_btn_{key_suffix}", + type="primary", + use_container_width=True, + ) + if close_clicked: + st.session_state["workflow_start_dialog_open"] = False + st.rerun() + if start_clicked: + dialog_payload = dict(payload.get("dialog_payload") or {}) + errors = dialog_payload.get("errors", []) + if errors: + for err in errors: + st.error(f"Missing or invalid: {err}") + elif not is_task_queue_enabled(): + st.error("Task queue not enabled. 
Set `USE_TASK_QUEUE=true` and `REDIS_URL`.") + else: + common_params = { + "project_id": dialog_payload["project_id"], + "suite_ids": None, + "target_name": dialog_payload["target_name"], + "environment": dialog_payload["environment"], + "max_retries": 0, + "clean_build": False, + "debug": False, + "release": False, + "record_caret": False, + "log_expiration_time_in_days": 14.0, + "is_tag": dialog_payload["is_tag"], + "download_type": "archives" if dialog_payload["download_type"] == "Archives (ZIP)" else "result_json", + "phase": dialog_payload["phase"], + "skip_large_file": bool(dialog_payload.get("skip_large_file", True)), + "large_file_mb": 50.0, + "keep_zip_files": False, + "poll_interval": dialog_payload["poll_interval"], + "max_wait_seconds": dialog_payload["max_wait_hours"] * 3600, + "run_eval": dialog_payload["run_eval"], + "generate_parquet": dialog_payload["generate_parquet"], + "eval_recursive": dialog_payload["eval_recursive"], + "eval_overwrite": False, + } + if dialog_payload.get("release_mode"): + base_description = dialog_payload["description"] or _make_auto_release_workflow_description( + dialog_payload["target_name"] + ) + trend_metadata = dict(dialog_payload.get("trend_metadata") or {}) task_id = _enqueue_task( - "run_evaluator_and_process", + "run_release_specsheet_workflow", { - **common_params, - "catalog_id": dialog_payload["catalog_id"], - "integration_id": dialog_payload["integration_id"], - "catalog_preset_name": dialog_payload.get("catalog_preset_name", ""), - "description": dialog_payload["description"] or _make_auto_workflow_description( - dialog_payload["target_name"], - dialog_payload.get("catalog_preset_name", ""), - has_custom_catalog=bool(dialog_payload.get("has_custom_catalog", False)), - ), + "project_id": dialog_payload["project_id"], + "target_name": dialog_payload["target_name"], + "description": base_description, "output_path": dialog_payload["resolved_output"], + "environment": dialog_payload["environment"], + "is_tag": 
dialog_payload["is_tag"], + "poll_interval": dialog_payload["poll_interval"], + "max_wait_seconds": dialog_payload["max_wait_hours"] * 3600, + "trend_metadata": trend_metadata, + "version": trend_metadata.get("pilot_auto_version", ""), + "topic": trend_metadata.get("topic_name", "perception.object_recognition.objects"), + "performance_catalog_id": _RELEASE_PERFORMANCE_CATALOG_ID, + "performance_integration_id": _RELEASE_PERFORMANCE_INTEGRATION_ID, + "performance_job_id": dialog_payload.get("performance_job_id", ""), + "devops_catalog_id": _RELEASE_DEVOPS_CATALOG_ID, + "devops_integration_id": _RELEASE_DEVOPS_INTEGRATION_ID, + "devops_job_id": dialog_payload.get("devops_job_id", ""), + "analysis_phase": "perception.object_recognition.tracking.objects", + "overwrite": True, }, ) if task_id: st.session_state["workflow_start_dialog_open"] = False - st.success(f"Workflow queued. Task id: `{task_id}`") + st.success(f"Release specsheet workflow queued. Task id: `{task_id}`") st.rerun() else: - st.error("Failed to enqueue task. Check worker logs.") + st.error("Failed to enqueue release specsheet workflow. Check worker logs.") + return + + task_id = _enqueue_task( + "run_evaluator_and_process", + { + **common_params, + "catalog_id": dialog_payload["catalog_id"], + "integration_id": dialog_payload["integration_id"], + "catalog_preset_name": dialog_payload.get("catalog_preset_name", ""), + "description": dialog_payload["description"] or _make_auto_workflow_description( + dialog_payload["target_name"], + dialog_payload.get("catalog_preset_name", ""), + has_custom_catalog=bool(dialog_payload.get("has_custom_catalog", False)), + ), + "output_path": dialog_payload["resolved_output"], + }, + ) + if task_id: + st.session_state["workflow_start_dialog_open"] = False + st.success(f"Workflow queued. Task id: `{task_id}`") + st.rerun() + else: + st.error("Failed to enqueue task. 
Check worker logs.") - _workflow_start_dialog() + if new_job_clicked: + st.session_state["workflow_start_dialog_open"] = True + _reset_start_workflow_state() + + if st.session_state.get("workflow_start_dialog_open"): + if callable(getattr(st, "dialog", None)): + @st.dialog("Start evaluator workflow", width="large") + def _workflow_start_dialog() -> None: + _render_start_workflow_controls(key_suffix="dialog") + + _workflow_start_dialog() + else: + st.markdown("---") + st.subheader("Start evaluator workflow") + _render_start_workflow_controls(key_suffix="inline") return start_defaults diff --git a/evaluation_dashboard_app/worker/tasks.py b/evaluation_dashboard_app/worker/tasks.py index 7dd461b..8546f75 100644 --- a/evaluation_dashboard_app/worker/tasks.py +++ b/evaluation_dashboard_app/worker/tasks.py @@ -51,6 +51,26 @@ def _import_catalog_io(): return None +def _parquet_progress_callback( + task_id: str, + *, + prefix: str = "Parquet", + pct_start: float = 0.0, + pct_end: float = 100.0, +): + """Return a pkl-file progress callback for pkl_archive_to_parquet.""" + + def _on_progress(done: int, total: int) -> None: + total_safe = max(1, int(total or 0)) + done_safe = min(max(0, int(done or 0)), total_safe) + pct = pct_start + (done_safe / total_safe) * max(0.0, pct_end - pct_start) + message = f"{prefix}: processing pkl files {done_safe}/{total_safe}" + update_task_progress(task_id, message=message, pct=min(pct_end, pct)) + append_task_log(task_id, message) + + return _on_progress + + def _copy_task_parameters(parameters: Dict[str, Any]) -> Dict[str, Any]: copied: Dict[str, Any] = {} for key, value in (parameters or {}).items(): @@ -349,15 +369,17 @@ def job_build_parquet(task_id: str, parameters: Dict[str, Any]) -> None: update_task_status(task_id, "failed", error_message="Missing pkl_dir") return append_task_log(task_id, f"Building parquet from {pkl_dir}") + update_task_progress(task_id, message=f"Parquet: scanning pkl files in {pkl_dir}", pct=0) project_id = 
parameters.get("project_id") job_id = parameters.get("job_id") parquet_path = pkl_archive_to_parquet( pkl_dir, - on_progress=None, - on_skip=None, + on_progress=_parquet_progress_callback(task_id, pct_start=5, pct_end=95), + on_skip=lambda path, reason: append_task_log(task_id, f"Parquet skipped {path}: {reason}"), project_id=project_id, job_id=job_id, ) + update_task_progress(task_id, message="Parquet: writing output complete", pct=100) update_task_result_summary(task_id, {"job": "build_parquet", "output_path": parquet_path}) _update_run_metadata( task_id, @@ -1169,11 +1191,20 @@ def _on_warning(msg: str) -> None: update_task_progress(task_id, message=f"{role}: generating parquet", pct=eval_end) result["parquet_path"] = pkl_archive_to_parquet( str(output_path), - on_progress=None, - on_skip=None, + on_progress=_parquet_progress_callback( + task_id, + prefix=f"{role}: parquet", + pct_start=eval_end, + pct_end=99, + ), + on_skip=lambda path, reason: append_task_log( + task_id, + f"WARNING: {role}: parquet skipped {path}: {reason}", + ), project_id=project_id, job_id=job_id, ) or "" + update_task_progress(task_id, message=f"{role}: parquet generated", pct=99) except Exception as exc: warning = f"Parquet generation failed: {exc}" result["warnings"].append(warning) @@ -2094,11 +2125,20 @@ def on_eval_progress(status: str, elapsed: float) -> None: try: parquet_path = pkl_archive_to_parquet( output_path, - on_progress=None, - on_skip=None, + on_progress=_parquet_progress_callback( + task_id, + prefix="Parquet", + pct_start=90, + pct_end=99, + ), + on_skip=lambda path, reason: append_task_log( + task_id, + f"Parquet skipped {path}: {reason}", + ), project_id=project_id, job_id=job_id, ) + update_task_progress(task_id, message="Parquet generated", pct=99) append_task_log(task_id, f"Parquet generated: {parquet_path}") except Exception as e: append_task_log(task_id, f"Parquet generation failed: {e}") From 2f1886200b385eb125ca544221c6500ebcfb7e9e Mon Sep 17 00:00:00 2001 
From: "lei.gu" Date: Fri, 29 May 2026 10:31:02 +0900 Subject: [PATCH 87/94] feat: enhance evaluation and download processes with concurrency and detail options - Added support for configurable evaluation worker threads to improve performance during evaluation tasks. - Introduced an option to include detailed logs and results in the recent tasks listing, enhancing visibility into task outcomes. - Refactored the evaluation logic to utilize concurrent processing, optimizing the handling of multiple directories. - Improved progress tracking and user feedback during evaluation and download phases, ensuring clearer communication of task status. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/lib/db.py | 9 +- evaluation_dashboard_app/lib/download_core.py | 60 +++++- .../lib/specsheet_report.py | 14 +- .../lib/ui/recent_evaluator_jobs.py | 11 + .../lib/ui/task_history.py | 4 +- evaluation_dashboard_app/pages/6_Download.py | 79 +++---- evaluation_dashboard_app/worker/tasks.py | 201 +++++++++++++++--- 7 files changed, 289 insertions(+), 89 deletions(-) diff --git a/evaluation_dashboard_app/lib/db.py b/evaluation_dashboard_app/lib/db.py index c941970..a678f08 100644 --- a/evaluation_dashboard_app/lib/db.py +++ b/evaluation_dashboard_app/lib/db.py @@ -442,12 +442,14 @@ def list_recent_tasks( offset: int = 0, session_id: Optional[str] = None, since_days: Optional[int] = None, + include_details: bool = False, ) -> List[Dict[str, Any]]: """Return recent tasks (newest first). If ``session_id`` is set, only that user's tasks. If ``since_days`` is set, only tasks with ``created_at`` within that many calendar days (from DB ``NOW()``). ``limit`` still caps row count. + ``include_details`` includes heavy log/result payloads; task list cards do not need them. 
""" url = get_database_url() if not url: @@ -461,7 +463,12 @@ def list_recent_tasks( conn = psycopg2.connect(url) try: with conn.cursor(cursor_factory=RealDictCursor) as cur: - cols = "id, type, status, parameters, result_path, error_message, progress_message, progress_pct, log_output, result_summary, rq_job_id, created_at, updated_at" + cols = ( + "id, type, status, parameters, result_path, error_message, " + "progress_message, progress_pct, rq_job_id, created_at, updated_at" + ) + if include_details: + cols += ", log_output, result_summary" conditions: List[str] = [] params: List[Any] = [] if session_id is not None: diff --git a/evaluation_dashboard_app/lib/download_core.py b/evaluation_dashboard_app/lib/download_core.py index b2707b9..4c59985 100644 --- a/evaluation_dashboard_app/lib/download_core.py +++ b/evaluation_dashboard_app/lib/download_core.py @@ -9,6 +9,7 @@ import os import shutil import urllib.parse +from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path from collections import Counter from typing import Any, Callable, Dict, List, Optional @@ -24,6 +25,19 @@ API_BASE_URL = "https://evaluation.ci.web.auto/v3" +def _compact_eval_path(path: Any, *, parts: int = 2) -> str: + """Return a readable tail path for progress/log messages.""" + text = str(path or "").strip() + if not text: + return "unknown" + try: + p = Path(text) + tail = p.parts[-parts:] + return "/".join(tail) if tail else text + except Exception: + return text + + def _make_evaluator_session(environment: str = DEFAULT_ENVIRONMENT): """Build authenticated session for evaluation.ci.web.auto API (no Streamlit).""" os.environ["AUTH_PROFILE"] = environment @@ -548,6 +562,7 @@ def run_download_and_eval( generate_parquet: bool = True, eval_recursive: bool = True, eval_overwrite: bool = False, + eval_workers: int = 4, on_progress: Optional[Callable[[str], None]] = None, on_warning: Optional[Callable[[str], None]] = None, ) -> Dict[str, Any]: @@ -629,13 +644,46 @@ 
def run_download_and_eval( if target_dirs: total = len(target_dirs) eval_statuses: List[Dict[str, Any]] = [] - for i, result_dir in enumerate(target_dirs): - if on_progress: - on_progress(f"Eval: Processing {i+1}/{total}: {result_dir}") - status = eval_summary.run_eval_result_for_dir(result_dir, overwrite=eval_overwrite) + try: + requested_workers = int(eval_workers or 1) + except (TypeError, ValueError): + requested_workers = 1 + workers = max(1, min(requested_workers, total)) + if on_progress: + on_progress(f"Eval: completed 0/{total} dirs") + + def _record_status(status: Dict[str, Any], done: int, fallback_path: str) -> None: eval_statuses.append(status) - if status.get("status") == "failed" and on_warning: - on_warning(f"Eval failed for {result_dir}: {status.get('detail', '')}") + state = str(status.get("status") or "failed") + short_path = _compact_eval_path(status.get("path") or fallback_path) + if on_progress: + on_progress(f"Eval: completed {done}/{total} dirs - {state}: {short_path}") + if state == "failed" and on_warning: + on_warning(f"Eval failed for {status.get('path', '')}: {status.get('detail', '')}") + + if workers == 1: + for i, result_dir in enumerate(target_dirs): + status = eval_summary.run_eval_result_for_dir(result_dir, overwrite=eval_overwrite) + _record_status(status, i + 1, result_dir) + else: + if on_progress: + on_progress(f"Eval: running {total} dirs with {workers} worker(s)") + with ThreadPoolExecutor(max_workers=workers) as executor: + future_map = { + executor.submit( + eval_summary.run_eval_result_for_dir, + result_dir, + overwrite=eval_overwrite, + ): result_dir + for result_dir in target_dirs + } + for done, future in enumerate(as_completed(future_map), start=1): + result_dir = future_map[future] + try: + status = future.result() + except Exception as exc: + status = {"path": result_dir, "status": "failed", "detail": str(exc)} + _record_status(status, done, result_dir) # Generate summary CSVs csv_info = 
eval_summary.generate_summary_and_score_csv(eval_root) diff --git a/evaluation_dashboard_app/lib/specsheet_report.py b/evaluation_dashboard_app/lib/specsheet_report.py index ff82318..48dbc1c 100644 --- a/evaluation_dashboard_app/lib/specsheet_report.py +++ b/evaluation_dashboard_app/lib/specsheet_report.py @@ -47,6 +47,7 @@ """ _TREND_DATE_PATTERN = re.compile(r"^\d{4}\.\d{1,2}\.\d{1,2}$") _TREND_DATA_COUNT_PATTERN = re.compile(r"^\d[\d,]*\+?$") +_PILOT_AUTO_PREFIX_PATTERN = re.compile(r"^Pilot\.Auto\s+", re.IGNORECASE) @dataclass @@ -214,13 +215,21 @@ def parse_trend_metadata_text(text: str) -> dict[str, Any]: "description": description, "date": date, } - for optional_key in ("release_group", "topic_name"): + for optional_key in ("release_group", "topic_name", "version_abbr"): optional_value = str(raw.get(optional_key) or "").strip() if optional_value: parsed[optional_key] = optional_value return parsed +def _trend_version_abbr(metadata: dict[str, Any]) -> str: + explicit = str(metadata.get("version_abbr") or "").strip() + if explicit: + return explicit + version = str(metadata.get("pilot_auto_version") or "").strip() + return _PILOT_AUTO_PREFIX_PATTERN.sub("", version).strip() or version + + def write_trend_metadata(run_dir: str | Path, metadata: dict[str, Any]) -> Path: paths = get_specsheet_artifact_paths(run_dir) resource_dir = paths["resource_dir"] @@ -592,6 +601,7 @@ def load_performance_trend_data(metadata_list: Sequence[Path]) -> list[dict[str, trend_data_rows.append( { "version": metadata.get("pilot_auto_version"), + "version_abbr": _trend_version_abbr(metadata), "data_count": metadata.get("data_count"), "description": metadata.get("description"), "date": metadata.get("date"), @@ -621,6 +631,7 @@ def _avg(metric_name: str) -> float: output.append( { "version": row.get("version"), + "version_abbr": row.get("version_abbr"), "data_count": row.get("data_count"), "description": row.get("description"), "date": row.get("date"), @@ -657,6 +668,7 @@ def 
load_devops_trend_data(metadata_list: Sequence[Path]) -> list[dict[str, Any] trend_data_rows.append( { "version": metadata.get("pilot_auto_version"), + "version_abbr": _trend_version_abbr(metadata), "data_count": metadata.get("data_count"), "description": metadata.get("description"), "date": metadata.get("date"), diff --git a/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py b/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py index 732015a..933f040 100644 --- a/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py +++ b/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py @@ -23,6 +23,15 @@ _ENQUEUE_TASK: Callable[[str, Dict[str, Any]], Optional[str]] = lambda task_type, params: None CATALOG_IO_AVAILABLE = False ENVIRONMENT = "default" +_DEFAULT_EVAL_WORKERS = 4 + + +def _default_eval_workers() -> int: + try: + workers = int(os.environ.get("EVAL_WORKERS_DEFAULT", _DEFAULT_EVAL_WORKERS)) + except (TypeError, ValueError): + workers = _DEFAULT_EVAL_WORKERS + return max(1, min(workers, 16)) def configure_recent_evaluator_jobs_ui(*, get_config_value: Callable[[str, Any], Any], set_config_value: Callable[[str, Any], None], enqueue_task: Callable[[str, Dict[str, Any]], Optional[str]], catalog_io_available: bool, environment: str = "default") -> None: @@ -1403,6 +1412,7 @@ def _render_recent_evaluator_job_run_dialog( "generate_parquet": generate_parquet, "eval_recursive": eval_recursive, "eval_overwrite": False, + "eval_workers": _default_eval_workers(), } task_id = _enqueue_task("download_and_eval", params) if not task_id: @@ -1612,6 +1622,7 @@ def _render_recent_evaluator_job_retest_dialog( "generate_parquet": generate_parquet, "eval_recursive": eval_recursive, "eval_overwrite": False, + "eval_workers": _default_eval_workers(), }, ) if not task_id: diff --git a/evaluation_dashboard_app/lib/ui/task_history.py b/evaluation_dashboard_app/lib/ui/task_history.py index 4471f8a..e5d05f5 100644 --- a/evaluation_dashboard_app/lib/ui/task_history.py +++ 
b/evaluation_dashboard_app/lib/ui/task_history.py @@ -256,9 +256,7 @@ def render_task_list(tasks: List[Dict[str, Any]], current_user: Optional[str]) - if use_dialog and st.session_state.get("_task_detail_id"): task_id = st.session_state["_task_detail_id"] try: - detail_task = next((x for x in tasks if str(x.get("id")) == task_id), None) - if detail_task is None: - detail_task = get_task(task_id) + detail_task = get_task(task_id) if detail_task: @st.dialog("Task details", width="large") diff --git a/evaluation_dashboard_app/pages/6_Download.py b/evaluation_dashboard_app/pages/6_Download.py index ad44b35..2222722 100644 --- a/evaluation_dashboard_app/pages/6_Download.py +++ b/evaluation_dashboard_app/pages/6_Download.py @@ -111,6 +111,17 @@ def _parse_rq_timeout_sec(raw: Optional[str], *, default: int, minimum: int) -> else: _BUILD_PARQUET_JOB_TIMEOUT_SEC = _RQ_DEFAULT_JOB_TIMEOUT_SEC +_DEFAULT_EVAL_WORKERS = 4 + + +def _default_eval_workers() -> int: + try: + workers = int(os.environ.get("EVAL_WORKERS_DEFAULT", _DEFAULT_EVAL_WORKERS)) + except (TypeError, ValueError): + workers = _DEFAULT_EVAL_WORKERS + return max(1, min(workers, 16)) + + _APP_ROOT = Path(__file__).resolve().parents[1] _CATALOGS_FILENAME = "catalogs.json" _LEGACY_CATALOGS_PATH = Path("/home/leigu/EvaluatorRunnerUITest/catalogs.json") @@ -2089,6 +2100,7 @@ def _render_recent_evaluator_job_run_dialog( "generate_parquet": generate_parquet, "eval_recursive": eval_recursive, "eval_overwrite": False, + "eval_workers": _default_eval_workers(), } task_id = _enqueue_task("download_and_eval", params) if not task_id: @@ -2935,6 +2947,7 @@ def on_suite_id_change(): "generate_parquet": combined_generate_parquet, "eval_recursive": combined_eval_recursive, "eval_overwrite": False, + "eval_workers": _default_eval_workers(), } task_id = _enqueue_task("download_and_eval", params) if task_id: @@ -3281,7 +3294,7 @@ def inline_progress(msg: str): st.error(f"Failed to save: {e}") st.exception(e) - col1, col2, col3 = 
st.columns(3) + col1, col2 = st.columns(2) with col1: eval_recursive = st.checkbox( "Search subdirectories", @@ -3294,28 +3307,10 @@ def inline_progress(msg: str): value=get_config_value("eval_overwrite", False), help="If unchecked, directories with result.txt will be skipped", ) - with col3: - eval_parallel = st.checkbox( - "Run in parallel", - value=get_config_value("eval_parallel", False), - help="Temporarily disabled. Parallel execution currently provides no measurable benefit.", - disabled=True - ) - if eval_parallel: - eval_workers = st.number_input( - "Eval worker threads", - min_value=1, - max_value=16, - value=get_config_value("eval_workers", 1), - help="Number of parallel threads used to run eval_result", - ) - set_config_value("eval_workers", eval_workers) - else: - eval_workers = 1 - set_config_value("eval_workers", eval_workers) + eval_workers = _default_eval_workers() + set_config_value("eval_workers", eval_workers) set_config_value("eval_recursive", eval_recursive) set_config_value("eval_overwrite", eval_overwrite) - set_config_value("eval_parallel", eval_parallel) # New option: Only generate summary/score csv only_generate_summary = st.checkbox( @@ -3413,6 +3408,7 @@ def _emit_eval_finished_notification(message: str): "eval_root": eval_path, "recursive": eval_recursive, "overwrite": eval_overwrite, + "eval_workers": eval_workers, }) if tid: enqueued.append(f"{'generate_summary_csv' if only_generate_summary else 'run_eval_dirs'} ({tid[:8]}...)") @@ -3574,30 +3570,23 @@ def _update_progress_status(done: int, total_dirs: int): ) try: - # sequential evaluation - if not eval_parallel: - for i, result_dir in enumerate(target_dirs): - _update_progress_status(i, total) - results.append(run_eval_result_for_dir(result_dir, overwrite=eval_overwrite)) - _update_progress_status(i + 1, total) - else: - max_workers = max(1, min(int(eval_workers), len(target_dirs))) - with ThreadPoolExecutor(max_workers=max_workers) as executor: - future_map = { - 
executor.submit(_run_eval_result_worker, result_dir, eval_overwrite): result_dir - for result_dir in target_dirs - } - completed = 0 - for future in as_completed(future_map): - completed += 1 - _update_progress_status(completed, total) - try: - results.append(future.result()) - except Exception as e: - result_dir = future_map.get(future, "unknown") - results.append( - {"path": result_dir, "status": "failed", "detail": str(e)} - ) + max_workers = max(1, min(int(eval_workers), len(target_dirs))) + with ThreadPoolExecutor(max_workers=max_workers) as executor: + future_map = { + executor.submit(_run_eval_result_worker, result_dir, eval_overwrite): result_dir + for result_dir in target_dirs + } + completed = 0 + for future in as_completed(future_map): + completed += 1 + _update_progress_status(completed, total) + try: + results.append(future.result()) + except Exception as e: + result_dir = future_map.get(future, "unknown") + results.append( + {"path": result_dir, "status": "failed", "detail": str(e)} + ) _update_progress_status(total, total) finally: diff --git a/evaluation_dashboard_app/worker/tasks.py b/evaluation_dashboard_app/worker/tasks.py index 8546f75..aea9897 100644 --- a/evaluation_dashboard_app/worker/tasks.py +++ b/evaluation_dashboard_app/worker/tasks.py @@ -9,6 +9,7 @@ import shutil import sys import time +from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path from typing import Any, Dict, Optional @@ -71,6 +72,114 @@ def _on_progress(done: int, total: int) -> None: return _on_progress +def _eval_worker_count(parameters: Dict[str, Any], total: int) -> int: + """Resolve bounded eval concurrency. 
Defaults to 4, capped by total dirs.""" + if total <= 0: + return 1 + raw = parameters.get("eval_workers", os.environ.get("EVAL_WORKERS_DEFAULT", 4)) + try: + workers = int(raw) + except (TypeError, ValueError): + workers = 4 + try: + max_workers = int(os.environ.get("EVAL_WORKERS_MAX", 16)) + except ValueError: + max_workers = 16 + return max(1, min(workers, max_workers, total)) + + +def _compact_eval_path(path: Any, *, parts: int = 2) -> str: + """Return a readable tail path for task logs without flooding the UI.""" + text = str(path or "").strip() + if not text: + return "unknown" + try: + p = Path(text) + tail = p.parts[-parts:] + return "/".join(tail) if tail else text + except Exception: + return text + + +def _run_eval_result_dirs( + *, + task_id: str, + eval_summary: Any, + target_dirs: list[str], + overwrite: bool, + eval_workers: int, + pct_start: float, + pct_end: float, + label: str = "Eval", +) -> list[Dict[str, Any]]: + """Run eval_result across result dirs with bounded concurrency and calm progress.""" + total = len(target_dirs) + if total <= 0: + update_task_progress(task_id, message=f"{label}: no result directories found", pct=pct_end) + return [] + + workers = max(1, min(int(eval_workers or 1), total)) + span = max(0.0, pct_end - pct_start) + statuses: list[Dict[str, Any]] = [] + counts = {"success": 0, "skipped": 0, "failed": 0} + + def _record(status: Dict[str, Any]) -> str: + statuses.append(status) + state = str(status.get("status") or "failed") + if state not in counts: + state = "failed" + counts[state] += 1 + if state == "failed": + append_task_log( + task_id, + f"{label}: eval failed for {status.get('path', '')}: {status.get('detail', '')}", + ) + return state + + def _progress(done: int, latest: str | None = None) -> None: + pct = pct_start + (done / total) * span + latest_text = f" latest: {latest}" if latest else "" + update_task_progress( + task_id, + message=( + f"{label}: completed {done}/{total} dirs " + f"(success 
{counts['success']}, skipped {counts['skipped']}, failed {counts['failed']})" + f"{latest_text}" + ), + pct=min(pct_end, pct), + ) + + append_task_log(task_id, f"{label}: running eval_result for {total} directories with {workers} worker(s)") + _progress(0) + + if workers == 1: + for i, result_dir in enumerate(target_dirs, start=1): + append_task_log(task_id, f"{label}: starting {i}/{total}: {result_dir}") + status = eval_summary.run_eval_result_for_dir(result_dir, overwrite=overwrite) + state = _record(status) + short_path = _compact_eval_path(status.get("path") or result_dir) + append_task_log(task_id, f"{label}: {i}/{total} {state}: {short_path}") + _progress(i, short_path) + return statuses + + with ThreadPoolExecutor(max_workers=workers) as executor: + future_map = { + executor.submit(eval_summary.run_eval_result_for_dir, result_dir, overwrite=overwrite): result_dir + for result_dir in target_dirs + } + for done, future in enumerate(as_completed(future_map), start=1): + result_dir = future_map[future] + try: + status = future.result() + except Exception as exc: + status = {"path": result_dir, "status": "failed", "detail": str(exc)} + state = _record(status) + short_path = _compact_eval_path(status.get("path") or result_dir) + append_task_log(task_id, f"{label}: {done}/{total} {state}: {short_path}") + _progress(done, short_path) + return statuses + + def _copy_task_parameters(parameters: Dict[str, Any]) -> Dict[str, Any]: copied: Dict[str, Any] = {} for key, value in (parameters or {}).items(): @@ -297,17 +406,19 @@ def job_run_eval_dirs(task_id: str, parameters: Dict[str, Any]) -> None: update_task_status(task_id, "failed", error_message="No result directories found") return total = len(target_dirs) - append_task_log(task_id, f"Processing {total} directories") - statuses = [] - for i, result_dir in enumerate(target_dirs): - pct = 100.0 * (i + 1) / total if total else 0 - update_task_progress(task_id, message=f"Processing {i+1}/{total}: {result_dir}", pct=pct) 
- append_task_log(task_id, f"Processing {i+1}/{total}: {result_dir}") - status = eval_summary.run_eval_result_for_dir(result_dir, overwrite=overwrite) - statuses.append(status) - if status.get("status") == "failed": - append_task_log(task_id, f"Eval failed for {result_dir}: {status.get('detail', '')}") + eval_workers = _eval_worker_count(parameters, total) + statuses = _run_eval_result_dirs( + task_id=task_id, + eval_summary=eval_summary, + target_dirs=target_dirs, + overwrite=overwrite, + eval_workers=eval_workers, + pct_start=0.0, + pct_end=90.0, + label="Eval", + ) append_task_log(task_id, "Generating summary CSV") + update_task_progress(task_id, message="Generating Summary.csv / Score.csv", pct=95) info = eval_summary.generate_summary_and_score_csv(eval_root) result_path = info.get("summary_path", eval_root) failed = [s for s in statuses if s.get("status") == "failed"] @@ -341,6 +452,7 @@ def job_run_eval_dirs(task_id: str, parameters: Dict[str, Any]) -> None: }, ) append_task_log(task_id, f"Done. 
Output: {result_path}") + update_task_progress(task_id, message="Eval complete", pct=100) _mark_run_status(task_id, parameters, task_type="run_eval_dirs", status="completed", result_path=result_path) update_task_status(task_id, "completed", result_path=result_path) except Exception as e: @@ -935,7 +1047,29 @@ def job_download_and_eval(task_id: str, parameters: Dict[str, Any]) -> None: extra={"evaluator": evaluator_context}, ) - on_progress = lambda msg: _progress_callback(task_id, msg) + def on_progress(msg: str) -> None: + append_task_log(task_id, msg) + match = re.search(r"(\d+)\s*/\s*(\d+)", msg) + pct = None + if match: + n, m = int(match.group(1)), max(1, int(match.group(2))) + ratio = n / m + if msg.startswith("Eval:"): + pct = 60.0 + ratio * 25.0 + elif msg.startswith("Parquet:"): + pct = 85.0 + ratio * 13.0 + elif msg.startswith("Downloading"): + pct = ratio * 60.0 + if pct is None: + if msg.startswith("Download complete"): + pct = 60.0 + elif msg.startswith("Generating parquet"): + pct = 85.0 + if pct is None: + update_task_progress(task_id, message=msg) + else: + update_task_progress(task_id, message=msg, pct=pct) + on_warning = lambda msg: append_task_log(task_id, msg) result = download_core.run_download_and_eval( @@ -953,6 +1087,7 @@ def job_download_and_eval(task_id: str, parameters: Dict[str, Any]) -> None: generate_parquet=generate_parquet, eval_recursive=eval_recursive, eval_overwrite=eval_overwrite, + eval_workers=_eval_worker_count(parameters, 10_000), on_progress=on_progress, on_warning=on_warning, ) @@ -1148,24 +1283,21 @@ def _on_warning(msg: str) -> None: if eval_summary: target_dirs = eval_summary.find_eval_result_dirs(str(output_path), recursive=True) - statuses = [] total = len(target_dirs) if target_dirs: - append_task_log(task_id, f"{role}: running eval_result for {total} directories") + statuses = _run_eval_result_dirs( + task_id=task_id, + eval_summary=eval_summary, + target_dirs=target_dirs, + overwrite=False, + 
eval_workers=_eval_worker_count({}, total), + pct_start=download_end, + pct_end=eval_end, + label=f"{role}: eval_result", + ) else: update_task_progress(task_id, message=f"{role}: no eval_result directories found", pct=eval_end) - for i, result_dir in enumerate(target_dirs): - pct = download_end + (i / total) * max(0.0, eval_end - download_end) if total else eval_end - message = f"{role}: eval_result {i + 1}/{total}: {result_dir}" - update_task_progress(task_id, message=message, pct=pct) - append_task_log(task_id, message) - status = eval_summary.run_eval_result_for_dir(result_dir, overwrite=False) - statuses.append(status) - if status.get("status") == "failed": - append_task_log( - task_id, - f"WARNING: {role}: eval_result failed for {result_dir}: {status.get('detail', '')}", - ) + statuses = [] if target_dirs: update_task_progress(task_id, message=f"{role}: generating Summary.csv / Score.csv", pct=eval_end) csv_info = eval_summary.generate_summary_and_score_csv(str(output_path)) @@ -2068,16 +2200,19 @@ def on_eval_progress(status: str, elapsed: float) -> None: target_dirs = eval_summary.find_eval_result_dirs(output_path, recursive=eval_recursive) if target_dirs: total = len(target_dirs) - eval_statuses = [] - for i, result_dir in enumerate(target_dirs): - pct = 65 + (i / total) * 20 - update_task_progress(task_id, message=f"Evaluating {i+1}/{total}: {result_dir}", pct=pct) - status = eval_summary.run_eval_result_for_dir(result_dir, overwrite=eval_overwrite) - eval_statuses.append(status) - if status.get("status") == "failed": - append_task_log(task_id, f"Eval failed for {result_dir}: {status.get('detail', '')}") + eval_statuses = _run_eval_result_dirs( + task_id=task_id, + eval_summary=eval_summary, + target_dirs=target_dirs, + overwrite=eval_overwrite, + eval_workers=_eval_worker_count(parameters, total), + pct_start=65.0, + pct_end=85.0, + label="Eval", + ) # Generate summary CSVs + update_task_progress(task_id, message="Generating Summary.csv / Score.csv", 
pct=85) csv_info = eval_summary.generate_summary_and_score_csv(output_path) failed = [s for s in eval_statuses if s.get("status") == "failed"] skipped = [s for s in eval_statuses if s.get("status") == "skipped"] From 01071f68148cf5273ad5182f13773179dd6e8104 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Fri, 29 May 2026 16:48:45 +0900 Subject: [PATCH 88/94] fix: update file permissions for multiple application files - Changed file permissions for several application files to ensure proper execution rights. - Updated permissions for `catalogs.json`, `docker-entrypoint.sh`, `Overview.py`, and various documentation files to improve accessibility and functionality. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/Overview.py | 206 +++++++++++-- evaluation_dashboard_app/Readme.en.md | 0 evaluation_dashboard_app/Readme.md | 0 evaluation_dashboard_app/catalogs.json | 0 evaluation_dashboard_app/docker-entrypoint.sh | 0 .../lib/specsheet_report.py | 278 +++++++++++++++++- .../requirements-docker.txt | 0 evaluation_dashboard_app/requirements.txt | 0 8 files changed, 446 insertions(+), 38 deletions(-) mode change 100644 => 100755 evaluation_dashboard_app/Overview.py mode change 100644 => 100755 evaluation_dashboard_app/Readme.en.md mode change 100644 => 100755 evaluation_dashboard_app/Readme.md mode change 100644 => 100755 evaluation_dashboard_app/catalogs.json mode change 100644 => 100755 evaluation_dashboard_app/docker-entrypoint.sh mode change 100644 => 100755 evaluation_dashboard_app/requirements-docker.txt mode change 100644 => 100755 evaluation_dashboard_app/requirements.txt diff --git a/evaluation_dashboard_app/Overview.py b/evaluation_dashboard_app/Overview.py old mode 100644 new mode 100755 index d3e527b..5e7ab84 --- a/evaluation_dashboard_app/Overview.py +++ b/evaluation_dashboard_app/Overview.py @@ -17,12 +17,14 @@ DEFAULT_SPECSHEET_PROJECT_ID, DEFAULT_SPECSHEET_TOPIC, DEFAULT_TREND_METADATA_TEXT, - 
collect_candidate_specsheet_labels, generate_specsheet_pdf, + get_release_specsheet_context, get_specsheet_artifact_paths, is_specsheet_pdf_fresh, parse_trend_metadata_text, progress_fraction_from_message, + resolve_specsheet_generation_run_path, + write_trend_metadata, ) from lib.page_chrome import ( inject_app_page_styles, @@ -652,25 +654,6 @@ def show_tp_mean_by_label_compare(df_list, run_labels, label_col, label_jp_map=N "perception_labels": list(_report_filters["perception_labels"]), "product_labels": list(_report_filters["product_labels"]), } -_specsheet_run_records = _report_runs -_specsheet_run_labels = _report_labels -_specsheet_run_options = { - f"{label} · {get_run_display_name(record['path'])}": record["path"] - for label, record in zip(_specsheet_run_labels, _specsheet_run_records) -} -_specsheet_run_option_keys = list(_specsheet_run_options.keys()) -_default_specsheet_run_selection = _specsheet_run_option_keys[:1] -_default_specsheet_labels = list(DEFAULT_SPECSHEET_LABELS) -_default_specsheet_project_id = st.session_state.get("specsheet_project_id", DEFAULT_SPECSHEET_PROJECT_ID) -_default_specsheet_topic = st.session_state.get("specsheet_topic_name", DEFAULT_SPECSHEET_TOPIC) -_detected_specsheet_labels = collect_candidate_specsheet_labels( - runA["path"], - preferred=_report_filters["perception_labels"], -) -_specsheet_label_options = list(dict.fromkeys(_default_specsheet_labels + _detected_specsheet_labels)) -_single_specsheet_run_path = _specsheet_run_records[0]["path"] -_default_specsheet_version = st.session_state.get("specsheet_version", get_run_display_name(_single_specsheet_run_path)) - pdf_col1, pdf_col2 = st.columns([1.2, 2.8]) with pdf_col1: if st.button("Generate Evaluation Dashboard Report", type="primary", use_container_width=True): @@ -718,6 +701,34 @@ def _update_pdf_status(message: str) -> None: "Generate or reuse the release-oriented spec-sheet PDF. 
Missing current/future CSV files are auto-created from parquet before building.", ) +_specsheet_run_records = _report_runs +_specsheet_run_labels = _report_labels +_specsheet_run_options = {} +for label, record in zip(_specsheet_run_labels, _specsheet_run_records): + source_path = record["path"] + target_path = resolve_specsheet_generation_run_path(source_path) + release_context = get_release_specsheet_context(source_path) + option_label = f"{label} · {get_run_display_name(source_path)}" + if release_context is not None and target_path != source_path: + option_label = ( + f"{label} · {get_run_display_name(source_path)} " + f"(PDF body: {get_run_display_name(target_path)})" + ) + _specsheet_run_options[option_label] = { + "source_path": source_path, + "target_path": target_path, + "release_context": release_context, + } +_specsheet_run_option_keys = list(_specsheet_run_options.keys()) +_default_specsheet_run_selection = _specsheet_run_option_keys[:1] +_default_specsheet_labels = list(DEFAULT_SPECSHEET_LABELS) +_default_specsheet_project_id = st.session_state.get("specsheet_project_id", DEFAULT_SPECSHEET_PROJECT_ID) +_default_specsheet_topic = st.session_state.get("specsheet_topic_name", DEFAULT_SPECSHEET_TOPIC) +_filter_specsheet_labels = [str(v) for v in _report_filters["perception_labels"] if str(v).strip()] +_specsheet_label_options = list(dict.fromkeys(_default_specsheet_labels + _filter_specsheet_labels)) +_single_specsheet_run_path = resolve_specsheet_generation_run_path(_specsheet_run_records[0]["path"]) +_default_specsheet_version = get_run_display_name(_single_specsheet_run_path) + if mode == "Compare Mode": selected_specsheet_run_keys = st.multiselect( "Runs to generate spec-sheet for", @@ -729,19 +740,103 @@ def _update_pdf_status(message: str) -> None: else: selected_specsheet_run_keys = _specsheet_run_option_keys[:1] -selected_specsheet_run_paths = [ +_selected_specsheet_entries = [ _specsheet_run_options[key] for key in selected_specsheet_run_keys if 
key in _specsheet_run_options ] +selected_specsheet_run_paths = [] +_seen_specsheet_targets = set() +for entry in _selected_specsheet_entries: + target_path = entry["target_path"] + target_key = str(target_path.resolve()) + if target_key in _seen_specsheet_targets: + continue + selected_specsheet_run_paths.append(target_path) + _seen_specsheet_targets.add(target_key) +selected_specsheet_release_contexts = [] +_seen_specsheet_releases = set() +for entry in _selected_specsheet_entries: + release_context = entry["release_context"] + if release_context is None: + continue + release_dir = release_context.get("release_dir") + release_key = str(release_dir.resolve()) if isinstance(release_dir, Path) else str(release_dir) + if release_key in _seen_specsheet_releases: + continue + selected_specsheet_release_contexts.append(release_context) + _seen_specsheet_releases.add(release_key) _active_specsheet_paths = [get_specsheet_artifact_paths(path) for path in selected_specsheet_run_paths] _selected_trend_metadata_text = "" -if len(_active_specsheet_paths) == 1 and _active_specsheet_paths[0]["trend_metadata"].exists(): +_selected_trend_metadata_path = None +if len(selected_specsheet_release_contexts) == 1: + candidate_path = selected_specsheet_release_contexts[0].get("metadata") + if isinstance(candidate_path, Path) and candidate_path.exists(): + _selected_trend_metadata_path = candidate_path +if _selected_trend_metadata_path is None and len(_active_specsheet_paths) == 1 and _active_specsheet_paths[0]["trend_metadata"].exists(): + _selected_trend_metadata_path = _active_specsheet_paths[0]["trend_metadata"] +if _selected_trend_metadata_path is not None: try: - _selected_trend_metadata_text = _active_specsheet_paths[0]["trend_metadata"].read_text(encoding="utf-8") + _selected_trend_metadata_text = _selected_trend_metadata_path.read_text(encoding="utf-8") except Exception: _selected_trend_metadata_text = "" +_selected_metadata_defaults = {} +if _selected_trend_metadata_text: + try: 
+ _selected_metadata_defaults = parse_trend_metadata_text(_selected_trend_metadata_text) + except Exception: + _selected_metadata_defaults = {} + +def _specsheet_title_version_from_metadata(metadata: dict) -> str: + explicit = str(metadata.get("version_abbr") or "").strip() + if explicit: + return explicit + version = str(metadata.get("pilot_auto_version") or "").strip() + if version.lower().startswith("pilot.auto "): + return version[len("Pilot.Auto "):].strip() + return version + +_metadata_default_version = _specsheet_title_version_from_metadata(_selected_metadata_defaults) +if _metadata_default_version: + _default_specsheet_version = _metadata_default_version +_metadata_trend_topic = str(_selected_metadata_defaults.get("topic_name") or "").strip() +if ( + _metadata_trend_topic + and _metadata_trend_topic != DEFAULT_SPECSHEET_TOPIC + and st.session_state.get("specsheet_topic_name") == _metadata_trend_topic +): + st.session_state["specsheet_topic_name"] = DEFAULT_SPECSHEET_TOPIC + _default_specsheet_topic = DEFAULT_SPECSHEET_TOPIC + +_specsheet_defaults_source = str(_selected_trend_metadata_path or _single_specsheet_run_path) +_previous_auto_version = st.session_state.get("specsheet_version_auto_value") +_current_version = st.session_state.get("specsheet_version") +if ( + st.session_state.get("specsheet_version_auto_source") != _specsheet_defaults_source + and ( + "specsheet_version" not in st.session_state + or _current_version == _previous_auto_version + or str(_current_version or "").endswith(("/performance", "/devops")) + ) +): + st.session_state["specsheet_version"] = _default_specsheet_version +st.session_state["specsheet_version_auto_source"] = _specsheet_defaults_source +st.session_state["specsheet_version_auto_value"] = _default_specsheet_version + +_previous_auto_topic = st.session_state.get("specsheet_topic_auto_value") +_current_topic = st.session_state.get("specsheet_topic_name") +if ( + st.session_state.get("specsheet_topic_auto_source") != 
_specsheet_defaults_source + and ( + "specsheet_topic_name" not in st.session_state + or _current_topic == _previous_auto_topic + ) +): + st.session_state["specsheet_topic_name"] = _default_specsheet_topic +st.session_state["specsheet_topic_auto_source"] = _specsheet_defaults_source +st.session_state["specsheet_topic_auto_value"] = _default_specsheet_topic + specsheet_cfg_col1, specsheet_cfg_col2, specsheet_cfg_col3 = st.columns([1.4, 1.2, 1.4]) with specsheet_cfg_col1: specsheet_project_id = st.text_input( @@ -774,6 +869,31 @@ def _update_pdf_status(message: str) -> None: if not selected_specsheet_run_paths: st.info("Pick at least one run to build the release spec-sheet.") +if _selected_trend_metadata_text and "specsheet_include_trend" not in st.session_state: + st.session_state["specsheet_include_trend"] = True + +if selected_specsheet_release_contexts: + for release_context in selected_specsheet_release_contexts[:1]: + release_dir = release_context.get("release_dir") + roles = release_context.get("roles", {}) + role_status = [] + if isinstance(roles, dict): + for role_name in ("performance", "devops"): + role_info = roles.get(role_name) + if not isinstance(role_info, dict): + continue + bits = [] + bits.append("summary.json" if role_info.get("has_summary") else "no summary.json") + bits.append("metadata.yaml" if role_info.get("has_metadata") else "no metadata.yaml") + role_status.append(f"{role_name}: {', '.join(bits)}") + release_text = f"Release folder detected: `{path_display(release_dir)}`." if isinstance(release_dir, Path) else "Release folder detected." + if role_status: + release_text += " Trend generation will use " + "; ".join(role_status) + "." + st.info(release_text) + +if _selected_trend_metadata_path is not None and _selected_trend_metadata_text: + st.info(f"Existing trend metadata found at `{path_display(_selected_trend_metadata_path)}`. 
Review it below before generating.") + specsheet_trend_enabled = st.toggle( "Include trend data", value=bool(st.session_state.get("specsheet_include_trend", bool(_selected_trend_metadata_text))), @@ -782,20 +902,35 @@ def _update_pdf_status(message: str) -> None: ) trend_metadata_payload = None +trend_metadata_changed = False +trend_metadata_change_confirmed = False if specsheet_trend_enabled: st.caption( "Select the full/performance run for the PDF body. Other full/usecase/devops trend runs are discovered from matching metadata under the data root." ) + _trend_metadata_source_key = str(_selected_trend_metadata_path) if _selected_trend_metadata_path is not None else "__default__" + if ( + st.session_state.get("specsheet_trend_metadata_source") != _trend_metadata_source_key + or "specsheet_trend_metadata_text" not in st.session_state + ): + st.session_state["specsheet_trend_metadata_text"] = _selected_trend_metadata_text or DEFAULT_TREND_METADATA_TEXT + st.session_state["specsheet_trend_metadata_source"] = _trend_metadata_source_key + st.session_state["specsheet_confirm_metadata_changes"] = False trend_metadata_text = st.text_area( "Trend metadata YAML", - value=st.session_state.get( - "specsheet_trend_metadata_text", - _selected_trend_metadata_text or DEFAULT_TREND_METADATA_TEXT, - ), key="specsheet_trend_metadata_text", height=180, help="Required keys: tags, pilot_auto_version, data_count, description, date.", ) + trend_metadata_changed = bool(_selected_trend_metadata_text) and ( + trend_metadata_text.strip() != _selected_trend_metadata_text.strip() + ) + if trend_metadata_changed: + st.warning("The existing metadata.yaml has been edited. 
Confirm the change before generating so the saved release metadata is updated intentionally.") + trend_metadata_change_confirmed = st.checkbox( + "Confirm metadata.yaml changes", + key="specsheet_confirm_metadata_changes", + ) try: trend_metadata_payload = parse_trend_metadata_text(trend_metadata_text) st.success("Trend metadata looks valid.") @@ -826,6 +961,8 @@ def _update_pdf_status(message: str) -> None: raise ValueError("Trend-enabled release spec-sheet generation currently supports exactly one run.") if specsheet_trend_enabled and trend_metadata_payload is None: raise ValueError("Valid trend metadata is required when trend mode is enabled.") + if specsheet_trend_enabled and trend_metadata_changed and not trend_metadata_change_confirmed: + raise ValueError("Confirm the metadata.yaml changes before generating.") stage_progress = { "Using existing up-to-date spec-sheet PDF": 1.0, @@ -859,6 +996,21 @@ def _update_specsheet_status(message: str) -> None: generated_pdfs: list[tuple[Path, bool]] = [] for idx, run_path in enumerate(selected_specsheet_run_paths, start=1): _update_specsheet_status(f"Run {idx}/{len(selected_specsheet_run_paths)}: {get_run_display_name(run_path)}") + if specsheet_trend_enabled and trend_metadata_payload is not None: + if _selected_trend_metadata_path is not None and trend_metadata_changed: + _selected_trend_metadata_path.write_text( + yaml.safe_dump(trend_metadata_payload, allow_unicode=True, sort_keys=False), + encoding="utf-8", + ) + if len(selected_specsheet_release_contexts) == 1: + roles = selected_specsheet_release_contexts[0].get("roles", {}) + if isinstance(roles, dict): + for role_info in roles.values(): + if not isinstance(role_info, dict) or not role_info.get("has_summary"): + continue + role_run_dir = role_info.get("run_dir") + if isinstance(role_run_dir, Path): + write_trend_metadata(role_run_dir, trend_metadata_payload) pdf_path, generated = generate_specsheet_pdf( run_path, project_id=specsheet_project_id, diff --git 
a/evaluation_dashboard_app/Readme.en.md b/evaluation_dashboard_app/Readme.en.md old mode 100644 new mode 100755 diff --git a/evaluation_dashboard_app/Readme.md b/evaluation_dashboard_app/Readme.md old mode 100644 new mode 100755 diff --git a/evaluation_dashboard_app/catalogs.json b/evaluation_dashboard_app/catalogs.json old mode 100644 new mode 100755 diff --git a/evaluation_dashboard_app/docker-entrypoint.sh b/evaluation_dashboard_app/docker-entrypoint.sh old mode 100644 new mode 100755 diff --git a/evaluation_dashboard_app/lib/specsheet_report.py b/evaluation_dashboard_app/lib/specsheet_report.py index 48dbc1c..41cad2b 100644 --- a/evaluation_dashboard_app/lib/specsheet_report.py +++ b/evaluation_dashboard_app/lib/specsheet_report.py @@ -4,17 +4,20 @@ from dataclasses import dataclass import inspect import json +import os import re +import shutil +from types import SimpleNamespace from pathlib import Path from typing import Any, Callable, Iterable, Sequence import pandas as pd import yaml -from lib.perception_catalog_io import build_scene_dataframe_from_pkl_dir from lib.path_utils import get_data_root DEFAULT_SPECSHEET_TOPIC = "perception.object_recognition.tracking.objects" +DEFAULT_TREND_TOPIC = "perception.object_recognition.objects" DEFAULT_SPECSHEET_PROJECT_ID = "x2_dev" DEFAULT_SPECSHEET_LABELS = ["car", "truck", "bus", "bicycle", "pedestrian", "motorcycle"] DEFAULT_SPECSHEET_METRICS = [ @@ -38,6 +41,8 @@ ] TREND_METADATA_FILENAME = "metadata.yaml" TREND_SUMMARY_FILENAME = "summary.json" +SPECSHEET_RELEASE_ROLE_DIRS = ("performance", "devops") +GENERATED_TREND_HISTORY_DIRNAME = "_app_trend_history" FULL_DATASET_EVALUATION_HEADER = "全数データセット評価" DEFAULT_TREND_METADATA_TEXT = """tags: [trend] pilot_auto_version: "Pilot.Auto v4.3.0 (centerpoint x2/2.3.1)" @@ -76,6 +81,64 @@ def get_specsheet_artifact_paths(run_dir: str | Path) -> dict[str, Path]: } +def _looks_like_specsheet_release_container(path: Path) -> bool: + return ( + (path / 
TREND_METADATA_FILENAME).exists() + and any((path / role).is_dir() for role in SPECSHEET_RELEASE_ROLE_DIRS) + ) + + +def get_release_specsheet_context(run_dir: str | Path) -> dict[str, Any] | None: + """Return release-folder context for specsheet workflow output, if present.""" + run_path = Path(run_dir) + if _looks_like_specsheet_release_container(run_path): + release_dir = run_path + elif run_path.name in SPECSHEET_RELEASE_ROLE_DIRS and _looks_like_specsheet_release_container(run_path.parent): + release_dir = run_path.parent + else: + return None + + roles: dict[str, dict[str, Path | bool]] = {} + for role in SPECSHEET_RELEASE_ROLE_DIRS: + role_dir = release_dir / role + if not role_dir.is_dir(): + continue + role_paths = get_specsheet_artifact_paths(role_dir) + roles[role] = { + "run_dir": role_dir, + "metadata": role_paths["trend_metadata"], + "summary": role_paths["trend_summary"], + "has_metadata": role_paths["trend_metadata"].exists(), + "has_summary": role_paths["trend_summary"].exists(), + } + + metadata_path = release_dir / TREND_METADATA_FILENAME + if not metadata_path.exists(): + performance_metadata = roles.get("performance", {}).get("metadata") + if isinstance(performance_metadata, Path) and performance_metadata.exists(): + metadata_path = performance_metadata + + return { + "release_dir": release_dir, + "metadata": metadata_path, + "roles": roles, + "performance_dir": roles.get("performance", {}).get("run_dir"), + "devops_dir": roles.get("devops", {}).get("run_dir"), + } + + +def resolve_specsheet_generation_run_path(run_dir: str | Path) -> Path: + """Use the performance child as the PDF body for release workflow folders.""" + run_path = Path(run_dir) + context = get_release_specsheet_context(run_path) + if context is None: + return run_path + performance_dir = context.get("performance_dir") + if isinstance(performance_dir, Path): + return performance_dir + return run_path + + def list_specsheet_source_parquets(run_dir: str | Path) -> list[Path]: 
paths = get_specsheet_artifact_paths(run_dir) run_path = paths["run_dir"] @@ -230,6 +293,16 @@ def _trend_version_abbr(metadata: dict[str, Any]) -> str: return _PILOT_AUTO_PREFIX_PATTERN.sub("", version).strip() or version +def _infer_trend_topic(metadata: dict[str, Any], metadata_path: str | Path) -> str: + explicit = str(metadata.get("topic_name") or "").strip() + if explicit: + return explicit + for part in reversed(Path(metadata_path).parts): + if part.startswith("perception."): + return part + return DEFAULT_TREND_TOPIC + + def write_trend_metadata(run_dir: str | Path, metadata: dict[str, Any]) -> Path: paths = get_specsheet_artifact_paths(run_dir) resource_dir = paths["resource_dir"] @@ -249,6 +322,8 @@ def discover_trend_metadata_files(root_dir: str | Path | None = None) -> list[Pa for metadata_path in base_dir.rglob(TREND_METADATA_FILENAME): if not metadata_path.is_file(): continue + if GENERATED_TREND_HISTORY_DIRNAME in metadata_path.parts: + continue if not (metadata_path.parent / TREND_SUMMARY_FILENAME).exists(): continue matches.append(metadata_path) @@ -427,7 +502,46 @@ def _sort_key(group: TrendReleaseGroup) -> tuple[str, str]: newest = max(dates) if dates else "" return (newest, group.display_name) - return sorted(grouped.values(), key=_sort_key) + return sorted(_deduplicate_trend_release_groups(grouped.values()), key=_sort_key) + + +def _trend_group_identity(group: TrendReleaseGroup) -> tuple[str, str, str, str, str, str, tuple[str, ...]]: + metadata = {} + for role in ("full", "usecase", "devops", "performance_blocks", "unknown"): + if role in group.jobs: + metadata = group.jobs[role].get("metadata", {}) + break + return ( + str(metadata.get("release_group") or ""), + str(group.topic_name or ""), + str(metadata.get("pilot_auto_version") or ""), + str(metadata.get("date") or ""), + str(metadata.get("description") or ""), + str(metadata.get("data_count") or ""), + tuple(sorted(group.jobs.keys())), + ) + + +def _trend_group_preference(group: 
TrendReleaseGroup) -> tuple[int, int, str]: + generated_history = any( + GENERATED_TREND_HISTORY_DIRNAME in Path(job.get("metadata_path", "")).parts + for job in group.jobs.values() + ) + return ( + 0 if generated_history else 1, + len(group.jobs), + str(group.base_dir), + ) + + +def _deduplicate_trend_release_groups(groups: Iterable[TrendReleaseGroup]) -> list[TrendReleaseGroup]: + selected: dict[tuple[str, str, str, str, str, str, tuple[str, ...]], TrendReleaseGroup] = {} + for group in groups: + identity = _trend_group_identity(group) + current = selected.get(identity) + if current is None or _trend_group_preference(group) > _trend_group_preference(current): + selected[identity] = group + return list(selected.values()) def _trend_version_sort_key(pilot_auto_version: str) -> tuple[tuple[int, int, int], str, tuple[int, int, int]]: @@ -586,6 +700,56 @@ def extract_devops_case_rows(summary: dict[str, Any]) -> list[dict[str, Any]]: return rows +def _normalize_devops_summary_structure(summary: dict[str, Any]) -> dict[str, dict[str, dict[str, dict[str, int]]]]: + normalized: dict[str, dict[str, dict[str, dict[str, int]]]] = {} + for major_category, mid_categories in summary.items(): + if not isinstance(mid_categories, dict): + continue + normalized_major = normalized.setdefault(str(major_category), {}) + for mid_category, minor_or_cases in mid_categories.items(): + if not isinstance(minor_or_cases, dict): + continue + normalized_mid = normalized_major.setdefault(str(mid_category), {}) + if {"passed", "total"}.intersection(minor_or_cases.keys()): + normalized_mid[str(mid_category)] = { + "passed": int(minor_or_cases.get("passed", 0) or 0), + "total": int(minor_or_cases.get("total", 0) or 0), + } + continue + for case_name, result in minor_or_cases.items(): + if not isinstance(result, dict): + continue + normalized_mid[str(case_name)] = { + "passed": int(result.get("passed", 0) or 0), + "total": int(result.get("total", 0) or 0), + } + return normalized + + +def 
_align_devops_trend_data_structures(trend_data_rows: list[dict[str, Any]]) -> list[dict[str, Any]]: + structure: dict[str, dict[str, set[str]]] = {} + for row in trend_data_rows: + devops_data = _normalize_devops_summary_structure(row.get("devops_data", {})) + row["devops_data"] = devops_data + for major_category, mid_categories in devops_data.items(): + major_structure = structure.setdefault(major_category, {}) + for mid_category, cases in mid_categories.items(): + major_structure.setdefault(mid_category, set()).update(cases.keys()) + + for row in trend_data_rows: + devops_data = row.get("devops_data", {}) + if not isinstance(devops_data, dict): + devops_data = {} + row["devops_data"] = devops_data + for major_category, mid_categories in structure.items(): + row_major = devops_data.setdefault(major_category, {}) + for mid_category, cases in mid_categories.items(): + row_mid = row_major.setdefault(mid_category, {}) + for case_name in cases: + row_mid.setdefault(case_name, {"passed": 0, "total": 0}) + return trend_data_rows + + def load_performance_trend_data(metadata_list: Sequence[Path]) -> list[dict[str, str | int | float]]: trend_data_rows: list[dict[str, Any]] = [] for metadata_path in metadata_list: @@ -605,6 +769,7 @@ def load_performance_trend_data(metadata_list: Sequence[Path]) -> list[dict[str, "data_count": metadata.get("data_count"), "description": metadata.get("description"), "date": metadata.get("date"), + "topic": _infer_trend_topic(metadata, metadata_path), "summary": summary_list, } ) @@ -635,7 +800,10 @@ def _avg(metric_name: str) -> float: "data_count": row.get("data_count"), "description": row.get("description"), "date": row.get("date"), + "topic": row.get("topic"), "mAP": _avg("mAP"), + "precision": _avg("precision"), + "recall": _avg("recall"), "minADE@1s": _avg("minADE@1s"), "minFDE@1s": _avg("minFDE@1s"), "minADE@3s": _avg("minADE@3s"), @@ -663,6 +831,7 @@ def load_devops_trend_data(metadata_list: Sequence[Path]) -> list[dict[str, Any] rows = 
extract_devops_case_rows(summary) if not rows: continue + normalized_summary = _normalize_devops_summary_structure(summary) overall_passed = sum(int(row["passed"]) for row in rows) overall_total = sum(int(row["total"]) for row in rows) trend_data_rows.append( @@ -672,16 +841,17 @@ def load_devops_trend_data(metadata_list: Sequence[Path]) -> list[dict[str, Any] "data_count": metadata.get("data_count"), "description": metadata.get("description"), "date": metadata.get("date"), + "topic": _infer_trend_topic(metadata, metadata_path), "overall_pass_rate": (overall_passed / overall_total * 100.0) if overall_total > 0 else 0.0, "scenario_count": overall_total, - "devops_data": summary, + "devops_data": normalized_summary, } ) trend_data_rows.sort(key=lambda row: _trend_version_sort_key(str(row.get("version") or ""))) - return trend_data_rows + return _align_devops_trend_data_structures(trend_data_rows) def _add_devops_detail_trend_rates(devops_trend_data: Sequence[dict[str, Any]]) -> list[str]: @@ -714,6 +884,7 @@ def _add_devops_detail_trend_rates(devops_trend_data: Sequence[dict[str, Any]]) def _build_trend_context( metadata_list: Sequence[Path], output_dir: Path, + current_devops_summary_path: Path | None = None, progress_callback: Callable[[str], None] | None = None, ) -> dict[str, object]: if not metadata_list: @@ -721,6 +892,8 @@ def _build_trend_context( "performance_trend_data": [], "map_trend_plot_path": output_dir / "map_trend.png", "prediction_trend_plot_path": output_dir / "prediction_trend.png", + "devops_data": {}, + "devops_plot_path": None, "devops_trend_data": [], "devops_trend_plot_path": output_dir / "devops_trend.png", "job_ids": [], @@ -733,6 +906,7 @@ def _build_trend_context( generate_devops_trend_detail_plot, generate_devops_trend_plot, ) + from perception_catalog_analyzer.plot.devops import generate_devops_plot except ImportError as exc: raise RuntimeError( "perception_catalog_analyzer trend support is unavailable. 
" @@ -751,6 +925,16 @@ def _build_trend_context( devops_trend_data = load_devops_trend_data(list(metadata_list)) devops_trend_plot_path = output_dir / "devops_trend.png" + devops_data = {} + devops_plot_path = None + if current_devops_summary_path is not None and current_devops_summary_path.exists(): + current_devops_summary = load_trend_summary_file(current_devops_summary_path) + if classify_trend_summary(current_devops_summary) == "devops": + devops_data = _normalize_devops_summary_structure(current_devops_summary) + if devops_data: + _notify(progress_callback, "Rendering current pass-rate plot") + devops_plot_path = output_dir / "devops.png" + generate_devops_plot(devops_data, devops_plot_path) if devops_trend_data: _notify(progress_callback, "Rendering pass-rate trend plots") generate_devops_trend_plot(devops_trend_data, devops_trend_plot_path) @@ -766,6 +950,8 @@ def _build_trend_context( "performance_trend_data": performance_trend_data, "map_trend_plot_path": map_trend_plot_path, "prediction_trend_plot_path": prediction_trend_plot_path, + "devops_data": devops_data, + "devops_plot_path": devops_plot_path, "devops_trend_data": devops_trend_data, "devops_trend_plot_path": devops_trend_plot_path, "job_ids": [], @@ -788,12 +974,13 @@ def _update_template_compat( parameters = {} trend_context = trend_context or {} + path_manager = SimpleNamespace(specsheet_path=context_dir) semantic_kwargs = { "project_id": project_id, "pilot_auto_version": version, "version": version, - "devops_data": {}, - "devops_plot_path": None, + "devops_data": trend_context.get("devops_data", {}), + "devops_plot_path": trend_context.get("devops_plot_path"), "performance_trend_data": trend_context.get("performance_trend_data", []), "map_trend_plot_path": trend_context.get("map_trend_plot_path", context_dir / "map_trend.png"), "prediction_trend_plot_path": trend_context.get( @@ -807,6 +994,7 @@ def _update_template_compat( "template_name": "static_body.html", "extensions": ["html"], 
"template_dir": str(template_dir), + "path_manager": path_manager, "show_other_infos": bool(trend_context.get("performance_trend_data")), } @@ -814,7 +1002,8 @@ def _update_template_compat( param.kind == inspect.Parameter.VAR_KEYWORD for param in parameters.values() ) if accepts_kwargs or not parameters: - return update_template_func(**semantic_kwargs) + with _patch_template_dataset_paths(update_template_func, context_dir): + return update_template_func(**semantic_kwargs) args: list[object] = [] kwargs: dict[str, object] = {} @@ -829,7 +1018,37 @@ def _update_template_compat( args.append(value) elif param.kind == inspect.Parameter.KEYWORD_ONLY: kwargs[name] = value - return update_template_func(*args, **kwargs) + with _patch_template_dataset_paths(update_template_func, context_dir): + return update_template_func(*args, **kwargs) + + +@contextmanager +def _patch_template_dataset_paths( + update_template_func: Callable[..., Sequence[str]], + context_dir: Path, +): + """Redirect analyzer dataset-summary outputs away from read-only package config.""" + globals_dict = getattr(update_template_func, "__globals__", {}) + patch_keys = ("DATASET_SUMMARY_PATH", "DATASET_TRAIN_PATH", "DATASET_TEST_PATH") + originals = {key: globals_dict.get(key) for key in patch_keys if key in globals_dict} + if not originals: + yield + return + + dataset_dir = context_dir / "dataset_assets" + dataset_dir.mkdir(parents=True, exist_ok=True) + try: + for key, original_path in originals.items(): + if not isinstance(original_path, Path) or not original_path.exists(): + continue + target_path = dataset_dir / original_path.name + if not target_path.exists(): + shutil.copy2(original_path, target_path) + globals_dict[key] = target_path + yield + finally: + for key, original_path in originals.items(): + globals_dict[key] = original_path def _scene_dataframe_from_dir_compat( scene_dataframe_cls, @@ -924,10 +1143,21 @@ def _get_blocks_compat( evaluation_type: str, ): """Call get_blocks across analyzer 
versions with different keyword support.""" + parquet_compression = "snappy" + try: + from perception_catalog_analyzer.types import ParquetCompression + + parquet_compression = ParquetCompression.SNAPPY + except Exception: + pass + semantic_kwargs = { "df": df, "labels": list(labels), "metrics": list(metrics), + "resource_path": outdir, + "html_path": outdir.parent if outdir.name == "resources" else outdir, + "parquet_compression": parquet_compression, "topic_name": topic_name, "topic": topic_name, "path": outdir, @@ -971,10 +1201,12 @@ def _specsheet_compat( report_name: str, ) -> None: """Call specsheet across analyzer versions with path/outdir differences.""" + path_manager = SimpleNamespace(specsheet_path=outdir) semantic_kwargs = { "html": list(html), "abstract_html": list(abstract_html), "detailed_html": list(detailed_html), + "path_manager": path_manager, "path": outdir, "outdir": outdir, "report_name": report_name, @@ -1028,6 +1260,8 @@ def ensure_specsheet_csvs( _copy_parquet_to_csv(fallback, current_csv) else: _notify(progress_callback, "No CSV found. 
Building CSV from pkl / pkl.z files") + from lib.perception_catalog_io import build_scene_dataframe_from_pkl_dir + skip_counts: dict[str, int] = {} def _on_progress(done: int, total: int) -> None: @@ -1097,6 +1331,10 @@ def generate_specsheet_pdf( resource_dir = run_path / "resources" resource_dir.mkdir(parents=True, exist_ok=True) specsheet_dir.mkdir(parents=True, exist_ok=True) + block_resource_dir = specsheet_dir / "resources" + block_resource_dir.mkdir(parents=True, exist_ok=True) + trend_asset_dir = specsheet_dir / "trend_assets" + trend_asset_dir.mkdir(parents=True, exist_ok=True) _notify(progress_callback, "Loading CSV files") df = _scene_dataframe_from_dir_compat( @@ -1117,7 +1355,7 @@ def generate_specsheet_pdf( labels=list(labels), metrics=metrics, topic_name=topic_name, - outdir=resource_dir.resolve(), + outdir=block_resource_dir.resolve(), evaluation_type="full", ) @@ -1126,17 +1364,35 @@ def generate_specsheet_pdf( if trend_metadata is None: raise ValueError("Trend metadata is required when trend mode is enabled.") _notify(progress_callback, "Validating full trend summary") - ensure_full_trend_summary(paths["trend_summary"]) + generated_trend_summary = block_resource_dir / TREND_SUMMARY_FILENAME + trend_summary_path = generated_trend_summary if generated_trend_summary.exists() else paths["trend_summary"] + ensure_full_trend_summary(trend_summary_path) + if generated_trend_summary.exists() and not paths["trend_summary"].exists(): + shutil.copy2(generated_trend_summary, paths["trend_summary"]) _notify(progress_callback, "Saving trend metadata") write_trend_metadata(run_path, trend_metadata) metadata_list = discover_trend_metadata_files() + release_context = get_release_specsheet_context(run_path) + current_devops_summary_path = None + if release_context is not None: + roles = release_context.get("roles", {}) + if isinstance(roles, dict): + devops_info = roles.get("devops", {}) + if isinstance(devops_info, dict): + summary_path = 
devops_info.get("summary") + if isinstance(summary_path, Path): + current_devops_summary_path = summary_path trend_context = _build_trend_context( metadata_list, - specsheet_dir, + trend_asset_dir, + current_devops_summary_path=current_devops_summary_path, progress_callback=progress_callback, ) _notify(progress_callback, "Rendering PDF") + for stale_output in (specsheet_dir / "specsheet.html", pdf_path): + if stale_output.exists() and not os.access(stale_output, os.W_OK): + stale_output.unlink() template_dir = Path(template_module.__file__).resolve().parent.parent / "template" html = _prefer_cjk_font_stack( _update_template_compat( diff --git a/evaluation_dashboard_app/requirements-docker.txt b/evaluation_dashboard_app/requirements-docker.txt old mode 100644 new mode 100755 diff --git a/evaluation_dashboard_app/requirements.txt b/evaluation_dashboard_app/requirements.txt old mode 100644 new mode 100755 From 2e6277a12d9161cdcb4027f305e8011759c5d199 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Mon, 1 Jun 2026 14:27:59 +0900 Subject: [PATCH 89/94] feat: enhance release workflow with optional catalog support and metadata validation - Added functionality to handle optional Planning Test catalog integration within the release workflow. - Introduced a new method to resolve active integration IDs for catalogs, improving integration management. - Enhanced the user interface to allow dynamic input of release metadata in YAML format, with validation and error handling. - Updated progress tracking for job scheduling to accommodate the new optional catalog feature. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/lib/deploy_debug.py | 103 ++++++ .../lib/specsheet_report.py | 126 ++++++- evaluation_dashboard_app/pages/6_Workflow.py | 228 ++++++++---- .../pages/99_Deployment_Debug.py | 158 ++++++++- evaluation_dashboard_app/worker/tasks.py | 332 ++++++++++++++---- 5 files changed, 815 insertions(+), 132 deletions(-) diff --git a/evaluation_dashboard_app/lib/deploy_debug.py b/evaluation_dashboard_app/lib/deploy_debug.py index 1336a61..d45f024 100644 --- a/evaluation_dashboard_app/lib/deploy_debug.py +++ b/evaluation_dashboard_app/lib/deploy_debug.py @@ -157,6 +157,109 @@ def task_counts_by_status() -> Tuple[bool, str, Optional[Dict[str, int]]]: return False, str(e), None +def database_table_overview() -> Tuple[bool, str, Optional[List[Dict[str, Any]]]]: + """Return public table names with approximate row counts for DB debugging.""" + if not get_database_url(): + return False, "DATABASE_URL is not set", None + with get_connection() as conn: + if conn is None: + return False, "No database connection", None + try: + with conn.cursor() as cur: + cur.execute( + """ + SELECT + t.table_name, + COALESCE(c.reltuples::bigint, 0) AS estimated_rows, + CASE WHEN c.oid IS NULL THEN 0 ELSE pg_total_relation_size(c.oid) END AS total_bytes + FROM information_schema.tables t + LEFT JOIN pg_namespace n ON n.nspname = t.table_schema + LEFT JOIN pg_class c ON c.relname = t.table_name AND c.relnamespace = n.oid + WHERE t.table_schema = 'public' + AND t.table_type = 'BASE TABLE' + ORDER BY t.table_name + """ + ) + rows = [ + { + "table_name": str(r[0]), + "estimated_rows": int(r[1] or 0), + "total_bytes": int(r[2] or 0), + } + for r in cur.fetchall() + ] + return True, "OK", rows + except Exception as e: + return False, str(e), None + + +def database_recent_task_rows( + *, + limit: int = 50, + offset: int = 0, + status: Optional[str] = None, + task_type: Optional[str] = None, + search: Optional[str] = None, 
+) -> Tuple[bool, str, List[Dict[str, Any]], int]: + """Read recent rows from the task table for the deployment debug DB tab.""" + if not get_database_url(): + return False, "DATABASE_URL is not set", [], 0 + with get_connection() as conn: + if conn is None: + return False, "No database connection", [], 0 + try: + from psycopg2.extras import RealDictCursor + except ImportError: + return False, "psycopg2 not installed", [], 0 + + where_parts: List[str] = [] + params: List[Any] = [] + if status: + where_parts.append("status = %s") + params.append(status) + if task_type: + where_parts.append("type = %s") + params.append(task_type) + if search: + needle = f"%{search.strip()}%" + where_parts.append( + """ + ( + id::text ILIKE %s OR type ILIKE %s OR status ILIKE %s OR + COALESCE(session_id, '') ILIKE %s OR COALESCE(rq_job_id, '') ILIKE %s OR + COALESCE(result_path, '') ILIKE %s OR COALESCE(error_message, '') ILIKE %s OR + COALESCE(parameters::text, '') ILIKE %s OR COALESCE(result_summary, '') ILIKE %s + ) + """ + ) + params.extend([needle] * 9) + + where_sql = (" WHERE " + " AND ".join(where_parts)) if where_parts else "" + capped_limit = max(1, min(int(limit), 500)) + safe_offset = max(0, int(offset)) + try: + with conn.cursor(cursor_factory=RealDictCursor) as cur: + cur.execute(f"SELECT COUNT(*) FROM tasks{where_sql}", params) + total_row = cur.fetchone() + total = int(total_row["count"] if total_row else 0) + cur.execute( + f""" + SELECT + id, type, status, session_id, rq_job_id, + created_at, updated_at, progress_pct, progress_message, + result_path, error_message, parameters, result_summary, log_output + FROM tasks{where_sql} + ORDER BY created_at DESC + LIMIT %s OFFSET %s + """, + [*params, capped_limit, safe_offset], + ) + rows = [dict(row) for row in cur.fetchall()] + return True, "OK", rows, total + except Exception as e: + return False, str(e), [], 0 + + def docker_unix_socket_for_check() -> Optional[str]: """Path to Unix socket for existence check, or None if 
DOCKER_HOST is non-Unix (e.g. tcp).""" host = os.environ.get("DOCKER_HOST", "").strip() diff --git a/evaluation_dashboard_app/lib/specsheet_report.py b/evaluation_dashboard_app/lib/specsheet_report.py index 41cad2b..bf34299 100644 --- a/evaluation_dashboard_app/lib/specsheet_report.py +++ b/evaluation_dashboard_app/lib/specsheet_report.py @@ -18,6 +18,10 @@ DEFAULT_SPECSHEET_TOPIC = "perception.object_recognition.tracking.objects" DEFAULT_TREND_TOPIC = "perception.object_recognition.objects" +DETECTION_TREND_TOPIC_BY_MODEL = { + "bevfusion": "perception.object_recognition.detection.bevfusion.objects", + "centerpoint": "perception.object_recognition.detection.centerpoint.objects", +} DEFAULT_SPECSHEET_PROJECT_ID = "x2_dev" DEFAULT_SPECSHEET_LABELS = ["car", "truck", "bus", "bicycle", "pedestrian", "motorcycle"] DEFAULT_SPECSHEET_METRICS = [ @@ -81,6 +85,81 @@ def get_specsheet_artifact_paths(run_dir: str | Path) -> dict[str, Path]: } +def _topic_values_from_frame(frame: pd.DataFrame) -> list[str]: + for column in ("topic_name", "topic"): + if column not in frame.columns: + continue + values = [ + str(value).strip() + for value in frame[column].dropna().unique().tolist() + if str(value).strip() + ] + if values: + return sorted(values) + return [] + + +def detect_specsheet_topic_names(run_dir: str | Path, *, csv_sample_rows: int = 50000) -> list[str]: + """Detect topic names already present in specsheet CSV/parquet artifacts.""" + paths = get_specsheet_artifact_paths(run_dir) + detected: set[str] = set() + + for parquet_path in (paths["current_parquet"], paths["future_parquet"]): + if not parquet_path.exists(): + continue + try: + import pyarrow.parquet as pq + + columns = set(pq.ParquetFile(parquet_path).schema_arrow.names) + except Exception: + try: + columns = set(pd.read_parquet(parquet_path, columns=[]).columns) + except Exception: + columns = set() + topic_columns = [column for column in ("topic_name", "topic") if column in columns] + for column in 
topic_columns: + try: + frame = pd.read_parquet(parquet_path, columns=[column]) + except Exception: + continue + detected.update(_topic_values_from_frame(frame)) + + for csv_path in (paths["current_csv"], paths["future_csv"]): + if not csv_path.exists(): + continue + try: + header = pd.read_csv(csv_path, nrows=0) + except Exception: + continue + topic_columns = [column for column in ("topic_name", "topic") if column in header.columns] + for column in topic_columns: + try: + frame = pd.read_csv(csv_path, usecols=[column], nrows=csv_sample_rows) + except Exception: + continue + detected.update(_topic_values_from_frame(frame)) + + return sorted(detected) + + +def resolve_specsheet_topic_name( + run_dir: str | Path, + requested_topic: str | None, + *, + fallback_topic: str = DEFAULT_SPECSHEET_TOPIC, +) -> tuple[str, list[str]]: + """Resolve the topic that should be used for specsheet generation.""" + requested = str(requested_topic or "").strip() + detected = detect_specsheet_topic_names(run_dir) + if requested and requested in detected: + return requested, detected + if fallback_topic in detected: + return fallback_topic, detected + if len(detected) == 1: + return detected[0], detected + return requested or fallback_topic, detected + + def _looks_like_specsheet_release_container(path: Path) -> bool: return ( (path / TREND_METADATA_FILENAME).exists() @@ -290,15 +369,26 @@ def _trend_version_abbr(metadata: dict[str, Any]) -> str: if explicit: return explicit version = str(metadata.get("pilot_auto_version") or "").strip() - return _PILOT_AUTO_PREFIX_PATTERN.sub("", version).strip() or version + if not version: + return "" + try: + from perception_catalog_analyzer.trend import _abbreviate_version + + abbreviated = str(_abbreviate_version(version) or "").strip() + if abbreviated: + return abbreviated + except Exception: + pass + shortened = _PILOT_AUTO_PREFIX_PATTERN.sub("", version).strip() or version + return shortened[:16] def _infer_trend_topic(metadata: dict[str, 
Any], metadata_path: str | Path) -> str: explicit = str(metadata.get("topic_name") or "").strip() - if explicit: + if explicit and explicit != DEFAULT_SPECSHEET_TOPIC: return explicit for part in reversed(Path(metadata_path).parts): - if part.startswith("perception."): + if part.startswith("perception.") and part != DEFAULT_SPECSHEET_TOPIC: return part return DEFAULT_TREND_TOPIC @@ -360,6 +450,13 @@ def classify_trend_summary(summary: dict[str, Any]) -> str: return "unknown" +def _unwrap_devops_summary(summary: dict[str, Any]) -> dict[str, Any]: + devops = summary.get("DevOps") if isinstance(summary, dict) else None + if isinstance(devops, dict): + return devops + return summary + + def discover_trend_release_groups(root_dir: str | Path | None = None) -> list[TrendReleaseGroup]: metadata_files = discover_trend_metadata_files(root_dir) grouped: dict[str, TrendReleaseGroup] = {} @@ -662,6 +759,7 @@ def _avg(metric_name: str) -> float: def extract_devops_case_rows(summary: dict[str, Any]) -> list[dict[str, Any]]: """Flatten nested devops/pass-rate summary into case rows.""" + summary = _unwrap_devops_summary(summary) rows: list[dict[str, Any]] = [] for major_category, mid_categories in summary.items(): if not isinstance(mid_categories, dict): @@ -701,6 +799,7 @@ def extract_devops_case_rows(summary: dict[str, Any]) -> list[dict[str, Any]]: def _normalize_devops_summary_structure(summary: dict[str, Any]) -> dict[str, dict[str, dict[str, dict[str, int]]]]: + summary = _unwrap_devops_summary(summary) normalized: dict[str, dict[str, dict[str, dict[str, int]]]] = {} for major_category, mid_categories in summary.items(): if not isinstance(mid_categories, dict): @@ -881,6 +980,17 @@ def _add_devops_detail_trend_rates(devops_trend_data: Sequence[dict[str, Any]]) return sorted(cases) +def _devops_trend_rows_for_template(devops_trend_data: Sequence[dict[str, Any]]) -> list[dict[str, Any]]: + rows: list[dict[str, Any]] = [] + for row in devops_trend_data: + display_row = 
dict(row) + version_abbr = str(display_row.get("version_abbr") or "").strip() + if version_abbr: + display_row["version"] = version_abbr + rows.append(display_row) + return rows + + def _build_trend_context( metadata_list: Sequence[Path], output_dir: Path, @@ -952,7 +1062,7 @@ def _build_trend_context( "prediction_trend_plot_path": prediction_trend_plot_path, "devops_data": devops_data, "devops_plot_path": devops_plot_path, - "devops_trend_data": devops_trend_data, + "devops_trend_data": _devops_trend_rows_for_template(devops_trend_data), "devops_trend_plot_path": devops_trend_plot_path, "job_ids": [], } @@ -1315,6 +1425,14 @@ def generate_specsheet_pdf( return pdf_path, False ensure_specsheet_csvs(run_dir, progress_callback=progress_callback) + resolved_topic, detected_topics = resolve_specsheet_topic_name(run_dir, topic_name) + if resolved_topic != topic_name: + detected_text = ", ".join(detected_topics) if detected_topics else "none" + _notify( + progress_callback, + f"Using detected topic {resolved_topic} instead of requested topic {topic_name} (detected: {detected_text})", + ) + topic_name = resolved_topic try: from perception_catalog_analyzer.dataframe import SceneDataFrame diff --git a/evaluation_dashboard_app/pages/6_Workflow.py b/evaluation_dashboard_app/pages/6_Workflow.py index 99ebe08..7f6aa98 100644 --- a/evaluation_dashboard_app/pages/6_Workflow.py +++ b/evaluation_dashboard_app/pages/6_Workflow.py @@ -42,6 +42,11 @@ read_run_metadata, upsert_run_metadata, ) +from lib.specsheet_report import ( + DEFAULT_TREND_TOPIC, + DETECTION_TREND_TOPIC_BY_MODEL, + parse_trend_metadata_text, +) from lib.ui.recent_evaluator_jobs import ( _fetch_evaluator_job_detail, _format_source_ref_html, @@ -68,6 +73,13 @@ _RELEASE_PERFORMANCE_INTEGRATION_ID = "96ad8fba-0228-4c2b-9166-07d4de1a0760" _RELEASE_DEVOPS_CATALOG_ID = "ab0f8498-cc1b-4726-836f-e18e8bcb3200" _RELEASE_DEVOPS_INTEGRATION_ID = "295cff78-9bc9-4d60-b7aa-f95be6ff96a4" +_RELEASE_OPTIONAL_CATALOG_ID = 
"09039022-ec91-41bf-9e93-fdefccdfc9bc" +_RELEASE_TREND_TOPIC_OPTIONS = { + "Prediction / object recognition": DEFAULT_TREND_TOPIC, + "ML model / CenterPoint": DETECTION_TREND_TOPIC_BY_MODEL["centerpoint"], + "ML model / BEVFusion": DETECTION_TREND_TOPIC_BY_MODEL["bevfusion"], + "Custom": "", +} _TASK_HISTORY_RANGE_OPTIONS = { "7 days": 7, "30 days": 30, @@ -309,6 +321,42 @@ def _make_default_release_pilot_auto_version(target_name: str) -> str: return f"Pilot.Auto {target}" if target else "Pilot.Auto release" +def _make_default_release_metadata_text(target_name: str) -> str: + release_group = _safe_output_part(target_name, "release") + pilot_auto_version = _make_default_release_pilot_auto_version(target_name) + description = f"{target_name} release data update" if target_name else "Release data update" + date = datetime.now(_JST).strftime("%Y.%m.%d") + return ( + "tags: [trend]\n" + f"release_group: {release_group}\n" + f'pilot_auto_version: "{pilot_auto_version}"\n' + f"version_abbr: {_safe_output_part(pilot_auto_version.replace('Pilot.Auto', '').strip(), 'release')[:16]}\n" + "data_count: 99,776+\n" + f"description: {description}\n" + f"date: {date}\n" + f"topic_name: {DEFAULT_TREND_TOPIC}\n" + ) + + +def _extract_release_metadata_topic(text: str) -> str: + try: + metadata = parse_trend_metadata_text(text) + return str(metadata.get("topic_name") or DEFAULT_TREND_TOPIC).strip() + except Exception: + match = re.search(r"(?m)^topic_name\s*:\s*['\"]?([^'\"\n#]+)", text or "") + return match.group(1).strip() if match else DEFAULT_TREND_TOPIC + + +def _replace_release_metadata_topic(text: str, topic: str) -> str: + topic = str(topic or "").strip() + if not topic: + return text + line = f"topic_name: {topic}" + if re.search(r"(?m)^topic_name\s*:", text or ""): + return re.sub(r"(?m)^topic_name\s*:.*$", line, text) + return (text.rstrip() + "\n" + line + "\n") if text else line + "\n" + + def _format_run_mtime(mtime: float) -> str: if not mtime: return "—" @@ -1971,67 
+2019,87 @@ def _render_start_workflow_form( trend_metadata: Dict[str, object] = {} if release_mode: - release_version_default = _make_default_release_pilot_auto_version(target_name) - release_description_default = f"{target_name} release data update" if target_name else "Release data update" - release_data_count_default = "99,776+" - if not st.session_state.get("workflow_release_pilot_auto_version"): - st.session_state["workflow_release_pilot_auto_version"] = release_version_default - if not st.session_state.get("workflow_release_description"): - st.session_state["workflow_release_description"] = release_description_default - if not st.session_state.get("workflow_release_data_count"): - st.session_state["workflow_release_data_count"] = release_data_count_default - - release_cols = st.columns([1.15, 1.1, 0.8]) - with release_cols[0]: - release_group = st.text_input( - "Release group", - value=st.session_state.get("workflow_release_group", _safe_output_part(target_name, "release")), - key="workflow_release_group", - help="Stable key used to group this release with older app-generated trend history.", - ).strip() - with release_cols[1]: - pilot_auto_version = st.text_input( - "Pilot.Auto version", - value=st.session_state.get("workflow_release_pilot_auto_version", release_version_default), - key="workflow_release_pilot_auto_version", - placeholder='Pilot.Auto v4.4.0 (bevfusion x2/2.5.1)', - ).strip() - with release_cols[2]: - release_date = st.text_input( - "Release date", - value=st.session_state.get("workflow_release_date", datetime.now(_JST).strftime("%Y.%m.%d")), - key="workflow_release_date", - placeholder="2026.5.22", - ).strip() - release_meta_cols = st.columns([0.8, 1.1, 1.1]) - with release_meta_cols[0]: - data_count = st.text_input( - "Data count", - value=st.session_state.get("workflow_release_data_count", release_data_count_default), - key="workflow_release_data_count", - placeholder="123,708+", - ).strip() - with release_meta_cols[1]: - 
release_description = st.text_input( - "Release description", - value=st.session_state.get("workflow_release_description", release_description_default), - key="workflow_release_description", - ).strip() - with release_meta_cols[2]: - release_topic_name = st.text_input( - "Trend topic", - value=st.session_state.get("workflow_release_topic_name", "perception.object_recognition.objects"), - key="workflow_release_topic_name", + metadata_default_key = "workflow_release_metadata_default_target" + metadata_text_key = "workflow_release_metadata_text" + if ( + st.session_state.get(metadata_default_key) != target_name + or metadata_text_key not in st.session_state + ): + st.session_state[metadata_text_key] = _make_default_release_metadata_text(target_name) + st.session_state[metadata_default_key] = target_name + + current_metadata_text = str(st.session_state.get(metadata_text_key) or "") + trend_topic_from_metadata = _extract_release_metadata_topic(current_metadata_text) + option_values = list(_RELEASE_TREND_TOPIC_OPTIONS.values()) + topic_labels = list(_RELEASE_TREND_TOPIC_OPTIONS.keys()) + if trend_topic_from_metadata in option_values: + topic_index = option_values.index(trend_topic_from_metadata) + else: + topic_index = topic_labels.index("Custom") + st.session_state.setdefault("workflow_release_custom_trend_topic", trend_topic_from_metadata) + + topic_label_key = "workflow_release_trend_topic_label" + topic_yaml_key = "workflow_release_trend_topic_yaml_value" + if st.session_state.get(topic_yaml_key) != trend_topic_from_metadata: + st.session_state[topic_label_key] = topic_labels[topic_index] + st.session_state[topic_yaml_key] = trend_topic_from_metadata + if topic_labels[topic_index] == "Custom": + st.session_state["workflow_release_custom_trend_topic"] = trend_topic_from_metadata + + trend_topic_label = st.selectbox( + "Trend topic", + options=topic_labels, + key=topic_label_key, + help="Used only for trend graphs. 
The specsheet data topic is detected from parquet/csv separately.", + ) + if trend_topic_label == "Custom": + trend_topic = st.text_input( + "Custom trend topic", + value=st.session_state.get("workflow_release_custom_trend_topic", trend_topic_from_metadata), + key="workflow_release_custom_trend_topic", + placeholder="perception.object_recognition.objects", ).strip() - trend_metadata = { - "tags": ["trend"], - "release_group": release_group, - "pilot_auto_version": pilot_auto_version, - "data_count": data_count, - "description": release_description, - "date": release_date, - "topic_name": release_topic_name, - } + else: + trend_topic = _RELEASE_TREND_TOPIC_OPTIONS[trend_topic_label] + if trend_topic and trend_topic != trend_topic_from_metadata: + st.session_state[metadata_text_key] = _replace_release_metadata_topic( + current_metadata_text, + trend_topic, + ) + st.session_state[topic_yaml_key] = trend_topic + + metadata_text = st.text_area( + "Release metadata YAML", + key=metadata_text_key, + height=150, + help=( + "Required: tags: [trend], release_group, pilot_auto_version, data_count, description, date. " + "date must look like 2026.5.22." + ), + ) + metadata_error = "" + try: + trend_metadata = parse_trend_metadata_text(metadata_text) + if not str(trend_metadata.get("release_group") or "").strip(): + raise ValueError("Release metadata requires non-empty `release_group`.") + except Exception as exc: + metadata_error = str(exc) + trend_metadata = {} + st.error(f"Release metadata error: {metadata_error}") + + trend_topic_from_metadata = str(trend_metadata.get("topic_name") or "").strip() + if release_mode and trend_metadata and not trend_topic_from_metadata: + metadata_error = metadata_error or "Trend topic is required." 
+ st.error("Trend topic is required.") + elif trend_metadata: + st.success("Release metadata looks valid.") + + optional_catalog_enabled = st.checkbox( + "Also run Planning Test catalog", + value=bool(st.session_state.get("workflow_release_optional_catalog_enabled", False)), + key="workflow_release_optional_catalog_enabled", + help="Schedules the Planning Test catalog in addition to Performance and DevOps.", + ) existing_job_cols = st.columns(2) with existing_job_cols[0]: performance_job_id = st.text_input( @@ -2049,14 +2117,28 @@ def _render_start_workflow_form( placeholder="Leave empty to schedule a new DevOps job", help="Use this when the release DevOps evaluator job is already scheduled or finished.", ).strip() + if optional_catalog_enabled: + optional_job_id = st.text_input( + "Existing Planning Test job ID", + value=st.session_state.get("workflow_release_optional_job_id", ""), + key="workflow_release_optional_job_id", + placeholder="Leave empty to schedule the Planning Test catalog", + help="Use this when the Planning Test evaluator job is already scheduled or finished.", + ).strip() + else: + optional_job_id = "" + output_dirs = "`performance/`, `devops/`, and `planning_test/`" if optional_catalog_enabled else "`performance/` and `devops/`" st.caption( - "Normal detailed-analysis outputs are generated automatically under `performance/` and `devops/`; existing job IDs are waited on if still running and downloaded if already finished." + f"Normal detailed-analysis outputs are generated automatically under {output_dirs}; existing job IDs are waited on if still running and downloaded if already finished." 
) else: performance_job_id = "" devops_job_id = "" + optional_catalog_enabled = False + optional_job_id = "" + metadata_error = "" - confirm_cols = st.columns([1.0, 1.0]) + confirm_cols = st.columns([1.0, 1.0, 1.0] if release_mode and optional_catalog_enabled else [1.0, 1.0]) with confirm_cols[0]: if release_mode: st.caption(f"Performance catalog: `{_RELEASE_PERFORMANCE_CATALOG_ID}`") @@ -2067,6 +2149,9 @@ def _render_start_workflow_form( st.caption(f"DevOps catalog: `{_RELEASE_DEVOPS_CATALOG_ID}`") elif integration_id: st.caption(f"Integration ID: `{integration_id}`") + if release_mode and optional_catalog_enabled: + with confirm_cols[2]: + st.caption(f"Planning Test catalog: `{_RELEASE_OPTIONAL_CATALOG_ID}`") if st.session_state.get("workflow_catalog_resolution_error"): st.warning(f"Could not resolve integration automatically: {st.session_state['workflow_catalog_resolution_error']}") @@ -2089,7 +2174,7 @@ def _render_start_workflow_form( key="workflow_download_type", disabled=release_mode, help=( - "Release mode always downloads archives so Summary.csv, Score.csv, and parquet can be generated." + "Release mode uses archives, but reuses existing downloaded artifacts when the output folders already contain them." 
if release_mode else None ), @@ -2127,7 +2212,7 @@ def _render_start_workflow_form( value=False if release_mode else True, key="workflow_run_eval", disabled=release_mode, - help="Release mode runs evaluation automatically for both release jobs.", + help="Release PDF generation uses parquet; eval/CSV detail checks can be run separately when needed.", ) with option_cols[1]: generate_parquet = st.checkbox( @@ -2135,7 +2220,7 @@ def _render_start_workflow_form( value=False if release_mode else CATALOG_IO_AVAILABLE, disabled=release_mode or not CATALOG_IO_AVAILABLE, key="workflow_generate_parquet", - help="Release mode generates detailed-analysis CSV/parquet automatically under performance/ and devops/.", + help="Release mode generates parquet when missing; existing parquet is enough for PDF generation.", ) with option_cols[2]: skip_large_file = st.checkbox( @@ -2183,6 +2268,8 @@ def _render_start_workflow_form( errors.append("Data count") if not trend_metadata.get("date"): errors.append("Release date") + if metadata_error: + errors.append(metadata_error) resolved_output = None path_error = "" @@ -2226,6 +2313,9 @@ def _render_start_workflow_form( "trend_metadata": trend_metadata if release_mode else {}, "performance_job_id": performance_job_id if release_mode else "", "devops_job_id": devops_job_id if release_mode else "", + "optional_catalog_enabled": bool(optional_catalog_enabled) if release_mode else False, + "optional_catalog_id": _RELEASE_OPTIONAL_CATALOG_ID if release_mode and optional_catalog_enabled else "", + "optional_job_id": optional_job_id if release_mode and optional_catalog_enabled else "", }, } @@ -2260,6 +2350,8 @@ def _reset_start_workflow_state() -> None: st.session_state["workflow_last_catalog_selection"] = "" st.session_state["workflow_release_performance_job_id"] = "" st.session_state["workflow_release_devops_job_id"] = "" + st.session_state["workflow_release_trend_topic_label"] = "Prediction / object recognition" + 
st.session_state["workflow_release_custom_trend_topic"] = "" st.session_state["workflow_output_path"] = _make_default_output_path(fresh_target) def _render_start_workflow_controls(*, key_suffix: str = "dialog") -> None: @@ -2331,14 +2423,18 @@ def _render_start_workflow_controls(*, key_suffix: str = "dialog") -> None: "max_wait_seconds": dialog_payload["max_wait_hours"] * 3600, "trend_metadata": trend_metadata, "version": trend_metadata.get("pilot_auto_version", ""), - "topic": trend_metadata.get("topic_name", "perception.object_recognition.objects"), + "topic": trend_metadata.get("topic_name", ""), "performance_catalog_id": _RELEASE_PERFORMANCE_CATALOG_ID, "performance_integration_id": _RELEASE_PERFORMANCE_INTEGRATION_ID, "performance_job_id": dialog_payload.get("performance_job_id", ""), "devops_catalog_id": _RELEASE_DEVOPS_CATALOG_ID, "devops_integration_id": _RELEASE_DEVOPS_INTEGRATION_ID, "devops_job_id": dialog_payload.get("devops_job_id", ""), + "optional_catalog_enabled": bool(dialog_payload.get("optional_catalog_enabled", False)), + "optional_catalog_id": dialog_payload.get("optional_catalog_id", ""), + "optional_job_id": dialog_payload.get("optional_job_id", ""), "analysis_phase": "perception.object_recognition.tracking.objects", + "run_eval": bool(dialog_payload.get("run_eval", False)), "overwrite": True, }, ) diff --git a/evaluation_dashboard_app/pages/99_Deployment_Debug.py b/evaluation_dashboard_app/pages/99_Deployment_Debug.py index 987a210..a46d093 100644 --- a/evaluation_dashboard_app/pages/99_Deployment_Debug.py +++ b/evaluation_dashboard_app/pages/99_Deployment_Debug.py @@ -4,18 +4,23 @@ Must live as a top-level pages/*.py file so st.page_link can resolve it. Outside Docker, the default sidebar entry is hidden via CSS in lib/ui/styles_global.py; Overview shows a page_link only in Docker. 
""" +import json import os from datetime import datetime, timedelta +from typing import Any import pandas as pd import streamlit as st +from lib.db import TASK_STATUSES, TASK_TYPES from lib.deploy_debug import ( EXEC_TIMEOUT_SEC, MAX_LOG_TAIL_LINES, compose_project_filter, container_exec_command, container_logs_tail, + database_recent_task_rows, + database_table_overview, docker_client_or_none, is_docker_debug_enabled, is_exec_enabled, @@ -59,8 +64,8 @@ mode="Single Run", ) -tab_env, tab_dep, tab_tasks, tab_docker = st.tabs( - ["Environment", "Dependencies", "Tasks", "Docker"] +tab_env, tab_dep, tab_tasks, tab_db, tab_docker = st.tabs( + ["Environment", "Dependencies", "Tasks", "Database", "Docker"] ) with tab_env: @@ -108,6 +113,155 @@ st.error(msg_t) +def _debug_json(value: Any) -> str: + try: + return json.dumps(value, ensure_ascii=False, default=str, indent=2) + except (TypeError, ValueError): + return str(value) + + +def _task_rows_dataframe(rows: list) -> pd.DataFrame: + display_rows = [] + for row in rows: + params = row.get("parameters") or {} + if not isinstance(params, dict): + params = {} + display_rows.append( + { + "created_at": row.get("created_at"), + "updated_at": row.get("updated_at"), + "status": row.get("status"), + "type": row.get("type"), + "session_id": row.get("session_id"), + "id": str(row.get("id") or ""), + "rq_job_id": row.get("rq_job_id"), + "job_id": params.get("job_id") + or params.get("performance_job_id") + or params.get("devops_job_id") + or params.get("source_job_id") + or "", + "output_path": params.get("output_path") or params.get("output_dir") or "", + "progress_pct": row.get("progress_pct"), + "progress_message": row.get("progress_message"), + "result_path": row.get("result_path"), + "error_message": row.get("error_message"), + } + ) + return pd.DataFrame(display_rows) + + +def _format_progress_metric(value: Any) -> str: + try: + return f"{float(value or 0):g}%" + except (TypeError, ValueError): + return "0%" + + +with tab_db: 
+ section_header( + "Database inspector", + "Read-only view into Postgres tables and recent evaluator/task job history.", + ) + + ok_tables, msg_tables, table_rows = database_table_overview() + if ok_tables and table_rows is not None: + overview_df = pd.DataFrame(table_rows) + if not overview_df.empty: + overview_df["total_mb"] = (overview_df["total_bytes"] / (1024 * 1024)).round(2) + st.dataframe( + overview_df[["table_name", "estimated_rows", "total_mb"]], + width="stretch", + hide_index=True, + ) + else: + st.info("No public tables found.") + elif not ok_tables: + st.error(msg_tables) + + section_header("Recent job history", "Raw `tasks` rows, newest first, across all sessions.") + filters = st.columns([1.2, 1.6, 1.2, 1.2]) + with filters[0]: + status_filter = st.selectbox( + "Status", + ["All", *TASK_STATUSES], + key="deploy_db_status", + ) + with filters[1]: + type_filter = st.selectbox( + "Task type", + ["All", *TASK_TYPES], + key="deploy_db_type", + ) + with filters[2]: + row_limit = st.number_input( + "Rows", + min_value=10, + max_value=500, + value=50, + step=10, + key="deploy_db_limit", + ) + with filters[3]: + page = st.number_input( + "Page", + min_value=1, + max_value=1000, + value=1, + step=1, + key="deploy_db_page", + ) + search = st.text_input( + "Search", + key="deploy_db_search", + placeholder="Task id, job id, session, path, error text, parameters", + ) + + ok_rows, msg_rows, rows, total_rows = database_recent_task_rows( + limit=int(row_limit), + offset=(int(page) - 1) * int(row_limit), + status=None if status_filter == "All" else status_filter, + task_type=None if type_filter == "All" else type_filter, + search=search.strip() or None, + ) + if not ok_rows: + st.error(msg_rows) + elif not rows: + st.info("No task rows matched the current filters.") + else: + st.caption(f"Showing **{len(rows)}** of **{total_rows}** matching task rows.") + task_df = _task_rows_dataframe(rows) + st.dataframe(task_df, width="stretch", hide_index=True) + + id_options 
= [str(row.get("id") or "") for row in rows] + selected_id = st.selectbox("Inspect row", id_options, key="deploy_db_task_inspect") + selected = next((row for row in rows if str(row.get("id") or "") == selected_id), None) + if selected: + meta_cols = st.columns(4) + meta_cols[0].metric("Status", str(selected.get("status") or "—")) + meta_cols[1].metric("Type", str(selected.get("type") or "—")) + meta_cols[2].metric("Progress", _format_progress_metric(selected.get("progress_pct"))) + meta_cols[3].metric("Session", str(selected.get("session_id") or "—")[:32]) + + detail_tabs = st.tabs(["Parameters", "Result summary", "Log", "Raw row"]) + with detail_tabs[0]: + st.code(_debug_json(selected.get("parameters") or {}), language="json") + with detail_tabs[1]: + raw_summary = selected.get("result_summary") + if raw_summary: + try: + parsed = json.loads(raw_summary) if isinstance(raw_summary, str) else raw_summary + st.code(_debug_json(parsed), language="json") + except (TypeError, ValueError): + st.code(str(raw_summary), language=None) + else: + st.info("No result summary stored for this row.") + with detail_tabs[2]: + log_text = (selected.get("log_output") or "").strip() + st.code(log_text or "(empty)", language=None) + with detail_tabs[3]: + st.code(_debug_json(selected), language="json") + + def _render_docker_disabled(reason: str) -> None: st.warning(reason) st.markdown( diff --git a/evaluation_dashboard_app/worker/tasks.py b/evaluation_dashboard_app/worker/tasks.py index aea9897..2fdeba4 100644 --- a/evaluation_dashboard_app/worker/tasks.py +++ b/evaluation_dashboard_app/worker/tasks.py @@ -38,6 +38,7 @@ _RELEASE_PERFORMANCE_INTEGRATION_ID = "96ad8fba-0228-4c2b-9166-07d4de1a0760" _RELEASE_DEVOPS_CATALOG_ID = "ab0f8498-cc1b-4726-836f-e18e8bcb3200" _RELEASE_DEVOPS_INTEGRATION_ID = "295cff78-9bc9-4d60-b7aa-f95be6ff96a4" +_RELEASE_OPTIONAL_CATALOG_ID = "09039022-ec91-41bf-9e93-fdefccdfc9bc" # Optional imports for tasks that need them def _import_eval_summary(): @@ -190,6 
+191,38 @@ def _copy_task_parameters(parameters: Dict[str, Any]) -> Dict[str, Any]: return copied +def _resolve_active_integration_id(api: Any, project_id: str, catalog_id: str) -> str: + """Resolve latest active integration for a catalog when the UI only provided a catalog id.""" + url = f"{api.api_base_url}/projects/{project_id}/integrations" + response = api.request(url, {"catalog_id": catalog_id, "size": 100}, method="GET") + if response is None: + raise RuntimeError(f"No response returned while loading integrations for catalog {catalog_id}.") + if response.status_code != 200: + raise RuntimeError( + f"Failed to load integrations for catalog {catalog_id}: status={response.status_code}" + ) + payload = json.loads(response.content) + integrations = payload.get("integrations", []) or [] + active = [ + item for item in integrations + if isinstance(item, dict) + and str(item.get("catalog_id") or "").strip() == catalog_id + and not bool(item.get("deleted")) + ] + if not active: + raise RuntimeError(f"No active integration found for catalog {catalog_id}.") + + def _sort_key(item: Dict[str, object]) -> tuple: + return ( + str(item.get("updated_at") or ""), + int(item.get("version_id") or 0), + str(item.get("id") or ""), + ) + + active.sort(key=_sort_key, reverse=True) + return str(active[0].get("id") or "").strip() + + def _task_row_payload(task_id: str) -> Dict[str, Any]: row = get_task(task_id) or {} return { @@ -1182,7 +1215,7 @@ def _write_release_metadata_file(path: Path, metadata: Dict[str, Any]) -> Path: def _build_devops_trend_summary_from_suites(rows: list[dict[str, Any]]) -> Dict[str, Any]: - summary_payload: Dict[str, Any] = {"DevOps": {"Suite pass rate": {}}} + suite_results: dict[str, dict[str, int]] = {} for row in rows or []: suite_name = str(row.get("name") or row.get("suite_name") or row.get("simulation") or "suite").strip() total = int(row.get("all", 0) or row.get("total", 0) or 0) @@ -1193,16 +1226,63 @@ def 
_build_devops_trend_summary_from_suites(rows: list[dict[str, Any]]) -> Dict[ total = passed + failed + canceled if total <= 0: continue - summary_payload["DevOps"]["Suite pass rate"][suite_name] = { - "passed": passed, - "total": total, - } - return summary_payload + suite_results[suite_name] = {"passed": passed, "total": total} + if not suite_results: + return {"DevOps": {}} + + try: + from perception_catalog_analyzer.path import DEVOPS_MAPPING_PATH + + with Path(DEVOPS_MAPPING_PATH).open("r", encoding="utf-8") as fh: + category_mapping = yaml.safe_load(fh) or {} + except Exception: + category_mapping = {} + + if not isinstance(category_mapping, dict) or not category_mapping: + return {"DevOps": {"Suite pass rate": suite_results}} + + mapped: Dict[str, Any] = {} + matched_suites: set[str] = set() + for major_category, mid_categories in category_mapping.items(): + if not isinstance(mid_categories, dict): + continue + major_payload: Dict[str, Any] = {} + for mid_category, sub_categories in mid_categories.items(): + if not isinstance(sub_categories, dict): + continue + mid_payload: Dict[str, Any] = {} + for sub_category, suite_names in sub_categories.items(): + if not isinstance(suite_names, list): + continue + passed = 0 + total = 0 + for suite_name in suite_names: + result = suite_results.get(str(suite_name)) + if not result: + continue + matched_suites.add(str(suite_name)) + passed += int(result.get("passed", 0) or 0) + total += int(result.get("total", 0) or 0) + mid_payload[str(sub_category)] = {"passed": passed, "total": total} + if mid_payload: + major_payload[str(mid_category)] = mid_payload + if major_payload: + mapped[str(major_category)] = major_payload + + unmatched = { + suite_name: result + for suite_name, result in suite_results.items() + if suite_name not in matched_suites + } + if unmatched: + mapped.setdefault("その他", {})["未分類"] = unmatched + + return {"DevOps": mapped} def _write_devops_trend_summary(path: Path, rows: list[dict[str, Any]]) -> Path | 
None: summary_payload = _build_devops_trend_summary_from_suites(rows) - if not summary_payload["DevOps"]["Suite pass rate"]: + if not summary_payload.get("DevOps"): return None path.parent.mkdir(parents=True, exist_ok=True) with path.open("w", encoding="utf-8") as fh: @@ -1210,6 +1290,46 @@ def _write_devops_trend_summary(path: Path, rows: list[dict[str, Any]]) -> Path return path +def _suite_rows_from_existing_devops_summary(path: Path) -> list[dict[str, Any]]: + if not path.exists(): + return [] + try: + payload = json.loads(path.read_text(encoding="utf-8")) + except Exception: + return [] + devops = payload.get("DevOps") if isinstance(payload, dict) else {} + if not isinstance(devops, dict): + return [] + suite_pass_rate = devops.get("Suite pass rate") + if not isinstance(suite_pass_rate, dict): + return [] + rows: list[dict[str, Any]] = [] + for suite_name, result in suite_pass_rate.items(): + if not isinstance(result, dict): + continue + rows.append( + { + "suite_name": str(suite_name), + "success": int(result.get("passed", 0) or 0), + "total": int(result.get("total", 0) or 0), + } + ) + return rows + + +def _has_release_download_artifacts(path: Path) -> bool: + return any(path.rglob("scene_result.pkl")) or any(path.rglob("*.pkl.z")) + + +def _find_release_parquet(path: Path) -> Path | None: + current = path / "current.parquet" + if current.exists(): + return current + for parquet in sorted(path.glob("*.parquet"), key=lambda p: p.name.lower()): + return parquet + return None + + def _build_release_analysis_artifacts( *, task_id: str, @@ -1218,6 +1338,7 @@ def _build_release_analysis_artifacts( role: str, output_path: Path, phase: str, + run_eval: bool = False, progress_start: float = 48.0, progress_end: float = 78.0, ) -> Dict[str, Any]: @@ -1238,6 +1359,7 @@ def _build_release_analysis_artifacts( progress_span = max(0.0, progress_end - progress_start) download_end = progress_start + progress_span * 0.55 eval_end = progress_start + progress_span * 0.90 + 
existing_parquet = _find_release_parquet(output_path) def _on_progress(msg: str) -> None: append_task_log(task_id, f"{role}: {msg}") @@ -1256,35 +1378,60 @@ def _on_warning(msg: str) -> None: result["warnings"].append(msg) append_task_log(task_id, f"WARNING: {role}: {msg}") - update_task_progress(task_id, message=f"{role}: finding downloadable case logs", pct=progress_start) - failure_count, total_attempted, rows = download_core.run_download_results( - project_id=project_id, - job_id=job_id, - suite_id=None, - output_path=str(output_path), - download_type="archives", - phase=phase, - skip_large_file=False, - large_file_mb=50.0, - keep_zip_files=False, - suite_ids=None, - on_progress=_on_progress, - on_warning=_on_warning, - ) - success_count = total_attempted - failure_count + if existing_parquet or _has_release_download_artifacts(output_path): + append_task_log(task_id, f"{role}: using existing downloaded artifacts in {output_path}") + update_task_progress(task_id, message=f"{role}: using existing downloaded artifacts", pct=download_end) + failure_count = 0 + total_attempted = 0 + success_count = 0 + rows: list[dict[str, Any]] = [] + else: + if not job_id: + raise RuntimeError(f"{role}: no local artifacts found and no evaluator job id is available for download.") + update_task_progress(task_id, message=f"{role}: finding downloadable case logs", pct=progress_start) + failure_count, total_attempted, rows = download_core.run_download_results( + project_id=project_id, + job_id=job_id, + suite_id=None, + output_path=str(output_path), + download_type="archives", + phase=phase, + skip_large_file=False, + large_file_mb=50.0, + keep_zip_files=False, + suite_ids=None, + on_progress=_on_progress, + on_warning=_on_warning, + ) + success_count = total_attempted - failure_count + if success_count <= 0: + raise RuntimeError(f"{role}: download produced no successful case artifacts.") result["download"] = { "total": total_attempted, "success": success_count, "failed": 
failure_count, "rows": rows[:100], } - if success_count <= 0: - raise RuntimeError(f"{role}: download produced no successful case artifacts.") - if eval_summary: + if run_eval and eval_summary and not existing_parquet: target_dirs = eval_summary.find_eval_result_dirs(str(output_path), recursive=True) total = len(target_dirs) - if target_dirs: + summary_csv = output_path / "Summary.csv" + score_csv = output_path / "Score.csv" + if target_dirs and summary_csv.exists() and score_csv.exists(): + append_task_log(task_id, f"{role}: Summary.csv / Score.csv already exist; skipping eval") + update_task_progress(task_id, message=f"{role}: existing Summary.csv / Score.csv found", pct=eval_end) + statuses = [] + result["eval"] = { + "directories_processed": total, + "success": 0, + "failed": 0, + "skipped": total, + "summary_path": str(summary_csv), + "summary_rows": 0, + "score_rows": 0, + } + elif target_dirs: statuses = _run_eval_result_dirs( task_id=task_id, eval_summary=eval_summary, @@ -1298,7 +1445,7 @@ def _on_warning(msg: str) -> None: else: update_task_progress(task_id, message=f"{role}: no eval_result directories found", pct=eval_end) statuses = [] - if target_dirs: + if target_dirs and not result["eval"]: update_task_progress(task_id, message=f"{role}: generating Summary.csv / Score.csv", pct=eval_end) csv_info = eval_summary.generate_summary_and_score_csv(str(output_path)) result["eval"] = { @@ -1310,15 +1457,26 @@ def _on_warning(msg: str) -> None: "summary_rows": csv_info.get("summary_rows", 0), "score_rows": csv_info.get("score_rows", 0), } - else: + elif not result["eval"]: result["eval"] = { "directories_processed": 0, "success": 0, "failed": 0, "skipped": 0, } - - if pkl_archive_to_parquet: + elif not run_eval: + append_task_log(task_id, f"{role}: skipping eval; parquet is sufficient for release PDF generation") + result["eval"] = {"enabled": False, "reason": "release_pdf_uses_parquet"} + elif existing_parquet: + append_task_log(task_id, f"{role}: skipping 
eval because parquet already exists") + result["eval"] = {"enabled": False, "reason": "existing_parquet"} + + existing_parquet = _find_release_parquet(output_path) + if existing_parquet: + append_task_log(task_id, f"{role}: existing parquet found: {existing_parquet}") + result["parquet_path"] = str(existing_parquet) + update_task_progress(task_id, message=f"{role}: existing parquet found", pct=progress_end) + elif pkl_archive_to_parquet: try: update_task_progress(task_id, message=f"{role}: generating parquet", pct=eval_end) result["parquet_path"] = pkl_archive_to_parquet( @@ -1369,6 +1527,7 @@ def job_run_release_specsheet_workflow(task_id: str, parameters: Dict[str, Any]) DEFAULT_SPECSHEET_LABELS, DEFAULT_SPECSHEET_TOPIC, generate_specsheet_pdf, + resolve_specsheet_topic_name, ) project_id = str(parameters.get("project_id") or "").strip() @@ -1399,13 +1558,25 @@ def job_run_release_specsheet_workflow(task_id: str, parameters: Dict[str, Any]) release_root = Path(output_path) release_root.mkdir(parents=True, exist_ok=True) _write_release_metadata_file(release_root / "metadata.yaml", metadata) - release_specsheet_dir = release_root / "specsheet" performance_path = release_root / "performance" devops_path = release_root / "devops" + role_paths = { + "performance": performance_path, + "devops": devops_path, + "planning_test": release_root / "planning_test", + } os.environ["AUTH_PROFILE"] = environment os.environ["EVALUATOR_ENVIRONMENT"] = environment api = evaluator_api.EvaluationRunAPI() + optional_catalog_enabled = bool(parameters.get("optional_catalog_enabled", False)) + optional_catalog_id = str( + parameters.get("optional_catalog_id") or _RELEASE_OPTIONAL_CATALOG_ID + ).strip() + optional_integration_id = str(parameters.get("optional_integration_id") or "").strip() + if optional_catalog_enabled and optional_catalog_id and not optional_integration_id: + append_task_log(task_id, f"Resolving Planning Test catalog integration: {optional_catalog_id}") + 
optional_integration_id = _resolve_active_integration_id(api, project_id, optional_catalog_id) jobs = [ { "role": "performance", @@ -1422,6 +1593,16 @@ def job_run_release_specsheet_workflow(task_id: str, parameters: Dict[str, Any]) "job_id": str(parameters.get("devops_job_id") or "").strip(), }, ] + if optional_catalog_enabled: + jobs.append( + { + "role": "planning_test", + "label": "Planning Test", + "catalog_id": optional_catalog_id, + "integration_id": optional_integration_id, + "job_id": str(parameters.get("optional_job_id") or "").strip(), + } + ) summary: Dict[str, Any] = { "job": "run_release_specsheet_workflow", "release_root": str(release_root), @@ -1437,10 +1618,17 @@ def job_run_release_specsheet_workflow(task_id: str, parameters: Dict[str, Any]) for item in jobs: schedule_description = f"{description} | {item['label']}" item["description"] = schedule_description + role = str(item["role"]) + local_path = role_paths[role] + local_ready = _find_release_parquet(local_path) is not None or _has_release_download_artifacts(local_path) + item["local_artifacts_ready"] = local_ready job_id = str(item.get("job_id") or "").strip() if job_id: append_task_log(task_id, f"Using existing {item['label']}: {job_id}") status = "existing" + elif local_ready: + append_task_log(task_id, f"Using existing local artifacts for {item['label']}: {local_path}") + status = "local_artifacts" else: append_task_log(task_id, f"Scheduling {item['label']}: catalog={item['catalog_id']}") result = api.schedule_job( @@ -1463,7 +1651,7 @@ def job_run_release_specsheet_workflow(task_id: str, parameters: Dict[str, Any]) raise RuntimeError(f"No job_id returned for {item['label']}.") item["job_id"] = job_id status = "scheduled" - report_url = evaluator_api.get_job_report_url(project_id, job_id) + report_url = evaluator_api.get_job_report_url(project_id, job_id) if job_id else "" summary["evaluator_jobs"][item["role"]] = { "job_id": job_id, "report_url": report_url, @@ -1476,13 +1664,20 @@ def 
job_run_release_specsheet_workflow(task_id: str, parameters: Dict[str, Any]) append_task_log(task_id, f"Scheduled {item['label']}: {job_id}") update_task_result_summary(task_id, summary) + wait_span = 40.0 / max(1, len(jobs)) for idx, item in enumerate(jobs, start=1): job_id = str(item["job_id"]) label = str(item["label"]) - base_pct = 5 + (idx - 1) * 20 + base_pct = 5 + (idx - 1) * wait_span + if not job_id and item.get("local_artifacts_ready"): + append_task_log(task_id, f"Skipping evaluator wait for {label}; local artifacts already exist.") + summary["evaluator_jobs"][item["role"]]["status"] = "local_artifacts" + update_task_progress(task_id, message=f"{label}: using local artifacts", pct=base_pct + wait_span - 2.0) + update_task_result_summary(task_id, summary) + continue def _on_check(status: str, elapsed: float, *, role: str = str(item["role"]), pct_base: float = base_pct) -> None: - pct = min(pct_base + (elapsed / max_wait_seconds) * 18, pct_base + 18) + pct = min(pct_base + (elapsed / max_wait_seconds) * max(2.0, wait_span - 2.0), pct_base + wait_span - 2.0) summary["evaluator_jobs"][role]["status"] = status update_task_progress( task_id, @@ -1512,7 +1707,7 @@ def _on_check(status: str, elapsed: float, *, role: str = str(item["role"]), pct update_task_result_summary(task_id, summary) update_task_progress(task_id, message="Building normal CSV/parquet analysis artifacts", pct=48) - role_paths = {"performance": performance_path, "devops": devops_path} + artifact_span = 30.0 / max(1, len(jobs)) for artifact_idx, item in enumerate(jobs): role = str(item["role"]) analysis_path = role_paths[role] @@ -1523,8 +1718,9 @@ def _on_check(status: str, elapsed: float, *, role: str = str(item["role"]), pct role=role, output_path=analysis_path, phase=analysis_phase, - progress_start=48 + (20 * artifact_idx), - progress_end=64 + (14 * artifact_idx), + run_eval=bool(parameters.get("run_eval", False)), + progress_start=48 + (artifact_span * artifact_idx), + progress_end=48 + 
(artifact_span * (artifact_idx + 1)), ) summary["analysis_artifacts"][role] = artifact_summary update_task_result_summary(task_id, summary) @@ -1537,9 +1733,9 @@ def _on_check(status: str, elapsed: float, *, role: str = str(item["role"]), pct "job_id": item["job_id"], "download_type": "archives", "phase": analysis_phase, - "run_eval": True, + "run_eval": bool(parameters.get("run_eval", False)), "generate_parquet": True, - "eval_recursive": True, + "eval_recursive": bool(parameters.get("run_eval", False)), } _mark_run_status( task_id, @@ -1572,8 +1768,8 @@ def _on_check(status: str, elapsed: float, *, role: str = str(item["role"]), pct }, "evaluation": { **artifact_summary.get("eval", {}), - "enabled": True, - "recursive": True, + "enabled": bool(parameters.get("run_eval", False)), + "recursive": bool(parameters.get("run_eval", False)), }, "parquet": { "enabled": True, @@ -1582,13 +1778,38 @@ def _on_check(status: str, elapsed: float, *, role: str = str(item["role"]), pct }, ) + detected_topic, detected_topics = resolve_specsheet_topic_name( + performance_path, + topic, + fallback_topic=DEFAULT_SPECSHEET_TOPIC, + ) + if detected_topic != topic: + append_task_log( + task_id, + ( + f"Using detected specsheet topic {detected_topic} instead of requested topic {topic} " + f"(detected: {', '.join(detected_topics) if detected_topics else 'none'})" + ), + ) + topic = detected_topic + summary["topic"] = topic + summary["detected_topics"] = detected_topics + update_task_result_summary(task_id, summary) + update_task_progress(task_id, message="Writing release trend summaries", pct=78) write_trend_metadata(devops_path, metadata) devops_job = next(item for item in jobs if item["role"] == "devops") - devops_summary_path = _write_devops_trend_summary( - devops_path / "resources" / "summary.json", - list(devops_job.get("suite_rows") or []), - ) + devops_summary_target = devops_path / "resources" / "summary.json" + devops_suite_rows = list(devops_job.get("suite_rows") or []) + if 
not devops_suite_rows: + existing_suite_rows = _suite_rows_from_existing_devops_summary(devops_summary_target) + if existing_suite_rows: + append_task_log(task_id, "Rebuilding DevOps trend summary from existing suite pass-rate rows.") + devops_suite_rows = existing_suite_rows + devops_summary_path = _write_devops_trend_summary(devops_summary_target, devops_suite_rows) + if devops_summary_path is None and devops_summary_target.exists(): + devops_summary_path = devops_summary_target + append_task_log(task_id, f"Using existing DevOps trend summary: {devops_summary_path}") if devops_summary_path is None: append_task_log(task_id, "WARNING: DevOps trend summary had no suite pass-rate rows.") else: @@ -1606,15 +1827,7 @@ def _on_check(status: str, elapsed: float, *, role: str = str(item["role"]), pct force=bool(parameters.get("overwrite", True)), progress_callback=lambda msg: append_task_log(task_id, f"specsheet: {msg}"), ) - release_specsheet_dir.mkdir(parents=True, exist_ok=True) - release_pdf = release_specsheet_dir / "specsheet.pdf" - shutil.copy2(specsheet_pdf, release_pdf) - for asset in ("map_trend.png", "prediction_trend.png", "devops_trend.png"): - asset_path = specsheet_pdf.parent / asset - if asset_path.exists(): - shutil.copy2(asset_path, release_specsheet_dir / asset) - summary["specsheet_pdf"] = str(release_pdf) - summary["performance_specsheet_pdf"] = str(specsheet_pdf) + summary["specsheet_pdf"] = str(specsheet_pdf) summary["specsheet_generated"] = bool(generated) update_task_progress(task_id, message="Release specsheet ready", pct=100) @@ -1624,20 +1837,19 @@ def _on_check(status: str, elapsed: float, *, role: str = str(item["role"]), pct parameters, task_type="run_release_specsheet_workflow", status="completed", - result_path=str(release_pdf), + result_path=str(specsheet_pdf), extra={ "release_specsheet": { "root": str(release_root), - "specsheet_pdf": str(release_pdf), - "performance_specsheet_pdf": str(specsheet_pdf), + "specsheet_pdf": 
str(specsheet_pdf), "evaluator_jobs": summary["evaluator_jobs"], "analysis_artifacts": summary["analysis_artifacts"], "metadata": metadata, } }, ) - append_task_log(task_id, f"Release specsheet PDF ready: {release_pdf}") - update_task_status(task_id, "completed", result_path=str(release_pdf)) + append_task_log(task_id, f"Release specsheet PDF ready: {specsheet_pdf}") + update_task_status(task_id, "completed", result_path=str(specsheet_pdf)) except Exception as e: append_task_log(task_id, f"Failed: {e}") _mark_run_status( From df094ab8835aae4a5ff83cabb2ad44bb70f00639 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Tue, 2 Jun 2026 11:01:12 +0900 Subject: [PATCH 90/94] feat: enhance release specsheet library and update entrypoint script - Added a new library for managing release specsheet inventories, including functions for loading metadata and generating URLs for PDF artifacts. - Updated the entrypoint script to enable static serving of files in the Streamlit application. - Modified the Docker Compose configuration to mount the entrypoint script and static files, improving deployment flexibility. - Refactored path utility functions to support new release role directories, enhancing directory detection logic. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/Overview.py | 214 ++-- .../deploy/docker-compose.yml | 2 + evaluation_dashboard_app/docker-entrypoint.sh | 2 +- .../lib/overview_url_hydrate.py | 3 +- evaluation_dashboard_app/lib/page_chrome.py | 4 +- evaluation_dashboard_app/lib/path_utils.py | 76 +- .../lib/release_specsheet_library.py | 165 +++ .../lib/specsheet_report.py | 102 +- .../pages/13_Trend_Insights.py | 1081 +++++++++++++++-- .../import_catalog_analyzer_releases.py | 501 ++++++++ 10 files changed, 1971 insertions(+), 179 deletions(-) create mode 100644 evaluation_dashboard_app/lib/release_specsheet_library.py create mode 100644 evaluation_dashboard_app/scripts/import_catalog_analyzer_releases.py diff --git a/evaluation_dashboard_app/Overview.py b/evaluation_dashboard_app/Overview.py index 5e7ab84..0843bd4 100755 --- a/evaluation_dashboard_app/Overview.py +++ b/evaluation_dashboard_app/Overview.py @@ -6,7 +6,14 @@ import yaml from pathlib import Path from lib.run_loader import load_run -from lib.path_utils import get_data_root, get_data_root_display, get_run_display_name, list_run_directories, path_display +from lib.path_utils import ( + get_data_root, + get_data_root_display, + get_run_display_name, + list_run_directories, + path_display, + resolve_run_subdirectory, +) import plotly.express as px import plotly.graph_objects as go from lib.user_config import UserConfig @@ -17,6 +24,7 @@ DEFAULT_SPECSHEET_PROJECT_ID, DEFAULT_SPECSHEET_TOPIC, DEFAULT_TREND_METADATA_TEXT, + collect_candidate_specsheet_labels, generate_specsheet_pdf, get_release_specsheet_context, get_specsheet_artifact_paths, @@ -283,6 +291,18 @@ def show_grouped_metrics_plot_multi(df_list, run_labels, group_col, label_map=No run_dirs = list_run_directories() run_names = [get_run_display_name(p) for p in run_dirs] + +def _coerce_run_param_to_display_name(value: str | None) -> str: + raw = str(value or "").strip() + if not raw: + return "" + if raw 
in run_names: + return raw + resolved, err = resolve_run_subdirectory(raw) + if err or resolved is None: + return "" + return get_run_display_name(resolved) + if not run_dirs: st.warning(f"No runs found in '{get_data_root_display()}'.\n\nPlease add at least one sub-directory with evaluation results, e.g. `{get_data_root_display()}/my_eval_run/`.") st.stop() @@ -299,8 +319,9 @@ def show_grouped_metrics_plot_multi(df_list, run_labels, group_col, label_map=No saved_run_a = user_config.get("overview_run_a", run_names[0] if run_names else "") # URL override (only if valid) -if url_run_a in run_names: - saved_run_a = url_run_a +url_run_a_display = _coerce_run_param_to_display_name(url_run_a) +if url_run_a_display in run_names: + saved_run_a = url_run_a_display run_a_index = run_names.index(saved_run_a) if saved_run_a in run_names else 0 run_a_dir = st.sidebar.selectbox("Baseline (A)", run_dirs, index=run_a_index, format_func=get_run_display_name) @@ -318,7 +339,11 @@ def show_grouped_metrics_plot_multi(df_list, run_labels, group_col, label_map=No if not saved_compare and run_names: saved_compare = [run_names[1]] if len(run_names) > 1 else [run_names[0]] if url_compare_runs: - valid_url = [r for r in url_compare_runs if r in run_names] + valid_url = [ + display + for display in (_coerce_run_param_to_display_name(r) for r in url_compare_runs) + if display in run_names + ] if valid_url: saved_compare = valid_url st.session_state["overview_compare_run_names"] = list(saved_compare) @@ -420,12 +445,12 @@ def safe_load_run(path, label='Run'): st.session_state.pop(key, None) # ====== MAIN PAGE METRICS & CHARTS ====== -_ov_entries = [("Baseline · A", path_display(runA["path"]))] +_ov_entries = [("Baseline · A", get_run_display_name(runA["path"]))] if mode == "Compare Mode" and compare_run_dirs: all_runs = st.session_state["all_runs"] run_labels = st.session_state["run_labels"] for i in range(1, len(all_runs)): - _ov_entries.append((f"Candidate · {run_labels[i]}", 
path_display(all_runs[i]["path"]))) + _ov_entries.append((f"Candidate · {run_labels[i]}", get_run_display_name(all_runs[i]["path"]))) render_loaded_data_section(_ov_entries) if mode == "Compare Mode" and compare_run_dirs: @@ -698,7 +723,7 @@ def _update_pdf_status(message: str) -> None: specsheet_title = "Export Specsheet Report" section_header( specsheet_title, - "Generate or reuse the release-oriented spec-sheet PDF. Missing current/future CSV files are auto-created from parquet before building.", + "Generate the release-oriented spec-sheet PDF.", ) _specsheet_run_records = _report_runs @@ -724,8 +749,6 @@ def _update_pdf_status(message: str) -> None: _default_specsheet_labels = list(DEFAULT_SPECSHEET_LABELS) _default_specsheet_project_id = st.session_state.get("specsheet_project_id", DEFAULT_SPECSHEET_PROJECT_ID) _default_specsheet_topic = st.session_state.get("specsheet_topic_name", DEFAULT_SPECSHEET_TOPIC) -_filter_specsheet_labels = [str(v) for v in _report_filters["perception_labels"] if str(v).strip()] -_specsheet_label_options = list(dict.fromkeys(_default_specsheet_labels + _filter_specsheet_labels)) _single_specsheet_run_path = resolve_specsheet_generation_run_path(_specsheet_run_records[0]["path"]) _default_specsheet_version = get_run_display_name(_single_specsheet_run_path) @@ -857,21 +880,19 @@ def _specsheet_title_version_from_metadata(metadata: dict) -> str: key="specsheet_topic_name", ).strip() -specsheet_labels = st.multiselect( - "Spec-sheet labels", - options=_specsheet_label_options, - default=_default_specsheet_labels, - key="specsheet_labels", - help="These labels are passed to perception_catalog_analyzer.specsheet.get_blocks().", -) -if not specsheet_labels: - st.info("Pick at least one label to build the release spec-sheet.") +_detected_specsheet_labels = [] +for run_path in selected_specsheet_run_paths: + _detected_specsheet_labels.extend(collect_candidate_specsheet_labels(run_path)) +specsheet_labels = 
list(dict.fromkeys(_detected_specsheet_labels or _default_specsheet_labels)) +if specsheet_labels: + st.caption(f"Labels: all detected labels ({len(specsheet_labels)})") if not selected_specsheet_run_paths: st.info("Pick at least one run to build the release spec-sheet.") if _selected_trend_metadata_text and "specsheet_include_trend" not in st.session_state: st.session_state["specsheet_include_trend"] = True +_release_trend_status_text = "" if selected_specsheet_release_contexts: for release_context in selected_specsheet_release_contexts[:1]: release_dir = release_context.get("release_dir") @@ -886,28 +907,31 @@ def _specsheet_title_version_from_metadata(metadata: dict) -> str: bits.append("summary.json" if role_info.get("has_summary") else "no summary.json") bits.append("metadata.yaml" if role_info.get("has_metadata") else "no metadata.yaml") role_status.append(f"{role_name}: {', '.join(bits)}") - release_text = f"Release folder detected: `{path_display(release_dir)}`." if isinstance(release_dir, Path) else "Release folder detected." + release_text = f"Release folder: `{path_display(release_dir)}`." if isinstance(release_dir, Path) else "Release folder detected." if role_status: - release_text += " Trend generation will use " + "; ".join(role_status) + "." - st.info(release_text) - -if _selected_trend_metadata_path is not None and _selected_trend_metadata_text: - st.info(f"Existing trend metadata found at `{path_display(_selected_trend_metadata_path)}`. Review it below before generating.") - -specsheet_trend_enabled = st.toggle( - "Include trend data", - value=bool(st.session_state.get("specsheet_include_trend", bool(_selected_trend_metadata_text))), - key="specsheet_include_trend", - help="Release-report mode only. Saves `metadata.yaml` next to the generated `summary.json` and reuses all saved trend metadata files under the data root.", -) + release_text += " " + "; ".join(role_status) + "." 
+ _release_trend_status_text = release_text + +trend_toggle_col, trend_status_col = st.columns([1.1, 2.9]) +with trend_toggle_col: + specsheet_trend_enabled = st.toggle( + "Include trend data", + value=bool(st.session_state.get("specsheet_include_trend", bool(_selected_trend_metadata_text))), + key="specsheet_include_trend", + help="Save release metadata and include available trend history.", + ) +with trend_status_col: + if specsheet_trend_enabled and _selected_trend_metadata_path is not None and _selected_trend_metadata_text: + st.caption(f"Using saved metadata: `{path_display(_selected_trend_metadata_path)}`") + elif specsheet_trend_enabled: + st.caption("No saved metadata found. Fill in release metadata below.") + if specsheet_trend_enabled and _release_trend_status_text: + st.caption(_release_trend_status_text) trend_metadata_payload = None trend_metadata_changed = False trend_metadata_change_confirmed = False if specsheet_trend_enabled: - st.caption( - "Select the full/performance run for the PDF body. Other full/usecase/devops trend runs are discovered from matching metadata under the data root." - ) _trend_metadata_source_key = str(_selected_trend_metadata_path) if _selected_trend_metadata_path is not None else "__default__" if ( st.session_state.get("specsheet_trend_metadata_source") != _trend_metadata_source_key @@ -926,24 +950,81 @@ def _specsheet_title_version_from_metadata(metadata: dict) -> str: trend_metadata_text.strip() != _selected_trend_metadata_text.strip() ) if trend_metadata_changed: - st.warning("The existing metadata.yaml has been edited. Confirm the change before generating so the saved release metadata is updated intentionally.") + st.warning("Saved metadata was edited. 
Confirm before generating.") trend_metadata_change_confirmed = st.checkbox( - "Confirm metadata.yaml changes", + "Confirm saved metadata changes", key="specsheet_confirm_metadata_changes", ) + trend_metadata_status = st.empty() try: trend_metadata_payload = parse_trend_metadata_text(trend_metadata_text) - st.success("Trend metadata looks valid.") - st.code( - yaml.safe_dump(trend_metadata_payload, allow_unicode=True, sort_keys=False), - language="yaml", - ) + trend_metadata_status.success("Trend metadata looks valid.") except Exception as trend_exc: - st.error(f"Trend metadata error: {trend_exc}") + trend_metadata_status.error(f"Trend metadata error: {trend_exc}") + +_specsheet_key = { + "run_paths": [str(path) for path in selected_specsheet_run_paths], + "project_id": specsheet_project_id, + "version": specsheet_version, + "topic_name": specsheet_topic_name, + "labels": list(specsheet_labels), + "include_trend": specsheet_trend_enabled, + "trend_metadata": trend_metadata_payload if specsheet_trend_enabled else None, + "artifact_kind": "zip" if len(selected_specsheet_run_paths) > 1 else "pdf", +} +_specsheet_ready = ( + st.session_state.get("specsheet_pdf_report_bytes") is not None + and st.session_state.get("specsheet_pdf_report_key") == _specsheet_key +) + +def _release_specsheet_pdf_path(release_context: dict, topic_name: str) -> Path | None: + release_dir = release_context.get("release_dir") + if not isinstance(release_dir, Path): + return None + specsheet_root = release_dir / "specsheet" + topic = str(topic_name or "").strip() + candidates = [] + if topic: + candidates.append(specsheet_root / topic / "specsheet.pdf") + candidates.append(specsheet_root / "specsheet.pdf") + candidates.extend(sorted(specsheet_root.glob("*/*.pdf"))) + for candidate in candidates: + if candidate.exists() and not candidate.is_dir(): + return candidate + return None + +_release_specsheet_paths = [ + pdf_path + for pdf_path in ( + _release_specsheet_pdf_path(release_context, 
specsheet_topic_name) + for release_context in selected_specsheet_release_contexts + ) + if pdf_path is not None +] +_generated_specsheet_paths = [ + path_info["specsheet_pdf"] + for path_info in _active_specsheet_paths + if path_info["specsheet_pdf"].exists() and is_specsheet_pdf_fresh(path_info["run_dir"]) +] +_existing_specsheet_paths = _release_specsheet_paths or _generated_specsheet_paths +_all_selected_specsheet_pdfs_ready = ( + len(selected_specsheet_run_paths) > 0 + and len(_existing_specsheet_paths) == len(selected_specsheet_run_paths) +) +_specsheet_has_existing_pdf = _specsheet_ready or _all_selected_specsheet_pdfs_ready +_specsheet_action_label = ( + "Regenerate Release Spec-sheet PDF" + if _specsheet_has_existing_pdf + else "Generate Release Spec-sheet PDF" +) specsheet_action_col1, specsheet_action_col2 = st.columns([1.2, 2.8]) with specsheet_action_col1: - if st.button("Generate Release Spec-sheet PDF", type="primary", use_container_width=True): + if st.button( + _specsheet_action_label, + type="secondary" if _specsheet_has_existing_pdf else "primary", + use_container_width=True, + ): _specsheet_status = st.empty() _specsheet_progress = st.progress(0.0) try: @@ -953,8 +1034,6 @@ def _specsheet_title_version_from_metadata(metadata: dict) -> str: raise ValueError("Version is required.") if not specsheet_topic_name: raise ValueError("Topic name is required.") - if not specsheet_labels: - raise ValueError("At least one label is required.") if not selected_specsheet_run_paths: raise ValueError("At least one run must be selected.") if specsheet_trend_enabled and len(selected_specsheet_run_paths) != 1: @@ -1038,18 +1117,10 @@ def _update_specsheet_status(message: str) -> None: download_mime = "application/zip" st.session_state["specsheet_pdf_report_bytes"] = download_bytes - st.session_state["specsheet_pdf_report_key"] = { - "run_paths": [str(path) for path in selected_specsheet_run_paths], - "project_id": specsheet_project_id, - "version": 
specsheet_version, - "topic_name": specsheet_topic_name, - "labels": list(specsheet_labels), - "include_trend": specsheet_trend_enabled, - "trend_metadata": trend_metadata_payload if specsheet_trend_enabled else None, - "artifact_kind": "zip" if len(generated_pdfs) > 1 else "pdf", - } + st.session_state["specsheet_pdf_report_key"] = _specsheet_key st.session_state["specsheet_pdf_report_name"] = download_name st.session_state["specsheet_pdf_report_mime"] = download_mime + _specsheet_ready = True _specsheet_progress.progress(1.0) if any(generated for _, generated in generated_pdfs): if len(generated_pdfs) == 1: @@ -1068,30 +1139,8 @@ def _update_specsheet_status(message: str) -> None: st.session_state.pop("specsheet_pdf_report_mime", None) _specsheet_status.error(f"Spec-sheet generation failed: {e}") with specsheet_action_col2: - _specsheet_key = { - "run_paths": [str(path) for path in selected_specsheet_run_paths], - "project_id": specsheet_project_id, - "version": specsheet_version, - "topic_name": specsheet_topic_name, - "labels": list(specsheet_labels), - "include_trend": specsheet_trend_enabled, - "trend_metadata": trend_metadata_payload if specsheet_trend_enabled else None, - "artifact_kind": "zip" if len(selected_specsheet_run_paths) > 1 else "pdf", - } - _specsheet_ready = ( - st.session_state.get("specsheet_pdf_report_bytes") is not None - and st.session_state.get("specsheet_pdf_report_key") == _specsheet_key - ) - _fresh_specsheet_paths = [ - path_info["specsheet_pdf"] - for path_info in _active_specsheet_paths - if path_info["specsheet_pdf"].exists() and is_specsheet_pdf_fresh(path_info["run_dir"]) - ] - _all_selected_specsheet_pdfs_fresh = ( - len(selected_specsheet_run_paths) > 0 - and len(_fresh_specsheet_paths) == len(selected_specsheet_run_paths) - ) if _specsheet_ready: + st.success("Release spec-sheet is ready.") st.download_button( "Download Release Spec-sheet", data=st.session_state["specsheet_pdf_report_bytes"], @@ -1099,9 +1148,10 @@ def 
_update_specsheet_status(message: str) -> None: mime=st.session_state.get("specsheet_pdf_report_mime", "application/pdf"), use_container_width=True, ) - elif _all_selected_specsheet_pdfs_fresh: - if len(_fresh_specsheet_paths) == 1: - _disk_pdf_path = _fresh_specsheet_paths[0] + elif _all_selected_specsheet_pdfs_ready: + st.success("Existing release spec-sheet is ready.") + if len(_existing_specsheet_paths) == 1: + _disk_pdf_path = _existing_specsheet_paths[0] st.download_button( "Download Release Spec-sheet", data=_disk_pdf_path.read_bytes(), @@ -1112,7 +1162,7 @@ def _update_specsheet_status(message: str) -> None: else: _zip_buffer = io.BytesIO() with zipfile.ZipFile(_zip_buffer, "w", compression=zipfile.ZIP_DEFLATED) as zf: - for pdf_path in _fresh_specsheet_paths: + for pdf_path in _existing_specsheet_paths: zf.write(pdf_path, arcname=f"{pdf_path.parent.parent.name}/{pdf_path.name}") st.download_button( "Download Release Spec-sheets", diff --git a/evaluation_dashboard_app/deploy/docker-compose.yml b/evaluation_dashboard_app/deploy/docker-compose.yml index 0abe63b..54fee77 100644 --- a/evaluation_dashboard_app/deploy/docker-compose.yml +++ b/evaluation_dashboard_app/deploy/docker-compose.yml @@ -53,6 +53,7 @@ x-streamlit-app: &streamlit-app - ./configs:/app/docker_config # Mount app source so code changes apply without rebuild (Streamlit auto-reloads) - ../Overview.py:/app/Overview.py + - ../docker-entrypoint.sh:/app/docker-entrypoint.sh - ../catalogs.json:/app/catalogs.json - ../pages:/app/pages - ../Readme.md:/app/Readme.md @@ -60,6 +61,7 @@ x-streamlit-app: &streamlit-app - ../lib:/app/lib - ../worker:/app/worker - ../configs:/app/configs + - ../static:/app/static - ../.streamlit:/app/.streamlit - /var/run/docker.sock:/var/run/docker.sock extra_hosts: diff --git a/evaluation_dashboard_app/docker-entrypoint.sh b/evaluation_dashboard_app/docker-entrypoint.sh index 8f8357d..a3c26ac 100755 --- a/evaluation_dashboard_app/docker-entrypoint.sh +++ 
b/evaluation_dashboard_app/docker-entrypoint.sh @@ -5,4 +5,4 @@ if [[ -n "${ROS_DISTRO}" && -f "/opt/ros/${ROS_DISTRO}/setup.bash" ]]; then source "/opt/ros/${ROS_DISTRO}/setup.bash" fi -exec streamlit run Overview.py --server.address=0.0.0.0 --server.port=8501 --server.headless=true "$@" +exec streamlit run Overview.py --server.address=0.0.0.0 --server.port=8501 --server.headless=true --server.enableStaticServing=true "$@" diff --git a/evaluation_dashboard_app/lib/overview_url_hydrate.py b/evaluation_dashboard_app/lib/overview_url_hydrate.py index ca20a60..8f9fc1a 100644 --- a/evaluation_dashboard_app/lib/overview_url_hydrate.py +++ b/evaluation_dashboard_app/lib/overview_url_hydrate.py @@ -11,7 +11,7 @@ import streamlit as st -from lib.path_utils import get_data_root, get_run_display_name, list_run_directories +from lib.path_utils import get_data_root, get_run_display_name, get_run_storage_name, list_run_directories from lib.run_loader import load_run @@ -31,6 +31,7 @@ def try_hydrate_session_from_overview_query_params() -> bool: return False run_dirs = list_run_directories() name_to_dir = {get_run_display_name(p): p for p in run_dirs} + name_to_dir.update({get_run_storage_name(p): p for p in run_dirs}) if run_a_name not in name_to_dir: return False mode_param = (params.get("mode") or "single").lower() diff --git a/evaluation_dashboard_app/lib/page_chrome.py b/evaluation_dashboard_app/lib/page_chrome.py index 5bd6e08..d316661 100644 --- a/evaluation_dashboard_app/lib/page_chrome.py +++ b/evaluation_dashboard_app/lib/page_chrome.py @@ -76,7 +76,7 @@ def render_loaded_data_section(entries: Sequence[Tuple[str, str]]) -> None: f"""
{la}
-
{pa}
+
{pa}
""", unsafe_allow_html=True, @@ -95,7 +95,7 @@ def render_loaded_data_section(entries: Sequence[Tuple[str, str]]) -> None: f"""
{la}
-
{pa}
+
{pa}
""", unsafe_allow_html=True, diff --git a/evaluation_dashboard_app/lib/path_utils.py b/evaluation_dashboard_app/lib/path_utils.py index 72a2c9b..4d43554 100644 --- a/evaluation_dashboard_app/lib/path_utils.py +++ b/evaluation_dashboard_app/lib/path_utils.py @@ -9,9 +9,12 @@ """ import os +import re from pathlib import Path from typing import Optional, List, Tuple +import yaml + # Root for all evaluation data. Set EVAL_DASHBOARD_DATA_ROOT to override (e.g. /var/eval_dashboard/data). _DATA_ROOT: Optional[Path] = None @@ -122,16 +125,79 @@ def _looks_like_analysis_run(path: Path) -> bool: ) +RELEASE_ROLE_DIRS = ("performance", "usecase", "devops") +RELEASE_ROLE_LABELS = { + "performance": "Performance", + "usecase": "Usecase", + "devops": "DevOps", +} +_PILOT_AUTO_PREFIX_PATTERN = re.compile(r"^\s*Pilot\.Auto\s*", re.IGNORECASE) + + def _looks_like_release_container(path: Path) -> bool: return ( (path / "metadata.yaml").exists() - and any((path / name).is_dir() for name in ("performance", "devops")) + and any((path / name).is_dir() for name in RELEASE_ROLE_DIRS) and not _looks_like_analysis_run(path) ) +def _load_yaml_metadata(path: Path) -> dict: + if not path.is_file(): + return {} + try: + with path.open("r", encoding="utf-8") as fh: + data = yaml.safe_load(fh) or {} + except (OSError, yaml.YAMLError): + return {} + return data if isinstance(data, dict) else {} + + +def _compact_release_version(metadata: dict, fallback: str) -> str: + version = str(metadata.get("version_abbr") or metadata.get("pilot_auto_version") or "").strip() + if not version: + return fallback + version = _PILOT_AUTO_PREFIX_PATTERN.sub("", version).strip() or version + version = version.replace("/", "-") + return version + + +def _release_run_display_name(run_path: Path) -> Optional[str]: + role_label = "" + release_dir = run_path + if run_path.name in RELEASE_ROLE_LABELS and _looks_like_release_container(run_path.parent): + release_dir = run_path.parent + role_label = 
RELEASE_ROLE_LABELS[run_path.name] + elif _looks_like_release_container(run_path): + role_label = "Release" + else: + return None + + metadata = _load_yaml_metadata(run_path / "metadata.yaml") or _load_yaml_metadata(release_dir / "metadata.yaml") + version = _compact_release_version(metadata, release_dir.name.replace("release_spec_", "")) + date = str(metadata.get("date") or "").strip() + parts = [f"[REL] {version}"] + if role_label: + parts.append(role_label) + if date: + parts.append(date) + return " | ".join(parts) + + def get_run_display_name(run_path: Path) -> str: - """Return a stable run selector name relative to the data root.""" + """Return a stable user-facing run selector name.""" + release_name = _release_run_display_name(run_path) + if release_name: + return release_name + root = get_data_root() + try: + return run_path.resolve().relative_to(root).as_posix() + except Exception: + return run_path.name + + +def get_run_storage_name(run_path: Path) -> str: + """Return the raw path-like run name relative to the data root.""" root = get_data_root() try: return run_path.resolve().relative_to(root).as_posix() @@ -151,7 +217,7 @@ def list_run_directories() -> List[Path]: if resolved not in seen and not _looks_like_release_container(child): runs.append(child) seen.add(resolved) - for release_child_name in ("performance", "devops"): + for release_child_name in RELEASE_ROLE_DIRS: release_child = child / release_child_name if release_child.is_dir() and _looks_like_analysis_run(release_child): release_resolved = release_child.resolve() @@ -235,6 +301,10 @@ def resolve_run_subdirectory(run_name: str) -> Tuple[Optional[Path], str]: return None, "Invalid run name." if "\x00" in run_name or "\\" in run_name: return None, "Invalid run name." 
+ display_matches = [path for path in list_run_directories() if get_run_display_name(path) == run_name] + if display_matches: + return display_matches[0], "" + run_path = (root / run_name).resolve() try: run_path.relative_to(root) diff --git a/evaluation_dashboard_app/lib/release_specsheet_library.py b/evaluation_dashboard_app/lib/release_specsheet_library.py new file mode 100644 index 0000000..ce51f84 --- /dev/null +++ b/evaluation_dashboard_app/lib/release_specsheet_library.py @@ -0,0 +1,165 @@ +from __future__ import annotations + +import urllib.parse +from pathlib import Path +from typing import Any + +import yaml + +from lib.path_utils import get_run_display_name, path_display + + +RELEASE_ROLE_DIRS = ("performance", "usecase", "devops") +DEFAULT_EVALUATOR_PROJECT_ID = "x2_dev" +EVALUATOR_REPORT_BASE_URL = "https://evaluation.tier4.jp/evaluation/reports" + + +def _overview_query(run_path: Path) -> str: + return urllib.parse.urlencode({"mode": "single", "run_a": get_run_display_name(run_path)}) + + +def _safe_url_part(value: str, fallback: str) -> str: + import re + + text = re.sub(r"[^\w.\-]+", "_", str(value or "")).strip("._") + return text or fallback + + +def _load_yaml(path: Path) -> dict[str, Any]: + if not path.exists(): + return {} + try: + data = yaml.safe_load(path.read_text(encoding="utf-8")) or {} + except Exception: + return {} + return data if isinstance(data, dict) else {} + + +def _role_metadata(role_dir: Path) -> dict[str, Any]: + metadata = _load_yaml(role_dir / "metadata.yaml") + if metadata: + return metadata + return _load_yaml(role_dir / "resources" / "metadata.yaml") + + +def _evaluator_report_url(job_id: str, project_id: str = DEFAULT_EVALUATOR_PROJECT_ID) -> str: + if not job_id: + return "" + query = urllib.parse.urlencode({"project_id": project_id}) + return f"{EVALUATOR_REPORT_BASE_URL}/{job_id}?{query}" + + +def _pdf_static_url(release_name: str, topic_name: str) -> str: + release_part = _safe_url_part(release_name, "release") + 
topic_part = _safe_url_part(topic_name, "topic") + return f"/app/static/release_specs/{release_part}/{topic_part}.pdf" + + +def discover_release_specsheet_inventory(data_root: Path) -> list[dict[str, Any]]: + rows: list[dict[str, Any]] = [] + for release_dir in sorted(data_root.glob("release_spec_*")): + if not release_dir.is_dir(): + continue + metadata_path = release_dir / "metadata.yaml" + metadata = {} + if metadata_path.exists(): + try: + metadata = yaml.safe_load(metadata_path.read_text(encoding="utf-8")) or {} + except Exception: + metadata = {} + if not isinstance(metadata, dict): + metadata = {} + + specsheet_root = release_dir / "specsheet" + topic_pdf_paths = { + path + for path in specsheet_root.glob("*/*.pdf") + if path.is_file() or path.is_symlink() + } + pdfs: list[dict[str, Any]] = [] + for pdf_path in sorted(specsheet_root.glob("**/*.pdf")): + if pdf_path.parent == specsheet_root and topic_pdf_paths: + continue + topic = pdf_path.parent.name if pdf_path.parent != specsheet_root else "default" + static_path = ( + Path.cwd() + / "static" + / "release_specs" + / _safe_url_part(release_dir.name.replace("release_spec_", "", 1), "release") + / f"{_safe_url_part(topic, 'topic')}.pdf" + ) + pdfs.append( + { + "topic": topic, + "path": pdf_path, + "display_path": path_display(pdf_path), + "absolute_path": str(pdf_path.resolve()), + "static_path": static_path, + "static_url": _pdf_static_url(release_dir.name.replace("release_spec_", "", 1), topic), + "available": pdf_path.exists() and not pdf_path.is_dir(), + "static_available": static_path.exists() and not static_path.is_dir(), + } + ) + + roles: dict[str, dict[str, Any]] = {} + for role in RELEASE_ROLE_DIRS: + role_dir = release_dir / role + if not role_dir.is_dir(): + continue + role_metadata = _role_metadata(role_dir) + job_id = str(role_metadata.get("job_id") or "").strip() + project_id = str(role_metadata.get("project_id") or DEFAULT_EVALUATOR_PROJECT_ID).strip() + roles[role] = { + "path": role_dir, + 
"display_path": path_display(role_dir), + "absolute_path": str(role_dir.resolve()), + "run_name": get_run_display_name(role_dir), + "overview_query": _overview_query(role_dir), + "overview_url": f"/?{_overview_query(role_dir)}", + "job_id": job_id, + "project_id": project_id, + "evaluator_report_url": _evaluator_report_url(job_id, project_id), + "has_parquet": any(role_dir.glob("*.parquet")), + "has_summary": (role_dir / "summary.json").exists() or (role_dir / "resources" / "summary.json").exists(), + "has_metadata": (role_dir / "metadata.yaml").exists() or (role_dir / "resources" / "metadata.yaml").exists(), + } + + rows.append( + { + "release_dir": release_dir, + "release_dir_display": path_display(release_dir), + "release_dir_absolute": str(release_dir.resolve()), + "release": release_dir.name.replace("release_spec_", "", 1), + "version": metadata.get("pilot_auto_version") or metadata.get("version_abbr") or "", + "date": metadata.get("date") or "", + "description": metadata.get("description") or "", + "data_count": metadata.get("data_count") or "", + "roles": roles, + "pdfs": pdfs, + "pdf_topics": ", ".join(pdf["topic"] for pdf in pdfs), + "main_pdf_url": next((pdf["static_url"] for pdf in pdfs), ""), + "main_pdf_path": next((pdf["display_path"] for pdf in pdfs), ""), + } + ) + return rows + + +def discover_ready_release_specsheets(data_root: Path) -> list[dict[str, Any]]: + rows: list[dict[str, Any]] = [] + for release in discover_release_specsheet_inventory(data_root): + default_run = release["roles"].get("performance") or next(iter(release["roles"].values()), {}) + for pdf in release["pdfs"]: + rows.append( + { + "release_dir": release["release_dir"], + "pdf_path": pdf["path"], + "release": release["release"], + "version": release["version"], + "date": release["date"], + "description": release["description"], + "topic": pdf["topic"], + "view_run": default_run.get("run_name", ""), + "overview_query": default_run.get("overview_query", ""), + } + ) + return rows 
diff --git a/evaluation_dashboard_app/lib/specsheet_report.py b/evaluation_dashboard_app/lib/specsheet_report.py index bf34299..992c8b0 100644 --- a/evaluation_dashboard_app/lib/specsheet_report.py +++ b/evaluation_dashboard_app/lib/specsheet_report.py @@ -15,6 +15,7 @@ import yaml from lib.path_utils import get_data_root +from lib.run_metadata import read_run_metadata DEFAULT_SPECSHEET_TOPIC = "perception.object_recognition.tracking.objects" DEFAULT_TREND_TOPIC = "perception.object_recognition.objects" @@ -45,7 +46,7 @@ ] TREND_METADATA_FILENAME = "metadata.yaml" TREND_SUMMARY_FILENAME = "summary.json" -SPECSHEET_RELEASE_ROLE_DIRS = ("performance", "devops") +SPECSHEET_RELEASE_ROLE_DIRS = ("performance", "usecase", "devops") GENERATED_TREND_HISTORY_DIRNAME = "_app_trend_history" FULL_DATASET_EVALUATION_HEADER = "全数データセット評価" DEFAULT_TREND_METADATA_TEXT = """tags: [trend] @@ -414,6 +415,8 @@ def discover_trend_metadata_files(root_dir: str | Path | None = None) -> list[Pa continue if GENERATED_TREND_HISTORY_DIRNAME in metadata_path.parts: continue + if any(part.startswith("release_spec_") for part in metadata_path.parts): + continue if not (metadata_path.parent / TREND_SUMMARY_FILENAME).exists(): continue matches.append(metadata_path) @@ -457,6 +460,89 @@ def _unwrap_devops_summary(summary: dict[str, Any]) -> dict[str, Any]: return summary +def _release_role_key_for_metadata(role: str) -> str: + if role in {"full", "performance_blocks"}: + return "performance" + return role + + +def _job_id_from_run_metadata(run_dir: Path, role: str) -> str: + role_key = _release_role_key_for_metadata(role) + candidates = [run_dir] + if run_dir.parent != run_dir: + candidates.append(run_dir.parent) + + for candidate in candidates: + metadata = read_run_metadata(candidate) + release_specsheet = metadata.get("release_specsheet") if isinstance(metadata.get("release_specsheet"), dict) else {} + evaluator_jobs = release_specsheet.get("evaluator_jobs") if 
isinstance(release_specsheet.get("evaluator_jobs"), dict) else {} + role_meta = evaluator_jobs.get(role_key) if isinstance(evaluator_jobs.get(role_key), dict) else {} + job_id = str(role_meta.get("job_id") or "").strip() + if job_id: + return job_id + + evaluator_meta = metadata.get("evaluator") if isinstance(metadata.get("evaluator"), dict) else {} + job_id = str(evaluator_meta.get("job_id") or "").strip() + if job_id: + return job_id + + request_meta = metadata.get("request") if isinstance(metadata.get("request"), dict) else {} + parameter_meta = request_meta.get("parameters") if isinstance(request_meta.get("parameters"), dict) else {} + for key in (f"{role_key}_job_id", "job_id"): + job_id = str(parameter_meta.get(key) or request_meta.get(key) or "").strip() + if job_id: + return job_id + return "" + + +def _release_metadata_match(candidate: dict[str, Any], target: dict[str, Any]) -> bool: + for key in ("release_group", "pilot_auto_version", "topic_name", "description", "data_count"): + target_value = str(target.get(key) or "").strip() + if target_value and str(candidate.get(key) or "").strip() != target_value: + return False + return True + + +def _job_id_from_matching_release_run_metadata(root_dir: str | Path | None, target_metadata: dict[str, Any], role: str) -> str: + root = Path(root_dir) if root_dir is not None else get_data_root() + if not root.exists() or not root.is_dir(): + return "" + role_key = _release_role_key_for_metadata(role) + candidates = sorted( + [path for path in root.iterdir() if path.is_dir()], + key=lambda path: path.stat().st_mtime if path.exists() else 0, + reverse=True, + ) + for candidate in candidates: + metadata = read_run_metadata(candidate) + request_meta = metadata.get("request") if isinstance(metadata.get("request"), dict) else {} + parameter_meta = request_meta.get("parameters") if isinstance(request_meta.get("parameters"), dict) else {} + trend_metadata = ( + parameter_meta.get("trend_metadata") + if 
isinstance(parameter_meta.get("trend_metadata"), dict) + else {} + ) + release_specsheet = metadata.get("release_specsheet") if isinstance(metadata.get("release_specsheet"), dict) else {} + release_metadata = ( + release_specsheet.get("metadata") + if isinstance(release_specsheet.get("metadata"), dict) + else trend_metadata + ) + if not _release_metadata_match(release_metadata, target_metadata): + continue + + evaluator_jobs = release_specsheet.get("evaluator_jobs") if isinstance(release_specsheet.get("evaluator_jobs"), dict) else {} + role_meta = evaluator_jobs.get(role_key) if isinstance(evaluator_jobs.get(role_key), dict) else {} + job_id = str(role_meta.get("job_id") or "").strip() + if job_id: + return job_id + + job_id = str(parameter_meta.get(f"{role_key}_job_id") or request_meta.get(f"{role_key}_job_id") or "").strip() + if job_id: + return job_id + return "" + + def discover_trend_release_groups(root_dir: str | Path | None = None) -> list[TrendReleaseGroup]: metadata_files = discover_trend_metadata_files(root_dir) grouped: dict[str, TrendReleaseGroup] = {} @@ -483,7 +569,12 @@ def discover_trend_release_groups(root_dir: str | Path | None = None) -> list[Tr "group_kind": group_kind, "base_dir": base_dir, "role": role, - "job_id": run_dir.name, + "job_id": str( + metadata.get("job_id") + or _job_id_from_run_metadata(run_dir, role) + or _job_id_from_matching_release_run_metadata(root_dir, metadata, role) + or "" + ), "metadata_path": metadata_path, "summary_path": summary_path, "metadata": metadata, @@ -512,7 +603,12 @@ def discover_trend_release_groups(root_dir: str | Path | None = None) -> list[Tr ) grouped[group_key].jobs[role] = { "role": role, - "job_id": metadata_path.parent.name if metadata_path.parent.name != "resources" else run_dir.name, + "job_id": str( + metadata.get("job_id") + or _job_id_from_run_metadata(metadata_path.parent, role) + or _job_id_from_matching_release_run_metadata(root_dir, metadata, role) + or (metadata_path.parent.name if 
metadata_path.parent.name != "resources" else run_dir.name) + ), "metadata_path": metadata_path.resolve(), "summary_path": summary_path.resolve(), "metadata": metadata, diff --git a/evaluation_dashboard_app/pages/13_Trend_Insights.py b/evaluation_dashboard_app/pages/13_Trend_Insights.py index 084897b..056ecb1 100644 --- a/evaluation_dashboard_app/pages/13_Trend_Insights.py +++ b/evaluation_dashboard_app/pages/13_Trend_Insights.py @@ -3,6 +3,7 @@ import json import re import shutil +from html import escape from pathlib import Path from typing import Any @@ -10,9 +11,11 @@ import plotly.express as px import plotly.graph_objects as go import streamlit as st +import streamlit.components.v1 as components from lib.page_chrome import inject_app_page_styles, render_page_hero, section_header from lib.path_utils import get_data_root, path_display, resolve_under_data_root +from lib.release_specsheet_library import discover_release_specsheet_inventory from lib.specsheet_report import ( DEFAULT_TREND_METADATA_TEXT, TREND_METADATA_FILENAME, @@ -124,10 +127,6 @@ def _assemble_trend_release_group( def _render_release_trend_builder() -> None: section_header("Build Release Trend Group") with st.expander("Assemble full/usecase/devops summaries into one release", expanded=False): - st.caption( - "Use this after the three evaluator jobs have analyzer-compatible summary.json files. " - "Each source can be a job folder, a run folder containing resources/summary.json, or the summary.json file itself." 
- ) with st.form("release_trend_builder_form"): form_col1, form_col2 = st.columns([1.1, 1.2]) with form_col1: @@ -200,6 +199,816 @@ def _update_version_axis(fig: go.Figure, versions: list[str]) -> None: fig.update_xaxes(categoryorder="array", categoryarray=versions) +def _role_overview_url(release_row: dict[str, Any], role: str) -> str: + role_info = release_row.get("roles", {}).get(role, {}) + return str(role_info.get("overview_url") or "") + + +def _role_debug_path(release_row: dict[str, Any], role: str) -> str: + role_info = release_row.get("roles", {}).get(role, {}) + return str(role_info.get("absolute_path") or "") + + +def _role_evaluator_url(release_row: dict[str, Any], role: str) -> str: + role_info = release_row.get("roles", {}).get(role, {}) + return str(role_info.get("evaluator_report_url") or "") + + +def _topic_family(topic_name: Any) -> str: + topic = str(topic_name or "") + if topic == "perception.object_recognition.objects": + return "Perception Performance" + if topic.startswith("perception.object_recognition.detection."): + return "ML Model Performance" + return "Other" + + +def _date_sort_value(value: Any) -> float: + parsed = pd.to_datetime(value, format="%Y.%m.%d", errors="coerce") + if pd.isna(parsed): + return -1.0 + return float(parsed.timestamp()) + + +def _html_link(url: str, label: str, variant: str = "action") -> str: + if not url: + return '-' + return ( + f'{escape(label)}' + ) + + +def _pdf_links_for_prefix(release: dict[str, Any], prefix: str) -> str: + links = [] + for pdf in release.get("pdfs", []): + topic = str(pdf.get("topic") or "") + if topic == prefix or topic.startswith(prefix): + label = "Prediction" + if topic.startswith("perception.object_recognition.detection."): + label = topic.replace("perception.object_recognition.detection.", "").replace(".objects", "") + label = label.replace("bevfusion", "BEVFusion").replace("centerpoint", "CenterPoint") + links.append(_html_link(str(pdf.get("static_url") or ""), label, "pdf")) + 
return "
".join(links) if links else '-' + + +def _has_pdf_for_prefix(release: dict[str, Any], prefix: str) -> bool: + for pdf in release.get("pdfs", []): + topic = str(pdf.get("topic") or "") + if topic == prefix or topic.startswith(prefix): + return True + return False + + +def _render_release_library_table(releases: list[dict[str, Any]]) -> None: + group_headers = [ + ("Release", 4), + ("Overview", 3), + ("Specsheet PDF", 2), + ("Evaluator Job", 3), + ] + col_widths = [250, 82, 180, 76, 88, 88, 88, 110, 110, 88, 88, 88] + headers = [ + "Version", + "Date", + "Description", + "Data", + "Performance", + "Usecase", + "DevOps", + "Prediction", + "Detection", + "Performance", + "Usecase", + "DevOps", + ] + sort_types = ["text", "date", "text", "number", "text", "text", "text", "text", "text", "text", "text", "text"] + sortable_columns = {0, 1, 2, 3} + rows_html = [] + for release in releases: + sort_values = [ + str(release.get("version") or ""), + str(_date_sort_value(release.get("date"))), + str(release.get("description") or ""), + str(_parse_data_count(release.get("data_count")) or -1), + "open" if _role_overview_url(release, "performance") else "", + "open" if _role_overview_url(release, "usecase") else "", + "open" if _role_overview_url(release, "devops") else "", + "prediction" if _has_pdf_for_prefix(release, "perception.object_recognition.objects") else "", + "detection" if _has_pdf_for_prefix(release, "perception.object_recognition.detection.") else "", + "report" if _role_evaluator_url(release, "performance") else "", + "report" if _role_evaluator_url(release, "usecase") else "", + "report" if _role_evaluator_url(release, "devops") else "", + ] + cells = [ + escape(str(release.get("version") or "")), + escape(str(release.get("date") or "")), + escape(str(release.get("description") or "")), + escape(str(release.get("data_count") or "")), + _html_link(_role_overview_url(release, "performance"), "Open", "overview"), + _html_link(_role_overview_url(release, "usecase"), 
"Open", "overview"), + _html_link(_role_overview_url(release, "devops"), "Open", "overview"), + _pdf_links_for_prefix(release, "perception.object_recognition.objects"), + _pdf_links_for_prefix(release, "perception.object_recognition.detection."), + _html_link(_role_evaluator_url(release, "performance"), "Report", "job"), + _html_link(_role_evaluator_url(release, "usecase"), "Report", "job"), + _html_link(_role_evaluator_url(release, "devops"), "Report", "job"), + ] + rows_html.append( + "" + + "".join( + f'{cell}' + for cell, sort_value in zip(cells, sort_values) + ) + + "" + ) + table_html = f""" + + + + + + + +
+
+ + {''.join(f'' for width in col_widths)} + + {''.join(f'' for header, span in group_headers)} + {''.join(f'' if idx in sortable_columns else f'' for idx, header in enumerate(headers))} + + {''.join(rows_html)} +
{escape(header)}
{escape(header)}
+
+
+ + + +""" + component_height = 76 + max(1, len(releases)) * 34 + components.html(table_html, height=component_height, scrolling=False) + + +def _release_inventory_debug_rows(releases: list[dict[str, Any]]) -> list[dict[str, Any]]: + rows: list[dict[str, Any]] = [] + for release in releases: + rows.append( + { + "version": release["version"], + "date": release["date"], + "release": release["release"], + "release_dir": release["release_dir_absolute"], + "performance_dir": _role_debug_path(release, "performance"), + "usecase_dir": _role_debug_path(release, "usecase"), + "devops_dir": _role_debug_path(release, "devops"), + "performance_job_url": _role_evaluator_url(release, "performance"), + "usecase_job_url": _role_evaluator_url(release, "usecase"), + "devops_job_url": _role_evaluator_url(release, "devops"), + "pdf_paths": "\n".join(pdf["absolute_path"] for pdf in release.get("pdfs", [])), + } + ) + return rows + + +def _release_metric_bar_ranges(frame: pd.DataFrame) -> dict[str, tuple[float, float]]: + ranges: dict[str, tuple[float, float]] = {} + metric_columns = ("mAP", "precision", "recall", "overall_pass_rate", "FNR", "x_error", "y_error", "yaw_error") + for column in metric_columns: + if column not in frame.columns: + continue + values = pd.to_numeric(frame[column], errors="coerce") + if not values.notna().any(): + continue + min_value = float(values.min(skipna=True)) + max_value = float(values.max(skipna=True)) + if abs(max_value - min_value) < 1e-12: + if column == "overall_pass_rate": + min_value, max_value = 0.0, 100.0 + elif column in {"mAP", "precision", "recall"}: + min_value, max_value = 0.0, 1.0 + else: + min_value, max_value = 0.0, max(max_value, 1.0) + ranges[column] = (min_value, max_value) + return ranges + + +def _release_performance_cell_html(value: Any, column: str, ranges: dict[str, tuple[float, float]]) -> str: + metric_columns = {"mAP", "precision", "recall", "overall_pass_rate", "FNR", "x_error", "y_error", "yaw_error"} + if column not in 
metric_columns: + return escape(str(value or "")) + + numeric = pd.to_numeric(pd.Series([value]), errors="coerce").iloc[0] + if pd.isna(numeric): + return '-' + + min_value, max_value = ranges.get(column, (0.0, 1.0)) + span = max(max_value - min_value, 1e-12) + normalized = max(0.0, min(1.0, (float(numeric) - min_value) / span)) + pct = 8.0 + normalized * 92.0 + if column == "overall_pass_rate": + label = f"{float(numeric):.1f}%" + else: + label = f"{float(numeric):.3f}" + + # Calm app-aligned palette: soft rose for weak/concerning values, soft teal for strong/healthy values. + teal = (45, 212, 191) + rose = (251, 113, 133) + if column in {"mAP", "precision", "recall", "overall_pass_rate"}: + color_ratio = normalized + else: + color_ratio = 1.0 - normalized + red = round(rose[0] + (teal[0] - rose[0]) * color_ratio) + green = round(rose[1] + (teal[1] - rose[1]) * color_ratio) + blue = round(rose[2] + (teal[2] - rose[2]) * color_ratio) + + return ( + f'
' + f'{escape(label)}' + "
" + ) + + +def _release_performance_column_group(column: str) -> str: + if column in {"version", "date", "description", "data_count"}: + return "Release" + if column in {"mAP", "precision", "recall"}: + return "Score" + if column in {"FNR", "x_error", "y_error", "yaw_error"}: + return "Error" + if column == "overall_pass_rate": + return "Pass Rate" + return "Jobs / Metadata" + + +def _render_release_performance_html_table(frame: pd.DataFrame) -> None: + ranges = _release_metric_bar_ranges(frame) + numeric_columns = {"mAP", "precision", "recall", "overall_pass_rate", "FNR", "x_error", "y_error", "yaw_error", "data_count"} + group_spans: list[tuple[str, int]] = [] + for column in frame.columns: + group = _release_performance_column_group(str(column)) + if group_spans and group_spans[-1][0] == group: + group_spans[-1] = (group, group_spans[-1][1] + 1) + else: + group_spans.append((group, 1)) + group_header_html = "".join( + f'{escape(group)}' + for group, span in group_spans + ) + header_html = "".join( + ( + f'' + ) + for idx, column in enumerate(frame.columns) + ) + row_html = [] + for _, row in frame.iterrows(): + cells = [] + for column in frame.columns: + value = row.get(column) + if column == "data_count": + parsed_count = _parse_data_count(value) + sort_value = "" if parsed_count is None else str(parsed_count) + elif column in numeric_columns: + numeric = pd.to_numeric(pd.Series([value]), errors="coerce").iloc[0] + sort_value = "" if pd.isna(numeric) else f"{float(numeric):.12g}" + else: + sort_value = str(value or "") + cells.append( + f'' + f"{_release_performance_cell_html(value, column, ranges)}" + ) + row_html.append(f"{''.join(cells)}") + + table_html = f""" + + + + + + + +
+ + + {group_header_html} + {header_html} + + {''.join(row_html)} +
+
+ + + +""" + component_height = 76 + max(1, len(frame)) * 34 + components.html(table_html, height=component_height, scrolling=False) + + +def _release_performance_table( + frame: pd.DataFrame, + *, + family: str, + empty_message: str, + table_mode: str, +) -> None: + if frame.empty: + st.info(empty_message) + return + view = frame[frame["topic_family"] == family].copy() + if view.empty: + st.info(empty_message) + return + columns = [ + "version", + "date", + "description", + "data_count", + "mAP", + "precision", + "recall", + "FNR", + "x_error", + "y_error", + "yaw_error", + "roles", + "full_job_id", + "usecase_job_id", + "devops_job_id", + "topic_name", + ] + if family == "Perception Performance": + columns.insert(columns.index("roles"), "overall_pass_rate") + visible = [column for column in columns if column in view.columns] + display_frame = view.sort_values(["date_sort", "version", "release_name"], ascending=[False, False, False])[visible] + if table_mode == "Colored bars": + _render_release_performance_html_table(display_frame) + else: + dataframe_height = 38 + max(1, len(display_frame)) * 35 + st.dataframe( + display_frame, + width="stretch", + hide_index=True, + height=dataframe_height, + ) + + def _build_pass_combo_chart( frame: pd.DataFrame, *, @@ -230,6 +1039,9 @@ def _build_pass_combo_chart( hover_cols = hover_cols or ["date", "release_name", "passed", "total"] plot_df = frame.copy() + version_order = {version: idx for idx, version in enumerate(versions)} + plot_df["__version_order"] = plot_df["version"].map(version_order).fillna(len(version_order)) + plot_df = plot_df.sort_values(["__version_order", "version", "date", "release_name"]) if series_col is None: fig.add_trace( go.Scatter( @@ -246,7 +1058,9 @@ def _build_pass_combo_chart( else: palette = px.colors.qualitative.Bold + px.colors.qualitative.Safe + px.colors.qualitative.Set2 for idx, series_name in enumerate(plot_df[series_col].dropna().astype(str).unique().tolist()): - series_df = 
plot_df[plot_df[series_col].astype(str) == series_name] + series_df = plot_df[plot_df[series_col].astype(str) == series_name].sort_values( + ["__version_order", "version", "date", "release_name"] + ) color = palette[idx % len(palette)] fig.add_trace( go.Scatter( @@ -396,8 +1210,12 @@ def _build_metric_label_lines( title: str, ordered_axes: list[str], ) -> go.Figure: + plot_df = frame.dropna(subset=["value"]).copy() + axis_order = {axis: idx for idx, axis in enumerate(ordered_axes)} + plot_df["__axis_order"] = plot_df["release_axis"].map(axis_order).fillna(len(axis_order)) + plot_df = plot_df.sort_values(["label_name", "__axis_order", "release_axis"]) fig = px.line( - frame, + plot_df, x="release_axis", y="value", color="label_name", @@ -407,6 +1225,7 @@ def _build_metric_label_lines( ) fig.update_layout(margin=dict(l=20, r=20, t=70, b=20), legend_title_text="Label") fig.update_xaxes(categoryorder="array", categoryarray=ordered_axes, tickangle=-30, automargin=True) + fig.update_traces(connectgaps=True) return fig @@ -446,7 +1265,10 @@ def _build_prediction_label_profile( profile_df = frame[ (frame["metric_name"].isin(metric_names)) & (frame["label_name"] == selected_label) - ].copy() + ].dropna(subset=["value"]).copy() + axis_order = {axis: idx for idx, axis in enumerate(ordered_axes)} + profile_df["__axis_order"] = profile_df["release_axis"].map(axis_order).fillna(len(axis_order)) + profile_df = profile_df.sort_values(["metric_name", "__axis_order", "release_axis"]) fig = px.line( profile_df, x="release_axis", @@ -458,6 +1280,7 @@ def _build_prediction_label_profile( ) fig.update_layout(margin=dict(l=20, r=20, t=70, b=20), legend_title_text="Horizon") fig.update_xaxes(categoryorder="array", categoryarray=ordered_axes, tickangle=-30, automargin=True) + fig.update_traces(connectgaps=True) return fig @@ -623,16 +1446,13 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame render_page_hero( kicker="Release Analytics", title="Trend 
Insights", - description="Release-level trends across grouped full, usecase, and devops runs.", + description="Release history and performance trends.", ) -_render_release_trend_builder() - -section_header("Release Inventory") - groups = discover_trend_release_groups() if not groups: - st.info("No saved trend metadata was found yet. Use the release trend builder above after the three job summaries are available.") + st.info("No saved trend metadata was found yet. Use the release trend builder below after the three job summaries are available.") + _render_release_trend_builder() st.stop() try: @@ -641,36 +1461,66 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame st.error(f"Could not build trend insights: {exc}") st.stop() +if not release_df.empty: + release_df["topic_family"] = release_df["topic_name"].map(_topic_family) + +section_header("Release History") +release_specsheets = discover_release_specsheet_inventory(get_data_root()) +if release_specsheets: + release_specsheets = sorted( + release_specsheets, + key=lambda row: ( + pd.to_datetime(row.get("date"), format="%Y.%m.%d", errors="coerce").timestamp() + if pd.notna(pd.to_datetime(row.get("date"), format="%Y.%m.%d", errors="coerce")) + else -1.0, + str(row.get("version") or ""), + str(row.get("release") or ""), + ), + reverse=True, + ) + _render_release_library_table(release_specsheets) + + with st.expander("Debug release inventory paths", expanded=False): + st.dataframe( + pd.DataFrame(_release_inventory_debug_rows(release_specsheets)), + width="stretch", + hide_index=True, + ) +else: + st.info("No imported release library was found. 
Run `python scripts/import_catalog_analyzer_releases.py --force` to import analyzer output.") + +section_header("Release Performance") top1, top2, top3, top4, top5 = st.columns(5) -top1.metric("Release Groups", f"{len(release_df):,}") +top1.metric("Performance Groups", f"{len(release_df):,}") top2.metric("Unique Versions", f"{release_df['version'].nunique():,}" if not release_df.empty else "0") -top3.metric("Groups with Full", f"{int(release_df['full_job_id'].notna().sum()):,}" if not release_df.empty else "0") -top4.metric("Groups with DevOps", f"{int(release_df['devops_job_id'].notna().sum()):,}" if not release_df.empty else "0") +top3.metric("Perception Performance", f"{int((release_df['topic_family'] == 'Perception Performance').sum()):,}" if not release_df.empty else "0") +top4.metric("ML Model Performance", f"{int((release_df['topic_family'] == 'ML Model Performance').sum()):,}" if not release_df.empty else "0") top5.metric("Latest Date", release_df.sort_values("date_sort")["date"].iloc[-1] if not release_df.empty else "n/a") -inventory_cols = [ - "version", - "date", - "description", - "data_count", - "mAP", - "precision", - "recall", - "overall_pass_rate", - "roles", - "full_job_id", - "usecase_job_id", - "devops_job_id", - "topic_name", - "group_kind", -] -st.dataframe( - release_df.sort_values(["date_sort", "version", "release_name"], ascending=[False, False, False])[inventory_cols], - use_container_width=True, - hide_index=True, +performance_table_mode = st.segmented_control( + "Table view", + options=["Dataframe", "Colored bars"], + default="Dataframe", + key="release_performance_table_mode", ) -section_header("Major Metrics Trend") +st.markdown("#### Perception Performance") +_release_performance_table( + release_df, + family="Perception Performance", + empty_message="No Perception Performance release rows are available.", + table_mode=performance_table_mode, +) + +st.markdown("#### ML Model Performance") +_release_performance_table( + release_df, + 
family="ML Model Performance", + empty_message="No ML Model Performance release rows are available.", + table_mode=performance_table_mode, +) + +section_header("Major Performance Scores") perf_entries = release_df[release_df["full_job_id"].notna()].sort_values( ["date_sort", "version", "release_name"], @@ -686,61 +1536,100 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame "minFDE@5s", ] if not perf_entries.empty and perf_entries[major_metric_cols].notna().any().any(): - latest_major_row = perf_entries.dropna(subset=major_metric_cols, how="all").iloc[-1] + latest_major_rows = ( + perf_entries.dropna(subset=major_metric_cols, how="all") + .sort_values(["date_sort", "version", "release_name"]) + .groupby("topic_family", dropna=False) + .tail(1) + ) metric_card_cols = st.columns(4) - for metric_col, card_col in zip(major_metric_cols, metric_card_cols[:3]): - metric_series = perf_entries.dropna(subset=[metric_col]) - latest_metric_value = metric_series[metric_col].iloc[-1] if not metric_series.empty else pd.NA + for family, card_col in zip(("Perception Performance", "ML Model Performance"), metric_card_cols[:2]): + family_row = latest_major_rows[latest_major_rows["topic_family"] == family] + if family_row.empty: + card_col.metric(f"{family} mAP", "n/a") + continue card_col.metric( - f"Latest {metric_col}", - f"{latest_metric_value:.3f}" if pd.notna(latest_metric_value) else "n/a", + f"{family} mAP", + f"{family_row['mAP'].iloc[-1]:.3f}" if pd.notna(family_row["mAP"].iloc[-1]) else "n/a", ) + latest_perception_row = latest_major_rows[latest_major_rows["topic_family"] == "Perception Performance"] + latest_model_row = latest_major_rows[latest_major_rows["topic_family"] == "ML Model Performance"] + metric_card_cols[2].metric( + "Perception Recall", + f"{latest_perception_row['recall'].iloc[-1]:.3f}" + if not latest_perception_row.empty and pd.notna(latest_perception_row["recall"].iloc[-1]) + else "n/a", + ) metric_card_cols[3].metric( - 
"Latest Data Count", - f"{int(latest_major_row['data_count_num']):,}" if pd.notna(latest_major_row["data_count_num"]) else "n/a", + "ML Model Recall", + f"{latest_model_row['recall'].iloc[-1]:.3f}" + if not latest_model_row.empty and pd.notna(latest_model_row["recall"].iloc[-1]) + else "n/a", ) fig = go.Figure() + scenario_totals = ( + perf_entries[perf_entries["topic_family"] == "Perception Performance"] + .groupby("version", dropna=False)["data_count_num"] + .max() + .reindex(perf_entries["version"].drop_duplicates().tolist()) + ) fig.add_bar( - x=perf_entries["version"], - y=perf_entries["data_count_num"], + x=scenario_totals.index.tolist(), + y=scenario_totals.tolist(), name="Data Count", marker_color="#f4a7a7", - opacity=0.5, + opacity=0.28, yaxis="y2", + hovertemplate="%{x}
Data Count: %{y:,}", ) metric_styles = { - "mAP": {"color": "#0f766e", "dash": "solid"}, - "precision": {"color": "#1d4ed8", "dash": "solid"}, - "recall": {"color": "#be123c", "dash": "dot"}, + "mAP": "#0f766e", + "precision": "#1d4ed8", + "recall": "#be123c", } - for metric_col in major_metric_cols: - fig.add_trace( - go.Scatter( - x=perf_entries["version"], - y=perf_entries[metric_col], - name=metric_col, - mode="lines+markers", - line=dict( - color=metric_styles[metric_col]["color"], - width=3, - dash=metric_styles[metric_col]["dash"], - ), - customdata=perf_entries[["release_name", "date", "data_count"]].to_numpy(), - hovertemplate=( - "%{x}
" - + metric_col - + ": %{y:.3f}
Release: %{customdata[0]}
Date: %{customdata[1]}
Data Count: %{customdata[2]}" - ), + family_dashes = { + "Perception Performance": "solid", + "ML Model Performance": "dot", + } + for family in ("Perception Performance", "ML Model Performance"): + family_df = perf_entries[perf_entries["topic_family"] == family].copy() + if family_df.empty: + continue + for metric_col in major_metric_cols: + metric_df_for_line = family_df.dropna(subset=[metric_col]) + if metric_df_for_line.empty: + continue + fig.add_trace( + go.Scatter( + x=metric_df_for_line["version"], + y=metric_df_for_line[metric_col], + name=metric_col, + legendgroup=family, + legendgrouptitle_text=family, + mode="lines+markers", + line=dict( + color=metric_styles[metric_col], + width=3, + dash=family_dashes.get(family, "solid"), + ), + marker=dict(size=7), + customdata=metric_df_for_line[["release_name", "date", "data_count", "topic_name"]].to_numpy(), + hovertemplate=( + "%{x}
" + + f"{family} {metric_col}" + + ": %{y:.3f}
Release: %{customdata[0]}
Date: %{customdata[1]}
Data Count: %{customdata[2]}
Topic: %{customdata[3]}" + ), + ) ) - ) fig.update_layout( - title="Major Detection Metrics Trend", + title="Major Performance Scores", xaxis_title="Pilot.Auto Version", yaxis_title="Score", yaxis2=dict(title="Data Count", overlaying="y", side="right", showgrid=False), - height=460, - legend=dict(orientation="h", yanchor="bottom", y=0.94, x=0, xanchor="left"), - margin=dict(l=20, r=20, t=90, b=20), + height=520, + legend=dict(orientation="h", yanchor="top", y=-0.18, x=0, xanchor="left"), + legend_tracegroupgap=18, + margin=dict(l=20, r=20, t=80, b=125), ) st.plotly_chart(fig, use_container_width=True) else: @@ -748,9 +1637,12 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame section_header("Prediction Trend") -if not perf_entries.empty and perf_entries[prediction_cols].notna().any().any(): +prediction_entries = perf_entries[perf_entries["topic_family"] == "Perception Performance"].copy() +prediction_entries = prediction_entries.sort_values(["date_sort", "version", "release_name"], ascending=[True, True, True]) + +if not prediction_entries.empty and prediction_entries[prediction_cols].notna().any().any(): pred_card_col1, pred_card_col2, pred_card_col3 = st.columns(3) - latest_pred_row = perf_entries.dropna(subset=prediction_cols, how="all").iloc[-1] + latest_pred_row = prediction_entries.dropna(subset=prediction_cols, how="all").iloc[-1] latest_minade_mean = pd.to_numeric(latest_pred_row[["minADE@1s", "minADE@3s", "minADE@5s"]], errors="coerce").mean() latest_minfde_mean = pd.to_numeric(latest_pred_row[["minFDE@1s", "minFDE@3s", "minFDE@5s"]], errors="coerce").mean() pred_card_col1.metric( @@ -765,7 +1657,9 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame "Latest Data Count", f"{int(latest_pred_row['data_count_num']):,}" if pd.notna(latest_pred_row["data_count_num"]) else "n/a", ) - pred_story = perf_entries[["version", "date", "description", "release_name", "data_count", "data_count_num"] + 
prediction_cols].copy() + pred_story = prediction_entries[ + ["version", "date", "description", "release_name", "data_count", "data_count_num"] + prediction_cols + ].copy() pred_fig = go.Figure() pred_fig.add_bar( x=pred_story["version"], @@ -785,15 +1679,18 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame ("minFDE@5s", "#bfdbfe", "dot"), ] for metric_name, color, dash in series_specs: + metric_story = pred_story.dropna(subset=[metric_name]) + if metric_story.empty: + continue pred_fig.add_trace( go.Scatter( - x=pred_story["version"], - y=pred_story[metric_name], + x=metric_story["version"], + y=metric_story[metric_name], name=metric_name, mode="lines+markers", line=dict(color=color, width=3 if metric_name.endswith("@3s") else 2, dash=dash), marker=dict(size=8), - customdata=pred_story[["date", "release_name", "data_count"]].to_numpy(), + customdata=metric_story[["date", "release_name", "data_count"]].to_numpy(), hovertemplate=( "%{x}
" + metric_name @@ -807,8 +1704,8 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame yaxis_title="Prediction Error (m)", yaxis2=dict(title="Data Count", overlaying="y", side="right", showgrid=False), height=480, - legend=dict(orientation="h", yanchor="bottom", y=0.94, x=0, xanchor="left"), - margin=dict(l=20, r=20, t=100, b=20), + legend=dict(orientation="h", yanchor="top", y=-0.18, x=0, xanchor="left"), + margin=dict(l=20, r=20, t=80, b=105), plot_bgcolor="#ffffff", paper_bgcolor="#ffffff", ) @@ -839,31 +1736,39 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame ["date_sort", "version", "release_name"], ascending=[True, True, True], ) -ordered_versions = pass_entries["version"].drop_duplicates().tolist() +if not pass_entries.empty: + pass_entries = pass_entries.copy() + pass_entries["pass_axis"] = pass_entries["version"].astype(str) + " | " + pass_entries["date"].astype(str) +ordered_versions = pass_entries["pass_axis"].drop_duplicates().tolist() if not pass_entries.empty else [] overall_plot_df = pd.DataFrame() major_summary = pd.DataFrame() mid_summary = pd.DataFrame() if not pass_entries.empty and pass_entries["overall_pass_rate"].notna().any(): overall_plot_df = pass_entries[ - ["version", "date", "release_name", "overall_pass_rate", "scenario_count"] + ["pass_axis", "date", "release_name", "overall_pass_rate", "scenario_count"] ].rename(columns={"overall_pass_rate": "pass_rate", "scenario_count": "total"}).copy() + overall_plot_df = overall_plot_df.rename(columns={"pass_axis": "version"}) if not case_df.empty: + case_for_pass = case_df.copy() + case_for_pass["pass_axis"] = case_for_pass["version"].astype(str) + " | " + case_for_pass["date"].astype(str) major_summary = ( - case_df.groupby(["version", "date", "release_name", "major_category"], dropna=False)[["passed", "total"]] + case_for_pass.groupby(["pass_axis", "date", "release_name", "major_category"], dropna=False)[["passed", "total"]] 
.sum() .reset_index() + .rename(columns={"pass_axis": "version"}) ) major_summary = _with_pass_rate(major_summary) mid_summary = ( - case_df.groupby( - ["version", "date", "release_name", "major_category", "mid_category"], + case_for_pass.groupby( + ["pass_axis", "date", "release_name", "major_category", "mid_category"], dropna=False, )[["passed", "total"]] .sum() .reset_index() + .rename(columns={"pass_axis": "version"}) ) mid_summary = _with_pass_rate(mid_summary) @@ -1207,3 +2112,5 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame json.dumps(selected_group.jobs[role_choice]["summary"], ensure_ascii=False, indent=2)[:30000], language="json", ) + +_render_release_trend_builder() diff --git a/evaluation_dashboard_app/scripts/import_catalog_analyzer_releases.py b/evaluation_dashboard_app/scripts/import_catalog_analyzer_releases.py new file mode 100644 index 0000000..eea3b1a --- /dev/null +++ b/evaluation_dashboard_app/scripts/import_catalog_analyzer_releases.py @@ -0,0 +1,501 @@ +#!/usr/bin/env python3 +"""Import perception_catalog_analyzer release exports into dashboard data. + +This script converts release data generated directly by +perception_catalog_analyzer into the dashboard's release/trend structure. 
+ +Expected source layout: + + perception_catalog_analyzer_output/ + export/ + / + metadata.yaml + current.parquet + future.parquet + devops.parquet + detection.yaml + pdf/ + / + / + / + metadata.yaml + summary.json + specsheet/ + specsheet.pdf + +Here is usually a joined list of evaluator job IDs, for example: + + __ + +Generated dashboard layout: + + data/ + release_spec_/ + metadata.yaml + performance/ + metadata.yaml + resources/summary.json + current.parquet + future.parquet + detection.yaml + usecase/ + metadata.yaml + resources/summary.json + devops/ + metadata.yaml + resources/summary.json + current.parquet + specsheet/ + specsheet.pdf + /specsheet.pdf + + trend_release_/ + / + / + metadata.yaml + summary.json + specsheet/specsheet.pdf + + static/ + release_specs/ + / + .pdf + +By default, large artifacts such as parquet/PDF/HTML/PNG are symlinked to avoid +duplicating very large analyzer output. Use --copy-large-artifacts when the +original analyzer output may be removed or unavailable from the server. + +Common usage: + + cd /path/to/evaluation_dashboard_app + python scripts/import_catalog_analyzer_releases.py \\ + --source /path/to/perception_catalog_analyzer_output \\ + --data-root /path/to/dashboard/data \\ + --force + +Production/server usage when source data should not remain mounted: + + python scripts/import_catalog_analyzer_releases.py \\ + --source /mnt/catalog_analyzer_output \\ + --data-root /srv/eval_dashboard/data \\ + --copy-large-artifacts \\ + --force + +After import, make sure the app serves static PDFs from static/. In this app's +Docker setup, static/ is mounted into /app/static and Streamlit static serving +is enabled. 
+""" + +from __future__ import annotations + +import argparse +import json +import os +import shutil +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import yaml + + +ANALYZER_ROOT = Path("/home/leigu/Downloads/perception_catalog_analyzer_output") +MAIN_TOPIC = "perception.object_recognition.objects" +ROLE_DIR_BY_SUMMARY_ROLE = { + "full": "performance", + "usecase": "usecase", + "devops": "devops", + "performance_blocks": "performance", + "unknown": "unknown", +} +DEFAULT_PROJECT_ID = "x2_dev" +SUMMARY_FULL_HEADER = "全数データセット評価" +SUMMARY_USECASE_HEADER = "ユースケース評価" +LARGE_SUFFIXES = {".parquet", ".html", ".png"} + + +@dataclass(frozen=True) +class ImportStats: + releases: int = 0 + trend_jobs: int = 0 + role_runs: int = 0 + linked: int = 0 + copied: int = 0 + skipped: int = 0 + + def add(self, **kwargs: int) -> "ImportStats": + values = self.__dict__.copy() + for key, value in kwargs.items(): + values[key] = int(values.get(key, 0)) + value + return ImportStats(**values) + + +def _data_root() -> Path: + raw = os.environ.get("EVAL_DASHBOARD_DATA_ROOT", "data") + root = Path(raw) + if not root.is_absolute(): + root = Path.cwd() / root + root.mkdir(parents=True, exist_ok=True) + return root.resolve() + + +def _safe_path_part(value: str, fallback: str) -> str: + import re + + text = re.sub(r"[^\w.\-]+", "_", str(value or "")).strip("._") + return text or fallback + + +def _load_json(path: Path) -> dict[str, Any]: + with path.open("r", encoding="utf-8") as fh: + data = json.load(fh) + return data if isinstance(data, dict) else {} + + +def _load_yaml(path: Path) -> dict[str, Any]: + if not path.exists(): + return {} + with path.open("r", encoding="utf-8") as fh: + data = yaml.safe_load(fh) or {} + return data if isinstance(data, dict) else {} + + +def _classify_summary(summary: dict[str, Any]) -> str: + blocks = summary.get("blocks") + if isinstance(blocks, list): + headers = [str(block.get("header") or "") for block in blocks if 
isinstance(block, dict)] + if SUMMARY_FULL_HEADER in headers: + return "full" + if SUMMARY_USECASE_HEADER in headers: + return "usecase" + return "performance_blocks" + if summary: + return "devops" + return "unknown" + + +def _write_yaml(path: Path, payload: dict[str, Any]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as fh: + yaml.safe_dump(payload, fh, allow_unicode=True, sort_keys=False) + + +def _copy_or_link(src: Path, dst: Path, *, copy_large_artifacts: bool, force: bool) -> str: + dst.parent.mkdir(parents=True, exist_ok=True) + if dst.exists() or dst.is_symlink(): + if not force: + return "skipped" + if dst.is_dir() and not dst.is_symlink(): + shutil.rmtree(dst) + else: + dst.unlink() + + should_link = src.suffix.lower() in LARGE_SUFFIXES and not copy_large_artifacts + if should_link: + os.symlink(src.resolve(), dst) + return "linked" + shutil.copy2(src, dst) + return "copied" + + +def _publish_static_pdf(pdf_path: Path, static_pdf_path: Path, *, force: bool) -> str: + static_pdf_path.parent.mkdir(parents=True, exist_ok=True) + if static_pdf_path.exists() or static_pdf_path.is_symlink(): + if not force: + return "skipped" + static_pdf_path.unlink() + source = pdf_path.resolve() + try: + os.link(source, static_pdf_path) + except OSError: + shutil.copy2(source, static_pdf_path) + return "copied" + + +def _artifact_stat(stats: ImportStats, action: str) -> ImportStats: + if action == "linked": + return stats.add(linked=1) + if action == "copied": + return stats.add(copied=1) + if action == "skipped": + return stats.add(skipped=1) + return stats + + +def _merge_metadata(base: dict[str, Any], *, group_name: str, topic_name: str, job_id: str, role: str) -> dict[str, Any]: + evaluator_info = base.get("evaluator_info") if isinstance(base.get("evaluator_info"), dict) else {} + catalog = evaluator_info.get("catalog") if isinstance(evaluator_info.get("catalog"), dict) else {} + source = evaluator_info.get("event", 
{}).get("source", {}) if isinstance(evaluator_info.get("event"), dict) else {} + project_id = str(base.get("project_id") or DEFAULT_PROJECT_ID).strip() + merged = { + key: base.get(key) + for key in ( + "tags", + "pilot_auto_version", + "version_abbr", + "data_count", + "description", + "date", + ) + if base.get(key) not in (None, "") + } + if catalog: + merged["catalog_display_name"] = catalog.get("display_name") + merged["catalog_id"] = catalog.get("id") + merged["catalog_version_id"] = catalog.get("version_id") + if isinstance(source, dict): + for key in ("git_commit_url", "git_ref", "git_commit_date"): + if source.get(key): + merged[key] = source.get(key) + merged["release_group"] = group_name + merged["topic_name"] = topic_name + merged["job_id"] = job_id + merged["project_id"] = project_id + merged["role"] = role + merged["imported_from"] = str(ANALYZER_ROOT) + return merged + + +def _copy_export_job( + export_root: Path, + job_id: str, + target_dir: Path, + *, + group_name: str, + topic_name: str, + role: str, + copy_large_artifacts: bool, + force: bool, + stats: ImportStats, +) -> ImportStats: + source_dir = export_root / job_id + if not source_dir.is_dir(): + return stats + + source_metadata = _load_yaml(source_dir / "metadata.yaml") + metadata = _merge_metadata( + source_metadata, + group_name=group_name, + topic_name=topic_name, + job_id=job_id, + role=role, + ) + _write_yaml(target_dir / "metadata.yaml", metadata) + stats = stats.add(copied=1) + + for file_name in ("current.parquet", "future.parquet", "devops.parquet", "detection.yaml"): + src = source_dir / file_name + if not src.exists(): + continue + action = _copy_or_link(src, target_dir / file_name, copy_large_artifacts=copy_large_artifacts, force=force) + stats = _artifact_stat(stats, action) + return stats + + +def _copy_summary_job( + job_dir: Path, + target_dir: Path, + *, + group_name: str, + topic_name: str, + job_id: str, + role: str, + force: bool, + stats: ImportStats, +) -> ImportStats: + 
resources = target_dir / "resources" + resources.mkdir(parents=True, exist_ok=True) + metadata = _load_yaml(job_dir / "metadata.yaml") + metadata = _merge_metadata(metadata, group_name=group_name, topic_name=topic_name, job_id=job_id, role=role) + _write_yaml(resources / "metadata.yaml", metadata) + stats = stats.add(copied=1) + + for src, dst in ( + (job_dir / "summary.json", resources / "summary.json"), + (job_dir / "summary.json", target_dir / "summary.json"), + ): + if src.exists(): + action = _copy_or_link(src, dst, copy_large_artifacts=True, force=force) + stats = _artifact_stat(stats, action) + return stats + + +def import_releases( + analyzer_root: Path, + data_root: Path, + *, + copy_large_artifacts: bool, + force: bool, +) -> ImportStats: + export_root = analyzer_root / "export" + pdf_root = analyzer_root / "pdf" + static_root = Path.cwd() / "static" / "release_specs" + stats = ImportStats() + + if not export_root.is_dir() or not pdf_root.is_dir(): + raise FileNotFoundError(f"Expected export/ and pdf/ under {analyzer_root}") + + for pdf_group_dir in sorted(path for path in pdf_root.iterdir() if path.is_dir()): + group_name = pdf_group_dir.name + release_dir = data_root / f"release_spec_{_safe_path_part(group_name, 'release')}" + trend_dir = data_root / f"trend_release_{_safe_path_part(group_name, 'release')}" + release_dir.mkdir(parents=True, exist_ok=True) + stats = stats.add(releases=1) + + release_metadata_written = False + for topic_dir in sorted(path for path in pdf_group_dir.iterdir() if path.is_dir()): + topic_name = topic_dir.name + topic_safe = _safe_path_part(topic_name, "topic") + trend_topic_dir = trend_dir / topic_name + trend_topic_dir.mkdir(parents=True, exist_ok=True) + + specsheet_pdf = topic_dir / "specsheet" / "specsheet.pdf" + if specsheet_pdf.exists(): + static_pdf_path = ( + static_root + / _safe_path_part(group_name, "release") + / f"{_safe_path_part(topic_name, 'topic')}.pdf" + ) + action = _copy_or_link( + specsheet_pdf, + 
release_dir / "specsheet" / topic_safe / "specsheet.pdf", + copy_large_artifacts=copy_large_artifacts, + force=force, + ) + stats = _artifact_stat(stats, action) + action = _copy_or_link( + specsheet_pdf, + trend_topic_dir / "specsheet" / "specsheet.pdf", + copy_large_artifacts=copy_large_artifacts, + force=force, + ) + stats = _artifact_stat(stats, action) + if topic_name == MAIN_TOPIC: + action = _copy_or_link( + specsheet_pdf, + release_dir / "specsheet" / "specsheet.pdf", + copy_large_artifacts=copy_large_artifacts, + force=force, + ) + stats = _artifact_stat(stats, action) + action = _publish_static_pdf(specsheet_pdf, static_pdf_path, force=force) + stats = _artifact_stat(stats, action) + + for job_dir in sorted(path for path in topic_dir.iterdir() if path.is_dir()): + if job_dir.name in {"trend", "specsheet"}: + continue + summary_path = job_dir / "summary.json" + if not summary_path.exists(): + continue + job_id = job_dir.name + role = _classify_summary(_load_json(summary_path)) + role_dir_name = ROLE_DIR_BY_SUMMARY_ROLE.get(role, role) + + trend_job_dir = trend_topic_dir / job_id + trend_job_dir.mkdir(parents=True, exist_ok=True) + for src_name in ("summary.json", "metadata.yaml"): + src = job_dir / src_name + if not src.exists(): + continue + if src_name == "metadata.yaml": + metadata = _merge_metadata( + _load_yaml(src), + group_name=group_name, + topic_name=topic_name, + job_id=job_id, + role=role, + ) + _write_yaml(trend_job_dir / src_name, metadata) + stats = stats.add(copied=1) + else: + action = _copy_or_link(src, trend_job_dir / src_name, copy_large_artifacts=True, force=force) + stats = _artifact_stat(stats, action) + stats = stats.add(trend_jobs=1) + + if topic_name != MAIN_TOPIC: + continue + role_dir = release_dir / role_dir_name + stats = _copy_export_job( + export_root, + job_id, + role_dir, + group_name=group_name, + topic_name=topic_name, + role=role, + copy_large_artifacts=copy_large_artifacts, + force=force, + stats=stats, + ) + stats = 
_copy_summary_job( + job_dir, + role_dir, + group_name=group_name, + topic_name=topic_name, + job_id=job_id, + role=role, + force=force, + stats=stats, + ) + stats = stats.add(role_runs=1) + + if not release_metadata_written and role in {"full", "performance_blocks"}: + metadata = _merge_metadata( + _load_yaml(job_dir / "metadata.yaml"), + group_name=group_name, + topic_name=topic_name, + job_id=job_id, + role=role, + ) + _write_yaml(release_dir / "metadata.yaml", metadata) + release_metadata_written = True + stats = stats.add(copied=1) + + if not release_metadata_written: + _write_yaml(release_dir / "metadata.yaml", {"release_group": group_name, "imported_from": str(analyzer_root)}) + stats = stats.add(copied=1) + + return stats + + +def main() -> int: + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--source", + type=Path, + default=ANALYZER_ROOT, + help="Analyzer output root containing export/ and pdf/. Default: %(default)s", + ) + parser.add_argument( + "--data-root", + type=Path, + default=None, + help="Dashboard data root. Defaults to EVAL_DASHBOARD_DATA_ROOT or ./data.", + ) + parser.add_argument( + "--copy-large-artifacts", + action="store_true", + help=( + "Copy parquet/PDF/PNG/HTML instead of symlinking them. Use this on servers " + "when the original analyzer output will not stay mounted." 
+ ), + ) + parser.add_argument("--force", action="store_true", help="Replace existing imported files and links.") + args = parser.parse_args() + + data_root = args.data_root.resolve() if args.data_root is not None else _data_root() + stats = import_releases( + args.source.resolve(), + data_root, + copy_large_artifacts=args.copy_large_artifacts, + force=args.force, + ) + print(json.dumps(stats.__dict__, indent=2, ensure_ascii=False)) + print(f"Imported analyzer releases into {data_root}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From 9c895e9d339e194ed0252f7ca7d4897dfb964de8 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Wed, 3 Jun 2026 09:31:58 +0900 Subject: [PATCH 91/94] feat: add large file handling options to workflow - Introduced configuration options for skipping large files during the workflow process, with a default threshold of 50 MB. - Updated the user interface to reflect the new option for skipping large files, enhancing user control over file processing. - Adjusted the workflow logic to incorporate the new large file handling settings, ensuring consistent behavior in release mode. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/pages/6_Workflow.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/evaluation_dashboard_app/pages/6_Workflow.py b/evaluation_dashboard_app/pages/6_Workflow.py index 7f6aa98..33e9816 100644 --- a/evaluation_dashboard_app/pages/6_Workflow.py +++ b/evaluation_dashboard_app/pages/6_Workflow.py @@ -74,6 +74,8 @@ _RELEASE_DEVOPS_CATALOG_ID = "ab0f8498-cc1b-4726-836f-e18e8bcb3200" _RELEASE_DEVOPS_INTEGRATION_ID = "295cff78-9bc9-4d60-b7aa-f95be6ff96a4" _RELEASE_OPTIONAL_CATALOG_ID = "09039022-ec91-41bf-9e93-fdefccdfc9bc" +_RELEASE_SKIP_LARGE_FILE = True +_RELEASE_LARGE_FILE_MB = 50.0 _RELEASE_TREND_TOPIC_OPTIONS = { "Prediction / object recognition": DEFAULT_TREND_TOPIC, "ML model / CenterPoint": DETECTION_TREND_TOPIC_BY_MODEL["centerpoint"], @@ -2225,10 +2227,14 @@ def _render_start_workflow_form( with option_cols[2]: skip_large_file = st.checkbox( "Skip large files", - value=False if release_mode else default_skip_large_file, + value=_RELEASE_SKIP_LARGE_FILE if release_mode else default_skip_large_file, key="workflow_skip_large_file", disabled=release_mode, - help="Release mode keeps the standard release artifacts needed for analysis.", + help=( + f"Release mode always skips archives at or above {_RELEASE_LARGE_FILE_MB:g} MB." + if release_mode + else "Skip unusually large archives during download." 
+ ), ) with option_cols[3]: eval_recursive = st.checkbox( @@ -2307,7 +2313,7 @@ def _render_start_workflow_form( "max_wait_hours": int(max_wait_hours), "run_eval": False if release_mode else bool(run_eval), "generate_parquet": False if release_mode else bool(generate_parquet), - "skip_large_file": False if release_mode else bool(skip_large_file), + "skip_large_file": _RELEASE_SKIP_LARGE_FILE if release_mode else bool(skip_large_file), "eval_recursive": False if release_mode else bool(eval_recursive), "release_mode": bool(release_mode), "trend_metadata": trend_metadata if release_mode else {}, @@ -2434,6 +2440,8 @@ def _render_start_workflow_controls(*, key_suffix: str = "dialog") -> None: "optional_catalog_id": dialog_payload.get("optional_catalog_id", ""), "optional_job_id": dialog_payload.get("optional_job_id", ""), "analysis_phase": "perception.object_recognition.tracking.objects", + "skip_large_file": _RELEASE_SKIP_LARGE_FILE, + "large_file_mb": _RELEASE_LARGE_FILE_MB, "run_eval": bool(dialog_payload.get("run_eval", False)), "overwrite": True, }, From 4cee24a70d6a49909a865f47c6e647572b8014bd Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Wed, 3 Jun 2026 10:11:36 +0900 Subject: [PATCH 92/94] feat: enhance PDF link rendering and static PDF handling in import script - Updated the Trend Insights page to render PDF links within a new link chip layout for improved visual presentation. - Adjusted CSS styles to enhance the layout of link chips and added a new class for better alignment. - Modified the import catalog analyzer script to support optional static PDF publishing, allowing users to specify a writable directory for static files. - Added command-line arguments for static root directory and an option to skip static publishing, improving flexibility in deployment scenarios. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../pages/13_Trend_Insights.py | 13 ++++- .../import_catalog_analyzer_releases.py | 55 ++++++++++++++++--- evaluation_dashboard_app/worker/tasks.py | 18 +++++- 3 files changed, 73 insertions(+), 13 deletions(-) diff --git a/evaluation_dashboard_app/pages/13_Trend_Insights.py b/evaluation_dashboard_app/pages/13_Trend_Insights.py index 056ecb1..f9cb848 100644 --- a/evaluation_dashboard_app/pages/13_Trend_Insights.py +++ b/evaluation_dashboard_app/pages/13_Trend_Insights.py @@ -249,7 +249,7 @@ def _pdf_links_for_prefix(release: dict[str, Any], prefix: str) -> str: label = topic.replace("perception.object_recognition.detection.", "").replace(".objects", "") label = label.replace("bevfusion", "BEVFusion").replace("centerpoint", "CenterPoint") links.append(_html_link(str(pdf.get("static_url") or ""), label, "pdf")) - return "
".join(links) if links else '-' + return '' + "".join(links) + "" if links else '-' def _has_pdf_for_prefix(release: dict[str, Any], prefix: str) -> bool: @@ -333,7 +333,7 @@ def _render_release_library_table(releases: list[dict[str, Any]]) -> None: }} body {{ margin: 0; - padding: 0; + padding: 0 0 10px 0; background: transparent; color: #0f172a; font-family: "Source Sans Pro", system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif; @@ -462,6 +462,13 @@ def _render_release_library_table(releases: list[dict[str, Any]]) -> None: text-decoration: none; border: 1px solid transparent; }} +.link-chip-row {{ + display: inline-flex; + align-items: center; + justify-content: center; + gap: 0.22rem; + flex-wrap: nowrap; +}} .link-chip-overview {{ color: #1d4ed8; background: #eff6ff; @@ -554,7 +561,7 @@ def _render_release_library_table(releases: list[dict[str, Any]]) -> None: """ - component_height = 76 + max(1, len(releases)) * 34 + component_height = 96 + max(1, len(releases)) * 39 components.html(table_html, height=component_height, scrolling=False) diff --git a/evaluation_dashboard_app/scripts/import_catalog_analyzer_releases.py b/evaluation_dashboard_app/scripts/import_catalog_analyzer_releases.py index eea3b1a..11d26e9 100644 --- a/evaluation_dashboard_app/scripts/import_catalog_analyzer_releases.py +++ b/evaluation_dashboard_app/scripts/import_catalog_analyzer_releases.py @@ -84,6 +84,13 @@ After import, make sure the app serves static PDFs from static/. In this app's Docker setup, static/ is mounted into /app/static and Streamlit static serving is enabled. + +If the app directory is read-only on a server, either: + + - pass --static-root /writable/path/release_specs and mount that path as + /app/static/release_specs, or + - pass --skip-static-publish to import data only. PDF files are still copied + into data/release_spec_*/specsheet and data/trend_release_*/specsheet. 
""" from __future__ import annotations @@ -92,6 +99,7 @@ import json import os import shutil +import sys from dataclasses import dataclass from pathlib import Path from typing import Any @@ -324,17 +332,29 @@ def import_releases( analyzer_root: Path, data_root: Path, *, + static_root: Path | None, copy_large_artifacts: bool, force: bool, ) -> ImportStats: export_root = analyzer_root / "export" pdf_root = analyzer_root / "pdf" - static_root = Path.cwd() / "static" / "release_specs" stats = ImportStats() if not export_root.is_dir() or not pdf_root.is_dir(): raise FileNotFoundError(f"Expected export/ and pdf/ under {analyzer_root}") + if static_root is not None: + try: + static_root.mkdir(parents=True, exist_ok=True) + except PermissionError as exc: + print( + f"Warning: cannot write static PDF directory {static_root}: {exc}. " + "Continuing without static PDF publishing. Use --static-root with a writable path, " + "fix directory ownership, or pass --skip-static-publish.", + file=sys.stderr, + ) + static_root = None + for pdf_group_dir in sorted(path for path in pdf_root.iterdir() if path.is_dir()): group_name = pdf_group_dir.name release_dir = data_root / f"release_spec_{_safe_path_part(group_name, 'release')}" @@ -351,11 +371,6 @@ def import_releases( specsheet_pdf = topic_dir / "specsheet" / "specsheet.pdf" if specsheet_pdf.exists(): - static_pdf_path = ( - static_root - / _safe_path_part(group_name, "release") - / f"{_safe_path_part(topic_name, 'topic')}.pdf" - ) action = _copy_or_link( specsheet_pdf, release_dir / "specsheet" / topic_safe / "specsheet.pdf", @@ -378,8 +393,14 @@ def import_releases( force=force, ) stats = _artifact_stat(stats, action) - action = _publish_static_pdf(specsheet_pdf, static_pdf_path, force=force) - stats = _artifact_stat(stats, action) + if static_root is not None: + static_pdf_path = ( + static_root + / _safe_path_part(group_name, "release") + / f"{_safe_path_part(topic_name, 'topic')}.pdf" + ) + action = 
_publish_static_pdf(specsheet_pdf, static_pdf_path, force=force) + stats = _artifact_stat(stats, action) for job_dir in sorted(path for path in topic_dir.iterdir() if path.is_dir()): if job_dir.name in {"trend", "specsheet"}: @@ -482,13 +503,31 @@ def main() -> int: "when the original analyzer output will not stay mounted." ), ) + parser.add_argument( + "--static-root", + type=Path, + default=None, + help=( + "Directory for static PDF copies. Defaults to ./static/release_specs. " + "Use a writable path on servers and mount it as /app/static/release_specs." + ), + ) + parser.add_argument( + "--skip-static-publish", + action="store_true", + help="Do not write static/release_specs PDF copies. Data/specsheet PDFs are still imported.", + ) parser.add_argument("--force", action="store_true", help="Replace existing imported files and links.") args = parser.parse_args() data_root = args.data_root.resolve() if args.data_root is not None else _data_root() + static_root = None + if not args.skip_static_publish: + static_root = (args.static_root if args.static_root is not None else Path.cwd() / "static" / "release_specs").resolve() stats = import_releases( args.source.resolve(), data_root, + static_root=static_root, copy_large_artifacts=args.copy_large_artifacts, force=args.force, ) diff --git a/evaluation_dashboard_app/worker/tasks.py b/evaluation_dashboard_app/worker/tasks.py index 2fdeba4..c72f75c 100644 --- a/evaluation_dashboard_app/worker/tasks.py +++ b/evaluation_dashboard_app/worker/tasks.py @@ -39,6 +39,8 @@ _RELEASE_DEVOPS_CATALOG_ID = "ab0f8498-cc1b-4726-836f-e18e8bcb3200" _RELEASE_DEVOPS_INTEGRATION_ID = "295cff78-9bc9-4d60-b7aa-f95be6ff96a4" _RELEASE_OPTIONAL_CATALOG_ID = "09039022-ec91-41bf-9e93-fdefccdfc9bc" +_RELEASE_SKIP_LARGE_FILE = True +_RELEASE_LARGE_FILE_MB = 50.0 # Optional imports for tasks that need them def _import_eval_summary(): @@ -1339,6 +1341,8 @@ def _build_release_analysis_artifacts( output_path: Path, phase: str, run_eval: bool = False, + 
skip_large_file: bool = _RELEASE_SKIP_LARGE_FILE, + large_file_mb: float = _RELEASE_LARGE_FILE_MB, progress_start: float = 48.0, progress_end: float = 78.0, ) -> Dict[str, Any]: @@ -1355,6 +1359,8 @@ def _build_release_analysis_artifacts( "parquet_path": "", "warnings": [], } + effective_skip_large_file = _RELEASE_SKIP_LARGE_FILE or bool(skip_large_file) + effective_large_file_mb = float(large_file_mb or _RELEASE_LARGE_FILE_MB) progress_span = max(0.0, progress_end - progress_start) download_end = progress_start + progress_span * 0.55 @@ -1396,8 +1402,8 @@ def _on_warning(msg: str) -> None: output_path=str(output_path), download_type="archives", phase=phase, - skip_large_file=False, - large_file_mb=50.0, + skip_large_file=effective_skip_large_file, + large_file_mb=effective_large_file_mb, keep_zip_files=False, suite_ids=None, on_progress=_on_progress, @@ -1410,6 +1416,8 @@ def _on_warning(msg: str) -> None: "total": total_attempted, "success": success_count, "failed": failure_count, + "skip_large_file": effective_skip_large_file, + "large_file_mb": effective_large_file_mb, "rows": rows[:100], } @@ -1545,6 +1553,8 @@ def job_run_release_specsheet_workflow(task_id: str, parameters: Dict[str, Any]) parameters.get("analysis_phase") or "perception.object_recognition.tracking.objects" ).strip() + skip_large_file = _RELEASE_SKIP_LARGE_FILE + large_file_mb = float(parameters.get("large_file_mb") or _RELEASE_LARGE_FILE_MB) labels = parameters.get("labels") or DEFAULT_SPECSHEET_LABELS labels = [str(label).strip() for label in labels if str(label).strip()] if not labels: @@ -1719,6 +1729,8 @@ def _on_check(status: str, elapsed: float, *, role: str = str(item["role"]), pct output_path=analysis_path, phase=analysis_phase, run_eval=bool(parameters.get("run_eval", False)), + skip_large_file=skip_large_file, + large_file_mb=large_file_mb, progress_start=48 + (artifact_span * artifact_idx), progress_end=48 + (artifact_span * (artifact_idx + 1)), ) @@ -1733,6 +1745,8 @@ def 
_on_check(status: str, elapsed: float, *, role: str = str(item["role"]), pct "job_id": item["job_id"], "download_type": "archives", "phase": analysis_phase, + "skip_large_file": skip_large_file, + "large_file_mb": large_file_mb, "run_eval": bool(parameters.get("run_eval", False)), "generate_parquet": True, "eval_recursive": bool(parameters.get("run_eval", False)), From 6e8590bc5c14867fc13ee78ada9424ded341f647 Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Wed, 3 Jun 2026 10:26:38 +0900 Subject: [PATCH 93/94] feat: update Trend Insights page layout and debug options - Adjusted column widths and minimum table width for improved layout in the release library table. - Enhanced CSS styles to support text overflow handling and ensure better readability. - Modified component height calculations for dynamic display based on the number of releases. - Reintroduced a debug expander for release inventory paths, allowing users to view detailed dataframes. Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- .../pages/13_Trend_Insights.py | 38 ++++++++++++------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/evaluation_dashboard_app/pages/13_Trend_Insights.py b/evaluation_dashboard_app/pages/13_Trend_Insights.py index f9cb848..f4d5235 100644 --- a/evaluation_dashboard_app/pages/13_Trend_Insights.py +++ b/evaluation_dashboard_app/pages/13_Trend_Insights.py @@ -267,7 +267,7 @@ def _render_release_library_table(releases: list[dict[str, Any]]) -> None: ("Specsheet PDF", 2), ("Evaluator Job", 3), ] - col_widths = [250, 82, 180, 76, 88, 88, 88, 110, 110, 88, 88, 88] + col_widths = [360, 96, 240, 92, 96, 96, 96, 128, 168, 96, 96, 96] headers = [ "Version", "Date", @@ -352,7 +352,7 @@ def _render_release_library_table(releases: list[dict[str, Any]]) -> None: border-collapse: separate; border-spacing: 0; table-layout: fixed; - min-width: 1320px; + min-width: 1660px; width: 100%; font-size: 0.88rem; }} @@ -363,6 +363,7 @@ def 
_render_release_library_table(releases: list[dict[str, Any]]) -> None: text-align: left; vertical-align: middle; line-height: 1.22; + white-space: nowrap; }} .release-library-table th {{ background: #f8fafc; @@ -427,10 +428,11 @@ def _render_release_library_table(releases: list[dict[str, Any]]) -> None: }} .release-library-table td:nth-child(3) {{ color: #475569; + overflow: hidden; + text-overflow: ellipsis; }} .release-library-table td:nth-child(2), .release-library-table td:nth-child(4) {{ - white-space: nowrap; color: #475569; }} .release-library-table td:nth-child(n+5) {{ @@ -442,11 +444,9 @@ def _render_release_library_table(releases: list[dict[str, Any]]) -> None: .release-library-table td:nth-child(10), .release-library-table td:nth-child(11), .release-library-table td:nth-child(12) {{ - white-space: nowrap; }} .release-library-table td:nth-child(8), .release-library-table td:nth-child(9) {{ - white-space: nowrap; }} .link-chip {{ display: inline-flex; @@ -561,7 +561,7 @@ def _render_release_library_table(releases: list[dict[str, Any]]) -> None: """ - component_height = 96 + max(1, len(releases)) * 39 + component_height = 78 + max(1, len(releases)) * 32 components.html(table_html, height=component_height, scrolling=False) @@ -1007,12 +1007,21 @@ def _release_performance_table( if table_mode == "Colored bars": _render_release_performance_html_table(display_frame) else: - dataframe_height = 38 + max(1, len(display_frame)) * 35 + dataframe_height = 52 + max(1, len(display_frame)) * 36 + dataframe_column_config = { + "version": st.column_config.TextColumn("version", width="large"), + "description": st.column_config.TextColumn("description", width="medium"), + "full_job_id": st.column_config.TextColumn("full_job_id", width="large"), + "usecase_job_id": st.column_config.TextColumn("usecase_job_id", width="large"), + "devops_job_id": st.column_config.TextColumn("devops_job_id", width="large"), + "topic_name": st.column_config.TextColumn("topic_name", 
width="large"), + } st.dataframe( display_frame, width="stretch", hide_index=True, height=dataframe_height, + column_config={key: value for key, value in dataframe_column_config.items() if key in display_frame.columns}, ) @@ -1486,13 +1495,6 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame reverse=True, ) _render_release_library_table(release_specsheets) - - with st.expander("Debug release inventory paths", expanded=False): - st.dataframe( - pd.DataFrame(_release_inventory_debug_rows(release_specsheets)), - width="stretch", - hide_index=True, - ) else: st.info("No imported release library was found. Run `python scripts/import_catalog_analyzer_releases.py --force` to import analyzer output.") @@ -2121,3 +2123,11 @@ def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame ) _render_release_trend_builder() + +if release_specsheets: + with st.expander("Debug release inventory paths", expanded=False): + st.dataframe( + pd.DataFrame(_release_inventory_debug_rows(release_specsheets)), + width="stretch", + hide_index=True, + ) From f9b64ec9a4843441083e9318b77f565e6ef9174d Mon Sep 17 00:00:00 2001 From: "lei.gu" Date: Wed, 3 Jun 2026 11:16:23 +0900 Subject: [PATCH 94/94] feat: add internal trend release directory check and enhance bbox alignment query - Introduced a new function to identify internal trend release directories, improving directory filtering in the run listing process. - Added a method to infer external bounding box alignment query parameters based on vehicle dimensions, enhancing compatibility with legacy data formats. - Updated the T4 3D Viewer to utilize the new bbox alignment query, ensuring accurate rendering of vehicle dimensions. 
Signed-off-by: [Your Name] <[Your Email]> Signed-off-by: lei.gu --- evaluation_dashboard_app/lib/path_utils.py | 6 +++ .../lib/t4_three_layers.py | 43 +++++++++++++++++++ .../pages/5_T4_3D_Viewer.py | 2 + 3 files changed, 51 insertions(+) diff --git a/evaluation_dashboard_app/lib/path_utils.py b/evaluation_dashboard_app/lib/path_utils.py index 4d43554..5fa77a8 100644 --- a/evaluation_dashboard_app/lib/path_utils.py +++ b/evaluation_dashboard_app/lib/path_utils.py @@ -125,6 +125,10 @@ def _looks_like_analysis_run(path: Path) -> bool: ) +def _is_internal_trend_release_dir(path: Path) -> bool: + return path.name.startswith("trend_release_") + + RELEASE_ROLE_DIRS = ("performance", "usecase", "devops") RELEASE_ROLE_LABELS = { "performance": "Performance", @@ -213,6 +217,8 @@ def list_run_directories() -> List[Path]: runs: List[Path] = [] seen = set() for child in sorted([p for p in root.iterdir() if p.is_dir()]): + if _is_internal_trend_release_dir(child): + continue resolved = child.resolve() if resolved not in seen and not _looks_like_release_container(child): runs.append(child) diff --git a/evaluation_dashboard_app/lib/t4_three_layers.py b/evaluation_dashboard_app/lib/t4_three_layers.py index dd0eb21..54d7ced 100644 --- a/evaluation_dashboard_app/lib/t4_three_layers.py +++ b/evaluation_dashboard_app/lib/t4_three_layers.py @@ -5,6 +5,7 @@ import html import json import math +from urllib.parse import urlencode from typing import TYPE_CHECKING import streamlit.components.v1 as components @@ -48,6 +49,9 @@ "source", ) +_VEHICLE_LABELS = {"car", "truck", "bus", "trailer"} +_LEGACY_EXTERNAL_BBOX_YAW_OFFSET = math.pi / 2 + def _is_missing(value: object) -> bool: if value is None: @@ -76,6 +80,45 @@ def resolve_t4_scenario(dff: "pd.DataFrame", scenario_from_sidebar: str | None) return "" +def infer_external_bbox_alignment_query_params(df: "pd.DataFrame") -> str: + """Return `/viewer/three` query params for eval bbox dimension/yaw convention. 
+ + Older eval parquet exports often store vehicle dimensions as width-forward + (`length < width`) and rely on the T4 viewer's legacy `+pi/2` external bbox + yaw offset. Newer app/analyzer exports store body-x as `length` and body-y as + `width`; those must pass `external_bbox_yaw_offset=0` or the viewer rotates + them by 90 degrees. + """ + if df is None or df.empty or not {"length", "width"}.issubset(df.columns): + yaw_offset = _LEGACY_EXTERNAL_BBOX_YAW_OFFSET + else: + sample = df + if "label" in sample.columns: + labels = sample["label"].astype(str).str.lower() + vehicle_sample = sample[labels.isin(_VEHICLE_LABELS)] + if not vehicle_sample.empty: + sample = vehicle_sample + if "source" in sample.columns: + gt_sample = sample[sample["source"].astype(str) == "GT"] + if not gt_sample.empty: + sample = gt_sample + + dims = sample[["length", "width"]].apply(lambda s: s.astype(float), axis=0) + dims = dims[(dims["length"] > 0) & (dims["width"] > 0)] + if dims.empty: + yaw_offset = _LEGACY_EXTERNAL_BBOX_YAW_OFFSET + else: + length_forward_ratio = float((dims["length"] >= dims["width"]).mean()) + yaw_offset = 0.0 if length_forward_ratio >= 0.8 else _LEGACY_EXTERNAL_BBOX_YAW_OFFSET + + return urlencode( + { + "external_bbox_yaw_offset": f"{yaw_offset:.12g}", + "external_bbox_swap_lw": "false", + } + ) + + def _single_frame_layer_dict(df_frame: "pd.DataFrame") -> dict: """Per-frame gt / pred / matched_pairs (no ``type`` field); used by single- and all-frame payloads.""" if df_frame is None or df_frame.empty: diff --git a/evaluation_dashboard_app/pages/5_T4_3D_Viewer.py b/evaluation_dashboard_app/pages/5_T4_3D_Viewer.py index ba74a12..4334181 100644 --- a/evaluation_dashboard_app/pages/5_T4_3D_Viewer.py +++ b/evaluation_dashboard_app/pages/5_T4_3D_Viewer.py @@ -14,6 +14,7 @@ from lib.t4_dataset_embed import t4_share_query_params from lib.t4_three_layers import ( build_three_layer_payload_all_frames, + infer_external_bbox_alignment_query_params, render_t4_three_js_embed, 
resolve_t4_dataset_id, resolve_t4_scenario, @@ -483,6 +484,7 @@ def list_parquets_in_run(run_path) -> List[str]: # Fixed entry frame so Streamlit slider does not reload the iframe; eval layers use bbox_layers_by_frame. _iframe_entry_frame = int(df["frame_index"].min()) _q_three = t4_share_query_params(_ds_t4, _sc_t4, _iframe_entry_frame) + _q_three = f"{_q_three}&{infer_external_bbox_alignment_query_params(df)}" _viewer_three_url = f"{base_url_t4.rstrip('/')}/viewer/three?{_q_three}" _layer_payload = build_three_layer_payload_all_frames(df)